"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from collections import Counter
import io
import itertools
import logging
import re
from textwrap import dedent
from pathlib import Path

from icu import Transliterator
import psycopg2.extras

from nominatim.db.connection import connect
from nominatim.db.properties import set_property, get_property
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
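
# Database property keys under which the tokenizer configuration is stored
# for an imported database (see _save_config() and init_from_project()).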
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()

def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)


class LegacyICUTokenizer:
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.naming_rules = None
        self.term_normalization = None
        self.max_word_frequency = None


    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        if config.TOKENIZER_CONFIG:
            cfgfile = Path(config.TOKENIZER_CONFIG)
        else:
            cfgfile = config.config_dir / 'legacy_icu_tokenizer.yaml'

        loader = ICURuleLoader(cfgfile)
        self.naming_rules = ICUNameProcessorRules(loader=loader)
        self.term_normalization = config.TERM_NORMALIZATION
        self.max_word_frequency = config.MAX_WORD_FREQUENCY

        self._install_php(config.lib_dir.php)
        self._save_config(config)

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)


    def init_from_project(self):
        """ Initialise the tokenizer from the project directory.
        """
        with connect(self.dsn) as conn:
            self.naming_rules = ICUNameProcessorRules(conn=conn)
            self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
            self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)


    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')


    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
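        # The word frequency cut-off is read back from the database properties
        # and passed through to the SQL file as a preprocessor parameter.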
        with connect(self.dsn) as conn:
            max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_icu_tokenizer.sql',
                              max_word_freq=max_word_freq)


    def check_database(self):
        """ Check that the tokenizer is set up correctly.
        """
        self.init_from_project()

        if self.naming_rules is None \
           or self.term_normalization is None \
           or self.max_word_frequency is None:
            return "Configuration for tokenizer 'legacy_icu' is missing."

        return None


    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    ...

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))


    def _install_php(self, phpdir):
        """ Install the php script for the tokenizer.
        """
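        # The PHP frontend reads these settings from tokenizer.php in the
        # tokenizer's data directory.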
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent("""\
            <?php
            @define('CONST_Max_Word_Frequency', {0.max_word_frequency});
            @define('CONST_Term_Normalization_Rules', "{0.term_normalization}");
            @define('CONST_Transliteration', "{0.naming_rules.search_rules}");
            require_once('{1}/tokenizer/legacy_icu_tokenizer.php');
            """.format(self, phpdir)))


    def _save_config(self, config):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.naming_rules.save_rules(conn)

            set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
            set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)


    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')

            LOG.warning("Precomputing word tokens")

            # get partial words and their frequencies
            words = Counter()
            name_proc = ICUNameProcessor(self.naming_rules)
            with conn.cursor(name="words") as cur:
                cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")
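                # svals() expands the hstore 'name' column, so this yields every
                # individual name string together with its number of occurrences.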

                for name, cnt in cur:
                    for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
                        for term in word.split():
                            words[term] += cnt

            # copy them back into the word table
            copystr = io.StringIO(''.join(('{}\t{}\n'.format(*args) for args in words.items())))

            with conn.cursor() as cur:
                cur.copy_from(copystr, 'word', columns=['word_token', 'search_name_count'])
                cur.execute("""UPDATE word SET word_id = nextval('seq_word')
                               WHERE word_id is null""")


class LegacyICUNameAnalyzer:
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, name_proc):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.name_processor = name_proc

        self._cache = _TokenCache()

    def __enter__(self):
        return self


    def __exit__(self, exc_type, exc_value, traceback):
        self.close()


    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name
            otherwise it is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and not necessarily efficient.
        """
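        # Full names are stored with a leading blank in word_token; partial
        # terms are stored without it.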
        tokens = {}
        for word in words:
            if word.startswith('#'):
                tokens[word] = ' ' + self.name_processor.get_search_normalized(word[1:])
            else:
                tokens[word] = self.name_processor.get_search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
                           WHERE word_token = t.term
                                 and class is null and country_code is null""",
                        (list(tokens.values()), ))
            ids = {r[0]: r[1] for r in cur}

        return [(k, v, ids.get(v, None)) for k, v in tokens.items()]


    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()


    def _make_standard_hnr(self, hnr):
        """ Create a normalised version of a housenumber.

            This function takes minor shortcuts on transliteration.
        """
        return self.name_processor.get_search_normalized(hnr)


    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        to_delete = []
        copystr = io.StringIO()
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word
                                WHERE class ='place' and type = 'postcode') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")
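
            # A row with a missing word is a new postcode that needs to be added;
            # a row with a missing pc is a stale word entry to be deleted.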
            for postcode, word in cur:
                if postcode is None:
                    to_delete.append(word)
                else:
                    copystr.write(postcode)
                    copystr.write('\t ')
                    copystr.write(self.name_processor.get_search_normalized(postcode))
                    copystr.write('\tplace\tpostcode\t0\n')

            if to_delete:
                cur.execute("""DELETE FROM WORD
                               WHERE class ='place' and type = 'postcode'
                                     and word = any(%s)
                            """, (to_delete, ))

            if copystr.getvalue():
                copystr.seek(0)
                cur.copy_from(copystr, 'word',
                              columns=['word', 'word_token', 'class', 'type',
                                       'search_name_count'])


    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
        """
        norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("""SELECT word, class, type, operator FROM word
                           WHERE class != 'place'
                                 OR (type != 'house' AND type != 'postcode')""")
            for label, cls, typ, oper in cur:
                existing_phrases.add((label, cls, typ, oper or '-'))
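
            # A NULL operator is represented as '-' so that the tuples can be
            # compared directly against the normalised phrases.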
            to_add = norm_phrases - existing_phrases
            to_delete = existing_phrases - norm_phrases

            if to_add:
                copystr = io.StringIO()
                for word, cls, typ, oper in to_add:
                    term = self.name_processor.get_search_normalized(word)
                    copystr.write(word)
                    copystr.write('\t ')
                    copystr.write(term)
                    copystr.write('\t')
                    copystr.write(cls)
                    copystr.write('\t')
                    copystr.write(typ)
                    copystr.write('\t')
                    copystr.write(oper if oper in ('in', 'near') else '\\N')
                    copystr.write('\t0\n')

                copystr.seek(0)
                cur.copy_from(copystr, 'word',
                              columns=['word', 'word_token', 'class', 'type',
                                       'operator', 'search_name_count'])

            if to_delete and should_replace:
                psycopg2.extras.execute_values(
                    cur,
                    """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                        WHERE word = name and class = in_class and type = in_type
                              and ((op = '-' and operator is null) or op = operator)""",
                    to_delete)

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), len(to_add), len(to_delete))


    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        word_tokens = set()
        for name in self._compute_full_names(names):
            word_tokens.add(' ' + self.name_processor.get_search_normalized(name))
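
        # Only insert tokens that are not yet in the word table for this country.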
        with self.conn.cursor() as cur:
            cur.execute("SELECT word_token FROM word WHERE country_code = %s",
                        (country_code, ))
            word_tokens.difference_update((t[0] for t in cur))

            if word_tokens:
                cur.execute("""INSERT INTO word (word_id, word_token, country_code,
                                                 search_name_count)
                               (SELECT nextval('seq_word'), token, '{}', 0
                                FROM unnest(%s) as token)
                            """.format(country_code), (list(word_tokens),))


    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names = place.get('name')

        if names:
            fulls, partials = self._compute_name_tokens(names)

            token_info.add_names(fulls, partials)

            country_feature = place.get('country_feature')
            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
                self.add_country_names(country_feature.lower(), names)

        address = place.get('address')

        if address:
            hnrs = []
            addr_terms = []
            for key, value in address.items():
                if key == 'postcode':
                    self._add_postcode(value)
                elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                    hnrs.append(value)
                elif key == 'street':
                    token_info.add_street(*self._compute_name_tokens({'name': value}))
                elif key == 'place':
                    token_info.add_place(*self._compute_name_tokens({'name': value}))
                elif not key.startswith('_') and \
                     key not in ('country', 'full'):
                    addr_terms.append((key, *self._compute_name_tokens({'name': value})))

            if hnrs:
                hnrs = self._split_housenumbers(hnrs)
                token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])

            if addr_terms:
                token_info.add_address_terms(addr_terms)

        return token_info.data


    def _compute_name_tokens(self, names):
        """ Computes the full name and partial name tokens for the given
            dictionary of names.
        """
        full_names = self._compute_full_names(names)
        full_tokens = set()
        partial_tokens = set()

        for name in full_names:
            norm_name = self.name_processor.get_normalized(name)
            full, part = self._cache.names.get(norm_name, (None, None))
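            # Cache miss: getorcreate_full_word() assigns (or looks up) the token
            # ids for the full name and all its partial terms in the database.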
            if full is None:
                variants = self.name_processor.get_variants_ascii(norm_name)
                with self.conn.cursor() as cur:
                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                                (norm_name, variants))
                    full, part = cur.fetchone()

                self._cache.names[norm_name] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens


    def _compute_full_names(self, names):
        """ Return the set of all full names to be used with the
            given dictionary of names.
        """
        full_names = set()
        for name in (n for ns in names.values() for n in re.split('[;,]', ns)):
            full_names.add(name.strip())
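
            # A name with a bracketed suffix is also indexed without that suffix.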
            brace_idx = name.find('(')
            if brace_idx >= 0:
                full_names.add(name[:brace_idx].strip())

        return full_names


    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
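        # Postcode values that contain ':', ',' or ';' (e.g. lists of postcodes)
        # are ignored.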
        if re.search(r'[:,;]', postcode) is None:
            postcode = self.normalize_postcode(postcode)

            if postcode not in self._cache.postcodes:
                term = self.name_processor.get_search_normalized(postcode)

                with self.conn.cursor() as cur:
                    # no word_id needed for postcodes
                    cur.execute("""INSERT INTO word (word, word_token, class, type,
                                                     search_name_count)
                                   (SELECT pc, %s, 'place', 'postcode', 0
                                    FROM (VALUES (%s)) as v(pc)
                                    WHERE NOT EXISTS
                                     (SELECT * FROM word
                                      WHERE word = pc and class='place' and type='postcode'))
                                """, (' ' + term, postcode))
                self._cache.postcodes.add(postcode)


    @staticmethod
    def _split_housenumbers(hnrs):
        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
            # split numbers if necessary
            simple_list = []
            for hnr in hnrs:
                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

            if len(simple_list) > 1:
                hnrs = list(set(simple_list))
            else:
                hnrs = simple_list

        return hnrs


class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self, cache):
        self._cache = cache
        self.data = {}
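
    # Token ids are rendered as PostgreSQL array literals, e.g. '{1,2,3}'.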
    @staticmethod
    def _mk_array(tokens):
        return '{%s}' % ','.join((str(s) for s in tokens))


    def add_names(self, fulls, partials):
        """ Adds token information for the normalised names.
        """
        self.data['names'] = self._mk_array(itertools.chain(fulls, partials))


    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
        self.data['hnr'] = ';'.join(hnrs)


    def add_street(self, fulls, partials):
        """ Add addr:street match terms.
        """
        if fulls:
            self.data['street'] = self._mk_array(fulls)


    def add_place(self, fulls, partials):
        """ Add addr:place search and match terms.
        """
        if fulls:
            self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
            self.data['place_match'] = self._mk_array(fulls)


    def add_address_terms(self, terms):
        """ Add additional address terms.
        """
        tokens = {}

        for key, fulls, partials in terms:
            if fulls:
                tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
                               self._mk_array(fulls)]

        if tokens:
            self.data['addr'] = tokens


class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self):
        self.names = {}
        self.postcodes = set()
        self.housenumbers = {}


    def get_hnr_tokens(self, conn, terms):
        """ Get token ids for a list of housenumbers, looking them up in the
            database if necessary.
        """
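        # Resolve as many terms as possible from the in-memory cache; the
        # remaining ones are fetched from the database in a single query.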
        tokens = []
        askdb = []
        for term in terms:
            token = self.housenumbers.get(term)
            if token is None:
                askdb.append(term)
            else:
                tokens.append(token)

        if askdb:
            with conn.cursor() as cur:
                cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
                            (askdb, ))
                for term, tid in cur:
                    self.housenumbers[term] = tid
                    tokens.append(tid)

        return tokens