"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from collections import Counter
import itertools
import logging
import re
from textwrap import dedent
from pathlib import Path

from nominatim.db.connection import connect
from nominatim.db.properties import set_property, get_property
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()
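
# Word table 'type' codes used throughout this module (a summary of the queries
# below, not an exhaustive schema reference):
#   'w' - partial word        'W' - full word
#   'P' - postcode            'S' - special phrase
#   'C' - country name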

def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)


class LegacyICUTokenizer:
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.naming_rules = None
        self.term_normalization = None
        self.max_word_frequency = None

    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        if config.TOKENIZER_CONFIG:
            cfgfile = Path(config.TOKENIZER_CONFIG)
        else:
            cfgfile = config.config_dir / 'legacy_icu_tokenizer.yaml'

        loader = ICURuleLoader(cfgfile)
        self.naming_rules = ICUNameProcessorRules(loader=loader)
        self.term_normalization = config.TERM_NORMALIZATION
        self.max_word_frequency = config.MAX_WORD_FREQUENCY

        self._install_php(config.lib_dir.php)
        self._save_config(config)

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)

    def init_from_project(self):
        """ Initialise the tokenizer from the project directory.
        """
        with connect(self.dsn) as conn:
            self.naming_rules = ICUNameProcessorRules(conn=conn)
            self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
            self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)

    def finalize_import(self, _):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        pass

    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_icu_tokenizer.sql',
                              max_word_freq=max_word_freq)

    def check_database(self):
        """ Check that the tokenizer is set up correctly.
        """
        self.init_from_project()

        if self.naming_rules is None:
            return "Configuration for tokenizer 'legacy_icu' is missing."

        return None

    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    analyzer.process_place(place)

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
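
    # Illustrative sketch of how the tokenizer is meant to be driven (assuming
    # `dsn`, `data_dir` and a `place` dict are supplied by the caller; this is
    # not part of the API itself):
    #
    #   tokenizer = create(dsn, data_dir)
    #   tokenizer.init_from_project()
    #   with tokenizer.name_analyzer() as analyzer:
    #       token_info = analyzer.process_place(place)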

    def _install_php(self, phpdir):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent(f"""\
            <?php
            @define('CONST_Max_Word_Frequency', {self.max_word_frequency});
            @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
            @define('CONST_Transliteration', "{self.naming_rules.search_rules}");
            require_once('{phpdir}/tokenizer/legacy_icu_tokenizer.php');"""))

    def _save_config(self, config):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.naming_rules.save_rules(conn)

            set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
            set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)

    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
            conn.commit()

            LOG.warning("Precomputing word tokens")

            # get partial words and their frequencies
            words = Counter()
            name_proc = ICUNameProcessor(self.naming_rules)
            with conn.cursor(name="words") as cur:
                cur.execute(""" SELECT v, count(*) FROM
                                  (SELECT svals(name) as v FROM place)x
                                WHERE length(v) < 75 GROUP BY v""")

                for name, cnt in cur:
                    terms = set()
                    for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
                        if ' ' in word:
                            terms.update(word.split())
                    for term in terms:
                        words[term] += cnt

            # copy them back into the word table
            with CopyBuffer() as copystr:
                for k, v in words.items():
                    copystr.add('w', k, {'count': v})

                with conn.cursor() as cur:
                    copystr.copy_out(cur, 'word',
                                     columns=['type', 'word_token', 'info'])
                    cur.execute("""UPDATE word SET word_id = nextval('seq_word')
                                   WHERE word_id is null and type = 'w'""")

            conn.commit()


class LegacyICUNameAnalyzer:
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, name_proc):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.name_processor = name_proc

        self._cache = _TokenCache()

    def __enter__(self):
        return self


    def __exit__(self, exc_type, exc_value, traceback):
        self.close()


    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None

    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and not necessarily efficient.
        """
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self.name_processor.get_search_normalized(word[1:])
            else:
                partial_tokens[word] = self.name_processor.get_search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""(SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'W')
                           UNION
                           (SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'w')""",
                        (list(full_tokens.values()),
                         list(partial_tokens.values())))
            ids = {r[0]: r[1] for r in cur}

        return [(k, v, ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, ids.get(v, None)) for k, v in partial_tokens.items()]
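
    # Example for get_word_token_info() (illustrative values only): querying
    # ['#Main Street', 'main'] may yield [('#Main Street', 'main street', 123),
    # ('main', 'main', 456)]; the id is None when the token is not in the
    # word table.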

    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()
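
    # e.g. normalize_postcode(' ab1 2cd ') returns 'AB1 2CD'.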

    def _make_standard_hnr(self, hnr):
        """ Create a normalised version of a housenumber.

            This function takes minor shortcuts on transliteration.
        """
        return self.name_processor.get_search_normalized(hnr)

    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        to_delete = []
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT info->>'postcode' as word FROM word WHERE type = 'P') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            with CopyBuffer() as copystr:
                for postcode, word in cur:
                    if postcode is None:
                        to_delete.append(word)
                    else:
                        copystr.add(self.name_processor.get_search_normalized(postcode),
                                    'P', {'postcode': postcode})

                if to_delete:
                    cur.execute("""DELETE FROM word
                                   WHERE type = 'P' and info->>'postcode' = any(%s)
                                """, (to_delete, ))

                copystr.copy_out(cur, 'word',
                                 columns=['word_token', 'type', 'info'])
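
    # In other words: postcodes present in location_postcode but missing from
    # the word table are added, and 'P' entries without a matching postcode are
    # deleted.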

    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases will
            be completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT info FROM word WHERE type = 'S'")
            for (info, ) in cur:
                existing_phrases.add((info['word'], info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)
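
    # Phrases are expected as 4-tuples of (label, class, type, operator), e.g.
    # ('Restaurant', 'amenity', 'restaurant', '-'). Only the operators 'in' and
    # 'near' are stored; anything else is recorded as a null operator.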

    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self.name_processor.get_search_normalized(word)
                if term:
                    copystr.add(term, 'S',
                                {'word': word, 'class': cls, 'type': typ,
                                 'op': oper if oper in ('in', 'near') else None})
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word_token', 'type', 'info'])

        return added

    @staticmethod
    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.execute_values(
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE info->>'word' = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
                """, to_delete)

        return len(to_delete)

    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        word_tokens = set()
        for name in self._compute_full_names(names):
            norm_name = self.name_processor.get_search_normalized(name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token FROM word
                            WHERE type = 'C' and info->>'cc'= %s""",
                        (country_code, ))
            word_tokens.difference_update((t[0] for t in cur))

            # Only add those names that are not yet in the list.
            if word_tokens:
                cur.execute("""INSERT INTO word (word_token, type, info)
                               (SELECT token, 'C', json_build_object('cc', %s)
                                FROM unnest(%s) as token)
                            """, (country_code, list(word_tokens)))

            # No names are deleted at the moment.
            # If deletion is made possible, then the static names from the
            # initial 'country_name' table should be kept.

    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names = place.get('name')

        if names:
            fulls, partials = self._compute_name_tokens(names)

            token_info.add_names(fulls, partials)

            country_feature = place.get('country_feature')
            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
                self.add_country_names(country_feature.lower(), names)

        address = place.get('address')
        if address:
            self._process_place_address(token_info, address)

        return token_info.data
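
    # The dict returned by process_place() is assembled by _TokenInfo below and
    # may contain the keys 'names', 'hnr_tokens', 'hnr', 'street',
    # 'place_search', 'place_match' and 'addr', depending on the place.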

    def _process_place_address(self, token_info, address):
        hnrs = []
        addr_terms = []
        for key, value in address.items():
            if key == 'postcode':
                self._add_postcode(value)
            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                hnrs.append(value)
            elif key == 'street':
                token_info.add_street(*self._compute_name_tokens({'name': value}))
            elif key == 'place':
                token_info.add_place(*self._compute_name_tokens({'name': value}))
            elif not key.startswith('_') and key not in ('country', 'full'):
                addr_terms.append((key, *self._compute_name_tokens({'name': value})))

        if hnrs:
            hnrs = self._split_housenumbers(hnrs)
            token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])

        if addr_terms:
            token_info.add_address_terms(addr_terms)

    def _compute_name_tokens(self, names):
        """ Computes the full name and partial name tokens for the given
            dictionary of names.
        """
        full_names = self._compute_full_names(names)
        full_tokens = set()
        partial_tokens = set()

        for name in full_names:
            norm_name = self.name_processor.get_normalized(name)
            full, part = self._cache.names.get(norm_name, (None, None))
            if full is None:
                variants = self.name_processor.get_variants_ascii(norm_name)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                                (norm_name, variants))
                    full, part = cur.fetchone()

                self._cache.names[norm_name] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens
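
    # getorcreate_full_word() is expected to return a pair of (full word id,
    # list of partial word ids); both are cached per normalised name so that
    # repeated names do not require another database round trip.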

    @staticmethod
    def _compute_full_names(names):
        """ Return the set of all full names to be tokenized for the
            given dictionary of names.
        """
        full_names = set()
        for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
            if name:
                full_names.add(name)

                brace_idx = name.find('(')
                if brace_idx >= 0:
                    full_names.add(name[:brace_idx].strip())

        return full_names

    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        if re.search(r'[:,;]', postcode) is None:
            postcode = self.normalize_postcode(postcode)

            if postcode not in self._cache.postcodes:
                term = self.name_processor.get_search_normalized(postcode)
                if not term:
                    return

                with self.conn.cursor() as cur:
                    # no word_id needed for postcodes
                    cur.execute("""INSERT INTO word (word_token, type, info)
                                   (SELECT %s, 'P', json_build_object('postcode', pc)
                                    FROM (VALUES (%s)) as v(pc)
                                    WHERE NOT EXISTS
                                      (SELECT * FROM word
                                       WHERE type = 'P' and info->>'postcode' = pc))
                                """, (term, postcode))
                self._cache.postcodes.add(postcode)

    @staticmethod
    def _split_housenumbers(hnrs):
        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
            # split numbers if necessary
            simple_list = []
            for hnr in hnrs:
                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

            if len(simple_list) > 1:
                hnrs = list(set(simple_list))
            else:
                hnrs = simple_list

        return hnrs
534 """ Collect token information to be sent back to the database.
536 def __init__(self, cache):
541 def _mk_array(tokens):
542 return '{%s}' % ','.join((str(s) for s in tokens))

    def add_names(self, fulls, partials):
        """ Add token information for the normalised names.
        """
        self.data['names'] = self._mk_array(itertools.chain(fulls, partials))

    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
        self.data['hnr'] = ';'.join(hnrs)

    def add_street(self, fulls, _):
        """ Add addr:street match terms.
        """
        if fulls:
            self.data['street'] = self._mk_array(fulls)

    def add_place(self, fulls, partials):
        """ Add addr:place search and match terms.
        """
        if fulls:
            self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
            self.data['place_match'] = self._mk_array(fulls)

    def add_address_terms(self, terms):
        """ Add additional address terms.
        """
        tokens = {}

        for key, fulls, partials in terms:
            if fulls:
                tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
                               self._mk_array(fulls)]

        if tokens:
            self.data['addr'] = tokens
589 """ Cache for token information to avoid repeated database queries.
591 This cache is not thread-safe and needs to be instantiated per
596 self.postcodes = set()
597 self.housenumbers = {}

    def get_hnr_tokens(self, conn, terms):
        """ Get token ids for a list of housenumbers, looking them up in the
            database if necessary. `terms` is an iterable of normalized
            housenumbers.
        """
        tokens = []
        askdb = []

        for term in terms:
            token = self.housenumbers.get(term)
            if token is None:
                askdb.append(term)
            else:
                tokens.append(token)

        if askdb:
            with conn.cursor() as cur:
                cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
                            (askdb, ))
                for term, tid in cur:
                    self.housenumbers[term] = tid
                    tokens.append(tid)

        return tokens
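
    # getorcreate_hnr_id() is assumed to return one token id per housenumber;
    # housenumbers seen for the first time are added to the cache above.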