"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from collections import Counter
import itertools
import json
import logging
import re
from textwrap import dedent
from pathlib import Path

from nominatim.db.connection import connect
from nominatim.db.properties import set_property, get_property
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules

DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()

def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)


class LegacyICUTokenizer:
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.naming_rules = None
        self.term_normalization = None
        self.max_word_frequency = None


    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        if config.TOKENIZER_CONFIG:
            cfgfile = Path(config.TOKENIZER_CONFIG)
        else:
            cfgfile = config.config_dir / 'legacy_icu_tokenizer.yaml'

        loader = ICURuleLoader(cfgfile)
        self.naming_rules = ICUNameProcessorRules(loader=loader)
        self.term_normalization = config.TERM_NORMALIZATION
        self.max_word_frequency = config.MAX_WORD_FREQUENCY

        self._install_php(config.lib_dir.php)
        self._save_config(config)

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)


    def init_from_project(self):
        """ Initialise the tokenizer from the project directory.
        """
        with connect(self.dsn) as conn:
            self.naming_rules = ICUNameProcessorRules(conn=conn)
            self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
            self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)


    def finalize_import(self, _):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """


    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_icu_tokenizer.sql',
                              max_word_freq=max_word_freq)


    def check_database(self):
        """ Check that the tokenizer is set up correctly.
        """
        self.init_from_project()

        if self.naming_rules is None:
            return "Configuration for tokenizer 'legacy_icu' is missing."

        return None


    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    analyzer.process_place(place)

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))


    def _install_php(self, phpdir):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent(f"""\
            <?php
            @define('CONST_Max_Word_Frequency', {self.max_word_frequency});
            @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
            @define('CONST_Transliteration', "{self.naming_rules.search_rules}");
            require_once('{phpdir}/tokenizer/legacy_icu_tokenizer.php');"""))


    def _save_config(self, config):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.naming_rules.save_rules(conn)

            set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
            set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)


    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
            conn.commit()

            LOG.warning("Precomputing word tokens")

            # get partial words and their frequencies
            words = self._count_partial_terms(conn)

            # copy them back into the word table
            with CopyBuffer() as copystr:
                for term, cnt in words.items():
                    copystr.add('w', term, json.dumps({'count': cnt}))

                with conn.cursor() as cur:
                    copystr.copy_out(cur, 'word',
                                     columns=['type', 'word_token', 'info'])
                    cur.execute("""UPDATE word SET word_id = nextval('seq_word')
                                   WHERE word_id is null and type = 'w'""")

            conn.commit()


    def _count_partial_terms(self, conn):
        """ Count the partial terms from the names in the place table.
        """
        words = Counter()
        name_proc = ICUNameProcessor(self.naming_rules)

        with conn.cursor(name="words") as cur:
            cur.execute(""" SELECT v, count(*) FROM
                              (SELECT svals(name) as v FROM place) x
                            WHERE length(v) < 75 GROUP BY v""")

            for name, cnt in cur:
                terms = set()
                for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
                    if ' ' in word:
                        terms.update(word.split())
                for term in terms:
                    words[term] += cnt

        return words
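
    # Illustrative sketch of the counting above: if the place table contains the
    # name 'Rue de la Paix' 42 times, each partial word derived from its variants
    # ('rue', 'de', 'la', 'paix' after normalisation) has 42 added to its count.
    # The exact tokens depend on the configured ICU transliteration rules.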


class LegacyICUNameAnalyzer:
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, name_proc):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.name_processor = name_proc

        self._cache = _TokenCache()


    def __enter__(self):
        return self


    def __exit__(self, exc_type, exc_value, traceback):
        self.close()


    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not necessarily efficient.
        """
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self.name_processor.get_search_normalized(word[1:])
            else:
                partial_tokens[word] = self.name_processor.get_search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                             FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                             FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]


    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()
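
    # For example, ' se1 9gf ' normalises to 'SE1 9GF': surrounding whitespace is
    # stripped and letters are upper-cased, nothing else is changed.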


    def _make_standard_hnr(self, hnr):
        """ Create a normalised version of a housenumber.

            This function takes minor shortcuts on transliteration.
        """
        return self.name_processor.get_search_normalized(hnr)


    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        to_delete = []
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                               (SELECT distinct(postcode) as pc FROM location_postcode) p
                               FULL JOIN
                               (SELECT word FROM word WHERE type = 'P') w
                               ON pc = word) x
                           WHERE pc is null or word is null""")

            with CopyBuffer() as copystr:
                for postcode, word in cur:
                    if postcode is None:
                        # postcode has gone from location_postcode: remove the token
                        to_delete.append(word)
                    else:
                        # postcode is missing from the word table: add it
                        copystr.add(self.name_processor.get_search_normalized(postcode),
                                    'P', postcode)

                if to_delete:
                    cur.execute("""DELETE FROM WORD
                                   WHERE type ='P' and word = any(%s)
                                """, (to_delete, ))

                copystr.copy_out(cur, 'word',
                                 columns=['word_token', 'type', 'word'])


    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases is
            completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)


    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self.name_processor.get_search_normalized(word)
                if term:
                    copystr.add(term, 'S', word,
                                json.dumps({'class': cls, 'type': typ,
                                            'op': oper if oper in ('in', 'near') else None}))
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word_token', 'type', 'word', 'info'])

        return added


    @staticmethod
    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.execute_values(
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE type = 'S' and word = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
                """, to_delete)

        return len(to_delete)


    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        word_tokens = set()
        for name in self._compute_full_names(names):
            norm_name = self.name_processor.get_search_normalized(name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            # get the names that are already in the word table
            cur.execute("""SELECT word_token FROM word
                            WHERE type = 'C' and word = %s""",
                        (country_code, ))
            word_tokens.difference_update((t[0] for t in cur))

            # Only add those names that are not yet in the list.
            if word_tokens:
                cur.execute("""INSERT INTO word (word_token, type, word)
                               (SELECT token, 'C', %s
                                FROM unnest(%s) as token)
                            """, (country_code, list(word_tokens)))

            # No names are deleted at the moment.
            # If deletion is made possible, then the static names from the
            # initial 'country_name' table should be kept.


    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names = place.get('name')

        if names:
            fulls, partials = self._compute_name_tokens(names)

            token_info.add_names(fulls, partials)

            country_feature = place.get('country_feature')
            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
                self.add_country_names(country_feature.lower(), names)

        address = place.get('address')
        if address:
            self._process_place_address(token_info, address)

        return token_info.data
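
    # The returned dictionary is assembled by _TokenInfo below. Depending on the
    # name and address tags of the place it may contain keys such as 'names',
    # 'hnr_tokens', 'hnr', 'street', 'place_search', 'place_match' and 'addr';
    # token id lists are encoded as PostgreSQL array literals ('{1,2,3}'), while
    # 'hnr' holds the housenumbers themselves joined by ';'.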


    def _process_place_address(self, token_info, address):
        hnrs = []
        addr_terms = []
        for key, value in address.items():
            if key == 'postcode':
                self._add_postcode(value)
            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                hnrs.append(value)
            elif key == 'street':
                token_info.add_street(*self._compute_name_tokens({'name': value}))
            elif key == 'place':
                token_info.add_place(*self._compute_name_tokens({'name': value}))
            elif not key.startswith('_') and key not in ('country', 'full'):
                addr_terms.append((key, *self._compute_name_tokens({'name': value})))

        if hnrs:
            hnrs = self._split_housenumbers(hnrs)
            token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])

        if addr_terms:
            token_info.add_address_terms(addr_terms)


    def _compute_name_tokens(self, names):
        """ Computes the full name and partial name tokens for the given
            dictionary of names.
        """
        full_names = self._compute_full_names(names)
        full_tokens = set()
        partial_tokens = set()

        for name in full_names:
            norm_name = self.name_processor.get_normalized(name)
            full, part = self._cache.names.get(norm_name, (None, None))
            if full is None:
                variants = self.name_processor.get_variants_ascii(norm_name)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                                (norm_name, variants))
                    full, part = cur.fetchone()

                self._cache.names[norm_name] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens
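
    # getorcreate_full_word() is a SQL function, presumably installed from the
    # tokenizer's SQL file above; judging from the call it returns one full-word
    # token id together with a list of partial-word token ids, which are cached
    # per normalised name in self._cache.names.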


    @staticmethod
    def _compute_full_names(names):
        """ Return the set of all full names to be used with the
            given dictionary of names.
        """
        full_names = set()
        for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
            if name:
                full_names.add(name)

                brace_idx = name.find('(')
                if brace_idx >= 0:
                    full_names.add(name[:brace_idx].strip())

        return full_names
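
    # For example, {'name': 'Brixton Hill (Lambeth)', 'alt_name': 'Brixton'} yields
    # {'Brixton Hill (Lambeth)', 'Brixton Hill', 'Brixton'}: values are split on
    # ';' and ',', and for names with a bracketed suffix the prefix is added too.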


    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        if re.search(r'[:,;]', postcode) is None:
            postcode = self.normalize_postcode(postcode)

            if postcode not in self._cache.postcodes:
                term = self.name_processor.get_search_normalized(postcode)
                if not term:
                    return

                with self.conn.cursor() as cur:
                    # no word_id needed for postcodes
                    cur.execute("""INSERT INTO word (word_token, type, word)
                                   (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
                                    WHERE NOT EXISTS
                                      (SELECT * FROM word
                                       WHERE type = 'P' and word = pc))
                                """, (term, postcode))
                self._cache.postcodes.add(postcode)


    @staticmethod
    def _split_housenumbers(hnrs):
        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
            # split numbers if necessary
            simple_list = []
            for hnr in hnrs:
                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

            if len(simple_list) > 1:
                hnrs = list(set(simple_list))
            else:
                hnrs = simple_list

        return hnrs
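
    # For example, ['1;2b', '3'] becomes ['1', '2b', '3']; because duplicates are
    # removed via a set, the order of the returned housenumbers is not guaranteed.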


class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self, cache):
        self._cache = cache
        self.data = {}

    @staticmethod
    def _mk_array(tokens):
        return '{%s}' % ','.join((str(s) for s in tokens))
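
    # _mk_array([23, 42]) returns '{23,42}', i.e. the textual form of a PostgreSQL
    # array literal, which is how token ids are handed over to the database.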


    def add_names(self, fulls, partials):
        """ Adds token information for the normalised names.
        """
        self.data['names'] = self._mk_array(itertools.chain(fulls, partials))


    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
        self.data['hnr'] = ';'.join(hnrs)


    def add_street(self, fulls, _):
        """ Add addr:street match terms.
        """
        if fulls:
            self.data['street'] = self._mk_array(fulls)


    def add_place(self, fulls, partials):
        """ Add addr:place search and match terms.
        """
        if fulls:
            self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
            self.data['place_match'] = self._mk_array(fulls)


    def add_address_terms(self, terms):
        """ Add additional address terms.
        """
        tokens = {}

        for key, fulls, partials in terms:
            if fulls:
                tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
                               self._mk_array(fulls)]

        if tokens:
            self.data['addr'] = tokens


class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self):
        self.names = {}
        self.postcodes = set()
        self.housenumbers = {}


    def get_hnr_tokens(self, conn, terms):
        """ Get token ids for a list of housenumbers, looking them up in the
            database if necessary. `terms` is an iterable of normalized
            housenumbers.
        """
        tokens = []
        askdb = []

        for term in terms:
            token = self.housenumbers.get(term)
            if token is None:
                askdb.append(term)
            else:
                tokens.append(token)

        if askdb:
            with conn.cursor() as cur:
                cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
                            (askdb, ))
                for term, tid in cur:
                    self.housenumbers[term] = tid
                    tokens.append(tid)

        return tokens
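

# A minimal usage sketch of this module; the DSN and project path are
# placeholders and assume an already imported Nominatim database whose
# tokenizer configuration has been saved with _save_config():
#
#     tokenizer = create('dbname=nominatim', Path('/path/to/project'))
#     tokenizer.init_from_project()
#     with tokenizer.name_analyzer() as analyzer:
#         info = analyzer.process_place({'name': {'name': 'Main Street'}})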