"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from collections import Counter
import itertools
import json
import logging
import re
from textwrap import dedent
from pathlib import Path
from nominatim.db.connection import connect
from nominatim.db.properties import set_property, get_property
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()
def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)

class LegacyICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.naming_rules = None
        self.term_normalization = None
        self.max_word_frequency = None

    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data into the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        if config.TOKENIZER_CONFIG:
            cfgfile = Path(config.TOKENIZER_CONFIG)
        else:
            cfgfile = config.config_dir / 'legacy_icu_tokenizer.yaml'

        loader = ICURuleLoader(cfgfile)
        self.naming_rules = ICUNameProcessorRules(loader=loader)
        self.term_normalization = config.TERM_NORMALIZATION
        self.max_word_frequency = config.MAX_WORD_FREQUENCY

        self._install_php(config.lib_dir.php)
        self._save_config(config)

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)

    def init_from_project(self):
        """ Initialise the tokenizer from the project directory.
        """
        with connect(self.dsn) as conn:
            self.naming_rules = ICUNameProcessorRules(conn=conn)
            self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
            self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)

    def finalize_import(self, _):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        pass

    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_icu_tokenizer.sql',
                              max_word_freq=max_word_freq)

    def check_database(self):
        """ Check that the tokenizer is set up correctly.
        """
        self.init_from_project()

        if self.naming_rules is None:
            return "Configuration for tokenizer 'legacy_icu' is missing."

        return None

    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    ...

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))

    def _install_php(self, phpdir):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent(f"""\
            <?php
            @define('CONST_Max_Word_Frequency', {self.max_word_frequency});
            @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
            @define('CONST_Transliteration', "{self.naming_rules.search_rules}");
            require_once('{phpdir}/tokenizer/legacy_icu_tokenizer.php');"""))

    def _save_config(self, config):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.naming_rules.save_rules(conn)

            set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
            set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)

    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
            conn.commit()

            LOG.warning("Precomputing word tokens")

            # get partial words and their frequencies
            words = self._count_partial_terms(conn)

            # copy them back into the word table
            with CopyBuffer() as copystr:
                for term, cnt in words.items():
                    copystr.add('w', term, json.dumps({'count': cnt}))

                with conn.cursor() as cur:
                    copystr.copy_out(cur, 'word',
                                     columns=['type', 'word_token', 'info'])
                    cur.execute("""UPDATE word SET word_id = nextval('seq_word')
                                   WHERE word_id is null and type = 'w'""")

            conn.commit()

    def _count_partial_terms(self, conn):
        """ Count the partial terms from the names in the place table.
        """
        words = Counter()
        name_proc = ICUNameProcessor(self.naming_rules)

        with conn.cursor(name="words") as cur:
            cur.execute(""" SELECT v, count(*) FROM
                              (SELECT svals(name) as v FROM place)x
                            WHERE length(v) < 75 GROUP BY v""")

            for name, cnt in cur:
                terms = set()
                # only multi-word variants contribute partial terms
                for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
                    if ' ' in word:
                        terms.update(word.split())
                for term in terms:
                    words[term] += cnt

        return words

class LegacyICUNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, name_proc):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.name_processor = name_proc
        self._cache = _TokenCache()

    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None

    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not necessarily efficient.
        """
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self.name_processor.get_search_normalized(word[1:])
            else:
                partial_tokens[word] = self.name_processor.get_search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]

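    # Illustrative only: assuming the word table already holds the corresponding
    # entries, a call such as
    #   analyzer.get_word_token_info(['#Main Street', 'main'])
    # returns (original word, word token, word id) tuples along the lines of
    #   [('#Main Street', 'main street', 123), ('main', 'main', 456)]
    # where the token strings depend on the configured normalisation and the
    # ids on what the database has assigned.
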
    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()

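    # Example: normalisation only strips surrounding whitespace and upper-cases,
    # e.g. ' se11 5qy ' becomes 'SE11 5QY'.
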
    def _make_standard_hnr(self, hnr):
        """ Create a normalised version of a housenumber.

            This function takes minor shortcuts on transliteration.
        """
        return self.name_processor.get_search_normalized(hnr)

    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        to_delete = []
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word WHERE type = 'P') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            with CopyBuffer() as copystr:
                for postcode, word in cur:
                    if postcode is None:
                        to_delete.append(word)
                    else:
                        copystr.add(self.name_processor.get_search_normalized(postcode),
                                    'P', postcode)

                if to_delete:
                    cur.execute("""DELETE FROM WORD
                                   WHERE type = 'P' and word = any(%s)
                                """, (to_delete, ))

                copystr.copy_out(cur, 'word',
                                 columns=['word_token', 'type', 'word'])

    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases
            will be completely replaced. Otherwise the phrases are added to
            the already existing ones.
        """
        norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)

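    # Sketch of the expected input, inferred from the tuple handling above:
    # each phrase is a 4-tuple of (label, class, type, operator), e.g.
    #   analyzer.update_special_phrases(
    #       [('restaurant', 'amenity', 'restaurant', '-')], should_replace=True)
    # where '-' stands for "no operator" and only 'in'/'near' are kept as operators.
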
    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self.name_processor.get_search_normalized(word)
                if term:
                    copystr.add(term, 'S', word,
                                json.dumps({'class': cls, 'type': typ,
                                            'op': oper if oper in ('in', 'near') else None}))
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word_token', 'type', 'word', 'info'])

        return added

    @staticmethod
    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.execute_values(
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE type = 'S' and word = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
                """, to_delete)

        return len(to_delete)

    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        word_tokens = set()
        for name in self._compute_full_names(names):
            norm_name = self.name_processor.get_search_normalized(name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            # get the names that are already in the word table
            cur.execute("""SELECT word_token FROM word
                            WHERE type = 'C' and word = %s""",
                        (country_code, ))
            word_tokens.difference_update((t[0] for t in cur))

            # Only add those names that are not yet in the list.
            if word_tokens:
                cur.execute("""INSERT INTO word (word_token, type, word)
                               (SELECT token, 'C', %s
                                FROM unnest(%s) as token)
                            """, (country_code, list(word_tokens)))

            # No names are deleted at the moment.
            # If deletion is made possible, then the static names from the
            # initial 'country_name' table should be kept.

    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names = place.get('name')
        if names:
            fulls, partials = self._compute_name_tokens(names)

            token_info.add_names(fulls, partials)

            country_feature = place.get('country_feature')
            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
                self.add_country_names(country_feature.lower(), names)

        address = place.get('address')
        if address:
            self._process_place_address(token_info, address)

        return token_info.data

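    # Rough sketch of the returned structure; which keys are present depends on
    # the available name and address data and the arrays/ids are illustrative:
    #   {'names': '{1,2,3}', 'hnr_tokens': '{4}', 'hnr': '12;14',
    #    'street': '{5}', 'addr': {'city': ['{6,7}', '{6}']}}
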
    def _process_place_address(self, token_info, address):
        hnrs = []
        addr_terms = []
        for key, value in address.items():
            if key == 'postcode':
                self._add_postcode(value)
            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                hnrs.append(value)
            elif key == 'street':
                token_info.add_street(*self._compute_name_tokens({'name': value}))
            elif key == 'place':
                token_info.add_place(*self._compute_name_tokens({'name': value}))
            elif not key.startswith('_') and key not in ('country', 'full'):
                addr_terms.append((key, *self._compute_name_tokens({'name': value})))

        if hnrs:
            hnrs = self._split_housenumbers(hnrs)
            token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])

        if addr_terms:
            token_info.add_address_terms(addr_terms)

    def _compute_name_tokens(self, names):
        """ Computes the full name and partial name tokens for the given
            dictionary of names.
        """
        full_names = self._compute_full_names(names)
        full_tokens = set()
        partial_tokens = set()

        for name in full_names:
            norm_name = self.name_processor.get_normalized(name)
            full, part = self._cache.names.get(norm_name, (None, None))
            if full is None:
                variants = self.name_processor.get_variants_ascii(norm_name)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                                (norm_name, variants))
                    full, part = cur.fetchone()

                self._cache.names[norm_name] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens

    @staticmethod
    def _compute_full_names(names):
        """ Return the set of all full names to be used with the
            given dictionary of names.
        """
        full_names = set()
        for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
            if name:
                full_names.add(name)

                brace_idx = name.find('(')
                if brace_idx >= 0:
                    full_names.add(name[:brace_idx].strip())

        return full_names

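    # Example (illustrative): {'name': 'Hauptstrasse;Main Street (old)'} yields
    # {'Hauptstrasse', 'Main Street (old)', 'Main Street'} - names are split on
    # ';' and ',' and the part before a bracket is added as an extra variant.
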
    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        if re.search(r'[:,;]', postcode) is None:
            postcode = self.normalize_postcode(postcode)

            if postcode not in self._cache.postcodes:
                term = self.name_processor.get_search_normalized(postcode)

                with self.conn.cursor() as cur:
                    # no word_id needed for postcodes
                    cur.execute("""INSERT INTO word (word_token, type, word)
                                   (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
                                    WHERE NOT EXISTS
                                     (SELECT * FROM word
                                      WHERE type = 'P' and word = pc))
                                """, (term, postcode))
                self._cache.postcodes.add(postcode)

    @staticmethod
    def _split_housenumbers(hnrs):
        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
            # split numbers if necessary
            simple_list = []
            for hnr in hnrs:
                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

            if len(simple_list) > 1:
                hnrs = list(set(simple_list))
            else:
                hnrs = simple_list

        return hnrs

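    # Example: ['45;47', '12'] is flattened to the distinct numbers
    # '45', '47' and '12' (in no particular order, since a set is used).
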
class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self, cache):
        self._cache = cache
        self.data = {}

    @staticmethod
    def _mk_array(tokens):
        return '{%s}' % ','.join((str(s) for s in tokens))

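    # Example: _mk_array([1, 2, 3]) returns '{1,2,3}', i.e. a PostgreSQL array
    # literal built from the token ids.
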
    def add_names(self, fulls, partials):
        """ Adds token information for the normalised names.
        """
        self.data['names'] = self._mk_array(itertools.chain(fulls, partials))

    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
        self.data['hnr'] = ';'.join(hnrs)

    def add_street(self, fulls, _):
        """ Add addr:street match terms.
        """
        self.data['street'] = self._mk_array(fulls)

    def add_place(self, fulls, partials):
        """ Add addr:place search and match terms.
        """
        self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
        self.data['place_match'] = self._mk_array(fulls)

    def add_address_terms(self, terms):
        """ Add additional address terms.
        """
        tokens = {}

        for key, fulls, partials in terms:
            if fulls:
                tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
                               self._mk_array(fulls)]

        if tokens:
            self.data['addr'] = tokens

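    # Resulting shape (illustrative): each address key maps to a pair of
    # [search array, match array], e.g.
    #   {'city': ['{10,11}', '{10}'], 'suburb': ['{12}', '{12}']}
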
class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self):
        self.names = {}
        self.postcodes = set()
        self.housenumbers = {}

    def get_hnr_tokens(self, conn, terms):
        """ Get token ids for a list of housenumbers, looking them up in the
            database if necessary. `terms` is an iterable of normalized
            housenumbers.
        """
        tokens = []
        askdb = []
        for term in terms:
            token = self.housenumbers.get(term)
            if token is None:
                askdb.append(term)
            else:
                tokens.append(token)

        if askdb:
            with conn.cursor() as cur:
                cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
                            (askdb, ))
                for term, tid in cur:
                    self.housenumbers[term] = tid
                    tokens.append(tid)

        return tokens