2 Tokenizer implementing normalisation as used before Nominatim 4 but using
3 libICU instead of the PostgreSQL module.
9 from textwrap import dedent
11 from nominatim.db.connection import connect
12 from nominatim.db.utils import CopyBuffer
13 from nominatim.db.sql_preprocessor import SQLPreprocessor
14 from nominatim.indexer.place_info import PlaceInfo
15 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
16 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
18 DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
20 LOG = logging.getLogger()
22 def create(dsn, data_dir):
23 """ Create a new instance of the tokenizer provided by this module.
25 return LegacyICUTokenizer(dsn, data_dir)
28 class LegacyICUTokenizer(AbstractTokenizer):
29 """ This tokenizer uses libICU to covert names and queries to ASCII.
30 Otherwise it uses the same algorithms and data structures as the
31 normalization routines in Nominatim 3.
34 def __init__(self, dsn, data_dir):
36 self.data_dir = data_dir
40 def init_new_db(self, config, init_db=True):
41 """ Set up a new tokenizer for the database.
43 This copies all necessary data into the project directory to make
44 sure the tokenizer remains stable even across updates.
46 self.loader = ICURuleLoader(config)
48 self._install_php(config.lib_dir.php)
52 self.update_sql_functions(config)
53 self._init_db_tables(config)
56 def init_from_project(self, config):
57 """ Initialise the tokenizer from the project directory.
59 self.loader = ICURuleLoader(config)
61 with connect(self.dsn) as conn:
62 self.loader.load_config_from_db(conn)
65 def finalize_import(self, config):
66 """ Do any required postprocessing to make the tokenizer data ready
69 with connect(self.dsn) as conn:
70 sqlp = SQLPreprocessor(conn, config)
71 sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
74 def update_sql_functions(self, config):
75 """ Reimport the SQL functions for this tokenizer.
77 with connect(self.dsn) as conn:
78 sqlp = SQLPreprocessor(conn, config)
79 sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
82 def check_database(self, config):
83 """ Check that the tokenizer is set up correctly.
85 # Will throw an error if there is an issue.
86 self.init_from_project(config)
89 def update_statistics(self):
90 """ Recompute frequencies for all name words.
92 with connect(self.dsn) as conn:
93 if conn.table_exists('search_name'):
94 with conn.cursor() as cur:
95 cur.drop_table("word_frequencies")
96 LOG.info("Computing word frequencies")
97 cur.execute("""CREATE TEMP TABLE word_frequencies AS
98 SELECT unnest(name_vector) as id, count(*)
99 FROM search_name GROUP BY id""")
100 cur.execute("CREATE INDEX ON word_frequencies(id)")
101 LOG.info("Update word table with recomputed frequencies")
102 cur.execute("""UPDATE word
103 SET info = info || jsonb_build_object('count', count)
104 FROM word_frequencies WHERE word_id = id""")
105 cur.drop_table("word_frequencies")
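# Note on the UPDATE above: because the jsonb '||' operator merges objects, the
# recomputed frequency is added to the info column without discarding other
# keys, e.g. (illustrative value) info ends up containing {'count': 1523}.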
109 def name_analyzer(self):
110 """ Create a new analyzer for tokenizing names and queries
111 using this tokenizer. Analyzers are context managers and should
115 with tokenizer.name_analyzer() as analyzer:
119 When used outside the with construct, the caller must make sure
120 that close() is called before the analyzer is destroyed.
122 Analyzers are not thread-safe. You need to instantiate one per thread.
124 return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
125 self.loader.make_token_analysis())
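# Usage sketch (illustrative; assumes `tokenizer` is an initialised
# LegacyICUTokenizer and the place dictionary is made up):
#
#     with tokenizer.name_analyzer() as analyzer:
#         token_info = analyzer.process_place(
#             PlaceInfo({'name': {'name': 'Main Street'}}))
#
# The with-statement guarantees that the database connection held by the
# analyzer is closed again, even if processing raises an exception.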
128 def _install_php(self, phpdir):
129 """ Install the php script for the tokenizer.
131 php_file = self.data_dir / "tokenizer.php"
132 php_file.write_text(dedent(f"""\
134 @define('CONST_Max_Word_Frequency', 10000000);
135 @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
136 @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
137 require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
140 def _save_config(self):
141 """ Save the configuration that needs to remain stable for the given
142 database as database properties.
144 with connect(self.dsn) as conn:
145 self.loader.save_config_to_db(conn)
148 def _init_db_tables(self, config):
149 """ Set up the word table and fill it with pre-computed word
152 with connect(self.dsn) as conn:
153 sqlp = SQLPreprocessor(conn, config)
154 sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
158 class LegacyICUNameAnalyzer(AbstractAnalyzer):
159 """ The legacy analyzer uses the ICU library for splitting names.
161 Each instance opens a connection to the database to request the
165 def __init__(self, dsn, sanitizer, token_analysis):
166 self.conn = connect(dsn).connection
167 self.conn.autocommit = True
168 self.sanitizer = sanitizer
169 self.token_analysis = token_analysis
171 self._cache = _TokenCache()
175 """ Free all resources used by the analyzer.
182 def _search_normalized(self, name):
183 """ Return the search token transliteration of the given name.
185 return self.token_analysis.search.transliterate(name).strip()
188 def _normalized(self, name):
189 """ Return the normalized version of the given name with all
190 non-relevant information removed.
192 return self.token_analysis.normalizer.transliterate(name).strip()
195 def get_word_token_info(self, words):
196 """ Return token information for the given list of words.
197 If a word starts with # it is assumed to be a full name,
198 otherwise it is taken to be a partial name.
200 The function returns a list of tuples with
201 (original word, word token, word id).
203 The function is used for testing and debugging only
204 and is not necessarily efficient.
209 if word.startswith('#'):
210 full_tokens[word] = self._search_normalized(word[1:])
212 partial_tokens[word] = self._search_normalized(word)
214 with self.conn.cursor() as cur:
215 cur.execute("""SELECT word_token, word_id
216 FROM word WHERE word_token = ANY(%s) and type = 'W'
217 """, (list(full_tokens.values()),))
218 full_ids = {r[0]: r[1] for r in cur}
219 cur.execute("""SELECT word_token, word_id
220 FROM word WHERE word_token = ANY(%s) and type = 'w'""",
221 (list(partial_tokens.values()),))
222 part_ids = {r[0]: r[1] for r in cur}
224 return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
225 + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
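# Example of the return format (word tokens and ids are made up; a token
# that is not in the word table yields None as id):
#
#     >>> analyzer.get_word_token_info(['#Main Street', 'main'])
#     [('#Main Street', 'main street', 123), ('main', 'main', 456)]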
229 def normalize_postcode(postcode):
230 """ Convert the postcode to a standardized form.
232 This function must yield exactly the same result as the SQL function
233 'token_normalized_postcode()'.
235 return postcode.strip().upper()
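# For example (illustrative):
#
#     >>> analyzer.normalize_postcode(' ab1 2cd ')
#     'AB1 2CD'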
238 def _make_standard_hnr(self, hnr):
239 """ Create a normalised version of a housenumber.
241 This function takes minor shortcuts on transliteration.
243 return self._search_normalized(hnr)
245 def update_postcodes_from_db(self):
246 """ Update postcode tokens in the word table from the location_postcode
250 with self.conn.cursor() as cur:
251 # This finds the rows in location_postcode and word that are
252 # missing from the other table.
253 cur.execute("""SELECT * FROM
254 (SELECT pc, word FROM
255 (SELECT distinct(postcode) as pc FROM location_postcode) p
257 (SELECT word FROM word WHERE type = 'P') w
259 WHERE pc is null or word is null""")
261 with CopyBuffer() as copystr:
262 for postcode, word in cur:
264 to_delete.append(word)
266 copystr.add(self._search_normalized(postcode),
270 cur.execute("""DELETE FROM WORD
271 WHERE type = 'P' and word = any(%s)
274 copystr.copy_out(cur, 'word',
275 columns=['word_token', 'type', 'word'])
278 def update_special_phrases(self, phrases, should_replace):
279 """ Replace the search index for special phrases with the new phrases.
280 If `should_replace` is True, then the previous set of phrases will be
281 completely replaced. Otherwise the phrases are added to the
282 already existing ones.
284 norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
287 with self.conn.cursor() as cur:
288 # Get the old phrases.
289 existing_phrases = set()
290 cur.execute("SELECT word, info FROM word WHERE type = 'S'")
291 for word, info in cur:
292 existing_phrases.add((word, info['class'], info['type'],
293 info.get('op') or '-'))
295 added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
297 deleted = self._remove_special_phrases(cur, norm_phrases,
302 LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
303 len(norm_phrases), added, deleted)
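# Shape of the phrase tuples handled above (values are illustrative): each
# phrase is (label, class, type, operator), e.g.
#
#     ('hotel in', 'tourism', 'hotel', 'in')
#
# Phrases without an operator are compared under the placeholder '-' because
# the 'op' key may be missing from the stored info.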
306 def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
307 """ Add all phrases to the database that are not yet there.
309 to_add = new_phrases - existing_phrases
312 with CopyBuffer() as copystr:
313 for word, cls, typ, oper in to_add:
314 term = self._search_normalized(word)
316 copystr.add(term, 'S', word,
317 json.dumps({'class': cls, 'type': typ,
318 'op': oper if oper in ('in', 'near') else None}))
321 copystr.copy_out(cursor, 'word',
322 columns=['word_token', 'type', 'word', 'info'])
328 def _remove_special_phrases(cursor, new_phrases, existing_phrases):
329 """ Remove all phrases from the databse that are no longer in the
332 to_delete = existing_phrases - new_phrases
335 cursor.execute_values(
336 """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
337 WHERE type = 'S' and word = name
338 and info->>'class' = in_class and info->>'type' = in_type
339 and ((op = '-' and info->>'op' is null) or op = info->>'op')
342 return len(to_delete)
345 def add_country_names(self, country_code, names):
346 """ Add names for the given country to the search index.
348 # Make sure any name preprocessing for country names applies.
349 info = PlaceInfo({'name': names, 'country_code': country_code,
350 'rank_address': 4, 'class': 'boundary',
351 'type': 'administrative'})
352 self._add_country_full_names(country_code,
353 self.sanitizer.process_names(info)[0])
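# Usage sketch for the entry point above (illustrative name dictionary):
#
#     analyzer.add_country_names('de', {'name': 'Deutschland',
#                                       'name:en': 'Germany'})
#
# The names are run through the configured sanitizers first, so country
# names receive the same preprocessing as names of ordinary places.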
356 def _add_country_full_names(self, country_code, names):
357 """ Add names for the given country from an already sanitized
362 norm_name = self._search_normalized(name.name)
364 word_tokens.add(norm_name)
366 with self.conn.cursor() as cur:
368 cur.execute("""SELECT word_token FROM word
369 WHERE type = 'C' and word = %s""",
371 word_tokens.difference_update((t[0] for t in cur))
373 # Only add those names that are not yet in the list.
375 cur.execute("""INSERT INTO word (word_token, type, word)
376 (SELECT token, 'C', %s
377 FROM unnest(%s) as token)
378 """, (country_code, list(word_tokens)))
380 # No names are deleted at the moment.
381 # If deletion is made possible, then the static names from the
382 # initial 'country_name' table should be kept.
385 def process_place(self, place):
386 """ Determine tokenizer information about the given place.
388 Returns a JSON-serializable structure that will be handed into
389 the database via the token_info field.
391 token_info = _TokenInfo(self._cache)
393 names, address = self.sanitizer.process_names(place)
396 fulls, partials = self._compute_name_tokens(names)
398 token_info.add_names(fulls, partials)
400 if place.is_country():
401 self._add_country_full_names(place.country_code, names)
404 self._process_place_address(token_info, address)
406 return token_info.data
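# Illustrative shape of the returned structure (token ids are made up; only
# the keys for which the corresponding add_* call happened are present):
#
#     {'names': '{372,1032,533}',
#      'hnr_tokens': '{342,343}',
#      'hnr': '12;14',
#      'street': '{562}',
#      'place': '{893}',
#      'addr': {'city': '{224,532}'}}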
409 def _process_place_address(self, token_info, address):
413 if item.kind == 'postcode':
414 self._add_postcode(item.name)
415 elif item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'):
416 hnrs.append(item.name)
417 elif item.kind == 'street':
418 token_info.add_street(self._compute_partial_tokens(item.name))
419 elif item.kind == 'place':
420 token_info.add_place(self._compute_partial_tokens(item.name))
421 elif not item.kind.startswith('_') and \
422 item.kind not in ('country', 'full'):
423 addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))
426 hnrs = self._split_housenumbers(hnrs)
427 token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
430 token_info.add_address_terms(addr_terms)
433 def _compute_partial_tokens(self, name):
434 """ Normalize the given term, split it into partial words and return
435 the token list for them.
437 norm_name = self._search_normalized(name)
441 for partial in norm_name.split():
442 token = self._cache.partials.get(partial)
446 need_lookup.append(partial)
449 with self.conn.cursor() as cur:
450 cur.execute("""SELECT word, getorcreate_partial_word(word)
451 FROM unnest(%s) word""",
454 for partial, token in cur:
456 self._cache.partials[partial] = token
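# Sketch of the flow above (illustrative): a name transliterated to
# 'rue de la paix' is split into the partials ['rue', 'de', 'la', 'paix'];
# partials seen before come from self._cache.partials, the remaining ones
# are created in the database via getorcreate_partial_word().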
461 def _compute_name_tokens(self, names):
462 """ Computes the full name and partial name tokens for the given
466 partial_tokens = set()
469 analyzer_id = name.get_attr('analyzer')
470 norm_name = self._normalized(name.name)
471 if analyzer_id is None:
474 token_id = f'{norm_name}@{analyzer_id}'
476 full, part = self._cache.names.get(token_id, (None, None))
478 variants = self.token_analysis.analysis[analyzer_id].get_variants_ascii(norm_name)
482 with self.conn.cursor() as cur:
483 cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
484 (token_id, variants))
485 full, part = cur.fetchone()
487 self._cache.names[token_id] = (full, part)
489 full_tokens.add(full)
490 partial_tokens.update(part)
492 return full_tokens, partial_tokens
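# Cache key format used above (illustrative): a name handled by the default
# analysis appears to be cached under the plain normalised name, e.g.
# 'marktplatz', while a name with an 'analyzer' attribute gets the id
# appended, e.g. 'marktplatz@de'.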
495 def _add_postcode(self, postcode):
496 """ Make sure the normalized postcode is present in the word table.
498 if re.search(r'[:,;]', postcode) is None:
499 postcode = self.normalize_postcode(postcode)
501 if postcode not in self._cache.postcodes:
502 term = self._search_normalized(postcode)
506 with self.conn.cursor() as cur:
507 # no word_id needed for postcodes
508 cur.execute("""INSERT INTO word (word_token, type, word)
509 (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
512 WHERE type = 'P' and word = pc))
513 """, (term, postcode))
514 self._cache.postcodes.add(postcode)
518 def _split_housenumbers(hnrs):
519 if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
520 # split numbers if necessary
523 simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
525 if len(simple_list) > 1:
526 hnrs = list(set(simple_list))
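# For example (illustrative): both ['1;2;3'] and ['1,2', '3'] are split into
# the individual numbers '1', '2' and '3'; duplicates are removed and the
# order of the result is not guaranteed because a set is used.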
536 """ Collect token information to be sent back to the database.
538 def __init__(self, cache):
543 def _mk_array(tokens):
544 return '{%s}' % ','.join((str(s) for s in tokens))
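# For example (illustrative):
#
#     >>> _TokenInfo._mk_array([372, 1032, 533])
#     '{372,1032,533}'
#
# which is the literal form expected for a PostgreSQL array column.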
547 def add_names(self, fulls, partials):
548 """ Adds token information for the normalised names.
550 self.data['names'] = self._mk_array(itertools.chain(fulls, partials))
553 def add_housenumbers(self, conn, hnrs):
554 """ Extract housenumber information from a list of normalised
557 self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
558 self.data['hnr'] = ';'.join(hnrs)
561 def add_street(self, tokens):
562 """ Add addr:street match terms.
565 self.data['street'] = self._mk_array(tokens)
568 def add_place(self, tokens):
569 """ Add addr:place search and match terms.
572 self.data['place'] = self._mk_array(tokens)
575 def add_address_terms(self, terms):
576 """ Add additional address terms.
578 tokens = {key: self._mk_array(partials)
579 for key, partials in terms if partials}
582 self.data['addr'] = tokens
586 """ Cache for token information to avoid repeated database queries.
588 This cache is not thread-safe and needs to be instantiated per
594 self.postcodes = set()
595 self.housenumbers = {}
598 def get_hnr_tokens(self, conn, terms):
599 """ Get token ids for a list of housenumbers, looking them up in the
600 database if necessary. `terms` is an iterable of normalized
607 token = self.housenumbers.get(term)
614 with conn.cursor() as cur:
615 cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
617 for term, tid in cur:
618 self.housenumbers[term] = tid
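# Usage sketch (token ids are made up): get_hnr_tokens(conn, ['45', '45a'])
# yields the corresponding word ids, e.g. 342 and 343; housenumbers seen
# before are served from self.housenumbers, the rest are created through
# the SQL function getorcreate_hnr_id().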