# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
import itertools
import json
import logging
import re
from textwrap import dedent

from nominatim.db.connection import connect
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.indexer.place_info import PlaceInfo
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()
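
# Single-letter `type` codes used for rows in the word table by the queries in
# this module: 'W' full-name tokens, 'w' partial-name tokens, 'P' postcodes,
# 'S' special phrases and 'C' country names. (Summary derived from the SQL
# statements below; the authoritative definitions live in the tokenizer's SQL
# files.)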


def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)


class LegacyICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.loader = None


    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data into the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        self.loader = ICURuleLoader(config)

        self._install_php(config.lib_dir.php)
        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)


    def init_from_project(self, config):
        """ Initialise the tokenizer from the project directory.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)


    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')


    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')


    def check_database(self, config):
        """ Check that the tokenizer is set up correctly.
        """
        # Will throw an error if there is an issue.
        self.init_from_project(config)


    def update_statistics(self):
        """ Recompute frequencies for all name words.
        """
        with connect(self.dsn) as conn:
            if conn.table_exists('search_name'):
                with conn.cursor() as cur:
                    cur.drop_table("word_frequencies")
                    LOG.info("Computing word frequencies")
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute("CREATE INDEX ON word_frequencies(id)")
                    LOG.info("Update word table with recomputed frequencies")
                    cur.execute("""UPDATE word
                                   SET info = info || jsonb_build_object('count', count)
                                   FROM word_frequencies WHERE word_id = id""")
                    cur.drop_table("word_frequencies")
            conn.commit()
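
    # Note: after update_statistics() each full-word row carries its recomputed
    # frequency in the JSON `info` column, roughly info -> {'count': <n>}
    # (shape inferred from the UPDATE statement above; values are data-dependent).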


    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

            ```
            with tokenizer.name_analyzer() as analyzer:
                analyzer.tokenize()
            ```

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                                     self.loader.make_token_analysis())


    def _install_php(self, phpdir):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent(f"""\
            <?php
            @define('CONST_Max_Word_Frequency', 10000000);
            @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
            @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
            require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))


    def _save_config(self):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)


    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
            conn.commit()


class LegacyICUNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, sanitizer, token_analysis):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.sanitizer = sanitizer
        self.token_analysis = token_analysis

        self._cache = _TokenCache()


    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    def _search_normalized(self, name):
        """ Return the search token transliteration of the given name.
        """
        return self.token_analysis.search.transliterate(name).strip()


    def _normalized(self, name):
        """ Return the normalized version of the given name with all
            non-relevant information removed.
        """
        return self.token_analysis.normalizer.transliterate(name).strip()


    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is taken as a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not necessarily efficient.
        """
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self._search_normalized(word[1:])
            else:
                partial_tokens[word] = self._search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
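
    # Illustrative call (token ids depend entirely on the database; the values
    # below are made up):
    #
    #   analyzer.get_word_token_info(['#Main Street', 'main'])
    #   -> [('#Main Street', 'main street', 123), ('main', 'main', 456)]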


    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()
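
    # Example: normalize_postcode(' ab1 2cd ') returns 'AB1 2CD'. The function
    # only strips surrounding whitespace and upper-cases; it does not validate
    # country-specific postcode formats.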


    def _make_standard_hnr(self, hnr):
        """ Create a normalised version of a housenumber.

            This function takes minor shortcuts on transliteration.
        """
        return self._search_normalized(hnr)


    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        to_delete = []
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word WHERE type = 'P') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            with CopyBuffer() as copystr:
                for postcode, word in cur:
                    if postcode is None:
                        to_delete.append(word)
                    else:
                        copystr.add(self._search_normalized(postcode),
                                    'P', postcode)

                if to_delete:
                    cur.execute("""DELETE FROM WORD
                                   WHERE type ='P' and word = any(%s)
                                """, (to_delete, ))

                copystr.copy_out(cur, 'word',
                                 columns=['word_token', 'type', 'word'])


    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases will be
            completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)
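
    # Each phrase is expected as a 4-tuple (label, class, type, operator), e.g.
    # ('bar', 'amenity', 'bar', '-'); format inferred from the tuple handling
    # above, with '-' standing for "no operator".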


    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self._search_normalized(word)
                if term:
                    copystr.add(term, 'S', word,
                                json.dumps({'class': cls, 'type': typ,
                                            'op': oper if oper in ('in', 'near') else None}))
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word_token', 'type', 'word', 'info'])

        return added


    @staticmethod
    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.execute_values(
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE type = 'S' and word = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
                """, to_delete)

        return len(to_delete)
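
    # `cursor.execute_values` expands the VALUES %s placeholder with the list of
    # tuples in a single batched statement (psycopg2-style execute_values);
    # assumption based on the call signature used here.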


    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        # Make sure any name preprocessing for country names applies.
        info = PlaceInfo({'name': names, 'country_code': country_code,
                          'rank_address': 4, 'class': 'boundary',
                          'type': 'administrative'})
        self._add_country_full_names(country_code,
                                     self.sanitizer.process_names(info)[0])


    def _add_country_full_names(self, country_code, names):
        """ Add names for the given country from an already sanitized
            name list.
        """
        word_tokens = set()
        for name in names:
            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            # Get the names that are already in the word table.
            cur.execute("""SELECT word_token FROM word
                            WHERE type = 'C' and word = %s""",
                        (country_code, ))
            word_tokens.difference_update((t[0] for t in cur))

            # Only add those names that are not yet in the list.
            if word_tokens:
                cur.execute("""INSERT INTO word (word_token, type, word)
                               (SELECT token, 'C', %s
                                FROM unnest(%s) as token)
                            """, (country_code, list(word_tokens)))

            # No names are deleted at the moment.
            # If deletion is made possible, then the static names from the
            # initial 'country_name' table should be kept.


    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serializable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names, address = self.sanitizer.process_names(place)

        if names:
            fulls, partials = self._compute_name_tokens(names)

            token_info.add_names(fulls, partials)

            if place.is_country():
                self._add_country_full_names(place.country_code, names)

        if address:
            self._process_place_address(token_info, address)

        return token_info.data
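
    # A sketch of the structure returned via token_info.data (keys are only
    # present when the corresponding information exists; array strings are
    # PostgreSQL array literals; the values below are made up):
    #
    #   {'names': '{1,2,3}', 'hnr_tokens': '{4}', 'hnr': '12;12a',
    #    'street': '{5}', 'place': '{6}', 'addr': {'city': '{7}'}}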


    def _process_place_address(self, token_info, address):
        hnrs = []
        addr_terms = []
        streets = []
        for item in address:
            if item.kind == 'postcode':
                self._add_postcode(item.name)
            elif item.kind == 'housenumber':
                norm_name = self._make_standard_hnr(item.name)
                if norm_name:
                    hnrs.append(norm_name)
            elif item.kind == 'street':
                streets.extend(self._retrieve_full_tokens(item.name))
            elif item.kind == 'place':
                if not item.suffix:
                    token_info.add_place(self._compute_partial_tokens(item.name))
            elif not item.kind.startswith('_') and not item.suffix and \
                 item.kind not in ('country', 'full'):
                addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))

        if hnrs:
            token_info.add_housenumbers(self.conn, hnrs)

        if addr_terms:
            token_info.add_address_terms(addr_terms)

        if streets:
            token_info.add_street(streets)


    def _compute_partial_tokens(self, name):
        """ Normalize the given term, split it into partial words and return
            the token list for them.
        """
        norm_name = self._search_normalized(name)

        tokens = []
        need_lookup = []
        for partial in norm_name.split():
            token = self._cache.partials.get(partial)
            if token:
                tokens.append(token)
            else:
                need_lookup.append(partial)

        if need_lookup:
            with self.conn.cursor() as cur:
                cur.execute("""SELECT word, getorcreate_partial_word(word)
                               FROM unnest(%s) word""",
                            (need_lookup, ))

                for partial, token in cur:
                    tokens.append(token)
                    self._cache.partials[partial] = token

        return tokens
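
    # Partial-word tokens are created on demand by the SQL function
    # getorcreate_partial_word() and memoised in self._cache.partials, so each
    # distinct partial hits the database at most once per analyzer instance.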


    def _retrieve_full_tokens(self, name):
        """ Get the full name token for the given name, if it exists.
            The name is only retrieved for the standard analyzer.
        """
        norm_name = self._search_normalized(name)

        # return cached if possible
        if norm_name in self._cache.fulls:
            return self._cache.fulls[norm_name]

        with self.conn.cursor() as cur:
            cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
                        (norm_name, ))
            full = [row[0] for row in cur]

        self._cache.fulls[norm_name] = full

        return full


    def _compute_name_tokens(self, names):
        """ Computes the full name and partial name tokens for the given
            list of names.
        """
        full_tokens = set()
        partial_tokens = set()

        for name in names:
            analyzer_id = name.get_attr('analyzer')
            norm_name = self._normalized(name.name)
            if analyzer_id is None:
                token_id = norm_name
            else:
                token_id = f'{norm_name}@{analyzer_id}'

            full, part = self._cache.names.get(token_id, (None, None))
            if full is None:
                variants = self.token_analysis.analysis[analyzer_id].get_variants_ascii(norm_name)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                                (token_id, variants))
                    full, part = cur.fetchone()

                self._cache.names[token_id] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens
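
    # The cache key combines the normalised name with the analyzer id when a
    # non-default analysis module is configured (e.g. a hypothetical
    # 'bruxelles@fr'); for the default analyzer the plain normalised name is
    # used.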


    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        if re.search(r'[:,;]', postcode) is None:
            postcode = self.normalize_postcode(postcode)

            if postcode not in self._cache.postcodes:
                term = self._search_normalized(postcode)
                if not term:
                    return

                with self.conn.cursor() as cur:
                    # no word_id needed for postcodes
                    cur.execute("""INSERT INTO word (word_token, type, word)
                                   (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
                                    WHERE NOT EXISTS
                                     (SELECT * FROM word
                                      WHERE type = 'P' and word = pc))
                                """, (term, postcode))
                self._cache.postcodes.add(postcode)


class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self, cache):
        self._cache = cache
        self.data = {}

    @staticmethod
    def _mk_array(tokens):
        return '{%s}' % ','.join((str(s) for s in tokens))
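
    # _mk_array() renders tokens as a PostgreSQL array literal, e.g.
    # _mk_array([1, 2, 3]) -> '{1,2,3}'.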


    def add_names(self, fulls, partials):
        """ Adds token information for the normalised names.
        """
        self.data['names'] = self._mk_array(itertools.chain(fulls, partials))


    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
        self.data['hnr'] = ';'.join(hnrs)


    def add_street(self, tokens):
        """ Add addr:street match terms.
        """
        self.data['street'] = self._mk_array(tokens)


    def add_place(self, tokens):
        """ Add addr:place search and match terms.
        """
        if tokens:
            self.data['place'] = self._mk_array(tokens)


    def add_address_terms(self, terms):
        """ Add additional address terms.
        """
        tokens = {key: self._mk_array(partials)
                  for key, partials in terms if partials}

        if tokens:
            self.data['addr'] = tokens


class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self):
        self.names = {}
        self.partials = {}
        self.fulls = {}
        self.postcodes = set()
        self.housenumbers = {}


    def get_hnr_tokens(self, conn, terms):
        """ Get token ids for a list of housenumbers, looking them up in the
            database if necessary. `terms` is an iterable of normalized
            housenumbers.
        """
        tokens = []
        askdb = []
        for term in terms:
            token = self.housenumbers.get(term)
            if token is None:
                askdb.append(term)
            else:
                tokens.append(token)

        if askdb:
            with conn.cursor() as cur:
                cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
                            (askdb, ))
                for term, tid in cur:
                    self.housenumbers[term] = tid
                    tokens.append(tid)

        return tokens