# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
import itertools
import json
import logging
import re
from textwrap import dedent

from nominatim.db.connection import connect
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.indexer.place_info import PlaceInfo
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()
def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)
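

# Rough usage sketch (illustrative only): `dsn`, `config`, `project_dir` and
# `place` are assumed to come from the surrounding Nominatim setup and are not
# defined in this module.
#
#   tokenizer = create(dsn, project_dir / 'tokenizer')
#   tokenizer.init_from_project(config)
#   with tokenizer.name_analyzer() as analyzer:
#       token_info = analyzer.process_place(place)   # place is a PlaceInfo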

class LegacyICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.data_dir = data_dir

    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        self.loader = ICURuleLoader(config)

        self._install_php(config.lib_dir.php, overwrite=True)

            self.update_sql_functions(config)
            self._init_db_tables(config)

    def init_from_project(self, config):
        """ Initialise the tokenizer from the project directory.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)

        self._install_php(config.lib_dir.php, overwrite=False)

    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')

    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')


    def check_database(self, config):
        """ Check that the tokenizer is set up correctly.
        """
        # Will throw an error if there is an issue.
        self.init_from_project(config)

    def update_statistics(self):
        """ Recompute frequencies for all name words.
        """
        with connect(self.dsn) as conn:
            if conn.table_exists('search_name'):
                with conn.cursor() as cur:
                    cur.drop_table("word_frequencies")
                    LOG.info("Computing word frequencies")
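                    # Count how often each word token occurs in search_name.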
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute("CREATE INDEX ON word_frequencies(id)")
                    LOG.info("Update word table with recomputed frequencies")
                    cur.execute("""UPDATE word
                                   SET info = info || jsonb_build_object('count', count)
                                   FROM word_frequencies WHERE word_id = id""")
                    cur.drop_table("word_frequencies")

    def _cleanup_housenumbers(self):
        """ Remove unused house numbers.
        """
        with connect(self.dsn) as conn:
            if not conn.table_exists('search_name'):
                return
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
                                 AND NOT EXISTS(SELECT * FROM search_name
                                                WHERE ARRAY[word.word_id] && name_vector)
                                 AND (char_length(coalesce(word, word_token)) > 6
                                      OR coalesce(word, word_token) not similar to '\\d+')
                            """)
                candidates = {token: wid for wid, token in cur}
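            # Drop candidates that still occur as housenumbers in placex.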
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT housenumber FROM placex
                               WHERE housenumber is not null
                                     AND (char_length(housenumber) > 6
                                          OR housenumber not similar to '\\d+')""")
                    for hnr in row[0].split(';'):
                        candidates.pop(hnr, None)
            LOG.info("There are %s outdated housenumbers.", len(candidates))
            LOG.debug("Outdated housenumbers: %s", candidates.keys())
                with conn.cursor() as cur:
                    cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
                                (list(candidates.values()), ))

    def update_word_tokens(self):
        """ Remove unused tokens.
        """
        LOG.warning("Cleaning up housenumber tokens.")
        self._cleanup_housenumbers()
        LOG.warning("Tokenizer house-keeping done.")

    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                                     self.loader.make_token_analysis())

    def _install_php(self, phpdir, overwrite=True):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"

        if not php_file.exists() or overwrite:
            php_file.write_text(dedent(f"""\
                @define('CONST_Max_Word_Frequency', 10000000);
                @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
                @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
                require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')


    def _save_config(self):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)

    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')


class LegacyICUNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, sanitizer, token_analysis):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.sanitizer = sanitizer
        self.token_analysis = token_analysis

        self._cache = _TokenCache()

    def close(self):
        """ Free all resources used by the analyzer.
        """

    def _search_normalized(self, name):
        """ Return the search token transliteration of the given name.
        """
        return self.token_analysis.search.transliterate(name).strip()


    def _normalized(self, name):
        """ Return the normalized version of the given name with all
            non-relevant information removed.
        """
        return self.token_analysis.normalizer.transliterate(name).strip()

    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not necessarily efficient.
        """
            if word.startswith('#'):
                full_tokens[word] = self._search_normalized(word[1:])
                partial_tokens[word] = self._search_normalized(word)
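
        # Look up full names (type 'W') and partial names (type 'w') separately.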
        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]

    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()

    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              (SELECT word FROM word WHERE type = 'P') w
                           WHERE pc is null or word is null""")
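
            # Copy new postcodes into the word table and delete those that
            # have disappeared from location_postcode.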
            with CopyBuffer() as copystr:
                for postcode, word in cur:
                        to_delete.append(word)
                        copystr.add(self._search_normalized(postcode),

                    cur.execute("""DELETE FROM WORD
                                   WHERE type ='P' and word = any(%s)

                copystr.copy_out(cur, 'word',
                                 columns=['word_token', 'type', 'word'])

    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases will be
            completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))
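
            # Compare against the new set to decide what to add and what to remove.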
            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
                deleted = self._remove_special_phrases(cur, norm_phrases,

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)

    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases
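
        # Copy the new phrases into the word table in a single COPY operation.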
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self._search_normalized(word)
                    copystr.add(term, 'S', word,
                                json.dumps({'class': cls, 'type': typ,
                                            'op': oper if oper in ('in', 'near') else None}))

            copystr.copy_out(cursor, 'word',
                             columns=['word_token', 'type', 'word', 'info'])

    @staticmethod
    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases
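
            # Remove the obsolete phrases with a single DELETE statement.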
            cursor.execute_values(
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE type = 'S' and word = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')

        return len(to_delete)

    def add_country_names(self, country_code, names):
        """ Add default names for the given country to the search index.
        """
        # Make sure any name preprocessing for country names applies.
        info = PlaceInfo({'name': names, 'country_code': country_code,
                          'rank_address': 4, 'class': 'boundary',
                          'type': 'administrative'})
        self._add_country_full_names(country_code,
                                     self.sanitizer.process_names(info)[0],

    def _add_country_full_names(self, country_code, names, internal=False):
        """ Add names for the given country from an already sanitized
            name list.
        """
            norm_name = self._search_normalized(name.name)
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
                             WHERE type = 'C' and word = %s""",
            existing_tokens = {True: set(), False: set()}  # internal/external names
                existing_tokens[word[1]].add(word[0])

            # Delete names that no longer exist.
            gone_tokens = existing_tokens[internal] - word_tokens
                gone_tokens.update(existing_tokens[False] & word_tokens)
                cur.execute("""DELETE FROM word
                               USING unnest(%s) as token
                               WHERE type = 'C' and word = %s
                                 and word_token = token""",
                            (list(gone_tokens), country_code))

            # Only add those names that are not yet in the list.
            new_tokens = word_tokens - existing_tokens[True]
                new_tokens -= existing_tokens[False]
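
                    # Names added by Nominatim itself are marked as internal
                    # in the info column.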
                    sql = """INSERT INTO word (word_token, type, word, info)
                               (SELECT token, 'C', %s, '{"internal": "yes"}'
                                  FROM unnest(%s) as token)
                           """

                    sql = """INSERT INTO word (word_token, type, word)
                               (SELECT token, 'C', %s
                                  FROM unnest(%s) as token)
                           """
                cur.execute(sql, (country_code, list(new_tokens)))

    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serializable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo()

        names, address = self.sanitizer.process_names(place)

            token_info.set_names(*self._compute_name_tokens(names))

            if place.is_country():
                self._add_country_full_names(place.country_code, names)

            self._process_place_address(token_info, address)

        return token_info.to_dict()

    def _process_place_address(self, token_info, address):
            if item.kind == 'postcode':
                self._add_postcode(item.name)
            elif item.kind == 'housenumber':
                token_info.add_housenumber(*self._compute_housenumber_token(item))
            elif item.kind == 'street':
                token_info.add_street(self._retrieve_full_tokens(item.name))
            elif item.kind == 'place':
                    token_info.add_place(self._compute_partial_tokens(item.name))
            elif not item.kind.startswith('_') and not item.suffix and \
                 item.kind not in ('country', 'full'):
                token_info.add_address_term(item.kind, self._compute_partial_tokens(item.name))

    def _compute_housenumber_token(self, hnr):
        """ Normalize the housenumber and return the word token and the
            canonical housenumber.
        """
        analyzer = self.token_analysis.analysis.get('@housenumber')

            # When no custom analyzer is set, simply normalize and transliterate.
            norm_name = self._search_normalized(hnr.name)
                result = self._cache.housenumbers.get(norm_name, result)
                if result[0] is None:
                    with self.conn.cursor() as cur:
                        cur.execute("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
                        result = cur.fetchone()[0], norm_name
                        self._cache.housenumbers[norm_name] = result

            # Otherwise use the analyzer to determine the canonical name.
            # Per convention we use the first variant as the 'lookup name', the
            # name that gets saved in the housenumber field of the place.
            norm_name = analyzer.normalize(hnr.name)
                result = self._cache.housenumbers.get(norm_name, result)
                if result[0] is None:
                    variants = analyzer.get_variants_ascii(norm_name)
                        with self.conn.cursor() as cur:
                            cur.execute("SELECT create_analyzed_hnr_id(%s, %s)",
                                        (norm_name, list(variants)))
                            result = cur.fetchone()[0], variants[0]
                            self._cache.housenumbers[norm_name] = result

    def _compute_partial_tokens(self, name):
        """ Normalize the given term, split it into partial words and return
            the token list for them.
        """
        norm_name = self._search_normalized(name)

        for partial in norm_name.split():
            token = self._cache.partials.get(partial)
                need_lookup.append(partial)
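
            # Create database tokens for all partial words missing from the cache.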
            with self.conn.cursor() as cur:
                cur.execute("""SELECT word, getorcreate_partial_word(word)
                               FROM unnest(%s) word""",

                for partial, token in cur:
                    self._cache.partials[partial] = token

    def _retrieve_full_tokens(self, name):
        """ Get the full name token for the given name, if it exists.
            The name is only retrieved for the standard analyser.
        """
        norm_name = self._search_normalized(name)

        # return cached if possible
        if norm_name in self._cache.fulls:
            return self._cache.fulls[norm_name]

        with self.conn.cursor() as cur:
            cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
            full = [row[0] for row in cur]

        self._cache.fulls[norm_name] = full

    def _compute_name_tokens(self, names):
        """ Computes the full name and partial name tokens for the given
            dictionary of names.
        """
        partial_tokens = set()

            analyzer_id = name.get_attr('analyzer')
            analyzer = self.token_analysis.get_analyzer(analyzer_id)
            norm_name = analyzer.normalize(name.name)
            if analyzer_id is None:
                token_id = norm_name
            else:
                token_id = f'{norm_name}@{analyzer_id}'
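
            # Cache results per token_id; the id includes the analyzer name
            # when a non-default analyzer is used.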
            full, part = self._cache.names.get(token_id, (None, None))
                variants = analyzer.get_variants_ascii(norm_name)

                with self.conn.cursor() as cur:
                    cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
                                (token_id, variants))
                    full, part = cur.fetchone()

                self._cache.names[token_id] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens

    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        if re.search(r'[:,;]', postcode) is None:
            postcode = self.normalize_postcode(postcode)

            if postcode not in self._cache.postcodes:
                term = self._search_normalized(postcode)
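
                # Insert the postcode only if it is not already in the word table.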
                with self.conn.cursor() as cur:
                    # no word_id needed for postcodes
                    cur.execute("""INSERT INTO word (word_token, type, word)
                                   (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
                                      WHERE type = 'P' and word = pc))
                                """, (term, postcode))
                self._cache.postcodes.add(postcode)


class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self):
        self.housenumbers = set()
        self.housenumber_tokens = set()
        self.street_tokens = set()
        self.place_tokens = set()
        self.address_tokens = {}

    @staticmethod
    def _mk_array(tokens):
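        # Format the tokens as a PostgreSQL array literal, e.g. '{1,2,3}'.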
        return f"{{{','.join((str(s) for s in tokens))}}}"


    def to_dict(self):
        """ Return the token information in database importable format.
        """
            out['names'] = self.names

        if self.housenumbers:
            out['hnr'] = ';'.join(self.housenumbers)
            out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)

        if self.street_tokens:
            out['street'] = self._mk_array(self.street_tokens)

        if self.place_tokens:
            out['place'] = self._mk_array(self.place_tokens)

        if self.address_tokens:
            out['addr'] = self.address_tokens

    def set_names(self, fulls, partials):
        """ Adds token information for the normalised names.
        """
        self.names = self._mk_array(itertools.chain(fulls, partials))


    def add_housenumber(self, token, hnr):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
            self.housenumbers.add(hnr)
            self.housenumber_tokens.add(token)


    def add_street(self, tokens):
        """ Add addr:street match terms.
        """
        self.street_tokens.update(tokens)


    def add_place(self, tokens):
        """ Add addr:place search and match terms.
        """
        self.place_tokens.update(tokens)


    def add_address_term(self, key, partials):
        """ Add additional address terms.
        """
            self.address_tokens[key] = self._mk_array(partials)


class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self):
        self.postcodes = set()
        self.housenumbers = {}