# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
import itertools
import json
import logging
import re
from textwrap import dedent

from nominatim.db.connection import connect
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.indexer.place_info import PlaceInfo
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
# NOTE(review): not referenced anywhere else in the visible part of this
# module; presumably the name of a database property - confirm before removing.
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

# All log output of this module goes through the root logger.
LOG = logging.getLogger()
def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.

        Both arguments are handed unchanged to the constructor of
        `LegacyICUTokenizer`.
    """
    return LegacyICUTokenizer(dsn, data_dir)
class LegacyICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        # Connection string handed to connect() by every database operation.
        self.dsn = dsn
        # Directory receiving the generated tokenizer.php. It is combined
        # with the '/' operator, so a pathlib-style path is expected.
        self.data_dir = data_dir
        # Rule loader, set by init_new_db() or init_from_project().
        self.loader = None


    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.

            When `init_db` is false, only the PHP script and the saved
            configuration are written. The SQL functions and the tokenizer
            tables are then left untouched.
        """
        self.loader = ICURuleLoader(config)

        self._install_php(config.lib_dir.php, overwrite=True)
        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)


    def init_from_project(self, config):
        """ Initialise the tokenizer from the project directory.

            The rule configuration is read back from the database and the
            PHP script is only written when it does not exist yet.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)

        self._install_php(config.lib_dir.php, overwrite=False)


    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')


    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')


    def check_database(self, config):
        """ Check that the tokenizer is set up correctly.
        """
        # Will throw an error if there is an issue.
        self.init_from_project(config)


    def update_statistics(self):
        """ Recompute frequencies for all name words.

            Nothing is computed when the table 'search_name' does not exist.
        """
        with connect(self.dsn) as conn:
            if conn.table_exists('search_name'):
                with conn.cursor() as cur:
                    # Get rid of a table left over from an earlier run.
                    cur.drop_table("word_frequencies")
                    LOG.info("Computing word frequencies")
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute("CREATE INDEX ON word_frequencies(id)")
                    LOG.info("Update word table with recomputed frequencies")
                    cur.execute("""UPDATE word
                                   SET info = info || jsonb_build_object('count', count)
                                   FROM word_frequencies WHERE word_id = id""")
                    cur.drop_table("word_frequencies")
            conn.commit()


    def _cleanup_housenumbers(self):
        """ Remove unused house numbers.

            Only house number tokens that are longer than 6 characters or
            not purely numeric are considered. Of those, a token is deleted
            when it appears neither in 'search_name' nor as a house number
            in 'placex'.
        """
        with connect(self.dsn) as conn:
            if not conn.table_exists('search_name'):
                return
            # Named cursors are used for the two potentially large scans.
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
                               FROM word
                               WHERE type = 'H'
                                 AND NOT EXISTS(SELECT * FROM search_name
                                                WHERE ARRAY[word.word_id] && name_vector)
                                 AND (char_length(coalesce(word, word_token)) > 6
                                      OR coalesce(word, word_token) not similar to '\\d+')
                            """)
                candidates = {token: wid for wid, token in cur}
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT housenumber FROM placex
                               WHERE housenumber is not null
                                     AND (char_length(housenumber) > 6
                                          OR housenumber not similar to '\\d+')
                            """)
                for row in cur:
                    # A house number entry may hold a ';'-separated list.
                    for hnr in row[0].split(';'):
                        candidates.pop(hnr, None)
            LOG.info("There are %s outdated housenumbers.", len(candidates))
            LOG.debug("Outdated housenumbers: %s", candidates.keys())
            if candidates:
                with conn.cursor() as cur:
                    cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
                                (list(candidates.values()), ))
                conn.commit()


    def update_word_tokens(self):
        """ Remove unused tokens.
        """
        LOG.warning("Cleaning up housenumber tokens.")
        self._cleanup_housenumbers()
        LOG.warning("Tokenizer house-keeping done.")


    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

            ```
            with tokenizer.name_analyzer() as analyzer:
                analyzer.process_place(place)
            ```

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.

            The tokenizer must have been initialised with init_new_db() or
            init_from_project() first, because the rule loader is needed.
        """
        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                                     self.loader.make_token_analysis())


    def _install_php(self, phpdir, overwrite=True):
        """ Install the php script for the tokenizer.

            The script is written to 'tokenizer.php' in the data directory.
            An existing file is only replaced when `overwrite` is true.
        """
        php_file = self.data_dir / "tokenizer.php"

        if not php_file.exists() or overwrite:
            php_file.write_text(dedent(f"""\
                <?php
                @define('CONST_Max_Word_Frequency', 10000000);
                @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
                @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
                require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')


    def _save_config(self):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)


    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
            conn.commit()
class LegacyICUNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, sanitizer, token_analysis):
        self.conn = connect(dsn).connection
        # Every statement is committed immediately.
        self.conn.autocommit = True
        self.sanitizer = sanitizer
        self.token_analysis = token_analysis

        self._cache = _TokenCache()


    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    def _search_normalized(self, name):
        """ Return the search token transliteration of the given name.
        """
        return self.token_analysis.search.transliterate(name).strip()


    def _normalized(self, name):
        """ Return the normalized version of the given name with all
            non-relevant information removed.
        """
        return self.token_analysis.normalizer.transliterate(name).strip()


    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name
            otherwise is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id). The word id is None
            when the token is not in the word table.

            The function is used for testing and debugging only
            and not necessarily efficient.
        """
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self._search_normalized(word[1:])
            else:
                partial_tokens[word] = self._search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]


    def normalize_postcode(self, postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()


    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.

            Postcodes that only exist in the word table are deleted there,
            postcodes that only exist in location_postcode are added.
        """
        to_delete = []
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word WHERE type = 'P') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            with CopyBuffer() as copystr:
                for postcode, word in cur:
                    if postcode is None:
                        # Only in the word table: outdated.
                        to_delete.append(word)
                    else:
                        # Only in location_postcode: needs a token.
                        copystr.add(self._search_normalized(postcode),
                                    'P', postcode)

                if to_delete:
                    cur.execute("""DELETE FROM WORD
                                   WHERE type ='P' and word = any(%s)
                                """, (to_delete, ))

                copystr.copy_out(cur, 'word',
                                 columns=['word_token', 'type', 'word'])


    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases
            will be completely replaced. Otherwise the phrases are added to
            the already existing ones.

            Each phrase is a sequence of four items:
            (phrase, class, type, operator). The operator '-' stands for
            'no operator'.
        """
        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)


    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Add all phrases to the database that are not yet there.

            Returns the number of phrases added. Phrases with an empty
            search token are skipped and not counted.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self._search_normalized(word)
                if term:
                    # Only the operators 'in' and 'near' are saved.
                    copystr.add(term, 'S', word,
                                json.dumps({'class': cls, 'type': typ,
                                            'op': oper if oper in ('in', 'near') else None}))
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word_token', 'type', 'word', 'info'])

        return added


    @staticmethod
    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
        """ Remove all phrases from the database that are no longer in the
            new phrase list.

            Returns the number of phrases selected for deletion.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.execute_values(
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE type = 'S' and word = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
                """, to_delete)

        return len(to_delete)


    def add_country_names(self, country_code, names):
        """ Add default names for the given country to the search index.

            The names are saved as internal names.
        """
        # Make sure any name preprocessing for country names applies.
        info = PlaceInfo({'name': names, 'country_code': country_code,
                          'rank_address': 4, 'class': 'boundary',
                          'type': 'administrative'})
        self._add_country_full_names(country_code,
                                     self.sanitizer.process_names(info)[0],
                                     internal=True)


    def _add_country_full_names(self, country_code, names, internal=False):
        """ Add names for the given country from an already sanitized
            name list.

            Tokens of the same kind (internal or external, as chosen by
            `internal`) that are no longer in the name list are deleted.
        """
        word_tokens = set()
        for name in names:
            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            # Get existing names
            cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
                             FROM word
                             WHERE type = 'C' and word = %s""",
                        (country_code, ))
            existing_tokens = {True: set(), False: set()} # internal/external names
            for word in cur:
                existing_tokens[word[1]].add(word[0])

            # Delete names that no longer exist.
            gone_tokens = existing_tokens[internal] - word_tokens
            if internal:
                # External tokens that now become internal are removed
                # here and inserted again as internal below.
                gone_tokens.update(existing_tokens[False] & word_tokens)
            if gone_tokens:
                cur.execute("""DELETE FROM word
                               USING unnest(%s) as token
                               WHERE type = 'C' and word = %s
                                     and word_token = token""",
                            (list(gone_tokens), country_code))

            # Only add those names that are not yet in the list.
            new_tokens = word_tokens - existing_tokens[True]
            if not internal:
                new_tokens -= existing_tokens[False]
            if new_tokens:
                if internal:
                    sql = """INSERT INTO word (word_token, type, word, info)
                               (SELECT token, 'C', %s, '{"internal": "yes"}'
                                  FROM unnest(%s) as token)
                           """
                else:
                    sql = """INSERT INTO word (word_token, type, word)
                               (SELECT token, 'C', %s
                                  FROM unnest(%s) as token)
                           """
                cur.execute(sql, (country_code, list(new_tokens)))


    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serializable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo()

        names, address = self.sanitizer.process_names(place)

        if names:
            token_info.set_names(*self._compute_name_tokens(names))

            # NOTE(review): indentation was lost in this copy of the file.
            # The country update is kept inside the 'names' guard because
            # an empty name list would delete the country's existing
            # external names - confirm against the upstream file.
            if place.is_country():
                self._add_country_full_names(place.country_code, names)

        if address:
            self._process_place_address(token_info, address)

        return token_info.to_dict()


    def _process_place_address(self, token_info, address):
        """ Add the tokens for the given address items to `token_info`.

            Postcodes are only registered in the word table and produce
            no entry in `token_info`.
        """
        for item in address:
            if item.kind == 'postcode':
                self._add_postcode(item.name)
            elif item.kind == 'housenumber':
                token_info.add_housenumber(*self._compute_housenumber_token(item))
            elif item.kind == 'street':
                token_info.add_street(self._retrieve_full_tokens(item.name))
            elif item.kind == 'place':
                if not item.suffix:
                    token_info.add_place(self._compute_partial_tokens(item.name))
            elif not item.kind.startswith('_') and not item.suffix and \
                 item.kind not in ('country', 'full'):
                token_info.add_address_term(item.kind, self._compute_partial_tokens(item.name))


    def _compute_housenumber_token(self, hnr):
        """ Normalize the housenumber and return the word token and the
            canonical form.

            Returns the tuple (None, None) when no token can be computed.
        """
        analyzer = self.token_analysis.analysis.get('@housenumber')
        result = None, None

        if analyzer is None:
            # When no custom analyzer is set, simply normalize and transliterate
            norm_name = self._search_normalized(hnr.name)
            if norm_name:
                result = self._cache.housenumbers.get(norm_name, result)
                if result[0] is None:
                    with self.conn.cursor() as cur:
                        cur.execute("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
                        result = cur.fetchone()[0], norm_name
                        self._cache.housenumbers[norm_name] = result
        else:
            # Otherwise use the analyzer to determine the canonical name.
            # Per convention we use the first variant as the 'lookup name', the
            # name that gets saved in the housenumber field of the place.
            norm_name = analyzer.normalize(hnr.name)
            if norm_name:
                result = self._cache.housenumbers.get(norm_name, result)
                if result[0] is None:
                    variants = analyzer.get_variants_ascii(norm_name)
                    if variants:
                        with self.conn.cursor() as cur:
                            cur.execute("SELECT create_analyzed_hnr_id(%s, %s)",
                                        (norm_name, list(variants)))
                            result = cur.fetchone()[0], variants[0]
                            self._cache.housenumbers[norm_name] = result

        return result


    def _compute_partial_tokens(self, name):
        """ Normalize the given term, split it into partial words and return
            the token list for them.
        """
        norm_name = self._search_normalized(name)

        tokens = []
        need_lookup = []
        for partial in norm_name.split():
            token = self._cache.partials.get(partial)
            if token:
                tokens.append(token)
            else:
                need_lookup.append(partial)

        if need_lookup:
            # Get or create all uncached partial words with one query.
            with self.conn.cursor() as cur:
                cur.execute("""SELECT word, getorcreate_partial_word(word)
                               FROM unnest(%s) word""",
                            (need_lookup, ))

                for partial, token in cur:
                    tokens.append(token)
                    self._cache.partials[partial] = token

        return tokens


    def _retrieve_full_tokens(self, name):
        """ Get the full name token for the given name, if it exists.
            The name is only retrieved for the standard analyser.

            Returns a list of word ids, which is empty when there is no
            matching token.
        """
        norm_name = self._search_normalized(name)

        # return cached if possible
        if norm_name in self._cache.fulls:
            return self._cache.fulls[norm_name]

        with self.conn.cursor() as cur:
            cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
                        (norm_name, ))
            full = [row[0] for row in cur]

        self._cache.fulls[norm_name] = full

        return full


    def _compute_name_tokens(self, names):
        """ Computes the full name and partial name tokens for the given
            list of names.

            Returns a tuple of two sets: (full tokens, partial tokens).
        """
        full_tokens = set()
        partial_tokens = set()

        for name in names:
            analyzer_id = name.get_attr('analyzer')
            analyzer = self.token_analysis.get_analyzer(analyzer_id)
            norm_name = analyzer.normalize(name.name)
            # Keep the names of different analyzers apart in cache and database.
            if analyzer_id is None:
                token_id = norm_name
            else:
                token_id = f'{norm_name}@{analyzer_id}'

            full, part = self._cache.names.get(token_id, (None, None))
            if full is None:
                variants = analyzer.get_variants_ascii(norm_name)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
                                (token_id, variants))
                    full, part = cur.fetchone()

                self._cache.names[token_id] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens


    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.

            Values containing one of the characters ':', ',' or ';' are
            ignored.
        """
        if re.search(r'[:,;]', postcode) is None:
            postcode = self.normalize_postcode(postcode)

            if postcode not in self._cache.postcodes:
                term = self._search_normalized(postcode)
                if not term:
                    return

                with self.conn.cursor() as cur:
                    # no word_id needed for postcodes
                    cur.execute("""INSERT INTO word (word_token, type, word)
                                   (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
                                    WHERE NOT EXISTS
                                     (SELECT * FROM word
                                      WHERE type = 'P' and word = pc))
                                """, (term, postcode))
                self._cache.postcodes.add(postcode)
class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self):
        # Array string with the name tokens, None until set_names() is called.
        self.names = None
        self.housenumbers = set()
        self.housenumber_tokens = set()
        self.street_tokens = set()
        self.place_tokens = set()
        # Maps the address kind to the array string of its tokens.
        self.address_tokens = {}


    @staticmethod
    def _mk_array(tokens):
        """ Format the tokens as a string of the form '{t1,t2,...}'.
        """
        return f"{{{','.join((str(s) for s in tokens))}}}"


    def to_dict(self):
        """ Return the token information in database importable format.

            Only information that is actually set appears in the result.
        """
        out = {}

        if self.names:
            out['names'] = self.names

        if self.housenumbers:
            out['hnr'] = ';'.join(self.housenumbers)
            out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)

        if self.street_tokens:
            out['street'] = self._mk_array(self.street_tokens)

        if self.place_tokens:
            out['place'] = self._mk_array(self.place_tokens)

        if self.address_tokens:
            out['addr'] = self.address_tokens

        return out


    def set_names(self, fulls, partials):
        """ Adds token information for the normalised names.
        """
        self.names = self._mk_array(itertools.chain(fulls, partials))


    def add_housenumber(self, token, hnr):
        """ Add a housenumber and its token. Nothing is added when the
            token is empty.
        """
        if token:
            self.housenumbers.add(hnr)
            self.housenumber_tokens.add(token)


    def add_street(self, tokens):
        """ Add addr:street match terms.
        """
        self.street_tokens.update(tokens)


    def add_place(self, tokens):
        """ Add addr:place search and match terms.
        """
        self.place_tokens.update(tokens)


    def add_address_term(self, key, partials):
        """ Add additional address terms. An empty token list is ignored.
        """
        if partials:
            self.address_tokens[key] = self._mk_array(partials)
class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self):
        # name token id -> (full token, partial tokens)
        self.names = {}
        # partial word -> token
        self.partials = {}
        # normalized full name -> list of word ids
        self.fulls = {}
        # normalized postcodes already known to be in the word table
        self.postcodes = set()
        # normalized housenumber -> (token, canonical form)
        self.housenumbers = {}