# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
import itertools
import json
import logging
from textwrap import dedent

from nominatim.db.connection import connect
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.indexer.place_info import PlaceInfo
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

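# Database property key for the term normalization rules.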
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()


def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)


class LegacyICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.loader = None

    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        self.loader = ICURuleLoader(config)

        self._install_php(config.lib_dir.php, overwrite=True)
        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)

    def init_from_project(self, config):
        """ Initialise the tokenizer from the project directory.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)

        self._install_php(config.lib_dir.php, overwrite=False)

    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')

    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')

    def check_database(self, config):
        """ Check that the tokenizer is set up correctly.
        """
        # Will throw an error if there is an issue.
        self.init_from_project(config)

    def update_statistics(self):
        """ Recompute frequencies for all name words.
        """
        with connect(self.dsn) as conn:
            if conn.table_exists('search_name'):
                with conn.cursor() as cur:
                    cur.drop_table("word_frequencies")
                    LOG.info("Computing word frequencies")
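                    # Count how often each word token appears in the name
                    # vectors of search_name, then merge the counts into the
                    # info column of the word table below.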
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute("CREATE INDEX ON word_frequencies(id)")
                    LOG.info("Update word table with recomputed frequencies")
                    cur.execute("""UPDATE word
                                   SET info = info || jsonb_build_object('count', count)
                                   FROM word_frequencies WHERE word_id = id""")
                    cur.drop_table("word_frequencies")
            conn.commit()

    def _cleanup_housenumbers(self):
        """ Remove unused house numbers.
        """
        with connect(self.dsn) as conn:
            if not conn.table_exists('search_name'):
                return
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
                               FROM word
                               WHERE type = 'H'
                                 AND NOT EXISTS(SELECT * FROM search_name
                                                WHERE ARRAY[word.word_id] && name_vector)
                                 AND (char_length(coalesce(word, word_token)) > 6
                                      OR coalesce(word, word_token) not similar to '\\d+')
                            """)
                candidates = {token: wid for wid, token in cur}

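            # Drop any candidate that is still referenced by a housenumber
            # in placex.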
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT housenumber FROM placex
                               WHERE housenumber is not null
                                     AND (char_length(housenumber) > 6
                                          OR housenumber not similar to '\\d+')
                            """)
                for row in cur:
                    for hnr in row[0].split(';'):
                        candidates.pop(hnr, None)
            LOG.info("There are %s outdated housenumbers.", len(candidates))
            LOG.debug("Outdated housenumbers: %s", candidates.keys())
            if candidates:
                with conn.cursor() as cur:
                    cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
                                (list(candidates.values()), ))
                conn.commit()

    def update_word_tokens(self):
        """ Remove unused tokens.
        """
        LOG.warning("Cleaning up housenumber tokens.")
        self._cleanup_housenumbers()
        LOG.warning("Tokenizer house-keeping done.")

    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    analyzer.process_place(place)

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                                     self.loader.make_token_analysis())

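    # The tokenizer configuration for the PHP frontend is written out as a
    # small tokenizer.php file in the tokenizer's data directory.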
    def _install_php(self, phpdir, overwrite=True):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"

        if not php_file.exists() or overwrite:
            php_file.write_text(dedent(f"""\
                <?php
                @define('CONST_Max_Word_Frequency', 10000000);
                @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
                @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
                require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')

    def _save_config(self):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)

    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
            conn.commit()


class LegacyICUNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, sanitizer, token_analysis):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.sanitizer = sanitizer
        self.token_analysis = token_analysis

        self._cache = _TokenCache()

    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None

    def _search_normalized(self, name):
        """ Return the search token transliteration of the given name.
        """
        return self.token_analysis.search.transliterate(name).strip()

    def _normalized(self, name):
        """ Return the normalized version of the given name with all
            non-relevant information removed.
        """
        return self.token_analysis.normalizer.transliterate(name).strip()

    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not necessarily efficient.
        """
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self._search_normalized(word[1:])
            else:
                partial_tokens[word] = self._search_normalized(word)

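        # Look the tokens up in the word table: full names are stored with
        # type 'W', partial names with type 'w'.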
        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                           FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                           FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]

    def normalize_postcode(self, postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()

    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
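        # '@postcode' is the optional custom token analysis module for
        # postcodes; without it postcodes are simply upper-cased.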
        analyzer = self.token_analysis.analysis.get('@postcode')

        with self.conn.cursor() as cur:
            # First get all postcode names currently in the word table.
            cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
            word_entries = set((entry[0] for entry in cur))

            # Then compute the required postcode names from the postcode table.
            needed_entries = set()
            cur.execute("SELECT country_code, postcode FROM location_postcode")
            for cc, postcode in cur:
                info = PlaceInfo({'country_code': cc,
                                  'class': 'place', 'type': 'postcode',
                                  'address': {'postcode': postcode}})
                address = self.sanitizer.process_names(info)[1]
                for place in address:
                    if place.kind == 'postcode':
                        if analyzer is None:
                            postcode_name = place.name.strip().upper()
                            variant_base = None
                        else:
                            postcode_name = analyzer.normalize(place.name)
                            variant_base = place.get_attr("variant")

                        if variant_base:
                            needed_entries.add(f'{postcode_name}@{variant_base}')
                        else:
                            needed_entries.add(postcode_name)
                        break

        # Now update the word table.
        self._delete_unused_postcode_words(word_entries - needed_entries)
        self._add_missing_postcode_words(needed_entries - word_entries)

    def _delete_unused_postcode_words(self, tokens):
        if tokens:
            with self.conn.cursor() as cur:
                cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
                            (list(tokens), ))

    def _add_missing_postcode_words(self, tokens):
        if not tokens:
            return

        analyzer = self.token_analysis.analysis.get('@postcode')
        terms = []

        for postcode_name in tokens:
            if '@' in postcode_name:
                term, variant = postcode_name.split('@', 2)
                term = self._search_normalized(term)
                variants = {term}
                if analyzer is not None:
                    variants.update(analyzer.get_variants_ascii(variant))
                variants = list(variants)
            else:
                variants = [self._search_normalized(postcode_name)]
            terms.append((postcode_name, variants))

        if terms:
            with self.conn.cursor() as cur:
                cur.execute_values("""SELECT create_postcode_word(pc, var)
                                      FROM (VALUES %s) AS v(pc, var)""",
                                   terms)

    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases will
            be completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)

    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
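        # Write the new phrases to the word table using COPY for efficiency.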
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self._search_normalized(word)
                if term:
                    copystr.add(term, 'S', word,
                                json.dumps({'class': cls, 'type': typ,
                                            'op': oper if oper in ('in', 'near') else None}))
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word_token', 'type', 'word', 'info'])

        return added

    @staticmethod
    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.execute_values(
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE type = 'S' and word = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
                """, to_delete)

        return len(to_delete)

    def add_country_names(self, country_code, names):
        """ Add default names for the given country to the search index.
        """
        # Make sure any name preprocessing for country names applies.
        info = PlaceInfo({'name': names, 'country_code': country_code,
                          'rank_address': 4, 'class': 'boundary',
                          'type': 'administrative'})
        self._add_country_full_names(country_code,
                                     self.sanitizer.process_names(info)[0],
                                     internal=True)

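    # Names added with internal=True come from the country configuration and
    # are marked as such in the word table; names from OSM data are added
    # with internal=False (the default).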
    def _add_country_full_names(self, country_code, names, internal=False):
        """ Add names for the given country from an already sanitized
            name list.
        """
        word_tokens = set()
        for name in names:
            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
                             FROM word
                             WHERE type = 'C' and word = %s""",
                        (country_code, ))
            existing_tokens = {True: set(), False: set()} # internal/external names
            for word in cur:
                existing_tokens[word[1]].add(word[0])

            # Delete names that no longer exist.
            gone_tokens = existing_tokens[internal] - word_tokens
            if internal:
                gone_tokens.update(existing_tokens[False] & word_tokens)
            if gone_tokens:
                cur.execute("""DELETE FROM word
                               USING unnest(%s) as token
                               WHERE type = 'C' and word = %s
                                     and word_token = token""",
                            (list(gone_tokens), country_code))

            # Only add those names that are not yet in the list.
            new_tokens = word_tokens - existing_tokens[True]
            if not internal:
                new_tokens -= existing_tokens[False]
            if new_tokens:
                if internal:
                    sql = """INSERT INTO word (word_token, type, word, info)
                               (SELECT token, 'C', %s, '{"internal": "yes"}'
                                  FROM unnest(%s) as token)
                           """
                else:
                    sql = """INSERT INTO word (word_token, type, word)
                               (SELECT token, 'C', %s
                                  FROM unnest(%s) as token)
                           """
                cur.execute(sql, (country_code, list(new_tokens)))

    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serializable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo()

        names, address = self.sanitizer.process_names(place)

        if names:
            token_info.set_names(*self._compute_name_tokens(names))

            if place.is_country():
                self._add_country_full_names(place.country_code, names)

        if address:
            self._process_place_address(token_info, address)

        return token_info.to_dict()

    def _process_place_address(self, token_info, address):
        for item in address:
            if item.kind == 'postcode':
                token_info.set_postcode(self._add_postcode(item))
            elif item.kind == 'housenumber':
                token_info.add_housenumber(*self._compute_housenumber_token(item))
            elif item.kind == 'street':
                token_info.add_street(self._retrieve_full_tokens(item.name))
            elif item.kind == 'place':
                if not item.suffix:
                    token_info.add_place(self._compute_partial_tokens(item.name))
            elif not item.kind.startswith('_') and not item.suffix and \
                 item.kind not in ('country', 'full', 'inclusion'):
                token_info.add_address_term(item.kind, self._compute_partial_tokens(item.name))

    def _compute_housenumber_token(self, hnr):
        """ Normalize the housenumber and return the word token and the
            canonical form.
        """
        analyzer = self.token_analysis.analysis.get('@housenumber')
        result = None, None

        if analyzer is None:
            # When no custom analyzer is set, simply normalize and transliterate.
            norm_name = self._search_normalized(hnr.name)
            if norm_name:
                result = self._cache.housenumbers.get(norm_name, result)
                if result[0] is None:
                    with self.conn.cursor() as cur:
                        cur.execute("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
                        result = cur.fetchone()[0], norm_name
                        self._cache.housenumbers[norm_name] = result
        else:
            # Otherwise use the analyzer to determine the canonical name.
            # Per convention we use the first variant as the 'lookup name', the
            # name that gets saved in the housenumber field of the place.
            norm_name = analyzer.normalize(hnr.name)
            if norm_name:
                result = self._cache.housenumbers.get(norm_name, result)
                if result[0] is None:
                    variants = analyzer.get_variants_ascii(norm_name)
                    if variants:
                        with self.conn.cursor() as cur:
                            cur.execute("SELECT create_analyzed_hnr_id(%s, %s)",
                                        (norm_name, list(variants)))
                            result = cur.fetchone()[0], variants[0]
                            self._cache.housenumbers[norm_name] = result

        return result

    def _compute_partial_tokens(self, name):
        """ Normalize the given term, split it into partial words and return
            the token list for them.
        """
        norm_name = self._search_normalized(name)

        tokens = []
        need_lookup = []
        for partial in norm_name.split():
            token = self._cache.partials.get(partial)
            if token:
                tokens.append(token)
            else:
                need_lookup.append(partial)
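
        # Tokens missing from the cache are looked up (and created if
        # necessary) via the SQL function getorcreate_partial_word().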
        if need_lookup:
            with self.conn.cursor() as cur:
                cur.execute("""SELECT word, getorcreate_partial_word(word)
                               FROM unnest(%s) word""",
                            (need_lookup, ))

                for partial, token in cur:
                    tokens.append(token)
                    self._cache.partials[partial] = token

        return tokens

    def _retrieve_full_tokens(self, name):
        """ Get the full name token for the given name, if it exists.
            The name is only retrieved for the standard analyser.
        """
        norm_name = self._search_normalized(name)

        # return cached if possible
        if norm_name in self._cache.fulls:
            return self._cache.fulls[norm_name]

        with self.conn.cursor() as cur:
            cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
                        (norm_name, ))
            full = [row[0] for row in cur]

        self._cache.fulls[norm_name] = full

        return full

    def _compute_name_tokens(self, names):
        """ Computes the full name and partial name tokens for the given
            dictionary of names.
        """
        full_tokens = set()
        partial_tokens = set()

        for name in names:
            analyzer_id = name.get_attr('analyzer')
            analyzer = self.token_analysis.get_analyzer(analyzer_id)
            norm_name = analyzer.normalize(name.name)
            if analyzer_id is None:
                token_id = norm_name
            else:
                token_id = f'{norm_name}@{analyzer_id}'
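
            # Names from non-default analyzers are stored under
            # '<normalized name>@<analyzer id>' so that they do not clash
            # with entries from the default analysis.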
            full, part = self._cache.names.get(token_id, (None, None))
            if full is None:
                variants = analyzer.get_variants_ascii(norm_name)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
                                (token_id, variants))
                    full, part = cur.fetchone()

                self._cache.names[token_id] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens

    def _add_postcode(self, item):
        """ Make sure the normalized postcode is present in the word table.
        """
        analyzer = self.token_analysis.analysis.get('@postcode')

        if analyzer is None:
            postcode_name = item.name.strip().upper()
            variant_base = None
        else:
            postcode_name = analyzer.normalize(item.name)
            variant_base = item.get_attr("variant")

        if variant_base:
            postcode = f'{postcode_name}@{variant_base}'
        else:
            postcode = postcode_name

        if postcode not in self._cache.postcodes:
            term = self._search_normalized(postcode_name)
            if not term:
                return None

            variants = {term}
            if analyzer is not None and variant_base:
                variants.update(analyzer.get_variants_ascii(variant_base))

            with self.conn.cursor() as cur:
                cur.execute("SELECT create_postcode_word(%s, %s)",
                            (postcode, list(variants)))
            self._cache.postcodes.add(postcode)

        return postcode_name


class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self):
        self.names = None
        self.housenumbers = set()
        self.housenumber_tokens = set()
        self.street_tokens = set()
        self.place_tokens = set()
        self.address_tokens = {}
        self.postcode = None
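
    # Render a collection of token ids as a PostgreSQL array literal,
    # e.g. '{1,2,3}'.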
    @staticmethod
    def _mk_array(tokens):
        return f"{{{','.join((str(s) for s in tokens))}}}"

    def to_dict(self):
        """ Return the token information in database importable format.
        """
        out = {}

        if self.names:
            out['names'] = self.names

        if self.housenumbers:
            out['hnr'] = ';'.join(self.housenumbers)
            out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)

        if self.street_tokens:
            out['street'] = self._mk_array(self.street_tokens)

        if self.place_tokens:
            out['place'] = self._mk_array(self.place_tokens)

        if self.address_tokens:
            out['addr'] = self.address_tokens

        if self.postcode:
            out['postcode'] = self.postcode

        return out

    def set_names(self, fulls, partials):
        """ Adds token information for the normalised names.
        """
        self.names = self._mk_array(itertools.chain(fulls, partials))

    def add_housenumber(self, token, hnr):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        if token:
            self.housenumbers.add(hnr)
            self.housenumber_tokens.add(token)

    def add_street(self, tokens):
        """ Add addr:street match terms.
        """
        self.street_tokens.update(tokens)

    def add_place(self, tokens):
        """ Add addr:place search and match terms.
        """
        self.place_tokens.update(tokens)

    def add_address_term(self, key, partials):
        """ Add additional address terms.
        """
        if partials:
            self.address_tokens[key] = self._mk_array(partials)

    def set_postcode(self, postcode):
        """ Set the postcode to the given one.
        """
        self.postcode = postcode


class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self):
        self.names = {}
        self.partials = {}
        self.fulls = {}
        self.postcodes = set()
        self.housenumbers = {}