# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
import itertools
import json
import logging
from textwrap import dedent

from nominatim.db.connection import connect
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.indexer.place_info import PlaceInfo
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()


def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)

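# A rough usage sketch (assumes an already set-up project directory and database;
# `config` stands for a loaded Nominatim configuration, `place` for a PlaceInfo
# object as defined in nominatim.indexer.place_info):
#
#   tokenizer = create(dsn, data_dir)
#   tokenizer.init_from_project(config)
#   with tokenizer.name_analyzer() as analyzer:
#       token_info = analyzer.process_place(place)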

class LegacyICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.loader = None


    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data into the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        self.loader = ICURuleLoader(config)

        self._install_php(config.lib_dir.php, overwrite=True)
        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)


    def init_from_project(self, config):
        """ Initialise the tokenizer from the project directory.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)

        self._install_php(config.lib_dir.php, overwrite=False)


    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')


    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')


    def check_database(self, config):
        """ Check that the tokenizer is set up correctly.
        """
        # Will throw an error if there is an issue.
        self.init_from_project(config)


    def update_statistics(self):
        """ Recompute frequencies for all name words.
        """
        with connect(self.dsn) as conn:
            if conn.table_exists('search_name'):
                with conn.cursor() as cur:
                    cur.drop_table("word_frequencies")
                    LOG.info("Computing word frequencies")
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute("CREATE INDEX ON word_frequencies(id)")
                    LOG.info("Update word table with recomputed frequencies")
                    cur.execute("""UPDATE word
                                   SET info = info || jsonb_build_object('count', count)
                                   FROM word_frequencies WHERE word_id = id""")
                    cur.drop_table("word_frequencies")
            conn.commit()

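    # House-keeping helpers: update_word_tokens() below removes housenumber tokens
    # from the word table that are no longer referenced by any search_name entry.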

    def _cleanup_housenumbers(self):
        """ Remove unused house numbers.
        """
        with connect(self.dsn) as conn:
            if not conn.table_exists('search_name'):
                return
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
                               FROM word
                               WHERE type = 'H'
                                 AND NOT EXISTS(SELECT * FROM search_name
                                                WHERE ARRAY[word.word_id] && name_vector)
                                 AND (char_length(coalesce(word, word_token)) > 6
                                      OR coalesce(word, word_token) not similar to '\\d+')
                            """)
                candidates = {token: wid for wid, token in cur}
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT housenumber FROM placex
                               WHERE housenumber is not null
                                     AND (char_length(housenumber) > 6
                                          OR housenumber not similar to '\\d+')
                            """)
                for row in cur:
                    for hnr in row[0].split(';'):
                        candidates.pop(hnr, None)
            LOG.info("There are %s outdated housenumbers.", len(candidates))
            LOG.debug("Outdated housenumbers: %s", candidates.keys())
            if candidates:
                with conn.cursor() as cur:
                    cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
                                (list(candidates.values()), ))
                conn.commit()


    def update_word_tokens(self):
        """ Remove unused tokens.
        """
        LOG.warning("Cleaning up housenumber tokens.")
        self._cleanup_housenumbers()
        LOG.warning("Tokenizer house-keeping done.")


    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    analyzer.process_place(place)

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                                     self.loader.make_token_analysis())


    def _install_php(self, phpdir, overwrite=True):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"

        if not php_file.exists() or overwrite:
            php_file.write_text(dedent(f"""\
                <?php
                @define('CONST_Max_Word_Frequency', 10000000);
                @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
                @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
                require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')


    def _save_config(self):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)


    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
            conn.commit()

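# Note on the word table: the 'type' column distinguishes the token kinds handled
# in this module: 'W' full names, 'w' partial names, 'H' housenumbers, 'P' postcodes,
# 'S' special phrases and 'C' country names. This summary is derived from the
# queries below; see tokenizer/icu_tokenizer_tables.sql for the authoritative schema.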

class LegacyICUNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, sanitizer, token_analysis):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.sanitizer = sanitizer
        self.token_analysis = token_analysis

        self._cache = _TokenCache()


    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    def _search_normalized(self, name):
        """ Return the search token transliteration of the given name.
        """
        return self.token_analysis.search.transliterate(name).strip()


    def _normalized(self, name):
        """ Return the normalized version of the given name with all
            non-relevant information removed.
        """
        return self.token_analysis.normalizer.transliterate(name).strip()

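    # Both helpers above depend entirely on the ICU rules loaded for the project.
    # Typically the normalizer lowercases and removes irrelevant characters, while
    # the search transliteration additionally folds the result to ASCII (so a name
    # like 'Straße' would usually end up as 'strasse'). The exact output is an
    # assumption that depends on the configured normalization rules.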

    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is treated as a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and not necessarily efficient.
        """
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self._search_normalized(word[1:])
            else:
                partial_tokens[word] = self._search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]


    def normalize_postcode(self, postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()


    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        to_delete = []
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word WHERE type = 'P') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            with CopyBuffer() as copystr:
                for postcode, word in cur:
                    if postcode is None:
                        # postcode was deleted from location_postcode
                        to_delete.append(word)
                    else:
                        # postcode is missing from the word table
                        copystr.add(self._search_normalized(postcode),
                                    'P', postcode)

                if to_delete:
                    cur.execute("""DELETE FROM WORD
                                   WHERE type ='P' and word = any(%s)
                                """, (to_delete, ))

                copystr.copy_out(cur, 'word',
                                 columns=['word_token', 'type', 'word'])


    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases is
            completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)


    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self._search_normalized(word)
                if term:
                    copystr.add(term, 'S', word,
                                json.dumps({'class': cls, 'type': typ,
                                            'op': oper if oper in ('in', 'near') else None}))
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word_token', 'type', 'word', 'info'])

        return added


    @staticmethod
    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.execute_values(
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE type = 'S' and word = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
                """, to_delete)

        return len(to_delete)


    def add_country_names(self, country_code, names):
        """ Add default names for the given country to the search index.
        """
        # Make sure any name preprocessing for country names applies.
        info = PlaceInfo({'name': names, 'country_code': country_code,
                          'rank_address': 4, 'class': 'boundary',
                          'type': 'administrative'})
        self._add_country_full_names(country_code,
                                     self.sanitizer.process_names(info)[0],
                                     internal=True)


    def _add_country_full_names(self, country_code, names, internal=False):
        """ Add names for the given country from an already sanitized
            name list.
        """
        word_tokens = set()
        for name in names:
            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            # Get existing names.
            cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
                             FROM word
                             WHERE type = 'C' and word = %s""",
                        (country_code, ))
            existing_tokens = {True: set(), False: set()} # internal/external names
            for word in cur:
                existing_tokens[word[1]].add(word[0])

            # Delete names that no longer exist.
            gone_tokens = existing_tokens[internal] - word_tokens
            if internal:
                gone_tokens.update(existing_tokens[False] & word_tokens)
            if gone_tokens:
                cur.execute("""DELETE FROM word
                               USING unnest(%s) as token
                               WHERE type = 'C' and word = %s
                                     and word_token = token""",
                            (list(gone_tokens), country_code))

            # Only add those names that are not yet in the list.
            new_tokens = word_tokens - existing_tokens[True]
            if not internal:
                new_tokens -= existing_tokens[False]

            if new_tokens:
                if internal:
                    sql = """INSERT INTO word (word_token, type, word, info)
                               (SELECT token, 'C', %s, '{"internal": "yes"}'
                                  FROM unnest(%s) as token)
                           """
                else:
                    sql = """INSERT INTO word (word_token, type, word)
                               (SELECT token, 'C', %s
                                  FROM unnest(%s) as token)
                           """
                cur.execute(sql, (country_code, list(new_tokens)))


    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serializable structure that will be handed into
            the database via the token_info field.
        """
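        # The resulting dictionary may contain the keys 'names', 'hnr', 'hnr_tokens',
        # 'street', 'place' and 'addr' (see _TokenInfo.to_dict() below); keys are
        # omitted when no corresponding tokens were computed.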
        token_info = _TokenInfo()

        names, address = self.sanitizer.process_names(place)

        if names:
            token_info.set_names(*self._compute_name_tokens(names))

            if place.is_country():
                self._add_country_full_names(place.country_code, names)

        if address:
            self._process_place_address(token_info, address)

        return token_info.to_dict()


    def _process_place_address(self, token_info, address):
        for item in address:
            if item.kind == 'postcode':
                token_info.set_postcode(self._add_postcode(item))
            elif item.kind == 'housenumber':
                token_info.add_housenumber(*self._compute_housenumber_token(item))
            elif item.kind == 'street':
                token_info.add_street(self._retrieve_full_tokens(item.name))
            elif item.kind == 'place':
                if not item.suffix:
                    token_info.add_place(self._compute_partial_tokens(item.name))
            elif not item.kind.startswith('_') and not item.suffix and \
                 item.kind not in ('country', 'full', 'inclusion'):
                token_info.add_address_term(item.kind, self._compute_partial_tokens(item.name))


    def _compute_housenumber_token(self, hnr):
        """ Normalize the housenumber and return the word token and the
            canonical housenumber.
        """
        analyzer = self.token_analysis.analysis.get('@housenumber')
        result = None, None

        if analyzer is None:
            # When no custom analyzer is set, simply normalize and transliterate.
            norm_name = self._search_normalized(hnr.name)
            if norm_name:
                result = self._cache.housenumbers.get(norm_name, result)
                if result[0] is None:
                    with self.conn.cursor() as cur:
                        cur.execute("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
                        result = cur.fetchone()[0], norm_name
                        self._cache.housenumbers[norm_name] = result
        else:
            # Otherwise use the analyzer to determine the canonical name.
            # Per convention we use the first variant as the 'lookup name', the
            # name that gets saved in the housenumber field of the place.
            norm_name = analyzer.normalize(hnr.name)
            if norm_name:
                result = self._cache.housenumbers.get(norm_name, result)
                if result[0] is None:
                    variants = analyzer.get_variants_ascii(norm_name)
                    if variants:
                        with self.conn.cursor() as cur:
                            cur.execute("SELECT create_analyzed_hnr_id(%s, %s)",
                                        (norm_name, list(variants)))
                            result = cur.fetchone()[0], variants[0]
                            self._cache.housenumbers[norm_name] = result

        return result


    def _compute_partial_tokens(self, name):
        """ Normalize the given term, split it into partial words and return
            the token list for them.
        """
        norm_name = self._search_normalized(name)

        tokens = []
        need_lookup = []
        for partial in norm_name.split():
            token = self._cache.partials.get(partial)
            if token:
                tokens.append(token)
            else:
                need_lookup.append(partial)

        if need_lookup:
            with self.conn.cursor() as cur:
                cur.execute("""SELECT word, getorcreate_partial_word(word)
                               FROM unnest(%s) word""",
                            (need_lookup, ))
                for partial, token in cur:
                    tokens.append(token)
                    self._cache.partials[partial] = token

        return tokens


    def _retrieve_full_tokens(self, name):
        """ Get the full name token for the given name, if it exists.
            The name is only retrieved for the standard analyser.
        """
        norm_name = self._search_normalized(name)

        # return cached if possible
        if norm_name in self._cache.fulls:
            return self._cache.fulls[norm_name]

        with self.conn.cursor() as cur:
            cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
                        (norm_name, ))
            full = [row[0] for row in cur]

        self._cache.fulls[norm_name] = full

        return full


    def _compute_name_tokens(self, names):
        """ Computes the full name and partial name tokens for the given
            dictionary of names.
        """
        full_tokens = set()
        partial_tokens = set()

        for name in names:
            analyzer_id = name.get_attr('analyzer')
            analyzer = self.token_analysis.get_analyzer(analyzer_id)
            norm_name = analyzer.normalize(name.name)
            if analyzer_id is None:
                token_id = norm_name
            else:
                token_id = f'{norm_name}@{analyzer_id}'

            full, part = self._cache.names.get(token_id, (None, None))
            if full is None:
                variants = analyzer.get_variants_ascii(norm_name)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
                                (token_id, variants))
                    full, part = cur.fetchone()

                self._cache.names[token_id] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens


    def _add_postcode(self, item):
        """ Make sure the normalized postcode is present in the word table.
        """
        analyzer = self.token_analysis.analysis.get('@postcode')

        if analyzer is None:
            postcode_name = item.name.strip().upper()
            variant_base = None
        else:
            postcode_name = analyzer.normalize(item.name)
            variant_base = item.get_attr("variant")

        if variant_base is not None:
            postcode = f'{postcode_name}@{variant_base}'
        else:
            postcode = postcode_name

        if postcode not in self._cache.postcodes:
            term = self._search_normalized(postcode_name)
            if not term:
                return None

            variants = {term}
            if analyzer is not None and variant_base is not None:
                variants.update(analyzer.get_variants_ascii(variant_base))

            with self.conn.cursor() as cur:
                cur.execute("SELECT create_postcode_word(%s, %s)",
                            (postcode, list(variants)))
            self._cache.postcodes.add(postcode)

        return postcode_name

640 """ Collect token information to be sent back to the database.
644 self.housenumbers = set()
645 self.housenumber_tokens = set()
646 self.street_tokens = set()
647 self.place_tokens = set()
648 self.address_tokens = {}
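    # Token sets are rendered as PostgreSQL array literals (e.g. '{12,34,56}')
    # before being handed back to the database as part of the token_info data.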
    @staticmethod
    def _mk_array(tokens):
        return f"{{{','.join((str(s) for s in tokens))}}}"

658 """ Return the token information in database importable format.
663 out['names'] = self.names
665 if self.housenumbers:
666 out['hnr'] = ';'.join(self.housenumbers)
667 out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)
669 if self.street_tokens:
670 out['street'] = self._mk_array(self.street_tokens)
672 if self.place_tokens:
673 out['place'] = self._mk_array(self.place_tokens)
675 if self.address_tokens:
676 out['addr'] = self.address_tokens

    def set_names(self, fulls, partials):
        """ Adds token information for the normalised names.
        """
        self.names = self._mk_array(itertools.chain(fulls, partials))


    def add_housenumber(self, token, hnr):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        if token:
            self.housenumbers.add(hnr)
            self.housenumber_tokens.add(token)


    def add_street(self, tokens):
        """ Add addr:street match terms.
        """
        self.street_tokens.update(tokens)


    def add_place(self, tokens):
        """ Add addr:place search and match terms.
        """
        self.place_tokens.update(tokens)


    def add_address_term(self, key, partials):
        """ Add additional address terms.
        """
        if partials:
            self.address_tokens[key] = self._mk_array(partials)


    def set_postcode(self, postcode):
        """ Set the postcode to the given one.
        """
        self.postcode = postcode

721 """ Cache for token information to avoid repeated database queries.
723 This cache is not thread-safe and needs to be instantiated per
730 self.postcodes = set()
731 self.housenumbers = {}