# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
import itertools
import json
import logging
import re
from textwrap import dedent

from nominatim.db.connection import connect
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.indexer.place_info import PlaceInfo
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()
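
# Rows in the word table are distinguished by their 'type' column. The codes
# used in the queries below are: 'W' full name, 'w' partial name,
# 'H' house number, 'P' postcode, 'C' country name and 'S' special phrase.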


def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)


class LegacyICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.loader = None


    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        self.loader = ICURuleLoader(config)

        self._install_php(config.lib_dir.php)
        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)


    def init_from_project(self, config):
        """ Initialise the tokenizer from the project directory.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)


    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')


    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')


    def check_database(self, config):
        """ Check that the tokenizer is set up correctly.
        """
        # Will throw an error if there is an issue.
        self.init_from_project(config)


    def update_statistics(self):
        """ Recompute frequencies for all name words.
        """
        with connect(self.dsn) as conn:
            if conn.table_exists('search_name'):
                with conn.cursor() as cur:
                    cur.drop_table("word_frequencies")
                    LOG.info("Computing word frequencies")
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute("CREATE INDEX ON word_frequencies(id)")
                    LOG.info("Update word table with recomputed frequencies")
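                    # 'info' is a jsonb column; the '||' merge below overwrites
                    # any previous 'count' value, e.g. info becomes
                    # {"count": 1423, ...} (illustrative value).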
                    cur.execute("""UPDATE word
                                   SET info = info || jsonb_build_object('count', count)
                                   FROM word_frequencies WHERE word_id = id""")
                    cur.drop_table("word_frequencies")
            conn.commit()


    def _cleanup_housenumbers(self):
        """ Remove unused house numbers.
        """
        with connect(self.dsn) as conn:
            if not conn.table_exists('search_name'):
                return
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT word_id, word_token FROM word
                               WHERE type = 'H'
                                 AND NOT EXISTS(SELECT * FROM search_name
                                                WHERE ARRAY[word.word_id] && name_vector)
                                 AND (char_length(word_token) > 6
                                      OR word_token not similar to '\\d+')
                            """)
                candidates = {token: wid for wid, token in cur}
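
            # Second pass: drop every candidate that still occurs in a placex
            # housenumber; only the remaining, truly unused tokens are deleted.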
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT housenumber FROM placex
                               WHERE housenumber is not null
                                     AND (char_length(housenumber) > 6
                                          OR housenumber not similar to '\\d+')
                            """)
                for row in cur:
                    for hnr in row[0].split(';'):
                        candidates.pop(hnr, None)
            LOG.info("There are %s outdated housenumbers.", len(candidates))

            if candidates:
                with conn.cursor() as cur:
                    cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
                                (list(candidates.values()), ))
                conn.commit()


    def update_word_tokens(self):
        """ Remove unused tokens.
        """
        LOG.warning("Cleaning up housenumber tokens.")
        self._cleanup_housenumbers()
        LOG.warning("Tokenizer house-keeping done.")


    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    analyzer.process_place(place)

            When used outside the with construct, the caller must make sure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                                     self.loader.make_token_analysis())


    def _install_php(self, phpdir):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent(f"""\
            <?php
            @define('CONST_Max_Word_Frequency', 10000000);
            @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
            @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
            require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))


    def _save_config(self):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)


    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
            conn.commit()


class LegacyICUNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, sanitizer, token_analysis):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.sanitizer = sanitizer
        self.token_analysis = token_analysis

        self._cache = _TokenCache()


    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    def _search_normalized(self, name):
        """ Return the search token transliteration of the given name.
        """
        return self.token_analysis.search.transliterate(name).strip()


    def _normalized(self, name):
        """ Return the normalized version of the given name with all
            non-relevant information removed.
        """
        return self.token_analysis.normalizer.transliterate(name).strip()


    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is assumed to be a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and not necessarily efficient.
        """
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self._search_normalized(word[1:])
            else:
                partial_tokens[word] = self._search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
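
    # Illustrative call (token ids depend on the database contents):
    #   get_word_token_info(['#Main Street', 'main'])
    #   -> [('#Main Street', 'main street', 123), ('main', 'main', 456)]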


    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
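        # e.g. '  ab1 2cd ' -> 'AB1 2CD'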
        return postcode.strip().upper()


    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        to_delete = []
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word WHERE type = 'P') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            with CopyBuffer() as copystr:
                for postcode, word in cur:
                    if postcode is None:
                        to_delete.append(word)
                    else:
                        copystr.add(self._search_normalized(postcode),
                                    'P', postcode)

                if to_delete:
                    cur.execute("""DELETE FROM WORD
                                   WHERE type ='P' and word = any(%s)
                                """, (to_delete, ))

                copystr.copy_out(cur, 'word',
                                 columns=['word_token', 'type', 'word'])


    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases will be
            completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)
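
    # Special-phrase rows store their classification in the jsonb 'info'
    # column, e.g. {"class": "amenity", "type": "restaurant", "op": "in"}
    # (illustrative values).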


    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self._search_normalized(word)
                if term:
                    copystr.add(term, 'S', word,
                                json.dumps({'class': cls, 'type': typ,
                                            'op': oper if oper in ('in', 'near') else None}))
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word_token', 'type', 'word', 'info'])

        return added


    @staticmethod
    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.execute_values(
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE type = 'S' and word = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
                """, to_delete)

        return len(to_delete)


    def add_country_names(self, country_code, names):
        """ Add default names for the given country to the search index.
        """
        # Make sure any name preprocessing for country names applies.
        info = PlaceInfo({'name': names, 'country_code': country_code,
                          'rank_address': 4, 'class': 'boundary',
                          'type': 'administrative'})
        self._add_country_full_names(country_code,
                                     self.sanitizer.process_names(info)[0],
                                     internal=True)


    def _add_country_full_names(self, country_code, names, internal=False):
        """ Add names for the given country from an already sanitized
            name list.
        """
        word_tokens = set()
        for name in names:
            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
                             FROM word
                             WHERE type = 'C' and word = %s""",
                        (country_code, ))
            existing_tokens = {True: set(), False: set()}  # internal/external names
            for word in cur:
                existing_tokens[word[1]].add(word[0])

            # Delete names that no longer exist.
            gone_tokens = existing_tokens[internal] - word_tokens
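            # When names are imported as internal, identical external rows are
            # dropped as well so that they can be re-added with the internal flag.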
            if internal:
                gone_tokens.update(existing_tokens[False] & word_tokens)
            if gone_tokens:
                cur.execute("""DELETE FROM word
                               USING unnest(%s) as token
                               WHERE type = 'C' and word = %s
                                     and word_token = token""",
                            (list(gone_tokens), country_code))

            # Only add those names that are not yet in the list.
            new_tokens = word_tokens - existing_tokens[True]
            if not internal:
                new_tokens -= existing_tokens[False]

            if new_tokens:
                if internal:
                    sql = """INSERT INTO word (word_token, type, word, info)
                             (SELECT token, 'C', %s, '{"internal": "yes"}'
                                FROM unnest(%s) as token)
                          """
                else:
                    sql = """INSERT INTO word (word_token, type, word)
                             (SELECT token, 'C', %s
                                FROM unnest(%s) as token)
                          """
                cur.execute(sql, (country_code, list(new_tokens)))


    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serializable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo()

        names, address = self.sanitizer.process_names(place)

        if names:
            token_info.set_names(*self._compute_name_tokens(names))

            if place.is_country():
                self._add_country_full_names(place.country_code, names)

        if address:
            self._process_place_address(token_info, address)

        return token_info.to_dict()
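
    # The returned dict roughly looks like this (illustrative values):
    #   {'names': '{1,2,3}', 'hnr': '12;12a', 'hnr_tokens': '{55,56}',
    #    'street': '{77}', 'place': '{88}', 'addr': {'city': '{99}'}}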


    def _process_place_address(self, token_info, address):
        for item in address:
            if item.kind == 'postcode':
                self._add_postcode(item.name)
            elif item.kind == 'housenumber':
                token_info.add_housenumber(*self._compute_housenumber_token(item))
            elif item.kind == 'street':
                token_info.add_street(self._retrieve_full_tokens(item.name))
            elif item.kind == 'place':
                if not item.suffix:
                    token_info.add_place(self._compute_partial_tokens(item.name))
            elif not item.kind.startswith('_') and not item.suffix and \
                 item.kind not in ('country', 'full'):
                token_info.add_address_term(item.kind, self._compute_partial_tokens(item.name))


    def _compute_housenumber_token(self, hnr):
        """ Normalize the housenumber and return the word token and the
            canonical form of the housenumber.
        """
        analyzer = self.token_analysis.analysis.get('@housenumber')
        result = None, None

        if analyzer is None:
            # When no custom analyzer is set, simply normalize and transliterate
            norm_name = self._search_normalized(hnr.name)
            if norm_name:
                result = self._cache.housenumbers.get(norm_name, result)
                if result[0] is None:
                    with self.conn.cursor() as cur:
                        cur.execute("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
                        result = cur.fetchone()[0], norm_name
                        self._cache.housenumbers[norm_name] = result
        else:
            # Otherwise use the analyzer to determine the canonical name.
            # Per convention we use the first variant as the 'lookup name', the
            # name that gets saved in the housenumber field of the place.
            norm_name = analyzer.normalize(hnr.name)
            if norm_name:
                result = self._cache.housenumbers.get(norm_name, result)
                if result[0] is None:
                    variants = analyzer.get_variants_ascii(norm_name)
                    if variants:
                        with self.conn.cursor() as cur:
                            cur.execute("SELECT create_analyzed_hnr_id(%s, %s)",
                                        (norm_name, list(variants)))
                            result = cur.fetchone()[0], variants[0]
                            self._cache.housenumbers[norm_name] = result

        return result


    def _compute_partial_tokens(self, name):
        """ Normalize the given term, split it into partial words and return
            the token list for them.
        """
        norm_name = self._search_normalized(name)

        tokens = []
        need_lookup = []
        for partial in norm_name.split():
            token = self._cache.partials.get(partial)
            if token:
                tokens.append(token)
            else:
                need_lookup.append(partial)
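
        # Look up all uncached partials in a single query; the SQL function
        # getorcreate_partial_word() creates missing word entries on the fly.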
        if need_lookup:
            with self.conn.cursor() as cur:
                cur.execute("""SELECT word, getorcreate_partial_word(word)
                               FROM unnest(%s) word""",
                            (need_lookup, ))

                for partial, token in cur:
                    tokens.append(token)
                    self._cache.partials[partial] = token

        return tokens


    def _retrieve_full_tokens(self, name):
        """ Get the full name token for the given name, if it exists.
            The name is only retrieved for the standard analyser.
        """
        norm_name = self._search_normalized(name)

        # return cached if possible
        if norm_name in self._cache.fulls:
            return self._cache.fulls[norm_name]

        with self.conn.cursor() as cur:
            cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
                        (norm_name, ))
            full = [row[0] for row in cur]

        self._cache.fulls[norm_name] = full

        return full


    def _compute_name_tokens(self, names):
        """ Computes the full name and partial name tokens for the given
            dictionary of names.
        """
        full_tokens = set()
        partial_tokens = set()

        for name in names:
            analyzer_id = name.get_attr('analyzer')
            analyzer = self.token_analysis.get_analyzer(analyzer_id)
            norm_name = analyzer.normalize(name.name)
            if analyzer_id is None:
                token_id = norm_name
            else:
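                # Names handled by a named analyzer are cached and stored under
                # a suffixed key, e.g. 'hauptstr@de' (illustrative values).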
                token_id = f'{norm_name}@{analyzer_id}'

            full, part = self._cache.names.get(token_id, (None, None))
            if full is None:
                variants = analyzer.get_variants_ascii(norm_name)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                                (token_id, variants))
                    full, part = cur.fetchone()

                self._cache.names[token_id] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens


    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        if re.search(r'[:,;]', postcode) is None:
            postcode = self.normalize_postcode(postcode)

            if postcode not in self._cache.postcodes:
                term = self._search_normalized(postcode)
                if not term:
                    return

                with self.conn.cursor() as cur:
                    # no word_id needed for postcodes
                    cur.execute("""INSERT INTO word (word_token, type, word)
                                   (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
                                    WHERE NOT EXISTS
                                          (SELECT * FROM word
                                           WHERE type = 'P' and word = pc))
                                """, (term, postcode))
                self._cache.postcodes.add(postcode)
626 """ Collect token information to be sent back to the database.
630 self.housenumbers = set()
631 self.housenumber_tokens = set()
632 self.street_tokens = set()
633 self.place_tokens = set()
634 self.address_tokens = {}


    @staticmethod
    def _mk_array(tokens):
        return f"{{{','.join((str(s) for s in tokens))}}}"
643 """ Return the token information in database importable format.
648 out['names'] = self.names
650 if self.housenumbers:
651 out['hnr'] = ';'.join(self.housenumbers)
652 out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)
654 if self.street_tokens:
655 out['street'] = self._mk_array(self.street_tokens)
657 if self.place_tokens:
658 out['place'] = self._mk_array(self.place_tokens)
660 if self.address_tokens:
661 out['addr'] = self.address_tokens


    def set_names(self, fulls, partials):
        """ Adds token information for the normalised names.
        """
        self.names = self._mk_array(itertools.chain(fulls, partials))


    def add_housenumber(self, token, hnr):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        if token:
            self.housenumbers.add(hnr)
            self.housenumber_tokens.add(token)


    def add_street(self, tokens):
        """ Add addr:street match terms.
        """
        self.street_tokens.update(tokens)


    def add_place(self, tokens):
        """ Add addr:place search and match terms.
        """
        self.place_tokens.update(tokens)


    def add_address_term(self, key, partials):
        """ Add additional address terms.
        """
        if partials:
            self.address_tokens[key] = self._mk_array(partials)
701 """ Cache for token information to avoid repeated database queries.
703 This cache is not thread-safe and needs to be instantiated per
710 self.postcodes = set()
711 self.housenumbers = {}