# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
import itertools
import json
import logging
import re
from textwrap import dedent

from nominatim.db.connection import connect
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.indexer.place_info import PlaceInfo
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()


def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)


class LegacyICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """
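
    # Typical usage (sketch): obtain an instance via create(dsn, data_dir),
    # call init_new_db() for a fresh import or init_from_project() for an
    # existing database, then create analyzers with name_analyzer().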

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.loader = None


    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        self.loader = ICURuleLoader(config)

        self._install_php(config.lib_dir.php)
        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)


    def init_from_project(self, config):
        """ Initialise the tokenizer from the project directory.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)


    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')


    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')


    def check_database(self, config):
        """ Check that the tokenizer is set up correctly.
        """
        # Will throw an error if there is an issue.
        self.init_from_project(config)


    def update_statistics(self):
        """ Recompute frequencies for all name words.
        """
        with connect(self.dsn) as conn:
            if conn.table_exists('search_name'):
                with conn.cursor() as cur:
                    cur.drop_table("word_frequencies")
                    LOG.info("Computing word frequencies")
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute("CREATE INDEX ON word_frequencies(id)")
                    LOG.info("Update word table with recomputed frequencies")
                    cur.execute("""UPDATE word
                                   SET info = info || jsonb_build_object('count', count)
                                   FROM word_frequencies WHERE word_id = id""")
                    cur.drop_table("word_frequencies")
            conn.commit()


    def _cleanup_housenumbers(self):
        """ Remove unused house numbers.
        """
        with connect(self.dsn) as conn:
            if not conn.table_exists('search_name'):
                return
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
                               FROM word
                               WHERE type = 'H'
                                 AND NOT EXISTS(SELECT * FROM search_name
                                                WHERE ARRAY[word.word_id] && name_vector)
                                 AND (char_length(coalesce(word, word_token)) > 6
                                      OR coalesce(word, word_token) not similar to '\\d+')
                            """)
                # candidates maps the lookup form of a housenumber token to its word_id
                candidates = {token: wid for wid, token in cur}
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT housenumber FROM placex
                               WHERE housenumber is not null
                                     AND (char_length(housenumber) > 6
                                          OR housenumber not similar to '\\d+')
                            """)
                for row in cur:
                    for hnr in row[0].split(';'):
                        candidates.pop(hnr, None)
            LOG.info("There are %s outdated housenumbers.", len(candidates))
            LOG.debug("Outdated housenumbers: %s", candidates.keys())
            if candidates:
                with conn.cursor() as cur:
                    cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
                                (list(candidates.values()), ))
                conn.commit()


    def update_word_tokens(self):
        """ Remove unused tokens.
        """
        LOG.warning("Cleaning up housenumber tokens.")
        self._cleanup_housenumbers()
        LOG.warning("Tokenizer house-keeping done.")


    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    analyzer.process_place(place)

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                                     self.loader.make_token_analysis())


    def _install_php(self, phpdir):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent(f"""\
            <?php
            @define('CONST_Max_Word_Frequency', 10000000);
            @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
            @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
            require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))


    def _save_config(self):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)


    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
            conn.commit()


class LegacyICUNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, sanitizer, token_analysis):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.sanitizer = sanitizer
        self.token_analysis = token_analysis

        self._cache = _TokenCache()


    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    def _search_normalized(self, name):
        """ Return the search token transliteration of the given name.
        """
        return self.token_analysis.search.transliterate(name).strip()


    def _normalized(self, name):
        """ Return the normalized version of the given name with all
            non-relevant information removed.
        """
        return self.token_analysis.normalizer.transliterate(name).strip()


    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not necessarily efficient.
        """
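        # Illustrative result shape (hypothetical token strings and ids):
        #   get_word_token_info(['#Main Street', 'main'])
        #   -> [('#Main Street', 'main street', 1234), ('main', 'main', 5678)]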
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self._search_normalized(word[1:])
            else:
                partial_tokens[word] = self._search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]


    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
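        # e.g. normalize_postcode('  ab1 2cd ') -> 'AB1 2CD'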
        return postcode.strip().upper()


    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        to_delete = []
        with self.conn.cursor() as cur:
            # This finds the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT word FROM word WHERE type = 'P') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")

            with CopyBuffer() as copystr:
                for postcode, word in cur:
                    if postcode is None:
                        to_delete.append(word)
                    else:
                        copystr.add(self._search_normalized(postcode),
                                    'P', postcode)

                if to_delete:
                    cur.execute("""DELETE FROM WORD
                                   WHERE type ='P' and word = any(%s)
                                """, (to_delete, ))

                copystr.copy_out(cur, 'word',
                                 columns=['word_token', 'type', 'word'])


    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases will be
            completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
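        # Each phrase is expected as a 4-tuple of (label, class, type, operator);
        # the label is normalised before comparison with the existing entries.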
        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)


    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self._search_normalized(word)
                if term:
                    copystr.add(term, 'S', word,
                                json.dumps({'class': cls, 'type': typ,
                                            'op': oper if oper in ('in', 'near') else None}))
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word_token', 'type', 'word', 'info'])

        return added


    @staticmethod
    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.execute_values(
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE type = 'S' and word = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
                """, to_delete)

        return len(to_delete)


    def add_country_names(self, country_code, names):
        """ Add default names for the given country to the search index.
        """
        # Make sure any name preprocessing for country names applies.
        info = PlaceInfo({'name': names, 'country_code': country_code,
                          'rank_address': 4, 'class': 'boundary',
                          'type': 'administrative'})
        self._add_country_full_names(country_code,
                                     self.sanitizer.process_names(info)[0],
                                     internal=True)


    def _add_country_full_names(self, country_code, names, internal=False):
        """ Add names for the given country from an already sanitized
            name list.
        """
        word_tokens = set()
        for name in names:
            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            # Get existing names.
            cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
                             FROM word
                             WHERE type = 'C' and word = %s""",
                        (country_code, ))
            existing_tokens = {True: set(), False: set()}  # internal/external names
            for word in cur:
                existing_tokens[word[1]].add(word[0])

            # Delete names that no longer exist.
            gone_tokens = existing_tokens[internal] - word_tokens
            if internal:
                gone_tokens.update(existing_tokens[False] & word_tokens)
            if gone_tokens:
                cur.execute("""DELETE FROM word
                               USING unnest(%s) as token
                               WHERE type = 'C' and word = %s
                                     and word_token = token""",
                            (list(gone_tokens), country_code))

            # Only add those names that are not yet in the list.
            new_tokens = word_tokens - existing_tokens[True]
            if not internal:
                new_tokens -= existing_tokens[False]
            if new_tokens:
                if internal:
                    sql = """INSERT INTO word (word_token, type, word, info)
                               (SELECT token, 'C', %s, '{"internal": "yes"}'
                                  FROM unnest(%s) as token)
                           """
                else:
                    sql = """INSERT INTO word (word_token, type, word)
                               (SELECT token, 'C', %s
                                  FROM unnest(%s) as token)
                           """
                cur.execute(sql, (country_code, list(new_tokens)))


    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serializable structure that will be handed into
            the database via the token_info field.
        """
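        # Depending on the place, the returned dict may contain the keys
        # 'names', 'hnr', 'hnr_tokens', 'street', 'place' and 'addr',
        # as assembled by _TokenInfo.to_dict() below.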
        token_info = _TokenInfo()

        names, address = self.sanitizer.process_names(place)

        if names:
            token_info.set_names(*self._compute_name_tokens(names))

            if place.is_country():
                self._add_country_full_names(place.country_code, names)

        if address:
            self._process_place_address(token_info, address)

        return token_info.to_dict()


    def _process_place_address(self, token_info, address):
        for item in address:
            if item.kind == 'postcode':
                self._add_postcode(item.name)
            elif item.kind == 'housenumber':
                token_info.add_housenumber(*self._compute_housenumber_token(item))
            elif item.kind == 'street':
                token_info.add_street(self._retrieve_full_tokens(item.name))
            elif item.kind == 'place':
                if not item.suffix:
                    token_info.add_place(self._compute_partial_tokens(item.name))
            elif not item.kind.startswith('_') and not item.suffix and \
                 item.kind not in ('country', 'full'):
                token_info.add_address_term(item.kind, self._compute_partial_tokens(item.name))


    def _compute_housenumber_token(self, hnr):
        """ Normalize the housenumber and return the word token and the
            canonical form.
        """
        analyzer = self.token_analysis.analysis.get('@housenumber')
        result = None, None

        if analyzer is None:
            # When no custom analyzer is set, simply normalize and transliterate.
            norm_name = self._search_normalized(hnr.name)
            if norm_name:
                result = self._cache.housenumbers.get(norm_name, result)
                if result[0] is None:
                    with self.conn.cursor() as cur:
                        cur.execute("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
                        result = cur.fetchone()[0], norm_name
                        self._cache.housenumbers[norm_name] = result
        else:
            # Otherwise use the analyzer to determine the canonical name.
            # Per convention we use the first variant as the 'lookup name', the
            # name that gets saved in the housenumber field of the place.
            norm_name = analyzer.normalize(hnr.name)
            if norm_name:
                result = self._cache.housenumbers.get(norm_name, result)
                if result[0] is None:
                    variants = analyzer.get_variants_ascii(norm_name)
                    if variants:
                        with self.conn.cursor() as cur:
                            cur.execute("SELECT create_analyzed_hnr_id(%s, %s)",
                                        (norm_name, list(variants)))
                            result = cur.fetchone()[0], variants[0]
                            self._cache.housenumbers[norm_name] = result

        return result


    def _compute_partial_tokens(self, name):
        """ Normalize the given term, split it into partial words and return
            the token list for them.
        """
        norm_name = self._search_normalized(name)

        tokens = []
        need_lookup = []
        for partial in norm_name.split():
            token = self._cache.partials.get(partial)
            if token:
                tokens.append(token)
            else:
                need_lookup.append(partial)

        if need_lookup:
            with self.conn.cursor() as cur:
                cur.execute("""SELECT word, getorcreate_partial_word(word)
                               FROM unnest(%s) word""",
                            (need_lookup, ))

                for partial, token in cur:
                    tokens.append(token)
                    self._cache.partials[partial] = token

        return tokens


    def _retrieve_full_tokens(self, name):
        """ Get the full name token for the given name, if it exists.
            The name is only retrieved for the standard analyser.
        """
        norm_name = self._search_normalized(name)

        # return cached if possible
        if norm_name in self._cache.fulls:
            return self._cache.fulls[norm_name]

        with self.conn.cursor() as cur:
            cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
                        (norm_name, ))
            full = [row[0] for row in cur]

        self._cache.fulls[norm_name] = full

        return full


    def _compute_name_tokens(self, names):
        """ Compute the full name and partial name tokens for the given
            list of names.
        """
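        # Results are cached per normalised name; the analyzer id is folded
        # into the cache key so that different analysis modules do not collide.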
        full_tokens = set()
        partial_tokens = set()

        for name in names:
            analyzer_id = name.get_attr('analyzer')
            analyzer = self.token_analysis.get_analyzer(analyzer_id)
            norm_name = analyzer.normalize(name.name)
            if analyzer_id is None:
                token_id = norm_name
            else:
                token_id = f'{norm_name}@{analyzer_id}'

            full, part = self._cache.names.get(token_id, (None, None))
            if full is None:
                variants = analyzer.get_variants_ascii(norm_name)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                                (token_id, variants))
                    full, part = cur.fetchone()

                self._cache.names[token_id] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens


    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        if re.search(r'[:,;]', postcode) is None:
            postcode = self.normalize_postcode(postcode)

            if postcode not in self._cache.postcodes:
                term = self._search_normalized(postcode)
                if not term:
                    return

                with self.conn.cursor() as cur:
                    # no word_id needed for postcodes
                    cur.execute("""INSERT INTO word (word_token, type, word)
                                   (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
                                    WHERE NOT EXISTS
                                     (SELECT * FROM word
                                      WHERE type = 'P' and word = pc))
                                """, (term, postcode))
                self._cache.postcodes.add(postcode)


class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self):
        self.names = None
        self.housenumbers = set()
        self.housenumber_tokens = set()
        self.street_tokens = set()
        self.place_tokens = set()
        self.address_tokens = {}


    @staticmethod
    def _mk_array(tokens):
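        # e.g. _mk_array([1, 2, 3]) -> '{1,2,3}' (PostgreSQL array literal)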
        return f"{{{','.join((str(s) for s in tokens))}}}"


    def to_dict(self):
        """ Return the token information in database importable format.
        """
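        # Hypothetical example: {'names': '{1,2,3}', 'hnr': '12;12a',
        #                        'hnr_tokens': '{77,78}', 'street': '{42}'}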
        out = {}

        if self.names:
            out['names'] = self.names

        if self.housenumbers:
            out['hnr'] = ';'.join(self.housenumbers)
            out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)

        if self.street_tokens:
            out['street'] = self._mk_array(self.street_tokens)

        if self.place_tokens:
            out['place'] = self._mk_array(self.place_tokens)

        if self.address_tokens:
            out['addr'] = self.address_tokens

        return out


    def set_names(self, fulls, partials):
        """ Add token information for the normalised names.
        """
        self.names = self._mk_array(itertools.chain(fulls, partials))


    def add_housenumber(self, token, hnr):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        if token:
            self.housenumbers.add(hnr)
            self.housenumber_tokens.add(token)


    def add_street(self, tokens):
        """ Add addr:street match terms.
        """
        self.street_tokens.update(tokens)


    def add_place(self, tokens):
        """ Add addr:place search and match terms.
        """
        self.place_tokens.update(tokens)


    def add_address_term(self, key, partials):
        """ Add additional address terms.
        """
        if partials:
            self.address_tokens[key] = self._mk_array(partials)


class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self):
        self.names = {}
        self.partials = {}
        self.fulls = {}
        self.postcodes = set()
        self.housenumbers = {}