# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \
                   Dict, Set, Iterable
import itertools
import json
import logging
from pathlib import Path
from textwrap import dedent

from nominatim.db.connection import connect, Connection, Cursor
from nominatim.config import Configuration
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.data.place_info import PlaceInfo
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
from nominatim.tokenizer.sanitizers.base import PlaceName
from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()


def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
    """ Create a new instance of the tokenizer provided by this module.
    """
    return ICUTokenizer(dsn, data_dir)
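
# Usage sketch (illustrative only; the DSN, path and config object are
# placeholders): this factory is what Nominatim's tokenizer loading code
# calls to obtain an instance of this tokenizer.
#
#   tokenizer = create('dbname=nominatim', Path('/path/to/project/tokenizer'))
#   tokenizer.init_from_project(config)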


class ICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn: str, data_dir: Path) -> None:
        self.dsn = dsn
        self.data_dir = data_dir
        self.loader: Optional[ICURuleLoader] = None

    def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        self.loader = ICURuleLoader(config)

        self._install_php(config.lib_dir.php, overwrite=True)
        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)
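
    # Note: the rule configuration is persisted via _save_config() so that
    # init_from_project() below can restore exactly the same normalisation
    # rules on later runs against this database.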

    def init_from_project(self, config: Configuration) -> None:
        """ Initialise the tokenizer from the project directory.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)

        self._install_php(config.lib_dir.php, overwrite=False)

    def finalize_import(self, config: Configuration) -> None:
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')

    def update_sql_functions(self, config: Configuration) -> None:
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')

    def check_database(self, config: Configuration) -> None:
        """ Check that the tokenizer is set up correctly.
        """
        # Will throw an error if there is an issue.
        self.init_from_project(config)

    def update_statistics(self) -> None:
        """ Recompute frequencies for all name words.
        """
        with connect(self.dsn) as conn:
            if conn.table_exists('search_name'):
                with conn.cursor() as cur:
                    cur.drop_table("word_frequencies")
                    LOG.info("Computing word frequencies")
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute("CREATE INDEX ON word_frequencies(id)")
                    LOG.info("Update word table with recomputed frequencies")
                    cur.execute("""UPDATE word
                                   SET info = info || jsonb_build_object('count', count)
                                   FROM word_frequencies WHERE word_id = id""")
                    cur.drop_table("word_frequencies")
            conn.commit()
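
    # The counts computed above are stored in the 'count' field of the word
    # table's info column; they reflect how often each token occurs in the
    # name vectors of the search_name table.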

    def _cleanup_housenumbers(self) -> None:
        """ Remove unused house numbers.
        """
        with connect(self.dsn) as conn:
            if not conn.table_exists('search_name'):
                return
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
                               FROM word
                               WHERE type = 'H'
                                 AND NOT EXISTS(SELECT * FROM search_name
                                                WHERE ARRAY[word.word_id] && name_vector)
                                 AND (char_length(coalesce(word, word_token)) > 6
                                      OR coalesce(word, word_token) not similar to '\\d+')
                            """)
                candidates = {token: wid for wid, token in cur}
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT housenumber FROM placex
                               WHERE housenumber is not null
                                     AND (char_length(housenumber) > 6
                                          OR housenumber not similar to '\\d+')
                            """)
                for row in cur:
                    for hnr in row[0].split(';'):
                        candidates.pop(hnr, None)
            LOG.info("There are %s outdated housenumbers.", len(candidates))
            LOG.debug("Outdated housenumbers: %s", candidates.keys())
            if candidates:
                with conn.cursor() as cur:
                    cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
                                (list(candidates.values()), ))
                conn.commit()
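
    # A housenumber token counts as outdated when it is no longer referenced
    # by any search_name entry, has an unusual form (longer than six
    # characters or not purely numeric) and no longer occurs in placex.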

    def update_word_tokens(self) -> None:
        """ Remove unused tokens.
        """
        LOG.warning("Cleaning up housenumber tokens.")
        self._cleanup_housenumbers()
        LOG.warning("Tokenizer house-keeping done.")

    def name_analyzer(self) -> 'ICUNameAnalyzer':
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

            ```
            with tokenizer.name_analyzer() as analyzer:
                analyzer.add_country_names(...)
            ```

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        assert self.loader is not None
        return ICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                               self.loader.make_token_analysis())

    def _install_php(self, phpdir: Path, overwrite: bool = True) -> None:
        """ Install the php script for the tokenizer.
        """
        assert self.loader is not None
        php_file = self.data_dir / "tokenizer.php"

        if not php_file.exists() or overwrite:
            php_file.write_text(dedent(f"""\
                <?php
                @define('CONST_Max_Word_Frequency', 10000000);
                @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
                @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
                require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')
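
    # The generated tokenizer.php merely exposes the normalisation and
    # transliteration rules as constants and pulls in the PHP-side ICU
    # tokenizer used by the frontend.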

    def _save_config(self) -> None:
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        assert self.loader is not None
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)

    def _init_db_tables(self, config: Configuration) -> None:
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
            conn.commit()


class ICUNameAnalyzer(AbstractAnalyzer):
    """ The ICU analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn: str, sanitizer: PlaceSanitizer,
                 token_analysis: ICUTokenAnalysis) -> None:
        self.conn: Optional[Connection] = connect(dsn).connection
        self.conn.autocommit = True
        self.sanitizer = sanitizer
        self.token_analysis = token_analysis

        self._cache = _TokenCache()

    def close(self) -> None:
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None

    def _search_normalized(self, name: str) -> str:
        """ Return the search token transliteration of the given name.
        """
        return cast(str, self.token_analysis.search.transliterate(name)).strip()

    def _normalized(self, name: str) -> str:
        """ Return the normalized version of the given name with all
            non-relevant information removed.
        """
        return cast(str, self.token_analysis.normalizer.transliterate(name)).strip()
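
    # Illustration (hypothetical input; the exact output depends on the
    # configured ICU rules): for a name like 'Königstraße', _normalized()
    # returns the canonical normalised spelling, while _search_normalized()
    # additionally applies the search transliteration and yields an ASCII
    # token suitable for the word table.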

    def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not necessarily efficient.
        """
        assert self.conn is not None
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self._search_normalized(word[1:])
            else:
                partial_tokens[word] = self._search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                           FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                           FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
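
    # Example of the returned shape (tokens and word ids are hypothetical):
    #   get_word_token_info(['#Main Street', 'main'])
    #     -> [('#Main Street', 'main street', 123), ('main', 'main', 456)]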

    def normalize_postcode(self, postcode: str) -> str:
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()

    def update_postcodes_from_db(self) -> None:
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@postcode')

        with self.conn.cursor() as cur:
            # First get all postcode names currently in the word table.
            cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
            word_entries = set((entry[0] for entry in cur))

            # Then compute the required postcode names from the postcode table.
            needed_entries = set()
            cur.execute("SELECT country_code, postcode FROM location_postcode")
            for cc, postcode in cur:
                info = PlaceInfo({'country_code': cc,
                                  'class': 'place', 'type': 'postcode',
                                  'address': {'postcode': postcode}})
                address = self.sanitizer.process_names(info)[1]
                for place in address:
                    if place.kind == 'postcode':
                        if analyzer is None:
                            postcode_name = place.name.strip().upper()
                            variant_base = None
                        else:
                            postcode_name = analyzer.normalize(place.name)
                            variant_base = place.get_attr("variant")

                        if variant_base:
                            needed_entries.add(f'{postcode_name}@{variant_base}')
                        else:
                            needed_entries.add(postcode_name)
                        break

        # Now update the word table.
        self._delete_unused_postcode_words(word_entries - needed_entries)
        self._add_missing_postcode_words(needed_entries - word_entries)
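
    # Postcode entries with a variant are encoded as '<name>@<variant>' in
    # the word table; _add_missing_postcode_words() below splits this
    # encoding again when creating the actual tokens.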

    def _delete_unused_postcode_words(self, tokens: Iterable[str]) -> None:
        assert self.conn is not None
        if tokens:
            with self.conn.cursor() as cur:
                cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
                            (list(tokens), ))

    def _add_missing_postcode_words(self, tokens: Iterable[str]) -> None:
        assert self.conn is not None
        if not tokens:
            return

        analyzer = self.token_analysis.analysis.get('@postcode')
        terms = []

        for postcode_name in tokens:
            if '@' in postcode_name:
                term, variant = postcode_name.split('@', 2)
                term = self._search_normalized(term)
                if analyzer is None:
                    variants = [term]
                else:
                    variants = analyzer.get_variants_ascii(variant)
                    if term not in variants:
                        variants.append(term)
            else:
                variants = [self._search_normalized(postcode_name)]
            terms.append((postcode_name, variants))

        if terms:
            with self.conn.cursor() as cur:
                cur.execute_values("""SELECT create_postcode_word(pc, var)
                                      FROM (VALUES %s) AS v(pc, var)""",
                                   terms)

    def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
                               should_replace: bool) -> None:
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases
            is completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        assert self.conn is not None
        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)
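
    # A special phrase is a (name, class, type, operator) tuple; only the
    # operators 'in' and 'near' are kept, anything else is stored as null
    # (see _add_special_phrases() below).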

    def _add_special_phrases(self, cursor: Cursor,
                             new_phrases: Set[Tuple[str, str, str, str]],
                             existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self._search_normalized(word)
                if term:
                    copystr.add(term, 'S', word,
                                json.dumps({'class': cls, 'type': typ,
                                            'op': oper if oper in ('in', 'near') else None}))
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word_token', 'type', 'word', 'info'])

        return added

    def _remove_special_phrases(self, cursor: Cursor,
                                new_phrases: Set[Tuple[str, str, str, str]],
                                existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.execute_values(
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE type = 'S' and word = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
                """, to_delete)

        return len(to_delete)

    def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
        """ Add default names for the given country to the search index.
        """
        # Make sure any name preprocessing for country names applies.
        info = PlaceInfo({'name': names, 'country_code': country_code,
                          'rank_address': 4, 'class': 'boundary',
                          'type': 'administrative'})
        self._add_country_full_names(country_code,
                                     self.sanitizer.process_names(info)[0],
                                     internal=True)

    def _add_country_full_names(self, country_code: str, names: Sequence[PlaceName],
                                internal: bool = False) -> None:
        """ Add names for the given country from an already sanitized
            name list.
        """
        assert self.conn is not None
        word_tokens = set()
        for name in names:
            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            # Get the names that are already in the word table.
            cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
                             FROM word
                             WHERE type = 'C' and word = %s""",
                        (country_code, ))
            # internal/external names
            existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()}
            for word in cur:
                existing_tokens[word[1]].add(word[0])

            # Delete names that no longer exist.
            gone_tokens = existing_tokens[internal] - word_tokens
            if internal:
                gone_tokens.update(existing_tokens[False] & word_tokens)
            if gone_tokens:
                cur.execute("""DELETE FROM word
                               USING unnest(%s) as token
                               WHERE type = 'C' and word = %s
                                     and word_token = token""",
                            (list(gone_tokens), country_code))

            # Only add those names that are not yet in the list.
            new_tokens = word_tokens - existing_tokens[True]
            if not internal:
                new_tokens -= existing_tokens[False]
            if new_tokens:
                if internal:
                    sql = """INSERT INTO word (word_token, type, word, info)
                               (SELECT token, 'C', %s, '{"internal": "yes"}'
                                  FROM unnest(%s) as token)
                           """
                else:
                    sql = """INSERT INTO word (word_token, type, word)
                               (SELECT token, 'C', %s
                                  FROM unnest(%s) as token)
                           """
                cur.execute(sql, (country_code, list(new_tokens)))
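
    # Default country names added with internal=True carry an 'internal'
    # marker in their info column, so they can later be replaced without
    # touching country names that came in through the place data itself.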

    def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
        """ Determine tokenizer information about the given place.

            Returns a JSON-serializable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo()

        names, address = self.sanitizer.process_names(place)

        if names:
            token_info.set_names(*self._compute_name_tokens(names))

            if place.is_country():
                assert place.country_code is not None
                self._add_country_full_names(place.country_code, names)

        if address:
            self._process_place_address(token_info, address)

        return token_info.to_dict()
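
    # The resulting dictionary may contain the keys 'names', 'hnr',
    # 'hnr_tokens', 'street', 'place', 'addr' and 'postcode'; see
    # _TokenInfo.to_dict() below for how they are assembled.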

    def _process_place_address(self, token_info: '_TokenInfo',
                               address: Sequence[PlaceName]) -> None:
        for item in address:
            if item.kind == 'postcode':
                token_info.set_postcode(self._add_postcode(item))
            elif item.kind == 'housenumber':
                token_info.add_housenumber(*self._compute_housenumber_token(item))
            elif item.kind == 'street':
                token_info.add_street(self._retrieve_full_tokens(item.name))
            elif item.kind == 'place':
                if not item.suffix:
                    token_info.add_place(self._compute_partial_tokens(item.name))
            elif not item.kind.startswith('_') and not item.suffix and \
                    item.kind not in ('country', 'full', 'inclusion'):
                token_info.add_address_term(item.kind, self._compute_partial_tokens(item.name))

    def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
        """ Normalize the housenumber and return the word token and the
            canonical form of the housenumber.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@housenumber')
        result: Tuple[Optional[int], Optional[str]] = (None, None)

        if analyzer is None:
            # When no custom analyzer is set, simply normalize and transliterate.
            norm_name = self._search_normalized(hnr.name)
            if norm_name:
                result = self._cache.housenumbers.get(norm_name, result)
                if result[0] is None:
                    with self.conn.cursor() as cur:
                        cur.execute("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
                        result = cur.fetchone()[0], norm_name  # type: ignore[no-untyped-call]
                        self._cache.housenumbers[norm_name] = result
        else:
            # Otherwise use the analyzer to determine the canonical name.
            # Per convention we use the first variant as the 'lookup name', the
            # name that gets saved in the housenumber field of the place.
            norm_name = analyzer.normalize(hnr.name)
            if norm_name:
                result = self._cache.housenumbers.get(norm_name, result)
                if result[0] is None:
                    variants = analyzer.get_variants_ascii(norm_name)
                    if variants:
                        with self.conn.cursor() as cur:
                            cur.execute("SELECT create_analyzed_hnr_id(%s, %s)",
                                        (norm_name, list(variants)))
                            result = cur.fetchone()[0], variants[0]  # type: ignore[no-untyped-call]
                            self._cache.housenumbers[norm_name] = result

        return result
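
    # The returned tuple is (token id, lookup name); the lookup name is the
    # form that gets saved in the housenumber field of the place.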

    def _compute_partial_tokens(self, name: str) -> List[int]:
        """ Normalize the given term, split it into partial words and return
            the token list for them.
        """
        assert self.conn is not None
        norm_name = self._search_normalized(name)

        tokens = []
        need_lookup = []
        for partial in norm_name.split():
            token = self._cache.partials.get(partial)
            if token:
                tokens.append(token)
            else:
                need_lookup.append(partial)

        if need_lookup:
            with self.conn.cursor() as cur:
                cur.execute("""SELECT word, getorcreate_partial_word(word)
                               FROM unnest(%s) word""",
                            (need_lookup, ))

                for partial, token in cur:
                    assert token is not None
                    tokens.append(token)
                    self._cache.partials[partial] = token

        return tokens

    def _retrieve_full_tokens(self, name: str) -> List[int]:
        """ Get the full name token for the given name, if it exists.
            The name is only retrieved for the standard analyser.
        """
        assert self.conn is not None
        norm_name = self._search_normalized(name)

        # return cached if possible
        if norm_name in self._cache.fulls:
            return self._cache.fulls[norm_name]

        with self.conn.cursor() as cur:
            cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
                        (norm_name, ))
            full = [row[0] for row in cur]

        self._cache.fulls[norm_name] = full

        return full

    def _compute_name_tokens(self, names: Sequence[PlaceName]) -> Tuple[Set[int], Set[int]]:
        """ Computes the full name and partial name tokens for the given
            list of names.
        """
        assert self.conn is not None
        full_tokens: Set[int] = set()
        partial_tokens: Set[int] = set()

        for name in names:
            analyzer_id = name.get_attr('analyzer')
            analyzer = self.token_analysis.get_analyzer(analyzer_id)
            norm_name = analyzer.normalize(name.name)
            if analyzer_id is None:
                token_id = norm_name
            else:
                token_id = f'{norm_name}@{analyzer_id}'

            full, part = self._cache.names.get(token_id, (None, None))
            if full is None:
                variants = analyzer.get_variants_ascii(norm_name)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
                                (token_id, variants))
                    full, part = cast(Tuple[int, List[int]],
                                      cur.fetchone())  # type: ignore[no-untyped-call]

                self._cache.names[token_id] = (full, part)

            assert part is not None

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens

    def _add_postcode(self, item: PlaceName) -> Optional[str]:
        """ Make sure the normalized postcode is present in the word table.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@postcode')

        if analyzer is None:
            postcode_name = item.name.strip().upper()
            variant_base = None
        else:
            postcode_name = analyzer.normalize(item.name)
            variant_base = item.get_attr("variant")

        if variant_base:
            postcode = f'{postcode_name}@{variant_base}'
        else:
            postcode = postcode_name

        if postcode not in self._cache.postcodes:
            term = self._search_normalized(postcode_name)
            if not term:
                return None

            variants = {term}
            if analyzer is not None and variant_base:
                variants.update(analyzer.get_variants_ascii(variant_base))

            with self.conn.cursor() as cur:
                cur.execute("SELECT create_postcode_word(%s, %s)",
                            (postcode, list(variants)))
            self._cache.postcodes.add(postcode)

        return postcode
717 """ Collect token information to be sent back to the database.
719 def __init__(self) -> None:
720 self.names: Optional[str] = None
721 self.housenumbers: Set[str] = set()
722 self.housenumber_tokens: Set[int] = set()
723 self.street_tokens: Set[int] = set()
724 self.place_tokens: Set[int] = set()
725 self.address_tokens: Dict[str, str] = {}
726 self.postcode: Optional[str] = None

    def _mk_array(self, tokens: Iterable[Any]) -> str:
        return f"{{{','.join((str(s) for s in tokens))}}}"
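
    # Example: _mk_array([1, 2, 3]) yields the PostgreSQL array literal '{1,2,3}'.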

    def to_dict(self) -> Dict[str, Any]:
        """ Return the token information in database importable format.
        """
        out: Dict[str, Any] = {}

        if self.names:
            out['names'] = self.names

        if self.housenumbers:
            out['hnr'] = ';'.join(self.housenumbers)
            out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)

        if self.street_tokens:
            out['street'] = self._mk_array(self.street_tokens)

        if self.place_tokens:
            out['place'] = self._mk_array(self.place_tokens)

        if self.address_tokens:
            out['addr'] = self.address_tokens

        if self.postcode:
            out['postcode'] = self.postcode

        return out

    def set_names(self, fulls: Iterable[int], partials: Iterable[int]) -> None:
        """ Add token information for the normalised names.
        """
        self.names = self._mk_array(itertools.chain(fulls, partials))

    def add_housenumber(self, token: Optional[int], hnr: Optional[str]) -> None:
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        if token:
            assert hnr is not None
            self.housenumbers.add(hnr)
            self.housenumber_tokens.add(token)

    def add_street(self, tokens: Iterable[int]) -> None:
        """ Add addr:street match terms.
        """
        self.street_tokens.update(tokens)

    def add_place(self, tokens: Iterable[int]) -> None:
        """ Add addr:place search and match terms.
        """
        self.place_tokens.update(tokens)

    def add_address_term(self, key: str, partials: Iterable[int]) -> None:
        """ Add additional address terms.
        """
        if partials:
            self.address_tokens[key] = self._mk_array(partials)

    def set_postcode(self, postcode: Optional[str]) -> None:
        """ Set the postcode to the given one.
        """
        self.postcode = postcode
801 """ Cache for token information to avoid repeated database queries.
803 This cache is not thread-safe and needs to be instantiated per
806 def __init__(self) -> None:
807 self.names: Dict[str, Tuple[int, List[int]]] = {}
808 self.partials: Dict[str, int] = {}
809 self.fulls: Dict[str, List[int]] = {}
810 self.postcodes: Set[str] = set()
811 self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {}