# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, Dict, Set, Iterable
import itertools
import json
import logging
from pathlib import Path
from textwrap import dedent

from nominatim.db.connection import connect, Connection, Cursor
from nominatim.config import Configuration
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.data.place_info import PlaceInfo
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
from nominatim.tokenizer.sanitizers.base import PlaceName
from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()


def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
    """ Create a new instance of the tokenizer provided by this module.
    """
    return ICUTokenizer(dsn, data_dir)
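
# Illustrative usage sketch (comment only, not executed by this module). The
# exact setup is driven by the surrounding Nominatim tooling, so treat the
# project-directory layout below as an assumption rather than a prescribed API:
#
#   tokenizer = create(dsn, project_dir / 'tokenizer')
#   tokenizer.init_from_project(config)
#   with tokenizer.name_analyzer() as analyzer:
#       token_info = analyzer.process_place(place)
#
# where `project_dir`, `config` and `place` (a PlaceInfo) come from the caller.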


class ICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn: str, data_dir: Path) -> None:
        self.dsn = dsn
        self.data_dir = data_dir
        self.loader: Optional[ICURuleLoader] = None


    def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        self.loader = ICURuleLoader(config)

        self._install_php(config.lib_dir.php, overwrite=True)
        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)


    def init_from_project(self, config: Configuration) -> None:
        """ Initialise the tokenizer from the project directory.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)

        self._install_php(config.lib_dir.php, overwrite=False)


    def finalize_import(self, config: Configuration) -> None:
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')


    def update_sql_functions(self, config: Configuration) -> None:
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')


    def check_database(self, config: Configuration) -> None:
        """ Check that the tokenizer is set up correctly.
        """
        # Will throw an error if there is an issue.
        self.init_from_project(config)


    def update_statistics(self) -> None:
        """ Recompute frequencies for all name words.
        """
        with connect(self.dsn) as conn:
            if conn.table_exists('search_name'):
                with conn.cursor() as cur:
                    cur.drop_table("word_frequencies")
                    LOG.info("Computing word frequencies")
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute("CREATE INDEX ON word_frequencies(id)")
                    LOG.info("Update word table with recomputed frequencies")
                    cur.execute("""UPDATE word
                                   SET info = info || jsonb_build_object('count', count)
                                   FROM word_frequencies WHERE word_id = id""")
                    cur.drop_table("word_frequencies")
            conn.commit()
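
    # Note on the UPDATE above: the jsonb merge keeps whatever keys word.info
    # already holds and adds or overwrites a 'count' key, so a name token ends
    # up with info like {'count': 1234} merged into its existing metadata
    # (which extra keys exist depends on the word type).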


    def _cleanup_housenumbers(self) -> None:
        """ Remove unused house numbers.
        """
        with connect(self.dsn) as conn:
            if not conn.table_exists('search_name'):
                return
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
                               FROM word
                               WHERE type = 'H'
                                 AND NOT EXISTS(SELECT * FROM search_name
                                                WHERE ARRAY[word.word_id] && name_vector)
                                 AND (char_length(coalesce(word, word_token)) > 6
                                      OR coalesce(word, word_token) not similar to '\\d+')
                            """)
                candidates = {token: wid for wid, token in cur}
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT housenumber FROM placex
                               WHERE housenumber is not null
                                     AND (char_length(housenumber) > 6
                                          OR housenumber not similar to '\\d+')
                            """)
                for row in cur:
                    for hnr in row[0].split(';'):
                        candidates.pop(hnr, None)
            LOG.info("There are %s outdated housenumbers.", len(candidates))
            LOG.debug("Outdated housenumbers: %s", candidates.keys())
            if candidates:
                with conn.cursor() as cur:
                    cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
                                (list(candidates.values()), ))
                conn.commit()


    def update_word_tokens(self) -> None:
        """ Remove unused tokens.
        """
        LOG.warning("Cleaning up housenumber tokens.")
        self._cleanup_housenumbers()
        LOG.warning("Tokenizer house-keeping done.")


    def name_analyzer(self) -> 'ICUNameAnalyzer':
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

            ```
            with tokenizer.name_analyzer() as analyzer:
                analyzer.process_place(place)
            ```

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        assert self.loader is not None
        return ICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                               self.loader.make_token_analysis())


    def _install_php(self, phpdir: Path, overwrite: bool = True) -> None:
        """ Install the php script for the tokenizer.
        """
        assert self.loader is not None
        php_file = self.data_dir / "tokenizer.php"

        if not php_file.exists() or overwrite:
            php_file.write_text(dedent(f"""\
                <?php
                @define('CONST_Max_Word_Frequency', 10000000);
                @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
                @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
                require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')


    def _save_config(self) -> None:
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        assert self.loader is not None
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)


    def _init_db_tables(self, config: Configuration) -> None:
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
            conn.commit()


class ICUNameAnalyzer(AbstractAnalyzer):
    """ The ICU analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn: str, sanitizer: PlaceSanitizer,
                 token_analysis: ICUTokenAnalysis) -> None:
        self.conn: Optional[Connection] = connect(dsn).connection
        self.conn.autocommit = True
        self.sanitizer = sanitizer
        self.token_analysis = token_analysis

        self._cache = _TokenCache()


    def close(self) -> None:
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    def _search_normalized(self, name: str) -> str:
        """ Return the search token transliteration of the given name.
        """
        return cast(str, self.token_analysis.search.transliterate(name)).strip()


    def _normalized(self, name: str) -> str:
        """ Return the normalized version of the given name with all
            non-relevant information removed.
        """
        return cast(str, self.token_analysis.normalizer.transliterate(name)).strip()


    def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and not necessarily efficient.
        """
        assert self.conn is not None
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self._search_normalized(word[1:])
            else:
                partial_tokens[word] = self._search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                           FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                           FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
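
    # Example (illustrative, assuming the words are already in the word table):
    # get_word_token_info(['#Main Street', 'main']) could return
    # [('#Main Street', 'main street', 123), ('main', 'main', 456)];
    # the word id is None when no matching token has been stored yet.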


    def normalize_postcode(self, postcode: str) -> str:
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()
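
    # For example, normalize_postcode(' ab1 2cd ') returns 'AB1 2CD':
    # only surrounding whitespace is removed and letters are uppercased.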


    def update_postcodes_from_db(self) -> None:
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@postcode')

        with self.conn.cursor() as cur:
            # First get all postcode names currently in the word table.
            cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
            word_entries = set((entry[0] for entry in cur))

            # Then compute the required postcode names from the postcode table.
            needed_entries = set()
            cur.execute("SELECT country_code, postcode FROM location_postcode")
            for cc, postcode in cur:
                info = PlaceInfo({'country_code': cc,
                                  'class': 'place', 'type': 'postcode',
                                  'address': {'postcode': postcode}})
                address = self.sanitizer.process_names(info)[1]
                for place in address:
                    if place.kind == 'postcode':
                        if analyzer is None:
                            postcode_name = place.name.strip().upper()
                            variant_base = None
                        else:
                            postcode_name = analyzer.normalize(place.name)
                            variant_base = place.get_attr("variant")

                        if variant_base:
                            needed_entries.add(f'{postcode_name}@{variant_base}')
                        else:
                            needed_entries.add(postcode_name)
                        break

        # Now update the word table.
        self._delete_unused_postcode_words(word_entries - needed_entries)
        self._add_missing_postcode_words(needed_entries - word_entries)


    def _delete_unused_postcode_words(self, tokens: Iterable[str]) -> None:
        assert self.conn is not None
        if tokens:
            with self.conn.cursor() as cur:
                cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
                            (list(tokens), ))


    def _add_missing_postcode_words(self, tokens: Iterable[str]) -> None:
        assert self.conn is not None
        if not tokens:
            return

        analyzer = self.token_analysis.analysis.get('@postcode')
        terms = []

        for postcode_name in tokens:
            if '@' in postcode_name:
                term, variant = postcode_name.split('@', 2)
                term = self._search_normalized(term)
                if analyzer is None:
                    variants = [term]
                else:
                    variants = analyzer.get_variants_ascii(variant)
                    if term not in variants:
                        variants.append(term)
            else:
                variants = [self._search_normalized(postcode_name)]
            terms.append((postcode_name, variants))

        if terms:
            with self.conn.cursor() as cur:
                cur.execute_values("""SELECT create_postcode_word(pc, var)
                                      FROM (VALUES %s) AS v(pc, var)""",
                                   terms)


    def update_special_phrases(self, phrases: Sequence[Tuple[str, str, str, str]],
                               should_replace: bool) -> None:
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases will be
            completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        assert self.conn is not None
        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)
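
    # Each phrase is a (label, class, type, operator) tuple, for example
    # ('bar', 'amenity', 'bar', '-'). Only 'in' and 'near' operators are kept
    # in the word table; '-' stands in for "no operator" when comparing
    # against what is already stored.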


    def _add_special_phrases(self, cursor: Cursor,
                             new_phrases: Set[Tuple[str, str, str, str]],
                             existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self._search_normalized(word)
                if term:
                    copystr.add(term, 'S', word,
                                json.dumps({'class': cls, 'type': typ,
                                            'op': oper if oper in ('in', 'near') else None}))
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word_token', 'type', 'word', 'info'])

        return added


    def _remove_special_phrases(self, cursor: Cursor,
                                new_phrases: Set[Tuple[str, str, str, str]],
                                existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.execute_values(
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE type = 'S' and word = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
                """, to_delete)

        return len(to_delete)


    def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
        """ Add default names for the given country to the search index.
        """
        # Make sure any name preprocessing for country names applies.
        info = PlaceInfo({'name': names, 'country_code': country_code,
                          'rank_address': 4, 'class': 'boundary',
                          'type': 'administrative'})
        self._add_country_full_names(country_code,
                                     self.sanitizer.process_names(info)[0],
                                     internal=True)


    def _add_country_full_names(self, country_code: str, names: Sequence[PlaceName],
                                internal: bool = False) -> None:
        """ Add names for the given country from an already sanitized
            name list.
        """
        assert self.conn is not None
        word_tokens = set()
        for name in names:
            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            # Get existing names.
            cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
                             FROM word
                             WHERE type = 'C' and word = %s""",
                        (country_code, ))
            # internal/external names
            existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()}
            for word in cur:
                existing_tokens[word[1]].add(word[0])

            # Delete names that no longer exist.
            gone_tokens = existing_tokens[internal] - word_tokens
            if internal:
                gone_tokens.update(existing_tokens[False] & word_tokens)
            if gone_tokens:
                cur.execute("""DELETE FROM word
                               USING unnest(%s) as token
                               WHERE type = 'C' and word = %s
                                     and word_token = token""",
                            (list(gone_tokens), country_code))

            # Only add those names that are not yet in the list.
            new_tokens = word_tokens - existing_tokens[True]
            if not internal:
                new_tokens -= existing_tokens[False]
            if new_tokens:
                if internal:
                    sql = """INSERT INTO word (word_token, type, word, info)
                               (SELECT token, 'C', %s, '{"internal": "yes"}'
                                  FROM unnest(%s) as token)
                           """
                else:
                    sql = """INSERT INTO word (word_token, type, word)
                               (SELECT token, 'C', %s
                                  FROM unnest(%s) as token)
                          """
                cur.execute(sql, (country_code, list(new_tokens)))


    def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
        """ Determine tokenizer information about the given place.

            Returns a JSON-serializable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo()

        names, address = self.sanitizer.process_names(place)

        if names:
            token_info.set_names(*self._compute_name_tokens(names))

            if place.is_country():
                assert place.country_code is not None
                self._add_country_full_names(place.country_code, names)

        if address:
            self._process_place_address(token_info, address)

        return token_info.to_dict()


    def _process_place_address(self, token_info: '_TokenInfo',
                               address: Sequence[PlaceName]) -> None:
        for item in address:
            if item.kind == 'postcode':
                token_info.set_postcode(self._add_postcode(item))
            elif item.kind == 'housenumber':
                token_info.add_housenumber(*self._compute_housenumber_token(item))
            elif item.kind == 'street':
                token_info.add_street(self._retrieve_full_tokens(item.name))
            elif item.kind == 'place':
                if not item.suffix:
                    token_info.add_place(self._compute_partial_tokens(item.name))
            elif not item.kind.startswith('_') and not item.suffix and \
                 item.kind not in ('country', 'full', 'inclusion'):
                token_info.add_address_term(item.kind, self._compute_partial_tokens(item.name))


    def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
        """ Normalize the housenumber and return the word token and the
            canonical form of the housenumber.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@housenumber')
        result: Tuple[Optional[int], Optional[str]] = (None, None)

        if analyzer is None:
            # When no custom analyzer is set, simply normalize and transliterate.
            norm_name = self._search_normalized(hnr.name)
            if norm_name:
                result = self._cache.housenumbers.get(norm_name, result)
                if result[0] is None:
                    with self.conn.cursor() as cur:
                        cur.execute("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
                        result = cur.fetchone()[0], norm_name  # type: ignore[no-untyped-call]
                        self._cache.housenumbers[norm_name] = result
        else:
            # Otherwise use the analyzer to determine the canonical name.
            # Per convention we use the first variant as the 'lookup name', the
            # name that gets saved in the housenumber field of the place.
            norm_name = analyzer.normalize(hnr.name)
            if norm_name:
                result = self._cache.housenumbers.get(norm_name, result)
                if result[0] is None:
                    variants = analyzer.get_variants_ascii(norm_name)
                    if variants:
                        with self.conn.cursor() as cur:
                            cur.execute("SELECT create_analyzed_hnr_id(%s, %s)",
                                        (norm_name, list(variants)))
                            result = cur.fetchone()[0], variants[0]  # type: ignore[no-untyped-call]
                            self._cache.housenumbers[norm_name] = result

        return result
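
    # The returned tuple is (word id, canonical housenumber); both parts are
    # None when the housenumber cannot be normalized to anything usable, in
    # which case _TokenInfo.add_housenumber() silently drops it.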


    def _compute_partial_tokens(self, name: str) -> List[int]:
        """ Normalize the given term, split it into partial words and return
            the token list for them.
        """
        assert self.conn is not None
        norm_name = self._search_normalized(name)

        tokens = []
        need_lookup = []
        for partial in norm_name.split():
            token = self._cache.partials.get(partial)
            if token:
                tokens.append(token)
            else:
                need_lookup.append(partial)

        if need_lookup:
            with self.conn.cursor() as cur:
                cur.execute("""SELECT word, getorcreate_partial_word(word)
                               FROM unnest(%s) word""",
                            (need_lookup, ))

                for partial, token in cur:
                    assert token is not None
                    tokens.append(token)
                    self._cache.partials[partial] = token

        return tokens


    def _retrieve_full_tokens(self, name: str) -> List[int]:
        """ Get the full name token for the given name, if it exists.
            The name is only retrieved for the standard analyser.
        """
        assert self.conn is not None
        norm_name = self._search_normalized(name)

        # return cached if possible
        if norm_name in self._cache.fulls:
            return self._cache.fulls[norm_name]

        with self.conn.cursor() as cur:
            cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
                        (norm_name, ))
            full = [row[0] for row in cur]

        self._cache.fulls[norm_name] = full

        return full


    def _compute_name_tokens(self, names: Sequence[PlaceName]) -> Tuple[Set[int], Set[int]]:
        """ Computes the full name and partial name tokens for the given
            list of names.
        """
        assert self.conn is not None
        full_tokens: Set[int] = set()
        partial_tokens: Set[int] = set()

        for name in names:
            analyzer_id = name.get_attr('analyzer')
            analyzer = self.token_analysis.get_analyzer(analyzer_id)
            norm_name = analyzer.normalize(name.name)
            if analyzer_id is None:
                token_id = norm_name
            else:
                token_id = f'{norm_name}@{analyzer_id}'

            full, part = self._cache.names.get(token_id, (None, None))
            if full is None:
                variants = analyzer.get_variants_ascii(norm_name)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
                                (token_id, variants))
                    full, part = cast(Tuple[int, List[int]],
                                      cur.fetchone())  # type: ignore[no-untyped-call]

                self._cache.names[token_id] = (full, part)

            assert part is not None

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens


    def _add_postcode(self, item: PlaceName) -> Optional[str]:
        """ Make sure the normalized postcode is present in the word table.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@postcode')

        if analyzer is None:
            postcode_name = item.name.strip().upper()
            variant_base = None
        else:
            postcode_name = analyzer.normalize(item.name)
            variant_base = item.get_attr("variant")

        if variant_base:
            postcode = f'{postcode_name}@{variant_base}'
        else:
            postcode = postcode_name

        if postcode not in self._cache.postcodes:
            term = self._search_normalized(postcode_name)
            if not term:
                return None

            variants = {term}
            if analyzer is not None and variant_base:
                variants.update(analyzer.get_variants_ascii(variant_base))

            with self.conn.cursor() as cur:
                cur.execute("SELECT create_postcode_word(%s, %s)",
                            (postcode, list(variants)))
            self._cache.postcodes.add(postcode)

        return postcode
716 """ Collect token information to be sent back to the database.
718 def __init__(self) -> None:
719 self.names: Optional[str] = None
720 self.housenumbers: Set[str] = set()
721 self.housenumber_tokens: Set[int] = set()
722 self.street_tokens: Set[int] = set()
723 self.place_tokens: Set[int] = set()
724 self.address_tokens: Dict[str, str] = {}
725 self.postcode: Optional[str] = None
728 def _mk_array(self, tokens: Iterable[Any]) -> str:
729 return f"{{{','.join((str(s) for s in tokens))}}}"
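
    # _mk_array([1, 2, 3]) yields '{1,2,3}', i.e. a PostgreSQL array literal
    # that can be handed to the database as part of the token_info structure.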


    def to_dict(self) -> Dict[str, Any]:
        """ Return the token information in database importable format.
        """
        out: Dict[str, Any] = {}

        if self.names:
            out['names'] = self.names

        if self.housenumbers:
            out['hnr'] = ';'.join(self.housenumbers)
            out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)

        if self.street_tokens:
            out['street'] = self._mk_array(self.street_tokens)

        if self.place_tokens:
            out['place'] = self._mk_array(self.place_tokens)

        if self.address_tokens:
            out['addr'] = self.address_tokens

        if self.postcode:
            out['postcode'] = self.postcode

        return out
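
    # Illustrative result (values made up): {'names': '{1,2,3}',
    # 'hnr': '12;12a', 'hnr_tokens': '{4,5}', 'street': '{6}',
    # 'addr': {'city': '{7,8}'}, 'postcode': '12345'}. A key only appears
    # when the corresponding information was found on the place.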


    def set_names(self, fulls: Iterable[int], partials: Iterable[int]) -> None:
        """ Adds token information for the normalised names.
        """
        self.names = self._mk_array(itertools.chain(fulls, partials))


    def add_housenumber(self, token: Optional[int], hnr: Optional[str]) -> None:
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        if token:
            assert hnr is not None
            self.housenumbers.add(hnr)
            self.housenumber_tokens.add(token)


    def add_street(self, tokens: Iterable[int]) -> None:
        """ Add addr:street match terms.
        """
        self.street_tokens.update(tokens)


    def add_place(self, tokens: Iterable[int]) -> None:
        """ Add addr:place search and match terms.
        """
        self.place_tokens.update(tokens)


    def add_address_term(self, key: str, partials: Iterable[int]) -> None:
        """ Add additional address terms.
        """
        if partials:
            self.address_tokens[key] = self._mk_array(partials)


    def set_postcode(self, postcode: Optional[str]) -> None:
        """ Set the postcode to the given one.
        """
        self.postcode = postcode
800 """ Cache for token information to avoid repeated database queries.
802 This cache is not thread-safe and needs to be instantiated per
805 def __init__(self) -> None:
806 self.names: Dict[str, Tuple[int, List[int]]] = {}
807 self.partials: Dict[str, int] = {}
808 self.fulls: Dict[str, List[int]] = {}
809 self.postcodes: Set[str] = set()
810 self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {}