# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, Dict, Set, Iterable
import itertools
import json
import logging
from pathlib import Path
from textwrap import dedent

from nominatim.db.connection import connect, Connection, Cursor
from nominatim.config import Configuration
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.data.place_info import PlaceInfo
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
from nominatim.tokenizer.sanitizers.base import PlaceName
from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()


def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
    """ Create a new instance of the tokenizer provided by this module.
    """
    return ICUTokenizer(dsn, data_dir)
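
# Illustrative usage sketch (comment only, not executed by this module). The
# exact setup is driven by the surrounding Nominatim tooling, so treat the
# project-directory layout below as an assumption rather than a prescribed API:
#
#   tokenizer = create(dsn, project_dir / 'tokenizer')
#   tokenizer.init_from_project(config)
#   with tokenizer.name_analyzer() as analyzer:
#       token_info = analyzer.process_place(place)
#
# where `project_dir`, `config` and `place` (a PlaceInfo) come from the caller.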


class ICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn: str, data_dir: Path) -> None:
        self.dsn = dsn
        self.data_dir = data_dir
        self.loader: Optional[ICURuleLoader] = None


    def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        self.loader = ICURuleLoader(config)

        self._install_php(config.lib_dir.php, overwrite=True)
        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)


    def init_from_project(self, config: Configuration) -> None:
        """ Initialise the tokenizer from the project directory.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)

        self._install_php(config.lib_dir.php, overwrite=False)


    def finalize_import(self, config: Configuration) -> None:
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')


    def update_sql_functions(self, config: Configuration) -> None:
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')


    def check_database(self, config: Configuration) -> None:
        """ Check that the tokenizer is set up correctly.
        """
        # Will throw an error if there is an issue.
        self.init_from_project(config)


    def update_statistics(self) -> None:
        """ Recompute frequencies for all name words.
        """
        with connect(self.dsn) as conn:
            if conn.table_exists('search_name'):
                with conn.cursor() as cur:
                    cur.drop_table("word_frequencies")
                    LOG.info("Computing word frequencies")
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute("CREATE INDEX ON word_frequencies(id)")
                    LOG.info("Update word table with recomputed frequencies")
                    cur.execute("""UPDATE word
                                   SET info = info || jsonb_build_object('count', count)
                                   FROM word_frequencies WHERE word_id = id""")
                    cur.drop_table("word_frequencies")
            conn.commit()
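
    # Note on the UPDATE above: the jsonb merge keeps whatever keys word.info
    # already holds and adds or overwrites a 'count' key, so a name token ends
    # up with info like {'count': 1234} merged into its existing metadata
    # (which extra keys exist depends on the word type).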


    def _cleanup_housenumbers(self) -> None:
        """ Remove unused house numbers.
        """
        with connect(self.dsn) as conn:
            if not conn.table_exists('search_name'):
                return
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
                               FROM word
                               WHERE type = 'H'
                                 AND NOT EXISTS(SELECT * FROM search_name
                                                WHERE ARRAY[word.word_id] && name_vector)
                                 AND (char_length(coalesce(word, word_token)) > 6
                                      OR coalesce(word, word_token) not similar to '\\d+')
                            """)
                candidates = {token: wid for wid, token in cur}
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT housenumber FROM placex
                               WHERE housenumber is not null
                                     AND (char_length(housenumber) > 6
                                          OR housenumber not similar to '\\d+')
                            """)
                for row in cur:
                    for hnr in row[0].split(';'):
                        candidates.pop(hnr, None)
            LOG.info("There are %s outdated housenumbers.", len(candidates))
            LOG.debug("Outdated housenumbers: %s", candidates.keys())
            if candidates:
                with conn.cursor() as cur:
                    cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
                                (list(candidates.values()), ))
                conn.commit()


    def update_word_tokens(self) -> None:
        """ Remove unused tokens.
        """
        LOG.warning("Cleaning up housenumber tokens.")
        self._cleanup_housenumbers()
        LOG.warning("Tokenizer house-keeping done.")


    def name_analyzer(self) -> 'ICUNameAnalyzer':
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

            ```
            with tokenizer.name_analyzer() as analyzer:
                analyzer.process_place(place)
            ```

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        assert self.loader is not None
        return ICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                               self.loader.make_token_analysis())


    def _install_php(self, phpdir: Path, overwrite: bool = True) -> None:
        """ Install the php script for the tokenizer.
        """
        assert self.loader is not None
        php_file = self.data_dir / "tokenizer.php"

        if not php_file.exists() or overwrite:
            php_file.write_text(dedent(f"""\
                <?php
                @define('CONST_Max_Word_Frequency', 10000000);
                @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
                @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
                require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')


    def _save_config(self) -> None:
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        assert self.loader is not None
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)


    def _init_db_tables(self, config: Configuration) -> None:
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
            conn.commit()


class ICUNameAnalyzer(AbstractAnalyzer):
    """ The ICU analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn: str, sanitizer: PlaceSanitizer,
                 token_analysis: ICUTokenAnalysis) -> None:
        self.conn: Optional[Connection] = connect(dsn).connection
        self.conn.autocommit = True
        self.sanitizer = sanitizer
        self.token_analysis = token_analysis

        self._cache = _TokenCache()


    def close(self) -> None:
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    def _search_normalized(self, name: str) -> str:
        """ Return the search token transliteration of the given name.
        """
        return cast(str, self.token_analysis.search.transliterate(name)).strip()


    def _normalized(self, name: str) -> str:
        """ Return the normalized version of the given name with all
            non-relevant information removed.
        """
        return cast(str, self.token_analysis.normalizer.transliterate(name)).strip()


    def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and not necessarily efficient.
        """
        assert self.conn is not None
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self._search_normalized(word[1:])
            else:
                partial_tokens[word] = self._search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                           FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                           FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
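
    # Example (illustrative, assuming the words are already in the word table):
    # get_word_token_info(['#Main Street', 'main']) could return
    # [('#Main Street', 'main street', 123), ('main', 'main', 456)];
    # the word id is None when no matching token has been stored yet.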


    def normalize_postcode(self, postcode: str) -> str:
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()
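
    # For example, normalize_postcode(' ab1 2cd ') returns 'AB1 2CD':
    # only surrounding whitespace is removed and letters are uppercased.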


    def update_postcodes_from_db(self) -> None:
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@postcode')

        with self.conn.cursor() as cur:
            # First get all postcode names currently in the word table.
            cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
            word_entries = set((entry[0] for entry in cur))

            # Then compute the required postcode names from the postcode table.
            needed_entries = set()
            cur.execute("SELECT country_code, postcode FROM location_postcode")
            for cc, postcode in cur:
                info = PlaceInfo({'country_code': cc,
                                  'class': 'place', 'type': 'postcode',
                                  'address': {'postcode': postcode}})
                address = self.sanitizer.process_names(info)[1]
                for place in address:
                    if place.kind == 'postcode':
                        if analyzer is None:
                            postcode_name = place.name.strip().upper()
                            variant_base = None
                        else:
                            postcode_name = analyzer.normalize(place.name)
                            variant_base = place.get_attr("variant")

                        if variant_base:
                            needed_entries.add(f'{postcode_name}@{variant_base}')
                        else:
                            needed_entries.add(postcode_name)
                        break

        # Now update the word table.
        self._delete_unused_postcode_words(word_entries - needed_entries)
        self._add_missing_postcode_words(needed_entries - word_entries)


    def _delete_unused_postcode_words(self, tokens: Iterable[str]) -> None:
        assert self.conn is not None
        if tokens:
            with self.conn.cursor() as cur:
                cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
                            (list(tokens), ))


    def _add_missing_postcode_words(self, tokens: Iterable[str]) -> None:
        assert self.conn is not None
        if not tokens:
            return

        analyzer = self.token_analysis.analysis.get('@postcode')
        terms = []

        for postcode_name in tokens:
            if '@' in postcode_name:
                term, variant = postcode_name.split('@', 2)
                term = self._search_normalized(term)
                if analyzer is None:
                    variants = [term]
                else:
                    variants = analyzer.get_variants_ascii(variant)
                    if term not in variants:
                        variants.append(term)
            else:
                variants = [self._search_normalized(postcode_name)]
            terms.append((postcode_name, variants))

        if terms:
            with self.conn.cursor() as cur:
                cur.execute_values("""SELECT create_postcode_word(pc, var)
                                      FROM (VALUES %s) AS v(pc, var)""",
                                   terms)


    def update_special_phrases(self, phrases: Sequence[Tuple[str, str, str, str]],
                               should_replace: bool) -> None:
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases will be
            completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        assert self.conn is not None
        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)
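
    # Each phrase is a (label, class, type, operator) tuple, for example
    # ('bar', 'amenity', 'bar', '-'). Only 'in' and 'near' operators are kept
    # in the word table; '-' stands in for "no operator" when comparing
    # against what is already stored.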


    def _add_special_phrases(self, cursor: Cursor,
                             new_phrases: Set[Tuple[str, str, str, str]],
                             existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self._search_normalized(word)
                if term:
                    copystr.add(term, 'S', word,
                                json.dumps({'class': cls, 'type': typ,
                                            'op': oper if oper in ('in', 'near') else None}))
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word_token', 'type', 'word', 'info'])

        return added


    def _remove_special_phrases(self, cursor: Cursor,
                                new_phrases: Set[Tuple[str, str, str, str]],
                                existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.execute_values(
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE type = 'S' and word = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
                """, to_delete)

        return len(to_delete)


    def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
        """ Add default names for the given country to the search index.
        """
        # Make sure any name preprocessing for country names applies.
        info = PlaceInfo({'name': names, 'country_code': country_code,
                          'rank_address': 4, 'class': 'boundary',
                          'type': 'administrative'})
        self._add_country_full_names(country_code,
                                     self.sanitizer.process_names(info)[0],
                                     internal=True)


    def _add_country_full_names(self, country_code: str, names: Sequence[PlaceName],
                                internal: bool = False) -> None:
        """ Add names for the given country from an already sanitized
            name list.
        """
        assert self.conn is not None
        word_tokens = set()
        for name in names:
            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            # Get existing names.
            cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
                             FROM word
                             WHERE type = 'C' and word = %s""",
                        (country_code, ))
            # internal/external names
            existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()}
            for word in cur:
                existing_tokens[word[1]].add(word[0])

            # Delete names that no longer exist.
            gone_tokens = existing_tokens[internal] - word_tokens
            if internal:
                gone_tokens.update(existing_tokens[False] & word_tokens)
            if gone_tokens:
                cur.execute("""DELETE FROM word
                               USING unnest(%s) as token
                               WHERE type = 'C' and word = %s
                                     and word_token = token""",
                            (list(gone_tokens), country_code))

            # Only add those names that are not yet in the list.
            new_tokens = word_tokens - existing_tokens[True]
            if not internal:
                new_tokens -= existing_tokens[False]
            if new_tokens:
                if internal:
                    sql = """INSERT INTO word (word_token, type, word, info)
                               (SELECT token, 'C', %s, '{"internal": "yes"}'
                                  FROM unnest(%s) as token)
                           """
                else:
                    sql = """INSERT INTO word (word_token, type, word)
                               (SELECT token, 'C', %s
                                  FROM unnest(%s) as token)
                          """
                cur.execute(sql, (country_code, list(new_tokens)))


    def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
        """ Determine tokenizer information about the given place.

            Returns a JSON-serializable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo()

        names, address = self.sanitizer.process_names(place)

        if names:
            token_info.set_names(*self._compute_name_tokens(names))

            if place.is_country():
                assert place.country_code is not None
                self._add_country_full_names(place.country_code, names)

        if address:
            self._process_place_address(token_info, address)

        return token_info.to_dict()


    def _process_place_address(self, token_info: '_TokenInfo',
                               address: Sequence[PlaceName]) -> None:
        for item in address:
            if item.kind == 'postcode':
                token_info.set_postcode(self._add_postcode(item))
            elif item.kind == 'housenumber':
                token_info.add_housenumber(*self._compute_housenumber_token(item))
            elif item.kind == 'street':
                token_info.add_street(self._retrieve_full_tokens(item.name))
            elif item.kind == 'place':
                if not item.suffix:
                    token_info.add_place(self._compute_partial_tokens(item.name))
            elif not item.kind.startswith('_') and not item.suffix and \
                 item.kind not in ('country', 'full', 'inclusion'):
                token_info.add_address_term(item.kind, self._compute_partial_tokens(item.name))


    def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
        """ Normalize the housenumber and return the word token and the
            canonical form of the housenumber.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@housenumber')
        result: Tuple[Optional[int], Optional[str]] = (None, None)

        if analyzer is None:
            # When no custom analyzer is set, simply normalize and transliterate.
            norm_name = self._search_normalized(hnr.name)
            if norm_name:
                result = self._cache.housenumbers.get(norm_name, result)
                if result[0] is None:
                    with self.conn.cursor() as cur:
                        cur.execute("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
                        result = cur.fetchone()[0], norm_name  # type: ignore[no-untyped-call]
                        self._cache.housenumbers[norm_name] = result
        else:
            # Otherwise use the analyzer to determine the canonical name.
            # Per convention we use the first variant as the 'lookup name', the
            # name that gets saved in the housenumber field of the place.
            norm_name = analyzer.normalize(hnr.name)
            if norm_name:
                result = self._cache.housenumbers.get(norm_name, result)
                if result[0] is None:
                    variants = analyzer.get_variants_ascii(norm_name)
                    if variants:
                        with self.conn.cursor() as cur:
                            cur.execute("SELECT create_analyzed_hnr_id(%s, %s)",
                                        (norm_name, list(variants)))
                            result = cur.fetchone()[0], variants[0]  # type: ignore[no-untyped-call]
                            self._cache.housenumbers[norm_name] = result

        return result
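
    # The returned tuple is (word id, canonical housenumber); both parts are
    # None when the housenumber cannot be normalized to anything usable, in
    # which case _TokenInfo.add_housenumber() silently drops it.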


    def _compute_partial_tokens(self, name: str) -> List[int]:
        """ Normalize the given term, split it into partial words and return
            the token list for them.
        """
        assert self.conn is not None
        norm_name = self._search_normalized(name)

        tokens = []
        need_lookup = []
        for partial in norm_name.split():
            token = self._cache.partials.get(partial)
            if token:
                tokens.append(token)
            else:
                need_lookup.append(partial)

        if need_lookup:
            with self.conn.cursor() as cur:
                cur.execute("""SELECT word, getorcreate_partial_word(word)
                               FROM unnest(%s) word""",
                            (need_lookup, ))

                for partial, token in cur:
                    assert token is not None
                    tokens.append(token)
                    self._cache.partials[partial] = token

        return tokens


    def _retrieve_full_tokens(self, name: str) -> List[int]:
        """ Get the full name token for the given name, if it exists.
            The name is only retrieved for the standard analyser.
        """
        assert self.conn is not None
        norm_name = self._search_normalized(name)

        # return cached if possible
        if norm_name in self._cache.fulls:
            return self._cache.fulls[norm_name]

        with self.conn.cursor() as cur:
            cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
                        (norm_name, ))
            full = [row[0] for row in cur]

        self._cache.fulls[norm_name] = full

        return full


    def _compute_name_tokens(self, names: Sequence[PlaceName]) -> Tuple[Set[int], Set[int]]:
        """ Computes the full name and partial name tokens for the given
            list of names.
        """
        assert self.conn is not None
        full_tokens: Set[int] = set()
        partial_tokens: Set[int] = set()

        for name in names:
            analyzer_id = name.get_attr('analyzer')
            analyzer = self.token_analysis.get_analyzer(analyzer_id)
            norm_name = analyzer.normalize(name.name)
            if analyzer_id is None:
                token_id = norm_name
            else:
                token_id = f'{norm_name}@{analyzer_id}'

            full, part = self._cache.names.get(token_id, (None, None))
            if full is None:
                variants = analyzer.get_variants_ascii(norm_name)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
                                (token_id, variants))
                    full, part = cast(Tuple[int, List[int]],
                                      cur.fetchone())  # type: ignore[no-untyped-call]

                self._cache.names[token_id] = (full, part)

            assert part is not None

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens


    def _add_postcode(self, item: PlaceName) -> Optional[str]:
        """ Make sure the normalized postcode is present in the word table.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@postcode')

        if analyzer is None:
            postcode_name = item.name.strip().upper()
            variant_base = None
        else:
            postcode_name = analyzer.normalize(item.name)
            variant_base = item.get_attr("variant")

        if variant_base:
            postcode = f'{postcode_name}@{variant_base}'
        else:
            postcode = postcode_name

        if postcode not in self._cache.postcodes:
            term = self._search_normalized(postcode_name)
            if not term:
                return None

            variants = {term}
            if analyzer is not None and variant_base:
                variants.update(analyzer.get_variants_ascii(variant_base))

            with self.conn.cursor() as cur:
                cur.execute("SELECT create_postcode_word(%s, %s)",
                            (postcode, list(variants)))
            self._cache.postcodes.add(postcode)

        return postcode
716 """ Collect token information to be sent back to the database.
718 def __init__(self) -> None:
719 self.names: Optional[str] = None
720 self.housenumbers: Set[str] = set()
721 self.housenumber_tokens: Set[int] = set()
722 self.street_tokens: Set[int] = set()
723 self.place_tokens: Set[int] = set()
724 self.address_tokens: Dict[str, str] = {}
725 self.postcode: Optional[str] = None
728 def _mk_array(self, tokens: Iterable[Any]) -> str:
729 return f"{{{','.join((str(s) for s in tokens))}}}"
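
    # _mk_array([1, 2, 3]) yields '{1,2,3}', i.e. a PostgreSQL array literal
    # that can be handed to the database as part of the token_info structure.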


    def to_dict(self) -> Dict[str, Any]:
        """ Return the token information in database importable format.
        """
        out: Dict[str, Any] = {}

        if self.names:
            out['names'] = self.names

        if self.housenumbers:
            out['hnr'] = ';'.join(self.housenumbers)
            out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)

        if self.street_tokens:
            out['street'] = self._mk_array(self.street_tokens)

        if self.place_tokens:
            out['place'] = self._mk_array(self.place_tokens)

        if self.address_tokens:
            out['addr'] = self.address_tokens

        if self.postcode:
            out['postcode'] = self.postcode

        return out
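
    # Illustrative result (values made up): {'names': '{1,2,3}',
    # 'hnr': '12;12a', 'hnr_tokens': '{4,5}', 'street': '{6}',
    # 'addr': {'city': '{7,8}'}, 'postcode': '12345'}. A key only appears
    # when the corresponding information was found on the place.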


    def set_names(self, fulls: Iterable[int], partials: Iterable[int]) -> None:
        """ Adds token information for the normalised names.
        """
        self.names = self._mk_array(itertools.chain(fulls, partials))


    def add_housenumber(self, token: Optional[int], hnr: Optional[str]) -> None:
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        if token:
            assert hnr is not None
            self.housenumbers.add(hnr)
            self.housenumber_tokens.add(token)


    def add_street(self, tokens: Iterable[int]) -> None:
        """ Add addr:street match terms.
        """
        self.street_tokens.update(tokens)


    def add_place(self, tokens: Iterable[int]) -> None:
        """ Add addr:place search and match terms.
        """
        self.place_tokens.update(tokens)


    def add_address_term(self, key: str, partials: Iterable[int]) -> None:
        """ Add additional address terms.
        """
        if partials:
            self.address_tokens[key] = self._mk_array(partials)


    def set_postcode(self, postcode: Optional[str]) -> None:
        """ Set the postcode to the given one.
        """
        self.postcode = postcode
800 """ Cache for token information to avoid repeated database queries.
802 This cache is not thread-safe and needs to be instantiated per
805 def __init__(self) -> None:
806 self.names: Dict[str, Tuple[int, List[int]]] = {}
807 self.partials: Dict[str, int] = {}
808 self.fulls: Dict[str, List[int]] = {}
809 self.postcodes: Set[str] = set()
810 self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {}