nominatim/tokenizer/icu_tokenizer.py

   1 # SPDX-License-Identifier: GPL-2.0-only
   2 #
   3 # This file is part of Nominatim. (https://nominatim.org)
   4 #
   5 # Copyright (C) 2022 by the Nominatim developer community.
   6 # For a full list of authors see the git log.
   7 """
   8 Tokenizer implementing normalisation as used before Nominatim 4 but using
   9 libICU instead of the PostgreSQL module.
  10 """
  11 from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \
  12                    Dict, Set, Iterable
  13 import itertools
  14 import json
  15 import logging
  16 from pathlib import Path
  17 from textwrap import dedent
  18
  19 from nominatim.db.connection import connect, Connection, Cursor
  20 from nominatim.config import Configuration
  21 from nominatim.db.utils import CopyBuffer
  22 from nominatim.db.sql_preprocessor import SQLPreprocessor
  23 from nominatim.data.place_info import PlaceInfo
  24 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
  25 from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
  26 from nominatim.data.place_name import PlaceName
  27 from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
  28 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
  29
  30 DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
  31
  32 LOG = logging.getLogger()
  33
  34 WORD_TYPES =(('country_names', 'C'),
  35              ('postcodes', 'P'),
  36              ('full_word', 'W'),
  37              ('housenumbers', 'H'))
  38
  39 def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
  40     """ Create a new instance of the tokenizer provided by this module.
  41     """
  42     return ICUTokenizer(dsn, data_dir)
  43
  44
  45 class ICUTokenizer(AbstractTokenizer):
  46     """ This tokenizer uses libICU to convert names and queries to ASCII.
  47         Otherwise it uses the same algorithms and data structures as the
  48         normalization routines in Nominatim 3.
  49     """
  50
  51     def __init__(self, dsn: str, data_dir: Path) -> None:
  52         self.dsn = dsn
  53         self.data_dir = data_dir
  54         self.loader: Optional[ICURuleLoader] = None
  55
  56
  57     def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
  58         """ Set up a new tokenizer for the database.
  59
  60             This copies all necessary data in the project directory to make
  61             sure the tokenizer remains stable even over updates.
  62         """
  63         self.loader = ICURuleLoader(config)
  64
  65         self._install_php(config.lib_dir.php, overwrite=True)
  66         self._save_config()
  67
  68         if init_db:
  69             self.update_sql_functions(config)
  70             self._setup_db_tables(config, 'word')
  71             self._create_base_indices(config, 'word')
  72
  73
  74     def init_from_project(self, config: Configuration) -> None:
  75         """ Initialise the tokenizer from the project directory.
  76         """
  77         self.loader = ICURuleLoader(config)
  78
  79         with connect(self.dsn) as conn:
  80             self.loader.load_config_from_db(conn)
  81
  82         self._install_php(config.lib_dir.php, overwrite=False)
  83
  84
  85     def finalize_import(self, config: Configuration) -> None:
  86         """ Do any required postprocessing to make the tokenizer data ready
  87             for use.
  88         """
  89         self._create_lookup_indices(config, 'word')
  90
  91
  92     def update_sql_functions(self, config: Configuration) -> None:
  93         """ Reimport the SQL functions for this tokenizer.
  94         """
  95         with connect(self.dsn) as conn:
  96             sqlp = SQLPreprocessor(conn, config)
  97             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
  98
  99
 100     def check_database(self, config: Configuration) -> None:
 101         """ Check that the tokenizer is set up correctly.
 102         """
 103         # Will throw an error if there is an issue.
 104         self.init_from_project(config)
 105
 106
 107     def update_statistics(self, config: Configuration) -> None:
 108         """ Recompute frequencies for all name words.
 109         """
 110         with connect(self.dsn) as conn:
 111             if not conn.table_exists('search_name'):
 112                 return
 113
 114             with conn.cursor() as cur:
 115                 LOG.info('Computing word frequencies')
 116                 cur.drop_table('word_frequencies')
 117                 cur.execute("""CREATE TEMP TABLE word_frequencies AS
 118                                  SELECT unnest(name_vector) as id, count(*)
 119                                  FROM search_name GROUP BY id""")
 120                 cur.execute('CREATE INDEX ON word_frequencies(id)')
 121                 LOG.info('Update word table with recomputed frequencies')
 122                 cur.drop_table('tmp_word')
 123                 cur.execute("""CREATE TABLE tmp_word AS
 124                                 SELECT word_id, word_token, type, word,
 125                                        (CASE WHEN wf.count is null THEN info
 126                                           ELSE info || jsonb_build_object('count', wf.count)
 127                                         END) as info
 128                                 FROM word LEFT JOIN word_frequencies wf
 129                                   ON word.word_id = wf.id""")
 130                 cur.drop_table('word_frequencies')
 131             conn.commit()
 132         self._create_base_indices(config, 'tmp_word')
 133         self._create_lookup_indices(config, 'tmp_word')
 134         self._move_temporary_word_table('tmp_word')
 135
 136
 137
 138     def _cleanup_housenumbers(self) -> None:
 139         """ Remove unused house numbers.
 140         """
 141         with connect(self.dsn) as conn:
 142             if not conn.table_exists('search_name'):
 143                 return
 144             with conn.cursor(name="hnr_counter") as cur:
 145                 cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
 146                                FROM word
 147                                WHERE type = 'H'
 148                                  AND NOT EXISTS(SELECT * FROM search_name
 149                                                 WHERE ARRAY[word.word_id] && name_vector)
 150                                  AND (char_length(coalesce(word, word_token)) > 6
 151                                       OR coalesce(word, word_token) not similar to '\\d+')
 152                             """)
 153                 candidates = {token: wid for wid, token in cur}
 154             with conn.cursor(name="hnr_counter") as cur:
 155                 cur.execute("""SELECT housenumber FROM placex
 156                                WHERE housenumber is not null
 157                                      AND (char_length(housenumber) > 6
 158                                           OR housenumber not similar to '\\d+')
 159                             """)
 160                 for row in cur:
 161                     for hnr in row[0].split(';'):
 162                         candidates.pop(hnr, None)
 163             LOG.info("There are %s outdated housenumbers.", len(candidates))
 164             LOG.debug("Outdated housenumbers: %s", candidates.keys())
 165             if candidates:
 166                 with conn.cursor() as cur:
 167                     cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
 168                                 (list(candidates.values()), ))
 169                 conn.commit()
 170
 171
 172
 173     def update_word_tokens(self) -> None:
 174         """ Remove unused tokens.
 175         """
 176         LOG.warning("Cleaning up housenumber tokens.")
 177         self._cleanup_housenumbers()
 178         LOG.warning("Tokenizer house-keeping done.")
 179
 180
 181     def name_analyzer(self) -> 'ICUNameAnalyzer':
 182         """ Create a new analyzer for tokenizing names and queries
 183             using this tokinzer. Analyzers are context managers and should
 184             be used accordingly:
 185
 186             ```
 187             with tokenizer.name_analyzer() as analyzer:
 188                 analyser.tokenize()
 189             ```
 190
 191             When used outside the with construct, the caller must ensure to
 192             call the close() function before destructing the analyzer.
 193
 194             Analyzers are not thread-safe. You need to instantiate one per thread.
 195         """
 196         assert self.loader is not None
 197         return ICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
 198                                self.loader.make_token_analysis())
 199
 200
 201     def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
 202         """ Return a list of the `num` most frequent full words
 203             in the database.
 204         """
 205         with conn.cursor() as cur:
 206             cur.execute("""SELECT word, sum((info->>'count')::int) as count
 207                              FROM word WHERE type = 'W'
 208                              GROUP BY word
 209                              ORDER BY count DESC LIMIT %s""", (num,))
 210             return list(s[0].split('@')[0] for s in cur)
 211
 212
 213     def _install_php(self, phpdir: Path, overwrite: bool = True) -> None:
 214         """ Install the php script for the tokenizer.
 215         """
 216         assert self.loader is not None
 217         php_file = self.data_dir / "tokenizer.php"
 218
 219         if not php_file.exists() or overwrite:
 220             php_file.write_text(dedent(f"""\
 221                 <?php
 222                 @define('CONST_Max_Word_Frequency', 10000000);
 223                 @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
 224                 @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
 225                 require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')
 226
 227
 228     def _save_config(self) -> None:
 229         """ Save the configuration that needs to remain stable for the given
 230             database as database properties.
 231         """
 232         assert self.loader is not None
 233         with connect(self.dsn) as conn:
 234             self.loader.save_config_to_db(conn)
 235
 236
 237     def _setup_db_tables(self, config: Configuration, table_name: str) -> None:
 238         """ Set up the word table and fill it with pre-computed word
 239             frequencies.
 240         """
 241         with connect(self.dsn) as conn:
 242             with conn.cursor() as cur:
 243                 cur.drop_table(table_name)
 244             sqlp = SQLPreprocessor(conn, config)
 245             sqlp.run_string(conn, """
 246                 CREATE TABLE {{table_name}} (
 247                       word_id INTEGER,
 248                       word_token text NOT NULL,
 249                       type text NOT NULL,
 250                       word text,
 251                       info jsonb
 252                     ) {{db.tablespace.search_data}};
 253                 GRANT SELECT ON {{table_name}} TO "{{config.DATABASE_WEBUSER}}";
 254
 255                 DROP SEQUENCE IF EXISTS seq_{{table_name}};
 256                 CREATE SEQUENCE seq_{{table_name}} start 1;
 257                 GRANT SELECT ON seq_{{table_name}} to "{{config.DATABASE_WEBUSER}}";
 258             """, table_name=table_name)
 259
 260
 261     def _create_base_indices(self, config: Configuration, table_name: str) -> None:
 262         """ Set up the word table and fill it with pre-computed word
 263             frequencies.
 264         """
 265         with connect(self.dsn) as conn:
 266             sqlp = SQLPreprocessor(conn, config)
 267             sqlp.run_string(conn,
 268                             """CREATE INDEX idx_{{table_name}}_word_token ON {{table_name}}
 269                                USING BTREE (word_token) {{db.tablespace.search_index}}""",
 270                             table_name=table_name)
 271             for name, ctype in WORD_TYPES:
 272                 sqlp.run_string(conn,
 273                                 """CREATE INDEX idx_{{table_name}}_{{idx_name}} ON {{table_name}}
 274                                    USING BTREE (word) {{db.tablespace.address_index}}
 275                                    WHERE type = '{{column_type}}'
 276                                 """,
 277                                 table_name=table_name, idx_name=name,
 278                                 column_type=ctype)
 279
 280
 281     def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
 282         """ Create addtional indexes used when running the API.
 283         """
 284         with connect(self.dsn) as conn:
 285             sqlp = SQLPreprocessor(conn, config)
 286             # Index required for details lookup.
 287             sqlp.run_string(conn, """
 288                 CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
 289                   ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
 290             """,
 291             table_name=table_name)
 292
 293
 294     def _move_temporary_word_table(self, old: str) -> None:
 295         """ Rename all tables and indexes used by the tokenizer.
 296         """
 297         with connect(self.dsn) as conn:
 298             with conn.cursor() as cur:
 299                 cur.drop_table('word')
 300                 cur.execute(f"ALTER TABLE {old} RENAME TO word")
 301                 for idx in ('word_token', 'word_id'):
 302                     cur.execute(f"""ALTER INDEX idx_{old}_{idx}
 303                                       RENAME TO idx_word_{idx}""")
 304                 for name, _ in WORD_TYPES:
 305                     cur.execute(f"""ALTER INDEX idx_{old}_{name}
 306                                     RENAME TO idx_word_{name}""")
 307             conn.commit()
 308
 309
 310
 311
 312 class ICUNameAnalyzer(AbstractAnalyzer):
 313     """ The ICU analyzer uses the ICU library for splitting names.
 314
 315         Each instance opens a connection to the database to request the
 316         normalization.
 317     """
 318
 319     def __init__(self, dsn: str, sanitizer: PlaceSanitizer,
 320                  token_analysis: ICUTokenAnalysis) -> None:
 321         self.conn: Optional[Connection] = connect(dsn).connection
 322         self.conn.autocommit = True
 323         self.sanitizer = sanitizer
 324         self.token_analysis = token_analysis
 325
 326         self._cache = _TokenCache()
 327
 328
 329     def close(self) -> None:
 330         """ Free all resources used by the analyzer.
 331         """
 332         if self.conn:
 333             self.conn.close()
 334             self.conn = None
 335
 336
 337     def _search_normalized(self, name: str) -> str:
 338         """ Return the search token transliteration of the given name.
 339         """
 340         return cast(str, self.token_analysis.search.transliterate(name)).strip()
 341
 342
 343     def _normalized(self, name: str) -> str:
 344         """ Return the normalized version of the given name with all
 345             non-relevant information removed.
 346         """
 347         return cast(str, self.token_analysis.normalizer.transliterate(name)).strip()
 348
 349
 350     def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
 351         """ Return token information for the given list of words.
 352             If a word starts with # it is assumed to be a full name
 353             otherwise is a partial name.
 354
 355             The function returns a list of tuples with
 356             (original word, word token, word id).
 357
 358             The function is used for testing and debugging only
 359             and not necessarily efficient.
 360         """
 361         assert self.conn is not None
 362         full_tokens = {}
 363         partial_tokens = {}
 364         for word in words:
 365             if word.startswith('#'):
 366                 full_tokens[word] = self._search_normalized(word[1:])
 367             else:
 368                 partial_tokens[word] = self._search_normalized(word)
 369
 370         with self.conn.cursor() as cur:
 371             cur.execute("""SELECT word_token, word_id
 372                             FROM word WHERE word_token = ANY(%s) and type = 'W'
 373                         """, (list(full_tokens.values()),))
 374             full_ids = {r[0]: r[1] for r in cur}
 375             cur.execute("""SELECT word_token, word_id
 376                             FROM word WHERE word_token = ANY(%s) and type = 'w'""",
 377                         (list(partial_tokens.values()),))
 378             part_ids = {r[0]: r[1] for r in cur}
 379
 380         return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
 381                + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
 382
 383
 384     def normalize_postcode(self, postcode: str) -> str:
 385         """ Convert the postcode to a standardized form.
 386
 387             This function must yield exactly the same result as the SQL function
 388             'token_normalized_postcode()'.
 389         """
 390         return postcode.strip().upper()
 391
 392
 393     def update_postcodes_from_db(self) -> None:
 394         """ Update postcode tokens in the word table from the location_postcode
 395             table.
 396         """
 397         assert self.conn is not None
 398         analyzer = self.token_analysis.analysis.get('@postcode')
 399
 400         with self.conn.cursor() as cur:
 401             # First get all postcode names currently in the word table.
 402             cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
 403             word_entries = set((entry[0] for entry in cur))
 404
 405             # Then compute the required postcode names from the postcode table.
 406             needed_entries = set()
 407             cur.execute("SELECT country_code, postcode FROM location_postcode")
 408             for cc, postcode in cur:
 409                 info = PlaceInfo({'country_code': cc,
 410                                   'class': 'place', 'type': 'postcode',
 411                                   'address': {'postcode': postcode}})
 412                 address = self.sanitizer.process_names(info)[1]
 413                 for place in address:
 414                     if place.kind == 'postcode':
 415                         if analyzer is None:
 416                             postcode_name = place.name.strip().upper()
 417                             variant_base = None
 418                         else:
 419                             postcode_name = analyzer.get_canonical_id(place)
 420                             variant_base = place.get_attr("variant")
 421
 422                         if variant_base:
 423                             needed_entries.add(f'{postcode_name}@{variant_base}')
 424                         else:
 425                             needed_entries.add(postcode_name)
 426                         break
 427
 428         # Now update the word table.
 429         self._delete_unused_postcode_words(word_entries - needed_entries)
 430         self._add_missing_postcode_words(needed_entries - word_entries)
 431
 432     def _delete_unused_postcode_words(self, tokens: Iterable[str]) -> None:
 433         assert self.conn is not None
 434         if tokens:
 435             with self.conn.cursor() as cur:
 436                 cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
 437                             (list(tokens), ))
 438
 439     def _add_missing_postcode_words(self, tokens: Iterable[str]) -> None:
 440         assert self.conn is not None
 441         if not tokens:
 442             return
 443
 444         analyzer = self.token_analysis.analysis.get('@postcode')
 445         terms = []
 446
 447         for postcode_name in tokens:
 448             if '@' in postcode_name:
 449                 term, variant = postcode_name.split('@', 2)
 450                 term = self._search_normalized(term)
 451                 if analyzer is None:
 452                     variants = [term]
 453                 else:
 454                     variants = analyzer.compute_variants(variant)
 455                     if term not in variants:
 456                         variants.append(term)
 457             else:
 458                 variants = [self._search_normalized(postcode_name)]
 459             terms.append((postcode_name, variants))
 460
 461         if terms:
 462             with self.conn.cursor() as cur:
 463                 cur.execute_values("""SELECT create_postcode_word(pc, var)
 464                                       FROM (VALUES %s) AS v(pc, var)""",
 465                                    terms)
 466
 467
 468
 469
 470     def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
 471                                should_replace: bool) -> None:
 472         """ Replace the search index for special phrases with the new phrases.
 473             If `should_replace` is True, then the previous set of will be
 474             completely replaced. Otherwise the phrases are added to the
 475             already existing ones.
 476         """
 477         assert self.conn is not None
 478         norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
 479                             for p in phrases))
 480
 481         with self.conn.cursor() as cur:
 482             # Get the old phrases.
 483             existing_phrases = set()
 484             cur.execute("SELECT word, info FROM word WHERE type = 'S'")
 485             for word, info in cur:
 486                 existing_phrases.add((word, info['class'], info['type'],
 487                                       info.get('op') or '-'))
 488
 489             added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
 490             if should_replace:
 491                 deleted = self._remove_special_phrases(cur, norm_phrases,
 492                                                        existing_phrases)
 493             else:
 494                 deleted = 0
 495
 496         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
 497                  len(norm_phrases), added, deleted)
 498
 499
 500     def _add_special_phrases(self, cursor: Cursor,
 501                              new_phrases: Set[Tuple[str, str, str, str]],
 502                              existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
 503         """ Add all phrases to the database that are not yet there.
 504         """
 505         to_add = new_phrases - existing_phrases
 506
 507         added = 0
 508         with CopyBuffer() as copystr:
 509             for word, cls, typ, oper in to_add:
 510                 term = self._search_normalized(word)
 511                 if term:
 512                     copystr.add(term, 'S', word,
 513                                 json.dumps({'class': cls, 'type': typ,
 514                                             'op': oper if oper in ('in', 'near') else None}))
 515                     added += 1
 516
 517             copystr.copy_out(cursor, 'word',
 518                              columns=['word_token', 'type', 'word', 'info'])
 519
 520         return added
 521
 522
 523     def _remove_special_phrases(self, cursor: Cursor,
 524                              new_phrases: Set[Tuple[str, str, str, str]],
 525                              existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
 526         """ Remove all phrases from the database that are no longer in the
 527             new phrase list.
 528         """
 529         to_delete = existing_phrases - new_phrases
 530
 531         if to_delete:
 532             cursor.execute_values(
 533                 """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
 534                     WHERE type = 'S' and word = name
 535                           and info->>'class' = in_class and info->>'type' = in_type
 536                           and ((op = '-' and info->>'op' is null) or op = info->>'op')
 537                 """, to_delete)
 538
 539         return len(to_delete)
 540
 541
 542     def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
 543         """ Add default names for the given country to the search index.
 544         """
 545         # Make sure any name preprocessing for country names applies.
 546         info = PlaceInfo({'name': names, 'country_code': country_code,
 547                           'rank_address': 4, 'class': 'boundary',
 548                           'type': 'administrative'})
 549         self._add_country_full_names(country_code,
 550                                      self.sanitizer.process_names(info)[0],
 551                                      internal=True)
 552
 553
    def _add_country_full_names(self, country_code: str, names: Sequence[PlaceName],
                                internal: bool = False) -> None:
        """ Add names for the given country from an already sanitized
            name list.

            Country name entries in the word table are either marked as
            internal (their info field has a key 'internal') or are
            external. The parameter `internal` states which kind of names
            is being handed in.

            Entries of the same kind whose token is no longer in the
            name list are deleted. When adding internal names, external
            entries with a token from the name list are deleted as well,
            so that the token is saved as an internal name afterwards.
        """
        assert self.conn is not None
        # Search tokens of all names, empty tokens are dropped.
        word_tokens = set()
        for name in names:
            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            # Get existing names
            cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
                             FROM word
                             WHERE type = 'C' and word = %s""",
                        (country_code, ))
            # internal/external names
            existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()}
            for word in cur:
                existing_tokens[word[1]].add(word[0])

            # Delete names that no longer exist.
            gone_tokens = existing_tokens[internal] - word_tokens
            if internal:
                # External entries that are now handed in as internal names
                # are removed here and inserted again as internal below.
                gone_tokens.update(existing_tokens[False] & word_tokens)
            if gone_tokens:
                cur.execute("""DELETE FROM word
                               USING unnest(%s) as token
                               WHERE type = 'C' and word = %s
                                     and word_token = token""",
                            (list(gone_tokens), country_code))

            # Only add those names that are not yet in the list.
            new_tokens = word_tokens - existing_tokens[True]
            if not internal:
                # External names are also skipped when they already exist
                # as an external entry.
                new_tokens -= existing_tokens[False]
            if new_tokens:
                if internal:
                    sql = """INSERT INTO word (word_token, type, word, info)
                               (SELECT token, 'C', %s, '{"internal": "yes"}'
                                  FROM unnest(%s) as token)
                           """
                else:
                    sql = """INSERT INTO word (word_token, type, word)
                                   (SELECT token, 'C', %s
                                    FROM unnest(%s) as token)
                          """
                cur.execute(sql, (country_code, list(new_tokens)))
 604
 605
 606     def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
 607         """ Determine tokenizer information about the given place.
 608
 609             Returns a JSON-serializable structure that will be handed into
 610             the database via the token_info field.
 611         """
 612         token_info = _TokenInfo()
 613
 614         names, address = self.sanitizer.process_names(place)
 615
 616         if names:
 617             token_info.set_names(*self._compute_name_tokens(names))
 618
 619             if place.is_country():
 620                 assert place.country_code is not None
 621                 self._add_country_full_names(place.country_code, names)
 622
 623         if address:
 624             self._process_place_address(token_info, address)
 625
 626         return token_info.to_dict()
 627
 628
 629     def _process_place_address(self, token_info: '_TokenInfo',
 630                                address: Sequence[PlaceName]) -> None:
 631         for item in address:
 632             if item.kind == 'postcode':
 633                 token_info.set_postcode(self._add_postcode(item))
 634             elif item.kind == 'housenumber':
 635                 token_info.add_housenumber(*self._compute_housenumber_token(item))
 636             elif item.kind == 'street':
 637                 token_info.add_street(self._retrieve_full_tokens(item.name))
 638             elif item.kind == 'place':
 639                 if not item.suffix:
 640                     token_info.add_place(self._compute_partial_tokens(item.name))
 641             elif not item.kind.startswith('_') and not item.suffix and \
 642                  item.kind not in ('country', 'full', 'inclusion'):
 643                 token_info.add_address_term(item.kind, self._compute_partial_tokens(item.name))
 644
 645
    def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
        """ Normalize the housenumber and return the word token and the
            canonical form.

            The result is (None, None) when the housenumber normalizes
            to an empty string or, with a custom analyzer, when the
            analyzer yields no canonical id or no variants. Results
            from the database are remembered in the housenumber cache.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@housenumber')
        result: Tuple[Optional[int], Optional[str]] = (None, None)

        if analyzer is None:
            # When no custom analyzer is set, simply normalize and transliterate
            norm_name = self._search_normalized(hnr.name)
            if norm_name:
                # On a cache miss, result remains (None, None) and
                # the token is requested from the database.
                result = self._cache.housenumbers.get(norm_name, result)
                if result[0] is None:
                    with self.conn.cursor() as cur:
                        hid = cur.scalar("SELECT getorcreate_hnr_id(%s)", (norm_name, ))

                        result = hid, norm_name
                        self._cache.housenumbers[norm_name] = result
        else:
            # Otherwise use the analyzer to determine the canonical name.
            # Per convention we use the first variant as the 'lookup name', the
            # name that gets saved in the housenumber field of the place.
            word_id = analyzer.get_canonical_id(hnr)
            if word_id:
                # The cache is keyed by the canonical id in this branch.
                result = self._cache.housenumbers.get(word_id, result)
                if result[0] is None:
                    variants = analyzer.compute_variants(word_id)
                    if variants:
                        with self.conn.cursor() as cur:
                            hid = cur.scalar("SELECT create_analyzed_hnr_id(%s, %s)",
                                             (word_id, list(variants)))
                            result = hid, variants[0]
                            self._cache.housenumbers[word_id] = result

        return result
 682
 683
 684     def _compute_partial_tokens(self, name: str) -> List[int]:
 685         """ Normalize the given term, split it into partial words and return
 686             then token list for them.
 687         """
 688         assert self.conn is not None
 689         norm_name = self._search_normalized(name)
 690
 691         tokens = []
 692         need_lookup = []
 693         for partial in norm_name.split():
 694             token = self._cache.partials.get(partial)
 695             if token:
 696                 tokens.append(token)
 697             else:
 698                 need_lookup.append(partial)
 699
 700         if need_lookup:
 701             with self.conn.cursor() as cur:
 702                 cur.execute("""SELECT word, getorcreate_partial_word(word)
 703                                FROM unnest(%s) word""",
 704                             (need_lookup, ))
 705
 706                 for partial, token in cur:
 707                     assert token is not None
 708                     tokens.append(token)
 709                     self._cache.partials[partial] = token
 710
 711         return tokens
 712
 713
 714     def _retrieve_full_tokens(self, name: str) -> List[int]:
 715         """ Get the full name token for the given name, if it exists.
 716             The name is only retrieved for the standard analyser.
 717         """
 718         assert self.conn is not None
 719         norm_name = self._search_normalized(name)
 720
 721         # return cached if possible
 722         if norm_name in self._cache.fulls:
 723             return self._cache.fulls[norm_name]
 724
 725         with self.conn.cursor() as cur:
 726             cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
 727                         (norm_name, ))
 728             full = [row[0] for row in cur]
 729
 730         self._cache.fulls[norm_name] = full
 731
 732         return full
 733
 734
 735     def _compute_name_tokens(self, names: Sequence[PlaceName]) -> Tuple[Set[int], Set[int]]:
 736         """ Computes the full name and partial name tokens for the given
 737             dictionary of names.
 738         """
 739         assert self.conn is not None
 740         full_tokens: Set[int] = set()
 741         partial_tokens: Set[int] = set()
 742
 743         for name in names:
 744             analyzer_id = name.get_attr('analyzer')
 745             analyzer = self.token_analysis.get_analyzer(analyzer_id)
 746             word_id = analyzer.get_canonical_id(name)
 747             if analyzer_id is None:
 748                 token_id = word_id
 749             else:
 750                 token_id = f'{word_id}@{analyzer_id}'
 751
 752             full, part = self._cache.names.get(token_id, (None, None))
 753             if full is None:
 754                 variants = analyzer.compute_variants(word_id)
 755                 if not variants:
 756                     continue
 757
 758                 with self.conn.cursor() as cur:
 759                     cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
 760                                 (token_id, variants))
 761                     full, part = cast(Tuple[int, List[int]], cur.fetchone())
 762
 763                 self._cache.names[token_id] = (full, part)
 764
 765             assert part is not None
 766
 767             full_tokens.add(full)
 768             partial_tokens.update(part)
 769
 770         return full_tokens, partial_tokens
 771
 772
 773     def _add_postcode(self, item: PlaceName) -> Optional[str]:
 774         """ Make sure the normalized postcode is present in the word table.
 775         """
 776         assert self.conn is not None
 777         analyzer = self.token_analysis.analysis.get('@postcode')
 778
 779         if analyzer is None:
 780             postcode_name = item.name.strip().upper()
 781             variant_base = None
 782         else:
 783             postcode_name = analyzer.get_canonical_id(item)
 784             variant_base = item.get_attr("variant")
 785
 786         if variant_base:
 787             postcode = f'{postcode_name}@{variant_base}'
 788         else:
 789             postcode = postcode_name
 790
 791         if postcode not in self._cache.postcodes:
 792             term = self._search_normalized(postcode_name)
 793             if not term:
 794                 return None
 795
 796             variants = {term}
 797             if analyzer is not None and variant_base:
 798                 variants.update(analyzer.compute_variants(variant_base))
 799
 800             with self.conn.cursor() as cur:
 801                 cur.execute("SELECT create_postcode_word(%s, %s)",
 802                             (postcode, list(variants)))
 803             self._cache.postcodes.add(postcode)
 804
 805         return postcode_name
 806
 807
 808 class _TokenInfo:
 809     """ Collect token information to be sent back to the database.
 810     """
 811     def __init__(self) -> None:
 812         self.names: Optional[str] = None
 813         self.housenumbers: Set[str] = set()
 814         self.housenumber_tokens: Set[int] = set()
 815         self.street_tokens: Optional[Set[int]] = None
 816         self.place_tokens: Set[int] = set()
 817         self.address_tokens: Dict[str, str] = {}
 818         self.postcode: Optional[str] = None
 819
 820
 821     def _mk_array(self, tokens: Iterable[Any]) -> str:
 822         return f"{{{','.join((str(s) for s in tokens))}}}"
 823
 824
 825     def to_dict(self) -> Dict[str, Any]:
 826         """ Return the token information in database importable format.
 827         """
 828         out: Dict[str, Any] = {}
 829
 830         if self.names:
 831             out['names'] = self.names
 832
 833         if self.housenumbers:
 834             out['hnr'] = ';'.join(self.housenumbers)
 835             out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)
 836
 837         if self.street_tokens is not None:
 838             out['street'] = self._mk_array(self.street_tokens)
 839
 840         if self.place_tokens:
 841             out['place'] = self._mk_array(self.place_tokens)
 842
 843         if self.address_tokens:
 844             out['addr'] = self.address_tokens
 845
 846         if self.postcode:
 847             out['postcode'] = self.postcode
 848
 849         return out
 850
 851
 852     def set_names(self, fulls: Iterable[int], partials: Iterable[int]) -> None:
 853         """ Adds token information for the normalised names.
 854         """
 855         self.names = self._mk_array(itertools.chain(fulls, partials))
 856
 857
 858     def add_housenumber(self, token: Optional[int], hnr: Optional[str]) -> None:
 859         """ Extract housenumber information from a list of normalised
 860             housenumbers.
 861         """
 862         if token:
 863             assert hnr is not None
 864             self.housenumbers.add(hnr)
 865             self.housenumber_tokens.add(token)
 866
 867
 868     def add_street(self, tokens: Iterable[int]) -> None:
 869         """ Add addr:street match terms.
 870         """
 871         if self.street_tokens is None:
 872             self.street_tokens = set()
 873         self.street_tokens.update(tokens)
 874
 875
 876     def add_place(self, tokens: Iterable[int]) -> None:
 877         """ Add addr:place search and match terms.
 878         """
 879         self.place_tokens.update(tokens)
 880
 881
 882     def add_address_term(self, key: str, partials: Iterable[int]) -> None:
 883         """ Add additional address terms.
 884         """
 885         if partials:
 886             self.address_tokens[key] = self._mk_array(partials)
 887
 888     def set_postcode(self, postcode: Optional[str]) -> None:
 889         """ Set the postcode to the given one.
 890         """
 891         self.postcode = postcode
 892
 893
 894 class _TokenCache:
 895     """ Cache for token information to avoid repeated database queries.
 896
 897         This cache is not thread-safe and needs to be instantiated per
 898         analyzer.
 899     """
 900     def __init__(self) -> None:
 901         self.names: Dict[str, Tuple[int, List[int]]] = {}
 902         self.partials: Dict[str, int] = {}
 903         self.fulls: Dict[str, List[int]] = {}
 904         self.postcodes: Set[str] = set()
 905         self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {}