1 # SPDX-License-Identifier: GPL-3.0-or-later
2 #
3 # This file is part of Nominatim. (https://nominatim.org)
4 #
5 # Copyright (C) 2024 by the Nominatim developer community.
6 # For a full list of authors see the git log.
7 """
8 Tokenizer implementing normalisation as used before Nominatim 4 but using
9 libICU instead of the PostgreSQL module.
10 """
11 from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \
12                    Dict, Set, Iterable
13 import itertools
14 import logging
15 from pathlib import Path
16
17 from psycopg.types.json import Jsonb
18 from psycopg import sql as pysql
19
20 from ..db.connection import connect, Connection, Cursor, server_version_tuple, \
21                             drop_tables, table_exists, execute_scalar
22 from ..config import Configuration
23 from ..db.sql_preprocessor import SQLPreprocessor
24 from ..data.place_info import PlaceInfo
25 from ..data.place_name import PlaceName
26 from .icu_rule_loader import ICURuleLoader
27 from .place_sanitizer import PlaceSanitizer
28 from .icu_token_analysis import ICUTokenAnalysis
29 from .base import AbstractAnalyzer, AbstractTokenizer
30
31 DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
32
33 LOG = logging.getLogger()
34
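# Word types that get their own partial index on the word table, mapped to the
# single-letter code stored in the table's 'type' column. Other codes used in
# this module are 'w' for partial words and 'S' for special phrases.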
35 WORD_TYPES = (('country_names', 'C'),
36               ('postcodes', 'P'),
37               ('full_word', 'W'),
38               ('housenumbers', 'H'))
39
40
41 def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
42     """ Create a new instance of the tokenizer provided by this module.
43     """
44     return ICUTokenizer(dsn, data_dir)
45
46
47 class ICUTokenizer(AbstractTokenizer):
48     """ This tokenizer uses libICU to convert names and queries to ASCII.
49         Otherwise it uses the same algorithms and data structures as the
50         normalization routines in Nominatim 3.
51     """
52
53     def __init__(self, dsn: str, data_dir: Path) -> None:
54         self.dsn = dsn
55         self.data_dir = data_dir
56         self.loader: Optional[ICURuleLoader] = None
57
58     def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
59         """ Set up a new tokenizer for the database.
60
61             This copies all necessary data into the project directory to make
62             sure the tokenizer remains stable even over updates.
63         """
64         self.loader = ICURuleLoader(config)
65
66         self._save_config()
67
68         if init_db:
69             self.update_sql_functions(config)
70             self._setup_db_tables(config)
71             self._create_base_indices(config, 'word')
72
73     def init_from_project(self, config: Configuration) -> None:
74         """ Initialise the tokenizer from the project directory.
75         """
76         self.loader = ICURuleLoader(config)
77
78         with connect(self.dsn) as conn:
79             self.loader.load_config_from_db(conn)
80
81     def finalize_import(self, config: Configuration) -> None:
82         """ Do any required postprocessing to make the tokenizer data ready
83             for use.
84         """
85         self._create_lookup_indices(config, 'word')
86
87     def update_sql_functions(self, config: Configuration) -> None:
88         """ Reimport the SQL functions for this tokenizer.
89         """
90         with connect(self.dsn) as conn:
91             sqlp = SQLPreprocessor(conn, config)
92             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
93
94     def check_database(self, config: Configuration) -> None:
95         """ Check that the tokenizer is set up correctly.
96         """
97         # Will raise an error if there is an issue.
98         self.init_from_project(config)
99
100     def update_statistics(self, config: Configuration, threads: int = 2) -> None:
101         """ Recompute frequencies for all name words.
102         """
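        # The new frequencies are written into a full copy of the word table
        # ('tmp_word'), which is indexed and then swapped in for the old table.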
103         with connect(self.dsn) as conn:
104             if not table_exists(conn, 'search_name'):
105                 return
106
107             with conn.cursor() as cur:
108                 cur.execute('ANALYSE search_name')
109                 if threads > 1:
110                     cur.execute(pysql.SQL('SET max_parallel_workers_per_gather TO {}')
111                                      .format(pysql.Literal(min(threads, 6),)))
112
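                # The MATERIALIZED keyword for CTEs is only available from
                # PostgreSQL 12, so older servers use temporary tables and a
                # plpgsql helper that merges the counts into the info column.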
113                 if server_version_tuple(conn) < (12, 0):
114                     LOG.info('Computing word frequencies')
115                     drop_tables(conn, 'word_frequencies', 'addressword_frequencies')
116                     cur.execute("""CREATE TEMP TABLE word_frequencies AS
117                                      SELECT unnest(name_vector) as id, count(*)
118                                      FROM search_name GROUP BY id""")
119                     cur.execute('CREATE INDEX ON word_frequencies(id)')
120                     cur.execute("""CREATE TEMP TABLE addressword_frequencies AS
121                                      SELECT unnest(nameaddress_vector) as id, count(*)
122                                      FROM search_name GROUP BY id""")
123                     cur.execute('CREATE INDEX ON addressword_frequencies(id)')
124                     cur.execute("""
125                         CREATE OR REPLACE FUNCTION word_freq_update(wid INTEGER,
126                                                                     INOUT info JSONB)
127                         AS $$
128                         DECLARE rec RECORD;
129                         BEGIN
130                         IF info is null THEN
131                           info = '{}'::jsonb;
132                         END IF;
133                         FOR rec IN SELECT count FROM word_frequencies WHERE id = wid
134                         LOOP
135                           info = info || jsonb_build_object('count', rec.count);
136                         END LOOP;
137                         FOR rec IN SELECT count FROM addressword_frequencies WHERE id = wid
138                         LOOP
139                           info = info || jsonb_build_object('addr_count', rec.count);
140                         END LOOP;
141                         IF info = '{}'::jsonb THEN
142                           info = null;
143                         END IF;
144                         END;
145                         $$ LANGUAGE plpgsql IMMUTABLE;
146                         """)
147                     LOG.info('Update word table with recomputed frequencies')
148                     drop_tables(conn, 'tmp_word')
149                     cur.execute("""CREATE TABLE tmp_word AS
150                                     SELECT word_id, word_token, type, word,
151                                            word_freq_update(word_id, info) as info
152                                     FROM word
153                                 """)
154                     drop_tables(conn, 'word_frequencies', 'addressword_frequencies')
155                 else:
156                     LOG.info('Computing word frequencies')
157                     drop_tables(conn, 'word_frequencies')
158                     cur.execute("""
159                       CREATE TEMP TABLE word_frequencies AS
160                       WITH word_freq AS MATERIALIZED (
161                                SELECT unnest(name_vector) as id, count(*)
162                                      FROM search_name GROUP BY id),
163                            addr_freq AS MATERIALIZED (
164                                SELECT unnest(nameaddress_vector) as id, count(*)
165                                      FROM search_name GROUP BY id)
166                       SELECT coalesce(a.id, w.id) as id,
167                              (CASE WHEN w.count is null THEN '{}'::JSONB
168                                   ELSE jsonb_build_object('count', w.count) END
169                               ||
170                               CASE WHEN a.count is null THEN '{}'::JSONB
171                                   ELSE jsonb_build_object('addr_count', a.count) END) as info
172                       FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id;
173                       """)
174                     cur.execute('CREATE UNIQUE INDEX ON word_frequencies(id) INCLUDE(info)')
175                     cur.execute('ANALYSE word_frequencies')
176                     LOG.info('Update word table with recomputed frequencies')
177                     drop_tables(conn, 'tmp_word')
178                     cur.execute("""CREATE TABLE tmp_word AS
179                                     SELECT word_id, word_token, type, word,
180                                            (CASE WHEN wf.info is null THEN word.info
181                                             ELSE coalesce(word.info, '{}'::jsonb) || wf.info
182                                             END) as info
183                                     FROM word LEFT JOIN word_frequencies wf
184                                          ON word.word_id = wf.id
185                                     ORDER BY word_id
186                                 """)
187                     drop_tables(conn, 'word_frequencies')
188
189             with conn.cursor() as cur:
190                 cur.execute('SET max_parallel_workers_per_gather TO 0')
191
192             sqlp = SQLPreprocessor(conn, config)
193             sqlp.run_string(conn,
194                             'GRANT SELECT ON tmp_word TO "{{config.DATABASE_WEBUSER}}"')
195             conn.commit()
196         self._create_base_indices(config, 'tmp_word')
197         self._create_lookup_indices(config, 'tmp_word')
198         self._move_temporary_word_table('tmp_word')
199
200     def _cleanup_housenumbers(self) -> None:
201         """ Remove unused house numbers.
202         """
203         with connect(self.dsn) as conn:
204             if not table_exists(conn, 'search_name'):
205                 return
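            # Collect housenumber tokens that are no longer referenced from
            # search_name. Purely numeric housenumbers with at most six
            # characters are never removed.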
206             with conn.cursor(name="hnr_counter") as cur:
207                 cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
208                                FROM word
209                                WHERE type = 'H'
210                                  AND NOT EXISTS(SELECT * FROM search_name
211                                                 WHERE ARRAY[word.word_id] && name_vector)
212                                  AND (char_length(coalesce(word, word_token)) > 6
213                                       OR coalesce(word, word_token) not similar to '\\d+')
214                             """)
215                 candidates = {token: wid for wid, token in cur}
216             with conn.cursor(name="hnr_counter") as cur:
217                 cur.execute("""SELECT housenumber FROM placex
218                                WHERE housenumber is not null
219                                      AND (char_length(housenumber) > 6
220                                           OR housenumber not similar to '\\d+')
221                             """)
222                 for row in cur:
223                     for hnr in row[0].split(';'):
224                         candidates.pop(hnr, None)
225             LOG.info("There are %s outdated housenumbers.", len(candidates))
226             LOG.debug("Outdated housenumbers: %s", candidates.keys())
227             if candidates:
228                 with conn.cursor() as cur:
229                     cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
230                                 (list(candidates.values()), ))
231                 conn.commit()
232
233     def update_word_tokens(self) -> None:
234         """ Remove unused tokens.
235         """
236         LOG.warning("Cleaning up housenumber tokens.")
237         self._cleanup_housenumbers()
238         LOG.warning("Tokenizer house-keeping done.")
239
240     def name_analyzer(self) -> 'ICUNameAnalyzer':
241         """ Create a new analyzer for tokenizing names and queries
242             using this tokenizer. Analyzers are context managers and should
243             be used accordingly:
244
245             ```
246             with tokenizer.name_analyzer() as analyzer:
247                 analyzer.tokenize()
248             ```
249
250             When used outside the with construct, the caller must make sure to
251             call the close() function before the analyzer is discarded.
252
253             Analyzers are not thread-safe. You need to instantiate one per thread.
254         """
255         assert self.loader is not None
256         return ICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
257                                self.loader.make_token_analysis())
258
259     def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
260         """ Return a list of the `num` most frequent full words
261             in the database.
262         """
263         with conn.cursor() as cur:
264             cur.execute("""SELECT word, sum((info->>'count')::int) as count
265                              FROM word WHERE type = 'W'
266                              GROUP BY word
267                              ORDER BY count DESC LIMIT %s""", (num,))
268             return list(s[0].split('@')[0] for s in cur)
269
270     def _save_config(self) -> None:
271         """ Save the configuration that needs to remain stable for the given
272             database as database properties.
273         """
274         assert self.loader is not None
275         with connect(self.dsn) as conn:
276             self.loader.save_config_to_db(conn)
277
278     def _setup_db_tables(self, config: Configuration) -> None:
279         """ Set up the word table and create the sequence used for
280             new word ids.
281         """
282         with connect(self.dsn) as conn:
283             drop_tables(conn, 'word')
284             sqlp = SQLPreprocessor(conn, config)
285             sqlp.run_string(conn, """
286                 CREATE TABLE word (
287                       word_id INTEGER,
288                       word_token text NOT NULL,
289                       type text NOT NULL,
290                       word text,
291                       info jsonb
292                     ) {{db.tablespace.search_data}};
293                 GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
294
295                 DROP SEQUENCE IF EXISTS seq_word;
296                 CREATE SEQUENCE seq_word start 1;
297                 GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
298             """)
299             conn.commit()
300
301     def _create_base_indices(self, config: Configuration, table_name: str) -> None:
302         """ Create the word-token index and the per-type word indexes
303             on the given table.
304         """
305         with connect(self.dsn) as conn:
306             sqlp = SQLPreprocessor(conn, config)
307             sqlp.run_string(conn,
308                             """CREATE INDEX idx_{{table_name}}_word_token ON {{table_name}}
309                                USING BTREE (word_token) {{db.tablespace.search_index}}""",
310                             table_name=table_name)
311             for name, ctype in WORD_TYPES:
312                 sqlp.run_string(conn,
313                                 """CREATE INDEX idx_{{table_name}}_{{idx_name}} ON {{table_name}}
314                                    USING BTREE (word) {{db.tablespace.address_index}}
315                                    WHERE type = '{{column_type}}'
316                                 """,
317                                 table_name=table_name, idx_name=name,
318                                 column_type=ctype)
319             conn.commit()
320
321     def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
322         """ Create additional indexes used when running the API.
323         """
324         with connect(self.dsn) as conn:
325             sqlp = SQLPreprocessor(conn, config)
326             # Index required for details lookup.
327             sqlp.run_string(
328                 conn,
329                 """
330                 CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
331                   ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
332                 """,
333                 table_name=table_name)
334             conn.commit()
335
336     def _move_temporary_word_table(self, old: str) -> None:
337         """ Replace the word table with the given table and rename its indexes.
338         """
339         with connect(self.dsn) as conn:
340             drop_tables(conn, 'word')
341             with conn.cursor() as cur:
342                 cur.execute(f"ALTER TABLE {old} RENAME TO word")
343                 for idx in ('word_token', 'word_id'):
344                     cur.execute(f"""ALTER INDEX idx_{old}_{idx}
345                                       RENAME TO idx_word_{idx}""")
346                 for name, _ in WORD_TYPES:
347                     cur.execute(f"""ALTER INDEX idx_{old}_{name}
348                                     RENAME TO idx_word_{name}""")
349             conn.commit()
350
351
352 class ICUNameAnalyzer(AbstractAnalyzer):
353     """ The ICU analyzer uses the ICU library for splitting names.
354
355         Each instance opens a connection to the database to request the
356         normalization.
357     """
358
359     def __init__(self, dsn: str, sanitizer: PlaceSanitizer,
360                  token_analysis: ICUTokenAnalysis) -> None:
361         self.conn: Optional[Connection] = connect(dsn)
362         self.conn.autocommit = True
363         self.sanitizer = sanitizer
364         self.token_analysis = token_analysis
365
366         self._cache = _TokenCache()
367
368     def close(self) -> None:
369         """ Free all resources used by the analyzer.
370         """
371         if self.conn:
372             self.conn.close()
373             self.conn = None
374
375     def _search_normalized(self, name: str) -> str:
376         """ Return the search token transliteration of the given name.
377         """
378         return cast(str, self.token_analysis.search.transliterate(name)).strip()
379
380     def _normalized(self, name: str) -> str:
381         """ Return the normalized version of the given name with all
382             non-relevant information removed.
383         """
384         return cast(str, self.token_analysis.normalizer.transliterate(name)).strip()
385
386     def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
387         """ Return token information for the given list of words.
388             If a word starts with #, it is assumed to be a full name,
389             otherwise it is taken to be a partial name.
390
391             The function returns a list of tuples with
392             (original word, word token, word id).
393
394             The function is used for testing and debugging only
395             and is not necessarily efficient.
396         """
397         assert self.conn is not None
398         full_tokens = {}
399         partial_tokens = {}
400         for word in words:
401             if word.startswith('#'):
402                 full_tokens[word] = self._search_normalized(word[1:])
403             else:
404                 partial_tokens[word] = self._search_normalized(word)
405
406         with self.conn.cursor() as cur:
407             cur.execute("""SELECT word_token, word_id
408                             FROM word WHERE word_token = ANY(%s) and type = 'W'
409                         """, (list(full_tokens.values()),))
410             full_ids = {r[0]: r[1] for r in cur}
411             cur.execute("""SELECT word_token, word_id
412                             FROM word WHERE word_token = ANY(%s) and type = 'w'""",
413                         (list(partial_tokens.values()),))
414             part_ids = {r[0]: r[1] for r in cur}
415
416         return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
417             + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
418
419     def normalize_postcode(self, postcode: str) -> str:
420         """ Convert the postcode to a standardized form.
421
422             This function must yield exactly the same result as the SQL function
423             'token_normalized_postcode()'.
424         """
425         return postcode.strip().upper()
426
427     def update_postcodes_from_db(self) -> None:
428         """ Update postcode tokens in the word table from the location_postcode
429             table.
430         """
431         assert self.conn is not None
432         analyzer = self.token_analysis.analysis.get('@postcode')
433
434         with self.conn.cursor() as cur:
435             # First get all postcode names currently in the word table.
436             cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
437             word_entries = set((entry[0] for entry in cur))
438
439             # Then compute the required postcode names from the postcode table.
440             needed_entries = set()
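            # Postcode entries with a variant are stored as
            # '<canonical>@<variant>' so they can be compared against the
            # 'word' column entries collected above.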
441             cur.execute("SELECT country_code, postcode FROM location_postcode")
442             for cc, postcode in cur:
443                 info = PlaceInfo({'country_code': cc,
444                                   'class': 'place', 'type': 'postcode',
445                                   'address': {'postcode': postcode}})
446                 address = self.sanitizer.process_names(info)[1]
447                 for place in address:
448                     if place.kind == 'postcode':
449                         if analyzer is None:
450                             postcode_name = place.name.strip().upper()
451                             variant_base = None
452                         else:
453                             postcode_name = analyzer.get_canonical_id(place)
454                             variant_base = place.get_attr("variant")
455
456                         if variant_base:
457                             needed_entries.add(f'{postcode_name}@{variant_base}')
458                         else:
459                             needed_entries.add(postcode_name)
460                         break
461
462         # Now update the word table.
463         self._delete_unused_postcode_words(word_entries - needed_entries)
464         self._add_missing_postcode_words(needed_entries - word_entries)
465
466     def _delete_unused_postcode_words(self, tokens: Iterable[str]) -> None:
467         assert self.conn is not None
468         if tokens:
469             with self.conn.cursor() as cur:
470                 cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
471                             (list(tokens), ))
472
473     def _add_missing_postcode_words(self, tokens: Iterable[str]) -> None:
474         assert self.conn is not None
475         if not tokens:
476             return
477
478         analyzer = self.token_analysis.analysis.get('@postcode')
479         terms = []
480
481         for postcode_name in tokens:
482             if '@' in postcode_name:
483                 term, variant = postcode_name.split('@', 1)
484                 term = self._search_normalized(term)
485                 if analyzer is None:
486                     variants = [term]
487                 else:
488                     variants = analyzer.compute_variants(variant)
489                     if term not in variants:
490                         variants.append(term)
491             else:
492                 variants = [self._search_normalized(postcode_name)]
493             terms.append((postcode_name, variants))
494
495         if terms:
496             with self.conn.cursor() as cur:
497                 cur.executemany("""SELECT create_postcode_word(%s, %s)""", terms)
498
499     def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
500                                should_replace: bool) -> None:
501         """ Replace the search index for special phrases with the new phrases.
502             If `should_replace` is True, then the previous set of phrases
503             will be completely replaced. Otherwise the phrases are added to
504             the already existing ones.
505         """
506         assert self.conn is not None
507         norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
508                             for p in phrases))
509
510         with self.conn.cursor() as cur:
511             # Get the old phrases.
512             existing_phrases = set()
513             cur.execute("SELECT word, info FROM word WHERE type = 'S'")
514             for word, info in cur:
515                 existing_phrases.add((word, info['class'], info['type'],
516                                       info.get('op') or '-'))
517
518             added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
519             if should_replace:
520                 deleted = self._remove_special_phrases(cur, norm_phrases,
521                                                        existing_phrases)
522             else:
523                 deleted = 0
524
525         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
526                  len(norm_phrases), added, deleted)
527
528     def _add_special_phrases(self, cursor: Cursor,
529                              new_phrases: Set[Tuple[str, str, str, str]],
530                              existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
531         """ Add all phrases to the database that are not yet there.
532         """
533         to_add = new_phrases - existing_phrases
534
535         added = 0
536         with cursor.copy('COPY word(word_token, type, word, info) FROM STDIN') as copy:
537             for word, cls, typ, oper in to_add:
538                 term = self._search_normalized(word)
539                 if term:
540                     copy.write_row((term, 'S', word,
541                                     Jsonb({'class': cls, 'type': typ,
542                                            'op': oper if oper in ('in', 'near') else None})))
543                     added += 1
544
545         return added
546
547     def _remove_special_phrases(self, cursor: Cursor,
548                                 new_phrases: Set[Tuple[str, str, str, str]],
549                                 existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
550         """ Remove all phrases from the database that are no longer in the
551             new phrase list.
552         """
553         to_delete = existing_phrases - new_phrases
554
555         if to_delete:
556             cursor.executemany(
557                 """ DELETE FROM word
558                       WHERE type = 'S' and word = %s
559                             and info->>'class' = %s and info->>'type' = %s
560                             and %s = coalesce(info->>'op', '-')
561                 """, to_delete)
562
563         return len(to_delete)
564
565     def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
566         """ Add default names for the given country to the search index.
567         """
568         # Make sure any name preprocessing for country names applies.
569         info = PlaceInfo({'name': names, 'country_code': country_code,
570                           'rank_address': 4, 'class': 'boundary',
571                           'type': 'administrative'})
572         self._add_country_full_names(country_code,
573                                      self.sanitizer.process_names(info)[0],
574                                      internal=True)
575
576     def _add_country_full_names(self, country_code: str, names: Sequence[PlaceName],
577                                 internal: bool = False) -> None:
578         """ Add names for the given country from an already sanitized
579             name list.
580         """
581         assert self.conn is not None
582         word_tokens = set()
583         for name in names:
584             norm_name = self._search_normalized(name.name)
585             if norm_name:
586                 word_tokens.add(norm_name)
587
588         with self.conn.cursor() as cur:
589             # Get existing names
590             cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
591                              FROM word
592                              WHERE type = 'C' and word = %s""",
593                         (country_code, ))
594             # internal/external names
595             existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()}
596             for word in cur:
597                 existing_tokens[word[1]].add(word[0])
598
599             # Delete names that no longer exist.
600             gone_tokens = existing_tokens[internal] - word_tokens
601             if internal:
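                # When adding internal names, also drop identical names that
                # were previously added as external so they can be re-inserted
                # with the internal flag below.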
602                 gone_tokens.update(existing_tokens[False] & word_tokens)
603             if gone_tokens:
604                 cur.execute("""DELETE FROM word
605                                USING unnest(%s::text[]) as token
606                                WHERE type = 'C' and word = %s
607                                      and word_token = token""",
608                             (list(gone_tokens), country_code))
609
610             # Only add those names that are not yet in the list.
611             new_tokens = word_tokens - existing_tokens[True]
612             if not internal:
613                 new_tokens -= existing_tokens[False]
614             if new_tokens:
615                 if internal:
616                     sql = """INSERT INTO word (word_token, type, word, info)
617                                (SELECT token, 'C', %s, '{"internal": "yes"}'
618                                   FROM unnest(%s::text[]) as token)
619                            """
620                 else:
621                     sql = """INSERT INTO word (word_token, type, word)
622                                    (SELECT token, 'C', %s
623                                     FROM unnest(%s::text[]) as token)
624                           """
625                 cur.execute(sql, (country_code, list(new_tokens)))
626
627     def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
628         """ Determine tokenizer information about the given place.
629
630             Returns a JSON-serializable structure that will be handed into
631             the database via the token_info field.
632         """
633         token_info = _TokenInfo()
634
635         names, address = self.sanitizer.process_names(place)
636
637         if names:
638             token_info.set_names(*self._compute_name_tokens(names))
639
640             if place.is_country():
641                 assert place.country_code is not None
642                 self._add_country_full_names(place.country_code, names)
643
644         if address:
645             self._process_place_address(token_info, address)
646
647         return token_info.to_dict()
648
649     def _process_place_address(self, token_info: '_TokenInfo',
650                                address: Sequence[PlaceName]) -> None:
651         for item in address:
652             if item.kind == 'postcode':
653                 token_info.set_postcode(self._add_postcode(item))
654             elif item.kind == 'housenumber':
655                 token_info.add_housenumber(*self._compute_housenumber_token(item))
656             elif item.kind == 'street':
657                 token_info.add_street(self._retrieve_full_tokens(item.name))
658             elif item.kind == 'place':
659                 if not item.suffix:
660                     token_info.add_place(itertools.chain(*self._compute_name_tokens([item])))
661             elif (not item.kind.startswith('_') and not item.suffix and
662                   item.kind not in ('country', 'full', 'inclusion')):
663                 token_info.add_address_term(item.kind,
664                                             itertools.chain(*self._compute_name_tokens([item])))
665
666     def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
667         """ Normalize the housenumber and return the word token and the
668             canonical form.
669         """
670         assert self.conn is not None
671         analyzer = self.token_analysis.analysis.get('@housenumber')
672         result: Tuple[Optional[int], Optional[str]] = (None, None)
673
674         if analyzer is None:
675             # When no custom analyzer is set, simply normalize and transliterate
676             norm_name = self._search_normalized(hnr.name)
677             if norm_name:
678                 result = self._cache.housenumbers.get(norm_name, result)
679                 if result[0] is None:
680                     hid = execute_scalar(self.conn, "SELECT getorcreate_hnr_id(%s)", (norm_name, ))
681
682                     result = hid, norm_name
683                     self._cache.housenumbers[norm_name] = result
684         else:
685             # Otherwise use the analyzer to determine the canonical name.
686             # Per convention we use the first variant as the 'lookup name', the
687             # name that gets saved in the housenumber field of the place.
688             word_id = analyzer.get_canonical_id(hnr)
689             if word_id:
690                 result = self._cache.housenumbers.get(word_id, result)
691                 if result[0] is None:
692                     variants = analyzer.compute_variants(word_id)
693                     if variants:
694                         hid = execute_scalar(self.conn, "SELECT create_analyzed_hnr_id(%s, %s)",
695                                              (word_id, list(variants)))
696                         result = hid, variants[0]
697                         self._cache.housenumbers[word_id] = result
698
699         return result
700
701     def _retrieve_full_tokens(self, name: str) -> List[int]:
702         """ Get the full name token for the given name, if it exists.
703             The name is only retrieved for the standard analyzer.
704         """
705         assert self.conn is not None
706         norm_name = self._search_normalized(name)
707
708         # return cached if possible
709         if norm_name in self._cache.fulls:
710             return self._cache.fulls[norm_name]
711
712         with self.conn.cursor() as cur:
713             cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
714                         (norm_name, ))
715             full = [row[0] for row in cur]
716
717         self._cache.fulls[norm_name] = full
718
719         return full
720
721     def _compute_name_tokens(self, names: Sequence[PlaceName]) -> Tuple[Set[int], Set[int]]:
722         """ Computes the full name and partial name tokens for the given
723             list of names.
724         """
725         assert self.conn is not None
726         full_tokens: Set[int] = set()
727         partial_tokens: Set[int] = set()
728
729         for name in names:
730             analyzer_id = name.get_attr('analyzer')
731             analyzer = self.token_analysis.get_analyzer(analyzer_id)
732             word_id = analyzer.get_canonical_id(name)
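            # Tokens from a non-default analyzer are keyed with an
            # '@<analyzer>' suffix so they cannot clash with the default
            # analysis of the same name.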
733             if analyzer_id is None:
734                 token_id = word_id
735             else:
736                 token_id = f'{word_id}@{analyzer_id}'
737
738             full, part = self._cache.names.get(token_id, (None, None))
739             if full is None:
740                 variants = analyzer.compute_variants(word_id)
741                 if not variants:
742                     continue
743
744                 with self.conn.cursor() as cur:
745                     cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
746                                 (token_id, variants))
747                     full, part = cast(Tuple[int, List[int]], cur.fetchone())
748
749                 self._cache.names[token_id] = (full, part)
750
751             assert part is not None
752
753             full_tokens.add(full)
754             partial_tokens.update(part)
755
756         return full_tokens, partial_tokens
757
758     def _add_postcode(self, item: PlaceName) -> Optional[str]:
759         """ Make sure the normalized postcode is present in the word table.
760         """
761         assert self.conn is not None
762         analyzer = self.token_analysis.analysis.get('@postcode')
763
764         if analyzer is None:
765             postcode_name = item.name.strip().upper()
766             variant_base = None
767         else:
768             postcode_name = analyzer.get_canonical_id(item)
769             variant_base = item.get_attr("variant")
770
771         if variant_base:
772             postcode = f'{postcode_name}@{variant_base}'
773         else:
774             postcode = postcode_name
775
776         if postcode not in self._cache.postcodes:
777             term = self._search_normalized(postcode_name)
778             if not term:
779                 return None
780
781             variants = {term}
782             if analyzer is not None and variant_base:
783                 variants.update(analyzer.compute_variants(variant_base))
784
785             with self.conn.cursor() as cur:
786                 cur.execute("SELECT create_postcode_word(%s, %s)",
787                             (postcode, list(variants)))
788             self._cache.postcodes.add(postcode)
789
790         return postcode_name
791
792
793 class _TokenInfo:
794     """ Collect token information to be sent back to the database.
795     """
796     def __init__(self) -> None:
797         self.names: Optional[str] = None
798         self.housenumbers: Set[str] = set()
799         self.housenumber_tokens: Set[int] = set()
800         self.street_tokens: Optional[Set[int]] = None
801         self.place_tokens: Set[int] = set()
802         self.address_tokens: Dict[str, str] = {}
803         self.postcode: Optional[str] = None
804
805     def _mk_array(self, tokens: Iterable[Any]) -> str:
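        # Format the tokens as a PostgreSQL array literal, e.g. '{1,2,3}'.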
806         return f"{{{','.join((str(s) for s in tokens))}}}"
807
808     def to_dict(self) -> Dict[str, Any]:
809         """ Return the token information in a database-importable format.
810         """
811         out: Dict[str, Any] = {}
812
813         if self.names:
814             out['names'] = self.names
815
816         if self.housenumbers:
817             out['hnr'] = ';'.join(self.housenumbers)
818             out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)
819
820         if self.street_tokens is not None:
821             out['street'] = self._mk_array(self.street_tokens)
822
823         if self.place_tokens:
824             out['place'] = self._mk_array(self.place_tokens)
825
826         if self.address_tokens:
827             out['addr'] = self.address_tokens
828
829         if self.postcode:
830             out['postcode'] = self.postcode
831
832         return out
833
834     def set_names(self, fulls: Iterable[int], partials: Iterable[int]) -> None:
835         """ Adds token information for the normalised names.
836         """
837         self.names = self._mk_array(itertools.chain(fulls, partials))
838
839     def add_housenumber(self, token: Optional[int], hnr: Optional[str]) -> None:
840         """ Add the token and the normalised form of a single
841             housenumber.
842         """
843         if token:
844             assert hnr is not None
845             self.housenumbers.add(hnr)
846             self.housenumber_tokens.add(token)
847
848     def add_street(self, tokens: Iterable[int]) -> None:
849         """ Add addr:street match terms.
850         """
851         if self.street_tokens is None:
852             self.street_tokens = set()
853         self.street_tokens.update(tokens)
854
855     def add_place(self, tokens: Iterable[int]) -> None:
856         """ Add addr:place search and match terms.
857         """
858         self.place_tokens.update(tokens)
859
860     def add_address_term(self, key: str, partials: Iterable[int]) -> None:
861         """ Add additional address terms.
862         """
863         array = self._mk_array(partials)
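        # An empty token list renders as '{}' (two characters), so this keeps
        # only non-empty arrays.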
864         if len(array) > 2:
865             self.address_tokens[key] = array
866
867     def set_postcode(self, postcode: Optional[str]) -> None:
868         """ Set the postcode to the given one.
869         """
870         self.postcode = postcode
871
872
873 class _TokenCache:
874     """ Cache for token information to avoid repeated database queries.
875
876         This cache is not thread-safe and needs to be instantiated per
877         analyzer.
878     """
879     def __init__(self) -> None:
880         self.names: Dict[str, Tuple[int, List[int]]] = {}
881         self.partials: Dict[str, int] = {}
882         self.fulls: Dict[str, List[int]] = {}
883         self.postcodes: Set[str] = set()
884         self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {}