1 # SPDX-License-Identifier: GPL-3.0-or-later
2 #
3 # This file is part of Nominatim. (https://nominatim.org)
4 #
5 # Copyright (C) 2024 by the Nominatim developer community.
6 # For a full list of authors see the git log.
7 """
8 Tokenizer implementing normalisation as used before Nominatim 4 but using
9 libICU instead of the PostgreSQL module.
10 """
11 from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \
12                    Dict, Set, Iterable
13 import itertools
14 import logging
15 from pathlib import Path
16
17 from psycopg.types.json import Jsonb
18 from psycopg import sql as pysql
19
20 from ..db.connection import connect, Connection, Cursor, server_version_tuple, \
21                             drop_tables, table_exists, execute_scalar
22 from ..config import Configuration
23 from ..db.sql_preprocessor import SQLPreprocessor
24 from ..data.place_info import PlaceInfo
25 from ..data.place_name import PlaceName
26 from .icu_rule_loader import ICURuleLoader
27 from .place_sanitizer import PlaceSanitizer
28 from .icu_token_analysis import ICUTokenAnalysis
29 from .base import AbstractAnalyzer, AbstractTokenizer
30
31 DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
32
33 LOG = logging.getLogger()
34
35 WORD_TYPES = (('country_names', 'C'),
36               ('postcodes', 'P'),
37               ('full_word', 'W'),
38               ('housenumbers', 'H'))
39
40
41 def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
42     """ Create a new instance of the tokenizer provided by this module.
43     """
44     return ICUTokenizer(dsn, data_dir)
45
46
47 class ICUTokenizer(AbstractTokenizer):
48     """ This tokenizer uses libICU to convert names and queries to ASCII.
49         Otherwise it uses the same algorithms and data structures as the
50         normalization routines in Nominatim 3.
51     """
52
53     def __init__(self, dsn: str, data_dir: Path) -> None:
54         self.dsn = dsn
55         self.data_dir = data_dir
56         self.loader: Optional[ICURuleLoader] = None
57
58     def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
59         """ Set up a new tokenizer for the database.
60
61             This copies all necessary data into the project directory to make
62             sure the tokenizer remains stable even across updates.
63         """
64         self.loader = ICURuleLoader(config)
65
66         self._save_config()
67
68         if init_db:
69             self.update_sql_functions(config)
70             self._setup_db_tables(config)
71             self._create_base_indices(config, 'word')
72
73     def init_from_project(self, config: Configuration) -> None:
74         """ Initialise the tokenizer from the project directory.
75         """
76         self.loader = ICURuleLoader(config)
77
78         with connect(self.dsn) as conn:
79             self.loader.load_config_from_db(conn)
80
81     def finalize_import(self, config: Configuration) -> None:
82         """ Do any required postprocessing to make the tokenizer data ready
83             for use.
84         """
85         self._create_lookup_indices(config, 'word')
86
87     def update_sql_functions(self, config: Configuration) -> None:
88         """ Reimport the SQL functions for this tokenizer.
89         """
90         with connect(self.dsn) as conn:
91             sqlp = SQLPreprocessor(conn, config)
92             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
93
94     def check_database(self, config: Configuration) -> None:
95         """ Check that the tokenizer is set up correctly.
96         """
97         # Will throw an error if there is an issue.
98         self.init_from_project(config)
99
100     def update_statistics(self, config: Configuration, threads: int = 2) -> None:
101         """ Recompute frequencies for all name words.
102         """
103         with connect(self.dsn) as conn:
104             if not table_exists(conn, 'search_name'):
105                 return
106
107             with conn.cursor() as cur:
108                 cur.execute('ANALYSE search_name')
109                 if threads > 1:
110                     cur.execute(pysql.SQL('SET max_parallel_workers_per_gather TO {}')
111                                      .format(pysql.Literal(min(threads, 6),)))
112
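                # PostgreSQL 12 introduced the MATERIALIZED keyword for CTEs, which the
                # combined query below relies on. Older servers therefore use a fallback
                # based on temporary tables and a plpgsql helper function.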
113                 if server_version_tuple(conn) < (12, 0):
114                     LOG.info('Computing word frequencies')
115                     drop_tables(conn, 'word_frequencies', 'addressword_frequencies')
116                     cur.execute("""CREATE TEMP TABLE word_frequencies AS
117                                      SELECT unnest(name_vector) as id, count(*)
118                                      FROM search_name GROUP BY id""")
119                     cur.execute('CREATE INDEX ON word_frequencies(id)')
120                     cur.execute("""CREATE TEMP TABLE addressword_frequencies AS
121                                      SELECT unnest(nameaddress_vector) as id, count(*)
122                                      FROM search_name GROUP BY id""")
123                     cur.execute('CREATE INDEX ON addressword_frequencies(id)')
124                     cur.execute("""
125                         CREATE OR REPLACE FUNCTION word_freq_update(wid INTEGER,
126                                                                     INOUT info JSONB)
127                         AS $$
128                         DECLARE rec RECORD;
129                         BEGIN
130                         IF info is null THEN
131                           info = '{}'::jsonb;
132                         END IF;
133                         FOR rec IN SELECT count FROM word_frequencies WHERE id = wid
134                         LOOP
135                           info = info || jsonb_build_object('count', rec.count);
136                         END LOOP;
137                         FOR rec IN SELECT count FROM addressword_frequencies WHERE id = wid
138                         LOOP
139                           info = info || jsonb_build_object('addr_count', rec.count);
140                         END LOOP;
141                         IF info = '{}'::jsonb THEN
142                           info = null;
143                         END IF;
144                         END;
145                         $$ LANGUAGE plpgsql IMMUTABLE;
146                         """)
147                     LOG.info('Update word table with recomputed frequencies')
148                     drop_tables(conn, 'tmp_word')
149                     cur.execute("""CREATE TABLE tmp_word AS
150                                     SELECT word_id, word_token, type, word,
151                                            word_freq_update(word_id, info) as info
152                                     FROM word
153                                 """)
154                     drop_tables(conn, 'word_frequencies', 'addressword_frequencies')
155                 else:
156                     LOG.info('Computing word frequencies')
157                     drop_tables(conn, 'word_frequencies')
158                     cur.execute("""
159                       CREATE TEMP TABLE word_frequencies AS
160                       WITH word_freq AS MATERIALIZED (
161                                SELECT unnest(name_vector) as id, count(*)
162                                      FROM search_name GROUP BY id),
163                            addr_freq AS MATERIALIZED (
164                                SELECT unnest(nameaddress_vector) as id, count(*)
165                                      FROM search_name GROUP BY id)
166                       SELECT coalesce(a.id, w.id) as id,
167                              (CASE WHEN w.count is null THEN '{}'::JSONB
168                                   ELSE jsonb_build_object('count', w.count) END
169                               ||
170                               CASE WHEN a.count is null THEN '{}'::JSONB
171                                   ELSE jsonb_build_object('addr_count', a.count) END) as info
172                       FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id;
173                       """)
174                     cur.execute('CREATE UNIQUE INDEX ON word_frequencies(id) INCLUDE(info)')
175                     cur.execute('ANALYSE word_frequencies')
176                     LOG.info('Update word table with recomputed frequencies')
177                     drop_tables(conn, 'tmp_word')
178                     cur.execute("""CREATE TABLE tmp_word AS
179                                     SELECT word_id, word_token, type, word,
180                                            (CASE WHEN wf.info is null THEN word.info
181                                             ELSE coalesce(word.info, '{}'::jsonb) || wf.info
182                                             END) as info
183                                     FROM word LEFT JOIN word_frequencies wf
184                                          ON word.word_id = wf.id
185                                 """)
186                     drop_tables(conn, 'word_frequencies')
187
188             with conn.cursor() as cur:
189                 cur.execute('SET max_parallel_workers_per_gather TO 0')
190
191             sqlp = SQLPreprocessor(conn, config)
192             sqlp.run_string(conn,
193                             'GRANT SELECT ON tmp_word TO "{{config.DATABASE_WEBUSER}}"')
194             conn.commit()
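        # Recreate the indices on the freshly built table before swapping it in
        # for the old word table.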
195         self._create_base_indices(config, 'tmp_word')
196         self._create_lookup_indices(config, 'tmp_word')
197         self._move_temporary_word_table('tmp_word')
198
199     def _cleanup_housenumbers(self) -> None:
200         """ Remove unused house numbers.
201         """
202         with connect(self.dsn) as conn:
203             if not table_exists(conn, 'search_name'):
204                 return
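            # Collect housenumber tokens that are no longer referenced from any
            # search_name entry. Short, purely numeric housenumbers are never deleted.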
205             with conn.cursor(name="hnr_counter") as cur:
206                 cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
207                                FROM word
208                                WHERE type = 'H'
209                                  AND NOT EXISTS(SELECT * FROM search_name
210                                                 WHERE ARRAY[word.word_id] && name_vector)
211                                  AND (char_length(coalesce(word, word_token)) > 6
212                                       OR coalesce(word, word_token) not similar to '\\d+')
213                             """)
214                 candidates = {token: wid for wid, token in cur}
215             with conn.cursor(name="hnr_counter") as cur:
216                 cur.execute("""SELECT housenumber FROM placex
217                                WHERE housenumber is not null
218                                      AND (char_length(housenumber) > 6
219                                           OR housenumber not similar to '\\d+')
220                             """)
221                 for row in cur:
222                     for hnr in row[0].split(';'):
223                         candidates.pop(hnr, None)
224             LOG.info("There are %s outdated housenumbers.", len(candidates))
225             LOG.debug("Outdated housenumbers: %s", candidates.keys())
226             if candidates:
227                 with conn.cursor() as cur:
228                     cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
229                                 (list(candidates.values()), ))
230                 conn.commit()
231
232     def update_word_tokens(self) -> None:
233         """ Remove unused tokens.
234         """
235         LOG.warning("Cleaning up housenumber tokens.")
236         self._cleanup_housenumbers()
237         LOG.warning("Tokenizer house-keeping done.")
238
239     def name_analyzer(self) -> 'ICUNameAnalyzer':
240         """ Create a new analyzer for tokenizing names and queries
241             using this tokenizer. Analyzers are context managers and should
242             be used accordingly:
243
244             ```
245             with tokenizer.name_analyzer() as analyzer:
246                 analyzer.tokenize()
247             ```
248
249             When used outside a with construct, the caller must make sure to
250             call the close() function before the analyzer is destroyed.
251
252             Analyzers are not thread-safe. You need to instantiate one per thread.
253         """
254         assert self.loader is not None
255         return ICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
256                                self.loader.make_token_analysis())
257
258     def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
259         """ Return a list of the `num` most frequent full words
260             in the database.
261         """
262         with conn.cursor() as cur:
263             cur.execute("""SELECT word, sum((info->>'count')::int) as count
264                              FROM word WHERE type = 'W'
265                              GROUP BY word
266                              ORDER BY count DESC LIMIT %s""", (num,))
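            # Full words may be stored with an '@<analyzer>' suffix; only the base
            # word before the '@' is returned.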
267             return list(s[0].split('@')[0] for s in cur)
268
269     def _save_config(self) -> None:
270         """ Save the configuration that needs to remain stable for the given
271             database as database properties.
272         """
273         assert self.loader is not None
274         with connect(self.dsn) as conn:
275             self.loader.save_config_to_db(conn)
276
277     def _setup_db_tables(self, config: Configuration) -> None:
278         """ Set up an empty word table and the sequence used for
279             generating word ids.
280         """
281         with connect(self.dsn) as conn:
282             drop_tables(conn, 'word')
283             sqlp = SQLPreprocessor(conn, config)
284             sqlp.run_string(conn, """
285                 CREATE TABLE word (
286                       word_id INTEGER,
287                       word_token text NOT NULL,
288                       type text NOT NULL,
289                       word text,
290                       info jsonb
291                     ) {{db.tablespace.search_data}};
292                 GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
293
294                 DROP SEQUENCE IF EXISTS seq_word;
295                 CREATE SEQUENCE seq_word start 1;
296                 GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
297             """)
298             conn.commit()
299
300     def _create_base_indices(self, config: Configuration, table_name: str) -> None:
301         """ Create the base indices on the given word table: the index on
302             word_token and one partial index on word per word type.
303         """
304         with connect(self.dsn) as conn:
305             sqlp = SQLPreprocessor(conn, config)
306             sqlp.run_string(conn,
307                             """CREATE INDEX idx_{{table_name}}_word_token ON {{table_name}}
308                                USING BTREE (word_token) {{db.tablespace.search_index}}""",
309                             table_name=table_name)
310             for name, ctype in WORD_TYPES:
311                 sqlp.run_string(conn,
312                                 """CREATE INDEX idx_{{table_name}}_{{idx_name}} ON {{table_name}}
313                                    USING BTREE (word) {{db.tablespace.address_index}}
314                                    WHERE type = '{{column_type}}'
315                                 """,
316                                 table_name=table_name, idx_name=name,
317                                 column_type=ctype)
318             conn.commit()
319
320     def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
321         """ Create additional indexes used when running the API.
322         """
323         with connect(self.dsn) as conn:
324             sqlp = SQLPreprocessor(conn, config)
325             # Index required for details lookup.
326             sqlp.run_string(
327                 conn,
328                 """
329                 CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
330                   ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
331                 """,
332                 table_name=table_name)
333             conn.commit()
334
335     def _move_temporary_word_table(self, old: str) -> None:
336         """ Rename all tables and indexes used by the tokenizer.
337         """
338         with connect(self.dsn) as conn:
339             drop_tables(conn, 'word')
340             with conn.cursor() as cur:
341                 cur.execute(f"ALTER TABLE {old} RENAME TO word")
342                 for idx in ('word_token', 'word_id'):
343                     cur.execute(f"""ALTER INDEX idx_{old}_{idx}
344                                       RENAME TO idx_word_{idx}""")
345                 for name, _ in WORD_TYPES:
346                     cur.execute(f"""ALTER INDEX idx_{old}_{name}
347                                     RENAME TO idx_word_{name}""")
348             conn.commit()
349
350
351 class ICUNameAnalyzer(AbstractAnalyzer):
352     """ The ICU analyzer uses the ICU library for splitting names.
353
354         Each instance opens a connection to the database to request the
355         normalization.
356     """
357
358     def __init__(self, dsn: str, sanitizer: PlaceSanitizer,
359                  token_analysis: ICUTokenAnalysis) -> None:
360         self.conn: Optional[Connection] = connect(dsn)
361         self.conn.autocommit = True
362         self.sanitizer = sanitizer
363         self.token_analysis = token_analysis
364
365         self._cache = _TokenCache()
366
367     def close(self) -> None:
368         """ Free all resources used by the analyzer.
369         """
370         if self.conn:
371             self.conn.close()
372             self.conn = None
373
374     def _search_normalized(self, name: str) -> str:
375         """ Return the search token transliteration of the given name.
376         """
377         return cast(str, self.token_analysis.search.transliterate(name)).strip()
378
379     def _normalized(self, name: str) -> str:
380         """ Return the normalized version of the given name with all
381             non-relevant information removed.
382         """
383         return cast(str, self.token_analysis.normalizer.transliterate(name)).strip()
384
385     def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
386         """ Return token information for the given list of words.
387             If a word starts with '#', it is assumed to be a full name,
388             otherwise it is treated as a partial name.
389
390             The function returns a list of tuples with
391             (original word, word token, word id).
392
393             The function is intended for testing and debugging only
394             and is not necessarily efficient.
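
            A rough usage sketch (the token forms and ids shown here are purely
            illustrative and depend on the configured rules and database content):

                analyzer.get_word_token_info(['#Main Street', 'main'])
                # -> [('#Main Street', 'main street', 1234), ('main', 'main', 5678)]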
395         """
396         assert self.conn is not None
397         full_tokens = {}
398         partial_tokens = {}
399         for word in words:
400             if word.startswith('#'):
401                 full_tokens[word] = self._search_normalized(word[1:])
402             else:
403                 partial_tokens[word] = self._search_normalized(word)
404
405         with self.conn.cursor() as cur:
406             cur.execute("""SELECT word_token, word_id
407                             FROM word WHERE word_token = ANY(%s) and type = 'W'
408                         """, (list(full_tokens.values()),))
409             full_ids = {r[0]: r[1] for r in cur}
410             cur.execute("""SELECT word_token, word_id
411                             FROM word WHERE word_token = ANY(%s) and type = 'w'""",
412                         (list(partial_tokens.values()),))
413             part_ids = {r[0]: r[1] for r in cur}
414
415         return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
416             + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
417
418     def normalize_postcode(self, postcode: str) -> str:
419         """ Convert the postcode to a standardized form.
420
421             This function must yield exactly the same result as the SQL function
422             'token_normalized_postcode()'.
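
            For example, ' se10 8xy ' becomes 'SE10 8XY' (whitespace is stripped
            and letters are uppercased).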
423         """
424         return postcode.strip().upper()
425
426     def update_postcodes_from_db(self) -> None:
427         """ Update postcode tokens in the word table from the location_postcode
428             table.
429         """
430         assert self.conn is not None
431         analyzer = self.token_analysis.analysis.get('@postcode')
432
433         with self.conn.cursor() as cur:
434             # First get all postcode names currently in the word table.
435             cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
436             word_entries = set((entry[0] for entry in cur))
437
438             # Then compute the required postcode names from the postcode table.
439             needed_entries = set()
440             cur.execute("SELECT country_code, postcode FROM location_postcode")
441             for cc, postcode in cur:
442                 info = PlaceInfo({'country_code': cc,
443                                   'class': 'place', 'type': 'postcode',
444                                   'address': {'postcode': postcode}})
445                 address = self.sanitizer.process_names(info)[1]
446                 for place in address:
447                     if place.kind == 'postcode':
448                         if analyzer is None:
449                             postcode_name = place.name.strip().upper()
450                             variant_base = None
451                         else:
452                             postcode_name = analyzer.get_canonical_id(place)
453                             variant_base = place.get_attr("variant")
454
455                         if variant_base:
456                             needed_entries.add(f'{postcode_name}@{variant_base}')
457                         else:
458                             needed_entries.add(postcode_name)
459                         break
460
461         # Now update the word table.
462         self._delete_unused_postcode_words(word_entries - needed_entries)
463         self._add_missing_postcode_words(needed_entries - word_entries)
464
465     def _delete_unused_postcode_words(self, tokens: Iterable[str]) -> None:
466         assert self.conn is not None
467         if tokens:
468             with self.conn.cursor() as cur:
469                 cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
470                             (list(tokens), ))
471
472     def _add_missing_postcode_words(self, tokens: Iterable[str]) -> None:
473         assert self.conn is not None
474         if not tokens:
475             return
476
477         analyzer = self.token_analysis.analysis.get('@postcode')
478         terms = []
479
480         for postcode_name in tokens:
481             if '@' in postcode_name:
482                 term, variant = postcode_name.split('@', 2)
483                 term = self._search_normalized(term)
484                 if analyzer is None:
485                     variants = [term]
486                 else:
487                     variants = analyzer.compute_variants(variant)
488                     if term not in variants:
489                         variants.append(term)
490             else:
491                 variants = [self._search_normalized(postcode_name)]
492             terms.append((postcode_name, variants))
493
494         if terms:
495             with self.conn.cursor() as cur:
496                 cur.executemany("""SELECT create_postcode_word(%s, %s)""", terms)
497
498     def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
499                                should_replace: bool) -> None:
500         """ Replace the search index for special phrases with the new phrases.
501             If `should_replace` is True, then the previous set of phrases will be
502             completely replaced. Otherwise the phrases are added to the
503             already existing ones.
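
            Each phrase is a tuple of (label, class, type, operator), where the
            operator is 'in', 'near' or '-' for "no operator", e.g.
            ('swimming pool', 'leisure', 'swimming_pool', '-').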
504         """
505         assert self.conn is not None
506         norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
507                             for p in phrases))
508
509         with self.conn.cursor() as cur:
510             # Get the old phrases.
511             existing_phrases = set()
512             cur.execute("SELECT word, info FROM word WHERE type = 'S'")
513             for word, info in cur:
514                 existing_phrases.add((word, info['class'], info['type'],
515                                       info.get('op') or '-'))
516
517             added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
518             if should_replace:
519                 deleted = self._remove_special_phrases(cur, norm_phrases,
520                                                        existing_phrases)
521             else:
522                 deleted = 0
523
524         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
525                  len(norm_phrases), added, deleted)
526
527     def _add_special_phrases(self, cursor: Cursor,
528                              new_phrases: Set[Tuple[str, str, str, str]],
529                              existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
530         """ Add all phrases to the database that are not yet there.
531         """
532         to_add = new_phrases - existing_phrases
533
534         added = 0
535         with cursor.copy('COPY word(word_token, type, word, info) FROM STDIN') as copy:
536             for word, cls, typ, oper in to_add:
537                 term = self._search_normalized(word)
538                 if term:
539                     copy.write_row((term, 'S', word,
540                                     Jsonb({'class': cls, 'type': typ,
541                                            'op': oper if oper in ('in', 'near') else None})))
542                     added += 1
543
544         return added
545
546     def _remove_special_phrases(self, cursor: Cursor,
547                                 new_phrases: Set[Tuple[str, str, str, str]],
548                                 existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
549         """ Remove all phrases from the database that are no longer in the
550             new phrase list.
551         """
552         to_delete = existing_phrases - new_phrases
553
554         if to_delete:
555             cursor.executemany(
556                 """ DELETE FROM word
557                       WHERE type = 'S' and word = %s
558                             and info->>'class' = %s and info->>'type' = %s
559                             and %s = coalesce(info->>'op', '-')
560                 """, to_delete)
561
562         return len(to_delete)
563
564     def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
565         """ Add default names for the given country to the search index.
566         """
567         # Make sure any name preprocessing for country names applies.
568         info = PlaceInfo({'name': names, 'country_code': country_code,
569                           'rank_address': 4, 'class': 'boundary',
570                           'type': 'administrative'})
571         self._add_country_full_names(country_code,
572                                      self.sanitizer.process_names(info)[0],
573                                      internal=True)
574
575     def _add_country_full_names(self, country_code: str, names: Sequence[PlaceName],
576                                 internal: bool = False) -> None:
577         """ Add names for the given country from an already sanitized
578             name list.
579         """
580         assert self.conn is not None
581         word_tokens = set()
582         for name in names:
583             norm_name = self._search_normalized(name.name)
584             if norm_name:
585                 word_tokens.add(norm_name)
586
587         with self.conn.cursor() as cur:
588             # Get existing names
589             cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
590                              FROM word
591                              WHERE type = 'C' and word = %s""",
592                         (country_code, ))
593             # internal/external names
594             existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()}
595             for word in cur:
596                 existing_tokens[word[1]].add(word[0])
597
598             # Delete names that no longer exist.
599             gone_tokens = existing_tokens[internal] - word_tokens
600             if internal:
601                 gone_tokens.update(existing_tokens[False] & word_tokens)
602             if gone_tokens:
603                 cur.execute("""DELETE FROM word
604                                USING unnest(%s::text[]) as token
605                                WHERE type = 'C' and word = %s
606                                      and word_token = token""",
607                             (list(gone_tokens), country_code))
608
609             # Only add those names that are not yet in the list.
610             new_tokens = word_tokens - existing_tokens[True]
611             if not internal:
612                 new_tokens -= existing_tokens[False]
613             if new_tokens:
614                 if internal:
615                     sql = """INSERT INTO word (word_token, type, word, info)
616                                (SELECT token, 'C', %s, '{"internal": "yes"}'
617                                   FROM unnest(%s::text[]) as token)
618                            """
619                 else:
620                     sql = """INSERT INTO word (word_token, type, word)
621                                    (SELECT token, 'C', %s
622                                     FROM unnest(%s::text[]) as token)
623                           """
624                 cur.execute(sql, (country_code, list(new_tokens)))
625
626     def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
627         """ Determine tokenizer information about the given place.
628
629             Returns a JSON-serializable structure that will be handed into
630             the database via the token_info field.
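
            A rough sketch of what the returned structure may look like (keys are
            only present when the corresponding information was found):

                {'names': '{615,7,8409}', 'hnr': '12;12a', 'hnr_tokens': '{45}',
                 'street': '{67}', 'addr': {'city': '{89,90}'}, 'postcode': '99521'}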
631         """
632         token_info = _TokenInfo()
633
634         names, address = self.sanitizer.process_names(place)
635
636         if names:
637             token_info.set_names(*self._compute_name_tokens(names))
638
639             if place.is_country():
640                 assert place.country_code is not None
641                 self._add_country_full_names(place.country_code, names)
642
643         if address:
644             self._process_place_address(token_info, address)
645
646         return token_info.to_dict()
647
648     def _process_place_address(self, token_info: '_TokenInfo',
649                                address: Sequence[PlaceName]) -> None:
650         for item in address:
651             if item.kind == 'postcode':
652                 token_info.set_postcode(self._add_postcode(item))
653             elif item.kind == 'housenumber':
654                 token_info.add_housenumber(*self._compute_housenumber_token(item))
655             elif item.kind == 'street':
656                 token_info.add_street(self._retrieve_full_tokens(item.name))
657             elif item.kind == 'place':
658                 if not item.suffix:
659                     token_info.add_place(itertools.chain(*self._compute_name_tokens([item])))
660             elif (not item.kind.startswith('_') and not item.suffix and
661                   item.kind not in ('country', 'full', 'inclusion')):
662                 token_info.add_address_term(item.kind,
663                                             itertools.chain(*self._compute_name_tokens([item])))
664
665     def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
666         """ Normalize the housenumber and return the word token and the
667             canonical form.
668         """
669         assert self.conn is not None
670         analyzer = self.token_analysis.analysis.get('@housenumber')
671         result: Tuple[Optional[int], Optional[str]] = (None, None)
672
673         if analyzer is None:
674             # When no custom analyzer is set, simply normalize and transliterate
675             norm_name = self._search_normalized(hnr.name)
676             if norm_name:
677                 result = self._cache.housenumbers.get(norm_name, result)
678                 if result[0] is None:
679                     hid = execute_scalar(self.conn, "SELECT getorcreate_hnr_id(%s)", (norm_name, ))
680
681                     result = hid, norm_name
682                     self._cache.housenumbers[norm_name] = result
683         else:
684             # Otherwise use the analyzer to determine the canonical name.
685             # Per convention we use the first variant as the 'lookup name', the
686             # name that gets saved in the housenumber field of the place.
687             word_id = analyzer.get_canonical_id(hnr)
688             if word_id:
689                 result = self._cache.housenumbers.get(word_id, result)
690                 if result[0] is None:
691                     variants = analyzer.compute_variants(word_id)
692                     if variants:
693                         hid = execute_scalar(self.conn, "SELECT create_analyzed_hnr_id(%s, %s)",
694                                              (word_id, list(variants)))
695                         result = hid, variants[0]
696                         self._cache.housenumbers[word_id] = result
697
698         return result
699
700     def _retrieve_full_tokens(self, name: str) -> List[int]:
701         """ Get the full name token for the given name, if it exists.
702             The name is only retrieved for the standard analyser.
703         """
704         assert self.conn is not None
705         norm_name = self._search_normalized(name)
706
707         # return cached if possible
708         if norm_name in self._cache.fulls:
709             return self._cache.fulls[norm_name]
710
711         with self.conn.cursor() as cur:
712             cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
713                         (norm_name, ))
714             full = [row[0] for row in cur]
715
716         self._cache.fulls[norm_name] = full
717
718         return full
719
720     def _compute_name_tokens(self, names: Sequence[PlaceName]) -> Tuple[Set[int], Set[int]]:
721         """ Compute the full name and partial name tokens for the given
722             list of names.
723         """
724         assert self.conn is not None
725         full_tokens: Set[int] = set()
726         partial_tokens: Set[int] = set()
727
728         for name in names:
729             analyzer_id = name.get_attr('analyzer')
730             analyzer = self.token_analysis.get_analyzer(analyzer_id)
731             word_id = analyzer.get_canonical_id(name)
732             if analyzer_id is None:
733                 token_id = word_id
734             else:
735                 token_id = f'{word_id}@{analyzer_id}'
736
737             full, part = self._cache.names.get(token_id, (None, None))
738             if full is None:
739                 variants = analyzer.compute_variants(word_id)
740                 if not variants:
741                     continue
742
743                 with self.conn.cursor() as cur:
744                     cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
745                                 (token_id, variants))
746                     full, part = cast(Tuple[int, List[int]], cur.fetchone())
747
748                 self._cache.names[token_id] = (full, part)
749
750             assert part is not None
751
752             full_tokens.add(full)
753             partial_tokens.update(part)
754
755         return full_tokens, partial_tokens
756
757     def _add_postcode(self, item: PlaceName) -> Optional[str]:
758         """ Make sure the normalized postcode is present in the word table.
759         """
760         assert self.conn is not None
761         analyzer = self.token_analysis.analysis.get('@postcode')
762
763         if analyzer is None:
764             postcode_name = item.name.strip().upper()
765             variant_base = None
766         else:
767             postcode_name = analyzer.get_canonical_id(item)
768             variant_base = item.get_attr("variant")
769
770         if variant_base:
771             postcode = f'{postcode_name}@{variant_base}'
772         else:
773             postcode = postcode_name
774
775         if postcode not in self._cache.postcodes:
776             term = self._search_normalized(postcode_name)
777             if not term:
778                 return None
779
780             variants = {term}
781             if analyzer is not None and variant_base:
782                 variants.update(analyzer.compute_variants(variant_base))
783
784             with self.conn.cursor() as cur:
785                 cur.execute("SELECT create_postcode_word(%s, %s)",
786                             (postcode, list(variants)))
787             self._cache.postcodes.add(postcode)
788
789         return postcode_name
790
791
792 class _TokenInfo:
793     """ Collect token information to be sent back to the database.
794     """
795     def __init__(self) -> None:
796         self.names: Optional[str] = None
797         self.housenumbers: Set[str] = set()
798         self.housenumber_tokens: Set[int] = set()
799         self.street_tokens: Optional[Set[int]] = None
800         self.place_tokens: Set[int] = set()
801         self.address_tokens: Dict[str, str] = {}
802         self.postcode: Optional[str] = None
803
804     def _mk_array(self, tokens: Iterable[Any]) -> str:
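        # Render the tokens as a PostgreSQL array literal, e.g. [1, 2, 3] -> '{1,2,3}'.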
805         return f"{{{','.join((str(s) for s in tokens))}}}"
806
807     def to_dict(self) -> Dict[str, Any]:
808         """ Return the token information in database importable format.
809         """
810         out: Dict[str, Any] = {}
811
812         if self.names:
813             out['names'] = self.names
814
815         if self.housenumbers:
816             out['hnr'] = ';'.join(self.housenumbers)
817             out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)
818
819         if self.street_tokens is not None:
820             out['street'] = self._mk_array(self.street_tokens)
821
822         if self.place_tokens:
823             out['place'] = self._mk_array(self.place_tokens)
824
825         if self.address_tokens:
826             out['addr'] = self.address_tokens
827
828         if self.postcode:
829             out['postcode'] = self.postcode
830
831         return out
832
833     def set_names(self, fulls: Iterable[int], partials: Iterable[int]) -> None:
834         """ Adds token information for the normalised names.
835         """
836         self.names = self._mk_array(itertools.chain(fulls, partials))
837
838     def add_housenumber(self, token: Optional[int], hnr: Optional[str]) -> None:
839         """ Add a housenumber token together with its normalised form
840             to the token info.
841         """
842         if token:
843             assert hnr is not None
844             self.housenumbers.add(hnr)
845             self.housenumber_tokens.add(token)
846
847     def add_street(self, tokens: Iterable[int]) -> None:
848         """ Add addr:street match terms.
849         """
850         if self.street_tokens is None:
851             self.street_tokens = set()
852         self.street_tokens.update(tokens)
853
854     def add_place(self, tokens: Iterable[int]) -> None:
855         """ Add addr:place search and match terms.
856         """
857         self.place_tokens.update(tokens)
858
859     def add_address_term(self, key: str, partials: Iterable[int]) -> None:
860         """ Add additional address terms.
861         """
862         array = self._mk_array(partials)
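        # An empty token list renders as '{}' (length 2); only keep non-empty arrays.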
863         if len(array) > 2:
864             self.address_tokens[key] = array
865
866     def set_postcode(self, postcode: Optional[str]) -> None:
867         """ Set the postcode to the given one.
868         """
869         self.postcode = postcode
870
871
872 class _TokenCache:
873     """ Cache for token information to avoid repeated database queries.
874
875         This cache is not thread-safe and needs to be instantiated per
876         analyzer.
877     """
878     def __init__(self) -> None:
879         self.names: Dict[str, Tuple[int, List[int]]] = {}
880         self.partials: Dict[str, int] = {}
881         self.fulls: Dict[str, List[int]] = {}
882         self.postcodes: Set[str] = set()
883         self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {}