1 # SPDX-License-Identifier: GPL-3.0-or-later
2 #
3 # This file is part of Nominatim. (https://nominatim.org)
4 #
5 # Copyright (C) 2024 by the Nominatim developer community.
6 # For a full list of authors see the git log.
7 """
8 Tokenizer implementing normalisation as used before Nominatim 4 but using
9 libICU instead of the PostgreSQL module.
10 """
11 from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \
12                    Dict, Set, Iterable
13 import itertools
14 import logging
15 from pathlib import Path
16
17 from psycopg.types.json import Jsonb
18 from psycopg import sql as pysql
19
20 from ..db.connection import connect, Connection, Cursor, server_version_tuple,\
21                             drop_tables, table_exists, execute_scalar
22 from ..config import Configuration
23 from ..db.sql_preprocessor import SQLPreprocessor
24 from ..data.place_info import PlaceInfo
25 from ..data.place_name import PlaceName
26 from .icu_rule_loader import ICURuleLoader
27 from .place_sanitizer import PlaceSanitizer
28 from .icu_token_analysis import ICUTokenAnalysis
29 from .base import AbstractAnalyzer, AbstractTokenizer
30
31 DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
32
33 LOG = logging.getLogger()
34
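   # Word types in the `type` column of the word table for which dedicated
   # partial indexes are created: country names (C), postcodes (P),
   # full words (W) and housenumbers (H).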
35 WORD_TYPES =(('country_names', 'C'),
36              ('postcodes', 'P'),
37              ('full_word', 'W'),
38              ('housenumbers', 'H'))
39
40 def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
41     """ Create a new instance of the tokenizer provided by this module.
42     """
43     return ICUTokenizer(dsn, data_dir)
44
45
46 class ICUTokenizer(AbstractTokenizer):
47     """ This tokenizer uses libICU to convert names and queries to ASCII.
48         Otherwise it uses the same algorithms and data structures as the
49         normalization routines in Nominatim 3.
50     """
51
52     def __init__(self, dsn: str, data_dir: Path) -> None:
53         self.dsn = dsn
54         self.data_dir = data_dir
55         self.loader: Optional[ICURuleLoader] = None
56
57
58     def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
59         """ Set up a new tokenizer for the database.
60
61             This copies all necessary data into the project directory to make
62             sure the tokenizer remains stable even over updates.
63         """
64         self.loader = ICURuleLoader(config)
65
66         self._save_config()
67
68         if init_db:
69             self.update_sql_functions(config)
70             self._setup_db_tables(config)
71             self._create_base_indices(config, 'word')
72
73
74     def init_from_project(self, config: Configuration) -> None:
75         """ Initialise the tokenizer from the project directory.
76         """
77         self.loader = ICURuleLoader(config)
78
79         with connect(self.dsn) as conn:
80             self.loader.load_config_from_db(conn)
81
82
83     def finalize_import(self, config: Configuration) -> None:
84         """ Do any required postprocessing to make the tokenizer data ready
85             for use.
86         """
87         self._create_lookup_indices(config, 'word')
88
89
90     def update_sql_functions(self, config: Configuration) -> None:
91         """ Reimport the SQL functions for this tokenizer.
92         """
93         with connect(self.dsn) as conn:
94             sqlp = SQLPreprocessor(conn, config)
95             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
96
97
98     def check_database(self, config: Configuration) -> None:
99         """ Check that the tokenizer is set up correctly.
100         """
101         # Will throw an error if there is an issue.
102         self.init_from_project(config)
103
104
105     def update_statistics(self, config: Configuration, threads: int = 2) -> None:
106         """ Recompute frequencies for all name words.
107         """
108         with connect(self.dsn) as conn:
109             if not table_exists(conn, 'search_name'):
110                 return
111
112             with conn.cursor() as cur:
113                 cur.execute('ANALYSE search_name')
114                 if threads > 1:
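                        # Let PostgreSQL parallelise the aggregation queries below,
                        # capped at six workers.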
115                     cur.execute(pysql.SQL('SET max_parallel_workers_per_gather TO {}')
116                                      .format(pysql.Literal(min(threads, 6),)))
117
118                 if server_version_tuple(conn) < (12, 0):
119                     LOG.info('Computing word frequencies')
120                     drop_tables(conn, 'word_frequencies', 'addressword_frequencies')
121                     cur.execute("""CREATE TEMP TABLE word_frequencies AS
122                                      SELECT unnest(name_vector) as id, count(*)
123                                      FROM search_name GROUP BY id""")
124                     cur.execute('CREATE INDEX ON word_frequencies(id)')
125                     cur.execute("""CREATE TEMP TABLE addressword_frequencies AS
126                                      SELECT unnest(nameaddress_vector) as id, count(*)
127                                      FROM search_name GROUP BY id""")
128                     cur.execute('CREATE INDEX ON addressword_frequencies(id)')
129                     cur.execute("""CREATE OR REPLACE FUNCTION word_freq_update(wid INTEGER,
130                                                                                INOUT info JSONB)
131                                    AS $$
132                                    DECLARE rec RECORD;
133                                    BEGIN
134                                    IF info is null THEN
135                                      info = '{}'::jsonb;
136                                    END IF;
137                                    FOR rec IN SELECT count FROM word_frequencies WHERE id = wid
138                                    LOOP
139                                      info = info || jsonb_build_object('count', rec.count);
140                                    END LOOP;
141                                    FOR rec IN SELECT count FROM addressword_frequencies WHERE id = wid
142                                    LOOP
143                                      info = info || jsonb_build_object('addr_count', rec.count);
144                                    END LOOP;
145                                    IF info = '{}'::jsonb THEN
146                                      info = null;
147                                    END IF;
148                                    END;
149                                    $$ LANGUAGE plpgsql IMMUTABLE;
150                                 """)
151                     LOG.info('Update word table with recomputed frequencies')
152                     drop_tables(conn, 'tmp_word')
153                     cur.execute("""CREATE TABLE tmp_word AS
154                                     SELECT word_id, word_token, type, word,
155                                            word_freq_update(word_id, info) as info
156                                     FROM word
157                                 """)
158                     drop_tables(conn, 'word_frequencies', 'addressword_frequencies')
159                 else:
160                     LOG.info('Computing word frequencies')
161                     drop_tables(conn, 'word_frequencies')
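                        # A single query counts token occurrences in both the name and
                        # the address vectors of search_name and merges the results into
                        # one JSONB object per token ('count' and 'addr_count').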
162                     cur.execute("""
163                       CREATE TEMP TABLE word_frequencies AS
164                       WITH word_freq AS MATERIALIZED (
165                                SELECT unnest(name_vector) as id, count(*)
166                                      FROM search_name GROUP BY id),
167                            addr_freq AS MATERIALIZED (
168                                SELECT unnest(nameaddress_vector) as id, count(*)
169                                      FROM search_name GROUP BY id)
170                       SELECT coalesce(a.id, w.id) as id,
171                              (CASE WHEN w.count is null THEN '{}'::JSONB
172                                   ELSE jsonb_build_object('count', w.count) END
173                               ||
174                               CASE WHEN a.count is null THEN '{}'::JSONB
175                                   ELSE jsonb_build_object('addr_count', a.count) END) as info
176                       FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id;
177                       """)
178                     cur.execute('CREATE UNIQUE INDEX ON word_frequencies(id) INCLUDE(info)')
179                     cur.execute('ANALYSE word_frequencies')
180                     LOG.info('Update word table with recomputed frequencies')
181                     drop_tables(conn, 'tmp_word')
182                     cur.execute("""CREATE TABLE tmp_word AS
183                                     SELECT word_id, word_token, type, word,
184                                            (CASE WHEN wf.info is null THEN word.info
185                                             ELSE coalesce(word.info, '{}'::jsonb) || wf.info
186                                             END) as info
187                                     FROM word LEFT JOIN word_frequencies wf
188                                          ON word.word_id = wf.id
189                                 """)
190                     drop_tables(conn, 'word_frequencies')
191
192             with conn.cursor() as cur:
193                 cur.execute('SET max_parallel_workers_per_gather TO 0')
194
195             sqlp = SQLPreprocessor(conn, config)
196             sqlp.run_string(conn,
197                             'GRANT SELECT ON tmp_word TO "{{config.DATABASE_WEBUSER}}"')
198             conn.commit()
199         self._create_base_indices(config, 'tmp_word')
200         self._create_lookup_indices(config, 'tmp_word')
201         self._move_temporary_word_table('tmp_word')
202
203
204
205     def _cleanup_housenumbers(self) -> None:
206         """ Remove unused house numbers.
207         """
208         with connect(self.dsn) as conn:
209             if not table_exists(conn, 'search_name'):
210                 return
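                # Candidates for deletion: housenumber tokens that are no longer
                # referenced by any search_name entry and that are not plain
                # numbers of up to six digits.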
211             with conn.cursor(name="hnr_counter") as cur:
212                 cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
213                                FROM word
214                                WHERE type = 'H'
215                                  AND NOT EXISTS(SELECT * FROM search_name
216                                                 WHERE ARRAY[word.word_id] && name_vector)
217                                  AND (char_length(coalesce(word, word_token)) > 6
218                                       OR coalesce(word, word_token) not similar to '\\d+')
219                             """)
220                 candidates = {token: wid for wid, token in cur}
221             with conn.cursor(name="hnr_counter") as cur:
222                 cur.execute("""SELECT housenumber FROM placex
223                                WHERE housenumber is not null
224                                      AND (char_length(housenumber) > 6
225                                           OR housenumber not similar to '\\d+')
226                             """)
227                 for row in cur:
228                     for hnr in row[0].split(';'):
229                         candidates.pop(hnr, None)
230             LOG.info("There are %s outdated housenumbers.", len(candidates))
231             LOG.debug("Outdated housenumbers: %s", candidates.keys())
232             if candidates:
233                 with conn.cursor() as cur:
234                     cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
235                                 (list(candidates.values()), ))
236                 conn.commit()
237
238
239
240     def update_word_tokens(self) -> None:
241         """ Remove unused tokens.
242         """
243         LOG.warning("Cleaning up housenumber tokens.")
244         self._cleanup_housenumbers()
245         LOG.warning("Tokenizer house-keeping done.")
246
247
248     def name_analyzer(self) -> 'ICUNameAnalyzer':
249         """ Create a new analyzer for tokenizing names and queries
250             using this tokenizer. Analyzers are context managers and should
251             be used accordingly:
252
253             ```
254             with tokenizer.name_analyzer() as analyzer:
255                 analyzer.tokenize()
256             ```
257
258             When used outside the with construct, the caller must make sure
259             to call close() before the analyzer is destroyed.
260
261             Analyzers are not thread-safe. You need to instantiate one per thread.
262         """
263         assert self.loader is not None
264         return ICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
265                                self.loader.make_token_analysis())
266
267
268     def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
269         """ Return a list of the `num` most frequent full words
270             in the database.
271         """
272         with conn.cursor() as cur:
273             cur.execute("""SELECT word, sum((info->>'count')::int) as count
274                              FROM word WHERE type = 'W'
275                              GROUP BY word
276                              ORDER BY count DESC LIMIT %s""", (num,))
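                # The word column may carry an '@'-suffix (e.g. for analyzer
                # variants); only the base word is returned.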
277             return list(s[0].split('@')[0] for s in cur)
278
279
280     def _save_config(self) -> None:
281         """ Save the configuration that needs to remain stable for the given
282             database as database properties.
283         """
284         assert self.loader is not None
285         with connect(self.dsn) as conn:
286             self.loader.save_config_to_db(conn)
287
288
289     def _setup_db_tables(self, config: Configuration) -> None:
290         """ Create an empty word table together with the sequence used
291             for generating new word ids.
292         """
293         with connect(self.dsn) as conn:
294             drop_tables(conn, 'word')
295             sqlp = SQLPreprocessor(conn, config)
296             sqlp.run_string(conn, """
297                 CREATE TABLE word (
298                       word_id INTEGER,
299                       word_token text NOT NULL,
300                       type text NOT NULL,
301                       word text,
302                       info jsonb
303                     ) {{db.tablespace.search_data}};
304                 GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
305
306                 DROP SEQUENCE IF EXISTS seq_word;
307                 CREATE SEQUENCE seq_word start 1;
308                 GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
309             """)
310             conn.commit()
311
312
313     def _create_base_indices(self, config: Configuration, table_name: str) -> None:
314         """ Create the basic indexes on the given word table: a lookup
315             index on the word token and partial indexes per word type.
316         """
317         with connect(self.dsn) as conn:
318             sqlp = SQLPreprocessor(conn, config)
319             sqlp.run_string(conn,
320                             """CREATE INDEX idx_{{table_name}}_word_token ON {{table_name}}
321                                USING BTREE (word_token) {{db.tablespace.search_index}}""",
322                             table_name=table_name)
323             for name, ctype in WORD_TYPES:
324                 sqlp.run_string(conn,
325                                 """CREATE INDEX idx_{{table_name}}_{{idx_name}} ON {{table_name}}
326                                    USING BTREE (word) {{db.tablespace.address_index}}
327                                    WHERE type = '{{column_type}}'
328                                 """,
329                                 table_name=table_name, idx_name=name,
330                                 column_type=ctype)
331             conn.commit()
332
333
334     def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
335         """ Create additional indexes used when running the API.
336         """
337         with connect(self.dsn) as conn:
338             sqlp = SQLPreprocessor(conn, config)
339             # Index required for details lookup.
340             sqlp.run_string(conn, """
341                 CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
342                   ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
343             """,
344             table_name=table_name)
345             conn.commit()
346
347
348     def _move_temporary_word_table(self, old: str) -> None:
349         """ Rename all tables and indexes used by the tokenizer.
350         """
351         with connect(self.dsn) as conn:
352             drop_tables(conn, 'word')
353             with conn.cursor() as cur:
354                 cur.execute(f"ALTER TABLE {old} RENAME TO word")
355                 for idx in ('word_token', 'word_id'):
356                     cur.execute(f"""ALTER INDEX idx_{old}_{idx}
357                                       RENAME TO idx_word_{idx}""")
358                 for name, _ in WORD_TYPES:
359                     cur.execute(f"""ALTER INDEX idx_{old}_{name}
360                                     RENAME TO idx_word_{name}""")
361             conn.commit()
362
363
364
365
366 class ICUNameAnalyzer(AbstractAnalyzer):
367     """ The ICU analyzer uses the ICU library for splitting names.
368
369         Each instance opens a connection to the database to request the
370         normalization.
371     """
372
373     def __init__(self, dsn: str, sanitizer: PlaceSanitizer,
374                  token_analysis: ICUTokenAnalysis) -> None:
375         self.conn: Optional[Connection] = connect(dsn)
376         self.conn.autocommit = True
377         self.sanitizer = sanitizer
378         self.token_analysis = token_analysis
379
380         self._cache = _TokenCache()
381
382
383     def close(self) -> None:
384         """ Free all resources used by the analyzer.
385         """
386         if self.conn:
387             self.conn.close()
388             self.conn = None
389
390
391     def _search_normalized(self, name: str) -> str:
392         """ Return the search token transliteration of the given name.
393         """
394         return cast(str, self.token_analysis.search.transliterate(name)).strip()
395
396
397     def _normalized(self, name: str) -> str:
398         """ Return the normalized version of the given name with all
399             non-relevant information removed.
400         """
401         return cast(str, self.token_analysis.normalizer.transliterate(name)).strip()
402
403
404     def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
405         """ Return token information for the given list of words.
406             If a word starts with '#', it is assumed to be a full name,
407             otherwise it is taken to be a partial name.
408
409             The function returns a list of tuples with
410             (original word, word token, word id).
411
412             The function is used for testing and debugging only
413             and is not necessarily efficient.
414         """
415         assert self.conn is not None
416         full_tokens = {}
417         partial_tokens = {}
418         for word in words:
419             if word.startswith('#'):
420                 full_tokens[word] = self._search_normalized(word[1:])
421             else:
422                 partial_tokens[word] = self._search_normalized(word)
423
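            # Full words are stored with type 'W', partial words with type 'w'.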
424         with self.conn.cursor() as cur:
425             cur.execute("""SELECT word_token, word_id
426                             FROM word WHERE word_token = ANY(%s) and type = 'W'
427                         """, (list(full_tokens.values()),))
428             full_ids = {r[0]: r[1] for r in cur}
429             cur.execute("""SELECT word_token, word_id
430                             FROM word WHERE word_token = ANY(%s) and type = 'w'""",
431                         (list(partial_tokens.values()),))
432             part_ids = {r[0]: r[1] for r in cur}
433
434         return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
435                + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
436
437
438     def normalize_postcode(self, postcode: str) -> str:
439         """ Convert the postcode to a standardized form.
440
441             This function must yield exactly the same result as the SQL function
442             'token_normalized_postcode()'.
443         """
444         return postcode.strip().upper()
445
446
447     def update_postcodes_from_db(self) -> None:
448         """ Update postcode tokens in the word table from the location_postcode
449             table.
450         """
451         assert self.conn is not None
452         analyzer = self.token_analysis.analysis.get('@postcode')
453
454         with self.conn.cursor() as cur:
455             # First get all postcode names currently in the word table.
456             cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
457             word_entries = set((entry[0] for entry in cur))
458
459             # Then compute the required postcode names from the postcode table.
460             needed_entries = set()
461             cur.execute("SELECT country_code, postcode FROM location_postcode")
462             for cc, postcode in cur:
463                 info = PlaceInfo({'country_code': cc,
464                                   'class': 'place', 'type': 'postcode',
465                                   'address': {'postcode': postcode}})
466                 address = self.sanitizer.process_names(info)[1]
467                 for place in address:
468                     if place.kind == 'postcode':
469                         if analyzer is None:
470                             postcode_name = place.name.strip().upper()
471                             variant_base = None
472                         else:
473                             postcode_name = analyzer.get_canonical_id(place)
474                             variant_base = place.get_attr("variant")
475
476                         if variant_base:
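                            # Postcodes with a variant are tracked as 'canonical@variant'
                            # so the variant can be reconstructed when the word is added.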
477                             needed_entries.add(f'{postcode_name}@{variant_base}')
478                         else:
479                             needed_entries.add(postcode_name)
480                         break
481
482         # Now update the word table.
483         self._delete_unused_postcode_words(word_entries - needed_entries)
484         self._add_missing_postcode_words(needed_entries - word_entries)
485
486     def _delete_unused_postcode_words(self, tokens: Iterable[str]) -> None:
487         assert self.conn is not None
488         if tokens:
489             with self.conn.cursor() as cur:
490                 cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
491                             (list(tokens), ))
492
493     def _add_missing_postcode_words(self, tokens: Iterable[str]) -> None:
494         assert self.conn is not None
495         if not tokens:
496             return
497
498         analyzer = self.token_analysis.analysis.get('@postcode')
499         terms = []
500
501         for postcode_name in tokens:
502             if '@' in postcode_name:
503                 term, variant = postcode_name.split('@', 1)
504                 term = self._search_normalized(term)
505                 if analyzer is None:
506                     variants = [term]
507                 else:
508                     variants = analyzer.compute_variants(variant)
509                     if term not in variants:
510                         variants.append(term)
511             else:
512                 variants = [self._search_normalized(postcode_name)]
513             terms.append((postcode_name, variants))
514
515         if terms:
516             with self.conn.cursor() as cur:
517                 cur.executemany("""SELECT create_postcode_word(%s, %s)""", terms)
518
519
520
521
522     def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
523                                should_replace: bool) -> None:
524         """ Replace the search index for special phrases with the new phrases.
525             If `should_replace` is True, then the previous set of phrases is
526             completely replaced. Otherwise the phrases are added to the
527             already existing ones.
528         """
529         assert self.conn is not None
530         norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
531                             for p in phrases))
532
533         with self.conn.cursor() as cur:
534             # Get the old phrases.
535             existing_phrases = set()
536             cur.execute("SELECT word, info FROM word WHERE type = 'S'")
537             for word, info in cur:
538                 existing_phrases.add((word, info['class'], info['type'],
539                                       info.get('op') or '-'))
540
541             added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
542             if should_replace:
543                 deleted = self._remove_special_phrases(cur, norm_phrases,
544                                                        existing_phrases)
545             else:
546                 deleted = 0
547
548         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
549                  len(norm_phrases), added, deleted)
550
551
552     def _add_special_phrases(self, cursor: Cursor,
553                              new_phrases: Set[Tuple[str, str, str, str]],
554                              existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
555         """ Add all phrases to the database that are not yet there.
556         """
557         to_add = new_phrases - existing_phrases
558
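            # Bulk-insert the new phrases with COPY; only 'in' and 'near' are
            # stored as operators, anything else becomes a null operator.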
559         added = 0
560         with cursor.copy('COPY word(word_token, type, word, info) FROM STDIN') as copy:
561             for word, cls, typ, oper in to_add:
562                 term = self._search_normalized(word)
563                 if term:
564                     copy.write_row((term, 'S', word,
565                                     Jsonb({'class': cls, 'type': typ,
566                                            'op': oper if oper in ('in', 'near') else None})))
567                     added += 1
568
569         return added
570
571
572     def _remove_special_phrases(self, cursor: Cursor,
573                              new_phrases: Set[Tuple[str, str, str, str]],
574                              existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
575         """ Remove all phrases from the database that are no longer in the
576             new phrase list.
577         """
578         to_delete = existing_phrases - new_phrases
579
580         if to_delete:
581             cursor.executemany(
582                 """ DELETE FROM word
583                       WHERE type = 'S' and word = %s
584                             and info->>'class' = %s and info->>'type' = %s
585                             and %s = coalesce(info->>'op', '-')
586                 """, to_delete)
587
588         return len(to_delete)
589
590
591     def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
592         """ Add default names for the given country to the search index.
593         """
594         # Make sure any name preprocessing for country names applies.
595         info = PlaceInfo({'name': names, 'country_code': country_code,
596                           'rank_address': 4, 'class': 'boundary',
597                           'type': 'administrative'})
598         self._add_country_full_names(country_code,
599                                      self.sanitizer.process_names(info)[0],
600                                      internal=True)
601
602
603     def _add_country_full_names(self, country_code: str, names: Sequence[PlaceName],
604                                 internal: bool = False) -> None:
605         """ Add names for the given country from an already sanitized
606             name list.
607         """
608         assert self.conn is not None
609         word_tokens = set()
610         for name in names:
611             norm_name = self._search_normalized(name.name)
612             if norm_name:
613                 word_tokens.add(norm_name)
614
615         with self.conn.cursor() as cur:
616             # Get existing names
617             cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
618                              FROM word
619                              WHERE type = 'C' and word = %s""",
620                         (country_code, ))
621             # internal/external names
622             existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()}
623             for word in cur:
624                 existing_tokens[word[1]].add(word[0])
625
626             # Delete names that no longer exist.
627             gone_tokens = existing_tokens[internal] - word_tokens
628             if internal:
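                    # External names that have become internal are deleted here and
                    # re-inserted below with the internal flag set.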
629                 gone_tokens.update(existing_tokens[False] & word_tokens)
630             if gone_tokens:
631                 cur.execute("""DELETE FROM word
632                                USING unnest(%s::text[]) as token
633                                WHERE type = 'C' and word = %s
634                                      and word_token = token""",
635                             (list(gone_tokens), country_code))
636
637             # Only add those names that are not yet in the list.
638             new_tokens = word_tokens - existing_tokens[True]
639             if not internal:
640                 new_tokens -= existing_tokens[False]
641             if new_tokens:
642                 if internal:
643                     sql = """INSERT INTO word (word_token, type, word, info)
644                                (SELECT token, 'C', %s, '{"internal": "yes"}'
645                                   FROM unnest(%s::text[]) as token)
646                            """
647                 else:
648                     sql = """INSERT INTO word (word_token, type, word)
649                                    (SELECT token, 'C', %s
650                                     FROM unnest(%s::text[]) as token)
651                           """
652                 cur.execute(sql, (country_code, list(new_tokens)))
653
654
655     def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
656         """ Determine tokenizer information about the given place.
657
658             Returns a JSON-serializable structure that will be handed into
659             the database via the token_info field.
660         """
661         token_info = _TokenInfo()
662
663         names, address = self.sanitizer.process_names(place)
664
665         if names:
666             token_info.set_names(*self._compute_name_tokens(names))
667
668             if place.is_country():
669                 assert place.country_code is not None
670                 self._add_country_full_names(place.country_code, names)
671
672         if address:
673             self._process_place_address(token_info, address)
674
675         return token_info.to_dict()
676
677
678     def _process_place_address(self, token_info: '_TokenInfo',
679                                address: Sequence[PlaceName]) -> None:
680         for item in address:
681             if item.kind == 'postcode':
682                 token_info.set_postcode(self._add_postcode(item))
683             elif item.kind == 'housenumber':
684                 token_info.add_housenumber(*self._compute_housenumber_token(item))
685             elif item.kind == 'street':
686                 token_info.add_street(self._retrieve_full_tokens(item.name))
687             elif item.kind == 'place':
688                 if not item.suffix:
689                     token_info.add_place(itertools.chain(*self._compute_name_tokens([item])))
690             elif not item.kind.startswith('_') and not item.suffix and \
691                  item.kind not in ('country', 'full', 'inclusion'):
692                 token_info.add_address_term(item.kind,
693                                             itertools.chain(*self._compute_name_tokens([item])))
694
695
696     def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
697         """ Normalize the housenumber and return the word token and the
698             canonical form.
699         """
700         assert self.conn is not None
701         analyzer = self.token_analysis.analysis.get('@housenumber')
702         result: Tuple[Optional[int], Optional[str]] = (None, None)
703
704         if analyzer is None:
705             # When no custom analyzer is set, simply normalize and transliterate
706             norm_name = self._search_normalized(hnr.name)
707             if norm_name:
708                 result = self._cache.housenumbers.get(norm_name, result)
709                 if result[0] is None:
710                     hid = execute_scalar(self.conn, "SELECT getorcreate_hnr_id(%s)", (norm_name, ))
711
712                     result = hid, norm_name
713                     self._cache.housenumbers[norm_name] = result
714         else:
715             # Otherwise use the analyzer to determine the canonical name.
716             # Per convention we use the first variant as the 'lookup name', the
717             # name that gets saved in the housenumber field of the place.
718             word_id = analyzer.get_canonical_id(hnr)
719             if word_id:
720                 result = self._cache.housenumbers.get(word_id, result)
721                 if result[0] is None:
722                     variants = analyzer.compute_variants(word_id)
723                     if variants:
724                         hid = execute_scalar(self.conn, "SELECT create_analyzed_hnr_id(%s, %s)",
725                                              (word_id, list(variants)))
726                         result = hid, variants[0]
727                         self._cache.housenumbers[word_id] = result
728
729         return result
730
731
732     def _retrieve_full_tokens(self, name: str) -> List[int]:
733         """ Get the full name token for the given name, if it exists.
734             The name is only retrieved for the standard analyzer.
735         """
736         assert self.conn is not None
737         norm_name = self._search_normalized(name)
738
739         # return cached if possible
740         if norm_name in self._cache.fulls:
741             return self._cache.fulls[norm_name]
742
743         with self.conn.cursor() as cur:
744             cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
745                         (norm_name, ))
746             full = [row[0] for row in cur]
747
748         self._cache.fulls[norm_name] = full
749
750         return full
751
752
753     def _compute_name_tokens(self, names: Sequence[PlaceName]) -> Tuple[Set[int], Set[int]]:
754         """ Compute the full name and partial name tokens for the given
755             list of names.
756         """
757         assert self.conn is not None
758         full_tokens: Set[int] = set()
759         partial_tokens: Set[int] = set()
760
761         for name in names:
762             analyzer_id = name.get_attr('analyzer')
763             analyzer = self.token_analysis.get_analyzer(analyzer_id)
764             word_id = analyzer.get_canonical_id(name)
765             if analyzer_id is None:
766                 token_id = word_id
767             else:
768                 token_id = f'{word_id}@{analyzer_id}'
769
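                # Check the cache first; on a miss, getorcreate_full_word() returns
                # the full-word token and its partial-word tokens in one database call.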
770             full, part = self._cache.names.get(token_id, (None, None))
771             if full is None:
772                 variants = analyzer.compute_variants(word_id)
773                 if not variants:
774                     continue
775
776                 with self.conn.cursor() as cur:
777                     cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
778                                 (token_id, variants))
779                     full, part = cast(Tuple[int, List[int]], cur.fetchone())
780
781                 self._cache.names[token_id] = (full, part)
782
783             assert part is not None
784
785             full_tokens.add(full)
786             partial_tokens.update(part)
787
788         return full_tokens, partial_tokens
789
790
791     def _add_postcode(self, item: PlaceName) -> Optional[str]:
792         """ Make sure the normalized postcode is present in the word table.
793         """
794         assert self.conn is not None
795         analyzer = self.token_analysis.analysis.get('@postcode')
796
797         if analyzer is None:
798             postcode_name = item.name.strip().upper()
799             variant_base = None
800         else:
801             postcode_name = analyzer.get_canonical_id(item)
802             variant_base = item.get_attr("variant")
803
804         if variant_base:
805             postcode = f'{postcode_name}@{variant_base}'
806         else:
807             postcode = postcode_name
808
809         if postcode not in self._cache.postcodes:
810             term = self._search_normalized(postcode_name)
811             if not term:
812                 return None
813
814             variants = {term}
815             if analyzer is not None and variant_base:
816                 variants.update(analyzer.compute_variants(variant_base))
817
818             with self.conn.cursor() as cur:
819                 cur.execute("SELECT create_postcode_word(%s, %s)",
820                             (postcode, list(variants)))
821             self._cache.postcodes.add(postcode)
822
823         return postcode_name
824
825
826 class _TokenInfo:
827     """ Collect token information to be sent back to the database.
828     """
829     def __init__(self) -> None:
830         self.names: Optional[str] = None
831         self.housenumbers: Set[str] = set()
832         self.housenumber_tokens: Set[int] = set()
833         self.street_tokens: Optional[Set[int]] = None
834         self.place_tokens: Set[int] = set()
835         self.address_tokens: Dict[str, str] = {}
836         self.postcode: Optional[str] = None
837
838
839     def _mk_array(self, tokens: Iterable[Any]) -> str:
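            # Render the tokens as a PostgreSQL array literal, e.g. '{1,2,3}'.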
840         return f"{{{','.join((str(s) for s in tokens))}}}"
841
842
843     def to_dict(self) -> Dict[str, Any]:
844         """ Return the token information in database importable format.
845         """
846         out: Dict[str, Any] = {}
847
848         if self.names:
849             out['names'] = self.names
850
851         if self.housenumbers:
852             out['hnr'] = ';'.join(self.housenumbers)
853             out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)
854
855         if self.street_tokens is not None:
856             out['street'] = self._mk_array(self.street_tokens)
857
858         if self.place_tokens:
859             out['place'] = self._mk_array(self.place_tokens)
860
861         if self.address_tokens:
862             out['addr'] = self.address_tokens
863
864         if self.postcode:
865             out['postcode'] = self.postcode
866
867         return out
868
869
870     def set_names(self, fulls: Iterable[int], partials: Iterable[int]) -> None:
871         """ Adds token information for the normalised names.
872         """
873         self.names = self._mk_array(itertools.chain(fulls, partials))
874
875
876     def add_housenumber(self, token: Optional[int], hnr: Optional[str]) -> None:
877         """ Add a single normalised housenumber and its token to the
878             collected token information.
879         """
880         if token:
881             assert hnr is not None
882             self.housenumbers.add(hnr)
883             self.housenumber_tokens.add(token)
884
885
886     def add_street(self, tokens: Iterable[int]) -> None:
887         """ Add addr:street match terms.
888         """
889         if self.street_tokens is None:
890             self.street_tokens = set()
891         self.street_tokens.update(tokens)
892
893
894     def add_place(self, tokens: Iterable[int]) -> None:
895         """ Add addr:place search and match terms.
896         """
897         self.place_tokens.update(tokens)
898
899
900     def add_address_term(self, key: str, partials: Iterable[int]) -> None:
901         """ Add additional address terms.
902         """
903         array = self._mk_array(partials)
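            # An empty array renders as '{}' (length 2); only keep non-empty term lists.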
904         if len(array) > 2:
905             self.address_tokens[key] = array
906
907     def set_postcode(self, postcode: Optional[str]) -> None:
908         """ Set the postcode to the given one.
909         """
910         self.postcode = postcode
911
912
913 class _TokenCache:
914     """ Cache for token information to avoid repeated database queries.
915
916         This cache is not thread-safe and needs to be instantiated per
917         analyzer.
918     """
919     def __init__(self) -> None:
920         self.names: Dict[str, Tuple[int, List[int]]] = {}
921         self.partials: Dict[str, int] = {}
922         self.fulls: Dict[str, List[int]] = {}
923         self.postcodes: Set[str] = set()
924         self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {}