# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \
                   Dict, Set, Iterable
import itertools
import logging
from pathlib import Path

from psycopg.types.json import Jsonb
from psycopg import sql as pysql

from ..db.connection import connect, Connection, Cursor, server_version_tuple,\
                            drop_tables, table_exists, execute_scalar
from ..config import Configuration
from ..db.sql_preprocessor import SQLPreprocessor
from ..data.place_info import PlaceInfo
from ..data.place_name import PlaceName
from .icu_rule_loader import ICURuleLoader
from .place_sanitizer import PlaceSanitizer
from .icu_token_analysis import ICUTokenAnalysis
from .base import AbstractAnalyzer, AbstractTokenizer

DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()

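# Pairs of (index name, value of the 'type' column in the word table) used for
# the per-type indexes that are created and renamed further down.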
WORD_TYPES = (('country_names', 'C'),
              ('postcodes', 'P'),
              ('full_word', 'W'),
              ('housenumbers', 'H'))

def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
    """ Create a new instance of the tokenizer provided by this module.
    """
    return ICUTokenizer(dsn, data_dir)


class ICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn: str, data_dir: Path) -> None:
        self.dsn = dsn
        self.data_dir = data_dir
        self.loader: Optional[ICURuleLoader] = None


    def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        self.loader = ICURuleLoader(config)

        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._setup_db_tables(config)
            self._create_base_indices(config, 'word')


    def init_from_project(self, config: Configuration) -> None:
        """ Initialise the tokenizer from the project directory.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)


    def finalize_import(self, config: Configuration) -> None:
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        self._create_lookup_indices(config, 'word')


    def update_sql_functions(self, config: Configuration) -> None:
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')


    def check_database(self, config: Configuration) -> None:
        """ Check that the tokenizer is set up correctly.
        """
        # Will throw an error if there is an issue.
        self.init_from_project(config)


    def update_statistics(self, config: Configuration, threads: int = 2) -> None:
        """ Recompute frequencies for all name words.
        """
        with connect(self.dsn) as conn:
            if not table_exists(conn, 'search_name'):
                return

            with conn.cursor() as cur:
                cur.execute('ANALYSE search_name')
                if threads > 1:
                    cur.execute(pysql.SQL('SET max_parallel_workers_per_gather TO {}')
                                     .format(pysql.Literal(min(threads, 6))))

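                # PostgreSQL 12 introduced MATERIALIZED CTEs; older servers get the
                # step-by-step fallback below that builds temporary frequency tables
                # and merges them through a helper function.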
                if server_version_tuple(conn) < (12, 0):
                    LOG.info('Computing word frequencies')
                    drop_tables(conn, 'word_frequencies', 'addressword_frequencies')
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute('CREATE INDEX ON word_frequencies(id)')
                    cur.execute("""CREATE TEMP TABLE addressword_frequencies AS
                                     SELECT unnest(nameaddress_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute('CREATE INDEX ON addressword_frequencies(id)')
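                    # word_freq_update() folds the counts from both temporary tables
                    # into the word's JSONB info column, leaving the info NULL when
                    # no counts exist for a word.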
                    cur.execute("""CREATE OR REPLACE FUNCTION word_freq_update(wid INTEGER,
                                                                               INOUT info JSONB)
                                   AS $$
                                   DECLARE rec RECORD;
                                   BEGIN
                                   IF info is null THEN
                                     info = '{}'::jsonb;
                                   END IF;
                                   FOR rec IN SELECT count FROM word_frequencies WHERE id = wid
                                   LOOP
                                     info = info || jsonb_build_object('count', rec.count);
                                   END LOOP;
                                   FOR rec IN SELECT count FROM addressword_frequencies WHERE id = wid
                                   LOOP
                                     info = info || jsonb_build_object('addr_count', rec.count);
                                   END LOOP;
                                   IF info = '{}'::jsonb THEN
                                     info = null;
                                   END IF;
                                   END;
                                   $$ LANGUAGE plpgsql IMMUTABLE;
                                """)
                    LOG.info('Update word table with recomputed frequencies')
                    drop_tables(conn, 'tmp_word')
                    cur.execute("""CREATE TABLE tmp_word AS
                                    SELECT word_id, word_token, type, word,
                                           word_freq_update(word_id, info) as info
                                    FROM word
                                """)
                    drop_tables(conn, 'word_frequencies', 'addressword_frequencies')
                else:
                    LOG.info('Computing word frequencies')
                    drop_tables(conn, 'word_frequencies')
                    cur.execute("""
                      CREATE TEMP TABLE word_frequencies AS
                      WITH word_freq AS MATERIALIZED (
                               SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id),
                           addr_freq AS MATERIALIZED (
                               SELECT unnest(nameaddress_vector) as id, count(*)
                                     FROM search_name GROUP BY id)
                      SELECT coalesce(a.id, w.id) as id,
                             (CASE WHEN w.count is null THEN '{}'::JSONB
                                  ELSE jsonb_build_object('count', w.count) END
                              ||
                              CASE WHEN a.count is null THEN '{}'::JSONB
                                  ELSE jsonb_build_object('addr_count', a.count) END) as info
                      FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id;
                      """)
                    cur.execute('CREATE UNIQUE INDEX ON word_frequencies(id) INCLUDE(info)')
                    cur.execute('ANALYSE word_frequencies')
                    LOG.info('Update word table with recomputed frequencies')
                    drop_tables(conn, 'tmp_word')
                    cur.execute("""CREATE TABLE tmp_word AS
                                    SELECT word_id, word_token, type, word,
                                           (CASE WHEN wf.info is null THEN word.info
                                            ELSE coalesce(word.info, '{}'::jsonb) || wf.info
                                            END) as info
                                    FROM word LEFT JOIN word_frequencies wf
                                         ON word.word_id = wf.id
                                    ORDER BY word_id
                                """)
                    drop_tables(conn, 'word_frequencies')

            with conn.cursor() as cur:
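                # Switch parallel query back off for the remaining steps.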
                cur.execute('SET max_parallel_workers_per_gather TO 0')

            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_string(conn,
                            'GRANT SELECT ON tmp_word TO "{{config.DATABASE_WEBUSER}}"')
            conn.commit()
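        # Build all indexes on the new table first, then swap it in for the old
        # word table.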
        self._create_base_indices(config, 'tmp_word')
        self._create_lookup_indices(config, 'tmp_word')
        self._move_temporary_word_table('tmp_word')


    def _cleanup_housenumbers(self) -> None:
        """ Remove unused house numbers.
        """
        with connect(self.dsn) as conn:
            if not table_exists(conn, 'search_name'):
                return
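            # First collect all non-trivial housenumber tokens that are no longer
            # referenced from search_name, then remove from the candidate list any
            # number that still appears in placex.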
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
                               FROM word
                               WHERE type = 'H'
                                 AND NOT EXISTS(SELECT * FROM search_name
                                                WHERE ARRAY[word.word_id] && name_vector)
                                 AND (char_length(coalesce(word, word_token)) > 6
                                      OR coalesce(word, word_token) not similar to '\\d+')
                            """)
                candidates = {token: wid for wid, token in cur}
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT housenumber FROM placex
                               WHERE housenumber is not null
                                     AND (char_length(housenumber) > 6
                                          OR housenumber not similar to '\\d+')
                            """)
                for row in cur:
                    for hnr in row[0].split(';'):
                        candidates.pop(hnr, None)
            LOG.info("There are %s outdated housenumbers.", len(candidates))
            LOG.debug("Outdated housenumbers: %s", candidates.keys())
            if candidates:
                with conn.cursor() as cur:
                    cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
                                (list(candidates.values()), ))
                conn.commit()


    def update_word_tokens(self) -> None:
        """ Remove unused tokens.
        """
        LOG.warning("Cleaning up housenumber tokens.")
        self._cleanup_housenumbers()
        LOG.warning("Tokenizer house-keeping done.")


    def name_analyzer(self) -> 'ICUNameAnalyzer':
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

            ```
            with tokenizer.name_analyzer() as analyzer:
                analyzer.tokenize()
            ```

            When used outside the with construct, the caller must make sure
            to call the close() function before the analyzer is destroyed.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        assert self.loader is not None
        return ICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                               self.loader.make_token_analysis())


    def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
        """ Return a list of the `num` most frequent full words
            in the database.
        """
        with conn.cursor() as cur:
            cur.execute("""SELECT word, sum((info->>'count')::int) as count
                             FROM word WHERE type = 'W'
                             GROUP BY word
                             ORDER BY count DESC LIMIT %s""", (num,))
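            # The word column may carry an '@<analyzer>' suffix for names processed
            # by a named analyzer; only the part before it is returned.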
            return list(s[0].split('@')[0] for s in cur)


    def _save_config(self) -> None:
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        assert self.loader is not None
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)


    def _setup_db_tables(self, config: Configuration) -> None:
        """ Create an empty word table and the seq_word sequence used for
            new word ids.
        """
        with connect(self.dsn) as conn:
            drop_tables(conn, 'word')
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_string(conn, """
                CREATE TABLE word (
                      word_id INTEGER,
                      word_token text NOT NULL,
                      type text NOT NULL,
                      word text,
                      info jsonb
                    ) {{db.tablespace.search_data}};
                GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";

                DROP SEQUENCE IF EXISTS seq_word;
                CREATE SEQUENCE seq_word start 1;
                GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
            """)
            conn.commit()


    def _create_base_indices(self, config: Configuration, table_name: str) -> None:
        """ Create the basic B-tree indexes for the given word table: one
            on the word token and one per word type.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_string(conn,
                            """CREATE INDEX idx_{{table_name}}_word_token ON {{table_name}}
                               USING BTREE (word_token) {{db.tablespace.search_index}}""",
                            table_name=table_name)
            for name, ctype in WORD_TYPES:
                sqlp.run_string(conn,
                                """CREATE INDEX idx_{{table_name}}_{{idx_name}} ON {{table_name}}
                                   USING BTREE (word) {{db.tablespace.address_index}}
                                   WHERE type = '{{column_type}}'
                                """,
                                table_name=table_name, idx_name=name,
                                column_type=ctype)
            conn.commit()


    def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
        """ Create additional indexes used when running the API.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            # Index required for details lookup.
            sqlp.run_string(conn, """
                CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
                  ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
            """,
            table_name=table_name)
            conn.commit()


    def _move_temporary_word_table(self, old: str) -> None:
        """ Rename all tables and indexes used by the tokenizer.
        """
        with connect(self.dsn) as conn:
            drop_tables(conn, 'word')
            with conn.cursor() as cur:
                cur.execute(f"ALTER TABLE {old} RENAME TO word")
                for idx in ('word_token', 'word_id'):
                    cur.execute(f"""ALTER INDEX idx_{old}_{idx}
                                      RENAME TO idx_word_{idx}""")
                for name, _ in WORD_TYPES:
                    cur.execute(f"""ALTER INDEX idx_{old}_{name}
                                    RENAME TO idx_word_{name}""")
            conn.commit()


class ICUNameAnalyzer(AbstractAnalyzer):
    """ The ICU analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn: str, sanitizer: PlaceSanitizer,
                 token_analysis: ICUTokenAnalysis) -> None:
        self.conn: Optional[Connection] = connect(dsn)
        self.conn.autocommit = True
        self.sanitizer = sanitizer
        self.token_analysis = token_analysis

        self._cache = _TokenCache()


    def close(self) -> None:
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    def _search_normalized(self, name: str) -> str:
        """ Return the search token transliteration of the given name.
        """
        return cast(str, self.token_analysis.search.transliterate(name)).strip()


    def _normalized(self, name: str) -> str:
        """ Return the normalized version of the given name with all
            non-relevant information removed.
        """
        return cast(str, self.token_analysis.normalizer.transliterate(name)).strip()


    def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not necessarily efficient.
        """
        assert self.conn is not None
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self._search_normalized(word[1:])
            else:
                partial_tokens[word] = self._search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]


    def normalize_postcode(self, postcode: str) -> str:
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()


    def update_postcodes_from_db(self) -> None:
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@postcode')

        with self.conn.cursor() as cur:
            # First get all postcode names currently in the word table.
            cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
            word_entries = set((entry[0] for entry in cur))

            # Then compute the required postcode names from the postcode table.
            needed_entries = set()
            cur.execute("SELECT country_code, postcode FROM location_postcode")
            for cc, postcode in cur:
                info = PlaceInfo({'country_code': cc,
                                  'class': 'place', 'type': 'postcode',
                                  'address': {'postcode': postcode}})
                address = self.sanitizer.process_names(info)[1]
                for place in address:
                    if place.kind == 'postcode':
                        if analyzer is None:
                            postcode_name = place.name.strip().upper()
                            variant_base = None
                        else:
                            postcode_name = analyzer.get_canonical_id(place)
                            variant_base = place.get_attr("variant")

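                        # Postcodes with a variant are encoded as '<name>@<variant>'
                        # so the set can be compared directly with the word column.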
                        if variant_base:
                            needed_entries.add(f'{postcode_name}@{variant_base}')
                        else:
                            needed_entries.add(postcode_name)
                        break

        # Now update the word table.
        self._delete_unused_postcode_words(word_entries - needed_entries)
        self._add_missing_postcode_words(needed_entries - word_entries)

    def _delete_unused_postcode_words(self, tokens: Iterable[str]) -> None:
        assert self.conn is not None
        if tokens:
            with self.conn.cursor() as cur:
                cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
                            (list(tokens), ))

    def _add_missing_postcode_words(self, tokens: Iterable[str]) -> None:
        assert self.conn is not None
        if not tokens:
            return

        analyzer = self.token_analysis.analysis.get('@postcode')
        terms = []

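        # Tokens come in the '<name>@<variant>' encoding used above; expand each
        # one into its transliterated search variants before insertion.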
        for postcode_name in tokens:
            if '@' in postcode_name:
                term, variant = postcode_name.split('@', 1)
                term = self._search_normalized(term)
                if analyzer is None:
                    variants = [term]
                else:
                    variants = analyzer.compute_variants(variant)
                    if term not in variants:
                        variants.append(term)
            else:
                variants = [self._search_normalized(postcode_name)]
            terms.append((postcode_name, variants))

        if terms:
            with self.conn.cursor() as cur:
                cur.executemany("""SELECT create_postcode_word(%s, %s)""", terms)


    def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
                               should_replace: bool) -> None:
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases will be
            completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        assert self.conn is not None
        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)


    def _add_special_phrases(self, cursor: Cursor,
                             new_phrases: Set[Tuple[str, str, str, str]],
                             existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with cursor.copy('COPY word(word_token, type, word, info) FROM STDIN') as copy:
            for word, cls, typ, oper in to_add:
                term = self._search_normalized(word)
                if term:
                    copy.write_row((term, 'S', word,
                                    Jsonb({'class': cls, 'type': typ,
                                           'op': oper if oper in ('in', 'near') else None})))
                    added += 1

        return added


    def _remove_special_phrases(self, cursor: Cursor,
                                new_phrases: Set[Tuple[str, str, str, str]],
                                existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.executemany(
                """ DELETE FROM word
                      WHERE type = 'S' and word = %s
                            and info->>'class' = %s and info->>'type' = %s
                            and %s = coalesce(info->>'op', '-')
                """, to_delete)

        return len(to_delete)


    def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
        """ Add default names for the given country to the search index.
        """
        # Make sure any name preprocessing for country names applies.
        info = PlaceInfo({'name': names, 'country_code': country_code,
                          'rank_address': 4, 'class': 'boundary',
                          'type': 'administrative'})
        self._add_country_full_names(country_code,
                                     self.sanitizer.process_names(info)[0],
                                     internal=True)


    def _add_country_full_names(self, country_code: str, names: Sequence[PlaceName],
                                internal: bool = False) -> None:
        """ Add names for the given country from an already sanitized
            name list.
        """
        assert self.conn is not None
        word_tokens = set()
        for name in names:
            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            # Get existing names
            cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
                             FROM word
                             WHERE type = 'C' and word = %s""",
                        (country_code, ))
            # internal/external names
            existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()}
            for word in cur:
                existing_tokens[word[1]].add(word[0])

            # Delete names that no longer exist.
            gone_tokens = existing_tokens[internal] - word_tokens
            if internal:
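                # A name previously added as an external name that is now provided
                # internally is deleted here and re-inserted below with the
                # internal flag set.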
                gone_tokens.update(existing_tokens[False] & word_tokens)
            if gone_tokens:
                cur.execute("""DELETE FROM word
                               USING unnest(%s::text[]) as token
                               WHERE type = 'C' and word = %s
                                     and word_token = token""",
                            (list(gone_tokens), country_code))

            # Only add those names that are not yet in the list.
            new_tokens = word_tokens - existing_tokens[True]
            if not internal:
                new_tokens -= existing_tokens[False]
            if new_tokens:
                if internal:
                    sql = """INSERT INTO word (word_token, type, word, info)
                               (SELECT token, 'C', %s, '{"internal": "yes"}'
                                  FROM unnest(%s::text[]) as token)
                           """
                else:
                    sql = """INSERT INTO word (word_token, type, word)
                                   (SELECT token, 'C', %s
                                    FROM unnest(%s::text[]) as token)
                          """
                cur.execute(sql, (country_code, list(new_tokens)))


    def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
        """ Determine tokenizer information about the given place.

            Returns a JSON-serializable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo()

        names, address = self.sanitizer.process_names(place)

        if names:
            token_info.set_names(*self._compute_name_tokens(names))

            if place.is_country():
                assert place.country_code is not None
                self._add_country_full_names(place.country_code, names)

        if address:
            self._process_place_address(token_info, address)

        return token_info.to_dict()


    def _process_place_address(self, token_info: '_TokenInfo',
                               address: Sequence[PlaceName]) -> None:
        for item in address:
            if item.kind == 'postcode':
                token_info.set_postcode(self._add_postcode(item))
            elif item.kind == 'housenumber':
                token_info.add_housenumber(*self._compute_housenumber_token(item))
            elif item.kind == 'street':
                token_info.add_street(self._retrieve_full_tokens(item.name))
            elif item.kind == 'place':
                if not item.suffix:
                    token_info.add_place(itertools.chain(*self._compute_name_tokens([item])))
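            # Everything else becomes a generic address term, except internal
            # items (kind starting with '_'), suffixed items and the kinds
            # 'country', 'full' and 'inclusion'.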
            elif not item.kind.startswith('_') and not item.suffix and \
                 item.kind not in ('country', 'full', 'inclusion'):
                token_info.add_address_term(item.kind,
                                            itertools.chain(*self._compute_name_tokens([item])))


    def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
        """ Normalize the housenumber and return the word token and the
            canonical form.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@housenumber')
        result: Tuple[Optional[int], Optional[str]] = (None, None)

        if analyzer is None:
            # When no custom analyzer is set, simply normalize and transliterate.
            norm_name = self._search_normalized(hnr.name)
            if norm_name:
                result = self._cache.housenumbers.get(norm_name, result)
                if result[0] is None:
                    hid = execute_scalar(self.conn, "SELECT getorcreate_hnr_id(%s)", (norm_name, ))

                    result = hid, norm_name
                    self._cache.housenumbers[norm_name] = result
        else:
            # Otherwise use the analyzer to determine the canonical name.
            # Per convention we use the first variant as the 'lookup name', the
            # name that gets saved in the housenumber field of the place.
            word_id = analyzer.get_canonical_id(hnr)
            if word_id:
                result = self._cache.housenumbers.get(word_id, result)
                if result[0] is None:
                    variants = analyzer.compute_variants(word_id)
                    if variants:
                        hid = execute_scalar(self.conn, "SELECT create_analyzed_hnr_id(%s, %s)",
                                             (word_id, list(variants)))
                        result = hid, variants[0]
                        self._cache.housenumbers[word_id] = result

        return result


    def _retrieve_full_tokens(self, name: str) -> List[int]:
        """ Get the full name token for the given name, if it exists.
            The name is only retrieved for the standard analyser.
        """
        assert self.conn is not None
        norm_name = self._search_normalized(name)

        # return cached if possible
        if norm_name in self._cache.fulls:
            return self._cache.fulls[norm_name]

        with self.conn.cursor() as cur:
            cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
                        (norm_name, ))
            full = [row[0] for row in cur]

        self._cache.fulls[norm_name] = full

        return full


    def _compute_name_tokens(self, names: Sequence[PlaceName]) -> Tuple[Set[int], Set[int]]:
        """ Compute the full name and partial name tokens for the given
            list of names.
        """
        assert self.conn is not None
        full_tokens: Set[int] = set()
        partial_tokens: Set[int] = set()

        for name in names:
            analyzer_id = name.get_attr('analyzer')
            analyzer = self.token_analysis.get_analyzer(analyzer_id)
            word_id = analyzer.get_canonical_id(name)
            if analyzer_id is None:
                token_id = word_id
            else:
                token_id = f'{word_id}@{analyzer_id}'

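            # Cache lookups are keyed by the canonical id, suffixed with the
            # analyzer name so that identical spellings from different analyzers
            # do not clash.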
            full, part = self._cache.names.get(token_id, (None, None))
            if full is None:
                variants = analyzer.compute_variants(word_id)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
                                (token_id, variants))
                    full, part = cast(Tuple[int, List[int]], cur.fetchone())

                self._cache.names[token_id] = (full, part)

            assert part is not None

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens


    def _add_postcode(self, item: PlaceName) -> Optional[str]:
        """ Make sure the normalized postcode is present in the word table.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@postcode')

        if analyzer is None:
            postcode_name = item.name.strip().upper()
            variant_base = None
        else:
            postcode_name = analyzer.get_canonical_id(item)
            variant_base = item.get_attr("variant")

        if variant_base:
            postcode = f'{postcode_name}@{variant_base}'
        else:
            postcode = postcode_name

        if postcode not in self._cache.postcodes:
            term = self._search_normalized(postcode_name)
            if not term:
                return None

            variants = {term}
            if analyzer is not None and variant_base:
                variants.update(analyzer.compute_variants(variant_base))

            with self.conn.cursor() as cur:
                cur.execute("SELECT create_postcode_word(%s, %s)",
                            (postcode, list(variants)))
            self._cache.postcodes.add(postcode)

        return postcode_name


class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self) -> None:
        self.names: Optional[str] = None
        self.housenumbers: Set[str] = set()
        self.housenumber_tokens: Set[int] = set()
        self.street_tokens: Optional[Set[int]] = None
        self.place_tokens: Set[int] = set()
        self.address_tokens: Dict[str, str] = {}
        self.postcode: Optional[str] = None


    def _mk_array(self, tokens: Iterable[Any]) -> str:
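        # Format the tokens as a PostgreSQL array literal, e.g. '{1,2,3}'.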
        return f"{{{','.join((str(s) for s in tokens))}}}"


    def to_dict(self) -> Dict[str, Any]:
        """ Return the token information in database importable format.
        """
        out: Dict[str, Any] = {}

        if self.names:
            out['names'] = self.names

        if self.housenumbers:
            out['hnr'] = ';'.join(self.housenumbers)
            out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)

        if self.street_tokens is not None:
            out['street'] = self._mk_array(self.street_tokens)

        if self.place_tokens:
            out['place'] = self._mk_array(self.place_tokens)

        if self.address_tokens:
            out['addr'] = self.address_tokens

        if self.postcode:
            out['postcode'] = self.postcode

        return out


    def set_names(self, fulls: Iterable[int], partials: Iterable[int]) -> None:
        """ Adds token information for the normalised names.
        """
        self.names = self._mk_array(itertools.chain(fulls, partials))


    def add_housenumber(self, token: Optional[int], hnr: Optional[str]) -> None:
        """ Add the token and canonical form of a normalised housenumber.
        """
        if token:
            assert hnr is not None
            self.housenumbers.add(hnr)
            self.housenumber_tokens.add(token)


    def add_street(self, tokens: Iterable[int]) -> None:
        """ Add addr:street match terms.
        """
        if self.street_tokens is None:
            self.street_tokens = set()
        self.street_tokens.update(tokens)


    def add_place(self, tokens: Iterable[int]) -> None:
        """ Add addr:place search and match terms.
        """
        self.place_tokens.update(tokens)


    def add_address_term(self, key: str, partials: Iterable[int]) -> None:
        """ Add additional address terms.
        """
        array = self._mk_array(partials)
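        # Only store non-empty arrays; an empty array renders as '{}' (length 2).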
        if len(array) > 2:
            self.address_tokens[key] = array

    def set_postcode(self, postcode: Optional[str]) -> None:
        """ Set the postcode to the given one.
        """
        self.postcode = postcode


class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self) -> None:
        self.names: Dict[str, Tuple[int, List[int]]] = {}
        self.partials: Dict[str, int] = {}
        self.fulls: Dict[str, List[int]] = {}
        self.postcodes: Set[str] = set()
        self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {}