1 # SPDX-License-Identifier: GPL-3.0-or-later
2 #
3 # This file is part of Nominatim. (https://nominatim.org)
4 #
5 # Copyright (C) 2024 by the Nominatim developer community.
6 # For a full list of authors see the git log.
7 """
8 Tokenizer implementing normalisation as used before Nominatim 4 but using
9 libICU instead of the PostgreSQL module.
10 """
11 from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \
12                    Dict, Set, Iterable
13 import itertools
14 import json
15 import logging
16 from pathlib import Path
17 from textwrap import dedent
18
19 from ..db.connection import connect, Connection, Cursor, server_version_tuple,\
20                             drop_tables, table_exists, execute_scalar
21 from ..config import Configuration
22 from ..db.utils import CopyBuffer
23 from ..db.sql_preprocessor import SQLPreprocessor
24 from ..data.place_info import PlaceInfo
25 from ..data.place_name import PlaceName
26 from .icu_rule_loader import ICURuleLoader
27 from .place_sanitizer import PlaceSanitizer
28 from .icu_token_analysis import ICUTokenAnalysis
29 from .base import AbstractAnalyzer, AbstractTokenizer
30
31 DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
32
33 LOG = logging.getLogger()
34
35 WORD_TYPES = (('country_names', 'C'),
36                ('postcodes', 'P'),
37                ('full_word', 'W'),
38                ('housenumbers', 'H'))
39
40 def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
41     """ Create a new instance of the tokenizer provided by this module.
42     """
43     return ICUTokenizer(dsn, data_dir)
44
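# Illustrative bootstrap sketch (not part of the original module). `my_dsn`,
# `project_dir` and `config` are placeholders for values normally supplied by
# the Nominatim configuration machinery.
#
#     tokenizer = create(my_dsn, project_dir / 'tokenizer')
#     tokenizer.init_new_db(config)          # fresh import
#     # ... or, against an already imported database:
#     tokenizer.init_from_project(config)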
45
46 class ICUTokenizer(AbstractTokenizer):
47     """ This tokenizer uses libICU to convert names and queries to ASCII.
48         Otherwise it uses the same algorithms and data structures as the
49         normalization routines in Nominatim 3.
50     """
51
52     def __init__(self, dsn: str, data_dir: Path) -> None:
53         self.dsn = dsn
54         self.data_dir = data_dir
55         self.loader: Optional[ICURuleLoader] = None
56
57
58     def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
59         """ Set up a new tokenizer for the database.
60
61             This copies all necessary data into the project directory to make
62             sure the tokenizer remains stable even across updates.
63         """
64         self.loader = ICURuleLoader(config)
65
66         self._install_php(config.lib_dir.php, overwrite=True)
67         self._save_config()
68
69         if init_db:
70             self.update_sql_functions(config)
71             self._setup_db_tables(config)
72             self._create_base_indices(config, 'word')
73
74
75     def init_from_project(self, config: Configuration) -> None:
76         """ Initialise the tokenizer from the project directory.
77         """
78         self.loader = ICURuleLoader(config)
79
80         with connect(self.dsn) as conn:
81             self.loader.load_config_from_db(conn)
82
83         self._install_php(config.lib_dir.php, overwrite=False)
84
85
86     def finalize_import(self, config: Configuration) -> None:
87         """ Do any required postprocessing to make the tokenizer data ready
88             for use.
89         """
90         self._create_lookup_indices(config, 'word')
91
92
93     def update_sql_functions(self, config: Configuration) -> None:
94         """ Reimport the SQL functions for this tokenizer.
95         """
96         with connect(self.dsn) as conn:
97             sqlp = SQLPreprocessor(conn, config)
98             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
99
100
101     def check_database(self, config: Configuration) -> None:
102         """ Check that the tokenizer is set up correctly.
103         """
104         # Will raise an error if there is an issue.
105         self.init_from_project(config)
106
107
108     def update_statistics(self, config: Configuration, threads: int = 2) -> None:
109         """ Recompute frequencies for all name words.
110         """
111         with connect(self.dsn) as conn:
112             if not table_exists(conn, 'search_name'):
113                 return
114
115             with conn.cursor() as cur:
116                 cur.execute('ANALYSE search_name')
117                 if threads > 1:
118                     cur.execute('SET max_parallel_workers_per_gather TO %s',
119                                 (min(threads, 6),))
120
121                 if server_version_tuple(conn) < (12, 0):
122                     LOG.info('Computing word frequencies')
123                     drop_tables(conn, 'word_frequencies', 'addressword_frequencies')
124                     cur.execute("""CREATE TEMP TABLE word_frequencies AS
125                                      SELECT unnest(name_vector) as id, count(*)
126                                      FROM search_name GROUP BY id""")
127                     cur.execute('CREATE INDEX ON word_frequencies(id)')
128                     cur.execute("""CREATE TEMP TABLE addressword_frequencies AS
129                                      SELECT unnest(nameaddress_vector) as id, count(*)
130                                      FROM search_name GROUP BY id""")
131                     cur.execute('CREATE INDEX ON addressword_frequencies(id)')
132                     cur.execute("""CREATE OR REPLACE FUNCTION word_freq_update(wid INTEGER,
133                                                                                INOUT info JSONB)
134                                    AS $$
135                                    DECLARE rec RECORD;
136                                    BEGIN
137                                    IF info is null THEN
138                                      info = '{}'::jsonb;
139                                    END IF;
140                                    FOR rec IN SELECT count FROM word_frequencies WHERE id = wid
141                                    LOOP
142                                      info = info || jsonb_build_object('count', rec.count);
143                                    END LOOP;
144                                    FOR rec IN SELECT count FROM addressword_frequencies WHERE id = wid
145                                    LOOP
146                                      info = info || jsonb_build_object('addr_count', rec.count);
147                                    END LOOP;
148                                    IF info = '{}'::jsonb THEN
149                                      info = null;
150                                    END IF;
151                                    END;
152                                    $$ LANGUAGE plpgsql IMMUTABLE;
153                                 """)
154                     LOG.info('Update word table with recomputed frequencies')
155                     drop_tables(conn, 'tmp_word')
156                     cur.execute("""CREATE TABLE tmp_word AS
157                                     SELECT word_id, word_token, type, word,
158                                            word_freq_update(word_id, info) as info
159                                     FROM word
160                                 """)
161                     drop_tables(conn, 'word_frequencies', 'addressword_frequencies')
162                 else:
163                     LOG.info('Computing word frequencies')
164                     drop_tables(conn, 'word_frequencies')
165                     cur.execute("""
166                       CREATE TEMP TABLE word_frequencies AS
167                       WITH word_freq AS MATERIALIZED (
168                                SELECT unnest(name_vector) as id, count(*)
169                                      FROM search_name GROUP BY id),
170                            addr_freq AS MATERIALIZED (
171                                SELECT unnest(nameaddress_vector) as id, count(*)
172                                      FROM search_name GROUP BY id)
173                       SELECT coalesce(a.id, w.id) as id,
174                              (CASE WHEN w.count is null THEN '{}'::JSONB
175                                   ELSE jsonb_build_object('count', w.count) END
176                               ||
177                               CASE WHEN a.count is null THEN '{}'::JSONB
178                                   ELSE jsonb_build_object('addr_count', a.count) END) as info
179                       FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id;
180                       """)
181                     cur.execute('CREATE UNIQUE INDEX ON word_frequencies(id) INCLUDE(info)')
182                     cur.execute('ANALYSE word_frequencies')
183                     LOG.info('Update word table with recomputed frequencies')
184                     drop_tables(conn, 'tmp_word')
185                     cur.execute("""CREATE TABLE tmp_word AS
186                                     SELECT word_id, word_token, type, word,
187                                            (CASE WHEN wf.info is null THEN word.info
188                                             ELSE coalesce(word.info, '{}'::jsonb) || wf.info
189                                             END) as info
190                                     FROM word LEFT JOIN word_frequencies wf
191                                          ON word.word_id = wf.id
192                                 """)
193                     drop_tables(conn, 'word_frequencies')
194
195             with conn.cursor() as cur:
196                 cur.execute('SET max_parallel_workers_per_gather TO 0')
197
198             sqlp = SQLPreprocessor(conn, config)
199             sqlp.run_string(conn,
200                             'GRANT SELECT ON tmp_word TO "{{config.DATABASE_WEBUSER}}"')
201             conn.commit()
202         self._create_base_indices(config, 'tmp_word')
203         self._create_lookup_indices(config, 'tmp_word')
204         self._move_temporary_word_table('tmp_word')
205
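    # After update_statistics() the `info` column of the affected `word` rows
    # carries the recomputed frequencies, e.g. (illustrative values only):
    #
    #     {"count": 1523, "addr_count": 87}
    #
    # `count` stems from name_vector occurrences and `addr_count` from
    # nameaddress_vector occurrences in search_name.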
206
207
208     def _cleanup_housenumbers(self) -> None:
209         """ Remove unused house numbers.
210         """
211         with connect(self.dsn) as conn:
212             if not table_exists(conn, 'search_name'):
213                 return
214             with conn.cursor(name="hnr_counter") as cur:
215                 cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
216                                FROM word
217                                WHERE type = 'H'
218                                  AND NOT EXISTS(SELECT * FROM search_name
219                                                 WHERE ARRAY[word.word_id] && name_vector)
220                                  AND (char_length(coalesce(word, word_token)) > 6
221                                       OR coalesce(word, word_token) not similar to '\\d+')
222                             """)
223                 candidates = {token: wid for wid, token in cur}
224             with conn.cursor(name="hnr_counter") as cur:
225                 cur.execute("""SELECT housenumber FROM placex
226                                WHERE housenumber is not null
227                                      AND (char_length(housenumber) > 6
228                                           OR housenumber not similar to '\\d+')
229                             """)
230                 for row in cur:
231                     for hnr in row[0].split(';'):
232                         candidates.pop(hnr, None)
233             LOG.info("There are %s outdated housenumbers.", len(candidates))
234             LOG.debug("Outdated housenumbers: %s", candidates.keys())
235             if candidates:
236                 with conn.cursor() as cur:
237                     cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
238                                 (list(candidates.values()), ))
239                 conn.commit()
240
241
242
243     def update_word_tokens(self) -> None:
244         """ Remove unused tokens.
245         """
246         LOG.warning("Cleaning up housenumber tokens.")
247         self._cleanup_housenumbers()
248         LOG.warning("Tokenizer house-keeping done.")
249
250
251     def name_analyzer(self) -> 'ICUNameAnalyzer':
252         """ Create a new analyzer for tokenizing names and queries
253             using this tokenizer. Analyzers are context managers and should
254             be used accordingly:
255
256             ```
257             with tokenizer.name_analyzer() as analyzer:
258                 analyzer.tokenize()
259             ```
260
261             When used outside a with construct, the caller must make sure to
262             call the close() function before the analyzer is destroyed.
263
264             Analyzers are not thread-safe. You need to instantiate one per thread.
265         """
266         assert self.loader is not None
267         return ICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
268                                self.loader.make_token_analysis())
269
270
271     def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
272         """ Return a list of the `num` most frequent full words
273             in the database.
274         """
275         with conn.cursor() as cur:
276             cur.execute("""SELECT word, sum((info->>'count')::int) as count
277                              FROM word WHERE type = 'W'
278                              GROUP BY word
279                              ORDER BY count DESC LIMIT %s""", (num,))
280             return list(s[0].split('@')[0] for s in cur)
281
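    # Note (added for clarity): full words created through a named analysis
    # module are stored as '<name>@<analyzer id>' (see _compute_name_tokens()
    # in ICUNameAnalyzer below), hence the split('@')[0] above, which returns
    # only the plain name part.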
282
283     def _install_php(self, phpdir: Optional[Path], overwrite: bool = True) -> None:
284         """ Install the php script for the tokenizer.
285         """
286         if phpdir is not None:
287             assert self.loader is not None
288             php_file = self.data_dir / "tokenizer.php"
289
290             if not php_file.exists() or overwrite:
291                 php_file.write_text(dedent(f"""\
292                     <?php
293                     @define('CONST_Max_Word_Frequency', 10000000);
294                     @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
295                     @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
296                     require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')
297
298
299     def _save_config(self) -> None:
300         """ Save the configuration that needs to remain stable for the given
301             database as database properties.
302         """
303         assert self.loader is not None
304         with connect(self.dsn) as conn:
305             self.loader.save_config_to_db(conn)
306
307
308     def _setup_db_tables(self, config: Configuration) -> None:
309         """ Create an empty word table and the word id sequence and grant
310             the web user read access to them.
311         """
312         with connect(self.dsn) as conn:
313             drop_tables(conn, 'word')
314             sqlp = SQLPreprocessor(conn, config)
315             sqlp.run_string(conn, """
316                 CREATE TABLE word (
317                       word_id INTEGER,
318                       word_token text NOT NULL,
319                       type text NOT NULL,
320                       word text,
321                       info jsonb
322                     ) {{db.tablespace.search_data}};
323                 GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
324
325                 DROP SEQUENCE IF EXISTS seq_word;
326                 CREATE SEQUENCE seq_word start 1;
327                 GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
328             """)
329             conn.commit()
330
331
332     def _create_base_indices(self, config: Configuration, table_name: str) -> None:
333         """ Create the word token index and the partial per-type indexes
334             on the given word table.
335         """
336         with connect(self.dsn) as conn:
337             sqlp = SQLPreprocessor(conn, config)
338             sqlp.run_string(conn,
339                             """CREATE INDEX idx_{{table_name}}_word_token ON {{table_name}}
340                                USING BTREE (word_token) {{db.tablespace.search_index}}""",
341                             table_name=table_name)
342             for name, ctype in WORD_TYPES:
343                 sqlp.run_string(conn,
344                                 """CREATE INDEX idx_{{table_name}}_{{idx_name}} ON {{table_name}}
345                                    USING BTREE (word) {{db.tablespace.address_index}}
346                                    WHERE type = '{{column_type}}'
347                                 """,
348                                 table_name=table_name, idx_name=name,
349                                 column_type=ctype)
350             conn.commit()
351
352
353     def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
354         """ Create additional indexes used when running the API.
355         """
356         with connect(self.dsn) as conn:
357             sqlp = SQLPreprocessor(conn, config)
358             # Index required for details lookup.
359             sqlp.run_string(conn, """
360                 CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
361                   ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
362             """,
363             table_name=table_name)
364             conn.commit()
365
366
367     def _move_temporary_word_table(self, old: str) -> None:
368         """ Rename all tables and indexes used by the tokenizer.
369         """
370         with connect(self.dsn) as conn:
371             drop_tables(conn, 'word')
372             with conn.cursor() as cur:
373                 cur.execute(f"ALTER TABLE {old} RENAME TO word")
374                 for idx in ('word_token', 'word_id'):
375                     cur.execute(f"""ALTER INDEX idx_{old}_{idx}
376                                       RENAME TO idx_word_{idx}""")
377                 for name, _ in WORD_TYPES:
378                     cur.execute(f"""ALTER INDEX idx_{old}_{name}
379                                     RENAME TO idx_word_{name}""")
380             conn.commit()
381
382
383
384
385 class ICUNameAnalyzer(AbstractAnalyzer):
386     """ The ICU analyzer uses the ICU library for normalising and transliterating names.
387
388         Each instance opens its own connection to the database for creating
389         and looking up tokens.
390     """
391
392     def __init__(self, dsn: str, sanitizer: PlaceSanitizer,
393                  token_analysis: ICUTokenAnalysis) -> None:
394         self.conn: Optional[Connection] = connect(dsn).connection
395         self.conn.autocommit = True
396         self.sanitizer = sanitizer
397         self.token_analysis = token_analysis
398
399         self._cache = _TokenCache()
400
401
402     def close(self) -> None:
403         """ Free all resources used by the analyzer.
404         """
405         if self.conn:
406             self.conn.close()
407             self.conn = None
408
409
410     def _search_normalized(self, name: str) -> str:
411         """ Return the search token transliteration of the given name.
412         """
413         return cast(str, self.token_analysis.search.transliterate(name)).strip()
414
415
416     def _normalized(self, name: str) -> str:
417         """ Return the normalized version of the given name with all
418             non-relevant information removed.
419         """
420         return cast(str, self.token_analysis.normalizer.transliterate(name)).strip()
421
422
423     def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
424         """ Return token information for the given list of words.
425             If a word starts with # it is assumed to be a full name
426             otherwise it is taken to be a partial name.
427
428             The function returns a list of tuples with
429             (original word, word token, word id).
430
431             The function is used for testing and debugging only
432             and is not necessarily efficient.
433         """
434         assert self.conn is not None
435         full_tokens = {}
436         partial_tokens = {}
437         for word in words:
438             if word.startswith('#'):
439                 full_tokens[word] = self._search_normalized(word[1:])
440             else:
441                 partial_tokens[word] = self._search_normalized(word)
442
443         with self.conn.cursor() as cur:
444             cur.execute("""SELECT word_token, word_id
445                             FROM word WHERE word_token = ANY(%s) and type = 'W'
446                         """, (list(full_tokens.values()),))
447             full_ids = {r[0]: r[1] for r in cur}
448             cur.execute("""SELECT word_token, word_id
449                             FROM word WHERE word_token = ANY(%s) and type = 'w'""",
450                         (list(partial_tokens.values()),))
451             part_ids = {r[0]: r[1] for r in cur}
452
453         return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
454                + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
455
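    # Illustrative usage sketch (not part of the original module); the words,
    # tokens and ids below are made up:
    #
    #     analyzer.get_word_token_info(['#Main Street', 'main'])
    #     # -> [('#Main Street', 'main street', 123), ('main', 'main', 456)]
    #
    # '#'-prefixed entries are looked up as full names (type 'W'), all other
    # entries as partial names (type 'w').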
456
457     def normalize_postcode(self, postcode: str) -> str:
458         """ Convert the postcode to a standardized form.
459
460             This function must yield exactly the same result as the SQL function
461             'token_normalized_postcode()'.
462         """
463         return postcode.strip().upper()
464
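    # Example (added for illustration): ' ec1a 1bb ' -> 'EC1A 1BB'. Only
    # surrounding whitespace is stripped and letters are upper-cased; any other
    # formatting is kept as is.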
465
466     def update_postcodes_from_db(self) -> None:
467         """ Update postcode tokens in the word table from the location_postcode
468             table.
469         """
470         assert self.conn is not None
471         analyzer = self.token_analysis.analysis.get('@postcode')
472
473         with self.conn.cursor() as cur:
474             # First get all postcode names currently in the word table.
475             cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
476             word_entries = set((entry[0] for entry in cur))
477
478             # Then compute the required postcode names from the postcode table.
479             needed_entries = set()
480             cur.execute("SELECT country_code, postcode FROM location_postcode")
481             for cc, postcode in cur:
482                 info = PlaceInfo({'country_code': cc,
483                                   'class': 'place', 'type': 'postcode',
484                                   'address': {'postcode': postcode}})
485                 address = self.sanitizer.process_names(info)[1]
486                 for place in address:
487                     if place.kind == 'postcode':
488                         if analyzer is None:
489                             postcode_name = place.name.strip().upper()
490                             variant_base = None
491                         else:
492                             postcode_name = analyzer.get_canonical_id(place)
493                             variant_base = place.get_attr("variant")
494
495                         if variant_base:
496                             needed_entries.add(f'{postcode_name}@{variant_base}')
497                         else:
498                             needed_entries.add(postcode_name)
499                         break
500
501         # Now update the word table.
502         self._delete_unused_postcode_words(word_entries - needed_entries)
503         self._add_missing_postcode_words(needed_entries - word_entries)
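
    # Postcode entries are keyed either by the plain canonical form or, when the
    # '@postcode' analysis supplies a variant base, as '<canonical>@<variant base>'
    # (e.g. 'EC1A 1BB@ec1a 1bb'; the example value is made up).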
504
505     def _delete_unused_postcode_words(self, tokens: Iterable[str]) -> None:
506         assert self.conn is not None
507         if tokens:
508             with self.conn.cursor() as cur:
509                 cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
510                             (list(tokens), ))
511
512     def _add_missing_postcode_words(self, tokens: Iterable[str]) -> None:
513         assert self.conn is not None
514         if not tokens:
515             return
516
517         analyzer = self.token_analysis.analysis.get('@postcode')
518         terms = []
519
520         for postcode_name in tokens:
521             if '@' in postcode_name:
522                 term, variant = postcode_name.split('@', 2)
523                 term = self._search_normalized(term)
524                 if analyzer is None:
525                     variants = [term]
526                 else:
527                     variants = analyzer.compute_variants(variant)
528                     if term not in variants:
529                         variants.append(term)
530             else:
531                 variants = [self._search_normalized(postcode_name)]
532             terms.append((postcode_name, variants))
533
534         if terms:
535             with self.conn.cursor() as cur:
536                 cur.execute_values("""SELECT create_postcode_word(pc, var)
537                                       FROM (VALUES %s) AS v(pc, var)""",
538                                    terms)
539
540
541
542
543     def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
544                                should_replace: bool) -> None:
545         """ Replace the search index for special phrases with the new phrases.
546             If `should_replace` is True, then the previous set of phrases will be
547             completely replaced. Otherwise the phrases are added to the
548             already existing ones.
549         """
550         assert self.conn is not None
551         norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
552                             for p in phrases))
553
554         with self.conn.cursor() as cur:
555             # Get the old phrases.
556             existing_phrases = set()
557             cur.execute("SELECT word, info FROM word WHERE type = 'S'")
558             for word, info in cur:
559                 existing_phrases.add((word, info['class'], info['type'],
560                                       info.get('op') or '-'))
561
562             added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
563             if should_replace:
564                 deleted = self._remove_special_phrases(cur, norm_phrases,
565                                                        existing_phrases)
566             else:
567                 deleted = 0
568
569         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
570                  len(norm_phrases), added, deleted)
571
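    # Shape of an input phrase tuple (example values are illustrative):
    #
    #     ('bar', 'amenity', 'bar', '-')    # word, class, type, operator
    #
    # '-' stands for "no operator"; only 'in' and 'near' are kept in the word
    # table (see _add_special_phrases() below).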
572
573     def _add_special_phrases(self, cursor: Cursor,
574                              new_phrases: Set[Tuple[str, str, str, str]],
575                              existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
576         """ Add all phrases to the database that are not yet there.
577         """
578         to_add = new_phrases - existing_phrases
579
580         added = 0
581         with CopyBuffer() as copystr:
582             for word, cls, typ, oper in to_add:
583                 term = self._search_normalized(word)
584                 if term:
585                     copystr.add(term, 'S', word,
586                                 json.dumps({'class': cls, 'type': typ,
587                                             'op': oper if oper in ('in', 'near') else None}))
588                     added += 1
589
590             copystr.copy_out(cursor, 'word',
591                              columns=['word_token', 'type', 'word', 'info'])
592
593         return added
594
595
596     def _remove_special_phrases(self, cursor: Cursor,
597                                  new_phrases: Set[Tuple[str, str, str, str]],
598                                  existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
599         """ Remove all phrases from the database that are no longer in the
600             new phrase list.
601         """
602         to_delete = existing_phrases - new_phrases
603
604         if to_delete:
605             cursor.execute_values(
606                 """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
607                     WHERE type = 'S' and word = name
608                           and info->>'class' = in_class and info->>'type' = in_type
609                           and ((op = '-' and info->>'op' is null) or op = info->>'op')
610                 """, to_delete)
611
612         return len(to_delete)
613
614
615     def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
616         """ Add default names for the given country to the search index.
617         """
618         # Make sure any name preprocessing for country names applies.
619         info = PlaceInfo({'name': names, 'country_code': country_code,
620                           'rank_address': 4, 'class': 'boundary',
621                           'type': 'administrative'})
622         self._add_country_full_names(country_code,
623                                      self.sanitizer.process_names(info)[0],
624                                      internal=True)
625
626
627     def _add_country_full_names(self, country_code: str, names: Sequence[PlaceName],
628                                 internal: bool = False) -> None:
629         """ Add names for the given country from an already sanitized
630             name list.
631         """
632         assert self.conn is not None
633         word_tokens = set()
634         for name in names:
635             norm_name = self._search_normalized(name.name)
636             if norm_name:
637                 word_tokens.add(norm_name)
638
639         with self.conn.cursor() as cur:
640             # Get existing names
641             cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
642                              FROM word
643                              WHERE type = 'C' and word = %s""",
644                         (country_code, ))
645             # internal/external names
646             existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()}
647             for word in cur:
648                 existing_tokens[word[1]].add(word[0])
649
650             # Delete names that no longer exist.
651             gone_tokens = existing_tokens[internal] - word_tokens
652             if internal:
653                 gone_tokens.update(existing_tokens[False] & word_tokens)
654             if gone_tokens:
655                 cur.execute("""DELETE FROM word
656                                USING unnest(%s) as token
657                                WHERE type = 'C' and word = %s
658                                      and word_token = token""",
659                             (list(gone_tokens), country_code))
660
661             # Only add those names that are not yet in the list.
662             new_tokens = word_tokens - existing_tokens[True]
663             if not internal:
664                 new_tokens -= existing_tokens[False]
665             if new_tokens:
666                 if internal:
667                     sql = """INSERT INTO word (word_token, type, word, info)
668                                (SELECT token, 'C', %s, '{"internal": "yes"}'
669                                   FROM unnest(%s) as token)
670                            """
671                 else:
672                     sql = """INSERT INTO word (word_token, type, word)
673                                    (SELECT token, 'C', %s
674                                     FROM unnest(%s) as token)
675                           """
676                 cur.execute(sql, (country_code, list(new_tokens)))
677
678
679     def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
680         """ Determine tokenizer information about the given place.
681
682             Returns a JSON-serializable structure that will be handed into
683             the database via the token_info field.
684         """
685         token_info = _TokenInfo()
686
687         names, address = self.sanitizer.process_names(place)
688
689         if names:
690             token_info.set_names(*self._compute_name_tokens(names))
691
692             if place.is_country():
693                 assert place.country_code is not None
694                 self._add_country_full_names(place.country_code, names)
695
696         if address:
697             self._process_place_address(token_info, address)
698
699         return token_info.to_dict()
700
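    # Illustrative sketch (not part of the original module): running a place
    # through the analyzer.  The attribute layout follows PlaceInfo usage
    # elsewhere in this file; the concrete values are made up.
    #
    #     place = PlaceInfo({'name': {'name': 'Main Street'},
    #                        'country_code': 'gb',
    #                        'address': {'housenumber': '3'}})
    #     token_info = analyzer.process_place(place)
    #     # -> JSON-serializable dict, e.g. with keys 'names', 'hnr',
    #     #    'hnr_tokens' (see _TokenInfo.to_dict() below).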
701
702     def _process_place_address(self, token_info: '_TokenInfo',
703                                address: Sequence[PlaceName]) -> None:
704         for item in address:
705             if item.kind == 'postcode':
706                 token_info.set_postcode(self._add_postcode(item))
707             elif item.kind == 'housenumber':
708                 token_info.add_housenumber(*self._compute_housenumber_token(item))
709             elif item.kind == 'street':
710                 token_info.add_street(self._retrieve_full_tokens(item.name))
711             elif item.kind == 'place':
712                 if not item.suffix:
713                     token_info.add_place(itertools.chain(*self._compute_name_tokens([item])))
714             elif not item.kind.startswith('_') and not item.suffix and \
715                  item.kind not in ('country', 'full', 'inclusion'):
716                 token_info.add_address_term(item.kind,
717                                             itertools.chain(*self._compute_name_tokens([item])))
718
719
720     def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
721         """ Normalize the housenumber and return the word token and the
722             canonical form.
723         """
724         assert self.conn is not None
725         analyzer = self.token_analysis.analysis.get('@housenumber')
726         result: Tuple[Optional[int], Optional[str]] = (None, None)
727
728         if analyzer is None:
729             # When no custom analyzer is set, simply normalize and transliterate
730             norm_name = self._search_normalized(hnr.name)
731             if norm_name:
732                 result = self._cache.housenumbers.get(norm_name, result)
733                 if result[0] is None:
734                     hid = execute_scalar(self.conn, "SELECT getorcreate_hnr_id(%s)", (norm_name, ))
735
736                     result = hid, norm_name
737                     self._cache.housenumbers[norm_name] = result
738         else:
739             # Otherwise use the analyzer to determine the canonical name.
740             # Per convention we use the first variant as the 'lookup name', the
741             # name that gets saved in the housenumber field of the place.
742             word_id = analyzer.get_canonical_id(hnr)
743             if word_id:
744                 result = self._cache.housenumbers.get(word_id, result)
745                 if result[0] is None:
746                     variants = analyzer.compute_variants(word_id)
747                     if variants:
748                         hid = execute_scalar(self.conn, "SELECT create_analyzed_hnr_id(%s, %s)",
749                                              (word_id, list(variants)))
750                         result = hid, variants[0]
751                         self._cache.housenumbers[word_id] = result
752
753         return result
754
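    # Result shape (illustrative): with the default analysis a housenumber like
    # '12a' yields (<id from getorcreate_hnr_id()>, '12a'); with a custom
    # '@housenumber' analysis the id comes from create_analyzed_hnr_id() and the
    # second element is the first computed variant.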
755
756     def _retrieve_full_tokens(self, name: str) -> List[int]:
757         """ Get the full name token for the given name, if it exists.
758             The name is only retrieved for the standard analyser.
759         """
760         assert self.conn is not None
761         norm_name = self._search_normalized(name)
762
763         # return cached if possible
764         if norm_name in self._cache.fulls:
765             return self._cache.fulls[norm_name]
766
767         with self.conn.cursor() as cur:
768             cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
769                         (norm_name, ))
770             full = [row[0] for row in cur]
771
772         self._cache.fulls[norm_name] = full
773
774         return full
775
776
777     def _compute_name_tokens(self, names: Sequence[PlaceName]) -> Tuple[Set[int], Set[int]]:
778         """ Compute the full name and partial name tokens for the given
779             list of names.
780         """
781         assert self.conn is not None
782         full_tokens: Set[int] = set()
783         partial_tokens: Set[int] = set()
784
785         for name in names:
786             analyzer_id = name.get_attr('analyzer')
787             analyzer = self.token_analysis.get_analyzer(analyzer_id)
788             word_id = analyzer.get_canonical_id(name)
789             if analyzer_id is None:
790                 token_id = word_id
791             else:
792                 token_id = f'{word_id}@{analyzer_id}'
793
794             full, part = self._cache.names.get(token_id, (None, None))
795             if full is None:
796                 variants = analyzer.compute_variants(word_id)
797                 if not variants:
798                     continue
799
800                 with self.conn.cursor() as cur:
801                     cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
802                                 (token_id, variants))
803                     full, part = cast(Tuple[int, List[int]], cur.fetchone())
804
805                 self._cache.names[token_id] = (full, part)
806
807             assert part is not None
808
809             full_tokens.add(full)
810             partial_tokens.update(part)
811
812         return full_tokens, partial_tokens
813
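    # Cache keys (added for clarity): names handled by the default analysis are
    # cached under their canonical id, names from a named analysis module under
    # '<canonical id>@<analyzer id>'.  The cached value is the pair returned by
    # getorcreate_full_word(), i.e. (full token id, list of partial token ids).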
814
815     def _add_postcode(self, item: PlaceName) -> Optional[str]:
816         """ Make sure the normalized postcode is present in the word table.
817         """
818         assert self.conn is not None
819         analyzer = self.token_analysis.analysis.get('@postcode')
820
821         if analyzer is None:
822             postcode_name = item.name.strip().upper()
823             variant_base = None
824         else:
825             postcode_name = analyzer.get_canonical_id(item)
826             variant_base = item.get_attr("variant")
827
828         if variant_base:
829             postcode = f'{postcode_name}@{variant_base}'
830         else:
831             postcode = postcode_name
832
833         if postcode not in self._cache.postcodes:
834             term = self._search_normalized(postcode_name)
835             if not term:
836                 return None
837
838             variants = {term}
839             if analyzer is not None and variant_base:
840                 variants.update(analyzer.compute_variants(variant_base))
841
842             with self.conn.cursor() as cur:
843                 cur.execute("SELECT create_postcode_word(%s, %s)",
844                             (postcode, list(variants)))
845             self._cache.postcodes.add(postcode)
846
847         return postcode_name
848
849
850 class _TokenInfo:
851     """ Collect token information to be sent back to the database.
852     """
853     def __init__(self) -> None:
854         self.names: Optional[str] = None
855         self.housenumbers: Set[str] = set()
856         self.housenumber_tokens: Set[int] = set()
857         self.street_tokens: Optional[Set[int]] = None
858         self.place_tokens: Set[int] = set()
859         self.address_tokens: Dict[str, str] = {}
860         self.postcode: Optional[str] = None
861
862
863     def _mk_array(self, tokens: Iterable[Any]) -> str:
864         return f"{{{','.join((str(s) for s in tokens))}}}"
865
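    # Example (added for illustration): _mk_array([1, 2, 3]) returns '{1,2,3}',
    # the literal format expected for PostgreSQL array columns.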
866
867     def to_dict(self) -> Dict[str, Any]:
868         """ Return the token information in database importable format.
869         """
870         out: Dict[str, Any] = {}
871
872         if self.names:
873             out['names'] = self.names
874
875         if self.housenumbers:
876             out['hnr'] = ';'.join(self.housenumbers)
877             out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)
878
879         if self.street_tokens is not None:
880             out['street'] = self._mk_array(self.street_tokens)
881
882         if self.place_tokens:
883             out['place'] = self._mk_array(self.place_tokens)
884
885         if self.address_tokens:
886             out['addr'] = self.address_tokens
887
888         if self.postcode:
889             out['postcode'] = self.postcode
890
891         return out
892
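    # Illustrative output (all values made up):
    #
    #     {'names': '{1,2,3}', 'hnr': '12;12a', 'hnr_tokens': '{55,56}',
    #      'street': '{7}', 'place': '{8}', 'addr': {'city': '{9,10}'},
    #      'postcode': 'EC1A 1BB'}
    #
    # Keys are only emitted when the corresponding information was collected.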
893
894     def set_names(self, fulls: Iterable[int], partials: Iterable[int]) -> None:
895         """ Adds token information for the normalised names.
896         """
897         self.names = self._mk_array(itertools.chain(fulls, partials))
898
899
900     def add_housenumber(self, token: Optional[int], hnr: Optional[str]) -> None:
901         """ Add the token and the canonical form of a single normalised
902             housenumber.
903         """
904         if token:
905             assert hnr is not None
906             self.housenumbers.add(hnr)
907             self.housenumber_tokens.add(token)
908
909
910     def add_street(self, tokens: Iterable[int]) -> None:
911         """ Add addr:street match terms.
912         """
913         if self.street_tokens is None:
914             self.street_tokens = set()
915         self.street_tokens.update(tokens)
916
917
918     def add_place(self, tokens: Iterable[int]) -> None:
919         """ Add addr:place search and match terms.
920         """
921         self.place_tokens.update(tokens)
922
923
924     def add_address_term(self, key: str, partials: Iterable[int]) -> None:
925         """ Add additional address terms.
926         """
927         array = self._mk_array(partials)
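        # An empty token list serialises to '{}' (length 2), so only arrays
        # longer than that carry actual tokens.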
928         if len(array) > 2:
929             self.address_tokens[key] = array
930
931     def set_postcode(self, postcode: Optional[str]) -> None:
932         """ Set the postcode to the given one.
933         """
934         self.postcode = postcode
935
936
937 class _TokenCache:
938     """ Cache for token information to avoid repeated database queries.
939
940         This cache is not thread-safe and needs to be instantiated per
941         analyzer.
942     """
943     def __init__(self) -> None:
944         self.names: Dict[str, Tuple[int, List[int]]] = {}
945         self.partials: Dict[str, int] = {}
946         self.fulls: Dict[str, List[int]] = {}
947         self.postcodes: Set[str] = set()
948         self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {}