nominatim/tokenizer/icu_tokenizer.py

   1 # SPDX-License-Identifier: GPL-2.0-only
   2 #
   3 # This file is part of Nominatim. (https://nominatim.org)
   4 #
   5 # Copyright (C) 2022 by the Nominatim developer community.
   6 # For a full list of authors see the git log.
   7 """
   8 Tokenizer implementing normalisation as used before Nominatim 4 but using
   9 libICU instead of the PostgreSQL module.
  10 """
  11 from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \
  12                    Dict, Set, Iterable
  13 import itertools
  14 import json
  15 import logging
  16 from pathlib import Path
  17 from textwrap import dedent
  18
  19 from nominatim.db.connection import connect, Connection, Cursor
  20 from nominatim.config import Configuration
  21 from nominatim.db.utils import CopyBuffer
  22 from nominatim.db.sql_preprocessor import SQLPreprocessor
  23 from nominatim.data.place_info import PlaceInfo
  24 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
  25 from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
  26 from nominatim.data.place_name import PlaceName
  27 from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
  28 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
  29
  30 DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
  31
  32 LOG = logging.getLogger()
  33
  34 WORD_TYPES =(('country_names', 'C'),
  35              ('postcodes', 'P'),
  36              ('full_word', 'W'),
  37              ('housenumbers', 'H'))
  38
  39 def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
  40     """ Create a new instance of the tokenizer provided by this module.
  41     """
  42     return ICUTokenizer(dsn, data_dir)
  43
  44
  45 class ICUTokenizer(AbstractTokenizer):
  46     """ This tokenizer uses libICU to convert names and queries to ASCII.
  47         Otherwise it uses the same algorithms and data structures as the
  48         normalization routines in Nominatim 3.
  49     """
  50
  51     def __init__(self, dsn: str, data_dir: Path) -> None:
  52         self.dsn = dsn
  53         self.data_dir = data_dir
  54         self.loader: Optional[ICURuleLoader] = None
  55
  56
  57     def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
  58         """ Set up a new tokenizer for the database.
  59
  60             This copies all necessary data in the project directory to make
  61             sure the tokenizer remains stable even over updates.
  62         """
  63         self.loader = ICURuleLoader(config)
  64
  65         self._install_php(config.lib_dir.php, overwrite=True)
  66         self._save_config()
  67
  68         if init_db:
  69             self.update_sql_functions(config)
  70             self._setup_db_tables(config)
  71             self._create_base_indices(config, 'word')
  72
  73
  74     def init_from_project(self, config: Configuration) -> None:
  75         """ Initialise the tokenizer from the project directory.
  76         """
  77         self.loader = ICURuleLoader(config)
  78
  79         with connect(self.dsn) as conn:
  80             self.loader.load_config_from_db(conn)
  81
  82         self._install_php(config.lib_dir.php, overwrite=False)
  83
  84
  85     def finalize_import(self, config: Configuration) -> None:
  86         """ Do any required postprocessing to make the tokenizer data ready
  87             for use.
  88         """
  89         self._create_lookup_indices(config, 'word')
  90
  91
  92     def update_sql_functions(self, config: Configuration) -> None:
  93         """ Reimport the SQL functions for this tokenizer.
  94         """
  95         with connect(self.dsn) as conn:
  96             sqlp = SQLPreprocessor(conn, config)
  97             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
  98
  99
 100     def check_database(self, config: Configuration) -> None:
 101         """ Check that the tokenizer is set up correctly.
 102         """
 103         # Will throw an error if there is an issue.
 104         self.init_from_project(config)
 105
 106
 107     def update_statistics(self, config: Configuration, threads: int = 2) -> None:
 108         """ Recompute frequencies for all name words.
 109         """
 110         with connect(self.dsn) as conn:
 111             if not conn.table_exists('search_name'):
 112                 return
 113
 114             with conn.cursor() as cur:
 115                 cur.execute('ANALYSE search_name')
 116                 if threads > 1:
 117                     cur.execute('SET max_parallel_workers_per_gather TO %s',
 118                                 (min(threads, 6),))
 119
 120                 if conn.server_version_tuple() < (12, 0):
 121                     LOG.info('Computing word frequencies')
 122                     cur.drop_table('word_frequencies')
 123                     cur.drop_table('addressword_frequencies')
 124                     cur.execute("""CREATE TEMP TABLE word_frequencies AS
 125                                      SELECT unnest(name_vector) as id, count(*)
 126                                      FROM search_name GROUP BY id""")
 127                     cur.execute('CREATE INDEX ON word_frequencies(id)')
 128                     cur.execute("""CREATE TEMP TABLE addressword_frequencies AS
 129                                      SELECT unnest(nameaddress_vector) as id, count(*)
 130                                      FROM search_name GROUP BY id""")
 131                     cur.execute('CREATE INDEX ON addressword_frequencies(id)')
 132                     cur.execute("""CREATE OR REPLACE FUNCTION word_freq_update(wid INTEGER,
 133                                                                                INOUT info JSONB)
 134                                    AS $$
 135                                    DECLARE rec RECORD;
 136                                    BEGIN
 137                                    IF info is null THEN
 138                                      info = '{}'::jsonb;
 139                                    END IF;
 140                                    FOR rec IN SELECT count FROM word_frequencies WHERE id = wid
 141                                    LOOP
 142                                      info = info || jsonb_build_object('count', rec.count);
 143                                    END LOOP;
 144                                    FOR rec IN SELECT count FROM addressword_frequencies WHERE id = wid
 145                                    LOOP
 146                                      info = info || jsonb_build_object('addr_count', rec.count);
 147                                    END LOOP;
 148                                    IF info = '{}'::jsonb THEN
 149                                      info = null;
 150                                    END IF;
 151                                    END;
 152                                    $$ LANGUAGE plpgsql IMMUTABLE;
 153                                 """)
 154                     LOG.info('Update word table with recomputed frequencies')
 155                     cur.drop_table('tmp_word')
 156                     cur.execute("""CREATE TABLE tmp_word AS
 157                                     SELECT word_id, word_token, type, word,
 158                                            word_freq_update(word_id, info) as info
 159                                     FROM word
 160                                 """)
 161                     cur.drop_table('word_frequencies')
 162                     cur.drop_table('addressword_frequencies')
 163                 else:
 164                     LOG.info('Computing word frequencies')
 165                     cur.drop_table('word_frequencies')
 166                     cur.execute('ANALYSE search_name')
 167                     cur.execute('ANALYSE word')
 168                     cur.execute("""
 169                       CREATE TEMP TABLE word_frequencies AS
 170                       WITH word_freq AS MATERIALIZED (
 171                                SELECT unnest(name_vector) as id, count(*)
 172                                      FROM search_name GROUP BY id),
 173                            addr_freq AS MATERIALIZED (
 174                                SELECT unnest(nameaddress_vector) as id, count(*)
 175                                      FROM search_name GROUP BY id)
 176                       SELECT coalesce(a.id, w.id) as id,
 177                              (CASE WHEN w.count is null THEN '{}'::JSONB
 178                                   ELSE jsonb_build_object('count', w.count) END
 179                               ||
 180                               CASE WHEN a.count is null THEN '{}'::JSONB
 181                                   ELSE jsonb_build_object('addr_count', a.count) END) as info
 182                       FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id;
 183                       """)
 184                     cur.execute('CREATE UNIQUE INDEX ON word_frequencies(id) INCLUDE(info)')
 185                     cur.execute('ANALYSE word_frequencies')
 186                     LOG.info('Update word table with recomputed frequencies')
 187                     cur.drop_table('tmp_word')
 188                     cur.execute("""CREATE TABLE tmp_word AS
 189                                     SELECT word_id, word_token, type, word,
 190                                            (CASE WHEN wf.info is null THEN word.info
 191                                             ELSE coalesce(word.info, '{}'::jsonb) || wf.info
 192                                             END) as info
 193                                     FROM word LEFT JOIN word_frequencies wf
 194                                          ON word.word_id = wf.id
 195                                     ORDER BY word_id
 196                                 """)
 197                     cur.drop_table('word_frequencies')
 198
 199             with conn.cursor() as cur:
 200                 cur.execute('SET max_parallel_workers_per_gather TO 0')
 201
 202             sqlp = SQLPreprocessor(conn, config)
 203             sqlp.run_string(conn,
 204                             'GRANT SELECT ON tmp_word TO "{{config.DATABASE_WEBUSER}}"')
 205             conn.commit()
 206         self._create_base_indices(config, 'tmp_word')
 207         self._create_lookup_indices(config, 'tmp_word')
 208         self._move_temporary_word_table('tmp_word')
 209
 210
 211
 212     def _cleanup_housenumbers(self) -> None:
 213         """ Remove unused house numbers.
 214         """
 215         with connect(self.dsn) as conn:
 216             if not conn.table_exists('search_name'):
 217                 return
 218             with conn.cursor(name="hnr_counter") as cur:
 219                 cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
 220                                FROM word
 221                                WHERE type = 'H'
 222                                  AND NOT EXISTS(SELECT * FROM search_name
 223                                                 WHERE ARRAY[word.word_id] && name_vector)
 224                                  AND (char_length(coalesce(word, word_token)) > 6
 225                                       OR coalesce(word, word_token) not similar to '\\d+')
 226                             """)
 227                 candidates = {token: wid for wid, token in cur}
 228             with conn.cursor(name="hnr_counter") as cur:
 229                 cur.execute("""SELECT housenumber FROM placex
 230                                WHERE housenumber is not null
 231                                      AND (char_length(housenumber) > 6
 232                                           OR housenumber not similar to '\\d+')
 233                             """)
 234                 for row in cur:
 235                     for hnr in row[0].split(';'):
 236                         candidates.pop(hnr, None)
 237             LOG.info("There are %s outdated housenumbers.", len(candidates))
 238             LOG.debug("Outdated housenumbers: %s", candidates.keys())
 239             if candidates:
 240                 with conn.cursor() as cur:
 241                     cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
 242                                 (list(candidates.values()), ))
 243                 conn.commit()
 244
 245
 246
 247     def update_word_tokens(self) -> None:
 248         """ Remove unused tokens.
 249         """
 250         LOG.warning("Cleaning up housenumber tokens.")
 251         self._cleanup_housenumbers()
 252         LOG.warning("Tokenizer house-keeping done.")
 253
 254
 255     def name_analyzer(self) -> 'ICUNameAnalyzer':
 256         """ Create a new analyzer for tokenizing names and queries
 257             using this tokinzer. Analyzers are context managers and should
 258             be used accordingly:
 259
 260             ```
 261             with tokenizer.name_analyzer() as analyzer:
 262                 analyser.tokenize()
 263             ```
 264
 265             When used outside the with construct, the caller must ensure to
 266             call the close() function before destructing the analyzer.
 267
 268             Analyzers are not thread-safe. You need to instantiate one per thread.
 269         """
 270         assert self.loader is not None
 271         return ICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
 272                                self.loader.make_token_analysis())
 273
 274
 275     def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
 276         """ Return a list of the `num` most frequent full words
 277             in the database.
 278         """
 279         with conn.cursor() as cur:
 280             cur.execute("""SELECT word, sum((info->>'count')::int) as count
 281                              FROM word WHERE type = 'W'
 282                              GROUP BY word
 283                              ORDER BY count DESC LIMIT %s""", (num,))
 284             return list(s[0].split('@')[0] for s in cur)
 285
 286
 287     def _install_php(self, phpdir: Optional[Path], overwrite: bool = True) -> None:
 288         """ Install the php script for the tokenizer.
 289         """
 290         if phpdir is not None:
 291             assert self.loader is not None
 292             php_file = self.data_dir / "tokenizer.php"
 293
 294             if not php_file.exists() or overwrite:
 295                 php_file.write_text(dedent(f"""\
 296                     <?php
 297                     @define('CONST_Max_Word_Frequency', 10000000);
 298                     @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
 299                     @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
 300                     require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')
 301
 302
 303     def _save_config(self) -> None:
 304         """ Save the configuration that needs to remain stable for the given
 305             database as database properties.
 306         """
 307         assert self.loader is not None
 308         with connect(self.dsn) as conn:
 309             self.loader.save_config_to_db(conn)
 310
 311
 312     def _setup_db_tables(self, config: Configuration) -> None:
 313         """ Set up the word table and fill it with pre-computed word
 314             frequencies.
 315         """
 316         with connect(self.dsn) as conn:
 317             with conn.cursor() as cur:
 318                 cur.drop_table('word')
 319             sqlp = SQLPreprocessor(conn, config)
 320             sqlp.run_string(conn, """
 321                 CREATE TABLE word (
 322                       word_id INTEGER,
 323                       word_token text NOT NULL,
 324                       type text NOT NULL,
 325                       word text,
 326                       info jsonb
 327                     ) {{db.tablespace.search_data}};
 328                 GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
 329
 330                 DROP SEQUENCE IF EXISTS seq_word;
 331                 CREATE SEQUENCE seq_word start 1;
 332                 GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
 333             """)
 334             conn.commit()
 335
 336
 337     def _create_base_indices(self, config: Configuration, table_name: str) -> None:
 338         """ Set up the word table and fill it with pre-computed word
 339             frequencies.
 340         """
 341         with connect(self.dsn) as conn:
 342             sqlp = SQLPreprocessor(conn, config)
 343             sqlp.run_string(conn,
 344                             """CREATE INDEX idx_{{table_name}}_word_token ON {{table_name}}
 345                                USING BTREE (word_token) {{db.tablespace.search_index}}""",
 346                             table_name=table_name)
 347             for name, ctype in WORD_TYPES:
 348                 sqlp.run_string(conn,
 349                                 """CREATE INDEX idx_{{table_name}}_{{idx_name}} ON {{table_name}}
 350                                    USING BTREE (word) {{db.tablespace.address_index}}
 351                                    WHERE type = '{{column_type}}'
 352                                 """,
 353                                 table_name=table_name, idx_name=name,
 354                                 column_type=ctype)
 355             conn.commit()
 356
 357
 358     def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
 359         """ Create additional indexes used when running the API.
 360         """
 361         with connect(self.dsn) as conn:
 362             sqlp = SQLPreprocessor(conn, config)
 363             # Index required for details lookup.
 364             sqlp.run_string(conn, """
 365                 CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
 366                   ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
 367             """,
 368             table_name=table_name)
 369             conn.commit()
 370
 371
 372     def _move_temporary_word_table(self, old: str) -> None:
 373         """ Rename all tables and indexes used by the tokenizer.
 374         """
 375         with connect(self.dsn) as conn:
 376             with conn.cursor() as cur:
 377                 cur.drop_table('word')
 378                 cur.execute(f"ALTER TABLE {old} RENAME TO word")
 379                 for idx in ('word_token', 'word_id'):
 380                     cur.execute(f"""ALTER INDEX idx_{old}_{idx}
 381                                       RENAME TO idx_word_{idx}""")
 382                 for name, _ in WORD_TYPES:
 383                     cur.execute(f"""ALTER INDEX idx_{old}_{name}
 384                                     RENAME TO idx_word_{name}""")
 385             conn.commit()
 386
 387
 388
 389
 390 class ICUNameAnalyzer(AbstractAnalyzer):
 391     """ The ICU analyzer uses the ICU library for splitting names.
 392
 393         Each instance opens a connection to the database to request the
 394         normalization.
 395     """
 396
 397     def __init__(self, dsn: str, sanitizer: PlaceSanitizer,
 398                  token_analysis: ICUTokenAnalysis) -> None:
 399         self.conn: Optional[Connection] = connect(dsn).connection
 400         self.conn.autocommit = True
 401         self.sanitizer = sanitizer
 402         self.token_analysis = token_analysis
 403
 404         self._cache = _TokenCache()
 405
 406
 407     def close(self) -> None:
 408         """ Free all resources used by the analyzer.
 409         """
 410         if self.conn:
 411             self.conn.close()
 412             self.conn = None
 413
 414
 415     def _search_normalized(self, name: str) -> str:
 416         """ Return the search token transliteration of the given name.
 417         """
 418         return cast(str, self.token_analysis.search.transliterate(name)).strip()
 419
 420
 421     def _normalized(self, name: str) -> str:
 422         """ Return the normalized version of the given name with all
 423             non-relevant information removed.
 424         """
 425         return cast(str, self.token_analysis.normalizer.transliterate(name)).strip()
 426
 427
 428     def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
 429         """ Return token information for the given list of words.
 430             If a word starts with # it is assumed to be a full name
 431             otherwise is a partial name.
 432
 433             The function returns a list of tuples with
 434             (original word, word token, word id).
 435
 436             The function is used for testing and debugging only
 437             and not necessarily efficient.
 438         """
 439         assert self.conn is not None
 440         full_tokens = {}
 441         partial_tokens = {}
 442         for word in words:
 443             if word.startswith('#'):
 444                 full_tokens[word] = self._search_normalized(word[1:])
 445             else:
 446                 partial_tokens[word] = self._search_normalized(word)
 447
 448         with self.conn.cursor() as cur:
 449             cur.execute("""SELECT word_token, word_id
 450                             FROM word WHERE word_token = ANY(%s) and type = 'W'
 451                         """, (list(full_tokens.values()),))
 452             full_ids = {r[0]: r[1] for r in cur}
 453             cur.execute("""SELECT word_token, word_id
 454                             FROM word WHERE word_token = ANY(%s) and type = 'w'""",
 455                         (list(partial_tokens.values()),))
 456             part_ids = {r[0]: r[1] for r in cur}
 457
 458         return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
 459                + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
 460
 461
 462     def normalize_postcode(self, postcode: str) -> str:
 463         """ Convert the postcode to a standardized form.
 464
 465             This function must yield exactly the same result as the SQL function
 466             'token_normalized_postcode()'.
 467         """
 468         return postcode.strip().upper()
 469
 470
 471     def update_postcodes_from_db(self) -> None:
 472         """ Update postcode tokens in the word table from the location_postcode
 473             table.
 474         """
 475         assert self.conn is not None
 476         analyzer = self.token_analysis.analysis.get('@postcode')
 477
 478         with self.conn.cursor() as cur:
 479             # First get all postcode names currently in the word table.
 480             cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
 481             word_entries = set((entry[0] for entry in cur))
 482
 483             # Then compute the required postcode names from the postcode table.
 484             needed_entries = set()
 485             cur.execute("SELECT country_code, postcode FROM location_postcode")
 486             for cc, postcode in cur:
 487                 info = PlaceInfo({'country_code': cc,
 488                                   'class': 'place', 'type': 'postcode',
 489                                   'address': {'postcode': postcode}})
 490                 address = self.sanitizer.process_names(info)[1]
 491                 for place in address:
 492                     if place.kind == 'postcode':
 493                         if analyzer is None:
 494                             postcode_name = place.name.strip().upper()
 495                             variant_base = None
 496                         else:
 497                             postcode_name = analyzer.get_canonical_id(place)
 498                             variant_base = place.get_attr("variant")
 499
 500                         if variant_base:
 501                             needed_entries.add(f'{postcode_name}@{variant_base}')
 502                         else:
 503                             needed_entries.add(postcode_name)
 504                         break
 505
 506         # Now update the word table.
 507         self._delete_unused_postcode_words(word_entries - needed_entries)
 508         self._add_missing_postcode_words(needed_entries - word_entries)
 509
 510     def _delete_unused_postcode_words(self, tokens: Iterable[str]) -> None:
 511         assert self.conn is not None
 512         if tokens:
 513             with self.conn.cursor() as cur:
 514                 cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
 515                             (list(tokens), ))
 516
 517     def _add_missing_postcode_words(self, tokens: Iterable[str]) -> None:
 518         assert self.conn is not None
 519         if not tokens:
 520             return
 521
 522         analyzer = self.token_analysis.analysis.get('@postcode')
 523         terms = []
 524
 525         for postcode_name in tokens:
 526             if '@' in postcode_name:
 527                 term, variant = postcode_name.split('@', 2)
 528                 term = self._search_normalized(term)
 529                 if analyzer is None:
 530                     variants = [term]
 531                 else:
 532                     variants = analyzer.compute_variants(variant)
 533                     if term not in variants:
 534                         variants.append(term)
 535             else:
 536                 variants = [self._search_normalized(postcode_name)]
 537             terms.append((postcode_name, variants))
 538
 539         if terms:
 540             with self.conn.cursor() as cur:
 541                 cur.execute_values("""SELECT create_postcode_word(pc, var)
 542                                       FROM (VALUES %s) AS v(pc, var)""",
 543                                    terms)
 544
 545
 546
 547
 548     def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
 549                                should_replace: bool) -> None:
 550         """ Replace the search index for special phrases with the new phrases.
 551             If `should_replace` is True, then the previous set of will be
 552             completely replaced. Otherwise the phrases are added to the
 553             already existing ones.
 554         """
 555         assert self.conn is not None
 556         norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
 557                             for p in phrases))
 558
 559         with self.conn.cursor() as cur:
 560             # Get the old phrases.
 561             existing_phrases = set()
 562             cur.execute("SELECT word, info FROM word WHERE type = 'S'")
 563             for word, info in cur:
 564                 existing_phrases.add((word, info['class'], info['type'],
 565                                       info.get('op') or '-'))
 566
 567             added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
 568             if should_replace:
 569                 deleted = self._remove_special_phrases(cur, norm_phrases,
 570                                                        existing_phrases)
 571             else:
 572                 deleted = 0
 573
 574         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
 575                  len(norm_phrases), added, deleted)
 576
 577
 578     def _add_special_phrases(self, cursor: Cursor,
 579                              new_phrases: Set[Tuple[str, str, str, str]],
 580                              existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
 581         """ Add all phrases to the database that are not yet there.
 582         """
 583         to_add = new_phrases - existing_phrases
 584
 585         added = 0
 586         with CopyBuffer() as copystr:
 587             for word, cls, typ, oper in to_add:
 588                 term = self._search_normalized(word)
 589                 if term:
 590                     copystr.add(term, 'S', word,
 591                                 json.dumps({'class': cls, 'type': typ,
 592                                             'op': oper if oper in ('in', 'near') else None}))
 593                     added += 1
 594
 595             copystr.copy_out(cursor, 'word',
 596                              columns=['word_token', 'type', 'word', 'info'])
 597
 598         return added
 599
 600
 601     def _remove_special_phrases(self, cursor: Cursor,
 602                              new_phrases: Set[Tuple[str, str, str, str]],
 603                              existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
 604         """ Remove all phrases from the database that are no longer in the
 605             new phrase list.
 606         """
 607         to_delete = existing_phrases - new_phrases
 608
 609         if to_delete:
 610             cursor.execute_values(
 611                 """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
 612                     WHERE type = 'S' and word = name
 613                           and info->>'class' = in_class and info->>'type' = in_type
 614                           and ((op = '-' and info->>'op' is null) or op = info->>'op')
 615                 """, to_delete)
 616
 617         return len(to_delete)
 618
 619
 620     def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
 621         """ Add default names for the given country to the search index.
 622         """
 623         # Make sure any name preprocessing for country names applies.
 624         info = PlaceInfo({'name': names, 'country_code': country_code,
 625                           'rank_address': 4, 'class': 'boundary',
 626                           'type': 'administrative'})
 627         self._add_country_full_names(country_code,
 628                                      self.sanitizer.process_names(info)[0],
 629                                      internal=True)
 630
 631
 632     def _add_country_full_names(self, country_code: str, names: Sequence[PlaceName],
 633                                 internal: bool = False) -> None:
 634         """ Add names for the given country from an already sanitized
 635             name list.
 636         """
 637         assert self.conn is not None
 638         word_tokens = set()
 639         for name in names:
 640             norm_name = self._search_normalized(name.name)
 641             if norm_name:
 642                 word_tokens.add(norm_name)
 643
 644         with self.conn.cursor() as cur:
 645             # Get existing names
 646             cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
 647                              FROM word
 648                              WHERE type = 'C' and word = %s""",
 649                         (country_code, ))
 650             # internal/external names
 651             existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()}
 652             for word in cur:
 653                 existing_tokens[word[1]].add(word[0])
 654
 655             # Delete names that no longer exist.
 656             gone_tokens = existing_tokens[internal] - word_tokens
 657             if internal:
 658                 gone_tokens.update(existing_tokens[False] & word_tokens)
 659             if gone_tokens:
 660                 cur.execute("""DELETE FROM word
 661                                USING unnest(%s) as token
 662                                WHERE type = 'C' and word = %s
 663                                      and word_token = token""",
 664                             (list(gone_tokens), country_code))
 665
 666             # Only add those names that are not yet in the list.
 667             new_tokens = word_tokens - existing_tokens[True]
 668             if not internal:
 669                 new_tokens -= existing_tokens[False]
 670             if new_tokens:
 671                 if internal:
 672                     sql = """INSERT INTO word (word_token, type, word, info)
 673                                (SELECT token, 'C', %s, '{"internal": "yes"}'
 674                                   FROM unnest(%s) as token)
 675                            """
 676                 else:
 677                     sql = """INSERT INTO word (word_token, type, word)
 678                                    (SELECT token, 'C', %s
 679                                     FROM unnest(%s) as token)
 680                           """
 681                 cur.execute(sql, (country_code, list(new_tokens)))
 682
 683
 684     def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
 685         """ Determine tokenizer information about the given place.
 686
 687             Returns a JSON-serializable structure that will be handed into
 688             the database via the token_info field.
 689         """
 690         token_info = _TokenInfo()
 691
 692         names, address = self.sanitizer.process_names(place)
 693
 694         if names:
 695             token_info.set_names(*self._compute_name_tokens(names))
 696
 697             if place.is_country():
 698                 assert place.country_code is not None
 699                 self._add_country_full_names(place.country_code, names)
 700
 701         if address:
 702             self._process_place_address(token_info, address)
 703
 704         return token_info.to_dict()
 705
 706
 707     def _process_place_address(self, token_info: '_TokenInfo',
 708                                address: Sequence[PlaceName]) -> None:
 709         for item in address:
 710             if item.kind == 'postcode':
 711                 token_info.set_postcode(self._add_postcode(item))
 712             elif item.kind == 'housenumber':
 713                 token_info.add_housenumber(*self._compute_housenumber_token(item))
 714             elif item.kind == 'street':
 715                 token_info.add_street(self._retrieve_full_tokens(item.name))
 716             elif item.kind == 'place':
 717                 if not item.suffix:
 718                     token_info.add_place(self._compute_partial_tokens(item.name))
 719             elif not item.kind.startswith('_') and not item.suffix and \
 720                  item.kind not in ('country', 'full', 'inclusion'):
 721                 token_info.add_address_term(item.kind, self._compute_partial_tokens(item.name))
 722
 723
 724     def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
 725         """ Normalize the housenumber and return the word token and the
 726             canonical form.
 727         """
 728         assert self.conn is not None
 729         analyzer = self.token_analysis.analysis.get('@housenumber')
 730         result: Tuple[Optional[int], Optional[str]] = (None, None)
 731
 732         if analyzer is None:
 733             # When no custom analyzer is set, simply normalize and transliterate
 734             norm_name = self._search_normalized(hnr.name)
 735             if norm_name:
 736                 result = self._cache.housenumbers.get(norm_name, result)
 737                 if result[0] is None:
 738                     with self.conn.cursor() as cur:
 739                         hid = cur.scalar("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
 740
 741                         result = hid, norm_name
 742                         self._cache.housenumbers[norm_name] = result
 743         else:
 744             # Otherwise use the analyzer to determine the canonical name.
 745             # Per convention we use the first variant as the 'lookup name', the
 746             # name that gets saved in the housenumber field of the place.
 747             word_id = analyzer.get_canonical_id(hnr)
 748             if word_id:
 749                 result = self._cache.housenumbers.get(word_id, result)
 750                 if result[0] is None:
 751                     variants = analyzer.compute_variants(word_id)
 752                     if variants:
 753                         with self.conn.cursor() as cur:
 754                             hid = cur.scalar("SELECT create_analyzed_hnr_id(%s, %s)",
 755                                              (word_id, list(variants)))
 756                             result = hid, variants[0]
 757                             self._cache.housenumbers[word_id] = result
 758
 759         return result
 760
 761
 762     def _compute_partial_tokens(self, name: str) -> List[int]:
 763         """ Normalize the given term, split it into partial words and return
 764             then token list for them.
 765         """
 766         assert self.conn is not None
 767         norm_name = self._search_normalized(name)
 768
 769         tokens = []
 770         need_lookup = []
 771         for partial in norm_name.split():
 772             token = self._cache.partials.get(partial)
 773             if token:
 774                 tokens.append(token)
 775             else:
 776                 need_lookup.append(partial)
 777
 778         if need_lookup:
 779             with self.conn.cursor() as cur:
 780                 cur.execute("""SELECT word, getorcreate_partial_word(word)
 781                                FROM unnest(%s) word""",
 782                             (need_lookup, ))
 783
 784                 for partial, token in cur:
 785                     assert token is not None
 786                     tokens.append(token)
 787                     self._cache.partials[partial] = token
 788
 789         return tokens
 790
 791
 792     def _retrieve_full_tokens(self, name: str) -> List[int]:
 793         """ Get the full name token for the given name, if it exists.
 794             The name is only retrieved for the standard analyser.
 795         """
 796         assert self.conn is not None
 797         norm_name = self._search_normalized(name)
 798
 799         # return cached if possible
 800         if norm_name in self._cache.fulls:
 801             return self._cache.fulls[norm_name]
 802
 803         with self.conn.cursor() as cur:
 804             cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
 805                         (norm_name, ))
 806             full = [row[0] for row in cur]
 807
 808         self._cache.fulls[norm_name] = full
 809
 810         return full
 811
 812
 813     def _compute_name_tokens(self, names: Sequence[PlaceName]) -> Tuple[Set[int], Set[int]]:
 814         """ Computes the full name and partial name tokens for the given
 815             dictionary of names.
 816         """
 817         assert self.conn is not None
 818         full_tokens: Set[int] = set()
 819         partial_tokens: Set[int] = set()
 820
 821         for name in names:
 822             analyzer_id = name.get_attr('analyzer')
 823             analyzer = self.token_analysis.get_analyzer(analyzer_id)
 824             word_id = analyzer.get_canonical_id(name)
 825             if analyzer_id is None:
 826                 token_id = word_id
 827             else:
 828                 token_id = f'{word_id}@{analyzer_id}'
 829
 830             full, part = self._cache.names.get(token_id, (None, None))
 831             if full is None:
 832                 variants = analyzer.compute_variants(word_id)
 833                 if not variants:
 834                     continue
 835
 836                 with self.conn.cursor() as cur:
 837                     cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
 838                                 (token_id, variants))
 839                     full, part = cast(Tuple[int, List[int]], cur.fetchone())
 840
 841                 self._cache.names[token_id] = (full, part)
 842
 843             assert part is not None
 844
 845             full_tokens.add(full)
 846             partial_tokens.update(part)
 847
 848         return full_tokens, partial_tokens
 849
 850
 851     def _add_postcode(self, item: PlaceName) -> Optional[str]:
 852         """ Make sure the normalized postcode is present in the word table.
 853         """
 854         assert self.conn is not None
 855         analyzer = self.token_analysis.analysis.get('@postcode')
 856
 857         if analyzer is None:
 858             postcode_name = item.name.strip().upper()
 859             variant_base = None
 860         else:
 861             postcode_name = analyzer.get_canonical_id(item)
 862             variant_base = item.get_attr("variant")
 863
 864         if variant_base:
 865             postcode = f'{postcode_name}@{variant_base}'
 866         else:
 867             postcode = postcode_name
 868
 869         if postcode not in self._cache.postcodes:
 870             term = self._search_normalized(postcode_name)
 871             if not term:
 872                 return None
 873
 874             variants = {term}
 875             if analyzer is not None and variant_base:
 876                 variants.update(analyzer.compute_variants(variant_base))
 877
 878             with self.conn.cursor() as cur:
 879                 cur.execute("SELECT create_postcode_word(%s, %s)",
 880                             (postcode, list(variants)))
 881             self._cache.postcodes.add(postcode)
 882
 883         return postcode_name
 884
 885
 886 class _TokenInfo:
 887     """ Collect token information to be sent back to the database.
 888     """
 889     def __init__(self) -> None:
 890         self.names: Optional[str] = None
 891         self.housenumbers: Set[str] = set()
 892         self.housenumber_tokens: Set[int] = set()
 893         self.street_tokens: Optional[Set[int]] = None
 894         self.place_tokens: Set[int] = set()
 895         self.address_tokens: Dict[str, str] = {}
 896         self.postcode: Optional[str] = None
 897
 898
 899     def _mk_array(self, tokens: Iterable[Any]) -> str:
 900         return f"{{{','.join((str(s) for s in tokens))}}}"
 901
 902
 903     def to_dict(self) -> Dict[str, Any]:
 904         """ Return the token information in database importable format.
 905         """
 906         out: Dict[str, Any] = {}
 907
 908         if self.names:
 909             out['names'] = self.names
 910
 911         if self.housenumbers:
 912             out['hnr'] = ';'.join(self.housenumbers)
 913             out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)
 914
 915         if self.street_tokens is not None:
 916             out['street'] = self._mk_array(self.street_tokens)
 917
 918         if self.place_tokens:
 919             out['place'] = self._mk_array(self.place_tokens)
 920
 921         if self.address_tokens:
 922             out['addr'] = self.address_tokens
 923
 924         if self.postcode:
 925             out['postcode'] = self.postcode
 926
 927         return out
 928
 929
 930     def set_names(self, fulls: Iterable[int], partials: Iterable[int]) -> None:
 931         """ Adds token information for the normalised names.
 932         """
 933         self.names = self._mk_array(itertools.chain(fulls, partials))
 934
 935
 936     def add_housenumber(self, token: Optional[int], hnr: Optional[str]) -> None:
 937         """ Extract housenumber information from a list of normalised
 938             housenumbers.
 939         """
 940         if token:
 941             assert hnr is not None
 942             self.housenumbers.add(hnr)
 943             self.housenumber_tokens.add(token)
 944
 945
 946     def add_street(self, tokens: Iterable[int]) -> None:
 947         """ Add addr:street match terms.
 948         """
 949         if self.street_tokens is None:
 950             self.street_tokens = set()
 951         self.street_tokens.update(tokens)
 952
 953
 954     def add_place(self, tokens: Iterable[int]) -> None:
 955         """ Add addr:place search and match terms.
 956         """
 957         self.place_tokens.update(tokens)
 958
 959
 960     def add_address_term(self, key: str, partials: Iterable[int]) -> None:
 961         """ Add additional address terms.
 962         """
 963         if partials:
 964             self.address_tokens[key] = self._mk_array(partials)
 965
 966     def set_postcode(self, postcode: Optional[str]) -> None:
 967         """ Set the postcode to the given one.
 968         """
 969         self.postcode = postcode
 970
 971
 972 class _TokenCache:
 973     """ Cache for token information to avoid repeated database queries.
 974
 975         This cache is not thread-safe and needs to be instantiated per
 976         analyzer.
 977     """
 978     def __init__(self) -> None:
 979         self.names: Dict[str, Tuple[int, List[int]]] = {}
 980         self.partials: Dict[str, int] = {}
 981         self.fulls: Dict[str, List[int]] = {}
 982         self.postcodes: Set[str] = set()
 983         self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {}