1 # SPDX-License-Identifier: GPL-3.0-or-later
2 #
3 # This file is part of Nominatim. (https://nominatim.org)
4 #
5 # Copyright (C) 2024 by the Nominatim developer community.
6 # For a full list of authors see the git log.
7 """
8 Tokenizer implementing normalisation as used before Nominatim 4 but using
9 libICU instead of the PostgreSQL module.
10 """
11 from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \
12                    Dict, Set, Iterable
13 import itertools
14 import json
15 import logging
16 from pathlib import Path
17 from textwrap import dedent
18
19 from ..db.connection import connect, Connection, Cursor, server_version_tuple,\
20                             drop_tables, table_exists, execute_scalar
21 from ..config import Configuration
22 from ..db.utils import CopyBuffer
23 from ..db.sql_preprocessor import SQLPreprocessor
24 from ..data.place_info import PlaceInfo
25 from ..data.place_name import PlaceName
26 from .icu_rule_loader import ICURuleLoader
27 from .place_sanitizer import PlaceSanitizer
28 from .icu_token_analysis import ICUTokenAnalysis
29 from .base import AbstractAnalyzer, AbstractTokenizer
30
31 DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
32
33 LOG = logging.getLogger()
34
35 WORD_TYPES = (('country_names', 'C'),
36                ('postcodes', 'P'),
37                ('full_word', 'W'),
38                ('housenumbers', 'H'))
39
40 def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
41     """ Create a new instance of the tokenizer provided by this module.
42     """
43     return ICUTokenizer(dsn, data_dir)
44
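# Illustrative bootstrap sketch (not part of the original module). `my_dsn`,
# `project_dir` and `config` are placeholders for values normally supplied by
# the Nominatim configuration machinery.
#
#     tokenizer = create(my_dsn, project_dir / 'tokenizer')
#     tokenizer.init_new_db(config)          # fresh import
#     # ... or, against an already imported database:
#     tokenizer.init_from_project(config)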
45
46 class ICUTokenizer(AbstractTokenizer):
47     """ This tokenizer uses libICU to convert names and queries to ASCII.
48         Otherwise it uses the same algorithms and data structures as the
49         normalization routines in Nominatim 3.
50     """
51
52     def __init__(self, dsn: str, data_dir: Path) -> None:
53         self.dsn = dsn
54         self.data_dir = data_dir
55         self.loader: Optional[ICURuleLoader] = None
56
57
58     def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
59         """ Set up a new tokenizer for the database.
60
61             This copies all necessary data into the project directory to make
62             sure the tokenizer remains stable even across updates.
63         """
64         self.loader = ICURuleLoader(config)
65
66         self._install_php(config.lib_dir.php, overwrite=True)
67         self._save_config()
68
69         if init_db:
70             self.update_sql_functions(config)
71             self._setup_db_tables(config)
72             self._create_base_indices(config, 'word')
73
74
75     def init_from_project(self, config: Configuration) -> None:
76         """ Initialise the tokenizer from the project directory.
77         """
78         self.loader = ICURuleLoader(config)
79
80         with connect(self.dsn) as conn:
81             self.loader.load_config_from_db(conn)
82
83         self._install_php(config.lib_dir.php, overwrite=False)
84
85
86     def finalize_import(self, config: Configuration) -> None:
87         """ Do any required postprocessing to make the tokenizer data ready
88             for use.
89         """
90         self._create_lookup_indices(config, 'word')
91
92
93     def update_sql_functions(self, config: Configuration) -> None:
94         """ Reimport the SQL functions for this tokenizer.
95         """
96         with connect(self.dsn) as conn:
97             sqlp = SQLPreprocessor(conn, config)
98             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
99
100
101     def check_database(self, config: Configuration) -> None:
102         """ Check that the tokenizer is set up correctly.
103         """
104         # Will raise an error if there is an issue.
105         self.init_from_project(config)
106
107
108     def update_statistics(self, config: Configuration, threads: int = 2) -> None:
109         """ Recompute frequencies for all name words.
110         """
111         with connect(self.dsn) as conn:
112             if not table_exists(conn, 'search_name'):
113                 return
114
115             with conn.cursor() as cur:
116                 cur.execute('ANALYSE search_name')
117                 if threads > 1:
118                     cur.execute('SET max_parallel_workers_per_gather TO %s',
119                                 (min(threads, 6),))
120
121                 if server_version_tuple(conn) < (12, 0):
122                     LOG.info('Computing word frequencies')
123                     drop_tables(conn, 'word_frequencies', 'addressword_frequencies')
124                     cur.execute("""CREATE TEMP TABLE word_frequencies AS
125                                      SELECT unnest(name_vector) as id, count(*)
126                                      FROM search_name GROUP BY id""")
127                     cur.execute('CREATE INDEX ON word_frequencies(id)')
128                     cur.execute("""CREATE TEMP TABLE addressword_frequencies AS
129                                      SELECT unnest(nameaddress_vector) as id, count(*)
130                                      FROM search_name GROUP BY id""")
131                     cur.execute('CREATE INDEX ON addressword_frequencies(id)')
132                     cur.execute("""CREATE OR REPLACE FUNCTION word_freq_update(wid INTEGER,
133                                                                                INOUT info JSONB)
134                                    AS $$
135                                    DECLARE rec RECORD;
136                                    BEGIN
137                                    IF info is null THEN
138                                      info = '{}'::jsonb;
139                                    END IF;
140                                    FOR rec IN SELECT count FROM word_frequencies WHERE id = wid
141                                    LOOP
142                                      info = info || jsonb_build_object('count', rec.count);
143                                    END LOOP;
144                                    FOR rec IN SELECT count FROM addressword_frequencies WHERE id = wid
145                                    LOOP
146                                      info = info || jsonb_build_object('addr_count', rec.count);
147                                    END LOOP;
148                                    IF info = '{}'::jsonb THEN
149                                      info = null;
150                                    END IF;
151                                    END;
152                                    $$ LANGUAGE plpgsql IMMUTABLE;
153                                 """)
154                     LOG.info('Update word table with recomputed frequencies')
155                     drop_tables(conn, 'tmp_word')
156                     cur.execute("""CREATE TABLE tmp_word AS
157                                     SELECT word_id, word_token, type, word,
158                                            word_freq_update(word_id, info) as info
159                                     FROM word
160                                 """)
161                     drop_tables(conn, 'word_frequencies', 'addressword_frequencies')
162                 else:
163                     LOG.info('Computing word frequencies')
164                     drop_tables(conn, 'word_frequencies')
165                     cur.execute("""
166                       CREATE TEMP TABLE word_frequencies AS
167                       WITH word_freq AS MATERIALIZED (
168                                SELECT unnest(name_vector) as id, count(*)
169                                      FROM search_name GROUP BY id),
170                            addr_freq AS MATERIALIZED (
171                                SELECT unnest(nameaddress_vector) as id, count(*)
172                                      FROM search_name GROUP BY id)
173                       SELECT coalesce(a.id, w.id) as id,
174                              (CASE WHEN w.count is null THEN '{}'::JSONB
175                                   ELSE jsonb_build_object('count', w.count) END
176                               ||
177                               CASE WHEN a.count is null THEN '{}'::JSONB
178                                   ELSE jsonb_build_object('addr_count', a.count) END) as info
179                       FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id;
180                       """)
181                     cur.execute('CREATE UNIQUE INDEX ON word_frequencies(id) INCLUDE(info)')
182                     cur.execute('ANALYSE word_frequencies')
183                     LOG.info('Update word table with recomputed frequencies')
184                     drop_tables(conn, 'tmp_word')
185                     cur.execute("""CREATE TABLE tmp_word AS
186                                     SELECT word_id, word_token, type, word,
187                                            (CASE WHEN wf.info is null THEN word.info
188                                             ELSE coalesce(word.info, '{}'::jsonb) || wf.info
189                                             END) as info
190                                     FROM word LEFT JOIN word_frequencies wf
191                                          ON word.word_id = wf.id
192                                 """)
193                     drop_tables(conn, 'word_frequencies')
194
195             with conn.cursor() as cur:
196                 cur.execute('SET max_parallel_workers_per_gather TO 0')
197
198             sqlp = SQLPreprocessor(conn, config)
199             sqlp.run_string(conn,
200                             'GRANT SELECT ON tmp_word TO "{{config.DATABASE_WEBUSER}}"')
201             conn.commit()
202         self._create_base_indices(config, 'tmp_word')
203         self._create_lookup_indices(config, 'tmp_word')
204         self._move_temporary_word_table('tmp_word')
205
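    # After update_statistics() the `info` column of the affected `word` rows
    # carries the recomputed frequencies, e.g. (illustrative values only):
    #
    #     {"count": 1523, "addr_count": 87}
    #
    # `count` stems from name_vector occurrences and `addr_count` from
    # nameaddress_vector occurrences in search_name.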
206
207
208     def _cleanup_housenumbers(self) -> None:
209         """ Remove unused house numbers.
210         """
211         with connect(self.dsn) as conn:
212             if not table_exists(conn, 'search_name'):
213                 return
214             with conn.cursor(name="hnr_counter") as cur:
215                 cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
216                                FROM word
217                                WHERE type = 'H'
218                                  AND NOT EXISTS(SELECT * FROM search_name
219                                                 WHERE ARRAY[word.word_id] && name_vector)
220                                  AND (char_length(coalesce(word, word_token)) > 6
221                                       OR coalesce(word, word_token) not similar to '\\d+')
222                             """)
223                 candidates = {token: wid for wid, token in cur}
224             with conn.cursor(name="hnr_counter") as cur:
225                 cur.execute("""SELECT housenumber FROM placex
226                                WHERE housenumber is not null
227                                      AND (char_length(housenumber) > 6
228                                           OR housenumber not similar to '\\d+')
229                             """)
230                 for row in cur:
231                     for hnr in row[0].split(';'):
232                         candidates.pop(hnr, None)
233             LOG.info("There are %s outdated housenumbers.", len(candidates))
234             LOG.debug("Outdated housenumbers: %s", candidates.keys())
235             if candidates:
236                 with conn.cursor() as cur:
237                     cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
238                                 (list(candidates.values()), ))
239                 conn.commit()
240
241
242
243     def update_word_tokens(self) -> None:
244         """ Remove unused tokens.
245         """
246         LOG.warning("Cleaning up housenumber tokens.")
247         self._cleanup_housenumbers()
248         LOG.warning("Tokenizer house-keeping done.")
249
250
251     def name_analyzer(self) -> 'ICUNameAnalyzer':
252         """ Create a new analyzer for tokenizing names and queries
253             using this tokenizer. Analyzers are context managers and should
254             be used accordingly:
255
256             ```
257             with tokenizer.name_analyzer() as analyzer:
258                 analyzer.tokenize()
259             ```
260
261             When used outside a with construct, the caller must make sure to
262             call the close() function before the analyzer is destroyed.
263
264             Analyzers are not thread-safe. You need to instantiate one per thread.
265         """
266         assert self.loader is not None
267         return ICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
268                                self.loader.make_token_analysis())
269
270
271     def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
272         """ Return a list of the `num` most frequent full words
273             in the database.
274         """
275         with conn.cursor() as cur:
276             cur.execute("""SELECT word, sum((info->>'count')::int) as count
277                              FROM word WHERE type = 'W'
278                              GROUP BY word
279                              ORDER BY count DESC LIMIT %s""", (num,))
280             return list(s[0].split('@')[0] for s in cur)
281
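    # Note (added for clarity): full words created through a named analysis
    # module are stored as '<name>@<analyzer id>' (see _compute_name_tokens()
    # in ICUNameAnalyzer below), hence the split('@')[0] above, which returns
    # only the plain name part.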
282
283     def _install_php(self, phpdir: Optional[Path], overwrite: bool = True) -> None:
284         """ Install the php script for the tokenizer.
285         """
286         if phpdir is not None:
287             assert self.loader is not None
288             php_file = self.data_dir / "tokenizer.php"
289
290             if not php_file.exists() or overwrite:
291                 php_file.write_text(dedent(f"""\
292                     <?php
293                     @define('CONST_Max_Word_Frequency', 10000000);
294                     @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
295                     @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
296                     require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')
297
298
299     def _save_config(self) -> None:
300         """ Save the configuration that needs to remain stable for the given
301             database as database properties.
302         """
303         assert self.loader is not None
304         with connect(self.dsn) as conn:
305             self.loader.save_config_to_db(conn)
306
307
308     def _setup_db_tables(self, config: Configuration) -> None:
309         """ Create an empty word table and the word id sequence and grant
310             the web user read access to them.
311         """
312         with connect(self.dsn) as conn:
313             drop_tables(conn, 'word')
314             sqlp = SQLPreprocessor(conn, config)
315             sqlp.run_string(conn, """
316                 CREATE TABLE word (
317                       word_id INTEGER,
318                       word_token text NOT NULL,
319                       type text NOT NULL,
320                       word text,
321                       info jsonb
322                     ) {{db.tablespace.search_data}};
323                 GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
324
325                 DROP SEQUENCE IF EXISTS seq_word;
326                 CREATE SEQUENCE seq_word start 1;
327                 GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
328             """)
329             conn.commit()
330
331
332     def _create_base_indices(self, config: Configuration, table_name: str) -> None:
333         """ Create the word token index and the partial per-type indexes
334             on the given word table.
335         """
336         with connect(self.dsn) as conn:
337             sqlp = SQLPreprocessor(conn, config)
338             sqlp.run_string(conn,
339                             """CREATE INDEX idx_{{table_name}}_word_token ON {{table_name}}
340                                USING BTREE (word_token) {{db.tablespace.search_index}}""",
341                             table_name=table_name)
342             for name, ctype in WORD_TYPES:
343                 sqlp.run_string(conn,
344                                 """CREATE INDEX idx_{{table_name}}_{{idx_name}} ON {{table_name}}
345                                    USING BTREE (word) {{db.tablespace.address_index}}
346                                    WHERE type = '{{column_type}}'
347                                 """,
348                                 table_name=table_name, idx_name=name,
349                                 column_type=ctype)
350             conn.commit()
351
352
353     def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
354         """ Create additional indexes used when running the API.
355         """
356         with connect(self.dsn) as conn:
357             sqlp = SQLPreprocessor(conn, config)
358             # Index required for details lookup.
359             sqlp.run_string(conn, """
360                 CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
361                   ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
362             """,
363             table_name=table_name)
364             conn.commit()
365
366
367     def _move_temporary_word_table(self, old: str) -> None:
368         """ Rename all tables and indexes used by the tokenizer.
369         """
370         with connect(self.dsn) as conn:
371             drop_tables(conn, 'word')
372             with conn.cursor() as cur:
373                 cur.execute(f"ALTER TABLE {old} RENAME TO word")
374                 for idx in ('word_token', 'word_id'):
375                     cur.execute(f"""ALTER INDEX idx_{old}_{idx}
376                                       RENAME TO idx_word_{idx}""")
377                 for name, _ in WORD_TYPES:
378                     cur.execute(f"""ALTER INDEX idx_{old}_{name}
379                                     RENAME TO idx_word_{name}""")
380             conn.commit()
381
382
383
384
385 class ICUNameAnalyzer(AbstractAnalyzer):
386     """ The ICU analyzer uses the ICU library for normalising and transliterating names.
387
388         Each instance opens its own connection to the database for creating
389         and looking up tokens.
390     """
391
392     def __init__(self, dsn: str, sanitizer: PlaceSanitizer,
393                  token_analysis: ICUTokenAnalysis) -> None:
394         self.conn: Optional[Connection] = connect(dsn).connection
395         self.conn.autocommit = True
396         self.sanitizer = sanitizer
397         self.token_analysis = token_analysis
398
399         self._cache = _TokenCache()
400
401
402     def close(self) -> None:
403         """ Free all resources used by the analyzer.
404         """
405         if self.conn:
406             self.conn.close()
407             self.conn = None
408
409
410     def _search_normalized(self, name: str) -> str:
411         """ Return the search token transliteration of the given name.
412         """
413         return cast(str, self.token_analysis.search.transliterate(name)).strip()
414
415
416     def _normalized(self, name: str) -> str:
417         """ Return the normalized version of the given name with all
418             non-relevant information removed.
419         """
420         return cast(str, self.token_analysis.normalizer.transliterate(name)).strip()
421
422
423     def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
424         """ Return token information for the given list of words.
425             If a word starts with # it is assumed to be a full name
426             otherwise it is taken to be a partial name.
427
428             The function returns a list of tuples with
429             (original word, word token, word id).
430
431             The function is used for testing and debugging only
432             and is not necessarily efficient.
433         """
434         assert self.conn is not None
435         full_tokens = {}
436         partial_tokens = {}
437         for word in words:
438             if word.startswith('#'):
439                 full_tokens[word] = self._search_normalized(word[1:])
440             else:
441                 partial_tokens[word] = self._search_normalized(word)
442
443         with self.conn.cursor() as cur:
444             cur.execute("""SELECT word_token, word_id
445                             FROM word WHERE word_token = ANY(%s) and type = 'W'
446                         """, (list(full_tokens.values()),))
447             full_ids = {r[0]: r[1] for r in cur}
448             cur.execute("""SELECT word_token, word_id
449                             FROM word WHERE word_token = ANY(%s) and type = 'w'""",
450                         (list(partial_tokens.values()),))
451             part_ids = {r[0]: r[1] for r in cur}
452
453         return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
454                + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
455
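    # Illustrative usage sketch (not part of the original module); the words,
    # tokens and ids below are made up:
    #
    #     analyzer.get_word_token_info(['#Main Street', 'main'])
    #     # -> [('#Main Street', 'main street', 123), ('main', 'main', 456)]
    #
    # '#'-prefixed entries are looked up as full names (type 'W'), all other
    # entries as partial names (type 'w').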
456
457     def normalize_postcode(self, postcode: str) -> str:
458         """ Convert the postcode to a standardized form.
459
460             This function must yield exactly the same result as the SQL function
461             'token_normalized_postcode()'.
462         """
463         return postcode.strip().upper()
464
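    # Example (added for illustration): ' ec1a 1bb ' -> 'EC1A 1BB'. Only
    # surrounding whitespace is stripped and letters are upper-cased; any other
    # formatting is kept as is.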
465
466     def update_postcodes_from_db(self) -> None:
467         """ Update postcode tokens in the word table from the location_postcode
468             table.
469         """
470         assert self.conn is not None
471         analyzer = self.token_analysis.analysis.get('@postcode')
472
473         with self.conn.cursor() as cur:
474             # First get all postcode names currently in the word table.
475             cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
476             word_entries = set((entry[0] for entry in cur))
477
478             # Then compute the required postcode names from the postcode table.
479             needed_entries = set()
480             cur.execute("SELECT country_code, postcode FROM location_postcode")
481             for cc, postcode in cur:
482                 info = PlaceInfo({'country_code': cc,
483                                   'class': 'place', 'type': 'postcode',
484                                   'address': {'postcode': postcode}})
485                 address = self.sanitizer.process_names(info)[1]
486                 for place in address:
487                     if place.kind == 'postcode':
488                         if analyzer is None:
489                             postcode_name = place.name.strip().upper()
490                             variant_base = None
491                         else:
492                             postcode_name = analyzer.get_canonical_id(place)
493                             variant_base = place.get_attr("variant")
494
495                         if variant_base:
496                             needed_entries.add(f'{postcode_name}@{variant_base}')
497                         else:
498                             needed_entries.add(postcode_name)
499                         break
500
501         # Now update the word table.
502         self._delete_unused_postcode_words(word_entries - needed_entries)
503         self._add_missing_postcode_words(needed_entries - word_entries)
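
    # Postcode entries are keyed either by the plain canonical form or, when the
    # '@postcode' analysis supplies a variant base, as '<canonical>@<variant base>'
    # (e.g. 'EC1A 1BB@ec1a 1bb'; the example value is made up).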
504
505     def _delete_unused_postcode_words(self, tokens: Iterable[str]) -> None:
506         assert self.conn is not None
507         if tokens:
508             with self.conn.cursor() as cur:
509                 cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
510                             (list(tokens), ))
511
512     def _add_missing_postcode_words(self, tokens: Iterable[str]) -> None:
513         assert self.conn is not None
514         if not tokens:
515             return
516
517         analyzer = self.token_analysis.analysis.get('@postcode')
518         terms = []
519
520         for postcode_name in tokens:
521             if '@' in postcode_name:
522                 term, variant = postcode_name.split('@', 2)
523                 term = self._search_normalized(term)
524                 if analyzer is None:
525                     variants = [term]
526                 else:
527                     variants = analyzer.compute_variants(variant)
528                     if term not in variants:
529                         variants.append(term)
530             else:
531                 variants = [self._search_normalized(postcode_name)]
532             terms.append((postcode_name, variants))
533
534         if terms:
535             with self.conn.cursor() as cur:
536                 cur.execute_values("""SELECT create_postcode_word(pc, var)
537                                       FROM (VALUES %s) AS v(pc, var)""",
538                                    terms)
539
540
541
542
543     def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
544                                should_replace: bool) -> None:
545         """ Replace the search index for special phrases with the new phrases.
546             If `should_replace` is True, then the previous set of phrases will be
547             completely replaced. Otherwise the phrases are added to the
548             already existing ones.
549         """
550         assert self.conn is not None
551         norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
552                             for p in phrases))
553
554         with self.conn.cursor() as cur:
555             # Get the old phrases.
556             existing_phrases = set()
557             cur.execute("SELECT word, info FROM word WHERE type = 'S'")
558             for word, info in cur:
559                 existing_phrases.add((word, info['class'], info['type'],
560                                       info.get('op') or '-'))
561
562             added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
563             if should_replace:
564                 deleted = self._remove_special_phrases(cur, norm_phrases,
565                                                        existing_phrases)
566             else:
567                 deleted = 0
568
569         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
570                  len(norm_phrases), added, deleted)
571
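    # Shape of an input phrase tuple (example values are illustrative):
    #
    #     ('bar', 'amenity', 'bar', '-')    # word, class, type, operator
    #
    # '-' stands for "no operator"; only 'in' and 'near' are kept in the word
    # table (see _add_special_phrases() below).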
572
573     def _add_special_phrases(self, cursor: Cursor,
574                              new_phrases: Set[Tuple[str, str, str, str]],
575                              existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
576         """ Add all phrases to the database that are not yet there.
577         """
578         to_add = new_phrases - existing_phrases
579
580         added = 0
581         with CopyBuffer() as copystr:
582             for word, cls, typ, oper in to_add:
583                 term = self._search_normalized(word)
584                 if term:
585                     copystr.add(term, 'S', word,
586                                 json.dumps({'class': cls, 'type': typ,
587                                             'op': oper if oper in ('in', 'near') else None}))
588                     added += 1
589
590             copystr.copy_out(cursor, 'word',
591                              columns=['word_token', 'type', 'word', 'info'])
592
593         return added
594
595
596     def _remove_special_phrases(self, cursor: Cursor,
597                                  new_phrases: Set[Tuple[str, str, str, str]],
598                                  existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
599         """ Remove all phrases from the database that are no longer in the
600             new phrase list.
601         """
602         to_delete = existing_phrases - new_phrases
603
604         if to_delete:
605             cursor.execute_values(
606                 """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
607                     WHERE type = 'S' and word = name
608                           and info->>'class' = in_class and info->>'type' = in_type
609                           and ((op = '-' and info->>'op' is null) or op = info->>'op')
610                 """, to_delete)
611
612         return len(to_delete)
613
614
615     def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
616         """ Add default names for the given country to the search index.
617         """
618         # Make sure any name preprocessing for country names applies.
619         info = PlaceInfo({'name': names, 'country_code': country_code,
620                           'rank_address': 4, 'class': 'boundary',
621                           'type': 'administrative'})
622         self._add_country_full_names(country_code,
623                                      self.sanitizer.process_names(info)[0],
624                                      internal=True)
625
626
627     def _add_country_full_names(self, country_code: str, names: Sequence[PlaceName],
628                                 internal: bool = False) -> None:
629         """ Add names for the given country from an already sanitized
630             name list.
631         """
632         assert self.conn is not None
633         word_tokens = set()
634         for name in names:
635             norm_name = self._search_normalized(name.name)
636             if norm_name:
637                 word_tokens.add(norm_name)
638
639         with self.conn.cursor() as cur:
640             # Get existing names
641             cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
642                              FROM word
643                              WHERE type = 'C' and word = %s""",
644                         (country_code, ))
645             # internal/external names
646             existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()}
647             for word in cur:
648                 existing_tokens[word[1]].add(word[0])
649
650             # Delete names that no longer exist.
651             gone_tokens = existing_tokens[internal] - word_tokens
652             if internal:
653                 gone_tokens.update(existing_tokens[False] & word_tokens)
654             if gone_tokens:
655                 cur.execute("""DELETE FROM word
656                                USING unnest(%s) as token
657                                WHERE type = 'C' and word = %s
658                                      and word_token = token""",
659                             (list(gone_tokens), country_code))
660
661             # Only add those names that are not yet in the list.
662             new_tokens = word_tokens - existing_tokens[True]
663             if not internal:
664                 new_tokens -= existing_tokens[False]
665             if new_tokens:
666                 if internal:
667                     sql = """INSERT INTO word (word_token, type, word, info)
668                                (SELECT token, 'C', %s, '{"internal": "yes"}'
669                                   FROM unnest(%s) as token)
670                            """
671                 else:
672                     sql = """INSERT INTO word (word_token, type, word)
673                                    (SELECT token, 'C', %s
674                                     FROM unnest(%s) as token)
675                           """
676                 cur.execute(sql, (country_code, list(new_tokens)))
677
678
679     def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
680         """ Determine tokenizer information about the given place.
681
682             Returns a JSON-serializable structure that will be handed into
683             the database via the token_info field.
684         """
685         token_info = _TokenInfo()
686
687         names, address = self.sanitizer.process_names(place)
688
689         if names:
690             token_info.set_names(*self._compute_name_tokens(names))
691
692             if place.is_country():
693                 assert place.country_code is not None
694                 self._add_country_full_names(place.country_code, names)
695
696         if address:
697             self._process_place_address(token_info, address)
698
699         return token_info.to_dict()
700
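    # Illustrative sketch (not part of the original module): running a place
    # through the analyzer.  The attribute layout follows PlaceInfo usage
    # elsewhere in this file; the concrete values are made up.
    #
    #     place = PlaceInfo({'name': {'name': 'Main Street'},
    #                        'country_code': 'gb',
    #                        'address': {'housenumber': '3'}})
    #     token_info = analyzer.process_place(place)
    #     # -> JSON-serializable dict, e.g. with keys 'names', 'hnr',
    #     #    'hnr_tokens' (see _TokenInfo.to_dict() below).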
701
702     def _process_place_address(self, token_info: '_TokenInfo',
703                                address: Sequence[PlaceName]) -> None:
704         for item in address:
705             if item.kind == 'postcode':
706                 token_info.set_postcode(self._add_postcode(item))
707             elif item.kind == 'housenumber':
708                 token_info.add_housenumber(*self._compute_housenumber_token(item))
709             elif item.kind == 'street':
710                 token_info.add_street(self._retrieve_full_tokens(item.name))
711             elif item.kind == 'place':
712                 if not item.suffix:
713                     token_info.add_place(itertools.chain(*self._compute_name_tokens([item])))
714             elif not item.kind.startswith('_') and not item.suffix and \
715                  item.kind not in ('country', 'full', 'inclusion'):
716                 token_info.add_address_term(item.kind,
717                                             itertools.chain(*self._compute_name_tokens([item])))
718
719
720     def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
721         """ Normalize the housenumber and return the word token and the
722             canonical form.
723         """
724         assert self.conn is not None
725         analyzer = self.token_analysis.analysis.get('@housenumber')
726         result: Tuple[Optional[int], Optional[str]] = (None, None)
727
728         if analyzer is None:
729             # When no custom analyzer is set, simply normalize and transliterate
730             norm_name = self._search_normalized(hnr.name)
731             if norm_name:
732                 result = self._cache.housenumbers.get(norm_name, result)
733                 if result[0] is None:
734                     hid = execute_scalar(self.conn, "SELECT getorcreate_hnr_id(%s)", (norm_name, ))
735
736                     result = hid, norm_name
737                     self._cache.housenumbers[norm_name] = result
738         else:
739             # Otherwise use the analyzer to determine the canonical name.
740             # Per convention we use the first variant as the 'lookup name', the
741             # name that gets saved in the housenumber field of the place.
742             word_id = analyzer.get_canonical_id(hnr)
743             if word_id:
744                 result = self._cache.housenumbers.get(word_id, result)
745                 if result[0] is None:
746                     variants = analyzer.compute_variants(word_id)
747                     if variants:
748                         hid = execute_scalar(self.conn, "SELECT create_analyzed_hnr_id(%s, %s)",
749                                              (word_id, list(variants)))
750                         result = hid, variants[0]
751                         self._cache.housenumbers[word_id] = result
752
753         return result
754
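    # Result shape (illustrative): with the default analysis a housenumber like
    # '12a' yields (<id from getorcreate_hnr_id()>, '12a'); with a custom
    # '@housenumber' analysis the id comes from create_analyzed_hnr_id() and the
    # second element is the first computed variant.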
755
756     def _retrieve_full_tokens(self, name: str) -> List[int]:
757         """ Get the full name token for the given name, if it exists.
758             The name is only retrieved for the standard analyser.
759         """
760         assert self.conn is not None
761         norm_name = self._search_normalized(name)
762
763         # return cached if possible
764         if norm_name in self._cache.fulls:
765             return self._cache.fulls[norm_name]
766
767         with self.conn.cursor() as cur:
768             cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
769                         (norm_name, ))
770             full = [row[0] for row in cur]
771
772         self._cache.fulls[norm_name] = full
773
774         return full
775
776
777     def _compute_name_tokens(self, names: Sequence[PlaceName]) -> Tuple[Set[int], Set[int]]:
778         """ Compute the full name and partial name tokens for the given
779             list of names.
780         """
781         assert self.conn is not None
782         full_tokens: Set[int] = set()
783         partial_tokens: Set[int] = set()
784
785         for name in names:
786             analyzer_id = name.get_attr('analyzer')
787             analyzer = self.token_analysis.get_analyzer(analyzer_id)
788             word_id = analyzer.get_canonical_id(name)
789             if analyzer_id is None:
790                 token_id = word_id
791             else:
792                 token_id = f'{word_id}@{analyzer_id}'
793
794             full, part = self._cache.names.get(token_id, (None, None))
795             if full is None:
796                 variants = analyzer.compute_variants(word_id)
797                 if not variants:
798                     continue
799
800                 with self.conn.cursor() as cur:
801                     cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
802                                 (token_id, variants))
803                     full, part = cast(Tuple[int, List[int]], cur.fetchone())
804
805                 self._cache.names[token_id] = (full, part)
806
807             assert part is not None
808
809             full_tokens.add(full)
810             partial_tokens.update(part)
811
812         return full_tokens, partial_tokens
813
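    # Cache keys (added for clarity): names handled by the default analysis are
    # cached under their canonical id, names from a named analysis module under
    # '<canonical id>@<analyzer id>'.  The cached value is the pair returned by
    # getorcreate_full_word(), i.e. (full token id, list of partial token ids).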
814
815     def _add_postcode(self, item: PlaceName) -> Optional[str]:
816         """ Make sure the normalized postcode is present in the word table.
817         """
818         assert self.conn is not None
819         analyzer = self.token_analysis.analysis.get('@postcode')
820
821         if analyzer is None:
822             postcode_name = item.name.strip().upper()
823             variant_base = None
824         else:
825             postcode_name = analyzer.get_canonical_id(item)
826             variant_base = item.get_attr("variant")
827
828         if variant_base:
829             postcode = f'{postcode_name}@{variant_base}'
830         else:
831             postcode = postcode_name
832
833         if postcode not in self._cache.postcodes:
834             term = self._search_normalized(postcode_name)
835             if not term:
836                 return None
837
838             variants = {term}
839             if analyzer is not None and variant_base:
840                 variants.update(analyzer.compute_variants(variant_base))
841
842             with self.conn.cursor() as cur:
843                 cur.execute("SELECT create_postcode_word(%s, %s)",
844                             (postcode, list(variants)))
845             self._cache.postcodes.add(postcode)
846
847         return postcode_name
848
849
850 class _TokenInfo:
851     """ Collect token information to be sent back to the database.
852     """
853     def __init__(self) -> None:
854         self.names: Optional[str] = None
855         self.housenumbers: Set[str] = set()
856         self.housenumber_tokens: Set[int] = set()
857         self.street_tokens: Optional[Set[int]] = None
858         self.place_tokens: Set[int] = set()
859         self.address_tokens: Dict[str, str] = {}
860         self.postcode: Optional[str] = None
861
862
863     def _mk_array(self, tokens: Iterable[Any]) -> str:
864         return f"{{{','.join((str(s) for s in tokens))}}}"
865
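    # Example (added for illustration): _mk_array([1, 2, 3]) returns '{1,2,3}',
    # the literal format expected for PostgreSQL array columns.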
866
867     def to_dict(self) -> Dict[str, Any]:
868         """ Return the token information in database importable format.
869         """
870         out: Dict[str, Any] = {}
871
872         if self.names:
873             out['names'] = self.names
874
875         if self.housenumbers:
876             out['hnr'] = ';'.join(self.housenumbers)
877             out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)
878
879         if self.street_tokens is not None:
880             out['street'] = self._mk_array(self.street_tokens)
881
882         if self.place_tokens:
883             out['place'] = self._mk_array(self.place_tokens)
884
885         if self.address_tokens:
886             out['addr'] = self.address_tokens
887
888         if self.postcode:
889             out['postcode'] = self.postcode
890
891         return out
892
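    # Illustrative output (all values made up):
    #
    #     {'names': '{1,2,3}', 'hnr': '12;12a', 'hnr_tokens': '{55,56}',
    #      'street': '{7}', 'place': '{8}', 'addr': {'city': '{9,10}'},
    #      'postcode': 'EC1A 1BB'}
    #
    # Keys are only emitted when the corresponding information was collected.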
893
894     def set_names(self, fulls: Iterable[int], partials: Iterable[int]) -> None:
895         """ Adds token information for the normalised names.
896         """
897         self.names = self._mk_array(itertools.chain(fulls, partials))
898
899
900     def add_housenumber(self, token: Optional[int], hnr: Optional[str]) -> None:
901         """ Add the token and the canonical form of a single normalised
902             housenumber.
903         """
904         if token:
905             assert hnr is not None
906             self.housenumbers.add(hnr)
907             self.housenumber_tokens.add(token)
908
909
910     def add_street(self, tokens: Iterable[int]) -> None:
911         """ Add addr:street match terms.
912         """
913         if self.street_tokens is None:
914             self.street_tokens = set()
915         self.street_tokens.update(tokens)
916
917
918     def add_place(self, tokens: Iterable[int]) -> None:
919         """ Add addr:place search and match terms.
920         """
921         self.place_tokens.update(tokens)
922
923
924     def add_address_term(self, key: str, partials: Iterable[int]) -> None:
925         """ Add additional address terms.
926         """
927         array = self._mk_array(partials)
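        # An empty token list serialises to '{}' (length 2), so only arrays
        # longer than that carry actual tokens.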
928         if len(array) > 2:
929             self.address_tokens[key] = array
930
931     def set_postcode(self, postcode: Optional[str]) -> None:
932         """ Set the postcode to the given one.
933         """
934         self.postcode = postcode
935
936
937 class _TokenCache:
938     """ Cache for token information to avoid repeated database queries.
939
940         This cache is not thread-safe and needs to be instantiated per
941         analyzer.
942     """
943     def __init__(self) -> None:
944         self.names: Dict[str, Tuple[int, List[int]]] = {}
945         self.partials: Dict[str, int] = {}
946         self.fulls: Dict[str, List[int]] = {}
947         self.postcodes: Set[str] = set()
948         self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {}