# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \
                   Dict, Set, Iterable
import itertools
import json
import logging
from pathlib import Path
from textwrap import dedent

from nominatim.db.connection import connect, Connection, Cursor
from nominatim.config import Configuration
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.data.place_info import PlaceInfo
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
from nominatim.data.place_name import PlaceName
from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()

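# Each tuple is (index name, value of the 'type' column in the word table);
# used by _create_base_indices() and _move_temporary_word_table().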
WORD_TYPES = (('country_names', 'C'),
              ('postcodes', 'P'),
              ('full_word', 'W'),
              ('housenumbers', 'H'))

def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
    """ Create a new instance of the tokenizer provided by this module.
    """
    return ICUTokenizer(dsn, data_dir)

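# Illustrative usage sketch (placeholder names; within Nominatim the tokenizer
# is normally set up through the tokenizer factory rather than directly):
#
#   tokenizer = create(dsn, project_dir / 'tokenizer')
#   tokenizer.init_from_project(config)
#   with tokenizer.name_analyzer() as analyzer:
#       token_info = analyzer.process_place(place)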

class ICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn: str, data_dir: Path) -> None:
        self.dsn = dsn
        self.data_dir = data_dir
        self.loader: Optional[ICURuleLoader] = None


    def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        self.loader = ICURuleLoader(config)

        self._install_php(config.lib_dir.php, overwrite=True)
        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._setup_db_tables(config)
            self._create_base_indices(config, 'word')


    def init_from_project(self, config: Configuration) -> None:
        """ Initialise the tokenizer from the project directory.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)

        self._install_php(config.lib_dir.php, overwrite=False)


    def finalize_import(self, config: Configuration) -> None:
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        self._create_lookup_indices(config, 'word')


    def update_sql_functions(self, config: Configuration) -> None:
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')


    def check_database(self, config: Configuration) -> None:
        """ Check that the tokenizer is set up correctly.
        """
        # Will raise an error if there is an issue.
        self.init_from_project(config)


    def update_statistics(self, config: Configuration, threads: int = 2) -> None:
        """ Recompute frequencies for all name words.
        """
        with connect(self.dsn) as conn:
            if not conn.table_exists('search_name'):
                return

            with conn.cursor() as cur:
                cur.execute('ANALYSE search_name')
                if threads > 1:
                    cur.execute('SET max_parallel_workers_per_gather TO %s',
                                (min(threads, 6),))

                if conn.server_version_tuple() < (12, 0):
                    LOG.info('Computing word frequencies')
                    cur.drop_table('word_frequencies')
                    cur.drop_table('addressword_frequencies')
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute('CREATE INDEX ON word_frequencies(id)')
                    cur.execute("""CREATE TEMP TABLE addressword_frequencies AS
                                     SELECT unnest(nameaddress_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute('CREATE INDEX ON addressword_frequencies(id)')
                    cur.execute("""CREATE OR REPLACE FUNCTION word_freq_update(wid INTEGER,
                                                                               INOUT info JSONB)
                                   AS $$
                                   DECLARE rec RECORD;
                                   BEGIN
                                   IF info is null THEN
                                     info = '{}'::jsonb;
                                   END IF;
                                   FOR rec IN SELECT count FROM word_frequencies WHERE id = wid
                                   LOOP
                                     info = info || jsonb_build_object('count', rec.count);
                                   END LOOP;
                                   FOR rec IN SELECT count FROM addressword_frequencies WHERE id = wid
                                   LOOP
                                     info = info || jsonb_build_object('addr_count', rec.count);
                                   END LOOP;
                                   IF info = '{}'::jsonb THEN
                                     info = null;
                                   END IF;
                                   END;
                                   $$ LANGUAGE plpgsql IMMUTABLE;
                                """)
                    LOG.info('Update word table with recomputed frequencies')
                    cur.drop_table('tmp_word')
                    cur.execute("""CREATE TABLE tmp_word AS
                                    SELECT word_id, word_token, type, word,
                                           word_freq_update(word_id, info) as info
                                    FROM word
                                """)
                    cur.drop_table('word_frequencies')
                    cur.drop_table('addressword_frequencies')
                else:
                    LOG.info('Computing word frequencies')
                    cur.drop_table('word_frequencies')
                    cur.execute('ANALYSE search_name')
                    cur.execute('ANALYSE word')
                    cur.execute("""
                      CREATE TEMP TABLE word_frequencies AS
                      WITH word_freq AS MATERIALIZED (
                               SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id),
                           addr_freq AS MATERIALIZED (
                               SELECT unnest(nameaddress_vector) as id, count(*)
                                     FROM search_name GROUP BY id)
                      SELECT coalesce(a.id, w.id) as id,
                             (CASE WHEN w.count is null THEN '{}'::JSONB
                                  ELSE jsonb_build_object('count', w.count) END
                              ||
                              CASE WHEN a.count is null THEN '{}'::JSONB
                                  ELSE jsonb_build_object('addr_count', a.count) END) as info
                      FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id;
                      """)
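                    # The resulting 'info' objects have the form
                    # {"count": <frequency as name>, "addr_count": <frequency as address>},
                    # with a key omitted when the word never appears in that role.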
                    cur.execute('CREATE UNIQUE INDEX ON word_frequencies(id) INCLUDE(info)')
                    cur.execute('ANALYSE word_frequencies')
                    LOG.info('Update word table with recomputed frequencies')
                    cur.drop_table('tmp_word')
                    cur.execute("""CREATE TABLE tmp_word AS
                                    SELECT word_id, word_token, type, word,
                                           (CASE WHEN wf.info is null THEN word.info
                                            ELSE coalesce(word.info, '{}'::jsonb) || wf.info
                                            END) as info
                                    FROM word LEFT JOIN word_frequencies wf
                                         ON word.word_id = wf.id
                                    ORDER BY word_id
                                """)
                    cur.drop_table('word_frequencies')

            with conn.cursor() as cur:
                cur.execute('SET max_parallel_workers_per_gather TO 0')

            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_string(conn,
                            'GRANT SELECT ON tmp_word TO "{{config.DATABASE_WEBUSER}}"')
            conn.commit()
        self._create_base_indices(config, 'tmp_word')
        self._create_lookup_indices(config, 'tmp_word')
        self._move_temporary_word_table('tmp_word')



    def _cleanup_housenumbers(self) -> None:
        """ Remove unused house numbers.
        """
        with connect(self.dsn) as conn:
            if not conn.table_exists('search_name'):
                return
            with conn.cursor(name="hnr_counter") as cur:
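                # Candidates for removal: housenumber tokens that no longer
                # occur in any search_name entry. Tokens that are at most
                # 6 characters long and purely numeric are never selected here.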
                cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
                               FROM word
                               WHERE type = 'H'
                                 AND NOT EXISTS(SELECT * FROM search_name
                                                WHERE ARRAY[word.word_id] && name_vector)
                                 AND (char_length(coalesce(word, word_token)) > 6
                                      OR coalesce(word, word_token) not similar to '\\d+')
                            """)
                candidates = {token: wid for wid, token in cur}
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT housenumber FROM placex
                               WHERE housenumber is not null
                                     AND (char_length(housenumber) > 6
                                          OR housenumber not similar to '\\d+')
                            """)
                for row in cur:
                    for hnr in row[0].split(';'):
                        candidates.pop(hnr, None)
            LOG.info("There are %s outdated housenumbers.", len(candidates))
            LOG.debug("Outdated housenumbers: %s", candidates.keys())
            if candidates:
                with conn.cursor() as cur:
                    cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
                                (list(candidates.values()), ))
                conn.commit()



    def update_word_tokens(self) -> None:
        """ Remove unused tokens.
        """
        LOG.warning("Cleaning up housenumber tokens.")
        self._cleanup_housenumbers()
        LOG.warning("Tokenizer house-keeping done.")


    def name_analyzer(self) -> 'ICUNameAnalyzer':
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

            ```
            with tokenizer.name_analyzer() as analyzer:
                analyzer.tokenize()
            ```

            When used outside the with construct, the caller must make sure
            to call the close() function before the analyzer is destroyed.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        assert self.loader is not None
        return ICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                               self.loader.make_token_analysis())


    def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
        """ Return a list of the `num` most frequent full words
            in the database.
        """
        with conn.cursor() as cur:
            cur.execute("""SELECT word, sum((info->>'count')::int) as count
                             FROM word WHERE type = 'W'
                             GROUP BY word
                             ORDER BY count DESC LIMIT %s""", (num,))
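            # Full-word entries may be stored as '<word>@<analyzer>'
            # (see _compute_name_tokens()); strip the suffix so that only
            # the plain word is returned.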
            return list(s[0].split('@')[0] for s in cur)


    def _install_php(self, phpdir: Optional[Path], overwrite: bool = True) -> None:
        """ Install the php script for the tokenizer.
        """
        if phpdir is not None:
            assert self.loader is not None
            php_file = self.data_dir / "tokenizer.php"

            if not php_file.exists() or overwrite:
                php_file.write_text(dedent(f"""\
                    <?php
                    @define('CONST_Max_Word_Frequency', 10000000);
                    @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
                    @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
                    require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')


    def _save_config(self) -> None:
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        assert self.loader is not None
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)


    def _setup_db_tables(self, config: Configuration) -> None:
        """ Create the empty word table and the seq_word sequence.
        """
        with connect(self.dsn) as conn:
            with conn.cursor() as cur:
                cur.drop_table('word')
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_string(conn, """
                CREATE TABLE word (
                      word_id INTEGER,
                      word_token text NOT NULL,
                      type text NOT NULL,
                      word text,
                      info jsonb
                    ) {{db.tablespace.search_data}};
                GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";

                DROP SEQUENCE IF EXISTS seq_word;
                CREATE SEQUENCE seq_word start 1;
                GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
            """)
            conn.commit()


    def _create_base_indices(self, config: Configuration, table_name: str) -> None:
        """ Create the word_token index and the per-word-type partial indices
            on the given word table.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_string(conn,
                            """CREATE INDEX idx_{{table_name}}_word_token ON {{table_name}}
                               USING BTREE (word_token) {{db.tablespace.search_index}}""",
                            table_name=table_name)
            for name, ctype in WORD_TYPES:
                sqlp.run_string(conn,
                                """CREATE INDEX idx_{{table_name}}_{{idx_name}} ON {{table_name}}
                                   USING BTREE (word) {{db.tablespace.address_index}}
                                   WHERE type = '{{column_type}}'
                                """,
                                table_name=table_name, idx_name=name,
                                column_type=ctype)
            conn.commit()


    def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
        """ Create additional indexes used when running the API.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            # Index required for details lookup.
            sqlp.run_string(conn, """
                CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
                  ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
            """,
            table_name=table_name)
            conn.commit()


    def _move_temporary_word_table(self, old: str) -> None:
        """ Rename all tables and indexes used by the tokenizer.
        """
        with connect(self.dsn) as conn:
            with conn.cursor() as cur:
                cur.drop_table('word')
                cur.execute(f"ALTER TABLE {old} RENAME TO word")
                for idx in ('word_token', 'word_id'):
                    cur.execute(f"""ALTER INDEX idx_{old}_{idx}
                                      RENAME TO idx_word_{idx}""")
                for name, _ in WORD_TYPES:
                    cur.execute(f"""ALTER INDEX idx_{old}_{name}
                                    RENAME TO idx_word_{name}""")
            conn.commit()




class ICUNameAnalyzer(AbstractAnalyzer):
    """ The ICU analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn: str, sanitizer: PlaceSanitizer,
                 token_analysis: ICUTokenAnalysis) -> None:
        self.conn: Optional[Connection] = connect(dsn).connection
        self.conn.autocommit = True
        self.sanitizer = sanitizer
        self.token_analysis = token_analysis

        self._cache = _TokenCache()


    def close(self) -> None:
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    def _search_normalized(self, name: str) -> str:
        """ Return the search token transliteration of the given name.
        """
        return cast(str, self.token_analysis.search.transliterate(name)).strip()


    def _normalized(self, name: str) -> str:
        """ Return the normalized version of the given name with all
            non-relevant information removed.
        """
        return cast(str, self.token_analysis.normalizer.transliterate(name)).strip()


    def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
        """ Return token information for the given list of words.
            If a word starts with '#' it is assumed to be a full name,
            otherwise it is treated as a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is meant for testing and debugging only
            and is not guaranteed to be efficient.
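
            Example (illustrative only; actual token ids depend on the
            database content):
                get_word_token_info(['#Main Street', 'main'])
                -> [('#Main Street', 'main street', 123), ('main', 'main', 45)]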
        """
        assert self.conn is not None
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self._search_normalized(word[1:])
            else:
                partial_tokens[word] = self._search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]


    def normalize_postcode(self, postcode: str) -> str:
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
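
            For example (illustrative): ' ab1 2cd ' -> 'AB1 2CD'.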
        """
        return postcode.strip().upper()


    def update_postcodes_from_db(self) -> None:
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@postcode')

        with self.conn.cursor() as cur:
            # First get all postcode names currently in the word table.
            cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
            word_entries = set((entry[0] for entry in cur))

            # Then compute the required postcode names from the postcode table.
            needed_entries = set()
            cur.execute("SELECT country_code, postcode FROM location_postcode")
            for cc, postcode in cur:
                info = PlaceInfo({'country_code': cc,
                                  'class': 'place', 'type': 'postcode',
                                  'address': {'postcode': postcode}})
                address = self.sanitizer.process_names(info)[1]
                for place in address:
                    if place.kind == 'postcode':
                        if analyzer is None:
                            postcode_name = place.name.strip().upper()
                            variant_base = None
                        else:
                            postcode_name = analyzer.get_canonical_id(place)
                            variant_base = place.get_attr("variant")

                        if variant_base:
                            needed_entries.add(f'{postcode_name}@{variant_base}')
                        else:
                            needed_entries.add(postcode_name)
                        break

        # Now update the word table.
        self._delete_unused_postcode_words(word_entries - needed_entries)
        self._add_missing_postcode_words(needed_entries - word_entries)

    def _delete_unused_postcode_words(self, tokens: Iterable[str]) -> None:
        assert self.conn is not None
        if tokens:
            with self.conn.cursor() as cur:
                cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
                            (list(tokens), ))

    def _add_missing_postcode_words(self, tokens: Iterable[str]) -> None:
        assert self.conn is not None
        if not tokens:
            return

        analyzer = self.token_analysis.analysis.get('@postcode')
        terms = []

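        # Entries may be encoded as '<canonical postcode>@<variant source>';
        # the variant part is expanded through the '@postcode' analyzer when
        # one is configured.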
        for postcode_name in tokens:
            if '@' in postcode_name:
                term, variant = postcode_name.split('@', 2)
                term = self._search_normalized(term)
                if analyzer is None:
                    variants = [term]
                else:
                    variants = analyzer.compute_variants(variant)
                    if term not in variants:
                        variants.append(term)
            else:
                variants = [self._search_normalized(postcode_name)]
            terms.append((postcode_name, variants))

        if terms:
            with self.conn.cursor() as cur:
                cur.execute_values("""SELECT create_postcode_word(pc, var)
                                      FROM (VALUES %s) AS v(pc, var)""",
                                   terms)




    def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
                               should_replace: bool) -> None:
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases is
            completely replaced. Otherwise the phrases are added to the
            already existing ones.
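
            Each phrase is a tuple of (phrase, class, type, operator), e.g.
            (illustrative) ('bar', 'amenity', 'bar', '-'), where '-' stands
            for 'no operator'.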
        """
        assert self.conn is not None
        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)


    def _add_special_phrases(self, cursor: Cursor,
                             new_phrases: Set[Tuple[str, str, str, str]],
                             existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self._search_normalized(word)
                if term:
                    copystr.add(term, 'S', word,
                                json.dumps({'class': cls, 'type': typ,
                                            'op': oper if oper in ('in', 'near') else None}))
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word_token', 'type', 'word', 'info'])

        return added


    def _remove_special_phrases(self, cursor: Cursor,
                                new_phrases: Set[Tuple[str, str, str, str]],
                                existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.execute_values(
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE type = 'S' and word = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
                """, to_delete)

        return len(to_delete)


    def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
        """ Add default names for the given country to the search index.
        """
        # Make sure any name preprocessing for country names applies.
        info = PlaceInfo({'name': names, 'country_code': country_code,
                          'rank_address': 4, 'class': 'boundary',
                          'type': 'administrative'})
        self._add_country_full_names(country_code,
                                     self.sanitizer.process_names(info)[0],
                                     internal=True)


    def _add_country_full_names(self, country_code: str, names: Sequence[PlaceName],
                                internal: bool = False) -> None:
        """ Add names for the given country from an already sanitized
            name list.
        """
        assert self.conn is not None
        word_tokens = set()
        for name in names:
            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            # Get existing names
            cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
                             FROM word
                             WHERE type = 'C' and word = %s""",
                        (country_code, ))
            # internal/external names
            existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()}
            for word in cur:
                existing_tokens[word[1]].add(word[0])

            # Delete names that no longer exist.
            gone_tokens = existing_tokens[internal] - word_tokens
            if internal:
                gone_tokens.update(existing_tokens[False] & word_tokens)
            if gone_tokens:
                cur.execute("""DELETE FROM word
                               USING unnest(%s) as token
                               WHERE type = 'C' and word = %s
                                     and word_token = token""",
                            (list(gone_tokens), country_code))

            # Only add those names that are not yet in the list.
            new_tokens = word_tokens - existing_tokens[True]
            if not internal:
                new_tokens -= existing_tokens[False]
            if new_tokens:
                if internal:
                    sql = """INSERT INTO word (word_token, type, word, info)
                               (SELECT token, 'C', %s, '{"internal": "yes"}'
                                  FROM unnest(%s) as token)
                           """
                else:
                    sql = """INSERT INTO word (word_token, type, word)
                                   (SELECT token, 'C', %s
                                    FROM unnest(%s) as token)
                          """
                cur.execute(sql, (country_code, list(new_tokens)))


    def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
        """ Determine tokenizer information about the given place.

            Returns a JSON-serializable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo()

        names, address = self.sanitizer.process_names(place)

        if names:
            token_info.set_names(*self._compute_name_tokens(names))

            if place.is_country():
                assert place.country_code is not None
                self._add_country_full_names(place.country_code, names)

        if address:
            self._process_place_address(token_info, address)

        return token_info.to_dict()


    def _process_place_address(self, token_info: '_TokenInfo',
                               address: Sequence[PlaceName]) -> None:
        for item in address:
            if item.kind == 'postcode':
                token_info.set_postcode(self._add_postcode(item))
            elif item.kind == 'housenumber':
                token_info.add_housenumber(*self._compute_housenumber_token(item))
            elif item.kind == 'street':
                token_info.add_street(self._retrieve_full_tokens(item.name))
            elif item.kind == 'place':
                if not item.suffix:
                    token_info.add_place(self._compute_partial_tokens(item.name))
            elif not item.kind.startswith('_') and not item.suffix and \
                 item.kind not in ('country', 'full', 'inclusion'):
                token_info.add_address_term(item.kind, self._compute_partial_tokens(item.name))


    def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
        """ Normalize the housenumber and return the word token and the
            canonical form.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@housenumber')
        result: Tuple[Optional[int], Optional[str]] = (None, None)

        if analyzer is None:
            # When no custom analyzer is set, simply normalize and transliterate.
            norm_name = self._search_normalized(hnr.name)
            if norm_name:
                result = self._cache.housenumbers.get(norm_name, result)
                if result[0] is None:
                    with self.conn.cursor() as cur:
                        hid = cur.scalar("SELECT getorcreate_hnr_id(%s)", (norm_name, ))

                        result = hid, norm_name
                        self._cache.housenumbers[norm_name] = result
        else:
            # Otherwise use the analyzer to determine the canonical name.
            # Per convention we use the first variant as the 'lookup name', the
            # name that gets saved in the housenumber field of the place.
            word_id = analyzer.get_canonical_id(hnr)
            if word_id:
                result = self._cache.housenumbers.get(word_id, result)
                if result[0] is None:
                    variants = analyzer.compute_variants(word_id)
                    if variants:
                        with self.conn.cursor() as cur:
                            hid = cur.scalar("SELECT create_analyzed_hnr_id(%s, %s)",
                                             (word_id, list(variants)))
                            result = hid, variants[0]
                            self._cache.housenumbers[word_id] = result

        return result


    def _compute_partial_tokens(self, name: str) -> List[int]:
        """ Normalize the given term, split it into partial words and return
            the token list for them.
        """
        assert self.conn is not None
        norm_name = self._search_normalized(name)

        tokens = []
        need_lookup = []
        for partial in norm_name.split():
            token = self._cache.partials.get(partial)
            if token:
                tokens.append(token)
            else:
                need_lookup.append(partial)

        if need_lookup:
            with self.conn.cursor() as cur:
                cur.execute("""SELECT word, getorcreate_partial_word(word)
                               FROM unnest(%s) word""",
                            (need_lookup, ))

                for partial, token in cur:
                    assert token is not None
                    tokens.append(token)
                    self._cache.partials[partial] = token

        return tokens


    def _retrieve_full_tokens(self, name: str) -> List[int]:
        """ Get the full name token for the given name, if it exists.
            The name is only retrieved for the standard analyzer.
        """
        assert self.conn is not None
        norm_name = self._search_normalized(name)

        # return cached if possible
        if norm_name in self._cache.fulls:
            return self._cache.fulls[norm_name]

        with self.conn.cursor() as cur:
            cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
                        (norm_name, ))
            full = [row[0] for row in cur]

        self._cache.fulls[norm_name] = full

        return full


    def _compute_name_tokens(self, names: Sequence[PlaceName]) -> Tuple[Set[int], Set[int]]:
        """ Compute the full name and partial name tokens for the given
            list of names.
        """
        assert self.conn is not None
        full_tokens: Set[int] = set()
        partial_tokens: Set[int] = set()

        for name in names:
            analyzer_id = name.get_attr('analyzer')
            analyzer = self.token_analysis.get_analyzer(analyzer_id)
            word_id = analyzer.get_canonical_id(name)
            if analyzer_id is None:
                token_id = word_id
            else:
                token_id = f'{word_id}@{analyzer_id}'

            full, part = self._cache.names.get(token_id, (None, None))
            if full is None:
                variants = analyzer.compute_variants(word_id)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
                                (token_id, variants))
                    full, part = cast(Tuple[int, List[int]], cur.fetchone())

                self._cache.names[token_id] = (full, part)

            assert part is not None

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens


    def _add_postcode(self, item: PlaceName) -> Optional[str]:
        """ Make sure the normalized postcode is present in the word table.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@postcode')

        if analyzer is None:
            postcode_name = item.name.strip().upper()
            variant_base = None
        else:
            postcode_name = analyzer.get_canonical_id(item)
            variant_base = item.get_attr("variant")

        if variant_base:
            postcode = f'{postcode_name}@{variant_base}'
        else:
            postcode = postcode_name

        if postcode not in self._cache.postcodes:
            term = self._search_normalized(postcode_name)
            if not term:
                return None

            variants = {term}
            if analyzer is not None and variant_base:
                variants.update(analyzer.compute_variants(variant_base))

            with self.conn.cursor() as cur:
                cur.execute("SELECT create_postcode_word(%s, %s)",
                            (postcode, list(variants)))
            self._cache.postcodes.add(postcode)

        return postcode_name


class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self) -> None:
        self.names: Optional[str] = None
        self.housenumbers: Set[str] = set()
        self.housenumber_tokens: Set[int] = set()
        self.street_tokens: Optional[Set[int]] = None
        self.place_tokens: Set[int] = set()
        self.address_tokens: Dict[str, str] = {}
        self.postcode: Optional[str] = None


    def _mk_array(self, tokens: Iterable[Any]) -> str:
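        # Format the tokens as a PostgreSQL array literal,
        # e.g. [1, 2, 3] -> '{1,2,3}'.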
        return f"{{{','.join((str(s) for s in tokens))}}}"


    def to_dict(self) -> Dict[str, Any]:
        """ Return the token information in database importable format.
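
            Example of the returned structure (illustrative):
                {'names': '{1,2,3}', 'hnr': '12;12a', 'hnr_tokens': '{55}',
                 'street': '{7,8}', 'place': '{9}', 'addr': {'city': '{4,5}'},
                 'postcode': '12345'}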
        """
        out: Dict[str, Any] = {}

        if self.names:
            out['names'] = self.names

        if self.housenumbers:
            out['hnr'] = ';'.join(self.housenumbers)
            out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)

        if self.street_tokens is not None:
            out['street'] = self._mk_array(self.street_tokens)

        if self.place_tokens:
            out['place'] = self._mk_array(self.place_tokens)

        if self.address_tokens:
            out['addr'] = self.address_tokens

        if self.postcode:
            out['postcode'] = self.postcode

        return out


    def set_names(self, fulls: Iterable[int], partials: Iterable[int]) -> None:
        """ Adds token information for the normalised names.
        """
        self.names = self._mk_array(itertools.chain(fulls, partials))


    def add_housenumber(self, token: Optional[int], hnr: Optional[str]) -> None:
        """ Add a housenumber token and its normalised form, when a token
            is given.
        """
        if token:
            assert hnr is not None
            self.housenumbers.add(hnr)
            self.housenumber_tokens.add(token)


    def add_street(self, tokens: Iterable[int]) -> None:
        """ Add addr:street match terms.
        """
        if self.street_tokens is None:
            self.street_tokens = set()
        self.street_tokens.update(tokens)


    def add_place(self, tokens: Iterable[int]) -> None:
        """ Add addr:place search and match terms.
        """
        self.place_tokens.update(tokens)


    def add_address_term(self, key: str, partials: Iterable[int]) -> None:
        """ Add additional address terms.
        """
        if partials:
            self.address_tokens[key] = self._mk_array(partials)

    def set_postcode(self, postcode: Optional[str]) -> None:
        """ Set the postcode to the given one.
        """
        self.postcode = postcode


class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self) -> None:
        self.names: Dict[str, Tuple[int, List[int]]] = {}
        self.partials: Dict[str, int] = {}
        self.fulls: Dict[str, List[int]] = {}
        self.postcodes: Set[str] = set()
        self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {}