# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \
                   Dict, Set, Iterable
import itertools
import logging
from pathlib import Path

from psycopg.types.json import Jsonb
from psycopg import sql as pysql

from ..db.connection import connect, Connection, Cursor, server_version_tuple,\
                            drop_tables, table_exists, execute_scalar
from ..config import Configuration
from ..db.sql_preprocessor import SQLPreprocessor
from ..data.place_info import PlaceInfo
from ..data.place_name import PlaceName
from .icu_rule_loader import ICURuleLoader
from .place_sanitizer import PlaceSanitizer
from .icu_token_analysis import ICUTokenAnalysis
from .base import AbstractAnalyzer, AbstractTokenizer

DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()

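# Pairs of (index name, value of the 'type' column in the word table) used for
# the per-type indexes that are created and renamed further down.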
WORD_TYPES = (('country_names', 'C'),
              ('postcodes', 'P'),
              ('full_word', 'W'),
              ('housenumbers', 'H'))

def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
    """ Create a new instance of the tokenizer provided by this module.
    """
    return ICUTokenizer(dsn, data_dir)


class ICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn: str, data_dir: Path) -> None:
        self.dsn = dsn
        self.data_dir = data_dir
        self.loader: Optional[ICURuleLoader] = None


    def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        self.loader = ICURuleLoader(config)

        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._setup_db_tables(config)
            self._create_base_indices(config, 'word')


    def init_from_project(self, config: Configuration) -> None:
        """ Initialise the tokenizer from the project directory.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)


    def finalize_import(self, config: Configuration) -> None:
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        self._create_lookup_indices(config, 'word')


    def update_sql_functions(self, config: Configuration) -> None:
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')


    def check_database(self, config: Configuration) -> None:
        """ Check that the tokenizer is set up correctly.
        """
        # Will throw an error if there is an issue.
        self.init_from_project(config)


    def update_statistics(self, config: Configuration, threads: int = 2) -> None:
        """ Recompute frequencies for all name words.
        """
        with connect(self.dsn) as conn:
            if not table_exists(conn, 'search_name'):
                return

            with conn.cursor() as cur:
                cur.execute('ANALYSE search_name')
                if threads > 1:
                    cur.execute(pysql.SQL('SET max_parallel_workers_per_gather TO {}')
                                     .format(pysql.Literal(min(threads, 6))))

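                # PostgreSQL 12 introduced MATERIALIZED CTEs; older servers get the
                # step-by-step fallback below that builds temporary frequency tables
                # and merges them through a helper function.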
                if server_version_tuple(conn) < (12, 0):
                    LOG.info('Computing word frequencies')
                    drop_tables(conn, 'word_frequencies', 'addressword_frequencies')
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute('CREATE INDEX ON word_frequencies(id)')
                    cur.execute("""CREATE TEMP TABLE addressword_frequencies AS
                                     SELECT unnest(nameaddress_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute('CREATE INDEX ON addressword_frequencies(id)')
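                    # word_freq_update() folds the counts from both temporary tables
                    # into the word's JSONB info column, leaving the info NULL when
                    # no counts exist for a word.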
                    cur.execute("""CREATE OR REPLACE FUNCTION word_freq_update(wid INTEGER,
                                                                               INOUT info JSONB)
                                   AS $$
                                   DECLARE rec RECORD;
                                   BEGIN
                                   IF info is null THEN
                                     info = '{}'::jsonb;
                                   END IF;
                                   FOR rec IN SELECT count FROM word_frequencies WHERE id = wid
                                   LOOP
                                     info = info || jsonb_build_object('count', rec.count);
                                   END LOOP;
                                   FOR rec IN SELECT count FROM addressword_frequencies WHERE id = wid
                                   LOOP
                                     info = info || jsonb_build_object('addr_count', rec.count);
                                   END LOOP;
                                   IF info = '{}'::jsonb THEN
                                     info = null;
                                   END IF;
                                   END;
                                   $$ LANGUAGE plpgsql IMMUTABLE;
                                """)
                    LOG.info('Update word table with recomputed frequencies')
                    drop_tables(conn, 'tmp_word')
                    cur.execute("""CREATE TABLE tmp_word AS
                                    SELECT word_id, word_token, type, word,
                                           word_freq_update(word_id, info) as info
                                    FROM word
                                """)
                    drop_tables(conn, 'word_frequencies', 'addressword_frequencies')
                else:
                    LOG.info('Computing word frequencies')
                    drop_tables(conn, 'word_frequencies')
                    cur.execute("""
                      CREATE TEMP TABLE word_frequencies AS
                      WITH word_freq AS MATERIALIZED (
                               SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id),
                           addr_freq AS MATERIALIZED (
                               SELECT unnest(nameaddress_vector) as id, count(*)
                                     FROM search_name GROUP BY id)
                      SELECT coalesce(a.id, w.id) as id,
                             (CASE WHEN w.count is null THEN '{}'::JSONB
                                  ELSE jsonb_build_object('count', w.count) END
                              ||
                              CASE WHEN a.count is null THEN '{}'::JSONB
                                  ELSE jsonb_build_object('addr_count', a.count) END) as info
                      FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id;
                      """)
                    cur.execute('CREATE UNIQUE INDEX ON word_frequencies(id) INCLUDE(info)')
                    cur.execute('ANALYSE word_frequencies')
                    LOG.info('Update word table with recomputed frequencies')
                    drop_tables(conn, 'tmp_word')
                    cur.execute("""CREATE TABLE tmp_word AS
                                    SELECT word_id, word_token, type, word,
                                           (CASE WHEN wf.info is null THEN word.info
                                            ELSE coalesce(word.info, '{}'::jsonb) || wf.info
                                            END) as info
                                    FROM word LEFT JOIN word_frequencies wf
                                         ON word.word_id = wf.id
                                    ORDER BY word_id
                                """)
                    drop_tables(conn, 'word_frequencies')

            with conn.cursor() as cur:
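                # Switch parallel query back off for the remaining steps.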
                cur.execute('SET max_parallel_workers_per_gather TO 0')

            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_string(conn,
                            'GRANT SELECT ON tmp_word TO "{{config.DATABASE_WEBUSER}}"')
            conn.commit()
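        # Build all indexes on the new table first, then swap it in for the old
        # word table.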
        self._create_base_indices(config, 'tmp_word')
        self._create_lookup_indices(config, 'tmp_word')
        self._move_temporary_word_table('tmp_word')


    def _cleanup_housenumbers(self) -> None:
        """ Remove unused house numbers.
        """
        with connect(self.dsn) as conn:
            if not table_exists(conn, 'search_name'):
                return
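            # First collect all non-trivial housenumber tokens that are no longer
            # referenced from search_name, then remove from the candidate list any
            # number that still appears in placex.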
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
                               FROM word
                               WHERE type = 'H'
                                 AND NOT EXISTS(SELECT * FROM search_name
                                                WHERE ARRAY[word.word_id] && name_vector)
                                 AND (char_length(coalesce(word, word_token)) > 6
                                      OR coalesce(word, word_token) not similar to '\\d+')
                            """)
                candidates = {token: wid for wid, token in cur}
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT housenumber FROM placex
                               WHERE housenumber is not null
                                     AND (char_length(housenumber) > 6
                                          OR housenumber not similar to '\\d+')
                            """)
                for row in cur:
                    for hnr in row[0].split(';'):
                        candidates.pop(hnr, None)
            LOG.info("There are %s outdated housenumbers.", len(candidates))
            LOG.debug("Outdated housenumbers: %s", candidates.keys())
            if candidates:
                with conn.cursor() as cur:
                    cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
                                (list(candidates.values()), ))
                conn.commit()


    def update_word_tokens(self) -> None:
        """ Remove unused tokens.
        """
        LOG.warning("Cleaning up housenumber tokens.")
        self._cleanup_housenumbers()
        LOG.warning("Tokenizer house-keeping done.")


    def name_analyzer(self) -> 'ICUNameAnalyzer':
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

            ```
            with tokenizer.name_analyzer() as analyzer:
                analyzer.tokenize()
            ```

            When used outside the with construct, the caller must make sure
            to call the close() function before the analyzer is destroyed.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        assert self.loader is not None
        return ICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                               self.loader.make_token_analysis())


    def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
        """ Return a list of the `num` most frequent full words
            in the database.
        """
        with conn.cursor() as cur:
            cur.execute("""SELECT word, sum((info->>'count')::int) as count
                             FROM word WHERE type = 'W'
                             GROUP BY word
                             ORDER BY count DESC LIMIT %s""", (num,))
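            # The word column may carry an '@<analyzer>' suffix for names processed
            # by a named analyzer; only the part before it is returned.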
            return list(s[0].split('@')[0] for s in cur)


    def _save_config(self) -> None:
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        assert self.loader is not None
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)


    def _setup_db_tables(self, config: Configuration) -> None:
        """ Create an empty word table and the seq_word sequence used for
            new word ids.
        """
        with connect(self.dsn) as conn:
            drop_tables(conn, 'word')
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_string(conn, """
                CREATE TABLE word (
                      word_id INTEGER,
                      word_token text NOT NULL,
                      type text NOT NULL,
                      word text,
                      info jsonb
                    ) {{db.tablespace.search_data}};
                GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";

                DROP SEQUENCE IF EXISTS seq_word;
                CREATE SEQUENCE seq_word start 1;
                GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
            """)
            conn.commit()


    def _create_base_indices(self, config: Configuration, table_name: str) -> None:
        """ Create the basic B-tree indexes for the given word table: one
            on the word token and one per word type.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_string(conn,
                            """CREATE INDEX idx_{{table_name}}_word_token ON {{table_name}}
                               USING BTREE (word_token) {{db.tablespace.search_index}}""",
                            table_name=table_name)
            for name, ctype in WORD_TYPES:
                sqlp.run_string(conn,
                                """CREATE INDEX idx_{{table_name}}_{{idx_name}} ON {{table_name}}
                                   USING BTREE (word) {{db.tablespace.address_index}}
                                   WHERE type = '{{column_type}}'
                                """,
                                table_name=table_name, idx_name=name,
                                column_type=ctype)
            conn.commit()


    def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
        """ Create additional indexes used when running the API.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            # Index required for details lookup.
            sqlp.run_string(conn, """
                CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
                  ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
            """,
            table_name=table_name)
            conn.commit()


    def _move_temporary_word_table(self, old: str) -> None:
        """ Rename all tables and indexes used by the tokenizer.
        """
        with connect(self.dsn) as conn:
            drop_tables(conn, 'word')
            with conn.cursor() as cur:
                cur.execute(f"ALTER TABLE {old} RENAME TO word")
                for idx in ('word_token', 'word_id'):
                    cur.execute(f"""ALTER INDEX idx_{old}_{idx}
                                      RENAME TO idx_word_{idx}""")
                for name, _ in WORD_TYPES:
                    cur.execute(f"""ALTER INDEX idx_{old}_{name}
                                    RENAME TO idx_word_{name}""")
            conn.commit()


class ICUNameAnalyzer(AbstractAnalyzer):
    """ The ICU analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn: str, sanitizer: PlaceSanitizer,
                 token_analysis: ICUTokenAnalysis) -> None:
        self.conn: Optional[Connection] = connect(dsn)
        self.conn.autocommit = True
        self.sanitizer = sanitizer
        self.token_analysis = token_analysis

        self._cache = _TokenCache()


    def close(self) -> None:
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    def _search_normalized(self, name: str) -> str:
        """ Return the search token transliteration of the given name.
        """
        return cast(str, self.token_analysis.search.transliterate(name)).strip()


    def _normalized(self, name: str) -> str:
        """ Return the normalized version of the given name with all
            non-relevant information removed.
        """
        return cast(str, self.token_analysis.normalizer.transliterate(name)).strip()


    def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not necessarily efficient.
        """
        assert self.conn is not None
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self._search_normalized(word[1:])
            else:
                partial_tokens[word] = self._search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]


    def normalize_postcode(self, postcode: str) -> str:
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()


    def update_postcodes_from_db(self) -> None:
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@postcode')

        with self.conn.cursor() as cur:
            # First get all postcode names currently in the word table.
            cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
            word_entries = set((entry[0] for entry in cur))

            # Then compute the required postcode names from the postcode table.
            needed_entries = set()
            cur.execute("SELECT country_code, postcode FROM location_postcode")
            for cc, postcode in cur:
                info = PlaceInfo({'country_code': cc,
                                  'class': 'place', 'type': 'postcode',
                                  'address': {'postcode': postcode}})
                address = self.sanitizer.process_names(info)[1]
                for place in address:
                    if place.kind == 'postcode':
                        if analyzer is None:
                            postcode_name = place.name.strip().upper()
                            variant_base = None
                        else:
                            postcode_name = analyzer.get_canonical_id(place)
                            variant_base = place.get_attr("variant")

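                        # Postcodes with a variant are encoded as '<name>@<variant>'
                        # so the set can be compared directly with the word column.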
                        if variant_base:
                            needed_entries.add(f'{postcode_name}@{variant_base}')
                        else:
                            needed_entries.add(postcode_name)
                        break

        # Now update the word table.
        self._delete_unused_postcode_words(word_entries - needed_entries)
        self._add_missing_postcode_words(needed_entries - word_entries)

    def _delete_unused_postcode_words(self, tokens: Iterable[str]) -> None:
        assert self.conn is not None
        if tokens:
            with self.conn.cursor() as cur:
                cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
                            (list(tokens), ))

    def _add_missing_postcode_words(self, tokens: Iterable[str]) -> None:
        assert self.conn is not None
        if not tokens:
            return

        analyzer = self.token_analysis.analysis.get('@postcode')
        terms = []

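        # Tokens come in the '<name>@<variant>' encoding used above; expand each
        # one into its transliterated search variants before insertion.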
        for postcode_name in tokens:
            if '@' in postcode_name:
                term, variant = postcode_name.split('@', 1)
                term = self._search_normalized(term)
                if analyzer is None:
                    variants = [term]
                else:
                    variants = analyzer.compute_variants(variant)
                    if term not in variants:
                        variants.append(term)
            else:
                variants = [self._search_normalized(postcode_name)]
            terms.append((postcode_name, variants))

        if terms:
            with self.conn.cursor() as cur:
                cur.executemany("""SELECT create_postcode_word(%s, %s)""", terms)


    def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
                               should_replace: bool) -> None:
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases will be
            completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        assert self.conn is not None
        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)


    def _add_special_phrases(self, cursor: Cursor,
                             new_phrases: Set[Tuple[str, str, str, str]],
                             existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with cursor.copy('COPY word(word_token, type, word, info) FROM STDIN') as copy:
            for word, cls, typ, oper in to_add:
                term = self._search_normalized(word)
                if term:
                    copy.write_row((term, 'S', word,
                                    Jsonb({'class': cls, 'type': typ,
                                           'op': oper if oper in ('in', 'near') else None})))
                    added += 1

        return added


    def _remove_special_phrases(self, cursor: Cursor,
                                new_phrases: Set[Tuple[str, str, str, str]],
                                existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.executemany(
                """ DELETE FROM word
                      WHERE type = 'S' and word = %s
                            and info->>'class' = %s and info->>'type' = %s
                            and %s = coalesce(info->>'op', '-')
                """, to_delete)

        return len(to_delete)


    def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
        """ Add default names for the given country to the search index.
        """
        # Make sure any name preprocessing for country names applies.
        info = PlaceInfo({'name': names, 'country_code': country_code,
                          'rank_address': 4, 'class': 'boundary',
                          'type': 'administrative'})
        self._add_country_full_names(country_code,
                                     self.sanitizer.process_names(info)[0],
                                     internal=True)


    def _add_country_full_names(self, country_code: str, names: Sequence[PlaceName],
                                internal: bool = False) -> None:
        """ Add names for the given country from an already sanitized
            name list.
        """
        assert self.conn is not None
        word_tokens = set()
        for name in names:
            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            # Get existing names
            cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
                             FROM word
                             WHERE type = 'C' and word = %s""",
                        (country_code, ))
            # internal/external names
            existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()}
            for word in cur:
                existing_tokens[word[1]].add(word[0])

            # Delete names that no longer exist.
            gone_tokens = existing_tokens[internal] - word_tokens
            if internal:
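                # A name previously added as an external name that is now provided
                # internally is deleted here and re-inserted below with the
                # internal flag set.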
                gone_tokens.update(existing_tokens[False] & word_tokens)
            if gone_tokens:
                cur.execute("""DELETE FROM word
                               USING unnest(%s::text[]) as token
                               WHERE type = 'C' and word = %s
                                     and word_token = token""",
                            (list(gone_tokens), country_code))

            # Only add those names that are not yet in the list.
            new_tokens = word_tokens - existing_tokens[True]
            if not internal:
                new_tokens -= existing_tokens[False]
            if new_tokens:
                if internal:
                    sql = """INSERT INTO word (word_token, type, word, info)
                               (SELECT token, 'C', %s, '{"internal": "yes"}'
                                  FROM unnest(%s::text[]) as token)
                           """
                else:
                    sql = """INSERT INTO word (word_token, type, word)
                                   (SELECT token, 'C', %s
                                    FROM unnest(%s::text[]) as token)
                          """
                cur.execute(sql, (country_code, list(new_tokens)))


    def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
        """ Determine tokenizer information about the given place.

            Returns a JSON-serializable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo()

        names, address = self.sanitizer.process_names(place)

        if names:
            token_info.set_names(*self._compute_name_tokens(names))

            if place.is_country():
                assert place.country_code is not None
                self._add_country_full_names(place.country_code, names)

        if address:
            self._process_place_address(token_info, address)

        return token_info.to_dict()


    def _process_place_address(self, token_info: '_TokenInfo',
                               address: Sequence[PlaceName]) -> None:
        for item in address:
            if item.kind == 'postcode':
                token_info.set_postcode(self._add_postcode(item))
            elif item.kind == 'housenumber':
                token_info.add_housenumber(*self._compute_housenumber_token(item))
            elif item.kind == 'street':
                token_info.add_street(self._retrieve_full_tokens(item.name))
            elif item.kind == 'place':
                if not item.suffix:
                    token_info.add_place(itertools.chain(*self._compute_name_tokens([item])))
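            # Everything else becomes a generic address term, except internal
            # items (kind starting with '_'), suffixed items and the kinds
            # 'country', 'full' and 'inclusion'.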
            elif not item.kind.startswith('_') and not item.suffix and \
                 item.kind not in ('country', 'full', 'inclusion'):
                token_info.add_address_term(item.kind,
                                            itertools.chain(*self._compute_name_tokens([item])))


    def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
        """ Normalize the housenumber and return the word token and the
            canonical form.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@housenumber')
        result: Tuple[Optional[int], Optional[str]] = (None, None)

        if analyzer is None:
            # When no custom analyzer is set, simply normalize and transliterate.
            norm_name = self._search_normalized(hnr.name)
            if norm_name:
                result = self._cache.housenumbers.get(norm_name, result)
                if result[0] is None:
                    hid = execute_scalar(self.conn, "SELECT getorcreate_hnr_id(%s)", (norm_name, ))

                    result = hid, norm_name
                    self._cache.housenumbers[norm_name] = result
        else:
            # Otherwise use the analyzer to determine the canonical name.
            # Per convention we use the first variant as the 'lookup name', the
            # name that gets saved in the housenumber field of the place.
            word_id = analyzer.get_canonical_id(hnr)
            if word_id:
                result = self._cache.housenumbers.get(word_id, result)
                if result[0] is None:
                    variants = analyzer.compute_variants(word_id)
                    if variants:
                        hid = execute_scalar(self.conn, "SELECT create_analyzed_hnr_id(%s, %s)",
                                             (word_id, list(variants)))
                        result = hid, variants[0]
                        self._cache.housenumbers[word_id] = result

        return result


    def _retrieve_full_tokens(self, name: str) -> List[int]:
        """ Get the full name token for the given name, if it exists.
            The name is only retrieved for the standard analyser.
        """
        assert self.conn is not None
        norm_name = self._search_normalized(name)

        # return cached if possible
        if norm_name in self._cache.fulls:
            return self._cache.fulls[norm_name]

        with self.conn.cursor() as cur:
            cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
                        (norm_name, ))
            full = [row[0] for row in cur]

        self._cache.fulls[norm_name] = full

        return full


    def _compute_name_tokens(self, names: Sequence[PlaceName]) -> Tuple[Set[int], Set[int]]:
        """ Compute the full name and partial name tokens for the given
            list of names.
        """
        assert self.conn is not None
        full_tokens: Set[int] = set()
        partial_tokens: Set[int] = set()

        for name in names:
            analyzer_id = name.get_attr('analyzer')
            analyzer = self.token_analysis.get_analyzer(analyzer_id)
            word_id = analyzer.get_canonical_id(name)
            if analyzer_id is None:
                token_id = word_id
            else:
                token_id = f'{word_id}@{analyzer_id}'

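            # Cache lookups are keyed by the canonical id, suffixed with the
            # analyzer name so that identical spellings from different analyzers
            # do not clash.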
            full, part = self._cache.names.get(token_id, (None, None))
            if full is None:
                variants = analyzer.compute_variants(word_id)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
                                (token_id, variants))
                    full, part = cast(Tuple[int, List[int]], cur.fetchone())

                self._cache.names[token_id] = (full, part)

            assert part is not None

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens


    def _add_postcode(self, item: PlaceName) -> Optional[str]:
        """ Make sure the normalized postcode is present in the word table.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@postcode')

        if analyzer is None:
            postcode_name = item.name.strip().upper()
            variant_base = None
        else:
            postcode_name = analyzer.get_canonical_id(item)
            variant_base = item.get_attr("variant")

        if variant_base:
            postcode = f'{postcode_name}@{variant_base}'
        else:
            postcode = postcode_name

        if postcode not in self._cache.postcodes:
            term = self._search_normalized(postcode_name)
            if not term:
                return None

            variants = {term}
            if analyzer is not None and variant_base:
                variants.update(analyzer.compute_variants(variant_base))

            with self.conn.cursor() as cur:
                cur.execute("SELECT create_postcode_word(%s, %s)",
                            (postcode, list(variants)))
            self._cache.postcodes.add(postcode)

        return postcode_name


class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self) -> None:
        self.names: Optional[str] = None
        self.housenumbers: Set[str] = set()
        self.housenumber_tokens: Set[int] = set()
        self.street_tokens: Optional[Set[int]] = None
        self.place_tokens: Set[int] = set()
        self.address_tokens: Dict[str, str] = {}
        self.postcode: Optional[str] = None


    def _mk_array(self, tokens: Iterable[Any]) -> str:
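        # Format the tokens as a PostgreSQL array literal, e.g. '{1,2,3}'.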
        return f"{{{','.join((str(s) for s in tokens))}}}"


    def to_dict(self) -> Dict[str, Any]:
        """ Return the token information in database importable format.
        """
        out: Dict[str, Any] = {}

        if self.names:
            out['names'] = self.names

        if self.housenumbers:
            out['hnr'] = ';'.join(self.housenumbers)
            out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)

        if self.street_tokens is not None:
            out['street'] = self._mk_array(self.street_tokens)

        if self.place_tokens:
            out['place'] = self._mk_array(self.place_tokens)

        if self.address_tokens:
            out['addr'] = self.address_tokens

        if self.postcode:
            out['postcode'] = self.postcode

        return out


    def set_names(self, fulls: Iterable[int], partials: Iterable[int]) -> None:
        """ Adds token information for the normalised names.
        """
        self.names = self._mk_array(itertools.chain(fulls, partials))


    def add_housenumber(self, token: Optional[int], hnr: Optional[str]) -> None:
        """ Add the token and canonical form of a normalised housenumber.
        """
        if token:
            assert hnr is not None
            self.housenumbers.add(hnr)
            self.housenumber_tokens.add(token)


    def add_street(self, tokens: Iterable[int]) -> None:
        """ Add addr:street match terms.
        """
        if self.street_tokens is None:
            self.street_tokens = set()
        self.street_tokens.update(tokens)


    def add_place(self, tokens: Iterable[int]) -> None:
        """ Add addr:place search and match terms.
        """
        self.place_tokens.update(tokens)


    def add_address_term(self, key: str, partials: Iterable[int]) -> None:
        """ Add additional address terms.
        """
        array = self._mk_array(partials)
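        # Only store non-empty arrays; an empty array renders as '{}' (length 2).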
        if len(array) > 2:
            self.address_tokens[key] = array

    def set_postcode(self, postcode: Optional[str]) -> None:
        """ Set the postcode to the given one.
        """
        self.postcode = postcode


class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self) -> None:
        self.names: Dict[str, Tuple[int, List[int]]] = {}
        self.partials: Dict[str, int] = {}
        self.fulls: Dict[str, List[int]] = {}
        self.postcodes: Set[str] = set()
        self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {}