1 # SPDX-License-Identifier: GPL-3.0-or-later
2 #
3 # This file is part of Nominatim. (https://nominatim.org)
4 #
5 # Copyright (C) 2024 by the Nominatim developer community.
6 # For a full list of authors see the git log.
7 """
8 Tokenizer implementing normalisation as used before Nominatim 4 but using
9 libICU instead of the PostgreSQL module.
10 """
11 from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \
12                    Dict, Set, Iterable
13 import itertools
14 import logging
15 from pathlib import Path
16
17 from psycopg.types.json import Jsonb
18 from psycopg import sql as pysql
19
20 from ..db.connection import connect, Connection, Cursor, server_version_tuple, \
21                             drop_tables, table_exists, execute_scalar
22 from ..config import Configuration
23 from ..db.sql_preprocessor import SQLPreprocessor
24 from ..data.place_info import PlaceInfo
25 from ..data.place_name import PlaceName
26 from .icu_rule_loader import ICURuleLoader
27 from .place_sanitizer import PlaceSanitizer
28 from .icu_token_analysis import ICUTokenAnalysis
29 from .base import AbstractAnalyzer, AbstractTokenizer
30
31 DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
32
33 LOG = logging.getLogger()
34
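# Word-table type codes that get their own partial index, mapped from the
# index name to the single-letter value of the 'type' column.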
35 WORD_TYPES = (('country_names', 'C'),
36               ('postcodes', 'P'),
37               ('full_word', 'W'),
38               ('housenumbers', 'H'))
39
40
41 def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
42     """ Create a new instance of the tokenizer provided by this module.
43     """
44     return ICUTokenizer(dsn, data_dir)
45
46
47 class ICUTokenizer(AbstractTokenizer):
48     """ This tokenizer uses libICU to convert names and queries to ASCII.
49         Otherwise it uses the same algorithms and data structures as the
50         normalization routines in Nominatim 3.
51     """
52
53     def __init__(self, dsn: str, data_dir: Path) -> None:
54         self.dsn = dsn
55         self.data_dir = data_dir
56         self.loader: Optional[ICURuleLoader] = None
57
58     def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
59         """ Set up a new tokenizer for the database.
60
61             This copies all necessary data into the project directory to make
62             sure the tokenizer remains stable even over updates.
63         """
64         self.loader = ICURuleLoader(config)
65
66         self._save_config()
67
68         if init_db:
69             self.update_sql_functions(config)
70             self._setup_db_tables(config)
71             self._create_base_indices(config, 'word')
72
73     def init_from_project(self, config: Configuration) -> None:
74         """ Initialise the tokenizer from the project directory.
75         """
76         self.loader = ICURuleLoader(config)
77
78         with connect(self.dsn) as conn:
79             self.loader.load_config_from_db(conn)
80
81     def finalize_import(self, config: Configuration) -> None:
82         """ Do any required postprocessing to make the tokenizer data ready
83             for use.
84         """
85         self._create_lookup_indices(config, 'word')
86
87     def update_sql_functions(self, config: Configuration) -> None:
88         """ Reimport the SQL functions for this tokenizer.
89         """
90         with connect(self.dsn) as conn:
91             sqlp = SQLPreprocessor(conn, config)
92             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
93
94     def check_database(self, config: Configuration) -> None:
95         """ Check that the tokenizer is set up correctly.
96         """
97         # Will throw an error if there is an issue.
98         self.init_from_project(config)
99
100     def update_statistics(self, config: Configuration, threads: int = 2) -> None:
101         """ Recompute frequencies for all name words.
102         """
103         with connect(self.dsn) as conn:
104             if not table_exists(conn, 'search_name'):
105                 return
106
107             with conn.cursor() as cur:
108                 cur.execute('ANALYSE search_name')
109                 if threads > 1:
110                     cur.execute(pysql.SQL('SET max_parallel_workers_per_gather TO {}')
111                                      .format(pysql.Literal(min(threads, 6),)))
112
113                 if server_version_tuple(conn) < (12, 0):
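                    # PostgreSQL before version 12 has no MATERIALIZED CTEs, so
                    # fall back to temporary frequency tables and a plpgsql helper
                    # that merges the counts into the word info.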
114                     LOG.info('Computing word frequencies')
115                     drop_tables(conn, 'word_frequencies', 'addressword_frequencies')
116                     cur.execute("""CREATE TEMP TABLE word_frequencies AS
117                                      SELECT unnest(name_vector) as id, count(*)
118                                      FROM search_name GROUP BY id""")
119                     cur.execute('CREATE INDEX ON word_frequencies(id)')
120                     cur.execute("""CREATE TEMP TABLE addressword_frequencies AS
121                                      SELECT unnest(nameaddress_vector) as id, count(*)
122                                      FROM search_name GROUP BY id""")
123                     cur.execute('CREATE INDEX ON addressword_frequencies(id)')
124                     cur.execute("""
125                         CREATE OR REPLACE FUNCTION word_freq_update(wid INTEGER,
126                                                                     INOUT info JSONB)
127                         AS $$
128                         DECLARE rec RECORD;
129                         BEGIN
130                         IF info is null THEN
131                           info = '{}'::jsonb;
132                         END IF;
133                         FOR rec IN SELECT count FROM word_frequencies WHERE id = wid
134                         LOOP
135                           info = info || jsonb_build_object('count', rec.count);
136                         END LOOP;
137                         FOR rec IN SELECT count FROM addressword_frequencies WHERE id = wid
138                         LOOP
139                           info = info || jsonb_build_object('addr_count', rec.count);
140                         END LOOP;
141                         IF info = '{}'::jsonb THEN
142                           info = null;
143                         END IF;
144                         END;
145                         $$ LANGUAGE plpgsql IMMUTABLE;
146                         """)
147                     LOG.info('Update word table with recomputed frequencies')
148                     drop_tables(conn, 'tmp_word')
149                     cur.execute("""CREATE TABLE tmp_word AS
150                                     SELECT word_id, word_token, type, word,
151                                            word_freq_update(word_id, info) as info
152                                     FROM word
153                                 """)
154                     drop_tables(conn, 'word_frequencies', 'addressword_frequencies')
155                 else:
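                    # PostgreSQL 12 and later: compute name and address frequencies
                    # in a single pass using materialized CTEs.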
156                     LOG.info('Computing word frequencies')
157                     drop_tables(conn, 'word_frequencies')
158                     cur.execute("""
159                       CREATE TEMP TABLE word_frequencies AS
160                       WITH word_freq AS MATERIALIZED (
161                                SELECT unnest(name_vector) as id, count(*)
162                                      FROM search_name GROUP BY id),
163                            addr_freq AS MATERIALIZED (
164                                SELECT unnest(nameaddress_vector) as id, count(*)
165                                      FROM search_name GROUP BY id)
166                       SELECT coalesce(a.id, w.id) as id,
167                              (CASE WHEN w.count is null THEN '{}'::JSONB
168                                   ELSE jsonb_build_object('count', w.count) END
169                               ||
170                               CASE WHEN a.count is null THEN '{}'::JSONB
171                                   ELSE jsonb_build_object('addr_count', a.count) END) as info
172                       FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id;
173                       """)
174                     cur.execute('CREATE UNIQUE INDEX ON word_frequencies(id) INCLUDE(info)')
175                     cur.execute('ANALYSE word_frequencies')
176                     LOG.info('Update word table with recomputed frequencies')
177                     drop_tables(conn, 'tmp_word')
178                     cur.execute("""CREATE TABLE tmp_word AS
179                                     SELECT word_id, word_token, type, word,
180                                            (CASE WHEN wf.info is null THEN word.info
181                                             ELSE coalesce(word.info, '{}'::jsonb) || wf.info
182                                             END) as info
183                                     FROM word LEFT JOIN word_frequencies wf
184                                          ON word.word_id = wf.id
185                                     ORDER BY word_id
186                                 """)
187                     drop_tables(conn, 'word_frequencies')
188
189             with conn.cursor() as cur:
190                 cur.execute('SET max_parallel_workers_per_gather TO 0')
191
192             sqlp = SQLPreprocessor(conn, config)
193             sqlp.run_string(conn,
194                             'GRANT SELECT ON tmp_word TO "{{config.DATABASE_WEBUSER}}"')
195             conn.commit()
196         self._create_base_indices(config, 'tmp_word')
197         self._create_lookup_indices(config, 'tmp_word')
198         self._move_temporary_word_table('tmp_word')
199
200     def _cleanup_housenumbers(self) -> None:
201         """ Remove unused house numbers.
202         """
203         with connect(self.dsn) as conn:
204             if not table_exists(conn, 'search_name'):
205                 return
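            # Collect housenumber tokens that are no longer referenced from
            # search_name. Short, purely numeric housenumbers are never deleted.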
206             with conn.cursor(name="hnr_counter") as cur:
207                 cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
208                                FROM word
209                                WHERE type = 'H'
210                                  AND NOT EXISTS(SELECT * FROM search_name
211                                                 WHERE ARRAY[word.word_id] && name_vector)
212                                  AND (char_length(coalesce(word, word_token)) > 6
213                                       OR coalesce(word, word_token) not similar to '\\d+')
214                             """)
215                 candidates = {token: wid for wid, token in cur}
216             with conn.cursor(name="hnr_counter") as cur:
217                 cur.execute("""SELECT housenumber FROM placex
218                                WHERE housenumber is not null
219                                      AND (char_length(housenumber) > 6
220                                           OR housenumber not similar to '\\d+')
221                             """)
222                 for row in cur:
223                     for hnr in row[0].split(';'):
224                         candidates.pop(hnr, None)
225             LOG.info("There are %s outdated housenumbers.", len(candidates))
226             LOG.debug("Outdated housenumbers: %s", candidates.keys())
227             if candidates:
228                 with conn.cursor() as cur:
229                     cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
230                                 (list(candidates.values()), ))
231                 conn.commit()
232
233     def update_word_tokens(self) -> None:
234         """ Remove unused tokens.
235         """
236         LOG.warning("Cleaning up housenumber tokens.")
237         self._cleanup_housenumbers()
238         LOG.warning("Tokenizer house-keeping done.")
239
240     def name_analyzer(self) -> 'ICUNameAnalyzer':
241         """ Create a new analyzer for tokenizing names and queries
242             using this tokenizer. Analyzers are context managers and should
243             be used accordingly:
244
245             ```
246             with tokenizer.name_analyzer() as analyzer:
247                 analyzer.tokenize()
248             ```
249
250             When used outside the with construct, the caller must make sure
251             to call the close() function before the analyzer is destroyed.
252
253             Analyzers are not thread-safe. You need to instantiate one per thread.
254         """
255         assert self.loader is not None
256         return ICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
257                                self.loader.make_token_analysis())
258
259     def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
260         """ Return a list of the `num` most frequent full words
261             in the database.
262         """
263         with conn.cursor() as cur:
264             cur.execute("""SELECT word, sum((info->>'count')::int) as count
265                              FROM word WHERE type = 'W'
266                              GROUP BY word
267                              ORDER BY count DESC LIMIT %s""", (num,))
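            # The stored word may carry an '@<suffix>'; return only the part before it.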
268             return list(s[0].split('@')[0] for s in cur)
269
270     def _save_config(self) -> None:
271         """ Save the configuration that needs to remain stable for the given
272             database as database properties.
273         """
274         assert self.loader is not None
275         with connect(self.dsn) as conn:
276             self.loader.save_config_to_db(conn)
277
278     def _setup_db_tables(self, config: Configuration) -> None:
279         """ Set up the word table used by the tokenizer together with
280             the word id sequence.
281         """
282         with connect(self.dsn) as conn:
283             drop_tables(conn, 'word')
284             sqlp = SQLPreprocessor(conn, config)
285             sqlp.run_string(conn, """
286                 CREATE TABLE word (
287                       word_id INTEGER,
288                       word_token text NOT NULL,
289                       type text NOT NULL,
290                       word text,
291                       info jsonb
292                     ) {{db.tablespace.search_data}};
293                 GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
294
295                 DROP SEQUENCE IF EXISTS seq_word;
296                 CREATE SEQUENCE seq_word start 1;
297                 GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
298             """)
299             conn.commit()
300
301     def _create_base_indices(self, config: Configuration, table_name: str) -> None:
302         """ Create the word_token index and the per-type word indexes
303             on the given word table.
304         """
305         with connect(self.dsn) as conn:
306             sqlp = SQLPreprocessor(conn, config)
307             sqlp.run_string(conn,
308                             """CREATE INDEX idx_{{table_name}}_word_token ON {{table_name}}
309                                USING BTREE (word_token) {{db.tablespace.search_index}}""",
310                             table_name=table_name)
311             for name, ctype in WORD_TYPES:
312                 sqlp.run_string(conn,
313                                 """CREATE INDEX idx_{{table_name}}_{{idx_name}} ON {{table_name}}
314                                    USING BTREE (word) {{db.tablespace.address_index}}
315                                    WHERE type = '{{column_type}}'
316                                 """,
317                                 table_name=table_name, idx_name=name,
318                                 column_type=ctype)
319             conn.commit()
320
321     def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
322         """ Create additional indexes used when running the API.
323         """
324         with connect(self.dsn) as conn:
325             sqlp = SQLPreprocessor(conn, config)
326             # Index required for details lookup.
327             sqlp.run_string(
328                 conn,
329                 """
330                 CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
331                   ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
332                 """,
333                 table_name=table_name)
334             conn.commit()
335
336     def _move_temporary_word_table(self, old: str) -> None:
337         """ Rename all tables and indexes used by the tokenizer.
338         """
339         with connect(self.dsn) as conn:
340             drop_tables(conn, 'word')
341             with conn.cursor() as cur:
342                 cur.execute(f"ALTER TABLE {old} RENAME TO word")
343                 for idx in ('word_token', 'word_id'):
344                     cur.execute(f"""ALTER INDEX idx_{old}_{idx}
345                                       RENAME TO idx_word_{idx}""")
346                 for name, _ in WORD_TYPES:
347                     cur.execute(f"""ALTER INDEX idx_{old}_{name}
348                                     RENAME TO idx_word_{name}""")
349             conn.commit()
350
351
352 class ICUNameAnalyzer(AbstractAnalyzer):
353     """ The ICU analyzer uses the ICU library for splitting names.
354
355         Each instance opens a connection to the database to request the
356         normalization.
357     """
358
359     def __init__(self, dsn: str, sanitizer: PlaceSanitizer,
360                  token_analysis: ICUTokenAnalysis) -> None:
361         self.conn: Optional[Connection] = connect(dsn)
362         self.conn.autocommit = True
363         self.sanitizer = sanitizer
364         self.token_analysis = token_analysis
365
366         self._cache = _TokenCache()
367
368     def close(self) -> None:
369         """ Free all resources used by the analyzer.
370         """
371         if self.conn:
372             self.conn.close()
373             self.conn = None
374
375     def _search_normalized(self, name: str) -> str:
376         """ Return the search token transliteration of the given name.
377         """
378         return cast(str, self.token_analysis.search.transliterate(name)).strip()
379
380     def _normalized(self, name: str) -> str:
381         """ Return the normalized version of the given name with all
382             non-relevant information removed.
383         """
384         return cast(str, self.token_analysis.normalizer.transliterate(name)).strip()
385
386     def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
387         """ Return token information for the given list of words.
388             If a word starts with '#', it is assumed to be a full name,
389             otherwise it is treated as a partial name.
390
391             The function returns a list of tuples with
392             (original word, word token, word id).
393
394             The function is intended for testing and debugging only
395             and is not necessarily efficient.
396         """
397         assert self.conn is not None
398         full_tokens = {}
399         partial_tokens = {}
400         for word in words:
401             if word.startswith('#'):
402                 full_tokens[word] = self._search_normalized(word[1:])
403             else:
404                 partial_tokens[word] = self._search_normalized(word)
405
406         with self.conn.cursor() as cur:
407             cur.execute("""SELECT word_token, word_id
408                             FROM word WHERE word_token = ANY(%s) and type = 'W'
409                         """, (list(full_tokens.values()),))
410             full_ids = {r[0]: r[1] for r in cur}
411             cur.execute("""SELECT word_token, word_id
412                             FROM word WHERE word_token = ANY(%s) and type = 'w'""",
413                         (list(partial_tokens.values()),))
414             part_ids = {r[0]: r[1] for r in cur}
415
416         return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
417             + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
418
419     def normalize_postcode(self, postcode: str) -> str:
420         """ Convert the postcode to a standardized form.
421
422             This function must yield exactly the same result as the SQL function
423             'token_normalized_postcode()'.
424         """
425         return postcode.strip().upper()
426
427     def update_postcodes_from_db(self) -> None:
428         """ Update postcode tokens in the word table from the location_postcode
429             table.
430         """
431         assert self.conn is not None
432         analyzer = self.token_analysis.analysis.get('@postcode')
433
434         with self.conn.cursor() as cur:
435             # First get all postcode names currently in the word table.
436             cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
437             word_entries = set((entry[0] for entry in cur))
438
439             # Then compute the required postcode names from the postcode table.
440             needed_entries = set()
441             cur.execute("SELECT country_code, postcode FROM location_postcode")
442             for cc, postcode in cur:
443                 info = PlaceInfo({'country_code': cc,
444                                   'class': 'place', 'type': 'postcode',
445                                   'address': {'postcode': postcode}})
446                 address = self.sanitizer.process_names(info)[1]
447                 for place in address:
448                     if place.kind == 'postcode':
449                         if analyzer is None:
450                             postcode_name = place.name.strip().upper()
451                             variant_base = None
452                         else:
453                             postcode_name = analyzer.get_canonical_id(place)
454                             variant_base = place.get_attr("variant")
455
456                         if variant_base:
457                             needed_entries.add(f'{postcode_name}@{variant_base}')
458                         else:
459                             needed_entries.add(postcode_name)
460                         break
461
462         # Now update the word table.
463         self._delete_unused_postcode_words(word_entries - needed_entries)
464         self._add_missing_postcode_words(needed_entries - word_entries)
465
466     def _delete_unused_postcode_words(self, tokens: Iterable[str]) -> None:
467         assert self.conn is not None
468         if tokens:
469             with self.conn.cursor() as cur:
470                 cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
471                             (list(tokens), ))
472
473     def _add_missing_postcode_words(self, tokens: Iterable[str]) -> None:
474         assert self.conn is not None
475         if not tokens:
476             return
477
478         analyzer = self.token_analysis.analysis.get('@postcode')
479         terms = []
480
481         for postcode_name in tokens:
482             if '@' in postcode_name:
483                 term, variant = postcode_name.split('@', 2)
484                 term = self._search_normalized(term)
485                 if analyzer is None:
486                     variants = [term]
487                 else:
488                     variants = analyzer.compute_variants(variant)
489                     if term not in variants:
490                         variants.append(term)
491             else:
492                 variants = [self._search_normalized(postcode_name)]
493             terms.append((postcode_name, variants))
494
495         if terms:
496             with self.conn.cursor() as cur:
497                 cur.executemany("""SELECT create_postcode_word(%s, %s)""", terms)
498
499     def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
500                                should_replace: bool) -> None:
501         """ Replace the search index for special phrases with the new phrases.
502             If `should_replace` is True, then the previous set of phrases is
503             completely replaced. Otherwise the phrases are added to the
504             already existing ones.
505         """
506         assert self.conn is not None
507         norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
508                             for p in phrases))
509
510         with self.conn.cursor() as cur:
511             # Get the old phrases.
512             existing_phrases = set()
513             cur.execute("SELECT word, info FROM word WHERE type = 'S'")
514             for word, info in cur:
515                 existing_phrases.add((word, info['class'], info['type'],
516                                       info.get('op') or '-'))
517
518             added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
519             if should_replace:
520                 deleted = self._remove_special_phrases(cur, norm_phrases,
521                                                        existing_phrases)
522             else:
523                 deleted = 0
524
525         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
526                  len(norm_phrases), added, deleted)
527
528     def _add_special_phrases(self, cursor: Cursor,
529                              new_phrases: Set[Tuple[str, str, str, str]],
530                              existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
531         """ Add all phrases to the database that are not yet there.
532         """
533         to_add = new_phrases - existing_phrases
534
535         added = 0
536         with cursor.copy('COPY word(word_token, type, word, info) FROM STDIN') as copy:
537             for word, cls, typ, oper in to_add:
538                 term = self._search_normalized(word)
539                 if term:
540                     copy.write_row((term, 'S', word,
541                                     Jsonb({'class': cls, 'type': typ,
542                                            'op': oper if oper in ('in', 'near') else None})))
543                     added += 1
544
545         return added
546
547     def _remove_special_phrases(self, cursor: Cursor,
548                                 new_phrases: Set[Tuple[str, str, str, str]],
549                                 existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
550         """ Remove all phrases from the database that are no longer in the
551             new phrase list.
552         """
553         to_delete = existing_phrases - new_phrases
554
555         if to_delete:
556             cursor.executemany(
557                 """ DELETE FROM word
558                       WHERE type = 'S' and word = %s
559                             and info->>'class' = %s and info->>'type' = %s
560                             and %s = coalesce(info->>'op', '-')
561                 """, to_delete)
562
563         return len(to_delete)
564
565     def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
566         """ Add default names for the given country to the search index.
567         """
568         # Make sure any name preprocessing for country names applies.
569         info = PlaceInfo({'name': names, 'country_code': country_code,
570                           'rank_address': 4, 'class': 'boundary',
571                           'type': 'administrative'})
572         self._add_country_full_names(country_code,
573                                      self.sanitizer.process_names(info)[0],
574                                      internal=True)
575
576     def _add_country_full_names(self, country_code: str, names: Sequence[PlaceName],
577                                 internal: bool = False) -> None:
578         """ Add names for the given country from an already sanitized
579             name list.
580         """
581         assert self.conn is not None
582         word_tokens = set()
583         for name in names:
584             norm_name = self._search_normalized(name.name)
585             if norm_name:
586                 word_tokens.add(norm_name)
587
588         with self.conn.cursor() as cur:
589             # Get existing names
590             cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
591                              FROM word
592                              WHERE type = 'C' and word = %s""",
593                         (country_code, ))
594             # internal/external names
595             existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()}
596             for word in cur:
597                 existing_tokens[word[1]].add(word[0])
598
599             # Delete names that no longer exist.
600             gone_tokens = existing_tokens[internal] - word_tokens
601             if internal:
602                 gone_tokens.update(existing_tokens[False] & word_tokens)
603             if gone_tokens:
604                 cur.execute("""DELETE FROM word
605                                USING unnest(%s::text[]) as token
606                                WHERE type = 'C' and word = %s
607                                      and word_token = token""",
608                             (list(gone_tokens), country_code))
609
610             # Only add those names that are not yet in the list.
611             new_tokens = word_tokens - existing_tokens[True]
612             if not internal:
613                 new_tokens -= existing_tokens[False]
614             if new_tokens:
615                 if internal:
616                     sql = """INSERT INTO word (word_token, type, word, info)
617                                (SELECT token, 'C', %s, '{"internal": "yes"}'
618                                   FROM unnest(%s::text[]) as token)
619                            """
620                 else:
621                     sql = """INSERT INTO word (word_token, type, word)
622                                    (SELECT token, 'C', %s
623                                     FROM unnest(%s::text[]) as token)
624                           """
625                 cur.execute(sql, (country_code, list(new_tokens)))
626
627     def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
628         """ Determine tokenizer information about the given place.
629
630             Returns a JSON-serializable structure that will be handed into
631             the database via the token_info field.
632         """
633         token_info = _TokenInfo()
634
635         names, address = self.sanitizer.process_names(place)
636
637         if names:
638             token_info.set_names(*self._compute_name_tokens(names))
639
640             if place.is_country():
641                 assert place.country_code is not None
642                 self._add_country_full_names(place.country_code, names)
643
644         if address:
645             self._process_place_address(token_info, address)
646
647         return token_info.to_dict()
648
649     def _process_place_address(self, token_info: '_TokenInfo',
650                                address: Sequence[PlaceName]) -> None:
651         for item in address:
652             if item.kind == 'postcode':
653                 token_info.set_postcode(self._add_postcode(item))
654             elif item.kind == 'housenumber':
655                 token_info.add_housenumber(*self._compute_housenumber_token(item))
656             elif item.kind == 'street':
657                 token_info.add_street(self._retrieve_full_tokens(item.name))
658             elif item.kind == 'place':
659                 if not item.suffix:
660                     token_info.add_place(itertools.chain(*self._compute_name_tokens([item])))
661             elif (not item.kind.startswith('_') and not item.suffix and
662                   item.kind not in ('country', 'full', 'inclusion')):
663                 token_info.add_address_term(item.kind,
664                                             itertools.chain(*self._compute_name_tokens([item])))
665
666     def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
667         """ Normalize the housenumber and return the word token and the
668             canonical form.
669         """
670         assert self.conn is not None
671         analyzer = self.token_analysis.analysis.get('@housenumber')
672         result: Tuple[Optional[int], Optional[str]] = (None, None)
673
674         if analyzer is None:
675             # When no custom analyzer is set, simply normalize and transliterate
676             norm_name = self._search_normalized(hnr.name)
677             if norm_name:
678                 result = self._cache.housenumbers.get(norm_name, result)
679                 if result[0] is None:
680                     hid = execute_scalar(self.conn, "SELECT getorcreate_hnr_id(%s)", (norm_name, ))
681
682                     result = hid, norm_name
683                     self._cache.housenumbers[norm_name] = result
684         else:
685             # Otherwise use the analyzer to determine the canonical name.
686             # Per convention we use the first variant as the 'lookup name', the
687             # name that gets saved in the housenumber field of the place.
688             word_id = analyzer.get_canonical_id(hnr)
689             if word_id:
690                 result = self._cache.housenumbers.get(word_id, result)
691                 if result[0] is None:
692                     variants = analyzer.compute_variants(word_id)
693                     if variants:
694                         hid = execute_scalar(self.conn, "SELECT create_analyzed_hnr_id(%s, %s)",
695                                              (word_id, list(variants)))
696                         result = hid, variants[0]
697                         self._cache.housenumbers[word_id] = result
698
699         return result
700
701     def _retrieve_full_tokens(self, name: str) -> List[int]:
702         """ Get the full name token for the given name, if it exists.
703             The name is only retrieved for the standard analyzer.
704         """
705         assert self.conn is not None
706         norm_name = self._search_normalized(name)
707
708         # return cached if possible
709         if norm_name in self._cache.fulls:
710             return self._cache.fulls[norm_name]
711
712         with self.conn.cursor() as cur:
713             cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
714                         (norm_name, ))
715             full = [row[0] for row in cur]
716
717         self._cache.fulls[norm_name] = full
718
719         return full
720
721     def _compute_name_tokens(self, names: Sequence[PlaceName]) -> Tuple[Set[int], Set[int]]:
722         """ Compute the full name and partial name tokens for the given
723             list of names.
724         """
725         assert self.conn is not None
726         full_tokens: Set[int] = set()
727         partial_tokens: Set[int] = set()
728
729         for name in names:
730             analyzer_id = name.get_attr('analyzer')
731             analyzer = self.token_analysis.get_analyzer(analyzer_id)
732             word_id = analyzer.get_canonical_id(name)
733             if analyzer_id is None:
734                 token_id = word_id
735             else:
736                 token_id = f'{word_id}@{analyzer_id}'
737
738             full, part = self._cache.names.get(token_id, (None, None))
739             if full is None:
740                 variants = analyzer.compute_variants(word_id)
741                 if not variants:
742                     continue
743
744                 with self.conn.cursor() as cur:
745                     cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
746                                 (token_id, variants))
747                     full, part = cast(Tuple[int, List[int]], cur.fetchone())
748
749                 self._cache.names[token_id] = (full, part)
750
751             assert part is not None
752
753             full_tokens.add(full)
754             partial_tokens.update(part)
755
756         return full_tokens, partial_tokens
757
758     def _add_postcode(self, item: PlaceName) -> Optional[str]:
759         """ Make sure the normalized postcode is present in the word table.
760         """
761         assert self.conn is not None
762         analyzer = self.token_analysis.analysis.get('@postcode')
763
764         if analyzer is None:
765             postcode_name = item.name.strip().upper()
766             variant_base = None
767         else:
768             postcode_name = analyzer.get_canonical_id(item)
769             variant_base = item.get_attr("variant")
770
771         if variant_base:
772             postcode = f'{postcode_name}@{variant_base}'
773         else:
774             postcode = postcode_name
775
776         if postcode not in self._cache.postcodes:
777             term = self._search_normalized(postcode_name)
778             if not term:
779                 return None
780
781             variants = {term}
782             if analyzer is not None and variant_base:
783                 variants.update(analyzer.compute_variants(variant_base))
784
785             with self.conn.cursor() as cur:
786                 cur.execute("SELECT create_postcode_word(%s, %s)",
787                             (postcode, list(variants)))
788             self._cache.postcodes.add(postcode)
789
790         return postcode_name
791
792
793 class _TokenInfo:
794     """ Collect token information to be sent back to the database.
795     """
796     def __init__(self) -> None:
797         self.names: Optional[str] = None
798         self.housenumbers: Set[str] = set()
799         self.housenumber_tokens: Set[int] = set()
800         self.street_tokens: Optional[Set[int]] = None
801         self.place_tokens: Set[int] = set()
802         self.address_tokens: Dict[str, str] = {}
803         self.postcode: Optional[str] = None
804
805     def _mk_array(self, tokens: Iterable[Any]) -> str:
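        # Render the tokens as a PostgreSQL array literal, e.g. [1, 2, 3] -> '{1,2,3}'.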
806         return f"{{{','.join((str(s) for s in tokens))}}}"
807
808     def to_dict(self) -> Dict[str, Any]:
809         """ Return the token information in database importable format.
810         """
811         out: Dict[str, Any] = {}
812
813         if self.names:
814             out['names'] = self.names
815
816         if self.housenumbers:
817             out['hnr'] = ';'.join(self.housenumbers)
818             out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)
819
820         if self.street_tokens is not None:
821             out['street'] = self._mk_array(self.street_tokens)
822
823         if self.place_tokens:
824             out['place'] = self._mk_array(self.place_tokens)
825
826         if self.address_tokens:
827             out['addr'] = self.address_tokens
828
829         if self.postcode:
830             out['postcode'] = self.postcode
831
832         return out
833
834     def set_names(self, fulls: Iterable[int], partials: Iterable[int]) -> None:
835         """ Add token information for the normalised names.
836         """
837         self.names = self._mk_array(itertools.chain(fulls, partials))
838
839     def add_housenumber(self, token: Optional[int], hnr: Optional[str]) -> None:
840         """ Add a single housenumber token together with its normalised
841             form.
842         """
843         if token:
844             assert hnr is not None
845             self.housenumbers.add(hnr)
846             self.housenumber_tokens.add(token)
847
848     def add_street(self, tokens: Iterable[int]) -> None:
849         """ Add addr:street match terms.
850         """
851         if self.street_tokens is None:
852             self.street_tokens = set()
853         self.street_tokens.update(tokens)
854
855     def add_place(self, tokens: Iterable[int]) -> None:
856         """ Add addr:place search and match terms.
857         """
858         self.place_tokens.update(tokens)
859
860     def add_address_term(self, key: str, partials: Iterable[int]) -> None:
861         """ Add additional address terms.
862         """
863         array = self._mk_array(partials)
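        # An empty array renders as '{}' (length 2); only keep non-empty ones.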
864         if len(array) > 2:
865             self.address_tokens[key] = array
866
867     def set_postcode(self, postcode: Optional[str]) -> None:
868         """ Set the postcode to the given one.
869         """
870         self.postcode = postcode
871
872
873 class _TokenCache:
874     """ Cache for token information to avoid repeated database queries.
875
876         This cache is not thread-safe and needs to be instantiated per
877         analyzer.
878     """
879     def __init__(self) -> None:
880         self.names: Dict[str, Tuple[int, List[int]]] = {}
881         self.partials: Dict[str, int] = {}
882         self.fulls: Dict[str, List[int]] = {}
883         self.postcodes: Set[str] = set()
884         self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {}