]> git.openstreetmap.org Git - nominatim.git/blob - src/nominatim_db/tokenizer/icu_tokenizer.py
look up all places at once
[nominatim.git] / src / nominatim_db / tokenizer / icu_tokenizer.py
1 # SPDX-License-Identifier: GPL-3.0-or-later
2 #
3 # This file is part of Nominatim. (https://nominatim.org)
4 #
5 # Copyright (C) 2024 by the Nominatim developer community.
6 # For a full list of authors see the git log.
7 """
8 Tokenizer implementing normalisation as used before Nominatim 4 but using
9 libICU instead of the PostgreSQL module.
10 """
11 from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \
12                    Dict, Set, Iterable
13 import itertools
14 import logging
15 from pathlib import Path
16
17 from psycopg.types.json import Jsonb
18 from psycopg import sql as pysql
19
20 from ..db.connection import connect, Connection, Cursor, server_version_tuple, \
21                             drop_tables, table_exists, execute_scalar
22 from ..config import Configuration
23 from ..db.sql_preprocessor import SQLPreprocessor
24 from ..data.place_info import PlaceInfo
25 from ..data.place_name import PlaceName
26 from .icu_rule_loader import ICURuleLoader
27 from .place_sanitizer import PlaceSanitizer
28 from .icu_token_analysis import ICUTokenAnalysis
29 from .base import AbstractAnalyzer, AbstractTokenizer
30
# Key name for the term normalization setting. It is not referenced in the
# visible part of this file - presumably a database property name; TODO confirm.
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

# getLogger() without a name returns the root logger.
LOG = logging.getLogger()

# Pairs of (index name suffix, value of the `type` column of the word table).
# For every pair a partial index idx_<table>_<suffix> on the `word` column
# is created and later renamed together with the table.
WORD_TYPES = (('country_names', 'C'),
              ('postcodes', 'P'),
              ('full_word', 'W'),
              ('housenumbers', 'H'))
39
40
def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
    """ Factory function of this module: build the ICU tokenizer for
        the database reachable through `dsn`.
    """
    tokenizer = ICUTokenizer(dsn, data_dir)
    return tokenizer
45
46
47 class ICUTokenizer(AbstractTokenizer):
48     """ This tokenizer uses libICU to convert names and queries to ASCII.
49         Otherwise it uses the same algorithms and data structures as the
50         normalization routines in Nominatim 3.
51     """
52
53     def __init__(self, dsn: str, data_dir: Path) -> None:
54         self.dsn = dsn
55         self.data_dir = data_dir
56         self.loader: Optional[ICURuleLoader] = None
57
58     def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
59         """ Set up a new tokenizer for the database.
60
61             This copies all necessary data in the project directory to make
62             sure the tokenizer remains stable even over updates.
63         """
64         self.loader = ICURuleLoader(config)
65
66         self._save_config()
67
68         if init_db:
69             self.update_sql_functions(config)
70             self._setup_db_tables(config)
71             self._create_base_indices(config, 'word')
72
73     def init_from_project(self, config: Configuration) -> None:
74         """ Initialise the tokenizer from the project directory.
75         """
76         self.loader = ICURuleLoader(config)
77
78         with connect(self.dsn) as conn:
79             self.loader.load_config_from_db(conn)
80
    def finalize_import(self, config: Configuration) -> None:
        """ Do any required postprocessing to make the tokenizer data ready
            for use.

            For this tokenizer that means creating the lookup indexes
            on the `word` table.
        """
        self._create_lookup_indices(config, 'word')
86
87     def update_sql_functions(self, config: Configuration) -> None:
88         """ Reimport the SQL functions for this tokenizer.
89         """
90         with connect(self.dsn) as conn:
91             sqlp = SQLPreprocessor(conn, config)
92             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
93
    def check_database(self, config: Configuration) -> None:
        """ Check that the tokenizer is set up correctly.

            The check consists of loading the tokenizer configuration
            through init_from_project(); problems surface as an
            exception raised from there.
        """
        # Will throw an error if there is an issue.
        self.init_from_project(config)
99
    def update_statistics(self, config: Configuration, threads: int = 2) -> None:
        """ Recompute frequencies for all name words.

            The counts are derived from the `name_vector` and
            `nameaddress_vector` columns of the `search_name` table and
            stored under the keys 'count' and 'addr_count' in the `info`
            column of a new table `tmp_word`. That table then gets its
            indexes and replaces the `word` table. Nothing is done when
            `search_name` does not exist.

            With `threads` greater than 1, max_parallel_workers_per_gather
            is raised for the computation, capped at 6.
        """
        with connect(self.dsn) as conn:
            # Without a search_name table there is nothing to count.
            if not table_exists(conn, 'search_name'):
                return

            with conn.cursor() as cur:
                cur.execute('ANALYSE search_name')
                if threads > 1:
                    cur.execute(pysql.SQL('SET max_parallel_workers_per_gather TO {}')
                                     .format(pysql.Literal(min(threads, 6),)))

                # Two implementations, chosen by server version. The second
                # one uses `AS MATERIALIZED` in its CTEs; the first one works
                # with two temporary tables and a PL/pgSQL helper instead.
                if server_version_tuple(conn) < (12, 0):
                    LOG.info('Computing word frequencies')
                    drop_tables(conn, 'word_frequencies', 'addressword_frequencies')
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute('CREATE INDEX ON word_frequencies(id)')
                    cur.execute("""CREATE TEMP TABLE addressword_frequencies AS
                                     SELECT unnest(nameaddress_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute('CREATE INDEX ON addressword_frequencies(id)')
                    # Helper that merges both counts into the info JSON of
                    # a single word and returns NULL for an empty result.
                    cur.execute("""
                        CREATE OR REPLACE FUNCTION word_freq_update(wid INTEGER,
                                                                    INOUT info JSONB)
                        AS $$
                        DECLARE rec RECORD;
                        BEGIN
                        IF info is null THEN
                          info = '{}'::jsonb;
                        END IF;
                        FOR rec IN SELECT count FROM word_frequencies WHERE id = wid
                        LOOP
                          info = info || jsonb_build_object('count', rec.count);
                        END LOOP;
                        FOR rec IN SELECT count FROM addressword_frequencies WHERE id = wid
                        LOOP
                          info = info || jsonb_build_object('addr_count', rec.count);
                        END LOOP;
                        IF info = '{}'::jsonb THEN
                          info = null;
                        END IF;
                        END;
                        $$ LANGUAGE plpgsql IMMUTABLE;
                        """)
                    LOG.info('Update word table with recomputed frequencies')
                    drop_tables(conn, 'tmp_word')
                    cur.execute("""CREATE TABLE tmp_word AS
                                    SELECT word_id, word_token, type, word,
                                           word_freq_update(word_id, info) as info
                                    FROM word
                                """)
                    drop_tables(conn, 'word_frequencies', 'addressword_frequencies')
                else:
                    LOG.info('Computing word frequencies')
                    drop_tables(conn, 'word_frequencies')
                    # One temporary table holding the ready-made JSON with
                    # 'count' and/or 'addr_count' per word id.
                    cur.execute("""
                      CREATE TEMP TABLE word_frequencies AS
                      WITH word_freq AS MATERIALIZED (
                               SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id),
                           addr_freq AS MATERIALIZED (
                               SELECT unnest(nameaddress_vector) as id, count(*)
                                     FROM search_name GROUP BY id)
                      SELECT coalesce(a.id, w.id) as id,
                             (CASE WHEN w.count is null THEN '{}'::JSONB
                                  ELSE jsonb_build_object('count', w.count) END
                              ||
                              CASE WHEN a.count is null THEN '{}'::JSONB
                                  ELSE jsonb_build_object('addr_count', a.count) END) as info
                      FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id;
                      """)
                    cur.execute('CREATE UNIQUE INDEX ON word_frequencies(id) INCLUDE(info)')
                    cur.execute('ANALYSE word_frequencies')
                    LOG.info('Update word table with recomputed frequencies')
                    drop_tables(conn, 'tmp_word')
                    # Words without a frequency entry keep their info as is.
                    cur.execute("""CREATE TABLE tmp_word AS
                                    SELECT word_id, word_token, type, word,
                                           (CASE WHEN wf.info is null THEN word.info
                                            ELSE coalesce(word.info, '{}'::jsonb) || wf.info
                                            END) as info
                                    FROM word LEFT JOIN word_frequencies wf
                                         ON word.word_id = wf.id
                                """)
                    drop_tables(conn, 'word_frequencies')

            # Switch parallel workers per gather off again for this connection.
            with conn.cursor() as cur:
                cur.execute('SET max_parallel_workers_per_gather TO 0')

            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_string(conn,
                            'GRANT SELECT ON tmp_word TO "{{config.DATABASE_WEBUSER}}"')
            conn.commit()
        # Index the new table first, then swap it in for the word table.
        self._create_base_indices(config, 'tmp_word')
        self._create_lookup_indices(config, 'tmp_word')
        self._move_temporary_word_table('tmp_word')
198
199     def _cleanup_housenumbers(self) -> None:
200         """ Remove unused house numbers.
201         """
202         with connect(self.dsn) as conn:
203             if not table_exists(conn, 'search_name'):
204                 return
205             with conn.cursor(name="hnr_counter") as cur:
206                 cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
207                                FROM word
208                                WHERE type = 'H'
209                                  AND NOT EXISTS(SELECT * FROM search_name
210                                                 WHERE ARRAY[word.word_id] && name_vector)
211                                  AND (char_length(coalesce(word, word_token)) > 6
212                                       OR coalesce(word, word_token) not similar to '\\d+')
213                             """)
214                 candidates = {token: wid for wid, token in cur}
215             with conn.cursor(name="hnr_counter") as cur:
216                 cur.execute("""SELECT housenumber FROM placex
217                                WHERE housenumber is not null
218                                      AND (char_length(housenumber) > 6
219                                           OR housenumber not similar to '\\d+')
220                             """)
221                 for row in cur:
222                     for hnr in row[0].split(';'):
223                         candidates.pop(hnr, None)
224             LOG.info("There are %s outdated housenumbers.", len(candidates))
225             LOG.debug("Outdated housenumbers: %s", candidates.keys())
226             if candidates:
227                 with conn.cursor() as cur:
228                     cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
229                                 (list(candidates.values()), ))
230                 conn.commit()
231
    def update_word_tokens(self) -> None:
        """ Remove unused tokens.

            Currently this only covers housenumber tokens, see
            _cleanup_housenumbers(). Start and end of the clean-up
            are logged on warning level.
        """
        LOG.warning("Cleaning up housenumber tokens.")
        self._cleanup_housenumbers()
        LOG.warning("Tokenizer house-keeping done.")
238
239     def name_analyzer(self) -> 'ICUNameAnalyzer':
240         """ Create a new analyzer for tokenizing names and queries
241             using this tokinzer. Analyzers are context managers and should
242             be used accordingly:
243
244             ```
245             with tokenizer.name_analyzer() as analyzer:
246                 analyser.tokenize()
247             ```
248
249             When used outside the with construct, the caller must ensure to
250             call the close() function before destructing the analyzer.
251
252             Analyzers are not thread-safe. You need to instantiate one per thread.
253         """
254         assert self.loader is not None
255         return ICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
256                                self.loader.make_token_analysis())
257
258     def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
259         """ Return a list of the `num` most frequent full words
260             in the database.
261         """
262         with conn.cursor() as cur:
263             cur.execute("""SELECT word, sum((info->>'count')::int) as count
264                              FROM word WHERE type = 'W'
265                              GROUP BY word
266                              ORDER BY count DESC LIMIT %s""", (num,))
267             return list(s[0].split('@')[0] for s in cur)
268
269     def _save_config(self) -> None:
270         """ Save the configuration that needs to remain stable for the given
271             database as database properties.
272         """
273         assert self.loader is not None
274         with connect(self.dsn) as conn:
275             self.loader.save_config_to_db(conn)
276
277     def _setup_db_tables(self, config: Configuration) -> None:
278         """ Set up the word table and fill it with pre-computed word
279             frequencies.
280         """
281         with connect(self.dsn) as conn:
282             drop_tables(conn, 'word')
283             sqlp = SQLPreprocessor(conn, config)
284             sqlp.run_string(conn, """
285                 CREATE TABLE word (
286                       word_id INTEGER,
287                       word_token text NOT NULL,
288                       type text NOT NULL,
289                       word text,
290                       info jsonb
291                     ) {{db.tablespace.search_data}};
292                 GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
293
294                 DROP SEQUENCE IF EXISTS seq_word;
295                 CREATE SEQUENCE seq_word start 1;
296                 GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
297             """)
298             conn.commit()
299
300     def _create_base_indices(self, config: Configuration, table_name: str) -> None:
301         """ Set up the word table and fill it with pre-computed word
302             frequencies.
303         """
304         with connect(self.dsn) as conn:
305             sqlp = SQLPreprocessor(conn, config)
306             sqlp.run_string(conn,
307                             """CREATE INDEX idx_{{table_name}}_word_token ON {{table_name}}
308                                USING BTREE (word_token) {{db.tablespace.search_index}}""",
309                             table_name=table_name)
310             for name, ctype in WORD_TYPES:
311                 sqlp.run_string(conn,
312                                 """CREATE INDEX idx_{{table_name}}_{{idx_name}} ON {{table_name}}
313                                    USING BTREE (word) {{db.tablespace.address_index}}
314                                    WHERE type = '{{column_type}}'
315                                 """,
316                                 table_name=table_name, idx_name=name,
317                                 column_type=ctype)
318             conn.commit()
319
320     def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
321         """ Create additional indexes used when running the API.
322         """
323         with connect(self.dsn) as conn:
324             sqlp = SQLPreprocessor(conn, config)
325             # Index required for details lookup.
326             sqlp.run_string(
327                 conn,
328                 """
329                 CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
330                   ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
331                 """,
332                 table_name=table_name)
333             conn.commit()
334
335     def _move_temporary_word_table(self, old: str) -> None:
336         """ Rename all tables and indexes used by the tokenizer.
337         """
338         with connect(self.dsn) as conn:
339             drop_tables(conn, 'word')
340             with conn.cursor() as cur:
341                 cur.execute(f"ALTER TABLE {old} RENAME TO word")
342                 for idx in ('word_token', 'word_id'):
343                     cur.execute(f"""ALTER INDEX idx_{old}_{idx}
344                                       RENAME TO idx_word_{idx}""")
345                 for name, _ in WORD_TYPES:
346                     cur.execute(f"""ALTER INDEX idx_{old}_{name}
347                                     RENAME TO idx_word_{name}""")
348             conn.commit()
349
350
351 class ICUNameAnalyzer(AbstractAnalyzer):
352     """ The ICU analyzer uses the ICU library for splitting names.
353
354         Each instance opens a connection to the database to request the
355         normalization.
356     """
357
358     def __init__(self, dsn: str, sanitizer: PlaceSanitizer,
359                  token_analysis: ICUTokenAnalysis) -> None:
360         self.conn: Optional[Connection] = connect(dsn)
361         self.conn.autocommit = True
362         self.sanitizer = sanitizer
363         self.token_analysis = token_analysis
364
365         self._cache = _TokenCache()
366
367     def close(self) -> None:
368         """ Free all resources used by the analyzer.
369         """
370         if self.conn:
371             self.conn.close()
372             self.conn = None
373
374     def _search_normalized(self, name: str) -> str:
375         """ Return the search token transliteration of the given name.
376         """
377         return cast(str, self.token_analysis.search.transliterate(name)).strip()
378
379     def _normalized(self, name: str) -> str:
380         """ Return the normalized version of the given name with all
381             non-relevant information removed.
382         """
383         return cast(str, self.token_analysis.normalizer.transliterate(name)).strip()
384
385     def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
386         """ Return token information for the given list of words.
387             If a word starts with # it is assumed to be a full name
388             otherwise is a partial name.
389
390             The function returns a list of tuples with
391             (original word, word token, word id).
392
393             The function is used for testing and debugging only
394             and not necessarily efficient.
395         """
396         assert self.conn is not None
397         full_tokens = {}
398         partial_tokens = {}
399         for word in words:
400             if word.startswith('#'):
401                 full_tokens[word] = self._search_normalized(word[1:])
402             else:
403                 partial_tokens[word] = self._search_normalized(word)
404
405         with self.conn.cursor() as cur:
406             cur.execute("""SELECT word_token, word_id
407                             FROM word WHERE word_token = ANY(%s) and type = 'W'
408                         """, (list(full_tokens.values()),))
409             full_ids = {r[0]: r[1] for r in cur}
410             cur.execute("""SELECT word_token, word_id
411                             FROM word WHERE word_token = ANY(%s) and type = 'w'""",
412                         (list(partial_tokens.values()),))
413             part_ids = {r[0]: r[1] for r in cur}
414
415         return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
416             + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
417
418     def normalize_postcode(self, postcode: str) -> str:
419         """ Convert the postcode to a standardized form.
420
421             This function must yield exactly the same result as the SQL function
422             'token_normalized_postcode()'.
423         """
424         return postcode.strip().upper()
425
426     def update_postcodes_from_db(self) -> None:
427         """ Update postcode tokens in the word table from the location_postcode
428             table.
429         """
430         assert self.conn is not None
431         analyzer = self.token_analysis.analysis.get('@postcode')
432
433         with self.conn.cursor() as cur:
434             # First get all postcode names currently in the word table.
435             cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
436             word_entries = set((entry[0] for entry in cur))
437
438             # Then compute the required postcode names from the postcode table.
439             needed_entries = set()
440             cur.execute("SELECT country_code, postcode FROM location_postcode")
441             for cc, postcode in cur:
442                 info = PlaceInfo({'country_code': cc,
443                                   'class': 'place', 'type': 'postcode',
444                                   'address': {'postcode': postcode}})
445                 address = self.sanitizer.process_names(info)[1]
446                 for place in address:
447                     if place.kind == 'postcode':
448                         if analyzer is None:
449                             postcode_name = place.name.strip().upper()
450                             variant_base = None
451                         else:
452                             postcode_name = analyzer.get_canonical_id(place)
453                             variant_base = place.get_attr("variant")
454
455                         if variant_base:
456                             needed_entries.add(f'{postcode_name}@{variant_base}')
457                         else:
458                             needed_entries.add(postcode_name)
459                         break
460
461         # Now update the word table.
462         self._delete_unused_postcode_words(word_entries - needed_entries)
463         self._add_missing_postcode_words(needed_entries - word_entries)
464
465     def _delete_unused_postcode_words(self, tokens: Iterable[str]) -> None:
466         assert self.conn is not None
467         if tokens:
468             with self.conn.cursor() as cur:
469                 cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
470                             (list(tokens), ))
471
472     def _add_missing_postcode_words(self, tokens: Iterable[str]) -> None:
473         assert self.conn is not None
474         if not tokens:
475             return
476
477         analyzer = self.token_analysis.analysis.get('@postcode')
478         terms = []
479
480         for postcode_name in tokens:
481             if '@' in postcode_name:
482                 term, variant = postcode_name.split('@', 2)
483                 term = self._search_normalized(term)
484                 if analyzer is None:
485                     variants = [term]
486                 else:
487                     variants = analyzer.compute_variants(variant)
488                     if term not in variants:
489                         variants.append(term)
490             else:
491                 variants = [self._search_normalized(postcode_name)]
492             terms.append((postcode_name, variants))
493
494         if terms:
495             with self.conn.cursor() as cur:
496                 cur.executemany("""SELECT create_postcode_word(%s, %s)""", terms)
497
498     def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
499                                should_replace: bool) -> None:
500         """ Replace the search index for special phrases with the new phrases.
501             If `should_replace` is True, then the previous set of will be
502             completely replaced. Otherwise the phrases are added to the
503             already existing ones.
504         """
505         assert self.conn is not None
506         norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
507                             for p in phrases))
508
509         with self.conn.cursor() as cur:
510             # Get the old phrases.
511             existing_phrases = set()
512             cur.execute("SELECT word, info FROM word WHERE type = 'S'")
513             for word, info in cur:
514                 existing_phrases.add((word, info['class'], info['type'],
515                                       info.get('op') or '-'))
516
517             added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
518             if should_replace:
519                 deleted = self._remove_special_phrases(cur, norm_phrases,
520                                                        existing_phrases)
521             else:
522                 deleted = 0
523
524         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
525                  len(norm_phrases), added, deleted)
526
527     def _add_special_phrases(self, cursor: Cursor,
528                              new_phrases: Set[Tuple[str, str, str, str]],
529                              existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
530         """ Add all phrases to the database that are not yet there.
531         """
532         to_add = new_phrases - existing_phrases
533
534         added = 0
535         with cursor.copy('COPY word(word_token, type, word, info) FROM STDIN') as copy:
536             for word, cls, typ, oper in to_add:
537                 term = self._search_normalized(word)
538                 if term:
539                     copy.write_row((term, 'S', word,
540                                     Jsonb({'class': cls, 'type': typ,
541                                            'op': oper if oper in ('in', 'near') else None})))
542                     added += 1
543
544         return added
545
546     def _remove_special_phrases(self, cursor: Cursor,
547                                 new_phrases: Set[Tuple[str, str, str, str]],
548                                 existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
549         """ Remove all phrases from the database that are no longer in the
550             new phrase list.
551         """
552         to_delete = existing_phrases - new_phrases
553
554         if to_delete:
555             cursor.executemany(
556                 """ DELETE FROM word
557                       WHERE type = 'S' and word = %s
558                             and info->>'class' = %s and info->>'type' = %s
559                             and %s = coalesce(info->>'op', '-')
560                 """, to_delete)
561
562         return len(to_delete)
563
564     def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
565         """ Add default names for the given country to the search index.
566         """
567         # Make sure any name preprocessing for country names applies.
568         info = PlaceInfo({'name': names, 'country_code': country_code,
569                           'rank_address': 4, 'class': 'boundary',
570                           'type': 'administrative'})
571         self._add_country_full_names(country_code,
572                                      self.sanitizer.process_names(info)[0],
573                                      internal=True)
574
    def _add_country_full_names(self, country_code: str, names: Sequence[PlaceName],
                                internal: bool = False) -> None:
        """ Add names for the given country from an already sanitized
            name list.

            The word table distinguishes internal country names (the info
            column has the key 'internal') from external ones. `internal`
            states which kind the given names are. Names of that kind which
            are no longer in the list are deleted, names not yet known
            are inserted.
        """
        assert self.conn is not None
        # Search tokens of all names, empty tokens are left out.
        word_tokens = set()
        for name in names:
            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            # Get existing names
            cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
                             FROM word
                             WHERE type = 'C' and word = %s""",
                        (country_code, ))
            # internal/external names
            existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()}
            for word in cur:
                existing_tokens[word[1]].add(word[0])

            # Delete names that no longer exist.
            gone_tokens = existing_tokens[internal] - word_tokens
            if internal:
                # External entries for tokens of the new internal names are
                # deleted as well. They are not excluded from new_tokens
                # below, so they get inserted again as internal names.
                gone_tokens.update(existing_tokens[False] & word_tokens)
            if gone_tokens:
                cur.execute("""DELETE FROM word
                               USING unnest(%s::text[]) as token
                               WHERE type = 'C' and word = %s
                                     and word_token = token""",
                            (list(gone_tokens), country_code))

            # Only add those names that are not yet in the list.
            new_tokens = word_tokens - existing_tokens[True]
            if not internal:
                # External names are also skipped when they exist as
                # external names already.
                new_tokens -= existing_tokens[False]
            if new_tokens:
                if internal:
                    sql = """INSERT INTO word (word_token, type, word, info)
                               (SELECT token, 'C', %s, '{"internal": "yes"}'
                                  FROM unnest(%s::text[]) as token)
                           """
                else:
                    sql = """INSERT INTO word (word_token, type, word)
                                   (SELECT token, 'C', %s
                                    FROM unnest(%s::text[]) as token)
                          """
                cur.execute(sql, (country_code, list(new_tokens)))
625
626     def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
627         """ Determine tokenizer information about the given place.
628
629             Returns a JSON-serializable structure that will be handed into
630             the database via the token_info field.
631         """
632         token_info = _TokenInfo()
633
634         names, address = self.sanitizer.process_names(place)
635
636         if names:
637             token_info.set_names(*self._compute_name_tokens(names))
638
639             if place.is_country():
640                 assert place.country_code is not None
641                 self._add_country_full_names(place.country_code, names)
642
643         if address:
644             self._process_place_address(token_info, address)
645
646         return token_info.to_dict()
647
648     def _process_place_address(self, token_info: '_TokenInfo',
649                                address: Sequence[PlaceName]) -> None:
650         for item in address:
651             if item.kind == 'postcode':
652                 token_info.set_postcode(self._add_postcode(item))
653             elif item.kind == 'housenumber':
654                 token_info.add_housenumber(*self._compute_housenumber_token(item))
655             elif item.kind == 'street':
656                 token_info.add_street(self._retrieve_full_tokens(item.name))
657             elif item.kind == 'place':
658                 if not item.suffix:
659                     token_info.add_place(itertools.chain(*self._compute_name_tokens([item])))
660             elif (not item.kind.startswith('_') and not item.suffix and
661                   item.kind not in ('country', 'full', 'inclusion')):
662                 token_info.add_address_term(item.kind,
663                                             itertools.chain(*self._compute_name_tokens([item])))
664
665     def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
666         """ Normalize the housenumber and return the word token and the
667             canonical form.
668         """
669         assert self.conn is not None
670         analyzer = self.token_analysis.analysis.get('@housenumber')
671         result: Tuple[Optional[int], Optional[str]] = (None, None)
672
673         if analyzer is None:
674             # When no custom analyzer is set, simply normalize and transliterate
675             norm_name = self._search_normalized(hnr.name)
676             if norm_name:
677                 result = self._cache.housenumbers.get(norm_name, result)
678                 if result[0] is None:
679                     hid = execute_scalar(self.conn, "SELECT getorcreate_hnr_id(%s)", (norm_name, ))
680
681                     result = hid, norm_name
682                     self._cache.housenumbers[norm_name] = result
683         else:
684             # Otherwise use the analyzer to determine the canonical name.
685             # Per convention we use the first variant as the 'lookup name', the
686             # name that gets saved in the housenumber field of the place.
687             word_id = analyzer.get_canonical_id(hnr)
688             if word_id:
689                 result = self._cache.housenumbers.get(word_id, result)
690                 if result[0] is None:
691                     variants = analyzer.compute_variants(word_id)
692                     if variants:
693                         hid = execute_scalar(self.conn, "SELECT create_analyzed_hnr_id(%s, %s)",
694                                              (word_id, list(variants)))
695                         result = hid, variants[0]
696                         self._cache.housenumbers[word_id] = result
697
698         return result
699
700     def _retrieve_full_tokens(self, name: str) -> List[int]:
701         """ Get the full name token for the given name, if it exists.
702             The name is only retrieved for the standard analyser.
703         """
704         assert self.conn is not None
705         norm_name = self._search_normalized(name)
706
707         # return cached if possible
708         if norm_name in self._cache.fulls:
709             return self._cache.fulls[norm_name]
710
711         with self.conn.cursor() as cur:
712             cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
713                         (norm_name, ))
714             full = [row[0] for row in cur]
715
716         self._cache.fulls[norm_name] = full
717
718         return full
719
720     def _compute_name_tokens(self, names: Sequence[PlaceName]) -> Tuple[Set[int], Set[int]]:
721         """ Computes the full name and partial name tokens for the given
722             dictionary of names.
723         """
724         assert self.conn is not None
725         full_tokens: Set[int] = set()
726         partial_tokens: Set[int] = set()
727
728         for name in names:
729             analyzer_id = name.get_attr('analyzer')
730             analyzer = self.token_analysis.get_analyzer(analyzer_id)
731             word_id = analyzer.get_canonical_id(name)
732             if analyzer_id is None:
733                 token_id = word_id
734             else:
735                 token_id = f'{word_id}@{analyzer_id}'
736
737             full, part = self._cache.names.get(token_id, (None, None))
738             if full is None:
739                 variants = analyzer.compute_variants(word_id)
740                 if not variants:
741                     continue
742
743                 with self.conn.cursor() as cur:
744                     cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
745                                 (token_id, variants))
746                     full, part = cast(Tuple[int, List[int]], cur.fetchone())
747
748                 self._cache.names[token_id] = (full, part)
749
750             assert part is not None
751
752             full_tokens.add(full)
753             partial_tokens.update(part)
754
755         return full_tokens, partial_tokens
756
757     def _add_postcode(self, item: PlaceName) -> Optional[str]:
758         """ Make sure the normalized postcode is present in the word table.
759         """
760         assert self.conn is not None
761         analyzer = self.token_analysis.analysis.get('@postcode')
762
763         if analyzer is None:
764             postcode_name = item.name.strip().upper()
765             variant_base = None
766         else:
767             postcode_name = analyzer.get_canonical_id(item)
768             variant_base = item.get_attr("variant")
769
770         if variant_base:
771             postcode = f'{postcode_name}@{variant_base}'
772         else:
773             postcode = postcode_name
774
775         if postcode not in self._cache.postcodes:
776             term = self._search_normalized(postcode_name)
777             if not term:
778                 return None
779
780             variants = {term}
781             if analyzer is not None and variant_base:
782                 variants.update(analyzer.compute_variants(variant_base))
783
784             with self.conn.cursor() as cur:
785                 cur.execute("SELECT create_postcode_word(%s, %s)",
786                             (postcode, list(variants)))
787             self._cache.postcodes.add(postcode)
788
789         return postcode_name
790
791
792 class _TokenInfo:
793     """ Collect token information to be sent back to the database.
794     """
795     def __init__(self) -> None:
796         self.names: Optional[str] = None
797         self.housenumbers: Set[str] = set()
798         self.housenumber_tokens: Set[int] = set()
799         self.street_tokens: Optional[Set[int]] = None
800         self.place_tokens: Set[int] = set()
801         self.address_tokens: Dict[str, str] = {}
802         self.postcode: Optional[str] = None
803
804     def _mk_array(self, tokens: Iterable[Any]) -> str:
805         return f"{{{','.join((str(s) for s in tokens))}}}"
806
807     def to_dict(self) -> Dict[str, Any]:
808         """ Return the token information in database importable format.
809         """
810         out: Dict[str, Any] = {}
811
812         if self.names:
813             out['names'] = self.names
814
815         if self.housenumbers:
816             out['hnr'] = ';'.join(self.housenumbers)
817             out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)
818
819         if self.street_tokens is not None:
820             out['street'] = self._mk_array(self.street_tokens)
821
822         if self.place_tokens:
823             out['place'] = self._mk_array(self.place_tokens)
824
825         if self.address_tokens:
826             out['addr'] = self.address_tokens
827
828         if self.postcode:
829             out['postcode'] = self.postcode
830
831         return out
832
833     def set_names(self, fulls: Iterable[int], partials: Iterable[int]) -> None:
834         """ Adds token information for the normalised names.
835         """
836         self.names = self._mk_array(itertools.chain(fulls, partials))
837
838     def add_housenumber(self, token: Optional[int], hnr: Optional[str]) -> None:
839         """ Extract housenumber information from a list of normalised
840             housenumbers.
841         """
842         if token:
843             assert hnr is not None
844             self.housenumbers.add(hnr)
845             self.housenumber_tokens.add(token)
846
847     def add_street(self, tokens: Iterable[int]) -> None:
848         """ Add addr:street match terms.
849         """
850         if self.street_tokens is None:
851             self.street_tokens = set()
852         self.street_tokens.update(tokens)
853
854     def add_place(self, tokens: Iterable[int]) -> None:
855         """ Add addr:place search and match terms.
856         """
857         self.place_tokens.update(tokens)
858
859     def add_address_term(self, key: str, partials: Iterable[int]) -> None:
860         """ Add additional address terms.
861         """
862         array = self._mk_array(partials)
863         if len(array) > 2:
864             self.address_tokens[key] = array
865
866     def set_postcode(self, postcode: Optional[str]) -> None:
867         """ Set the postcode to the given one.
868         """
869         self.postcode = postcode
870
871
872 class _TokenCache:
873     """ Cache for token information to avoid repeated database queries.
874
875         This cache is not thread-safe and needs to be instantiated per
876         analyzer.
877     """
878     def __init__(self) -> None:
879         self.names: Dict[str, Tuple[int, List[int]]] = {}
880         self.partials: Dict[str, int] = {}
881         self.fulls: Dict[str, List[int]] = {}
882         self.postcodes: Set[str] = set()
883         self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {}