# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \
                   Dict, Set, Iterable
import itertools
import logging
from pathlib import Path

from psycopg.types.json import Jsonb
from psycopg import sql as pysql

from ..db.connection import connect, Connection, Cursor, server_version_tuple, \
                            drop_tables, table_exists, execute_scalar
from ..config import Configuration
from ..db.sql_preprocessor import SQLPreprocessor
from ..data.place_info import PlaceInfo
from ..data.place_name import PlaceName
from .icu_rule_loader import ICURuleLoader
from .place_sanitizer import PlaceSanitizer
from .icu_token_analysis import ICUTokenAnalysis
from .base import AbstractAnalyzer, AbstractTokenizer
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()
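
# Single-letter codes used in the 'type' column of the word table:
# 'C' country name, 'P' postcode, 'W' full word, 'w' partial word,
# 'H' house number, 'S' special phrase. WORD_TYPES lists the types
# that get their own partial index on the word column.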
WORD_TYPES = (('country_names', 'C'),
              ('postcodes', 'P'),
              ('full_word', 'W'),
              ('housenumbers', 'H'))


def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
    """ Create a new instance of the tokenizer provided by this module.
    """
    return ICUTokenizer(dsn, data_dir)


class ICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn: str, data_dir: Path) -> None:
        self.dsn = dsn
        self.data_dir = data_dir
        self.loader: Optional[ICURuleLoader] = None

    def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        self.loader = ICURuleLoader(config)

        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._setup_db_tables(config)
            self._create_base_indices(config, 'word')

    def init_from_project(self, config: Configuration) -> None:
        """ Initialise the tokenizer from the project directory.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)

    def finalize_import(self, config: Configuration) -> None:
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        self._create_lookup_indices(config, 'word')

    def update_sql_functions(self, config: Configuration) -> None:
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')

    def check_database(self, config: Configuration) -> None:
        """ Check that the tokenizer is set up correctly.
        """
        # Will throw an error if there is an issue.
        self.init_from_project(config)

    def update_statistics(self, config: Configuration, threads: int = 2) -> None:
        """ Recompute frequencies for all name words.
        """
        with connect(self.dsn) as conn:
            if not table_exists(conn, 'search_name'):
                return

            with conn.cursor() as cur:
                cur.execute('ANALYSE search_name')
                if threads > 1:
                    cur.execute(pysql.SQL('SET max_parallel_workers_per_gather TO {}')
                                .format(pysql.Literal(min(threads, 6),)))
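
                # PostgreSQL before version 12 collects the frequencies in two
                # temporary tables and merges them into the word table through a
                # small plpgsql helper; newer versions do the same with a single
                # query using materialized CTEs.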
                if server_version_tuple(conn) < (12, 0):
                    LOG.info('Computing word frequencies')
                    drop_tables(conn, 'word_frequencies', 'addressword_frequencies')
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute('CREATE INDEX ON word_frequencies(id)')
                    cur.execute("""CREATE TEMP TABLE addressword_frequencies AS
                                     SELECT unnest(nameaddress_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute('CREATE INDEX ON addressword_frequencies(id)')
                    cur.execute("""CREATE OR REPLACE FUNCTION word_freq_update(wid INTEGER,
                                                                               INOUT info JSONB)
                                   AS $$
                                   DECLARE rec RECORD;
                                   BEGIN
                                   IF info is null THEN
                                     info = '{}'::jsonb;
                                   END IF;
                                   FOR rec IN SELECT count FROM word_frequencies WHERE id = wid
                                   LOOP
                                     info = info || jsonb_build_object('count', rec.count);
                                   END LOOP;
                                   FOR rec IN SELECT count FROM addressword_frequencies WHERE id = wid
                                   LOOP
                                     info = info || jsonb_build_object('addr_count', rec.count);
                                   END LOOP;
                                   IF info = '{}'::jsonb THEN
                                     info = null;
                                   END IF;
                                   END;
                                   $$ LANGUAGE plpgsql IMMUTABLE;
                                """)
                    LOG.info('Update word table with recomputed frequencies')
                    drop_tables(conn, 'tmp_word')
                    cur.execute("""CREATE TABLE tmp_word AS
                                    SELECT word_id, word_token, type, word,
                                           word_freq_update(word_id, info) as info
                                      FROM word
                                """)
                    drop_tables(conn, 'word_frequencies', 'addressword_frequencies')
                else:
                    LOG.info('Computing word frequencies')
                    drop_tables(conn, 'word_frequencies')
                    cur.execute("""
                      CREATE TEMP TABLE word_frequencies AS
                      WITH word_freq AS MATERIALIZED (
                               SELECT unnest(name_vector) as id, count(*)
                                 FROM search_name GROUP BY id),
                           addr_freq AS MATERIALIZED (
                               SELECT unnest(nameaddress_vector) as id, count(*)
                                 FROM search_name GROUP BY id)
                      SELECT coalesce(a.id, w.id) as id,
                             (CASE WHEN w.count is null THEN '{}'::JSONB
                                   ELSE jsonb_build_object('count', w.count) END
                              ||
                              CASE WHEN a.count is null THEN '{}'::JSONB
                                   ELSE jsonb_build_object('addr_count', a.count) END) as info
                      FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id;
                      """)
                    cur.execute('CREATE UNIQUE INDEX ON word_frequencies(id) INCLUDE(info)')
                    cur.execute('ANALYSE word_frequencies')
                    LOG.info('Update word table with recomputed frequencies')
                    drop_tables(conn, 'tmp_word')
                    cur.execute("""CREATE TABLE tmp_word AS
                                    SELECT word_id, word_token, type, word,
                                           (CASE WHEN wf.info is null THEN word.info
                                                 ELSE coalesce(word.info, '{}'::jsonb) || wf.info
                                            END) as info
                                      FROM word LEFT JOIN word_frequencies wf
                                           ON word.word_id = wf.id
                                """)
                    drop_tables(conn, 'word_frequencies')
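
            # Reset the session to non-parallel execution for the remaining work.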
            with conn.cursor() as cur:
                cur.execute('SET max_parallel_workers_per_gather TO 0')

            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_string(conn,
                            'GRANT SELECT ON tmp_word TO "{{config.DATABASE_WEBUSER}}"')
            conn.commit()

        self._create_base_indices(config, 'tmp_word')
        self._create_lookup_indices(config, 'tmp_word')
        self._move_temporary_word_table('tmp_word')

    def _cleanup_housenumbers(self) -> None:
        """ Remove unused house numbers.
        """
        with connect(self.dsn) as conn:
            if not table_exists(conn, 'search_name'):
                return

            with conn.cursor(name="hnr_counter") as cur:
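                # A named cursor is server-side in psycopg, so the potentially
                # large result set is streamed instead of fetched at once.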
                cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
                               FROM word
                               WHERE type = 'H'
                                 AND NOT EXISTS(SELECT * FROM search_name
                                                WHERE ARRAY[word.word_id] && name_vector)
                                 AND (char_length(coalesce(word, word_token)) > 6
                                      OR coalesce(word, word_token) not similar to '\\d+')
                            """)
                candidates = {token: wid for wid, token in cur}

            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT housenumber FROM placex
                               WHERE housenumber is not null
                                     AND (char_length(housenumber) > 6
                                          OR housenumber not similar to '\\d+')
                            """)
                for row in cur:
                    for hnr in row[0].split(';'):
                        candidates.pop(hnr, None)

            LOG.info("There are %s outdated housenumbers.", len(candidates))
            LOG.debug("Outdated housenumbers: %s", candidates.keys())
            if candidates:
                with conn.cursor() as cur:
                    cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
                                (list(candidates.values()), ))
                conn.commit()

    def update_word_tokens(self) -> None:
        """ Remove unused tokens.
        """
        LOG.warning("Cleaning up housenumber tokens.")
        self._cleanup_housenumbers()
        LOG.warning("Tokenizer house-keeping done.")

    def name_analyzer(self) -> 'ICUNameAnalyzer':
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    ...

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        assert self.loader is not None
        return ICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                               self.loader.make_token_analysis())

    def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
        """ Return a list of the `num` most frequent full words
            in the database.
        """
        with conn.cursor() as cur:
            cur.execute("""SELECT word, sum((info->>'count')::int) as count
                             FROM word WHERE type = 'W'
                             GROUP BY word
                             ORDER BY count DESC LIMIT %s""", (num,))
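            # Full words may be stored as 'name@analyzer'; only the name part
            # is reported here.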
            return list(s[0].split('@')[0] for s in cur)

    def _save_config(self) -> None:
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        assert self.loader is not None
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)

    def _setup_db_tables(self, config: Configuration) -> None:
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            drop_tables(conn, 'word')
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_string(conn, """
                CREATE TABLE word (
                      word_id INTEGER,
                      word_token text NOT NULL,
                      type text NOT NULL,
                      word text,
                      info jsonb
                    ) {{db.tablespace.search_data}};
                GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";

                DROP SEQUENCE IF EXISTS seq_word;
                CREATE SEQUENCE seq_word start 1;
                GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
            """)
            conn.commit()

    def _create_base_indices(self, config: Configuration, table_name: str) -> None:
        """ Create the word token index and the per-type partial indexes
            needed during import.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_string(conn,
                            """CREATE INDEX idx_{{table_name}}_word_token ON {{table_name}}
                               USING BTREE (word_token) {{db.tablespace.search_index}}""",
                            table_name=table_name)
            for name, ctype in WORD_TYPES:
                sqlp.run_string(conn,
                                """CREATE INDEX idx_{{table_name}}_{{idx_name}} ON {{table_name}}
                                   USING BTREE (word) {{db.tablespace.address_index}}
                                   WHERE type = '{{column_type}}'
                                """,
                                table_name=table_name, idx_name=name,
                                column_type=ctype)
            conn.commit()

    def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
        """ Create additional indexes used when running the API.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            # Index required for details lookup.
            sqlp.run_string(conn, """
                CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
                  ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
            """,
                            table_name=table_name)
            conn.commit()

    def _move_temporary_word_table(self, old: str) -> None:
        """ Rename all tables and indexes used by the tokenizer.
        """
        with connect(self.dsn) as conn:
            drop_tables(conn, 'word')
            with conn.cursor() as cur:
                cur.execute(f"ALTER TABLE {old} RENAME TO word")
                for idx in ('word_token', 'word_id'):
                    cur.execute(f"""ALTER INDEX idx_{old}_{idx}
                                      RENAME TO idx_word_{idx}""")
                for name, _ in WORD_TYPES:
                    cur.execute(f"""ALTER INDEX idx_{old}_{name}
                                      RENAME TO idx_word_{name}""")
            conn.commit()


class ICUNameAnalyzer(AbstractAnalyzer):
    """ The ICU analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn: str, sanitizer: PlaceSanitizer,
                 token_analysis: ICUTokenAnalysis) -> None:
        self.conn: Optional[Connection] = connect(dsn)
        self.conn.autocommit = True
        self.sanitizer = sanitizer
        self.token_analysis = token_analysis

        self._cache = _TokenCache()
    def close(self) -> None:
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None

    def _search_normalized(self, name: str) -> str:
        """ Return the search token transliteration of the given name.
        """
        return cast(str, self.token_analysis.search.transliterate(name)).strip()

    def _normalized(self, name: str) -> str:
        """ Return the normalized version of the given name with all
            non-relevant information removed.
        """
        return cast(str, self.token_analysis.normalizer.transliterate(name)).strip()

    def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
        """ Return token information for the given list of words.

            If a word starts with # it is assumed to be a full name,
            otherwise it is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not necessarily efficient.
        """
        assert self.conn is not None
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self._search_normalized(word[1:])
            else:
                partial_tokens[word] = self._search_normalized(word)
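
        # Full words are stored with type 'W', partial words with type 'w'.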
        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]

    def normalize_postcode(self, postcode: str) -> str:
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()

    def update_postcodes_from_db(self) -> None:
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@postcode')

        with self.conn.cursor() as cur:
            # First get all postcode names currently in the word table.
            cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
            word_entries = set((entry[0] for entry in cur))

            # Then compute the required postcode names from the postcode table.
            needed_entries = set()
            cur.execute("SELECT country_code, postcode FROM location_postcode")
            for cc, postcode in cur:
                info = PlaceInfo({'country_code': cc,
                                  'class': 'place', 'type': 'postcode',
                                  'address': {'postcode': postcode}})
                address = self.sanitizer.process_names(info)[1]
                for place in address:
                    if place.kind == 'postcode':
                        if analyzer is None:
                            postcode_name = place.name.strip().upper()
                            variant_base = None
                        else:
                            postcode_name = analyzer.get_canonical_id(place)
                            variant_base = place.get_attr("variant")

                        if variant_base:
                            needed_entries.add(f'{postcode_name}@{variant_base}')
                        else:
                            needed_entries.add(postcode_name)
                        break

        # Now update the word table.
        self._delete_unused_postcode_words(word_entries - needed_entries)
        self._add_missing_postcode_words(needed_entries - word_entries)

    def _delete_unused_postcode_words(self, tokens: Iterable[str]) -> None:
        assert self.conn is not None
        if tokens:
            with self.conn.cursor() as cur:
                cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
                            (list(tokens), ))
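
    # Postcodes with a variant are stored as 'canonical@variant' in the word
    # column; the helper below splits that form apart again.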
    def _add_missing_postcode_words(self, tokens: Iterable[str]) -> None:
        assert self.conn is not None
        if not tokens:
            return

        analyzer = self.token_analysis.analysis.get('@postcode')
        terms = []

        for postcode_name in tokens:
            if '@' in postcode_name:
                term, variant = postcode_name.split('@', 2)
                term = self._search_normalized(term)
                if analyzer is None:
                    variants = [term]
                else:
                    variants = analyzer.compute_variants(variant)
                    if term not in variants:
                        variants.append(term)
            else:
                variants = [self._search_normalized(postcode_name)]
            terms.append((postcode_name, variants))

        if terms:
            with self.conn.cursor() as cur:
                cur.executemany("""SELECT create_postcode_word(%s, %s)""", terms)

    def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
                               should_replace: bool) -> None:
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases
            is completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        assert self.conn is not None
        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)

    def _add_special_phrases(self, cursor: Cursor,
                             new_phrases: Set[Tuple[str, str, str, str]],
                             existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases
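
        # Insert the new phrases in bulk via COPY; phrases whose search token
        # normalises to an empty string are skipped.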
        added = 0
        with cursor.copy('COPY word(word_token, type, word, info) FROM STDIN') as copy:
            for word, cls, typ, oper in to_add:
                term = self._search_normalized(word)
                if term:
                    copy.write_row((term, 'S', word,
                                    Jsonb({'class': cls, 'type': typ,
                                           'op': oper if oper in ('in', 'near') else None})))
                    added += 1

        return added

    def _remove_special_phrases(self, cursor: Cursor,
                                new_phrases: Set[Tuple[str, str, str, str]],
                                existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.executemany(
                """ DELETE FROM word
                      WHERE type = 'S' and word = %s
                            and info->>'class' = %s and info->>'type' = %s
                            and %s = coalesce(info->>'op', '-')
                """, to_delete)

        return len(to_delete)

    def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
        """ Add default names for the given country to the search index.
        """
        # Make sure any name preprocessing for country names applies.
        info = PlaceInfo({'name': names, 'country_code': country_code,
                          'rank_address': 4, 'class': 'boundary',
                          'type': 'administrative'})
        self._add_country_full_names(country_code,
                                     self.sanitizer.process_names(info)[0],
                                     internal=True)

    def _add_country_full_names(self, country_code: str, names: Sequence[PlaceName],
                                internal: bool = False) -> None:
        """ Add names for the given country from an already sanitized
            name list.
        """
        assert self.conn is not None
        word_tokens = set()
        for name in names:
            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            # Get the existing names for the country.
            cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
                             FROM word
                             WHERE type = 'C' and word = %s""",
                        (country_code, ))
            # internal/external names
            existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()}
            for word in cur:
                existing_tokens[word[1]].add(word[0])

            # Delete names that no longer exist.
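            # When importing the internal (default) names, a name that currently
            # exists as an external entry is removed as well, so it can be
            # re-inserted below with the internal flag set.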
            gone_tokens = existing_tokens[internal] - word_tokens
            if internal:
                gone_tokens.update(existing_tokens[False] & word_tokens)
            if gone_tokens:
                cur.execute("""DELETE FROM word
                               USING unnest(%s::text[]) as token
                               WHERE type = 'C' and word = %s
                                     and word_token = token""",
                            (list(gone_tokens), country_code))

            # Only add those names that are not yet in the list.
            new_tokens = word_tokens - existing_tokens[True]
            if not internal:
                new_tokens -= existing_tokens[False]
            if new_tokens:
                if internal:
                    sql = """INSERT INTO word (word_token, type, word, info)
                               (SELECT token, 'C', %s, '{"internal": "yes"}'
                                  FROM unnest(%s::text[]) as token)
                           """
                else:
                    sql = """INSERT INTO word (word_token, type, word)
                               (SELECT token, 'C', %s
                                  FROM unnest(%s::text[]) as token)
                           """
                cur.execute(sql, (country_code, list(new_tokens)))

    def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
        """ Determine tokenizer information about the given place.

            Returns a JSON-serializable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo()

        names, address = self.sanitizer.process_names(place)

        if names:
            token_info.set_names(*self._compute_name_tokens(names))

            if place.is_country():
                assert place.country_code is not None
                self._add_country_full_names(place.country_code, names)

        if address:
            self._process_place_address(token_info, address)

        return token_info.to_dict()

    def _process_place_address(self, token_info: '_TokenInfo',
                               address: Sequence[PlaceName]) -> None:
        for item in address:
            if item.kind == 'postcode':
                token_info.set_postcode(self._add_postcode(item))
            elif item.kind == 'housenumber':
                token_info.add_housenumber(*self._compute_housenumber_token(item))
            elif item.kind == 'street':
                token_info.add_street(self._retrieve_full_tokens(item.name))
            elif item.kind == 'place':
                if not item.suffix:
                    token_info.add_place(itertools.chain(*self._compute_name_tokens([item])))
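            # Remaining address parts become generic address terms; internal
            # keys (leading '_'), suffixed entries and the kinds 'country',
            # 'full' and 'inclusion' are skipped.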
            elif not item.kind.startswith('_') and not item.suffix and \
                    item.kind not in ('country', 'full', 'inclusion'):
                token_info.add_address_term(item.kind,
                                            itertools.chain(*self._compute_name_tokens([item])))

    def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
        """ Normalize the housenumber and return the word token and the
            canonical form.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@housenumber')
        result: Tuple[Optional[int], Optional[str]] = (None, None)

        if analyzer is None:
            # When no custom analyzer is set, simply normalize and transliterate.
            norm_name = self._search_normalized(hnr.name)
            if norm_name:
                result = self._cache.housenumbers.get(norm_name, result)
                if result[0] is None:
                    hid = execute_scalar(self.conn, "SELECT getorcreate_hnr_id(%s)",
                                         (norm_name, ))
                    result = hid, norm_name
                    self._cache.housenumbers[norm_name] = result
        else:
            # Otherwise use the analyzer to determine the canonical name.
            # Per convention we use the first variant as the 'lookup name', the
            # name that gets saved in the housenumber field of the place.
            word_id = analyzer.get_canonical_id(hnr)
            if word_id:
                result = self._cache.housenumbers.get(word_id, result)
                if result[0] is None:
                    variants = analyzer.compute_variants(word_id)
                    if variants:
                        hid = execute_scalar(self.conn, "SELECT create_analyzed_hnr_id(%s, %s)",
                                             (word_id, list(variants)))
                        result = hid, variants[0]
                        self._cache.housenumbers[word_id] = result

        return result

    def _retrieve_full_tokens(self, name: str) -> List[int]:
        """ Get the full name token for the given name, if it exists.
            The name is only retrieved for the standard analyser.
        """
        assert self.conn is not None
        norm_name = self._search_normalized(name)

        # Return the cached result if possible.
        if norm_name in self._cache.fulls:
            return self._cache.fulls[norm_name]

        with self.conn.cursor() as cur:
            cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
                        (norm_name, ))
            full = [row[0] for row in cur]

        self._cache.fulls[norm_name] = full

        return full

    def _compute_name_tokens(self, names: Sequence[PlaceName]) -> Tuple[Set[int], Set[int]]:
        """ Compute the full name and partial name tokens for the given
            list of names.
        """
        assert self.conn is not None
        full_tokens: Set[int] = set()
        partial_tokens: Set[int] = set()

        for name in names:
            analyzer_id = name.get_attr('analyzer')
            analyzer = self.token_analysis.get_analyzer(analyzer_id)
            word_id = analyzer.get_canonical_id(name)
            if analyzer_id is None:
                token_id = word_id
            else:
                token_id = f'{word_id}@{analyzer_id}'
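            # The cache key carries the analyzer suffix so that names processed
            # by different analyzers cannot collide.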
            full, part = self._cache.names.get(token_id, (None, None))
            if full is None:
                variants = analyzer.compute_variants(word_id)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
                                (token_id, variants))
                    full, part = cast(Tuple[int, List[int]], cur.fetchone())

                self._cache.names[token_id] = (full, part)

            assert part is not None

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens

    def _add_postcode(self, item: PlaceName) -> Optional[str]:
        """ Make sure the normalized postcode is present in the word table.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@postcode')

        if analyzer is None:
            postcode_name = item.name.strip().upper()
            variant_base = None
        else:
            postcode_name = analyzer.get_canonical_id(item)
            variant_base = item.get_attr("variant")

        if variant_base:
            postcode = f'{postcode_name}@{variant_base}'
        else:
            postcode = postcode_name

        if postcode not in self._cache.postcodes:
            term = self._search_normalized(postcode_name)
            if not term:
                return None

            variants = {term}
            if analyzer is not None and variant_base:
                variants.update(analyzer.compute_variants(variant_base))

            with self.conn.cursor() as cur:
                cur.execute("SELECT create_postcode_word(%s, %s)",
                            (postcode, list(variants)))
            self._cache.postcodes.add(postcode)

        return postcode
828 """ Collect token information to be sent back to the database.
830 def __init__(self) -> None:
831 self.names: Optional[str] = None
832 self.housenumbers: Set[str] = set()
833 self.housenumber_tokens: Set[int] = set()
834 self.street_tokens: Optional[Set[int]] = None
835 self.place_tokens: Set[int] = set()
836 self.address_tokens: Dict[str, str] = {}
837 self.postcode: Optional[str] = None
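
    # Token collections are serialised as PostgreSQL array literals, e.g. '{1,2,3}'.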
    def _mk_array(self, tokens: Iterable[Any]) -> str:
        return f"{{{','.join((str(s) for s in tokens))}}}"

    def to_dict(self) -> Dict[str, Any]:
        """ Return the token information in database importable format.
        """
        out: Dict[str, Any] = {}

        if self.names:
            out['names'] = self.names

        if self.housenumbers:
            out['hnr'] = ';'.join(self.housenumbers)
            out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)

        if self.street_tokens is not None:
            out['street'] = self._mk_array(self.street_tokens)

        if self.place_tokens:
            out['place'] = self._mk_array(self.place_tokens)

        if self.address_tokens:
            out['addr'] = self.address_tokens

        if self.postcode:
            out['postcode'] = self.postcode

        return out

    def set_names(self, fulls: Iterable[int], partials: Iterable[int]) -> None:
        """ Add token information for the normalised names.
        """
        self.names = self._mk_array(itertools.chain(fulls, partials))

    def add_housenumber(self, token: Optional[int], hnr: Optional[str]) -> None:
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        if token:
            assert hnr is not None
            self.housenumbers.add(hnr)
            self.housenumber_tokens.add(token)

    def add_street(self, tokens: Iterable[int]) -> None:
        """ Add addr:street match terms.
        """
        if self.street_tokens is None:
            self.street_tokens = set()
        self.street_tokens.update(tokens)

    def add_place(self, tokens: Iterable[int]) -> None:
        """ Add addr:place search and match terms.
        """
        self.place_tokens.update(tokens)

    def add_address_term(self, key: str, partials: Iterable[int]) -> None:
        """ Add additional address terms.
        """
        array = self._mk_array(partials)
        if len(array) > 2:  # only store non-empty arrays
            self.address_tokens[key] = array

    def set_postcode(self, postcode: Optional[str]) -> None:
        """ Set the postcode to the given one.
        """
        self.postcode = postcode
915 """ Cache for token information to avoid repeated database queries.
917 This cache is not thread-safe and needs to be instantiated per
920 def __init__(self) -> None:
921 self.names: Dict[str, Tuple[int, List[int]]] = {}
922 self.partials: Dict[str, int] = {}
923 self.fulls: Dict[str, List[int]] = {}
924 self.postcodes: Set[str] = set()
925 self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {}