# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \
                   Dict, Set, Iterable
import itertools
import logging
from pathlib import Path

from psycopg.types.json import Jsonb
from psycopg import sql as pysql

from ..db.connection import connect, Connection, Cursor, \
                            drop_tables, table_exists, execute_scalar
from ..config import Configuration
from ..db.sql_preprocessor import SQLPreprocessor
from ..data.place_info import PlaceInfo
from ..data.place_name import PlaceName
from .icu_rule_loader import ICURuleLoader
from .place_sanitizer import PlaceSanitizer
from .icu_token_analysis import ICUTokenAnalysis
from .base import AbstractAnalyzer, AbstractTokenizer

DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()
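
# Mapping of index name to the value of the 'type' column in the word table.
# One partial index is created per word type (see _create_base_indices()).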
WORD_TYPES = (('country_names', 'C'),
              ('postcodes', 'P'),
              ('full_word', 'W'),
              ('housenumbers', 'H'))


def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
    """ Create a new instance of the tokenizer provided by this module.
    """
    return ICUTokenizer(dsn, data_dir)


class ICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn: str, data_dir: Path) -> None:
        self.dsn = dsn
        self.data_dir = data_dir
        self.loader: Optional[ICURuleLoader] = None

    def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        self.loader = ICURuleLoader(config)

        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._setup_db_tables(config)
            self._create_base_indices(config, 'word')

    def init_from_project(self, config: Configuration) -> None:
        """ Initialise the tokenizer from the project directory.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)

    def finalize_import(self, config: Configuration) -> None:
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        self._create_lookup_indices(config, 'word')

    def update_sql_functions(self, config: Configuration) -> None:
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')

    def check_database(self, config: Configuration) -> None:
        """ Check that the tokenizer is set up correctly.
        """
        # Will throw an error if there is an issue.
        self.init_from_project(config)

    def update_statistics(self, config: Configuration, threads: int = 2) -> None:
        """ Recompute frequencies for all name words.
        """
        with connect(self.dsn) as conn:
            if not table_exists(conn, 'search_name'):
                return

            with conn.cursor() as cur:
                cur.execute('ANALYSE search_name')
                cur.execute(pysql.SQL('SET max_parallel_workers_per_gather TO {}')
                            .format(pysql.Literal(min(threads, 6),)))

                LOG.info('Computing word frequencies')
                drop_tables(conn, 'word_frequencies')
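                # Sum up how often each token appears in name_vector (name
                # matches) and in nameaddress_vector (address matches) of
                # search_name and merge both counts into one JSONB object.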
                cur.execute("""
                  CREATE TEMP TABLE word_frequencies AS
                  WITH word_freq AS MATERIALIZED (
                           SELECT unnest(name_vector) as id, count(*)
                                 FROM search_name GROUP BY id),
                       addr_freq AS MATERIALIZED (
                           SELECT unnest(nameaddress_vector) as id, count(*)
                                 FROM search_name GROUP BY id)
                  SELECT coalesce(a.id, w.id) as id,
                         (CASE WHEN w.count is null THEN '{}'::JSONB
                              ELSE jsonb_build_object('count', w.count) END
                          ||
                          CASE WHEN a.count is null THEN '{}'::JSONB
                              ELSE jsonb_build_object('addr_count', a.count) END) as info
                  FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id;
                  """)
                cur.execute('CREATE UNIQUE INDEX ON word_frequencies(id) INCLUDE(info)')
                cur.execute('ANALYSE word_frequencies')
                LOG.info('Update word table with recomputed frequencies')
                drop_tables(conn, 'tmp_word')
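                # Rebuild the word table as tmp_word with the new counts merged
                # into the info column. The new table is indexed and then
                # swapped in for the live word table further below.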
                cur.execute("""CREATE TABLE tmp_word AS
                                SELECT word_id, word_token, type, word,
                                       (CASE WHEN wf.info is null THEN word.info
                                             ELSE coalesce(word.info, '{}'::jsonb) || wf.info
                                        END) as info
                                FROM word LEFT JOIN word_frequencies wf
                                     ON word.word_id = wf.id
                            """)
                drop_tables(conn, 'word_frequencies')

            with conn.cursor() as cur:
                cur.execute('SET max_parallel_workers_per_gather TO 0')

            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_string(conn,
                            'GRANT SELECT ON tmp_word TO "{{config.DATABASE_WEBUSER}}"')

        self._create_base_indices(config, 'tmp_word')
        self._create_lookup_indices(config, 'tmp_word')
        self._move_temporary_word_table('tmp_word')

    def _cleanup_housenumbers(self) -> None:
        """ Remove unused house numbers.
        """
        with connect(self.dsn) as conn:
            if not table_exists(conn, 'search_name'):
                return
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
                               FROM word
                               WHERE type = 'H'
                                 AND NOT EXISTS(SELECT * FROM search_name
                                                WHERE ARRAY[word.word_id] && name_vector)
                                 AND (char_length(coalesce(word, word_token)) > 6
                                      OR coalesce(word, word_token) not similar to '\\d+')
                            """)
                candidates = {token: wid for wid, token in cur}
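            # Remove from the candidates all housenumbers that still occur in placex.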
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT housenumber FROM placex
                               WHERE housenumber is not null
                                     AND (char_length(housenumber) > 6
                                          OR housenumber not similar to '\\d+')
                            """)
                for row in cur:
                    for hnr in row[0].split(';'):
                        candidates.pop(hnr, None)
            LOG.info("There are %s outdated housenumbers.", len(candidates))
            LOG.debug("Outdated housenumbers: %s", candidates.keys())
            if candidates:
                with conn.cursor() as cur:
                    cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
                                (list(candidates.values()), ))
                conn.commit()

    def update_word_tokens(self) -> None:
        """ Remove unused tokens.
        """
        LOG.warning("Cleaning up housenumber tokens.")
        self._cleanup_housenumbers()
        LOG.warning("Tokenizer house-keeping done.")

    def name_analyzer(self) -> 'ICUNameAnalyzer':
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    analyzer.add_country_names(...)

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        assert self.loader is not None
        return ICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                               self.loader.make_token_analysis())

    def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
        """ Return a list of the `num` most frequent full words
            in the database.
        """
        with conn.cursor() as cur:
            cur.execute("""SELECT word, sum((info->>'count')::int) as count
                             FROM word WHERE type = 'W'
                             GROUP BY word
                             ORDER BY count DESC LIMIT %s""", (num,))
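            # The word column may carry an '@<analyzer>' suffix for names
            # handled by a non-default analyzer; strip it for the result.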
            return list(s[0].split('@')[0] for s in cur)

    def _save_config(self) -> None:
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        assert self.loader is not None
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)

    def _setup_db_tables(self, config: Configuration) -> None:
        """ Set up the word table and the sequence used for word ids.
        """
        with connect(self.dsn) as conn:
            drop_tables(conn, 'word')
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_string(conn, """
                CREATE TABLE word (
                      word_id INTEGER,
                      word_token text NOT NULL,
                      type text NOT NULL,
                      word text,
                      info jsonb
                    ) {{db.tablespace.search_data}};
                GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";

                DROP SEQUENCE IF EXISTS seq_word;
                CREATE SEQUENCE seq_word start 1;
                GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
                """)
            conn.commit()

    def _create_base_indices(self, config: Configuration, table_name: str) -> None:
        """ Create the basic lookup indices for the given word table.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_string(conn,
                            """CREATE INDEX idx_{{table_name}}_word_token ON {{table_name}}
                               USING BTREE (word_token) {{db.tablespace.search_index}}""",
                            table_name=table_name)
            for name, ctype in WORD_TYPES:
                sqlp.run_string(conn,
                                """CREATE INDEX idx_{{table_name}}_{{idx_name}} ON {{table_name}}
                                   USING BTREE (word) {{db.tablespace.address_index}}
                                   WHERE type = '{{column_type}}'
                                """,
                                table_name=table_name, idx_name=name,
                                column_type=ctype)
            conn.commit()

    def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
        """ Create additional indexes used when running the API.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            # Index required for details lookup.
            sqlp.run_string(conn, """
                CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
                  ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
                """,
                            table_name=table_name)
            conn.commit()

    def _move_temporary_word_table(self, old: str) -> None:
        """ Rename all tables and indexes used by the tokenizer.
        """
        with connect(self.dsn) as conn:
            drop_tables(conn, 'word')
            with conn.cursor() as cur:
                cur.execute(f"ALTER TABLE {old} RENAME TO word")
                for idx in ('word_token', 'word_id'):
                    cur.execute(f"""ALTER INDEX idx_{old}_{idx}
                                    RENAME TO idx_word_{idx}""")
                for name, _ in WORD_TYPES:
                    cur.execute(f"""ALTER INDEX idx_{old}_{name}
                                    RENAME TO idx_word_{name}""")
            conn.commit()


class ICUNameAnalyzer(AbstractAnalyzer):
    """ The ICU analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn: str, sanitizer: PlaceSanitizer,
                 token_analysis: ICUTokenAnalysis) -> None:
        self.conn: Optional[Connection] = connect(dsn)
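        # The connection runs in autocommit mode, so every token update is
        # committed immediately and no explicit transaction handling is needed.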
        self.conn.autocommit = True
        self.sanitizer = sanitizer
        self.token_analysis = token_analysis

        self._cache = _TokenCache()

    def close(self) -> None:
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None

    def _search_normalized(self, name: str) -> str:
        """ Return the search token transliteration of the given name.
        """
        return cast(str, self.token_analysis.search.transliterate(name)).strip()

    def _normalized(self, name: str) -> str:
        """ Return the normalized version of the given name with all
            non-relevant information removed.
        """
        return cast(str, self.token_analysis.normalizer.transliterate(name)).strip()

    def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
        """ Return token information for the given list of words.

            If a word starts with # it is assumed to be a full name,
            otherwise it is treated as a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not necessarily efficient.
        """
        assert self.conn is not None
        full_tokens: Dict[str, str] = {}
        partial_tokens: Dict[str, str] = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self._search_normalized(word[1:])
            else:
                partial_tokens[word] = self._search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                             FROM word WHERE word_token = ANY(%s) and type = 'W'
                         """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                             FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]

    def normalize_postcode(self, postcode: str) -> str:
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()

    def update_postcodes_from_db(self) -> None:
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@postcode')

        with self.conn.cursor() as cur:
            # First get all postcode names currently in the word table.
            cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
            word_entries = set((entry[0] for entry in cur))

            # Then compute the required postcode names from the postcode table.
            needed_entries: Set[str] = set()
            cur.execute("SELECT country_code, postcode FROM location_postcode")
            for cc, postcode in cur:
                info = PlaceInfo({'country_code': cc,
                                  'class': 'place', 'type': 'postcode',
                                  'address': {'postcode': postcode}})
                address = self.sanitizer.process_names(info)[1]
                for place in address:
                    if place.kind == 'postcode':
                        if analyzer is None:
                            postcode_name = place.name.strip().upper()
                            variant_base = None
                        else:
                            postcode_name = analyzer.get_canonical_id(place)
                            variant_base = place.get_attr("variant")

                        if variant_base:
                            needed_entries.add(f'{postcode_name}@{variant_base}')
                        else:
                            needed_entries.add(postcode_name)
                        break

        # Now update the word table.
        self._delete_unused_postcode_words(word_entries - needed_entries)
        self._add_missing_postcode_words(needed_entries - word_entries)

    def _delete_unused_postcode_words(self, tokens: Iterable[str]) -> None:
        assert self.conn is not None
        if tokens:
            with self.conn.cursor() as cur:
                cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
                            (list(tokens), ))

    def _add_missing_postcode_words(self, tokens: Iterable[str]) -> None:
        assert self.conn is not None
        if not tokens:
            return

        analyzer = self.token_analysis.analysis.get('@postcode')
        terms: List[Tuple[str, List[str]]] = []

        for postcode_name in tokens:
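            # Entries of the form '<canonical>@<variant base>' carry the
            # original spelling from which additional variants are computed.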
            if '@' in postcode_name:
                term, variant = postcode_name.split('@', 2)
                term = self._search_normalized(term)
                if analyzer is None:
                    variants = [term]
                else:
                    variants = analyzer.compute_variants(variant)
                    if term not in variants:
                        variants.append(term)
            else:
                variants = [self._search_normalized(postcode_name)]
            terms.append((postcode_name, variants))

        if terms:
            with self.conn.cursor() as cur:
                cur.executemany("""SELECT create_postcode_word(%s, %s)""", terms)

    def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
                               should_replace: bool) -> None:
        """ Replace the search index for special phrases with the new phrases.

            If `should_replace` is True, then the previous set of phrases will
            be completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        assert self.conn is not None
        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)

    def _add_special_phrases(self, cursor: Cursor,
                             new_phrases: Set[Tuple[str, str, str, str]],
                             existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
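        # COPY the new phrases in bulk instead of issuing one INSERT per phrase.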
        with cursor.copy('COPY word(word_token, type, word, info) FROM STDIN') as copy:
            for word, cls, typ, oper in to_add:
                term = self._search_normalized(word)
                if term:
                    copy.write_row((term, 'S', word,
                                    Jsonb({'class': cls, 'type': typ,
                                           'op': oper if oper in ('in', 'near') else None})))
                    added += 1

        return added

    def _remove_special_phrases(self, cursor: Cursor,
                                new_phrases: Set[Tuple[str, str, str, str]],
                                existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.executemany(
                """ DELETE FROM word
                      WHERE type = 'S' and word = %s
                            and info->>'class' = %s and info->>'type' = %s
                            and %s = coalesce(info->>'op', '-')
                """, to_delete)

        return len(to_delete)

    def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
        """ Add default names for the given country to the search index.
        """
        # Make sure any name preprocessing for country names applies.
        info = PlaceInfo({'name': names, 'country_code': country_code,
                          'rank_address': 4, 'class': 'boundary',
                          'type': 'administrative'})
        self._add_country_full_names(country_code,
                                     self.sanitizer.process_names(info)[0],
                                     internal=True)

    def _add_country_full_names(self, country_code: str, names: Sequence[PlaceName],
                                internal: bool = False) -> None:
        """ Add names for the given country from an already sanitized
            name list.
        """
        assert self.conn is not None
        word_tokens: Set[str] = set()
        for name in names:
            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
                             FROM word
                             WHERE type = 'C' and word = %s""",
                        (country_code, ))
            # internal/external names
            existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()}
            for word in cur:
                existing_tokens[word[1]].add(word[0])

            # Delete names that no longer exist.
            gone_tokens = existing_tokens[internal] - word_tokens
            if internal:
                gone_tokens.update(existing_tokens[False] & word_tokens)
            if gone_tokens:
                cur.execute("""DELETE FROM word
                               USING unnest(%s::text[]) as token
                               WHERE type = 'C' and word = %s
                                     and word_token = token""",
                            (list(gone_tokens), country_code))

            # Only add those names that are not yet in the list.
            new_tokens = word_tokens - existing_tokens[True]
            if not internal:
                new_tokens -= existing_tokens[False]
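            # Default names added from the configuration are flagged as
            # 'internal'; names taken from OSM data are inserted without the flag.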
            if new_tokens:
                if internal:
                    sql = """INSERT INTO word (word_token, type, word, info)
                               (SELECT token, 'C', %s, '{"internal": "yes"}'
                                  FROM unnest(%s::text[]) as token)
                          """
                else:
                    sql = """INSERT INTO word (word_token, type, word)
                               (SELECT token, 'C', %s
                                  FROM unnest(%s::text[]) as token)
                          """
                cur.execute(sql, (country_code, list(new_tokens)))

    def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
        """ Determine tokenizer information about the given place.

            Returns a JSON-serializable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo()

        names, address = self.sanitizer.process_names(place)

        if names:
            token_info.set_names(*self._compute_name_tokens(names))

            if place.is_country():
                assert place.country_code is not None
                self._add_country_full_names(place.country_code, names)

        if address:
            self._process_place_address(token_info, address)

        return token_info.to_dict()

    def _process_place_address(self, token_info: '_TokenInfo',
                               address: Sequence[PlaceName]) -> None:
        for item in address:
            if item.kind == 'postcode':
                token_info.set_postcode(self._add_postcode(item))
            elif item.kind == 'housenumber':
                token_info.add_housenumber(*self._compute_housenumber_token(item))
            elif item.kind == 'street':
                token_info.add_street(self._retrieve_full_tokens(item.name))
            elif item.kind == 'place':
                if not item.suffix:
                    token_info.add_place(itertools.chain(*self._compute_name_tokens([item])))
            elif (not item.kind.startswith('_') and not item.suffix and
                  item.kind not in ('country', 'full', 'inclusion')):
                token_info.add_address_term(item.kind,
                                            itertools.chain(*self._compute_name_tokens([item])))

    def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
        """ Normalize the housenumber and return the word token and the
            canonical form.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@housenumber')
        result: Tuple[Optional[int], Optional[str]] = (None, None)

        if analyzer is None:
            # When no custom analyzer is set, simply normalize and transliterate
            norm_name = self._search_normalized(hnr.name)
            if norm_name:
                result = self._cache.housenumbers.get(norm_name, result)
                if result[0] is None:
                    hid = execute_scalar(self.conn, "SELECT getorcreate_hnr_id(%s)", (norm_name, ))
                    result = hid, norm_name
                    self._cache.housenumbers[norm_name] = result
        else:
            # Otherwise use the analyzer to determine the canonical name.
            # Per convention we use the first variant as the 'lookup name', the
            # name that gets saved in the housenumber field of the place.
            word_id = analyzer.get_canonical_id(hnr)
            if word_id:
                result = self._cache.housenumbers.get(word_id, result)
                if result[0] is None:
                    variants = analyzer.compute_variants(word_id)
                    if variants:
                        hid = execute_scalar(self.conn, "SELECT create_analyzed_hnr_id(%s, %s)",
                                             (word_id, list(variants)))
                        result = hid, variants[0]
                        self._cache.housenumbers[word_id] = result

        return result

    def _retrieve_full_tokens(self, name: str) -> List[int]:
        """ Get the full name token for the given name, if it exists.
            The name is only retrieved for the standard analyser.
        """
        assert self.conn is not None
        norm_name = self._search_normalized(name)

        # return cached if possible
        if norm_name in self._cache.fulls:
            return self._cache.fulls[norm_name]

        with self.conn.cursor() as cur:
            cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
                        (norm_name, ))
            full = [row[0] for row in cur]

        self._cache.fulls[norm_name] = full

        return full

    def _compute_name_tokens(self, names: Sequence[PlaceName]) -> Tuple[Set[int], Set[int]]:
        """ Compute the full name and partial name tokens for the given
            list of names.
        """
        assert self.conn is not None
        full_tokens: Set[int] = set()
        partial_tokens: Set[int] = set()

        for name in names:
            analyzer_id = name.get_attr('analyzer')
            analyzer = self.token_analysis.get_analyzer(analyzer_id)
            word_id = analyzer.get_canonical_id(name)
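            # The cache is keyed on the canonical id; names processed by a
            # non-default analyzer get the analyzer name appended.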
            if analyzer_id is None:
                token_id = word_id
            else:
                token_id = f'{word_id}@{analyzer_id}'

            full, part = self._cache.names.get(token_id, (None, None))
            if full is None:
                variants = analyzer.compute_variants(word_id)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
                                (token_id, variants))
                    full, part = cast(Tuple[int, List[int]], cur.fetchone())

                self._cache.names[token_id] = (full, part)

            assert part is not None

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens

    def _add_postcode(self, item: PlaceName) -> Optional[str]:
        """ Make sure the normalized postcode is present in the word table.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@postcode')

        if analyzer is None:
            postcode_name = item.name.strip().upper()
            variant_base = None
        else:
            postcode_name = analyzer.get_canonical_id(item)
            variant_base = item.get_attr("variant")

        if variant_base:
            postcode = f'{postcode_name}@{variant_base}'
        else:
            postcode = postcode_name

        if postcode not in self._cache.postcodes:
            term = self._search_normalized(postcode_name)
            if not term:
                return None

            variants = {term}
            if analyzer is not None and variant_base:
                variants.update(analyzer.compute_variants(variant_base))

            with self.conn.cursor() as cur:
                cur.execute("SELECT create_postcode_word(%s, %s)",
                            (postcode, list(variants)))
            self._cache.postcodes.add(postcode)

        return postcode


class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self) -> None:
        self.names: Optional[str] = None
        self.housenumbers: Set[str] = set()
        self.housenumber_tokens: Set[int] = set()
        self.street_tokens: Optional[Set[int]] = None
        self.place_tokens: Set[int] = set()
        self.address_tokens: Dict[str, str] = {}
        self.postcode: Optional[str] = None

    def _mk_array(self, tokens: Iterable[Any]) -> str:
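        # Format the tokens as a PostgreSQL array literal, e.g. '{1,2,3}'.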
        return f"{{{','.join((str(s) for s in tokens))}}}"

    def to_dict(self) -> Dict[str, Any]:
        """ Return the token information in database importable format.
        """
        out: Dict[str, Any] = {}

        if self.names:
            out['names'] = self.names

        if self.housenumbers:
            out['hnr'] = ';'.join(self.housenumbers)
            out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)

        if self.street_tokens is not None:
            out['street'] = self._mk_array(self.street_tokens)

        if self.place_tokens:
            out['place'] = self._mk_array(self.place_tokens)

        if self.address_tokens:
            out['addr'] = self.address_tokens

        if self.postcode:
            out['postcode'] = self.postcode

        return out

    def set_names(self, fulls: Iterable[int], partials: Iterable[int]) -> None:
        """ Add token information for the normalised names.
        """
        self.names = self._mk_array(itertools.chain(fulls, partials))

    def add_housenumber(self, token: Optional[int], hnr: Optional[str]) -> None:
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        if token:
            assert hnr is not None
            self.housenumbers.add(hnr)
            self.housenumber_tokens.add(token)

    def add_street(self, tokens: Iterable[int]) -> None:
        """ Add addr:street match terms.
        """
        if self.street_tokens is None:
            self.street_tokens = set()
        self.street_tokens.update(tokens)

    def add_place(self, tokens: Iterable[int]) -> None:
        """ Add addr:place search and match terms.
        """
        self.place_tokens.update(tokens)

    def add_address_term(self, key: str, partials: Iterable[int]) -> None:
        """ Add additional address terms.
        """
        array = self._mk_array(partials)
        if len(array) > 2:
            self.address_tokens[key] = array

    def set_postcode(self, postcode: Optional[str]) -> None:
        """ Set the postcode to the given one.
        """
        self.postcode = postcode


class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self) -> None:
        self.names: Dict[str, Tuple[int, List[int]]] = {}
        self.partials: Dict[str, int] = {}
        self.fulls: Dict[str, List[int]] = {}
        self.postcodes: Set[str] = set()
        self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {}