# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""

from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \
                   Dict, Set, Iterable
import itertools
import logging
from pathlib import Path

from psycopg.types.json import Jsonb
from psycopg import sql as pysql

from ..db.connection import connect, Connection, Cursor, \
                            drop_tables, table_exists, execute_scalar
from ..config import Configuration
from ..db.sql_preprocessor import SQLPreprocessor
from ..data.place_info import PlaceInfo
from ..data.place_name import PlaceName
from .icu_rule_loader import ICURuleLoader
from .place_sanitizer import PlaceSanitizer
from .icu_token_analysis import ICUTokenAnalysis
from .base import AbstractAnalyzer, AbstractTokenizer

DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()

WORD_TYPES = (('country_names', 'C'),
              ('postcodes', 'P'),
              ('full_word', 'W'),
              ('housenumbers', 'H'))
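
# Note: the word table distinguishes entries by a one-letter 'type' column.
# The codes used in this module are 'C' country names, 'P' postcodes,
# 'W' full words, 'w' partial words, 'H' housenumbers and 'S' special phrases.
# WORD_TYPES above only lists the types that get their own partial index on
# the word column (see _create_base_indices() below).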


def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
    """ Create a new instance of the tokenizer provided by this module.
    """
    return ICUTokenizer(dsn, data_dir)


class ICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn: str, data_dir: Path) -> None:
        self.dsn = dsn
        self.data_dir = data_dir
        self.loader: Optional[ICURuleLoader] = None

    def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        self.loader = ICURuleLoader(config)

        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._setup_db_tables(config)
            self._create_base_indices(config, 'word')

    def init_from_project(self, config: Configuration) -> None:
        """ Initialise the tokenizer from the project directory.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)

    def finalize_import(self, config: Configuration) -> None:
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        self._create_lookup_indices(config, 'word')

    def update_sql_functions(self, config: Configuration) -> None:
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')

    def check_database(self, config: Configuration) -> None:
        """ Check that the tokenizer is set up correctly.
        """
        # Will throw an error if there is an issue.
        self.init_from_project(config)

    def update_statistics(self, config: Configuration, threads: int = 2) -> None:
        """ Recompute frequencies for all name words.
        """
        with connect(self.dsn) as conn:
            if not table_exists(conn, 'search_name'):
                return

            with conn.cursor() as cur:
                cur.execute('ANALYSE search_name')
                if threads > 1:
                    cur.execute(pysql.SQL('SET max_parallel_workers_per_gather TO {}')
                                .format(pysql.Literal(min(threads, 6),)))

                LOG.info('Computing word frequencies')
                drop_tables(conn, 'word_frequencies')
                cur.execute("""
                  CREATE TEMP TABLE word_frequencies AS
                  WITH word_freq AS MATERIALIZED (
                           SELECT unnest(name_vector) as id, count(*)
                                 FROM search_name GROUP BY id),
                       addr_freq AS MATERIALIZED (
                           SELECT unnest(nameaddress_vector) as id, count(*)
                                 FROM search_name GROUP BY id)
                  SELECT coalesce(a.id, w.id) as id,
                         (CASE WHEN w.count is null THEN '{}'::JSONB
                               ELSE jsonb_build_object('count', w.count) END
                          ||
                          CASE WHEN a.count is null THEN '{}'::JSONB
                               ELSE jsonb_build_object('addr_count', a.count) END) as info
                  FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id;
                  """)
                cur.execute('CREATE UNIQUE INDEX ON word_frequencies(id) INCLUDE(info)')
                cur.execute('ANALYSE word_frequencies')
                LOG.info('Update word table with recomputed frequencies')
                drop_tables(conn, 'tmp_word')
                cur.execute("""CREATE TABLE tmp_word AS
                                SELECT word_id, word_token, type, word,
                                       (CASE WHEN wf.info is null THEN word.info
                                             ELSE coalesce(word.info, '{}'::jsonb) || wf.info
                                        END) as info
                                FROM word LEFT JOIN word_frequencies wf
                                     ON word.word_id = wf.id
                            """)
                drop_tables(conn, 'word_frequencies')

            with conn.cursor() as cur:
                cur.execute('SET max_parallel_workers_per_gather TO 0')

            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_string(conn,
                            'GRANT SELECT ON tmp_word TO "{{config.DATABASE_WEBUSER}}"')

        self._create_base_indices(config, 'tmp_word')
        self._create_lookup_indices(config, 'tmp_word')
        self._move_temporary_word_table('tmp_word')
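
    # The frequency update builds a completely new word table (tmp_word),
    # grants and indexes it, and only then swaps it in via
    # _move_temporary_word_table() -- presumably so the old word table stays
    # usable until the replacement is fully built.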

    def _cleanup_housenumbers(self) -> None:
        """ Remove unused house numbers.
        """
        with connect(self.dsn) as conn:
            if not table_exists(conn, 'search_name'):
                return
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
                               FROM word
                               WHERE type = 'H'
                                 AND NOT EXISTS(SELECT * FROM search_name
                                                WHERE ARRAY[word.word_id] && name_vector)
                                 AND (char_length(coalesce(word, word_token)) > 6
                                      OR coalesce(word, word_token) not similar to '\\d+')
                            """)
                candidates = {token: wid for wid, token in cur}
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT housenumber FROM placex
                               WHERE housenumber is not null
                                     AND (char_length(housenumber) > 6
                                          OR housenumber not similar to '\\d+')
                            """)
                for row in cur:
                    for hnr in row[0].split(';'):
                        candidates.pop(hnr, None)
            LOG.info("There are %s outdated housenumbers.", len(candidates))
            LOG.debug("Outdated housenumbers: %s", candidates.keys())
            if candidates:
                with conn.cursor() as cur:
                    cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
                                (list(candidates.values()), ))

    def update_word_tokens(self) -> None:
        """ Remove unused tokens.
        """
        LOG.warning("Cleaning up housenumber tokens.")
        self._cleanup_housenumbers()
        LOG.warning("Tokenizer house-keeping done.")

    def name_analyzer(self) -> 'ICUNameAnalyzer':
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

            ::

                with tokenizer.name_analyzer() as analyzer:
                    analyzer.process_place(place)

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        assert self.loader is not None
        return ICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                               self.loader.make_token_analysis())

    def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
        """ Return a list of the `num` most frequent full words
            in the database.
        """
        with conn.cursor() as cur:
            cur.execute("""SELECT word, sum((info->>'count')::int) as count
                             FROM word WHERE type = 'W'
                             GROUP BY word
                             ORDER BY count DESC LIMIT %s""", (num,))
            return list(s[0].split('@')[0] for s in cur)
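
    # Note on the split('@') above: word entries created by a named analyzer
    # are stored as '<name>@<analyzer>' (see _compute_name_tokens() below),
    # so the suffix is stripped before the plain word is returned.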

    def _save_config(self) -> None:
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        assert self.loader is not None
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)

    def _setup_db_tables(self, config: Configuration) -> None:
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            drop_tables(conn, 'word')
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_string(conn, """
                CREATE TABLE word (
                      word_id INTEGER,
                      word_token text NOT NULL,
                      type text NOT NULL,
                      word text,
                      info jsonb
                    ) {{db.tablespace.search_data}};
                GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";

                DROP SEQUENCE IF EXISTS seq_word;
                CREATE SEQUENCE seq_word start 1;
                GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
            """)
            conn.commit()

    def _create_base_indices(self, config: Configuration, table_name: str) -> None:
        """ Create the indices needed for updating the data in the given
            word table.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_string(conn,
                            """CREATE INDEX idx_{{table_name}}_word_token ON {{table_name}}
                               USING BTREE (word_token) {{db.tablespace.search_index}}""",
                            table_name=table_name)
            for name, ctype in WORD_TYPES:
                sqlp.run_string(conn,
                                """CREATE INDEX idx_{{table_name}}_{{idx_name}} ON {{table_name}}
                                   USING BTREE (word) {{db.tablespace.address_index}}
                                   WHERE type = '{{column_type}}'
                                """,
                                table_name=table_name, idx_name=name,
                                column_type=ctype)
            conn.commit()

    def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
        """ Create additional indexes used when running the API.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            # Index required for details lookup.
            sqlp.run_string(conn, """
                CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
                  ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
                """,
                table_name=table_name)
            conn.commit()

    def _move_temporary_word_table(self, old: str) -> None:
        """ Rename all tables and indexes used by the tokenizer.
        """
        with connect(self.dsn) as conn:
            drop_tables(conn, 'word')
            with conn.cursor() as cur:
                cur.execute(f"ALTER TABLE {old} RENAME TO word")
                for idx in ('word_token', 'word_id'):
                    cur.execute(f"""ALTER INDEX idx_{old}_{idx}
                                    RENAME TO idx_word_{idx}""")
                for name, _ in WORD_TYPES:
                    cur.execute(f"""ALTER INDEX idx_{old}_{name}
                                    RENAME TO idx_word_{name}""")


class ICUNameAnalyzer(AbstractAnalyzer):
    """ The ICU analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn: str, sanitizer: PlaceSanitizer,
                 token_analysis: ICUTokenAnalysis) -> None:
        self.conn: Optional[Connection] = connect(dsn)
        self.conn.autocommit = True
        self.sanitizer = sanitizer
        self.token_analysis = token_analysis

        self._cache = _TokenCache()

    def close(self) -> None:
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None

    def _search_normalized(self, name: str) -> str:
        """ Return the search token transliteration of the given name.
        """
        return cast(str, self.token_analysis.search.transliterate(name)).strip()

    def _normalized(self, name: str) -> str:
        """ Return the normalized version of the given name with all
            non-relevant information removed.
        """
        return cast(str, self.token_analysis.normalizer.transliterate(name)).strip()
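
    # _search_normalized() applies the search transliteration (the ASCII form
    # stored in word_token), while _normalized() only applies the
    # normalization rules (the form stored in the word column). A name like
    # 'Straße' would typically come out as 'strasse' after transliteration;
    # the exact output depends on the configured ICU rules.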

    def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
        """ Return token information for the given list of words.

            If a word starts with # it is assumed to be a full name,
            otherwise it is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not necessarily efficient.
        """
        assert self.conn is not None
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self._search_normalized(word[1:])
            else:
                partial_tokens[word] = self._search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
            + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]

    def normalize_postcode(self, postcode: str) -> str:
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()
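
    # Example: normalize_postcode('  se1 9gf ') returns 'SE1 9GF'. Only
    # whitespace trimming and upper-casing happen here; any country-specific
    # postcode handling is left to the optional '@postcode' analyzer used
    # further below.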

    def update_postcodes_from_db(self) -> None:
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@postcode')

        with self.conn.cursor() as cur:
            # First get all postcode names currently in the word table.
            cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
            word_entries = set((entry[0] for entry in cur))

            # Then compute the required postcode names from the postcode table.
            needed_entries = set()
            cur.execute("SELECT country_code, postcode FROM location_postcode")
            for cc, postcode in cur:
                info = PlaceInfo({'country_code': cc,
                                  'class': 'place', 'type': 'postcode',
                                  'address': {'postcode': postcode}})
                address = self.sanitizer.process_names(info)[1]
                for place in address:
                    if place.kind == 'postcode':
                        if analyzer is None:
                            postcode_name = place.name.strip().upper()
                            variant_base = None
                        else:
                            postcode_name = analyzer.get_canonical_id(place)
                            variant_base = place.get_attr("variant")

                        if variant_base:
                            needed_entries.add(f'{postcode_name}@{variant_base}')
                        else:
                            needed_entries.add(postcode_name)
                        break

            # Now update the word table.
            self._delete_unused_postcode_words(word_entries - needed_entries)
            self._add_missing_postcode_words(needed_entries - word_entries)

    def _delete_unused_postcode_words(self, tokens: Iterable[str]) -> None:
        assert self.conn is not None
        if tokens:
            with self.conn.cursor() as cur:
                cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
                            (list(tokens), ))

    def _add_missing_postcode_words(self, tokens: Iterable[str]) -> None:
        assert self.conn is not None
        if not tokens:
            return

        analyzer = self.token_analysis.analysis.get('@postcode')
        terms = []

        for postcode_name in tokens:
            if '@' in postcode_name:
                term, variant = postcode_name.split('@', 2)
                term = self._search_normalized(term)
                if analyzer is None:
                    variants = [term]
                else:
                    variants = analyzer.compute_variants(variant)
                    if term not in variants:
                        variants.append(term)
            else:
                variants = [self._search_normalized(postcode_name)]
            terms.append((postcode_name, variants))

        if terms:
            with self.conn.cursor() as cur:
                cur.executemany("""SELECT create_postcode_word(%s, %s)""", terms)
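
    # Postcode words are keyed by their canonical name; when a variant base
    # is present, the entry is stored as '<canonical>@<variant>' and split
    # again in _add_missing_postcode_words() to compute the variant
    # spellings handed to create_postcode_word().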

    def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
                               should_replace: bool) -> None:
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases is
            completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        assert self.conn is not None
        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)

    def _add_special_phrases(self, cursor: Cursor,
                             new_phrases: Set[Tuple[str, str, str, str]],
                             existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with cursor.copy('COPY word(word_token, type, word, info) FROM STDIN') as copy:
            for word, cls, typ, oper in to_add:
                term = self._search_normalized(word)
                if term:
                    copy.write_row((term, 'S', word,
                                    Jsonb({'class': cls, 'type': typ,
                                           'op': oper if oper in ('in', 'near') else None})))
                    added += 1

        return added

    def _remove_special_phrases(self, cursor: Cursor,
                                new_phrases: Set[Tuple[str, str, str, str]],
                                existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.executemany(
                """ DELETE FROM word
                      WHERE type = 'S' and word = %s
                            and info->>'class' = %s and info->>'type' = %s
                            and %s = coalesce(info->>'op', '-')
                """, to_delete)

        return len(to_delete)
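
    # Special phrases are handled as 4-tuples of
    # (normalized phrase, class, type, operator), with '-' standing in for a
    # missing operator so that the set comparison between new and existing
    # phrases works.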

    def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
        """ Add default names for the given country to the search index.
        """
        # Make sure any name preprocessing for country names applies.
        info = PlaceInfo({'name': names, 'country_code': country_code,
                          'rank_address': 4, 'class': 'boundary',
                          'type': 'administrative'})
        self._add_country_full_names(country_code,
                                     self.sanitizer.process_names(info)[0],
                                     internal=True)

    def _add_country_full_names(self, country_code: str, names: Sequence[PlaceName],
                                internal: bool = False) -> None:
        """ Add names for the given country from an already sanitized
            list of names.
        """
        assert self.conn is not None
        word_tokens = set()
        for name in names:
            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            # existing names
            cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
                             FROM word
                             WHERE type = 'C' and word = %s""",
                        (country_code, ))
            # internal/external names
            existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()}
            for word in cur:
                existing_tokens[word[1]].add(word[0])

            # Delete names that no longer exist.
            gone_tokens = existing_tokens[internal] - word_tokens
            if internal:
                gone_tokens.update(existing_tokens[False] & word_tokens)
            if gone_tokens:
                cur.execute("""DELETE FROM word
                               USING unnest(%s::text[]) as token
                               WHERE type = 'C' and word = %s
                                     and word_token = token""",
                            (list(gone_tokens), country_code))

            # Only add those names that are not yet in the list.
            new_tokens = word_tokens - existing_tokens[True]
            if not internal:
                new_tokens -= existing_tokens[False]
            if new_tokens:
                if internal:
                    sql = """INSERT INTO word (word_token, type, word, info)
                               (SELECT token, 'C', %s, '{"internal": "yes"}'
                                  FROM unnest(%s::text[]) as token)
                          """
                else:
                    sql = """INSERT INTO word (word_token, type, word)
                               (SELECT token, 'C', %s
                                  FROM unnest(%s::text[]) as token)
                          """
                cur.execute(sql, (country_code, list(new_tokens)))
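
    # Country names added via add_country_names() are marked as 'internal'
    # in the info column of the word table; names coming from the data
    # itself are added without that marker and never replace an existing
    # internal entry.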

    def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
        """ Determine tokenizer information about the given place.

            Returns a JSON-serializable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo()

        names, address = self.sanitizer.process_names(place)

        if names:
            token_info.set_names(*self._compute_name_tokens(names))

            if place.is_country():
                assert place.country_code is not None
                self._add_country_full_names(place.country_code, names)

        if address:
            self._process_place_address(token_info, address)

        return token_info.to_dict()

    def _process_place_address(self, token_info: '_TokenInfo',
                               address: Sequence[PlaceName]) -> None:
        for item in address:
            if item.kind == 'postcode':
                token_info.set_postcode(self._add_postcode(item))
            elif item.kind == 'housenumber':
                token_info.add_housenumber(*self._compute_housenumber_token(item))
            elif item.kind == 'street':
                token_info.add_street(self._retrieve_full_tokens(item.name))
            elif item.kind == 'place':
                if not item.suffix:
                    token_info.add_place(itertools.chain(*self._compute_name_tokens([item])))
            elif (not item.kind.startswith('_') and not item.suffix and
                  item.kind not in ('country', 'full', 'inclusion')):
                token_info.add_address_term(item.kind,
                                            itertools.chain(*self._compute_name_tokens([item])))
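
    # The address kinds above map directly onto fields of the structure
    # produced by _TokenInfo.to_dict(): postcode -> 'postcode',
    # housenumber -> 'hnr'/'hnr_tokens', street -> 'street',
    # place -> 'place', everything else -> an entry in 'addr'.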

    def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
        """ Normalize the housenumber and return the word token and the
            canonical form.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@housenumber')
        result: Tuple[Optional[int], Optional[str]] = (None, None)

        if analyzer is None:
            # When no custom analyzer is set, simply normalize and transliterate.
            norm_name = self._search_normalized(hnr.name)
            if norm_name:
                result = self._cache.housenumbers.get(norm_name, result)
                if result[0] is None:
                    hid = execute_scalar(self.conn, "SELECT getorcreate_hnr_id(%s)", (norm_name, ))
                    result = hid, norm_name
                    self._cache.housenumbers[norm_name] = result
        else:
            # Otherwise use the analyzer to determine the canonical name.
            # Per convention we use the first variant as the 'lookup name', the
            # name that gets saved in the housenumber field of the place.
            word_id = analyzer.get_canonical_id(hnr)
            if word_id:
                result = self._cache.housenumbers.get(word_id, result)
                if result[0] is None:
                    variants = analyzer.compute_variants(word_id)
                    if variants:
                        hid = execute_scalar(self.conn, "SELECT create_analyzed_hnr_id(%s, %s)",
                                             (word_id, list(variants)))
                        result = hid, variants[0]
                        self._cache.housenumbers[word_id] = result

        return result

    def _retrieve_full_tokens(self, name: str) -> List[int]:
        """ Get the full name token for the given name, if it exists.
            The name is only retrieved for the standard analyser.
        """
        assert self.conn is not None
        norm_name = self._search_normalized(name)

        # return cached if possible
        if norm_name in self._cache.fulls:
            return self._cache.fulls[norm_name]

        with self.conn.cursor() as cur:
            cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
                        (norm_name, ))
            full = [row[0] for row in cur]

        self._cache.fulls[norm_name] = full

        return full
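
    # All lookups above go through self._cache (see _TokenCache at the end of
    # this file), so repeated names, housenumbers and postcodes within one
    # analyzer session hit the database only once.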

    def _compute_name_tokens(self, names: Sequence[PlaceName]) -> Tuple[Set[int], Set[int]]:
        """ Computes the full name and partial name tokens for the given
            list of names.
        """
        assert self.conn is not None
        full_tokens: Set[int] = set()
        partial_tokens: Set[int] = set()

        for name in names:
            analyzer_id = name.get_attr('analyzer')
            analyzer = self.token_analysis.get_analyzer(analyzer_id)
            word_id = analyzer.get_canonical_id(name)
            if analyzer_id is None:
                token_id = word_id
            else:
                token_id = f'{word_id}@{analyzer_id}'

            full, part = self._cache.names.get(token_id, (None, None))
            if full is None:
                variants = analyzer.compute_variants(word_id)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
                                (token_id, variants))
                    full, part = cast(Tuple[int, List[int]], cur.fetchone())

                self._cache.names[token_id] = (full, part)

            assert part is not None

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens

    def _add_postcode(self, item: PlaceName) -> Optional[str]:
        """ Make sure the normalized postcode is present in the word table.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@postcode')

        if analyzer is None:
            postcode_name = item.name.strip().upper()
            variant_base = None
        else:
            postcode_name = analyzer.get_canonical_id(item)
            variant_base = item.get_attr("variant")

        if variant_base:
            postcode = f'{postcode_name}@{variant_base}'
        else:
            postcode = postcode_name

        if postcode not in self._cache.postcodes:
            term = self._search_normalized(postcode_name)
            if not term:
                return None

            variants = {term}
            if analyzer is not None and variant_base:
                variants.update(analyzer.compute_variants(variant_base))

            with self.conn.cursor() as cur:
                cur.execute("SELECT create_postcode_word(%s, %s)",
                            (postcode, list(variants)))
            self._cache.postcodes.add(postcode)

        return postcode_name


class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self) -> None:
        self.names: Optional[str] = None
        self.housenumbers: Set[str] = set()
        self.housenumber_tokens: Set[int] = set()
        self.street_tokens: Optional[Set[int]] = None
        self.place_tokens: Set[int] = set()
        self.address_tokens: Dict[str, str] = {}
        self.postcode: Optional[str] = None

    def _mk_array(self, tokens: Iterable[Any]) -> str:
        return f"{{{','.join((str(s) for s in tokens))}}}"
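
    # _mk_array() renders tokens as a PostgreSQL array literal, e.g.
    # _mk_array([1, 2, 3]) -> '{1,2,3}', which is the form in which the
    # token columns are handed back to the database.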

    def to_dict(self) -> Dict[str, Any]:
        """ Return the token information in database importable format.
        """
        out: Dict[str, Any] = {}

        if self.names:
            out['names'] = self.names

        if self.housenumbers:
            out['hnr'] = ';'.join(self.housenumbers)
            out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)

        if self.street_tokens is not None:
            out['street'] = self._mk_array(self.street_tokens)

        if self.place_tokens:
            out['place'] = self._mk_array(self.place_tokens)

        if self.address_tokens:
            out['addr'] = self.address_tokens

        if self.postcode:
            out['postcode'] = self.postcode

        return out

    def set_names(self, fulls: Iterable[int], partials: Iterable[int]) -> None:
        """ Add token information for the normalised names.
        """
        self.names = self._mk_array(itertools.chain(fulls, partials))

    def add_housenumber(self, token: Optional[int], hnr: Optional[str]) -> None:
        """ Add a housenumber token and the normalised housenumber
            it belongs to.
        """
        if token:
            assert hnr is not None
            self.housenumbers.add(hnr)
            self.housenumber_tokens.add(token)

    def add_street(self, tokens: Iterable[int]) -> None:
        """ Add addr:street match terms.
        """
        if self.street_tokens is None:
            self.street_tokens = set()
        self.street_tokens.update(tokens)

    def add_place(self, tokens: Iterable[int]) -> None:
        """ Add addr:place search and match terms.
        """
        self.place_tokens.update(tokens)

    def add_address_term(self, key: str, partials: Iterable[int]) -> None:
        """ Add additional address terms.
        """
        array = self._mk_array(partials)
        if len(array) > 2:
            self.address_tokens[key] = array

    def set_postcode(self, postcode: Optional[str]) -> None:
        """ Set the postcode to the given one.
        """
        self.postcode = postcode


class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self) -> None:
        self.names: Dict[str, Tuple[int, List[int]]] = {}
        self.partials: Dict[str, int] = {}
        self.fulls: Dict[str, List[int]] = {}
        self.postcodes: Set[str] = set()
        self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {}