# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \
                   Dict, Set, Iterable
import itertools
import json
import logging
from pathlib import Path
from textwrap import dedent

from nominatim.db.connection import connect, Connection, Cursor
from nominatim.config import Configuration
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.data.place_info import PlaceInfo
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
from nominatim.data.place_name import PlaceName
from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()
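
# Maps the name suffix of each per-type index on the word table to the
# single-letter code stored in the table's `type` column.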
WORD_TYPES = (('country_names', 'C'),
              ('postcodes', 'P'),
              ('full_word', 'W'),
              ('housenumbers', 'H'))

def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
    """ Create a new instance of the tokenizer provided by this module.
    """
    return ICUTokenizer(dsn, data_dir)


class ICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn: str, data_dir: Path) -> None:
        self.dsn = dsn
        self.data_dir = data_dir
        self.loader: Optional[ICURuleLoader] = None


    def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        self.loader = ICURuleLoader(config)

        self._install_php(config.lib_dir.php, overwrite=True)
        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._setup_db_tables(config, 'word')
            self._create_base_indices(config, 'word')


    def init_from_project(self, config: Configuration) -> None:
        """ Initialise the tokenizer from the project directory.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)

        self._install_php(config.lib_dir.php, overwrite=False)


    def finalize_import(self, config: Configuration) -> None:
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        self._create_lookup_indices(config, 'word')


    def update_sql_functions(self, config: Configuration) -> None:
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')


    def check_database(self, config: Configuration) -> None:
        """ Check that the tokenizer is set up correctly.
        """
        # Will throw an error if there is an issue.
        self.init_from_project(config)


    def update_statistics(self, config: Configuration) -> None:
        """ Recompute frequencies for all name words.
        """
        with connect(self.dsn) as conn:
            if not conn.table_exists('search_name'):
                return

            with conn.cursor() as cur:
                LOG.info('Computing word frequencies')
                cur.drop_table('word_frequencies')
                cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                 SELECT unnest(name_vector) as id, count(*)
                                 FROM search_name GROUP BY id""")
                cur.execute('CREATE INDEX ON word_frequencies(id)')
                LOG.info('Update word table with recomputed frequencies')
                cur.drop_table('tmp_word')
                cur.execute("""CREATE TABLE tmp_word AS
                                SELECT word_id, word_token, type, word,
                                       (CASE WHEN wf.count is null THEN info
                                             ELSE info || jsonb_build_object('count', wf.count)
                                        END) as info
                                FROM word LEFT JOIN word_frequencies wf
                                     ON word.word_id = wf.id""")
                cur.drop_table('word_frequencies')
            conn.commit()
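        # Rebuild the indices on the new table, then swap it in for the
        # current word table.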
        self._create_base_indices(config, 'tmp_word')
        self._create_lookup_indices(config, 'tmp_word')
        self._move_temporary_word_table('tmp_word')


    def _cleanup_housenumbers(self) -> None:
        """ Remove unused house numbers.
        """
        with connect(self.dsn) as conn:
            if not conn.table_exists('search_name'):
                return
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
                               FROM word
                               WHERE type = 'H'
                                 AND NOT EXISTS(SELECT * FROM search_name
                                                WHERE ARRAY[word.word_id] && name_vector)
                                 AND (char_length(coalesce(word, word_token)) > 6
                                      OR coalesce(word, word_token) not similar to '\\d+')
                            """)
                candidates = {token: wid for wid, token in cur}
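            # Drop from the candidates all housenumbers that are still in use
            # in placex.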
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT housenumber FROM placex
                               WHERE housenumber is not null
                                     AND (char_length(housenumber) > 6
                                          OR housenumber not similar to '\\d+')
                            """)
                for row in cur:
                    for hnr in row[0].split(';'):
                        candidates.pop(hnr, None)
            LOG.info("There are %s outdated housenumbers.", len(candidates))
            LOG.debug("Outdated housenumbers: %s", candidates.keys())
            if candidates:
                with conn.cursor() as cur:
                    cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
                                (list(candidates.values()), ))
                conn.commit()


    def update_word_tokens(self) -> None:
        """ Remove unused tokens.
        """
        LOG.warning("Cleaning up housenumber tokens.")
        self._cleanup_housenumbers()
        LOG.warning("Tokenizer house-keeping done.")


    def name_analyzer(self) -> 'ICUNameAnalyzer':
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    analyzer.tokenize()

            When used outside the with construct, the caller must make sure to
            call the close() function before destroying the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        assert self.loader is not None
        return ICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                               self.loader.make_token_analysis())


    def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
        """ Return a list of the `num` most frequent full words
            in the database.
        """
        with conn.cursor() as cur:
            cur.execute("""SELECT word, sum((info->>'count')::int) as count
                             FROM word WHERE type = 'W'
                             GROUP BY word
                             ORDER BY count DESC LIMIT %s""", (num,))
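            # Full words may be stored as '<word>@<analyzer>'; strip the suffix.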
            return list(s[0].split('@')[0] for s in cur)


    def _install_php(self, phpdir: Path, overwrite: bool = True) -> None:
        """ Install the php script for the tokenizer.
        """
        assert self.loader is not None
        php_file = self.data_dir / "tokenizer.php"

        if not php_file.exists() or overwrite:
            php_file.write_text(dedent(f"""\
                <?php
                @define('CONST_Max_Word_Frequency', 10000000);
                @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
                @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
                require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')


    def _save_config(self) -> None:
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        assert self.loader is not None
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)


    def _setup_db_tables(self, config: Configuration, table_name: str) -> None:
        """ Create the word table and the sequence used for word ids.
        """
        with connect(self.dsn) as conn:
            with conn.cursor() as cur:
                cur.drop_table(table_name)
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_string(conn, """
                CREATE TABLE {{table_name}} (
                      word_id INTEGER,
                      word_token text NOT NULL,
                      type text NOT NULL,
                      word text,
                      info jsonb
                    ) {{db.tablespace.search_data}};
                GRANT SELECT ON {{table_name}} TO "{{config.DATABASE_WEBUSER}}";

                DROP SEQUENCE IF EXISTS seq_{{table_name}};
                CREATE SEQUENCE seq_{{table_name}} start 1;
                GRANT SELECT ON seq_{{table_name}} to "{{config.DATABASE_WEBUSER}}";
            """, table_name=table_name)


    def _create_base_indices(self, config: Configuration, table_name: str) -> None:
        """ Create the indices on the word table that are needed during import.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_string(conn,
                            """CREATE INDEX idx_{{table_name}}_word_token ON {{table_name}}
                               USING BTREE (word_token) {{db.tablespace.search_index}}""",
                            table_name=table_name)
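            # One partial index on `word` per word type (country names,
            # postcodes, full words, house numbers).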
            for name, ctype in WORD_TYPES:
                sqlp.run_string(conn,
                                """CREATE INDEX idx_{{table_name}}_{{idx_name}} ON {{table_name}}
                                   USING BTREE (word) {{db.tablespace.address_index}}
                                   WHERE type = '{{column_type}}'
                                """,
                                table_name=table_name, idx_name=name,
                                column_type=ctype)


    def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
        """ Create additional indexes used when running the API.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            # Index required for details lookup.
            sqlp.run_string(conn, """
                CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
                  ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
            """,
                            table_name=table_name)


    def _move_temporary_word_table(self, old: str) -> None:
        """ Rename all tables and indexes used by the tokenizer.
        """
        with connect(self.dsn) as conn:
            with conn.cursor() as cur:
                cur.drop_table('word')
                cur.execute(f"ALTER TABLE {old} RENAME TO word")
                for idx in ('word_token', 'word_id'):
                    cur.execute(f"""ALTER INDEX idx_{old}_{idx}
                                      RENAME TO idx_word_{idx}""")
                for name, _ in WORD_TYPES:
                    cur.execute(f"""ALTER INDEX idx_{old}_{name}
                                    RENAME TO idx_word_{name}""")
            conn.commit()


class ICUNameAnalyzer(AbstractAnalyzer):
    """ The ICU analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn: str, sanitizer: PlaceSanitizer,
                 token_analysis: ICUTokenAnalysis) -> None:
        self.conn: Optional[Connection] = connect(dsn).connection
        self.conn.autocommit = True
        self.sanitizer = sanitizer
        self.token_analysis = token_analysis

        self._cache = _TokenCache()


    def close(self) -> None:
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    def _search_normalized(self, name: str) -> str:
        """ Return the search token transliteration of the given name.
        """
        return cast(str, self.token_analysis.search.transliterate(name)).strip()


    def _normalized(self, name: str) -> str:
        """ Return the normalized version of the given name with all
            non-relevant information removed.
        """
        return cast(str, self.token_analysis.normalizer.transliterate(name)).strip()


    def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
        """ Return token information for the given list of words.

            If a word starts with '#' it is assumed to be a full name,
            otherwise a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not necessarily efficient.
        """
        assert self.conn is not None
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self._search_normalized(word[1:])
            else:
                partial_tokens[word] = self._search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
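            # Partial words are stored with the lowercase type 'w',
            # full names with the uppercase 'W'.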
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]


    def normalize_postcode(self, postcode: str) -> str:
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()


    def update_postcodes_from_db(self) -> None:
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@postcode')
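        # A custom '@postcode' analysis module is optional; without one the
        # raw postcode (uppercased) is used.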

        with self.conn.cursor() as cur:
            # First get all postcode names currently in the word table.
            cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
            word_entries = set((entry[0] for entry in cur))

            # Then compute the required postcode names from the postcode table.
            needed_entries = set()
            cur.execute("SELECT country_code, postcode FROM location_postcode")
            for cc, postcode in cur:
                info = PlaceInfo({'country_code': cc,
                                  'class': 'place', 'type': 'postcode',
                                  'address': {'postcode': postcode}})
                address = self.sanitizer.process_names(info)[1]
                for place in address:
                    if place.kind == 'postcode':
                        if analyzer is None:
                            postcode_name = place.name.strip().upper()
                            variant_base = None
                        else:
                            postcode_name = analyzer.get_canonical_id(place)
                            variant_base = place.get_attr("variant")

                        if variant_base:
                            needed_entries.add(f'{postcode_name}@{variant_base}')
                        else:
                            needed_entries.add(postcode_name)
                        break
        # Now update the word table.
        self._delete_unused_postcode_words(word_entries - needed_entries)
        self._add_missing_postcode_words(needed_entries - word_entries)

    def _delete_unused_postcode_words(self, tokens: Iterable[str]) -> None:
        assert self.conn is not None
        if tokens:
            with self.conn.cursor() as cur:
                cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
                            (list(tokens), ))

    def _add_missing_postcode_words(self, tokens: Iterable[str]) -> None:
        assert self.conn is not None
        if not tokens:
            return

        analyzer = self.token_analysis.analysis.get('@postcode')
        terms = []

        for postcode_name in tokens:
            if '@' in postcode_name:
                term, variant = postcode_name.split('@', 2)
                term = self._search_normalized(term)
                if analyzer is None:
                    variants = [term]
                else:
                    variants = analyzer.compute_variants(variant)
                    if term not in variants:
                        variants.append(term)
            else:
                variants = [self._search_normalized(postcode_name)]
            terms.append((postcode_name, variants))

        if terms:
            with self.conn.cursor() as cur:
                cur.execute_values("""SELECT create_postcode_word(pc, var)
                                      FROM (VALUES %s) AS v(pc, var)""",
                                   terms)


    def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
                               should_replace: bool) -> None:
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases is
            completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        assert self.conn is not None
        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)


    def _add_special_phrases(self, cursor: Cursor,
                             new_phrases: Set[Tuple[str, str, str, str]],
                             existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
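        # Bulk-load the new entries into the word table via COPY.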
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self._search_normalized(word)
                if term:
                    copystr.add(term, 'S', word,
                                json.dumps({'class': cls, 'type': typ,
                                            'op': oper if oper in ('in', 'near') else None}))
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word_token', 'type', 'word', 'info'])

        return added


    def _remove_special_phrases(self, cursor: Cursor,
                                new_phrases: Set[Tuple[str, str, str, str]],
                                existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.execute_values(
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE type = 'S' and word = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
                """, to_delete)

        return len(to_delete)


    def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
        """ Add default names for the given country to the search index.
        """
        # Make sure any name preprocessing for country names applies.
        info = PlaceInfo({'name': names, 'country_code': country_code,
                          'rank_address': 4, 'class': 'boundary',
                          'type': 'administrative'})
        self._add_country_full_names(country_code,
                                     self.sanitizer.process_names(info)[0],
                                     internal=True)


    def _add_country_full_names(self, country_code: str, names: Sequence[PlaceName],
                                internal: bool = False) -> None:
        """ Add names for the given country from an already sanitized
            name list.
        """
        assert self.conn is not None
        word_tokens = set()
        for name in names:
            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
                             FROM word
                             WHERE type = 'C' and word = %s""",
                        (country_code, ))
            # internal/external names
            existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()}
            for word in cur:
                existing_tokens[word[1]].add(word[0])

            # Delete names that no longer exist.
            gone_tokens = existing_tokens[internal] - word_tokens
            if internal:
                gone_tokens.update(existing_tokens[False] & word_tokens)
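                # (Existing external entries for names that are about to be
                # re-added as internal are removed here as well.)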
            if gone_tokens:
                cur.execute("""DELETE FROM word
                               USING unnest(%s) as token
                               WHERE type = 'C' and word = %s
                                     and word_token = token""",
                            (list(gone_tokens), country_code))

            # Only add those names that are not yet in the list.
            new_tokens = word_tokens - existing_tokens[True]
            if not internal:
                new_tokens -= existing_tokens[False]
            if new_tokens:
                if internal:
                    sql = """INSERT INTO word (word_token, type, word, info)
                               (SELECT token, 'C', %s, '{"internal": "yes"}'
                                  FROM unnest(%s) as token)
                           """
                else:
                    sql = """INSERT INTO word (word_token, type, word)
                               (SELECT token, 'C', %s
                                  FROM unnest(%s) as token)
                           """
                cur.execute(sql, (country_code, list(new_tokens)))


    def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
        """ Determine tokenizer information about the given place.

            Returns a JSON-serializable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo()

        names, address = self.sanitizer.process_names(place)

        if names:
            token_info.set_names(*self._compute_name_tokens(names))

            if place.is_country():
                assert place.country_code is not None
                self._add_country_full_names(place.country_code, names)

        if address:
            self._process_place_address(token_info, address)

        return token_info.to_dict()


    def _process_place_address(self, token_info: '_TokenInfo',
                               address: Sequence[PlaceName]) -> None:
        for item in address:
            if item.kind == 'postcode':
                token_info.set_postcode(self._add_postcode(item))
            elif item.kind == 'housenumber':
                token_info.add_housenumber(*self._compute_housenumber_token(item))
            elif item.kind == 'street':
                token_info.add_street(self._retrieve_full_tokens(item.name))
            elif item.kind == 'place':
                if not item.suffix:
                    token_info.add_place(self._compute_partial_tokens(item.name))
            elif not item.kind.startswith('_') and not item.suffix and \
                 item.kind not in ('country', 'full', 'inclusion'):
                token_info.add_address_term(item.kind, self._compute_partial_tokens(item.name))


    def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
        """ Normalize the housenumber and return the word token and the
            canonical form.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@housenumber')
        result: Tuple[Optional[int], Optional[str]] = (None, None)

        if analyzer is None:
            # When no custom analyzer is set, simply normalize and transliterate.
            norm_name = self._search_normalized(hnr.name)
            if norm_name:
                result = self._cache.housenumbers.get(norm_name, result)
                if result[0] is None:
                    with self.conn.cursor() as cur:
                        hid = cur.scalar("SELECT getorcreate_hnr_id(%s)", (norm_name, ))

                        result = hid, norm_name
                        self._cache.housenumbers[norm_name] = result
        else:
            # Otherwise use the analyzer to determine the canonical name.
            # Per convention we use the first variant as the 'lookup name', the
            # name that gets saved in the housenumber field of the place.
            word_id = analyzer.get_canonical_id(hnr)
            if word_id:
                result = self._cache.housenumbers.get(word_id, result)
                if result[0] is None:
                    variants = analyzer.compute_variants(word_id)
                    if variants:
                        with self.conn.cursor() as cur:
                            hid = cur.scalar("SELECT create_analyzed_hnr_id(%s, %s)",
                                             (word_id, list(variants)))
                            result = hid, variants[0]
                            self._cache.housenumbers[word_id] = result

        return result


    def _compute_partial_tokens(self, name: str) -> List[int]:
        """ Normalize the given term, split it into partial words and return
            the token list for them.
        """
        assert self.conn is not None
        norm_name = self._search_normalized(name)

        tokens = []
        need_lookup = []
        for partial in norm_name.split():
            token = self._cache.partials.get(partial)
            if token:
                tokens.append(token)
            else:
                need_lookup.append(partial)

        if need_lookup:
            with self.conn.cursor() as cur:
                cur.execute("""SELECT word, getorcreate_partial_word(word)
                               FROM unnest(%s) word""",
                            (need_lookup, ))

                for partial, token in cur:
                    assert token is not None
                    tokens.append(token)
                    self._cache.partials[partial] = token

        return tokens


    def _retrieve_full_tokens(self, name: str) -> List[int]:
        """ Get the full name token for the given name, if it exists.
            The name is only retrieved for the standard analyser.
        """
        assert self.conn is not None
        norm_name = self._search_normalized(name)

        # return cached if possible
        if norm_name in self._cache.fulls:
            return self._cache.fulls[norm_name]

        with self.conn.cursor() as cur:
            cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
                        (norm_name, ))
            full = [row[0] for row in cur]

        self._cache.fulls[norm_name] = full

        return full


    def _compute_name_tokens(self, names: Sequence[PlaceName]) -> Tuple[Set[int], Set[int]]:
        """ Compute the full name and partial name tokens for the given
            list of names.
        """
        assert self.conn is not None
        full_tokens: Set[int] = set()
        partial_tokens: Set[int] = set()

        for name in names:
            analyzer_id = name.get_attr('analyzer')
            analyzer = self.token_analysis.get_analyzer(analyzer_id)
            word_id = analyzer.get_canonical_id(name)
            if analyzer_id is None:
                token_id = word_id
            else:
                token_id = f'{word_id}@{analyzer_id}'

            full, part = self._cache.names.get(token_id, (None, None))
            if full is None:
                variants = analyzer.compute_variants(word_id)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
                                (token_id, variants))
                    full, part = cast(Tuple[int, List[int]], cur.fetchone())

                self._cache.names[token_id] = (full, part)

            assert part is not None

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens


    def _add_postcode(self, item: PlaceName) -> Optional[str]:
        """ Make sure the normalized postcode is present in the word table.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@postcode')

        if analyzer is None:
            postcode_name = item.name.strip().upper()
            variant_base = None
        else:
            postcode_name = analyzer.get_canonical_id(item)
            variant_base = item.get_attr("variant")

        if variant_base:
            postcode = f'{postcode_name}@{variant_base}'
        else:
            postcode = postcode_name

        if postcode not in self._cache.postcodes:
            term = self._search_normalized(postcode_name)
            if not term:
                return None

            variants = {term}
            if analyzer is not None and variant_base:
                variants.update(analyzer.compute_variants(variant_base))

            with self.conn.cursor() as cur:
                cur.execute("SELECT create_postcode_word(%s, %s)",
                            (postcode, list(variants)))
            self._cache.postcodes.add(postcode)

        return postcode_name


class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self) -> None:
        self.names: Optional[str] = None
        self.housenumbers: Set[str] = set()
        self.housenumber_tokens: Set[int] = set()
        self.street_tokens: Optional[Set[int]] = None
        self.place_tokens: Set[int] = set()
        self.address_tokens: Dict[str, str] = {}
        self.postcode: Optional[str] = None


    def _mk_array(self, tokens: Iterable[Any]) -> str:
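        # Format the tokens as a PostgreSQL array literal, e.g. '{1,2,3}'.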
        return f"{{{','.join((str(s) for s in tokens))}}}"


    def to_dict(self) -> Dict[str, Any]:
        """ Return the token information in database importable format.
        """
        out: Dict[str, Any] = {}

        if self.names:
            out['names'] = self.names

        if self.housenumbers:
            out['hnr'] = ';'.join(self.housenumbers)
            out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)
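
        # street_tokens is a set (possibly empty) whenever addr:street was
        # present, so 'street' is emitted even if no tokens could be computed.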
        if self.street_tokens is not None:
            out['street'] = self._mk_array(self.street_tokens)

        if self.place_tokens:
            out['place'] = self._mk_array(self.place_tokens)

        if self.address_tokens:
            out['addr'] = self.address_tokens

        if self.postcode:
            out['postcode'] = self.postcode

        return out


    def set_names(self, fulls: Iterable[int], partials: Iterable[int]) -> None:
        """ Add token information for the normalised names.
        """
        self.names = self._mk_array(itertools.chain(fulls, partials))


    def add_housenumber(self, token: Optional[int], hnr: Optional[str]) -> None:
        """ Add the token and normalised lookup form of a housenumber.
        """
        if token:
            assert hnr is not None
            self.housenumbers.add(hnr)
            self.housenumber_tokens.add(token)


    def add_street(self, tokens: Iterable[int]) -> None:
        """ Add addr:street match terms.
        """
        if self.street_tokens is None:
            self.street_tokens = set()
        self.street_tokens.update(tokens)


    def add_place(self, tokens: Iterable[int]) -> None:
        """ Add addr:place search and match terms.
        """
        self.place_tokens.update(tokens)


    def add_address_term(self, key: str, partials: Iterable[int]) -> None:
        """ Add additional address terms.
        """
        if partials:
            self.address_tokens[key] = self._mk_array(partials)


    def set_postcode(self, postcode: Optional[str]) -> None:
        """ Set the postcode to the given one.
        """
        self.postcode = postcode


class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self) -> None:
        self.names: Dict[str, Tuple[int, List[int]]] = {}
        self.partials: Dict[str, int] = {}
        self.fulls: Dict[str, List[int]] = {}
        self.postcodes: Set[str] = set()
        self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {}