# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \
                   Dict, Set, Iterable
import itertools
import json
import logging
from pathlib import Path
from textwrap import dedent

from nominatim.db.connection import connect, Connection, Cursor
from nominatim.config import Configuration
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.data.place_info import PlaceInfo
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
from nominatim.data.place_name import PlaceName
from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()

WORD_TYPES = (('country_names', 'C'),
              ('postcodes', 'P'),
              ('full_word', 'W'),
              ('housenumbers', 'H'))

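# Illustrative note (not part of the original module): each (index name, type code)
# pair above yields one partial index in _create_base_indices() below. Assuming the
# standard 'word' table, the generated statements look roughly like:
#
#   CREATE INDEX idx_word_country_names ON word
#       USING BTREE (word) WHERE type = 'C';
#   CREATE INDEX idx_word_housenumbers ON word
#       USING BTREE (word) WHERE type = 'H';
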
def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
    """ Create a new instance of the tokenizer provided by this module.
    """
    return ICUTokenizer(dsn, data_dir)

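# Usage sketch (illustrative only, not part of the original module; the DSN, path,
# 'config' and 'place' objects below are placeholders):
#
#   tokenizer = create('dbname=nominatim', Path('/path/to/project/tokenizer'))
#   tokenizer.init_from_project(config)            # config: a loaded Configuration
#   with tokenizer.name_analyzer() as analyzer:
#       token_info = analyzer.process_place(place)  # place: a PlaceInfo instance
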
class ICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn: str, data_dir: Path) -> None:
        self.dsn = dsn
        self.data_dir = data_dir
        self.loader: Optional[ICURuleLoader] = None

    def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        self.loader = ICURuleLoader(config)

        self._install_php(config.lib_dir.php, overwrite=True)
        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._setup_db_tables(config)
            self._create_base_indices(config, 'word')

    def init_from_project(self, config: Configuration) -> None:
        """ Initialise the tokenizer from the project directory.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)

        self._install_php(config.lib_dir.php, overwrite=False)

    def finalize_import(self, config: Configuration) -> None:
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        self._create_lookup_indices(config, 'word')

    def update_sql_functions(self, config: Configuration) -> None:
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')


    def check_database(self, config: Configuration) -> None:
        """ Check that the tokenizer is set up correctly.
        """
        # Will throw an error if there is an issue.
        self.init_from_project(config)

    def update_statistics(self, config: Configuration) -> None:
        """ Recompute frequencies for all name words.
        """
        with connect(self.dsn) as conn:
            if not conn.table_exists('search_name'):
                return

            with conn.cursor() as cur:
                LOG.info('Computing word frequencies')
                cur.drop_table('word_frequencies')
                cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                 SELECT unnest(name_vector) as id, count(*)
                                 FROM search_name GROUP BY id""")
                cur.execute('CREATE INDEX ON word_frequencies(id)')
                LOG.info('Update word table with recomputed frequencies')
                cur.drop_table('tmp_word')
                cur.execute("""CREATE TABLE tmp_word AS
                                SELECT word_id, word_token, type, word,
                                       (CASE WHEN wf.count is null THEN info
                                             ELSE info || jsonb_build_object('count', wf.count)
                                        END) as info
                                FROM word LEFT JOIN word_frequencies wf
                                     ON word.word_id = wf.id""")
                cur.drop_table('word_frequencies')

            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_string(conn,
                            'GRANT SELECT ON tmp_word TO "{{config.DATABASE_WEBUSER}}"')
            conn.commit()

        self._create_base_indices(config, 'tmp_word')
        self._create_lookup_indices(config, 'tmp_word')
        self._move_temporary_word_table('tmp_word')

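    # Illustrative note (not from the original source): the CASE expression above
    # folds the recomputed frequency into the existing info JSON; PostgreSQL's
    # jsonb concatenation overwrites duplicate keys, e.g.
    #
    #   SELECT '{"lookup": "3a", "count": 7}'::jsonb || jsonb_build_object('count', 42);
    #   -- {"lookup": "3a", "count": 42}
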
    def _cleanup_housenumbers(self) -> None:
        """ Remove unused house numbers.
        """
        with connect(self.dsn) as conn:
            if not conn.table_exists('search_name'):
                return
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
                               FROM word
                               WHERE type = 'H'
                                 AND NOT EXISTS(SELECT * FROM search_name
                                                WHERE ARRAY[word.word_id] && name_vector)
                                 AND (char_length(coalesce(word, word_token)) > 6
                                      OR coalesce(word, word_token) not similar to '\\d+')
                            """)
                candidates = {token: wid for wid, token in cur}
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT housenumber FROM placex
                               WHERE housenumber is not null
                                     AND (char_length(housenumber) > 6
                                          OR housenumber not similar to '\\d+')
                            """)
                for row in cur:
                    for hnr in row[0].split(';'):
                        candidates.pop(hnr, None)
            LOG.info("There are %s outdated housenumbers.", len(candidates))
            LOG.debug("Outdated housenumbers: %s", candidates.keys())
            if candidates:
                with conn.cursor() as cur:
                    cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
                                (list(candidates.values()), ))
                conn.commit()

    def update_word_tokens(self) -> None:
        """ Remove unused tokens.
        """
        LOG.warning("Cleaning up housenumber tokens.")
        self._cleanup_housenumbers()
        LOG.warning("Tokenizer house-keeping done.")

    def name_analyzer(self) -> 'ICUNameAnalyzer':
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    analyzer.process_place(place)

            When used outside the with construct, the caller must call
            close() before the analyzer is destroyed.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        assert self.loader is not None
        return ICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                               self.loader.make_token_analysis())

    def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
        """ Return a list of the `num` most frequent full words
            in the database.
        """
        with conn.cursor() as cur:
            cur.execute("""SELECT word, sum((info->>'count')::int) as count
                             FROM word WHERE type = 'W'
                             GROUP BY word
                             ORDER BY count DESC LIMIT %s""", (num,))
            return list(s[0].split('@')[0] for s in cur)

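    # Usage sketch (illustrative, not part of the original module):
    #
    #   with connect(dsn) as conn:
    #       top = tokenizer.most_frequent_words(conn, 5)
    #   # e.g. ['berlin', 'hauptstrasse', ...]; the split('@') above strips the
    #   # analyzer suffix that some full-word entries carry.
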
    def _install_php(self, phpdir: Path, overwrite: bool = True) -> None:
        """ Install the php script for the tokenizer.
        """
        assert self.loader is not None
        php_file = self.data_dir / "tokenizer.php"

        if not php_file.exists() or overwrite:
            php_file.write_text(dedent(f"""\
                <?php
                @define('CONST_Max_Word_Frequency', 10000000);
                @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
                @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
                require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')

    def _save_config(self) -> None:
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        assert self.loader is not None
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)

    def _setup_db_tables(self, config: Configuration) -> None:
        """ Set up the word table and the sequence used for word ids.
        """
        with connect(self.dsn) as conn:
            with conn.cursor() as cur:
                cur.drop_table('word')
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_string(conn, """
                CREATE TABLE word (
                      word_id INTEGER,
                      word_token text NOT NULL,
                      type text NOT NULL,
                      word text,
                      info jsonb
                    ) {{db.tablespace.search_data}};
                GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";

                DROP SEQUENCE IF EXISTS seq_word;
                CREATE SEQUENCE seq_word start 1;
                GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
            """)
            conn.commit()

    def _create_base_indices(self, config: Configuration, table_name: str) -> None:
        """ Create the indexes on the given word table that are needed
            during import.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_string(conn,
                            """CREATE INDEX idx_{{table_name}}_word_token ON {{table_name}}
                               USING BTREE (word_token) {{db.tablespace.search_index}}""",
                            table_name=table_name)
            for name, ctype in WORD_TYPES:
                sqlp.run_string(conn,
                                """CREATE INDEX idx_{{table_name}}_{{idx_name}} ON {{table_name}}
                                   USING BTREE (word) {{db.tablespace.address_index}}
                                   WHERE type = '{{column_type}}'
                                """,
                                table_name=table_name, idx_name=name,
                                column_type=ctype)
            conn.commit()

    def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
        """ Create additional indexes used when running the API.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            # Index required for details lookup.
            sqlp.run_string(conn, """
                CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
                  ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
            """,
                            table_name=table_name)
            conn.commit()

    def _move_temporary_word_table(self, old: str) -> None:
        """ Rename all tables and indexes used by the tokenizer.
        """
        with connect(self.dsn) as conn:
            with conn.cursor() as cur:
                cur.drop_table('word')
                cur.execute(f"ALTER TABLE {old} RENAME TO word")
                for idx in ('word_token', 'word_id'):
                    cur.execute(f"""ALTER INDEX idx_{old}_{idx}
                                    RENAME TO idx_word_{idx}""")
                for name, _ in WORD_TYPES:
                    cur.execute(f"""ALTER INDEX idx_{old}_{name}
                                    RENAME TO idx_word_{name}""")
            conn.commit()


class ICUNameAnalyzer(AbstractAnalyzer):
    """ The ICU analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn: str, sanitizer: PlaceSanitizer,
                 token_analysis: ICUTokenAnalysis) -> None:
        self.conn: Optional[Connection] = connect(dsn).connection
        self.conn.autocommit = True
        self.sanitizer = sanitizer
        self.token_analysis = token_analysis

        self._cache = _TokenCache()

    def close(self) -> None:
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    def _search_normalized(self, name: str) -> str:
        """ Return the search token transliteration of the given name.
        """
        return cast(str, self.token_analysis.search.transliterate(name)).strip()


    def _normalized(self, name: str) -> str:
        """ Return the normalized version of the given name with all
            non-relevant information removed.
        """
        return cast(str, self.token_analysis.normalizer.transliterate(name)).strip()

    def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is treated as a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and not necessarily efficient.
        """
        assert self.conn is not None
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self._search_normalized(word[1:])
            else:
                partial_tokens[word] = self._search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]

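    # Illustrative call (not part of the original module; word ids are made up):
    #
    #   analyzer.get_word_token_info(['#Main Street', 'main'])
    #   # -> [('#Main Street', 'main street', 1234), ('main', 'main', 567)]
    #   # A word id of None means no matching entry exists in the word table.
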
    def normalize_postcode(self, postcode: str) -> str:
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()

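    # Example (illustrative): the normalisation is deliberately minimal so that it
    # stays in sync with the SQL side:
    #
    #   analyzer.normalize_postcode(' ab1 2cd ')   # -> 'AB1 2CD'
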
    def update_postcodes_from_db(self) -> None:
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@postcode')

        with self.conn.cursor() as cur:
            # First get all postcode names currently in the word table.
            cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
            word_entries = set((entry[0] for entry in cur))

            # Then compute the required postcode names from the postcode table.
            needed_entries = set()
            cur.execute("SELECT country_code, postcode FROM location_postcode")
            for cc, postcode in cur:
                info = PlaceInfo({'country_code': cc,
                                  'class': 'place', 'type': 'postcode',
                                  'address': {'postcode': postcode}})
                address = self.sanitizer.process_names(info)[1]
                for place in address:
                    if place.kind == 'postcode':
                        if analyzer is None:
                            postcode_name = place.name.strip().upper()
                            variant_base = None
                        else:
                            postcode_name = analyzer.get_canonical_id(place)
                            variant_base = place.get_attr("variant")

                        if variant_base:
                            needed_entries.add(f'{postcode_name}@{variant_base}')
                        else:
                            needed_entries.add(postcode_name)
                        break

        # Now update the word table.
        self._delete_unused_postcode_words(word_entries - needed_entries)
        self._add_missing_postcode_words(needed_entries - word_entries)

    def _delete_unused_postcode_words(self, tokens: Iterable[str]) -> None:
        assert self.conn is not None
        if tokens:
            with self.conn.cursor() as cur:
                cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
                            (list(tokens), ))

    def _add_missing_postcode_words(self, tokens: Iterable[str]) -> None:
        assert self.conn is not None
        if not tokens:
            return

        analyzer = self.token_analysis.analysis.get('@postcode')
        terms = []

        for postcode_name in tokens:
            if '@' in postcode_name:
                term, variant = postcode_name.split('@', 2)
                term = self._search_normalized(term)
                if analyzer is None:
                    variants = [term]
                else:
                    variants = analyzer.compute_variants(variant)
                    if term not in variants:
                        variants.append(term)
            else:
                variants = [self._search_normalized(postcode_name)]
            terms.append((postcode_name, variants))

        if terms:
            with self.conn.cursor() as cur:
                cur.execute_values("""SELECT create_postcode_word(pc, var)
                                      FROM (VALUES %s) AS v(pc, var)""",
                                   terms)

    def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
                               should_replace: bool) -> None:
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases is
            completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        assert self.conn is not None
        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)

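    # Illustrative input (not from the original source): phrases arrive as
    # (label, class, type, operator) tuples; '-' stands for "no operator", and
    # only 'in' and 'near' are kept when the phrase is stored:
    #
    #   analyzer.update_special_phrases([('pubs in', 'amenity', 'pub', 'in'),
    #                                    ('pub', 'amenity', 'pub', '-')],
    #                                   should_replace=True)
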
    def _add_special_phrases(self, cursor: Cursor,
                             new_phrases: Set[Tuple[str, str, str, str]],
                             existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self._search_normalized(word)
                if term:
                    copystr.add(term, 'S', word,
                                json.dumps({'class': cls, 'type': typ,
                                            'op': oper if oper in ('in', 'near') else None}))
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word_token', 'type', 'word', 'info'])

        return added

    def _remove_special_phrases(self, cursor: Cursor,
                                new_phrases: Set[Tuple[str, str, str, str]],
                                existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.execute_values(
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE type = 'S' and word = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
                """, to_delete)

        return len(to_delete)

    def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
        """ Add default names for the given country to the search index.
        """
        # Make sure any name preprocessing for country names applies.
        info = PlaceInfo({'name': names, 'country_code': country_code,
                          'rank_address': 4, 'class': 'boundary',
                          'type': 'administrative'})
        self._add_country_full_names(country_code,
                                     self.sanitizer.process_names(info)[0],
                                     internal=True)

    def _add_country_full_names(self, country_code: str, names: Sequence[PlaceName],
                                internal: bool = False) -> None:
        """ Add names for the given country from an already sanitized
            name list.
        """
        assert self.conn is not None
        word_tokens = set()
        for name in names:
            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            # Get existing names.
            cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
                             FROM word
                             WHERE type = 'C' and word = %s""",
                        (country_code, ))
            # internal/external names
            existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()}
            for word in cur:
                existing_tokens[word[1]].add(word[0])

            # Delete names that no longer exist.
            gone_tokens = existing_tokens[internal] - word_tokens
            if internal:
                gone_tokens.update(existing_tokens[False] & word_tokens)
            if gone_tokens:
                cur.execute("""DELETE FROM word
                               USING unnest(%s) as token
                               WHERE type = 'C' and word = %s
                                     and word_token = token""",
                            (list(gone_tokens), country_code))

            # Only add those names that are not yet in the list.
            new_tokens = word_tokens - existing_tokens[True]
            if not internal:
                new_tokens -= existing_tokens[False]

            if new_tokens:
                if internal:
                    sql = """INSERT INTO word (word_token, type, word, info)
                               (SELECT token, 'C', %s, '{"internal": "yes"}'
                                  FROM unnest(%s) as token)
                          """
                else:
                    sql = """INSERT INTO word (word_token, type, word)
                               (SELECT token, 'C', %s
                                  FROM unnest(%s) as token)
                          """
                cur.execute(sql, (country_code, list(new_tokens)))

    def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
        """ Determine tokenizer information about the given place.

            Returns a JSON-serializable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo()

        names, address = self.sanitizer.process_names(place)

        if names:
            token_info.set_names(*self._compute_name_tokens(names))

            if place.is_country():
                assert place.country_code is not None
                self._add_country_full_names(place.country_code, names)

        if address:
            self._process_place_address(token_info, address)

        return token_info.to_dict()

    def _process_place_address(self, token_info: '_TokenInfo',
                               address: Sequence[PlaceName]) -> None:
        for item in address:
            if item.kind == 'postcode':
                token_info.set_postcode(self._add_postcode(item))
            elif item.kind == 'housenumber':
                token_info.add_housenumber(*self._compute_housenumber_token(item))
            elif item.kind == 'street':
                token_info.add_street(self._retrieve_full_tokens(item.name))
            elif item.kind == 'place':
                if not item.suffix:
                    token_info.add_place(self._compute_partial_tokens(item.name))
            elif not item.kind.startswith('_') and not item.suffix and \
                 item.kind not in ('country', 'full', 'inclusion'):
                token_info.add_address_term(item.kind, self._compute_partial_tokens(item.name))

    def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
        """ Normalize the housenumber and return the word token and the
            canonical form.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@housenumber')
        result: Tuple[Optional[int], Optional[str]] = (None, None)

        if analyzer is None:
            # When no custom analyzer is set, simply normalize and transliterate.
            norm_name = self._search_normalized(hnr.name)
            if norm_name:
                result = self._cache.housenumbers.get(norm_name, result)
                if result[0] is None:
                    with self.conn.cursor() as cur:
                        hid = cur.scalar("SELECT getorcreate_hnr_id(%s)", (norm_name, ))

                        result = hid, norm_name
                        self._cache.housenumbers[norm_name] = result
        else:
            # Otherwise use the analyzer to determine the canonical name.
            # Per convention we use the first variant as the 'lookup name', the
            # name that gets saved in the housenumber field of the place.
            word_id = analyzer.get_canonical_id(hnr)
            if word_id:
                result = self._cache.housenumbers.get(word_id, result)
                if result[0] is None:
                    variants = analyzer.compute_variants(word_id)
                    if variants:
                        with self.conn.cursor() as cur:
                            hid = cur.scalar("SELECT create_analyzed_hnr_id(%s, %s)",
                                             (word_id, list(variants)))
                            result = hid, variants[0]
                            self._cache.housenumbers[word_id] = result

        return result

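    # Illustrative behaviour (not from the original source; the PlaceName call
    # signature is assumed): repeated housenumbers are served from the cache
    # instead of hitting the database again.
    #
    #   token, lookup = analyzer._compute_housenumber_token(
    #       PlaceName('12 B', 'housenumber', None))
    #   # token  -> word id from getorcreate_hnr_id('12 B')
    #   # lookup -> '12 B' (the value stored in the housenumber field)
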
    def _compute_partial_tokens(self, name: str) -> List[int]:
        """ Normalize the given term, split it into partial words and return
            the token list for them.
        """
        assert self.conn is not None
        norm_name = self._search_normalized(name)

        tokens = []
        need_lookup = []
        for partial in norm_name.split():
            token = self._cache.partials.get(partial)
            if token:
                tokens.append(token)
            else:
                need_lookup.append(partial)

        if need_lookup:
            with self.conn.cursor() as cur:
                cur.execute("""SELECT word, getorcreate_partial_word(word)
                               FROM unnest(%s) word""",
                            (need_lookup, ))

                for partial, token in cur:
                    assert token is not None
                    tokens.append(token)
                    self._cache.partials[partial] = token

        return tokens

    def _retrieve_full_tokens(self, name: str) -> List[int]:
        """ Get the full name token for the given name, if it exists.
            The name is only retrieved for the standard analyser.
        """
        assert self.conn is not None
        norm_name = self._search_normalized(name)

        # return cached if possible
        if norm_name in self._cache.fulls:
            return self._cache.fulls[norm_name]

        with self.conn.cursor() as cur:
            cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
                        (norm_name, ))
            full = [row[0] for row in cur]

        self._cache.fulls[norm_name] = full

        return full

    def _compute_name_tokens(self, names: Sequence[PlaceName]) -> Tuple[Set[int], Set[int]]:
        """ Computes the full name and partial name tokens for the given
            list of names.
        """
        assert self.conn is not None
        full_tokens: Set[int] = set()
        partial_tokens: Set[int] = set()

        for name in names:
            analyzer_id = name.get_attr('analyzer')
            analyzer = self.token_analysis.get_analyzer(analyzer_id)
            word_id = analyzer.get_canonical_id(name)
            if analyzer_id is None:
                token_id = word_id
            else:
                token_id = f'{word_id}@{analyzer_id}'

            full, part = self._cache.names.get(token_id, (None, None))
            if full is None:
                variants = analyzer.compute_variants(word_id)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
                                (token_id, variants))
                    full, part = cast(Tuple[int, List[int]], cur.fetchone())

                self._cache.names[token_id] = (full, part)

            assert part is not None

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens

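    # Illustrative note (not from the original source): the cache key combines the
    # canonical name with the analyzer id, so the same spelling handled by
    # different analysis modules gets separate word entries, e.g.
    #
    #   token_id = 'firenze'        # default analyzer
    #   token_id = 'firenze@it'     # name processed by the analyzer named 'it'
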
    def _add_postcode(self, item: PlaceName) -> Optional[str]:
        """ Make sure the normalized postcode is present in the word table.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@postcode')

        if analyzer is None:
            postcode_name = item.name.strip().upper()
            variant_base = None
        else:
            postcode_name = analyzer.get_canonical_id(item)
            variant_base = item.get_attr("variant")

        if variant_base:
            postcode = f'{postcode_name}@{variant_base}'
        else:
            postcode = postcode_name

        if postcode not in self._cache.postcodes:
            term = self._search_normalized(postcode_name)
            if not term:
                return None

            variants = {term}
            if analyzer is not None and variant_base:
                variants.update(analyzer.compute_variants(variant_base))

            with self.conn.cursor() as cur:
                cur.execute("SELECT create_postcode_word(%s, %s)",
                            (postcode, list(variants)))
            self._cache.postcodes.add(postcode)

        return postcode_name


class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self) -> None:
        self.names: Optional[str] = None
        self.housenumbers: Set[str] = set()
        self.housenumber_tokens: Set[int] = set()
        self.street_tokens: Optional[Set[int]] = None
        self.place_tokens: Set[int] = set()
        self.address_tokens: Dict[str, str] = {}
        self.postcode: Optional[str] = None

    def _mk_array(self, tokens: Iterable[Any]) -> str:
        return f"{{{','.join((str(s) for s in tokens))}}}"

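    # Example (illustrative): the result is a PostgreSQL array literal passed on
    # as plain text, e.g.
    #
    #   _TokenInfo()._mk_array([1, 2, 3])   # -> '{1,2,3}'
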
    def to_dict(self) -> Dict[str, Any]:
        """ Return the token information in database importable format.
        """
        out: Dict[str, Any] = {}

        if self.names:
            out['names'] = self.names

        if self.housenumbers:
            out['hnr'] = ';'.join(self.housenumbers)
            out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)

        if self.street_tokens is not None:
            out['street'] = self._mk_array(self.street_tokens)

        if self.place_tokens:
            out['place'] = self._mk_array(self.place_tokens)

        if self.address_tokens:
            out['addr'] = self.address_tokens

        if self.postcode:
            out['postcode'] = self.postcode

        return out

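    # Illustrative output (values made up, not from the original source): a fully
    # populated place might serialize to
    #
    #   {'names': '{101,102,201}',
    #    'hnr': '12;12a', 'hnr_tokens': '{301,302}',
    #    'street': '{401}', 'place': '{501}',
    #    'addr': {'city': '{601}'}, 'postcode': '12345'}
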
    def set_names(self, fulls: Iterable[int], partials: Iterable[int]) -> None:
        """ Adds token information for the normalised names.
        """
        self.names = self._mk_array(itertools.chain(fulls, partials))


    def add_housenumber(self, token: Optional[int], hnr: Optional[str]) -> None:
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        if token:
            assert hnr is not None
            self.housenumbers.add(hnr)
            self.housenumber_tokens.add(token)


    def add_street(self, tokens: Iterable[int]) -> None:
        """ Add addr:street match terms.
        """
        if self.street_tokens is None:
            self.street_tokens = set()
        self.street_tokens.update(tokens)


    def add_place(self, tokens: Iterable[int]) -> None:
        """ Add addr:place search and match terms.
        """
        self.place_tokens.update(tokens)


    def add_address_term(self, key: str, partials: Iterable[int]) -> None:
        """ Add additional address terms.
        """
        if partials:
            self.address_tokens[key] = self._mk_array(partials)


    def set_postcode(self, postcode: Optional[str]) -> None:
        """ Set the postcode to the given one.
        """
        self.postcode = postcode


class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self) -> None:
        self.names: Dict[str, Tuple[int, List[int]]] = {}
        self.partials: Dict[str, int] = {}
        self.fulls: Dict[str, List[int]] = {}
        self.postcodes: Set[str] = set()
        self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {}