# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \
                   Dict, Set, Iterable
import itertools
import json
import logging
from pathlib import Path
from textwrap import dedent
from nominatim.db.connection import connect, Connection, Cursor
from nominatim.config import Configuration
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.data.place_info import PlaceInfo
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
from nominatim.data.place_name import PlaceName
from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()

WORD_TYPES = (('country_names', 'C'),
              ('postcodes', 'P'),
              ('full_word', 'W'),
              ('housenumbers', 'H'))
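
# Note: the letters in WORD_TYPES are the values of the `type` column in the word
# table for entries that get a partial index of their own. The queries in this
# module additionally use 'w' for partial name tokens and 'S' for special phrases.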


def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
    """ Create a new instance of the tokenizer provided by this module.
    """
    return ICUTokenizer(dsn, data_dir)


class ICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn: str, data_dir: Path) -> None:
        self.dsn = dsn
        self.data_dir = data_dir
        self.loader: Optional[ICURuleLoader] = None


    def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
        """ Set up a new tokenizer for the database.

            This copies all necessary data into the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        self.loader = ICURuleLoader(config)

        self._install_php(config.lib_dir.php, overwrite=True)
        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._setup_db_tables(config)
            self._create_base_indices(config, 'word')


    def init_from_project(self, config: Configuration) -> None:
        """ Initialise the tokenizer from the project directory.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)

        self._install_php(config.lib_dir.php, overwrite=False)


    def finalize_import(self, config: Configuration) -> None:
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        self._create_lookup_indices(config, 'word')


    def update_sql_functions(self, config: Configuration) -> None:
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')


    def check_database(self, config: Configuration) -> None:
        """ Check that the tokenizer is set up correctly.
        """
        # Will throw an error if there is an issue.
        self.init_from_project(config)


    def update_statistics(self, config: Configuration) -> None:
        """ Recompute frequencies for all name words.
        """
        with connect(self.dsn) as conn:
            if not conn.table_exists('search_name'):
                return

            with conn.cursor() as cur:
                LOG.info('Computing word frequencies')
                cur.drop_table('word_frequencies')
                cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                 SELECT unnest(name_vector) as id, count(*)
                                 FROM search_name GROUP BY id""")
                cur.execute('CREATE INDEX ON word_frequencies(id)')
                LOG.info('Update word table with recomputed frequencies')
                cur.drop_table('tmp_word')
                cur.execute("""CREATE TABLE tmp_word AS
                                SELECT word_id, word_token, type, word,
                                       (CASE WHEN wf.count is null THEN info
                                             ELSE info || jsonb_build_object('count', wf.count)
                                        END) as info
                                FROM word LEFT JOIN word_frequencies wf
                                     ON word.word_id = wf.id""")
                cur.drop_table('word_frequencies')

            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_string(conn,
                            'GRANT SELECT ON tmp_word TO "{{config.DATABASE_WEBUSER}}"')
            conn.commit()

        self._create_base_indices(config, 'tmp_word')
        self._create_lookup_indices(config, 'tmp_word')
        self._move_temporary_word_table('tmp_word')
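
    # The statistics update rebuilds the word table as `tmp_word`, indexes it and
    # only then swaps it in via _move_temporary_word_table(), so lookups keep
    # working on the old table while the new one is being prepared.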


    def _cleanup_housenumbers(self) -> None:
        """ Remove unused house numbers.
        """
        with connect(self.dsn) as conn:
            if not conn.table_exists('search_name'):
                return
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
                               FROM word
                               WHERE type = 'H'
                                 AND NOT EXISTS(SELECT * FROM search_name
                                                WHERE ARRAY[word.word_id] && name_vector)
                                 AND (char_length(coalesce(word, word_token)) > 6
                                      OR coalesce(word, word_token) not similar to '\\d+')
                            """)
                candidates = {token: wid for wid, token in cur}
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT housenumber FROM placex
                               WHERE housenumber is not null
                                     AND (char_length(housenumber) > 6
                                          OR housenumber not similar to '\\d+')
                            """)
                for row in cur:
                    for hnr in row[0].split(';'):
                        candidates.pop(hnr, None)
            LOG.info("There are %s outdated housenumbers.", len(candidates))
            LOG.debug("Outdated housenumbers: %s", candidates.keys())
            if candidates:
                with conn.cursor() as cur:
                    cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
                                (list(candidates.values()), ))
                conn.commit()
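
    # Cleanup strategy: the first server-side cursor collects housenumber tokens
    # that no longer occur in any search_name vector, the second pass drops every
    # candidate still referenced from placex, and only the remainder is deleted.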


    def update_word_tokens(self) -> None:
        """ Remove unused tokens.
        """
        LOG.warning("Cleaning up housenumber tokens.")
        self._cleanup_housenumbers()
        LOG.warning("Tokenizer house-keeping done.")


    def name_analyzer(self) -> 'ICUNameAnalyzer':
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

            ```
            with tokenizer.name_analyzer() as analyzer:
                analyzer.process_place(place)
            ```

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        assert self.loader is not None
        return ICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                               self.loader.make_token_analysis())


    def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
        """ Return a list of the `num` most frequent full words
            in the database.
        """
        with conn.cursor() as cur:
            cur.execute("""SELECT word, sum((info->>'count')::int) as count
                             FROM word WHERE type = 'W'
                             GROUP BY word
                             ORDER BY count DESC LIMIT %s""", (num,))
            return list(s[0].split('@')[0] for s in cur)
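
    # Full words may be stored as '<name>@<analyzer>' when a custom token analyzer
    # was used (see ICUNameAnalyzer._compute_name_tokens()); the split('@') above
    # strips that suffix so only the plain word is returned.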


    def _install_php(self, phpdir: Optional[Path], overwrite: bool = True) -> None:
        """ Install the php script for the tokenizer.
        """
        if phpdir is not None:
            assert self.loader is not None
            php_file = self.data_dir / "tokenizer.php"

            if not php_file.exists() or overwrite:
                php_file.write_text(dedent(f"""\
                    <?php
                    @define('CONST_Max_Word_Frequency', 10000000);
                    @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
                    @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
                    require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')


    def _save_config(self) -> None:
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        assert self.loader is not None
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)


    def _setup_db_tables(self, config: Configuration) -> None:
        """ Set up the word table and the seq_word sequence used for
            assigning new word ids.
        """
        with connect(self.dsn) as conn:
            with conn.cursor() as cur:
                cur.drop_table('word')
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_string(conn, """
                CREATE TABLE word (
                      word_id INTEGER,
                      word_token text NOT NULL,
                      type text NOT NULL,
                      word text,
                      info jsonb
                    ) {{db.tablespace.search_data}};
                GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";

                DROP SEQUENCE IF EXISTS seq_word;
                CREATE SEQUENCE seq_word start 1;
                GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
            """)
            conn.commit()
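
    # The info column carries per-type extras used elsewhere in this module, e.g.
    # 'count' for word frequencies, 'lookup' for housenumbers, 'class'/'type'/'op'
    # for special phrases and 'internal' for default country names.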


    def _create_base_indices(self, config: Configuration, table_name: str) -> None:
        """ Create the indexes on the word table that are needed during import.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_string(conn,
                            """CREATE INDEX idx_{{table_name}}_word_token ON {{table_name}}
                               USING BTREE (word_token) {{db.tablespace.search_index}}""",
                            table_name=table_name)
            for name, ctype in WORD_TYPES:
                sqlp.run_string(conn,
                                """CREATE INDEX idx_{{table_name}}_{{idx_name}} ON {{table_name}}
                                   USING BTREE (word) {{db.tablespace.address_index}}
                                   WHERE type = '{{column_type}}'
                                """,
                                table_name=table_name, idx_name=name,
                                column_type=ctype)
            conn.commit()


    def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
        """ Create additional indexes used when running the API.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            # Index required for details lookup.
            sqlp.run_string(conn, """
                CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
                  ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
            """,
                            table_name=table_name)
            conn.commit()


    def _move_temporary_word_table(self, old: str) -> None:
        """ Rename all tables and indexes used by the tokenizer.
        """
        with connect(self.dsn) as conn:
            with conn.cursor() as cur:
                cur.drop_table('word')
                cur.execute(f"ALTER TABLE {old} RENAME TO word")
                for idx in ('word_token', 'word_id'):
                    cur.execute(f"""ALTER INDEX idx_{old}_{idx}
                                      RENAME TO idx_word_{idx}""")
                for name, _ in WORD_TYPES:
                    cur.execute(f"""ALTER INDEX idx_{old}_{name}
                                    RENAME TO idx_word_{name}""")
            conn.commit()


class ICUNameAnalyzer(AbstractAnalyzer):
    """ The ICU analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn: str, sanitizer: PlaceSanitizer,
                 token_analysis: ICUTokenAnalysis) -> None:
        self.conn: Optional[Connection] = connect(dsn).connection
        self.conn.autocommit = True
        self.sanitizer = sanitizer
        self.token_analysis = token_analysis

        self._cache = _TokenCache()


    def close(self) -> None:
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    def _search_normalized(self, name: str) -> str:
        """ Return the search token transliteration of the given name.
        """
        return cast(str, self.token_analysis.search.transliterate(name)).strip()


    def _normalized(self, name: str) -> str:
        """ Return the normalized version of the given name with all
            non-relevant information removed.
        """
        return cast(str, self.token_analysis.normalizer.transliterate(name)).strip()


    def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not necessarily efficient.
        """
        assert self.conn is not None
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self._search_normalized(word[1:])
            else:
                partial_tokens[word] = self._search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
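
    # Reminder: type 'W' holds full-name tokens while type 'w' holds partial-word
    # tokens, which is why the two lookups above query different type values.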


    def normalize_postcode(self, postcode: str) -> str:
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()


    def update_postcodes_from_db(self) -> None:
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@postcode')

        with self.conn.cursor() as cur:
            # First get all postcode names currently in the word table.
            cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
            word_entries = set((entry[0] for entry in cur))

            # Then compute the required postcode names from the postcode table.
            needed_entries = set()
            cur.execute("SELECT country_code, postcode FROM location_postcode")
            for cc, postcode in cur:
                info = PlaceInfo({'country_code': cc,
                                  'class': 'place', 'type': 'postcode',
                                  'address': {'postcode': postcode}})
                address = self.sanitizer.process_names(info)[1]
                for place in address:
                    if place.kind == 'postcode':
                        if analyzer is None:
                            postcode_name = place.name.strip().upper()
                            variant_base = None
                        else:
                            postcode_name = analyzer.get_canonical_id(place)
                            variant_base = place.get_attr("variant")

                        if variant_base:
                            needed_entries.add(f'{postcode_name}@{variant_base}')
                        else:
                            needed_entries.add(postcode_name)
                        break

        # Now update the word table.
        self._delete_unused_postcode_words(word_entries - needed_entries)
        self._add_missing_postcode_words(needed_entries - word_entries)
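
    # Postcodes with a variant are stored as '<canonical>@<variant>' in the word
    # column; _add_missing_postcode_words() below splits that form again to build
    # the transliterated variants for the word_token side.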

    def _delete_unused_postcode_words(self, tokens: Iterable[str]) -> None:
        assert self.conn is not None
        if tokens:
            with self.conn.cursor() as cur:
                cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
                            (list(tokens), ))

    def _add_missing_postcode_words(self, tokens: Iterable[str]) -> None:
        assert self.conn is not None
        if not tokens:
            return

        analyzer = self.token_analysis.analysis.get('@postcode')
        terms = []

        for postcode_name in tokens:
            if '@' in postcode_name:
                term, variant = postcode_name.split('@', 2)
                term = self._search_normalized(term)
                if analyzer is None:
                    variants = [term]
                else:
                    variants = analyzer.compute_variants(variant)
                    if term not in variants:
                        variants.append(term)
            else:
                variants = [self._search_normalized(postcode_name)]
            terms.append((postcode_name, variants))

        if terms:
            with self.conn.cursor() as cur:
                cur.execute_values("""SELECT create_postcode_word(pc, var)
                                      FROM (VALUES %s) AS v(pc, var)""",
                                   terms)


    def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
                               should_replace: bool) -> None:
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases is
            completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        assert self.conn is not None
        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)


    def _add_special_phrases(self, cursor: Cursor,
                             new_phrases: Set[Tuple[str, str, str, str]],
                             existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self._search_normalized(word)
                if term:
                    copystr.add(term, 'S', word,
                                json.dumps({'class': cls, 'type': typ,
                                            'op': oper if oper in ('in', 'near') else None}))
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word_token', 'type', 'word', 'info'])

        return added


    def _remove_special_phrases(self, cursor: Cursor,
                                new_phrases: Set[Tuple[str, str, str, str]],
                                existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.execute_values(
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE type = 'S' and word = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
                """, to_delete)

        return len(to_delete)


    def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
        """ Add default names for the given country to the search index.
        """
        # Make sure any name preprocessing for country names applies.
        info = PlaceInfo({'name': names, 'country_code': country_code,
                          'rank_address': 4, 'class': 'boundary',
                          'type': 'administrative'})
        self._add_country_full_names(country_code,
                                     self.sanitizer.process_names(info)[0],
                                     internal=True)


    def _add_country_full_names(self, country_code: str, names: Sequence[PlaceName],
                                internal: bool = False) -> None:
        """ Add names for the given country from an already sanitized
            name list.
        """
        assert self.conn is not None
        word_tokens = set()
        for name in names:
            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            # Get existing names.
            cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
                             FROM word
                             WHERE type = 'C' and word = %s""",
                        (country_code, ))
            # internal/external names
            existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()}
            for word in cur:
                existing_tokens[word[1]].add(word[0])

            # Delete names that no longer exist.
            gone_tokens = existing_tokens[internal] - word_tokens
            if internal:
                gone_tokens.update(existing_tokens[False] & word_tokens)
            if gone_tokens:
                cur.execute("""DELETE FROM word
                               USING unnest(%s) as token
                               WHERE type = 'C' and word = %s
                                     and word_token = token""",
                            (list(gone_tokens), country_code))

            # Only add those names that are not yet in the list.
            new_tokens = word_tokens - existing_tokens[True]
            if not internal:
                new_tokens -= existing_tokens[False]
            if new_tokens:
                if internal:
                    sql = """INSERT INTO word (word_token, type, word, info)
                               (SELECT token, 'C', %s, '{"internal": "yes"}'
                                  FROM unnest(%s) as token)
                           """
                else:
                    sql = """INSERT INTO word (word_token, type, word)
                               (SELECT token, 'C', %s
                                  FROM unnest(%s) as token)
                          """
                cur.execute(sql, (country_code, list(new_tokens)))
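
    # Country names added with internal=True come from add_country_names() above
    # (the default name list), while names from OSM country objects are added from
    # process_place() without the flag. The 'internal' marker in the info column
    # is what lets the two sets be updated independently.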


    def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
        """ Determine tokenizer information about the given place.

            Returns a JSON-serializable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo()

        names, address = self.sanitizer.process_names(place)

        if names:
            token_info.set_names(*self._compute_name_tokens(names))

            if place.is_country():
                assert place.country_code is not None
                self._add_country_full_names(place.country_code, names)

        if address:
            self._process_place_address(token_info, address)

        return token_info.to_dict()


    def _process_place_address(self, token_info: '_TokenInfo',
                               address: Sequence[PlaceName]) -> None:
        for item in address:
            if item.kind == 'postcode':
                token_info.set_postcode(self._add_postcode(item))
            elif item.kind == 'housenumber':
                token_info.add_housenumber(*self._compute_housenumber_token(item))
            elif item.kind == 'street':
                token_info.add_street(self._retrieve_full_tokens(item.name))
            elif item.kind == 'place':
                if not item.suffix:
                    token_info.add_place(self._compute_partial_tokens(item.name))
            elif not item.kind.startswith('_') and not item.suffix and \
                 item.kind not in ('country', 'full', 'inclusion'):
                token_info.add_address_term(item.kind, self._compute_partial_tokens(item.name))


    def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
        """ Normalize the housenumber and return the word token and the
            canonical form.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@housenumber')
        result: Tuple[Optional[int], Optional[str]] = (None, None)

        if analyzer is None:
            # When no custom analyzer is set, simply normalize and transliterate.
            norm_name = self._search_normalized(hnr.name)
            if norm_name:
                result = self._cache.housenumbers.get(norm_name, result)
                if result[0] is None:
                    with self.conn.cursor() as cur:
                        hid = cur.scalar("SELECT getorcreate_hnr_id(%s)", (norm_name, ))

                        result = hid, norm_name
                        self._cache.housenumbers[norm_name] = result
        else:
            # Otherwise use the analyzer to determine the canonical name.
            # Per convention we use the first variant as the 'lookup name', the
            # name that gets saved in the housenumber field of the place.
            word_id = analyzer.get_canonical_id(hnr)
            if word_id:
                result = self._cache.housenumbers.get(word_id, result)
                if result[0] is None:
                    variants = analyzer.compute_variants(word_id)
                    if variants:
                        with self.conn.cursor() as cur:
                            hid = cur.scalar("SELECT create_analyzed_hnr_id(%s, %s)",
                                             (word_id, list(variants)))
                            result = hid, variants[0]
                            self._cache.housenumbers[word_id] = result

        return result
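
    # Both branches go through self._cache.housenumbers, so a housenumber that is
    # repeated across places within one analyzer session triggers only a single
    # call to the SQL helper functions.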


    def _compute_partial_tokens(self, name: str) -> List[int]:
        """ Normalize the given term, split it into partial words and return
            the token list for them.
        """
        assert self.conn is not None
        norm_name = self._search_normalized(name)

        tokens = []
        need_lookup = []
        for partial in norm_name.split():
            token = self._cache.partials.get(partial)
            if token:
                tokens.append(token)
            else:
                need_lookup.append(partial)

        if need_lookup:
            with self.conn.cursor() as cur:
                cur.execute("""SELECT word, getorcreate_partial_word(word)
                               FROM unnest(%s) word""",
                            (need_lookup, ))

                for partial, token in cur:
                    assert token is not None
                    tokens.append(token)
                    self._cache.partials[partial] = token

        return tokens


    def _retrieve_full_tokens(self, name: str) -> List[int]:
        """ Get the full name token for the given name, if it exists.
            The name is only retrieved for the standard analyser.
        """
        assert self.conn is not None
        norm_name = self._search_normalized(name)

        # return cached if possible
        if norm_name in self._cache.fulls:
            return self._cache.fulls[norm_name]

        with self.conn.cursor() as cur:
            cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
                        (norm_name, ))
            full = [row[0] for row in cur]

        self._cache.fulls[norm_name] = full

        return full


    def _compute_name_tokens(self, names: Sequence[PlaceName]) -> Tuple[Set[int], Set[int]]:
        """ Computes the full name and partial name tokens for the given
            list of names.
        """
        assert self.conn is not None
        full_tokens: Set[int] = set()
        partial_tokens: Set[int] = set()

        for name in names:
            analyzer_id = name.get_attr('analyzer')
            analyzer = self.token_analysis.get_analyzer(analyzer_id)
            word_id = analyzer.get_canonical_id(name)
            if analyzer_id is None:
                token_id = word_id
            else:
                token_id = f'{word_id}@{analyzer_id}'

            full, part = self._cache.names.get(token_id, (None, None))
            if full is None:
                variants = analyzer.compute_variants(word_id)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
                                (token_id, variants))
                    full, part = cast(Tuple[int, List[int]], cur.fetchone())

                self._cache.names[token_id] = (full, part)

            assert part is not None

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens


    def _add_postcode(self, item: PlaceName) -> Optional[str]:
        """ Make sure the normalized postcode is present in the word table.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@postcode')

        if analyzer is None:
            postcode_name = item.name.strip().upper()
            variant_base = None
        else:
            postcode_name = analyzer.get_canonical_id(item)
            variant_base = item.get_attr("variant")

        if variant_base:
            postcode = f'{postcode_name}@{variant_base}'
        else:
            postcode = postcode_name

        if postcode not in self._cache.postcodes:
            term = self._search_normalized(postcode_name)
            if not term:
                return None

            variants = {term}
            if analyzer is not None and variant_base:
                variants.update(analyzer.compute_variants(variant_base))

            with self.conn.cursor() as cur:
                cur.execute("SELECT create_postcode_word(%s, %s)",
                            (postcode, list(variants)))
            self._cache.postcodes.add(postcode)

        return postcode_name
817 """ Collect token information to be sent back to the database.
819 def __init__(self) -> None:
820 self.names: Optional[str] = None
821 self.housenumbers: Set[str] = set()
822 self.housenumber_tokens: Set[int] = set()
823 self.street_tokens: Optional[Set[int]] = None
824 self.place_tokens: Set[int] = set()
825 self.address_tokens: Dict[str, str] = {}
826 self.postcode: Optional[str] = None


    def _mk_array(self, tokens: Iterable[Any]) -> str:
        return f"{{{','.join((str(s) for s in tokens))}}}"


    def to_dict(self) -> Dict[str, Any]:
        """ Return the token information in database importable format.
        """
        out: Dict[str, Any] = {}

        if self.names:
            out['names'] = self.names

        if self.housenumbers:
            out['hnr'] = ';'.join(self.housenumbers)
            out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)

        if self.street_tokens is not None:
            out['street'] = self._mk_array(self.street_tokens)

        if self.place_tokens:
            out['place'] = self._mk_array(self.place_tokens)

        if self.address_tokens:
            out['addr'] = self.address_tokens

        if self.postcode:
            out['postcode'] = self.postcode

        return out
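
    # Illustrative example only (token ids are made up): a place with a name, a
    # housenumber and an addr:city might produce
    #   {'names': '{1,2,3}', 'hnr': '10;10a', 'hnr_tokens': '{42}',
    #    'addr': {'city': '{7,8}'}, 'postcode': '12345'}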


    def set_names(self, fulls: Iterable[int], partials: Iterable[int]) -> None:
        """ Adds token information for the normalised names.
        """
        self.names = self._mk_array(itertools.chain(fulls, partials))


    def add_housenumber(self, token: Optional[int], hnr: Optional[str]) -> None:
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        if token:
            assert hnr is not None
            self.housenumbers.add(hnr)
            self.housenumber_tokens.add(token)


    def add_street(self, tokens: Iterable[int]) -> None:
        """ Add addr:street match terms.
        """
        if self.street_tokens is None:
            self.street_tokens = set()
        self.street_tokens.update(tokens)


    def add_place(self, tokens: Iterable[int]) -> None:
        """ Add addr:place search and match terms.
        """
        self.place_tokens.update(tokens)


    def add_address_term(self, key: str, partials: Iterable[int]) -> None:
        """ Add additional address terms.
        """
        if partials:
            self.address_tokens[key] = self._mk_array(partials)

    def set_postcode(self, postcode: Optional[str]) -> None:
        """ Set the postcode to the given one.
        """
        self.postcode = postcode
903 """ Cache for token information to avoid repeated database queries.
905 This cache is not thread-safe and needs to be instantiated per
908 def __init__(self) -> None:
909 self.names: Dict[str, Tuple[int, List[int]]] = {}
910 self.partials: Dict[str, int] = {}
911 self.fulls: Dict[str, List[int]] = {}
912 self.postcodes: Set[str] = set()
913 self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {}