# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \
                   Dict, Set, Iterable
import itertools
import logging
from pathlib import Path

from psycopg.types.json import Jsonb
from psycopg import sql as pysql

from ..db.connection import connect, Connection, Cursor, server_version_tuple, \
                            drop_tables, table_exists, execute_scalar
from ..config import Configuration
from ..db.sql_preprocessor import SQLPreprocessor
from ..data.place_info import PlaceInfo
from ..data.place_name import PlaceName
from .icu_rule_loader import ICURuleLoader
from .place_sanitizer import PlaceSanitizer
from .icu_token_analysis import ICUTokenAnalysis
from .base import AbstractAnalyzer, AbstractTokenizer

DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()
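
# Word table `type` codes for which a dedicated partial index is created.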
WORD_TYPES = (('country_names', 'C'),
              ('postcodes', 'P'),
              ('full_word', 'W'),
              ('housenumbers', 'H'))


def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
    """ Create a new instance of the tokenizer provided by this module.
    """
    return ICUTokenizer(dsn, data_dir)


class ICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn: str, data_dir: Path) -> None:
        self.dsn = dsn
        self.data_dir = data_dir
        self.loader: Optional[ICURuleLoader] = None


    def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        self.loader = ICURuleLoader(config)

        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._setup_db_tables(config)
            self._create_base_indices(config, 'word')


    def init_from_project(self, config: Configuration) -> None:
        """ Initialise the tokenizer from the project directory.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)


    def finalize_import(self, config: Configuration) -> None:
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        self._create_lookup_indices(config, 'word')


    def update_sql_functions(self, config: Configuration) -> None:
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')


    def check_database(self, config: Configuration) -> None:
        """ Check that the tokenizer is set up correctly.
        """
        # Will throw an error if there is an issue.
        self.init_from_project(config)


    def update_statistics(self, config: Configuration, threads: int = 2) -> None:
        """ Recompute frequencies for all name words.
        """
        with connect(self.dsn) as conn:
            if not table_exists(conn, 'search_name'):
                return

            with conn.cursor() as cur:
                cur.execute('ANALYSE search_name')
                if threads > 1:
                    cur.execute(pysql.SQL('SET max_parallel_workers_per_gather TO {}')
                                     .format(pysql.Literal(min(threads, 6),)))
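
                # MATERIALIZED CTEs are only available from PostgreSQL 12 on,
                # so older servers fall back to explicit temporary tables.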
                if server_version_tuple(conn) < (12, 0):
                    LOG.info('Computing word frequencies')
                    drop_tables(conn, 'word_frequencies', 'addressword_frequencies')
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute('CREATE INDEX ON word_frequencies(id)')
                    cur.execute("""CREATE TEMP TABLE addressword_frequencies AS
                                     SELECT unnest(nameaddress_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute('CREATE INDEX ON addressword_frequencies(id)')
                    cur.execute("""CREATE OR REPLACE FUNCTION word_freq_update(wid INTEGER,
                                                                               INOUT info JSONB)
                                   AS $$
                                   DECLARE rec RECORD;
                                   BEGIN
                                   IF info is null THEN
                                     info = '{}'::jsonb;
                                   END IF;
                                   FOR rec IN SELECT count FROM word_frequencies WHERE id = wid
                                   LOOP
                                     info = info || jsonb_build_object('count', rec.count);
                                   END LOOP;
                                   FOR rec IN SELECT count FROM addressword_frequencies WHERE id = wid
                                   LOOP
                                     info = info || jsonb_build_object('addr_count', rec.count);
                                   END LOOP;
                                   IF info = '{}'::jsonb THEN
                                     info = null;
                                   END IF;
                                   END;
                                   $$ LANGUAGE plpgsql IMMUTABLE;
                                """)
                    LOG.info('Update word table with recomputed frequencies')
                    drop_tables(conn, 'tmp_word')
                    cur.execute("""CREATE TABLE tmp_word AS
                                    SELECT word_id, word_token, type, word,
                                           word_freq_update(word_id, info) as info
                                    FROM word
                                """)
                    drop_tables(conn, 'word_frequencies', 'addressword_frequencies')
                else:
                    LOG.info('Computing word frequencies')
                    drop_tables(conn, 'word_frequencies')
                    cur.execute("""
                      CREATE TEMP TABLE word_frequencies AS
                      WITH word_freq AS MATERIALIZED (
                               SELECT unnest(name_vector) as id, count(*)
                               FROM search_name GROUP BY id),
                           addr_freq AS MATERIALIZED (
                               SELECT unnest(nameaddress_vector) as id, count(*)
                               FROM search_name GROUP BY id)
                      SELECT coalesce(a.id, w.id) as id,
                             (CASE WHEN w.count is null THEN '{}'::JSONB
                                  ELSE jsonb_build_object('count', w.count) END
                              ||
                              CASE WHEN a.count is null THEN '{}'::JSONB
                                  ELSE jsonb_build_object('addr_count', a.count) END) as info
                      FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id;
                      """)
                    cur.execute('CREATE UNIQUE INDEX ON word_frequencies(id) INCLUDE(info)')
                    cur.execute('ANALYSE word_frequencies')
                    LOG.info('Update word table with recomputed frequencies')
                    drop_tables(conn, 'tmp_word')
                    cur.execute("""CREATE TABLE tmp_word AS
                                    SELECT word_id, word_token, type, word,
                                           (CASE WHEN wf.info is null THEN word.info
                                            ELSE coalesce(word.info, '{}'::jsonb) || wf.info
                                            END) as info
                                    FROM word LEFT JOIN word_frequencies wf
                                         ON word.word_id = wf.id
                                """)
                    drop_tables(conn, 'word_frequencies')

            with conn.cursor() as cur:
                cur.execute('SET max_parallel_workers_per_gather TO 0')

            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_string(conn,
                            'GRANT SELECT ON tmp_word TO "{{config.DATABASE_WEBUSER}}"')
            conn.commit()
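        # Build the new indexes on the temporary table first, then swap it in
        # for the live 'word' table so lookups keep working during the rebuild.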
        self._create_base_indices(config, 'tmp_word')
        self._create_lookup_indices(config, 'tmp_word')
        self._move_temporary_word_table('tmp_word')


    def _cleanup_housenumbers(self) -> None:
        """ Remove unused house numbers.
        """
        with connect(self.dsn) as conn:
            if not table_exists(conn, 'search_name'):
                return
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
                               FROM word
                               WHERE type = 'H'
                                 AND NOT EXISTS(SELECT * FROM search_name
                                                WHERE ARRAY[word.word_id] && name_vector)
                                 AND (char_length(coalesce(word, word_token)) > 6
                                      OR coalesce(word, word_token) not similar to '\\d+')
                            """)
                candidates = {token: wid for wid, token in cur}
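            # Keep any candidate that is still referenced by a housenumber in placex.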
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT housenumber FROM placex
                               WHERE housenumber is not null
                                     AND (char_length(housenumber) > 6
                                          OR housenumber not similar to '\\d+')
                            """)
                for row in cur:
                    for hnr in row[0].split(';'):
                        candidates.pop(hnr, None)
            LOG.info("There are %s outdated housenumbers.", len(candidates))
            LOG.debug("Outdated housenumbers: %s", candidates.keys())
            if candidates:
                with conn.cursor() as cur:
                    cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
                                (list(candidates.values()), ))
                conn.commit()


    def update_word_tokens(self) -> None:
        """ Remove unused tokens.
        """
        LOG.warning("Cleaning up housenumber tokens.")
        self._cleanup_housenumbers()
        LOG.warning("Tokenizer house-keeping done.")


    def name_analyzer(self) -> 'ICUNameAnalyzer':
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

            ```
            with tokenizer.name_analyzer() as analyzer:
                analyzer.tokenize()
            ```

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        assert self.loader is not None
        return ICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                               self.loader.make_token_analysis())


    def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
        """ Return a list of the `num` most frequent full words
            in the database.
        """
        with conn.cursor() as cur:
            cur.execute("""SELECT word, sum((info->>'count')::int) as count
                             FROM word WHERE type = 'W'
                             GROUP BY word
                             ORDER BY count DESC LIMIT %s""", (num,))
            return list(s[0].split('@')[0] for s in cur)


    def _save_config(self) -> None:
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        assert self.loader is not None
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)


    def _setup_db_tables(self, config: Configuration) -> None:
        """ Set up the word table and the word id sequence.
        """
        with connect(self.dsn) as conn:
            drop_tables(conn, 'word')
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_string(conn, """
                CREATE TABLE word (
                      word_id INTEGER,
                      word_token text NOT NULL,
                      type text NOT NULL,
                      word text,
                      info jsonb
                    ) {{db.tablespace.search_data}};
                GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";

                DROP SEQUENCE IF EXISTS seq_word;
                CREATE SEQUENCE seq_word start 1;
                GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
            """)
            conn.commit()


    def _create_base_indices(self, config: Configuration, table_name: str) -> None:
        """ Create the basic lookup indexes for the given word table.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_string(conn,
                            """CREATE INDEX idx_{{table_name}}_word_token ON {{table_name}}
                               USING BTREE (word_token) {{db.tablespace.search_index}}""",
                            table_name=table_name)
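            # One partial index per word type listed in WORD_TYPES (country names,
            # postcodes, full words, housenumbers) keeps type-filtered lookups cheap.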
            for name, ctype in WORD_TYPES:
                sqlp.run_string(conn,
                                """CREATE INDEX idx_{{table_name}}_{{idx_name}} ON {{table_name}}
                                   USING BTREE (word) {{db.tablespace.address_index}}
                                   WHERE type = '{{column_type}}'
                                """,
                                table_name=table_name, idx_name=name,
                                column_type=ctype)
            conn.commit()


    def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
        """ Create additional indexes used when running the API.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            # Index required for details lookup.
            sqlp.run_string(conn, """
                CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
                  ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
            """,
                            table_name=table_name)
            conn.commit()


    def _move_temporary_word_table(self, old: str) -> None:
        """ Rename all tables and indexes used by the tokenizer.
        """
        with connect(self.dsn) as conn:
            drop_tables(conn, 'word')
            with conn.cursor() as cur:
                cur.execute(f"ALTER TABLE {old} RENAME TO word")
                for idx in ('word_token', 'word_id'):
                    cur.execute(f"""ALTER INDEX idx_{old}_{idx}
                                      RENAME TO idx_word_{idx}""")
                for name, _ in WORD_TYPES:
                    cur.execute(f"""ALTER INDEX idx_{old}_{name}
                                    RENAME TO idx_word_{name}""")
            conn.commit()


class ICUNameAnalyzer(AbstractAnalyzer):
    """ The ICU analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn: str, sanitizer: PlaceSanitizer,
                 token_analysis: ICUTokenAnalysis) -> None:
        self.conn: Optional[Connection] = connect(dsn)
        self.conn.autocommit = True
        self.sanitizer = sanitizer
        self.token_analysis = token_analysis

        self._cache = _TokenCache()


    def close(self) -> None:
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    def _search_normalized(self, name: str) -> str:
        """ Return the search token transliteration of the given name.
        """
        return cast(str, self.token_analysis.search.transliterate(name)).strip()


    def _normalized(self, name: str) -> str:
        """ Return the normalized version of the given name with all
            non-relevant information removed.
        """
        return cast(str, self.token_analysis.normalizer.transliterate(name)).strip()


    def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is treated as a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and not necessarily efficient.
        """
        assert self.conn is not None
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self._search_normalized(word[1:])
            else:
                partial_tokens[word] = self._search_normalized(word)
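
        # Look up the ids for the computed tokens: type 'W' rows hold full words,
        # type 'w' rows hold partial words.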
        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]


    def normalize_postcode(self, postcode: str) -> str:
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()


    def update_postcodes_from_db(self) -> None:
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@postcode')

        with self.conn.cursor() as cur:
            # First get all postcode names currently in the word table.
            cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
            word_entries = set((entry[0] for entry in cur))

            # Then compute the required postcode names from the postcode table.
            needed_entries = set()
            cur.execute("SELECT country_code, postcode FROM location_postcode")
            for cc, postcode in cur:
                info = PlaceInfo({'country_code': cc,
                                  'class': 'place', 'type': 'postcode',
                                  'address': {'postcode': postcode}})
                address = self.sanitizer.process_names(info)[1]
                for place in address:
                    if place.kind == 'postcode':
                        if analyzer is None:
                            postcode_name = place.name.strip().upper()
                            variant_base = None
                        else:
                            postcode_name = analyzer.get_canonical_id(place)
                            variant_base = place.get_attr("variant")

                        if variant_base:
                            needed_entries.add(f'{postcode_name}@{variant_base}')
                        else:
                            needed_entries.add(postcode_name)
                        break

        # Now update the word table.
        self._delete_unused_postcode_words(word_entries - needed_entries)
        self._add_missing_postcode_words(needed_entries - word_entries)

    def _delete_unused_postcode_words(self, tokens: Iterable[str]) -> None:
        assert self.conn is not None
        if tokens:
            with self.conn.cursor() as cur:
                cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
                            (list(tokens), ))

    def _add_missing_postcode_words(self, tokens: Iterable[str]) -> None:
        assert self.conn is not None
        if not tokens:
            return

        analyzer = self.token_analysis.analysis.get('@postcode')
        terms = []

        for postcode_name in tokens:
            if '@' in postcode_name:
                term, variant = postcode_name.split('@', 2)
                term = self._search_normalized(term)
                if analyzer is None:
                    variants = [term]
                else:
                    variants = analyzer.compute_variants(variant)
                    if term not in variants:
                        variants.append(term)
            else:
                variants = [self._search_normalized(postcode_name)]
            terms.append((postcode_name, variants))

        if terms:
            with self.conn.cursor() as cur:
                cur.executemany("""SELECT create_postcode_word(%s, %s)""", terms)


    def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
                               should_replace: bool) -> None:
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases is
            completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        assert self.conn is not None
        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))
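
            # Insert phrases that are new and, if requested, drop those that
            # are no longer part of the phrase set.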
            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)


    def _add_special_phrases(self, cursor: Cursor,
                             new_phrases: Set[Tuple[str, str, str, str]],
                             existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with cursor.copy('COPY word(word_token, type, word, info) FROM STDIN') as copy:
            for word, cls, typ, oper in to_add:
                term = self._search_normalized(word)
                if term:
                    copy.write_row((term, 'S', word,
                                    Jsonb({'class': cls, 'type': typ,
                                           'op': oper if oper in ('in', 'near') else None})))
                    added += 1

        return added


    def _remove_special_phrases(self, cursor: Cursor,
                                new_phrases: Set[Tuple[str, str, str, str]],
                                existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.executemany(
                """ DELETE FROM word
                      WHERE type = 'S' and word = %s
                            and info->>'class' = %s and info->>'type' = %s
                            and %s = coalesce(info->>'op', '-')
                """, to_delete)

        return len(to_delete)


    def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
        """ Add default names for the given country to the search index.
        """
        # Make sure any name preprocessing for country names applies.
        info = PlaceInfo({'name': names, 'country_code': country_code,
                          'rank_address': 4, 'class': 'boundary',
                          'type': 'administrative'})
        self._add_country_full_names(country_code,
                                     self.sanitizer.process_names(info)[0],
                                     internal=True)


    def _add_country_full_names(self, country_code: str, names: Sequence[PlaceName],
                                internal: bool = False) -> None:
        """ Add names for the given country from an already sanitized
            name list.
        """
        assert self.conn is not None
        word_tokens = set()
        for name in names:
            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)
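
        # Compare the newly computed tokens with what is already stored for
        # this country and only apply the difference.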
        with self.conn.cursor() as cur:
            # Get the names currently in the word table for this country.
            cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
                             FROM word
                             WHERE type = 'C' and word = %s""",
                        (country_code, ))
            # internal/external names
            existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()}
            for word in cur:
                existing_tokens[word[1]].add(word[0])

            # Delete names that no longer exist.
            gone_tokens = existing_tokens[internal] - word_tokens
            if internal:
                gone_tokens.update(existing_tokens[False] & word_tokens)
            if gone_tokens:
                cur.execute("""DELETE FROM word
                               USING unnest(%s::text[]) as token
                               WHERE type = 'C' and word = %s
                                     and word_token = token""",
                            (list(gone_tokens), country_code))

            # Only add those names that are not yet in the list.
            new_tokens = word_tokens - existing_tokens[True]
            if not internal:
                new_tokens -= existing_tokens[False]
            if new_tokens:
                if internal:
                    sql = """INSERT INTO word (word_token, type, word, info)
                               (SELECT token, 'C', %s, '{"internal": "yes"}'
                                  FROM unnest(%s::text[]) as token)
                           """
                else:
                    sql = """INSERT INTO word (word_token, type, word)
                               (SELECT token, 'C', %s
                                FROM unnest(%s::text[]) as token)
                          """
                cur.execute(sql, (country_code, list(new_tokens)))


    def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
        """ Determine tokenizer information about the given place.

            Returns a JSON-serializable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo()

        names, address = self.sanitizer.process_names(place)

        if names:
            token_info.set_names(*self._compute_name_tokens(names))

            if place.is_country():
                assert place.country_code is not None
                self._add_country_full_names(place.country_code, names)

        if address:
            self._process_place_address(token_info, address)

        return token_info.to_dict()


    def _process_place_address(self, token_info: '_TokenInfo',
                               address: Sequence[PlaceName]) -> None:
        for item in address:
            if item.kind == 'postcode':
                token_info.set_postcode(self._add_postcode(item))
            elif item.kind == 'housenumber':
                token_info.add_housenumber(*self._compute_housenumber_token(item))
            elif item.kind == 'street':
                token_info.add_street(self._retrieve_full_tokens(item.name))
            elif item.kind == 'place':
                if not item.suffix:
                    token_info.add_place(itertools.chain(*self._compute_name_tokens([item])))
            elif not item.kind.startswith('_') and not item.suffix and \
                    item.kind not in ('country', 'full', 'inclusion'):
                token_info.add_address_term(item.kind,
                                            itertools.chain(*self._compute_name_tokens([item])))


    def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
        """ Normalize the housenumber and return the word token and the
            canonical form.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@housenumber')
        result: Tuple[Optional[int], Optional[str]] = (None, None)

        if analyzer is None:
            # When no custom analyzer is set, simply normalize and transliterate.
            norm_name = self._search_normalized(hnr.name)
            if norm_name:
                result = self._cache.housenumbers.get(norm_name, result)
                if result[0] is None:
                    hid = execute_scalar(self.conn, "SELECT getorcreate_hnr_id(%s)", (norm_name, ))

                    result = hid, norm_name
                    self._cache.housenumbers[norm_name] = result
        else:
            # Otherwise use the analyzer to determine the canonical name.
            # Per convention we use the first variant as the 'lookup name', the
            # name that gets saved in the housenumber field of the place.
            word_id = analyzer.get_canonical_id(hnr)
            if word_id:
                result = self._cache.housenumbers.get(word_id, result)
                if result[0] is None:
                    variants = analyzer.compute_variants(word_id)
                    if variants:
                        hid = execute_scalar(self.conn, "SELECT create_analyzed_hnr_id(%s, %s)",
                                             (word_id, list(variants)))
                        result = hid, variants[0]
                        self._cache.housenumbers[word_id] = result

        return result


    def _retrieve_full_tokens(self, name: str) -> List[int]:
        """ Get the full name token for the given name, if it exists.
            The name is only retrieved for the standard analyser.
        """
        assert self.conn is not None
        norm_name = self._search_normalized(name)

        # return cached if possible
        if norm_name in self._cache.fulls:
            return self._cache.fulls[norm_name]
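
        # Otherwise look the token up in the word table and remember the result.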
        with self.conn.cursor() as cur:
            cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
                        (norm_name, ))
            full = [row[0] for row in cur]

        self._cache.fulls[norm_name] = full

        return full


    def _compute_name_tokens(self, names: Sequence[PlaceName]) -> Tuple[Set[int], Set[int]]:
        """ Computes the full name and partial name tokens for the given
            dictionary of names.
        """
        assert self.conn is not None
        full_tokens: Set[int] = set()
        partial_tokens: Set[int] = set()

        for name in names:
            analyzer_id = name.get_attr('analyzer')
            analyzer = self.token_analysis.get_analyzer(analyzer_id)
            word_id = analyzer.get_canonical_id(name)
            if analyzer_id is None:
                token_id = word_id
            else:
                token_id = f'{word_id}@{analyzer_id}'
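
            # Consult the cache first; only compute variants and create word
            # entries in the database when the name has not been seen before.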
            full, part = self._cache.names.get(token_id, (None, None))
            if full is None:
                variants = analyzer.compute_variants(word_id)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
                                (token_id, variants))
                    full, part = cast(Tuple[int, List[int]], cur.fetchone())

                self._cache.names[token_id] = (full, part)

            assert part is not None

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens


    def _add_postcode(self, item: PlaceName) -> Optional[str]:
        """ Make sure the normalized postcode is present in the word table.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@postcode')

        if analyzer is None:
            postcode_name = item.name.strip().upper()
            variant_base = None
        else:
            postcode_name = analyzer.get_canonical_id(item)
            variant_base = item.get_attr("variant")

        if variant_base:
            postcode = f'{postcode_name}@{variant_base}'
        else:
            postcode = postcode_name

        if postcode not in self._cache.postcodes:
            term = self._search_normalized(postcode_name)
            if not term:
                return None

            variants = {term}
            if analyzer is not None and variant_base:
                variants.update(analyzer.compute_variants(variant_base))

            with self.conn.cursor() as cur:
                cur.execute("SELECT create_postcode_word(%s, %s)",
                            (postcode, list(variants)))
            self._cache.postcodes.add(postcode)

        return postcode


class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self) -> None:
        self.names: Optional[str] = None
        self.housenumbers: Set[str] = set()
        self.housenumber_tokens: Set[int] = set()
        self.street_tokens: Optional[Set[int]] = None
        self.place_tokens: Set[int] = set()
        self.address_tokens: Dict[str, str] = {}
        self.postcode: Optional[str] = None
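
    # Render the collected tokens as a PostgreSQL array literal, e.g. '{1,2,3}'.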
    def _mk_array(self, tokens: Iterable[Any]) -> str:
        return f"{{{','.join((str(s) for s in tokens))}}}"


    def to_dict(self) -> Dict[str, Any]:
        """ Return the token information in database importable format.
        """
        out: Dict[str, Any] = {}

        if self.names:
            out['names'] = self.names

        if self.housenumbers:
            out['hnr'] = ';'.join(self.housenumbers)
            out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)

        if self.street_tokens is not None:
            out['street'] = self._mk_array(self.street_tokens)

        if self.place_tokens:
            out['place'] = self._mk_array(self.place_tokens)

        if self.address_tokens:
            out['addr'] = self.address_tokens

        if self.postcode:
            out['postcode'] = self.postcode

        return out


    def set_names(self, fulls: Iterable[int], partials: Iterable[int]) -> None:
        """ Adds token information for the normalised names.
        """
        self.names = self._mk_array(itertools.chain(fulls, partials))


    def add_housenumber(self, token: Optional[int], hnr: Optional[str]) -> None:
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        if token:
            assert hnr is not None
            self.housenumbers.add(hnr)
            self.housenumber_tokens.add(token)


    def add_street(self, tokens: Iterable[int]) -> None:
        """ Add addr:street match terms.
        """
        if self.street_tokens is None:
            self.street_tokens = set()
        self.street_tokens.update(tokens)


    def add_place(self, tokens: Iterable[int]) -> None:
        """ Add addr:place search and match terms.
        """
        self.place_tokens.update(tokens)


    def add_address_term(self, key: str, partials: Iterable[int]) -> None:
        """ Add additional address terms.
        """
        array = self._mk_array(partials)
        if len(array) > 2:
            self.address_tokens[key] = array

    def set_postcode(self, postcode: Optional[str]) -> None:
        """ Set the postcode to the given one.
        """
        self.postcode = postcode


class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self) -> None:
        self.names: Dict[str, Tuple[int, List[int]]] = {}
        self.partials: Dict[str, int] = {}
        self.fulls: Dict[str, List[int]] = {}
        self.postcodes: Set[str] = set()
        self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {}