# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \
                   Dict, Set, Iterable
import itertools
import logging
from pathlib import Path

from psycopg.types.json import Jsonb
from psycopg import sql as pysql

from ..db.connection import connect, Connection, Cursor, server_version_tuple, \
                            drop_tables, table_exists, execute_scalar
from ..config import Configuration
from ..db.sql_preprocessor import SQLPreprocessor
from ..data.place_info import PlaceInfo
from ..data.place_name import PlaceName
from .icu_rule_loader import ICURuleLoader
from .place_sanitizer import PlaceSanitizer
from .icu_token_analysis import ICUTokenAnalysis
from .base import AbstractAnalyzer, AbstractTokenizer

DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()
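
# Word table entry types that get their own partial index, as pairs of
# (index name, single-letter type code). Other type codes appearing in this
# file are 'w' for partial words and 'S' for special phrases.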
WORD_TYPES = (('country_names', 'C'),
              ('postcodes', 'P'),
              ('full_word', 'W'),
              ('housenumbers', 'H'))


def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
    """ Create a new instance of the tokenizer provided by this module.
    """
    return ICUTokenizer(dsn, data_dir)
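

# A minimal usage sketch (illustrative only -- the surrounding Nominatim import
# and update machinery normally drives these calls; `config` is assumed to be a
# loaded Configuration and `place` a PlaceInfo object):
#
#   tokenizer = create(dsn, project_dir / 'tokenizer')
#   tokenizer.init_new_db(config)
#   with tokenizer.name_analyzer() as analyzer:
#       token_info = analyzer.process_place(place)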


class ICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn: str, data_dir: Path) -> None:
        self.dsn = dsn
        self.data_dir = data_dir
        self.loader: Optional[ICURuleLoader] = None

    def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
        """ Set up a new tokenizer for the database.

            This copies all necessary data into the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        self.loader = ICURuleLoader(config)

        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._setup_db_tables(config)
            self._create_base_indices(config, 'word')

    def init_from_project(self, config: Configuration) -> None:
        """ Initialise the tokenizer from the project directory.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)

    def finalize_import(self, config: Configuration) -> None:
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        self._create_lookup_indices(config, 'word')

    def update_sql_functions(self, config: Configuration) -> None:
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')

    def check_database(self, config: Configuration) -> None:
        """ Check that the tokenizer is set up correctly.
        """
        # Will throw an error if there is an issue.
        self.init_from_project(config)
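
    # update_statistics() stores per-token frequencies in the `info` JSONB
    # column of the word table, e.g. (illustrative values)
    # {"count": 1523, "addr_count": 87}: how often the token occurs in the
    # name_vector resp. nameaddress_vector columns of search_name.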

    def update_statistics(self, config: Configuration, threads: int = 2) -> None:
        """ Recompute frequencies for all name words.
        """
        with connect(self.dsn) as conn:
            if not table_exists(conn, 'search_name'):
                return

            with conn.cursor() as cur:
                cur.execute('ANALYSE search_name')
                if threads > 1:
                    cur.execute(pysql.SQL('SET max_parallel_workers_per_gather TO {}')
                                .format(pysql.Literal(min(threads, 6),)))

                if server_version_tuple(conn) < (12, 0):
                    LOG.info('Computing word frequencies')
                    drop_tables(conn, 'word_frequencies', 'addressword_frequencies')
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute('CREATE INDEX ON word_frequencies(id)')
                    cur.execute("""CREATE TEMP TABLE addressword_frequencies AS
                                     SELECT unnest(nameaddress_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute('CREATE INDEX ON addressword_frequencies(id)')
                    cur.execute("""
                        CREATE OR REPLACE FUNCTION word_freq_update(wid INTEGER,
                                                                    INOUT info JSONB)
                        AS $$
                        DECLARE rec RECORD;
                        BEGIN
                        IF info is null THEN
                          info = '{}'::jsonb;
                        END IF;
                        FOR rec IN SELECT count FROM word_frequencies WHERE id = wid
                        LOOP
                          info = info || jsonb_build_object('count', rec.count);
                        END LOOP;
                        FOR rec IN SELECT count FROM addressword_frequencies WHERE id = wid
                        LOOP
                          info = info || jsonb_build_object('addr_count', rec.count);
                        END LOOP;
                        IF info = '{}'::jsonb THEN
                          info = null;
                        END IF;
                        END;
                        $$ LANGUAGE plpgsql IMMUTABLE;
                        """)
                    LOG.info('Update word table with recomputed frequencies')
                    drop_tables(conn, 'tmp_word')
                    cur.execute("""CREATE TABLE tmp_word AS
                                    SELECT word_id, word_token, type, word,
                                           word_freq_update(word_id, info) as info
                                    FROM word
                                """)
                    drop_tables(conn, 'word_frequencies', 'addressword_frequencies')
                else:
                    LOG.info('Computing word frequencies')
                    drop_tables(conn, 'word_frequencies')
                    cur.execute("""
                      CREATE TEMP TABLE word_frequencies AS
                      WITH word_freq AS MATERIALIZED (
                               SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id),
                           addr_freq AS MATERIALIZED (
                               SELECT unnest(nameaddress_vector) as id, count(*)
                                     FROM search_name GROUP BY id)
                      SELECT coalesce(a.id, w.id) as id,
                             (CASE WHEN w.count is null THEN '{}'::JSONB
                                   ELSE jsonb_build_object('count', w.count) END
                              ||
                              CASE WHEN a.count is null THEN '{}'::JSONB
                                   ELSE jsonb_build_object('addr_count', a.count) END) as info
                      FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id;
                      """)
                    cur.execute('CREATE UNIQUE INDEX ON word_frequencies(id) INCLUDE(info)')
                    cur.execute('ANALYSE word_frequencies')
                    LOG.info('Update word table with recomputed frequencies')
                    drop_tables(conn, 'tmp_word')
                    cur.execute("""CREATE TABLE tmp_word AS
                                    SELECT word_id, word_token, type, word,
                                           (CASE WHEN wf.info is null THEN word.info
                                                 ELSE coalesce(word.info, '{}'::jsonb) || wf.info
                                            END) as info
                                    FROM word LEFT JOIN word_frequencies wf
                                         ON word.word_id = wf.id
                                    ORDER BY word_id
                                """)
                    drop_tables(conn, 'word_frequencies')

            with conn.cursor() as cur:
                cur.execute('SET max_parallel_workers_per_gather TO 0')

            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_string(conn,
                            'GRANT SELECT ON tmp_word TO "{{config.DATABASE_WEBUSER}}"')
            conn.commit()

        self._create_base_indices(config, 'tmp_word')
        self._create_lookup_indices(config, 'tmp_word')
        self._move_temporary_word_table('tmp_word')

    def _cleanup_housenumbers(self) -> None:
        """ Remove unused house numbers.
        """
        with connect(self.dsn) as conn:
            if not table_exists(conn, 'search_name'):
                return
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
                               FROM word
                               WHERE type = 'H'
                                 AND NOT EXISTS(SELECT * FROM search_name
                                                WHERE ARRAY[word.word_id] && name_vector)
                                 AND (char_length(coalesce(word, word_token)) > 6
                                      OR coalesce(word, word_token) not similar to '\\d+')
                            """)
                candidates = {token: wid for wid, token in cur}
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT housenumber FROM placex
                               WHERE housenumber is not null
                                     AND (char_length(housenumber) > 6
                                          OR housenumber not similar to '\\d+')
                            """)
                for row in cur:
                    for hnr in row[0].split(';'):
                        candidates.pop(hnr, None)
            LOG.info("There are %s outdated housenumbers.", len(candidates))
            LOG.debug("Outdated housenumbers: %s", candidates.keys())
            if candidates:
                with conn.cursor() as cur:
                    cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
                                (list(candidates.values()), ))
                conn.commit()

    def update_word_tokens(self) -> None:
        """ Remove unused tokens.
        """
        LOG.warning("Cleaning up housenumber tokens.")
        self._cleanup_housenumbers()
        LOG.warning("Tokenizer house-keeping done.")

    def name_analyzer(self) -> 'ICUNameAnalyzer':
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

            ```
            with tokenizer.name_analyzer() as analyzer:
                analyzer.process_place(place)
            ```

            When used outside the with construct, the caller must make sure
            to call the close() function before the analyzer is destroyed.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        assert self.loader is not None
        return ICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                               self.loader.make_token_analysis())

    def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
        """ Return a list of the `num` most frequent full words
            in the database.
        """
        with conn.cursor() as cur:
            cur.execute("""SELECT word, sum((info->>'count')::int) as count
                             FROM word WHERE type = 'W'
                             GROUP BY word
                             ORDER BY count DESC LIMIT %s""", (num,))
            # Full words may carry an '@<analyzer>' suffix; strip it for output.
            return list(s[0].split('@')[0] for s in cur)

    def _save_config(self) -> None:
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        assert self.loader is not None
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)

    def _setup_db_tables(self, config: Configuration) -> None:
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            drop_tables(conn, 'word')
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_string(conn, """
                CREATE TABLE word (
                      word_id INTEGER,
                      word_token text NOT NULL,
                      type text NOT NULL,
                      word text,
                      info jsonb
                    ) {{db.tablespace.search_data}};
                GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";

                DROP SEQUENCE IF EXISTS seq_word;
                CREATE SEQUENCE seq_word start 1;
                GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
            """)
            conn.commit()

    def _create_base_indices(self, config: Configuration, table_name: str) -> None:
        """ Create the basic indices on the given word table that are
            needed during import.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_string(conn,
                            """CREATE INDEX idx_{{table_name}}_word_token ON {{table_name}}
                               USING BTREE (word_token) {{db.tablespace.search_index}}""",
                            table_name=table_name)
            for name, ctype in WORD_TYPES:
                sqlp.run_string(conn,
                                """CREATE INDEX idx_{{table_name}}_{{idx_name}} ON {{table_name}}
                                   USING BTREE (word) {{db.tablespace.address_index}}
                                   WHERE type = '{{column_type}}'
                                """,
                                table_name=table_name, idx_name=name,
                                column_type=ctype)
            conn.commit()

    def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
        """ Create additional indexes used when running the API.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            # Index required for details lookup.
            sqlp.run_string(conn, """
                CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
                  ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
                """,
                            table_name=table_name)
            conn.commit()

    def _move_temporary_word_table(self, old: str) -> None:
        """ Rename all tables and indexes used by the tokenizer.
        """
        with connect(self.dsn) as conn:
            drop_tables(conn, 'word')
            with conn.cursor() as cur:
                cur.execute(f"ALTER TABLE {old} RENAME TO word")
                for idx in ('word_token', 'word_id'):
                    cur.execute(f"""ALTER INDEX idx_{old}_{idx}
                                      RENAME TO idx_word_{idx}""")
                for name, _ in WORD_TYPES:
                    cur.execute(f"""ALTER INDEX idx_{old}_{name}
                                      RENAME TO idx_word_{name}""")
            conn.commit()


class ICUNameAnalyzer(AbstractAnalyzer):
    """ The ICU analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn: str, sanitizer: PlaceSanitizer,
                 token_analysis: ICUTokenAnalysis) -> None:
        self.conn: Optional[Connection] = connect(dsn)
        self.conn.autocommit = True
        self.sanitizer = sanitizer
        self.token_analysis = token_analysis

        self._cache = _TokenCache()

    def close(self) -> None:
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None

    def _search_normalized(self, name: str) -> str:
        """ Return the search token transliteration of the given name.
        """
        return cast(str, self.token_analysis.search.transliterate(name)).strip()

    def _normalized(self, name: str) -> str:
        """ Return the normalized version of the given name with all
            non-relevant information removed.
        """
        return cast(str, self.token_analysis.normalizer.transliterate(name)).strip()

    def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
        """ Return token information for the given list of words.

            If a word starts with '#' it is assumed to be a full name,
            otherwise it is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is meant for testing and debugging only
            and is not guaranteed to be efficient.
        """
        assert self.conn is not None
        full_tokens: Dict[str, str] = {}
        partial_tokens: Dict[str, str] = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self._search_normalized(word[1:])
            else:
                partial_tokens[word] = self._search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]

    def normalize_postcode(self, postcode: str) -> str:
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()

    def update_postcodes_from_db(self) -> None:
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@postcode')

        with self.conn.cursor() as cur:
            # First get all postcode names currently in the word table.
            cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
            word_entries = set((entry[0] for entry in cur))

            # Then compute the required postcode names from the postcode table.
            needed_entries = set()
            cur.execute("SELECT country_code, postcode FROM location_postcode")
            for cc, postcode in cur:
                info = PlaceInfo({'country_code': cc,
                                  'class': 'place', 'type': 'postcode',
                                  'address': {'postcode': postcode}})
                address = self.sanitizer.process_names(info)[1]
                for place in address:
                    if place.kind == 'postcode':
                        if analyzer is None:
                            postcode_name = place.name.strip().upper()
                            variant_base = None
                        else:
                            postcode_name = analyzer.get_canonical_id(place)
                            variant_base = place.get_attr("variant")

                        if variant_base:
                            needed_entries.add(f'{postcode_name}@{variant_base}')
                        else:
                            needed_entries.add(postcode_name)
                        break

        # Now update the word table.
        self._delete_unused_postcode_words(word_entries - needed_entries)
        self._add_missing_postcode_words(needed_entries - word_entries)

    def _delete_unused_postcode_words(self, tokens: Iterable[str]) -> None:
        assert self.conn is not None
        if tokens:
            with self.conn.cursor() as cur:
                cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
                            (list(tokens), ))

    def _add_missing_postcode_words(self, tokens: Iterable[str]) -> None:
        assert self.conn is not None
        if not tokens:
            return

        analyzer = self.token_analysis.analysis.get('@postcode')
        terms = []

        for postcode_name in tokens:
            if '@' in postcode_name:
                # '<canonical>@<variant>' entries: split off the variant part.
                term, variant = postcode_name.split('@', 1)
                term = self._search_normalized(term)
                if analyzer is None:
                    variants = [term]
                else:
                    variants = analyzer.compute_variants(variant)
                    if term not in variants:
                        variants.append(term)
            else:
                variants = [self._search_normalized(postcode_name)]
            terms.append((postcode_name, variants))

        if terms:
            with self.conn.cursor() as cur:
                cur.executemany("""SELECT create_postcode_word(%s, %s)""", terms)

    def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
                               should_replace: bool) -> None:
        """ Replace the search index for special phrases with the new phrases.

            If `should_replace` is True, then the previous set of phrases is
            completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        assert self.conn is not None
        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)

    def _add_special_phrases(self, cursor: Cursor,
                             new_phrases: Set[Tuple[str, str, str, str]],
                             existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases
        added = 0
        with cursor.copy('COPY word(word_token, type, word, info) FROM STDIN') as copy:
            for word, cls, typ, oper in to_add:
                term = self._search_normalized(word)
                if term:
                    copy.write_row((term, 'S', word,
                                    Jsonb({'class': cls, 'type': typ,
                                           'op': oper if oper in ('in', 'near') else None})))
                    added += 1

        return added

    def _remove_special_phrases(self, cursor: Cursor,
                                new_phrases: Set[Tuple[str, str, str, str]],
                                existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.executemany(
                """ DELETE FROM word
                      WHERE type = 'S' and word = %s
                            and info->>'class' = %s and info->>'type' = %s
                            and %s = coalesce(info->>'op', '-')
                """, to_delete)

        return len(to_delete)

    def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
        """ Add default names for the given country to the search index.
        """
        # Make sure any name preprocessing for country names applies.
        info = PlaceInfo({'name': names, 'country_code': country_code,
                          'rank_address': 4, 'class': 'boundary',
                          'type': 'administrative'})
        self._add_country_full_names(country_code,
                                     self.sanitizer.process_names(info)[0],
                                     internal=True)

    def _add_country_full_names(self, country_code: str, names: Sequence[PlaceName],
                                internal: bool = False) -> None:
        """ Add names for the given country from an already sanitized
            name list.
        """
        assert self.conn is not None
        word_tokens: Set[str] = set()
        for name in names:
            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            # Get the existing names.
            cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
                             FROM word
                             WHERE type = 'C' and word = %s""",
                        (country_code, ))
            # internal/external names
            existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()}
            for word in cur:
                existing_tokens[word[1]].add(word[0])

            # Delete names that no longer exist.
            gone_tokens = existing_tokens[internal] - word_tokens
            if internal:
                gone_tokens.update(existing_tokens[False] & word_tokens)
            if gone_tokens:
                cur.execute("""DELETE FROM word
                               USING unnest(%s::text[]) as token
                               WHERE type = 'C' and word = %s
                                     and word_token = token""",
                            (list(gone_tokens), country_code))

            # Only add those names that are not yet in the list.
            new_tokens = word_tokens - existing_tokens[True]
            if not internal:
                new_tokens -= existing_tokens[False]
            if new_tokens:
                if internal:
                    sql = """INSERT INTO word (word_token, type, word, info)
                               (SELECT token, 'C', %s, '{"internal": "yes"}'
                                  FROM unnest(%s::text[]) as token)
                           """
                else:
                    sql = """INSERT INTO word (word_token, type, word)
                               (SELECT token, 'C', %s
                                  FROM unnest(%s::text[]) as token)
                           """
                cur.execute(sql, (country_code, list(new_tokens)))

    def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
        """ Determine tokenizer information about the given place.

            Returns a JSON-serializable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo()

        names, address = self.sanitizer.process_names(place)

        if names:
            token_info.set_names(*self._compute_name_tokens(names))

            if place.is_country():
                assert place.country_code is not None
                self._add_country_full_names(place.country_code, names)

        if address:
            self._process_place_address(token_info, address)

        return token_info.to_dict()

    def _process_place_address(self, token_info: '_TokenInfo',
                               address: Sequence[PlaceName]) -> None:
        for item in address:
            if item.kind == 'postcode':
                token_info.set_postcode(self._add_postcode(item))
            elif item.kind == 'housenumber':
                token_info.add_housenumber(*self._compute_housenumber_token(item))
            elif item.kind == 'street':
                token_info.add_street(self._retrieve_full_tokens(item.name))
            elif item.kind == 'place':
                if not item.suffix:
                    token_info.add_place(itertools.chain(*self._compute_name_tokens([item])))
            elif (not item.kind.startswith('_') and not item.suffix and
                  item.kind not in ('country', 'full', 'inclusion')):
                token_info.add_address_term(item.kind,
                                            itertools.chain(*self._compute_name_tokens([item])))

    def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
        """ Normalize the housenumber and return the word token and the
            canonical form of the housenumber.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@housenumber')
        result: Tuple[Optional[int], Optional[str]] = (None, None)

        if analyzer is None:
            # When no custom analyzer is set, simply normalize and transliterate.
            norm_name = self._search_normalized(hnr.name)
            if norm_name:
                result = self._cache.housenumbers.get(norm_name, result)
                if result[0] is None:
                    hid = execute_scalar(self.conn, "SELECT getorcreate_hnr_id(%s)", (norm_name, ))

                    result = hid, norm_name
                    self._cache.housenumbers[norm_name] = result
        else:
            # Otherwise use the analyzer to determine the canonical name.
            # Per convention we use the first variant as the 'lookup name', the
            # name that gets saved in the housenumber field of the place.
            word_id = analyzer.get_canonical_id(hnr)
            if word_id:
                result = self._cache.housenumbers.get(word_id, result)
                if result[0] is None:
                    variants = analyzer.compute_variants(word_id)
                    if variants:
                        hid = execute_scalar(self.conn, "SELECT create_analyzed_hnr_id(%s, %s)",
                                             (word_id, list(variants)))
                        result = hid, variants[0]
                        self._cache.housenumbers[word_id] = result

        return result

    def _retrieve_full_tokens(self, name: str) -> List[int]:
        """ Get the full name token for the given name, if it exists.
            The name is only retrieved for the standard analyser.
        """
        assert self.conn is not None
        norm_name = self._search_normalized(name)

        # Return the cached result if possible.
        if norm_name in self._cache.fulls:
            return self._cache.fulls[norm_name]

        with self.conn.cursor() as cur:
            cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
                        (norm_name, ))
            full = [row[0] for row in cur]

        self._cache.fulls[norm_name] = full

        return full

    def _compute_name_tokens(self, names: Sequence[PlaceName]) -> Tuple[Set[int], Set[int]]:
        """ Compute the full name and partial name tokens for the given
            list of names.
        """
        assert self.conn is not None
        full_tokens: Set[int] = set()
        partial_tokens: Set[int] = set()

        for name in names:
            analyzer_id = name.get_attr('analyzer')
            analyzer = self.token_analysis.get_analyzer(analyzer_id)
            word_id = analyzer.get_canonical_id(name)
            if analyzer_id is None:
                token_id = word_id
            else:
                token_id = f'{word_id}@{analyzer_id}'

            full, part = self._cache.names.get(token_id, (None, None))
            if full is None:
                variants = analyzer.compute_variants(word_id)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
                                (token_id, variants))
                    full, part = cast(Tuple[int, List[int]], cur.fetchone())

                self._cache.names[token_id] = (full, part)

            assert part is not None

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens
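
    # Postcodes with a variant attached are stored in the word table as
    # '<canonical form>@<variant>'; _add_missing_postcode_words() splits the
    # two parts again when it computes the search variants.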

    def _add_postcode(self, item: PlaceName) -> Optional[str]:
        """ Make sure the normalized postcode is present in the word table.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@postcode')

        if analyzer is None:
            postcode_name = item.name.strip().upper()
            variant_base = None
        else:
            postcode_name = analyzer.get_canonical_id(item)
            variant_base = item.get_attr("variant")

        if variant_base:
            postcode = f'{postcode_name}@{variant_base}'
        else:
            postcode = postcode_name

        if postcode not in self._cache.postcodes:
            term = self._search_normalized(postcode_name)
            if not term:
                return None

            variants = {term}
            if analyzer is not None and variant_base:
                variants.update(analyzer.compute_variants(variant_base))

            with self.conn.cursor() as cur:
                cur.execute("SELECT create_postcode_word(%s, %s)",
                            (postcode, list(variants)))
            self._cache.postcodes.add(postcode)

        return postcode_name
794 """ Collect token information to be sent back to the database.
796 def __init__(self) -> None:
797 self.names: Optional[str] = None
798 self.housenumbers: Set[str] = set()
799 self.housenumber_tokens: Set[int] = set()
800 self.street_tokens: Optional[Set[int]] = None
801 self.place_tokens: Set[int] = set()
802 self.address_tokens: Dict[str, str] = {}
803 self.postcode: Optional[str] = None
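
    # _mk_array() renders the tokens as a PostgreSQL array literal,
    # e.g. [1, 2, 3] becomes '{1,2,3}'.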

    def _mk_array(self, tokens: Iterable[Any]) -> str:
        return f"{{{','.join((str(s) for s in tokens))}}}"
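
    # The dictionary produced by to_dict() uses the keys expected by the SQL
    # import code: 'names', 'hnr', 'hnr_tokens', 'street', 'place', 'addr' and
    # 'postcode'. Illustrative example:
    #   {'names': '{1,2,3}', 'hnr': '4;4a', 'hnr_tokens': '{7,8}', 'postcode': '12345'}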

    def to_dict(self) -> Dict[str, Any]:
        """ Return the token information in database importable format.
        """
        out: Dict[str, Any] = {}

        if self.names:
            out['names'] = self.names

        if self.housenumbers:
            out['hnr'] = ';'.join(self.housenumbers)
            out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)

        if self.street_tokens is not None:
            out['street'] = self._mk_array(self.street_tokens)

        if self.place_tokens:
            out['place'] = self._mk_array(self.place_tokens)

        if self.address_tokens:
            out['addr'] = self.address_tokens

        if self.postcode:
            out['postcode'] = self.postcode

        return out

    def set_names(self, fulls: Iterable[int], partials: Iterable[int]) -> None:
        """ Add token information for the normalised names.
        """
        self.names = self._mk_array(itertools.chain(fulls, partials))

    def add_housenumber(self, token: Optional[int], hnr: Optional[str]) -> None:
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        if token:
            assert hnr is not None
            self.housenumbers.add(hnr)
            self.housenumber_tokens.add(token)

    def add_street(self, tokens: Iterable[int]) -> None:
        """ Add addr:street match terms.
        """
        if self.street_tokens is None:
            self.street_tokens = set()
        self.street_tokens.update(tokens)

    def add_place(self, tokens: Iterable[int]) -> None:
        """ Add addr:place search and match terms.
        """
        self.place_tokens.update(tokens)

    def add_address_term(self, key: str, partials: Iterable[int]) -> None:
        """ Add additional address terms.
        """
        array = self._mk_array(partials)
        if len(array) > 2:
            # Only store the term when the array holds more than the empty '{}'.
            self.address_tokens[key] = array

    def set_postcode(self, postcode: Optional[str]) -> None:
        """ Set the postcode to the given one.
        """
        self.postcode = postcode
874 """ Cache for token information to avoid repeated database queries.
876 This cache is not thread-safe and needs to be instantiated per
879 def __init__(self) -> None:
880 self.names: Dict[str, Tuple[int, List[int]]] = {}
881 self.partials: Dict[str, int] = {}
882 self.fulls: Dict[str, List[int]] = {}
883 self.postcodes: Set[str] = set()
884 self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {}