# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \
                   Dict, Set, Iterable
import itertools
import logging
from pathlib import Path

from psycopg.types.json import Jsonb
from psycopg import sql as pysql

from ..db.connection import connect, Connection, Cursor, server_version_tuple, \
                            drop_tables, table_exists, execute_scalar
from ..config import Configuration
from ..db.sql_preprocessor import SQLPreprocessor
from ..data.place_info import PlaceInfo
from ..data.place_name import PlaceName
from .icu_rule_loader import ICURuleLoader
from .place_sanitizer import PlaceSanitizer
from .icu_token_analysis import ICUTokenAnalysis
from .base import AbstractAnalyzer, AbstractTokenizer
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()

WORD_TYPES = (('country_names', 'C'),
              ('postcodes', 'P'),
              ('full_word', 'W'),
              ('housenumbers', 'H'))
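
# Word types in the 'word' table that get their own partial index below:
# 'C' country names, 'P' postcodes, 'W' full words and 'H' housenumbers.
# (Partial words 'w' and special phrases 'S' also appear in the code further
# down but need no extra index here.)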


def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
    """ Create a new instance of the tokenizer provided by this module.
    """
    return ICUTokenizer(dsn, data_dir)


class ICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn: str, data_dir: Path) -> None:
        self.dsn = dsn
        self.data_dir = data_dir
        self.loader: Optional[ICURuleLoader] = None

    def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        self.loader = ICURuleLoader(config)

        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._setup_db_tables(config)
            self._create_base_indices(config, 'word')

    def init_from_project(self, config: Configuration) -> None:
        """ Initialise the tokenizer from the project directory.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)

    def finalize_import(self, config: Configuration) -> None:
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        self._create_lookup_indices(config, 'word')

    def update_sql_functions(self, config: Configuration) -> None:
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')

    def check_database(self, config: Configuration) -> None:
        """ Check that the tokenizer is set up correctly.
        """
        # Will throw an error if there is an issue.
        self.init_from_project(config)

    def update_statistics(self, config: Configuration, threads: int = 2) -> None:
        """ Recompute frequencies for all name words.
        """
        with connect(self.dsn) as conn:
            if not table_exists(conn, 'search_name'):
                return

            with conn.cursor() as cur:
                cur.execute('ANALYSE search_name')
                if threads > 1:
                    cur.execute(pysql.SQL('SET max_parallel_workers_per_gather TO {}')
                                     .format(pysql.Literal(min(threads, 6),)))

                if server_version_tuple(conn) < (12, 0):
                    LOG.info('Computing word frequencies')
                    drop_tables(conn, 'word_frequencies', 'addressword_frequencies')
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute('CREATE INDEX ON word_frequencies(id)')
                    cur.execute("""CREATE TEMP TABLE addressword_frequencies AS
                                     SELECT unnest(nameaddress_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute('CREATE INDEX ON addressword_frequencies(id)')
                    cur.execute("""
                        CREATE OR REPLACE FUNCTION word_freq_update(wid INTEGER,
                                                                    INOUT info JSONB)
                        AS $$
                        DECLARE rec RECORD;
                        BEGIN
                          IF info is null THEN
                            info = '{}'::jsonb;
                          END IF;
                          FOR rec IN SELECT count FROM word_frequencies WHERE id = wid
                          LOOP
                            info = info || jsonb_build_object('count', rec.count);
                          END LOOP;
                          FOR rec IN SELECT count FROM addressword_frequencies WHERE id = wid
                          LOOP
                            info = info || jsonb_build_object('addr_count', rec.count);
                          END LOOP;
                          IF info = '{}'::jsonb THEN
                            info = null;
                          END IF;
                        END;
                        $$ LANGUAGE plpgsql IMMUTABLE;
                        """)
                    LOG.info('Update word table with recomputed frequencies')
                    drop_tables(conn, 'tmp_word')
                    cur.execute("""CREATE TABLE tmp_word AS
                                     SELECT word_id, word_token, type, word,
                                            word_freq_update(word_id, info) as info
                                     FROM word
                                """)
                    drop_tables(conn, 'word_frequencies', 'addressword_frequencies')
                else:
                    LOG.info('Computing word frequencies')
                    drop_tables(conn, 'word_frequencies')
                    cur.execute("""
                      CREATE TEMP TABLE word_frequencies AS
                      WITH word_freq AS MATERIALIZED (
                               SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id),
                           addr_freq AS MATERIALIZED (
                               SELECT unnest(nameaddress_vector) as id, count(*)
                                     FROM search_name GROUP BY id)
                      SELECT coalesce(a.id, w.id) as id,
                             (CASE WHEN w.count is null THEN '{}'::JSONB
                                  ELSE jsonb_build_object('count', w.count) END
                              ||
                              CASE WHEN a.count is null THEN '{}'::JSONB
                                  ELSE jsonb_build_object('addr_count', a.count) END) as info
                      FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id;
                      """)
                    cur.execute('CREATE UNIQUE INDEX ON word_frequencies(id) INCLUDE(info)')
                    cur.execute('ANALYSE word_frequencies')
                    LOG.info('Update word table with recomputed frequencies')
                    drop_tables(conn, 'tmp_word')
                    cur.execute("""CREATE TABLE tmp_word AS
                                     SELECT word_id, word_token, type, word,
                                            (CASE WHEN wf.info is null THEN word.info
                                             ELSE coalesce(word.info, '{}'::jsonb) || wf.info
                                             END) as info
                                     FROM word LEFT JOIN word_frequencies wf
                                          ON word.word_id = wf.id
                                """)
                    drop_tables(conn, 'word_frequencies')

            with conn.cursor() as cur:
                cur.execute('SET max_parallel_workers_per_gather TO 0')

            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_string(conn,
                            'GRANT SELECT ON tmp_word TO "{{config.DATABASE_WEBUSER}}"')
            conn.commit()

        self._create_base_indices(config, 'tmp_word')
        self._create_lookup_indices(config, 'tmp_word')
        self._move_temporary_word_table('tmp_word')

    def _cleanup_housenumbers(self) -> None:
        """ Remove unused house numbers.
        """
        with connect(self.dsn) as conn:
            if not table_exists(conn, 'search_name'):
                return
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
                               FROM word
                               WHERE type = 'H'
                                 AND NOT EXISTS(SELECT * FROM search_name
                                                WHERE ARRAY[word.word_id] && name_vector)
                                 AND (char_length(coalesce(word, word_token)) > 6
                                      OR coalesce(word, word_token) not similar to '\\d+')
                            """)
                candidates = {token: wid for wid, token in cur}
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT housenumber FROM placex
                               WHERE housenumber is not null
                                     AND (char_length(housenumber) > 6
                                          OR housenumber not similar to '\\d+')
                            """)
                for row in cur:
                    for hnr in row[0].split(';'):
                        candidates.pop(hnr, None)
            LOG.info("There are %s outdated housenumbers.", len(candidates))
            LOG.debug("Outdated housenumbers: %s", candidates.keys())
            if candidates:
                with conn.cursor() as cur:
                    cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
                                (list(candidates.values()), ))
                conn.commit()

    def update_word_tokens(self) -> None:
        """ Remove unused tokens.
        """
        LOG.warning("Cleaning up housenumber tokens.")
        self._cleanup_housenumbers()
        LOG.warning("Tokenizer house-keeping done.")

    def name_analyzer(self) -> 'ICUNameAnalyzer':
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    analyzer.tokenize()

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        assert self.loader is not None
        return ICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                               self.loader.make_token_analysis())

    def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
        """ Return a list of the `num` most frequent full words
            in the database.
        """
        with conn.cursor() as cur:
            cur.execute("""SELECT word, sum((info->>'count')::int) as count
                             FROM word WHERE type = 'W'
                             GROUP BY word
                             ORDER BY count DESC LIMIT %s""", (num,))
            return list(s[0].split('@')[0] for s in cur)

    def _save_config(self) -> None:
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        assert self.loader is not None
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)

    def _setup_db_tables(self, config: Configuration) -> None:
        """ Set up the word table and the word-id sequence used by the
            tokenizer.
        """
        with connect(self.dsn) as conn:
            drop_tables(conn, 'word')
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_string(conn, """
                CREATE TABLE word (
                      word_id INTEGER,
                      word_token text NOT NULL,
                      type text NOT NULL,
                      word text,
                      info jsonb
                    ) {{db.tablespace.search_data}};
                GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";

                DROP SEQUENCE IF EXISTS seq_word;
                CREATE SEQUENCE seq_word start 1;
                GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
            """)
            conn.commit()

    def _create_base_indices(self, config: Configuration, table_name: str) -> None:
        """ Create the basic indices for the given word table: one index
            on word_token plus one partial index per word type.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_string(conn,
                            """CREATE INDEX idx_{{table_name}}_word_token ON {{table_name}}
                               USING BTREE (word_token) {{db.tablespace.search_index}}""",
                            table_name=table_name)
            for name, ctype in WORD_TYPES:
                sqlp.run_string(conn,
                                """CREATE INDEX idx_{{table_name}}_{{idx_name}} ON {{table_name}}
                                   USING BTREE (word) {{db.tablespace.address_index}}
                                   WHERE type = '{{column_type}}'
                                """,
                                table_name=table_name, idx_name=name,
                                column_type=ctype)
            conn.commit()

    def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
        """ Create additional indexes used when running the API.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            # Index required for details lookup.
            sqlp.run_string(conn, """
                CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
                  ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
            """,
                            table_name=table_name)
            conn.commit()

    def _move_temporary_word_table(self, old: str) -> None:
        """ Rename all tables and indexes used by the tokenizer.
        """
        with connect(self.dsn) as conn:
            drop_tables(conn, 'word')
            with conn.cursor() as cur:
                cur.execute(f"ALTER TABLE {old} RENAME TO word")
                for idx in ('word_token', 'word_id'):
                    cur.execute(f"""ALTER INDEX idx_{old}_{idx}
                                      RENAME TO idx_word_{idx}""")
                for name, _ in WORD_TYPES:
                    cur.execute(f"""ALTER INDEX idx_{old}_{name}
                                    RENAME TO idx_word_{name}""")
            conn.commit()


class ICUNameAnalyzer(AbstractAnalyzer):
    """ The ICU analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn: str, sanitizer: PlaceSanitizer,
                 token_analysis: ICUTokenAnalysis) -> None:
        self.conn: Optional[Connection] = connect(dsn)
        self.conn.autocommit = True
        self.sanitizer = sanitizer
        self.token_analysis = token_analysis

        self._cache = _TokenCache()

    def close(self) -> None:
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None

    def _search_normalized(self, name: str) -> str:
        """ Return the search token transliteration of the given name.
        """
        return cast(str, self.token_analysis.search.transliterate(name)).strip()

    def _normalized(self, name: str) -> str:
        """ Return the normalized version of the given name with all
            non-relevant information removed.
        """
        return cast(str, self.token_analysis.normalizer.transliterate(name)).strip()

    def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not necessarily efficient.
        """
        assert self.conn is not None
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self._search_normalized(word[1:])
            else:
                partial_tokens[word] = self._search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
            + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
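
    # Illustrative call (the token values are invented, not taken from a real
    # database):
    #   analyzer.get_word_token_info(['#Main Street', 'main'])
    #   could return [('#Main Street', 'main street', 123), ('main', 'main', 45)]
    # where the leading '#' selects a full-word lookup (type 'W') and plain
    # words are looked up as partial words (type 'w').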

    def normalize_postcode(self, postcode: str) -> str:
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()
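
    # For example, ' se10 8xy ' is normalized to 'SE10 8XY': surrounding
    # whitespace is stripped and letters are upper-cased. Any further variant
    # handling is left to the '@postcode' analysis module used elsewhere in
    # this class.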

    def update_postcodes_from_db(self) -> None:
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@postcode')

        with self.conn.cursor() as cur:
            # First get all postcode names currently in the word table.
            cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
            word_entries = set((entry[0] for entry in cur))

            # Then compute the required postcode names from the postcode table.
            needed_entries = set()
            cur.execute("SELECT country_code, postcode FROM location_postcode")
            for cc, postcode in cur:
                info = PlaceInfo({'country_code': cc,
                                  'class': 'place', 'type': 'postcode',
                                  'address': {'postcode': postcode}})
                address = self.sanitizer.process_names(info)[1]
                for place in address:
                    if place.kind == 'postcode':
                        if analyzer is None:
                            postcode_name = place.name.strip().upper()
                            variant_base = None
                        else:
                            postcode_name = analyzer.get_canonical_id(place)
                            variant_base = place.get_attr("variant")

                        if variant_base:
                            needed_entries.add(f'{postcode_name}@{variant_base}')
                        else:
                            needed_entries.add(postcode_name)
                        break

        # Now update the word table.
        self._delete_unused_postcode_words(word_entries - needed_entries)
        self._add_missing_postcode_words(needed_entries - word_entries)

    def _delete_unused_postcode_words(self, tokens: Iterable[str]) -> None:
        assert self.conn is not None
        if tokens:
            with self.conn.cursor() as cur:
                cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
                            (list(tokens), ))

    def _add_missing_postcode_words(self, tokens: Iterable[str]) -> None:
        assert self.conn is not None
        if not tokens:
            return

        analyzer = self.token_analysis.analysis.get('@postcode')
        terms = []

        for postcode_name in tokens:
            if '@' in postcode_name:
                term, variant = postcode_name.split('@', 2)
                term = self._search_normalized(term)
                if analyzer is None:
                    variants = [term]
                else:
                    variants = analyzer.compute_variants(variant)
                    if term not in variants:
                        variants.append(term)
            else:
                variants = [self._search_normalized(postcode_name)]
            terms.append((postcode_name, variants))

        if terms:
            with self.conn.cursor() as cur:
                cur.executemany("""SELECT create_postcode_word(%s, %s)""", terms)
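
    # Entries handed to this function follow the convention used above: either
    # a plain canonical postcode or, illustratively, something of the form
    # 'EC1A1BB@ec1a 1bb', where the part before the '@' is the canonical id and
    # the part after it is the base string from which search variants are
    # computed.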

    def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
                               should_replace: bool) -> None:
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases will be
            completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        assert self.conn is not None
        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)

    def _add_special_phrases(self, cursor: Cursor,
                             new_phrases: Set[Tuple[str, str, str, str]],
                             existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with cursor.copy('COPY word(word_token, type, word, info) FROM STDIN') as copy:
            for word, cls, typ, oper in to_add:
                term = self._search_normalized(word)
                if term:
                    copy.write_row((term, 'S', word,
                                    Jsonb({'class': cls, 'type': typ,
                                           'op': oper if oper in ('in', 'near') else None})))
                    added += 1

        return added

    def _remove_special_phrases(self, cursor: Cursor,
                                new_phrases: Set[Tuple[str, str, str, str]],
                                existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.executemany(
                """ DELETE FROM word
                      WHERE type = 'S' and word = %s
                            and info->>'class' = %s and info->>'type' = %s
                            and %s = coalesce(info->>'op', '-')
                """, to_delete)

        return len(to_delete)

    def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
        """ Add default names for the given country to the search index.
        """
        # Make sure any name preprocessing for country names applies.
        info = PlaceInfo({'name': names, 'country_code': country_code,
                          'rank_address': 4, 'class': 'boundary',
                          'type': 'administrative'})
        self._add_country_full_names(country_code,
                                     self.sanitizer.process_names(info)[0],
                                     internal=True)

    def _add_country_full_names(self, country_code: str, names: Sequence[PlaceName],
                                internal: bool = False) -> None:
        """ Add names for the given country from an already sanitized
            name list.
        """
        assert self.conn is not None
        word_tokens = set()
        for name in names:
            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
                             FROM word
                             WHERE type = 'C' and word = %s""",
                        (country_code, ))
            # internal/external names
            existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()}
            for word in cur:
                existing_tokens[word[1]].add(word[0])

            # Delete names that no longer exist.
            gone_tokens = existing_tokens[internal] - word_tokens
            if internal:
                gone_tokens.update(existing_tokens[False] & word_tokens)
            if gone_tokens:
                cur.execute("""DELETE FROM word
                               USING unnest(%s::text[]) as token
                               WHERE type = 'C' and word = %s
                                     and word_token = token""",
                            (list(gone_tokens), country_code))

            # Only add those names that are not yet in the list.
            new_tokens = word_tokens - existing_tokens[True]
            if not internal:
                new_tokens -= existing_tokens[False]
            if new_tokens:
                if internal:
                    sql = """INSERT INTO word (word_token, type, word, info)
                               (SELECT token, 'C', %s, '{"internal": "yes"}'
                                  FROM unnest(%s::text[]) as token)
                           """
                else:
                    sql = """INSERT INTO word (word_token, type, word)
                               (SELECT token, 'C', %s
                                  FROM unnest(%s::text[]) as token)
                           """
                cur.execute(sql, (country_code, list(new_tokens)))

    def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
        """ Determine tokenizer information about the given place.

            Returns a JSON-serializable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo()

        names, address = self.sanitizer.process_names(place)

        if names:
            token_info.set_names(*self._compute_name_tokens(names))

            if place.is_country():
                assert place.country_code is not None
                self._add_country_full_names(place.country_code, names)

        if address:
            self._process_place_address(token_info, address)

        return token_info.to_dict()
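
    # A rough sketch of the structure returned by process_place() (which keys
    # are present depends on the place; all values here are invented):
    #   {'names': '{1,2,3}', 'hnr': '12;12a', 'hnr_tokens': '{4,5}',
    #    'street': '{6}', 'place': '{7}', 'addr': {'city': '{8}'},
    #    'postcode': 'AB1 2CD'}
    # See _TokenInfo.to_dict() below for the authoritative list of keys.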

    def _process_place_address(self, token_info: '_TokenInfo',
                               address: Sequence[PlaceName]) -> None:
        for item in address:
            if item.kind == 'postcode':
                token_info.set_postcode(self._add_postcode(item))
            elif item.kind == 'housenumber':
                token_info.add_housenumber(*self._compute_housenumber_token(item))
            elif item.kind == 'street':
                token_info.add_street(self._retrieve_full_tokens(item.name))
            elif item.kind == 'place':
                if not item.suffix:
                    token_info.add_place(itertools.chain(*self._compute_name_tokens([item])))
            elif (not item.kind.startswith('_') and not item.suffix and
                  item.kind not in ('country', 'full', 'inclusion')):
                token_info.add_address_term(item.kind,
                                            itertools.chain(*self._compute_name_tokens([item])))

    def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
        """ Normalize the housenumber and return the word token and the
            canonical form.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@housenumber')
        result: Tuple[Optional[int], Optional[str]] = (None, None)

        if analyzer is None:
            # When no custom analyzer is set, simply normalize and transliterate.
            norm_name = self._search_normalized(hnr.name)
            if norm_name:
                result = self._cache.housenumbers.get(norm_name, result)
                if result[0] is None:
                    hid = execute_scalar(self.conn, "SELECT getorcreate_hnr_id(%s)", (norm_name, ))

                    result = hid, norm_name
                    self._cache.housenumbers[norm_name] = result
        else:
            # Otherwise use the analyzer to determine the canonical name.
            # Per convention we use the first variant as the 'lookup name', the
            # name that gets saved in the housenumber field of the place.
            word_id = analyzer.get_canonical_id(hnr)
            if word_id:
                result = self._cache.housenumbers.get(word_id, result)
                if result[0] is None:
                    variants = analyzer.compute_variants(word_id)
                    if variants:
                        hid = execute_scalar(self.conn, "SELECT create_analyzed_hnr_id(%s, %s)",
                                             (word_id, list(variants)))
                        result = hid, variants[0]
                        self._cache.housenumbers[word_id] = result

        return result

    def _retrieve_full_tokens(self, name: str) -> List[int]:
        """ Get the full name token for the given name, if it exists.
            The name is only retrieved for the standard analyser.
        """
        assert self.conn is not None
        norm_name = self._search_normalized(name)

        # return cached if possible
        if norm_name in self._cache.fulls:
            return self._cache.fulls[norm_name]

        with self.conn.cursor() as cur:
            cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
                        (norm_name, ))
            full = [row[0] for row in cur]

        self._cache.fulls[norm_name] = full

        return full

    def _compute_name_tokens(self, names: Sequence[PlaceName]) -> Tuple[Set[int], Set[int]]:
        """ Computes the full name and partial name tokens for the given
            list of names.
        """
        assert self.conn is not None
        full_tokens: Set[int] = set()
        partial_tokens: Set[int] = set()

        for name in names:
            analyzer_id = name.get_attr('analyzer')
            analyzer = self.token_analysis.get_analyzer(analyzer_id)
            word_id = analyzer.get_canonical_id(name)
            if analyzer_id is None:
                token_id = word_id
            else:
                token_id = f'{word_id}@{analyzer_id}'

            full, part = self._cache.names.get(token_id, (None, None))
            if full is None:
                variants = analyzer.compute_variants(word_id)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
                                (token_id, variants))
                    full, part = cast(Tuple[int, List[int]], cur.fetchone())

                self._cache.names[token_id] = (full, part)

            assert part is not None

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens

    def _add_postcode(self, item: PlaceName) -> Optional[str]:
        """ Make sure the normalized postcode is present in the word table.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@postcode')

        if analyzer is None:
            postcode_name = item.name.strip().upper()
            variant_base = None
        else:
            postcode_name = analyzer.get_canonical_id(item)
            variant_base = item.get_attr("variant")

        if variant_base:
            postcode = f'{postcode_name}@{variant_base}'
        else:
            postcode = postcode_name

        if postcode not in self._cache.postcodes:
            term = self._search_normalized(postcode_name)
            if not term:
                return None

            variants = {term}
            if analyzer is not None and variant_base:
                variants.update(analyzer.compute_variants(variant_base))

            with self.conn.cursor() as cur:
                cur.execute("SELECT create_postcode_word(%s, %s)",
                            (postcode, list(variants)))
            self._cache.postcodes.add(postcode)

        return postcode


class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self) -> None:
        self.names: Optional[str] = None
        self.housenumbers: Set[str] = set()
        self.housenumber_tokens: Set[int] = set()
        self.street_tokens: Optional[Set[int]] = None
        self.place_tokens: Set[int] = set()
        self.address_tokens: Dict[str, str] = {}
        self.postcode: Optional[str] = None

    def _mk_array(self, tokens: Iterable[Any]) -> str:
        return f"{{{','.join((str(s) for s in tokens))}}}"
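
    # For instance, _mk_array([1, 2, 3]) yields the PostgreSQL array
    # literal '{1,2,3}'.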

    def to_dict(self) -> Dict[str, Any]:
        """ Return the token information in database importable format.
        """
        out: Dict[str, Any] = {}

        if self.names:
            out['names'] = self.names

        if self.housenumbers:
            out['hnr'] = ';'.join(self.housenumbers)
            out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)

        if self.street_tokens is not None:
            out['street'] = self._mk_array(self.street_tokens)

        if self.place_tokens:
            out['place'] = self._mk_array(self.place_tokens)

        if self.address_tokens:
            out['addr'] = self.address_tokens

        if self.postcode:
            out['postcode'] = self.postcode

        return out

    def set_names(self, fulls: Iterable[int], partials: Iterable[int]) -> None:
        """ Add token information for the normalised names.
        """
        self.names = self._mk_array(itertools.chain(fulls, partials))

    def add_housenumber(self, token: Optional[int], hnr: Optional[str]) -> None:
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        if token:
            assert hnr is not None
            self.housenumbers.add(hnr)
            self.housenumber_tokens.add(token)

    def add_street(self, tokens: Iterable[int]) -> None:
        """ Add addr:street match terms.
        """
        if self.street_tokens is None:
            self.street_tokens = set()
        self.street_tokens.update(tokens)

    def add_place(self, tokens: Iterable[int]) -> None:
        """ Add addr:place search and match terms.
        """
        self.place_tokens.update(tokens)

    def add_address_term(self, key: str, partials: Iterable[int]) -> None:
        """ Add additional address terms.
        """
        array = self._mk_array(partials)
        if len(array) > 2:
            self.address_tokens[key] = array

    def set_postcode(self, postcode: Optional[str]) -> None:
        """ Set the postcode to the given one.
        """
        self.postcode = postcode


class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self) -> None:
        self.names: Dict[str, Tuple[int, List[int]]] = {}
        self.partials: Dict[str, int] = {}
        self.fulls: Dict[str, List[int]] = {}
        self.postcodes: Set[str] = set()
        self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {}
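

# A sketch of typical use of this module (illustrative only; the DSN, the
# project directory and the variable names are assumptions, not part of this
# file):
#
#   tokenizer = create('dbname=nominatim', Path('project-dir'))
#   tokenizer.init_from_project(config)             # config: Configuration
#   with tokenizer.name_analyzer() as analyzer:
#       token_info = analyzer.process_place(place)  # place: PlaceInfo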