# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \
                   Dict, Set, Iterable
import itertools
import json
import logging
from pathlib import Path
from textwrap import dedent
from ..db.connection import connect, Connection, Cursor, server_version_tuple,\
                            drop_tables, table_exists, execute_scalar
from ..config import Configuration
from ..db.utils import CopyBuffer
from ..db.sql_preprocessor import SQLPreprocessor
from ..data.place_info import PlaceInfo
from ..data.place_name import PlaceName
from .icu_rule_loader import ICURuleLoader
from .place_sanitizer import PlaceSanitizer
from .icu_token_analysis import ICUTokenAnalysis
from .base import AbstractAnalyzer, AbstractTokenizer
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()

WORD_TYPES = (('country_names', 'C'),
              ('postcodes', 'P'),
              ('full_word', 'W'),
              ('housenumbers', 'H'))


def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
    """ Create a new instance of the tokenizer provided by this module.
    """
    return ICUTokenizer(dsn, data_dir)
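
# A minimal usage sketch (illustrative, not part of the original module; the
# dsn and path are made up). `create()` is the factory that Nominatim's
# tokenizer loading code calls; the returned object is then initialised
# either for a fresh import or from an existing project directory:
#
#   from pathlib import Path
#
#   tokenizer = create('dbname=nominatim', Path('/srv/nominatim/tokenizer'))
#   tokenizer.init_from_project(config)   # reuse the stored configuration
#   with tokenizer.name_analyzer() as analyzer:
#       analyzer.process_place(place)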


class ICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn: str, data_dir: Path) -> None:
        self.dsn = dsn
        self.data_dir = data_dir
        self.loader: Optional[ICURuleLoader] = None

    def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
        """ Set up a new tokenizer for the database.

            This copies all necessary data into the project directory to make
            sure the tokenizer remains stable even across updates.
        """
        self.loader = ICURuleLoader(config)

        self._install_php(config.lib_dir.php, overwrite=True)
        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._setup_db_tables(config)
            self._create_base_indices(config, 'word')

    def init_from_project(self, config: Configuration) -> None:
        """ Initialise the tokenizer from the project directory.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)

        self._install_php(config.lib_dir.php, overwrite=False)

    def finalize_import(self, config: Configuration) -> None:
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        self._create_lookup_indices(config, 'word')

    def update_sql_functions(self, config: Configuration) -> None:
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')

    def check_database(self, config: Configuration) -> None:
        """ Check that the tokenizer is set up correctly.
        """
        # Will throw an error if there is an issue.
        self.init_from_project(config)

    def update_statistics(self, config: Configuration, threads: int = 2) -> None:
        """ Recompute frequencies for all name words.
        """
        with connect(self.dsn) as conn:
            if not table_exists(conn, 'search_name'):
                return

            with conn.cursor() as cur:
                cur.execute('ANALYSE search_name')
                if threads > 1:
                    cur.execute('SET max_parallel_workers_per_gather TO %s',
                                (threads,))

                if server_version_tuple(conn) < (12, 0):
                    # PostgreSQL < 12 has no MATERIALIZED CTEs; fall back to
                    # temporary tables and a PL/pgSQL helper function.
                    LOG.info('Computing word frequencies')
                    drop_tables(conn, 'word_frequencies', 'addressword_frequencies')
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute('CREATE INDEX ON word_frequencies(id)')
                    cur.execute("""CREATE TEMP TABLE addressword_frequencies AS
                                     SELECT unnest(nameaddress_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute('CREATE INDEX ON addressword_frequencies(id)')
                    cur.execute("""CREATE OR REPLACE FUNCTION word_freq_update(wid INTEGER,
                                                                               INOUT info JSONB)
                                   AS $$
                                   DECLARE rec RECORD;
                                   BEGIN
                                   IF info is null THEN
                                     info = '{}'::jsonb;
                                   END IF;
                                   FOR rec IN SELECT count FROM word_frequencies WHERE id = wid
                                   LOOP
                                     info = info || jsonb_build_object('count', rec.count);
                                   END LOOP;
                                   FOR rec IN SELECT count FROM addressword_frequencies WHERE id = wid
                                   LOOP
                                     info = info || jsonb_build_object('addr_count', rec.count);
                                   END LOOP;
                                   IF info = '{}'::jsonb THEN
                                     info = null;
                                   END IF;
                                   END;
                                   $$ LANGUAGE plpgsql IMMUTABLE;
                                """)
                    LOG.info('Update word table with recomputed frequencies')
                    drop_tables(conn, 'tmp_word')
                    cur.execute("""CREATE TABLE tmp_word AS
                                    SELECT word_id, word_token, type, word,
                                           word_freq_update(word_id, info) as info
                                    FROM word
                                """)
                    drop_tables(conn, 'word_frequencies', 'addressword_frequencies')
                else:
                    LOG.info('Computing word frequencies')
                    drop_tables(conn, 'word_frequencies')
                    cur.execute("""
                      CREATE TEMP TABLE word_frequencies AS
                      WITH word_freq AS MATERIALIZED (
                               SELECT unnest(name_vector) as id, count(*)
                               FROM search_name GROUP BY id),
                           addr_freq AS MATERIALIZED (
                               SELECT unnest(nameaddress_vector) as id, count(*)
                               FROM search_name GROUP BY id)
                      SELECT coalesce(a.id, w.id) as id,
                             (CASE WHEN w.count is null THEN '{}'::JSONB
                                   ELSE jsonb_build_object('count', w.count) END
                              ||
                              CASE WHEN a.count is null THEN '{}'::JSONB
                                   ELSE jsonb_build_object('addr_count', a.count) END) as info
                      FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id;
                      """)
                    cur.execute('CREATE UNIQUE INDEX ON word_frequencies(id) INCLUDE(info)')
                    cur.execute('ANALYSE word_frequencies')
                    LOG.info('Update word table with recomputed frequencies')
                    drop_tables(conn, 'tmp_word')
                    cur.execute("""CREATE TABLE tmp_word AS
                                    SELECT word_id, word_token, type, word,
                                           (CASE WHEN wf.info is null THEN word.info
                                            ELSE coalesce(word.info, '{}'::jsonb) || wf.info
                                            END) as info
                                    FROM word LEFT JOIN word_frequencies wf
                                         ON word.word_id = wf.id
                                """)
                    drop_tables(conn, 'word_frequencies')

            with conn.cursor() as cur:
                cur.execute('SET max_parallel_workers_per_gather TO 0')

            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_string(conn,
                            'GRANT SELECT ON tmp_word TO "{{config.DATABASE_WEBUSER}}"')
            conn.commit()

        self._create_base_indices(config, 'tmp_word')
        self._create_lookup_indices(config, 'tmp_word')
        self._move_temporary_word_table('tmp_word')
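
    # Rough outline of the table swap performed above (illustrative summary,
    # not part of the original source): the recomputed frequencies are written
    # to a fresh table and only swapped in once it is fully indexed, e.g.
    #
    #   CREATE TABLE tmp_word AS SELECT ..., <merged frequency info> FROM word;
    #   CREATE INDEX ...;                     -- same indices as on 'word'
    #   DROP TABLE word;
    #   ALTER TABLE tmp_word RENAME TO word;  -- see _move_temporary_word_table()
    #
    # so the expensive rebuild never blocks lookups on the live 'word' table.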

    def _cleanup_housenumbers(self) -> None:
        """ Remove unused house numbers.
        """
        with connect(self.dsn) as conn:
            if not table_exists(conn, 'search_name'):
                return
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
                               FROM word
                               WHERE type = 'H'
                                 AND NOT EXISTS(SELECT * FROM search_name
                                                WHERE ARRAY[word.word_id] && name_vector)
                                 AND (char_length(coalesce(word, word_token)) > 6
                                      OR coalesce(word, word_token) not similar to '\\d+')
                            """)
                candidates = {token: wid for wid, token in cur}
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT housenumber FROM placex
                               WHERE housenumber is not null
                                     AND (char_length(housenumber) > 6
                                          OR housenumber not similar to '\\d+')
                            """)
                for row in cur:
                    for hnr in row[0].split(';'):
                        candidates.pop(hnr, None)
            LOG.info("There are %s outdated housenumbers.", len(candidates))
            LOG.debug("Outdated housenumbers: %s", candidates.keys())
            if candidates:
                with conn.cursor() as cur:
                    cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
                                (list(candidates.values()), ))
                conn.commit()
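
    # Worked example for the candidate selection above (illustrative, not part
    # of the original source): only "unusual" housenumber tokens are ever
    # considered for deletion, i.e. those longer than 6 characters or not
    # purely numeric. A token for '12' is always kept, while tokens like
    # '221b' or 'block 3-7' are deleted once neither search_name nor placex
    # references them any more.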

    def update_word_tokens(self) -> None:
        """ Remove unused tokens.
        """
        LOG.warning("Cleaning up housenumber tokens.")
        self._cleanup_housenumbers()
        LOG.warning("Tokenizer house-keeping done.")

    def name_analyzer(self) -> 'ICUNameAnalyzer':
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

            ```
            with tokenizer.name_analyzer() as analyzer:
                analyzer.process_place(place)
            ```

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        assert self.loader is not None
        return ICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                               self.loader.make_token_analysis())

    def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
        """ Return a list of the `num` most frequent full words
            in the database.
        """
        with conn.cursor() as cur:
            cur.execute("""SELECT word, sum((info->>'count')::int) as count
                             FROM word WHERE type = 'W'
                             GROUP BY word
                             ORDER BY count DESC LIMIT %s""", (num,))
            return list(s[0].split('@')[0] for s in cur)
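
    # Note on the split('@') above (illustrative, not part of the original
    # source): for names handled by a non-default analysis module the `word`
    # column stores the canonical form suffixed with the analyzer id, e.g.
    # 'hauptstrasse@de'. Splitting on '@' strips that suffix so only the plain
    # word is returned.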

    def _install_php(self, phpdir: Optional[Path], overwrite: bool = True) -> None:
        """ Install the php script for the tokenizer.
        """
        if phpdir is not None:
            assert self.loader is not None
            php_file = self.data_dir / "tokenizer.php"

            if not php_file.exists() or overwrite:
                php_file.write_text(dedent(f"""\
                    <?php
                    @define('CONST_Max_Word_Frequency', 10000000);
                    @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
                    @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
                    require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')

    def _save_config(self) -> None:
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        assert self.loader is not None
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)

    def _setup_db_tables(self, config: Configuration) -> None:
        """ Set up the word table used by the tokenizer and the sequence
            for generating new word ids.
        """
        with connect(self.dsn) as conn:
            drop_tables(conn, 'word')
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_string(conn, """
                CREATE TABLE word (
                      word_id INTEGER,
                      word_token text NOT NULL,
                      type text,
                      word text,
                      info jsonb
                    ) {{db.tablespace.search_data}};
                GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";

                DROP SEQUENCE IF EXISTS seq_word;
                CREATE SEQUENCE seq_word start 1;
                GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
            """)
            conn.commit()

    def _create_base_indices(self, config: Configuration, table_name: str) -> None:
        """ Create the basic lookup indices for the given word table.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_string(conn,
                            """CREATE INDEX idx_{{table_name}}_word_token ON {{table_name}}
                               USING BTREE (word_token) {{db.tablespace.search_index}}""",
                            table_name=table_name)
            for name, ctype in WORD_TYPES:
                sqlp.run_string(conn,
                                """CREATE INDEX idx_{{table_name}}_{{idx_name}} ON {{table_name}}
                                   USING BTREE (word) {{db.tablespace.address_index}}
                                   WHERE type = '{{column_type}}'
                                """,
                                table_name=table_name, idx_name=name,
                                column_type=ctype)
            conn.commit()

    def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
        """ Create additional indexes used when running the API.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            # Index required for details lookup.
            sqlp.run_string(conn, """
                CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
                  ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
            """,
                            table_name=table_name)
            conn.commit()

    def _move_temporary_word_table(self, old: str) -> None:
        """ Rename all tables and indexes used by the tokenizer.
        """
        with connect(self.dsn) as conn:
            drop_tables(conn, 'word')
            with conn.cursor() as cur:
                cur.execute(f"ALTER TABLE {old} RENAME TO word")
                for idx in ('word_token', 'word_id'):
                    cur.execute(f"""ALTER INDEX idx_{old}_{idx}
                                      RENAME TO idx_word_{idx}""")
                for name, _ in WORD_TYPES:
                    cur.execute(f"""ALTER INDEX idx_{old}_{name}
                                      RENAME TO idx_word_{name}""")
            conn.commit()


class ICUNameAnalyzer(AbstractAnalyzer):
    """ The ICU analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn: str, sanitizer: PlaceSanitizer,
                 token_analysis: ICUTokenAnalysis) -> None:
        self.conn: Optional[Connection] = connect(dsn).connection
        self.conn.autocommit = True
        self.sanitizer = sanitizer
        self.token_analysis = token_analysis

        self._cache = _TokenCache()

    def close(self) -> None:
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None

    def _search_normalized(self, name: str) -> str:
        """ Return the search token transliteration of the given name.
        """
        return cast(str, self.token_analysis.search.transliterate(name)).strip()

    def _normalized(self, name: str) -> str:
        """ Return the normalized version of the given name with all
            non-relevant information removed.
        """
        return cast(str, self.token_analysis.normalizer.transliterate(name)).strip()

    def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and is not necessarily efficient.
        """
        assert self.conn is not None
        full_tokens = {}
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self._search_normalized(word[1:])
            else:
                partial_tokens[word] = self._search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                            FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
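
    # Illustrative call (not part of the original source; ids are made up and
    # tokens depend on the configured normalization rules):
    #
    #   analyzer.get_word_token_info(['#Main Street', 'main'])
    #   # -> [('#Main Street', 'main street', 1234), ('main', 'main', 567)]
    #
    # '#Main Street' is looked up as a full name (type 'W'), 'main' as a
    # partial name (type 'w'); the id is None when the token is not in the
    # word table.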

    def normalize_postcode(self, postcode: str) -> str:
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()
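
    # Example (illustrative, not part of the original source): the
    # normalization only trims whitespace and uppercases, so
    # normalize_postcode(' ec1a 1bb ') returns 'EC1A 1BB'.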

    def update_postcodes_from_db(self) -> None:
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@postcode')

        with self.conn.cursor() as cur:
            # First get all postcode names currently in the word table.
            cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
            word_entries = set((entry[0] for entry in cur))

            # Then compute the required postcode names from the postcode table.
            needed_entries = set()
            cur.execute("SELECT country_code, postcode FROM location_postcode")
            for cc, postcode in cur:
                info = PlaceInfo({'country_code': cc,
                                  'class': 'place', 'type': 'postcode',
                                  'address': {'postcode': postcode}})
                address = self.sanitizer.process_names(info)[1]
                for place in address:
                    if place.kind == 'postcode':
                        if analyzer is None:
                            postcode_name = place.name.strip().upper()
                            variant_base = None
                        else:
                            postcode_name = analyzer.get_canonical_id(place)
                            variant_base = place.get_attr("variant")

                        if variant_base:
                            needed_entries.add(f'{postcode_name}@{variant_base}')
                        else:
                            needed_entries.add(postcode_name)
                        break

        # Now update the word table.
        self._delete_unused_postcode_words(word_entries - needed_entries)
        self._add_missing_postcode_words(needed_entries - word_entries)

    def _delete_unused_postcode_words(self, tokens: Iterable[str]) -> None:
        assert self.conn is not None
        if tokens:
            with self.conn.cursor() as cur:
                cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
                            (list(tokens), ))

    def _add_missing_postcode_words(self, tokens: Iterable[str]) -> None:
        assert self.conn is not None
        if not tokens:
            return

        analyzer = self.token_analysis.analysis.get('@postcode')
        terms = []

        for postcode_name in tokens:
            if '@' in postcode_name:
                term, variant = postcode_name.split('@', 2)
                term = self._search_normalized(term)
                if analyzer is None:
                    variants = [term]
                else:
                    variants = analyzer.compute_variants(variant)
                    if term not in variants:
                        variants.append(term)
            else:
                variants = [self._search_normalized(postcode_name)]
            terms.append((postcode_name, variants))

        if terms:
            with self.conn.cursor() as cur:
                cur.execute_values("""SELECT create_postcode_word(pc, var)
                                      FROM (VALUES %s) AS v(pc, var)""",
                                   terms)
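
    # Illustrative example (not part of the original source; the values are
    # made up): entries of the form '<canonical>@<variant base>', e.g.
    # '1000001@100-0001', indicate that a custom '@postcode' analysis module
    # is configured; its compute_variants() decides which spellings of the
    # postcode become searchable tokens.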

    def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
                               should_replace: bool) -> None:
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases is
            completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        assert self.conn is not None
        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)
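
    # Illustrative phrase tuple (not part of the original source): phrases are
    # (label, class, type, operator), e.g. ('restaurants in', 'amenity',
    # 'restaurant', 'in'); the operator is '-' when the phrase has no in/near
    # qualifier.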

    def _add_special_phrases(self, cursor: Cursor,
                             new_phrases: Set[Tuple[str, str, str, str]],
                             existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self._search_normalized(word)
                if term:
                    copystr.add(term, 'S', word,
                                json.dumps({'class': cls, 'type': typ,
                                            'op': oper if oper in ('in', 'near') else None}))
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word_token', 'type', 'word', 'info'])

        return added

    def _remove_special_phrases(self, cursor: Cursor,
                                new_phrases: Set[Tuple[str, str, str, str]],
                                existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
        """ Remove all phrases from the database that are no longer in the
            new phrase list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.execute_values(
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE type = 'S' and word = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
                """, to_delete)

        return len(to_delete)

    def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
        """ Add default names for the given country to the search index.
        """
        # Make sure any name preprocessing for country names applies.
        info = PlaceInfo({'name': names, 'country_code': country_code,
                          'rank_address': 4, 'class': 'boundary',
                          'type': 'administrative'})
        self._add_country_full_names(country_code,
                                     self.sanitizer.process_names(info)[0],
                                     internal=True)

    def _add_country_full_names(self, country_code: str, names: Sequence[PlaceName],
                                internal: bool = False) -> None:
        """ Add names for the given country from an already sanitized
            name list.
        """
        assert self.conn is not None
        word_tokens = set()
        for name in names:
            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            # Get existing names.
            cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
                             FROM word
                             WHERE type = 'C' and word = %s""",
                        (country_code, ))
            # internal/external names
            existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()}
            for word in cur:
                existing_tokens[word[1]].add(word[0])

            # Delete names that no longer exist.
            gone_tokens = existing_tokens[internal] - word_tokens
            if internal:
                gone_tokens.update(existing_tokens[False] & word_tokens)
            if gone_tokens:
                cur.execute("""DELETE FROM word
                               USING unnest(%s) as token
                               WHERE type = 'C' and word = %s
                                     and word_token = token""",
                            (list(gone_tokens), country_code))

            # Only add those names that are not yet in the list.
            new_tokens = word_tokens - existing_tokens[True]
            if not internal:
                new_tokens -= existing_tokens[False]
            if new_tokens:
                if internal:
                    sql = """INSERT INTO word (word_token, type, word, info)
                               (SELECT token, 'C', %s, '{"internal": "yes"}'
                                  FROM unnest(%s) as token)
                          """
                else:
                    sql = """INSERT INTO word (word_token, type, word)
                               (SELECT token, 'C', %s
                                  FROM unnest(%s) as token)
                          """
                cur.execute(sql, (country_code, list(new_tokens)))

    def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
        """ Determine tokenizer information about the given place.

            Returns a JSON-serializable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo()

        names, address = self.sanitizer.process_names(place)

        if names:
            token_info.set_names(*self._compute_name_tokens(names))

            if place.is_country():
                assert place.country_code is not None
                self._add_country_full_names(place.country_code, names)

        if address:
            self._process_place_address(token_info, address)

        return token_info.to_dict()
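
    # Shape of the returned structure (illustrative, all values made up):
    #
    #   {'names': '{615,3,76}',               # full + partial name tokens
    #    'hnr': '12;12a', 'hnr_tokens': '{881,882}',
    #    'street': '{77}', 'place': '{54,55}',
    #    'addr': {'city': '{3,9}'},
    #    'postcode': '10117'}
    #
    # Keys are only present when the corresponding information exists for the
    # place; see _TokenInfo.to_dict() below.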

    def _process_place_address(self, token_info: '_TokenInfo',
                               address: Sequence[PlaceName]) -> None:
        for item in address:
            if item.kind == 'postcode':
                token_info.set_postcode(self._add_postcode(item))
            elif item.kind == 'housenumber':
                token_info.add_housenumber(*self._compute_housenumber_token(item))
            elif item.kind == 'street':
                token_info.add_street(self._retrieve_full_tokens(item.name))
            elif item.kind == 'place':
                if not item.suffix:
                    token_info.add_place(itertools.chain(*self._compute_name_tokens([item])))
            elif not item.kind.startswith('_') and not item.suffix and \
                 item.kind not in ('country', 'full', 'inclusion'):
                token_info.add_address_term(item.kind,
                                            itertools.chain(*self._compute_name_tokens([item])))

    def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
        """ Normalize the housenumber and return the word token and the
            canonical housenumber.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@housenumber')
        result: Tuple[Optional[int], Optional[str]] = (None, None)

        if analyzer is None:
            # When no custom analyzer is set, simply normalize and transliterate.
            norm_name = self._search_normalized(hnr.name)
            if norm_name:
                result = self._cache.housenumbers.get(norm_name, result)
                if result[0] is None:
                    hid = execute_scalar(self.conn, "SELECT getorcreate_hnr_id(%s)", (norm_name, ))

                    result = hid, norm_name
                    self._cache.housenumbers[norm_name] = result
        else:
            # Otherwise use the analyzer to determine the canonical name.
            # Per convention we use the first variant as the 'lookup name', the
            # name that gets saved in the housenumber field of the place.
            word_id = analyzer.get_canonical_id(hnr)
            if word_id:
                result = self._cache.housenumbers.get(word_id, result)
                if result[0] is None:
                    variants = analyzer.compute_variants(word_id)
                    if variants:
                        hid = execute_scalar(self.conn, "SELECT create_analyzed_hnr_id(%s, %s)",
                                             (word_id, list(variants)))
                        result = hid, variants[0]
                        self._cache.housenumbers[word_id] = result

        return result

    def _retrieve_full_tokens(self, name: str) -> List[int]:
        """ Get the full name token for the given name, if it exists.
            The name is only retrieved for the standard analyser.
        """
        assert self.conn is not None
        norm_name = self._search_normalized(name)

        # Return cached value if possible.
        if norm_name in self._cache.fulls:
            return self._cache.fulls[norm_name]

        with self.conn.cursor() as cur:
            cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
                        (norm_name, ))
            full = [row[0] for row in cur]

        self._cache.fulls[norm_name] = full

        return full

    def _compute_name_tokens(self, names: Sequence[PlaceName]) -> Tuple[Set[int], Set[int]]:
        """ Computes the full name and partial name tokens for the given
            dictionary of names.
        """
        assert self.conn is not None
        full_tokens: Set[int] = set()
        partial_tokens: Set[int] = set()

        for name in names:
            analyzer_id = name.get_attr('analyzer')
            analyzer = self.token_analysis.get_analyzer(analyzer_id)
            word_id = analyzer.get_canonical_id(name)
            if analyzer_id is None:
                token_id = word_id
            else:
                token_id = f'{word_id}@{analyzer_id}'

            full, part = self._cache.names.get(token_id, (None, None))
            if full is None:
                variants = analyzer.compute_variants(word_id)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
                                (token_id, variants))
                    full, part = cast(Tuple[int, List[int]], cur.fetchone())

                self._cache.names[token_id] = (full, part)

            assert part is not None

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens
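
    # Cache key format used above (illustrative, not part of the original
    # source): for the default analyzer the key is simply the canonical name,
    # e.g. 'hauptstrasse'; for a named analysis module it is suffixed with the
    # module id, e.g. 'hauptstrasse@de', so entries from different analyzers
    # never collide in self._cache.names.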

    def _add_postcode(self, item: PlaceName) -> Optional[str]:
        """ Make sure the normalized postcode is present in the word table.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@postcode')

        if analyzer is None:
            postcode_name = item.name.strip().upper()
            variant_base = None
        else:
            postcode_name = analyzer.get_canonical_id(item)
            variant_base = item.get_attr("variant")

        if variant_base:
            postcode = f'{postcode_name}@{variant_base}'
        else:
            postcode = postcode_name

        if postcode not in self._cache.postcodes:
            term = self._search_normalized(postcode_name)
            if not term:
                return None

            variants = {term}
            if analyzer is not None and variant_base:
                variants.update(analyzer.compute_variants(variant_base))

            with self.conn.cursor() as cur:
                cur.execute("SELECT create_postcode_word(%s, %s)",
                            (postcode, list(variants)))
            self._cache.postcodes.add(postcode)

        return postcode_name
851 """ Collect token information to be sent back to the database.
853 def __init__(self) -> None:
854 self.names: Optional[str] = None
855 self.housenumbers: Set[str] = set()
856 self.housenumber_tokens: Set[int] = set()
857 self.street_tokens: Optional[Set[int]] = None
858 self.place_tokens: Set[int] = set()
859 self.address_tokens: Dict[str, str] = {}
860 self.postcode: Optional[str] = None
863 def _mk_array(self, tokens: Iterable[Any]) -> str:
864 return f"{{{','.join((str(s) for s in tokens))}}}"
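
    # Illustrative behaviour (not part of the original source):
    # _mk_array([23, 42, 7]) returns '{23,42,7}', the textual form PostgreSQL
    # accepts for an integer array column.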

    def to_dict(self) -> Dict[str, Any]:
        """ Return the token information in database importable format.
        """
        out: Dict[str, Any] = {}

        if self.names:
            out['names'] = self.names

        if self.housenumbers:
            out['hnr'] = ';'.join(self.housenumbers)
            out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)

        if self.street_tokens is not None:
            out['street'] = self._mk_array(self.street_tokens)

        if self.place_tokens:
            out['place'] = self._mk_array(self.place_tokens)

        if self.address_tokens:
            out['addr'] = self.address_tokens

        if self.postcode:
            out['postcode'] = self.postcode

        return out

    def set_names(self, fulls: Iterable[int], partials: Iterable[int]) -> None:
        """ Adds token information for the normalised names.
        """
        self.names = self._mk_array(itertools.chain(fulls, partials))

    def add_housenumber(self, token: Optional[int], hnr: Optional[str]) -> None:
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        if token:
            assert hnr is not None
            self.housenumbers.add(hnr)
            self.housenumber_tokens.add(token)

    def add_street(self, tokens: Iterable[int]) -> None:
        """ Add addr:street match terms.
        """
        if self.street_tokens is None:
            self.street_tokens = set()
        self.street_tokens.update(tokens)

    def add_place(self, tokens: Iterable[int]) -> None:
        """ Add addr:place search and match terms.
        """
        self.place_tokens.update(tokens)

    def add_address_term(self, key: str, partials: Iterable[int]) -> None:
        """ Add additional address terms.
        """
        array = self._mk_array(partials)
        # Skip empty arrays ('{}' has length 2).
        if len(array) > 2:
            self.address_tokens[key] = array

    def set_postcode(self, postcode: Optional[str]) -> None:
        """ Set the postcode to the given one.
        """
        self.postcode = postcode
938 """ Cache for token information to avoid repeated database queries.
940 This cache is not thread-safe and needs to be instantiated per
943 def __init__(self) -> None:
944 self.names: Dict[str, Tuple[int, List[int]]] = {}
945 self.partials: Dict[str, int] = {}
946 self.fulls: Dict[str, List[int]] = {}
947 self.postcodes: Set[str] = set()
948 self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {}