# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \
                   Dict, Set, Iterable
import itertools
import json
import logging
from pathlib import Path
from textwrap import dedent

from nominatim.db.connection import connect, Connection, Cursor
from nominatim.config import Configuration
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.data.place_info import PlaceInfo
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
from nominatim.data.place_name import PlaceName
from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()

WORD_TYPES = (('country_names', 'C'),
              ('postcodes', 'P'),
              ('full_word', 'W'),
              ('housenumbers', 'H'))
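
# For orientation: the single-letter type codes used in the `word` table by
# the queries in this module. WORD_TYPES above only lists the types that get
# their own partial index.
#
#   'C' - country default names    'P' - postcodes       'W' - full words
#   'w' - partial words            'H' - house numbers   'S' - special phrases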


def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
    """ Create a new instance of the tokenizer provided by this module.
    """
    return ICUTokenizer(dsn, data_dir)


class ICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn: str, data_dir: Path) -> None:
        self.dsn = dsn
        self.data_dir = data_dir
        self.loader: Optional[ICURuleLoader] = None

    def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        self.loader = ICURuleLoader(config)

        self._install_php(config.lib_dir.php, overwrite=True)
        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._setup_db_tables(config)
            self._create_base_indices(config, 'word')

    def init_from_project(self, config: Configuration) -> None:
        """ Initialise the tokenizer from the project directory.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)

        self._install_php(config.lib_dir.php, overwrite=False)

    def finalize_import(self, config: Configuration) -> None:
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        self._create_lookup_indices(config, 'word')

    def update_sql_functions(self, config: Configuration) -> None:
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')

    def check_database(self, config: Configuration) -> None:
        """ Check that the tokenizer is set up correctly.
        """
        # Will throw an error if there is an issue.
        self.init_from_project(config)

    def update_statistics(self, config: Configuration, threads: int = 2) -> None:
        """ Recompute frequencies for all name words.
        """
        with connect(self.dsn) as conn:
            if not conn.table_exists('search_name'):
                return

            with conn.cursor() as cur:
                cur.execute('ANALYSE search_name')
                if threads > 1:
                    # Limit parallelism to the configured thread count.
                    cur.execute('SET max_parallel_workers_per_gather TO %s',
                                (threads, ))
                if conn.server_version_tuple() < (12, 0):
                    LOG.info('Computing word frequencies')
                    cur.drop_table('word_frequencies')
                    cur.drop_table('addressword_frequencies')
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute('CREATE INDEX ON word_frequencies(id)')
                    cur.execute("""CREATE TEMP TABLE addressword_frequencies AS
                                     SELECT unnest(nameaddress_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute('CREATE INDEX ON addressword_frequencies(id)')
                    # Merge both counts into the info column, so that each word
                    # ends up with JSON like {"count": ..., "addr_count": ...}.
                    cur.execute("""CREATE OR REPLACE FUNCTION word_freq_update(wid INTEGER,
                                                                               INOUT info JSONB)
                                   AS $$
                                   DECLARE rec RECORD;
                                   BEGIN
                                   FOR rec IN SELECT count FROM word_frequencies WHERE id = wid
                                   LOOP
                                     info = info || jsonb_build_object('count', rec.count);
                                   END LOOP;
                                   FOR rec IN SELECT count FROM addressword_frequencies WHERE id = wid
                                   LOOP
                                     info = info || jsonb_build_object('addr_count', rec.count);
                                   END LOOP;
                                   IF info = '{}'::jsonb THEN
                                     info = null;
                                   END IF;
                                   END;
                                   $$ LANGUAGE plpgsql IMMUTABLE;
                                """)
                    LOG.info('Update word table with recomputed frequencies')
                    cur.drop_table('tmp_word')
                    cur.execute("""CREATE TABLE tmp_word AS
                                    SELECT word_id, word_token, type, word,
                                           word_freq_update(word_id, info) as info
                                    FROM word
                                """)
                    cur.drop_table('word_frequencies')
                    cur.drop_table('addressword_frequencies')
                else:
                    LOG.info('Computing word frequencies')
                    cur.drop_table('word_frequencies')
                    cur.execute('ANALYSE search_name')
                    cur.execute('ANALYSE word')
                    cur.execute("""
                      CREATE TEMP TABLE word_frequencies AS
                      WITH word_freq AS MATERIALIZED (
                               SELECT unnest(name_vector) as id, count(*)
                               FROM search_name GROUP BY id),
                           addr_freq AS MATERIALIZED (
                               SELECT unnest(nameaddress_vector) as id, count(*)
                               FROM search_name GROUP BY id)
                      SELECT coalesce(a.id, w.id) as id,
                             (CASE WHEN w.count is null THEN '{}'::JSONB
                                   ELSE jsonb_build_object('count', w.count) END
                              ||
                              CASE WHEN a.count is null THEN '{}'::JSONB
                                   ELSE jsonb_build_object('addr_count', a.count) END) as info
                      FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id;
                      """)
                    cur.execute('CREATE UNIQUE INDEX ON word_frequencies(id) INCLUDE(info)')
                    cur.execute('ANALYSE word_frequencies')
                    LOG.info('Update word table with recomputed frequencies')
                    cur.drop_table('tmp_word')
                    cur.execute("""CREATE TABLE tmp_word AS
                                    SELECT word_id, word_token, type, word,
                                           (CASE WHEN wf.info is null THEN word.info
                                                 ELSE coalesce(word.info, '{}'::jsonb) || wf.info
                                            END) as info
                                    FROM word LEFT JOIN word_frequencies wf
                                         ON word.word_id = wf.id
                                """)
                    cur.drop_table('word_frequencies')

            with conn.cursor() as cur:
                cur.execute('SET max_parallel_workers_per_gather TO 0')

            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_string(conn,
                            'GRANT SELECT ON tmp_word TO "{{config.DATABASE_WEBUSER}}"')
            conn.commit()

        # The freshly built tmp_word table replaces the live word table once
        # all indices have been recreated on it.
        self._create_base_indices(config, 'tmp_word')
        self._create_lookup_indices(config, 'tmp_word')
        self._move_temporary_word_table('tmp_word')

    def _cleanup_housenumbers(self) -> None:
        """ Remove unused house numbers.
        """
        with connect(self.dsn) as conn:
            if not conn.table_exists('search_name'):
                return
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
                               FROM word
                               WHERE type = 'H'
                                 AND NOT EXISTS(SELECT * FROM search_name
                                                WHERE ARRAY[word.word_id] && name_vector)
                                 AND (char_length(coalesce(word, word_token)) > 6
                                      OR coalesce(word, word_token) not similar to '\\d+')
                            """)
                candidates = {token: wid for wid, token in cur}
            with conn.cursor(name="hnr_counter") as cur:
                cur.execute("""SELECT housenumber FROM placex
                               WHERE housenumber is not null
                                     AND (char_length(housenumber) > 6
                                          OR housenumber not similar to '\\d+')
                            """)
                for row in cur:
                    for hnr in row[0].split(';'):
                        candidates.pop(hnr, None)
            LOG.info("There are %s outdated housenumbers.", len(candidates))
            LOG.debug("Outdated housenumbers: %s", candidates.keys())
            if candidates:
                with conn.cursor() as cur:
                    cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
                                (list(candidates.values()), ))
                conn.commit()

    def update_word_tokens(self) -> None:
        """ Remove unused tokens.
        """
        LOG.warning("Cleaning up housenumber tokens.")
        self._cleanup_housenumbers()
        LOG.warning("Tokenizer house-keeping done.")

    def name_analyzer(self) -> 'ICUNameAnalyzer':
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

              with tokenizer.name_analyzer() as analyzer:
                  analyzer.process_place(place)

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        assert self.loader is not None
        return ICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                               self.loader.make_token_analysis())

    def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
        """ Return a list of the `num` most frequent full words
            in the database.
        """
        with conn.cursor() as cur:
            cur.execute("""SELECT word, sum((info->>'count')::int) as count
                             FROM word WHERE type = 'W'
                             GROUP BY word
                             ORDER BY count DESC LIMIT %s""", (num,))
            # Words created with a custom analyzer are stored as
            # '<word>@<analyzer>'; strip the suffix before returning them.
            return list(s[0].split('@')[0] for s in cur)

    def _install_php(self, phpdir: Optional[Path], overwrite: bool = True) -> None:
        """ Install the php script for the tokenizer.
        """
        if phpdir is not None:
            assert self.loader is not None
            php_file = self.data_dir / "tokenizer.php"

            if not php_file.exists() or overwrite:
                php_file.write_text(dedent(f"""\
                    <?php
                    @define('CONST_Max_Word_Frequency', 10000000);
                    @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
                    @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
                    require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')

    def _save_config(self) -> None:
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        assert self.loader is not None
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)

    def _setup_db_tables(self, config: Configuration) -> None:
        """ Set up the word table and the sequence used for new word ids.
        """
        with connect(self.dsn) as conn:
            with conn.cursor() as cur:
                cur.drop_table('word')
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_string(conn, """
                CREATE TABLE word (
                      word_id INTEGER,
                      word_token text NOT NULL,
                      type text NOT NULL,
                      word text,
                      info jsonb
                  ) {{db.tablespace.search_data}};
                GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";

                DROP SEQUENCE IF EXISTS seq_word;
                CREATE SEQUENCE seq_word start 1;
                GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
            """)
            conn.commit()

    def _create_base_indices(self, config: Configuration, table_name: str) -> None:
        """ Create the indices on the given word table that are needed
            during import.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_string(conn,
                            """CREATE INDEX idx_{{table_name}}_word_token ON {{table_name}}
                               USING BTREE (word_token) {{db.tablespace.search_index}}""",
                            table_name=table_name)
            for name, ctype in WORD_TYPES:
                sqlp.run_string(conn,
                                """CREATE INDEX idx_{{table_name}}_{{idx_name}} ON {{table_name}}
                                   USING BTREE (word) {{db.tablespace.address_index}}
                                   WHERE type = '{{column_type}}'
                                """,
                                table_name=table_name, idx_name=name,
                                column_type=ctype)
            conn.commit()

    def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
        """ Create additional indexes used when running the API.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            # Index required for details lookup.
            sqlp.run_string(conn, """
                CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
                  ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
            """,
                            table_name=table_name)
            conn.commit()

    def _move_temporary_word_table(self, old: str) -> None:
        """ Rename all tables and indexes used by the tokenizer.
        """
        with connect(self.dsn) as conn:
            with conn.cursor() as cur:
                cur.drop_table('word')
                cur.execute(f"ALTER TABLE {old} RENAME TO word")
                for idx in ('word_token', 'word_id'):
                    cur.execute(f"""ALTER INDEX idx_{old}_{idx}
                                      RENAME TO idx_word_{idx}""")
                for name, _ in WORD_TYPES:
                    cur.execute(f"""ALTER INDEX idx_{old}_{name}
                                    RENAME TO idx_word_{name}""")
            conn.commit()



class ICUNameAnalyzer(AbstractAnalyzer):
    """ The ICU analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn: str, sanitizer: PlaceSanitizer,
                 token_analysis: ICUTokenAnalysis) -> None:
        self.conn: Optional[Connection] = connect(dsn).connection
        self.conn.autocommit = True
        self.sanitizer = sanitizer
        self.token_analysis = token_analysis

        self._cache = _TokenCache()

    def close(self) -> None:
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None

    def _search_normalized(self, name: str) -> str:
        """ Return the search token transliteration of the given name.
        """
        return cast(str, self.token_analysis.search.transliterate(name)).strip()

    def _normalized(self, name: str) -> str:
        """ Return the normalized version of the given name with all
            non-relevant information removed.
        """
        return cast(str, self.token_analysis.normalizer.transliterate(name)).strip()

    def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and not necessarily efficient.
        """
        assert self.conn is not None
        full_tokens: Dict[str, str] = {}
        partial_tokens: Dict[str, str] = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self._search_normalized(word[1:])
            else:
                partial_tokens[word] = self._search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                             FROM word WHERE word_token = ANY(%s) and type = 'W'
                        """, (list(full_tokens.values()),))
            full_ids = {r[0]: r[1] for r in cur}
            cur.execute("""SELECT word_token, word_id
                             FROM word WHERE word_token = ANY(%s) and type = 'w'""",
                        (list(partial_tokens.values()),))
            part_ids = {r[0]: r[1] for r in cur}

        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]

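    # Illustrative only: with a filled word table, a call such as
    #
    #   analyzer.get_word_token_info(['#Main Street', 'main'])
    #
    # returns something like
    #
    #   [('#Main Street', 'main street', 1234), ('main', 'main', 5678)]
    #
    # where the word ids are whatever the database happened to assign
    # (the numbers here are made up).
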
    def normalize_postcode(self, postcode: str) -> str:
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()

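    # Example: normalize_postcode(' se12 3tn ') returns 'SE12 3TN' -
    # surrounding whitespace is stripped and letters are upper-cased,
    # nothing else is changed.
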
    def update_postcodes_from_db(self) -> None:
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@postcode')

        with self.conn.cursor() as cur:
            # First get all postcode names currently in the word table.
            cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
            word_entries = set((entry[0] for entry in cur))

            # Then compute the required postcode names from the postcode table.
            needed_entries = set()
            cur.execute("SELECT country_code, postcode FROM location_postcode")
            for cc, postcode in cur:
                info = PlaceInfo({'country_code': cc,
                                  'class': 'place', 'type': 'postcode',
                                  'address': {'postcode': postcode}})
                address = self.sanitizer.process_names(info)[1]
                for place in address:
                    if place.kind == 'postcode':
                        if analyzer is None:
                            postcode_name = place.name.strip().upper()
                            variant_base = None
                        else:
                            postcode_name = analyzer.get_canonical_id(place)
                            variant_base = place.get_attr("variant")

                        if variant_base:
                            needed_entries.add(f'{postcode_name}@{variant_base}')
                        else:
                            needed_entries.add(postcode_name)
                        break

        # Now update the word table.
        self._delete_unused_postcode_words(word_entries - needed_entries)
        self._add_missing_postcode_words(needed_entries - word_entries)

    def _delete_unused_postcode_words(self, tokens: Iterable[str]) -> None:
        assert self.conn is not None
        if tokens:
            with self.conn.cursor() as cur:
                cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
                            (list(tokens), ))

    def _add_missing_postcode_words(self, tokens: Iterable[str]) -> None:
        assert self.conn is not None
        if not tokens:
            return

        analyzer = self.token_analysis.analysis.get('@postcode')
        terms = []

        for postcode_name in tokens:
            if '@' in postcode_name:
                term, variant = postcode_name.split('@', 2)
                term = self._search_normalized(term)
                if analyzer is None:
                    variants = [term]
                else:
                    variants = analyzer.compute_variants(variant)
                    if term not in variants:
                        variants.append(term)
            else:
                variants = [self._search_normalized(postcode_name)]
            terms.append((postcode_name, variants))

        if terms:
            with self.conn.cursor() as cur:
                cur.execute_values("""SELECT create_postcode_word(pc, var)
                                      FROM (VALUES %s) AS v(pc, var)""",
                                   terms)

    def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
                               should_replace: bool) -> None:
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases is
            completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        assert self.conn is not None
        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
            for word, info in cur:
                existing_phrases.add((word, info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)

    def _add_special_phrases(self, cursor: Cursor,
                             new_phrases: Set[Tuple[str, str, str, str]],
                             existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self._search_normalized(word)
                if term:
                    copystr.add(term, 'S', word,
                                json.dumps({'class': cls, 'type': typ,
                                            'op': oper if oper in ('in', 'near') else None}))
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word_token', 'type', 'word', 'info'])

        return added

    def _remove_special_phrases(self, cursor: Cursor,
                                new_phrases: Set[Tuple[str, str, str, str]],
                                existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
        """ Remove all phrases from the database that are no longer in the
            new list.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.execute_values(
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE type = 'S' and word = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
                """, to_delete)

        return len(to_delete)

    def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
        """ Add default names for the given country to the search index.
        """
        # Make sure any name preprocessing for country names applies.
        info = PlaceInfo({'name': names, 'country_code': country_code,
                          'rank_address': 4, 'class': 'boundary',
                          'type': 'administrative'})
        self._add_country_full_names(country_code,
                                     self.sanitizer.process_names(info)[0],
                                     internal=True)

    def _add_country_full_names(self, country_code: str, names: Sequence[PlaceName],
                                internal: bool = False) -> None:
        """ Add names for the given country from an already sanitized
            list of names.
        """
        assert self.conn is not None
        word_tokens = set()
        for name in names:
            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            # Get existing names and whether they are marked as internal.
            cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
                             FROM word
                             WHERE type = 'C' and word = %s""",
                        (country_code, ))
            # internal/external names
            existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()}
            for word in cur:
                existing_tokens[word[1]].add(word[0])

            # Delete names that no longer exist.
            gone_tokens = existing_tokens[internal] - word_tokens
            if internal:
                gone_tokens.update(existing_tokens[False] & word_tokens)
            if gone_tokens:
                cur.execute("""DELETE FROM word
                               USING unnest(%s) as token
                               WHERE type = 'C' and word = %s
                                     and word_token = token""",
                            (list(gone_tokens), country_code))

            # Only add those names that are not yet in the list.
            new_tokens = word_tokens - existing_tokens[True]
            if not internal:
                new_tokens -= existing_tokens[False]

            if new_tokens:
                if internal:
                    sql = """INSERT INTO word (word_token, type, word, info)
                               (SELECT token, 'C', %s, '{"internal": "yes"}'
                                  FROM unnest(%s) as token)
                             """
                else:
                    sql = """INSERT INTO word (word_token, type, word)
                               (SELECT token, 'C', %s
                                  FROM unnest(%s) as token)
                             """

                cur.execute(sql, (country_code, list(new_tokens)))

    def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
        """ Determine tokenizer information about the given place.

            Returns a JSON-serializable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo()

        names, address = self.sanitizer.process_names(place)

        if names:
            token_info.set_names(*self._compute_name_tokens(names))

            if place.is_country():
                assert place.country_code is not None
                self._add_country_full_names(place.country_code, names)

        if address:
            self._process_place_address(token_info, address)

        return token_info.to_dict()

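    # Illustrative only: the structure returned above is produced by
    # _TokenInfo.to_dict() below and looks roughly like
    #
    #   {'names': '{1,2,3}', 'hnr': '12;12a', 'hnr_tokens': '{55,56}',
    #    'street': '{7,8}', 'place': '{9}', 'addr': {'city': '{10,11}'},
    #    'postcode': 'AB-12'}
    #
    # with keys omitted when the corresponding information is missing;
    # all token ids shown here are made up.
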
    def _process_place_address(self, token_info: '_TokenInfo',
                               address: Sequence[PlaceName]) -> None:
        for item in address:
            if item.kind == 'postcode':
                token_info.set_postcode(self._add_postcode(item))
            elif item.kind == 'housenumber':
                token_info.add_housenumber(*self._compute_housenumber_token(item))
            elif item.kind == 'street':
                token_info.add_street(self._retrieve_full_tokens(item.name))
            elif item.kind == 'place':
                if not item.suffix:
                    token_info.add_place(self._compute_partial_tokens(item.name))
            elif not item.kind.startswith('_') and not item.suffix and \
                 item.kind not in ('country', 'full', 'inclusion'):
                token_info.add_address_term(item.kind,
                                            self._compute_partial_tokens(item.name))

    def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
        """ Normalize the housenumber and return the word token and the
            canonical form of the housenumber.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@housenumber')
        result: Tuple[Optional[int], Optional[str]] = (None, None)

        if analyzer is None:
            # When no custom analyzer is set, simply normalize and transliterate.
            norm_name = self._search_normalized(hnr.name)
            if norm_name:
                result = self._cache.housenumbers.get(norm_name, result)
                if result[0] is None:
                    with self.conn.cursor() as cur:
                        hid = cur.scalar("SELECT getorcreate_hnr_id(%s)", (norm_name, ))

                        result = hid, norm_name
                        self._cache.housenumbers[norm_name] = result
        else:
            # Otherwise use the analyzer to determine the canonical name.
            # Per convention we use the first variant as the 'lookup name', the
            # name that gets saved in the housenumber field of the place.
            word_id = analyzer.get_canonical_id(hnr)
            if word_id:
                result = self._cache.housenumbers.get(word_id, result)
                if result[0] is None:
                    variants = analyzer.compute_variants(word_id)
                    if variants:
                        with self.conn.cursor() as cur:
                            hid = cur.scalar("SELECT create_analyzed_hnr_id(%s, %s)",
                                             (word_id, list(variants)))
                            result = hid, variants[0]
                            self._cache.housenumbers[word_id] = result

        return result

    def _compute_partial_tokens(self, name: str) -> List[int]:
        """ Normalize the given term, split it into partial words and return
            the token list for them.
        """
        assert self.conn is not None
        norm_name = self._search_normalized(name)

        tokens: List[int] = []
        need_lookup = []
        for partial in norm_name.split():
            token = self._cache.partials.get(partial)
            if token:
                tokens.append(token)
            else:
                need_lookup.append(partial)

        if need_lookup:
            with self.conn.cursor() as cur:
                cur.execute("""SELECT word, getorcreate_partial_word(word)
                               FROM unnest(%s) word""",
                            (need_lookup, ))

                for partial, token in cur:
                    assert token is not None
                    tokens.append(token)
                    self._cache.partials[partial] = token

        return tokens

    def _retrieve_full_tokens(self, name: str) -> List[int]:
        """ Get the full name token for the given name, if it exists.
            The name is only retrieved for the standard analyser.
        """
        assert self.conn is not None
        norm_name = self._search_normalized(name)

        # return cached if possible
        if norm_name in self._cache.fulls:
            return self._cache.fulls[norm_name]

        with self.conn.cursor() as cur:
            cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
                        (norm_name, ))
            full = [row[0] for row in cur]

        self._cache.fulls[norm_name] = full

        return full

    def _compute_name_tokens(self, names: Sequence[PlaceName]) -> Tuple[Set[int], Set[int]]:
        """ Computes the full name and partial name tokens for the given
            dictionary of names.
        """
        assert self.conn is not None
        full_tokens: Set[int] = set()
        partial_tokens: Set[int] = set()

        for name in names:
            analyzer_id = name.get_attr('analyzer')
            analyzer = self.token_analysis.get_analyzer(analyzer_id)
            word_id = analyzer.get_canonical_id(name)
            if analyzer_id is None:
                token_id = word_id
            else:
                token_id = f'{word_id}@{analyzer_id}'

            full, part = self._cache.names.get(token_id, (None, None))
            if full is None:
                variants = analyzer.compute_variants(word_id)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
                                (token_id, variants))
                    full, part = cast(Tuple[int, List[int]], cur.fetchone())

                self._cache.names[token_id] = (full, part)

            assert part is not None

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens

    def _add_postcode(self, item: PlaceName) -> Optional[str]:
        """ Make sure the normalized postcode is present in the word table.
        """
        assert self.conn is not None
        analyzer = self.token_analysis.analysis.get('@postcode')

        if analyzer is None:
            postcode_name = item.name.strip().upper()
            variant_base = None
        else:
            postcode_name = analyzer.get_canonical_id(item)
            variant_base = item.get_attr("variant")

        if variant_base:
            postcode = f'{postcode_name}@{variant_base}'
        else:
            postcode = postcode_name

        if postcode not in self._cache.postcodes:
            term = self._search_normalized(postcode_name)
            if not term:
                return None

            variants = {term}
            if analyzer is not None and variant_base:
                variants.update(analyzer.compute_variants(variant_base))

            with self.conn.cursor() as cur:
                cur.execute("SELECT create_postcode_word(%s, %s)",
                            (postcode, list(variants)))
            self._cache.postcodes.add(postcode)

        return postcode



class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self) -> None:
        self.names: Optional[str] = None
        self.housenumbers: Set[str] = set()
        self.housenumber_tokens: Set[int] = set()
        self.street_tokens: Optional[Set[int]] = None
        self.place_tokens: Set[int] = set()
        self.address_tokens: Dict[str, str] = {}
        self.postcode: Optional[str] = None

    def _mk_array(self, tokens: Iterable[Any]) -> str:
        return f"{{{','.join((str(s) for s in tokens))}}}"
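    # Example: _mk_array([1, 2, 3]) returns '{1,2,3}', i.e. the literal
    # syntax PostgreSQL expects for an array value.
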
    def to_dict(self) -> Dict[str, Any]:
        """ Return the token information in database importable format.
        """
        out: Dict[str, Any] = {}

        if self.names:
            out['names'] = self.names

        if self.housenumbers:
            out['hnr'] = ';'.join(self.housenumbers)
            out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)

        if self.street_tokens is not None:
            out['street'] = self._mk_array(self.street_tokens)

        if self.place_tokens:
            out['place'] = self._mk_array(self.place_tokens)

        if self.address_tokens:
            out['addr'] = self.address_tokens

        if self.postcode:
            out['postcode'] = self.postcode

        return out

    def set_names(self, fulls: Iterable[int], partials: Iterable[int]) -> None:
        """ Adds token information for the normalised names.
        """
        self.names = self._mk_array(itertools.chain(fulls, partials))

    def add_housenumber(self, token: Optional[int], hnr: Optional[str]) -> None:
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        if token:
            assert hnr is not None
            self.housenumbers.add(hnr)
            self.housenumber_tokens.add(token)

    def add_street(self, tokens: Iterable[int]) -> None:
        """ Add addr:street match terms.
        """
        if self.street_tokens is None:
            self.street_tokens = set()
        self.street_tokens.update(tokens)

    def add_place(self, tokens: Iterable[int]) -> None:
        """ Add addr:place search and match terms.
        """
        self.place_tokens.update(tokens)

    def add_address_term(self, key: str, partials: Iterable[int]) -> None:
        """ Add additional address terms.
        """
        if partials:
            self.address_tokens[key] = self._mk_array(partials)

    def set_postcode(self, postcode: Optional[str]) -> None:
        """ Set the postcode to the given one.
        """
        self.postcode = postcode


class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self) -> None:
        self.names: Dict[str, Tuple[int, List[int]]] = {}
        self.partials: Dict[str, int] = {}
        self.fulls: Dict[str, List[int]] = {}
        self.postcodes: Set[str] = set()
        self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {}
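
# Illustrative only - a sketch of how the classes above are typically wired
# together, assuming a valid `dsn`, a tokenizer data directory, a loaded
# Configuration `config` and a PlaceInfo `place` (all hypothetical here):
#
#   tokenizer = create(dsn, data_dir)
#   tokenizer.init_from_project(config)
#   with tokenizer.name_analyzer() as analyzer:
#       token_info = analyzer.process_place(place)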