1 # SPDX-License-Identifier: GPL-2.0-only
3 # This file is part of Nominatim. (https://nominatim.org)
5 # Copyright (C) 2022 by the Nominatim developer community.
6 # For a full list of authors see the git log.
8 Tokenizer implementing normalisation as used before Nominatim 4 but using
9 libICU instead of the PostgreSQL module.
11 from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \
16 from pathlib import Path
17 from textwrap import dedent
19 from nominatim.db.connection import connect, Connection, Cursor
20 from nominatim.config import Configuration
21 from nominatim.db.utils import CopyBuffer
22 from nominatim.db.sql_preprocessor import SQLPreprocessor
23 from nominatim.data.place_info import PlaceInfo
24 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
25 from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
26 from nominatim.data.place_name import PlaceName
27 from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
28 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
30 DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
32 LOG = logging.getLogger()
34 WORD_TYPES =(('country_names', 'C'),
37 ('housenumbers', 'H'))
def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
    """ Module-level factory: build the ICU tokenizer, handing `dsn` and
        `data_dir` through to its constructor unchanged.
    """
    tokenizer = ICUTokenizer(dsn, data_dir)
    return tokenizer
45 class ICUTokenizer(AbstractTokenizer):
46 """ This tokenizer uses libICU to convert names and queries to ASCII.
47 Otherwise it uses the same algorithms and data structures as the
48 normalization routines in Nominatim 3.
51 def __init__(self, dsn: str, data_dir: Path) -> None:
53 self.data_dir = data_dir
54 self.loader: Optional[ICURuleLoader] = None
57 def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
58 """ Set up a new tokenizer for the database.
60 This copies all necessary data in the project directory to make
61 sure the tokenizer remains stable even over updates.
63 self.loader = ICURuleLoader(config)
65 self._install_php(config.lib_dir.php, overwrite=True)
69 self.update_sql_functions(config)
70 self._setup_db_tables(config)
71 self._create_base_indices(config, 'word')
def init_from_project(self, config: Configuration) -> None:
    """ Prepare the tokenizer for use with an already existing database.

        A rule loader is created for `config` and its configuration is
        then loaded from the database. The PHP tokenizer script is
        installed without overwriting an existing one.
    """
    rule_loader = ICURuleLoader(config)
    self.loader = rule_loader

    with connect(self.dsn) as db_conn:
        rule_loader.load_config_from_db(db_conn)

    php_dir = config.lib_dir.php
    self._install_php(php_dir, overwrite=False)
def finalize_import(self, config: Configuration) -> None:
    """ Post-processing step after an import: create the lookup
        indexes on the 'word' table.
    """
    self._create_lookup_indices(config, table_name='word')
def update_sql_functions(self, config: Configuration) -> None:
    """ (Re-)load the SQL functions of this tokenizer by running the
        file 'tokenizer/icu_tokenizer.sql' through the SQL preprocessor.
    """
    with connect(self.dsn) as db_conn:
        preprocessor = SQLPreprocessor(db_conn, config)
        preprocessor.run_sql_file(db_conn, 'tokenizer/icu_tokenizer.sql')
def check_database(self, config: Configuration) -> None:
    """ Verify the tokenizer setup by initialising it from the project.

        There is no return value; a broken setup shows up as an error
        raised during the initialisation.
    """
    # A successful initialisation is the whole check: problems surface
    # as exceptions from init_from_project().
    self.init_from_project(config)
107 def update_statistics(self, config: Configuration, threads: int = 2) -> None:
108 """ Recompute frequencies for all name words.
110 with connect(self.dsn) as conn:
111 if not conn.table_exists('search_name'):
114 with conn.cursor() as cur:
115 cur.execute('ANALYSE search_name')
117 cur.execute('SET max_parallel_workers_per_gather TO %s',
120 if conn.server_version_tuple() < (12, 0):
121 LOG.info('Computing word frequencies')
122 cur.drop_table('word_frequencies')
123 cur.drop_table('addressword_frequencies')
124 cur.execute("""CREATE TEMP TABLE word_frequencies AS
125 SELECT unnest(name_vector) as id, count(*)
126 FROM search_name GROUP BY id""")
127 cur.execute('CREATE INDEX ON word_frequencies(id)')
128 cur.execute("""CREATE TEMP TABLE addressword_frequencies AS
129 SELECT unnest(nameaddress_vector) as id, count(*)
130 FROM search_name GROUP BY id""")
131 cur.execute('CREATE INDEX ON addressword_frequencies(id)')
132 cur.execute("""CREATE OR REPLACE FUNCTION word_freq_update(wid INTEGER,
140 FOR rec IN SELECT count FROM word_frequencies WHERE id = wid
142 info = info || jsonb_build_object('count', rec.count);
144 FOR rec IN SELECT count FROM addressword_frequencies WHERE id = wid
146 info = info || jsonb_build_object('addr_count', rec.count);
148 IF info = '{}'::jsonb THEN
152 $$ LANGUAGE plpgsql IMMUTABLE;
154 LOG.info('Update word table with recomputed frequencies')
155 cur.drop_table('tmp_word')
156 cur.execute("""CREATE TABLE tmp_word AS
157 SELECT word_id, word_token, type, word,
158 word_freq_update(word_id, info) as info
161 cur.drop_table('word_frequencies')
162 cur.drop_table('addressword_frequencies')
164 LOG.info('Computing word frequencies')
165 cur.drop_table('word_frequencies')
167 CREATE TEMP TABLE word_frequencies AS
168 WITH word_freq AS MATERIALIZED (
169 SELECT unnest(name_vector) as id, count(*)
170 FROM search_name GROUP BY id),
171 addr_freq AS MATERIALIZED (
172 SELECT unnest(nameaddress_vector) as id, count(*)
173 FROM search_name GROUP BY id)
174 SELECT coalesce(a.id, w.id) as id,
175 (CASE WHEN w.count is null THEN '{}'::JSONB
176 ELSE jsonb_build_object('count', w.count) END
178 CASE WHEN a.count is null THEN '{}'::JSONB
179 ELSE jsonb_build_object('addr_count', a.count) END) as info
180 FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id;
182 cur.execute('CREATE UNIQUE INDEX ON word_frequencies(id) INCLUDE(info)')
183 cur.execute('ANALYSE word_frequencies')
184 LOG.info('Update word table with recomputed frequencies')
185 cur.drop_table('tmp_word')
186 cur.execute("""CREATE TABLE tmp_word AS
187 SELECT word_id, word_token, type, word,
188 (CASE WHEN wf.info is null THEN word.info
189 ELSE coalesce(word.info, '{}'::jsonb) || wf.info
191 FROM word LEFT JOIN word_frequencies wf
192 ON word.word_id = wf.id
194 cur.drop_table('word_frequencies')
196 with conn.cursor() as cur:
197 cur.execute('SET max_parallel_workers_per_gather TO 0')
199 sqlp = SQLPreprocessor(conn, config)
200 sqlp.run_string(conn,
201 'GRANT SELECT ON tmp_word TO "{{config.DATABASE_WEBUSER}}"')
203 self._create_base_indices(config, 'tmp_word')
204 self._create_lookup_indices(config, 'tmp_word')
205 self._move_temporary_word_table('tmp_word')
209 def _cleanup_housenumbers(self) -> None:
210 """ Remove unused house numbers.
212 with connect(self.dsn) as conn:
213 if not conn.table_exists('search_name'):
215 with conn.cursor(name="hnr_counter") as cur:
216 cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
219 AND NOT EXISTS(SELECT * FROM search_name
220 WHERE ARRAY[word.word_id] && name_vector)
221 AND (char_length(coalesce(word, word_token)) > 6
222 OR coalesce(word, word_token) not similar to '\\d+')
224 candidates = {token: wid for wid, token in cur}
225 with conn.cursor(name="hnr_counter") as cur:
226 cur.execute("""SELECT housenumber FROM placex
227 WHERE housenumber is not null
228 AND (char_length(housenumber) > 6
229 OR housenumber not similar to '\\d+')
232 for hnr in row[0].split(';'):
233 candidates.pop(hnr, None)
234 LOG.info("There are %s outdated housenumbers.", len(candidates))
235 LOG.debug("Outdated housenumbers: %s", candidates.keys())
237 with conn.cursor() as cur:
238 cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
239 (list(candidates.values()), ))
def update_word_tokens(self) -> None:
    """ House-keeping for the word table: remove housenumber tokens
        that are no longer in use.
    """
    # Progress is reported at warning level before and after the cleanup.
    LOG.warning("Cleaning up housenumber tokens.")

    self._cleanup_housenumbers()

    LOG.warning("Tokenizer house-keeping done.")
def name_analyzer(self) -> 'ICUNameAnalyzer':
    """ Create a new analyzer for tokenizing names and queries
        with this tokenizer.

        Analyzers are context managers and are best used in a
        `with` statement:

        ```
        with tokenizer.name_analyzer() as analyzer:
            ...
        ```

        When used outside a `with` construct, the caller is responsible
        for calling close() before the analyzer is destructed.

        Analyzers are not thread-safe. Instantiate one per thread.
    """
    rule_loader = self.loader
    # The loader only exists after the tokenizer has been initialised.
    assert rule_loader is not None

    sanitizer = rule_loader.make_sanitizer()
    token_analysis = rule_loader.make_token_analysis()
    return ICUNameAnalyzer(self.dsn, sanitizer, token_analysis)
272 def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
273 """ Return a list of the `num` most frequent full words
276 with conn.cursor() as cur:
277 cur.execute("""SELECT word, sum((info->>'count')::int) as count
278 FROM word WHERE type = 'W'
280 ORDER BY count DESC LIMIT %s""", (num,))
281 return list(s[0].split('@')[0] for s in cur)
284 def _install_php(self, phpdir: Optional[Path], overwrite: bool = True) -> None:
285 """ Install the php script for the tokenizer.
287 if phpdir is not None:
288 assert self.loader is not None
289 php_file = self.data_dir / "tokenizer.php"
291 if not php_file.exists() or overwrite:
292 php_file.write_text(dedent(f"""\
294 @define('CONST_Max_Word_Frequency', 10000000);
295 @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
296 @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
297 require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')
def _save_config(self) -> None:
    """ Store the configuration that has to stay stable for this
        database as database properties, via the rule loader.
    """
    rule_loader = self.loader
    assert rule_loader is not None

    with connect(self.dsn) as db_conn:
        rule_loader.save_config_to_db(db_conn)
309 def _setup_db_tables(self, config: Configuration) -> None:
310 """ Set up the word table and fill it with pre-computed word
313 with connect(self.dsn) as conn:
314 with conn.cursor() as cur:
315 cur.drop_table('word')
316 sqlp = SQLPreprocessor(conn, config)
317 sqlp.run_string(conn, """
320 word_token text NOT NULL,
324 ) {{db.tablespace.search_data}};
325 GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
327 DROP SEQUENCE IF EXISTS seq_word;
328 CREATE SEQUENCE seq_word start 1;
329 GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
334 def _create_base_indices(self, config: Configuration, table_name: str) -> None:
335 """ Set up the word table and fill it with pre-computed word
338 with connect(self.dsn) as conn:
339 sqlp = SQLPreprocessor(conn, config)
340 sqlp.run_string(conn,
341 """CREATE INDEX idx_{{table_name}}_word_token ON {{table_name}}
342 USING BTREE (word_token) {{db.tablespace.search_index}}""",
343 table_name=table_name)
344 for name, ctype in WORD_TYPES:
345 sqlp.run_string(conn,
346 """CREATE INDEX idx_{{table_name}}_{{idx_name}} ON {{table_name}}
347 USING BTREE (word) {{db.tablespace.address_index}}
348 WHERE type = '{{column_type}}'
350 table_name=table_name, idx_name=name,
355 def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
356 """ Create additional indexes used when running the API.
358 with connect(self.dsn) as conn:
359 sqlp = SQLPreprocessor(conn, config)
360 # Index required for details lookup.
361 sqlp.run_string(conn, """
362 CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
363 ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
365 table_name=table_name)
369 def _move_temporary_word_table(self, old: str) -> None:
370 """ Rename all tables and indexes used by the tokenizer.
372 with connect(self.dsn) as conn:
373 with conn.cursor() as cur:
374 cur.drop_table('word')
375 cur.execute(f"ALTER TABLE {old} RENAME TO word")
376 for idx in ('word_token', 'word_id'):
377 cur.execute(f"""ALTER INDEX idx_{old}_{idx}
378 RENAME TO idx_word_{idx}""")
379 for name, _ in WORD_TYPES:
380 cur.execute(f"""ALTER INDEX idx_{old}_{name}
381 RENAME TO idx_word_{name}""")
387 class ICUNameAnalyzer(AbstractAnalyzer):
388 """ The ICU analyzer uses the ICU library for splitting names.
390 Each instance opens a connection to the database to request the
def __init__(self, dsn: str, sanitizer: PlaceSanitizer,
             token_analysis: ICUTokenAnalysis) -> None:
    # Open the analyzer's own database connection and put it
    # into autocommit mode.
    connection = connect(dsn).connection
    connection.autocommit = True
    self.conn: Optional[Connection] = connection

    self.sanitizer = sanitizer
    self.token_analysis = token_analysis

    # Per-instance cache of tokens that were already looked up.
    self._cache = _TokenCache()
404 def close(self) -> None:
405 """ Free all resources used by the analyzer.
def _search_normalized(self, name: str) -> str:
    """ Transliterate `name` with the search transliterator of the
        token analysis and return the result without surrounding
        whitespace.
    """
    transliterated = self.token_analysis.search.transliterate(name)
    return cast(str, transliterated).strip()
def _normalized(self, name: str) -> str:
    """ Run `name` through the normalizer of the token analysis and
        return the result without surrounding whitespace.
    """
    normalized = self.token_analysis.normalizer.transliterate(name)
    return cast(str, normalized).strip()
425 def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
426 """ Return token information for the given list of words.
427 If a word starts with # it is assumed to be a full name
428 otherwise is a partial name.
430 The function returns a list of tuples with
431 (original word, word token, word id).
433 The function is used for testing and debugging only
434 and not necessarily efficient.
436 assert self.conn is not None
440 if word.startswith('#'):
441 full_tokens[word] = self._search_normalized(word[1:])
443 partial_tokens[word] = self._search_normalized(word)
445 with self.conn.cursor() as cur:
446 cur.execute("""SELECT word_token, word_id
447 FROM word WHERE word_token = ANY(%s) and type = 'W'
448 """, (list(full_tokens.values()),))
449 full_ids = {r[0]: r[1] for r in cur}
450 cur.execute("""SELECT word_token, word_id
451 FROM word WHERE word_token = ANY(%s) and type = 'w'""",
452 (list(partial_tokens.values()),))
453 part_ids = {r[0]: r[1] for r in cur}
455 return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
456 + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
def normalize_postcode(self, postcode: str) -> str:
    """ Bring the postcode into its standardized form: surrounding
        whitespace removed and all letters in upper case.

        The result must be exactly what the SQL function
        'token_normalized_postcode()' yields for the same input.
    """
    trimmed = postcode.strip()
    return trimmed.upper()
468 def update_postcodes_from_db(self) -> None:
469 """ Update postcode tokens in the word table from the location_postcode
472 assert self.conn is not None
473 analyzer = self.token_analysis.analysis.get('@postcode')
475 with self.conn.cursor() as cur:
476 # First get all postcode names currently in the word table.
477 cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
478 word_entries = set((entry[0] for entry in cur))
480 # Then compute the required postcode names from the postcode table.
481 needed_entries = set()
482 cur.execute("SELECT country_code, postcode FROM location_postcode")
483 for cc, postcode in cur:
484 info = PlaceInfo({'country_code': cc,
485 'class': 'place', 'type': 'postcode',
486 'address': {'postcode': postcode}})
487 address = self.sanitizer.process_names(info)[1]
488 for place in address:
489 if place.kind == 'postcode':
491 postcode_name = place.name.strip().upper()
494 postcode_name = analyzer.get_canonical_id(place)
495 variant_base = place.get_attr("variant")
498 needed_entries.add(f'{postcode_name}@{variant_base}')
500 needed_entries.add(postcode_name)
503 # Now update the word table.
504 self._delete_unused_postcode_words(word_entries - needed_entries)
505 self._add_missing_postcode_words(needed_entries - word_entries)
507 def _delete_unused_postcode_words(self, tokens: Iterable[str]) -> None:
508 assert self.conn is not None
510 with self.conn.cursor() as cur:
511 cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
514 def _add_missing_postcode_words(self, tokens: Iterable[str]) -> None:
515 assert self.conn is not None
519 analyzer = self.token_analysis.analysis.get('@postcode')
522 for postcode_name in tokens:
523 if '@' in postcode_name:
524 term, variant = postcode_name.split('@', 2)
525 term = self._search_normalized(term)
529 variants = analyzer.compute_variants(variant)
530 if term not in variants:
531 variants.append(term)
533 variants = [self._search_normalized(postcode_name)]
534 terms.append((postcode_name, variants))
537 with self.conn.cursor() as cur:
538 cur.execute_values("""SELECT create_postcode_word(pc, var)
539 FROM (VALUES %s) AS v(pc, var)""",
545 def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
546 should_replace: bool) -> None:
547 """ Replace the search index for special phrases with the new phrases.
If `should_replace` is True, then the previous set of phrases will be
549 completely replaced. Otherwise the phrases are added to the
550 already existing ones.
552 assert self.conn is not None
553 norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
556 with self.conn.cursor() as cur:
557 # Get the old phrases.
558 existing_phrases = set()
559 cur.execute("SELECT word, info FROM word WHERE type = 'S'")
560 for word, info in cur:
561 existing_phrases.add((word, info['class'], info['type'],
562 info.get('op') or '-'))
564 added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
566 deleted = self._remove_special_phrases(cur, norm_phrases,
571 LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
572 len(norm_phrases), added, deleted)
575 def _add_special_phrases(self, cursor: Cursor,
576 new_phrases: Set[Tuple[str, str, str, str]],
577 existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
578 """ Add all phrases to the database that are not yet there.
580 to_add = new_phrases - existing_phrases
583 with CopyBuffer() as copystr:
584 for word, cls, typ, oper in to_add:
585 term = self._search_normalized(word)
587 copystr.add(term, 'S', word,
588 json.dumps({'class': cls, 'type': typ,
589 'op': oper if oper in ('in', 'near') else None}))
592 copystr.copy_out(cursor, 'word',
593 columns=['word_token', 'type', 'word', 'info'])
598 def _remove_special_phrases(self, cursor: Cursor,
599 new_phrases: Set[Tuple[str, str, str, str]],
600 existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
601 """ Remove all phrases from the database that are no longer in the
604 to_delete = existing_phrases - new_phrases
607 cursor.execute_values(
608 """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
609 WHERE type = 'S' and word = name
610 and info->>'class' = in_class and info->>'type' = in_type
611 and ((op = '-' and info->>'op' is null) or op = info->>'op')
614 return len(to_delete)
617 def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
618 """ Add default names for the given country to the search index.
620 # Make sure any name preprocessing for country names applies.
621 info = PlaceInfo({'name': names, 'country_code': country_code,
622 'rank_address': 4, 'class': 'boundary',
623 'type': 'administrative'})
624 self._add_country_full_names(country_code,
625 self.sanitizer.process_names(info)[0],
629 def _add_country_full_names(self, country_code: str, names: Sequence[PlaceName],
630 internal: bool = False) -> None:
631 """ Add names for the given country from an already sanitized
634 assert self.conn is not None
637 norm_name = self._search_normalized(name.name)
639 word_tokens.add(norm_name)
641 with self.conn.cursor() as cur:
643 cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
645 WHERE type = 'C' and word = %s""",
647 # internal/external names
648 existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()}
650 existing_tokens[word[1]].add(word[0])
652 # Delete names that no longer exist.
653 gone_tokens = existing_tokens[internal] - word_tokens
655 gone_tokens.update(existing_tokens[False] & word_tokens)
657 cur.execute("""DELETE FROM word
658 USING unnest(%s) as token
659 WHERE type = 'C' and word = %s
660 and word_token = token""",
661 (list(gone_tokens), country_code))
663 # Only add those names that are not yet in the list.
664 new_tokens = word_tokens - existing_tokens[True]
666 new_tokens -= existing_tokens[False]
669 sql = """INSERT INTO word (word_token, type, word, info)
670 (SELECT token, 'C', %s, '{"internal": "yes"}'
671 FROM unnest(%s) as token)
674 sql = """INSERT INTO word (word_token, type, word)
675 (SELECT token, 'C', %s
676 FROM unnest(%s) as token)
678 cur.execute(sql, (country_code, list(new_tokens)))
681 def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
682 """ Determine tokenizer information about the given place.
684 Returns a JSON-serializable structure that will be handed into
685 the database via the token_info field.
687 token_info = _TokenInfo()
689 names, address = self.sanitizer.process_names(place)
692 token_info.set_names(*self._compute_name_tokens(names))
694 if place.is_country():
695 assert place.country_code is not None
696 self._add_country_full_names(place.country_code, names)
699 self._process_place_address(token_info, address)
701 return token_info.to_dict()
704 def _process_place_address(self, token_info: '_TokenInfo',
705 address: Sequence[PlaceName]) -> None:
707 if item.kind == 'postcode':
708 token_info.set_postcode(self._add_postcode(item))
709 elif item.kind == 'housenumber':
710 token_info.add_housenumber(*self._compute_housenumber_token(item))
711 elif item.kind == 'street':
712 token_info.add_street(self._retrieve_full_tokens(item.name))
713 elif item.kind == 'place':
715 token_info.add_place(self._compute_partial_tokens(item.name))
716 elif not item.kind.startswith('_') and not item.suffix and \
717 item.kind not in ('country', 'full', 'inclusion'):
718 token_info.add_address_term(item.kind, self._compute_partial_tokens(item.name))
721 def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
722 """ Normalize the housenumber and return the word token and the
725 assert self.conn is not None
726 analyzer = self.token_analysis.analysis.get('@housenumber')
727 result: Tuple[Optional[int], Optional[str]] = (None, None)
730 # When no custom analyzer is set, simply normalize and transliterate
731 norm_name = self._search_normalized(hnr.name)
733 result = self._cache.housenumbers.get(norm_name, result)
734 if result[0] is None:
735 with self.conn.cursor() as cur:
736 hid = cur.scalar("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
738 result = hid, norm_name
739 self._cache.housenumbers[norm_name] = result
741 # Otherwise use the analyzer to determine the canonical name.
742 # Per convention we use the first variant as the 'lookup name', the
743 # name that gets saved in the housenumber field of the place.
744 word_id = analyzer.get_canonical_id(hnr)
746 result = self._cache.housenumbers.get(word_id, result)
747 if result[0] is None:
748 variants = analyzer.compute_variants(word_id)
750 with self.conn.cursor() as cur:
751 hid = cur.scalar("SELECT create_analyzed_hnr_id(%s, %s)",
752 (word_id, list(variants)))
753 result = hid, variants[0]
754 self._cache.housenumbers[word_id] = result
759 def _compute_partial_tokens(self, name: str) -> List[int]:
760 """ Normalize the given term, split it into partial words and return
the token list for them.
763 assert self.conn is not None
764 norm_name = self._search_normalized(name)
768 for partial in norm_name.split():
769 token = self._cache.partials.get(partial)
773 need_lookup.append(partial)
776 with self.conn.cursor() as cur:
777 cur.execute("""SELECT word, getorcreate_partial_word(word)
778 FROM unnest(%s) word""",
781 for partial, token in cur:
782 assert token is not None
784 self._cache.partials[partial] = token
789 def _retrieve_full_tokens(self, name: str) -> List[int]:
790 """ Get the full name token for the given name, if it exists.
791 The name is only retrieved for the standard analyser.
793 assert self.conn is not None
794 norm_name = self._search_normalized(name)
796 # return cached if possible
797 if norm_name in self._cache.fulls:
798 return self._cache.fulls[norm_name]
800 with self.conn.cursor() as cur:
801 cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
803 full = [row[0] for row in cur]
805 self._cache.fulls[norm_name] = full
810 def _compute_name_tokens(self, names: Sequence[PlaceName]) -> Tuple[Set[int], Set[int]]:
811 """ Computes the full name and partial name tokens for the given
814 assert self.conn is not None
815 full_tokens: Set[int] = set()
816 partial_tokens: Set[int] = set()
819 analyzer_id = name.get_attr('analyzer')
820 analyzer = self.token_analysis.get_analyzer(analyzer_id)
821 word_id = analyzer.get_canonical_id(name)
822 if analyzer_id is None:
825 token_id = f'{word_id}@{analyzer_id}'
827 full, part = self._cache.names.get(token_id, (None, None))
829 variants = analyzer.compute_variants(word_id)
833 with self.conn.cursor() as cur:
834 cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
835 (token_id, variants))
836 full, part = cast(Tuple[int, List[int]], cur.fetchone())
838 self._cache.names[token_id] = (full, part)
840 assert part is not None
842 full_tokens.add(full)
843 partial_tokens.update(part)
845 return full_tokens, partial_tokens
848 def _add_postcode(self, item: PlaceName) -> Optional[str]:
849 """ Make sure the normalized postcode is present in the word table.
851 assert self.conn is not None
852 analyzer = self.token_analysis.analysis.get('@postcode')
855 postcode_name = item.name.strip().upper()
858 postcode_name = analyzer.get_canonical_id(item)
859 variant_base = item.get_attr("variant")
862 postcode = f'{postcode_name}@{variant_base}'
864 postcode = postcode_name
866 if postcode not in self._cache.postcodes:
867 term = self._search_normalized(postcode_name)
872 if analyzer is not None and variant_base:
873 variants.update(analyzer.compute_variants(variant_base))
875 with self.conn.cursor() as cur:
876 cur.execute("SELECT create_postcode_word(%s, %s)",
877 (postcode, list(variants)))
878 self._cache.postcodes.add(postcode)
884 """ Collect token information to be sent back to the database.
def __init__(self) -> None:
    # Single-valued fields stay None until they are set.
    self.names: Optional[str] = None
    self.postcode: Optional[str] = None
    self.street_tokens: Optional[Set[int]] = None

    # Collections start out empty and are filled by the add_* methods.
    self.housenumbers: Set[str] = set()
    self.housenumber_tokens: Set[int] = set()
    self.place_tokens: Set[int] = set()
    self.address_tokens: Dict[str, str] = {}
def _mk_array(self, tokens: Iterable[Any]) -> str:
    # Format the tokens as a brace-enclosed, comma-separated list,
    # e.g. '{1,2,3}'; an empty iterable gives '{}'.
    joined = ','.join(map(str, tokens))
    return '{' + joined + '}'
900 def to_dict(self) -> Dict[str, Any]:
901 """ Return the token information in database importable format.
903 out: Dict[str, Any] = {}
906 out['names'] = self.names
908 if self.housenumbers:
909 out['hnr'] = ';'.join(self.housenumbers)
910 out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)
912 if self.street_tokens is not None:
913 out['street'] = self._mk_array(self.street_tokens)
915 if self.place_tokens:
916 out['place'] = self._mk_array(self.place_tokens)
918 if self.address_tokens:
919 out['addr'] = self.address_tokens
922 out['postcode'] = self.postcode
def set_names(self, fulls: Iterable[int], partials: Iterable[int]) -> None:
    """ Set the name token information: the full tokens followed by
        the partial tokens, formatted as one array string.
    """
    all_tokens = itertools.chain.from_iterable((fulls, partials))
    self.names = self._mk_array(all_tokens)
933 def add_housenumber(self, token: Optional[int], hnr: Optional[str]) -> None:
934 """ Extract housenumber information from a list of normalised
938 assert hnr is not None
939 self.housenumbers.add(hnr)
940 self.housenumber_tokens.add(token)
def add_street(self, tokens: Iterable[int]) -> None:
    """ Add match terms for addr:street.

        The token set is created on the first call, so that it exists
        (possibly empty) as soon as a street has been added.
    """
    street_tokens = self.street_tokens
    if street_tokens is None:
        street_tokens = self.street_tokens = set()

    street_tokens.update(tokens)
def add_place(self, tokens: Iterable[int]) -> None:
    """ Add search and match terms for addr:place to the set of
        place tokens.
    """
    for token in tokens:
        self.place_tokens.add(token)
957 def add_address_term(self, key: str, partials: Iterable[int]) -> None:
958 """ Add additional address terms.
961 self.address_tokens[key] = self._mk_array(partials)
def set_postcode(self, postcode: Optional[str]) -> None:
    """ Remember `postcode` as the postcode of the place, replacing
        any value set before. None is stored as is.
    """
    self.postcode = postcode
970 """ Cache for token information to avoid repeated database queries.
972 This cache is not thread-safe and needs to be instantiated per
def __init__(self) -> None:
    # All caches start out empty.
    self.partials: Dict[str, int] = {}
    self.fulls: Dict[str, List[int]] = {}
    self.names: Dict[str, Tuple[int, List[int]]] = {}
    self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {}
    self.postcodes: Set[str] = set()