From: Sarah Hoffmann
Date: Fri, 21 Mar 2025 08:02:52 +0000 (+0100)
Subject: add lookup word to variants in word table
X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/3cb183ffb074661a3ab82f8f0a69cd6bf2b27531

add lookup word to variants in word table
---

diff --git a/lib-sql/tokenizer/icu_tokenizer.sql b/lib-sql/tokenizer/icu_tokenizer.sql
index 29039987..8cf13120 100644
--- a/lib-sql/tokenizer/icu_tokenizer.sql
+++ b/lib-sql/tokenizer/icu_tokenizer.sql
@@ -164,6 +164,60 @@
 $$
 LANGUAGE plpgsql;
 
+CREATE OR REPLACE FUNCTION getorcreate_full_word(norm_term TEXT,
+                                                 lookup_terms TEXT[],
+                                                 lookup_norm_terms TEXT[],
+                                                 OUT full_token INT,
+                                                 OUT partial_tokens INT[])
+  AS $$
+DECLARE
+  partial_terms TEXT[] = '{}'::TEXT[];
+  term TEXT;
+  term_id INTEGER;
+BEGIN
+  SELECT min(word_id) INTO full_token
+    FROM word WHERE word = norm_term and type = 'W';
+
+  IF full_token IS NULL THEN
+    full_token := nextval('seq_word');
+    IF lookup_norm_terms IS NULL THEN
+      INSERT INTO word (word_id, word_token, type, word)
+        SELECT full_token, lookup_term, 'W', norm_term
+          FROM unnest(lookup_terms) as lookup_term;
+    ELSE
+      INSERT INTO word (word_id, word_token, type, word, info)
+        SELECT full_token, t.lookup, 'W', norm_term,
+               CASE WHEN norm_term = t.norm THEN null
+                    ELSE json_build_object('lookup', t.norm) END
+          FROM unnest(lookup_terms, lookup_norm_terms) as t(lookup, norm);
+    END IF;
+  END IF;
+
+  FOR term IN SELECT unnest(string_to_array(unnest(lookup_terms), ' ')) LOOP
+    term := trim(term);
+    IF NOT (ARRAY[term] <@ partial_terms) THEN
+      partial_terms := partial_terms || term;
+    END IF;
+  END LOOP;
+
+  partial_tokens := '{}'::INT[];
+  FOR term IN SELECT unnest(partial_terms) LOOP
+    SELECT min(word_id) INTO term_id
+      FROM word WHERE word_token = term and type = 'w';
+
+    IF term_id IS NULL THEN
+      term_id := nextval('seq_word');
+      INSERT INTO word (word_id, word_token, type)
+        VALUES (term_id, term, 'w');
+    END IF;
+
+    partial_tokens := array_merge(partial_tokens, ARRAY[term_id]);
+  END LOOP;
+END;
+$$
+LANGUAGE plpgsql;
+
+
 CREATE OR REPLACE FUNCTION getorcreate_partial_word(partial TEXT)
   RETURNS INTEGER
   AS $$
diff --git a/src/nominatim_db/tokenizer/icu_tokenizer.py b/src/nominatim_db/tokenizer/icu_tokenizer.py
index 2b17d611..297c9ef9 100644
--- a/src/nominatim_db/tokenizer/icu_tokenizer.py
+++ b/src/nominatim_db/tokenizer/icu_tokenizer.py
@@ -585,10 +585,14 @@ class ICUNameAnalyzer(AbstractAnalyzer):
         if word_id:
             result = self._cache.housenumbers.get(word_id, result)
             if result[0] is None:
-                variants = analyzer.compute_variants(word_id)
+                varout = analyzer.compute_variants(word_id)
+                if isinstance(varout, tuple):
+                    variants = varout[0]
+                else:
+                    variants = varout
                 if variants:
                     hid = execute_scalar(self.conn, "SELECT create_analyzed_hnr_id(%s, %s)",
-                                         (word_id, list(variants)))
+                                         (word_id, variants))
                     result = hid, variants[0]
                     self._cache.housenumbers[word_id] = result
 
@@ -633,13 +637,17 @@ class ICUNameAnalyzer(AbstractAnalyzer):
 
             full, part = self._cache.names.get(token_id, (None, None))
             if full is None:
-                variants = analyzer.compute_variants(word_id)
+                varset = analyzer.compute_variants(word_id)
+                if isinstance(varset, tuple):
+                    variants, lookups = varset
+                else:
+                    variants, lookups = varset, None
                 if not variants:
                     continue
 
                 with self.conn.cursor() as cur:
-                    cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
-                                (token_id, variants))
+                    cur.execute("SELECT * FROM getorcreate_full_word(%s, %s, %s)",
+                                (token_id, variants, lookups))
                     full, part = cast(Tuple[int, List[int]], cur.fetchone())
 
                 self._cache.names[token_id] = (full, part)
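The new SQL function records, for each full-word variant, the normalized spelling it was derived from (key 'lookup' in the info column) whenever that spelling differs from the canonical name. The partial-token handling is easiest to follow outside plpgsql; the following Python sketch mirrors the two FOR loops of getorcreate_full_word above (the sample lookup_terms value is invented for illustration, not data from the patch):

    # Rough Python equivalent of the partial-term collection in
    # getorcreate_full_word(); 'lookup_terms' is a hypothetical example.
    lookup_terms = ['great yarmouth', 'gt yarmouth']

    partial_terms = []                    # deduplicated space-separated terms
    for lookup in lookup_terms:
        for term in lookup.split(' '):
            term = term.strip()
            if term not in partial_terms:
                partial_terms.append(term)

    print(partial_terms)                  # ['great', 'yarmouth', 'gt']
    # Each entry is then looked up (or created) as a type='w' row in the
    # word table and its word_id is collected into partial_tokens.
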
diff --git a/src/nominatim_db/tokenizer/token_analysis/base.py b/src/nominatim_db/tokenizer/token_analysis/base.py
index 52ee8013..186f1d3e 100644
--- a/src/nominatim_db/tokenizer/token_analysis/base.py
+++ b/src/nominatim_db/tokenizer/token_analysis/base.py
@@ -7,7 +7,7 @@
 """
 Common data types and protocols for analysers.
 """
-from typing import Mapping, List, Any
+from typing import Mapping, List, Any, Union, Tuple
 
 from ...typing import Protocol
 from ...data.place_name import PlaceName
@@ -33,7 +33,7 @@ class Analyzer(Protocol):
             for example because the character set in use does not match.
         """
 
-    def compute_variants(self, canonical_id: str) -> List[str]:
+    def compute_variants(self, canonical_id: str) -> Union[List[str], Tuple[List[str], List[str]]]:
         """ Compute the transliterated spelling variants for the given
             canonical ID.
 
diff --git a/src/nominatim_db/tokenizer/token_analysis/generic.py b/src/nominatim_db/tokenizer/token_analysis/generic.py
index fa9dc4df..b01cebf7 100644
--- a/src/nominatim_db/tokenizer/token_analysis/generic.py
+++ b/src/nominatim_db/tokenizer/token_analysis/generic.py
@@ -7,7 +7,7 @@
 """
 Generic processor for names that creates abbreviation variants.
 """
-from typing import Mapping, Dict, Any, Iterable, Iterator, Optional, List, cast
+from typing import Mapping, Dict, Any, Iterable, Optional, List, cast, Tuple
 import itertools
 
 from ...errors import UsageError
@@ -78,7 +78,7 @@ class GenericTokenAnalysis:
         """
         return cast(str, self.norm.transliterate(name.name)).strip()
 
-    def compute_variants(self, norm_name: str) -> List[str]:
+    def compute_variants(self, norm_name: str) -> Tuple[List[str], List[str]]:
         """ Compute the spelling variants for the given normalized name
             and transliterate the result.
         """
@@ -87,18 +87,20 @@
         for mutation in self.mutations:
             variants = mutation.generate(variants)
 
-        return [name for name in self._transliterate_unique_list(norm_name, variants) if name]
-
-    def _transliterate_unique_list(self, norm_name: str,
-                                   iterable: Iterable[str]) -> Iterator[Optional[str]]:
-        seen = set()
+        varset = set(map(str.strip, variants))
         if self.variant_only:
-            seen.add(norm_name)
+            varset.discard(norm_name)
+
+        trans = []
+        norm = []
+
+        for var in varset:
+            t = self.to_ascii.transliterate(var).strip()
+            if t:
+                trans.append(t)
+                norm.append(var)
 
-        for variant in map(str.strip, iterable):
-            if variant not in seen:
-                seen.add(variant)
-                yield self.to_ascii.transliterate(variant).strip()
+        return trans, norm
 
     def _generate_word_variants(self, norm_name: str) -> Iterable[str]:
         baseform = '^ ' + norm_name + ' ^'
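Taken together, the analyzer contract changes as follows: compute_variants() may now return a pair of parallel lists, the first carrying the transliterated lookup tokens, the second the normalized spellings they were derived from; a plain list remains valid for older analyzer modules. A minimal consumer sketch, assuming an object 'analyzer' that implements the Analyzer protocol and an invented input name (both placeholders, not code from the patch):

    # Hypothetical caller of the revised protocol; mirrors the
    # backwards-compatible branch in icu_tokenizer.py above.
    varout = analyzer.compute_variants('great yarmouth')
    if isinstance(varout, tuple):
        variants, lookups = varout         # new style: parallel lists
    else:
        variants, lookups = varout, None   # old style: plain variant list

    # variants[i] ends up as word.word_token; lookups[i], when present and
    # different from the canonical normalized name, is stored by
    # getorcreate_full_word() as info->'lookup'.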