from nominatim.data.place_info import PlaceInfo
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
-from nominatim.tokenizer.sanitizers.base import PlaceName
+from nominatim.data.place_name import PlaceName
from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
class ICUTokenizer(AbstractTokenizer):
- """ This tokenizer uses libICU to covert names and queries to ASCII.
+ """ This tokenizer uses libICU to convert names and queries to ASCII.
Otherwise it uses the same algorithms and data structures as the
normalization routines in Nominatim 3.
"""
postcode_name = place.name.strip().upper()
variant_base = None
else:
- postcode_name = analyzer.normalize(place.name)
+ postcode_name = analyzer.get_canonical_id(place)
variant_base = place.get_attr("variant")
if variant_base:
if analyzer is None:
variants = [term]
else:
- variants = analyzer.get_variants_ascii(variant)
+ variants = analyzer.compute_variants(variant)
if term not in variants:
variants.append(term)
else:
def _remove_special_phrases(self, cursor: Cursor,
new_phrases: Set[Tuple[str, str, str, str]],
existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
- """ Remove all phrases from the databse that are no longer in the
+ """ Remove all phrases from the database that are no longer in the
new phrase list.
"""
to_delete = existing_phrases - new_phrases
# Otherwise use the analyzer to determine the canonical name.
# Per convention we use the first variant as the 'lookup name', the
# name that gets saved in the housenumber field of the place.
- norm_name = analyzer.normalize(hnr.name)
- if norm_name:
- result = self._cache.housenumbers.get(norm_name, result)
+ word_id = analyzer.get_canonical_id(hnr)
+ if word_id:
+ result = self._cache.housenumbers.get(word_id, result)
if result[0] is None:
- variants = analyzer.get_variants_ascii(norm_name)
+ variants = analyzer.compute_variants(word_id)
if variants:
with self.conn.cursor() as cur:
cur.execute("SELECT create_analyzed_hnr_id(%s, %s)",
- (norm_name, list(variants)))
+ (word_id, list(variants)))
result = cur.fetchone()[0], variants[0] # type: ignore[no-untyped-call]
- self._cache.housenumbers[norm_name] = result
+ self._cache.housenumbers[word_id] = result
return result
def _retrieve_full_tokens(self, name: str) -> List[int]:
""" Get the full name token for the given name, if it exists.
- The name is only retrived for the standard analyser.
+ The name is only retrieved for the standard analyzer.
"""
assert self.conn is not None
norm_name = self._search_normalized(name)
for name in names:
analyzer_id = name.get_attr('analyzer')
analyzer = self.token_analysis.get_analyzer(analyzer_id)
- norm_name = analyzer.normalize(name.name)
+ word_id = analyzer.get_canonical_id(name)
if analyzer_id is None:
- token_id = norm_name
+ token_id = word_id
else:
- token_id = f'{norm_name}@{analyzer_id}'
+ token_id = f'{word_id}@{analyzer_id}'
full, part = self._cache.names.get(token_id, (None, None))
if full is None:
- variants = analyzer.get_variants_ascii(norm_name)
+ variants = analyzer.compute_variants(word_id)
if not variants:
continue
postcode_name = item.name.strip().upper()
variant_base = None
else:
- postcode_name = analyzer.normalize(item.name)
+ postcode_name = analyzer.get_canonical_id(item)
variant_base = item.get_attr("variant")
if variant_base:
variants = {term}
if analyzer is not None and variant_base:
- variants.update(analyzer.get_variants_ascii(variant_base))
+ variants.update(analyzer.compute_variants(variant_base))
with self.conn.cursor() as cur:
cur.execute("SELECT create_postcode_word(%s, %s)",