Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
-from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, Dict, Set, Iterable
+from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \
+ Dict, Set, Iterable
import itertools
import json
import logging
from nominatim.data.place_info import PlaceInfo
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
-from nominatim.tokenizer.sanitizers.base import PlaceName
+from nominatim.data.place_name import PlaceName
from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
class ICUTokenizer(AbstractTokenizer):
- """ This tokenizer uses libICU to covert names and queries to ASCII.
+ """ This tokenizer uses libICU to convert names and queries to ASCII.
Otherwise it uses the same algorithms and data structures as the
normalization routines in Nominatim 3.
"""
self.loader.make_token_analysis())
+ def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
+ """ Return a list of the `num` most frequent full words
+ in the database.
+ """
+ with conn.cursor() as cur:
+ cur.execute("""SELECT word, sum((info->>'count')::int) as count
+ FROM word WHERE type = 'W'
+ GROUP BY word
+ ORDER BY count DESC LIMIT %s""", (num,))
+ return list(s[0].split('@')[0] for s in cur)
+
+
def _install_php(self, phpdir: Path, overwrite: bool = True) -> None:
""" Install the php script for the tokenizer.
"""
postcode_name = place.name.strip().upper()
variant_base = None
else:
- postcode_name = analyzer.normalize(place.name)
+ postcode_name = analyzer.get_canonical_id(place)
variant_base = place.get_attr("variant")
if variant_base:
if analyzer is None:
variants = [term]
else:
- variants = analyzer.get_variants_ascii(variant)
+ variants = analyzer.compute_variants(variant)
if term not in variants:
variants.append(term)
else:
- def update_special_phrases(self, phrases: Sequence[Tuple[str, str, str, str]],
+ def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
should_replace: bool) -> None:
""" Replace the search index for special phrases with the new phrases.
If `should_replace` is True, then the previous set of will be
def _remove_special_phrases(self, cursor: Cursor,
new_phrases: Set[Tuple[str, str, str, str]],
existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
- """ Remove all phrases from the databse that are no longer in the
+ """ Remove all phrases from the database that are no longer in the
new phrase list.
"""
to_delete = existing_phrases - new_phrases
result = self._cache.housenumbers.get(norm_name, result)
if result[0] is None:
with self.conn.cursor() as cur:
- cur.execute("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
- result = cur.fetchone()[0], norm_name # type: ignore[no-untyped-call]
+ hid = cur.scalar("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
+
+ result = hid, norm_name
self._cache.housenumbers[norm_name] = result
else:
# Otherwise use the analyzer to determine the canonical name.
# Per convention we use the first variant as the 'lookup name', the
# name that gets saved in the housenumber field of the place.
- norm_name = analyzer.normalize(hnr.name)
- if norm_name:
- result = self._cache.housenumbers.get(norm_name, result)
+ word_id = analyzer.get_canonical_id(hnr)
+ if word_id:
+ result = self._cache.housenumbers.get(word_id, result)
if result[0] is None:
- variants = analyzer.get_variants_ascii(norm_name)
+ variants = analyzer.compute_variants(word_id)
if variants:
with self.conn.cursor() as cur:
- cur.execute("SELECT create_analyzed_hnr_id(%s, %s)",
- (norm_name, list(variants)))
- result = cur.fetchone()[0], variants[0] # type: ignore[no-untyped-call]
- self._cache.housenumbers[norm_name] = result
+ hid = cur.scalar("SELECT create_analyzed_hnr_id(%s, %s)",
+ (word_id, list(variants)))
+ result = hid, variants[0]
+ self._cache.housenumbers[word_id] = result
return result
def _retrieve_full_tokens(self, name: str) -> List[int]:
""" Get the full name token for the given name, if it exists.
- The name is only retrived for the standard analyser.
+ The name is only retrieved for the standard analyser.
"""
assert self.conn is not None
norm_name = self._search_normalized(name)
for name in names:
analyzer_id = name.get_attr('analyzer')
analyzer = self.token_analysis.get_analyzer(analyzer_id)
- norm_name = analyzer.normalize(name.name)
+ word_id = analyzer.get_canonical_id(name)
if analyzer_id is None:
- token_id = norm_name
+ token_id = word_id
else:
- token_id = f'{norm_name}@{analyzer_id}'
+ token_id = f'{word_id}@{analyzer_id}'
full, part = self._cache.names.get(token_id, (None, None))
if full is None:
- variants = analyzer.get_variants_ascii(norm_name)
+ variants = analyzer.compute_variants(word_id)
if not variants:
continue
with self.conn.cursor() as cur:
cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
(token_id, variants))
- full, part = cast(Tuple[int, List[int]],
- cur.fetchone()) # type: ignore[no-untyped-call]
+ full, part = cast(Tuple[int, List[int]], cur.fetchone())
self._cache.names[token_id] = (full, part)
postcode_name = item.name.strip().upper()
variant_base = None
else:
- postcode_name = analyzer.normalize(item.name)
+ postcode_name = analyzer.get_canonical_id(item)
variant_base = item.get_attr("variant")
if variant_base:
variants = {term}
if analyzer is not None and variant_base:
- variants.update(analyzer.get_variants_ascii(variant_base))
+ variants.update(analyzer.compute_variants(variant_base))
with self.conn.cursor() as cur:
cur.execute("SELECT create_postcode_word(%s, %s)",
self.names: Optional[str] = None
self.housenumbers: Set[str] = set()
self.housenumber_tokens: Set[int] = set()
- self.street_tokens: Set[int] = set()
+ self.street_tokens: Optional[Set[int]] = None
self.place_tokens: Set[int] = set()
self.address_tokens: Dict[str, str] = {}
self.postcode: Optional[str] = None
out['hnr'] = ';'.join(self.housenumbers)
out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)
- if self.street_tokens:
+ if self.street_tokens is not None:
out['street'] = self._mk_array(self.street_tokens)
if self.place_tokens:
def add_street(self, tokens: Iterable[int]) -> None:
""" Add addr:street match terms.
"""
+ if self.street_tokens is None:
+ self.street_tokens = set()
self.street_tokens.update(tokens)