X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/34d27ed45cac67fd155f6997d82fa6a5d8946ce9..fd26310d6adc5fc5685bdd0de36afa66e85b9c9c:/nominatim/tokenizer/icu_tokenizer.py diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index 98c80376..799ff559 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -183,6 +183,18 @@ class ICUTokenizer(AbstractTokenizer): self.loader.make_token_analysis()) + def most_frequent_words(self, conn: Connection, num: int) -> List[str]: + """ Return a list of the `num` most frequent full words + in the database. + """ + with conn.cursor() as cur: + cur.execute("""SELECT word, sum((info->>'count')::int) as count + FROM word WHERE type = 'W' + GROUP BY word + ORDER BY count DESC LIMIT %s""", (num,)) + return list(s[0].split('@')[0] for s in cur) + + def _install_php(self, phpdir: Path, overwrite: bool = True) -> None: """ Install the php script for the tokenizer. """ @@ -324,7 +336,7 @@ class ICUNameAnalyzer(AbstractAnalyzer): postcode_name = place.name.strip().upper() variant_base = None else: - postcode_name = analyzer.normalize(place.name) + postcode_name = analyzer.get_canonical_id(place) variant_base = place.get_attr("variant") if variant_base: @@ -359,7 +371,7 @@ class ICUNameAnalyzer(AbstractAnalyzer): if analyzer is None: variants = [term] else: - variants = analyzer.get_variants_ascii(variant) + variants = analyzer.compute_variants(variant) if term not in variants: variants.append(term) else: @@ -566,24 +578,25 @@ class ICUNameAnalyzer(AbstractAnalyzer): result = self._cache.housenumbers.get(norm_name, result) if result[0] is None: with self.conn.cursor() as cur: - cur.execute("SELECT getorcreate_hnr_id(%s)", (norm_name, )) - result = cur.fetchone()[0], norm_name # type: ignore[no-untyped-call] + hid = cur.scalar("SELECT getorcreate_hnr_id(%s)", (norm_name, )) + + result = hid, norm_name self._cache.housenumbers[norm_name] = result else: # Otherwise use the analyzer to determine the canonical name. # Per convention we use the first variant as the 'lookup name', the # name that gets saved in the housenumber field of the place. - norm_name = analyzer.normalize(hnr.name) - if norm_name: - result = self._cache.housenumbers.get(norm_name, result) + word_id = analyzer.get_canonical_id(hnr) + if word_id: + result = self._cache.housenumbers.get(word_id, result) if result[0] is None: - variants = analyzer.get_variants_ascii(norm_name) + variants = analyzer.compute_variants(word_id) if variants: with self.conn.cursor() as cur: - cur.execute("SELECT create_analyzed_hnr_id(%s, %s)", - (norm_name, list(variants))) - result = cur.fetchone()[0], variants[0] # type: ignore[no-untyped-call] - self._cache.housenumbers[norm_name] = result + hid = cur.scalar("SELECT create_analyzed_hnr_id(%s, %s)", + (word_id, list(variants))) + result = hid, variants[0] + self._cache.housenumbers[word_id] = result return result @@ -650,23 +663,22 @@ class ICUNameAnalyzer(AbstractAnalyzer): for name in names: analyzer_id = name.get_attr('analyzer') analyzer = self.token_analysis.get_analyzer(analyzer_id) - norm_name = analyzer.normalize(name.name) + word_id = analyzer.get_canonical_id(name) if analyzer_id is None: - token_id = norm_name + token_id = word_id else: - token_id = f'{norm_name}@{analyzer_id}' + token_id = f'{word_id}@{analyzer_id}' full, part = self._cache.names.get(token_id, (None, None)) if full is None: - variants = analyzer.get_variants_ascii(norm_name) + variants = analyzer.compute_variants(word_id) if not variants: continue with self.conn.cursor() as cur: cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)", (token_id, variants)) - full, part = cast(Tuple[int, List[int]], - cur.fetchone()) # type: ignore[no-untyped-call] + full, part = cast(Tuple[int, List[int]], cur.fetchone()) self._cache.names[token_id] = (full, part) @@ -688,7 +700,7 @@ class ICUNameAnalyzer(AbstractAnalyzer): postcode_name = item.name.strip().upper() variant_base = None else: - postcode_name = analyzer.normalize(item.name) + postcode_name = analyzer.get_canonical_id(item) variant_base = item.get_attr("variant") if variant_base: @@ -703,7 +715,7 @@ class ICUNameAnalyzer(AbstractAnalyzer): variants = {term} if analyzer is not None and variant_base: - variants.update(analyzer.get_variants_ascii(variant_base)) + variants.update(analyzer.compute_variants(variant_base)) with self.conn.cursor() as cur: cur.execute("SELECT create_postcode_word(%s, %s)", @@ -720,7 +732,7 @@ class _TokenInfo: self.names: Optional[str] = None self.housenumbers: Set[str] = set() self.housenumber_tokens: Set[int] = set() - self.street_tokens: Set[int] = set() + self.street_tokens: Optional[Set[int]] = None self.place_tokens: Set[int] = set() self.address_tokens: Dict[str, str] = {} self.postcode: Optional[str] = None @@ -742,7 +754,7 @@ class _TokenInfo: out['hnr'] = ';'.join(self.housenumbers) out['hnr_tokens'] = self._mk_array(self.housenumber_tokens) - if self.street_tokens: + if self.street_tokens is not None: out['street'] = self._mk_array(self.street_tokens) if self.place_tokens: @@ -776,6 +788,8 @@ class _TokenInfo: def add_street(self, tokens: Iterable[int]) -> None: """ Add addr:street match terms. """ + if self.street_tokens is None: + self.street_tokens = set() self.street_tokens.update(tokens)