X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/2bd6c761b0e573189a03e8819c6db715eb711eb1..5972eb8ee06d9fdbef948b2d260a6aeb50541e57:/nominatim/tokenizer/icu_tokenizer.py diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index 319838a1..799ff559 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -183,6 +183,18 @@ class ICUTokenizer(AbstractTokenizer): self.loader.make_token_analysis()) + def most_frequent_words(self, conn: Connection, num: int) -> List[str]: + """ Return a list of the `num` most frequent full words + in the database. + """ + with conn.cursor() as cur: + cur.execute("""SELECT word, sum((info->>'count')::int) as count + FROM word WHERE type = 'W' + GROUP BY word + ORDER BY count DESC LIMIT %s""", (num,)) + return list(s[0].split('@')[0] for s in cur) + + def _install_php(self, phpdir: Path, overwrite: bool = True) -> None: """ Install the php script for the tokenizer. """ @@ -566,8 +578,9 @@ class ICUNameAnalyzer(AbstractAnalyzer): result = self._cache.housenumbers.get(norm_name, result) if result[0] is None: with self.conn.cursor() as cur: - cur.execute("SELECT getorcreate_hnr_id(%s)", (norm_name, )) - result = cur.fetchone()[0], norm_name # type: ignore[no-untyped-call] + hid = cur.scalar("SELECT getorcreate_hnr_id(%s)", (norm_name, )) + + result = hid, norm_name self._cache.housenumbers[norm_name] = result else: # Otherwise use the analyzer to determine the canonical name. @@ -580,9 +593,9 @@ class ICUNameAnalyzer(AbstractAnalyzer): variants = analyzer.compute_variants(word_id) if variants: with self.conn.cursor() as cur: - cur.execute("SELECT create_analyzed_hnr_id(%s, %s)", - (word_id, list(variants))) - result = cur.fetchone()[0], variants[0] # type: ignore[no-untyped-call] + hid = cur.scalar("SELECT create_analyzed_hnr_id(%s, %s)", + (word_id, list(variants))) + result = hid, variants[0] self._cache.housenumbers[word_id] = result return result @@ -665,8 +678,7 @@ class ICUNameAnalyzer(AbstractAnalyzer): with self.conn.cursor() as cur: cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)", (token_id, variants)) - full, part = cast(Tuple[int, List[int]], - cur.fetchone()) # type: ignore[no-untyped-call] + full, part = cast(Tuple[int, List[int]], cur.fetchone()) self._cache.names[token_id] = (full, part) @@ -720,7 +732,7 @@ class _TokenInfo: self.names: Optional[str] = None self.housenumbers: Set[str] = set() self.housenumber_tokens: Set[int] = set() - self.street_tokens: Set[int] = set() + self.street_tokens: Optional[Set[int]] = None self.place_tokens: Set[int] = set() self.address_tokens: Dict[str, str] = {} self.postcode: Optional[str] = None @@ -742,7 +754,7 @@ class _TokenInfo: out['hnr'] = ';'.join(self.housenumbers) out['hnr_tokens'] = self._mk_array(self.housenumber_tokens) - if self.street_tokens: + if self.street_tokens is not None: out['street'] = self._mk_array(self.street_tokens) if self.place_tokens: @@ -776,6 +788,8 @@ class _TokenInfo: def add_street(self, tokens: Iterable[int]) -> None: """ Add addr:street match terms. """ + if self.street_tokens is None: + self.street_tokens = set() self.street_tokens.update(tokens)