X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/8d082c13e01f2c271318bb6ccf2a2f8e5000e315..5972eb8ee06d9fdbef948b2d260a6aeb50541e57:/nominatim/tokenizer/icu_tokenizer.py

diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py
index 79f383f6..799ff559 100644
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -183,6 +183,18 @@ class ICUTokenizer(AbstractTokenizer):
                                self.loader.make_token_analysis())
 
 
+    def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
+        """ Return a list of the `num` most frequent full words
+            in the database.
+        """
+        with conn.cursor() as cur:
+            cur.execute("""SELECT word, sum((info->>'count')::int) as count
+                             FROM word WHERE type = 'W'
+                             GROUP BY word
+                             ORDER BY count DESC LIMIT %s""", (num,))
+            return list(s[0].split('@')[0] for s in cur)
+
+
     def _install_php(self, phpdir: Path, overwrite: bool = True) -> None:
         """ Install the php script for the tokenizer.
         """
@@ -720,7 +732,7 @@ class _TokenInfo:
         self.names: Optional[str] = None
         self.housenumbers: Set[str] = set()
         self.housenumber_tokens: Set[int] = set()
-        self.street_tokens: Set[int] = set()
+        self.street_tokens: Optional[Set[int]] = None
         self.place_tokens: Set[int] = set()
         self.address_tokens: Dict[str, str] = {}
         self.postcode: Optional[str] = None
@@ -742,7 +754,7 @@ class _TokenInfo:
             out['hnr'] = ';'.join(self.housenumbers)
             out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)
 
-        if self.street_tokens:
+        if self.street_tokens is not None:
             out['street'] = self._mk_array(self.street_tokens)
 
         if self.place_tokens:
@@ -776,6 +788,8 @@ class _TokenInfo:
     def add_street(self, tokens: Iterable[int]) -> None:
         """ Add addr:street match terms.
         """
+        if self.street_tokens is None:
+            self.street_tokens = set()
         self.street_tokens.update(tokens)
 
 