X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/90b29aa8080ffae08084cf556b45e05f9f2d3198..1c33cb3186a38ceb5cc4de0975ae1956c861f9b5:/nominatim/tokenizer/legacy_tokenizer.py?ds=sidebyside

diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py
index a50dedb2..1b68a494 100644
--- a/nominatim/tokenizer/legacy_tokenizer.py
+++ b/nominatim/tokenizer/legacy_tokenizer.py
@@ -256,6 +256,16 @@ class LegacyTokenizer(AbstractTokenizer):
         return LegacyNameAnalyzer(self.dsn, normalizer)
 
 
+    def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
+        """ Return a list of the `num` most frequent full words
+            in the database.
+        """
+        with conn.cursor() as cur:
+            cur.execute(""" SELECT word FROM word WHERE word is not null
+                              ORDER BY search_name_count DESC LIMIT %s""", (num,))
+            return list(s[0] for s in cur)
+
+
     def _install_php(self, config: Configuration, overwrite: bool = True) -> None:
         """ Install the php script for the tokenizer.
         """
@@ -564,14 +574,13 @@ class _TokenInfo:
     def add_street(self, conn: Connection, street: str) -> None:
         """ Add addr:street match terms.
         """
-        def _get_street(name: str) -> List[int]:
+        def _get_street(name: str) -> Optional[str]:
             with conn.cursor() as cur:
-                return cast(List[int],
+                return cast(Optional[str],
                             cur.scalar("SELECT word_ids_from_name(%s)::text", (name, )))
 
         tokens = self.cache.streets.get(street, _get_street)
-        if tokens:
-            self.data['street'] = tokens
+        self.data['street'] = tokens or '{}'
 
 
     def add_place(self, conn: Connection, place: str) -> None: