X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/45d13bc295dbe83940ae47aaa4ee3b9032a46df4..a96b6a1289e3a595b2d3753a1a038abc3f19721a:/nominatim/tokenizer/legacy_tokenizer.py diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py index a50dedb2..93808cc3 100644 --- a/nominatim/tokenizer/legacy_tokenizer.py +++ b/nominatim/tokenizer/legacy_tokenizer.py @@ -210,7 +210,7 @@ class LegacyTokenizer(AbstractTokenizer): self._save_config(conn, config) - def update_statistics(self) -> None: + def update_statistics(self, config: Configuration, threads: int = 1) -> None: """ Recompute the frequency of full words. """ with connect(self.dsn) as conn: @@ -256,18 +256,29 @@ class LegacyTokenizer(AbstractTokenizer): return LegacyNameAnalyzer(self.dsn, normalizer) + def most_frequent_words(self, conn: Connection, num: int) -> List[str]: + """ Return a list of the `num` most frequent full words + in the database. + """ + with conn.cursor() as cur: + cur.execute(""" SELECT word FROM word WHERE word is not null + ORDER BY search_name_count DESC LIMIT %s""", (num,)) + return list(s[0] for s in cur) + + def _install_php(self, config: Configuration, overwrite: bool = True) -> None: """ Install the php script for the tokenizer. """ - php_file = self.data_dir / "tokenizer.php" + if config.lib_dir.php is not None: + php_file = self.data_dir / "tokenizer.php" - if not php_file.exists() or overwrite: - php_file.write_text(dedent(f"""\ - None: @@ -564,14 +575,13 @@ class _TokenInfo: def add_street(self, conn: Connection, street: str) -> None: """ Add addr:street match terms. """ - def _get_street(name: str) -> List[int]: + def _get_street(name: str) -> Optional[str]: with conn.cursor() as cur: - return cast(List[int], + return cast(Optional[str], cur.scalar("SELECT word_ids_from_name(%s)::text", (name, ))) tokens = self.cache.streets.get(street, _get_street) - if tokens: - self.data['street'] = tokens + self.data['street'] = tokens or '{}' def add_place(self, conn: Connection, place: str) -> None: