X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/7d911f9ffbdf63b2b2a45c3a3ee7063d006a5779..28101967ef5b89b85f492304b1002b8a85e6fe21:/nominatim/tokenizer/legacy_tokenizer.py?ds=sidebyside
diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py
index e09700d9..93808cc3 100644
--- a/nominatim/tokenizer/legacy_tokenizer.py
+++ b/nominatim/tokenizer/legacy_tokenizer.py
@@ -210,7 +210,7 @@ class LegacyTokenizer(AbstractTokenizer):
             self._save_config(conn, config)
 
 
-    def update_statistics(self) -> None:
+    def update_statistics(self, config: Configuration, threads: int = 1) -> None:
         """ Recompute the frequency of full words.
         """
         with connect(self.dsn) as conn:
@@ -256,18 +256,29 @@ class LegacyTokenizer(AbstractTokenizer):
         return LegacyNameAnalyzer(self.dsn, normalizer)
 
 
+    def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
+        """ Return a list of the `num` most frequent full words
+            in the database.
+        """
+        with conn.cursor() as cur:
+            cur.execute(""" SELECT word FROM word WHERE word is not null
+                              ORDER BY search_name_count DESC LIMIT %s""", (num,))
+            return list(s[0] for s in cur)
+
+
     def _install_php(self, config: Configuration, overwrite: bool = True) -> None:
         """ Install the php script for the tokenizer.
         """
-        php_file = self.data_dir / "tokenizer.php"
-
-        if not php_file.exists() or overwrite:
-            php_file.write_text(dedent(f"""\
- None: