X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/18b16e06ca9e89136cb8445b00d7471840fd22e2..a7a920a9a5b55bf4290b184f05898a8589c95b40:/nominatim/tokenizer/legacy_tokenizer.py diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py index 848d6191..93808cc3 100644 --- a/nominatim/tokenizer/legacy_tokenizer.py +++ b/nominatim/tokenizer/legacy_tokenizer.py @@ -7,7 +7,8 @@ """ Tokenizer implementing normalisation as used before Nominatim 4. """ -from typing import Optional, Sequence, List, Tuple, Mapping, Any, Callable, cast, Dict, Set +from typing import Optional, Sequence, List, Tuple, Mapping, Any, Callable, \ + cast, Dict, Set, Iterable from collections import OrderedDict import logging from pathlib import Path @@ -105,6 +106,7 @@ class LegacyTokenizer(AbstractTokenizer): This copies all necessary data in the project directory to make sure the tokenizer remains stable even over updates. """ + assert config.project_dir is not None module_dir = _install_module(config.DATABASE_MODULE_PATH, config.lib_dir.module, config.project_dir / 'module') @@ -126,6 +128,8 @@ class LegacyTokenizer(AbstractTokenizer): def init_from_project(self, config: Configuration) -> None: """ Initialise the tokenizer from the project directory. """ + assert config.project_dir is not None + with connect(self.dsn) as conn: self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION) @@ -148,6 +152,8 @@ class LegacyTokenizer(AbstractTokenizer): def update_sql_functions(self, config: Configuration) -> None: """ Reimport the SQL functions for this tokenizer. """ + assert config.project_dir is not None + with connect(self.dsn) as conn: max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ) modulepath = config.DATABASE_MODULE_PATH or \ @@ -192,6 +198,8 @@ class LegacyTokenizer(AbstractTokenizer): This is a special migration function for updating existing databases to new software versions. """ + assert config.project_dir is not None + self.normalization = config.TERM_NORMALIZATION module_dir = _install_module(config.DATABASE_MODULE_PATH, config.lib_dir.module, @@ -202,7 +210,7 @@ class LegacyTokenizer(AbstractTokenizer): self._save_config(conn, config) - def update_statistics(self) -> None: + def update_statistics(self, config: Configuration, threads: int = 1) -> None: """ Recompute the frequency of full words. """ with connect(self.dsn) as conn: @@ -248,18 +256,29 @@ class LegacyTokenizer(AbstractTokenizer): return LegacyNameAnalyzer(self.dsn, normalizer) + def most_frequent_words(self, conn: Connection, num: int) -> List[str]: + """ Return a list of the `num` most frequent full words + in the database. + """ + with conn.cursor() as cur: + cur.execute(""" SELECT word FROM word WHERE word is not null + ORDER BY search_name_count DESC LIMIT %s""", (num,)) + return list(s[0] for s in cur) + + def _install_php(self, config: Configuration, overwrite: bool = True) -> None: """ Install the php script for the tokenizer. """ - php_file = self.data_dir / "tokenizer.php" + if config.lib_dir.php is not None: + php_file = self.data_dir / "tokenizer.php" - if not php_file.exists() or overwrite: - php_file.write_text(dedent(f"""\ - None: @@ -392,7 +411,7 @@ class LegacyNameAnalyzer(AbstractAnalyzer): - def update_special_phrases(self, phrases: Sequence[Tuple[str, str, str, str]], + def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]], should_replace: bool) -> None: """ Replace the search index for special phrases with the new phrases. """ @@ -543,8 +562,9 @@ class _TokenInfo: with conn.cursor() as cur: cur.execute("SELECT * FROM create_housenumbers(%s)", (simple_list, )) - self.data['hnr_tokens'], self.data['hnr'] = \ - cur.fetchone() # type: ignore[no-untyped-call] + result = cur.fetchone() + assert result is not None + self.data['hnr_tokens'], self.data['hnr'] = result def set_postcode(self, postcode: str) -> None: @@ -555,14 +575,13 @@ class _TokenInfo: def add_street(self, conn: Connection, street: str) -> None: """ Add addr:street match terms. """ - def _get_street(name: str) -> List[int]: + def _get_street(name: str) -> Optional[str]: with conn.cursor() as cur: - return cast(List[int], + return cast(Optional[str], cur.scalar("SELECT word_ids_from_name(%s)::text", (name, ))) tokens = self.cache.streets.get(street, _get_street) - if tokens: - self.data['street'] = tokens + self.data['street'] = tokens or '{}' def add_place(self, conn: Connection, place: str) -> None: @@ -573,8 +592,7 @@ class _TokenInfo: cur.execute("""SELECT make_keywords(hstore('name' , %s))::text, word_ids_from_name(%s)::text""", (name, name)) - return cast(Tuple[List[int], List[int]], - cur.fetchone()) # type: ignore[no-untyped-call] + return cast(Tuple[List[int], List[int]], cur.fetchone()) self.data['place_search'], self.data['place_match'] = \ self.cache.places.get(place, _get_place) @@ -588,8 +606,7 @@ class _TokenInfo: cur.execute("""SELECT addr_ids_from_name(%s)::text, word_ids_from_name(%s)::text""", (name, name)) - return cast(Tuple[List[int], List[int]], - cur.fetchone()) # type: ignore[no-untyped-call] + return cast(Tuple[List[int], List[int]], cur.fetchone()) tokens = {} for key, value in terms: