X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/86588419fb1c3fffe131c0e8d99ecea3c77d67c5..a6dab5e300de9c5664f714eb2c9290d18f01067f:/nominatim/tokenizer/icu_tokenizer.py diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index cfbb44e3..171d4392 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -11,13 +11,12 @@ libICU instead of the PostgreSQL module. import itertools import json import logging -import re from textwrap import dedent from nominatim.db.connection import connect from nominatim.db.utils import CopyBuffer from nominatim.db.sql_preprocessor import SQLPreprocessor -from nominatim.indexer.place_info import PlaceInfo +from nominatim.data.place_info import PlaceInfo from nominatim.tokenizer.icu_rule_loader import ICURuleLoader from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer @@ -51,7 +50,7 @@ class LegacyICUTokenizer(AbstractTokenizer): """ self.loader = ICURuleLoader(config) - self._install_php(config.lib_dir.php) + self._install_php(config.lib_dir.php, overwrite=True) self._save_config() if init_db: @@ -67,6 +66,8 @@ class LegacyICUTokenizer(AbstractTokenizer): with connect(self.dsn) as conn: self.loader.load_config_from_db(conn) + self._install_php(config.lib_dir.php, overwrite=False) + def finalize_import(self, config): """ Do any required postprocessing to make the tokenizer data ready @@ -112,6 +113,49 @@ class LegacyICUTokenizer(AbstractTokenizer): conn.commit() + def _cleanup_housenumbers(self): + """ Remove unused house numbers. + """ + with connect(self.dsn) as conn: + if not conn.table_exists('search_name'): + return + with conn.cursor(name="hnr_counter") as cur: + cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token) + FROM word + WHERE type = 'H' + AND NOT EXISTS(SELECT * FROM search_name + WHERE ARRAY[word.word_id] && name_vector) + AND (char_length(coalesce(word, word_token)) > 6 + OR coalesce(word, word_token) not similar to '\\d+') + """) + candidates = {token: wid for wid, token in cur} + with conn.cursor(name="hnr_counter") as cur: + cur.execute("""SELECT housenumber FROM placex + WHERE housenumber is not null + AND (char_length(housenumber) > 6 + OR housenumber not similar to '\\d+') + """) + for row in cur: + for hnr in row[0].split(';'): + candidates.pop(hnr, None) + LOG.info("There are %s outdated housenumbers.", len(candidates)) + LOG.debug("Outdated housenumbers: %s", candidates.keys()) + if candidates: + with conn.cursor() as cur: + cur.execute("""DELETE FROM word WHERE word_id = any(%s)""", + (list(candidates.values()), )) + conn.commit() + + + + def update_word_tokens(self): + """ Remove unused tokens. + """ + LOG.warning("Cleaning up housenumber tokens.") + self._cleanup_housenumbers() + LOG.warning("Tokenizer house-keeping done.") + + def name_analyzer(self): """ Create a new analyzer for tokenizing names and queries using this tokinzer. Analyzers are context managers and should @@ -131,16 +175,18 @@ class LegacyICUTokenizer(AbstractTokenizer): self.loader.make_token_analysis()) - def _install_php(self, phpdir): + def _install_php(self, phpdir, overwrite=True): """ Install the php script for the tokenizer. """ php_file = self.data_dir / "tokenizer.php" - php_file.write_text(dedent(f"""\ -