X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/0bb59b2e225d9015462cbf069f83d6ed75eaaf3a..5c2c0604805452f8235e7574b7e4986f9f89802e:/nominatim/tokenizer/icu_tokenizer.py?ds=sidebyside diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index 05c5a3ea..bf5544ed 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -51,7 +51,7 @@ class LegacyICUTokenizer(AbstractTokenizer): """ self.loader = ICURuleLoader(config) - self._install_php(config.lib_dir.php) + self._install_php(config.lib_dir.php, overwrite=True) self._save_config() if init_db: @@ -67,6 +67,8 @@ class LegacyICUTokenizer(AbstractTokenizer): with connect(self.dsn) as conn: self.loader.load_config_from_db(conn) + self._install_php(config.lib_dir.php, overwrite=False) + def finalize_import(self, config): """ Do any required postprocessing to make the tokenizer data ready @@ -119,12 +121,13 @@ class LegacyICUTokenizer(AbstractTokenizer): if not conn.table_exists('search_name'): return with conn.cursor(name="hnr_counter") as cur: - cur.execute("""SELECT word_id, word_token FROM word + cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token) + FROM word WHERE type = 'H' AND NOT EXISTS(SELECT * FROM search_name WHERE ARRAY[word.word_id] && name_vector) - AND (char_length(word_token) > 6 - OR word_token not similar to '\\d+') + AND (char_length(coalesce(word, word_token)) > 6 + OR coalesce(word, word_token) not similar to '\\d+') """) candidates = {token: wid for wid, token in cur} with conn.cursor(name="hnr_counter") as cur: @@ -137,6 +140,7 @@ class LegacyICUTokenizer(AbstractTokenizer): for hnr in row[0].split(';'): candidates.pop(hnr, None) LOG.info("There are %s outdated housenumbers.", len(candidates)) + LOG.debug("Outdated housenumbers: %s", candidates.keys()) if candidates: with conn.cursor() as cur: cur.execute("""DELETE FROM word WHERE word_id = any(%s)""", @@ -172,16 +176,18 @@ class LegacyICUTokenizer(AbstractTokenizer): self.loader.make_token_analysis()) - def _install_php(self, phpdir): + def _install_php(self, phpdir, overwrite=True): """ Install the php script for the tokenizer. """ php_file = self.data_dir / "tokenizer.php" - php_file.write_text(dedent(f"""\ -