X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/1618aba5f282a27fc45af28c4eeebb6dcd28c332..1e9f37ab82db1758235bedf83c659693f4ca6c3e:/nominatim/tokenizer/legacy_icu_tokenizer.py?ds=sidebyside diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py index 32dd6535..a887ae28 100644 --- a/nominatim/tokenizer/legacy_icu_tokenizer.py +++ b/nominatim/tokenizer/legacy_icu_tokenizer.py @@ -4,6 +4,7 @@ libICU instead of the PostgreSQL module. """ from collections import Counter import itertools +import json import logging import re from textwrap import dedent @@ -74,13 +75,10 @@ class LegacyICUTokenizer: self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ) - def finalize_import(self, config): + def finalize_import(self, _): """ Do any required postprocessing to make the tokenizer data ready for use. """ - with connect(self.dsn) as conn: - sqlp = SQLPreprocessor(conn, config) - sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql') def update_sql_functions(self, config): @@ -121,18 +119,17 @@ class LegacyICUTokenizer: """ return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules)) - # pylint: disable=missing-format-attribute + def _install_php(self, phpdir): """ Install the php script for the tokenizer. """ php_file = self.data_dir / "tokenizer.php" - php_file.write_text(dedent("""\ + php_file.write_text(dedent(f"""\ >'class' = in_class and info->>'type' = in_type + and ((op = '-' and info->>'op' is null) or op = info->>'op') + """, to_delete) return len(to_delete) @@ -378,14 +385,14 @@ class LegacyICUNameAnalyzer: with self.conn.cursor() as cur: # Get existing names cur.execute("""SELECT word_token FROM word - WHERE type = 'C' and info->>'cc'= %s""", + WHERE type = 'C' and word = %s""", (country_code, )) word_tokens.difference_update((t[0] for t in cur)) # Only add those names that are not yet in the list. if word_tokens: - cur.execute("""INSERT INTO word (word_token, type, info) - (SELECT token, 'C', json_build_object('cc', %s) + cur.execute("""INSERT INTO word (word_token, type, word) + (SELECT token, 'C', %s FROM unnest(%s) as token) """, (country_code, list(word_tokens))) @@ -503,14 +510,12 @@ class LegacyICUNameAnalyzer: with self.conn.cursor() as cur: # no word_id needed for postcodes - cur.execute("""INSERT INTO word (word, word_token, class, type, - search_name_count) - (SELECT pc, %s, 'place', 'postcode', 0 - FROM (VALUES (%s)) as v(pc) + cur.execute("""INSERT INTO word (word_token, type, word) + (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc) WHERE NOT EXISTS (SELECT * FROM word - WHERE word = pc and class='place' and type='postcode')) - """, (' ' + term, postcode)) + WHERE type = 'P' and word = pc)) + """, (term, postcode)) self._cache.postcodes.add(postcode) @@ -601,7 +606,8 @@ class _TokenCache: def get_hnr_tokens(self, conn, terms): """ Get token ids for a list of housenumbers, looking them up in the - database if necessary. + database if necessary. `terms` is an iterable of normalized + housenumbers. """ tokens = [] askdb = []