X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/b4fec57b6d53f8e8a45c46ff11f13cbcbea1006a..70f154be8b69d3b57eebd25eff225ee29ccc97ba:/nominatim/tokenizer/legacy_icu_tokenizer.py

diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py
index 12ee0404..14fa5b60 100644
--- a/nominatim/tokenizer/legacy_icu_tokenizer.py
+++ b/nominatim/tokenizer/legacy_icu_tokenizer.py
@@ -9,8 +9,6 @@ import re
 from textwrap import dedent
 from pathlib import Path
 
-import psycopg2.extras
-
 from nominatim.db.connection import connect
 from nominatim.db.properties import set_property, get_property
 from nominatim.db.utils import CopyBuffer
@@ -76,13 +74,11 @@ class LegacyICUTokenizer:
             self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)
 
 
-    def finalize_import(self, config):
+    def finalize_import(self, _):
         """ Do any required postprocessing to make the tokenizer data ready
             for use.
         """
-        with connect(self.dsn) as conn:
-            sqlp = SQLPreprocessor(conn, config)
-            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
+        pass
 
 
     def update_sql_functions(self, config):
@@ -123,18 +119,17 @@ class LegacyICUTokenizer:
         """
         return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
 
-    # pylint: disable=missing-format-attribute
+
     def _install_php(self, phpdir):
         """ Install the php script for the tokenizer.
         """
         php_file = self.data_dir / "tokenizer.php"
-        php_file.write_text(dedent("""\
+        php_file.write_text(dedent(f"""\
             <?php
-            @define('CONST_Max_Word_Frequency', {0.max_word_frequency});
-            @define('CONST_Term_Normalization_Rules', "{0.term_normalization}");
-            @define('CONST_Transliteration', "{0.naming_rules.search_rules}");
-            require_once('{1}/tokenizer/legacy_icu_tokenizer.php');
-            """.format(self, phpdir)))
+            @define('CONST_Max_Word_Frequency', {self.max_word_frequency});
+            @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
+            @define('CONST_Transliteration', "{self.naming_rules.search_rules}");
+            require_once('{phpdir}/tokenizer/legacy_icu_tokenizer.php');"""))
 
 
     def _save_config(self, config):
[...]
-                              (SELECT word FROM word
-                                WHERE class ='place' and type = 'postcode') w
+                              (SELECT info->>'postcode' as word FROM word
+                                WHERE type = 'P') w
                               ON pc = word) x
                            WHERE pc is null or word is null""")
@@ -288,24 +286,23 @@ class LegacyICUNameAnalyzer:
                 if postcode is None:
                     to_delete.append(word)
                 else:
-                    copystr.add(
-                        postcode,
-                        ' ' + self.name_processor.get_search_normalized(postcode),
-                        'place', 'postcode', 0)
+                    copystr.add(self.name_processor.get_search_normalized(postcode),
+                                'P', {'postcode': postcode})
 
             if to_delete:
                 cur.execute("""DELETE FROM WORD
-                               WHERE class ='place' and type = 'postcode'
-                                     and word = any(%s)
+                               WHERE type ='P' and info->>'postcode' = any(%s)
                             """, (to_delete, ))
 
             copystr.copy_out(cur, 'word',
-                             columns=['word', 'word_token', 'class', 'type',
-                                      'search_name_count'])
+                             columns=['word_token', 'type', 'info'])
 
 
     def update_special_phrases(self, phrases, should_replace):
         """ Replace the search index for special phrases with the new phrases.
+            If `should_replace` is True, then the previous set of phrases
+            will be completely replaced. Otherwise the phrases are added
+            to the already existing ones.
         """
         norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
                             for p in phrases))
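The hunks above move postcodes from the old dedicated columns (word, class='place', type='postcode', search_name_count) to the new compact layout: the search token goes into word_token, the single letter 'P' marks a postcode, and the original value is kept in the JSON info column. A minimal standalone sketch of an equivalent insert with plain psycopg2 — the helper name and connection handling are illustrative assumptions, not part of this diff:

    import psycopg2
    from psycopg2.extras import Json

    def add_postcode_token(conn, token, postcode):
        # Hypothetical helper: write one new-layout postcode row.
        # word_token carries the normalized search token, type 'P' marks
        # a postcode, and info keeps the original spelling for lookups.
        with conn.cursor() as cur:
            cur.execute("""INSERT INTO word (word_token, type, info)
                           VALUES (%s, 'P', %s)""",
                        (token, Json({'postcode': postcode})))
        conn.commit()
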
@@ -313,11 +310,10 @@ class LegacyICUNameAnalyzer:
         with self.conn.cursor() as cur:
             # Get the old phrases.
             existing_phrases = set()
-            cur.execute("""SELECT word, class, type, operator FROM word
-                           WHERE class != 'place'
-                                 OR (type != 'house' AND type != 'postcode')""")
-            for label, cls, typ, oper in cur:
-                existing_phrases.add((label, cls, typ, oper or '-'))
+            cur.execute("SELECT info FROM word WHERE type = 'S'")
+            for (info, ) in cur:
+                existing_phrases.add((info['word'], info['class'], info['type'],
+                                      info.get('op') or '-'))
 
             added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
             if should_replace:
@@ -340,13 +336,13 @@ class LegacyICUNameAnalyzer:
 
             for word, cls, typ, oper in to_add:
                 term = self.name_processor.get_search_normalized(word)
                 if term:
-                    copystr.add(word, ' ' + term, cls, typ,
-                                oper if oper in ('in', 'near') else None, 0)
+                    copystr.add(term, 'S',
+                                {'word': word, 'class': cls, 'type': typ,
+                                 'op': oper if oper in ('in', 'near') else None})
                     added += 1
 
             copystr.copy_out(cursor, 'word',
-                             columns=['word', 'word_token', 'class', 'type',
-                                      'operator', 'search_name_count'])
+                             columns=['word_token', 'type', 'info'])
 
         return added
@@ -359,12 +355,12 @@ class LegacyICUNameAnalyzer:
 
         to_delete = existing_phrases - new_phrases
 
         if to_delete:
-            psycopg2.extras.execute_values(
-                cursor,
+            cursor.execute_values(
                 """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
-                    WHERE word = name and class = in_class and type = in_type
-                          and ((op = '-' and operator is null) or op = operator)""",
-                to_delete)
+                    WHERE info->>'word' = name
+                          and info->>'class' = in_class and info->>'type' = in_type
+                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
+                """, to_delete)
 
         return len(to_delete)
@@ -374,21 +370,27 @@ class LegacyICUNameAnalyzer:
         """
         word_tokens = set()
         for name in self._compute_full_names(names):
-            if name:
-                word_tokens.add(' ' + self.name_processor.get_search_normalized(name))
+            norm_name = self.name_processor.get_search_normalized(name)
+            if norm_name:
+                word_tokens.add(norm_name)
 
         with self.conn.cursor() as cur:
             # Get existing names
-            cur.execute("SELECT word_token FROM word WHERE country_code = %s",
+            cur.execute("""SELECT word_token FROM word
+                            WHERE type = 'C' and info->>'cc'= %s""",
                         (country_code, ))
             word_tokens.difference_update((t[0] for t in cur))
 
+            # Only add those names that are not yet in the list.
             if word_tokens:
-                cur.execute("""INSERT INTO word (word_id, word_token, country_code,
-                                                 search_name_count)
-                               (SELECT nextval('seq_word'), token, '{}', 0
+                cur.execute("""INSERT INTO word (word_token, type, info)
+                               (SELECT token, 'C', json_build_object('cc', %s)
                                 FROM unnest(%s) as token)
-                            """.format(country_code), (list(word_tokens),))
+                            """, (country_code, list(word_tokens)))
+
+            # No names are deleted at the moment.
+            # If deletion is made possible, then the static names from the
+            # initial 'country_name' table should be kept.
 
 
     def process_place(self, place):
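Country names follow the same pattern: type 'C' with the country code stored under the 'cc' key of info, built on the SQL side with json_build_object. A sketch of the resulting lookup, mirroring the SELECT in the hunk above — plain psycopg2, with a hypothetical helper name:

    import psycopg2

    def country_name_tokens(conn, country_code):
        # Return the set of name tokens already stored for one country
        # under the new layout (type 'C', country code in info->>'cc').
        with conn.cursor() as cur:
            cur.execute("""SELECT word_token FROM word
                           WHERE type = 'C' and info->>'cc' = %s""",
                        (country_code, ))
            return {row[0] for row in cur}
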
@@ -500,14 +502,13 @@ class LegacyICUNameAnalyzer:
 
             with self.conn.cursor() as cur:
                 # no word_id needed for postcodes
-                cur.execute("""INSERT INTO word (word, word_token, class, type,
-                                                 search_name_count)
-                               (SELECT pc, %s, 'place', 'postcode', 0
+                cur.execute("""INSERT INTO word (word_token, type, info)
+                               (SELECT %s, 'P', json_build_object('postcode', pc)
                                 FROM (VALUES (%s)) as v(pc)
                                 WHERE NOT EXISTS
                                         (SELECT * FROM word
-                                         WHERE word = pc and class='place' and type='postcode'))
-                            """, (' ' + term, postcode))
+                                         WHERE type = 'P' and info->>'postcode' = pc))
+                            """, (term, postcode))
                 self._cache.postcodes.add(postcode)
 
 
@@ -598,7 +599,8 @@ class _TokenCache:
 
     def get_hnr_tokens(self, conn, terms):
         """ Get token ids for a list of housenumbers, looking them up in the
-            database if necessary.
+            database if necessary. `terms` is an iterable of normalized
+            housenumbers.
         """
         tokens = []
         askdb = []
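Special phrases, finally, use type 'S', with word, class, type and operator all folded into the JSON info column. A sketch of reading them back, mirroring the update_special_phrases hunk — the helper name is hypothetical and it assumes a psycopg2 connection, which returns json/jsonb columns as Python dicts:

    import psycopg2

    def load_special_phrases(conn):
        # Rebuild the (word, class, type, operator) tuples from the JSON
        # info column; a missing operator is normalised to '-'.
        phrases = set()
        with conn.cursor() as cur:
            cur.execute("SELECT info FROM word WHERE type = 'S'")
            for (info, ) in cur:
                phrases.add((info['word'], info['class'], info['type'],
                             info.get('op') or '-'))
        return phrases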