X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/a0a7b05c9fd88ac1f808ef2345d71ba577096e40..87dedde5d65becff99b9ff3c4b467cf4f9381403:/nominatim/tokenizer/legacy_icu_tokenizer.py?ds=sidebyside

diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py
index af53e825..a887ae28 100644
--- a/nominatim/tokenizer/legacy_icu_tokenizer.py
+++ b/nominatim/tokenizer/legacy_icu_tokenizer.py
@@ -3,15 +3,13 @@ Tokenizer implementing normalisation as used before Nominatim 4 but using
 libICU instead of the PostgreSQL module.
 """
 from collections import Counter
-import io
 import itertools
+import json
 import logging
 import re
 from textwrap import dedent
 from pathlib import Path
 
-import psycopg2.extras
-
 from nominatim.db.connection import connect
 from nominatim.db.properties import set_property, get_property
 from nominatim.db.utils import CopyBuffer
@@ -77,13 +75,10 @@ class LegacyICUTokenizer:
 
             self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)
 
 
-    def finalize_import(self, config):
+    def finalize_import(self, _):
         """ Do any required postprocessing to make the tokenizer data ready
             for use.
         """
-        with connect(self.dsn) as conn:
-            sqlp = SQLPreprocessor(conn, config)
-            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
 
 
     def update_sql_functions(self, config):
@@ -129,13 +124,12 @@ class LegacyICUTokenizer:
         """ Install the php script for the tokenizer.
         """
         php_file = self.data_dir / "tokenizer.php"
-        php_file.write_text(dedent("""\
+        php_file.write_text(dedent(f"""\
+                          and info->>'class' = in_class and info->>'type' = in_type
+                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
+                    """, to_delete)
 
         return len(to_delete)
@@ -370,21 +378,27 @@ class LegacyICUNameAnalyzer:
         """
         word_tokens = set()
         for name in self._compute_full_names(names):
-            if name:
-                word_tokens.add(' ' + self.name_processor.get_search_normalized(name))
+            norm_name = self.name_processor.get_search_normalized(name)
+            if norm_name:
+                word_tokens.add(norm_name)
 
         with self.conn.cursor() as cur:
             # Get existing names
-            cur.execute("SELECT word_token FROM word WHERE country_code = %s",
+            cur.execute("""SELECT word_token FROM word
+                            WHERE type = 'C' and word = %s""",
                         (country_code, ))
             word_tokens.difference_update((t[0] for t in cur))
 
+            # Only add those names that are not yet in the list.
             if word_tokens:
-                cur.execute("""INSERT INTO word (word_id, word_token, country_code,
-                                                 search_name_count)
-                               (SELECT nextval('seq_word'), token, '{}', 0
+                cur.execute("""INSERT INTO word (word_token, type, word)
+                               (SELECT token, 'C', %s
                                 FROM unnest(%s) as token)
-                            """.format(country_code), (list(word_tokens),))
+                            """, (country_code, list(word_tokens)))
+
+            # No names are deleted at the moment.
+            # If deletion is made possible, then the static names from the
+            # initial 'country_name' table should be kept.
 
 
     def process_place(self, place):
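
# --- Illustrative sketch, not part of the diff -----------------------------
# The hunk above moves country names to the reworked word table: one row per
# token with type = 'C', the search token in word_token and the country code
# in word. A minimal standalone helper mirroring that SELECT/INSERT pair;
# the function name and the bare psycopg2 connection are assumptions.

def add_country_tokens(conn, country_code, tokens):
    """ Store normalized country name tokens, skipping duplicates. """
    with conn.cursor() as cur:
        # Fetch the tokens already recorded for this country.
        cur.execute("SELECT word_token FROM word WHERE type = 'C' and word = %s",
                    (country_code, ))
        new_tokens = set(tokens) - {row[0] for row in cur}

        if new_tokens:
            # unnest() expands the Python list into one row per token.
            cur.execute("""INSERT INTO word (word_token, type, word)
                             (SELECT token, 'C', %s FROM unnest(%s) as token)""",
                        (country_code, list(new_tokens)))
# ----------------------------------------------------------------------------
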
@@ -407,33 +421,36 @@ class LegacyICUNameAnalyzer:
                 self.add_country_names(country_feature.lower(), names)
 
         address = place.get('address')
-
         if address:
-            hnrs = []
-            addr_terms = []
-            for key, value in address.items():
-                if key == 'postcode':
-                    self._add_postcode(value)
-                elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
-                    hnrs.append(value)
-                elif key == 'street':
-                    token_info.add_street(*self._compute_name_tokens({'name': value}))
-                elif key == 'place':
-                    token_info.add_place(*self._compute_name_tokens({'name': value}))
-                elif not key.startswith('_') and \
-                     key not in ('country', 'full'):
-                    addr_terms.append((key, *self._compute_name_tokens({'name': value})))
-
-            if hnrs:
-                hnrs = self._split_housenumbers(hnrs)
-                token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
-
-            if addr_terms:
-                token_info.add_address_terms(addr_terms)
+            self._process_place_address(token_info, address)
 
         return token_info.data
 
 
+    def _process_place_address(self, token_info, address):
+        hnrs = []
+        addr_terms = []
+        for key, value in address.items():
+            if key == 'postcode':
+                self._add_postcode(value)
+            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
+                hnrs.append(value)
+            elif key == 'street':
+                token_info.add_street(*self._compute_name_tokens({'name': value}))
+            elif key == 'place':
+                token_info.add_place(*self._compute_name_tokens({'name': value}))
+            elif not key.startswith('_') and \
+                 key not in ('country', 'full'):
+                addr_terms.append((key, *self._compute_name_tokens({'name': value})))
+
+        if hnrs:
+            hnrs = self._split_housenumbers(hnrs)
+            token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
+
+        if addr_terms:
+            token_info.add_address_terms(addr_terms)
+
+
     def _compute_name_tokens(self, names):
         """ Computes the full name and partial name tokens for the
             given dictionary of names.
@@ -447,6 +464,9 @@ class LegacyICUNameAnalyzer:
                 full, part = self._cache.names.get(norm_name, (None, None))
                 if full is None:
                     variants = self.name_processor.get_variants_ascii(norm_name)
+                    if not variants:
+                        continue
+
                     with self.conn.cursor() as cur:
                         cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                                     (norm_name, variants))
@@ -466,12 +486,13 @@ class LegacyICUNameAnalyzer:
             given dictionary of names.
         """
         full_names = set()
-        for name in (n for ns in names.values() for n in re.split('[;,]', ns)):
-            full_names.add(name.strip())
+        for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
+            if name:
+                full_names.add(name)
 
-            brace_idx = name.find('(')
-            if brace_idx >= 0:
-                full_names.add(name[:brace_idx].strip())
+                brace_idx = name.find('(')
+                if brace_idx >= 0:
+                    full_names.add(name[:brace_idx].strip())
 
         return full_names
@@ -489,14 +510,12 @@ class LegacyICUNameAnalyzer:
 
             with self.conn.cursor() as cur:
                 # no word_id needed for postcodes
-                cur.execute("""INSERT INTO word (word, word_token, class, type,
-                                                 search_name_count)
-                               (SELECT pc, %s, 'place', 'postcode', 0
-                                FROM (VALUES (%s)) as v(pc)
+                cur.execute("""INSERT INTO word (word_token, type, word)
+                               (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
                                 WHERE NOT EXISTS
                                  (SELECT * FROM word
-                                  WHERE word = pc and class='place' and type='postcode'))
-                            """, (' ' + term, postcode))
+                                  WHERE type = 'P' and word = pc))
+                            """, (term, postcode))
 
                 self._cache.postcodes.add(postcode)
 
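
# --- Illustrative usage, not part of the diff -------------------------------
# How an address dict flows through the extracted _process_place_address()
# above. The key routing follows the hunk; the place values are made up and
# 'tokenizer' is assumed to be an initialized LegacyICUTokenizer.

place = {
    'name': {'name': 'Main Street Bakery'},
    'address': {
        'postcode': '12345',       # stored via _add_postcode() as a type 'P' row
        'housenumber': '4;6',      # collected, then split into single numbers
        'street': 'Main Street',   # tokenized via _compute_name_tokens()
        'city': 'Springfield',     # any other plain key ends up in addr_terms
        '_linked': 'yes',          # keys starting with '_' are skipped
        'country': 'de',           # 'country' and 'full' are skipped as well
    },
}

analyzer = tokenizer.name_analyzer()
try:
    token_info = analyzer.process_place(place)   # JSON-serialisable dict
finally:
    analyzer.close()
# ----------------------------------------------------------------------------
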
@@ -587,7 +606,8 @@ class _TokenCache:
 
     def get_hnr_tokens(self, conn, terms):
         """ Get token ids for a list of housenumbers, looking them up in the
-            database if necessary.
+            database if necessary. `terms` is an iterable of normalized
+            housenumbers.
         """
         tokens = []
         askdb = []
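
# --- Illustrative sketch, not part of the diff -------------------------------
# The lookup pattern behind _TokenCache.get_hnr_tokens() above: answer from an
# in-memory cache first and batch all misses into a single database query. The
# dict-based cache and the getorcreate_hnr_id() SQL function are assumptions;
# the diff only shows the tokens/askdb split and the clarified docstring.

def get_hnr_tokens(cache, conn, terms):
    """ Resolve token ids for normalized housenumbers. """
    tokens = []
    askdb = []
    for term in terms:
        token = cache.get(term)
        if token is None:
            askdb.append(term)          # remember the cache miss
        else:
            tokens.append(token)

    if askdb:
        with conn.cursor() as cur:
            # One round trip for all missing housenumbers.
            cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
                        (askdb, ))
            for term, token in cur:
                cache[term] = token
                tokens.append(token)

    return tokens
# ------------------------------------------------------------------------------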