X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/32ca631b7490661be1050ab78c3be5b5039d0a52..bc8b2d4ae0dbaef64448ddcb530de9626da9d82d:/nominatim/tokenizer/legacy_icu_tokenizer.py diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py index 6148e459..6d3d11c1 100644 --- a/nominatim/tokenizer/legacy_icu_tokenizer.py +++ b/nominatim/tokenizer/legacy_icu_tokenizer.py @@ -9,8 +9,6 @@ import re from textwrap import dedent from pathlib import Path -import psycopg2.extras - from nominatim.db.connection import connect from nominatim.db.properties import set_property, get_property from nominatim.db.utils import CopyBuffer @@ -123,7 +121,7 @@ class LegacyICUTokenizer: """ return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules)) - + # pylint: disable=missing-format-attribute def _install_php(self, phpdir): """ Install the php script for the tokenizer. """ @@ -134,7 +132,7 @@ class LegacyICUTokenizer: @define('CONST_Term_Normalization_Rules', "{0.term_normalization}"); @define('CONST_Transliteration', "{0.naming_rules.search_rules}"); require_once('{1}/tokenizer/legacy_icu_tokenizer.php'); - """.format(self, phpdir))) # pylint: disable=missing-format-attribute + """.format(self, phpdir))) def _save_config(self, config): @@ -163,12 +161,17 @@ class LegacyICUTokenizer: words = Counter() name_proc = ICUNameProcessor(self.naming_rules) with conn.cursor(name="words") as cur: - cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v") + cur.execute(""" SELECT v, count(*) FROM + (SELECT svals(name) as v FROM place)x + WHERE length(v) < 75 GROUP BY v""") for name, cnt in cur: + terms = set() for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)): - for term in word.split(): - words[term] += cnt + if ' ' in word: + terms.update(word.split()) + for term in terms: + words[term] += cnt # copy them back into the word table with CopyBuffer() as copystr: @@ -336,7 +339,7 @@ class LegacyICUNameAnalyzer: term = self.name_processor.get_search_normalized(word) if term: copystr.add(word, ' ' + term, cls, typ, - oper if oper in ('in', 'near') else None, 0) + oper if oper in ('in', 'near') else None, 0) added += 1 copystr.copy_out(cursor, 'word', @@ -354,8 +357,7 @@ class LegacyICUNameAnalyzer: to_delete = existing_phrases - new_phrases if to_delete: - psycopg2.extras.execute_values( - cursor, + cursor.execute_values( """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op) WHERE word = name and class = in_class and type = in_type and ((op = '-' and operator is null) or op = operator)""", @@ -381,9 +383,9 @@ class LegacyICUNameAnalyzer: if word_tokens: cur.execute("""INSERT INTO word (word_id, word_token, country_code, search_name_count) - (SELECT nextval('seq_word'), token, '{}', 0 + (SELECT nextval('seq_word'), token, %s, 0 FROM unnest(%s) as token) - """.format(country_code), (list(word_tokens),)) + """, (country_code, list(word_tokens))) def process_place(self, place): @@ -406,33 +408,36 @@ class LegacyICUNameAnalyzer: self.add_country_names(country_feature.lower(), names) address = place.get('address') - if address: - hnrs = [] - addr_terms = [] - for key, value in address.items(): - if key == 'postcode': - self._add_postcode(value) - elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'): - hnrs.append(value) - elif key == 'street': - token_info.add_street(*self._compute_name_tokens({'name': value})) - elif key == 'place': - token_info.add_place(*self._compute_name_tokens({'name': value})) - elif not key.startswith('_') and \ - key not in ('country', 'full'): - addr_terms.append((key, *self._compute_name_tokens({'name': value}))) - - if hnrs: - hnrs = self._split_housenumbers(hnrs) - token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs]) - - if addr_terms: - token_info.add_address_terms(addr_terms) + self._process_place_address(token_info, address) return token_info.data + def _process_place_address(self, token_info, address): + hnrs = [] + addr_terms = [] + for key, value in address.items(): + if key == 'postcode': + self._add_postcode(value) + elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'): + hnrs.append(value) + elif key == 'street': + token_info.add_street(*self._compute_name_tokens({'name': value})) + elif key == 'place': + token_info.add_place(*self._compute_name_tokens({'name': value})) + elif not key.startswith('_') and \ + key not in ('country', 'full'): + addr_terms.append((key, *self._compute_name_tokens({'name': value}))) + + if hnrs: + hnrs = self._split_housenumbers(hnrs) + token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs]) + + if addr_terms: + token_info.add_address_terms(addr_terms) + + def _compute_name_tokens(self, names): """ Computes the full name and partial name tokens for the given dictionary of names. @@ -446,6 +451,9 @@ class LegacyICUNameAnalyzer: full, part = self._cache.names.get(norm_name, (None, None)) if full is None: variants = self.name_processor.get_variants_ascii(norm_name) + if not variants: + continue + with self.conn.cursor() as cur: cur.execute("SELECT (getorcreate_full_word(%s, %s)).*", (norm_name, variants)) @@ -465,12 +473,13 @@ class LegacyICUNameAnalyzer: given dictionary of names. """ full_names = set() - for name in (n for ns in names.values() for n in re.split('[;,]', ns)): - full_names.add(name.strip()) + for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)): + if name: + full_names.add(name) - brace_idx = name.find('(') - if brace_idx >= 0: - full_names.add(name[:brace_idx].strip()) + brace_idx = name.find('(') + if brace_idx >= 0: + full_names.add(name[:brace_idx].strip()) return full_names