X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/d33c82cb66a5d7edf7202e7e4ede8e2496011580..ca7b46511d41d67e229f758e638367c241815c11:/nominatim/tokenizer/icu_tokenizer.py diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index b553dbc6..e9812ba0 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -11,7 +11,6 @@ libICU instead of the PostgreSQL module. import itertools import json import logging -import re from textwrap import dedent from nominatim.db.connection import connect @@ -187,7 +186,7 @@ class LegacyICUTokenizer(AbstractTokenizer): @define('CONST_Max_Word_Frequency', 10000000); @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}"); @define('CONST_Transliteration', "{self.loader.get_search_rules()}"); - require_once('{phpdir}/tokenizer/icu_tokenizer.php');""")) + require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8') def _save_config(self): @@ -278,8 +277,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()] - @staticmethod - def normalize_postcode(postcode): + def normalize_postcode(self, postcode): """ Convert the postcode to a standardized form. This function must yield exactly the same result as the SQL function @@ -474,7 +472,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): def _process_place_address(self, token_info, address): for item in address: if item.kind == 'postcode': - self._add_postcode(item.name) + token_info.set_postcode(self._add_postcode(item)) elif item.kind == 'housenumber': token_info.add_housenumber(*self._compute_housenumber_token(item)) elif item.kind == 'street': @@ -483,7 +481,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): if not item.suffix: token_info.add_place(self._compute_partial_tokens(item.name)) elif not item.kind.startswith('_') and not item.suffix and \ - item.kind not in ('country', 'full'): + item.kind not in ('country', 'full', 'inclusion'): token_info.add_address_term(item.kind, self._compute_partial_tokens(item.name)) @@ -606,26 +604,36 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): return full_tokens, partial_tokens - def _add_postcode(self, postcode): + def _add_postcode(self, item): """ Make sure the normalized postcode is present in the word table. """ - if re.search(r'[:,;]', postcode) is None: - postcode = self.normalize_postcode(postcode) + analyzer = self.token_analysis.get_analyzer('@postcode') + + if analyzer is None: + postcode_name = item.name.strip().upper() + variant_base = None + else: + postcode_name = analyzer.normalize(item.name) + variant_base = item.get_attr("variant") - if postcode not in self._cache.postcodes: - term = self._search_normalized(postcode) - if not term: - return + if variant_base is not None: + postcode = f'{postcode_name}@{variant_base}' + else: + postcode = postcode_name - with self.conn.cursor() as cur: - # no word_id needed for postcodes - cur.execute("""INSERT INTO word (word_token, type, word) - (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc) - WHERE NOT EXISTS - (SELECT * FROM word - WHERE type = 'P' and word = pc)) - """, (term, postcode)) - self._cache.postcodes.add(postcode) + if postcode not in self._cache.postcodes: + term = self._search_normalized(postcode_name) + if not term: + return + + variants = {term} + if analyzer is not None and variant_base is not None: + variants.update(analyzer.get_variants_ascii(variant_base)) + + with self.conn.cursor() as cur: + cur.execute("SELECT create_postcode_word(%s, %s)", + (postcode, list(variants))) + self._cache.postcodes.add(postcode) class _TokenInfo: @@ -638,6 +646,7 @@ class _TokenInfo: self.street_tokens = set() self.place_tokens = set() self.address_tokens = {} + self.postcode = None @staticmethod @@ -702,6 +711,11 @@ class _TokenInfo: if partials: self.address_tokens[key] = self._mk_array(partials) + def set_postcode(self, postcode): + """ Set the postcode to the given one. + """ + self.postcode = postcode + class _TokenCache: """ Cache for token information to avoid repeated database queries.