X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/5ff35d998459260e60bcb01aa7302f4706d043b1..4885fdf0f97d0615027fa6b2ed410e75ae1a2e20:/nominatim/tokenizer/icu_tokenizer.py diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index 9c7138ce..61c47c11 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -11,7 +11,6 @@ libICU instead of the PostgreSQL module. import itertools import json import logging -import re from textwrap import dedent from nominatim.db.connection import connect @@ -278,8 +277,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()] - @staticmethod - def normalize_postcode(postcode): + def normalize_postcode(self, postcode): """ Convert the postcode to a standardized form. This function must yield exactly the same result as the SQL function @@ -474,7 +472,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): def _process_place_address(self, token_info, address): for item in address: if item.kind == 'postcode': - self._add_postcode(item.name) + token_info.set_postcode(self._add_postcode(item)) elif item.kind == 'housenumber': token_info.add_housenumber(*self._compute_housenumber_token(item)) elif item.kind == 'street': @@ -483,7 +481,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): if not item.suffix: token_info.add_place(self._compute_partial_tokens(item.name)) elif not item.kind.startswith('_') and not item.suffix and \ - item.kind not in ('country', 'full'): + item.kind not in ('country', 'full', 'inclusion'): token_info.add_address_term(item.kind, self._compute_partial_tokens(item.name)) @@ -606,26 +604,36 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): return full_tokens, partial_tokens - def _add_postcode(self, postcode): + def _add_postcode(self, item): """ Make sure the normalized postcode is present in the word table. """ - if re.search(r'[:,;]', postcode) is None: - postcode = self.normalize_postcode(postcode) + analyzer = self.token_analysis.get_analyzer('@postcode') - if postcode not in self._cache.postcodes: - term = self._search_normalized(postcode) - if not term: - return + if analyzer is None: + postcode_name = item.name.strip().upper() + variant_base = None + else: + postcode_name = analyzer.normalize(item.name) + variant_base = item.get_attr("variant") - with self.conn.cursor() as cur: - # no word_id needed for postcodes - cur.execute("""INSERT INTO word (word_token, type, word) - (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc) - WHERE NOT EXISTS - (SELECT * FROM word - WHERE type = 'P' and word = pc)) - """, (term, postcode)) - self._cache.postcodes.add(postcode) + if variant_base is not None: + postcode = f'{postcode_name}@{variant_base}' + else: + postcode = postcode_name + + if postcode not in self._cache.postcodes: + term = self._search_normalized(postcode_name) + if not term: + return + + variants = {term} + if analyzer is not None and variant_base is not None: + variants.update(analyzer.get_variants_ascii(variant_base)) + + with self.conn.cursor() as cur: + cur.execute("SELECT create_postcode_word(%s, %s)", + (postcode, list(variants))) + self._cache.postcodes.add(postcode) class _TokenInfo: @@ -638,6 +646,7 @@ class _TokenInfo: self.street_tokens = set() self.place_tokens = set() self.address_tokens = {} + self.postcode = None @staticmethod @@ -666,6 +675,9 @@ class _TokenInfo: if self.address_tokens: out['addr'] = self.address_tokens + if self.postcode: + out['postcode'] = self.postcode + return out @@ -702,6 +714,11 @@ class _TokenInfo: if partials: self.address_tokens[key] = self._mk_array(partials) + def set_postcode(self, postcode): + """ Set the postcode to the given one. + """ + self.postcode = postcode + class _TokenCache: """ Cache for token information to avoid repeated database queries.