From: Sarah Hoffmann
Date: Tue, 24 May 2022 19:45:06 +0000 (+0200)
Subject: introduce and use analyzer for postcodes
X-Git-Tag: v4.1.0~22^2~18
X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/ca7b46511d41d67e229f758e638367c241815c11?ds=sidebyside

introduce and use analyzer for postcodes
---

diff --git a/lib-sql/tokenizer/icu_tokenizer.sql b/lib-sql/tokenizer/icu_tokenizer.sql
index a3dac8dd..f323334b 100644
--- a/lib-sql/tokenizer/icu_tokenizer.sql
+++ b/lib-sql/tokenizer/icu_tokenizer.sql
@@ -223,3 +223,26 @@ BEGIN
 END;
 $$ LANGUAGE plpgsql;
+
+CREATE OR REPLACE FUNCTION create_postcode_word(postcode TEXT, lookup_terms TEXT[])
+  RETURNS BOOLEAN
+  AS $$
+DECLARE
+  existing INTEGER;
+BEGIN
+  SELECT count(*) INTO existing
+    FROM word WHERE word = postcode and type = 'P';
+
+  IF existing > 0 THEN
+    RETURN TRUE;
+  END IF;
+
+  -- postcodes don't need word ids
+  INSERT INTO word (word_token, type, word)
+    SELECT lookup_term, 'P', postcode FROM unnest(lookup_terms) as lookup_term;
+
+  RETURN FALSE;
+END;
+$$
+LANGUAGE plpgsql;
+
diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py
index 4678af66..e9812ba0 100644
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -11,7 +11,6 @@ libICU instead of the PostgreSQL module.
 import itertools
 import json
 import logging
-import re
 from textwrap import dedent
 
 from nominatim.db.connection import connect
@@ -473,7 +472,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
     def _process_place_address(self, token_info, address):
         for item in address:
             if item.kind == 'postcode':
-                self._add_postcode(item.name)
+                token_info.set_postcode(self._add_postcode(item))
             elif item.kind == 'housenumber':
                 token_info.add_housenumber(*self._compute_housenumber_token(item))
             elif item.kind == 'street':
@@ -605,26 +604,36 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
 
         return full_tokens, partial_tokens
 
-    def _add_postcode(self, postcode):
+    def _add_postcode(self, item):
         """ Make sure the normalized postcode is present in the word table.
         """
-        if re.search(r'[:,;]', postcode) is None:
-            postcode = self.normalize_postcode(postcode)
+        analyzer = self.token_analysis.get_analyzer('@postcode')
 
-        if postcode not in self._cache.postcodes:
-            term = self._search_normalized(postcode)
-            if not term:
-                return
+        if analyzer is None:
+            postcode_name = item.name.strip().upper()
+            variant_base = None
+        else:
+            postcode_name = analyzer.normalize(item.name)
+            variant_base = item.get_attr("variant")
 
-            with self.conn.cursor() as cur:
-                # no word_id needed for postcodes
-                cur.execute("""INSERT INTO word (word_token, type, word)
-                               (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
-                                WHERE NOT EXISTS
-                                 (SELECT * FROM word
-                                  WHERE type = 'P' and word = pc))
-                            """, (term, postcode))
-            self._cache.postcodes.add(postcode)
+        if variant_base is not None:
+            postcode = f'{postcode_name}@{variant_base}'
+        else:
+            postcode = postcode_name
+
+        if postcode not in self._cache.postcodes:
+            term = self._search_normalized(postcode_name)
+            if not term:
+                return
+
+            variants = {term}
+            if analyzer is not None and variant_base is not None:
+                variants.update(analyzer.get_variants_ascii(variant_base))
+
+            with self.conn.cursor() as cur:
+                cur.execute("SELECT create_postcode_word(%s, %s)",
+                            (postcode, list(variants)))
+            self._cache.postcodes.add(postcode)
 
@@ -637,6 +646,7 @@ class _TokenInfo:
         self.street_tokens = set()
         self.place_tokens = set()
         self.address_tokens = {}
+        self.postcode = None
 
 
     @staticmethod
@@ -701,6 +711,11 @@ class _TokenInfo:
         if partials:
             self.address_tokens[key] = self._mk_array(partials)
 
+    def set_postcode(self, postcode):
+        """ Set the postcode to the given one.
+        """
+        self.postcode = postcode
+
 
 class _TokenCache:
     """ Cache for token information to avoid repeated database queries.
diff --git a/nominatim/tokenizer/sanitizers/clean_postcodes.py b/nominatim/tokenizer/sanitizers/clean_postcodes.py
index c6292a29..d1edc60d 100644
--- a/nominatim/tokenizer/sanitizers/clean_postcodes.py
+++ b/nominatim/tokenizer/sanitizers/clean_postcodes.py
@@ -98,7 +98,7 @@ class _PostcodeSanitizer:
                 obj.address.pop(pos)
             else:
                 postcode.name = formatted[0]
-                postcode.set_attr('lookup', formatted[1])
+                postcode.set_attr('variant', formatted[1])
 
 
     def scan(self, postcode, country):
diff --git a/nominatim/tokenizer/token_analysis/postcodes.py b/nominatim/tokenizer/token_analysis/postcodes.py
new file mode 100644
index 00000000..e105b132
--- /dev/null
+++ b/nominatim/tokenizer/token_analysis/postcodes.py
@@ -0,0 +1,54 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Specialized processor for postcodes. Supports a 'variant' attribute of the
+token, which produces variants with optional spaces.
+"""
+
+from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator
+
+### Configuration section
+
+def configure(rules, normalization_rules): # pylint: disable=W0613
+    """ All behaviour is currently hard-coded.
+    """
+    return None
+
+### Analysis section
+
+def create(normalizer, transliterator, config): # pylint: disable=W0613
+    """ Create a new token analysis instance for this module.
+    """
+    return PostcodeTokenAnalysis(normalizer, transliterator)
+
+class PostcodeTokenAnalysis:
+    """ Special normalization and variant generation for postcodes.
+    """
+    def __init__(self, norm, trans):
+        self.norm = norm
+        self.trans = trans
+
+        self.mutator = MutationVariantGenerator(' ', (' ', ''))
+
+
+    def normalize(self, name):
+        """ Return the standard form of the postcode.
+        """
+        return name.strip().upper()
+
+
+    def get_variants_ascii(self, norm_name):
+        """ Compute the spelling variants for the given normalized postcode.
+
+            The official form creates one variant. If a 'variant' form is
+            given, then it will create variants with optional spaces.
+        """
+        # Postcodes follow their own transliteration rules.
+        # Make sure that at this point the terms are normalized in a way
+        # that makes them searchable with the standard transliteration rules.
+        return [self.trans.transliterate(term) for term in
+                self.mutator.generate([self.norm.transliterate(norm_name)])]
diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml
index f682bbcd..212fdcb9 100644
--- a/settings/icu_tokenizer.yaml
+++ b/settings/icu_tokenizer.yaml
@@ -34,7 +34,7 @@ sanitizers:
         - (\A|.*,)[^\d,]{3,}(,.*|\Z)
     - step: clean-postcodes
       convert-to-address: yes
-      default-pattern: [A-Z0-9- ]{3,12}
+      default-pattern: "[A-Z0-9- ]{3,12}"
     - step: split-name-list
     - step: strip-brace-terms
     - step: tag-analyzer-by-language
@@ -46,6 +46,8 @@ token-analysis:
     - analyzer: generic
     - id: "@housenumber"
       analyzer: housenumbers
+    - id: "@postcode"
+      analyzer: postcodes
     - id: bg
       analyzer: generic
       mode: variant-only
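
For illustration (a sketch, not part of the commit): the new '@postcode'
analysis normalizes the raw postcode to an upper-cased canonical form and,
when the clean-postcodes sanitizer has attached a 'variant' attribute,
expands that variant through MutationVariantGenerator(' ', (' ', '')),
which spells every space either as a space or as nothing. Assuming that
cross-product behaviour of the generator, the expansion can be emulated
like this:

    # Sketch only: emulates the space mutation used by
    # PostcodeTokenAnalysis.get_variants_ascii(); not code from the commit.
    from itertools import product

    def postcode_variants(norm_name):
        """Spell every space in a normalized postcode as ' ' or ''."""
        parts = norm_name.split(' ')
        variants = set()
        for seps in product((' ', ''), repeat=len(parts) - 1):
            term = parts[0]
            for sep, part in zip(seps, parts[1:]):
                term += sep + part
            variants.add(term)
        return variants

    assert postcode_variants('EC1R 3HF') == {'EC1R 3HF', 'EC1R3HF'}

Each variant becomes its own word_token row of type 'P' via
create_postcode_word(), while the canonical form (with an '@variant'
suffix when present) serves as the word value and cache key, so both
spaced and unspaced spellings of a postcode match at search time.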