From 90d4d339dbed83cc90823401634f01a20e129548 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Thu, 19 May 2022 15:49:36 +0200
Subject: [PATCH] initial postcode cleaner for simple patterns

Moves postcodes that are either in countries without a postcode system
or don't correspond to the local pattern for postcodes into a field for
a normal address part. Makes them searchable but not as a special
address. This has two consequences: they are no longer a skippable part
of the address and the postcodes cannot be searched on their own.
---
 .../tokenizer/sanitizers/clean_postcodes.py | 111 ++++++++++++++++++
 .../sanitizers/test_clean_postcodes.py      |  54 +++++++++
 2 files changed, 165 insertions(+)
 create mode 100644 nominatim/tokenizer/sanitizers/clean_postcodes.py
 create mode 100644 test/python/tokenizer/sanitizers/test_clean_postcodes.py

diff --git a/nominatim/tokenizer/sanitizers/clean_postcodes.py b/nominatim/tokenizer/sanitizers/clean_postcodes.py
new file mode 100644
index 00000000..b07908cd
--- /dev/null
+++ b/nominatim/tokenizer/sanitizers/clean_postcodes.py
@@ -0,0 +1,111 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Sanitizer that filters postcodes by their officially allowed pattern.
+
+Arguments:
+    convert-to-address: If set to 'yes' (the default), then postcodes that do
+                        not conform with their country-specific pattern are
+                        converted to an address component. That means that
+                        the postcode does not take part when computing the
+                        postcode centroids of a country but is still searchable.
+                        When set to 'no', non-conforming postcodes are not
+                        searchable either.
+"""
+import re
+
+from nominatim.errors import UsageError
+from nominatim.tools import country_info
+
+class _PostcodeMatcher:
+    """ Matches and formats a postcode according to the format definition.
+    """
+    def __init__(self, country_code, config):
+        if 'pattern' not in config:
+            raise UsageError("Field 'pattern' required for 'postcode' "
+                             f"for country '{country_code}'")
+
+        # In the configured pattern 'd' is a placeholder for a digit and
+        # 'l' a placeholder for an upper-case ASCII letter.
+        self.pattern = re.compile(config['pattern'].replace('d', '[0-9]')
+                                                   .replace('l', '[A-Z]'))
+
+
+    def normalize(self, postcode):
+        """ Return the normalized version of the postcode. If the given postcode
+            does not correspond to the usage-pattern, return None.
+        """
+        normalized = postcode.strip().upper()
+
+        return normalized if self.pattern.fullmatch(normalized) else None
+
+
+class _PostcodeSanitizer:
+    """ Checks the postcodes of a place against the postcode configuration
+        of its country.
+    """
+
+    def __init__(self, config):
+        self.convert_to_address = config.get_bool('convert-to-address', True)
+        # Objects without a country code can't have a postcode per definition.
+        self.country_without_postcode = {None}
+        self.country_matcher = {}
+
+        for ccode, prop in country_info.iterate('postcode'):
+            if prop is False:
+                self.country_without_postcode.add(ccode)
+            elif isinstance(prop, dict):
+                self.country_matcher[ccode] = _PostcodeMatcher(ccode, prop)
+            else:
+                raise UsageError(f"Invalid entry 'postcode' for country '{ccode}'")
+
+
+    def __call__(self, obj):
+        """ Normalize all postcodes in the address of the given object.
+            Postcodes that are not valid for the country of the place are
+            converted to 'unofficial_postcode' or dropped, depending on
+            the 'convert-to-address' setting.
+        """
+        if not obj.address:
+            return
+
+        postcodes = [(i, o) for i, o in enumerate(obj.address) if o.kind == 'postcode']
+
+        # Go through the postcodes back to front, so that removing an entry
+        # does not shift the positions of the entries still to be processed.
+        for pos, postcode in reversed(postcodes):
+            formatted = self.scan(postcode.name, obj.place.country_code)
+
+            if formatted is None:
+                if self.convert_to_address:
+                    postcode.kind = 'unofficial_postcode'
+                else:
+                    obj.address.pop(pos)
+            else:
+                postcode.name = formatted
+
+
+    def scan(self, postcode, country):
+        """ Check the postcode for correct formatting and return the
+            normalized version. Returns None if the postcode does not
+            correspond to the official format of the given country.
+        """
+        if country in self.country_without_postcode:
+            return None
+
+        if country in self.country_matcher:
+            return self.country_matcher[country].normalize(postcode)
+
+        return postcode.strip().upper()
+
+
+
+def create(config):
+    """ Create a postcode processing function.
+    """
+
+    return _PostcodeSanitizer(config)
diff --git a/test/python/tokenizer/sanitizers/test_clean_postcodes.py b/test/python/tokenizer/sanitizers/test_clean_postcodes.py
new file mode 100644
index 00000000..7cb3c70f
--- /dev/null
+++ b/test/python/tokenizer/sanitizers/test_clean_postcodes.py
@@ -0,0 +1,54 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Tests for the sanitizer that normalizes postcodes.
+"""
+import pytest
+
+from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
+from nominatim.indexer.place_info import PlaceInfo
+from nominatim.tools import country_info
+
+@pytest.fixture
+def sanitize(def_config, request):
+    country_info.setup_country_config(def_config)
+    sanitizer_args = {'step': 'clean-postcodes'}
+    for mark in request.node.iter_markers(name="sanitizer_params"):
+        sanitizer_args.update({k.replace('_', '-') : v for k,v in mark.kwargs.items()})
+
+    def _run(country=None, **kwargs):
+        pi = {'address': kwargs}
+        if country is not None:
+            pi['country_code'] = country
+
+        _, address = PlaceSanitizer([sanitizer_args]).process_names(PlaceInfo(pi))
+
+        return sorted([(p.kind, p.name) for p in address])
+
+    return _run
+
+
+@pytest.mark.parametrize("country", (None, 'ae'))
+def test_postcode_no_country(sanitize, country):
+    assert sanitize(country=country, postcode='23231') == [('unofficial_postcode', '23231')]
+
+
+@pytest.mark.parametrize("country", (None, 'ae'))
+@pytest.mark.sanitizer_params(convert_to_address=False)
+def test_postcode_no_country_drop(sanitize, country):
+    assert sanitize(country=country, postcode='23231') == []
+
+
+@pytest.mark.parametrize("postcode", ('12345', ' 34009 '))
+def test_postcode_pass_good_format(sanitize, postcode):
+    assert sanitize(country='de', postcode=postcode) == [('postcode', postcode.strip())]
+
+
+@pytest.mark.parametrize("postcode", ('123456', '', ' ', '.....'))
+@pytest.mark.sanitizer_params(convert_to_address=False)
+def test_postcode_drop_bad_format(sanitize, postcode):
+    assert sanitize(country='de', postcode=postcode) == []
-- 
2.39.5