From 18864afa8aee710a5aa7fe65565711119ca7a663 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Tue, 24 May 2022 18:25:37 +0200 Subject: [PATCH] postcodes: introduce a default pattern for countries without postcodes --- .../tokenizer/sanitizers/clean_postcodes.py | 22 +++++++++++++------ settings/icu_tokenizer.yaml | 1 + .../sanitizers/test_clean_postcodes.py | 12 ++++++++++ 3 files changed, 28 insertions(+), 7 deletions(-) diff --git a/nominatim/tokenizer/sanitizers/clean_postcodes.py b/nominatim/tokenizer/sanitizers/clean_postcodes.py index 42beea37..c6292a29 100644 --- a/nominatim/tokenizer/sanitizers/clean_postcodes.py +++ b/nominatim/tokenizer/sanitizers/clean_postcodes.py @@ -75,6 +75,12 @@ class _PostcodeSanitizer: else: raise UsageError(f"Invalid entry 'postcode' for country '{ccode}'") + default_pattern = config.get('default-pattern') + if default_pattern is not None and isinstance(default_pattern, str): + self.default_matcher = _PostcodeMatcher('', {'pattern': default_pattern}) + else: + self.default_matcher = None + def __call__(self, obj): if not obj.address: @@ -103,14 +109,16 @@ class _PostcodeSanitizer: if country in self.country_without_postcode: return None - matcher = self.country_matcher.get(country) - if matcher is not None: - match = matcher.match(postcode) - if match is None: - return None - return matcher.normalize(match), ' '.join(match.groups()) + matcher = self.country_matcher.get(country, self.default_matcher) + if matcher is None: + return postcode.upper(), '' + + match = matcher.match(postcode) + if match is None: + return None + + return matcher.normalize(match), ' '.join(match.groups()) - return postcode.upper(), '' diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml index 544bd81d..f682bbcd 100644 --- a/settings/icu_tokenizer.yaml +++ b/settings/icu_tokenizer.yaml @@ -34,6 +34,7 @@ sanitizers: - (\A|.*,)[^\d,]{3,}(,.*|\Z) - step: clean-postcodes convert-to-address: yes + default-pattern: [A-Z0-9- ]{3,12} - step: split-name-list - step: strip-brace-terms - step: tag-analyzer-by-language diff --git a/test/python/tokenizer/sanitizers/test_clean_postcodes.py b/test/python/tokenizer/sanitizers/test_clean_postcodes.py index 228c2f3a..44376196 100644 --- a/test/python/tokenizer/sanitizers/test_clean_postcodes.py +++ b/test/python/tokenizer/sanitizers/test_clean_postcodes.py @@ -88,3 +88,15 @@ def test_postcode_sweden_pass(sanitize, postcode): def test_postcode_sweden_fail(sanitize, postcode): assert sanitize(country='se', postcode=postcode) == [] + +@pytest.mark.parametrize("postcode", ('AB1', '123-456-7890', '1 as 44')) +@pytest.mark.sanitizer_params(default_pattern='[A-Z0-9- ]{3,12}') +def test_postcode_default_pattern_pass(sanitize, postcode): + assert sanitize(country='an', postcode=postcode) == [('postcode', postcode.upper())] + + +@pytest.mark.parametrize("postcode", ('C', '12', 'ABC123DEF 456', '1234,5678', '11223;11224')) +@pytest.mark.sanitizer_params(convert_to_address=False, default_pattern='[A-Z0-9- ]{3,12}') +def test_postcode_default_pattern_fail(sanitize, postcode): + assert sanitize(country='an', postcode=postcode) == [] + -- 2.39.5