From: Sarah Hoffmann Date: Mon, 23 May 2022 09:01:57 +0000 (+0200) Subject: postcodes: strip leading country codes X-Git-Tag: v4.1.0~22^2~24 X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/baee6f3de09226c3dc41cb2314a0ac348e865561?ds=inline postcodes: strip leading country codes --- diff --git a/nominatim/tokenizer/sanitizers/clean_postcodes.py b/nominatim/tokenizer/sanitizers/clean_postcodes.py index b07908cd..ae1cd62d 100644 --- a/nominatim/tokenizer/sanitizers/clean_postcodes.py +++ b/nominatim/tokenizer/sanitizers/clean_postcodes.py @@ -29,8 +29,9 @@ class _PostcodeMatcher: raise UsageError("Field 'pattern' required for 'postcode' " f"for country '{country_code}'") - self.pattern = re.compile(config['pattern'].replace('d', '[0-9]') - .replace('l', '[A-Z]')) + pc_pattern = config['pattern'].replace('d', '[0-9]').replace('l', '[A-Z]') + + self.pattern = re.compile(f'(?:{country_code.upper()}[ -]?)?({pc_pattern})') def normalize(self, postcode): @@ -39,7 +40,9 @@ class _PostcodeMatcher: """ normalized = postcode.strip().upper() - return normalized if self.pattern.fullmatch(normalized) else None + match = self.pattern.fullmatch(normalized) + + return match.group(1) if match else None class _PostcodeSanitizer: diff --git a/test/python/tokenizer/sanitizers/test_clean_postcodes.py b/test/python/tokenizer/sanitizers/test_clean_postcodes.py index d6371e07..e5c07596 100644 --- a/test/python/tokenizer/sanitizers/test_clean_postcodes.py +++ b/test/python/tokenizer/sanitizers/test_clean_postcodes.py @@ -43,12 +43,14 @@ def test_postcode_no_country_drop(sanitize, country): assert sanitize(country=country, postcode='23231') == [] -@pytest.mark.parametrize("postcode", ('12345', ' 34009 ')) +@pytest.mark.parametrize("postcode", ('12345', ' 12345 ', 'de 12345', + 'DE12345', 'DE 12345', 'DE-12345')) def test_postcode_pass_good_format(sanitize, postcode): - assert sanitize(country='de', postcode=postcode) == [('postcode', postcode.strip())] + assert 
sanitize(country='de', postcode=postcode) == [('postcode', '12345')] -@pytest.mark.parametrize("postcode", ('123456', '', ' ', '.....')) +@pytest.mark.parametrize("postcode", ('123456', '', ' ', '.....', + 'DE  12345', 'DEF12345', 'CH 12345')) @pytest.mark.sanitizer_params(convert_to_address=False) def test_postcode_drop_bad_format(sanitize, postcode): assert sanitize(country='de', postcode=postcode) == []