From fbe40e005dfeecb937c92355492049dc4eb05f7d Mon Sep 17 00:00:00 2001 From: =?utf8?q?Pawe=C5=82=20Wroniszewski?= Date: Tue, 17 Oct 2023 00:44:24 +0200 Subject: [PATCH] Properly validate postcodes with country code Include the postcode pattern in the postcode normalisation regex instead of stripping the country code from the postcode pattern in the config. This properly handles postcode validation and normalisation when the country code is part of the postcode, e.g. for Isle of Man, Jersey, Anguilla, Andorra, Cayman Islands and more. Fixes #3227. --- nominatim/data/postcode_format.py | 2 +- settings/country_settings.yaml | 23 +-- .../sanitizers/test_clean_postcodes.py | 138 +++++++++++++++++- 3 files changed, 146 insertions(+), 17 deletions(-) diff --git a/nominatim/data/postcode_format.py b/nominatim/data/postcode_format.py index dad35b7a..132dd41f 100644 --- a/nominatim/data/postcode_format.py +++ b/nominatim/data/postcode_format.py @@ -25,7 +25,7 @@ class CountryPostcodeMatcher: pc_pattern = config['pattern'].replace('d', '[0-9]').replace('l', '[A-Z]') - self.norm_pattern = re.compile(f'\\s*(?:{country_code.upper()}[ -]?)?(.*)\\s*') + self.norm_pattern = re.compile(f'\\s*(?:{country_code.upper()}[ -]?)?({pc_pattern})\\s*') self.pattern = re.compile(pc_pattern) self.output = config.get('output', r'\g<0>') diff --git a/settings/country_settings.yaml b/settings/country_settings.yaml index 667684c6..bda3b9e7 100644 --- a/settings/country_settings.yaml +++ b/settings/country_settings.yaml @@ -4,8 +4,7 @@ ad: languages: ca names: !include country-names/ad.yaml postcode: - pattern: "(ddd)" - output: AD\1 + pattern: "ADddd" # United Arab Emirates (الإمارات العربية المتحدة) @@ -39,8 +38,7 @@ ai: languages: en names: !include country-names/ai.yaml postcode: - pattern: "2640" - output: AI-2640 + pattern: "AI-2640" # Albania (Shqipëria) @@ -965,8 +963,7 @@ ky: languages: en names: !include country-names/ky.yaml postcode: - pattern: "(d)-(dddd)" - output: KY\1-\2 + pattern: "KYd-dddd" # Kazakhstan (Қазақстан) 
@@ -1002,7 +999,7 @@ lc: languages: en names: !include country-names/lc.yaml postcode: - pattern: "(dd) ?(ddd)" + pattern: "LC(dd) ?(ddd)" output: LC\1 \2 @@ -1066,8 +1063,7 @@ lv: languages: lv names: !include country-names/lv.yaml postcode: - pattern: "(dddd)" - output: LV-\1 + pattern: "LV-dddd" # Libya (ليبيا) @@ -1102,8 +1098,7 @@ md: languages: ro, ru, uk names: !include country-names/md.yaml postcode: - pattern: "(dddd)" - output: MD-\1 + pattern: "MD-dddd" # Montenegro (Crna Gora / Црна Гора) @@ -1845,8 +1840,7 @@ vc: languages: en names: !include country-names/vc.yaml postcode: - pattern: "(dddd)" - output: VC\1 + pattern: "VCdddd" # Venezuela (Venezuela) @@ -1864,8 +1858,7 @@ vg: languages: en names: !include country-names/vg.yaml postcode: - pattern: "(dddd)" - output: VG\1 + pattern: "VGdddd" # Vietnam (Việt Nam) diff --git a/test/python/tokenizer/sanitizers/test_clean_postcodes.py b/test/python/tokenizer/sanitizers/test_clean_postcodes.py index f2c965ad..63d77202 100644 --- a/test/python/tokenizer/sanitizers/test_clean_postcodes.py +++ b/test/python/tokenizer/sanitizers/test_clean_postcodes.py @@ -89,6 +89,143 @@ def test_postcode_sweden_fail(sanitize, postcode): assert sanitize(country='se', postcode=postcode) == [] +@pytest.mark.parametrize("postcode", ('AD123', 'AD AD123')) +def test_postcode_andorra_pass(sanitize, postcode): + assert sanitize(country='ad', postcode=postcode) == [('postcode', 'AD123')] + + +@pytest.mark.parametrize("postcode", ('123', 'AD 123', 'AD-123', 'AD1234')) +@pytest.mark.sanitizer_params(convert_to_address=False) +def test_postcode_andorra_fail(sanitize, postcode): + assert sanitize(country='ad', postcode=postcode) == [] + + +@pytest.mark.parametrize("postcode", ('AI-2640', 'AI AI-2640')) +def test_postcode_anguilla_pass(sanitize, postcode): + assert sanitize(country='ai', postcode=postcode) == [('postcode', 'AI-2640')] + + +@pytest.mark.parametrize("postcode", ('2640', 'AI 2640', 'AI-2000', 'AI US-2640')) 
+@pytest.mark.sanitizer_params(convert_to_address=False) +def test_postcode_anguilla_fail(sanitize, postcode): + assert sanitize(country='ai', postcode=postcode) == [] + + +@pytest.mark.parametrize("postcode", ('BN1111', 'BN 1111', 'BN BN1111', 'BN BN 1111')) +def test_postcode_brunei_pass(sanitize, postcode): + assert sanitize(country='bn', postcode=postcode) == [('postcode', 'BN1111')] + + +@pytest.mark.parametrize("postcode", ('BN-1111', 'BNN1111')) +@pytest.mark.sanitizer_params(convert_to_address=False) +def test_postcode_brunei_fail(sanitize, postcode): + assert sanitize(country='bn', postcode=postcode) == [] + + +@pytest.mark.parametrize("postcode", ('IM1 1AA', 'IM11AA', 'IM IM11AA')) +def test_postcode_isle_of_man_pass(sanitize, postcode): + assert sanitize(country='im', postcode=postcode) == [('postcode', 'IM1 1AA')] + + +@pytest.mark.parametrize("postcode", ('IZ1 1AA', 'IM1 AA')) +@pytest.mark.sanitizer_params(convert_to_address=False) +def test_postcode_isle_of_man_fail(sanitize, postcode): + assert sanitize(country='im', postcode=postcode) == [] + + +@pytest.mark.parametrize("postcode", ('JE5 0LA', 'JE50LA', 'JE JE50LA', 'je JE5 0LA')) +def test_postcode_jersey_pass(sanitize, postcode): + assert sanitize(country='je', postcode=postcode) == [('postcode', 'JE5 0LA')] + + +@pytest.mark.parametrize("postcode", ('gb JE5 0LA', 'IM50LA', 'IM5 012')) +@pytest.mark.sanitizer_params(convert_to_address=False) +def test_postcode_jersey_fail(sanitize, postcode): + assert sanitize(country='je', postcode=postcode) == [] + + +@pytest.mark.parametrize("postcode", ('KY1-1234', 'KY KY1-1234')) +def test_postcode_cayman_islands_pass(sanitize, postcode): + assert sanitize(country='ky', postcode=postcode) == [('postcode', 'KY1-1234')] + + +@pytest.mark.parametrize("postcode", ('1-1234', 'KY-1234', 'KZ1-1234', 'KY1 1234', 'KY 1-1234', 'KY1-123')) +@pytest.mark.sanitizer_params(convert_to_address=False) +def test_postcode_cayman_islands_fail(sanitize, postcode): + assert 
sanitize(country='ky', postcode=postcode) == [] + + +@pytest.mark.parametrize("postcode", ('LC11 222', 'LC LC11 222', 'LC LC11 222')) +def test_postcode_saint_lucia_pass(sanitize, postcode): + assert sanitize(country='lc', postcode=postcode) == [('postcode', 'LC11 222')] + + +@pytest.mark.parametrize("postcode", ('11 222', '11222', 'LC 11 222')) +@pytest.mark.sanitizer_params(convert_to_address=False) +def test_postcode_saint_lucia_fail(sanitize, postcode): + assert sanitize(country='lc', postcode=postcode) == [] + + +@pytest.mark.parametrize("postcode", ('LV-1111', 'LV LV-1111')) +def test_postcode_latvia_pass(sanitize, postcode): + assert sanitize(country='lv', postcode=postcode) == [('postcode', 'LV-1111')] + + +@pytest.mark.parametrize("postcode", ('1111', 'LV 1111', 'LV1111', 'LV LV 1111')) +@pytest.mark.sanitizer_params(convert_to_address=False) +def test_postcode_latvia_fail(sanitize, postcode): + assert sanitize(country='lv', postcode=postcode) == [] + + +@pytest.mark.parametrize("postcode", ('MD-1111', 'MD MD-1111')) +def test_postcode_moldova_pass(sanitize, postcode): + assert sanitize(country='md', postcode=postcode) == [('postcode', 'MD-1111')] + + +@pytest.mark.parametrize("postcode", ('1111', 'MD 1111', 'MD1111')) +@pytest.mark.sanitizer_params(convert_to_address=False) +def test_postcode_moldova_fail(sanitize, postcode): + assert sanitize(country='md', postcode=postcode) == [] + + +@pytest.mark.parametrize("postcode", ('VLT 1117', 'GDJ 1234', 'BZN 2222')) +def test_postcode_malta_pass(sanitize, postcode): + assert sanitize(country='mt', postcode=postcode) == [('postcode', postcode)] + + +@pytest.mark.parametrize("postcode", ('MTF 1111', 'MT MTF 1111', 'MTF1111', 'MT MTF1111')) +def test_postcode_malta_mtarfa_pass(sanitize, postcode): + assert sanitize(country='mt', postcode=postcode) == [('postcode', 'MTF 1111')] + + +@pytest.mark.parametrize("postcode", ('1111', 'MTMT 1111')) +@pytest.mark.sanitizer_params(convert_to_address=False) +def 
test_postcode_malta_fail(sanitize, postcode): + assert sanitize(country='mt', postcode=postcode) == [] + + +@pytest.mark.parametrize("postcode", ('VC1111', 'VC VC1111')) +def test_postcode_saint_vincent_pass(sanitize, postcode): + assert sanitize(country='vc', postcode=postcode) == [('postcode', 'VC1111')] + + +@pytest.mark.parametrize("postcode", ('1111', 'VC-1111', 'VC 1111', 'VC11')) +@pytest.mark.sanitizer_params(convert_to_address=False) +def test_postcode_saint_vincent_fail(sanitize, postcode): + assert sanitize(country='vc', postcode=postcode) == [] + + +@pytest.mark.parametrize("postcode", ('VG1111', 'VG VG1111')) +def test_postcode_virgin_islands_pass(sanitize, postcode): + assert sanitize(country='vg', postcode=postcode) == [('postcode', 'VG1111')] + + +@pytest.mark.parametrize("postcode", ('1111', 'VG 1111', 'VG-1111')) +@pytest.mark.sanitizer_params(convert_to_address=False) +def test_postcode_virgin_islands_fail(sanitize, postcode): + assert sanitize(country='vg', postcode=postcode) == [] + + @pytest.mark.parametrize("postcode", ('AB1', '123-456-7890', '1 as 44')) @pytest.mark.sanitizer_params(default_pattern='[A-Z0-9- ]{3,12}') def test_postcode_default_pattern_pass(sanitize, postcode): @@ -99,4 +236,3 @@ def test_postcode_default_pattern_pass(sanitize, postcode): @pytest.mark.sanitizer_params(convert_to_address=False, default_pattern='[A-Z0-9- ]{3,12}') def test_postcode_default_pattern_fail(sanitize, postcode): assert sanitize(country='an', postcode=postcode) == [] - -- 2.39.5