git.openstreetmap.org Git - nominatim.git/commitdiff
Properly validate postcodes with country code
author Paweł Wroniszewski <pawel.wroniszewski@traveltime.com>
Mon, 16 Oct 2023 22:44:24 +0000 (00:44 +0200)
committer Paweł Wroniszewski <pwronisz@gmail.com>
Mon, 16 Oct 2023 23:04:07 +0000 (01:04 +0200)
Include the postcode pattern in the postcode normalisation regex, instead of
removing the country code from the postcode pattern in the config.

This properly handles postcode validation and normalisation when the country
code is part of the postcode, e.g. for the Isle of Man, Jersey, Anguilla,
Andorra, the Cayman Islands and more.

Fixes #3227.

nominatim/data/postcode_format.py
settings/country_settings.yaml
test/python/tokenizer/sanitizers/test_clean_postcodes.py

index dad35b7a9965c6a6d4c90149c5879c764ff8e5cc..132dd41fe97df9ff9c86fa8ca37869e7d1aeb76b 100644 (file)
@@ -25,7 +25,7 @@ class CountryPostcodeMatcher:
 
         pc_pattern = config['pattern'].replace('d', '[0-9]').replace('l', '[A-Z]')
 
 
         pc_pattern = config['pattern'].replace('d', '[0-9]').replace('l', '[A-Z]')
 
-        self.norm_pattern = re.compile(f'\\s*(?:{country_code.upper()}[ -]?)?(.*)\\s*')
+        self.norm_pattern = re.compile(f'\\s*(?:{country_code.upper()}[ -]?)?({pc_pattern})\\s*')
         self.pattern = re.compile(pc_pattern)
 
         self.output = config.get('output', r'\g<0>')
         self.pattern = re.compile(pc_pattern)
 
         self.output = config.get('output', r'\g<0>')
index 667684c670a0dacb46b1d052f4a4f82547c3ed83..bda3b9e79edb263e0386c007258c0c3220bb2380 100644 (file)
@@ -4,8 +4,7 @@ ad:
     languages: ca
     names: !include country-names/ad.yaml
     postcode:
     languages: ca
     names: !include country-names/ad.yaml
     postcode:
-      pattern: "(ddd)"
-      output: AD\1
+      pattern: "ADddd"
 
 
 # United Arab Emirates (الإمارات العربية المتحدة)
 
 
 # United Arab Emirates (الإمارات العربية المتحدة)
@@ -39,8 +38,7 @@ ai:
     languages: en
     names: !include country-names/ai.yaml
     postcode:
     languages: en
     names: !include country-names/ai.yaml
     postcode:
-      pattern: "2640"
-      output: AI-2640
+      pattern: "AI-2640"
 
 
 # Albania (Shqipëria)
 
 
 # Albania (Shqipëria)
@@ -965,8 +963,7 @@ ky:
     languages: en
     names: !include country-names/ky.yaml
     postcode:
     languages: en
     names: !include country-names/ky.yaml
     postcode:
-      pattern: "(d)-(dddd)"
-      output: KY\1-\2
+      pattern: "KYd-dddd"
 
 
 # Kazakhstan (Қазақстан)
 
 
 # Kazakhstan (Қазақстан)
@@ -1002,7 +999,7 @@ lc:
     languages: en
     names: !include country-names/lc.yaml
     postcode:
     languages: en
     names: !include country-names/lc.yaml
     postcode:
-      pattern: "(dd) ?(ddd)"
+      pattern: "LC(dd) ?(ddd)"
       output: LC\1 \2
 
 
       output: LC\1 \2
 
 
@@ -1066,8 +1063,7 @@ lv:
     languages: lv
     names: !include country-names/lv.yaml
     postcode:
     languages: lv
     names: !include country-names/lv.yaml
     postcode:
-      pattern: "(dddd)"
-      output: LV-\1
+      pattern: "LV-dddd"
 
 
 # Libya (ليبيا)
 
 
 # Libya (ليبيا)
@@ -1102,8 +1098,7 @@ md:
     languages: ro, ru, uk
     names: !include country-names/md.yaml
     postcode:
     languages: ro, ru, uk
     names: !include country-names/md.yaml
     postcode:
-      pattern: "(dddd)"
-      output: MD-\1
+      pattern: "MD-dddd"
 
 
 # Montenegro (Crna Gora / Црна Гора)
 
 
 # Montenegro (Crna Gora / Црна Гора)
@@ -1845,8 +1840,7 @@ vc:
     languages: en
     names: !include country-names/vc.yaml
     postcode:
     languages: en
     names: !include country-names/vc.yaml
     postcode:
-      pattern: "(dddd)"
-      output: VC\1
+      pattern: "VCdddd"
 
 
 # Venezuela (Venezuela)
 
 
 # Venezuela (Venezuela)
@@ -1864,8 +1858,7 @@ vg:
     languages: en
     names: !include country-names/vg.yaml
     postcode:
     languages: en
     names: !include country-names/vg.yaml
     postcode:
-      pattern: "(dddd)"
-      output: VG\1
+      pattern: "VGdddd"
 
 
 # Vietnam (Việt Nam)
 
 
 # Vietnam (Việt Nam)
index f2c965ad9b1db0017864b5bbaec1677023b1d838..63d77202b09f60553911cc21f50c6110acf41745 100644 (file)
@@ -89,6 +89,143 @@ def test_postcode_sweden_fail(sanitize, postcode):
     assert sanitize(country='se', postcode=postcode) == []
 
 
     assert sanitize(country='se', postcode=postcode) == []
 
 
+@pytest.mark.parametrize("postcode", ('AD123', 'AD AD123'))
+def test_postcode_andorra_pass(sanitize, postcode):
+    assert sanitize(country='ad', postcode=postcode) == [('postcode', 'AD123')]
+
+
+@pytest.mark.parametrize("postcode", ('123', 'AD 123', 'AD-123', 'AD1234'))
+@pytest.mark.sanitizer_params(convert_to_address=False)
+def test_postcode_andorra_fail(sanitize, postcode):
+    assert sanitize(country='ad', postcode=postcode) == []
+
+
+@pytest.mark.parametrize("postcode", ('AI-2640', 'AI AI-2640'))
+def test_postcode_anguilla_pass(sanitize, postcode):
+    assert sanitize(country='ai', postcode=postcode) == [('postcode', 'AI-2640')]
+
+
+@pytest.mark.parametrize("postcode", ('2640', 'AI 2640', 'AI-2000', 'AI US-2640'))
+@pytest.mark.sanitizer_params(convert_to_address=False)
+def test_postcode_anguilla_fail(sanitize, postcode):
+    assert sanitize(country='ai', postcode=postcode) == []
+
+
+@pytest.mark.parametrize("postcode", ('BN1111', 'BN 1111', 'BN BN1111', 'BN BN 1111'))
+def test_postcode_brunei_pass(sanitize, postcode):
+    assert sanitize(country='bn', postcode=postcode) == [('postcode', 'BN1111')]
+
+
+@pytest.mark.parametrize("postcode", ('BN-1111', 'BNN1111'))
+@pytest.mark.sanitizer_params(convert_to_address=False)
+def test_postcode_brunei_fail(sanitize, postcode):
+    assert sanitize(country='bn', postcode=postcode) == []
+
+
+@pytest.mark.parametrize("postcode", ('IM1 1AA', 'IM11AA', 'IM IM11AA'))
+def test_postcode_isle_of_man_pass(sanitize, postcode):
+    assert sanitize(country='im', postcode=postcode) == [('postcode', 'IM1 1AA')]
+
+
+@pytest.mark.parametrize("postcode", ('IZ1 1AA', 'IM1 AA'))
+@pytest.mark.sanitizer_params(convert_to_address=False)
+def test_postcode_isle_of_man_fail(sanitize, postcode):
+    assert sanitize(country='im', postcode=postcode) == []
+
+
+@pytest.mark.parametrize("postcode", ('JE5 0LA', 'JE50LA', 'JE JE50LA', 'je JE5 0LA'))
+def test_postcode_jersey_pass(sanitize, postcode):
+    assert sanitize(country='je', postcode=postcode) == [('postcode', 'JE5 0LA')]
+
+
+@pytest.mark.parametrize("postcode", ('gb JE5 0LA', 'IM50LA', 'IM5 012'))
+@pytest.mark.sanitizer_params(convert_to_address=False)
+def test_postcode_jersey_fail(sanitize, postcode):
+    assert sanitize(country='je', postcode=postcode) == []
+
+
+@pytest.mark.parametrize("postcode", ('KY1-1234', 'KY KY1-1234'))
+def test_postcode_cayman_islands_pass(sanitize, postcode):
+    assert sanitize(country='ky', postcode=postcode) == [('postcode', 'KY1-1234')]
+
+
+@pytest.mark.parametrize("postcode", ('1-1234', 'KY-1234', 'KZ1-1234', 'KY1 1234', 'KY 1-1234', 'KY1-123'))
+@pytest.mark.sanitizer_params(convert_to_address=False)
+def test_postcode_cayman_islands_fail(sanitize, postcode):
+    assert sanitize(country='ky', postcode=postcode) == []
+
+
+@pytest.mark.parametrize("postcode", ('LC11 222', 'LC LC11 222', 'LC LC11 222'))
+def test_postcode_saint_lucia_pass(sanitize, postcode):
+    assert sanitize(country='lc', postcode=postcode) == [('postcode', 'LC11 222')]
+
+
+@pytest.mark.parametrize("postcode", ('11 222', '11222', 'LC 11 222'))
+@pytest.mark.sanitizer_params(convert_to_address=False)
+def test_postcode_saint_lucia_fail(sanitize, postcode):
+    assert sanitize(country='lc', postcode=postcode) == []
+
+
+@pytest.mark.parametrize("postcode", ('LV-1111', 'LV LV-1111'))
+def test_postcode_latvia_pass(sanitize, postcode):
+    assert sanitize(country='lv', postcode=postcode) == [('postcode', 'LV-1111')]
+
+
+@pytest.mark.parametrize("postcode", ('1111', 'LV 1111', 'LV1111', 'LV LV 1111'))
+@pytest.mark.sanitizer_params(convert_to_address=False)
+def test_postcode_latvia_fail(sanitize, postcode):
+    assert sanitize(country='lv', postcode=postcode) == []
+
+
+@pytest.mark.parametrize("postcode", ('MD-1111', 'MD MD-1111'))
+def test_postcode_moldova_pass(sanitize, postcode):
+    assert sanitize(country='md', postcode=postcode) == [('postcode', 'MD-1111')]
+
+
+@pytest.mark.parametrize("postcode", ('1111', 'MD 1111', 'MD1111'))
+@pytest.mark.sanitizer_params(convert_to_address=False)
+def test_postcode_moldova_fail(sanitize, postcode):
+    assert sanitize(country='md', postcode=postcode) == []
+
+
+@pytest.mark.parametrize("postcode", ('VLT 1117', 'GDJ 1234', 'BZN 2222'))
+def test_postcode_malta_pass(sanitize, postcode):
+    assert sanitize(country='mt', postcode=postcode) == [('postcode', postcode)]
+
+
+@pytest.mark.parametrize("postcode", ('MTF 1111', 'MT MTF 1111', 'MTF1111', 'MT MTF1111'))
+def test_postcode_malta_mtarfa_pass(sanitize, postcode):
+    assert sanitize(country='mt', postcode=postcode) == [('postcode', 'MTF 1111')]
+
+
+@pytest.mark.parametrize("postcode", ('1111', 'MTMT 1111'))
+@pytest.mark.sanitizer_params(convert_to_address=False)
+def test_postcode_malta_fail(sanitize, postcode):
+    assert sanitize(country='mt', postcode=postcode) == []
+
+
+@pytest.mark.parametrize("postcode", ('VC1111', 'VC VC1111'))
+def test_postcode_saint_vincent_pass(sanitize, postcode):
+    assert sanitize(country='vc', postcode=postcode) == [('postcode', 'VC1111')]
+
+
+@pytest.mark.parametrize("postcode", ('1111', 'VC-1111', 'VC 1111', 'VC11'))
+@pytest.mark.sanitizer_params(convert_to_address=False)
+def test_postcode_saint_vincent_fail(sanitize, postcode):
+    assert sanitize(country='vc', postcode=postcode) == []
+
+
+@pytest.mark.parametrize("postcode", ('VG1111', 'VG VG1111'))
+def test_postcode_virgin_islands_pass(sanitize, postcode):
+    assert sanitize(country='vg', postcode=postcode) == [('postcode', 'VG1111')]
+
+
+@pytest.mark.parametrize("postcode", ('1111', 'VG 1111', 'VG-1111'))
+@pytest.mark.sanitizer_params(convert_to_address=False)
+def test_postcode_virgin_islands_fail(sanitize, postcode):
+    assert sanitize(country='vg', postcode=postcode) == []
+
+
 @pytest.mark.parametrize("postcode", ('AB1', '123-456-7890', '1 as 44'))
 @pytest.mark.sanitizer_params(default_pattern='[A-Z0-9- ]{3,12}')
 def test_postcode_default_pattern_pass(sanitize, postcode):
 @pytest.mark.parametrize("postcode", ('AB1', '123-456-7890', '1 as 44'))
 @pytest.mark.sanitizer_params(default_pattern='[A-Z0-9- ]{3,12}')
 def test_postcode_default_pattern_pass(sanitize, postcode):
@@ -99,4 +236,3 @@ def test_postcode_default_pattern_pass(sanitize, postcode):
 @pytest.mark.sanitizer_params(convert_to_address=False, default_pattern='[A-Z0-9- ]{3,12}')
 def test_postcode_default_pattern_fail(sanitize, postcode):
     assert sanitize(country='an', postcode=postcode) == []
 @pytest.mark.sanitizer_params(convert_to_address=False, default_pattern='[A-Z0-9- ]{3,12}')
 def test_postcode_default_pattern_fail(sanitize, postcode):
     assert sanitize(country='an', postcode=postcode) == []
-