From: Sarah Hoffmann <lonvia@denofr.de>
Date: Mon, 23 May 2022 12:04:22 +0000 (+0200)
Subject: postcodes: add support for optional spaces
X-Git-Tag: v4.1.0~22^2~22
X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/9172696324d1a3cd489428d16c2c8d88cf9adaca

postcodes: add support for optional spaces
---

diff --git a/nominatim/tokenizer/sanitizers/clean_postcodes.py b/nominatim/tokenizer/sanitizers/clean_postcodes.py
index ae1cd62d..a968c9db 100644
--- a/nominatim/tokenizer/sanitizers/clean_postcodes.py
+++ b/nominatim/tokenizer/sanitizers/clean_postcodes.py
@@ -31,18 +31,24 @@ class _PostcodeMatcher:
 
         pc_pattern = config['pattern'].replace('d', '[0-9]').replace('l', '[A-Z]')
 
-        self.pattern = re.compile(f'(?:{country_code.upper()}[ -]?)?({pc_pattern})')
+        self.norm_pattern = re.compile(f'\\s*(?:{country_code.upper()}[ -]?)?(.*)\\s*')
+        self.pattern = re.compile(pc_pattern)
+
+        self.output = config.get('output', r'\g<0>')
 
 
     def normalize(self, postcode):
         """ Return the normalized version of the postcode. If the given postcode
             does not correspond to the usage-pattern, return null.
         """
-        normalized = postcode.strip().upper()
+        # Upper-case, strip (greedy '(.*)' keeps trailing spaces), drop country code.
+        normalized = self.norm_pattern.fullmatch(postcode.strip().upper())
 
-        match = self.pattern.fullmatch(normalized)
+        if normalized:
+            match = self.pattern.fullmatch(normalized.group(1))
+            return match.expand(self.output) if match else None
 
-        return match.group(1) if match else None
+        return None
 
 
 class _PostcodeSanitizer:
diff --git a/settings/country_settings.yaml b/settings/country_settings.yaml
index adb7593e..f09de046 100644
--- a/settings/country_settings.yaml
+++ b/settings/country_settings.yaml
@@ -456,6 +456,9 @@ cz:
     partition: 124
     languages: cs
     names: !include country-names/cz.yaml
+    postcode:
+      pattern: "(ddd) ?(dd)"
+      output: \1 \2
 
 
 # Germany (Deutschland)
@@ -1618,6 +1621,9 @@ se:
     partition: 112
     languages: sv
     names: !include country-names/se.yaml
+    postcode:
+      pattern: "(ddd) ?(dd)"
+      output: \1 \2
 
 
 # Singapore (Singapore)
@@ -1657,6 +1663,9 @@ sk:
     partition: 172
     languages: sk
     names: !include country-names/sk.yaml
+    postcode:
+      pattern: "(ddd) ?(dd)"
+      output: \1 \2
 
 
 # Sierra Leone (Sierra Leone)
diff --git a/test/python/tokenizer/sanitizers/test_clean_postcodes.py b/test/python/tokenizer/sanitizers/test_clean_postcodes.py
index e5c07596..228c2f3a 100644
--- a/test/python/tokenizer/sanitizers/test_clean_postcodes.py
+++ b/test/python/tokenizer/sanitizers/test_clean_postcodes.py
@@ -77,3 +77,14 @@ def test_postcode_kazakhstan_pass(sanitize, postcode):
 def test_postcode_kazakhstan_fail(sanitize, postcode):
     assert sanitize(country='kz', postcode=postcode) == []
 
+
+@pytest.mark.parametrize("postcode", ('675 34', '67534', 'SE-675 34', 'SE67534'))
+def test_postcode_sweden_pass(sanitize, postcode):
+    assert sanitize(country='se', postcode=postcode) == [('postcode', '675 34')]
+
+
+@pytest.mark.parametrize("postcode", ('67 345', '671123'))
+@pytest.mark.sanitizer_params(convert_to_address=False)
+def test_postcode_sweden_fail(sanitize, postcode):
+    assert sanitize(country='se', postcode=postcode) == []
+