postcodes: add support for optional spaces

author Sarah Hoffmann <lonvia@denofr.de>

Mon, 23 May 2022 12:04:22 +0000 (14:04 +0200)

committer Sarah Hoffmann <lonvia@denofr.de>

Thu, 23 Jun 2022 21:42:31 +0000 (23:42 +0200)
author Sarah Hoffmann <lonvia@denofr.de>
Mon, 23 May 2022 12:04:22 +0000 (14:04 +0200)
committer Sarah Hoffmann <lonvia@denofr.de>
Thu, 23 Jun 2022 21:42:31 +0000 (23:42 +0200)
diff --git a/nominatim/tokenizer/sanitizers/clean_postcodes.py b/nominatim/tokenizer/sanitizers/clean_postcodes.py

index ae1cd62d8d09f7c9afe5b1fac3949fccaca8b941..a968c9db0787f8a4bb009d72b20db72800145094 100644 (file)
--- a/nominatim/tokenizer/sanitizers/clean_postcodes.py
+++ b/nominatim/tokenizer/sanitizers/clean_postcodes.py
@@ -31,18 +31,24 @@ class _PostcodeMatcher:
  
          pc_pattern = config['pattern'].replace('d', '[0-9]').replace('l', '[A-Z]')
  
-        self.pattern = re.compile(f'(?:{country_code.upper()}[ -]?)?({pc_pattern})')
+        self.norm_pattern = re.compile(f'\\s*(?:{country_code.upper()}[ -]?)?(.*)\\s*')
+        self.pattern = re.compile(pc_pattern)
+
+        self.output = config.get('output', r'\g<0>')
  
  
      def normalize(self, postcode):
          """ Return the normalized version of the postcode. If the given postcode
              does not correspond to the usage-pattern, return null.
          """
-        normalized = postcode.strip().upper()
+        # Upper-case, strip spaces and leading country code.
+        normalized = self.norm_pattern.fullmatch(postcode.upper())
  
-        match = self.pattern.fullmatch(normalized)
+        if normalized:
+            match = self.pattern.fullmatch(normalized.group(1))
+            return match.expand(self.output) if match else None
  
-        return match.group(1) if match else None
+        return None
  
  
  class _PostcodeSanitizer:
diff --git a/settings/country_settings.yaml b/settings/country_settings.yaml

index adb7593ed538f1def1d39a2fe0c1f70928c62632..f09de046fcdb6785677ae5c88b4e85f47e2a366a 100644 (file)
--- a/settings/country_settings.yaml
+++ b/settings/country_settings.yaml
@@ -456,6 +456,9 @@ cz:
      partition: 124
      languages: cs
      names: !include country-names/cz.yaml
+    postcode:
+      pattern: "(ddd) ?(dd)"
+      output: \1 \2
  
  
  # Germany (Deutschland)
@@ -1618,6 +1621,9 @@ se:
      partition: 112
      languages: sv
      names: !include country-names/se.yaml
+    postcode:
+      pattern: "(ddd) ?(dd)"
+      output: \1 \2
  
  
  # Singapore (Singapore)
@@ -1657,6 +1663,9 @@ sk:
      partition: 172
      languages: sk
      names: !include country-names/sk.yaml
+    postcode:
+      pattern: "(ddd) ?(dd)"
+      output: \1 \2
  
  
  # Sierra Leone (Sierra Leone)
diff --git a/test/python/tokenizer/sanitizers/test_clean_postcodes.py b/test/python/tokenizer/sanitizers/test_clean_postcodes.py

index e5c07596a7942c7a34678d1c93deab257c02e983..228c2f3a1a9adf3331c4536ce03468a875f96426 100644 (file)
--- a/test/python/tokenizer/sanitizers/test_clean_postcodes.py
+++ b/test/python/tokenizer/sanitizers/test_clean_postcodes.py
@@ -77,3 +77,14 @@ def test_postcode_kazakhstan_pass(sanitize, postcode):
  def test_postcode_kazakhstan_fail(sanitize, postcode):
      assert sanitize(country='kz', postcode=postcode) == []
  
+
+@pytest.mark.parametrize("postcode", ('675 34', '67534', 'SE-675 34', 'SE67534'))
+def test_postcode_sweden_pass(sanitize, postcode):
+    assert sanitize(country='se', postcode=postcode) == [('postcode', '675 34')]
+
+
+@pytest.mark.parametrize("postcode", ('67 345', '671123'))
+@pytest.mark.sanitizer_params(convert_to_address=False)
+def test_postcode_sweden_fail(sanitize, postcode):
+    assert sanitize(country='se', postcode=postcode) == []
+
author	Sarah Hoffmann <lonvia@denofr.de>
	Mon, 23 May 2022 12:04:22 +0000 (14:04 +0200)
committer	Sarah Hoffmann <lonvia@denofr.de>
	Thu, 23 Jun 2022 21:42:31 +0000 (23:42 +0200)
nominatim/tokenizer/sanitizers/clean_postcodes.py		patch | blob | history
settings/country_settings.yaml		patch | blob | history
test/python/tokenizer/sanitizers/test_clean_postcodes.py		patch | blob | history