From: Sarah Hoffmann Date: Mon, 23 May 2022 12:04:22 +0000 (+0200) Subject: postcodes: add support for optional spaces X-Git-Tag: v4.1.0~22^2~22 X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/9172696324d1a3cd489428d16c2c8d88cf9adaca postcodes: add support for optional spaces --- diff --git a/nominatim/tokenizer/sanitizers/clean_postcodes.py b/nominatim/tokenizer/sanitizers/clean_postcodes.py index ae1cd62d..a968c9db 100644 --- a/nominatim/tokenizer/sanitizers/clean_postcodes.py +++ b/nominatim/tokenizer/sanitizers/clean_postcodes.py @@ -31,18 +31,24 @@ class _PostcodeMatcher: pc_pattern = config['pattern'].replace('d', '[0-9]').replace('l', '[A-Z]') - self.pattern = re.compile(f'(?:{country_code.upper()}[ -]?)?({pc_pattern})') + self.norm_pattern = re.compile(f'\\s*(?:{country_code.upper()}[ -]?)?(.*)\\s*') + self.pattern = re.compile(pc_pattern) + + self.output = config.get('output', r'\g<0>') def normalize(self, postcode): """ Return the normalized version of the postcode. If the given postcode does not correspond to the usage-pattern, return null. """ - normalized = postcode.strip().upper() + # Upper-case, strip spaces and leading country code. + normalized = self.norm_pattern.fullmatch(postcode.upper()) - match = self.pattern.fullmatch(normalized) + if normalized: + match = self.pattern.fullmatch(normalized.group(1)) + return match.expand(self.output) if match else None - return match.group(1) if match else None + return None class _PostcodeSanitizer: diff --git a/settings/country_settings.yaml b/settings/country_settings.yaml index adb7593e..f09de046 100644 --- a/settings/country_settings.yaml +++ b/settings/country_settings.yaml @@ -456,6 +456,9 @@ cz: partition: 124 languages: cs names: !include country-names/cz.yaml + postcode: + pattern: "(ddd) ?(dd)" + output: \1 \2 # Germany (Deutschland) @@ -1618,6 +1621,9 @@ se: partition: 112 languages: sv names: !include country-names/se.yaml + postcode: + pattern: "(ddd) ?(dd)" + output: \1 \2 # Singapore (Singapore) @@ -1657,6 +1663,9 @@ sk: partition: 172 languages: sk names: !include country-names/sk.yaml + postcode: + pattern: "(ddd) ?(dd)" + output: \1 \2 # Sierra Leone (Sierra Leone) diff --git a/test/python/tokenizer/sanitizers/test_clean_postcodes.py b/test/python/tokenizer/sanitizers/test_clean_postcodes.py index e5c07596..228c2f3a 100644 --- a/test/python/tokenizer/sanitizers/test_clean_postcodes.py +++ b/test/python/tokenizer/sanitizers/test_clean_postcodes.py @@ -77,3 +77,14 @@ def test_postcode_kazakhstan_pass(sanitize, postcode): def test_postcode_kazakhstan_fail(sanitize, postcode): assert sanitize(country='kz', postcode=postcode) == [] + +@pytest.mark.parametrize("postcode", ('675 34', '67534', 'SE-675 34', 'SE67534')) +def test_postcode_sweden_pass(sanitize, postcode): + assert sanitize(country='se', postcode=postcode) == [('postcode', '675 34')] + + +@pytest.mark.parametrize("postcode", ('67 345', '671123')) +@pytest.mark.sanitizer_params(convert_to_address=False) +def test_postcode_sweden_fail(sanitize, postcode): + assert sanitize(country='se', postcode=postcode) == [] +