]> git.openstreetmap.org Git - nominatim.git/commitdiff
initial postcode cleaner for simple patterns
authorSarah Hoffmann <lonvia@denofr.de>
Thu, 19 May 2022 13:49:36 +0000 (15:49 +0200)
committerSarah Hoffmann <lonvia@denofr.de>
Thu, 23 Jun 2022 21:42:31 +0000 (23:42 +0200)
Moves postcodes that are either in countries without a postcode
system or don't correspond to the local pattern for postcodes into
a field for a normal address part. Makes them searchable but not as
a special address. This has two consequences: they are no longer a
skippable part of the address and the postcodes cannot be searched
on their own.

nominatim/tokenizer/sanitizers/clean_postcodes.py [new file with mode: 0644]
test/python/tokenizer/sanitizers/test_clean_postcodes.py [new file with mode: 0644]

diff --git a/nominatim/tokenizer/sanitizers/clean_postcodes.py b/nominatim/tokenizer/sanitizers/clean_postcodes.py
new file mode 100644 (file)
index 0000000..b07908c
--- /dev/null
@@ -0,0 +1,99 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Sanitizer that filters postcodes by their officially allowed pattern.
+
+Arguments:
+    convert-to-address: If set to 'yes' (the default), then postcodes that do
+                        not conform with their country-specific pattern are
+                        converted to an address component. That means that
+                        the postcode does not take part when computing the
+                        postcode centroids of a country but is still searchable.
+                        When set to 'no', non-conforming postcodes are not
+                        searchable either.
+"""
+import re
+
+from nominatim.errors import UsageError
+from nominatim.tools import country_info
+
+class _PostcodeMatcher:
+    """ Matches and formats a postcode according to the format definition.
+    """
+    def __init__(self, country_code, config):
+        if 'pattern' not in config:
+            raise UsageError("Field 'pattern' required for 'postcode' "
+                             f"for country '{country_code}'")
+
+        self.pattern = re.compile(config['pattern'].replace('d', '[0-9]')
+                                                   .replace('l', '[A-Z]'))
+
+
+    def normalize(self, postcode):
+        """ Return the normalized version of the postcode. If the given postcode
+            does not correspond to the usage-pattern, return null.
+        """
+        normalized = postcode.strip().upper()
+
+        return normalized if self.pattern.fullmatch(normalized) else None
+
+
+class _PostcodeSanitizer:
+
+    def __init__(self, config):
+        self.convert_to_address = config.get_bool('convert-to-address', True)
+        # Objects without a country code can't have a postcode per definition.
+        self.country_without_postcode = {None}
+        self.country_matcher = {}
+
+        for ccode, prop in country_info.iterate('postcode'):
+            if prop is False:
+                self.country_without_postcode.add(ccode)
+            elif isinstance(prop, dict):
+                self.country_matcher[ccode] = _PostcodeMatcher(ccode, prop)
+            else:
+                raise UsageError(f"Invalid entry 'postcode' for country '{ccode}'")
+
+
+    def __call__(self, obj):
+        if not obj.address:
+            return
+
+        postcodes = ((i, o) for i, o in enumerate(obj.address) if o.kind == 'postcode')
+
+        for pos, postcode in postcodes:
+            formatted = self.scan(postcode.name, obj.place.country_code)
+
+            if formatted is None:
+                if self.convert_to_address:
+                    postcode.kind = 'unofficial_postcode'
+                else:
+                    obj.address.pop(pos)
+            else:
+                postcode.name = formatted
+
+
+    def scan(self, postcode, country):
+        """ Check the postcode for correct formatting and return the
+            normalized version. Returns None if the postcode does not
+            correspond to the oficial format of the given country.
+        """
+        if country in self.country_without_postcode:
+            return None
+
+        if country in self.country_matcher:
+            return self.country_matcher[country].normalize(postcode)
+
+        return postcode.upper()
+
+
+
+def create(config):
+    """ Create a housenumber processing function.
+    """
+
+    return _PostcodeSanitizer(config)
diff --git a/test/python/tokenizer/sanitizers/test_clean_postcodes.py b/test/python/tokenizer/sanitizers/test_clean_postcodes.py
new file mode 100644 (file)
index 0000000..7cb3c70
--- /dev/null
@@ -0,0 +1,54 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Tests for the sanitizer that normalizes postcodes.
+"""
+import pytest
+
+from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
+from nominatim.indexer.place_info import PlaceInfo
+from nominatim.tools import country_info
+
+@pytest.fixture
+def sanitize(def_config, request):
+    country_info.setup_country_config(def_config)
+    sanitizer_args = {'step': 'clean-postcodes'}
+    for mark in request.node.iter_markers(name="sanitizer_params"):
+        sanitizer_args.update({k.replace('_', '-') : v for k,v in mark.kwargs.items()})
+
+    def _run(country=None, **kwargs):
+        pi = {'address': kwargs}
+        if country is not None:
+            pi['country_code'] = country
+
+        _, address = PlaceSanitizer([sanitizer_args]).process_names(PlaceInfo(pi))
+
+        return sorted([(p.kind, p.name) for p in address])
+
+    return _run
+
+
+@pytest.mark.parametrize("country", (None, 'ae'))
+def test_postcode_no_country(sanitize, country):
+    assert sanitize(country=country, postcode='23231') == [('unofficial_postcode', '23231')]
+
+
+@pytest.mark.parametrize("country", (None, 'ae'))
+@pytest.mark.sanitizer_params(convert_to_address=False)
+def test_postcode_no_country_drop(sanitize, country):
+    assert sanitize(country=country, postcode='23231') == []
+
+
+@pytest.mark.parametrize("postcode", ('12345', '  34009  '))
+def test_postcode_pass_good_format(sanitize, postcode):
+    assert sanitize(country='de', postcode=postcode) == [('postcode', postcode.strip())]
+
+
+@pytest.mark.parametrize("postcode", ('123456', '', '   ', '.....'))
+@pytest.mark.sanitizer_params(convert_to_address=False)
+def test_postcode_drop_bad_format(sanitize, postcode):
+    assert sanitize(country='de', postcode=postcode) == []