git.openstreetmap.org Git - nominatim.git/blobdiff - nominatim/tokenizer/sanitizers/clean_postcodes.py
Merge remote-tracking branch 'upstream/master'
[nominatim.git] / nominatim / tokenizer / sanitizers / clean_postcodes.py
index ae1cd62d8d09f7c9afe5b1fac3949fccaca8b941..05e90ca122fa71eb4f8eb8f482bd15819fa623c2 100644 (file)
@@ -15,51 +15,22 @@ Arguments:
                         postcode centroids of a country but is still searchable.
                         When set to 'no', non-conforming postcodes are not
                         searchable either.
+    default-pattern:    Pattern to use, when there is none available for the
+                        country in question. Warning: will not be used for
+                        objects that have no country assigned. These are always
+                        assumed to have no postcode.
 """
-import re
-
-from nominatim.errors import UsageError
-from nominatim.tools import country_info
-
-class _PostcodeMatcher:
-    """ Matches and formats a postcode according to the format definition.
-    """
-    def __init__(self, country_code, config):
-        if 'pattern' not in config:
-            raise UsageError("Field 'pattern' required for 'postcode' "
-                             f"for country '{country_code}'")
-
-        pc_pattern = config['pattern'].replace('d', '[0-9]').replace('l', '[A-Z]')
-
-        self.pattern = re.compile(f'(?:{country_code.upper()}[ -]?)?({pc_pattern})')
-
-
-    def normalize(self, postcode):
-        """ Return the normalized version of the postcode. If the given postcode
-            does not correspond to the usage-pattern, return null.
-        """
-        normalized = postcode.strip().upper()
-
-        match = self.pattern.fullmatch(normalized)
-
-        return match.group(1) if match else None
-
+from nominatim.data.postcode_format import PostcodeFormatter
 
 class _PostcodeSanitizer:
 
     def __init__(self, config):
         self.convert_to_address = config.get_bool('convert-to-address', True)
-        # Objects without a country code can't have a postcode per definition.
-        self.country_without_postcode = {None}
-        self.country_matcher = {}
-
-        for ccode, prop in country_info.iterate('postcode'):
-            if prop is False:
-                self.country_without_postcode.add(ccode)
-            elif isinstance(prop, dict):
-                self.country_matcher[ccode] = _PostcodeMatcher(ccode, prop)
-            else:
-                raise UsageError(f"Invalid entry 'postcode' for country '{ccode}'")
+        self.matcher = PostcodeFormatter()
+
+        default_pattern = config.get('default-pattern')
+        if default_pattern is not None and isinstance(default_pattern, str):
+            self.matcher.set_default_pattern(default_pattern)
 
 
     def __call__(self, obj):
@@ -77,7 +48,8 @@ class _PostcodeSanitizer:
                 else:
                     obj.address.pop(pos)
             else:
-                postcode.name = formatted
+                postcode.name = formatted[0]
+                postcode.set_attr('variant', formatted[1])
 
 
     def scan(self, postcode, country):
@@ -85,13 +57,13 @@ class _PostcodeSanitizer:
             normalized version. Returns None if the postcode does not
             correspond to the official format of the given country.
         """
-        if country in self.country_without_postcode:
+        match = self.matcher.match(country, postcode)
+        if match is None:
             return None
 
-        if country in self.country_matcher:
-            return self.country_matcher[country].normalize(postcode)
+        return self.matcher.normalize(country, match),\
+               ' '.join(filter(lambda p: p is not None, match.groups()))
 
-        return postcode.upper()