]> git.openstreetmap.org Git - nominatim.git/blobdiff - nominatim/tokenizer/sanitizers/clean_postcodes.py
Vagrant and CI tests for Ubuntu 22.04
[nominatim.git] / nominatim / tokenizer / sanitizers / clean_postcodes.py
index a968c9db0787f8a4bb009d72b20db72800145094..05e90ca122fa71eb4f8eb8f482bd15819fa623c2 100644 (file)
@@ -15,57 +15,22 @@ Arguments:
                         postcode centroids of a country but is still searchable.
                         When set to 'no', non-conforming postcodes are not
                         searchable either.
                         postcode centroids of a country but is still searchable.
                         When set to 'no', non-conforming postcodes are not
                         searchable either.
+    default-pattern:    Pattern to use when none is available for the
+                        country in question. Warning: it will not be used for
+                        objects that have no country assigned; these are always
+                        assumed to have no postcode.
 """
 """
-import re
-
-from nominatim.errors import UsageError
-from nominatim.tools import country_info
-
-class _PostcodeMatcher:
-    """ Matches and formats a postcode according to the format definition.
-    """
-    def __init__(self, country_code, config):
-        if 'pattern' not in config:
-            raise UsageError("Field 'pattern' required for 'postcode' "
-                             f"for country '{country_code}'")
-
-        pc_pattern = config['pattern'].replace('d', '[0-9]').replace('l', '[A-Z]')
-
-        self.norm_pattern = re.compile(f'\\s*(?:{country_code.upper()}[ -]?)?(.*)\\s*')
-        self.pattern = re.compile(pc_pattern)
-
-        self.output = config.get('output', r'\g<0>')
-
-
-    def normalize(self, postcode):
-        """ Return the normalized version of the postcode. If the given postcode
-            does not correspond to the usage-pattern, return null.
-        """
-        # Upper-case, strip spaces and leading country code.
-        normalized = self.norm_pattern.fullmatch(postcode.upper())
-
-        if normalized:
-            match = self.pattern.fullmatch(normalized.group(1))
-            return match.expand(self.output) if match else None
-
-        return None
-
+from nominatim.data.postcode_format import PostcodeFormatter
 
 class _PostcodeSanitizer:
 
     def __init__(self, config):
         self.convert_to_address = config.get_bool('convert-to-address', True)
 
 class _PostcodeSanitizer:
 
     def __init__(self, config):
         self.convert_to_address = config.get_bool('convert-to-address', True)
-        # Objects without a country code can't have a postcode per definition.
-        self.country_without_postcode = {None}
-        self.country_matcher = {}
-
-        for ccode, prop in country_info.iterate('postcode'):
-            if prop is False:
-                self.country_without_postcode.add(ccode)
-            elif isinstance(prop, dict):
-                self.country_matcher[ccode] = _PostcodeMatcher(ccode, prop)
-            else:
-                raise UsageError(f"Invalid entry 'postcode' for country '{ccode}'")
+        self.matcher = PostcodeFormatter()
+
+        default_pattern = config.get('default-pattern')
+        if default_pattern is not None and isinstance(default_pattern, str):
+            self.matcher.set_default_pattern(default_pattern)
 
 
     def __call__(self, obj):
 
 
     def __call__(self, obj):
@@ -83,7 +48,8 @@ class _PostcodeSanitizer:
                 else:
                     obj.address.pop(pos)
             else:
                 else:
                     obj.address.pop(pos)
             else:
-                postcode.name = formatted
+                postcode.name = formatted[0]
+                postcode.set_attr('variant', formatted[1])
 
 
     def scan(self, postcode, country):
 
 
     def scan(self, postcode, country):
@@ -91,13 +57,13 @@ class _PostcodeSanitizer:
             normalized version. Returns None if the postcode does not
             correspond to the official format of the given country.
         """
             normalized version. Returns None if the postcode does not
             correspond to the official format of the given country.
         """
-        if country in self.country_without_postcode:
+        match = self.matcher.match(country, postcode)
+        if match is None:
             return None
 
             return None
 
-        if country in self.country_matcher:
-            return self.country_matcher[country].normalize(postcode)
+        return self.matcher.normalize(country, match),\
+               ' '.join(filter(lambda p: p is not None, match.groups()))
 
 
-        return postcode.upper()