git.openstreetmap.org Git - nominatim.git/blobdiff - nominatim/tokenizer/sanitizers/clean_postcodes.py
Merge pull request #3167 from lonvia/explicit-encoding
[nominatim.git] / nominatim / tokenizer / sanitizers / clean_postcodes.py
index c6292a2942b217ae866c8b43ded81a5aa75ff089..5eaea3917c7aea9a2e8047f773cd03ac17990d34 100644 (file)
@@ -15,74 +15,29 @@ Arguments:
                         postcode centroids of a country but is still searchable.
                         When set to 'no', non-conforming postcodes are not
                         searchable either.
                         postcode centroids of a country but is still searchable.
                         When set to 'no', non-conforming postcodes are not
                         searchable either.
+    default-pattern:    Pattern to use, when there is none available for the
+                        country in question. Warning: will not be used for
+                        objects that have no country assigned. These are always
+                        assumed to have no postcode.
 """
 """
-import re
-
-from nominatim.errors import UsageError
-from nominatim.tools import country_info
-
-class _PostcodeMatcher:
-    """ Matches and formats a postcode according to the format definition.
-    """
-    def __init__(self, country_code, config):
-        if 'pattern' not in config:
-            raise UsageError("Field 'pattern' required for 'postcode' "
-                             f"for country '{country_code}'")
-
-        pc_pattern = config['pattern'].replace('d', '[0-9]').replace('l', '[A-Z]')
-
-        self.norm_pattern = re.compile(f'\\s*(?:{country_code.upper()}[ -]?)?(.*)\\s*')
-        self.pattern = re.compile(pc_pattern)
-
-        self.output = config.get('output', r'\g<0>')
-
-
-    def match(self, postcode):
-        """ Match the given postcode against the postcode pattern for this
-            matcher. Returns a `re.Match` object if the match was successful
-            and None otherwise.
-        """
-        # Upper-case, strip spaces and leading country code.
-        normalized = self.norm_pattern.fullmatch(postcode.upper())
-
-        if normalized:
-            return self.pattern.fullmatch(normalized.group(1))
-
-        return None
-
-
-    def normalize(self, match):
-        """ Return the default format of the postcode for the given match.
-            `match` must be a `re.Match` object previously returned by
-            `match()`
-        """
-        return match.expand(self.output)
+from typing import Callable, Optional, Tuple
 
 
+from nominatim.data.postcode_format import PostcodeFormatter
+from nominatim.tokenizer.sanitizers.base import ProcessInfo
+from nominatim.tokenizer.sanitizers.config import SanitizerConfig
 
 class _PostcodeSanitizer:
 
 
 class _PostcodeSanitizer:
 
-    def __init__(self, config):
+    def __init__(self, config: SanitizerConfig) -> None:
         self.convert_to_address = config.get_bool('convert-to-address', True)
         self.convert_to_address = config.get_bool('convert-to-address', True)
-        # Objects without a country code can't have a postcode per definition.
-        self.country_without_postcode = {None}
-        self.country_matcher = {}
-
-        for ccode, prop in country_info.iterate('postcode'):
-            if prop is False:
-                self.country_without_postcode.add(ccode)
-            elif isinstance(prop, dict):
-                self.country_matcher[ccode] = _PostcodeMatcher(ccode, prop)
-            else:
-                raise UsageError(f"Invalid entry 'postcode' for country '{ccode}'")
+        self.matcher = PostcodeFormatter()
 
         default_pattern = config.get('default-pattern')
         if default_pattern is not None and isinstance(default_pattern, str):
 
         default_pattern = config.get('default-pattern')
         if default_pattern is not None and isinstance(default_pattern, str):
-            self.default_matcher = _PostcodeMatcher('', {'pattern': default_pattern})
-        else:
-            self.default_matcher = None
+            self.matcher.set_default_pattern(default_pattern)
 
 
 
 
-    def __call__(self, obj):
+    def __call__(self, obj: ProcessInfo) -> None:
         if not obj.address:
             return
 
         if not obj.address:
             return
 
@@ -98,32 +53,28 @@ class _PostcodeSanitizer:
                     obj.address.pop(pos)
             else:
                 postcode.name = formatted[0]
                     obj.address.pop(pos)
             else:
                 postcode.name = formatted[0]
-                postcode.set_attr('lookup', formatted[1])
+                postcode.set_attr('variant', formatted[1])
 
 
 
 
-    def scan(self, postcode, country):
+    def scan(self, postcode: str, country: Optional[str]) -> Optional[Tuple[str, str]]:
         """ Check the postcode for correct formatting and return the
             normalized version. Returns None if the postcode does not
         """ Check the postcode for correct formatting and return the
             normalized version. Returns None if the postcode does not
-            correspond to the oficial format of the given country.
+            correspond to the official format of the given country.
         """
         """
-        if country in self.country_without_postcode:
-            return None
-
-        matcher = self.country_matcher.get(country, self.default_matcher)
-        if matcher is None:
-            return postcode.upper(), ''
-
-        match = matcher.match(postcode)
+        match = self.matcher.match(country, postcode)
         if match is None:
             return None
 
         if match is None:
             return None
 
-        return matcher.normalize(match), ' '.join(match.groups())
+        assert country is not None
+
+        return self.matcher.normalize(country, match),\
+               ' '.join(filter(lambda p: p is not None, match.groups()))
 
 
 
 
 
 
 
 
-def create(config):
-    """ Create a housenumber processing function.
+def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
+    """ Create a function that filters postcodes by their officially allowed pattern.
     """
 
     return _PostcodeSanitizer(config)
     """
 
     return _PostcodeSanitizer(config)