]> git.openstreetmap.org Git - nominatim.git/blobdiff - nominatim/tokenizer/sanitizers/clean_postcodes.py
reintroduce cutoffs when searching for very frequent words
[nominatim.git] / nominatim / tokenizer / sanitizers / clean_postcodes.py
index b07908cdee9339118b058d26aa1736451c5659c1..5eaea3917c7aea9a2e8047f773cd03ac17990d34 100644 (file)
@@ -15,51 +15,29 @@ Arguments:
                         postcode centroids of a country but is still searchable.
                         When set to 'no', non-conforming postcodes are not
                         searchable either.
+    default-pattern:    Pattern to use, when there is none available for the
+                        country in question. Warning: will not be used for
+                        objects that have no country assigned. These are always
+                        assumed to have no postcode.
 """
-import re
-
-from nominatim.errors import UsageError
-from nominatim.tools import country_info
-
-class _PostcodeMatcher:
-    """ Matches and formats a postcode according to the format definition.
-    """
-    def __init__(self, country_code, config):
-        if 'pattern' not in config:
-            raise UsageError("Field 'pattern' required for 'postcode' "
-                             f"for country '{country_code}'")
-
-        self.pattern = re.compile(config['pattern'].replace('d', '[0-9]')
-                                                   .replace('l', '[A-Z]'))
-
-
-    def normalize(self, postcode):
-        """ Return the normalized version of the postcode. If the given postcode
-            does not correspond to the usage-pattern, return null.
-        """
-        normalized = postcode.strip().upper()
-
-        return normalized if self.pattern.fullmatch(normalized) else None
+from typing import Callable, Optional, Tuple
 
+from nominatim.data.postcode_format import PostcodeFormatter
+from nominatim.tokenizer.sanitizers.base import ProcessInfo
+from nominatim.tokenizer.sanitizers.config import SanitizerConfig
 
 class _PostcodeSanitizer:
 
-    def __init__(self, config):
+    def __init__(self, config: SanitizerConfig) -> None:
         self.convert_to_address = config.get_bool('convert-to-address', True)
-        # Objects without a country code can't have a postcode per definition.
-        self.country_without_postcode = {None}
-        self.country_matcher = {}
-
-        for ccode, prop in country_info.iterate('postcode'):
-            if prop is False:
-                self.country_without_postcode.add(ccode)
-            elif isinstance(prop, dict):
-                self.country_matcher[ccode] = _PostcodeMatcher(ccode, prop)
-            else:
-                raise UsageError(f"Invalid entry 'postcode' for country '{ccode}'")
+        self.matcher = PostcodeFormatter()
 
+        default_pattern = config.get('default-pattern')
+        if default_pattern is not None and isinstance(default_pattern, str):
+            self.matcher.set_default_pattern(default_pattern)
 
-    def __call__(self, obj):
+
+    def __call__(self, obj: ProcessInfo) -> None:
         if not obj.address:
             return
 
@@ -74,26 +52,29 @@ class _PostcodeSanitizer:
                 else:
                     obj.address.pop(pos)
             else:
-                postcode.name = formatted
+                postcode.name = formatted[0]
+                postcode.set_attr('variant', formatted[1])
 
 
-    def scan(self, postcode, country):
+    def scan(self, postcode: str, country: Optional[str]) -> Optional[Tuple[str, str]]:
         """ Check the postcode for correct formatting and return the
             normalized version. Returns None if the postcode does not
-            correspond to the oficial format of the given country.
+            correspond to the official format of the given country.
         """
-        if country in self.country_without_postcode:
+        match = self.matcher.match(country, postcode)
+        if match is None:
             return None
 
-        if country in self.country_matcher:
-            return self.country_matcher[country].normalize(postcode)
+        assert country is not None
+
+        return self.matcher.normalize(country, match),\
+               ' '.join(filter(lambda p: p is not None, match.groups()))
 
-        return postcode.upper()
 
 
 
-def create(config):
-    """ Create a housenumber processing function.
+def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
+    """ Create a function that filters postcodes by their officially allowed pattern.
     """
 
     return _PostcodeSanitizer(config)