git.openstreetmap.org Git - nominatim.git/blobdiff - nominatim/tokenizer/sanitizers/clean_postcodes.py
introduce and use analyzer for postcodes
[nominatim.git] / nominatim / tokenizer / sanitizers / clean_postcodes.py
index b07908cdee9339118b058d26aa1736451c5659c1..d1edc60d1e5e8c109a446323a7832f3970e3c3a9 100644 (file)
@@ -29,17 +29,34 @@ class _PostcodeMatcher:
             raise UsageError("Field 'pattern' required for 'postcode' "
                              f"for country '{country_code}'")
 
-        self.pattern = re.compile(config['pattern'].replace('d', '[0-9]')
-                                                   .replace('l', '[A-Z]'))
+        pc_pattern = config['pattern'].replace('d', '[0-9]').replace('l', '[A-Z]')
 
+        self.norm_pattern = re.compile(f'\\s*(?:{country_code.upper()}[ -]?)?(.*)\\s*')
+        self.pattern = re.compile(pc_pattern)
 
-    def normalize(self, postcode):
-        """ Return the normalized version of the postcode. If the given postcode
-            does not correspond to the usage-pattern, return null.
+        self.output = config.get('output', r'\g<0>')
+
+
+    def match(self, postcode):
+        """ Match the given postcode against the postcode pattern for this
+            matcher. Returns a `re.Match` object if the match was successful
+            and None otherwise.
         """
-        normalized = postcode.strip().upper()
+        # Upper-case, strip spaces and leading country code.
+        normalized = self.norm_pattern.fullmatch(postcode.upper())
+
+        if normalized:
+            return self.pattern.fullmatch(normalized.group(1))
+
+        return None
+
 
-        return normalized if self.pattern.fullmatch(normalized) else None
+    def normalize(self, match):
+        """ Return the default format of the postcode for the given match.
+            `match` must be a `re.Match` object previously returned by
+            `match()`
+        """
+        return match.expand(self.output)
 
 
 class _PostcodeSanitizer:
@@ -58,6 +75,12 @@ class _PostcodeSanitizer:
             else:
                 raise UsageError(f"Invalid entry 'postcode' for country '{ccode}'")
 
+        default_pattern = config.get('default-pattern')
+        if default_pattern is not None and isinstance(default_pattern, str):
+            self.default_matcher = _PostcodeMatcher('', {'pattern': default_pattern})
+        else:
+            self.default_matcher = None
+
 
     def __call__(self, obj):
         if not obj.address:
@@ -74,7 +97,8 @@ class _PostcodeSanitizer:
                 else:
                     obj.address.pop(pos)
             else:
-                postcode.name = formatted
+                postcode.name = formatted[0]
+                postcode.set_attr('variant', formatted[1])
 
 
     def scan(self, postcode, country):
@@ -85,10 +109,16 @@ class _PostcodeSanitizer:
         if country in self.country_without_postcode:
             return None
 
-        if country in self.country_matcher:
-            return self.country_matcher[country].normalize(postcode)
+        matcher = self.country_matcher.get(country, self.default_matcher)
+        if matcher is None:
+            return postcode.upper(), ''
+
+        match = matcher.match(postcode)
+        if match is None:
+            return None
+
+        return matcher.normalize(match), ' '.join(match.groups())
 
-        return postcode.upper()