From 80ea13437df4c6d57ea503adbdfc9928de8d859c Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann <lonvia@denofr.de>
Date: Mon, 6 Jun 2022 23:37:04 +0200
Subject: [PATCH] move postcode matcher in a separate file

---
 nominatim/data/__init__.py                    |  0
 nominatim/data/postcode_format.py             | 97 +++++++++++++++++++
 nominatim/tokenizer/icu_tokenizer.py          |  2 +-
 .../tokenizer/sanitizers/clean_postcodes.py   | 70 +------------
 test/python/tokenizer/test_icu.py             |  7 --
 5 files changed, 103 insertions(+), 73 deletions(-)
 create mode 100644 nominatim/data/__init__.py
 create mode 100644 nominatim/data/postcode_format.py

diff --git a/nominatim/data/__init__.py b/nominatim/data/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/nominatim/data/postcode_format.py b/nominatim/data/postcode_format.py
new file mode 100644
index 00000000..0158111a
--- /dev/null
+++ b/nominatim/data/postcode_format.py
@@ -0,0 +1,97 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Functions for formatting postcodes according to their country-specific
+format.
+"""
+import re
+
+from nominatim.errors import UsageError
+from nominatim.tools import country_info
+
+class CountryPostcodeMatcher:
+    """ Matches and formats a postcode according to a format definition
+        of the given country.
+    """
+    def __init__(self, country_code, config):
+        if 'pattern' not in config:
+            raise UsageError("Field 'pattern' required for 'postcode' "
+                             f"for country '{country_code}'")
+
+        pc_pattern = config['pattern'].replace('d', '[0-9]').replace('l', '[A-Z]')
+
+        self.norm_pattern = re.compile(f'\\s*(?:{country_code.upper()}[ -]?)?(.*)\\s*')
+        self.pattern = re.compile(pc_pattern)
+
+        self.output = config.get('output', r'\g<0>')
+
+
+    def match(self, postcode):
+        """ Match the given postcode against the postcode pattern for this
+            matcher. Returns a `re.Match` object if the match was successful
+            and None otherwise.
+        """
+        # Upper-case, strip spaces and leading country code.
+        normalized = self.norm_pattern.fullmatch(postcode.upper())
+
+        if normalized:
+            return self.pattern.fullmatch(normalized.group(1))
+
+        return None
+
+
+    def normalize(self, match):
+        """ Return the default format of the postcode for the given match.
+            `match` must be a `re.Match` object previously returned by
+            `match()`
+        """
+        return match.expand(self.output)
+
+
+class PostcodeFormatter:
+    """ Container for different postcode formats of the world and
+        access functions.
+    """
+    def __init__(self):
+        # Objects without a country code can't have a postcode per definition.
+        self.country_without_postcode = {None}
+        self.country_matcher = {}
+        self.default_matcher = CountryPostcodeMatcher('', {'pattern': '.*'})
+
+        for ccode, prop in country_info.iterate('postcode'):
+            if prop is False:
+                self.country_without_postcode.add(ccode)
+            elif isinstance(prop, dict):
+                self.country_matcher[ccode] = CountryPostcodeMatcher(ccode, prop)
+            else:
+                raise UsageError(f"Invalid entry 'postcode' for country '{ccode}'")
+
+
+    def set_default_pattern(self, pattern):
+        """ Set the postcode match pattern to use, when a country does not
+            have a specific pattern or is marked as country without postcode.
+        """
+        self.default_matcher = CountryPostcodeMatcher('', {'pattern': pattern})
+
+
+    def match(self, country_code, postcode):
+        """ Match the given postcode against the postcode pattern for this
+            matcher. Returns a `re.Match` object if the country has a pattern
+            and the match was successful or None if the match failed.
+        """
+        if country_code in self.country_without_postcode:
+            return None
+
+        return self.country_matcher.get(country_code, self.default_matcher).match(postcode)
+
+
+    def normalize(self, country_code, match):
+        """ Return the default format of the postcode for the given match.
+            `match` must be a `re.Match` object previously returned by
+            `match()`
+        """
+        return self.country_matcher.get(country_code, self.default_matcher).normalize(match)
diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py
index 61c47c11..0dc551e1 100644
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -607,7 +607,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
     def _add_postcode(self, item):
         """ Make sure the normalized postcode is present in the word table.
         """
-        analyzer = self.token_analysis.get_analyzer('@postcode')
+        analyzer = self.token_analysis.analysis.get('@postcode')
 
         if analyzer is None:
             postcode_name = item.name.strip().upper()
diff --git a/nominatim/tokenizer/sanitizers/clean_postcodes.py b/nominatim/tokenizer/sanitizers/clean_postcodes.py
index d1edc60d..fbc46fa5 100644
--- a/nominatim/tokenizer/sanitizers/clean_postcodes.py
+++ b/nominatim/tokenizer/sanitizers/clean_postcodes.py
@@ -16,70 +16,17 @@ Arguments:
                         When set to 'no', non-conforming postcodes are not
                         searchable either.
 """
-import re
-
-from nominatim.errors import UsageError
-from nominatim.tools import country_info
-
-class _PostcodeMatcher:
-    """ Matches and formats a postcode according to the format definition.
-    """
-    def __init__(self, country_code, config):
-        if 'pattern' not in config:
-            raise UsageError("Field 'pattern' required for 'postcode' "
-                             f"for country '{country_code}'")
-
-        pc_pattern = config['pattern'].replace('d', '[0-9]').replace('l', '[A-Z]')
-
-        self.norm_pattern = re.compile(f'\\s*(?:{country_code.upper()}[ -]?)?(.*)\\s*')
-        self.pattern = re.compile(pc_pattern)
-
-        self.output = config.get('output', r'\g<0>')
-
-
-    def match(self, postcode):
-        """ Match the given postcode against the postcode pattern for this
-            matcher. Returns a `re.Match` object if the match was successful
-            and None otherwise.
-        """
-        # Upper-case, strip spaces and leading country code.
-        normalized = self.norm_pattern.fullmatch(postcode.upper())
-
-        if normalized:
-            return self.pattern.fullmatch(normalized.group(1))
-
-        return None
-
-
-    def normalize(self, match):
-        """ Return the default format of the postcode for the given match.
-            `match` must be a `re.Match` object previously returned by
-            `match()`
-        """
-        return match.expand(self.output)
-
+from nominatim.data.postcode_format import PostcodeFormatter
 
 class _PostcodeSanitizer:
 
     def __init__(self, config):
         self.convert_to_address = config.get_bool('convert-to-address', True)
-        # Objects without a country code can't have a postcode per definition.
-        self.country_without_postcode = {None}
-        self.country_matcher = {}
-
-        for ccode, prop in country_info.iterate('postcode'):
-            if prop is False:
-                self.country_without_postcode.add(ccode)
-            elif isinstance(prop, dict):
-                self.country_matcher[ccode] = _PostcodeMatcher(ccode, prop)
-            else:
-                raise UsageError(f"Invalid entry 'postcode' for country '{ccode}'")
+        self.matcher = PostcodeFormatter()
 
         default_pattern = config.get('default-pattern')
         if default_pattern is not None and isinstance(default_pattern, str):
-            self.default_matcher = _PostcodeMatcher('', {'pattern': default_pattern})
-        else:
-            self.default_matcher = None
+            self.matcher.set_default_pattern(default_pattern)
 
 
     def __call__(self, obj):
@@ -106,18 +53,11 @@ class _PostcodeSanitizer:
             normalized version. Returns None if the postcode does not
             correspond to the oficial format of the given country.
         """
-        if country in self.country_without_postcode:
-            return None
-
-        matcher = self.country_matcher.get(country, self.default_matcher)
-        if matcher is None:
-            return postcode.upper(), ''
-
-        match = matcher.match(postcode)
+        match = self.matcher.match(country, postcode)
         if match is None:
             return None
 
-        return matcher.normalize(match), ' '.join(match.groups())
+        return self.matcher.normalize(country, match), ' '.join(match.groups())
 
 
 
diff --git a/test/python/tokenizer/test_icu.py b/test/python/tokenizer/test_icu.py
index d85a5b65..6138a03a 100644
--- a/test/python/tokenizer/test_icu.py
+++ b/test/python/tokenizer/test_icu.py
@@ -437,13 +437,6 @@ class TestPlaceAddress:
         assert word_table.get_postcodes() == {pcode, }
 
 
-    @pytest.mark.parametrize('pcode', ['12:23', 'ab;cd;f', '123;836'])
-    def test_process_place_bad_postcode(self, word_table, pcode):
-        self.process_address(postcode=pcode)
-
-        assert not word_table.get_postcodes()
-
-
     @pytest.mark.parametrize('hnr', ['123a', '1', '101'])
     def test_process_place_housenumbers_simple(self, hnr, getorcreate_hnr_id):
         info = self.process_address(housenumber=hnr)
-- 
2.39.5