]> git.openstreetmap.org Git - nominatim.git/commitdiff
move postcode matcher in a separate file
authorSarah Hoffmann <lonvia@denofr.de>
Mon, 6 Jun 2022 21:37:04 +0000 (23:37 +0200)
committerSarah Hoffmann <lonvia@denofr.de>
Thu, 23 Jun 2022 21:42:31 +0000 (23:42 +0200)
nominatim/data/__init__.py [new file with mode: 0644]
nominatim/data/postcode_format.py [new file with mode: 0644]
nominatim/tokenizer/icu_tokenizer.py
nominatim/tokenizer/sanitizers/clean_postcodes.py
test/python/tokenizer/test_icu.py

diff --git a/nominatim/data/__init__.py b/nominatim/data/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/nominatim/data/postcode_format.py b/nominatim/data/postcode_format.py
new file mode 100644 (file)
index 0000000..0158111
--- /dev/null
@@ -0,0 +1,97 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Functions for formatting postcodes according to their country-specific
+format.
+"""
+import re
+
+from nominatim.errors import UsageError
+from nominatim.tools import country_info
+
+class CountryPostcodeMatcher:
+    """ Matches and formats a postcode according to a format definition
+        of the given country.
+    """
+    def __init__(self, country_code, config):
+        if 'pattern' not in config:
+            raise UsageError("Field 'pattern' required for 'postcode' "
+                             f"for country '{country_code}'")
+
+        pc_pattern = config['pattern'].replace('d', '[0-9]').replace('l', '[A-Z]')
+
+        self.norm_pattern = re.compile(f'\\s*(?:{country_code.upper()}[ -]?)?(.*)\\s*')
+        self.pattern = re.compile(pc_pattern)
+
+        self.output = config.get('output', r'\g<0>')
+
+
+    def match(self, postcode):
+        """ Match the given postcode against the postcode pattern for this
+            matcher. Returns a `re.Match` object if the match was successful
+            and None otherwise.
+        """
+        # Upper-case, strip spaces and leading country code.
+        normalized = self.norm_pattern.fullmatch(postcode.upper())
+
+        if normalized:
+            return self.pattern.fullmatch(normalized.group(1))
+
+        return None
+
+
+    def normalize(self, match):
+        """ Return the default format of the postcode for the given match.
+            `match` must be a `re.Match` object previously returned by
+            `match()`
+        """
+        return match.expand(self.output)
+
+
+class PostcodeFormatter:
+    """ Container for different postcode formats of the world and
+        access functions.
+    """
+    def __init__(self):
+        # Objects without a country code can't have a postcode per definition.
+        self.country_without_postcode = {None}
+        self.country_matcher = {}
+        self.default_matcher = CountryPostcodeMatcher('', {'pattern': '.*'})
+
+        for ccode, prop in country_info.iterate('postcode'):
+            if prop is False:
+                self.country_without_postcode.add(ccode)
+            elif isinstance(prop, dict):
+                self.country_matcher[ccode] = CountryPostcodeMatcher(ccode, prop)
+            else:
+                raise UsageError(f"Invalid entry 'postcode' for country '{ccode}'")
+
+
+    def set_default_pattern(self, pattern):
+        """ Set the postcode match pattern to use, when a country does not
+            have a specific pattern or is marked as country without postcode.
+        """
+        self.default_matcher = CountryPostcodeMatcher('', {'pattern': pattern})
+
+
+    def match(self, country_code, postcode):
+        """ Match the given postcode against the postcode pattern for this
+            matcher. Returns a `re.Match` object if the country has a pattern
+            and the match was successful or None if the match failed.
+        """
+        if country_code in self.country_without_postcode:
+            return None
+
+        return self.country_matcher.get(country_code, self.default_matcher).match(postcode)
+
+
+    def normalize(self, country_code, match):
+        """ Return the default format of the postcode for the given match.
+            `match` must be a `re.Match` object previously returned by
+            `match()`
+        """
+        return self.country_matcher.get(country_code, self.default_matcher).normalize(match)
index 61c47c118805f4638cd5fc022d006564e1ff9b15..0dc551e1b4ce6e828720fad913502c185e49c327 100644 (file)
@@ -607,7 +607,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
     def _add_postcode(self, item):
         """ Make sure the normalized postcode is present in the word table.
         """
     def _add_postcode(self, item):
         """ Make sure the normalized postcode is present in the word table.
         """
-        analyzer = self.token_analysis.get_analyzer('@postcode')
+        analyzer = self.token_analysis.analysis.get('@postcode')
 
         if analyzer is None:
             postcode_name = item.name.strip().upper()
 
         if analyzer is None:
             postcode_name = item.name.strip().upper()
index d1edc60d1e5e8c109a446323a7832f3970e3c3a9..fbc46fa582215460f7f7e62f6fe63087b3019ab9 100644 (file)
@@ -16,70 +16,17 @@ Arguments:
                         When set to 'no', non-conforming postcodes are not
                         searchable either.
 """
                         When set to 'no', non-conforming postcodes are not
                         searchable either.
 """
-import re
-
-from nominatim.errors import UsageError
-from nominatim.tools import country_info
-
-class _PostcodeMatcher:
-    """ Matches and formats a postcode according to the format definition.
-    """
-    def __init__(self, country_code, config):
-        if 'pattern' not in config:
-            raise UsageError("Field 'pattern' required for 'postcode' "
-                             f"for country '{country_code}'")
-
-        pc_pattern = config['pattern'].replace('d', '[0-9]').replace('l', '[A-Z]')
-
-        self.norm_pattern = re.compile(f'\\s*(?:{country_code.upper()}[ -]?)?(.*)\\s*')
-        self.pattern = re.compile(pc_pattern)
-
-        self.output = config.get('output', r'\g<0>')
-
-
-    def match(self, postcode):
-        """ Match the given postcode against the postcode pattern for this
-            matcher. Returns a `re.Match` object if the match was successful
-            and None otherwise.
-        """
-        # Upper-case, strip spaces and leading country code.
-        normalized = self.norm_pattern.fullmatch(postcode.upper())
-
-        if normalized:
-            return self.pattern.fullmatch(normalized.group(1))
-
-        return None
-
-
-    def normalize(self, match):
-        """ Return the default format of the postcode for the given match.
-            `match` must be a `re.Match` object previously returned by
-            `match()`
-        """
-        return match.expand(self.output)
-
+from nominatim.data.postcode_format import PostcodeFormatter
 
 class _PostcodeSanitizer:
 
     def __init__(self, config):
         self.convert_to_address = config.get_bool('convert-to-address', True)
 
 class _PostcodeSanitizer:
 
     def __init__(self, config):
         self.convert_to_address = config.get_bool('convert-to-address', True)
-        # Objects without a country code can't have a postcode per definition.
-        self.country_without_postcode = {None}
-        self.country_matcher = {}
-
-        for ccode, prop in country_info.iterate('postcode'):
-            if prop is False:
-                self.country_without_postcode.add(ccode)
-            elif isinstance(prop, dict):
-                self.country_matcher[ccode] = _PostcodeMatcher(ccode, prop)
-            else:
-                raise UsageError(f"Invalid entry 'postcode' for country '{ccode}'")
+        self.matcher = PostcodeFormatter()
 
         default_pattern = config.get('default-pattern')
         if default_pattern is not None and isinstance(default_pattern, str):
 
         default_pattern = config.get('default-pattern')
         if default_pattern is not None and isinstance(default_pattern, str):
-            self.default_matcher = _PostcodeMatcher('', {'pattern': default_pattern})
-        else:
-            self.default_matcher = None
+            self.matcher.set_default_pattern(default_pattern)
 
 
     def __call__(self, obj):
 
 
     def __call__(self, obj):
@@ -106,18 +53,11 @@ class _PostcodeSanitizer:
             normalized version. Returns None if the postcode does not
             correspond to the oficial format of the given country.
         """
             normalized version. Returns None if the postcode does not
             correspond to the oficial format of the given country.
         """
-        if country in self.country_without_postcode:
-            return None
-
-        matcher = self.country_matcher.get(country, self.default_matcher)
-        if matcher is None:
-            return postcode.upper(), ''
-
-        match = matcher.match(postcode)
+        match = self.matcher.match(country, postcode)
         if match is None:
             return None
 
         if match is None:
             return None
 
-        return matcher.normalize(match), ' '.join(match.groups())
+        return self.matcher.normalize(country, match), ' '.join(match.groups())
 
 
 
 
 
 
index d85a5b65e565d83187b2688839afe72c1f175fcb..6138a03a42b49f4eb702e333e55396d5c73a22e8 100644 (file)
@@ -437,13 +437,6 @@ class TestPlaceAddress:
         assert word_table.get_postcodes() == {pcode, }
 
 
         assert word_table.get_postcodes() == {pcode, }
 
 
-    @pytest.mark.parametrize('pcode', ['12:23', 'ab;cd;f', '123;836'])
-    def test_process_place_bad_postcode(self, word_table, pcode):
-        self.process_address(postcode=pcode)
-
-        assert not word_table.get_postcodes()
-
-
     @pytest.mark.parametrize('hnr', ['123a', '1', '101'])
     def test_process_place_housenumbers_simple(self, hnr, getorcreate_hnr_id):
         info = self.process_address(housenumber=hnr)
     @pytest.mark.parametrize('hnr', ['123a', '1', '101'])
     def test_process_place_housenumbers_simple(self, hnr, getorcreate_hnr_id):
         info = self.process_address(housenumber=hnr)