move postcode matcher into a separate file

author Sarah Hoffmann <lonvia@denofr.de>

Mon, 6 Jun 2022 21:37:04 +0000 (23:37 +0200)

committer Sarah Hoffmann <lonvia@denofr.de>

Thu, 23 Jun 2022 21:42:31 +0000 (23:42 +0200)
author Sarah Hoffmann <lonvia@denofr.de>
Mon, 6 Jun 2022 21:37:04 +0000 (23:37 +0200)
committer Sarah Hoffmann <lonvia@denofr.de>
Thu, 23 Jun 2022 21:42:31 +0000 (23:42 +0200)
diff --git a/nominatim/data/__init__.py b/nominatim/data/__init__.py

new file mode 100644 (file)

index 0000000..e69de29
diff --git a/nominatim/data/postcode_format.py b/nominatim/data/postcode_format.py

new file mode 100644 (file)

index 0000000..0158111
--- /dev/null
+++ b/nominatim/data/postcode_format.py
@@ -0,0 +1,97 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Functions for formatting postcodes according to their country-specific
+format.
+"""
+import re
+
+from nominatim.errors import UsageError
+from nominatim.tools import country_info
+
+class CountryPostcodeMatcher:
+    """ Matches and formats a postcode according to a format definition
+        of the given country.
+
+        The format definition is a dictionary with a mandatory field
+        'pattern' and an optional field 'output'. In the pattern every
+        character 'd' is replaced with the character class for an ASCII
+        digit and every 'l' with the class for an upper-case ASCII
+        letter; the result is compiled as a regular expression. The
+        output is a template for `re.Match.expand()`. When it is missing,
+        the complete matched text is used.
+    """
+    def __init__(self, country_code, config):
+        """ Compile the expressions for the country `country_code` from
+            the format definition `config`.
+
+            Raises a UsageError when `config` has no field 'pattern'.
+        """
+        if 'pattern' not in config:
+            raise UsageError("Field 'pattern' required for 'postcode' "
+                             f"for country '{country_code}'")
+
+        # The replacement is purely textual: it hits every 'd' and 'l' in
+        # the configured pattern, so neither can appear there as a literal.
+        pc_pattern = config['pattern'].replace('d', '[0-9]').replace('l', '[A-Z]')
+
+        # Accepts optional leading whitespace and an optional prefix made of
+        # the upper-cased country code plus at most one space or hyphen.
+        # Everything after that is captured in group 1. With an empty
+        # country code the prefix is just the optional space or hyphen.
+        # NOTE(review): '(.*)' is greedy, so trailing whitespace ends up in
+        # group 1 and the final '\s*' has nothing left to consume. Confirm
+        # whether trailing spaces were meant to be stripped here.
+        self.norm_pattern = re.compile(f'\\s*(?:{country_code.upper()}[ -]?)?(.*)\\s*')
+        self.pattern = re.compile(pc_pattern)
+
+        # Default template '\g<0>' expands to the complete matched text.
+        self.output = config.get('output', r'\g<0>')
+
+
+    def match(self, postcode):
+        """ Match the given postcode against the postcode pattern for this
+            matcher. Returns a `re.Match` object if the match was successful
+            and None otherwise.
+
+            The postcode is upper-cased and freed from its prefix first.
+            What remains must match the country pattern as a whole.
+        """
+        # Upper-case, then strip leading spaces and a leading country code.
+        normalized = self.norm_pattern.fullmatch(postcode.upper())
+
+        if normalized:
+            return self.pattern.fullmatch(normalized.group(1))
+
+        return None
+
+
+    def normalize(self, match):
+        """ Return the default format of the postcode for the given match.
+            `match` must be a `re.Match` object previously returned by
+            `match()`. The result is the output template of this matcher
+            expanded with the groups of the match.
+        """
+        return match.expand(self.output)
+
+
+class PostcodeFormatter:
+    """ Container for different postcode formats of the world and
+        access functions.
+
+        Countries are sorted into three groups: countries that are marked
+        as having no postcodes, countries with their own
+        CountryPostcodeMatcher and all remaining countries, which share
+        a default matcher.
+    """
+    def __init__(self):
+        """ Set up the matchers from the 'postcode' entries that
+            `country_info.iterate()` hands out.
+
+            Raises a UsageError when an entry is neither False nor a
+            dictionary or when a dictionary lacks the field 'pattern'.
+        """
+        # Objects without a country code can't have a postcode per definition.
+        self.country_without_postcode = {None}
+        self.country_matcher = {}
+        # Until set_default_pattern() is called, the catch-all pattern '.*'
+        # serves as the default.
+        self.default_matcher = CountryPostcodeMatcher('', {'pattern': '.*'})
+
+        for ccode, prop in country_info.iterate('postcode'):
+            if prop is False:
+                # An explicit False marks the country as having no postcodes.
+                self.country_without_postcode.add(ccode)
+            elif isinstance(prop, dict):
+                self.country_matcher[ccode] = CountryPostcodeMatcher(ccode, prop)
+            else:
+                raise UsageError(f"Invalid entry 'postcode' for country '{ccode}'")
+
+
+    def set_default_pattern(self, pattern):
+        """ Set the postcode match pattern to use, when a country does not
+            have a specific pattern. The pattern uses the same syntax as
+            the 'pattern' field of the country-specific definitions.
+
+            Countries marked as country without postcode are not affected:
+            match() rejects them before any matcher is consulted.
+        """
+        self.default_matcher = CountryPostcodeMatcher('', {'pattern': pattern})
+
+
+    def match(self, country_code, postcode):
+        """ Match the given postcode against the postcode pattern for the
+            given country. Countries without a pattern of their own are
+            matched against the default pattern. Returns a `re.Match`
+            object if the match was successful. Returns None if the match
+            failed or if the country is marked as country without postcode
+            (which includes a `country_code` of None).
+        """
+        if country_code in self.country_without_postcode:
+            return None
+
+        return self.country_matcher.get(country_code, self.default_matcher).match(postcode)
+
+
+    def normalize(self, country_code, match):
+        """ Return the default format of the postcode for the given match.
+            `match` must be a `re.Match` object previously returned by
+            `match()`. It should come from a call with the same
+            `country_code`, because the output template is looked up by
+            country again and falls back to the default matcher.
+        """
+        return self.country_matcher.get(country_code, self.default_matcher).normalize(match)
diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py

index 61c47c118805f4638cd5fc022d006564e1ff9b15..0dc551e1b4ce6e828720fad913502c185e49c327 100644 (file)
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -607,7 +607,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
      def _add_postcode(self, item):
          """ Make sure the normalized postcode is present in the word table.
          """
-        analyzer = self.token_analysis.get_analyzer('@postcode')
+        analyzer = self.token_analysis.analysis.get('@postcode')
  
          if analyzer is None:
              postcode_name = item.name.strip().upper()
diff --git a/nominatim/tokenizer/sanitizers/clean_postcodes.py b/nominatim/tokenizer/sanitizers/clean_postcodes.py

index d1edc60d1e5e8c109a446323a7832f3970e3c3a9..fbc46fa582215460f7f7e62f6fe63087b3019ab9 100644 (file)
--- a/nominatim/tokenizer/sanitizers/clean_postcodes.py
+++ b/nominatim/tokenizer/sanitizers/clean_postcodes.py
@@ -16,70 +16,17 @@ Arguments:
                          When set to 'no', non-conforming postcodes are not
                          searchable either.
  """
-import re
-
-from nominatim.errors import UsageError
-from nominatim.tools import country_info
-
-class _PostcodeMatcher:
-    """ Matches and formats a postcode according to the format definition.
-    """
-    def __init__(self, country_code, config):
-        if 'pattern' not in config:
-            raise UsageError("Field 'pattern' required for 'postcode' "
-                             f"for country '{country_code}'")
-
-        pc_pattern = config['pattern'].replace('d', '[0-9]').replace('l', '[A-Z]')
-
-        self.norm_pattern = re.compile(f'\\s*(?:{country_code.upper()}[ -]?)?(.*)\\s*')
-        self.pattern = re.compile(pc_pattern)
-
-        self.output = config.get('output', r'\g<0>')
-
-
-    def match(self, postcode):
-        """ Match the given postcode against the postcode pattern for this
-            matcher. Returns a `re.Match` object if the match was successful
-            and None otherwise.
-        """
-        # Upper-case, strip spaces and leading country code.
-        normalized = self.norm_pattern.fullmatch(postcode.upper())
-
-        if normalized:
-            return self.pattern.fullmatch(normalized.group(1))
-
-        return None
-
-
-    def normalize(self, match):
-        """ Return the default format of the postcode for the given match.
-            `match` must be a `re.Match` object previously returned by
-            `match()`
-        """
-        return match.expand(self.output)
-
+from nominatim.data.postcode_format import PostcodeFormatter
  
  class _PostcodeSanitizer:
  
      def __init__(self, config):
          self.convert_to_address = config.get_bool('convert-to-address', True)
-        # Objects without a country code can't have a postcode per definition.
-        self.country_without_postcode = {None}
-        self.country_matcher = {}
-
-        for ccode, prop in country_info.iterate('postcode'):
-            if prop is False:
-                self.country_without_postcode.add(ccode)
-            elif isinstance(prop, dict):
-                self.country_matcher[ccode] = _PostcodeMatcher(ccode, prop)
-            else:
-                raise UsageError(f"Invalid entry 'postcode' for country '{ccode}'")
+        self.matcher = PostcodeFormatter()
  
          default_pattern = config.get('default-pattern')
          if default_pattern is not None and isinstance(default_pattern, str):
-            self.default_matcher = _PostcodeMatcher('', {'pattern': default_pattern})
-        else:
-            self.default_matcher = None
+            self.matcher.set_default_pattern(default_pattern)
  
  
      def __call__(self, obj):
@@ -106,18 +53,11 @@ class _PostcodeSanitizer:
              normalized version. Returns None if the postcode does not
              correspond to the oficial format of the given country.
          """
-        if country in self.country_without_postcode:
-            return None
-
-        matcher = self.country_matcher.get(country, self.default_matcher)
-        if matcher is None:
-            return postcode.upper(), ''
-
-        match = matcher.match(postcode)
+        match = self.matcher.match(country, postcode)
          if match is None:
              return None
  
-        return matcher.normalize(match), ' '.join(match.groups())
+        return self.matcher.normalize(country, match), ' '.join(match.groups())
  
  
  
diff --git a/test/python/tokenizer/test_icu.py b/test/python/tokenizer/test_icu.py

index d85a5b65e565d83187b2688839afe72c1f175fcb..6138a03a42b49f4eb702e333e55396d5c73a22e8 100644 (file)
--- a/test/python/tokenizer/test_icu.py
+++ b/test/python/tokenizer/test_icu.py
@@ -437,13 +437,6 @@ class TestPlaceAddress:
          assert word_table.get_postcodes() == {pcode, }
  
  
-    @pytest.mark.parametrize('pcode', ['12:23', 'ab;cd;f', '123;836'])
-    def test_process_place_bad_postcode(self, word_table, pcode):
-        self.process_address(postcode=pcode)
-
-        assert not word_table.get_postcodes()
-
-
      @pytest.mark.parametrize('hnr', ['123a', '1', '101'])
      def test_process_place_housenumbers_simple(self, hnr, getorcreate_hnr_id):
          info = self.process_address(housenumber=hnr)
author	Sarah Hoffmann <lonvia@denofr.de>
	Mon, 6 Jun 2022 21:37:04 +0000 (23:37 +0200)
committer	Sarah Hoffmann <lonvia@denofr.de>
	Thu, 23 Jun 2022 21:42:31 +0000 (23:42 +0200)
nominatim/data/__init__.py	[new file with mode: 0644]	patch \| blob
nominatim/data/postcode_format.py	[new file with mode: 0644]	patch \| blob
nominatim/tokenizer/icu_tokenizer.py		patch \| blob \| history
nominatim/tokenizer/sanitizers/clean_postcodes.py		patch \| blob \| history
test/python/tokenizer/test_icu.py		patch \| blob \| history