generalize filter for sanitizers

author biswajit-k <biswajitkaushik02@gmail.com>

Sat, 11 Mar 2023 08:22:07 +0000 (13:52 +0530)

committer biswajit-k <biswajitkaushik02@gmail.com>

Sat, 1 Apr 2023 13:54:09 +0000 (19:24 +0530)
author biswajit-k <biswajitkaushik02@gmail.com>
Sat, 11 Mar 2023 08:22:07 +0000 (13:52 +0530)
committer biswajit-k <biswajitkaushik02@gmail.com>
Sat, 1 Apr 2023 13:54:09 +0000 (19:24 +0530)
diff --git a/nominatim/tokenizer/sanitizers/clean_housenumbers.py b/nominatim/tokenizer/sanitizers/clean_housenumbers.py

index 417d68d2025777b944d1944371dea3d9268c0616..ae6349a9c846cba2d9fa403471fe0c0f5b335e17 100644 (file)
--- a/nominatim/tokenizer/sanitizers/clean_housenumbers.py
+++ b/nominatim/tokenizer/sanitizers/clean_housenumbers.py
@@ -25,7 +25,6 @@ Arguments:
                       expression that must match the full house number value.
  """
  from typing import Callable, Iterator, List
-import re
  
  from nominatim.tokenizer.sanitizers.base import ProcessInfo
  from nominatim.data.place_name import PlaceName
@@ -34,12 +33,10 @@ from nominatim.tokenizer.sanitizers.config import SanitizerConfig
  class _HousenumberSanitizer:
  
      def __init__(self, config: SanitizerConfig) -> None:
-        self.filter_kind = config.get_filter_kind('housenumber')
+        self.filter_kind = config.get_filter('filter-kind', ['housenumber'])
          self.split_regexp = config.get_delimiter()
  
-        nameregexps = config.get_string_list('convert-to-name', [])
-        self.is_name_regexp = [re.compile(r) for r in nameregexps]
-
+        self.filter_name = config.get_filter('convert-to-name', 'FAIL_ALL')
  
  
      def __call__(self, obj: ProcessInfo) -> None:
@@ -49,7 +46,7 @@ class _HousenumberSanitizer:
          new_address: List[PlaceName] = []
          for item in obj.address:
              if self.filter_kind(item.kind):
-                if self._treat_as_name(item.name):
+                if self.filter_name(item.name):
                      obj.names.append(item.clone(kind='housenumber'))
                  else:
                      new_address.extend(item.clone(kind='housenumber', name=n)
@@ -76,10 +73,6 @@ class _HousenumberSanitizer:
          yield hnr
  
  
-    def _treat_as_name(self, housenumber: str) -> bool:
-        return any(r.fullmatch(housenumber) is not None for r in self.is_name_regexp)
-
-
  def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
      """ Create a housenumber processing function.
      """
diff --git a/nominatim/tokenizer/sanitizers/config.py b/nominatim/tokenizer/sanitizers/config.py

index 8b9164c6b81497863e1cffbc4878ad59c940d6c8..9b4f763ac81780508efbef1c9a66df14fd013cd0 100644 (file)
--- a/nominatim/tokenizer/sanitizers/config.py
+++ b/nominatim/tokenizer/sanitizers/config.py
@@ -7,7 +7,7 @@
  """
  Configuration for Sanitizers.
  """
-from typing import Sequence, Optional, Pattern, Callable, Any, TYPE_CHECKING
+from typing import Sequence, Union, Optional, Pattern, Callable, Any, TYPE_CHECKING
  from collections import UserDict
  import re
  
@@ -33,7 +33,11 @@ class SanitizerConfig(_BaseUserDict):
  
              Arguments:
                  param: Name of the configuration parameter.
-                default: Value to return, when the parameter is missing.
+                default: Takes a tuple or list of strings which will
+                         be returned if the parameter is missing in the
+                         sanitizer configuration.
+                         Note that if this default parameter is not
+                         provided then an empty list is returned.
  
              Returns:
                  If the parameter value is a simple string, it is returned as a
@@ -44,7 +48,7 @@ class SanitizerConfig(_BaseUserDict):
          values = self.data.get(param, None)
  
          if values is None:
-            return None if default is None else list(default)
+            return list(default)
  
          if isinstance(values, str):
              return [values] if values else []
@@ -74,7 +78,7 @@ class SanitizerConfig(_BaseUserDict):
          value = self.data.get(param, default)
  
          if not isinstance(value, bool):
-            raise UsageError(f"Parameter '{param}' must be a boolean value ('yes' or 'no'.")
+            raise UsageError(f"Parameter '{param}' must be a boolean value ('yes' or 'no').")
  
          return value
  
@@ -102,30 +106,46 @@ class SanitizerConfig(_BaseUserDict):
          return re.compile('\\s*[{}]+\\s*'.format(''.join('\\' + d for d in delimiter_set)))
  
  
-    def get_filter_kind(self, *default: str) -> Callable[[str], bool]:
-        """ Return a filter function for the name kind from the 'filter-kind'
-            config parameter.
+    def get_filter(self, param: str, default: Union[str, Sequence[str]] = 'PASS_ALL'
+                   ) -> Callable[[str], bool]:
+        """ Returns a filter function for the given parameter of the sanitizer
+            configuration.
  
-            If the 'filter-kind' parameter is empty, the filter lets all items
-            pass. If the parameter is a string, it is interpreted as a single
-            regular expression that must match the full kind string.
-            If the parameter is a list then
-            any of the regular expressions in the list must match to pass.
+            The value provided for the parameter in sanitizer configuration
+            should be a string or list of strings, where each string is a regular
+            expression. These regular expressions will later be used by the
+            filter function to filter strings.
  
              Arguments:
-                default: Filters to be used, when the 'filter-kind' parameter
-                         is not specified. If omitted then the default is to
-                         let all names pass.
+                param: The parameter for which the filter function
+                       will be created.
+                default: Defines the behaviour of the filter function if the
+                         parameter is missing in the sanitizer configuration.
+                         Takes a string ('PASS_ALL' or 'FAIL_ALL') or a list of strings.
+                         Any other string value or an empty list is not allowed
+                         and will raise a ValueError. If the value is PASS_ALL, the filter
+                         function lets all strings pass; if the value is FAIL_ALL,
+                         the filter function lets no strings pass.
+                         If the value provided is a list of strings, each string
+                         is treated as a regular expression. In this case these regular
+                         expressions will be used by the filter function.
+                         By default, the filter function lets all strings pass.
  
              Returns:
-                A filter function which takes a name string and returns
-                True when the item passes the filter.
+                A filter function that takes a target string as its argument and
+                returns True if it fully matches any of the regular expressions,
+                otherwise False.
          """
-        filters = self.get_string_list('filter-kind', default)
+        filters = self.get_string_list(param) or default
  
-        if not filters:
+        if filters == 'PASS_ALL':
              return lambda _: True
+        if filters == 'FAIL_ALL':
+            return lambda _: False
  
-        regexes = [re.compile(regex) for regex in filters]
+        if filters and isinstance(filters, (list, tuple)):
+            regexes = [re.compile(regex) for regex in filters]
+            return lambda target: any(regex.fullmatch(target) for regex in regexes)
  
-        return lambda name: any(regex.fullmatch(name) for regex in regexes)
+        raise ValueError("Default parameter must be a non-empty list or a string value \
+                          ('PASS_ALL' or 'FAIL_ALL').")
diff --git a/nominatim/tokenizer/sanitizers/delete_tags.py b/nominatim/tokenizer/sanitizers/delete_tags.py

index fd35de488c617f0624271323fe525e23f5ff8b2f..f18a894b07e480fdab61251dba246a4f65c925ad 100644 (file)
--- a/nominatim/tokenizer/sanitizers/delete_tags.py
+++ b/nominatim/tokenizer/sanitizers/delete_tags.py
@@ -54,8 +54,7 @@ Arguments:
  \r
  \r
  """\r
-from typing import Callable, List, Optional, Pattern, Tuple, Sequence\r
-import re\r
+from typing import Callable, List, Tuple, Sequence\r
  \r
  from nominatim.tokenizer.sanitizers.base import ProcessInfo\r
  from nominatim.data.place_name import PlaceName\r
@@ -65,37 +64,33 @@ class _TagSanitizer:
  \r
      def __init__(self, config: SanitizerConfig) -> None:\r
          self.type = config.get('type', 'name')\r
-        self.filter_kind = config.get_filter_kind()\r
+        self.filter_kind = config.get_filter('filter-kind')\r
          self.country_codes = config.get_string_list('country_code', [])\r
-        self.allowed_ranks = self._set_allowed_ranks( \\r
-                                            config.get_string_list('rank_address', ['0-30']))\r
+        self.filter_suffix = config.get_filter('suffix')\r
+        self.filter_name = config.get_filter('name')\r
+        self.allowed_ranks = self._set_allowed_ranks(\r
+            config.get_string_list("rank_address", ["0-30"])\r
+        )\r
  \r
          self.has_country_code = config.get('country_code', None) is not None\r
  \r
-        suffixregexps = config.get_string_list('suffix', [r'[\s\S]*'])\r
-        self.suffix_regexp = [re.compile(r) for r in suffixregexps]\r
-\r
-        nameregexps = config.get_string_list('name', [r'[\s\S]*'])\r
-        self.name_regexp = [re.compile(r) for r in nameregexps]\r
-\r
-\r
  \r
      def __call__(self, obj: ProcessInfo) -> None:\r
          tags = obj.names if self.type == 'name' else obj.address\r
  \r
-        if (not tags or\r
-             self.has_country_code and\r
-              obj.place.country_code not in self.country_codes or\r
-               not self.allowed_ranks[obj.place.rank_address]):\r
+        if not tags \\r
+           or not self.allowed_ranks[obj.place.rank_address] \\r
+           or self.has_country_code \\r
+           and obj.place.country_code not in self.country_codes:\r
              return\r
  \r
          filtered_tags: List[PlaceName] = []\r
  \r
          for tag in tags:\r
  \r
-            if (not self.filter_kind(tag.kind) or\r
-                  not self._matches(tag.suffix, self.suffix_regexp) or\r
-                    not self._matches(tag.name, self.name_regexp)):\r
+            if not self.filter_kind(tag.kind) \\r
+               or not self.filter_suffix(tag.suffix or '') \\r
+               or not self.filter_name(tag.name):\r
                  filtered_tags.append(tag)\r
  \r
  \r
@@ -117,7 +112,7 @@ class _TagSanitizer:
          for rank in ranks:\r
              intvl = [int(x) for x in rank.split('-')]\r
  \r
-            start, end = (intvl[0], intvl[0]) if len(intvl) == 1 else (intvl[0], intvl[1])\r
+            start, end = intvl[0], intvl[0] if len(intvl) == 1 else intvl[1]\r
  \r
              for i in range(start, end + 1):\r
                  allowed_ranks[i] = True\r
@@ -126,17 +121,6 @@ class _TagSanitizer:
          return tuple(allowed_ranks)\r
  \r
  \r
-    def _matches(self, value: Optional[str], patterns: List[Pattern[str]]) -> bool:\r
-        """ Returns True if the given value fully matches any of the regular\r
-            expression pattern in the list. Otherwise, returns False.\r
-\r
-            Note that if the value is None, it is taken as an empty string.\r
-        """\r
-        target = '' if value is None else value\r
-        return any(r.fullmatch(target) is not None for r in patterns)\r
-\r
-\r
-\r
  def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:\r
      """ Create a function to process removal of certain tags.\r
      """\r
diff --git a/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py b/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py

index 032b69a8cb6c07dd67e673351e6810bbd334cbf9..f3bc26248bffc94d2e4916852633a9e39293f442 100644 (file)
--- a/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py
+++ b/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py
@@ -41,7 +41,7 @@ class _AnalyzerByLanguage:
      """
  
      def __init__(self, config: SanitizerConfig) -> None:
-        self.filter_kind = config.get_filter_kind()
+        self.filter_kind = config.get_filter('filter-kind')
          self.replace = config.get('mode', 'replace') != 'append'
          self.whitelist = config.get('whitelist')
  
diff --git a/test/python/tokenizer/sanitizers/test_delete_tags.py b/test/python/tokenizer/sanitizers/test_delete_tags.py

index f9ccc2f619a061c9f207ad41b207d4bde46e202f..77366c326066ebe62c81d4029f313e3738cd10b3 100644 (file)
--- a/test/python/tokenizer/sanitizers/test_delete_tags.py
+++ b/test/python/tokenizer/sanitizers/test_delete_tags.py
@@ -302,7 +302,7 @@ class TestAllParameters:
  \r
      def test_list_arguments_pass(self):\r
          res = self.run_sanitizer_on(['de', 'in'], ['20-28', '30'], [r'abc.*', r'[\s\S]*'],\r
-                                    name='foo', ref_abc='foo', name_abcxx='bar', ref_pqr='baz')\r
+                                    name='foo', ref='foo', name_abcxx='bar', ref_pqr='baz')\r
  \r
          assert res == []\r
  \r
@@ -315,7 +315,7 @@ class TestAllParameters:
  \r
      def test_mix_arguments_pass(self):\r
          res = self.run_sanitizer_on('de', ['10', '20-28', '30'], r'[\s\S]*',\r
-                                    name='foo', ref_abc='foo', name_abcxx='bar', ref_pqr='baz')\r
+                                    name_abc='foo', ref_abc='foo', name_abcxx='bar', ref_pqr='baz')\r
  \r
          assert res == []\r
  \r
diff --git a/test/python/tokenizer/sanitizers/test_sanitizer_config.py b/test/python/tokenizer/sanitizers/test_sanitizer_config.py

index 0dbbc7a0dfdd7726ec8f1850c8efde23da093e94..d8514b4aa553f4ebd52b510fc6f416b8c9be7710 100644 (file)
--- a/test/python/tokenizer/sanitizers/test_sanitizer_config.py
+++ b/test/python/tokenizer/sanitizers/test_sanitizer_config.py
@@ -10,17 +10,12 @@ Tests for sanitizer configuration helper functions.
  import pytest
  
  from nominatim.errors import UsageError
-from nominatim.tokenizer.place_sanitizer import PlaceName
  from nominatim.tokenizer.sanitizers.config import SanitizerConfig
  
  def test_string_list_default_empty():
      assert SanitizerConfig().get_string_list('op') == []
  
  
-def test_string_list_default_none():
-    assert SanitizerConfig().get_string_list('op', default=None) is None
-
-
  def test_string_list_default_something():
      assert SanitizerConfig().get_string_list('op', default=['a', 'b']) == ['a', 'b']
  
@@ -78,36 +73,77 @@ def test_create_split_regex_empty_delimiter():
          regex = SanitizerConfig({'delimiters': ''}).get_delimiter()
  
  
-@pytest.mark.parametrize('inp', ('name', 'name:de', 'na\\me', '.*'))
-def test_create_kind_filter_no_params(inp):
-    filt = SanitizerConfig().get_filter_kind()
+@pytest.mark.parametrize('inp', ('name', 'name:de', 'na\\me', '.*', ''))
+def test_create_name_filter_no_param_no_default(inp):
+    filt = SanitizerConfig({'filter-kind': 'place'}).get_filter('name')
+
+    assert filt(inp)
+
+
+@pytest.mark.parametrize('inp', ('name', 'name:de', 'na\\me', '.*', ''))
+def test_create_name_filter_no_param_default_pass_all(inp):
+    filt = SanitizerConfig().get_filter('name', 'PASS_ALL')
  
      assert filt(inp)
  
  
+@pytest.mark.parametrize('inp', ('name', 'name:de', 'na\\me', '.*', ''))
+def test_create_name_filter_no_param_default_fail_all(inp):
+    filt = SanitizerConfig().get_filter('name', 'FAIL_ALL')
+
+    assert not filt(inp)
+
+
+def test_create_name_filter_no_param_default_invalid_string():
+    with pytest.raises(ValueError):
+        filt = SanitizerConfig().get_filter('name', 'abc')
+
+
+def test_create_name_filter_no_param_default_empty_list():
+    with pytest.raises(ValueError):
+        filt = SanitizerConfig().get_filter('name', [])
+
+
  @pytest.mark.parametrize('kind', ('de', 'name:de', 'ende'))
+def test_create_kind_filter_default_positive(kind):
+    filt = SanitizerConfig().get_filter('filter-kind', ['.*de'])
+
+    assert filt(kind)
+
+
+@pytest.mark.parametrize('kind', ('de', 'name:de', 'ende'))
+def test_create_kind_filter_default_negetive(kind):
+    filt = SanitizerConfig().get_filter('filter-kind', ['.*fr'])
+
+    assert not filt(kind)
+
+
+@pytest.mark.parametrize('kind', ('lang', 'lang:de', 'langxx'))
  def test_create_kind_filter_custom_regex_positive(kind):
-    filt = SanitizerConfig({'filter-kind': '.*de'}).get_filter_kind()
+    filt = SanitizerConfig({'filter-kind': 'lang.*'}
+    ).get_filter('filter-kind', ['.*fr'])
  
      assert filt(kind)
  
  
  @pytest.mark.parametrize('kind', ('de ', '123', '', 'bedece'))
  def test_create_kind_filter_custom_regex_negative(kind):
-    filt = SanitizerConfig({'filter-kind': '.*de'}).get_filter_kind()
+    filt = SanitizerConfig({'filter-kind': '.*de'}).get_filter('filter-kind')
  
      assert not filt(kind)
  
  
  @pytest.mark.parametrize('kind', ('name', 'fr', 'name:fr', 'frfr', '34'))
  def test_create_kind_filter_many_positive(kind):
-    filt = SanitizerConfig({'filter-kind': ['.*fr', 'name', r'\d+']}).get_filter_kind()
+    filt = SanitizerConfig({'filter-kind': ['.*fr', 'name', r'\d+']}
+    ).get_filter('filter-kind')
  
      assert filt(kind)
  
  
  @pytest.mark.parametrize('kind', ('name:de', 'fridge', 'a34', '.*', '\\'))
  def test_create_kind_filter_many_negative(kind):
-    filt = SanitizerConfig({'filter-kind': ['.*fr', 'name', r'\d+']}).get_filter_kind()
+    filt = SanitizerConfig({'filter-kind': ['.*fr', 'name', r'\d+']}
+    ).get_filter('filter-kind')
  
      assert not filt(kind)
author	biswajit-k <biswajitkaushik02@gmail.com>
	Sat, 11 Mar 2023 08:22:07 +0000 (13:52 +0530)
committer	biswajit-k <biswajitkaushik02@gmail.com>
	Sat, 1 Apr 2023 13:54:09 +0000 (19:24 +0530)
nominatim/tokenizer/sanitizers/clean_housenumbers.py		patch \| blob \| history
nominatim/tokenizer/sanitizers/config.py		patch \| blob \| history
nominatim/tokenizer/sanitizers/delete_tags.py		patch \| blob \| history
nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py		patch \| blob \| history
test/python/tokenizer/sanitizers/test_delete_tags.py		patch \| blob \| history
test/python/tokenizer/sanitizers/test_sanitizer_config.py		patch \| blob \| history