git.openstreetmap.org Git - nominatim.git/commitdiff
add unit tests for new sanitizer functions
author Sarah Hoffmann <lonvia@denofr.de>
Fri, 1 Oct 2021 07:50:17 +0000 (09:50 +0200)
committer Sarah Hoffmann <lonvia@denofr.de>
Fri, 1 Oct 2021 10:27:24 +0000 (12:27 +0200)
nominatim/tokenizer/sanitizers/split_name_list.py
nominatim/tokenizer/sanitizers/strip_brace_terms.py
test/python/tokenizer/sanitizers/test_split_name_list.py [new file with mode: 0644]
test/python/tokenizer/sanitizers/test_strip_brace_terms.py [new file with mode: 0644]
test/python/tokenizer/test_place_sanitizer.py [new file with mode: 0644]

index 93651f3e8b3283d1095c58d9b357332f24b6d7e6..f151420396db9825a00067e81c507b0908ef835d 100644 (file)
@@ -3,13 +3,19 @@ Name processor that splits name values with multiple values into their component
 """
 import re
 
 """
 import re
 
+from nominatim.errors import UsageError
+
 def create(func):
     """ Create a name processing function that splits name values with
         multiple values into their components. The optional parameter
         'delimiters' can be used to define the characters that should be used
         for splitting. The default is ',;'.
     """
-    regexp = re.compile('[{}]'.format(func.get('delimiters', ',;')))
+    delimiter_set = set(func.get('delimiters', ',;'))
+    if not delimiter_set:
+        raise UsageError("Set of delimiters in split-name-list sanitizer is empty.")
+
+    regexp = re.compile('\\s*[{}]\\s*'.format(''.join('\\' + d for d in delimiter_set)))
 
     def _process(obj):
         if not obj.names:
@@ -18,10 +24,11 @@ def create(func):
         new_names = []
         for name in obj.names:
             split_names = regexp.split(name.name)
+            print(split_names)
             if len(split_names) == 1:
                 new_names.append(name)
             else:
-                new_names.extend(name.clone(name=n) for n in split_names)
+                new_names.extend(name.clone(name=n) for n in split_names if n)
 
         obj.names = new_names
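
For illustration only (a standalone sketch, not part of the commit), how the reworked pattern behaves: every configured delimiter is backslash-escaped, surrounding whitespace is consumed by the pattern itself, and the added 'if n' guard drops empty fragments.

import re

delimiters = set(',;')   # the sanitizer's default delimiter set
regexp = re.compile('\\s*[{}]\\s*'.format(''.join('\\' + d for d in delimiters)))

print(regexp.split('A; B ,C'))                 # ['A', 'B', 'C'] - whitespace around delimiters is stripped
print([n for n in regexp.split(' ;B') if n])   # ['B'] - empty fragments filtered, as in the new tests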
 
index 4423d3058ce1c20d6e001ee2979b875fa9e8a6c3..ec91bac926d2ae3938d9c46f44bb45e1124b0dcd 100644 (file)
@@ -10,13 +10,13 @@ def create(_):
     def _process(obj):
         """ Add variants for names that have a bracket extension.
         """
-        new_names = []
         if obj.names:
+            new_names = []
             for name in (n for n in obj.names if '(' in n.name):
                 new_name = name.name.split('(')[0].strip()
                 if new_name:
                     new_names.append(name.clone(name=new_name))
 
-        obj.names.extend(new_names)
+            obj.names.extend(new_names)
 
     return _process
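
For illustration only (a standalone sketch, not part of the commit), the brace handling in a nutshell: the original name is always kept, and the part before the opening bracket is added as an extra variant if anything non-empty remains.

name = 'Halle (Saale)'
variant = name.split('(')[0].strip()            # 'Halle'
names = [name] + ([variant] if variant else [])
print(names)                                    # ['Halle (Saale)', 'Halle']
print('(maybe)'.split('(')[0].strip() or None)  # None - a purely braced name gets no variant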
diff --git a/test/python/tokenizer/sanitizers/test_split_name_list.py b/test/python/tokenizer/sanitizers/test_split_name_list.py
new file mode 100644 (file)
index 0000000..ee74546
--- /dev/null
@@ -0,0 +1,65 @@
+"""
+Tests for the sanitizer that splits multivalue lists.
+"""
+import pytest
+
+from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
+from nominatim.indexer.place_info import PlaceInfo
+
+from nominatim.errors import UsageError
+
+def run_sanitizer_on(**kwargs):
+    place = PlaceInfo({'name': kwargs})
+    name, _ = PlaceSanitizer([{'step': 'split-name-list'}]).process_names(place)
+
+    return sorted([(p.name, p.kind, p.suffix) for p in name])
+
+
+def sanitize_with_delimiter(delimiter, name):
+    place = PlaceInfo({'name': {'name': name}})
+    san = PlaceSanitizer([{'step': 'split-name-list', 'delimiters': delimiter}])
+    name, _ = san.process_names(place)
+
+    return sorted([p.name for p in name])
+
+
+def test_simple():
+    assert run_sanitizer_on(name='ABC') == [('ABC', 'name', None)]
+    assert run_sanitizer_on(name='') == [('', 'name', None)]
+
+
+def test_splits():
+    assert run_sanitizer_on(name='A;B;C') == [('A', 'name', None),
+                                              ('B', 'name', None),
+                                              ('C', 'name', None)]
+    assert run_sanitizer_on(short_name=' House, boat ') == [('House', 'short_name', None),
+                                                            ('boat', 'short_name', None)]
+
+
+def test_empty_fields():
+    assert run_sanitizer_on(name='A;;B') == [('A', 'name', None),
+                                             ('B', 'name', None)]
+    assert run_sanitizer_on(name='A; ,B') == [('A', 'name', None),
+                                              ('B', 'name', None)]
+    assert run_sanitizer_on(name=' ;B') == [('B', 'name', None)]
+    assert run_sanitizer_on(name='B,') == [('B', 'name', None)]
+
+
+def test_custom_delimiters():
+    assert sanitize_with_delimiter(':', '12:45,3') == ['12', '45,3']
+    assert sanitize_with_delimiter('\\', 'a;\\b!#@ \\') == ['a;', 'b!#@']
+    assert sanitize_with_delimiter('[]', 'foo[to]be') == ['be', 'foo', 'to']
+    assert sanitize_with_delimiter(' ', 'morning  sun') == ['morning', 'sun']
+
+
+def test_empty_delimiter_set():
+    with pytest.raises(UsageError):
+        sanitize_with_delimiter('', 'abc')
+
+
+def test_no_name_list():
+    place = PlaceInfo({'address': {'housenumber': '3'}})
+    name, address = PlaceSanitizer([{'step': 'split-name-list'}]).process_names(place)
+
+    assert not name
+    assert len(address) == 1
diff --git a/test/python/tokenizer/sanitizers/test_strip_brace_terms.py b/test/python/tokenizer/sanitizers/test_strip_brace_terms.py
new file mode 100644 (file)
index 0000000..50af244
--- /dev/null
@@ -0,0 +1,44 @@
+"""
+Tests for the sanitizer that handles braced suffixes.
+"""
+import pytest
+
+from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
+from nominatim.indexer.place_info import PlaceInfo
+
+def run_sanitizer_on(**kwargs):
+    place = PlaceInfo({'name': kwargs})
+    name, _ = PlaceSanitizer([{'step': 'strip-brace-terms'}]).process_names(place)
+
+    return sorted([(p.name, p.kind, p.suffix) for p in name])
+
+
+def test_no_braces():
+    assert run_sanitizer_on(name='foo', ref='23') == [('23', 'ref', None),
+                                                      ('foo', 'name', None)]
+
+
+def test_simple_braces():
+    assert run_sanitizer_on(name='Halle (Saale)', ref='3')\
+      == [('3', 'ref', None), ('Halle', 'name', None), ('Halle (Saale)', 'name', None)]
+    assert run_sanitizer_on(name='ack ( bar')\
+      == [('ack', 'name', None), ('ack ( bar', 'name', None)]
+
+
+def test_only_braces():
+    assert run_sanitizer_on(name='(maybe)') == [('(maybe)', 'name', None)]
+
+
+def test_double_braces():
+    assert run_sanitizer_on(name='a((b))') == [('a', 'name', None),
+                                               ('a((b))', 'name', None)]
+    assert run_sanitizer_on(name='a (b) (c)') == [('a', 'name', None),
+                                                  ('a (b) (c)', 'name', None)]
+
+
+def test_no_names():
+    place = PlaceInfo({'address': {'housenumber': '3'}})
+    name, address = PlaceSanitizer([{'step': 'strip-brace-terms'}]).process_names(place)
+
+    assert not name
+    assert len(address) == 1
diff --git a/test/python/tokenizer/test_place_sanitizer.py b/test/python/tokenizer/test_place_sanitizer.py
new file mode 100644 (file)
index 0000000..389b068
--- /dev/null
@@ -0,0 +1,71 @@
+"""
+Tests for execution of the sanitization step.
+"""
+import pytest
+
+from nominatim.errors import UsageError
+import nominatim.tokenizer.place_sanitizer as sanitizer
+from nominatim.indexer.place_info import PlaceInfo
+
+
+def test_placeinfo_clone_new_name():
+    place = sanitizer.PlaceName('foo', 'ki', 'su')
+
+    newplace = place.clone(name='bar')
+
+    assert place.name == 'foo'
+    assert newplace.name == 'bar'
+    assert newplace.kind == 'ki'
+    assert newplace.suffix == 'su'
+
+
+def test_placeinfo_clone_merge_attr():
+    place = sanitizer.PlaceName('foo', 'ki', 'su')
+    place.set_attr('a1', 'v1')
+    place.set_attr('a2', 'v2')
+
+    newplace = place.clone(attr={'a2': 'new', 'b2': 'foo'})
+
+    assert place.get_attr('a2') == 'v2'
+    assert place.get_attr('b2') is None
+    assert newplace.get_attr('a1') == 'v1'
+    assert newplace.get_attr('a2') == 'new'
+    assert newplace.get_attr('b2') == 'foo'
+
+
+def test_placeinfo_has_attr():
+    place = sanitizer.PlaceName('foo', 'ki', 'su')
+    place.set_attr('a1', 'v1')
+
+    assert place.has_attr('a1')
+    assert not place.has_attr('whatever')
+
+
+def test_sanitizer_default():
+    san = sanitizer.PlaceSanitizer([{'step': 'split-name-list'}])
+
+    name, address =  san.process_names(PlaceInfo({'name': {'name:de:de': '1;2;3'},
+                                                  'address': {'street': 'Bald'}}))
+
+    assert len(name) == 3
+    assert all(isinstance(n, sanitizer.PlaceName) for n in name)
+    assert all(n.kind == 'name'  for n in name)
+    assert all(n.suffix == 'de:de'  for n in name)
+
+    assert len(address) == 1
+    assert all(isinstance(n, sanitizer.PlaceName) for n in address)
+
+
+@pytest.mark.parametrize('rules', [None, []])
+def test_sanitizer_empty_list(rules):
+    san = sanitizer.PlaceSanitizer(rules)
+
+    name, address =  san.process_names(PlaceInfo({'name': {'name:de:de': '1;2;3'}}))
+
+    assert len(name) == 1
+    assert all(isinstance(n, sanitizer.PlaceName) for n in name)
+
+
+def test_sanitizer_missing_step_definition():
+    with pytest.raises(UsageError):
+        san = sanitizer.PlaceSanitizer([{'id': 'split-name-list'}])
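
A usage sketch of the sanitization step as a whole, assuming only the PlaceSanitizer/PlaceInfo API exercised by the tests above; the expected results in the comments are assumptions, not output recorded in the commit.

from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
from nominatim.indexer.place_info import PlaceInfo

san = PlaceSanitizer([{'step': 'split-name-list'},
                      {'step': 'strip-brace-terms'}])

names, address = san.process_names(PlaceInfo({'name': {'name': 'Halle (Saale); Halle an der Saale'},
                                              'address': {'city': 'Halle'}}))

# split-name-list should first yield 'Halle (Saale)' and 'Halle an der Saale';
# strip-brace-terms should then add the variant 'Halle', giving three PlaceName objects.
print(sorted(n.name for n in names))
print(len(address))                            # the single address entry ('city') is passed through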