From 8171fe4571a57bf8e5b2a8f676989e973897e2e7 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Thu, 30 Sep 2021 21:30:13 +0200
Subject: [PATCH] introduce sanitizer step before token analysis

Sanitizer functions allow transforming name and address tags before they
are handed to the tokenizer. These transformations are visible only to the
tokenizer and thus only influence the search terms and address match terms
for a place.

Currently two sanitizers are implemented: one splits names with multiple
values into their components and one removes bracket additions. Both were
previously hard-coded in the tokenizer.
---
 nominatim/tokenizer/icu_rule_loader.py        |  10 ++
 nominatim/tokenizer/icu_tokenizer.py          | 110 ++++++++-------
 nominatim/tokenizer/place_sanitizer.py        | 127 ++++++++++++++++++
 nominatim/tokenizer/sanitizers/__init__.py    |   0
 .../tokenizer/sanitizers/split_name_list.py   |  28 ++++
 .../tokenizer/sanitizers/strip_brace_terms.py |  22 +++
 settings/icu_tokenizer.yaml                   |   3 +
 test/python/test_tokenizer_icu.py             |  17 +--
 8 files changed, 259 insertions(+), 58 deletions(-)
 create mode 100644 nominatim/tokenizer/place_sanitizer.py
 create mode 100644 nominatim/tokenizer/sanitizers/__init__.py
 create mode 100644 nominatim/tokenizer/sanitizers/split_name_list.py
 create mode 100644 nominatim/tokenizer/sanitizers/strip_brace_terms.py

diff --git a/nominatim/tokenizer/icu_rule_loader.py b/nominatim/tokenizer/icu_rule_loader.py
index bd0739f2..330179bb 100644
--- a/nominatim/tokenizer/icu_rule_loader.py
+++ b/nominatim/tokenizer/icu_rule_loader.py
@@ -12,6 +12,7 @@ from icu import Transliterator
 from nominatim.db.properties import set_property, get_property
 from nominatim.errors import UsageError
 from nominatim.tokenizer.icu_name_processor import ICUNameProcessor
+from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
 import nominatim.tokenizer.icu_variants as variants
 
 LOG = logging.getLogger()
@@ -65,6 +66,9 @@ class ICURuleLoader:
         self.analysis_rules = self._get_section(rules, 'variants')
         self._parse_variant_list()
 
+        # Load optional sanitizer rule set.
+        self.sanitizer_rules = rules.get('sanitizers', [])
+
 
     def load_config_from_db(self, conn):
         """ Get previously saved parts of the configuration from the
@@ -85,6 +89,12 @@ class ICURuleLoader:
         set_property(conn, DBCFG_IMPORT_ANALYSIS_RULES,
                      json.dumps(self.analysis_rules))
 
+    def make_sanitizer(self):
+        """ Create a place sanitizer from the configured rules.
+        """
+        return PlaceSanitizer(self.sanitizer_rules)
+
+
     def make_token_analysis(self):
         """ Create a token analyser from the previously loaded rules.
         """
diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py
index 87906d71..2ece10f2 100644
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -13,6 +13,7 @@ from nominatim.db.connection import connect
 from nominatim.db.properties import set_property, get_property
 from nominatim.db.utils import CopyBuffer
 from nominatim.db.sql_preprocessor import SQLPreprocessor
+from nominatim.indexer.place_info import PlaceInfo
 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
 
@@ -107,7 +108,8 @@ class LegacyICUTokenizer(AbstractTokenizer):
 
             Analyzers are not thread-safe. You need to instantiate one per thread.
         """
-        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_token_analysis())
+        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
+                                     self.loader.make_token_analysis())
 
 
     def _install_php(self, phpdir):
@@ -187,10 +189,11 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         normalization.
     """
 
-    def __init__(self, dsn, name_proc):
+    def __init__(self, dsn, sanitizer, token_analysis):
         self.conn = connect(dsn).connection
         self.conn.autocommit = True
-        self.name_processor = name_proc
+        self.sanitizer = sanitizer
+        self.token_analysis = token_analysis
 
         self._cache = _TokenCache()
 
@@ -203,6 +206,19 @@
             self.conn = None
 
 
+    def _search_normalized(self, name):
+        """ Return the search token transliteration of the given name.
+        """
+        return self.token_analysis.get_search_normalized(name)
+
+
+    def _normalized(self, name):
+        """ Return the normalized version of the given name with all
+            non-relevant information removed.
+        """
+        return self.token_analysis.get_normalized(name)
+
+
     def get_word_token_info(self, words):
         """ Return token information for the given list of words.
 
            If a word starts with # it is assumed to be a full name
@@ -218,9 +234,9 @@
         partial_tokens = {}
         for word in words:
             if word.startswith('#'):
-                full_tokens[word] = self.name_processor.get_search_normalized(word[1:])
+                full_tokens[word] = self._search_normalized(word[1:])
             else:
-                partial_tokens[word] = self.name_processor.get_search_normalized(word)
+                partial_tokens[word] = self._search_normalized(word)
 
         with self.conn.cursor() as cur:
             cur.execute("""SELECT word_token, word_id
@@ -251,7 +267,7 @@
 
             This function takes minor shortcuts on transliteration.
         """
-        return self.name_processor.get_search_normalized(hnr)
+        return self._search_normalized(hnr)
 
     def update_postcodes_from_db(self):
         """ Update postcode tokens in the word table from the location_postcode
@@ -274,7 +290,7 @@
                 if postcode is None:
                     to_delete.append(word)
                 else:
-                    copystr.add(self.name_processor.get_search_normalized(postcode),
+                    copystr.add(self._search_normalized(postcode),
                                 'P', postcode)
 
             if to_delete:
@@ -292,7 +308,7 @@
             completely replaced. Otherwise the phrases are added to the
             already existing ones.
         """
-        norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
+        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                             for p in phrases))
 
         with self.conn.cursor() as cur:
@@ -322,7 +338,7 @@
         added = 0
         with CopyBuffer() as copystr:
             for word, cls, typ, oper in to_add:
-                term = self.name_processor.get_search_normalized(word)
+                term = self._search_normalized(word)
                 if term:
                     copystr.add(term, 'S', word,
                                 json.dumps({'class': cls, 'type': typ,
@@ -356,9 +372,21 @@
     def add_country_names(self, country_code, names):
         """ Add names for the given country to the search index.
         """
+        # Make sure any name preprocessing for country names applies.
+        info = PlaceInfo({'name': names, 'country_code': country_code,
+                          'rank_address': 4, 'class': 'boundary',
+                          'type': 'administrative'})
+        self._add_country_full_names(country_code,
+                                     self.sanitizer.process_names(info)[0])
+
+
+    def _add_country_full_names(self, country_code, names):
+        """ Add names for the given country from an already sanitized
+            name list.
+        """
         word_tokens = set()
-        for name in self._compute_full_names(names):
-            norm_name = self.name_processor.get_search_normalized(name)
+        for name in names:
+            norm_name = self._search_normalized(name.name)
             if norm_name:
                 word_tokens.add(norm_name)
 
@@ -384,12 +412,12 @@
     def process_place(self, place):
         """ Determine tokenizer information about the given place.
 
-            Returns a JSON-serialisable structure that will be handed into
+            Returns a JSON-serializable structure that will be handed into
             the database via the token_info field.
         """
         token_info = _TokenInfo(self._cache)
 
-        names = place.name
+        names, address = self.sanitizer.process_names(place)
 
         if names:
             fulls, partials = self._compute_name_tokens(names)
@@ -397,9 +425,8 @@
             token_info.add_names(fulls, partials)
 
             if place.is_country():
-                self.add_country_names(place.country_code, names)
+                self._add_country_full_names(place.country_code, names)
 
-        address = place.address
         if address:
             self._process_place_address(token_info, address)
 
@@ -409,18 +436,18 @@
     def _process_place_address(self, token_info, address):
         hnrs = []
         addr_terms = []
-        for key, value in address.items():
-            if key == 'postcode':
-                self._add_postcode(value)
-            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
-                hnrs.append(value)
-            elif key == 'street':
-                token_info.add_street(self._compute_partial_tokens(value))
-            elif key == 'place':
-                token_info.add_place(self._compute_partial_tokens(value))
-            elif not key.startswith('_') and \
-                 key not in ('country', 'full'):
-                addr_terms.append((key, self._compute_partial_tokens(value)))
+        for item in address:
+            if item.kind == 'postcode':
+                self._add_postcode(item.name)
+            elif item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'):
+                hnrs.append(item.name)
+            elif item.kind == 'street':
+                token_info.add_street(self._compute_partial_tokens(item.name))
+            elif item.kind == 'place':
+                token_info.add_place(self._compute_partial_tokens(item.name))
+            elif not item.kind.startswith('_') and \
+                 item.kind not in ('country', 'full'):
+                addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))
 
         if hnrs:
             hnrs = self._split_housenumbers(hnrs)
@@ -433,7 +460,7 @@
         """ Normalize the given term, split it into partial words and
             return the token list for them.
         """
-        norm_name = self.name_processor.get_search_normalized(name)
+        norm_name = self._search_normalized(name)
 
         tokens = []
         need_lookup = []
@@ -456,19 +483,19 @@
 
         return tokens
 
+
     def _compute_name_tokens(self, names):
         """ Computes the full name and partial name tokens for the given
             dictionary of names.
         """
-        full_names = self._compute_full_names(names)
         full_tokens = set()
         partial_tokens = set()
 
-        for name in full_names:
-            norm_name = self.name_processor.get_normalized(name)
+        for name in names:
+            norm_name = self._normalized(name.name)
             full, part = self._cache.names.get(norm_name, (None, None))
             if full is None:
-                variants = self.name_processor.get_variants_ascii(norm_name)
+                variants = self.token_analysis.get_variants_ascii(norm_name)
                 if not variants:
                     continue
 
@@ -485,23 +512,6 @@
 
         return full_tokens, partial_tokens
 
-    @staticmethod
-    def _compute_full_names(names):
-        """ Return the set of all full name word ids to be used with the
-            given dictionary of names.
-        """
-        full_names = set()
-        for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
-            if name:
-                full_names.add(name)
-
-                brace_idx = name.find('(')
-                if brace_idx >= 0:
-                    full_names.add(name[:brace_idx].strip())
-
-        return full_names
-
-
     def _add_postcode(self, postcode):
         """ Make sure the normalized postcode is present in the word table.
         """
@@ -509,7 +519,7 @@
             postcode = self.normalize_postcode(postcode)
 
             if postcode not in self._cache.postcodes:
-                term = self.name_processor.get_search_normalized(postcode)
+                term = self._search_normalized(postcode)
                 if not term:
                     return
 
diff --git a/nominatim/tokenizer/place_sanitizer.py b/nominatim/tokenizer/place_sanitizer.py
new file mode 100644
index 00000000..5961dcf0
--- /dev/null
+++ b/nominatim/tokenizer/place_sanitizer.py
@@ -0,0 +1,127 @@
+"""
+Handler for cleaning name and address tags in place information before it
+is handed to the token analysis.
+"""
+import importlib
+
+from nominatim.errors import UsageError
+
+
+class PlaceName:
+    """ A searchable name for a place together with properties.
+        Every name object saves the name proper and two basic properties:
+        * 'kind' describes the name of the OSM key used without any suffixes
+          (i.e. the part after the colon removed)
+        * 'suffix' contains the suffix of the OSM tag, if any. The suffix
+          is the part of the key after the first colon.
+        In addition to that, the name may have arbitrary additional attributes.
+        Which attributes are used depends on the token analyser.
+    """
+
+    def __init__(self, name, kind, suffix):
+        self.name = name
+        self.kind = kind
+        self.suffix = suffix
+        self.attr = {}
+
+
+    def __repr__(self):
+        return f"PlaceName(name='{self.name}',kind='{self.kind}',suffix='{self.suffix}')"
+
+
+    def clone(self, name=None, kind=None, suffix=None, attr=None):
+        """ Create a deep copy of the place name, optionally with the
+            given parameters replaced. In the attribute list only the given
+            keys are updated. The list is not replaced completely.
+            In particular, the function cannot be used to remove an
+            attribute from a place name.
+        """
+        newobj = PlaceName(name or self.name,
+                           kind or self.kind,
+                           suffix or self.suffix)
+
+        newobj.attr.update(self.attr)
+        if attr:
+            newobj.attr.update(attr)
+
+        return newobj
+
+
+    def set_attr(self, key, value):
+        """ Add the given property to the name. If the property was already
+            set, then the value is overwritten.
+        """
+        self.attr[key] = value
+
+
+    def get_attr(self, key, default=None):
+        """ Return the given property or the value of 'default' if it
+            is not set.
+        """
+        return self.attr.get(key, default)
+
+
+    def has_attr(self, key):
+        """ Check if the given attribute is set.
+        """
+        return key in self.attr
+
+
+class _ProcessInfo:
+    """ Container class for information handed to handler functions.
+        The 'names' and 'address' members are mutable. A handler must change
+        them by either modifying the lists in place or replacing the old
+        content with a new list.
+    """
+
+    def __init__(self, place):
+        self.place = place
+        self.names = self._convert_name_dict(place.name)
+        self.address = self._convert_name_dict(place.address)
+
+
+    @staticmethod
+    def _convert_name_dict(names):
+        """ Convert a dictionary of names into a list of PlaceNames.
+            The dictionary key is split into the primary part of the key
+            and the suffix (the part after an optional colon).
+        """
+        out = []
+
+        if names:
+            for key, value in names.items():
+                parts = key.split(':', 1)
+                out.append(PlaceName(value.strip(),
+                                     parts[0].strip(),
+                                     parts[1].strip() if len(parts) > 1 else None))
+
+        return out
+
+
+class PlaceSanitizer:
+    """ Controller class which applies sanitizer functions on the place
+        names and address before they are used by the token analysers.
+    """
+
+    def __init__(self, rules):
+        self.handlers = []
+
+        if rules:
+            for func in rules:
+                if 'step' not in func:
+                    raise UsageError("Sanitizer rule is missing the 'step' attribute.")
+                module_name = 'nominatim.tokenizer.sanitizers.' + func['step'].replace('-', '_')
+                handler_module = importlib.import_module(module_name)
+                self.handlers.append(handler_module.create(func))
+
+
+    def process_names(self, place):
+        """ Extract a sanitized list of names and address parts from the
+            given place. The function returns a tuple
+            (list of names, list of address names)
+        """
+        obj = _ProcessInfo(place)
+
+        for func in self.handlers:
+            func(obj)
+
+        return obj.names, obj.address
diff --git a/nominatim/tokenizer/sanitizers/__init__.py b/nominatim/tokenizer/sanitizers/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/nominatim/tokenizer/sanitizers/split_name_list.py b/nominatim/tokenizer/sanitizers/split_name_list.py
new file mode 100644
index 00000000..93651f3e
--- /dev/null
+++ b/nominatim/tokenizer/sanitizers/split_name_list.py
@@ -0,0 +1,28 @@
+"""
+Name processor that splits names with multiple values into their components.
+"""
+import re
+
+def create(func):
+    """ Create a name processing function that splits name values with
+        multiple values into their components. The optional parameter
+        'delimiters' can be used to define the characters that should be used
+        for splitting. The default is ',;'.
+    """
+    regexp = re.compile('[{}]'.format(func.get('delimiters', ',;')))
+
+    def _process(obj):
+        if not obj.names:
+            return
+
+        new_names = []
+        for name in obj.names:
+            split_names = regexp.split(name.name)
+            if len(split_names) == 1:
+                new_names.append(name)
+            else:
+                new_names.extend(name.clone(name=n) for n in split_names)
+
+        obj.names = new_names
+
+    return _process
diff --git a/nominatim/tokenizer/sanitizers/strip_brace_terms.py b/nominatim/tokenizer/sanitizers/strip_brace_terms.py
new file mode 100644
index 00000000..4423d305
--- /dev/null
+++ b/nominatim/tokenizer/sanitizers/strip_brace_terms.py
@@ -0,0 +1,22 @@
+"""
+Sanitizer handling names with addendums in brackets.
+"""
+
+def create(_):
+    """ Create a name processing function that creates additional name variants
+        when a name has an addendum in brackets (e.g. "Halle (Saale)"). The
+        additional variant only contains the main name without the bracket part.
+    """
+    def _process(obj):
+        """ Add variants for names that have a bracket extension.
+        """
+        new_names = []
+        if obj.names:
+            for name in (n for n in obj.names if '(' in n.name):
+                new_name = name.name.split('(')[0].strip()
+                if new_name:
+                    new_names.append(name.clone(name=new_name))
+
+            obj.names.extend(new_names)
+
+    return _process
diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml
index c0c8c043..08b7a7ff 100644
--- a/settings/icu_tokenizer.yaml
+++ b/settings/icu_tokenizer.yaml
@@ -24,6 +24,9 @@ transliteration:
     - "[^[:Ascii:]] >"
     - ":: lower ()"
     - ":: NFC ()"
+sanitizers:
+    - step: split-name-list
+    - step: strip-brace-terms
 variants:
     - !include icu-rules/variants-bg.yaml
     - !include icu-rules/variants-ca.yaml
diff --git a/test/python/test_tokenizer_icu.py b/test/python/test_tokenizer_icu.py
index 4b7c56d5..9a6f5a94 100644
--- a/test/python/test_tokenizer_icu.py
+++ b/test/python/test_tokenizer_icu.py
@@ -67,10 +67,12 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
     monkeypatch.undo()
 
     def _mk_analyser(norm=("[[:Punctuation:][:Space:]]+ > ' '",), trans=(':: upper()',),
-                     variants=('~gasse -> gasse', 'street => st', )):
+                     variants=('~gasse -> gasse', 'street => st', ),
+                     sanitizers=[]):
         cfgstr = {'normalization' : list(norm),
-                  'transliteration' : list(trans),
-                  'variants' : [ {'words': list(variants)}]}
+                  'sanitizers' : sanitizers,
+                  'transliteration' : list(trans),
+                  'variants' : [ {'words': list(variants)}]}
        (test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr))
        tok.loader = ICURuleLoader(test_config)
 
@@ -309,14 +311,15 @@ class TestPlaceNames:
 
    @pytest.fixture(autouse=True)
    def setup(self, analyzer, sql_functions):
-        with analyzer() as anl:
+        sanitizers = [{'step': 'split-name-list'},
+                      {'step': 'strip-brace-terms'}]
+        with analyzer(sanitizers=sanitizers) as anl:
            self.analyzer = anl
            yield anl
 
 
    def expect_name_terms(self, info, *expected_terms):
        tokens = self.analyzer.get_word_token_info(expected_terms)
-        print (tokens)
        for token in tokens:
            assert token[2] is not None, "No token for {0}".format(token)
 
@@ -324,9 +327,7 @@
 
 
    def process_named_place(self, names):
-        place = {'name': names}
-
-        return self.analyzer.process_place(PlaceInfo(place))
+        return self.analyzer.process_place(PlaceInfo({'name': names}))
 
 
    def test_simple_names(self):
-- 
2.39.5
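
The sanitizer contract introduced by this patch is small: a module under
nominatim/tokenizer/sanitizers/ exposes create(config), which receives the
rule dictionary from the 'sanitizers' section of icu_tokenizer.yaml and
returns a callable that mutates the 'names' and 'address' lists of the
_ProcessInfo object. As an illustration of how a further sanitizer could be
written against this contract (the step name 'filter-kind', its 'blocklist'
parameter and its behaviour are hypothetical and not part of this patch):

    """
    Hypothetical sanitizer (illustration only): drops name entries whose kind
    is listed in a configurable blocklist. Saved as
    nominatim/tokenizer/sanitizers/filter_kind.py, it would be selected by the
    step name 'filter-kind'.
    """

    def create(func):
        """ Create a name processing function that removes all names whose
            kind appears in the 'blocklist' parameter of the rule.
        """
        blocklist = set(func.get('blocklist', []))

        def _process(obj):
            if obj.names:
                # Replace the name list, keeping only entries whose kind is
                # not blocked. Address parts are left untouched.
                obj.names = [name for name in obj.names
                             if name.kind not in blocklist]

        return _process

Under PlaceSanitizer.__init__ as added above, such a module would be enabled
with an entry like "- step: filter-kind" (plus a blocklist) in the
'sanitizers' section of settings/icu_tokenizer.yaml, since the step name is
mapped to a module name by replacing '-' with '_'.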