from nominatim.db.properties import set_property, get_property
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
+from nominatim.indexer.place_info import PlaceInfo
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
Analyzers are not thread-safe. You need to instantiate one per thread.
"""
- return LegacyICUNameAnalyzer(self.dsn, self.loader.make_token_analysis())
+ return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
+ self.loader.make_token_analysis())
def _install_php(self, phpdir):
normalization.
"""
- def __init__(self, dsn, name_proc):
+ def __init__(self, dsn, sanitizer, token_analysis):
self.conn = connect(dsn).connection
self.conn.autocommit = True
- self.name_processor = name_proc
+ self.sanitizer = sanitizer
+ self.token_analysis = token_analysis
self._cache = _TokenCache()
self.conn = None
+ def _search_normalized(self, name):
+ """ Return the search token transliteration of the given name.
+ """
+ return self.token_analysis.get_search_normalized(name)
+
+
+ def _normalized(self, name):
+ """ Return the normalized version of the given name with all
+ non-relevant information removed.
+ """
+ return self.token_analysis.get_normalized(name)
+
+
def get_word_token_info(self, words):
""" Return token information for the given list of words.
If a word starts with # it is assumed to be a full name
partial_tokens = {}
for word in words:
if word.startswith('#'):
- full_tokens[word] = self.name_processor.get_search_normalized(word[1:])
+ full_tokens[word] = self._search_normalized(word[1:])
else:
- partial_tokens[word] = self.name_processor.get_search_normalized(word)
+ partial_tokens[word] = self._search_normalized(word)
with self.conn.cursor() as cur:
cur.execute("""SELECT word_token, word_id
This function takes minor shortcuts on transliteration.
"""
- return self.name_processor.get_search_normalized(hnr)
+ return self._search_normalized(hnr)
def update_postcodes_from_db(self):
""" Update postcode tokens in the word table from the location_postcode
if postcode is None:
to_delete.append(word)
else:
- copystr.add(self.name_processor.get_search_normalized(postcode),
+ copystr.add(self._search_normalized(postcode),
'P', postcode)
if to_delete:
completely replaced. Otherwise the phrases are added to the
already existing ones.
"""
- norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
+ norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
for p in phrases))
with self.conn.cursor() as cur:
added = 0
with CopyBuffer() as copystr:
for word, cls, typ, oper in to_add:
- term = self.name_processor.get_search_normalized(word)
+ term = self._search_normalized(word)
if term:
copystr.add(term, 'S', word,
json.dumps({'class': cls, 'type': typ,
def add_country_names(self, country_code, names):
""" Add names for the given country to the search index.
"""
+ # Make sure any name preprocessing for country names applies.
+ info = PlaceInfo({'name': names, 'country_code': country_code,
+ 'rank_address': 4, 'class': 'boundary',
+ 'type': 'administrative'})
+ self._add_country_full_names(country_code,
+ self.sanitizer.process_names(info)[0])
+
+
+ def _add_country_full_names(self, country_code, names):
+ """ Add names for the given country from an already sanitized
+ name list.
+ """
word_tokens = set()
- for name in self._compute_full_names(names):
- norm_name = self.name_processor.get_search_normalized(name)
+ for name in names:
+ norm_name = self._search_normalized(name.name)
if norm_name:
word_tokens.add(norm_name)
def process_place(self, place):
""" Determine tokenizer information about the given place.
- Returns a JSON-serialisable structure that will be handed into
+ Returns a JSON-serializable structure that will be handed into
the database via the token_info field.
"""
token_info = _TokenInfo(self._cache)
- names = place.name
+ names, address = self.sanitizer.process_names(place)
if names:
fulls, partials = self._compute_name_tokens(names)
token_info.add_names(fulls, partials)
if place.is_country():
- self.add_country_names(place.country_code, names)
+ self._add_country_full_names(place.country_code, names)
- address = place.address
if address:
self._process_place_address(token_info, address)
def _process_place_address(self, token_info, address):
hnrs = []
addr_terms = []
- for key, value in address.items():
- if key == 'postcode':
- self._add_postcode(value)
- elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
- hnrs.append(value)
- elif key == 'street':
- token_info.add_street(self._compute_partial_tokens(value))
- elif key == 'place':
- token_info.add_place(self._compute_partial_tokens(value))
- elif not key.startswith('_') and \
- key not in ('country', 'full'):
- addr_terms.append((key, self._compute_partial_tokens(value)))
+ for item in address:
+ if item.kind == 'postcode':
+ self._add_postcode(item.name)
+ elif item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'):
+ hnrs.append(item.name)
+ elif item.kind == 'street':
+ token_info.add_street(self._compute_partial_tokens(item.name))
+ elif item.kind == 'place':
+ token_info.add_place(self._compute_partial_tokens(item.name))
+ elif not item.kind.startswith('_') and \
+ item.kind not in ('country', 'full'):
+ addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))
if hnrs:
hnrs = self._split_housenumbers(hnrs)
""" Normalize the given term, split it into partial words and return
the token list for them.
"""
- norm_name = self.name_processor.get_search_normalized(name)
+ norm_name = self._search_normalized(name)
tokens = []
need_lookup = []
return tokens
+
def _compute_name_tokens(self, names):
""" Computes the full name and partial name tokens for the given
- dictionary of names.
+ sanitized list of names.
"""
- full_names = self._compute_full_names(names)
full_tokens = set()
partial_tokens = set()
- for name in full_names:
- norm_name = self.name_processor.get_normalized(name)
+ for name in names:
+ norm_name = self._normalized(name.name)
full, part = self._cache.names.get(norm_name, (None, None))
if full is None:
- variants = self.name_processor.get_variants_ascii(norm_name)
+ variants = self.token_analysis.get_variants_ascii(norm_name)
if not variants:
continue
return full_tokens, partial_tokens
- @staticmethod
- def _compute_full_names(names):
- """ Return the set of all full name word ids to be used with the
- given dictionary of names.
- """
- full_names = set()
- for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
- if name:
- full_names.add(name)
-
- brace_idx = name.find('(')
- if brace_idx >= 0:
- full_names.add(name[:brace_idx].strip())
-
- return full_names
-
-
def _add_postcode(self, postcode):
""" Make sure the normalized postcode is present in the word table.
"""
postcode = self.normalize_postcode(postcode)
if postcode not in self._cache.postcodes:
- term = self.name_processor.get_search_normalized(postcode)
+ term = self._search_normalized(postcode)
if not term:
return
--- /dev/null
+"""
+Handler for cleaning name and address tags in place information before it
+is handed to the token analysis.
+"""
+import importlib
+
+from nominatim.errors import UsageError
+
+class PlaceName:
+ """ A searchable name for a place together with properties.
+ Every name object saves the name proper and two basic properties:
+ * 'kind' contains the OSM key without any suffix
+ (i.e. with the part after the first colon removed)
+ * 'suffix' contains the suffix of the OSM tag, if any. The suffix
+ is the part of the key after the first colon.
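+ (For example, the key 'name:de' yields kind 'name' and suffix 'de',
+ while a plain 'name' key yields kind 'name' and no suffix.)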
+ In addition to that, the name may have arbitrary additional attributes.
+ Which attributes are used depends on the token analyser.
+ """
+
+ def __init__(self, name, kind, suffix):
+ self.name = name
+ self.kind = kind
+ self.suffix = suffix
+ self.attr = {}
+
+
+ def __repr__(self):
+ return f"PlaceName(name='{self.name}',kind='{self.kind}',suffix='{self.suffix}')"
+
+
+ def clone(self, name=None, kind=None, suffix=None, attr=None):
+ """ Create a deep copy of the place name, optionally with the
+ given parameters replaced. In the attribute dictionary only the given
+ keys are updated; the dictionary is not replaced completely.
+ In particular, the function cannot be used to remove an
+ attribute from a place name.
+ """
+ newobj = PlaceName(name or self.name,
+ kind or self.kind,
+ suffix or self.suffix)
+
+ newobj.attr.update(self.attr)
+ if attr:
+ newobj.attr.update(attr)
+
+ return newobj
+
+
+ def set_attr(self, key, value):
+ """ Add the given property to the name. If the property was already
+ set, then the value is overwritten.
+ """
+ self.attr[key] = value
+
+
+ def get_attr(self, key, default=None):
+ """ Return the given property or the value of 'default' if it
+ is not set.
+ """
+ return self.attr.get(key, default)
+
+
+ def has_attr(self, key):
+ """ Check if the given attribute is set.
+ """
+ return key in self.attr
+
+
+class _ProcessInfo:
+ """ Container class for information handed into to handler functions.
+ The 'names' and 'address' members are mutable. A handler must change
+ them by either modifying the lists place or replacing the old content
+ with a new list.
+ """
+
+ def __init__(self, place):
+ self.place = place
+ self.names = self._convert_name_dict(place.name)
+ self.address = self._convert_name_dict(place.address)
+
+
+ @staticmethod
+ def _convert_name_dict(names):
+ """ Convert a dictionary of names into a list of PlaceNames.
+ The dictionary key is split into the primary part of the key
+ and the suffix (the part after an optional colon).
+ """
+ out = []
+
+ if names:
+ for key, value in names.items():
+ parts = key.split(':', 1)
+ out.append(PlaceName(value.strip(),
+ parts[0].strip(),
+ parts[1].strip() if len(parts) > 1 else None))
+
+ return out
+
+
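+# Sanitizer modules are loaded by name in PlaceSanitizer below. Each module
+# is expected to expose a create(config) function returning a callable that
+# mutates a _ProcessInfo. A minimal sketch of such a step (the kind filtered
+# here is purely illustrative):
+#
+#   def create(config):
+#       def _filter(obj):
+#           obj.names = [n for n in obj.names if n.kind != 'old_name']
+#       return _filter
+
+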
+class PlaceSanitizer:
+ """ Controller class which applies sanitizer functions on the place
+ names and address before they are used by the token analysers.
+ """
+
+ def __init__(self, rules):
+ self.handlers = []
+
+ if rules:
+ for func in rules:
+ if 'step' not in func:
+ raise UsageError("Sanitizer rule is missing the 'step' attribute.")
+ module_name = 'nominatim.tokenizer.sanitizers.' + func['step'].replace('-', '_')
+ handler_module = importlib.import_module(module_name)
+ self.handlers.append(handler_module.create(func))
+
+
+ def process_names(self, place):
+ """ Extract a sanitized list of names and address parts from the
+ given place. The function returns a tuple
+ (list of names, list of address names)
+ """
+ obj = _ProcessInfo(place)
+
+ for func in self.handlers:
+ func(obj)
+
+ return obj.names, obj.address
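+
+
+# Example of setting up a sanitizer pipeline; the rule list corresponds to
+# the 'sanitizers' section of icu_tokenizer.yaml:
+#
+#   sanitizer = PlaceSanitizer([{'step': 'split-name-list'},
+#                               {'step': 'strip-brace-terms'}])
+#   names, address = sanitizer.process_names(place_info)
+#   (where place_info is a nominatim.indexer.place_info.PlaceInfo)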
monkeypatch.undo()
def _mk_analyser(norm=("[[:Punctuation:][:Space:]]+ > ' '",), trans=(':: upper()',),
- variants=('~gasse -> gasse', 'street => st', )):
+ variants=('~gasse -> gasse', 'street => st', ),
+ sanitizers=[]):
cfgstr = {'normalization' : list(norm),
- 'transliteration' : list(trans),
- 'variants' : [ {'words': list(variants)}]}
+ 'sanitizers' : sanitizers,
+ 'transliteration' : list(trans),
+ 'variants' : [ {'words': list(variants)}]}
(test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr))
tok.loader = ICURuleLoader(test_config)
@pytest.fixture(autouse=True)
def setup(self, analyzer, sql_functions):
- with analyzer() as anl:
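+ # The sanitizer steps below take over the name-list splitting and brace
+ # handling previously done in the removed _compute_full_names.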
+ sanitizers = [{'step': 'split-name-list'},
+ {'step': 'strip-brace-terms'}]
+ with analyzer(sanitizers=sanitizers) as anl:
self.analyzer = anl
yield anl
def expect_name_terms(self, info, *expected_terms):
tokens = self.analyzer.get_word_token_info(expected_terms)
- print (tokens)
for token in tokens:
assert token[2] is not None, "No token for {0}".format(token)
def process_named_place(self, names):
- place = {'name': names}
-
- return self.analyzer.process_place(PlaceInfo(place))
+ return self.analyzer.process_place(PlaceInfo({'name': names}))
def test_simple_names(self):