From c171d881945e8c744e4a4a44c5f33edd0d8468fb Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Mon, 4 Oct 2021 18:31:58 +0200
Subject: [PATCH] move parsing of token analysis config to analyzer

Adds a second callback for the analyzer which is responsible for
parsing the configuration rules and converting them into whatever
format is necessary. This way, each analyzer implementation can
define its own configuration rules.
---
 nominatim/tokenizer/icu_rule_loader.py        | 132 +-----------------
 nominatim/tokenizer/token_analysis/generic.py | 124 ++++++++++++++++
 2 files changed, 125 insertions(+), 131 deletions(-)

diff --git a/nominatim/tokenizer/icu_rule_loader.py b/nominatim/tokenizer/icu_rule_loader.py
index a8bdba93..cb38cfdf 100644
--- a/nominatim/tokenizer/icu_rule_loader.py
+++ b/nominatim/tokenizer/icu_rule_loader.py
@@ -5,16 +5,11 @@
 import importlib
 import io
 import json
 import logging
-import itertools
-import re
-
-from icu import Transliterator
 
 from nominatim.config import flatten_config_list
 from nominatim.db.properties import set_property, get_property
 from nominatim.errors import UsageError
 from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
-import nominatim.tokenizer.icu_variants as variants
 
 LOG = logging.getLogger()
@@ -34,18 +29,6 @@ def _get_section(rules, section):
     return rules[section]
 
 
-class VariantRule:
-    """ Saves a single variant expansion.
-
-        An expansion consists of the normalized replacement term and
-        a dicitonary of properties that describe when the expansion applies.
-    """
-
-    def __init__(self, replacement, properties):
-        self.replacement = replacement
-        self.properties = properties or {}
-
-
 class ICURuleLoader:
     """ Compiler for ICU rules from a tokenizer configuration file.
     """
@@ -169,8 +152,7 @@ class TokenAnalyzerRule:
         self._mod_create = analysis_mod.create
 
         # Load the configuration.
-        self.config = {}
-        self._parse_variant_list(rules.get('variants'), normalization_rules)
+        self.config = analysis_mod.configure(rules, normalization_rules)
 
 
     def create(self, normalization_rules, transliteration_rules):
@@ -179,115 +161,3 @@ class TokenAnalyzerRule:
         return self._mod_create(normalization_rules, transliteration_rules,
                                 self.config)
-
-
-    def _parse_variant_list(self, rules, normalization_rules):
-        vset = set()
-
-        if not rules:
-            return
-
-        rules = flatten_config_list(rules, 'variants')
-
-        vmaker = _VariantMaker(normalization_rules)
-
-        properties = []
-        for section in rules:
-            # Create the property field and deduplicate against existing
-            # instances.
-            props = variants.ICUVariantProperties.from_rules(section)
-            for existing in properties:
-                if existing == props:
-                    props = existing
-                    break
-            else:
-                properties.append(props)
-
-            for rule in (section.get('words') or []):
-                vset.update(vmaker.compute(rule, props))
-
-        self.config['variants'] = vset
-
-
-class _VariantMaker:
-    """ Generater for all necessary ICUVariants from a single variant rule.
-
-        All text in rules is normalized to make sure the variants match later.
-    """
-
-    def __init__(self, norm_rules):
-        self.norm = Transliterator.createFromRules("rule_loader_normalization",
-                                                   norm_rules)
-
-
-    def compute(self, rule, props):
-        """ Generator for all ICUVariant tuples from a single variant rule.
-        """
-        parts = re.split(r'(\|)?([=-])>', rule)
-        if len(parts) != 4:
-            raise UsageError("Syntax error in variant rule: " + rule)
-
-        decompose = parts[1] is None
-        src_terms = [self._parse_variant_word(t) for t in parts[0].split(',')]
-        repl_terms = (self.norm.transliterate(t.strip()) for t in parts[3].split(','))
-
-        # If the source should be kept, add a 1:1 replacement
-        if parts[2] == '-':
-            for src in src_terms:
-                if src:
-                    for froms, tos in _create_variants(*src, src[0], decompose):
-                        yield variants.ICUVariant(froms, tos, props)
-
-        for src, repl in itertools.product(src_terms, repl_terms):
-            if src and repl:
-                for froms, tos in _create_variants(*src, repl, decompose):
-                    yield variants.ICUVariant(froms, tos, props)
-
-
-    def _parse_variant_word(self, name):
-        name = name.strip()
-        match = re.fullmatch(r'([~^]?)([^~$^]*)([~$]?)', name)
-        if match is None or (match.group(1) == '~' and match.group(3) == '~'):
-            raise UsageError("Invalid variant word descriptor '{}'".format(name))
-        norm_name = self.norm.transliterate(match.group(2))
-        if not norm_name:
-            return None
-
-        return norm_name, match.group(1), match.group(3)
-
-
-_FLAG_MATCH = {'^': '^ ',
-               '$': ' ^',
-               '': ' '}
-
-
-def _create_variants(src, preflag, postflag, repl, decompose):
-    if preflag == '~':
-        postfix = _FLAG_MATCH[postflag]
-        # suffix decomposition
-        src = src + postfix
-        repl = repl + postfix
-
-        yield src, repl
-        yield ' ' + src, ' ' + repl
-
-        if decompose:
-            yield src, ' ' + repl
-            yield ' ' + src, repl
-    elif postflag == '~':
-        # prefix decomposition
-        prefix = _FLAG_MATCH[preflag]
-        src = prefix + src
-        repl = prefix + repl
-
-        yield src, repl
-        yield src + ' ', repl + ' '
-
-        if decompose:
-            yield src, repl + ' '
-            yield src + ' ', repl
-    else:
-        prefix = _FLAG_MATCH[preflag]
-        postfix = _FLAG_MATCH[postflag]
-
-        yield prefix + src + postfix, prefix + repl + postfix
diff --git a/nominatim/tokenizer/token_analysis/generic.py b/nominatim/tokenizer/token_analysis/generic.py
index 2c720f1d..f0de0cca 100644
--- a/nominatim/tokenizer/token_analysis/generic.py
+++ b/nominatim/tokenizer/token_analysis/generic.py
@@ -3,10 +3,134 @@ Generic processor for names that creates abbreviation variants.
 """
 from collections import defaultdict
 import itertools
+import re
 
 from icu import Transliterator
 import datrie
 
+from nominatim.config import flatten_config_list
+from nominatim.errors import UsageError
+import nominatim.tokenizer.icu_variants as variants
+
+### Configuration section
+
+def configure(rules, normalization_rules):
+    """ Extract and preprocess the configuration for this module.
+    """
+    return {'variants': _parse_variant_list(rules.get('variants'),
+                                            normalization_rules)}
+
+
+def _parse_variant_list(rules, normalization_rules):
+    vset = set()
+
+    if rules:
+        rules = flatten_config_list(rules, 'variants')
+
+        vmaker = _VariantMaker(normalization_rules)
+
+        properties = []
+        for section in rules:
+            # Create the property field and deduplicate against existing
+            # instances.
+            props = variants.ICUVariantProperties.from_rules(section)
+            for existing in properties:
+                if existing == props:
+                    props = existing
+                    break
+            else:
+                properties.append(props)
+
+            for rule in (section.get('words') or []):
+                vset.update(vmaker.compute(rule, props))
+
+    return vset
+
+
+class _VariantMaker:
+    """ Generator for all necessary ICUVariants from a single variant rule.
+
+        All text in rules is normalized to make sure the variants match later.
+    """
+
+    def __init__(self, norm_rules):
+        self.norm = Transliterator.createFromRules("rule_loader_normalization",
+                                                   norm_rules)
+
+
+    def compute(self, rule, props):
+        """ Generator for all ICUVariant tuples from a single variant rule.
+        """
+        parts = re.split(r'(\|)?([=-])>', rule)
+        if len(parts) != 4:
+            raise UsageError("Syntax error in variant rule: " + rule)
+
+        decompose = parts[1] is None
+        src_terms = [self._parse_variant_word(t) for t in parts[0].split(',')]
+        repl_terms = (self.norm.transliterate(t.strip()) for t in parts[3].split(','))
+
+        # If the source should be kept, add a 1:1 replacement
+        if parts[2] == '-':
+            for src in src_terms:
+                if src:
+                    for froms, tos in _create_variants(*src, src[0], decompose):
+                        yield variants.ICUVariant(froms, tos, props)
+
+        for src, repl in itertools.product(src_terms, repl_terms):
+            if src and repl:
+                for froms, tos in _create_variants(*src, repl, decompose):
+                    yield variants.ICUVariant(froms, tos, props)
+
+
+    def _parse_variant_word(self, name):
+        name = name.strip()
+        match = re.fullmatch(r'([~^]?)([^~$^]*)([~$]?)', name)
+        if match is None or (match.group(1) == '~' and match.group(3) == '~'):
+            raise UsageError("Invalid variant word descriptor '{}'".format(name))
+        norm_name = self.norm.transliterate(match.group(2))
+        if not norm_name:
+            return None
+
+        return norm_name, match.group(1), match.group(3)
+
+
+_FLAG_MATCH = {'^': '^ ',
+               '$': ' ^',
+               '': ' '}
+
+
+def _create_variants(src, preflag, postflag, repl, decompose):
+    if preflag == '~':
+        postfix = _FLAG_MATCH[postflag]
+        # suffix decomposition
+        src = src + postfix
+        repl = repl + postfix
+
+        yield src, repl
+        yield ' ' + src, ' ' + repl
+
+        if decompose:
+            yield src, ' ' + repl
+            yield ' ' + src, repl
+    elif postflag == '~':
+        # prefix decomposition
+        prefix = _FLAG_MATCH[preflag]
+        src = prefix + src
+        repl = prefix + repl
+
+        yield src, repl
+        yield src + ' ', repl + ' '
+
+        if decompose:
+            yield src, repl + ' '
+            yield src + ' ', repl
+    else:
+        prefix = _FLAG_MATCH[preflag]
+        postfix = _FLAG_MATCH[postflag]
+
+        yield prefix + src + postfix, prefix + repl + postfix
+
+
 ### Analysis section
 
 def create(norm_rules, trans_rules, config):
-- 
2.39.5
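
Note on the new contract: after this patch, a token-analysis module must
export two module-level callbacks, configure() for one-time parsing of its
private rule section and create() for instantiating the analyzer with the
preprocessed configuration. The minimal sketch below shows the shape such a
module could take. Only the callback names and signatures are taken from the
patch above; the docstrings, the _ExampleAnalyzer class and its
'replacements' option are invented for illustration.

"""
Hypothetical custom token-analysis module (sketch, not part of Nominatim).
"""

def configure(rules, normalization_rules):
    """ Parse this module's section of the configuration rules once and
        return it in whatever format create() expects later.
    """
    # Invented rule format: a plain list of replacement strings.
    return {'replacements': list(rules.get('replacements', []))}


def create(norm_rules, trans_rules, config):
    """ Instantiate the analyzer. Receives the normalization and
        transliteration rules plus the dict returned by configure().
    """
    return _ExampleAnalyzer(config['replacements'])


class _ExampleAnalyzer:
    """ Invented analyzer that merely remembers its replacement list. """

    def __init__(self, replacements):
        self.replacements = replacements

Splitting the work this way keeps the expensive rule parsing in configure(),
which runs once per tokenizer setup, while create() stays cheap enough to be
called for every analyzer instance.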