Merge pull request #2454 from lonvia/sort-out-token-assignment-in-sql

[nominatim.git] / nominatim / tokenizer / icu_rule_loader.py
diff --git a/nominatim/tokenizer/icu_rule_loader.py b/nominatim/tokenizer/icu_rule_loader.py

index 269faed981abbbb9ffc530bd32d6b38ae0c30df4..0e6e40b4c88dc3109e5aa9fa60cb27925458454b 100644 (file)
--- a/nominatim/tokenizer/icu_rule_loader.py
+++ b/nominatim/tokenizer/icu_rule_loader.py
@@ -3,51 +3,66 @@ Helper class to create ICU rules from a configuration file.
  """
  import io
  import logging
-from collections import defaultdict
  import itertools
+import re
  
-import yaml
  from icu import Transliterator
  
  from nominatim.errors import UsageError
+import nominatim.tokenizer.icu_variants as variants
  
  LOG = logging.getLogger()
  
+def _flatten_config_list(content):
+    if not content:
+        return []
+
+    if not isinstance(content, list):
+        raise UsageError("List expected in ICU configuration.")
+
+    output = []
+    for ele in content:
+        if isinstance(ele, list):
+            output.extend(_flatten_config_list(ele))
+        else:
+            output.append(ele)
+
+    return output
+
+
+class VariantRule:
+    """ Saves a single variant expansion.
+
+        An expansion consists of the normalized replacement term and
+        a dicitonary of properties that describe when the expansion applies.
+    """
+
+    def __init__(self, replacement, properties):
+        self.replacement = replacement
+        self.properties = properties or {}
+
  
  class ICURuleLoader:
      """ Compiler for ICU rules from a tokenizer configuration file.
      """
  
-    def __init__(self, configfile):
-        self.configfile = configfile
-        self.compound_suffixes = set()
-        self.abbreviations = defaultdict()
+    def __init__(self, rules):
+        self.variants = set()
  
-        if configfile.suffix == '.yaml':
-            self._load_from_yaml()
-        else:
-            raise UsageError("Unknown format of tokenizer configuration.")
+        self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
+        self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
+        self._parse_variant_list(self._get_section(rules, 'variants'))
  
  
      def get_search_rules(self):
          """ Return the ICU rules to be used during search.
-            The rules combine normalization, compound decomposition (including
-            abbreviated compounds) and transliteration.
+            The rules combine normalization and transliteration.
          """
          # First apply the normalization rules.
          rules = io.StringIO()
          rules.write(self.normalization_rules)
  
-        # For all compound suffixes: add them in their full and any abbreviated form.
-        suffixes = set()
-        for suffix in self.compound_suffixes:
-            suffixes.add(suffix)
-            suffixes.update(self.abbreviations.get(suffix, []))
-
-        for suffix in sorted(suffixes, key=len, reverse=True):
-            rules.write("'{0} ' > ' {0} ';".format(suffix))
-
-        # Finally add transliteration.
+        # Then add transliteration.
          rules.write(self.transliteration_rules)
          return rules.getvalue()
  
@@ -67,47 +82,16 @@ class ICURuleLoader:
              The result is a list of pairs: the first item is the sequence to
              replace, the second is a list of replacements.
          """
-        synonyms = defaultdict(set)
+        return self.variants
  
-        for full, abbr in self.abbreviations.items():
-            key = ' ' + full + ' '
-            # Entries in the abbreviation list always apply to full words:
-            synonyms[key].update((' ' + a + ' ' for a in abbr))
-            # Replacements are optional, so add a noop
-            synonyms[key].add(key)
  
-        # Entries in the compound list expand to themselves and to
-        # abbreviations.
-        for suffix in self.compound_suffixes:
-            keyset = synonyms[suffix + ' ']
-            keyset.add(' ' + suffix + ' ')
-            keyset.update((' ' + a + ' ' for a in self.abbreviations.get(suffix, [])))
-            # The terms the entries are shortended to, need to be decompunded as well.
-            for abbr in self.abbreviations.get(suffix, []):
-                synonyms[abbr + ' '].add(' ' + abbr + ' ')
-
-        # sort the resulting list by descending length (longer matches are prefered).
-        sorted_keys = sorted(synonyms.keys(), key=len, reverse=True)
-
-        return [(k, list(synonyms[k])) for k in sorted_keys]
-
-
-    def _load_from_yaml(self):
-        rules = yaml.safe_load(self.configfile.read_text())
-
-        self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
-        self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
-        self._parse_compound_suffix_list(self._get_section(rules, 'compound_suffixes'))
-        self._parse_abbreviation_list(self._get_section(rules, 'abbreviations'))
-
-
-    def _get_section(self, rules, section):
+    @staticmethod
+    def _get_section(rules, section):
          """ Get the section named 'section' from the rules. If the section does
              not exist, raise a usage error with a meaningful message.
          """
          if section not in rules:
-            LOG.fatal("Section '%s' not found in tokenizer config '%s'.",
-                      section, str(self.configfile))
+            LOG.fatal("Section '%s' not found in tokenizer config.", section)
              raise UsageError("Syntax error in tokenizer configuration file.")
  
          return rules[section]
@@ -125,43 +109,114 @@ class ICURuleLoader:
          if content is None:
              return ''
  
-        if isinstance(content, str):
-            return (self.configfile.parent / content).read_text().replace('\n', ' ')
+        return ';'.join(_flatten_config_list(content)) + ';'
  
-        return ';'.join(content) + ';'
  
+    def _parse_variant_list(self, rules):
+        self.variants.clear()
  
-    def _parse_compound_suffix_list(self, rules):
          if not rules:
-            self.compound_suffixes = set()
              return
  
-        norm = Transliterator.createFromRules("rule_loader_normalization",
-                                              self.normalization_rules)
+        rules = _flatten_config_list(rules)
  
-        # Make sure all suffixes are in their normalised form.
-        self.compound_suffixes = set((norm.transliterate(s) for s in rules))
+        vmaker = _VariantMaker(self.normalization_rules)
  
+        properties = []
+        for section in rules:
+            # Create the property field and deduplicate against existing
+            # instances.
+            props = variants.ICUVariantProperties.from_rules(section)
+            for existing in properties:
+                if existing == props:
+                    props = existing
+                    break
+            else:
+                properties.append(props)
  
-    def _parse_abbreviation_list(self, rules):
-        self.abbreviations = defaultdict(list)
+            for rule in (section.get('words') or []):
+                self.variants.update(vmaker.compute(rule, props))
  
-        if not rules:
-            return
  
-        norm = Transliterator.createFromRules("rule_loader_normalization",
-                                              self.normalization_rules)
+class _VariantMaker:
+    """ Generater for all necessary ICUVariants from a single variant rule.
  
-        for rule in rules:
-            parts = rule.split('=>')
-            if len(parts) != 2:
-                LOG.fatal("Syntax error in abbreviation section, line: %s", rule)
-                raise UsageError("Syntax error in tokenizer configuration file.")
+        All text in rules is normalized to make sure the variants match later.
+    """
+
+    def __init__(self, norm_rules):
+        self.norm = Transliterator.createFromRules("rule_loader_normalization",
+                                                   norm_rules)
  
-            # Make sure all terms match the normalised version.
-            fullterms = (norm.transliterate(t.strip()) for t in parts[0].split(','))
-            abbrterms = (norm.transliterate(t.strip()) for t in parts[1].split(','))
  
-            for full, abbr in itertools.product(fullterms, abbrterms):
-                if full and abbr:
-                    self.abbreviations[full].append(abbr)
+    def compute(self, rule, props):
+        """ Generator for all ICUVariant tuples from a single variant rule.
+        """
+        parts = re.split(r'(\|)?([=-])>', rule)
+        if len(parts) != 4:
+            raise UsageError("Syntax error in variant rule: " + rule)
+
+        decompose = parts[1] is None
+        src_terms = [self._parse_variant_word(t) for t in parts[0].split(',')]
+        repl_terms = (self.norm.transliterate(t.strip()) for t in parts[3].split(','))
+
+        # If the source should be kept, add a 1:1 replacement
+        if parts[2] == '-':
+            for src in src_terms:
+                if src:
+                    for froms, tos in _create_variants(*src, src[0], decompose):
+                        yield variants.ICUVariant(froms, tos, props)
+
+        for src, repl in itertools.product(src_terms, repl_terms):
+            if src and repl:
+                for froms, tos in _create_variants(*src, repl, decompose):
+                    yield variants.ICUVariant(froms, tos, props)
+
+
+    def _parse_variant_word(self, name):
+        name = name.strip()
+        match = re.fullmatch(r'([~^]?)([^~$^]*)([~$]?)', name)
+        if match is None or (match.group(1) == '~' and match.group(3) == '~'):
+            raise UsageError("Invalid variant word descriptor '{}'".format(name))
+        norm_name = self.norm.transliterate(match.group(2))
+        if not norm_name:
+            return None
+
+        return norm_name, match.group(1), match.group(3)
+
+
+_FLAG_MATCH = {'^': '^ ',
+               '$': ' ^',
+               '': ' '}
+
+
+def _create_variants(src, preflag, postflag, repl, decompose):
+    if preflag == '~':
+        postfix = _FLAG_MATCH[postflag]
+        # suffix decomposition
+        src = src + postfix
+        repl = repl + postfix
+
+        yield src, repl
+        yield ' ' + src, ' ' + repl
+
+        if decompose:
+            yield src, ' ' + repl
+            yield ' ' + src, repl
+    elif postflag == '~':
+        # prefix decomposition
+        prefix = _FLAG_MATCH[preflag]
+        src = prefix + src
+        repl = prefix + repl
+
+        yield src, repl
+        yield src + ' ', repl + ' '
+
+        if decompose:
+            yield src, repl + ' '
+            yield src + ' ', repl
+    else:
+        prefix = _FLAG_MATCH[preflag]
+        postfix = _FLAG_MATCH[postflag]
+
+        yield prefix + src + postfix, prefix + repl + postfix