Merge remote-tracking branch 'upstream/master'

[nominatim.git] / nominatim / tokenizer / icu_rule_loader.py
diff --git a/nominatim/tokenizer/icu_rule_loader.py b/nominatim/tokenizer/icu_rule_loader.py

index ddb17ae76698025dd9c9fd82619137d0a5b1e22e..0e6e40b4c88dc3109e5aa9fa60cb27925458454b 100644 (file)
--- a/nominatim/tokenizer/icu_rule_loader.py
+++ b/nominatim/tokenizer/icu_rule_loader.py
@@ -3,47 +3,55 @@ Helper class to create ICU rules from a configuration file.
  """
  import io
  import logging
-from collections import defaultdict
  import itertools
-from pathlib import Path
+import re
  
-import yaml
  from icu import Transliterator
  
  from nominatim.errors import UsageError
+import nominatim.tokenizer.icu_variants as variants
  
  LOG = logging.getLogger()
  
-def _flatten_yaml_list(content):
+def _flatten_config_list(content):
      if not content:
          return []
  
      if not isinstance(content, list):
-        raise UsageError("List expected in ICU yaml configuration.")
+        raise UsageError("List expected in ICU configuration.")
  
      output = []
      for ele in content:
          if isinstance(ele, list):
-            output.extend(_flatten_yaml_list(ele))
+            output.extend(_flatten_config_list(ele))
          else:
              output.append(ele)
  
      return output
  
  
+class VariantRule:
+    """ Saves a single variant expansion.
+
+        An expansion consists of the normalized replacement term and
+        a dictionary of properties that describe when the expansion applies.
+    """
+
+    def __init__(self, replacement, properties):
+        self.replacement = replacement
+        self.properties = properties or {}
+
+
  class ICURuleLoader:
      """ Compiler for ICU rules from a tokenizer configuration file.
      """
  
-    def __init__(self, configfile):
-        self.configfile = configfile
-        self.compound_suffixes = set()
-        self.abbreviations = defaultdict()
+    def __init__(self, rules):
+        self.variants = set()
  
-        if configfile.suffix == '.yaml':
-            self._load_from_yaml()
-        else:
-            raise UsageError("Unknown format of tokenizer configuration.")
+        self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
+        self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
+        self._parse_variant_list(self._get_section(rules, 'variants'))
  
  
      def get_search_rules(self):
@@ -74,65 +82,16 @@ class ICURuleLoader:
              The result is a list of pairs: the first item is the sequence to
              replace, the second is a list of replacements.
          """
-        synonyms = defaultdict(set)
-
-        # First add entries for compound decomposition.
-        for suffix in self.compound_suffixes:
-            variants = (suffix + ' ', ' ' + suffix + ' ')
-            for key in variants:
-                synonyms[key].update(variants)
-
-        for full, abbr in self.abbreviations.items():
-            key = ' ' + full + ' '
-            # Entries in the abbreviation list always apply to full words:
-            synonyms[key].update((' ' + a + ' ' for a in abbr))
-            # Replacements are optional, so add a noop
-            synonyms[key].add(key)
-
-            if full in self.compound_suffixes:
-                # Full word abbreviating to compunded version.
-                synonyms[key].update((a + ' ' for a in abbr))
-
-                key = full + ' '
-                # Uncompunded suffix abbrevitating to decompounded version.
-                synonyms[key].update((' ' + a + ' ' for a in abbr))
-                # Uncompunded suffix abbrevitating to compunded version.
-                synonyms[key].update((a + ' ' for a in abbr))
-
-        # sort the resulting list by descending length (longer matches are prefered).
-        sorted_keys = sorted(synonyms.keys(), key=len, reverse=True)
-
-        return [(k, list(synonyms[k])) for k in sorted_keys]
+        return self.variants
  
-    def _yaml_include_representer(self, loader, node):
-        value = loader.construct_scalar(node)
  
-        if Path(value).is_absolute():
-            content = Path(value).read_text()
-        else:
-            content = (self.configfile.parent / value).read_text()
-
-        return yaml.safe_load(content)
-
-
-    def _load_from_yaml(self):
-        yaml.add_constructor('!include', self._yaml_include_representer,
-                             Loader=yaml.SafeLoader)
-        rules = yaml.safe_load(self.configfile.read_text())
-
-        self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
-        self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
-        self._parse_compound_suffix_list(self._get_section(rules, 'compound_suffixes'))
-        self._parse_abbreviation_list(self._get_section(rules, 'abbreviations'))
-
-
-    def _get_section(self, rules, section):
+    @staticmethod
+    def _get_section(rules, section):
          """ Get the section named 'section' from the rules. If the section does
              not exist, raise a usage error with a meaningful message.
          """
          if section not in rules:
-            LOG.fatal("Section '%s' not found in tokenizer config '%s'.",
-                      section, str(self.configfile))
+            LOG.fatal("Section '%s' not found in tokenizer config.", section)
              raise UsageError("Syntax error in tokenizer configuration file.")
  
          return rules[section]
@@ -150,41 +109,114 @@ class ICURuleLoader:
          if content is None:
              return ''
  
-        return ';'.join(_flatten_yaml_list(content)) + ';'
+        return ';'.join(_flatten_config_list(content)) + ';'
  
  
+    def _parse_variant_list(self, rules):
+        self.variants.clear()
  
-    def _parse_compound_suffix_list(self, rules):
          if not rules:
-            self.compound_suffixes = set()
              return
  
-        norm = Transliterator.createFromRules("rule_loader_normalization",
-                                              self.normalization_rules)
+        rules = _flatten_config_list(rules)
  
-        # Make sure all suffixes are in their normalised form.
-        self.compound_suffixes = set((norm.transliterate(s) for s in rules))
+        vmaker = _VariantMaker(self.normalization_rules)
  
+        properties = []
+        for section in rules:
+            # Create the property field and deduplicate against existing
+            # instances.
+            props = variants.ICUVariantProperties.from_rules(section)
+            for existing in properties:
+                if existing == props:
+                    props = existing
+                    break
+            else:
+                properties.append(props)
  
-    def _parse_abbreviation_list(self, rules):
-        self.abbreviations = defaultdict(list)
+            for rule in (section.get('words') or []):
+                self.variants.update(vmaker.compute(rule, props))
  
-        if not rules:
-            return
  
-        norm = Transliterator.createFromRules("rule_loader_normalization",
-                                              self.normalization_rules)
+class _VariantMaker:
+    """ Generator for all necessary ICUVariants from a single variant rule.
  
-        for rule in rules:
-            parts = rule.split('=>')
-            if len(parts) != 2:
-                LOG.fatal("Syntax error in abbreviation section, line: %s", rule)
-                raise UsageError("Syntax error in tokenizer configuration file.")
+        All text in rules is normalized to make sure the variants match later.
+    """
+
+    def __init__(self, norm_rules):
+        self.norm = Transliterator.createFromRules("rule_loader_normalization",
+                                                   norm_rules)
  
-            # Make sure all terms match the normalised version.
-            fullterms = (norm.transliterate(t.strip()) for t in parts[0].split(','))
-            abbrterms = (norm.transliterate(t.strip()) for t in parts[1].split(','))
  
-            for full, abbr in itertools.product(fullterms, abbrterms):
-                if full and abbr:
-                    self.abbreviations[full].append(abbr)
+    def compute(self, rule, props):
+        """ Generator for all ICUVariant tuples from a single variant rule.
+        """
+        parts = re.split(r'(\|)?([=-])>', rule)
+        if len(parts) != 4:
+            raise UsageError("Syntax error in variant rule: " + rule)
+
+        decompose = parts[1] is None
+        src_terms = [self._parse_variant_word(t) for t in parts[0].split(',')]
+        repl_terms = (self.norm.transliterate(t.strip()) for t in parts[3].split(','))
+
+        # If the source should be kept, add a 1:1 replacement
+        if parts[2] == '-':
+            for src in src_terms:
+                if src:
+                    for froms, tos in _create_variants(*src, src[0], decompose):
+                        yield variants.ICUVariant(froms, tos, props)
+
+        for src, repl in itertools.product(src_terms, repl_terms):
+            if src and repl:
+                for froms, tos in _create_variants(*src, repl, decompose):
+                    yield variants.ICUVariant(froms, tos, props)
+
+
+    def _parse_variant_word(self, name):
+        name = name.strip()
+        match = re.fullmatch(r'([~^]?)([^~$^]*)([~$]?)', name)
+        if match is None or (match.group(1) == '~' and match.group(3) == '~'):
+            raise UsageError("Invalid variant word descriptor '{}'".format(name))
+        norm_name = self.norm.transliterate(match.group(2))
+        if not norm_name:
+            return None
+
+        return norm_name, match.group(1), match.group(3)
+
+
+_FLAG_MATCH = {'^': '^ ',
+               '$': ' ^',
+               '': ' '}
+
+
+def _create_variants(src, preflag, postflag, repl, decompose):
+    if preflag == '~':
+        postfix = _FLAG_MATCH[postflag]
+        # suffix decomposition
+        src = src + postfix
+        repl = repl + postfix
+
+        yield src, repl
+        yield ' ' + src, ' ' + repl
+
+        if decompose:
+            yield src, ' ' + repl
+            yield ' ' + src, repl
+    elif postflag == '~':
+        # prefix decomposition
+        prefix = _FLAG_MATCH[preflag]
+        src = prefix + src
+        repl = prefix + repl
+
+        yield src, repl
+        yield src + ' ', repl + ' '
+
+        if decompose:
+            yield src, repl + ' '
+            yield src + ' ', repl
+    else:
+        prefix = _FLAG_MATCH[preflag]
+        postfix = _FLAG_MATCH[postflag]
+
+        yield prefix + src + postfix, prefix + repl + postfix