"""
Helper class to create ICU rules from a configuration file.
"""
import io
import itertools
import logging
from collections import defaultdict
from pathlib import Path

import yaml
from icu import Transliterator

from nominatim.errors import UsageError
LOG = logging.getLogger()
+def _flatten_yaml_list(content):
+ if not content:
+ return []
+
+ if not isinstance(content, list):
+ raise UsageError("List expected in ICU yaml configuration.")
+
+ output = []
+ for ele in content:
+ if isinstance(ele, list):
+ output.extend(_flatten_yaml_list(ele))
+ else:
+ output.append(ele)
+
+ return output
+
class ICURuleLoader:
    """ Compiler for ICU rules from a tokenizer configuration file.
    """
def __init__(self, configfile):
self.configfile = configfile
+ self.compound_suffixes = set()
+ self.abbreviations = defaultdict()
if configfile.suffix == '.yaml':
self._load_from_yaml()
def get_search_rules(self):
    """ Return the ICU rules to be used during search.
        The rules combine normalization and transliteration.
    """
    # First apply the normalization rules.
    rules = io.StringIO()
    rules.write(self.normalization_rules)

    # Then add transliteration.
    rules.write(self.transliteration_rules)
    return rules.getvalue()
    # NOTE(review): a stray, unreachable 'return self.transliteration_rules'
    # followed the return above in the patch fragment — removed as residue
    # of a dropped sibling accessor.
def get_replacement_pairs(self):
    """ Return the list of possible compound decompositions with
        application of abbreviations included.
        The result is a list of pairs: the first item is the sequence to
        replace, the second is a list of replacements.
    """
    synonyms = defaultdict(set)

    # First add entries for compound decomposition: a suffix may appear
    # glued to the word ('weg ') or standalone (' weg '); both map to both.
    for suffix in self.compound_suffixes:
        variants = (suffix + ' ', ' ' + suffix + ' ')
        for key in variants:
            synonyms[key].update(variants)

    for full, abbr in self.abbreviations.items():
        key = ' ' + full + ' '
        # Entries in the abbreviation list always apply to full words.
        # (This line was missing from the patch fragment; without it no
        # abbreviation of a plain full word is ever added, contradicting
        # the docstring and making the noop below pointless.)
        synonyms[key].update((' ' + a + ' ' for a in abbr))
        # Replacements are optional, so add a noop.
        synonyms[key].add(key)

        if full in self.compound_suffixes:
            # Full word abbreviating to compounded version.
            synonyms[key].update((a + ' ' for a in abbr))

            key = full + ' '
            # Uncompounded suffix abbreviating to decompounded version.
            synonyms[key].update((' ' + a + ' ' for a in abbr))
            # Uncompounded suffix abbreviating to compounded version.
            synonyms[key].update((a + ' ' for a in abbr))

    # Sort the resulting list by descending length (longer matches are preferred).
    sorted_keys = sorted(synonyms.keys(), key=len, reverse=True)

    return [(k, list(synonyms[k])) for k in sorted_keys]
def _yaml_include_representer(self, loader, node):
    """ Handler for the custom '!include' operator in yaml files: read the
        referenced file and return its parsed yaml content.

        Relative paths are resolved against the directory of the main
        configuration file.

        NOTE(review): despite the 'representer' name, this is registered
        as a yaml *constructor* (see _load_from_yaml).
    """
    included = Path(loader.construct_scalar(node))

    if not included.is_absolute():
        included = self.configfile.parent / included

    # Parse the included file with the safe loader as well.
    return yaml.safe_load(included.read_text())
+
def _load_from_yaml(self):
    """ Parse the yaml configuration file and set up normalization and
        transliteration rules from it.
    """
    # Register the '!include' handler. NOTE(review): this mutates the
    # global yaml.SafeLoader class and binds the handler to this
    # instance — with multiple loaders the last registration wins.
    yaml.add_constructor('!include', self._yaml_include_representer,
                         Loader=yaml.SafeLoader)
    rules = yaml.safe_load(self.configfile.read_text())

    self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
    self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
def _cfg_to_icu_rules(self, rules, section):
    """ Load an ICU ruleset from the given section of the configuration
        and return it as a single semicolon-joined rule string.

        NOTE(review): the 'def' line of this method was missing from the
        reviewed fragment; the signature is reconstructed from the call
        sites in _load_from_yaml — confirm against the complete file.
    """
    content = self._get_section(rules, section)

    if content is None:
        return ''

    # Each (possibly nested) list entry is one ICU rule; terminate the
    # final rule with ';' as well.
    return ';'.join(_flatten_yaml_list(content)) + ';'
def _parse_compound_suffix_list(self, rules):
# NOTE(review): this block is a truncated patch fragment. `norm`, `parts`
# and `fullterms` are bound on lines dropped from this view, and the
# method name says "compound suffix" while the visible code fills
# self.abbreviations — lines of a sibling abbreviation parser may have
# been merged in here. Confirm against the complete file before editing.
abbrterms = (norm.transliterate(t.strip()) for t in parts[1].split(','))
for full, abbr in itertools.product(fullterms, abbrterms):
# The '-'/'+' lines below are patch residue: the change adds a guard so
# that empty full terms or empty abbreviations never enter the table.
- self.abbreviations[full].append(abbr)
+ if full and abbr:
+ self.abbreviations[full].append(abbr)