correctly quote strings when copying in data

[nominatim.git] / nominatim / tokenizer / icu_rule_loader.py
diff --git a/nominatim/tokenizer/icu_rule_loader.py b/nominatim/tokenizer/icu_rule_loader.py

index 2597656b3919750e44e811ad6457e35ad8dfb838..269faed981abbbb9ffc530bd32d6b38ae0c30df4 100644 (file)
--- a/nominatim/tokenizer/icu_rule_loader.py
+++ b/nominatim/tokenizer/icu_rule_loader.py
@@ -2,11 +2,11 @@
  Helper class to create ICU rules from a configuration file.
  """
  import io
-import yaml
  import logging
  from collections import defaultdict
  import itertools
  
+import yaml
  from icu import Transliterator
  
  from nominatim.errors import UsageError
@@ -20,6 +20,8 @@ class ICURuleLoader:
  
      def __init__(self, configfile):
          self.configfile = configfile
+        self.compound_suffixes = set()
+        self.abbreviations = defaultdict()
  
          if configfile.suffix == '.yaml':
              self._load_from_yaml()
@@ -42,7 +44,7 @@ class ICURuleLoader:
              suffixes.add(suffix)
              suffixes.update(self.abbreviations.get(suffix, []))
  
-        for suffix in sorted(suffixes, key=lambda x:len(x), reverse=True):
+        for suffix in sorted(suffixes, key=len, reverse=True):
              rules.write("'{0} ' > ' {0} ';".format(suffix))
  
          # Finally add transliteration.
@@ -85,13 +87,13 @@ class ICURuleLoader:
                  synonyms[abbr + ' '].add(' ' + abbr + ' ')
  
          # sort the resulting list by descending length (longer matches are preferred).
-        sorted_keys = sorted(synonyms.keys(), key=lambda x: len(x), reverse=True)
+        sorted_keys = sorted(synonyms.keys(), key=len, reverse=True)
  
          return [(k, list(synonyms[k])) for k in sorted_keys]
  
  
      def _load_from_yaml(self):
-        rules = yaml.load(self.configfile.read_text())
+        rules = yaml.safe_load(self.configfile.read_text())
  
          self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
          self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
@@ -120,6 +122,9 @@ class ICURuleLoader:
          """
          content = self._get_section(rules, section)
  
+        if content is None:
+            return ''
+
          if isinstance(content, str):
              return (self.configfile.parent / content).read_text().replace('\n', ' ')
  
@@ -158,4 +163,5 @@ class ICURuleLoader:
              abbrterms = (norm.transliterate(t.strip()) for t in parts[1].split(','))
  
              for full, abbr in itertools.product(fullterms, abbrterms):
-                self.abbreviations[full].append(abbr)
+                if full and abbr:
+                    self.abbreviations[full].append(abbr)