+"""
+Helper class to create ICU rules from a configuration file.
+"""
+import io
+import itertools
+import logging
+from collections import defaultdict
+
+import yaml
+from icu import Transliterator
+
+from nominatim.errors import UsageError
+
+LOG = logging.getLogger()
+
+
+class ICURuleLoader:
+ """ Compiler for ICU rules from a tokenizer configuration file.
+ """
+
+ def __init__(self, configfile):
+ self.configfile = configfile
+
+ if configfile.suffix == '.yaml':
+ self._load_from_yaml()
+ else:
+ raise UsageError("Unknown format of tokenizer configuration.")
+
+
+ def get_search_rules(self):
+ """ Returns the ICU rules to be used during search.
+ The rules combine normalization, compound decomposition (including
+ abbreviated compounds) and transliteration.
+ """
+ # First apply the normalization rules.
+ rules = io.StringIO()
+ rules.write(self.normalization_rules)
+
+ # For all compound suffixes: add them in their full and any abbreviated form.
+ suffixes = set()
+ for suffix in self.compound_suffixes:
+ suffixes.add(suffix)
+ suffixes.update(self.abbreviations.get(suffix, []))
+
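+ # Each generated rule splits a suffix off the preceding word, e.g. the
+ # suffix 'straße' yields "'straße ' > ' straße ';", which turns
+ # 'brunnenstraße ' into 'brunnen straße '. Longer suffixes are written
+ # first so they take precedence over shorter suffixes contained in them.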
+ for suffix in sorted(suffixes, key=len, reverse=True):
+ rules.write("'{0} ' > ' {0} ';".format(suffix))
+
+ # Finally add transliteration.
+ rules.write(self.transliteration_rules)
+ return rules.getvalue()
+
+ def get_normalization_rules(self):
+ """ Return rules for normalisation of a term.
+ """
+ return self.normalization_rules
+
+ def get_transliteration_rules(self):
+ """ Return the rules for converting a string into its asciii representation.
+ """
+ return self.transliteration_rules
+
+ def get_replacement_pairs(self):
+ """ Returns the list of possible compound decompositions with
+ application of abbreviations included.
+ The result is a list of pairs: the first item is the sequence to
+ replace, the second is a list of replacements.
+ """
+ synonyms = defaultdict(set)
+
+ for full, abbr in self.abbreviations.items():
+ key = ' ' + full + ' '
+ # Entries in the abbreviation list always apply to full words:
+ synonyms[key].update((' ' + a + ' ' for a in abbr))
+ # Replacements are optional, so add a noop
+ synonyms[key].add(key)
+
+ # Entries in the compound list expand to themselves and to
+ # abbreviations.
+ for suffix in self.compound_suffixes:
+ keyset = synonyms[suffix + ' ']
+ keyset.add(' ' + suffix + ' ')
+ keyset.update((' ' + a + ' ' for a in self.abbreviations.get(suffix, [])))
+ # The terms the entries are shortened to need to be decompounded as well.
+ for abbr in self.abbreviations.get(suffix, []):
+ synonyms[abbr + ' '].add(' ' + abbr + ' ')
+
+ # Sort the resulting list by descending length (longer matches are preferred).
+ sorted_keys = sorted(synonyms.keys(), key=len, reverse=True)
+
+ return [(k, list(synonyms[k])) for k in sorted_keys]
+
+
+ def _load_from_yaml(self):
+ rules = yaml.safe_load(self.configfile.read_text())
+
+ self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
+ self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
+ self._parse_compound_suffix_list(self._get_section(rules, 'compound_suffixes'))
+ self._parse_abbreviation_list(self._get_section(rules, 'abbreviations'))
+
+
+ def _get_section(self, rules, section):
+ """ Get the section named 'section' from the rules. If the section does
+ not exist, raise a usage error with a meaningful message.
+ """
+ if section not in rules:
+ LOG.fatal("Section '%s' not found in tokenizer config '%s'.",
+ section, str(self.configfile))
+ raise UsageError("Syntax error in tokenizer configuration file.")
+
+ return rules[section]
+
+
+ def _cfg_to_icu_rules(self, rules, section):
+ """ Load an ICU ruleset from the given section. If the section is a
+ simple string, it is interpreted as a file name and the rules are
+ loaded verbatim from the given file. The filename is expected to be
+ relative to the tokenizer rule file. If the section is a list, then
+ each list item is assumed to be a rule. All rules are concatenated and returned.
+ """
+ content = self._get_section(rules, section)
+
+ if isinstance(content, str):
+ return (self.configfile.parent / content).read_text().replace('\n', ' ')
+
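+ # Inline rules are joined into a single ';'-separated ICU rule string,
+ # e.g. ["'ä' > 'ae'", "'ö' > 'oe'"] becomes "'ä' > 'ae';'ö' > 'oe';".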
+ return ';'.join(content) + ';'
+
+
+ def _parse_compound_suffix_list(self, rules):
+ if not rules:
+ self.compound_suffixes = set()
+ return
+
+ norm = Transliterator.createFromRules("rule_loader_normalization",
+ self.normalization_rules)
+
+ # Make sure all suffixes are in their normalised form.
+ self.compound_suffixes = set((norm.transliterate(s) for s in rules))
+
+
+ def _parse_abbreviation_list(self, rules):
+ self.abbreviations = defaultdict(list)
+
+ if not rules:
+ return
+
+ norm = Transliterator.createFromRules("rule_loader_normalization",
+ self.normalization_rules)
+
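+ # Each rule has the form 'full[, full...] => abbr[, abbr...]'; every full
+ # term is mapped to every abbreviated term. Illustrative example:
+ # 'avenue, boulevard => ave, blvd' yields avenue->ave, avenue->blvd,
+ # boulevard->ave and boulevard->blvd.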
+ for rule in rules:
+ parts = rule.split('=>')
+ if len(parts) != 2:
+ LOG.fatal("Syntax error in abbreviation section, line: %s", rule)
+ raise UsageError("Syntax error in tokenizer configuration file.")
+
+ # Make sure all terms match the normalised version.
+ fullterms = (norm.transliterate(t.strip()) for t in parts[0].split(','))
+ abbrterms = (norm.transliterate(t.strip()) for t in parts[1].split(','))
+
+ for full, abbr in itertools.product(fullterms, abbrterms):
+ self.abbreviations[full].append(abbr)