"""
Helper class to create ICU rules from a configuration file.
"""
import io
import itertools
import logging
from collections import defaultdict
from pathlib import Path

import yaml
from icu import Transliterator

from nominatim.errors import UsageError
LOG = logging.getLogger()
+def _flatten_yaml_list(content):
+ if not content:
+ return []
+
+ if not isinstance(content, list):
+ raise UsageError("List expected in ICU yaml configuration.")
+
+ output = []
+ for ele in content:
+ if isinstance(ele, list):
+ output.extend(_flatten_yaml_list(ele))
+ else:
+ output.append(ele)
+
+ return output
+
class ICURuleLoader:
    """ Compiler for ICU rules from a tokenizer configuration file.
    """
def __init__(self, configfile):
self.configfile = configfile
+ self.compound_suffixes = set()
+ self.abbreviations = defaultdict()
if configfile.suffix == '.yaml':
self._load_from_yaml()
def get_search_rules(self):
    """ Return the ICU rules to be used during search.
        The rules combine normalization and transliteration.
    """
    # First apply the normalization rules.
    rules = io.StringIO()
    rules.write(self.normalization_rules)

    # Then add transliteration.
    rules.write(self.transliteration_rules)
    return rules.getvalue()
    # NOTE(review): a stray, unreachable 'return self.transliteration_rules'
    # followed the return above in the patch fragment — removed as residue
    # of a dropped sibling accessor.
def get_replacement_pairs(self):
    """ Return the list of possible compound decompositions with
        application of abbreviations included.
        The result is a list of pairs: the first item is the sequence to
        replace, the second is a list of replacements.
    """
    synonyms = defaultdict(set)

    # First add entries for compound decomposition: a suffix may appear
    # glued to the word ('weg ') or standalone (' weg '); both map to both.
    for suffix in self.compound_suffixes:
        variants = (suffix + ' ', ' ' + suffix + ' ')
        for key in variants:
            synonyms[key].update(variants)

    for full, abbr in self.abbreviations.items():
        key = ' ' + full + ' '
        # Entries in the abbreviation list always apply to full words.
        # (This line was missing from the patch fragment; without it no
        # abbreviation of a plain full word is ever added, contradicting
        # the docstring and making the noop below pointless.)
        synonyms[key].update((' ' + a + ' ' for a in abbr))
        # Replacements are optional, so add a noop.
        synonyms[key].add(key)

        if full in self.compound_suffixes:
            # Full word abbreviating to compounded version.
            synonyms[key].update((a + ' ' for a in abbr))

            key = full + ' '
            # Uncompounded suffix abbreviating to decompounded version.
            synonyms[key].update((' ' + a + ' ' for a in abbr))
            # Uncompounded suffix abbreviating to compounded version.
            synonyms[key].update((a + ' ' for a in abbr))

    # Sort the resulting list by descending length (longer matches are preferred).
    sorted_keys = sorted(synonyms.keys(), key=len, reverse=True)

    return [(k, list(synonyms[k])) for k in sorted_keys]
def _yaml_include_representer(self, loader, node):
    """ Handler for the custom '!include' operator in yaml files: read the
        referenced file and return its parsed yaml content.

        Relative paths are resolved against the directory of the main
        configuration file.

        NOTE(review): despite the 'representer' name, this is registered
        as a yaml *constructor* (see _load_from_yaml).
    """
    included = Path(loader.construct_scalar(node))

    if not included.is_absolute():
        included = self.configfile.parent / included

    # Parse the included file with the safe loader as well.
    return yaml.safe_load(included.read_text())
+
def _load_from_yaml(self):
    """ Parse the yaml configuration file and set up normalization and
        transliteration rules from it.
    """
    # Register the '!include' handler. NOTE(review): this mutates the
    # global yaml.SafeLoader class and binds the handler to this
    # instance — with multiple loaders the last registration wins.
    yaml.add_constructor('!include', self._yaml_include_representer,
                         Loader=yaml.SafeLoader)
    rules = yaml.safe_load(self.configfile.read_text())

    self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
    self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
def _cfg_to_icu_rules(self, rules, section):
    """ Load an ICU ruleset from the given section of the configuration
        and return it as a single semicolon-joined rule string.

        NOTE(review): the 'def' line of this method was missing from the
        reviewed fragment; the signature is reconstructed from the call
        sites in _load_from_yaml — confirm against the complete file.
    """
    content = self._get_section(rules, section)

    if content is None:
        return ''

    # Each (possibly nested) list entry is one ICU rule; terminate the
    # final rule with ';' as well.
    return ';'.join(_flatten_yaml_list(content)) + ';'
def _parse_compound_suffix_list(self, rules):
# NOTE(review): this block is a truncated patch fragment. `norm`, `parts`
# and `fullterms` are bound on lines dropped from this view, and the
# method name says "compound suffix" while the visible code fills
# self.abbreviations — lines of a sibling abbreviation parser may have
# been merged in here. Confirm against the complete file before editing.
abbrterms = (norm.transliterate(t.strip()) for t in parts[1].split(','))
for full, abbr in itertools.product(fullterms, abbrterms):
# The '-'/'+' lines below are patch residue: the change adds a guard so
# that empty full terms or empty abbreviations never enter the table.
- self.abbreviations[full].append(abbr)
+ if full and abbr:
+ self.abbreviations[full].append(abbr)