2 Helper class to create ICU rules from a configuration file.
7 from pathlib import Path
11 from icu import Transliterator
13 from nominatim.errors import UsageError
14 import nominatim.tokenizer.icu_variants as variants
16 LOG = logging.getLogger()
# Flatten a nested list from the YAML configuration into one flat list.
# Elements that are lists themselves are expanded by a recursive call and
# spliced into the result.
# Raises UsageError when `content` reaches the isinstance check and is not
# a list.
# NOTE(review): treatment of empty/None content and of non-list elements
# is assumed to be handled around these statements -- confirm.
18 def _flatten_yaml_list(content):
22 if not isinstance(content, list):
23 raise UsageError("List expected in ICU yaml configuration.")
27 if isinstance(ele, list):
28 output.extend(_flatten_yaml_list(ele))
36 """ Saves a single variant expansion.
38 An expansion consists of the normalized replacement term and
39 a dictionary of properties that describe when the expansion applies.
# Store the replacement term together with the properties that qualify it.
42 def __init__(self, replacement, properties):
43 self.replacement = replacement
# A falsy `properties` argument (None, empty dict) becomes a fresh empty dict.
44 self.properties = properties or {}
48 """ Compiler for ICU rules from a tokenizer configuration file.
# Remember the configuration file and load the rules from it.
# `configfile` must provide `.suffix` here and `.parent` / `.read_text()`
# in the loading helpers (presumably a pathlib.Path -- TODO confirm).
51 def __init__(self, configfile):
52 self.configfile = configfile
# Only the YAML format has a loader.
55 if configfile.suffix == '.yaml':
56 self._load_from_yaml()
# NOTE(review): presumably raised for every suffix other than '.yaml' -- confirm.
58 raise UsageError("Unknown format of tokenizer configuration.")
# The two rule strings are written back to back into one buffer:
# normalization first, then transliteration. No separator is inserted
# between them here.
# NOTE(review): this relies on the normalization rules already ending in
# ';' (as produced by the join in _cfg_to_icu_rules) -- confirm that this
# holds for every way a section can be specified.
61 def get_search_rules(self):
62 """ Return the ICU rules to be used during search.
63 The rules combine normalization and transliteration.
65 # First apply the normalization rules.
67 rules.write(self.normalization_rules)
69 # Then add transliteration.
70 rules.write(self.transliteration_rules)
71 return rules.getvalue()
# Plain accessor: hands out the value assigned to
# self.normalization_rules by _load_from_yaml().
73 def get_normalization_rules(self):
74 """ Return rules for normalisation of a term.
76 return self.normalization_rules
# Plain accessor: hands out the value assigned to
# self.transliteration_rules by _load_from_yaml().
78 def get_transliteration_rules(self):
79 """ Return the rules for converting a string into its ASCII representation.
81 return self.transliteration_rules
# NOTE(review): the docstring below promises a list of
# (sequence, replacements) pairs, whereas the variants collected by
# _parse_variant_list() are variants.ICUVariant objects added to
# self.variants via update(). Confirm that the docstring still matches
# what this method returns.
83 def get_replacement_pairs(self):
84 """ Return the list of possible compound decompositions with
85 application of abbreviations included.
86 The result is a list of pairs: the first item is the sequence to
87 replace, the second is a list of replacements.
# Constructor callback for the '!include' YAML tag (registered in
# _load_from_yaml). The scalar value of the node is taken as a file name;
# that file is parsed as YAML and its content is returned in place of
# the node.
91 def _yaml_include_representer(self, loader, node):
92 value = loader.construct_scalar(node)
94 if Path(value).is_absolute():
# Presumably the branch for relative names: they are resolved against the
# directory that contains the main configuration file -- TODO confirm.
97 content = (self.configfile.parent / value)
# The included file is read as UTF-8 and parsed with the safe loader, on
# which the '!include' constructor is registered as well.
99 return yaml.safe_load(content.read_text(encoding='utf-8'))
# Parse the YAML configuration file and fill in the rule attributes.
# All three sections ('normalization', 'transliteration', 'variants') are
# fetched through _get_section() and are therefore mandatory.
102 def _load_from_yaml(self):
# NOTE(review): the constructor is registered on the yaml.SafeLoader class
# with a method bound to this instance; per PyYAML this looks like a
# registration shared by all users of SafeLoader, so a later loader
# instance would replace it -- confirm this is acceptable.
103 yaml.add_constructor('!include', self._yaml_include_representer,
104 Loader=yaml.SafeLoader)
105 rules = yaml.safe_load(self.configfile.read_text(encoding='utf-8'))
107 self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
108 self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
# The normalization rules must be set before this call: the variant
# parser reads self.normalization_rules.
109 self._parse_variant_list(self._get_section(rules, 'variants'))
# The name of the missing section and the file are only written to the
# log; the UsageError itself carries a generic message.
# A section that is present but empty is returned unchanged, since only
# key membership is tested.
# NOTE(review): `rules` must support `in` and indexing. For an empty
# configuration file yaml.safe_load() is documented to return None, in
# which case the membership test would raise TypeError instead of
# UsageError -- confirm whether that case needs handling.
112 def _get_section(self, rules, section):
113 """ Get the section named 'section' from the rules. If the section does
114 not exist, raise a usage error with a meaningful message.
116 if section not in rules:
117 LOG.fatal("Section '%s' not found in tokenizer config '%s'.",
118 section, str(self.configfile))
119 raise UsageError("Syntax error in tokenizer configuration file.")
121 return rules[section]
# The section is looked up with _get_section(), so a missing section
# raises UsageError.
# In the list form the (possibly nested) list is flattened, the rules are
# joined with ';' and one more ';' is appended to terminate the last rule.
# All list entries must be strings for the join to succeed.
124 def _cfg_to_icu_rules(self, rules, section):
125 """ Load an ICU ruleset from the given section. If the section is a
126 simple string, it is interpreted as a file name and the rules are
127 loaded verbatim from the given file. The filename is expected to be
128 relative to the tokenizer rule file. If the section is a list then
129 each line is assumed to be a rule. All rules are concatenated and returned.
131 content = self._get_section(rules, section)
136 return ';'.join(_flatten_yaml_list(content)) + ';'
# Rebuild self.variants from the 'variants' part of the configuration.
# `rules` is flattened first; every resulting section is read with
# .get('words') and passed to ICUVariantProperties.from_rules(), so it is
# expected to be a mapping.
139 def _parse_variant_list(self, rules):
# Drop whatever an earlier run has collected.
140 self.variants.clear()
145 rules = _flatten_yaml_list(rules)
# The variant maker is created from the normalization rules, so that rule
# terms are normalized the same way.
147 vmaker = _VariantMaker(self.normalization_rules)
150 for section in rules:
151 # Create the property field and deduplicate against existing
153 props = variants.ICUVariantProperties.from_rules(section)
# Property objects are compared with ==.
# NOTE(review): presumably an equal existing object is reused so that
# variants share one properties instance -- confirm.
154 for existing in properties:
155 if existing == props:
159 properties.append(props)
# A section whose 'words' entry is missing or empty contributes nothing.
161 for rule in (section.get('words') or []):
162 self.variants.update(vmaker.compute(rule, props))
166 """ Generator for all necessary ICUVariants from a single variant rule.
168 All text in rules is normalized to make sure the variants match later.
# Build the ICU transliterator kept in self.norm; compute() and
# _parse_variant_word() use it to normalize the terms of a rule.
# NOTE(review): presumably created from `norm_rules` -- the remaining
# arguments of the call follow on the continuation line; confirm.
171 def __init__(self, norm_rules):
172 self.norm = Transliterator.createFromRules("rule_loader_normalization",
# Rule syntax as parsed here: "<sources> [|]<op>> <replacements>" where
# <op> is '=' or '-'. Sources and replacements are comma-separated lists.
# The rule is split on the operator; `decompose` is True when no '|'
# precedes it. A rule that does not split as expected raises UsageError.
# Source terms are parsed with _parse_variant_word(); replacement terms
# are stripped and normalized with self.norm.
# Every source is combined with every replacement (itertools.product),
# and each pair produced by _create_variants() is yielded as a
# variants.ICUVariant carrying `props`.
# NOTE(review): the extra 1:1 replacement of a source by itself is
# presumably tied to the '-' operator -- confirm.
176 def compute(self, rule, props):
177 """ Generator for all ICUVariant tuples from a single variant rule.
179 parts = re.split(r'(\|)?([=-])>', rule)
181 raise UsageError("Syntax error in variant rule: " + rule)
183 decompose = parts[1] is None
184 src_terms = [self._parse_variant_word(t) for t in parts[0].split(',')]
185 repl_terms = (self.norm.transliterate(t.strip()) for t in parts[3].split(','))
187 # If the source should be kept, add a 1:1 replacement
189 for src in src_terms:
191 for froms, tos in _create_variants(*src, src[0], decompose):
192 yield variants.ICUVariant(froms, tos, props)
194 for src, repl in itertools.product(src_terms, repl_terms):
196 for froms, tos in _create_variants(*src, repl, decompose):
197 yield variants.ICUVariant(froms, tos, props)
# Split one source term of a variant rule into
# (normalized word, prefix flag, suffix flag).
# Raises UsageError for a term that does not fit the pattern.
200 def _parse_variant_word(self, name):
# Pattern: optional leading flag '~' or '^', then the word, which must not
# contain '~', '$' or '^', then an optional trailing flag '~' or '$'.
202 match = re.fullmatch(r'([~^]?)([^~$^]*)([~$]?)', name)
# A '~' on both sides of the word is rejected as well.
203 if match is None or (match.group(1) == '~' and match.group(3) == '~'):
204 raise UsageError("Invalid variant word descriptor '{}'".format(name))
# Only the word itself is normalized; the flags are returned as matched
# (empty string when absent).
205 norm_name = self.norm.transliterate(match.group(2))
209 return norm_name, match.group(1), match.group(3)
212 _FLAG_MATCH = {'^': '^ ',
# Generate (source, replacement) string pairs for one source term and one
# replacement term. `preflag` and `postflag` are the flags returned by
# _parse_variant_word(); _FLAG_MATCH maps them to the strings attached to
# the terms.
# Three cases are visible:
#  - suffix decomposition: the postfix string is appended and variants
#    with a leading blank on source and/or replacement are produced
#    (NOTE(review): presumably selected by preflag == '~' -- confirm),
#  - prefix decomposition (postflag == '~'): the mirror image, with
#    trailing blanks,
#  - otherwise: a single pair with the prefix and postfix strings wrapped
#    around both terms.
# NOTE(review): the pairs where only one side gets the extra blank are
# presumably emitted only when `decompose` is set -- confirm.
217 def _create_variants(src, preflag, postflag, repl, decompose):
219 postfix = _FLAG_MATCH[postflag]
220 # suffix decomposition
222 repl = repl + postfix
225 yield ' ' + src, ' ' + repl
228 yield src, ' ' + repl
229 yield ' ' + src, repl
230 elif postflag == '~':
231 # prefix decomposition
232 prefix = _FLAG_MATCH[preflag]
237 yield src + ' ', repl + ' '
240 yield src, repl + ' '
241 yield src + ' ', repl
243 prefix = _FLAG_MATCH[preflag]
244 postfix = _FLAG_MATCH[postflag]
246 yield prefix + src + postfix, prefix + repl + postfix