2 Helper class to create ICU rules from a configuration file.
import io
import itertools
import logging
import re

from icu import Transliterator

from nominatim.errors import UsageError
import nominatim.tokenizer.icu_variants as variants
14 LOG = logging.getLogger()
16 def _flatten_config_list(content):
20 if not isinstance(content, list):
21 raise UsageError("List expected in ICU configuration.")
25 if isinstance(ele, list):
26 output.extend(_flatten_config_list(ele))
34 """ Saves a single variant expansion.
36 An expansion consists of the normalized replacement term and
a dictionary of properties that describe when the expansion applies.
40 def __init__(self, replacement, properties):
41 self.replacement = replacement
42 self.properties = properties or {}
46 """ Compiler for ICU rules from a tokenizer configuration file.
49 def __init__(self, rules):
52 self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
53 self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
54 self._parse_variant_list(self._get_section(rules, 'variants'))
57 def get_search_rules(self):
58 """ Return the ICU rules to be used during search.
59 The rules combine normalization and transliteration.
61 # First apply the normalization rules.
63 rules.write(self.normalization_rules)
65 # Then add transliteration.
66 rules.write(self.transliteration_rules)
67 return rules.getvalue()
69 def get_normalization_rules(self):
70 """ Return rules for normalisation of a term.
72 return self.normalization_rules
74 def get_transliteration_rules(self):
75 """ Return the rules for converting a string into its asciii representation.
77 return self.transliteration_rules
79 def get_replacement_pairs(self):
80 """ Return the list of possible compound decompositions with
81 application of abbreviations included.
82 The result is a list of pairs: the first item is the sequence to
83 replace, the second is a list of replacements.
89 def _get_section(rules, section):
90 """ Get the section named 'section' from the rules. If the section does
91 not exist, raise a usage error with a meaningful message.
93 if section not in rules:
94 LOG.fatal("Section '%s' not found in tokenizer config.", section)
95 raise UsageError("Syntax error in tokenizer configuration file.")
100 def _cfg_to_icu_rules(self, rules, section):
101 """ Load an ICU ruleset from the given section. If the section is a
102 simple string, it is interpreted as a file name and the rules are
103 loaded verbatim from the given file. The filename is expected to be
104 relative to the tokenizer rule file. If the section is a list then
105 each line is assumed to be a rule. All rules are concatenated and returned.
107 content = self._get_section(rules, section)
112 return ';'.join(_flatten_config_list(content)) + ';'
115 def _parse_variant_list(self, rules):
116 self.variants.clear()
121 rules = _flatten_config_list(rules)
123 vmaker = _VariantMaker(self.normalization_rules)
126 for section in rules:
127 # Create the property field and deduplicate against existing
129 props = variants.ICUVariantProperties.from_rules(section)
130 for existing in properties:
131 if existing == props:
135 properties.append(props)
137 for rule in (section.get('words') or []):
138 self.variants.update(vmaker.compute(rule, props))
142 """ Generater for all necessary ICUVariants from a single variant rule.
144 All text in rules is normalized to make sure the variants match later.
147 def __init__(self, norm_rules):
148 self.norm = Transliterator.createFromRules("rule_loader_normalization",
152 def compute(self, rule, props):
153 """ Generator for all ICUVariant tuples from a single variant rule.
155 parts = re.split(r'(\|)?([=-])>', rule)
157 raise UsageError("Syntax error in variant rule: " + rule)
159 decompose = parts[1] is None
160 src_terms = [self._parse_variant_word(t) for t in parts[0].split(',')]
161 repl_terms = (self.norm.transliterate(t.strip()) for t in parts[3].split(','))
163 # If the source should be kept, add a 1:1 replacement
165 for src in src_terms:
167 for froms, tos in _create_variants(*src, src[0], decompose):
168 yield variants.ICUVariant(froms, tos, props)
170 for src, repl in itertools.product(src_terms, repl_terms):
172 for froms, tos in _create_variants(*src, repl, decompose):
173 yield variants.ICUVariant(froms, tos, props)
176 def _parse_variant_word(self, name):
178 match = re.fullmatch(r'([~^]?)([^~$^]*)([~$]?)', name)
179 if match is None or (match.group(1) == '~' and match.group(3) == '~'):
180 raise UsageError("Invalid variant word descriptor '{}'".format(name))
181 norm_name = self.norm.transliterate(match.group(2))
185 return norm_name, match.group(1), match.group(3)
188 _FLAG_MATCH = {'^': '^ ',
193 def _create_variants(src, preflag, postflag, repl, decompose):
195 postfix = _FLAG_MATCH[postflag]
196 # suffix decomposition
198 repl = repl + postfix
201 yield ' ' + src, ' ' + repl
204 yield src, ' ' + repl
205 yield ' ' + src, repl
206 elif postflag == '~':
207 # prefix decomposition
208 prefix = _FLAG_MATCH[preflag]
213 yield src + ' ', repl + ' '
216 yield src, repl + ' '
217 yield src + ' ', repl
219 prefix = _FLAG_MATCH[preflag]
220 postfix = _FLAG_MATCH[postflag]
222 yield prefix + src + postfix, prefix + repl + postfix