1 # SPDX-License-Identifier: GPL-2.0-only
3 # This file is part of Nominatim. (https://nominatim.org)
5 # Copyright (C) 2022 by the Nominatim developer community.
6 # For a full list of authors see the git log.
8 Parser for configuration for variants.
10 from collections import defaultdict, namedtuple
14 from icu import Transliterator
16 from nominatim.config import flatten_config_list
17 from nominatim.errors import UsageError
19 ICUVariant = namedtuple('ICUVariant', ['source', 'replacement'])
21 def get_variant_config(rules, normalization_rules):
22 """ Convert the variant definition from the configuration into
25 Returns a tuple containing the replacement set and the list of characters
26 used in the replacements.
28 immediate = defaultdict(list)
33 rules = flatten_config_list(rules, 'variants')
35 vmaker = _VariantMaker(normalization_rules)
38 for rule in (section.get('words') or []):
39 vset.update(vmaker.compute(rule))
41 # Intermediate reorder by source. Also compute required character set.
43 if variant.source[-1] == ' ' and variant.replacement[-1] == ' ':
44 replstr = variant.replacement[:-1]
46 replstr = variant.replacement
47 immediate[variant.source].append(replstr)
48 chars.update(variant.source)
50 return list(immediate.items()), ''.join(chars)
54 """ Generator for all necessary ICUVariants from a single variant rule.
56 All text in rules is normalized to make sure the variants match later.
59 def __init__(self, norm_rules):
60 self.norm = Transliterator.createFromRules("rule_loader_normalization",
64 def compute(self, rule):
65 """ Generator for all ICUVariant tuples from a single variant rule.
67 parts = re.split(r'(\|)?([=-])>', rule)
69 raise UsageError(f"Syntax error in variant rule: {rule}")
71 decompose = parts[1] is None
72 src_terms = [self._parse_variant_word(t) for t in parts[0].split(',')]
73 repl_terms = (self.norm.transliterate(t).strip() for t in parts[3].split(','))
75 # If the source should be kept, add a 1:1 replacement
79 for froms, tos in _create_variants(*src, src[0], decompose):
80 yield ICUVariant(froms, tos)
82 for src, repl in itertools.product(src_terms, repl_terms):
84 for froms, tos in _create_variants(*src, repl, decompose):
85 yield ICUVariant(froms, tos)
88 def _parse_variant_word(self, name):
90 match = re.fullmatch(r'([~^]?)([^~$^]*)([~$]?)', name)
91 if match is None or (match.group(1) == '~' and match.group(3) == '~'):
92 raise UsageError(f"Invalid variant word descriptor '{name}'")
93 norm_name = self.norm.transliterate(match.group(2)).strip()
97 return norm_name, match.group(1), match.group(3)
100 _FLAG_MATCH = {'^': '^ ',
105 def _create_variants(src, preflag, postflag, repl, decompose):
107 postfix = _FLAG_MATCH[postflag]
108 # suffix decomposition
110 repl = repl + postfix
113 yield ' ' + src, ' ' + repl
116 yield src, ' ' + repl
117 yield ' ' + src, repl
118 elif postflag == '~':
119 # prefix decomposition
120 prefix = _FLAG_MATCH[preflag]
125 yield src + ' ', repl + ' '
128 yield src, repl + ' '
129 yield src + ' ', repl
131 prefix = _FLAG_MATCH[preflag]
132 postfix = _FLAG_MATCH[postflag]
134 yield prefix + src + postfix, prefix + repl + postfix