1 # SPDX-License-Identifier: GPL-2.0-only
3 # This file is part of Nominatim. (https://nominatim.org)
5 # Copyright (C) 2022 by the Nominatim developer community.
6 # For a full list of authors see the git log.
8 Parser for configuration for variants.
10 from typing import Any, Iterator, Tuple, List, Optional, Set, NamedTuple
11 from collections import defaultdict
15 from icu import Transliterator
17 from nominatim.config import flatten_config_list
18 from nominatim.errors import UsageError
20 class ICUVariant(NamedTuple):
21 """ A single replacement rule for variant creation.
27 def get_variant_config(in_rules: Any,
28 normalization_rules: str) -> Tuple[List[Tuple[str, List[str]]], str]:
29 """ Convert the variant definition from the configuration into
32 Returns a tuple containing the replacement set and a string of all
33 characters used in the source terms of the replacements.
35 immediate = defaultdict(list)
36 chars: Set[str] = set()
39 vset: Set[ICUVariant] = set()
40 rules = flatten_config_list(in_rules, 'variants')
42 vmaker = _VariantMaker(normalization_rules)
45 for rule in (section.get('words') or []):
46 vset.update(vmaker.compute(rule))
48 # Intermediate reorder by source. Also compute required character set.
50 if variant.source[-1] == ' ' and variant.replacement[-1] == ' ':
51 replstr = variant.replacement[:-1]
53 replstr = variant.replacement
54 immediate[variant.source].append(replstr)
55 chars.update(variant.source)
57 return list(immediate.items()), ''.join(chars)
61 """ Generator for all necessary ICUVariants from a single variant rule.
63 All text in rules is normalized to make sure the variants match later.
66 def __init__(self, norm_rules: Any) -> None:
67 self.norm = Transliterator.createFromRules("rule_loader_normalization",
71 def compute(self, rule: Any) -> Iterator[ICUVariant]:
72 """ Generator for all ICUVariant tuples from a single variant rule.
74 parts = re.split(r'(\|)?([=-])>', rule)
76 raise UsageError(f"Syntax error in variant rule: {rule}")
78 decompose = parts[1] is None
79 src_terms = [self._parse_variant_word(t) for t in parts[0].split(',')]
80 repl_terms = (self.norm.transliterate(t).strip() for t in parts[3].split(','))
82 # If the source should be kept, add a 1:1 replacement
86 for froms, tos in _create_variants(*src, src[0], decompose):
87 yield ICUVariant(froms, tos)
89 for src, repl in itertools.product(src_terms, repl_terms):
91 for froms, tos in _create_variants(*src, repl, decompose):
92 yield ICUVariant(froms, tos)
95 def _parse_variant_word(self, name: str) -> Optional[Tuple[str, str, str]]:
97 match = re.fullmatch(r'([~^]?)([^~$^]*)([~$]?)', name)
98 if match is None or (match.group(1) == '~' and match.group(3) == '~'):
99 raise UsageError(f"Invalid variant word descriptor '{name}'")
100 norm_name = self.norm.transliterate(match.group(2)).strip()
104 return norm_name, match.group(1), match.group(3)
107 _FLAG_MATCH = {'^': '^ ',
112 def _create_variants(src: str, preflag: str, postflag: str,
113 repl: str, decompose: bool) -> Iterator[Tuple[str, str]]:
115 postfix = _FLAG_MATCH[postflag]
116 # suffix decomposition
118 repl = repl + postfix
121 yield ' ' + src, ' ' + repl
124 yield src, ' ' + repl
125 yield ' ' + src, repl
126 elif postflag == '~':
127 # prefix decomposition
128 prefix = _FLAG_MATCH[preflag]
133 yield src + ' ', repl + ' '
136 yield src, repl + ' '
137 yield src + ' ', repl
139 prefix = _FLAG_MATCH[preflag]
140 postfix = _FLAG_MATCH[postflag]
142 yield prefix + src + postfix, prefix + repl + postfix