nominatim/tokenizer/token_analysis/config_variants.py

   1 # SPDX-License-Identifier: GPL-2.0-only
   2 #
   3 # This file is part of Nominatim. (https://nominatim.org)
   4 #
   5 # Copyright (C) 2022 by the Nominatim developer community.
   6 # For a full list of authors see the git log.
   7 """
   8 Parser for configuration for variants.
   9 """
  10 from typing import Any, Iterator, Tuple, List, Optional, Set, NamedTuple
  11 from collections import defaultdict
  12 import itertools
  13 import re
  14
  15 from icu import Transliterator
  16
  17 from nominatim.config import flatten_config_list
  18 from nominatim.errors import UsageError
  19
  20 class ICUVariant(NamedTuple):
  21     """ A single replacement rule for variant creation.
  22     """
  23     source: str
  24     replacement: str
  25
  26
  27 def get_variant_config(in_rules: Any,
  28                        normalization_rules: str) -> Tuple[List[Tuple[str, List[str]]], str]:
  29     """ Convert the variant definition from the configuration into
  30         replacement sets.
  31
  32         Returns a tuple containing the replacement set and the list of characters
  33         used in the replacements.
  34     """
  35     immediate = defaultdict(list)
  36     chars: Set[str] = set()
  37
  38     if in_rules:
  39         vset: Set[ICUVariant] = set()
  40         rules = flatten_config_list(in_rules, 'variants')
  41
  42         vmaker = _VariantMaker(normalization_rules)
  43
  44         for section in rules:
  45             for rule in (section.get('words') or []):
  46                 vset.update(vmaker.compute(rule))
  47
  48         # Intermediate reorder by source. Also compute required character set.
  49         for variant in vset:
  50             if variant.source[-1] == ' ' and variant.replacement[-1] == ' ':
  51                 replstr = variant.replacement[:-1]
  52             else:
  53                 replstr = variant.replacement
  54             immediate[variant.source].append(replstr)
  55             chars.update(variant.source)
  56
  57     return list(immediate.items()), ''.join(chars)
  58
  59
  60 class _VariantMaker:
  61     """ Generater for all necessary ICUVariants from a single variant rule.
  62
  63         All text in rules is normalized to make sure the variants match later.
  64     """
  65
  66     def __init__(self, norm_rules: Any) -> None:
  67         self.norm = Transliterator.createFromRules("rule_loader_normalization",
  68                                                    norm_rules)
  69
  70
  71     def compute(self, rule: Any) -> Iterator[ICUVariant]:
  72         """ Generator for all ICUVariant tuples from a single variant rule.
  73         """
  74         parts = re.split(r'(\|)?([=-])>', rule)
  75         if len(parts) != 4:
  76             raise UsageError(f"Syntax error in variant rule: {rule}")
  77
  78         decompose = parts[1] is None
  79         src_terms = [self._parse_variant_word(t) for t in parts[0].split(',')]
  80         repl_terms = (self.norm.transliterate(t).strip() for t in parts[3].split(','))
  81
  82         # If the source should be kept, add a 1:1 replacement
  83         if parts[2] == '-':
  84             for src in src_terms:
  85                 if src:
  86                     for froms, tos in _create_variants(*src, src[0], decompose):
  87                         yield ICUVariant(froms, tos)
  88
  89         for src, repl in itertools.product(src_terms, repl_terms):
  90             if src and repl:
  91                 for froms, tos in _create_variants(*src, repl, decompose):
  92                     yield ICUVariant(froms, tos)
  93
  94
  95     def _parse_variant_word(self, name: str) -> Optional[Tuple[str, str, str]]:
  96         name = name.strip()
  97         match = re.fullmatch(r'([~^]?)([^~$^]*)([~$]?)', name)
  98         if match is None or (match.group(1) == '~' and match.group(3) == '~'):
  99             raise UsageError(f"Invalid variant word descriptor '{name}'")
 100         norm_name = self.norm.transliterate(match.group(2)).strip()
 101         if not norm_name:
 102             return None
 103
 104         return norm_name, match.group(1), match.group(3)
 105
 106
 107 _FLAG_MATCH = {'^': '^ ',
 108                '$': ' ^',
 109                '': ' '}
 110
 111
 112 def _create_variants(src: str, preflag: str, postflag: str,
 113                      repl: str, decompose: bool) -> Iterator[Tuple[str, str]]:
 114     if preflag == '~':
 115         postfix = _FLAG_MATCH[postflag]
 116         # suffix decomposition
 117         src = src + postfix
 118         repl = repl + postfix
 119
 120         yield src, repl
 121         yield ' ' + src, ' ' + repl
 122
 123         if decompose:
 124             yield src, ' ' + repl
 125             yield ' ' + src, repl
 126     elif postflag == '~':
 127         # prefix decomposition
 128         prefix = _FLAG_MATCH[preflag]
 129         src = prefix + src
 130         repl = prefix + repl
 131
 132         yield src, repl
 133         yield src + ' ', repl + ' '
 134
 135         if decompose:
 136             yield src, repl + ' '
 137             yield src + ' ', repl
 138     else:
 139         prefix = _FLAG_MATCH[preflag]
 140         postfix = _FLAG_MATCH[postflag]
 141
 142         yield prefix + src + postfix, prefix + repl + postfix