nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py

   1 # SPDX-License-Identifier: GPL-2.0-only
   2 #
   3 # This file is part of Nominatim. (https://nominatim.org)
   4 #
   5 # Copyright (C) 2022 by the Nominatim developer community.
   6 # For a full list of authors see the git log.
   7 """
   8 This sanitizer sets the `analyzer` property depending on the
   9 language of the tag. The language is taken from the suffix of the name.
  10 If a name already has an analyzer tagged, then this is kept.
  11
  12 Arguments:
  13
  14     filter-kind: Restrict the names the sanitizer should be applied to
  15                  to the given tags. The parameter expects a list of
  16                  regular expressions which are matched against `kind`.
  17                  Note that a match against the full string is expected.
  18     whitelist: Restrict the set of languages that should be tagged.
  19                Expects a list of acceptable suffixes. When unset,
  20                all 2- and 3-letter lower-case codes are accepted.
  21     use-defaults:  Configure what happens when the name has no suffix.
  22                    When set to 'all', a variant is created for
  23                    each of the default languages in the country
  24                    the feature is in. When set to 'mono', a variant is
  25                    only created when exactly one language is spoken
  26                    in the country. The default is to do nothing with
  27                    the default languages of a country.
  28     mode: Define how the variants are created and may be 'replace' or
  29           'append'. When set to 'append' the original name (without
  30           any analyzer tagged) is retained. (default: replace)
  31
  32 """
  33 import re
  34
  35 from nominatim.tools import country_info
  36
  37 class _AnalyzerByLanguage:
  38     """ Processor for tagging the language of names in a place.
  39     """
  40
  41     def __init__(self, config):
  42         if 'filter-kind' in config:
  43             self.regexes = [re.compile(regex) for regex in config['filter-kind']]
  44         else:
  45             self.regexes = None
  46
  47         self.replace = config.get('mode', 'replace') != 'append'
  48         self.whitelist = config.get('whitelist')
  49
  50         self.__compute_default_languages(config.get('use-defaults', 'no'))
  51
  52
  53     def __compute_default_languages(self, use_defaults):
  54         self.deflangs = {}
  55
  56         if use_defaults in ('mono', 'all'):
  57             for ccode, prop in country_info.iterate():
  58                 clangs = prop['languages']
  59                 if len(clangs) == 1 or use_defaults == 'all':
  60                     if self.whitelist:
  61                         self.deflangs[ccode] = [l for l in clangs if l in self.whitelist]
  62                     else:
  63                         self.deflangs[ccode] = clangs
  64
  65
  66     def _kind_matches(self, kind):
  67         if self.regexes is None:
  68             return True
  69
  70         return any(regex.fullmatch(kind) for regex in self.regexes)
  71
  72
  73     def _suffix_matches(self, suffix):
  74         if self.whitelist is None:
  75             return len(suffix) in (2, 3) and suffix.islower()
  76
  77         return suffix in self.whitelist
  78
  79
  80     def __call__(self, obj):
  81         if not obj.names:
  82             return
  83
  84         more_names = []
  85
  86         for name in (n for n in obj.names
  87                      if not n.has_attr('analyzer') and self._kind_matches(n.kind)):
  88             if name.suffix:
  89                 langs = [name.suffix] if self._suffix_matches(name.suffix) else None
  90             else:
  91                 langs = self.deflangs.get(obj.place.country_code)
  92
  93
  94             if langs:
  95                 if self.replace:
  96                     name.set_attr('analyzer', langs[0])
  97                 else:
  98                     more_names.append(name.clone(attr={'analyzer': langs[0]}))
  99
 100                 more_names.extend(name.clone(attr={'analyzer': l}) for l in langs[1:])
 101
 102         obj.names.extend(more_names)
 103
 104
 105 def create(config):
 106     """ Create a function that sets the analyzer property depending on the
 107         language of the tag.
 108     """
 109     return _AnalyzerByLanguage(config)