"""
This sanitizer sets the `analyzer` property depending on the
language of the tag. The language is taken from the suffix of the name.
If a name already has an analyzer tagged, then this is kept.

Arguments:

    filter-kind: Restrict the names the sanitizer should be applied to
                 to the given tags. The parameter expects a list of
                 regular expressions which are matched against `kind`.
                 Note that a match against the full string is expected.
    whitelist: Restrict the set of languages that should be tagged.
               Expects a list of acceptable suffixes. When unset,
               all 2- and 3-letter lower-case codes are accepted.
    use-defaults: Configure what happens when the name has no suffix.
                  When set to 'all', a variant is created for
                  each of the default languages in the country
                  the feature is in. When set to 'mono', a variant is
                  only created when exactly one language is spoken
                  in the country. The default is to do nothing with
                  the default languages of a country.
    mode: Define how the variants are created. May be 'replace' or
          'append'. When set to 'append', the original name (without
          any analyzer tagged) is retained. (default: replace)
"""
import re

from nominatim.tools import country_info
class _AnalyzerByLanguage:
    """ Processor for tagging the language of names in a place.

        Instances are callable. When called with a place object, each
        name that has no `analyzer` attribute yet and whose kind passes
        the configured filter gets an `analyzer` attribute derived from
        its suffix or, when it has no suffix, from the default languages
        of the place's country.
    """

    def __init__(self, config):
        """ Read the sanitizer options from `config`.

            The keys consulted are 'filter-kind', 'mode', 'whitelist' and
            'use-defaults'. All of them are optional.
        """
        # Compiled patterns for the name kinds to process; None means
        # that no filter was configured.
        if 'filter-kind' in config:
            self.regexes = [re.compile(regex) for regex in config['filter-kind']]
        else:
            self.regexes = None

        # Every mode other than 'append' is treated as 'replace'.
        self.replace = config.get('mode', 'replace') != 'append'
        # Acceptable language suffixes or None when unrestricted.
        self.whitelist = config.get('whitelist')

        self.__compute_default_languages(config.get('use-defaults', 'no'))


    def __compute_default_languages(self, use_defaults):
        """ Fill `self.deflangs`, a mapping from country code to the list
            of languages to use for names without a suffix.

            The mapping stays empty unless `use_defaults` is 'mono' or
            'all'. With 'mono' only countries with exactly one language
            get an entry, with 'all' every country does.
        """
        self.deflangs = {}

        if use_defaults not in ('mono', 'all'):
            return

        for ccode, prop in country_info.iterate():
            clangs = prop['languages']
            if len(clangs) == 1 or use_defaults == 'all':
                if self.whitelist:
                    # Only whitelisted languages may be used as defaults.
                    # The result may be an empty list, which __call__
                    # treats like a missing entry.
                    self.deflangs[ccode] = [lang for lang in clangs
                                            if lang in self.whitelist]
                else:
                    self.deflangs[ccode] = clangs


    def _kind_matches(self, kind):
        """ Return True when names of the given kind should be processed.

            Without a configured filter every kind matches. Otherwise one
            of the patterns must match the complete kind string.
        """
        if self.regexes is None:
            return True

        return any(regex.fullmatch(kind) for regex in self.regexes)


    def _suffix_matches(self, suffix):
        """ Return True when `suffix` is an acceptable language code.

            Without a whitelist, any lower-case suffix of two or three
            characters is accepted. Otherwise the suffix must be listed
            in the whitelist.
        """
        if self.whitelist is None:
            return len(suffix) in (2, 3) and suffix.islower()

        return suffix in self.whitelist


    def __call__(self, obj):
        """ Tag the names in `obj.names` with an `analyzer` attribute.

            In replace mode the name itself receives the first language
            and a clone is added for each further language. In append
            mode the name is left untouched and a clone is added for
            every language. New clones are appended to `obj.names`.
        """
        if not obj.names:
            return

        # Clones are collected separately so that the list of names is
        # not modified while it is being iterated.
        more_names = []

        for name in obj.names:
            if name.has_attr('analyzer') or not self._kind_matches(name.kind):
                continue

            if name.suffix:
                # A suffix that is not an acceptable language code leaves
                # the name untagged; the country defaults are not used.
                langs = [name.suffix] if self._suffix_matches(name.suffix) else None
            else:
                langs = self.deflangs.get(obj.place.country_code)

            if not langs:
                continue

            if self.replace:
                name.set_attr('analyzer', langs[0])
            else:
                more_names.append(name.clone(attr={'analyzer': langs[0]}))

            more_names.extend(name.clone(attr={'analyzer': lang})
                              for lang in langs[1:])

        obj.names.extend(more_names)
def create(config):
    """ Create a function that sets the analyzer property depending on the
        language of the tag.

        `config` is handed on unchanged to the constructor of
        `_AnalyzerByLanguage`; the new instance is returned.
    """
    return _AnalyzerByLanguage(config)