from nominatim.errors import UsageError
from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
+import nominatim.tools.country_info
LOG = logging.getLogger()
rules = config.load_sub_configuration('icu_tokenizer.yaml',
config='TOKENIZER_CONFIG')
+ # Make sure country information is available to analyzers and sanitizers.
+ nominatim.tools.country_info.setup_country_config(config)
+
self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
self.analysis_rules = _get_section(rules, 'token-analysis')
--- /dev/null
+"""
+Name processor for tagging the language of the name
+"""
+import re
+
+from nominatim.tools import country_info
+
class _AnalyzerByLanguage:
    """ Processor for tagging the language of names in a place.

        Each matching name gets an 'analyzer' attribute that is either
        taken from the language suffix of the name or, when the name has
        no suffix and defaults are enabled, from the languages of the
        country the place is in.
    """

    def __init__(self, config):
        if 'filter-kind' in config:
            self.regexes = [re.compile(regex) for regex in config['filter-kind']]
        else:
            self.regexes = None

        self.use_defaults = config.get('use-defaults', 'no')
        if self.use_defaults not in ('mono', 'all'):
            self.use_defaults = False

        self.replace = config.get('mode', 'replace') != 'append'
        self.whitelist = config.get('whitelist')

        # Compute the languages to use when no suffix is given. The table
        # stays empty when defaults are disabled, so that a lookup never
        # yields a language in that case.
        self.deflangs = {}
        if self.use_defaults:
            for ccode, prop in country_info.iterate():
                clangs = prop['languages']
                # In 'mono' mode only single-language countries get an entry.
                if len(clangs) == 1 or self.use_defaults == 'all':
                    if self.whitelist:
                        self.deflangs[ccode] = [lang for lang in clangs
                                                if lang in self.whitelist]
                    else:
                        self.deflangs[ccode] = clangs


    def _kind_matches(self, kind):
        """ Check if the name kind passes the 'filter-kind' patterns.
            Without a filter every kind matches.
        """
        if self.regexes is None:
            return True

        return any(regex.search(kind) for regex in self.regexes)


    def _suffix_matches(self, suffix):
        """ Check if the suffix is an acceptable language tag: a member of
            the whitelist or, without a whitelist, any lower-case code of
            2 or 3 characters.
        """
        if self.whitelist is None:
            return len(suffix) in (2, 3) and suffix.islower()

        return suffix in self.whitelist


    def __call__(self, obj):
        """ Tag the names of `obj` in place. Names that already have an
            'analyzer' attribute or whose kind is filtered out are left
            untouched. Additional variants are appended to `obj.names`.
        """
        if not obj.names:
            return

        more_names = []

        for name in (n for n in obj.names
                     if not n.has_attr('analyzer') and self._kind_matches(n.kind)):
            if name.suffix:
                langs = [name.suffix] if self._suffix_matches(name.suffix) else None
            else:
                # Yields None for unknown countries and whenever defaults
                # are disabled (the table is empty then).
                langs = self.deflangs.get(obj.place.country_code)

            if langs:
                if self.replace:
                    name.set_attr('analyzer', langs[0])
                else:
                    more_names.append(name.clone(attr={'analyzer': langs[0]}))

                more_names.extend(name.clone(attr={'analyzer': lang})
                                  for lang in langs[1:])

        # Extend only after the loop, so that new variants are not revisited.
        obj.names.extend(more_names)


def create(config):
    """ Create a function that sets the analyzer property depending on the
        language of the tag. The language is taken from the suffix.

        To restrict the set of languages that should be tagged, use
        'whitelist'. A list of acceptable suffixes. When unset, all 2- and
        3-letter codes are accepted.

        'use-defaults' configures what happens when the name has no suffix
        with a language tag. When set to 'all', a variant is created for
        each of the spoken languages in the country the feature is in. When
        set to 'mono', a variant is created, when only one language is spoken
        in the country. The default is, to do nothing with the default languages
        of a country.

        'mode' may be 'replace' (the default) or 'append' and configures if
        the original name (without any analyzer tagged) is retained.

        With 'filter-kind' the set of names the sanitizer should be applied
        to can be restricted to the given patterns of 'kind'. It expects a
        list of regular expressions to be matched against 'kind'.
    """
    return _AnalyzerByLanguage(config)
def configure(rules, normalization_rules):
""" Extract and preprocess the configuration for this module.
"""
- rules = rules.get('variants')
+ config = {}
+
+ config['replacements'], config['chars'] = _get_variant_config(rules.get('variants'),
+ normalization_rules)
+ config['variant_only'] = rules.get('mode', '') == 'variant-only'
+
+ return config
+
+
+def _get_variant_config(rules, normalization_rules):
+ """ Convert the variant definition from the configuration into
+ replacement sets.
+ """
immediate = defaultdict(list)
chars = set()
immediate[variant.source].append(replstr)
chars.update(variant.source)
- return {'replacements': list(immediate.items()),
- 'chars': ''.join(chars)}
+ return list(immediate.items()), ''.join(chars)
class _VariantMaker:
def __init__(self, to_ascii, config):
self.to_ascii = to_ascii
+ self.variant_only = config['variant_only']
# Set up datrie
- self.replacements = datrie.Trie(config['chars'])
- for src, repllist in config['replacements']:
- self.replacements[src] = repllist
+ if config['replacements']:
+ self.replacements = datrie.Trie(config['chars'])
+ for src, repllist in config['replacements']:
+ self.replacements[src] = repllist
+ else:
+ self.replacements = None
def get_variants_ascii(self, norm_name):
partials = ['']
startpos = 0
- pos = 0
- force_space = False
- while pos < len(baseform):
- full, repl = self.replacements.longest_prefix_item(baseform[pos:],
- (None, None))
- if full is not None:
- done = baseform[startpos:pos]
- partials = [v + done + r
- for v, r in itertools.product(partials, repl)
- if not force_space or r.startswith(' ')]
- if len(partials) > 128:
- # If too many variants are produced, they are unlikely
- # to be helpful. Only use the original term.
- startpos = 0
- break
- startpos = pos + len(full)
- if full[-1] == ' ':
- startpos -= 1
- force_space = True
- pos = startpos
- else:
- pos += 1
- force_space = False
+ if self.replacements is not None:
+ pos = 0
+ force_space = False
+ while pos < len(baseform):
+ full, repl = self.replacements.longest_prefix_item(baseform[pos:],
+ (None, None))
+ if full is not None:
+ done = baseform[startpos:pos]
+ partials = [v + done + r
+ for v, r in itertools.product(partials, repl)
+ if not force_space or r.startswith(' ')]
+ if len(partials) > 128:
+ # If too many variants are produced, they are unlikely
+ # to be helpful. Only use the original term.
+ startpos = 0
+ break
+ startpos = pos + len(full)
+ if full[-1] == ' ':
+ startpos -= 1
+ force_space = True
+ pos = startpos
+ else:
+ pos += 1
+ force_space = False
# No variants detected? Fast return.
if startpos == 0:
+ if self.variant_only:
+ return []
+
trans_name = self.to_ascii.transliterate(norm_name).strip()
return [trans_name] if trans_name else []
- return self._compute_result_set(partials, baseform[startpos:])
+ return self._compute_result_set(partials, baseform[startpos:],
+ norm_name if self.variant_only else '')
- def _compute_result_set(self, partials, prefix):
+ def _compute_result_set(self, partials, prefix, exclude):
results = set()
for variant in partials:
- vname = variant + prefix
- trans_name = self.to_ascii.transliterate(vname[1:-1]).strip()
- if trans_name:
- results.add(trans_name)
+ vname = (variant + prefix)[1:-1].strip()
+ if vname != exclude:
+ trans_name = self.to_ascii.transliterate(vname).strip()
+ if trans_name:
+ results.add(trans_name)
return list(results)
def __init__(self):
self._info = {}
+
def load(self, config):
    """ Load the country properties from the configuration files,
        if they are not loaded yet.

        After loading, the 'languages' property of every country is a
        list of language codes. A missing or empty property becomes an
        empty list, a comma-separated string is split into its parts.
    """
    if not self._info:
        self._info = config.load_sub_configuration('country_settings.yaml')
        # Convert languages into a list for simpler handling.
        for prop in self._info.values():
            langs = prop.get('languages')
            if langs is None:
                # Key is absent or was left without a value in the YAML file.
                prop['languages'] = []
            elif not isinstance(langs, list):
                # Drop empty entries caused by stray or trailing commas.
                prop['languages'] = [code for code in
                                     (x.strip() for x in langs.split(','))
                                     if code]
+
def items(self):
""" Return tuples of (country_code, property dict) as iterable.
_COUNTRY_INFO.load(config)
+def iterate():
+ """ Iterate over country code and properties.
+ """
+ return _COUNTRY_INFO.items()
+
+
def setup_country_tables(dsn, sql_dir, ignore_partitions=False):
""" Create and populate the tables with basic static data that provides
the background for geocoding. Data is assumed to not yet exist.
partition = 0
else:
partition = props.get('partition')
- if ',' in (props.get('languages', ',') or ','):
- lang = None
- else:
- lang = props['languages']
+ lang = props['languages'][0] if len(props['languages']) == 1 else None
params.append((ccode, partition, lang))
with connect(dsn) as conn:
# (Bouvet Island)
bv:
partition: 185
- languages: no
+ languages: "no"
# Botswana (Botswana)
bw:
# (Svalbard and Jan Mayen)
sj:
partition: 197
- languages: no
+ languages: "no"
# Slovakia (Slovensko)
sk:
sanitizers:
- step: split-name-list
- step: strip-brace-terms
+ - step: tag-analyzer-by-language
+ filter-kind: [".*name.*"]
+ # "no" (Norwegian) must be quoted, a plain no is read as boolean false.
+ whitelist: [bg,ca,cs,da,de,el,en,es,et,eu,fi,fr,gl,hu,it,ja,mg,ms,nl,"no",pl,pt,ro,ru,sk,sl,sv,tr,uk,vi]
+ use-defaults: all
+ mode: append
token-analysis:
- analyzer: generic
+ - id: bg
+ analyzer: generic
+ mode: variant-only
variants:
- !include icu-rules/variants-bg.yaml
+ - id: ca
+ analyzer: generic
+ mode: variant-only
+ variants:
- !include icu-rules/variants-ca.yaml
+ - id: cs
+ analyzer: generic
+ mode: variant-only
+ variants:
- !include icu-rules/variants-cs.yaml
+ - id: da
+ analyzer: generic
+ mode: variant-only
+ variants:
- !include icu-rules/variants-da.yaml
+ - id: de
+ analyzer: generic
+ mode: variant-only
+ variants:
- !include icu-rules/variants-de.yaml
+ - id: el
+ analyzer: generic
+ mode: variant-only
+ variants:
- !include icu-rules/variants-el.yaml
+ - id: en
+ analyzer: generic
+ mode: variant-only
+ variants:
- !include icu-rules/variants-en.yaml
+ - id: es
+ analyzer: generic
+ mode: variant-only
+ variants:
- !include icu-rules/variants-es.yaml
+ - id: et
+ analyzer: generic
+ mode: variant-only
+ variants:
- !include icu-rules/variants-et.yaml
+ - id: eu
+ analyzer: generic
+ mode: variant-only
+ variants:
- !include icu-rules/variants-eu.yaml
+ - id: fi
+ analyzer: generic
+ mode: variant-only
+ variants:
- !include icu-rules/variants-fi.yaml
+ - id: fr
+ analyzer: generic
+ mode: variant-only
+ variants:
- !include icu-rules/variants-fr.yaml
+ - id: gl
+ analyzer: generic
+ mode: variant-only
+ variants:
- !include icu-rules/variants-gl.yaml
+ - id: hu
+ analyzer: generic
+ mode: variant-only
+ variants:
- !include icu-rules/variants-hu.yaml
+ - id: it
+ analyzer: generic
+ mode: variant-only
+ variants:
- !include icu-rules/variants-it.yaml
+ - id: ja
+ analyzer: generic
+ mode: variant-only
+ variants:
- !include icu-rules/variants-ja.yaml
+ - id: mg
+ analyzer: generic
+ mode: variant-only
+ variants:
- !include icu-rules/variants-mg.yaml
+ - id: ms
+ analyzer: generic
+ mode: variant-only
+ variants:
- !include icu-rules/variants-ms.yaml
+ - id: nl
+ analyzer: generic
+ mode: variant-only
+ variants:
- !include icu-rules/variants-nl.yaml
+ # Quoted so that the id is the string "no" and not boolean false.
+ - id: "no"
+ analyzer: generic
+ mode: variant-only
+ variants:
- !include icu-rules/variants-no.yaml
+ - id: pl
+ analyzer: generic
+ mode: variant-only
+ variants:
- !include icu-rules/variants-pl.yaml
+ - id: pt
+ analyzer: generic
+ mode: variant-only
+ variants:
- !include icu-rules/variants-pt.yaml
+ - id: ro
+ analyzer: generic
+ mode: variant-only
+ variants:
- !include icu-rules/variants-ro.yaml
+ - id: ru
+ analyzer: generic
+ mode: variant-only
+ variants:
- !include icu-rules/variants-ru.yaml
+ - id: sk
+ analyzer: generic
+ mode: variant-only
+ variants:
- !include icu-rules/variants-sk.yaml
+ - id: sl
+ analyzer: generic
+ mode: variant-only
+ variants:
- !include icu-rules/variants-sl.yaml
+ - id: sv
+ analyzer: generic
+ mode: variant-only
+ variants:
- !include icu-rules/variants-sv.yaml
+ - id: tr
+ analyzer: generic
+ mode: variant-only
+ variants:
- !include icu-rules/variants-tr.yaml
+ - id: uk
+ analyzer: generic
+ mode: variant-only
+ variants:
- !include icu-rules/variants-uk.yaml
+ - id: vi
+ analyzer: generic
+ mode: variant-only
+ variants:
- !include icu-rules/variants-vi.yaml
Scenario: Special characters in name
Given the places
- | osm | class | type | name |
+ | osm | class | type | name+name:de |
| N1 | place | locality | Jim-Knopf-Straße |
| N2 | place | locality | Smith/Weston |
| N3 | place | locality | space mountain |
def get_normalized_variants(proc, name):
- return proc.get_variants_ascii(proc.get_normalized(name))
+ return proc.analysis[None].get_variants_ascii(proc.normalizer.transliterate(name).strip())
def test_variants_empty(cfgfile):
config = cfgfile('~street => s,st', 'master => mstr')
proc = ICURuleLoader(config).make_token_analysis()
- assert proc.get_search_normalized('Master Street') == 'master street'
- assert proc.get_search_normalized('Earnes St') == 'earnes st'
- assert proc.get_search_normalized('Nostreet') == 'nostreet'
+ assert proc.search.transliterate('Master Street').strip() == 'master street'
+ assert proc.search.transliterate('Earnes St').strip() == 'earnes st'
+ assert proc.search.transliterate('Nostreet').strip() == 'nostreet'