Adds a mandatory section 'analyzer' to the token-analysis entries
which defines which analyzer to use. Currently there is exactly
one, 'generic', which implements the former ICUNameProcessor.
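For illustration, a token-analysis entry under the new scheme names its analyzer explicitly. The shape below mirrors the YAML hunk and the test fixture further down (the empty lists are placeholders):

    cfgstr = {'normalization': [],
              'transliteration': [],
              'token-analysis': [{'analyzer': 'generic',
                                  'variants': [{'words': []}]}]}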
"""
Helper class to create ICU rules from a configuration file.
"""
"""
Helper class to create ICU rules from a configuration file.
"""
+import importlib
import io
import json
import logging
from nominatim.config import flatten_config_list
from nominatim.db.properties import set_property, get_property
from nominatim.errors import UsageError
-from nominatim.tokenizer.icu_name_processor import ICUNameProcessor
from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
import nominatim.tokenizer.icu_variants as variants
DBCFG_IMPORT_ANALYSIS_RULES = "tokenizer_import_analysis_rules"
+def _get_section(rules, section):
+ """ Get the section named 'section' from the rules. If the section does
+ not exist, raise a usage error with a meaningful message.
+ """
+ if section not in rules:
+ LOG.fatal("Section '%s' not found in tokenizer config.", section)
+ raise UsageError("Syntax error in tokenizer configuration file.")
+
+ return rules[section]
+
+
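A quick sketch of the helper's contract, with made-up rule values (only the error behaviour is taken from the code above):

    rules = {'normalization': [], 'transliteration': []}
    _get_section(rules, 'normalization')     # -> []
    _get_section(rules, 'token-analysis')    # logs a fatal message, raises UsageError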
class VariantRule:
""" Saves a single variant expansion.
self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
- self.analysis_rules = self._get_section(rules, 'token-analysis')
+ self.analysis_rules = _get_section(rules, 'token-analysis')
self._setup_analysis()
# Load optional sanitizer rule set.
- def _get_section(rules, section):
- """ Get the section named 'section' from the rules. If the section does
- not exist, raise a usage error with a meaningful message.
- """
- if section not in rules:
- LOG.fatal("Section '%s' not found in tokenizer config.", section)
- raise UsageError("Syntax error in tokenizer configuration file.")
-
- return rules[section]
-
-
- def _cfg_to_icu_rules(self, rules, section):
+ def _cfg_to_icu_rules(rules, section):
""" Load an ICU ruleset from the given section. If the section is a
simple string, it is interpreted as a file name and the rules are
loaded verbatim from the given file. The filename is expected to be
relative to the tokenizer rule file. If the section is a list then
each line is assumed to be a rule. All rules are concatenated and returned.
"""
""" Load an ICU ruleset from the given section. If the section is a
simple string, it is interpreted as a file name and the rules are
loaded verbatim from the given file. The filename is expected to be
relative to the tokenizer rule file. If the section is a list then
each line is assumed to be a rule. All rules are concatenated and returned.
"""
- content = self._get_section(rules, section)
+ content = _get_section(rules, section)
if content is None:
return ''
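A sketch of the two input forms the docstring describes; the rule strings and the file name are borrowed from the tests further down:

    # String form: interpreted as a file name relative to the tokenizer rule file.
    rules = {'transliteration': 'transliteration.yaml'}

    # List form: each entry is one ICU rule; all rules are concatenated.
    rules = {'transliteration': [":: Latin ()",
                                 "[[:Punctuation:][:Space:]]+ > ' '"]}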
"""
def __init__(self, rules, normalization_rules):
"""
def __init__(self, rules, normalization_rules):
+ # Find the analysis module
+ module_name = 'nominatim.tokenizer.token_analysis.' \
+ + _get_section(rules, 'analyzer').replace('-', '_')
+ analysis_mod = importlib.import_module(module_name)
+ self._mod_create = analysis_mod.create
+
+ # Load the configuration.
+ self.config = {}
self._parse_variant_list(rules.get('variants'), normalization_rules)
def create(self, normalization_rules, transliteration_rules):
""" Create an analyzer from the given rules.
"""
- return ICUNameProcessor(normalization_rules,
+ return self._mod_create(normalization_rules,
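The analyzer lookup boils down to a plain importlib call with hyphens mapped to underscores. A sketch (only 'generic' ships with this commit; 'my-analyzer' is hypothetical):

    import importlib

    name = 'my-analyzer'   # value of the mandatory 'analyzer' field (hypothetical)
    module_name = 'nominatim.tokenizer.token_analysis.' + name.replace('-', '_')
    # -> 'nominatim.tokenizer.token_analysis.my_analyzer'
    analysis_mod = importlib.import_module(module_name)
    # The module must expose a create() factory, as generic.py does below.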
def _parse_variant_list(self, rules, normalization_rules):
properties.append(props)
for rule in (section.get('words') or []):
- self.variants.update(vmaker.compute(rule, props))
+ vset.update(vmaker.compute(rule, props))
+
+ self.config['variants'] = vset
-Processor for names that are imported into the database based on the
-ICU library.
+Generic processor for names that creates abbreviation variants.
"""
from collections import defaultdict
import itertools
"""
from collections import defaultdict
import itertools
from icu import Transliterator
import datrie
+def create(norm_rules, trans_rules, config):
+ """ Create a new token analysis instance for this module.
+ """
+ return GenericTokenAnalysis(norm_rules, trans_rules, config['variants'])
+
+
+class GenericTokenAnalysis:
""" Collects the different transformation rules for normalisation of names
and provides the functions to apply the transformations.
"""
""" Collects the different transformation rules for normalisation of names
and provides the functions to apply the transformations.
"""
- step: split-name-list
- step: strip-brace-terms
token-analysis:
+ - analyzer: generic
+ variants:
- !include icu-rules/variants-bg.yaml
- !include icu-rules/variants-ca.yaml
- !include icu-rules/variants-cs.yaml
cfgstr = {'normalization': list(norm),
'sanitizers': sanitizers,
'transliteration': list(trans),
- 'token-analysis': [{'variants': [{'words': list(variants)}]}]}
+ 'token-analysis': [{'analyzer': 'generic',
+ 'variants': [{'words': list(variants)}]}]}
(test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr))
tok.loader = ICURuleLoader(test_config)
- ":: Latin ()"
- "[[:Punctuation:][:Space:]]+ > ' '"
""")
- ":: Latin ()"
- "[[:Punctuation:][:Space:]]+ > ' '"
""")
- content += "token-analysis:\n - variants:\n - words:\n"
+ content += "token-analysis:\n - analyzer: generic\n variants:\n - words:\n"
content += '\n'.join((" - " + s for s in variants)) + '\n'
for k, v in kwargs.items():
    content += " {}: {}\n".format(k, v)
normalization:
transliteration:
token-analysis:
+ - analyzer: generic
+ variants:
"""))
rules = ICURuleLoader(test_config)
"""))
rules = ICURuleLoader(test_config)
- "'ax' > 'b'"
- !include transliteration.yaml
token-analysis:
- "'ax' > 'b'"
- !include transliteration.yaml
token-analysis:
+ - analyzer: generic
+ variants:
"""))
transpath = test_config.project_dir / ('transliteration.yaml')
transpath.write_text('- "x > y"')
"""))
transpath = test_config.project_dir / ('transliteration.yaml')
transpath.write_text('- "x > y"')
def get_replacements(self, *variants):
loader = ICURuleLoader(self.cfgrules(*variants))
- rules = loader.analysis[None].variants
+ rules = loader.analysis[None].config['variants']
return set((v.source, v.replacement) for v in rules)
- ":: Latin ()"
- "'🜵' > ' '"
""")
- ":: Latin ()"
- "'🜵' > ' '"
""")
- content += "token-analysis:\n - variants:\n - words:\n"
+ content += "token-analysis:\n - analyzer: generic\n variants:\n - words:\n"
content += '\n'.join((" - " + s for s in variants)) + '\n'
for k, v in kwargs.items():
    content += " {}: {}\n".format(k, v)