git.openstreetmap.org Git - nominatim.git/commitdiff
make token analyzers configurable modules
author Sarah Hoffmann <lonvia@denofr.de>
Mon, 4 Oct 2021 15:34:30 +0000 (17:34 +0200)
committer Sarah Hoffmann <lonvia@denofr.de>
Mon, 4 Oct 2021 15:37:34 +0000 (17:37 +0200)
Adds a mandatory section 'analyzer' to the token-analysis entries
which defines which analyzer to use. Currently there is exactly
one, 'generic', which implements the former ICUNameProcessor.
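
For illustration, the resulting configuration shape (a minimal sketch, not part of the
commit; it mirrors the Python dict used in the test fixture further down, with the rule
lists left empty):

    # Sketch of the ICU tokenizer configuration after this change, expressed as the
    # Python dict that the tests dump to icu_tokenizer.yaml. Rule lists are left
    # empty here purely for illustration.
    cfgstr = {'normalization': [],
              'transliteration': [],
              'sanitizers': [],
              'token-analysis': [{'analyzer': 'generic',   # new mandatory key: selects the module
                                  'variants': [{'words': []}]}]}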

nominatim/tokenizer/icu_rule_loader.py
nominatim/tokenizer/token_analysis/__init__.py [new file with mode: 0644]
nominatim/tokenizer/token_analysis/generic.py [moved from nominatim/tokenizer/icu_name_processor.py with 92% similarity]
settings/icu_tokenizer.yaml
test/python/test_tokenizer_icu.py
test/python/test_tokenizer_icu_rule_loader.py
test/python/tokenizer/token_analysis/test_generic.py [moved from test/python/test_tokenizer_icu_name_processor.py with 97% similarity]

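The loader resolves the configured analyzer name to a Python module at import time. A
minimal sketch of that lookup (the helper name load_analysis_module is hypothetical; the
actual code is added to TokenAnalyzerRule.__init__ in the first diff below):

    import importlib

    def load_analysis_module(analyzer_name):
        # 'generic' resolves to nominatim.tokenizer.token_analysis.generic;
        # dashes in the configured name become underscores in the module name.
        module_name = 'nominatim.tokenizer.token_analysis.' + analyzer_name.replace('-', '_')
        return importlib.import_module(module_name)

    # The imported module is expected to expose create(norm_rules, trans_rules, config),
    # which returns the actual token analysis object (see generic.py below).
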
diff --git a/nominatim/tokenizer/icu_rule_loader.py b/nominatim/tokenizer/icu_rule_loader.py
index cf72520953456e9318576f51d9fc7acc280d668e..a8bdba933637195b3a62a687142b3c08752a3cc8 100644 (file)
@@ -1,6 +1,7 @@
 """
 Helper class to create ICU rules from a configuration file.
 """
 """
 Helper class to create ICU rules from a configuration file.
 """
+import importlib
 import io
 import json
 import logging
@@ -12,7 +13,6 @@ from icu import Transliterator
 from nominatim.config import flatten_config_list
 from nominatim.db.properties import set_property, get_property
 from nominatim.errors import UsageError
-from nominatim.tokenizer.icu_name_processor import ICUNameProcessor
 from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
 import nominatim.tokenizer.icu_variants as variants
 
@@ -23,6 +23,17 @@ DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
 DBCFG_IMPORT_ANALYSIS_RULES = "tokenizer_import_analysis_rules"
 
 
+def _get_section(rules, section):
+    """ Get the section named 'section' from the rules. If the section does
+        not exist, raise a usage error with a meaningful message.
+    """
+    if section not in rules:
+        LOG.fatal("Section '%s' not found in tokenizer config.", section)
+        raise UsageError("Syntax error in tokenizer configuration file.")
+
+    return rules[section]
+
+
 class VariantRule:
     """ Saves a single variant expansion.
 
@@ -45,7 +56,7 @@ class ICURuleLoader:
 
         self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
         self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
-        self.analysis_rules = self._get_section(rules, 'token-analysis')
+        self.analysis_rules = _get_section(rules, 'token-analysis')
         self._setup_analysis()
 
         # Load optional sanitizer rule set.
@@ -130,25 +141,14 @@ class ICURuleLoader:
 
 
     @staticmethod
-    def _get_section(rules, section):
-        """ Get the section named 'section' from the rules. If the section does
-            not exist, raise a usage error with a meaningful message.
-        """
-        if section not in rules:
-            LOG.fatal("Section '%s' not found in tokenizer config.", section)
-            raise UsageError("Syntax error in tokenizer configuration file.")
-
-        return rules[section]
-
-
-    def _cfg_to_icu_rules(self, rules, section):
+    def _cfg_to_icu_rules(rules, section):
         """ Load an ICU ruleset from the given section. If the section is a
             simple string, it is interpreted as a file name and the rules are
             loaded verbatim from the given file. The filename is expected to be
             relative to the tokenizer rule file. If the section is a list then
             each line is assumed to be a rule. All rules are concatenated and returned.
         """
         """ Load an ICU ruleset from the given section. If the section is a
             simple string, it is interpreted as a file name and the rules are
             loaded verbatim from the given file. The filename is expected to be
             relative to the tokenizer rule file. If the section is a list then
             each line is assumed to be a rule. All rules are concatenated and returned.
         """
-        content = self._get_section(rules, section)
+        content = _get_section(rules, section)
 
         if content is None:
             return ''
@@ -162,19 +162,27 @@ class TokenAnalyzerRule:
     """
 
     def __init__(self, rules, normalization_rules):
     """
 
     def __init__(self, rules, normalization_rules):
+        # Find the analysis module
+        module_name = 'nominatim.tokenizer.token_analysis.' \
+                      + _get_section(rules, 'analyzer').replace('-', '_')
+        analysis_mod = importlib.import_module(module_name)
+        self._mod_create = analysis_mod.create
+
+        # Load the configuration.
+        self.config = {}
         self._parse_variant_list(rules.get('variants'), normalization_rules)
 
 
     def create(self, normalization_rules, transliteration_rules):
         """ Create an analyzer from the given rules.
         """
-        return ICUNameProcessor(normalization_rules,
+        return self._mod_create(normalization_rules,
                                 transliteration_rules,
-                                self.variants)
+                                self.config)
 
 
     def _parse_variant_list(self, rules, normalization_rules):
-        self.variants = set()
+        vset = set()
 
         if not rules:
             return
@@ -196,7 +204,9 @@ class TokenAnalyzerRule:
                 properties.append(props)
 
             for rule in (section.get('words') or []):
-                self.variants.update(vmaker.compute(rule, props))
+                vset.update(vmaker.compute(rule, props))
+
+        self.config['variants'] = vset
 
 
 class _VariantMaker:
diff --git a/nominatim/tokenizer/token_analysis/__init__.py b/nominatim/tokenizer/token_analysis/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/nominatim/tokenizer/icu_name_processor.py b/nominatim/tokenizer/token_analysis/generic.py
similarity index 92%
rename from nominatim/tokenizer/icu_name_processor.py
rename to nominatim/tokenizer/token_analysis/generic.py
index 544f5ebce9bf8f3d9b9e477d5689e1a142e65853..2c720f1d6af9e38b93e6eea95f4ed70de866235a 100644 (file)
@@ -1,6 +1,5 @@
 """
 """
-Processor for names that are imported into the database based on the
-ICU library.
+Generic processor for names that creates abbreviation variants.
 """
 from collections import defaultdict
 import itertools
 """
 from collections import defaultdict
 import itertools
@@ -8,8 +7,15 @@ import itertools
 from icu import Transliterator
 import datrie
 
+### Analysis section
 
-class ICUNameProcessor:
+def create(norm_rules, trans_rules, config):
+    """ Create a new token analysis instance for this module.
+    """
+    return GenericTokenAnalysis(norm_rules, trans_rules, config['variants'])
+
+
+class GenericTokenAnalysis:
     """ Collects the different transformation rules for normalisation of names
         and provides the functions to apply the transformations.
     """
     """ Collects the different transformation rules for normalisation of names
         and provides the functions to apply the transformations.
     """
diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml
index f85c33ffc75f02d1fe6d47eb9de4985abd02a34e..d070adcbbd649122aa5a37c621271b7d8635cb01 100644 (file)
@@ -28,7 +28,8 @@ sanitizers:
     - step: split-name-list
     - step: strip-brace-terms
 token-analysis:
-    - variants:
+    - analyzer: generic
+      variants:
           - !include icu-rules/variants-bg.yaml
           - !include icu-rules/variants-ca.yaml
           - !include icu-rules/variants-cs.yaml
diff --git a/test/python/test_tokenizer_icu.py b/test/python/test_tokenizer_icu.py
index 16caf3edf12299cfe95e57bf5c72e889a39d0c63..52cca6a7a0d31a7e20543a342088641221ecdf43 100644 (file)
@@ -72,7 +72,8 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
         cfgstr = {'normalization': list(norm),
                   'sanitizers': sanitizers,
                   'transliteration': list(trans),
-                  'token-analysis': [{'variants': [{'words': list(variants)}]}]}
+                  'token-analysis': [{'analyzer': 'generic',
+                                      'variants': [{'words': list(variants)}]}]}
         (test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr))
         tok.loader = ICURuleLoader(test_config)
 
diff --git a/test/python/test_tokenizer_icu_rule_loader.py b/test/python/test_tokenizer_icu_rule_loader.py
index 5d931043d737e92f96cc5560088a226839cd4276..b76eee75c9a5660cc6fd590a7a4fc3cd088eeac6 100644 (file)
@@ -34,7 +34,7 @@ def cfgrules(test_config):
             - "::  Latin ()"
             - "[[:Punctuation:][:Space:]]+ > ' '"
         """)
-        content += "token-analysis:\n  - variants:\n     - words:\n"
+        content += "token-analysis:\n  - analyzer: generic\n    variants:\n     - words:\n"
         content += '\n'.join(("         - " + s for s in variants)) + '\n'
         for k, v in kwargs:
             content += "    {}: {}\n".format(k, v)
@@ -50,7 +50,8 @@ def test_empty_rule_set(test_config):
         normalization:
         transliteration:
         token-analysis:
-          - variants:
+          - analyzer: generic
+            variants:
         """))
 
     rules = ICURuleLoader(test_config)
         """))
 
     rules = ICURuleLoader(test_config)
@@ -108,7 +109,8 @@ def test_transliteration_rules_from_file(test_config):
             - "'ax' > 'b'"
             - !include transliteration.yaml
         token-analysis:
-            - variants:
+            - analyzer: generic
+              variants:
         """))
     transpath = test_config.project_dir / ('transliteration.yaml')
     transpath.write_text('- "x > y"')
         """))
     transpath = test_config.project_dir / ('transliteration.yaml')
     transpath.write_text('- "x > y"')
@@ -128,7 +130,7 @@ class TestGetReplacements:
 
     def get_replacements(self, *variants):
         loader = ICURuleLoader(self.cfgrules(*variants))
-        rules = loader.analysis[None].variants
+        rules = loader.analysis[None].config['variants']
 
         return set((v.source, v.replacement) for v in rules)
 
diff --git a/test/python/test_tokenizer_icu_name_processor.py b/test/python/tokenizer/token_analysis/test_generic.py
similarity index 97%
rename from test/python/test_tokenizer_icu_name_processor.py
rename to test/python/tokenizer/token_analysis/test_generic.py
index 366d2aee23855bd2e9a9f00a2df81a0974ebba79..f0ce4208e288afbe9fdce86f255abd6220c69de2 100644 (file)
@@ -28,7 +28,7 @@ def cfgfile(def_config, tmp_path):
             - "::  Latin ()"
             - "'🜵' > ' '"
         """)
-        content += "token-analysis:\n  - variants:\n      - words:\n"
+        content += "token-analysis:\n  - analyzer: generic\n    variants:\n      - words:\n"
         content += '\n'.join(("          - " + s for s in variants)) + '\n'
         for k, v in kwargs:
             content += "        {}: {}\n".format(k, v)
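
With analyzers loaded as modules, adding a new token analysis only requires dropping a
module into nominatim/tokenizer/token_analysis/ that provides a module-level create()
factory, as generic.py does above. A hedged sketch of such a module (the name
'my-analysis' and the class are hypothetical; the analysis methods the returned object
must implement are not part of this diff):

    """
    Hypothetical token analysis module, selectable via 'analyzer: my-analysis'
    (the file would be named my_analysis.py).
    """

    def create(norm_rules, trans_rules, config):
        """ Create a new token analysis instance for this module.
        """
        return MyAnalysis(norm_rules, trans_rules, config)


    class MyAnalysis:
        """ Illustrative stand-in for a custom analysis class. """

        def __init__(self, norm_rules, trans_rules, config):
            # Keep the rule sets and the analyzer-specific configuration around; the
            # analysis methods expected by the tokenizer are not shown in this diff.
            self.norm_rules = norm_rules
            self.trans_rules = trans_rules
            self.config = config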