]> git.openstreetmap.org Git - nominatim.git/blobdiff - nominatim/tokenizer/icu_name_processor.py
introduce sanitizer step before token analysis
[nominatim.git] / nominatim / tokenizer / icu_name_processor.py
index 93d2b0ffa26b9151ccba1928c0e7d0745ce4380a..544f5ebce9bf8f3d9b9e477d5689e1a142e65853 100644 (file)
@@ -8,67 +8,25 @@ import itertools
 from icu import Transliterator
 import datrie
 
 from icu import Transliterator
 import datrie
 
-from nominatim.db.properties import set_property, get_property
-from nominatim.tokenizer import icu_variants as variants
-
-DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation"
-DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
-DBCFG_IMPORT_REPLACEMENTS = "tokenizer_import_replacements"
-DBCFG_SEARCH_STD_RULES = "tokenizer_search_standardization"
-
-
-class ICUNameProcessorRules:
-    """ Data object that saves the rules needed for the name processor.
-
-        The rules can either be initialised through an ICURuleLoader or
-        be loaded from a database when a connection is given.
-    """
-    def __init__(self, loader=None, conn=None):
-        if loader is not None:
-            self.norm_rules = loader.get_normalization_rules()
-            self.trans_rules = loader.get_transliteration_rules()
-            self.replacements = loader.get_replacement_pairs()
-            self.search_rules = loader.get_search_rules()
-        elif conn is not None:
-            self.norm_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
-            self.trans_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
-            self.replacements = \
-                variants.unpickle_variant_set(get_property(conn, DBCFG_IMPORT_REPLACEMENTS))
-            self.search_rules = get_property(conn, DBCFG_SEARCH_STD_RULES)
-        else:
-            assert False, "Parameter loader or conn required."
-
-
-    def save_rules(self, conn):
-        """ Save the rules in the property table of the given database.
-            the rules can be loaded again by handing in a connection into
-            the constructor of the class.
-        """
-        set_property(conn, DBCFG_IMPORT_NORM_RULES, self.norm_rules)
-        set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.trans_rules)
-        set_property(conn, DBCFG_IMPORT_REPLACEMENTS,
-                     variants.pickle_variant_set(self.replacements))
-        set_property(conn, DBCFG_SEARCH_STD_RULES, self.search_rules)
-
 
 class ICUNameProcessor:
     """ Collects the different transformation rules for normalisation of names
 
 class ICUNameProcessor:
     """ Collects the different transformation rules for normalisation of names
-        and provides the functions to aply the transformations.
+        and provides the functions to apply the transformations.
     """
 
     """
 
-    def __init__(self, rules):
+    def __init__(self, norm_rules, trans_rules, replacements):
         self.normalizer = Transliterator.createFromRules("icu_normalization",
         self.normalizer = Transliterator.createFromRules("icu_normalization",
-                                                         rules.norm_rules)
+                                                         norm_rules)
         self.to_ascii = Transliterator.createFromRules("icu_to_ascii",
         self.to_ascii = Transliterator.createFromRules("icu_to_ascii",
-                                                       rules.trans_rules +
+                                                       trans_rules +
                                                        ";[:Space:]+ > ' '")
         self.search = Transliterator.createFromRules("icu_search",
                                                        ";[:Space:]+ > ' '")
         self.search = Transliterator.createFromRules("icu_search",
-                                                     rules.search_rules)
+                                                     norm_rules + trans_rules)
 
         # Intermediate reorder by source. Also compute required character set.
         immediate = defaultdict(list)
         chars = set()
 
         # Intermediate reorder by source. Also compute required character set.
         immediate = defaultdict(list)
         chars = set()
-        for variant in rules.replacements:
+        for variant in replacements:
             if variant.source[-1] == ' ' and variant.replacement[-1] == ' ':
                 replstr = variant.replacement[:-1]
             else:
             if variant.source[-1] == ' ' and variant.replacement[-1] == ' ':
                 replstr = variant.replacement[:-1]
             else: