Unify ICUNameProcessorRules and ICURuleLoader into a single ICURuleLoader

[nominatim.git] / nominatim / tokenizer / icu_tokenizer.py
diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py

index 5768fd3596652e07fca2896a9d6a02772af8ccb5..87906d71d75484639078c56d7b1dd9c0295a8572 100644 (file)
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -14,7 +14,6 @@ from nominatim.db.properties import set_property, get_property
  from nominatim.db.utils import CopyBuffer
  from nominatim.db.sql_preprocessor import SQLPreprocessor
  from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
  from nominatim.db.utils import CopyBuffer
  from nominatim.db.sql_preprocessor import SQLPreprocessor
  from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
-from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
  from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
  
  DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
  from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
  
  DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
@@ -36,7 +35,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
      def __init__(self, dsn, data_dir):
          self.dsn = dsn
          self.data_dir = data_dir
      def __init__(self, dsn, data_dir):
          self.dsn = dsn
          self.data_dir = data_dir
-        self.naming_rules = None
+        self.loader = None
          self.term_normalization = None
  
  
          self.term_normalization = None
  
  
@@ -46,9 +45,8 @@ class LegacyICUTokenizer(AbstractTokenizer):
              This copies all necessary data in the project directory to make
              sure the tokenizer remains stable even over updates.
          """
              This copies all necessary data in the project directory to make
              sure the tokenizer remains stable even over updates.
          """
-        loader = ICURuleLoader(config.load_sub_configuration('icu_tokenizer.yaml',
-                                                             config='TOKENIZER_CONFIG'))
-        self.naming_rules = ICUNameProcessorRules(loader=loader)
+        self.loader = ICURuleLoader(config)
+
          self.term_normalization = config.TERM_NORMALIZATION
  
          self._install_php(config.lib_dir.php)
          self.term_normalization = config.TERM_NORMALIZATION
  
          self._install_php(config.lib_dir.php)
@@ -59,11 +57,13 @@ class LegacyICUTokenizer(AbstractTokenizer):
              self._init_db_tables(config)
  
  
              self._init_db_tables(config)
  
  
-    def init_from_project(self):
+    def init_from_project(self, config):
          """ Initialise the tokenizer from the project directory.
          """
          """ Initialise the tokenizer from the project directory.
          """
+        self.loader = ICURuleLoader(config)
+
          with connect(self.dsn) as conn:
          with connect(self.dsn) as conn:
-            self.naming_rules = ICUNameProcessorRules(conn=conn)
+            self.loader.load_config_from_db(conn)
              self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
  
  
              self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
  
  
@@ -81,12 +81,12 @@ class LegacyICUTokenizer(AbstractTokenizer):
              sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
  
  
              sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
  
  
-    def check_database(self):
+    def check_database(self, config):
          """ Check that the tokenizer is set up correctly.
          """
          """ Check that the tokenizer is set up correctly.
          """
-        self.init_from_project()
+        self.init_from_project(config)
  
  
-        if self.naming_rules is None:
+        if self.term_normalization is None:
              return "Configuration for tokenizer 'icu' are missing."
  
          return None
              return "Configuration for tokenizer 'icu' are missing."
  
          return None
@@ -107,7 +107,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
  
              Analyzers are not thread-safe. You need to instantiate one per thread.
          """
  
              Analyzers are not thread-safe. You need to instantiate one per thread.
          """
-        return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
+        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_token_analysis())
  
  
      def _install_php(self, phpdir):
  
  
      def _install_php(self, phpdir):
@@ -118,7 +118,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
              <?php
              @define('CONST_Max_Word_Frequency', 10000000);
              @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
              <?php
              @define('CONST_Max_Word_Frequency', 10000000);
              @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
-            @define('CONST_Transliteration', "{self.naming_rules.search_rules}");
+            @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
              require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
  
  
              require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
  
  
@@ -127,8 +127,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
              database as database properties.
          """
          with connect(self.dsn) as conn:
              database as database properties.
          """
          with connect(self.dsn) as conn:
-            self.naming_rules.save_rules(conn)
-
+            self.loader.save_config_to_db(conn)
              set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)
  
  
              set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)
  
  
@@ -163,7 +162,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
          """ Count the partial terms from the names in the place table.
          """
          words = Counter()
          """ Count the partial terms from the names in the place table.
          """
          words = Counter()
-        name_proc = ICUNameProcessor(self.naming_rules)
+        name_proc = self.loader.make_token_analysis()
  
          with conn.cursor(name="words") as cur:
              cur.execute(""" SELECT v, count(*) FROM
  
          with conn.cursor(name="words") as cur:
              cur.execute(""" SELECT v, count(*) FROM
@@ -390,18 +389,17 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
          """
          token_info = _TokenInfo(self._cache)
  
          """
          token_info = _TokenInfo(self._cache)
  
-        names = place.get('name')
+        names = place.name
  
          if names:
              fulls, partials = self._compute_name_tokens(names)
  
              token_info.add_names(fulls, partials)
  
  
          if names:
              fulls, partials = self._compute_name_tokens(names)
  
              token_info.add_names(fulls, partials)
  
-            country_feature = place.get('country_feature')
-            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
-                self.add_country_names(country_feature.lower(), names)
+            if place.is_country():
+                self.add_country_names(place.country_code, names)
  
  
-        address = place.get('address')
+        address = place.address
          if address:
              self._process_place_address(token_info, address)
  
          if address:
              self._process_place_address(token_info, address)