unify ICUNameProcessorRules and ICURuleLoader

[nominatim.git] / nominatim / tokenizer / icu_tokenizer.py
diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py

index e9cb3d26c48cca3000c2d322c66ff4375f0af622..87906d71d75484639078c56d7b1dd9c0295a8572 100644 (file)
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -14,10 +14,8 @@ from nominatim.db.properties import set_property, get_property
  from nominatim.db.utils import CopyBuffer
  from nominatim.db.sql_preprocessor import SQLPreprocessor
  from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
  from nominatim.db.utils import CopyBuffer
  from nominatim.db.sql_preprocessor import SQLPreprocessor
  from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
-from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
  from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
  
  from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
  
-DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
  DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
  
  LOG = logging.getLogger()
  DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
  
  LOG = logging.getLogger()
@@ -37,9 +35,8 @@ class LegacyICUTokenizer(AbstractTokenizer):
      def __init__(self, dsn, data_dir):
          self.dsn = dsn
          self.data_dir = data_dir
      def __init__(self, dsn, data_dir):
          self.dsn = dsn
          self.data_dir = data_dir
-        self.naming_rules = None
+        self.loader = None
          self.term_normalization = None
          self.term_normalization = None
-        self.max_word_frequency = None
  
  
      def init_new_db(self, config, init_db=True):
  
  
      def init_new_db(self, config, init_db=True):
@@ -48,27 +45,26 @@ class LegacyICUTokenizer(AbstractTokenizer):
              This copies all necessary data in the project directory to make
              sure the tokenizer remains stable even over updates.
          """
              This copies all necessary data in the project directory to make
              sure the tokenizer remains stable even over updates.
          """
-        loader = ICURuleLoader(config.load_sub_configuration('icu_tokenizer.yaml',
-                                              config='TOKENIZER_CONFIG'))
-        self.naming_rules = ICUNameProcessorRules(loader=loader)
+        self.loader = ICURuleLoader(config)
+
          self.term_normalization = config.TERM_NORMALIZATION
          self.term_normalization = config.TERM_NORMALIZATION
-        self.max_word_frequency = config.MAX_WORD_FREQUENCY
  
          self._install_php(config.lib_dir.php)
  
          self._install_php(config.lib_dir.php)
-        self._save_config(config)
+        self._save_config()
  
          if init_db:
              self.update_sql_functions(config)
              self._init_db_tables(config)
  
  
  
          if init_db:
              self.update_sql_functions(config)
              self._init_db_tables(config)
  
  
-    def init_from_project(self):
+    def init_from_project(self, config):
          """ Initialise the tokenizer from the project directory.
          """
          """ Initialise the tokenizer from the project directory.
          """
+        self.loader = ICURuleLoader(config)
+
          with connect(self.dsn) as conn:
          with connect(self.dsn) as conn:
-            self.naming_rules = ICUNameProcessorRules(conn=conn)
+            self.loader.load_config_from_db(conn)
              self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
              self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
-            self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)
  
  
      def finalize_import(self, _):
  
  
      def finalize_import(self, _):
@@ -81,18 +77,16 @@ class LegacyICUTokenizer(AbstractTokenizer):
          """ Reimport the SQL functions for this tokenizer.
          """
          with connect(self.dsn) as conn:
          """ Reimport the SQL functions for this tokenizer.
          """
          with connect(self.dsn) as conn:
-            max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
              sqlp = SQLPreprocessor(conn, config)
              sqlp = SQLPreprocessor(conn, config)
-            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql',
-                              max_word_freq=max_word_freq)
+            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
  
  
  
  
-    def check_database(self):
+    def check_database(self, config):
          """ Check that the tokenizer is set up correctly.
          """
          """ Check that the tokenizer is set up correctly.
          """
-        self.init_from_project()
+        self.init_from_project(config)
  
  
-        if self.naming_rules is None:
+        if self.term_normalization is None:
              return "Configuration for tokenizer 'icu' are missing."
  
          return None
              return "Configuration for tokenizer 'icu' are missing."
  
          return None
@@ -113,7 +107,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
  
              Analyzers are not thread-safe. You need to instantiate one per thread.
          """
  
              Analyzers are not thread-safe. You need to instantiate one per thread.
          """
-        return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
+        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_token_analysis())
  
  
      def _install_php(self, phpdir):
  
  
      def _install_php(self, phpdir):
@@ -122,20 +116,18 @@ class LegacyICUTokenizer(AbstractTokenizer):
          php_file = self.data_dir / "tokenizer.php"
          php_file.write_text(dedent(f"""\
              <?php
          php_file = self.data_dir / "tokenizer.php"
          php_file.write_text(dedent(f"""\
              <?php
-            @define('CONST_Max_Word_Frequency', {self.max_word_frequency});
+            @define('CONST_Max_Word_Frequency', 10000000);
              @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
              @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
-            @define('CONST_Transliteration', "{self.naming_rules.search_rules}");
+            @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
              require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
  
  
              require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
  
  
-    def _save_config(self, config):
+    def _save_config(self):
          """ Save the configuration that needs to remain stable for the given
              database as database properties.
          """
          with connect(self.dsn) as conn:
          """ Save the configuration that needs to remain stable for the given
              database as database properties.
          """
          with connect(self.dsn) as conn:
-            self.naming_rules.save_rules(conn)
-
-            set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
+            self.loader.save_config_to_db(conn)
              set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)
  
  
              set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)
  
  
@@ -170,7 +162,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
          """ Count the partial terms from the names in the place table.
          """
          words = Counter()
          """ Count the partial terms from the names in the place table.
          """
          words = Counter()
-        name_proc = ICUNameProcessor(self.naming_rules)
+        name_proc = self.loader.make_token_analysis()
  
          with conn.cursor(name="words") as cur:
              cur.execute(""" SELECT v, count(*) FROM
  
          with conn.cursor(name="words") as cur:
              cur.execute(""" SELECT v, count(*) FROM
@@ -397,18 +389,17 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
          """
          token_info = _TokenInfo(self._cache)
  
          """
          token_info = _TokenInfo(self._cache)
  
-        names = place.get('name')
+        names = place.name
  
          if names:
              fulls, partials = self._compute_name_tokens(names)
  
              token_info.add_names(fulls, partials)
  
  
          if names:
              fulls, partials = self._compute_name_tokens(names)
  
              token_info.add_names(fulls, partials)
  
-            country_feature = place.get('country_feature')
-            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
-                self.add_country_names(country_feature.lower(), names)
+            if place.is_country():
+                self.add_country_names(place.country_code, names)
  
  
-        address = place.get('address')
+        address = place.address
          if address:
              self._process_place_address(token_info, address)
  
          if address:
              self._process_place_address(token_info, address)
  
@@ -424,12 +415,12 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
              elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                  hnrs.append(value)
              elif key == 'street':
              elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                  hnrs.append(value)
              elif key == 'street':
-                token_info.add_street(*self._compute_name_tokens({'name': value}))
+                token_info.add_street(self._compute_partial_tokens(value))
              elif key == 'place':
              elif key == 'place':
-                token_info.add_place(*self._compute_name_tokens({'name': value}))
+                token_info.add_place(self._compute_partial_tokens(value))
              elif not key.startswith('_') and \
                   key not in ('country', 'full'):
              elif not key.startswith('_') and \
                   key not in ('country', 'full'):
-                addr_terms.append((key, *self._compute_name_tokens({'name': value})))
+                addr_terms.append((key, self._compute_partial_tokens(value)))
  
          if hnrs:
              hnrs = self._split_housenumbers(hnrs)
  
          if hnrs:
              hnrs = self._split_housenumbers(hnrs)
@@ -438,6 +429,32 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
          if addr_terms:
              token_info.add_address_terms(addr_terms)
  
          if addr_terms:
              token_info.add_address_terms(addr_terms)
  
+    def _compute_partial_tokens(self, name):
+        """ Normalize the given term, split it into partial words and return
+            the token list for them.
+        """
+        norm_name = self.name_processor.get_search_normalized(name)
+
+        tokens = []
+        need_lookup = []
+        for partial in norm_name.split():
+            token = self._cache.partials.get(partial)
+            if token:
+                tokens.append(token)
+            else:
+                need_lookup.append(partial)
+
+        if need_lookup:
+            with self.conn.cursor() as cur:
+                cur.execute("""SELECT word, getorcreate_partial_word(word)
+                               FROM unnest(%s) word""",
+                            (need_lookup, ))
+
+                for partial, token in cur:
+                    tokens.append(token)
+                    self._cache.partials[partial] = token
+
+        return tokens
  
      def _compute_name_tokens(self, names):
          """ Computes the full name and partial name tokens for the given
  
      def _compute_name_tokens(self, names):
          """ Computes the full name and partial name tokens for the given
@@ -551,30 +568,25 @@ class _TokenInfo:
          self.data['hnr'] = ';'.join(hnrs)
  
  
          self.data['hnr'] = ';'.join(hnrs)
  
  
-    def add_street(self, fulls, _):
+    def add_street(self, tokens):
          """ Add addr:street match terms.
          """
          """ Add addr:street match terms.
          """
-        if fulls:
-            self.data['street'] = self._mk_array(fulls)
+        if tokens:
+            self.data['street'] = self._mk_array(tokens)
  
  
  
  
-    def add_place(self, fulls, partials):
+    def add_place(self, tokens):
          """ Add addr:place search and match terms.
          """
          """ Add addr:place search and match terms.
          """
-        if fulls:
-            self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
-            self.data['place_match'] = self._mk_array(fulls)
+        if tokens:
+            self.data['place'] = self._mk_array(tokens)
  
  
      def add_address_terms(self, terms):
          """ Add additional address terms.
          """
  
  
      def add_address_terms(self, terms):
          """ Add additional address terms.
          """
-        tokens = {}
-
-        for key, fulls, partials in terms:
-            if fulls:
-                tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
-                               self._mk_array(fulls)]
+        tokens = {key: self._mk_array(partials)
+                  for key, partials in terms if partials}
  
          if tokens:
              self.data['addr'] = tokens
  
          if tokens:
              self.data['addr'] = tokens
@@ -588,6 +600,7 @@ class _TokenCache:
      """
      def __init__(self):
          self.names = {}
      """
      def __init__(self):
          self.names = {}
+        self.partials = {}
          self.postcodes = set()
          self.housenumbers = {}
  
          self.postcodes = set()
          self.housenumbers = {}