X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/925195725dfcb7f1a6795c50244c1df6cb7242ce..16daa57e4757e4daeffec1e61630f989727dc563:/nominatim/tokenizer/icu_tokenizer.py

diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py
index cb411204..87906d71 100644
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -8,17 +8,14 @@ import json
 import logging
 import re
 from textwrap import dedent
-from pathlib import Path
 
 from nominatim.db.connection import connect
 from nominatim.db.properties import set_property, get_property
 from nominatim.db.utils import CopyBuffer
 from nominatim.db.sql_preprocessor import SQLPreprocessor
 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
-from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
 
-DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
 DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
 
 LOG = logging.getLogger()
@@ -38,9 +35,8 @@ class LegacyICUTokenizer(AbstractTokenizer):
     def __init__(self, dsn, data_dir):
         self.dsn = dsn
         self.data_dir = data_dir
-        self.naming_rules = None
+        self.loader = None
         self.term_normalization = None
-        self.max_word_frequency = None
 
 
     def init_new_db(self, config, init_db=True):
@@ -49,31 +45,26 @@ class LegacyICUTokenizer(AbstractTokenizer):
             This copies all necessary data in the project directory to make
             sure the tokenizer remains stable even over updates.
         """
-        if config.TOKENIZER_CONFIG:
-            cfgfile = Path(config.TOKENIZER_CONFIG)
-        else:
-            cfgfile = config.config_dir / 'icu_tokenizer.yaml'
+        self.loader = ICURuleLoader(config)
 
-        loader = ICURuleLoader(cfgfile)
-        self.naming_rules = ICUNameProcessorRules(loader=loader)
         self.term_normalization = config.TERM_NORMALIZATION
-        self.max_word_frequency = config.MAX_WORD_FREQUENCY
 
         self._install_php(config.lib_dir.php)
-        self._save_config(config)
+        self._save_config()
 
         if init_db:
             self.update_sql_functions(config)
             self._init_db_tables(config)
 
 
-    def init_from_project(self):
+    def init_from_project(self, config):
         """ Initialise the tokenizer from the project directory.
         """
+        self.loader = ICURuleLoader(config)
+
         with connect(self.dsn) as conn:
-            self.naming_rules = ICUNameProcessorRules(conn=conn)
+            self.loader.load_config_from_db(conn)
             self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
-            self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)
 
 
     def finalize_import(self, _):
@@ -86,18 +77,16 @@ class LegacyICUTokenizer(AbstractTokenizer):
         """ Reimport the SQL functions for this tokenizer.
         """
         with connect(self.dsn) as conn:
-            max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
             sqlp = SQLPreprocessor(conn, config)
-            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql',
-                              max_word_freq=max_word_freq)
+            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
 
 
-    def check_database(self):
+    def check_database(self, config):
         """ Check that the tokenizer is set up correctly.
         """
-        self.init_from_project()
+        self.init_from_project(config)
 
-        if self.naming_rules is None:
+        if self.term_normalization is None:
             return "Configuration for tokenizer 'icu' are missing."
 
         return None
@@ -118,7 +107,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
 
             Analyzers are not thread-safe. You need to instantiate one per thread.
         """
-        return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
+        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_token_analysis())
 
 
     def _install_php(self, phpdir):
@@ -127,20 +116,18 @@ class LegacyICUTokenizer(AbstractTokenizer):
         php_file = self.data_dir / "tokenizer.php"
         php_file.write_text(dedent(f"""\
             <?php
-            @define('CONST_Max_Word_Frequency', {self.max_word_frequency});
+            @define('CONST_Max_Word_Frequency', 10000000);
             @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
-            @define('CONST_Transliteration', "{self.naming_rules.search_rules}");
+            @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
             require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
 
 
-    def _save_config(self, config):
+    def _save_config(self):
         """ Save the configuration that needs to remain stable for the given
             database as database properties.
         """
         with connect(self.dsn) as conn:
-            self.naming_rules.save_rules(conn)
-
-            set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
+            self.loader.save_config_to_db(conn)
             set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)
 
 
@@ -175,7 +162,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
         """ Count the partial terms from the names in the place table.
         """
         words = Counter()
-        name_proc = ICUNameProcessor(self.naming_rules)
+        name_proc = self.loader.make_token_analysis()
 
         with conn.cursor(name="words") as cur:
             cur.execute(""" SELECT v, count(*) FROM
@@ -402,18 +389,17 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         """
         token_info = _TokenInfo(self._cache)
 
-        names = place.get('name')
+        names = place.name
 
         if names:
             fulls, partials = self._compute_name_tokens(names)
 
             token_info.add_names(fulls, partials)
 
-            country_feature = place.get('country_feature')
-            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
-                self.add_country_names(country_feature.lower(), names)
+            if place.is_country():
+                self.add_country_names(place.country_code, names)
 
-        address = place.get('address')
+        address = place.address
         if address:
             self._process_place_address(token_info, address)
 
@@ -429,12 +415,12 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
             elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                 hnrs.append(value)
             elif key == 'street':
-                token_info.add_street(*self._compute_name_tokens({'name': value}))
+                token_info.add_street(self._compute_partial_tokens(value))
             elif key == 'place':
-                token_info.add_place(*self._compute_name_tokens({'name': value}))
+                token_info.add_place(self._compute_partial_tokens(value))
             elif not key.startswith('_') and \
                  key not in ('country', 'full'):
-                addr_terms.append((key, *self._compute_name_tokens({'name': value})))
+                addr_terms.append((key, self._compute_partial_tokens(value)))
 
         if hnrs:
             hnrs = self._split_housenumbers(hnrs)
@@ -443,6 +429,32 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         if addr_terms:
             token_info.add_address_terms(addr_terms)
 
+    def _compute_partial_tokens(self, name):
+        """ Normalize the given term, split it into partial words and return
+            then token list for them.
+        """
+        norm_name = self.name_processor.get_search_normalized(name)
+
+        tokens = []
+        need_lookup = []
+        for partial in norm_name.split():
+            token = self._cache.partials.get(partial)
+            if token:
+                tokens.append(token)
+            else:
+                need_lookup.append(partial)
+
+        if need_lookup:
+            with self.conn.cursor() as cur:
+                cur.execute("""SELECT word, getorcreate_partial_word(word)
+                               FROM unnest(%s) word""",
+                            (need_lookup, ))
+
+                for partial, token in cur:
+                    tokens.append(token)
+                    self._cache.partials[partial] = token
+
+        return tokens
 
     def _compute_name_tokens(self, names):
         """ Computes the full name and partial name tokens for the given
@@ -556,30 +568,25 @@ class _TokenInfo:
         self.data['hnr'] = ';'.join(hnrs)
 
 
-    def add_street(self, fulls, _):
+    def add_street(self, tokens):
         """ Add addr:street match terms.
         """
-        if fulls:
-            self.data['street'] = self._mk_array(fulls)
+        if tokens:
+            self.data['street'] = self._mk_array(tokens)
 
 
-    def add_place(self, fulls, partials):
+    def add_place(self, tokens):
         """ Add addr:place search and match terms.
         """
-        if fulls:
-            self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
-            self.data['place_match'] = self._mk_array(fulls)
+        if tokens:
+            self.data['place'] = self._mk_array(tokens)
 
 
     def add_address_terms(self, terms):
         """ Add additional address terms.
         """
-        tokens = {}
-
-        for key, fulls, partials in terms:
-            if fulls:
-                tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
-                               self._mk_array(fulls)]
+        tokens = {key: self._mk_array(partials)
+                  for key, partials in terms if partials}
 
         if tokens:
             self.data['addr'] = tokens
@@ -593,6 +600,7 @@ class _TokenCache:
     """
     def __init__(self):
         self.names = {}
+        self.partials = {}
         self.postcodes = set()
         self.housenumbers = {}