Merge pull request #2539 from lonvia/clean-up-python-tests

[nominatim.git] / nominatim / tokenizer / icu_tokenizer.py
diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py

index 5768fd3596652e07fca2896a9d6a02772af8ccb5..ea6e5d3cca5a9d063cd69b89c214f1d5e9699526 100644 (file)
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -2,7 +2,6 @@
  Tokenizer implementing normalisation as used before Nominatim 4 but using
  libICU instead of the PostgreSQL module.
  """
  Tokenizer implementing normalisation as used before Nominatim 4 but using
  libICU instead of the PostgreSQL module.
  """
-from collections import Counter
  import itertools
  import json
  import logging
  import itertools
  import json
  import logging
@@ -10,11 +9,10 @@ import re
  from textwrap import dedent
  
  from nominatim.db.connection import connect
  from textwrap import dedent
  
  from nominatim.db.connection import connect
-from nominatim.db.properties import set_property, get_property
  from nominatim.db.utils import CopyBuffer
  from nominatim.db.sql_preprocessor import SQLPreprocessor
  from nominatim.db.utils import CopyBuffer
  from nominatim.db.sql_preprocessor import SQLPreprocessor
+from nominatim.indexer.place_info import PlaceInfo
  from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
  from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
-from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
  from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
  
  DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
  from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
  
  DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
@@ -36,8 +34,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
      def __init__(self, dsn, data_dir):
          self.dsn = dsn
          self.data_dir = data_dir
      def __init__(self, dsn, data_dir):
          self.dsn = dsn
          self.data_dir = data_dir
-        self.naming_rules = None
-        self.term_normalization = None
+        self.loader = None
  
  
      def init_new_db(self, config, init_db=True):
  
  
      def init_new_db(self, config, init_db=True):
@@ -46,10 +43,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
              This copies all necessary data in the project directory to make
              sure the tokenizer remains stable even over updates.
          """
              This copies all necessary data in the project directory to make
              sure the tokenizer remains stable even over updates.
          """
-        loader = ICURuleLoader(config.load_sub_configuration('icu_tokenizer.yaml',
-                                                             config='TOKENIZER_CONFIG'))
-        self.naming_rules = ICUNameProcessorRules(loader=loader)
-        self.term_normalization = config.TERM_NORMALIZATION
+        self.loader = ICURuleLoader(config)
  
          self._install_php(config.lib_dir.php)
          self._save_config()
  
          self._install_php(config.lib_dir.php)
          self._save_config()
@@ -59,18 +53,22 @@ class LegacyICUTokenizer(AbstractTokenizer):
              self._init_db_tables(config)
  
  
              self._init_db_tables(config)
  
  
-    def init_from_project(self):
+    def init_from_project(self, config):
          """ Initialise the tokenizer from the project directory.
          """
          """ Initialise the tokenizer from the project directory.
          """
+        self.loader = ICURuleLoader(config)
+
          with connect(self.dsn) as conn:
          with connect(self.dsn) as conn:
-            self.naming_rules = ICUNameProcessorRules(conn=conn)
-            self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
+            self.loader.load_config_from_db(conn)
  
  
  
  
-    def finalize_import(self, _):
+    def finalize_import(self, config):
          """ Do any required postprocessing to make the tokenizer data ready
              for use.
          """
          """ Do any required postprocessing to make the tokenizer data ready
              for use.
          """
+        with connect(self.dsn) as conn:
+            sqlp = SQLPreprocessor(conn, config)
+            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
  
  
      def update_sql_functions(self, config):
  
  
      def update_sql_functions(self, config):
@@ -81,15 +79,31 @@ class LegacyICUTokenizer(AbstractTokenizer):
              sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
  
  
              sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
  
  
-    def check_database(self):
+    def check_database(self, config):
          """ Check that the tokenizer is set up correctly.
          """
          """ Check that the tokenizer is set up correctly.
          """
-        self.init_from_project()
+        # Will throw an error if there is an issue.
+        self.init_from_project(config)
  
  
-        if self.naming_rules is None:
-            return "Configuration for tokenizer 'icu' are missing."
  
  
-        return None
+    def update_statistics(self):
+        """ Recompute frequencies for all name words.
+        """
+        with connect(self.dsn) as conn:
+            if conn.table_exists('search_name'):
+                with conn.cursor() as cur:
+                    cur.drop_table("word_frequencies")
+                    LOG.info("Computing word frequencies")
+                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
+                                     SELECT unnest(name_vector) as id, count(*)
+                                     FROM search_name GROUP BY id""")
+                    cur.execute("CREATE INDEX ON word_frequencies(id)")
+                    LOG.info("Update word table with recomputed frequencies")
+                    cur.execute("""UPDATE word
+                                   SET info = info || jsonb_build_object('count', count)
+                                   FROM word_frequencies WHERE word_id = id""")
+                    cur.drop_table("word_frequencies")
+            conn.commit()
  
  
      def name_analyzer(self):
  
  
      def name_analyzer(self):
@@ -107,7 +121,8 @@ class LegacyICUTokenizer(AbstractTokenizer):
  
              Analyzers are not thread-safe. You need to instantiate one per thread.
          """
  
              Analyzers are not thread-safe. You need to instantiate one per thread.
          """
-        return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
+        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
+                                     self.loader.make_token_analysis())
  
  
      def _install_php(self, phpdir):
  
  
      def _install_php(self, phpdir):
@@ -117,8 +132,8 @@ class LegacyICUTokenizer(AbstractTokenizer):
          php_file.write_text(dedent(f"""\
              <?php
              @define('CONST_Max_Word_Frequency', 10000000);
          php_file.write_text(dedent(f"""\
              <?php
              @define('CONST_Max_Word_Frequency', 10000000);
-            @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
-            @define('CONST_Transliteration', "{self.naming_rules.search_rules}");
+            @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
+            @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
              require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
  
  
              require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
  
  
@@ -127,9 +142,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
              database as database properties.
          """
          with connect(self.dsn) as conn:
              database as database properties.
          """
          with connect(self.dsn) as conn:
-            self.naming_rules.save_rules(conn)
-
-            set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)
+            self.loader.save_config_to_db(conn)
  
  
      def _init_db_tables(self, config):
  
  
      def _init_db_tables(self, config):
@@ -141,45 +154,6 @@ class LegacyICUTokenizer(AbstractTokenizer):
              sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
              conn.commit()
  
              sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
              conn.commit()
  
-            LOG.warning("Precomputing word tokens")
-
-            # get partial words and their frequencies
-            words = self._count_partial_terms(conn)
-
-            # copy them back into the word table
-            with CopyBuffer() as copystr:
-                for term, cnt in words.items():
-                    copystr.add('w', term, json.dumps({'count': cnt}))
-
-                with conn.cursor() as cur:
-                    copystr.copy_out(cur, 'word',
-                                     columns=['type', 'word_token', 'info'])
-                    cur.execute("""UPDATE word SET word_id = nextval('seq_word')
-                                   WHERE word_id is null and type = 'w'""")
-
-            conn.commit()
-
-    def _count_partial_terms(self, conn):
-        """ Count the partial terms from the names in the place table.
-        """
-        words = Counter()
-        name_proc = ICUNameProcessor(self.naming_rules)
-
-        with conn.cursor(name="words") as cur:
-            cur.execute(""" SELECT v, count(*) FROM
-                              (SELECT svals(name) as v FROM place)x
-                            WHERE length(v) < 75 GROUP BY v""")
-
-            for name, cnt in cur:
-                terms = set()
-                for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
-                    if ' ' in word:
-                        terms.update(word.split())
-                for term in terms:
-                    words[term] += cnt
-
-        return words
-
  
  class LegacyICUNameAnalyzer(AbstractAnalyzer):
      """ The legacy analyzer uses the ICU library for splitting names.
  
  class LegacyICUNameAnalyzer(AbstractAnalyzer):
      """ The legacy analyzer uses the ICU library for splitting names.
@@ -188,10 +162,11 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
          normalization.
      """
  
          normalization.
      """
  
-    def __init__(self, dsn, name_proc):
+    def __init__(self, dsn, sanitizer, token_analysis):
          self.conn = connect(dsn).connection
          self.conn.autocommit = True
          self.conn = connect(dsn).connection
          self.conn.autocommit = True
-        self.name_processor = name_proc
+        self.sanitizer = sanitizer
+        self.token_analysis = token_analysis
  
          self._cache = _TokenCache()
  
  
          self._cache = _TokenCache()
  
@@ -204,6 +179,19 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
              self.conn = None
  
  
              self.conn = None
  
  
+    def _search_normalized(self, name):
+        """ Return the search token transliteration of the given name.
+        """
+        return self.token_analysis.search.transliterate(name).strip()
+
+
+    def _normalized(self, name):
+        """ Return the normalized version of the given name with all
+            non-relevant information removed.
+        """
+        return self.token_analysis.normalizer.transliterate(name).strip()
+
+
      def get_word_token_info(self, words):
          """ Return token information for the given list of words.
              If a word starts with # it is assumed to be a full name
      def get_word_token_info(self, words):
          """ Return token information for the given list of words.
              If a word starts with # it is assumed to be a full name
@@ -219,9 +207,9 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
          partial_tokens = {}
          for word in words:
              if word.startswith('#'):
          partial_tokens = {}
          for word in words:
              if word.startswith('#'):
-                full_tokens[word] = self.name_processor.get_search_normalized(word[1:])
+                full_tokens[word] = self._search_normalized(word[1:])
              else:
              else:
-                partial_tokens[word] = self.name_processor.get_search_normalized(word)
+                partial_tokens[word] = self._search_normalized(word)
  
          with self.conn.cursor() as cur:
              cur.execute("""SELECT word_token, word_id
  
          with self.conn.cursor() as cur:
              cur.execute("""SELECT word_token, word_id
@@ -252,7 +240,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
  
              This function takes minor shortcuts on transliteration.
          """
  
              This function takes minor shortcuts on transliteration.
          """
-        return self.name_processor.get_search_normalized(hnr)
+        return self._search_normalized(hnr)
  
      def update_postcodes_from_db(self):
          """ Update postcode tokens in the word table from the location_postcode
  
      def update_postcodes_from_db(self):
          """ Update postcode tokens in the word table from the location_postcode
@@ -275,7 +263,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
                      if postcode is None:
                          to_delete.append(word)
                      else:
                      if postcode is None:
                          to_delete.append(word)
                      else:
-                        copystr.add(self.name_processor.get_search_normalized(postcode),
+                        copystr.add(self._search_normalized(postcode),
                                      'P', postcode)
  
                  if to_delete:
                                      'P', postcode)
  
                  if to_delete:
@@ -293,7 +281,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
              completely replaced. Otherwise the phrases are added to the
              already existing ones.
          """
              completely replaced. Otherwise the phrases are added to the
              already existing ones.
          """
-        norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
+        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                              for p in phrases))
  
          with self.conn.cursor() as cur:
                              for p in phrases))
  
          with self.conn.cursor() as cur:
@@ -323,7 +311,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
          added = 0
          with CopyBuffer() as copystr:
              for word, cls, typ, oper in to_add:
          added = 0
          with CopyBuffer() as copystr:
              for word, cls, typ, oper in to_add:
-                term = self.name_processor.get_search_normalized(word)
+                term = self._search_normalized(word)
                  if term:
                      copystr.add(term, 'S', word,
                                  json.dumps({'class': cls, 'type': typ,
                  if term:
                      copystr.add(term, 'S', word,
                                  json.dumps({'class': cls, 'type': typ,
@@ -357,9 +345,21 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
      def add_country_names(self, country_code, names):
          """ Add names for the given country to the search index.
          """
      def add_country_names(self, country_code, names):
          """ Add names for the given country to the search index.
          """
+        # Make sure any name preprocessing for country names applies.
+        info = PlaceInfo({'name': names, 'country_code': country_code,
+                          'rank_address': 4, 'class': 'boundary',
+                          'type': 'administrative'})
+        self._add_country_full_names(country_code,
+                                     self.sanitizer.process_names(info)[0])
+
+
+    def _add_country_full_names(self, country_code, names):
+        """ Add names for the given country from an already sanitized
+            name list.
+        """
          word_tokens = set()
          word_tokens = set()
-        for name in self._compute_full_names(names):
-            norm_name = self.name_processor.get_search_normalized(name)
+        for name in names:
+            norm_name = self._search_normalized(name.name)
              if norm_name:
                  word_tokens.add(norm_name)
  
              if norm_name:
                  word_tokens.add(norm_name)
  
@@ -385,23 +385,21 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
      def process_place(self, place):
          """ Determine tokenizer information about the given place.
  
      def process_place(self, place):
          """ Determine tokenizer information about the given place.
  
-            Returns a JSON-serialisable structure that will be handed into
+            Returns a JSON-serializable structure that will be handed into
              the database via the token_info field.
          """
          token_info = _TokenInfo(self._cache)
  
              the database via the token_info field.
          """
          token_info = _TokenInfo(self._cache)
  
-        names = place.get('name')
+        names, address = self.sanitizer.process_names(place)
  
          if names:
              fulls, partials = self._compute_name_tokens(names)
  
              token_info.add_names(fulls, partials)
  
  
          if names:
              fulls, partials = self._compute_name_tokens(names)
  
              token_info.add_names(fulls, partials)
  
-            country_feature = place.get('country_feature')
-            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
-                self.add_country_names(country_feature.lower(), names)
+            if place.is_country():
+                self._add_country_full_names(place.country_code, names)
  
  
-        address = place.get('address')
          if address:
              self._process_place_address(token_info, address)
  
          if address:
              self._process_place_address(token_info, address)
  
@@ -411,18 +409,18 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
      def _process_place_address(self, token_info, address):
          hnrs = []
          addr_terms = []
      def _process_place_address(self, token_info, address):
          hnrs = []
          addr_terms = []
-        for key, value in address.items():
-            if key == 'postcode':
-                self._add_postcode(value)
-            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
-                hnrs.append(value)
-            elif key == 'street':
-                token_info.add_street(self._compute_partial_tokens(value))
-            elif key == 'place':
-                token_info.add_place(self._compute_partial_tokens(value))
-            elif not key.startswith('_') and \
-                 key not in ('country', 'full'):
-                addr_terms.append((key, self._compute_partial_tokens(value)))
+        for item in address:
+            if item.kind == 'postcode':
+                self._add_postcode(item.name)
+            elif item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'):
+                hnrs.append(item.name)
+            elif item.kind == 'street':
+                token_info.add_street(self._compute_partial_tokens(item.name))
+            elif item.kind == 'place':
+                token_info.add_place(self._compute_partial_tokens(item.name))
+            elif not item.kind.startswith('_') and \
+                 item.kind not in ('country', 'full'):
+                addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))
  
          if hnrs:
              hnrs = self._split_housenumbers(hnrs)
  
          if hnrs:
              hnrs = self._split_housenumbers(hnrs)
@@ -431,11 +429,12 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
          if addr_terms:
              token_info.add_address_terms(addr_terms)
  
          if addr_terms:
              token_info.add_address_terms(addr_terms)
  
+
      def _compute_partial_tokens(self, name):
          """ Normalize the given term, split it into partial words and return
              the token list for them.
          """
      def _compute_partial_tokens(self, name):
          """ Normalize the given term, split it into partial words and return
              the token list for them.
          """
-        norm_name = self.name_processor.get_search_normalized(name)
+        norm_name = self._search_normalized(name)
  
          tokens = []
          need_lookup = []
  
          tokens = []
          need_lookup = []
@@ -458,28 +457,34 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
  
          return tokens
  
  
          return tokens
  
+
      def _compute_name_tokens(self, names):
          """ Computes the full name and partial name tokens for the given
              list of names.
          """
      def _compute_name_tokens(self, names):
          """ Computes the full name and partial name tokens for the given
              list of names.
          """
-        full_names = self._compute_full_names(names)
          full_tokens = set()
          partial_tokens = set()
  
          full_tokens = set()
          partial_tokens = set()
  
-        for name in full_names:
-            norm_name = self.name_processor.get_normalized(name)
-            full, part = self._cache.names.get(norm_name, (None, None))
+        for name in names:
+            analyzer_id = name.get_attr('analyzer')
+            norm_name = self._normalized(name.name)
+            if analyzer_id is None:
+                token_id = norm_name
+            else:
+                token_id = f'{norm_name}@{analyzer_id}'
+
+            full, part = self._cache.names.get(token_id, (None, None))
              if full is None:
              if full is None:
-                variants = self.name_processor.get_variants_ascii(norm_name)
+                variants = self.token_analysis.analysis[analyzer_id].get_variants_ascii(norm_name)
                  if not variants:
                      continue
  
                  with self.conn.cursor() as cur:
                      cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                  if not variants:
                      continue
  
                  with self.conn.cursor() as cur:
                      cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
-                                (norm_name, variants))
+                                (token_id, variants))
                      full, part = cur.fetchone()
  
                      full, part = cur.fetchone()
  
-                self._cache.names[norm_name] = (full, part)
+                self._cache.names[token_id] = (full, part)
  
              full_tokens.add(full)
              partial_tokens.update(part)
  
              full_tokens.add(full)
              partial_tokens.update(part)
@@ -487,23 +492,6 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
          return full_tokens, partial_tokens
  
  
          return full_tokens, partial_tokens
  
  
-    @staticmethod
-    def _compute_full_names(names):
-        """ Return the set of all full name word ids to be used with the
-            given dictionary of names.
-        """
-        full_names = set()
-        for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
-            if name:
-                full_names.add(name)
-
-                brace_idx = name.find('(')
-                if brace_idx >= 0:
-                    full_names.add(name[:brace_idx].strip())
-
-        return full_names
-
-
      def _add_postcode(self, postcode):
          """ Make sure the normalized postcode is present in the word table.
          """
      def _add_postcode(self, postcode):
          """ Make sure the normalized postcode is present in the word table.
          """
@@ -511,7 +499,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
              postcode = self.normalize_postcode(postcode)
  
              if postcode not in self._cache.postcodes:
              postcode = self.normalize_postcode(postcode)
  
              if postcode not in self._cache.postcodes:
-                term = self.name_processor.get_search_normalized(postcode)
+                term = self._search_normalized(postcode)
                  if not term:
                      return
  
                  if not term:
                      return