Merge remote-tracking branch 'upstream/master'

[nominatim.git] / nominatim / tokenizer / icu_tokenizer.py
diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py

index fbaa25969dec5436a159ccb5d663e72aa1fc72ad..2af0bcb257ad214f3e67621a7ac1aaa83b7092d1 100644 (file)
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -2,7 +2,6 @@
  Tokenizer implementing normalisation as used before Nominatim 4 but using
  libICU instead of the PostgreSQL module.
  """
-from collections import Counter
  import itertools
  import json
  import logging
@@ -13,8 +12,8 @@ from nominatim.db.connection import connect
  from nominatim.db.properties import set_property, get_property
  from nominatim.db.utils import CopyBuffer
  from nominatim.db.sql_preprocessor import SQLPreprocessor
+from nominatim.indexer.place_info import PlaceInfo
  from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
-from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
  from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
  
  DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
@@ -36,7 +35,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
      def __init__(self, dsn, data_dir):
          self.dsn = dsn
          self.data_dir = data_dir
-        self.naming_rules = None
+        self.loader = None
          self.term_normalization = None
  
  
@@ -46,9 +45,8 @@ class LegacyICUTokenizer(AbstractTokenizer):
              This copies all necessary data in the project directory to make
              sure the tokenizer remains stable even over updates.
          """
-        loader = ICURuleLoader(config.load_sub_configuration('icu_tokenizer.yaml',
-                                                             config='TOKENIZER_CONFIG'))
-        self.naming_rules = ICUNameProcessorRules(loader=loader)
+        self.loader = ICURuleLoader(config)
+
          self.term_normalization = config.TERM_NORMALIZATION
  
          self._install_php(config.lib_dir.php)
@@ -59,11 +57,13 @@ class LegacyICUTokenizer(AbstractTokenizer):
              self._init_db_tables(config)
  
  
-    def init_from_project(self):
+    def init_from_project(self, config):
          """ Initialise the tokenizer from the project directory.
          """
+        self.loader = ICURuleLoader(config)
+
          with connect(self.dsn) as conn:
-            self.naming_rules = ICUNameProcessorRules(conn=conn)
+            self.loader.load_config_from_db(conn)
              self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
  
  
@@ -81,17 +81,36 @@ class LegacyICUTokenizer(AbstractTokenizer):
              sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
  
  
-    def check_database(self):
+    def check_database(self, config):
          """ Check that the tokenizer is set up correctly.
          """
-        self.init_from_project()
+        self.init_from_project(config)
  
-        if self.naming_rules is None:
+        if self.term_normalization is None:
              return "Configuration for tokenizer 'icu' are missing."
  
          return None
  
  
+    def update_statistics(self):
+        """ Recompute frequencies for all name words.
+        """
+        with connect(self.dsn) as conn:
+            with conn.cursor() as cur:
+                cur.drop_table("word_frequencies")
+                LOG.info("Computing word frequencies")
+                cur.execute("""CREATE TEMP TABLE word_frequencies AS
+                                 SELECT unnest(name_vector) as id, count(*)
+                                 FROM search_name GROUP BY id""")
+                cur.execute("CREATE INDEX ON word_frequencies(id)")
+                LOG.info("Update word table with recomputed frequencies")
+                cur.execute("""UPDATE word
+                               SET info = info || jsonb_build_object('count', count)
+                               FROM word_frequencies WHERE word_id = id""")
+                cur.drop_table("word_frequencies")
+            conn.commit()
+
+
      def name_analyzer(self):
          """ Create a new analyzer for tokenizing names and queries
              using this tokinzer. Analyzers are context managers and should
@@ -107,7 +126,8 @@ class LegacyICUTokenizer(AbstractTokenizer):
  
              Analyzers are not thread-safe. You need to instantiate one per thread.
          """
-        return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
+        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
+                                     self.loader.make_token_analysis())
  
  
      def _install_php(self, phpdir):
@@ -118,7 +138,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
              <?php
              @define('CONST_Max_Word_Frequency', 10000000);
              @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
-            @define('CONST_Transliteration', "{self.naming_rules.search_rules}");
+            @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
              require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
  
  
@@ -127,8 +147,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
              database as database properties.
          """
          with connect(self.dsn) as conn:
-            self.naming_rules.save_rules(conn)
-
+            self.loader.save_config_to_db(conn)
              set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)
  
  
@@ -141,45 +160,6 @@ class LegacyICUTokenizer(AbstractTokenizer):
              sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
              conn.commit()
  
-            LOG.warning("Precomputing word tokens")
-
-            # get partial words and their frequencies
-            words = self._count_partial_terms(conn)
-
-            # copy them back into the word table
-            with CopyBuffer() as copystr:
-                for term, cnt in words.items():
-                    copystr.add('w', term, json.dumps({'count': cnt}))
-
-                with conn.cursor() as cur:
-                    copystr.copy_out(cur, 'word',
-                                     columns=['type', 'word_token', 'info'])
-                    cur.execute("""UPDATE word SET word_id = nextval('seq_word')
-                                   WHERE word_id is null and type = 'w'""")
-
-            conn.commit()
-
-    def _count_partial_terms(self, conn):
-        """ Count the partial terms from the names in the place table.
-        """
-        words = Counter()
-        name_proc = ICUNameProcessor(self.naming_rules)
-
-        with conn.cursor(name="words") as cur:
-            cur.execute(""" SELECT v, count(*) FROM
-                              (SELECT svals(name) as v FROM place)x
-                            WHERE length(v) < 75 GROUP BY v""")
-
-            for name, cnt in cur:
-                terms = set()
-                for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
-                    if ' ' in word:
-                        terms.update(word.split())
-                for term in terms:
-                    words[term] += cnt
-
-        return words
-
  
  class LegacyICUNameAnalyzer(AbstractAnalyzer):
      """ The legacy analyzer uses the ICU library for splitting names.
@@ -188,10 +168,11 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
          normalization.
      """
  
-    def __init__(self, dsn, name_proc):
+    def __init__(self, dsn, sanitizer, token_analysis):
          self.conn = connect(dsn).connection
          self.conn.autocommit = True
-        self.name_processor = name_proc
+        self.sanitizer = sanitizer
+        self.token_analysis = token_analysis
  
          self._cache = _TokenCache()
  
@@ -204,6 +185,19 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
              self.conn = None
  
  
+    def _search_normalized(self, name):
+        """ Return the search token transliteration of the given name.
+        """
+        return self.token_analysis.search.transliterate(name).strip()
+
+
+    def _normalized(self, name):
+        """ Return the normalized version of the given name with all
+            non-relevant information removed.
+        """
+        return self.token_analysis.normalizer.transliterate(name).strip()
+
+
      def get_word_token_info(self, words):
          """ Return token information for the given list of words.
              If a word starts with # it is assumed to be a full name
@@ -219,9 +213,9 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
          partial_tokens = {}
          for word in words:
              if word.startswith('#'):
-                full_tokens[word] = self.name_processor.get_search_normalized(word[1:])
+                full_tokens[word] = self._search_normalized(word[1:])
              else:
-                partial_tokens[word] = self.name_processor.get_search_normalized(word)
+                partial_tokens[word] = self._search_normalized(word)
  
          with self.conn.cursor() as cur:
              cur.execute("""SELECT word_token, word_id
@@ -252,7 +246,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
  
              This function takes minor shortcuts on transliteration.
          """
-        return self.name_processor.get_search_normalized(hnr)
+        return self._search_normalized(hnr)
  
      def update_postcodes_from_db(self):
          """ Update postcode tokens in the word table from the location_postcode
@@ -275,7 +269,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
                      if postcode is None:
                          to_delete.append(word)
                      else:
-                        copystr.add(self.name_processor.get_search_normalized(postcode),
+                        copystr.add(self._search_normalized(postcode),
                                      'P', postcode)
  
                  if to_delete:
@@ -293,7 +287,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
              completely replaced. Otherwise the phrases are added to the
              already existing ones.
          """
-        norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
+        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                              for p in phrases))
  
          with self.conn.cursor() as cur:
@@ -323,7 +317,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
          added = 0
          with CopyBuffer() as copystr:
              for word, cls, typ, oper in to_add:
-                term = self.name_processor.get_search_normalized(word)
+                term = self._search_normalized(word)
                  if term:
                      copystr.add(term, 'S', word,
                                  json.dumps({'class': cls, 'type': typ,
@@ -357,9 +351,21 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
      def add_country_names(self, country_code, names):
          """ Add names for the given country to the search index.
          """
+        # Make sure any name preprocessing for country names applies.
+        info = PlaceInfo({'name': names, 'country_code': country_code,
+                          'rank_address': 4, 'class': 'boundary',
+                          'type': 'administrative'})
+        self._add_country_full_names(country_code,
+                                     self.sanitizer.process_names(info)[0])
+
+
+    def _add_country_full_names(self, country_code, names):
+        """ Add names for the given country from an already sanitized
+            name list.
+        """
          word_tokens = set()
-        for name in self._compute_full_names(names):
-            norm_name = self.name_processor.get_search_normalized(name)
+        for name in names:
+            norm_name = self._search_normalized(name.name)
              if norm_name:
                  word_tokens.add(norm_name)
  
@@ -385,12 +391,12 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
      def process_place(self, place):
          """ Determine tokenizer information about the given place.
  
-            Returns a JSON-serialisable structure that will be handed into
+            Returns a JSON-serializable structure that will be handed into
              the database via the token_info field.
          """
          token_info = _TokenInfo(self._cache)
  
-        names = place.name
+        names, address = self.sanitizer.process_names(place)
  
          if names:
              fulls, partials = self._compute_name_tokens(names)
@@ -398,9 +404,8 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
              token_info.add_names(fulls, partials)
  
              if place.is_country():
-                self.add_country_names(place.country_code, names)
+                self._add_country_full_names(place.country_code, names)
  
-        address = place.address
          if address:
              self._process_place_address(token_info, address)
  
@@ -410,18 +415,18 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
      def _process_place_address(self, token_info, address):
          hnrs = []
          addr_terms = []
-        for key, value in address.items():
-            if key == 'postcode':
-                self._add_postcode(value)
-            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
-                hnrs.append(value)
-            elif key == 'street':
-                token_info.add_street(self._compute_partial_tokens(value))
-            elif key == 'place':
-                token_info.add_place(self._compute_partial_tokens(value))
-            elif not key.startswith('_') and \
-                 key not in ('country', 'full'):
-                addr_terms.append((key, self._compute_partial_tokens(value)))
+        for item in address:
+            if item.kind == 'postcode':
+                self._add_postcode(item.name)
+            elif item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'):
+                hnrs.append(item.name)
+            elif item.kind == 'street':
+                token_info.add_street(self._compute_partial_tokens(item.name))
+            elif item.kind == 'place':
+                token_info.add_place(self._compute_partial_tokens(item.name))
+            elif not item.kind.startswith('_') and \
+                 item.kind not in ('country', 'full'):
+                addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))
  
          if hnrs:
              hnrs = self._split_housenumbers(hnrs)
@@ -430,11 +435,12 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
          if addr_terms:
              token_info.add_address_terms(addr_terms)
  
+
      def _compute_partial_tokens(self, name):
          """ Normalize the given term, split it into partial words and return
              then token list for them.
          """
-        norm_name = self.name_processor.get_search_normalized(name)
+        norm_name = self._search_normalized(name)
  
          tokens = []
          need_lookup = []
@@ -457,28 +463,34 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
  
          return tokens
  
+
      def _compute_name_tokens(self, names):
          """ Computes the full name and partial name tokens for the given
              dictionary of names.
          """
-        full_names = self._compute_full_names(names)
          full_tokens = set()
          partial_tokens = set()
  
-        for name in full_names:
-            norm_name = self.name_processor.get_normalized(name)
-            full, part = self._cache.names.get(norm_name, (None, None))
+        for name in names:
+            analyzer_id = name.get_attr('analyzer')
+            norm_name = self._normalized(name.name)
+            if analyzer_id is None:
+                token_id = norm_name
+            else:
+                token_id = f'{norm_name}@{analyzer_id}'
+
+            full, part = self._cache.names.get(token_id, (None, None))
              if full is None:
-                variants = self.name_processor.get_variants_ascii(norm_name)
+                variants = self.token_analysis.analysis[analyzer_id].get_variants_ascii(norm_name)
                  if not variants:
                      continue
  
                  with self.conn.cursor() as cur:
                      cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
-                                (norm_name, variants))
+                                (token_id, variants))
                      full, part = cur.fetchone()
  
-                self._cache.names[norm_name] = (full, part)
+                self._cache.names[token_id] = (full, part)
  
              full_tokens.add(full)
              partial_tokens.update(part)
@@ -486,23 +498,6 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
          return full_tokens, partial_tokens
  
  
-    @staticmethod
-    def _compute_full_names(names):
-        """ Return the set of all full name word ids to be used with the
-            given dictionary of names.
-        """
-        full_names = set()
-        for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
-            if name:
-                full_names.add(name)
-
-                brace_idx = name.find('(')
-                if brace_idx >= 0:
-                    full_names.add(name[:brace_idx].strip())
-
-        return full_names
-
-
      def _add_postcode(self, postcode):
          """ Make sure the normalized postcode is present in the word table.
          """
@@ -510,7 +505,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
              postcode = self.normalize_postcode(postcode)
  
              if postcode not in self._cache.postcodes:
-                term = self.name_processor.get_search_normalized(postcode)
+                term = self._search_normalized(postcode)
                  if not term:
                      return