switch word tokens to new word table layout

[nominatim.git] / nominatim / tokenizer / legacy_icu_tokenizer.py
diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py

index 6bf409cca3ab3674b41605b06e8dfe49eda40e41..14fa5b609456c51ee4a7f9a35f6e5bf9908636c7 100644 (file)
--- a/nominatim/tokenizer/legacy_icu_tokenizer.py
+++ b/nominatim/tokenizer/legacy_icu_tokenizer.py
@@ -9,8 +9,6 @@ import re
  from textwrap import dedent
  from pathlib import Path
  
-import psycopg2.extras
-
  from nominatim.db.connection import connect
  from nominatim.db.properties import set_property, get_property
  from nominatim.db.utils import CopyBuffer
@@ -76,13 +74,11 @@ class LegacyICUTokenizer:
              self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)
  
  
-    def finalize_import(self, config):
+    def finalize_import(self, _):
          """ Do any required postprocessing to make the tokenizer data ready
              for use.
          """
-        with connect(self.dsn) as conn:
-            sqlp = SQLPreprocessor(conn, config)
-            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
+        pass
  
  
      def update_sql_functions(self, config):
@@ -123,18 +119,17 @@ class LegacyICUTokenizer:
          """
          return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
  
-    # pylint: disable=missing-format-attribute
+
      def _install_php(self, phpdir):
          """ Install the php script for the tokenizer.
          """
          php_file = self.data_dir / "tokenizer.php"
-        php_file.write_text(dedent("""\
+        php_file.write_text(dedent(f"""\
              <?php
-            @define('CONST_Max_Word_Frequency', {0.max_word_frequency});
-            @define('CONST_Term_Normalization_Rules', "{0.term_normalization}");
-            @define('CONST_Transliteration', "{0.naming_rules.search_rules}");
-            require_once('{1}/tokenizer/legacy_icu_tokenizer.php');
-            """.format(self, phpdir)))
+            @define('CONST_Max_Word_Frequency', {self.max_word_frequency});
+            @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
+            @define('CONST_Transliteration', "{self.naming_rules.search_rules}");
+            require_once('{phpdir}/tokenizer/legacy_icu_tokenizer.php');"""))
  
  
      def _save_config(self, config):
@@ -154,7 +149,7 @@ class LegacyICUTokenizer:
          """
          with connect(self.dsn) as conn:
              sqlp = SQLPreprocessor(conn, config)
-            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
+            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
              conn.commit()
  
              LOG.warning("Precomputing word tokens")
@@ -163,7 +158,9 @@ class LegacyICUTokenizer:
              words = Counter()
              name_proc = ICUNameProcessor(self.naming_rules)
              with conn.cursor(name="words") as cur:
-                cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")
+                cur.execute(""" SELECT v, count(*) FROM
+                                  (SELECT svals(name) as v FROM place)x
+                                WHERE length(v) < 75 GROUP BY v""")
  
                  for name, cnt in cur:
                      terms = set()
@@ -175,14 +172,14 @@ class LegacyICUTokenizer:
  
              # copy them back into the word table
              with CopyBuffer() as copystr:
-                for args in words.items():
-                    copystr.add(*args)
+                for k, v in words.items():
+                    copystr.add('w', k, {'count': v})
  
                  with conn.cursor() as cur:
                      copystr.copy_out(cur, 'word',
-                                     columns=['word_token', 'search_name_count'])
+                                     columns=['type', 'word_token', 'info'])
                      cur.execute("""UPDATE word SET word_id = nextval('seq_word')
-                                   WHERE word_id is null""")
+                                   WHERE word_id is null and type = 'w'""")
  
              conn.commit()
  
@@ -229,22 +226,26 @@ class LegacyICUNameAnalyzer:
              The function is used for testing and debugging only
              and not necessarily efficient.
          """
-        tokens = {}
+        full_tokens = {}
+        partial_tokens = {}
          for word in words:
              if word.startswith('#'):
-                tokens[word] = ' ' + self.name_processor.get_search_normalized(word[1:])
+                full_tokens[word] = self.name_processor.get_search_normalized(word[1:])
              else:
-                tokens[word] = self.name_processor.get_search_normalized(word)
+                partial_tokens[word] = self.name_processor.get_search_normalized(word)
  
          with self.conn.cursor() as cur:
-            cur.execute("""SELECT word_token, word_id
-                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
-                           WHERE word_token = t.term
-                                 and class is null and country_code is null""",
-                        (list(tokens.values()), ))
+            cur.execute("""(SELECT word_token, word_id
+                            FROM word WHERE word_token = ANY(%s) and type = 'W')
+                           UNION
+                           (SELECT word_token, word_id
+                            FROM word WHERE word_token = ANY(%s) and type = 'w')""",
+                        (list(full_tokens.values()),
+                         list(partial_tokens.values())))
              ids = {r[0]: r[1] for r in cur}
  
-        return [(k, v, ids.get(v, None)) for k, v in tokens.items()]
+        return [(k, v, ids.get(v, None)) for k, v in full_tokens.items()] \
+               + [(k, v, ids.get(v, None)) for k, v in partial_tokens.items()]
  
  
      @staticmethod
@@ -276,8 +277,7 @@ class LegacyICUNameAnalyzer:
                              (SELECT pc, word FROM
                                (SELECT distinct(postcode) as pc FROM location_postcode) p
                                FULL JOIN
-                              (SELECT word FROM word
-                                WHERE class ='place' and type = 'postcode') w
+                              (SELECT info->>'postcode' as word FROM word WHERE type = 'P') w
                                ON pc = word) x
                             WHERE pc is null or word is null""")
  
@@ -286,24 +286,23 @@ class LegacyICUNameAnalyzer:
                      if postcode is None:
                          to_delete.append(word)
                      else:
-                        copystr.add(
-                            postcode,
-                            ' ' + self.name_processor.get_search_normalized(postcode),
-                            'place', 'postcode', 0)
+                        copystr.add(self.name_processor.get_search_normalized(postcode),
+                                    'P', {'postcode': postcode})
  
                  if to_delete:
                      cur.execute("""DELETE FROM WORD
-                                   WHERE class ='place' and type = 'postcode'
-                                         and word = any(%s)
+                                   WHERE type ='P' and info->>'postcode' = any(%s)
                                  """, (to_delete, ))
  
                  copystr.copy_out(cur, 'word',
-                                 columns=['word', 'word_token', 'class', 'type',
-                                          'search_name_count'])
+                                 columns=['word_token', 'type', 'info'])
  
  
      def update_special_phrases(self, phrases, should_replace):
          """ Replace the search index for special phrases with the new phrases.
+            If `should_replace` is True, then the previous set of phrases will be
+            completely replaced. Otherwise the phrases are added to the
+            already existing ones.
          """
          norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
                              for p in phrases))
@@ -311,11 +310,10 @@ class LegacyICUNameAnalyzer:
          with self.conn.cursor() as cur:
              # Get the old phrases.
              existing_phrases = set()
-            cur.execute("""SELECT word, class, type, operator FROM word
-                           WHERE class != 'place'
-                                 OR (type != 'house' AND type != 'postcode')""")
-            for label, cls, typ, oper in cur:
-                existing_phrases.add((label, cls, typ, oper or '-'))
+            cur.execute("SELECT info FROM word WHERE type = 'S'")
+            for (info, ) in cur:
+                existing_phrases.add((info['word'], info['class'], info['type'],
+                                      info.get('op') or '-'))
  
              added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
              if should_replace:
@@ -338,13 +336,13 @@ class LegacyICUNameAnalyzer:
              for word, cls, typ, oper in to_add:
                  term = self.name_processor.get_search_normalized(word)
                  if term:
-                    copystr.add(word, ' ' + term, cls, typ,
-                                oper if oper in ('in', 'near')  else None, 0)
+                    copystr.add(term, 'S',
+                                {'word': word, 'class': cls, 'type': typ,
+                                 'op': oper if oper in ('in', 'near') else None})
                      added += 1
  
              copystr.copy_out(cursor, 'word',
-                             columns=['word', 'word_token', 'class', 'type',
-                                      'operator', 'search_name_count'])
+                             columns=['word_token', 'type', 'info'])
  
          return added
  
@@ -357,12 +355,12 @@ class LegacyICUNameAnalyzer:
          to_delete = existing_phrases - new_phrases
  
          if to_delete:
-            psycopg2.extras.execute_values(
-                cursor,
+            cursor.execute_values(
                  """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
-                    WHERE word = name and class = in_class and type = in_type
-                          and ((op = '-' and operator is null) or op = operator)""",
-                to_delete)
+                    WHERE info->>'word' = name
+                          and info->>'class' = in_class and info->>'type' = in_type
+                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
+                """, to_delete)
  
          return len(to_delete)
  
@@ -372,21 +370,27 @@ class LegacyICUNameAnalyzer:
          """
          word_tokens = set()
          for name in self._compute_full_names(names):
-            if name:
-                word_tokens.add(' ' + self.name_processor.get_search_normalized(name))
+            norm_name = self.name_processor.get_search_normalized(name)
+            if norm_name:
+                word_tokens.add(norm_name)
  
          with self.conn.cursor() as cur:
              # Get existing names
-            cur.execute("SELECT word_token FROM word WHERE country_code = %s",
+            cur.execute("""SELECT word_token FROM word
+                            WHERE type = 'C' and info->>'cc'= %s""",
                          (country_code, ))
              word_tokens.difference_update((t[0] for t in cur))
  
+            # Only add those names that are not yet in the list.
              if word_tokens:
-                cur.execute("""INSERT INTO word (word_id, word_token, country_code,
-                                                 search_name_count)
-                               (SELECT nextval('seq_word'), token, '{}', 0
+                cur.execute("""INSERT INTO word (word_token, type, info)
+                               (SELECT token, 'C', json_build_object('cc', %s)
                                  FROM unnest(%s) as token)
-                            """.format(country_code), (list(word_tokens),))
+                            """, (country_code, list(word_tokens)))
+
+            # No names are deleted at the moment.
+            # If deletion is made possible, then the static names from the
+            # initial 'country_name' table should be kept.
  
  
      def process_place(self, place):
@@ -409,33 +413,36 @@ class LegacyICUNameAnalyzer:
                  self.add_country_names(country_feature.lower(), names)
  
          address = place.get('address')
-
          if address:
-            hnrs = []
-            addr_terms = []
-            for key, value in address.items():
-                if key == 'postcode':
-                    self._add_postcode(value)
-                elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
-                    hnrs.append(value)
-                elif key == 'street':
-                    token_info.add_street(*self._compute_name_tokens({'name': value}))
-                elif key == 'place':
-                    token_info.add_place(*self._compute_name_tokens({'name': value}))
-                elif not key.startswith('_') and \
-                     key not in ('country', 'full'):
-                    addr_terms.append((key, *self._compute_name_tokens({'name': value})))
-
-            if hnrs:
-                hnrs = self._split_housenumbers(hnrs)
-                token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
-
-            if addr_terms:
-                token_info.add_address_terms(addr_terms)
+            self._process_place_address(token_info, address)
  
          return token_info.data
  
  
+    def _process_place_address(self, token_info, address):
+        hnrs = []
+        addr_terms = []
+        for key, value in address.items():
+            if key == 'postcode':
+                self._add_postcode(value)
+            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
+                hnrs.append(value)
+            elif key == 'street':
+                token_info.add_street(*self._compute_name_tokens({'name': value}))
+            elif key == 'place':
+                token_info.add_place(*self._compute_name_tokens({'name': value}))
+            elif not key.startswith('_') and \
+                 key not in ('country', 'full'):
+                addr_terms.append((key, *self._compute_name_tokens({'name': value})))
+
+        if hnrs:
+            hnrs = self._split_housenumbers(hnrs)
+            token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
+
+        if addr_terms:
+            token_info.add_address_terms(addr_terms)
+
+
      def _compute_name_tokens(self, names):
          """ Computes the full name and partial name tokens for the given
              dictionary of names.
@@ -495,14 +502,13 @@ class LegacyICUNameAnalyzer:
  
                  with self.conn.cursor() as cur:
                      # no word_id needed for postcodes
-                    cur.execute("""INSERT INTO word (word, word_token, class, type,
-                                                     search_name_count)
-                                   (SELECT pc, %s, 'place', 'postcode', 0
+                    cur.execute("""INSERT INTO word (word_token, type, info)
+                                   (SELECT %s, 'P', json_build_object('postcode', pc)
                                      FROM (VALUES (%s)) as v(pc)
                                      WHERE NOT EXISTS
                                       (SELECT * FROM word
-                                      WHERE word = pc and class='place' and type='postcode'))
-                                """, (' ' + term, postcode))
+                                      WHERE type = 'P' and info->>postcode = pc))
+                                """, (term, postcode))
                  self._cache.postcodes.add(postcode)
  
  
@@ -593,7 +599,8 @@ class _TokenCache:
  
      def get_hnr_tokens(self, conn, terms):
          """ Get token ids for a list of housenumbers, looking them up in the
-            database if necessary.
+            database if necessary. `terms` is an iterable of normalized
+            housenumbers.
          """
          tokens = []
          askdb = []