git.openstreetmap.org Git - nominatim.git/commitdiff
reinstate word column in icu word table
authorSarah Hoffmann <lonvia@denofr.de>
Sun, 25 Jul 2021 13:08:11 +0000 (15:08 +0200)
committerSarah Hoffmann <lonvia@denofr.de>
Wed, 28 Jul 2021 09:31:47 +0000 (11:31 +0200)
PostgreSQL is very bad at creating statistics for jsonb
columns. The result is that the query planner tends to
use JIT for queries with a WHERE clause over 'info' even
when there is an index.

lib-php/tokenizer/legacy_icu_tokenizer.php
lib-sql/tokenizer/icu_tokenizer_tables.sql
lib-sql/tokenizer/legacy_icu_tokenizer.sql
nominatim/tokenizer/legacy_icu_tokenizer.py
test/bdd/steps/steps_db_ops.py
test/python/mock_icu_word_table.py

index 9bd9828cb0703ce388108713856c6f176c15feb4..7a900e5a44f62c1abb21339465d5277e3fa34afc 100644 (file)
@@ -145,8 +145,7 @@ class Tokenizer
     private function addTokensFromDB(&$oValidTokens, $aTokens, $sNormQuery)
     {
         // Check which tokens we have, get the ID numbers
-        $sSQL = 'SELECT word_id, word_token, type,';
-        $sSQL .= "      info->>'cc' as country, info->>'postcode' as postcode,";
+        $sSQL = 'SELECT word_id, word_token, type, word,';
         $sSQL .= "      info->>'op' as operator,";
         $sSQL .= "      info->>'class' as class, info->>'type' as ctype,";
         $sSQL .= "      info->>'count' as count";
@@ -163,11 +162,14 @@ class Tokenizer
 
             switch ($aWord['type']) {
                 case 'C':  // country name tokens
-                    if ($aWord['country'] !== null
+                    if ($aWord['word'] !== null
                         && (!$this->aCountryRestriction
-                            || in_array($aWord['country'], $this->aCountryRestriction))
+                            || in_array($aWord['word'], $this->aCountryRestriction))
                     ) {
-                        $oValidTokens->addToken($sTok, new Token\Country($iId, $aWord['country']));
+                        $oValidTokens->addToken(
+                            $sTok,
+                            new Token\Country($iId, $aWord['word'])
+                        );
                     }
                     break;
                 case 'H':  // house number tokens
@@ -177,12 +179,15 @@ class Tokenizer
                     // Postcodes are not normalized, so they may have content
                     // that makes SQL injection possible. Reject postcodes
                     // that would need special escaping.
-                    if ($aWord['postcode'] !== null
-                        && pg_escape_string($aWord['postcode']) == $aWord['postcode']
+                    if ($aWord['word'] !== null
+                        && pg_escape_string($aWord['word']) == $aWord['word']
                     ) {
-                        $sNormPostcode = $this->normalizeString($aWord['postcode']);
+                        $sNormPostcode = $this->normalizeString($aWord['word']);
                         if (strpos($sNormQuery, $sNormPostcode) !== false) {
-                            $oValidTokens->addToken($sTok, new Token\Postcode($iId, $aWord['postcode'], null));
+                            $oValidTokens->addToken(
+                                $sTok,
+                                new Token\Postcode($iId, $aWord['word'], null)
+                            );
                         }
                     }
                     break;
@@ -192,7 +197,7 @@ class Tokenizer
                             $iId,
                             $aWord['class'],
                             $aWord['ctype'],
-                            (isset($aWord['op'])) ? Operator::NEAR : Operator::NONE
+                            (isset($aWord['operator'])) ? Operator::NEAR : Operator::NONE
                         ));
                     }
                     break;
index 912ba768ea1121990e412ad8ec19dfcff57cca70..7ec3c6f891c958d5eb43de3df91faba6ea30a37c 100644 (file)
@@ -3,6 +3,7 @@ CREATE TABLE word (
   word_id INTEGER,
   word_token text NOT NULL,
   type text NOT NULL,
+  word text,
   info jsonb
 ) {{db.tablespace.search_data}};
 
@@ -10,15 +11,15 @@ CREATE INDEX idx_word_word_token ON word
     USING BTREE (word_token) {{db.tablespace.search_index}};
 -- Used when updating country names from the boundary relation.
 CREATE INDEX idx_word_country_names ON word
-    USING btree((info->>'cc')) {{db.tablespace.address_index}}
+    USING btree(word) {{db.tablespace.address_index}}
     WHERE type = 'C';
 -- Used when inserting new postcodes on updates.
 CREATE INDEX idx_word_postcodes ON word
-    USING btree((info->>'postcode')) {{db.tablespace.address_index}}
+    USING btree(word) {{db.tablespace.address_index}}
     WHERE type = 'P';
 -- Used when inserting full words.
 CREATE INDEX idx_word_full_word ON word
-    USING btree((info->>'word')) {{db.tablespace.address_index}}
+    USING btree(word) {{db.tablespace.address_index}}
     WHERE type = 'W';
 
 GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
index e021bf8bc45e278cd25f496e4f9cfc8917a4ac89..ffe6648c38e959c6279efb2d1898d835514f32a7 100644 (file)
@@ -98,13 +98,13 @@ DECLARE
   term_count INTEGER;
 BEGIN
   SELECT min(word_id) INTO full_token
-    FROM word WHERE info->>'word' = norm_term and type = 'W';
+    FROM word WHERE word = norm_term and type = 'W';
 
   IF full_token IS NULL THEN
     full_token := nextval('seq_word');
-    INSERT INTO word (word_id, word_token, type, info)
-      SELECT full_token, lookup_term, 'W',
-             json_build_object('word', norm_term, 'count', 0)
+    INSERT INTO word (word_id, word_token, type, word, info)
+      SELECT full_token, lookup_term, 'W', norm_term,
+             json_build_object('count', 0)
         FROM unnest(lookup_terms) as lookup_term;
   END IF;
 
index 3d7d752e7d45675fa70f245cb30c574c087d8603..1f8096ff2848df94f326578496e1009666960d6f 100644 (file)
@@ -278,7 +278,7 @@ class LegacyICUNameAnalyzer:
                             (SELECT pc, word FROM
                               (SELECT distinct(postcode) as pc FROM location_postcode) p
                               FULL JOIN
-                              (SELECT info->>'postcode' as word FROM word WHERE type = 'P') w
+                              (SELECT word FROM word WHERE type = 'P') w
                               ON pc = word) x
                            WHERE pc is null or word is null""")
 
@@ -288,15 +288,15 @@ class LegacyICUNameAnalyzer:
                         to_delete.append(word)
                     else:
                         copystr.add(self.name_processor.get_search_normalized(postcode),
-                                    'P', json.dumps({'postcode': postcode}))
+                                    'P', postcode)
 
                 if to_delete:
                     cur.execute("""DELETE FROM WORD
-                                   WHERE type ='P' and info->>'postcode' = any(%s)
+                                   WHERE type ='P' and word = any(%s)
                                 """, (to_delete, ))
 
                 copystr.copy_out(cur, 'word',
-                                 columns=['word_token', 'type', 'info'])
+                                 columns=['word_token', 'type', 'word'])
 
 
     def update_special_phrases(self, phrases, should_replace):
@@ -311,9 +311,9 @@ class LegacyICUNameAnalyzer:
         with self.conn.cursor() as cur:
             # Get the old phrases.
             existing_phrases = set()
-            cur.execute("SELECT info FROM word WHERE type = 'S'")
-            for (info, ) in cur:
-                existing_phrases.add((info['word'], info['class'], info['type'],
+            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
+            for word, info in cur:
+                existing_phrases.add((word, info['class'], info['type'],
                                       info.get('op') or '-'))
 
             added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
@@ -337,13 +337,13 @@ class LegacyICUNameAnalyzer:
             for word, cls, typ, oper in to_add:
                 term = self.name_processor.get_search_normalized(word)
                 if term:
-                    copystr.add(term, 'S',
-                                json.dumps({'word': word, 'class': cls, 'type': typ,
+                    copystr.add(term, 'S', word,
+                                json.dumps({'class': cls, 'type': typ,
                                             'op': oper if oper in ('in', 'near') else None}))
                     added += 1
 
             copystr.copy_out(cursor, 'word',
-                             columns=['word_token', 'type', 'info'])
+                             columns=['word_token', 'type', 'word', 'info'])
 
         return added
 
@@ -358,7 +358,7 @@ class LegacyICUNameAnalyzer:
         if to_delete:
             cursor.execute_values(
                 """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
-                    WHERE info->>'word' = name
+                    WHERE type = 'S' and word = name
                           and info->>'class' = in_class and info->>'type' = in_type
                           and ((op = '-' and info->>'op' is null) or op = info->>'op')
                 """, to_delete)
@@ -378,14 +378,14 @@ class LegacyICUNameAnalyzer:
         with self.conn.cursor() as cur:
             # Get existing names
             cur.execute("""SELECT word_token FROM word
-                            WHERE type = 'C' and info->>'cc'= %s""",
+                            WHERE type = 'C' and word = %s""",
                         (country_code, ))
             word_tokens.difference_update((t[0] for t in cur))
 
             # Only add those names that are not yet in the list.
             if word_tokens:
-                cur.execute("""INSERT INTO word (word_token, type, info)
-                               (SELECT token, 'C', json_build_object('cc', %s)
+                cur.execute("""INSERT INTO word (word_token, type, word)
+                               (SELECT token, 'C', %s
                                 FROM unnest(%s) as token)
                             """, (country_code, list(word_tokens)))
 
@@ -503,12 +503,11 @@ class LegacyICUNameAnalyzer:
 
                 with self.conn.cursor() as cur:
                     # no word_id needed for postcodes
-                    cur.execute("""INSERT INTO word (word_token, type, info)
-                                   (SELECT %s, 'P', json_build_object('postcode', pc)
-                                    FROM (VALUES (%s)) as v(pc)
+                    cur.execute("""INSERT INTO word (word_token, type, word)
+                                   (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
                                     WHERE NOT EXISTS
                                      (SELECT * FROM word
-                                      WHERE type = 'P' and info->>'postcode' = pc))
+                                      WHERE type = 'P' and word = pc))
                                 """, (term, postcode))
                 self._cache.postcodes.add(postcode)
 
index be2789f326c49c47026bd4113d1aec45f0fa12fd..ac61fc67356aa8ab04274fe69bf9b28f0eddeffd 100644 (file)
@@ -266,22 +266,6 @@ def check_location_postcode(context):
 
             db_row.assert_row(row, ('country', 'postcode'))
 
-@then("word contains(?P<exclude> not)?")
-def check_word_table(context, exclude):
-    """ Check the contents of the word table. Each row represents a table row
-        and all data must match. Data not present in the expected table, may
-        be arbitry. The rows are identified via all given columns.
-    """
-    with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
-        for row in context.table:
-            wheres = ' AND '.join(["{} = %s".format(h) for h in row.headings])
-            cur.execute("SELECT * from word WHERE " + wheres, list(row.cells))
-            if exclude:
-                assert cur.rowcount == 0, "Row still in word table: %s" % '/'.join(values)
-            else:
-                assert cur.rowcount > 0, "Row not in word table: %s" % '/'.join(values)
-
-
 @then("there are(?P<exclude> no)? word tokens for postcodes (?P<postcodes>.*)")
 def check_word_table_for_postcodes(context, exclude, postcodes):
     """ Check that the tokenizer produces postcode tokens for the given
@@ -297,8 +281,7 @@ def check_word_table_for_postcodes(context, exclude, postcodes):
 
     with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
         if nctx.tokenizer == 'legacy_icu':
-            cur.execute("""SELECT info->>'postcode' FROM word
-                           WHERE type = 'P' and info->>'postcode' = any(%s)""",
+            cur.execute("SELECT word FROM word WHERE type = 'P' and word = any(%s)",
                         (plist,))
         else:
             cur.execute("""SELECT word FROM word WHERE word = any(%s)
index 3d457d0bd99dbb073fc1923bf936ccb882bd4132..cde5e770990dd9655e9708d611ae4d070b5675ed 100644 (file)
@@ -12,16 +12,16 @@ class MockIcuWordTable:
             cur.execute("""CREATE TABLE word (word_id INTEGER,
                                               word_token text NOT NULL,
                                               type text NOT NULL,
+                                              word text,
                                               info jsonb)""")
 
         conn.commit()
 
     def add_special(self, word_token, word, cls, typ, oper):
         with self.conn.cursor() as cur:
-            cur.execute("""INSERT INTO word (word_token, type, info)
-                              VALUES (%s, 'S',
-                                      json_build_object('word', %s,
-                                                        'class', %s,
+            cur.execute("""INSERT INTO word (word_token, type, word, info)
+                              VALUES (%s, 'S', %s,
+                                      json_build_object('class', %s,
                                                         'type', %s,
                                                         'op', %s))
                         """, (word_token, word, cls, typ, oper))
@@ -30,16 +30,16 @@ class MockIcuWordTable:
 
     def add_country(self, country_code, word_token):
         with self.conn.cursor() as cur:
-            cur.execute("""INSERT INTO word (word_token, type, info)
-                           VALUES(%s, 'C', json_build_object('cc', %s))""",
+            cur.execute("""INSERT INTO word (word_token, type, word)
+                           VALUES(%s, 'C', %s)""",
                         (word_token, country_code))
         self.conn.commit()
 
 
     def add_postcode(self, word_token, postcode):
         with self.conn.cursor() as cur:
-            cur.execute("""INSERT INTO word (word_token, type, info)
-                              VALUES (%s, 'P', json_build_object('postcode', %s))
+            cur.execute("""INSERT INTO word (word_token, type, word)
+                              VALUES (%s, 'P', %s)
                         """, (word_token, postcode))
         self.conn.commit()
 
@@ -56,8 +56,8 @@ class MockIcuWordTable:
 
     def get_special(self):
         with self.conn.cursor() as cur:
-            cur.execute("SELECT word_token, info FROM word WHERE type = 'S'")
-            result = set(((row[0], row[1]['word'], row[1]['class'],
+            cur.execute("SELECT word_token, info, word FROM word WHERE type = 'S'")
+            result = set(((row[0], row[2], row[1]['class'],
                            row[1]['type'], row[1]['op']) for row in cur))
             assert len(result) == cur.rowcount, "Word table has duplicates."
             return result
@@ -65,7 +65,7 @@ class MockIcuWordTable:
 
     def get_country(self):
         with self.conn.cursor() as cur:
-            cur.execute("SELECT info->>'cc', word_token FROM word WHERE type = 'C'")
+            cur.execute("SELECT word, word_token FROM word WHERE type = 'C'")
             result = set((tuple(row) for row in cur))
             assert len(result) == cur.rowcount, "Word table has duplicates."
             return result
@@ -73,7 +73,7 @@ class MockIcuWordTable:
 
     def get_postcodes(self):
         with self.conn.cursor() as cur:
-            cur.execute("SELECT info->>'postcode' FROM word WHERE type = 'P'")
+            cur.execute("SELECT word FROM word WHERE type = 'P'")
             return set((row[0] for row in cur))