git.openstreetmap.org Git - nominatim.git/commitdiff
reinstate word column in icu word table
authorSarah Hoffmann <lonvia@denofr.de>
Sun, 25 Jul 2021 13:08:11 +0000 (15:08 +0200)
committerSarah Hoffmann <lonvia@denofr.de>
Wed, 28 Jul 2021 09:31:47 +0000 (11:31 +0200)
PostgreSQL is very bad at creating statistics for jsonb
columns. The result is that the query planner tends to
use JIT for queries with a WHERE clause over 'info' even
when there is an index.

lib-php/tokenizer/legacy_icu_tokenizer.php
lib-sql/tokenizer/icu_tokenizer_tables.sql
lib-sql/tokenizer/legacy_icu_tokenizer.sql
nominatim/tokenizer/legacy_icu_tokenizer.py
test/bdd/steps/steps_db_ops.py
test/python/mock_icu_word_table.py

index 9bd9828cb0703ce388108713856c6f176c15feb4..7a900e5a44f62c1abb21339465d5277e3fa34afc 100644 (file)
@@ -145,8 +145,7 @@ class Tokenizer
     private function addTokensFromDB(&$oValidTokens, $aTokens, $sNormQuery)
     {
         // Check which tokens we have, get the ID numbers
-        $sSQL = 'SELECT word_id, word_token, type,';
-        $sSQL .= "      info->>'cc' as country, info->>'postcode' as postcode,";
+        $sSQL = 'SELECT word_id, word_token, type, word,';
         $sSQL .= "      info->>'op' as operator,";
         $sSQL .= "      info->>'class' as class, info->>'type' as ctype,";
         $sSQL .= "      info->>'count' as count";
@@ -163,11 +162,14 @@ class Tokenizer
 
             switch ($aWord['type']) {
                 case 'C':  // country name tokens
-                    if ($aWord['country'] !== null
+                    if ($aWord['word'] !== null
                         && (!$this->aCountryRestriction
-                            || in_array($aWord['country'], $this->aCountryRestriction))
+                            || in_array($aWord['word'], $this->aCountryRestriction))
                     ) {
-                        $oValidTokens->addToken($sTok, new Token\Country($iId, $aWord['country']));
+                        $oValidTokens->addToken(
+                            $sTok,
+                            new Token\Country($iId, $aWord['word'])
+                        );
                     }
                     break;
                 case 'H':  // house number tokens
@@ -177,12 +179,15 @@ class Tokenizer
                     // Postcodes are not normalized, so they may have content
                     // that makes SQL injection possible. Reject postcodes
                     // that would need special escaping.
-                    if ($aWord['postcode'] !== null
-                        && pg_escape_string($aWord['postcode']) == $aWord['postcode']
+                    if ($aWord['word'] !== null
+                        && pg_escape_string($aWord['word']) == $aWord['word']
                     ) {
-                        $sNormPostcode = $this->normalizeString($aWord['postcode']);
+                        $sNormPostcode = $this->normalizeString($aWord['word']);
                         if (strpos($sNormQuery, $sNormPostcode) !== false) {
-                            $oValidTokens->addToken($sTok, new Token\Postcode($iId, $aWord['postcode'], null));
+                            $oValidTokens->addToken(
+                                $sTok,
+                                new Token\Postcode($iId, $aWord['word'], null)
+                            );
                         }
                     }
                     break;
@@ -192,7 +197,7 @@ class Tokenizer
                             $iId,
                             $aWord['class'],
                             $aWord['ctype'],
-                            (isset($aWord['op'])) ? Operator::NEAR : Operator::NONE
+                            (isset($aWord['operator'])) ? Operator::NEAR : Operator::NONE
                         ));
                     }
                     break;
index 912ba768ea1121990e412ad8ec19dfcff57cca70..7ec3c6f891c958d5eb43de3df91faba6ea30a37c 100644 (file)
@@ -3,6 +3,7 @@ CREATE TABLE word (
   word_id INTEGER,
   word_token text NOT NULL,
   type text NOT NULL,
+  word text,
   info jsonb
 ) {{db.tablespace.search_data}};
 
@@ -10,15 +11,15 @@ CREATE INDEX idx_word_word_token ON word
     USING BTREE (word_token) {{db.tablespace.search_index}};
 -- Used when updating country names from the boundary relation.
 CREATE INDEX idx_word_country_names ON word
-    USING btree((info->>'cc')) {{db.tablespace.address_index}}
+    USING btree(word) {{db.tablespace.address_index}}
     WHERE type = 'C';
 -- Used when inserting new postcodes on updates.
 CREATE INDEX idx_word_postcodes ON word
-    USING btree((info->>'postcode')) {{db.tablespace.address_index}}
+    USING btree(word) {{db.tablespace.address_index}}
     WHERE type = 'P';
 -- Used when inserting full words.
 CREATE INDEX idx_word_full_word ON word
-    USING btree((info->>'word')) {{db.tablespace.address_index}}
+    USING btree(word) {{db.tablespace.address_index}}
     WHERE type = 'W';
 
 GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
index e021bf8bc45e278cd25f496e4f9cfc8917a4ac89..ffe6648c38e959c6279efb2d1898d835514f32a7 100644 (file)
@@ -98,13 +98,13 @@ DECLARE
   term_count INTEGER;
 BEGIN
   SELECT min(word_id) INTO full_token
-    FROM word WHERE info->>'word' = norm_term and type = 'W';
+    FROM word WHERE word = norm_term and type = 'W';
 
   IF full_token IS NULL THEN
     full_token := nextval('seq_word');
-    INSERT INTO word (word_id, word_token, type, info)
-      SELECT full_token, lookup_term, 'W',
-             json_build_object('word', norm_term, 'count', 0)
+    INSERT INTO word (word_id, word_token, type, word, info)
+      SELECT full_token, lookup_term, 'W', norm_term,
+             json_build_object('count', 0)
         FROM unnest(lookup_terms) as lookup_term;
   END IF;
 
index 3d7d752e7d45675fa70f245cb30c574c087d8603..1f8096ff2848df94f326578496e1009666960d6f 100644 (file)
@@ -278,7 +278,7 @@ class LegacyICUNameAnalyzer:
                             (SELECT pc, word FROM
                               (SELECT distinct(postcode) as pc FROM location_postcode) p
                               FULL JOIN
-                              (SELECT info->>'postcode' as word FROM word WHERE type = 'P') w
+                              (SELECT word FROM word WHERE type = 'P') w
                               ON pc = word) x
                            WHERE pc is null or word is null""")
 
@@ -288,15 +288,15 @@ class LegacyICUNameAnalyzer:
                         to_delete.append(word)
                     else:
                         copystr.add(self.name_processor.get_search_normalized(postcode),
-                                    'P', json.dumps({'postcode': postcode}))
+                                    'P', postcode)
 
                 if to_delete:
                     cur.execute("""DELETE FROM WORD
-                                   WHERE type ='P' and info->>'postcode' = any(%s)
+                                   WHERE type ='P' and word = any(%s)
                                 """, (to_delete, ))
 
                 copystr.copy_out(cur, 'word',
-                                 columns=['word_token', 'type', 'info'])
+                                 columns=['word_token', 'type', 'word'])
 
 
     def update_special_phrases(self, phrases, should_replace):
@@ -311,9 +311,9 @@ class LegacyICUNameAnalyzer:
         with self.conn.cursor() as cur:
             # Get the old phrases.
             existing_phrases = set()
-            cur.execute("SELECT info FROM word WHERE type = 'S'")
-            for (info, ) in cur:
-                existing_phrases.add((info['word'], info['class'], info['type'],
+            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
+            for word, info in cur:
+                existing_phrases.add((word, info['class'], info['type'],
                                       info.get('op') or '-'))
 
             added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
@@ -337,13 +337,13 @@ class LegacyICUNameAnalyzer:
             for word, cls, typ, oper in to_add:
                 term = self.name_processor.get_search_normalized(word)
                 if term:
-                    copystr.add(term, 'S',
-                                json.dumps({'word': word, 'class': cls, 'type': typ,
+                    copystr.add(term, 'S', word,
+                                json.dumps({'class': cls, 'type': typ,
                                             'op': oper if oper in ('in', 'near') else None}))
                     added += 1
 
             copystr.copy_out(cursor, 'word',
-                             columns=['word_token', 'type', 'info'])
+                             columns=['word_token', 'type', 'word', 'info'])
 
         return added
 
@@ -358,7 +358,7 @@ class LegacyICUNameAnalyzer:
         if to_delete:
             cursor.execute_values(
                 """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
-                    WHERE info->>'word' = name
+                    WHERE type = 'S' and word = name
                           and info->>'class' = in_class and info->>'type' = in_type
                           and ((op = '-' and info->>'op' is null) or op = info->>'op')
                 """, to_delete)
@@ -378,14 +378,14 @@ class LegacyICUNameAnalyzer:
         with self.conn.cursor() as cur:
             # Get existing names
             cur.execute("""SELECT word_token FROM word
-                            WHERE type = 'C' and info->>'cc'= %s""",
+                            WHERE type = 'C' and word = %s""",
                         (country_code, ))
             word_tokens.difference_update((t[0] for t in cur))
 
             # Only add those names that are not yet in the list.
             if word_tokens:
-                cur.execute("""INSERT INTO word (word_token, type, info)
-                               (SELECT token, 'C', json_build_object('cc', %s)
+                cur.execute("""INSERT INTO word (word_token, type, word)
+                               (SELECT token, 'C', %s
                                 FROM unnest(%s) as token)
                             """, (country_code, list(word_tokens)))
 
@@ -503,12 +503,11 @@ class LegacyICUNameAnalyzer:
 
                 with self.conn.cursor() as cur:
                     # no word_id needed for postcodes
-                    cur.execute("""INSERT INTO word (word_token, type, info)
-                                   (SELECT %s, 'P', json_build_object('postcode', pc)
-                                    FROM (VALUES (%s)) as v(pc)
+                    cur.execute("""INSERT INTO word (word_token, type, word)
+                                   (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
                                     WHERE NOT EXISTS
                                      (SELECT * FROM word
-                                      WHERE type = 'P' and info->>'postcode' = pc))
+                                      WHERE type = 'P' and word = pc))
                                 """, (term, postcode))
                 self._cache.postcodes.add(postcode)
 
index be2789f326c49c47026bd4113d1aec45f0fa12fd..ac61fc67356aa8ab04274fe69bf9b28f0eddeffd 100644 (file)
@@ -266,22 +266,6 @@ def check_location_postcode(context):
 
             db_row.assert_row(row, ('country', 'postcode'))
 
-@then("word contains(?P<exclude> not)?")
-def check_word_table(context, exclude):
-    """ Check the contents of the word table. Each row represents a table row
-        and all data must match. Data not present in the expected table, may
-        be arbitry. The rows are identified via all given columns.
-    """
-    with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
-        for row in context.table:
-            wheres = ' AND '.join(["{} = %s".format(h) for h in row.headings])
-            cur.execute("SELECT * from word WHERE " + wheres, list(row.cells))
-            if exclude:
-                assert cur.rowcount == 0, "Row still in word table: %s" % '/'.join(values)
-            else:
-                assert cur.rowcount > 0, "Row not in word table: %s" % '/'.join(values)
-
-
 @then("there are(?P<exclude> no)? word tokens for postcodes (?P<postcodes>.*)")
 def check_word_table_for_postcodes(context, exclude, postcodes):
     """ Check that the tokenizer produces postcode tokens for the given
@@ -297,8 +281,7 @@ def check_word_table_for_postcodes(context, exclude, postcodes):
 
     with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
         if nctx.tokenizer == 'legacy_icu':
-            cur.execute("""SELECT info->>'postcode' FROM word
-                           WHERE type = 'P' and info->>'postcode' = any(%s)""",
+            cur.execute("SELECT word FROM word WHERE type = 'P' and word = any(%s)",
                         (plist,))
         else:
             cur.execute("""SELECT word FROM word WHERE word = any(%s)
index 3d457d0bd99dbb073fc1923bf936ccb882bd4132..cde5e770990dd9655e9708d611ae4d070b5675ed 100644 (file)
@@ -12,16 +12,16 @@ class MockIcuWordTable:
             cur.execute("""CREATE TABLE word (word_id INTEGER,
                                               word_token text NOT NULL,
                                               type text NOT NULL,
+                                              word text,
                                               info jsonb)""")
 
         conn.commit()
 
     def add_special(self, word_token, word, cls, typ, oper):
         with self.conn.cursor() as cur:
-            cur.execute("""INSERT INTO word (word_token, type, info)
-                              VALUES (%s, 'S',
-                                      json_build_object('word', %s,
-                                                        'class', %s,
+            cur.execute("""INSERT INTO word (word_token, type, word, info)
+                              VALUES (%s, 'S', %s,
+                                      json_build_object('class', %s,
                                                         'type', %s,
                                                         'op', %s))
                         """, (word_token, word, cls, typ, oper))
@@ -30,16 +30,16 @@ class MockIcuWordTable:
 
     def add_country(self, country_code, word_token):
         with self.conn.cursor() as cur:
-            cur.execute("""INSERT INTO word (word_token, type, info)
-                           VALUES(%s, 'C', json_build_object('cc', %s))""",
+            cur.execute("""INSERT INTO word (word_token, type, word)
+                           VALUES(%s, 'C', %s)""",
                         (word_token, country_code))
         self.conn.commit()
 
 
     def add_postcode(self, word_token, postcode):
         with self.conn.cursor() as cur:
-            cur.execute("""INSERT INTO word (word_token, type, info)
-                              VALUES (%s, 'P', json_build_object('postcode', %s))
+            cur.execute("""INSERT INTO word (word_token, type, word)
+                              VALUES (%s, 'P', %s)
                         """, (word_token, postcode))
         self.conn.commit()
 
@@ -56,8 +56,8 @@ class MockIcuWordTable:
 
     def get_special(self):
         with self.conn.cursor() as cur:
-            cur.execute("SELECT word_token, info FROM word WHERE type = 'S'")
-            result = set(((row[0], row[1]['word'], row[1]['class'],
+            cur.execute("SELECT word_token, info, word FROM word WHERE type = 'S'")
+            result = set(((row[0], row[2], row[1]['class'],
                            row[1]['type'], row[1]['op']) for row in cur))
             assert len(result) == cur.rowcount, "Word table has duplicates."
             return result
@@ -65,7 +65,7 @@ class MockIcuWordTable:
 
     def get_country(self):
         with self.conn.cursor() as cur:
-            cur.execute("SELECT info->>'cc', word_token FROM word WHERE type = 'C'")
+            cur.execute("SELECT word, word_token FROM word WHERE type = 'C'")
             result = set((tuple(row) for row in cur))
             assert len(result) == cur.rowcount, "Word table has duplicates."
             return result
@@ -73,7 +73,7 @@ class MockIcuWordTable:
 
     def get_postcodes(self):
         with self.conn.cursor() as cur:
-            cur.execute("SELECT info->>'postcode' FROM word WHERE type = 'P'")
+            cur.execute("SELECT word FROM word WHERE type = 'P'")
             return set((row[0] for row in cur))