]> git.openstreetmap.org Git - nominatim.git/commitdiff
switch postcode tokens to new word table layout
authorSarah Hoffmann <lonvia@denofr.de>
Tue, 20 Jul 2021 10:11:12 +0000 (12:11 +0200)
committerSarah Hoffmann <lonvia@denofr.de>
Wed, 28 Jul 2021 09:31:47 +0000 (11:31 +0200)
lib-php/tokenizer/legacy_icu_tokenizer.php
lib-sql/tokenizer/icu_tokenizer_tables.sql
nominatim/tokenizer/legacy_icu_tokenizer.py

index b2fc27c715f26072e17a2c32b1721ba512b20f79..2461a1fd828698bb6786e16778b62c1b8b6077f2 100644 (file)
@@ -147,7 +147,7 @@ class Tokenizer
     {
         // Check which tokens we have, get the ID numbers
         $sSQL = 'SELECT word_id, word_token, type';
-        $sSQL .= "      info->>'cc' as country";
+        $sSQL .= "      info->>'cc' as country, info->>'postcode' as postcode";
         $sSQL .= ' FROM word WHERE word_token in (';
         $sSQL .= join(',', $this->oDB->getDBQuotedList($aTokens)).')';
 
@@ -171,6 +171,16 @@ class Tokenizer
                 'H':  // house number tokens
                     $oToken = new Token\HouseNumber($iId, $aWord['word_token']);
                     break;
+                'P':  // postcode tokens
+                    // Postcodes are not normalized, so they may have content
+                    // that makes SQL injection possible. Reject postcodes
+                    // that would need special escaping.
+                    if ($aWord['postcode'] === null
+                        || pg_escape_string($aWord['postcode']) == $aWord['postcode']
+                    ) {
+                       continue;
+                    }
+                    $oToken = new Token\Postcode($iId, $aWord['postcode'], null);
                 default:
                     continue;
             }
index 1d70a9c38e8b86b8600aad4dd809f1fe6a0eded9..dd3ac4779613d4cc1b69c093f8fbe78bd8587d03 100644 (file)
@@ -10,7 +10,12 @@ CREATE INDEX idx_word_word_token ON word
     USING BTREE (word_token) {{db.tablespace.search_index}};
 -- Used when updating country names from the boundary relation.
 CREATE INDEX idx_word_country_names ON word
-    USING btree((info->>'cc')) WHERE type = 'C';
+    USING btree((info->>'cc')) {{db.tablespace.address_index}}
+    WHERE type = 'C';
+-- Used when inserting new postcodes on updates.
+CREATE INDEX idx_word_postcodes ON word
+    USING btree((info->>'postcode')) {{db.tablespace.address_index}}
+    WHERE type = 'P'
 GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
 
 DROP SEQUENCE IF EXISTS seq_word;
index 9fbb9bb09688cf349bfb28a74c84d640fcd84767..e0fd3a023d5453fc82838af563c01d9d4f759181 100644 (file)
@@ -276,8 +276,7 @@ class LegacyICUNameAnalyzer:
                             (SELECT pc, word FROM
                               (SELECT distinct(postcode) as pc FROM location_postcode) p
                               FULL JOIN
-                              (SELECT word FROM word
-                                WHERE class ='place' and type = 'postcode') w
+                              (SELECT info->>'postcode' as word FROM word WHERE type = 'P') w
                               ON pc = word) x
                            WHERE pc is null or word is null""")
 
@@ -286,20 +285,16 @@ class LegacyICUNameAnalyzer:
                     if postcode is None:
                         to_delete.append(word)
                     else:
-                        copystr.add(
-                            postcode,
-                            ' ' + self.name_processor.get_search_normalized(postcode),
-                            'place', 'postcode', 0)
+                        copystr.add(self.name_processor.get_search_normalized(postcode),
+                                    'P', {'postcode': postcode})
 
                 if to_delete:
                     cur.execute("""DELETE FROM WORD
-                                   WHERE class ='place' and type = 'postcode'
-                                         and word = any(%s)
+                                   WHERE class ='P' and info->>'postcode' = any(%s)
                                 """, (to_delete, ))
 
                 copystr.copy_out(cur, 'word',
-                                 columns=['word', 'word_token', 'class', 'type',
-                                          'search_name_count'])
+                                 columns=['word_token', 'type', 'info'])
 
 
     def update_special_phrases(self, phrases, should_replace):
@@ -503,14 +498,13 @@ class LegacyICUNameAnalyzer:
 
                 with self.conn.cursor() as cur:
                     # no word_id needed for postcodes
-                    cur.execute("""INSERT INTO word (word, word_token, class, type,
-                                                     search_name_count)
-                                   (SELECT pc, %s, 'place', 'postcode', 0
+                    cur.execute("""INSERT INTO word (word_token, type, info)
+                                   (SELECT %s, 'P', json_build_object('postcode', pc)
                                     FROM (VALUES (%s)) as v(pc)
                                     WHERE NOT EXISTS
                                      (SELECT * FROM word
-                                      WHERE word = pc and class='place' and type='postcode'))
-                                """, (' ' + term, postcode))
+                                      WHERE type = 'P' and info->>postcode = pc))
+                                """, (term, postcode))
                 self._cache.postcodes.add(postcode)