From 5394b1fa1bd258745a9906c7df605310bc6cc021 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann <lonvia@denofr.de>
Date: Tue, 20 Jul 2021 12:11:12 +0200
Subject: [PATCH] switch postcode tokens to new word table layout

---
 lib-php/tokenizer/legacy_icu_tokenizer.php  | 12 ++++++++++-
 lib-sql/tokenizer/icu_tokenizer_tables.sql  |  7 +++++-
 nominatim/tokenizer/legacy_icu_tokenizer.py | 24 ++++++++-------------
 3 files changed, 26 insertions(+), 17 deletions(-)

diff --git a/lib-php/tokenizer/legacy_icu_tokenizer.php b/lib-php/tokenizer/legacy_icu_tokenizer.php
index b2fc27c7..2461a1fd 100644
--- a/lib-php/tokenizer/legacy_icu_tokenizer.php
+++ b/lib-php/tokenizer/legacy_icu_tokenizer.php
@@ -147,7 +147,7 @@ class Tokenizer
     {
         // Check which tokens we have, get the ID numbers
         $sSQL = 'SELECT word_id, word_token, type';
-        $sSQL .= "      info->>'cc' as country";
+        $sSQL .= "      info->>'cc' as country, info->>'postcode' as postcode";
         $sSQL .= ' FROM word WHERE word_token in (';
         $sSQL .= join(',', $this->oDB->getDBQuotedList($aTokens)).')';
 
@@ -171,6 +171,16 @@ class Tokenizer
                 'H':  // house number tokens
                     $oToken = new Token\HouseNumber($iId, $aWord['word_token']);
                     break;
+                'P':  // postcode tokens
+                    // Postcodes are not normalized, so they may have content
+                    // that makes SQL injection possible. Reject postcodes
+                    // that would need special escaping.
+                    if ($aWord['postcode'] === null
+                        || pg_escape_string($aWord['postcode']) == $aWord['postcode']
+                    ) {
+                       continue;
+                    }
+                    $oToken = new Token\Postcode($iId, $aWord['postcode'], null);
                 default:
                     continue;
             }
diff --git a/lib-sql/tokenizer/icu_tokenizer_tables.sql b/lib-sql/tokenizer/icu_tokenizer_tables.sql
index 1d70a9c3..dd3ac477 100644
--- a/lib-sql/tokenizer/icu_tokenizer_tables.sql
+++ b/lib-sql/tokenizer/icu_tokenizer_tables.sql
@@ -10,7 +10,12 @@ CREATE INDEX idx_word_word_token ON word
     USING BTREE (word_token) {{db.tablespace.search_index}};
 -- Used when updating country names from the boundary relation.
 CREATE INDEX idx_word_country_names ON word
-    USING btree((info->>'cc')) WHERE type = 'C';
+    USING btree((info->>'cc')) {{db.tablespace.address_index}}
+    WHERE type = 'C';
+-- Used when inserting new postcodes on updates.
+CREATE INDEX idx_word_postcodes ON word
+    USING btree((info->>'postcode')) {{db.tablespace.address_index}}
+    WHERE type = 'P'
 GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
 
 DROP SEQUENCE IF EXISTS seq_word;
diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py
index 9fbb9bb0..e0fd3a02 100644
--- a/nominatim/tokenizer/legacy_icu_tokenizer.py
+++ b/nominatim/tokenizer/legacy_icu_tokenizer.py
@@ -276,8 +276,7 @@ class LegacyICUNameAnalyzer:
                             (SELECT pc, word FROM
                               (SELECT distinct(postcode) as pc FROM location_postcode) p
                               FULL JOIN
-                              (SELECT word FROM word
-                                WHERE class ='place' and type = 'postcode') w
+                              (SELECT info->>'postcode' as word FROM word WHERE type = 'P') w
                               ON pc = word) x
                            WHERE pc is null or word is null""")
 
@@ -286,20 +285,16 @@ class LegacyICUNameAnalyzer:
                     if postcode is None:
                         to_delete.append(word)
                     else:
-                        copystr.add(
-                            postcode,
-                            ' ' + self.name_processor.get_search_normalized(postcode),
-                            'place', 'postcode', 0)
+                        copystr.add(self.name_processor.get_search_normalized(postcode),
+                                    'P', {'postcode': postcode})
 
                 if to_delete:
                     cur.execute("""DELETE FROM WORD
-                                   WHERE class ='place' and type = 'postcode'
-                                         and word = any(%s)
+                                   WHERE class ='P' and info->>'postcode' = any(%s)
                                 """, (to_delete, ))
 
                 copystr.copy_out(cur, 'word',
-                                 columns=['word', 'word_token', 'class', 'type',
-                                          'search_name_count'])
+                                 columns=['word_token', 'type', 'info'])
 
 
     def update_special_phrases(self, phrases, should_replace):
@@ -503,14 +498,13 @@ class LegacyICUNameAnalyzer:
 
                 with self.conn.cursor() as cur:
                     # no word_id needed for postcodes
-                    cur.execute("""INSERT INTO word (word, word_token, class, type,
-                                                     search_name_count)
-                                   (SELECT pc, %s, 'place', 'postcode', 0
+                    cur.execute("""INSERT INTO word (word_token, type, info)
+                                   (SELECT %s, 'P', json_build_object('postcode', pc)
                                     FROM (VALUES (%s)) as v(pc)
                                     WHERE NOT EXISTS
                                      (SELECT * FROM word
-                                      WHERE word = pc and class='place' and type='postcode'))
-                                """, (' ' + term, postcode))
+                                      WHERE type = 'P' and info->>postcode = pc))
+                                """, (term, postcode))
                 self._cache.postcodes.add(postcode)
 
 
-- 
2.39.5