switch postcode tokens to new word table layout

author Sarah Hoffmann <lonvia@denofr.de>

Tue, 20 Jul 2021 10:11:12 +0000 (12:11 +0200)

committer Sarah Hoffmann <lonvia@denofr.de>

Wed, 28 Jul 2021 09:31:47 +0000 (11:31 +0200)
author Sarah Hoffmann <lonvia@denofr.de>
Tue, 20 Jul 2021 10:11:12 +0000 (12:11 +0200)
committer Sarah Hoffmann <lonvia@denofr.de>
Wed, 28 Jul 2021 09:31:47 +0000 (11:31 +0200)
diff --git a/lib-php/tokenizer/legacy_icu_tokenizer.php b/lib-php/tokenizer/legacy_icu_tokenizer.php

index b2fc27c715f26072e17a2c32b1721ba512b20f79..2461a1fd828698bb6786e16778b62c1b8b6077f2 100644 (file)
--- a/lib-php/tokenizer/legacy_icu_tokenizer.php
+++ b/lib-php/tokenizer/legacy_icu_tokenizer.php
@@ -147,7 +147,7 @@ class Tokenizer
      {
          // Check which tokens we have, get the ID numbers
          $sSQL = 'SELECT word_id, word_token, type';
-        $sSQL .= "      info->>'cc' as country";
+        $sSQL .= "      info->>'cc' as country, info->>'postcode' as postcode";
          $sSQL .= ' FROM word WHERE word_token in (';
          $sSQL .= join(',', $this->oDB->getDBQuotedList($aTokens)).')';
  
@@ -171,6 +171,16 @@ class Tokenizer
                  'H':  // house number tokens
                      $oToken = new Token\HouseNumber($iId, $aWord['word_token']);
                      break;
+                'P':  // postcode tokens
+                    // Postcodes are not normalized, so they may have content
+                    // that makes SQL injection possible. Reject postcodes
+                    // that would need special escaping.
+                    if ($aWord['postcode'] === null
+                        || pg_escape_string($aWord['postcode']) == $aWord['postcode']
+                    ) {
+                       continue;
+                    }
+                    $oToken = new Token\Postcode($iId, $aWord['postcode'], null);
                  default:
                      continue;
              }
diff --git a/lib-sql/tokenizer/icu_tokenizer_tables.sql b/lib-sql/tokenizer/icu_tokenizer_tables.sql

index 1d70a9c38e8b86b8600aad4dd809f1fe6a0eded9..dd3ac4779613d4cc1b69c093f8fbe78bd8587d03 100644 (file)
--- a/lib-sql/tokenizer/icu_tokenizer_tables.sql
+++ b/lib-sql/tokenizer/icu_tokenizer_tables.sql
@@ -10,7 +10,12 @@ CREATE INDEX idx_word_word_token ON word
      USING BTREE (word_token) {{db.tablespace.search_index}};
  -- Used when updating country names from the boundary relation.
  CREATE INDEX idx_word_country_names ON word
-    USING btree((info->>'cc')) WHERE type = 'C';
+    USING btree((info->>'cc')) {{db.tablespace.address_index}}
+    WHERE type = 'C';
+-- Used when inserting new postcodes on updates.
+CREATE INDEX idx_word_postcodes ON word
+    USING btree((info->>'postcode')) {{db.tablespace.address_index}}
+    WHERE type = 'P'
  GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
  
  DROP SEQUENCE IF EXISTS seq_word;
diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py

index 9fbb9bb09688cf349bfb28a74c84d640fcd84767..e0fd3a023d5453fc82838af563c01d9d4f759181 100644 (file)
--- a/nominatim/tokenizer/legacy_icu_tokenizer.py
+++ b/nominatim/tokenizer/legacy_icu_tokenizer.py
@@ -276,8 +276,7 @@ class LegacyICUNameAnalyzer:
                              (SELECT pc, word FROM
                                (SELECT distinct(postcode) as pc FROM location_postcode) p
                                FULL JOIN
-                              (SELECT word FROM word
-                                WHERE class ='place' and type = 'postcode') w
+                              (SELECT info->>'postcode' as word FROM word WHERE type = 'P') w
                                ON pc = word) x
                             WHERE pc is null or word is null""")
  
@@ -286,20 +285,16 @@ class LegacyICUNameAnalyzer:
                      if postcode is None:
                          to_delete.append(word)
                      else:
-                        copystr.add(
-                            postcode,
-                            ' ' + self.name_processor.get_search_normalized(postcode),
-                            'place', 'postcode', 0)
+                        copystr.add(self.name_processor.get_search_normalized(postcode),
+                                    'P', {'postcode': postcode})
  
                  if to_delete:
                      cur.execute("""DELETE FROM WORD
-                                   WHERE class ='place' and type = 'postcode'
-                                         and word = any(%s)
+                                   WHERE class ='P' and info->>'postcode' = any(%s)
                                  """, (to_delete, ))
  
                  copystr.copy_out(cur, 'word',
-                                 columns=['word', 'word_token', 'class', 'type',
-                                          'search_name_count'])
+                                 columns=['word_token', 'type', 'info'])
  
  
      def update_special_phrases(self, phrases, should_replace):
@@ -503,14 +498,13 @@ class LegacyICUNameAnalyzer:
  
                  with self.conn.cursor() as cur:
                      # no word_id needed for postcodes
-                    cur.execute("""INSERT INTO word (word, word_token, class, type,
-                                                     search_name_count)
-                                   (SELECT pc, %s, 'place', 'postcode', 0
+                    cur.execute("""INSERT INTO word (word_token, type, info)
+                                   (SELECT %s, 'P', json_build_object('postcode', pc)
                                      FROM (VALUES (%s)) as v(pc)
                                      WHERE NOT EXISTS
                                       (SELECT * FROM word
-                                      WHERE word = pc and class='place' and type='postcode'))
-                                """, (' ' + term, postcode))
+                                      WHERE type = 'P' and info->>postcode = pc))
+                                """, (term, postcode))
                  self._cache.postcodes.add(postcode)
author	Sarah Hoffmann <lonvia@denofr.de>
	Tue, 20 Jul 2021 10:11:12 +0000 (12:11 +0200)
committer	Sarah Hoffmann <lonvia@denofr.de>
	Wed, 28 Jul 2021 09:31:47 +0000 (11:31 +0200)
lib-php/tokenizer/legacy_icu_tokenizer.php		patch \| blob \| history
lib-sql/tokenizer/icu_tokenizer_tables.sql		patch \| blob \| history
nominatim/tokenizer/legacy_icu_tokenizer.py		patch \| blob \| history