From 5394b1fa1bd258745a9906c7df605310bc6cc021 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Tue, 20 Jul 2021 12:11:12 +0200 Subject: [PATCH] switch postcode tokens to new word table layout --- lib-php/tokenizer/legacy_icu_tokenizer.php | 12 ++++++++++- lib-sql/tokenizer/icu_tokenizer_tables.sql | 7 +++++- nominatim/tokenizer/legacy_icu_tokenizer.py | 24 ++++++++------------- 3 files changed, 26 insertions(+), 17 deletions(-) diff --git a/lib-php/tokenizer/legacy_icu_tokenizer.php b/lib-php/tokenizer/legacy_icu_tokenizer.php index b2fc27c7..2461a1fd 100644 --- a/lib-php/tokenizer/legacy_icu_tokenizer.php +++ b/lib-php/tokenizer/legacy_icu_tokenizer.php @@ -147,7 +147,7 @@ class Tokenizer { // Check which tokens we have, get the ID numbers $sSQL = 'SELECT word_id, word_token, type'; - $sSQL .= " info->>'cc' as country"; + $sSQL .= " info->>'cc' as country, info->>'postcode' as postcode"; $sSQL .= ' FROM word WHERE word_token in ('; $sSQL .= join(',', $this->oDB->getDBQuotedList($aTokens)).')'; @@ -171,6 +171,16 @@ class Tokenizer 'H': // house number tokens $oToken = new Token\HouseNumber($iId, $aWord['word_token']); break; + 'P': // postcode tokens + // Postcodes are not normalized, so they may have content + // that makes SQL injection possible. Reject postcodes + // that would need special escaping. + if ($aWord['postcode'] === null + || pg_escape_string($aWord['postcode']) == $aWord['postcode'] + ) { + continue; + } + $oToken = new Token\Postcode($iId, $aWord['postcode'], null); default: continue; } diff --git a/lib-sql/tokenizer/icu_tokenizer_tables.sql b/lib-sql/tokenizer/icu_tokenizer_tables.sql index 1d70a9c3..dd3ac477 100644 --- a/lib-sql/tokenizer/icu_tokenizer_tables.sql +++ b/lib-sql/tokenizer/icu_tokenizer_tables.sql @@ -10,7 +10,12 @@ CREATE INDEX idx_word_word_token ON word USING BTREE (word_token) {{db.tablespace.search_index}}; -- Used when updating country names from the boundary relation. CREATE INDEX idx_word_country_names ON word - USING btree((info->>'cc')) WHERE type = 'C'; + USING btree((info->>'cc')) {{db.tablespace.address_index}} + WHERE type = 'C'; +-- Used when inserting new postcodes on updates. +CREATE INDEX idx_word_postcodes ON word + USING btree((info->>'postcode')) {{db.tablespace.address_index}} + WHERE type = 'P' GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}"; DROP SEQUENCE IF EXISTS seq_word; diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py index 9fbb9bb0..e0fd3a02 100644 --- a/nominatim/tokenizer/legacy_icu_tokenizer.py +++ b/nominatim/tokenizer/legacy_icu_tokenizer.py @@ -276,8 +276,7 @@ class LegacyICUNameAnalyzer: (SELECT pc, word FROM (SELECT distinct(postcode) as pc FROM location_postcode) p FULL JOIN - (SELECT word FROM word - WHERE class ='place' and type = 'postcode') w + (SELECT info->>'postcode' as word FROM word WHERE type = 'P') w ON pc = word) x WHERE pc is null or word is null""") @@ -286,20 +285,16 @@ class LegacyICUNameAnalyzer: if postcode is None: to_delete.append(word) else: - copystr.add( - postcode, - ' ' + self.name_processor.get_search_normalized(postcode), - 'place', 'postcode', 0) + copystr.add(self.name_processor.get_search_normalized(postcode), + 'P', {'postcode': postcode}) if to_delete: cur.execute("""DELETE FROM WORD - WHERE class ='place' and type = 'postcode' - and word = any(%s) + WHERE class ='P' and info->>'postcode' = any(%s) """, (to_delete, )) copystr.copy_out(cur, 'word', - columns=['word', 'word_token', 'class', 'type', - 'search_name_count']) + columns=['word_token', 'type', 'info']) def update_special_phrases(self, phrases, should_replace): @@ -503,14 +498,13 @@ class LegacyICUNameAnalyzer: with self.conn.cursor() as cur: # no word_id needed for postcodes - cur.execute("""INSERT INTO word (word, word_token, class, type, - search_name_count) - (SELECT pc, %s, 'place', 'postcode', 0 + cur.execute("""INSERT INTO word (word_token, type, info) + (SELECT %s, 'P', json_build_object('postcode', pc) FROM (VALUES (%s)) as v(pc) WHERE NOT EXISTS (SELECT * FROM word - WHERE word = pc and class='place' and type='postcode')) - """, (' ' + term, postcode)) + WHERE type = 'P' and info->>postcode = pc)) + """, (term, postcode)) self._cache.postcodes.add(postcode) -- 2.39.5