From 70f154be8b69d3b57eebd25eff225ee29ccc97ba Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 21 Jul 2021 10:41:38 +0200 Subject: [PATCH] switch word tokens to new word table layout --- lib-php/tokenizer/legacy_icu_tokenizer.php | 70 +++++---------------- lib-sql/tokenizer/icu_tokenizer_tables.sql | 7 ++- lib-sql/tokenizer/legacy_icu_tokenizer.sql | 16 ++--- nominatim/tokenizer/legacy_icu_tokenizer.py | 51 +++++++-------- 4 files changed, 58 insertions(+), 86 deletions(-) diff --git a/lib-php/tokenizer/legacy_icu_tokenizer.php b/lib-php/tokenizer/legacy_icu_tokenizer.php index 70358976..314bd27e 100644 --- a/lib-php/tokenizer/legacy_icu_tokenizer.php +++ b/lib-php/tokenizer/legacy_icu_tokenizer.php @@ -148,8 +148,9 @@ class Tokenizer // Check which tokens we have, get the ID numbers $sSQL = 'SELECT word_id, word_token, type'; $sSQL .= " info->>'cc' as country, info->>'postcode' as postcode,"; - $sSQL .= " info->>'word' as word, info->>'op' as operator,"; - $sSQL .= " info->>'class' as class, info->>'type' as type"; + $sSQL .= " info->>'op' as operator,"; + $sSQL .= " info->>'class' as class, info->>'type' as type,"; + $sSQL .= " info->>'count' as count"; $sSQL .= ' FROM word WHERE word_token in ('; $sSQL .= join(',', $this->oDB->getDBQuotedList($aTokens)).')'; @@ -190,8 +191,6 @@ class Tokenizer break; 'S': // tokens for classification terms (special phrases) if ($aWord['class'] === null || $aWord['type'] === null - || $aWord['word'] === null - || strpos($sNormQuery, $aWord['word']) === false ) { continue; } @@ -202,58 +201,23 @@ class Tokenizer $aWord['op'] ? Operator::NEAR : Operator::NONE ); break; + 'W': // full-word tokens + $oToken = new Token\Word( + $iId, + (int) $aWord['count'], + substr_count($aWord['word_token'], ' ') + ); + break; + 'w': // partial word terms + $oToken = new Token\Partial( + $iId, + $aWord['word_token'], + (int) $aWord['count'] + ); + break; default: continue; } -/* if ($aWord['class']) { - // Special terms need to appear in their normalized form. - // (postcodes are not normalized in the word table) - $sNormWord = $this->normalizeString($aWord['word']); - if ($aWord['word'] && strpos($sNormQuery, $sNormWord) === false) { - continue; - } - - if ($aWord['class'] == 'place' && $aWord['type'] == 'house') { - $oToken = new Token\HouseNumber($iId, trim($aWord['word_token'])); - } elseif ($aWord['class'] == 'place' && $aWord['type'] == 'postcode') { - if ($aWord['word'] - && pg_escape_string($aWord['word']) == $aWord['word'] - ) { - $oToken = new Token\Postcode( - $iId, - $aWord['word'], - $aWord['country_code'] - ); - } - } else { - // near and in operator the same at the moment - $oToken = new Token\SpecialTerm( - $iId, - $aWord['class'], - $aWord['type'], - $aWord['operator'] ? Operator::NEAR : Operator::NONE - ); - } - } elseif ($aWord['country_code']) { - // Filter country tokens that do not match restricted countries. - if (!$this->aCountryRestriction - || in_array($aWord['country_code'], $this->aCountryRestriction) - ) { - $oToken = new Token\Country($iId, $aWord['country_code']); - } - } elseif ($aWord['word_token'][0] == ' ') { - $oToken = new Token\Word( - $iId, - (int) $aWord['count'], - substr_count($aWord['word_token'], ' ') - ); - } else { - $oToken = new Token\Partial( - $iId, - $aWord['word_token'], - (int) $aWord['count'] - ); - }*/ $oValidTokens->addToken($aWord['word_token'], $oToken); } diff --git a/lib-sql/tokenizer/icu_tokenizer_tables.sql b/lib-sql/tokenizer/icu_tokenizer_tables.sql index dd3ac477..13b12797 100644 --- a/lib-sql/tokenizer/icu_tokenizer_tables.sql +++ b/lib-sql/tokenizer/icu_tokenizer_tables.sql @@ -15,7 +15,12 @@ CREATE INDEX idx_word_country_names ON word -- Used when inserting new postcodes on updates. CREATE INDEX idx_word_postcodes ON word USING btree((info->>'postcode')) {{db.tablespace.address_index}} - WHERE type = 'P' + WHERE type = 'P'; +-- Used when inserting full words. +CREATE INDEX idx_word_full_word ON word + USING btree((info->>'word')) {{db.tablespace.address_index}} + WHERE type = 'W'; + GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}"; DROP SEQUENCE IF EXISTS seq_word; diff --git a/lib-sql/tokenizer/legacy_icu_tokenizer.sql b/lib-sql/tokenizer/legacy_icu_tokenizer.sql index e9dcf4bc..f4258f82 100644 --- a/lib-sql/tokenizer/legacy_icu_tokenizer.sql +++ b/lib-sql/tokenizer/legacy_icu_tokenizer.sql @@ -98,12 +98,14 @@ DECLARE term_count INTEGER; BEGIN SELECT min(word_id) INTO full_token - FROM word WHERE word = norm_term and class is null and country_code is null; + FROM word WHERE info->>'word' = norm_term and type = 'W'; IF full_token IS NULL THEN full_token := nextval('seq_word'); - INSERT INTO word (word_id, word_token, word, search_name_count) - SELECT full_token, ' ' || lookup_term, norm_term, 0 FROM unnest(lookup_terms) as lookup_term; + INSERT INTO word (word_id, word_token, info) + SELECT full_token, lookup_term, + json_build_object('word', norm_term, 'count', 0) + FROM unnest(lookup_terms) as lookup_term; END IF; FOR term IN SELECT unnest(string_to_array(unnest(lookup_terms), ' ')) LOOP @@ -115,14 +117,14 @@ BEGIN partial_tokens := '{}'::INT[]; FOR term IN SELECT unnest(partial_terms) LOOP - SELECT min(word_id), max(search_name_count) INTO term_id, term_count - FROM word WHERE word_token = term and class is null and country_code is null; + SELECT min(word_id), max(info->>'count') INTO term_id, term_count + FROM word WHERE word_token = term and type = 'w'; IF term_id IS NULL THEN term_id := nextval('seq_word'); term_count := 0; - INSERT INTO word (word_id, word_token, search_name_count) - VALUES (term_id, term, 0); + INSERT INTO word (word_id, word_token, info) + VALUES (term_id, term, json_build_object('count', term_count)); END IF; IF term_count < {{ max_word_freq }} THEN diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py index a645b598..14fa5b60 100644 --- a/nominatim/tokenizer/legacy_icu_tokenizer.py +++ b/nominatim/tokenizer/legacy_icu_tokenizer.py @@ -74,13 +74,11 @@ class LegacyICUTokenizer: self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ) - def finalize_import(self, config): + def finalize_import(self, _): """ Do any required postprocessing to make the tokenizer data ready for use. """ - with connect(self.dsn) as conn: - sqlp = SQLPreprocessor(conn, config) - sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql') + pass def update_sql_functions(self, config): @@ -121,18 +119,17 @@ class LegacyICUTokenizer: """ return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules)) - # pylint: disable=missing-format-attribute + def _install_php(self, phpdir): """ Install the php script for the tokenizer. """ php_file = self.data_dir / "tokenizer.php" - php_file.write_text(dedent("""\ + php_file.write_text(dedent(f"""\ >'postcode' = any(%s) + WHERE type ='P' and info->>'postcode' = any(%s) """, (to_delete, )) copystr.copy_out(cur, 'word', -- 2.39.5