From: Sarah Hoffmann
Date: Thu, 23 Sep 2021 14:57:24 +0000 (+0200)
Subject: icu tokenizer: switch to matching against partial names
X-Git-Tag: v4.0.0~25^2~2
X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/bd7c7ddad04627d2bf402ebf20d6a8413a331320

icu tokenizer: switch to matching against partial names

When matching address parts from addr:* tags against place names, the
address names were so far converted to full names and those were then
compared to the place names. This can become problematic with the new
ICU tokenizer once we introduce the creation of different variants
depending on the place name context: it wouldn't be clear which variant
to produce to get a match, so we would have to create all of them.

To work around this issue, switch to using the partial terms for
matching. This introduces a larger fuzziness between matches, but that
shouldn't be a problem because matching is always geographically
restricted.

The search terms created for address parts have a different problem:
they are already created before we even know whether they are going to
be used. This can lead to spurious entries in the word table, which
slows down searching. This problem can also be circumvented by using
only partial terms for the search terms. In terms of searching this
means that the address terms do not get the full-word boost, but given
that an address part that does not exist as an OSM object should be the
exception, this is likely acceptable.
---
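Note: the heart of this patch, in the token_matches_* functions below,
is the switch from the array overlap operator (&&) to the array
containment operator (<@): a name now matches only when all of its
partial tokens appear among the candidate's tokens, not when any single
token overlaps. A minimal psql sketch with made-up token ids (an
illustration, not part of the patch):

    SELECT ARRAY[100,200] && ARRAY[200,300];  -- true: one shared token suffices
    SELECT ARRAY[100,200] <@ ARRAY[200,300];  -- false: token 100 is not contained
    SELECT ARRAY[200]     <@ ARRAY[200,300];  -- true: every left-hand token contained

This is the "larger fuzziness" the message above refers to: matching on
partial terms is variant-independent, so no full-name variants need to
be produced just to get a match.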
diff --git a/lib-sql/tokenizer/icu_tokenizer.sql b/lib-sql/tokenizer/icu_tokenizer.sql
index 230cb2ea..6092319a 100644
--- a/lib-sql/tokenizer/icu_tokenizer.sql
+++ b/lib-sql/tokenizer/icu_tokenizer.sql
@@ -44,28 +44,28 @@ $$ LANGUAGE SQL IMMUTABLE;
 CREATE OR REPLACE FUNCTION token_has_addr_place(info JSONB)
   RETURNS BOOLEAN
 AS $$
-  SELECT info->>'place_match' is not null;
+  SELECT info->>'place' is not null;
 $$ LANGUAGE SQL IMMUTABLE;


 CREATE OR REPLACE FUNCTION token_matches_street(info JSONB, street_tokens INTEGER[])
   RETURNS BOOLEAN
 AS $$
-  SELECT (info->>'street')::INTEGER[] && street_tokens
+  SELECT (info->>'street')::INTEGER[] <@ street_tokens
 $$ LANGUAGE SQL IMMUTABLE STRICT;


 CREATE OR REPLACE FUNCTION token_matches_place(info JSONB, place_tokens INTEGER[])
   RETURNS BOOLEAN
 AS $$
-  SELECT (info->>'place_match')::INTEGER[] && place_tokens
+  SELECT (info->>'place')::INTEGER[] <@ place_tokens
 $$ LANGUAGE SQL IMMUTABLE STRICT;


 CREATE OR REPLACE FUNCTION token_addr_place_search_tokens(info JSONB)
   RETURNS INTEGER[]
 AS $$
-  SELECT (info->>'place_search')::INTEGER[]
+  SELECT (info->>'place')::INTEGER[]
 $$ LANGUAGE SQL IMMUTABLE STRICT;

@@ -79,14 +79,14 @@ $$ LANGUAGE SQL IMMUTABLE STRICT;
 CREATE OR REPLACE FUNCTION token_get_address_search_tokens(info JSONB, key TEXT)
   RETURNS INTEGER[]
 AS $$
-  SELECT (info->'addr'->key->>0)::INTEGER[];
+  SELECT (info->'addr'->>key)::INTEGER[];
 $$ LANGUAGE SQL IMMUTABLE STRICT;


 CREATE OR REPLACE FUNCTION token_matches_address(info JSONB, key TEXT, tokens INTEGER[])
   RETURNS BOOLEAN
 AS $$
-  SELECT (info->'addr'->key->>1)::INTEGER[] && tokens;
+  SELECT (info->'addr'->>key)::INTEGER[] <@ tokens;
 $$ LANGUAGE SQL IMMUTABLE STRICT;

@@ -146,15 +146,34 @@ BEGIN
         VALUES (term_id, term, 'w', json_build_object('count', term_count));
     END IF;

-    IF term_count < {{ max_word_freq }} THEN
-      partial_tokens := array_merge(partial_tokens, ARRAY[term_id]);
-    END IF;
+    partial_tokens := array_merge(partial_tokens, ARRAY[term_id]);
   END LOOP;
 END;
 $$
 LANGUAGE plpgsql;


+CREATE OR REPLACE FUNCTION getorcreate_partial_word(partial TEXT)
+  RETURNS INTEGER
+  AS $$
+DECLARE
+  token INTEGER;
+BEGIN
+  SELECT min(word_id) INTO token
+    FROM word WHERE word_token = partial and type = 'w';
+
+  IF token IS NULL THEN
+    token := nextval('seq_word');
+    INSERT INTO word (word_id, word_token, type, info)
+      VALUES (token, partial, 'w', json_build_object('count', 0));
+  END IF;
+
+  RETURN token;
+END;
+$$
+LANGUAGE plpgsql;
+
+
 CREATE OR REPLACE FUNCTION getorcreate_hnr_id(lookup_term TEXT)
   RETURNS INTEGER
   AS $$
diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py
index 61263678..22f5e78f 100644
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -17,7 +17,6 @@ from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
 from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

-DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
 DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

 LOG = logging.getLogger()
@@ -39,7 +38,6 @@ class LegacyICUTokenizer(AbstractTokenizer):
         self.data_dir = data_dir
         self.naming_rules = None
         self.term_normalization = None
-        self.max_word_frequency = None


     def init_new_db(self, config, init_db=True):
@@ -52,7 +50,6 @@ class LegacyICUTokenizer(AbstractTokenizer):
                                              config='TOKENIZER_CONFIG'))
         self.naming_rules = ICUNameProcessorRules(loader=loader)
         self.term_normalization = config.TERM_NORMALIZATION
-        self.max_word_frequency = config.MAX_WORD_FREQUENCY

         self._install_php(config.lib_dir.php)
         self._save_config(config)
@@ -68,7 +65,6 @@
         with connect(self.dsn) as conn:
             self.naming_rules = ICUNameProcessorRules(conn=conn)
             self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
-            self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)


     def finalize_import(self, _):
@@ -81,10 +77,8 @@
         """ Reimport the SQL functions for this tokenizer.
         """
         with connect(self.dsn) as conn:
-            max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
             sqlp = SQLPreprocessor(conn, config)
-            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql',
-                              max_word_freq=max_word_freq)
+            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')


     def check_database(self):
@@ -122,7 +116,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
         php_file = self.data_dir / "tokenizer.php"
         php_file.write_text(dedent(f"""\