X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/59fe74ddf6749d93c93e88b1aeff0eb59a8e03ec..9963261d8d572f7a0d88ef27f5d938f085c603ba:/lib-sql/tokenizer/icu_tokenizer.sql diff --git a/lib-sql/tokenizer/icu_tokenizer.sql b/lib-sql/tokenizer/icu_tokenizer.sql index 230cb2ea..599d0eb0 100644 --- a/lib-sql/tokenizer/icu_tokenizer.sql +++ b/lib-sql/tokenizer/icu_tokenizer.sql @@ -1,3 +1,10 @@ +-- SPDX-License-Identifier: GPL-2.0-only +-- +-- This file is part of Nominatim. (https://nominatim.org) +-- +-- Copyright (C) 2022 by the Nominatim developer community. +-- For a full list of authors see the git log. + -- Get tokens used for searching the given place. -- -- These are the tokens that will be saved in the search_name table. @@ -44,7 +51,7 @@ $$ LANGUAGE SQL IMMUTABLE; CREATE OR REPLACE FUNCTION token_has_addr_place(info JSONB) RETURNS BOOLEAN AS $$ - SELECT info->>'place_match' is not null; + SELECT info->>'place' is not null; $$ LANGUAGE SQL IMMUTABLE; @@ -58,14 +65,14 @@ $$ LANGUAGE SQL IMMUTABLE STRICT; CREATE OR REPLACE FUNCTION token_matches_place(info JSONB, place_tokens INTEGER[]) RETURNS BOOLEAN AS $$ - SELECT (info->>'place_match')::INTEGER[] && place_tokens + SELECT (info->>'place')::INTEGER[] <@ place_tokens $$ LANGUAGE SQL IMMUTABLE STRICT; CREATE OR REPLACE FUNCTION token_addr_place_search_tokens(info JSONB) RETURNS INTEGER[] AS $$ - SELECT (info->>'place_search')::INTEGER[] + SELECT (info->>'place')::INTEGER[] $$ LANGUAGE SQL IMMUTABLE STRICT; @@ -79,21 +86,21 @@ $$ LANGUAGE SQL IMMUTABLE STRICT; CREATE OR REPLACE FUNCTION token_get_address_search_tokens(info JSONB, key TEXT) RETURNS INTEGER[] AS $$ - SELECT (info->'addr'->key->>0)::INTEGER[]; + SELECT (info->'addr'->>key)::INTEGER[]; $$ LANGUAGE SQL IMMUTABLE STRICT; CREATE OR REPLACE FUNCTION token_matches_address(info JSONB, key TEXT, tokens INTEGER[]) RETURNS BOOLEAN AS $$ - SELECT (info->'addr'->key->>1)::INTEGER[] && tokens; + SELECT (info->'addr'->>key)::INTEGER[] <@ tokens; $$ LANGUAGE SQL IMMUTABLE STRICT; -CREATE OR REPLACE FUNCTION token_normalized_postcode(postcode TEXT) +CREATE OR REPLACE FUNCTION token_get_postcode(info JSONB) RETURNS TEXT AS $$ - SELECT CASE WHEN postcode SIMILAR TO '%(,|;)%' THEN NULL ELSE upper(trim(postcode))END; + SELECT info->>'postcode'; $$ LANGUAGE SQL IMMUTABLE STRICT; @@ -146,15 +153,34 @@ BEGIN VALUES (term_id, term, 'w', json_build_object('count', term_count)); END IF; - IF term_count < {{ max_word_freq }} THEN - partial_tokens := array_merge(partial_tokens, ARRAY[term_id]); - END IF; + partial_tokens := array_merge(partial_tokens, ARRAY[term_id]); END LOOP; END; $$ LANGUAGE plpgsql; +CREATE OR REPLACE FUNCTION getorcreate_partial_word(partial TEXT) + RETURNS INTEGER + AS $$ +DECLARE + token INTEGER; +BEGIN + SELECT min(word_id) INTO token + FROM word WHERE word_token = partial and type = 'w'; + + IF token IS NULL THEN + token := nextval('seq_word'); + INSERT INTO word (word_id, word_token, type, info) + VALUES (token, partial, 'w', json_build_object('count', 0)); + END IF; + + RETURN token; +END; +$$ +LANGUAGE plpgsql; + + CREATE OR REPLACE FUNCTION getorcreate_hnr_id(lookup_term TEXT) RETURNS INTEGER AS $$ @@ -174,3 +200,49 @@ BEGIN END; $$ LANGUAGE plpgsql; + + +CREATE OR REPLACE FUNCTION create_analyzed_hnr_id(norm_term TEXT, lookup_terms TEXT[]) + RETURNS INTEGER + AS $$ +DECLARE + return_id INTEGER; +BEGIN + SELECT min(word_id) INTO return_id + FROM word WHERE word = norm_term and type = 'H'; + + IF return_id IS NULL THEN + return_id := nextval('seq_word'); + INSERT INTO word (word_id, word_token, type, word, info) + SELECT return_id, lookup_term, 'H', norm_term, + json_build_object('lookup', lookup_terms[1]) + FROM unnest(lookup_terms) as lookup_term; + END IF; + + RETURN return_id; +END; +$$ +LANGUAGE plpgsql; + +CREATE OR REPLACE FUNCTION create_postcode_word(postcode TEXT, lookup_terms TEXT[]) + RETURNS BOOLEAN + AS $$ +DECLARE + existing INTEGER; +BEGIN + SELECT count(*) INTO existing + FROM word WHERE word = postcode and type = 'P'; + + IF existing > 0 THEN + RETURN TRUE; + END IF; + + -- postcodes don't need word ids + INSERT INTO word (word_token, type, word) + SELECT lookup_term, 'P', postcode FROM unnest(lookup_terms) as lookup_term; + + RETURN FALSE; +END; +$$ +LANGUAGE plpgsql; +