X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/2ae293aeb629050f03b3f9b88b80c3c1e724df5e..70f154be8b69d3b57eebd25eff225ee29ccc97ba:/lib-sql/tokenizer/legacy_icu_tokenizer.sql diff --git a/lib-sql/tokenizer/legacy_icu_tokenizer.sql b/lib-sql/tokenizer/legacy_icu_tokenizer.sql index 8fd0ede4..f4258f82 100644 --- a/lib-sql/tokenizer/legacy_icu_tokenizer.sql +++ b/lib-sql/tokenizer/legacy_icu_tokenizer.sql @@ -87,25 +87,50 @@ $$ LANGUAGE SQL IMMUTABLE STRICT; --------------- private functions ---------------------------------------------- -CREATE OR REPLACE FUNCTION getorcreate_term_id(lookup_term TEXT) - RETURNS INTEGER +CREATE OR REPLACE FUNCTION getorcreate_full_word(norm_term TEXT, lookup_terms TEXT[], + OUT full_token INT, + OUT partial_tokens INT[]) AS $$ DECLARE - return_id INTEGER; + partial_terms TEXT[] = '{}'::TEXT[]; + term TEXT; + term_id INTEGER; term_count INTEGER; BEGIN - SELECT min(word_id), max(search_name_count) INTO return_id, term_count - FROM word WHERE word_token = lookup_term and class is null and type is null; - - IF return_id IS NULL THEN - return_id := nextval('seq_word'); - INSERT INTO word (word_id, word_token, search_name_count) - VALUES (return_id, lookup_term, 0); - ELSEIF left(lookup_term, 1) = ' ' and term_count > {{ max_word_freq }} THEN - return_id := 0; + SELECT min(word_id) INTO full_token + FROM word WHERE info->>'word' = norm_term and type = 'W'; + + IF full_token IS NULL THEN + full_token := nextval('seq_word'); + INSERT INTO word (word_id, word_token, info) + SELECT full_token, lookup_term, + json_build_object('word', norm_term, 'count', 0) + FROM unnest(lookup_terms) as lookup_term; END IF; - RETURN return_id; + FOR term IN SELECT unnest(string_to_array(unnest(lookup_terms), ' ')) LOOP + term := trim(term); + IF NOT (ARRAY[term] <@ partial_terms) THEN + partial_terms := partial_terms || term; + END IF; + END LOOP; + + partial_tokens := '{}'::INT[]; + FOR term IN SELECT unnest(partial_terms) LOOP + SELECT min(word_id), max(info->>'count') INTO term_id, term_count + FROM word WHERE word_token = term and type = 'w'; + + IF term_id IS NULL THEN + term_id := nextval('seq_word'); + term_count := 0; + INSERT INTO word (word_id, word_token, info) + VALUES (term_id, term, json_build_object('count', term_count)); + END IF; + + IF term_count < {{ max_word_freq }} THEN + partial_tokens := array_merge(partial_tokens, ARRAY[term_id]); + END IF; + END LOOP; END; $$ LANGUAGE plpgsql; @@ -117,15 +142,13 @@ CREATE OR REPLACE FUNCTION getorcreate_hnr_id(lookup_term TEXT) DECLARE return_id INTEGER; BEGIN - SELECT min(word_id) INTO return_id - FROM word - WHERE word_token = ' ' || lookup_term - and class = 'place' and type = 'house'; + SELECT min(word_id) INTO return_id FROM word + WHERE word_token = lookup_term and type = 'H'; IF return_id IS NULL THEN return_id := nextval('seq_word'); - INSERT INTO word (word_id, word_token, class, type, search_name_count) - VALUES (return_id, ' ' || lookup_term, 'place', 'house', 0); + INSERT INTO word (word_id, word_token, type) + VALUES (return_id, lookup_term, 'H'); END IF; RETURN return_id;