From 59fe74ddf6749d93c93e88b1aeff0eb59a8e03ec Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 22 Sep 2021 22:20:02 +0200 Subject: [PATCH] move name matching into tokenizer module Instead of requesting the match tokens from the tokenizer when looking for parent streets/places and address parts, hand in the saved tokens and ask if they match. This gives the tokenizer more freedom to decide how name matching should be done. --- lib-sql/functions/interpolation.sql | 9 ++-- lib-sql/functions/partition-functions.sql | 18 +++++--- lib-sql/functions/placex_triggers.sql | 32 ++++++-------- lib-sql/functions/utils.sql | 31 +++++-------- lib-sql/tiger_import_start.sql | 9 +--- lib-sql/tokenizer/icu_tokenizer.sql | 53 +++++++++++++++-------- lib-sql/tokenizer/legacy_tokenizer.sql | 53 +++++++++++++++-------- 7 files changed, 115 insertions(+), 90 deletions(-) diff --git a/lib-sql/functions/interpolation.sql b/lib-sql/functions/interpolation.sql index 55e44dfd..4ef36f4f 100644 --- a/lib-sql/functions/interpolation.sql +++ b/lib-sql/functions/interpolation.sql @@ -43,7 +43,7 @@ LANGUAGE plpgsql STABLE; -- find the parent road of the cut road parts -CREATE OR REPLACE FUNCTION get_interpolation_parent(street INTEGER[], place INTEGER[], +CREATE OR REPLACE FUNCTION get_interpolation_parent(token_info JSONB, partition SMALLINT, centroid GEOMETRY, geom GEOMETRY) RETURNS BIGINT @@ -52,7 +52,7 @@ DECLARE parent_place_id BIGINT; location RECORD; BEGIN - parent_place_id := find_parent_for_address(street, place, partition, centroid); + parent_place_id := find_parent_for_address(token_info, partition, centroid); IF parent_place_id is null THEN FOR location IN SELECT place_id FROM placex @@ -155,9 +155,8 @@ BEGIN NEW.interpolationtype = NEW.address->'interpolation'; place_centroid := ST_PointOnSurface(NEW.linegeo); - NEW.parent_place_id = get_interpolation_parent(token_addr_street_match_tokens(NEW.token_info), - token_addr_place_match_tokens(NEW.token_info), - NEW.partition, place_centroid, NEW.linegeo); + NEW.parent_place_id = get_interpolation_parent(NEW.token_info, NEW.partition, + place_centroid, NEW.linegeo); interpol_postcode := token_normalized_postcode(NEW.address->'postcode'); diff --git a/lib-sql/functions/partition-functions.sql b/lib-sql/functions/partition-functions.sql index 53aba22c..97afec15 100644 --- a/lib-sql/functions/partition-functions.sql +++ b/lib-sql/functions/partition-functions.sql @@ -66,7 +66,7 @@ LANGUAGE plpgsql STABLE; CREATE OR REPLACE FUNCTION get_address_place(in_partition SMALLINT, feature GEOMETRY, from_rank SMALLINT, to_rank SMALLINT, - extent FLOAT, tokens INT[]) + extent FLOAT, token_info JSONB, key TEXT) RETURNS nearfeaturecentr AS $$ DECLARE @@ -80,7 +80,7 @@ BEGIN FROM location_area_large_{{ partition }} WHERE geometry && ST_Expand(feature, extent) AND rank_address between from_rank and to_rank - AND tokens && keywords + AND token_matches_address(token_info, key, keywords) GROUP BY place_id, keywords, rank_address, rank_search, isguess, postcode, centroid ORDER BY bool_or(ST_Intersects(geometry, feature)), distance LIMIT 1; RETURN r; @@ -148,18 +148,21 @@ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION getNearestNamedRoadPlaceId(in_partition INTEGER, point GEOMETRY, - isin_token INTEGER[]) + token_info JSONB) RETURNS BIGINT AS $$ DECLARE parent BIGINT; BEGIN + IF not token_has_addr_street(token_info) THEN + RETURN NULL; + END IF; {% for partition in db.partitions %} IF in_partition = {{ partition }} THEN SELECT place_id FROM search_name_{{ partition }} INTO parent - WHERE name_vector && isin_token + WHERE token_matches_street(token_info, name_vector) AND centroid && ST_Expand(point, 0.015) AND address_rank between 26 and 27 ORDER BY ST_Distance(centroid, point) ASC limit 1; @@ -174,19 +177,22 @@ LANGUAGE plpgsql STABLE; CREATE OR REPLACE FUNCTION getNearestNamedPlacePlaceId(in_partition INTEGER, point GEOMETRY, - isin_token INTEGER[]) + token_info JSONB) RETURNS BIGINT AS $$ DECLARE parent BIGINT; BEGIN + IF not token_has_addr_place(token_info) THEN + RETURN NULL; + END IF; {% for partition in db.partitions %} IF in_partition = {{ partition }} THEN SELECT place_id INTO parent FROM search_name_{{ partition }} - WHERE name_vector && isin_token + WHERE token_matches_place(token_info, name_vector) AND centroid && ST_Expand(point, 0.04) AND address_rank between 16 and 25 ORDER BY ST_Distance(centroid, point) ASC limit 1; diff --git a/lib-sql/functions/placex_triggers.sql b/lib-sql/functions/placex_triggers.sql index fa7156ec..9c2a67a1 100644 --- a/lib-sql/functions/placex_triggers.sql +++ b/lib-sql/functions/placex_triggers.sql @@ -104,8 +104,7 @@ CREATE OR REPLACE FUNCTION find_parent_for_poi(poi_osm_type CHAR(1), poi_osm_id BIGINT, poi_partition SMALLINT, bbox GEOMETRY, - addr_street INTEGER[], - addr_place INTEGER[], + token_info JSONB, is_place_addr BOOLEAN) RETURNS BIGINT AS $$ @@ -119,8 +118,7 @@ BEGIN parent_place_id := find_associated_street(poi_osm_type, poi_osm_id); IF parent_place_id is null THEN - parent_place_id := find_parent_for_address(addr_street, addr_place, - poi_partition, bbox); + parent_place_id := find_parent_for_address(token_info, poi_partition, bbox); END IF; IF parent_place_id is null and poi_osm_type = 'N' THEN @@ -333,13 +331,14 @@ BEGIN WHERE s.place_id = parent_place_id; FOR addr_item IN - SELECT (get_addr_tag_rank(key, country)).*, match_tokens, search_tokens - FROM token_get_address_tokens(token_info) - WHERE not search_tokens <@ parent_address_vector + SELECT (get_addr_tag_rank(key, country)).*, key, + token_get_address_search_tokens(token_info, key) as search_tokens + FROM token_get_address_keys(token_info) as key + WHERE not token_get_address_search_tokens(token_info, key) <@ parent_address_vector LOOP addr_place := get_address_place(in_partition, geometry, addr_item.from_rank, addr_item.to_rank, - addr_item.extent, addr_item.match_tokens); + addr_item.extent, token_info, addr_item.key); IF addr_place is null THEN -- No place found in OSM that matches. Make it at least searchable. @@ -447,14 +446,16 @@ BEGIN FOR location IN SELECT (get_address_place(partition, geometry, from_rank, to_rank, - extent, match_tokens)).*, search_tokens - FROM (SELECT (get_addr_tag_rank(key, country)).*, match_tokens, search_tokens - FROM token_get_address_tokens(token_info)) x + extent, token_info, key)).*, key + FROM (SELECT (get_addr_tag_rank(key, country)).*, key + FROM token_get_address_keys(token_info) as key) x ORDER BY rank_address, distance, isguess desc LOOP IF location.place_id is null THEN {% if not db.reverse_only %} - nameaddress_vector := array_merge(nameaddress_vector, location.search_tokens); + nameaddress_vector := array_merge(nameaddress_vector, + token_get_address_search_tokens(token_info, + location.key)); {% endif %} ELSE {% if not db.reverse_only %} @@ -689,9 +690,6 @@ DECLARE parent_address_level SMALLINT; place_address_level SMALLINT; - addr_street INTEGER[]; - addr_place INTEGER[]; - max_rank SMALLINT; name_vector INTEGER[]; @@ -860,8 +858,6 @@ BEGIN END IF; NEW.housenumber := token_normalized_housenumber(NEW.token_info); - addr_street := token_addr_street_match_tokens(NEW.token_info); - addr_place := token_addr_place_match_tokens(NEW.token_info); NEW.postcode := null; @@ -907,7 +903,7 @@ BEGIN NEW.parent_place_id := find_parent_for_poi(NEW.osm_type, NEW.osm_id, NEW.partition, ST_Envelope(NEW.geometry), - addr_street, addr_place, + NEW.token_info, is_place_address); -- If we found the road take a shortcut here. diff --git a/lib-sql/functions/utils.sql b/lib-sql/functions/utils.sql index c308d025..f7d2093c 100644 --- a/lib-sql/functions/utils.sql +++ b/lib-sql/functions/utils.sql @@ -215,13 +215,12 @@ LANGUAGE plpgsql STABLE; -- Find the parent of an address with addr:street/addr:place tag. -- --- \param street Value of addr:street or NULL if tag is missing. --- \param place Value of addr:place or NULL if tag is missing. +-- \param token_info Naming info with the address information. -- \param partition Partition where to search the parent. -- \param centroid Location of the address. -- -- \return Place ID of the parent if one was found, NULL otherwise. -CREATE OR REPLACE FUNCTION find_parent_for_address(street INTEGER[], place INTEGER[], +CREATE OR REPLACE FUNCTION find_parent_for_address(token_info JSONB, partition SMALLINT, centroid GEOMETRY) RETURNS BIGINT @@ -229,30 +228,22 @@ CREATE OR REPLACE FUNCTION find_parent_for_address(street INTEGER[], place INTEG DECLARE parent_place_id BIGINT; BEGIN - IF street is not null THEN - -- Check for addr:street attributes - -- Note that addr:street links can only be indexed, once the street itself is indexed - parent_place_id := getNearestNamedRoadPlaceId(partition, centroid, street); - IF parent_place_id is not null THEN - {% if debug %}RAISE WARNING 'Get parent form addr:street: %', parent_place_id;{% endif %} - RETURN parent_place_id; - END IF; + -- Check for addr:street attributes + parent_place_id := getNearestNamedRoadPlaceId(partition, centroid, token_info); + IF parent_place_id is not null THEN + {% if debug %}RAISE WARNING 'Get parent from addr:street: %', parent_place_id;{% endif %} + RETURN parent_place_id; END IF; -- Check for addr:place attributes. - IF place is not null THEN - parent_place_id := getNearestNamedPlacePlaceId(partition, centroid, place); - IF parent_place_id is not null THEN - {% if debug %}RAISE WARNING 'Get parent form addr:place: %', parent_place_id;{% endif %} - RETURN parent_place_id; - END IF; - END IF; - - RETURN NULL; + parent_place_id := getNearestNamedPlacePlaceId(partition, centroid, token_info); + {% if debug %}RAISE WARNING 'Get parent from addr:place: %', parent_place_id;{% endif %} + RETURN parent_place_id; END; $$ LANGUAGE plpgsql STABLE; + CREATE OR REPLACE FUNCTION delete_location(OLD_place_id BIGINT) RETURNS BOOLEAN AS $$ diff --git a/lib-sql/tiger_import_start.sql b/lib-sql/tiger_import_start.sql index faa4efbb..f344e174 100644 --- a/lib-sql/tiger_import_start.sql +++ b/lib-sql/tiger_import_start.sql @@ -14,7 +14,6 @@ DECLARE out_partition INTEGER; out_parent_place_id BIGINT; location RECORD; - address_street_word_ids INTEGER[]; BEGIN @@ -54,13 +53,9 @@ BEGIN place_centroid := ST_Centroid(linegeo); out_partition := get_partition('us'); - out_parent_place_id := null; - address_street_word_ids := token_addr_street_match_tokens(token_info); - IF address_street_word_ids IS NOT NULL THEN - out_parent_place_id := getNearestNamedRoadPlaceId(out_partition, place_centroid, - address_street_word_ids); - END IF; + out_parent_place_id := getNearestNamedRoadPlaceId(out_partition, place_centroid, + token_info); IF out_parent_place_id IS NULL THEN SELECT getNearestParallelRoadFeature(out_partition, linegeo) diff --git a/lib-sql/tokenizer/icu_tokenizer.sql b/lib-sql/tokenizer/icu_tokenizer.sql index ffe6648c..230cb2ea 100644 --- a/lib-sql/tokenizer/icu_tokenizer.sql +++ b/lib-sql/tokenizer/icu_tokenizer.sql @@ -34,17 +34,31 @@ AS $$ $$ LANGUAGE SQL IMMUTABLE STRICT; -CREATE OR REPLACE FUNCTION token_addr_street_match_tokens(info JSONB) - RETURNS INTEGER[] +CREATE OR REPLACE FUNCTION token_has_addr_street(info JSONB) + RETURNS BOOLEAN +AS $$ + SELECT info->>'street' is not null; +$$ LANGUAGE SQL IMMUTABLE; + + +CREATE OR REPLACE FUNCTION token_has_addr_place(info JSONB) + RETURNS BOOLEAN AS $$ - SELECT (info->>'street')::INTEGER[] + SELECT info->>'place_match' is not null; +$$ LANGUAGE SQL IMMUTABLE; + + +CREATE OR REPLACE FUNCTION token_matches_street(info JSONB, street_tokens INTEGER[]) + RETURNS BOOLEAN +AS $$ + SELECT (info->>'street')::INTEGER[] && street_tokens $$ LANGUAGE SQL IMMUTABLE STRICT; -CREATE OR REPLACE FUNCTION token_addr_place_match_tokens(info JSONB) - RETURNS INTEGER[] +CREATE OR REPLACE FUNCTION token_matches_place(info JSONB, place_tokens INTEGER[]) + RETURNS BOOLEAN AS $$ - SELECT (info->>'place_match')::INTEGER[] + SELECT (info->>'place_match')::INTEGER[] && place_tokens $$ LANGUAGE SQL IMMUTABLE STRICT; @@ -55,19 +69,24 @@ AS $$ $$ LANGUAGE SQL IMMUTABLE STRICT; -DROP TYPE IF EXISTS token_addresstoken CASCADE; -CREATE TYPE token_addresstoken AS ( - key TEXT, - match_tokens INT[], - search_tokens INT[] -); +CREATE OR REPLACE FUNCTION token_get_address_keys(info JSONB) + RETURNS SETOF TEXT +AS $$ + SELECT * FROM jsonb_object_keys(info->'addr'); +$$ LANGUAGE SQL IMMUTABLE STRICT; + + +CREATE OR REPLACE FUNCTION token_get_address_search_tokens(info JSONB, key TEXT) + RETURNS INTEGER[] +AS $$ + SELECT (info->'addr'->key->>0)::INTEGER[]; +$$ LANGUAGE SQL IMMUTABLE STRICT; + -CREATE OR REPLACE FUNCTION token_get_address_tokens(info JSONB) - RETURNS SETOF token_addresstoken +CREATE OR REPLACE FUNCTION token_matches_address(info JSONB, key TEXT, tokens INTEGER[]) + RETURNS BOOLEAN AS $$ - SELECT key, (value->>1)::int[] as match_tokens, - (value->>0)::int[] as search_tokens - FROM jsonb_each(info->'addr'); + SELECT (info->'addr'->key->>1)::INTEGER[] && tokens; $$ LANGUAGE SQL IMMUTABLE STRICT; diff --git a/lib-sql/tokenizer/legacy_tokenizer.sql b/lib-sql/tokenizer/legacy_tokenizer.sql index a2c6b520..2b734e6f 100644 --- a/lib-sql/tokenizer/legacy_tokenizer.sql +++ b/lib-sql/tokenizer/legacy_tokenizer.sql @@ -34,17 +34,31 @@ AS $$ $$ LANGUAGE SQL IMMUTABLE STRICT; -CREATE OR REPLACE FUNCTION token_addr_street_match_tokens(info JSONB) - RETURNS INTEGER[] +CREATE OR REPLACE FUNCTION token_has_addr_street(info JSONB) + RETURNS BOOLEAN +AS $$ + SELECT info->>'street' is not null; +$$ LANGUAGE SQL IMMUTABLE; + + +CREATE OR REPLACE FUNCTION token_has_addr_place(info JSONB) + RETURNS BOOLEAN AS $$ - SELECT (info->>'street')::INTEGER[] + SELECT info->>'place_match' is not null; +$$ LANGUAGE SQL IMMUTABLE; + + +CREATE OR REPLACE FUNCTION token_matches_street(info JSONB, street_tokens INTEGER[]) + RETURNS BOOLEAN +AS $$ + SELECT (info->>'street')::INTEGER[] && street_tokens $$ LANGUAGE SQL IMMUTABLE STRICT; -CREATE OR REPLACE FUNCTION token_addr_place_match_tokens(info JSONB) - RETURNS INTEGER[] +CREATE OR REPLACE FUNCTION token_matches_place(info JSONB, place_tokens INTEGER[]) + RETURNS BOOLEAN AS $$ - SELECT (info->>'place_match')::INTEGER[] + SELECT (info->>'place_match')::INTEGER[] && place_tokens $$ LANGUAGE SQL IMMUTABLE STRICT; @@ -55,19 +69,24 @@ AS $$ $$ LANGUAGE SQL IMMUTABLE STRICT; -DROP TYPE IF EXISTS token_addresstoken CASCADE; -CREATE TYPE token_addresstoken AS ( - key TEXT, - match_tokens INT[], - search_tokens INT[] -); +CREATE OR REPLACE FUNCTION token_get_address_keys(info JSONB) + RETURNS SETOF TEXT +AS $$ + SELECT * FROM jsonb_object_keys(info->'addr'); +$$ LANGUAGE SQL IMMUTABLE STRICT; + + +CREATE OR REPLACE FUNCTION token_get_address_search_tokens(info JSONB, key TEXT) + RETURNS INTEGER[] +AS $$ + SELECT (info->'addr'->key->>0)::INTEGER[]; +$$ LANGUAGE SQL IMMUTABLE STRICT; -CREATE OR REPLACE FUNCTION token_get_address_tokens(info JSONB) - RETURNS SETOF token_addresstoken + +CREATE OR REPLACE FUNCTION token_matches_address(info JSONB, key TEXT, tokens INTEGER[]) + RETURNS BOOLEAN AS $$ - SELECT key, (value->>1)::int[] as match_tokens, - (value->>0)::int[] as search_tokens - FROM jsonb_each(info->'addr'); + SELECT (info->'addr'->key->>1)::INTEGER[] && tokens; $$ LANGUAGE SQL IMMUTABLE STRICT; -- 2.39.5