From: Sarah Hoffmann Date: Fri, 30 Jun 2023 09:08:25 +0000 (+0200) Subject: use information from tokenizer to determine street vs. place address X-Git-Tag: v4.3.0~59^2~2 X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/645ea5a057346a698f007c0b9e35a58c9ed11755 use information from tokenizer to determine street vs. place address So far the SQL logic used the information from the address field to determine if an address is attached to a street or place. This changes the logic to use the information provided in the token_info. This allows sanitizers to enforce a certain parenting without changing the visible address information. --- diff --git a/docs/develop/Tokenizers.md b/docs/develop/Tokenizers.md index 273e65e2..eb0d4ea2 100644 --- a/docs/develop/Tokenizers.md +++ b/docs/develop/Tokenizers.md @@ -189,6 +189,28 @@ a house number token text. If a place has multiple house numbers they must be listed with a semicolon as delimiter. Must be NULL when the place has no house numbers. +```sql +FUNCTION token_is_street_address(info JSONB) RETURNS BOOLEAN +``` + +Return true if this is an object that should be parented against a street. +Only relevant for objects with address rank 30. + +```sql +FUNCTION token_has_addr_street(info JSONB) RETURNS BOOLEAN +``` + +Return true if there are street names to match against for finding the +parent of the object. + + +```sql +FUNCTION token_has_addr_place(info JSONB) RETURNS BOOLEAN +``` + +Return true if there are place names to match against for finding the +parent of the object. + ```sql FUNCTION token_matches_street(info JSONB, street_tokens INTEGER[]) RETURNS BOOLEAN ``` diff --git a/lib-sql/functions/placex_triggers.sql b/lib-sql/functions/placex_triggers.sql index 99d2872f..a252e9aa 100644 --- a/lib-sql/functions/placex_triggers.sql +++ b/lib-sql/functions/placex_triggers.sql @@ -996,7 +996,7 @@ BEGIN {% if debug %}RAISE WARNING 'finding street for % %', NEW.osm_type, NEW.osm_id;{% endif %} NEW.parent_place_id := null; - is_place_address := coalesce(not NEW.address ? 'street' and NEW.address ? 'place', FALSE); + is_place_address := not token_is_street_address(NEW.token_info); -- We have to find our parent road. NEW.parent_place_id := find_parent_for_poi(NEW.osm_type, NEW.osm_id, @@ -1013,7 +1013,7 @@ BEGIN SELECT p.country_code, p.postcode, p.name FROM placex p WHERE p.place_id = NEW.parent_place_id INTO location; - IF is_place_address THEN + IF is_place_address and NEW.address ? 'place' THEN -- Check if the addr:place tag is part of the parent name SELECT count(*) INTO i FROM svals(location.name) AS pname WHERE pname = NEW.address->'place'; diff --git a/lib-sql/tokenizer/icu_tokenizer.sql b/lib-sql/tokenizer/icu_tokenizer.sql index 599d0eb0..04fcedcb 100644 --- a/lib-sql/tokenizer/icu_tokenizer.sql +++ b/lib-sql/tokenizer/icu_tokenizer.sql @@ -41,10 +41,17 @@ AS $$ $$ LANGUAGE SQL IMMUTABLE STRICT; +CREATE OR REPLACE FUNCTION token_is_street_address(info JSONB) + RETURNS BOOLEAN +AS $$ + SELECT info->>'street' is not null or info->>'place' is null; +$$ LANGUAGE SQL IMMUTABLE; + + CREATE OR REPLACE FUNCTION token_has_addr_street(info JSONB) RETURNS BOOLEAN AS $$ - SELECT info->>'street' is not null; + SELECT info->>'street' is not null and info->>'street' != '{}'; $$ LANGUAGE SQL IMMUTABLE; diff --git a/lib-sql/tokenizer/legacy_tokenizer.sql b/lib-sql/tokenizer/legacy_tokenizer.sql index 5826f74a..3b82619f 100644 --- a/lib-sql/tokenizer/legacy_tokenizer.sql +++ b/lib-sql/tokenizer/legacy_tokenizer.sql @@ -41,6 +41,13 @@ AS $$ $$ LANGUAGE SQL IMMUTABLE STRICT; +CREATE OR REPLACE FUNCTION token_is_street_address(info JSONB) + RETURNS BOOLEAN +AS $$ + SELECT info->>'street' is not null or info->>'place' is null; +$$ LANGUAGE SQL IMMUTABLE; + + CREATE OR REPLACE FUNCTION token_has_addr_street(info JSONB) RETURNS BOOLEAN AS $$ diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index 79f383f6..b6e64637 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -720,7 +720,7 @@ class _TokenInfo: self.names: Optional[str] = None self.housenumbers: Set[str] = set() self.housenumber_tokens: Set[int] = set() - self.street_tokens: Set[int] = set() + self.street_tokens: Optional[Set[int]] = None self.place_tokens: Set[int] = set() self.address_tokens: Dict[str, str] = {} self.postcode: Optional[str] = None @@ -742,7 +742,7 @@ class _TokenInfo: out['hnr'] = ';'.join(self.housenumbers) out['hnr_tokens'] = self._mk_array(self.housenumber_tokens) - if self.street_tokens: + if self.street_tokens is not None: out['street'] = self._mk_array(self.street_tokens) if self.place_tokens: @@ -776,6 +776,8 @@ class _TokenInfo: def add_street(self, tokens: Iterable[int]) -> None: """ Add addr:street match terms. """ + if self.street_tokens is None: + self.street_tokens = set() self.street_tokens.update(tokens) diff --git a/test/python/tokenizer/test_icu.py b/test/python/tokenizer/test_icu.py index 7f0ffce1..2d9da69a 100644 --- a/test/python/tokenizer/test_icu.py +++ b/test/python/tokenizer/test_icu.py @@ -523,7 +523,7 @@ class TestPlaceAddress: def test_process_place_nonexisting_street(self): info = self.process_address(street='Grand Road') - assert 'street' not in info + assert info['street'] == '{}' def test_process_place_multiple_street_tags(self): @@ -538,7 +538,7 @@ class TestPlaceAddress: def test_process_place_street_empty(self): info = self.process_address(street='🜵') - assert 'street' not in info + assert info['street'] == '{}' def test_process_place_street_from_cache(self):