From: Sarah Hoffmann Date: Wed, 29 Sep 2021 09:54:14 +0000 (+0200) Subject: export more data for the tokenizer name preparation X-Git-Tag: v4.0.0~23^2~5 X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/be65c8303f18d0f92bbf5bc9558f8789d33f21d9 export more data for the tokenizer name preparation Adds class, type, country and rank to the exported information and removes the rather odd hack for countries. Whether a place represents a country boundary can now be computed by the tokenizer. --- diff --git a/lib-sql/functions/placex_triggers.sql b/lib-sql/functions/placex_triggers.sql index 9c2a67a1..8ae8cf39 100644 --- a/lib-sql/functions/placex_triggers.sql +++ b/lib-sql/functions/placex_triggers.sql @@ -1,30 +1,33 @@ -- Trigger functions for the placex table. +-- Information returned by update preparation. +DROP TYPE IF EXISTS prepare_update_info CASCADE; +CREATE TYPE prepare_update_info AS ( + name HSTORE, + address HSTORE, + rank_address SMALLINT, + country_code TEXT, + class TEXT, + type TEXT, + linked_place_id BIGINT +); + -- Retrieve the data needed by the indexer for updating the place. --- --- Return parameters: --- name list of names --- address list of address tags, either from the object or a surrounding --- building --- country_feature If the place is a country feature, this contains the --- country code, otherwise it is null. -CREATE OR REPLACE FUNCTION placex_prepare_update(p placex, - OUT name HSTORE, - OUT address HSTORE, - OUT country_feature VARCHAR, - OUT linked_place_id BIGINT) +CREATE OR REPLACE FUNCTION placex_indexing_prepare(p placex) + RETURNS prepare_update_info AS $$ DECLARE location RECORD; + result prepare_update_info; BEGIN -- For POI nodes, check if the address should be derived from a surrounding -- building. IF p.rank_search < 30 OR p.osm_type != 'N' OR p.address is not null THEN - address := p.address; + result.address := p.address; ELSE -- The additional && condition works around the misguided query -- planner of postgis 3.0. - SELECT placex.address || hstore('_inherited', '') INTO address + SELECT placex.address || hstore('_inherited', '') INTO result.address FROM placex WHERE ST_Covers(geometry, p.centroid) and geometry && p.centroid @@ -34,27 +37,26 @@ BEGIN LIMIT 1; END IF; - address := address - '_unlisted_place'::TEXT; - name := p.name; + result.address := result.address - '_unlisted_place'::TEXT; + result.name := p.name; + result.class := p.class; + result.type := p.type; + result.country_code := p.country_code; + result.rank_address := p.rank_address; -- Names of linked places need to be merged in, so search for a linkable -- place already here. SELECT * INTO location FROM find_linked_place(p); IF location.place_id is not NULL THEN - linked_place_id := location.place_id; + result.linked_place_id := location.place_id; IF NOT location.name IS NULL THEN - name := location.name || name; + result.name := location.name || result.name; END IF; END IF; - country_feature := CASE WHEN p.admin_level = 2 - and p.class = 'boundary' and p.type = 'administrative' - and p.osm_type = 'R' - THEN p.country_code - ELSE null - END; + RETURN result; END; $$ LANGUAGE plpgsql STABLE; diff --git a/nominatim/indexer/place_info.py b/nominatim/indexer/place_info.py index fd179fef..06d730e0 100644 --- a/nominatim/indexer/place_info.py +++ b/nominatim/indexer/place_info.py @@ -38,7 +38,31 @@ class PlaceInfo: @property - def country_feature(self): - """ Return the country code if the place is a valid country boundary. + def country_code(self): + """ The country code of the country the place is in. Guaranteed + to be a two-letter lower-case string or None, if no country + could be found. """ - return self._info.get('country_feature') + return self._info.get('country_code') + + + @property + def rank_address(self): + """ The computed rank address before rank correction. + """ + return self._info.get('rank_address') + + + def is_a(self, key, value): + """ Check if the place's primary tag corresponds to the given + key and value. + """ + return self._info.get('class') == key and self._info.get('type') == value + + + def is_country(self): + """ Check if the place is a valid country boundary. + """ + return self.rank_address == 4 \ + and self.is_a('boundary', 'administrative') \ + and self.country_code is not None diff --git a/nominatim/indexer/runners.py b/nominatim/indexer/runners.py index 43966419..70536a71 100644 --- a/nominatim/indexer/runners.py +++ b/nominatim/indexer/runners.py @@ -39,7 +39,7 @@ class AbstractPlacexRunner: @staticmethod def get_place_details(worker, ids): - worker.perform("""SELECT place_id, (placex_prepare_update(placex)).* + worker.perform("""SELECT place_id, (placex_indexing_prepare(placex)).* FROM placex WHERE place_id IN %s""", (tuple((p[0] for p in ids)), )) diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index 81b07568..fbaa2596 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -397,9 +397,8 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): token_info.add_names(fulls, partials) - country_feature = place.country_feature - if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature): - self.add_country_names(country_feature.lower(), names) + if place.is_country(): + self.add_country_names(place.country_code, names) address = place.address if address: diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py index 8bfb309d..dc6972dc 100644 --- a/nominatim/tokenizer/legacy_tokenizer.py +++ b/nominatim/tokenizer/legacy_tokenizer.py @@ -410,9 +410,8 @@ class LegacyNameAnalyzer(AbstractAnalyzer): if names: token_info.add_names(self.conn, names) - country_feature = place.country_feature - if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature): - self.add_country_names(country_feature.lower(), names) + if place.is_country(): + self.add_country_names(place.country_code, names) address = place.address if address: diff --git a/test/python/test_indexing.py b/test/python/test_indexing.py index 60ad0bc4..4c9d940d 100644 --- a/test/python/test_indexing.py +++ b/test/python/test_indexing.py @@ -29,6 +29,7 @@ class IndexerTestDB: indexed_date TIMESTAMP, partition SMALLINT, admin_level SMALLINT, + country_code TEXT, address HSTORE, token_info JSONB, geometry_sector INTEGER)""") @@ -54,15 +55,26 @@ class IndexerTestDB: END IF; RETURN NEW; END; $$ LANGUAGE plpgsql;""") - cur.execute("""CREATE OR REPLACE FUNCTION placex_prepare_update(p placex, - OUT name HSTORE, - OUT address HSTORE, - OUT country_feature VARCHAR, - OUT linked_place_id BIGINT) + cur.execute("DROP TYPE IF EXISTS prepare_update_info CASCADE") + cur.execute("""CREATE TYPE prepare_update_info AS ( + name HSTORE, + address HSTORE, + rank_address SMALLINT, + country_code TEXT, + class TEXT, + type TEXT, + linked_place_id BIGINT + )""") + cur.execute("""CREATE OR REPLACE FUNCTION placex_indexing_prepare(p placex, + OUT result prepare_update_info) AS $$ BEGIN - address := p.address; - name := p.name; + result.address := p.address; + result.name := p.name; + result.class := p.class; + result.type := p.type; + result.country_code := p.country_code; + result.rank_address := p.rank_address; END; $$ LANGUAGE plpgsql STABLE; """) diff --git a/test/python/test_tokenizer_icu.py b/test/python/test_tokenizer_icu.py index 28c6ef7a..bbfc0b12 100644 --- a/test/python/test_tokenizer_icu.py +++ b/test/python/test_tokenizer_icu.py @@ -323,10 +323,8 @@ class TestPlaceNames: assert eval(info['names']) == set((t[2] for t in tokens)) - def process_named_place(self, names, country_feature=None): + def process_named_place(self, names): place = {'name': names} - if country_feature: - place['country_feature'] = country_feature return self.analyzer.process_place(PlaceInfo(place)) @@ -353,7 +351,13 @@ class TestPlaceNames: def test_country_name(self, word_table): - info = self.process_named_place({'name': 'Norge'}, country_feature='no') + place = PlaceInfo({'name' : {'name': 'Norge'}, + 'country_code': 'no', + 'rank_address': 4, + 'class': 'boundary', + 'type': 'administrative'}) + + info = self.analyzer.process_place(place) self.expect_name_terms(info, '#norge', 'norge') assert word_table.get_country() == {('no', 'NORGE')}