From 37b2c6a830c90aea17b76c5b6a74c711025a142d Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 8 Jun 2022 08:19:55 +0200 Subject: [PATCH] port legacy tokenizer to new postcode handling Also documents the changes to the SQL functions of the tokenizer. --- docs/develop/Tokenizers.md | 6 +++--- lib-sql/tokenizer/icu_tokenizer.sql | 7 ------- lib-sql/tokenizer/legacy_tokenizer.sql | 4 ++-- nominatim/tokenizer/legacy_tokenizer.py | 10 ++++++++-- nominatim/tools/postcodes.py | 6 +++--- 5 files changed, 16 insertions(+), 17 deletions(-) diff --git a/docs/develop/Tokenizers.md b/docs/develop/Tokenizers.md index 2b4da005..5fe4e38d 100644 --- a/docs/develop/Tokenizers.md +++ b/docs/develop/Tokenizers.md @@ -245,11 +245,11 @@ Currently, tokenizers are encouraged to make sure that matching works against both the search token list and the match token list. ```sql -FUNCTION token_normalized_postcode(postcode TEXT) RETURNS TEXT +FUNCTION token_get_postcode(info JSONB) RETURNS TEXT ``` -Return the normalized version of the given postcode. This function must return -the same value as the Python function `AbstractAnalyzer->normalize_postcode()`. +Return the postcode for the object, if any exists. The postcode must be in +the form that should also be presented to the end-user. ```sql FUNCTION token_strip_info(info JSONB) RETURNS JSONB diff --git a/lib-sql/tokenizer/icu_tokenizer.sql b/lib-sql/tokenizer/icu_tokenizer.sql index f86a0a37..599d0eb0 100644 --- a/lib-sql/tokenizer/icu_tokenizer.sql +++ b/lib-sql/tokenizer/icu_tokenizer.sql @@ -97,13 +97,6 @@ AS $$ $$ LANGUAGE SQL IMMUTABLE STRICT; -CREATE OR REPLACE FUNCTION token_normalized_postcode(postcode TEXT) - RETURNS TEXT -AS $$ - SELECT CASE WHEN postcode SIMILAR TO '%(,|;)%' THEN NULL ELSE upper(trim(postcode))END; -$$ LANGUAGE SQL IMMUTABLE STRICT; - - CREATE OR REPLACE FUNCTION token_get_postcode(info JSONB) RETURNS TEXT AS $$ diff --git a/lib-sql/tokenizer/legacy_tokenizer.sql b/lib-sql/tokenizer/legacy_tokenizer.sql index 64453d4e..5826f74a 100644 --- a/lib-sql/tokenizer/legacy_tokenizer.sql +++ b/lib-sql/tokenizer/legacy_tokenizer.sql @@ -97,10 +97,10 @@ AS $$ $$ LANGUAGE SQL IMMUTABLE STRICT; -CREATE OR REPLACE FUNCTION token_normalized_postcode(postcode TEXT) +CREATE OR REPLACE FUNCTION token_get_postcode(info JSONB) RETURNS TEXT AS $$ - SELECT CASE WHEN postcode SIMILAR TO '%(,|;)%' THEN NULL ELSE upper(trim(postcode))END; + SELECT info->>'postcode'; $$ LANGUAGE SQL IMMUTABLE STRICT; diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py index a292b180..36fd5722 100644 --- a/nominatim/tokenizer/legacy_tokenizer.py +++ b/nominatim/tokenizer/legacy_tokenizer.py @@ -467,8 +467,9 @@ class LegacyNameAnalyzer(AbstractAnalyzer): if key == 'postcode': # Make sure the normalized postcode is present in the word table. if re.search(r'[:,;]', value) is None: - self._cache.add_postcode(self.conn, - self.normalize_postcode(value)) + norm_pc = self.normalize_postcode(value) + token_info.set_postcode(norm_pc) + self._cache.add_postcode(self.conn, norm_pc) elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'): hnrs.append(value) elif key == 'street': @@ -527,6 +528,11 @@ class _TokenInfo: self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone() + def set_postcode(self, postcode): + """ Set or replace the postcode token with the given value. + """ + self.data['postcode'] = postcode + def add_street(self, conn, street): """ Add addr:street match terms. """ diff --git a/nominatim/tools/postcodes.py b/nominatim/tools/postcodes.py index 27fbcc9b..9c66719b 100644 --- a/nominatim/tools/postcodes.py +++ b/nominatim/tools/postcodes.py @@ -186,17 +186,17 @@ def update_postcodes(dsn, project_dir, tokenizer): # Recompute the list of valid postcodes from placex. with conn.cursor(name="placex_postcodes") as cur: cur.execute(""" - SELECT cc as country_code, pc, ST_X(centroid), ST_Y(centroid) + SELECT cc, pc, ST_X(centroid), ST_Y(centroid) FROM (SELECT COALESCE(plx.country_code, get_country_code(ST_Centroid(pl.geometry))) as cc, - token_normalized_postcode(pl.address->'postcode') as pc, + pl.address->'postcode' as pc, COALESCE(plx.centroid, ST_Centroid(pl.geometry)) as centroid FROM place AS pl LEFT OUTER JOIN placex AS plx ON pl.osm_id = plx.osm_id AND pl.osm_type = plx.osm_type WHERE pl.address ? 'postcode' AND pl.geometry IS NOT null) xx WHERE pc IS NOT null AND cc IS NOT null - ORDER BY country_code, pc""") + ORDER BY cc, pc""") collector = None -- 2.39.5