From 37b2c6a830c90aea17b76c5b6a74c711025a142d Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann <lonvia@denofr.de>
Date: Wed, 8 Jun 2022 08:19:55 +0200
Subject: [PATCH] port legacy tokenizer to new postcode handling

Also documents the changes to the SQL functions of the tokenizer.
---
 docs/develop/Tokenizers.md              |  6 +++---
 lib-sql/tokenizer/icu_tokenizer.sql     |  7 -------
 lib-sql/tokenizer/legacy_tokenizer.sql  |  4 ++--
 nominatim/tokenizer/legacy_tokenizer.py | 10 ++++++++--
 nominatim/tools/postcodes.py            |  6 +++---
 5 files changed, 16 insertions(+), 17 deletions(-)

diff --git a/docs/develop/Tokenizers.md b/docs/develop/Tokenizers.md
index 2b4da005..5fe4e38d 100644
--- a/docs/develop/Tokenizers.md
+++ b/docs/develop/Tokenizers.md
@@ -245,11 +245,11 @@ Currently, tokenizers are encouraged to make sure that matching works against
 both the search token list and the match token list.
 
 ```sql
-FUNCTION token_normalized_postcode(postcode TEXT) RETURNS TEXT
+FUNCTION token_get_postcode(info JSONB) RETURNS TEXT
 ```
 
-Return the normalized version of the given postcode. This function must return
-the same value as the Python function `AbstractAnalyzer->normalize_postcode()`.
+Return the postcode for the object, if any exists. The postcode must be in
+the form that should also be presented to the end-user.
 
 ```sql
 FUNCTION token_strip_info(info JSONB) RETURNS JSONB
diff --git a/lib-sql/tokenizer/icu_tokenizer.sql b/lib-sql/tokenizer/icu_tokenizer.sql
index f86a0a37..599d0eb0 100644
--- a/lib-sql/tokenizer/icu_tokenizer.sql
+++ b/lib-sql/tokenizer/icu_tokenizer.sql
@@ -97,13 +97,6 @@ AS $$
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
-CREATE OR REPLACE FUNCTION token_normalized_postcode(postcode TEXT)
-  RETURNS TEXT
-AS $$
-  SELECT CASE WHEN postcode SIMILAR TO '%(,|;)%' THEN NULL ELSE upper(trim(postcode))END;
-$$ LANGUAGE SQL IMMUTABLE STRICT;
-
-
 CREATE OR REPLACE FUNCTION token_get_postcode(info JSONB)
   RETURNS TEXT
 AS $$
diff --git a/lib-sql/tokenizer/legacy_tokenizer.sql b/lib-sql/tokenizer/legacy_tokenizer.sql
index 64453d4e..5826f74a 100644
--- a/lib-sql/tokenizer/legacy_tokenizer.sql
+++ b/lib-sql/tokenizer/legacy_tokenizer.sql
@@ -97,10 +97,10 @@ AS $$
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
-CREATE OR REPLACE FUNCTION token_normalized_postcode(postcode TEXT)
+CREATE OR REPLACE FUNCTION token_get_postcode(info JSONB)
   RETURNS TEXT
 AS $$
-  SELECT CASE WHEN postcode SIMILAR TO '%(,|;)%' THEN NULL ELSE upper(trim(postcode))END;
+  SELECT info->>'postcode';
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py
index a292b180..36fd5722 100644
--- a/nominatim/tokenizer/legacy_tokenizer.py
+++ b/nominatim/tokenizer/legacy_tokenizer.py
@@ -467,8 +467,9 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
             if key == 'postcode':
                 # Make sure the normalized postcode is present in the word table.
                 if re.search(r'[:,;]', value) is None:
-                    self._cache.add_postcode(self.conn,
-                                             self.normalize_postcode(value))
+                    norm_pc = self.normalize_postcode(value)
+                    token_info.set_postcode(norm_pc)
+                    self._cache.add_postcode(self.conn, norm_pc)
             elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                 hnrs.append(value)
             elif key == 'street':
@@ -527,6 +528,11 @@ class _TokenInfo:
             self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone()
 
 
+    def set_postcode(self, postcode):
+        """ Set or replace the postcode token with the given value.
+        """
+        self.data['postcode'] = postcode
+
     def add_street(self, conn, street):
         """ Add addr:street match terms.
         """
diff --git a/nominatim/tools/postcodes.py b/nominatim/tools/postcodes.py
index 27fbcc9b..9c66719b 100644
--- a/nominatim/tools/postcodes.py
+++ b/nominatim/tools/postcodes.py
@@ -186,17 +186,17 @@ def update_postcodes(dsn, project_dir, tokenizer):
             # Recompute the list of valid postcodes from placex.
             with conn.cursor(name="placex_postcodes") as cur:
                 cur.execute("""
-                SELECT cc as country_code, pc, ST_X(centroid), ST_Y(centroid)
+                SELECT cc, pc, ST_X(centroid), ST_Y(centroid)
                 FROM (SELECT
                         COALESCE(plx.country_code,
                                  get_country_code(ST_Centroid(pl.geometry))) as cc,
-                        token_normalized_postcode(pl.address->'postcode') as pc,
+                        pl.address->'postcode' as pc,
                         COALESCE(plx.centroid, ST_Centroid(pl.geometry)) as centroid
                       FROM place AS pl LEFT OUTER JOIN placex AS plx
                              ON pl.osm_id = plx.osm_id AND pl.osm_type = plx.osm_type
                     WHERE pl.address ? 'postcode' AND pl.geometry IS NOT null) xx
                 WHERE pc IS NOT null AND cc IS NOT null
-                ORDER BY country_code, pc""")
+                ORDER BY cc, pc""")
 
                 collector = None
 
-- 
2.39.5