From 8080625747dc7e87bc510d2af0d3edf5d551a6d0 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann <lonvia@denofr.de>
Date: Thu, 12 May 2022 11:43:47 +0200
Subject: [PATCH] remove postcodes from countries that don't have them

The postcodes will only be removed as a 'computed postcode'; they
are still searchable for the given object.
---
 .pylintrc                                        |  2 +-
 nominatim/tokenizer/sanitizers/config.py         | 14 ++++++++++++++
 .../sanitizers/tag_analyzer_by_language.py       |  3 +--
 nominatim/tools/country_info.py                  | 14 ++++++++++++--
 settings/icu_tokenizer.yaml                      |  2 ++
 test/bdd/db/import/postcodes.feature             | 16 +++++++++++++++-
 6 files changed, 45 insertions(+), 6 deletions(-)

diff --git a/.pylintrc b/.pylintrc
index fef53872..52d9fcf9 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -13,4 +13,4 @@ ignored-classes=NominatimArgs,closing
 # 'too-many-ancestors' is triggered already by deriving from UserDict
 disable=too-few-public-methods,duplicate-code,too-many-ancestors,bad-option-value,no-self-use
 
-good-names=i,x,y,fd,db
+good-names=i,x,y,fd,db,cc
diff --git a/nominatim/tokenizer/sanitizers/config.py b/nominatim/tokenizer/sanitizers/config.py
index ecfcacbe..ce5ce1eb 100644
--- a/nominatim/tokenizer/sanitizers/config.py
+++ b/nominatim/tokenizer/sanitizers/config.py
@@ -44,6 +44,20 @@ class SanitizerConfig(UserDict):
         return values
 
 
+    def get_bool(self, param, default=None):
+        """ Extract a configuration parameter as a boolean and return it.
+            The parameter must be one of the yaml boolean values or a
+            UsageError will be raised. If a boolean `default` is given, it is
+            returned when the parameter is missing or empty.
+        """
+        value = self.data.get(param)
+        if value is None:  # key missing or left empty in the yaml file
+            value = default
+        if not isinstance(value, bool):
+            raise UsageError(f"Parameter '{param}' must be a boolean value ('yes' or 'no').")
+        return value
+
+
     def get_delimiter(self, default=',;'):
         """ Return the 'delimiter' parameter in the configuration as a
             compiled regular expression that can be used to split the names on the
diff --git a/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py b/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py
index 7898b1c6..9a99d127 100644
--- a/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py
+++ b/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py
@@ -48,8 +48,7 @@ class _AnalyzerByLanguage:
         self.deflangs = {}
 
         if use_defaults in ('mono', 'all'):
-            for ccode, prop in country_info.iterate():
-                clangs = prop['languages']
+            for ccode, clangs in country_info.iterate('languages'):
                 if len(clangs) == 1 or use_defaults == 'all':
                     if self.whitelist:
                         self.deflangs[ccode] = [l for l in clangs if l in self.whitelist]
diff --git a/nominatim/tools/country_info.py b/nominatim/tools/country_info.py
index 0ad00171..d754b4dd 100644
--- a/nominatim/tools/country_info.py
+++ b/nominatim/tools/country_info.py
@@ -84,10 +84,20 @@ def setup_country_config(config):
     _COUNTRY_INFO.load(config)
 
 
-def iterate():
+def iterate(prop=None):
     """ Iterate over country code and properties.
+
+        Without `prop`, every country is returned together with its
+        complete set of properties.
+
+        With `prop`, only countries that have this property set are
+        returned and the second item of each tuple is the value of
+        that property alone.
     """
-    return _COUNTRY_INFO.items()
+    countries = _COUNTRY_INFO.items()
+    if prop is None:
+        return countries
+    return ((code, info[prop]) for code, info in countries if prop in info)
 
 
 def setup_country_tables(dsn, sql_dir, ignore_partitions=False):
diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml
index cd9c0d6d..544bd81d 100644
--- a/settings/icu_tokenizer.yaml
+++ b/settings/icu_tokenizer.yaml
@@ -32,6 +32,8 @@ sanitizers:
         - streetnumber
       convert-to-name:
         - (\A|.*,)[^\d,]{3,}(,.*|\Z)
+    - step: clean-postcodes
+      convert-to-address: yes
     - step: split-name-list
     - step: strip-brace-terms
     - step: tag-analyzer-by-language
diff --git a/test/bdd/db/import/postcodes.feature b/test/bdd/db/import/postcodes.feature
index 15beab57..50afa7cc 100644
--- a/test/bdd/db/import/postcodes.feature
+++ b/test/bdd/db/import/postcodes.feature
@@ -246,4 +246,18 @@ Feature: Import of postcodes
          | 12 445 4 | ca      | 25          | 11 |
          | A1:BC10  | ca      | 25          | 11 |
 
-
+    Scenario: Postcodes outside all countries are not added to the postcode and word table
+        Given the places
+            | osm | class | type  | addr+postcode | addr+housenumber | addr+place  | geometry  |
+            | N34 | place | house | 01982         | 111              | Null Island | 0 0.00001 |
+        And the places
+            | osm | class | type   | name        | geometry |
+            | N1  | place | hamlet | Null Island | 0 0      |
+        When importing
+        Then location_postcode contains exactly
+            | country | postcode | geometry |
+        And there are no word tokens for postcodes 01982
+        When sending search query "111, 01982 Null Island"
+        Then results contain
+            | osm | display_name |
+            | N34 | 111, Null Island, 01982 |
-- 
2.39.5