git.openstreetmap.org Git - nominatim.git/commitdiff
remove postcodes from countries that don't have them
author Sarah Hoffmann <lonvia@denofr.de>
Thu, 12 May 2022 09:43:47 +0000 (11:43 +0200)
committer Sarah Hoffmann <lonvia@denofr.de>
Thu, 23 Jun 2022 21:42:31 +0000 (23:42 +0200)
The postcodes will only be removed as a 'computed postcode'; they
are still searchable for the given object.

.pylintrc
nominatim/tokenizer/sanitizers/config.py
nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py
nominatim/tools/country_info.py
settings/icu_tokenizer.yaml
test/bdd/db/import/postcodes.feature

index fef53872118c6a034286b6490afcd939f889ef11..52d9fcf9e623b2b709841efe35d1a9995cb5a9fe 100644 (file)
--- a/.pylintrc
+++ b/.pylintrc
@@ -13,4 +13,4 @@ ignored-classes=NominatimArgs,closing
 # 'too-many-ancestors' is triggered already by deriving from UserDict
 disable=too-few-public-methods,duplicate-code,too-many-ancestors,bad-option-value,no-self-use
 
 # 'too-many-ancestors' is triggered already by deriving from UserDict
 disable=too-few-public-methods,duplicate-code,too-many-ancestors,bad-option-value,no-self-use
 
-good-names=i,x,y,fd,db
+good-names=i,x,y,fd,db,cc
index ecfcacbe551e7c0747e20b1e14e30458c3b858bc..ce5ce1eb8b5606dd702efb2b582facf1a48a0626 100644 (file)
@@ -44,6 +44,20 @@ class SanitizerConfig(UserDict):
         return values
 
 
         return values
 
 
+    def get_bool(self, param, default=None):
+        """ Extract a configuration parameter as a boolean.
+            The parameter must be one of the yaml boolean values or a
+            user error will be raised. If `default` is given, then the parameter
+            may also be missing or empty.
+        """
+        value = self.data.get(param, default)
+
+        if not isinstance(value, bool):
+            raise UsageError(f"Parameter '{param}' must be a boolean value ('yes' or 'no').")
+
+        return value
+
+
     def get_delimiter(self, default=',;'):
         """ Return the 'delimiter' parameter in the configuration as a
             compiled regular expression that can be used to split the names on the
     def get_delimiter(self, default=',;'):
         """ Return the 'delimiter' parameter in the configuration as a
             compiled regular expression that can be used to split the names on the
index 7898b1c68525dd59d8362a83e258db8ced173a59..9a99d127728290264c7762f7c76fefb7177f3267 100644 (file)
@@ -48,8 +48,7 @@ class _AnalyzerByLanguage:
         self.deflangs = {}
 
         if use_defaults in ('mono', 'all'):
         self.deflangs = {}
 
         if use_defaults in ('mono', 'all'):
-            for ccode, prop in country_info.iterate():
-                clangs = prop['languages']
+            for ccode, clangs in country_info.iterate('languages'):
                 if len(clangs) == 1 or use_defaults == 'all':
                     if self.whitelist:
                         self.deflangs[ccode] = [l for l in clangs if l in self.whitelist]
                 if len(clangs) == 1 or use_defaults == 'all':
                     if self.whitelist:
                         self.deflangs[ccode] = [l for l in clangs if l in self.whitelist]
index 0ad001719e164f110afbf063f69f57711a78b42c..d754b4ddb029365b22d2cc7a77ccaeefc49a2719 100644 (file)
@@ -84,10 +84,20 @@ def setup_country_config(config):
     _COUNTRY_INFO.load(config)
 
 
     _COUNTRY_INFO.load(config)
 
 
-def iterate():
+def iterate(prop=None):
     """ Iterate over country code and properties.
     """ Iterate over country code and properties.
+
+        When `prop` is None, all countries are returned with their complete
+        set of properties.
+
+        If `prop` is given, then only countries are returned where the
+        given property is set. The second item of the tuple contains only
+        the content of the given property.
     """
     """
-    return _COUNTRY_INFO.items()
+    if prop is None:
+        return _COUNTRY_INFO.items()
+
+    return ((c, p[prop]) for c, p in _COUNTRY_INFO.items() if prop in p)
 
 
 def setup_country_tables(dsn, sql_dir, ignore_partitions=False):
 
 
 def setup_country_tables(dsn, sql_dir, ignore_partitions=False):
index cd9c0d6dd56974888c9e12fdf834b51a5b55b22e..544bd81db01ee0dc17f13fce5ad859959b186812 100644 (file)
@@ -32,6 +32,8 @@ sanitizers:
         - streetnumber
       convert-to-name:
         - (\A|.*,)[^\d,]{3,}(,.*|\Z)
         - streetnumber
       convert-to-name:
         - (\A|.*,)[^\d,]{3,}(,.*|\Z)
+    - step: clean-postcodes
+      convert-to-address: yes
     - step: split-name-list
     - step: strip-brace-terms
     - step: tag-analyzer-by-language
     - step: split-name-list
     - step: strip-brace-terms
     - step: tag-analyzer-by-language
index 15beab57827e31b4f411ed05dee028636626385f..50afa7cc2dc5560e66729af3c09c06a9321daa09 100644 (file)
@@ -246,4 +246,18 @@ Feature: Import of postcodes
          | 12 445 4 | ca      | 25          | 11 |
          | A1:BC10  | ca      | 25          | 11 |
 
          | 12 445 4 | ca      | 25          | 11 |
          | A1:BC10  | ca      | 25          | 11 |
 
-
+    Scenario: Postcodes outside all countries are not added to the postcode and word table
+        Given the places
+            | osm | class | type  | addr+postcode | addr+housenumber | addr+place  | geometry  |
+            | N34 | place | house | 01982         | 111              | Null Island | 0 0.00001 |
+        And the places
+            | osm | class | type   | name        | geometry |
+            | N1  | place | hamlet | Null Island | 0 0      |
+        When importing
+        Then location_postcode contains exactly
+            | country | postcode | geometry |
+        And there are no word tokens for postcodes 01982
+        When sending search query "111, 01982 Null Island"
+        Then results contain
+            | osm | display_name |
+            | N34 | 111, Null Island, 01982 |