remove postcodes from countries that don't have them

author Sarah Hoffmann <lonvia@denofr.de>

Thu, 12 May 2022 09:43:47 +0000 (11:43 +0200)

committer Sarah Hoffmann <lonvia@denofr.de>

Thu, 23 Jun 2022 21:42:31 +0000 (23:42 +0200)
author Sarah Hoffmann <lonvia@denofr.de>
Thu, 12 May 2022 09:43:47 +0000 (11:43 +0200)
committer Sarah Hoffmann <lonvia@denofr.de>
Thu, 23 Jun 2022 21:42:31 +0000 (23:42 +0200)
diff --git a/.pylintrc b/.pylintrc

index fef53872118c6a034286b6490afcd939f889ef11..52d9fcf9e623b2b709841efe35d1a9995cb5a9fe 100644 (file)
--- a/.pylintrc
+++ b/.pylintrc
@@ -13,4 +13,4 @@ ignored-classes=NominatimArgs,closing
  # 'too-many-ancestors' is triggered already by deriving from UserDict
  disable=too-few-public-methods,duplicate-code,too-many-ancestors,bad-option-value,no-self-use
  
  # 'too-many-ancestors' is triggered already by deriving from UserDict
  disable=too-few-public-methods,duplicate-code,too-many-ancestors,bad-option-value,no-self-use
  
-good-names=i,x,y,fd,db
+good-names=i,x,y,fd,db,cc
diff --git a/nominatim/tokenizer/sanitizers/config.py b/nominatim/tokenizer/sanitizers/config.py

index ecfcacbe551e7c0747e20b1e14e30458c3b858bc..ce5ce1eb8b5606dd702efb2b582facf1a48a0626 100644 (file)
--- a/nominatim/tokenizer/sanitizers/config.py
+++ b/nominatim/tokenizer/sanitizers/config.py
@@ -44,6 +44,20 @@ class SanitizerConfig(UserDict):
          return values
  
  
          return values
  
  
+    def get_bool(self, param, default=None):
+        """ Extract a configuration parameter as a boolean.
+            The parameter must be one of the YAML boolean values or a
+            user error will be raised. If `default` is given, then the parameter
+            may also be missing or empty.
+        """
+        value = self.data.get(param, default)
+
+        if not isinstance(value, bool):
+            raise UsageError(f"Parameter '{param}' must be a boolean value ('yes' or 'no').")
+
+        return value
+
+
      def get_delimiter(self, default=',;'):
          """ Return the 'delimiter' parameter in the configuration as a
              compiled regular expression that can be used to split the names on the
      def get_delimiter(self, default=',;'):
          """ Return the 'delimiter' parameter in the configuration as a
              compiled regular expression that can be used to split the names on the
diff --git a/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py b/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py

index 7898b1c68525dd59d8362a83e258db8ced173a59..9a99d127728290264c7762f7c76fefb7177f3267 100644 (file)
--- a/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py
+++ b/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py
@@ -48,8 +48,7 @@ class _AnalyzerByLanguage:
          self.deflangs = {}
  
          if use_defaults in ('mono', 'all'):
          self.deflangs = {}
  
          if use_defaults in ('mono', 'all'):
-            for ccode, prop in country_info.iterate():
-                clangs = prop['languages']
+            for ccode, clangs in country_info.iterate('languages'):
                  if len(clangs) == 1 or use_defaults == 'all':
                      if self.whitelist:
                          self.deflangs[ccode] = [l for l in clangs if l in self.whitelist]
                  if len(clangs) == 1 or use_defaults == 'all':
                      if self.whitelist:
                          self.deflangs[ccode] = [l for l in clangs if l in self.whitelist]
diff --git a/nominatim/tools/country_info.py b/nominatim/tools/country_info.py

index 0ad001719e164f110afbf063f69f57711a78b42c..d754b4ddb029365b22d2cc7a77ccaeefc49a2719 100644 (file)
--- a/nominatim/tools/country_info.py
+++ b/nominatim/tools/country_info.py
@@ -84,10 +84,20 @@ def setup_country_config(config):
      _COUNTRY_INFO.load(config)
  
  
      _COUNTRY_INFO.load(config)
  
  
-def iterate():
+def iterate(prop=None):
      """ Iterate over country code and properties.
      """ Iterate over country code and properties.
+
+        When `prop` is None, all countries are returned with their complete
+        set of properties.
+
+        If `prop` is given, then only countries are returned where the
+        given property is set. The second item of the tuple contains only
+        the content of the given property.
      """
      """
-    return _COUNTRY_INFO.items()
+    if prop is None:
+        return _COUNTRY_INFO.items()
+
+    return ((c, p[prop]) for c, p in _COUNTRY_INFO.items() if prop in p)
  
  
  def setup_country_tables(dsn, sql_dir, ignore_partitions=False):
  
  
  def setup_country_tables(dsn, sql_dir, ignore_partitions=False):
diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml

index cd9c0d6dd56974888c9e12fdf834b51a5b55b22e..544bd81db01ee0dc17f13fce5ad859959b186812 100644 (file)
--- a/settings/icu_tokenizer.yaml
+++ b/settings/icu_tokenizer.yaml
@@ -32,6 +32,8 @@ sanitizers:
          - streetnumber
        convert-to-name:
          - (\A|.*,)[^\d,]{3,}(,.*|\Z)
          - streetnumber
        convert-to-name:
          - (\A|.*,)[^\d,]{3,}(,.*|\Z)
+    - step: clean-postcodes
+      convert-to-address: yes
      - step: split-name-list
      - step: strip-brace-terms
      - step: tag-analyzer-by-language
      - step: split-name-list
      - step: strip-brace-terms
      - step: tag-analyzer-by-language
diff --git a/test/bdd/db/import/postcodes.feature b/test/bdd/db/import/postcodes.feature

index 15beab57827e31b4f411ed05dee028636626385f..50afa7cc2dc5560e66729af3c09c06a9321daa09 100644 (file)
--- a/test/bdd/db/import/postcodes.feature
+++ b/test/bdd/db/import/postcodes.feature
@@ -246,4 +246,18 @@ Feature: Import of postcodes
           | 12 445 4 | ca      | 25          | 11 |
           | A1:BC10  | ca      | 25          | 11 |
  
           | 12 445 4 | ca      | 25          | 11 |
           | A1:BC10  | ca      | 25          | 11 |
  
-
+    Scenario: Postcodes outside all countries are not added to the postcode and word table
+        Given the places
+            | osm | class | type  | addr+postcode | addr+housenumber | addr+place  | geometry  |
+            | N34 | place | house | 01982         | 111              | Null Island | 0 0.00001 |
+        And the places
+            | osm | class | type   | name        | geometry |
+            | N1  | place | hamlet | Null Island | 0 0      |
+        When importing
+        Then location_postcode contains exactly
+            | country | postcode | geometry |
+        And there are no word tokens for postcodes 01982
+        When sending search query "111, 01982 Null Island"
+        Then results contain
+            | osm | display_name |
+            | N34 | 111, Null Island, 01982 |
author	Sarah Hoffmann <lonvia@denofr.de>
	Thu, 12 May 2022 09:43:47 +0000 (11:43 +0200)
committer	Sarah Hoffmann <lonvia@denofr.de>
	Thu, 23 Jun 2022 21:42:31 +0000 (23:42 +0200)
.pylintrc		patch \| blob \| history
nominatim/tokenizer/sanitizers/config.py		patch \| blob \| history
nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py		patch \| blob \| history
nominatim/tools/country_info.py		patch \| blob \| history
settings/icu_tokenizer.yaml		patch \| blob \| history
test/bdd/db/import/postcodes.feature		patch \| blob \| history