Merge pull request #2902 from lonvia/tiger-county-sanitizer

author Sarah Hoffmann <lonvia@denofr.de>

Wed, 23 Nov 2022 16:58:42 +0000 (17:58 +0100)

committer GitHub <noreply@github.com>

Wed, 23 Nov 2022 16:58:42 +0000 (17:58 +0100)
author Sarah Hoffmann <lonvia@denofr.de>
Wed, 23 Nov 2022 16:58:42 +0000 (17:58 +0100)
committer GitHub <noreply@github.com>
Wed, 23 Nov 2022 16:58:42 +0000 (17:58 +0100)
diff --git a/.pylintrc b/.pylintrc

index e860940778fe9492072a9d50ce322ea643df6ce9..e62371c615158f2762e3d18ae0e185ad0e54bff7 100644 (file)
--- a/.pylintrc
+++ b/.pylintrc
@@ -15,4 +15,4 @@ ignored-classes=NominatimArgs,closing
  #   typed Python is enabled. See also https://github.com/PyCQA/pylint/issues/5273
  disable=too-few-public-methods,duplicate-code,too-many-ancestors,bad-option-value,no-self-use,not-context-manager
  
-good-names=i,x,y,fd,db,cc
+good-names=i,x,y,m,fd,db,cc
diff --git a/docs/customize/Tokenizers.md b/docs/customize/Tokenizers.md

index c563b20105160e27cb7176ff8b597f0c063c7fab..58606c29d0176822ce3364324520b34634a5e242 100644 (file)
--- a/docs/customize/Tokenizers.md
+++ b/docs/customize/Tokenizers.md
@@ -213,6 +213,15 @@ The following is a list of sanitizers that are shipped with Nominatim.
      rendering:
          heading_level: 6
  
+##### clean-tiger-tags
+
+::: nominatim.tokenizer.sanitizers.clean_tiger_tags
+    selection:
+        members: False
+    rendering:
+        heading_level: 6
+
+
  
  #### Token Analysis
  
diff --git a/nominatim/tokenizer/sanitizers/clean_tiger_tags.py b/nominatim/tokenizer/sanitizers/clean_tiger_tags.py

new file mode 100644 (file)

index 0000000..9698a32
--- /dev/null
+++ b/nominatim/tokenizer/sanitizers/clean_tiger_tags.py
@@ -0,0 +1,46 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Sanitizer that preprocesses tags from the TIGER import.
+
+It makes the following changes:
+
+* remove state reference from tiger:county
+"""
+from typing import Callable
+import re
+
+from nominatim.tokenizer.sanitizers.base import ProcessInfo
+from nominatim.tokenizer.sanitizers.config import SanitizerConfig
+
+COUNTY_MATCH = re.compile('(.*), [A-Z][A-Z]')
+
+def _clean_tiger_county(obj: ProcessInfo) -> None:
+    """ Remove the state reference from tiger:county tags.
+
+        This transforms a name like 'Hamilton, AL' into 'Hamilton'.
+        If no state reference is detected at the end, the name is left as is.
+    """
+    if not obj.address:
+        return
+
+    for item in obj.address:
+        if item.kind == 'tiger' and item.suffix == 'county':
+            m = COUNTY_MATCH.fullmatch(item.name)
+            if m:
+                item.name = m[1]
+            # Switch kind and suffix: splitting the key 'tiger:county' left them reversed.
+            item.kind = 'county'
+            item.suffix = 'tiger'
+
+            return
+
+
+def create(_: SanitizerConfig) -> Callable[[ProcessInfo], None]:
+    """ Create a function that cleans up tags from the TIGER import.
+    """
+    return _clean_tiger_county
diff --git a/settings/flex-base.lua b/settings/flex-base.lua

index 19f4e27bde554dd8e01cd2df2932cec7940a58e0..fe3ce32ae6d85cd18fe0217900ba324da58c12e8 100644 (file)
--- a/settings/flex-base.lua
+++ b/settings/flex-base.lua
@@ -347,16 +347,6 @@ function process_tags(o)
  
      local is_interpolation = o:grab_address{match=INTERPOLATION_TAGS} > 0
  
-    if ADD_TIGER_COUNTY then
-        local v = o:grab_tag('tiger:county')
-        if v ~= nil then
-            v, num = v:gsub(',.*', ' county')
-            if num == 0 then
-                v = v .. ' county'
-            end
-            o:set_address('tiger:county', v)
-        end
-    end
      o:grab_address{match=ADDRESS_TAGS}
  
      if is_interpolation then
diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml

index 212fdcb9e2f7d29cac379c0a58e9041e2819912d..16339970c2ae5727cf606fe686a607814cfec2a2 100644 (file)
--- a/settings/icu_tokenizer.yaml
+++ b/settings/icu_tokenizer.yaml
@@ -35,6 +35,7 @@ sanitizers:
      - step: clean-postcodes
        convert-to-address: yes
        default-pattern: "[A-Z0-9- ]{3,12}"
+    - step: clean-tiger-tags
      - step: split-name-list
      - step: strip-brace-terms
      - step: tag-analyzer-by-language
diff --git a/settings/import-extratags.lua b/settings/import-extratags.lua

index 535af3c8ced1abec65aa3cf1c816e91156f51a10..7b1880ef7df4486a417e362d4df8a0ff41c626fc 100644 (file)
--- a/settings/import-extratags.lua
+++ b/settings/import-extratags.lua
@@ -123,8 +123,7 @@ HOUSENUMBER_TAGS = tag_match{keys = {'addr:housenumber', 'addr:conscriptionnumbe
  
  INTERPOLATION_TAGS = tag_match{keys = {'addr:interpolation'}}
  
-ADDRESS_TAGS = tag_match{keys = {'addr:*', 'is_in:*'}}
-ADD_TIGER_COUNTY = true
+ADDRESS_TAGS = tag_match{keys = {'addr:*', 'is_in:*', 'tiger:county'}}
  
  SAVE_EXTRA_MAINS = true
  
diff --git a/test/bdd/osm2pgsql/import/tags.feature b/test/bdd/osm2pgsql/import/tags.feature

index 1f6857f2c0ac3dcb5d740f49b90acdce546e45cc..60d241fe68e9d3296e82ae9c0938882d7b991eab 100644 (file)
--- a/test/bdd/osm2pgsql/import/tags.feature
+++ b/test/bdd/osm2pgsql/import/tags.feature
@@ -166,20 +166,6 @@ Feature: Tag evaluation
              | N10003:place    | place    | island         |
  
  
-    Scenario: Shorten tiger:county tags
-        When loading osm data
-            """
-            n11001 Tplace=village,tiger:county=Feebourgh%2c%%20%AL
-            n11002 Tplace=village,addr:state=Alabama,tiger:county=Feebourgh%2c%%20%AL
-            n11003 Tplace=village,tiger:county=Feebourgh
-            """
-        Then place contains exactly
-            | object | class | address             |
-            | N11001 | place | 'tiger:county': 'Feebourgh county' |
-            | N11002 | place | 'tiger:county': 'Feebourgh county', 'state': 'Alabama' |
-            | N11003 | place | 'tiger:county': 'Feebourgh county' |
-
-
      Scenario: Building fallbacks
          When loading osm data
              """
diff --git a/test/python/tokenizer/sanitizers/test_clean_tiger_tags.py b/test/python/tokenizer/sanitizers/test_clean_tiger_tags.py

new file mode 100644 (file)

index 0000000..fc17ad2
--- /dev/null
+++ b/test/python/tokenizer/sanitizers/test_clean_tiger_tags.py
@@ -0,0 +1,43 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Tests for the sanitizer that cleans up TIGER tags.
+"""
+import pytest
+
+from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
+from nominatim.data.place_info import PlaceInfo
+
+class TestCleanTigerTags:
+
+    @pytest.fixture(autouse=True)
+    def setup_country(self, def_config):
+        self.config = def_config
+
+
+    def run_sanitizer_on(self, addr):
+        place = PlaceInfo({'address': addr})
+        _, outaddr = PlaceSanitizer([{'step': 'clean-tiger-tags'}], self.config).process_names(place)
+
+        return sorted([(p.name, p.kind, p.suffix) for p in outaddr])
+
+    @pytest.mark.parametrize('inname,outname', [('Hamilton, AL', 'Hamilton'),
+                                                ('Little, Borough, CA', 'Little, Borough')])
+    def test_well_formatted(self, inname, outname):
+        assert self.run_sanitizer_on({'tiger:county': inname})\
+            == [(outname, 'county', 'tiger')]
+
+
+    @pytest.mark.parametrize('name', ('Hamilton', 'Big, Road', ''))
+    def test_badly_formatted(self, name):
+        assert self.run_sanitizer_on({'tiger:county': name})\
+            == [(name, 'county', 'tiger')]
+
+
+    def test_unmatched(self):
+        assert self.run_sanitizer_on({'tiger:country': 'US'})\
+            == [('US', 'tiger', 'country')]
author	Sarah Hoffmann <lonvia@denofr.de>
	Wed, 23 Nov 2022 16:58:42 +0000 (17:58 +0100)
committer	GitHub <noreply@github.com>
	Wed, 23 Nov 2022 16:58:42 +0000 (17:58 +0100)
.pylintrc		patch \| blob \| history
docs/customize/Tokenizers.md		patch \| blob \| history
nominatim/tokenizer/sanitizers/clean_tiger_tags.py	[new file with mode: 0644]	patch \| blob
settings/flex-base.lua		patch \| blob \| history
settings/icu_tokenizer.yaml		patch \| blob \| history
settings/import-extratags.lua		patch \| blob \| history
test/bdd/osm2pgsql/import/tags.feature		patch \| blob \| history
test/python/tokenizer/sanitizers/test_clean_tiger_tags.py	[new file with mode: 0644]	patch \| blob