# typed Python is enabled. See also https://github.com/PyCQA/pylint/issues/5273
disable=too-few-public-methods,duplicate-code,too-many-ancestors,bad-option-value,no-self-use,not-context-manager
-good-names=i,x,y,fd,db,cc
+good-names=i,x,y,m,fd,db,cc
rendering:
heading_level: 6
+##### clean-tiger-tags
+
+::: nominatim.tokenizer.sanitizers.clean_tiger_tags
+ selection:
+ members: False
+ rendering:
+ heading_level: 6
+
+
#### Token Analysis
--- /dev/null
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Sanitizer that preprocesses tags from the TIGER import.
+
+It makes the following changes:
+
+* remove state reference from tiger:county
+"""
+from typing import Callable
+import re
+
+from nominatim.tokenizer.sanitizers.base import ProcessInfo
+from nominatim.tokenizer.sanitizers.config import SanitizerConfig
+
+COUNTY_MATCH = re.compile('(.*), [A-Z][A-Z]')
+
+def _clean_tiger_county(obj: ProcessInfo) -> None:
+ """ Remove the state reference from tiger:county tags.
+
+ This transforms a name like 'Hamilton, AL' into 'Hamilton'.
+ If no state reference is detected at the end, the name is left as is.
+ """
+ if not obj.address:
+ return
+
+ for item in obj.address:
+ if item.kind == 'tiger' and item.suffix == 'county':
+ m = COUNTY_MATCH.fullmatch(item.name)
+ if m:
+ item.name = m[1]
+ # Switch kind and suffix: splitting the 'tiger:county' key left them reversed.
+ item.kind = 'county'
+ item.suffix = 'tiger'
+
+ return
+
+
+def create(_: SanitizerConfig) -> Callable[[ProcessInfo], None]:
+ """ Create a function that cleans up TIGER tags.
+ """
+ return _clean_tiger_county
local is_interpolation = o:grab_address{match=INTERPOLATION_TAGS} > 0
- if ADD_TIGER_COUNTY then
- local v = o:grab_tag('tiger:county')
- if v ~= nil then
- v, num = v:gsub(',.*', ' county')
- if num == 0 then
- v = v .. ' county'
- end
- o:set_address('tiger:county', v)
- end
- end
o:grab_address{match=ADDRESS_TAGS}
if is_interpolation then
- step: clean-postcodes
convert-to-address: yes
default-pattern: "[A-Z0-9- ]{3,12}"
+ - step: clean-tiger-tags
- step: split-name-list
- step: strip-brace-terms
- step: tag-analyzer-by-language
INTERPOLATION_TAGS = tag_match{keys = {'addr:interpolation'}}
-ADDRESS_TAGS = tag_match{keys = {'addr:*', 'is_in:*'}}
-ADD_TIGER_COUNTY = true
+ADDRESS_TAGS = tag_match{keys = {'addr:*', 'is_in:*', 'tiger:county'}}
SAVE_EXTRA_MAINS = true
| N10003:place | place | island |
- Scenario: Shorten tiger:county tags
- When loading osm data
- """
- n11001 Tplace=village,tiger:county=Feebourgh%2c%%20%AL
- n11002 Tplace=village,addr:state=Alabama,tiger:county=Feebourgh%2c%%20%AL
- n11003 Tplace=village,tiger:county=Feebourgh
- """
- Then place contains exactly
- | object | class | address |
- | N11001 | place | 'tiger:county': 'Feebourgh county' |
- | N11002 | place | 'tiger:county': 'Feebourgh county', 'state': 'Alabama' |
- | N11003 | place | 'tiger:county': 'Feebourgh county' |
-
-
Scenario: Building fallbacks
When loading osm data
"""
--- /dev/null
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Tests for the sanitizer that cleans up TIGER tags.
+"""
+import pytest
+
+from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
+from nominatim.data.place_info import PlaceInfo
+
+class TestCleanTigerTags:
+
+ @pytest.fixture(autouse=True)
+ def setup_country(self, def_config):
+ self.config = def_config
+
+
+ def run_sanitizer_on(self, addr):
+ place = PlaceInfo({'address': addr})
+ _, outaddr = PlaceSanitizer([{'step': 'clean-tiger-tags'}], self.config).process_names(place)
+
+ return sorted([(p.name, p.kind, p.suffix) for p in outaddr])
+
+ @pytest.mark.parametrize('inname,outname', [('Hamilton, AL', 'Hamilton'),
+ ('Little, Borough, CA', 'Little, Borough')])
+ def test_well_formatted(self, inname, outname):
+ assert self.run_sanitizer_on({'tiger:county': inname})\
+ == [(outname, 'county', 'tiger')]
+
+
+ @pytest.mark.parametrize('name', ('Hamilton', 'Big, Road', ''))
+ def test_badly_formatted(self, name):
+ assert self.run_sanitizer_on({'tiger:county': name})\
+ == [(name, 'county', 'tiger')]
+
+
+ def test_unmatched(self):
+ assert self.run_sanitizer_on({'tiger:country': 'US'})\
+ == [('US', 'tiger', 'country')]