From fd3dec8efebd202352f20cbfebb36d955e0198c4 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Tue, 22 Nov 2022 17:10:21 +0100 Subject: [PATCH] add sanitizer for TIGER tags Currently only takes over cleaning the tiger:county data. This was done by the import until now. --- .pylintrc | 2 +- docs/customize/Tokenizers.md | 9 ++++ .../tokenizer/sanitizers/clean_tiger_tags.py | 46 +++++++++++++++++++ settings/icu_tokenizer.yaml | 1 + .../sanitizers/test_clean_tiger_tags.py | 43 +++++++++++++++++ 5 files changed, 100 insertions(+), 1 deletion(-) create mode 100644 nominatim/tokenizer/sanitizers/clean_tiger_tags.py create mode 100644 test/python/tokenizer/sanitizers/test_clean_tiger_tags.py diff --git a/.pylintrc b/.pylintrc index e8609407..e62371c6 100644 --- a/.pylintrc +++ b/.pylintrc @@ -15,4 +15,4 @@ ignored-classes=NominatimArgs,closing # typed Python is enabled. See also https://github.com/PyCQA/pylint/issues/5273 disable=too-few-public-methods,duplicate-code,too-many-ancestors,bad-option-value,no-self-use,not-context-manager -good-names=i,x,y,fd,db,cc +good-names=i,x,y,m,fd,db,cc diff --git a/docs/customize/Tokenizers.md b/docs/customize/Tokenizers.md index c563b201..58606c29 100644 --- a/docs/customize/Tokenizers.md +++ b/docs/customize/Tokenizers.md @@ -213,6 +213,15 @@ The following is a list of sanitizers that are shipped with Nominatim. rendering: heading_level: 6 +##### clean-tiger-tags + +::: nominatim.tokenizer.sanitizers.clean_tiger_tags + selection: + members: False + rendering: + heading_level: 6 + + #### Token Analysis diff --git a/nominatim/tokenizer/sanitizers/clean_tiger_tags.py b/nominatim/tokenizer/sanitizers/clean_tiger_tags.py new file mode 100644 index 00000000..9698a326 --- /dev/null +++ b/nominatim/tokenizer/sanitizers/clean_tiger_tags.py @@ -0,0 +1,46 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2022 by the Nominatim developer community. 
+# For a full list of authors see the git log. +""" +Sanitizer that preprocesses tags from the TIGER import. + +It makes the following changes: + +* remove state reference from tiger:county +""" +from typing import Callable +import re + +from nominatim.tokenizer.sanitizers.base import ProcessInfo +from nominatim.tokenizer.sanitizers.config import SanitizerConfig + +COUNTY_MATCH = re.compile('(.*), [A-Z][A-Z]') + +def _clean_tiger_county(obj: ProcessInfo) -> None: + """ Remove the state reference from tiger:county tags. + + This transforms a name like 'Hamilton, AL' into 'Hamilton'. + If no state reference is detected at the end, the name is left as is. + """ + if not obj.address: + return + + for item in obj.address: + if item.kind == 'tiger' and item.suffix == 'county': + m = COUNTY_MATCH.fullmatch(item.name) + if m: + item.name = m[1] + # Switch kind and suffix, the split left them reversed. + item.kind = 'county' + item.suffix = 'tiger' + + return + + +def create(_: SanitizerConfig) -> Callable[[ProcessInfo], None]: + """ Create a function that cleans up TIGER tags. + """ + return _clean_tiger_county diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml index 212fdcb9..16339970 100644 --- a/settings/icu_tokenizer.yaml +++ b/settings/icu_tokenizer.yaml @@ -35,6 +35,7 @@ sanitizers: - step: clean-postcodes convert-to-address: yes default-pattern: "[A-Z0-9- ]{3,12}" + - step: clean-tiger-tags - step: split-name-list - step: strip-brace-terms - step: tag-analyzer-by-language diff --git a/test/python/tokenizer/sanitizers/test_clean_tiger_tags.py b/test/python/tokenizer/sanitizers/test_clean_tiger_tags.py new file mode 100644 index 00000000..fc17ad24 --- /dev/null +++ b/test/python/tokenizer/sanitizers/test_clean_tiger_tags.py @@ -0,0 +1,43 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2022 by the Nominatim developer community. +# For a full list of authors see the git log. 
+""" +Tests for the sanitizer that cleans up TIGER tags. +""" +import pytest + +from nominatim.tokenizer.place_sanitizer import PlaceSanitizer +from nominatim.data.place_info import PlaceInfo + +class TestCleanTigerTags: + + @pytest.fixture(autouse=True) + def setup_country(self, def_config): + self.config = def_config + + + def run_sanitizer_on(self, addr): + place = PlaceInfo({'address': addr}) + _, outaddr = PlaceSanitizer([{'step': 'clean-tiger-tags'}], self.config).process_names(place) + + return sorted([(p.name, p.kind, p.suffix) for p in outaddr]) + + @pytest.mark.parametrize('inname,outname', [('Hamilton, AL', 'Hamilton'), + ('Little, Borough, CA', 'Little, Borough')]) + def test_well_formatted(self, inname, outname): + assert self.run_sanitizer_on({'tiger:county': inname})\ + == [(outname, 'county', 'tiger')] + + + @pytest.mark.parametrize('name', ('Hamilton', 'Big, Road', '')) + def test_badly_formatted(self, name): + assert self.run_sanitizer_on({'tiger:county': name})\ + == [(name, 'county', 'tiger')] + + + def test_unmatched(self): + assert self.run_sanitizer_on({'tiger:country': 'US'})\ + == [('US', 'tiger', 'country')] -- 2.39.5