From 7f7d2fd5b35fa31f1d53cd0e319705a4600c3f29 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Mon, 6 Dec 2021 14:46:40 +0100
Subject: [PATCH] skip most addr: tags with suffixes

Only one addr: tag can be processed currently, so make sure it is
the one without suffixes to not get odd data. addr:street is the
exception because it uses a different matching mechanism.
---
 nominatim/tokenizer/icu_tokenizer.py |  5 +++--
 test/python/tokenizer/test_icu.py    | 14 ++++++++++++++
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py
index f8f6af2e..33f05cc4 100644
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -420,8 +420,9 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
                 if token:
                     streets.append(token)
             elif item.kind == 'place':
-                token_info.add_place(self._compute_partial_tokens(item.name))
-            elif not item.kind.startswith('_') and \
+                if not item.suffix:
+                    token_info.add_place(self._compute_partial_tokens(item.name))
+            elif not item.kind.startswith('_') and not item.suffix and \
                  item.kind not in ('country', 'full'):
                 addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))
 
diff --git a/test/python/tokenizer/test_icu.py b/test/python/tokenizer/test_icu.py
index 22112220..83668b39 100644
--- a/test/python/tokenizer/test_icu.py
+++ b/test/python/tokenizer/test_icu.py
@@ -514,6 +514,12 @@ class TestPlaceAddress:
         assert eval(info['place']) == self.name_token_set('HONU', 'LULU')
 
 
+    def test_process_place_place_extra(self):
+        info = self.process_address(**{'place:en': 'Honu Lulu'})
+
+        assert 'place' not in info
+
+
     def test_process_place_place_empty(self):
         info = self.process_address(place='🜵')
 
@@ -533,6 +539,14 @@ class TestPlaceAddress:
         assert result == {'city': city, 'suburb': city, 'state': state}
 
 
+    def test_process_place_multiple_address_terms(self):
+        info = self.process_address(**{'city': 'Bruxelles', 'city:de': 'Brüssel'})
+
+        result = {k: eval(v) for k,v in info['addr'].items()}
+
+        assert result == {'city': self.name_token_set('Bruxelles')}
+
+
     def test_process_place_address_terms_empty(self):
         info = self.process_address(country='de', city=' ', street='Hauptstr',
                                     full='right behind the church')
-- 
2.39.5