From a79a3210e69c648e3e5c1123b4049ce0419b484c Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Sat, 5 Feb 2022 15:13:45 +0100 Subject: [PATCH] implement is-a-name option for housenumbers --- .../tokenizer/sanitizers/clean_housenumbers.py | 15 +++++++++++++-- settings/icu_tokenizer.yaml | 6 ++++-- test/bdd/db/query/housenumbers.feature | 14 ++++++++++++++ 3 files changed, 31 insertions(+), 4 deletions(-) diff --git a/nominatim/tokenizer/sanitizers/clean_housenumbers.py b/nominatim/tokenizer/sanitizers/clean_housenumbers.py index 49f9b4f0..48021793 100644 --- a/nominatim/tokenizer/sanitizers/clean_housenumbers.py +++ b/nominatim/tokenizer/sanitizers/clean_housenumbers.py @@ -29,6 +29,10 @@ class _HousenumberSanitizer: self.filter_kind = create_kind_filter(config, 'housenumber') self.split_regexp = create_split_regex(config) + nameregexps = config.get('is-a-name', []) + self.is_name_regexp = [re.compile(r) for r in nameregexps] + + def __call__(self, obj): if not obj.address: @@ -37,8 +41,11 @@ class _HousenumberSanitizer: new_address = [] for item in obj.address: if self.filter_kind(item): - new_address.extend(item.clone(kind='housenumber', name=n) - for n in self.sanitize(item.name)) + if self._treat_as_name(item.name): + obj.names.append(item.clone(kind='housenumber')) + else: + new_address.extend(item.clone(kind='housenumber', name=n) + for n in self.sanitize(item.name)) else: # Don't touch other address items. new_address.append(item) @@ -62,6 +69,10 @@ class _HousenumberSanitizer: yield hnr + def _treat_as_name(self, housenumber): + return any(r.fullmatch(housenumber) is not None for r in self.is_name_regexp) + + def create(config): """ Create a housenumber processing function. 
""" diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml index bf51f563..7f53c5a7 100644 --- a/settings/icu_tokenizer.yaml +++ b/settings/icu_tokenizer.yaml @@ -25,13 +25,15 @@ transliteration: - "[^a-z0-9[:Space:]] >" - ":: NFC ()" sanitizers: - - step: split-name-list - - step: strip-brace-terms - step: clean-housenumbers filter-kind: - housenumber - conscriptionnumber - streetnumber + is-a-name: + - (\A|.*,)[^\d,]{3,}(,.*|\Z) + - step: split-name-list + - step: strip-brace-terms - step: tag-analyzer-by-language filter-kind: [".*name.*"] whitelist: [bg,ca,cs,da,de,el,en,es,et,eu,fi,fr,gl,hu,it,ja,mg,ms,nl,no,pl,pt,ro,ru,sk,sl,sv,tr,uk,vi] diff --git a/test/bdd/db/query/housenumbers.feature b/test/bdd/db/query/housenumbers.feature index 63bd8984..bbb43f17 100644 --- a/test/bdd/db/query/housenumbers.feature +++ b/test/bdd/db/query/housenumbers.feature @@ -53,3 +53,17 @@ Feature: Searching of house numbers | 2;4;12 | | 2,4,12 | | 2, 4, 12 | + + + Scenario: A name mapped as a housenumber is found + Given the places + | osm | class | type | housenr | geometry | + | N1 | building | yes | Warring | 9 | + And the places + | osm | class | type | name | geometry | + | W10 | highway | path | Chester St | 1,2,3 | + When importing + When sending search query "Chester St Warring" + Then results contain + | osm | + | N1 | -- 2.39.5