From f03a05f6bb33918712691657a2179c96da26e93b Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 16 Feb 2022 17:18:23 +0100 Subject: [PATCH] add new analyser for housenumbers This analyser makes spaces optional. --- .../tokenizer/token_analysis/housenumbers.py | 61 +++++ settings/icu_tokenizer.yaml | 2 + test/bdd/db/query/housenumbers.feature | 224 +++++++++++++++++- 3 files changed, 284 insertions(+), 3 deletions(-) create mode 100644 nominatim/tokenizer/token_analysis/housenumbers.py diff --git a/nominatim/tokenizer/token_analysis/housenumbers.py b/nominatim/tokenizer/token_analysis/housenumbers.py new file mode 100644 index 00000000..6a838e00 --- /dev/null +++ b/nominatim/tokenizer/token_analysis/housenumbers.py @@ -0,0 +1,61 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2022 by the Nominatim developer community. +# For a full list of authors see the git log. +""" +Specialized processor for housenumbers. Analyses common housenumber patterns +and creates variants for them. +""" +import re + +from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator + +RE_NON_DIGIT = re.compile('[^0-9]') +RE_DIGIT_ALPHA = re.compile(r'(\d)\s*([^\d\s␣])') +RE_ALPHA_DIGIT = re.compile(r'([^\s\d␣])\s*(\d)') + +### Configuration section + +def configure(rules, normalization_rules): + """ All behaviour is currently hard-coded. + """ + return None + +### Analysis section + +def create(normalizer, transliterator, config): + """ Create a new token analysis instance for this module. + """ + return HousenumberTokenAnalysis(normalizer, transliterator) + + +class HousenumberTokenAnalysis: + """ Detects common housenumber patterns and normalizes them. + """ + def __init__(self, norm, trans): + self.norm = norm + self.trans = trans + + self.mutator = MutationVariantGenerator('␣', (' ', '')) + + def normalize(self, name): + """ Return the normalized form of the housenumber. 
+ """ + # shortcut for number-only numbers, which make up 90% of the data. + if RE_NON_DIGIT.search(name) is None: + return name + + norm = self.trans.transliterate(self.norm.transliterate(name)) + norm = RE_DIGIT_ALPHA.sub(r'\1␣\2', norm) + norm = RE_ALPHA_DIGIT.sub(r'\1␣\2', norm) + + return norm + + def get_variants_ascii(self, norm_name): + """ Compute the spelling variants for the given normalized housenumber. + + Generates variants for optional spaces (marked with '␣'). + """ + return list(self.mutator.generate([norm_name])) diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml index 50bb72d2..bebd49e9 100644 --- a/settings/icu_tokenizer.yaml +++ b/settings/icu_tokenizer.yaml @@ -41,6 +41,8 @@ sanitizers: mode: append token-analysis: - analyzer: generic + - id: "@housenumber" + analyzer: housenumbers - id: bg analyzer: generic mode: variant-only diff --git a/test/bdd/db/query/housenumbers.feature b/test/bdd/db/query/housenumbers.feature index 4d42da9f..62963af4 100644 --- a/test/bdd/db/query/housenumbers.feature +++ b/test/bdd/db/query/housenumbers.feature @@ -9,10 +9,10 @@ Feature: Searching of house numbers | | | | | 4 | - Scenario: A simple numeral housenumber is found + Scenario: A simple ascii digit housenumber is found Given the places - | osm | class | type | housenr | geometry | - | N1 | building | yes | 45 | 9 | + | osm | class | type | housenr | geometry | + | N1 | building | yes | 45 | 9 | And the places | osm | class | type | name | geometry | | W10 | highway | path | North Road | 1,2,3 | @@ -27,6 +27,34 @@ Feature: Searching of house numbers | N1 | + Scenario Outline: Numeral housenumbers in any script are found + Given the places + | osm | class | type | housenr | geometry | + | N1 | building | yes | <number> | 9 | + And the places + | osm | class | type | name | geometry | + | W10 | highway | path | North Road | 1,2,3 | + When importing + And sending search query "45, North Road" + Then results contain + | osm | + | N1 | + When sending 
search query "North Road ④⑤" + Then results contain + | osm | + | N1 | + When sending search query "North Road 𑁪𑁫" + Then results contain + | osm | + | N1 | + + Examples: + | number | + | 45 | + | ④⑤ | + | 𑁪𑁫 | + + Scenario Outline: Each housenumber in a list is found Given the places | osm | class | type | housenr | geometry | @@ -55,6 +83,196 @@ Feature: Searching of house numbers | 2, 4, 12 | + Scenario Outline: Housenumber - letter combinations are found + Given the places + | osm | class | type | housenr | geometry | + | N1 | building | yes | <hnr> | 9 | + And the places + | osm | class | type | name | geometry | + | W10 | highway | path | Multistr | 1,2,3 | + When importing + When sending search query "2A Multistr" + Then results contain + | osm | + | N1 | + When sending search query "2 a Multistr" + Then results contain + | osm | + | N1 | + When sending search query "2-A Multistr" + Then results contain + | osm | + | N1 | + When sending search query "Multistr 2 A" + Then results contain + | osm | + | N1 | + + Examples: + | hnr | + | 2a | + | 2 A | + | 2-a | + | 2/A | + + + Scenario Outline: Number - Number combinations as a housenumber are found + Given the places + | osm | class | type | housenr | geometry | + | N1 | building | yes | <hnr> | 9 | + And the places + | osm | class | type | name | geometry | + | W10 | highway | path | Chester St | 1,2,3 | + When importing + When sending search query "34-10 Chester St" + Then results contain + | osm | + | N1 | + When sending search query "34/10 Chester St" + Then results contain + | osm | + | N1 | + When sending search query "34 10 Chester St" + Then results contain + | osm | + | N1 | + When sending search query "3410 Chester St" + Then results contain + | osm | + | W10 | + + Examples: + | hnr | + | 34-10 | + | 34 10 | + | 34/10 | + + + Scenario Outline: a bis housenumber is found + Given the places + | osm | class | type | housenr | geometry | + | N1 | building | yes | <hnr> | 9 | + And the places + | osm | class | type | name | 
geometry | + | W10 | highway | path | Rue Paris | 1,2,3 | + When importing + When sending search query "Rue Paris 45bis" + Then results contain + | osm | + | N1 | + When sending search query "Rue Paris 45 BIS" + Then results contain + | osm | + | N1 | + When sending search query "Rue Paris 45BIS" + Then results contain + | osm | + | N1 | + When sending search query "Rue Paris 45 bis" + Then results contain + | osm | + | N1 | + + Examples: + | hnr | + | 45bis | + | 45BIS | + | 45 BIS | + | 45 bis | + + Scenario Outline: a ter housenumber is found + Given the places + | osm | class | type | housenr | geometry | + | N1 | building | yes | <hnr> | 9 | + And the places + | osm | class | type | name | geometry | + | W10 | highway | path | Rue du Berger | 1,2,3 | + When importing + When sending search query "Rue du Berger 45ter" + Then results contain + | osm | + | N1 | + When sending search query "Rue du Berger 45 TER" + Then results contain + | osm | + | N1 | + When sending search query "Rue du Berger 45TER" + Then results contain + | osm | + | N1 | + When sending search query "Rue du Berger 45 ter" + Then results contain + | osm | + | N1 | + + Examples: + | hnr | + | 45ter | + | 45TER | + | 45 ter | + | 45 TER | + + + Scenario Outline: a number - letter - number combination housenumber is found + Given the places + | osm | class | type | housenr | geometry | + | N1 | building | yes | <hnr> | 9 | + And the places + | osm | class | type | name | geometry | + | W10 | highway | path | Herengracht | 1,2,3 | + When importing + When sending search query "501-H 1 Herengracht" + Then results contain + | osm | + | N1 | + When sending search query "501H-1 Herengracht" + Then results contain + | osm | + | N1 | + When sending search query "501H1 Herengracht" + Then results contain + | osm | + | N1 | + When sending search query "501-H1 Herengracht" + Then results contain + | osm | + | N1 | + + Examples: + | hnr | + | 501 H1 | + | 501H 1 | + | 501/H/1 | + | 501h1 | + + + Scenario Outline: Russian 
housenumbers are found + Given the places + | osm | class | type | housenr | geometry | + | N1 | building | yes | <hnr> | 9 | + And the places + | osm | class | type | name | geometry | + | W10 | highway | path | Голубинская улица | 1,2,3 | + When importing + When sending search query "Голубинская улица 55к3" + Then results contain + | osm | + | N1 | + When sending search query "Голубинская улица 55 k3" + Then results contain + | osm | + | N1 | + When sending search query "Голубинская улица 55 к-3" + Then results contain + | osm | + | N1 | + + Examples: + | hnr | + | 55к3 | + | 55 к3 | + + + Scenario: A name mapped as a housenumber is found Given the places | osm | class | type | housenr | geometry | -- 2.39.5