From 80ea13437df4c6d57ea503adbdfc9928de8d859c Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Mon, 6 Jun 2022 23:37:04 +0200
Subject: [PATCH] move postcode matcher in a separate file

---
 nominatim/data/__init__.py                  |  0
 nominatim/data/postcode_format.py           | 97 +++++++++++++++++++
 nominatim/tokenizer/icu_tokenizer.py        |  2 +-
 .../tokenizer/sanitizers/clean_postcodes.py | 70 +------------
 test/python/tokenizer/test_icu.py           |  7 --
 5 files changed, 103 insertions(+), 73 deletions(-)
 create mode 100644 nominatim/data/__init__.py
 create mode 100644 nominatim/data/postcode_format.py

diff --git a/nominatim/data/__init__.py b/nominatim/data/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/nominatim/data/postcode_format.py b/nominatim/data/postcode_format.py
new file mode 100644
index 00000000..0158111a
--- /dev/null
+++ b/nominatim/data/postcode_format.py
@@ -0,0 +1,97 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Functions for formatting postcodes according to their country-specific
+format.
+"""
+import re
+
+from nominatim.errors import UsageError
+from nominatim.tools import country_info
+
+class CountryPostcodeMatcher:
+    """ Matches and formats a postcode according to a format definition
+        of the given country.
+    """
+    def __init__(self, country_code, config):
+        if 'pattern' not in config:
+            raise UsageError("Field 'pattern' required for 'postcode' "
+                             f"for country '{country_code}'")
+
+        pc_pattern = config['pattern'].replace('d', '[0-9]').replace('l', '[A-Z]')
+
+        self.norm_pattern = re.compile(f'\\s*(?:{country_code.upper()}[ -]?)?(.*)\\s*')
+        self.pattern = re.compile(pc_pattern)
+
+        self.output = config.get('output', r'\g<0>')
+
+
+    def match(self, postcode):
+        """ Match the given postcode against the postcode pattern for this
+            matcher. Returns a `re.Match` object if the match was successful
+            and None otherwise.
+        """
+        # Upper-case, strip spaces and leading country code.
+        normalized = self.norm_pattern.fullmatch(postcode.upper())
+
+        if normalized:
+            return self.pattern.fullmatch(normalized.group(1))
+
+        return None
+
+
+    def normalize(self, match):
+        """ Return the default format of the postcode for the given match.
+            `match` must be a `re.Match` object previously returned by
+            `match()`
+        """
+        return match.expand(self.output)
+
+
+class PostcodeFormatter:
+    """ Container for different postcode formats of the world and
+        access functions.
+    """
+    def __init__(self):
+        # Objects without a country code can't have a postcode per definition.
+        self.country_without_postcode = {None}
+        self.country_matcher = {}
+        self.default_matcher = CountryPostcodeMatcher('', {'pattern': '.*'})
+
+        for ccode, prop in country_info.iterate('postcode'):
+            if prop is False:
+                self.country_without_postcode.add(ccode)
+            elif isinstance(prop, dict):
+                self.country_matcher[ccode] = CountryPostcodeMatcher(ccode, prop)
+            else:
+                raise UsageError(f"Invalid entry 'postcode' for country '{ccode}'")
+
+
+    def set_default_pattern(self, pattern):
+        """ Set the postcode match pattern to use, when a country does not
+            have a specific pattern or is marked as country without postcode.
+        """
+        self.default_matcher = CountryPostcodeMatcher('', {'pattern': pattern})
+
+
+    def match(self, country_code, postcode):
+        """ Match the given postcode against the postcode pattern for this
+            matcher. Returns a `re.Match` object if the country has a pattern
+            and the match was successful or None if the match failed.
+        """
+        if country_code in self.country_without_postcode:
+            return None
+
+        return self.country_matcher.get(country_code, self.default_matcher).match(postcode)
+
+
+    def normalize(self, country_code, match):
+        """ Return the default format of the postcode for the given match.
+            `match` must be a `re.Match` object previously returned by
+            `match()`
+        """
+        return self.country_matcher.get(country_code, self.default_matcher).normalize(match)
diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py
index 61c47c11..0dc551e1 100644
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -607,7 +607,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
     def _add_postcode(self, item):
         """ Make sure the normalized postcode is present in the word table.
         """
-        analyzer = self.token_analysis.get_analyzer('@postcode')
+        analyzer = self.token_analysis.analysis.get('@postcode')
 
         if analyzer is None:
             postcode_name = item.name.strip().upper()
diff --git a/nominatim/tokenizer/sanitizers/clean_postcodes.py b/nominatim/tokenizer/sanitizers/clean_postcodes.py
index d1edc60d..fbc46fa5 100644
--- a/nominatim/tokenizer/sanitizers/clean_postcodes.py
+++ b/nominatim/tokenizer/sanitizers/clean_postcodes.py
@@ -16,70 +16,17 @@ Arguments:
                         When set to 'no', non-conforming postcodes are not
                         searchable either.
 """
-import re
-
-from nominatim.errors import UsageError
-from nominatim.tools import country_info
-
-class _PostcodeMatcher:
-    """ Matches and formats a postcode according to the format definition.
-    """
-    def __init__(self, country_code, config):
-        if 'pattern' not in config:
-            raise UsageError("Field 'pattern' required for 'postcode' "
-                             f"for country '{country_code}'")
-
-        pc_pattern = config['pattern'].replace('d', '[0-9]').replace('l', '[A-Z]')
-
-        self.norm_pattern = re.compile(f'\\s*(?:{country_code.upper()}[ -]?)?(.*)\\s*')
-        self.pattern = re.compile(pc_pattern)
-
-        self.output = config.get('output', r'\g<0>')
-
-
-    def match(self, postcode):
-        """ Match the given postcode against the postcode pattern for this
-            matcher. Returns a `re.Match` object if the match was successful
-            and None otherwise.
-        """
-        # Upper-case, strip spaces and leading country code.
-        normalized = self.norm_pattern.fullmatch(postcode.upper())
-
-        if normalized:
-            return self.pattern.fullmatch(normalized.group(1))
-
-        return None
-
-
-    def normalize(self, match):
-        """ Return the default format of the postcode for the given match.
-            `match` must be a `re.Match` object previously returned by
-            `match()`
-        """
-        return match.expand(self.output)
-
+from nominatim.data.postcode_format import PostcodeFormatter
 
 class _PostcodeSanitizer:
 
     def __init__(self, config):
         self.convert_to_address = config.get_bool('convert-to-address', True)
-        # Objects without a country code can't have a postcode per definition.
-        self.country_without_postcode = {None}
-        self.country_matcher = {}
-
-        for ccode, prop in country_info.iterate('postcode'):
-            if prop is False:
-                self.country_without_postcode.add(ccode)
-            elif isinstance(prop, dict):
-                self.country_matcher[ccode] = _PostcodeMatcher(ccode, prop)
-            else:
-                raise UsageError(f"Invalid entry 'postcode' for country '{ccode}'")
+        self.matcher = PostcodeFormatter()
 
         default_pattern = config.get('default-pattern')
         if default_pattern is not None and isinstance(default_pattern, str):
-            self.default_matcher = _PostcodeMatcher('', {'pattern': default_pattern})
-        else:
-            self.default_matcher = None
+            self.matcher.set_default_pattern(default_pattern)
 
 
     def __call__(self, obj):
@@ -106,18 +53,11 @@ class _PostcodeSanitizer:
             normalized version. Returns None if the postcode does not
             correspond to the oficial format of the given country.
         """
-        if country in self.country_without_postcode:
-            return None
-
-        matcher = self.country_matcher.get(country, self.default_matcher)
-        if matcher is None:
-            return postcode.upper(), ''
-
-        match = matcher.match(postcode)
+        match = self.matcher.match(country, postcode)
         if match is None:
             return None
 
-        return matcher.normalize(match), ' '.join(match.groups())
+        return self.matcher.normalize(country, match), ' '.join(match.groups())
 
 
 
diff --git a/test/python/tokenizer/test_icu.py b/test/python/tokenizer/test_icu.py
index d85a5b65..6138a03a 100644
--- a/test/python/tokenizer/test_icu.py
+++ b/test/python/tokenizer/test_icu.py
@@ -437,13 +437,6 @@ class TestPlaceAddress:
         assert word_table.get_postcodes() == {pcode, }
 
 
-    @pytest.mark.parametrize('pcode', ['12:23', 'ab;cd;f', '123;836'])
-    def test_process_place_bad_postcode(self, word_table, pcode):
-        self.process_address(postcode=pcode)
-
-        assert not word_table.get_postcodes()
-
-
     @pytest.mark.parametrize('hnr', ['123a', '1', '101'])
     def test_process_place_housenumbers_simple(self, hnr, getorcreate_hnr_id):
         info = self.process_address(housenumber=hnr)
-- 
2.39.5