X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/90d4d339dbed83cc90823401634f01a20e129548..ca7b46511d41d67e229f758e638367c241815c11:/nominatim/tokenizer/sanitizers/clean_postcodes.py diff --git a/nominatim/tokenizer/sanitizers/clean_postcodes.py b/nominatim/tokenizer/sanitizers/clean_postcodes.py index b07908cd..d1edc60d 100644 --- a/nominatim/tokenizer/sanitizers/clean_postcodes.py +++ b/nominatim/tokenizer/sanitizers/clean_postcodes.py @@ -29,17 +29,34 @@ class _PostcodeMatcher: raise UsageError("Field 'pattern' required for 'postcode' " f"for country '{country_code}'") - self.pattern = re.compile(config['pattern'].replace('d', '[0-9]') - .replace('l', '[A-Z]')) + pc_pattern = config['pattern'].replace('d', '[0-9]').replace('l', '[A-Z]') + self.norm_pattern = re.compile(f'\\s*(?:{country_code.upper()}[ -]?)?(.*)\\s*') + self.pattern = re.compile(pc_pattern) - def normalize(self, postcode): - """ Return the normalized version of the postcode. If the given postcode - does not correspond to the usage-pattern, return null. + self.output = config.get('output', r'\g<0>') + + + def match(self, postcode): + """ Match the given postcode against the postcode pattern for this + matcher. Returns a `re.Match` object if the match was successful + and None otherwise. """ - normalized = postcode.strip().upper() + # Upper-case, strip spaces and leading country code. + normalized = self.norm_pattern.fullmatch(postcode.upper()) + + if normalized: + return self.pattern.fullmatch(normalized.group(1)) + + return None + - return normalized if self.pattern.fullmatch(normalized) else None + def normalize(self, match): + """ Return the default format of the postcode for the given match. + `match` must be a `re.Match` object previously returned by + `match()` + """ + return match.expand(self.output) class _PostcodeSanitizer: @@ -58,6 +75,12 @@ class _PostcodeSanitizer: else: raise UsageError(f"Invalid entry 'postcode' for country '{ccode}'") + default_pattern = config.get('default-pattern') + if default_pattern is not None and isinstance(default_pattern, str): + self.default_matcher = _PostcodeMatcher('', {'pattern': default_pattern}) + else: + self.default_matcher = None + def __call__(self, obj): if not obj.address: @@ -74,7 +97,8 @@ class _PostcodeSanitizer: else: obj.address.pop(pos) else: - postcode.name = formatted + postcode.name = formatted[0] + postcode.set_attr('variant', formatted[1]) def scan(self, postcode, country): @@ -85,10 +109,16 @@ class _PostcodeSanitizer: if country in self.country_without_postcode: return None - if country in self.country_matcher: - return self.country_matcher[country].normalize(postcode) + matcher = self.country_matcher.get(country, self.default_matcher) + if matcher is None: + return postcode.upper(), '' + + match = matcher.match(postcode) + if match is None: + return None + + return matcher.normalize(match), ' '.join(match.groups()) - return postcode.upper()