From: Sarah Hoffmann Date: Tue, 4 Mar 2025 14:18:27 +0000 (+0100) Subject: add support for country prefixes in postcodes X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/434fbbfd182c2f556d46b3bac65a4d92ab59b9d7?hp=--cc add support for country prefixes in postcodes --- 434fbbfd182c2f556d46b3bac65a4d92ab59b9d7 diff --git a/src/nominatim_api/search/postcode_parser.py b/src/nominatim_api/search/postcode_parser.py index 93ed87c4..1148d3c3 100644 --- a/src/nominatim_api/search/postcode_parser.py +++ b/src/nominatim_api/search/postcode_parser.py @@ -7,7 +7,7 @@ """ Handling of arbitrary postcode tokens in tokenized query string. """ -from typing import Tuple, Set +from typing import Tuple, Set, Dict, List import re from collections import defaultdict @@ -31,20 +31,21 @@ class PostcodeParser: cdata = yaml.safe_load(config.find_config_file('country_settings.yaml') .read_text(encoding='utf-8')) - unique_patterns = defaultdict(set) + unique_patterns: Dict[str, Dict[str, List[str]]] = {} for cc, data in cdata.items(): if data.get('postcode'): - pat = data['postcode']['pattern'] + pat = data['postcode']['pattern'].replace('d', '[0-9]').replace('l', '[a-z]') out = data['postcode'].get('output') - unique_patterns[pat.replace('d', '[0-9]').replace('l', '[a-z]')].add(out) + if pat not in unique_patterns: + unique_patterns[pat] = defaultdict(list) + unique_patterns[pat][out].append(cc) self.global_pattern = re.compile( - '(?:' + - '|'.join(f"(?:{k})" for k in unique_patterns) - + ')[:, >]') + '(?:(?P<cc>[a-z][a-z])(?P<space>[ -]?))?(?P<pc>(?:(?:' + + ')|(?:'.join(unique_patterns) + '))[:, >].*)') - self.local_patterns = [(re.compile(f"(?:{k})[:, >]"), v) - for k, v in unique_patterns.items()] + self.local_patterns = [(re.compile(f"{pat}[:, >]"), list(info.items())) + for pat, info in unique_patterns.items()] def parse(self, query: qmod.QueryStruct) -> Set[Tuple[int, int, str]]: """ Parse postcodes in the given list of query tokens taking into @@ -64,18 +65,22 @@ class 
PostcodeParser: if word[-1] in ' -' and nodes[i + 3].btype != '`': word += nodes[i + 3].term_normalized + nodes[i + 3].btype - # Use global pattern to check for presence of any postocde. - m = self.global_pattern.match(word) + # Use global pattern to check for presence of any postcode. + m = self.global_pattern.fullmatch(word) if m: # If there was a match, check against each pattern separately # because multiple patterns might be machting at the end. + cc = m.group('cc') + pc_word = m.group('pc') + cc_spaces = len(m.group('space') or '') for pattern, info in self.local_patterns: - lm = pattern.match(word) + lm = pattern.match(pc_word) if lm: - trange = (i, i + sum(c in ' ,-:>' for c in lm.group(0))) - for out in info: - if out: - outcodes.add((*trange, lm.expand(out).upper())) - else: - outcodes.add((*trange, lm.group(0)[:-1].upper())) + trange = (i, i + cc_spaces + sum(c in ' ,-:>' for c in lm.group(0))) + for out, out_ccs in info: + if cc is None or cc in out_ccs: + if out: + outcodes.add((*trange, lm.expand(out).upper())) + else: + outcodes.add((*trange, lm.group(0)[:-1].upper())) return outcodes