1 # SPDX-License-Identifier: GPL-3.0-or-later
3 # This file is part of Nominatim. (https://nominatim.org)
5 # Copyright (C) 2025 by the Nominatim developer community.
6 # For a full list of authors see the git log.
8 Handling of arbitrary postcode tokens in tokenized query string.
10 from typing import Tuple, Set, Dict, List
12 from collections import defaultdict
16 from ..config import Configuration
17 from . import query as qmod
21 """ Pattern-based parser for postcodes in tokenized queries.
23 The postcode patterns are read from the country configuration.
24 The parser currently does not return country restrictions.
def __init__(self, config: Configuration) -> None:
    """ Build the postcode matchers from 'country_settings.yaml'.

        Countries that share the same postcode pattern are grouped so
        that each distinct pattern is compiled only once. For every
        pattern the country codes are further grouped by their
        optional 'output' template.

        Two attributes are set:
        `global_pattern` matches a string that starts with any known
        postcode, optionally preceded by a two-letter country code.
        `local_patterns` is a list of
        (compiled pattern, [(output template, [country codes])]).
    """
    # Replace includes by an empty list, so that the complete country
    # name data does not need to be loaded.
    # Note that this registers the constructor on yaml.SafeLoader itself.
    yaml.add_constructor('!include', lambda loader, node: [],
                         Loader=yaml.SafeLoader)
    settings_file = config.find_config_file('country_settings.yaml')
    countries = yaml.safe_load(settings_file.read_text(encoding='utf-8'))

    grouped: Dict[str, Dict[str, List[str]]] = {}
    for country_code, settings in countries.items():
        postcode_cfg = settings.get('postcode')
        if not postcode_cfg:
            continue
        # In the configuration 'd' is shorthand for a digit and
        # 'l' for a lower-case letter.
        regex = postcode_cfg['pattern'].replace('d', '[0-9]').replace('l', '[a-z]')
        template = postcode_cfg.get('output')
        grouped.setdefault(regex, defaultdict(list))[template].append(country_code)

    alternatives = ')|(?:'.join(grouped)
    self.global_pattern = re.compile(
        '(?:(?P<cc>[a-z][a-z])(?P<space>[ -]?))?(?P<pc>(?:(?:'
        + alternatives + '))[:, >].*)')

    self.local_patterns = [
        (re.compile(f"{regex}[:, >]"), list(by_output.items()))
        for regex, by_output in grouped.items()
    ]
def parse(self, query: qmod.QueryStruct) -> Set[Tuple[int, int, str]]:
    """ Parse postcodes in the given list of query tokens taking into
        account the list of breaks from the nodes.

        Starting at every eligible node, the normalized terms that
        follow are concatenated together with their break characters
        and handed to `_match_word()`:

        * In a phrase of type PHRASE_ANY at most three terms are
          joined and the postcode only needs to match at the
          beginning of the joined string.
        * In a phrase of type PHRASE_POSTCODE all terms of the phrase
          are joined and the postcode must match the complete string.

        The result is a set of tuples with
        [start node id, end node id, postcode token]
    """
    # NOTE(review): assumes that the query exposes its node list as
    # `query.nodes` -- confirm against QueryStruct.
    nodes = query.nodes
    outcodes: Set[Tuple[int, int, str]] = set()

    for i in range(query.num_token_slots()):
        # A candidate must start directly after one of the breaks '<,: '
        # and the following term must not end in a '`' break.
        if nodes[i].btype not in '<,: ' or nodes[i + 1].btype == '`':
            continue
        # Do not start again when the preceding node already belongs
        # to a postcode phrase.
        if i > 0 and nodes[i - 1].ptype == qmod.PHRASE_POSTCODE:
            continue

        if nodes[i].ptype == qmod.PHRASE_ANY:
            word = nodes[i + 1].term_normalized + nodes[i + 1].btype
            # Append up to two more terms as long as the terms are
            # separated by a space or hyphen and stay inside the
            # free-form phrase.
            for k in (i + 1, i + 2):
                if word[-1] not in ' -' or nodes[k + 1].btype == '`' \
                        or nodes[k].ptype != qmod.PHRASE_ANY:
                    break
                word += nodes[k + 1].term_normalized + nodes[k + 1].btype

            self._match_word(word, i, False, outcodes)
        elif nodes[i].ptype == qmod.PHRASE_POSTCODE:
            word = nodes[i + 1].term_normalized + nodes[i + 1].btype
            # Collect all remaining terms of the postcode phrase.
            for j in range(i + 1, query.num_token_slots()):
                if nodes[j].ptype != qmod.PHRASE_POSTCODE:
                    break
                word += nodes[j + 1].term_normalized + nodes[j + 1].btype

            self._match_word(word, i, True, outcodes)

    return outcodes
def _match_word(self, word: str, pos: int, fullmatch: bool,
                outcodes: Set[Tuple[int, int, str]]) -> None:
    """ Match `word` against the known postcode patterns and add every
        postcode found to `outcodes`.

        `word` is a sequence of normalized terms, each followed by its
        break character. `pos` is the id of the node where the word
        starts. When `fullmatch` is set, the postcode must cover the
        complete word (after an optional country code prefix),
        otherwise it only needs to match at the beginning.

        Each result is added as
        (start node id, end node id, postcode in upper case).
        Nothing is added when the word contains no postcode.
    """
    # Use global pattern to check for presence of any postcode.
    m = self.global_pattern.fullmatch(word)
    if m is None:
        return

    # If there was a match, check against each pattern separately
    # because multiple patterns might be matching at the end.
    cc = m.group('cc')
    pc_word = m.group('pc')
    # A separator between country code and postcode counts as one
    # additional break. The group is None without a country code.
    cc_spaces = len(m.group('space') or '')

    for pattern, info in self.local_patterns:
        lm = pattern.fullmatch(pc_word) if fullmatch else pattern.match(pc_word)
        if lm is None:
            continue

        matched = lm.group(0)
        # Every break character in the matched text moves the end
        # position forward by one.
        trange = (pos, pos + cc_spaces + sum(c in ' ,-:>' for c in matched))
        for out, out_ccs in info:
            # A country code prefix restricts the result to patterns
            # of that country.
            if cc is None or cc in out_ccs:
                if out:
                    outcodes.add((*trange, lm.expand(out).upper()))
                else:
                    # Without output template use the matched text
                    # minus the trailing break character.
                    outcodes.add((*trange, matched[:-1].upper()))