From: Sarah Hoffmann Date: Wed, 5 Mar 2025 09:08:07 +0000 (+0100) Subject: restrict postcode parsing in typed phrases X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/6b0d58d9fdbd2e488f3dc408c1b7d8837710f6e6?hp=-c restrict postcode parsing in typed phrases Postcodes can only appear in postcode-type phrases and must then cover the full phrase --- 6b0d58d9fdbd2e488f3dc408c1b7d8837710f6e6 diff --git a/src/nominatim_api/search/postcode_parser.py b/src/nominatim_api/search/postcode_parser.py index 1148d3c3..bb3ef1a4 100644 --- a/src/nominatim_api/search/postcode_parser.py +++ b/src/nominatim_api/search/postcode_parser.py @@ -55,32 +55,49 @@ class PostcodeParser: [start node id, end node id, postcode token] """ nodes = query.nodes - outcodes = set() + outcodes: Set[Tuple[int, int, str]] = set() for i in range(query.num_token_slots()): - if nodes[i].btype in '<,: ' and nodes[i + 1].btype != '`': - word = nodes[i + 1].term_normalized + nodes[i + 1].btype - if word[-1] in ' -' and nodes[i + 2].btype != '`': - word += nodes[i + 2].term_normalized + nodes[i + 2].btype - if word[-1] in ' -' and nodes[i + 3].btype != '`': - word += nodes[i + 3].term_normalized + nodes[i + 3].btype - - # Use global pattern to check for presence of any postcode. - m = self.global_pattern.fullmatch(word) - if m: - # If there was a match, check against each pattern separately - # because multiple patterns might be machting at the end. 
- cc = m.group('cc') - pc_word = m.group('pc') - cc_spaces = len(m.group('space') or '') - for pattern, info in self.local_patterns: - lm = pattern.match(pc_word) - if lm: - trange = (i, i + cc_spaces + sum(c in ' ,-:>' for c in lm.group(0))) - for out, out_ccs in info: - if cc is None or cc in out_ccs: - if out: - outcodes.add((*trange, lm.expand(out).upper())) - else: - outcodes.add((*trange, lm.group(0)[:-1].upper())) + if nodes[i].btype in '<,: ' and nodes[i + 1].btype != '`' \ + and (i == 0 or nodes[i - 1].ptype != qmod.PHRASE_POSTCODE): + if nodes[i].ptype == qmod.PHRASE_ANY: + word = nodes[i + 1].term_normalized + nodes[i + 1].btype + if word[-1] in ' -' and nodes[i + 2].btype != '`' \ + and nodes[i + 1].ptype == qmod.PHRASE_ANY: + word += nodes[i + 2].term_normalized + nodes[i + 2].btype + if word[-1] in ' -' and nodes[i + 3].btype != '`' \ + and nodes[i + 2].ptype == qmod.PHRASE_ANY: + word += nodes[i + 3].term_normalized + nodes[i + 3].btype + + self._match_word(word, i, False, outcodes) + elif nodes[i].ptype == qmod.PHRASE_POSTCODE: + word = nodes[i + 1].term_normalized + nodes[i + 1].btype + for j in range(i + 1, query.num_token_slots()): + if nodes[j].ptype != qmod.PHRASE_POSTCODE: + break + word += nodes[j + 1].term_normalized + nodes[j + 1].btype + + self._match_word(word, i, True, outcodes) + return outcodes + + def _match_word(self, word: str, pos: int, fullmatch: bool, + outcodes: Set[Tuple[int, int, str]]) -> None: + # Use global pattern to check for presence of any postcode. + m = self.global_pattern.fullmatch(word) + if m: + # If there was a match, check against each pattern separately + # because multiple patterns might be matching at the end. 
+ cc = m.group('cc') + pc_word = m.group('pc') + cc_spaces = len(m.group('space') or '') + for pattern, info in self.local_patterns: + lm = pattern.fullmatch(pc_word) if fullmatch else pattern.match(pc_word) + if lm: + trange = (pos, pos + cc_spaces + sum(c in ' ,-:>' for c in lm.group(0))) + for out, out_ccs in info: + if cc is None or cc in out_ccs: + if out: + outcodes.add((*trange, lm.expand(out).upper())) + else: + outcodes.add((*trange, lm.group(0)[:-1].upper())) diff --git a/test/python/api/search/test_postcode_parser.py b/test/python/api/search/test_postcode_parser.py index f691a58c..284aba5b 100644 --- a/test/python/api/search/test_postcode_parser.py +++ b/test/python/api/search/test_postcode_parser.py @@ -14,7 +14,7 @@ from itertools import zip_longest import pytest from nominatim_api.search.postcode_parser import PostcodeParser -from nominatim_api.search.query import QueryStruct, PHRASE_ANY +from nominatim_api.search.query import QueryStruct, PHRASE_ANY, PHRASE_POSTCODE, PHRASE_STREET @pytest.fixture def pc_config(project_env): @@ -131,3 +131,24 @@ def test_postcode_with_non_matching_country_prefix(pc_config): assert not parser.parse(mk_query('ky12233')) +def test_postcode_inside_postcode_phrase(pc_config): + parser = PostcodeParser(pc_config) + + query = QueryStruct([]) + query.nodes[-1].ptype = PHRASE_STREET + query.add_node(',', PHRASE_STREET, 0.1, '12345', '12345') + query.add_node(',', PHRASE_POSTCODE, 0.1, 'xz', 'xz') + query.add_node('>', PHRASE_POSTCODE, 0.1, '4444', '4444') + + assert parser.parse(query) == {(2, 3, '4444')} + + +def test_partial_postcode_in_postcode_phrase(pc_config): + parser = PostcodeParser(pc_config) + + query = QueryStruct([]) + query.nodes[-1].ptype = PHRASE_POSTCODE + query.add_node(' ', PHRASE_POSTCODE, 0.1, '2224', '2224') + query.add_node('>', PHRASE_POSTCODE, 0.1, '12345', '12345') + + assert not parser.parse(query)