restrict postcode parsing in typed phrases

author Sarah Hoffmann <lonvia@denofr.de>

Wed, 5 Mar 2025 09:08:07 +0000 (10:08 +0100)

committer Sarah Hoffmann <lonvia@denofr.de>

Wed, 5 Mar 2025 09:09:33 +0000 (10:09 +0100)
author Sarah Hoffmann <lonvia@denofr.de>
Wed, 5 Mar 2025 09:08:07 +0000 (10:08 +0100)
committer Sarah Hoffmann <lonvia@denofr.de>
Wed, 5 Mar 2025 09:09:33 +0000 (10:09 +0100)
diff --git a/src/nominatim_api/search/postcode_parser.py b/src/nominatim_api/search/postcode_parser.py

index 1148d3c33b727aa35c91c8e7aa865e049d32686b..bb3ef1a4c86781ae9afb2bb478fae70669400053 100644 (file)
--- a/src/nominatim_api/search/postcode_parser.py
+++ b/src/nominatim_api/search/postcode_parser.py
@@ -55,32 +55,49 @@ class PostcodeParser:
              [start node id, end node id, postcode token]
          """
          nodes = query.nodes
-        outcodes = set()
+        outcodes: Set[Tuple[int, int, str]] = set()
  
          for i in range(query.num_token_slots()):
-            if nodes[i].btype in '<,: ' and nodes[i + 1].btype != '`':
-                word = nodes[i + 1].term_normalized + nodes[i + 1].btype
-                if word[-1] in ' -' and nodes[i + 2].btype != '`':
-                    word += nodes[i + 2].term_normalized + nodes[i + 2].btype
-                    if word[-1] in ' -' and nodes[i + 3].btype != '`':
-                        word += nodes[i + 3].term_normalized + nodes[i + 3].btype
-
-                # Use global pattern to check for presence of any postcode.
-                m = self.global_pattern.fullmatch(word)
-                if m:
-                    # If there was a match, check against each pattern separately
-                    # because multiple patterns might be machting at the end.
-                    cc = m.group('cc')
-                    pc_word = m.group('pc')
-                    cc_spaces = len(m.group('space') or '')
-                    for pattern, info in self.local_patterns:
-                        lm = pattern.match(pc_word)
-                        if lm:
-                            trange = (i, i + cc_spaces + sum(c in ' ,-:>' for c in lm.group(0)))
-                            for out, out_ccs in info:
-                                if cc is None or cc in out_ccs:
-                                    if out:
-                                        outcodes.add((*trange, lm.expand(out).upper()))
-                                    else:
-                                        outcodes.add((*trange, lm.group(0)[:-1].upper()))
+            if nodes[i].btype in '<,: ' and nodes[i + 1].btype != '`' \
+                    and (i == 0 or nodes[i - 1].ptype != qmod.PHRASE_POSTCODE):
+                if nodes[i].ptype == qmod.PHRASE_ANY:
+                    word = nodes[i + 1].term_normalized + nodes[i + 1].btype
+                    if word[-1] in ' -' and nodes[i + 2].btype != '`' \
+                            and nodes[i + 1].ptype == qmod.PHRASE_ANY:
+                        word += nodes[i + 2].term_normalized + nodes[i + 2].btype
+                        if word[-1] in ' -' and nodes[i + 3].btype != '`' \
+                                and nodes[i + 2].ptype == qmod.PHRASE_ANY:
+                            word += nodes[i + 3].term_normalized + nodes[i + 3].btype
+
+                    self._match_word(word, i, False, outcodes)
+                elif nodes[i].ptype == qmod.PHRASE_POSTCODE:
+                    word = nodes[i + 1].term_normalized + nodes[i + 1].btype
+                    for j in range(i + 1, query.num_token_slots()):
+                        if nodes[j].ptype != qmod.PHRASE_POSTCODE:
+                            break
+                        word += nodes[j + 1].term_normalized + nodes[j + 1].btype
+
+                    self._match_word(word, i, True, outcodes)
+
          return outcodes
+
+    def _match_word(self, word: str, pos: int, fullmatch: bool,
+                    outcodes: Set[Tuple[int, int, str]]) -> None:
+        # Use global pattern to check for presence of any postcode.
+        m = self.global_pattern.fullmatch(word)
+        if m:
+            # If there was a match, check against each pattern separately
+            # because multiple patterns might be matching at the end.
+            cc = m.group('cc')
+            pc_word = m.group('pc')
+            cc_spaces = len(m.group('space') or '')
+            for pattern, info in self.local_patterns:
+                lm = pattern.fullmatch(pc_word) if fullmatch else pattern.match(pc_word)
+                if lm:
+                    trange = (pos, pos + cc_spaces + sum(c in ' ,-:>' for c in lm.group(0)))
+                    for out, out_ccs in info:
+                        if cc is None or cc in out_ccs:
+                            if out:
+                                outcodes.add((*trange, lm.expand(out).upper()))
+                            else:
+                                outcodes.add((*trange, lm.group(0)[:-1].upper()))
diff --git a/test/python/api/search/test_postcode_parser.py b/test/python/api/search/test_postcode_parser.py

index f691a58c35bcd017c2c55a1d1a4664bd70e6b204..284aba5b932b3c96bb60187f003da3402fe18561 100644 (file)
--- a/test/python/api/search/test_postcode_parser.py
+++ b/test/python/api/search/test_postcode_parser.py
@@ -14,7 +14,7 @@ from itertools import zip_longest
  import pytest
  
  from nominatim_api.search.postcode_parser import PostcodeParser
-from nominatim_api.search.query import QueryStruct, PHRASE_ANY
+from nominatim_api.search.query import QueryStruct, PHRASE_ANY, PHRASE_POSTCODE, PHRASE_STREET
  
  @pytest.fixture
  def pc_config(project_env):
@@ -131,3 +131,24 @@ def test_postcode_with_non_matching_country_prefix(pc_config):
  
      assert not parser.parse(mk_query('ky12233'))
  
+def test_postcode_inside_postcode_phrase(pc_config):
+    parser = PostcodeParser(pc_config)
+
+    query = QueryStruct([])
+    query.nodes[-1].ptype = PHRASE_STREET
+    query.add_node(',', PHRASE_STREET, 0.1, '12345', '12345')
+    query.add_node(',', PHRASE_POSTCODE, 0.1, 'xz', 'xz')
+    query.add_node('>', PHRASE_POSTCODE, 0.1, '4444', '4444')
+
+    assert parser.parse(query) == {(2, 3, '4444')}
+
+
+def test_partial_postcode_in_postcode_phrase(pc_config):
+    parser = PostcodeParser(pc_config)
+
+    query = QueryStruct([])
+    query.nodes[-1].ptype = PHRASE_POSTCODE
+    query.add_node(' ', PHRASE_POSTCODE, 0.1, '2224', '2224')
+    query.add_node('>', PHRASE_POSTCODE, 0.1, '12345', '12345')
+
+    assert not parser.parse(query)
author	Sarah Hoffmann <lonvia@denofr.de>
	Wed, 5 Mar 2025 09:08:07 +0000 (10:08 +0100)
committer	Sarah Hoffmann <lonvia@denofr.de>
	Wed, 5 Mar 2025 09:09:33 +0000 (10:09 +0100)
src/nominatim_api/search/postcode_parser.py		patch \| blob \| history
test/python/api/search/test_postcode_parser.py		patch \| blob \| history