git.openstreetmap.org Git - nominatim.git/commitdiff
add support for country prefixes in postcodes
author: Sarah Hoffmann <lonvia@denofr.de>
Tue, 4 Mar 2025 14:18:27 +0000 (15:18 +0100)
committer: Sarah Hoffmann <lonvia@denofr.de>
Tue, 4 Mar 2025 14:18:27 +0000 (15:18 +0100)
src/nominatim_api/search/postcode_parser.py

index 93ed87c429bfbf45c460bf855bd265e97c68bcae..1148d3c33b727aa35c91c8e7aa865e049d32686b 100644 (file)
@@ -7,7 +7,7 @@
 """
 Handling of arbitrary postcode tokens in tokenized query string.
 """
-from typing import Tuple, Set
+from typing import Tuple, Set, Dict, List
 import re
 from collections import defaultdict
 
@@ -31,20 +31,21 @@ class PostcodeParser:
         cdata = yaml.safe_load(config.find_config_file('country_settings.yaml')
                                      .read_text(encoding='utf-8'))
 
-        unique_patterns = defaultdict(set)
+        unique_patterns: Dict[str, Dict[str, List[str]]] = {}
         for cc, data in cdata.items():
             if data.get('postcode'):
-                pat = data['postcode']['pattern']
+                pat = data['postcode']['pattern'].replace('d', '[0-9]').replace('l', '[a-z]')
                 out = data['postcode'].get('output')
-                unique_patterns[pat.replace('d', '[0-9]').replace('l', '[a-z]')].add(out)
+                if pat not in unique_patterns:
+                    unique_patterns[pat] = defaultdict(list)
+                unique_patterns[pat][out].append(cc)
 
         self.global_pattern = re.compile(
-                '(?:' +
-                '|'.join(f"(?:{k})" for k in unique_patterns)
-                + ')[:, >]')
+                '(?:(?P<cc>[a-z][a-z])(?P<space>[ -]?))?(?P<pc>(?:(?:'
+                + ')|(?:'.join(unique_patterns) + '))[:, >].*)')
 
-        self.local_patterns = [(re.compile(f"(?:{k})[:, >]"), v)
-                               for k, v in unique_patterns.items()]
+        self.local_patterns = [(re.compile(f"{pat}[:, >]"), list(info.items()))
+                               for pat, info in unique_patterns.items()]
 
     def parse(self, query: qmod.QueryStruct) -> Set[Tuple[int, int, str]]:
         """ Parse postcodes in the given list of query tokens taking into
@@ -64,18 +65,22 @@ class PostcodeParser:
                     if word[-1] in ' -' and nodes[i + 3].btype != '`':
                         word += nodes[i + 3].term_normalized + nodes[i + 3].btype
 
-                # Use global pattern to check for presence of any postocde.
-                m = self.global_pattern.match(word)
+                # Use global pattern to check for presence of any postcode.
+                m = self.global_pattern.fullmatch(word)
                 if m:
                     # If there was a match, check against each pattern separately
                     # because multiple patterns might be matching at the end.
+                    cc = m.group('cc')
+                    pc_word = m.group('pc')
+                    cc_spaces = len(m.group('space') or '')
                     for pattern, info in self.local_patterns:
-                        lm = pattern.match(word)
+                        lm = pattern.match(pc_word)
                         if lm:
-                            trange = (i, i + sum(c in ' ,-:>' for c in lm.group(0)))
-                            for out in info:
-                                if out:
-                                    outcodes.add((*trange, lm.expand(out).upper()))
-                                else:
-                                    outcodes.add((*trange, lm.group(0)[:-1].upper()))
+                            trange = (i, i + cc_spaces + sum(c in ' ,-:>' for c in lm.group(0)))
+                            for out, out_ccs in info:
+                                if cc is None or cc in out_ccs:
+                                    if out:
+                                        outcodes.add((*trange, lm.expand(out).upper()))
+                                    else:
+                                        outcodes.add((*trange, lm.group(0)[:-1].upper()))
         return outcodes