src/nominatim_api/search/postcode_parser.py

   1 # SPDX-License-Identifier: GPL-3.0-or-later
   2 #
   3 # This file is part of Nominatim. (https://nominatim.org)
   4 #
   5 # Copyright (C) 2025 by the Nominatim developer community.
   6 # For a full list of authors see the git log.
   7 """
   8 Handling of arbitrary postcode tokens in tokenized query string.
   9 """
  10 from typing import Tuple, Set, Dict, List
  11 import re
  12 from collections import defaultdict
  13
  14 import yaml
  15
  16 from ..config import Configuration
  17 from . import query as qmod
  18
  19
  20 class PostcodeParser:
  21     """ Pattern-based parser for postcodes in tokenized queries.
  22
  23         The postcode patterns are read from the country configuration.
  24         The parser does currently not return country restrictions.
  25     """
  26
  27     def __init__(self, config: Configuration) -> None:
  28         # skip over includes here to avoid loading the complete country name data
  29         yaml.add_constructor('!include', lambda loader, node: [],
  30                              Loader=yaml.SafeLoader)
  31         cdata = yaml.safe_load(config.find_config_file('country_settings.yaml')
  32                                      .read_text(encoding='utf-8'))
  33
  34         unique_patterns: Dict[str, Dict[str, List[str]]] = {}
  35         for cc, data in cdata.items():
  36             if data.get('postcode'):
  37                 pat = data['postcode']['pattern'].replace('d', '[0-9]').replace('l', '[a-z]')
  38                 out = data['postcode'].get('output')
  39                 if pat not in unique_patterns:
  40                     unique_patterns[pat] = defaultdict(list)
  41                 unique_patterns[pat][out].append(cc)
  42
  43         self.global_pattern = re.compile(
  44                 '(?:(?P<cc>[a-z][a-z])(?P<space>[ -]?))?(?P<pc>(?:(?:'
  45                 + ')|(?:'.join(unique_patterns) + '))[:, >].*)')
  46
  47         self.local_patterns = [(re.compile(f"{pat}[:, >]"), list(info.items()))
  48                                for pat, info in unique_patterns.items()]
  49
  50     def parse(self, query: qmod.QueryStruct) -> Set[Tuple[int, int, str]]:
  51         """ Parse postcodes in the given list of query tokens taking into
  52             account the list of breaks from the nodes.
  53
  54             The result is a sequence of tuples with
  55             [start node id, end node id, postcode token]
  56         """
  57         nodes = query.nodes
  58         outcodes: Set[Tuple[int, int, str]] = set()
  59
  60         for i in range(query.num_token_slots()):
  61             if nodes[i].btype in '<,: ' and nodes[i + 1].btype != '`' \
  62                     and (i == 0 or nodes[i - 1].ptype != qmod.PHRASE_POSTCODE):
  63                 if nodes[i].ptype == qmod.PHRASE_ANY:
  64                     word = nodes[i + 1].term_normalized + nodes[i + 1].btype
  65                     if word[-1] in ' -' and nodes[i + 2].btype != '`' \
  66                             and nodes[i + 1].ptype == qmod.PHRASE_ANY:
  67                         word += nodes[i + 2].term_normalized + nodes[i + 2].btype
  68                         if word[-1] in ' -' and nodes[i + 3].btype != '`' \
  69                                 and nodes[i + 2].ptype == qmod.PHRASE_ANY:
  70                             word += nodes[i + 3].term_normalized + nodes[i + 3].btype
  71
  72                     self._match_word(word, i, False, outcodes)
  73                 elif nodes[i].ptype == qmod.PHRASE_POSTCODE:
  74                     word = nodes[i + 1].term_normalized + nodes[i + 1].btype
  75                     for j in range(i + 1, query.num_token_slots()):
  76                         if nodes[j].ptype != qmod.PHRASE_POSTCODE:
  77                             break
  78                         word += nodes[j + 1].term_normalized + nodes[j + 1].btype
  79
  80                     self._match_word(word, i, True, outcodes)
  81
  82         return outcodes
  83
  84     def _match_word(self, word: str, pos: int, fullmatch: bool,
  85                     outcodes: Set[Tuple[int, int, str]]) -> None:
  86         # Use global pattern to check for presence of any postcode.
  87         m = self.global_pattern.fullmatch(word)
  88         if m:
  89             # If there was a match, check against each pattern separately
  90             # because multiple patterns might be machting at the end.
  91             cc = m.group('cc')
  92             pc_word = m.group('pc')
  93             cc_spaces = len(m.group('space') or '')
  94             for pattern, info in self.local_patterns:
  95                 lm = pattern.fullmatch(pc_word) if fullmatch else pattern.match(pc_word)
  96                 if lm:
  97                     trange = (pos, pos + cc_spaces + sum(c in ' ,-:>' for c in lm.group(0)))
  98                     for out, out_ccs in info:
  99                         if cc is None or cc in out_ccs:
 100                             if out:
 101                                 outcodes.add((*trange, lm.expand(out).upper()))
 102                             else:
 103                                 outcodes.add((*trange, lm.group(0)[:-1].upper()))