"""
Handling of arbitrary postcode tokens in tokenized query string.
"""
-from typing import Tuple, Set
+from typing import Tuple, Set, Dict, List
import re
from collections import defaultdict
cdata = yaml.safe_load(config.find_config_file('country_settings.yaml')
.read_text(encoding='utf-8'))
- unique_patterns = defaultdict(set)
+ unique_patterns: Dict[str, Dict[str, List[str]]] = {}
for cc, data in cdata.items():
if data.get('postcode'):
- pat = data['postcode']['pattern']
+ pat = data['postcode']['pattern'].replace('d', '[0-9]').replace('l', '[a-z]')
out = data['postcode'].get('output')
- unique_patterns[pat.replace('d', '[0-9]').replace('l', '[a-z]')].add(out)
+ if pat not in unique_patterns:
+ unique_patterns[pat] = defaultdict(list)
+ unique_patterns[pat][out].append(cc)
self.global_pattern = re.compile(
- '(?:' +
- '|'.join(f"(?:{k})" for k in unique_patterns)
- + ')[:, >]')
+ '(?:(?P<cc>[a-z][a-z])(?P<space>[ -]?))?(?P<pc>(?:(?:'
+ + ')|(?:'.join(unique_patterns) + '))[:, >].*)')
- self.local_patterns = [(re.compile(f"(?:{k})[:, >]"), v)
- for k, v in unique_patterns.items()]
+ self.local_patterns = [(re.compile(f"{pat}[:, >]"), list(info.items()))
+ for pat, info in unique_patterns.items()]
def parse(self, query: qmod.QueryStruct) -> Set[Tuple[int, int, str]]:
""" Parse postcodes in the given list of query tokens taking into
if word[-1] in ' -' and nodes[i + 3].btype != '`':
word += nodes[i + 3].term_normalized + nodes[i + 3].btype
- # Use global pattern to check for presence of any postocde.
- m = self.global_pattern.match(word)
+ # Use global pattern to check for presence of any postcode.
+ m = self.global_pattern.fullmatch(word)
if m:
# If there was a match, check against each pattern separately
# because multiple patterns might be matching at the end.
+ cc = m.group('cc')
+ pc_word = m.group('pc')
+ cc_spaces = len(m.group('space') or '')
for pattern, info in self.local_patterns:
- lm = pattern.match(word)
+ lm = pattern.match(pc_word)
if lm:
- trange = (i, i + sum(c in ' ,-:>' for c in lm.group(0)))
- for out in info:
- if out:
- outcodes.add((*trange, lm.expand(out).upper()))
- else:
- outcodes.add((*trange, lm.group(0)[:-1].upper()))
+ trange = (i, i + cc_spaces + sum(c in ' ,-:>' for c in lm.group(0)))
+ for out, out_ccs in info:
+ if cc is None or cc in out_ccs:
+ if out:
+ outcodes.add((*trange, lm.expand(out).upper()))
+ else:
+ outcodes.add((*trange, lm.group(0)[:-1].upper()))
return outcodes