From: Sarah Hoffmann
Date: Tue, 25 Feb 2025 19:56:07 +0000 (+0100)
Subject: add postcode parser
X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/fc1c6261ed24b66a61c038f1def268f5aa07fecc

add postcode parser
---

diff --git a/src/nominatim_api/search/icu_tokenizer.py b/src/nominatim_api/search/icu_tokenizer.py
index e6bba95c..01513103 100644
--- a/src/nominatim_api/search/icu_tokenizer.py
+++ b/src/nominatim_api/search/icu_tokenizer.py
@@ -25,6 +25,7 @@ from ..logging import log
 from . import query as qmod
 from ..query_preprocessing.config import QueryConfig
 from .query_analyzer_factory import AbstractQueryAnalyzer
+from .postcode_parser import PostcodeParser
 
 
 DB_TO_TOKEN_TYPE = {
@@ -117,6 +118,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
     """
     def __init__(self, conn: SearchConnection) -> None:
         self.conn = conn
+        self.postcode_parser = PostcodeParser(conn.config)
 
     async def setup(self) -> None:
         """ Set up static data structures needed for the analysis.
@@ -199,6 +201,11 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
                     query.add_token(trange, DB_TO_TOKEN_TYPE[row.type], token)
 
         self.add_extra_tokens(query)
+        for start, end, pc in self.postcode_parser.parse(query):
+            query.add_token(qmod.TokenRange(start, end),
+                            qmod.TOKEN_POSTCODE,
+                            ICUToken(penalty=0.1, token=0, count=1, addr_count=1,
+                                     lookup_word=pc, word_token=pc, info=None))
         self.rerank_tokens(query)
 
         log().table_dump('Word tokens', _dump_word_tokens(query))
@@ -240,9 +247,13 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
     async def lookup_in_db(self, words: List[str]) -> 'sa.Result[Any]':
         """ Return the token information from the database for the
             given word tokens.
+
+            This function excludes postcode tokens.
         """
         t = self.conn.t.meta.tables['word']
-        return await self.conn.execute(t.select().where(t.c.word_token.in_(words)))
+        return await self.conn.execute(t.select()
+                                       .where(t.c.word_token.in_(words))
+                                       .where(t.c.type != 'P'))
 
     def add_extra_tokens(self, query: qmod.QueryStruct) -> None:
         """ Add tokens to query that are not saved in the database.
diff --git a/src/nominatim_api/search/postcode_parser.py b/src/nominatim_api/search/postcode_parser.py
new file mode 100644
index 00000000..93ed87c4
--- /dev/null
+++ b/src/nominatim_api/search/postcode_parser.py
@@ -0,0 +1,81 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2025 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Handling of arbitrary postcode tokens in tokenized query string.
+"""
+from typing import Tuple, Set
+import re
+from collections import defaultdict
+
+import yaml
+
+from ..config import Configuration
+from . import query as qmod
+
+
+class PostcodeParser:
+    """ Pattern-based parser for postcodes in tokenized queries.
+
+        The postcode patterns are read from the country configuration.
+        The parser currently does not return country restrictions.
+ """ + + def __init__(self, config: Configuration) -> None: + # skip over includes here to avoid loading the complete country name data + yaml.add_constructor('!include', lambda loader, node: [], + Loader=yaml.SafeLoader) + cdata = yaml.safe_load(config.find_config_file('country_settings.yaml') + .read_text(encoding='utf-8')) + + unique_patterns = defaultdict(set) + for cc, data in cdata.items(): + if data.get('postcode'): + pat = data['postcode']['pattern'] + out = data['postcode'].get('output') + unique_patterns[pat.replace('d', '[0-9]').replace('l', '[a-z]')].add(out) + + self.global_pattern = re.compile( + '(?:' + + '|'.join(f"(?:{k})" for k in unique_patterns) + + ')[:, >]') + + self.local_patterns = [(re.compile(f"(?:{k})[:, >]"), v) + for k, v in unique_patterns.items()] + + def parse(self, query: qmod.QueryStruct) -> Set[Tuple[int, int, str]]: + """ Parse postcodes in the given list of query tokens taking into + account the list of breaks from the nodes. + + The result is a sequence of tuples with + [start node id, end node id, postcode token] + """ + nodes = query.nodes + outcodes = set() + + for i in range(query.num_token_slots()): + if nodes[i].btype in '<,: ' and nodes[i + 1].btype != '`': + word = nodes[i + 1].term_normalized + nodes[i + 1].btype + if word[-1] in ' -' and nodes[i + 2].btype != '`': + word += nodes[i + 2].term_normalized + nodes[i + 2].btype + if word[-1] in ' -' and nodes[i + 3].btype != '`': + word += nodes[i + 3].term_normalized + nodes[i + 3].btype + + # Use global pattern to check for presence of any postocde. + m = self.global_pattern.match(word) + if m: + # If there was a match, check against each pattern separately + # because multiple patterns might be machting at the end. + for pattern, info in self.local_patterns: + lm = pattern.match(word) + if lm: + trange = (i, i + sum(c in ' ,-:>' for c in lm.group(0))) + for out in info: + if out: + outcodes.add((*trange, lm.expand(out).upper())) + else: + outcodes.add((*trange, lm.group(0)[:-1].upper())) + return outcodes