]> git.openstreetmap.org Git - nominatim.git/commitdiff
add postcode parser
authorSarah Hoffmann <lonvia@denofr.de>
Tue, 25 Feb 2025 19:56:07 +0000 (20:56 +0100)
committerSarah Hoffmann <lonvia@denofr.de>
Tue, 4 Mar 2025 07:57:37 +0000 (08:57 +0100)
src/nominatim_api/search/icu_tokenizer.py
src/nominatim_api/search/postcode_parser.py [new file with mode: 0644]

index e6bba95c6fd071d871ac1813265b734f3d6fdd13..01513103dcc059f33ff84e46d5058660ad0db93a 100644 (file)
@@ -25,6 +25,7 @@ from ..logging import log
 from . import query as qmod
 from ..query_preprocessing.config import QueryConfig
 from .query_analyzer_factory import AbstractQueryAnalyzer
+from .postcode_parser import PostcodeParser
 
 
 DB_TO_TOKEN_TYPE = {
@@ -117,6 +118,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
     """
     def __init__(self, conn: SearchConnection) -> None:
         self.conn = conn
+        self.postcode_parser = PostcodeParser(conn.config)
 
     async def setup(self) -> None:
         """ Set up static data structures needed for the analysis.
@@ -199,6 +201,11 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
                     query.add_token(trange, DB_TO_TOKEN_TYPE[row.type], token)
 
         self.add_extra_tokens(query)
+        for start, end, pc in self.postcode_parser.parse(query):
+            query.add_token(qmod.TokenRange(start, end),
+                            qmod.TOKEN_POSTCODE,
+                            ICUToken(penalty=0.1, token=0, count=1, addr_count=1,
+                                     lookup_word=pc, word_token=pc, info=None))
         self.rerank_tokens(query)
 
         log().table_dump('Word tokens', _dump_word_tokens(query))
@@ -240,9 +247,13 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
     async def lookup_in_db(self, words: List[str]) -> 'sa.Result[Any]':
         """ Return the token information from the database for the
             given word tokens.
+
+            This function excludes postcode tokens
         """
         t = self.conn.t.meta.tables['word']
-        return await self.conn.execute(t.select().where(t.c.word_token.in_(words)))
+        return await self.conn.execute(t.select()
+                                        .where(t.c.word_token.in_(words))
+                                        .where(t.c.type != 'P'))
 
     def add_extra_tokens(self, query: qmod.QueryStruct) -> None:
         """ Add tokens to query that are not saved in the database.
diff --git a/src/nominatim_api/search/postcode_parser.py b/src/nominatim_api/search/postcode_parser.py
new file mode 100644 (file)
index 0000000..93ed87c
--- /dev/null
@@ -0,0 +1,81 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2025 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Handling of arbitrary postcode tokens in tokenized query string.
+"""
+from typing import Tuple, Set
+import re
+from collections import defaultdict
+
+import yaml
+
+from ..config import Configuration
+from . import query as qmod
+
+
+class PostcodeParser:
+    """ Pattern-based parser for postcodes in tokenized queries.
+
+        The postcode patterns are read from the country configuration.
+        The parser does currently not return country restrictions.
+    """
+
+    def __init__(self, config: Configuration) -> None:
+        # skip over includes here to avoid loading the complete country name data
+        yaml.add_constructor('!include', lambda loader, node: [],
+                             Loader=yaml.SafeLoader)
+        cdata = yaml.safe_load(config.find_config_file('country_settings.yaml')
+                                     .read_text(encoding='utf-8'))
+
+        unique_patterns = defaultdict(set)
+        for cc, data in cdata.items():
+            if data.get('postcode'):
+                pat = data['postcode']['pattern']
+                out = data['postcode'].get('output')
+                unique_patterns[pat.replace('d', '[0-9]').replace('l', '[a-z]')].add(out)
+
+        self.global_pattern = re.compile(
+                '(?:' +
+                '|'.join(f"(?:{k})" for k in unique_patterns)
+                + ')[:, >]')
+
+        self.local_patterns = [(re.compile(f"(?:{k})[:, >]"), v)
+                               for k, v in unique_patterns.items()]
+
+    def parse(self, query: qmod.QueryStruct) -> Set[Tuple[int, int, str]]:
+        """ Parse postcodes in the given list of query tokens taking into
+            account the list of breaks from the nodes.
+
+            The result is a set of tuples, each of the form
+            (start node id, end node id, postcode token).
+        """
+        nodes = query.nodes
+        outcodes = set()
+
+        for i in range(query.num_token_slots()):
+            if nodes[i].btype in '<,: ' and nodes[i + 1].btype != '`':
+                word = nodes[i + 1].term_normalized + nodes[i + 1].btype
+                if word[-1] in ' -' and nodes[i + 2].btype != '`':
+                    word += nodes[i + 2].term_normalized + nodes[i + 2].btype
+                    if word[-1] in ' -' and nodes[i + 3].btype != '`':
+                        word += nodes[i + 3].term_normalized + nodes[i + 3].btype
+
+                # Use global pattern to check for presence of any postcode.
+                m = self.global_pattern.match(word)
+                if m:
+                    # If there was a match, check against each pattern separately
+                    # because multiple patterns might be matching at the end.
+                    for pattern, info in self.local_patterns:
+                        lm = pattern.match(word)
+                        if lm:
+                            trange = (i, i + sum(c in ' ,-:>' for c in lm.group(0)))
+                            for out in info:
+                                if out:
+                                    outcodes.add((*trange, lm.expand(out).upper()))
+                                else:
+                                    outcodes.add((*trange, lm.group(0)[:-1].upper()))
+        return outcodes