Merge pull request #3678 from lonvia/search-tweaks

author Sarah Hoffmann <lonvia@denofr.de>

Wed, 19 Mar 2025 15:00:52 +0000 (16:00 +0100)

committer GitHub <noreply@github.com>

Wed, 19 Mar 2025 15:00:52 +0000 (16:00 +0100)
author Sarah Hoffmann <lonvia@denofr.de>
Wed, 19 Mar 2025 15:00:52 +0000 (16:00 +0100)
committer GitHub <noreply@github.com>
Wed, 19 Mar 2025 15:00:52 +0000 (16:00 +0100)
diff --git a/src/nominatim_api/search/icu_tokenizer.py b/src/nominatim_api/search/icu_tokenizer.py

index b3e14f6a114a6ffb649e761953c6ae81950a7880..1bd0030d54f0319581c22dd5921c2deeaa8f0927 100644 (file)
--- a/src/nominatim_api/search/icu_tokenizer.py
+++ b/src/nominatim_api/search/icu_tokenizer.py
@@ -193,10 +193,12 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
  
          self.add_extra_tokens(query)
          for start, end, pc in self.postcode_parser.parse(query):
+            term = ' '.join(n.term_lookup for n in query.nodes[start + 1:end + 1])
              query.add_token(qmod.TokenRange(start, end),
                              qmod.TOKEN_POSTCODE,
                              ICUToken(penalty=0.1, token=0, count=1, addr_count=1,
-                                     lookup_word=pc, word_token=pc, info=None))
+                                     lookup_word=pc, word_token=term,
+                                     info=None))
          self.rerank_tokens(query)
  
          log().table_dump('Word tokens', _dump_word_tokens(query))
@@ -267,10 +269,10 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
          """
          for i, node, tlist in query.iter_token_lists():
              if tlist.ttype == qmod.TOKEN_POSTCODE:
+                tlen = len(cast(ICUToken, tlist.tokens[0]).word_token)
                  for repl in node.starting:
                      if repl.end == tlist.end and repl.ttype != qmod.TOKEN_POSTCODE \
-                       and (repl.ttype != qmod.TOKEN_HOUSENUMBER
-                            or len(tlist.tokens[0].lookup_word) > 4):
+                       and (repl.ttype != qmod.TOKEN_HOUSENUMBER or tlen > 4):
                          repl.add_penalty(0.39)
              elif (tlist.ttype == qmod.TOKEN_HOUSENUMBER
                    and len(tlist.tokens[0].lookup_word) <= 3):
diff --git a/src/nominatim_api/search/token_assignment.py b/src/nominatim_api/search/token_assignment.py

index 3ca9385cf91fe35c35d3217c29a0286a82d262c7..8d25aa8f10bdc3c751a2788c89c5710b50a4f8fb 100644 (file)
--- a/src/nominatim_api/search/token_assignment.py
+++ b/src/nominatim_api/search/token_assignment.py
@@ -269,10 +269,9 @@ class _TokenSequence:
              # <address>,<postcode> should give preference to address search
              if base.postcode.start == 0:
                  penalty = self.penalty
-                self.direction = -1  # name searches are only possible backwards
              else:
                  penalty = self.penalty + 0.1
-                self.direction = 1  # name searches are only possible forwards
+            penalty += 0.1 * max(0, len(base.address) - 1)
              yield dataclasses.replace(base, penalty=penalty)
  
      def _get_assignments_address_forward(self, base: TokenAssignment,
@@ -282,6 +281,11 @@ class _TokenSequence:
          """
          first = base.address[0]
  
+        # The postcode must come after the name.
+        if base.postcode and base.postcode < first:
+            log().var_dump('skip forward', (base.postcode, first))
+            return
+
          log().comment('first word = name')
          yield dataclasses.replace(base, penalty=self.penalty,
                                    name=first, address=base.address[1:])
@@ -317,7 +321,12 @@ class _TokenSequence:
          """
          last = base.address[-1]
  
-        if self.direction == -1 or len(base.address) > 1:
+        # The postcode must come before the name for backward direction.
+        if base.postcode and base.postcode > last:
+            log().var_dump('skip backward', (base.postcode, last))
+            return
+
+        if self.direction == -1 or len(base.address) > 1 or base.postcode:
              log().comment('last word = name')
              yield dataclasses.replace(base, penalty=self.penalty,
                                        name=last, address=base.address[:-1])
author	Sarah Hoffmann <lonvia@denofr.de>
	Wed, 19 Mar 2025 15:00:52 +0000 (16:00 +0100)
committer	GitHub <noreply@github.com>
	Wed, 19 Mar 2025 15:00:52 +0000 (16:00 +0100)
src/nominatim_api/search/icu_tokenizer.py		patch | blob | history
src/nominatim_api/search/token_assignment.py		patch | blob | history