add SOFT_PHRASE break and enable parsing

author Sarah Hoffmann <lonvia@denofr.de>

Mon, 6 Jan 2025 16:10:24 +0000 (17:10 +0100)

committer Sarah Hoffmann <lonvia@denofr.de>

Mon, 6 Jan 2025 16:10:24 +0000 (17:10 +0100)
author Sarah Hoffmann <lonvia@denofr.de>
Mon, 6 Jan 2025 16:10:24 +0000 (17:10 +0100)
committer Sarah Hoffmann <lonvia@denofr.de>
Mon, 6 Jan 2025 16:10:24 +0000 (17:10 +0100)
diff --git a/src/nominatim_api/search/db_search_builder.py b/src/nominatim_api/search/db_search_builder.py

index 632270ef04176f394a10e29d9397141bdeb5a457..a6335c1377c0eada94fb0a3554d31aa48d2e1742 100644 (file)
--- a/src/nominatim_api/search/db_search_builder.py
+++ b/src/nominatim_api/search/db_search_builder.py
@@ -433,6 +433,7 @@ PENALTY_WORDCHANGE = {
      BreakType.START: 0.0,
      BreakType.END: 0.0,
      BreakType.PHRASE: 0.0,
+    BreakType.SOFT_PHRASE: 0.0,
      BreakType.WORD: 0.1,
      BreakType.PART: 0.2,
      BreakType.TOKEN: 0.4
diff --git a/src/nominatim_api/search/icu_tokenizer.py b/src/nominatim_api/search/icu_tokenizer.py

index 5976fbec05d8c515dfff092606942b62e602aaac..d52614fdaf4863b0aaa2e2721659988d9c03470d 100644 (file)
--- a/src/nominatim_api/search/icu_tokenizer.py
+++ b/src/nominatim_api/search/icu_tokenizer.py
@@ -11,6 +11,8 @@ from typing import Tuple, Dict, List, Optional, NamedTuple, Iterator, Any, cast
  from collections import defaultdict
  import dataclasses
  import difflib
+import re
+from itertools import zip_longest
  
  from icu import Transliterator
  
@@ -242,16 +244,22 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
          wordnr = 0
          for phrase in query.source:
              query.nodes[-1].ptype = phrase.ptype
-            for word in phrase.text.split(' '):
+            phrase_split = re.split('([ :-])', phrase.text)
+            # The zip construct will give us the pairs of word/break from
+            # the regular expression split. As the split array ends on the
+            # final word, we simply use the fillvalue to even out the list and
+            # add the phrase break at the end.
+            for word, breakchar in zip_longest(*[iter(phrase_split)]*2, fillvalue=','):
+                if not word:
+                    continue
                  trans = self.transliterator.transliterate(word)
                  if trans:
                      for term in trans.split(' '):
                          if term:
                              parts.append(QueryPart(term, word, wordnr))
                              query.add_node(qmod.BreakType.TOKEN, phrase.ptype)
-                    query.nodes[-1].btype = qmod.BreakType.WORD
+                    query.nodes[-1].btype = qmod.BreakType(breakchar)
                  wordnr += 1
-            query.nodes[-1].btype = qmod.BreakType.PHRASE
  
              for word, wrange in yield_words(parts, phrase_start):
                  words[word].append(wrange)
diff --git a/src/nominatim_api/search/query.py b/src/nominatim_api/search/query.py

index 02ebbb5b9d7b6f690af9bdcba466151d57117d1a..b2e18337c67e22e7198812689ee81a44af8ef499 100644 (file)
--- a/src/nominatim_api/search/query.py
+++ b/src/nominatim_api/search/query.py
@@ -21,7 +21,13 @@ class BreakType(enum.Enum):
      END = '>'
      """ End of the query. """
      PHRASE = ','
-    """ Break between two phrases. """
+    """ Hard break between two phrases. Address parts cannot cross hard
+        phrase boundaries."""
+    SOFT_PHRASE = ':'
+    """ Likely break between two phrases. Address parts should not cross soft
+        phrase boundaries. Soft breaks can be inserted by a preprocessor
+        that is analysing the input string.
+    """
      WORD = ' '
      """ Break between words. """
      PART = '-'
diff --git a/src/nominatim_api/search/token_assignment.py b/src/nominatim_api/search/token_assignment.py

index a2e1804c732d737096d4e6416a858515c640b058..0983fd13b0b46fd2ae812cd3414ed491eb205942 100644 (file)
--- a/src/nominatim_api/search/token_assignment.py
+++ b/src/nominatim_api/search/token_assignment.py
@@ -27,6 +27,7 @@ PENALTY_TOKENCHANGE = {
      qmod.BreakType.START: 0.0,
      qmod.BreakType.END: 0.0,
      qmod.BreakType.PHRASE: 0.0,
+    qmod.BreakType.SOFT_PHRASE: 0.0,
      qmod.BreakType.WORD: 0.1,
      qmod.BreakType.PART: 0.2,
      qmod.BreakType.TOKEN: 0.4
author	Sarah Hoffmann <lonvia@denofr.de>
	Mon, 6 Jan 2025 16:10:24 +0000 (17:10 +0100)
committer	Sarah Hoffmann <lonvia@denofr.de>
	Mon, 6 Jan 2025 16:10:24 +0000 (17:10 +0100)
src/nominatim_api/search/db_search_builder.py		patch | blob | history
src/nominatim_api/search/icu_tokenizer.py		patch | blob | history
src/nominatim_api/search/query.py		patch | blob | history
src/nominatim_api/search/token_assignment.py		patch | blob | history