X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/a690605a96191ed0ac230493bfb8ae3aa6504988..4577669213ea392fa7e25a2fce444f387763f4c8:/src/nominatim_api/search/query.py

diff --git a/src/nominatim_api/search/query.py b/src/nominatim_api/search/query.py
index 53482df8..87638129 100644
--- a/src/nominatim_api/search/query.py
+++ b/src/nominatim_api/search/query.py
@@ -12,23 +12,30 @@ from abc import ABC, abstractmethod
 import dataclasses
 import enum
 
-class BreakType(enum.Enum):
-    """ Type of break between tokens.
-    """
-    START = '<'
-    """ Begin of the query. """
-    END = '>'
-    """ End of the query. """
-    PHRASE = ','
-    """ Break between two phrases. """
-    WORD = ' '
-    """ Break between words. """
-    PART = '-'
-    """ Break inside a word, for example a hyphen or apostrophe. """
-    TOKEN = '`'
-    """ Break created as a result of tokenization.
-        This may happen in languages without spaces between words.
-    """
+
+BreakType = str
+""" Type of break between tokens.
+"""
+BREAK_START = '<'
+""" Begin of the query. """
+BREAK_END = '>'
+""" End of the query. """
+BREAK_PHRASE = ','
+""" Hard break between two phrases. Address parts cannot cross hard
+    phrase boundaries."""
+BREAK_SOFT_PHRASE = ':'
+""" Likely break between two phrases. Address parts should not cross soft
+    phrase boundaries. Soft breaks can be inserted by a preprocessor
+    that is analysing the input string.
+"""
+BREAK_WORD = ' '
+""" Break between words. """
+BREAK_PART = '-'
+""" Break inside a word, for example a hyphen or apostrophe. """
+BREAK_TOKEN = '`'
+""" Break created as a result of tokenization.
+    This may happen in languages without spaces between words.
+"""
 
 
 class TokenType(enum.Enum):
@@ -102,48 +109,43 @@ class Token(ABC):
     addr_count: int
     lookup_word: str
 
-
     @abstractmethod
     def get_category(self) -> Tuple[str, str]:
         """ Return the category restriction for qualifier terms and
             category objects.
         """
 
+
 @dataclasses.dataclass
 class TokenRange:
     """ Indexes of query nodes over which a token spans.
     """
     start: int
    end: int
+    penalty: Optional[float] = None
 
     def __lt__(self, other: 'TokenRange') -> bool:
         return self.end <= other.start
 
-
     def __le__(self, other: 'TokenRange') -> bool:
         return NotImplemented
 
-
     def __gt__(self, other: 'TokenRange') -> bool:
         return self.start >= other.end
 
-
     def __ge__(self, other: 'TokenRange') -> bool:
         return NotImplemented
 
-
     def replace_start(self, new_start: int) -> 'TokenRange':
         """ Return a new token range with the new start.
         """
         return TokenRange(new_start, self.end)
 
-
     def replace_end(self, new_end: int) -> 'TokenRange':
         """ Return a new token range with the new end.
         """
         return TokenRange(self.start, new_end)
 
-
     def split(self, index: int) -> Tuple['TokenRange', 'TokenRange']:
         """ Split the span into two spans at the given index.
             The index must be within the span.
@@ -159,7 +161,6 @@ class TokenList:
     ttype: TokenType
     tokens: List[Token]
 
-
     def add_penalty(self, penalty: float) -> None:
         """ Add the given penalty to all tokens in the list.
         """
@@ -181,7 +182,6 @@ class QueryNode:
         """
         return any(tl.end == end and tl.ttype in ttypes for tl in self.starting)
 
-
     def get_tokens(self, end: int, ttype: TokenType) -> Optional[List[Token]]:
         """ Get the list of tokens of the given type starting at this node
             and ending at the node 'end'. Returns 'None' if no such
@@ -218,15 +218,13 @@ class QueryStruct:
     def __init__(self, source: List[Phrase]) -> None:
         self.source = source
         self.nodes: List[QueryNode] = \
-            [QueryNode(BreakType.START, source[0].ptype if source else PhraseType.NONE)]
-
+            [QueryNode(BREAK_START, source[0].ptype if source else PhraseType.NONE)]
 
     def num_token_slots(self) -> int:
         """ Return the length of the query in vertice steps.
         """
         return len(self.nodes) - 1
 
-
     def add_node(self, btype: BreakType, ptype: PhraseType) -> None:
         """ Append a new break node with the given break type.
             The phrase type denotes the type for any tokens starting
@@ -234,7 +232,6 @@ class QueryStruct:
         """
         self.nodes.append(QueryNode(btype, ptype))
 
-
     def add_token(self, trange: TokenRange, ttype: TokenType, token: Token) -> None:
         """ Add a token to the query. 'start' and 'end' are the indexes of the
             nodes from which to which the token spans. The indexes must exist
@@ -246,8 +243,8 @@ class QueryStruct:
             be added to, then the token is silently dropped.
         """
         snode = self.nodes[trange.start]
-        full_phrase = snode.btype in (BreakType.START, BreakType.PHRASE)\
-                      and self.nodes[trange.end].btype in (BreakType.PHRASE, BreakType.END)
+        full_phrase = snode.btype in (BREAK_START, BREAK_PHRASE)\
+                      and self.nodes[trange.end].btype in (BREAK_PHRASE, BREAK_END)
         if snode.ptype.compatible_with(ttype, full_phrase):
             tlist = snode.get_tokens(trange.end, ttype)
             if tlist is None:
@@ -255,7 +252,6 @@ class QueryStruct:
             else:
                 tlist.append(token)
 
-
     def get_tokens(self, trange: TokenRange, ttype: TokenType) -> List[Token]:
         """ Get the list of tokens of a given type, spanning the given
             nodes. The nodes must exist. If no tokens exist, an
@@ -263,7 +259,6 @@ class QueryStruct:
         """
         return self.nodes[trange.start].get_tokens(trange.end, ttype) or []
 
-
     def get_partials_list(self, trange: TokenRange) -> List[Token]:
         """ Create a list of partial tokens between the given nodes.
             The list is composed of the first token of type PARTIAL
@@ -271,8 +266,7 @@ class QueryStruct:
             assumed to exist.
         """
         return [next(iter(self.get_tokens(TokenRange(i, i+1), TokenType.PARTIAL)))
-                    for i in range(trange.start, trange.end)]
-
+                for i in range(trange.start, trange.end)]
 
     def iter_token_lists(self) -> Iterator[Tuple[int, QueryNode, TokenList]]:
         """ Iterator over all token lists in the query.
         """
@@ -281,7 +275,6 @@ class QueryStruct:
             for tlist in node.starting:
                 yield i, node, tlist
 
-
     def find_lookup_word_by_id(self, token: int) -> str:
         """ Find the first token with the given token ID and return
             its lookup word. Returns 'None' if no such token exists.
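
For illustration, a minimal sketch of how the reworked interface fits together after this change. It is not part of the commit: 'SampleToken' and all values are hypothetical, and the sketch assumes that 'Token' and 'Phrase' are dataclasses as their field lists suggest (with 'Token' carrying 'penalty', 'token', 'count', 'addr_count' and 'lookup_word') and that 'PhraseType.NONE' accepts PARTIAL tokens.

    import dataclasses
    from typing import Tuple

    from nominatim_api.search.query import (
        BREAK_END, BREAK_WORD, Phrase, PhraseType, QueryStruct,
        Token, TokenRange, TokenType)


    @dataclasses.dataclass
    class SampleToken(Token):
        """ Hypothetical concrete token. Real token classes come from the
            tokenizer in use; get_category() must still be provided because
            it remains abstract.
        """
        def get_category(self) -> Tuple[str, str]:
            return ('', '')


    # One phrase of two words: nodes 0..2 with string break types between them.
    query = QueryStruct([Phrase(PhraseType.NONE, 'hauptstr 134')])
    query.add_node(BREAK_WORD, PhraseType.NONE)   # node 1 ends the first word
    query.add_node(BREAK_END, PhraseType.NONE)    # node 2 closes the query

    assert query.num_token_slots() == 2
    assert query.nodes[0].btype == '<'            # BREAK_START, a plain str now

    # Attach a token to the span between nodes 0 and 1. add_token() drops the
    # token silently if the phrase type is incompatible with the token type.
    query.add_token(TokenRange(0, 1), TokenType.PARTIAL,
                    SampleToken(penalty=0.0, token=1, count=1, addr_count=1,
                                lookup_word='hauptstr'))
    assert len(query.get_tokens(TokenRange(0, 1), TokenType.PARTIAL)) == 1

Note that the 'BreakType = str' alias keeps annotations such as 'add_node(btype: BreakType, ...)' meaningful while the break values themselves become plain one-character strings that compare directly, without enum-member lookups.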