X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/ae8694a6a6862d7cb66cd91102d2802c9899e7cf..31412e06740727695c5d9512e0cd59c0dd683322:/src/nominatim_api/search/query.py

diff --git a/src/nominatim_api/search/query.py b/src/nominatim_api/search/query.py
index 02ebbb5b..68a6b00a 100644
--- a/src/nominatim_api/search/query.py
+++ b/src/nominatim_api/search/query.py
@@ -13,42 +13,48 @@ import dataclasses
 import enum
 
 
-class BreakType(enum.Enum):
-    """ Type of break between tokens.
-    """
-    START = '<'
-    """ Begin of the query. """
-    END = '>'
-    """ End of the query. """
-    PHRASE = ','
-    """ Break between two phrases. """
-    WORD = ' '
-    """ Break between words. """
-    PART = '-'
-    """ Break inside a word, for example a hyphen or apostrophe. """
-    TOKEN = '`'
-    """ Break created as a result of tokenization.
-        This may happen in languages without spaces between words.
-    """
+BreakType = str
+""" Type of break between tokens.
+"""
+BREAK_START = '<'
+""" Begin of the query. """
+BREAK_END = '>'
+""" End of the query. """
+BREAK_PHRASE = ','
+""" Hard break between two phrases. Address parts cannot cross hard
+    phrase boundaries."""
+BREAK_SOFT_PHRASE = ':'
+""" Likely break between two phrases. Address parts should not cross soft
+    phrase boundaries. Soft breaks can be inserted by a preprocessor
+    that is analysing the input string.
+"""
+BREAK_WORD = ' '
+""" Break between words. """
+BREAK_PART = '-'
+""" Break inside a word, for example a hyphen or apostrophe. """
+BREAK_TOKEN = '`'
+""" Break created as a result of tokenization.
+    This may happen in languages without spaces between words.
+"""
 
 
-class TokenType(enum.Enum):
-    """ Type of token.
-    """
-    WORD = enum.auto()
-    """ Full name of a place. """
-    PARTIAL = enum.auto()
-    """ Word term without breaks, does not necessarily represent a full name. """
-    HOUSENUMBER = enum.auto()
-    """ Housenumber term. """
-    POSTCODE = enum.auto()
-    """ Postal code term. """
-    COUNTRY = enum.auto()
-    """ Country name or reference. """
-    QUALIFIER = enum.auto()
-    """ Special term used together with name (e.g. _Hotel_ Bellevue). """
-    NEAR_ITEM = enum.auto()
-    """ Special term used as searchable object(e.g. supermarket in ...). """
+TokenType = str
+""" Type of token.
+"""
+TOKEN_WORD = 'W'
+""" Full name of a place. """
+TOKEN_PARTIAL = 'w'
+""" Word term without breaks, does not necessarily represent a full name. """
+TOKEN_HOUSENUMBER = 'H'
+""" Housenumber term. """
+TOKEN_POSTCODE = 'P'
+""" Postal code term. """
+TOKEN_COUNTRY = 'C'
+""" Country name or reference. """
+TOKEN_QUALIFIER = 'Q'
+""" Special term used together with name (e.g. _Hotel_ Bellevue). """
+TOKEN_NEAR_ITEM = 'N'
+""" Special term used as searchable object(e.g. supermarket in ...). """
 
 
 class PhraseType(enum.Enum):
@@ -76,19 +82,19 @@ class PhraseType(enum.Enum):
         """ Check if the given token type can be used with the phrase type.
         """
         if self == PhraseType.NONE:
-            return not is_full_phrase or ttype != TokenType.QUALIFIER
+            return not is_full_phrase or ttype != TOKEN_QUALIFIER
         if self == PhraseType.AMENITY:
-            return ttype in (TokenType.WORD, TokenType.PARTIAL)\
-                   or (is_full_phrase and ttype == TokenType.NEAR_ITEM)\
-                   or (not is_full_phrase and ttype == TokenType.QUALIFIER)
+            return ttype in (TOKEN_WORD, TOKEN_PARTIAL)\
+                   or (is_full_phrase and ttype == TOKEN_NEAR_ITEM)\
+                   or (not is_full_phrase and ttype == TOKEN_QUALIFIER)
         if self == PhraseType.STREET:
-            return ttype in (TokenType.WORD, TokenType.PARTIAL, TokenType.HOUSENUMBER)
+            return ttype in (TOKEN_WORD, TOKEN_PARTIAL, TOKEN_HOUSENUMBER)
         if self == PhraseType.POSTCODE:
-            return ttype == TokenType.POSTCODE
+            return ttype == TOKEN_POSTCODE
         if self == PhraseType.COUNTRY:
-            return ttype == TokenType.COUNTRY
+            return ttype == TOKEN_COUNTRY
 
-        return ttype in (TokenType.WORD, TokenType.PARTIAL)
+        return ttype in (TOKEN_WORD, TOKEN_PARTIAL)
 
 
 @dataclasses.dataclass
@@ -116,6 +122,7 @@ class TokenRange:
     """
     start: int
     end: int
+    penalty: Optional[float] = None
 
     def __lt__(self, other: 'TokenRange') -> bool:
         return self.end <= other.start
@@ -211,7 +218,7 @@ class QueryStruct:
     def __init__(self, source: List[Phrase]) -> None:
         self.source = source
         self.nodes: List[QueryNode] = \
-            [QueryNode(BreakType.START, source[0].ptype if source else PhraseType.NONE)]
+            [QueryNode(BREAK_START, source[0].ptype if source else PhraseType.NONE)]
 
     def num_token_slots(self) -> int:
         """ Return the length of the query in vertice steps.
@@ -236,8 +243,8 @@ class QueryStruct:
             be added to, then the token is silently dropped.
         """
         snode = self.nodes[trange.start]
-        full_phrase = snode.btype in (BreakType.START, BreakType.PHRASE)\
-            and self.nodes[trange.end].btype in (BreakType.PHRASE, BreakType.END)
+        full_phrase = snode.btype in (BREAK_START, BREAK_PHRASE)\
+            and self.nodes[trange.end].btype in (BREAK_PHRASE, BREAK_END)
         if snode.ptype.compatible_with(ttype, full_phrase):
             tlist = snode.get_tokens(trange.end, ttype)
             if tlist is None:
@@ -258,7 +265,7 @@ class QueryStruct:
             going to the subsequent node. Such PARTIAL tokens are
             assumed to exist.
         """
-        return [next(iter(self.get_tokens(TokenRange(i, i+1), TokenType.PARTIAL)))
+        return [next(iter(self.get_tokens(TokenRange(i, i+1), TOKEN_PARTIAL)))
                 for i in range(trange.start, trange.end)]
 
     def iter_token_lists(self) -> Iterator[Tuple[int, QueryNode, TokenList]]:
@@ -278,5 +285,5 @@ class QueryStruct:
             for tlist in node.starting:
                 for t in tlist.tokens:
                     if t.token == token:
-                        return f"[{tlist.ttype.name[0]}]{t.lookup_word}"
+                        return f"[{tlist.ttype}]{t.lookup_word}"
         return 'None'