replace PhraseType enum with simple int constants

[nominatim.git] / src / nominatim_api / search / query.py
diff --git a/src/nominatim_api/search/query.py b/src/nominatim_api/search/query.py

index 53482df84a11d228ec5c771b71dfc739f9258b1e..8530c4f2301e8f2fa7ff072b4eebe7d1203a1e27 100644 (file)
--- a/src/nominatim_api/search/query.py
+++ b/src/nominatim_api/search/query.py
@@ -10,84 +10,91 @@ Datastructures for a tokenized query.
  from typing import List, Tuple, Optional, Iterator
  from abc import ABC, abstractmethod
  import dataclasses
  from typing import List, Tuple, Optional, Iterator
  from abc import ABC, abstractmethod
  import dataclasses
-import enum
  
  
-class BreakType(enum.Enum):
-    """ Type of break between tokens.
-    """
-    START = '<'
-    """ Begin of the query. """
-    END = '>'
-    """ End of the query. """
-    PHRASE = ','
-    """ Break between two phrases. """
-    WORD = ' '
-    """ Break between words. """
-    PART = '-'
-    """ Break inside a word, for example a hyphen or apostrophe. """
-    TOKEN = '`'
-    """ Break created as a result of tokenization.
-        This may happen in languages without spaces between words.
-    """
  
  
+BreakType = str
+""" Type of break between tokens.
+"""
+BREAK_START = '<'
+""" Beginning of the query. """
+BREAK_END = '>'
+""" End of the query. """
+BREAK_PHRASE = ','
+""" Hard break between two phrases. Address parts cannot cross hard
+    phrase boundaries."""
+BREAK_SOFT_PHRASE = ':'
+""" Likely break between two phrases. Address parts should not cross soft
+    phrase boundaries. Soft breaks can be inserted by a preprocessor
+    that is analysing the input string.
+"""
+BREAK_WORD = ' '
+""" Break between words. """
+BREAK_PART = '-'
+""" Break inside a word, for example a hyphen or apostrophe. """
+BREAK_TOKEN = '`'
+""" Break created as a result of tokenization.
+    This may happen in languages without spaces between words.
+"""
  
  
-class TokenType(enum.Enum):
-    """ Type of token.
-    """
-    WORD = enum.auto()
-    """ Full name of a place. """
-    PARTIAL = enum.auto()
-    """ Word term without breaks, does not necessarily represent a full name. """
-    HOUSENUMBER = enum.auto()
-    """ Housenumber term. """
-    POSTCODE = enum.auto()
-    """ Postal code term. """
-    COUNTRY = enum.auto()
-    """ Country name or reference. """
-    QUALIFIER = enum.auto()
-    """ Special term used together with name (e.g. _Hotel_ Bellevue). """
-    NEAR_ITEM = enum.auto()
-    """ Special term used as searchable object(e.g. supermarket in ...). """
-
-
-class PhraseType(enum.Enum):
-    """ Designation of a phrase.
+
+TokenType = str
+""" Type of token.
+"""
+TOKEN_WORD = 'W'
+""" Full name of a place. """
+TOKEN_PARTIAL = 'w'
+""" Word term without breaks, does not necessarily represent a full name. """
+TOKEN_HOUSENUMBER = 'H'
+""" Housenumber term. """
+TOKEN_POSTCODE = 'P'
+""" Postal code term. """
+TOKEN_COUNTRY = 'C'
+""" Country name or reference. """
+TOKEN_QUALIFIER = 'Q'
+""" Special term used together with name (e.g. _Hotel_ Bellevue). """
+TOKEN_NEAR_ITEM = 'N'
+""" Special term used as searchable object (e.g. supermarket in ...). """
+
+
+PhraseType = int
+""" Designation of a phrase.
+"""
+PHRASE_ANY = 0
+""" No specific designation (i.e. source is free-form query). """
+PHRASE_AMENITY = 1
+""" Contains name or type of a POI. """
+PHRASE_STREET = 2
+""" Contains a street name optionally with a housenumber. """
+PHRASE_CITY = 3
+""" Contains the postal city. """
+PHRASE_COUNTY = 4
+""" Contains the equivalent of a county. """
+PHRASE_STATE = 5
+""" Contains a state or province. """
+PHRASE_POSTCODE = 6
+""" Contains a postal code. """
+PHRASE_COUNTRY = 7
+""" Contains the country name or code. """
+
+
+def _phrase_compatible_with(ptype: PhraseType, ttype: TokenType,
+                            is_full_phrase: bool) -> bool:
+    """ Check if the given token type can be used with the phrase type.
      """
      """
-    NONE = 0
-    """ No specific designation (i.e. source is free-form query). """
-    AMENITY = enum.auto()
-    """ Contains name or type of a POI. """
-    STREET = enum.auto()
-    """ Contains a street name optionally with a housenumber. """
-    CITY = enum.auto()
-    """ Contains the postal city. """
-    COUNTY = enum.auto()
-    """ Contains the equivalent of a county. """
-    STATE = enum.auto()
-    """ Contains a state or province. """
-    POSTCODE = enum.auto()
-    """ Contains a postal code. """
-    COUNTRY = enum.auto()
-    """ Contains the country name or code. """
-
-    def compatible_with(self, ttype: TokenType,
-                        is_full_phrase: bool) -> bool:
-        """ Check if the given token type can be used with the phrase type.
-        """
-        if self == PhraseType.NONE:
-            return not is_full_phrase or ttype != TokenType.QUALIFIER
-        if self == PhraseType.AMENITY:
-            return ttype in (TokenType.WORD, TokenType.PARTIAL)\
-                   or (is_full_phrase and ttype == TokenType.NEAR_ITEM)\
-                   or (not is_full_phrase and ttype == TokenType.QUALIFIER)
-        if self == PhraseType.STREET:
-            return ttype in (TokenType.WORD, TokenType.PARTIAL, TokenType.HOUSENUMBER)
-        if self == PhraseType.POSTCODE:
-            return ttype == TokenType.POSTCODE
-        if self == PhraseType.COUNTRY:
-            return ttype == TokenType.COUNTRY
-
-        return ttype in (TokenType.WORD, TokenType.PARTIAL)
+    if ptype == PHRASE_ANY:
+        return not is_full_phrase or ttype != TOKEN_QUALIFIER
+    if ptype == PHRASE_AMENITY:
+        return ttype in (TOKEN_WORD, TOKEN_PARTIAL)\
+               or (is_full_phrase and ttype == TOKEN_NEAR_ITEM)\
+               or (not is_full_phrase and ttype == TOKEN_QUALIFIER)
+    if ptype == PHRASE_STREET:
+        return ttype in (TOKEN_WORD, TOKEN_PARTIAL, TOKEN_HOUSENUMBER)
+    if ptype == PHRASE_POSTCODE:
+        return ttype == TOKEN_POSTCODE
+    if ptype == PHRASE_COUNTRY:
+        return ttype == TOKEN_COUNTRY
+
+    return ttype in (TOKEN_WORD, TOKEN_PARTIAL)
  
  
  @dataclasses.dataclass
  
  
  @dataclasses.dataclass
@@ -102,48 +109,43 @@ class Token(ABC):
      addr_count: int
      lookup_word: str
  
      addr_count: int
      lookup_word: str
  
-
      @abstractmethod
      def get_category(self) -> Tuple[str, str]:
          """ Return the category restriction for qualifier terms and
              category objects.
          """
  
      @abstractmethod
      def get_category(self) -> Tuple[str, str]:
          """ Return the category restriction for qualifier terms and
              category objects.
          """
  
+
  @dataclasses.dataclass
  class TokenRange:
      """ Indexes of query nodes over which a token spans.
      """
      start: int
      end: int
  @dataclasses.dataclass
  class TokenRange:
      """ Indexes of query nodes over which a token spans.
      """
      start: int
      end: int
+    penalty: Optional[float] = None
  
      def __lt__(self, other: 'TokenRange') -> bool:
          return self.end <= other.start
  
  
      def __lt__(self, other: 'TokenRange') -> bool:
          return self.end <= other.start
  
-
      def __le__(self, other: 'TokenRange') -> bool:
          return NotImplemented
  
      def __le__(self, other: 'TokenRange') -> bool:
          return NotImplemented
  
-
      def __gt__(self, other: 'TokenRange') -> bool:
          return self.start >= other.end
  
      def __gt__(self, other: 'TokenRange') -> bool:
          return self.start >= other.end
  
-
      def __ge__(self, other: 'TokenRange') -> bool:
          return NotImplemented
  
      def __ge__(self, other: 'TokenRange') -> bool:
          return NotImplemented
  
-
      def replace_start(self, new_start: int) -> 'TokenRange':
          """ Return a new token range with the new start.
          """
          return TokenRange(new_start, self.end)
  
      def replace_start(self, new_start: int) -> 'TokenRange':
          """ Return a new token range with the new start.
          """
          return TokenRange(new_start, self.end)
  
-
      def replace_end(self, new_end: int) -> 'TokenRange':
          """ Return a new token range with the new end.
          """
          return TokenRange(self.start, new_end)
  
      def replace_end(self, new_end: int) -> 'TokenRange':
          """ Return a new token range with the new end.
          """
          return TokenRange(self.start, new_end)
  
-
      def split(self, index: int) -> Tuple['TokenRange', 'TokenRange']:
          """ Split the span into two spans at the given index.
              The index must be within the span.
      def split(self, index: int) -> Tuple['TokenRange', 'TokenRange']:
          """ Split the span into two spans at the given index.
              The index must be within the span.
@@ -159,7 +161,6 @@ class TokenList:
      ttype: TokenType
      tokens: List[Token]
  
      ttype: TokenType
      tokens: List[Token]
  
-
      def add_penalty(self, penalty: float) -> None:
          """ Add the given penalty to all tokens in the list.
          """
      def add_penalty(self, penalty: float) -> None:
          """ Add the given penalty to all tokens in the list.
          """
@@ -181,7 +182,6 @@ class QueryNode:
          """
          return any(tl.end == end and tl.ttype in ttypes for tl in self.starting)
  
          """
          return any(tl.end == end and tl.ttype in ttypes for tl in self.starting)
  
-
      def get_tokens(self, end: int, ttype: TokenType) -> Optional[List[Token]]:
          """ Get the list of tokens of the given type starting at this node
              and ending at the node 'end'. Returns 'None' if no such
      def get_tokens(self, end: int, ttype: TokenType) -> Optional[List[Token]]:
          """ Get the list of tokens of the given type starting at this node
              and ending at the node 'end'. Returns 'None' if no such
@@ -218,15 +218,13 @@ class QueryStruct:
      def __init__(self, source: List[Phrase]) -> None:
          self.source = source
          self.nodes: List[QueryNode] = \
      def __init__(self, source: List[Phrase]) -> None:
          self.source = source
          self.nodes: List[QueryNode] = \
-            [QueryNode(BreakType.START, source[0].ptype if source else PhraseType.NONE)]
-
+            [QueryNode(BREAK_START, source[0].ptype if source else PHRASE_ANY)]
  
      def num_token_slots(self) -> int:
          """ Return the length of the query in vertice steps.
          """
          return len(self.nodes) - 1
  
  
      def num_token_slots(self) -> int:
          """ Return the length of the query in vertice steps.
          """
          return len(self.nodes) - 1
  
-
      def add_node(self, btype: BreakType, ptype: PhraseType) -> None:
          """ Append a new break node with the given break type.
              The phrase type denotes the type for any tokens starting
      def add_node(self, btype: BreakType, ptype: PhraseType) -> None:
          """ Append a new break node with the given break type.
              The phrase type denotes the type for any tokens starting
@@ -234,7 +232,6 @@ class QueryStruct:
          """
          self.nodes.append(QueryNode(btype, ptype))
  
          """
          self.nodes.append(QueryNode(btype, ptype))
  
-
      def add_token(self, trange: TokenRange, ttype: TokenType, token: Token) -> None:
          """ Add a token to the query. 'start' and 'end' are the indexes of the
              nodes from which to which the token spans. The indexes must exist
      def add_token(self, trange: TokenRange, ttype: TokenType, token: Token) -> None:
          """ Add a token to the query. 'start' and 'end' are the indexes of the
              nodes from which to which the token spans. The indexes must exist
@@ -246,16 +243,15 @@ class QueryStruct:
              be added to, then the token is silently dropped.
          """
          snode = self.nodes[trange.start]
              be added to, then the token is silently dropped.
          """
          snode = self.nodes[trange.start]
-        full_phrase = snode.btype in (BreakType.START, BreakType.PHRASE)\
-                      and self.nodes[trange.end].btype in (BreakType.PHRASE, BreakType.END)
-        if snode.ptype.compatible_with(ttype, full_phrase):
+        full_phrase = snode.btype in (BREAK_START, BREAK_PHRASE)\
+            and self.nodes[trange.end].btype in (BREAK_PHRASE, BREAK_END)
+        if _phrase_compatible_with(snode.ptype, ttype, full_phrase):
              tlist = snode.get_tokens(trange.end, ttype)
              if tlist is None:
                  snode.starting.append(TokenList(trange.end, ttype, [token]))
              else:
                  tlist.append(token)
  
              tlist = snode.get_tokens(trange.end, ttype)
              if tlist is None:
                  snode.starting.append(TokenList(trange.end, ttype, [token]))
              else:
                  tlist.append(token)
  
-
      def get_tokens(self, trange: TokenRange, ttype: TokenType) -> List[Token]:
          """ Get the list of tokens of a given type, spanning the given
              nodes. The nodes must exist. If no tokens exist, an
      def get_tokens(self, trange: TokenRange, ttype: TokenType) -> List[Token]:
          """ Get the list of tokens of a given type, spanning the given
              nodes. The nodes must exist. If no tokens exist, an
@@ -263,16 +259,14 @@ class QueryStruct:
          """
          return self.nodes[trange.start].get_tokens(trange.end, ttype) or []
  
          """
          return self.nodes[trange.start].get_tokens(trange.end, ttype) or []
  
-
      def get_partials_list(self, trange: TokenRange) -> List[Token]:
          """ Create a list of partial tokens between the given nodes.
              The list is composed of the first token of type PARTIAL
              going to the subsequent node. Such PARTIAL tokens are
              assumed to exist.
          """
      def get_partials_list(self, trange: TokenRange) -> List[Token]:
          """ Create a list of partial tokens between the given nodes.
              The list is composed of the first token of type PARTIAL
              going to the subsequent node. Such PARTIAL tokens are
              assumed to exist.
          """
-        return [next(iter(self.get_tokens(TokenRange(i, i+1), TokenType.PARTIAL)))
-                          for i in range(trange.start, trange.end)]
-
+        return [next(iter(self.get_tokens(TokenRange(i, i+1), TOKEN_PARTIAL)))
+                for i in range(trange.start, trange.end)]
  
      def iter_token_lists(self) -> Iterator[Tuple[int, QueryNode, TokenList]]:
          """ Iterator over all token lists in the query.
  
      def iter_token_lists(self) -> Iterator[Tuple[int, QueryNode, TokenList]]:
          """ Iterator over all token lists in the query.
@@ -281,7 +275,6 @@ class QueryStruct:
              for tlist in node.starting:
                  yield i, node, tlist
  
              for tlist in node.starting:
                  yield i, node, tlist
  
-
      def find_lookup_word_by_id(self, token: int) -> str:
          """ Find the first token with the given token ID and return
              its lookup word. Returns 'None' if no such token exists.
      def find_lookup_word_by_id(self, token: int) -> str:
          """ Find the first token with the given token ID and return
              its lookup word. Returns 'None' if no such token exists.
@@ -292,5 +285,5 @@ class QueryStruct:
              for tlist in node.starting:
                  for t in tlist.tokens:
                      if t.token == token:
              for tlist in node.starting:
                  for t in tlist.tokens:
                      if t.token == token:
-                        return f"[{tlist.ttype.name[0]}]{t.lookup_word}"
+                        return f"[{tlist.ttype}]{t.lookup_word}"
          return 'None'
          return 'None'