src/nominatim_api/search/query.py

   1 # SPDX-License-Identifier: GPL-3.0-or-later
   2 #
   3 # This file is part of Nominatim. (https://nominatim.org)
   4 #
   5 # Copyright (C) 2024 by the Nominatim developer community.
   6 # For a full list of authors see the git log.
   7 """
   8 Datastructures for a tokenized query.
   9 """
  10 from typing import List, Tuple, Optional, Iterator
  11 from abc import ABC, abstractmethod
  12 import dataclasses
  13 import enum
  14
  15 class BreakType(enum.Enum):
  16     """ Type of break between tokens.
  17     """
  18     START = '<'
  19     """ Begin of the query. """
  20     END = '>'
  21     """ End of the query. """
  22     PHRASE = ','
  23     """ Break between two phrases. """
  24     WORD = ' '
  25     """ Break between words. """
  26     PART = '-'
  27     """ Break inside a word, for example a hyphen or apostrophe. """
  28     TOKEN = '`'
  29     """ Break created as a result of tokenization.
  30         This may happen in languages without spaces between words.
  31     """
  32
  33
  34 class TokenType(enum.Enum):
  35     """ Type of token.
  36     """
  37     WORD = enum.auto()
  38     """ Full name of a place. """
  39     PARTIAL = enum.auto()
  40     """ Word term without breaks, does not necessarily represent a full name. """
  41     HOUSENUMBER = enum.auto()
  42     """ Housenumber term. """
  43     POSTCODE = enum.auto()
  44     """ Postal code term. """
  45     COUNTRY = enum.auto()
  46     """ Country name or reference. """
  47     QUALIFIER = enum.auto()
  48     """ Special term used together with name (e.g. _Hotel_ Bellevue). """
  49     NEAR_ITEM = enum.auto()
  50     """ Special term used as searchable object(e.g. supermarket in ...). """
  51
  52
  53 class PhraseType(enum.Enum):
  54     """ Designation of a phrase.
  55     """
  56     NONE = 0
  57     """ No specific designation (i.e. source is free-form query). """
  58     AMENITY = enum.auto()
  59     """ Contains name or type of a POI. """
  60     STREET = enum.auto()
  61     """ Contains a street name optionally with a housenumber. """
  62     CITY = enum.auto()
  63     """ Contains the postal city. """
  64     COUNTY = enum.auto()
  65     """ Contains the equivalent of a county. """
  66     STATE = enum.auto()
  67     """ Contains a state or province. """
  68     POSTCODE = enum.auto()
  69     """ Contains a postal code. """
  70     COUNTRY = enum.auto()
  71     """ Contains the country name or code. """
  72
  73     def compatible_with(self, ttype: TokenType,
  74                         is_full_phrase: bool) -> bool:
  75         """ Check if the given token type can be used with the phrase type.
  76         """
  77         if self == PhraseType.NONE:
  78             return not is_full_phrase or ttype != TokenType.QUALIFIER
  79         if self == PhraseType.AMENITY:
  80             return ttype in (TokenType.WORD, TokenType.PARTIAL)\
  81                    or (is_full_phrase and ttype == TokenType.NEAR_ITEM)\
  82                    or (not is_full_phrase and ttype == TokenType.QUALIFIER)
  83         if self == PhraseType.STREET:
  84             return ttype in (TokenType.WORD, TokenType.PARTIAL, TokenType.HOUSENUMBER)
  85         if self == PhraseType.POSTCODE:
  86             return ttype == TokenType.POSTCODE
  87         if self == PhraseType.COUNTRY:
  88             return ttype == TokenType.COUNTRY
  89
  90         return ttype in (TokenType.WORD, TokenType.PARTIAL)
  91
  92
  93 @dataclasses.dataclass
  94 class Token(ABC):
  95     """ Base type for tokens.
  96         Specific query analyzers must implement the concrete token class.
  97     """
  98
  99     penalty: float
 100     token: int
 101     count: int
 102     addr_count: int
 103     lookup_word: str
 104     is_indexed: bool
 105
 106
 107     @abstractmethod
 108     def get_category(self) -> Tuple[str, str]:
 109         """ Return the category restriction for qualifier terms and
 110             category objects.
 111         """
 112
 113 @dataclasses.dataclass
 114 class TokenRange:
 115     """ Indexes of query nodes over which a token spans.
 116     """
 117     start: int
 118     end: int
 119
 120     def __lt__(self, other: 'TokenRange') -> bool:
 121         return self.end <= other.start
 122
 123
 124     def __le__(self, other: 'TokenRange') -> bool:
 125         return NotImplemented
 126
 127
 128     def __gt__(self, other: 'TokenRange') -> bool:
 129         return self.start >= other.end
 130
 131
 132     def __ge__(self, other: 'TokenRange') -> bool:
 133         return NotImplemented
 134
 135
 136     def replace_start(self, new_start: int) -> 'TokenRange':
 137         """ Return a new token range with the new start.
 138         """
 139         return TokenRange(new_start, self.end)
 140
 141
 142     def replace_end(self, new_end: int) -> 'TokenRange':
 143         """ Return a new token range with the new end.
 144         """
 145         return TokenRange(self.start, new_end)
 146
 147
 148     def split(self, index: int) -> Tuple['TokenRange', 'TokenRange']:
 149         """ Split the span into two spans at the given index.
 150             The index must be within the span.
 151         """
 152         return self.replace_end(index), self.replace_start(index)
 153
 154
 155 @dataclasses.dataclass
 156 class TokenList:
 157     """ List of all tokens of a given type going from one breakpoint to another.
 158     """
 159     end: int
 160     ttype: TokenType
 161     tokens: List[Token]
 162
 163
 164     def add_penalty(self, penalty: float) -> None:
 165         """ Add the given penalty to all tokens in the list.
 166         """
 167         for token in self.tokens:
 168             token.penalty += penalty
 169
 170
 171 @dataclasses.dataclass
 172 class QueryNode:
 173     """ A node of the query representing a break between terms.
 174     """
 175     btype: BreakType
 176     ptype: PhraseType
 177     starting: List[TokenList] = dataclasses.field(default_factory=list)
 178
 179     def has_tokens(self, end: int, *ttypes: TokenType) -> bool:
 180         """ Check if there are tokens of the given types ending at the
 181             given node.
 182         """
 183         return any(tl.end == end and tl.ttype in ttypes for tl in self.starting)
 184
 185
 186     def get_tokens(self, end: int, ttype: TokenType) -> Optional[List[Token]]:
 187         """ Get the list of tokens of the given type starting at this node
 188             and ending at the node 'end'. Returns 'None' if no such
 189             tokens exist.
 190         """
 191         for tlist in self.starting:
 192             if tlist.end == end and tlist.ttype == ttype:
 193                 return tlist.tokens
 194         return None
 195
 196
 197 @dataclasses.dataclass
 198 class Phrase:
 199     """ A normalized query part. Phrases may be typed which means that
 200         they then represent a specific part of the address.
 201     """
 202     ptype: PhraseType
 203     text: str
 204
 205
 206 class QueryStruct:
 207     """ A tokenized search query together with the normalized source
 208         from which the tokens have been parsed.
 209
 210         The query contains a list of nodes that represent the breaks
 211         between words. Tokens span between nodes, which don't necessarily
 212         need to be direct neighbours. Thus the query is represented as a
 213         directed acyclic graph.
 214
 215         When created, a query contains a single node: the start of the
 216         query. Further nodes can be added by appending to 'nodes'.
 217     """
 218
 219     def __init__(self, source: List[Phrase]) -> None:
 220         self.source = source
 221         self.nodes: List[QueryNode] = \
 222             [QueryNode(BreakType.START, source[0].ptype if source else PhraseType.NONE)]
 223
 224
 225     def num_token_slots(self) -> int:
 226         """ Return the length of the query in vertice steps.
 227         """
 228         return len(self.nodes) - 1
 229
 230
 231     def add_node(self, btype: BreakType, ptype: PhraseType) -> None:
 232         """ Append a new break node with the given break type.
 233             The phrase type denotes the type for any tokens starting
 234             at the node.
 235         """
 236         self.nodes.append(QueryNode(btype, ptype))
 237
 238
 239     def add_token(self, trange: TokenRange, ttype: TokenType, token: Token) -> None:
 240         """ Add a token to the query. 'start' and 'end' are the indexes of the
 241             nodes from which to which the token spans. The indexes must exist
 242             and are expected to be in the same phrase.
 243             'ttype' denotes the type of the token and 'token' the token to
 244             be inserted.
 245
 246             If the token type is not compatible with the phrase it should
 247             be added to, then the token is silently dropped.
 248         """
 249         snode = self.nodes[trange.start]
 250         full_phrase = snode.btype in (BreakType.START, BreakType.PHRASE)\
 251                       and self.nodes[trange.end].btype in (BreakType.PHRASE, BreakType.END)
 252         if snode.ptype.compatible_with(ttype, full_phrase):
 253             tlist = snode.get_tokens(trange.end, ttype)
 254             if tlist is None:
 255                 snode.starting.append(TokenList(trange.end, ttype, [token]))
 256             else:
 257                 tlist.append(token)
 258
 259
 260     def get_tokens(self, trange: TokenRange, ttype: TokenType) -> List[Token]:
 261         """ Get the list of tokens of a given type, spanning the given
 262             nodes. The nodes must exist. If no tokens exist, an
 263             empty list is returned.
 264         """
 265         return self.nodes[trange.start].get_tokens(trange.end, ttype) or []
 266
 267
 268     def get_partials_list(self, trange: TokenRange) -> List[Token]:
 269         """ Create a list of partial tokens between the given nodes.
 270             The list is composed of the first token of type PARTIAL
 271             going to the subsequent node. Such PARTIAL tokens are
 272             assumed to exist.
 273         """
 274         return [next(iter(self.get_tokens(TokenRange(i, i+1), TokenType.PARTIAL)))
 275                           for i in range(trange.start, trange.end)]
 276
 277
 278     def iter_token_lists(self) -> Iterator[Tuple[int, QueryNode, TokenList]]:
 279         """ Iterator over all token lists in the query.
 280         """
 281         for i, node in enumerate(self.nodes):
 282             for tlist in node.starting:
 283                 yield i, node, tlist
 284
 285
 286     def find_lookup_word_by_id(self, token: int) -> str:
 287         """ Find the first token with the given token ID and return
 288             its lookup word. Returns 'None' if no such token exists.
 289             The function is very slow and must only be used for
 290             debugging.
 291         """
 292         for node in self.nodes:
 293             for tlist in node.starting:
 294                 for t in tlist.tokens:
 295                     if t.token == token:
 296                         return f"[{tlist.ttype.name[0]}]{t.lookup_word}"
 297         return 'None'