# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Datastructures for a tokenized query.
"""
from typing import List, Tuple, Optional, Iterator
from abc import ABC, abstractmethod
import dataclasses


16 """ Type of break between tokens.
19 """ Begin of the query. """
21 """ End of the query. """
23 """ Hard break between two phrases. Address parts cannot cross hard
25 BREAK_SOFT_PHRASE = ':'
26 """ Likely break between two phrases. Address parts should not cross soft
27 phrase boundaries. Soft breaks can be inserted by a preprocessor
28 that is analysing the input string.
31 """ Break between words. """
33 """ Break inside a word, for example a hyphen or apostrophe. """
35 """ Break created as a result of tokenization.
36 This may happen in languages without spaces between words.
44 """ Full name of a place. """
46 """ Word term without breaks, does not necessarily represent a full name. """
47 TOKEN_HOUSENUMBER = 'H'
48 """ Housenumber term. """
50 """ Postal code term. """
52 """ Country name or reference. """
54 """ Special term used together with name (e.g. _Hotel_ Bellevue). """
56 """ Special term used as searchable object(e.g. supermarket in ...). """
60 """ Designation of a phrase.
63 """ No specific designation (i.e. source is free-form query). """
65 """ Contains name or type of a POI. """
67 """ Contains a street name optionally with a housenumber. """
69 """ Contains the postal city. """
71 """ Contains the equivalent of a county. """
73 """ Contains a state or province. """
75 """ Contains a postal code. """
77 """ Contains the country name or code. """
def _phrase_compatible_with(ptype: PhraseType, ttype: TokenType,
                            is_full_phrase: bool) -> bool:
    """ Check if the given token type can be used with the phrase type.
    """
    if ptype == PHRASE_ANY:
        return not is_full_phrase or ttype != TOKEN_QUALIFIER
    if ptype == PHRASE_AMENITY:
        return ttype in (TOKEN_WORD, TOKEN_PARTIAL)\
               or (is_full_phrase and ttype == TOKEN_NEAR_ITEM)\
               or (not is_full_phrase and ttype == TOKEN_QUALIFIER)
    if ptype == PHRASE_STREET:
        return ttype in (TOKEN_WORD, TOKEN_PARTIAL, TOKEN_HOUSENUMBER)
    if ptype == PHRASE_POSTCODE:
        return ttype == TOKEN_POSTCODE
    if ptype == PHRASE_COUNTRY:
        return ttype == TOKEN_COUNTRY

    return ttype in (TOKEN_WORD, TOKEN_PARTIAL)


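# A quick sanity check of the rules above (an illustrative sketch, not part
# of the module; it only uses the constants and the function defined here):
#
#   _phrase_compatible_with(PHRASE_STREET, TOKEN_HOUSENUMBER, False)  # True
#   _phrase_compatible_with(PHRASE_POSTCODE, TOKEN_WORD, False)       # False
#   _phrase_compatible_with(PHRASE_ANY, TOKEN_QUALIFIER, True)        # False

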
@dataclasses.dataclass
class Token(ABC):
    """ Base type for tokens.
        Specific query analyzers must implement the concrete token class.
    """
    penalty: float
    token: int
    count: int
    addr_count: int
    lookup_word: str

    @abstractmethod
    def get_category(self) -> Tuple[str, str]:
        """ Return the category restriction for qualifier terms and
            category objects.
        """


@dataclasses.dataclass
class TokenRange:
    """ Indexes of query nodes over which a token spans.
    """
    start: int
    end: int
    penalty: Optional[float] = None

    def __lt__(self, other: 'TokenRange') -> bool:
        return self.end <= other.start

    def __le__(self, other: 'TokenRange') -> bool:
        return NotImplemented

    def __gt__(self, other: 'TokenRange') -> bool:
        return self.start >= other.end

    def __ge__(self, other: 'TokenRange') -> bool:
        return NotImplemented

    def replace_start(self, new_start: int) -> 'TokenRange':
        """ Return a new token range with the new start.
        """
        return TokenRange(new_start, self.end)

    def replace_end(self, new_end: int) -> 'TokenRange':
        """ Return a new token range with the new end.
        """
        return TokenRange(self.start, new_end)

    def split(self, index: int) -> Tuple['TokenRange', 'TokenRange']:
        """ Split the span into two spans at the given index.
            The index must be within the span.
        """
        return self.replace_end(index), self.replace_start(index)


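# TokenRange defines a strict partial order on spans (an illustrative sketch
# using only the class above): a range compares as smaller only when it lies
# completely before the other; overlapping ranges are neither smaller nor
# greater.
#
#   TokenRange(0, 2) < TokenRange(2, 4)   # True: disjoint, left of other
#   TokenRange(0, 3) < TokenRange(2, 4)   # False: the ranges overlap
#   TokenRange(0, 3).split(2)             # (TokenRange(0, 2), TokenRange(2, 3))

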
@dataclasses.dataclass
class TokenList:
    """ List of all tokens of a given type going from one breakpoint to another.
    """
    end: int
    ttype: TokenType
    tokens: List[Token]

    def add_penalty(self, penalty: float) -> None:
        """ Add the given penalty to all tokens in the list.
        """
        for token in self.tokens:
            token.penalty += penalty


@dataclasses.dataclass
class QueryNode:
    """ A node of the query representing a break between terms.

        The node also contains information on the source term
        ending at the node. The tokens are created from this information.
    """
    btype: BreakType
    ptype: PhraseType

    penalty: float
    """ Penalty for the break at this node.
    """
    term_lookup: str
    """ Transliterated term ending at this node.
    """
    term_normalized: str
    """ Normalised form of the term ending at this node.
        When the token resulted from a split during transliteration,
        then this string contains the complete source term.
    """

    starting: List[TokenList] = dataclasses.field(default_factory=list)

    def adjust_break(self, btype: BreakType, penalty: float) -> None:
        """ Change the break type and penalty for this node.
        """
        self.btype = btype
        self.penalty = penalty

    def has_tokens(self, end: int, *ttypes: TokenType) -> bool:
        """ Check if there are tokens of the given types ending at the
            given node.
        """
        return any(tl.end == end and tl.ttype in ttypes for tl in self.starting)

    def get_tokens(self, end: int, ttype: TokenType) -> Optional[List[Token]]:
        """ Get the list of tokens of the given type starting at this node
            and ending at the node 'end'. Returns 'None' if no such
            tokens exist.
        """
        for tlist in self.starting:
            if tlist.end == end and tlist.ttype == ttype:
                return tlist.tokens
        return None


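# Example for the lookup methods above (illustrative): if a node's 'starting'
# list contains TokenList(end=3, ttype=TOKEN_WORD, tokens=[...]), then
# node.has_tokens(3, TOKEN_WORD, TOKEN_PARTIAL) is True, while
# node.get_tokens(3, TOKEN_PARTIAL) is None because no PARTIAL list ends
# at node 3.

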
@dataclasses.dataclass
class Phrase:
    """ A normalized query part. Phrases may be typed which means that
        they then represent a specific part of the address.
    """
    ptype: PhraseType
    text: str


228 """ A tokenized search query together with the normalized source
229 from which the tokens have been parsed.
231 The query contains a list of nodes that represent the breaks
232 between words. Tokens span between nodes, which don't necessarily
233 need to be direct neighbours. Thus the query is represented as a
234 directed acyclic graph.
236 When created, a query contains a single node: the start of the
237 query. Further nodes can be added by appending to 'nodes'.
    def __init__(self, source: List[Phrase]) -> None:
        self.source = source
        self.nodes: List[QueryNode] = \
            [QueryNode(BREAK_START, source[0].ptype if source else PHRASE_ANY,
                       0.0, '', '')]

    def num_token_slots(self) -> int:
        """ Return the length of the query in vertex steps.
        """
        return len(self.nodes) - 1

    def add_node(self, btype: BreakType, ptype: PhraseType,
                 break_penalty: float = 0.0,
                 term_lookup: str = '', term_normalized: str = '') -> None:
        """ Append a new break node with the given break type.
            The phrase type denotes the type for any tokens starting
            at the node.
        """
        self.nodes.append(QueryNode(btype, ptype, break_penalty, term_lookup, term_normalized))

    def add_token(self, trange: TokenRange, ttype: TokenType, token: Token) -> None:
        """ Add a token to the query. 'trange' gives the start and end
            indexes of the nodes over which the token spans. The indexes
            must exist and are expected to be in the same phrase.
            'ttype' denotes the type of the token and 'token' the token to
            be inserted.

            If the token type is not compatible with the phrase it should
            be added to, then the token is silently dropped.
        """
        snode = self.nodes[trange.start]
        full_phrase = snode.btype in (BREAK_START, BREAK_PHRASE)\
            and self.nodes[trange.end].btype in (BREAK_PHRASE, BREAK_END)
        if _phrase_compatible_with(snode.ptype, ttype, full_phrase):
            tlist = snode.get_tokens(trange.end, ttype)
            if tlist is None:
                snode.starting.append(TokenList(trange.end, ttype, [token]))
            else:
                tlist.append(token)

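    # Note the silent-drop behaviour (illustrative): a TOKEN_HOUSENUMBER
    # added to a span whose start node has ptype PHRASE_COUNTRY is simply
    # ignored, because _phrase_compatible_with() rejects the combination.
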
    def get_tokens(self, trange: TokenRange, ttype: TokenType) -> List[Token]:
        """ Get the list of tokens of a given type, spanning the given
            nodes. The nodes must exist. If no tokens exist, an
            empty list is returned.
        """
        return self.nodes[trange.start].get_tokens(trange.end, ttype) or []

    def get_partials_list(self, trange: TokenRange) -> List[Token]:
        """ Create a list of partial tokens between the given nodes.
            The list is composed of the first token of type PARTIAL
            going to the subsequent node. Such PARTIAL tokens are
            assumed to exist.
        """
        return [next(iter(self.get_tokens(TokenRange(i, i+1), TOKEN_PARTIAL)))
                for i in range(trange.start, trange.end)]

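    # Example (illustrative): for trange = TokenRange(1, 4) the result holds
    # the first PARTIAL token of each single step (1,2), (2,3) and (3,4).
    # If a step has no PARTIAL token, next() raises StopIteration, hence
    # the tokens must exist.
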
    def iter_token_lists(self) -> Iterator[Tuple[int, QueryNode, TokenList]]:
        """ Iterator over all token lists in the query.
        """
        for i, node in enumerate(self.nodes):
            for tlist in node.starting:
                yield i, node, tlist

    def find_lookup_word_by_id(self, token: int) -> str:
        """ Find the first token with the given token ID and return
            its lookup word. Returns 'None' if no such token exists.
            The function is very slow and must only be used for
            debugging.
        """
        for node in self.nodes:
            for tlist in node.starting:
                for t in tlist.tokens:
                    if t.token == token:
                        return f"[{tlist.ttype}]{t.lookup_word}"
        return 'None'

    def get_transliterated_query(self) -> str:
        """ Return a string representation of the transliterated query
            with the character representation of the different break types.

            For debugging purposes only.
        """
        return ''.join(''.join((n.term_lookup, n.btype)) for n in self.nodes)


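# A minimal end-to-end sketch (illustrative only): 'SomeToken' stands in for
# a concrete Token subclass supplied by a query analyzer and is not part of
# this module. The rendered string assumes the break characters defined at
# the top of this file.
#
#   query = QueryStruct([Phrase(PHRASE_ANY, 'hauptstr 134')])
#   query.add_node(BREAK_WORD, PHRASE_ANY, 0.1, 'hauptstr', 'hauptstrasse')
#   query.add_node(BREAK_END, PHRASE_ANY, 0.0, '134', '134')
#   query.add_token(TokenRange(0, 1), TOKEN_PARTIAL, SomeToken(...))
#   query.add_token(TokenRange(1, 2), TOKEN_HOUSENUMBER, SomeToken(...))
#
#   query.num_token_slots()            # 2
#   query.get_transliterated_query()   # '<hauptstr 134>'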