# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Datastructures for a tokenized query.
"""
from typing import List, Tuple, Optional, Iterator
from abc import ABC, abstractmethod
import dataclasses
import enum


class BreakType(enum.Enum):
    """ Type of break between tokens.
    """
    START = enum.auto()
    """ Beginning of the query. """
    END = enum.auto()
    """ End of the query. """
    PHRASE = enum.auto()
    """ Hard break between two phrases. Address parts cannot cross hard
        phrase boundaries.
    """
    SOFT_PHRASE = enum.auto()
    """ Likely break between two phrases. Address parts should not cross soft
        phrase boundaries. Soft breaks can be inserted by a preprocessor
        that is analysing the input string.
    """
    WORD = enum.auto()
    """ Break between words. """
    PART = enum.auto()
    """ Break inside a word, for example a hyphen or apostrophe. """
    TOKEN = enum.auto()
    """ Break created as a result of tokenization.
        This may happen in languages without spaces between words.
    """


class TokenType(enum.Enum):
    """ Type of token.
    """
    WORD = enum.auto()
    """ Full name of a place. """
    PARTIAL = enum.auto()
    """ Word term without breaks, does not necessarily represent a full name. """
    HOUSENUMBER = enum.auto()
    """ Housenumber term. """
    POSTCODE = enum.auto()
    """ Postal code term. """
    COUNTRY = enum.auto()
    """ Country name or reference. """
    QUALIFIER = enum.auto()
    """ Special term used together with a name (e.g. _Hotel_ Bellevue). """
    NEAR_ITEM = enum.auto()
    """ Special term used as a searchable object (e.g. supermarket in ...). """


class PhraseType(enum.Enum):
    """ Designation of a phrase.
    """
    NONE = enum.auto()
    """ No specific designation (i.e. source is a free-form query). """
    AMENITY = enum.auto()
    """ Contains the name or type of a POI. """
    STREET = enum.auto()
    """ Contains a street name, optionally with a housenumber. """
    CITY = enum.auto()
    """ Contains the postal city. """
    COUNTY = enum.auto()
    """ Contains the equivalent of a county. """
    STATE = enum.auto()
    """ Contains a state or province. """
    POSTCODE = enum.auto()
    """ Contains a postal code. """
    COUNTRY = enum.auto()
    """ Contains the country name or code. """

    def compatible_with(self, ttype: TokenType,
                        is_full_phrase: bool) -> bool:
        """ Check if the given token type can be used with the phrase type.
        """
        if self == PhraseType.NONE:
            return not is_full_phrase or ttype != TokenType.QUALIFIER
        if self == PhraseType.AMENITY:
            return ttype in (TokenType.WORD, TokenType.PARTIAL)\
                   or (is_full_phrase and ttype == TokenType.NEAR_ITEM)\
                   or (not is_full_phrase and ttype == TokenType.QUALIFIER)
        if self == PhraseType.STREET:
            return ttype in (TokenType.WORD, TokenType.PARTIAL, TokenType.HOUSENUMBER)
        if self == PhraseType.POSTCODE:
            return ttype == TokenType.POSTCODE
        if self == PhraseType.COUNTRY:
            return ttype == TokenType.COUNTRY

        return ttype in (TokenType.WORD, TokenType.PARTIAL)
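
    # Illustrative examples (added for clarity, not part of the original
    # module): a housenumber may appear inside a street phrase, a postcode
    # may not, and a NEAR_ITEM is only accepted when it covers a full phrase.
    #
    #   PhraseType.STREET.compatible_with(TokenType.HOUSENUMBER, False)  # True
    #   PhraseType.STREET.compatible_with(TokenType.POSTCODE, False)     # False
    #   PhraseType.AMENITY.compatible_with(TokenType.NEAR_ITEM, True)    # True
    #   PhraseType.AMENITY.compatible_with(TokenType.NEAR_ITEM, False)   # False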


@dataclasses.dataclass
class Token(ABC):
    """ Base type for tokens.
        Specific query analyzers must implement the concrete token class.
    """
    penalty: float
    token: int
    lookup_word: str

    @abstractmethod
    def get_category(self) -> Tuple[str, str]:
        """ Return the category restriction for qualifier terms and
            category objects.
        """


@dataclasses.dataclass
class TokenRange:
    """ Indexes of query nodes over which a token spans.
    """
    start: int
    end: int

    def __lt__(self, other: 'TokenRange') -> bool:
        return self.end <= other.start

    def __le__(self, other: 'TokenRange') -> bool:
        return NotImplemented

    def __gt__(self, other: 'TokenRange') -> bool:
        return self.start >= other.end

    def __ge__(self, other: 'TokenRange') -> bool:
        return NotImplemented

    def replace_start(self, new_start: int) -> 'TokenRange':
        """ Return a new token range with the new start.
        """
        return TokenRange(new_start, self.end)

    def replace_end(self, new_end: int) -> 'TokenRange':
        """ Return a new token range with the new end.
        """
        return TokenRange(self.start, new_end)

    def split(self, index: int) -> Tuple['TokenRange', 'TokenRange']:
        """ Split the span into two spans at the given index.
            The index must be within the span.
        """
        return self.replace_end(index), self.replace_start(index)
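
    # Ordering sketch (illustrative): '<' means "ends before the other
    # starts"; ranges that overlap are neither smaller nor greater.
    #
    #   TokenRange(0, 2) < TokenRange(2, 4)  # True - ends where the other starts
    #   TokenRange(0, 3) < TokenRange(2, 4)  # False - the ranges overlap
    #   TokenRange(1, 4).split(2)            # (TokenRange(1, 2), TokenRange(2, 4))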


@dataclasses.dataclass
class TokenList:
    """ List of all tokens of a given type going from one breakpoint to another.
    """
    end: int
    ttype: TokenType
    tokens: List[Token]

    def add_penalty(self, penalty: float) -> None:
        """ Add the given penalty to all tokens in the list.
        """
        for token in self.tokens:
            token.penalty += penalty


@dataclasses.dataclass
class QueryNode:
    """ A node of the query representing a break between terms.
    """
    btype: BreakType
    ptype: PhraseType
    starting: List[TokenList] = dataclasses.field(default_factory=list)

    def has_tokens(self, end: int, *ttypes: TokenType) -> bool:
        """ Check if there are tokens of the given types ending at the
            given node.
        """
        return any(tl.end == end and tl.ttype in ttypes for tl in self.starting)

    def get_tokens(self, end: int, ttype: TokenType) -> Optional[List[Token]]:
        """ Get the list of tokens of the given type starting at this node
            and ending at the node 'end'. Returns 'None' if no such
            tokens exist.
        """
        for tlist in self.starting:
            if tlist.end == end and tlist.ttype == ttype:
                return tlist.tokens
        return None


@dataclasses.dataclass
class Phrase:
    """ A normalized query part. Phrases may be typed, which means that
        they then represent a specific part of the address.
    """
    ptype: PhraseType
    text: str
205 """ A tokenized search query together with the normalized source
206 from which the tokens have been parsed.
208 The query contains a list of nodes that represent the breaks
209 between words. Tokens span between nodes, which don't necessarily
210 need to be direct neighbours. Thus the query is represented as a
211 directed acyclic graph.
213 When created, a query contains a single node: the start of the
214 query. Further nodes can be added by appending to 'nodes'.
217 def __init__(self, source: List[Phrase]) -> None:
219 self.nodes: List[QueryNode] = \
220 [QueryNode(BreakType.START, source[0].ptype if source else PhraseType.NONE)]

    def num_token_slots(self) -> int:
        """ Return the length of the query in vertex steps.
        """
        return len(self.nodes) - 1

    def add_node(self, btype: BreakType, ptype: PhraseType) -> None:
        """ Append a new break node with the given break type.
            The phrase type denotes the type for any tokens starting
            at the node.
        """
        self.nodes.append(QueryNode(btype, ptype))

    def add_token(self, trange: TokenRange, ttype: TokenType, token: Token) -> None:
        """ Add a token to the query. 'trange' gives the indexes of the
            nodes from and to which the token spans. The indexes must exist
            and are expected to be in the same phrase.
            'ttype' denotes the type of the token and 'token' the token to
            be inserted.

            If the token type is not compatible with the phrase it should
            be added to, the token is silently dropped.
        """
        snode = self.nodes[trange.start]
        full_phrase = snode.btype in (BreakType.START, BreakType.PHRASE)\
                      and self.nodes[trange.end].btype in (BreakType.PHRASE, BreakType.END)
        if snode.ptype.compatible_with(ttype, full_phrase):
            tlist = snode.get_tokens(trange.end, ttype)
            if tlist is None:
                snode.starting.append(TokenList(trange.end, ttype, [token]))
            else:
                tlist.append(token)
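
    # Sketch of assembling a one-phrase query (illustrative; 'MyToken' is a
    # hypothetical stand-in for a concrete Token implementation):
    #
    #   query = QueryStruct([Phrase(PhraseType.NONE, 'hotel bellevue')])
    #   query.add_node(BreakType.WORD, PhraseType.NONE)
    #   query.add_node(BreakType.END, PhraseType.NONE)
    #   query.add_token(TokenRange(0, 1), TokenType.PARTIAL, MyToken(...))
    #   query.add_token(TokenRange(0, 2), TokenType.WORD, MyToken(...))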

    def get_tokens(self, trange: TokenRange, ttype: TokenType) -> List[Token]:
        """ Get the list of tokens of a given type, spanning the given
            nodes. The nodes must exist. If no tokens exist, an
            empty list is returned.
        """
        return self.nodes[trange.start].get_tokens(trange.end, ttype) or []

    def get_partials_list(self, trange: TokenRange) -> List[Token]:
        """ Create a list of partial tokens between the given nodes.
            The list is composed of the first token of type PARTIAL
            going to the subsequent node. Such PARTIAL tokens are
            assumed to exist.
        """
        return [next(iter(self.get_tokens(TokenRange(i, i+1), TokenType.PARTIAL)))
                for i in range(trange.start, trange.end)]
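
    # Illustrative: for TokenRange(1, 3) this returns the first PARTIAL token
    # spanning nodes (1, 2) followed by the first one spanning (2, 3). If a
    # slot has no PARTIAL token, next() raises StopIteration, hence the
    # precondition above.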

    def iter_token_lists(self) -> Iterator[Tuple[int, QueryNode, TokenList]]:
        """ Iterator over all token lists in the query.
        """
        for i, node in enumerate(self.nodes):
            for tlist in node.starting:
                yield i, node, tlist

    def find_lookup_word_by_id(self, token: int) -> str:
        """ Find the first token with the given token ID and return
            its lookup word. Returns the string 'None' if no such
            token exists. The function is very slow and must only be
            used for debugging.
        """
        for node in self.nodes:
            for tlist in node.starting:
                for t in tlist.tokens:
                    if t.token == token:
                        return f"[{tlist.ttype.name[0]}]{t.lookup_word}"
        return 'None'