1 # SPDX-License-Identifier: GPL-3.0-or-later
3 # This file is part of Nominatim. (https://nominatim.org)
5 # Copyright (C) 2023 by the Nominatim developer community.
6 # For a full list of authors see the git log.
8 Datastructures for a tokenized query.
10 from typing import List, Tuple, Optional, NamedTuple, Iterator
11 from abc import ABC, abstractmethod
15 class BreakType(enum.Enum):
16 """ Type of break between tokens.
19 """ Begin of the query. """
21 """ End of the query. """
23 """ Break between two phrases. """
25 """ Break between words. """
27 """ Break inside a word, for example a hyphen or apostrophe. """
29 """ Break created as a result of tokenization.
30 This may happen in languages without spaces between words.
34 class TokenType(enum.Enum):
38 """ Full name of a place. """
40 """ Word term without breaks, does not necessarily represent a full name. """
41 HOUSENUMBER = enum.auto()
42 """ Housenumber term. """
43 POSTCODE = enum.auto()
44 """ Postal code term. """
46 """ Country name or reference. """
47 QUALIFIER = enum.auto()
48 """ Special term used together with name (e.g. _Hotel_ Bellevue). """
49 CATEGORY = enum.auto()
50 """ Special term used as searchable object(e.g. supermarket in ...). """
53 class PhraseType(enum.Enum):
54 """ Designation of a phrase.
57 """ No specific designation (i.e. source is free-form query). """
59 """ Contains name or type of a POI. """
61 """ Contains a street name optionally with a housenumber. """
63 """ Contains the postal city. """
65 """ Contains the equivalent of a county. """
67 """ Contains a state or province. """
68 POSTCODE = enum.auto()
69 """ Contains a postal code. """
71 """ Contains the country name or code. """
73 def compatible_with(self, ttype: TokenType) -> bool:
74 """ Check if the given token type can be used with the phrase type.
76 if self == PhraseType.NONE:
78 if self == PhraseType.AMENITY:
79 return ttype in (TokenType.WORD, TokenType.PARTIAL,
80 TokenType.QUALIFIER, TokenType.CATEGORY)
81 if self == PhraseType.STREET:
82 return ttype in (TokenType.WORD, TokenType.PARTIAL, TokenType.HOUSENUMBER)
83 if self == PhraseType.POSTCODE:
84 return ttype == TokenType.POSTCODE
85 if self == PhraseType.COUNTRY:
86 return ttype == TokenType.COUNTRY
88 return ttype in (TokenType.WORD, TokenType.PARTIAL)
91 @dataclasses.dataclass
93 """ Base type for tokens.
94 Specific query analyzers must implement the concrete token class.
105 def get_category(self) -> Tuple[str, str]:
106 """ Return the category restriction for qualifier terms and
111 class TokenRange(NamedTuple):
112 """ Indexes of query nodes over which a token spans.
118 @dataclasses.dataclass
120 """ List of all tokens of a given type going from one breakpoint to another.
127 def add_penalty(self, penalty: float) -> None:
128 """ Add the given penalty to all tokens in the list.
130 for token in self.tokens:
131 token.penalty += penalty
134 @dataclasses.dataclass
136 """ A node of the querry representing a break between terms.
140 starting: List[TokenList] = dataclasses.field(default_factory=list)
142 def has_tokens(self, end: int, *ttypes: TokenType) -> bool:
143 """ Check if there are tokens of the given types ending at the
146 return any(tl.end == end and tl.ttype in ttypes for tl in self.starting)
149 def get_tokens(self, end: int, ttype: TokenType) -> Optional[List[Token]]:
150 """ Get the list of tokens of the given type starting at this node
151 and ending at the node 'end'. Returns 'None' if no such
154 return next((t.tokens for t in self.starting if t.end == end and t.ttype == ttype), None)
157 @dataclasses.dataclass
159 """ A normalized query part. Phrases may be typed which means that
160 they then represent a specific part of the address.
167 """ A tokenized search query together with the normalized source
168 from which the tokens have been parsed.
170 The query contains a list of nodes that represent the breaks
171 between words. Tokens span between nodes, which don't necessarily
172 need to be direct neighbours. Thus the query is represented as a
173 directed acyclic graph.
175 When created, a query contains a single node: the start of the
176 query. Further nodes can be added by appending to 'nodes'.
179 def __init__(self, source: List[Phrase]) -> None:
181 self.nodes: List[QueryNode] = \
182 [QueryNode(BreakType.START, source[0].ptype if source else PhraseType.NONE)]
185 def num_token_slots(self) -> int:
186 """ Return the length of the query in vertice steps.
188 return len(self.nodes) - 1
191 def add_node(self, btype: BreakType, ptype: PhraseType) -> None:
192 """ Append a new break node with the given break type.
193 The phrase type denotes the type for any tokens starting
196 self.nodes.append(QueryNode(btype, ptype))
199 def add_token(self, trange: TokenRange, ttype: TokenType, token: Token) -> None:
200 """ Add a token to the query. 'start' and 'end' are the indexes of the
201 nodes from which to which the token spans. The indexes must exist
202 and are expected to be in the same phrase.
203 'ttype' denotes the type of the token and 'token' the token to
206 If the token type is not compatible with the phrase it should
207 be added to, then the token is silently dropped.
209 snode = self.nodes[trange.start]
210 if snode.ptype.compatible_with(ttype):
211 tlist = snode.get_tokens(trange.end, ttype)
213 snode.starting.append(TokenList(trange.end, ttype, [token]))
218 def get_tokens(self, trange: TokenRange, ttype: TokenType) -> List[Token]:
219 """ Get the list of tokens of a given type, spanning the given
220 nodes. The nodes must exist. If no tokens exist, an
221 empty list is returned.
223 return self.nodes[trange.start].get_tokens(trange.end, ttype) or []
226 def get_partials_list(self, trange: TokenRange) -> List[Token]:
227 """ Create a list of partial tokens between the given nodes.
228 The list is composed of the first token of type PARTIAL
229 going to the subsequent node. Such PARTIAL tokens are
232 return [next(iter(self.get_tokens(TokenRange(i, i+1), TokenType.PARTIAL)))
233 for i in range(trange.start, trange.end)]
236 def iter_token_lists(self) -> Iterator[Tuple[int, QueryNode, TokenList]]:
237 """ Iterator over all token lists in the query.
239 for i, node in enumerate(self.nodes):
240 for tlist in node.starting:
244 def find_lookup_word_by_id(self, token: int) -> str:
245 """ Find the first token with the given token ID and return
246 its lookup word. Returns 'None' if no such token exists.
247 The function is very slow and must only be used for
250 for node in self.nodes:
251 for tlist in node.starting:
252 for t in tlist.tokens:
254 return f"[{tlist.ttype.name[0]}]{t.lookup_word}"