X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/a690605a96191ed0ac230493bfb8ae3aa6504988..4577669213ea392fa7e25a2fce444f387763f4c8:/src/nominatim_api/search/query.py

diff --git a/src/nominatim_api/search/query.py b/src/nominatim_api/search/query.py
index 53482df8..87638129 100644
--- a/src/nominatim_api/search/query.py
+++ b/src/nominatim_api/search/query.py
@@ -12,23 +12,30 @@ from abc import ABC, abstractmethod
 import dataclasses
 import enum
 
-class BreakType(enum.Enum):
-    """ Type of break between tokens.
-    """
-    START = '<'
-    """ Begin of the query. """
-    END = '>'
-    """ End of the query. """
-    PHRASE = ','
-    """ Break between two phrases. """
-    WORD = ' '
-    """ Break between words. """
-    PART = '-'
-    """ Break inside a word, for example a hyphen or apostrophe. """
-    TOKEN = '`'
-    """ Break created as a result of tokenization.
-        This may happen in languages without spaces between words.
-    """
+
+BreakType = str
+""" Type of break between tokens.
+"""
+BREAK_START = '<'
+""" Begin of the query. """
+BREAK_END = '>'
+""" End of the query. """
+BREAK_PHRASE = ','
+""" Hard break between two phrases. Address parts cannot cross hard
+    phrase boundaries."""
+BREAK_SOFT_PHRASE = ':'
+""" Likely break between two phrases. Address parts should not cross soft
+    phrase boundaries. Soft breaks can be inserted by a preprocessor
+    that is analysing the input string.
+"""
+BREAK_WORD = ' '
+""" Break between words. """
+BREAK_PART = '-'
+""" Break inside a word, for example a hyphen or apostrophe. """
+BREAK_TOKEN = '`'
+""" Break created as a result of tokenization.
+    This may happen in languages without spaces between words.
+"""
 
 
 class TokenType(enum.Enum):
@@ -102,48 +109,43 @@ class Token(ABC):
     addr_count: int
     lookup_word: str
 
-
     @abstractmethod
     def get_category(self) -> Tuple[str, str]:
         """ Return the category restriction for qualifier terms and
             category objects.
         """
 
+
 @dataclasses.dataclass
 class TokenRange:
     """ Indexes of query nodes over which a token spans.
     """
     start: int
    end: int
+    penalty: Optional[float] = None
 
     def __lt__(self, other: 'TokenRange') -> bool:
         return self.end <= other.start
 
-
     def __le__(self, other: 'TokenRange') -> bool:
         return NotImplemented
 
-
     def __gt__(self, other: 'TokenRange') -> bool:
         return self.start >= other.end
 
-
     def __ge__(self, other: 'TokenRange') -> bool:
         return NotImplemented
 
-
     def replace_start(self, new_start: int) -> 'TokenRange':
         """ Return a new token range with the new start.
         """
         return TokenRange(new_start, self.end)
 
-
     def replace_end(self, new_end: int) -> 'TokenRange':
         """ Return a new token range with the new end.
         """
         return TokenRange(self.start, new_end)
 
-
     def split(self, index: int) -> Tuple['TokenRange', 'TokenRange']:
         """ Split the span into two spans at the given index.
             The index must be within the span.
@@ -159,7 +161,6 @@ class TokenList:
     ttype: TokenType
     tokens: List[Token]
 
-
     def add_penalty(self, penalty: float) -> None:
         """ Add the given penalty to all tokens in the list.
         """
@@ -181,7 +182,6 @@ class QueryNode:
         """
         return any(tl.end == end and tl.ttype in ttypes for tl in self.starting)
 
-
     def get_tokens(self, end: int, ttype: TokenType) -> Optional[List[Token]]:
         """ Get the list of tokens of the given type starting at this node
             and ending at the node 'end'. Returns 'None' if no such
@@ -218,15 +218,13 @@ class QueryStruct:
     def __init__(self, source: List[Phrase]) -> None:
         self.source = source
         self.nodes: List[QueryNode] = \
-            [QueryNode(BreakType.START, source[0].ptype if source else PhraseType.NONE)]
-
+            [QueryNode(BREAK_START, source[0].ptype if source else PhraseType.NONE)]
 
     def num_token_slots(self) -> int:
         """ Return the length of the query in vertice steps.
         """
         return len(self.nodes) - 1
 
-
     def add_node(self, btype: BreakType, ptype: PhraseType) -> None:
         """ Append a new break node with the given break type.
             The phrase type denotes the type for any tokens starting
@@ -234,7 +232,6 @@ class QueryStruct:
         """
         self.nodes.append(QueryNode(btype, ptype))
 
-
     def add_token(self, trange: TokenRange, ttype: TokenType, token: Token) -> None:
         """ Add a token to the query. 'start' and 'end' are the indexes of the
             nodes from which to which the token spans. The indexes must exist
@@ -246,8 +243,8 @@ class QueryStruct:
             be added to, then the token is silently dropped.
         """
         snode = self.nodes[trange.start]
-        full_phrase = snode.btype in (BreakType.START, BreakType.PHRASE)\
-                      and self.nodes[trange.end].btype in (BreakType.PHRASE, BreakType.END)
+        full_phrase = snode.btype in (BREAK_START, BREAK_PHRASE)\
+                      and self.nodes[trange.end].btype in (BREAK_PHRASE, BREAK_END)
         if snode.ptype.compatible_with(ttype, full_phrase):
             tlist = snode.get_tokens(trange.end, ttype)
             if tlist is None:
@@ -255,7 +252,6 @@ class QueryStruct:
             else:
                 tlist.append(token)
 
-
     def get_tokens(self, trange: TokenRange, ttype: TokenType) -> List[Token]:
         """ Get the list of tokens of a given type, spanning the given
             nodes. The nodes must exist. If no tokens exist, an
@@ -263,7 +259,6 @@ class QueryStruct:
         """
         return self.nodes[trange.start].get_tokens(trange.end, ttype) or []
 
-
     def get_partials_list(self, trange: TokenRange) -> List[Token]:
         """ Create a list of partial tokens between the given nodes.
             The list is composed of the first token of type PARTIAL
@@ -271,8 +266,7 @@ class QueryStruct:
             assumed to exist.
         """
         return [next(iter(self.get_tokens(TokenRange(i, i+1), TokenType.PARTIAL)))
-                    for i in range(trange.start, trange.end)]
-
+                for i in range(trange.start, trange.end)]
 
     def iter_token_lists(self) -> Iterator[Tuple[int, QueryNode, TokenList]]:
         """ Iterator over all token lists in the query.
         """
@@ -281,7 +275,6 @@ class QueryStruct:
             for tlist in node.starting:
                 yield i, node, tlist
 
-
     def find_lookup_word_by_id(self, token: int) -> str:
         """ Find the first token with the given token ID and return
             its lookup word. Returns 'None' if no such token exists.
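
For illustration, a minimal sketch of how the reworked interface fits together after this change. It is not part of the commit: 'SampleToken' and all values are hypothetical, and the sketch assumes that 'Token' and 'Phrase' are dataclasses as their field lists suggest (with 'Token' carrying 'penalty', 'token', 'count', 'addr_count' and 'lookup_word') and that 'PhraseType.NONE' accepts PARTIAL tokens.

    import dataclasses
    from typing import Tuple

    from nominatim_api.search.query import (
        BREAK_END, BREAK_WORD, Phrase, PhraseType, QueryStruct,
        Token, TokenRange, TokenType)


    @dataclasses.dataclass
    class SampleToken(Token):
        """ Hypothetical concrete token. Real token classes come from the
            tokenizer in use; get_category() must still be provided because
            it remains abstract.
        """
        def get_category(self) -> Tuple[str, str]:
            return ('', '')


    # One phrase of two words: nodes 0..2 with string break types between them.
    query = QueryStruct([Phrase(PhraseType.NONE, 'hauptstr 134')])
    query.add_node(BREAK_WORD, PhraseType.NONE)   # node 1 ends the first word
    query.add_node(BREAK_END, PhraseType.NONE)    # node 2 closes the query

    assert query.num_token_slots() == 2
    assert query.nodes[0].btype == '<'            # BREAK_START, a plain str now

    # Attach a token to the span between nodes 0 and 1. add_token() drops the
    # token silently if the phrase type is incompatible with the token type.
    query.add_token(TokenRange(0, 1), TokenType.PARTIAL,
                    SampleToken(penalty=0.0, token=1, count=1, addr_count=1,
                                lookup_word='hauptstr'))
    assert len(query.get_tokens(TokenRange(0, 1), TokenType.PARTIAL)) == 1

Note that the 'BreakType = str' alias keeps annotations such as 'add_node(btype: BreakType, ...)' meaningful while the break values themselves become plain one-character strings that compare directly, without enum-member lookups.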