X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/ae8694a6a6862d7cb66cd91102d2802c9899e7cf..31412e06740727695c5d9512e0cd59c0dd683322:/src/nominatim_api/search/query.py diff --git a/src/nominatim_api/search/query.py b/src/nominatim_api/search/query.py index 02ebbb5b..68a6b00a 100644 --- a/src/nominatim_api/search/query.py +++ b/src/nominatim_api/search/query.py @@ -13,42 +13,48 @@ import dataclasses import enum -class BreakType(enum.Enum): - """ Type of break between tokens. - """ - START = '<' - """ Begin of the query. """ - END = '>' - """ End of the query. """ - PHRASE = ',' - """ Break between two phrases. """ - WORD = ' ' - """ Break between words. """ - PART = '-' - """ Break inside a word, for example a hyphen or apostrophe. """ - TOKEN = '`' - """ Break created as a result of tokenization. - This may happen in languages without spaces between words. - """ +BreakType = str +""" Type of break between tokens. +""" +BREAK_START = '<' +""" Begin of the query. """ +BREAK_END = '>' +""" End of the query. """ +BREAK_PHRASE = ',' +""" Hard break between two phrases. Address parts cannot cross hard + phrase boundaries.""" +BREAK_SOFT_PHRASE = ':' +""" Likely break between two phrases. Address parts should not cross soft + phrase boundaries. Soft breaks can be inserted by a preprocessor + that is analysing the input string. +""" +BREAK_WORD = ' ' +""" Break between words. """ +BREAK_PART = '-' +""" Break inside a word, for example a hyphen or apostrophe. """ +BREAK_TOKEN = '`' +""" Break created as a result of tokenization. + This may happen in languages without spaces between words. +""" -class TokenType(enum.Enum): - """ Type of token. - """ - WORD = enum.auto() - """ Full name of a place. """ - PARTIAL = enum.auto() - """ Word term without breaks, does not necessarily represent a full name. """ - HOUSENUMBER = enum.auto() - """ Housenumber term. """ - POSTCODE = enum.auto() - """ Postal code term. """ - COUNTRY = enum.auto() - """ Country name or reference. """ - QUALIFIER = enum.auto() - """ Special term used together with name (e.g. _Hotel_ Bellevue). """ - NEAR_ITEM = enum.auto() - """ Special term used as searchable object(e.g. supermarket in ...). """ +TokenType = str +""" Type of token. +""" +TOKEN_WORD = 'W' +""" Full name of a place. """ +TOKEN_PARTIAL = 'w' +""" Word term without breaks, does not necessarily represent a full name. """ +TOKEN_HOUSENUMBER = 'H' +""" Housenumber term. """ +TOKEN_POSTCODE = 'P' +""" Postal code term. """ +TOKEN_COUNTRY = 'C' +""" Country name or reference. """ +TOKEN_QUALIFIER = 'Q' +""" Special term used together with name (e.g. _Hotel_ Bellevue). """ +TOKEN_NEAR_ITEM = 'N' +""" Special term used as searchable object(e.g. supermarket in ...). """ class PhraseType(enum.Enum): @@ -76,19 +82,19 @@ class PhraseType(enum.Enum): """ Check if the given token type can be used with the phrase type. """ if self == PhraseType.NONE: - return not is_full_phrase or ttype != TokenType.QUALIFIER + return not is_full_phrase or ttype != TOKEN_QUALIFIER if self == PhraseType.AMENITY: - return ttype in (TokenType.WORD, TokenType.PARTIAL)\ - or (is_full_phrase and ttype == TokenType.NEAR_ITEM)\ - or (not is_full_phrase and ttype == TokenType.QUALIFIER) + return ttype in (TOKEN_WORD, TOKEN_PARTIAL)\ + or (is_full_phrase and ttype == TOKEN_NEAR_ITEM)\ + or (not is_full_phrase and ttype == TOKEN_QUALIFIER) if self == PhraseType.STREET: - return ttype in (TokenType.WORD, TokenType.PARTIAL, TokenType.HOUSENUMBER) + return ttype in (TOKEN_WORD, TOKEN_PARTIAL, TOKEN_HOUSENUMBER) if self == PhraseType.POSTCODE: - return ttype == TokenType.POSTCODE + return ttype == TOKEN_POSTCODE if self == PhraseType.COUNTRY: - return ttype == TokenType.COUNTRY + return ttype == TOKEN_COUNTRY - return ttype in (TokenType.WORD, TokenType.PARTIAL) + return ttype in (TOKEN_WORD, TOKEN_PARTIAL) @dataclasses.dataclass @@ -116,6 +122,7 @@ class TokenRange: """ start: int end: int + penalty: Optional[float] = None def __lt__(self, other: 'TokenRange') -> bool: return self.end <= other.start @@ -211,7 +218,7 @@ class QueryStruct: def __init__(self, source: List[Phrase]) -> None: self.source = source self.nodes: List[QueryNode] = \ - [QueryNode(BreakType.START, source[0].ptype if source else PhraseType.NONE)] + [QueryNode(BREAK_START, source[0].ptype if source else PhraseType.NONE)] def num_token_slots(self) -> int: """ Return the length of the query in vertice steps. @@ -236,8 +243,8 @@ class QueryStruct: be added to, then the token is silently dropped. """ snode = self.nodes[trange.start] - full_phrase = snode.btype in (BreakType.START, BreakType.PHRASE)\ - and self.nodes[trange.end].btype in (BreakType.PHRASE, BreakType.END) + full_phrase = snode.btype in (BREAK_START, BREAK_PHRASE)\ + and self.nodes[trange.end].btype in (BREAK_PHRASE, BREAK_END) if snode.ptype.compatible_with(ttype, full_phrase): tlist = snode.get_tokens(trange.end, ttype) if tlist is None: @@ -258,7 +265,7 @@ class QueryStruct: going to the subsequent node. Such PARTIAL tokens are assumed to exist. """ - return [next(iter(self.get_tokens(TokenRange(i, i+1), TokenType.PARTIAL))) + return [next(iter(self.get_tokens(TokenRange(i, i+1), TOKEN_PARTIAL))) for i in range(trange.start, trange.end)] def iter_token_lists(self) -> Iterator[Tuple[int, QueryNode, TokenList]]: @@ -278,5 +285,5 @@ class QueryStruct: for tlist in node.starting: for t in tlist.tokens: if t.token == token: - return f"[{tlist.ttype.name[0]}]{t.lookup_word}" + return f"[{tlist.ttype}]{t.lookup_word}" return 'None'