X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/c314a3092c5b51c7782015f6fa9ac093b46fa174..e4295dba10bdb05045e35c772b3d8ca3cb042fd1:/src/nominatim_api/search/token_assignment.py diff --git a/src/nominatim_api/search/token_assignment.py b/src/nominatim_api/search/token_assignment.py index 5ac63d6f..8d25aa8f 100644 --- a/src/nominatim_api/search/token_assignment.py +++ b/src/nominatim_api/search/token_assignment.py @@ -14,7 +14,6 @@ import dataclasses from ..logging import log from . import query as qmod -# pylint: disable=too-many-return-statements,too-many-branches @dataclasses.dataclass class TypedRange: @@ -25,18 +24,20 @@ class TypedRange: PENALTY_TOKENCHANGE = { - qmod.BreakType.START: 0.0, - qmod.BreakType.END: 0.0, - qmod.BreakType.PHRASE: 0.0, - qmod.BreakType.WORD: 0.1, - qmod.BreakType.PART: 0.2, - qmod.BreakType.TOKEN: 0.4 + qmod.BREAK_START: 0.0, + qmod.BREAK_END: 0.0, + qmod.BREAK_PHRASE: 0.0, + qmod.BREAK_SOFT_PHRASE: 0.0, + qmod.BREAK_WORD: 0.1, + qmod.BREAK_PART: 0.2, + qmod.BREAK_TOKEN: 0.4 } TypedRangeSeq = List[TypedRange] + @dataclasses.dataclass -class TokenAssignment: # pylint: disable=too-many-instance-attributes +class TokenAssignment: """ Representation of a possible assignment of token types to the tokens in a tokenized query. """ @@ -49,24 +50,23 @@ class TokenAssignment: # pylint: disable=too-many-instance-attributes near_item: Optional[qmod.TokenRange] = None qualifier: Optional[qmod.TokenRange] = None - @staticmethod def from_ranges(ranges: TypedRangeSeq) -> 'TokenAssignment': """ Create a new token assignment from a sequence of typed spans. """ out = TokenAssignment() for token in ranges: - if token.ttype == qmod.TokenType.PARTIAL: + if token.ttype == qmod.TOKEN_PARTIAL: out.address.append(token.trange) - elif token.ttype == qmod.TokenType.HOUSENUMBER: + elif token.ttype == qmod.TOKEN_HOUSENUMBER: out.housenumber = token.trange - elif token.ttype == qmod.TokenType.POSTCODE: + elif token.ttype == qmod.TOKEN_POSTCODE: out.postcode = token.trange - elif token.ttype == qmod.TokenType.COUNTRY: + elif token.ttype == qmod.TOKEN_COUNTRY: out.country = token.trange - elif token.ttype == qmod.TokenType.NEAR_ITEM: + elif token.ttype == qmod.TOKEN_NEAR_ITEM: out.near_item = token.trange - elif token.ttype == qmod.TokenType.QUALIFIER: + elif token.ttype == qmod.TOKEN_QUALIFIER: out.qualifier = token.trange return out @@ -83,34 +83,29 @@ class _TokenSequence: self.direction = direction self.penalty = penalty - def __str__(self) -> str: - seq = ''.join(f'[{r.trange.start} - {r.trange.end}: {r.ttype.name}]' for r in self.seq) + seq = ''.join(f'[{r.trange.start} - {r.trange.end}: {r.ttype}]' for r in self.seq) return f'{seq} (dir: {self.direction}, penalty: {self.penalty})' - @property def end_pos(self) -> int: """ Return the index of the global end of the current sequence. """ return self.seq[-1].trange.end if self.seq else 0 - def has_types(self, *ttypes: qmod.TokenType) -> bool: """ Check if the current sequence contains any typed ranges of the given types. """ return any(s.ttype in ttypes for s in self.seq) - def is_final(self) -> bool: """ Return true when the sequence cannot be extended by any form of token anymore. """ # Country and category must be the final term for left-to-right return len(self.seq) > 1 and \ - self.seq[-1].ttype in (qmod.TokenType.COUNTRY, qmod.TokenType.NEAR_ITEM) - + self.seq[-1].ttype in (qmod.TOKEN_COUNTRY, qmod.TOKEN_NEAR_ITEM) def appendable(self, ttype: qmod.TokenType) -> Optional[int]: """ Check if the give token type is appendable to the existing sequence. @@ -119,23 +114,23 @@ class _TokenSequence: new direction of the sequence after adding such a type. The token is not added. """ - if ttype == qmod.TokenType.WORD: + if ttype == qmod.TOKEN_WORD: return None if not self.seq: # Append unconditionally to the empty list - if ttype == qmod.TokenType.COUNTRY: + if ttype == qmod.TOKEN_COUNTRY: return -1 - if ttype in (qmod.TokenType.HOUSENUMBER, qmod.TokenType.QUALIFIER): + if ttype in (qmod.TOKEN_HOUSENUMBER, qmod.TOKEN_QUALIFIER): return 1 return self.direction # Name tokens are always acceptable and don't change direction - if ttype == qmod.TokenType.PARTIAL: + if ttype == qmod.TOKEN_PARTIAL: # qualifiers cannot appear in the middle of the query. They need # to be near the next phrase. if self.direction == -1 \ - and any(t.ttype == qmod.TokenType.QUALIFIER for t in self.seq[:-1]): + and any(t.ttype == qmod.TOKEN_QUALIFIER for t in self.seq[:-1]): return None return self.direction @@ -143,60 +138,59 @@ class _TokenSequence: if self.has_types(ttype): return None - if ttype == qmod.TokenType.HOUSENUMBER: + if ttype == qmod.TOKEN_HOUSENUMBER: if self.direction == 1: - if len(self.seq) == 1 and self.seq[0].ttype == qmod.TokenType.QUALIFIER: + if len(self.seq) == 1 and self.seq[0].ttype == qmod.TOKEN_QUALIFIER: return None if len(self.seq) > 2 \ - or self.has_types(qmod.TokenType.POSTCODE, qmod.TokenType.COUNTRY): - return None # direction left-to-right: housenumber must come before anything - elif self.direction == -1 \ - or self.has_types(qmod.TokenType.POSTCODE, qmod.TokenType.COUNTRY): - return -1 # force direction right-to-left if after other terms + or self.has_types(qmod.TOKEN_POSTCODE, qmod.TOKEN_COUNTRY): + return None # direction left-to-right: housenumber must come before anything + elif (self.direction == -1 + or self.has_types(qmod.TOKEN_POSTCODE, qmod.TOKEN_COUNTRY)): + return -1 # force direction right-to-left if after other terms return self.direction - if ttype == qmod.TokenType.POSTCODE: + if ttype == qmod.TOKEN_POSTCODE: if self.direction == -1: - if self.has_types(qmod.TokenType.HOUSENUMBER, qmod.TokenType.QUALIFIER): + if self.has_types(qmod.TOKEN_HOUSENUMBER, qmod.TOKEN_QUALIFIER): return None return -1 if self.direction == 1: - return None if self.has_types(qmod.TokenType.COUNTRY) else 1 - if self.has_types(qmod.TokenType.HOUSENUMBER, qmod.TokenType.QUALIFIER): + return None if self.has_types(qmod.TOKEN_COUNTRY) else 1 + if self.has_types(qmod.TOKEN_HOUSENUMBER, qmod.TOKEN_QUALIFIER): return 1 return self.direction - if ttype == qmod.TokenType.COUNTRY: + if ttype == qmod.TOKEN_COUNTRY: return None if self.direction == -1 else 1 - if ttype == qmod.TokenType.NEAR_ITEM: + if ttype == qmod.TOKEN_NEAR_ITEM: return self.direction - if ttype == qmod.TokenType.QUALIFIER: + if ttype == qmod.TOKEN_QUALIFIER: if self.direction == 1: if (len(self.seq) == 1 - and self.seq[0].ttype in (qmod.TokenType.PARTIAL, qmod.TokenType.NEAR_ITEM)) \ + and self.seq[0].ttype in (qmod.TOKEN_PARTIAL, qmod.TOKEN_NEAR_ITEM)) \ or (len(self.seq) == 2 - and self.seq[0].ttype == qmod.TokenType.NEAR_ITEM - and self.seq[1].ttype == qmod.TokenType.PARTIAL): + and self.seq[0].ttype == qmod.TOKEN_NEAR_ITEM + and self.seq[1].ttype == qmod.TOKEN_PARTIAL): return 1 return None if self.direction == -1: return -1 - tempseq = self.seq[1:] if self.seq[0].ttype == qmod.TokenType.NEAR_ITEM else self.seq + tempseq = self.seq[1:] if self.seq[0].ttype == qmod.TOKEN_NEAR_ITEM else self.seq if len(tempseq) == 0: return 1 - if len(tempseq) == 1 and self.seq[0].ttype == qmod.TokenType.HOUSENUMBER: + if len(tempseq) == 1 and self.seq[0].ttype == qmod.TOKEN_HOUSENUMBER: return None - if len(tempseq) > 1 or self.has_types(qmod.TokenType.POSTCODE, qmod.TokenType.COUNTRY): + if len(tempseq) > 1 or self.has_types(qmod.TOKEN_POSTCODE, qmod.TOKEN_COUNTRY): return -1 return 0 return None - def advance(self, ttype: qmod.TokenType, end_pos: int, btype: qmod.BreakType) -> Optional['_TokenSequence']: """ Return a new token sequence state with the given token type @@ -211,7 +205,7 @@ class _TokenSequence: new_penalty = 0.0 else: last = self.seq[-1] - if btype != qmod.BreakType.PHRASE and last.ttype == ttype: + if btype != qmod.BREAK_PHRASE and last.ttype == ttype: # extend the existing range newseq = self.seq[:-1] + [TypedRange(ttype, last.trange.replace_end(end_pos))] new_penalty = 0.0 @@ -223,7 +217,6 @@ class _TokenSequence: return _TokenSequence(newseq, newdir, self.penalty + new_penalty) - def _adapt_penalty_from_priors(self, priors: int, new_dir: int) -> bool: if priors >= 2: if self.direction == 0: @@ -236,7 +229,6 @@ class _TokenSequence: return True - def recheck_sequence(self) -> bool: """ Check that the sequence is a fully valid token assignment and adapt direction and penalties further if necessary. @@ -248,25 +240,24 @@ class _TokenSequence: # housenumbers may not be further than 2 words from the beginning. # If there are two words in front, give it a penalty. hnrpos = next((i for i, tr in enumerate(self.seq) - if tr.ttype == qmod.TokenType.HOUSENUMBER), + if tr.ttype == qmod.TOKEN_HOUSENUMBER), None) if hnrpos is not None: if self.direction != -1: - priors = sum(1 for t in self.seq[:hnrpos] if t.ttype == qmod.TokenType.PARTIAL) + priors = sum(1 for t in self.seq[:hnrpos] if t.ttype == qmod.TOKEN_PARTIAL) if not self._adapt_penalty_from_priors(priors, -1): return False if self.direction != 1: - priors = sum(1 for t in self.seq[hnrpos+1:] if t.ttype == qmod.TokenType.PARTIAL) + priors = sum(1 for t in self.seq[hnrpos+1:] if t.ttype == qmod.TOKEN_PARTIAL) if not self._adapt_penalty_from_priors(priors, 1): return False - if any(t.ttype == qmod.TokenType.NEAR_ITEM for t in self.seq): + if any(t.ttype == qmod.TOKEN_NEAR_ITEM for t in self.seq): self.penalty += 1.0 return True - def _get_assignments_postcode(self, base: TokenAssignment, - query_len: int) -> Iterator[TokenAssignment]: + query_len: int) -> Iterator[TokenAssignment]: """ Yield possible assignments of Postcode searches with an address component. """ @@ -278,13 +269,11 @@ class _TokenSequence: #
,