from ..logging import log
from . import query as qmod
-# pylint: disable=too-many-return-statements,too-many-branches
@dataclasses.dataclass
class TypedRange:
# Penalty looked up by the break type of a query node and added to an
# assignment's penalty when a typed range is split at that node.
# Keys are the break-type constants exported by the query module.
PENALTY_TOKENCHANGE = {
    qmod.BREAK_START: 0.0,
    qmod.BREAK_END: 0.0,
    qmod.BREAK_PHRASE: 0.0,
    qmod.BREAK_SOFT_PHRASE: 0.0,
    qmod.BREAK_WORD: 0.1,
    qmod.BREAK_PART: 0.2,
    qmod.BREAK_TOKEN: 0.4
}
TypedRangeSeq = List[TypedRange]
+
@dataclasses.dataclass
-class TokenAssignment: # pylint: disable=too-many-instance-attributes
+class TokenAssignment:
""" Representation of a possible assignment of token types
to the tokens in a tokenized query.
"""
near_item: Optional[qmod.TokenRange] = None
qualifier: Optional[qmod.TokenRange] = None
-
@staticmethod
def from_ranges(ranges: TypedRangeSeq) -> 'TokenAssignment':
    """ Create a new token assignment from a sequence of typed spans.

        Ranges of partial-name type are appended to `address` in the
        order given. House number, postcode, country, near item and
        qualifier each fill a single slot; a later range of the same
        type overwrites an earlier one. Ranges of any other type are
        ignored.
    """
    out = TokenAssignment()
    for token in ranges:
        if token.ttype == qmod.TOKEN_PARTIAL:
            out.address.append(token.trange)
        elif token.ttype == qmod.TOKEN_HOUSENUMBER:
            out.housenumber = token.trange
        elif token.ttype == qmod.TOKEN_POSTCODE:
            out.postcode = token.trange
        elif token.ttype == qmod.TOKEN_COUNTRY:
            out.country = token.trange
        elif token.ttype == qmod.TOKEN_NEAR_ITEM:
            out.near_item = token.trange
        elif token.ttype == qmod.TOKEN_QUALIFIER:
            out.qualifier = token.trange
    return out
self.direction = direction
self.penalty = penalty
-
def __str__(self) -> str:
    """ Return a compact dump of the sequence: one
        `[start - end: type]` entry per typed range, followed by the
        current direction and penalty.
    """
    seq = ''.join(f'[{r.trange.start} - {r.trange.end}: {r.ttype}]' for r in self.seq)
    return f'{seq} (dir: {self.direction}, penalty: {self.penalty})'
-
@property
def end_pos(self) -> int:
    """ Return the index of the global end of the current sequence,
        i.e. the end of the last typed range, or 0 when the sequence
        is still empty.
    """
    return self.seq[-1].trange.end if self.seq else 0
-
def has_types(self, *ttypes: qmod.TokenType) -> bool:
    """ Check if the current sequence contains any typed ranges of
        the given types.

        Returns False when called without types or on an empty sequence.
    """
    return any(s.ttype in ttypes for s in self.seq)
-
def is_final(self) -> bool:
    """ Return true when the sequence cannot be extended by any
        form of token anymore.

        This is the case when the sequence has more than one typed
        range and the last one is a country or a near item.
    """
    # Country and category must be the final term for left-to-right
    return len(self.seq) > 1 and \
        self.seq[-1].ttype in (qmod.TOKEN_COUNTRY, qmod.TOKEN_NEAR_ITEM)
def appendable(self, ttype: qmod.TokenType) -> Optional[int]:
    """ Check if the given token type is appendable to the existing sequence.

        Returns None if the type cannot be appended, otherwise the
        new direction of the sequence after adding such a type. The
        token is not added.

        Per the inline comments below, direction 1 is left-to-right and
        -1 is right-to-left; 0 presumably means "not yet decided"
        (TODO confirm against the class definition).
    """
    if ttype == qmod.TOKEN_WORD:
        return None

    if not self.seq:
        # Append unconditionally to the empty list
        if ttype == qmod.TOKEN_COUNTRY:
            return -1
        if ttype in (qmod.TOKEN_HOUSENUMBER, qmod.TOKEN_QUALIFIER):
            return 1
        return self.direction

    # Name tokens are always acceptable and don't change direction
    if ttype == qmod.TOKEN_PARTIAL:
        # qualifiers cannot appear in the middle of the query. They need
        # to be near the next phrase.
        if self.direction == -1 \
           and any(t.ttype == qmod.TOKEN_QUALIFIER for t in self.seq[:-1]):
            return None
        return self.direction

    # Every remaining type may be present only once in the sequence.
    if self.has_types(ttype):
        return None

    if ttype == qmod.TOKEN_HOUSENUMBER:
        if self.direction == 1:
            if len(self.seq) == 1 and self.seq[0].ttype == qmod.TOKEN_QUALIFIER:
                return None
            if len(self.seq) > 2 \
               or self.has_types(qmod.TOKEN_POSTCODE, qmod.TOKEN_COUNTRY):
                return None  # direction left-to-right: housenumber must come before anything
        elif (self.direction == -1
              or self.has_types(qmod.TOKEN_POSTCODE, qmod.TOKEN_COUNTRY)):
            return -1  # force direction right-to-left if after other terms

        return self.direction

    if ttype == qmod.TOKEN_POSTCODE:
        if self.direction == -1:
            if self.has_types(qmod.TOKEN_HOUSENUMBER, qmod.TOKEN_QUALIFIER):
                return None
            return -1
        if self.direction == 1:
            return None if self.has_types(qmod.TOKEN_COUNTRY) else 1
        # Direction is neither -1 nor 1 here: a house number or qualifier
        # already in the sequence fixes it to 1.
        if self.has_types(qmod.TOKEN_HOUSENUMBER, qmod.TOKEN_QUALIFIER):
            return 1
        return self.direction

    if ttype == qmod.TOKEN_COUNTRY:
        return None if self.direction == -1 else 1

    if ttype == qmod.TOKEN_NEAR_ITEM:
        return self.direction

    if ttype == qmod.TOKEN_QUALIFIER:
        if self.direction == 1:
            # Only accepted directly after a single partial/near-item
            # range or after the pair near item + partial.
            if (len(self.seq) == 1
                and self.seq[0].ttype in (qmod.TOKEN_PARTIAL, qmod.TOKEN_NEAR_ITEM)) \
               or (len(self.seq) == 2
                   and self.seq[0].ttype == qmod.TOKEN_NEAR_ITEM
                   and self.seq[1].ttype == qmod.TOKEN_PARTIAL):
                return 1
            return None
        if self.direction == -1:
            return -1

        # A leading near item is left out when counting preceding ranges.
        tempseq = self.seq[1:] if self.seq[0].ttype == qmod.TOKEN_NEAR_ITEM else self.seq
        if len(tempseq) == 0:
            return 1
        # NOTE(review): this tests self.seq[0], not tempseq[0]. With a
        # leading near item followed by a house number the check cannot
        # match - confirm whether tempseq[0] was intended.
        if len(tempseq) == 1 and self.seq[0].ttype == qmod.TOKEN_HOUSENUMBER:
            return None
        if len(tempseq) > 1 or self.has_types(qmod.TOKEN_POSTCODE, qmod.TOKEN_COUNTRY):
            return -1
        return 0

    return None
-
def advance(self, ttype: qmod.TokenType, end_pos: int,
btype: qmod.BreakType) -> Optional['_TokenSequence']:
""" Return a new token sequence state with the given token type
new_penalty = 0.0
else:
last = self.seq[-1]
- if btype != qmod.BreakType.PHRASE and last.ttype == ttype:
+ if btype != qmod.BREAK_PHRASE and last.ttype == ttype:
# extend the existing range
newseq = self.seq[:-1] + [TypedRange(ttype, last.trange.replace_end(end_pos))]
new_penalty = 0.0
return _TokenSequence(newseq, newdir, self.penalty + new_penalty)
-
def _adapt_penalty_from_priors(self, priors: int, new_dir: int) -> bool:
if priors >= 2:
if self.direction == 0:
return True
-
def recheck_sequence(self) -> bool:
""" Check that the sequence is a fully valid token assignment
and adapt direction and penalties further if necessary.
# housenumbers may not be further than 2 words from the beginning.
# If there are two words in front, give it a penalty.
hnrpos = next((i for i, tr in enumerate(self.seq)
- if tr.ttype == qmod.TokenType.HOUSENUMBER),
+ if tr.ttype == qmod.TOKEN_HOUSENUMBER),
None)
if hnrpos is not None:
if self.direction != -1:
- priors = sum(1 for t in self.seq[:hnrpos] if t.ttype == qmod.TokenType.PARTIAL)
+ priors = sum(1 for t in self.seq[:hnrpos] if t.ttype == qmod.TOKEN_PARTIAL)
if not self._adapt_penalty_from_priors(priors, -1):
return False
if self.direction != 1:
- priors = sum(1 for t in self.seq[hnrpos+1:] if t.ttype == qmod.TokenType.PARTIAL)
+ priors = sum(1 for t in self.seq[hnrpos+1:] if t.ttype == qmod.TOKEN_PARTIAL)
if not self._adapt_penalty_from_priors(priors, 1):
return False
- if any(t.ttype == qmod.TokenType.NEAR_ITEM for t in self.seq):
+ if any(t.ttype == qmod.TOKEN_NEAR_ITEM for t in self.seq):
self.penalty += 1.0
return True
-
def _get_assignments_postcode(self, base: TokenAssignment,
- query_len: int) -> Iterator[TokenAssignment]:
+ query_len: int) -> Iterator[TokenAssignment]:
""" Yield possible assignments of Postcode searches with an
address component.
"""
# <address>,<postcode> should give preference to address search
if base.postcode.start == 0:
penalty = self.penalty
- self.direction = -1 # name searches are only possible backwards
else:
penalty = self.penalty + 0.1
- self.direction = 1 # name searches are only possible forwards
+ penalty += 0.1 * max(0, len(base.address) - 1)
yield dataclasses.replace(base, penalty=penalty)
-
def _get_assignments_address_forward(self, base: TokenAssignment,
query: qmod.QueryStruct) -> Iterator[TokenAssignment]:
""" Yield possible assignments of address searches with
"""
first = base.address[0]
+ # The postcode must come after the name.
+ if base.postcode and base.postcode < first:
+ log().var_dump('skip forward', (base.postcode, first))
+ return
+
log().comment('first word = name')
yield dataclasses.replace(base, penalty=self.penalty,
name=first, address=base.address[1:])
# * the containing phrase is strictly typed
if (base.housenumber and first.end < base.housenumber.start)\
or (base.qualifier and base.qualifier > first)\
- or (query.nodes[first.start].ptype != qmod.PhraseType.NONE):
+ or (query.nodes[first.start].ptype != qmod.PHRASE_ANY):
return
penalty = self.penalty
yield dataclasses.replace(base, name=name, address=[addr] + base.address[1:],
penalty=penalty + PENALTY_TOKENCHANGE[query.nodes[i].btype])
-
def _get_assignments_address_backward(self, base: TokenAssignment,
query: qmod.QueryStruct) -> Iterator[TokenAssignment]:
""" Yield possible assignments of address searches with
"""
last = base.address[-1]
- if self.direction == -1 or len(base.address) > 1:
+ # The postcode must come before the name for backward direction.
+ if base.postcode and base.postcode > last:
+ log().var_dump('skip backward', (base.postcode, last))
+ return
+
+ if self.direction == -1 or len(base.address) > 1 or base.postcode:
log().comment('last word = name')
yield dataclasses.replace(base, penalty=self.penalty,
name=last, address=base.address[:-1])
# * the containing phrase is strictly typed
if (base.housenumber and last.start > base.housenumber.end)\
or (base.qualifier and base.qualifier < last)\
- or (query.nodes[last.start].ptype != qmod.PhraseType.NONE):
+ or (query.nodes[last.start].ptype != qmod.PHRASE_ANY):
return
penalty = self.penalty
yield dataclasses.replace(base, name=name, address=base.address[:-1] + [addr],
penalty=penalty + PENALTY_TOKENCHANGE[query.nodes[i].btype])
-
def get_assignments(self, query: qmod.QueryStruct) -> Iterator[TokenAssignment]:
""" Yield possible assignments for the current sequence.
another. It does not include penalties for transitions within a
type.
"""
- todo = [_TokenSequence([], direction=0 if query.source[0].ptype == qmod.PhraseType.NONE else 1)]
+ todo = [_TokenSequence([], direction=0 if query.source[0].ptype == qmod.PHRASE_ANY else 1)]
while todo:
state = todo.pop()