import dataclasses
import enum
+
class BreakType(enum.Enum):
""" Type of break between tokens.
"""
END = '>'
""" End of the query. """
PHRASE = ','
- """ Break between two phrases. """
+ """ Hard break between two phrases. Address parts cannot cross hard
+ phrase boundaries."""
+ SOFT_PHRASE = ':'
+ """ Likely break between two phrases. Address parts should not cross soft
+ phrase boundaries. Soft breaks can be inserted by a preprocessor
+ that is analysing the input string.
+ """
WORD = ' '
""" Break between words. """
PART = '-'
count: int
addr_count: int
lookup_word: str
- is_indexed: bool
-
@abstractmethod
def get_category(self) -> Tuple[str, str]:
category objects.
"""
+
@dataclasses.dataclass
class TokenRange:
""" Indexes of query nodes over which a token spans.
"""
start: int
end: int
+ penalty: Optional[float] = None
def __lt__(self, other: 'TokenRange') -> bool:
return self.end <= other.start
-
def __le__(self, other: 'TokenRange') -> bool:
return NotImplemented
-
def __gt__(self, other: 'TokenRange') -> bool:
return self.start >= other.end
-
def __ge__(self, other: 'TokenRange') -> bool:
return NotImplemented
-
def replace_start(self, new_start: int) -> 'TokenRange':
""" Return a new token range with the new start.
"""
return TokenRange(new_start, self.end)
-
def replace_end(self, new_end: int) -> 'TokenRange':
""" Return a new token range with the new end.
"""
return TokenRange(self.start, new_end)
-
def split(self, index: int) -> Tuple['TokenRange', 'TokenRange']:
""" Split the span into two spans at the given index.
The index must be within the span.
ttype: TokenType
tokens: List[Token]
-
def add_penalty(self, penalty: float) -> None:
""" Add the given penalty to all tokens in the list.
"""
"""
return any(tl.end == end and tl.ttype in ttypes for tl in self.starting)
-
def get_tokens(self, end: int, ttype: TokenType) -> Optional[List[Token]]:
""" Get the list of tokens of the given type starting at this node
and ending at the node 'end'. Returns 'None' if no such
self.nodes: List[QueryNode] = \
[QueryNode(BreakType.START, source[0].ptype if source else PhraseType.NONE)]
-
def num_token_slots(self) -> int:
    """ Return the number of steps between the nodes of the query,
        which is one less than the number of nodes.
    """
    node_count = len(self.nodes)
    return node_count - 1
-
def add_node(self, btype: BreakType, ptype: PhraseType) -> None:
    """ Create a new break node from the given break type and phrase
        type and put it at the end of the node list.
    """
    new_node = QueryNode(btype, ptype)
    self.nodes.append(new_node)
-
def add_token(self, trange: TokenRange, ttype: TokenType, token: Token) -> None:
""" Add a token to the query. 'start' and 'end' are the indexes of the
nodes from which to which the token spans. The indexes must exist
"""
snode = self.nodes[trange.start]
full_phrase = snode.btype in (BreakType.START, BreakType.PHRASE)\
- and self.nodes[trange.end].btype in (BreakType.PHRASE, BreakType.END)
+ and self.nodes[trange.end].btype in (BreakType.PHRASE, BreakType.END)
if snode.ptype.compatible_with(ttype, full_phrase):
tlist = snode.get_tokens(trange.end, ttype)
if tlist is None:
else:
tlist.append(token)
-
def get_tokens(self, trange: TokenRange, ttype: TokenType) -> List[Token]:
    """ Return the tokens of type 'ttype' that are registered at the
        start node of the range and end at the end node of the range.
        The nodes must exist. When the start node has nothing to
        offer for this end and type, an empty list is returned.
    """
    start_node = self.nodes[trange.start]
    found = start_node.get_tokens(trange.end, ttype)
    return found if found else []
-
def get_partials_list(self, trange: TokenRange) -> List[Token]:
    """ Collect one partial token for every step between the given
        nodes. For each pair of neighbouring nodes in the range, the
        first token of type PARTIAL is taken. Such a token is
        assumed to exist for every step.
    """
    partials = []
    for pos in range(trange.start, trange.end):
        candidates = self.get_tokens(TokenRange(pos, pos + 1), TokenType.PARTIAL)
        # Only the first partial of each single-step range is of interest.
        partials.append(next(iter(candidates)))
    return partials
def iter_token_lists(self) -> Iterator[Tuple[int, QueryNode, TokenList]]:
""" Iterator over all token lists in the query.
for tlist in node.starting:
yield i, node, tlist
-
def find_lookup_word_by_id(self, token: int) -> str:
""" Find the first token with the given token ID and return
its lookup word. Returns 'None' if no such token exists.