+
def extract_words(self, base_penalty: float = 0.0,
                  start: int = 0,
                  endpos: Optional[int] = None) -> Dict[str, List[TokenRange]]:
    """ Collect every word that can be built from consecutive terms
        between the nodes `start` and `endpos`.

        A word is the space-joined sequence of the `term_lookup` values
        of adjacent nodes. A word never continues past a node whose
        break type is BREAK_PHRASE and it covers at most 19 terms.
        When `endpos` is None, the number of nodes is used as the end.

        The function returns a dictionary that maps each word to the
        list of token ranges where it appears. The penalty of a range
        is `base_penalty` plus the penalty of every node that lies
        between two terms of the word.
    """
    stop = len(self.nodes) if endpos is None else endpos

    found: Dict[str, List[TokenRange]] = defaultdict(list)

    for begin in range(start, stop - 1):
        head = self.nodes[begin + 1]
        phrase = head.term_lookup
        cost = base_penalty

        # The single term directly following node `begin` is always a word.
        found[phrase].append(TokenRange(begin, begin + 1, penalty=cost))

        # A phrase break right after the first term forbids longer words.
        if head.btype == BREAK_PHRASE:
            continue

        limit = min(begin + 20, stop)
        for end in range(begin + 2, limit):
            tail = self.nodes[end]
            phrase = phrase + ' ' + tail.term_lookup
            # Extending the word crosses the node in front of the new term.
            cost += self.nodes[end - 1].penalty
            found[phrase].append(TokenRange(begin, end, penalty=cost))
            if tail.btype == BREAK_PHRASE:
                break

    return found