6 * A word list creator based on simple splitting by space.
8 * Creates possible permutations of split phrases by finding all combination
9 * of splitting the phrase on space boundaries.
13 const MAX_WORDSET_LEN = 20;
14 const MAX_WORDSETS = 100;
16 // The phrase as a list of simple terms (without spaces).
20 * Create a new word list
22 * @param string sPhrase Phrase to create the word list from. The phrase is
23 * expected to be normalised, so that there are no
26 public function __construct($sPhrase)
28 if (strlen($sPhrase) > 0) {
29 $this->aWords = explode(' ', $sPhrase);
31 $this->aWords = array();
36 * Get all possible tokens that are present in this word list.
38 * @return array The list of string tokens in the word list.
40 public function getTokens()
43 $iNumWords = count($this->aWords);
45 for ($i = 0; $i < $iNumWords; $i++) {
46 $sPhrase = $this->aWords[$i];
47 $aTokens[$sPhrase] = $sPhrase;
49 for ($j = $i + 1; $j < $iNumWords; $j++) {
50 $sPhrase .= ' '.$this->aWords[$j];
51 $aTokens[$sPhrase] = $sPhrase;
59 * Compute all possible permutations of phrase splits that result in
60 * words which are in the token list.
62 public function getWordSets($oTokens)
64 $iNumWords = count($this->aWords);
66 if ($iNumWords == 0) {
70 // Caches the word set for the partial phrase up to word i.
71 $aSetCache = array_fill(0, $iNumWords, array());
73 // Initialise first element of cache. There can only be the word.
74 if ($oTokens->containsAny($this->aWords[0])) {
75 $aSetCache[0][] = array($this->aWords[0]);
78 // Now do the next elements using what we already have.
79 for ($i = 1; $i < $iNumWords; $i++) {
80 for ($j = $i; $j > 0; $j--) {
81 $sPartial = $j == $i ? $this->aWords[$j] : $this->aWords[$j].' '.$sPartial;
82 if (!empty($aSetCache[$j - 1]) && $oTokens->containsAny($sPartial)) {
83 $aPartial = array($sPartial);
84 foreach ($aSetCache[$j - 1] as $aSet) {
85 if (count($aSet) < SimpleWordList::MAX_WORDSET_LEN) {
86 $aSetCache[$i][] = array_merge($aSet, $aPartial);
89 if (count($aSetCache[$i]) > 2 * SimpleWordList::MAX_WORDSETS) {
92 array('\Nominatim\SimpleWordList', 'cmpByArraylen')
94 $aSetCache[$i] = array_slice(
97 SimpleWordList::MAX_WORDSETS
103 // finally the current full phrase
104 $sPartial = $this->aWords[0].' '.$sPartial;
105 if ($oTokens->containsAny($sPartial)) {
106 $aSetCache[$i][] = array($sPartial);
110 $aWordSets = $aSetCache[$iNumWords - 1];
111 usort($aWordSets, array('\Nominatim\SimpleWordList', 'cmpByArraylen'));
112 return array_slice($aWordSets, 0, SimpleWordList::MAX_WORDSETS);
115 public static function cmpByArraylen($aA, $aB)
120 if ($iALen == $iBLen) {
124 return ($iALen < $iBLen) ? -1 : 1;
127 public function debugInfo()
129 return $this->aWords;