$aNewPhraseSearches = array();
$sPhraseType = $bIsStructured ? $oPhrase->getPhraseType() : '';
- foreach ($oPhrase->getWordSets() as $iWordSet => $aWordset) {
- // Too many permutations - too expensive
- if ($iWordSet > 120) break;
-
+ foreach ($oPhrase->getWordSets() as $aWordset) {
$aWordsetSearches = $aSearches;
// Add all words from this wordset
}
}
- Debug::printDebugTable('Phrases', $aPhrases);
Debug::printVar('Tokens', $aTokens);
$oValidTokens = new TokenList();
Debug::printGroupTable('Valid Tokens', $oValidTokens->debugInfo());
+ foreach ($aPhrases as $oPhrase) {
+ $oPhrase->computeWordSets($oValidTokens);
+ }
+ Debug::printDebugTable('Phrases', $aPhrases);
+
Debug::newSection('Search candidates');
$aGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $oValidTokens, $bStructuredPhrases);
*/
class Phrase
{
- const MAX_DEPTH = 7;
+ public const MAX_WORDSET_LEN = 20;
+ public const MAX_WORDSETS = 100;
// Complete phrase as a string.
private $sPhrase;
// Possible segmentations of the phrase.
private $aWordSets;
+ public static function cmpByArraylen($aA, $aB)
+ {
+ $iALen = count($aA);
+ $iBLen = count($aB);
+
+ if ($iALen == $iBLen) {
+ return 0;
+ }
+
+ return ($iALen < $iBLen) ? -1 : 1;
+ }
+
public function __construct($sPhrase, $sPhraseType)
{
$this->sPhrase = trim($sPhrase);
$this->sPhraseType = $sPhraseType;
$this->aWords = explode(' ', $this->sPhrase);
- $this->aWordSets = $this->createWordSets($this->aWords, 0);
}
/**
*/
public function addTokens(&$aTokens)
{
- foreach ($this->aWordSets as $aSet) {
- foreach ($aSet as $sWord) {
- $aTokens[' '.$sWord] = ' '.$sWord;
- $aTokens[$sWord] = $sWord;
+ $iNumWords = count($this->aWords);
+
+ for ($i = 0; $i < $iNumWords; $i++) {
+ $sPhrase = $this->aWords[$i];
+ $aTokens[' '.$sPhrase] = ' '.$sPhrase;
+ $aTokens[$sPhrase] = $sPhrase;
+
+ for ($j = $i + 1; $j < $iNumWords; $j++) {
+ $sPhrase .= ' '.$this->aWords[$j];
+ $aTokens[' '.$sPhrase] = ' '.$sPhrase;
+ $aTokens[$sPhrase] = $sPhrase;
}
}
}
*/
public function invertWordSets()
{
- $this->aWordSets = $this->createInverseWordSets($this->aWords, 0);
+ foreach ($this->aWordSets as $i => $aSet) {
+ $this->aWordSets[$i] = array_reverse($aSet);
+ }
}
- private function createWordSets($aWords, $iDepth)
+ public function computeWordSets($oTokens)
{
- $aResult = array(array(join(' ', $aWords)));
- $sFirstToken = '';
- if ($iDepth < Phrase::MAX_DEPTH) {
- while (count($aWords) > 1) {
- $sWord = array_shift($aWords);
- $sFirstToken .= ($sFirstToken?' ':'').$sWord;
- $aRest = $this->createWordSets($aWords, $iDepth + 1);
- foreach ($aRest as $aSet) {
- $aResult[] = array_merge(array($sFirstToken), $aSet);
- }
- }
- }
+ $iNumWords = count($this->aWords);
+ // Caches the word set for the partial phrase up to word i.
+ $aSetCache = array_fill(0, $iNumWords, array());
- return $aResult;
- }
+ // Initialise first element of cache. There can only be the word.
+ if ($oTokens->containsAny($this->aWords[0])) {
+ $aSetCache[0][] = array($this->aWords[0]);
+ }
- private function createInverseWordSets($aWords, $iDepth)
- {
- $aResult = array(array(join(' ', $aWords)));
- $sFirstToken = '';
- if ($iDepth < Phrase::MAX_DEPTH) {
- while (count($aWords) > 1) {
- $sWord = array_pop($aWords);
- $sFirstToken = $sWord.($sFirstToken?' ':'').$sFirstToken;
- $aRest = $this->createInverseWordSets($aWords, $iDepth + 1);
- foreach ($aRest as $aSet) {
- $aResult[] = array_merge(array($sFirstToken), $aSet);
+ // Now do the next elements using what we already have.
+ for ($i = 1; $i < $iNumWords; $i++) {
+ for ($j = $i; $j > 0; $j--) {
+ $sPartial = $j == $i ? $this->aWords[$j] : $this->aWords[$j].' '.$sPartial;
+ if (!empty($aSetCache[$j - 1]) && $oTokens->containsAny($sPartial)) {
+ $aPartial = array($sPartial);
+ foreach ($aSetCache[$j - 1] as $aSet) {
+ if (count($aSet) < Phrase::MAX_WORDSET_LEN) {
+ $aSetCache[$i][] = array_merge($aSet, $aPartial);
+ }
+ }
+ if (count($aSetCache[$i]) > 2 * Phrase::MAX_WORDSETS) {
+ usort(
+ $aSetCache[$i],
+ array('\Nominatim\Phrase', 'cmpByArraylen')
+ );
+ $aSetCache[$i] = array_slice(
+ $aSetCache[$i],
+ 0,
+ Phrase::MAX_WORDSETS
+ );
+ }
}
}
+
+ // finally the current full phrase
+ $sPartial = $this->aWords[0].' '.$sPartial;
+ if ($oTokens->containsAny($sPartial)) {
+ $aSetCache[$i][] = array($sPartial);
+ }
}
- return $aResult;
+ $this->aWordSets = $aSetCache[$iNumWords - 1];
+ usort($this->aWordSets, array('\Nominatim\Phrase', 'cmpByArraylen'));
+ $this->aWordSets = array_slice($this->aWordSets, 0, Phrase::MAX_WORDSETS);
}
+
public function debugInfo()
{
return array(
return isset($this->aTokens[$sWord]);
}
+ /**
+ * Check if there are partial or full tokens for the given word.
+ *
+ * @param string $sWord Token word to look for.
+ *
+ * @return bool True if there is one or more token for the token word.
+ */
+ public function containsAny($sWord)
+ {
+ return isset($this->aTokens[$sWord]) || isset($this->aTokens[' '.$sWord]);
+ }
+
/**
* Get the list of tokens for the given token word.
*
require_once(CONST_BasePath.'/lib/Phrase.php');
+class TokensFullSet
+{
+ public function containsAny($sTerm)
+ {
+ return true;
+ }
+}
+
+// phpcs:ignore PSR1.Classes.ClassDeclaration.MultipleClasses
+class TokensPartialSet
+{
+ public function __construct($aTokens)
+ {
+ $this->aTokens = array_flip($aTokens);
+ }
+
+ public function containsAny($sTerm)
+ {
+ return isset($this->aTokens[$sTerm]);
+ }
+}
+
+// phpcs:ignore PSR1.Classes.ClassDeclaration.MultipleClasses
class PhraseTest extends \PHPUnit\Framework\TestCase
{
public function testEmptyPhrase()
{
$oPhrase = new Phrase('', '');
+ $oPhrase->computeWordSets(new TokensFullSet());
$this->assertEquals(
array(array('')),
public function testSingleWordPhrase()
{
$oPhrase = new Phrase('a', '');
+ $oPhrase->computeWordSets(new TokensFullSet());
$this->assertEquals(
'(a)',
public function testMultiWordPhrase()
{
$oPhrase = new Phrase('a b', '');
+ $oPhrase->computeWordSets(new TokensFullSet());
$this->assertEquals(
'(a b),(a|b)',
$this->serializeSets($oPhrase->getWordSets())
);
$oPhrase = new Phrase('a b c', '');
+ $oPhrase->computeWordSets(new TokensFullSet());
$this->assertEquals(
- '(a b c),(a|b c),(a|b|c),(a b|c)',
+ '(a b c),(a|b c),(a b|c),(a|b|c)',
$this->serializeSets($oPhrase->getWordSets())
);
$oPhrase = new Phrase('a b c d', '');
+ $oPhrase->computeWordSets(new TokensFullSet());
$this->assertEquals(
- '(a b c d),(a|b c d),(a|b|c d),(a|b|c|d),(a|b c|d),(a b|c d),(a b|c|d),(a b c|d)',
+ '(a b c d),(a b c|d),(a b|c d),(a|b c d),(a b|c|d),(a|b c|d),(a|b|c d),(a|b|c|d)',
$this->serializeSets($oPhrase->getWordSets())
);
}
public function testInverseWordSets()
{
$oPhrase = new Phrase('a b c', '');
+ $oPhrase->computeWordSets(new TokensFullSet());
$oPhrase->invertWordSets();
$this->assertEquals(
- '(a b c),(c|a b),(c|b|a),(b c|a)',
+ '(a b c),(b c|a),(c|a b),(c|b|a)',
$this->serializeSets($oPhrase->getWordSets())
);
}
- public function testMaxDepth()
+ public function testMaxWordSets()
{
$oPhrase = new Phrase(join(' ', array_fill(0, 4, 'a')), '');
+ $oPhrase->computeWordSets(new TokensFullSet());
$this->assertEquals(8, count($oPhrase->getWordSets()));
$oPhrase->invertWordSets();
$this->assertEquals(8, count($oPhrase->getWordSets()));
$oPhrase = new Phrase(join(' ', array_fill(0, 18, 'a')), '');
- $this->assertEquals(41226, count($oPhrase->getWordSets()));
+ $oPhrase->computeWordSets(new TokensFullSet());
+ $this->assertEquals(Phrase::MAX_WORDSETS, count($oPhrase->getWordSets()));
$oPhrase->invertWordSets();
- $this->assertEquals(41226, count($oPhrase->getWordSets()));
+ $this->assertEquals(Phrase::MAX_WORDSETS, count($oPhrase->getWordSets()));
+ }
+
+
+ public function testPartialTokensShortTerm()
+ {
+ $oPhrase = new Phrase('a b c d', '');
+ $oPhrase->computeWordSets(new TokensPartialSet(array('a', 'b', 'd', 'b c', 'b c d')));
+ $this->assertEquals(
+ '(a|b c d),(a|b c|d)',
+ $this->serializeSets($oPhrase->getWordSets())
+ );
+ }
+
+
+ public function testPartialTokensLongTerm()
+ {
+ $oPhrase = new Phrase(join(' ', array_fill(0, 18, 'a')), '');
+ $oPhrase->computeWordSets(new TokensPartialSet(array('a', 'a a a a a')));
+ $this->assertEquals(80, count($oPhrase->getWordSets()));
}
}