*/
class Phrase
{
- const MAX_WORDSET_LEN = 20;
- const MAX_WORDSETS = 100;
-
// Complete phrase as a string.
private $sPhrase;
// Element type for structured searches.
// Possible segmentations of the phrase.
private $aWordSets;
- public static function cmpByArraylen($aA, $aB)
- {
- $iALen = count($aA);
- $iBLen = count($aB);
-
- if ($iALen == $iBLen) {
- return 0;
- }
-
- return ($iALen < $iBLen) ? -1 : 1;
- }
-
-
public function __construct($sPhrase, $sPhraseType)
{
$this->sPhrase = trim($sPhrase);
return $this->sPhraseType;
}
+ public function setWordSets($aWordSets)
+ {
+ $this->aWordSets = $aWordSets;
+ }
+
/**
* Return the array of possible segmentations of the phrase.
*
}
}
- public function computeWordSets($aWords, $oTokens)
- {
- $iNumWords = count($aWords);
-
- if ($iNumWords == 0) {
- $this->aWordSets = null;
- return;
- }
-
- // Caches the word set for the partial phrase up to word i.
- $aSetCache = array_fill(0, $iNumWords, array());
-
- // Initialise first element of cache. There can only be the word.
- if ($oTokens->containsAny($aWords[0])) {
- $aSetCache[0][] = array($aWords[0]);
- }
-
- // Now do the next elements using what we already have.
- for ($i = 1; $i < $iNumWords; $i++) {
- for ($j = $i; $j > 0; $j--) {
- $sPartial = $j == $i ? $aWords[$j] : $aWords[$j].' '.$sPartial;
- if (!empty($aSetCache[$j - 1]) && $oTokens->containsAny($sPartial)) {
- $aPartial = array($sPartial);
- foreach ($aSetCache[$j - 1] as $aSet) {
- if (count($aSet) < Phrase::MAX_WORDSET_LEN) {
- $aSetCache[$i][] = array_merge($aSet, $aPartial);
- }
- }
- if (count($aSetCache[$i]) > 2 * Phrase::MAX_WORDSETS) {
- usort(
- $aSetCache[$i],
- array('\Nominatim\Phrase', 'cmpByArraylen')
- );
- $aSetCache[$i] = array_slice(
- $aSetCache[$i],
- 0,
- Phrase::MAX_WORDSETS
- );
- }
- }
- }
-
- // finally the current full phrase
- $sPartial = $aWords[0].' '.$sPartial;
- if ($oTokens->containsAny($sPartial)) {
- $aSetCache[$i][] = array($sPartial);
- }
- }
-
- $this->aWordSets = $aSetCache[$iNumWords - 1];
- usort($this->aWordSets, array('\Nominatim\Phrase', 'cmpByArraylen'));
- $this->aWordSets = array_slice($this->aWordSets, 0, Phrase::MAX_WORDSETS);
- }
-
-
public function debugInfo()
{
return array(
--- /dev/null
+<?php
+
+namespace Nominatim;
+
+/**
+ * A word list creator based on simple splitting by space.
+ *
+ * Creates possible permutations of split phrases by finding all combination
+ * of splitting the phrase on space boundaries.
+ */
+class SimpleWordList
+{
+ const MAX_WORDSET_LEN = 20;
+ const MAX_WORDSETS = 100;
+
+ // The phrase as a list of simple terms (without spaces).
+ private $aWords;
+
+ /**
+ * Create a new word list
+ *
+ * @param string sPhrase Phrase to create the word list from. The phrase is
+ * expected to be normalised, so that there are no
+ * subsequent spaces.
+ */
+ public function __construct($sPhrase)
+ {
+ if (strlen($sPhrase) > 0) {
+ $this->aWords = explode(' ', $sPhrase);
+ } else {
+ $this->aWords = array();
+ }
+ }
+
+ /**
+ * Get all possible tokens that are present in this word list.
+ *
+ * @return array The list of string tokens in the word list.
+ */
+ public function getTokens()
+ {
+ $aTokens = array();
+ $iNumWords = count($this->aWords);
+
+ for ($i = 0; $i < $iNumWords; $i++) {
+ $sPhrase = $this->aWords[$i];
+ $aTokens[$sPhrase] = $sPhrase;
+
+ for ($j = $i + 1; $j < $iNumWords; $j++) {
+ $sPhrase .= ' '.$this->aWords[$j];
+ $aTokens[$sPhrase] = $sPhrase;
+ }
+ }
+
+ return $aTokens;
+ }
+
+ /**
+ * Compute all possible permutations of phrase splits that result in
+ * words which are in the token list.
+ */
+ public function getWordSets($oTokens)
+ {
+ $iNumWords = count($this->aWords);
+
+ if ($iNumWords == 0) {
+ return null;
+ }
+
+ // Caches the word set for the partial phrase up to word i.
+ $aSetCache = array_fill(0, $iNumWords, array());
+
+ // Initialise first element of cache. There can only be the word.
+ if ($oTokens->containsAny($this->aWords[0])) {
+ $aSetCache[0][] = array($this->aWords[0]);
+ }
+
+ // Now do the next elements using what we already have.
+ for ($i = 1; $i < $iNumWords; $i++) {
+ for ($j = $i; $j > 0; $j--) {
+ $sPartial = $j == $i ? $this->aWords[$j] : $this->aWords[$j].' '.$sPartial;
+ if (!empty($aSetCache[$j - 1]) && $oTokens->containsAny($sPartial)) {
+ $aPartial = array($sPartial);
+ foreach ($aSetCache[$j - 1] as $aSet) {
+ if (count($aSet) < SimpleWordList::MAX_WORDSET_LEN) {
+ $aSetCache[$i][] = array_merge($aSet, $aPartial);
+ }
+ }
+ if (count($aSetCache[$i]) > 2 * SimpleWordList::MAX_WORDSETS) {
+ usort(
+ $aSetCache[$i],
+ array('\Nominatim\SimpleWordList', 'cmpByArraylen')
+ );
+ $aSetCache[$i] = array_slice(
+ $aSetCache[$i],
+ 0,
+ SimpleWordList::MAX_WORDSETS
+ );
+ }
+ }
+ }
+
+ // finally the current full phrase
+ $sPartial = $this->aWords[0].' '.$sPartial;
+ if ($oTokens->containsAny($sPartial)) {
+ $aSetCache[$i][] = array($sPartial);
+ }
+ }
+
+ $aWordSets = $aSetCache[$iNumWords - 1];
+ usort($aWordSets, array('\Nominatim\SimpleWordList', 'cmpByArraylen'));
+ return array_slice($aWordSets, 0, SimpleWordList::MAX_WORDSETS);
+ }
+
+ public static function cmpByArraylen($aA, $aB)
+ {
+ $iALen = count($aA);
+ $iBLen = count($aB);
+
+ if ($iALen == $iBLen) {
+ return 0;
+ }
+
+ return ($iALen < $iBLen) ? -1 : 1;
+ }
+
+ public function debugInfo()
+ {
+ return $this->aWords;
+ }
+}
namespace Nominatim;
+require_once(CONST_LibDir.'/SimpleWordList.php');
+
class Tokenizer
{
private $oDB;
$sNormQuery .= ','.$this->normalizeString($oPhrase->getPhrase());
$sPhrase = $this->makeStandardWord($oPhrase->getPhrase());
Debug::printVar('Phrase', $sPhrase);
- if (strlen($sPhrase) > 0) {
- $aWords = explode(' ', $sPhrase);
- Tokenizer::addTokens($aTokens, $aWords);
- $aWordLists[] = $aWords;
- } else {
- $aWordLists[] = array();
- }
+
+ $oWordList = new SimpleWordList($sPhrase);
+ $aTokens = array_merge($aTokens, $oWordList->getTokens());
+ $aWordLists[] = $oWordList;
}
Debug::printVar('Tokens', $aTokens);
$oValidTokens = $this->computeValidTokens($aTokens, $sNormQuery);
foreach ($aPhrases as $iPhrase => $oPhrase) {
- $oPhrase->computeWordSets($aWordLists[$iPhrase], $oValidTokens);
+ $oPhrase->setWordSets($aWordLists[$iPhrase]->getWordSets($oValidTokens));
}
return $oValidTokens;
}
}
}
-
-
- /**
- * Add the tokens from this phrase to the given list of tokens.
- *
- * @param string[] $aTokens List of tokens to append.
- *
- * @return void
- */
- private static function addTokens(&$aTokens, $aWords)
- {
- $iNumWords = count($aWords);
-
- for ($i = 0; $i < $iNumWords; $i++) {
- $sPhrase = $aWords[$i];
- $aTokens[$sPhrase] = $sPhrase;
-
- for ($j = $i + 1; $j < $iNumWords; $j++) {
- $sPhrase .= ' '.$aWords[$j];
- $aTokens[$sPhrase] = $sPhrase;
- }
- }
- }
}
namespace Nominatim;
+require_once(CONST_LibDir.'/SimpleWordList.php');
+
class Tokenizer
{
private $oDB;
$aWordLists = array();
$aTokens = array();
foreach ($aNormPhrases as $sPhrase) {
- if (strlen($sPhrase) > 0) {
- $aWords = explode(' ', $sPhrase);
- Tokenizer::addTokens($aTokens, $aWords);
- $aWordLists[] = $aWords;
- } else {
- $aWordLists[] = array();
+ $oWordList = new SimpleWordList($sPhrase);
+
+ foreach ($oWordList->getTokens() as $sToken) {
+ $aTokens[' '.$sToken] = ' '.$sToken;
+ $aTokens[$sToken] = $sToken;
}
+
+ $aWordLists[] = $oWordList;
}
Debug::printVar('Tokens', $aTokens);
$oValidTokens = $this->computeValidTokens($aTokens, $sNormQuery);
foreach ($aPhrases as $iPhrase => $oPhrase) {
- $oPhrase->computeWordSets($aWordLists[$iPhrase], $oValidTokens);
+ $oPhrase->setWordSets($aWordLists[$iPhrase]->getWordSets($oValidTokens));
}
return $oValidTokens;
}
}
}
-
-
- /**
- * Add the tokens from this phrase to the given list of tokens.
- *
- * @param string[] $aTokens List of tokens to append.
- *
- * @return void
- */
- private static function addTokens(&$aTokens, $aWords)
- {
- $iNumWords = count($aWords);
-
- for ($i = 0; $i < $iNumWords; $i++) {
- $sPhrase = $aWords[$i];
- $aTokens[' '.$sPhrase] = ' '.$sPhrase;
- $aTokens[$sPhrase] = $sPhrase;
-
- for ($j = $i + 1; $j < $iNumWords; $j++) {
- $sPhrase .= ' '.$aWords[$j];
- $aTokens[' '.$sPhrase] = ' '.$sPhrase;
- $aTokens[$sPhrase] = $sPhrase;
- }
- }
- }
}
+++ /dev/null
-<?php
-
-namespace Nominatim;
-
-require_once(CONST_LibDir.'/Phrase.php');
-
-class TokensFullSet
-{
- public function containsAny($sTerm)
- {
- return true;
- }
-}
-
-// phpcs:ignore PSR1.Classes.ClassDeclaration.MultipleClasses
-class TokensPartialSet
-{
- public function __construct($aTokens)
- {
- $this->aTokens = array_flip($aTokens);
- }
-
- public function containsAny($sTerm)
- {
- return isset($this->aTokens[$sTerm]);
- }
-}
-
-// phpcs:ignore PSR1.Classes.ClassDeclaration.MultipleClasses
-class PhraseTest extends \PHPUnit\Framework\TestCase
-{
-
-
- private function serializeSets($aSets)
- {
- $aParts = array();
- foreach ($aSets as $aSet) {
- $aParts[] = '(' . join('|', $aSet) . ')';
- }
- return join(',', $aParts);
- }
-
-
- public function testEmptyPhrase()
- {
- $oPhrase = new Phrase('', '');
- $oPhrase->computeWordSets(array(), new TokensFullSet());
-
- $this->assertNull($oPhrase->getWordSets());
- }
-
-
- public function testSingleWordPhrase()
- {
- $oPhrase = new Phrase('a', '');
- $oPhrase->computeWordSets(array('a'), new TokensFullSet());
-
- $this->assertEquals(
- '(a)',
- $this->serializeSets($oPhrase->getWordSets())
- );
- }
-
-
- public function testMultiWordPhrase()
- {
- $oPhrase = new Phrase('a b', '');
- $oPhrase->computeWordSets(array('a', 'b'), new TokensFullSet());
- $this->assertEquals(
- '(a b),(a|b)',
- $this->serializeSets($oPhrase->getWordSets())
- );
-
- $oPhrase = new Phrase('a b c', '');
- $oPhrase->computeWordSets(array('a', 'b', 'c'), new TokensFullSet());
- $this->assertEquals(
- '(a b c),(a|b c),(a b|c),(a|b|c)',
- $this->serializeSets($oPhrase->getWordSets())
- );
-
- $oPhrase = new Phrase('a b c d', '');
- $oPhrase->computeWordSets(array('a', 'b', 'c', 'd'), new TokensFullSet());
- $this->assertEquals(
- '(a b c d),(a b c|d),(a b|c d),(a|b c d),(a b|c|d),(a|b c|d),(a|b|c d),(a|b|c|d)',
- $this->serializeSets($oPhrase->getWordSets())
- );
- }
-
-
- public function testInverseWordSets()
- {
- $oPhrase = new Phrase('a b c', '');
- $oPhrase->computeWordSets(array('a', 'b', 'c'), new TokensFullSet());
- $oPhrase->invertWordSets();
-
- $this->assertEquals(
- '(a b c),(b c|a),(c|a b),(c|b|a)',
- $this->serializeSets($oPhrase->getWordSets())
- );
- }
-
-
- public function testMaxWordSets()
- {
- $aWords = array_fill(0, 4, 'a');
- $oPhrase = new Phrase(join(' ', $aWords), '');
- $oPhrase->computeWordSets($aWords, new TokensFullSet());
- $this->assertEquals(8, count($oPhrase->getWordSets()));
- $oPhrase->invertWordSets();
- $this->assertEquals(8, count($oPhrase->getWordSets()));
-
- $aWords = array_fill(0, 18, 'a');
- $oPhrase = new Phrase(join(' ', $aWords), '');
- $oPhrase->computeWordSets($aWords, new TokensFullSet());
- $this->assertEquals(100, count($oPhrase->getWordSets()));
- $oPhrase->invertWordSets();
- $this->assertEquals(100, count($oPhrase->getWordSets()));
- }
-
-
- public function testPartialTokensShortTerm()
- {
- $oPhrase = new Phrase('a b c d', '');
- $oPhrase->computeWordSets(array('a', 'b', 'c', 'd'), new TokensPartialSet(array('a', 'b', 'd', 'b c', 'b c d')));
- $this->assertEquals(
- '(a|b c d),(a|b c|d)',
- $this->serializeSets($oPhrase->getWordSets())
- );
- }
-
-
- public function testPartialTokensLongTerm()
- {
- $aWords = array_fill(0, 18, 'a');
- $oPhrase = new Phrase(join(' ', $aWords), '');
- $oPhrase->computeWordSets($aWords, new TokensPartialSet(array('a', 'a a a a a')));
- $this->assertEquals(80, count($oPhrase->getWordSets()));
- }
-}
--- /dev/null
+<?php
+
+namespace Nominatim;
+
+require_once(CONST_LibDir.'/SimpleWordList.php');
+
+class TokensFullSet
+{
+ public function containsAny($sTerm)
+ {
+ return true;
+ }
+}
+
+// phpcs:ignore PSR1.Classes.ClassDeclaration.MultipleClasses
+class TokensPartialSet
+{
+ public function __construct($aTokens)
+ {
+ $this->aTokens = array_flip($aTokens);
+ }
+
+ public function containsAny($sTerm)
+ {
+ return isset($this->aTokens[$sTerm]);
+ }
+}
+
+// phpcs:ignore PSR1.Classes.ClassDeclaration.MultipleClasses
+class SimpleWordListTest extends \PHPUnit\Framework\TestCase
+{
+
+
+ private function serializeSets($aSets)
+ {
+ $aParts = array();
+ foreach ($aSets as $aSet) {
+ $aParts[] = '(' . join('|', $aSet) . ')';
+ }
+ return join(',', $aParts);
+ }
+
+
+ public function testEmptyPhrase()
+ {
+ $oList = new SimpleWordList('');
+ $this->assertNull($oList->getWordSets(new TokensFullSet()));
+ }
+
+
+ public function testSingleWordPhrase()
+ {
+ $oList = new SimpleWordList('a');
+
+ $this->assertEquals(
+ '(a)',
+ $this->serializeSets($oList->getWordSets(new TokensFullSet()))
+ );
+ }
+
+
+ public function testMultiWordPhrase()
+ {
+ $oList = new SimpleWordList('a b');
+ $this->assertEquals(
+ '(a b),(a|b)',
+ $this->serializeSets($oList->getWordSets(new TokensFullSet()))
+ );
+
+ $oList = new SimpleWordList('a b c');
+ $this->assertEquals(
+ '(a b c),(a|b c),(a b|c),(a|b|c)',
+ $this->serializeSets($oList->getWordSets(new TokensFullSet()))
+ );
+
+ $oList = new SimpleWordList('a b c d');
+ $this->assertEquals(
+ '(a b c d),(a b c|d),(a b|c d),(a|b c d),(a b|c|d),(a|b c|d),(a|b|c d),(a|b|c|d)',
+ $this->serializeSets($oList->getWordSets(new TokensFullSet()))
+ );
+ }
+
+
+ public function testMaxWordSets()
+ {
+ $aWords = array_fill(0, 4, 'a');
+ $oList = new SimpleWordList(join(' ', $aWords));
+ $this->assertEquals(8, count($oList->getWordSets(new TokensFullSet())));
+
+ $aWords = array_fill(0, 18, 'a');
+ $oList = new SimpleWordList(join(' ', $aWords));
+ $this->assertEquals(100, count($oList->getWordSets(new TokensFullSet())));
+ }
+
+
+ public function testPartialTokensShortTerm()
+ {
+ $oList = new SimpleWordList('a b c d');
+ $this->assertEquals(
+ '(a|b c d),(a|b c|d)',
+ $this->serializeSets($oList->getWordSets(new TokensPartialSet(array('a', 'b', 'd', 'b c', 'b c d'))))
+ );
+ }
+
+
+ public function testPartialTokensLongTerm()
+ {
+ $aWords = array_fill(0, 18, 'a');
+ $oList = new SimpleWordList(join(' ', $aWords));
+ $this->assertEquals(80, count($oList->getWordSets(new TokensPartialSet(array('a', 'a a a a a')))));
+ }
+}