X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/023f94b066c11628ecc87ca9876dbe5ec4136776..974bd2d8b205bb4947fa546a95caed088c470bb0:/lib/Phrase.php diff --git a/lib/Phrase.php b/lib/Phrase.php index 23b9e3ca..e2643e87 100644 --- a/lib/Phrase.php +++ b/lib/Phrase.php @@ -9,7 +9,8 @@ namespace Nominatim; */ class Phrase { - CONST MAX_DEPTH = 7; + const MAX_WORDSET_LEN = 20; + const MAX_WORDSETS = 100; // Complete phrase as a string. private $sPhrase; @@ -20,73 +21,140 @@ class Phrase // Possible segmentations of the phrase. private $aWordSets; + public static function cmpByArraylen($aA, $aB) + { + $iALen = count($aA); + $iBLen = count($aB); + + if ($iALen == $iBLen) { + return 0; + } + + return ($iALen < $iBLen) ? -1 : 1; + } + public function __construct($sPhrase, $sPhraseType) { $this->sPhrase = trim($sPhrase); $this->sPhraseType = $sPhraseType; $this->aWords = explode(' ', $this->sPhrase); - $this->aWordSets = $this->createWordSets($this->aWords, 0); } + /** + * Return the element type of the phrase. + * + * @return string Pharse type if the phrase comes from a structured query + * or empty string otherwise. + */ public function getPhraseType() { return $this->sPhraseType; } + /** + * Return the array of possible segmentations of the phrase. + * + * @return string[][] Array of segmentations, each consisting of an + * array of terms. + */ public function getWordSets() { return $this->aWordSets; } + /** + * Add the tokens from this phrase to the given list of tokens. + * + * @param string[] $aTokens List of tokens to append. + * + * @return void + */ public function addTokens(&$aTokens) { - foreach ($this->aWordSets as $aSet) { - foreach ($aSet as $sWord) { - $aTokens[' '.$sWord] = ' '.$sWord; - $aTokens[$sWord] = $sWord; + $iNumWords = count($this->aWords); + + for ($i = 0; $i < $iNumWords; $i++) { + $sPhrase = $this->aWords[$i]; + $aTokens[' '.$sPhrase] = ' '.$sPhrase; + $aTokens[$sPhrase] = $sPhrase; + + for ($j = $i + 1; $j < $iNumWords; $j++) { + $sPhrase .= ' '.$this->aWords[$j]; + $aTokens[' '.$sPhrase] = ' '.$sPhrase; + $aTokens[$sPhrase] = $sPhrase; } } } + /** + * Invert the set of possible segmentations. + * + * @return void + */ public function invertWordSets() { - $this->aWordSets = $this->createInverseWordSets($this->aWords, 0); + foreach ($this->aWordSets as $i => $aSet) { + $this->aWordSets[$i] = array_reverse($aSet); + } } - private function createWordSets($aWords, $iDepth) + public function computeWordSets($oTokens) { - $aResult = array(array(join(' ', $aWords))); - $sFirstToken = ''; - if ($iDepth < Phrase::MAX_DEPTH) { - while (sizeof($aWords) > 1) { - $sWord = array_shift($aWords); - $sFirstToken .= ($sFirstToken?' ':'').$sWord; - $aRest = $this->createWordSets($aWords, $iDepth + 1); - foreach ($aRest as $aSet) { - $aResult[] = array_merge(array($sFirstToken), $aSet); + $iNumWords = count($this->aWords); + // Caches the word set for the partial phrase up to word i. + $aSetCache = array_fill(0, $iNumWords, array()); + + // Initialise first element of cache. There can only be the word. + if ($oTokens->containsAny($this->aWords[0])) { + $aSetCache[0][] = array($this->aWords[0]); + } + + // Now do the next elements using what we already have. + for ($i = 1; $i < $iNumWords; $i++) { + for ($j = $i; $j > 0; $j--) { + $sPartial = $j == $i ? $this->aWords[$j] : $this->aWords[$j].' '.$sPartial; + if (!empty($aSetCache[$j - 1]) && $oTokens->containsAny($sPartial)) { + $aPartial = array($sPartial); + foreach ($aSetCache[$j - 1] as $aSet) { + if (count($aSet) < Phrase::MAX_WORDSET_LEN) { + $aSetCache[$i][] = array_merge($aSet, $aPartial); + } + } + if (count($aSetCache[$i]) > 2 * Phrase::MAX_WORDSETS) { + usort( + $aSetCache[$i], + array('\Nominatim\Phrase', 'cmpByArraylen') + ); + $aSetCache[$i] = array_slice( + $aSetCache[$i], + 0, + Phrase::MAX_WORDSETS + ); + } } } + + // finally the current full phrase + $sPartial = $this->aWords[0].' '.$sPartial; + if ($oTokens->containsAny($sPartial)) { + $aSetCache[$i][] = array($sPartial); + } } - return $aResult; + $this->aWordSets = $aSetCache[$iNumWords - 1]; + usort($this->aWordSets, array('\Nominatim\Phrase', 'cmpByArraylen')); + $this->aWordSets = array_slice($this->aWordSets, 0, Phrase::MAX_WORDSETS); } - public function createInverseWordSets($aWords, $iDepth) - { - $aResult = array(array(join(' ', $aWords))); - $sFirstToken = ''; - if ($iDepth < Phrase::MAX_DEPTH) { - while (sizeof($aWords) > 1) { - $sWord = array_pop($aWords); - $sFirstToken = $sWord.($sFirstToken?' ':'').$sFirstToken; - $aRest = $this->createInverseWordSets($aWords, $iDepth + 1); - foreach ($aRest as $aSet) { - $aResult[] = array_merge(array($sFirstToken), $aSet); - } - } - } - return $aResult; + public function debugInfo() + { + return array( + 'Type' => $this->sPhraseType, + 'Phrase' => $this->sPhrase, + 'Words' => $this->aWords, + 'WordSets' => $this->aWordSets + ); } -}; +}