From 1147b83b22a073db2f81ea177f9caa12e727e249 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Thu, 12 Aug 2021 11:09:46 +0200 Subject: [PATCH] php: make word list a first-class object This separates the logic of creating word sets from the Phrase class. A tokenizer may now derived the word sets any way they like. The SimpleWordList class provides a standard implementation for splitting phrases on spaces. --- lib-php/Phrase.php | 76 +---------- lib-php/SimpleWordList.php | 131 +++++++++++++++++++ lib-php/tokenizer/legacy_icu_tokenizer.php | 38 ++---- lib-php/tokenizer/legacy_tokenizer.php | 42 ++----- test/php/Nominatim/PhraseTest.php | 139 --------------------- test/php/Nominatim/SimpleWordListTest.php | 112 +++++++++++++++++ 6 files changed, 265 insertions(+), 273 deletions(-) create mode 100644 lib-php/SimpleWordList.php delete mode 100644 test/php/Nominatim/PhraseTest.php create mode 100644 test/php/Nominatim/SimpleWordListTest.php diff --git a/lib-php/Phrase.php b/lib-php/Phrase.php index d14c842d..cdde6134 100644 --- a/lib-php/Phrase.php +++ b/lib-php/Phrase.php @@ -9,9 +9,6 @@ namespace Nominatim; */ class Phrase { - const MAX_WORDSET_LEN = 20; - const MAX_WORDSETS = 100; - // Complete phrase as a string. private $sPhrase; // Element type for structured searches. @@ -19,19 +16,6 @@ class Phrase // Possible segmentations of the phrase. private $aWordSets; - public static function cmpByArraylen($aA, $aB) - { - $iALen = count($aA); - $iBLen = count($aB); - - if ($iALen == $iBLen) { - return 0; - } - - return ($iALen < $iBLen) ? -1 : 1; - } - - public function __construct($sPhrase, $sPhraseType) { $this->sPhrase = trim($sPhrase); @@ -57,6 +41,11 @@ class Phrase return $this->sPhraseType; } + public function setWordSets($aWordSets) + { + $this->aWordSets = $aWordSets; + } + /** * Return the array of possible segmentations of the phrase. * @@ -80,61 +69,6 @@ class Phrase } } - public function computeWordSets($aWords, $oTokens) - { - $iNumWords = count($aWords); - - if ($iNumWords == 0) { - $this->aWordSets = null; - return; - } - - // Caches the word set for the partial phrase up to word i. - $aSetCache = array_fill(0, $iNumWords, array()); - - // Initialise first element of cache. There can only be the word. - if ($oTokens->containsAny($aWords[0])) { - $aSetCache[0][] = array($aWords[0]); - } - - // Now do the next elements using what we already have. - for ($i = 1; $i < $iNumWords; $i++) { - for ($j = $i; $j > 0; $j--) { - $sPartial = $j == $i ? $aWords[$j] : $aWords[$j].' '.$sPartial; - if (!empty($aSetCache[$j - 1]) && $oTokens->containsAny($sPartial)) { - $aPartial = array($sPartial); - foreach ($aSetCache[$j - 1] as $aSet) { - if (count($aSet) < Phrase::MAX_WORDSET_LEN) { - $aSetCache[$i][] = array_merge($aSet, $aPartial); - } - } - if (count($aSetCache[$i]) > 2 * Phrase::MAX_WORDSETS) { - usort( - $aSetCache[$i], - array('\Nominatim\Phrase', 'cmpByArraylen') - ); - $aSetCache[$i] = array_slice( - $aSetCache[$i], - 0, - Phrase::MAX_WORDSETS - ); - } - } - } - - // finally the current full phrase - $sPartial = $aWords[0].' '.$sPartial; - if ($oTokens->containsAny($sPartial)) { - $aSetCache[$i][] = array($sPartial); - } - } - - $this->aWordSets = $aSetCache[$iNumWords - 1]; - usort($this->aWordSets, array('\Nominatim\Phrase', 'cmpByArraylen')); - $this->aWordSets = array_slice($this->aWordSets, 0, Phrase::MAX_WORDSETS); - } - - public function debugInfo() { return array( diff --git a/lib-php/SimpleWordList.php b/lib-php/SimpleWordList.php new file mode 100644 index 00000000..3dd9bda1 --- /dev/null +++ b/lib-php/SimpleWordList.php @@ -0,0 +1,131 @@ + 0) { + $this->aWords = explode(' ', $sPhrase); + } else { + $this->aWords = array(); + } + } + + /** + * Get all possible tokens that are present in this word list. + * + * @return array The list of string tokens in the word list. + */ + public function getTokens() + { + $aTokens = array(); + $iNumWords = count($this->aWords); + + for ($i = 0; $i < $iNumWords; $i++) { + $sPhrase = $this->aWords[$i]; + $aTokens[$sPhrase] = $sPhrase; + + for ($j = $i + 1; $j < $iNumWords; $j++) { + $sPhrase .= ' '.$this->aWords[$j]; + $aTokens[$sPhrase] = $sPhrase; + } + } + + return $aTokens; + } + + /** + * Compute all possible permutations of phrase splits that result in + * words which are in the token list. + */ + public function getWordSets($oTokens) + { + $iNumWords = count($this->aWords); + + if ($iNumWords == 0) { + return null; + } + + // Caches the word set for the partial phrase up to word i. + $aSetCache = array_fill(0, $iNumWords, array()); + + // Initialise first element of cache. There can only be the word. + if ($oTokens->containsAny($this->aWords[0])) { + $aSetCache[0][] = array($this->aWords[0]); + } + + // Now do the next elements using what we already have. + for ($i = 1; $i < $iNumWords; $i++) { + for ($j = $i; $j > 0; $j--) { + $sPartial = $j == $i ? $this->aWords[$j] : $this->aWords[$j].' '.$sPartial; + if (!empty($aSetCache[$j - 1]) && $oTokens->containsAny($sPartial)) { + $aPartial = array($sPartial); + foreach ($aSetCache[$j - 1] as $aSet) { + if (count($aSet) < SimpleWordList::MAX_WORDSET_LEN) { + $aSetCache[$i][] = array_merge($aSet, $aPartial); + } + } + if (count($aSetCache[$i]) > 2 * SimpleWordList::MAX_WORDSETS) { + usort( + $aSetCache[$i], + array('\Nominatim\SimpleWordList', 'cmpByArraylen') + ); + $aSetCache[$i] = array_slice( + $aSetCache[$i], + 0, + SimpleWordList::MAX_WORDSETS + ); + } + } + } + + // finally the current full phrase + $sPartial = $this->aWords[0].' '.$sPartial; + if ($oTokens->containsAny($sPartial)) { + $aSetCache[$i][] = array($sPartial); + } + } + + $aWordSets = $aSetCache[$iNumWords - 1]; + usort($aWordSets, array('\Nominatim\SimpleWordList', 'cmpByArraylen')); + return array_slice($aWordSets, 0, SimpleWordList::MAX_WORDSETS); + } + + public static function cmpByArraylen($aA, $aB) + { + $iALen = count($aA); + $iBLen = count($aB); + + if ($iALen == $iBLen) { + return 0; + } + + return ($iALen < $iBLen) ? -1 : 1; + } + + public function debugInfo() + { + return $this->aWords; + } +} diff --git a/lib-php/tokenizer/legacy_icu_tokenizer.php b/lib-php/tokenizer/legacy_icu_tokenizer.php index 690ef136..ca224a22 100644 --- a/lib-php/tokenizer/legacy_icu_tokenizer.php +++ b/lib-php/tokenizer/legacy_icu_tokenizer.php @@ -2,6 +2,8 @@ namespace Nominatim; +require_once(CONST_LibDir.'/SimpleWordList.php'); + class Tokenizer { private $oDB; @@ -81,13 +83,10 @@ class Tokenizer $sNormQuery .= ','.$this->normalizeString($oPhrase->getPhrase()); $sPhrase = $this->makeStandardWord($oPhrase->getPhrase()); Debug::printVar('Phrase', $sPhrase); - if (strlen($sPhrase) > 0) { - $aWords = explode(' ', $sPhrase); - Tokenizer::addTokens($aTokens, $aWords); - $aWordLists[] = $aWords; - } else { - $aWordLists[] = array(); - } + + $oWordList = new SimpleWordList($sPhrase); + $aTokens = array_merge($aTokens, $oWordList->getTokens()); + $aWordLists[] = $oWordList; } Debug::printVar('Tokens', $aTokens); @@ -96,7 +95,7 @@ class Tokenizer $oValidTokens = $this->computeValidTokens($aTokens, $sNormQuery); foreach ($aPhrases as $iPhrase => $oPhrase) { - $oPhrase->computeWordSets($aWordLists[$iPhrase], $oValidTokens); + $oPhrase->setWordSets($aWordLists[$iPhrase]->getWordSets($oValidTokens)); } return $oValidTokens; @@ -210,27 +209,4 @@ class Tokenizer } } } - - - /** - * Add the tokens from this phrase to the given list of tokens. - * - * @param string[] $aTokens List of tokens to append. - * - * @return void - */ - private static function addTokens(&$aTokens, $aWords) - { - $iNumWords = count($aWords); - - for ($i = 0; $i < $iNumWords; $i++) { - $sPhrase = $aWords[$i]; - $aTokens[$sPhrase] = $sPhrase; - - for ($j = $i + 1; $j < $iNumWords; $j++) { - $sPhrase .= ' '.$aWords[$j]; - $aTokens[$sPhrase] = $sPhrase; - } - } - } } diff --git a/lib-php/tokenizer/legacy_tokenizer.php b/lib-php/tokenizer/legacy_tokenizer.php index 6760057d..e5ffbe02 100644 --- a/lib-php/tokenizer/legacy_tokenizer.php +++ b/lib-php/tokenizer/legacy_tokenizer.php @@ -2,6 +2,8 @@ namespace Nominatim; +require_once(CONST_LibDir.'/SimpleWordList.php'); + class Tokenizer { private $oDB; @@ -99,13 +101,14 @@ class Tokenizer $aWordLists = array(); $aTokens = array(); foreach ($aNormPhrases as $sPhrase) { - if (strlen($sPhrase) > 0) { - $aWords = explode(' ', $sPhrase); - Tokenizer::addTokens($aTokens, $aWords); - $aWordLists[] = $aWords; - } else { - $aWordLists[] = array(); + $oWordList = new SimpleWordList($sPhrase); + + foreach ($oWordList->getTokens() as $sToken) { + $aTokens[' '.$sToken] = ' '.$sToken; + $aTokens[$sToken] = $sToken; } + + $aWordLists[] = $oWordList; } Debug::printVar('Tokens', $aTokens); @@ -114,7 +117,7 @@ class Tokenizer $oValidTokens = $this->computeValidTokens($aTokens, $sNormQuery); foreach ($aPhrases as $iPhrase => $oPhrase) { - $oPhrase->computeWordSets($aWordLists[$iPhrase], $oValidTokens); + $oPhrase->setWordSets($aWordLists[$iPhrase]->getWordSets($oValidTokens)); } return $oValidTokens; @@ -226,29 +229,4 @@ class Tokenizer } } } - - - /** - * Add the tokens from this phrase to the given list of tokens. - * - * @param string[] $aTokens List of tokens to append. - * - * @return void - */ - private static function addTokens(&$aTokens, $aWords) - { - $iNumWords = count($aWords); - - for ($i = 0; $i < $iNumWords; $i++) { - $sPhrase = $aWords[$i]; - $aTokens[' '.$sPhrase] = ' '.$sPhrase; - $aTokens[$sPhrase] = $sPhrase; - - for ($j = $i + 1; $j < $iNumWords; $j++) { - $sPhrase .= ' '.$aWords[$j]; - $aTokens[' '.$sPhrase] = ' '.$sPhrase; - $aTokens[$sPhrase] = $sPhrase; - } - } - } } diff --git a/test/php/Nominatim/PhraseTest.php b/test/php/Nominatim/PhraseTest.php deleted file mode 100644 index e4c2bbd1..00000000 --- a/test/php/Nominatim/PhraseTest.php +++ /dev/null @@ -1,139 +0,0 @@ -aTokens = array_flip($aTokens); - } - - public function containsAny($sTerm) - { - return isset($this->aTokens[$sTerm]); - } -} - -// phpcs:ignore PSR1.Classes.ClassDeclaration.MultipleClasses -class PhraseTest extends \PHPUnit\Framework\TestCase -{ - - - private function serializeSets($aSets) - { - $aParts = array(); - foreach ($aSets as $aSet) { - $aParts[] = '(' . join('|', $aSet) . ')'; - } - return join(',', $aParts); - } - - - public function testEmptyPhrase() - { - $oPhrase = new Phrase('', ''); - $oPhrase->computeWordSets(array(), new TokensFullSet()); - - $this->assertNull($oPhrase->getWordSets()); - } - - - public function testSingleWordPhrase() - { - $oPhrase = new Phrase('a', ''); - $oPhrase->computeWordSets(array('a'), new TokensFullSet()); - - $this->assertEquals( - '(a)', - $this->serializeSets($oPhrase->getWordSets()) - ); - } - - - public function testMultiWordPhrase() - { - $oPhrase = new Phrase('a b', ''); - $oPhrase->computeWordSets(array('a', 'b'), new TokensFullSet()); - $this->assertEquals( - '(a b),(a|b)', - $this->serializeSets($oPhrase->getWordSets()) - ); - - $oPhrase = new Phrase('a b c', ''); - $oPhrase->computeWordSets(array('a', 'b', 'c'), new TokensFullSet()); - $this->assertEquals( - '(a b c),(a|b c),(a b|c),(a|b|c)', - $this->serializeSets($oPhrase->getWordSets()) - ); - - $oPhrase = new Phrase('a b c d', ''); - $oPhrase->computeWordSets(array('a', 'b', 'c', 'd'), new TokensFullSet()); - $this->assertEquals( - '(a b c d),(a b c|d),(a b|c d),(a|b c d),(a b|c|d),(a|b c|d),(a|b|c d),(a|b|c|d)', - $this->serializeSets($oPhrase->getWordSets()) - ); - } - - - public function testInverseWordSets() - { - $oPhrase = new Phrase('a b c', ''); - $oPhrase->computeWordSets(array('a', 'b', 'c'), new TokensFullSet()); - $oPhrase->invertWordSets(); - - $this->assertEquals( - '(a b c),(b c|a),(c|a b),(c|b|a)', - $this->serializeSets($oPhrase->getWordSets()) - ); - } - - - public function testMaxWordSets() - { - $aWords = array_fill(0, 4, 'a'); - $oPhrase = new Phrase(join(' ', $aWords), ''); - $oPhrase->computeWordSets($aWords, new TokensFullSet()); - $this->assertEquals(8, count($oPhrase->getWordSets())); - $oPhrase->invertWordSets(); - $this->assertEquals(8, count($oPhrase->getWordSets())); - - $aWords = array_fill(0, 18, 'a'); - $oPhrase = new Phrase(join(' ', $aWords), ''); - $oPhrase->computeWordSets($aWords, new TokensFullSet()); - $this->assertEquals(100, count($oPhrase->getWordSets())); - $oPhrase->invertWordSets(); - $this->assertEquals(100, count($oPhrase->getWordSets())); - } - - - public function testPartialTokensShortTerm() - { - $oPhrase = new Phrase('a b c d', ''); - $oPhrase->computeWordSets(array('a', 'b', 'c', 'd'), new TokensPartialSet(array('a', 'b', 'd', 'b c', 'b c d'))); - $this->assertEquals( - '(a|b c d),(a|b c|d)', - $this->serializeSets($oPhrase->getWordSets()) - ); - } - - - public function testPartialTokensLongTerm() - { - $aWords = array_fill(0, 18, 'a'); - $oPhrase = new Phrase(join(' ', $aWords), ''); - $oPhrase->computeWordSets($aWords, new TokensPartialSet(array('a', 'a a a a a'))); - $this->assertEquals(80, count($oPhrase->getWordSets())); - } -} diff --git a/test/php/Nominatim/SimpleWordListTest.php b/test/php/Nominatim/SimpleWordListTest.php new file mode 100644 index 00000000..5c993668 --- /dev/null +++ b/test/php/Nominatim/SimpleWordListTest.php @@ -0,0 +1,112 @@ +aTokens = array_flip($aTokens); + } + + public function containsAny($sTerm) + { + return isset($this->aTokens[$sTerm]); + } +} + +// phpcs:ignore PSR1.Classes.ClassDeclaration.MultipleClasses +class SimpleWordListTest extends \PHPUnit\Framework\TestCase +{ + + + private function serializeSets($aSets) + { + $aParts = array(); + foreach ($aSets as $aSet) { + $aParts[] = '(' . join('|', $aSet) . ')'; + } + return join(',', $aParts); + } + + + public function testEmptyPhrase() + { + $oList = new SimpleWordList(''); + $this->assertNull($oList->getWordSets(new TokensFullSet())); + } + + + public function testSingleWordPhrase() + { + $oList = new SimpleWordList('a'); + + $this->assertEquals( + '(a)', + $this->serializeSets($oList->getWordSets(new TokensFullSet())) + ); + } + + + public function testMultiWordPhrase() + { + $oList = new SimpleWordList('a b'); + $this->assertEquals( + '(a b),(a|b)', + $this->serializeSets($oList->getWordSets(new TokensFullSet())) + ); + + $oList = new SimpleWordList('a b c'); + $this->assertEquals( + '(a b c),(a|b c),(a b|c),(a|b|c)', + $this->serializeSets($oList->getWordSets(new TokensFullSet())) + ); + + $oList = new SimpleWordList('a b c d'); + $this->assertEquals( + '(a b c d),(a b c|d),(a b|c d),(a|b c d),(a b|c|d),(a|b c|d),(a|b|c d),(a|b|c|d)', + $this->serializeSets($oList->getWordSets(new TokensFullSet())) + ); + } + + + public function testMaxWordSets() + { + $aWords = array_fill(0, 4, 'a'); + $oList = new SimpleWordList(join(' ', $aWords)); + $this->assertEquals(8, count($oList->getWordSets(new TokensFullSet()))); + + $aWords = array_fill(0, 18, 'a'); + $oList = new SimpleWordList(join(' ', $aWords)); + $this->assertEquals(100, count($oList->getWordSets(new TokensFullSet()))); + } + + + public function testPartialTokensShortTerm() + { + $oList = new SimpleWordList('a b c d'); + $this->assertEquals( + '(a|b c d),(a|b c|d)', + $this->serializeSets($oList->getWordSets(new TokensPartialSet(array('a', 'b', 'd', 'b c', 'b c d')))) + ); + } + + + public function testPartialTokensLongTerm() + { + $aWords = array_fill(0, 18, 'a'); + $oList = new SimpleWordList(join(' ', $aWords)); + $this->assertEquals(80, count($oList->getWordSets(new TokensPartialSet(array('a', 'a a a a a'))))); + } +} -- 2.39.5