From: Sarah Hoffmann Date: Mon, 25 Oct 2021 18:46:01 +0000 (+0200) Subject: ICU: additional ranking by matching of normalised term X-Git-Tag: deploy~146 X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/b6a831443c8f46bfe1be59ecd1cb7a6847fdf948 ICU: additional ranking by matching of normalised term Keep track of normalised word for tokens and then recheck against normalized form in database to exclude non-matching script. --- diff --git a/lib-php/TokenWord.php b/lib-php/TokenWord.php index 59456e35..b9e28d91 100644 --- a/lib-php/TokenWord.php +++ b/lib-php/TokenWord.php @@ -13,12 +13,15 @@ class Word private $iSearchNameCount; /// Number of terms in the word. private $iTermCount; + /// Match score. + private $iMatchScore; - public function __construct($iId, $iSearchNameCount, $iTermCount) + public function __construct($iId, $iSearchNameCount, $iTermCount, $iMatchScore = 1) { $this->iId = $iId; $this->iSearchNameCount = $iSearchNameCount; $this->iTermCount = $iTermCount; + $this->iMatchScore = $iMatchScore; } public function getId() @@ -63,13 +66,13 @@ class Word if ($this->iTermCount > 1 && ($oPosition->isPhrase('') || !$oPosition->isFirstPhrase()) ) { - $oNewSearch = $oSearch->clone(1); + $oNewSearch = $oSearch->clone($this->iMatchScore); $oNewSearch->addAddressToken($this->iId); return array($oNewSearch); } } elseif (!$oSearch->hasName(true)) { - $oNewSearch = $oSearch->clone(1); + $oNewSearch = $oSearch->clone($this->iMatchScore); $oNewSearch->addNameToken( $this->iId, CONST_Search_NameOnlySearchFrequencyThreshold @@ -90,7 +93,8 @@ class Word 'Type' => 'word', 'Info' => array( 'count' => $this->iSearchNameCount, - 'terms' => $this->iTermCount + 'terms' => $this->iTermCount, + 'score' => $this->iMatchScore ) ); } diff --git a/lib-php/tokenizer/icu_tokenizer.php b/lib-php/tokenizer/icu_tokenizer.php index f4dd3aeb..cebdac47 100644 --- a/lib-php/tokenizer/icu_tokenizer.php +++ b/lib-php/tokenizer/icu_tokenizer.php @@ -89,19 +89,26 @@ class Tokenizer $aWordLists = array(); $aTokens = array(); foreach ($aPhrases as $iPhrase => $oPhrase) { - $sNormQuery .= ','.$this->normalizeString($oPhrase->getPhrase()); - $sPhrase = $this->makeStandardWord($oPhrase->getPhrase()); - Debug::printVar('Phrase', $sPhrase); + $sNormPhrase = $this->normalizeString($oPhrase->getPhrase()); + Debug::printVar('Phrase', $sNormPhrase); + + $oWordList = new SimpleWordList($sNormPhrase); + + foreach ($oWordList->getTokens() as $sToken) { + $sTransToken = $this->makeStandardWord($sToken); + if (!isset($aTokens[$sTransToken])) { + $aTokens[$sTransToken] = array(); + } + $aTokens[$sTransToken][$sToken] = $sToken; + } - $oWordList = new SimpleWordList($sPhrase); - $aTokens = array_merge($aTokens, $oWordList->getTokens()); $aWordLists[] = $oWordList; } Debug::printVar('Tokens', $aTokens); Debug::printVar('WordLists', $aWordLists); - $oValidTokens = $this->computeValidTokens($aTokens, $sNormQuery); + $oValidTokens = $this->computeValidTokens($aTokens); foreach ($aPhrases as $iPhrase => $oPhrase) { $oPhrase->setWordSets($aWordLists[$iPhrase]->getWordSets($oValidTokens)); @@ -111,16 +118,16 @@ class Tokenizer } - private function computeValidTokens($aTokens, $sNormQuery) + private function computeValidTokens($aTokens) { $oValidTokens = new TokenList(); if (!empty($aTokens)) { - $this->addTokensFromDB($oValidTokens, $aTokens, $sNormQuery); + $this->addTokensFromDB($oValidTokens, $aTokens); // Try more interpretations for Tokens that could not be matched. - foreach ($aTokens as $sToken) { - if ($sToken[0] != ' ' && !$oValidTokens->contains($sToken)) { + foreach ($aTokens as $sToken => $aNormalized) { + if (!$oValidTokens->contains($sToken)) { if (preg_match('/^([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) { // US ZIP+4 codes - merge in the 5-digit ZIP code $oValidTokens->addToken( @@ -143,7 +150,7 @@ class Tokenizer } - private function addTokensFromDB(&$oValidTokens, $aTokens, $sNormQuery) + private function addTokensFromDB(&$oValidTokens, $aTokens) { // Check which tokens we have, get the ID numbers $sSQL = 'SELECT word_id, word_token, type, word,'; @@ -151,7 +158,7 @@ class Tokenizer $sSQL .= " info->>'class' as class, info->>'type' as ctype,"; $sSQL .= " info->>'count' as count"; $sSQL .= ' FROM word WHERE word_token in ('; - $sSQL .= join(',', $this->oDB->getDBQuotedList($aTokens)).')'; + $sSQL .= join(',', $this->oDB->getDBQuotedList(array_keys($aTokens))).')'; Debug::printSQL($sSQL); @@ -160,18 +167,23 @@ class Tokenizer foreach ($aDBWords as $aWord) { $iId = (int) $aWord['word_id']; $sTok = $aWord['word_token']; + $aNorms = $aTokens[$sTok]; switch ($aWord['type']) { case 'C': // country name tokens if ($aWord['word'] !== null) { - $oValidTokens->addToken( - $sTok, - new Token\Country($iId, $aWord['word']) - ); + foreach ($aNorms as $sNorm) { + $oValidTokens->addToken( + $sNorm, + new Token\Country($iId, $aWord['word']) + ); + } } break; case 'H': // house number tokens - $oValidTokens->addToken($sTok, new Token\HouseNumber($iId, $aWord['word_token'])); + foreach ($aNorms as $sNorm) { + $oValidTokens->addToken($sNorm, new Token\HouseNumber($iId, $aWord['word_token'])); + } break; case 'P': // postcode tokens // Postcodes are not normalized, so they may have content @@ -181,37 +193,48 @@ class Tokenizer && pg_escape_string($aWord['word']) == $aWord['word'] ) { $sNormPostcode = $this->normalizeString($aWord['word']); - if (strpos($sNormQuery, $sNormPostcode) !== false) { - $oValidTokens->addToken( - $sTok, - new Token\Postcode($iId, $aWord['word'], null) - ); + foreach ($aNorms as $sNorm) { + if ($sNormPostcode == $sNorm) { + $oValidTokens->addToken( + $sNorm, + new Token\Postcode($iId, $aWord['word'], null) + ); + } } } break; case 'S': // tokens for classification terms (special phrases) if ($aWord['class'] !== null && $aWord['ctype'] !== null) { - $oValidTokens->addToken($sTok, new Token\SpecialTerm( - $iId, - $aWord['class'], - $aWord['ctype'], - (isset($aWord['operator'])) ? Operator::NEAR : Operator::NONE - )); + foreach ($aNorms as $sNorm) { + if ($aWord['word'] == $sNorm) { + $oValidTokens->addToken($sTok, new Token\SpecialTerm( + $iId, + $aWord['class'], + $aWord['ctype'], + (isset($aWord['operator'])) ? Operator::NEAR : Operator::NONE + )); + } + } } break; case 'W': // full-word tokens - $oValidTokens->addToken($sTok, new Token\Word( - $iId, - (int) $aWord['count'], - substr_count($aWord['word_token'], ' ') - )); + foreach ($aNorms as $sNorm) { + $oValidTokens->addToken($sNorm, new Token\Word( + $iId, + (int) $aWord['count'], + substr_count($aWord['word_token'], ' ') + 1, + levenshtein($aWord['word'], $sNorm) + 1 + )); + } break; case 'w': // partial word terms - $oValidTokens->addToken($sTok, new Token\Partial( - $iId, - $aWord['word_token'], - (int) $aWord['count'] - )); + foreach ($aNorms as $sNorm) { + $oValidTokens->addToken($sNorm, new Token\Partial( + $iId, + $aWord['word_token'], + (int) $aWord['count'] + )); + } break; default: break;