X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/b6a831443c8f46bfe1be59ecd1cb7a6847fdf948..df15f13c628a3e15dee4bcad44e97b2cacb40b9f:/lib-php/tokenizer/icu_tokenizer.php diff --git a/lib-php/tokenizer/icu_tokenizer.php b/lib-php/tokenizer/icu_tokenizer.php index cebdac47..e45d0765 100644 --- a/lib-php/tokenizer/icu_tokenizer.php +++ b/lib-php/tokenizer/icu_tokenizer.php @@ -1,4 +1,12 @@ $oPhrase) { - $sNormPhrase = $this->normalizeString($oPhrase->getPhrase()); - Debug::printVar('Phrase', $sNormPhrase); - - $oWordList = new SimpleWordList($sNormPhrase); - - foreach ($oWordList->getTokens() as $sToken) { - $sTransToken = $this->makeStandardWord($sToken); - if (!isset($aTokens[$sTransToken])) { - $aTokens[$sTransToken] = array(); - } - $aTokens[$sTransToken][$sToken] = $sToken; - } + $sNormQuery .= ','.$this->normalizeString($oPhrase->getPhrase()); + $sPhrase = $this->makeStandardWord($oPhrase->getPhrase()); + Debug::printVar('Phrase', $sPhrase); + $oWordList = new SimpleWordList($sPhrase); + $aTokens = array_merge($aTokens, $oWordList->getTokens()); $aWordLists[] = $oWordList; } Debug::printVar('Tokens', $aTokens); Debug::printVar('WordLists', $aWordLists); - $oValidTokens = $this->computeValidTokens($aTokens); + $oValidTokens = $this->computeValidTokens($aTokens, $sNormQuery); foreach ($aPhrases as $iPhrase => $oPhrase) { $oPhrase->setWordSets($aWordLists[$iPhrase]->getWordSets($oValidTokens)); @@ -118,16 +119,16 @@ class Tokenizer } - private function computeValidTokens($aTokens) + private function computeValidTokens($aTokens, $sNormQuery) { $oValidTokens = new TokenList(); if (!empty($aTokens)) { - $this->addTokensFromDB($oValidTokens, $aTokens); + $this->addTokensFromDB($oValidTokens, $aTokens, $sNormQuery); // Try more interpretations for Tokens that could not be matched. - foreach ($aTokens as $sToken => $aNormalized) { - if (!$oValidTokens->contains($sToken)) { + foreach ($aTokens as $sToken) { + if ($sToken[0] != ' ' && !$oValidTokens->contains($sToken)) { if (preg_match('/^([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) { // US ZIP+4 codes - merge in the 5-digit ZIP code $oValidTokens->addToken( @@ -150,15 +151,16 @@ class Tokenizer } - private function addTokensFromDB(&$oValidTokens, $aTokens) + private function addTokensFromDB(&$oValidTokens, $aTokens, $sNormQuery) { // Check which tokens we have, get the ID numbers $sSQL = 'SELECT word_id, word_token, type, word,'; $sSQL .= " info->>'op' as operator,"; $sSQL .= " info->>'class' as class, info->>'type' as ctype,"; - $sSQL .= " info->>'count' as count"; + $sSQL .= " info->>'count' as count,"; + $sSQL .= " info->>'lookup' as lookup"; $sSQL .= ' FROM word WHERE word_token in ('; - $sSQL .= join(',', $this->oDB->getDBQuotedList(array_keys($aTokens))).')'; + $sSQL .= join(',', $this->oDB->getDBQuotedList($aTokens)).')'; Debug::printSQL($sSQL); @@ -167,23 +169,19 @@ class Tokenizer foreach ($aDBWords as $aWord) { $iId = (int) $aWord['word_id']; $sTok = $aWord['word_token']; - $aNorms = $aTokens[$sTok]; switch ($aWord['type']) { case 'C': // country name tokens if ($aWord['word'] !== null) { - foreach ($aNorms as $sNorm) { - $oValidTokens->addToken( - $sNorm, - new Token\Country($iId, $aWord['word']) - ); - } + $oValidTokens->addToken( + $sTok, + new Token\Country($iId, $aWord['word']) + ); } break; case 'H': // house number tokens - foreach ($aNorms as $sNorm) { - $oValidTokens->addToken($sNorm, new Token\HouseNumber($iId, $aWord['word_token'])); - } + $sLookup = $aWord['lookup'] ?? $aWord['word_token']; + $oValidTokens->addToken($sTok, new Token\HouseNumber($iId, $sLookup)); break; case 'P': // postcode tokens // Postcodes are not normalized, so they may have content @@ -192,49 +190,42 @@ class Tokenizer if ($aWord['word'] !== null && pg_escape_string($aWord['word']) == $aWord['word'] ) { - $sNormPostcode = $this->normalizeString($aWord['word']); - foreach ($aNorms as $sNorm) { - if ($sNormPostcode == $sNorm) { - $oValidTokens->addToken( - $sNorm, - new Token\Postcode($iId, $aWord['word'], null) - ); - } + $iSplitPos = strpos($aWord['word'], '@'); + if ($iSplitPos === false) { + $sPostcode = $aWord['word']; + } else { + $sPostcode = substr($aWord['word'], 0, $iSplitPos); } + + $oValidTokens->addToken( + $sTok, + new Token\Postcode($iId, $sPostcode, null) + ); } break; case 'S': // tokens for classification terms (special phrases) if ($aWord['class'] !== null && $aWord['ctype'] !== null) { - foreach ($aNorms as $sNorm) { - if ($aWord['word'] == $sNorm) { - $oValidTokens->addToken($sTok, new Token\SpecialTerm( - $iId, - $aWord['class'], - $aWord['ctype'], - (isset($aWord['operator'])) ? Operator::NEAR : Operator::NONE - )); - } - } - } - break; - case 'W': // full-word tokens - foreach ($aNorms as $sNorm) { - $oValidTokens->addToken($sNorm, new Token\Word( + $oValidTokens->addToken($sTok, new Token\SpecialTerm( $iId, - (int) $aWord['count'], - substr_count($aWord['word_token'], ' ') + 1, - levenshtein($aWord['word'], $sNorm) + 1 + $aWord['class'], + $aWord['ctype'], + (isset($aWord['operator'])) ? Operator::NEAR : Operator::NONE )); } break; + case 'W': // full-word tokens + $oValidTokens->addToken($sTok, new Token\Word( + $iId, + (int) $aWord['count'], + substr_count($aWord['word_token'], ' ') + )); + break; case 'w': // partial word terms - foreach ($aNorms as $sNorm) { - $oValidTokens->addToken($sNorm, new Token\Partial( - $iId, - $aWord['word_token'], - (int) $aWord['count'] - )); - } + $oValidTokens->addToken($sTok, new Token\Partial( + $iId, + $aWord['word_token'], + (int) $aWord['count'] + )); break; default: break;