X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/118858a55e5ec522d870842532d26ff0276c85ba..afb439b089efd45a5393be8eb0c3e8877f1ff7a7:/lib-php/tokenizer/icu_tokenizer.php?ds=sidebyside diff --git a/lib-php/tokenizer/icu_tokenizer.php b/lib-php/tokenizer/icu_tokenizer.php index ca224a22..e45d0765 100644 --- a/lib-php/tokenizer/icu_tokenizer.php +++ b/lib-php/tokenizer/icu_tokenizer.php @@ -1,4 +1,12 @@ oNormalizer->transliterate($sTerm); } + + public function mostFrequentWords($iNum) + { + $sSQL = "SELECT word FROM word WHERE type = 'W'"; + $sSQL .= "ORDER BY info->'count' DESC LIMIT ".$iNum; + return $this->oDB->getCol($sSQL); + } + + private function makeStandardWord($sTerm) { return trim($this->oTransliterator->transliterate(' '.$sTerm.' ')); @@ -140,7 +157,8 @@ class Tokenizer $sSQL = 'SELECT word_id, word_token, type, word,'; $sSQL .= " info->>'op' as operator,"; $sSQL .= " info->>'class' as class, info->>'type' as ctype,"; - $sSQL .= " info->>'count' as count"; + $sSQL .= " info->>'count' as count,"; + $sSQL .= " info->>'lookup' as lookup"; $sSQL .= ' FROM word WHERE word_token in ('; $sSQL .= join(',', $this->oDB->getDBQuotedList($aTokens)).')'; @@ -162,7 +180,8 @@ class Tokenizer } break; case 'H': // house number tokens - $oValidTokens->addToken($sTok, new Token\HouseNumber($iId, $aWord['word_token'])); + $sLookup = $aWord['lookup'] ?? $aWord['word_token']; + $oValidTokens->addToken($sTok, new Token\HouseNumber($iId, $sLookup)); break; case 'P': // postcode tokens // Postcodes are not normalized, so they may have content @@ -171,13 +190,17 @@ class Tokenizer if ($aWord['word'] !== null && pg_escape_string($aWord['word']) == $aWord['word'] ) { - $sNormPostcode = $this->normalizeString($aWord['word']); - if (strpos($sNormQuery, $sNormPostcode) !== false) { - $oValidTokens->addToken( - $sTok, - new Token\Postcode($iId, $aWord['word'], null) - ); + $iSplitPos = strpos($aWord['word'], '@'); + if ($iSplitPos === false) { + $sPostcode = $aWord['word']; + } else { + $sPostcode = substr($aWord['word'], 0, $iSplitPos); } + + $oValidTokens->addToken( + $sTok, + new Token\Postcode($iId, $sPostcode, null) + ); } break; case 'S': // tokens for classification terms (special phrases)