From: Sarah Hoffmann Date: Mon, 25 Oct 2021 11:08:16 +0000 (+0200) Subject: fix warming for ICU tokenizer X-Git-Tag: v4.0.0~11^2 X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/16cc395f7834774ac7c518a1e2ba9584ef44a896 fix warming for ICU tokenizer Running the warm-up search requests requires querying the most frequent words. This must be done via the tokenizer to honor the different formats of the word table. --- diff --git a/lib-php/admin/warm.php b/lib-php/admin/warm.php index 39a37506..338ec2da 100644 --- a/lib-php/admin/warm.php +++ b/lib-php/admin/warm.php @@ -86,8 +86,13 @@ if (!$aResult['reverse-only']) { if ($bVerbose) { echo "\n"; } + + $oTokenizer = new \Nominatim\Tokenizer($oDB); + + $aWords = $oTokenizer->mostFrequentWords(1000); + $sSQL = 'SELECT word FROM word WHERE word is not null ORDER BY search_name_count DESC LIMIT 1000'; - foreach ($oDB->getCol($sSQL) as $sWord) { + foreach ($aWords as $sWord) { if ($bVerbose) { echo "$sWord = "; } diff --git a/lib-php/tokenizer/icu_tokenizer.php b/lib-php/tokenizer/icu_tokenizer.php index ca224a22..f4dd3aeb 100644 --- a/lib-php/tokenizer/icu_tokenizer.php +++ b/lib-php/tokenizer/icu_tokenizer.php @@ -40,6 +40,15 @@ class Tokenizer return $this->oNormalizer->transliterate($sTerm); } + + public function mostFrequentWords($iNum) + { + $sSQL = "SELECT word FROM word WHERE type = 'W'"; + $sSQL .= "ORDER BY info->'count' DESC LIMIT ".$iNum; + return $this->oDB->getCol($sSQL); + } + + private function makeStandardWord($sTerm) { return trim($this->oTransliterator->transliterate(' '.$sTerm.' ')); diff --git a/lib-php/tokenizer/legacy_tokenizer.php b/lib-php/tokenizer/legacy_tokenizer.php index b508d220..d5686f64 100644 --- a/lib-php/tokenizer/legacy_tokenizer.php +++ b/lib-php/tokenizer/legacy_tokenizer.php @@ -48,6 +48,14 @@ class Tokenizer } + public function mostFrequentWords($iNum) + { + $sSQL = 'SELECT word FROM word WHERE word is not null '; + $sSQL .= 'ORDER BY search_name_count DESC LIMIT '.$iNum; + return $this->oDB->getCol($sSQL); + } + + public function tokensForSpecialTerm($sTerm) { $aResults = array();