From 16cc395f7834774ac7c518a1e2ba9584ef44a896 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Mon, 25 Oct 2021 13:08:16 +0200
Subject: [PATCH] fix warming for ICU tokenizer

Running the warm-up search requests requires querying the most
frequent words. This must be done via the tokenizer to honor the
different formats of the word table.
---
 lib-php/admin/warm.php                 |  8 ++++++--
 lib-php/tokenizer/icu_tokenizer.php    | 21 +++++++++++++++++++++
 lib-php/tokenizer/legacy_tokenizer.php | 20 ++++++++++++++++++++
 3 files changed, 47 insertions(+), 2 deletions(-)

diff --git a/lib-php/admin/warm.php b/lib-php/admin/warm.php
index 39a37506..338ec2da 100644
--- a/lib-php/admin/warm.php
+++ b/lib-php/admin/warm.php
@@ -86,8 +86,12 @@ if (!$aResult['reverse-only']) {
     if ($bVerbose) {
         echo "\n";
     }
+
+    $oTokenizer = new \Nominatim\Tokenizer($oDB);
+
+    $aWords = $oTokenizer->mostFrequentWords(1000);
+
-    $sSQL = 'SELECT word FROM word WHERE word is not null ORDER BY search_name_count DESC LIMIT 1000';
-    foreach ($oDB->getCol($sSQL) as $sWord) {
+    foreach ($aWords as $sWord) {
         if ($bVerbose) {
             echo "$sWord = ";
         }
diff --git a/lib-php/tokenizer/icu_tokenizer.php b/lib-php/tokenizer/icu_tokenizer.php
index ca224a22..f4dd3aeb 100644
--- a/lib-php/tokenizer/icu_tokenizer.php
+++ b/lib-php/tokenizer/icu_tokenizer.php
@@ -40,6 +40,27 @@ class Tokenizer
         return $this->oNormalizer->transliterate($sTerm);
     }
 
+
+    /**
+     * Return the most frequently used words of the word table.
+     *
+     * Selects the word column of all rows of type 'W', ordered by the
+     * 'count' entry of the info column, highest first.
+     *
+     * @param int $iNum  Maximum number of words to return; cast to int
+     *                   because it is concatenated into the SQL string.
+     *
+     * @return mixed  Whatever $this->oDB->getCol() yields, presumably a
+     *                list of word strings (to be confirmed).
+     */
+    public function mostFrequentWords($iNum)
+    {
+        $sSQL = "SELECT word FROM word WHERE type = 'W' ";
+        $sSQL .= "ORDER BY info->'count' DESC LIMIT ".(int) $iNum;
+        return $this->oDB->getCol($sSQL);
+    }
+
+
     private function makeStandardWord($sTerm)
     {
         return trim($this->oTransliterator->transliterate(' '.$sTerm.' '));
diff --git a/lib-php/tokenizer/legacy_tokenizer.php b/lib-php/tokenizer/legacy_tokenizer.php
index b508d220..d5686f64 100644
--- a/lib-php/tokenizer/legacy_tokenizer.php
+++ b/lib-php/tokenizer/legacy_tokenizer.php
@@ -48,6 +48,26 @@ class Tokenizer
     }
 
 
+    /**
+     * Return the most frequently used words of the word table.
+     *
+     * Selects the word column of all rows where it is not null,
+     * ordered by search_name_count, highest first.
+     *
+     * @param int $iNum  Maximum number of words to return; cast to int
+     *                   because it is concatenated into the SQL string.
+     *
+     * @return mixed  Whatever $this->oDB->getCol() yields, presumably a
+     *                list of word strings (to be confirmed).
+     */
+    public function mostFrequentWords($iNum)
+    {
+        $sSQL = 'SELECT word FROM word WHERE word is not null ';
+        $sSQL .= 'ORDER BY search_name_count DESC LIMIT '.(int) $iNum;
+        return $this->oDB->getCol($sSQL);
+    }
+
+
     public function tokensForSpecialTerm($sTerm)
     {
         $aResults = array();
-- 
2.39.5