From 3cd85eaaf16f740bb984436184b37a84a94b2553 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Thu, 15 Jul 2021 14:48:20 +0200 Subject: [PATCH] remove Token from explicit input for SearchDescription extension The token string is only required by the PartialToken type, so it can simply save the token string internally. No need to pass it to every type. Also moves the check for multi-word partials to the token loader code in the tokenizer. Multi-word partials can only happen with the legacy tokenizer and when the database was loaded with an older version of Nominatim. No need to keep the check for everybody. --- lib-php/Geocode.php | 1 - lib-php/SearchDescription.php | 12 ++++-------- lib-php/TokenPartial.php | 5 ++++- lib-php/tokenizer/legacy_icu_tokenizer.php | 1 + lib-php/tokenizer/legacy_tokenizer.php | 5 ++++- 5 files changed, 13 insertions(+), 11 deletions(-) diff --git a/lib-php/Geocode.php b/lib-php/Geocode.php index c2b4f4e4..001c1e1e 100644 --- a/lib-php/Geocode.php +++ b/lib-php/Geocode.php @@ -363,7 +363,6 @@ class Geocode foreach ($aWordsetSearches as $oCurrentSearch) { foreach ($oValidTokens->get($sToken) as $oSearchTerm) { $aNewSearches = $oCurrentSearch->extendWithSearchTerm( - $sToken, $oSearchTerm, $oPosition ); diff --git a/lib-php/SearchDescription.php b/lib-php/SearchDescription.php index 8924287a..b4a78eb8 100644 --- a/lib-php/SearchDescription.php +++ b/lib-php/SearchDescription.php @@ -152,14 +152,13 @@ class SearchDescription /** * Derive new searches by adding a full term to the existing search. * - * @param string $sToken Term for the token. * @param object $oSearchTerm Description of the token. * @param object $oPosition Description of the token position within the query. * * @return SearchDescription[] List of derived search descriptions. */ - public function extendWithSearchTerm($sToken, $oSearchTerm, $oPosition) + public function extendWithSearchTerm($oSearchTerm, $oPosition) { $aNewSearches = array(); @@ -315,10 +314,8 @@ class SearchDescription } } elseif (!$oPosition->isPhrase('country') && is_a($oSearchTerm, '\Nominatim\Token\Partial') - && strpos($sToken, ' ') === false ) { $aNewSearches = $this->extendWithPartialTerm( - $sToken, $oSearchTerm, $oPosition ); @@ -330,14 +327,13 @@ class SearchDescription /** * Derive new searches by adding a partial term to the existing search. * - * @param string $sToken Term for the token. * @param object $oSearchTerm Description of the token. * @param object $oPosition Description of the token position within the query. * * @return SearchDescription[] List of derived search descriptions. */ - private function extendWithPartialTerm($sToken, $oSearchTerm, $oPosition) + private function extendWithPartialTerm($oSearchTerm, $oPosition) { $aNewSearches = array(); $iWordID = $oSearchTerm->iId; @@ -347,7 +343,7 @@ class SearchDescription ) { $oSearch = clone $this; $oSearch->iSearchRank++; - if (preg_match('#^[0-9 ]+$#', $sToken)) { + if (preg_match('#^[0-9 ]+$#', $oSearchTerm->sToken)) { $oSearch->iSearchRank++; } if ($oSearchTerm->iSearchNameCount < CONST_Max_Word_Frequency) { @@ -367,7 +363,7 @@ class SearchDescription if (empty($this->aName) && empty($this->aNameNonSearch)) { $oSearch->iSearchRank++; } - if (preg_match('#^[0-9 ]+$#', $sToken)) { + if (preg_match('#^[0-9 ]+$#', $oSearchTerm->sToken)) { $oSearch->iSearchRank++; } if ($oSearchTerm->iSearchNameCount < CONST_Max_Word_Frequency) { diff --git a/lib-php/TokenPartial.php b/lib-php/TokenPartial.php index 477ef9c5..99a75947 100644 --- a/lib-php/TokenPartial.php +++ b/lib-php/TokenPartial.php @@ -11,10 +11,13 @@ class Partial public $iId; /// Number of appearances in the database. public $iSearchNameCount; + /// Normalised version of the partial word. + public $sToken; - public function __construct($iId, $iSearchNameCount) + public function __construct($iId, $sToken, $iSearchNameCount) { $this->iId = $iId; + $this->sToken = $sToken; $this->iSearchNameCount = $iSearchNameCount; } diff --git a/lib-php/tokenizer/legacy_icu_tokenizer.php b/lib-php/tokenizer/legacy_icu_tokenizer.php index 96a1d8a6..2c0884c8 100644 --- a/lib-php/tokenizer/legacy_icu_tokenizer.php +++ b/lib-php/tokenizer/legacy_icu_tokenizer.php @@ -205,6 +205,7 @@ class Tokenizer } else { $oToken = new Token\Partial( $iId, + $aWord['word_token'], (int) $aWord['count'] ); } diff --git a/lib-php/tokenizer/legacy_tokenizer.php b/lib-php/tokenizer/legacy_tokenizer.php index 238fbcf4..064b4166 100644 --- a/lib-php/tokenizer/legacy_tokenizer.php +++ b/lib-php/tokenizer/legacy_tokenizer.php @@ -218,9 +218,12 @@ class Tokenizer (int) $aWord['count'], substr_count($aWord['word_token'], ' ') ); - } else { + // For backward compatibility: ignore all partial tokens with more + // than one word. + } elseif (strpos($aWord['word_token'], ' ') === false) { $oToken = new Token\Partial( $iId, + $aWord['word_token'], (int) $aWord['count'] ); } -- 2.39.5