git.openstreetmap.org Git - nominatim.git/blobdiff - lib-php/tokenizer/legacy_icu_tokenizer.php
introduce a separate token type for partials
[nominatim.git] / lib-php / tokenizer / legacy_icu_tokenizer.php
index 09cfe70fbf661a3e0440531310a45fb1fabbfab7..8cff6f322410366d2e0ca2ceaf143d2b2035ce64 100644 (file)
@@ -47,9 +47,7 @@ class Tokenizer
 
     private function makeStandardWord($sTerm)
     {
-        $sNorm = ' '.$this->oTransliterator->transliterate($sTerm).' ';
-
-        return trim(str_replace(CONST_Abbreviations[0], CONST_Abbreviations[1], $sNorm));
+        return trim($this->oTransliterator->transliterate(' '.$sTerm.' '));
     }
 
 
@@ -90,6 +88,7 @@ class Tokenizer
         foreach ($aPhrases as $iPhrase => $oPhrase) {
             $sNormQuery .= ','.$this->normalizeString($oPhrase->getPhrase());
             $sPhrase = $this->makeStandardWord($oPhrase->getPhrase());
+            Debug::printVar('Phrase', $sPhrase);
             if (strlen($sPhrase) > 0) {
                 $aWords = explode(' ', $sPhrase);
                 Tokenizer::addTokens($aTokens, $aWords);
@@ -196,17 +195,27 @@ class Tokenizer
                 ) {
                     $oToken = new Token\Country($iId, $aWord['country_code']);
                 }
+            } elseif ($aWord['word_token'][0] == ' ') {
+                 $oToken = new Token\Word(
+                     $iId,
+                     $aWord['word_token'][0] != ' ',
+                     (int) $aWord['count'],
+                     substr_count($aWord['word_token'], ' ')
+                 );
             } else {
-                $oToken = new Token\Word(
+                $oToken = new Token\Partial(
                     $iId,
-                    $aWord['word_token'][0] != ' ',
-                    (int) $aWord['count'],
-                    substr_count($aWord['word_token'], ' ')
+                    (int) $aWord['count']
                 );
             }
 
             if ($oToken) {
-                $oValidTokens->addToken($aWord['word_token'], $oToken);
+                // remove any leading spaces
+                if ($aWord['word_token'][0] == ' ') {
+                    $oValidTokens->addToken(substr($aWord['word_token'], 1), $oToken);
+                } else {
+                    $oValidTokens->addToken($aWord['word_token'], $oToken);
+                }
             }
         }
     }