]> git.openstreetmap.org Git - nominatim.git/blobdiff - lib-php/Geocode.php
add PHP part for new ICU-base tokenizer
[nominatim.git] / lib-php / Geocode.php
index f638af9a9a300a927f066a59690c23b6ee58e966..53ee49c043f53d80434887978434998987f9c830 100644 (file)
@@ -8,17 +8,18 @@ require_once(CONST_LibDir.'/ReverseGeocode.php');
 require_once(CONST_LibDir.'/SearchDescription.php');
 require_once(CONST_LibDir.'/SearchContext.php');
 require_once(CONST_LibDir.'/TokenList.php');
+require_once(CONST_TokenizerDir.'/tokenizer.php');
 
 class Geocode
 {
     protected $oDB;
 
     protected $oPlaceLookup;
+    protected $oTokenizer;
 
     protected $aLangPrefOrder = array();
 
     protected $aExcludePlaceIDs = array();
-    protected $bReverseInPlan = true;
 
     protected $iLimit = 20;
     protected $iFinalLimit = 10;
@@ -42,28 +43,12 @@ class Geocode
     protected $sQuery = false;
     protected $aStructuredQuery = false;
 
-    protected $oNormalizer = null;
-
 
     public function __construct(&$oDB)
     {
         $this->oDB =& $oDB;
         $this->oPlaceLookup = new PlaceLookup($this->oDB);
-        $this->oNormalizer = \Transliterator::createFromRules(CONST_Term_Normalization_Rules);
-    }
-
-    private function normTerm($sTerm)
-    {
-        if ($this->oNormalizer === null) {
-            return $sTerm;
-        }
-
-        return $this->oNormalizer->transliterate($sTerm);
-    }
-
-    public function setReverseInPlan($bReverse)
-    {
-        $this->bReverseInPlan = $bReverse;
+        $this->oTokenizer = new \Nominatim\Tokenizer($this->oDB);
     }
 
     public function setLanguagePreference($aLangPref)
@@ -262,7 +247,6 @@ class Geocode
                 $oParams->getString('country'),
                 $oParams->getString('postalcode')
             );
-            $this->setReverseInPlan(false);
         } else {
             $this->setQuery($sQuery);
         }
@@ -330,7 +314,7 @@ class Geocode
         return false;
     }
 
-    public function getGroupedSearches($aSearches, $aPhrases, $oValidTokens, $bIsStructured)
+    public function getGroupedSearches($aSearches, $aPhrases, $oValidTokens)
     {
         /*
              Calculate all searches using oValidTokens i.e.
@@ -345,7 +329,7 @@ class Geocode
          */
         foreach ($aPhrases as $iPhrase => $oPhrase) {
             $aNewPhraseSearches = array();
-            $sPhraseType = $bIsStructured ? $oPhrase->getPhraseType() : '';
+            $sPhraseType = $oPhrase->getPhraseType();
 
             foreach ($oPhrase->getWordSets() as $aWordset) {
                 $aWordsetSearches = $aSearches;
@@ -388,7 +372,7 @@ class Geocode
                                 $aNewSearches = $oCurrentSearch->extendWithPartialTerm(
                                     $sToken,
                                     $oSearchTerm,
-                                    $bIsStructured,
+                                    (bool) $sPhraseType,
                                     $iPhrase,
                                     $oValidTokens->get(' '.$sToken)
                                 );
@@ -517,12 +501,10 @@ class Geocode
         if ($this->aCountryCodes) {
             $oCtx->setCountryList($this->aCountryCodes);
         }
+        $this->oTokenizer->setCountryRestriction($this->aCountryCodes);
 
         Debug::newSection('Query Preprocessing');
 
-        $sNormQuery = $this->normTerm($this->sQuery);
-        Debug::printVar('Normalized query', $sNormQuery);
-
         $sLanguagePrefArraySQL = $this->oDB->getArraySQL(
             $this->oDB->getDBQuotedList($this->aLangPrefOrder)
         );
@@ -576,117 +558,62 @@ class Geocode
             }
 
             if ($sSpecialTerm && !$aSearches[0]->hasOperator()) {
-                $sSpecialTerm = pg_escape_string($sSpecialTerm);
-                $sToken = $this->oDB->getOne(
-                    'SELECT make_standard_name(:term)',
-                    array(':term' => $sSpecialTerm),
-                    'Cannot decode query. Wrong encoding?'
-                );
-                $sSQL = 'SELECT class, type FROM word ';
-                $sSQL .= '   WHERE word_token in (\' '.$sToken.'\')';
-                $sSQL .= '   AND class is not null AND class not in (\'place\')';
-
-                Debug::printSQL($sSQL);
-                $aSearchWords = $this->oDB->getAll($sSQL);
-                $aNewSearches = array();
-                foreach ($aSearches as $oSearch) {
-                    foreach ($aSearchWords as $aSearchTerm) {
-                        $oNewSearch = clone $oSearch;
-                        $oNewSearch->setPoiSearch(
-                            Operator::TYPE,
-                            $aSearchTerm['class'],
-                            $aSearchTerm['type']
-                        );
-                        $aNewSearches[] = $oNewSearch;
+                $aTokens = $this->oTokenizer->tokensForSpecialTerm($sSpecialTerm);
+
+                if (!empty($aTokens)) {
+                    $aNewSearches = array();
+                    foreach ($aSearches as $oSearch) {
+                        foreach ($aTokens as $oToken) {
+                            $oNewSearch = clone $oSearch;
+                            $oNewSearch->setPoiSearch(
+                                $oToken->iOperator,
+                                $oToken->sClass,
+                                $oToken->sType
+                            );
+                            $aNewSearches[] = $oNewSearch;
+                        }
                     }
+                    $aSearches = $aNewSearches;
                 }
-                $aSearches = $aNewSearches;
             }
 
             // Split query into phrases
             // Commas are used to reduce the search space by indicating where phrases split
+            $aPhrases = array();
             if ($this->aStructuredQuery) {
-                $aInPhrases = $this->aStructuredQuery;
-                $bStructuredPhrases = true;
+                foreach ($this->aStructuredQuery as $iPhrase => $sPhrase) {
+                    $aPhrases[] = new Phrase($sPhrase, $iPhrase);
+                }
             } else {
-                $aInPhrases = explode(',', $sQuery);
-                $bStructuredPhrases = false;
+                foreach (explode(',', $sQuery) as $sPhrase) {
+                    $aPhrases[] = new Phrase($sPhrase, '');
+                }
             }
 
             Debug::printDebugArray('Search context', $oCtx);
             Debug::printDebugArray('Base search', empty($aSearches) ? null : $aSearches[0]);
-            Debug::printVar('Final query phrases', $aInPhrases);
 
-            // Convert each phrase to standard form
-            // Create a list of standard words
-            // Get all 'sets' of words
-            // Generate a complete list of all
             Debug::newSection('Tokenization');
-            $aTokens = array();
-            $aPhrases = array();
-            foreach ($aInPhrases as $iPhrase => $sPhrase) {
-                $sPhrase = $this->oDB->getOne(
-                    'SELECT make_standard_name(:phrase)',
-                    array(':phrase' => $sPhrase),
-                    'Cannot normalize query string (is it a UTF-8 string?)'
-                );
-                if (trim($sPhrase)) {
-                    $oPhrase = new Phrase($sPhrase, is_string($iPhrase) ? $iPhrase : '');
-                    $oPhrase->addTokens($aTokens);
-                    $aPhrases[] = $oPhrase;
-                }
-            }
-
-            Debug::printVar('Tokens', $aTokens);
-
-            $oValidTokens = new TokenList();
-
-            if (!empty($aTokens)) {
-                $oValidTokens->addTokensFromDB(
-                    $this->oDB,
-                    $aTokens,
-                    $this->aCountryCodes,
-                    $sNormQuery,
-                    $this->oNormalizer
-                );
+            $oValidTokens = $this->oTokenizer->extractTokensFromPhrases($aPhrases);
 
+            if ($oValidTokens->count() > 0) {
                 $oCtx->setFullNameWords($oValidTokens->getFullWordIDs());
 
-                // Try more interpretations for Tokens that could not be matched.
-                foreach ($aTokens as $sToken) {
-                    if ($sToken[0] == ' ' && !$oValidTokens->contains($sToken)) {
-                        if (preg_match('/^ ([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) {
-                            // US ZIP+4 codes - merge in the 5-digit ZIP code
-                            $oValidTokens->addToken(
-                                $sToken,
-                                new Token\Postcode(null, $aData[1], 'us')
-                            );
-                        } elseif (preg_match('/^ [0-9]+$/', $sToken)) {
-                            // Unknown single word token with a number.
-                            // Assume it is a house number.
-                            $oValidTokens->addToken(
-                                $sToken,
-                                new Token\HouseNumber(null, trim($sToken))
-                            );
-                        }
-                    }
-                }
+                $aPhrases = array_filter($aPhrases, function ($oPhrase) {
+                    return $oPhrase->getWordSets() !== null;
+                });
 
                 // Any words that have failed completely?
                 // TODO: suggestions
 
                 Debug::printGroupTable('Valid Tokens', $oValidTokens->debugInfo());
-
-                foreach ($aPhrases as $oPhrase) {
-                    $oPhrase->computeWordSets($oValidTokens);
-                }
                 Debug::printDebugTable('Phrases', $aPhrases);
 
                 Debug::newSection('Search candidates');
 
-                $aGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $oValidTokens, $bStructuredPhrases);
+                $aGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $oValidTokens);
 
-                if ($this->bReverseInPlan) {
+                if (!$this->aStructuredQuery) {
                     // Reverse phrase array and also reverse the order of the wordsets in
                     // the first and final phrase. Don't bother about phrases in the middle
                     // because order in the address doesn't matter.
@@ -695,7 +622,7 @@ class Geocode
                     if (count($aPhrases) > 1) {
                         $aPhrases[count($aPhrases)-1]->invertWordSets();
                     }
-                    $aReverseGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $oValidTokens, false);
+                    $aReverseGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $oValidTokens);
 
                     foreach ($aGroupedSearches as $aSearches) {
                         foreach ($aSearches as $aSearch) {
@@ -838,7 +765,6 @@ class Geocode
                     foreach ($aResults as $oResult) {
                         if (($this->iMaxAddressRank == 30 &&
                              ($oResult->iTable == Result::TABLE_OSMLINE
-                              || $oResult->iTable == Result::TABLE_AUX
                               || $oResult->iTable == Result::TABLE_TIGER))
                             || in_array($oResult->iId, $aFilteredIDs)
                         ) {
@@ -999,7 +925,6 @@ class Geocode
                 'Structured query' => $this->aStructuredQuery,
                 'Name keys' => Debug::fmtArrayVals($this->aLangPrefOrder),
                 'Excluded place IDs' => Debug::fmtArrayVals($this->aExcludePlaceIDs),
-                'Try reversed query'=> $this->bReverseInPlan,
                 'Limit (for searches)' => $this->iLimit,
                 'Limit (for results)'=> $this->iFinalLimit,
                 'Country codes' => Debug::fmtArrayVals($this->aCountryCodes),