X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/6a7e0d652b1d40a397e1c1386d500101796676c4..1c175e3a67fbfb94f29c47bdcd02fb28136e591b:/lib-php/Geocode.php diff --git a/lib-php/Geocode.php b/lib-php/Geocode.php index 12f9da37..53ee49c0 100644 --- a/lib-php/Geocode.php +++ b/lib-php/Geocode.php @@ -8,17 +8,18 @@ require_once(CONST_LibDir.'/ReverseGeocode.php'); require_once(CONST_LibDir.'/SearchDescription.php'); require_once(CONST_LibDir.'/SearchContext.php'); require_once(CONST_LibDir.'/TokenList.php'); +require_once(CONST_TokenizerDir.'/tokenizer.php'); class Geocode { protected $oDB; protected $oPlaceLookup; + protected $oTokenizer; protected $aLangPrefOrder = array(); protected $aExcludePlaceIDs = array(); - protected $bReverseInPlan = true; protected $iLimit = 20; protected $iFinalLimit = 10; @@ -42,28 +43,12 @@ class Geocode protected $sQuery = false; protected $aStructuredQuery = false; - protected $oNormalizer = null; - public function __construct(&$oDB) { $this->oDB =& $oDB; $this->oPlaceLookup = new PlaceLookup($this->oDB); - $this->oNormalizer = \Transliterator::createFromRules(CONST_Term_Normalization_Rules); - } - - private function normTerm($sTerm) - { - if ($this->oNormalizer === null) { - return $sTerm; - } - - return $this->oNormalizer->transliterate($sTerm); - } - - public function setReverseInPlan($bReverse) - { - $this->bReverseInPlan = $bReverse; + $this->oTokenizer = new \Nominatim\Tokenizer($this->oDB); } public function setLanguagePreference($aLangPref) @@ -262,7 +247,6 @@ class Geocode $oParams->getString('country'), $oParams->getString('postalcode') ); - $this->setReverseInPlan(false); } else { $this->setQuery($sQuery); } @@ -330,7 +314,7 @@ class Geocode return false; } - public function getGroupedSearches($aSearches, $aPhrases, $oValidTokens, $bIsStructured) + public function getGroupedSearches($aSearches, $aPhrases, $oValidTokens) { /* Calculate all searches using oValidTokens i.e. @@ -345,7 +329,7 @@ class Geocode */ foreach ($aPhrases as $iPhrase => $oPhrase) { $aNewPhraseSearches = array(); - $sPhraseType = $bIsStructured ? $oPhrase->getPhraseType() : ''; + $sPhraseType = $oPhrase->getPhraseType(); foreach ($oPhrase->getWordSets() as $aWordset) { $aWordsetSearches = $aSearches; @@ -388,7 +372,7 @@ class Geocode $aNewSearches = $oCurrentSearch->extendWithPartialTerm( $sToken, $oSearchTerm, - $bIsStructured, + (bool) $sPhraseType, $iPhrase, $oValidTokens->get(' '.$sToken) ); @@ -517,12 +501,10 @@ class Geocode if ($this->aCountryCodes) { $oCtx->setCountryList($this->aCountryCodes); } + $this->oTokenizer->setCountryRestriction($this->aCountryCodes); Debug::newSection('Query Preprocessing'); - $sNormQuery = $this->normTerm($this->sQuery); - Debug::printVar('Normalized query', $sNormQuery); - $sLanguagePrefArraySQL = $this->oDB->getArraySQL( $this->oDB->getDBQuotedList($this->aLangPrefOrder) ); @@ -576,117 +558,62 @@ class Geocode } if ($sSpecialTerm && !$aSearches[0]->hasOperator()) { - $sSpecialTerm = pg_escape_string($sSpecialTerm); - $sToken = $this->oDB->getOne( - 'SELECT make_standard_name(:term)', - array(':term' => $sSpecialTerm), - 'Cannot decode query. Wrong encoding?' - ); - $sSQL = 'SELECT class, type FROM word '; - $sSQL .= ' WHERE word_token in (\' '.$sToken.'\')'; - $sSQL .= ' AND class is not null AND class not in (\'place\')'; - - Debug::printSQL($sSQL); - $aSearchWords = $this->oDB->getAll($sSQL); - $aNewSearches = array(); - foreach ($aSearches as $oSearch) { - foreach ($aSearchWords as $aSearchTerm) { - $oNewSearch = clone $oSearch; - $oNewSearch->setPoiSearch( - Operator::TYPE, - $aSearchTerm['class'], - $aSearchTerm['type'] - ); - $aNewSearches[] = $oNewSearch; + $aTokens = $this->oTokenizer->tokensForSpecialTerm($sSpecialTerm); + + if (!empty($aTokens)) { + $aNewSearches = array(); + foreach ($aSearches as $oSearch) { + foreach ($aTokens as $oToken) { + $oNewSearch = clone $oSearch; + $oNewSearch->setPoiSearch( + $oToken->iOperator, + $oToken->sClass, + $oToken->sType + ); + $aNewSearches[] = $oNewSearch; + } } + $aSearches = $aNewSearches; } - $aSearches = $aNewSearches; } // Split query into phrases // Commas are used to reduce the search space by indicating where phrases split + $aPhrases = array(); if ($this->aStructuredQuery) { - $aInPhrases = $this->aStructuredQuery; - $bStructuredPhrases = true; + foreach ($this->aStructuredQuery as $iPhrase => $sPhrase) { + $aPhrases[] = new Phrase($sPhrase, $iPhrase); + } } else { - $aInPhrases = explode(',', $sQuery); - $bStructuredPhrases = false; + foreach (explode(',', $sQuery) as $sPhrase) { + $aPhrases[] = new Phrase($sPhrase, ''); + } } Debug::printDebugArray('Search context', $oCtx); Debug::printDebugArray('Base search', empty($aSearches) ? null : $aSearches[0]); - Debug::printVar('Final query phrases', $aInPhrases); - // Convert each phrase to standard form - // Create a list of standard words - // Get all 'sets' of words - // Generate a complete list of all Debug::newSection('Tokenization'); - $aTokens = array(); - $aPhrases = array(); - foreach ($aInPhrases as $iPhrase => $sPhrase) { - $sPhrase = $this->oDB->getOne( - 'SELECT make_standard_name(:phrase)', - array(':phrase' => $sPhrase), - 'Cannot normalize query string (is it a UTF-8 string?)' - ); - if (trim($sPhrase)) { - $oPhrase = new Phrase($sPhrase, is_string($iPhrase) ? $iPhrase : ''); - $oPhrase->addTokens($aTokens); - $aPhrases[] = $oPhrase; - } - } - - Debug::printVar('Tokens', $aTokens); - - $oValidTokens = new TokenList(); - - if (!empty($aTokens)) { - $oValidTokens->addTokensFromDB( - $this->oDB, - $aTokens, - $this->aCountryCodes, - $sNormQuery, - $this->oNormalizer - ); + $oValidTokens = $this->oTokenizer->extractTokensFromPhrases($aPhrases); + if ($oValidTokens->count() > 0) { $oCtx->setFullNameWords($oValidTokens->getFullWordIDs()); - // Try more interpretations for Tokens that could not be matched. - foreach ($aTokens as $sToken) { - if ($sToken[0] == ' ' && !$oValidTokens->contains($sToken)) { - if (preg_match('/^ ([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) { - // US ZIP+4 codes - merge in the 5-digit ZIP code - $oValidTokens->addToken( - $sToken, - new Token\Postcode(null, $aData[1], 'us') - ); - } elseif (preg_match('/^ [0-9]+$/', $sToken)) { - // Unknown single word token with a number. - // Assume it is a house number. - $oValidTokens->addToken( - $sToken, - new Token\HouseNumber(null, trim($sToken)) - ); - } - } - } + $aPhrases = array_filter($aPhrases, function ($oPhrase) { + return $oPhrase->getWordSets() !== null; + }); // Any words that have failed completely? // TODO: suggestions Debug::printGroupTable('Valid Tokens', $oValidTokens->debugInfo()); - - foreach ($aPhrases as $oPhrase) { - $oPhrase->computeWordSets($oValidTokens); - } Debug::printDebugTable('Phrases', $aPhrases); Debug::newSection('Search candidates'); - $aGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $oValidTokens, $bStructuredPhrases); + $aGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $oValidTokens); - if ($this->bReverseInPlan) { + if (!$this->aStructuredQuery) { // Reverse phrase array and also reverse the order of the wordsets in // the first and final phrase. Don't bother about phrases in the middle // because order in the address doesn't matter. @@ -695,7 +622,7 @@ class Geocode if (count($aPhrases) > 1) { $aPhrases[count($aPhrases)-1]->invertWordSets(); } - $aReverseGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $oValidTokens, false); + $aReverseGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $oValidTokens); foreach ($aGroupedSearches as $aSearches) { foreach ($aSearches as $aSearch) { @@ -778,14 +705,19 @@ class Geocode if (!empty($aResults)) { $aSplitResults = Result::splitResults($aResults); Debug::printVar('Split results', $aSplitResults); - if ($iGroupLoop <= 4 && empty($aSplitResults['tail']) - && reset($aSplitResults['head'])->iResultRank > 0) { + if ($iGroupLoop <= 4 + && reset($aSplitResults['head'])->iResultRank > 0 + && $iGroupedRank !== array_key_last($aGroupedSearches)) { // Haven't found an exact match for the query yet. // Therefore add result from the next group level. $aNextResults = $aSplitResults['head']; foreach ($aNextResults as $oRes) { $oRes->iResultRank--; } + foreach ($aSplitResults['tail'] as $oRes) { + $oRes->iResultRank--; + $aNextResults[$oRes->iId] = $oRes; + } $aResults = array(); } else { $aResults = $aSplitResults['head']; @@ -833,7 +765,6 @@ class Geocode foreach ($aResults as $oResult) { if (($this->iMaxAddressRank == 30 && ($oResult->iTable == Result::TABLE_OSMLINE - || $oResult->iTable == Result::TABLE_AUX || $oResult->iTable == Result::TABLE_TIGER)) || in_array($oResult->iId, $aFilteredIDs) ) { @@ -994,7 +925,6 @@ class Geocode 'Structured query' => $this->aStructuredQuery, 'Name keys' => Debug::fmtArrayVals($this->aLangPrefOrder), 'Excluded place IDs' => Debug::fmtArrayVals($this->aExcludePlaceIDs), - 'Try reversed query'=> $this->bReverseInPlan, 'Limit (for searches)' => $this->iLimit, 'Limit (for results)'=> $this->iFinalLimit, 'Country codes' => Debug::fmtArrayVals($this->aCountryCodes),