From: Sarah Hoffmann Date: Sun, 15 Oct 2017 16:08:25 +0000 (+0200) Subject: Merge remote-tracking branch 'upstream/master' X-Git-Tag: deploy~358 X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/4e99f8a3408d59f5c6bf8d934fda7bb3726f2da2?hp=-c Merge remote-tracking branch 'upstream/master' --- 4e99f8a3408d59f5c6bf8d934fda7bb3726f2da2 diff --combined lib/Geocode.php index 51b79735,be543012..306255b5 --- a/lib/Geocode.php +++ b/lib/Geocode.php @@@ -3,6 -3,7 +3,7 @@@ namespace Nominatim; require_once(CONST_BasePath.'/lib/PlaceLookup.php'); + require_once(CONST_BasePath.'/lib/Phrase.php'); require_once(CONST_BasePath.'/lib/ReverseGeocode.php'); require_once(CONST_BasePath.'/lib/SearchDescription.php'); require_once(CONST_BasePath.'/lib/SearchContext.php'); @@@ -26,7 -27,7 +27,7 @@@ class Geocod protected $aExcludePlaceIDs = array(); protected $bDeDupe = true; - protected $bReverseInPlan = false; + protected $bReverseInPlan = true; protected $iLimit = 20; protected $iFinalLimit = 10; @@@ -668,7 -669,7 +669,7 @@@ return $aSearchResults; } - public function getGroupedSearches($aSearches, $aPhraseTypes, $aPhrases, $aValidTokens, $aWordFrequencyScores, $bStructuredPhrases, $sNormQuery) + public function getGroupedSearches($aSearches, $aPhrases, $aValidTokens, $bIsStructured) { /* Calculate all searches using aValidTokens i.e. @@@ -683,15 -684,11 +684,11 @@@ */ $iGlobalRank = 0; - foreach ($aPhrases as $iPhrase => $aPhrase) { + foreach ($aPhrases as $iPhrase => $oPhrase) { $aNewPhraseSearches = array(); - if ($bStructuredPhrases) { - $sPhraseType = $aPhraseTypes[$iPhrase]; - } else { - $sPhraseType = ''; - } + $sPhraseType = $bIsStructured ? $oPhrase->getPhraseType() : ''; - foreach ($aPhrase['wordsets'] as $iWordSet => $aWordset) { + foreach ($oPhrase->getWordSets() as $iWordSet => $aWordset) { // Too many permutations - too expensive if ($iWordSet > 120) break; @@@ -710,17 -707,8 +707,8 @@@ // If the token is valid if (isset($aValidTokens[' '.$sToken])) { foreach ($aValidTokens[' '.$sToken] as $aSearchTerm) { - // Recheck if the original word shows up in the query. - $bWordInQuery = false; - if (isset($aSearchTerm['word']) && $aSearchTerm['word']) { - $bWordInQuery = strpos( - $sNormQuery, - $this->normTerm($aSearchTerm['word']) - ) !== false; - } $aNewSearches = $oCurrentSearch->extendWithFullTerm( $aSearchTerm, - $bWordInQuery, isset($aValidTokens[$sToken]) && strpos($sToken, ' ') === false, $sPhraseType, @@@ -746,9 -734,8 +734,8 @@@ foreach ($aValidTokens[$sToken] as $aSearchTerm) { $aNewSearches = $oCurrentSearch->extendWithPartialTerm( $aSearchTerm, - $bStructuredPhrases, + $bIsStructured, $iPhrase, - $aWordFrequencyScores, isset($aValidTokens[' '.$sToken]) ? $aValidTokens[' '.$sToken] : array() ); @@@ -806,7 -793,7 +793,7 @@@ // Revisit searches, drop bad searches and give penalty to unlikely combinations. $aGroupedSearches = array(); foreach ($aSearches as $oSearch) { - if (!$oSearch->isValidSearch($this->aCountryCodes)) { + if (!$oSearch->isValidSearch()) { continue; } @@@ -955,10 -942,10 +942,10 @@@ // Split query into phrases // Commas are used to reduce the search space by indicating where phrases split if ($this->aStructuredQuery) { - $aPhrases = $this->aStructuredQuery; + $aInPhrases = $this->aStructuredQuery; $bStructuredPhrases = true; } else { - $aPhrases = explode(',', $sQuery); + $aInPhrases = explode(',', $sQuery); $bStructuredPhrases = false; } @@@ -967,25 -954,19 +954,19 @@@ // Get all 'sets' of words // Generate a complete list of all $aTokens = array(); - foreach ($aPhrases as $iPhrase => $sPhrase) { - $aPhrase = chksql( - $this->oDB->getRow("SELECT make_standard_name('".pg_escape_string($sPhrase)."') as string"), + $aPhrases = array(); + foreach ($aInPhrases as $iPhrase => $sPhrase) { + $sPhrase = chksql( + $this->oDB->getOne('SELECT make_standard_name('.getDBQuoted($sPhrase).')'), "Cannot normalize query string (is it a UTF-8 string?)" ); - if (trim($aPhrase['string'])) { - $aPhrases[$iPhrase] = $aPhrase; - $aPhrases[$iPhrase]['words'] = explode(' ', $aPhrases[$iPhrase]['string']); - $aPhrases[$iPhrase]['wordsets'] = getWordSets($aPhrases[$iPhrase]['words'], 0); - $aTokens = array_merge($aTokens, getTokensFromSets($aPhrases[$iPhrase]['wordsets'])); - } else { - unset($aPhrases[$iPhrase]); + if (trim($sPhrase)) { + $oPhrase = new Phrase($sPhrase, is_string($iPhrase) ? $iPhrase : ''); + $oPhrase->addTokens($aTokens); + $aPhrases[] = $oPhrase; } } - // Reindex phrases - we make assumptions later on that they are numerically keyed in order - $aPhraseTypes = array_keys($aPhrases); - $aPhrases = array_values($aPhrases); - if (sizeof($aTokens)) { // Check which tokens we have, get the ID numbers $sSQL = 'SELECT word_id, word_token, word, class, type, country_code, operator, search_name_count'; @@@ -999,22 -980,29 +980,29 @@@ $this->oDB->getAll($sSQL), "Could not get word tokens." ); - $aPossibleMainWordIDs = array(); $aWordFrequencyScores = array(); foreach ($aDatabaseWords as $aToken) { - // Very special case - require 2 letter country param to match the country code found - if ($bStructuredPhrases && $aToken['country_code'] && !empty($this->aStructuredQuery['country']) - && strlen($this->aStructuredQuery['country']) == 2 && strtolower($this->aStructuredQuery['country']) != $aToken['country_code'] + // Filter country tokens that do not match restricted countries. + if ($this->aCountryCodes + && $aToken['country_code'] + && !in_array($aToken['country_code'], $this->aCountryCodes) ) { continue; } + // Special terms need to appear in their normalized form. + if ($aToken['word'] && $aToken['class']) { + $sNormWord = $this->normTerm($aToken['word']); + if (strpos($sNormQuery, $sNormWord) === false) { + continue; + } + } + if (isset($aValidTokens[$aToken['word_token']])) { $aValidTokens[$aToken['word_token']][] = $aToken; } else { $aValidTokens[$aToken['word_token']] = array($aToken); } - if (!$aToken['class'] && !$aToken['country_code']) $aPossibleMainWordIDs[$aToken['word_id']] = 1; $aWordFrequencyScores[$aToken['word_id']] = $aToken['search_name_count'] + 1; } if (CONST_Debug) var_Dump($aPhrases, $aValidTokens); @@@ -1046,19 -1034,18 +1034,18 @@@ // Any words that have failed completely? // TODO: suggestions - $aGroupedSearches = $this->getGroupedSearches($aSearches, $aPhraseTypes, $aPhrases, $aValidTokens, $aWordFrequencyScores, $bStructuredPhrases, $sNormQuery); + $aGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $aValidTokens, $bStructuredPhrases); if ($this->bReverseInPlan) { // Reverse phrase array and also reverse the order of the wordsets in // the first and final phrase. Don't bother about phrases in the middle // because order in the address doesn't matter. $aPhrases = array_reverse($aPhrases); - $aPhrases[0]['wordsets'] = getInverseWordSets($aPhrases[0]['words'], 0); + $aPhrases[0]->invertWordSets(); if (sizeof($aPhrases) > 1) { - $aFinalPhrase = end($aPhrases); - $aPhrases[sizeof($aPhrases)-1]['wordsets'] = getInverseWordSets($aFinalPhrase['words'], 0); + $aPhrases[sizeof($aPhrases)-1]->invertWordSets(); } - $aReverseGroupedSearches = $this->getGroupedSearches($aSearches, null, $aPhrases, $aValidTokens, $aWordFrequencyScores, false, $sNormQuery); + $aReverseGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $aValidTokens, false); foreach ($aGroupedSearches as $aSearches) { foreach ($aSearches as $aSearch) { @@@ -1288,8 -1275,7 +1275,7 @@@ $aResult['name'] = $aResult['langaddress']; - if ($oCtx->hasNearPoint()) - { + if ($oCtx->hasNearPoint()) { $aResult['importance'] = 0.001; $aResult['foundorder'] = $aResult['addressimportance']; } else { diff --combined lib/lib.php index e4a343d1,76775d6c..3db3a825 --- a/lib/lib.php +++ b/lib/lib.php @@@ -60,54 -60,6 +60,6 @@@ function byImportance($a, $b } - function getWordSets($aWords, $iDepth) - { - $aResult = array(array(join(' ', $aWords))); - $sFirstToken = ''; - if ($iDepth < 7) { - while (sizeof($aWords) > 1) { - $sWord = array_shift($aWords); - $sFirstToken .= ($sFirstToken?' ':'').$sWord; - $aRest = getWordSets($aWords, $iDepth+1); - foreach ($aRest as $aSet) { - $aResult[] = array_merge(array($sFirstToken), $aSet); - } - } - } - return $aResult; - } - - function getInverseWordSets($aWords, $iDepth) - { - $aResult = array(array(join(' ', $aWords))); - $sFirstToken = ''; - if ($iDepth < 8) { - while (sizeof($aWords) > 1) { - $sWord = array_pop($aWords); - $sFirstToken = $sWord.($sFirstToken?' ':'').$sFirstToken; - $aRest = getInverseWordSets($aWords, $iDepth+1); - foreach ($aRest as $aSet) { - $aResult[] = array_merge(array($sFirstToken), $aSet); - } - } - } - return $aResult; - } - - - function getTokensFromSets($aSets) - { - $aTokens = array(); - foreach ($aSets as $aSet) { - foreach ($aSet as $sWord) { - $aTokens[' '.$sWord] = ' '.$sWord; - $aTokens[$sWord] = $sWord; - } - } - return $aTokens; - } - - function getClassTypes() { return array( @@@ -632,10 -584,10 +584,10 @@@ function geometryText2Points($geometry_ // preg_match_all('/(-?[0-9.]+) (-?[0-9.]+)/', $aMatch[1], $aPolyPoints, PREG_SET_ORDER); // - } elseif (preg_match('#MULTIPOLYGON\\(\\(\\(([- 0-9.,]+)#', $geometry_as_text, $aMatch)) { +/* } elseif (preg_match('#MULTIPOLYGON\\(\\(\\(([- 0-9.,]+)#', $geometry_as_text, $aMatch)) { // preg_match_all('/(-?[0-9.]+) (-?[0-9.]+)/', $aMatch[1], $aPolyPoints, PREG_SET_ORDER); - // + */ } elseif (preg_match('#POINT\\((-?[0-9.]+) (-?[0-9.]+)\\)#', $geometry_as_text, $aMatch)) { // $aPolyPoints = createPointsAroundCenter($aMatch[1], $aMatch[2], $fRadius);