From: Sarah Hoffmann <lonvia@denofr.de>
Date: Sun, 15 Oct 2017 16:08:25 +0000 (+0200)
Subject: Merge remote-tracking branch 'upstream/master'
X-Git-Tag: deploy~358
X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/4e99f8a3408d59f5c6bf8d934fda7bb3726f2da2?hp=-c

Merge remote-tracking branch 'upstream/master'
---

4e99f8a3408d59f5c6bf8d934fda7bb3726f2da2
diff --combined lib/Geocode.php
index 51b79735,be543012..306255b5
--- a/lib/Geocode.php
+++ b/lib/Geocode.php
@@@ -3,6 -3,7 +3,7 @@@
  namespace Nominatim;
  
  require_once(CONST_BasePath.'/lib/PlaceLookup.php');
+ require_once(CONST_BasePath.'/lib/Phrase.php');
  require_once(CONST_BasePath.'/lib/ReverseGeocode.php');
  require_once(CONST_BasePath.'/lib/SearchDescription.php');
  require_once(CONST_BasePath.'/lib/SearchContext.php');
@@@ -26,7 -27,7 +27,7 @@@ class Geocod
  
      protected $aExcludePlaceIDs = array();
      protected $bDeDupe = true;
 -    protected $bReverseInPlan = false;
 +    protected $bReverseInPlan = true;
  
      protected $iLimit = 20;
      protected $iFinalLimit = 10;
@@@ -668,7 -669,7 +669,7 @@@
          return $aSearchResults;
      }
  
-     public function getGroupedSearches($aSearches, $aPhraseTypes, $aPhrases, $aValidTokens, $aWordFrequencyScores, $bStructuredPhrases, $sNormQuery)
+     public function getGroupedSearches($aSearches, $aPhrases, $aValidTokens, $bIsStructured)
      {
          /*
               Calculate all searches using aValidTokens i.e.
@@@ -683,15 -684,11 +684,11 @@@
           */
          $iGlobalRank = 0;
  
-         foreach ($aPhrases as $iPhrase => $aPhrase) {
+         foreach ($aPhrases as $iPhrase => $oPhrase) {
              $aNewPhraseSearches = array();
-             if ($bStructuredPhrases) {
-                 $sPhraseType = $aPhraseTypes[$iPhrase];
-             } else {
-                 $sPhraseType = '';
-             }
+             $sPhraseType = $bIsStructured ? $oPhrase->getPhraseType() : '';
  
-             foreach ($aPhrase['wordsets'] as $iWordSet => $aWordset) {
+             foreach ($oPhrase->getWordSets() as $iWordSet => $aWordset) {
                  // Too many permutations - too expensive
                  if ($iWordSet > 120) break;
  
@@@ -710,17 -707,8 +707,8 @@@
                          // If the token is valid
                          if (isset($aValidTokens[' '.$sToken])) {
                              foreach ($aValidTokens[' '.$sToken] as $aSearchTerm) {
-                                 // Recheck if the original word shows up in the query.
-                                 $bWordInQuery = false;
-                                 if (isset($aSearchTerm['word']) && $aSearchTerm['word']) {
-                                     $bWordInQuery = strpos(
-                                         $sNormQuery,
-                                         $this->normTerm($aSearchTerm['word'])
-                                     ) !== false;
-                                 }
                                  $aNewSearches = $oCurrentSearch->extendWithFullTerm(
                                      $aSearchTerm,
-                                     $bWordInQuery,
                                      isset($aValidTokens[$sToken])
                                        && strpos($sToken, ' ') === false,
                                      $sPhraseType,
@@@ -746,9 -734,8 +734,8 @@@
                              foreach ($aValidTokens[$sToken] as $aSearchTerm) {
                                  $aNewSearches = $oCurrentSearch->extendWithPartialTerm(
                                      $aSearchTerm,
-                                     $bStructuredPhrases,
+                                     $bIsStructured,
                                      $iPhrase,
-                                     $aWordFrequencyScores,
                                      isset($aValidTokens[' '.$sToken]) ? $aValidTokens[' '.$sToken] : array()
                                  );
  
@@@ -806,7 -793,7 +793,7 @@@
          // Revisit searches, drop bad searches and give penalty to unlikely combinations.
          $aGroupedSearches = array();
          foreach ($aSearches as $oSearch) {
-             if (!$oSearch->isValidSearch($this->aCountryCodes)) {
+             if (!$oSearch->isValidSearch()) {
                  continue;
              }
  
@@@ -955,10 -942,10 +942,10 @@@
              // Split query into phrases
              // Commas are used to reduce the search space by indicating where phrases split
              if ($this->aStructuredQuery) {
-                 $aPhrases = $this->aStructuredQuery;
+                 $aInPhrases = $this->aStructuredQuery;
                  $bStructuredPhrases = true;
              } else {
-                 $aPhrases = explode(',', $sQuery);
+                 $aInPhrases = explode(',', $sQuery);
                  $bStructuredPhrases = false;
              }
  
@@@ -967,25 -954,19 +954,19 @@@
              // Get all 'sets' of words
              // Generate a complete list of all
              $aTokens = array();
-             foreach ($aPhrases as $iPhrase => $sPhrase) {
-                 $aPhrase = chksql(
-                     $this->oDB->getRow("SELECT make_standard_name('".pg_escape_string($sPhrase)."') as string"),
+             $aPhrases = array();
+             foreach ($aInPhrases as $iPhrase => $sPhrase) {
+                 $sPhrase = chksql(
+                     $this->oDB->getOne('SELECT make_standard_name('.getDBQuoted($sPhrase).')'),
                      "Cannot normalize query string (is it a UTF-8 string?)"
                  );
-                 if (trim($aPhrase['string'])) {
-                     $aPhrases[$iPhrase] = $aPhrase;
-                     $aPhrases[$iPhrase]['words'] = explode(' ', $aPhrases[$iPhrase]['string']);
-                     $aPhrases[$iPhrase]['wordsets'] = getWordSets($aPhrases[$iPhrase]['words'], 0);
-                     $aTokens = array_merge($aTokens, getTokensFromSets($aPhrases[$iPhrase]['wordsets']));
-                 } else {
-                     unset($aPhrases[$iPhrase]);
+                 if (trim($sPhrase)) {
+                     $oPhrase = new Phrase($sPhrase, is_string($iPhrase) ? $iPhrase : '');
+                     $oPhrase->addTokens($aTokens);
+                     $aPhrases[] = $oPhrase;
                  }
              }
  
-             // Reindex phrases - we make assumptions later on that they are numerically keyed in order
-             $aPhraseTypes = array_keys($aPhrases);
-             $aPhrases = array_values($aPhrases);
- 
              if (sizeof($aTokens)) {
                  // Check which tokens we have, get the ID numbers
                  $sSQL = 'SELECT word_id, word_token, word, class, type, country_code, operator, search_name_count';
@@@ -999,22 -980,29 +980,29 @@@
                      $this->oDB->getAll($sSQL),
                      "Could not get word tokens."
                  );
-                 $aPossibleMainWordIDs = array();
                  $aWordFrequencyScores = array();
                  foreach ($aDatabaseWords as $aToken) {
-                     // Very special case - require 2 letter country param to match the country code found
-                     if ($bStructuredPhrases && $aToken['country_code'] && !empty($this->aStructuredQuery['country'])
-                         && strlen($this->aStructuredQuery['country']) == 2 && strtolower($this->aStructuredQuery['country']) != $aToken['country_code']
+                     // Filter country tokens that do not match restricted countries.
+                     if ($this->aCountryCodes
+                         && $aToken['country_code']
+                         && !in_array($aToken['country_code'], $this->aCountryCodes)
                      ) {
                          continue;
                      }
  
+                     // Special terms need to appear in their normalized form.
+                     if ($aToken['word'] && $aToken['class']) {
+                         $sNormWord = $this->normTerm($aToken['word']);
+                         if (strpos($sNormQuery, $sNormWord) === false) {
+                             continue;
+                         }
+                     }
+ 
                      if (isset($aValidTokens[$aToken['word_token']])) {
                          $aValidTokens[$aToken['word_token']][] = $aToken;
                      } else {
                          $aValidTokens[$aToken['word_token']] = array($aToken);
                      }
-                     if (!$aToken['class'] && !$aToken['country_code']) $aPossibleMainWordIDs[$aToken['word_id']] = 1;
                      $aWordFrequencyScores[$aToken['word_id']] = $aToken['search_name_count'] + 1;
                  }
                  if (CONST_Debug) var_Dump($aPhrases, $aValidTokens);
@@@ -1046,19 -1034,18 +1034,18 @@@
                  // Any words that have failed completely?
                  // TODO: suggestions
  
-                 $aGroupedSearches = $this->getGroupedSearches($aSearches, $aPhraseTypes, $aPhrases, $aValidTokens, $aWordFrequencyScores, $bStructuredPhrases, $sNormQuery);
+                 $aGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $aValidTokens, $bStructuredPhrases);
  
                  if ($this->bReverseInPlan) {
                      // Reverse phrase array and also reverse the order of the wordsets in
                      // the first and final phrase. Don't bother about phrases in the middle
                      // because order in the address doesn't matter.
                      $aPhrases = array_reverse($aPhrases);
-                     $aPhrases[0]['wordsets'] = getInverseWordSets($aPhrases[0]['words'], 0);
+                     $aPhrases[0]->invertWordSets();
                      if (sizeof($aPhrases) > 1) {
-                         $aFinalPhrase = end($aPhrases);
-                         $aPhrases[sizeof($aPhrases)-1]['wordsets'] = getInverseWordSets($aFinalPhrase['words'], 0);
+                         $aPhrases[sizeof($aPhrases)-1]->invertWordSets();
                      }
-                     $aReverseGroupedSearches = $this->getGroupedSearches($aSearches, null, $aPhrases, $aValidTokens, $aWordFrequencyScores, false, $sNormQuery);
+                     $aReverseGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $aValidTokens, false);
  
                      foreach ($aGroupedSearches as $aSearches) {
                          foreach ($aSearches as $aSearch) {
@@@ -1288,8 -1275,7 +1275,7 @@@
  
              $aResult['name'] = $aResult['langaddress'];
  
-             if ($oCtx->hasNearPoint())
-             {
+             if ($oCtx->hasNearPoint()) {
                  $aResult['importance'] = 0.001;
                  $aResult['foundorder'] = $aResult['addressimportance'];
              } else {
diff --combined lib/lib.php
index e4a343d1,76775d6c..3db3a825
--- a/lib/lib.php
+++ b/lib/lib.php
@@@ -60,54 -60,6 +60,6 @@@ function byImportance($a, $b
  }
  
  
- function getWordSets($aWords, $iDepth)
- {
-     $aResult = array(array(join(' ', $aWords)));
-     $sFirstToken = '';
-     if ($iDepth < 7) {
-         while (sizeof($aWords) > 1) {
-             $sWord = array_shift($aWords);
-             $sFirstToken .= ($sFirstToken?' ':'').$sWord;
-             $aRest = getWordSets($aWords, $iDepth+1);
-             foreach ($aRest as $aSet) {
-                 $aResult[] = array_merge(array($sFirstToken), $aSet);
-             }
-         }
-     }
-     return $aResult;
- }
- 
- function getInverseWordSets($aWords, $iDepth)
- {
-     $aResult = array(array(join(' ', $aWords)));
-     $sFirstToken = '';
-     if ($iDepth < 8) {
-         while (sizeof($aWords) > 1) {
-             $sWord = array_pop($aWords);
-             $sFirstToken = $sWord.($sFirstToken?' ':'').$sFirstToken;
-             $aRest = getInverseWordSets($aWords, $iDepth+1);
-             foreach ($aRest as $aSet) {
-                 $aResult[] = array_merge(array($sFirstToken), $aSet);
-             }
-         }
-     }
-     return $aResult;
- }
- 
- 
- function getTokensFromSets($aSets)
- {
-     $aTokens = array();
-     foreach ($aSets as $aSet) {
-         foreach ($aSet as $sWord) {
-             $aTokens[' '.$sWord] = ' '.$sWord;
-             $aTokens[$sWord] = $sWord;
-         }
-     }
-     return $aTokens;
- }
- 
- 
  function getClassTypes()
  {
      return array(
@@@ -632,10 -584,10 +584,10 @@@ function geometryText2Points($geometry_
          //
          preg_match_all('/(-?[0-9.]+) (-?[0-9.]+)/', $aMatch[1], $aPolyPoints, PREG_SET_ORDER);
          //
 -    } elseif (preg_match('#MULTIPOLYGON\\(\\(\\(([- 0-9.,]+)#', $geometry_as_text, $aMatch)) {
 +/*    } elseif (preg_match('#MULTIPOLYGON\\(\\(\\(([- 0-9.,]+)#', $geometry_as_text, $aMatch)) {
          //
          preg_match_all('/(-?[0-9.]+) (-?[0-9.]+)/', $aMatch[1], $aPolyPoints, PREG_SET_ORDER);
 -        //
 +        */
      } elseif (preg_match('#POINT\\((-?[0-9.]+) (-?[0-9.]+)\\)#', $geometry_as_text, $aMatch)) {
          //
          $aPolyPoints = createPointsAroundCenter($aMatch[1], $aMatch[2], $fRadius);