From: Sarah Hoffmann <lonvia@denofr.de>
Date: Wed, 14 Jul 2021 20:17:17 +0000 (+0200)
Subject: remove special status of partial tokens
X-Git-Tag: v4.0.0~50^2~7
X-Git-Url: https://git.openstreetmap.org./nominatim.git/commitdiff_plain/143ff1446656ee995356c7a7b5eaa624140c70d8

remove special status of partial tokens

Full-word tokens are no longer marked by a space at the
beginning of the token. Use the new Partial token category
instead. This removes a couple of special cases that we don't
really need.

The word table still stores the leading space for compatibility
reasons, so the tokenizer code needs to strip it when loading the
tokens.
---

diff --git a/lib-php/Geocode.php b/lib-php/Geocode.php
index ec21a0dc..eda6df54 100644
--- a/lib-php/Geocode.php
+++ b/lib-php/Geocode.php
@@ -355,15 +355,15 @@ class Geocode
                     $aNewWordsetSearches = array();
 
                     foreach ($aWordsetSearches as $oCurrentSearch) {
-                        // Tokens with full name matches.
-                        foreach ($oValidTokens->get(' '.$sToken) as $oSearchTerm) {
-                            $aNewSearches = $oCurrentSearch->extendWithFullTerm(
+                        foreach ($oValidTokens->get($sToken) as $oSearchTerm) {
+                            $aNewSearches = $oCurrentSearch->extendWithSearchTerm(
+                                $sToken,
                                 $oSearchTerm,
                                 $sPhraseType,
                                 $iToken == 0 && $iPhrase == 0,
-                                $iPhrase == 0,
                                 $iToken + 1 == count($aWordset)
-                                  && $iPhrase + 1 == count($aPhrases)
+                                  && $iPhrase + 1 == count($aPhrases),
+                                $iPhrase
                             );
 
                             foreach ($aNewSearches as $oSearch) {
@@ -372,27 +372,6 @@ class Geocode
                                 }
                             }
                         }
-                        // Look for partial matches.
-                        // Note that there is no point in adding country terms here
-                        // because country is omitted in the address.
-                        if ($sPhraseType != 'country') {
-                            // Allow searching for a word - but at extra cost
-                            foreach ($oValidTokens->get($sToken) as $oSearchTerm) {
-                                $aNewSearches = $oCurrentSearch->extendWithPartialTerm(
-                                    $sToken,
-                                    $oSearchTerm,
-                                    (bool) $sPhraseType,
-                                    $iPhrase,
-                                    $oValidTokens->get(' '.$sToken)
-                                );
-
-                                foreach ($aNewSearches as $oSearch) {
-                                    if ($oSearch->getRank() < $this->iMaxRank) {
-                                        $aNewWordsetSearches[] = $oSearch;
-                                    }
-                                }
-                            }
-                        }
                     }
                     // Sort and cut
                     usort($aNewWordsetSearches, array('Nominatim\SearchDescription', 'bySearchRank'));
diff --git a/lib-php/SearchDescription.php b/lib-php/SearchDescription.php
index 6091fd61..938beb61 100644
--- a/lib-php/SearchDescription.php
+++ b/lib-php/SearchDescription.php
@@ -152,17 +152,17 @@ class SearchDescription
     /**
      * Derive new searches by adding a full term to the existing search.
      *
-     * @param object $oSearchTerm  Description of the token.
-     * @param string $sPhraseType  Type of phrase the token is contained in.
-     * @param bool   $bFirstToken  True if the token is at the beginning of the
-     *                             query.
-     * @param bool   $bFirstPhrase True if the token is in the first phrase of
-     *                             the query.
-     * @param bool   $bLastToken   True if the token is at the end of the query.
+     * @param string  $sToken       Term for the token.
+     * @param object  $oSearchTerm  Description of the token.
+     * @param string  $sPhraseType  Type of phrase the token is contained in.
+     * @param bool    $bFirstToken  True if the token is at the beginning of the
+     *                              query.
+     * @param bool    $bLastToken   True if the token is at the end of the query.
+     * @param integer $iPhrase      Number of the phrase the token is in.
      *
      * @return SearchDescription[] List of derived search descriptions.
      */
-    public function extendWithFullTerm($oSearchTerm, $sPhraseType, $bFirstToken, $bFirstPhrase, $bLastToken)
+    public function extendWithSearchTerm($sToken, $oSearchTerm, $sPhraseType, $bFirstToken, $bLastToken, $iPhrase)
     {
         $aNewSearches = array();
 
@@ -295,8 +295,8 @@ class SearchDescription
             // of the phrase. In structured search the name must forcably in
             // the first phrase. In unstructured search it may be in a later
             // phrase when the first phrase is a house number.
-            if (!empty($this->aName) || !($bFirstPhrase || $sPhraseType == '')) {
-                if (($sPhraseType == '' || !$bFirstPhrase) && $oSearchTerm->iTermCount > 1) {
+            if (!empty($this->aName) || !($iPhrase == 0 || $sPhraseType == '')) {
+                if (($sPhraseType == '' || $iPhrase > 0) && $oSearchTerm->iTermCount > 1) {
                     $oSearch = clone $this;
                     $oSearch->iNamePhrase = -1;
                     $oSearch->iSearchRank += 1;
@@ -314,6 +314,16 @@ class SearchDescription
                 }
                 $aNewSearches[] = $oSearch;
             }
+        } elseif ($sPhraseType != 'country'
+                  && is_a($oSearchTerm, '\Nominatim\Token\Partial')
+                  && strpos($sToken, ' ') === false
+        ) {
+            $aNewSearches = $this->extendWithPartialTerm(
+                $sToken,
+                $oSearchTerm,
+                (bool) $sPhraseType,
+                $iPhrase
+            );
         }
 
         return $aNewSearches;
@@ -326,20 +336,11 @@ class SearchDescription
      * @param object  $oSearchTerm        Description of the token.
      * @param bool    $bStructuredPhrases True if the search is structured.
      * @param integer $iPhrase            Number of the phrase the token is in.
-     * @param array[] $aFullTokens        List of full term tokens with the
-     *                                    same name.
      *
      * @return SearchDescription[] List of derived search descriptions.
      */
-    public function extendWithPartialTerm($sToken, $oSearchTerm, $bStructuredPhrases, $iPhrase, $aFullTokens)
+    private function extendWithPartialTerm($sToken, $oSearchTerm, $bStructuredPhrases, $iPhrase)
     {
-        // Only allow name terms.
-        if (!(is_a($oSearchTerm, '\Nominatim\Token\Word'))
-            || strpos($sToken, ' ') !== false
-        ) {
-            return array();
-        }
-
         $aNewSearches = array();
         $iWordID = $oSearchTerm->iId;
 
@@ -355,9 +356,6 @@ class SearchDescription
                 $oSearch->aAddress[$iWordID] = $iWordID;
             } else {
                 $oSearch->aAddressNonSearch[$iWordID] = $iWordID;
-                if (!empty($aFullTokens)) {
-                    $oSearch->iSearchRank++;
-                }
             }
             $aNewSearches[] = $oSearch;
         }
@@ -385,9 +383,6 @@ class SearchDescription
                 }
                 $oSearch->aName[$iWordID] = $iWordID;
             } else {
-                if (!empty($aFullTokens)) {
-                    $oSearch->iSearchRank++;
-                }
                 $oSearch->aNameNonSearch[$iWordID] = $iWordID;
             }
             $oSearch->iNamePhrase = $iPhrase;
diff --git a/lib-php/TokenList.php b/lib-php/TokenList.php
index f310306d..bc8f9c3f 100644
--- a/lib-php/TokenList.php
+++ b/lib-php/TokenList.php
@@ -18,15 +18,6 @@ require_once(CONST_LibDir.'/SpecialSearchOperator.php');
  * tokens do not have a common base class. All tokens need to have a field
  * with the word id that points to an entry in the `word` database table
  * but otherwise the information saved about a token can be very different.
- *
- * There are two different kinds of token words: full words and partial terms.
- *
- * Full words start with a space. They represent a complete name of a place.
- * All special tokens are normally full words.
- *
- * Partial terms have no space at the beginning. They may represent a part of
- * a name of a place (e.g. in the name 'World Trade Center' a partial term
- * would be 'Trade' or 'Trade Center'). They are only used in TokenWord.
  */
 class TokenList
 {
@@ -65,7 +56,7 @@ class TokenList
      */
     public function containsAny($sWord)
     {
-        return isset($this->aTokens[$sWord]) || isset($this->aTokens[' '.$sWord]);
+        return isset($this->aTokens[$sWord]);
     }
 
     /**
@@ -87,7 +78,7 @@ class TokenList
 
         foreach ($this->aTokens as $aTokenList) {
             foreach ($aTokenList as $oToken) {
-                if (is_a($oToken, '\Nominatim\Token\Word') && !$oToken->bPartial) {
+                if (is_a($oToken, '\Nominatim\Token\Word')) {
                     $ids[$oToken->iId] = $oToken->iId;
                 }
             }
diff --git a/lib-php/tokenizer/legacy_icu_tokenizer.php b/lib-php/tokenizer/legacy_icu_tokenizer.php
index 8cff6f32..96a1d8a6 100644
--- a/lib-php/tokenizer/legacy_icu_tokenizer.php
+++ b/lib-php/tokenizer/legacy_icu_tokenizer.php
@@ -120,14 +120,14 @@ class Tokenizer
 
             // Try more interpretations for Tokens that could not be matched.
             foreach ($aTokens as $sToken) {
-                if ($sToken[0] == ' ' && !$oValidTokens->contains($sToken)) {
-                    if (preg_match('/^ ([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) {
+                if ($sToken[0] != ' ' && !$oValidTokens->contains($sToken)) {
+                    if (preg_match('/^([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) {
                         // US ZIP+4 codes - merge in the 5-digit ZIP code
                         $oValidTokens->addToken(
                             $sToken,
                             new Token\Postcode(null, $aData[1], 'us')
                         );
-                    } elseif (preg_match('/^ [0-9]+$/', $sToken)) {
+                    } elseif (preg_match('/^[0-9]+$/', $sToken)) {
                         // Unknown single word token with a number.
                         // Assume it is a house number.
                         $oValidTokens->addToken(
diff --git a/lib-php/tokenizer/legacy_tokenizer.php b/lib-php/tokenizer/legacy_tokenizer.php
index ec2d7e68..238fbcf4 100644
--- a/lib-php/tokenizer/legacy_tokenizer.php
+++ b/lib-php/tokenizer/legacy_tokenizer.php
@@ -137,14 +137,14 @@ class Tokenizer
 
             // Try more interpretations for Tokens that could not be matched.
             foreach ($aTokens as $sToken) {
-                if ($sToken[0] == ' ' && !$oValidTokens->contains($sToken)) {
-                    if (preg_match('/^ ([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) {
+                if ($sToken[0] != ' ' && !$oValidTokens->contains($sToken)) {
+                    if (preg_match('/^([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) {
                         // US ZIP+4 codes - merge in the 5-digit ZIP code
                         $oValidTokens->addToken(
                             $sToken,
                             new Token\Postcode(null, $aData[1], 'us')
                         );
-                    } elseif (preg_match('/^ [0-9]+$/', $sToken)) {
+                    } elseif (preg_match('/^[0-9]+$/', $sToken)) {
                         // Unknown single word token with a number.
                         // Assume it is a house number.
                         $oValidTokens->addToken(