git.openstreetmap.org Git - nominatim.git/commitdiff
Merge pull request #2396 from lonvia/partial-word-token
author Sarah Hoffmann <lonvia@denofr.de>
Mon, 19 Jul 2021 07:42:37 +0000 (09:42 +0200)
committer GitHub <noreply@github.com>
Mon, 19 Jul 2021 07:42:37 +0000 (09:42 +0200)
Reorganise code that builds the SearchDescription

12 files changed:
lib-php/Geocode.php
lib-php/SearchDescription.php
lib-php/SearchPosition.php [new file with mode: 0644]
lib-php/TokenCountry.php
lib-php/TokenHousenumber.php
lib-php/TokenList.php
lib-php/TokenPartial.php [new file with mode: 0644]
lib-php/TokenPostcode.php
lib-php/TokenSpecialTerm.php
lib-php/TokenWord.php
lib-php/tokenizer/legacy_icu_tokenizer.php
lib-php/tokenizer/legacy_tokenizer.php

index ec21a0dcd79d498d6f3943ae8e77ce78977b87bf..52b92c9928770f3baac9d4b162d90f9d1c904748 100644 (file)
@@ -7,6 +7,7 @@ require_once(CONST_LibDir.'/Phrase.php');
 require_once(CONST_LibDir.'/ReverseGeocode.php');
 require_once(CONST_LibDir.'/SearchDescription.php');
 require_once(CONST_LibDir.'/SearchContext.php');
+require_once(CONST_LibDir.'/SearchPosition.php');
 require_once(CONST_LibDir.'/TokenList.php');
 require_once(CONST_TokenizerDir.'/tokenizer.php');
 
@@ -345,7 +346,11 @@ class Geocode
          */
         foreach ($aPhrases as $iPhrase => $oPhrase) {
             $aNewPhraseSearches = array();
-            $sPhraseType = $oPhrase->getPhraseType();
+            $oPosition = new SearchPosition(
+                $oPhrase->getPhraseType(),
+                $iPhrase,
+                count($aPhrases)
+            );
 
             foreach ($oPhrase->getWordSets() as $aWordset) {
                 $aWordsetSearches = $aSearches;
@@ -353,37 +358,14 @@ class Geocode
                 // Add all words from this wordset
                 foreach ($aWordset as $iToken => $sToken) {
                     $aNewWordsetSearches = array();
+                    $oPosition->setTokenPosition($iToken, count($aWordset));
 
                     foreach ($aWordsetSearches as $oCurrentSearch) {
-                        // Tokens with full name matches.
-                        foreach ($oValidTokens->get(' '.$sToken) as $oSearchTerm) {
-                            $aNewSearches = $oCurrentSearch->extendWithFullTerm(
-                                $oSearchTerm,
-                                $sPhraseType,
-                                $iToken == 0 && $iPhrase == 0,
-                                $iPhrase == 0,
-                                $iToken + 1 == count($aWordset)
-                                  && $iPhrase + 1 == count($aPhrases)
-                            );
-
-                            foreach ($aNewSearches as $oSearch) {
-                                if ($oSearch->getRank() < $this->iMaxRank) {
-                                    $aNewWordsetSearches[] = $oSearch;
-                                }
-                            }
-                        }
-                        // Look for partial matches.
-                        // Note that there is no point in adding country terms here
-                        // because country is omitted in the address.
-                        if ($sPhraseType != 'country') {
-                            // Allow searching for a word - but at extra cost
-                            foreach ($oValidTokens->get($sToken) as $oSearchTerm) {
-                                $aNewSearches = $oCurrentSearch->extendWithPartialTerm(
-                                    $sToken,
-                                    $oSearchTerm,
-                                    (bool) $sPhraseType,
-                                    $iPhrase,
-                                    $oValidTokens->get(' '.$sToken)
+                        foreach ($oValidTokens->get($sToken) as $oSearchTerm) {
+                            if ($oSearchTerm->isExtendable($oCurrentSearch, $oPosition)) {
+                                $aNewSearches = $oSearchTerm->extendSearch(
+                                    $oCurrentSearch,
+                                    $oPosition
                                 );
 
                                 foreach ($aNewSearches as $oSearch) {
@@ -573,15 +555,15 @@ class Geocode
 
                 if (!empty($aTokens)) {
                     $aNewSearches = array();
+                    $oPosition = new SearchPosition('', 0, 1);
+                    $oPosition->setTokenPosition(0, 1);
+
                     foreach ($aSearches as $oSearch) {
                         foreach ($aTokens as $oToken) {
-                            $oNewSearch = clone $oSearch;
-                            $oNewSearch->setPoiSearch(
-                                $oToken->iOperator,
-                                $oToken->sClass,
-                                $oToken->sType
+                            $aNewSearches = array_merge(
+                                $aNewSearches,
+                                $oToken->extendSearch($oSearch, $oPosition)
                             );
-                            $aNewSearches[] = $oNewSearch;
                         }
                     }
                     $aSearches = $aNewSearches;
index 6091fd617396b38cf55bd242fcf95d70de67c5c3..4d944bfb1ee498835c2d3178b2ea04d2439ab75d 100644 (file)
@@ -67,35 +67,6 @@ class SearchDescription
         return $this->iSearchRank;
     }
 
-    /**
-     * Make this search a POI search.
-     *
-     * In a POI search, objects are not (only) searched by their name
-     * but also by the primary OSM key/value pair (class and type in Nominatim).
-     *
-     * @param integer $iOperator Type of POI search
-     * @param string  $sClass    Class (or OSM tag key) of POI.
-     * @param string  $sType     Type (or OSM tag value) of POI.
-     *
-     * @return void
-     */
-    public function setPoiSearch($iOperator, $sClass, $sType)
-    {
-        $this->iOperator = $iOperator;
-        $this->sClass = $sClass;
-        $this->sType = $sType;
-    }
-
-    /**
-     * Check if any operator is set.
-     *
-     * @return bool True, if this is a special search operation.
-     */
-    public function hasOperator()
-    {
-        return $this->iOperator != Operator::NONE;
-    }
-
     /**
      * Extract key/value pairs from a query.
      *
@@ -148,253 +119,234 @@ class SearchDescription
 
     /////////// Search building functions
 
-
     /**
-     * Derive new searches by adding a full term to the existing search.
+     * Create a copy of this search description adding to search rank.
      *
-     * @param object $oSearchTerm  Description of the token.
-     * @param string $sPhraseType  Type of phrase the token is contained in.
-     * @param bool   $bFirstToken  True if the token is at the beginning of the
-     *                             query.
-     * @param bool   $bFirstPhrase True if the token is in the first phrase of
-     *                             the query.
-     * @param bool   $bLastToken   True if the token is at the end of the query.
+     * @param integer $iTermCost  Cost to add to the current search rank.
      *
-     * @return SearchDescription[] List of derived search descriptions.
+     * @return object Cloned search description.
      */
-    public function extendWithFullTerm($oSearchTerm, $sPhraseType, $bFirstToken, $bFirstPhrase, $bLastToken)
+    public function clone($iTermCost)
     {
-        $aNewSearches = array();
+        $oSearch = clone $this;
+        $oSearch->iSearchRank += $iTermCost;
 
-        if (($sPhraseType == '' || $sPhraseType == 'country')
-            && is_a($oSearchTerm, '\Nominatim\Token\Country')
-        ) {
-            if (!$this->sCountryCode) {
-                $oSearch = clone $this;
-                $oSearch->iSearchRank++;
-                $oSearch->sCountryCode = $oSearchTerm->sCountryCode;
-                // Country is almost always at the end of the string
-                // - increase score for finding it anywhere else (optimisation)
-                if (!$bLastToken) {
-                    $oSearch->iSearchRank += 5;
-                    $oSearch->iNamePhrase = -1;
-                }
-                $aNewSearches[] = $oSearch;
-            }
-        } elseif (($sPhraseType == '' || $sPhraseType == 'postalcode')
-                  && is_a($oSearchTerm, '\Nominatim\Token\Postcode')
-        ) {
-            if (!$this->sPostcode) {
-                // If we have structured search or this is the first term,
-                // make the postcode the primary search element.
-                if ($this->iOperator == Operator::NONE && $bFirstToken) {
-                    $oSearch = clone $this;
-                    $oSearch->iSearchRank++;
-                    $oSearch->iOperator = Operator::POSTCODE;
-                    $oSearch->aAddress = array_merge($this->aAddress, $this->aName);
-                    $oSearch->aName =
-                        array($oSearchTerm->iId => $oSearchTerm->sPostcode);
-                    $aNewSearches[] = $oSearch;
-                }
+        return $oSearch;
+    }
 
-                // If we have a structured search or this is not the first term,
-                // add the postcode as an addendum.
-                if ($this->iOperator != Operator::POSTCODE
-                    && ($sPhraseType == 'postalcode' || !empty($this->aName))
-                ) {
-                    $oSearch = clone $this;
-                    $oSearch->iSearchRank++;
-                    $oSearch->iNamePhrase = -1;
-                    if (strlen($oSearchTerm->sPostcode) < 4) {
-                        $oSearch->iSearchRank += 4 - strlen($oSearchTerm->sPostcode);
-                    }
-                    $oSearch->sPostcode = $oSearchTerm->sPostcode;
-                    $aNewSearches[] = $oSearch;
-                }
-            }
-        } elseif (($sPhraseType == '' || $sPhraseType == 'street')
-                 && is_a($oSearchTerm, '\Nominatim\Token\HouseNumber')
-        ) {
-            if (!$this->sHouseNumber && $this->iOperator != Operator::POSTCODE) {
-                // sanity check: if the housenumber is not mainly made
-                // up of numbers, add a penalty
-                $iSearchCost = 1;
-                if (preg_match('/\\d/', $oSearchTerm->sToken) === 0
-                    || preg_match_all('/[^0-9]/', $oSearchTerm->sToken, $aMatches) > 2) {
-                    $iSearchCost++;
-                }
-                if ($this->iOperator != Operator::NONE) {
-                    $iSearchCost++;
-                }
-                if (empty($oSearchTerm->iId)) {
-                    $iSearchCost++;
-                }
-                // also must not appear in the middle of the address
-                if (!empty($this->aAddress)
-                    || (!empty($this->aAddressNonSearch))
-                    || $this->sPostcode
-                ) {
-                    $iSearchCost++;
-                }
+    /**
+     * Check if the search currently includes a name.
+     *
+     * @param bool $bIncludeNonNames  If true, stop-word tokens are taken into
+     *                                account, too.
+     *
+     * @return bool True, if search has a name.
+     */
+    public function hasName($bIncludeNonNames = false)
+    {
+        return !empty($this->aName)
+               || (!empty($this->aNameNonSearch) && $bIncludeNonNames);
+    }
 
-                $oSearch = clone $this;
-                $oSearch->iSearchRank += $iSearchCost;
-                $oSearch->iNamePhrase = -1;
-                $oSearch->sHouseNumber = $oSearchTerm->sToken;
-                $aNewSearches[] = $oSearch;
-
-                // Housenumbers may appear in the name when the place has its own
-                // address terms.
-                if ($oSearchTerm->iId !== null
-                    && ($this->iNamePhrase >= 0 || empty($this->aName))
-                    && empty($this->aAddress)
-                   ) {
-                    $oSearch = clone $this;
-                    $oSearch->iSearchRank += $iSearchCost;
-                    $oSearch->aAddress = $this->aName;
-                    $oSearch->bRareName = false;
-                    $oSearch->aName = array($oSearchTerm->iId => $oSearchTerm->iId);
-                    $aNewSearches[] = $oSearch;
-                }
-            }
-        } elseif ($sPhraseType == ''
-                  && is_a($oSearchTerm, '\Nominatim\Token\SpecialTerm')
-        ) {
-            if ($this->iOperator == Operator::NONE) {
-                $oSearch = clone $this;
-                $oSearch->iSearchRank += 2;
-                $oSearch->iNamePhrase = -1;
-
-                $iOp = $oSearchTerm->iOperator;
-                if ($iOp == Operator::NONE) {
-                    if (!empty($this->aName) || $this->oContext->isBoundedSearch()) {
-                        $iOp = Operator::NAME;
-                    } else {
-                        $iOp = Operator::NEAR;
-                    }
-                    $oSearch->iSearchRank += 2;
-                } elseif (!$bFirstToken && !$bLastToken) {
-                    $oSearch->iSearchRank += 2;
-                }
-                if ($this->sHouseNumber) {
-                    $oSearch->iSearchRank++;
-                }
+    /**
+     * Check if the search currently includes an address term.
+     *
+     * @return bool True, if any address term is included, including stop-word
+     *              terms.
+     */
+    public function hasAddress()
+    {
+        return !empty($this->aAddress) || !empty($this->aAddressNonSearch);
+    }
 
-                $oSearch->setPoiSearch(
-                    $iOp,
-                    $oSearchTerm->sClass,
-                    $oSearchTerm->sType
-                );
-                $aNewSearches[] = $oSearch;
-            }
-        } elseif ($sPhraseType != 'country'
-                  && is_a($oSearchTerm, '\Nominatim\Token\Word')
-        ) {
-            $iWordID = $oSearchTerm->iId;
-            // Full words can only be a name if they appear at the beginning
-            // of the phrase. In structured search the name must forcably in
-            // the first phrase. In unstructured search it may be in a later
-            // phrase when the first phrase is a house number.
-            if (!empty($this->aName) || !($bFirstPhrase || $sPhraseType == '')) {
-                if (($sPhraseType == '' || !$bFirstPhrase) && $oSearchTerm->iTermCount > 1) {
-                    $oSearch = clone $this;
-                    $oSearch->iNamePhrase = -1;
-                    $oSearch->iSearchRank += 1;
-                    $oSearch->aAddress[$iWordID] = $iWordID;
-                    $aNewSearches[] = $oSearch;
-                }
-            } elseif (empty($this->aNameNonSearch)) {
-                $oSearch = clone $this;
-                $oSearch->iSearchRank++;
-                $oSearch->aName = array($iWordID => $iWordID);
-                if (CONST_Search_NameOnlySearchFrequencyThreshold) {
-                    $oSearch->bRareName =
-                        $oSearchTerm->iSearchNameCount
-                          < CONST_Search_NameOnlySearchFrequencyThreshold;
-                }
-                $aNewSearches[] = $oSearch;
-            }
-        }
+    /**
+     * Check if a country restriction is currently included in the search.
+     *
+     * @return bool True, if a country restriction is set.
+     */
+    public function hasCountry()
+    {
+        return $this->sCountryCode !== '';
+    }
 
-        return $aNewSearches;
+    /**
+     * Check if a postcode is currently included in the search.
+     *
+     * @return bool True, if a postcode is set.
+     */
+    public function hasPostcode()
+    {
+        return $this->sPostcode !== '';
     }
 
     /**
-     * Derive new searches by adding a partial term to the existing search.
+     * Check if a house number is set for the search.
      *
-     * @param string  $sToken             Term for the token.
-     * @param object  $oSearchTerm        Description of the token.
-     * @param bool    $bStructuredPhrases True if the search is structured.
-     * @param integer $iPhrase            Number of the phrase the token is in.
-     * @param array[] $aFullTokens        List of full term tokens with the
-     *                                    same name.
+     * @return bool True, if a house number is set.
+     */
+    public function hasHousenumber()
+    {
+        return $this->sHouseNumber !== '';
+    }
+
+    /**
+     * Check if a special type of place is requested.
      *
-     * @return SearchDescription[] List of derived search descriptions.
+     * @param integer $iOperator  When set, check for the particular
+     *                            operator used for the special type.
+     *
+     * @return bool True, if a special type is requested or, when an operator
+     *              is given, if the search uses exactly that operator.
      */
-    public function extendWithPartialTerm($sToken, $oSearchTerm, $bStructuredPhrases, $iPhrase, $aFullTokens)
+    public function hasOperator($iOperator = null)
     {
-        // Only allow name terms.
-        if (!(is_a($oSearchTerm, '\Nominatim\Token\Word'))
-            || strpos($sToken, ' ') !== false
-        ) {
-            return array();
+        return $iOperator === null ? $this->iOperator != Operator::NONE : $this->iOperator == $iOperator;
+    }
+
+    /**
+     * Add the given token to the list of terms to search for in the address.
+     *
+     * @param integer $iId       ID of term to add.
+     * @param bool $bSearchable  Term should be used to search for result
+     *                           (i.e. term is not a stop word).
+     */
+    public function addAddressToken($iId, $bSearchable = true)
+    {
+        if ($bSearchable) {
+            $this->aAddress[$iId] = $iId;
+        } else {
+            $this->aAddressNonSearch[$iId] = $iId;
         }
+    }
 
-        $aNewSearches = array();
-        $iWordID = $oSearchTerm->iId;
+    /**
+     * Add the given full-word token to the list of terms to search for in the
+     * name.
+     *
+     * @param integer $iId     ID of term to add.
+     * @param bool $bRareName  True if the term is infrequent enough to not
+     *                         require other constraints for efficient search.
+     */
+    public function addNameToken($iId, $bRareName)
+    {
+        $this->aName[$iId] = $iId;
+        $this->bRareName = $bRareName;
+    }
 
-        if ((!$bStructuredPhrases || $iPhrase > 0)
-            && (!empty($this->aName))
-        ) {
-            $oSearch = clone $this;
-            $oSearch->iSearchRank++;
-            if (preg_match('#^[0-9 ]+$#', $sToken)) {
-                $oSearch->iSearchRank++;
-            }
-            if ($oSearchTerm->iSearchNameCount < CONST_Max_Word_Frequency) {
-                $oSearch->aAddress[$iWordID] = $iWordID;
-            } else {
-                $oSearch->aAddressNonSearch[$iWordID] = $iWordID;
-                if (!empty($aFullTokens)) {
-                    $oSearch->iSearchRank++;
-                }
-            }
-            $aNewSearches[] = $oSearch;
+    /**
+     * Add the given partial token to the list of terms to search for in
+     * the name.
+     *
+     * @param integer $iId            ID of term to add.
+     * @param bool $bSearchable       Term should be used to search for result
+     *                                (i.e. term is not a stop word).
+     * @param integer $iPhraseNumber  Index of phrase, where the partial term
+     *                                appears.
+     */
+    public function addPartialNameToken($iId, $bSearchable, $iPhraseNumber)
+    {
+        if ($bSearchable) {
+            $this->aName[$iId] = $iId;
+        } else {
+            $this->aNameNonSearch[$iId] = $iId;
         }
+        $this->iNamePhrase = $iPhraseNumber;
+    }
 
-        if ((!$this->sPostcode && !$this->aAddress && !$this->aAddressNonSearch)
-            && ((empty($this->aName) && empty($this->aNameNonSearch)) || $this->iNamePhrase == $iPhrase)
-        ) {
-            $oSearch = clone $this;
-            $oSearch->iSearchRank++;
-            if (empty($this->aName) && empty($this->aNameNonSearch)) {
-                $oSearch->iSearchRank++;
-            }
-            if (preg_match('#^[0-9 ]+$#', $sToken)) {
-                $oSearch->iSearchRank++;
-            }
-            if ($oSearchTerm->iSearchNameCount < CONST_Max_Word_Frequency) {
-                if (empty($this->aName)
-                    && CONST_Search_NameOnlySearchFrequencyThreshold
-                ) {
-                    $oSearch->bRareName =
-                        $oSearchTerm->iSearchNameCount
-                          < CONST_Search_NameOnlySearchFrequencyThreshold;
-                } else {
-                    $oSearch->bRareName = false;
-                }
-                $oSearch->aName[$iWordID] = $iWordID;
-            } else {
-                if (!empty($aFullTokens)) {
-                    $oSearch->iSearchRank++;
-                }
-                $oSearch->aNameNonSearch[$iWordID] = $iWordID;
-            }
-            $oSearch->iNamePhrase = $iPhrase;
-            $aNewSearches[] = $oSearch;
-        }
+    /**
+     * Set country restriction for the search.
+     *
+     * @param string $sCountryCode  Country code of country to restrict search to.
+     */
+    public function setCountry($sCountryCode)
+    {
+        $this->sCountryCode = $sCountryCode;
+        $this->iNamePhrase = -1;
+    }
+
+    /**
+     * Set postcode search constraint.
+     *
+     * @param string $sPostcode  Postcode the result should have.
+     */
+    public function setPostcode($sPostcode)
+    {
+        $this->sPostcode = $sPostcode;
+        $this->iNamePhrase = -1;
+    }
+
+    /**
+     * Make this search a search for a postcode object.
+     *
+     * @param integer $iId       Token Id for the postcode.
+     * @param string $sPostcode  Postcode to look for.
+     */
+    public function setPostcodeAsName($iId, $sPostcode)
+    {
+        $this->iOperator = Operator::POSTCODE;
+        $this->aAddress = array_merge($this->aAddress, $this->aName);
+        $this->aName = array($iId => $sPostcode);
+        $this->bRareName = true;
+        $this->iNamePhrase = -1;
+    }
+
+    /**
+     * Set house number search constraint.
+     *
+     * @param string $sNumber  House number the result should have.
+     */
+    public function setHousenumber($sNumber)
+    {
+        $this->sHouseNumber = $sNumber;
+        $this->iNamePhrase = -1;
+    }
+
+    /**
+     * Make this search a search for a house number.
+     *
+     * @param integer $iId  Token Id for the house number.
+     */
+    public function setHousenumberAsName($iId)
+    {
+        $this->aAddress = array_merge($this->aAddress, $this->aName);
+        $this->bRareName = false;
+        $this->aName = array($iId => $iId);
+        $this->iNamePhrase = -1;
+    }
+
+    /**
+     * Make this search a POI search.
+     *
+     * In a POI search, objects are not (only) searched by their name
+     * but also by the primary OSM key/value pair (class and type in Nominatim).
+     *
+     * @param integer $iOperator Type of POI search
+     * @param string  $sClass    Class (or OSM tag key) of POI.
+     * @param string  $sType     Type (or OSM tag value) of POI.
+     *
+     * @return void
+     */
+    public function setPoiSearch($iOperator, $sClass, $sType)
+    {
+        $this->iOperator = $iOperator;
+        $this->sClass = $sClass;
+        $this->sType = $sType;
+        $this->iNamePhrase = -1;
+    }
+
+    public function getNamePhrase()
+    {
+        return $this->iNamePhrase;
+    }
 
-        return $aNewSearches;
+    /**
+     * Get the global search context.
+     *
+     * @return object  Objects of global search constraints.
+     */
+    public function getContext()
+    {
+        return $this->oContext;
     }
 
     /////////// Query functions
diff --git a/lib-php/SearchPosition.php b/lib-php/SearchPosition.php
new file mode 100644 (file)
index 0000000..e4260bf
--- /dev/null
@@ -0,0 +1,87 @@
+<?php
+
+namespace Nominatim;
+
+/**
+ * Description of the position of a token within a query.
+ */
+class SearchPosition
+{
+    private $sPhraseType;
+
+    private $iPhrase;
+    private $iNumPhrases;
+
+    private $iToken;
+    private $iNumTokens;
+
+
+    public function __construct($sPhraseType, $iPhrase, $iNumPhrases)
+    {
+        $this->sPhraseType = $sPhraseType;
+        $this->iPhrase = $iPhrase;
+        $this->iNumPhrases = $iNumPhrases;
+    }
+
+    public function setTokenPosition($iToken, $iNumTokens)
+    {
+        $this->iToken = $iToken;
+        $this->iNumTokens = $iNumTokens;
+    }
+
+    /**
+     * Check if the phrase can be of the given type.
+     *
+     * @param string  $sType  Type of phrase requested.
+     *
+     * @return bool True if the phrase is untyped or of the given type.
+     */
+    public function maybePhrase($sType)
+    {
+        return $this->sPhraseType == '' || $this->sPhraseType == $sType;
+    }
+
+    /**
+     * Check if the phrase is exactly of the given type.
+     *
+     * @param string  $sType  Type of phrase requested.
+     *
+     * @return bool True if the phrase is of the given type.
+     */
+    public function isPhrase($sType)
+    {
+        return $this->sPhraseType == $sType;
+    }
+
+    /**
+     * Return true if the token is the very first in the query.
+     */
+    public function isFirstToken()
+    {
+        return $this->iPhrase == 0 && $this->iToken == 0;
+    }
+
+    /**
+     * Check if the token is the final one in the query.
+     */
+    public function isLastToken()
+    {
+        return $this->iToken + 1 == $this->iNumTokens && $this->iPhrase + 1 == $this->iNumPhrases;
+    }
+
+    /**
+     * Check if the current token is part of the first phrase in the query.
+     */
+    public function isFirstPhrase()
+    {
+        return $this->iPhrase == 0;
+    }
+
+    /**
+     * Get the phrase position in the query.
+     */
+    public function getPhrase()
+    {
+        return $this->iPhrase;
+    }
+}
index 518c0a31e3df225c37a19073946dd6c0dfd11035..c9b7b6af1a93b1e778f9721397dec8760809c005 100644 (file)
@@ -8,9 +8,9 @@ namespace Nominatim\Token;
 class Country
 {
     /// Database word id, if available.
-    public $iId;
+    private $iId;
     /// Two-letter country code (lower-cased).
-    public $sCountryCode;
+    private $sCountryCode;
 
     public function __construct($iId, $sCountryCode)
     {
@@ -18,6 +18,44 @@ class Country
         $this->sCountryCode = $sCountryCode;
     }
 
+    public function getId()
+    {
+        return $this->iId;
+    }
+
+    /**
+     * Check if the token can be added to the given search.
+     * The search itself is not modified by this check.
+     *
+     * @param object  $oSearch      Partial search description derived so far.
+     * @param object  $oPosition    Description of the token position within
+     *                              the query.
+     *
+     * @return bool True if the token is compatible with the search configuration
+     *              given the position.
+     */
+    public function isExtendable($oSearch, $oPosition)
+    {
+        return !$oSearch->hasCountry() && $oPosition->maybePhrase('country');
+    }
+
+    /**
+     * Derive new searches by adding this token to an existing search.
+     *
+     * @param object  $oSearch      Partial search description derived so far.
+     * @param object  $oPosition    Description of the token position within
+                                    the query.
+     *
+     * @return SearchDescription[] List of derived search descriptions.
+     */
+    public function extendSearch($oSearch, $oPosition)
+    {
+        $oNewSearch = $oSearch->clone($oPosition->isLastToken() ? 1 : 6);
+        $oNewSearch->setCountry($this->sCountryCode);
+
+        return array($oNewSearch);
+    }
+
     public function debugInfo()
     {
         return array(
@@ -26,4 +64,9 @@ class Country
                 'Info' => $this->sCountryCode
                );
     }
+
+    public function debugCode()
+    {
+        return 'C';
+    }
 }
index 5c7c6e9b633a4af458acbc637249ffad2453f7ac..cd60d3ca5620b7851a36736971adf753c6db49f9 100644 (file)
@@ -8,9 +8,9 @@ namespace Nominatim\Token;
 class HouseNumber
 {
     /// Database word id, if available.
-    public $iId;
+    private $iId;
     /// Normalized house number.
-    public $sToken;
+    private $sToken;
 
     public function __construct($iId, $sToken)
     {
@@ -18,6 +18,80 @@ class HouseNumber
         $this->sToken = $sToken;
     }
 
+    public function getId()
+    {
+        return $this->iId;
+    }
+
+    /**
+     * Check if the token can be added to the given search.
+     * The search itself is not modified by this check.
+     *
+     * @param object  $oSearch      Partial search description derived so far.
+     * @param object  $oPosition    Description of the token position within
+     *                              the query.
+     *
+     * @return bool True if the token is compatible with the search configuration
+     *              given the position.
+     */
+    public function isExtendable($oSearch, $oPosition)
+    {
+        return !$oSearch->hasHousenumber()
+               && !$oSearch->hasOperator(\Nominatim\Operator::POSTCODE)
+               && $oPosition->maybePhrase('street');
+    }
+
+    /**
+     * Derive new searches by adding this token to an existing search.
+     *
+     * @param object  $oSearch      Partial search description derived so far.
+     * @param object  $oPosition    Description of the token position within
+     *                              the query.
+     *
+     * @return SearchDescription[] List of derived search descriptions.
+     */
+    public function extendSearch($oSearch, $oPosition)
+    {
+        $aNewSearches = array();
+
+        // sanity check: if the housenumber is not mainly made
+        // up of numbers, add a penalty
+        $iSearchCost = 1;
+        if (preg_match('/\\d/', $this->sToken) === 0
+            || preg_match_all('/[^0-9]/', $this->sToken, $aMatches) > 2) {
+            $iSearchCost++;
+        }
+        if (!$oSearch->hasOperator(\Nominatim\Operator::NONE)) {
+            $iSearchCost++;
+        }
+        if (empty($this->iId)) {
+            $iSearchCost++;
+        }
+        // also must not appear in the middle of the address
+        if ($oSearch->hasAddress() || $oSearch->hasPostcode()) {
+            $iSearchCost++;
+        }
+
+        $oNewSearch = $oSearch->clone($iSearchCost);
+        $oNewSearch->setHousenumber($this->sToken);
+        $aNewSearches[] = $oNewSearch;
+
+        // Housenumbers may appear in the name when the place has its own
+        // address terms.
+        if ($this->iId !== null
+            && ($oSearch->getNamePhrase() >= 0 || !$oSearch->hasName())
+            && !$oSearch->hasAddress()
+        ) {
+            $oNewSearch = $oSearch->clone($iSearchCost);
+            $oNewSearch->setHousenumberAsName($this->iId);
+
+            $aNewSearches[] = $oNewSearch;
+        }
+
+        return $aNewSearches;
+    }
+
+
     public function debugInfo()
     {
         return array(
@@ -26,4 +100,9 @@ class HouseNumber
                 'Info' => array('nr' => $this->sToken)
                );
     }
+
+    public function debugCode()
+    {
+        return 'H';
+    }
 }
index 2df9fe0586710f120c821b09f809f286cd616f44..a599648c21acdb48191c684f4f94c41950e2ae8c 100644 (file)
@@ -7,6 +7,7 @@ require_once(CONST_LibDir.'/TokenHousenumber.php');
 require_once(CONST_LibDir.'/TokenPostcode.php');
 require_once(CONST_LibDir.'/TokenSpecialTerm.php');
 require_once(CONST_LibDir.'/TokenWord.php');
+require_once(CONST_LibDir.'/TokenPartial.php');
 require_once(CONST_LibDir.'/SpecialSearchOperator.php');
 
 /**
@@ -17,15 +18,6 @@ require_once(CONST_LibDir.'/SpecialSearchOperator.php');
  * tokens do not have a common base class. All tokens need to have a field
  * with the word id that points to an entry in the `word` database table
  * but otherwise the information saved about a token can be very different.
- *
- * There are two different kinds of token words: full words and partial terms.
- *
- * Full words start with a space. They represent a complete name of a place.
- * All special tokens are normally full words.
- *
- * Partial terms have no space at the beginning. They may represent a part of
- * a name of a place (e.g. in the name 'World Trade Center' a partial term
- * would be 'Trade' or 'Trade Center'). They are only used in TokenWord.
  */
 class TokenList
 {
@@ -64,7 +56,7 @@ class TokenList
      */
     public function containsAny($sWord)
     {
-        return isset($this->aTokens[$sWord]) || isset($this->aTokens[' '.$sWord]);
+        return isset($this->aTokens[$sWord]);
     }
 
     /**
@@ -86,8 +78,8 @@ class TokenList
 
         foreach ($this->aTokens as $aTokenList) {
             foreach ($aTokenList as $oToken) {
-                if (is_a($oToken, '\Nominatim\Token\Word') && !$oToken->bPartial) {
-                    $ids[$oToken->iId] = $oToken->iId;
+                if (is_a($oToken, '\Nominatim\Token\Word')) {
+                    $ids[$oToken->getId()] = $oToken->getId();
                 }
             }
         }
@@ -117,9 +109,9 @@ class TokenList
         $aWordsIDs = array();
         foreach ($this->aTokens as $sToken => $aWords) {
             foreach ($aWords as $aToken) {
-                if ($aToken->iId !== null) {
-                    $aWordsIDs[$aToken->iId] =
-                        '#'.$sToken.'('.$aToken->iId.')#';
+                $iId = $aToken->getId();
+                if ($iId !== null) {
+                    $aWordsIDs[$iId] = '#'.$sToken.'('.$aToken->debugCode().' '.$iId.')#';
                 }
             }
         }
diff --git a/lib-php/TokenPartial.php b/lib-php/TokenPartial.php
new file mode 100644 (file)
index 0000000..131bb2a
--- /dev/null
@@ -0,0 +1,118 @@
+<?php
+
+namespace Nominatim\Token;
+
+/**
+ * A partial word token, i.e. a term that may represent only part of a place name.
+ */
+class Partial
+{
+    /// Database word id, if applicable.
+    private $iId;
+    /// Number of appearances in the database.
+    private $iSearchNameCount;
+    /// True, if the token consists exclusively of digits and spaces.
+    private $bNumberToken;
+
+    public function __construct($iId, $sToken, $iSearchNameCount)
+    {
+        $this->iId = $iId;
+        $this->bNumberToken = (bool) preg_match('#^[0-9 ]+$#', $sToken);
+        $this->iSearchNameCount = $iSearchNameCount;
+    }
+
+    public function getId()
+    {
+        return $this->iId;
+    }
+
+    /**
+     * Check if the token can be added to the given search.
+     * No searches are derived here; extendSearch() does that.
+     *
+     * @param object  $oSearch      Partial search description derived so far.
+     * @param object  $oPosition    Description of the token position within
+                                    the query.
+     *
+     * @return True if the token is compatible with the search configuration
+     *         given the position.
+     */
+    public function isExtendable($oSearch, $oPosition)
+    {
+        return !$oPosition->isPhrase('country');
+    }
+
+    /**
+     * Derive new searches by adding this token to an existing search.
+     *
+     * @param object  $oSearch      Partial search description derived so far.
+     * @param object  $oPosition    Description of the token position within
+                                    the query.
+     *
+     * @return SearchDescription[] List of derived search descriptions.
+     */
+    public function extendSearch($oSearch, $oPosition)
+    {
+        $aNewSearches = array();
+
+        // Partial token in Address.
+        if (($oPosition->isPhrase('') || !$oPosition->isFirstPhrase())
+            && $oSearch->hasName()
+        ) {
+            $iSearchCost = $this->bNumberToken ? 2 : 1;
+            if ($this->iSearchNameCount >= CONST_Max_Word_Frequency) {
+                $iSearchCost += 1;
+            }
+
+            $oNewSearch = $oSearch->clone($iSearchCost);
+            $oNewSearch->addAddressToken(
+                $this->iId,
+                $this->iSearchNameCount < CONST_Max_Word_Frequency
+            );
+
+            $aNewSearches[] = $oNewSearch;
+        }
+
+        // Partial token in Name.
+        if ((!$oSearch->hasPostcode() && !$oSearch->hasAddress())
+            && (!$oSearch->hasName(true)
+                || $oSearch->getNamePhrase() == $oPosition->getPhrase())
+        ) {
+            $iSearchCost = 1;
+            if (!$oSearch->hasName(true)) {
+                $iSearchCost += 1;
+            }
+            if ($this->bNumberToken) {
+                $iSearchCost += 1;
+            }
+
+            $oNewSearch = $oSearch->clone($iSearchCost);
+            $oNewSearch->addPartialNameToken(
+                $this->iId,
+                $this->iSearchNameCount < CONST_Max_Word_Frequency,
+                $oPosition->getPhrase()
+            );
+
+            $aNewSearches[] = $oNewSearch;
+        }
+
+        return $aNewSearches;
+    }
+
+
+    public function debugInfo()
+    {
+        return array(
+                'ID' => $this->iId,
+                'Type' => 'partial',
+                'Info' => array(
+                           'count' => $this->iSearchNameCount
+                          )
+               );
+    }
+
+    public function debugCode()
+    {
+        return 'w';
+    }
+}
index 8fa2ae8021d1bfbed459fb0c546d379271f1188c..c0b42fad5ae3fab4b4360f153806e4365f20b357 100644 (file)
@@ -8,11 +8,11 @@ namespace Nominatim\Token;
 class Postcode
 {
     /// Database word id, if available.
-    public $iId;
+    private $iId;
     /// Full normalized postcode (upper cased).
-    public $sPostcode;
+    private $sPostcode;
     // Optional country code the postcode belongs to (currently unused).
-    public $sCountryCode;
+    private $sCountryCode;
 
     public function __construct($iId, $sPostcode, $sCountryCode = '')
     {
@@ -21,6 +21,67 @@ class Postcode
         $this->sCountryCode = empty($sCountryCode) ? '' : $sCountryCode;
     }
 
+    public function getId()
+    {
+        return $this->iId;
+    }
+
+    /**
+     * Check if the token can be added to the given search.
+     * No searches are derived here; extendSearch() does that.
+     *
+     * @param object  $oSearch      Partial search description derived so far.
+     * @param object  $oPosition    Description of the token position within
+                                    the query.
+     *
+     * @return True if the token is compatible with the search configuration
+     *         given the position.
+     */
+    public function isExtendable($oSearch, $oPosition)
+    {
+        return !$oSearch->hasPostcode() && $oPosition->maybePhrase('postalcode');
+    }
+
+    /**
+     * Derive new searches by adding this token to an existing search.
+     *
+     * @param object  $oSearch      Partial search description derived so far.
+     * @param object  $oPosition    Description of the token position within
+                                    the query.
+     *
+     * @return SearchDescription[] List of derived search descriptions.
+     */
+    public function extendSearch($oSearch, $oPosition)
+    {
+        $aNewSearches = array();
+
+        // If we have structured search or this is the first term,
+        // make the postcode the primary search element.
+        if ($oSearch->hasOperator(\Nominatim\Operator::NONE) && $oPosition->isFirstToken()) {
+            $oNewSearch = $oSearch->clone(1);
+            $oNewSearch->setPostcodeAsName($this->iId, $this->sPostcode);
+
+            $aNewSearches[] = $oNewSearch;
+        }
+
+        // If we have a structured search or this is not the first term,
+        // add the postcode as an addendum.
+        if (!$oSearch->hasOperator(\Nominatim\Operator::POSTCODE)
+            && ($oPosition->isPhrase('postalcode') || $oSearch->hasName())
+        ) {
+            $iPenalty = 1;
+            if (strlen($this->sPostcode) < 4) {
+                $iPenalty += 4 - strlen($this->sPostcode);
+            }
+            $oNewSearch = $oSearch->clone($iPenalty);
+            $oNewSearch->setPostcode($this->sPostcode);
+
+            $aNewSearches[] = $oNewSearch;
+        }
+
+        return $aNewSearches;
+    }
+
     public function debugInfo()
     {
         return array(
@@ -29,4 +90,9 @@ class Postcode
                 'Info' => $this->sPostcode.'('.$this->sCountryCode.')'
                );
     }
+
+    public function debugCode()
+    {
+        return 'P';
+    }
 }
index b2c312ec90e53d8a52b022aeb01ab057059fbd3f..5b2d4c70a64f75de8971da78500bc2b8ad65e331 100644 (file)
@@ -10,13 +10,13 @@ require_once(CONST_LibDir.'/SpecialSearchOperator.php');
 class SpecialTerm
 {
     /// Database word id, if applicable.
-    public $iId;
+    private $iId;
     /// Class (or OSM tag key) of the place to look for.
-    public $sClass;
+    private $sClass;
     /// Type (or OSM tag value) of the place to look for.
-    public $sType;
+    private $sType;
     /// Relationship of the operator to the object (see Operator class).
-    public $iOperator;
+    private $iOperator;
 
     public function __construct($iID, $sClass, $sType, $iOperator)
     {
@@ -26,6 +26,62 @@ class SpecialTerm
         $this->iOperator = $iOperator;
     }
 
+    public function getId()
+    {
+        return $this->iId;
+    }
+
+    /**
+     * Check if the token can be added to the given search.
+     * No searches are derived here; extendSearch() does that.
+     *
+     * @param object  $oSearch      Partial search description derived so far.
+     * @param object  $oPosition    Description of the token position within
+                                    the query.
+     *
+     * @return True if the token is compatible with the search configuration
+     *         given the position.
+     */
+    public function isExtendable($oSearch, $oPosition)
+    {
+        return !$oSearch->hasOperator() && $oPosition->isPhrase('');
+    }
+
+    /**
+     * Derive new searches by adding this token to an existing search.
+     *
+     * @param object  $oSearch      Partial search description derived so far.
+     * @param object  $oPosition    Description of the token position within
+                                    the query.
+     *
+     * @return SearchDescription[] List of derived search descriptions.
+     */
+    public function extendSearch($oSearch, $oPosition)
+    {
+        $iSearchCost = 2;
+
+        $iOp = $this->iOperator;
+        if ($iOp == \Nominatim\Operator::NONE) {
+            if ($oSearch->hasName() || $oSearch->getContext()->isBoundedSearch()) {
+                $iOp = \Nominatim\Operator::NAME;
+            } else {
+                $iOp = \Nominatim\Operator::NEAR;
+            }
+            $iSearchCost += 2;
+        } elseif (!$oPosition->isFirstToken() && !$oPosition->isLastToken()) {
+            $iSearchCost += 2;
+        }
+        if ($oSearch->hasHousenumber()) {
+            $iSearchCost ++;
+        }
+
+        $oNewSearch = $oSearch->clone($iSearchCost);
+        $oNewSearch->setPoiSearch($iOp, $this->sClass, $this->sType);
+
+        return array($oNewSearch);
+    }
+
+
     public function debugInfo()
     {
         return array(
@@ -38,4 +94,9 @@ class SpecialTerm
                           )
                );
     }
+
+    public function debugCode()
+    {
+        return 'S';
+    }
 }
index fc28535d4582e459f5d88c72b8977efaf1930fa9..59456e35aaef3d309b8d2da23aca21df2d546b9f 100644 (file)
@@ -8,31 +8,95 @@ namespace Nominatim\Token;
 class Word
 {
     /// Database word id, if applicable.
-    public $iId;
-    /// If true, the word may represent only part of a place name.
-    public $bPartial;
+    private $iId;
     /// Number of appearances in the database.
-    public $iSearchNameCount;
+    private $iSearchNameCount;
     /// Number of terms in the word.
-    public $iTermCount;
+    private $iTermCount;
 
-    public function __construct($iId, $bPartial, $iSearchNameCount, $iTermCount)
+    public function __construct($iId, $iSearchNameCount, $iTermCount)
     {
         $this->iId = $iId;
-        $this->bPartial = $bPartial;
         $this->iSearchNameCount = $iSearchNameCount;
         $this->iTermCount = $iTermCount;
     }
 
+    public function getId()
+    {
+        return $this->iId;
+    }
+
+    /**
+     * Check if the token can be added to the given search.
+     * No searches are derived here; extendSearch() does that.
+     *
+     * @param object  $oSearch      Partial search description derived so far.
+     * @param object  $oPosition    Description of the token position within
+                                    the query.
+     *
+     * @return True if the token is compatible with the search configuration
+     *         given the position.
+     */
+    public function isExtendable($oSearch, $oPosition)
+    {
+        return !$oPosition->isPhrase('country');
+    }
+
+    /**
+     * Derive new searches by adding this token to an existing search.
+     *
+     * @param object  $oSearch      Partial search description derived so far.
+     * @param object  $oPosition    Description of the token position within
+                                    the query.
+     *
+     * @return SearchDescription[] List of derived search descriptions.
+     */
+    public function extendSearch($oSearch, $oPosition)
+    {
+        // Full words can only be a name if they appear at the beginning
+        // of the phrase. In structured search the name must necessarily be in
+        // the first phrase. In unstructured search it may be in a later
+        // phrase when the first phrase is a house number.
+        if ($oSearch->hasName()
+            || !($oPosition->isFirstPhrase() || $oPosition->isPhrase(''))
+        ) {
+            if ($this->iTermCount > 1
+                && ($oPosition->isPhrase('') || !$oPosition->isFirstPhrase())
+            ) {
+                $oNewSearch = $oSearch->clone(1);
+                $oNewSearch->addAddressToken($this->iId);
+
+                return array($oNewSearch);
+            }
+        } elseif (!$oSearch->hasName(true)) {
+            $oNewSearch = $oSearch->clone(1);
+            $oNewSearch->addNameToken(
+                $this->iId,
+                CONST_Search_NameOnlySearchFrequencyThreshold
+                && $this->iSearchNameCount
+                          < CONST_Search_NameOnlySearchFrequencyThreshold
+            );
+
+            return array($oNewSearch);
+        }
+
+        return array();
+    }
+
     public function debugInfo()
     {
         return array(
                 'ID' => $this->iId,
                 'Type' => 'word',
                 'Info' => array(
-                           'partial' => $this->bPartial,
-                           'count' => $this->iSearchNameCount
+                           'count' => $this->iSearchNameCount,
+                           'terms' => $this->iTermCount
                           )
                );
     }
+
+    public function debugCode()
+    {
+        return 'W';
+    }
 }
index 92dd727283019ea3454b20ee7232f0234f583b0c..2c0884c8170b46df51f64d90e67def88ac2d3b55 100644 (file)
@@ -120,14 +120,14 @@ class Tokenizer
 
             // Try more interpretations for Tokens that could not be matched.
             foreach ($aTokens as $sToken) {
-                if ($sToken[0] == ' ' && !$oValidTokens->contains($sToken)) {
-                    if (preg_match('/^ ([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) {
+                if ($sToken[0] != ' ' && !$oValidTokens->contains($sToken)) {
+                    if (preg_match('/^([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) {
                         // US ZIP+4 codes - merge in the 5-digit ZIP code
                         $oValidTokens->addToken(
                             $sToken,
                             new Token\Postcode(null, $aData[1], 'us')
                         );
-                    } elseif (preg_match('/^ [0-9]+$/', $sToken)) {
+                    } elseif (preg_match('/^[0-9]+$/', $sToken)) {
                         // Unknown single word token with a number.
                         // Assume it is a house number.
                         $oValidTokens->addToken(
@@ -195,17 +195,28 @@ class Tokenizer
                 ) {
                     $oToken = new Token\Country($iId, $aWord['country_code']);
                 }
+            } elseif ($aWord['word_token'][0] == ' ') {
+                // Full word (token starts with a space). Token\Word takes
+                // (id, search name count, term count); the former partial
+                // flag is no longer a constructor argument, so passing it
+                // would shift the count into the term-count slot.
+                $oToken = new Token\Word(
+                    $iId,
+                    (int) $aWord['count'],
+                    substr_count($aWord['word_token'], ' ')
+                );
             } else {
-                $oToken = new Token\Word(
+                $oToken = new Token\Partial(
                     $iId,
-                    $aWord['word_token'][0] != ' ',
-                    (int) $aWord['count'],
-                    substr_count($aWord['word_token'], ' ')
+                    $aWord['word_token'],
+                    (int) $aWord['count']
                 );
             }
 
             if ($oToken) {
-                $oValidTokens->addToken($aWord['word_token'], $oToken);
+                // remove any leading spaces
+                if ($aWord['word_token'][0] == ' ') {
+                    $oValidTokens->addToken(substr($aWord['word_token'], 1), $oToken);
+                } else {
+                    $oValidTokens->addToken($aWord['word_token'], $oToken);
+                }
             }
         }
     }
index 50207c31785d6c3579418fa42931f8d6a39be81c..064b41667a9322bb6cb164dd6f7bb041490d1257 100644 (file)
@@ -137,14 +137,14 @@ class Tokenizer
 
             // Try more interpretations for Tokens that could not be matched.
             foreach ($aTokens as $sToken) {
-                if ($sToken[0] == ' ' && !$oValidTokens->contains($sToken)) {
-                    if (preg_match('/^ ([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) {
+                if ($sToken[0] != ' ' && !$oValidTokens->contains($sToken)) {
+                    if (preg_match('/^([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) {
                         // US ZIP+4 codes - merge in the 5-digit ZIP code
                         $oValidTokens->addToken(
                             $sToken,
                             new Token\Postcode(null, $aData[1], 'us')
                         );
-                    } elseif (preg_match('/^ [0-9]+$/', $sToken)) {
+                    } elseif (preg_match('/^[0-9]+$/', $sToken)) {
                         // Unknown single word token with a number.
                         // Assume it is a house number.
                         $oValidTokens->addToken(
@@ -212,17 +212,29 @@ class Tokenizer
                 ) {
                     $oToken = new Token\Country($iId, $aWord['country_code']);
                 }
-            } else {
+            } elseif ($aWord['word_token'][0] == ' ') {
                 $oToken = new Token\Word(
                     $iId,
-                    $aWord['word_token'][0] != ' ',
                     (int) $aWord['count'],
                     substr_count($aWord['word_token'], ' ')
                 );
+            // For backward compatibility: ignore all partial tokens with more
+            // than one word.
+            } elseif (strpos($aWord['word_token'], ' ') === false) {
+                $oToken = new Token\Partial(
+                    $iId,
+                    $aWord['word_token'],
+                    (int) $aWord['count']
+                );
             }
 
             if ($oToken) {
-                $oValidTokens->addToken($aWord['word_token'], $oToken);
+                // remove any leading spaces
+                if ($aWord['word_token'][0] == ' ') {
+                    $oValidTokens->addToken(substr($aWord['word_token'], 1), $oToken);
+                } else {
+                    $oValidTokens->addToken($aWord['word_token'], $oToken);
+                }
             }
         }
     }