git.openstreetmap.org Git - nominatim.git/commitdiff
Merge pull request #2305 from lonvia/tokenizer
author Sarah Hoffmann <lonvia@denofr.de>
Mon, 3 May 2021 07:15:34 +0000 (09:15 +0200)
committer GitHub <noreply@github.com>
Mon, 3 May 2021 07:15:34 +0000 (09:15 +0200)
Factor out normalization into a separate module

61 files changed:
lib-php/Geocode.php
lib-php/Phrase.php
lib-php/Status.php
lib-php/TokenList.php
lib-php/admin/query.php
lib-php/admin/warm.php
lib-php/tokenizer/legacy_tokenizer.php [new file with mode: 0644]
lib-sql/functions.sql
lib-sql/functions/interpolation.sql
lib-sql/functions/normalization.sql [deleted file]
lib-sql/functions/partition-functions.sql
lib-sql/functions/placex_triggers.sql
lib-sql/functions/utils.sql
lib-sql/indices.sql
lib-sql/tables.sql
lib-sql/tokenizer/legacy_tokenizer.sql [new file with mode: 0644]
lib-sql/tokenizer/legacy_tokenizer_indices.sql [new file with mode: 0644]
lib-sql/tokenizer/legacy_tokenizer_tables.sql [new file with mode: 0644]
nominatim/clicmd/index.py
nominatim/clicmd/refresh.py
nominatim/clicmd/replication.py
nominatim/clicmd/setup.py
nominatim/clicmd/special_phrases.py
nominatim/db/async_connection.py
nominatim/db/sql_preprocessor.py
nominatim/indexer/indexer.py
nominatim/indexer/runners.py
nominatim/tokenizer/__init__.py [new file with mode: 0644]
nominatim/tokenizer/factory.py [new file with mode: 0644]
nominatim/tokenizer/legacy_tokenizer.py [new file with mode: 0644]
nominatim/tools/check_database.py
nominatim/tools/database_import.py
nominatim/tools/migration.py
nominatim/tools/postcodes.py
nominatim/tools/refresh.py
nominatim/tools/special_phrases/importer_statistics.py
nominatim/tools/special_phrases/special_phrases_importer.py
nominatim/version.py
settings/env.defaults
test/bdd/db/import/search_name.feature
test/bdd/steps/nominatim_environment.py
test/bdd/steps/steps_db_ops.py
test/php/Nominatim/PhraseTest.php
test/php/Nominatim/StatusTest.php
test/php/Nominatim/TokenListTest.php
test/php/Nominatim/tokenizer.php [new file with mode: 0644]
test/python/conftest.py
test/python/dummy_tokenizer.py [new file with mode: 0644]
test/python/test_cli.py
test/python/test_cli_replication.py
test/python/test_db_sql_preprocessor.py
test/python/test_indexing.py
test/python/test_tokenizer_factory.py [new file with mode: 0644]
test/python/test_tokenizer_legacy.py [new file with mode: 0644]
test/python/test_tools_check_database.py
test/python/test_tools_database_import.py
test/python/test_tools_import_special_phrases.py
test/python/test_tools_postcodes.py
test/python/test_tools_refresh_create_functions.py
test/python/test_tools_refresh_setup_website.py
test/testdb/specialphrases_testdb.sql

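Taken together, these changes replace the scattered make_standard_name() calls and direct word-table lookups with a single tokenizer object. A minimal sketch of the new calling convention, pieced together from the diffs below (the query string is invented; $oDB is assumed to be a connected Nominatim\DB instance as in Geocode.php):

    require_once(CONST_TokenizerDir.'/tokenizer.php');

    $oTokenizer = new \Nominatim\Tokenizer($oDB);
    $oTokenizer->setCountryRestriction(array('de'));

    // One Phrase per comma-separated part of the query.
    $aPhrases = array();
    foreach (explode(',', 'hauptstrasse 5, berlin') as $sPhrase) {
        $aPhrases[] = new \Nominatim\Phrase($sPhrase, '');
    }

    // Normalizes the phrases, looks the words up in the word table and
    // attaches the possible word segmentations to each phrase.
    $oValidTokens = $oTokenizer->extractTokensFromPhrases($aPhrases);
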
diff --git a/lib-php/Geocode.php b/lib-php/Geocode.php
index 6cec6a85e088ee40c69917eb152e54bf571fc1a7..53ee49c043f53d80434887978434998987f9c830 100644 (file)
@@ -8,12 +8,14 @@ require_once(CONST_LibDir.'/ReverseGeocode.php');
 require_once(CONST_LibDir.'/SearchDescription.php');
 require_once(CONST_LibDir.'/SearchContext.php');
 require_once(CONST_LibDir.'/TokenList.php');
+require_once(CONST_TokenizerDir.'/tokenizer.php');
 
 class Geocode
 {
     protected $oDB;
 
     protected $oPlaceLookup;
+    protected $oTokenizer;
 
     protected $aLangPrefOrder = array();
 
@@ -41,23 +43,12 @@ class Geocode
     protected $sQuery = false;
     protected $aStructuredQuery = false;
 
-    protected $oNormalizer = null;
-
 
     public function __construct(&$oDB)
     {
         $this->oDB =& $oDB;
         $this->oPlaceLookup = new PlaceLookup($this->oDB);
-        $this->oNormalizer = \Transliterator::createFromRules(CONST_Term_Normalization_Rules);
-    }
-
-    private function normTerm($sTerm)
-    {
-        if ($this->oNormalizer === null) {
-            return $sTerm;
-        }
-
-        return $this->oNormalizer->transliterate($sTerm);
+        $this->oTokenizer = new \Nominatim\Tokenizer($this->oDB);
     }
 
     public function setLanguagePreference($aLangPref)
@@ -510,12 +501,10 @@ class Geocode
         if ($this->aCountryCodes) {
             $oCtx->setCountryList($this->aCountryCodes);
         }
+        $this->oTokenizer->setCountryRestriction($this->aCountryCodes);
 
         Debug::newSection('Query Preprocessing');
 
-        $sNormQuery = $this->normTerm($this->sQuery);
-        Debug::printVar('Normalized query', $sNormQuery);
-
         $sLanguagePrefArraySQL = $this->oDB->getArraySQL(
             $this->oDB->getDBQuotedList($this->aLangPrefOrder)
         );
@@ -569,108 +558,55 @@ class Geocode
             }
 
             if ($sSpecialTerm && !$aSearches[0]->hasOperator()) {
-                $sSpecialTerm = pg_escape_string($sSpecialTerm);
-                $sToken = $this->oDB->getOne(
-                    'SELECT make_standard_name(:term)',
-                    array(':term' => $sSpecialTerm),
-                    'Cannot decode query. Wrong encoding?'
-                );
-                $sSQL = 'SELECT class, type FROM word ';
-                $sSQL .= '   WHERE word_token in (\' '.$sToken.'\')';
-                $sSQL .= '   AND class is not null AND class not in (\'place\')';
-
-                Debug::printSQL($sSQL);
-                $aSearchWords = $this->oDB->getAll($sSQL);
-                $aNewSearches = array();
-                foreach ($aSearches as $oSearch) {
-                    foreach ($aSearchWords as $aSearchTerm) {
-                        $oNewSearch = clone $oSearch;
-                        $oNewSearch->setPoiSearch(
-                            Operator::TYPE,
-                            $aSearchTerm['class'],
-                            $aSearchTerm['type']
-                        );
-                        $aNewSearches[] = $oNewSearch;
+                $aTokens = $this->oTokenizer->tokensForSpecialTerm($sSpecialTerm);
+
+                if (!empty($aTokens)) {
+                    $aNewSearches = array();
+                    foreach ($aSearches as $oSearch) {
+                        foreach ($aTokens as $oToken) {
+                            $oNewSearch = clone $oSearch;
+                            $oNewSearch->setPoiSearch(
+                                $oToken->iOperator,
+                                $oToken->sClass,
+                                $oToken->sType
+                            );
+                            $aNewSearches[] = $oNewSearch;
+                        }
                     }
+                    $aSearches = $aNewSearches;
                 }
-                $aSearches = $aNewSearches;
             }
 
             // Split query into phrases
             // Commas are used to reduce the search space by indicating where phrases split
+            $aPhrases = array();
             if ($this->aStructuredQuery) {
-                $aInPhrases = $this->aStructuredQuery;
+                foreach ($this->aStructuredQuery as $iPhrase => $sPhrase) {
+                    $aPhrases[] = new Phrase($sPhrase, $iPhrase);
+                }
             } else {
-                $aInPhrases = explode(',', $sQuery);
+                foreach (explode(',', $sQuery) as $sPhrase) {
+                    $aPhrases[] = new Phrase($sPhrase, '');
+                }
             }
 
             Debug::printDebugArray('Search context', $oCtx);
             Debug::printDebugArray('Base search', empty($aSearches) ? null : $aSearches[0]);
-            Debug::printVar('Final query phrases', $aInPhrases);
 
-            // Convert each phrase to standard form
-            // Create a list of standard words
-            // Get all 'sets' of words
-            // Generate a complete list of all
             Debug::newSection('Tokenization');
-            $aTokens = array();
-            $aPhrases = array();
-            foreach ($aInPhrases as $iPhrase => $sPhrase) {
-                $sPhrase = $this->oDB->getOne(
-                    'SELECT make_standard_name(:phrase)',
-                    array(':phrase' => $sPhrase),
-                    'Cannot normalize query string (is it a UTF-8 string?)'
-                );
-                if (trim($sPhrase)) {
-                    $oPhrase = new Phrase($sPhrase, is_string($iPhrase) ? $iPhrase : '');
-                    $oPhrase->addTokens($aTokens);
-                    $aPhrases[] = $oPhrase;
-                }
-            }
-
-            Debug::printVar('Tokens', $aTokens);
-
-            $oValidTokens = new TokenList();
-
-            if (!empty($aTokens)) {
-                $oValidTokens->addTokensFromDB(
-                    $this->oDB,
-                    $aTokens,
-                    $this->aCountryCodes,
-                    $sNormQuery,
-                    $this->oNormalizer
-                );
+            $oValidTokens = $this->oTokenizer->extractTokensFromPhrases($aPhrases);
 
+            if ($oValidTokens->count() > 0) {
                 $oCtx->setFullNameWords($oValidTokens->getFullWordIDs());
 
-                // Try more interpretations for Tokens that could not be matched.
-                foreach ($aTokens as $sToken) {
-                    if ($sToken[0] == ' ' && !$oValidTokens->contains($sToken)) {
-                        if (preg_match('/^ ([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) {
-                            // US ZIP+4 codes - merge in the 5-digit ZIP code
-                            $oValidTokens->addToken(
-                                $sToken,
-                                new Token\Postcode(null, $aData[1], 'us')
-                            );
-                        } elseif (preg_match('/^ [0-9]+$/', $sToken)) {
-                            // Unknown single word token with a number.
-                            // Assume it is a house number.
-                            $oValidTokens->addToken(
-                                $sToken,
-                                new Token\HouseNumber(null, trim($sToken))
-                            );
-                        }
-                    }
-                }
+                $aPhrases = array_filter($aPhrases, function ($oPhrase) {
+                    return $oPhrase->getWordSets() !== null;
+                });
 
                 // Any words that have failed completely?
                 // TODO: suggestions
 
                 Debug::printGroupTable('Valid Tokens', $oValidTokens->debugInfo());
-
-                foreach ($aPhrases as $oPhrase) {
-                    $oPhrase->computeWordSets($oValidTokens);
-                }
                 Debug::printDebugTable('Phrases', $aPhrases);
 
                 Debug::newSection('Search candidates');
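The tokens returned by tokensForSpecialTerm() expose their data as public fields (iOperator, sClass, sType) that feed straight into setPoiSearch() above. A hedged sketch of one such token for a special term that resolved to an amenity (the word id and values are invented):

    // Hypothetical element of $this->oTokenizer->tokensForSpecialTerm('restaurant'):
    $oToken = new \Nominatim\Token\SpecialTerm(
        1234,                      // word_id from the word table (invented)
        'amenity',                 // sClass
        'restaurant',              // sType
        \Nominatim\Operator::TYPE  // iOperator
    );
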
diff --git a/lib-php/Phrase.php b/lib-php/Phrase.php
index e2643e878ed40a3bb22533600603647418e98011..d14c842df809158aa9644e0badc90244b84f0508 100644 (file)
@@ -16,8 +16,6 @@ class Phrase
     private $sPhrase;
     // Element type for structured searches.
     private $sPhraseType;
-    // Space-separated words of the phrase.
-    private $aWords;
     // Possible segmentations of the phrase.
     private $aWordSets;
 
@@ -38,7 +36,14 @@ class Phrase
     {
         $this->sPhrase = trim($sPhrase);
         $this->sPhraseType = $sPhraseType;
-        $this->aWords = explode(' ', $this->sPhrase);
+    }
+
+    /**
+     * Get the original phrase of the string.
+     */
+    public function getPhrase()
+    {
+        return $this->sPhrase;
     }
 
     /**
@@ -63,30 +68,6 @@ class Phrase
         return $this->aWordSets;
     }
 
-    /**
-     * Add the tokens from this phrase to the given list of tokens.
-     *
-     * @param string[] $aTokens List of tokens to append.
-     *
-     * @return void
-     */
-    public function addTokens(&$aTokens)
-    {
-        $iNumWords = count($this->aWords);
-
-        for ($i = 0; $i < $iNumWords; $i++) {
-            $sPhrase = $this->aWords[$i];
-            $aTokens[' '.$sPhrase] = ' '.$sPhrase;
-            $aTokens[$sPhrase] = $sPhrase;
-
-            for ($j = $i + 1; $j < $iNumWords; $j++) {
-                $sPhrase .= ' '.$this->aWords[$j];
-                $aTokens[' '.$sPhrase] = ' '.$sPhrase;
-                $aTokens[$sPhrase] = $sPhrase;
-            }
-        }
-    }
-
     /**
      * Invert the set of possible segmentations.
      *
@@ -99,21 +80,27 @@ class Phrase
         }
     }
 
-    public function computeWordSets($oTokens)
+    public function computeWordSets($aWords, $oTokens)
     {
-        $iNumWords = count($this->aWords);
+        $iNumWords = count($aWords);
+
+        if ($iNumWords == 0) {
+            $this->aWordSets = null;
+            return;
+        }
+
         // Caches the word set for the partial phrase up to word i.
         $aSetCache = array_fill(0, $iNumWords, array());
 
         // Initialise first element of cache. There can only be the word.
-        if ($oTokens->containsAny($this->aWords[0])) {
-            $aSetCache[0][] = array($this->aWords[0]);
+        if ($oTokens->containsAny($aWords[0])) {
+            $aSetCache[0][] = array($aWords[0]);
         }
 
         // Now do the next elements using what we already have.
         for ($i = 1; $i < $iNumWords; $i++) {
             for ($j = $i; $j > 0; $j--) {
-                $sPartial = $j == $i ? $this->aWords[$j] : $this->aWords[$j].' '.$sPartial;
+                $sPartial = $j == $i ? $aWords[$j] : $aWords[$j].' '.$sPartial;
                 if (!empty($aSetCache[$j - 1]) && $oTokens->containsAny($sPartial)) {
                     $aPartial = array($sPartial);
                     foreach ($aSetCache[$j - 1] as $aSet) {
@@ -136,7 +123,7 @@ class Phrase
             }
 
             // finally the current full phrase
-            $sPartial = $this->aWords[0].' '.$sPartial;
+            $sPartial = $aWords[0].' '.$sPartial;
             if ($oTokens->containsAny($sPartial)) {
                 $aSetCache[$i][] = array($sPartial);
             }
@@ -153,7 +140,6 @@ class Phrase
         return array(
                 'Type' => $this->sPhraseType,
                 'Phrase' => $this->sPhrase,
-                'Words' => $this->aWords,
                 'WordSets' => $this->aWordSets
                );
     }
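computeWordSets() now receives the word list from the tokenizer instead of splitting the phrase itself, and leaves the word sets as null for empty phrases so that Geocode can filter those out. To illustrate the segmentations it computes, a sketch assuming $oValidTokens came from extractTokensFromPhrases() and recognizes 'new', 'york' and 'new york':

    $oPhrase = new \Nominatim\Phrase('new york', '');
    $oPhrase->computeWordSets(array('new', 'york'), $oValidTokens);

    // getWordSets() now holds every segmentation whose parts are all
    // known tokens (the ordering produced by the cache may differ):
    //   array(array('new york'), array('new', 'york'))
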
diff --git a/lib-php/Status.php b/lib-php/Status.php
index 2d9e78db42606f59a90cb2d6e18ad4d8774a6a2e..52abfcd04916d038f0e95fcfeec9039acb88e3d1 100644 (file)
@@ -2,6 +2,8 @@
 
 namespace Nominatim;
 
+require_once(CONST_TokenizerDir.'/tokenizer.php');
+
 use Exception;
 
 class Status
@@ -25,24 +27,8 @@ class Status
             throw new Exception('Database connection failed', 700);
         }
 
-        $sStandardWord = $this->oDB->getOne("SELECT make_standard_name('a')");
-        if ($sStandardWord === false) {
-            throw new Exception('Module failed', 701);
-        }
-
-        if ($sStandardWord != 'a') {
-            throw new Exception('Module call failed', 702);
-        }
-
-        $sSQL = 'SELECT word_id, word_token, word, class, type, country_code, ';
-        $sSQL .= "operator, search_name_count FROM word WHERE word_token IN (' a')";
-        $iWordID = $this->oDB->getOne($sSQL);
-        if ($iWordID === false) {
-            throw new Exception('Query failed', 703);
-        }
-        if (!$iWordID) {
-            throw new Exception('No value', 704);
-        }
+        $oTokenizer = new \Nominatim\Tokenizer($this->oDB);
+        $oTokenizer->checkStatus();
     }
 
     public function dataDate()
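The status check is delegated as well: Tokenizer::checkStatus() raises the same coded exceptions that the removed inline checks used (701/702 for module failures, 703/704 for a missing or empty word table), so existing handlers keep working. A sketch of a caller, assuming the surrounding method is Status::status() as elsewhere in the codebase:

    try {
        $oStatus = new \Nominatim\Status($oDB);
        $oStatus->status();   // runs the tokenizer's checkStatus() internally
    } catch (\Exception $e) {
        echo 'status error '.$e->getCode().': '.$e->getMessage()."\n";
    }
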
diff --git a/lib-php/TokenList.php b/lib-php/TokenList.php
index a419da6a9ad3429bc9087070038d491392ba1785..2df9fe0586710f120c821b09f809f286cd616f44 100644 (file)
@@ -95,88 +95,6 @@ class TokenList
         return $ids;
     }
 
-    /**
-     * Add token information from the word table in the database.
-     *
-     * @param object   $oDB           Nominatim::DB instance.
-     * @param string[] $aTokens       List of tokens to look up in the database.
-     * @param string[] $aCountryCodes List of country restrictions.
-     * @param string   $sNormQuery    Normalized query string.
-     * @param object   $oNormalizer   Normalizer function to use on tokens.
-     *
-     * @return void
-     */
-    public function addTokensFromDB(&$oDB, &$aTokens, &$aCountryCodes, $sNormQuery, $oNormalizer)
-    {
-        // Check which tokens we have, get the ID numbers
-        $sSQL = 'SELECT word_id, word_token, word, class, type, country_code,';
-        $sSQL .= ' operator, coalesce(search_name_count, 0) as count';
-        $sSQL .= ' FROM word WHERE word_token in (';
-        $sSQL .= join(',', $oDB->getDBQuotedList($aTokens)).')';
-
-        Debug::printSQL($sSQL);
-
-        $aDBWords = $oDB->getAll($sSQL, null, 'Could not get word tokens.');
-
-        foreach ($aDBWords as $aWord) {
-            $oToken = null;
-            $iId = (int) $aWord['word_id'];
-
-            if ($aWord['class']) {
-                // Special terms need to appear in their normalized form.
-                if ($aWord['word']) {
-                    $sNormWord = $aWord['word'];
-                    if ($oNormalizer != null) {
-                        $sNormWord = $oNormalizer->transliterate($aWord['word']);
-                    }
-                    if (strpos($sNormQuery, $sNormWord) === false) {
-                        continue;
-                    }
-                }
-
-                if ($aWord['class'] == 'place' && $aWord['type'] == 'house') {
-                    $oToken = new Token\HouseNumber($iId, trim($aWord['word_token']));
-                } elseif ($aWord['class'] == 'place' && $aWord['type'] == 'postcode') {
-                    if ($aWord['word']
-                        && pg_escape_string($aWord['word']) == $aWord['word']
-                    ) {
-                        $oToken = new Token\Postcode(
-                            $iId,
-                            $aWord['word'],
-                            $aWord['country_code']
-                        );
-                    }
-                } else {
-                    // near and in operator the same at the moment
-                    $oToken = new Token\SpecialTerm(
-                        $iId,
-                        $aWord['class'],
-                        $aWord['type'],
-                        $aWord['operator'] ? Operator::NEAR : Operator::NONE
-                    );
-                }
-            } elseif ($aWord['country_code']) {
-                // Filter country tokens that do not match restricted countries.
-                if (!$aCountryCodes
-                    || in_array($aWord['country_code'], $aCountryCodes)
-                ) {
-                    $oToken = new Token\Country($iId, $aWord['country_code']);
-                }
-            } else {
-                $oToken = new Token\Word(
-                    $iId,
-                    $aWord['word_token'][0] != ' ',
-                    (int) $aWord['count'],
-                    substr_count($aWord['word_token'], ' ')
-                );
-            }
-
-            if ($oToken) {
-                $this->addToken($aWord['word_token'], $oToken);
-            }
-        }
-    }
-
     /**
      * Add a new token for the given word.
      *
diff --git a/lib-php/admin/query.php b/lib-php/admin/query.php
index 268b87cc0e0bbe1e2e895cf655736e5cb53c80d9..21121fbd316ac2269c137c8786361ac8da5735d1 100644 (file)
@@ -2,7 +2,6 @@
 @define('CONST_LibDir', dirname(dirname(__FILE__)));
 
 require_once(CONST_LibDir.'/init-cmd.php');
-require_once(CONST_LibDir.'/Geocode.php');
 require_once(CONST_LibDir.'/ParameterParser.php');
 ini_set('memory_limit', '800M');
 
@@ -41,16 +40,16 @@ loadSettings($aCMDResult['project-dir'] ?? getcwd());
 @define('CONST_Default_Language', getSetting('DEFAULT_LANGUAGE', false));
 @define('CONST_Log_DB', getSettingBool('LOG_DB'));
 @define('CONST_Log_File', getSetting('LOG_FILE', false));
-@define('CONST_Max_Word_Frequency', getSetting('MAX_WORD_FREQUENCY'));
 @define('CONST_NoAccessControl', getSettingBool('CORS_NOACCESSCONTROL'));
 @define('CONST_Places_Max_ID_count', getSetting('LOOKUP_MAX_COUNT'));
 @define('CONST_PolygonOutput_MaximumTypes', getSetting('POLYGON_OUTPUT_MAX_TYPES'));
 @define('CONST_Search_BatchMode', getSettingBool('SEARCH_BATCH_MODE'));
 @define('CONST_Search_NameOnlySearchFrequencyThreshold', getSetting('SEARCH_NAME_ONLY_THRESHOLD'));
-@define('CONST_Term_Normalization_Rules', getSetting('TERM_NORMALIZATION'));
 @define('CONST_Use_US_Tiger_Data', getSettingBool('USE_US_TIGER_DATA'));
 @define('CONST_MapIcon_URL', getSetting('MAPICON_URL', false));
+@define('CONST_TokenizerDir', CONST_InstallDir.'/tokenizer');
 
+require_once(CONST_LibDir.'/Geocode.php');
 
 $oDB = new Nominatim\DB;
 $oDB->connect();
diff --git a/lib-php/admin/warm.php b/lib-php/admin/warm.php
index d7950af9b1912b91d70c2f29b691c06a328fa15c..d6aa3d9b0d5f0978045b037427520a059a936c3b 100644 (file)
@@ -3,7 +3,6 @@
 
 require_once(CONST_LibDir.'/init-cmd.php');
 require_once(CONST_LibDir.'/log.php');
-require_once(CONST_LibDir.'/Geocode.php');
 require_once(CONST_LibDir.'/PlaceLookup.php');
 require_once(CONST_LibDir.'/ReverseGeocode.php');
 
@@ -26,16 +25,16 @@ loadSettings($aCMDResult['project-dir'] ?? getcwd());
 @define('CONST_Default_Language', getSetting('DEFAULT_LANGUAGE', false));
 @define('CONST_Log_DB', getSettingBool('LOG_DB'));
 @define('CONST_Log_File', getSetting('LOG_FILE', false));
-@define('CONST_Max_Word_Frequency', getSetting('MAX_WORD_FREQUENCY'));
 @define('CONST_NoAccessControl', getSettingBool('CORS_NOACCESSCONTROL'));
 @define('CONST_Places_Max_ID_count', getSetting('LOOKUP_MAX_COUNT'));
 @define('CONST_PolygonOutput_MaximumTypes', getSetting('POLYGON_OUTPUT_MAX_TYPES'));
 @define('CONST_Search_BatchMode', getSettingBool('SEARCH_BATCH_MODE'));
 @define('CONST_Search_NameOnlySearchFrequencyThreshold', getSetting('SEARCH_NAME_ONLY_THRESHOLD'));
-@define('CONST_Term_Normalization_Rules', getSetting('TERM_NORMALIZATION'));
 @define('CONST_Use_US_Tiger_Data', getSettingBool('USE_US_TIGER_DATA'));
 @define('CONST_MapIcon_URL', getSetting('MAPICON_URL', false));
+@define('CONST_TokenizerDir', CONST_InstallDir.'/tokenizer');
 
+require_once(CONST_LibDir.'/Geocode.php');
 
 $oDB = new Nominatim\DB();
 $oDB->connect();
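In both admin scripts the require of Geocode.php moves below the settings block because Geocode.php now pulls in the tokenizer via CONST_TokenizerDir at load time; the constant has to exist before the require runs. The ordering constraint in isolation:

    // The define must precede the require: Geocode.php starts with
    //   require_once(CONST_TokenizerDir.'/tokenizer.php');
    // On PHP 7 an undefined constant there degrades to the literal string
    // 'CONST_TokenizerDir' with a warning; on PHP 8 it is a fatal Error.
    @define('CONST_TokenizerDir', CONST_InstallDir.'/tokenizer');
    require_once(CONST_LibDir.'/Geocode.php');
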
diff --git a/lib-php/tokenizer/legacy_tokenizer.php b/lib-php/tokenizer/legacy_tokenizer.php
new file mode 100644 (file)
index 0000000..0fb37fd
--- /dev/null
@@ -0,0 +1,254 @@
+<?php
+
+namespace Nominatim;
+
+class Tokenizer
+{
+    private $oDB;
+
+    private $oNormalizer = null;
+    private $aCountryRestriction = null;
+
+    public function __construct(&$oDB)
+    {
+        $this->oDB =& $oDB;
+        $this->oNormalizer = \Transliterator::createFromRules(CONST_Term_Normalization_Rules);
+    }
+
+    public function checkStatus()
+    {
+        $sStandardWord = $this->oDB->getOne("SELECT make_standard_name('a')");
+        if ($sStandardWord === false) {
+            throw new Exception('Module failed', 701);
+        }
+
+        if ($sStandardWord != 'a') {
+            throw new Exception('Module call failed', 702);
+        }
+
+        $sSQL = "SELECT word_id FROM word WHERE word_token IN (' a')";
+        $iWordID = $this->oDB->getOne($sSQL);
+        if ($iWordID === false) {
+            throw new Exception('Query failed', 703);
+        }
+        if (!$iWordID) {
+            throw new Exception('No value', 704);
+        }
+    }
+
+
+    public function setCountryRestriction($aCountries)
+    {
+        $this->aCountryRestriction = $aCountries;
+    }
+
+
+    public function normalizeString($sTerm)
+    {
+        if ($this->oNormalizer === null) {
+            return $sTerm;
+        }
+
+        return $this->oNormalizer->transliterate($sTerm);
+    }
+
+
+    public function tokensForSpecialTerm($sTerm)
+    {
+        $aResults = array();
+
+        $sSQL = 'SELECT word_id, class, type FROM word ';
+        $sSQL .= '   WHERE word_token = \' \' || make_standard_name(:term)';
+        $sSQL .= '   AND class is not null AND class not in (\'place\')';
+
+        Debug::printVar('Term', $sTerm);
+        Debug::printSQL($sSQL);
+        $aSearchWords = $this->oDB->getAll($sSQL, array(':term' => $sTerm));
+
+        Debug::printVar('Results', $aSearchWords);
+
+        foreach ($aSearchWords as $aSearchTerm) {
+            $aResults[] = new \Nominatim\Token\SpecialTerm(
+                $aSearchTerm['word_id'],
+                $aSearchTerm['class'],
+                $aSearchTerm['type'],
+                \Nominatim\Operator::TYPE
+            );
+        }
+
+        Debug::printVar('Special term tokens', $aResults);
+
+        return $aResults;
+    }
+
+
+    public function extractTokensFromPhrases(&$aPhrases)
+    {
+        // First get the normalized version of all phrases
+        $sNormQuery = '';
+        $sSQL = 'SELECT ';
+        $aParams = array();
+        foreach ($aPhrases as $iPhrase => $oPhrase) {
+            $sNormQuery .= ','.$this->normalizeString($oPhrase->getPhrase());
+            $sSQL .= 'make_standard_name(:' .$iPhrase.') as p'.$iPhrase.',';
+            $aParams[':'.$iPhrase] = $oPhrase->getPhrase();
+        }
+        $sSQL = substr($sSQL, 0, -1);
+
+        Debug::printSQL($sSQL);
+        Debug::printVar('SQL parameters', $aParams);
+
+        $aNormPhrases = $this->oDB->getRow($sSQL, $aParams);
+
+        Debug::printVar('SQL result', $aNormPhrases);
+
+        // now compute all possible tokens
+        $aWordLists = array();
+        $aTokens = array();
+        foreach ($aNormPhrases as $sTitle => $sPhrase) {
+            if (strlen($sPhrase) > 0) {
+                $aWords = explode(' ', $sPhrase);
+                Tokenizer::addTokens($aTokens, $aWords);
+                $aWordLists[] = $aWords;
+            } else {
+                $aWordLists[] = array();
+            }
+        }
+
+        Debug::printVar('Tokens', $aTokens);
+        Debug::printVar('WordLists', $aWordLists);
+
+        $oValidTokens = $this->computeValidTokens($aTokens, $sNormQuery);
+
+        foreach ($aPhrases as $iPhrase => $oPhrase) {
+            $oPhrase->computeWordSets($aWordLists[$iPhrase], $oValidTokens);
+        }
+
+        return $oValidTokens;
+    }
+
+
+    private function computeValidTokens($aTokens, $sNormQuery)
+    {
+        $oValidTokens = new TokenList();
+
+        if (!empty($aTokens)) {
+            $this->addTokensFromDB($oValidTokens, $aTokens, $sNormQuery);
+
+            // Try more interpretations for Tokens that could not be matched.
+            foreach ($aTokens as $sToken) {
+                if ($sToken[0] == ' ' && !$oValidTokens->contains($sToken)) {
+                    if (preg_match('/^ ([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) {
+                        // US ZIP+4 codes - merge in the 5-digit ZIP code
+                        $oValidTokens->addToken(
+                            $sToken,
+                            new Token\Postcode(null, $aData[1], 'us')
+                        );
+                    } elseif (preg_match('/^ [0-9]+$/', $sToken)) {
+                        // Unknown single word token with a number.
+                        // Assume it is a house number.
+                        $oValidTokens->addToken(
+                            $sToken,
+                            new Token\HouseNumber(null, trim($sToken))
+                        );
+                    }
+                }
+            }
+        }
+
+        return $oValidTokens;
+    }
+
+
+    private function addTokensFromDB(&$oValidTokens, $aTokens, $sNormQuery)
+    {
+        // Check which tokens we have, get the ID numbers
+        $sSQL = 'SELECT word_id, word_token, word, class, type, country_code,';
+        $sSQL .= ' operator, coalesce(search_name_count, 0) as count';
+        $sSQL .= ' FROM word WHERE word_token in (';
+        $sSQL .= join(',', $this->oDB->getDBQuotedList($aTokens)).')';
+
+        Debug::printSQL($sSQL);
+
+        $aDBWords = $this->oDB->getAll($sSQL, null, 'Could not get word tokens.');
+
+        foreach ($aDBWords as $aWord) {
+            $oToken = null;
+            $iId = (int) $aWord['word_id'];
+
+            if ($aWord['class']) {
+                // Special terms need to appear in their normalized form.
+                // (postcodes are not normalized in the word table)
+                $sNormWord = $this->normalizeString($aWord['word']);
+                if ($aWord['word'] && strpos($sNormQuery, $sNormWord) === false) {
+                    continue;
+                }
+
+                if ($aWord['class'] == 'place' && $aWord['type'] == 'house') {
+                    $oToken = new Token\HouseNumber($iId, trim($aWord['word_token']));
+                } elseif ($aWord['class'] == 'place' && $aWord['type'] == 'postcode') {
+                    if ($aWord['word']
+                        && pg_escape_string($aWord['word']) == $aWord['word']
+                    ) {
+                        $oToken = new Token\Postcode(
+                            $iId,
+                            $aWord['word'],
+                            $aWord['country_code']
+                        );
+                    }
+                } else {
+                    // near and in operator the same at the moment
+                    $oToken = new Token\SpecialTerm(
+                        $iId,
+                        $aWord['class'],
+                        $aWord['type'],
+                        $aWord['operator'] ? Operator::NEAR : Operator::NONE
+                    );
+                }
+            } elseif ($aWord['country_code']) {
+                // Filter country tokens that do not match restricted countries.
+                if (!$this->aCountryRestriction
+                    || in_array($aWord['country_code'], $this->aCountryRestriction)
+                ) {
+                    $oToken = new Token\Country($iId, $aWord['country_code']);
+                }
+            } else {
+                $oToken = new Token\Word(
+                    $iId,
+                    $aWord['word_token'][0] != ' ',
+                    (int) $aWord['count'],
+                    substr_count($aWord['word_token'], ' ')
+                );
+            }
+
+            if ($oToken) {
+                $oValidTokens->addToken($aWord['word_token'], $oToken);
+            }
+        }
+    }
+
+
+    /**
+     * Add the tokens from this phrase to the given list of tokens.
+     *
+     * @param string[] $aTokens List of tokens to append.
+     *
+     * @return void
+     */
+    private static function addTokens(&$aTokens, $aWords)
+    {
+        $iNumWords = count($aWords);
+
+        for ($i = 0; $i < $iNumWords; $i++) {
+            $sPhrase = $aWords[$i];
+            $aTokens[' '.$sPhrase] = ' '.$sPhrase;
+            $aTokens[$sPhrase] = $sPhrase;
+
+            for ($j = $i + 1; $j < $iNumWords; $j++) {
+                $sPhrase .= ' '.$aWords[$j];
+                $aTokens[' '.$sPhrase] = ' '.$sPhrase;
+                $aTokens[$sPhrase] = $sPhrase;
+            }
+        }
+    }
+}
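The private addTokens() helper enumerates every contiguous word combination of a phrase in two spellings: with a leading blank, which marks complete words in the word table, and without, for partial words. A standalone replica of its loop (not part of the commit) showing the generated lookup keys:

    $aWords = array('main', 'st');
    $aTokens = array();
    for ($i = 0; $i < count($aWords); $i++) {
        $sPhrase = $aWords[$i];
        $aTokens[' '.$sPhrase] = ' '.$sPhrase;   // full-word variant
        $aTokens[$sPhrase] = $sPhrase;           // partial-word variant
        for ($j = $i + 1; $j < count($aWords); $j++) {
            $sPhrase .= ' '.$aWords[$j];
            $aTokens[' '.$sPhrase] = ' '.$sPhrase;
            $aTokens[$sPhrase] = $sPhrase;
        }
    }
    // $aTokens keys: ' main', 'main', ' main st', 'main st', ' st', 'st'
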
diff --git a/lib-sql/functions.sql b/lib-sql/functions.sql
index 750af9f0f5a9efa79e208f18b6295f92a3b248b9..e9419ca2ea92947a2a3f5fe34379092022a8dc51 100644 (file)
@@ -1,5 +1,4 @@
 {% include('functions/utils.sql') %}
-{% include('functions/normalization.sql') %}
 {% include('functions/ranking.sql') %}
 {% include('functions/importance.sql') %}
 {% include('functions/address_lookup.sql') %}
diff --git a/lib-sql/functions/interpolation.sql b/lib-sql/functions/interpolation.sql
index a797cad3ac1de74b3500eb53d90bbd2111b826f1..55e44dfd646e05f497658ba401207b0fd2194b67 100644 (file)
@@ -12,39 +12,47 @@ $$
 LANGUAGE plpgsql IMMUTABLE;
 
 
+CREATE OR REPLACE FUNCTION get_interpolation_address(in_address HSTORE, wayid BIGINT)
+RETURNS HSTORE
+  AS $$
+DECLARE
+  location RECORD;
+  waynodes BIGINT[];
+BEGIN
+  IF akeys(in_address) != ARRAY['interpolation'] THEN
+    RETURN in_address;
+  END IF;
+
+  SELECT nodes INTO waynodes FROM planet_osm_ways WHERE id = wayid;
+  FOR location IN
+    SELECT placex.address, placex.osm_id FROM placex
+     WHERE osm_type = 'N' and osm_id = ANY(waynodes)
+           and placex.address is not null
+           and (placex.address ? 'street' or placex.address ? 'place')
+           and indexed_status < 100
+  LOOP
+    -- mark it as a derived address
+    RETURN location.address || in_address || hstore('_inherited', '');
+  END LOOP;
+
+  RETURN in_address;
+END;
+$$
+LANGUAGE plpgsql STABLE;
+
+
+
 -- find the parent road of the cut road parts
-CREATE OR REPLACE FUNCTION get_interpolation_parent(wayid BIGINT, street TEXT,
-                                                    place TEXT, partition SMALLINT,
+CREATE OR REPLACE FUNCTION get_interpolation_parent(street INTEGER[], place INTEGER[],
+                                                    partition SMALLINT,
                                                     centroid GEOMETRY, geom GEOMETRY)
   RETURNS BIGINT
   AS $$
 DECLARE
-  addr_street TEXT;
-  addr_place TEXT;
   parent_place_id BIGINT;
-
-  waynodes BIGINT[];
-
   location RECORD;
 BEGIN
-  addr_street = street;
-  addr_place = place;
-
-  IF addr_street is null and addr_place is null THEN
-    select nodes from planet_osm_ways where id = wayid INTO waynodes;
-    FOR location IN SELECT placex.address from placex
-                    where osm_type = 'N' and osm_id = ANY(waynodes)
-                          and placex.address is not null
-                          and (placex.address ? 'street' or placex.address ? 'place')
-                          and indexed_status < 100
-                    limit 1 LOOP
-      addr_street = location.address->'street';
-      addr_place = location.address->'place';
-    END LOOP;
-  END IF;
-
-  parent_place_id := find_parent_for_address(addr_street, addr_place,
-                                             partition, centroid);
+  parent_place_id := find_parent_for_address(street, place, partition, centroid);
 
   IF parent_place_id is null THEN
     FOR location IN SELECT place_id FROM placex
@@ -147,15 +155,15 @@ BEGIN
   NEW.interpolationtype = NEW.address->'interpolation';
 
   place_centroid := ST_PointOnSurface(NEW.linegeo);
-  NEW.parent_place_id = get_interpolation_parent(NEW.osm_id, NEW.address->'street',
-                                                 NEW.address->'place',
+  NEW.parent_place_id = get_interpolation_parent(token_addr_street_match_tokens(NEW.token_info),
+                                                 token_addr_place_match_tokens(NEW.token_info),
                                                  NEW.partition, place_centroid, NEW.linegeo);
 
-  IF NEW.address is not NULL AND NEW.address ? 'postcode' AND NEW.address->'postcode' not similar to '%(,|;)%' THEN
-    interpol_postcode := NEW.address->'postcode';
-    housenum := getorcreate_postcode_id(NEW.address->'postcode');
-  ELSE
-    interpol_postcode := NULL;
+  interpol_postcode := token_normalized_postcode(NEW.address->'postcode');
+
+  NEW.token_info := token_strip_info(NEW.token_info);
+  IF NEW.address ? '_inherited' THEN
+    NEW.address := hstore('interpolation', NEW.interpolationtype);
   END IF;
 
   -- if the line was newly inserted, split the line as necessary
@@ -202,12 +210,13 @@ BEGIN
 
             -- determine postcode
             postcode := coalesce(interpol_postcode,
-                                 prevnode.address->'postcode',
-                                 nextnode.address->'postcode',
+                                 token_normalized_postcode(prevnode.address->'postcode'),
+                                 token_normalized_postcode(nextnode.address->'postcode'),
                                  postcode);
 
             IF postcode is NULL THEN
-                SELECT placex.postcode FROM placex WHERE place_id = NEW.parent_place_id INTO postcode;
+                SELECT token_normalized_postcode(placex.postcode)
+                  FROM placex WHERE place_id = NEW.parent_place_id INTO postcode;
             END IF;
             IF postcode is NULL THEN
                 postcode := get_nearest_postcode(NEW.country_code, nextnode.geometry);
@@ -217,7 +226,7 @@ BEGIN
                 NEW.startnumber := startnumber;
                 NEW.endnumber := endnumber;
                 NEW.linegeo := sectiongeo;
-                NEW.postcode := upper(trim(postcode));
+                NEW.postcode := postcode;
              ELSE
               insert into location_property_osmline
                      (linegeo, partition, osm_id, parent_place_id,
diff --git a/lib-sql/functions/normalization.sql b/lib-sql/functions/normalization.sql
deleted file mode 100644 (file)
index c7bd2fc..0000000
+++ /dev/null
@@ -1,525 +0,0 @@
--- Functions for term normalisation and access to the 'word' table.
-
-CREATE OR REPLACE FUNCTION transliteration(text) RETURNS text
-  AS '{{ modulepath }}/nominatim.so', 'transliteration'
-LANGUAGE c IMMUTABLE STRICT;
-
-
-CREATE OR REPLACE FUNCTION gettokenstring(text) RETURNS text
-  AS '{{ modulepath }}/nominatim.so', 'gettokenstring'
-LANGUAGE c IMMUTABLE STRICT;
-
-
-CREATE OR REPLACE FUNCTION make_standard_name(name TEXT) RETURNS TEXT
-  AS $$
-DECLARE
-  o TEXT;
-BEGIN
-  o := public.gettokenstring(public.transliteration(name));
-  RETURN trim(substr(o,1,length(o)));
-END;
-$$
-LANGUAGE plpgsql IMMUTABLE;
-
--- returns NULL if the word is too common
-CREATE OR REPLACE FUNCTION getorcreate_word_id(lookup_word TEXT) 
-  RETURNS INTEGER
-  AS $$
-DECLARE
-  lookup_token TEXT;
-  return_word_id INTEGER;
-  count INTEGER;
-BEGIN
-  lookup_token := trim(lookup_word);
-  SELECT min(word_id), max(search_name_count) FROM word
-    WHERE word_token = lookup_token and class is null and type is null
-    INTO return_word_id, count;
-  IF return_word_id IS NULL THEN
-    return_word_id := nextval('seq_word');
-    INSERT INTO word VALUES (return_word_id, lookup_token, null, null, null, null, 0);
-  ELSE
-    IF count > get_maxwordfreq() THEN
-      return_word_id := NULL;
-    END IF;
-  END IF;
-  RETURN return_word_id;
-END;
-$$
-LANGUAGE plpgsql;
-
--- Create housenumber tokens from an OSM addr:housenumber.
--- The housenumber is split at comma and semicolon as necessary.
--- The function returns the normalized form of the housenumber suitable
--- for comparison.
-CREATE OR REPLACE FUNCTION create_housenumber_id(housenumber TEXT)
-  RETURNS TEXT
-  AS $$
-DECLARE
-  normtext TEXT;
-BEGIN
-  SELECT array_to_string(array_agg(trans), ';')
-    INTO normtext
-    FROM (SELECT lookup_word as trans, getorcreate_housenumber_id(lookup_word)
-          FROM (SELECT make_standard_name(h) as lookup_word
-                FROM regexp_split_to_table(housenumber, '[,;]') h) x) y;
-
-  return normtext;
-END;
-$$ LANGUAGE plpgsql STABLE STRICT;
-
-CREATE OR REPLACE FUNCTION getorcreate_housenumber_id(lookup_word TEXT)
-  RETURNS INTEGER
-  AS $$
-DECLARE
-  lookup_token TEXT;
-  return_word_id INTEGER;
-BEGIN
-  lookup_token := ' ' || trim(lookup_word);
-  SELECT min(word_id) FROM word
-    WHERE word_token = lookup_token and class='place' and type='house'
-    INTO return_word_id;
-  IF return_word_id IS NULL THEN
-    return_word_id := nextval('seq_word');
-    INSERT INTO word VALUES (return_word_id, lookup_token, null,
-                             'place', 'house', null, 0);
-  END IF;
-  RETURN return_word_id;
-END;
-$$
-LANGUAGE plpgsql;
-
-
-CREATE OR REPLACE FUNCTION getorcreate_postcode_id(postcode TEXT)
-  RETURNS INTEGER
-  AS $$
-DECLARE
-  lookup_token TEXT;
-  lookup_word TEXT;
-  return_word_id INTEGER;
-BEGIN
-  lookup_word := upper(trim(postcode));
-  lookup_token := ' ' || make_standard_name(lookup_word);
-  SELECT min(word_id) FROM word
-    WHERE word_token = lookup_token and word = lookup_word
-          and class='place' and type='postcode'
-    INTO return_word_id;
-  IF return_word_id IS NULL THEN
-    return_word_id := nextval('seq_word');
-    INSERT INTO word VALUES (return_word_id, lookup_token, lookup_word,
-                             'place', 'postcode', null, 0);
-  END IF;
-  RETURN return_word_id;
-END;
-$$
-LANGUAGE plpgsql;
-
-
-CREATE OR REPLACE FUNCTION getorcreate_country(lookup_word TEXT,
-                                               lookup_country_code varchar(2))
-  RETURNS INTEGER
-  AS $$
-DECLARE
-  lookup_token TEXT;
-  return_word_id INTEGER;
-BEGIN
-  lookup_token := ' '||trim(lookup_word);
-  SELECT min(word_id) FROM word
-    WHERE word_token = lookup_token and country_code=lookup_country_code
-    INTO return_word_id;
-  IF return_word_id IS NULL THEN
-    return_word_id := nextval('seq_word');
-    INSERT INTO word VALUES (return_word_id, lookup_token, null,
-                             null, null, lookup_country_code, 0);
-  END IF;
-  RETURN return_word_id;
-END;
-$$
-LANGUAGE plpgsql;
-
-
-CREATE OR REPLACE FUNCTION getorcreate_amenity(lookup_word TEXT, normalized_word TEXT,
-                                               lookup_class text, lookup_type text)
-  RETURNS INTEGER
-  AS $$
-DECLARE
-  lookup_token TEXT;
-  return_word_id INTEGER;
-BEGIN
-  lookup_token := ' '||trim(lookup_word);
-  SELECT min(word_id) FROM word
-  WHERE word_token = lookup_token and word = normalized_word
-        and class = lookup_class and type = lookup_type
-  INTO return_word_id;
-  IF return_word_id IS NULL THEN
-    return_word_id := nextval('seq_word');
-    INSERT INTO word VALUES (return_word_id, lookup_token, normalized_word,
-                             lookup_class, lookup_type, null, 0);
-  END IF;
-  RETURN return_word_id;
-END;
-$$
-LANGUAGE plpgsql;
-
-
-CREATE OR REPLACE FUNCTION getorcreate_amenityoperator(lookup_word TEXT,
-                                                       normalized_word TEXT,
-                                                       lookup_class text,
-                                                       lookup_type text,
-                                                       op text)
-  RETURNS INTEGER
-  AS $$
-DECLARE
-  lookup_token TEXT;
-  return_word_id INTEGER;
-BEGIN
-  lookup_token := ' '||trim(lookup_word);
-  SELECT min(word_id) FROM word
-  WHERE word_token = lookup_token and word = normalized_word
-        and class = lookup_class and type = lookup_type and operator = op
-  INTO return_word_id;
-  IF return_word_id IS NULL THEN
-    return_word_id := nextval('seq_word');
-    INSERT INTO word VALUES (return_word_id, lookup_token, normalized_word,
-                             lookup_class, lookup_type, null, 0, op);
-  END IF;
-  RETURN return_word_id;
-END;
-$$
-LANGUAGE plpgsql;
-
-
-CREATE OR REPLACE FUNCTION getorcreate_name_id(lookup_word TEXT, src_word TEXT)
-  RETURNS INTEGER
-  AS $$
-DECLARE
-  lookup_token TEXT;
-  nospace_lookup_token TEXT;
-  return_word_id INTEGER;
-BEGIN
-  lookup_token := ' '||trim(lookup_word);
-  SELECT min(word_id) FROM word
-  WHERE word_token = lookup_token and class is null and type is null
-  INTO return_word_id;
-  IF return_word_id IS NULL THEN
-    return_word_id := nextval('seq_word');
-    INSERT INTO word VALUES (return_word_id, lookup_token, src_word,
-                             null, null, null, 0);
-  END IF;
-  RETURN return_word_id;
-END;
-$$
-LANGUAGE plpgsql;
-
-
-CREATE OR REPLACE FUNCTION getorcreate_name_id(lookup_word TEXT)
-  RETURNS INTEGER
-  AS $$
-DECLARE
-BEGIN
-  RETURN getorcreate_name_id(lookup_word, '');
-END;
-$$
-LANGUAGE plpgsql;
-
--- Normalize a string and lookup its word ids (partial words).
-CREATE OR REPLACE FUNCTION addr_ids_from_name(lookup_word TEXT)
-  RETURNS INTEGER[]
-  AS $$
-DECLARE
-  words TEXT[];
-  id INTEGER;
-  return_word_id INTEGER[];
-  word_ids INTEGER[];
-  j INTEGER;
-BEGIN
-  words := string_to_array(make_standard_name(lookup_word), ' ');
-  IF array_upper(words, 1) IS NOT NULL THEN
-    FOR j IN 1..array_upper(words, 1) LOOP
-      IF (words[j] != '') THEN
-        SELECT array_agg(word_id) INTO word_ids
-          FROM word
-         WHERE word_token = words[j] and class is null and type is null;
-
-        IF word_ids IS NULL THEN
-          id := nextval('seq_word');
-          INSERT INTO word VALUES (id, words[j], null, null, null, null, 0);
-          return_word_id := return_word_id || id;
-        ELSE
-          return_word_id := array_merge(return_word_id, word_ids);
-        END IF;
-      END IF;
-    END LOOP;
-  END IF;
-
-  RETURN return_word_id;
-END;
-$$
-LANGUAGE plpgsql;
-
-
--- Normalize a string and look up its name ids (full words).
-CREATE OR REPLACE FUNCTION word_ids_from_name(lookup_word TEXT)
-  RETURNS INTEGER[]
-  AS $$
-DECLARE
-  lookup_token TEXT;
-  return_word_ids INTEGER[];
-BEGIN
-  lookup_token := ' '|| make_standard_name(lookup_word);
-  SELECT array_agg(word_id) FROM word
-    WHERE word_token = lookup_token and class is null and type is null
-    INTO return_word_ids;
-  RETURN return_word_ids;
-END;
-$$
-LANGUAGE plpgsql STABLE STRICT;
-
-
-CREATE OR REPLACE FUNCTION create_country(src HSTORE, country_code varchar(2))
-  RETURNS VOID
-  AS $$
-DECLARE
-  s TEXT;
-  w INTEGER;
-  words TEXT[];
-  item RECORD;
-  j INTEGER;
-BEGIN
-  FOR item IN SELECT (each(src)).* LOOP
-
-    s := make_standard_name(item.value);
-    w := getorcreate_country(s, country_code);
-
-    words := regexp_split_to_array(item.value, E'[,;()]');
-    IF array_upper(words, 1) != 1 THEN
-      FOR j IN 1..array_upper(words, 1) LOOP
-        s := make_standard_name(words[j]);
-        IF s != '' THEN
-          w := getorcreate_country(s, country_code);
-        END IF;
-      END LOOP;
-    END IF;
-  END LOOP;
-END;
-$$
-LANGUAGE plpgsql;
-
-
-CREATE OR REPLACE FUNCTION make_keywords(src HSTORE)
-  RETURNS INTEGER[]
-  AS $$
-DECLARE
-  result INTEGER[];
-  s TEXT;
-  w INTEGER;
-  words TEXT[];
-  item RECORD;
-  j INTEGER;
-BEGIN
-  result := '{}'::INTEGER[];
-
-  FOR item IN SELECT (each(src)).* LOOP
-
-    s := make_standard_name(item.value);
-    w := getorcreate_name_id(s, item.value);
-
-    IF not(ARRAY[w] <@ result) THEN
-      result := result || w;
-    END IF;
-
-    w := getorcreate_word_id(s);
-
-    IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
-      result := result || w;
-    END IF;
-
-    words := string_to_array(s, ' ');
-    IF array_upper(words, 1) IS NOT NULL THEN
-      FOR j IN 1..array_upper(words, 1) LOOP
-        IF (words[j] != '') THEN
-          w = getorcreate_word_id(words[j]);
-          IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
-            result := result || w;
-          END IF;
-        END IF;
-      END LOOP;
-    END IF;
-
-    words := regexp_split_to_array(item.value, E'[,;()]');
-    IF array_upper(words, 1) != 1 THEN
-      FOR j IN 1..array_upper(words, 1) LOOP
-        s := make_standard_name(words[j]);
-        IF s != '' THEN
-          w := getorcreate_word_id(s);
-          IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
-            result := result || w;
-          END IF;
-        END IF;
-      END LOOP;
-    END IF;
-
-    s := regexp_replace(item.value, '市$', '');
-    IF s != item.value THEN
-      s := make_standard_name(s);
-      IF s != '' THEN
-        w := getorcreate_name_id(s, item.value);
-        IF NOT (ARRAY[w] <@ result) THEN
-          result := result || w;
-        END IF;
-      END IF;
-    END IF;
-
-  END LOOP;
-
-  RETURN result;
-END;
-$$
-LANGUAGE plpgsql;
-
-
-CREATE OR REPLACE FUNCTION precompute_words(src TEXT)
-  RETURNS INTEGER
-  AS $$
-DECLARE
-  s TEXT;
-  w INTEGER;
-  words TEXT[];
-  i INTEGER;
-  j INTEGER;
-BEGIN
-  s := make_standard_name(src);
-  w := getorcreate_name_id(s, src);
-
-  w := getorcreate_word_id(s);
-
-  words := string_to_array(s, ' ');
-  IF array_upper(words, 1) IS NOT NULL THEN
-    FOR j IN 1..array_upper(words, 1) LOOP
-      IF (words[j] != '') THEN
-        w := getorcreate_word_id(words[j]);
-      END IF;
-    END LOOP;
-  END IF;
-
-  words := regexp_split_to_array(src, E'[,;()]');
-  IF array_upper(words, 1) != 1 THEN
-    FOR j IN 1..array_upper(words, 1) LOOP
-      s := make_standard_name(words[j]);
-      IF s != '' THEN
-        w := getorcreate_word_id(s);
-      END IF;
-    END LOOP;
-  END IF;
-
-  s := regexp_replace(src, '市$', '');
-  IF s != src THEN
-    s := make_standard_name(s);
-    IF s != '' THEN
-      w := getorcreate_name_id(s, src);
-    END IF;
-  END IF;
-
-  RETURN 1;
-END;
-$$
-LANGUAGE plpgsql;
-
-
-CREATE OR REPLACE FUNCTION create_poi_search_terms(obj_place_id BIGINT,
-                                                   in_partition SMALLINT,
-                                                   parent_place_id BIGINT,
-                                                   address HSTORE,
-                                                   country TEXT,
-                                                   housenumber TEXT,
-                                                   initial_name_vector INTEGER[],
-                                                   geometry GEOMETRY,
-                                                   OUT name_vector INTEGER[],
-                                                   OUT nameaddress_vector INTEGER[])
-  AS $$
-DECLARE
-  parent_name_vector INTEGER[];
-  parent_address_vector INTEGER[];
-  addr_place_ids INTEGER[];
-
-  addr_item RECORD;
-  parent_address_place_ids BIGINT[];
-  filtered_address HSTORE;
-BEGIN
-  nameaddress_vector := '{}'::INTEGER[];
-
-  SELECT s.name_vector, s.nameaddress_vector
-    INTO parent_name_vector, parent_address_vector
-    FROM search_name s
-    WHERE s.place_id = parent_place_id;
-
-  -- Find all address tags that don't appear in the parent search names.
-  SELECT hstore(array_agg(ARRAY[k, v])) INTO filtered_address
-    FROM (SELECT skeys(address) as k, svals(address) as v) a
-   WHERE not addr_ids_from_name(v) && parent_address_vector
-         AND k not in ('country', 'street', 'place', 'postcode',
-                       'housenumber', 'streetnumber', 'conscriptionnumber');
-
-  -- Compute all search terms from the addr: tags.
-  IF filtered_address IS NOT NULL THEN
-    FOR addr_item IN
-      SELECT * FROM
-        get_places_for_addr_tags(in_partition, geometry, filtered_address, country)
-    LOOP
-        IF addr_item.place_id is null THEN
-            nameaddress_vector := array_merge(nameaddress_vector,
-                                              addr_item.keywords);
-            CONTINUE;
-        END IF;
-
-        IF parent_address_place_ids is null THEN
-            SELECT array_agg(parent_place_id) INTO parent_address_place_ids
-              FROM place_addressline
-             WHERE place_id = parent_place_id;
-        END IF;
-
-        IF not parent_address_place_ids @> ARRAY[addr_item.place_id] THEN
-            nameaddress_vector := array_merge(nameaddress_vector,
-                                              addr_item.keywords);
-
-            INSERT INTO place_addressline (place_id, address_place_id, fromarea,
-                                           isaddress, distance, cached_rank_address)
-            VALUES (obj_place_id, addr_item.place_id, not addr_item.isguess,
-                    true, addr_item.distance, addr_item.rank_address);
-        END IF;
-    END LOOP;
-  END IF;
-
-  name_vector := initial_name_vector;
-
-  -- Check if the parent covers all address terms.
-  -- If not, create a search name entry with the house number as the name.
-  -- This is unusual for the search_name table but prevents that the place
-  -- is returned when we only search for the street/place.
-
-  IF housenumber is not null and not nameaddress_vector <@ parent_address_vector THEN
-    name_vector := array_merge(name_vector,
-                               ARRAY[getorcreate_housenumber_id(make_standard_name(housenumber))]);
-  END IF;
-
-  IF not address ? 'street' and address ? 'place' THEN
-    addr_place_ids := addr_ids_from_name(address->'place');
-    IF not addr_place_ids <@ parent_name_vector THEN
-      -- make sure addr:place terms are always searchable
-      nameaddress_vector := array_merge(nameaddress_vector, addr_place_ids);
-      -- If there is a housenumber, also add the place name as a name,
-      -- so we can search it by the usual housenumber+place algorithms.
-      IF housenumber is not null THEN
-        name_vector := array_merge(name_vector,
-                                   ARRAY[getorcreate_name_id(make_standard_name(address->'place'))]);
-      END IF;
-    END IF;
-  END IF;
-
-  -- Cheating here by not recomputing all terms but simply using the ones
-  -- from the parent object.
-  nameaddress_vector := array_merge(nameaddress_vector, parent_name_vector);
-  nameaddress_vector := array_merge(nameaddress_vector, parent_address_vector);
-
-END;
-$$
-LANGUAGE plpgsql;
diff --git a/lib-sql/functions/partition-functions.sql b/lib-sql/functions/partition-functions.sql
index cfa151de14c8aa268ebad34b70901e94c38058f5..53aba22c90a3e15290c76ad125c0c7f11982ae52 100644 (file)
@@ -63,54 +63,36 @@ END
 $$
 LANGUAGE plpgsql STABLE;
 
-CREATE OR REPLACE FUNCTION get_places_for_addr_tags(in_partition SMALLINT,
-                                                    feature GEOMETRY,
-                                                    address HSTORE, country TEXT)
-  RETURNS SETOF nearfeaturecentr
+
+CREATE OR REPLACE FUNCTION get_address_place(in_partition SMALLINT, feature GEOMETRY,
+                                             from_rank SMALLINT, to_rank SMALLINT,
+                                             extent FLOAT, tokens INT[])
+  RETURNS nearfeaturecentr
   AS $$
 DECLARE
   r nearfeaturecentr%rowtype;
-  item RECORD;
 BEGIN
-  FOR item IN
-    SELECT (get_addr_tag_rank(key, country)).*, key, name FROM
-      (SELECT skeys(address) as key, svals(address) as name) x
-  LOOP
-   IF item.from_rank is null THEN
-     CONTINUE;
-   END IF;
-
 {% for partition in db.partitions %}
-    IF in_partition = {{ partition }} THEN
-        SELECT place_id, keywords, rank_address, rank_search,
-               min(ST_Distance(feature, centroid)) as distance,
-               isguess, postcode, centroid INTO r
+  IF in_partition = {{ partition }} THEN
+      SELECT place_id, keywords, rank_address, rank_search,
+             min(ST_Distance(feature, centroid)) as distance,
+             isguess, postcode, centroid INTO r
         FROM location_area_large_{{ partition }}
-        WHERE geometry && ST_Expand(feature, item.extent)
-          AND rank_address between item.from_rank and item.to_rank
-          AND word_ids_from_name(item.name) && keywords
+        WHERE geometry && ST_Expand(feature, extent)
+              AND rank_address between from_rank and to_rank
+              AND tokens && keywords
         GROUP BY place_id, keywords, rank_address, rank_search, isguess, postcode, centroid
         ORDER BY bool_or(ST_Intersects(geometry, feature)), distance LIMIT 1;
-      IF r.place_id is null THEN
-        -- If we cannot find a place for the term, just return the
-        -- search term for the given name. That ensures that the address
-        -- element can still be searched for, even though it will not be
-        -- displayed.
-        RETURN NEXT ROW(null, addr_ids_from_name(item.name), null, null,
-                        null, null, null, null)::nearfeaturecentr;
-      ELSE
-        RETURN NEXT r;
-      END IF;
-      CONTINUE;
-    END IF;
+      RETURN r;
+  END IF;
 {% endfor %}
 
-    RAISE EXCEPTION 'Unknown partition %', in_partition;
-  END LOOP;
+  RAISE EXCEPTION 'Unknown partition %', in_partition;
 END;
 $$
 LANGUAGE plpgsql STABLE;
 
+
 create or replace function deleteLocationArea(in_partition INTEGER, in_place_id BIGINT, in_rank_search INTEGER) RETURNS BOOLEAN AS $$
 DECLARE
 BEGIN
diff --git a/lib-sql/functions/placex_triggers.sql b/lib-sql/functions/placex_triggers.sql
index 812bc79ff5c44314858af1ae350ec7d2c8cd0597..9a31f3ae327c338dc0d1538f68b51e03f1591148 100644 (file)
@@ -1,5 +1,84 @@
 -- Trigger functions for the placex table.
 
+-- Retrieve the data needed by the indexer for updating the place.
+--
+-- Return parameters:
+--  name            list of names
+--  address         list of address tags, either from the object or a surrounding
+--                  building
+--  country_feature If the place is a country feature, this contains the
+--                  country code, otherwise it is null.
+CREATE OR REPLACE FUNCTION placex_prepare_update(p placex,
+                                                 OUT name HSTORE,
+                                                 OUT address HSTORE,
+                                                 OUT country_feature VARCHAR)
+  AS $$
+BEGIN
+  -- For POI nodes, check if the address should be derived from a surrounding
+  -- building.
+  IF p.rank_search < 30 OR p.osm_type != 'N' OR p.address is not null THEN
+    address := p.address;
+  ELSE
+    -- The additional && condition works around the misguided query
+    -- planner of postgis 3.0.
+    SELECT placex.address || hstore('_inherited', '') INTO address
+      FROM placex
+     WHERE ST_Covers(geometry, p.centroid)
+           and geometry && p.centroid
+           and placex.address is not null
+           and (placex.address ? 'housenumber' or placex.address ? 'street' or placex.address ? 'place')
+           and rank_search = 30 AND ST_GeometryType(geometry) in ('ST_Polygon','ST_MultiPolygon')
+     LIMIT 1;
+  END IF;
+
+  address := address - '_unlisted_place'::TEXT;
+  name := p.name;
+
+  country_feature := CASE WHEN p.admin_level = 2
+                               and p.class = 'boundary' and p.type = 'administrative'
+                               and p.osm_type = 'R'
+                          THEN p.country_code
+                          ELSE null
+                     END;
+END;
+$$
+LANGUAGE plpgsql STABLE;
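
placex_prepare_update is meant to be called from the Python indexer when it
collects the data handed to the tokenizer. A minimal sketch of the kind of query
the runner can issue (the place_id filter is illustrative, not part of the patch):

    SELECT place_id, (placex_prepare_update(placex)).*
      FROM placex
     WHERE place_id IN (1, 2, 3);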
+
+
+CREATE OR REPLACE FUNCTION find_associated_street(poi_osm_type CHAR(1),
+                                                  poi_osm_id BIGINT)
+  RETURNS BIGINT
+  AS $$
+DECLARE
+  location RECORD;
+  parent RECORD;
+BEGIN
+  FOR location IN
+    SELECT members FROM planet_osm_rels
+    WHERE parts @> ARRAY[poi_osm_id]
+          and members @> ARRAY[lower(poi_osm_type) || poi_osm_id]
+          and tags @> ARRAY['associatedStreet']
+  LOOP
+    FOR i IN 1..array_upper(location.members, 1) BY 2 LOOP
+      IF location.members[i+1] = 'street' THEN
+        FOR parent IN
+          SELECT place_id from placex
+           WHERE osm_type = 'W' and osm_id = substring(location.members[i],2)::bigint
+                 and name is not null
+                 and rank_search between 26 and 27
+        LOOP
+          RETURN parent.place_id;
+        END LOOP;
+      END IF;
+    END LOOP;
+  END LOOP;
+
+  RETURN NULL;
+END;
+$$
+LANGUAGE plpgsql STABLE;
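
The members column follows the osm2pgsql convention of alternating member ids
and roles, which is why the inner loop steps BY 2. A hypothetical relation for
illustration:

    -- members = {w12345,street,n678,house}
    -- i = 1: members[2] = 'street', so 'w12345' is parsed and way 12345
    --        is looked up among the named placex ways of rank 26 and 27.
    SELECT find_associated_street('N', 678);  -- place_id of way 12345, if indexed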
+
+
 -- Find the parent road of a POI.
 --
 -- \returns Place ID of parent object or NULL if none
@@ -10,118 +89,89 @@ CREATE OR REPLACE FUNCTION find_parent_for_poi(poi_osm_type CHAR(1),
                                                poi_osm_id BIGINT,
                                                poi_partition SMALLINT,
                                                bbox GEOMETRY,
-                                               addr_street TEXT,
-                                               addr_place TEXT,
-                                               fallback BOOL = true)
+                                               addr_street INTEGER[],
+                                               addr_place INTEGER[],
+                                               is_place_addr BOOLEAN)
   RETURNS BIGINT
   AS $$
 DECLARE
   parent_place_id BIGINT DEFAULT NULL;
   location RECORD;
-  parent RECORD;
 BEGIN
-    {% if debug %}RAISE WARNING 'finding street for % %', poi_osm_type, poi_osm_id;{% endif %}
+  {% if debug %}RAISE WARNING 'finding street for % %', poi_osm_type, poi_osm_id;{% endif %}
+
+  -- Is this object part of an associatedStreet relation?
+  parent_place_id := find_associated_street(poi_osm_type, poi_osm_id);
 
-    -- Is this object part of an associatedStreet relation?
+  IF parent_place_id is null THEN
+    parent_place_id := find_parent_for_address(addr_street, addr_place,
+                                               poi_partition, bbox);
+  END IF;
+
+  IF parent_place_id is null and poi_osm_type = 'N' THEN
+    -- Is this node part of an interpolation?
     FOR location IN
-      SELECT members FROM planet_osm_rels
-      WHERE parts @> ARRAY[poi_osm_id]
-        and members @> ARRAY[lower(poi_osm_type) || poi_osm_id]
-        and tags @> ARRAY['associatedStreet']
+      SELECT q.parent_place_id
+        FROM location_property_osmline q, planet_osm_ways x
+       WHERE q.linegeo && bbox and x.id = q.osm_id
+             and poi_osm_id = any(x.nodes)
+       LIMIT 1
     LOOP
-      FOR i IN 1..array_upper(location.members, 1) BY 2 LOOP
-        IF location.members[i+1] = 'street' THEN
-          FOR parent IN
-            SELECT place_id from placex
-             WHERE osm_type = 'W' and osm_id = substring(location.members[i],2)::bigint
-               and name is not null
-               and rank_search between 26 and 27
-          LOOP
-            RETURN parent.place_id;
-          END LOOP;
-        END IF;
-      END LOOP;
+      {% if debug %}RAISE WARNING 'Get parent from interpolation: %', location.parent_place_id;{% endif %}
+      RETURN location.parent_place_id;
     END LOOP;
 
-    parent_place_id := find_parent_for_address(addr_street, addr_place,
-                                               poi_partition, bbox);
-    IF parent_place_id is not null THEN
-      RETURN parent_place_id;
-    END IF;
+    FOR location IN
+      SELECT p.place_id, p.osm_id, p.rank_search, p.address,
+             coalesce(p.centroid, ST_Centroid(p.geometry)) as centroid
+        FROM placex p, planet_osm_ways w
+       WHERE p.osm_type = 'W' and p.rank_search >= 26
+             and p.geometry && bbox
+             and w.id = p.osm_id and poi_osm_id = any(w.nodes)
+    LOOP
+      {% if debug %}RAISE WARNING 'Node is part of way % ', location.osm_id;{% endif %}
+
+      -- If the way IS a road, then we are on it - that must be our road
+      IF location.rank_search < 28 THEN
+        {% if debug %}RAISE WARNING 'node in way that is a street %',location;{% endif %}
+        RETURN location.place_id;
+      END IF;
+
+      parent_place_id := find_associated_street('W', location.osm_id);
+    END LOOP;
+  END IF;
 
-    IF poi_osm_type = 'N' THEN
-      -- Is this node part of an interpolation?
-      FOR parent IN
-        SELECT q.parent_place_id
-          FROM location_property_osmline q, planet_osm_ways x
-         WHERE q.linegeo && bbox and x.id = q.osm_id
-               and poi_osm_id = any(x.nodes)
-         LIMIT 1
+  IF parent_place_id is NULL THEN
+    IF is_place_addr THEN
+      -- The address is attached to a place we don't know.
+      -- Instead simply use the containing area with the largest rank.
+      FOR location IN
+        SELECT place_id FROM placex
+         WHERE bbox && geometry AND _ST_Covers(geometry, ST_Centroid(bbox))
+               AND rank_address between 5 and 25
+         ORDER BY rank_address desc
       LOOP
-        {% if debug %}RAISE WARNING 'Get parent from interpolation: %', parent.parent_place_id;{% endif %}
-        RETURN parent.parent_place_id;
+        RETURN location.place_id;
       END LOOP;
-
-      -- Is this node part of any other way?
+    ELSEIF ST_Area(bbox) < 0.005 THEN
+      -- for smaller features get the nearest road
+      SELECT getNearestRoadPlaceId(poi_partition, bbox) INTO parent_place_id;
+      {% if debug %}RAISE WARNING 'Checked for nearest way (%)', parent_place_id;{% endif %}
+    ELSE
+      -- for larger features simply find the area with the largest rank that
+      -- contains the bbox, only use addressable features
       FOR location IN
-        SELECT p.place_id, p.osm_id, p.rank_search, p.address,
-               coalesce(p.centroid, ST_Centroid(p.geometry)) as centroid
-          FROM placex p, planet_osm_ways w
-         WHERE p.osm_type = 'W' and p.rank_search >= 26
-               and p.geometry && bbox
-               and w.id = p.osm_id and poi_osm_id = any(w.nodes)
+        SELECT place_id FROM placex
+         WHERE bbox && geometry AND _ST_Covers(geometry, ST_Centroid(bbox))
+               AND rank_address between 5 and 25
+        ORDER BY rank_address desc
       LOOP
-        {% if debug %}RAISE WARNING 'Node is part of way % ', location.osm_id;{% endif %}
-
-        -- Way IS a road then we are on it - that must be our road
-        IF location.rank_search < 28 THEN
-          {% if debug %}RAISE WARNING 'node in way that is a street %',location;{% endif %}
-          return location.place_id;
-        END IF;
-
-        SELECT find_parent_for_poi('W', location.osm_id, poi_partition,
-                                   location.centroid,
-                                   location.address->'street',
-                                   location.address->'place',
-                                   false)
-          INTO parent_place_id;
-        IF parent_place_id is not null THEN
-          RETURN parent_place_id;
-        END IF;
+        RETURN location.place_id;
       END LOOP;
     END IF;
+  END IF;
 
-    IF fallback THEN
-      IF addr_street is null and addr_place is not null THEN
-        -- The address is attached to a place we don't know.
-        -- Instead simply use the containing area with the largest rank.
-        FOR location IN
-          SELECT place_id FROM placex
-            WHERE bbox && geometry AND _ST_Covers(geometry, ST_Centroid(bbox))
-                  AND rank_address between 5 and 25
-            ORDER BY rank_address desc
-        LOOP
-            RETURN location.place_id;
-        END LOOP;
-      ELSEIF ST_Area(bbox) < 0.005 THEN
-        -- for smaller features get the nearest road
-        SELECT getNearestRoadPlaceId(poi_partition, bbox) INTO parent_place_id;
-        {% if debug %}RAISE WARNING 'Checked for nearest way (%)', parent_place_id;{% endif %}
-      ELSE
-        -- for larger features simply find the area with the largest rank that
-        -- contains the bbox, only use addressable features
-        FOR location IN
-          SELECT place_id FROM placex
-            WHERE bbox && geometry AND _ST_Covers(geometry, ST_Centroid(bbox))
-                  AND rank_address between 5 and 25
-            ORDER BY rank_address desc
-        LOOP
-            RETURN location.place_id;
-        END LOOP;
-      END IF;
-    END IF;
-
-    RETURN parent_place_id;
+  RETURN parent_place_id;
 END;
 $$
 LANGUAGE plpgsql STABLE;
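
The fallback parameter is gone; the rewritten function always runs the full
cascade: associatedStreet relation, addr:street/addr:place match tokens,
interpolation lines and containing ways (for nodes), then the area and
nearest-road fallbacks. A call could look like this; all values are hypothetical
and the token arrays would normally come from the tokenizer:

    SELECT find_parent_for_poi('N', 123456, 7::SMALLINT,
                               ST_Expand('SRID=4326;POINT(2.35 48.85)'::GEOMETRY, 0.0001),
                               ARRAY[17, 18],    -- addr:street match tokens
                               NULL::INTEGER[],  -- no addr:place tokens
                               false);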
@@ -240,6 +290,101 @@ $$
 LANGUAGE plpgsql STABLE;
 
 
+CREATE OR REPLACE FUNCTION create_poi_search_terms(obj_place_id BIGINT,
+                                                   in_partition SMALLINT,
+                                                   parent_place_id BIGINT,
+                                                   is_place_addr BOOLEAN,
+                                                   country TEXT,
+                                                   token_info JSONB,
+                                                   geometry GEOMETRY,
+                                                   OUT name_vector INTEGER[],
+                                                   OUT nameaddress_vector INTEGER[])
+  AS $$
+DECLARE
+  parent_name_vector INTEGER[];
+  parent_address_vector INTEGER[];
+  addr_place_ids INTEGER[];
+  hnr_vector INTEGER[];
+
+  addr_item RECORD;
+  addr_place RECORD;
+  parent_address_place_ids BIGINT[];
+BEGIN
+  nameaddress_vector := '{}'::INTEGER[];
+
+  SELECT s.name_vector, s.nameaddress_vector
+    INTO parent_name_vector, parent_address_vector
+    FROM search_name s
+    WHERE s.place_id = parent_place_id;
+
+  FOR addr_item IN
+    SELECT (get_addr_tag_rank(key, country)).*, match_tokens, search_tokens
+      FROM token_get_address_tokens(token_info)
+      WHERE not search_tokens <@ parent_address_vector
+  LOOP
+    addr_place := get_address_place(in_partition, geometry,
+                                    addr_item.from_rank, addr_item.to_rank,
+                                    addr_item.extent, addr_item.match_tokens);
+
+    IF addr_place is null THEN
+      -- No place found in OSM that matches. Make it at least searchable.
+      nameaddress_vector := array_merge(nameaddress_vector, addr_item.search_tokens);
+    ELSE
+      IF parent_address_place_ids is null THEN
+        SELECT array_agg(parent_place_id) INTO parent_address_place_ids
+          FROM place_addressline
+          WHERE place_id = parent_place_id;
+      END IF;
+
+      -- If the parent already lists the place in place_addressline, then we
+      -- are done. Otherwise, add a place_addressline entry of our own.
+      IF not parent_address_place_ids @> ARRAY[addr_place.place_id] THEN
+        nameaddress_vector := array_merge(nameaddress_vector, addr_place.keywords);
+
+        INSERT INTO place_addressline (place_id, address_place_id, fromarea,
+                                       isaddress, distance, cached_rank_address)
+          VALUES (obj_place_id, addr_place.place_id, not addr_place.isguess,
+                    true, addr_place.distance, addr_place.rank_address);
+      END IF;
+    END IF;
+  END LOOP;
+
+  name_vector := token_get_name_search_tokens(token_info);
+
+  -- Check if the parent covers all address terms.
+  -- If not, create a search name entry with the house number as the name.
+-- This is unusual for the search_name table but prevents the place from
+-- being returned when we only search for the street/place.
+
+  hnr_vector := token_get_housenumber_search_tokens(token_info);
+
+  IF hnr_vector is not null and not nameaddress_vector <@ parent_address_vector THEN
+    name_vector := array_merge(name_vector, hnr_vector);
+  END IF;
+
+  IF is_place_addr THEN
+    addr_place_ids := token_addr_place_search_tokens(token_info);
+    IF not addr_place_ids <@ parent_name_vector THEN
+      -- make sure addr:place terms are always searchable
+      nameaddress_vector := array_merge(nameaddress_vector, addr_place_ids);
+      -- If there is a housenumber, also add the place name as a name,
+      -- so we can search it by the usual housenumber+place algorithms.
+      IF hnr_vector is not null THEN
+        name_vector := array_merge(name_vector, addr_place_ids);
+      END IF;
+    END IF;
+  END IF;
+
+  -- Cheating here by not recomputing all terms but simply using the ones
+  -- from the parent object.
+  nameaddress_vector := array_merge(nameaddress_vector, parent_name_vector);
+  nameaddress_vector := array_merge(nameaddress_vector, parent_address_vector);
+
+END;
+$$
+LANGUAGE plpgsql;
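
The vector bookkeeping above relies on the array_merge() helper from
lib-sql/functions/utils.sql, which concatenates two integer vectors while
skipping duplicates, e.g.:

    SELECT array_merge(ARRAY[1, 2, 3], ARRAY[3, 4]);
    -- expected result: {1,2,3,4}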
+
+
 -- Insert address of a place into the place_addressline table.
 --
 -- \param obj_place_id  Place_id of the place to compute the address for.
@@ -260,7 +405,7 @@ LANGUAGE plpgsql STABLE;
 CREATE OR REPLACE FUNCTION insert_addresslines(obj_place_id BIGINT,
                                                partition SMALLINT,
                                                maxrank SMALLINT,
-                                               address HSTORE,
+                                               token_info JSONB,
                                                geometry GEOMETRY,
                                                country TEXT,
                                                OUT parent_place_id BIGINT,
@@ -275,7 +420,8 @@ DECLARE
   current_node_area GEOMETRY := NULL;
 
   parent_place_rank INT := 0;
-  addr_place_ids BIGINT[];
+  addr_place_ids BIGINT[] := '{}'::int[];
+  new_address_vector INT[];
 
   location RECORD;
 BEGIN
@@ -285,16 +431,21 @@ BEGIN
   address_havelevel := array_fill(false, ARRAY[maxrank]);
 
   FOR location IN
-    SELECT * FROM get_places_for_addr_tags(partition, geometry,
-                                                   address, country)
-    ORDER BY rank_address, distance, isguess desc
+    SELECT (get_address_place(partition, geometry, from_rank, to_rank,
+                              extent, match_tokens)).*, search_tokens
+      FROM (SELECT (get_addr_tag_rank(key, country)).*, match_tokens, search_tokens
+              FROM token_get_address_tokens(token_info)) x
+      ORDER BY rank_address, distance, isguess desc
   LOOP
-    {% if not db.reverse_only %}
-      nameaddress_vector := array_merge(nameaddress_vector,
-                                        location.keywords::int[]);
-    {% endif %}
+    IF location.place_id is null THEN
+      {% if not db.reverse_only %}
+      nameaddress_vector := array_merge(nameaddress_vector, location.search_tokens);
+      {% endif %}
+    ELSE
+      {% if not db.reverse_only %}
+      nameaddress_vector := array_merge(nameaddress_vector, location.keywords::INTEGER[]);
+      {% endif %}
 
-    IF location.place_id is not null THEN
       location_isaddress := not address_havelevel[location.rank_address];
       IF not address_havelevel[location.rank_address] THEN
         address_havelevel[location.rank_address] := true;
@@ -309,13 +460,13 @@ BEGIN
         VALUES (obj_place_id, location.place_id, not location.isguess,
                 true, location.distance, location.rank_address);
 
-      addr_place_ids := array_append(addr_place_ids, location.place_id);
+      addr_place_ids := addr_place_ids || location.place_id;
     END IF;
   END LOOP;
 
   FOR location IN
     SELECT * FROM getNearFeatures(partition, geometry, maxrank)
-    WHERE addr_place_ids is null or not addr_place_ids @> ARRAY[place_id]
+    WHERE not addr_place_ids @> ARRAY[place_id]
     ORDER BY rank_address, isguess asc,
              distance *
                CASE WHEN rank_address = 16 AND rank_search = 15 THEN 0.2
@@ -397,10 +548,11 @@ BEGIN
   NEW.place_id := nextval('seq_place');
   NEW.indexed_status := 1; --STATUS_NEW
 
-  NEW.country_code := lower(get_country_code(NEW.geometry));
+  NEW.centroid := ST_PointOnSurface(NEW.geometry);
+  NEW.country_code := lower(get_country_code(NEW.centroid));
 
   NEW.partition := get_partition(NEW.country_code);
-  NEW.geometry_sector := geometry_sector(NEW.partition, NEW.geometry);
+  NEW.geometry_sector := geometry_sector(NEW.partition, NEW.centroid);
 
   IF NEW.osm_type = 'X' THEN
     -- E'X'ternal records should already be in the right format so do nothing
@@ -522,8 +674,8 @@ DECLARE
   parent_address_level SMALLINT;
   place_address_level SMALLINT;
 
-  addr_street TEXT;
-  addr_place TEXT;
+  addr_street INTEGER[];
+  addr_place INTEGER[];
 
   max_rank SMALLINT;
 
@@ -531,12 +683,11 @@ DECLARE
   nameaddress_vector INTEGER[];
   addr_nameaddress_vector INTEGER[];
 
-  inherited_address HSTORE;
-
   linked_node_id BIGINT;
   linked_importance FLOAT;
   linked_wikipedia TEXT;
 
+  is_place_address BOOLEAN;
   result BOOLEAN;
 BEGIN
   -- deferred delete
@@ -566,9 +717,9 @@ BEGIN
  -- update not necessary for osmline, because linked_place_id does not exist
 
   NEW.extratags := NEW.extratags - 'linked_place'::TEXT;
-  NEW.address := NEW.address - '_unlisted_place'::TEXT;
 
   IF NEW.linked_place_id is not null THEN
+    NEW.token_info := null;
     {% if debug %}RAISE WARNING 'place already linked to %', NEW.linked_place_id;{% endif %}
     RETURN NEW;
   END IF;
@@ -579,13 +730,34 @@ BEGIN
   -- imported as place=postcode. That's why relations are allowed to pass here.
   -- This can go away in a couple of versions.
   IF NEW.class = 'place'  and NEW.type = 'postcode' and NEW.osm_type != 'R' THEN
+    NEW.token_info := null;
     RETURN NEW;
   END IF;
 
-  -- Speed up searches - just use the centroid of the feature
-  -- cheaper but less acurate
+  -- Compute a preliminary centroid.
   NEW.centroid := ST_PointOnSurface(NEW.geometry);
-  {% if debug %}RAISE WARNING 'Computing preliminary centroid at %',ST_AsText(NEW.centroid);{% endif %}
+
+  -- recalculate country and partition
+  IF NEW.rank_search = 4 AND NEW.address is not NULL AND NEW.address ? 'country' THEN
+    -- for countries, believe the mapped country code,
+    -- so that we remain in the right partition if the boundaries
+    -- suddenly expand.
+    NEW.country_code := lower(NEW.address->'country');
+    NEW.partition := get_partition(lower(NEW.country_code));
+    IF NEW.partition = 0 THEN
+      NEW.country_code := lower(get_country_code(NEW.centroid));
+      NEW.partition := get_partition(NEW.country_code);
+    END IF;
+  ELSE
+    IF NEW.rank_search >= 4 THEN
+      NEW.country_code := lower(get_country_code(NEW.centroid));
+    ELSE
+      NEW.country_code := NULL;
+    END IF;
+    NEW.partition := get_partition(NEW.country_code);
+  END IF;
+  {% if debug %}RAISE WARNING 'Country updated: "%"', NEW.country_code;{% endif %}
+
 
   -- recompute the ranks, they might change when linking changes
   SELECT * INTO NEW.rank_search, NEW.rank_address
@@ -665,54 +837,12 @@ BEGIN
     parent_address_level := 3;
   END IF;
 
-  {% if debug %}RAISE WARNING 'Copy over address tags';{% endif %}
-  -- housenumber is a computed field, so start with an empty value
-  NEW.housenumber := NULL;
-  IF NEW.address is not NULL THEN
-      IF NEW.address ? 'conscriptionnumber' THEN
-        IF NEW.address ? 'streetnumber' THEN
-            NEW.housenumber := (NEW.address->'conscriptionnumber') || '/' || (NEW.address->'streetnumber');
-        ELSE
-            NEW.housenumber := NEW.address->'conscriptionnumber';
-        END IF;
-      ELSEIF NEW.address ? 'streetnumber' THEN
-        NEW.housenumber := NEW.address->'streetnumber';
-      ELSEIF NEW.address ? 'housenumber' THEN
-        NEW.housenumber := NEW.address->'housenumber';
-      END IF;
-      NEW.housenumber := create_housenumber_id(NEW.housenumber);
-
-      addr_street := NEW.address->'street';
-      addr_place := NEW.address->'place';
-
-      IF NEW.address ? 'postcode' and NEW.address->'postcode' not similar to '%(:|,|;)%' THEN
-        i := getorcreate_postcode_id(NEW.address->'postcode');
-      END IF;
-  END IF;
+  NEW.housenumber := token_normalized_housenumber(NEW.token_info);
+  addr_street := token_addr_street_match_tokens(NEW.token_info);
+  addr_place := token_addr_place_match_tokens(NEW.token_info);
 
   NEW.postcode := null;
 
-  -- recalculate country and partition
-  IF NEW.rank_search = 4 AND NEW.address is not NULL AND NEW.address ? 'country' THEN
-    -- for countries, believe the mapped country code,
-    -- so that we remain in the right partition if the boundaries
-    -- suddenly expand.
-    NEW.country_code := lower(NEW.address->'country');
-    NEW.partition := get_partition(lower(NEW.country_code));
-    IF NEW.partition = 0 THEN
-      NEW.country_code := lower(get_country_code(NEW.centroid));
-      NEW.partition := get_partition(NEW.country_code);
-    END IF;
-  ELSE
-    IF NEW.rank_search >= 4 THEN
-      NEW.country_code := lower(get_country_code(NEW.centroid));
-    ELSE
-      NEW.country_code := NULL;
-    END IF;
-    NEW.partition := get_partition(NEW.country_code);
-  END IF;
-  {% if debug %}RAISE WARNING 'Country updated: "%"', NEW.country_code;{% endif %}
-
   -- waterway ways are linked when they are part of a relation and have the same class/type
   IF NEW.osm_type = 'R' and NEW.class = 'waterway' THEN
       FOR relation_members IN select members from planet_osm_rels r where r.id = NEW.osm_id and r.parts != array[]::bigint[]
@@ -749,33 +879,14 @@ BEGIN
 
     {% if debug %}RAISE WARNING 'finding street for % %', NEW.osm_type, NEW.osm_id;{% endif %}
     NEW.parent_place_id := null;
-
-    -- if we have a POI and there is no address information,
-    -- see if we can get it from a surrounding building
-    inherited_address := ''::HSTORE;
-    IF NEW.osm_type = 'N' AND addr_street IS NULL AND addr_place IS NULL
-       AND NEW.housenumber IS NULL THEN
-      FOR location IN
-        -- The additional && condition works around the misguided query
-        -- planner of postgis 3.0.
-        SELECT address from placex where ST_Covers(geometry, NEW.centroid)
-            and geometry && NEW.centroid
-            and (address ? 'housenumber' or address ? 'street' or address ? 'place')
-            and rank_search > 28 AND ST_GeometryType(geometry) in ('ST_Polygon','ST_MultiPolygon')
-            limit 1
-      LOOP
-        NEW.housenumber := location.address->'housenumber';
-        addr_street := location.address->'street';
-        addr_place := location.address->'place';
-        inherited_address := location.address;
-      END LOOP;
-    END IF;
+    is_place_address := coalesce(not NEW.address ? 'street' and NEW.address ? 'place', FALSE);
 
     -- We have to find our parent road.
     NEW.parent_place_id := find_parent_for_poi(NEW.osm_type, NEW.osm_id,
                                                NEW.partition,
                                                ST_Envelope(NEW.geometry),
-                                               addr_street, addr_place);
+                                               addr_street, addr_place,
+                                               is_place_address);
 
     -- If we found the road take a shortcut here.
     -- Otherwise fall back to the full address getting method below.
@@ -785,12 +896,12 @@ BEGIN
       SELECT p.country_code, p.postcode, p.name FROM placex p
        WHERE p.place_id = NEW.parent_place_id INTO location;
 
-      IF addr_street is null and addr_place is not null THEN
+      IF is_place_address THEN
         -- Check if the addr:place tag is part of the parent name
         SELECT count(*) INTO i
-          FROM svals(location.name) AS pname WHERE pname = addr_place;
+          FROM svals(location.name) AS pname WHERE pname = NEW.address->'place';
         IF i = 0 THEN
-          NEW.address = NEW.address || hstore('_unlisted_place', addr_place);
+          NEW.address = NEW.address || hstore('_unlisted_place', NEW.address->'place');
         END IF;
       END IF;
 
@@ -798,39 +909,21 @@ BEGIN
       {% if debug %}RAISE WARNING 'Got parent details from search name';{% endif %}
 
       -- determine postcode
-      IF NEW.address is not null AND NEW.address ? 'postcode' THEN
-          NEW.postcode = upper(trim(NEW.address->'postcode'));
-      ELSE
-         NEW.postcode := location.postcode;
-      END IF;
-      IF NEW.postcode is null THEN
-        NEW.postcode := get_nearest_postcode(NEW.country_code, NEW.geometry);
-      END IF;
+      NEW.postcode := coalesce(token_normalized_postcode(NEW.address->'postcode'),
+                               location.postcode,
+                               get_nearest_postcode(NEW.country_code, NEW.geometry));
 
       IF NEW.name is not NULL THEN
           NEW.name := add_default_place_name(NEW.country_code, NEW.name);
-          name_vector := make_keywords(NEW.name);
-
-          IF NEW.rank_search <= 25 and NEW.rank_address > 0 THEN
-            result := add_location(NEW.place_id, NEW.country_code, NEW.partition,
-                                   name_vector, NEW.rank_search, NEW.rank_address,
-                                   upper(trim(NEW.address->'postcode')), NEW.geometry,
-                                   NEW.centroid);
-            {% if debug %}RAISE WARNING 'Place added to location table';{% endif %}
-          END IF;
-
       END IF;
 
       {% if not db.reverse_only %}
-      IF array_length(name_vector, 1) is not NULL
-         OR inherited_address is not NULL OR NEW.address is not NULL
-      THEN
+      IF NEW.name is not NULL OR NEW.address is not NULL THEN
         SELECT * INTO name_vector, nameaddress_vector
           FROM create_poi_search_terms(NEW.place_id,
                                        NEW.partition, NEW.parent_place_id,
-                                       inherited_address || NEW.address,
-                                       NEW.country_code, NEW.housenumber,
-                                       name_vector, NEW.centroid);
+                                       is_place_address, NEW.country_code,
+                                       NEW.token_info, NEW.centroid);
 
         IF array_length(name_vector, 1) is not NULL THEN
           INSERT INTO search_name (place_id, search_rank, address_rank,
@@ -844,6 +937,17 @@ BEGIN
       END IF;
       {% endif %}
 
+      NEW.token_info := token_strip_info(NEW.token_info);
+      -- If the address was inherited from a surrounding building,
+      -- do not add it permanently to the table.
+      IF NEW.address ? '_inherited' THEN
+        IF NEW.address ? '_unlisted_place' THEN
+          NEW.address := hstore('_unlisted_place', NEW.address->'_unlisted_place');
+        ELSE
+          NEW.address := null;
+        END IF;
+      END IF;
+
       RETURN NEW;
     END IF;
 
@@ -914,19 +1018,11 @@ BEGIN
     END IF;
   END IF;
 
-  -- Initialise the name vector using our name
-  NEW.name := add_default_place_name(NEW.country_code, NEW.name);
-  name_vector := make_keywords(NEW.name);
-
-  -- make sure all names are in the word table
   IF NEW.admin_level = 2
      AND NEW.class = 'boundary' AND NEW.type = 'administrative'
      AND NEW.country_code IS NOT NULL AND NEW.osm_type = 'R'
   THEN
-    PERFORM create_country(NEW.name, lower(NEW.country_code));
-    {% if debug %}RAISE WARNING 'Country names updated';{% endif %}
-
-    -- Also update the list of country names. Adding an additional sanity
+    -- Update the list of country names. Adding an additional sanity
     -- check here: make sure the country does overlap with the area where
     -- we expect it to be as per static country grid.
     FOR location IN
@@ -959,29 +1055,28 @@ BEGIN
   ELSEIF NEW.rank_address > 25 THEN
     max_rank := 25;
   ELSE
-    max_rank = NEW.rank_address;
+    max_rank := NEW.rank_address;
   END IF;
 
   SELECT * FROM insert_addresslines(NEW.place_id, NEW.partition, max_rank,
-                                    NEW.address, geom, NEW.country_code)
+                                    NEW.token_info, geom, NEW.country_code)
     INTO NEW.parent_place_id, NEW.postcode, nameaddress_vector;
 
   {% if debug %}RAISE WARNING 'RETURN insert_addresslines: %, %, %', NEW.parent_place_id, NEW.postcode, nameaddress_vector;{% endif %}
 
-  IF NEW.address is not null AND NEW.address ? 'postcode' 
-     AND NEW.address->'postcode' not similar to '%(,|;)%' THEN
-    NEW.postcode := upper(trim(NEW.address->'postcode'));
-  END IF;
-
-  IF NEW.postcode is null AND NEW.rank_search > 8 THEN
-    NEW.postcode := get_nearest_postcode(NEW.country_code, NEW.geometry);
-  END IF;
+  NEW.postcode := coalesce(token_normalized_postcode(NEW.address->'postcode'),
+                           NEW.postcode);
 
   -- if we have a name add this to the name search table
   IF NEW.name IS NOT NULL THEN
+    -- Initialise the name vector using our name
+    NEW.name := add_default_place_name(NEW.country_code, NEW.name);
+    name_vector := token_get_name_search_tokens(NEW.token_info);
 
     IF NEW.rank_search <= 25 and NEW.rank_address > 0 THEN
-      result := add_location(NEW.place_id, NEW.country_code, NEW.partition, name_vector, NEW.rank_search, NEW.rank_address, upper(trim(NEW.address->'postcode')), NEW.geometry, NEW.centroid);
+      result := add_location(NEW.place_id, NEW.country_code, NEW.partition,
+                             name_vector, NEW.rank_search, NEW.rank_address,
+                             NEW.postcode, NEW.geometry, NEW.centroid);
       {% if debug %}RAISE WARNING 'added to location (full)';{% endif %}
     END IF;
 
@@ -990,8 +1085,11 @@ BEGIN
       {% if debug %}RAISE WARNING 'insert into road location table (full)';{% endif %}
     END IF;
 
-    result := insertSearchName(NEW.partition, NEW.place_id, name_vector,
-                               NEW.rank_search, NEW.rank_address, NEW.geometry);
+    IF NEW.rank_address between 16 and 27 THEN
+      result := insertSearchName(NEW.partition, NEW.place_id,
+                                 token_get_name_match_tokens(NEW.token_info),
+                                 NEW.rank_search, NEW.rank_address, NEW.geometry);
+    END IF;
     {% if debug %}RAISE WARNING 'added to search name (full)';{% endif %}
 
     {% if not db.reverse_only %}
@@ -1002,11 +1100,15 @@ BEGIN
                        NEW.importance, NEW.country_code, name_vector,
                        nameaddress_vector, NEW.centroid);
     {% endif %}
+  END IF;
 
+  IF NEW.postcode is null AND NEW.rank_search > 8 THEN
+    NEW.postcode := get_nearest_postcode(NEW.country_code, NEW.geometry);
   END IF;
 
   {% if debug %}RAISE WARNING 'place update % % finished.', NEW.osm_type, NEW.osm_id;{% endif %}
 
+  NEW.token_info := token_strip_info(NEW.token_info);
   RETURN NEW;
 END;
 $$
index 4868b828e4bf6e988f1daf601f7655ac57bcba87..c308d0259b8505887d8c8fdc9a85630a20b3313f 100644 (file)
@@ -221,37 +221,30 @@ LANGUAGE plpgsql STABLE;
 -- \param centroid   Location of the address.
 --
 -- \return Place ID of the parent if one was found, NULL otherwise.
-CREATE OR REPLACE FUNCTION find_parent_for_address(street TEXT, place TEXT,
+CREATE OR REPLACE FUNCTION find_parent_for_address(street INTEGER[], place INTEGER[],
                                                    partition SMALLINT,
                                                    centroid GEOMETRY)
   RETURNS BIGINT
   AS $$
 DECLARE
   parent_place_id BIGINT;
-  word_ids INTEGER[];
 BEGIN
   IF street is not null THEN
     -- Check for addr:street attributes
     -- Note that addr:street links can only be indexed, once the street itself is indexed
-    word_ids := word_ids_from_name(street);
-    IF word_ids is not null THEN
-      parent_place_id := getNearestNamedRoadPlaceId(partition, centroid, word_ids);
-      IF parent_place_id is not null THEN
-        {% if debug %}RAISE WARNING 'Get parent form addr:street: %', parent_place_id;{% endif %}
-        RETURN parent_place_id;
-      END IF;
+    parent_place_id := getNearestNamedRoadPlaceId(partition, centroid, street);
+    IF parent_place_id is not null THEN
+      {% if debug %}RAISE WARNING 'Get parent from addr:street: %', parent_place_id;{% endif %}
+      RETURN parent_place_id;
     END IF;
   END IF;
 
   -- Check for addr:place attributes.
   IF place is not null THEN
-    word_ids := word_ids_from_name(place);
-    IF word_ids is not null THEN
-      parent_place_id := getNearestNamedPlacePlaceId(partition, centroid, word_ids);
-      IF parent_place_id is not null THEN
-        {% if debug %}RAISE WARNING 'Get parent form addr:place: %', parent_place_id;{% endif %}
-        RETURN parent_place_id;
-      END IF;
+    parent_place_id := getNearestNamedPlacePlaceId(partition, centroid, place);
+    IF parent_place_id is not null THEN
+      {% if debug %}RAISE WARNING 'Get parent from addr:place: %', parent_place_id;{% endif %}
+      RETURN parent_place_id;
     END IF;
   END IF;
 
index a6f7cf95fcb6c7e0346ee0cdc8ca54fe2a77be81..81299544573c0c4c1ffea2850b8be54477f6a2bb 100644 (file)
@@ -1,9 +1,6 @@
 -- Indices used only during search and update.
 -- These indices are created only after the indexing process is done.
 
-CREATE INDEX {{sql.if_index_not_exists}} idx_word_word_id
-  ON word USING BTREE (word_id) {{db.tablespace.search_index}};
-
 CREATE INDEX {{sql.if_index_not_exists}} idx_place_addressline_address_place_id
   ON place_addressline USING BTREE (address_place_id) {{db.tablespace.search_index}};
 
index 609472ecb78f9bdbaecf2414fc8ec00fa47d368b..9732c26cb3b82623e2fe5dd799d94fe24492b226 100644 (file)
@@ -43,22 +43,6 @@ CREATE TABLE nominatim_properties (
 );
 GRANT SELECT ON TABLE nominatim_properties TO "{{config.DATABASE_WEBUSER}}";
 
-drop table IF EXISTS word;
-CREATE TABLE word (
-  word_id INTEGER,
-  word_token text,
-  word text,
-  class text,
-  type text,
-  country_code varchar(2),
-  search_name_count INTEGER,
-  operator TEXT
-  ) {{db.tablespace.search_data}};
-CREATE INDEX idx_word_word_token on word USING BTREE (word_token) {{db.tablespace.search_index}};
-GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}" ;
-DROP SEQUENCE IF EXISTS seq_word;
-CREATE SEQUENCE seq_word start 1;
-
 drop table IF EXISTS location_area CASCADE;
 CREATE TABLE location_area (
   place_id BIGINT,
@@ -109,6 +93,7 @@ CREATE TABLE location_property_osmline (
     linegeo GEOMETRY,
     interpolationtype TEXT,
     address HSTORE,
+    token_info JSONB, -- custom column for tokenizer use only
     postcode TEXT,
     country_code VARCHAR(2)
   ){{db.tablespace.search_data}};
@@ -158,6 +143,7 @@ CREATE TABLE placex (
   indexed_status SMALLINT,
   LIKE place INCLUDING CONSTRAINTS,
   wikipedia TEXT, -- calculated wikipedia article name (language:title)
+  token_info JSONB, -- custom column for tokenizer use only
   country_code varchar(2),
   housenumber TEXT,
   postcode TEXT,
@@ -168,6 +154,10 @@ CREATE INDEX idx_placex_osmid ON placex USING BTREE (osm_type, osm_id) {{db.tabl
 CREATE INDEX idx_placex_linked_place_id ON placex USING BTREE (linked_place_id) {{db.tablespace.address_index}} WHERE linked_place_id IS NOT NULL;
 CREATE INDEX idx_placex_rank_search ON placex USING BTREE (rank_search, geometry_sector) {{db.tablespace.address_index}};
 CREATE INDEX idx_placex_geometry ON placex USING GIST (geometry) {{db.tablespace.search_index}};
+CREATE INDEX idx_placex_geometry_buildings ON placex
+  USING GIST (geometry) {{db.tablespace.search_index}}
+  WHERE address is not null and rank_search = 30
+        and ST_GeometryType(geometry) in ('ST_Polygon','ST_MultiPolygon');
 CREATE INDEX idx_placex_geometry_placenode ON placex
   USING GIST (geometry) {{db.tablespace.search_index}}
   WHERE osm_type = 'N' and rank_search < 26
@@ -178,7 +168,6 @@ DROP SEQUENCE IF EXISTS seq_place;
 CREATE SEQUENCE seq_place start 1;
 GRANT SELECT on placex to "{{config.DATABASE_WEBUSER}}" ;
 GRANT SELECT on place_addressline to "{{config.DATABASE_WEBUSER}}" ;
-GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}" ;
 GRANT SELECT ON planet_osm_ways to "{{config.DATABASE_WEBUSER}}" ;
 GRANT SELECT ON planet_osm_rels to "{{config.DATABASE_WEBUSER}}" ;
 GRANT SELECT on location_area to "{{config.DATABASE_WEBUSER}}" ;
diff --git a/lib-sql/tokenizer/legacy_tokenizer.sql b/lib-sql/tokenizer/legacy_tokenizer.sql
new file mode 100644 (file)
index 0000000..fe82762
--- /dev/null
@@ -0,0 +1,399 @@
+-- Get tokens used for searching the given place.
+--
+-- These are the tokens that will be saved in the search_name table.
+CREATE OR REPLACE FUNCTION token_get_name_search_tokens(info JSONB)
+  RETURNS INTEGER[]
+AS $$
+  SELECT (info->>'names')::INTEGER[]
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
+-- Get tokens for matching the place name against others.
+--
+-- This should usually be restricted to full name tokens.
+CREATE OR REPLACE FUNCTION token_get_name_match_tokens(info JSONB)
+  RETURNS INTEGER[]
+AS $$
+  SELECT (info->>'names')::INTEGER[]
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
+-- Return the housenumber tokens applicable for the place.
+CREATE OR REPLACE FUNCTION token_get_housenumber_search_tokens(info JSONB)
+  RETURNS INTEGER[]
+AS $$
+  SELECT (info->>'hnr_tokens')::INTEGER[]
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
+-- Return the housenumber in the form that it can be matched during search.
+CREATE OR REPLACE FUNCTION token_normalized_housenumber(info JSONB)
+  RETURNS TEXT
+AS $$
+  SELECT info->>'hnr';
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
+CREATE OR REPLACE FUNCTION token_addr_street_match_tokens(info JSONB)
+  RETURNS INTEGER[]
+AS $$
+  SELECT (info->>'street')::INTEGER[]
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
+CREATE OR REPLACE FUNCTION token_addr_place_match_tokens(info JSONB)
+  RETURNS INTEGER[]
+AS $$
+  SELECT (info->>'place_match')::INTEGER[]
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
+CREATE OR REPLACE FUNCTION token_addr_place_search_tokens(info JSONB)
+  RETURNS INTEGER[]
+AS $$
+  SELECT (info->>'place_search')::INTEGER[]
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
+DROP TYPE IF EXISTS token_addresstoken CASCADE;
+CREATE TYPE token_addresstoken AS (
+  key TEXT,
+  match_tokens INT[],
+  search_tokens INT[]
+);
+
+CREATE OR REPLACE FUNCTION token_get_address_tokens(info JSONB)
+  RETURNS SETOF token_addresstoken
+AS $$
+  SELECT key, (value->>1)::int[] as match_tokens,
+         (value->>0)::int[] as search_tokens
+  FROM jsonb_each(info->'addr');
+$$ LANGUAGE SQL IMMUTABLE STRICT;
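
Each addr entry is expected to be a two-element JSON array of Postgres array
literals, search tokens first, match tokens second. A hypothetical fragment:

    SELECT * FROM token_get_address_tokens(
        '{"addr": {"city": ["{10,11}", "{12}"]}}'::jsonb);
    --  key  | match_tokens | search_tokens
    -- ------+--------------+--------------
    --  city | {12}         | {10,11}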
+
+
+CREATE OR REPLACE FUNCTION token_normalized_postcode(postcode TEXT)
+  RETURNS TEXT
+AS $$
+  SELECT CASE WHEN postcode SIMILAR TO '%(,|;)%' THEN NULL ELSE upper(trim(postcode)) END;
+$$ LANGUAGE SQL IMMUTABLE STRICT;
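
Two illustrative calls; postcode lists with separators are rejected outright:

    SELECT token_normalized_postcode(' e1 6an ');     -- => 'E1 6AN'
    SELECT token_normalized_postcode('12345;67890');  -- => NULL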
+
+
+-- Return token info that should be saved permanently in the database.
+CREATE OR REPLACE FUNCTION token_strip_info(info JSONB)
+  RETURNS JSONB
+AS $$
+  SELECT NULL::JSONB;
+$$ LANGUAGE SQL IMMUTABLE STRICT;
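
Taken together, the accessors above imply a token_info document of roughly the
following shape for the legacy tokenizer; the values are hypothetical and the
integer vectors travel as array literals inside JSON strings:

    -- {"names": "{1,2,3}",
    --  "hnr_tokens": "{55}", "hnr": "12;12a",
    --  "street": "{7}",
    --  "place_match": "{8}", "place_search": "{8,9}",
    --  "addr": {"city": ["{10,11}", "{12}"]}}
    SELECT token_get_name_search_tokens('{"names": "{1,2,3}"}'::jsonb);
    -- => {1,2,3}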
+
+--------------- private functions ----------------------------------------------
+
+-- Functions for term normalisation and access to the 'word' table.
+
+CREATE OR REPLACE FUNCTION transliteration(text) RETURNS text
+  AS '{{ modulepath }}/nominatim.so', 'transliteration'
+LANGUAGE c IMMUTABLE STRICT;
+
+
+CREATE OR REPLACE FUNCTION gettokenstring(text) RETURNS text
+  AS '{{ modulepath }}/nominatim.so', 'gettokenstring'
+LANGUAGE c IMMUTABLE STRICT;
+
+
+CREATE OR REPLACE FUNCTION make_standard_name(name TEXT) RETURNS TEXT
+  AS $$
+DECLARE
+  o TEXT;
+BEGIN
+  o := public.gettokenstring(public.transliteration(name));
+  RETURN trim(substr(o,1,length(o)));
+END;
+$$
+LANGUAGE plpgsql IMMUTABLE;
+
+-- returns NULL if the word is too common
+CREATE OR REPLACE FUNCTION getorcreate_word_id(lookup_word TEXT) 
+  RETURNS INTEGER
+  AS $$
+DECLARE
+  lookup_token TEXT;
+  return_word_id INTEGER;
+  count INTEGER;
+BEGIN
+  lookup_token := trim(lookup_word);
+  SELECT min(word_id), max(search_name_count) FROM word
+    WHERE word_token = lookup_token and class is null and type is null
+    INTO return_word_id, count;
+  IF return_word_id IS NULL THEN
+    return_word_id := nextval('seq_word');
+    INSERT INTO word VALUES (return_word_id, lookup_token, null, null, null, null, 0);
+  ELSE
+    IF count > {{ max_word_freq }} THEN
+      return_word_id := NULL;
+    END IF;
+  END IF;
+  RETURN return_word_id;
+END;
+$$
+LANGUAGE plpgsql;
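
A sketch of the lookup-or-create behaviour (the token value is hypothetical):

    SELECT getorcreate_word_id('berlin');
    -- Returns the existing word_id of the partial token 'berlin', inserts
    -- a new word row if none exists, and yields NULL once the token's
    -- search_name_count exceeds the configured max_word_freq.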
+
+
+-- Create housenumber tokens from an OSM addr:housenumber.
+-- The housenumber is split at comma and semicolon as necessary.
+-- The function returns the normalized form of the housenumber suitable
+-- for comparison.
+CREATE OR REPLACE FUNCTION create_housenumbers(housenumbers TEXT[],
+                                               OUT tokens TEXT,
+                                               OUT normtext TEXT)
+  AS $$
+BEGIN
+  SELECT array_to_string(array_agg(trans), ';'), array_agg(tid)::TEXT
+    INTO normtext, tokens
+    FROM (SELECT lookup_word as trans, getorcreate_housenumber_id(lookup_word) as tid
+          FROM (SELECT make_standard_name(h) as lookup_word
+                FROM unnest(housenumbers) h) x) y;
+END;
+$$ LANGUAGE plpgsql STABLE STRICT;
+
+
+CREATE OR REPLACE FUNCTION getorcreate_housenumber_id(lookup_word TEXT)
+  RETURNS INTEGER
+  AS $$
+DECLARE
+  lookup_token TEXT;
+  return_word_id INTEGER;
+BEGIN
+  lookup_token := ' ' || trim(lookup_word);
+  SELECT min(word_id) FROM word
+    WHERE word_token = lookup_token and class='place' and type='house'
+    INTO return_word_id;
+  IF return_word_id IS NULL THEN
+    return_word_id := nextval('seq_word');
+    INSERT INTO word VALUES (return_word_id, lookup_token, null,
+                             'place', 'house', null, 0);
+  END IF;
+  RETURN return_word_id;
+END;
+$$
+LANGUAGE plpgsql;
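
Note the leading blank in lookup_token: the legacy word table stores full tokens
with a leading space and partial tokens without, which keeps housenumber and
name tokens apart from the partial-word tokens created by getorcreate_word_id.
For illustration:

    SELECT word_token FROM word
     WHERE word_id = getorcreate_housenumber_id('12a');
    -- => ' 12a'  (note the leading blank)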
+
+
+CREATE OR REPLACE FUNCTION create_postcode_id(postcode TEXT)
+  RETURNS BOOLEAN
+  AS $$
+DECLARE
+  r RECORD;
+  lookup_token TEXT;
+  return_word_id INTEGER;
+BEGIN
+  lookup_token := ' ' || make_standard_name(postcode);
+  FOR r IN
+    SELECT word_id FROM word
+    WHERE word_token = lookup_token and word = postcode
+          and class='place' and type='postcode'
+  LOOP
+    RETURN false;
+  END LOOP;
+
+  INSERT INTO word VALUES (nextval('seq_word'), lookup_token, postcode,
+                           'place', 'postcode', null, 0);
+  RETURN true;
+END;
+$$
+LANGUAGE plpgsql;
+
+
+CREATE OR REPLACE FUNCTION getorcreate_name_id(lookup_word TEXT, src_word TEXT)
+  RETURNS INTEGER
+  AS $$
+DECLARE
+  lookup_token TEXT;
+  nospace_lookup_token TEXT;
+  return_word_id INTEGER;
+BEGIN
+  lookup_token := ' '||trim(lookup_word);
+  SELECT min(word_id) FROM word
+  WHERE word_token = lookup_token and class is null and type is null
+  INTO return_word_id;
+  IF return_word_id IS NULL THEN
+    return_word_id := nextval('seq_word');
+    INSERT INTO word VALUES (return_word_id, lookup_token, src_word,
+                             null, null, null, 0);
+  END IF;
+  RETURN return_word_id;
+END;
+$$
+LANGUAGE plpgsql;
+
+
+-- Normalize a string and lookup its word ids (partial words).
+CREATE OR REPLACE FUNCTION addr_ids_from_name(lookup_word TEXT)
+  RETURNS INTEGER[]
+  AS $$
+DECLARE
+  words TEXT[];
+  id INTEGER;
+  return_word_id INTEGER[];
+  word_ids INTEGER[];
+  j INTEGER;
+BEGIN
+  words := string_to_array(make_standard_name(lookup_word), ' ');
+  IF array_upper(words, 1) IS NOT NULL THEN
+    FOR j IN 1..array_upper(words, 1) LOOP
+      IF (words[j] != '') THEN
+        SELECT array_agg(word_id) INTO word_ids
+          FROM word
+         WHERE word_token = words[j] and class is null and type is null;
+
+        IF word_ids IS NULL THEN
+          id := nextval('seq_word');
+          INSERT INTO word VALUES (id, words[j], null, null, null, null, 0);
+          return_word_id := return_word_id || id;
+        ELSE
+          return_word_id := array_merge(return_word_id, word_ids);
+        END IF;
+      END IF;
+    END LOOP;
+  END IF;
+
+  RETURN return_word_id;
+END;
+$$
+LANGUAGE plpgsql;
+
+
+-- Normalize a string and look up its name ids (full words).
+CREATE OR REPLACE FUNCTION word_ids_from_name(lookup_word TEXT)
+  RETURNS INTEGER[]
+  AS $$
+DECLARE
+  lookup_token TEXT;
+  return_word_ids INTEGER[];
+BEGIN
+  lookup_token := ' '|| make_standard_name(lookup_word);
+  SELECT array_agg(word_id) FROM word
+    WHERE word_token = lookup_token and class is null and type is null
+    INTO return_word_ids;
+  RETURN return_word_ids;
+END;
+$$
+LANGUAGE plpgsql STABLE STRICT;
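
The same leading-blank convention separates the two name lookups, sketched here
on the assumption that make_standard_name reduces the input to 'main street':

    SELECT word_ids_from_name('Main Street');
    -- matches only the full token ' main street'
    SELECT addr_ids_from_name('Main Street');
    -- matches (and creates, if missing) the partials 'main' and 'street'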
+
+
+CREATE OR REPLACE FUNCTION make_keywords(src HSTORE)
+  RETURNS INTEGER[]
+  AS $$
+DECLARE
+  result INTEGER[];
+  s TEXT;
+  w INTEGER;
+  words TEXT[];
+  item RECORD;
+  j INTEGER;
+BEGIN
+  result := '{}'::INTEGER[];
+
+  FOR item IN SELECT (each(src)).* LOOP
+
+    s := make_standard_name(item.value);
+    w := getorcreate_name_id(s, item.value);
+
+    IF not(ARRAY[w] <@ result) THEN
+      result := result || w;
+    END IF;
+
+    w := getorcreate_word_id(s);
+
+    IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
+      result := result || w;
+    END IF;
+
+    words := string_to_array(s, ' ');
+    IF array_upper(words, 1) IS NOT NULL THEN
+      FOR j IN 1..array_upper(words, 1) LOOP
+        IF (words[j] != '') THEN
+          w = getorcreate_word_id(words[j]);
+          IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
+            result := result || w;
+          END IF;
+        END IF;
+      END LOOP;
+    END IF;
+
+    words := regexp_split_to_array(item.value, E'[,;()]');
+    IF array_upper(words, 1) != 1 THEN
+      FOR j IN 1..array_upper(words, 1) LOOP
+        s := make_standard_name(words[j]);
+        IF s != '' THEN
+          w := getorcreate_word_id(s);
+          IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
+            result := result || w;
+          END IF;
+        END IF;
+      END LOOP;
+    END IF;
+
+    s := regexp_replace(item.value, '市$', '');
+    IF s != item.value THEN
+      s := make_standard_name(s);
+      IF s != '' THEN
+        w := getorcreate_name_id(s, item.value);
+        IF NOT (ARRAY[w] <@ result) THEN
+          result := result || w;
+        END IF;
+      END IF;
+    END IF;
+
+  END LOOP;
+
+  RETURN result;
+END;
+$$
+LANGUAGE plpgsql;
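
For a single name value, make_keywords thus collects the full token, the partial
tokens, any variants split at ',;()' and a variant with a trailing '市' stripped
(for Japanese city names). Hypothetically:

    SELECT make_keywords('name => "Main Street"'::hstore);
    -- expected to contain the ids of ' main street' (full)
    -- plus 'main' and 'street' (partials)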
+
+
+CREATE OR REPLACE FUNCTION precompute_words(src TEXT)
+  RETURNS INTEGER
+  AS $$
+DECLARE
+  s TEXT;
+  w INTEGER;
+  words TEXT[];
+  i INTEGER;
+  j INTEGER;
+BEGIN
+  s := make_standard_name(src);
+  w := getorcreate_name_id(s, src);
+
+  w := getorcreate_word_id(s);
+
+  words := string_to_array(s, ' ');
+  IF array_upper(words, 1) IS NOT NULL THEN
+    FOR j IN 1..array_upper(words, 1) LOOP
+      IF (words[j] != '') THEN
+        w := getorcreate_word_id(words[j]);
+      END IF;
+    END LOOP;
+  END IF;
+
+  words := regexp_split_to_array(src, E'[,;()]');
+  IF array_upper(words, 1) != 1 THEN
+    FOR j IN 1..array_upper(words, 1) LOOP
+      s := make_standard_name(words[j]);
+      IF s != '' THEN
+        w := getorcreate_word_id(s);
+      END IF;
+    END LOOP;
+  END IF;
+
+  s := regexp_replace(src, '市$', '');
+  IF s != src THEN
+    s := make_standard_name(s);
+    IF s != '' THEN
+      w := getorcreate_name_id(s, src);
+    END IF;
+  END IF;
+
+  RETURN 1;
+END;
+$$
+LANGUAGE plpgsql;
diff --git a/lib-sql/tokenizer/legacy_tokenizer_indices.sql b/lib-sql/tokenizer/legacy_tokenizer_indices.sql
new file mode 100644 (file)
index 0000000..44a2909
--- /dev/null
@@ -0,0 +1,2 @@
+CREATE INDEX {{sql.if_index_not_exists}} idx_word_word_id
+  ON word USING BTREE (word_id) {{db.tablespace.search_index}};
diff --git a/lib-sql/tokenizer/legacy_tokenizer_tables.sql b/lib-sql/tokenizer/legacy_tokenizer_tables.sql
new file mode 100644 (file)
index 0000000..3410b76
--- /dev/null
@@ -0,0 +1,19 @@
+DROP TABLE IF EXISTS word;
+CREATE TABLE word (
+  word_id INTEGER,
+  word_token text NOT NULL,
+  word text,
+  class text,
+  type text,
+  country_code varchar(2),
+  search_name_count INTEGER,
+  operator TEXT
+) {{db.tablespace.search_data}};
+
+CREATE INDEX idx_word_word_token ON word
+    USING BTREE (word_token) {{db.tablespace.search_index}};
+GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
+
+DROP SEQUENCE IF EXISTS seq_word;
+CREATE SEQUENCE seq_word start 1;
+GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
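
Hypothetical rows illustrating how the functions in legacy_tokenizer.sql
populate the recreated word table (quotes added to show the leading blank):

    SELECT word_id, word_token, word, class, type FROM word;
    --  word_id | word_token     | word        | class | type
    -- ---------+----------------+-------------+-------+----------
    --        1 | ' main street' | Main Street |       |            (full name)
    --        2 | 'main'         |             |       |            (partial)
    --        3 | ' 12345'       | 12345       | place | postcode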
index 8fd4f6011251f480a3d37ff2572d979c7b971cb5..ea95e4565270a72dad8b9cdd2c609f51ee27a9d3 100644 (file)
@@ -32,8 +32,11 @@ class UpdateIndex:
     @staticmethod
     def run(args):
         from ..indexer.indexer import Indexer
+        from ..tokenizer import factory as tokenizer_factory
 
-        indexer = Indexer(args.config.get_libpq_dsn(),
+        tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
+
+        indexer = Indexer(args.config.get_libpq_dsn(), tokenizer,
                           args.threads or psutil.cpu_count() or 1)
 
         if not args.no_boundaries:
index ddc00d497bbe786a944664fc64ca0f45e69a70f3..e6e749121d24100d1393d6e59cc5d7c71bfa8218 100644 (file)
@@ -46,6 +46,7 @@ class UpdateRefresh:
     @staticmethod
     def run(args):
         from ..tools import refresh
+        from ..tokenizer import factory as tokenizer_factory
 
         if args.postcodes:
             LOG.warning("Update postcodes centroid")
@@ -66,6 +67,8 @@ class UpdateRefresh:
             with connect(args.config.get_libpq_dsn()) as conn:
                 refresh.create_functions(conn, args.config,
                                          args.diffs, args.enable_debug_statements)
+                tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
+                tokenizer.update_sql_functions(args.config)
 
         if args.wiki_data:
             data_path = Path(args.config.WIKIPEDIA_DATA_PATH
index c75322d9bf4c5591b9d252bdd288496424dccf54..69939430188838bb69f89bfe9ae041eb3cac1989 100644 (file)
@@ -83,6 +83,7 @@ class UpdateReplication:
     def _update(args):
         from ..tools import replication
         from ..indexer.indexer import Indexer
+        from ..tokenizer import factory as tokenizer_factory
 
         params = args.osm2pgsql_options(default_cache=2000, default_threads=1)
         params.update(base_url=args.config.REPLICATION_URL,
@@ -106,6 +107,8 @@ class UpdateReplication:
                 raise UsageError("Bad argument '--no-index'.")
             recheck_interval = args.config.get_int('REPLICATION_RECHECK_INTERVAL')
 
+        tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
+
         while True:
             with connect(args.config.get_libpq_dsn()) as conn:
                 start = dt.datetime.now(dt.timezone.utc)
@@ -116,7 +119,7 @@ class UpdateReplication:
 
             if state is not replication.UpdateState.NO_CHANGES and args.do_index:
                 index_start = dt.datetime.now(dt.timezone.utc)
-                indexer = Indexer(args.config.get_libpq_dsn(),
+                indexer = Indexer(args.config.get_libpq_dsn(), tokenizer,
                                   args.threads or 1)
                 indexer.index_boundaries(0, 30)
                 indexer.index_by_rank(0, 30)
index 2014ff9e2faa32453483cbbef1993def03832bf0..eb0178a9f560f563a867488b5e608e67a0c024ad 100644 (file)
@@ -56,6 +56,7 @@ class SetupAll:
         from ..tools import refresh
         from ..indexer.indexer import Indexer
         from ..tools import postcodes
+        from ..tokenizer import factory as tokenizer_factory
 
         if args.osm_file and not Path(args.osm_file).is_file():
             LOG.fatal("OSM file '%s' does not exist.", args.osm_file)
@@ -67,12 +68,6 @@ class SetupAll:
                                                     args.no_partitions,
                                                     rouser=args.config.DATABASE_WEBUSER)
 
-            LOG.warning('Installing database module')
-            with connect(args.config.get_libpq_dsn()) as conn:
-                database_import.install_module(args.module_dir, args.project_dir,
-                                               args.config.DATABASE_MODULE_PATH,
-                                               conn=conn)
-
             LOG.warning('Importing OSM data file')
             database_import.import_osm_data(Path(args.osm_file),
                                             args.osm2pgsql_options(0, 1),
@@ -105,22 +100,31 @@ class SetupAll:
         if args.continue_at is None or args.continue_at == 'load-data':
             LOG.warning('Initialise tables')
             with connect(args.config.get_libpq_dsn()) as conn:
-                database_import.truncate_data_tables(conn, args.config.MAX_WORD_FREQUENCY)
+                database_import.truncate_data_tables(conn)
 
             LOG.warning('Load data into placex table')
             database_import.load_data(args.config.get_libpq_dsn(),
-                                      args.data_dir,
                                       args.threads or psutil.cpu_count() or 1)
 
+        LOG.warning("Setting up tokenizer")
+        if args.continue_at is None or args.continue_at == 'load-data':
+            # (re)initialise the tokenizer data
+            tokenizer = tokenizer_factory.create_tokenizer(args.config)
+        else:
+            # just load the tokenizer
+            tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
+
+        if args.continue_at is None or args.continue_at == 'load-data':
             LOG.warning('Calculate postcodes')
-            postcodes.import_postcodes(args.config.get_libpq_dsn(), args.project_dir)
+            postcodes.import_postcodes(args.config.get_libpq_dsn(), args.project_dir,
+                                       tokenizer)
 
         if args.continue_at is None or args.continue_at in ('load-data', 'indexing'):
             if args.continue_at is not None and args.continue_at != 'load-data':
                 with connect(args.config.get_libpq_dsn()) as conn:
                     SetupAll._create_pending_index(conn, args.config.TABLESPACE_ADDRESS_INDEX)
             LOG.warning('Indexing places')
-            indexer = Indexer(args.config.get_libpq_dsn(),
+            indexer = Indexer(args.config.get_libpq_dsn(), tokenizer,
                               args.threads or psutil.cpu_count() or 1)
             indexer.index_full(analyse=not args.index_noanalyse)
 
@@ -129,7 +133,9 @@ class SetupAll:
             database_import.create_search_indices(conn, args.config,
                                                   drop=args.no_updates)
             LOG.warning('Create search index for default country names.')
-            database_import.create_country_names(conn, args.config)
+            database_import.create_country_names(conn, tokenizer,
+                                                 args.config.LANGUAGES)
+        tokenizer.finalize_import(args.config)
 
         webdir = args.project_dir / 'website'
         LOG.warning('Setup website at %s', webdir)
index 1eb25bdc156ea8dc9bc0ce53441a1d598620de0c..002960feb2049b9d850ea54aeebc7b1644ba4dd1 100644 (file)
@@ -9,6 +9,8 @@ LOG = logging.getLogger()
 
 # Do not repeat documentation of subcommand classes.
 # pylint: disable=C0111
+# Using non-top-level imports to avoid potentially unused imports.
+# pylint: disable=E0012,C0415
 
 class ImportSpecialPhrases:
     """\
@@ -22,10 +24,13 @@ class ImportSpecialPhrases:
 
     @staticmethod
     def run(args):
+        from ..tokenizer import factory as tokenizer_factory
+
         if args.import_from_wiki:
             LOG.warning('Special phrases importation starting')
+            tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
             with connect(args.config.get_libpq_dsn()) as db_connection:
                 SpecialPhrasesImporter(
                     args.config, args.phplib_dir, db_connection
-                ).import_from_wiki()
+                ).import_from_wiki(tokenizer)
         return 0
index 361fe0757365c40db30eb4b3e9246ed3564e39ad..a4f554965ab9144a89bb609296ee80ca82049ac3 100644 (file)
@@ -48,14 +48,14 @@ class DBConnection:
     """ A single non-blocking database connection.
     """
 
-    def __init__(self, dsn):
+    def __init__(self, dsn, cursor_factory=None):
         self.current_query = None
         self.current_params = None
         self.dsn = dsn
 
         self.conn = None
         self.cursor = None
-        self.connect()
+        self.connect(cursor_factory=cursor_factory)
 
     def close(self):
         """ Close all open connections. Does not wait for pending requests.
@@ -66,7 +66,7 @@ class DBConnection:
 
         self.conn = None
 
-    def connect(self):
+    def connect(self, cursor_factory=None):
         """ (Re)connect to the database. Creates an asynchronous connection
             with JIT and parallel processing disabled. If a connection was
             already open, it is closed and a new connection established.
@@ -79,7 +79,7 @@ class DBConnection:
         self.conn = psycopg2.connect(**{'dsn' : self.dsn, 'async' : True})
         self.wait()
 
-        self.cursor = self.conn.cursor()
+        self.cursor = self.conn.cursor(cursor_factory=cursor_factory)
         # Disable JIT and parallel workers as they are known to cause problems.
         # Update pg_settings instead of using SET because it does not yield
         # errors on older versions of Postgres where the settings are not
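The new `cursor_factory` argument is handed straight through to psycopg2's `connection.cursor()`. A minimal usage sketch, assuming a reachable database, of how the indexer obtains dict-like rows from such a connection:

```
import psycopg2.extras

from nominatim.db.async_connection import DBConnection

# With a DictCursor, rows can be addressed by column name, which the
# indexer needs when handing places to the tokenizer's analyzer.
conn = DBConnection('dbname=nominatim', cursor_factory=psycopg2.extras.DictCursor)
conn.perform('SELECT place_id, indexed_status FROM placex LIMIT 1')
conn.wait()
row = conn.cursor.fetchone()
print(row['place_id'], row['indexed_status'])
conn.close()
```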
index 9e0b291298d7739993c6447db4ca159ae517fe40..dafc5de434bb3bf69a69014d7d7e20a2059f9313 100644 (file)
@@ -89,8 +89,6 @@ class SQLPreprocessor:
         self.env.globals['db'] = db_info
         self.env.globals['sql'] = _setup_postgres_sql(conn)
         self.env.globals['postgres'] = _setup_postgresql_features(conn)
-        self.env.globals['modulepath'] = config.DATABASE_MODULE_PATH or \
-                                         str((config.project_dir / 'module').resolve())
 
 
     def run_sql_file(self, conn, name, **kwargs):
index faf91c456b1eb9bd0d6a00d8e1f396f8fd846ab7..b7673abaddc8090a896351c7ad230f372742a739 100644 (file)
@@ -3,6 +3,9 @@ Main work horse for indexing (computing addresses) the database.
 """
 import logging
 import select
+import time
+
+import psycopg2.extras
 
 from nominatim.indexer.progress import ProgressLogger
 from nominatim.indexer import runners
@@ -11,6 +14,73 @@ from nominatim.db.connection import connect
 
 LOG = logging.getLogger()
 
+
+class PlaceFetcher:
+    """ Asynchronous connection that fetches place details for processing.
+    """
+    def __init__(self, dsn, setup_conn):
+        self.wait_time = 0
+        self.current_ids = None
+        self.conn = DBConnection(dsn, cursor_factory=psycopg2.extras.DictCursor)
+
+        with setup_conn.cursor() as cur:
+            # The hstore OIDs must be fetched manually because register_hstore
+            # cannot look them up on the asynchronous connection below.
+            hstore_oid = cur.scalar("SELECT 'hstore'::regtype::oid")
+            hstore_array_oid = cur.scalar("SELECT 'hstore[]'::regtype::oid")
+
+        psycopg2.extras.register_hstore(self.conn.conn, oid=hstore_oid,
+                                        array_oid=hstore_array_oid)
+
+    def close(self):
+        """ Close the underlying asynchronous connection.
+        """
+        if self.conn:
+            self.conn.close()
+            self.conn = None
+
+
+    def fetch_next_batch(self, cur, runner):
+        """ Send a request for the next batch of places.
+            If details for the places are required, they will be fetched
+            asynchronously.
+
+            Returns true if there is still data available.
+        """
+        ids = cur.fetchmany(100)
+
+        if not ids:
+            self.current_ids = None
+            return False
+
+        if hasattr(runner, 'get_place_details'):
+            runner.get_place_details(self.conn, ids)
+            self.current_ids = []
+        else:
+            self.current_ids = ids
+
+        return True
+
+    def get_batch(self):
+        """ Get the next batch of data, previously requested with
+            `fetch_next_batch`.
+        """
+        if self.current_ids is not None and not self.current_ids:
+            tstart = time.time()
+            self.conn.wait()
+            self.wait_time += time.time() - tstart
+            self.current_ids = self.conn.cursor.fetchall()
+
+        return self.current_ids
+
+    def __enter__(self):
+        return self
+
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.conn.wait()
+        self.close()
+
 class WorkerPool:
     """ A pool of asynchronous database connections.
 
@@ -21,6 +91,7 @@ class WorkerPool:
     def __init__(self, dsn, pool_size):
         self.threads = [DBConnection(dsn) for _ in range(pool_size)]
         self.free_workers = self._yield_free_worker()
+        self.wait_time = 0
 
 
     def finish_all(self):
@@ -64,7 +135,9 @@ class WorkerPool:
                 ready = self.threads
                 command_stat = 0
             else:
+                tstart = time.time()
                 _, ready, _ = select.select([], self.threads, [])
+                self.wait_time += time.time() - tstart
 
 
     def __enter__(self):
@@ -72,6 +145,7 @@ class WorkerPool:
 
 
     def __exit__(self, exc_type, exc_value, traceback):
+        self.finish_all()
         self.close()
 
 
@@ -79,8 +153,9 @@ class Indexer:
     """ Main indexing routine.
     """
 
-    def __init__(self, dsn, num_threads):
+    def __init__(self, dsn, tokenizer, num_threads):
         self.dsn = dsn
+        self.tokenizer = tokenizer
         self.num_threads = num_threads
 
 
@@ -123,8 +198,9 @@ class Indexer:
         LOG.warning("Starting indexing boundaries using %s threads",
                     self.num_threads)
 
-        for rank in range(max(minrank, 4), min(maxrank, 26)):
-            self._index(runners.BoundaryRunner(rank))
+        with self.tokenizer.name_analyzer() as analyzer:
+            for rank in range(max(minrank, 4), min(maxrank, 26)):
+                self._index(runners.BoundaryRunner(rank, analyzer))
 
     def index_by_rank(self, minrank, maxrank):
         """ Index all entries of placex in the given rank range (inclusive)
@@ -137,15 +213,16 @@ class Indexer:
         LOG.warning("Starting indexing rank (%i to %i) using %i threads",
                     minrank, maxrank, self.num_threads)
 
-        for rank in range(max(1, minrank), maxrank):
-            self._index(runners.RankRunner(rank))
+        with self.tokenizer.name_analyzer() as analyzer:
+            for rank in range(max(1, minrank), maxrank):
+                self._index(runners.RankRunner(rank, analyzer))
 
-        if maxrank == 30:
-            self._index(runners.RankRunner(0))
-            self._index(runners.InterpolationRunner(), 20)
-            self._index(runners.RankRunner(30), 20)
-        else:
-            self._index(runners.RankRunner(maxrank))
+            if maxrank == 30:
+                self._index(runners.RankRunner(0, analyzer))
+                self._index(runners.InterpolationRunner(analyzer), 20)
+                self._index(runners.RankRunner(30, analyzer), 20)
+            else:
+                self._index(runners.RankRunner(maxrank, analyzer))
 
 
     def index_postcodes(self):
@@ -173,6 +250,7 @@ class Indexer:
         LOG.warning("Starting %s (using batch size %s)", runner.name(), batch)
 
         with connect(self.dsn) as conn:
+            psycopg2.extras.register_hstore(conn)
             with conn.cursor() as cur:
                 total_tuples = cur.scalar(runner.sql_count_objects())
                 LOG.debug("Total number of rows: %i", total_tuples)
@@ -185,19 +263,24 @@ class Indexer:
                 with conn.cursor(name='places') as cur:
                     cur.execute(runner.sql_get_objects())
 
-                    with WorkerPool(self.dsn, self.num_threads) as pool:
-                        while True:
-                            places = [p[0] for p in cur.fetchmany(batch)]
-                            if not places:
-                                break
+                    with PlaceFetcher(self.dsn, conn) as fetcher:
+                        with WorkerPool(self.dsn, self.num_threads) as pool:
+                            has_more = fetcher.fetch_next_batch(cur, runner)
+                            while has_more:
+                                places = fetcher.get_batch()
 
-                            LOG.debug("Processing places: %s", str(places))
-                            worker = pool.next_free_worker()
+                                # asynchronously get the next batch
+                                has_more = fetcher.fetch_next_batch(cur, runner)
 
-                            worker.perform(runner.sql_index_place(places))
-                            progress.add(len(places))
+                                # and insert the current batch
+                                for idx in range(0, len(places), batch):
+                                    part = places[idx:idx+batch]
+                                    LOG.debug("Processing places: %s", str(part))
+                                    runner.index_places(pool.next_free_worker(), part)
+                                    progress.add(len(part))
 
-                        pool.finish_all()
+                            LOG.info("Wait time: fetcher: %.2fs,  pool: %.2fs",
+                                     fetcher.wait_time, pool.wait_time)
 
                 conn.commit()
 
index 3c853cd0a19130b8bda5ab82ed539193bf75fa40..aa607faae3f3d48988ebddd738a49c90ba4bb607 100644 (file)
@@ -2,14 +2,51 @@
 Mix-ins that provide the actual commands for the indexer for various indexing
 tasks.
 """
+import functools
+
+import psycopg2.extras
+
 # pylint: disable=C0111
 
-class RankRunner:
-    """ Returns SQL commands for indexing one rank within the placex table.
+class AbstractPlacexRunner:
+    """ Returns SQL commands for indexing of the placex table.
     """
+    SELECT_SQL = 'SELECT place_id FROM placex'
 
-    def __init__(self, rank):
+    def __init__(self, rank, analyzer):
         self.rank = rank
+        self.analyzer = analyzer
+
+
+    @staticmethod
+    @functools.lru_cache(maxsize=1)
+    def _index_sql(num_places):
+        return """ UPDATE placex
+                   SET indexed_status = 0, address = v.addr, token_info = v.ti
+                   FROM (VALUES {}) as v(id, addr, ti)
+                   WHERE place_id = v.id
+               """.format(','.join(["(%s, %s::hstore, %s::jsonb)"]  * num_places))
+
+
+    @staticmethod
+    def get_place_details(worker, ids):
+        worker.perform("""SELECT place_id, (placex_prepare_update(placex)).*
+                          FROM placex WHERE place_id IN %s""",
+                       (tuple((p[0] for p in ids)), ))
+
+
+    def index_places(self, worker, places):
+        values = []
+        for place in places:
+            values.extend((place[x] for x in ('place_id', 'address')))
+            values.append(psycopg2.extras.Json(self.analyzer.process_place(place)))
+
+        worker.perform(self._index_sql(len(places)), values)
+
+
+class RankRunner(AbstractPlacexRunner):
+    """ Returns SQL commands for indexing one rank within the placex table.
+    """
 
     def name(self):
         return "rank {}".format(self.rank)
@@ -20,24 +57,16 @@ class RankRunner:
                """.format(self.rank)
 
     def sql_get_objects(self):
-        return """SELECT place_id FROM placex
-                  WHERE indexed_status > 0 and rank_address = {}
-                  ORDER BY geometry_sector""".format(self.rank)
+        return """{} WHERE indexed_status > 0 and rank_address = {}
+                     ORDER BY geometry_sector
+               """.format(self.SELECT_SQL, self.rank)
 
-    @staticmethod
-    def sql_index_place(ids):
-        return "UPDATE placex SET indexed_status = 0 WHERE place_id IN ({})"\
-               .format(','.join((str(i) for i in ids)))
 
-
-class BoundaryRunner:
+class BoundaryRunner(AbstractPlacexRunner):
     """ Returns SQL commands for indexing the administrative boundaries
         of a certain rank.
     """
 
-    def __init__(self, rank):
-        self.rank = rank
-
     def name(self):
         return "boundaries rank {}".format(self.rank)
 
@@ -49,16 +78,10 @@ class BoundaryRunner:
                """.format(self.rank)
 
     def sql_get_objects(self):
-        return """SELECT place_id FROM placex
-                  WHERE indexed_status > 0 and rank_search = {}
-                        and class = 'boundary' and type = 'administrative'
-                  ORDER BY partition, admin_level
-               """.format(self.rank)
-
-    @staticmethod
-    def sql_index_place(ids):
-        return "UPDATE placex SET indexed_status = 0 WHERE place_id IN ({})"\
-               .format(','.join((str(i) for i in ids)))
+        return """{} WHERE indexed_status > 0 and rank_search = {}
+                           and class = 'boundary' and type = 'administrative'
+                     ORDER BY partition, admin_level
+               """.format(self.SELECT_SQL, self.rank)
 
 
 class InterpolationRunner:
@@ -66,6 +89,10 @@ class InterpolationRunner:
         location_property_osmline.
     """
 
+    def __init__(self, analyzer):
+        self.analyzer = analyzer
+
+
     @staticmethod
     def name():
         return "interpolation lines (location_property_osmline)"
@@ -77,15 +104,37 @@ class InterpolationRunner:
 
     @staticmethod
     def sql_get_objects():
-        return """SELECT place_id FROM location_property_osmline
+        return """SELECT place_id
+                  FROM location_property_osmline
                   WHERE indexed_status > 0
                   ORDER BY geometry_sector"""
 
+
+    @staticmethod
+    def get_place_details(worker, ids):
+        worker.perform("""SELECT place_id, get_interpolation_address(address, osm_id) as address
+                          FROM location_property_osmline WHERE place_id IN %s""",
+                       (tuple((p[0] for p in ids)), ))
+
+
     @staticmethod
-    def sql_index_place(ids):
-        return """UPDATE location_property_osmline
-                  SET indexed_status = 0 WHERE place_id IN ({})
-               """.format(','.join((str(i) for i in ids)))
+    @functools.lru_cache(maxsize=1)
+    def _index_sql(num_places):
+        return """ UPDATE location_property_osmline
+                   SET indexed_status = 0, address = v.addr, token_info = v.ti
+                   FROM (VALUES {}) as v(id, addr, ti)
+                   WHERE place_id = v.id
+               """.format(','.join(["(%s, %s::hstore, %s::jsonb)"]  * num_places))
+
+
+    def index_places(self, worker, places):
+        values = []
+        for place in places:
+            values.extend((place[x] for x in ('place_id', 'address')))
+            values.append(psycopg2.extras.Json(self.analyzer.process_place(place)))
+
+        worker.perform(self._index_sql(len(places)), values)
+
 
 
 class PostcodeRunner:
@@ -107,7 +156,7 @@ class PostcodeRunner:
                   ORDER BY country_code, postcode"""
 
     @staticmethod
-    def sql_index_place(ids):
-        return """UPDATE location_postcode SET indexed_status = 0
-                  WHERE place_id IN ({})
-               """.format(','.join((str(i) for i in ids)))
+    def index_places(worker, ids):
+        worker.perform(""" UPDATE location_postcode SET indexed_status = 0
+                           WHERE place_id IN ({})
+                       """.format(','.join((str(i[0]) for i in ids))))
diff --git a/nominatim/tokenizer/__init__.py b/nominatim/tokenizer/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/nominatim/tokenizer/factory.py b/nominatim/tokenizer/factory.py
new file mode 100644 (file)
index 0000000..e0c0629
--- /dev/null
@@ -0,0 +1,88 @@
+"""
+Functions for creating a tokenizer or initialising the right one for an
+existing database.
+
+A tokenizer is something that is bound to the lifetime of a database. It
+can be chosen and configured before the initial import but then needs to
+be used consistently when querying and updating the database.
+
+This module provides the functions to create and configure a new tokenizer
+as well as instantiating the appropriate tokenizer for updating an existing
+database.
+
+A tokenizer usually also includes PHP code for querying. The appropriate PHP
+normalizer module is installed when the tokenizer is created.
+"""
+import logging
+import importlib
+
+from ..errors import UsageError
+from ..db import properties
+from ..db.connection import connect
+
+LOG = logging.getLogger()
+
+def _import_tokenizer(name):
+    """ Load the tokenizer.py module from project directory.
+    """
+    try:
+        return importlib.import_module('nominatim.tokenizer.' + name + '_tokenizer')
+    except ModuleNotFoundError as exp:
+        LOG.fatal("No tokenizer named '%s' available. "
+                  "Check the setting of NOMINATIM_TOKENIZER.", name)
+        raise UsageError('Tokenizer not found') from exp
+
+
+def create_tokenizer(config, init_db=True, module_name=None):
+    """ Create a new tokenizer as defined by the given configuration.
+
+        The tokenizer data and code is copied into the 'tokenizer' directory
+        of the project directory and the tokenizer loaded from its new location.
+    """
+    if module_name is None:
+        module_name = config.TOKENIZER
+
+    # Create the directory for the tokenizer data
+    basedir = config.project_dir / 'tokenizer'
+    if not basedir.exists():
+        basedir.mkdir()
+    elif not basedir.is_dir():
+        LOG.fatal("Tokenizer directory '%s' cannot be created.", basedir)
+        raise UsageError("Tokenizer setup failed.")
+
+    # Import and initialize the tokenizer.
+    tokenizer_module = _import_tokenizer(module_name)
+
+    tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
+    tokenizer.init_new_db(config, init_db=init_db)
+
+    with connect(config.get_libpq_dsn()) as conn:
+        properties.set_property(conn, 'tokenizer', module_name)
+
+    return tokenizer
+
+
+def get_tokenizer_for_db(config):
+    """ Instantiate a tokenizer for an existing database.
+
+        The function looks up the appropriate tokenizer in the database
+        and initialises it.
+    """
+    basedir = config.project_dir / 'tokenizer'
+    if not basedir.is_dir():
+        LOG.fatal("Cannot find tokenizer data in '%s'.", basedir)
+        raise UsageError('Cannot initialize tokenizer.')
+
+    with connect(config.get_libpq_dsn()) as conn:
+        name = properties.get_property(conn, 'tokenizer')
+
+    if name is None:
+        LOG.fatal("Tokenizer was not set up properly. Database property missing.")
+        raise UsageError('Cannot initialize tokenizer.')
+
+    tokenizer_module = _import_tokenizer(name)
+
+    tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
+    tokenizer.init_from_project()
+
+    return tokenizer
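Putting the two entry points together, as used by the commands above (`config` stands for the usual Nominatim configuration object):

```
from nominatim.tokenizer import factory as tokenizer_factory

# During the initial import: pick the tokenizer named in
# NOMINATIM_TOKENIZER and record the choice as a database property.
tokenizer = tokenizer_factory.create_tokenizer(config)

# For every later operation on the same database: look the recorded
# choice up again rather than trusting the current configuration.
tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
```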
diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py
new file mode 100644 (file)
index 0000000..2f060b8
--- /dev/null
@@ -0,0 +1,540 @@
+"""
+Tokenizer implementing normalisation as used before Nominatim 4.
+"""
+from collections import OrderedDict
+import logging
+import re
+import shutil
+from textwrap import dedent
+
+from icu import Transliterator
+import psycopg2
+import psycopg2.extras
+
+from nominatim.db.connection import connect
+from nominatim.db import properties
+from nominatim.db import utils as db_utils
+from nominatim.db.sql_preprocessor import SQLPreprocessor
+from nominatim.errors import UsageError
+
+DBCFG_NORMALIZATION = "tokenizer_normalization"
+DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
+
+LOG = logging.getLogger()
+
+def create(dsn, data_dir):
+    """ Create a new instance of the tokenizer provided by this module.
+    """
+    return LegacyTokenizer(dsn, data_dir)
+
+
+def _install_module(config_module_path, src_dir, module_dir):
+    """ Copies the PostgreSQL normalisation module into the project
+        directory if necessary. For historical reasons the module is
+        saved in the '/module' subdirectory and not with the other tokenizer
+        data.
+
+        The function detects when the installation is run from the
+        build directory. It doesn't touch the module in that case.
+    """
+    # Custom module locations are simply used as is.
+    if config_module_path:
+        LOG.info("Using custom path for database module at '%s'", config_module_path)
+        return config_module_path
+
+    # Compatibility mode for builddir installations.
+    if module_dir.exists() and src_dir.samefile(module_dir):
+        LOG.info('Running from build directory. Leaving database module as is.')
+        return module_dir
+
+    # In any other case install the module in the project directory.
+    if not module_dir.exists():
+        module_dir.mkdir()
+
+    destfile = module_dir / 'nominatim.so'
+    shutil.copy(str(src_dir / 'nominatim.so'), str(destfile))
+    destfile.chmod(0o755)
+
+    LOG.info('Database module installed at %s', str(destfile))
+
+    return module_dir
+
+
+def _check_module(module_dir, conn):
+    """ Try to use the PostgreSQL module to confirm that it is correctly
+        installed and accessible from PostgreSQL.
+    """
+    with conn.cursor() as cur:
+        try:
+            cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
+                           RETURNS text AS '{}/nominatim.so', 'transliteration'
+                           LANGUAGE c IMMUTABLE STRICT;
+                           DROP FUNCTION nominatim_test_import_func(text)
+                        """.format(module_dir))
+        except psycopg2.DatabaseError as err:
+            LOG.fatal("Error accessing database module: %s", err)
+            raise UsageError("Database module cannot be accessed.") from err
+
+
+class LegacyTokenizer:
+    """ The legacy tokenizer uses a special PostgreSQL module to normalize
+        names and queries. The tokenizer thus implements normalization through
+        calls to the database.
+    """
+
+    def __init__(self, dsn, data_dir):
+        self.dsn = dsn
+        self.data_dir = data_dir
+        self.normalization = None
+
+
+    def init_new_db(self, config, init_db=True):
+        """ Set up a new tokenizer for the database.
+
+            This copies all necessary data into the project directory to make
+            sure the tokenizer remains stable even over updates.
+        """
+        module_dir = _install_module(config.DATABASE_MODULE_PATH,
+                                     config.lib_dir.module,
+                                     config.project_dir / 'module')
+
+        self.normalization = config.TERM_NORMALIZATION
+
+        self._install_php(config)
+
+        with connect(self.dsn) as conn:
+            _check_module(module_dir, conn)
+            self._save_config(conn, config)
+            conn.commit()
+
+        if init_db:
+            self.update_sql_functions(config)
+            self._init_db_tables(config)
+
+
+    def init_from_project(self):
+        """ Initialise the tokenizer from the project directory.
+        """
+        with connect(self.dsn) as conn:
+            self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)
+
+
+    def finalize_import(self, config):
+        """ Do any required postprocessing to make the tokenizer data ready
+            for use.
+        """
+        with connect(self.dsn) as conn:
+            sqlp = SQLPreprocessor(conn, config)
+            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
+
+
+    def update_sql_functions(self, config):
+        """ Reimport the SQL functions for this tokenizer.
+        """
+        with connect(self.dsn) as conn:
+            max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
+            modulepath = config.DATABASE_MODULE_PATH or \
+                         str((config.project_dir / 'module').resolve())
+            sqlp = SQLPreprocessor(conn, config)
+            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer.sql',
+                              max_word_freq=max_word_freq,
+                              modulepath=modulepath)
+
+
+    def check_database(self):
+        """ Check that the tokenizer is set up correctly.
+        """
+        hint = """\
+             The PostgreSQL extension nominatim.so was not correctly loaded.
+
+             Error: {error}
+
+             Hints:
+             * Check the output of the CMake/make installation step
+             * Does nominatim.so exist?
+             * Does nominatim.so exist on the database server?
+             * Can nominatim.so be accessed by the database user?
+             """
+        with connect(self.dsn) as conn:
+            with conn.cursor() as cur:
+                try:
+                    out = cur.scalar("SELECT make_standard_name('a')")
+                except psycopg2.Error as err:
+                    return hint.format(error=str(err))
+
+        if out != 'a':
+            return hint.format(error='Unexpected result for make_standard_name()')
+
+        return None
+
+
+    def migrate_database(self, config):
+        """ Initialise the project directory of an existing database for
+            use with this tokenizer.
+
+            This is a special migration function for updating existing databases
+            to new software versions.
+        """
+        self.normalization = config.TERM_NORMALIZATION
+        module_dir = _install_module(config.DATABASE_MODULE_PATH,
+                                     config.lib_dir.module,
+                                     config.project_dir / 'module')
+
+        with connect(self.dsn) as conn:
+            _check_module(module_dir, conn)
+            self._save_config(conn, config)
+
+
+    def name_analyzer(self):
+        """ Create a new analyzer for tokenizing names and queries
+            using this tokenizer. Analyzers are context managers and should
+            be used accordingly:
+
+            ```
+            with tokenizer.name_analyzer() as analyzer:
+                analyzer.tokenize()
+            ```
+
+            When used outside the with construct, the caller must make sure
+            to call close() before the analyzer is discarded.
+
+            Analyzers are not thread-safe. You need to instantiate one per thread.
+        """
+        normalizer = Transliterator.createFromRules("phrase normalizer",
+                                                    self.normalization)
+        return LegacyNameAnalyzer(self.dsn, normalizer)
+
+
+    def _install_php(self, config):
+        """ Install the php script for the tokenizer.
+        """
+        php_file = self.data_dir / "tokenizer.php"
+        php_file.write_text(dedent("""\
+            <?php
+            @define('CONST_Max_Word_Frequency', {0.MAX_WORD_FREQUENCY});
+            @define('CONST_Term_Normalization_Rules', "{0.TERM_NORMALIZATION}");
+            require_once('{0.lib_dir.php}/tokenizer/legacy_tokenizer.php');
+            """.format(config)))
+
+
+    def _init_db_tables(self, config):
+        """ Set up the word table and fill it with pre-computed word
+            frequencies.
+        """
+        with connect(self.dsn) as conn:
+            sqlp = SQLPreprocessor(conn, config)
+            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
+            conn.commit()
+
+        LOG.warning("Precomputing word tokens")
+        db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')
+
+
+    def _save_config(self, conn, config):
+        """ Save the configuration that needs to remain stable for the given
+            database as database properties.
+        """
+        properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
+        properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
+
+
+class LegacyNameAnalyzer:
+    """ The legacy analyzer uses the special Postgresql module for
+        splitting names.
+
+        Each instance opens a connection to the database to request the
+        normalization.
+    """
+
+    def __init__(self, dsn, normalizer):
+        self.conn = connect(dsn).connection
+        self.conn.autocommit = True
+        self.normalizer = normalizer
+        psycopg2.extras.register_hstore(self.conn)
+
+        self._cache = _TokenCache(self.conn)
+
+
+    def __enter__(self):
+        return self
+
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.close()
+
+
+    def close(self):
+        """ Free all resources used by the analyzer.
+        """
+        if self.conn:
+            self.conn.close()
+            self.conn = None
+
+
+    def normalize(self, phrase):
+        """ Normalize the given phrase, i.e. remove all properties that
+            are irrelevant for search.
+        """
+        return self.normalizer.transliterate(phrase)
+
+
+    def add_postcodes_from_db(self):
+        """ Add postcodes from the location_postcode table to the word table.
+        """
+        with self.conn.cursor() as cur:
+            cur.execute("""SELECT count(create_postcode_id(pc))
+                           FROM (SELECT distinct(postcode) as pc
+                                 FROM location_postcode) x""")
+
+
+    def update_special_phrases(self, phrases):
+        """ Replace the search index for special phrases with the new phrases.
+        """
+        norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
+                            for p in phrases))
+
+        with self.conn.cursor() as cur:
+            # Get the old phrases.
+            existing_phrases = set()
+            cur.execute("""SELECT word, class, type, operator FROM word
+                           WHERE class != 'place'
+                                 OR (type != 'house' AND type != 'postcode')""")
+            for label, cls, typ, oper in cur:
+                existing_phrases.add((label, cls, typ, oper or '-'))
+
+            to_add = norm_phrases - existing_phrases
+            to_delete = existing_phrases - norm_phrases
+
+            if to_add:
+                psycopg2.extras.execute_values(
+                    cur,
+                    """ INSERT INTO word (word_id, word_token, word, class, type,
+                                          search_name_count, operator)
+                        (SELECT nextval('seq_word'), make_standard_name(name), name,
+                                class, type, 0,
+                                CASE WHEN op in ('in', 'near') THEN op ELSE null END
+                           FROM (VALUES %s) as v(name, class, type, op))""",
+                    to_add)
+
+            if to_delete:
+                psycopg2.extras.execute_values(
+                    cur,
+                    """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
+                        WHERE word = name and class = in_class and type = in_type
+                              and ((op = '-' and operator is null) or op = operator)""",
+                    to_delete)
+
+        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
+                 len(norm_phrases), len(to_add), len(to_delete))
+
+
+    def add_country_names(self, country_code, names):
+        """ Add names for the given country to the search index.
+        """
+        with self.conn.cursor() as cur:
+            cur.execute(
+                """INSERT INTO word (word_id, word_token, country_code)
+                   (SELECT nextval('seq_word'), lookup_token, %s
+                      FROM (SELECT ' ' || make_standard_name(n) as lookup_token
+                            FROM unnest(%s)n) y
+                      WHERE NOT EXISTS(SELECT * FROM word
+                                       WHERE word_token = lookup_token and country_code = %s))
+                """, (country_code, names, country_code))
+
+
+    def process_place(self, place):
+        """ Determine tokenizer information about the given place.
+
+            Returns a JSON-serialisable structure that will be handed into
+            the database via the token_info field.
+        """
+        token_info = _TokenInfo(self._cache)
+
+        names = place.get('name')
+
+        if names:
+            token_info.add_names(self.conn, names)
+
+            country_feature = place.get('country_feature')
+            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
+                self.add_country_names(country_feature.lower(), list(names.values()))
+
+        address = place.get('address')
+
+        if address:
+            hnrs = []
+            addr_terms = []
+            for key, value in address.items():
+                if key == 'postcode':
+                    self._add_postcode(value)
+                elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
+                    hnrs.append(value)
+                elif key == 'street':
+                    token_info.add_street(self.conn, value)
+                elif key == 'place':
+                    token_info.add_place(self.conn, value)
+                elif not key.startswith('_') and \
+                     key not in ('country', 'full'):
+                    addr_terms.append((key, value))
+
+            if hnrs:
+                token_info.add_housenumbers(self.conn, hnrs)
+
+            if addr_terms:
+                token_info.add_address_terms(self.conn, addr_terms)
+
+        return token_info.data
+
+
+    def _add_postcode(self, postcode):
+        """ Make sure the normalized postcode is present in the word table.
+        """
+        def _create_postcode_from_db(pcode):
+            with self.conn.cursor() as cur:
+                cur.execute('SELECT create_postcode_id(%s)', (pcode, ))
+
+        if re.search(r'[:,;]', postcode) is None:
+            self._cache.postcodes.get(postcode.strip().upper(), _create_postcode_from_db)
+
+
+class _TokenInfo:
+    """ Collect token information to be sent back to the database.
+    """
+    def __init__(self, cache):
+        self.cache = cache
+        self.data = {}
+
+
+    def add_names(self, conn, names):
+        """ Add token information for the names of the place.
+        """
+        with conn.cursor() as cur:
+            # Create the token IDs for all names.
+            self.data['names'] = cur.scalar("SELECT make_keywords(%s)::text",
+                                            (names, ))
+
+
+    def add_housenumbers(self, conn, hnrs):
+        """ Extract housenumber information from the address.
+        """
+        if len(hnrs) == 1:
+            token = self.cache.get_housenumber(hnrs[0])
+            if token is not None:
+                self.data['hnr_tokens'] = token
+                self.data['hnr'] = hnrs[0]
+                return
+
+        # split numbers if necessary
+        simple_list = []
+        for hnr in hnrs:
+            simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
+
+        if len(simple_list) > 1:
+            simple_list = list(set(simple_list))
+
+        with conn.cursor() as cur:
+            cur.execute("SELECT (create_housenumbers(%s)).* ", (simple_list, ))
+            self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone()
+
+
+    def add_street(self, conn, street):
+        """ Add addr:street match terms.
+        """
+        def _get_street(name):
+            with conn.cursor() as cur:
+                return cur.scalar("SELECT word_ids_from_name(%s)::text", (name, ))
+
+        self.data['street'] = self.cache.streets.get(street, _get_street)
+
+
+    def add_place(self, conn, place):
+        """ Add addr:place search and match terms.
+        """
+        def _get_place(name):
+            with conn.cursor() as cur:
+                cur.execute("""SELECT (addr_ids_from_name(%s)
+                                       || getorcreate_name_id(make_standard_name(%s), ''))::text,
+                                      word_ids_from_name(%s)::text""",
+                            (name, name, name))
+                return cur.fetchone()
+
+        self.data['place_search'], self.data['place_match'] = \
+            self.cache.places.get(place, _get_place)
+
+
+    def add_address_terms(self, conn, terms):
+        """ Add additional address terms.
+        """
+        def _get_address_term(name):
+            with conn.cursor() as cur:
+                cur.execute("""SELECT addr_ids_from_name(%s)::text,
+                                      word_ids_from_name(%s)::text""",
+                            (name, name))
+                return cur.fetchone()
+
+        tokens = {}
+        for key, value in terms:
+            tokens[key] = self.cache.address_terms.get(value, _get_address_term)
+
+        self.data['addr'] = tokens
+
+
+class _LRU:
+    """ Least recently used cache that accepts a generator function to
+        produce the item when there is a cache miss.
+    """
+
+    def __init__(self, maxsize=128, init_data=None):
+        self.data = init_data or OrderedDict()
+        self.maxsize = maxsize
+        if init_data is not None and len(init_data) > maxsize:
+            self.maxsize = len(init_data)
+
+    def get(self, key, generator):
+        """ Get the item with the given key from the cache. If nothing
+            is found in the cache, generate the value through the
+            generator function and store it in the cache.
+        """
+        value = self.data.get(key)
+        if value is not None:
+            self.data.move_to_end(key)
+        else:
+            value = generator(key)
+            if len(self.data) >= self.maxsize:
+                self.data.popitem(last=False)
+            self.data[key] = value
+
+        return value
+
+
+class _TokenCache:
+    """ Cache for token information to avoid repeated database queries.
+
+        This cache is not thread-safe and needs to be instantiated per
+        analyzer.
+    """
+    def __init__(self, conn):
+        # various LRU caches
+        self.streets = _LRU(maxsize=256)
+        self.places = _LRU(maxsize=128)
+        self.address_terms = _LRU(maxsize=1024)
+
+        # Look up housenumbers up to 100 and cache them
+        with conn.cursor() as cur:
+            cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
+                           FROM generate_series(1, 100) as i""")
+            self._cached_housenumbers = {str(r[0]) : r[1] for r in cur}
+
+        # Get postcodes that are already saved
+        postcodes = OrderedDict()
+        with conn.cursor() as cur:
+            cur.execute("""SELECT word FROM word
+                           WHERE class = 'place' and type = 'postcode'""")
+            for row in cur:
+                postcodes[row[0]] = None
+        self.postcodes = _LRU(maxsize=32, init_data=postcodes)
+
+    def get_housenumber(self, number):
+        """ Get a housenumber token from the cache.
+        """
+        return self._cached_housenumbers.get(number)
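The `_LRU.get()` call combines lookup and creation: on a miss the generator is called with the key and the result is cached, evicting the least recently used entry once `maxsize` is reached. A minimal usage sketch:

```
cache = _LRU(maxsize=2)

def _expensive_lookup(key):
    print('computing', key)   # only printed on a cache miss
    return key.upper()

cache.get('a', _expensive_lookup)   # miss: computes and caches 'A'
cache.get('a', _expensive_lookup)   # hit: returned from the cache
cache.get('b', _expensive_lookup)   # miss
cache.get('c', _expensive_lookup)   # miss: evicts 'a' as least recently used
```

Note that `get()` never treats a stored None as a hit, so the postcode cache, which is seeded with None values, will invoke its generator on every lookup of those keys.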
index 2d57c5ee323ee8e1f620a26932b2e51be6530384..d4f793b46334c27f304fbfe898268a4f23f564a9 100644 (file)
@@ -4,10 +4,9 @@ Collection of functions that check if the database is complete and functional.
 from enum import Enum
 from textwrap import dedent
 
-import psycopg2
-
 from nominatim.db.connection import connect
 from nominatim.errors import UsageError
+from nominatim.tokenizer import factory as tokenizer_factory
 
 CHECKLIST = []
 
@@ -78,8 +77,7 @@ def check_database(config):
 
 
 def _get_indexes(conn):
-    indexes = ['idx_word_word_id',
-               'idx_place_addressline_address_place_id',
+    indexes = ['idx_place_addressline_address_place_id',
                'idx_placex_rank_search',
                'idx_placex_rank_address',
                'idx_placex_parent_place_id',
@@ -149,7 +147,7 @@ def check_placex_table(conn, config):
 
 
 @_check(hint="""placex table has no data. Did the import finish sucessfully?""")
-def check_placex_size(conn, config): # pylint: disable=W0613
+def check_placex_size(conn, _):
     """ Checking for placex content
     """
     with conn.cursor() as cur:
@@ -158,38 +156,30 @@ def check_placex_size(conn, config): # pylint: disable=W0613
     return CheckState.OK if cnt > 0 else CheckState.FATAL
 
 
-@_check(hint="""\
-             The Postgresql extension nominatim.so was not correctly loaded.
-
-             Error: {error}
-
-             Hints:
-             * Check the output of the CMmake/make installation step
-             * Does nominatim.so exist?
-             * Does nominatim.so exist on the database server?
-             * Can nominatim.so be accessed by the database user?
-             """)
-def check_module(conn, config): # pylint: disable=W0613
-    """ Checking that nominatim.so module is installed
+@_check(hint="""{msg}""")
+def check_tokenizer(_, config):
+    """ Checking that tokenizer works
     """
-    with conn.cursor() as cur:
-        try:
-            out = cur.scalar("SELECT make_standard_name('a')")
-        except psycopg2.ProgrammingError as err:
-            return CheckState.FAIL, dict(error=str(err))
+    try:
+        tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
+    except UsageError:
+        return CheckState.FAIL, dict(msg="""\
+            Cannot load tokenizer. Did the import finish successfully?""")
 
-        if out != 'a':
-            return CheckState.FAIL, dict(error='Unexpected result for make_standard_name()')
+    result = tokenizer.check_database()
 
+    if result is None:
         return CheckState.OK
 
+    return CheckState.FAIL, dict(msg=result)
+
 
 @_check(hint="""\
              The indexing didn't finish. {count} entries are not yet indexed.
 
              To index the remaining entries, run:   {index_cmd}
              """)
-def check_indexing(conn, config): # pylint: disable=W0613
+def check_indexing(conn, _):
     """ Checking indexing status
     """
     with conn.cursor() as cur:
@@ -198,7 +188,7 @@ def check_indexing(conn, config): # pylint: disable=W0613
     if cnt == 0:
         return CheckState.OK
 
-    if conn.index_exists('idx_word_word_id'):
+    if conn.index_exists('idx_placex_rank_search'):
         # Likely just an interrupted update.
         index_cmd = 'nominatim index'
     else:
@@ -214,7 +204,7 @@ def check_indexing(conn, config): # pylint: disable=W0613
 
              Rerun the index creation with:   nominatim import --continue db-postprocess
              """)
-def check_database_indexes(conn, config): # pylint: disable=W0613
+def check_database_indexes(conn, _):
     """ Checking that database indexes are complete
     """
     missing = []
@@ -236,7 +226,7 @@ def check_database_indexes(conn, config): # pylint: disable=W0613
              Invalid indexes:
                {indexes}
              """)
-def check_database_index_valid(conn, config): # pylint: disable=W0613
+def check_database_index_valid(conn, _):
     """ Checking that all database indexes are valid
     """
     with conn.cursor() as cur:
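All checks follow the same contract imposed by the `_check` decorator: return `CheckState.OK` on success, or a failing state (optionally with a dict of values for formatting the hint text). A hypothetical additional check under that contract, sketched for illustration:

```
@_check(hint="""placex is missing the token_info column added by the tokenizer.""")
def check_token_info_column(conn, _):
    """ Checking that placex has a token_info column
    """
    with conn.cursor() as cur:
        cur.execute("""SELECT count(*) FROM information_schema.columns
                       WHERE table_name = 'placex'
                             AND column_name = 'token_info'""")
        if cur.fetchone()[0] == 0:
            return CheckState.FATAL

    return CheckState.OK
```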
index 3618ed28e14416d412f4033d6130bc4593d3aa38..664d3c6b39ed2fafb57e45960c052ccd76401505 100644 (file)
@@ -5,11 +5,10 @@ import logging
 import os
 import selectors
 import subprocess
-import shutil
 from pathlib import Path
 
 import psutil
-import psycopg2
+import psycopg2.extras
 
 from nominatim.db.connection import connect, get_pg_env
 from nominatim.db import utils as db_utils
@@ -89,49 +88,6 @@ def setup_extensions(conn):
         raise UsageError('PostGIS version is too old.')
 
 
-def install_module(src_dir, project_dir, module_dir, conn=None):
-    """ Copy the normalization module from src_dir into the project
-        directory under the '/module' directory. If 'module_dir' is set, then
-        use the module from there instead and check that it is accessible
-        for Postgresql.
-
-        The function detects when the installation is run from the
-        build directory. It doesn't touch the module in that case.
-
-        If 'conn' is given, then the function also tests if the module
-        can be access via the given database.
-    """
-    if not module_dir:
-        module_dir = project_dir / 'module'
-
-        if not module_dir.exists() or not src_dir.samefile(module_dir):
-
-            if not module_dir.exists():
-                module_dir.mkdir()
-
-            destfile = module_dir / 'nominatim.so'
-            shutil.copy(str(src_dir / 'nominatim.so'), str(destfile))
-            destfile.chmod(0o755)
-
-            LOG.info('Database module installed at %s', str(destfile))
-        else:
-            LOG.info('Running from build directory. Leaving database module as is.')
-    else:
-        LOG.info("Using custom path for database module at '%s'", module_dir)
-
-    if conn is not None:
-        with conn.cursor() as cur:
-            try:
-                cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
-                               RETURNS text AS '{}/nominatim.so', 'transliteration'
-                               LANGUAGE c IMMUTABLE STRICT;
-                               DROP FUNCTION nominatim_test_import_func(text)
-                            """.format(module_dir))
-            except psycopg2.DatabaseError as err:
-                LOG.fatal("Error accessing database module: %s", err)
-                raise UsageError("Database module cannot be accessed.") from err
-
-
 def import_base_data(dsn, sql_dir, ignore_partitions=False):
     """ Create and populate the tables with basic static data that provides
         the background for geocoding. Data is assumed to not yet exist.
@@ -205,11 +161,10 @@ def create_partition_tables(conn, config):
     sql.run_sql_file(conn, 'partition-tables.src.sql')
 
 
-def truncate_data_tables(conn, max_word_frequency=None):
+def truncate_data_tables(conn):
     """ Truncate all data tables to prepare for a fresh load.
     """
     with conn.cursor() as cur:
-        cur.execute('TRUNCATE word')
         cur.execute('TRUNCATE placex')
         cur.execute('TRUNCATE place_addressline')
         cur.execute('TRUNCATE location_area')
@@ -228,23 +183,13 @@ def truncate_data_tables(conn, max_word_frequency=None):
         for table in [r[0] for r in list(cur)]:
             cur.execute('TRUNCATE ' + table)
 
-        if max_word_frequency is not None:
-            # Used by getorcreate_word_id to ignore frequent partial words.
-            cur.execute("""CREATE OR REPLACE FUNCTION get_maxwordfreq()
-                           RETURNS integer AS $$
-                             SELECT {} as maxwordfreq;
-                           $$ LANGUAGE SQL IMMUTABLE
-                        """.format(max_word_frequency))
-        conn.commit()
+    conn.commit()
 
 _COPY_COLUMNS = 'osm_type, osm_id, class, type, name, admin_level, address, extratags, geometry'
 
-def load_data(dsn, data_dir, threads):
+def load_data(dsn, threads):
     """ Copy data into the word and placex table.
     """
-    # Pre-calculate the most important terms in the word list.
-    db_utils.execute_file(dsn, data_dir / 'words.sql')
-
     sel = selectors.DefaultSelector()
     # Then copy data from place to placex in <threads - 1> chunks.
     place_threads = max(1, threads - 1)
@@ -306,34 +251,37 @@ def create_search_indices(conn, config, drop=False):
 
     sql.run_sql_file(conn, 'indices.sql', drop=drop)
 
-def create_country_names(conn, config):
-    """ Create search index for default country names.
+def create_country_names(conn, tokenizer, languages=None):
+    """ Add default country names to search index. `languages` is a comma-
+        separated list of language codes as used in OSM. If `languages` is not
+        empty then only name translations for the given languages are added
+        to the index.
     """
+    if languages:
+        languages = languages.split(',')
+
+    def _include_key(key):
+        return key == 'name' or \
+               (key.startswith('name:') \
+                and (not languages or key[5:] in languages))
 
     with conn.cursor() as cur:
-        cur.execute("""SELECT getorcreate_country(make_standard_name('uk'), 'gb')""")
-        cur.execute("""SELECT getorcreate_country(make_standard_name('united states'), 'us')""")
-        cur.execute("""SELECT COUNT(*) FROM
-                       (SELECT getorcreate_country(make_standard_name(country_code),
-                       country_code) FROM country_name WHERE country_code is not null) AS x""")
-        cur.execute("""SELECT COUNT(*) FROM
-                       (SELECT getorcreate_country(make_standard_name(name->'name'), country_code) 
-                       FROM country_name WHERE name ? 'name') AS x""")
-        sql_statement = """SELECT COUNT(*) FROM (SELECT getorcreate_country(make_standard_name(v),
-                           country_code) FROM (SELECT country_code, skeys(name)
-                           AS k, svals(name) AS v FROM country_name) x WHERE k"""
-
-        languages = config.LANGUAGES
-
-        if languages:
-            sql_statement = "{} IN (".format(sql_statement)
-            delim = ''
-            for language in languages.split(','):
-                sql_statement = "{}{}'name:{}'".format(sql_statement, delim, language)
-                delim = ', '
-            sql_statement = '{})'.format(sql_statement)
-        else:
-            sql_statement = "{} LIKE 'name:%'".format(sql_statement)
-        sql_statement = "{}) v".format(sql_statement)
-        cur.execute(sql_statement)
+        psycopg2.extras.register_hstore(cur)
+        cur.execute("""SELECT country_code, name FROM country_name
+                       WHERE country_code is not null""")
+
+        with tokenizer.name_analyzer() as analyzer:
+            for code, name in cur:
+                names = [code]
+                if code == 'gb':
+                    names.append('UK')
+                if code == 'us':
+                    names.append('United States')
+
+                # country names (only in languages as provided)
+                if name:
+                    names.extend((v for k, v in name.items() if _include_key(k)))
+
+                analyzer.add_country_names(code, names)
+
     conn.commit()
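The `_include_key` closure is the whole language filter: the plain `name` tag always qualifies, and a `name:XX` tag only qualifies when XX appears in the configured list (or when no list is configured at all). For example, with `NOMINATIM_LANGUAGES=de,en`:

```
languages = 'de,en'.split(',')

def _include_key(key):
    return key == 'name' or \
           (key.startswith('name:') and (not languages or key[5:] in languages))

assert _include_key('name')           # the default name always passes
assert _include_key('name:de')        # listed language
assert not _include_key('name:fr')    # filtered out
assert not _include_key('int_name')   # not a name/name:* key
```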
index 4af5cb4879ef8068c60b2426c4f7d64293a97b6a..ddf25cd91ba30307fef2c4d2ee65cae2a38596ce 100644 (file)
@@ -6,7 +6,8 @@ import logging
 from nominatim.db import properties
 from nominatim.db.connection import connect
 from nominatim.version import NOMINATIM_VERSION
-from nominatim.tools import refresh, database_import
+from nominatim.tools import refresh
+from nominatim.tokenizer import factory as tokenizer_factory
 from nominatim.errors import UsageError
 
 LOG = logging.getLogger()
@@ -43,11 +44,14 @@ def migrate(config, paths):
                             '{0[0]}.{0[1]}.{0[2]}-{0[3]}'.format(version))
                 kwargs = dict(conn=conn, config=config, paths=paths)
                 func(**kwargs)
+                conn.commit()
                 has_run_migration = True
 
         if has_run_migration:
             LOG.warning('Updating SQL functions.')
             refresh.create_functions(conn, config)
+            tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
+            tokenizer.update_sql_functions(config)
 
         properties.set_property(conn, 'database_version',
                                 '{0[0]}.{0[1]}.{0[2]}-{0[3]}'.format(NOMINATIM_VERSION))
@@ -108,17 +112,6 @@ def import_status_timestamp_change(conn, **_):
                        TYPE timestamp with time zone;""")
 
 
-@_migration(3, 5, 0, 99)
-def install_database_module_in_project_directory(conn, config, paths, **_):
-    """ Install database module in project directory.
-
-        The database module needs to be present in the project directory
-        since those were introduced.
-    """
-    database_import.install_module(paths.module_dir, paths.project_dir,
-                                   config.DATABASE_MODULE_PATH, conn=conn)
-
-
 @_migration(3, 5, 0, 99)
 def add_nominatim_property_table(conn, config, **_):
     """ Add nominatim_property table.
@@ -137,6 +130,9 @@ def change_housenumber_transliteration(conn, **_):
 
         The database schema switched from saving raw housenumbers in
         placex.housenumber to saving transliterated ones.
+
+        Note: the function create_housenumber_id() has been dropped in later
+              versions.
     """
     with conn.cursor() as cur:
         cur.execute("""CREATE OR REPLACE FUNCTION create_housenumber_id(housenumber TEXT)
@@ -173,3 +169,25 @@ def switch_placenode_geometry_index(conn, **_):
                               and class = 'place' and type != 'postcode'
                               and linked_place_id is null""")
         cur.execute(""" DROP INDEX IF EXISTS idx_placex_adminname """)
+
+
+@_migration(3, 7, 0, 1)
+def install_legacy_tokenizer(conn, config, **_):
+    """ Setup legacy tokenizer.
+
+        If no other tokenizer has been configured yet, then create the
+        configuration for the backwards-compatible legacy tokenizer
+    """
+    if properties.get_property(conn, 'tokenizer') is None:
+        with conn.cursor() as cur:
+            for table in ('placex', 'location_property_osmline'):
+                has_column = cur.scalar("""SELECT count(*) FROM information_schema.columns
+                                           WHERE table_name = %s
+                                           and column_name = 'token_info'""",
+                                        (table, ))
+                if has_column == 0:
+                    cur.execute('ALTER TABLE {} ADD COLUMN token_info JSONB'.format(table))
+        tokenizer = tokenizer_factory.create_tokenizer(config, init_db=False,
+                                                       module_name='legacy')
+
+        tokenizer.migrate_database(config)
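Migrations are plain functions registered via the `@_migration(major, minor, patch, db_patch)` decorator; `migrate()` runs the registered functions in order, commits after each one, and finally bumps the `database_version` property. A hypothetical migration following that pattern (the exact registration semantics of `_migration` are not shown in this excerpt):

```
@_migration(3, 7, 0, 2)
def add_example_flag_column(conn, **_):
    """ Hypothetical migration: add a column if it does not exist yet.
    """
    with conn.cursor() as cur:
        cur.execute("""ALTER TABLE placex
                       ADD COLUMN IF NOT EXISTS example_flag SMALLINT""")
```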
index 0a568cbafc73c125c4f9d34b7141b42b05eb6b11..78bd8cb9490c5646754cef2ef2bbf2348d5e2a74 100644 (file)
@@ -6,7 +6,7 @@ of artificial postcode centroids.
 from nominatim.db.utils import execute_file
 from nominatim.db.connection import connect
 
-def import_postcodes(dsn, project_dir):
+def import_postcodes(dsn, project_dir, tokenizer):
     """ Set up the initial list of postcodes.
     """
 
@@ -41,10 +41,11 @@ def import_postcodes(dsn, project_dir):
                 INSERT INTO location_postcode
                  (place_id, indexed_status, country_code, postcode, geometry)
                 SELECT nextval('seq_place'), 1, country_code,
-                       upper(trim (both ' ' from address->'postcode')) as pc,
+                       token_normalized_postcode(address->'postcode') as pc,
                        ST_Centroid(ST_Collect(ST_Centroid(geometry)))
                   FROM placex
-                 WHERE address ? 'postcode' AND address->'postcode' NOT SIMILAR TO '%(,|;)%'
+                 WHERE address ? 'postcode'
+                       and token_normalized_postcode(address->'postcode') is not null
                        AND geometry IS NOT null
                  GROUP BY country_code, pc
             """)
@@ -52,9 +53,10 @@ def import_postcodes(dsn, project_dir):
             cur.execute("""
                 INSERT INTO location_postcode
                  (place_id, indexed_status, country_code, postcode, geometry)
-                SELECT nextval('seq_place'), 1, 'us', postcode,
+                SELECT nextval('seq_place'), 1, 'us',
+                       token_normalized_postcode(postcode),
                        ST_SetSRID(ST_Point(x,y),4326)
-                  FROM us_postcode WHERE postcode NOT IN
+                  FROM us_postcode WHERE token_normalized_postcode(postcode) NOT IN
                         (SELECT postcode FROM location_postcode
                           WHERE country_code = 'us')
             """)
@@ -62,8 +64,9 @@ def import_postcodes(dsn, project_dir):
             cur.execute("""
                 INSERT INTO location_postcode
                  (place_id, indexed_status, country_code, postcode, geometry)
-                SELECT nextval('seq_place'), 1, 'gb', postcode, geometry
-                  FROM gb_postcode WHERE postcode NOT IN
+                SELECT nextval('seq_place'), 1, 'gb',
+                       token_normalized_postcode(postcode), geometry
+                  FROM gb_postcode WHERE token_normalized_postcode(postcode) NOT IN
                            (SELECT postcode FROM location_postcode
                              WHERE country_code = 'gb')
             """)
@@ -72,9 +75,7 @@ def import_postcodes(dsn, project_dir):
                     DELETE FROM word WHERE class='place' and type='postcode'
                     and word NOT IN (SELECT postcode FROM location_postcode)
             """)
-
-            cur.execute("""
-                SELECT count(getorcreate_postcode_id(v)) FROM
-                (SELECT distinct(postcode) as v FROM location_postcode) p
-            """)
         conn.commit()
+
+        with tokenizer.name_analyzer() as analyzer:
+            analyzer.add_postcodes_from_db()
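Validation of raw postcodes moves from the inline regular expression into the SQL function `token_normalized_postcode()`, which is supplied by the tokenizer's SQL (not part of this excerpt) and returns NULL for values it rejects. Judging from the code it replaces and from `_add_postcode()` in the legacy tokenizer, its behaviour is roughly the following Python sketch (an assumption, not the actual definition):

```
import re

def token_normalized_postcode(postcode):
    # Assumed behaviour: reject multi-value postcodes, otherwise
    # trim surrounding blanks and upper-case the rest.
    if postcode is None or re.search(r'[:,;]', postcode):
        return None
    return postcode.strip().upper()
```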
index 8fc0c69dd16dca16ec680daebf6544d25f50d847..6720465fd9220387781b8df939742a1aee482a0e 100644 (file)
@@ -104,13 +104,11 @@ PHP_CONST_DEFS = (
     ('Default_Language', 'DEFAULT_LANGUAGE', str),
     ('Log_DB', 'LOG_DB', bool),
     ('Log_File', 'LOG_FILE', str),
-    ('Max_Word_Frequency', 'MAX_WORD_FREQUENCY', int),
     ('NoAccessControl', 'CORS_NOACCESSCONTROL', bool),
     ('Places_Max_ID_count', 'LOOKUP_MAX_COUNT', int),
     ('PolygonOutput_MaximumTypes', 'POLYGON_OUTPUT_MAX_TYPES', int),
     ('Search_BatchMode', 'SEARCH_BATCH_MODE', bool),
     ('Search_NameOnlySearchFrequencyThreshold', 'SEARCH_NAME_ONLY_THRESHOLD', str),
-    ('Term_Normalization_Rules', 'TERM_NORMALIZATION', str),
     ('Use_US_Tiger_Data', 'USE_US_TIGER_DATA', bool),
     ('MapIcon_URL', 'MAPICON_URL', str),
 )
@@ -175,9 +173,11 @@ def setup_website(basedir, config):
 
                       @define('CONST_Debug', $_GET['debug'] ?? false);
                       @define('CONST_LibDir', '{0}');
+                      @define('CONST_TokenizerDir', '{2}');
                       @define('CONST_NominatimVersion', '{1[0]}.{1[1]}.{1[2]}-{1[3]}');
 
-                      """.format(config.lib_dir.php, NOMINATIM_VERSION))
+                      """.format(config.lib_dir.php, NOMINATIM_VERSION,
+                                 config.project_dir / 'tokenizer'))
 
     for php_name, conf_name, var_type in PHP_CONST_DEFS:
         if var_type == bool:
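Note: website setup now emits a third constant pointing at the project's tokenizer directory. A quick sketch of what the generated prelude looks like (paths and version here are made up; the real values come from config.lib_dir.php, NOMINATIM_VERSION and config.project_dir):

    from pathlib import Path

    php_dir = Path('/usr/local/share/nominatim/lib-php')   # assumed
    project_dir = Path('/srv/nominatim/project')           # assumed
    version = (3, 7, 0, 2)

    print("""@define('CONST_LibDir', '{0}');
    @define('CONST_TokenizerDir', '{2}');
    @define('CONST_NominatimVersion', '{1[0]}.{1[1]}.{1[2]}-{1[3]}');""".format(
        php_dir, version, project_dir / 'tokenizer'))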
index 46965c4bed6d276b8e6cbb6d55c17a974595dba1..9b97bca628dc8bb729d6645568722b369e91d6ef 100644 (file)
@@ -24,9 +24,6 @@ class SpecialPhrasesImporterStatistics():
         self.tables_deleted = 0
         self.tables_ignored = 0
         self.global_phrases_invalid = 0
-        self.global_phrases_added = 0
-        self.global_phrases_ignored = 0
-        self.global_phrases_deleted = 0
 
     def _set_lang_values_to_0(self):
         """
@@ -34,8 +31,6 @@ class SpecialPhrasesImporterStatistics():
             lang to 0.
         """
         self.lang_phrases_invalid = 0
-        self.lang_phrases_added = 0
-        self.lang_phrases_ignored = 0
 
     def notify_one_phrase_invalid(self):
         """
@@ -45,29 +40,6 @@ class SpecialPhrasesImporterStatistics():
         self.lang_phrases_invalid += 1
         self.global_phrases_invalid += 1
 
-    def notify_one_phrase_added(self):
-        """
-            Add +1 to the count of entries
-            added to the db.
-        """
-        self.lang_phrases_added += 1
-        self.global_phrases_added += 1
-
-    def notify_one_phrase_ignored(self):
-        """
-            Add +1 to the count of ignored
-            entries as it was already in the db.
-        """
-        self.lang_phrases_ignored += 1
-        self.global_phrases_ignored += 1
-
-    def notify_one_phrase_deleted(self):
-        """
-            Add +1 to the count of phrases deleted
-            from the database.
-        """
-        self.global_phrases_deleted += 1
-
     def notify_one_table_created(self):
         """
             Add +1 to the count of created tables.
@@ -97,12 +69,6 @@ class SpecialPhrasesImporterStatistics():
         LOG.info('- %s phrases were invalid.', self.global_phrases_invalid)
         if self.global_phrases_invalid > 0:
             LOG.info('  Those invalid phrases have been skipped.')
-        LOG.info('- %s phrases were ignored as they are already in the database',
-                 self.global_phrases_ignored)
-        LOG.info('- %s phrases were added to the database', self.global_phrases_added)
-        LOG.info('- %s phrases were deleted from the database', self.global_phrases_deleted)
-        if self.global_phrases_deleted > 0:
-            LOG.info('  They were deleted as they are not valid anymore.')
         LOG.info('- %s tables were ignored as they already exist in the database',
                  self.tables_ignored)
         LOG.info('- %s tables were created', self.tables_created)
@@ -126,9 +92,6 @@ class SpecialPhrasesImporterStatistics():
         LOG.info('- %s phrases were invalid.', self.lang_phrases_invalid)
         if self.lang_phrases_invalid > 0:
             LOG.info('  Those invalid phrases have been skipped.')
-        LOG.info('- %s phrases were ignored as they are already in the database',
-                 self.lang_phrases_ignored)
-        LOG.info('- %s phrases were added to the database', self.lang_phrases_added)
         LOG.info('====================================================================')
 
         if self.lang_phrases_invalid > 0:
index a8e780c39ba4515568e5103b208ef0822af8ff6c..9649f94b1a736b0d561d489d1b9e67ef8546d28b 100644 (file)
@@ -9,7 +9,6 @@ import re
 import subprocess
 import json
 
-from icu import Transliterator
 from psycopg2.sql import Identifier, Literal, SQL
 
 from nominatim.tools.exec_utils import get_url
@@ -33,21 +32,14 @@ class SpecialPhrasesImporter():
             r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])'
         )
         self.sanity_check_pattern = re.compile(r'^\w+$')
-        self.transliterator = Transliterator.createFromRules("special-phrases normalizer",
-                                                             self.config.TERM_NORMALIZATION)
-        #This set will contain all existing phrases from the word table which
-        #no longer exist on the wiki.
-        #It contain tuples with the following format: (normalized_word, class, type, operator)
-        self.words_phrases_to_delete = set()
-        #This set will contain the phrases which still exist from the wiki.
-        #It is used to prevent duplicates on the wiki by removing them from
-        #the word_phrases_to_delete only at the end.
-        self.words_phrases_still_exist = set()
+        # This set will contain all existing phrases to be added.
+        # It contains tuples with the following format: (label, class, type, operator)
+        self.word_phrases = set()
         #This set will contain all existing place_classtype tables which don't match any
         #special phrases class/type on the wiki.
         self.table_phrases_to_delete = set()
 
-    def import_from_wiki(self, languages=None):
+    def import_from_wiki(self, tokenizer, languages=None):
         """
             Iterate through all specified languages and
             extract corresponding special phrases from the wiki.
@@ -55,7 +47,6 @@ class SpecialPhrasesImporter():
         if languages is not None and not isinstance(languages, list):
             raise TypeError('The \'languages\' argument should be of type list.')
 
-        self._fetch_existing_words_phrases()
         self._fetch_existing_place_classtype_tables()
 
         #Get all languages to process.
@@ -71,30 +62,15 @@ class SpecialPhrasesImporter():
             self.statistics_handler.notify_current_lang_done(lang)
 
         self._create_place_classtype_table_and_indexes(class_type_pairs)
-        self._remove_non_existent_phrases_from_db()
+        self._remove_non_existent_tables_from_db()
         self.db_connection.commit()
+
+        with tokenizer.name_analyzer() as analyzer:
+            analyzer.update_special_phrases(self.word_phrases)
+
         LOG.warning('Import done.')
         self.statistics_handler.notify_import_done()
 
-    def _fetch_existing_words_phrases(self):
-        """
-            Fetch existing special phrases from the word table.
-            Fill the word_phrases_to_delete set of the class.
-        """
-        #Only extract special phrases terms:
-        #If class=place and type=house then it is a housenumber term.
-        #If class=place and type=postcode then it is a postcode term.
-        word_query = """
-            SELECT word, class, type, operator FROM word
-            WHERE class != 'place' OR (type != 'house' AND type != 'postcode')
-        """
-        with self.db_connection.cursor() as db_cursor:
-            db_cursor.execute(SQL(word_query))
-            for row in db_cursor:
-                row[3] = '-' if row[3] is None else row[3]
-                self.words_phrases_to_delete.add(
-                    (row[0], row[1], row[2], row[3])
-                )
 
     def _fetch_existing_place_classtype_tables(self):
         """
@@ -176,7 +152,6 @@ class SpecialPhrasesImporter():
 
         for match in matches:
             phrase_label = match[0].strip()
-            normalized_label = self.transliterator.transliterate(phrase_label)
             phrase_class = match[1].strip()
             phrase_type = match[2].strip()
             phrase_operator = match[3].strip()
@@ -198,20 +173,6 @@ class SpecialPhrasesImporter():
             ):
                 continue
 
-            #Check if the phrase already exists in the database.
-            if (
-                    (normalized_label, phrase_class, phrase_type, phrase_operator)
-                    in self.words_phrases_to_delete
-            ):
-                #Remove this phrase from the ones to delete as it still exist on the wiki.
-                self.words_phrases_still_exist.add(
-                    (normalized_label, phrase_class, phrase_type, phrase_operator)
-                )
-                class_type_pairs.add((phrase_class, phrase_type))
-                self.statistics_handler.notify_one_phrase_ignored()
-                #Dont need to add this phrase as it already exists in the word table.
-                continue
-
             #sanity check, in case somebody added garbage in the wiki
             if not self._check_sanity(lang, phrase_class, phrase_type):
                 self.statistics_handler.notify_one_phrase_invalid()
@@ -219,35 +180,11 @@ class SpecialPhrasesImporter():
 
             class_type_pairs.add((phrase_class, phrase_type))
 
-            self._process_amenity(
-                phrase_label, normalized_label, phrase_class,
-                phrase_type, phrase_operator
-            )
-            self.statistics_handler.notify_one_phrase_added()
+            self.word_phrases.add((phrase_label, phrase_class,
+                                   phrase_type, phrase_operator))
 
         return class_type_pairs
 
-    def _process_amenity(self, phrase_label, normalized_label,
-                         phrase_class, phrase_type, phrase_operator):
-        # pylint: disable-msg=too-many-arguments
-        """
-            Add phrase lookup and corresponding class and
-            type to the word table based on the operator.
-        """
-        with self.db_connection.cursor() as db_cursor:
-            if phrase_operator == 'near':
-                db_cursor.execute("""SELECT getorcreate_amenityoperator(
-                                  make_standard_name(%s), %s, %s, %s, 'near')""",
-                                  (phrase_label, normalized_label, phrase_class, phrase_type))
-            elif phrase_operator == 'in':
-                db_cursor.execute("""SELECT getorcreate_amenityoperator(
-                                  make_standard_name(%s), %s, %s, %s, 'in')""",
-                                  (phrase_label, normalized_label, phrase_class, phrase_type))
-            else:
-                db_cursor.execute("""SELECT getorcreate_amenity(
-                                  make_standard_name(%s), %s, %s, %s)""",
-                                  (phrase_label, normalized_label, phrase_class, phrase_type))
-
 
     def _create_place_classtype_table_and_indexes(self, class_type_pairs):
         """
@@ -339,33 +276,15 @@ class SpecialPhrasesImporter():
                               .format(Identifier(table_name),
                                       Identifier(self.config.DATABASE_WEBUSER)))
 
-    def _remove_non_existent_phrases_from_db(self):
+    def _remove_non_existent_tables_from_db(self):
         """
             Remove special phrases which no longer exist on the wiki.
-            Delete from the word table and delete the place_classtype tables.
+            Delete the place_classtype tables.
         """
         LOG.warning('Cleaning database...')
-        self.words_phrases_to_delete = self.words_phrases_to_delete - self.words_phrases_still_exist
         #Array containing all queries to execute. Contains tuples of format (query, parameters)
         queries_parameters = []
 
-        #Delete phrases from the word table which are not on the wiki anymore.
-        for phrase_to_delete in self.words_phrases_to_delete:
-            self.statistics_handler.notify_one_phrase_deleted()
-            if phrase_to_delete[3] == '-':
-                query = """
-                    DELETE FROM word WHERE word = %s AND class = %s AND type = %s AND operator IS null
-                """
-                parameters = (phrase_to_delete[0], phrase_to_delete[1], phrase_to_delete[2], )
-                queries_parameters.append((query, parameters))
-            else:
-                query = """
-                    DELETE FROM word WHERE word = %s AND class = %s AND type = %s AND operator = %s
-                """
-                parameters = (phrase_to_delete[0], phrase_to_delete[1],
-                              phrase_to_delete[2], phrase_to_delete[3], )
-                queries_parameters.append((query, parameters))
-
         #Delete place_classtype tables corresponding to class/type which are not on the wiki anymore
         for table in self.table_phrases_to_delete:
             self.statistics_handler.notify_one_table_deleted()
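Note: with the word-table bookkeeping removed, the importer only collects (label, class, type, operator) tuples and hands them to the tokenizer in one go. A sketch of the new call site (the constructor arguments are assumed from context, the rest is from this diff):

    from nominatim.tokenizer import factory as tokenizer_factory

    tokenizer = tokenizer_factory.get_tokenizer_for_db(config)      # assumed setup
    importer = SpecialPhrasesImporter(config, phplib_dir, db_connection)
    importer.import_from_wiki(tokenizer, languages=['en'])
    # import_from_wiki() now ends with:
    #   with tokenizer.name_analyzer() as analyzer:
    #       analyzer.update_special_phrases(self.word_phrases)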
index 9670ea6062b2c7a14971fa39993e16d165f99a1e..6f9005eaf868d8dabce8f26ade77ec28612d63e4 100644 (file)
@@ -10,7 +10,7 @@ Version information for Nominatim.
 # and must always be increased when there is a change to the database or code
 # that requires a migration.
 # Released versions always have a database patch level of 0.
-NOMINATIM_VERSION = (3, 7, 0, 1)
+NOMINATIM_VERSION = (3, 7, 0, 2)
 
 POSTGRESQL_REQUIRED_VERSION = (9, 3)
 POSTGIS_REQUIRED_VERSION = (2, 2)
index 4069270eb73161073fba7ed0af593a8c3730e69c..09819c0a88b295fd7c1599e9ce60dae7a5b21549 100644 (file)
@@ -18,6 +18,12 @@ NOMINATIM_DATABASE_WEBUSER="www-data"
 # Changing this value requires to run 'nominatim refresh --functions'.
 NOMINATIM_DATABASE_MODULE_PATH=
 
+# Tokenizer used for normalizing and parsing queries and names.
+# The tokenizer is set up during import and cannot be changed afterwards
+# without a reimport.
+# Currently available tokenizers: legacy
+NOMINATIM_TOKENIZER="legacy"
+
 # Number of occurrences of a word before it is considered frequent.
 # Similar to the concept of stop words. Frequent partial words get ignored
 # or handled differently during search.
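Note: the chosen tokenizer is persisted in the database, so later commands recreate the same one without consulting the environment again. A sketch of the round trip, using the property helpers that the tests below rely on:

    from nominatim.db import properties

    # written once by factory.create_tokenizer() during import:
    #   properties.set_property(conn, 'tokenizer', 'legacy')
    # read back by factory.get_tokenizer_for_db() on any later run:
    name = properties.get_property(conn, 'tokenizer')    # conn: assumed DB connection
    assert name == 'legacy'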
index 0e922e1d6b503716f649ff8b13409adf21bd9a1f..fd207059408fc96092b537149c0f83e807a5260c 100644 (file)
@@ -24,7 +24,7 @@ Feature: Creation of search terms
         When importing
         Then search_name contains
          | object | nameaddress_vector |
-         | N1     | Rose, Street, Walltown |
+         | N1     | #Rose Street, Walltown |
         When searching for "23 Rose Street, Walltown"
         Then results contain
          | osm_type | osm_id | name |
@@ -248,7 +248,7 @@ Feature: Creation of search terms
         When importing
         Then search_name contains
          | object | name_vector | nameaddress_vector |
-         | N1     | #Green Moss | Rose, Street, Walltown |
+         | N1     | #Green Moss | #Rose Street, Walltown |
         When searching for "Green Moss, Rose Street, Walltown"
         Then results contain
          | osm_type | osm_id | name |
@@ -299,7 +299,7 @@ Feature: Creation of search terms
         When importing
         Then search_name contains
          | object | name_vector | nameaddress_vector |
-         | N1     | foo         | the road |
+         | N1     | foo         | #the road |
 
     Scenario: Some addr: tags are added to address
         Given the scene roads-with-pois
index 6381e4b4a2da8865c89e711910458a2a7fb13d74..f6e3a03945d0a8b834fbf8500874c74693648b7c 100644 (file)
@@ -10,6 +10,7 @@ sys.path.insert(1, str((Path(__file__) / '..' / '..' / '..' / '..').resolve()))
 from nominatim import cli
 from nominatim.config import Configuration
 from nominatim.tools import refresh
+from nominatim.tokenizer import factory as tokenizer_factory
 from steps.utils import run_script
 
 class NominatimEnvironment:
@@ -106,9 +107,19 @@ class NominatimEnvironment:
             self.website_dir.cleanup()
 
         self.website_dir = tempfile.TemporaryDirectory()
-        cfg = Configuration(None, self.src_dir / 'settings', environ=self.test_env)
-        cfg.lib_dir.php = self.src_dir / 'lib-php'
-        refresh.setup_website(Path(self.website_dir.name) / 'website', cfg)
+        refresh.setup_website(Path(self.website_dir.name) / 'website',
+                              self.get_test_config())
+
+
+    def get_test_config(self):
+        cfg = Configuration(Path(self.website_dir.name), self.src_dir / 'settings',
+                            environ=self.test_env)
+        cfg.set_libdirs(module=self.build_dir / 'module',
+                        osm2pgsql=self.build_dir / 'osm2pgsql' / 'osm2pgsql',
+                        php=self.src_dir / 'lib-php',
+                        sql=self.src_dir / 'lib-sql',
+                        data=self.src_dir / 'data')
+        return cfg
 
     def get_libpq_dsn(self):
         dsn = self.test_env['NOMINATIM_DATABASE_DSN']
@@ -169,33 +180,41 @@ class NominatimEnvironment:
         """
         self.write_nominatim_config(self.api_test_db)
 
-        if self.api_db_done:
-            return
+        if not self.api_db_done:
+            self.api_db_done = True
 
-        self.api_db_done = True
-
-        if self._reuse_or_drop_db(self.api_test_db):
-            return
+            if not self._reuse_or_drop_db(self.api_test_db):
+                testdata = Path('__file__') / '..' / '..' / 'testdb'
+                self.test_env['NOMINATIM_WIKIPEDIA_DATA_PATH'] = str(testdata.resolve())
 
-        testdata = Path('__file__') / '..' / '..' / 'testdb'
-        self.test_env['NOMINATIM_WIKIPEDIA_DATA_PATH'] = str(testdata.resolve())
+                try:
+                    self.run_nominatim('import', '--osm-file', str(self.api_test_file))
+                    self.run_nominatim('add-data', '--tiger-data', str((testdata / 'tiger').resolve()))
+                    self.run_nominatim('freeze')
 
-        try:
-            self.run_nominatim('import', '--osm-file', str(self.api_test_file))
-            self.run_nominatim('add-data', '--tiger-data', str((testdata / 'tiger').resolve()))
-            self.run_nominatim('freeze')
+                    phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve())
+                    run_script(['psql', '-d', self.api_test_db, '-f', phrase_file])
+                except:
+                    self.db_drop_database(self.api_test_db)
+                    raise
 
-            phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve())
-            run_script(['psql', '-d', self.api_test_db, '-f', phrase_file])
-        except:
-            self.db_drop_database(self.api_test_db)
-            raise
+        tokenizer_factory.create_tokenizer(self.get_test_config(), init_db=False)
 
 
     def setup_unknown_db(self):
         """ Setup a test against a non-existing database.
         """
-        self.write_nominatim_config('UNKNOWN_DATABASE_NAME')
+        # The tokenizer needs an existing database to function.
+        # So start with the usual database
+        class _Context:
+            db = None
+
+        context = _Context()
+        self.setup_db(context)
+        tokenizer_factory.create_tokenizer(self.get_test_config(), init_db=False)
+
+        # Then drop the DB again
+        self.teardown_db(context, force_drop=True)
 
     def setup_db(self, context):
         """ Setup a test against a fresh, empty test database.
@@ -212,13 +231,13 @@ class NominatimEnvironment:
         context.db.autocommit = True
         psycopg2.extras.register_hstore(context.db, globally=False)
 
-    def teardown_db(self, context):
+    def teardown_db(self, context, force_drop=False):
         """ Remove the test database, if it exists.
         """
-        if 'db' in context:
+        if hasattr(context, 'db'):
             context.db.close()
 
-        if not self.keep_scenario_db:
+        if force_drop or not self.keep_scenario_db:
             self.db_drop_database(self.test_db)
 
     def _reuse_or_drop_db(self, name):
index 72a610eb123733db313ee74d510b633afdac5fb3..52a50a511c8a095c0c9ce5bd8bd9cb9f7e0783fb 100644 (file)
@@ -7,6 +7,7 @@ from place_inserter import PlaceColumn
 from table_compare import NominatimID, DBRow
 
 from nominatim.indexer import indexer
+from nominatim.tokenizer import factory as tokenizer_factory
 
 def check_database_integrity(context):
     """ Check some generic constraints on the tables.
@@ -86,6 +87,9 @@ def add_data_to_planet_ways(context):
 def import_and_index_data_from_place_table(context):
     """ Import data previously set up in the place table.
     """
+    nctx = context.nominatim
+
+    tokenizer = tokenizer_factory.create_tokenizer(nctx.get_test_config())
     context.nominatim.copy_from_place(context.db)
 
     # XXX use tool function as soon as it is ported
@@ -105,7 +109,7 @@ def import_and_index_data_from_place_table(context):
 
     # Call directly as the refresh function does not include postcodes.
     indexer.LOG.setLevel(logging.ERROR)
-    indexer.Indexer(context.nominatim.get_libpq_dsn(), 1).index_full(analyse=False)
+    indexer.Indexer(context.nominatim.get_libpq_dsn(), tokenizer, 1).index_full(analyse=False)
 
     check_database_integrity(context)
 
@@ -230,7 +234,7 @@ def check_search_name_contents(context, exclude):
                                 if exclude:
                                     assert not present, "Found term for {}/{}: {}".format(row['object'], name, wid[1])
                                 else:
-                                    assert present, "Missing term for {}/{}: {}".fromat(row['object'], name, wid[1])
+                                    assert present, "Missing term for {}/{}: {}".format(row['object'], name, wid[1])
                     elif name != 'object':
                         assert db_row.contains(name, value), db_row.assert_msg(name, value)
 
index 42166e34f330da72a06f2089e6e99d0a4b73db21..e4c2bbd1be0f80130b66a018e0ee0790f217ab4f 100644 (file)
@@ -44,19 +44,16 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
     public function testEmptyPhrase()
     {
         $oPhrase = new Phrase('', '');
-        $oPhrase->computeWordSets(new TokensFullSet());
+        $oPhrase->computeWordSets(array(), new TokensFullSet());
 
-        $this->assertEquals(
-            array(array('')),
-            $oPhrase->getWordSets()
-        );
+        $this->assertNull($oPhrase->getWordSets());
     }
 
 
     public function testSingleWordPhrase()
     {
         $oPhrase = new Phrase('a', '');
-        $oPhrase->computeWordSets(new TokensFullSet());
+        $oPhrase->computeWordSets(array('a'), new TokensFullSet());
 
         $this->assertEquals(
             '(a)',
@@ -68,21 +65,21 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
     public function testMultiWordPhrase()
     {
         $oPhrase = new Phrase('a b', '');
-        $oPhrase->computeWordSets(new TokensFullSet());
+        $oPhrase->computeWordSets(array('a', 'b'), new TokensFullSet());
         $this->assertEquals(
             '(a b),(a|b)',
             $this->serializeSets($oPhrase->getWordSets())
         );
 
         $oPhrase = new Phrase('a b c', '');
-        $oPhrase->computeWordSets(new TokensFullSet());
+        $oPhrase->computeWordSets(array('a', 'b', 'c'), new TokensFullSet());
         $this->assertEquals(
             '(a b c),(a|b c),(a b|c),(a|b|c)',
             $this->serializeSets($oPhrase->getWordSets())
         );
 
         $oPhrase = new Phrase('a b c d', '');
-        $oPhrase->computeWordSets(new TokensFullSet());
+        $oPhrase->computeWordSets(array('a', 'b', 'c', 'd'), new TokensFullSet());
         $this->assertEquals(
             '(a b c d),(a b c|d),(a b|c d),(a|b c d),(a b|c|d),(a|b c|d),(a|b|c d),(a|b|c|d)',
             $this->serializeSets($oPhrase->getWordSets())
@@ -93,7 +90,7 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
     public function testInverseWordSets()
     {
         $oPhrase = new Phrase('a b c', '');
-        $oPhrase->computeWordSets(new TokensFullSet());
+        $oPhrase->computeWordSets(array('a', 'b', 'c'), new TokensFullSet());
         $oPhrase->invertWordSets();
 
         $this->assertEquals(
@@ -105,14 +102,16 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
 
     public function testMaxWordSets()
     {
-        $oPhrase = new Phrase(join(' ', array_fill(0, 4, 'a')), '');
-        $oPhrase->computeWordSets(new TokensFullSet());
+        $aWords = array_fill(0, 4, 'a');
+        $oPhrase = new Phrase(join(' ', $aWords), '');
+        $oPhrase->computeWordSets($aWords, new TokensFullSet());
         $this->assertEquals(8, count($oPhrase->getWordSets()));
         $oPhrase->invertWordSets();
         $this->assertEquals(8, count($oPhrase->getWordSets()));
 
-        $oPhrase = new Phrase(join(' ', array_fill(0, 18, 'a')), '');
-        $oPhrase->computeWordSets(new TokensFullSet());
+        $aWords = array_fill(0, 18, 'a');
+        $oPhrase = new Phrase(join(' ', $aWords), '');
+        $oPhrase->computeWordSets($aWords, new TokensFullSet());
         $this->assertEquals(100, count($oPhrase->getWordSets()));
         $oPhrase->invertWordSets();
         $this->assertEquals(100, count($oPhrase->getWordSets()));
@@ -122,7 +121,7 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
     public function testPartialTokensShortTerm()
     {
         $oPhrase = new Phrase('a b c d', '');
-        $oPhrase->computeWordSets(new TokensPartialSet(array('a', 'b', 'd', 'b c', 'b c d')));
+        $oPhrase->computeWordSets(array('a', 'b', 'c', 'd'), new TokensPartialSet(array('a', 'b', 'd', 'b c', 'b c d')));
         $this->assertEquals(
             '(a|b c d),(a|b c|d)',
             $this->serializeSets($oPhrase->getWordSets())
@@ -132,8 +131,9 @@ class PhraseTest extends \PHPUnit\Framework\TestCase
 
     public function testPartialTokensLongTerm()
     {
-        $oPhrase = new Phrase(join(' ', array_fill(0, 18, 'a')), '');
-        $oPhrase->computeWordSets(new TokensPartialSet(array('a', 'a a a a a')));
+        $aWords = array_fill(0, 18, 'a');
+        $oPhrase = new Phrase(join(' ', $aWords), '');
+        $oPhrase->computeWordSets($aWords, new TokensPartialSet(array('a', 'a a a a a')));
         $this->assertEquals(80, count($oPhrase->getWordSets()));
     }
 }
index 8cb8a703a07f7d186603eb58a6d0862e0b7cf47c..9e03a970fd2aa1211c234460918dd119cf6a7cc0 100644 (file)
@@ -2,6 +2,8 @@
 
 namespace Nominatim;
 
+@define('CONST_TokenizerDir', dirname(__FILE__));
+
 require_once(CONST_LibDir.'/DB.php');
 require_once(CONST_LibDir.'/Status.php');
 
@@ -40,45 +42,6 @@ class StatusTest extends \PHPUnit\Framework\TestCase
         $this->assertEquals('No database', $oStatus->status());
     }
 
-
-    public function testModuleFail()
-    {
-        $this->expectException(\Exception::class);
-        $this->expectExceptionMessage('Module call failed');
-        $this->expectExceptionCode(702);
-
-        // stub has getOne method but doesn't return anything
-        $oDbStub = $this->getMockBuilder(Nominatim\DB::class)
-                        ->setMethods(array('connect', 'getOne'))
-                        ->getMock();
-
-        $oStatus = new Status($oDbStub);
-        $this->assertNull($oStatus->status());
-    }
-
-
-    public function testWordIdQueryFail()
-    {
-        $this->expectException(\Exception::class);
-        $this->expectExceptionMessage('No value');
-        $this->expectExceptionCode(704);
-
-        $oDbStub = $this->getMockBuilder(Nominatim\DB::class)
-                        ->setMethods(array('connect', 'getOne'))
-                        ->getMock();
-
-        // return no word_id
-        $oDbStub->method('getOne')
-                ->will($this->returnCallback(function ($sql) {
-                    if (preg_match("/make_standard_name\('a'\)/", $sql)) return 'a';
-                    if (preg_match('/SELECT word_id, word_token/', $sql)) return null;
-                }));
-
-        $oStatus = new Status($oDbStub);
-        $this->assertNull($oStatus->status());
-    }
-
-
     public function testOK()
     {
         $oDbStub = $this->getMockBuilder(Nominatim\DB::class)
@@ -100,7 +63,7 @@ class StatusTest extends \PHPUnit\Framework\TestCase
         $oDbStub = $this->getMockBuilder(Nominatim\DB::class)
                         ->setMethods(array('getOne'))
                         ->getMock();
-     
+
         $oDbStub->method('getOne')
                 ->willReturn(1519430221);
 
index 14a595ea9f3c115da1e8063e3436739174699e18..f0139d7670b4dd246bf3797aa2b49de6704918a6 100644 (file)
@@ -49,88 +49,4 @@ class TokenTest extends \PHPUnit\Framework\TestCase
         $this->assertFalse($TL->contains('unknownword'));
         $this->assertEquals(array(), $TL->get('unknownword'));
     }
-
-    public function testAddress()
-    {
-        $this->expectOutputRegex('/<p><tt>/');
-
-        $oDbStub = $this->getMockBuilder(Nominatim\DB::class)
-                        ->setMethods(array('getAll', 'getDBQuotedList'))
-                        ->getMock();
-
-        $oDbStub->method('getDBQuotedList')
-                ->will($this->returnCallback(function ($aVals) {
-                    return array_map(function ($sVal) {
-                        return "'".$sVal."'";
-                    }, $aVals);
-                }));
-
-
-        $oDbStub->method('getAll')
-                ->will($this->returnCallback(function ($sql) {
-                    $aResults = array();
-                    if (preg_match('/1051/', $sql)) {
-                        $aResults[] = $this->wordResult(array(
-                                                         'word_id' => 999,
-                                                         'word_token' => '1051',
-                                                         'class' => 'place',
-                                                         'type' => 'house'
-                                                        ));
-                    }
-                    if (preg_match('/hauptstr/', $sql)) {
-                        $aResults[] = $this->wordResult(array(
-                                                         'word_id' => 999,
-                                                         'word_token' => 'hauptstr',
-                                                         'class' => 'place',
-                                                         'type' => 'street',
-                                                         'operator' => true
-                                                        ));
-                    }
-                    if (preg_match('/64286/', $sql)) {
-                        $aResults[] = $this->wordResult(array(
-                                                         'word_id' => 999,
-                                                         'word_token' => '64286',
-                                                         'word' => '64286',
-                                                         'class' => 'place',
-                                                         'type' => 'postcode'
-                                                        ));
-                    }
-                    if (preg_match('/darmstadt/', $sql)) {
-                        $aResults[] = $this->wordResult(array(
-                                                         'word_id' => 999,
-                                                         'word_token' => 'darmstadt',
-                                                         'count' => 533
-                                                        ));
-                    }
-                    if (preg_match('/alemagne/', $sql)) {
-                        $aResults[] = $this->wordResult(array(
-                                                         'word_id' => 999,
-                                                         'word_token' => 'alemagne',
-                                                         'country_code' => 'de',
-                                                        ));
-                    }
-                    if (preg_match('/mexico/', $sql)) {
-                        $aResults[] = $this->wordResult(array(
-                                                         'word_id' => 999,
-                                                         'word_token' => 'mexico',
-                                                         'country_code' => 'mx',
-                                                        ));
-                    }
-                    return $aResults;
-                }));
-
-        $aCountryCodes = array('de', 'fr');
-        $sNormQuery = '1051 hauptstr 64286 darmstadt alemagne mexico';
-        $aTokens = explode(' ', $sNormQuery);
-
-        $TL = new TokenList;
-        $TL->addTokensFromDB($oDbStub, $aTokens, $aCountryCodes, $sNormQuery, $this->oNormalizer);
-        $this->assertEquals(5, $TL->count());
-
-        $this->assertEquals(array(new Token\HouseNumber(999, '1051')), $TL->get('1051'));
-        $this->assertEquals(array(new Token\Country(999, 'de')), $TL->get('alemagne'));
-        $this->assertEquals(array(new Token\Postcode(999, '64286')), $TL->get('64286'));
-        $this->assertEquals(array(new Token\Word(999, true, 533, 0)), $TL->get('darmstadt'));
-        $this->assertEquals(array(new Token\SpecialTerm(999, 'place', 'street', true)), $TL->get('hauptstr'));
-    }
 }
diff --git a/test/php/Nominatim/tokenizer.php b/test/php/Nominatim/tokenizer.php
new file mode 100644 (file)
index 0000000..0735e66
--- /dev/null
@@ -0,0 +1,17 @@
+<?php
+
+namespace Nominatim;
+
+class Tokenizer
+{
+    private $oDB;
+
+    public function __construct(&$oDB)
+    {
+        $this->oDB =& $oDB;
+    }
+
+    public function checkStatus()
+    {
+    }
+}
index 4b9749c01f4c2f2d9159f30fc53961107705ce90..493620c45ece19c5abf3a7477e41886f7a36c192 100644 (file)
@@ -1,3 +1,4 @@
+import importlib
 import itertools
 import sys
 from pathlib import Path
@@ -15,6 +16,9 @@ sys.path.insert(0, str(SRC_DIR.resolve()))
 from nominatim.config import Configuration
 from nominatim.db import connection
 from nominatim.db.sql_preprocessor import SQLPreprocessor
+from nominatim.db import properties
+
+import dummy_tokenizer
 
 class _TestingCursor(psycopg2.extras.DictCursor):
     """ Extension to the DictCursor class that provides execution
@@ -117,9 +121,8 @@ def table_factory(temp_db_cursor):
     def mk_table(name, definition='id INT', content=None):
         temp_db_cursor.execute('CREATE TABLE {} ({})'.format(name, definition))
         if content is not None:
-            if not isinstance(content, str):
-                content = '),('.join([str(x) for x in content])
-            temp_db_cursor.execute("INSERT INTO {} VALUES ({})".format(name, content))
+            psycopg2.extras.execute_values(
+                temp_db_cursor, "INSERT INTO {} VALUES %s".format(name), content)
 
     return mk_table
 
@@ -144,6 +147,11 @@ def tmp_phplib_dir():
 
         yield Path(phpdir)
 
+
+@pytest.fixture
+def property_table(table_factory):
+    table_factory('nominatim_properties', 'property TEXT, value TEXT')
+
 @pytest.fixture
 def status_table(temp_db_conn):
     """ Create an empty version of the status table and
@@ -281,10 +289,29 @@ def osm2pgsql_options(temp_db):
 
 @pytest.fixture
 def sql_preprocessor(temp_db_conn, tmp_path, monkeypatch, table_factory):
-    monkeypatch.setenv('NOMINATIM_DATABASE_MODULE_PATH', '.')
-    table_factory('country_name', 'partition INT', (0, 1, 2))
+    table_factory('country_name', 'partition INT', ((0, ), (1, ), (2, )))
     cfg = Configuration(None, SRC_DIR.resolve() / 'settings')
     cfg.set_libdirs(module='.', osm2pgsql='.', php=SRC_DIR / 'lib-php',
                     sql=tmp_path, data=SRC_DIR / 'data')
 
     return SQLPreprocessor(temp_db_conn, cfg)
+
+
+@pytest.fixture
+def tokenizer_mock(monkeypatch, property_table, temp_db_conn, tmp_path):
+    """ Sets up the configuration so that the test dummy tokenizer will be
+        loaded when the tokenizer factory is used. Also returns a factory
+        with which a new dummy tokenizer may be created.
+    """
+    monkeypatch.setenv('NOMINATIM_TOKENIZER', 'dummy')
+
+    def _import_dummy(module, *args, **kwargs):
+        return dummy_tokenizer
+
+    monkeypatch.setattr(importlib, "import_module", _import_dummy)
+    properties.set_property(temp_db_conn, 'tokenizer', 'dummy')
+
+    def _create_tokenizer():
+        return dummy_tokenizer.DummyTokenizer(None, None)
+
+    return _create_tokenizer
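Note: tests that only need a tokenizer instance can call the returned helper directly instead of going through the factory; a minimal sketch (the test name is hypothetical):

    def test_with_dummy_tokenizer(tokenizer_mock):
        tok = tokenizer_mock()                 # a fresh DummyTokenizer(None, None)
        with tok.name_analyzer() as analyzer:
            analyzer.update_special_phrases(set())
        assert tok.analyser_cache['special_phrases'] == set()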
diff --git a/test/python/dummy_tokenizer.py b/test/python/dummy_tokenizer.py
new file mode 100644 (file)
index 0000000..6352a64
--- /dev/null
@@ -0,0 +1,64 @@
+"""
+Tokenizer for testing.
+"""
+
+def create(dsn, data_dir):
+    """ Create a new instance of the tokenizer provided by this module.
+    """
+    return DummyTokenizer(dsn, data_dir)
+
+class DummyTokenizer:
+
+    def __init__(self, dsn, data_dir):
+        self.dsn = dsn
+        self.data_dir = data_dir
+        self.init_state = None
+        self.analyser_cache = {}
+
+
+    def init_new_db(self, *args, **kwargs):
+        assert self.init_state is None
+        self.init_state = "new"
+
+
+    def init_from_project(self):
+        assert self.init_state is None
+        self.init_state = "loaded"
+
+
+    def finalize_import(self, _):
+        pass
+
+
+    def name_analyzer(self):
+        return DummyNameAnalyzer(self.analyser_cache)
+
+
+class DummyNameAnalyzer:
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.close()
+
+
+    def __init__(self, cache):
+        self.analyser_cache = cache
+        cache['countries'] = []
+
+
+    def close(self):
+        pass
+
+    def add_postcodes_from_db(self):
+        pass
+
+    def update_special_phrases(self, phrases):
+        self.analyser_cache['special_phrases'] = phrases
+
+    def add_country_names(self, code, names):
+        self.analyser_cache['countries'].append((code, names))
+
+    def process_place(self, place):
+        return {}
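Note: the dummy also documents the interface a tokenizer has to provide. Roughly (a sketch; method names are taken from this file and its callers elsewhere in the diff, exact signatures may differ):

    class AnyTokenizer:
        def init_new_db(self, config): ...      # called once on 'nominatim import'
        def init_from_project(self): ...        # called when reopening a project
        def finalize_import(self, config): ...  # called after indexing is done
        def name_analyzer(self): ...            # returns a context-managed analyzer

    # analyzer side: add_postcodes_from_db(), update_special_phrases(phrases),
    # add_country_names(code, names) and process_place(place)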
index afa01e575c497ca70805e6c26835ef32da3f57cd..a286995611ddf524c54ae5c1657b3e3e15bf55c2 100644 (file)
@@ -22,6 +22,7 @@ import nominatim.tools.database_import
 import nominatim.tools.freeze
 import nominatim.tools.refresh
 import nominatim.tools.postcodes
+import nominatim.tokenizer.factory
 
 from mocks import MockParamCapture
 
@@ -56,6 +57,28 @@ def mock_func_factory(monkeypatch):
     return get_mock
 
 
+@pytest.fixture
+def tokenizer_mock(monkeypatch):
+    class DummyTokenizer:
+        def __init__(self, *args, **kwargs):
+            self.update_sql_functions_called = False
+            self.finalize_import_called = False
+
+        def update_sql_functions(self, *args):
+            self.update_sql_functions_called = True
+
+        def finalize_import(self, *args):
+            self.finalize_import_called = True
+
+    tok = DummyTokenizer()
+    monkeypatch.setattr(nominatim.tokenizer.factory, 'get_tokenizer_for_db',
+                        lambda *args: tok)
+    monkeypatch.setattr(nominatim.tokenizer.factory, 'create_tokenizer',
+                        lambda *args: tok)
+
+    return tok
+
+
 def test_cli_help(capsys):
     """ Running nominatim tool without arguments prints help.
     """
@@ -84,10 +107,9 @@ def test_import_bad_file(temp_db):
     assert 1 == call_nominatim('import', '--osm-file', '.')
 
 
-def test_import_full(temp_db, mock_func_factory):
+def test_import_full(temp_db, mock_func_factory, tokenizer_mock):
     mocks = [
         mock_func_factory(nominatim.tools.database_import, 'setup_database_skeleton'),
-        mock_func_factory(nominatim.tools.database_import, 'install_module'),
         mock_func_factory(nominatim.tools.database_import, 'import_osm_data'),
         mock_func_factory(nominatim.tools.refresh, 'import_wikipedia_articles'),
         mock_func_factory(nominatim.tools.database_import, 'truncate_data_tables'),
@@ -107,6 +129,7 @@ def test_import_full(temp_db, mock_func_factory):
     cf_mock = mock_func_factory(nominatim.tools.refresh, 'create_functions')
 
     assert 0 == call_nominatim('import', '--osm-file', __file__)
+    assert tokenizer_mock.finalize_import_called
 
     assert cf_mock.called > 1
 
@@ -114,7 +137,7 @@ def test_import_full(temp_db, mock_func_factory):
         assert mock.called == 1, "Mock '{}' not called".format(mock.func_name)
 
 
-def test_import_continue_load_data(temp_db, mock_func_factory):
+def test_import_continue_load_data(temp_db, mock_func_factory, tokenizer_mock):
     mocks = [
         mock_func_factory(nominatim.tools.database_import, 'truncate_data_tables'),
         mock_func_factory(nominatim.tools.database_import, 'load_data'),
@@ -127,12 +150,14 @@ def test_import_continue_load_data(temp_db, mock_func_factory):
     ]
 
     assert 0 == call_nominatim('import', '--continue', 'load-data')
+    assert tokenizer_mock.finalize_import_called
 
     for mock in mocks:
         assert mock.called == 1, "Mock '{}' not called".format(mock.func_name)
 
 
-def test_import_continue_indexing(temp_db, mock_func_factory, placex_table, temp_db_conn):
+def test_import_continue_indexing(temp_db, mock_func_factory, placex_table,
+                                  temp_db_conn, tokenizer_mock):
     mocks = [
         mock_func_factory(nominatim.tools.database_import, 'create_search_indices'),
         mock_func_factory(nominatim.tools.database_import, 'create_country_names'),
@@ -153,7 +178,7 @@ def test_import_continue_indexing(temp_db, mock_func_factory, placex_table, temp
     assert temp_db_conn.index_exists('idx_placex_pendingsector')
 
 
-def test_import_continue_postprocess(temp_db, mock_func_factory):
+def test_import_continue_postprocess(temp_db, mock_func_factory, tokenizer_mock):
     mocks = [
         mock_func_factory(nominatim.tools.database_import, 'create_search_indices'),
         mock_func_factory(nominatim.tools.database_import, 'create_country_names'),
@@ -163,6 +188,8 @@ def test_import_continue_postprocess(temp_db, mock_func_factory):
 
     assert 0 == call_nominatim('import', '--continue', 'db-postprocess')
 
+    assert tokenizer_mock.finalize_import_called
+
     for mock in mocks:
         assert mock.called == 1, "Mock '{}' not called".format(mock.func_name)
 
@@ -217,7 +244,8 @@ def test_add_data_command(mock_run_legacy, name, oid):
                           (['--boundaries-only'], 1, 0),
                           (['--no-boundaries'], 0, 1),
                           (['--boundaries-only', '--no-boundaries'], 0, 0)])
-def test_index_command(mock_func_factory, temp_db_cursor, params, do_bnds, do_ranks):
+def test_index_command(mock_func_factory, temp_db_cursor, tokenizer_mock,
+                       params, do_bnds, do_ranks):
     temp_db_cursor.execute("CREATE TABLE import_status (indexed bool)")
     bnd_mock = mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_boundaries')
     rank_mock = mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_by_rank')
@@ -227,7 +255,7 @@ def test_index_command(mock_func_factory, temp_db_cursor, params, do_bnds, do_ra
     assert bnd_mock.called == do_bnds
     assert rank_mock.called == do_ranks
 
-def test_special_phrases_command(temp_db, mock_func_factory):
+def test_special_phrases_command(temp_db, mock_func_factory, tokenizer_mock):
     func = mock_func_factory(nominatim.clicmd.special_phrases.SpecialPhrasesImporter, 'import_from_wiki')
 
     call_nominatim('special-phrases', '--import-from-wiki')
@@ -238,7 +266,6 @@ def test_special_phrases_command(temp_db, mock_func_factory):
                          ('postcodes', 'update_postcodes'),
                          ('word-counts', 'recompute_word_counts'),
                          ('address-levels', 'load_address_levels_from_file'),
-                         ('functions', 'create_functions'),
                          ('wiki-data', 'import_wikipedia_articles'),
                          ('importance', 'recompute_importance'),
                          ('website', 'setup_website'),
@@ -250,6 +277,14 @@ def test_refresh_command(mock_func_factory, temp_db, command, func):
     assert func_mock.called == 1
 
 
+def test_refresh_create_functions(mock_func_factory, temp_db, tokenizer_mock):
+    func_mock = mock_func_factory(nominatim.tools.refresh, 'create_functions')
+
+    assert 0 == call_nominatim('refresh', '--functions')
+    assert func_mock.called == 1
+    assert tokenizer_mock.update_sql_functions_called
+
+
 def test_refresh_importance_computed_after_wiki_import(monkeypatch, temp_db):
     calls = []
     monkeypatch.setattr(nominatim.tools.refresh, 'import_wikipedia_articles',
index a62ad1a4a69be28887755ea301aeb747788d4e4d..b95e6ede1787dd7619bee512e95a5694c1e30b02 100644 (file)
@@ -27,7 +27,29 @@ def call_nominatim(*args):
                                    cli_args=['replication'] + list(args))
 
 @pytest.fixture
-def index_mock(monkeypatch):
+def tokenizer_mock(monkeypatch):
+    class DummyTokenizer:
+        def __init__(self, *args, **kwargs):
+            self.update_sql_functions_called = False
+            self.finalize_import_called = False
+
+        def update_sql_functions(self, *args):
+            self.update_sql_functions_called = True
+
+        def finalize_import(self, *args):
+            self.finalize_import_called = True
+
+    tok = DummyTokenizer()
+    monkeypatch.setattr(nominatim.tokenizer.factory, 'get_tokenizer_for_db',
+                        lambda *args: tok)
+    monkeypatch.setattr(nominatim.tokenizer.factory, 'create_tokenizer',
+                        lambda *args: tok)
+
+    return tok
+
+
+@pytest.fixture
+def index_mock(monkeypatch, tokenizer_mock):
     mock = MockParamCapture()
     monkeypatch.setattr(nominatim.indexer.indexer.Indexer, 'index_boundaries', mock)
     monkeypatch.setattr(nominatim.indexer.indexer.Indexer, 'index_by_rank', mock)
@@ -52,7 +74,7 @@ def init_status(temp_db_conn, status_table):
 
 
 @pytest.fixture
-def update_mock(mock_func_factory, init_status):
+def update_mock(mock_func_factory, init_status, tokenizer_mock):
     return mock_func_factory(nominatim.tools.replication, 'update')
 
 @pytest.mark.parametrize("params,func", [
index 08a195bd2d794d9c6f9f01cd18116df64d466b42..6a254ef3b8e6943342168349dc04aa43598af80e 100644 (file)
@@ -24,7 +24,6 @@ def sql_factory(tmp_path):
     ("'{{db.partitions|join}}'", '012'),
     ("{% if 'country_name' in db.tables %}'yes'{% else %}'no'{% endif %}", "yes"),
     ("{% if 'xxx' in db.tables %}'yes'{% else %}'no'{% endif %}", "no"),
-    ("'{{config.DATABASE_MODULE_PATH}}'", '.')
     ])
 def test_load_file_simple(sql_preprocessor, sql_factory, temp_db_conn, temp_db_cursor, expr, ret):
     sqlfile = sql_factory("RETURN {};".format(expr))
index 6692eba61d349b8908415d793078fba17b0db3af..ff84e37964b660a33be2faa0908ba2ad0c1519a7 100644 (file)
@@ -6,6 +6,7 @@ import psycopg2
 import pytest
 
 from nominatim.indexer import indexer
+from nominatim.tokenizer import factory
 
 class IndexerTestDB:
 
@@ -17,6 +18,7 @@ class IndexerTestDB:
         self.conn = conn
         self.conn.set_isolation_level(0)
         with self.conn.cursor() as cur:
+            cur.execute('CREATE EXTENSION hstore')
             cur.execute("""CREATE TABLE placex (place_id BIGINT,
                                                 class TEXT,
                                                 type TEXT,
@@ -26,9 +28,14 @@ class IndexerTestDB:
                                                 indexed_date TIMESTAMP,
                                                 partition SMALLINT,
                                                 admin_level SMALLINT,
+                                                address HSTORE,
+                                                token_info JSONB,
                                                 geometry_sector INTEGER)""")
             cur.execute("""CREATE TABLE location_property_osmline (
                                place_id BIGINT,
+                               osm_id BIGINT,
+                               address HSTORE,
+                               token_info JSONB,
                                indexed_status SMALLINT,
                                indexed_date TIMESTAMP,
                                geometry_sector INTEGER)""")
@@ -46,6 +53,25 @@ class IndexerTestDB:
                              END IF;
                              RETURN NEW;
                            END; $$ LANGUAGE plpgsql;""")
+            cur.execute("""CREATE OR REPLACE FUNCTION placex_prepare_update(p placex,
+                                                      OUT name HSTORE,
+                                                      OUT address HSTORE,
+                                                      OUT country_feature VARCHAR)
+                           AS $$
+                           BEGIN
+                            address := p.address;
+                            name := p.address;
+                           END;
+                           $$ LANGUAGE plpgsql STABLE;
+                        """)
+            cur.execute("""CREATE OR REPLACE FUNCTION get_interpolation_address(in_address HSTORE, wayid BIGINT)
+                           RETURNS HSTORE AS $$
+                           BEGIN
+                             RETURN in_address;
+                           END;
+                           $$ LANGUAGE plpgsql STABLE;
+                        """)
+
             for table in ('placex', 'location_property_osmline', 'location_postcode'):
                 cur.execute("""CREATE TRIGGER {0}_update BEFORE UPDATE ON {0}
                                FOR EACH ROW EXECUTE PROCEDURE date_update()
@@ -76,9 +102,9 @@ class IndexerTestDB:
         next_id = next(self.osmline_id)
         with self.conn.cursor() as cur:
             cur.execute("""INSERT INTO location_property_osmline
-                              (place_id, indexed_status, geometry_sector)
-                              VALUES (%s, 1, %s)""",
-                        (next_id, sector))
+                              (place_id, osm_id, indexed_status, geometry_sector)
+                              VALUES (%s, %s, 1, %s)""",
+                        (next_id, next_id, sector))
         return next_id
 
     def add_postcode(self, country, postcode):
@@ -102,8 +128,14 @@ def test_db(temp_db_conn):
     yield IndexerTestDB(temp_db_conn)
 
 
+@pytest.fixture
+def test_tokenizer(tokenizer_mock, def_config, tmp_path):
+    def_config.project_dir = tmp_path
+    return factory.create_tokenizer(def_config)
+
+
 @pytest.mark.parametrize("threads", [1, 15])
-def test_index_all_by_rank(test_db, threads):
+def test_index_all_by_rank(test_db, threads, test_tokenizer):
     for rank in range(31):
         test_db.add_place(rank_address=rank, rank_search=rank)
     test_db.add_osmline()
@@ -111,7 +143,7 @@ def test_index_all_by_rank(test_db, threads):
     assert 31 == test_db.placex_unindexed()
     assert 1 == test_db.osmline_unindexed()
 
-    idx = indexer.Indexer('dbname=test_nominatim_python_unittest', threads)
+    idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, threads)
     idx.index_by_rank(0, 30)
 
     assert 0 == test_db.placex_unindexed()
@@ -142,7 +174,7 @@ def test_index_all_by_rank(test_db, threads):
 
 
 @pytest.mark.parametrize("threads", [1, 15])
-def test_index_partial_without_30(test_db, threads):
+def test_index_partial_without_30(test_db, threads, test_tokenizer):
     for rank in range(31):
         test_db.add_place(rank_address=rank, rank_search=rank)
     test_db.add_osmline()
@@ -150,7 +182,8 @@ def test_index_partial_without_30(test_db, threads):
     assert 31 == test_db.placex_unindexed()
     assert 1 == test_db.osmline_unindexed()
 
-    idx = indexer.Indexer('dbname=test_nominatim_python_unittest', threads)
+    idx = indexer.Indexer('dbname=test_nominatim_python_unittest',
+                          test_tokenizer, threads)
     idx.index_by_rank(4, 15)
 
     assert 19 == test_db.placex_unindexed()
@@ -162,7 +195,7 @@ def test_index_partial_without_30(test_db, threads):
 
 
 @pytest.mark.parametrize("threads", [1, 15])
-def test_index_partial_with_30(test_db, threads):
+def test_index_partial_with_30(test_db, threads, test_tokenizer):
     for rank in range(31):
         test_db.add_place(rank_address=rank, rank_search=rank)
     test_db.add_osmline()
@@ -170,7 +203,7 @@ def test_index_partial_with_30(test_db, threads):
     assert 31 == test_db.placex_unindexed()
     assert 1 == test_db.osmline_unindexed()
 
-    idx = indexer.Indexer('dbname=test_nominatim_python_unittest', threads)
+    idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, threads)
     idx.index_by_rank(28, 30)
 
     assert 27 == test_db.placex_unindexed()
@@ -181,7 +214,7 @@ def test_index_partial_with_30(test_db, threads):
                       WHERE indexed_status = 0 AND rank_address between 1 and 27""")
 
 @pytest.mark.parametrize("threads", [1, 15])
-def test_index_boundaries(test_db, threads):
+def test_index_boundaries(test_db, threads, test_tokenizer):
     for rank in range(4, 10):
         test_db.add_admin(rank_address=rank, rank_search=rank)
     for rank in range(31):
@@ -191,7 +224,7 @@ def test_index_boundaries(test_db, threads):
     assert 37 == test_db.placex_unindexed()
     assert 1 == test_db.osmline_unindexed()
 
-    idx = indexer.Indexer('dbname=test_nominatim_python_unittest', threads)
+    idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, threads)
     idx.index_boundaries(0, 30)
 
     assert 31 == test_db.placex_unindexed()
@@ -203,13 +236,13 @@ def test_index_boundaries(test_db, threads):
 
 
 @pytest.mark.parametrize("threads", [1, 15])
-def test_index_postcodes(test_db, threads):
+def test_index_postcodes(test_db, threads, test_tokenizer):
     for postcode in range(1000):
         test_db.add_postcode('de', postcode)
     for postcode in range(32000, 33000):
         test_db.add_postcode('us', postcode)
 
-    idx = indexer.Indexer('dbname=test_nominatim_python_unittest', threads)
+    idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, threads)
     idx.index_postcodes()
 
     assert 0 == test_db.scalar("""SELECT count(*) FROM location_postcode
@@ -217,7 +250,7 @@ def test_index_postcodes(test_db, threads):
 
 
 @pytest.mark.parametrize("analyse", [True, False])
-def test_index_full(test_db, analyse):
+def test_index_full(test_db, analyse, test_tokenizer):
     for rank in range(4, 10):
         test_db.add_admin(rank_address=rank, rank_search=rank)
     for rank in range(31):
@@ -226,7 +259,7 @@ def test_index_full(test_db, analyse):
     for postcode in range(1000):
         test_db.add_postcode('de', postcode)
 
-    idx = indexer.Indexer('dbname=test_nominatim_python_unittest', 4)
+    idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, 4)
     idx.index_full(analyse=analyse)
 
     assert 0 == test_db.placex_unindexed()
@@ -236,13 +269,13 @@ def test_index_full(test_db, analyse):
 
 
 @pytest.mark.parametrize("threads", [1, 15])
-def test_index_reopen_connection(test_db, threads, monkeypatch):
+def test_index_reopen_connection(test_db, threads, monkeypatch, test_tokenizer):
     monkeypatch.setattr(indexer.WorkerPool, "REOPEN_CONNECTIONS_AFTER", 15)
 
     for _ in range(1000):
         test_db.add_place(rank_address=30, rank_search=30)
 
-    idx = indexer.Indexer('dbname=test_nominatim_python_unittest', threads)
+    idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, threads)
     idx.index_by_rank(28, 30)
 
     assert 0 == test_db.placex_unindexed()
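Note: outside the test fixtures, indexing is now always driven through a tokenizer. A sketch of the full wiring (the config handling is assumed):

    from nominatim.indexer import indexer
    from nominatim.tokenizer import factory

    tokenizer = factory.get_tokenizer_for_db(config)     # config: assumed Configuration
    idx = indexer.Indexer(config.get_libpq_dsn(), tokenizer, 4)
    idx.index_full(analyse=True)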
diff --git a/test/python/test_tokenizer_factory.py b/test/python/test_tokenizer_factory.py
new file mode 100644 (file)
index 0000000..69517e9
--- /dev/null
@@ -0,0 +1,77 @@
+"""
+Tests for creating new tokenizers.
+"""
+import importlib
+import pytest
+
+from nominatim.db import properties
+from nominatim.tokenizer import factory
+from nominatim.errors import UsageError
+from dummy_tokenizer import DummyTokenizer
+
+@pytest.fixture
+def test_config(def_config, tmp_path):
+    def_config.project_dir = tmp_path
+    return def_config
+
+
+def test_setup_dummy_tokenizer(temp_db_conn, test_config,
+                               tokenizer_mock, property_table):
+    tokenizer = factory.create_tokenizer(test_config)
+
+    assert isinstance(tokenizer, DummyTokenizer)
+    assert tokenizer.init_state == "new"
+    assert (test_config.project_dir / 'tokenizer').is_dir()
+
+    assert properties.get_property(temp_db_conn, 'tokenizer') == 'dummy'
+
+
+def test_setup_tokenizer_dir_exists(test_config, tokenizer_mock, property_table):
+    (test_config.project_dir / 'tokenizer').mkdir()
+
+    tokenizer = factory.create_tokenizer(test_config)
+
+    assert isinstance(tokenizer, DummyTokenizer)
+    assert tokenizer.init_state == "new"
+
+
+def test_setup_tokenizer_dir_failure(test_config, tokenizer_mock, property_table):
+    (test_config.project_dir / 'tokenizer').write_text("foo")
+
+    with pytest.raises(UsageError):
+        factory.create_tokenizer(test_config)
+
+
+def test_setup_bad_tokenizer_name(test_config, monkeypatch):
+    monkeypatch.setenv('NOMINATIM_TOKENIZER', 'dummy')
+
+    with pytest.raises(UsageError):
+        factory.create_tokenizer(test_config)
+
+def test_load_tokenizer(temp_db_conn, test_config,
+                        tokenizer_mock, property_table):
+    factory.create_tokenizer(test_config)
+
+    tokenizer = factory.get_tokenizer_for_db(test_config)
+
+    assert isinstance(tokenizer, DummyTokenizer)
+    assert tokenizer.init_state == "loaded"
+
+
+def test_load_no_tokenizer_dir(test_config, tokenizer_mock, property_table):
+    factory.create_tokenizer(test_config)
+
+    test_config.project_dir = test_config.project_dir / 'foo'
+
+    with pytest.raises(UsageError):
+        factory.get_tokenizer_for_db(test_config)
+
+
+def test_load_missing_property(temp_db_cursor, test_config, tokenizer_mock, property_table):
+    factory.create_tokenizer(test_config)
+
+    temp_db_cursor.execute("TRUNCATE TABLE nominatim_properties")
+
+    with pytest.raises(UsageError):
+        factory.get_tokenizer_for_db(test_config)
+
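
The tests above pin down the factory contract: create_tokenizer() instantiates the implementation named by NOMINATIM_TOKENIZER, creates a 'tokenizer' directory inside the project directory and records the chosen name under the 'tokenizer' key of the nominatim_properties table; get_tokenizer_for_db() later re-creates the same tokenizer from that property. A usage sketch, where `config` stands for a loaded Nominatim configuration with project_dir set:

    from nominatim.tokenizer import factory

    tokenizer = factory.create_tokenizer(config)      # during import/setup
    # ... later, e.g. before indexing updates ...
    tokenizer = factory.get_tokenizer_for_db(config)  # restored from the DB property
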
diff --git a/test/python/test_tokenizer_legacy.py b/test/python/test_tokenizer_legacy.py
new file mode 100644 (file)
index 0000000..58bf5a5
--- /dev/null
@@ -0,0 +1,298 @@
+"""
+Tests for the legacy tokenizer.
+"""
+import shutil
+
+import pytest
+
+from nominatim.tokenizer import legacy_tokenizer
+from nominatim.db import properties
+from nominatim.errors import UsageError
+
+@pytest.fixture
+def test_config(def_config, tmp_path):
+    def_config.project_dir = tmp_path / 'project'
+    def_config.project_dir.mkdir()
+
+    module_dir = tmp_path / 'module_src'
+    module_dir.mkdir()
+    (module_dir / 'nominatim.so').write_text('TEST nominatim.so')
+
+    def_config.lib_dir.module = module_dir
+
+    sqldir = tmp_path / 'sql'
+    sqldir.mkdir()
+    (sqldir / 'tokenizer').mkdir()
+    (sqldir / 'tokenizer' / 'legacy_tokenizer.sql').write_text("SELECT 'a'")
+    (sqldir / 'words.sql').write_text("SELECT 'a'")
+    shutil.copy(str(def_config.lib_dir.sql / 'tokenizer' / 'legacy_tokenizer_tables.sql'),
+                str(sqldir / 'tokenizer' / 'legacy_tokenizer_tables.sql'))
+
+    def_config.lib_dir.sql = sqldir
+    def_config.lib_dir.data = sqldir
+
+    return def_config
+
+
+@pytest.fixture
+def tokenizer_factory(dsn, tmp_path, monkeypatch, property_table):
+    (tmp_path / 'tokenizer').mkdir()
+
+    def _maker():
+        return legacy_tokenizer.create(dsn, tmp_path / 'tokenizer')
+
+    return _maker
+
+@pytest.fixture
+def tokenizer_setup(tokenizer_factory, test_config, monkeypatch, sql_preprocessor):
+    monkeypatch.setattr(legacy_tokenizer, '_check_module', lambda m, c: None)
+    tok = tokenizer_factory()
+    tok.init_new_db(test_config)
+
+
+@pytest.fixture
+def analyzer(tokenizer_factory, test_config, monkeypatch, sql_preprocessor,
+             word_table, temp_db_with_extensions, tmp_path):
+    sql = tmp_path / 'sql' / 'tokenizer' / 'legacy_tokenizer.sql'
+    sql.write_text("""
+        CREATE OR REPLACE FUNCTION getorcreate_housenumber_id(lookup_word TEXT)
+          RETURNS INTEGER AS $$ SELECT 342; $$ LANGUAGE SQL;
+        """)
+
+    monkeypatch.setattr(legacy_tokenizer, '_check_module', lambda m, c: None)
+    monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', ':: lower();')
+    tok = tokenizer_factory()
+    tok.init_new_db(test_config)
+    monkeypatch.undo()
+
+    with tok.name_analyzer() as analyzer:
+        yield analyzer
+
+
+@pytest.fixture
+def make_standard_name(temp_db_cursor):
+    temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION make_standard_name(name TEXT)
+                              RETURNS TEXT AS $$ SELECT ' ' || name; $$ LANGUAGE SQL""")
+
+
+@pytest.fixture
+def create_postcode_id(table_factory, temp_db_cursor):
+    table_factory('out_postcode_table', 'postcode TEXT')
+
+    temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION create_postcode_id(postcode TEXT)
+                              RETURNS BOOLEAN AS $$
+                              INSERT INTO out_postcode_table VALUES (postcode) RETURNING True;
+                              $$ LANGUAGE SQL""")
+
+
+@pytest.fixture
+def create_housenumbers(temp_db_cursor):
+    temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION create_housenumbers(
+                                  housenumbers TEXT[],
+                                  OUT tokens TEXT, OUT normtext TEXT)
+                              AS $$
+                              SELECT housenumbers::TEXT, array_to_string(housenumbers, ';')
+                              $$ LANGUAGE SQL""")
+
+
+@pytest.fixture
+def make_keywords(temp_db_cursor, temp_db_with_extensions):
+    temp_db_cursor.execute(
+        """CREATE OR REPLACE FUNCTION make_keywords(names HSTORE)
+           RETURNS INTEGER[] AS $$ SELECT ARRAY[1, 2, 3] $$ LANGUAGE SQL""")
+
+def test_init_new(tokenizer_factory, test_config, monkeypatch,
+                  temp_db_conn, sql_preprocessor):
+    monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', 'xxvv')
+    monkeypatch.setattr(legacy_tokenizer, '_check_module', lambda m, c: None)
+
+    tok = tokenizer_factory()
+    tok.init_new_db(test_config)
+
+    assert properties.get_property(temp_db_conn, legacy_tokenizer.DBCFG_NORMALIZATION) == 'xxvv'
+
+    outfile = test_config.project_dir / 'module' / 'nominatim.so'
+
+    assert outfile.exists()
+    assert outfile.read_text() == 'TEST nominatim.so'
+    assert outfile.stat().st_mode == 33261
+
+
+def test_init_module_load_failed(tokenizer_factory, test_config,
+                                 monkeypatch, temp_db_conn):
+    tok = tokenizer_factory()
+
+    with pytest.raises(UsageError):
+        tok.init_new_db(test_config)
+
+
+def test_init_module_custom(tokenizer_factory, test_config,
+                            monkeypatch, tmp_path, sql_preprocessor):
+    module_dir = (tmp_path / 'custom').resolve()
+    module_dir.mkdir()
+    (module_dir / 'nominatim.so').write_text('CUSTOM nominatim.so')
+
+    monkeypatch.setenv('NOMINATIM_DATABASE_MODULE_PATH', str(module_dir))
+    monkeypatch.setattr(legacy_tokenizer, '_check_module', lambda m, c: None)
+
+    tok = tokenizer_factory()
+    tok.init_new_db(test_config)
+
+    assert not (test_config.project_dir / 'module').exists()
+
+
+def test_init_from_project(tokenizer_setup, tokenizer_factory):
+    tok = tokenizer_factory()
+
+    tok.init_from_project()
+
+    assert tok.normalization is not None
+
+
+def test_update_sql_functions(sql_preprocessor, temp_db_conn,
+                              tokenizer_factory, test_config, table_factory,
+                              monkeypatch, temp_db_cursor):
+    monkeypatch.setenv('NOMINATIM_MAX_WORD_FREQUENCY', '1133')
+    monkeypatch.setattr(legacy_tokenizer, '_check_module', lambda m, c: None)
+    tok = tokenizer_factory()
+    tok.init_new_db(test_config)
+    monkeypatch.undo()
+
+    assert properties.get_property(temp_db_conn, legacy_tokenizer.DBCFG_MAXWORDFREQ) == '1133'
+
+    table_factory('test', 'txt TEXT')
+
+    func_file = test_config.lib_dir.sql / 'tokenizer' / 'legacy_tokenizer.sql'
+    func_file.write_text("""INSERT INTO test VALUES ('{{max_word_freq}}'),
+                                                   ('{{modulepath}}')""")
+
+    tok.update_sql_functions(test_config)
+
+    test_content = temp_db_cursor.row_set('SELECT * FROM test')
+    assert test_content == set((('1133', ), (str(test_config.project_dir / 'module'), )))
+
+
+def test_migrate_database(tokenizer_factory, test_config, temp_db_conn, monkeypatch):
+    monkeypatch.setattr(legacy_tokenizer, '_check_module', lambda m, c: None)
+    tok = tokenizer_factory()
+    tok.migrate_database(test_config)
+
+    assert properties.get_property(temp_db_conn, legacy_tokenizer.DBCFG_MAXWORDFREQ) is not None
+    assert properties.get_property(temp_db_conn, legacy_tokenizer.DBCFG_NORMALIZATION) is not None
+
+    outfile = test_config.project_dir / 'module' / 'nominatim.so'
+
+    assert outfile.exists()
+    assert outfile.read_text() == 'TEST nominatim.so'
+    assert outfile.stat().st_mode == 33261
+
+
+def test_normalize(analyzer):
+    assert analyzer.normalize('TEsT') == 'test'
+
+
+def test_add_postcodes_from_db(analyzer, table_factory, temp_db_cursor,
+                               create_postcode_id):
+    table_factory('location_postcode', 'postcode TEXT',
+                  content=(('1234',), ('12 34',), ('AB23',), ('1234',)))
+
+    analyzer.add_postcodes_from_db()
+
+    assert temp_db_cursor.row_set("SELECT * from out_postcode_table") \
+               == set((('1234', ), ('12 34', ), ('AB23',)))
+
+
+def test_update_special_phrase_empty_table(analyzer, word_table, temp_db_cursor,
+                                           make_standard_name):
+    analyzer.update_special_phrases([
+        ("König bei", "amenity", "royal", "near"),
+        ("Könige", "amenity", "royal", "-"),
+        ("strasse", "highway", "primary", "in")
+    ])
+
+    assert temp_db_cursor.row_set("""SELECT word_token, word, class, type, operator
+                                     FROM word WHERE class != 'place'""") \
+               == set(((' könig bei', 'könig bei', 'amenity', 'royal', 'near'),
+                       (' könige', 'könige', 'amenity', 'royal', None),
+                       (' strasse', 'strasse', 'highway', 'primary', 'in')))
+
+
+def test_update_special_phrase_delete_all(analyzer, word_table, temp_db_cursor,
+                                          make_standard_name):
+    temp_db_cursor.execute("""INSERT INTO word (word_token, word, class, type, operator)
+                              VALUES (' foo', 'foo', 'amenity', 'prison', 'in'),
+                                     (' bar', 'bar', 'highway', 'road', null)""")
+
+    assert 2 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'")
+
+    analyzer.update_special_phrases([])
+
+    assert 0 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'")
+
+
+def test_update_special_phrase_modify(analyzer, word_table, temp_db_cursor,
+                                      make_standard_name):
+    temp_db_cursor.execute("""INSERT INTO word (word_token, word, class, type, operator)
+                              VALUES (' foo', 'foo', 'amenity', 'prison', 'in'),
+                                     (' bar', 'bar', 'highway', 'road', null)""")
+
+    assert 2 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'")
+
+    analyzer.update_special_phrases([
+        ('prison', 'amenity', 'prison', 'in'),
+        ('bar', 'highway', 'road', '-'),
+        ('garden', 'leisure', 'garden', 'near')
+    ])
+
+    assert temp_db_cursor.row_set("""SELECT word_token, word, class, type, operator
+                                     FROM word WHERE class != 'place'""") \
+               == set(((' prison', 'prison', 'amenity', 'prison', 'in'),
+                       (' bar', 'bar', 'highway', 'road', None),
+                       (' garden', 'garden', 'leisure', 'garden', 'near')))
+
+
+def test_process_place_names(analyzer, make_keywords):
+
+    info = analyzer.process_place({'name' : {'name' : 'Soft bAr', 'ref': '34'}})
+
+    assert info['names'] == '{1,2,3}'
+
+
+@pytest.mark.parametrize('pc', ['12345', 'AB 123', '34-345'])
+def test_process_place_postcode(analyzer, temp_db_cursor, create_postcode_id, pc):
+
+    info = analyzer.process_place({'address': {'postcode' : pc}})
+
+    assert temp_db_cursor.row_set("SELECT * from out_postcode_table") \
+               == set(((pc, ),))
+
+
+@pytest.mark.parametrize('pc', ['12:23', 'ab;cd;f', '123;836'])
+def test_process_place_bad_postcode(analyzer, temp_db_cursor, create_postcode_id,
+                                    pc):
+
+    info = analyzer.process_place({'address': {'postcode' : pc}})
+
+    assert 0 == temp_db_cursor.scalar("SELECT count(*) from out_postcode_table")
+
+
+@pytest.mark.parametrize('hnr', ['123a', '1', '101'])
+def test_process_place_housenumbers_simple(analyzer, create_housenumbers, hnr):
+    info = analyzer.process_place({'address': {'housenumber' : hnr}})
+
+    assert info['hnr'] == hnr
+    assert info['hnr_tokens'].startswith("{")
+
+
+def test_process_place_housenumbers_lists(analyzer, create_housenumbers):
+    info = analyzer.process_place({'address': {'conscriptionnumber' : '1; 2;3'}})
+
+    assert set(info['hnr'].split(';')) == set(('1', '2', '3'))
+
+
+def test_process_place_housenumbers_duplicates(analyzer, create_housenumbers):
+    info = analyzer.process_place({'address': {'housenumber' : '134',
+                                               'conscriptionnumber' : '134',
+                                               'streetnumber' : '99a'}})
+
+    assert set(info['hnr'].split(';')) == set(('134', '99a'))
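
Two details implicit in the assertions above: the st_mode value 33261 is 0o100755, i.e. a regular file with execute permissions, and process_place() returns a plain dict whose keys, as far as these tests show, include 'names', 'hnr' and 'hnr_tokens'. A sketch of a call against the stubbed SQL functions from the analyzer fixture (the commented values are what the assertions imply, not a full specification of the return format):

    with tok.name_analyzer() as analyzer:
        info = analyzer.process_place({'name': {'name': 'Soft bAr'},
                                       'address': {'housenumber': '134;99a'}})
        # info['names']      -> token ids of the name terms, e.g. '{1,2,3}'
        # info['hnr']        -> normalised house numbers, e.g. '134;99a'
        # info['hnr_tokens'] -> PostgreSQL array literal of house-number tokens
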
index 68b376a781c585b417c09a441e3f7485d7d231fa..53001c271691ed5d1d39571c25346203db73a4aa 100644 (file)
@@ -43,8 +43,22 @@ def test_check_placex_table_size_bad(temp_db_cursor, temp_db_conn, def_config):
     assert chkdb.check_placex_size(temp_db_conn, def_config) == chkdb.CheckState.FATAL
 
 
-def test_check_module_bad(temp_db_conn, def_config):
-    assert chkdb.check_module(temp_db_conn, def_config) == chkdb.CheckState.FAIL
+def test_check_tokenizer_missing(temp_db_conn, def_config, tmp_path):
+    def_config.project_dir = tmp_path
+    assert chkdb.check_tokenizer(temp_db_conn, def_config) == chkdb.CheckState.FAIL
+
+
+@pytest.mark.parametrize("check_result,state", [(None, chkdb.CheckState.OK),
+                                                ("Something wrong", chkdb.CheckState.FAIL)])
+def test_check_tokenizer(tokenizer_mock, temp_db_conn, def_config, monkeypatch,
+                         check_result, state):
+    class _TestTokenizer:
+        def check_database(self):
+            return check_result
+
+    monkeypatch.setattr(chkdb.tokenizer_factory, 'get_tokenizer_for_db',
+                         lambda *a, **k: _TestTokenizer())
+    assert chkdb.check_tokenizer(temp_db_conn, def_config) == state
 
 
 def test_check_indexing_good(temp_db_cursor, temp_db_conn, def_config):
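
The replaced module check is delegated to the tokenizer itself: per the parametrization above, check_database() returns None when the setup is healthy (mapped to CheckState.OK) and a human-readable problem description otherwise (mapped to CheckState.FAIL). For this check a tokenizer therefore only needs something like:

    class SomeTokenizer:
        def check_database(self):
            # None means "all fine"; any string is reported as the failure cause
            return None
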
index 280ca704ce7c8419b6b745390401d285ef29bf16..ceac7a2421dc43afd23ecc1cf3a4c3066c7b02f1 100644 (file)
@@ -80,39 +80,6 @@ def test_setup_extensions_old_postgis(temp_db_conn, monkeypatch):
         database_import.setup_extensions(temp_db_conn)
 
 
-def test_install_module(tmp_path):
-    src_dir = tmp_path / 'source'
-    src_dir.mkdir()
-    (src_dir / 'nominatim.so').write_text('TEST nomiantim.so')
-
-    project_dir = tmp_path / 'project'
-    project_dir.mkdir()
-
-    database_import.install_module(src_dir, project_dir, '')
-
-    outfile = project_dir / 'module' / 'nominatim.so'
-
-    assert outfile.exists()
-    assert outfile.read_text() == 'TEST nomiantim.so'
-    assert outfile.stat().st_mode == 33261
-
-
-def test_install_module_custom(tmp_path):
-    (tmp_path / 'nominatim.so').write_text('TEST nomiantim.so')
-
-    database_import.install_module(tmp_path, tmp_path, str(tmp_path.resolve()))
-
-    assert not (tmp_path / 'module').exists()
-
-
-def test_install_module_fail_access(temp_db_conn, tmp_path):
-    (tmp_path / 'nominatim.so').write_text('TEST nomiantim.so')
-
-    with pytest.raises(UsageError, match='.*module cannot be accessed.*'):
-        database_import.install_module(tmp_path, tmp_path, '',
-                                       conn=temp_db_conn)
-
-
 def test_import_base_data(src_dir, temp_db, temp_db_cursor):
     temp_db_cursor.execute('CREATE EXTENSION hstore')
     temp_db_cursor.execute('CREATE EXTENSION postgis')
@@ -171,14 +138,15 @@ def test_import_osm_data_default_cache(temp_db_cursor,osm2pgsql_options):
 
 
 def test_truncate_database_tables(temp_db_conn, temp_db_cursor, table_factory):
-    tables = ('word', 'placex', 'place_addressline', 'location_area',
+    tables = ('placex', 'place_addressline', 'location_area',
               'location_area_country',
               'location_property_tiger', 'location_property_osmline',
               'location_postcode', 'search_name', 'location_road_23')
     for table in tables:
-        table_factory(table, content=(1, 2, 3))
+        table_factory(table, content=((1, ), (2, ), (3, )))
+        assert temp_db_cursor.table_rows(table) == 3
 
-    database_import.truncate_data_tables(temp_db_conn, max_word_frequency=23)
+    database_import.truncate_data_tables(temp_db_conn)
 
     for table in tables:
         assert temp_db_cursor.table_rows(table) == 0
@@ -196,36 +164,33 @@ def test_load_data(dsn, src_dir, place_row, placex_table, osmline_table, word_ta
     place_row(osm_type='W', osm_id=342, cls='place', typ='houses',
               geom='SRID=4326;LINESTRING(0 0, 10 10)')
 
-    database_import.load_data(dsn, src_dir / 'data', threads)
+    database_import.load_data(dsn, threads)
 
     assert temp_db_cursor.table_rows('placex') == 30
     assert temp_db_cursor.table_rows('location_property_osmline') == 1
 
-@pytest.mark.parametrize("languages", (False, True))
-def test_create_country_names(temp_db_conn, temp_db_cursor, def_config,
-                              temp_db_with_extensions, monkeypatch, languages):
-    if languages:
-        monkeypatch.setenv('NOMINATIM_LANGUAGES', 'fr,en')
-    temp_db_cursor.execute("""CREATE FUNCTION make_standard_name (name TEXT)
-                                  RETURNS TEXT AS $$ SELECT 'a'::TEXT $$ LANGUAGE SQL
-                               """)
-    temp_db_cursor.execute('CREATE TABLE country_name (country_code varchar(2), name hstore)')
-    temp_db_cursor.execute('CREATE TABLE word (code varchar(2))')
-    temp_db_cursor.execute("""INSERT INTO country_name VALUES ('us',
-                              '"name"=>"us","name:af"=>"us"')""")
-    temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION getorcreate_country(lookup_word TEXT,
-                            lookup_country_code varchar(2))
-                            RETURNS INTEGER
-                            AS $$
-                            BEGIN
-                                INSERT INTO word VALUES (lookup_country_code);
-                                RETURN 5;
-                            END;
-                            $$
-                            LANGUAGE plpgsql;
-                               """)
-    database_import.create_country_names(temp_db_conn, def_config)
+
+@pytest.mark.parametrize("languages", (None, ' fr,en'))
+def test_create_country_names(temp_db_with_extensions, temp_db_conn, temp_db_cursor,
+                              table_factory, tokenizer_mock, languages):
+
+    table_factory('country_name', 'country_code varchar(2), name hstore',
+                  content=(('us', '"name"=>"us1","name:af"=>"us2"'),
+                           ('fr', '"name"=>"Fra", "name:en"=>"Fren"')))
+
+    assert temp_db_cursor.scalar("SELECT count(*) FROM country_name") == 2
+
+    tokenizer = tokenizer_mock()
+
+    database_import.create_country_names(temp_db_conn, tokenizer, languages)
+
+    assert len(tokenizer.analyser_cache['countries']) == 2
+
+    result_set = {k: set(v) for k, v in tokenizer.analyser_cache['countries']}
+
     if languages:
-        assert temp_db_cursor.table_rows('word') == 4
+        assert result_set == {'us' : set(('us', 'us1', 'United States')),
+                              'fr' : set(('fr', 'Fra', 'Fren'))}
     else:
-        assert temp_db_cursor.table_rows('word') == 5
+        assert result_set == {'us' : set(('us', 'us1', 'us2', 'United States')),
+                              'fr' : set(('fr', 'Fra', 'Fren'))}
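
create_country_names() now hands the collected names to the tokenizer instead of calling SQL word functions directly. Judging from the expected sets above, each country contributes its country code and its name entries, with name:XX variants filtered by the optional comma-separated language list, and 'United States' appears to be mixed in as a built-in fallback for 'us'. A call sketch, with `conn` an open connection and `tokenizer` any tokenizer instance:

    database_import.create_country_names(conn, tokenizer, 'fr,en')
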
index cb82f6b78c3d6ad53333673773dd9294ce4c2493..24b3318da37d86db31535dbb427f8081dcb5f44b 100644 (file)
@@ -11,41 +11,6 @@ from nominatim.tools import SpecialPhrasesImporter
 
 TEST_BASE_DIR = Path(__file__) / '..' / '..'
 
-def test_fetch_existing_words_phrases_basic(special_phrases_importer, word_table,
-                                            temp_db_cursor):
-    """
-        Check for the fetch_existing_words_phrases() method.
-        It should return special phrase term added to the word
-        table.
-    """
-    query ="""
-        INSERT INTO word VALUES(99999, 'lookup_token', 'normalized_word',
-        'class', 'type', null, 0, 'near');
-    """
-    temp_db_cursor.execute(query)
-
-    assert not special_phrases_importer.words_phrases_to_delete
-    special_phrases_importer._fetch_existing_words_phrases()
-    contained_phrase = special_phrases_importer.words_phrases_to_delete.pop()
-    assert contained_phrase == ('normalized_word', 'class', 'type', 'near')
-
-@pytest.mark.parametrize("house_type", ['house', 'postcode'])
-def test_fetch_existing_words_phrases_special_cases(special_phrases_importer, word_table,
-                                                    house_type, temp_db_cursor):
-    """
-        Check for the fetch_existing_words_phrases() method.
-        It should return nothing as the terms added correspond
-        to a housenumber and postcode term.
-    """
-    query ="""
-        INSERT INTO word VALUES(99999, 'lookup_token', 'normalized_word',
-        'place', %s, null, 0, 'near');
-    """
-    temp_db_cursor.execute(query, (house_type,))
-
-    special_phrases_importer._fetch_existing_words_phrases()
-    assert not special_phrases_importer.words_phrases_to_delete
-
 def test_fetch_existing_place_classtype_tables(special_phrases_importer, temp_db_cursor):
     """
         Check for the fetch_existing_place_classtype_tables() method.
@@ -118,41 +83,11 @@ def test_convert_settings_giving_json(special_phrases_importer):
         the same path is directly returned
     """
     json_file = (TEST_BASE_DIR / 'testfiles' / 'phrase_settings.json').resolve()
-    
+
     returned = special_phrases_importer._convert_php_settings_if_needed(json_file)
 
     assert returned == json_file
 
-def test_process_amenity_with_operator(special_phrases_importer, getorcreate_amenityoperator_funcs,
-                                       temp_db_conn, word_table):
-    """
-        Test that _process_amenity() execute well the 
-        getorcreate_amenityoperator() SQL function and that
-        the 2 differents operators are well handled.
-    """
-    special_phrases_importer._process_amenity('', '', '', '', 'near')
-    special_phrases_importer._process_amenity('', '', '', '', 'in')
-
-    with temp_db_conn.cursor() as temp_db_cursor:
-        temp_db_cursor.execute("SELECT * FROM word WHERE operator='near' OR operator='in'")
-        results = temp_db_cursor.fetchall()
-
-    assert len(results) == 2
-
-def test_process_amenity_without_operator(special_phrases_importer, getorcreate_amenity_funcs,
-                                          temp_db_conn, word_table):
-    """
-        Test that _process_amenity() execute well the
-        getorcreate_amenity() SQL function.
-    """
-    special_phrases_importer._process_amenity('', '', '', '', '')
-
-    with temp_db_conn.cursor() as temp_db_cursor:
-        temp_db_cursor.execute("SELECT * FROM word WHERE operator='no_operator'")
-        result = temp_db_cursor.fetchone()
-
-    assert result
-
 def test_create_place_classtype_indexes(temp_db_conn, special_phrases_importer):
     """
         Test that _create_place_classtype_indexes() create the
@@ -215,8 +150,7 @@ def test_create_place_classtype_table_and_indexes(
         assert check_placeid_and_centroid_indexes(temp_db_conn, pair[0], pair[1])
         assert check_grant_access(temp_db_conn, def_config.DATABASE_WEBUSER, pair[0], pair[1])
 
-def test_process_xml_content(temp_db_conn, def_config, special_phrases_importer, word_table,
-                             getorcreate_amenity_funcs, getorcreate_amenityoperator_funcs):
+def test_process_xml_content(temp_db_conn, def_config, special_phrases_importer):
     """
-        Test that _process_xml_content() process the given xml content right
-        by executing the right SQL functions for amenities and 
+        Test that _process_xml_content() processes the given xml content right
+        by executing the right SQL functions for amenities and
@@ -228,11 +162,9 @@ def test_process_xml_content(temp_db_conn, def_config, special_phrases_importer,
     #Converted output set to a dict for easy assert further.
     results = dict(special_phrases_importer._process_xml_content(get_test_xml_wiki_content(), 'en'))
 
-    assert check_amenities_with_op(temp_db_conn)
-    assert check_amenities_without_op(temp_db_conn)
     assert results[class_test] and type_test in results.values()
 
-def test_remove_non_existent_phrases_from_db(special_phrases_importer, default_phrases,
+def test_remove_non_existent_tables_from_db(special_phrases_importer, default_phrases,
                                              temp_db_conn):
     """
-        Check for the remove_non_existent_phrases_from_db() method.
+        Check for the remove_non_existent_tables_from_db() method.
@@ -245,22 +177,10 @@ def test_remove_non_existent_phrases_from_db(special_phrases_importer, default_p
         be deleted.
     """
     with temp_db_conn.cursor() as temp_db_cursor:
-        to_delete_phrase_tuple = ('normalized_word', 'class', 'type', 'near')
-        to_keep_phrase_tuple = (
-            'normalized_word_exists', 'class_exists', 'type_exists', 'near'
-        )
-        special_phrases_importer.words_phrases_to_delete = {
-            to_delete_phrase_tuple,
-            to_keep_phrase_tuple
-        }
-        special_phrases_importer.words_phrases_still_exist = {
-            to_keep_phrase_tuple
-        }
         special_phrases_importer.table_phrases_to_delete = {
             'place_classtype_testclasstypetable_to_delete'
         }
 
-        query_words = 'SELECT word, class, type, operator FROM word;'
         query_tables = """
             SELECT table_name
             FROM information_schema.tables
@@ -268,21 +188,16 @@ def test_remove_non_existent_phrases_from_db(special_phrases_importer, default_p
             AND table_name like 'place_classtype_%';
         """
 
-        special_phrases_importer._remove_non_existent_phrases_from_db()
+        special_phrases_importer._remove_non_existent_tables_from_db()
 
-        temp_db_cursor.execute(query_words)
-        words_result = temp_db_cursor.fetchall()
         temp_db_cursor.execute(query_tables)
         tables_result = temp_db_cursor.fetchall()
-        assert len(words_result) == 1 and words_result[0] == [
-            'normalized_word_exists', 'class_exists', 'type_exists', 'near'
-        ]
         assert (len(tables_result) == 1 and
             tables_result[0][0] == 'place_classtype_testclasstypetable_to_keep'
         )
 
-def test_import_from_wiki(monkeypatch, temp_db_conn, def_config, special_phrases_importer, placex_table, 
-                          getorcreate_amenity_funcs, getorcreate_amenityoperator_funcs, word_table):
+def test_import_from_wiki(monkeypatch, temp_db_conn, def_config, special_phrases_importer,
+                          placex_table, tokenizer_mock):
     """
-        Check that the main import_from_wiki() method is well executed.
+        Check that the main import_from_wiki() method works as expected.
         It should create the place_classtype table, the place_id and centroid indexes,
@@ -294,17 +209,14 @@ def test_import_from_wiki(monkeypatch, temp_db_conn, def_config, special_phrases
     #what is deleted and what is preserved.
     with temp_db_conn.cursor() as temp_db_cursor:
         temp_db_cursor.execute("""
-            INSERT INTO word VALUES(99999, ' animal shelter', 'animal shelter',
-            'amenity', 'animal_shelter', null, 0, null);
-
-            INSERT INTO word VALUES(99999, ' wrong_lookup_token', 'wrong_normalized_word',
-            'wrong_class', 'wrong_type', null, 0, 'near');
-
             CREATE TABLE place_classtype_amenity_animal_shelter();
             CREATE TABLE place_classtype_wrongclass_wrongtype();""")
 
     monkeypatch.setattr('nominatim.tools.SpecialPhrasesImporter._get_wiki_content', mock_get_wiki_content)
-    special_phrases_importer.import_from_wiki(['en'])
+    tokenizer = tokenizer_mock()
+    special_phrases_importer.import_from_wiki(tokenizer, ['en'])
+
+    assert len(tokenizer.analyser_cache['special_phrases']) == 18
 
     class_test = 'aerialway'
     type_test = 'zip_line'
@@ -312,22 +224,12 @@ def test_import_from_wiki(monkeypatch, temp_db_conn, def_config, special_phrases
     assert check_table_exist(temp_db_conn, class_test, type_test)
     assert check_placeid_and_centroid_indexes(temp_db_conn, class_test, type_test)
     assert check_grant_access(temp_db_conn, def_config.DATABASE_WEBUSER, class_test, type_test)
-    assert check_amenities_with_op(temp_db_conn)
-    assert check_amenities_without_op(temp_db_conn)
     assert check_table_exist(temp_db_conn, 'amenity', 'animal_shelter')
     assert not check_table_exist(temp_db_conn, 'wrong_class', 'wrong_type')
 
     #Format (query, should_return_something_bool) use to easily execute all asserts
     queries_tests = set()
 
-    #Used to check that the correct phrase already in the word table before is still there.
-    query_correct_word = "SELECT * FROM word WHERE word = 'animal shelter'"
-    queries_tests.add((query_correct_word, True))
-
-    #Used to check if wrong phrase was deleted from the word table of the database.
-    query_wrong_word = "SELECT word FROM word WHERE word = 'wrong_normalized_word'"
-    queries_tests.add((query_wrong_word, False))
-
-    #Used to check that correct place_classtype table already in the datase before is still there.
+    #Used to check that a place_classtype table that already existed in the database is still there.
     query_existing_table = """
         SELECT table_name
@@ -412,24 +314,6 @@ def check_placeid_and_centroid_indexes(temp_db_conn, phrase_class, phrase_type):
         temp_db_conn.index_exists(index_prefix + 'place_id')
     )
 
-def check_amenities_with_op(temp_db_conn):
-    """
-        Check that the test table for the SQL function getorcreate_amenityoperator()
-        contains more than one value (so that the SQL function was call more than one time).
-    """
-    with temp_db_conn.cursor() as temp_db_cursor:
-        temp_db_cursor.execute("SELECT * FROM word WHERE operator != 'no_operator'")
-        return len(temp_db_cursor.fetchall()) > 1
-
-def check_amenities_without_op(temp_db_conn):
-    """
-        Check that the test table for the SQL function getorcreate_amenity()
-        contains more than one value (so that the SQL function was call more than one time).
-    """
-    with temp_db_conn.cursor() as temp_db_cursor:
-        temp_db_cursor.execute("SELECT * FROM word WHERE operator = 'no_operator'")
-        return len(temp_db_cursor.fetchall()) > 1
-
 @pytest.fixture
 def special_phrases_importer(temp_db_conn, def_config, temp_phplib_dir_with_migration):
     """
@@ -453,48 +337,7 @@ def temp_phplib_dir_with_migration():
         yield Path(phpdir)
 
 @pytest.fixture
-def default_phrases(word_table, temp_db_cursor):
+def default_phrases(temp_db_cursor):
     temp_db_cursor.execute("""
-        INSERT INTO word VALUES(99999, 'lookup_token', 'normalized_word',
-        'class', 'type', null, 0, 'near');
-
-        INSERT INTO word VALUES(99999, 'lookup_token', 'normalized_word_exists',
-        'class_exists', 'type_exists', null, 0, 'near');
-
         CREATE TABLE place_classtype_testclasstypetable_to_delete();
         CREATE TABLE place_classtype_testclasstypetable_to_keep();""")
-
-@pytest.fixture
-def make_strandard_name_func(temp_db_cursor):
-    temp_db_cursor.execute("""
-        CREATE OR REPLACE FUNCTION make_standard_name(name TEXT) RETURNS TEXT AS $$
-        BEGIN
-        RETURN trim(name); --Basically return only the trimed name for the tests
-        END;
-        $$ LANGUAGE plpgsql IMMUTABLE;""")
-        
-@pytest.fixture
-def getorcreate_amenity_funcs(temp_db_cursor, make_strandard_name_func):
-    temp_db_cursor.execute("""
-        CREATE OR REPLACE FUNCTION getorcreate_amenity(lookup_word TEXT, normalized_word TEXT,
-                                                    lookup_class text, lookup_type text)
-        RETURNS void as $$
-        BEGIN
-            INSERT INTO word VALUES(null, lookup_word, normalized_word,
-            lookup_class, lookup_type, null, 0, 'no_operator');
-        END;
-        $$ LANGUAGE plpgsql""")
-
-@pytest.fixture
-def getorcreate_amenityoperator_funcs(temp_db_cursor, make_strandard_name_func):
-    temp_db_cursor.execute("""
-        CREATE TABLE temp_with_operator(op TEXT);
-
-        CREATE OR REPLACE FUNCTION getorcreate_amenityoperator(lookup_word TEXT, normalized_word TEXT,
-                                                    lookup_class text, lookup_type text, op text)
-        RETURNS void as $$
-        BEGIN 
-            INSERT INTO word VALUES(null, lookup_word, normalized_word,
-            lookup_class, lookup_type, null, 0, op);
-        END;
-        $$ LANGUAGE plpgsql""")
\ No newline at end of file
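
import_from_wiki() now takes the tokenizer as its first argument: the word table bookkeeping that the deleted fixtures used to simulate with SQL stubs has moved behind the tokenizer (presumably via the analyzer's update_special_phrases(), which the legacy tokenizer tests above exercise), while the importer keeps managing only the place_classtype_* tables:

    tokenizer = tokenizer_mock()    # any real tokenizer in production use
    special_phrases_importer.import_from_wiki(tokenizer, ['en'])
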
index 1fc060b0c6439677e592aa90d4d48d3db51a8ea6..37b47dfa680258e5587fa40e5057d7fd85904a96 100644 (file)
@@ -5,6 +5,11 @@ Tests for functions to maintain the artificial postcode table.
 import pytest
 
 from nominatim.tools import postcodes
+import dummy_tokenizer
+
+@pytest.fixture
+def tokenizer():
+    return dummy_tokenizer.DummyTokenizer(None, None)
 
 @pytest.fixture
 def postcode_table(temp_db_with_extensions, temp_db_cursor, table_factory,
@@ -20,26 +25,26 @@ def postcode_table(temp_db_with_extensions, temp_db_cursor, table_factory,
                       postcode TEXT,
                       geometry GEOMETRY(Geometry, 4326)""")
     temp_db_cursor.execute('CREATE SEQUENCE seq_place')
-    temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION getorcreate_postcode_id(postcode TEXT)
-                              RETURNS INTEGER AS $$ BEGIN RETURN 1; END; $$ LANGUAGE plpgsql;
+    temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION token_normalized_postcode(postcode TEXT)
+                              RETURNS TEXT AS $$ BEGIN RETURN postcode; END; $$ LANGUAGE plpgsql;
                            """)
 
 
-def test_import_postcodes_empty(dsn, temp_db_cursor, postcode_table, tmp_path):
-    postcodes.import_postcodes(dsn, tmp_path)
+def test_import_postcodes_empty(dsn, temp_db_cursor, postcode_table, tmp_path, tokenizer):
+    postcodes.import_postcodes(dsn, tmp_path, tokenizer)
 
     assert temp_db_cursor.table_exists('gb_postcode')
     assert temp_db_cursor.table_exists('us_postcode')
     assert temp_db_cursor.table_rows('location_postcode') == 0
 
 
-def test_import_postcodes_from_placex(dsn, temp_db_cursor, postcode_table, tmp_path):
+def test_import_postcodes_from_placex(dsn, temp_db_cursor, postcode_table, tmp_path, tokenizer):
     temp_db_cursor.execute("""
         INSERT INTO placex (place_id, country_code, address, geometry)
           VALUES (1, 'xx', '"postcode"=>"9486"', 'SRID=4326;POINT(10 12)')
     """)
 
-    postcodes.import_postcodes(dsn, tmp_path)
+    postcodes.import_postcodes(dsn, tmp_path, tokenizer)
 
     rows = temp_db_cursor.row_set(""" SELECT postcode, country_code,
                                       ST_X(geometry), ST_Y(geometry)
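
The postcode importer likewise receives the tokenizer as an extra argument and now relies on the tokenizer-provided SQL function token_normalized_postcode() (stubbed in the fixture above) instead of the old getorcreate_postcode_id(). A minimal call, mirroring the tests, where project_dir is the directory searched for optional external postcode data:

    postcodes.import_postcodes(dsn, project_dir, tokenizer)
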
index 53ea2b520a2cb207c0d9ac85ddada67604be5d5e..3f9bccbdd4162f81165dcdddd64f88879b179ad7 100644 (file)
@@ -11,9 +11,7 @@ def sql_tmp_path(tmp_path, def_config):
     return tmp_path
 
 @pytest.fixture
-def conn(temp_db_conn, table_factory, monkeypatch):
-    monkeypatch.setenv('NOMINATIM_DATABASE_MODULE_PATH', '.')
-    table_factory('country_name', 'partition INT', (0, 1, 2))
+def conn(sql_preprocessor, temp_db_conn):
     return temp_db_conn
 
 
index 18b146fc22b4b07b5c4696bf2f0c8a7cfea30bd6..dc822e3c166051bf6812b75d41fb64703427ae97 100644 (file)
@@ -26,6 +26,7 @@ def test_script(envdir):
 
 def run_website_script(envdir, config):
     config.lib_dir.php = envdir / 'php'
+    config.project_dir = envdir
     refresh.setup_website(envdir, config)
 
     proc = subprocess.run(['/usr/bin/env', 'php', '-Cq',
index b3b5d76d770d9c3e311181f8419296c5359d137e..7e72076e574783c6b9831a2f2db6f3c3a555b58b 100644 (file)
-SELECT getorcreate_amenity(make_standard_name('Aerodrome'), 'aerodrome', 'aeroway', 'aerodrome');
-SELECT getorcreate_amenity(make_standard_name('Aerodromes'), 'aerodromes', 'aeroway', 'aerodrome');
-SELECT getorcreate_amenityoperator(make_standard_name('Aerodrome in'), 'aerodrome in', 'aeroway', 'aerodrome', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Aerodromes in'), 'aerodromes in', 'aeroway', 'aerodrome', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Aerodrome near'), 'aerodrome near', 'aeroway', 'aerodrome', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Aerodromes near'), 'aerodromes near', 'aeroway', 'aerodrome', 'near');
-SELECT getorcreate_amenity(make_standard_name('Airport'), 'airport', 'aeroway', 'aerodrome');
-SELECT getorcreate_amenity(make_standard_name('Airports'), 'airports', 'aeroway', 'aerodrome');
-SELECT getorcreate_amenityoperator(make_standard_name('Airport in'), 'airport in', 'aeroway', 'aerodrome', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Airports in'), 'airports in', 'aeroway', 'aerodrome', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Airport near'), 'airport near', 'aeroway', 'aerodrome', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Airports near'), 'airports near', 'aeroway', 'aerodrome', 'near');
-SELECT getorcreate_amenity(make_standard_name('Bar'), 'bar', 'amenity', 'bar');
-SELECT getorcreate_amenity(make_standard_name('Bars'), 'bars', 'amenity', 'bar');
-SELECT getorcreate_amenityoperator(make_standard_name('Bar in'), 'bar in', 'amenity', 'bar', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Bars in'), 'bars in', 'amenity', 'bar', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Bar near'), 'bar near', 'amenity', 'bar', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Bars near'), 'bars near', 'amenity', 'bar', 'near');
-SELECT getorcreate_amenity(make_standard_name('Bar'), 'bar', 'amenity', 'pub');
-SELECT getorcreate_amenity(make_standard_name('Bars'), 'bars', 'amenity', 'pub');
-SELECT getorcreate_amenityoperator(make_standard_name('Bar in'), 'bar in', 'amenity', 'pub', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Bars in'), 'bars in', 'amenity', 'pub', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Bar near'), 'bar near', 'amenity', 'pub', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Bars near'), 'bars near', 'amenity', 'pub', 'near');
-SELECT getorcreate_amenity(make_standard_name('Food'), 'food', 'amenity', 'restaurant');
-SELECT getorcreate_amenity(make_standard_name('Food'), 'food', 'amenity', 'restaurant');
-SELECT getorcreate_amenityoperator(make_standard_name('Food in'), 'food in', 'amenity', 'restaurant', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Food in'), 'food in', 'amenity', 'restaurant', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Food near'), 'food near', 'amenity', 'restaurant', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Food near'), 'food near', 'amenity', 'restaurant', 'near');
-SELECT getorcreate_amenity(make_standard_name('Pub'), 'pub', 'amenity', 'bar');
-SELECT getorcreate_amenity(make_standard_name('Pubs'), 'pubs', 'amenity', 'bar');
-SELECT getorcreate_amenityoperator(make_standard_name('Pub in'), 'pub in', 'amenity', 'bar', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Pubs in'), 'pubs in', 'amenity', 'bar', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Pub near'), 'pub near', 'amenity', 'bar', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Pubs near'), 'pubs near', 'amenity', 'bar', 'near');
-SELECT getorcreate_amenity(make_standard_name('Pub'), 'pub', 'amenity', 'pub');
-SELECT getorcreate_amenity(make_standard_name('Pubs'), 'pubs', 'amenity', 'pub');
-SELECT getorcreate_amenityoperator(make_standard_name('Pub in'), 'pub in', 'amenity', 'pub', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Pubs in'), 'pubs in', 'amenity', 'pub', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Pub near'), 'pub near', 'amenity', 'pub', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Pubs near'), 'pubs near', 'amenity', 'pub', 'near');
-SELECT getorcreate_amenity(make_standard_name('Restaurant'), 'restaurant', 'amenity', 'restaurant');
-SELECT getorcreate_amenity(make_standard_name('Restaurants'), 'restaurants', 'amenity', 'restaurant');
-SELECT getorcreate_amenityoperator(make_standard_name('Restaurant in'), 'restaurant in', 'amenity', 'restaurant', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Restaurants in'), 'restaurants in', 'amenity', 'restaurant', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Restaurant near'), 'restaurant near', 'amenity', 'restaurant', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Restaurants near'), 'restaurants near', 'amenity', 'restaurant', 'near');
-SELECT getorcreate_amenity(make_standard_name('Mural'), 'mural', 'artwork_type', 'mural');
-SELECT getorcreate_amenity(make_standard_name('Murals'), 'murals', 'artwork_type', 'mural');
-SELECT getorcreate_amenityoperator(make_standard_name('Mural in'), 'mural in', 'artwork_type', 'mural', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Murals in'), 'murals in', 'artwork_type', 'mural', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Mural near'), 'mural near', 'artwork_type', 'mural', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Murals near'), 'murals near', 'artwork_type', 'mural', 'near');
-SELECT getorcreate_amenity(make_standard_name('Sculpture'), 'sculpture', 'artwork_type', 'sculpture');
-SELECT getorcreate_amenity(make_standard_name('Sculptures'), 'sculptures', 'artwork_type', 'sculpture');
-SELECT getorcreate_amenityoperator(make_standard_name('Sculpture in'), 'sculpture in', 'artwork_type', 'sculpture', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Sculptures in'), 'sculptures in', 'artwork_type', 'sculpture', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Sculpture near'), 'sculpture near', 'artwork_type', 'sculpture', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Sculptures near'), 'sculptures near', 'artwork_type', 'sculpture', 'near');
-SELECT getorcreate_amenity(make_standard_name('Statue'), 'statue', 'artwork_type', 'statue');
-SELECT getorcreate_amenity(make_standard_name('Statues'), 'statues', 'artwork_type', 'statue');
-SELECT getorcreate_amenityoperator(make_standard_name('Statue in'), 'statue in', 'artwork_type', 'statue', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Statues in'), 'statues in', 'artwork_type', 'statue', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Statue near'), 'statue near', 'artwork_type', 'statue', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Statues near'), 'statues near', 'artwork_type', 'statue', 'near');
-SELECT getorcreate_amenity(make_standard_name('ATM'), 'atm', 'atm', 'yes');
-SELECT getorcreate_amenity(make_standard_name('ATMs'), 'atms', 'atm', 'yes');
-SELECT getorcreate_amenityoperator(make_standard_name('ATM in'), 'atm in', 'atm', 'yes', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('ATMs in'), 'atms in', 'atm', 'yes', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('ATM near'), 'atm near', 'atm', 'yes', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('ATMs near'), 'atms near', 'atm', 'yes', 'near');
-SELECT getorcreate_amenity(make_standard_name('National Park'), 'national park', 'boundary', 'national_park');
-SELECT getorcreate_amenity(make_standard_name('National Parks'), 'national parks', 'boundary', 'national_park');
-SELECT getorcreate_amenityoperator(make_standard_name('National Park in'), 'national park in', 'boundary', 'national_park', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('National Parks in'), 'national parks in', 'boundary', 'national_park', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('National Park near'), 'national park near', 'boundary', 'national_park', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('National Parks near'), 'national parks near', 'boundary', 'national_park', 'near');
-SELECT getorcreate_amenity(make_standard_name('Changing table'), 'changing table', 'changing_table', 'yes');
-SELECT getorcreate_amenity(make_standard_name('Changing tables'), 'changing tables', 'changing_table', 'yes');
-SELECT getorcreate_amenityoperator(make_standard_name('Changing table in'), 'changing table in', 'changing_table', 'yes', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Changing tables in'), 'changing tables in', 'changing_table', 'yes', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Changing table near'), 'changing table near', 'changing_table', 'yes', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Changing tables near'), 'changing tables near', 'changing_table', 'yes', 'near');
-SELECT getorcreate_amenity(make_standard_name('Roundabout'), 'roundabout', 'junction', 'roundabout');
-SELECT getorcreate_amenity(make_standard_name('Roundabouts'), 'roundabouts', 'junction', 'roundabout');
-SELECT getorcreate_amenityoperator(make_standard_name('Roundabout in'), 'roundabout in', 'junction', 'roundabout', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Roundabouts in'), 'roundabouts in', 'junction', 'roundabout', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Roundabout near'), 'roundabout near', 'junction', 'roundabout', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Roundabouts near'), 'roundabouts near', 'junction', 'roundabout', 'near');
-SELECT getorcreate_amenity(make_standard_name('Plaque'), 'plaque', 'memorial', 'plaque');
-SELECT getorcreate_amenity(make_standard_name('Plaques'), 'plaques', 'memorial', 'plaque');
-SELECT getorcreate_amenityoperator(make_standard_name('Plaque in'), 'plaque in', 'memorial', 'plaque', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Plaques in'), 'plaques in', 'memorial', 'plaque', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Plaque near'), 'plaque near', 'memorial', 'plaque', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Plaques near'), 'plaques near', 'memorial', 'plaque', 'near');
-SELECT getorcreate_amenity(make_standard_name('Statue'), 'statue', 'memorial', 'statue');
-SELECT getorcreate_amenity(make_standard_name('Statues'), 'statues', 'memorial', 'statue');
-SELECT getorcreate_amenityoperator(make_standard_name('Statue in'), 'statue in', 'memorial', 'statue', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Statues in'), 'statues in', 'memorial', 'statue', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Statue near'), 'statue near', 'memorial', 'statue', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Statues near'), 'statues near', 'memorial', 'statue', 'near');
-SELECT getorcreate_amenity(make_standard_name('Stolperstein'), 'stolperstein', 'memorial', 'stolperstein');
-SELECT getorcreate_amenity(make_standard_name('Stolpersteins'), 'stolpersteins', 'memorial', 'stolperstein');
-SELECT getorcreate_amenity(make_standard_name('Stolpersteine'), 'stolpersteine', 'memorial', 'stolperstein');
-SELECT getorcreate_amenityoperator(make_standard_name('Stolperstein in'), 'stolperstein in', 'memorial', 'stolperstein', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Stolpersteins in'), 'stolpersteins in', 'memorial', 'stolperstein', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Stolpersteine in'), 'stolpersteine in', 'memorial', 'stolperstein', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Stolperstein near'), 'stolperstein near', 'memorial', 'stolperstein', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Stolpersteins near'), 'stolpersteins near', 'memorial', 'stolperstein', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Stolpersteine near'), 'stolpersteine near', 'memorial', 'stolperstein', 'near');
-SELECT getorcreate_amenity(make_standard_name('War Memorial'), 'war memorial', 'memorial', 'war_memorial');
-SELECT getorcreate_amenity(make_standard_name('War Memorials'), 'war memorials', 'memorial', 'war_memorial');
-SELECT getorcreate_amenityoperator(make_standard_name('War Memorial in'), 'war memorial in', 'memorial', 'war_memorial', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('War Memorials in'), 'war memorials in', 'memorial', 'war_memorial', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('War Memorial near'), 'war memorial near', 'memorial', 'war_memorial', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('War Memorials near'), 'war memorials near', 'memorial', 'war_memorial', 'near');
+CREATE OR REPLACE FUNCTION test_getorcreate_amenity(lookup_word TEXT, normalized_word TEXT,
+                                               lookup_class text, lookup_type text)
+  RETURNS INTEGER
+  AS $$
+DECLARE
+  lookup_token TEXT;
+  return_word_id INTEGER;
+BEGIN
+  lookup_token := ' '||trim(lookup_word);
+  SELECT min(word_id) FROM word
+  WHERE word_token = lookup_token and word = normalized_word
+        and class = lookup_class and type = lookup_type
+  INTO return_word_id;
+  IF return_word_id IS NULL THEN
+    return_word_id := nextval('seq_word');
+    INSERT INTO word VALUES (return_word_id, lookup_token, normalized_word,
+                             lookup_class, lookup_type, null, 0);
+  END IF;
+  RETURN return_word_id;
+END;
+$$
+LANGUAGE plpgsql;
+
+
+CREATE OR REPLACE FUNCTION test_getorcreate_amenityoperator(lookup_word TEXT,
+                                                       normalized_word TEXT,
+                                                       lookup_class text,
+                                                       lookup_type text,
+                                                       op text)
+  RETURNS INTEGER
+  AS $$
+DECLARE
+  lookup_token TEXT;
+  return_word_id INTEGER;
+BEGIN
+  lookup_token := ' '||trim(lookup_word);
+  SELECT min(word_id) FROM word
+  WHERE word_token = lookup_token and word = normalized_word
+        and class = lookup_class and type = lookup_type and operator = op
+  INTO return_word_id;
+  IF return_word_id IS NULL THEN
+    return_word_id := nextval('seq_word');
+    INSERT INTO word VALUES (return_word_id, lookup_token, normalized_word,
+                             lookup_class, lookup_type, null, 0, op);
+  END IF;
+  RETURN return_word_id;
+END;
+$$
+LANGUAGE plpgsql;
+
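+-- The two helpers above implement a get-or-create pattern: look up an existing
+-- word row by token, normalized word, class and type (plus operator in the
+-- second variant) and insert a new row with a fresh seq_word id only when no
+-- match exists. The SELECT statements below seed the test database with the
+-- special-phrase vocabulary.
+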
+SELECT test_getorcreate_amenity(make_standard_name('Aerodrome'), 'aerodrome', 'aeroway', 'aerodrome');
+SELECT test_getorcreate_amenity(make_standard_name('Aerodromes'), 'aerodromes', 'aeroway', 'aerodrome');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Aerodrome in'), 'aerodrome in', 'aeroway', 'aerodrome', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Aerodromes in'), 'aerodromes in', 'aeroway', 'aerodrome', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Aerodrome near'), 'aerodrome near', 'aeroway', 'aerodrome', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Aerodromes near'), 'aerodromes near', 'aeroway', 'aerodrome', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Airport'), 'airport', 'aeroway', 'aerodrome');
+SELECT test_getorcreate_amenity(make_standard_name('Airports'), 'airports', 'aeroway', 'aerodrome');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Airport in'), 'airport in', 'aeroway', 'aerodrome', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Airports in'), 'airports in', 'aeroway', 'aerodrome', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Airport near'), 'airport near', 'aeroway', 'aerodrome', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Airports near'), 'airports near', 'aeroway', 'aerodrome', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Bar'), 'bar', 'amenity', 'bar');
+SELECT test_getorcreate_amenity(make_standard_name('Bars'), 'bars', 'amenity', 'bar');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Bar in'), 'bar in', 'amenity', 'bar', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Bars in'), 'bars in', 'amenity', 'bar', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Bar near'), 'bar near', 'amenity', 'bar', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Bars near'), 'bars near', 'amenity', 'bar', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Bar'), 'bar', 'amenity', 'pub');
+SELECT test_getorcreate_amenity(make_standard_name('Bars'), 'bars', 'amenity', 'pub');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Bar in'), 'bar in', 'amenity', 'pub', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Bars in'), 'bars in', 'amenity', 'pub', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Bar near'), 'bar near', 'amenity', 'pub', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Bars near'), 'bars near', 'amenity', 'pub', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Food'), 'food', 'amenity', 'restaurant');
+SELECT test_getorcreate_amenity(make_standard_name('Food'), 'food', 'amenity', 'restaurant');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Food in'), 'food in', 'amenity', 'restaurant', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Food in'), 'food in', 'amenity', 'restaurant', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Food near'), 'food near', 'amenity', 'restaurant', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Food near'), 'food near', 'amenity', 'restaurant', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Pub'), 'pub', 'amenity', 'bar');
+SELECT test_getorcreate_amenity(make_standard_name('Pubs'), 'pubs', 'amenity', 'bar');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Pub in'), 'pub in', 'amenity', 'bar', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Pubs in'), 'pubs in', 'amenity', 'bar', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Pub near'), 'pub near', 'amenity', 'bar', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Pubs near'), 'pubs near', 'amenity', 'bar', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Pub'), 'pub', 'amenity', 'pub');
+SELECT test_getorcreate_amenity(make_standard_name('Pubs'), 'pubs', 'amenity', 'pub');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Pub in'), 'pub in', 'amenity', 'pub', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Pubs in'), 'pubs in', 'amenity', 'pub', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Pub near'), 'pub near', 'amenity', 'pub', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Pubs near'), 'pubs near', 'amenity', 'pub', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Restaurant'), 'restaurant', 'amenity', 'restaurant');
+SELECT test_getorcreate_amenity(make_standard_name('Restaurants'), 'restaurants', 'amenity', 'restaurant');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Restaurant in'), 'restaurant in', 'amenity', 'restaurant', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Restaurants in'), 'restaurants in', 'amenity', 'restaurant', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Restaurant near'), 'restaurant near', 'amenity', 'restaurant', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Restaurants near'), 'restaurants near', 'amenity', 'restaurant', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Mural'), 'mural', 'artwork_type', 'mural');
+SELECT test_getorcreate_amenity(make_standard_name('Murals'), 'murals', 'artwork_type', 'mural');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Mural in'), 'mural in', 'artwork_type', 'mural', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Murals in'), 'murals in', 'artwork_type', 'mural', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Mural near'), 'mural near', 'artwork_type', 'mural', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Murals near'), 'murals near', 'artwork_type', 'mural', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Sculpture'), 'sculpture', 'artwork_type', 'sculpture');
+SELECT test_getorcreate_amenity(make_standard_name('Sculptures'), 'sculptures', 'artwork_type', 'sculpture');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Sculpture in'), 'sculpture in', 'artwork_type', 'sculpture', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Sculptures in'), 'sculptures in', 'artwork_type', 'sculpture', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Sculpture near'), 'sculpture near', 'artwork_type', 'sculpture', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Sculptures near'), 'sculptures near', 'artwork_type', 'sculpture', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Statue'), 'statue', 'artwork_type', 'statue');
+SELECT test_getorcreate_amenity(make_standard_name('Statues'), 'statues', 'artwork_type', 'statue');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Statue in'), 'statue in', 'artwork_type', 'statue', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Statues in'), 'statues in', 'artwork_type', 'statue', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Statue near'), 'statue near', 'artwork_type', 'statue', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Statues near'), 'statues near', 'artwork_type', 'statue', 'near');
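+-- make_standard_name() lowercases and normalizes whole phrases,
+-- e.g. 'ATM' -> 'atm' and 'National Park' -> 'national park'.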
+SELECT test_getorcreate_amenity(make_standard_name('ATM'), 'atm', 'atm', 'yes');
+SELECT test_getorcreate_amenity(make_standard_name('ATMs'), 'atms', 'atm', 'yes');
+SELECT test_getorcreate_amenityoperator(make_standard_name('ATM in'), 'atm in', 'atm', 'yes', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('ATMs in'), 'atms in', 'atm', 'yes', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('ATM near'), 'atm near', 'atm', 'yes', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('ATMs near'), 'atms near', 'atm', 'yes', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('National Park'), 'national park', 'boundary', 'national_park');
+SELECT test_getorcreate_amenity(make_standard_name('National Parks'), 'national parks', 'boundary', 'national_park');
+SELECT test_getorcreate_amenityoperator(make_standard_name('National Park in'), 'national park in', 'boundary', 'national_park', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('National Parks in'), 'national parks in', 'boundary', 'national_park', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('National Park near'), 'national park near', 'boundary', 'national_park', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('National Parks near'), 'national parks near', 'boundary', 'national_park', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Changing table'), 'changing table', 'changing_table', 'yes');
+SELECT test_getorcreate_amenity(make_standard_name('Changing tables'), 'changing tables', 'changing_table', 'yes');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Changing table in'), 'changing table in', 'changing_table', 'yes', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Changing tables in'), 'changing tables in', 'changing_table', 'yes', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Changing table near'), 'changing table near', 'changing_table', 'yes', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Changing tables near'), 'changing tables near', 'changing_table', 'yes', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Roundabout'), 'roundabout', 'junction', 'roundabout');
+SELECT test_getorcreate_amenity(make_standard_name('Roundabouts'), 'roundabouts', 'junction', 'roundabout');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Roundabout in'), 'roundabout in', 'junction', 'roundabout', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Roundabouts in'), 'roundabouts in', 'junction', 'roundabout', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Roundabout near'), 'roundabout near', 'junction', 'roundabout', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Roundabouts near'), 'roundabouts near', 'junction', 'roundabout', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Plaque'), 'plaque', 'memorial', 'plaque');
+SELECT test_getorcreate_amenity(make_standard_name('Plaques'), 'plaques', 'memorial', 'plaque');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Plaque in'), 'plaque in', 'memorial', 'plaque', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Plaques in'), 'plaques in', 'memorial', 'plaque', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Plaque near'), 'plaque near', 'memorial', 'plaque', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Plaques near'), 'plaques near', 'memorial', 'plaque', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Statue'), 'statue', 'memorial', 'statue');
+SELECT test_getorcreate_amenity(make_standard_name('Statues'), 'statues', 'memorial', 'statue');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Statue in'), 'statue in', 'memorial', 'statue', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Statues in'), 'statues in', 'memorial', 'statue', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Statue near'), 'statue near', 'memorial', 'statue', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Statues near'), 'statues near', 'memorial', 'statue', 'near');
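+-- 'Stolperstein' is registered with three base forms: the singular plus
+-- both the English ('Stolpersteins') and German ('Stolpersteine') plurals.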
+SELECT test_getorcreate_amenity(make_standard_name('Stolperstein'), 'stolperstein', 'memorial', 'stolperstein');
+SELECT test_getorcreate_amenity(make_standard_name('Stolpersteins'), 'stolpersteins', 'memorial', 'stolperstein');
+SELECT test_getorcreate_amenity(make_standard_name('Stolpersteine'), 'stolpersteine', 'memorial', 'stolperstein');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Stolperstein in'), 'stolperstein in', 'memorial', 'stolperstein', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Stolpersteins in'), 'stolpersteins in', 'memorial', 'stolperstein', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Stolpersteine in'), 'stolpersteine in', 'memorial', 'stolperstein', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Stolperstein near'), 'stolperstein near', 'memorial', 'stolperstein', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Stolpersteins near'), 'stolpersteins near', 'memorial', 'stolperstein', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Stolpersteine near'), 'stolpersteine near', 'memorial', 'stolperstein', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('War Memorial'), 'war memorial', 'memorial', 'war_memorial');
+SELECT test_getorcreate_amenity(make_standard_name('War Memorials'), 'war memorials', 'memorial', 'war_memorial');
+SELECT test_getorcreate_amenityoperator(make_standard_name('War Memorial in'), 'war memorial in', 'memorial', 'war_memorial', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('War Memorials in'), 'war memorials in', 'memorial', 'war_memorial', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('War Memorial near'), 'war memorial near', 'memorial', 'war_memorial', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('War Memorials near'), 'war memorials near', 'memorial', 'war_memorial', 'near');
 CREATE INDEX idx_placex_classtype ON placex (class, type);
 CREATE TABLE place_classtype_aeroway_aerodrome AS SELECT place_id AS place_id,st_centroid(geometry) AS centroid FROM placex WHERE class = 'aeroway' AND type = 'aerodrome';
 CREATE INDEX idx_place_classtype_aeroway_aerodrome_centroid ON place_classtype_aeroway_aerodrome USING GIST (centroid);
 CREATE INDEX idx_place_classtype_aeroway_aerodrome_place_id ON place_classtype_aeroway_aerodrome USING btree(place_id);
@@ -175,4 +225,7 @@ CREATE TABLE place_classtype_memorial_war_memorial AS SELECT place_id AS place_i
 CREATE INDEX idx_place_classtype_memorial_war_memorial_centroid ON place_classtype_memorial_war_memorial USING GIST (centroid);
 CREATE INDEX idx_place_classtype_memorial_war_memorial_place_id ON place_classtype_memorial_war_memorial USING btree(place_id);
 GRANT SELECT ON place_classtype_memorial_war_memorial TO "www-data";
-DROP INDEX idx_placex_classtype;
\ No newline at end of file
+DROP INDEX idx_placex_classtype;
+
+DROP FUNCTION test_getorcreate_amenity;
+DROP FUNCTION test_getorcreate_amenityoperator;