require_once(CONST_LibDir.'/SearchDescription.php');
require_once(CONST_LibDir.'/SearchContext.php');
require_once(CONST_LibDir.'/TokenList.php');
+require_once(CONST_TokenizerDir.'/tokenizer.php');
class Geocode
{
protected $oDB;
protected $oPlaceLookup;
+ protected $oTokenizer;
protected $aLangPrefOrder = array();
protected $sQuery = false;
protected $aStructuredQuery = false;
- protected $oNormalizer = null;
-
public function __construct(&$oDB)
{
$this->oDB =& $oDB;
$this->oPlaceLookup = new PlaceLookup($this->oDB);
- $this->oNormalizer = \Transliterator::createFromRules(CONST_Term_Normalization_Rules);
- }
-
- private function normTerm($sTerm)
- {
- if ($this->oNormalizer === null) {
- return $sTerm;
- }
-
- return $this->oNormalizer->transliterate($sTerm);
+ $this->oTokenizer = new \Nominatim\Tokenizer($this->oDB);
}
public function setLanguagePreference($aLangPref)
if ($this->aCountryCodes) {
$oCtx->setCountryList($this->aCountryCodes);
}
+ $this->oTokenizer->setCountryRestriction($this->aCountryCodes);
Debug::newSection('Query Preprocessing');
- $sNormQuery = $this->normTerm($this->sQuery);
- Debug::printVar('Normalized query', $sNormQuery);
-
$sLanguagePrefArraySQL = $this->oDB->getArraySQL(
$this->oDB->getDBQuotedList($this->aLangPrefOrder)
);
}
if ($sSpecialTerm && !$aSearches[0]->hasOperator()) {
- $sSpecialTerm = pg_escape_string($sSpecialTerm);
- $sToken = $this->oDB->getOne(
- 'SELECT make_standard_name(:term)',
- array(':term' => $sSpecialTerm),
- 'Cannot decode query. Wrong encoding?'
- );
- $sSQL = 'SELECT class, type FROM word ';
- $sSQL .= ' WHERE word_token in (\' '.$sToken.'\')';
- $sSQL .= ' AND class is not null AND class not in (\'place\')';
-
- Debug::printSQL($sSQL);
- $aSearchWords = $this->oDB->getAll($sSQL);
- $aNewSearches = array();
- foreach ($aSearches as $oSearch) {
- foreach ($aSearchWords as $aSearchTerm) {
- $oNewSearch = clone $oSearch;
- $oNewSearch->setPoiSearch(
- Operator::TYPE,
- $aSearchTerm['class'],
- $aSearchTerm['type']
- );
- $aNewSearches[] = $oNewSearch;
+ $aTokens = $this->oTokenizer->tokensForSpecialTerm($sSpecialTerm);
+
+ if (!empty($aTokens)) {
+ $aNewSearches = array();
+ foreach ($aSearches as $oSearch) {
+ foreach ($aTokens as $oToken) {
+ $oNewSearch = clone $oSearch;
+ $oNewSearch->setPoiSearch(
+ $oToken->iOperator,
+ $oToken->sClass,
+ $oToken->sType
+ );
+ $aNewSearches[] = $oNewSearch;
+ }
}
+ $aSearches = $aNewSearches;
}
- $aSearches = $aNewSearches;
}
// Split query into phrases
// Commas are used to reduce the search space by indicating where phrases split
+ $aPhrases = array();
if ($this->aStructuredQuery) {
- $aInPhrases = $this->aStructuredQuery;
+ foreach ($this->aStructuredQuery as $iPhrase => $sPhrase) {
+ $aPhrases[] = new Phrase($sPhrase, $iPhrase);
+ }
} else {
- $aInPhrases = explode(',', $sQuery);
+ foreach (explode(',', $sQuery) as $sPhrase) {
+ $aPhrases[] = new Phrase($sPhrase, '');
+ }
}
Debug::printDebugArray('Search context', $oCtx);
Debug::printDebugArray('Base search', empty($aSearches) ? null : $aSearches[0]);
- Debug::printVar('Final query phrases', $aInPhrases);
- // Convert each phrase to standard form
- // Create a list of standard words
- // Get all 'sets' of words
- // Generate a complete list of all
Debug::newSection('Tokenization');
- $aTokens = array();
- $aPhrases = array();
- foreach ($aInPhrases as $iPhrase => $sPhrase) {
- $sPhrase = $this->oDB->getOne(
- 'SELECT make_standard_name(:phrase)',
- array(':phrase' => $sPhrase),
- 'Cannot normalize query string (is it a UTF-8 string?)'
- );
- if (trim($sPhrase)) {
- $oPhrase = new Phrase($sPhrase, is_string($iPhrase) ? $iPhrase : '');
- $oPhrase->addTokens($aTokens);
- $aPhrases[] = $oPhrase;
- }
- }
-
- Debug::printVar('Tokens', $aTokens);
-
- $oValidTokens = new TokenList();
-
- if (!empty($aTokens)) {
- $oValidTokens->addTokensFromDB(
- $this->oDB,
- $aTokens,
- $this->aCountryCodes,
- $sNormQuery,
- $this->oNormalizer
- );
+ $oValidTokens = $this->oTokenizer->extractTokensFromPhrases($aPhrases);
+ if ($oValidTokens->count() > 0) {
$oCtx->setFullNameWords($oValidTokens->getFullWordIDs());
- // Try more interpretations for Tokens that could not be matched.
- foreach ($aTokens as $sToken) {
- if ($sToken[0] == ' ' && !$oValidTokens->contains($sToken)) {
- if (preg_match('/^ ([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) {
- // US ZIP+4 codes - merge in the 5-digit ZIP code
- $oValidTokens->addToken(
- $sToken,
- new Token\Postcode(null, $aData[1], 'us')
- );
- } elseif (preg_match('/^ [0-9]+$/', $sToken)) {
- // Unknown single word token with a number.
- // Assume it is a house number.
- $oValidTokens->addToken(
- $sToken,
- new Token\HouseNumber(null, trim($sToken))
- );
- }
- }
- }
+ $aPhrases = array_filter($aPhrases, function ($oPhrase) {
+ return $oPhrase->getWordSets() !== null;
+ });
// Any words that have failed completely?
// TODO: suggestions
Debug::printGroupTable('Valid Tokens', $oValidTokens->debugInfo());
-
- foreach ($aPhrases as $oPhrase) {
- $oPhrase->computeWordSets($oValidTokens);
- }
Debug::printDebugTable('Phrases', $aPhrases);
Debug::newSection('Search candidates');
private $sPhrase;
// Element type for structured searches.
private $sPhraseType;
- // Space-separated words of the phrase.
- private $aWords;
// Possible segmentations of the phrase.
private $aWordSets;
{
$this->sPhrase = trim($sPhrase);
$this->sPhraseType = $sPhraseType;
- $this->aWords = explode(' ', $this->sPhrase);
+ }
+
+ /**
+     * Get the original phrase string.
+ */
+ public function getPhrase()
+ {
+ return $this->sPhrase;
}
/**
return $this->aWordSets;
}
- /**
- * Add the tokens from this phrase to the given list of tokens.
- *
- * @param string[] $aTokens List of tokens to append.
- *
- * @return void
- */
- public function addTokens(&$aTokens)
- {
- $iNumWords = count($this->aWords);
-
- for ($i = 0; $i < $iNumWords; $i++) {
- $sPhrase = $this->aWords[$i];
- $aTokens[' '.$sPhrase] = ' '.$sPhrase;
- $aTokens[$sPhrase] = $sPhrase;
-
- for ($j = $i + 1; $j < $iNumWords; $j++) {
- $sPhrase .= ' '.$this->aWords[$j];
- $aTokens[' '.$sPhrase] = ' '.$sPhrase;
- $aTokens[$sPhrase] = $sPhrase;
- }
- }
- }
-
/**
* Invert the set of possible segmentations.
*
}
}
- public function computeWordSets($oTokens)
+ public function computeWordSets($aWords, $oTokens)
{
- $iNumWords = count($this->aWords);
+ $iNumWords = count($aWords);
+
+ if ($iNumWords == 0) {
+ $this->aWordSets = null;
+ return;
+ }
+
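+        // Dynamic programming over all segmentations: a word set splits
+        // the phrase into consecutive runs of words that all appear as
+        // known tokens. For example, 'new york city' may yield the sets
+        // ('new york city'), ('new york'|'city') and ('new'|'york'|'city').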
// Caches the word set for the partial phrase up to word i.
$aSetCache = array_fill(0, $iNumWords, array());
        // Initialise the first element of the cache. It can only contain the first word.
- if ($oTokens->containsAny($this->aWords[0])) {
- $aSetCache[0][] = array($this->aWords[0]);
+ if ($oTokens->containsAny($aWords[0])) {
+ $aSetCache[0][] = array($aWords[0]);
}
// Now do the next elements using what we already have.
for ($i = 1; $i < $iNumWords; $i++) {
for ($j = $i; $j > 0; $j--) {
- $sPartial = $j == $i ? $this->aWords[$j] : $this->aWords[$j].' '.$sPartial;
+ $sPartial = $j == $i ? $aWords[$j] : $aWords[$j].' '.$sPartial;
if (!empty($aSetCache[$j - 1]) && $oTokens->containsAny($sPartial)) {
$aPartial = array($sPartial);
foreach ($aSetCache[$j - 1] as $aSet) {
}
// finally the current full phrase
- $sPartial = $this->aWords[0].' '.$sPartial;
+ $sPartial = $aWords[0].' '.$sPartial;
if ($oTokens->containsAny($sPartial)) {
$aSetCache[$i][] = array($sPartial);
}
return array(
'Type' => $this->sPhraseType,
'Phrase' => $this->sPhrase,
- 'Words' => $this->aWords,
'WordSets' => $this->aWordSets
);
}
namespace Nominatim;
+require_once(CONST_TokenizerDir.'/tokenizer.php');
+
use Exception;
class Status
throw new Exception('Database connection failed', 700);
}
- $sStandardWord = $this->oDB->getOne("SELECT make_standard_name('a')");
- if ($sStandardWord === false) {
- throw new Exception('Module failed', 701);
- }
-
- if ($sStandardWord != 'a') {
- throw new Exception('Module call failed', 702);
- }
-
- $sSQL = 'SELECT word_id, word_token, word, class, type, country_code, ';
- $sSQL .= "operator, search_name_count FROM word WHERE word_token IN (' a')";
- $iWordID = $this->oDB->getOne($sSQL);
- if ($iWordID === false) {
- throw new Exception('Query failed', 703);
- }
- if (!$iWordID) {
- throw new Exception('No value', 704);
- }
+ $oTokenizer = new \Nominatim\Tokenizer($this->oDB);
+ $oTokenizer->checkStatus();
}
public function dataDate()
return $ids;
}
- /**
- * Add token information from the word table in the database.
- *
- * @param object $oDB Nominatim::DB instance.
- * @param string[] $aTokens List of tokens to look up in the database.
- * @param string[] $aCountryCodes List of country restrictions.
- * @param string $sNormQuery Normalized query string.
- * @param object $oNormalizer Normalizer function to use on tokens.
- *
- * @return void
- */
- public function addTokensFromDB(&$oDB, &$aTokens, &$aCountryCodes, $sNormQuery, $oNormalizer)
- {
- // Check which tokens we have, get the ID numbers
- $sSQL = 'SELECT word_id, word_token, word, class, type, country_code,';
- $sSQL .= ' operator, coalesce(search_name_count, 0) as count';
- $sSQL .= ' FROM word WHERE word_token in (';
- $sSQL .= join(',', $oDB->getDBQuotedList($aTokens)).')';
-
- Debug::printSQL($sSQL);
-
- $aDBWords = $oDB->getAll($sSQL, null, 'Could not get word tokens.');
-
- foreach ($aDBWords as $aWord) {
- $oToken = null;
- $iId = (int) $aWord['word_id'];
-
- if ($aWord['class']) {
- // Special terms need to appear in their normalized form.
- if ($aWord['word']) {
- $sNormWord = $aWord['word'];
- if ($oNormalizer != null) {
- $sNormWord = $oNormalizer->transliterate($aWord['word']);
- }
- if (strpos($sNormQuery, $sNormWord) === false) {
- continue;
- }
- }
-
- if ($aWord['class'] == 'place' && $aWord['type'] == 'house') {
- $oToken = new Token\HouseNumber($iId, trim($aWord['word_token']));
- } elseif ($aWord['class'] == 'place' && $aWord['type'] == 'postcode') {
- if ($aWord['word']
- && pg_escape_string($aWord['word']) == $aWord['word']
- ) {
- $oToken = new Token\Postcode(
- $iId,
- $aWord['word'],
- $aWord['country_code']
- );
- }
- } else {
- // near and in operator the same at the moment
- $oToken = new Token\SpecialTerm(
- $iId,
- $aWord['class'],
- $aWord['type'],
- $aWord['operator'] ? Operator::NEAR : Operator::NONE
- );
- }
- } elseif ($aWord['country_code']) {
- // Filter country tokens that do not match restricted countries.
- if (!$aCountryCodes
- || in_array($aWord['country_code'], $aCountryCodes)
- ) {
- $oToken = new Token\Country($iId, $aWord['country_code']);
- }
- } else {
- $oToken = new Token\Word(
- $iId,
- $aWord['word_token'][0] != ' ',
- (int) $aWord['count'],
- substr_count($aWord['word_token'], ' ')
- );
- }
-
- if ($oToken) {
- $this->addToken($aWord['word_token'], $oToken);
- }
- }
- }
-
/**
* Add a new token for the given word.
*
@define('CONST_LibDir', dirname(dirname(__FILE__)));
require_once(CONST_LibDir.'/init-cmd.php');
-require_once(CONST_LibDir.'/Geocode.php');
require_once(CONST_LibDir.'/ParameterParser.php');
ini_set('memory_limit', '800M');
@define('CONST_Default_Language', getSetting('DEFAULT_LANGUAGE', false));
@define('CONST_Log_DB', getSettingBool('LOG_DB'));
@define('CONST_Log_File', getSetting('LOG_FILE', false));
-@define('CONST_Max_Word_Frequency', getSetting('MAX_WORD_FREQUENCY'));
@define('CONST_NoAccessControl', getSettingBool('CORS_NOACCESSCONTROL'));
@define('CONST_Places_Max_ID_count', getSetting('LOOKUP_MAX_COUNT'));
@define('CONST_PolygonOutput_MaximumTypes', getSetting('POLYGON_OUTPUT_MAX_TYPES'));
@define('CONST_Search_BatchMode', getSettingBool('SEARCH_BATCH_MODE'));
@define('CONST_Search_NameOnlySearchFrequencyThreshold', getSetting('SEARCH_NAME_ONLY_THRESHOLD'));
-@define('CONST_Term_Normalization_Rules', getSetting('TERM_NORMALIZATION'));
@define('CONST_Use_US_Tiger_Data', getSettingBool('USE_US_TIGER_DATA'));
@define('CONST_MapIcon_URL', getSetting('MAPICON_URL', false));
+@define('CONST_TokenizerDir', CONST_InstallDir.'/tokenizer');
+require_once(CONST_LibDir.'/Geocode.php');
$oDB = new Nominatim\DB;
$oDB->connect();
require_once(CONST_LibDir.'/init-cmd.php');
require_once(CONST_LibDir.'/log.php');
-require_once(CONST_LibDir.'/Geocode.php');
require_once(CONST_LibDir.'/PlaceLookup.php');
require_once(CONST_LibDir.'/ReverseGeocode.php');
@define('CONST_Default_Language', getSetting('DEFAULT_LANGUAGE', false));
@define('CONST_Log_DB', getSettingBool('LOG_DB'));
@define('CONST_Log_File', getSetting('LOG_FILE', false));
-@define('CONST_Max_Word_Frequency', getSetting('MAX_WORD_FREQUENCY'));
@define('CONST_NoAccessControl', getSettingBool('CORS_NOACCESSCONTROL'));
@define('CONST_Places_Max_ID_count', getSetting('LOOKUP_MAX_COUNT'));
@define('CONST_PolygonOutput_MaximumTypes', getSetting('POLYGON_OUTPUT_MAX_TYPES'));
@define('CONST_Search_BatchMode', getSettingBool('SEARCH_BATCH_MODE'));
@define('CONST_Search_NameOnlySearchFrequencyThreshold', getSetting('SEARCH_NAME_ONLY_THRESHOLD'));
-@define('CONST_Term_Normalization_Rules', getSetting('TERM_NORMALIZATION'));
@define('CONST_Use_US_Tiger_Data', getSettingBool('USE_US_TIGER_DATA'));
@define('CONST_MapIcon_URL', getSetting('MAPICON_URL', false));
+@define('CONST_TokenizerDir', CONST_InstallDir.'/tokenizer');
+require_once(CONST_LibDir.'/Geocode.php');
$oDB = new Nominatim\DB();
$oDB->connect();
--- /dev/null
+<?php
+
+namespace Nominatim;
+
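+/**
+ * Tokenizer for the query frontend. Term normalisation is delegated to
+ * the make_standard_name() function in the database; candidate tokens
+ * are looked up in the word table.
+ *
+ * Typical use (a sketch only, see Geocode.php for the real call sites):
+ *
+ *   $oTokenizer = new \Nominatim\Tokenizer($oDB);
+ *   $oTokenizer->setCountryRestriction(array('de'));
+ *   $oValidTokens = $oTokenizer->extractTokensFromPhrases($aPhrases);
+ */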
+class Tokenizer
+{
+ private $oDB;
+
+ private $oNormalizer = null;
+ private $aCountryRestriction = null;
+
+ public function __construct(&$oDB)
+ {
+ $this->oDB =& $oDB;
+ $this->oNormalizer = \Transliterator::createFromRules(CONST_Term_Normalization_Rules);
+ }
+
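+    /**
+     * Check that the tokenizer is functional: the normalisation module
+     * must be usable and the word table must be populated.
+     *
+     * @throws \Exception if one of the checks fails.
+     */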
+ public function checkStatus()
+ {
+        $sStandardWord = $this->oDB->getOne("SELECT make_standard_name('a')");
+        if ($sStandardWord === false) {
+            throw new \Exception('Module failed', 701);
+        }
+
+        if ($sStandardWord != 'a') {
+            throw new \Exception('Module call failed', 702);
+        }
+
+ $sSQL = "SELECT word_id FROM word WHERE word_token IN (' a')";
+ $iWordID = $this->oDB->getOne($sSQL);
+ if ($iWordID === false) {
+ throw new Exception('Query failed', 703);
+ }
+ if (!$iWordID) {
+ throw new Exception('No value', 704);
+ }
+ }
+
+
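+    /**
+     * Restrict the search to the given list of country codes.
+     * Country tokens for countries not in the list are ignored.
+     * Use null to remove the restriction.
+     */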
+ public function setCountryRestriction($aCountries)
+ {
+ $this->aCountryRestriction = $aCountries;
+ }
+
+
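+    /**
+     * Normalize a string using the global term normalization rules.
+     * The string is returned unchanged when no transliterator is
+     * available.
+     */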
+ public function normalizeString($sTerm)
+ {
+ if ($this->oNormalizer === null) {
+ return $sTerm;
+ }
+
+ return $this->oNormalizer->transliterate($sTerm);
+ }
+
+
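+    /**
+     * Look up a special term (e.g. a POI class like 'restaurant') in
+     * the word table and return the matching special-term tokens.
+     *
+     * @param string $sTerm Special term to look up.
+     *
+     * @return object[] List of Token\SpecialTerm objects found.
+     */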
+ public function tokensForSpecialTerm($sTerm)
+ {
+ $aResults = array();
+
+ $sSQL = 'SELECT word_id, class, type FROM word ';
+ $sSQL .= ' WHERE word_token = \' \' || make_standard_name(:term)';
+ $sSQL .= ' AND class is not null AND class not in (\'place\')';
+
+ Debug::printVar('Term', $sTerm);
+ Debug::printSQL($sSQL);
+ $aSearchWords = $this->oDB->getAll($sSQL, array(':term' => $sTerm));
+
+ Debug::printVar('Results', $aSearchWords);
+
+ foreach ($aSearchWords as $aSearchTerm) {
+ $aResults[] = new \Nominatim\Token\SpecialTerm(
+ $aSearchTerm['word_id'],
+ $aSearchTerm['class'],
+ $aSearchTerm['type'],
+ \Nominatim\Operator::TYPE
+ );
+ }
+
+ Debug::printVar('Special term tokens', $aResults);
+
+ return $aResults;
+ }
+
+
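+    /**
+     * Compute the tokens for a list of phrases: normalize the phrases
+     * with make_standard_name(), look up all candidate words in the
+     * word table and compute the possible word segmentations of each
+     * phrase from the tokens found.
+     *
+     * @param object[] $aPhrases List of Phrase objects. Their word sets
+     *                           are updated as a side effect.
+     *
+     * @return TokenList List of valid tokens.
+     */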
+ public function extractTokensFromPhrases(&$aPhrases)
+ {
+ // First get the normalized version of all phrases
+ $sNormQuery = '';
+ $sSQL = 'SELECT ';
+ $aParams = array();
+ foreach ($aPhrases as $iPhrase => $oPhrase) {
+ $sNormQuery .= ','.$this->normalizeString($oPhrase->getPhrase());
+            $sSQL .= 'make_standard_name(:'.$iPhrase.') as p'.$iPhrase.',';
+ $aParams[':'.$iPhrase] = $oPhrase->getPhrase();
+ }
+ $sSQL = substr($sSQL, 0, -1);
+
+ Debug::printSQL($sSQL);
+ Debug::printVar('SQL parameters', $aParams);
+
+ $aNormPhrases = $this->oDB->getRow($sSQL, $aParams);
+
+ Debug::printVar('SQL result', $aNormPhrases);
+
+ // now compute all possible tokens
+ $aWordLists = array();
+ $aTokens = array();
+ foreach ($aNormPhrases as $sTitle => $sPhrase) {
+ if (strlen($sPhrase) > 0) {
+ $aWords = explode(' ', $sPhrase);
+ Tokenizer::addTokens($aTokens, $aWords);
+ $aWordLists[] = $aWords;
+ } else {
+ $aWordLists[] = array();
+ }
+ }
+
+ Debug::printVar('Tokens', $aTokens);
+ Debug::printVar('WordLists', $aWordLists);
+
+ $oValidTokens = $this->computeValidTokens($aTokens, $sNormQuery);
+
+ foreach ($aPhrases as $iPhrase => $oPhrase) {
+ $oPhrase->computeWordSets($aWordLists[$iPhrase], $oValidTokens);
+ }
+
+ return $oValidTokens;
+ }
+
+
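+    /**
+     * Look up the candidate tokens in the database and add fallback
+     * interpretations (US ZIP+4 postcodes, plain house numbers) for
+     * full-word tokens without a match.
+     */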
+ private function computeValidTokens($aTokens, $sNormQuery)
+ {
+ $oValidTokens = new TokenList();
+
+ if (!empty($aTokens)) {
+ $this->addTokensFromDB($oValidTokens, $aTokens, $sNormQuery);
+
+ // Try more interpretations for Tokens that could not be matched.
+ foreach ($aTokens as $sToken) {
+ if ($sToken[0] == ' ' && !$oValidTokens->contains($sToken)) {
+ if (preg_match('/^ ([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) {
+ // US ZIP+4 codes - merge in the 5-digit ZIP code
+ $oValidTokens->addToken(
+ $sToken,
+ new Token\Postcode(null, $aData[1], 'us')
+ );
+ } elseif (preg_match('/^ [0-9]+$/', $sToken)) {
+ // Unknown single word token with a number.
+ // Assume it is a house number.
+ $oValidTokens->addToken(
+ $sToken,
+ new Token\HouseNumber(null, trim($sToken))
+ );
+ }
+ }
+ }
+ }
+
+ return $oValidTokens;
+ }
+
+
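+    /**
+     * Look up the candidate tokens in the word table and add a matching
+     * Token object for each word found.
+     */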
+ private function addTokensFromDB(&$oValidTokens, $aTokens, $sNormQuery)
+ {
+ // Check which tokens we have, get the ID numbers
+ $sSQL = 'SELECT word_id, word_token, word, class, type, country_code,';
+ $sSQL .= ' operator, coalesce(search_name_count, 0) as count';
+ $sSQL .= ' FROM word WHERE word_token in (';
+ $sSQL .= join(',', $this->oDB->getDBQuotedList($aTokens)).')';
+
+ Debug::printSQL($sSQL);
+
+ $aDBWords = $this->oDB->getAll($sSQL, null, 'Could not get word tokens.');
+
+ foreach ($aDBWords as $aWord) {
+ $oToken = null;
+ $iId = (int) $aWord['word_id'];
+
+ if ($aWord['class']) {
+ // Special terms need to appear in their normalized form.
+ // (postcodes are not normalized in the word table)
+                if ($aWord['word']) {
+                    $sNormWord = $this->normalizeString($aWord['word']);
+                    if (strpos($sNormQuery, $sNormWord) === false) {
+                        continue;
+                    }
+                }
+
+ if ($aWord['class'] == 'place' && $aWord['type'] == 'house') {
+ $oToken = new Token\HouseNumber($iId, trim($aWord['word_token']));
+ } elseif ($aWord['class'] == 'place' && $aWord['type'] == 'postcode') {
+ if ($aWord['word']
+ && pg_escape_string($aWord['word']) == $aWord['word']
+ ) {
+ $oToken = new Token\Postcode(
+ $iId,
+ $aWord['word'],
+ $aWord['country_code']
+ );
+ }
+ } else {
+                    // near and in operators are treated the same at the moment
+ $oToken = new Token\SpecialTerm(
+ $iId,
+ $aWord['class'],
+ $aWord['type'],
+ $aWord['operator'] ? Operator::NEAR : Operator::NONE
+ );
+ }
+ } elseif ($aWord['country_code']) {
+ // Filter country tokens that do not match restricted countries.
+ if (!$this->aCountryRestriction
+ || in_array($aWord['country_code'], $this->aCountryRestriction)
+ ) {
+ $oToken = new Token\Country($iId, $aWord['country_code']);
+ }
+ } else {
+ $oToken = new Token\Word(
+ $iId,
+ $aWord['word_token'][0] != ' ',
+ (int) $aWord['count'],
+ substr_count($aWord['word_token'], ' ')
+ );
+ }
+
+ if ($oToken) {
+ $oValidTokens->addToken($aWord['word_token'], $oToken);
+ }
+ }
+ }
+
+
+ /**
+ * Add the tokens from this phrase to the given list of tokens.
+ *
+ * @param string[] $aTokens List of tokens to append.
+ *
+ * @return void
+ */
+ private static function addTokens(&$aTokens, $aWords)
+ {
+ $iNumWords = count($aWords);
+
+ for ($i = 0; $i < $iNumWords; $i++) {
+ $sPhrase = $aWords[$i];
+ $aTokens[' '.$sPhrase] = ' '.$sPhrase;
+ $aTokens[$sPhrase] = $sPhrase;
+
+ for ($j = $i + 1; $j < $iNumWords; $j++) {
+ $sPhrase .= ' '.$aWords[$j];
+ $aTokens[' '.$sPhrase] = ' '.$sPhrase;
+ $aTokens[$sPhrase] = $sPhrase;
+ }
+ }
+ }
+}
{% include('functions/utils.sql') %}
-{% include('functions/normalization.sql') %}
{% include('functions/ranking.sql') %}
{% include('functions/importance.sql') %}
{% include('functions/address_lookup.sql') %}
LANGUAGE plpgsql IMMUTABLE;
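+-- Derive the address of an interpolation line from one of its member
+-- nodes when the interpolation itself carries no other address tags.
+-- The result is marked as a derived address with an '_inherited' key.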
+CREATE OR REPLACE FUNCTION get_interpolation_address(in_address HSTORE, wayid BIGINT)
+RETURNS HSTORE
+ AS $$
+DECLARE
+ location RECORD;
+ waynodes BIGINT[];
+BEGIN
+ IF akeys(in_address) != ARRAY['interpolation'] THEN
+ RETURN in_address;
+ END IF;
+
+ SELECT nodes INTO waynodes FROM planet_osm_ways WHERE id = wayid;
+ FOR location IN
+ SELECT placex.address, placex.osm_id FROM placex
+ WHERE osm_type = 'N' and osm_id = ANY(waynodes)
+ and placex.address is not null
+ and (placex.address ? 'street' or placex.address ? 'place')
+ and indexed_status < 100
+ LOOP
+ -- mark it as a derived address
+ RETURN location.address || in_address || hstore('_inherited', '');
+ END LOOP;
+
+ RETURN in_address;
+END;
+$$
+LANGUAGE plpgsql STABLE;
+
+
+
-- find the parent road of the cut road parts
-CREATE OR REPLACE FUNCTION get_interpolation_parent(wayid BIGINT, street TEXT,
- place TEXT, partition SMALLINT,
+CREATE OR REPLACE FUNCTION get_interpolation_parent(street INTEGER[], place INTEGER[],
+ partition SMALLINT,
centroid GEOMETRY, geom GEOMETRY)
RETURNS BIGINT
AS $$
DECLARE
- addr_street TEXT;
- addr_place TEXT;
parent_place_id BIGINT;
-
- waynodes BIGINT[];
-
location RECORD;
BEGIN
- addr_street = street;
- addr_place = place;
-
- IF addr_street is null and addr_place is null THEN
- select nodes from planet_osm_ways where id = wayid INTO waynodes;
- FOR location IN SELECT placex.address from placex
- where osm_type = 'N' and osm_id = ANY(waynodes)
- and placex.address is not null
- and (placex.address ? 'street' or placex.address ? 'place')
- and indexed_status < 100
- limit 1 LOOP
- addr_street = location.address->'street';
- addr_place = location.address->'place';
- END LOOP;
- END IF;
-
- parent_place_id := find_parent_for_address(addr_street, addr_place,
- partition, centroid);
+ parent_place_id := find_parent_for_address(street, place, partition, centroid);
IF parent_place_id is null THEN
FOR location IN SELECT place_id FROM placex
NEW.interpolationtype = NEW.address->'interpolation';
place_centroid := ST_PointOnSurface(NEW.linegeo);
- NEW.parent_place_id = get_interpolation_parent(NEW.osm_id, NEW.address->'street',
- NEW.address->'place',
+ NEW.parent_place_id = get_interpolation_parent(token_addr_street_match_tokens(NEW.token_info),
+ token_addr_place_match_tokens(NEW.token_info),
NEW.partition, place_centroid, NEW.linegeo);
- IF NEW.address is not NULL AND NEW.address ? 'postcode' AND NEW.address->'postcode' not similar to '%(,|;)%' THEN
- interpol_postcode := NEW.address->'postcode';
- housenum := getorcreate_postcode_id(NEW.address->'postcode');
- ELSE
- interpol_postcode := NULL;
+ interpol_postcode := token_normalized_postcode(NEW.address->'postcode');
+
+ NEW.token_info := token_strip_info(NEW.token_info);
+ IF NEW.address ? '_inherited' THEN
+ NEW.address := hstore('interpolation', NEW.interpolationtype);
END IF;
-- if the line was newly inserted, split the line as necessary
-- determine postcode
postcode := coalesce(interpol_postcode,
- prevnode.address->'postcode',
- nextnode.address->'postcode',
+ token_normalized_postcode(prevnode.address->'postcode'),
+ token_normalized_postcode(nextnode.address->'postcode'),
postcode);
IF postcode is NULL THEN
- SELECT placex.postcode FROM placex WHERE place_id = NEW.parent_place_id INTO postcode;
+ SELECT token_normalized_postcode(placex.postcode)
+ FROM placex WHERE place_id = NEW.parent_place_id INTO postcode;
END IF;
IF postcode is NULL THEN
postcode := get_nearest_postcode(NEW.country_code, nextnode.geometry);
NEW.startnumber := startnumber;
NEW.endnumber := endnumber;
NEW.linegeo := sectiongeo;
- NEW.postcode := upper(trim(postcode));
+ NEW.postcode := postcode;
ELSE
insert into location_property_osmline
(linegeo, partition, osm_id, parent_place_id,
+++ /dev/null
--- Functions for term normalisation and access to the 'word' table.
-
-CREATE OR REPLACE FUNCTION transliteration(text) RETURNS text
- AS '{{ modulepath }}/nominatim.so', 'transliteration'
-LANGUAGE c IMMUTABLE STRICT;
-
-
-CREATE OR REPLACE FUNCTION gettokenstring(text) RETURNS text
- AS '{{ modulepath }}/nominatim.so', 'gettokenstring'
-LANGUAGE c IMMUTABLE STRICT;
-
-
-CREATE OR REPLACE FUNCTION make_standard_name(name TEXT) RETURNS TEXT
- AS $$
-DECLARE
- o TEXT;
-BEGIN
- o := public.gettokenstring(public.transliteration(name));
- RETURN trim(substr(o,1,length(o)));
-END;
-$$
-LANGUAGE plpgsql IMMUTABLE;
-
--- returns NULL if the word is too common
-CREATE OR REPLACE FUNCTION getorcreate_word_id(lookup_word TEXT)
- RETURNS INTEGER
- AS $$
-DECLARE
- lookup_token TEXT;
- return_word_id INTEGER;
- count INTEGER;
-BEGIN
- lookup_token := trim(lookup_word);
- SELECT min(word_id), max(search_name_count) FROM word
- WHERE word_token = lookup_token and class is null and type is null
- INTO return_word_id, count;
- IF return_word_id IS NULL THEN
- return_word_id := nextval('seq_word');
- INSERT INTO word VALUES (return_word_id, lookup_token, null, null, null, null, 0);
- ELSE
- IF count > get_maxwordfreq() THEN
- return_word_id := NULL;
- END IF;
- END IF;
- RETURN return_word_id;
-END;
-$$
-LANGUAGE plpgsql;
-
--- Create housenumber tokens from an OSM addr:housenumber.
--- The housnumber is split at comma and semicolon as necessary.
--- The function returns the normalized form of the housenumber suitable
--- for comparison.
-CREATE OR REPLACE FUNCTION create_housenumber_id(housenumber TEXT)
- RETURNS TEXT
- AS $$
-DECLARE
- normtext TEXT;
-BEGIN
- SELECT array_to_string(array_agg(trans), ';')
- INTO normtext
- FROM (SELECT lookup_word as trans, getorcreate_housenumber_id(lookup_word)
- FROM (SELECT make_standard_name(h) as lookup_word
- FROM regexp_split_to_table(housenumber, '[,;]') h) x) y;
-
- return normtext;
-END;
-$$ LANGUAGE plpgsql STABLE STRICT;
-
-CREATE OR REPLACE FUNCTION getorcreate_housenumber_id(lookup_word TEXT)
- RETURNS INTEGER
- AS $$
-DECLARE
- lookup_token TEXT;
- return_word_id INTEGER;
-BEGIN
- lookup_token := ' ' || trim(lookup_word);
- SELECT min(word_id) FROM word
- WHERE word_token = lookup_token and class='place' and type='house'
- INTO return_word_id;
- IF return_word_id IS NULL THEN
- return_word_id := nextval('seq_word');
- INSERT INTO word VALUES (return_word_id, lookup_token, null,
- 'place', 'house', null, 0);
- END IF;
- RETURN return_word_id;
-END;
-$$
-LANGUAGE plpgsql;
-
-
-CREATE OR REPLACE FUNCTION getorcreate_postcode_id(postcode TEXT)
- RETURNS INTEGER
- AS $$
-DECLARE
- lookup_token TEXT;
- lookup_word TEXT;
- return_word_id INTEGER;
-BEGIN
- lookup_word := upper(trim(postcode));
- lookup_token := ' ' || make_standard_name(lookup_word);
- SELECT min(word_id) FROM word
- WHERE word_token = lookup_token and word = lookup_word
- and class='place' and type='postcode'
- INTO return_word_id;
- IF return_word_id IS NULL THEN
- return_word_id := nextval('seq_word');
- INSERT INTO word VALUES (return_word_id, lookup_token, lookup_word,
- 'place', 'postcode', null, 0);
- END IF;
- RETURN return_word_id;
-END;
-$$
-LANGUAGE plpgsql;
-
-
-CREATE OR REPLACE FUNCTION getorcreate_country(lookup_word TEXT,
- lookup_country_code varchar(2))
- RETURNS INTEGER
- AS $$
-DECLARE
- lookup_token TEXT;
- return_word_id INTEGER;
-BEGIN
- lookup_token := ' '||trim(lookup_word);
- SELECT min(word_id) FROM word
- WHERE word_token = lookup_token and country_code=lookup_country_code
- INTO return_word_id;
- IF return_word_id IS NULL THEN
- return_word_id := nextval('seq_word');
- INSERT INTO word VALUES (return_word_id, lookup_token, null,
- null, null, lookup_country_code, 0);
- END IF;
- RETURN return_word_id;
-END;
-$$
-LANGUAGE plpgsql;
-
-
-CREATE OR REPLACE FUNCTION getorcreate_amenity(lookup_word TEXT, normalized_word TEXT,
- lookup_class text, lookup_type text)
- RETURNS INTEGER
- AS $$
-DECLARE
- lookup_token TEXT;
- return_word_id INTEGER;
-BEGIN
- lookup_token := ' '||trim(lookup_word);
- SELECT min(word_id) FROM word
- WHERE word_token = lookup_token and word = normalized_word
- and class = lookup_class and type = lookup_type
- INTO return_word_id;
- IF return_word_id IS NULL THEN
- return_word_id := nextval('seq_word');
- INSERT INTO word VALUES (return_word_id, lookup_token, normalized_word,
- lookup_class, lookup_type, null, 0);
- END IF;
- RETURN return_word_id;
-END;
-$$
-LANGUAGE plpgsql;
-
-
-CREATE OR REPLACE FUNCTION getorcreate_amenityoperator(lookup_word TEXT,
- normalized_word TEXT,
- lookup_class text,
- lookup_type text,
- op text)
- RETURNS INTEGER
- AS $$
-DECLARE
- lookup_token TEXT;
- return_word_id INTEGER;
-BEGIN
- lookup_token := ' '||trim(lookup_word);
- SELECT min(word_id) FROM word
- WHERE word_token = lookup_token and word = normalized_word
- and class = lookup_class and type = lookup_type and operator = op
- INTO return_word_id;
- IF return_word_id IS NULL THEN
- return_word_id := nextval('seq_word');
- INSERT INTO word VALUES (return_word_id, lookup_token, normalized_word,
- lookup_class, lookup_type, null, 0, op);
- END IF;
- RETURN return_word_id;
-END;
-$$
-LANGUAGE plpgsql;
-
-
-CREATE OR REPLACE FUNCTION getorcreate_name_id(lookup_word TEXT, src_word TEXT)
- RETURNS INTEGER
- AS $$
-DECLARE
- lookup_token TEXT;
- nospace_lookup_token TEXT;
- return_word_id INTEGER;
-BEGIN
- lookup_token := ' '||trim(lookup_word);
- SELECT min(word_id) FROM word
- WHERE word_token = lookup_token and class is null and type is null
- INTO return_word_id;
- IF return_word_id IS NULL THEN
- return_word_id := nextval('seq_word');
- INSERT INTO word VALUES (return_word_id, lookup_token, src_word,
- null, null, null, 0);
- END IF;
- RETURN return_word_id;
-END;
-$$
-LANGUAGE plpgsql;
-
-
-CREATE OR REPLACE FUNCTION getorcreate_name_id(lookup_word TEXT)
- RETURNS INTEGER
- AS $$
-DECLARE
-BEGIN
- RETURN getorcreate_name_id(lookup_word, '');
-END;
-$$
-LANGUAGE plpgsql;
-
--- Normalize a string and lookup its word ids (partial words).
-CREATE OR REPLACE FUNCTION addr_ids_from_name(lookup_word TEXT)
- RETURNS INTEGER[]
- AS $$
-DECLARE
- words TEXT[];
- id INTEGER;
- return_word_id INTEGER[];
- word_ids INTEGER[];
- j INTEGER;
-BEGIN
- words := string_to_array(make_standard_name(lookup_word), ' ');
- IF array_upper(words, 1) IS NOT NULL THEN
- FOR j IN 1..array_upper(words, 1) LOOP
- IF (words[j] != '') THEN
- SELECT array_agg(word_id) INTO word_ids
- FROM word
- WHERE word_token = words[j] and class is null and type is null;
-
- IF word_ids IS NULL THEN
- id := nextval('seq_word');
- INSERT INTO word VALUES (id, words[j], null, null, null, null, 0);
- return_word_id := return_word_id || id;
- ELSE
- return_word_id := array_merge(return_word_id, word_ids);
- END IF;
- END IF;
- END LOOP;
- END IF;
-
- RETURN return_word_id;
-END;
-$$
-LANGUAGE plpgsql;
-
-
--- Normalize a string and look up its name ids (full words).
-CREATE OR REPLACE FUNCTION word_ids_from_name(lookup_word TEXT)
- RETURNS INTEGER[]
- AS $$
-DECLARE
- lookup_token TEXT;
- return_word_ids INTEGER[];
-BEGIN
- lookup_token := ' '|| make_standard_name(lookup_word);
- SELECT array_agg(word_id) FROM word
- WHERE word_token = lookup_token and class is null and type is null
- INTO return_word_ids;
- RETURN return_word_ids;
-END;
-$$
-LANGUAGE plpgsql STABLE STRICT;
-
-
-CREATE OR REPLACE FUNCTION create_country(src HSTORE, country_code varchar(2))
- RETURNS VOID
- AS $$
-DECLARE
- s TEXT;
- w INTEGER;
- words TEXT[];
- item RECORD;
- j INTEGER;
-BEGIN
- FOR item IN SELECT (each(src)).* LOOP
-
- s := make_standard_name(item.value);
- w := getorcreate_country(s, country_code);
-
- words := regexp_split_to_array(item.value, E'[,;()]');
- IF array_upper(words, 1) != 1 THEN
- FOR j IN 1..array_upper(words, 1) LOOP
- s := make_standard_name(words[j]);
- IF s != '' THEN
- w := getorcreate_country(s, country_code);
- END IF;
- END LOOP;
- END IF;
- END LOOP;
-END;
-$$
-LANGUAGE plpgsql;
-
-
-CREATE OR REPLACE FUNCTION make_keywords(src HSTORE)
- RETURNS INTEGER[]
- AS $$
-DECLARE
- result INTEGER[];
- s TEXT;
- w INTEGER;
- words TEXT[];
- item RECORD;
- j INTEGER;
-BEGIN
- result := '{}'::INTEGER[];
-
- FOR item IN SELECT (each(src)).* LOOP
-
- s := make_standard_name(item.value);
- w := getorcreate_name_id(s, item.value);
-
- IF not(ARRAY[w] <@ result) THEN
- result := result || w;
- END IF;
-
- w := getorcreate_word_id(s);
-
- IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
- result := result || w;
- END IF;
-
- words := string_to_array(s, ' ');
- IF array_upper(words, 1) IS NOT NULL THEN
- FOR j IN 1..array_upper(words, 1) LOOP
- IF (words[j] != '') THEN
- w = getorcreate_word_id(words[j]);
- IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
- result := result || w;
- END IF;
- END IF;
- END LOOP;
- END IF;
-
- words := regexp_split_to_array(item.value, E'[,;()]');
- IF array_upper(words, 1) != 1 THEN
- FOR j IN 1..array_upper(words, 1) LOOP
- s := make_standard_name(words[j]);
- IF s != '' THEN
- w := getorcreate_word_id(s);
- IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
- result := result || w;
- END IF;
- END IF;
- END LOOP;
- END IF;
-
- s := regexp_replace(item.value, '市$', '');
- IF s != item.value THEN
- s := make_standard_name(s);
- IF s != '' THEN
- w := getorcreate_name_id(s, item.value);
- IF NOT (ARRAY[w] <@ result) THEN
- result := result || w;
- END IF;
- END IF;
- END IF;
-
- END LOOP;
-
- RETURN result;
-END;
-$$
-LANGUAGE plpgsql;
-
-
-CREATE OR REPLACE FUNCTION precompute_words(src TEXT)
- RETURNS INTEGER
- AS $$
-DECLARE
- s TEXT;
- w INTEGER;
- words TEXT[];
- i INTEGER;
- j INTEGER;
-BEGIN
- s := make_standard_name(src);
- w := getorcreate_name_id(s, src);
-
- w := getorcreate_word_id(s);
-
- words := string_to_array(s, ' ');
- IF array_upper(words, 1) IS NOT NULL THEN
- FOR j IN 1..array_upper(words, 1) LOOP
- IF (words[j] != '') THEN
- w := getorcreate_word_id(words[j]);
- END IF;
- END LOOP;
- END IF;
-
- words := regexp_split_to_array(src, E'[,;()]');
- IF array_upper(words, 1) != 1 THEN
- FOR j IN 1..array_upper(words, 1) LOOP
- s := make_standard_name(words[j]);
- IF s != '' THEN
- w := getorcreate_word_id(s);
- END IF;
- END LOOP;
- END IF;
-
- s := regexp_replace(src, '市$', '');
- IF s != src THEN
- s := make_standard_name(s);
- IF s != '' THEN
- w := getorcreate_name_id(s, src);
- END IF;
- END IF;
-
- RETURN 1;
-END;
-$$
-LANGUAGE plpgsql;
-
-
-CREATE OR REPLACE FUNCTION create_poi_search_terms(obj_place_id BIGINT,
- in_partition SMALLINT,
- parent_place_id BIGINT,
- address HSTORE,
- country TEXT,
- housenumber TEXT,
- initial_name_vector INTEGER[],
- geometry GEOMETRY,
- OUT name_vector INTEGER[],
- OUT nameaddress_vector INTEGER[])
- AS $$
-DECLARE
- parent_name_vector INTEGER[];
- parent_address_vector INTEGER[];
- addr_place_ids INTEGER[];
-
- addr_item RECORD;
- parent_address_place_ids BIGINT[];
- filtered_address HSTORE;
-BEGIN
- nameaddress_vector := '{}'::INTEGER[];
-
- SELECT s.name_vector, s.nameaddress_vector
- INTO parent_name_vector, parent_address_vector
- FROM search_name s
- WHERE s.place_id = parent_place_id;
-
- -- Find all address tags that don't appear in the parent search names.
- SELECT hstore(array_agg(ARRAY[k, v])) INTO filtered_address
- FROM (SELECT skeys(address) as k, svals(address) as v) a
- WHERE not addr_ids_from_name(v) && parent_address_vector
- AND k not in ('country', 'street', 'place', 'postcode',
- 'housenumber', 'streetnumber', 'conscriptionnumber');
-
- -- Compute all search terms from the addr: tags.
- IF filtered_address IS NOT NULL THEN
- FOR addr_item IN
- SELECT * FROM
- get_places_for_addr_tags(in_partition, geometry, filtered_address, country)
- LOOP
- IF addr_item.place_id is null THEN
- nameaddress_vector := array_merge(nameaddress_vector,
- addr_item.keywords);
- CONTINUE;
- END IF;
-
- IF parent_address_place_ids is null THEN
- SELECT array_agg(parent_place_id) INTO parent_address_place_ids
- FROM place_addressline
- WHERE place_id = parent_place_id;
- END IF;
-
- IF not parent_address_place_ids @> ARRAY[addr_item.place_id] THEN
- nameaddress_vector := array_merge(nameaddress_vector,
- addr_item.keywords);
-
- INSERT INTO place_addressline (place_id, address_place_id, fromarea,
- isaddress, distance, cached_rank_address)
- VALUES (obj_place_id, addr_item.place_id, not addr_item.isguess,
- true, addr_item.distance, addr_item.rank_address);
- END IF;
- END LOOP;
- END IF;
-
- name_vector := initial_name_vector;
-
- -- Check if the parent covers all address terms.
- -- If not, create a search name entry with the house number as the name.
- -- This is unusual for the search_name table but prevents that the place
- -- is returned when we only search for the street/place.
-
- IF housenumber is not null and not nameaddress_vector <@ parent_address_vector THEN
- name_vector := array_merge(name_vector,
- ARRAY[getorcreate_housenumber_id(make_standard_name(housenumber))]);
- END IF;
-
- IF not address ? 'street' and address ? 'place' THEN
- addr_place_ids := addr_ids_from_name(address->'place');
- IF not addr_place_ids <@ parent_name_vector THEN
- -- make sure addr:place terms are always searchable
- nameaddress_vector := array_merge(nameaddress_vector, addr_place_ids);
- -- If there is a housenumber, also add the place name as a name,
- -- so we can search it by the usual housenumber+place algorithms.
- IF housenumber is not null THEN
- name_vector := array_merge(name_vector,
- ARRAY[getorcreate_name_id(make_standard_name(address->'place'))]);
- END IF;
- END IF;
- END IF;
-
- -- Cheating here by not recomputing all terms but simply using the ones
- -- from the parent object.
- nameaddress_vector := array_merge(nameaddress_vector, parent_name_vector);
- nameaddress_vector := array_merge(nameaddress_vector, parent_address_vector);
-
-END;
-$$
-LANGUAGE plpgsql;
$$
LANGUAGE plpgsql STABLE;
-CREATE OR REPLACE FUNCTION get_places_for_addr_tags(in_partition SMALLINT,
- feature GEOMETRY,
- address HSTORE, country TEXT)
- RETURNS SETOF nearfeaturecentr
+
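+-- Find the address area that matches the given match tokens within the
+-- given address rank range around the feature (expanded by 'extent').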
+CREATE OR REPLACE FUNCTION get_address_place(in_partition SMALLINT, feature GEOMETRY,
+ from_rank SMALLINT, to_rank SMALLINT,
+ extent FLOAT, tokens INT[])
+ RETURNS nearfeaturecentr
AS $$
DECLARE
r nearfeaturecentr%rowtype;
- item RECORD;
BEGIN
- FOR item IN
- SELECT (get_addr_tag_rank(key, country)).*, key, name FROM
- (SELECT skeys(address) as key, svals(address) as name) x
- LOOP
- IF item.from_rank is null THEN
- CONTINUE;
- END IF;
-
{% for partition in db.partitions %}
- IF in_partition = {{ partition }} THEN
- SELECT place_id, keywords, rank_address, rank_search,
- min(ST_Distance(feature, centroid)) as distance,
- isguess, postcode, centroid INTO r
+ IF in_partition = {{ partition }} THEN
+ SELECT place_id, keywords, rank_address, rank_search,
+ min(ST_Distance(feature, centroid)) as distance,
+ isguess, postcode, centroid INTO r
FROM location_area_large_{{ partition }}
- WHERE geometry && ST_Expand(feature, item.extent)
- AND rank_address between item.from_rank and item.to_rank
- AND word_ids_from_name(item.name) && keywords
+ WHERE geometry && ST_Expand(feature, extent)
+ AND rank_address between from_rank and to_rank
+ AND tokens && keywords
GROUP BY place_id, keywords, rank_address, rank_search, isguess, postcode, centroid
ORDER BY bool_or(ST_Intersects(geometry, feature)), distance LIMIT 1;
- IF r.place_id is null THEN
- -- If we cannot find a place for the term, just return the
- -- search term for the given name. That ensures that the address
- -- element can still be searched for, even though it will not be
- -- displayed.
- RETURN NEXT ROW(null, addr_ids_from_name(item.name), null, null,
- null, null, null, null)::nearfeaturecentr;
- ELSE
- RETURN NEXT r;
- END IF;
- CONTINUE;
- END IF;
+ RETURN r;
+ END IF;
{% endfor %}
- RAISE EXCEPTION 'Unknown partition %', in_partition;
- END LOOP;
+ RAISE EXCEPTION 'Unknown partition %', in_partition;
END;
$$
LANGUAGE plpgsql STABLE;
+
create or replace function deleteLocationArea(in_partition INTEGER, in_place_id BIGINT, in_rank_search INTEGER) RETURNS BOOLEAN AS $$
DECLARE
BEGIN
-- Trigger functions for the placex table.
+-- Retrieve the data needed by the indexer for updating the place.
+--
+-- Return parameters:
+-- name list of names
+-- address list of address tags, either from the object or a surrounding
+-- building
+-- country_feature If the place is a country feature, this contains the
+-- country code, otherwise it is null.
+CREATE OR REPLACE FUNCTION placex_prepare_update(p placex,
+ OUT name HSTORE,
+ OUT address HSTORE,
+ OUT country_feature VARCHAR)
+ AS $$
+BEGIN
+ -- For POI nodes, check if the address should be derived from a surrounding
+ -- building.
+ IF p.rank_search < 30 OR p.osm_type != 'N' OR p.address is not null THEN
+ address := p.address;
+ ELSE
+ -- The additional && condition works around the misguided query
+ -- planner of postgis 3.0.
+ SELECT placex.address || hstore('_inherited', '') INTO address
+ FROM placex
+ WHERE ST_Covers(geometry, p.centroid)
+ and geometry && p.centroid
+ and placex.address is not null
+ and (placex.address ? 'housenumber' or placex.address ? 'street' or placex.address ? 'place')
+ and rank_search = 30 AND ST_GeometryType(geometry) in ('ST_Polygon','ST_MultiPolygon')
+ LIMIT 1;
+ END IF;
+
+ address := address - '_unlisted_place'::TEXT;
+ name := p.name;
+
+ country_feature := CASE WHEN p.admin_level = 2
+ and p.class = 'boundary' and p.type = 'administrative'
+ and p.osm_type = 'R'
+ THEN p.country_code
+ ELSE null
+ END;
+END;
+$$
+LANGUAGE plpgsql STABLE;
+
+
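+-- Find the parent street of a POI that is a member of an
+-- associatedStreet relation.
+--
+-- \returns Place ID of the street or NULL if none was found.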
+CREATE OR REPLACE FUNCTION find_associated_street(poi_osm_type CHAR(1),
+ poi_osm_id BIGINT)
+ RETURNS BIGINT
+ AS $$
+DECLARE
+ location RECORD;
+ parent RECORD;
+BEGIN
+ FOR location IN
+ SELECT members FROM planet_osm_rels
+ WHERE parts @> ARRAY[poi_osm_id]
+ and members @> ARRAY[lower(poi_osm_type) || poi_osm_id]
+ and tags @> ARRAY['associatedStreet']
+ LOOP
+ FOR i IN 1..array_upper(location.members, 1) BY 2 LOOP
+ IF location.members[i+1] = 'street' THEN
+ FOR parent IN
+ SELECT place_id from placex
+ WHERE osm_type = 'W' and osm_id = substring(location.members[i],2)::bigint
+ and name is not null
+ and rank_search between 26 and 27
+ LOOP
+ RETURN parent.place_id;
+ END LOOP;
+ END IF;
+ END LOOP;
+ END LOOP;
+
+ RETURN NULL;
+END;
+$$
+LANGUAGE plpgsql STABLE;
+
+
-- Find the parent road of a POI.
--
-- \returns Place ID of parent object or NULL if none
poi_osm_id BIGINT,
poi_partition SMALLINT,
bbox GEOMETRY,
- addr_street TEXT,
- addr_place TEXT,
- fallback BOOL = true)
+ addr_street INTEGER[],
+ addr_place INTEGER[],
+ is_place_addr BOOLEAN)
RETURNS BIGINT
AS $$
DECLARE
parent_place_id BIGINT DEFAULT NULL;
location RECORD;
- parent RECORD;
BEGIN
- {% if debug %}RAISE WARNING 'finding street for % %', poi_osm_type, poi_osm_id;{% endif %}
+ {% if debug %}RAISE WARNING 'finding street for % %', poi_osm_type, poi_osm_id;{% endif %}
+
+ -- Is this object part of an associatedStreet relation?
+ parent_place_id := find_associated_street(poi_osm_type, poi_osm_id);
- -- Is this object part of an associatedStreet relation?
+ IF parent_place_id is null THEN
+ parent_place_id := find_parent_for_address(addr_street, addr_place,
+ poi_partition, bbox);
+ END IF;
+
+ IF parent_place_id is null and poi_osm_type = 'N' THEN
+ -- Is this node part of an interpolation?
FOR location IN
- SELECT members FROM planet_osm_rels
- WHERE parts @> ARRAY[poi_osm_id]
- and members @> ARRAY[lower(poi_osm_type) || poi_osm_id]
- and tags @> ARRAY['associatedStreet']
+ SELECT q.parent_place_id
+ FROM location_property_osmline q, planet_osm_ways x
+ WHERE q.linegeo && bbox and x.id = q.osm_id
+ and poi_osm_id = any(x.nodes)
+ LIMIT 1
LOOP
- FOR i IN 1..array_upper(location.members, 1) BY 2 LOOP
- IF location.members[i+1] = 'street' THEN
- FOR parent IN
- SELECT place_id from placex
- WHERE osm_type = 'W' and osm_id = substring(location.members[i],2)::bigint
- and name is not null
- and rank_search between 26 and 27
- LOOP
- RETURN parent.place_id;
- END LOOP;
- END IF;
- END LOOP;
+ {% if debug %}RAISE WARNING 'Get parent from interpolation: %', location.parent_place_id;{% endif %}
+ RETURN location.parent_place_id;
END LOOP;
- parent_place_id := find_parent_for_address(addr_street, addr_place,
- poi_partition, bbox);
- IF parent_place_id is not null THEN
- RETURN parent_place_id;
- END IF;
+ FOR location IN
+ SELECT p.place_id, p.osm_id, p.rank_search, p.address,
+ coalesce(p.centroid, ST_Centroid(p.geometry)) as centroid
+ FROM placex p, planet_osm_ways w
+ WHERE p.osm_type = 'W' and p.rank_search >= 26
+ and p.geometry && bbox
+ and w.id = p.osm_id and poi_osm_id = any(w.nodes)
+ LOOP
+ {% if debug %}RAISE WARNING 'Node is part of way % ', location.osm_id;{% endif %}
+
+      -- If the way IS a road, then we are on it - that must be our road
+ IF location.rank_search < 28 THEN
+ {% if debug %}RAISE WARNING 'node in way that is a street %',location;{% endif %}
+ RETURN location.place_id;
+ END IF;
+
+ parent_place_id := find_associated_street('W', location.osm_id);
+ END LOOP;
+ END IF;
- IF poi_osm_type = 'N' THEN
- -- Is this node part of an interpolation?
- FOR parent IN
- SELECT q.parent_place_id
- FROM location_property_osmline q, planet_osm_ways x
- WHERE q.linegeo && bbox and x.id = q.osm_id
- and poi_osm_id = any(x.nodes)
- LIMIT 1
+ IF parent_place_id is NULL THEN
+ IF is_place_addr THEN
+ -- The address is attached to a place we don't know.
+ -- Instead simply use the containing area with the largest rank.
+ FOR location IN
+ SELECT place_id FROM placex
+ WHERE bbox && geometry AND _ST_Covers(geometry, ST_Centroid(bbox))
+ AND rank_address between 5 and 25
+ ORDER BY rank_address desc
LOOP
- {% if debug %}RAISE WARNING 'Get parent from interpolation: %', parent.parent_place_id;{% endif %}
- RETURN parent.parent_place_id;
+ RETURN location.place_id;
END LOOP;
-
- -- Is this node part of any other way?
+ ELSEIF ST_Area(bbox) < 0.005 THEN
+ -- for smaller features get the nearest road
+ SELECT getNearestRoadPlaceId(poi_partition, bbox) INTO parent_place_id;
+ {% if debug %}RAISE WARNING 'Checked for nearest way (%)', parent_place_id;{% endif %}
+ ELSE
+      -- for larger features simply find the area with the largest rank that
+      -- contains the bbox; use only addressable features
FOR location IN
- SELECT p.place_id, p.osm_id, p.rank_search, p.address,
- coalesce(p.centroid, ST_Centroid(p.geometry)) as centroid
- FROM placex p, planet_osm_ways w
- WHERE p.osm_type = 'W' and p.rank_search >= 26
- and p.geometry && bbox
- and w.id = p.osm_id and poi_osm_id = any(w.nodes)
+ SELECT place_id FROM placex
+ WHERE bbox && geometry AND _ST_Covers(geometry, ST_Centroid(bbox))
+ AND rank_address between 5 and 25
+ ORDER BY rank_address desc
LOOP
- {% if debug %}RAISE WARNING 'Node is part of way % ', location.osm_id;{% endif %}
-
- -- Way IS a road then we are on it - that must be our road
- IF location.rank_search < 28 THEN
- {% if debug %}RAISE WARNING 'node in way that is a street %',location;{% endif %}
- return location.place_id;
- END IF;
-
- SELECT find_parent_for_poi('W', location.osm_id, poi_partition,
- location.centroid,
- location.address->'street',
- location.address->'place',
- false)
- INTO parent_place_id;
- IF parent_place_id is not null THEN
- RETURN parent_place_id;
- END IF;
+ RETURN location.place_id;
END LOOP;
END IF;
+ END IF;
- IF fallback THEN
- IF addr_street is null and addr_place is not null THEN
- -- The address is attached to a place we don't know.
- -- Instead simply use the containing area with the largest rank.
- FOR location IN
- SELECT place_id FROM placex
- WHERE bbox && geometry AND _ST_Covers(geometry, ST_Centroid(bbox))
- AND rank_address between 5 and 25
- ORDER BY rank_address desc
- LOOP
- RETURN location.place_id;
- END LOOP;
- ELSEIF ST_Area(bbox) < 0.005 THEN
- -- for smaller features get the nearest road
- SELECT getNearestRoadPlaceId(poi_partition, bbox) INTO parent_place_id;
- {% if debug %}RAISE WARNING 'Checked for nearest way (%)', parent_place_id;{% endif %}
- ELSE
- -- for larger features simply find the area with the largest rank that
- -- contains the bbox, only use addressable features
- FOR location IN
- SELECT place_id FROM placex
- WHERE bbox && geometry AND _ST_Covers(geometry, ST_Centroid(bbox))
- AND rank_address between 5 and 25
- ORDER BY rank_address desc
- LOOP
- RETURN location.place_id;
- END LOOP;
- END IF;
- END IF;
-
- RETURN parent_place_id;
+ RETURN parent_place_id;
END;
$$
LANGUAGE plpgsql STABLE;
LANGUAGE plpgsql STABLE;
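+-- Compute the name and address search vectors of a POI from its token
+-- info and the search terms of its parent. Address parts not covered
+-- by the parent are added to the place_addressline table.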
+CREATE OR REPLACE FUNCTION create_poi_search_terms(obj_place_id BIGINT,
+ in_partition SMALLINT,
+ parent_place_id BIGINT,
+ is_place_addr BOOLEAN,
+ country TEXT,
+ token_info JSONB,
+ geometry GEOMETRY,
+ OUT name_vector INTEGER[],
+ OUT nameaddress_vector INTEGER[])
+ AS $$
+DECLARE
+ parent_name_vector INTEGER[];
+ parent_address_vector INTEGER[];
+ addr_place_ids INTEGER[];
+ hnr_vector INTEGER[];
+
+ addr_item RECORD;
+ addr_place RECORD;
+ parent_address_place_ids BIGINT[];
+BEGIN
+ nameaddress_vector := '{}'::INTEGER[];
+
+ SELECT s.name_vector, s.nameaddress_vector
+ INTO parent_name_vector, parent_address_vector
+ FROM search_name s
+ WHERE s.place_id = parent_place_id;
+
+ FOR addr_item IN
+ SELECT (get_addr_tag_rank(key, country)).*, match_tokens, search_tokens
+ FROM token_get_address_tokens(token_info)
+ WHERE not search_tokens <@ parent_address_vector
+ LOOP
+ addr_place := get_address_place(in_partition, geometry,
+ addr_item.from_rank, addr_item.to_rank,
+ addr_item.extent, addr_item.match_tokens);
+
+ IF addr_place is null THEN
+ -- No place found in OSM that matches. Make it at least searchable.
+ nameaddress_vector := array_merge(nameaddress_vector, addr_item.search_tokens);
+ ELSE
+ IF parent_address_place_ids is null THEN
+ SELECT array_agg(parent_place_id) INTO parent_address_place_ids
+ FROM place_addressline
+ WHERE place_id = parent_place_id;
+ END IF;
+
+      -- If the parent already lists the place in place_addressline, then we
+      -- are done. Otherwise, add our own place_addressline entry.
+ IF not parent_address_place_ids @> ARRAY[addr_place.place_id] THEN
+ nameaddress_vector := array_merge(nameaddress_vector, addr_place.keywords);
+
+ INSERT INTO place_addressline (place_id, address_place_id, fromarea,
+ isaddress, distance, cached_rank_address)
+ VALUES (obj_place_id, addr_place.place_id, not addr_place.isguess,
+ true, addr_place.distance, addr_place.rank_address);
+ END IF;
+ END IF;
+ END LOOP;
+
+ name_vector := token_get_name_search_tokens(token_info);
+
+ -- Check if the parent covers all address terms.
+ -- If not, create a search name entry with the house number as the name.
+  -- This is unusual for the search_name table but prevents the place
+  -- from being returned when we search only for the street/place.
+
+ hnr_vector := token_get_housenumber_search_tokens(token_info);
+
+ IF hnr_vector is not null and not nameaddress_vector <@ parent_address_vector THEN
+ name_vector := array_merge(name_vector, hnr_vector);
+ END IF;
+
+ IF is_place_addr THEN
+ addr_place_ids := token_addr_place_search_tokens(token_info);
+ IF not addr_place_ids <@ parent_name_vector THEN
+ -- make sure addr:place terms are always searchable
+ nameaddress_vector := array_merge(nameaddress_vector, addr_place_ids);
+ -- If there is a housenumber, also add the place name as a name,
+ -- so we can search it by the usual housenumber+place algorithms.
+ IF hnr_vector is not null THEN
+ name_vector := array_merge(name_vector, addr_place_ids);
+ END IF;
+ END IF;
+ END IF;
+
+ -- Cheating here by not recomputing all terms but simply using the ones
+ -- from the parent object.
+ nameaddress_vector := array_merge(nameaddress_vector, parent_name_vector);
+ nameaddress_vector := array_merge(nameaddress_vector, parent_address_vector);
+
+END;
+$$
+LANGUAGE plpgsql;
+
+
-- Insert address of a place into the place_addressline table.
--
-- \param obj_place_id Place_id of the place to compute the address for.
CREATE OR REPLACE FUNCTION insert_addresslines(obj_place_id BIGINT,
partition SMALLINT,
maxrank SMALLINT,
- address HSTORE,
+ token_info JSONB,
geometry GEOMETRY,
country TEXT,
OUT parent_place_id BIGINT,
current_node_area GEOMETRY := NULL;
parent_place_rank INT := 0;
- addr_place_ids BIGINT[];
+  addr_place_ids BIGINT[] := '{}'::BIGINT[];
+ new_address_vector INT[];
location RECORD;
BEGIN
address_havelevel := array_fill(false, ARRAY[maxrank]);
FOR location IN
- SELECT * FROM get_places_for_addr_tags(partition, geometry,
- address, country)
- ORDER BY rank_address, distance, isguess desc
+ SELECT (get_address_place(partition, geometry, from_rank, to_rank,
+ extent, match_tokens)).*, search_tokens
+ FROM (SELECT (get_addr_tag_rank(key, country)).*, match_tokens, search_tokens
+ FROM token_get_address_tokens(token_info)) x
+ ORDER BY rank_address, distance, isguess desc
LOOP
- {% if not db.reverse_only %}
- nameaddress_vector := array_merge(nameaddress_vector,
- location.keywords::int[]);
- {% endif %}
+ IF location.place_id is null THEN
+ {% if not db.reverse_only %}
+ nameaddress_vector := array_merge(nameaddress_vector, location.search_tokens);
+ {% endif %}
+ ELSE
+ {% if not db.reverse_only %}
+ nameaddress_vector := array_merge(nameaddress_vector, location.keywords::INTEGER[]);
+ {% endif %}
- IF location.place_id is not null THEN
location_isaddress := not address_havelevel[location.rank_address];
IF not address_havelevel[location.rank_address] THEN
address_havelevel[location.rank_address] := true;
VALUES (obj_place_id, location.place_id, not location.isguess,
true, location.distance, location.rank_address);
- addr_place_ids := array_append(addr_place_ids, location.place_id);
+ addr_place_ids := addr_place_ids || location.place_id;
END IF;
END LOOP;
FOR location IN
SELECT * FROM getNearFeatures(partition, geometry, maxrank)
- WHERE addr_place_ids is null or not addr_place_ids @> ARRAY[place_id]
+ WHERE not addr_place_ids @> ARRAY[place_id]
ORDER BY rank_address, isguess asc,
distance *
CASE WHEN rank_address = 16 AND rank_search = 15 THEN 0.2
NEW.place_id := nextval('seq_place');
NEW.indexed_status := 1; --STATUS_NEW
- NEW.country_code := lower(get_country_code(NEW.geometry));
+ NEW.centroid := ST_PointOnSurface(NEW.geometry);
+ NEW.country_code := lower(get_country_code(NEW.centroid));
NEW.partition := get_partition(NEW.country_code);
- NEW.geometry_sector := geometry_sector(NEW.partition, NEW.geometry);
+ NEW.geometry_sector := geometry_sector(NEW.partition, NEW.centroid);
IF NEW.osm_type = 'X' THEN
-- E'X'ternal records should already be in the right format so do nothing
parent_address_level SMALLINT;
place_address_level SMALLINT;
- addr_street TEXT;
- addr_place TEXT;
+ addr_street INTEGER[];
+ addr_place INTEGER[];
max_rank SMALLINT;
nameaddress_vector INTEGER[];
addr_nameaddress_vector INTEGER[];
- inherited_address HSTORE;
-
linked_node_id BIGINT;
linked_importance FLOAT;
linked_wikipedia TEXT;
+ is_place_address BOOLEAN;
result BOOLEAN;
BEGIN
-- deferred delete
-- update not necessary for osmline, cause linked_place_id does not exist
NEW.extratags := NEW.extratags - 'linked_place'::TEXT;
- NEW.address := NEW.address - '_unlisted_place'::TEXT;
IF NEW.linked_place_id is not null THEN
+ NEW.token_info := null;
{% if debug %}RAISE WARNING 'place already linked to %', NEW.linked_place_id;{% endif %}
RETURN NEW;
END IF;
-- imported as place=postcode. That's why relations are allowed to pass here.
-- This can go away in a couple of versions.
IF NEW.class = 'place' and NEW.type = 'postcode' and NEW.osm_type != 'R' THEN
+ NEW.token_info := null;
RETURN NEW;
END IF;
- -- Speed up searches - just use the centroid of the feature
- -- cheaper but less acurate
+ -- Compute a preliminary centroid.
NEW.centroid := ST_PointOnSurface(NEW.geometry);
- {% if debug %}RAISE WARNING 'Computing preliminary centroid at %',ST_AsText(NEW.centroid);{% endif %}
+
+ -- recalculate country and partition
+ IF NEW.rank_search = 4 AND NEW.address is not NULL AND NEW.address ? 'country' THEN
+ -- for countries, believe the mapped country code,
+ -- so that we remain in the right partition if the boundaries
+ -- suddenly expand.
+ NEW.country_code := lower(NEW.address->'country');
+ NEW.partition := get_partition(lower(NEW.country_code));
+ IF NEW.partition = 0 THEN
+ NEW.country_code := lower(get_country_code(NEW.centroid));
+ NEW.partition := get_partition(NEW.country_code);
+ END IF;
+ ELSE
+ IF NEW.rank_search >= 4 THEN
+ NEW.country_code := lower(get_country_code(NEW.centroid));
+ ELSE
+ NEW.country_code := NULL;
+ END IF;
+ NEW.partition := get_partition(NEW.country_code);
+ END IF;
+ {% if debug %}RAISE WARNING 'Country updated: "%"', NEW.country_code;{% endif %}
+
-- recompute the ranks, they might change when linking changes
SELECT * INTO NEW.rank_search, NEW.rank_address
parent_address_level := 3;
END IF;
- {% if debug %}RAISE WARNING 'Copy over address tags';{% endif %}
- -- housenumber is a computed field, so start with an empty value
- NEW.housenumber := NULL;
- IF NEW.address is not NULL THEN
- IF NEW.address ? 'conscriptionnumber' THEN
- IF NEW.address ? 'streetnumber' THEN
- NEW.housenumber := (NEW.address->'conscriptionnumber') || '/' || (NEW.address->'streetnumber');
- ELSE
- NEW.housenumber := NEW.address->'conscriptionnumber';
- END IF;
- ELSEIF NEW.address ? 'streetnumber' THEN
- NEW.housenumber := NEW.address->'streetnumber';
- ELSEIF NEW.address ? 'housenumber' THEN
- NEW.housenumber := NEW.address->'housenumber';
- END IF;
- NEW.housenumber := create_housenumber_id(NEW.housenumber);
-
- addr_street := NEW.address->'street';
- addr_place := NEW.address->'place';
-
- IF NEW.address ? 'postcode' and NEW.address->'postcode' not similar to '%(:|,|;)%' THEN
- i := getorcreate_postcode_id(NEW.address->'postcode');
- END IF;
- END IF;
+ NEW.housenumber := token_normalized_housenumber(NEW.token_info);
+ addr_street := token_addr_street_match_tokens(NEW.token_info);
+ addr_place := token_addr_place_match_tokens(NEW.token_info);
NEW.postcode := null;
- -- recalculate country and partition
- IF NEW.rank_search = 4 AND NEW.address is not NULL AND NEW.address ? 'country' THEN
- -- for countries, believe the mapped country code,
- -- so that we remain in the right partition if the boundaries
- -- suddenly expand.
- NEW.country_code := lower(NEW.address->'country');
- NEW.partition := get_partition(lower(NEW.country_code));
- IF NEW.partition = 0 THEN
- NEW.country_code := lower(get_country_code(NEW.centroid));
- NEW.partition := get_partition(NEW.country_code);
- END IF;
- ELSE
- IF NEW.rank_search >= 4 THEN
- NEW.country_code := lower(get_country_code(NEW.centroid));
- ELSE
- NEW.country_code := NULL;
- END IF;
- NEW.partition := get_partition(NEW.country_code);
- END IF;
- {% if debug %}RAISE WARNING 'Country updated: "%"', NEW.country_code;{% endif %}
-
-- waterway ways are linked when they are part of a relation and have the same class/type
IF NEW.osm_type = 'R' and NEW.class = 'waterway' THEN
FOR relation_members IN select members from planet_osm_rels r where r.id = NEW.osm_id and r.parts != array[]::bigint[]
{% if debug %}RAISE WARNING 'finding street for % %', NEW.osm_type, NEW.osm_id;{% endif %}
NEW.parent_place_id := null;
-
- -- if we have a POI and there is no address information,
- -- see if we can get it from a surrounding building
- inherited_address := ''::HSTORE;
- IF NEW.osm_type = 'N' AND addr_street IS NULL AND addr_place IS NULL
- AND NEW.housenumber IS NULL THEN
- FOR location IN
- -- The additional && condition works around the misguided query
- -- planner of postgis 3.0.
- SELECT address from placex where ST_Covers(geometry, NEW.centroid)
- and geometry && NEW.centroid
- and (address ? 'housenumber' or address ? 'street' or address ? 'place')
- and rank_search > 28 AND ST_GeometryType(geometry) in ('ST_Polygon','ST_MultiPolygon')
- limit 1
- LOOP
- NEW.housenumber := location.address->'housenumber';
- addr_street := location.address->'street';
- addr_place := location.address->'place';
- inherited_address := location.address;
- END LOOP;
- END IF;
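+ -- POIs that have an addr:place but no addr:street tag get a place
+ -- rather than a street as parent.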
+ is_place_address := coalesce(not NEW.address ? 'street' and NEW.address ? 'place', FALSE);
-- We have to find our parent road.
NEW.parent_place_id := find_parent_for_poi(NEW.osm_type, NEW.osm_id,
NEW.partition,
ST_Envelope(NEW.geometry),
- addr_street, addr_place);
+ addr_street, addr_place,
+ is_place_address);
-- If we found the road take a shortcut here.
-- Otherwise fall back to the full address getting method below.
SELECT p.country_code, p.postcode, p.name FROM placex p
WHERE p.place_id = NEW.parent_place_id INTO location;
- IF addr_street is null and addr_place is not null THEN
+ IF is_place_address THEN
-- Check if the addr:place tag is part of the parent name
SELECT count(*) INTO i
- FROM svals(location.name) AS pname WHERE pname = addr_place;
+ FROM svals(location.name) AS pname WHERE pname = NEW.address->'place';
IF i = 0 THEN
- NEW.address = NEW.address || hstore('_unlisted_place', addr_place);
+ NEW.address = NEW.address || hstore('_unlisted_place', NEW.address->'place');
END IF;
END IF;
{% if debug %}RAISE WARNING 'Got parent details from search name';{% endif %}
-- determine postcode
- IF NEW.address is not null AND NEW.address ? 'postcode' THEN
- NEW.postcode = upper(trim(NEW.address->'postcode'));
- ELSE
- NEW.postcode := location.postcode;
- END IF;
- IF NEW.postcode is null THEN
- NEW.postcode := get_nearest_postcode(NEW.country_code, NEW.geometry);
- END IF;
+ NEW.postcode := coalesce(token_normalized_postcode(NEW.address->'postcode'),
+ location.postcode,
+ get_nearest_postcode(NEW.country_code, NEW.geometry));
IF NEW.name is not NULL THEN
NEW.name := add_default_place_name(NEW.country_code, NEW.name);
- name_vector := make_keywords(NEW.name);
-
- IF NEW.rank_search <= 25 and NEW.rank_address > 0 THEN
- result := add_location(NEW.place_id, NEW.country_code, NEW.partition,
- name_vector, NEW.rank_search, NEW.rank_address,
- upper(trim(NEW.address->'postcode')), NEW.geometry,
- NEW.centroid);
- {% if debug %}RAISE WARNING 'Place added to location table';{% endif %}
- END IF;
-
END IF;
{% if not db.reverse_only %}
- IF array_length(name_vector, 1) is not NULL
- OR inherited_address is not NULL OR NEW.address is not NULL
- THEN
+ IF NEW.name is not NULL OR NEW.address is not NULL THEN
SELECT * INTO name_vector, nameaddress_vector
FROM create_poi_search_terms(NEW.place_id,
NEW.partition, NEW.parent_place_id,
- inherited_address || NEW.address,
- NEW.country_code, NEW.housenumber,
- name_vector, NEW.centroid);
+ is_place_address, NEW.country_code,
+ NEW.token_info, NEW.centroid);
IF array_length(name_vector, 1) is not NULL THEN
INSERT INTO search_name (place_id, search_rank, address_rank,
END IF;
{% endif %}
+ NEW.token_info := token_strip_info(NEW.token_info);
+ -- If the address was inherited from a surrounding building,
+ -- do not add it permanently to the table.
+ IF NEW.address ? '_inherited' THEN
+ IF NEW.address ? '_unlisted_place' THEN
+ NEW.address := hstore('_unlisted_place', NEW.address->'_unlisted_place');
+ ELSE
+ NEW.address := null;
+ END IF;
+ END IF;
+
RETURN NEW;
END IF;
END IF;
END IF;
- -- Initialise the name vector using our name
- NEW.name := add_default_place_name(NEW.country_code, NEW.name);
- name_vector := make_keywords(NEW.name);
-
- -- make sure all names are in the word table
IF NEW.admin_level = 2
AND NEW.class = 'boundary' AND NEW.type = 'administrative'
AND NEW.country_code IS NOT NULL AND NEW.osm_type = 'R'
THEN
- PERFORM create_country(NEW.name, lower(NEW.country_code));
- {% if debug %}RAISE WARNING 'Country names updated';{% endif %}
-
- -- Also update the list of country names. Adding an additional sanity
+ -- Update the list of country names. Adding an additional sanity
-- check here: make sure the country does overlap with the area where
-- we expect it to be as per static country grid.
FOR location IN
ELSEIF NEW.rank_address > 25 THEN
max_rank := 25;
ELSE
- max_rank = NEW.rank_address;
+ max_rank := NEW.rank_address;
END IF;
SELECT * FROM insert_addresslines(NEW.place_id, NEW.partition, max_rank,
- NEW.address, geom, NEW.country_code)
+ NEW.token_info, geom, NEW.country_code)
INTO NEW.parent_place_id, NEW.postcode, nameaddress_vector;
{% if debug %}RAISE WARNING 'RETURN insert_addresslines: %, %, %', NEW.parent_place_id, NEW.postcode, nameaddress_vector;{% endif %}
- IF NEW.address is not null AND NEW.address ? 'postcode'
- AND NEW.address->'postcode' not similar to '%(,|;)%' THEN
- NEW.postcode := upper(trim(NEW.address->'postcode'));
- END IF;
-
- IF NEW.postcode is null AND NEW.rank_search > 8 THEN
- NEW.postcode := get_nearest_postcode(NEW.country_code, NEW.geometry);
- END IF;
+ NEW.postcode := coalesce(token_normalized_postcode(NEW.address->'postcode'),
+ NEW.postcode);
-- if we have a name add this to the name search table
IF NEW.name IS NOT NULL THEN
+ -- Initialise the name vector using our name
+ NEW.name := add_default_place_name(NEW.country_code, NEW.name);
+ name_vector := token_get_name_search_tokens(NEW.token_info);
IF NEW.rank_search <= 25 and NEW.rank_address > 0 THEN
- result := add_location(NEW.place_id, NEW.country_code, NEW.partition, name_vector, NEW.rank_search, NEW.rank_address, upper(trim(NEW.address->'postcode')), NEW.geometry, NEW.centroid);
+ result := add_location(NEW.place_id, NEW.country_code, NEW.partition,
+ name_vector, NEW.rank_search, NEW.rank_address,
+ NEW.postcode, NEW.geometry, NEW.centroid);
{% if debug %}RAISE WARNING 'added to location (full)';{% endif %}
END IF;
{% if debug %}RAISE WARNING 'insert into road location table (full)';{% endif %}
END IF;
- result := insertSearchName(NEW.partition, NEW.place_id, name_vector,
- NEW.rank_search, NEW.rank_address, NEW.geometry);
+ IF NEW.rank_address between 16 and 27 THEN
+ result := insertSearchName(NEW.partition, NEW.place_id,
+ token_get_name_match_tokens(NEW.token_info),
+ NEW.rank_search, NEW.rank_address, NEW.geometry);
+ END IF;
{% if debug %}RAISE WARNING 'added to search name (full)';{% endif %}
{% if not db.reverse_only %}
NEW.importance, NEW.country_code, name_vector,
nameaddress_vector, NEW.centroid);
{% endif %}
+ END IF;
+ IF NEW.postcode is null AND NEW.rank_search > 8 THEN
+ NEW.postcode := get_nearest_postcode(NEW.country_code, NEW.geometry);
END IF;
{% if debug %}RAISE WARNING 'place update % % finished.', NEW.osm_type, NEW.osm_id;{% endif %}
+ NEW.token_info := token_strip_info(NEW.token_info);
RETURN NEW;
END;
$$
-- \param centroid Location of the address.
--
-- \return Place ID of the parent if one was found, NULL otherwise.
-CREATE OR REPLACE FUNCTION find_parent_for_address(street TEXT, place TEXT,
+CREATE OR REPLACE FUNCTION find_parent_for_address(street INTEGER[], place INTEGER[],
partition SMALLINT,
centroid GEOMETRY)
RETURNS BIGINT
AS $$
DECLARE
parent_place_id BIGINT;
- word_ids INTEGER[];
BEGIN
IF street is not null THEN
-- Check for addr:street attributes
-- Note that addr:street links can only be indexed, once the street itself is indexed
- word_ids := word_ids_from_name(street);
- IF word_ids is not null THEN
- parent_place_id := getNearestNamedRoadPlaceId(partition, centroid, word_ids);
- IF parent_place_id is not null THEN
- {% if debug %}RAISE WARNING 'Get parent form addr:street: %', parent_place_id;{% endif %}
- RETURN parent_place_id;
- END IF;
+ parent_place_id := getNearestNamedRoadPlaceId(partition, centroid, street);
+ IF parent_place_id is not null THEN
+ {% if debug %}RAISE WARNING 'Get parent from addr:street: %', parent_place_id;{% endif %}
+ RETURN parent_place_id;
END IF;
END IF;
-- Check for addr:place attributes.
IF place is not null THEN
- word_ids := word_ids_from_name(place);
- IF word_ids is not null THEN
- parent_place_id := getNearestNamedPlacePlaceId(partition, centroid, word_ids);
- IF parent_place_id is not null THEN
- {% if debug %}RAISE WARNING 'Get parent form addr:place: %', parent_place_id;{% endif %}
- RETURN parent_place_id;
- END IF;
+ parent_place_id := getNearestNamedPlacePlaceId(partition, centroid, place);
+ IF parent_place_id is not null THEN
+ {% if debug %}RAISE WARNING 'Get parent from addr:place: %', parent_place_id;{% endif %}
+ RETURN parent_place_id;
END IF;
END IF;
-- Indices used only during search and update.
-- These indices are created only after the indexing process is done.
-CREATE INDEX {{sql.if_index_not_exists}} idx_word_word_id
- ON word USING BTREE (word_id) {{db.tablespace.search_index}};
-
CREATE INDEX {{sql.if_index_not_exists}} idx_place_addressline_address_place_id
ON place_addressline USING BTREE (address_place_id) {{db.tablespace.search_index}};
);
GRANT SELECT ON TABLE nominatim_properties TO "{{config.DATABASE_WEBUSER}}";
-drop table IF EXISTS word;
-CREATE TABLE word (
- word_id INTEGER,
- word_token text,
- word text,
- class text,
- type text,
- country_code varchar(2),
- search_name_count INTEGER,
- operator TEXT
- ) {{db.tablespace.search_data}};
-CREATE INDEX idx_word_word_token on word USING BTREE (word_token) {{db.tablespace.search_index}};
-GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}" ;
-DROP SEQUENCE IF EXISTS seq_word;
-CREATE SEQUENCE seq_word start 1;
-
drop table IF EXISTS location_area CASCADE;
CREATE TABLE location_area (
place_id BIGINT,
linegeo GEOMETRY,
interpolationtype TEXT,
address HSTORE,
+ token_info JSONB, -- custom column for tokenizer use only
postcode TEXT,
country_code VARCHAR(2)
){{db.tablespace.search_data}};
indexed_status SMALLINT,
LIKE place INCLUDING CONSTRAINTS,
wikipedia TEXT, -- calculated wikipedia article name (language:title)
+ token_info JSONB, -- custom column for tokenizer use only
country_code varchar(2),
housenumber TEXT,
postcode TEXT,
CREATE INDEX idx_placex_linked_place_id ON placex USING BTREE (linked_place_id) {{db.tablespace.address_index}} WHERE linked_place_id IS NOT NULL;
CREATE INDEX idx_placex_rank_search ON placex USING BTREE (rank_search, geometry_sector) {{db.tablespace.address_index}};
CREATE INDEX idx_placex_geometry ON placex USING GIST (geometry) {{db.tablespace.search_index}};
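+-- Partial index over address-carrying building polygons; presumably used
+-- when a POI inherits its address from a surrounding building.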
+CREATE INDEX idx_placex_geometry_buildings ON placex
+ USING GIST (geometry) {{db.tablespace.search_index}}
+ WHERE address is not null and rank_search = 30
+ and ST_GeometryType(geometry) in ('ST_Polygon','ST_MultiPolygon');
CREATE INDEX idx_placex_geometry_placenode ON placex
USING GIST (geometry) {{db.tablespace.search_index}}
WHERE osm_type = 'N' and rank_search < 26
CREATE SEQUENCE seq_place start 1;
GRANT SELECT on placex to "{{config.DATABASE_WEBUSER}}" ;
GRANT SELECT on place_addressline to "{{config.DATABASE_WEBUSER}}" ;
-GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}" ;
GRANT SELECT ON planet_osm_ways to "{{config.DATABASE_WEBUSER}}" ;
GRANT SELECT ON planet_osm_rels to "{{config.DATABASE_WEBUSER}}" ;
GRANT SELECT on location_area to "{{config.DATABASE_WEBUSER}}" ;
--- /dev/null
+-- Get tokens used for searching the given place.
+--
+-- These are the tokens that will be saved in the search_name table.
+CREATE OR REPLACE FUNCTION token_get_name_search_tokens(info JSONB)
+ RETURNS INTEGER[]
+AS $$
+ SELECT (info->>'names')::INTEGER[]
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
+-- Get tokens for matching the place name against others.
+--
+-- This should usually be restricted to full name tokens.
+CREATE OR REPLACE FUNCTION token_get_name_match_tokens(info JSONB)
+ RETURNS INTEGER[]
+AS $$
+ SELECT (info->>'names')::INTEGER[]
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
+-- Return the housenumber tokens applicable for the place.
+CREATE OR REPLACE FUNCTION token_get_housenumber_search_tokens(info JSONB)
+ RETURNS INTEGER[]
+AS $$
+ SELECT (info->>'hnr_tokens')::INTEGER[]
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
+-- Return the housenumber in the form that it can be matched during search.
+CREATE OR REPLACE FUNCTION token_normalized_housenumber(info JSONB)
+ RETURNS TEXT
+AS $$
+ SELECT info->>'hnr';
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
+CREATE OR REPLACE FUNCTION token_addr_street_match_tokens(info JSONB)
+ RETURNS INTEGER[]
+AS $$
+ SELECT (info->>'street')::INTEGER[]
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
+CREATE OR REPLACE FUNCTION token_addr_place_match_tokens(info JSONB)
+ RETURNS INTEGER[]
+AS $$
+ SELECT (info->>'place_match')::INTEGER[]
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
+CREATE OR REPLACE FUNCTION token_addr_place_search_tokens(info JSONB)
+ RETURNS INTEGER[]
+AS $$
+ SELECT (info->>'place_search')::INTEGER[]
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
+DROP TYPE IF EXISTS token_addresstoken CASCADE;
+CREATE TYPE token_addresstoken AS (
+ key TEXT,
+ match_tokens INT[],
+ search_tokens INT[]
+);
+
+CREATE OR REPLACE FUNCTION token_get_address_tokens(info JSONB)
+ RETURNS SETOF token_addresstoken
+AS $$
+ SELECT key, (value->>1)::int[] as match_tokens,
+ (value->>0)::int[] as search_tokens
+ FROM jsonb_each(info->'addr');
+$$ LANGUAGE SQL IMMUTABLE STRICT;
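+
+-- Illustrative: for info->'addr' = {"city": ["{1,2}", "{3}"]}, the function
+-- yields one row ('city', '{3}', '{1,2}'): match tokens come from the
+-- second array element, search tokens from the first.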
+
+
+CREATE OR REPLACE FUNCTION token_normalized_postcode(postcode TEXT)
+ RETURNS TEXT
+AS $$
+ SELECT CASE WHEN postcode SIMILAR TO '%(,|;)%' THEN NULL ELSE upper(trim(postcode)) END;
+$$ LANGUAGE SQL IMMUTABLE STRICT;
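+
+-- Illustrative: token_normalized_postcode(' ab1 2cd ') returns 'AB1 2CD',
+-- while multi-value postcodes such as '12345;67890' yield NULL.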
+
+
+-- Return token info that should be saved permanently in the database.
+CREATE OR REPLACE FUNCTION token_strip_info(info JSONB)
+ RETURNS JSONB
+AS $$
+ SELECT NULL::JSONB;
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
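+-- Illustrative only: the analyzer hands in a JSONB object along the lines
+-- of
+--   {"names": "{1,2,3}", "hnr": "12;12a", "hnr_tokens": "{55,56}",
+--    "street": "{77}", "place_match": "{88}", "place_search": "{88,89}",
+--    "addr": {"city": ["{5,6}", "{7}"]}}
+-- so that, for example,
+--   SELECT token_get_name_search_tokens('{"names": "{1,2,3}"}'::jsonb);
+-- returns the integer array {1,2,3}.
+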
+--------------- private functions ----------------------------------------------
+
+-- Functions for term normalisation and access to the 'word' table.
+
+CREATE OR REPLACE FUNCTION transliteration(text) RETURNS text
+ AS '{{ modulepath }}/nominatim.so', 'transliteration'
+LANGUAGE c IMMUTABLE STRICT;
+
+
+CREATE OR REPLACE FUNCTION gettokenstring(text) RETURNS text
+ AS '{{ modulepath }}/nominatim.so', 'gettokenstring'
+LANGUAGE c IMMUTABLE STRICT;
+
+
+CREATE OR REPLACE FUNCTION make_standard_name(name TEXT) RETURNS TEXT
+ AS $$
+DECLARE
+ o TEXT;
+BEGIN
+ o := public.gettokenstring(public.transliteration(name));
+ RETURN trim(substr(o,1,length(o)));
+END;
+$$
+LANGUAGE plpgsql IMMUTABLE;
+
+-- returns NULL if the word is too common
+CREATE OR REPLACE FUNCTION getorcreate_word_id(lookup_word TEXT)
+ RETURNS INTEGER
+ AS $$
+DECLARE
+ lookup_token TEXT;
+ return_word_id INTEGER;
+ count INTEGER;
+BEGIN
+ lookup_token := trim(lookup_word);
+ SELECT min(word_id), max(search_name_count) FROM word
+ WHERE word_token = lookup_token and class is null and type is null
+ INTO return_word_id, count;
+ IF return_word_id IS NULL THEN
+ return_word_id := nextval('seq_word');
+ INSERT INTO word VALUES (return_word_id, lookup_token, null, null, null, null, 0);
+ ELSE
+ IF count > {{ max_word_freq }} THEN
+ return_word_id := NULL;
+ END IF;
+ END IF;
+ RETURN return_word_id;
+END;
+$$
+LANGUAGE plpgsql;
+
+
+-- Create housenumber tokens from an OSM addr:housenumber.
+-- The housenumber is split at commas and semicolons as necessary.
+-- The function returns the normalized form of the housenumber suitable
+-- for comparison.
+CREATE OR REPLACE FUNCTION create_housenumbers(housenumbers TEXT[],
+ OUT tokens TEXT,
+ OUT normtext TEXT)
+ AS $$
+BEGIN
+ SELECT array_to_string(array_agg(trans), ';'), array_agg(tid)::TEXT
+ INTO normtext, tokens
+ FROM (SELECT lookup_word as trans, getorcreate_housenumber_id(lookup_word) as tid
+ FROM (SELECT make_standard_name(h) as lookup_word
+ FROM unnest(housenumbers) h) x) y;
+END;
+$$ LANGUAGE plpgsql STABLE STRICT;
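+
+-- Illustrative: create_housenumbers(ARRAY['12', '12a']) might return
+-- tokens = '{55,56}' and normtext = '12;12a'; actual token ids depend on
+-- the state of the word table.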
+
+
+CREATE OR REPLACE FUNCTION getorcreate_housenumber_id(lookup_word TEXT)
+ RETURNS INTEGER
+ AS $$
+DECLARE
+ lookup_token TEXT;
+ return_word_id INTEGER;
+BEGIN
+ lookup_token := ' ' || trim(lookup_word);
+ SELECT min(word_id) FROM word
+ WHERE word_token = lookup_token and class='place' and type='house'
+ INTO return_word_id;
+ IF return_word_id IS NULL THEN
+ return_word_id := nextval('seq_word');
+ INSERT INTO word VALUES (return_word_id, lookup_token, null,
+ 'place', 'house', null, 0);
+ END IF;
+ RETURN return_word_id;
+END;
+$$
+LANGUAGE plpgsql;
+
+
+CREATE OR REPLACE FUNCTION create_postcode_id(postcode TEXT)
+ RETURNS BOOLEAN
+ AS $$
+DECLARE
+ r RECORD;
+ lookup_token TEXT;
+ return_word_id INTEGER;
+BEGIN
+ lookup_token := ' ' || make_standard_name(postcode);
+ FOR r IN
+ SELECT word_id FROM word
+ WHERE word_token = lookup_token and word = postcode
+ and class='place' and type='postcode'
+ LOOP
+ RETURN false;
+ END LOOP;
+
+ INSERT INTO word VALUES (nextval('seq_word'), lookup_token, postcode,
+ 'place', 'postcode', null, 0);
+ RETURN true;
+END;
+$$
+LANGUAGE plpgsql;
+
+
+CREATE OR REPLACE FUNCTION getorcreate_name_id(lookup_word TEXT, src_word TEXT)
+ RETURNS INTEGER
+ AS $$
+DECLARE
+ lookup_token TEXT;
+ nospace_lookup_token TEXT;
+ return_word_id INTEGER;
+BEGIN
+ lookup_token := ' '||trim(lookup_word);
+ SELECT min(word_id) FROM word
+ WHERE word_token = lookup_token and class is null and type is null
+ INTO return_word_id;
+ IF return_word_id IS NULL THEN
+ return_word_id := nextval('seq_word');
+ INSERT INTO word VALUES (return_word_id, lookup_token, src_word,
+ null, null, null, 0);
+ END IF;
+ RETURN return_word_id;
+END;
+$$
+LANGUAGE plpgsql;
+
+
+-- Normalize a string and lookup its word ids (partial words).
+CREATE OR REPLACE FUNCTION addr_ids_from_name(lookup_word TEXT)
+ RETURNS INTEGER[]
+ AS $$
+DECLARE
+ words TEXT[];
+ id INTEGER;
+ return_word_id INTEGER[];
+ word_ids INTEGER[];
+ j INTEGER;
+BEGIN
+ words := string_to_array(make_standard_name(lookup_word), ' ');
+ IF array_upper(words, 1) IS NOT NULL THEN
+ FOR j IN 1..array_upper(words, 1) LOOP
+ IF (words[j] != '') THEN
+ SELECT array_agg(word_id) INTO word_ids
+ FROM word
+ WHERE word_token = words[j] and class is null and type is null;
+
+ IF word_ids IS NULL THEN
+ id := nextval('seq_word');
+ INSERT INTO word VALUES (id, words[j], null, null, null, null, 0);
+ return_word_id := return_word_id || id;
+ ELSE
+ return_word_id := array_merge(return_word_id, word_ids);
+ END IF;
+ END IF;
+ END LOOP;
+ END IF;
+
+ RETURN return_word_id;
+END;
+$$
+LANGUAGE plpgsql;
+
+
+-- Normalize a string and look up its name ids (full words).
+CREATE OR REPLACE FUNCTION word_ids_from_name(lookup_word TEXT)
+ RETURNS INTEGER[]
+ AS $$
+DECLARE
+ lookup_token TEXT;
+ return_word_ids INTEGER[];
+BEGIN
+ lookup_token := ' '|| make_standard_name(lookup_word);
+ SELECT array_agg(word_id) FROM word
+ WHERE word_token = lookup_token and class is null and type is null
+ INTO return_word_ids;
+ RETURN return_word_ids;
+END;
+$$
+LANGUAGE plpgsql STABLE STRICT;
+
+
+CREATE OR REPLACE FUNCTION make_keywords(src HSTORE)
+ RETURNS INTEGER[]
+ AS $$
+DECLARE
+ result INTEGER[];
+ s TEXT;
+ w INTEGER;
+ words TEXT[];
+ item RECORD;
+ j INTEGER;
+BEGIN
+ result := '{}'::INTEGER[];
+
+ FOR item IN SELECT (each(src)).* LOOP
+
+ s := make_standard_name(item.value);
+ w := getorcreate_name_id(s, item.value);
+
+ IF not(ARRAY[w] <@ result) THEN
+ result := result || w;
+ END IF;
+
+ w := getorcreate_word_id(s);
+
+ IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
+ result := result || w;
+ END IF;
+
+ words := string_to_array(s, ' ');
+ IF array_upper(words, 1) IS NOT NULL THEN
+ FOR j IN 1..array_upper(words, 1) LOOP
+ IF (words[j] != '') THEN
+ w = getorcreate_word_id(words[j]);
+ IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
+ result := result || w;
+ END IF;
+ END IF;
+ END LOOP;
+ END IF;
+
+ words := regexp_split_to_array(item.value, E'[,;()]');
+ IF array_upper(words, 1) != 1 THEN
+ FOR j IN 1..array_upper(words, 1) LOOP
+ s := make_standard_name(words[j]);
+ IF s != '' THEN
+ w := getorcreate_word_id(s);
+ IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
+ result := result || w;
+ END IF;
+ END IF;
+ END LOOP;
+ END IF;
+
+ s := regexp_replace(item.value, '市$', '');
+ IF s != item.value THEN
+ s := make_standard_name(s);
+ IF s != '' THEN
+ w := getorcreate_name_id(s, item.value);
+ IF NOT (ARRAY[w] <@ result) THEN
+ result := result || w;
+ END IF;
+ END IF;
+ END IF;
+
+ END LOOP;
+
+ RETURN result;
+END;
+$$
+LANGUAGE plpgsql;
+
+
+CREATE OR REPLACE FUNCTION precompute_words(src TEXT)
+ RETURNS INTEGER
+ AS $$
+DECLARE
+ s TEXT;
+ w INTEGER;
+ words TEXT[];
+ i INTEGER;
+ j INTEGER;
+BEGIN
+ s := make_standard_name(src);
+ w := getorcreate_name_id(s, src);
+
+ w := getorcreate_word_id(s);
+
+ words := string_to_array(s, ' ');
+ IF array_upper(words, 1) IS NOT NULL THEN
+ FOR j IN 1..array_upper(words, 1) LOOP
+ IF (words[j] != '') THEN
+ w := getorcreate_word_id(words[j]);
+ END IF;
+ END LOOP;
+ END IF;
+
+ words := regexp_split_to_array(src, E'[,;()]');
+ IF array_upper(words, 1) != 1 THEN
+ FOR j IN 1..array_upper(words, 1) LOOP
+ s := make_standard_name(words[j]);
+ IF s != '' THEN
+ w := getorcreate_word_id(s);
+ END IF;
+ END LOOP;
+ END IF;
+
+ s := regexp_replace(src, '市$', '');
+ IF s != src THEN
+ s := make_standard_name(s);
+ IF s != '' THEN
+ w := getorcreate_name_id(s, src);
+ END IF;
+ END IF;
+
+ RETURN 1;
+END;
+$$
+LANGUAGE plpgsql;
--- /dev/null
+CREATE INDEX {{sql.if_index_not_exists}} idx_word_word_id
+ ON word USING BTREE (word_id) {{db.tablespace.search_index}};
--- /dev/null
+DROP TABLE IF EXISTS word;
+CREATE TABLE word (
+ word_id INTEGER,
+ word_token text NOT NULL,
+ word text,
+ class text,
+ type text,
+ country_code varchar(2),
+ search_name_count INTEGER,
+ operator TEXT
+) {{db.tablespace.search_data}};
+
+CREATE INDEX idx_word_word_token ON word
+ USING BTREE (word_token) {{db.tablespace.search_index}};
+GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
+
+DROP SEQUENCE IF EXISTS seq_word;
+CREATE SEQUENCE seq_word start 1;
+GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
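+
+-- Illustrative rows (not part of the patch): full names get a word_token
+-- with a leading blank, partial words one without, e.g.
+--   (1, ' berlin', 'Berlin', NULL, NULL, NULL, 0, NULL)
+--   (2, 'berlin', NULL, NULL, NULL, NULL, 0, NULL)
+--   (3, ' 10117', '10117', 'place', 'postcode', NULL, 0, NULL)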
@staticmethod
def run(args):
from ..indexer.indexer import Indexer
+ from ..tokenizer import factory as tokenizer_factory
- indexer = Indexer(args.config.get_libpq_dsn(),
+ tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
+
+ indexer = Indexer(args.config.get_libpq_dsn(), tokenizer,
args.threads or psutil.cpu_count() or 1)
if not args.no_boundaries:
@staticmethod
def run(args):
from ..tools import refresh
+ from ..tokenizer import factory as tokenizer_factory
if args.postcodes:
LOG.warning("Update postcodes centroid")
with connect(args.config.get_libpq_dsn()) as conn:
refresh.create_functions(conn, args.config,
args.diffs, args.enable_debug_statements)
+ tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
+ tokenizer.update_sql_functions(args.config)
if args.wiki_data:
data_path = Path(args.config.WIKIPEDIA_DATA_PATH
def _update(args):
from ..tools import replication
from ..indexer.indexer import Indexer
+ from ..tokenizer import factory as tokenizer_factory
params = args.osm2pgsql_options(default_cache=2000, default_threads=1)
params.update(base_url=args.config.REPLICATION_URL,
raise UsageError("Bad argument '--no-index'.")
recheck_interval = args.config.get_int('REPLICATION_RECHECK_INTERVAL')
+ tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
+
while True:
with connect(args.config.get_libpq_dsn()) as conn:
start = dt.datetime.now(dt.timezone.utc)
if state is not replication.UpdateState.NO_CHANGES and args.do_index:
index_start = dt.datetime.now(dt.timezone.utc)
- indexer = Indexer(args.config.get_libpq_dsn(),
+ indexer = Indexer(args.config.get_libpq_dsn(), tokenizer,
args.threads or 1)
indexer.index_boundaries(0, 30)
indexer.index_by_rank(0, 30)
from ..tools import refresh
from ..indexer.indexer import Indexer
from ..tools import postcodes
+ from ..tokenizer import factory as tokenizer_factory
if args.osm_file and not Path(args.osm_file).is_file():
LOG.fatal("OSM file '%s' does not exist.", args.osm_file)
args.no_partitions,
rouser=args.config.DATABASE_WEBUSER)
- LOG.warning('Installing database module')
- with connect(args.config.get_libpq_dsn()) as conn:
- database_import.install_module(args.module_dir, args.project_dir,
- args.config.DATABASE_MODULE_PATH,
- conn=conn)
-
LOG.warning('Importing OSM data file')
database_import.import_osm_data(Path(args.osm_file),
args.osm2pgsql_options(0, 1),
if args.continue_at is None or args.continue_at == 'load-data':
LOG.warning('Initialise tables')
with connect(args.config.get_libpq_dsn()) as conn:
- database_import.truncate_data_tables(conn, args.config.MAX_WORD_FREQUENCY)
+ database_import.truncate_data_tables(conn)
LOG.warning('Load data into placex table')
database_import.load_data(args.config.get_libpq_dsn(),
- args.data_dir,
args.threads or psutil.cpu_count() or 1)
+ LOG.warning("Setting up tokenizer")
+ if args.continue_at is None or args.continue_at == 'load-data':
+ # (re)initialise the tokenizer data
+ tokenizer = tokenizer_factory.create_tokenizer(args.config)
+ else:
+ # just load the tokenizer
+ tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
+
+ if args.continue_at is None or args.continue_at == 'load-data':
LOG.warning('Calculate postcodes')
- postcodes.import_postcodes(args.config.get_libpq_dsn(), args.project_dir)
+ postcodes.import_postcodes(args.config.get_libpq_dsn(), args.project_dir,
+ tokenizer)
if args.continue_at is None or args.continue_at in ('load-data', 'indexing'):
if args.continue_at is not None and args.continue_at != 'load-data':
with connect(args.config.get_libpq_dsn()) as conn:
SetupAll._create_pending_index(conn, args.config.TABLESPACE_ADDRESS_INDEX)
LOG.warning('Indexing places')
- indexer = Indexer(args.config.get_libpq_dsn(),
+ indexer = Indexer(args.config.get_libpq_dsn(), tokenizer,
args.threads or psutil.cpu_count() or 1)
indexer.index_full(analyse=not args.index_noanalyse)
database_import.create_search_indices(conn, args.config,
drop=args.no_updates)
LOG.warning('Create search index for default country names.')
- database_import.create_country_names(conn, args.config)
+ database_import.create_country_names(conn, tokenizer,
+ args.config.LANGUAGES)
+ tokenizer.finalize_import(args.config)
webdir = args.project_dir / 'website'
LOG.warning('Setup website at %s', webdir)
# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
+# Using non-top-level imports to avoid loading modules that may never be used.
+# pylint: disable=E0012,C0415
class ImportSpecialPhrases:
"""\
@staticmethod
def run(args):
+ from ..tokenizer import factory as tokenizer_factory
+
if args.import_from_wiki:
LOG.warning('Special phrases importation starting')
+ tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
with connect(args.config.get_libpq_dsn()) as db_connection:
SpecialPhrasesImporter(
args.config, args.phplib_dir, db_connection
- ).import_from_wiki()
+ ).import_from_wiki(tokenizer)
return 0
""" A single non-blocking database connection.
"""
- def __init__(self, dsn):
+ def __init__(self, dsn, cursor_factory=None):
self.current_query = None
self.current_params = None
self.dsn = dsn
self.conn = None
self.cursor = None
- self.connect()
+ self.connect(cursor_factory=cursor_factory)
def close(self):
""" Close all open connections. Does not wait for pending requests.
self.conn = None
- def connect(self):
+ def connect(self, cursor_factory=None):
""" (Re)connect to the database. Creates an asynchronous connection
with JIT and parallel processing disabled. If a connection was
already open, it is closed and a new connection established.
self.conn = psycopg2.connect(**{'dsn' : self.dsn, 'async' : True})
self.wait()
- self.cursor = self.conn.cursor()
+ self.cursor = self.conn.cursor(cursor_factory=cursor_factory)
# Disable JIT and parallel workers as they are known to cause problems.
# Update pg_settings instead of using SET because it does not yield
# errors on older versions of Postgres where the settings are not
self.env.globals['db'] = db_info
self.env.globals['sql'] = _setup_postgres_sql(conn)
self.env.globals['postgres'] = _setup_postgresql_features(conn)
- self.env.globals['modulepath'] = config.DATABASE_MODULE_PATH or \
- str((config.project_dir / 'module').resolve())
def run_sql_file(self, conn, name, **kwargs):
"""
import logging
import select
+import time
+
+import psycopg2.extras
from nominatim.indexer.progress import ProgressLogger
from nominatim.indexer import runners
LOG = logging.getLogger()
+
+class PlaceFetcher:
+ """ Asynchronous connection that fetches place details for processing.
+ """
+ def __init__(self, dsn, setup_conn):
+ self.wait_time = 0
+ self.current_ids = None
+ self.conn = DBConnection(dsn, cursor_factory=psycopg2.extras.DictCursor)
+
+ with setup_conn.cursor() as cur:
+ # Fetch the hstore OIDs upfront because register_hstore() cannot
+ # look them up on the asynchronous connection below.
+ hstore_oid = cur.scalar("SELECT 'hstore'::regtype::oid")
+ hstore_array_oid = cur.scalar("SELECT 'hstore[]'::regtype::oid")
+
+ psycopg2.extras.register_hstore(self.conn.conn, oid=hstore_oid,
+ array_oid=hstore_array_oid)
+
+ def close(self):
+ """ Close the underlying asynchronous connection.
+ """
+ if self.conn:
+ self.conn.close()
+ self.conn = None
+
+
+ def fetch_next_batch(self, cur, runner):
+ """ Send a request for the next batch of places.
+ If details for the places are required, they will be fetched
+ asynchronously.
+
+ Returns true if there is still data available.
+ """
+ ids = cur.fetchmany(100)
+
+ if not ids:
+ self.current_ids = None
+ return False
+
+ if hasattr(runner, 'get_place_details'):
+ runner.get_place_details(self.conn, ids)
+ self.current_ids = []
+ else:
+ self.current_ids = ids
+
+ return True
+
+ def get_batch(self):
+ """ Get the next batch of data, previously requested with
+ `fetch_next_batch`.
+ """
+ if self.current_ids is not None and not self.current_ids:
+ tstart = time.time()
+ self.conn.wait()
+ self.wait_time += time.time() - tstart
+ self.current_ids = self.conn.cursor.fetchall()
+
+ return self.current_ids
+
+ def __enter__(self):
+ return self
+
+
+ def __exit__(self, exc_type, exc_value, traceback):
+ self.conn.wait()
+ self.close()
+
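+# PlaceFetcher and WorkerPool work in tandem: while the pool indexes the
+# current batch, the fetcher's asynchronous connection already retrieves
+# the next one (see Indexer._index below).
+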
class WorkerPool:
""" A pool of asynchronous database connections.
def __init__(self, dsn, pool_size):
self.threads = [DBConnection(dsn) for _ in range(pool_size)]
self.free_workers = self._yield_free_worker()
+ self.wait_time = 0
def finish_all(self):
ready = self.threads
command_stat = 0
else:
+ tstart = time.time()
_, ready, _ = select.select([], self.threads, [])
+ self.wait_time += time.time() - tstart
def __enter__(self):
def __exit__(self, exc_type, exc_value, traceback):
+ self.finish_all()
self.close()
""" Main indexing routine.
"""
- def __init__(self, dsn, num_threads):
+ def __init__(self, dsn, tokenizer, num_threads):
self.dsn = dsn
+ self.tokenizer = tokenizer
self.num_threads = num_threads
LOG.warning("Starting indexing boundaries using %s threads",
self.num_threads)
- for rank in range(max(minrank, 4), min(maxrank, 26)):
- self._index(runners.BoundaryRunner(rank))
+ with self.tokenizer.name_analyzer() as analyzer:
+ for rank in range(max(minrank, 4), min(maxrank, 26)):
+ self._index(runners.BoundaryRunner(rank, analyzer))
def index_by_rank(self, minrank, maxrank):
""" Index all entries of placex in the given rank range (inclusive)
LOG.warning("Starting indexing rank (%i to %i) using %i threads",
minrank, maxrank, self.num_threads)
- for rank in range(max(1, minrank), maxrank):
- self._index(runners.RankRunner(rank))
+ with self.tokenizer.name_analyzer() as analyzer:
+ for rank in range(max(1, minrank), maxrank):
+ self._index(runners.RankRunner(rank, analyzer))
- if maxrank == 30:
- self._index(runners.RankRunner(0))
- self._index(runners.InterpolationRunner(), 20)
- self._index(runners.RankRunner(30), 20)
- else:
- self._index(runners.RankRunner(maxrank))
+ if maxrank == 30:
+ self._index(runners.RankRunner(0, analyzer))
+ self._index(runners.InterpolationRunner(analyzer), 20)
+ self._index(runners.RankRunner(30, analyzer), 20)
+ else:
+ self._index(runners.RankRunner(maxrank, analyzer))
def index_postcodes(self):
LOG.warning("Starting %s (using batch size %s)", runner.name(), batch)
with connect(self.dsn) as conn:
+ psycopg2.extras.register_hstore(conn)
with conn.cursor() as cur:
total_tuples = cur.scalar(runner.sql_count_objects())
LOG.debug("Total number of rows: %i", total_tuples)
with conn.cursor(name='places') as cur:
cur.execute(runner.sql_get_objects())
- with WorkerPool(self.dsn, self.num_threads) as pool:
- while True:
- places = [p[0] for p in cur.fetchmany(batch)]
- if not places:
- break
+ with PlaceFetcher(self.dsn, conn) as fetcher:
+ with WorkerPool(self.dsn, self.num_threads) as pool:
+ has_more = fetcher.fetch_next_batch(cur, runner)
+ while has_more:
+ places = fetcher.get_batch()
- LOG.debug("Processing places: %s", str(places))
- worker = pool.next_free_worker()
+ # asynchronously get the next batch
+ has_more = fetcher.fetch_next_batch(cur, runner)
- worker.perform(runner.sql_index_place(places))
- progress.add(len(places))
+ # and insert the current batch
+ for idx in range(0, len(places), batch):
+ part = places[idx:idx+batch]
+ LOG.debug("Processing places: %s", str(part))
+ runner.index_places(pool.next_free_worker(), part)
+ progress.add(len(part))
- pool.finish_all()
+ LOG.info("Wait time: fetcher: %.2fs, pool: %.2fs",
+ fetcher.wait_time, pool.wait_time)
conn.commit()
Mix-ins that provide the actual commands for the indexer for various indexing
tasks.
"""
+import functools
+
+import psycopg2.extras
+
# pylint: disable=C0111
-class RankRunner:
- """ Returns SQL commands for indexing one rank within the placex table.
+class AbstractPlacexRunner:
+ """ Returns SQL commands for indexing of the placex table.
"""
+ SELECT_SQL = 'SELECT place_id FROM placex'
- def __init__(self, rank):
+ def __init__(self, rank, analyzer):
self.rank = rank
+ self.analyzer = analyzer
+
+
+ @staticmethod
+ @functools.lru_cache(maxsize=1)
+ def _index_sql(num_places):
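+ # The statement depends only on the number of places, so a one-entry
+ # cache suffices: all batches except possibly the last share a size.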
+ return """ UPDATE placex
+ SET indexed_status = 0, address = v.addr, token_info = v.ti
+ FROM (VALUES {}) as v(id, addr, ti)
+ WHERE place_id = v.id
+ """.format(','.join(["(%s, %s::hstore, %s::jsonb)"] * num_places))
+
+
+ @staticmethod
+ def get_place_details(worker, ids):
+ worker.perform("""SELECT place_id, (placex_prepare_update(placex)).*
+ FROM placex WHERE place_id IN %s""",
+ (tuple((p[0] for p in ids)), ))
+
+
+ def index_places(self, worker, places):
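+ # Flatten the parameters as [id, addr, token_info, id, addr, ...] to
+ # match the (%s, %s::hstore, %s::jsonb) tuples in _index_sql().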
+ values = []
+ for place in places:
+ values.extend((place[x] for x in ('place_id', 'address')))
+ values.append(psycopg2.extras.Json(self.analyzer.process_place(place)))
+
+ worker.perform(self._index_sql(len(places)), values)
+
+
+class RankRunner(AbstractPlacexRunner):
+ """ Returns SQL commands for indexing one rank within the placex table.
+ """
def name(self):
return "rank {}".format(self.rank)
""".format(self.rank)
def sql_get_objects(self):
- return """SELECT place_id FROM placex
- WHERE indexed_status > 0 and rank_address = {}
- ORDER BY geometry_sector""".format(self.rank)
+ return """{} WHERE indexed_status > 0 and rank_address = {}
+ ORDER BY geometry_sector
+ """.format(self.SELECT_SQL, self.rank)
- @staticmethod
- def sql_index_place(ids):
- return "UPDATE placex SET indexed_status = 0 WHERE place_id IN ({})"\
- .format(','.join((str(i) for i in ids)))
-
-class BoundaryRunner:
+class BoundaryRunner(AbstractPlacexRunner):
""" Returns SQL commands for indexing the administrative boundaries
of a certain rank.
"""
- def __init__(self, rank):
- self.rank = rank
-
def name(self):
return "boundaries rank {}".format(self.rank)
""".format(self.rank)
def sql_get_objects(self):
- return """SELECT place_id FROM placex
- WHERE indexed_status > 0 and rank_search = {}
- and class = 'boundary' and type = 'administrative'
- ORDER BY partition, admin_level
- """.format(self.rank)
-
- @staticmethod
- def sql_index_place(ids):
- return "UPDATE placex SET indexed_status = 0 WHERE place_id IN ({})"\
- .format(','.join((str(i) for i in ids)))
+ return """{} WHERE indexed_status > 0 and rank_search = {}
+ and class = 'boundary' and type = 'administrative'
+ ORDER BY partition, admin_level
+ """.format(self.SELECT_SQL, self.rank)
class InterpolationRunner:
location_property_osmline.
"""
+ def __init__(self, analyzer):
+ self.analyzer = analyzer
+
+
@staticmethod
def name():
return "interpolation lines (location_property_osmline)"
@staticmethod
def sql_get_objects():
- return """SELECT place_id FROM location_property_osmline
+ return """SELECT place_id
+ FROM location_property_osmline
WHERE indexed_status > 0
ORDER BY geometry_sector"""
+
+ @staticmethod
+ def get_place_details(worker, ids):
+ worker.perform("""SELECT place_id, get_interpolation_address(address, osm_id) as address
+ FROM location_property_osmline WHERE place_id IN %s""",
+ (tuple((p[0] for p in ids)), ))
+
+
@staticmethod
- def sql_index_place(ids):
- return """UPDATE location_property_osmline
- SET indexed_status = 0 WHERE place_id IN ({})
- """.format(','.join((str(i) for i in ids)))
+ @functools.lru_cache(maxsize=1)
+ def _index_sql(num_places):
+ return """ UPDATE location_property_osmline
+ SET indexed_status = 0, address = v.addr, token_info = v.ti
+ FROM (VALUES {}) as v(id, addr, ti)
+ WHERE place_id = v.id
+ """.format(','.join(["(%s, %s::hstore, %s::jsonb)"] * num_places))
+
+
+ def index_places(self, worker, places):
+ values = []
+ for place in places:
+ values.extend((place[x] for x in ('place_id', 'address')))
+ values.append(psycopg2.extras.Json(self.analyzer.process_place(place)))
+
+ worker.perform(self._index_sql(len(places)), values)
+
class PostcodeRunner:
ORDER BY country_code, postcode"""
@staticmethod
- def sql_index_place(ids):
- return """UPDATE location_postcode SET indexed_status = 0
- WHERE place_id IN ({})
- """.format(','.join((str(i) for i in ids)))
+ def index_places(worker, ids):
+ worker.perform(""" UPDATE location_postcode SET indexed_status = 0
+ WHERE place_id IN ({})
+ """.format(','.join((str(i[0]) for i in ids))))
--- /dev/null
+"""
+Functions for creating a tokenizer or initialising the right one for an
+existing database.
+
+A tokenizer is something that is bound to the lifetime of a database. It
+can be chosen and configured before the initial import but then needs to
+be used consistently when querying and updating the database.
+
+This module provides the functions to create and configure a new tokenizer
+as well as instantiating the appropriate tokenizer for updating an existing
+database.
+
+A tokenizer usually also includes PHP code for querying. The appropriate PHP
+normalizer module is installed when the tokenizer is created.
+"""
+import logging
+import importlib
+
+from ..errors import UsageError
+from ..db import properties
+from ..db.connection import connect
+
+LOG = logging.getLogger()
+
+def _import_tokenizer(name):
+ """ Load the tokenizer.py module from project directory.
+ """
+ try:
+ return importlib.import_module('nominatim.tokenizer.' + name + '_tokenizer')
+ except ModuleNotFoundError as exp:
+ LOG.fatal("No tokenizer named '%s' available. "
+ "Check the setting of NOMINATIM_TOKENIZER.", name)
+ raise UsageError('Tokenizer not found') from exp
+
+
+def create_tokenizer(config, init_db=True, module_name=None):
+ """ Create a new tokenizer as defined by the given configuration.
+
+ The tokenizer data and code are copied into the 'tokenizer' directory
+ of the project directory and the tokenizer loaded from its new location.
+ """
+ if module_name is None:
+ module_name = config.TOKENIZER
+
+ # Create the directory for the tokenizer data
+ basedir = config.project_dir / 'tokenizer'
+ if not basedir.exists():
+ basedir.mkdir()
+ elif not basedir.is_dir():
+ LOG.fatal("Tokenizer directory '%s' cannot be created.", basedir)
+ raise UsageError("Tokenizer setup failed.")
+
+ # Import and initialize the tokenizer.
+ tokenizer_module = _import_tokenizer(module_name)
+
+ tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
+ tokenizer.init_new_db(config, init_db=init_db)
+
+ with connect(config.get_libpq_dsn()) as conn:
+ properties.set_property(conn, 'tokenizer', module_name)
+
+ return tokenizer
+
+
+def get_tokenizer_for_db(config):
+ """ Instantiate a tokenizer for an existing database.
+
+ The function looks up the appropriate tokenizer in the database
+ and initialises it.
+ """
+ basedir = config.project_dir / 'tokenizer'
+ if not basedir.is_dir():
+ LOG.fatal("Cannot find tokenizer data in '%s'.", basedir)
+ raise UsageError('Cannot initialize tokenizer.')
+
+ with connect(config.get_libpq_dsn()) as conn:
+ name = properties.get_property(conn, 'tokenizer')
+
+ if name is None:
+ LOG.fatal("Tokenizer was not set up properly. Database property missing.")
+ raise UsageError('Cannot initialize tokenizer.')
+
+ tokenizer_module = _import_tokenizer(name)
+
+ tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
+ tokenizer.init_from_project()
+
+ return tokenizer
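+
+
+# A usage sketch (illustrative, not part of the patch): commands obtain the
+# tokenizer once and pass it to the components that need it, e.g.
+#
+#   tokenizer = get_tokenizer_for_db(config)
+#   indexer = Indexer(config.get_libpq_dsn(), tokenizer, num_threads)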
--- /dev/null
+"""
+Tokenizer implementing normalisation as used before Nominatim 4.
+"""
+from collections import OrderedDict
+import logging
+import re
+import shutil
+from textwrap import dedent
+
+from icu import Transliterator
+import psycopg2
+import psycopg2.extras
+
+from nominatim.db.connection import connect
+from nominatim.db import properties
+from nominatim.db import utils as db_utils
+from nominatim.db.sql_preprocessor import SQLPreprocessor
+from nominatim.errors import UsageError
+
+DBCFG_NORMALIZATION = "tokenizer_normalization"
+DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
+
+LOG = logging.getLogger()
+
+def create(dsn, data_dir):
+ """ Create a new instance of the tokenizer provided by this module.
+ """
+ return LegacyTokenizer(dsn, data_dir)
+
+
+def _install_module(config_module_path, src_dir, module_dir):
+ """ Copies the PostgreSQL normalisation module into the project
+ directory if necessary. For historical reasons the module is
+ saved in the '/module' subdirectory and not with the other tokenizer
+ data.
+
+ The function detects when the installation is run from the
+ build directory. It doesn't touch the module in that case.
+ """
+ # Custom module locations are simply used as is.
+ if config_module_path:
+ LOG.info("Using custom path for database module at '%s'", config_module_path)
+ return config_module_path
+
+ # Compatibility mode for builddir installations.
+ if module_dir.exists() and src_dir.samefile(module_dir):
+ LOG.info('Running from build directory. Leaving database module as is.')
+ return module_dir
+
+ # In any other case install the module in the project directory.
+ if not module_dir.exists():
+ module_dir.mkdir()
+
+ destfile = module_dir / 'nominatim.so'
+ shutil.copy(str(src_dir / 'nominatim.so'), str(destfile))
+ destfile.chmod(0o755)
+
+ LOG.info('Database module installed at %s', str(destfile))
+
+ return module_dir
+
+
+def _check_module(module_dir, conn):
+ """ Try to use the PostgreSQL module to confirm that it is correctly
+ installed and accessible from PostgreSQL.
+ """
+ with conn.cursor() as cur:
+ try:
+ cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
+ RETURNS text AS '{}/nominatim.so', 'transliteration'
+ LANGUAGE c IMMUTABLE STRICT;
+ DROP FUNCTION nominatim_test_import_func(text)
+ """.format(module_dir))
+ except psycopg2.DatabaseError as err:
+ LOG.fatal("Error accessing database module: %s", err)
+ raise UsageError("Database module cannot be accessed.") from err
+
+
+class LegacyTokenizer:
+ """ The legacy tokenizer uses a special PostgreSQL module to normalize
+ names and queries. The tokenizer thus implements normalization through
+ calls to the database.
+ """
+
+ def __init__(self, dsn, data_dir):
+ self.dsn = dsn
+ self.data_dir = data_dir
+ self.normalization = None
+
+
+ def init_new_db(self, config, init_db=True):
+ """ Set up a new tokenizer for the database.
+
+ This copies all necessary data in the project directory to make
+ sure the tokenizer remains stable even over updates.
+ """
+ module_dir = _install_module(config.DATABASE_MODULE_PATH,
+ config.lib_dir.module,
+ config.project_dir / 'module')
+
+ self.normalization = config.TERM_NORMALIZATION
+
+ self._install_php(config)
+
+ with connect(self.dsn) as conn:
+ _check_module(module_dir, conn)
+ self._save_config(conn, config)
+ conn.commit()
+
+ if init_db:
+ self.update_sql_functions(config)
+ self._init_db_tables(config)
+
+
+ def init_from_project(self):
+ """ Initialise the tokenizer from the project directory.
+ """
+ with connect(self.dsn) as conn:
+ self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)
+
+
+ def finalize_import(self, config):
+ """ Do any required postprocessing to make the tokenizer data ready
+ for use.
+ """
+ with connect(self.dsn) as conn:
+ sqlp = SQLPreprocessor(conn, config)
+ sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
+
+
+ def update_sql_functions(self, config):
+ """ Reimport the SQL functions for this tokenizer.
+ """
+ with connect(self.dsn) as conn:
+ max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
+ modulepath = config.DATABASE_MODULE_PATH or \
+ str((config.project_dir / 'module').resolve())
+ sqlp = SQLPreprocessor(conn, config)
+ sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer.sql',
+ max_word_freq=max_word_freq,
+ modulepath=modulepath)
+
+
+ def check_database(self):
+ """ Check that the tokenizer is set up correctly.
+ """
+ hint = """\
+ The Postgresql extension nominatim.so was not correctly loaded.
+
+ Error: {error}
+
+ Hints:
+ * Check the output of the CMake/make installation step
+ * Does nominatim.so exist?
+ * Does nominatim.so exist on the database server?
+ * Can nominatim.so be accessed by the database user?
+ """
+ with connect(self.dsn) as conn:
+ with conn.cursor() as cur:
+ try:
+ out = cur.scalar("SELECT make_standard_name('a')")
+ except psycopg2.Error as err:
+ return hint.format(error=str(err))
+
+ if out != 'a':
+ return hint.format(error='Unexpected result for make_standard_name()')
+
+ return None
+
+
+ def migrate_database(self, config):
+ """ Initialise the project directory of an existing database for
+ use with this tokenizer.
+
+ This is a special migration function for updating existing databases
+ to new software versions.
+ """
+ self.normalization = config.TERM_NORMALIZATION
+ module_dir = _install_module(config.DATABASE_MODULE_PATH,
+ config.lib_dir.module,
+ config.project_dir / 'module')
+
+ with connect(self.dsn) as conn:
+ _check_module(module_dir, conn)
+ self._save_config(conn, config)
+
+
+ def name_analyzer(self):
+ """ Create a new analyzer for tokenizing names and queries
+ using this tokenizer. Analyzers are context managers and should
+ be used accordingly:
+
+ ```
+ with tokenizer.name_analyzer() as analyzer:
+ analyzer.tokenize()
+ ```
+
+ When used outside the with construct, the caller must make sure to
+ call close() before the analyzer is destroyed.
+
+ Analyzers are not thread-safe. You need to instantiate one per thread.
+ """
+ normalizer = Transliterator.createFromRules("phrase normalizer",
+ self.normalization)
+ return LegacyNameAnalyzer(self.dsn, normalizer)
+
+
+ def _install_php(self, config):
+ """ Install the php script for the tokenizer.
+ """
+ php_file = self.data_dir / "tokenizer.php"
+ php_file.write_text(dedent("""\
+ <?php
+ @define('CONST_Max_Word_Frequency', {0.MAX_WORD_FREQUENCY});
+ @define('CONST_Term_Normalization_Rules', "{0.TERM_NORMALIZATION}");
+ require_once('{0.lib_dir.php}/tokenizer/legacy_tokenizer.php');
+ """.format(config)))
+
+
+ def _init_db_tables(self, config):
+ """ Set up the word table and fill it with pre-computed word
+ frequencies.
+ """
+ with connect(self.dsn) as conn:
+ sqlp = SQLPreprocessor(conn, config)
+ sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
+ conn.commit()
+
+ LOG.warning("Precomputing word tokens")
+ db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')
+
+
+ def _save_config(self, conn, config):
+ """ Save the configuration that needs to remain stable for the given
+ database as database properties.
+ """
+ properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
+ properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
+
+
+class LegacyNameAnalyzer:
+ """ The legacy analyzer uses the special Postgresql module for
+ splitting names.
+
+ Each instance opens a connection to the database to request the
+ normalization.
+ """
+
+ def __init__(self, dsn, normalizer):
+ self.conn = connect(dsn).connection
+ self.conn.autocommit = True
+ self.normalizer = normalizer
+ psycopg2.extras.register_hstore(self.conn)
+
+ self._cache = _TokenCache(self.conn)
+
+
+ def __enter__(self):
+ return self
+
+
+ def __exit__(self, exc_type, exc_value, traceback):
+ self.close()
+
+
+ def close(self):
+ """ Free all resources used by the analyzer.
+ """
+ if self.conn:
+ self.conn.close()
+ self.conn = None
+
+
+ def normalize(self, phrase):
+ """ Normalize the given phrase, i.e. remove all properties that
+ are irrelevant for search.
+ """
+ return self.normalizer.transliterate(phrase)
+
+
+ def add_postcodes_from_db(self):
+ """ Add postcodes from the location_postcode table to the word table.
+ """
+ with self.conn.cursor() as cur:
+ cur.execute("""SELECT count(create_postcode_id(pc))
+ FROM (SELECT distinct(postcode) as pc
+ FROM location_postcode) x""")
+
+
+ def update_special_phrases(self, phrases):
+ """ Replace the search index for special phrases with the new phrases.
+ """
+ norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
+ for p in phrases))
+
+ with self.conn.cursor() as cur:
+ # Get the old phrases.
+ existing_phrases = set()
+ cur.execute("""SELECT word, class, type, operator FROM word
+ WHERE class != 'place'
+ OR (type != 'house' AND type != 'postcode')""")
+ for label, cls, typ, oper in cur:
+ existing_phrases.add((label, cls, typ, oper or '-'))
+
+ to_add = norm_phrases - existing_phrases
+ to_delete = existing_phrases - norm_phrases
+
+ if to_add:
+ psycopg2.extras.execute_values(
+ cur,
+ """ INSERT INTO word (word_id, word_token, word, class, type,
+ search_name_count, operator)
+ (SELECT nextval('seq_word'), make_standard_name(name), name,
+ class, type, 0,
+ CASE WHEN op in ('in', 'near') THEN op ELSE null END
+ FROM (VALUES %s) as v(name, class, type, op))""",
+ to_add)
+
+ if to_delete:
+ psycopg2.extras.execute_values(
+ cur,
+ """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
+ WHERE word = name and class = in_class and type = in_type
+ and ((op = '-' and operator is null) or op = operator)""",
+ to_delete)
+
+ LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
+ len(norm_phrases), len(to_add), len(to_delete))
+
+
+ def add_country_names(self, country_code, names):
+ """ Add names for the given country to the search index.
+ """
+ with self.conn.cursor() as cur:
+ cur.execute(
+ """INSERT INTO word (word_id, word_token, country_code)
+ (SELECT nextval('seq_word'), lookup_token, %s
+ FROM (SELECT ' ' || make_standard_name(n) as lookup_token
+ FROM unnest(%s) n) y
+ WHERE NOT EXISTS(SELECT * FROM word
+ WHERE word_token = lookup_token and country_code = %s))
+ """, (country_code, names, country_code))
+
+
+ def process_place(self, place):
+ """ Determine tokenizer information about the given place.
+
+ Returns a JSON-serialisable structure that will be handed into
+ the database via the token_info field.
+ """
+ token_info = _TokenInfo(self._cache)
+
+ names = place.get('name')
+
+ if names:
+ token_info.add_names(self.conn, names)
+
+ country_feature = place.get('country_feature')
+ if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
+ self.add_country_names(country_feature.lower(), list(names.values()))
+
+ address = place.get('address')
+
+ if address:
+ hnrs = []
+ addr_terms = []
+ for key, value in address.items():
+ if key == 'postcode':
+ self._add_postcode(value)
+ elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
+ hnrs.append(value)
+ elif key == 'street':
+ token_info.add_street(self.conn, value)
+ elif key == 'place':
+ token_info.add_place(self.conn, value)
+ elif not key.startswith('_') and \
+ key not in ('country', 'full'):
+ addr_terms.append((key, value))
+
+ if hnrs:
+ token_info.add_housenumbers(self.conn, hnrs)
+
+ if addr_terms:
+ token_info.add_address_terms(self.conn, addr_terms)
+
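+ # token_info.data is now a JSON-serialisable dict; illustratively:
+ # {'names': '{1,2}', 'hnr': '12', 'hnr_tokens': '{55}',
+ # 'street': '{77}', 'addr': {'city': ('{5,6}', '{7}')}}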
+ return token_info.data
+
+
+ def _add_postcode(self, postcode):
+ """ Make sure the normalized postcode is present in the word table.
+ """
+ def _create_postcode_from_db(pcode):
+ with self.conn.cursor() as cur:
+ cur.execute('SELECT create_postcode_id(%s)', (pcode, ))
+
+ if re.search(r'[:,;]', postcode) is None:
+ self._cache.postcodes.get(postcode.strip().upper(), _create_postcode_from_db)
+
+
+class _TokenInfo:
+ """ Collect token information to be sent back to the database.
+ """
+ def __init__(self, cache):
+ self.cache = cache
+ self.data = {}
+
+
+ def add_names(self, conn, names):
+ """ Add token information for the names of the place.
+ """
+ with conn.cursor() as cur:
+ # Create the token IDs for all names.
+ self.data['names'] = cur.scalar("SELECT make_keywords(%s)::text",
+ (names, ))
+
+
+ def add_housenumbers(self, conn, hnrs):
+ """ Extract housenumber information from the address.
+ """
+ if len(hnrs) == 1:
+ token = self.cache.get_housenumber(hnrs[0])
+ if token is not None:
+ self.data['hnr_tokens'] = token
+ self.data['hnr'] = hnrs[0]
+ return
+
+ # split numbers if necessary
+ simple_list = []
+ for hnr in hnrs:
+ simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
+
+ if len(simple_list) > 1:
+ simple_list = list(set(simple_list))
+
+ with conn.cursor() as cur:
+ cur.execute("SELECT (create_housenumbers(%s)).* ", (simple_list, ))
+ self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone()
+
+
+ def add_street(self, conn, street):
+ """ Add addr:street match terms.
+ """
+ def _get_street(name):
+ with conn.cursor() as cur:
+ return cur.scalar("SELECT word_ids_from_name(%s)::text", (name, ))
+
+ self.data['street'] = self.cache.streets.get(street, _get_street)
+
+
+ def add_place(self, conn, place):
+ """ Add addr:place search and match terms.
+ """
+ def _get_place(name):
+ with conn.cursor() as cur:
+ cur.execute("""SELECT (addr_ids_from_name(%s)
+ || getorcreate_name_id(make_standard_name(%s), ''))::text,
+ word_ids_from_name(%s)::text""",
+ (name, name, name))
+ return cur.fetchone()
+
+ self.data['place_search'], self.data['place_match'] = \
+ self.cache.places.get(place, _get_place)
+
+
+ def add_address_terms(self, conn, terms):
+ """ Add additional address terms.
+ """
+ def _get_address_term(name):
+ with conn.cursor() as cur:
+ cur.execute("""SELECT addr_ids_from_name(%s)::text,
+ word_ids_from_name(%s)::text""",
+ (name, name))
+ return cur.fetchone()
+
+ tokens = {}
+ for key, value in terms:
+ tokens[key] = self.cache.address_terms.get(value, _get_address_term)
+
+ self.data['addr'] = tokens
+
+
+class _LRU:
+ """ Least recently used cache that accepts a generator function to
+ produce the item when there is a cache miss.
+ """
+
+ def __init__(self, maxsize=128, init_data=None):
+ self.data = init_data or OrderedDict()
+ self.maxsize = maxsize
+ if init_data is not None and len(init_data) > maxsize:
+ self.maxsize = len(init_data)
+
+ def get(self, key, generator):
+ """ Get the item with the given key from the cache. If nothing
+ is found in the cache, generate the value through the
+ generator function and store it in the cache.
+ """
+ value = self.data.get(key)
+ if value is not None:
+ self.data.move_to_end(key)
+ else:
+ value = generator(key)
+ if len(self.data) >= self.maxsize:
+ self.data.popitem(last=False)
+ self.data[key] = value
+
+ return value
+
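A short usage sketch for the cache: the generator only runs on a miss, so repeated lookups of the same key stay cheap.

    calls = []

    def _expensive_lookup(key):
        calls.append(key)
        return key.upper()

    cache = _LRU(maxsize=2)
    assert cache.get('a', _expensive_lookup) == 'A'  # miss: generator runs
    assert cache.get('a', _expensive_lookup) == 'A'  # hit: served from cache
    assert calls == ['a']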
+
+class _TokenCache:
+ """ Cache for token information to avoid repeated database queries.
+
+ This cache is not thread-safe and needs to be instantiated per
+ analyzer.
+ """
+ def __init__(self, conn):
+ # various LRU caches
+ self.streets = _LRU(maxsize=256)
+ self.places = _LRU(maxsize=128)
+ self.address_terms = _LRU(maxsize=1024)
+
+ # Look up housenumbers up to 100 and cache them
+ with conn.cursor() as cur:
+ cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
+ FROM generate_series(1, 100) as i""")
+ self._cached_housenumbers = {str(r[0]) : r[1] for r in cur}
+
+ # Get postcodes that are already saved
+ postcodes = OrderedDict()
+ with conn.cursor() as cur:
+ cur.execute("""SELECT word FROM word
+ WHERE class ='place' and type = 'postcode'""")
+ for row in cur:
+ postcodes[row[0]] = None
+ self.postcodes = _LRU(maxsize=32, init_data=postcodes)
+
+ def get_housenumber(self, number):
+ """ Get a housenumber token from the cache.
+ """
+ return self._cached_housenumbers.get(number)
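Because the numbers 1 to 100 are precomputed in the constructor, the common case never needs a database round trip; a sketch, assuming `conn` is an open connection with the tokenizer functions installed:

    cache = _TokenCache(conn)
    assert cache.get_housenumber('42') is not None   # precomputed
    assert cache.get_housenumber('12a') is None      # caller falls back to create_housenumbers()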
from enum import Enum
from textwrap import dedent
-import psycopg2
-
from nominatim.db.connection import connect
from nominatim.errors import UsageError
+from nominatim.tokenizer import factory as tokenizer_factory
CHECKLIST = []
def _get_indexes(conn):
- indexes = ['idx_word_word_id',
- 'idx_place_addressline_address_place_id',
+ indexes = ['idx_place_addressline_address_place_id',
'idx_placex_rank_search',
'idx_placex_rank_address',
'idx_placex_parent_place_id',
@_check(hint="""placex table has no data. Did the import finish successfully?""")
-def check_placex_size(conn, config): # pylint: disable=W0613
+def check_placex_size(conn, _):
""" Checking for placex content
"""
with conn.cursor() as cur:
return CheckState.OK if cnt > 0 else CheckState.FATAL
-@_check(hint="""\
- The Postgresql extension nominatim.so was not correctly loaded.
-
- Error: {error}
-
- Hints:
- * Check the output of the CMmake/make installation step
- * Does nominatim.so exist?
- * Does nominatim.so exist on the database server?
- * Can nominatim.so be accessed by the database user?
- """)
-def check_module(conn, config): # pylint: disable=W0613
- """ Checking that nominatim.so module is installed
+@_check(hint="""{msg}""")
+def check_tokenizer(_, config):
+ """ Checking that tokenizer works
"""
- with conn.cursor() as cur:
- try:
- out = cur.scalar("SELECT make_standard_name('a')")
- except psycopg2.ProgrammingError as err:
- return CheckState.FAIL, dict(error=str(err))
+ try:
+ tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
+ except UsageError:
+ return CheckState.FAIL, dict(msg="""\
+ Cannot load tokenizer. Did the import finish successfully?"""
- if out != 'a':
- return CheckState.FAIL, dict(error='Unexpected result for make_standard_name()')
+ result = tokenizer.check_database()
+ if result is None:
return CheckState.OK
+ return CheckState.FAIL, dict(msg=result)
+
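The contract for check_database() is deliberately simple: return None when the tokenizer is healthy, otherwise a human-readable message that becomes the hint. A self-contained sketch of a conforming implementation (not the actual legacy tokenizer code):

    class SketchTokenizer:
        """ Sketch of the check_database() contract used above. """
        def __init__(self, word_table_exists):
            self.word_table_exists = word_table_exists

        def check_database(self):
            if self.word_table_exists:
                return None
            return 'The word table is missing. Rerun the import.'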
@_check(hint="""\
The indexing didn't finish. {count} entries are not yet indexed.
To index the remaining entries, run: {index_cmd}
""")
-def check_indexing(conn, config): # pylint: disable=W0613
+def check_indexing(conn, _):
""" Checking indexing status
"""
with conn.cursor() as cur:
if cnt == 0:
return CheckState.OK
- if conn.index_exists('idx_word_word_id'):
+ if conn.index_exists('idx_placex_rank_search'):
# Likely just an interrupted update.
index_cmd = 'nominatim index'
else:
Rerun the index creation with: nominatim import --continue db-postprocess
""")
-def check_database_indexes(conn, config): # pylint: disable=W0613
+def check_database_indexes(conn, _):
""" Checking that database indexes are complete
"""
missing = []
Invalid indexes:
{indexes}
""")
-def check_database_index_valid(conn, config): # pylint: disable=W0613
+def check_database_index_valid(conn, _):
""" Checking that all database indexes are valid
"""
with conn.cursor() as cur:
import os
import selectors
import subprocess
-import shutil
from pathlib import Path
import psutil
-import psycopg2
+import psycopg2.extras
from nominatim.db.connection import connect, get_pg_env
from nominatim.db import utils as db_utils
raise UsageError('PostGIS version is too old.')
-def install_module(src_dir, project_dir, module_dir, conn=None):
- """ Copy the normalization module from src_dir into the project
- directory under the '/module' directory. If 'module_dir' is set, then
- use the module from there instead and check that it is accessible
- for Postgresql.
-
- The function detects when the installation is run from the
- build directory. It doesn't touch the module in that case.
-
- If 'conn' is given, then the function also tests if the module
- can be access via the given database.
- """
- if not module_dir:
- module_dir = project_dir / 'module'
-
- if not module_dir.exists() or not src_dir.samefile(module_dir):
-
- if not module_dir.exists():
- module_dir.mkdir()
-
- destfile = module_dir / 'nominatim.so'
- shutil.copy(str(src_dir / 'nominatim.so'), str(destfile))
- destfile.chmod(0o755)
-
- LOG.info('Database module installed at %s', str(destfile))
- else:
- LOG.info('Running from build directory. Leaving database module as is.')
- else:
- LOG.info("Using custom path for database module at '%s'", module_dir)
-
- if conn is not None:
- with conn.cursor() as cur:
- try:
- cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
- RETURNS text AS '{}/nominatim.so', 'transliteration'
- LANGUAGE c IMMUTABLE STRICT;
- DROP FUNCTION nominatim_test_import_func(text)
- """.format(module_dir))
- except psycopg2.DatabaseError as err:
- LOG.fatal("Error accessing database module: %s", err)
- raise UsageError("Database module cannot be accessed.") from err
-
-
def import_base_data(dsn, sql_dir, ignore_partitions=False):
""" Create and populate the tables with basic static data that provides
the background for geocoding. Data is assumed to not yet exist.
sql.run_sql_file(conn, 'partition-tables.src.sql')
-def truncate_data_tables(conn, max_word_frequency=None):
+def truncate_data_tables(conn):
""" Truncate all data tables to prepare for a fresh load.
"""
with conn.cursor() as cur:
- cur.execute('TRUNCATE word')
cur.execute('TRUNCATE placex')
cur.execute('TRUNCATE place_addressline')
cur.execute('TRUNCATE location_area')
for table in [r[0] for r in list(cur)]:
cur.execute('TRUNCATE ' + table)
- if max_word_frequency is not None:
- # Used by getorcreate_word_id to ignore frequent partial words.
- cur.execute("""CREATE OR REPLACE FUNCTION get_maxwordfreq()
- RETURNS integer AS $$
- SELECT {} as maxwordfreq;
- $$ LANGUAGE SQL IMMUTABLE
- """.format(max_word_frequency))
- conn.commit()
+ conn.commit()
_COPY_COLUMNS = 'osm_type, osm_id, class, type, name, admin_level, address, extratags, geometry'
-def load_data(dsn, data_dir, threads):
+def load_data(dsn, threads):
""" Copy data into the word and placex table.
"""
- # Pre-calculate the most important terms in the word list.
- db_utils.execute_file(dsn, data_dir / 'words.sql')
-
sel = selectors.DefaultSelector()
# Then copy data from place to placex in <threads - 1> chunks.
place_threads = max(1, threads - 1)
sql.run_sql_file(conn, 'indices.sql', drop=drop)
-def create_country_names(conn, config):
- """ Create search index for default country names.
+def create_country_names(conn, tokenizer, languages=None):
+ """ Add default country names to search index. `languages` is a comma-
+ separated list of language codes as used in OSM. If `languages` is not
+ empty then only name translations for the given languages are added
+ to the index.
"""
+ if languages:
+ languages = languages.split(',')
+
+ def _include_key(key):
+ return key == 'name' or \
+ (key.startswith('name:') \
+ and (not languages or key[5:] in languages))
with conn.cursor() as cur:
- cur.execute("""SELECT getorcreate_country(make_standard_name('uk'), 'gb')""")
- cur.execute("""SELECT getorcreate_country(make_standard_name('united states'), 'us')""")
- cur.execute("""SELECT COUNT(*) FROM
- (SELECT getorcreate_country(make_standard_name(country_code),
- country_code) FROM country_name WHERE country_code is not null) AS x""")
- cur.execute("""SELECT COUNT(*) FROM
- (SELECT getorcreate_country(make_standard_name(name->'name'), country_code)
- FROM country_name WHERE name ? 'name') AS x""")
- sql_statement = """SELECT COUNT(*) FROM (SELECT getorcreate_country(make_standard_name(v),
- country_code) FROM (SELECT country_code, skeys(name)
- AS k, svals(name) AS v FROM country_name) x WHERE k"""
-
- languages = config.LANGUAGES
-
- if languages:
- sql_statement = "{} IN (".format(sql_statement)
- delim = ''
- for language in languages.split(','):
- sql_statement = "{}{}'name:{}'".format(sql_statement, delim, language)
- delim = ', '
- sql_statement = '{})'.format(sql_statement)
- else:
- sql_statement = "{} LIKE 'name:%'".format(sql_statement)
- sql_statement = "{}) v".format(sql_statement)
- cur.execute(sql_statement)
+ psycopg2.extras.register_hstore(cur)
+ cur.execute("""SELECT country_code, name FROM country_name
+ WHERE country_code is not null""")
+
+ with tokenizer.name_analyzer() as analyzer:
+ for code, name in cur:
+ names = [code]
+ if code == 'gb':
+ names.append('UK')
+ if code == 'us':
+ names.append('United States')
+
+ # country names (only in languages as provided)
+ if name:
+ names.extend((v for k, v in name.items() if _include_key(k)))
+
+ analyzer.add_country_names(code, names)
+
conn.commit()
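A worked example of the language filter: with languages='en,de' only the unqualified name tag and the matching translations pass.

    languages = ['en', 'de']

    def _include_key(key):
        return key == 'name' or \
               (key.startswith('name:') and (not languages or key[5:] in languages))

    assert _include_key('name')
    assert _include_key('name:de')
    assert not _include_key('name:fr')
    assert not _include_key('int_name')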
from nominatim.db import properties
from nominatim.db.connection import connect
from nominatim.version import NOMINATIM_VERSION
-from nominatim.tools import refresh, database_import
+from nominatim.tools import refresh
+from nominatim.tokenizer import factory as tokenizer_factory
from nominatim.errors import UsageError
LOG = logging.getLogger()
'{0[0]}.{0[1]}.{0[2]}-{0[3]}'.format(version))
kwargs = dict(conn=conn, config=config, paths=paths)
func(**kwargs)
+ conn.commit()
has_run_migration = True
if has_run_migration:
LOG.warning('Updating SQL functions.')
refresh.create_functions(conn, config)
+ tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
+ tokenizer.update_sql_functions(config)
properties.set_property(conn, 'database_version',
'{0[0]}.{0[1]}.{0[2]}-{0[3]}'.format(NOMINATIM_VERSION))
TYPE timestamp with time zone;""")
-@_migration(3, 5, 0, 99)
-def install_database_module_in_project_directory(conn, config, paths, **_):
- """ Install database module in project directory.
-
- The database module needs to be present in the project directory
- since those were introduced.
- """
- database_import.install_module(paths.module_dir, paths.project_dir,
- config.DATABASE_MODULE_PATH, conn=conn)
-
-
@_migration(3, 5, 0, 99)
def add_nominatim_property_table(conn, config, **_):
""" Add nominatim_property table.
The database schema switched from saving raw housenumbers in
placex.housenumber to saving transliterated ones.
+
+ Note: the function create_housenumber_id() has been dropped in later
+ versions.
"""
with conn.cursor() as cur:
cur.execute("""CREATE OR REPLACE FUNCTION create_housenumber_id(housenumber TEXT)
and class = 'place' and type != 'postcode'
and linked_place_id is null""")
cur.execute(""" DROP INDEX IF EXISTS idx_placex_adminname """)
+
+
+@_migration(3, 7, 0, 1)
+def install_legacy_tokenizer(conn, config, **_):
+ """ Setup legacy tokenizer.
+
+ If no other tokenizer has been configured yet, then create the
+ configuration for the backwards-compatible legacy tokenizer.
+ """
+ if properties.get_property(conn, 'tokenizer') is None:
+ with conn.cursor() as cur:
+ for table in ('placex', 'location_property_osmline'):
+ has_column = cur.scalar("""SELECT count(*) FROM information_schema.columns
+ WHERE table_name = %s
+ and column_name = 'token_info'""",
+ (table, ))
+ if has_column == 0:
+ cur.execute('ALTER TABLE {} ADD COLUMN token_info JSONB'.format(table))
+ tokenizer = tokenizer_factory.create_tokenizer(config, init_db=False,
+ module_name='legacy')
+
+ tokenizer.migrate_database(config)
from nominatim.db.utils import execute_file
from nominatim.db.connection import connect
-def import_postcodes(dsn, project_dir):
+def import_postcodes(dsn, project_dir, tokenizer):
""" Set up the initial list of postcodes.
"""
INSERT INTO location_postcode
(place_id, indexed_status, country_code, postcode, geometry)
SELECT nextval('seq_place'), 1, country_code,
- upper(trim (both ' ' from address->'postcode')) as pc,
+ token_normalized_postcode(address->'postcode') as pc,
ST_Centroid(ST_Collect(ST_Centroid(geometry)))
FROM placex
- WHERE address ? 'postcode' AND address->'postcode' NOT SIMILAR TO '%(,|;)%'
+ WHERE address ? 'postcode'
+ and token_normalized_postcode(address->'postcode') is not null
AND geometry IS NOT null
GROUP BY country_code, pc
""")
cur.execute("""
INSERT INTO location_postcode
(place_id, indexed_status, country_code, postcode, geometry)
- SELECT nextval('seq_place'), 1, 'us', postcode,
+ SELECT nextval('seq_place'), 1, 'us',
+ token_normalized_postcode(postcode),
ST_SetSRID(ST_Point(x,y),4326)
- FROM us_postcode WHERE postcode NOT IN
+ FROM us_postcode WHERE token_normalized_postcode(postcode) NOT IN
(SELECT postcode FROM location_postcode
WHERE country_code = 'us')
""")
cur.execute("""
INSERT INTO location_postcode
(place_id, indexed_status, country_code, postcode, geometry)
- SELECT nextval('seq_place'), 1, 'gb', postcode, geometry
- FROM gb_postcode WHERE postcode NOT IN
+ SELECT nextval('seq_place'), 1, 'gb',
+ token_normalized_postcode(postcode), geometry
+ FROM gb_postcode WHERE token_normalized_postcode(postcode) NOT IN
(SELECT postcode FROM location_postcode
WHERE country_code = 'gb')
""")
DELETE FROM word WHERE class='place' and type='postcode'
and word NOT IN (SELECT postcode FROM location_postcode)
""")
-
- cur.execute("""
- SELECT count(getorcreate_postcode_id(v)) FROM
- (SELECT distinct(postcode) as v FROM location_postcode) p
- """)
conn.commit()
+
+ with tokenizer.name_analyzer() as analyzer:
+ analyzer.add_postcodes_from_db()
('Default_Language', 'DEFAULT_LANGUAGE', str),
('Log_DB', 'LOG_DB', bool),
('Log_File', 'LOG_FILE', str),
- ('Max_Word_Frequency', 'MAX_WORD_FREQUENCY', int),
('NoAccessControl', 'CORS_NOACCESSCONTROL', bool),
('Places_Max_ID_count', 'LOOKUP_MAX_COUNT', int),
('PolygonOutput_MaximumTypes', 'POLYGON_OUTPUT_MAX_TYPES', int),
('Search_BatchMode', 'SEARCH_BATCH_MODE', bool),
('Search_NameOnlySearchFrequencyThreshold', 'SEARCH_NAME_ONLY_THRESHOLD', str),
- ('Term_Normalization_Rules', 'TERM_NORMALIZATION', str),
('Use_US_Tiger_Data', 'USE_US_TIGER_DATA', bool),
('MapIcon_URL', 'MAPICON_URL', str),
)
@define('CONST_Debug', $_GET['debug'] ?? false);
@define('CONST_LibDir', '{0}');
+ @define('CONST_TokenizerDir', '{2}');
@define('CONST_NominatimVersion', '{1[0]}.{1[1]}.{1[2]}-{1[3]}');
- """.format(config.lib_dir.php, NOMINATIM_VERSION))
+ """.format(config.lib_dir.php, NOMINATIM_VERSION,
+ config.project_dir / 'tokenizer'))
for php_name, conf_name, var_type in PHP_CONST_DEFS:
if var_type == bool:
self.tables_deleted = 0
self.tables_ignored = 0
self.global_phrases_invalid = 0
- self.global_phrases_added = 0
- self.global_phrases_ignored = 0
- self.global_phrases_deleted = 0
def _set_lang_values_to_0(self):
"""
lang to 0.
"""
self.lang_phrases_invalid = 0
- self.lang_phrases_added = 0
- self.lang_phrases_ignored = 0
def notify_one_phrase_invalid(self):
"""
self.lang_phrases_invalid += 1
self.global_phrases_invalid += 1
- def notify_one_phrase_added(self):
- """
- Add +1 to the count of entries
- added to the db.
- """
- self.lang_phrases_added += 1
- self.global_phrases_added += 1
-
- def notify_one_phrase_ignored(self):
- """
- Add +1 to the count of ignored
- entries as it was already in the db.
- """
- self.lang_phrases_ignored += 1
- self.global_phrases_ignored += 1
-
- def notify_one_phrase_deleted(self):
- """
- Add +1 to the count of phrases deleted
- from the database.
- """
- self.global_phrases_deleted += 1
-
def notify_one_table_created(self):
"""
Add +1 to the count of created tables.
LOG.info('- %s phrases were invalid.', self.global_phrases_invalid)
if self.global_phrases_invalid > 0:
LOG.info(' Those invalid phrases have been skipped.')
- LOG.info('- %s phrases were ignored as they are already in the database',
- self.global_phrases_ignored)
- LOG.info('- %s phrases were added to the database', self.global_phrases_added)
- LOG.info('- %s phrases were deleted from the database', self.global_phrases_deleted)
- if self.global_phrases_deleted > 0:
- LOG.info(' They were deleted as they are not valid anymore.')
LOG.info('- %s tables were ignored as they already exist on the database',
self.tables_ignored)
LOG.info('- %s tables were created', self.tables_created)
LOG.info('- %s phrases were invalid.', self.lang_phrases_invalid)
if self.lang_phrases_invalid > 0:
LOG.info(' Those invalid phrases have been skipped.')
- LOG.info('- %s phrases were ignored as they are already in the database',
- self.lang_phrases_ignored)
- LOG.info('- %s phrases were added to the database', self.lang_phrases_added)
LOG.info('====================================================================')
if self.lang_phrases_invalid > 0:
import subprocess
import json
-from icu import Transliterator
from psycopg2.sql import Identifier, Literal, SQL
from nominatim.tools.exec_utils import get_url
r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])'
)
self.sanity_check_pattern = re.compile(r'^\w+$')
- self.transliterator = Transliterator.createFromRules("special-phrases normalizer",
- self.config.TERM_NORMALIZATION)
- #This set will contain all existing phrases from the word table which
- #no longer exist on the wiki.
- #It contain tuples with the following format: (normalized_word, class, type, operator)
- self.words_phrases_to_delete = set()
- #This set will contain the phrases which still exist from the wiki.
- #It is used to prevent duplicates on the wiki by removing them from
- #the word_phrases_to_delete only at the end.
- self.words_phrases_still_exist = set()
+ # This set will contain all phrases to be added.
+ # It contains tuples with the following format: (label, class, type, operator)
+ self.word_phrases = set()
#This set will contain all existing place_classtype tables which don't match any
#special phrases class/type on the wiki.
self.table_phrases_to_delete = set()
- def import_from_wiki(self, languages=None):
+ def import_from_wiki(self, tokenizer, languages=None):
"""
Iterate through all specified languages and
extract corresponding special phrases from the wiki.
if languages is not None and not isinstance(languages, list):
raise TypeError('The \'languages\' argument should be of type list.')
- self._fetch_existing_words_phrases()
self._fetch_existing_place_classtype_tables()
#Get all languages to process.
self.statistics_handler.notify_current_lang_done(lang)
self._create_place_classtype_table_and_indexes(class_type_pairs)
- self._remove_non_existent_phrases_from_db()
+ self._remove_non_existent_tables_from_db()
self.db_connection.commit()
+
+ with tokenizer.name_analyzer() as analyzer:
+ analyzer.update_special_phrases(self.word_phrases)
+
LOG.warning('Import done.')
self.statistics_handler.notify_import_done()
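The phrases handed to the analyzer are plain (label, class, type, operator) tuples; illustrative examples of what the set may contain:

    word_phrases = {
        ('Zip Line', 'aerialway', 'zip_line', '-'),
        ('Zip Lines in', 'aerialway', 'zip_line', 'in'),
        ('Post Office near', 'amenity', 'post_office', 'near'),
    }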
- def _fetch_existing_words_phrases(self):
- """
- Fetch existing special phrases from the word table.
- Fill the word_phrases_to_delete set of the class.
- """
- #Only extract special phrases terms:
- #If class=place and type=house then it is a housenumber term.
- #If class=place and type=postcode then it is a postcode term.
- word_query = """
- SELECT word, class, type, operator FROM word
- WHERE class != 'place' OR (type != 'house' AND type != 'postcode')
- """
- with self.db_connection.cursor() as db_cursor:
- db_cursor.execute(SQL(word_query))
- for row in db_cursor:
- row[3] = '-' if row[3] is None else row[3]
- self.words_phrases_to_delete.add(
- (row[0], row[1], row[2], row[3])
- )
def _fetch_existing_place_classtype_tables(self):
"""
for match in matches:
phrase_label = match[0].strip()
- normalized_label = self.transliterator.transliterate(phrase_label)
phrase_class = match[1].strip()
phrase_type = match[2].strip()
phrase_operator = match[3].strip()
):
continue
- #Check if the phrase already exists in the database.
- if (
- (normalized_label, phrase_class, phrase_type, phrase_operator)
- in self.words_phrases_to_delete
- ):
- #Remove this phrase from the ones to delete as it still exist on the wiki.
- self.words_phrases_still_exist.add(
- (normalized_label, phrase_class, phrase_type, phrase_operator)
- )
- class_type_pairs.add((phrase_class, phrase_type))
- self.statistics_handler.notify_one_phrase_ignored()
- #Dont need to add this phrase as it already exists in the word table.
- continue
-
#sanity check, in case somebody added garbage in the wiki
if not self._check_sanity(lang, phrase_class, phrase_type):
self.statistics_handler.notify_one_phrase_invalid()
class_type_pairs.add((phrase_class, phrase_type))
- self._process_amenity(
- phrase_label, normalized_label, phrase_class,
- phrase_type, phrase_operator
- )
- self.statistics_handler.notify_one_phrase_added()
+ self.word_phrases.add((phrase_label, phrase_class,
+ phrase_type, phrase_operator))
return class_type_pairs
- def _process_amenity(self, phrase_label, normalized_label,
- phrase_class, phrase_type, phrase_operator):
- # pylint: disable-msg=too-many-arguments
- """
- Add phrase lookup and corresponding class and
- type to the word table based on the operator.
- """
- with self.db_connection.cursor() as db_cursor:
- if phrase_operator == 'near':
- db_cursor.execute("""SELECT getorcreate_amenityoperator(
- make_standard_name(%s), %s, %s, %s, 'near')""",
- (phrase_label, normalized_label, phrase_class, phrase_type))
- elif phrase_operator == 'in':
- db_cursor.execute("""SELECT getorcreate_amenityoperator(
- make_standard_name(%s), %s, %s, %s, 'in')""",
- (phrase_label, normalized_label, phrase_class, phrase_type))
- else:
- db_cursor.execute("""SELECT getorcreate_amenity(
- make_standard_name(%s), %s, %s, %s)""",
- (phrase_label, normalized_label, phrase_class, phrase_type))
-
def _create_place_classtype_table_and_indexes(self, class_type_pairs):
"""
.format(Identifier(table_name),
Identifier(self.config.DATABASE_WEBUSER)))
- def _remove_non_existent_phrases_from_db(self):
+ def _remove_non_existent_tables_from_db(self):
"""
Remove special phrases which don't exist on the wiki anymore.
- Delete from the word table and delete the place_classtype tables.
+ Delete the place_classtype tables.
"""
LOG.warning('Cleaning database...')
- self.words_phrases_to_delete = self.words_phrases_to_delete - self.words_phrases_still_exist
#Array containing all queries to execute. Contain tuples of format (query, parameters)
queries_parameters = []
- #Delete phrases from the word table which are not on the wiki anymore.
- for phrase_to_delete in self.words_phrases_to_delete:
- self.statistics_handler.notify_one_phrase_deleted()
- if phrase_to_delete[3] == '-':
- query = """
- DELETE FROM word WHERE word = %s AND class = %s AND type = %s AND operator IS null
- """
- parameters = (phrase_to_delete[0], phrase_to_delete[1], phrase_to_delete[2], )
- queries_parameters.append((query, parameters))
- else:
- query = """
- DELETE FROM word WHERE word = %s AND class = %s AND type = %s AND operator = %s
- """
- parameters = (phrase_to_delete[0], phrase_to_delete[1],
- phrase_to_delete[2], phrase_to_delete[3], )
- queries_parameters.append((query, parameters))
-
#Delete place_classtype tables corresponding to class/type which are not on the wiki anymore
for table in self.table_phrases_to_delete:
self.statistics_handler.notify_one_table_deleted()
# and must always be increased when there is a change to the database or code
# that requires a migration.
# Released versions always have a database patch level of 0.
-NOMINATIM_VERSION = (3, 7, 0, 1)
+NOMINATIM_VERSION = (3, 7, 0, 2)
POSTGRESQL_REQUIRED_VERSION = (9, 3)
POSTGIS_REQUIRED_VERSION = (2, 2)
# Changing this value requires to run 'nominatim refresh --functions'.
NOMINATIM_DATABASE_MODULE_PATH=
+# Tokenizer used for normalizing and parsing queries and names.
+# The tokenizer is set up during import and cannot be changed afterwards
+# without a reimport.
+# Currently available tokenizers: legacy
+NOMINATIM_TOKENIZER="legacy"
+
# Number of occurrences of a word before it is considered frequent.
# Similar to the concept of stop words. Frequent partial words get ignored
# or handled differently during search.
When importing
Then search_name contains
| object | nameaddress_vector |
- | N1 | Rose, Street, Walltown |
+ | N1 | #Rose Street, Walltown |
When searching for "23 Rose Street, Walltown"
Then results contain
| osm_type | osm_id | name |
When importing
Then search_name contains
| object | name_vector | nameaddress_vector |
- | N1 | #Green Moss | Rose, Street, Walltown |
+ | N1 | #Green Moss | #Rose Street, Walltown |
When searching for "Green Moss, Rose Street, Walltown"
Then results contain
| osm_type | osm_id | name |
When importing
Then search_name contains
| object | name_vector | nameaddress_vector |
- | N1 | foo | the road |
+ | N1 | foo | #the road |
Scenario: Some addr: tags are added to address
Given the scene roads-with-pois
from nominatim import cli
from nominatim.config import Configuration
from nominatim.tools import refresh
+from nominatim.tokenizer import factory as tokenizer_factory
from steps.utils import run_script
class NominatimEnvironment:
self.website_dir.cleanup()
self.website_dir = tempfile.TemporaryDirectory()
- cfg = Configuration(None, self.src_dir / 'settings', environ=self.test_env)
- cfg.lib_dir.php = self.src_dir / 'lib-php'
- refresh.setup_website(Path(self.website_dir.name) / 'website', cfg)
+ refresh.setup_website(Path(self.website_dir.name) / 'website',
+ self.get_test_config())
+
+
+ def get_test_config(self):
+ cfg = Configuration(Path(self.website_dir.name), self.src_dir / 'settings',
+ environ=self.test_env)
+ cfg.set_libdirs(module=self.build_dir / 'module',
+ osm2pgsql=self.build_dir / 'osm2pgsql' / 'osm2pgsql',
+ php=self.src_dir / 'lib-php',
+ sql=self.src_dir / 'lib-sql',
+ data=self.src_dir / 'data')
+ return cfg
def get_libpq_dsn(self):
dsn = self.test_env['NOMINATIM_DATABASE_DSN']
"""
self.write_nominatim_config(self.api_test_db)
- if self.api_db_done:
- return
+ if not self.api_db_done:
+ self.api_db_done = True
- self.api_db_done = True
-
- if self._reuse_or_drop_db(self.api_test_db):
- return
+ if not self._reuse_or_drop_db(self.api_test_db):
+ testdata = Path('__file__') / '..' / '..' / 'testdb'
+ self.test_env['NOMINATIM_WIKIPEDIA_DATA_PATH'] = str(testdata.resolve())
- testdata = Path('__file__') / '..' / '..' / 'testdb'
- self.test_env['NOMINATIM_WIKIPEDIA_DATA_PATH'] = str(testdata.resolve())
+ try:
+ self.run_nominatim('import', '--osm-file', str(self.api_test_file))
+ self.run_nominatim('add-data', '--tiger-data', str((testdata / 'tiger').resolve()))
+ self.run_nominatim('freeze')
- try:
- self.run_nominatim('import', '--osm-file', str(self.api_test_file))
- self.run_nominatim('add-data', '--tiger-data', str((testdata / 'tiger').resolve()))
- self.run_nominatim('freeze')
+ phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve())
+ run_script(['psql', '-d', self.api_test_db, '-f', phrase_file])
+ except:
+ self.db_drop_database(self.api_test_db)
+ raise
- phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve())
- run_script(['psql', '-d', self.api_test_db, '-f', phrase_file])
- except:
- self.db_drop_database(self.api_test_db)
- raise
+ tokenizer_factory.create_tokenizer(self.get_test_config(), init_db=False)
def setup_unknown_db(self):
""" Setup a test against a non-existing database.
"""
- self.write_nominatim_config('UNKNOWN_DATABASE_NAME')
+ # The tokenizer needs an existing database to function.
+ # So start with the usual database.
+ class _Context:
+ db = None
+
+ context = _Context()
+ self.setup_db(context)
+ tokenizer_factory.create_tokenizer(self.get_test_config(), init_db=False)
+
+ # Then drop the DB again
+ self.teardown_db(context, force_drop=True)
def setup_db(self, context):
""" Setup a test against a fresh, empty test database.
context.db.autocommit = True
psycopg2.extras.register_hstore(context.db, globally=False)
- def teardown_db(self, context):
+ def teardown_db(self, context, force_drop=False):
""" Remove the test database, if it exists.
"""
- if 'db' in context:
+ if hasattr(context, 'db'):
context.db.close()
- if not self.keep_scenario_db:
+ if force_drop or not self.keep_scenario_db:
self.db_drop_database(self.test_db)
def _reuse_or_drop_db(self, name):
from table_compare import NominatimID, DBRow
from nominatim.indexer import indexer
+from nominatim.tokenizer import factory as tokenizer_factory
def check_database_integrity(context):
""" Check some generic constraints on the tables.
def import_and_index_data_from_place_table(context):
""" Import data previously set up in the place table.
"""
+ nctx = context.nominatim
+
+ tokenizer = tokenizer_factory.create_tokenizer(nctx.get_test_config())
context.nominatim.copy_from_place(context.db)
# XXX use tool function as soon as it is ported
# Call directly as the refresh function does not include postcodes.
indexer.LOG.setLevel(logging.ERROR)
- indexer.Indexer(context.nominatim.get_libpq_dsn(), 1).index_full(analyse=False)
+ indexer.Indexer(context.nominatim.get_libpq_dsn(), tokenizer, 1).index_full(analyse=False)
check_database_integrity(context)
if exclude:
assert not present, "Found term for {}/{}: {}".format(row['object'], name, wid[1])
else:
- assert present, "Missing term for {}/{}: {}".fromat(row['object'], name, wid[1])
+ assert present, "Missing term for {}/{}: {}".format(row['object'], name, wid[1])
elif name != 'object':
assert db_row.contains(name, value), db_row.assert_msg(name, value)
public function testEmptyPhrase()
{
$oPhrase = new Phrase('', '');
- $oPhrase->computeWordSets(new TokensFullSet());
+ $oPhrase->computeWordSets(array(), new TokensFullSet());
- $this->assertEquals(
- array(array('')),
- $oPhrase->getWordSets()
- );
+ $this->assertNull($oPhrase->getWordSets());
}
public function testSingleWordPhrase()
{
$oPhrase = new Phrase('a', '');
- $oPhrase->computeWordSets(new TokensFullSet());
+ $oPhrase->computeWordSets(array('a'), new TokensFullSet());
$this->assertEquals(
'(a)',
public function testMultiWordPhrase()
{
$oPhrase = new Phrase('a b', '');
- $oPhrase->computeWordSets(new TokensFullSet());
+ $oPhrase->computeWordSets(array('a', 'b'), new TokensFullSet());
$this->assertEquals(
'(a b),(a|b)',
$this->serializeSets($oPhrase->getWordSets())
);
$oPhrase = new Phrase('a b c', '');
- $oPhrase->computeWordSets(new TokensFullSet());
+ $oPhrase->computeWordSets(array('a', 'b', 'c'), new TokensFullSet());
$this->assertEquals(
'(a b c),(a|b c),(a b|c),(a|b|c)',
$this->serializeSets($oPhrase->getWordSets())
);
$oPhrase = new Phrase('a b c d', '');
- $oPhrase->computeWordSets(new TokensFullSet());
+ $oPhrase->computeWordSets(array('a', 'b', 'c', 'd'), new TokensFullSet());
$this->assertEquals(
'(a b c d),(a b c|d),(a b|c d),(a|b c d),(a b|c|d),(a|b c|d),(a|b|c d),(a|b|c|d)',
$this->serializeSets($oPhrase->getWordSets())
public function testInverseWordSets()
{
$oPhrase = new Phrase('a b c', '');
- $oPhrase->computeWordSets(new TokensFullSet());
+ $oPhrase->computeWordSets(array('a', 'b', 'c'), new TokensFullSet());
$oPhrase->invertWordSets();
$this->assertEquals(
public function testMaxWordSets()
{
- $oPhrase = new Phrase(join(' ', array_fill(0, 4, 'a')), '');
- $oPhrase->computeWordSets(new TokensFullSet());
+ $aWords = array_fill(0, 4, 'a');
+ $oPhrase = new Phrase(join(' ', $aWords), '');
+ $oPhrase->computeWordSets($aWords, new TokensFullSet());
$this->assertEquals(8, count($oPhrase->getWordSets()));
$oPhrase->invertWordSets();
$this->assertEquals(8, count($oPhrase->getWordSets()));
- $oPhrase = new Phrase(join(' ', array_fill(0, 18, 'a')), '');
- $oPhrase->computeWordSets(new TokensFullSet());
+ $aWords = array_fill(0, 18, 'a');
+ $oPhrase = new Phrase(join(' ', $aWords), '');
+ $oPhrase->computeWordSets($aWords, new TokensFullSet());
$this->assertEquals(100, count($oPhrase->getWordSets()));
$oPhrase->invertWordSets();
$this->assertEquals(100, count($oPhrase->getWordSets()));
public function testPartialTokensShortTerm()
{
$oPhrase = new Phrase('a b c d', '');
- $oPhrase->computeWordSets(new TokensPartialSet(array('a', 'b', 'd', 'b c', 'b c d')));
+ $oPhrase->computeWordSets(array('a', 'b', 'c', 'd'), new TokensPartialSet(array('a', 'b', 'd', 'b c', 'b c d')));
$this->assertEquals(
'(a|b c d),(a|b c|d)',
$this->serializeSets($oPhrase->getWordSets())
public function testPartialTokensLongTerm()
{
- $oPhrase = new Phrase(join(' ', array_fill(0, 18, 'a')), '');
- $oPhrase->computeWordSets(new TokensPartialSet(array('a', 'a a a a a')));
+ $aWords = array_fill(0, 18, 'a');
+ $oPhrase = new Phrase(join(' ', $aWords), '');
+ $oPhrase->computeWordSets($aWords, new TokensPartialSet(array('a', 'a a a a a')));
$this->assertEquals(80, count($oPhrase->getWordSets()));
}
}
namespace Nominatim;
+@define('CONST_TokenizerDir', dirname(__FILE__));
+
require_once(CONST_LibDir.'/DB.php');
require_once(CONST_LibDir.'/Status.php');
$this->assertEquals('No database', $oStatus->status());
}
-
- public function testModuleFail()
- {
- $this->expectException(\Exception::class);
- $this->expectExceptionMessage('Module call failed');
- $this->expectExceptionCode(702);
-
- // stub has getOne method but doesn't return anything
- $oDbStub = $this->getMockBuilder(Nominatim\DB::class)
- ->setMethods(array('connect', 'getOne'))
- ->getMock();
-
- $oStatus = new Status($oDbStub);
- $this->assertNull($oStatus->status());
- }
-
-
- public function testWordIdQueryFail()
- {
- $this->expectException(\Exception::class);
- $this->expectExceptionMessage('No value');
- $this->expectExceptionCode(704);
-
- $oDbStub = $this->getMockBuilder(Nominatim\DB::class)
- ->setMethods(array('connect', 'getOne'))
- ->getMock();
-
- // return no word_id
- $oDbStub->method('getOne')
- ->will($this->returnCallback(function ($sql) {
- if (preg_match("/make_standard_name\('a'\)/", $sql)) return 'a';
- if (preg_match('/SELECT word_id, word_token/', $sql)) return null;
- }));
-
- $oStatus = new Status($oDbStub);
- $this->assertNull($oStatus->status());
- }
-
-
public function testOK()
{
$oDbStub = $this->getMockBuilder(Nominatim\DB::class)
$oDbStub = $this->getMockBuilder(Nominatim\DB::class)
->setMethods(array('getOne'))
->getMock();
-
+
$oDbStub->method('getOne')
->willReturn(1519430221);
$this->assertFalse($TL->contains('unknownword'));
$this->assertEquals(array(), $TL->get('unknownword'));
}
-
- public function testAddress()
- {
- $this->expectOutputRegex('/<p><tt>/');
-
- $oDbStub = $this->getMockBuilder(Nominatim\DB::class)
- ->setMethods(array('getAll', 'getDBQuotedList'))
- ->getMock();
-
- $oDbStub->method('getDBQuotedList')
- ->will($this->returnCallback(function ($aVals) {
- return array_map(function ($sVal) {
- return "'".$sVal."'";
- }, $aVals);
- }));
-
-
- $oDbStub->method('getAll')
- ->will($this->returnCallback(function ($sql) {
- $aResults = array();
- if (preg_match('/1051/', $sql)) {
- $aResults[] = $this->wordResult(array(
- 'word_id' => 999,
- 'word_token' => '1051',
- 'class' => 'place',
- 'type' => 'house'
- ));
- }
- if (preg_match('/hauptstr/', $sql)) {
- $aResults[] = $this->wordResult(array(
- 'word_id' => 999,
- 'word_token' => 'hauptstr',
- 'class' => 'place',
- 'type' => 'street',
- 'operator' => true
- ));
- }
- if (preg_match('/64286/', $sql)) {
- $aResults[] = $this->wordResult(array(
- 'word_id' => 999,
- 'word_token' => '64286',
- 'word' => '64286',
- 'class' => 'place',
- 'type' => 'postcode'
- ));
- }
- if (preg_match('/darmstadt/', $sql)) {
- $aResults[] = $this->wordResult(array(
- 'word_id' => 999,
- 'word_token' => 'darmstadt',
- 'count' => 533
- ));
- }
- if (preg_match('/alemagne/', $sql)) {
- $aResults[] = $this->wordResult(array(
- 'word_id' => 999,
- 'word_token' => 'alemagne',
- 'country_code' => 'de',
- ));
- }
- if (preg_match('/mexico/', $sql)) {
- $aResults[] = $this->wordResult(array(
- 'word_id' => 999,
- 'word_token' => 'mexico',
- 'country_code' => 'mx',
- ));
- }
- return $aResults;
- }));
-
- $aCountryCodes = array('de', 'fr');
- $sNormQuery = '1051 hauptstr 64286 darmstadt alemagne mexico';
- $aTokens = explode(' ', $sNormQuery);
-
- $TL = new TokenList;
- $TL->addTokensFromDB($oDbStub, $aTokens, $aCountryCodes, $sNormQuery, $this->oNormalizer);
- $this->assertEquals(5, $TL->count());
-
- $this->assertEquals(array(new Token\HouseNumber(999, '1051')), $TL->get('1051'));
- $this->assertEquals(array(new Token\Country(999, 'de')), $TL->get('alemagne'));
- $this->assertEquals(array(new Token\Postcode(999, '64286')), $TL->get('64286'));
- $this->assertEquals(array(new Token\Word(999, true, 533, 0)), $TL->get('darmstadt'));
- $this->assertEquals(array(new Token\SpecialTerm(999, 'place', 'street', true)), $TL->get('hauptstr'));
- }
}
--- /dev/null
+<?php
+
+namespace Nominatim;
+
+class Tokenizer
+{
+ private $oDB;
+
+ public function __construct(&$oDB)
+ {
+ $this->oDB =& $oDB;
+ }
+
+ public function checkStatus()
+ {
+ }
+}
+import importlib
import itertools
import sys
from pathlib import Path
from nominatim.config import Configuration
from nominatim.db import connection
from nominatim.db.sql_preprocessor import SQLPreprocessor
+from nominatim.db import properties
+
+import dummy_tokenizer
class _TestingCursor(psycopg2.extras.DictCursor):
""" Extension to the DictCursor class that provides execution
def mk_table(name, definition='id INT', content=None):
temp_db_cursor.execute('CREATE TABLE {} ({})'.format(name, definition))
if content is not None:
- if not isinstance(content, str):
- content = '),('.join([str(x) for x in content])
- temp_db_cursor.execute("INSERT INTO {} VALUES ({})".format(name, content))
+ psycopg2.extras.execute_values(
+ temp_db_cursor, "INSERT INTO {} VALUES %s".format(name), content)
return mk_table
yield Path(phpdir)
+
+@pytest.fixture
+def property_table(table_factory):
+ table_factory('nominatim_properties', 'property TEXT, value TEXT')
+
@pytest.fixture
def status_table(temp_db_conn):
""" Create an empty version of the status table and
@pytest.fixture
def sql_preprocessor(temp_db_conn, tmp_path, monkeypatch, table_factory):
- monkeypatch.setenv('NOMINATIM_DATABASE_MODULE_PATH', '.')
- table_factory('country_name', 'partition INT', (0, 1, 2))
+ table_factory('country_name', 'partition INT', ((0, ), (1, ), (2, )))
cfg = Configuration(None, SRC_DIR.resolve() / 'settings')
cfg.set_libdirs(module='.', osm2pgsql='.', php=SRC_DIR / 'lib-php',
sql=tmp_path, data=SRC_DIR / 'data')
return SQLPreprocessor(temp_db_conn, cfg)
+
+
+@pytest.fixture
+def tokenizer_mock(monkeypatch, property_table, temp_db_conn, tmp_path):
+ """ Sets up the configuration so that the test dummy tokenizer will be
+ loaded when the tokenizer factory is used. Also returns a factory
+ with which a new dummy tokenizer may be created.
+ """
+ monkeypatch.setenv('NOMINATIM_TOKENIZER', 'dummy')
+
+ def _import_dummy(module, *args, **kwargs):
+ return dummy_tokenizer
+
+ monkeypatch.setattr(importlib, "import_module", _import_dummy)
+ properties.set_property(temp_db_conn, 'tokenizer', 'dummy')
+
+ def _create_tokenizer():
+ return dummy_tokenizer.DummyTokenizer(None, None)
+
+ return _create_tokenizer
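A test can then request the fixture and create the dummy tokenizer through the regular factory, mirroring the factory tests further below; a sketch:

    from nominatim.tokenizer import factory

    def test_uses_dummy_tokenizer(tokenizer_mock, def_config, tmp_path):
        def_config.project_dir = tmp_path
        tokenizer = factory.create_tokenizer(def_config)
        assert tokenizer.init_state == "new"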
--- /dev/null
+"""
+Tokenizer for testing.
+"""
+
+def create(dsn, data_dir):
+ """ Create a new instance of the tokenizer provided by this module.
+ """
+ return DummyTokenizer(dsn, data_dir)
+
+class DummyTokenizer:
+
+ def __init__(self, dsn, data_dir):
+ self.dsn = dsn
+ self.data_dir = data_dir
+ self.init_state = None
+ self.analyser_cache = {}
+
+
+ def init_new_db(self, *args, **kwargs):
+ assert self.init_state is None
+ self.init_state = "new"
+
+
+ def init_from_project(self):
+ assert self.init_state is None
+ self.init_state = "loaded"
+
+
+ def finalize_import(self, _):
+ pass
+
+
+ def name_analyzer(self):
+ return DummyNameAnalyzer(self.analyser_cache)
+
+
+class DummyNameAnalyzer:
+
+ def __enter__(self):
+ return self
+
+ def __exit__(self, exc_type, exc_value, traceback):
+ self.close()
+
+
+ def __init__(self, cache):
+ self.analyser_cache = cache
+ cache['countries'] = []
+
+
+ def close(self):
+ pass
+
+ def add_postcodes_from_db(self):
+ pass
+
+ def update_special_phrases(self, phrases):
+ self.analyser_cache['special_phrases'] = phrases
+
+ def add_country_names(self, code, names):
+ self.analyser_cache['countries'].append((code, names))
+
+ def process_place(self, place):
+ return {}
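The dummy analyzer records what the code under test did, so tests can assert against analyser_cache; for example:

    tok = create('dbname=test', None)
    with tok.name_analyzer() as analyzer:
        analyzer.add_country_names('gb', ['United Kingdom'])
        analyzer.update_special_phrases({('Pub', 'amenity', 'pub', '-')})

    assert tok.analyser_cache['countries'] == [('gb', ['United Kingdom'])]
    assert tok.analyser_cache['special_phrases'] == {('Pub', 'amenity', 'pub', '-')}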
import nominatim.tools.freeze
import nominatim.tools.refresh
import nominatim.tools.postcodes
+import nominatim.tokenizer.factory
from mocks import MockParamCapture
return get_mock
+@pytest.fixture
+def tokenizer_mock(monkeypatch):
+ class DummyTokenizer:
+ def __init__(self, *args, **kwargs):
+ self.update_sql_functions_called = False
+ self.finalize_import_called = False
+
+ def update_sql_functions(self, *args):
+ self.update_sql_functions_called = True
+
+ def finalize_import(self, *args):
+ self.finalize_import_called = True
+
+ tok = DummyTokenizer()
+ monkeypatch.setattr(nominatim.tokenizer.factory, 'get_tokenizer_for_db',
+ lambda *args: tok)
+ monkeypatch.setattr(nominatim.tokenizer.factory, 'create_tokenizer',
+ lambda *args: tok)
+
+ return tok
+
+
def test_cli_help(capsys):
""" Running nominatim tool without arguments prints help.
"""
assert 1 == call_nominatim('import', '--osm-file', '.')
-def test_import_full(temp_db, mock_func_factory):
+def test_import_full(temp_db, mock_func_factory, tokenizer_mock):
mocks = [
mock_func_factory(nominatim.tools.database_import, 'setup_database_skeleton'),
- mock_func_factory(nominatim.tools.database_import, 'install_module'),
mock_func_factory(nominatim.tools.database_import, 'import_osm_data'),
mock_func_factory(nominatim.tools.refresh, 'import_wikipedia_articles'),
mock_func_factory(nominatim.tools.database_import, 'truncate_data_tables'),
cf_mock = mock_func_factory(nominatim.tools.refresh, 'create_functions')
assert 0 == call_nominatim('import', '--osm-file', __file__)
+ assert tokenizer_mock.finalize_import_called
assert cf_mock.called > 1
assert mock.called == 1, "Mock '{}' not called".format(mock.func_name)
-def test_import_continue_load_data(temp_db, mock_func_factory):
+def test_import_continue_load_data(temp_db, mock_func_factory, tokenizer_mock):
mocks = [
mock_func_factory(nominatim.tools.database_import, 'truncate_data_tables'),
mock_func_factory(nominatim.tools.database_import, 'load_data'),
]
assert 0 == call_nominatim('import', '--continue', 'load-data')
+ assert tokenizer_mock.finalize_import_called
for mock in mocks:
assert mock.called == 1, "Mock '{}' not called".format(mock.func_name)
-def test_import_continue_indexing(temp_db, mock_func_factory, placex_table, temp_db_conn):
+def test_import_continue_indexing(temp_db, mock_func_factory, placex_table,
+ temp_db_conn, tokenizer_mock):
mocks = [
mock_func_factory(nominatim.tools.database_import, 'create_search_indices'),
mock_func_factory(nominatim.tools.database_import, 'create_country_names'),
assert temp_db_conn.index_exists('idx_placex_pendingsector')
-def test_import_continue_postprocess(temp_db, mock_func_factory):
+def test_import_continue_postprocess(temp_db, mock_func_factory, tokenizer_mock):
mocks = [
mock_func_factory(nominatim.tools.database_import, 'create_search_indices'),
mock_func_factory(nominatim.tools.database_import, 'create_country_names'),
assert 0 == call_nominatim('import', '--continue', 'db-postprocess')
+ assert tokenizer_mock.finalize_import_called
+
for mock in mocks:
assert mock.called == 1, "Mock '{}' not called".format(mock.func_name)
(['--boundaries-only'], 1, 0),
(['--no-boundaries'], 0, 1),
(['--boundaries-only', '--no-boundaries'], 0, 0)])
-def test_index_command(mock_func_factory, temp_db_cursor, params, do_bnds, do_ranks):
+def test_index_command(mock_func_factory, temp_db_cursor, tokenizer_mock,
+ params, do_bnds, do_ranks):
temp_db_cursor.execute("CREATE TABLE import_status (indexed bool)")
bnd_mock = mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_boundaries')
rank_mock = mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_by_rank')
assert bnd_mock.called == do_bnds
assert rank_mock.called == do_ranks
-def test_special_phrases_command(temp_db, mock_func_factory):
+def test_special_phrases_command(temp_db, mock_func_factory, tokenizer_mock):
func = mock_func_factory(nominatim.clicmd.special_phrases.SpecialPhrasesImporter, 'import_from_wiki')
call_nominatim('special-phrases', '--import-from-wiki')
('postcodes', 'update_postcodes'),
('word-counts', 'recompute_word_counts'),
('address-levels', 'load_address_levels_from_file'),
- ('functions', 'create_functions'),
('wiki-data', 'import_wikipedia_articles'),
('importance', 'recompute_importance'),
('website', 'setup_website'),
assert func_mock.called == 1
+def test_refresh_create_functions(mock_func_factory, temp_db, tokenizer_mock):
+ func_mock = mock_func_factory(nominatim.tools.refresh, 'create_functions')
+
+ assert 0 == call_nominatim('refresh', '--functions')
+ assert func_mock.called == 1
+ assert tokenizer_mock.update_sql_functions_called
+
+
def test_refresh_importance_computed_after_wiki_import(monkeypatch, temp_db):
calls = []
monkeypatch.setattr(nominatim.tools.refresh, 'import_wikipedia_articles',
cli_args=['replication'] + list(args))
@pytest.fixture
-def index_mock(monkeypatch):
+def tokenizer_mock(monkeypatch):
+ class DummyTokenizer:
+ def __init__(self, *args, **kwargs):
+ self.update_sql_functions_called = False
+ self.finalize_import_called = False
+
+ def update_sql_functions(self, *args):
+ self.update_sql_functions_called = True
+
+ def finalize_import(self, *args):
+ self.finalize_import_called = True
+
+ tok = DummyTokenizer()
+ monkeypatch.setattr(nominatim.tokenizer.factory, 'get_tokenizer_for_db',
+ lambda *args: tok)
+ monkeypatch.setattr(nominatim.tokenizer.factory, 'create_tokenizer',
+ lambda *args: tok)
+
+ return tok
+
+
+@pytest.fixture
+def index_mock(monkeypatch, tokenizer_mock):
mock = MockParamCapture()
monkeypatch.setattr(nominatim.indexer.indexer.Indexer, 'index_boundaries', mock)
monkeypatch.setattr(nominatim.indexer.indexer.Indexer, 'index_by_rank', mock)
@pytest.fixture
-def update_mock(mock_func_factory, init_status):
+def update_mock(mock_func_factory, init_status, tokenizer_mock):
return mock_func_factory(nominatim.tools.replication, 'update')
@pytest.mark.parametrize("params,func", [
("'{{db.partitions|join}}'", '012'),
("{% if 'country_name' in db.tables %}'yes'{% else %}'no'{% endif %}", "yes"),
("{% if 'xxx' in db.tables %}'yes'{% else %}'no'{% endif %}", "no"),
- ("'{{config.DATABASE_MODULE_PATH}}'", '.')
])
def test_load_file_simple(sql_preprocessor, sql_factory, temp_db_conn, temp_db_cursor, expr, ret):
sqlfile = sql_factory("RETURN {};".format(expr))
import pytest
from nominatim.indexer import indexer
+from nominatim.tokenizer import factory
class IndexerTestDB:
self.conn = conn
self.conn.set_isolation_level(0)
with self.conn.cursor() as cur:
+ cur.execute('CREATE EXTENSION hstore')
cur.execute("""CREATE TABLE placex (place_id BIGINT,
class TEXT,
type TEXT,
indexed_date TIMESTAMP,
partition SMALLINT,
admin_level SMALLINT,
+ address HSTORE,
+ token_info JSONB,
geometry_sector INTEGER)""")
cur.execute("""CREATE TABLE location_property_osmline (
place_id BIGINT,
+ osm_id BIGINT,
+ address HSTORE,
+ token_info JSONB,
indexed_status SMALLINT,
indexed_date TIMESTAMP,
geometry_sector INTEGER)""")
END IF;
RETURN NEW;
END; $$ LANGUAGE plpgsql;""")
+ cur.execute("""CREATE OR REPLACE FUNCTION placex_prepare_update(p placex,
+ OUT name HSTORE,
+ OUT address HSTORE,
+ OUT country_feature VARCHAR)
+ AS $$
+ BEGIN
+ address := p.address;
+ name := p.address;
+ END;
+ $$ LANGUAGE plpgsql STABLE;
+ """)
+ cur.execute("""CREATE OR REPLACE FUNCTION get_interpolation_address(in_address HSTORE, wayid BIGINT)
+ RETURNS HSTORE AS $$
+ BEGIN
+ RETURN in_address;
+ END;
+ $$ LANGUAGE plpgsql STABLE;
+ """)
+
for table in ('placex', 'location_property_osmline', 'location_postcode'):
cur.execute("""CREATE TRIGGER {0}_update BEFORE UPDATE ON {0}
FOR EACH ROW EXECUTE PROCEDURE date_update()
next_id = next(self.osmline_id)
with self.conn.cursor() as cur:
cur.execute("""INSERT INTO location_property_osmline
- (place_id, indexed_status, geometry_sector)
- VALUES (%s, 1, %s)""",
- (next_id, sector))
+ (place_id, osm_id, indexed_status, geometry_sector)
+ VALUES (%s, %s, 1, %s)""",
+ (next_id, next_id, sector))
return next_id
def add_postcode(self, country, postcode):
yield IndexerTestDB(temp_db_conn)
+@pytest.fixture
+def test_tokenizer(tokenizer_mock, def_config, tmp_path):
+ def_config.project_dir = tmp_path
+ return factory.create_tokenizer(def_config)
+
+
@pytest.mark.parametrize("threads", [1, 15])
-def test_index_all_by_rank(test_db, threads):
+def test_index_all_by_rank(test_db, threads, test_tokenizer):
for rank in range(31):
test_db.add_place(rank_address=rank, rank_search=rank)
test_db.add_osmline()
assert 31 == test_db.placex_unindexed()
assert 1 == test_db.osmline_unindexed()
- idx = indexer.Indexer('dbname=test_nominatim_python_unittest', threads)
+ idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, threads)
idx.index_by_rank(0, 30)
assert 0 == test_db.placex_unindexed()
@pytest.mark.parametrize("threads", [1, 15])
-def test_index_partial_without_30(test_db, threads):
+def test_index_partial_without_30(test_db, threads, test_tokenizer):
for rank in range(31):
test_db.add_place(rank_address=rank, rank_search=rank)
test_db.add_osmline()
assert 31 == test_db.placex_unindexed()
assert 1 == test_db.osmline_unindexed()
- idx = indexer.Indexer('dbname=test_nominatim_python_unittest', threads)
+ idx = indexer.Indexer('dbname=test_nominatim_python_unittest',
+ test_tokenizer, threads)
idx.index_by_rank(4, 15)
assert 19 == test_db.placex_unindexed()
@pytest.mark.parametrize("threads", [1, 15])
-def test_index_partial_with_30(test_db, threads):
+def test_index_partial_with_30(test_db, threads, test_tokenizer):
for rank in range(31):
test_db.add_place(rank_address=rank, rank_search=rank)
test_db.add_osmline()
assert 31 == test_db.placex_unindexed()
assert 1 == test_db.osmline_unindexed()
- idx = indexer.Indexer('dbname=test_nominatim_python_unittest', threads)
+ idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, threads)
idx.index_by_rank(28, 30)
assert 27 == test_db.placex_unindexed()
WHERE indexed_status = 0 AND rank_address between 1 and 27""")
@pytest.mark.parametrize("threads", [1, 15])
-def test_index_boundaries(test_db, threads):
+def test_index_boundaries(test_db, threads, test_tokenizer):
for rank in range(4, 10):
test_db.add_admin(rank_address=rank, rank_search=rank)
for rank in range(31):
assert 37 == test_db.placex_unindexed()
assert 1 == test_db.osmline_unindexed()
- idx = indexer.Indexer('dbname=test_nominatim_python_unittest', threads)
+ idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, threads)
idx.index_boundaries(0, 30)
assert 31 == test_db.placex_unindexed()
@pytest.mark.parametrize("threads", [1, 15])
-def test_index_postcodes(test_db, threads):
+def test_index_postcodes(test_db, threads, test_tokenizer):
for postcode in range(1000):
test_db.add_postcode('de', postcode)
for postcode in range(32000, 33000):
test_db.add_postcode('us', postcode)
- idx = indexer.Indexer('dbname=test_nominatim_python_unittest', threads)
+ idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, threads)
idx.index_postcodes()
assert 0 == test_db.scalar("""SELECT count(*) FROM location_postcode
@pytest.mark.parametrize("analyse", [True, False])
-def test_index_full(test_db, analyse):
+def test_index_full(test_db, analyse, test_tokenizer):
for rank in range(4, 10):
test_db.add_admin(rank_address=rank, rank_search=rank)
for rank in range(31):
for postcode in range(1000):
test_db.add_postcode('de', postcode)
- idx = indexer.Indexer('dbname=test_nominatim_python_unittest', 4)
+ idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, 4)
idx.index_full(analyse=analyse)
assert 0 == test_db.placex_unindexed()
@pytest.mark.parametrize("threads", [1, 15])
-def test_index_reopen_connection(test_db, threads, monkeypatch):
+def test_index_reopen_connection(test_db, threads, monkeypatch, test_tokenizer):
monkeypatch.setattr(indexer.WorkerPool, "REOPEN_CONNECTIONS_AFTER", 15)
for _ in range(1000):
test_db.add_place(rank_address=30, rank_search=30)
- idx = indexer.Indexer('dbname=test_nominatim_python_unittest', threads)
+ idx = indexer.Indexer('dbname=test_nominatim_python_unittest', test_tokenizer, threads)
idx.index_by_rank(28, 30)
assert 0 == test_db.placex_unindexed()
--- /dev/null
+"""
+Tests for creating new tokenizers.
+"""
+import importlib
+import pytest
+
+from nominatim.db import properties
+from nominatim.tokenizer import factory
+from nominatim.errors import UsageError
+from dummy_tokenizer import DummyTokenizer
+
+@pytest.fixture
+def test_config(def_config, tmp_path):
+ def_config.project_dir = tmp_path
+ return def_config
+
+
+def test_setup_dummy_tokenizer(temp_db_conn, test_config,
+ tokenizer_mock, property_table):
+ tokenizer = factory.create_tokenizer(test_config)
+
+ assert isinstance(tokenizer, DummyTokenizer)
+ assert tokenizer.init_state == "new"
+ assert (test_config.project_dir / 'tokenizer').is_dir()
+
+ assert properties.get_property(temp_db_conn, 'tokenizer') == 'dummy'
+
+
+def test_setup_tokenizer_dir_exists(test_config, tokenizer_mock, property_table):
+ (test_config.project_dir / 'tokenizer').mkdir()
+
+ tokenizer = factory.create_tokenizer(test_config)
+
+ assert isinstance(tokenizer, DummyTokenizer)
+ assert tokenizer.init_state == "new"
+
+
+def test_setup_tokenizer_dir_failure(test_config, tokenizer_mock, property_table):
+ (test_config.project_dir / 'tokenizer').write_text("foo")
+
+ with pytest.raises(UsageError):
+ factory.create_tokenizer(test_config)
+
+
+def test_setup_bad_tokenizer_name(test_config, monkeypatch):
+ monkeypatch.setenv('NOMINATIM_TOKENIZER', 'dummy')
+
+ with pytest.raises(UsageError):
+ factory.create_tokenizer(test_config)
+
+def test_load_tokenizer(temp_db_conn, test_config,
+ tokenizer_mock, property_table):
+ factory.create_tokenizer(test_config)
+
+ tokenizer = factory.get_tokenizer_for_db(test_config)
+
+ assert isinstance(tokenizer, DummyTokenizer)
+ assert tokenizer.init_state == "loaded"
+
+
+def test_load_no_tokenizer_dir(test_config, tokenizer_mock, property_table):
+ factory.create_tokenizer(test_config)
+
+ test_config.project_dir = test_config.project_dir / 'foo'
+
+ with pytest.raises(UsageError):
+ factory.get_tokenizer_for_db(test_config)
+
+
+def test_load_missing_property(temp_db_cursor, test_config, tokenizer_mock, property_table):
+ factory.create_tokenizer(test_config)
+
+ temp_db_cursor.execute("TRUNCATE TABLE nominatim_properties")
+
+ with pytest.raises(UsageError):
+ factory.get_tokenizer_for_db(test_config)
+
--- /dev/null
+"""
+Tests for the legacy tokenizer.
+"""
+import shutil
+
+import pytest
+
+from nominatim.tokenizer import legacy_tokenizer
+from nominatim.db import properties
+from nominatim.errors import UsageError
+
+@pytest.fixture
+def test_config(def_config, tmp_path):
+ def_config.project_dir = tmp_path / 'project'
+ def_config.project_dir.mkdir()
+
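+    # Fake out everything init_new_db() needs: a module dir holding a
+    # stand-in nominatim.so, stub SQL scripts, and a copy of the real
+    # legacy_tokenizer_tables.sql so the word table is actually created.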
+ module_dir = tmp_path / 'module_src'
+ module_dir.mkdir()
+    (module_dir / 'nominatim.so').write_text('TEST nominatim.so')
+
+ def_config.lib_dir.module = module_dir
+
+ sqldir = tmp_path / 'sql'
+ sqldir.mkdir()
+ (sqldir / 'tokenizer').mkdir()
+ (sqldir / 'tokenizer' / 'legacy_tokenizer.sql').write_text("SELECT 'a'")
+ (sqldir / 'words.sql').write_text("SELECT 'a'")
+ shutil.copy(str(def_config.lib_dir.sql / 'tokenizer' / 'legacy_tokenizer_tables.sql'),
+ str(sqldir / 'tokenizer' / 'legacy_tokenizer_tables.sql'))
+
+ def_config.lib_dir.sql = sqldir
+ def_config.lib_dir.data = sqldir
+
+ return def_config
+
+
+@pytest.fixture
+def tokenizer_factory(dsn, tmp_path, monkeypatch, property_table):
+ (tmp_path / 'tokenizer').mkdir()
+
+ def _maker():
+ return legacy_tokenizer.create(dsn, tmp_path / 'tokenizer')
+
+ return _maker
+
+@pytest.fixture
+def tokenizer_setup(tokenizer_factory, test_config, monkeypatch, sql_preprocessor):
+    monkeypatch.setattr(legacy_tokenizer, '_check_module', lambda m, c: None)
+ tok = tokenizer_factory()
+ tok.init_new_db(test_config)
+
+
+@pytest.fixture
+def analyzer(tokenizer_factory, test_config, monkeypatch, sql_preprocessor,
+ word_table, temp_db_with_extensions, tmp_path):
+ sql = tmp_path / 'sql' / 'tokenizer' / 'legacy_tokenizer.sql'
+ sql.write_text("""
+ CREATE OR REPLACE FUNCTION getorcreate_housenumber_id(lookup_word TEXT)
+ RETURNS INTEGER AS $$ SELECT 342; $$ LANGUAGE SQL;
+ """)
+
+    monkeypatch.setattr(legacy_tokenizer, '_check_module', lambda m, c: None)
+ monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', ':: lower();')
+ tok = tokenizer_factory()
+ tok.init_new_db(test_config)
+ monkeypatch.undo()
+
+ with tok.name_analyzer() as analyzer:
+ yield analyzer
+
+
+@pytest.fixture
+def make_standard_name(temp_db_cursor):
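+    # Stand-in for the tokenizer's normalisation function: it prefixes a
+    # blank, matching the leading space the word_token expectations below
+    # rely on (e.g. ' könig bei').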
+ temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION make_standard_name(name TEXT)
+ RETURNS TEXT AS $$ SELECT ' ' || name; $$ LANGUAGE SQL""")
+
+
+@pytest.fixture
+def create_postcode_id(table_factory, temp_db_cursor):
+ table_factory('out_postcode_table', 'postcode TEXT')
+
+ temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION create_postcode_id(postcode TEXT)
+ RETURNS BOOLEAN AS $$
+ INSERT INTO out_postcode_table VALUES (postcode) RETURNING True;
+ $$ LANGUAGE SQL""")
+
+
+@pytest.fixture
+def create_housenumbers(temp_db_cursor):
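+    # Stubbed DB function: returns the housenumber array in its text form as
+    # the tokens and the numbers joined with ';' as the normalised text,
+    # which is the shape the hnr/hnr_tokens assertions below expect.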
+ temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION create_housenumbers(
+ housenumbers TEXT[],
+ OUT tokens TEXT, OUT normtext TEXT)
+ AS $$
+ SELECT housenumbers::TEXT, array_to_string(housenumbers, ';')
+ $$ LANGUAGE SQL""")
+
+
+@pytest.fixture
+def make_keywords(temp_db_cursor, temp_db_with_extensions):
+ temp_db_cursor.execute(
+ """CREATE OR REPLACE FUNCTION make_keywords(names HSTORE)
+ RETURNS INTEGER[] AS $$ SELECT ARRAY[1, 2, 3] $$ LANGUAGE SQL""")
+
+def test_init_new(tokenizer_factory, test_config, monkeypatch,
+ temp_db_conn, sql_preprocessor):
+ monkeypatch.setenv('NOMINATIM_TERM_NORMALIZATION', 'xxvv')
+    monkeypatch.setattr(legacy_tokenizer, '_check_module', lambda m, c: None)
+
+ tok = tokenizer_factory()
+ tok.init_new_db(test_config)
+
+ assert properties.get_property(temp_db_conn, legacy_tokenizer.DBCFG_NORMALIZATION) == 'xxvv'
+
+ outfile = test_config.project_dir / 'module' / 'nominatim.so'
+
+ assert outfile.exists()
+    assert outfile.read_text() == 'TEST nominatim.so'
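+    # 33261 == 0o100755, i.e. a regular file with the execute bits set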
+ assert outfile.stat().st_mode == 33261
+
+
+def test_init_module_load_failed(tokenizer_factory, test_config,
+ monkeypatch, temp_db_conn):
+ tok = tokenizer_factory()
+
+ with pytest.raises(UsageError):
+ tok.init_new_db(test_config)
+
+
+def test_init_module_custom(tokenizer_factory, test_config,
+ monkeypatch, tmp_path, sql_preprocessor):
+ module_dir = (tmp_path / 'custom').resolve()
+ module_dir.mkdir()
+    (module_dir / 'nominatim.so').write_text('CUSTOM nominatim.so')
+
+ monkeypatch.setenv('NOMINATIM_DATABASE_MODULE_PATH', str(module_dir))
+    monkeypatch.setattr(legacy_tokenizer, '_check_module', lambda m, c: None)
+
+ tok = tokenizer_factory()
+ tok.init_new_db(test_config)
+
+ assert not (test_config.project_dir / 'module').exists()
+
+
+def test_init_from_project(tokenizer_setup, tokenizer_factory):
+ tok = tokenizer_factory()
+
+ tok.init_from_project()
+
+ assert tok.normalization is not None
+
+
+def test_update_sql_functions(sql_preprocessor, temp_db_conn,
+ tokenizer_factory, test_config, table_factory,
+ monkeypatch, temp_db_cursor):
+ monkeypatch.setenv('NOMINATIM_MAX_WORD_FREQUENCY', '1133')
+    monkeypatch.setattr(legacy_tokenizer, '_check_module', lambda m, c: None)
+ tok = tokenizer_factory()
+ tok.init_new_db(test_config)
+ monkeypatch.undo()
+
+ assert properties.get_property(temp_db_conn, legacy_tokenizer.DBCFG_MAXWORDFREQ) == '1133'
+
+ table_factory('test', 'txt TEXT')
+
+ func_file = test_config.lib_dir.sql / 'tokenizer' / 'legacy_tokenizer.sql'
+ func_file.write_text("""INSERT INTO test VALUES ('{{max_word_freq}}'),
+ ('{{modulepath}}')""")
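+    # the {{...}} placeholders are jinja variables that the SQL preprocessor
+    # substitutes when update_sql_functions() runs; the row-set assertion
+    # below checks the substituted values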
+
+ tok.update_sql_functions(test_config)
+
+ test_content = temp_db_cursor.row_set('SELECT * FROM test')
+ assert test_content == set((('1133', ), (str(test_config.project_dir / 'module'), )))
+
+
+def test_migrate_database(tokenizer_factory, test_config, temp_db_conn, monkeypatch):
+    monkeypatch.setattr(legacy_tokenizer, '_check_module', lambda m, c: None)
+ tok = tokenizer_factory()
+ tok.migrate_database(test_config)
+
+ assert properties.get_property(temp_db_conn, legacy_tokenizer.DBCFG_MAXWORDFREQ) is not None
+ assert properties.get_property(temp_db_conn, legacy_tokenizer.DBCFG_NORMALIZATION) is not None
+
+ outfile = test_config.project_dir / 'module' / 'nominatim.so'
+
+ assert outfile.exists()
+    assert outfile.read_text() == 'TEST nominatim.so'
+ assert outfile.stat().st_mode == 33261
+
+
+def test_normalize(analyzer):
+ assert analyzer.normalize('TEsT') == 'test'
+
+
+def test_add_postcodes_from_db(analyzer, table_factory, temp_db_cursor,
+ create_postcode_id):
+ table_factory('location_postcode', 'postcode TEXT',
+ content=(('1234',), ('12 34',), ('AB23',), ('1234',)))
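+    # the duplicated '1234' checks that duplicates are only imported once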
+
+ analyzer.add_postcodes_from_db()
+
+ assert temp_db_cursor.row_set("SELECT * from out_postcode_table") \
+ == set((('1234', ), ('12 34', ), ('AB23',)))
+
+
+def test_update_special_phrase_empty_table(analyzer, word_table, temp_db_cursor,
+ make_standard_name):
+ analyzer.update_special_phrases([
+ ("König bei", "amenity", "royal", "near"),
+ ("Könige", "amenity", "royal", "-"),
+ ("strasse", "highway", "primary", "in")
+ ])
+
+ assert temp_db_cursor.row_set("""SELECT word_token, word, class, type, operator
+ FROM word WHERE class != 'place'""") \
+ == set(((' könig bei', 'könig bei', 'amenity', 'royal', 'near'),
+ (' könige', 'könige', 'amenity', 'royal', None),
+ (' strasse', 'strasse', 'highway', 'primary', 'in')))
+
+
+def test_update_special_phrase_delete_all(analyzer, word_table, temp_db_cursor,
+ make_standard_name):
+ temp_db_cursor.execute("""INSERT INTO word (word_token, word, class, type, operator)
+ VALUES (' foo', 'foo', 'amenity', 'prison', 'in'),
+ (' bar', 'bar', 'highway', 'road', null)""")
+
+    assert 2 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'")
+
+ analyzer.update_special_phrases([])
+
+    assert 0 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'")
+
+
+def test_update_special_phrase_modify(analyzer, word_table, temp_db_cursor,
+ make_standard_name):
+ temp_db_cursor.execute("""INSERT INTO word (word_token, word, class, type, operator)
+ VALUES (' foo', 'foo', 'amenity', 'prison', 'in'),
+ (' bar', 'bar', 'highway', 'road', null)""")
+
+    assert 2 == temp_db_cursor.scalar("SELECT count(*) FROM word WHERE class != 'place'")
+
+ analyzer.update_special_phrases([
+ ('prison', 'amenity', 'prison', 'in'),
+ ('bar', 'highway', 'road', '-'),
+ ('garden', 'leisure', 'garden', 'near')
+ ])
+
+ assert temp_db_cursor.row_set("""SELECT word_token, word, class, type, operator
+ FROM word WHERE class != 'place'""") \
+ == set(((' prison', 'prison', 'amenity', 'prison', 'in'),
+ (' bar', 'bar', 'highway', 'road', None),
+ (' garden', 'garden', 'leisure', 'garden', 'near')))
+
+
+def test_process_place_names(analyzer, make_keywords):
+ info = analyzer.process_place({'name' : {'name' : 'Soft bAr', 'ref': '34'}})
+
+ assert info['names'] == '{1,2,3}'
+
+
+@pytest.mark.parametrize('pc', ['12345', 'AB 123', '34-345'])
+def test_process_place_postcode(analyzer, temp_db_cursor, create_postcode_id, pc):
+ info = analyzer.process_place({'address': {'postcode' : pc}})
+
+ assert temp_db_cursor.row_set("SELECT * from out_postcode_table") \
+ == set(((pc, ),))
+
+
+@pytest.mark.parametrize('pc', ['12:23', 'ab;cd;f', '123;836'])
+def test_process_place_bad_postcode(analyzer, temp_db_cursor, create_postcode_id,
+ pc):
+ info = analyzer.process_place({'address': {'postcode' : pc}})
+
+ assert 0 == temp_db_cursor.scalar("SELECT count(*) from out_postcode_table")
+
+
+@pytest.mark.parametrize('hnr', ['123a', '1', '101'])
+def test_process_place_housenumbers_simple(analyzer, create_housenumbers, hnr):
+ info = analyzer.process_place({'address': {'housenumber' : hnr}})
+
+ assert info['hnr'] == hnr
+ assert info['hnr_tokens'].startswith("{")
+
+
+def test_process_place_housenumbers_lists(analyzer, create_housenumbers):
+ info = analyzer.process_place({'address': {'conscriptionnumber' : '1; 2;3'}})
+
+ assert set(info['hnr'].split(';')) == set(('1', '2', '3'))
+
+
+def test_process_place_housenumbers_duplicates(analyzer, create_housenumbers):
+ info = analyzer.process_place({'address': {'housenumber' : '134',
+ 'conscriptionnumber' : '134',
+ 'streetnumber' : '99a'}})
+
+ assert set(info['hnr'].split(';')) == set(('134', '99a'))
assert chkdb.check_placex_size(temp_db_conn, def_config) == chkdb.CheckState.FATAL
-def test_check_module_bad(temp_db_conn, def_config):
- assert chkdb.check_module(temp_db_conn, def_config) == chkdb.CheckState.FAIL
+def test_check_tokenizer_missing(temp_db_conn, def_config, tmp_path):
+ def_config.project_dir = tmp_path
+ assert chkdb.check_tokenizer(temp_db_conn, def_config) == chkdb.CheckState.FAIL
+
+
+@pytest.mark.parametrize("check_result,state", [(None, chkdb.CheckState.OK),
+ ("Something wrong", chkdb.CheckState.FAIL)])
+def test_check_tokenizer(tokenizer_mock, temp_db_conn, def_config, monkeypatch,
+ check_result, state):
+ class _TestTokenizer:
+ def check_database(self):
+ return check_result
+
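+    # route get_tokenizer_for_db() to the stub above so check_tokenizer()
+    # sees exactly what check_database() reports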
+ monkeypatch.setattr(chkdb.tokenizer_factory, 'get_tokenizer_for_db',
+ lambda *a, **k: _TestTokenizer())
+ assert chkdb.check_tokenizer(temp_db_conn, def_config) == state
def test_check_indexing_good(temp_db_cursor, temp_db_conn, def_config):
database_import.setup_extensions(temp_db_conn)
-def test_install_module(tmp_path):
- src_dir = tmp_path / 'source'
- src_dir.mkdir()
- (src_dir / 'nominatim.so').write_text('TEST nomiantim.so')
-
- project_dir = tmp_path / 'project'
- project_dir.mkdir()
-
- database_import.install_module(src_dir, project_dir, '')
-
- outfile = project_dir / 'module' / 'nominatim.so'
-
- assert outfile.exists()
- assert outfile.read_text() == 'TEST nomiantim.so'
- assert outfile.stat().st_mode == 33261
-
-
-def test_install_module_custom(tmp_path):
- (tmp_path / 'nominatim.so').write_text('TEST nomiantim.so')
-
- database_import.install_module(tmp_path, tmp_path, str(tmp_path.resolve()))
-
- assert not (tmp_path / 'module').exists()
-
-
-def test_install_module_fail_access(temp_db_conn, tmp_path):
- (tmp_path / 'nominatim.so').write_text('TEST nomiantim.so')
-
- with pytest.raises(UsageError, match='.*module cannot be accessed.*'):
- database_import.install_module(tmp_path, tmp_path, '',
- conn=temp_db_conn)
-
-
def test_import_base_data(src_dir, temp_db, temp_db_cursor):
temp_db_cursor.execute('CREATE EXTENSION hstore')
temp_db_cursor.execute('CREATE EXTENSION postgis')
def test_truncate_database_tables(temp_db_conn, temp_db_cursor, table_factory):
- tables = ('word', 'placex', 'place_addressline', 'location_area',
+ tables = ('placex', 'place_addressline', 'location_area',
'location_area_country',
'location_property_tiger', 'location_property_osmline',
'location_postcode', 'search_name', 'location_road_23')
for table in tables:
- table_factory(table, content=(1, 2, 3))
+ table_factory(table, content=((1, ), (2, ), (3, )))
+ assert temp_db_cursor.table_rows(table) == 3
- database_import.truncate_data_tables(temp_db_conn, max_word_frequency=23)
+ database_import.truncate_data_tables(temp_db_conn)
for table in tables:
assert temp_db_cursor.table_rows(table) == 0
place_row(osm_type='W', osm_id=342, cls='place', typ='houses',
geom='SRID=4326;LINESTRING(0 0, 10 10)')
- database_import.load_data(dsn, src_dir / 'data', threads)
+ database_import.load_data(dsn, threads)
assert temp_db_cursor.table_rows('placex') == 30
assert temp_db_cursor.table_rows('location_property_osmline') == 1
-@pytest.mark.parametrize("languages", (False, True))
-def test_create_country_names(temp_db_conn, temp_db_cursor, def_config,
- temp_db_with_extensions, monkeypatch, languages):
- if languages:
- monkeypatch.setenv('NOMINATIM_LANGUAGES', 'fr,en')
- temp_db_cursor.execute("""CREATE FUNCTION make_standard_name (name TEXT)
- RETURNS TEXT AS $$ SELECT 'a'::TEXT $$ LANGUAGE SQL
- """)
- temp_db_cursor.execute('CREATE TABLE country_name (country_code varchar(2), name hstore)')
- temp_db_cursor.execute('CREATE TABLE word (code varchar(2))')
- temp_db_cursor.execute("""INSERT INTO country_name VALUES ('us',
- '"name"=>"us","name:af"=>"us"')""")
- temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION getorcreate_country(lookup_word TEXT,
- lookup_country_code varchar(2))
- RETURNS INTEGER
- AS $$
- BEGIN
- INSERT INTO word VALUES (lookup_country_code);
- RETURN 5;
- END;
- $$
- LANGUAGE plpgsql;
- """)
- database_import.create_country_names(temp_db_conn, def_config)
+
+@pytest.mark.parametrize("languages", (None, ' fr,en'))
+def test_create_country_names(temp_db_with_extensions, temp_db_conn, temp_db_cursor,
+ table_factory, tokenizer_mock, languages):
+
+ table_factory('country_name', 'country_code varchar(2), name hstore',
+ content=(('us', '"name"=>"us1","name:af"=>"us2"'),
+ ('fr', '"name"=>"Fra", "name:en"=>"Fren"')))
+
+ assert temp_db_cursor.scalar("SELECT count(*) FROM country_name") == 2
+
+ tokenizer = tokenizer_mock()
+
+ database_import.create_country_names(temp_db_conn, tokenizer, languages)
+
+ assert len(tokenizer.analyser_cache['countries']) == 2
+
+ result_set = {k: set(v) for k, v in tokenizer.analyser_cache['countries']}
+
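+    # with a language list, only the name:<lang> entries for the listed
+    # languages survive (the name:af value 'us2' is dropped); without one,
+    # every name variant is handed to the tokenizer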
if languages:
- assert temp_db_cursor.table_rows('word') == 4
+ assert result_set == {'us' : set(('us', 'us1', 'United States')),
+ 'fr' : set(('fr', 'Fra', 'Fren'))}
else:
- assert temp_db_cursor.table_rows('word') == 5
+ assert result_set == {'us' : set(('us', 'us1', 'us2', 'United States')),
+ 'fr' : set(('fr', 'Fra', 'Fren'))}
TEST_BASE_DIR = Path(__file__) / '..' / '..'
-def test_fetch_existing_words_phrases_basic(special_phrases_importer, word_table,
- temp_db_cursor):
- """
- Check for the fetch_existing_words_phrases() method.
- It should return special phrase term added to the word
- table.
- """
- query ="""
- INSERT INTO word VALUES(99999, 'lookup_token', 'normalized_word',
- 'class', 'type', null, 0, 'near');
- """
- temp_db_cursor.execute(query)
-
- assert not special_phrases_importer.words_phrases_to_delete
- special_phrases_importer._fetch_existing_words_phrases()
- contained_phrase = special_phrases_importer.words_phrases_to_delete.pop()
- assert contained_phrase == ('normalized_word', 'class', 'type', 'near')
-
-@pytest.mark.parametrize("house_type", ['house', 'postcode'])
-def test_fetch_existing_words_phrases_special_cases(special_phrases_importer, word_table,
- house_type, temp_db_cursor):
- """
- Check for the fetch_existing_words_phrases() method.
- It should return nothing as the terms added correspond
- to a housenumber and postcode term.
- """
- query ="""
- INSERT INTO word VALUES(99999, 'lookup_token', 'normalized_word',
- 'place', %s, null, 0, 'near');
- """
- temp_db_cursor.execute(query, (house_type,))
-
- special_phrases_importer._fetch_existing_words_phrases()
- assert not special_phrases_importer.words_phrases_to_delete
-
def test_fetch_existing_place_classtype_tables(special_phrases_importer, temp_db_cursor):
"""
Check for the fetch_existing_place_classtype_tables() method.
the same path is directly returned
"""
json_file = (TEST_BASE_DIR / 'testfiles' / 'phrase_settings.json').resolve()
-
+
returned = special_phrases_importer._convert_php_settings_if_needed(json_file)
assert returned == json_file
-def test_process_amenity_with_operator(special_phrases_importer, getorcreate_amenityoperator_funcs,
- temp_db_conn, word_table):
- """
- Test that _process_amenity() execute well the
- getorcreate_amenityoperator() SQL function and that
- the 2 differents operators are well handled.
- """
- special_phrases_importer._process_amenity('', '', '', '', 'near')
- special_phrases_importer._process_amenity('', '', '', '', 'in')
-
- with temp_db_conn.cursor() as temp_db_cursor:
- temp_db_cursor.execute("SELECT * FROM word WHERE operator='near' OR operator='in'")
- results = temp_db_cursor.fetchall()
-
- assert len(results) == 2
-
-def test_process_amenity_without_operator(special_phrases_importer, getorcreate_amenity_funcs,
- temp_db_conn, word_table):
- """
- Test that _process_amenity() execute well the
- getorcreate_amenity() SQL function.
- """
- special_phrases_importer._process_amenity('', '', '', '', '')
-
- with temp_db_conn.cursor() as temp_db_cursor:
- temp_db_cursor.execute("SELECT * FROM word WHERE operator='no_operator'")
- result = temp_db_cursor.fetchone()
-
- assert result
-
def test_create_place_classtype_indexes(temp_db_conn, special_phrases_importer):
"""
    Test that _create_place_classtype_indexes() creates the
assert check_placeid_and_centroid_indexes(temp_db_conn, pair[0], pair[1])
assert check_grant_access(temp_db_conn, def_config.DATABASE_WEBUSER, pair[0], pair[1])
-def test_process_xml_content(temp_db_conn, def_config, special_phrases_importer, word_table,
- getorcreate_amenity_funcs, getorcreate_amenityoperator_funcs):
+def test_process_xml_content(temp_db_conn, def_config, special_phrases_importer):
"""
    Test that _process_xml_content() processes the given xml content correctly
by executing the right SQL functions for amenities and
    #Convert the output to a dict for easier asserts below.
results = dict(special_phrases_importer._process_xml_content(get_test_xml_wiki_content(), 'en'))
- assert check_amenities_with_op(temp_db_conn)
- assert check_amenities_without_op(temp_db_conn)
assert results[class_test] and type_test in results.values()
-def test_remove_non_existent_phrases_from_db(special_phrases_importer, default_phrases,
+def test_remove_non_existent_tables_from_db(special_phrases_importer, default_phrases,
temp_db_conn):
"""
    Check for the _remove_non_existent_tables_from_db() method.
be deleted.
"""
with temp_db_conn.cursor() as temp_db_cursor:
- to_delete_phrase_tuple = ('normalized_word', 'class', 'type', 'near')
- to_keep_phrase_tuple = (
- 'normalized_word_exists', 'class_exists', 'type_exists', 'near'
- )
- special_phrases_importer.words_phrases_to_delete = {
- to_delete_phrase_tuple,
- to_keep_phrase_tuple
- }
- special_phrases_importer.words_phrases_still_exist = {
- to_keep_phrase_tuple
- }
special_phrases_importer.table_phrases_to_delete = {
'place_classtype_testclasstypetable_to_delete'
}
- query_words = 'SELECT word, class, type, operator FROM word;'
query_tables = """
SELECT table_name
FROM information_schema.tables
AND table_name like 'place_classtype_%';
"""
- special_phrases_importer._remove_non_existent_phrases_from_db()
+ special_phrases_importer._remove_non_existent_tables_from_db()
- temp_db_cursor.execute(query_words)
- words_result = temp_db_cursor.fetchall()
temp_db_cursor.execute(query_tables)
tables_result = temp_db_cursor.fetchall()
- assert len(words_result) == 1 and words_result[0] == [
- 'normalized_word_exists', 'class_exists', 'type_exists', 'near'
- ]
assert (len(tables_result) == 1 and
tables_result[0][0] == 'place_classtype_testclasstypetable_to_keep'
)
-def test_import_from_wiki(monkeypatch, temp_db_conn, def_config, special_phrases_importer, placex_table,
- getorcreate_amenity_funcs, getorcreate_amenityoperator_funcs, word_table):
+def test_import_from_wiki(monkeypatch, temp_db_conn, def_config, special_phrases_importer,
+ placex_table, tokenizer_mock):
"""
    Check that the main import_from_wiki() method works as expected.
It should create the place_classtype table, the place_id and centroid indexes,
#what is deleted and what is preserved.
with temp_db_conn.cursor() as temp_db_cursor:
temp_db_cursor.execute("""
- INSERT INTO word VALUES(99999, ' animal shelter', 'animal shelter',
- 'amenity', 'animal_shelter', null, 0, null);
-
- INSERT INTO word VALUES(99999, ' wrong_lookup_token', 'wrong_normalized_word',
- 'wrong_class', 'wrong_type', null, 0, 'near');
-
CREATE TABLE place_classtype_amenity_animal_shelter();
CREATE TABLE place_classtype_wrongclass_wrongtype();""")
monkeypatch.setattr('nominatim.tools.SpecialPhrasesImporter._get_wiki_content', mock_get_wiki_content)
- special_phrases_importer.import_from_wiki(['en'])
+ tokenizer = tokenizer_mock()
+ special_phrases_importer.import_from_wiki(tokenizer, ['en'])
+
+ assert len(tokenizer.analyser_cache['special_phrases']) == 18
class_test = 'aerialway'
type_test = 'zip_line'
assert check_table_exist(temp_db_conn, class_test, type_test)
assert check_placeid_and_centroid_indexes(temp_db_conn, class_test, type_test)
assert check_grant_access(temp_db_conn, def_config.DATABASE_WEBUSER, class_test, type_test)
- assert check_amenities_with_op(temp_db_conn)
- assert check_amenities_without_op(temp_db_conn)
assert check_table_exist(temp_db_conn, 'amenity', 'animal_shelter')
assert not check_table_exist(temp_db_conn, 'wrong_class', 'wrong_type')
    #Format (query, should_return_something_bool) used to easily execute all asserts
queries_tests = set()
- #Used to check that the correct phrase already in the word table before is still there.
- query_correct_word = "SELECT * FROM word WHERE word = 'animal shelter'"
- queries_tests.add((query_correct_word, True))
-
- #Used to check if wrong phrase was deleted from the word table of the database.
- query_wrong_word = "SELECT word FROM word WHERE word = 'wrong_normalized_word'"
- queries_tests.add((query_wrong_word, False))
-
    #Used to check that the correct place_classtype table already in the database is still there.
query_existing_table = """
SELECT table_name
temp_db_conn.index_exists(index_prefix + 'place_id')
)
-def check_amenities_with_op(temp_db_conn):
- """
- Check that the test table for the SQL function getorcreate_amenityoperator()
- contains more than one value (so that the SQL function was call more than one time).
- """
- with temp_db_conn.cursor() as temp_db_cursor:
- temp_db_cursor.execute("SELECT * FROM word WHERE operator != 'no_operator'")
- return len(temp_db_cursor.fetchall()) > 1
-
-def check_amenities_without_op(temp_db_conn):
- """
- Check that the test table for the SQL function getorcreate_amenity()
- contains more than one value (so that the SQL function was call more than one time).
- """
- with temp_db_conn.cursor() as temp_db_cursor:
- temp_db_cursor.execute("SELECT * FROM word WHERE operator = 'no_operator'")
- return len(temp_db_cursor.fetchall()) > 1
-
@pytest.fixture
def special_phrases_importer(temp_db_conn, def_config, temp_phplib_dir_with_migration):
"""
yield Path(phpdir)
@pytest.fixture
-def default_phrases(word_table, temp_db_cursor):
+def default_phrases(temp_db_cursor):
temp_db_cursor.execute("""
- INSERT INTO word VALUES(99999, 'lookup_token', 'normalized_word',
- 'class', 'type', null, 0, 'near');
-
- INSERT INTO word VALUES(99999, 'lookup_token', 'normalized_word_exists',
- 'class_exists', 'type_exists', null, 0, 'near');
-
CREATE TABLE place_classtype_testclasstypetable_to_delete();
CREATE TABLE place_classtype_testclasstypetable_to_keep();""")
-
-@pytest.fixture
-def make_strandard_name_func(temp_db_cursor):
- temp_db_cursor.execute("""
- CREATE OR REPLACE FUNCTION make_standard_name(name TEXT) RETURNS TEXT AS $$
- BEGIN
- RETURN trim(name); --Basically return only the trimed name for the tests
- END;
- $$ LANGUAGE plpgsql IMMUTABLE;""")
-
-@pytest.fixture
-def getorcreate_amenity_funcs(temp_db_cursor, make_strandard_name_func):
- temp_db_cursor.execute("""
- CREATE OR REPLACE FUNCTION getorcreate_amenity(lookup_word TEXT, normalized_word TEXT,
- lookup_class text, lookup_type text)
- RETURNS void as $$
- BEGIN
- INSERT INTO word VALUES(null, lookup_word, normalized_word,
- lookup_class, lookup_type, null, 0, 'no_operator');
- END;
- $$ LANGUAGE plpgsql""")
-
-@pytest.fixture
-def getorcreate_amenityoperator_funcs(temp_db_cursor, make_strandard_name_func):
- temp_db_cursor.execute("""
- CREATE TABLE temp_with_operator(op TEXT);
-
- CREATE OR REPLACE FUNCTION getorcreate_amenityoperator(lookup_word TEXT, normalized_word TEXT,
- lookup_class text, lookup_type text, op text)
- RETURNS void as $$
- BEGIN
- INSERT INTO word VALUES(null, lookup_word, normalized_word,
- lookup_class, lookup_type, null, 0, op);
- END;
- $$ LANGUAGE plpgsql""")
\ No newline at end of file
import pytest
from nominatim.tools import postcodes
+import dummy_tokenizer
+
+@pytest.fixture
+def tokenizer():
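+    # a free-standing dummy; the two None arguments stand in for dsn and
+    # data_dir (names assumed from the factory call), neither of which the
+    # dummy needs here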
+ return dummy_tokenizer.DummyTokenizer(None, None)
@pytest.fixture
def postcode_table(temp_db_with_extensions, temp_db_cursor, table_factory,
postcode TEXT,
geometry GEOMETRY(Geometry, 4326)""")
temp_db_cursor.execute('CREATE SEQUENCE seq_place')
- temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION getorcreate_postcode_id(postcode TEXT)
- RETURNS INTEGER AS $$ BEGIN RETURN 1; END; $$ LANGUAGE plpgsql;
+ temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION token_normalized_postcode(postcode TEXT)
+ RETURNS TEXT AS $$ BEGIN RETURN postcode; END; $$ LANGUAGE plpgsql;
""")
-def test_import_postcodes_empty(dsn, temp_db_cursor, postcode_table, tmp_path):
- postcodes.import_postcodes(dsn, tmp_path)
+def test_import_postcodes_empty(dsn, temp_db_cursor, postcode_table, tmp_path, tokenizer):
+ postcodes.import_postcodes(dsn, tmp_path, tokenizer)
assert temp_db_cursor.table_exists('gb_postcode')
assert temp_db_cursor.table_exists('us_postcode')
assert temp_db_cursor.table_rows('location_postcode') == 0
-def test_import_postcodes_from_placex(dsn, temp_db_cursor, postcode_table, tmp_path):
+def test_import_postcodes_from_placex(dsn, temp_db_cursor, postcode_table, tmp_path, tokenizer):
temp_db_cursor.execute("""
INSERT INTO placex (place_id, country_code, address, geometry)
VALUES (1, 'xx', '"postcode"=>"9486"', 'SRID=4326;POINT(10 12)')
""")
- postcodes.import_postcodes(dsn, tmp_path)
+ postcodes.import_postcodes(dsn, tmp_path, tokenizer)
rows = temp_db_cursor.row_set(""" SELECT postcode, country_code,
ST_X(geometry), ST_Y(geometry)
return tmp_path
@pytest.fixture
-def conn(temp_db_conn, table_factory, monkeypatch):
- monkeypatch.setenv('NOMINATIM_DATABASE_MODULE_PATH', '.')
- table_factory('country_name', 'partition INT', (0, 1, 2))
+def conn(sql_preprocessor, temp_db_conn):
return temp_db_conn
def run_website_script(envdir, config):
config.lib_dir.php = envdir / 'php'
+ config.project_dir = envdir
refresh.setup_website(envdir, config)
proc = subprocess.run(['/usr/bin/env', 'php', '-Cq',
-SELECT getorcreate_amenity(make_standard_name('Aerodrome'), 'aerodrome', 'aeroway', 'aerodrome');
-SELECT getorcreate_amenity(make_standard_name('Aerodromes'), 'aerodromes', 'aeroway', 'aerodrome');
-SELECT getorcreate_amenityoperator(make_standard_name('Aerodrome in'), 'aerodrome in', 'aeroway', 'aerodrome', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Aerodromes in'), 'aerodromes in', 'aeroway', 'aerodrome', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Aerodrome near'), 'aerodrome near', 'aeroway', 'aerodrome', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Aerodromes near'), 'aerodromes near', 'aeroway', 'aerodrome', 'near');
-SELECT getorcreate_amenity(make_standard_name('Airport'), 'airport', 'aeroway', 'aerodrome');
-SELECT getorcreate_amenity(make_standard_name('Airports'), 'airports', 'aeroway', 'aerodrome');
-SELECT getorcreate_amenityoperator(make_standard_name('Airport in'), 'airport in', 'aeroway', 'aerodrome', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Airports in'), 'airports in', 'aeroway', 'aerodrome', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Airport near'), 'airport near', 'aeroway', 'aerodrome', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Airports near'), 'airports near', 'aeroway', 'aerodrome', 'near');
-SELECT getorcreate_amenity(make_standard_name('Bar'), 'bar', 'amenity', 'bar');
-SELECT getorcreate_amenity(make_standard_name('Bars'), 'bars', 'amenity', 'bar');
-SELECT getorcreate_amenityoperator(make_standard_name('Bar in'), 'bar in', 'amenity', 'bar', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Bars in'), 'bars in', 'amenity', 'bar', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Bar near'), 'bar near', 'amenity', 'bar', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Bars near'), 'bars near', 'amenity', 'bar', 'near');
-SELECT getorcreate_amenity(make_standard_name('Bar'), 'bar', 'amenity', 'pub');
-SELECT getorcreate_amenity(make_standard_name('Bars'), 'bars', 'amenity', 'pub');
-SELECT getorcreate_amenityoperator(make_standard_name('Bar in'), 'bar in', 'amenity', 'pub', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Bars in'), 'bars in', 'amenity', 'pub', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Bar near'), 'bar near', 'amenity', 'pub', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Bars near'), 'bars near', 'amenity', 'pub', 'near');
-SELECT getorcreate_amenity(make_standard_name('Food'), 'food', 'amenity', 'restaurant');
-SELECT getorcreate_amenity(make_standard_name('Food'), 'food', 'amenity', 'restaurant');
-SELECT getorcreate_amenityoperator(make_standard_name('Food in'), 'food in', 'amenity', 'restaurant', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Food in'), 'food in', 'amenity', 'restaurant', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Food near'), 'food near', 'amenity', 'restaurant', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Food near'), 'food near', 'amenity', 'restaurant', 'near');
-SELECT getorcreate_amenity(make_standard_name('Pub'), 'pub', 'amenity', 'bar');
-SELECT getorcreate_amenity(make_standard_name('Pubs'), 'pubs', 'amenity', 'bar');
-SELECT getorcreate_amenityoperator(make_standard_name('Pub in'), 'pub in', 'amenity', 'bar', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Pubs in'), 'pubs in', 'amenity', 'bar', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Pub near'), 'pub near', 'amenity', 'bar', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Pubs near'), 'pubs near', 'amenity', 'bar', 'near');
-SELECT getorcreate_amenity(make_standard_name('Pub'), 'pub', 'amenity', 'pub');
-SELECT getorcreate_amenity(make_standard_name('Pubs'), 'pubs', 'amenity', 'pub');
-SELECT getorcreate_amenityoperator(make_standard_name('Pub in'), 'pub in', 'amenity', 'pub', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Pubs in'), 'pubs in', 'amenity', 'pub', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Pub near'), 'pub near', 'amenity', 'pub', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Pubs near'), 'pubs near', 'amenity', 'pub', 'near');
-SELECT getorcreate_amenity(make_standard_name('Restaurant'), 'restaurant', 'amenity', 'restaurant');
-SELECT getorcreate_amenity(make_standard_name('Restaurants'), 'restaurants', 'amenity', 'restaurant');
-SELECT getorcreate_amenityoperator(make_standard_name('Restaurant in'), 'restaurant in', 'amenity', 'restaurant', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Restaurants in'), 'restaurants in', 'amenity', 'restaurant', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Restaurant near'), 'restaurant near', 'amenity', 'restaurant', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Restaurants near'), 'restaurants near', 'amenity', 'restaurant', 'near');
-SELECT getorcreate_amenity(make_standard_name('Mural'), 'mural', 'artwork_type', 'mural');
-SELECT getorcreate_amenity(make_standard_name('Murals'), 'murals', 'artwork_type', 'mural');
-SELECT getorcreate_amenityoperator(make_standard_name('Mural in'), 'mural in', 'artwork_type', 'mural', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Murals in'), 'murals in', 'artwork_type', 'mural', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Mural near'), 'mural near', 'artwork_type', 'mural', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Murals near'), 'murals near', 'artwork_type', 'mural', 'near');
-SELECT getorcreate_amenity(make_standard_name('Sculpture'), 'sculpture', 'artwork_type', 'sculpture');
-SELECT getorcreate_amenity(make_standard_name('Sculptures'), 'sculptures', 'artwork_type', 'sculpture');
-SELECT getorcreate_amenityoperator(make_standard_name('Sculpture in'), 'sculpture in', 'artwork_type', 'sculpture', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Sculptures in'), 'sculptures in', 'artwork_type', 'sculpture', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Sculpture near'), 'sculpture near', 'artwork_type', 'sculpture', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Sculptures near'), 'sculptures near', 'artwork_type', 'sculpture', 'near');
-SELECT getorcreate_amenity(make_standard_name('Statue'), 'statue', 'artwork_type', 'statue');
-SELECT getorcreate_amenity(make_standard_name('Statues'), 'statues', 'artwork_type', 'statue');
-SELECT getorcreate_amenityoperator(make_standard_name('Statue in'), 'statue in', 'artwork_type', 'statue', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Statues in'), 'statues in', 'artwork_type', 'statue', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Statue near'), 'statue near', 'artwork_type', 'statue', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Statues near'), 'statues near', 'artwork_type', 'statue', 'near');
-SELECT getorcreate_amenity(make_standard_name('ATM'), 'atm', 'atm', 'yes');
-SELECT getorcreate_amenity(make_standard_name('ATMs'), 'atms', 'atm', 'yes');
-SELECT getorcreate_amenityoperator(make_standard_name('ATM in'), 'atm in', 'atm', 'yes', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('ATMs in'), 'atms in', 'atm', 'yes', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('ATM near'), 'atm near', 'atm', 'yes', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('ATMs near'), 'atms near', 'atm', 'yes', 'near');
-SELECT getorcreate_amenity(make_standard_name('National Park'), 'national park', 'boundary', 'national_park');
-SELECT getorcreate_amenity(make_standard_name('National Parks'), 'national parks', 'boundary', 'national_park');
-SELECT getorcreate_amenityoperator(make_standard_name('National Park in'), 'national park in', 'boundary', 'national_park', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('National Parks in'), 'national parks in', 'boundary', 'national_park', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('National Park near'), 'national park near', 'boundary', 'national_park', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('National Parks near'), 'national parks near', 'boundary', 'national_park', 'near');
-SELECT getorcreate_amenity(make_standard_name('Changing table'), 'changing table', 'changing_table', 'yes');
-SELECT getorcreate_amenity(make_standard_name('Changing tables'), 'changing tables', 'changing_table', 'yes');
-SELECT getorcreate_amenityoperator(make_standard_name('Changing table in'), 'changing table in', 'changing_table', 'yes', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Changing tables in'), 'changing tables in', 'changing_table', 'yes', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Changing table near'), 'changing table near', 'changing_table', 'yes', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Changing tables near'), 'changing tables near', 'changing_table', 'yes', 'near');
-SELECT getorcreate_amenity(make_standard_name('Roundabout'), 'roundabout', 'junction', 'roundabout');
-SELECT getorcreate_amenity(make_standard_name('Roundabouts'), 'roundabouts', 'junction', 'roundabout');
-SELECT getorcreate_amenityoperator(make_standard_name('Roundabout in'), 'roundabout in', 'junction', 'roundabout', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Roundabouts in'), 'roundabouts in', 'junction', 'roundabout', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Roundabout near'), 'roundabout near', 'junction', 'roundabout', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Roundabouts near'), 'roundabouts near', 'junction', 'roundabout', 'near');
-SELECT getorcreate_amenity(make_standard_name('Plaque'), 'plaque', 'memorial', 'plaque');
-SELECT getorcreate_amenity(make_standard_name('Plaques'), 'plaques', 'memorial', 'plaque');
-SELECT getorcreate_amenityoperator(make_standard_name('Plaque in'), 'plaque in', 'memorial', 'plaque', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Plaques in'), 'plaques in', 'memorial', 'plaque', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Plaque near'), 'plaque near', 'memorial', 'plaque', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Plaques near'), 'plaques near', 'memorial', 'plaque', 'near');
-SELECT getorcreate_amenity(make_standard_name('Statue'), 'statue', 'memorial', 'statue');
-SELECT getorcreate_amenity(make_standard_name('Statues'), 'statues', 'memorial', 'statue');
-SELECT getorcreate_amenityoperator(make_standard_name('Statue in'), 'statue in', 'memorial', 'statue', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Statues in'), 'statues in', 'memorial', 'statue', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Statue near'), 'statue near', 'memorial', 'statue', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Statues near'), 'statues near', 'memorial', 'statue', 'near');
-SELECT getorcreate_amenity(make_standard_name('Stolperstein'), 'stolperstein', 'memorial', 'stolperstein');
-SELECT getorcreate_amenity(make_standard_name('Stolpersteins'), 'stolpersteins', 'memorial', 'stolperstein');
-SELECT getorcreate_amenity(make_standard_name('Stolpersteine'), 'stolpersteine', 'memorial', 'stolperstein');
-SELECT getorcreate_amenityoperator(make_standard_name('Stolperstein in'), 'stolperstein in', 'memorial', 'stolperstein', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Stolpersteins in'), 'stolpersteins in', 'memorial', 'stolperstein', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Stolpersteine in'), 'stolpersteine in', 'memorial', 'stolperstein', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('Stolperstein near'), 'stolperstein near', 'memorial', 'stolperstein', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Stolpersteins near'), 'stolpersteins near', 'memorial', 'stolperstein', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('Stolpersteine near'), 'stolpersteine near', 'memorial', 'stolperstein', 'near');
-SELECT getorcreate_amenity(make_standard_name('War Memorial'), 'war memorial', 'memorial', 'war_memorial');
-SELECT getorcreate_amenity(make_standard_name('War Memorials'), 'war memorials', 'memorial', 'war_memorial');
-SELECT getorcreate_amenityoperator(make_standard_name('War Memorial in'), 'war memorial in', 'memorial', 'war_memorial', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('War Memorials in'), 'war memorials in', 'memorial', 'war_memorial', 'in');
-SELECT getorcreate_amenityoperator(make_standard_name('War Memorial near'), 'war memorial near', 'memorial', 'war_memorial', 'near');
-SELECT getorcreate_amenityoperator(make_standard_name('War Memorials near'), 'war memorials near', 'memorial', 'war_memorial', 'near');
+CREATE OR REPLACE FUNCTION test_getorcreate_amenity(lookup_word TEXT, normalized_word TEXT,
+ lookup_class text, lookup_type text)
+ RETURNS INTEGER
+ AS $$
+DECLARE
+ lookup_token TEXT;
+ return_word_id INTEGER;
+BEGIN
+ lookup_token := ' '||trim(lookup_word);
+ SELECT min(word_id) FROM word
+ WHERE word_token = lookup_token and word = normalized_word
+ and class = lookup_class and type = lookup_type
+ INTO return_word_id;
+ IF return_word_id IS NULL THEN
+ return_word_id := nextval('seq_word');
+ INSERT INTO word VALUES (return_word_id, lookup_token, normalized_word,
+ lookup_class, lookup_type, null, 0);
+ END IF;
+ RETURN return_word_id;
+END;
+$$
+LANGUAGE plpgsql;
+
+
+CREATE OR REPLACE FUNCTION test_getorcreate_amenityoperator(lookup_word TEXT,
+ normalized_word TEXT,
+ lookup_class text,
+ lookup_type text,
+ op text)
+ RETURNS INTEGER
+ AS $$
+DECLARE
+ lookup_token TEXT;
+ return_word_id INTEGER;
+BEGIN
+ lookup_token := ' '||trim(lookup_word);
+ SELECT min(word_id) FROM word
+ WHERE word_token = lookup_token and word = normalized_word
+ and class = lookup_class and type = lookup_type and operator = op
+ INTO return_word_id;
+ IF return_word_id IS NULL THEN
+ return_word_id := nextval('seq_word');
+ INSERT INTO word VALUES (return_word_id, lookup_token, normalized_word,
+ lookup_class, lookup_type, null, 0, op);
+ END IF;
+ RETURN return_word_id;
+END;
+$$
+LANGUAGE plpgsql;
+
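+-- The two helpers above re-create, under a test_ prefix, the former
+-- production functions getorcreate_amenity()/getorcreate_amenityoperator()
+-- used by the SELECTs below, keeping this fixture self-contained now that
+-- word-table maintenance has moved into the tokenizer.
+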
+SELECT test_getorcreate_amenity(make_standard_name('Aerodrome'), 'aerodrome', 'aeroway', 'aerodrome');
+SELECT test_getorcreate_amenity(make_standard_name('Aerodromes'), 'aerodromes', 'aeroway', 'aerodrome');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Aerodrome in'), 'aerodrome in', 'aeroway', 'aerodrome', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Aerodromes in'), 'aerodromes in', 'aeroway', 'aerodrome', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Aerodrome near'), 'aerodrome near', 'aeroway', 'aerodrome', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Aerodromes near'), 'aerodromes near', 'aeroway', 'aerodrome', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Airport'), 'airport', 'aeroway', 'aerodrome');
+SELECT test_getorcreate_amenity(make_standard_name('Airports'), 'airports', 'aeroway', 'aerodrome');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Airport in'), 'airport in', 'aeroway', 'aerodrome', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Airports in'), 'airports in', 'aeroway', 'aerodrome', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Airport near'), 'airport near', 'aeroway', 'aerodrome', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Airports near'), 'airports near', 'aeroway', 'aerodrome', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Bar'), 'bar', 'amenity', 'bar');
+SELECT test_getorcreate_amenity(make_standard_name('Bars'), 'bars', 'amenity', 'bar');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Bar in'), 'bar in', 'amenity', 'bar', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Bars in'), 'bars in', 'amenity', 'bar', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Bar near'), 'bar near', 'amenity', 'bar', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Bars near'), 'bars near', 'amenity', 'bar', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Bar'), 'bar', 'amenity', 'pub');
+SELECT test_getorcreate_amenity(make_standard_name('Bars'), 'bars', 'amenity', 'pub');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Bar in'), 'bar in', 'amenity', 'pub', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Bars in'), 'bars in', 'amenity', 'pub', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Bar near'), 'bar near', 'amenity', 'pub', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Bars near'), 'bars near', 'amenity', 'pub', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Food'), 'food', 'amenity', 'restaurant');
+SELECT test_getorcreate_amenity(make_standard_name('Food'), 'food', 'amenity', 'restaurant');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Food in'), 'food in', 'amenity', 'restaurant', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Food in'), 'food in', 'amenity', 'restaurant', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Food near'), 'food near', 'amenity', 'restaurant', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Food near'), 'food near', 'amenity', 'restaurant', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Pub'), 'pub', 'amenity', 'bar');
+SELECT test_getorcreate_amenity(make_standard_name('Pubs'), 'pubs', 'amenity', 'bar');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Pub in'), 'pub in', 'amenity', 'bar', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Pubs in'), 'pubs in', 'amenity', 'bar', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Pub near'), 'pub near', 'amenity', 'bar', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Pubs near'), 'pubs near', 'amenity', 'bar', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Pub'), 'pub', 'amenity', 'pub');
+SELECT test_getorcreate_amenity(make_standard_name('Pubs'), 'pubs', 'amenity', 'pub');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Pub in'), 'pub in', 'amenity', 'pub', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Pubs in'), 'pubs in', 'amenity', 'pub', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Pub near'), 'pub near', 'amenity', 'pub', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Pubs near'), 'pubs near', 'amenity', 'pub', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Restaurant'), 'restaurant', 'amenity', 'restaurant');
+SELECT test_getorcreate_amenity(make_standard_name('Restaurants'), 'restaurants', 'amenity', 'restaurant');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Restaurant in'), 'restaurant in', 'amenity', 'restaurant', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Restaurants in'), 'restaurants in', 'amenity', 'restaurant', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Restaurant near'), 'restaurant near', 'amenity', 'restaurant', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Restaurants near'), 'restaurants near', 'amenity', 'restaurant', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Mural'), 'mural', 'artwork_type', 'mural');
+SELECT test_getorcreate_amenity(make_standard_name('Murals'), 'murals', 'artwork_type', 'mural');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Mural in'), 'mural in', 'artwork_type', 'mural', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Murals in'), 'murals in', 'artwork_type', 'mural', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Mural near'), 'mural near', 'artwork_type', 'mural', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Murals near'), 'murals near', 'artwork_type', 'mural', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Sculpture'), 'sculpture', 'artwork_type', 'sculpture');
+SELECT test_getorcreate_amenity(make_standard_name('Sculptures'), 'sculptures', 'artwork_type', 'sculpture');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Sculpture in'), 'sculpture in', 'artwork_type', 'sculpture', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Sculptures in'), 'sculptures in', 'artwork_type', 'sculpture', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Sculpture near'), 'sculpture near', 'artwork_type', 'sculpture', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Sculptures near'), 'sculptures near', 'artwork_type', 'sculpture', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Statue'), 'statue', 'artwork_type', 'statue');
+SELECT test_getorcreate_amenity(make_standard_name('Statues'), 'statues', 'artwork_type', 'statue');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Statue in'), 'statue in', 'artwork_type', 'statue', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Statues in'), 'statues in', 'artwork_type', 'statue', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Statue near'), 'statue near', 'artwork_type', 'statue', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Statues near'), 'statues near', 'artwork_type', 'statue', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('ATM'), 'atm', 'atm', 'yes');
+SELECT test_getorcreate_amenity(make_standard_name('ATMs'), 'atms', 'atm', 'yes');
+SELECT test_getorcreate_amenityoperator(make_standard_name('ATM in'), 'atm in', 'atm', 'yes', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('ATMs in'), 'atms in', 'atm', 'yes', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('ATM near'), 'atm near', 'atm', 'yes', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('ATMs near'), 'atms near', 'atm', 'yes', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('National Park'), 'national park', 'boundary', 'national_park');
+SELECT test_getorcreate_amenity(make_standard_name('National Parks'), 'national parks', 'boundary', 'national_park');
+SELECT test_getorcreate_amenityoperator(make_standard_name('National Park in'), 'national park in', 'boundary', 'national_park', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('National Parks in'), 'national parks in', 'boundary', 'national_park', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('National Park near'), 'national park near', 'boundary', 'national_park', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('National Parks near'), 'national parks near', 'boundary', 'national_park', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Changing table'), 'changing table', 'changing_table', 'yes');
+SELECT test_getorcreate_amenity(make_standard_name('Changing tables'), 'changing tables', 'changing_table', 'yes');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Changing table in'), 'changing table in', 'changing_table', 'yes', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Changing tables in'), 'changing tables in', 'changing_table', 'yes', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Changing table near'), 'changing table near', 'changing_table', 'yes', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Changing tables near'), 'changing tables near', 'changing_table', 'yes', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Roundabout'), 'roundabout', 'junction', 'roundabout');
+SELECT test_getorcreate_amenity(make_standard_name('Roundabouts'), 'roundabouts', 'junction', 'roundabout');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Roundabout in'), 'roundabout in', 'junction', 'roundabout', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Roundabouts in'), 'roundabouts in', 'junction', 'roundabout', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Roundabout near'), 'roundabout near', 'junction', 'roundabout', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Roundabouts near'), 'roundabouts near', 'junction', 'roundabout', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Plaque'), 'plaque', 'memorial', 'plaque');
+SELECT test_getorcreate_amenity(make_standard_name('Plaques'), 'plaques', 'memorial', 'plaque');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Plaque in'), 'plaque in', 'memorial', 'plaque', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Plaques in'), 'plaques in', 'memorial', 'plaque', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Plaque near'), 'plaque near', 'memorial', 'plaque', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Plaques near'), 'plaques near', 'memorial', 'plaque', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Statue'), 'statue', 'memorial', 'statue');
+SELECT test_getorcreate_amenity(make_standard_name('Statues'), 'statues', 'memorial', 'statue');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Statue in'), 'statue in', 'memorial', 'statue', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Statues in'), 'statues in', 'memorial', 'statue', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Statue near'), 'statue near', 'memorial', 'statue', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Statues near'), 'statues near', 'memorial', 'statue', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('Stolperstein'), 'stolperstein', 'memorial', 'stolperstein');
+SELECT test_getorcreate_amenity(make_standard_name('Stolpersteins'), 'stolpersteins', 'memorial', 'stolperstein');
+SELECT test_getorcreate_amenity(make_standard_name('Stolpersteine'), 'stolpersteine', 'memorial', 'stolperstein');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Stolperstein in'), 'stolperstein in', 'memorial', 'stolperstein', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Stolpersteins in'), 'stolpersteins in', 'memorial', 'stolperstein', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Stolpersteine in'), 'stolpersteine in', 'memorial', 'stolperstein', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Stolperstein near'), 'stolperstein near', 'memorial', 'stolperstein', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Stolpersteins near'), 'stolpersteins near', 'memorial', 'stolperstein', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('Stolpersteine near'), 'stolpersteine near', 'memorial', 'stolperstein', 'near');
+SELECT test_getorcreate_amenity(make_standard_name('War Memorial'), 'war memorial', 'memorial', 'war_memorial');
+SELECT test_getorcreate_amenity(make_standard_name('War Memorials'), 'war memorials', 'memorial', 'war_memorial');
+SELECT test_getorcreate_amenityoperator(make_standard_name('War Memorial in'), 'war memorial in', 'memorial', 'war_memorial', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('War Memorials in'), 'war memorials in', 'memorial', 'war_memorial', 'in');
+SELECT test_getorcreate_amenityoperator(make_standard_name('War Memorial near'), 'war memorial near', 'memorial', 'war_memorial', 'near');
+SELECT test_getorcreate_amenityoperator(make_standard_name('War Memorials near'), 'war memorials near', 'memorial', 'war_memorial', 'near');
CREATE INDEX idx_placex_classtype ON placex (class, type);
CREATE TABLE place_classtype_aeroway_aerodrome AS SELECT place_id AS place_id, st_centroid(geometry) AS centroid FROM placex WHERE class = 'aeroway' AND type = 'aerodrome';
CREATE INDEX idx_place_classtype_aeroway_aerodrome_centroid ON place_classtype_aeroway_aerodrome USING GIST (centroid);
CREATE INDEX idx_place_classtype_aeroway_aerodrome_place_id ON place_classtype_aeroway_aerodrome USING btree(place_id);
CREATE INDEX idx_place_classtype_memorial_war_memorial_centroid ON place_classtype_memorial_war_memorial USING GIST (centroid);
CREATE INDEX idx_place_classtype_memorial_war_memorial_place_id ON place_classtype_memorial_war_memorial USING btree(place_id);
GRANT SELECT ON place_classtype_memorial_war_memorial TO "www-data";
-DROP INDEX idx_placex_classtype;
\ No newline at end of file
+DROP INDEX idx_placex_classtype;
+
+DROP FUNCTION test_getorcreate_amenity;
+DROP FUNCTION test_getorcreate_amenityoperator;