From b99a043c78f756f58b8e106ba60e999ea9538bb3 Mon Sep 17 00:00:00 2001 From: Brian Quinion Date: Thu, 22 Mar 2012 00:33:28 +0000 Subject: [PATCH 1/1] Use GB postcode table as definitive source. re-sort by presence of search word in output address and other misc very minor changes not committed from MQ servers --- lib/lib.php | 50 +--------------- lib/log.php | 6 +- lib/template/details-html.php | 9 +-- nominatim/import.c | 3 + nominatim/index.c | 2 +- settings/settings.php | 4 +- sql/functions.sql | 105 +++++++++++++++++++--------------- sql/tables.sql | 1 + utils/setup.php | 2 +- utils/specialphrases.php | 3 + website/search.php | 29 ++++++++-- 11 files changed, 106 insertions(+), 108 deletions(-) diff --git a/lib/lib.php b/lib/lib.php index a7c805f7..069f3e25 100644 --- a/lib/lib.php +++ b/lib/lib.php @@ -40,9 +40,9 @@ function byImportance($a, $b) { -/* if ($a['importance'] != $b['importance']) return ($a['importance'] > $b['importance']?-1:1); +/* if ($a['aPointPolygon']['numfeatures'] != $b['aPointPolygon']['numfeatures']) return ($a['aPointPolygon']['numfeatures'] > $b['aPointPolygon']['numfeatures']?-1:1); if ($a['aPointPolygon']['area'] != $b['aPointPolygon']['area']) @@ -173,60 +173,12 @@ exit; } - if (sizeof($aNearPostcodes)) { return array(array('lat' => $aNearPostcodes[0]['lat'], 'lon' => $aNearPostcodes[0]['lon'], 'radius' => 0.005)); } return false; - - /* partial search disabled because it sequentially scans placex - - $sSQL = 'select substring(upper(postcode) from \'^[A-Z][A-Z]?[0-9][0-9A-Z]? [0-9]([A-Z][A-Z])$\'),ST_X(ST_Centroid(geometry)) as lon,ST_Y(ST_Centroid(geometry)) as lat from placex where country_code::text = \'gb\'::text AND substring(postcode from \'^([A-Z][A-Z]?[0-9][0-9A-Z]? [0-9])[A-Z][A-Z]$\') = \''.$sPostcodeSector.'\' and class=\'place\' and type=\'postcode\' '; - $sSQL .= ' union '; - $sSQL .= 'select substring(upper(postcode) from \'^[A-Z][A-Z]?[0-9][0-9A-Z]? 
[0-9]([A-Z][A-Z])$\'),ST_X(ST_Centroid(geometry)) as lon,ST_Y(ST_Centroid(geometry)) as lat from gb_postcode where substring(postcode from \'^([A-Z][A-Z]?[0-9][0-9A-Z]? [0-9])[A-Z][A-Z]$\') = \''.$sPostcodeSector.'\''; - $aNearPostcodes = $oDB->getAll($sSQL); - if (PEAR::IsError($aNearPostcodes)) - { - var_dump($sSQL, $aNearPostcodes); - exit; - } - - if (!sizeof($aNearPostcodes)) - { - return false; - } - - $fTotalLat = 0; - $fTotalLon = 0; - $fTotalFac = 0; - foreach($aNearPostcodes as $aPostcode) - { - $iDiff = gbPostcodeAlphaDifference($sPostcodeEnd, $aPostcode['substring'])*2 + 1; - if ($iDiff == 0) - $fFac = 1; - else - $fFac = 1/($iDiff*$iDiff); - - $fTotalFac += $fFac; - $fTotalLat += $aPostcode['lat'] * $fFac; - $fTotalLon += $aPostcode['lon'] * $fFac; - } - if ($fTotalFac) - { - $fLat = $fTotalLat / $fTotalFac; - $fLon = $fTotalLon / $fTotalFac; - $fRadius = min(0.1 / $fTotalFac, 0.02); - return array(array('lat' => $fLat, 'lon' => $fLon, 'radius' => $fRadius)); - } - return false; - */ - /* - $fTotalFac is a suprisingly good indicator of accuracy - $iZoom = 18 + round(log($fTotalFac,32)); - $iZoom = max(13,min(18,$iZoom)); - */ } function usPostcodeCalculate($sPostcode, &$oDB) diff --git a/lib/log.php b/lib/log.php index 93ad8f95..e3126c0e 100644 --- a/lib/log.php +++ b/lib/log.php @@ -26,7 +26,8 @@ $oDB->query($sSQL); } - if (CONST_Log_File && CONST_Log_File_ReverseLog != '') { + if (CONST_Log_File && CONST_Log_File_ReverseLog != '') + { if ($sType == 'reverse') { $aStartTime = explode('.',$hLog[0]); @@ -68,7 +69,8 @@ $oDB->query($sSQL); } - if (CONST_Log_File && CONST_Log_File_SearchLog != '') { + if (CONST_Log_File && CONST_Log_File_SearchLog != '') + { $aStartTime = explode('.',$hLog[0]); file_put_contents(CONST_Log_File_SearchLog, $aStartTime[0].','.$aStartTime[1].','. 
diff --git a/lib/template/details-html.php b/lib/template/details-html.php index 9124d594..aa27664b 100644 --- a/lib/template/details-html.php +++ b/lib/template/details-html.php @@ -74,11 +74,12 @@ body { var proj_map = map.getProjectionObject(); var latlon; var linearRing = new OpenLayers.Geometry.LinearRing(pointList).transform(proj_EPSG4326, proj_map);; diff --git a/nominatim/import.c b/nominatim/import.c index 95c742ff..b9341608 100644 --- a/nominatim/import.c +++ b/nominatim/import.c @@ -515,15 +515,18 @@ void EndElement(xmlTextReaderPtr reader, const xmlChar *name) // insert into place_address paramValues[0] = (const char *)place_id; paramValues[1] = (const char *)featureAddress[i].distance; + if (paramValues[1] == NULL || strlen(paramValues[1]) == 0) paramValues[1] = "0"; paramValues[2] = (const char *)featureAddress[i].type; paramValues[3] = (const char *)featureAddress[i].id; paramValues[4] = (const char *)featureAddress[i].key; paramValues[5] = (const char *)featureAddress[i].value; paramValues[6] = (const char *)featureAddress[i].isAddress; + if (verbose) fprintf(stderr, "placex_insert: %s %s\n", paramValues[2], paramValues[3]); res = PQexecPrepared(conn, "place_addressline_insert", 7, paramValues, NULL, NULL, 0); if (PQresultStatus(res) != PGRES_COMMAND_OK) { fprintf(stderr, "place_addressline_insert: INSERT failed: %s", PQerrorMessage(conn)); + fprintf(stderr, "(%s,%s,%s,%s,%s,%s,%s)",paramValues[0],paramValues[1],paramValues[2],paramValues[3],paramValues[4],paramValues[5],paramValues[6]); PQclear(res); exit(EXIT_FAILURE); } diff --git a/nominatim/index.c b/nominatim/index.c index 3f161cc3..368fd8a5 100644 --- a/nominatim/index.c +++ b/nominatim/index.c @@ -270,7 +270,7 @@ void nominatim_index(int rank_min, int rank_max, int num_threads, const char *co usleep(1000); // Aim for one update per second - if (sleepcount++ > 2000) + if (sleepcount++ > 500) { rankPerSecond = ((float)rankCountTuples + (float)count) / MAX(difftime(time(0), rankStartTime),1); 
fprintf(stderr, " Done %i in %i @ %f per second - Rank %i ETA (seconds): %f\n", (rankCountTuples + count), (int)(difftime(time(0), rankStartTime)), rankPerSecond, rank, ((float)(rankTotalTuples - (rankCountTuples + count)))/rankPerSecond); diff --git a/settings/settings.php b/settings/settings.php index 591af1c6..66d7bed6 100644 --- a/settings/settings.php +++ b/settings/settings.php @@ -10,8 +10,8 @@ @define('CONST_Postgresql_Version', '9.1'); @define('CONST_Path_Postgresql_Contrib', '/usr/share/postgresql/'.CONST_Postgresql_Version.'/contrib'); @define('CONST_Path_Postgresql_Postgis', CONST_Path_Postgresql_Contrib.'/postgis-1.5'); - @define('CONST_Osm2pgsql_Binary', CONST_BasePath.'/osm2pgsql/osm2pgsql'); - @define('CONST_Osmosis_Binary', CONST_BasePath.'/osmosis-0.38/bin/osmosis'); + @define('CONST_Osm2pgsql_Binary', CONST_BasePath.'/osm2pgsql/osm2pgsql'); + @define('CONST_Osmosis_Binary', CONST_BasePath.'/osmosis-0.38/bin/osmosis'); // Website settings @define('CONST_ClosedForIndexing', false); diff --git a/sql/functions.sql b/sql/functions.sql index 6ad13892..9a613152 100644 --- a/sql/functions.sql +++ b/sql/functions.sql @@ -940,7 +940,50 @@ BEGIN NEW.rank_address := NEW.rank_search; -- By doing in postgres we have the country available to us - currently only used for postcode - IF NEW.class = 'place' THEN + IF NEW.class in ('place','boundary') AND NEW.type in ('postcode','postal_code') THEN + + NEW.name := 'ref'=>NEW.postcode; + + IF NEW.country_code = 'gb' THEN + + IF NEW.postcode ~ '^([A-Z][A-Z]?[0-9][0-9A-Z]? [0-9][A-Z][A-Z])$' THEN + NEW.rank_search := 25; + NEW.rank_address := 5; + ELSEIF NEW.postcode ~ '^([A-Z][A-Z]?[0-9][0-9A-Z]? 
[0-9])$' THEN + NEW.rank_search := 23; + NEW.rank_address := 5; + ELSEIF NEW.postcode ~ '^([A-Z][A-Z]?[0-9][0-9A-Z])$' THEN + NEW.rank_search := 21; + NEW.rank_address := 5; + END IF; + + ELSEIF NEW.country_code = 'de' THEN + + IF NEW.postcode ~ '^([0-9]{5})$' THEN + NEW.rank_search := 21; + NEW.rank_address := 11; + END IF; + + ELSE + -- Guess at the postcode format and coverage (!) + IF upper(NEW.postcode) ~ '^[A-Z0-9]{1,5}$' THEN -- Probably too short to be very local + NEW.rank_search := 21; + NEW.rank_address := 11; + ELSE + -- Does it look splitable into and area and local code? + postcode := substring(upper(NEW.postcode) from '^([- :A-Z0-9]+)([- :][A-Z0-9]+)$'); + + IF postcode IS NOT NULL THEN + NEW.rank_search := 25; + NEW.rank_address := 11; + ELSEIF NEW.postcode ~ '^[- :A-Z0-9]{6,}$' THEN + NEW.rank_search := 21; + NEW.rank_address := 11; + END IF; + END IF; + END IF; + + ELSEIF NEW.class = 'place' THEN IF NEW.type in ('continent') THEN NEW.rank_search := 2; NEW.rank_address := NEW.rank_search; @@ -992,49 +1035,6 @@ BEGIN ELSEIF NEW.type in ('hall_of_residence','neighbourhood','housing_estate','nature_reserve') THEN NEW.rank_search := 22; NEW.rank_address := 22; - ELSEIF NEW.type in ('postcode') THEN - - NEW.name := 'ref'=>NEW.postcode; - - IF NEW.country_code = 'gb' THEN - - IF NEW.postcode ~ '^([A-Z][A-Z]?[0-9][0-9A-Z]? [0-9][A-Z][A-Z])$' THEN - NEW.rank_search := 25; - NEW.rank_address := 5; - ELSEIF NEW.postcode ~ '^([A-Z][A-Z]?[0-9][0-9A-Z]? [0-9])$' THEN - NEW.rank_search := 23; - NEW.rank_address := 5; - ELSEIF NEW.postcode ~ '^([A-Z][A-Z]?[0-9][0-9A-Z])$' THEN - NEW.rank_search := 21; - NEW.rank_address := 5; - END IF; - - ELSEIF NEW.country_code = 'de' THEN - - IF NEW.postcode ~ '^([0-9]{5})$' THEN - NEW.rank_search := 21; - NEW.rank_address := 11; - END IF; - - ELSE - -- Guess at the postcode format and coverage (!) 
- IF upper(NEW.postcode) ~ '^[A-Z0-9]{1,5}$' THEN -- Probably too short to be very local - NEW.rank_search := 21; - NEW.rank_address := 11; - ELSE - -- Does it look splitable into and area and local code? - postcode := substring(upper(NEW.postcode) from '^([- :A-Z0-9]+)([- :][A-Z0-9]+)$'); - - IF postcode IS NOT NULL THEN - NEW.rank_search := 25; - NEW.rank_address := 11; - ELSEIF NEW.postcode ~ '^[- :A-Z0-9]{6,}$' THEN - NEW.rank_search := 21; - NEW.rank_address := 11; - END IF; - END IF; - END IF; - ELSEIF NEW.type in ('airport','street') THEN NEW.rank_search := 26; NEW.rank_address := NEW.rank_search; @@ -1115,7 +1115,8 @@ BEGIN IF st_area(NEW.geometry) < 1 THEN -- mark items within the geometry for re-indexing -- RAISE WARNING 'placex poly insert: % % % %',NEW.osm_type,NEW.osm_id,NEW.class,NEW.type; --- work around bug in postgis + + -- work around bug in postgis, this may have been fixed in 2.0.0 (see http://trac.osgeo.org/postgis/ticket/547) update placex set indexed_status = 2 where (ST_Contains(NEW.geometry, placex.geometry) OR ST_Intersects(NEW.geometry, placex.geometry)) AND rank_search > NEW.rank_search and indexed_status = 0 and ST_geometrytype(placex.geometry) = 'ST_Point'; update placex set indexed_status = 2 where (ST_Contains(NEW.geometry, placex.geometry) OR ST_Intersects(NEW.geometry, placex.geometry)) @@ -1203,6 +1204,7 @@ DECLARE tagpairid INTEGER; + default_language TEXT; name_vector INTEGER[]; nameaddress_vector INTEGER[]; @@ -1256,6 +1258,19 @@ BEGIN -- cheaper but less acurate place_centroid := ST_Centroid(NEW.geometry); + -- Thought this wasn't needed but when we add new languages to the country_name table + -- we need to update the existing names + IF NEW.name is not null AND array_upper(%#NEW.name,1) > 1 THEN + default_language := get_country_language_code(NEW.country_code); + IF default_language IS NOT NULL THEN + IF NEW.name ? 'name' AND NOT NEW.name ? 
('name:'||default_language) THEN + NEW.name := NEW.name || (('name:'||default_language) => (NEW.name -> 'name')); + ELSEIF NEW.name ? ('name:'||default_language) AND NOT NEW.name ? 'name' THEN + NEW.name := NEW.name || ('name' => (NEW.name -> 'name:'||default_language)); + END IF; + END IF; + END IF; + -- Initialise the name vector using our name name_vector := make_keywords(NEW.name); nameaddress_vector := '{}'::int[]; diff --git a/sql/tables.sql b/sql/tables.sql index 19d8c084..795b9277 100644 --- a/sql/tables.sql +++ b/sql/tables.sql @@ -212,6 +212,7 @@ CREATE TABLE placex ( geometry_sector INTEGER ); SELECT AddGeometryColumn('placex', 'geometry', 4326, 'GEOMETRY', 2); +SELECT AddGeometryColumn('placex', 'centroid', 4326, 'GEOMETRY', 2); CREATE UNIQUE INDEX idx_place_id ON placex USING BTREE (place_id); CREATE INDEX idx_placex_osmid ON placex USING BTREE (osm_type, osm_id); CREATE INDEX idx_placex_rank_search ON placex USING BTREE (rank_search, geometry_sector); diff --git a/utils/setup.php b/utils/setup.php index cf465b15..94888e44 100755 --- a/utils/setup.php +++ b/utils/setup.php @@ -346,7 +346,7 @@ if (!pg_query($oDB->connection, $sSQL)) fail(pg_last_error($oDB->connection)); } - if ($aCMDResult['osmosis-init'] && isset($aCMDResult['osmosis-init-date'])) + if (($aCMDResult['osmosis-init'] || $aCMDResult['all']) && isset($aCMDResult['osmosis-init-date'])) { $bDidSomething = true; diff --git a/utils/specialphrases.php b/utils/specialphrases.php index 1c3eff5b..81d240a6 100755 --- a/utils/specialphrases.php +++ b/utils/specialphrases.php @@ -107,6 +107,9 @@ foreach($aPairs as $aPair) { + if ($aPair[0] == 'yes') continue; + if ($aPair[1] == 'yes') continue; + if ($aPair[0] == 'highway') continue; if ($aPair[1] == 'highway') continue; echo "create table place_classtype_".pg_escape_string($aPair[0])."_".pg_escape_string($aPair[1])." 
as "; diff --git a/website/search.php b/website/search.php index d9adf6b8..64cec021 100755 --- a/website/search.php +++ b/website/search.php @@ -247,6 +247,7 @@ $sToken = $oDB->getOne("select make_standard_name('".$aSpecialTerm[1]."') as string"); $sSQL = 'select * from (select word_id,word_token, word, class, type, location, country_code, operator'; $sSQL .= ' from word where word_token in (\' '.$sToken.'\')) as x where (class is not null and class not in (\'place\',\'highway\')) or country_code is not null'; + if (CONST_Debug) var_Dump($sSQL); $aSearchWords = $oDB->getAll($sSQL); $aNewSearches = array(); foreach($aSearches as $aSearch) @@ -374,7 +375,8 @@ // Try and calculate GB postcodes we might be missing foreach($aTokens as $sToken) { - if (!isset($aValidTokens[$sToken]) && !isset($aValidTokens[' '.$sToken]) && preg_match('/^([A-Z][A-Z]?[0-9][0-9A-Z]? ?[0-9])([A-Z][A-Z])$/', strtoupper(trim($sToken)), $aData)) + // Source of gb postcodes is now definitive - always use + if (preg_match('/^([A-Z][A-Z]?[0-9][0-9A-Z]? ?[0-9])([A-Z][A-Z])$/', strtoupper(trim($sToken)), $aData)) { if (substr($aData[1],-2,1) != ' ') { @@ -416,7 +418,6 @@ Score how good the search is so they can be ordered */ - foreach($aPhrases as $iPhrase => $sPhrase) { $aNewPhraseSearches = array(); @@ -503,7 +504,7 @@ if ($aSearch['iSearchRank'] < $iMaxRank) $aNewWordsetSearches[] = $aSearch; } } - else + elseif (isset($aSearchTerm['word_id']) && $aSearchTerm['word_id']) { if (sizeof($aSearch['aName'])) { @@ -530,6 +531,8 @@ // Allow searching for a word - but at extra cost foreach($aValidTokens[$sToken] as $aSearchTerm) { + if (isset($aSearchTerm['word_id']) && $aSearchTerm['word_id']) + { //var_Dump('
',$aSearch['aName']); if (sizeof($aCurrentSearch['aName']) && strlen($sToken) >= 4) @@ -549,6 +552,7 @@ $aSearch['iNamePhrase'] = $iPhrase; if ($aSearch['iSearchRank'] < $iMaxRank) $aNewWordsetSearches[] = $aSearch; } + } } } else @@ -745,6 +749,9 @@ // First we need a position, either aName or fLat or both $aTerms = array(); $aOrder = array(); + + // TODO: filter out the pointless search terms (2 letter name tokens and less) + // they might be right - but they are just too darned expensive to run if (sizeof($aSearch['aName'])) $aTerms[] = "name_vector @> ARRAY[".join($aSearch['aName'],",")."]"; if (sizeof($aSearch['aAddress']) && $aSearch['aName'] != $aSearch['aAddress']) $aTerms[] = "nameaddress_vector @> ARRAY[".join($aSearch['aAddress'],",")."]"; if ($aSearch['sCountryCode']) $aTerms[] = "country_code = '".pg_escape_string($aSearch['sCountryCode'])."'"; @@ -1132,6 +1139,11 @@ //var_Dump($aSearchResults); //exit; $aClassType = getClassTypesWithImportance(); + $aRecheckWords = preg_split('/\b/',$sQuery); + foreach($aRecheckWords as $i => $sWord) + { + if (!$sWord) unset($aRecheckWords[$i]); + } foreach($aSearchResults as $iResNum => $aResult) { if (CONST_Search_AreaPolygons || true) @@ -1249,6 +1261,16 @@ //exit; } + // Adjust importance for the number of exact string matches in the result + $aResult['importance'] = max(0.001,$aResult['importance']); + $iCountWords = 0; + $sAddress = $aResult['langaddress']; + foreach($aRecheckWords as $i => $sWord) + { + if (stripos($sAddress, $sWord)!==false) $iCountWords++; + } + $aResult['importance'] = $aResult['importance'] + $iCountWords; + //if (CONST_Debug) var_dump($aResult['class'].':'.$aResult['type'].':'.$aResult['admin_level']); /* if (isset($aClassType[$aResult['class'].':'.$aResult['type'].':'.$aResult['admin_level']]['importance']) @@ -1270,7 +1292,6 @@ $aResult['foundorder'] = $iResNum; $aSearchResults[$iResNum] = $aResult; } - uasort($aSearchResults, 'byImportance'); //var_dump($aSearchResults);exit; -- 2.39.5