From 1c0fa81d88dd4d640e9582390bcadd6285acb72b Mon Sep 17 00:00:00 2001 From: Brian Quinion Date: Mon, 30 Apr 2012 01:05:06 +0100 Subject: [PATCH] calculate importance from wikipedia --- sql/functions.sql | 67 +++++++++++++++++++++++++++++++++++++++++++++-- sql/tables.sql | 26 ++++++++++++++++++ utils/setup.php | 64 +++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 152 insertions(+), 5 deletions(-) diff --git a/sql/functions.sql b/sql/functions.sql index 5f7fc310..02570811 100644 --- a/sql/functions.sql +++ b/sql/functions.sql @@ -1225,6 +1225,10 @@ DECLARE name_vector INTEGER[]; nameaddress_vector INTEGER[]; + wiki_article TEXT; + wiki_article_title TEXT; + wiki_article_language TEXT; + result BOOLEAN; BEGIN @@ -1308,6 +1312,34 @@ BEGIN address_havelevel[i] := false; END LOOP; + NEW.importance := null; + -- WARNING: see duplicate of code below (yuk!) + IF NEW.extratags?'wikipedia' THEN + wiki_article := replace(regexp_replace(NEW.extratags->'wikipedia',E'(.*?)([a-z]+).wikipedia.org/wiki/',E'\\2:'),' ','_'); + wiki_article_title := split_part(wiki_article, ':', 2); + IF wiki_article_title IS NULL OR wiki_article_title = '' THEN + wiki_article_title := wiki_article; + wiki_article_language := 'en'; + ELSE + wiki_article_language := lower(split_part(wiki_article, ':', 1)); + END IF; +--RAISE WARNING '% %', wiki_article_language, wiki_article_title; + + select wikipedia_article.importance,wikipedia_article.language||':'||wikipedia_article.title + from wikipedia_article + where language = wiki_article_language and + (title = wiki_article_title OR title = decode_url_part(wiki_article_title) OR title = replace(decode_url_part(wiki_article_title),E'\\','')) + UNION ALL + select wikipedia_article.importance,wikipedia_article.language||':'||wikipedia_article.title + from wikipedia_redirect join wikipedia_article on (wikipedia_redirect.language = wikipedia_article.language and wikipedia_redirect.to_title = wikipedia_article.title) + where wikipedia_redirect.language = wiki_article_language and + (from_title = wiki_article_title OR from_title = decode_url_part(wiki_article_title) OR from_title = replace(decode_url_part(wiki_article_title),E'\\','')) + order by importance asc limit 1 INTO NEW.importance,NEW.wikipedia; + + ELSE + select importance,language||':'||title from wikipedia_article where osm_type = NEW.osm_type and osm_id = NEW.osm_id order by importance asc limit 1 INTO NEW.importance,NEW.wikipedia; + END IF; + --RAISE WARNING '% %', NEW.place_id, NEW.rank_search; -- For low level elements we inherit from our parent road @@ -1535,7 +1567,7 @@ BEGIN END IF; -- not found one yet? how about doing a name search - IF NEW.centroid IS NULL AND NEW.name->'name' is not null and make_standard_name(NEW.name->'name') != '' THEN + IF NEW.centroid IS NULL AND (NEW.name->'name') is not null and make_standard_name(NEW.name->'name') != '' THEN FOR linkedPlacex IN select placex.* from placex WHERE make_standard_name(name->'name') = make_standard_name(NEW.name->'name') @@ -1567,6 +1599,31 @@ BEGIN place_centroid := NEW.centroid; END IF; + -- Did we gain a wikipedia tag in the process? then we need to recalculate our importance + -- WARNING: duplicate of code above (yuk!) + IF NEW.importance is null AND NEW.extratags?'wikipedia' THEN + wiki_article := replace(regexp_replace(NEW.extratags->'wikipedia',E'(.*?)([a-z]+).wikipedia.org/wiki/',E'\\2:'),' ','_'); + wiki_article_title := split_part(wiki_article, ':', 2); + IF wiki_article_title IS NULL OR wiki_article_title = '' THEN + wiki_article_title := wiki_article; + wiki_article_language := 'en'; + ELSE + wiki_article_language := lower(split_part(wiki_article, ':', 1)); + END IF; + + select wikipedia_article.importance,wikipedia_article.language||':'||wikipedia_article.title + from wikipedia_article + where language = wiki_article_language and + (title = wiki_article_title OR title = decode_url_part(wiki_article_title) OR title = replace(decode_url_part(wiki_article_title),E'\\','')) + UNION ALL + select wikipedia_article.importance,wikipedia_article.language||':'||wikipedia_article.title + from wikipedia_redirect join wikipedia_article on (wikipedia_redirect.language = wikipedia_article.language and wikipedia_redirect.to_title = wikipedia_article.title) + where wikipedia_redirect.language = wiki_article_language and + (from_title = wiki_article_title OR from_title = decode_url_part(wiki_article_title) OR from_title = replace(decode_url_part(wiki_article_title),E'\\','')) + order by importance asc limit 1 INTO NEW.importance,NEW.wikipedia; + + END IF; + END IF; NEW.parent_place_id = 0; @@ -2706,4 +2763,10 @@ END; $$ LANGUAGE plpgsql; - +-- See: http://stackoverflow.com/questions/6410088/how-can-i-mimic-the-php-urldecode-function-in-postgresql +CREATE OR REPLACE FUNCTION decode_url_part(p varchar) RETURNS varchar + AS $$ +SELECT convert_from(CAST(E'\\x' || string_agg(CASE WHEN length(r.m[1]) = 1 THEN encode(convert_to(r.m[1], 'SQL_ASCII'), 'hex') ELSE substring(r.m[1] from 2 for 2) END, '') AS bytea), 'UTF8') +FROM regexp_matches($1, '%[0-9a-f][0-9a-f]|.', 'gi') AS r(m); +$$ +LANGUAGE SQL IMMUTABLE STRICT; diff --git a/sql/tables.sql b/sql/tables.sql index 9baa2da0..fd01663e 100644 --- a/sql/tables.sql +++ b/sql/tables.sql @@ -210,6 +210,7 @@ CREATE TABLE placex ( importance FLOAT, indexed_status INTEGER, indexed_date TIMESTAMP, + wikipedia TEXT, -- calculated wikipedia article name (language:title) geometry_sector INTEGER ); SELECT AddGeometryColumn('placex', 'geometry', 4326, 'GEOMETRY', 2); @@ -301,3 +302,28 @@ CREATE INDEX idx_import_polygon_delete_osmid ON import_polygon_delete USING BTRE drop sequence file; CREATE SEQUENCE file start 1; + +-- null table so it won't error +-- deliberately no drop - importing the table is expensive and static, if it is already there better to avoid removing it +CREATE TABLE wikipedia_article ( + language text NOT NULL, + title text NOT NULL, + langcount integer, + othercount integer, + totalcount integer, + lat double precision, + lon double precision, + importance double precision, + osm_type character(1), + osm_id bigint +); +ALTER TABLE ONLY wikipedia_article ADD CONSTRAINT wikipedia_article_pkey PRIMARY KEY (language, title); +CREATE INDEX idx_wikipedia_article_osm_id ON wikipedia_article USING btree (osm_type, osm_id); + +CREATE TABLE wikipedia_redirect ( + language text, + from_title text, + to_title text +); +ALTER TABLE ONLY wikipedia_redirect ADD CONSTRAINT wikipedia_redirect_pkey PRIMARY KEY (language, from_title); + diff --git a/utils/setup.php b/utils/setup.php index 6415ea12..b9ee9a89 100755 --- a/utils/setup.php +++ b/utils/setup.php @@ -23,6 +23,7 @@ array('create-minimal-tables', '', 0, 1, 0, 0, 'bool', 'Create minimal main tables'), array('create-tables', '', 0, 1, 0, 0, 'bool', 'Create main tables'), array('create-partitions', '', 0, 1, 0, 0, 'bool', 'Create required partition tables and triggers'), + array('import-wikipedia-articles', '', 0, 1, 0, 0, 'bool', 'Import wikipedia article dump'), array('load-data', '', 0, 1, 0, 0, 'bool', 'Copy data to live tables from import table'), array('import-tiger-data', '', 0, 1, 0, 0, 'bool', 'Import tiger data (not included in \'all\')'), array('calculate-postcodes', '', 0, 1, 0, 0, 'bool', 'Calculate postcode centroids'), @@ -207,6 +208,34 @@ pgsqlRunScript($sTemplate); } + if ($aCMDResult['import-wikipedia-articles'] || $aCMDResult['all']) + { + $bDidSomething = true; + $sWikiArticlesFile = CONST_BasePath.'/data/wikipedia_article.sql.bin'; + $sWikiRedirectsFile = CONST_BasePath.'/data/wikipedia_redirect.sql.bin'; + if (file_exists($sWikiArticlesFile)) + { + echo "Importing wikipedia articles..."; + pgsqlRunRestoreData($sWikiArticlesFile); + echo "...done\n"; + } + else + { + echo "WARNING: wikipedia article dump file not found - places will have default importance\n"; + } + if (file_exists($sWikiRedirectsFile)) + { + echo "Importing wikipedia redirects..."; + pgsqlRunRestoreData($sWikiRedirectsFile); + echo "...done\n"; + } + else + { + echo "WARNING: wikipedia redirect dump file not found - some place importance values may be missing\n"; + } + } + + if ($aCMDResult['load-data'] || $aCMDResult['all']) { echo "Load Data\n"; @@ -326,7 +355,7 @@ } fclose($hFile); - + $bAnyBusy = true; while($bAnyBusy) { @@ -451,7 +480,8 @@ // Convert database DSN to psql paramaters $aDSNInfo = DB::parseDSN(CONST_Database_DSN); - $sCMD = 'psql -f '.$sFilename.' '.$aDSNInfo['database']; + if (!isset($aDSNInfo['port']) || !$aDSNInfo['port']) $aDSNInfo['port'] = 5432; + $sCMD = 'psql -p '.$aDSNInfo['port'].' -d '.$aDSNInfo['database'].' -f '.$sFilename; $aDescriptors = array( 0 => array('pipe', 'r'), @@ -479,7 +509,7 @@ // Convert database DSN to psql paramaters $aDSNInfo = DB::parseDSN(CONST_Database_DSN); if (!isset($aDSNInfo['port']) || !$aDSNInfo['port']) $aDSNInfo['port'] = 5432; - $sCMD = 'psql -p '.$aDSNInfo['port'].' '.$aDSNInfo['database']; + $sCMD = 'psql -p '.$aDSNInfo['port'].' -d '.$aDSNInfo['database']; $aDescriptors = array( 0 => array('pipe', 'r'), 1 => STDOUT, @@ -497,3 +527,31 @@ fclose($ahPipes[0]); proc_close($hProcess); } + + function pgsqlRunRestoreData($sDumpFile) + { + // Convert database DSN to psql paramaters + $aDSNInfo = DB::parseDSN(CONST_Database_DSN); + if (!isset($aDSNInfo['port']) || !$aDSNInfo['port']) $aDSNInfo['port'] = 5432; + $sCMD = 'pg_restore -p '.$aDSNInfo['port'].' -d '.$aDSNInfo['database'].' -Fc -a '.$sDumpFile; + + $aDescriptors = array( + 0 => array('pipe', 'r'), + 1 => array('pipe', 'w'), + 2 => array('file', '/dev/null', 'a') + ); + $ahPipes = null; + $hProcess = proc_open($sCMD, $aDescriptors, $ahPipes); + if (!is_resource($hProcess)) fail('unable to start pg_restore'); + + fclose($ahPipes[0]); + + // TODO: error checking + while(!feof($ahPipes[1])) + { + echo fread($ahPipes[1], 4096); + } + fclose($ahPipes[1]); + + proc_close($hProcess); + } -- 2.39.5