From 0070817ba5e2f652f64fb68076ad3a61c2fcca77 Mon Sep 17 00:00:00 2001 From: Brian Quinion Date: Mon, 18 Mar 2013 23:57:10 +0100 Subject: [PATCH] first draft of wikipedia article/importance import script --- utils/importWikipedia.php | 593 ++++++++++++++++++++++++++++++++++++++ utils/import_wikipedia.sh | 50 ++++ 2 files changed, 643 insertions(+) create mode 100755 utils/importWikipedia.php create mode 100755 utils/import_wikipedia.sh diff --git a/utils/importWikipedia.php b/utils/importWikipedia.php new file mode 100755 index 00000000..6e32ee11 --- /dev/null +++ b/utils/importWikipedia.php @@ -0,0 +1,593 @@ +#!/usr/bin/php -Cq + +| season = March/April through October/November +| opening_date = July 1, 1974 +| previous_names = Great Adventure +| area_acre = 2200 +| rides = 45 park admission rides +| coasters = 12 +| water_rides = 2 +| owner = [[Six Flags]] +| general_manager = +| homepage = [http://www.sixflags.com/parks/greatadventure/ Six Flags Great Adventure] +}} +EOD; +var_dump(_templatesToProperties(_parseWikipediaContent($sTestPageText))); +exit; +//| coordinates = {{Coord|40|08|16.65|N|74|26|26.69|W|region:US-NJ_type:landmark|display=inline,title}} +*/ +/* + + $a = array(); + $a[] = 'test'; + + $oDB &= getDB(); + + if ($aCMDResult['drop-tables']) + { + $oDB->query('DROP TABLE wikipedia_article'); + $oDB->query('DROP TABLE wikipedia_link'); + } +*/ + + if ($aCMDResult['create-tables']) + { + $sSQL = <<<'EOD' +CREATE TABLE wikipedia_article ( + language text NOT NULL, + title text NOT NULL, + langcount integer, + othercount integer, + totalcount integer, + lat double precision, + lon double precision, + importance double precision, + title_en text, + osm_type character(1), + osm_id bigint, + infobox_type text, + population bigint, + website text +); + $oDB->query($sSQL); + + $oDB->query("SELECT AddGeometryColumn('wikipedia_article', 'location', 4326, 'GEOMETRY', 2)"); + + $sSQL = <<<'EOD' +CREATE TABLE wikipedia_link ( + from_id INTEGER, + to_name text + ); +EOD; + $oDB->query($sSQL); + } + + function degreesAndMinutesToDecimal($iDegrees, $iMinutes=0, $fSeconds=0, $sNSEW='N') + { + $sNSEW = strtoupper($sNSEW); + return ($sNSEW == 'S' || $sNSEW == 'W'?-1:1) * ((float)$iDegrees + (float)$iMinutes/60 + (float)$fSeconds/3600); + } + + function _parseWikipediaContent($sPageText) + { + $sPageText = str_replace("\n", ' ', $sPageText); + $sPageText = preg_replace('##m', '', $sPageText); + $sPageText = preg_replace('#.*?<\\/math>#m', '', $sPageText); + + $aPageText = preg_split('#({{|}}|\\[\\[|\\]\\]|[|])#', $sPageText, -1, PREG_SPLIT_DELIM_CAPTURE); + + $aPageProperties = array(); + $sPageBody = ''; + $aTemplates = array(); + $aLinks = array(); + + $aTemplateStack = array(); + $aState = array('body'); + foreach($aPageText as $i => $sPart) + { + switch($sPart) + { + case '{{': + array_unshift($aTemplateStack, array('', array())); + array_unshift($aState, 'template'); + break; + case '}}': + if ($aState[0] == 'template' || $aState[0] == 'templateparam') + { + $aTemplate = array_shift($aTemplateStack); + array_shift($aState); + + $aTemplates[] = $aTemplate; + + } + break; + case '[[': + $sLinkPage = ''; + $sLinkSyn = ''; + array_unshift($aState, 'link'); + break; + case ']]': + if ($aState[0] == 'link' || $aState[0] == 'linksynonim') + { + if (!$sLinkSyn) $sLinkSyn = $sLinkPage; + if (substr($sLinkPage, 0, 6) == 'Image:') $sLinkSyn = substr($sLinkPage, 6); + + $aLinks[] = array($sLinkPage, $sLinkSyn); + + array_shift($aState); + switch($aState[0]) + { + case 'template': + $aTemplateStack[0][0] .= trim($sPart); + break; + case 'templateparam': + $aTemplateStack[0][1][0] .= $sLinkSyn; + break; + case 'link': + $sLinkPage .= trim($sPart); + break; + case 'linksynonim': + $sLinkSyn .= $sPart; + break; + case 'body': + $sPageBody .= $sLinkSyn; + break; + default: + var_dump($aState, $sPageName, $aTemplateStack, $sPart, $aPageText); + fail('unknown state'); + } + } + break; + case '|': + if ($aState[0] == 'template' || $aState[0] == 'templateparam') + { + // Create a new template paramater + $aState[0] = 'templateparam'; + array_unshift($aTemplateStack[0][1], ''); + } + if ($aState[0] == 'link') $aState[0] = 'linksynonim'; + break; + default: + switch($aState[0]) + { + case 'template': + $aTemplateStack[0][0] .= trim($sPart); + break; + case 'templateparam': + $aTemplateStack[0][1][0] .= $sPart; + break; + case 'link': + $sLinkPage .= trim($sPart); + break; + case 'linksynonim': + $sLinkSyn .= $sPart; + break; + case 'body': + $sPageBody .= $sPart; + break; + default: + var_dump($aState, $aPageText); + fail('unknown state'); + } + break; + } + } + return $aTemplates; + } + + function _templatesToProperties($aTemplates) + { + $aPageProperties = array(); + foreach($aTemplates as $iTemplate => $aTemplate) + { + $aParams = array(); + foreach(array_reverse($aTemplate[1]) as $iParam => $sParam) + { + if (($iPos = strpos($sParam, '=')) === FALSE) + { + $aParams[] = trim($sParam); + } + else + { + $aParams[trim(substr($sParam, 0, $iPos))] = trim(substr($sParam, $iPos+1)); + } + } + $aTemplates[$iTemplate][1] = $aParams; + if (!isset($aPageProperties['sOfficialName']) && isset($aParams['official_name']) && $aParams['official_name']) $aPageProperties['sOfficialName'] = $aParams['official_name']; + if (!isset($aPageProperties['iPopulation']) && isset($aParams['population']) && $aParams['population'] && preg_match('#^[0-9.,]+#', $aParams['population'])) + { + $aPageProperties['iPopulation'] = (int)str_replace(array(',','.'), '', $aParams['population']); + } + if (!isset($aPageProperties['iPopulation']) && isset($aParams['population_total']) && $aParams['population_total'] && preg_match('#^[0-9.,]+#', $aParams['population_total'])) + { + $aPageProperties['iPopulation'] = (int)str_replace(array(',','.'), '', $aParams['population_total']); + } + if (!isset($aPageProperties['iPopulation']) && isset($aParams['population_urban']) && $aParams['population_urban'] && preg_match('#^[0-9.,]+#', $aParams['population_urban'])) + { + $aPageProperties['iPopulation'] = (int)str_replace(array(',','.'), '', $aParams['population_urban']); + } + if (!isset($aPageProperties['iPopulation']) && isset($aParams['population_estimate']) && $aParams['population_estimate'] && preg_match('#^[0-9.,]+#', $aParams['population_estimate'])) + { + $aPageProperties['iPopulation'] = (int)str_replace(array(',','.'), '', $aParams['population_estimate']); + } + if (!isset($aPageProperties['sWebsite']) && isset($aParams['website']) && $aParams['website']) + { + if (preg_match('#^\\[?([^ \\]]+)[^\\]]*\\]?$#', $aParams['website'], $aMatch)) + { + $aPageProperties['sWebsite'] = $aMatch[1]; + if (strpos($aPageProperties['sWebsite'],':/'.'/') === FALSE) + { + $aPageProperties['sWebsite'] = 'http:/'.'/'.$aPageProperties['sWebsite']; + } + } + } + if (!isset($aPageProperties['sTopLevelDomain']) && isset($aParams['cctld']) && $aParams['cctld']) + { + $aPageProperties['sTopLevelDomain'] = str_replace(array('[',']','.'),'', $aParams['cctld']); + } + + if (!isset($aPageProperties['sInfoboxType']) && strtolower(substr($aTemplate[0],0,7)) == 'infobox') + { + $aPageProperties['sInfoboxType'] = trim(substr($aTemplate[0],8)); + // $aPageProperties['aInfoboxParams'] = $aParams; + } + + // Assume the first template with lots of params is the type (fallback for infobox) + if (!isset($aPageProperties['sPossibleInfoboxType']) && sizeof($aParams) > 10) + { + $aPageProperties['sPossibleInfoboxType'] = trim($aTemplate[0]); + // $aPageProperties['aInfoboxParams'] = $aParams; + } + + // do we have a lat/lon + if (!isset($aPageProperties['fLat'])) + { + if (isset($aParams['latd']) && isset($aParams['longd'])) + { + $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aParams['latd'], @$aParams['latm'], @$aParams['lats'], @$aParams['latNS']); + $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aParams['longd'], @$aParams['longm'], @$aParams['longs'], @$aParams['longEW']); + } + if (isset($aParams['lat_degrees']) && isset($aParams['lat_degrees'])) + { + $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aParams['lat_degrees'], @$aParams['lat_minutes'], @$aParams['lat_seconds'], @$aParams['lat_direction']); + $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aParams['long_degrees'], @$aParams['long_minutes'], @$aParams['long_seconds'], @$aParams['long_direction']); + } + if (isset($aParams['latitude']) && isset($aParams['longitude'])) + { + if (preg_match('#[0-9.]+#', $aParams['latitude']) && preg_match('#[0-9.]+#', $aParams['longitude'])) + { + $aPageProperties['fLat'] = (float)$aParams['latitude']; + $aPageProperties['fLon'] = (float)$aParams['longitude']; + } + } + if (strtolower($aTemplate[0]) == 'coord') + { + if (isset($aParams[3]) && (strtoupper($aParams[3]) == 'N' || strtoupper($aParams[3]) == 'S')) + { + $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aParams[0], $aParams[1], $aParams[2], $aParams[3]); + $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aParams[4], $aParams[5], $aParams[6], $aParams[7]); + } + elseif (isset($aParams[0]) && isset($aParams[1]) && isset($aParams[2]) && (strtoupper($aParams[2]) == 'N' || strtoupper($aParams[2]) == 'S')) + { + $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aParams[0], $aParams[1], 0, $aParams[2]); + $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aParams[3], $aParams[4], 0, $aParams[5]); + } + else if (isset($aParams[0]) && isset($aParams[1]) && (strtoupper($aParams[1]) == 'N' || strtoupper($aParams[1]) == 'S')) + { + $aPageProperties['fLat'] = (strtoupper($aParams[1]) == 'N'?1:-1) * (float)$aParams[0]; + $aPageProperties['fLon'] = (strtoupper($aParams[3]) == 'E'?1:-1) * (float)$aParams[2]; + } + else if (isset($aParams[0]) && is_numeric($aParams[0]) && isset($aParams[1]) && is_numeric($aParams[1])) + { + $aPageProperties['fLat'] = (float)$aParams[0]; + $aPageProperties['fLon'] = (float)$aParams[1]; + } + } + if (isset($aParams['Latitude']) && isset($aParams['Longitude'])) + { + $aParams['Latitude'] = str_replace(' ',' ',$aParams['Latitude']); + $aParams['Longitude'] = str_replace(' ',' ',$aParams['Longitude']); + if (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([NS]) to ([0-9]+)°( ([0-9]+)′)? ([NS])#', $aParams['Latitude'], $aMatch)) + { + $aPageProperties['fLat'] = + (degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4]) + +degreesAndMinutesToDecimal($aMatch[5], $aMatch[7], 0, $aMatch[8])) / 2; + } + else if (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([NS])#', $aParams['Latitude'], $aMatch)) + { + $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4]); + } + + if (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([EW]) to ([0-9]+)°( ([0-9]+)′)? ([EW])#', $aParams['Longitude'], $aMatch)) + { + $aPageProperties['fLon'] = + (degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4]) + +degreesAndMinutesToDecimal($aMatch[5], $aMatch[7], 0, $aMatch[8])) / 2; + } + else if (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([EW])#', $aParams['Longitude'], $aMatch)) + { + $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4]); + } + } + } + } + if (isset($aPageProperties['sPossibleInfoboxType'])) + { + if (!isset($aPageProperties['sInfoboxType'])) $aPageProperties['sInfoboxType'] = '#'.$aPageProperties['sPossibleInfoboxType']; + unset($aPageProperties['sPossibleInfoboxType']); + } + return $aPageProperties; + } + + if (isset($aCMDResult['parse-wikipedia'])) + { + $oDB =& getDB(); + $aArticleNames = $oDB->getCol('select page_title from content where page_namespace = 0 and page_id %10 = '.$aCMDResult['parse-wikipedia'].' and (page_content ilike \'%{{Coord%\' or (page_content ilike \'%lat%\' and page_content ilike \'%lon%\'))'); +// $aArticleNames = $oDB->getCol($sSQL = 'select page_title from content where page_namespace = 0 and (page_content ilike \'%{{Coord%\' or (page_content ilike \'%lat%\' and page_content ilike \'%lon%\')) and page_title in (\'Virginia\')'); + foreach($aArticleNames as $sArticleName) + { + $sPageText = $oDB->getOne('select page_content from content where page_namespace = 0 and page_title = \''.pg_escape_string($sArticleName).'\''); + $aP = _templatesToProperties(_parseWikipediaContent($sPageText)); + + if (isset($aP['sInfoboxType'])) + { + $aP['sInfoboxType'] = preg_replace('#\\s+#',' ',$aP['sInfoboxType']); + $sSQL = 'update wikipedia_article set '; + $sSQL .= 'infobox_type = \''.pg_escape_string($aP['sInfoboxType']).'\''; + $sSQL .= ' where language = \'en\' and title = \''.pg_escape_string($sArticleName).'\';'; + $oDB->query($sSQL); + } + if (isset($aP['iPopulation'])) + { + $sSQL = 'update wikipedia_article set '; + $sSQL .= 'population = \''.pg_escape_string($aP['iPopulation']).'\''; + $sSQL .= ' where language = \'en\' and title = \''.pg_escape_string($sArticleName).'\';'; + $oDB->query($sSQL); + } + if (isset($aP['sWebsite'])) + { + $sSQL = 'update wikipedia_article set '; + $sSQL .= 'website = \''.pg_escape_string($aP['sWebsite']).'\''; + $sSQL .= ' where language = \'en\' and title = \''.pg_escape_string($sArticleName).'\';'; + $oDB->query($sSQL); + } + if (isset($aP['fLat']) && ($aP['fLat']!='-0' || $aP['fLon']!='-0')) + { + if (!isset($aP['sInfoboxType'])) $aP['sInfoboxType'] = ''; + echo $sArticleName.'|'.$aP['sInfoboxType'].'|'.$aP['fLat'].'|'.$aP['fLon'] ."\n"; + $sSQL = 'update wikipedia_article set '; + $sSQL .= 'lat = \''.pg_escape_string($aP['fLat']).'\','; + $sSQL .= 'lon = \''.pg_escape_string($aP['fLon']).'\''; + $sSQL .= ' where language = \'en\' and title = \''.pg_escape_string($sArticleName).'\';'; + $oDB->query($sSQL); + } + } + } + + function nominatimXMLStart($hParser, $sName, $aAttr) + { + global $aNominatRecords; + switch($sName) + { + case 'PLACE': + $aNominatRecords[] = $aAttr; + break; + } + } + + function nominatimXMLEnd($hParser, $sName) + { + } + + + if (isset($aCMDResult['link'])) + { + $oDB =& getDB(); + $aWikiArticles = $oDB->getAll("select * from wikipedia_article where language = 'en' and lat is not null and osm_type is null and totalcount < 31 order by importance desc limit 200000"); + + // If you point this script at production OSM you will be blocked + $sNominatimBaseURL = 'http://SEVERNAME/search.php'; + + foreach($aWikiArticles as $aRecord) + { + $aRecord['name'] = str_replace('_',' ',$aRecord['title']); + + $sURL = $sNominatimBaseURL.'?format=xml&accept-language=en'; + + echo "\n-- ".$aRecord['name'].", ".$aRecord['infobox_type']."\n"; + $fMaxDist = 0.0000001; + $bUnknown = false; + switch(strtolower($aRecord['infobox_type'])) + { + case 'former country': + continue 2; + case 'sea': + $fMaxDist = 60; // effectively turn it off + $sURL .= "&viewbox=".($aRecord['lon']-$fMaxDist).",".($aRecord['lat']+$fMaxDist).",".($aRecord['lon']+$fMaxDist).",".($aRecord['lat']-$fMaxDist); + break; + case 'country': + case 'island': + case 'islands': + case 'continent': + $fMaxDist = 60; // effectively turn it off + $sURL .= "&featuretype=country"; + $sURL .= "&viewbox=".($aRecord['lon']-$fMaxDist).",".($aRecord['lat']+$fMaxDist).",".($aRecord['lon']+$fMaxDist).",".($aRecord['lat']-$fMaxDist); + break; + case 'prefecture japan': + $aRecord['name'] = trim(str_replace(' Prefecture',' ', $aRecord['name'])); + case 'state': + case '#us state': + case 'county': + case 'u.s. state': + case 'u.s. state symbols': + case 'german state': + case 'province or territory of canada'; + case 'indian jurisdiction'; + case 'province'; + case 'french region': + case 'region of italy': + case 'kommune': + case '#australia state or territory': + case 'russian federal subject': + $fMaxDist = 4; + $sURL .= "&featuretype=state"; + $sURL .= "&viewbox=".($aRecord['lon']-$fMaxDist).",".($aRecord['lat']+$fMaxDist).",".($aRecord['lon']+$fMaxDist).",".($aRecord['lat']-$fMaxDist); + break; + case 'protected area': + $fMaxDist = 1; + $sURL .= "&nearlat=".$aRecord['lat']; + $sURL .= "&nearlon=".$aRecord['lon']; + $sURL .= "&viewbox=".($aRecord['lon']-$fMaxDist).",".($aRecord['lat']+$fMaxDist).",".($aRecord['lon']+$fMaxDist).",".($aRecord['lat']-$fMaxDist); + break; + case 'settlement': + $bUnknown = true; + case 'french commune': + case 'italian comune': + case 'uk place': + case 'italian comune': + case 'australian place': + case 'german place': + case '#geobox': + case 'u.s. county': + case 'municipality': + case 'city japan': + case 'russian inhabited locality': + case 'finnish municipality/land area': + case 'england county': + case 'israel municipality': + case 'russian city': + case 'city': + $fMaxDist = 0.2; + $sURL .= "&featuretype=settlement"; + $sURL .= "&viewbox=".($aRecord['lon']-0.5).",".($aRecord['lat']+0.5).",".($aRecord['lon']+0.5).",".($aRecord['lat']-0.5); + break; + case 'mountain': + case 'mountain pass': + case 'river': + case 'lake': + case 'airport': + $fMaxDist = 0.2; + $sURL .= "&viewbox=".($aRecord['lon']-0.5).",".($aRecord['lat']+0.5).",".($aRecord['lon']+0.5).",".($aRecord['lat']-0.5); + + case 'ship begin': + $fMaxDist = 0.1; + $aTypes = array('wreck'); + $sURL .= "&viewbox=".($aRecord['lon']-0.01).",".($aRecord['lat']+0.01).",".($aRecord['lon']+0.01).",".($aRecord['lat']-0.01); + $sURL .= "&nearlat=".$aRecord['lat']; + $sURL .= "&nearlon=".$aRecord['lon']; + break; + case 'road': + case 'university': + case 'company': + case 'department': + $fMaxDist = 0.005; + $sURL .= "&viewbox=".($aRecord['lon']-0.01).",".($aRecord['lat']+0.01).",".($aRecord['lon']+0.01).",".($aRecord['lat']-0.01); + $sURL .= "&bounded=1"; + $sURL .= "&nearlat=".$aRecord['lat']; + $sURL .= "&nearlon=".$aRecord['lon']; + break; + default: + $bUnknown = true; + $fMaxDist = 0.005; + $sURL .= "&viewbox=".($aRecord['lon']-0.01).",".($aRecord['lat']+0.01).",".($aRecord['lon']+0.01).",".($aRecord['lat']-0.01); +// $sURL .= "&bounded=1"; + $sURL .= "&nearlat=".$aRecord['lat']; + $sURL .= "&nearlon=".$aRecord['lon']; + echo "-- Unknown: ".$aRecord['infobox_type']."\n"; + break; + } + $sNameURL = $sURL.'&q='.urlencode($aRecord['name']); + + var_Dump($sNameURL); + $sXML = file_get_contents($sNameURL); + + $aNominatRecords = array(); + $hXMLParser = xml_parser_create(); + xml_set_element_handler($hXMLParser, 'nominatimXMLStart', 'nominatimXMLEnd'); + xml_parse($hXMLParser, $sXML, true); + xml_parser_free($hXMLParser); + + if (!isset($aNominatRecords[0])) + { + $aNameParts = preg_split('#[(,]#',$aRecord['name']); + if (sizeof($aNameParts) > 1) + { + $sNameURL = $sURL.'&q='.urlencode(trim($aNameParts[0])); + var_Dump($sNameURL); + $sXML = file_get_contents($sNameURL); + + $aNominatRecords = array(); + $hXMLParser = xml_parser_create(); + xml_set_element_handler($hXMLParser, 'nominatimXMLStart', 'nominatimXMLEnd'); + xml_parse($hXMLParser, $sXML, true); + xml_parser_free($hXMLParser);# + } + } + + // assume first is best/right + for($i = 0; $i < sizeof($aNominatRecords); $i++) + { + $fDiff = ($aRecord['lat']-$aNominatRecords[$i]['LAT']) * ($aRecord['lat']-$aNominatRecords[$i]['LAT']); + $fDiff += ($aRecord['lon']-$aNominatRecords[$i]['LON']) * ($aRecord['lon']-$aNominatRecords[$i]['LON']); + $fDiff = sqrt($fDiff); + if ($bUnknown) { + // If it was an unknown type base it on the rank of the found result + $iRank = (int)$aNominatRecords[$i]['PLACE_RANK']; + if ($iRank <= 4) $fMaxDist = 2; + elseif ($iRank <= 8) $fMaxDist = 1; + elseif ($iRank <= 10) $fMaxDist = 0.8; + elseif ($iRank <= 12) $fMaxDist = 0.6; + elseif ($iRank <= 17) $fMaxDist = 0.2; + elseif ($iRank <= 18) $fMaxDist = 0.1; + elseif ($iRank <= 22) $fMaxDist = 0.02; + elseif ($iRank <= 26) $fMaxDist = 0.001; + else $fMaxDist = 0.001; + } + echo "-- FOUND \"".substr($aNominatRecords[$i]['DISPLAY_NAME'],0,50)."\", ".$aNominatRecords[$i]['CLASS'].", ".$aNominatRecords[$i]['TYPE'].", ".$aNominatRecords[$i]['PLACE_RANK'].", ".$aNominatRecords[$i]['OSM_TYPE']." (dist:$fDiff, max:$fMaxDist)\n"; + if ($fDiff > $fMaxDist) + { + echo "-- Diff too big $fDiff (max: $fMaxDist)".$aRecord['lat'].','.$aNominatRecords[$i]['LAT'].' & '.$aRecord['lon'].','.$aNominatRecords[$i]['LON']." \n"; + } + else + { + $sSQL = "update wikipedia_article set osm_type="; + switch($aNominatRecords[$i]['OSM_TYPE']) + { + case 'relation': $sSQL .= "'R'"; break; + case 'way': $sSQL .= "'W'"; break; + case 'node': $sSQL .= "'N'"; break; + } + $sSQL .= ", osm_id=".$aNominatRecords[$i]['OSM_ID']." where language = '".pg_escape_string($aRecord['language'])."' and title = '".pg_escape_string($aRecord['title'])."'"; + $oDB->query($sSQL); + break; + } + } + } + } diff --git a/utils/import_wikipedia.sh b/utils/import_wikipedia.sh new file mode 100755 index 00000000..7dd5ab08 --- /dev/null +++ b/utils/import_wikipedia.sh @@ -0,0 +1,50 @@ +#!/bin/bash + +psqlcmd=psql wikipedia2013 +mysql2pgsqlcmd=./mysql2pgsql.perl /dev/stdin /dev/stdout + +language=( "ar" "bg" "ca" "cs" "da" "de" "en" "es" "eo" "eu" "fa" "fr" "ko" "hi" "hr" "id" "it" "he" "lt" "hu" "ms" "nl" "ja" "no" "pl" "pt" "kk" "ro" "ru" "sk" "sl" "sr" "fi" "sv" "tr" "uk" "vi" "vo" "war" "zh" ) + +# wikipedia pages and links +echo "CREATE TABLE linkcounts (language text, title text, count integer, sumcount integer, lat double, lon double );" | $psqlcmd +echo "CREATE TABLE wikipedia_redirect (language text, from_title text, to_title text );" | $psqlcmd + +for i in "${language[@]}" +do + wget http://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-page.sql.gz + wget http://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-pagelinks.sql.gz + wget http://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-langlinks.sql.gz + wget http://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-redirect.sql.gz +done + +for i in "${language[@]}" +do + gzip -dc ${i}wiki-latest-pagelinks.sql.gz | sed "s/\`pagelinks\`/\`${i}pagelinks\`/g" | $mysql2pgsqlcmd | $psqlcmd + gzip -dc ${i}wiki-latest-page.sql.gz | sed "s/\`page\`/\`${i}page\`/g" | $mysql2pgsqlcmd | $psqlcmd + gzip -dc ${i}wiki-latest-langlinks.sql.gz | sed "s/\`langlinks\`/\`${i}langlinks\`/g" | $mysql2pgsqlcmd | $psqlcmd + gzip -dc ${i}wiki-latest-redirect.sql.gz | sed "s/\`redirect\`/\`${i}redirect\`/g" | $mysql2pgsqlcmd | $psqlcmd +done + +for i in "${language[@]}" +do + echo "create table ${i}pagelinkcount as select pl_title as title,count(*) as count from ${i}pagelinks where pl_namespace = 0 group by pl_title;" | $psqlcmd + echo "insert into linkcounts select '${i}',pl_title,count(*) from ${i}pagelinks where pl_namespace = 0 group by pl_title;" | $psqlcmd + echo "insert into wikipedia_redirect select '${i}',page_title,rd_title from ${i}redirect join ${i}page on (rd_from = page_id) where page_namespace = 0 and rd_namespace = 0;" | $psqlcmd + echo "alter table ${i}pagelinkcount add column othercount integer;" | $psqlcmd + echo "update ${i}pagelinkcount set othercount = 0;" | $psqlcmd + for j in "${language[@]}" + do + echo "update ${i}pagelinkcount set othercount = ${i}pagelinkcount.othercount + x.count from (select page_title as title,count from ${i}langlinks join ${i}page on (ll_from = page_id) join ${j}pagelinkcount on (ll_lang = '${j}' and ll_title = title)) as x where x.title = ${i}pagelinkcount.title;" | $psqlcmd + done + echo "insert into wikipedia_article select '${i}', title, count, othercount, count+othercount from ${i}pagelinkcount;" | $psqlcmd +done + +# precalculated lat,lon from dbpedia +cat geo_coordinates_en.nq | grep http://www.georss.org/georss/point | sed 's|]*> * "\(-\?[-0-9.E]\+\) \(-\?[-0-9.E]\+\)"@en .|update pagelinks set lat=\1, lon=\2 where language = '"'"'\3'"'"' and title = decode_url_part('"'"'\4'"'"');|g' | $psqlcmd + +# media wiki dumper +wget https://github.com/bcollier/mwdumper/blob/master/build/mwdumper.jar + +# latest english wikipedia articles +wget http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2 +java -jar mwdumper.jar --format=sql:1.5 enwiki-latest-pages-articles.xml.bz2 | ./mysql2pgsql.perl /dev/stdin /dev/stdout | sed 's/"text (/text ("/g' | sed 's/"old_flags)"/"old_flags")/g' | sed 's/"revision (/revision ("/g' | sed 's/"rev_deleted)"/"rev_deleted")/g' | sed 's/"page (/page ("/g' | sed 's/"page_len)"/"page_len")/g' | sed "s/DATE_ADD(E'1970-01-01', INTERVAL UNIX_TIMESTAMP() SECOND)[+]//g" | sed 's/RAND()/0/g' | $psqlcmd -- 2.39.5