From 2051a84a09d4a370fa74db1ba8742e9095b842fc Mon Sep 17 00:00:00 2001 From: marc tobias Date: Sun, 1 Dec 2019 11:14:28 +0100 Subject: [PATCH] remove old wikidata script. See data-sources/wikipedia-wikidata/ for new process --- CMakeLists.txt | 1 - utils/importWikipedia.php | 557 ------------------------------- wikidata/create.sql | 75 ----- wikidata/import.sh | 28 -- wikidata/parse.php | 188 ----------- wikidata/pull_wikipedia_logs.php | 79 ----- 6 files changed, 928 deletions(-) delete mode 100644 utils/importWikipedia.php delete mode 100644 wikidata/create.sql delete mode 100755 wikidata/import.sh delete mode 100755 wikidata/parse.php delete mode 100644 wikidata/pull_wikipedia_logs.php diff --git a/CMakeLists.txt b/CMakeLists.txt index 88b04051..6bd85880 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -109,7 +109,6 @@ set(WEBSITESCRIPTS set(CUSTOMSCRIPTS utils/country_languages.php - utils/importWikipedia.php utils/export.php utils/query.php utils/setup.php diff --git a/utils/importWikipedia.php b/utils/importWikipedia.php deleted file mode 100644 index 2e256e35..00000000 --- a/utils/importWikipedia.php +++ /dev/null @@ -1,557 +0,0 @@ - -| season = March/April through October/November -| opening_date = July 1, 1974 -| previous_names = Great Adventure -| area_acre = 2200 -| rides = 45 park admission rides -| coasters = 12 -| water_rides = 2 -| owner = [[Six Flags]] -| general_manager = -| homepage = [http://www.sixflags.com/parks/greatadventure/ Six Flags Great Adventure] -}} -EOD; -var_dump(_templatesToProperties(_parseWikipediaContent($sTestPageText))); -exit; -//| coordinates = {{Coord|40|08|16.65|N|74|26|26.69|W|region:US-NJ_type:landmark|display=inline,title}} -*/ -/* - - $a = array(); - $a[] = 'test'; - - $oDB = new Nominatim\DB(); - $oDB->connect(); - - if ($aCMDResult['drop-tables']) - { - $oDB->query('DROP TABLE wikipedia_article'); - $oDB->query('DROP TABLE wikipedia_link'); - } -*/ - -if ($aCMDResult['create-tables']) { - $sSQL = <<<'EOD' -CREATE TABLE wikipedia_article ( - language text NOT NULL, - title text NOT NULL, - langcount integer, - othercount integer, - totalcount integer, - lat double precision, - lon double precision, - importance double precision, - title_en text, - osm_type character(1), - osm_id bigint, - infobox_type text, - population bigint, - website text -); - $oDB->query($sSQL); - - $oDB->query("SELECT AddGeometryColumn('wikipedia_article', 'location', 4326, 'GEOMETRY', 2)"); - - $sSQL = <<<'EOD' -CREATE TABLE wikipedia_link ( - from_id INTEGER, - to_name text - ); -EOD; - $oDB->query($sSQL); -} - - -function degreesAndMinutesToDecimal($iDegrees, $iMinutes = 0, $fSeconds = 0, $sNSEW = 'N') -{ - $sNSEW = strtoupper($sNSEW); - return ($sNSEW == 'S' || $sNSEW == 'W'?-1:1) * ((float)$iDegrees + (float)$iMinutes/60 + (float)$fSeconds/3600); -} - - -function _parseWikipediaContent($sPageText) -{ - $sPageText = str_replace("\n", ' ', $sPageText); - $sPageText = preg_replace('##m', '', $sPageText); - $sPageText = preg_replace('#.*?<\\/math>#m', '', $sPageText); - - $aPageText = preg_split('#({{|}}|\\[\\[|\\]\\]|[|])#', $sPageText, -1, PREG_SPLIT_DELIM_CAPTURE); - - $aPageProperties = array(); - $sPageBody = ''; - $aTemplates = array(); - $aLinks = array(); - - $aTemplateStack = array(); - $aState = array('body'); - foreach ($aPageText as $i => $sPart) { - switch ($sPart) { - case '{{': - array_unshift($aTemplateStack, array('', array())); - array_unshift($aState, 'template'); - break; - case '}}': - if ($aState[0] == 'template' || $aState[0] == 'templateparam') { - $aTemplate = array_shift($aTemplateStack); - array_shift($aState); - - $aTemplates[] = $aTemplate; - } - break; - case '[[': - $sLinkPage = ''; - $sLinkSyn = ''; - array_unshift($aState, 'link'); - break; - case ']]': - if ($aState[0] == 'link' || $aState[0] == 'linksynonim') { - if (!$sLinkSyn) $sLinkSyn = $sLinkPage; - if (substr($sLinkPage, 0, 6) == 'Image:') $sLinkSyn = substr($sLinkPage, 6); - - $aLinks[] = array($sLinkPage, $sLinkSyn); - - array_shift($aState); - switch ($aState[0]) { - case 'template': - $aTemplateStack[0][0] .= trim($sPart); - break; - case 'templateparam': - $aTemplateStack[0][1][0] .= $sLinkSyn; - break; - case 'link': - $sLinkPage .= trim($sPart); - break; - case 'linksynonim': - $sLinkSyn .= $sPart; - break; - case 'body': - $sPageBody .= $sLinkSyn; - break; - default: - var_dump($aState, $sPageName, $aTemplateStack, $sPart, $aPageText); - fail('unknown state'); - } - } - break; - case '|': - if ($aState[0] == 'template' || $aState[0] == 'templateparam') { - // Create a new template paramater - $aState[0] = 'templateparam'; - array_unshift($aTemplateStack[0][1], ''); - } - if ($aState[0] == 'link') $aState[0] = 'linksynonim'; - break; - default: - switch ($aState[0]) { - case 'template': - $aTemplateStack[0][0] .= trim($sPart); - break; - case 'templateparam': - $aTemplateStack[0][1][0] .= $sPart; - break; - case 'link': - $sLinkPage .= trim($sPart); - break; - case 'linksynonim': - $sLinkSyn .= $sPart; - break; - case 'body': - $sPageBody .= $sPart; - break; - default: - var_dump($aState, $aPageText); - fail('unknown state'); - } - break; - } - } - return $aTemplates; -} - -function _templatesToProperties($aTemplates) -{ - $aPageProperties = array(); - foreach ($aTemplates as $iTemplate => $aTemplate) { - $aParams = array(); - foreach (array_reverse($aTemplate[1]) as $iParam => $sParam) { - if (($iPos = strpos($sParam, '=')) === false) { - $aParams[] = trim($sParam); - } else { - $aParams[trim(substr($sParam, 0, $iPos))] = trim(substr($sParam, $iPos+1)); - } - } - $aTemplates[$iTemplate][1] = $aParams; - if (!isset($aPageProperties['sOfficialName']) && isset($aParams['official_name']) && $aParams['official_name']) $aPageProperties['sOfficialName'] = $aParams['official_name']; - if (!isset($aPageProperties['iPopulation']) && isset($aParams['population']) && $aParams['population'] && preg_match('#^[0-9.,]+#', $aParams['population'])) { - $aPageProperties['iPopulation'] = (int)str_replace(array(',', '.'), '', $aParams['population']); - } - if (!isset($aPageProperties['iPopulation']) && isset($aParams['population_total']) && $aParams['population_total'] && preg_match('#^[0-9.,]+#', $aParams['population_total'])) { - $aPageProperties['iPopulation'] = (int)str_replace(array(',', '.'), '', $aParams['population_total']); - } - if (!isset($aPageProperties['iPopulation']) && isset($aParams['population_urban']) && $aParams['population_urban'] && preg_match('#^[0-9.,]+#', $aParams['population_urban'])) { - $aPageProperties['iPopulation'] = (int)str_replace(array(',', '.'), '', $aParams['population_urban']); - } - if (!isset($aPageProperties['iPopulation']) && isset($aParams['population_estimate']) && $aParams['population_estimate'] && preg_match('#^[0-9.,]+#', $aParams['population_estimate'])) { - $aPageProperties['iPopulation'] = (int)str_replace(array(',', '.'), '', $aParams['population_estimate']); - } - if (!isset($aPageProperties['sWebsite']) && isset($aParams['website']) && $aParams['website']) { - if (preg_match('#^\\[?([^ \\]]+)[^\\]]*\\]?$#', $aParams['website'], $aMatch)) { - $aPageProperties['sWebsite'] = $aMatch[1]; - if (strpos($aPageProperties['sWebsite'], ':/'.'/') === false) { - $aPageProperties['sWebsite'] = 'http:/'.'/'.$aPageProperties['sWebsite']; - } - } - } - if (!isset($aPageProperties['sTopLevelDomain']) && isset($aParams['cctld']) && $aParams['cctld']) { - $aPageProperties['sTopLevelDomain'] = str_replace(array('[', ']', '.'), '', $aParams['cctld']); - } - - if (!isset($aPageProperties['sInfoboxType']) && strtolower(substr($aTemplate[0], 0, 7)) == 'infobox') { - $aPageProperties['sInfoboxType'] = trim(substr($aTemplate[0], 8)); - // $aPageProperties['aInfoboxParams'] = $aParams; - } - - // Assume the first template with lots of params is the type (fallback for infobox) - if (!isset($aPageProperties['sPossibleInfoboxType']) && count($aParams) > 10) { - $aPageProperties['sPossibleInfoboxType'] = trim($aTemplate[0]); - // $aPageProperties['aInfoboxParams'] = $aParams; - } - - // do we have a lat/lon - if (!isset($aPageProperties['fLat'])) { - if (isset($aParams['latd']) && isset($aParams['longd'])) { - $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aParams['latd'], @$aParams['latm'], @$aParams['lats'], @$aParams['latNS']); - $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aParams['longd'], @$aParams['longm'], @$aParams['longs'], @$aParams['longEW']); - } - if (isset($aParams['lat_degrees']) && isset($aParams['lat_degrees'])) { - $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aParams['lat_degrees'], @$aParams['lat_minutes'], @$aParams['lat_seconds'], @$aParams['lat_direction']); - $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aParams['long_degrees'], @$aParams['long_minutes'], @$aParams['long_seconds'], @$aParams['long_direction']); - } - if (isset($aParams['latitude']) && isset($aParams['longitude'])) { - if (preg_match('#[0-9.]+#', $aParams['latitude']) && preg_match('#[0-9.]+#', $aParams['longitude'])) { - $aPageProperties['fLat'] = (float)$aParams['latitude']; - $aPageProperties['fLon'] = (float)$aParams['longitude']; - } - } - if (strtolower($aTemplate[0]) == 'coord') { - if (isset($aParams[3]) && (strtoupper($aParams[3]) == 'N' || strtoupper($aParams[3]) == 'S')) { - $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aParams[0], $aParams[1], $aParams[2], $aParams[3]); - $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aParams[4], $aParams[5], $aParams[6], $aParams[7]); - } elseif (isset($aParams[0]) && isset($aParams[1]) && isset($aParams[2]) && (strtoupper($aParams[2]) == 'N' || strtoupper($aParams[2]) == 'S')) { - $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aParams[0], $aParams[1], 0, $aParams[2]); - $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aParams[3], $aParams[4], 0, $aParams[5]); - } elseif (isset($aParams[0]) && isset($aParams[1]) && (strtoupper($aParams[1]) == 'N' || strtoupper($aParams[1]) == 'S')) { - $aPageProperties['fLat'] = (strtoupper($aParams[1]) == 'N'?1:-1) * (float)$aParams[0]; - $aPageProperties['fLon'] = (strtoupper($aParams[3]) == 'E'?1:-1) * (float)$aParams[2]; - } elseif (isset($aParams[0]) && is_numeric($aParams[0]) && isset($aParams[1]) && is_numeric($aParams[1])) { - $aPageProperties['fLat'] = (float)$aParams[0]; - $aPageProperties['fLon'] = (float)$aParams[1]; - } - } - if (isset($aParams['Latitude']) && isset($aParams['Longitude'])) { - $aParams['Latitude'] = str_replace(' ', ' ', $aParams['Latitude']); - $aParams['Longitude'] = str_replace(' ', ' ', $aParams['Longitude']); - if (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([NS]) to ([0-9]+)°( ([0-9]+)′)? ([NS])#', $aParams['Latitude'], $aMatch)) { - $aPageProperties['fLat'] = - (degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4]) - +degreesAndMinutesToDecimal($aMatch[5], $aMatch[7], 0, $aMatch[8])) / 2; - } elseif (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([NS])#', $aParams['Latitude'], $aMatch)) { - $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4]); - } - - if (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([EW]) to ([0-9]+)°( ([0-9]+)′)? ([EW])#', $aParams['Longitude'], $aMatch)) { - $aPageProperties['fLon'] = - (degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4]) - +degreesAndMinutesToDecimal($aMatch[5], $aMatch[7], 0, $aMatch[8])) / 2; - } elseif (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([EW])#', $aParams['Longitude'], $aMatch)) { - $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4]); - } - } - } - } - if (isset($aPageProperties['sPossibleInfoboxType'])) { - if (!isset($aPageProperties['sInfoboxType'])) $aPageProperties['sInfoboxType'] = '#'.$aPageProperties['sPossibleInfoboxType']; - unset($aPageProperties['sPossibleInfoboxType']); - } - return $aPageProperties; -} - -if (isset($aCMDResult['parse-wikipedia'])) { - $oDB = new Nominatim\DB(); - $oDB->connect(); - - $sSQL = 'select page_title from content where page_namespace = 0 and page_id %10 = '; - $sSQL .= $aCMDResult['parse-wikipedia']; - $sSQL .= ' and (page_content ilike \'%{{Coord%\' or (page_content ilike \'%lat%\' and page_content ilike \'%lon%\'))'; - $aArticleNames = $oDB->getCol($sSQL); - /* $aArticleNames = $oDB->getCol($sSQL = 'select page_title from content where page_namespace = 0 - and (page_content ilike \'%{{Coord%\' or (page_content ilike \'%lat%\' - and page_content ilike \'%lon%\')) and page_title in (\'Virginia\')'); - */ - foreach ($aArticleNames as $sArticleName) { - $sPageText = $oDB->getOne('select page_content from content where page_namespace = 0 and page_title = \''.pg_escape_string($sArticleName).'\''); - $aP = _templatesToProperties(_parseWikipediaContent($sPageText)); - - if (isset($aP['sInfoboxType'])) { - $aP['sInfoboxType'] = preg_replace('#\\s+#', ' ', $aP['sInfoboxType']); - $sSQL = 'update wikipedia_article set '; - $sSQL .= 'infobox_type = \''.pg_escape_string($aP['sInfoboxType']).'\''; - $sSQL .= ' where language = \'en\' and title = \''.pg_escape_string($sArticleName).'\';'; - $oDB->query($sSQL); - } - if (isset($aP['iPopulation'])) { - $sSQL = 'update wikipedia_article set '; - $sSQL .= 'population = \''.pg_escape_string($aP['iPopulation']).'\''; - $sSQL .= ' where language = \'en\' and title = \''.pg_escape_string($sArticleName).'\';'; - $oDB->query($sSQL); - } - if (isset($aP['sWebsite'])) { - $sSQL = 'update wikipedia_article set '; - $sSQL .= 'website = \''.pg_escape_string($aP['sWebsite']).'\''; - $sSQL .= ' where language = \'en\' and title = \''.pg_escape_string($sArticleName).'\';'; - $oDB->query($sSQL); - } - if (isset($aP['fLat']) && ($aP['fLat']!='-0' || $aP['fLon']!='-0')) { - if (!isset($aP['sInfoboxType'])) $aP['sInfoboxType'] = ''; - echo $sArticleName.'|'.$aP['sInfoboxType'].'|'.$aP['fLat'].'|'.$aP['fLon'] ."\n"; - $sSQL = 'update wikipedia_article set '; - $sSQL .= 'lat = \''.pg_escape_string($aP['fLat']).'\','; - $sSQL .= 'lon = \''.pg_escape_string($aP['fLon']).'\''; - $sSQL .= ' where language = \'en\' and title = \''.pg_escape_string($sArticleName).'\';'; - $oDB->query($sSQL); - } - } -} - - -function nominatimXMLStart($hParser, $sName, $aAttr) -{ - global $aNominatRecords; - switch ($sName) { - case 'PLACE': - $aNominatRecords[] = $aAttr; - break; - } -} - - -function nominatimXMLEnd($hParser, $sName) -{ -} - - -if (isset($aCMDResult['link'])) { - $oDB = new Nominatim\DB(); - $oDB->connect(); - - $aWikiArticles = $oDB->getAll("select * from wikipedia_article where language = 'en' and lat is not null and osm_type is null and totalcount < 31 order by importance desc limit 200000"); - - // If you point this script at production OSM you will be blocked - $sNominatimBaseURL = 'http://SEVERNAME/search.php'; - - foreach ($aWikiArticles as $aRecord) { - $aRecord['name'] = str_replace('_', ' ', $aRecord['title']); - - $sURL = $sNominatimBaseURL.'?format=xml&accept-language=en'; - - echo "\n-- ".$aRecord['name'].', '.$aRecord['infobox_type']."\n"; - $fMaxDist = 0.0000001; - $bUnknown = false; - switch (strtolower($aRecord['infobox_type'])) { - case 'former country': - continue 2; - case 'sea': - $fMaxDist = 60; // effectively turn it off - $sURL .= '&viewbox='.($aRecord['lon']-$fMaxDist).','.($aRecord['lat']+$fMaxDist).','.($aRecord['lon']+$fMaxDist).','.($aRecord['lat']-$fMaxDist); - break; - case 'country': - case 'island': - case 'islands': - case 'continent': - $fMaxDist = 60; // effectively turn it off - $sURL .= '&featuretype=country'; - $sURL .= '&viewbox='.($aRecord['lon']-$fMaxDist).','.($aRecord['lat']+$fMaxDist).','.($aRecord['lon']+$fMaxDist).','.($aRecord['lat']-$fMaxDist); - break; - case 'prefecture japan': - $aRecord['name'] = trim(str_replace(' Prefecture', ' ', $aRecord['name'])); - // intentionally no break - case 'state': - case '#us state': - case 'county': - case 'u.s. state': - case 'u.s. state symbols': - case 'german state': - case 'province or territory of canada': - case 'indian jurisdiction': - case 'province': - case 'french region': - case 'region of italy': - case 'kommune': - case '#australia state or territory': - case 'russian federal subject': - $fMaxDist = 4; - $sURL .= '&featuretype=state'; - $sURL .= '&viewbox='.($aRecord['lon']-$fMaxDist).','.($aRecord['lat']+$fMaxDist).','.($aRecord['lon']+$fMaxDist).','.($aRecord['lat']-$fMaxDist); - break; - case 'protected area': - $fMaxDist = 1; - $sURL .= '&nearlat='.$aRecord['lat']; - $sURL .= '&nearlon='.$aRecord['lon']; - $sURL .= '&viewbox='.($aRecord['lon']-$fMaxDist).','.($aRecord['lat']+$fMaxDist).','.($aRecord['lon']+$fMaxDist).','.($aRecord['lat']-$fMaxDist); - break; - case 'settlement': - $bUnknown = true; - // intentionally no break - case 'french commune': - case 'italian comune': - case 'uk place': - case 'italian comune': - case 'australian place': - case 'german place': - case '#geobox': - case 'u.s. county': - case 'municipality': - case 'city japan': - case 'russian inhabited locality': - case 'finnish municipality/land area': - case 'england county': - case 'israel municipality': - case 'russian city': - case 'city': - $fMaxDist = 0.2; - $sURL .= '&featuretype=settlement'; - $sURL .= '&viewbox='.($aRecord['lon']-0.5).','.($aRecord['lat']+0.5).','.($aRecord['lon']+0.5).','.($aRecord['lat']-0.5); - break; - case 'mountain': - case 'mountain pass': - case 'river': - case 'lake': - case 'airport': - $fMaxDist = 0.2; - $sURL .= '&viewbox='.($aRecord['lon']-0.5).','.($aRecord['lat']+0.5).','.($aRecord['lon']+0.5).','.($aRecord['lat']-0.5); - break; - case 'ship begin': - $fMaxDist = 0.1; - $aTypes = array('wreck'); - $sURL .= '&viewbox='.($aRecord['lon']-0.01).','.($aRecord['lat']+0.01).','.($aRecord['lon']+0.01).','.($aRecord['lat']-0.01); - $sURL .= '&nearlat='.$aRecord['lat']; - $sURL .= '&nearlon='.$aRecord['lon']; - break; - case 'road': - case 'university': - case 'company': - case 'department': - $fMaxDist = 0.005; - $sURL .= '&viewbox='.($aRecord['lon']-0.01).','.($aRecord['lat']+0.01).','.($aRecord['lon']+0.01).','.($aRecord['lat']-0.01); - $sURL .= '&bounded=1'; - $sURL .= '&nearlat='.$aRecord['lat']; - $sURL .= '&nearlon='.$aRecord['lon']; - break; - default: - $bUnknown = true; - $fMaxDist = 0.005; - $sURL .= '&viewbox='.($aRecord['lon']-0.01).','.($aRecord['lat']+0.01).','.($aRecord['lon']+0.01).','.($aRecord['lat']-0.01); - // $sURL .= "&bounded=1"; - $sURL .= '&nearlat='.$aRecord['lat']; - $sURL .= '&nearlon='.$aRecord['lon']; - echo '-- Unknown: '.$aRecord['infobox_type']."\n"; - break; - } - $sNameURL = $sURL.'&q='.urlencode($aRecord['name']); - - var_Dump($sNameURL); - $sXML = file_get_contents($sNameURL); - - $aNominatRecords = array(); - $hXMLParser = xml_parser_create(); - xml_set_element_handler($hXMLParser, 'nominatimXMLStart', 'nominatimXMLEnd'); - xml_parse($hXMLParser, $sXML, true); - xml_parser_free($hXMLParser); - - if (!isset($aNominatRecords[0])) { - $aNameParts = preg_split('#[(,]#', $aRecord['name']); - if (count($aNameParts) > 1) { - $sNameURL = $sURL.'&q='.urlencode(trim($aNameParts[0])); - var_Dump($sNameURL); - $sXML = file_get_contents($sNameURL); - - $aNominatRecords = array(); - $hXMLParser = xml_parser_create(); - xml_set_element_handler($hXMLParser, 'nominatimXMLStart', 'nominatimXMLEnd'); - xml_parse($hXMLParser, $sXML, true); - xml_parser_free($hXMLParser); - } - } - - // assume first is best/right - for ($i = 0; $i < count($aNominatRecords); $i++) { - $fDiff = ($aRecord['lat']-$aNominatRecords[$i]['LAT']) * ($aRecord['lat']-$aNominatRecords[$i]['LAT']); - $fDiff += ($aRecord['lon']-$aNominatRecords[$i]['LON']) * ($aRecord['lon']-$aNominatRecords[$i]['LON']); - $fDiff = sqrt($fDiff); - if ($bUnknown) { - // If it was an unknown type base it on the rank of the found result - $iRank = (int)$aNominatRecords[$i]['PLACE_RANK']; - if ($iRank <= 4) $fMaxDist = 2; - elseif ($iRank <= 8) $fMaxDist = 1; - elseif ($iRank <= 10) $fMaxDist = 0.8; - elseif ($iRank <= 12) $fMaxDist = 0.6; - elseif ($iRank <= 17) $fMaxDist = 0.2; - elseif ($iRank <= 18) $fMaxDist = 0.1; - elseif ($iRank <= 22) $fMaxDist = 0.02; - elseif ($iRank <= 26) $fMaxDist = 0.001; - else $fMaxDist = 0.001; - } - echo '-- FOUND "'.substr($aNominatRecords[$i]['DISPLAY_NAME'], 0, 50); - echo '", '.$aNominatRecords[$i]['CLASS'].', '.$aNominatRecords[$i]['TYPE']; - echo ', '.$aNominatRecords[$i]['PLACE_RANK'].', '.$aNominatRecords[$i]['OSM_TYPE']; - echo " (dist:$fDiff, max:$fMaxDist)\n"; - if ($fDiff > $fMaxDist) { - echo "-- Diff too big $fDiff (max: $fMaxDist)".$aRecord['lat'].','.$aNominatRecords[$i]['LAT'].' & '.$aRecord['lon'].','.$aNominatRecords[$i]['LON']." \n"; - } else { - $sSQL = 'update wikipedia_article set osm_type='; - switch ($aNominatRecords[$i]['OSM_TYPE']) { - case 'relation': - $sSQL .= "'R'"; - break; - case 'way': - $sSQL .= "'W'"; - break; - case 'node': - $sSQL .= "'N'"; - break; - } - $sSQL .= ', osm_id='.$aNominatRecords[$i]['OSM_ID']." where language = '".pg_escape_string($aRecord['language'])."' and title = '".pg_escape_string($aRecord['title'])."'"; - $oDB->query($sSQL); - break; - } - } - } -} diff --git a/wikidata/create.sql b/wikidata/create.sql deleted file mode 100644 index f6327bd0..00000000 --- a/wikidata/create.sql +++ /dev/null @@ -1,75 +0,0 @@ -DROP TABLE entity; -DROP TABLE entity_label; -DROP TABLE entity_description; -DROP TABLE entity_alias; -DROP TABLE entity_link; -DROP TABLE entity_property; - -CREATE TABLE entity ( - entity_id bigint, - title text, - pid bigint, - qid bigint, - datatype text, - CONSTRAINT pk_entity PRIMARY KEY(entity_id) -); - -CREATE TABLE entity_label ( - entity_id bigint, - language text, - label text, - CONSTRAINT pk_entity_label PRIMARY KEY(entity_id,language) -); - -CREATE TABLE entity_description ( - entity_id bigint, - language text, - description text, - CONSTRAINT pk_entity_description PRIMARY KEY(entity_id,language) -); - -CREATE TABLE entity_alias ( - entity_id bigint, - language text, - alias text, - CONSTRAINT pk_entity_alias PRIMARY KEY(entity_id,language,alias) -); - -CREATE TABLE entity_link ( - entity_id bigint, - target text, - value text, - CONSTRAINT pk_entity_link PRIMARY KEY(entity_id,target) -); - -CREATE TABLE entity_link_hit ( - entity_id bigint, - target text, - value text, - hits bigint, - CONSTRAINT pk_entity_link_hit PRIMARY KEY(entity_id,target) -); - -CREATE TABLE link_hit ( - target text, - value text, - hits bigint, - CONSTRAINT pk_link_hit PRIMARY KEY(target,value) -); - -CREATE TABLE entity_property ( - entity_id bigint, - order_id bigint, - pid bigint, - string text, - toqid bigint, - location geometry, - datetime timestamp with time zone, - CONSTRAINT pk_entity_property PRIMARY KEY(entity_id, order_id) -); - -CREATE TABLE import_link_hit ( - target text, - value text, - hits bigint -); diff --git a/wikidata/import.sh b/wikidata/import.sh deleted file mode 100755 index 97621f17..00000000 --- a/wikidata/import.sh +++ /dev/null @@ -1,28 +0,0 @@ -PSQL=/usr/lib/postgresql/9.2/bin/psql -d wikidata - -cat create.sql | $PSQL - -cat entity.csv | $PSQL -c "COPY entity from STDIN WITH CSV" -cat entity_label.csv | $PSQL -c "COPY entity_label from STDIN WITH CSV" -cat entity_description.csv | $PSQL -c "COPY entity_description from STDIN WITH CSV" -cat entity_alias.csv | $PSQL -c "COPY entity_alias from STDIN WITH CSV" -cat entity_link.csv | $PSQL -c "COPY entity_link from STDIN WITH CSV" -cat entity_property.csv | $PSQL -c "COPY entity_property from STDIN WITH CSV" - -$PSQL -c "create index idx_entity_link_target on entity_link using btree (target,value)" -$PSQL -c "create index idx_entity_qid on entity using btree (qid)" -$PSQL -c "create table property_label_en as select pid,null::text as label from entity where pid is not null" -$PSQL -c "update property_label_en set label = x.label from (select pid,label,language from entity join entity_label using (entity_id) where pid is not null and language = 'en') as x where x.pid = property_label_en.pid" -$PSQL -c "create unique index idx_property_label_en on property_label_en using btree (pid)" -$PSQL -c "alter table entity add column label_en text" -$PSQL -c "update entity set label_en = label from entity_label where entity.entity_id = entity_label.entity_id and language = 'en'" -$PSQL -c "alter table entity add column description_en text" -$PSQL -c "update entity set description_en = description from entity_description where entity.entity_id = entity_description.entity_id and language = 'en'" - -cat totals.txt | $PSQL -c "COPY import_link_hit from STDIN WITH CSV DELIMITER ' '" -$PSQL -c "truncate link_hit" -$PSQL -c "insert into link_hit select target||'wiki', replace(catch_decode_url_part(value), '_', ' '), sum(hits) from import_link_hit where replace(catch_decode_url_part(value), '_', ' ') is not null group by target||'wiki', replace(dcatch_decode_url_part(value), '_', ' ')" -$PSQL -c "truncate entity_link_hit" -$PSQL -c "insert into entity_link_hit select entity_id, target, value, coalesce(hits,0) from entity_link left outer join link_hit using (target, value)" -$PSQL -c "create table entity_hit as select entity_id,sum(hits) as hits from entity_link_hit group by entity_id" -$PSQL -c "create unique index idx_entity_hit on entity_hit using btree (entity_id)" diff --git a/wikidata/parse.php b/wikidata/parse.php deleted file mode 100755 index 1bef50ec..00000000 --- a/wikidata/parse.php +++ /dev/null @@ -1,188 +0,0 @@ -#!/usr/bin/php -Cq -') { - $sTitle = substr($sLine, 11, -9); - } elseif (substr($sLine, 0, 8) == ' ') { - $iNS = (int)substr($sLine, 8, -6); - } elseif (substr($sLine, 0, 8) == ' ') { - $iID = (int)substr($sLine, 8, -6); - } elseif (substr($sLine, 0, 33) == ' ') { - if ($iNS == -2) continue; - if ($iNS == -1) continue; - if ($iNS == 1) continue; - if ($iNS == 2) continue; - if ($iNS == 3) continue; - if ($iNS == 4) continue; - if ($iNS == 5) continue; - if ($iNS == 6) continue; - if ($iNS == 7) continue; - if ($iNS == 8) continue; - if ($iNS == 9) continue; - if ($iNS == 10) continue; - if ($iNS == 11) continue; - if ($iNS == 12) continue; - if ($iNS == 13) continue; - if ($iNS == 14) continue; - if ($iNS == 15) continue; - if ($iNS == 121) continue; - if ($iNS == 123) continue; - if ($iNS == 829) continue; - if ($iNS == 1198) continue; - if ($iNS == 1199) continue; - $sText = html_entity_decode(substr($sLine, 33, -8), ENT_COMPAT, 'UTF-8'); - $aArticle = json_decode($sText, true); - - if (array_diff(array_keys($aArticle), array('label', 'description', 'aliases', 'links', 'entity', 'claims', 'datatype')) != array()) { - // DEBUG - var_dump($sTitle); - var_dump(array_keys($aArticle)); - var_dump($aArticle); - exit; - } - - $iPID = $iQID = null; - if ($aArticle['entity'][0] == 'p') { - $iPID = (int) substr($aArticle['entity'], 1); - } elseif ($aArticle['entity'][0] == 'q') { - $iQID = (int) substr($aArticle['entity'], 1); - } else { - continue; - } - - echo '.'; - - fputcsv($hFileEntity, array($iID, $sTitle, $iPID, $iQID, @$aArticle['datatype'])); - - foreach ($aArticle['label'] as $sLang => $sLabel) { - fputcsv($hFileEntityLabel, array($iID, $sLang, $sLabel)); - // echo "insert into entity_label values (".$iID.",'".pg_escape_string($sLang)."','".pg_escape_string($sLabel)."');\n"; - } - - foreach ($aArticle['description'] as $sLang => $sLabel) { - fputcsv($hFileEntityDescription, array($iID, $sLang, $sLabel)); - // echo "insert into entity_description values (".$iID.",'".pg_escape_string($sLang)."','".pg_escape_string($sLabel)."');\n"; - } - - foreach ($aArticle['aliases'] as $sLang => $aLabels) { - $aUniqueAlias = array(); - foreach ($aLabels as $sLabel) { - if (!isset($aUniqueAlias[$sLabel]) && $sLabel) { - fputcsv($hFileEntityAlias, array($iID, $sLang, $sLabel)); - // echo "insert into entity_alias values (".$iID.",'".pg_escape_string($sLang)."','".pg_escape_string($sLabel)."');\n"; - $aUniqueAlias[$sLabel] = true; - } - } - } - - foreach ($aArticle['links'] as $sLang => $sLabel) { - fputcsv($hFileEntityLink, array($iID, $sLang, $sLabel)); - // echo "insert into entity_link values (".$iID.",'".pg_escape_string($sLang)."','".pg_escape_string($sLabel)."');\n"; - } - - - if (isset($aArticle['claims'])) { - // - foreach ($aArticle['claims'] as $iClaim => $aClaim) { - // - $bFail = false; - if ($aClaim['m'][0] == 'novalue') continue; - if ($aClaim['m'][0] == 'somevalue') continue; - $iPID = (int)$aClaim['m'][1]; - if ($aClaim['m'][0] != 'value') $bFail = true; - if ($aClaim['m'][2]== 'wikibase-entityid') { - // - if ($aClaim['m'][3]['entity-type'] != 'item') $bFail = true; - fputcsv($hFileEntityProperty, array($iID, $iClaim, $iPID, null, $aClaim['m'][3]['numeric-id'], null, null)); - // echo "insert into entity_property values (nextval('seq_entity_property'),".$iID.",".$iPID.",null,".$aClaim['m'][3]['numeric-id'].",null);\n"; - } elseif ($aClaim['m'][2] == 'globecoordinate') { - // - if ($aClaim['m'][3]['globe'] != 'http://www.wikidata.org/entity/Q2') $bFail = true; - fputcsv( - $hFileEntityProperty, - array( - $iID, - $iClaim, - $iPID, - null, - null, - 'SRID=4326;POINT('.((float) $aClaim['m'][3]['longitude']).' '.((float)$aClaim['m'][3]['latitude']).')', null - ) - ); - /* echo "insert into entity_property values (nextval('seq_entity_property'),"; - * echo $iID.",".$iPID.",null,null,ST_SetSRID(ST_MakePoint(".((float)$aClaim['m'][3]['longitude']); - * echo ", ".((float)$aClaim['m'][3]['latitude'])."),4326));\n"; - */ - } elseif ($aClaim['m'][2] == 'time') { - // TODO! - /* - if ($aClaim['m'][3]['calendarmodel'] == 'http://www.wikidata.org/entity/Q1985727') { - // Gregorian - if (preg_match('#(\\+|-)0*([0-9]{4})-([0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2})Z#', $aClaim['m'][3]['time'], $aMatch)) { - if ((int)$aMatch[2] < 4700 && ) { - $sDateString = $aMatch[2].'-'.$aMatch[3].($aClaim['m'][3]['timezone']>=0?'+':'').$aClaim['m'][3]['timezone'].($aMatch[1]=='-'?' bc':''); - fputcsv($hFileEntityProperty, array($iID,$iClaim,$iPID,null,null,null,$sDateString)); - } - } else { - // $bFail = true; - } - } elseif ( $aClaim['m'][3]['calendarmodel'] != 'http://www.wikidata.org/entity/Q1985786') { - // Julian - if (preg_match('#(\\+|-)0*([0-9]{4})-([0-9]{2})-([0-9]{2})T([0-9]{2}:[0-9]{2}:[0-9]{2})Z#', $aClaim['m'][3]['time'], $aMatch)) { - var_dump($aMatch); - exit; - $iDayCount = juliantojd(2, 11, 1732); - var_dump($iDayCount, jdtogregorian($iDayCount)); - } else { - $bFail = true; - exit; - } - exit; - } else { - // $bFail = true; - } - */ - } elseif ($aClaim['m'][2] == 'string') { - // echo "insert into entity_property values (nextval('seq_entity_property'),".$iID.",".$iPID.",'".pg_escape_string($aClaim['m'][3])."',null,null);\n"; - fputcsv($hFileEntityProperty, array($iID, $iClaim, $iPID, $aClaim['m'][3], null, null, null)); - } else { - $bFail = true; - } - - // Don't care about sources: if ($aClaim['refs'] != array()) $bFail = true; - - if ($bFail) { - var_dump($sTitle); - var_dump($aClaim); - } else { - // process - } - } - } - } - } - fclose($hFile); - fclose($hFileEntity); - fclose($hFileEntityLabel); - fclose($hFileEntityDescription); - fclose($hFileEntityAlias); - fclose($hFileEntityLink); - fclose($hFileEntityProperty); -} diff --git a/wikidata/pull_wikipedia_logs.php b/wikidata/pull_wikipedia_logs.php deleted file mode 100644 index 3e172069..00000000 --- a/wikidata/pull_wikipedia_logs.php +++ /dev/null @@ -1,79 +0,0 @@ - hour.txt'); - - $hPrevTotals = @fopen('totals.txt', 'r'); - $hDayTotals = @fopen('hour.txt', 'r'); - $hNewTotals = @fopen('newtotals.txt', 'w'); - - $sPrevKey = $sDayKey = true; - $sPrevLine = true; - $sDayLine = true; - - do { - if ($sPrevKey === $sDayKey) { - if ($sPrevLine !== true) fputs($hNewTotals, "$sPrevKey ".($iPrevValue+$iDayValue)."\n"); - $sPrevLine = true; - $sDayLine = true; - } elseif ($sDayKey !== false && ($sPrevKey > $sDayKey || $sPrevKey === false)) { - fputs($hNewTotals, "$sDayKey ".($iDayValue)."\n"); - $sDayLine = true; - } elseif ($sPrevKey !== false && ($sDayKey > $sPrevKey || $sDayKey === false)) { - fputs($hNewTotals, "$sPrevKey ".($iPrevValue)."\n"); - $sPrevLine = true; - } - - if ($sPrevLine === true) { - $sPrevLine = $hPrevTotals?fgets($hPrevTotals, 4096):false; - if ($sPrevLine !== false) { - $aPrevLine = explode(' ', $sPrevLine); - $sPrevKey = $aPrevLine[0].' '.$aPrevLine[1]; - $iPrevValue = (int)$aPrevLine[2]; - } else { - $sPrevKey = false; - $iPrevValue = 0; - } - } - - if ($sDayLine === true) { - $sDayLine = $hDayTotals?fgets($hDayTotals, 4096):false; - if ($sDayLine !== false) { - preg_match('#^([a-z]{2}) ([^ :]+) ([0-9]+) [0-9]+$#', $sDayLine, $aMatch); - $sDayKey = $aMatch[1].' '.$aMatch[2]; - $iDayValue = (int)$aMatch[3]; - } else { - $sDayKey = false; - $iDayValue = 0; - } - } - } while ($sPrevLine !== false || $sDayLine !== false); - - @fclose($hPrevTotals); - @fclose($hDayTotals); - @fclose($hNewTotals); - - @unlink('totals.txt'); - rename('newtotals.txt', 'totals.txt'); - } -} - -// Notes: -/* - gzip -dc $FILE.gz | grep -e "^en [^ :]\+ [0-9]\+" | - sed "s#\(^[a-z]\{2\}\) \([^ :]\+\) \([0-9]\+\) [0-9]\+#update wikipedia_article set hit_count = coalesce(hit_count,0) + \3 where language = '\1' - and title = catch_decode_url_part('\2');#g" | /opt/mapquest/stdbase-dev$ - cat totals.txt | sed "s#\(^[a-z]\{2\}\) \([^ ]\+\) \([0-9]\+\)\$#update entity_link set hits = s,0) + \3 where target = '\1wiki' and value = catch_decode_url_part('\2');#g" - cat totals.txt | sed "s#\(^[a-z]\{2\}\) \([^ ]\+\) \([0-9]\+\)\$#update entity_link set hits = coalesce(hits,0) + \3 where target = '\1wiki' and value = catch_decode_url_part('\2');#g" -*/ -- 2.39.5