X-Git-Url: https://git.openstreetmap.org./nominatim.git/blobdiff_plain/4a321487f3d44241242a1729ab4d137df1f57bfb..75f951d254127d8857b6ad95cac241917f88e542:/utils/importWikipedia.php diff --git a/utils/importWikipedia.php b/utils/importWikipedia.php old mode 100755 new mode 100644 index 0e532cd9..2e256e35 --- a/utils/importWikipedia.php +++ b/utils/importWikipedia.php @@ -1,13 +1,11 @@ -#!/usr/bin/php -Cq connect(); if ($aCMDResult['drop-tables']) { @@ -241,7 +240,7 @@ function _templatesToProperties($aTemplates) } // Assume the first template with lots of params is the type (fallback for infobox) - if (!isset($aPageProperties['sPossibleInfoboxType']) && sizeof($aParams) > 10) { + if (!isset($aPageProperties['sPossibleInfoboxType']) && count($aParams) > 10) { $aPageProperties['sPossibleInfoboxType'] = trim($aTemplate[0]); // $aPageProperties['aInfoboxParams'] = $aParams; } @@ -306,9 +305,17 @@ function _templatesToProperties($aTemplates) } if (isset($aCMDResult['parse-wikipedia'])) { - $oDB =& getDB(); - $aArticleNames = $oDB->getCol('select page_title from content where page_namespace = 0 and page_id %10 = '.$aCMDResult['parse-wikipedia'].' and (page_content ilike \'%{{Coord%\' or (page_content ilike \'%lat%\' and page_content ilike \'%lon%\'))'); - // $aArticleNames = $oDB->getCol($sSQL = 'select page_title from content where page_namespace = 0 and (page_content ilike \'%{{Coord%\' or (page_content ilike \'%lat%\' and page_content ilike \'%lon%\')) and page_title in (\'Virginia\')'); + $oDB = new Nominatim\DB(); + $oDB->connect(); + + $sSQL = 'select page_title from content where page_namespace = 0 and page_id %10 = '; + $sSQL .= $aCMDResult['parse-wikipedia']; + $sSQL .= ' and (page_content ilike \'%{{Coord%\' or (page_content ilike \'%lat%\' and page_content ilike \'%lon%\'))'; + $aArticleNames = $oDB->getCol($sSQL); + /* $aArticleNames = $oDB->getCol($sSQL = 'select page_title from content where page_namespace = 0 + and (page_content ilike \'%{{Coord%\' or (page_content ilike \'%lat%\' + and page_content ilike \'%lon%\')) and page_title in (\'Virginia\')'); + */ foreach ($aArticleNames as $sArticleName) { $sPageText = $oDB->getOne('select page_content from content where page_namespace = 0 and page_title = \''.pg_escape_string($sArticleName).'\''); $aP = _templatesToProperties(_parseWikipediaContent($sPageText)); @@ -362,7 +369,9 @@ function nominatimXMLEnd($hParser, $sName) if (isset($aCMDResult['link'])) { - $oDB =& getDB(); + $oDB = new Nominatim\DB(); + $oDB->connect(); + $aWikiArticles = $oDB->getAll("select * from wikipedia_article where language = 'en' and lat is not null and osm_type is null and totalcount < 31 order by importance desc limit 200000"); // If you point this script at production OSM you will be blocked @@ -373,7 +382,7 @@ if (isset($aCMDResult['link'])) { $sURL = $sNominatimBaseURL.'?format=xml&accept-language=en'; - echo "\n-- ".$aRecord['name'].", ".$aRecord['infobox_type']."\n"; + echo "\n-- ".$aRecord['name'].', '.$aRecord['infobox_type']."\n"; $fMaxDist = 0.0000001; $bUnknown = false; switch (strtolower($aRecord['infobox_type'])) { @@ -381,19 +390,19 @@ if (isset($aCMDResult['link'])) { continue 2; case 'sea': $fMaxDist = 60; // effectively turn it off - $sURL .= "&viewbox=".($aRecord['lon']-$fMaxDist).",".($aRecord['lat']+$fMaxDist).",".($aRecord['lon']+$fMaxDist).",".($aRecord['lat']-$fMaxDist); + $sURL .= '&viewbox='.($aRecord['lon']-$fMaxDist).','.($aRecord['lat']+$fMaxDist).','.($aRecord['lon']+$fMaxDist).','.($aRecord['lat']-$fMaxDist); break; case 'country': case 'island': case 'islands': case 'continent': $fMaxDist = 60; // effectively turn it off - $sURL .= "&featuretype=country"; - $sURL .= "&viewbox=".($aRecord['lon']-$fMaxDist).",".($aRecord['lat']+$fMaxDist).",".($aRecord['lon']+$fMaxDist).",".($aRecord['lat']-$fMaxDist); + $sURL .= '&featuretype=country'; + $sURL .= '&viewbox='.($aRecord['lon']-$fMaxDist).','.($aRecord['lat']+$fMaxDist).','.($aRecord['lon']+$fMaxDist).','.($aRecord['lat']-$fMaxDist); break; case 'prefecture japan': $aRecord['name'] = trim(str_replace(' Prefecture', ' ', $aRecord['name'])); - break; + // intentionally no break case 'state': case '#us state': case 'county': @@ -409,18 +418,18 @@ if (isset($aCMDResult['link'])) { case '#australia state or territory': case 'russian federal subject': $fMaxDist = 4; - $sURL .= "&featuretype=state"; - $sURL .= "&viewbox=".($aRecord['lon']-$fMaxDist).",".($aRecord['lat']+$fMaxDist).",".($aRecord['lon']+$fMaxDist).",".($aRecord['lat']-$fMaxDist); + $sURL .= '&featuretype=state'; + $sURL .= '&viewbox='.($aRecord['lon']-$fMaxDist).','.($aRecord['lat']+$fMaxDist).','.($aRecord['lon']+$fMaxDist).','.($aRecord['lat']-$fMaxDist); break; case 'protected area': $fMaxDist = 1; - $sURL .= "&nearlat=".$aRecord['lat']; - $sURL .= "&nearlon=".$aRecord['lon']; - $sURL .= "&viewbox=".($aRecord['lon']-$fMaxDist).",".($aRecord['lat']+$fMaxDist).",".($aRecord['lon']+$fMaxDist).",".($aRecord['lat']-$fMaxDist); + $sURL .= '&nearlat='.$aRecord['lat']; + $sURL .= '&nearlon='.$aRecord['lon']; + $sURL .= '&viewbox='.($aRecord['lon']-$fMaxDist).','.($aRecord['lat']+$fMaxDist).','.($aRecord['lon']+$fMaxDist).','.($aRecord['lat']-$fMaxDist); break; case 'settlement': $bUnknown = true; - break; + // intentionally no break case 'french commune': case 'italian comune': case 'uk place': @@ -438,8 +447,8 @@ if (isset($aCMDResult['link'])) { case 'russian city': case 'city': $fMaxDist = 0.2; - $sURL .= "&featuretype=settlement"; - $sURL .= "&viewbox=".($aRecord['lon']-0.5).",".($aRecord['lat']+0.5).",".($aRecord['lon']+0.5).",".($aRecord['lat']-0.5); + $sURL .= '&featuretype=settlement'; + $sURL .= '&viewbox='.($aRecord['lon']-0.5).','.($aRecord['lat']+0.5).','.($aRecord['lon']+0.5).','.($aRecord['lat']-0.5); break; case 'mountain': case 'mountain pass': @@ -447,33 +456,33 @@ if (isset($aCMDResult['link'])) { case 'lake': case 'airport': $fMaxDist = 0.2; - $sURL .= "&viewbox=".($aRecord['lon']-0.5).",".($aRecord['lat']+0.5).",".($aRecord['lon']+0.5).",".($aRecord['lat']-0.5); + $sURL .= '&viewbox='.($aRecord['lon']-0.5).','.($aRecord['lat']+0.5).','.($aRecord['lon']+0.5).','.($aRecord['lat']-0.5); break; case 'ship begin': $fMaxDist = 0.1; $aTypes = array('wreck'); - $sURL .= "&viewbox=".($aRecord['lon']-0.01).",".($aRecord['lat']+0.01).",".($aRecord['lon']+0.01).",".($aRecord['lat']-0.01); - $sURL .= "&nearlat=".$aRecord['lat']; - $sURL .= "&nearlon=".$aRecord['lon']; + $sURL .= '&viewbox='.($aRecord['lon']-0.01).','.($aRecord['lat']+0.01).','.($aRecord['lon']+0.01).','.($aRecord['lat']-0.01); + $sURL .= '&nearlat='.$aRecord['lat']; + $sURL .= '&nearlon='.$aRecord['lon']; break; case 'road': case 'university': case 'company': case 'department': $fMaxDist = 0.005; - $sURL .= "&viewbox=".($aRecord['lon']-0.01).",".($aRecord['lat']+0.01).",".($aRecord['lon']+0.01).",".($aRecord['lat']-0.01); - $sURL .= "&bounded=1"; - $sURL .= "&nearlat=".$aRecord['lat']; - $sURL .= "&nearlon=".$aRecord['lon']; + $sURL .= '&viewbox='.($aRecord['lon']-0.01).','.($aRecord['lat']+0.01).','.($aRecord['lon']+0.01).','.($aRecord['lat']-0.01); + $sURL .= '&bounded=1'; + $sURL .= '&nearlat='.$aRecord['lat']; + $sURL .= '&nearlon='.$aRecord['lon']; break; default: $bUnknown = true; $fMaxDist = 0.005; - $sURL .= "&viewbox=".($aRecord['lon']-0.01).",".($aRecord['lat']+0.01).",".($aRecord['lon']+0.01).",".($aRecord['lat']-0.01); + $sURL .= '&viewbox='.($aRecord['lon']-0.01).','.($aRecord['lat']+0.01).','.($aRecord['lon']+0.01).','.($aRecord['lat']-0.01); // $sURL .= "&bounded=1"; - $sURL .= "&nearlat=".$aRecord['lat']; - $sURL .= "&nearlon=".$aRecord['lon']; - echo "-- Unknown: ".$aRecord['infobox_type']."\n"; + $sURL .= '&nearlat='.$aRecord['lat']; + $sURL .= '&nearlon='.$aRecord['lon']; + echo '-- Unknown: '.$aRecord['infobox_type']."\n"; break; } $sNameURL = $sURL.'&q='.urlencode($aRecord['name']); @@ -489,7 +498,7 @@ if (isset($aCMDResult['link'])) { if (!isset($aNominatRecords[0])) { $aNameParts = preg_split('#[(,]#', $aRecord['name']); - if (sizeof($aNameParts) > 1) { + if (count($aNameParts) > 1) { $sNameURL = $sURL.'&q='.urlencode(trim($aNameParts[0])); var_Dump($sNameURL); $sXML = file_get_contents($sNameURL); @@ -503,7 +512,7 @@ if (isset($aCMDResult['link'])) { } // assume first is best/right - for ($i = 0; $i < sizeof($aNominatRecords); $i++) { + for ($i = 0; $i < count($aNominatRecords); $i++) { $fDiff = ($aRecord['lat']-$aNominatRecords[$i]['LAT']) * ($aRecord['lat']-$aNominatRecords[$i]['LAT']); $fDiff += ($aRecord['lon']-$aNominatRecords[$i]['LON']) * ($aRecord['lon']-$aNominatRecords[$i]['LON']); $fDiff = sqrt($fDiff); @@ -520,11 +529,14 @@ if (isset($aCMDResult['link'])) { elseif ($iRank <= 26) $fMaxDist = 0.001; else $fMaxDist = 0.001; } - echo "-- FOUND \"".substr($aNominatRecords[$i]['DISPLAY_NAME'], 0, 50)."\", ".$aNominatRecords[$i]['CLASS'].", ".$aNominatRecords[$i]['TYPE'].", ".$aNominatRecords[$i]['PLACE_RANK'].", ".$aNominatRecords[$i]['OSM_TYPE']." (dist:$fDiff, max:$fMaxDist)\n"; + echo '-- FOUND "'.substr($aNominatRecords[$i]['DISPLAY_NAME'], 0, 50); + echo '", '.$aNominatRecords[$i]['CLASS'].', '.$aNominatRecords[$i]['TYPE']; + echo ', '.$aNominatRecords[$i]['PLACE_RANK'].', '.$aNominatRecords[$i]['OSM_TYPE']; + echo " (dist:$fDiff, max:$fMaxDist)\n"; if ($fDiff > $fMaxDist) { echo "-- Diff too big $fDiff (max: $fMaxDist)".$aRecord['lat'].','.$aNominatRecords[$i]['LAT'].' & '.$aRecord['lon'].','.$aNominatRecords[$i]['LON']." \n"; } else { - $sSQL = "update wikipedia_article set osm_type="; + $sSQL = 'update wikipedia_article set osm_type='; switch ($aNominatRecords[$i]['OSM_TYPE']) { case 'relation': $sSQL .= "'R'"; @@ -536,7 +548,7 @@ if (isset($aCMDResult['link'])) { $sSQL .= "'N'"; break; } - $sSQL .= ", osm_id=".$aNominatRecords[$i]['OSM_ID']." where language = '".pg_escape_string($aRecord['language'])."' and title = '".pg_escape_string($aRecord['title'])."'"; + $sSQL .= ', osm_id='.$aNominatRecords[$i]['OSM_ID']." where language = '".pg_escape_string($aRecord['language'])."' and title = '".pg_escape_string($aRecord['title'])."'"; $oDB->query($sSQL); break; }