]> git.openstreetmap.org Git - nominatim.git/blobdiff - utils/importWikipedia.php
Restructure script and website installation
[nominatim.git] / utils / importWikipedia.php
old mode 100755 (executable)
new mode 100644 (file)
index 0e532cd..06db603
@@ -1,13 +1,11 @@
-#!/usr/bin/php -Cq
 <?php
 
 <?php
 
-require_once(dirname(dirname(__FILE__)).'/settings/settings.php');
 require_once(CONST_BasePath.'/lib/init-cmd.php');
 ini_set('memory_limit', '800M');
 
 $aCMDOptions
  = array(
 require_once(CONST_BasePath.'/lib/init-cmd.php');
 ini_set('memory_limit', '800M');
 
 $aCMDOptions
  = array(
-    "Create and setup nominatim search system",
+    'Create and setup nominatim search system',
     array('help', 'h', 0, 1, 0, 0, false, 'Show Help'),
     array('quiet', 'q', 0, 1, 0, 0, 'bool', 'Quiet output'),
     array('verbose', 'v', 0, 1, 0, 0, 'bool', 'Verbose output'),
     array('help', 'h', 0, 1, 0, 0, false, 'Show Help'),
     array('quiet', 'q', 0, 1, 0, 0, 'bool', 'Quiet output'),
     array('verbose', 'v', 0, 1, 0, 0, 'bool', 'Verbose output'),
@@ -37,7 +35,7 @@ $sTestPageText = <<<EOD
 | coasters = 12
 | water_rides = 2
 | owner = [[Six Flags]]
 | coasters = 12
 | water_rides = 2
 | owner = [[Six Flags]]
-| general_manager = 
+| general_manager =
 | homepage = [http://www.sixflags.com/parks/greatadventure/ Six Flags Great Adventure]
 }}
 EOD;
 | homepage = [http://www.sixflags.com/parks/greatadventure/ Six Flags Great Adventure]
 }}
 EOD;
@@ -241,7 +239,7 @@ function _templatesToProperties($aTemplates)
         }
 
         // Assume the first template with lots of params is the type (fallback for infobox)
         }
 
         // Assume the first template with lots of params is the type (fallback for infobox)
-        if (!isset($aPageProperties['sPossibleInfoboxType']) && sizeof($aParams) > 10) {
+        if (!isset($aPageProperties['sPossibleInfoboxType']) && count($aParams) > 10) {
             $aPageProperties['sPossibleInfoboxType'] = trim($aTemplate[0]);
             // $aPageProperties['aInfoboxParams'] = $aParams;
         }
             $aPageProperties['sPossibleInfoboxType'] = trim($aTemplate[0]);
             // $aPageProperties['aInfoboxParams'] = $aParams;
         }
@@ -307,8 +305,14 @@ function _templatesToProperties($aTemplates)
 
 if (isset($aCMDResult['parse-wikipedia'])) {
     $oDB =& getDB();
 
 if (isset($aCMDResult['parse-wikipedia'])) {
     $oDB =& getDB();
-    $aArticleNames = $oDB->getCol('select page_title from content where page_namespace = 0 and page_id %10 = '.$aCMDResult['parse-wikipedia'].' and (page_content ilike \'%{{Coord%\' or (page_content ilike \'%lat%\' and page_content ilike \'%lon%\'))');
-    // $aArticleNames = $oDB->getCol($sSQL = 'select page_title from content where page_namespace = 0 and (page_content ilike \'%{{Coord%\' or (page_content ilike \'%lat%\' and page_content ilike \'%lon%\')) and page_title in (\'Virginia\')');
+    $sSQL = 'select page_title from content where page_namespace = 0 and page_id %10 = ';
+    $sSQL .= $aCMDResult['parse-wikipedia'];
+    $sSQL .= ' and (page_content ilike \'%{{Coord%\' or (page_content ilike \'%lat%\' and page_content ilike \'%lon%\'))';
+    $aArticleNames = $oDB->getCol($sSQL);
+    /* $aArticleNames = $oDB->getCol($sSQL = 'select page_title from content where page_namespace = 0
+        and (page_content ilike \'%{{Coord%\' or (page_content ilike \'%lat%\'
+        and page_content ilike \'%lon%\')) and page_title in (\'Virginia\')');
+     */
     foreach ($aArticleNames as $sArticleName) {
         $sPageText = $oDB->getOne('select page_content from content where page_namespace = 0 and page_title = \''.pg_escape_string($sArticleName).'\'');
         $aP = _templatesToProperties(_parseWikipediaContent($sPageText));
     foreach ($aArticleNames as $sArticleName) {
         $sPageText = $oDB->getOne('select page_content from content where page_namespace = 0 and page_title = \''.pg_escape_string($sArticleName).'\'');
         $aP = _templatesToProperties(_parseWikipediaContent($sPageText));
@@ -373,7 +377,7 @@ if (isset($aCMDResult['link'])) {
 
         $sURL = $sNominatimBaseURL.'?format=xml&accept-language=en';
 
 
         $sURL = $sNominatimBaseURL.'?format=xml&accept-language=en';
 
-        echo "\n-- ".$aRecord['name'].", ".$aRecord['infobox_type']."\n";
+        echo "\n-- ".$aRecord['name'].', '.$aRecord['infobox_type']."\n";
         $fMaxDist = 0.0000001;
         $bUnknown = false;
         switch (strtolower($aRecord['infobox_type'])) {
         $fMaxDist = 0.0000001;
         $bUnknown = false;
         switch (strtolower($aRecord['infobox_type'])) {
@@ -381,19 +385,19 @@ if (isset($aCMDResult['link'])) {
                 continue 2;
             case 'sea':
                 $fMaxDist = 60; // effectively turn it off
                 continue 2;
             case 'sea':
                 $fMaxDist = 60; // effectively turn it off
-                $sURL .= "&viewbox=".($aRecord['lon']-$fMaxDist).",".($aRecord['lat']+$fMaxDist).",".($aRecord['lon']+$fMaxDist).",".($aRecord['lat']-$fMaxDist);
+                $sURL .= '&viewbox='.($aRecord['lon']-$fMaxDist).','.($aRecord['lat']+$fMaxDist).','.($aRecord['lon']+$fMaxDist).','.($aRecord['lat']-$fMaxDist);
                 break;
             case 'country':
             case 'island':
             case 'islands':
             case 'continent':
                 $fMaxDist = 60; // effectively turn it off
                 break;
             case 'country':
             case 'island':
             case 'islands':
             case 'continent':
                 $fMaxDist = 60; // effectively turn it off
-                $sURL .= "&featuretype=country";
-                $sURL .= "&viewbox=".($aRecord['lon']-$fMaxDist).",".($aRecord['lat']+$fMaxDist).",".($aRecord['lon']+$fMaxDist).",".($aRecord['lat']-$fMaxDist);
+                $sURL .= '&featuretype=country';
+                $sURL .= '&viewbox='.($aRecord['lon']-$fMaxDist).','.($aRecord['lat']+$fMaxDist).','.($aRecord['lon']+$fMaxDist).','.($aRecord['lat']-$fMaxDist);
                 break;
             case 'prefecture japan':
                 $aRecord['name'] = trim(str_replace(' Prefecture', ' ', $aRecord['name']));
                 break;
             case 'prefecture japan':
                 $aRecord['name'] = trim(str_replace(' Prefecture', ' ', $aRecord['name']));
-                break;
+                // intentionally no break
             case 'state':
             case '#us state':
             case 'county':
             case 'state':
             case '#us state':
             case 'county':
@@ -409,18 +413,18 @@ if (isset($aCMDResult['link'])) {
             case '#australia state or territory':
             case 'russian federal subject':
                 $fMaxDist = 4;
             case '#australia state or territory':
             case 'russian federal subject':
                 $fMaxDist = 4;
-                $sURL .= "&featuretype=state";
-                $sURL .= "&viewbox=".($aRecord['lon']-$fMaxDist).",".($aRecord['lat']+$fMaxDist).",".($aRecord['lon']+$fMaxDist).",".($aRecord['lat']-$fMaxDist);
+                $sURL .= '&featuretype=state';
+                $sURL .= '&viewbox='.($aRecord['lon']-$fMaxDist).','.($aRecord['lat']+$fMaxDist).','.($aRecord['lon']+$fMaxDist).','.($aRecord['lat']-$fMaxDist);
                 break;
             case 'protected area':
                 $fMaxDist = 1;
                 break;
             case 'protected area':
                 $fMaxDist = 1;
-                $sURL .= "&nearlat=".$aRecord['lat'];
-                $sURL .= "&nearlon=".$aRecord['lon'];
-                $sURL .= "&viewbox=".($aRecord['lon']-$fMaxDist).",".($aRecord['lat']+$fMaxDist).",".($aRecord['lon']+$fMaxDist).",".($aRecord['lat']-$fMaxDist);
+                $sURL .= '&nearlat='.$aRecord['lat'];
+                $sURL .= '&nearlon='.$aRecord['lon'];
+                $sURL .= '&viewbox='.($aRecord['lon']-$fMaxDist).','.($aRecord['lat']+$fMaxDist).','.($aRecord['lon']+$fMaxDist).','.($aRecord['lat']-$fMaxDist);
                 break;
             case 'settlement':
                 $bUnknown = true;
                 break;
             case 'settlement':
                 $bUnknown = true;
-                break;
+                // intentionally no break
             case 'french commune':
             case 'italian comune':
             case 'uk place':
             case 'french commune':
             case 'italian comune':
             case 'uk place':
@@ -438,8 +442,8 @@ if (isset($aCMDResult['link'])) {
             case 'russian city':
             case 'city':
                 $fMaxDist = 0.2;
             case 'russian city':
             case 'city':
                 $fMaxDist = 0.2;
-                $sURL .= "&featuretype=settlement";
-                $sURL .= "&viewbox=".($aRecord['lon']-0.5).",".($aRecord['lat']+0.5).",".($aRecord['lon']+0.5).",".($aRecord['lat']-0.5);
+                $sURL .= '&featuretype=settlement';
+                $sURL .= '&viewbox='.($aRecord['lon']-0.5).','.($aRecord['lat']+0.5).','.($aRecord['lon']+0.5).','.($aRecord['lat']-0.5);
                 break;
             case 'mountain':
             case 'mountain pass':
                 break;
             case 'mountain':
             case 'mountain pass':
@@ -447,33 +451,33 @@ if (isset($aCMDResult['link'])) {
             case 'lake':
             case 'airport':
                 $fMaxDist = 0.2;
             case 'lake':
             case 'airport':
                 $fMaxDist = 0.2;
-                $sURL .= "&viewbox=".($aRecord['lon']-0.5).",".($aRecord['lat']+0.5).",".($aRecord['lon']+0.5).",".($aRecord['lat']-0.5);
+                $sURL .= '&viewbox='.($aRecord['lon']-0.5).','.($aRecord['lat']+0.5).','.($aRecord['lon']+0.5).','.($aRecord['lat']-0.5);
                 break;
             case 'ship begin':
                 $fMaxDist = 0.1;
                 $aTypes = array('wreck');
                 break;
             case 'ship begin':
                 $fMaxDist = 0.1;
                 $aTypes = array('wreck');
-                $sURL .= "&viewbox=".($aRecord['lon']-0.01).",".($aRecord['lat']+0.01).",".($aRecord['lon']+0.01).",".($aRecord['lat']-0.01);
-                $sURL .= "&nearlat=".$aRecord['lat'];
-                $sURL .= "&nearlon=".$aRecord['lon'];
+                $sURL .= '&viewbox='.($aRecord['lon']-0.01).','.($aRecord['lat']+0.01).','.($aRecord['lon']+0.01).','.($aRecord['lat']-0.01);
+                $sURL .= '&nearlat='.$aRecord['lat'];
+                $sURL .= '&nearlon='.$aRecord['lon'];
                 break;
             case 'road':
             case 'university':
             case 'company':
             case 'department':
                 $fMaxDist = 0.005;
                 break;
             case 'road':
             case 'university':
             case 'company':
             case 'department':
                 $fMaxDist = 0.005;
-                $sURL .= "&viewbox=".($aRecord['lon']-0.01).",".($aRecord['lat']+0.01).",".($aRecord['lon']+0.01).",".($aRecord['lat']-0.01);
-                $sURL .= "&bounded=1";
-                $sURL .= "&nearlat=".$aRecord['lat'];
-                $sURL .= "&nearlon=".$aRecord['lon'];
+                $sURL .= '&viewbox='.($aRecord['lon']-0.01).','.($aRecord['lat']+0.01).','.($aRecord['lon']+0.01).','.($aRecord['lat']-0.01);
+                $sURL .= '&bounded=1';
+                $sURL .= '&nearlat='.$aRecord['lat'];
+                $sURL .= '&nearlon='.$aRecord['lon'];
                 break;
             default:
                 $bUnknown = true;
                 $fMaxDist = 0.005;
                 break;
             default:
                 $bUnknown = true;
                 $fMaxDist = 0.005;
-                $sURL .= "&viewbox=".($aRecord['lon']-0.01).",".($aRecord['lat']+0.01).",".($aRecord['lon']+0.01).",".($aRecord['lat']-0.01);
+                $sURL .= '&viewbox='.($aRecord['lon']-0.01).','.($aRecord['lat']+0.01).','.($aRecord['lon']+0.01).','.($aRecord['lat']-0.01);
                 // $sURL .= "&bounded=1";
                 // $sURL .= "&bounded=1";
-                $sURL .= "&nearlat=".$aRecord['lat'];
-                $sURL .= "&nearlon=".$aRecord['lon'];
-                echo "-- Unknown: ".$aRecord['infobox_type']."\n";
+                $sURL .= '&nearlat='.$aRecord['lat'];
+                $sURL .= '&nearlon='.$aRecord['lon'];
+                echo '-- Unknown: '.$aRecord['infobox_type']."\n";
                 break;
         }
         $sNameURL = $sURL.'&q='.urlencode($aRecord['name']);
                 break;
         }
         $sNameURL = $sURL.'&q='.urlencode($aRecord['name']);
@@ -489,7 +493,7 @@ if (isset($aCMDResult['link'])) {
 
         if (!isset($aNominatRecords[0])) {
             $aNameParts = preg_split('#[(,]#', $aRecord['name']);
 
         if (!isset($aNominatRecords[0])) {
             $aNameParts = preg_split('#[(,]#', $aRecord['name']);
-            if (sizeof($aNameParts) > 1) {
+            if (count($aNameParts) > 1) {
                 $sNameURL = $sURL.'&q='.urlencode(trim($aNameParts[0]));
                 var_Dump($sNameURL);
                 $sXML = file_get_contents($sNameURL);
                 $sNameURL = $sURL.'&q='.urlencode(trim($aNameParts[0]));
                 var_Dump($sNameURL);
                 $sXML = file_get_contents($sNameURL);
@@ -503,7 +507,7 @@ if (isset($aCMDResult['link'])) {
         }
 
         // assume first is best/right
         }
 
         // assume first is best/right
-        for ($i = 0; $i < sizeof($aNominatRecords); $i++) {
+        for ($i = 0; $i < count($aNominatRecords); $i++) {
             $fDiff = ($aRecord['lat']-$aNominatRecords[$i]['LAT']) * ($aRecord['lat']-$aNominatRecords[$i]['LAT']);
             $fDiff += ($aRecord['lon']-$aNominatRecords[$i]['LON']) * ($aRecord['lon']-$aNominatRecords[$i]['LON']);
             $fDiff = sqrt($fDiff);
             $fDiff = ($aRecord['lat']-$aNominatRecords[$i]['LAT']) * ($aRecord['lat']-$aNominatRecords[$i]['LAT']);
             $fDiff += ($aRecord['lon']-$aNominatRecords[$i]['LON']) * ($aRecord['lon']-$aNominatRecords[$i]['LON']);
             $fDiff = sqrt($fDiff);
@@ -520,11 +524,14 @@ if (isset($aCMDResult['link'])) {
                 elseif ($iRank <= 26) $fMaxDist = 0.001;
                 else $fMaxDist = 0.001;
             }
                 elseif ($iRank <= 26) $fMaxDist = 0.001;
                 else $fMaxDist = 0.001;
             }
-            echo "-- FOUND \"".substr($aNominatRecords[$i]['DISPLAY_NAME'], 0, 50)."\", ".$aNominatRecords[$i]['CLASS'].", ".$aNominatRecords[$i]['TYPE'].", ".$aNominatRecords[$i]['PLACE_RANK'].", ".$aNominatRecords[$i]['OSM_TYPE']." (dist:$fDiff, max:$fMaxDist)\n";
+            echo '-- FOUND "'.substr($aNominatRecords[$i]['DISPLAY_NAME'], 0, 50);
+            echo '", '.$aNominatRecords[$i]['CLASS'].', '.$aNominatRecords[$i]['TYPE'];
+            echo ', '.$aNominatRecords[$i]['PLACE_RANK'].', '.$aNominatRecords[$i]['OSM_TYPE'];
+            echo " (dist:$fDiff, max:$fMaxDist)\n";
             if ($fDiff > $fMaxDist) {
                 echo "-- Diff too big $fDiff (max: $fMaxDist)".$aRecord['lat'].','.$aNominatRecords[$i]['LAT'].' & '.$aRecord['lon'].','.$aNominatRecords[$i]['LON']." \n";
             } else {
             if ($fDiff > $fMaxDist) {
                 echo "-- Diff too big $fDiff (max: $fMaxDist)".$aRecord['lat'].','.$aNominatRecords[$i]['LAT'].' & '.$aRecord['lon'].','.$aNominatRecords[$i]['LON']." \n";
             } else {
-                $sSQL = "update wikipedia_article set osm_type=";
+                $sSQL = 'update wikipedia_article set osm_type=';
                 switch ($aNominatRecords[$i]['OSM_TYPE']) {
                     case 'relation':
                         $sSQL .= "'R'";
                 switch ($aNominatRecords[$i]['OSM_TYPE']) {
                     case 'relation':
                         $sSQL .= "'R'";
@@ -536,7 +543,7 @@ if (isset($aCMDResult['link'])) {
                         $sSQL .= "'N'";
                         break;
                 }
                         $sSQL .= "'N'";
                         break;
                 }
-                $sSQL .= ", osm_id=".$aNominatRecords[$i]['OSM_ID']." where language = '".pg_escape_string($aRecord['language'])."' and title = '".pg_escape_string($aRecord['title'])."'";
+                $sSQL .= ', osm_id='.$aNominatRecords[$i]['OSM_ID']." where language = '".pg_escape_string($aRecord['language'])."' and title = '".pg_escape_string($aRecord['title'])."'";
                 $oDB->query($sSQL);
                 break;
             }
                 $oDB->query($sSQL);
                 break;
             }