// Load the shared command-line bootstrap from the lib directory that sits beside this script's directory.
4 require_once(dirname(dirname(__FILE__)).'/lib/init-cmd.php');
// Raise the PHP memory ceiling for this run.
5 ini_set('memory_limit', '800M');
// Option table: a description string followed by one array per command-line option.
// NOTE(review): presumably the contents of $aCMDOptions passed to getCmdOpt() below; the meaning of
// the numeric fields in each row is defined by getCmdOpt(), which is not visible here — confirm.
8 "Create and setup nominatim search system",
9 array('help', 'h', 0, 1, 0, 0, false, 'Show Help'),
10 array('quiet', 'q', 0, 1, 0, 0, 'bool', 'Quiet output'),
11 array('verbose', 'v', 0, 1, 0, 0, 'bool', 'Verbose output'),
13 array('create-tables', '', 0, 1, 0, 0, 'bool', 'Create wikipedia tables'),
14 array('parse-articles', '', 0, 1, 0, 0, 'bool', 'Parse wikipedia articles'),
15 array('link', '', 0, 1, 0, 0, 'bool', 'Try to link to existing OSM ids'),
// Parse the process arguments against the option table; results land in $aCMDResult.
17 getCmdOpt($_SERVER['argv'], $aCMDOptions, $aCMDResult, true, true);
// Debug fixture: sample wikitext (a Coord template followed by an amusement-park infobox) that is run
// through _parseWikipediaContent() and _templatesToProperties(), with the result dumped via var_dump().
20 $sTestPageText = <<<EOD
21 {{Coord|47|N|2|E|type:country_region:FR|display=title}}
22 {{ Infobox Amusement park
23 | name = Six Flags Great Adventure
24 | image = [[File:SixFlagsGreatAdventure logo.png]]
25 | caption = Six Flags Great Adventure logo
26 | location = [[Jackson, New Jersey|Jackson]]
27 | location2 = New Jersey
28 | location3 = United States
29 | address = 1 Six Flags Boulevard<ref name="drivedir"/>
30 | season = March/April through October/November
31 | opening_date = July 1, 1974
32 | previous_names = Great Adventure
34 | rides = 45 park admission rides
37 | owner = [[Six Flags]]
39 | homepage = [http://www.sixflags.com/parks/greatadventure/ Six Flags Great Adventure]
42 var_dump(_templatesToProperties(_parseWikipediaContent($sTestPageText)));
44 //| coordinates = {{Coord|40|08|16.65|N|74|26|26.69|W|region:US-NJ_type:landmark|display=inline,title}}
// Drop both wikipedia tables when the 'drop-tables' option is set.
// NOTE(review): $aCMDResult['drop-tables'] and ['create-tables'] are read without isset(), unlike the
// 'parse-wikipedia' and 'link' checks later in this file — confirm getCmdOpt() always populates these keys.
53 if ($aCMDResult['drop-tables'])
55 $oDB->query('DROP TABLE wikipedia_article');
56 $oDB->query('DROP TABLE wikipedia_link');
// Create the wikipedia_article table (with a PostGIS 'location' geometry column, SRID 4326) and the
// wikipedia_link table when the 'create-tables' option is set.
60 if ($aCMDResult['create-tables'])
63 CREATE TABLE wikipedia_article (
64 language text NOT NULL,
71 importance double precision,
73 osm_type character(1),
81 $oDB->query("SELECT AddGeometryColumn('wikipedia_article', 'location', 4326, 'GEOMETRY', 2)");
84 CREATE TABLE wikipedia_link (
/**
 * Convert a degrees / minutes / seconds coordinate into signed decimal degrees.
 *
 * The result is negative when the hemisphere is 'S' or 'W' (case-insensitive) or when
 * the degrees value itself is negative; both together cancel out, exactly as multiplying
 * a negative degree value by the hemisphere sign did before.
 *
 * Minutes and seconds are always applied to the magnitude, so -33 degrees 52 minutes
 * yields -33.8667 rather than moving the value towards zero.
 *
 * @param mixed $iDegrees Degrees; cast to float, so non-numeric input counts as 0.
 * @param mixed $iMinutes Minutes; cast to float.
 * @param mixed $fSeconds Seconds; cast to float.
 * @param mixed $sNSEW    Hemisphere letter. null or '' (a missing template parameter)
 *                        is treated as the positive hemisphere.
 *
 * @return float Decimal degrees.
 */
function degreesAndMinutesToDecimal($iDegrees, $iMinutes=0, $fSeconds=0, $sNSEW='N')
{
    // Callers pass null when the hemisphere parameter is absent; cast first because
    // strtoupper(null) is deprecated since PHP 8.1.
    $sNSEW = strtoupper(trim((string)$sNSEW));

    $fDegrees = (float)$iDegrees;
    $fMagnitude = abs($fDegrees) + (float)$iMinutes/60 + (float)$fSeconds/3600;

    $iSign = ($fDegrees < 0) ? -1 : 1;
    if ($sNSEW == 'S' || $sNSEW == 'W')
    {
        $iSign = -$iSign;
    }

    return $iSign * $fMagnitude;
}
// Walk raw wikitext token by token and collect the templates it contains, tracking nesting of
// templates and links with a small state stack.
// NOTE(review): the return statement is not visible in this excerpt — presumably the collected
// $aTemplates, since the caller feeds the result to _templatesToProperties(); confirm.
98 function _parseWikipediaContent($sPageText)
// Collapse the page onto a single line, then strip HTML comments and <math> sections.
100 $sPageText = str_replace("\n", ' ', $sPageText);
101 $sPageText = preg_replace('#<!--.*?-->#m', '', $sPageText);
102 $sPageText = preg_replace('#<math>.*?<\\/math>#m', '', $sPageText);
// Tokenise on "{{", "}}", "[[", "]]" and "|"; PREG_SPLIT_DELIM_CAPTURE keeps each delimiter as its own entry.
104 $aPageText = preg_split('#({{|}}|\\[\\[|\\]\\]|[|])#', $sPageText, -1, PREG_SPLIT_DELIM_CAPTURE);
106 $aPageProperties = array();
108 $aTemplates = array();
// Both stacks grow at the front: index 0 is always the innermost open template / the current state.
111 $aTemplateStack = array();
112 $aState = array('body');
113 foreach($aPageText as $i => $sPart)
// Open a template: push an empty (name, parameter list) pair and enter the 'template' state.
118 array_unshift($aTemplateStack, array('', array()));
119 array_unshift($aState, 'template');
// Close a template: pop it together with its state and record it as finished.
122 if ($aState[0] == 'template' || $aState[0] == 'templateparam')
124 $aTemplate = array_shift($aTemplateStack);
125 array_shift($aState);
127 $aTemplates[] = $aTemplate;
// Open a link.
134 array_unshift($aState, 'link');
// Close a link: default the display text to the target and strip a leading "Image:" from it.
137 if ($aState[0] == 'link' || $aState[0] == 'linksynonim')
139 if (!$sLinkSyn) $sLinkSyn = $sLinkPage;
140 if (substr($sLinkPage, 0, 6) == 'Image:') $sLinkSyn = substr($sLinkPage, 6);
142 $aLinks[] = array($sLinkPage, $sLinkSyn);
// Leave the link state, then hand the finished link over to whatever the enclosing state is collecting.
144 array_shift($aState);
148 $aTemplateStack[0][0] .= trim($sPart);
150 case 'templateparam':
151 $aTemplateStack[0][1][0] .= $sLinkSyn;
154 $sLinkPage .= trim($sPart);
160 $sPageBody .= $sLinkSyn;
// NOTE(review): $sPageName is not assigned anywhere in the visible part of this function.
163 var_dump($aState, $sPageName, $aTemplateStack, $sPart, $aPageText);
164 fail('unknown state');
// Inside a template a separator starts a new (empty) parameter at the front of the parameter list;
// inside a link it switches from collecting the target to collecting the display text.
169 if ($aState[0] == 'template' || $aState[0] == 'templateparam')
171 // Create a new template parameter
172 $aState[0] = 'templateparam';
173 array_unshift($aTemplateStack[0][1], '');
175 if ($aState[0] == 'link') $aState[0] = 'linksynonim';
// Plain text is appended to the template name, the newest template parameter, the link target or
// the page body, depending on the current state.
181 $aTemplateStack[0][0] .= trim($sPart);
183 case 'templateparam':
184 $aTemplateStack[0][1][0] .= $sPart;
187 $sLinkPage .= trim($sPart);
193 $sPageBody .= $sPart;
196 var_dump($aState, $aPageText);
197 fail('unknown state');
/**
 * Return a template parameter, or a default when the parameter is absent.
 *
 * @param array $aParams  Parameters keyed by name or by position.
 * @param mixed $mKey     Name or positional index to read.
 * @param mixed $mDefault Value returned when the key is not set.
 *
 * @return mixed
 */
function _templateParamOr($aParams, $mKey, $mDefault)
{
    return isset($aParams[$mKey]) ? $aParams[$mKey] : $mDefault;
}

/**
 * Reduce the templates found on a Wikipedia page to a flat set of page properties.
 *
 * Each entry of $aTemplates is array(template name, list of raw parameter strings).
 * The parameter list is reversed before it is interpreted; "key = value" strings become
 * named parameters and everything else becomes a positional parameter.
 *
 * For every property the first template that supplies a usable value wins.
 * The keys that may be present in the result are:
 *   sOfficialName, iPopulation, sWebsite, sTopLevelDomain, sInfoboxType, fLat, fLon.
 * When no "Infobox ..." template exists, sInfoboxType falls back to the name of the
 * first template with more than 10 parameters, prefixed with '#'.
 *
 * @param array $aTemplates Templates as array(name, parameters) pairs.
 *
 * @return array Extracted properties; empty when nothing was recognised.
 */
function _templatesToProperties($aTemplates)
{
    $aPageProperties = array();
    foreach ($aTemplates as $aTemplate)
    {
        // Split the raw parameter strings into named and positional parameters.
        $aParams = array();
        foreach (array_reverse($aTemplate[1]) as $sParam)
        {
            $iPos = strpos($sParam, '=');
            if ($iPos === false)
            {
                $aParams[] = trim($sParam);
            }
            else
            {
                $aParams[trim(substr($sParam, 0, $iPos))] = trim(substr($sParam, $iPos+1));
            }
        }

        if (!isset($aPageProperties['sOfficialName']) && isset($aParams['official_name']) && $aParams['official_name'])
        {
            $aPageProperties['sOfficialName'] = $aParams['official_name'];
        }

        // Population: try the known parameter names in order of preference and keep the
        // first value that starts with a number (thousands separators are stripped).
        foreach (array('population', 'population_total', 'population_urban', 'population_estimate') as $sKey)
        {
            if (isset($aPageProperties['iPopulation'])) break;
            if (isset($aParams[$sKey]) && $aParams[$sKey] && preg_match('#^[0-9.,]+#', $aParams[$sKey]))
            {
                $aPageProperties['iPopulation'] = (int)str_replace(array(',','.'), '', $aParams[$sKey]);
            }
        }

        // Website: accept a bare URL or an external link "[url label]" and make sure it has a scheme.
        if (!isset($aPageProperties['sWebsite']) && isset($aParams['website']) && $aParams['website'])
        {
            if (preg_match('#^\\[?([^ \\]]+)[^\\]]*\\]?$#', $aParams['website'], $aMatch))
            {
                $aPageProperties['sWebsite'] = $aMatch[1];
                if (strpos($aPageProperties['sWebsite'], '://') === false)
                {
                    $aPageProperties['sWebsite'] = 'http://'.$aPageProperties['sWebsite'];
                }
            }
        }

        if (!isset($aPageProperties['sTopLevelDomain']) && isset($aParams['cctld']) && $aParams['cctld'])
        {
            $aPageProperties['sTopLevelDomain'] = str_replace(array('[',']','.'), '', $aParams['cctld']);
        }

        if (!isset($aPageProperties['sInfoboxType']) && strtolower(substr($aTemplate[0], 0, 7)) == 'infobox')
        {
            $aPageProperties['sInfoboxType'] = trim(substr($aTemplate[0], 8));
        }

        // Assume the first template with lots of params is the type (fallback for infobox)
        if (!isset($aPageProperties['sPossibleInfoboxType']) && count($aParams) > 10)
        {
            $aPageProperties['sPossibleInfoboxType'] = trim($aTemplate[0]);
        }

        // do we have a lat/lon
        if (!isset($aPageProperties['fLat']))
        {
            if (isset($aParams['latd']) && isset($aParams['longd']))
            {
                $aPageProperties['fLat'] = degreesAndMinutesToDecimal(
                    $aParams['latd'],
                    _templateParamOr($aParams, 'latm', 0),
                    _templateParamOr($aParams, 'lats', 0),
                    _templateParamOr($aParams, 'latNS', 'N')
                );
                $aPageProperties['fLon'] = degreesAndMinutesToDecimal(
                    $aParams['longd'],
                    _templateParamOr($aParams, 'longm', 0),
                    _templateParamOr($aParams, 'longs', 0),
                    _templateParamOr($aParams, 'longEW', 'E')
                );
            }

            // Both halves of the pair must be present; the longitude key used to go unchecked.
            if (isset($aParams['lat_degrees']) && isset($aParams['long_degrees']))
            {
                $aPageProperties['fLat'] = degreesAndMinutesToDecimal(
                    $aParams['lat_degrees'],
                    _templateParamOr($aParams, 'lat_minutes', 0),
                    _templateParamOr($aParams, 'lat_seconds', 0),
                    _templateParamOr($aParams, 'lat_direction', 'N')
                );
                $aPageProperties['fLon'] = degreesAndMinutesToDecimal(
                    $aParams['long_degrees'],
                    _templateParamOr($aParams, 'long_minutes', 0),
                    _templateParamOr($aParams, 'long_seconds', 0),
                    _templateParamOr($aParams, 'long_direction', 'E')
                );
            }

            if (isset($aParams['latitude']) && isset($aParams['longitude']))
            {
                // Only requires a digit or dot somewhere in each value; the float cast does the rest.
                if (preg_match('#[0-9.]+#', $aParams['latitude']) && preg_match('#[0-9.]+#', $aParams['longitude']))
                {
                    $aPageProperties['fLat'] = (float)$aParams['latitude'];
                    $aPageProperties['fLon'] = (float)$aParams['longitude'];
                }
            }

            if (strtolower($aTemplate[0]) == 'coord')
            {
                // The position of the N/S marker tells the layouts apart. Once a layout is
                // recognised an incomplete parameter list is skipped rather than passed on to
                // the next branch, where degrees/minutes would be misread as a decimal pair.
                if (isset($aParams[3]) && (strtoupper($aParams[3]) == 'N' || strtoupper($aParams[3]) == 'S'))
                {
                    // deg|min|sec|NS|deg|min|sec|EW
                    if (isset($aParams[4], $aParams[5], $aParams[6], $aParams[7]))
                    {
                        $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aParams[0], $aParams[1], $aParams[2], $aParams[3]);
                        $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aParams[4], $aParams[5], $aParams[6], $aParams[7]);
                    }
                }
                elseif (isset($aParams[0]) && isset($aParams[1]) && isset($aParams[2]) && (strtoupper($aParams[2]) == 'N' || strtoupper($aParams[2]) == 'S'))
                {
                    // deg|min|NS|deg|min|EW
                    if (isset($aParams[3], $aParams[4], $aParams[5]))
                    {
                        $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aParams[0], $aParams[1], 0, $aParams[2]);
                        $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aParams[3], $aParams[4], 0, $aParams[5]);
                    }
                }
                elseif (isset($aParams[0]) && isset($aParams[1]) && (strtoupper($aParams[1]) == 'N' || strtoupper($aParams[1]) == 'S'))
                {
                    // deg|NS|deg|EW
                    if (isset($aParams[2], $aParams[3]))
                    {
                        $aPageProperties['fLat'] = (strtoupper($aParams[1]) == 'N'?1:-1) * (float)$aParams[0];
                        $aPageProperties['fLon'] = (strtoupper($aParams[3]) == 'E'?1:-1) * (float)$aParams[2];
                    }
                }
                elseif (isset($aParams[0]) && is_numeric($aParams[0]) && isset($aParams[1]) && is_numeric($aParams[1]))
                {
                    // signed decimal lat|lon
                    $aPageProperties['fLat'] = (float)$aParams[0];
                    $aPageProperties['fLon'] = (float)$aParams[1];
                }
            }

            if (isset($aParams['Latitude']) && isset($aParams['Longitude']))
            {
                // Normalise UTF-8 non-breaking spaces so the patterns below can match on plain spaces.
                $sLatitude = str_replace("\xC2\xA0", ' ', $aParams['Latitude']);
                $sLongitude = str_replace("\xC2\xA0", ' ', $aParams['Longitude']);

                // A range ("a to b") is reduced to its midpoint; otherwise a single value is used.
                if (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([NS]) to ([0-9]+)°( ([0-9]+)′)? ([NS])#', $sLatitude, $aMatch))
                {
                    $aPageProperties['fLat'] =
                        (degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4])
                        +degreesAndMinutesToDecimal($aMatch[5], $aMatch[7], 0, $aMatch[8])) / 2;
                }
                elseif (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([NS])#', $sLatitude, $aMatch))
                {
                    $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4]);
                }

                if (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([EW]) to ([0-9]+)°( ([0-9]+)′)? ([EW])#', $sLongitude, $aMatch))
                {
                    $aPageProperties['fLon'] =
                        (degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4])
                        +degreesAndMinutesToDecimal($aMatch[5], $aMatch[7], 0, $aMatch[8])) / 2;
                }
                elseif (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([EW])#', $sLongitude, $aMatch))
                {
                    $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4]);
                }
            }
        }
    }

    // Promote the fallback guess to the infobox type, marked with '#', when no real infobox was seen.
    if (isset($aPageProperties['sPossibleInfoboxType']))
    {
        if (!isset($aPageProperties['sInfoboxType']))
        {
            $aPageProperties['sInfoboxType'] = '#'.$aPageProperties['sPossibleInfoboxType'];
        }
        unset($aPageProperties['sPossibleInfoboxType']);
    }

    return $aPageProperties;
}
// Extract infobox type, population, website and coordinates from stored article text and build
// UPDATE statements for the matching English rows of wikipedia_article.
// NOTE(review): this tests the option key 'parse-wikipedia', but the option table at the top of this
// file declares 'parse-articles' — confirm which name is intended.
349 if (isset($aCMDResult['parse-wikipedia']))
// Select candidate articles: namespace 0, page_id modulo 10 equal to the option value, and text that
// mentions a Coord template or both 'lat' and 'lon'.
// NOTE(review): the option value is concatenated into the SQL without escaping or an integer cast.
352 $aArticleNames = $oDB->getCol('select page_title from content where page_namespace = 0 and page_id %10 = '.$aCMDResult['parse-wikipedia'].' and (page_content ilike \'%{{Coord%\' or (page_content ilike \'%lat%\' and page_content ilike \'%lon%\'))');
353 // $aArticleNames = $oDB->getCol($sSQL = 'select page_title from content where page_namespace = 0 and (page_content ilike \'%{{Coord%\' or (page_content ilike \'%lat%\' and page_content ilike \'%lon%\')) and page_title in (\'Virginia\')');
354 foreach($aArticleNames as $sArticleName)
// Fetch the article text and reduce its templates to a property array.
356 $sPageText = $oDB->getOne('select page_content from content where page_namespace = 0 and page_title = \''.pg_escape_string($sArticleName).'\'');
357 $aP = _templatesToProperties(_parseWikipediaContent($sPageText));
// Each property that was found gets its own UPDATE statement.
// NOTE(review): the statements that execute $sSQL are not visible in this excerpt.
359 if (isset($aP['sInfoboxType']))
// Collapse runs of whitespace in the infobox type to a single space.
361 $aP['sInfoboxType'] = preg_replace('#\\s+#',' ',$aP['sInfoboxType']);
362 $sSQL = 'update wikipedia_article set ';
363 $sSQL .= 'infobox_type = \''.pg_escape_string($aP['sInfoboxType']).'\'';
364 $sSQL .= ' where language = \'en\' and title = \''.pg_escape_string($sArticleName).'\';';
367 if (isset($aP['iPopulation']))
369 $sSQL = 'update wikipedia_article set ';
370 $sSQL .= 'population = \''.pg_escape_string($aP['iPopulation']).'\'';
371 $sSQL .= ' where language = \'en\' and title = \''.pg_escape_string($sArticleName).'\';';
374 if (isset($aP['sWebsite']))
376 $sSQL = 'update wikipedia_article set ';
377 $sSQL .= 'website = \''.pg_escape_string($aP['sWebsite']).'\'';
378 $sSQL .= ' where language = \'en\' and title = \''.pg_escape_string($sArticleName).'\';';
// The loose != against the string '-0' is a numeric comparison, so this skips only the case where
// both coordinates are zero.
// NOTE(review): fLon is read here without an isset() check.
381 if (isset($aP['fLat']) && ($aP['fLat']!='-0' || $aP['fLon']!='-0'))
383 if (!isset($aP['sInfoboxType'])) $aP['sInfoboxType'] = '';
// Progress line: title|infobox type|lat|lon.
384 echo $sArticleName.'|'.$aP['sInfoboxType'].'|'.$aP['fLat'].'|'.$aP['fLon'] ."\n";
385 $sSQL = 'update wikipedia_article set ';
386 $sSQL .= 'lat = \''.pg_escape_string($aP['fLat']).'\',';
387 $sSQL .= 'lon = \''.pg_escape_string($aP['fLon']).'\'';
388 $sSQL .= ' where language = \'en\' and title = \''.pg_escape_string($sArticleName).'\';';
// Start-element handler registered with xml_set_element_handler(): appends an element's attribute
// array to the global $aNominatRecords list.
// NOTE(review): any filtering on $sName is not visible in this excerpt.
394 function nominatimXMLStart($hParser, $sName, $aAttr)
396 global $aNominatRecords;
400 $aNominatRecords[] = $aAttr;
// End-element handler registered with xml_set_element_handler() alongside nominatimXMLStart().
// NOTE(review): the body is not visible in this excerpt.
405 function nominatimXMLEnd($hParser, $sName)
// Try to match geolocated Wikipedia articles to OSM objects: for each article, search a Nominatim
// server by name, compare each result's position with the article's position, and build an UPDATE
// that stores the OSM type and id.
410 if (isset($aCMDResult['link']))
// Candidates: English articles that have a latitude, no OSM link yet and totalcount < 31,
// most important first, capped at 200000 rows.
413 $aWikiArticles = $oDB->getAll("select * from wikipedia_article where language = 'en' and lat is not null and osm_type is null and totalcount < 31 order by importance desc limit 200000");
415 // If you point this script at production OSM you will be blocked
// NOTE(review): 'SEVERNAME' looks like a misspelt placeholder for the server host name; it must be
// replaced with a real host before this section can work.
416 $sNominatimBaseURL = 'http://SEVERNAME/search.php';
418 foreach($aWikiArticles as $aRecord)
// Article titles use underscores where the search term needs spaces.
420 $aRecord['name'] = str_replace('_',' ',$aRecord['title']);
422 $sURL = $sNominatimBaseURL.'?format=xml&accept-language=en';
424 echo "\n-- ".$aRecord['name'].", ".$aRecord['infobox_type']."\n";
425 $fMaxDist = 0.0000001;
// Pick search restrictions (feature type, viewbox around the article position, proximity hints)
// according to the infobox type.
427 switch(strtolower($aRecord['infobox_type']))
429 case 'former country':
432 $fMaxDist = 60; // effectively turn it off
433 $sURL .= "&viewbox=".($aRecord['lon']-$fMaxDist).",".($aRecord['lat']+$fMaxDist).",".($aRecord['lon']+$fMaxDist).",".($aRecord['lat']-$fMaxDist);
439 $fMaxDist = 60; // effectively turn it off
440 $sURL .= "&featuretype=country";
441 $sURL .= "&viewbox=".($aRecord['lon']-$fMaxDist).",".($aRecord['lat']+$fMaxDist).",".($aRecord['lon']+$fMaxDist).",".($aRecord['lat']-$fMaxDist);
443 case 'prefecture japan':
444 $aRecord['name'] = trim(str_replace(' Prefecture',' ', $aRecord['name']));
449 case 'u.s. state symbols':
// NOTE(review): the next two case labels end with ';' instead of ':'. PHP accepts this, but ':' is
// the conventional terminator used by every other label here.
451 case 'province or territory of canada';
452 case 'indian jurisdiction';
454 case 'french region':
455 case 'region of italy':
457 case '#australia state or territory':
458 case 'russian federal subject':
460 $sURL .= "&featuretype=state";
461 $sURL .= "&viewbox=".($aRecord['lon']-$fMaxDist).",".($aRecord['lat']+$fMaxDist).",".($aRecord['lon']+$fMaxDist).",".($aRecord['lat']-$fMaxDist);
463 case 'protected area':
465 $sURL .= "&nearlat=".$aRecord['lat'];
466 $sURL .= "&nearlon=".$aRecord['lon'];
467 $sURL .= "&viewbox=".($aRecord['lon']-$fMaxDist).",".($aRecord['lat']+$fMaxDist).",".($aRecord['lon']+$fMaxDist).",".($aRecord['lat']-$fMaxDist);
471 case 'french commune':
472 case 'italian comune':
// NOTE(review): 'italian comune' is listed a second time here; the duplicate label can never be selected.
474 case 'italian comune':
475 case 'australian place':
481 case 'russian inhabited locality':
482 case 'finnish municipality/land area':
483 case 'england county':
484 case 'israel municipality':
488 $sURL .= "&featuretype=settlement";
489 $sURL .= "&viewbox=".($aRecord['lon']-0.5).",".($aRecord['lat']+0.5).",".($aRecord['lon']+0.5).",".($aRecord['lat']-0.5);
492 case 'mountain pass':
497 $sURL .= "&viewbox=".($aRecord['lon']-0.5).",".($aRecord['lat']+0.5).",".($aRecord['lon']+0.5).",".($aRecord['lat']-0.5);
501 $aTypes = array('wreck');
502 $sURL .= "&viewbox=".($aRecord['lon']-0.01).",".($aRecord['lat']+0.01).",".($aRecord['lon']+0.01).",".($aRecord['lat']-0.01);
503 $sURL .= "&nearlat=".$aRecord['lat'];
504 $sURL .= "&nearlon=".$aRecord['lon'];
511 $sURL .= "&viewbox=".($aRecord['lon']-0.01).",".($aRecord['lat']+0.01).",".($aRecord['lon']+0.01).",".($aRecord['lat']-0.01);
512 $sURL .= "&bounded=1";
513 $sURL .= "&nearlat=".$aRecord['lat'];
514 $sURL .= "&nearlon=".$aRecord['lon'];
519 $sURL .= "&viewbox=".($aRecord['lon']-0.01).",".($aRecord['lat']+0.01).",".($aRecord['lon']+0.01).",".($aRecord['lat']-0.01);
520 // $sURL .= "&bounded=1";
521 $sURL .= "&nearlat=".$aRecord['lat'];
522 $sURL .= "&nearlon=".$aRecord['lon'];
523 echo "-- Unknown: ".$aRecord['infobox_type']."\n";
// First attempt: search for the full article name.
526 $sNameURL = $sURL.'&q='.urlencode($aRecord['name']);
// NOTE(review): the result of file_get_contents() is passed to the XML parser without a check for failure.
529 $sXML = file_get_contents($sNameURL);
// The start-element handler fills the global $aNominatRecords with the attributes of the parsed elements.
531 $aNominatRecords = array();
532 $hXMLParser = xml_parser_create();
533 xml_set_element_handler($hXMLParser, 'nominatimXMLStart', 'nominatimXMLEnd');
534 xml_parse($hXMLParser, $sXML, true);
535 xml_parser_free($hXMLParser);
// Second attempt when nothing was found: retry with only the part of the name before the first '(' or ','.
537 if (!isset($aNominatRecords[0]))
539 $aNameParts = preg_split('#[(,]#',$aRecord['name']);
540 if (sizeof($aNameParts) > 1)
542 $sNameURL = $sURL.'&q='.urlencode(trim($aNameParts[0]));
544 $sXML = file_get_contents($sNameURL);
546 $aNominatRecords = array();
547 $hXMLParser = xml_parser_create();
548 xml_set_element_handler($hXMLParser, 'nominatimXMLStart', 'nominatimXMLEnd');
549 xml_parse($hXMLParser, $sXML, true);
550 xml_parser_free($hXMLParser);#
554 // assume first is best/right
555 for($i = 0; $i < sizeof($aNominatRecords); $i++)
// Straight-line distance in degrees between the article position and the search result.
557 $fDiff = ($aRecord['lat']-$aNominatRecords[$i]['LAT']) * ($aRecord['lat']-$aNominatRecords[$i]['LAT']);
558 $fDiff += ($aRecord['lon']-$aNominatRecords[$i]['LON']) * ($aRecord['lon']-$aNominatRecords[$i]['LON']);
559 $fDiff = sqrt($fDiff);
561 // If it was an unknown type base it on the rank of the found result
// NOTE(review): no condition guarding this rank-based override is visible in this excerpt — confirm
// whether it is meant to apply only to unknown infobox types.
562 $iRank = (int)$aNominatRecords[$i]['PLACE_RANK'];
563 if ($iRank <= 4) $fMaxDist = 2;
564 elseif ($iRank <= 8) $fMaxDist = 1;
565 elseif ($iRank <= 10) $fMaxDist = 0.8;
566 elseif ($iRank <= 12) $fMaxDist = 0.6;
567 elseif ($iRank <= 17) $fMaxDist = 0.2;
568 elseif ($iRank <= 18) $fMaxDist = 0.1;
569 elseif ($iRank <= 22) $fMaxDist = 0.02;
570 elseif ($iRank <= 26) $fMaxDist = 0.001;
571 else $fMaxDist = 0.001;
573 echo "-- FOUND \"".substr($aNominatRecords[$i]['DISPLAY_NAME'],0,50)."\", ".$aNominatRecords[$i]['CLASS'].", ".$aNominatRecords[$i]['TYPE'].", ".$aNominatRecords[$i]['PLACE_RANK'].", ".$aNominatRecords[$i]['OSM_TYPE']." (dist:$fDiff, max:$fMaxDist)\n";
// Report results that lie further from the article position than the allowed maximum.
574 if ($fDiff > $fMaxDist)
576 echo "-- Diff too big $fDiff (max: $fMaxDist)".$aRecord['lat'].','.$aNominatRecords[$i]['LAT'].' & '.$aRecord['lon'].','.$aNominatRecords[$i]['LON']." \n";
// Build the UPDATE that records the matched OSM object as a one-letter type plus id.
// NOTE(review): the switch has no default, so any other OSM_TYPE leaves "osm_type=" without a value.
580 $sSQL = "update wikipedia_article set osm_type=";
581 switch($aNominatRecords[$i]['OSM_TYPE'])
583 case 'relation': $sSQL .= "'R'"; break;
584 case 'way': $sSQL .= "'W'"; break;
585 case 'node': $sSQL .= "'N'"; break;
// NOTE(review): OSM_ID comes from the remote XML response and is concatenated without escaping or an
// integer cast.
587 $sSQL .= ", osm_id=".$aNominatRecords[$i]['OSM_ID']." where language = '".pg_escape_string($aRecord['language'])."' and title = '".pg_escape_string($aRecord['title'])."'";