// Bootstrap: load the site settings and the command line helper library, and raise the
// memory limit for this script.
4 require_once(dirname(dirname(__FILE__)).'/settings/settings.php');
5 require_once(CONST_BasePath.'/lib/init-cmd.php');
6 ini_set('memory_limit', '800M');
// Option table passed to getCmdOpt() below: a description string followed by one array per option
// (long name, short name, four numeric fields, value type, help text).
// NOTE(review): the meaning of the four numeric fields is defined by getCmdOpt() in lib/init-cmd.php — confirm there.
// NOTE(review): code further down reads $aCMDResult['drop-tables'] and $aCMDResult['parse-wikipedia'],
// neither of which is declared in the lines shown here, while 'parse-articles' is declared but never read.
9 "Create and setup nominatim search system",
10 array('help', 'h', 0, 1, 0, 0, false, 'Show Help'),
11 array('quiet', 'q', 0, 1, 0, 0, 'bool', 'Quiet output'),
12 array('verbose', 'v', 0, 1, 0, 0, 'bool', 'Verbose output'),
14 array('create-tables', '', 0, 1, 0, 0, 'bool', 'Create wikipedia tables'),
15 array('parse-articles', '', 0, 1, 0, 0, 'bool', 'Parse wikipedia articles'),
16 array('link', '', 0, 1, 0, 0, 'bool', 'Try to link to existing OSM ids'),
// Parse the process arguments into $aCMDResult.
18 getCmdOpt($_SERVER['argv'], $aCMDOptions, $aCMDResult, true, true);
// Ad-hoc parser check: a sample article fragment (a Coord template followed by an amusement park
// infobox) is run through _parseWikipediaContent() and _templatesToProperties() and the resulting
// property map is dumped.
// NOTE(review): whether this snippet is live or commented out cannot be told from the lines shown — confirm.
21 $sTestPageText = <<<EOD
22 {{Coord|47|N|2|E|type:country_region:FR|display=title}}
23 {{ Infobox Amusement park
24 | name = Six Flags Great Adventure
25 | image = [[File:SixFlagsGreatAdventure logo.png]]
26 | caption = Six Flags Great Adventure logo
27 | location = [[Jackson, New Jersey|Jackson]]
28 | location2 = New Jersey
29 | location3 = United States
30 | address = 1 Six Flags Boulevard<ref name="drivedir"/>
31 | season = March/April through October/November
32 | opening_date = July 1, 1974
33 | previous_names = Great Adventure
35 | rides = 45 park admission rides
38 | owner = [[Six Flags]]
40 | homepage = [http://www.sixflags.com/parks/greatadventure/ Six Flags Great Adventure]
43 var_dump(_templatesToProperties(_parseWikipediaContent($sTestPageText)));
45 //| coordinates = {{Coord|40|08|16.65|N|74|26|26.69|W|region:US-NJ_type:landmark|display=inline,title}}
// Drop both wikipedia tables when the 'drop-tables' option is set.
// NOTE(review): the option is read without isset() and is not declared in the option table lines shown
// at the top of the script — confirm it cannot raise an undefined-index notice.
54 if ($aCMDResult['drop-tables'])
56 $oDB->query('DROP TABLE wikipedia_article');
57 $oDB->query('DROP TABLE wikipedia_link');
// Create the wikipedia_article and wikipedia_link tables when the 'create-tables' option is set.
// After creating wikipedia_article, a 2-dimensional 'location' column of type GEOMETRY with SRID 4326
// is added to it through AddGeometryColumn().
// NOTE(review): only part of each CREATE TABLE statement is visible here; the full column lists
// should be checked against the database schema.
61 if ($aCMDResult['create-tables']) {
63 CREATE TABLE wikipedia_article (
64 language text NOT NULL,
71 importance double precision,
73 osm_type character(1),
81 $oDB->query("SELECT AddGeometryColumn('wikipedia_article', 'location', 4326, 'GEOMETRY', 2)");
84 CREATE TABLE wikipedia_link (
/**
 * Convert a degrees / minutes / seconds coordinate into signed decimal degrees.
 *
 * Minutes and seconds always increase the magnitude of the angle, so a negative
 * degree value with minutes (-40 deg 30 min) yields -40.5 rather than -39.5.
 * The result is negated once for a southern/western hemisphere letter and once
 * for a negative degree value.
 *
 * @param mixed $iDegrees Degrees; numeric or numeric string, may be negative.
 * @param mixed $iMinutes Minutes; null or missing counts as 0.
 * @param mixed $fSeconds Seconds; null or missing counts as 0.
 * @param mixed $sNSEW    Hemisphere letter, case-insensitive; 'S' and 'W' negate the
 *                        result, anything else (including null) leaves it positive.
 *
 * @return float Decimal degrees.
 */
function degreesAndMinutesToDecimal($iDegrees, $iMinutes = 0, $fSeconds = 0, $sNSEW = 'N')
{
    // Callers hand over null for template parameters that are absent; cast before using
    // string functions so no null reaches strtoupper().
    $sNSEW = strtoupper(trim((string)$sNSEW));
    $iSign = ($sNSEW == 'S' || $sNSEW == 'W') ? -1 : 1;

    $fDegrees = (float)$iDegrees;
    if ($fDegrees < 0) {
        // Work on the magnitude so that minutes and seconds are added in the right direction.
        $fDegrees = -$fDegrees;
        $iSign = -$iSign;
    }

    return $iSign * ($fDegrees + (float)$iMinutes/60 + (float)$fSeconds/3600);
}
/**
 * Split wikipedia markup into its templates.
 *
 * The text is tokenised on the markers {{ }} [[ ]] and | and fed through a small
 * state machine. The state stack starts with 'body'; '{{' pushes 'template',
 * the first '|' inside a template switches it to 'templateparam', '[[' pushes
 * 'link' and a '|' inside a link switches it to 'linksynonim'.
 *
 * A link inside a template is replaced by its display text (the part after '|',
 * or the page name when there is none; for 'Image:' links the page name without
 * that prefix). Text outside of templates is discarded.
 *
 * @param mixed $sPageText Raw article markup; null is treated as an empty string.
 *
 * @return array List of array(template name, parameter strings) in the order the
 *               templates are closed. The parameter strings of each template are
 *               stored in reverse order of appearance (newest first).
 */
function _parseWikipediaContent($sPageText)
{
    $sPageText = str_replace("\n", ' ', (string)$sPageText);

    // Strip HTML comments and <math> sections. preg_replace() returns null on a PCRE
    // error; in that case keep the text as it was instead of parsing null.
    foreach (array('#<!--.*?-->#m', '#<math>.*?<\\/math>#m') as $sPattern) {
        $sStripped = preg_replace($sPattern, '', $sPageText);
        if ($sStripped !== null) $sPageText = $sStripped;
    }

    $aPageText = preg_split('#({{|}}|\\[\\[|\\]\\]|[|])#', $sPageText, -1, PREG_SPLIT_DELIM_CAPTURE);
    if ($aPageText === false) return array();

    $aTemplates = array();

    // Entry 0 of each stack is the innermost open template / the current state.
    $aTemplateStack = array();
    $aState = array('body');

    // Page name and display text of the link currently being read.
    // NOTE(review): these are shared across nesting levels, so a link nested inside
    // another link overwrites the outer link's text.
    $sLinkPage = '';
    $sLinkSyn = '';

    foreach ($aPageText as $sPart) {
        switch ($sPart) {
            case '{{':
                array_unshift($aTemplateStack, array('', array()));
                array_unshift($aState, 'template');
                break;

            case '}}':
                // A '}}' without an open template is ignored.
                if ($aState[0] == 'template' || $aState[0] == 'templateparam') {
                    $aTemplate = array_shift($aTemplateStack);
                    array_shift($aState);
                    $aTemplates[] = $aTemplate;
                }
                break;

            case '[[':
                $sLinkPage = '';
                $sLinkSyn = '';
                array_unshift($aState, 'link');
                break;

            case ']]':
                // A ']]' without an open link is ignored.
                if ($aState[0] == 'link' || $aState[0] == 'linksynonim') {
                    if (!$sLinkSyn) $sLinkSyn = $sLinkPage;
                    if (substr($sLinkPage, 0, 6) == 'Image:') $sLinkSyn = substr($sLinkPage, 6);

                    array_shift($aState);
                    switch ($aState[0]) {
                        case 'template':
                            // Append the link text, not the ']]' marker itself.
                            $aTemplateStack[0][0] .= trim($sLinkSyn);
                            break;
                        case 'templateparam':
                            $aTemplateStack[0][1][0] .= $sLinkSyn;
                            break;
                        case 'link':
                        case 'linksynonim':
                            // The enclosing link continues with the accumulators as they are.
                            break;
                        case 'body':
                            break;
                        default:
                            var_dump($aState, $aTemplateStack, $sPart, $aPageText);
                            fail('unknown state');
                    }
                }
                break;

            case '|':
                if ($aState[0] == 'template' || $aState[0] == 'templateparam') {
                    // Create a new template parameter
                    $aState[0] = 'templateparam';
                    array_unshift($aTemplateStack[0][1], '');
                }
                if ($aState[0] == 'link') $aState[0] = 'linksynonim';
                break;

            default:
                switch ($aState[0]) {
                    case 'template':
                        $aTemplateStack[0][0] .= trim($sPart);
                        break;
                    case 'templateparam':
                        $aTemplateStack[0][1][0] .= $sPart;
                        break;
                    case 'link':
                        $sLinkPage .= trim($sPart);
                        break;
                    case 'linksynonim':
                        $sLinkSyn .= $sPart;
                        break;
                    case 'body':
                        break;
                    default:
                        var_dump($aState, $aPageText);
                        fail('unknown state');
                }
                break;
        }
    }

    return $aTemplates;
}
/**
 * Extract well-known page properties from the templates of a wikipedia article.
 *
 * Templates are examined in list order and every property is taken from the first
 * template that supplies it. When no template name starts with 'infobox', the first
 * template with more than 10 parameters is reported as infobox type, prefixed with '#'.
 *
 * @param array $aTemplates List of array(template name, parameter strings) as produced by
 *                          _parseWikipediaContent(); the parameter strings of each template
 *                          are expected in reverse order of appearance.
 *
 * @return array Map holding any of the keys sOfficialName, iPopulation, sWebsite,
 *               sTopLevelDomain, sInfoboxType, fLat and fLon. fLat and fLon are always
 *               set together.
 */
function _templatesToProperties($aTemplates)
{
    $aPageProperties = array();

    foreach ($aTemplates as $aTemplate) {
        $sTemplateName = (string)$aTemplate[0];
        $aParams = _wikiTemplateParams($aTemplate[1]);

        if (!isset($aPageProperties['sOfficialName']) && !empty($aParams['official_name'])) {
            $aPageProperties['sOfficialName'] = $aParams['official_name'];
        }

        if (!isset($aPageProperties['iPopulation'])) {
            // Parameters are tried in order of preference; the value has to start with a number.
            foreach (array('population', 'population_total', 'population_urban', 'population_estimate') as $sKey) {
                if (!empty($aParams[$sKey]) && preg_match('#^[0-9.,]+#', $aParams[$sKey])) {
                    $aPageProperties['iPopulation'] = (int)str_replace(array(',', '.'), '', $aParams[$sKey]);
                    break;
                }
            }
        }

        if (!isset($aPageProperties['sWebsite']) && !empty($aParams['website'])) {
            // Accepts a bare address or an external link of the form "[address label]".
            if (preg_match('#^\\[?([^ \\]]+)[^\\]]*\\]?$#', $aParams['website'], $aMatch)) {
                $sWebsite = $aMatch[1];
                if (strpos($sWebsite, '://') === false) {
                    $sWebsite = 'http://'.$sWebsite;
                }
                $aPageProperties['sWebsite'] = $sWebsite;
            }
        }

        if (!isset($aPageProperties['sTopLevelDomain']) && !empty($aParams['cctld'])) {
            $aPageProperties['sTopLevelDomain'] = str_replace(array('[', ']', '.'), '', $aParams['cctld']);
        }

        if (!isset($aPageProperties['sInfoboxType']) && strtolower(substr($sTemplateName, 0, 7)) == 'infobox') {
            $aPageProperties['sInfoboxType'] = trim((string)substr($sTemplateName, 8));
        }

        // Assume the first template with lots of params is the type (fallback for infobox)
        if (!isset($aPageProperties['sPossibleInfoboxType']) && count($aParams) > 10) {
            $aPageProperties['sPossibleInfoboxType'] = trim($sTemplateName);
        }

        // do we have a lat/lon
        if (!isset($aPageProperties['fLat'])) {
            $aLatLon = _wikiTemplateCoordinates($sTemplateName, $aParams);
            if ($aLatLon !== null) {
                $aPageProperties['fLat'] = $aLatLon[0];
                $aPageProperties['fLon'] = $aLatLon[1];
            }
        }
    }

    if (isset($aPageProperties['sPossibleInfoboxType'])) {
        if (!isset($aPageProperties['sInfoboxType'])) {
            $aPageProperties['sInfoboxType'] = '#'.$aPageProperties['sPossibleInfoboxType'];
        }
        unset($aPageProperties['sPossibleInfoboxType']);
    }

    return $aPageProperties;
}

/**
 * Turn the raw parameter strings of one template into a parameter map: 'key = value'
 * strings become named entries, all others are appended as positional entries.
 */
function _wikiTemplateParams($aRawParams)
{
    $aParams = array();
    foreach (array_reverse($aRawParams) as $sParam) {
        $iPos = strpos($sParam, '=');
        if ($iPos === false) {
            $aParams[] = trim($sParam);
        } else {
            $aParams[trim(substr($sParam, 0, $iPos))] = trim((string)substr($sParam, $iPos + 1));
        }
    }
    return $aParams;
}

/**
 * Return parameter $sKey, or $mDefault when the template does not have it.
 */
function _wikiParam($aParams, $sKey, $mDefault)
{
    return isset($aParams[$sKey]) ? $aParams[$sKey] : $mDefault;
}

/**
 * Check that the positional parameters 0 .. $iCount-1 are all present.
 */
function _wikiHasPositional($aParams, $iCount)
{
    for ($i = 0; $i < $iCount; $i++) {
        if (!isset($aParams[$i])) return false;
    }
    return true;
}

/**
 * Check whether positional parameter $iIndex is the letter N or S (any case).
 */
function _wikiIsNorthSouth($aParams, $iIndex)
{
    return isset($aParams[$iIndex]) && in_array(strtoupper($aParams[$iIndex]), array('N', 'S'), true);
}

/**
 * Find the coordinates of a single template.
 *
 * All known notations are tried in a fixed order and a later match replaces an
 * earlier one. Returns array(lat, lon) or null when the template has no usable pair.
 */
function _wikiTemplateCoordinates($sTemplateName, $aParams)
{
    $aLatLon = null;

    if (isset($aParams['latd']) && isset($aParams['longd'])) {
        $aLatLon = array(
            degreesAndMinutesToDecimal($aParams['latd'], _wikiParam($aParams, 'latm', 0), _wikiParam($aParams, 'lats', 0), _wikiParam($aParams, 'latNS', 'N')),
            degreesAndMinutesToDecimal($aParams['longd'], _wikiParam($aParams, 'longm', 0), _wikiParam($aParams, 'longs', 0), _wikiParam($aParams, 'longEW', 'E')),
        );
    }

    // Both halves are required; a latitude alone would be paired with a missing longitude.
    if (isset($aParams['lat_degrees']) && isset($aParams['long_degrees'])) {
        $aLatLon = array(
            degreesAndMinutesToDecimal($aParams['lat_degrees'], _wikiParam($aParams, 'lat_minutes', 0), _wikiParam($aParams, 'lat_seconds', 0), _wikiParam($aParams, 'lat_direction', 'N')),
            degreesAndMinutesToDecimal($aParams['long_degrees'], _wikiParam($aParams, 'long_minutes', 0), _wikiParam($aParams, 'long_seconds', 0), _wikiParam($aParams, 'long_direction', 'E')),
        );
    }

    if (isset($aParams['latitude']) && isset($aParams['longitude'])) {
        if (preg_match('#[0-9.]+#', $aParams['latitude']) && preg_match('#[0-9.]+#', $aParams['longitude'])) {
            $aLatLon = array((float)$aParams['latitude'], (float)$aParams['longitude']);
        }
    }

    if (strtolower($sTemplateName) == 'coord') {
        $aCoord = _wikiCoordTemplateCoordinates($aParams);
        if ($aCoord !== null) $aLatLon = $aCoord;
    }

    if (isset($aParams['Latitude']) && isset($aParams['Longitude'])) {
        $fLat = _wikiParseDegreeText($aParams['Latitude'], 'NS');
        $fLon = _wikiParseDegreeText($aParams['Longitude'], 'EW');
        if ($fLat !== null && $fLon !== null) {
            $aLatLon = array($fLat, $fLon);
        }
    }

    return $aLatLon;
}

/**
 * Read the positional parameters of a Coord template.
 *
 * The notation is recognised by the position of the N/S letter. When a notation is
 * recognised but parameters are missing, null is returned instead of falling through
 * to another notation that would misread the values.
 */
function _wikiCoordTemplateCoordinates($aParams)
{
    if (_wikiIsNorthSouth($aParams, 3)) {
        // degrees | minutes | seconds | N/S | degrees | minutes | seconds | E/W
        if (!_wikiHasPositional($aParams, 8)) return null;
        return array(
            degreesAndMinutesToDecimal($aParams[0], $aParams[1], $aParams[2], $aParams[3]),
            degreesAndMinutesToDecimal($aParams[4], $aParams[5], $aParams[6], $aParams[7]),
        );
    }

    if (_wikiIsNorthSouth($aParams, 2)) {
        // degrees | minutes | N/S | degrees | minutes | E/W
        if (!_wikiHasPositional($aParams, 6)) return null;
        return array(
            degreesAndMinutesToDecimal($aParams[0], $aParams[1], 0, $aParams[2]),
            degreesAndMinutesToDecimal($aParams[3], $aParams[4], 0, $aParams[5]),
        );
    }

    if (_wikiIsNorthSouth($aParams, 1)) {
        // degrees | N/S | degrees | E/W
        if (!_wikiHasPositional($aParams, 4)) return null;
        return array(
            (strtoupper($aParams[1]) == 'N' ? 1 : -1) * (float)$aParams[0],
            (strtoupper($aParams[3]) == 'E' ? 1 : -1) * (float)$aParams[2],
        );
    }

    // signed decimal latitude | signed decimal longitude
    if (isset($aParams[0]) && is_numeric($aParams[0]) && isset($aParams[1]) && is_numeric($aParams[1])) {
        return array((float)$aParams[0], (float)$aParams[1]);
    }

    return null;
}

/**
 * Parse a textual angle such as "10° 30′ N" or a range "10° N to 20° N" (the midpoint
 * of a range is used). $sHemispheres holds the two accepted letters ('NS' or 'EW').
 * Returns decimal degrees or null when the text does not match.
 */
function _wikiParseDegreeText($sText, $sHemispheres)
{
    // UTF-8 byte sequences of the degree sign (U+00B0) and the prime (U+2032).
    $sDegree = "\xC2\xB0";
    $sPrime = "\xE2\x80\xB2";

    // The markup may carry the special characters as HTML entities or as literal
    // characters (including a literal non-breaking space); normalise to one spelling.
    $sText = str_replace(array('&nbsp;', "\xC2\xA0"), ' ', $sText);
    $sText = str_replace(array('&deg;', '&prime;'), array($sDegree, $sPrime), $sText);

    $sAngle = '([0-9]+)'.$sDegree.'(?: ([0-9]+)'.$sPrime.')? (['.$sHemispheres.'])';

    if (preg_match('#^'.$sAngle.' to '.$sAngle.'#', $sText, $aMatch)) {
        return (degreesAndMinutesToDecimal($aMatch[1], $aMatch[2], 0, $aMatch[3])
                + degreesAndMinutesToDecimal($aMatch[4], $aMatch[5], 0, $aMatch[6])) / 2;
    }
    if (preg_match('#^'.$sAngle.'#', $sText, $aMatch)) {
        return degreesAndMinutesToDecimal($aMatch[1], $aMatch[2], 0, $aMatch[3]);
    }

    return null;
}
// Parse stored article texts and write the extracted properties back to wikipedia_article.
// Runs when $aCMDResult['parse-wikipedia'] is set; its value selects the articles whose
// page_id modulo 10 equals it.
// NOTE(review): the option value is concatenated into the SQL text without escaping or an
// integer cast, and 'parse-wikipedia' is not among the options declared in the lines shown
// at the top of the script — confirm how it is supplied.
305 if (isset($aCMDResult['parse-wikipedia'])) {
// Candidate articles: main namespace pages mentioning a Coord template or both 'lat' and 'lon'.
307 $aArticleNames = $oDB->getCol('select page_title from content where page_namespace = 0 and page_id %10 = '.$aCMDResult['parse-wikipedia'].' and (page_content ilike \'%{{Coord%\' or (page_content ilike \'%lat%\' and page_content ilike \'%lon%\'))');
308 // $aArticleNames = $oDB->getCol($sSQL = 'select page_title from content where page_namespace = 0 and (page_content ilike \'%{{Coord%\' or (page_content ilike \'%lat%\' and page_content ilike \'%lon%\')) and page_title in (\'Virginia\')');
309 foreach ($aArticleNames as $sArticleName) {
// One extra query per article to fetch its text, which is then parsed into a property map.
310 $sPageText = $oDB->getOne('select page_content from content where page_namespace = 0 and page_title = \''.pg_escape_string($sArticleName).'\'');
311 $aP = _templatesToProperties(_parseWikipediaContent($sPageText));
// Each property found gets its own UPDATE statement for the English article of that title.
// NOTE(review): the statements that execute $sSQL are not among the lines shown here.
313 if (isset($aP['sInfoboxType'])) {
// Collapse runs of whitespace in the infobox type to a single space before storing it.
314 $aP['sInfoboxType'] = preg_replace('#\\s+#',' ',$aP['sInfoboxType']);
315 $sSQL = 'update wikipedia_article set ';
316 $sSQL .= 'infobox_type = \''.pg_escape_string($aP['sInfoboxType']).'\'';
317 $sSQL .= ' where language = \'en\' and title = \''.pg_escape_string($sArticleName).'\';';
320 if (isset($aP['iPopulation'])) {
321 $sSQL = 'update wikipedia_article set ';
322 $sSQL .= 'population = \''.pg_escape_string($aP['iPopulation']).'\'';
323 $sSQL .= ' where language = \'en\' and title = \''.pg_escape_string($sArticleName).'\';';
326 if (isset($aP['sWebsite'])) {
327 $sSQL = 'update wikipedia_article set ';
328 $sSQL .= 'website = \''.pg_escape_string($aP['sWebsite']).'\'';
329 $sSQL .= ' where language = \'en\' and title = \''.pg_escape_string($sArticleName).'\';';
// Coordinates are skipped only when both latitude and longitude compare equal to '-0'.
// NOTE(review): $aP['fLon'] is read here after testing only $aP['fLat'] with isset().
332 if (isset($aP['fLat']) && ($aP['fLat']!='-0' || $aP['fLon']!='-0')) {
333 if (!isset($aP['sInfoboxType'])) $aP['sInfoboxType'] = '';
// Progress line: title|infobox type|lat|lon
334 echo $sArticleName.'|'.$aP['sInfoboxType'].'|'.$aP['fLat'].'|'.$aP['fLon'] ."\n";
335 $sSQL = 'update wikipedia_article set ';
336 $sSQL .= 'lat = \''.pg_escape_string($aP['fLat']).'\',';
337 $sSQL .= 'lon = \''.pg_escape_string($aP['fLon']).'\'';
338 $sSQL .= ' where language = \'en\' and title = \''.pg_escape_string($sArticleName).'\';';
// Start-element callback registered through xml_set_element_handler(): appends the attribute
// array of an element to the global list $aNominatRecords.
// NOTE(review): whether this is limited to particular element names ($sName) cannot be told
// from the lines shown — confirm. The consumer reads upper-case attribute keys such as 'LAT'.
344 function nominatimXMLStart($hParser, $sName, $aAttr)
346 global $aNominatRecords;
349 $aNominatRecords[] = $aAttr;
// End-element callback registered through xml_set_element_handler() together with
// nominatimXMLStart(). Its body is not among the lines shown here.
354 function nominatimXMLEnd($hParser, $sName)
// Try to link wikipedia articles to OSM objects: every article is looked up by name on a
// Nominatim search endpoint and the returned places are compared with the article position.
359 if (isset($aCMDResult['link'])) {
// Work list: English articles with a latitude and no OSM link yet, most important first.
361 $aWikiArticles = $oDB->getAll("select * from wikipedia_article where language = 'en' and lat is not null and osm_type is null and totalcount < 31 order by importance desc limit 200000");
363 // If you point this script at production OSM you will be blocked
// NOTE(review): 'SEVERNAME' looks like a placeholder (and a typo of SERVERNAME) that has to be
// replaced with a real host before running — confirm.
364 $sNominatimBaseURL = 'http://SEVERNAME/search.php';
366 foreach ($aWikiArticles as $aRecord) {
// Article titles use underscores where the search name needs spaces.
367 $aRecord['name'] = str_replace('_',' ',$aRecord['title']);
369 $sURL = $sNominatimBaseURL.'?format=xml&accept-language=en';
371 echo "\n-- ".$aRecord['name'].", ".$aRecord['infobox_type']."\n";
372 $fMaxDist = 0.0000001;
// Choose search restrictions (feature type, viewbox around the article position, near point)
// from the infobox type.
// NOTE(review): several case labels and all break statements of this switch are not among
// the lines shown here.
374 switch (strtolower($aRecord['infobox_type'])) {
375 case 'former country':
378 $fMaxDist = 60; // effectively turn it off
379 $sURL .= "&viewbox=".($aRecord['lon']-$fMaxDist).",".($aRecord['lat']+$fMaxDist).",".($aRecord['lon']+$fMaxDist).",".($aRecord['lat']-$fMaxDist);
385 $fMaxDist = 60; // effectively turn it off
386 $sURL .= "&featuretype=country";
387 $sURL .= "&viewbox=".($aRecord['lon']-$fMaxDist).",".($aRecord['lat']+$fMaxDist).",".($aRecord['lon']+$fMaxDist).",".($aRecord['lat']-$fMaxDist);
389 case 'prefecture japan':
// Search for the bare name without the ' Prefecture' suffix.
390 $aRecord['name'] = trim(str_replace(' Prefecture',' ', $aRecord['name']));
395 case 'u.s. state symbols':
// NOTE(review): the next two labels end in ';' instead of ':' — PHP accepts this, but it is
// inconsistent with the other labels.
397 case 'province or territory of canada';
398 case 'indian jurisdiction';
400 case 'french region':
401 case 'region of italy':
403 case '#australia state or territory':
404 case 'russian federal subject':
406 $sURL .= "&featuretype=state";
407 $sURL .= "&viewbox=".($aRecord['lon']-$fMaxDist).",".($aRecord['lat']+$fMaxDist).",".($aRecord['lon']+$fMaxDist).",".($aRecord['lat']-$fMaxDist);
409 case 'protected area':
411 $sURL .= "&nearlat=".$aRecord['lat'];
412 $sURL .= "&nearlon=".$aRecord['lon'];
413 $sURL .= "&viewbox=".($aRecord['lon']-$fMaxDist).",".($aRecord['lat']+$fMaxDist).",".($aRecord['lon']+$fMaxDist).",".($aRecord['lat']-$fMaxDist);
417 case 'french commune':
// NOTE(review): 'italian comune' is listed twice below; the second label is redundant.
418 case 'italian comune':
420 case 'italian comune':
421 case 'australian place':
427 case 'russian inhabited locality':
428 case 'finnish municipality/land area':
429 case 'england county':
430 case 'israel municipality':
434 $sURL .= "&featuretype=settlement";
435 $sURL .= "&viewbox=".($aRecord['lon']-0.5).",".($aRecord['lat']+0.5).",".($aRecord['lon']+0.5).",".($aRecord['lat']-0.5);
438 case 'mountain pass':
443 $sURL .= "&viewbox=".($aRecord['lon']-0.5).",".($aRecord['lat']+0.5).",".($aRecord['lon']+0.5).",".($aRecord['lat']-0.5);
447 $aTypes = array('wreck');
448 $sURL .= "&viewbox=".($aRecord['lon']-0.01).",".($aRecord['lat']+0.01).",".($aRecord['lon']+0.01).",".($aRecord['lat']-0.01);
449 $sURL .= "&nearlat=".$aRecord['lat'];
450 $sURL .= "&nearlon=".$aRecord['lon'];
457 $sURL .= "&viewbox=".($aRecord['lon']-0.01).",".($aRecord['lat']+0.01).",".($aRecord['lon']+0.01).",".($aRecord['lat']-0.01);
458 $sURL .= "&bounded=1";
459 $sURL .= "&nearlat=".$aRecord['lat'];
460 $sURL .= "&nearlon=".$aRecord['lon'];
465 $sURL .= "&viewbox=".($aRecord['lon']-0.01).",".($aRecord['lat']+0.01).",".($aRecord['lon']+0.01).",".($aRecord['lat']-0.01);
466 // $sURL .= "&bounded=1";
467 $sURL .= "&nearlat=".$aRecord['lat'];
468 $sURL .= "&nearlon=".$aRecord['lon'];
469 echo "-- Unknown: ".$aRecord['infobox_type']."\n";
// First attempt: search for the full article name.
472 $sNameURL = $sURL.'&q='.urlencode($aRecord['name']);
// NOTE(review): the result of file_get_contents() is passed to the XML parser without a
// check for false (failed request).
475 $sXML = file_get_contents($sNameURL);
// The element handlers fill the global $aNominatRecords while the response is parsed.
477 $aNominatRecords = array();
478 $hXMLParser = xml_parser_create();
479 xml_set_element_handler($hXMLParser, 'nominatimXMLStart', 'nominatimXMLEnd');
480 xml_parse($hXMLParser, $sXML, true);
481 xml_parser_free($hXMLParser);
// Second attempt when nothing was found: retry with the part of the name before the first
// '(' or ','.
483 if (!isset($aNominatRecords[0])) {
484 $aNameParts = preg_split('#[(,]#',$aRecord['name']);
485 if (sizeof($aNameParts) > 1) {
486 $sNameURL = $sURL.'&q='.urlencode(trim($aNameParts[0]));
488 $sXML = file_get_contents($sNameURL);
490 $aNominatRecords = array();
491 $hXMLParser = xml_parser_create();
492 xml_set_element_handler($hXMLParser, 'nominatimXMLStart', 'nominatimXMLEnd');
493 xml_parse($hXMLParser, $sXML, true);
// NOTE(review): the trailing '#' on the next line starts an empty comment and looks accidental.
494 xml_parser_free($hXMLParser);#
498 // assume first is best/right
499 for ($i = 0; $i < sizeof($aNominatRecords); $i++) {
// Straight-line distance in degrees between the article position and the search result.
500 $fDiff = ($aRecord['lat']-$aNominatRecords[$i]['LAT']) * ($aRecord['lat']-$aNominatRecords[$i]['LAT']);
501 $fDiff += ($aRecord['lon']-$aNominatRecords[$i]['LON']) * ($aRecord['lon']-$aNominatRecords[$i]['LON']);
502 $fDiff = sqrt($fDiff);
504 // If it was an unknown type base it on the rank of the found result
// NOTE(review): no condition restricting this to unknown types is visible in the lines shown;
// as listed, $fMaxDist is replaced for every result. The last two branches also assign the
// same value.
505 $iRank = (int)$aNominatRecords[$i]['PLACE_RANK'];
506 if ($iRank <= 4) $fMaxDist = 2;
507 elseif ($iRank <= 8) $fMaxDist = 1;
508 elseif ($iRank <= 10) $fMaxDist = 0.8;
509 elseif ($iRank <= 12) $fMaxDist = 0.6;
510 elseif ($iRank <= 17) $fMaxDist = 0.2;
511 elseif ($iRank <= 18) $fMaxDist = 0.1;
512 elseif ($iRank <= 22) $fMaxDist = 0.02;
513 elseif ($iRank <= 26) $fMaxDist = 0.001;
514 else $fMaxDist = 0.001;
516 echo "-- FOUND \"".substr($aNominatRecords[$i]['DISPLAY_NAME'],0,50)."\", ".$aNominatRecords[$i]['CLASS'].", ".$aNominatRecords[$i]['TYPE'].", ".$aNominatRecords[$i]['PLACE_RANK'].", ".$aNominatRecords[$i]['OSM_TYPE']." (dist:$fDiff, max:$fMaxDist)\n";
// Results further away than the allowed distance are reported; otherwise an UPDATE storing
// the OSM type letter and id on the article row is built.
517 if ($fDiff > $fMaxDist) {
518 echo "-- Diff too big $fDiff (max: $fMaxDist)".$aRecord['lat'].','.$aNominatRecords[$i]['LAT'].' & '.$aRecord['lon'].','.$aNominatRecords[$i]['LON']." \n";
520 $sSQL = "update wikipedia_article set osm_type=";
// NOTE(review): an OSM_TYPE other than these three appends nothing, which leaves the
// statement syntactically incomplete.
521 switch ($aNominatRecords[$i]['OSM_TYPE']) {
522 case 'relation': $sSQL .= "'R'"; break;
523 case 'way': $sSQL .= "'W'"; break;
524 case 'node': $sSQL .= "'N'"; break;
// NOTE(review): OSM_ID comes from the HTTP response and is concatenated without escaping or
// an integer cast.
526 $sSQL .= ", osm_id=".$aNominatRecords[$i]['OSM_ID']." where language = '".pg_escape_string($aRecord['language'])."' and title = '".pg_escape_string($aRecord['title'])."'";