// Bootstrap: load the site settings, then the command-line helper library
// located under CONST_BasePath.
4 require_once(dirname(dirname(__FILE__)).'/settings/settings.php');
5 require_once(CONST_BasePath.'/lib/init-cmd.php');
// Raise PHP's memory limit to 800M for this script.
6 ini_set('memory_limit', '800M');
// Option table handed to getCmdOpt() below: a description string followed by
// one array per option (long name, short flag, ..., type, help text).
// NOTE(review): the meaning of the numeric fields is defined by getCmdOpt(),
// which is not visible here.
// NOTE(review): the description mentions the search system although the
// options are about wikipedia data - looks copied from another script; confirm.
// NOTE(review): later code reads 'drop-tables' and 'parse-wikipedia', neither
// of which is declared in the visible part of this table.
10 "Create and setup nominatim search system",
11 array('help', 'h', 0, 1, 0, 0, false, 'Show Help'),
12 array('quiet', 'q', 0, 1, 0, 0, 'bool', 'Quiet output'),
13 array('verbose', 'v', 0, 1, 0, 0, 'bool', 'Verbose output'),
15 array('create-tables', '', 0, 1, 0, 0, 'bool', 'Create wikipedia tables'),
16 array('parse-articles', '', 0, 1, 0, 0, 'bool', 'Parse wikipedia articles'),
17 array('link', '', 0, 1, 0, 0, 'bool', 'Try to link to existing OSM ids'),
// Parse the process arguments against the table above into $aCMDResult.
19 getCmdOpt($_SERVER['argv'], $aCMDOptions, $aCMDResult, true, true);
// Manual test fixture: sample wikitext (a {{Coord}} template followed by an
// amusement-park infobox) that is run through _parseWikipediaContent() and
// _templatesToProperties(), with the resulting properties dumped.
// NOTE(review): the heredoc terminator is not visible in this view, and the
// fixture may be disabled in the full file - confirm before relying on it.
22 $sTestPageText = <<<EOD
23 {{Coord|47|N|2|E|type:country_region:FR|display=title}}
24 {{ Infobox Amusement park
25 | name = Six Flags Great Adventure
26 | image = [[File:SixFlagsGreatAdventure logo.png]]
27 | caption = Six Flags Great Adventure logo
28 | location = [[Jackson, New Jersey|Jackson]]
29 | location2 = New Jersey
30 | location3 = United States
31 | address = 1 Six Flags Boulevard<ref name="drivedir"/>
32 | season = March/April through October/November
33 | opening_date = July 1, 1974
34 | previous_names = Great Adventure
36 | rides = 45 park admission rides
39 | owner = [[Six Flags]]
41 | homepage = [http://www.sixflags.com/parks/greatadventure/ Six Flags Great Adventure]
44 var_dump(_templatesToProperties(_parseWikipediaContent($sTestPageText)));
46 //| coordinates = {{Coord|40|08|16.65|N|74|26|26.69|W|region:US-NJ_type:landmark|display=inline,title}}
// Optional teardown: drop both wikipedia tables.
// NOTE(review): 'drop-tables' is not among the options declared in the visible
// option table, so this index may be undefined - confirm.
55 if ($aCMDResult['drop-tables'])
57 $oDB->query('DROP TABLE wikipedia_article');
58 $oDB->query('DROP TABLE wikipedia_link');
// Optional setup: create the wikipedia_article and wikipedia_link tables.
// The lines below that are not PHP are the SQL text of the CREATE statements;
// the surrounding string delimiters are not visible in this view.
62 if ($aCMDResult['create-tables']) {
64 CREATE TABLE wikipedia_article (
65 language text NOT NULL,
72 importance double precision,
74 osm_type character(1),
// Add a 2-dimensional geometry column 'location' (SRID 4326) to wikipedia_article.
82 $oDB->query("SELECT AddGeometryColumn('wikipedia_article', 'location', 4326, 'GEOMETRY', 2)");
85 CREATE TABLE wikipedia_link (
/**
 * Convert a degrees/minutes/seconds coordinate into signed decimal degrees.
 *
 * Missing components (null or '') count as zero and a missing hemisphere is
 * treated as positive, so optional template parameters can be passed straight
 * through.
 *
 * @param mixed $iDegrees Degrees as a number or numeric string. A negative
 *                        value flips the sign of the whole coordinate.
 * @param mixed $iMinutes Arc minutes (optional).
 * @param mixed $fSeconds Arc seconds (optional).
 * @param mixed $sNSEW    Hemisphere letter; 'S' and 'W' negate the result.
 *                        Case and surrounding whitespace are ignored.
 *
 * @return float Decimal degrees.
 */
function degreesAndMinutesToDecimal($iDegrees, $iMinutes = 0, $fSeconds = 0, $sNSEW = 'N')
{
    // Callers hand in null for absent parameters; cast before strtoupper()
    // because passing null to it is deprecated since PHP 8.1.
    $sNSEW = strtoupper(trim((string)$sNSEW));
    $iSign = ($sNSEW == 'S' || $sNSEW == 'W') ? -1 : 1;

    // Minutes and seconds always extend the magnitude: -40 deg 30 min is
    // -40.5, not -39.5. Work on the absolute value and carry the sign apart.
    $fDegrees = (float)$iDegrees;
    if ($fDegrees < 0) {
        $iSign = -$iSign;
    }

    return $iSign * (abs($fDegrees) + (float)$iMinutes / 60 + (float)$fSeconds / 3600);
}
/**
 * Tokenise wikitext and collect the templates it contains.
 *
 * The text is split on the delimiters {{ }} [[ ]] and | and the tokens are
 * walked with a state stack ('body', 'template', 'templateparam', 'link',
 * 'linksynonim'). Each finished template is recorded as array(name, params),
 * with params stored newest first because they are added with array_unshift().
 *
 * NOTE(review): several lines of this function (the case labels and the
 * return statement) are not visible in this view. Callers pass the result to
 * _templatesToProperties(), so it presumably returns $aTemplates - confirm.
 *
 * @param string $sPageText Raw wikitext of one page.
 */
99 function _parseWikipediaContent($sPageText)
// Flatten the page to a single line, then strip HTML comments and <math> blocks.
101 $sPageText = str_replace("\n", ' ', $sPageText);
102 $sPageText = preg_replace('#<!--.*?-->#m', '', $sPageText);
103 $sPageText = preg_replace('#<math>.*?<\\/math>#m', '', $sPageText);
// Split on the wikitext delimiters, keeping the delimiters as tokens.
105 $aPageText = preg_split('#({{|}}|\\[\\[|\\]\\]|[|])#', $sPageText, -1, PREG_SPLIT_DELIM_CAPTURE);
// NOTE(review): $aPageProperties is not used in the visible part of this function.
107 $aPageProperties = array();
109 $aTemplates = array();
// Index 0 of both stacks is the innermost open template / the current state.
112 $aTemplateStack = array();
113 $aState = array('body');
114 foreach ($aPageText as $i => $sPart) {
// Template opens: push an empty (name, params) frame and enter 'template' state.
117 array_unshift($aTemplateStack, array('', array()));
118 array_unshift($aState, 'template');
// Template closes: pop the frame and the state, and record the finished template.
121 if ($aState[0] == 'template' || $aState[0] == 'templateparam') {
122 $aTemplate = array_shift($aTemplateStack);
123 array_shift($aState);
125 $aTemplates[] = $aTemplate;
// Link opens.
131 array_unshift($aState, 'link');
// Link closes: the display text defaults to the page name; for 'Image:' links
// it becomes the page name without that prefix.
134 if ($aState[0] == 'link' || $aState[0] == 'linksynonim') {
135 if (!$sLinkSyn) $sLinkSyn = $sLinkPage;
136 if (substr($sLinkPage, 0, 6) == 'Image:') $sLinkSyn = substr($sLinkPage, 6);
138 $aLinks[] = array($sLinkPage, $sLinkSyn);
// Leave the link state and hand the link to whatever context encloses it.
140 array_shift($aState);
141 switch ($aState[0]) {
// NOTE(review): this branch and the one two statements below append $sPart
// where the neighbouring branches append $sLinkSyn - confirm which is intended.
143 $aTemplateStack[0][0] .= trim($sPart);
145 case 'templateparam':
146 $aTemplateStack[0][1][0] .= $sLinkSyn;
149 $sLinkPage .= trim($sPart);
155 $sPageBody .= $sLinkSyn;
// NOTE(review): $sPageName is not defined anywhere in the visible function.
158 var_dump($aState, $sPageName, $aTemplateStack, $sPart, $aPageText);
159 fail('unknown state');
164 if ($aState[0] == 'template' || $aState[0] == 'templateparam') {
165 // Create a new template parameter
166 $aState[0] = 'templateparam';
167 array_unshift($aTemplateStack[0][1], '');
// Inside a link the text that follows belongs to the link's display text.
169 if ($aState[0] == 'link') $aState[0] = 'linksynonim';
// Plain text token: append it to whatever the current state is collecting.
172 switch ($aState[0]) {
174 $aTemplateStack[0][0] .= trim($sPart);
176 case 'templateparam':
177 $aTemplateStack[0][1][0] .= $sPart;
180 $sLinkPage .= trim($sPart);
186 $sPageBody .= $sPart;
189 var_dump($aState, $aPageText);
190 fail('unknown state');
/**
 * Distil page-level properties from the templates found on a Wikipedia page.
 *
 * Templates are visited in order and, for each property, the first template
 * that supplies a usable value wins.
 *
 * @param array $aTemplates List of array(name, params) pairs. Params are raw
 *                          "key=value" or positional strings stored newest
 *                          first; they are reversed before being read.
 *
 * @return array Subset of the keys sOfficialName, iPopulation, sWebsite,
 *               sTopLevelDomain, sInfoboxType, fLat and fLon. fLat and fLon
 *               are only ever set together.
 */
function _templatesToProperties($aTemplates)
{
    $aPageProperties = array();

    // Population parameters in order of preference.
    $aPopulationKeys = array('population', 'population_total', 'population_urban', 'population_estimate');

    // Named degree/minute/second layouts:
    // latitude d, m, s, hemisphere followed by longitude d, m, s, hemisphere.
    $aDMSLayouts = array(
        array('latd', 'latm', 'lats', 'latNS', 'longd', 'longm', 'longs', 'longEW'),
        array(
            'lat_degrees', 'lat_minutes', 'lat_seconds', 'lat_direction',
            'long_degrees', 'long_minutes', 'long_seconds', 'long_direction',
        ),
    );

    $aNorthSouth = array('N', 'S');

    // Non-breaking spaces (HTML entity or raw UTF-8) would defeat the
    // space-separated coordinate patterns used further down.
    $aNbsp = array('&nbsp;', "\xC2\xA0");

    foreach ($aTemplates as $aTemplate) {
        // Restore source order and split "key=value" parameters; parameters
        // without '=' are positional and get consecutive numeric keys.
        $aParams = array();
        foreach (array_reverse($aTemplate[1]) as $sParam) {
            $iPos = strpos($sParam, '=');
            if ($iPos === false) {
                $aParams[] = trim($sParam);
            } else {
                $aParams[trim(substr($sParam, 0, $iPos))] = trim(substr($sParam, $iPos + 1));
            }
        }

        if (!isset($aPageProperties['sOfficialName'])
            && isset($aParams['official_name']) && $aParams['official_name']
        ) {
            $aPageProperties['sOfficialName'] = $aParams['official_name'];
        }

        // The value must start with a number; thousands separators are dropped
        // and the integer cast discards any trailing annotation.
        foreach ($aPopulationKeys as $sKey) {
            if (isset($aPageProperties['iPopulation'])) {
                break;
            }
            if (isset($aParams[$sKey]) && $aParams[$sKey] && preg_match('#^[0-9.,]+#', $aParams[$sKey])) {
                $aPageProperties['iPopulation'] = (int)str_replace(array(',', '.'), '', $aParams[$sKey]);
            }
        }

        if (!isset($aPageProperties['sWebsite']) && isset($aParams['website']) && $aParams['website']) {
            // Accept a bare URL or an external link "[url label]"; keep the URL only.
            if (preg_match('#^\\[?([^ \\]]+)[^\\]]*\\]?$#', $aParams['website'], $aMatch)) {
                $sWebsite = $aMatch[1];
                if (strpos($sWebsite, ':/'.'/') === false) {
                    $sWebsite = 'http:/'.'/'.$sWebsite;
                }
                $aPageProperties['sWebsite'] = $sWebsite;
            }
        }

        if (!isset($aPageProperties['sTopLevelDomain']) && isset($aParams['cctld']) && $aParams['cctld']) {
            $aPageProperties['sTopLevelDomain'] = str_replace(array('[', ']', '.'), '', $aParams['cctld']);
        }

        if (!isset($aPageProperties['sInfoboxType']) && strtolower(substr($aTemplate[0], 0, 7)) == 'infobox') {
            // Skip "Infobox" plus the single separator character that follows it.
            $aPageProperties['sInfoboxType'] = trim((string)substr($aTemplate[0], 8));
        }

        // Assume the first template with lots of params is the type (fallback for infobox)
        if (!isset($aPageProperties['sPossibleInfoboxType']) && count($aParams) > 10) {
            $aPageProperties['sPossibleInfoboxType'] = trim($aTemplate[0]);
        }

        // Coordinates: every notation this template matches is evaluated in
        // turn, later notations overriding earlier ones.
        if (!isset($aPageProperties['fLat'])) {
            $fLat = null;
            $fLon = null;

            foreach ($aDMSLayouts as $aKeys) {
                // Both degree parameters are required; the rest default to empty.
                if (!isset($aParams[$aKeys[0]]) || !isset($aParams[$aKeys[4]])) {
                    continue;
                }
                $aValues = array();
                foreach ($aKeys as $sKey) {
                    $aValues[] = isset($aParams[$sKey]) ? $aParams[$sKey] : '';
                }
                $fLat = degreesAndMinutesToDecimal($aValues[0], $aValues[1], $aValues[2], $aValues[3]);
                $fLon = degreesAndMinutesToDecimal($aValues[4], $aValues[5], $aValues[6], $aValues[7]);
            }

            if (isset($aParams['latitude']) && isset($aParams['longitude'])
                && preg_match('#[0-9.]+#', $aParams['latitude'])
                && preg_match('#[0-9.]+#', $aParams['longitude'])
            ) {
                $fLat = (float)$aParams['latitude'];
                $fLon = (float)$aParams['longitude'];
            }

            if (strtolower($aTemplate[0]) == 'coord') {
                // The position of the N/S marker identifies the notation. A
                // template too short for its notation yields no coordinate.
                if (isset($aParams[3]) && in_array(strtoupper($aParams[3]), $aNorthSouth, true)) {
                    // d|m|s|NS|d|m|s|EW
                    if (isset($aParams[0], $aParams[1], $aParams[2], $aParams[4], $aParams[5], $aParams[6], $aParams[7])) {
                        $fLat = degreesAndMinutesToDecimal($aParams[0], $aParams[1], $aParams[2], $aParams[3]);
                        $fLon = degreesAndMinutesToDecimal($aParams[4], $aParams[5], $aParams[6], $aParams[7]);
                    }
                } elseif (isset($aParams[0], $aParams[1], $aParams[2])
                    && in_array(strtoupper($aParams[2]), $aNorthSouth, true)
                ) {
                    // d|m|NS|d|m|EW
                    if (isset($aParams[3], $aParams[4], $aParams[5])) {
                        $fLat = degreesAndMinutesToDecimal($aParams[0], $aParams[1], 0, $aParams[2]);
                        $fLon = degreesAndMinutesToDecimal($aParams[3], $aParams[4], 0, $aParams[5]);
                    }
                } elseif (isset($aParams[0], $aParams[1])
                    && in_array(strtoupper($aParams[1]), $aNorthSouth, true)
                ) {
                    // d|NS|d|EW
                    if (isset($aParams[2], $aParams[3])) {
                        $fLat = (strtoupper($aParams[1]) == 'N' ? 1 : -1) * (float)$aParams[0];
                        $fLon = (strtoupper($aParams[3]) == 'E' ? 1 : -1) * (float)$aParams[2];
                    }
                } elseif (isset($aParams[0], $aParams[1]) && is_numeric($aParams[0]) && is_numeric($aParams[1])) {
                    // lat|lon in decimal degrees
                    $fLat = (float)$aParams[0];
                    $fLon = (float)$aParams[1];
                }
            }

            if (isset($aParams['Latitude']) && isset($aParams['Longitude'])) {
                $sLatitude = str_replace($aNbsp, ' ', $aParams['Latitude']);
                $sLongitude = str_replace($aNbsp, ' ', $aParams['Longitude']);

                // A range ("a to b") is reduced to its midpoint.
                if (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([NS]) to ([0-9]+)°( ([0-9]+)′)? ([NS])#', $sLatitude, $aMatch)) {
                    $fLat = (degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4])
                        + degreesAndMinutesToDecimal($aMatch[5], $aMatch[7], 0, $aMatch[8])) / 2;
                } elseif (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([NS])#', $sLatitude, $aMatch)) {
                    $fLat = degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4]);
                }

                if (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([EW]) to ([0-9]+)°( ([0-9]+)′)? ([EW])#', $sLongitude, $aMatch)) {
                    $fLon = (degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4])
                        + degreesAndMinutesToDecimal($aMatch[5], $aMatch[7], 0, $aMatch[8])) / 2;
                } elseif (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([EW])#', $sLongitude, $aMatch)) {
                    $fLon = degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4]);
                }
            }

            // Half a coordinate is useless to callers, which read both keys.
            if ($fLat !== null && $fLon !== null) {
                $aPageProperties['fLat'] = $fLat;
                $aPageProperties['fLon'] = $fLon;
            }
        }
    }

    // Without a real infobox, fall back to the guessed type, marked with '#'.
    if (isset($aPageProperties['sPossibleInfoboxType'])) {
        if (!isset($aPageProperties['sInfoboxType'])) {
            $aPageProperties['sInfoboxType'] = '#'.$aPageProperties['sPossibleInfoboxType'];
        }
        unset($aPageProperties['sPossibleInfoboxType']);
    }

    return $aPageProperties;
}
// Parse step: for every candidate article, extract properties from its
// wikitext and build UPDATE statements for wikipedia_article.
// NOTE(review): the option read here is 'parse-wikipedia', while the visible
// option table declares 'parse-articles' - confirm which name is intended.
// NOTE(review): the statements that execute $sSQL are not visible in this view.
306 if (isset($aCMDResult['parse-wikipedia'])) {
// Candidates: main-namespace pages whose id modulo 10 equals the option value
// and whose content mentions {{Coord or both 'lat' and 'lon'.
// NOTE(review): the option value is concatenated into the SQL unescaped; an
// integer cast would make this safe.
308 $aArticleNames = $oDB->getCol('select page_title from content where page_namespace = 0 and page_id %10 = '.$aCMDResult['parse-wikipedia'].' and (page_content ilike \'%{{Coord%\' or (page_content ilike \'%lat%\' and page_content ilike \'%lon%\'))');
309 // $aArticleNames = $oDB->getCol($sSQL = 'select page_title from content where page_namespace = 0 and (page_content ilike \'%{{Coord%\' or (page_content ilike \'%lat%\' and page_content ilike \'%lon%\')) and page_title in (\'Virginia\')');
310 foreach ($aArticleNames as $sArticleName) {
311 $sPageText = $oDB->getOne('select page_content from content where page_namespace = 0 and page_title = \''.pg_escape_string($sArticleName).'\'');
312 $aP = _templatesToProperties(_parseWikipediaContent($sPageText));
// Infobox type, with runs of whitespace collapsed to a single space.
314 if (isset($aP['sInfoboxType'])) {
315 $aP['sInfoboxType'] = preg_replace('#\\s+#', ' ', $aP['sInfoboxType']);
316 $sSQL = 'update wikipedia_article set ';
317 $sSQL .= 'infobox_type = \''.pg_escape_string($aP['sInfoboxType']).'\'';
318 $sSQL .= ' where language = \'en\' and title = \''.pg_escape_string($sArticleName).'\';';
321 if (isset($aP['iPopulation'])) {
322 $sSQL = 'update wikipedia_article set ';
323 $sSQL .= 'population = \''.pg_escape_string($aP['iPopulation']).'\'';
324 $sSQL .= ' where language = \'en\' and title = \''.pg_escape_string($sArticleName).'\';';
327 if (isset($aP['sWebsite'])) {
328 $sSQL = 'update wikipedia_article set ';
329 $sSQL .= 'website = \''.pg_escape_string($aP['sWebsite']).'\'';
330 $sSQL .= ' where language = \'en\' and title = \''.pg_escape_string($sArticleName).'\';';
// Coordinates are skipped when both values compare equal to '-0'. The
// comparison is loose, so a plain 0/0 pair is skipped as well.
// NOTE(review): $aP['fLon'] is read without an isset() check.
333 if (isset($aP['fLat']) && ($aP['fLat']!='-0' || $aP['fLon']!='-0')) {
334 if (!isset($aP['sInfoboxType'])) $aP['sInfoboxType'] = '';
// Progress line: title|infobox type|lat|lon
335 echo $sArticleName.'|'.$aP['sInfoboxType'].'|'.$aP['fLat'].'|'.$aP['fLon'] ."\n";
336 $sSQL = 'update wikipedia_article set ';
337 $sSQL .= 'lat = \''.pg_escape_string($aP['fLat']).'\',';
338 $sSQL .= 'lon = \''.pg_escape_string($aP['fLon']).'\'';
339 $sSQL .= ' where language = \'en\' and title = \''.pg_escape_string($sArticleName).'\';';
/**
 * Start-element callback registered through xml_set_element_handler().
 *
 * Appends the element's attribute array to the global $aNominatRecords list.
 * NOTE(review): any filtering on $sName is not visible in this view.
 *
 * @param mixed  $hParser XML parser invoking the callback.
 * @param string $sName   Element name.
 * @param array  $aAttr   Element attributes.
 */
345 function nominatimXMLStart($hParser, $sName, $aAttr)
347 global $aNominatRecords;
350 $aNominatRecords[] = $aAttr;
/**
 * End-element callback registered through xml_set_element_handler().
 * NOTE(review): the body is not visible in this view.
 *
 * @param mixed  $hParser XML parser invoking the callback.
 * @param string $sName   Element name.
 */
355 function nominatimXMLEnd($hParser, $sName)
// Link step: look up each article by name in a Nominatim instance and store
// the OSM type/id of a result that lies close enough to the article's
// coordinates.
// NOTE(review): many case labels and closing braces of this section are not
// visible in this view; the comments below describe the visible statements only.
360 if (isset($aCMDResult['link'])) {
// English articles with coordinates and no OSM link yet, most important first.
362 $aWikiArticles = $oDB->getAll("select * from wikipedia_article where language = 'en' and lat is not null and osm_type is null and totalcount < 31 order by importance desc limit 200000");
364 // If you point this script at production OSM you will be blocked
// NOTE(review): 'SEVERNAME' is a placeholder (probably meant as SERVERNAME)
// that must be replaced with a real host before running.
365 $sNominatimBaseURL = 'http://SEVERNAME/search.php';
367 foreach ($aWikiArticles as $aRecord) {
// Article titles use underscores in place of spaces.
368 $aRecord['name'] = str_replace('_', ' ', $aRecord['title']);
370 $sURL = $sNominatimBaseURL.'?format=xml&accept-language=en';
372 echo "\n-- ".$aRecord['name'].", ".$aRecord['infobox_type']."\n";
// Default tolerance; individual infobox types below widen it.
373 $fMaxDist = 0.0000001;
// Choose search restrictions (feature type, viewbox, proximity hint) from the
// infobox type.
375 switch (strtolower($aRecord['infobox_type'])) {
376 case 'former country':
379 $fMaxDist = 60; // effectively turn it off
380 $sURL .= "&viewbox=".($aRecord['lon']-$fMaxDist).",".($aRecord['lat']+$fMaxDist).",".($aRecord['lon']+$fMaxDist).",".($aRecord['lat']-$fMaxDist);
386 $fMaxDist = 60; // effectively turn it off
387 $sURL .= "&featuretype=country";
388 $sURL .= "&viewbox=".($aRecord['lon']-$fMaxDist).",".($aRecord['lat']+$fMaxDist).",".($aRecord['lon']+$fMaxDist).",".($aRecord['lat']-$fMaxDist);
390 case 'prefecture japan':
// Search without the ' Prefecture' suffix.
391 $aRecord['name'] = trim(str_replace(' Prefecture', ' ', $aRecord['name']));
397 case 'u.s. state symbols':
399 case 'province or territory of canada':
400 case 'indian jurisdiction':
402 case 'french region':
403 case 'region of italy':
405 case '#australia state or territory':
406 case 'russian federal subject':
408 $sURL .= "&featuretype=state";
409 $sURL .= "&viewbox=".($aRecord['lon']-$fMaxDist).",".($aRecord['lat']+$fMaxDist).",".($aRecord['lon']+$fMaxDist).",".($aRecord['lat']-$fMaxDist);
411 case 'protected area':
413 $sURL .= "&nearlat=".$aRecord['lat'];
414 $sURL .= "&nearlon=".$aRecord['lon'];
415 $sURL .= "&viewbox=".($aRecord['lon']-$fMaxDist).",".($aRecord['lat']+$fMaxDist).",".($aRecord['lon']+$fMaxDist).",".($aRecord['lat']-$fMaxDist);
420 case 'french commune':
// NOTE(review): 'italian comune' is listed twice in the visible labels.
421 case 'italian comune':
423 case 'italian comune':
424 case 'australian place':
430 case 'russian inhabited locality':
431 case 'finnish municipality/land area':
432 case 'england county':
433 case 'israel municipality':
437 $sURL .= "&featuretype=settlement";
438 $sURL .= "&viewbox=".($aRecord['lon']-0.5).",".($aRecord['lat']+0.5).",".($aRecord['lon']+0.5).",".($aRecord['lat']-0.5);
441 case 'mountain pass':
446 $sURL .= "&viewbox=".($aRecord['lon']-0.5).",".($aRecord['lat']+0.5).",".($aRecord['lon']+0.5).",".($aRecord['lat']-0.5);
// NOTE(review): $aTypes is not read anywhere in the visible code.
450 $aTypes = array('wreck');
451 $sURL .= "&viewbox=".($aRecord['lon']-0.01).",".($aRecord['lat']+0.01).",".($aRecord['lon']+0.01).",".($aRecord['lat']-0.01);
452 $sURL .= "&nearlat=".$aRecord['lat'];
453 $sURL .= "&nearlon=".$aRecord['lon'];
460 $sURL .= "&viewbox=".($aRecord['lon']-0.01).",".($aRecord['lat']+0.01).",".($aRecord['lon']+0.01).",".($aRecord['lat']-0.01);
461 $sURL .= "&bounded=1";
462 $sURL .= "&nearlat=".$aRecord['lat'];
463 $sURL .= "&nearlon=".$aRecord['lon'];
468 $sURL .= "&viewbox=".($aRecord['lon']-0.01).",".($aRecord['lat']+0.01).",".($aRecord['lon']+0.01).",".($aRecord['lat']-0.01);
469 // $sURL .= "&bounded=1";
470 $sURL .= "&nearlat=".$aRecord['lat'];
471 $sURL .= "&nearlon=".$aRecord['lon'];
472 echo "-- Unknown: ".$aRecord['infobox_type']."\n";
// First attempt: search for the full article name.
475 $sNameURL = $sURL.'&q='.urlencode($aRecord['name']);
// NOTE(review): the result is not checked; file_get_contents() returns false
// on failure.
478 $sXML = file_get_contents($sNameURL);
// nominatimXMLStart() fills the global $aNominatRecords while parsing.
480 $aNominatRecords = array();
481 $hXMLParser = xml_parser_create();
482 xml_set_element_handler($hXMLParser, 'nominatimXMLStart', 'nominatimXMLEnd');
483 xml_parse($hXMLParser, $sXML, true);
484 xml_parser_free($hXMLParser);
// No hit: retry with the name cut at its first '(' or ','.
486 if (!isset($aNominatRecords[0])) {
487 $aNameParts = preg_split('#[(,]#', $aRecord['name']);
488 if (sizeof($aNameParts) > 1) {
489 $sNameURL = $sURL.'&q='.urlencode(trim($aNameParts[0]));
491 $sXML = file_get_contents($sNameURL);
493 $aNominatRecords = array();
494 $hXMLParser = xml_parser_create();
495 xml_set_element_handler($hXMLParser, 'nominatimXMLStart', 'nominatimXMLEnd');
496 xml_parse($hXMLParser, $sXML, true);
// The trailing '#' on the next line starts an empty comment.
497 xml_parser_free($hXMLParser);#
501 // assume first is best/right
502 for ($i = 0; $i < sizeof($aNominatRecords); $i++) {
// Straight-line distance in degrees between the article and the result.
503 $fDiff = ($aRecord['lat']-$aNominatRecords[$i]['LAT']) * ($aRecord['lat']-$aNominatRecords[$i]['LAT']);
504 $fDiff += ($aRecord['lon']-$aNominatRecords[$i]['LON']) * ($aRecord['lon']-$aNominatRecords[$i]['LON']);
505 $fDiff = sqrt($fDiff);
507 // If it was an unknown type base it on the rank of the found result
// NOTE(review): as visible here, this chain always overwrites the per-type
// $fMaxDist chosen above.
508 $iRank = (int)$aNominatRecords[$i]['PLACE_RANK'];
509 if ($iRank <= 4) $fMaxDist = 2;
510 elseif ($iRank <= 8) $fMaxDist = 1;
511 elseif ($iRank <= 10) $fMaxDist = 0.8;
512 elseif ($iRank <= 12) $fMaxDist = 0.6;
513 elseif ($iRank <= 17) $fMaxDist = 0.2;
514 elseif ($iRank <= 18) $fMaxDist = 0.1;
515 elseif ($iRank <= 22) $fMaxDist = 0.02;
516 elseif ($iRank <= 26) $fMaxDist = 0.001;
517 else $fMaxDist = 0.001;
519 echo "-- FOUND \"".substr($aNominatRecords[$i]['DISPLAY_NAME'], 0, 50)."\", ".$aNominatRecords[$i]['CLASS'].", ".$aNominatRecords[$i]['TYPE'].", ".$aNominatRecords[$i]['PLACE_RANK'].", ".$aNominatRecords[$i]['OSM_TYPE']." (dist:$fDiff, max:$fMaxDist)\n";
520 if ($fDiff > $fMaxDist) {
521 echo "-- Diff too big $fDiff (max: $fMaxDist)".$aRecord['lat'].','.$aNominatRecords[$i]['LAT'].' & '.$aRecord['lon'].','.$aNominatRecords[$i]['LON']." \n";
// Close enough: build the UPDATE that stores the one-letter OSM type and id.
// NOTE(review): an OSM_TYPE matching none of the cases leaves "osm_type="
// without a value, and OSM_ID from the remote response is interpolated
// unescaped.
523 $sSQL = "update wikipedia_article set osm_type=";
524 switch ($aNominatRecords[$i]['OSM_TYPE']) {
525 case 'relation': $sSQL .= "'R'"; break;
526 case 'way': $sSQL .= "'W'"; break;
527 case 'node': $sSQL .= "'N'"; break;
529 $sSQL .= ", osm_id=".$aNominatRecords[$i]['OSM_ID']." where language = '".pg_escape_string($aRecord['language'])."' and title = '".pg_escape_string($aRecord['title'])."'";