// Bootstrap: load the site settings, then the shared command-line helpers.
4 require_once(dirname(dirname(__FILE__)).'/settings/settings.php');
5 require_once(CONST_BasePath.'/lib/init-cmd.php');
// Raise PHP's memory limit to 800M for this script.
6 ini_set('memory_limit', '800M');
// Option table entries: a usage description followed by one array per switch
// (long name, short name, several numeric flags, value type, help text).
// NOTE(review): presumably these are the elements of $aCMDOptions that is passed
// to getCmdOpt() below; the assignment itself is not visible here - confirm.
10 "Create and setup nominatim search system",
11 array('help', 'h', 0, 1, 0, 0, false, 'Show Help'),
12 array('quiet', 'q', 0, 1, 0, 0, 'bool', 'Quiet output'),
13 array('verbose', 'v', 0, 1, 0, 0, 'bool', 'Verbose output'),
15 array('create-tables', '', 0, 1, 0, 0, 'bool', 'Create wikipedia tables'),
16 array('parse-articles', '', 0, 1, 0, 0, 'bool', 'Parse wikipedia articles'),
17 array('link', '', 0, 1, 0, 0, 'bool', 'Try to link to existing OSM ids'),
// Parse the command line ($_SERVER['argv']) against $aCMDOptions; the rest of
// the script reads the outcome from $aCMDResult.
19 getCmdOpt($_SERVER['argv'], $aCMDOptions, $aCMDResult, true, true);
// Sample wikitext (a Coord template plus an amusement-park infobox) used as a
// parser self-test: after the heredoc it is run through _parseWikipediaContent()
// and _templatesToProperties() and the resulting properties are dumped.
22 $sTestPageText = <<<EOD
23 {{Coord|47|N|2|E|type:country_region:FR|display=title}}
24 {{ Infobox Amusement park
25 | name = Six Flags Great Adventure
26 | image = [[File:SixFlagsGreatAdventure logo.png]]
27 | caption = Six Flags Great Adventure logo
28 | location = [[Jackson, New Jersey|Jackson]]
29 | location2 = New Jersey
30 | location3 = United States
31 | address = 1 Six Flags Boulevard<ref name="drivedir"/>
32 | season = March/April through October/November
33 | opening_date = July 1, 1974
34 | previous_names = Great Adventure
36 | rides = 45 park admission rides
39 | owner = [[Six Flags]]
41 | homepage = [http://www.sixflags.com/parks/greatadventure/ Six Flags Great Adventure]
44 var_dump(_templatesToProperties(_parseWikipediaContent($sTestPageText)));
46 //| coordinates = {{Coord|40|08|16.65|N|74|26|26.69|W|region:US-NJ_type:landmark|display=inline,title}}
// --drop-tables: remove both wikipedia tables.
// NOTE(review): 'drop-tables' is not among the option entries visible above, and
// the index is read without isset() - confirm the option is declared.
55 if ($aCMDResult['drop-tables'])
57 $oDB->query('DROP TABLE wikipedia_article');
58 $oDB->query('DROP TABLE wikipedia_link');
// --create-tables: create the wikipedia_article and wikipedia_link tables
// (the CREATE TABLE statements below are only partly visible in this excerpt).
62 if ($aCMDResult['create-tables']) {
64 CREATE TABLE wikipedia_article (
65 language text NOT NULL,
72 importance double precision,
74 osm_type character(1),
// Add a 2-dimensional geometry column 'location' with SRID 4326 to wikipedia_article.
82 $oDB->query("SELECT AddGeometryColumn('wikipedia_article', 'location', 4326, 'GEOMETRY', 2)");
85 CREATE TABLE wikipedia_link (
/**
 * Convert a degrees / minutes / seconds coordinate into signed decimal degrees.
 *
 * Every numeric component is cast to float, so a missing (null) or non-numeric
 * component contributes 0.
 *
 * @param mixed $iDegrees Degrees.
 * @param mixed $iMinutes Arc minutes (optional).
 * @param mixed $fSeconds Arc seconds (optional).
 * @param mixed $sNSEW    Hemisphere letter; 'S' or 'W' (any case, surrounding
 *                        whitespace ignored) negates the result, anything else
 *                        (including null) leaves it positive.
 *
 * @return float Decimal degrees.
 */
function degreesAndMinutesToDecimal($iDegrees, $iMinutes = 0, $fSeconds = 0, $sNSEW = 'N')
{
    // Callers pass optional template parameters as @$aParams[...], i.e. null when
    // the parameter is absent. Cast before strtoupper() so null never reaches a
    // string function, and trim so ' S ' is still recognised.
    $sHemisphere = strtoupper(trim((string)$sNSEW));
    $iSign = ($sHemisphere == 'S' || $sHemisphere == 'W') ? -1 : 1;

    return $iSign * ((float)$iDegrees + (float)$iMinutes / 60 + (float)$fSeconds / 3600);
}
/**
 * Extract the templates ({{ ... }}) from a piece of wikitext.
 *
 * The text is split on the delimiters "{{", "}}", "[[", "]]" and "|" and walked
 * with a small state stack (body / template / templateparam / link /
 * linksynonim). Wiki links inside a template parameter are replaced by their
 * display text.
 *
 * @param string $sPageText Raw wikitext of one article.
 *
 * @return array List of templates in closing order. Each entry is
 *               array(0 => template name, 1 => list of raw parameter strings);
 *               the parameter list is in REVERSE source order because new
 *               parameters are prepended.
 */
function _parseWikipediaContent($sPageText)
{
    // Flatten to one line, then drop HTML comments and <math> sections so their
    // contents cannot be mistaken for template syntax.
    $sPageText = str_replace("\n", ' ', $sPageText);
    $sPageText = preg_replace('#<!--.*?-->#m', '', $sPageText);
    $sPageText = preg_replace('#<math>.*?<\\/math>#m', '', $sPageText);

    // Delimiters are captured, so they appear as their own array elements.
    $aPageText = preg_split('#({{|}}|\\[\\[|\\]\\]|[|])#', $sPageText, -1, PREG_SPLIT_DELIM_CAPTURE);

    $aTemplates = array();

    // Target / display text of the link currently being read.
    $sLinkPage = '';
    $sLinkSyn = '';

    // Both stacks grow at the front: index 0 is the innermost open construct.
    $aTemplateStack = array();
    $aState = array('body');

    foreach ($aPageText as $sPart) {
        switch ($sPart) {
            case '{{':
                array_unshift($aTemplateStack, array('', array()));
                array_unshift($aState, 'template');
                break;

            case '}}':
                // A stray "}}" outside a template is ignored.
                if ($aState[0] == 'template' || $aState[0] == 'templateparam') {
                    $aTemplates[] = array_shift($aTemplateStack);
                    array_shift($aState);
                }
                break;

            case '[[':
                $sLinkPage = '';
                $sLinkSyn = '';
                array_unshift($aState, 'link');
                break;

            case ']]':
                if ($aState[0] == 'link' || $aState[0] == 'linksynonim') {
                    // Without explicit display text the link shows its target.
                    if (!$sLinkSyn) {
                        $sLinkSyn = $sLinkPage;
                    }
                    if (substr($sLinkPage, 0, 6) == 'Image:') {
                        $sLinkSyn = substr($sLinkPage, 6);
                    }

                    // Hand the finished link to the construct that contains it.
                    array_shift($aState);
                    switch ($aState[0]) {
                        case 'template':
                            $aTemplateStack[0][0] .= trim($sPart);
                            break;
                        case 'templateparam':
                            $aTemplateStack[0][1][0] .= $sLinkSyn;
                            break;
                        case 'link':
                            $sLinkPage .= trim($sPart);
                            break;
                        case 'linksynonim':
                            $sLinkSyn .= $sPart;
                            break;
                        case 'body':
                            // Body text is not part of the result.
                            break;
                        default:
                            var_dump($aState, $aTemplateStack, $sPart, $aPageText);
                            fail('unknown state');
                    }
                }
                break;

            case '|':
                if ($aState[0] == 'template' || $aState[0] == 'templateparam') {
                    // Start a new (empty) template parameter at the front of the list.
                    $aState[0] = 'templateparam';
                    array_unshift($aTemplateStack[0][1], '');
                }
                if ($aState[0] == 'link') {
                    $aState[0] = 'linksynonim';
                }
                break;

            default:
                // Plain text: append to whatever is currently open.
                switch ($aState[0]) {
                    case 'template':
                        $aTemplateStack[0][0] .= trim($sPart);
                        break;
                    case 'templateparam':
                        $aTemplateStack[0][1][0] .= $sPart;
                        break;
                    case 'link':
                        $sLinkPage .= trim($sPart);
                        break;
                    case 'linksynonim':
                        $sLinkSyn .= $sPart;
                        break;
                    case 'body':
                        // Body text is not part of the result.
                        break;
                    default:
                        var_dump($aState, $aPageText);
                        fail('unknown state');
                }
                break;
        }
    }

    return $aTemplates;
}
/**
 * Derive page-level properties from the templates of a parsed Wikipedia article.
 *
 * Templates are examined in the given order; for every property the first
 * template that supplies a usable value wins.
 *
 * @param array $aTemplates List of templates as returned by
 *                          _parseWikipediaContent(): each entry is
 *                          array(0 => name, 1 => raw parameter strings in
 *                          reverse source order).
 *
 * @return array Any of the keys sOfficialName, iPopulation, sWebsite,
 *               sTopLevelDomain, sInfoboxType, fLat and fLon that could be
 *               determined. sInfoboxType is prefixed with '#' when it was only
 *               guessed from a template with more than 10 parameters.
 */
function _templatesToProperties($aTemplates)
{
    $aPageProperties = array();

    // Population parameters in order of preference.
    $aPopulationKeys = array('population', 'population_total', 'population_urban', 'population_estimate');
    $aNorthSouth = array('N', 'S');

    foreach ($aTemplates as $aTemplate) {
        // Restore source order and split "key = value" parameters; parameters
        // without '=' are positional and get consecutive integer keys.
        $aParams = array();
        foreach (array_reverse($aTemplate[1]) as $sParam) {
            $iPos = strpos($sParam, '=');
            if ($iPos === false) {
                $aParams[] = trim($sParam);
            } else {
                $aParams[trim(substr($sParam, 0, $iPos))] = trim(substr($sParam, $iPos + 1));
            }
        }

        if (!isset($aPageProperties['sOfficialName']) && isset($aParams['official_name']) && $aParams['official_name']) {
            $aPageProperties['sOfficialName'] = $aParams['official_name'];
        }

        if (!isset($aPageProperties['iPopulation'])) {
            foreach ($aPopulationKeys as $sKey) {
                if (isset($aParams[$sKey]) && $aParams[$sKey] && preg_match('#^[0-9.,]+#', $aParams[$sKey])) {
                    // Strip thousands separators; the int cast drops any trailing text.
                    $aPageProperties['iPopulation'] = (int)str_replace(array(',', '.'), '', $aParams[$sKey]);
                    break;
                }
            }
        }

        if (!isset($aPageProperties['sWebsite']) && isset($aParams['website']) && $aParams['website']) {
            // Accepts a bare URL or an external link "[url label]"; keeps the URL only.
            if (preg_match('#^\\[?([^ \\]]+)[^\\]]*\\]?$#', $aParams['website'], $aMatch)) {
                $aPageProperties['sWebsite'] = $aMatch[1];
                if (strpos($aPageProperties['sWebsite'], '://') === false) {
                    $aPageProperties['sWebsite'] = 'http://'.$aPageProperties['sWebsite'];
                }
            }
        }

        if (!isset($aPageProperties['sTopLevelDomain']) && isset($aParams['cctld']) && $aParams['cctld']) {
            $aPageProperties['sTopLevelDomain'] = str_replace(array('[', ']', '.'), '', $aParams['cctld']);
        }

        if (!isset($aPageProperties['sInfoboxType']) && strtolower(substr($aTemplate[0], 0, 7)) == 'infobox') {
            // Everything after "Infobox" plus one separator character.
            $aPageProperties['sInfoboxType'] = trim(substr($aTemplate[0], 8));
        }

        // Assume the first template with lots of params is the type (fallback for infobox)
        if (!isset($aPageProperties['sPossibleInfoboxType']) && sizeof($aParams) > 10) {
            $aPageProperties['sPossibleInfoboxType'] = trim($aTemplate[0]);
        }

        // Coordinates: the checks below are deliberately sequential (not
        // elseif), so within one template a later matching notation overrides
        // an earlier one. Once any template has produced fLat, later templates
        // are no longer consulted.
        if (!isset($aPageProperties['fLat'])) {
            if (isset($aParams['latd']) && isset($aParams['longd'])) {
                $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aParams['latd'], @$aParams['latm'], @$aParams['lats'], @$aParams['latNS']);
                $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aParams['longd'], @$aParams['longm'], @$aParams['longs'], @$aParams['longEW']);
            }

            // Both the latitude AND the longitude degrees must be present (the
            // longitude used to be read without being checked).
            if (isset($aParams['lat_degrees']) && isset($aParams['long_degrees'])) {
                $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aParams['lat_degrees'], @$aParams['lat_minutes'], @$aParams['lat_seconds'], @$aParams['lat_direction']);
                $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aParams['long_degrees'], @$aParams['long_minutes'], @$aParams['long_seconds'], @$aParams['long_direction']);
            }

            if (isset($aParams['latitude']) && isset($aParams['longitude'])) {
                if (preg_match('#[0-9.]+#', $aParams['latitude']) && preg_match('#[0-9.]+#', $aParams['longitude'])) {
                    $aPageProperties['fLat'] = (float)$aParams['latitude'];
                    $aPageProperties['fLon'] = (float)$aParams['longitude'];
                }
            }

            if (strtolower($aTemplate[0]) == 'coord') {
                // The position of the N/S letter identifies the notation. A
                // notation is only accepted when all of its longitude fields
                // are present as well.
                $sP1 = isset($aParams[1]) ? strtoupper($aParams[1]) : '';
                $sP2 = isset($aParams[2]) ? strtoupper($aParams[2]) : '';
                $sP3 = isset($aParams[3]) ? strtoupper($aParams[3]) : '';

                if (in_array($sP3, $aNorthSouth, true)) {
                    // deg | min | sec | N/S | deg | min | sec | E/W
                    if (isset($aParams[0], $aParams[1], $aParams[2], $aParams[4], $aParams[5], $aParams[6], $aParams[7])) {
                        $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aParams[0], $aParams[1], $aParams[2], $aParams[3]);
                        $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aParams[4], $aParams[5], $aParams[6], $aParams[7]);
                    }
                } elseif (isset($aParams[0], $aParams[1]) && in_array($sP2, $aNorthSouth, true)) {
                    // deg | min | N/S | deg | min | E/W
                    if (isset($aParams[3], $aParams[4], $aParams[5])) {
                        $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aParams[0], $aParams[1], 0, $aParams[2]);
                        $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aParams[3], $aParams[4], 0, $aParams[5]);
                    }
                } elseif (isset($aParams[0]) && in_array($sP1, $aNorthSouth, true)) {
                    // decimal | N/S | decimal | E/W
                    if (isset($aParams[2], $aParams[3])) {
                        $aPageProperties['fLat'] = ($sP1 == 'N' ? 1 : -1) * (float)$aParams[0];
                        $aPageProperties['fLon'] = ($sP3 == 'E' ? 1 : -1) * (float)$aParams[2];
                    }
                } elseif (isset($aParams[0]) && is_numeric($aParams[0]) && isset($aParams[1]) && is_numeric($aParams[1])) {
                    // signed decimal | signed decimal
                    $aPageProperties['fLat'] = (float)$aParams[0];
                    $aPageProperties['fLon'] = (float)$aParams[1];
                }
            }

            if (isset($aParams['Latitude']) && isset($aParams['Longitude'])) {
                // Textual form "12° 30′ N", or a range "12° N to 14° N" whose
                // midpoint is used. Minutes are optional.
                $sDegMin = '([0-9]+)°( ([0-9]+)′)?';
                $aAxes = array(
                    'fLat' => array('Latitude', '[NS]'),
                    'fLon' => array('Longitude', '[EW]'),
                );
                $aFound = array();
                foreach ($aAxes as $sProperty => $aAxis) {
                    // Treat non-breaking spaces (entity or UTF-8 character) as plain spaces.
                    $sValue = str_replace(array('&nbsp;', "\xC2\xA0"), ' ', $aParams[$aAxis[0]]);
                    $sSingle = $sDegMin.' ('.$aAxis[1].')';
                    if (preg_match('#^'.$sSingle.' to '.$sSingle.'#', $sValue, $aMatch)) {
                        $aFound[$sProperty] =
                            (degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4])
                            + degreesAndMinutesToDecimal($aMatch[5], $aMatch[7], 0, $aMatch[8])) / 2;
                    } elseif (preg_match('#^'.$sSingle.'#', $sValue, $aMatch)) {
                        $aFound[$sProperty] = degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4]);
                    }
                }
                // Only commit a complete pair so fLat never exists without fLon.
                if (isset($aFound['fLat'], $aFound['fLon'])) {
                    $aPageProperties['fLat'] = $aFound['fLat'];
                    $aPageProperties['fLon'] = $aFound['fLon'];
                }
            }
        }
    }

    // Fall back to the guessed type, marked with '#', when no real infobox was seen.
    if (isset($aPageProperties['sPossibleInfoboxType'])) {
        if (!isset($aPageProperties['sInfoboxType'])) {
            $aPageProperties['sInfoboxType'] = '#'.$aPageProperties['sPossibleInfoboxType'];
        }
        unset($aPageProperties['sPossibleInfoboxType']);
    }

    return $aPageProperties;
}
// --parse-wikipedia=<n>: extract infobox type, population, website and
// coordinates from article text and build UPDATE statements for wikipedia_article.
// NOTE(review): the option entries visible near the top of the file declare
// 'parse-articles', not 'parse-wikipedia' - confirm this branch is reachable.
306 if (isset($aCMDResult['parse-wikipedia'])) {
// Select main-namespace articles whose page_id modulo 10 equals the option value
// and whose text mentions a Coord template or both "lat" and "lon".
// NOTE(review): the option value is concatenated into the SQL unescaped.
308 $aArticleNames = $oDB->getCol('select page_title from content where page_namespace = 0 and page_id %10 = '.$aCMDResult['parse-wikipedia'].' and (page_content ilike \'%{{Coord%\' or (page_content ilike \'%lat%\' and page_content ilike \'%lon%\'))');
309 // $aArticleNames = $oDB->getCol($sSQL = 'select page_title from content where page_namespace = 0 and (page_content ilike \'%{{Coord%\' or (page_content ilike \'%lat%\' and page_content ilike \'%lon%\')) and page_title in (\'Virginia\')');
// For each article: load its text, parse the templates and derive the properties.
310 foreach ($aArticleNames as $sArticleName) {
311 $sPageText = $oDB->getOne('select page_content from content where page_namespace = 0 and page_title = \''.pg_escape_string($sArticleName).'\'');
312 $aP = _templatesToProperties(_parseWikipediaContent($sPageText));
// Infobox type, with runs of whitespace collapsed to a single space.
// NOTE(review): the statements that execute each $sSQL built below are not
// visible in this excerpt.
314 if (isset($aP['sInfoboxType'])) {
315 $aP['sInfoboxType'] = preg_replace('#\\s+#', ' ', $aP['sInfoboxType']);
316 $sSQL = 'update wikipedia_article set ';
317 $sSQL .= 'infobox_type = \''.pg_escape_string($aP['sInfoboxType']).'\'';
318 $sSQL .= ' where language = \'en\' and title = \''.pg_escape_string($sArticleName).'\';';
// Population.
321 if (isset($aP['iPopulation'])) {
322 $sSQL = 'update wikipedia_article set ';
323 $sSQL .= 'population = \''.pg_escape_string($aP['iPopulation']).'\'';
324 $sSQL .= ' where language = \'en\' and title = \''.pg_escape_string($sArticleName).'\';';
// Website.
327 if (isset($aP['sWebsite'])) {
328 $sSQL = 'update wikipedia_article set ';
329 $sSQL .= 'website = \''.pg_escape_string($aP['sWebsite']).'\'';
330 $sSQL .= ' where language = \'en\' and title = \''.pg_escape_string($sArticleName).'\';';
// Coordinates: skipped when both values compare equal to '-0'. The article name,
// infobox type and coordinates are also echoed as a '|'-separated line.
// NOTE(review): $aP['fLon'] is read without isset(); assumes fLon is always set
// together with fLat - confirm.
333 if (isset($aP['fLat']) && ($aP['fLat']!='-0' || $aP['fLon']!='-0')) {
334 if (!isset($aP['sInfoboxType'])) $aP['sInfoboxType'] = '';
335 echo $sArticleName.'|'.$aP['sInfoboxType'].'|'.$aP['fLat'].'|'.$aP['fLon'] ."\n";
336 $sSQL = 'update wikipedia_article set ';
337 $sSQL .= 'lat = \''.pg_escape_string($aP['fLat']).'\',';
338 $sSQL .= 'lon = \''.pg_escape_string($aP['fLon']).'\'';
339 $sSQL .= ' where language = \'en\' and title = \''.pg_escape_string($sArticleName).'\';';
// XML start-element handler, registered through xml_set_element_handler() in the
// link section of this script. Appends the element's attribute array ($aAttr) to
// the global list $aNominatRecords.
// NOTE(review): any condition guarding the append (for example a check on
// $sName) is not visible in this excerpt - confirm which elements are collected.
345 function nominatimXMLStart($hParser, $sName, $aAttr)
347 global $aNominatRecords;
350 $aNominatRecords[] = $aAttr;
// XML end-element handler, registered together with nominatimXMLStart().
// NOTE(review): the body is not visible in this excerpt.
355 function nominatimXMLEnd($hParser, $sName)
// --link: try to match geocoded English wikipedia articles to OSM objects by
// querying a Nominatim search endpoint and comparing coordinates.
360 if (isset($aCMDResult['link'])) {
// Candidates: articles with a latitude, no OSM link yet and totalcount < 31,
// most important first, at most 200000 rows.
362 $aWikiArticles = $oDB->getAll("select * from wikipedia_article where language = 'en' and lat is not null and osm_type is null and totalcount < 31 order by importance desc limit 200000");
364 // If you point this script at production OSM you will be blocked
// NOTE(review): 'SEVERNAME' is a placeholder host (sic) that has to be edited
// before the script can run.
365 $sNominatimBaseURL = 'http://SEVERNAME/search.php';
367 foreach ($aWikiArticles as $aRecord) {
// Article titles use '_' where the search name needs spaces.
368 $aRecord['name'] = str_replace('_', ' ', $aRecord['title']);
370 $sURL = $sNominatimBaseURL.'?format=xml&accept-language=en';
372 echo "\n-- ".$aRecord['name'].", ".$aRecord['infobox_type']."\n";
// Default maximum accepted distance (in degrees) between article and result.
373 $fMaxDist = 0.0000001;
// Choose search restrictions (feature type, viewbox, near point) by infobox type.
// NOTE(review): several case labels and branch bodies of this switch are not
// visible in this excerpt.
375 switch (strtolower($aRecord['infobox_type'])) {
376 case 'former country':
379 $fMaxDist = 60; // effectively turn it off
380 $sURL .= "&viewbox=".($aRecord['lon']-$fMaxDist).",".($aRecord['lat']+$fMaxDist).",".($aRecord['lon']+$fMaxDist).",".($aRecord['lat']-$fMaxDist);
386 $fMaxDist = 60; // effectively turn it off
387 $sURL .= "&featuretype=country";
388 $sURL .= "&viewbox=".($aRecord['lon']-$fMaxDist).",".($aRecord['lat']+$fMaxDist).",".($aRecord['lon']+$fMaxDist).",".($aRecord['lat']-$fMaxDist);
// Japanese prefectures are searched without the " Prefecture" suffix.
390 case 'prefecture japan':
391 $aRecord['name'] = trim(str_replace(' Prefecture', ' ', $aRecord['name']));
396 case 'u.s. state symbols':
// NOTE(review): the next two labels end in ';' instead of ':'.
398 case 'province or territory of canada';
399 case 'indian jurisdiction';
401 case 'french region':
402 case 'region of italy':
404 case '#australia state or territory':
405 case 'russian federal subject':
407 $sURL .= "&featuretype=state";
408 $sURL .= "&viewbox=".($aRecord['lon']-$fMaxDist).",".($aRecord['lat']+$fMaxDist).",".($aRecord['lon']+$fMaxDist).",".($aRecord['lat']-$fMaxDist);
410 case 'protected area':
412 $sURL .= "&nearlat=".$aRecord['lat'];
413 $sURL .= "&nearlon=".$aRecord['lon'];
414 $sURL .= "&viewbox=".($aRecord['lon']-$fMaxDist).",".($aRecord['lat']+$fMaxDist).",".($aRecord['lon']+$fMaxDist).",".($aRecord['lat']-$fMaxDist);
// Settlement-like infoboxes: restrict to settlements within +/-0.5 degrees.
418 case 'french commune':
419 case 'italian comune':
// NOTE(review): 'italian comune' is listed twice.
421 case 'italian comune':
422 case 'australian place':
428 case 'russian inhabited locality':
429 case 'finnish municipality/land area':
430 case 'england county':
431 case 'israel municipality':
435 $sURL .= "&featuretype=settlement";
436 $sURL .= "&viewbox=".($aRecord['lon']-0.5).",".($aRecord['lat']+0.5).",".($aRecord['lon']+0.5).",".($aRecord['lat']-0.5);
439 case 'mountain pass':
444 $sURL .= "&viewbox=".($aRecord['lon']-0.5).",".($aRecord['lat']+0.5).",".($aRecord['lon']+0.5).",".($aRecord['lat']-0.5);
448 $aTypes = array('wreck');
449 $sURL .= "&viewbox=".($aRecord['lon']-0.01).",".($aRecord['lat']+0.01).",".($aRecord['lon']+0.01).",".($aRecord['lat']-0.01);
450 $sURL .= "&nearlat=".$aRecord['lat'];
451 $sURL .= "&nearlon=".$aRecord['lon'];
458 $sURL .= "&viewbox=".($aRecord['lon']-0.01).",".($aRecord['lat']+0.01).",".($aRecord['lon']+0.01).",".($aRecord['lat']-0.01);
459 $sURL .= "&bounded=1";
460 $sURL .= "&nearlat=".$aRecord['lat'];
461 $sURL .= "&nearlon=".$aRecord['lon'];
466 $sURL .= "&viewbox=".($aRecord['lon']-0.01).",".($aRecord['lat']+0.01).",".($aRecord['lon']+0.01).",".($aRecord['lat']-0.01);
467 // $sURL .= "&bounded=1";
468 $sURL .= "&nearlat=".$aRecord['lat'];
469 $sURL .= "&nearlon=".$aRecord['lon'];
470 echo "-- Unknown: ".$aRecord['infobox_type']."\n";
// First attempt: search for the full article name.
473 $sNameURL = $sURL.'&q='.urlencode($aRecord['name']);
// NOTE(review): no check of the file_get_contents() result is visible here.
476 $sXML = file_get_contents($sNameURL);
// Parse the XML response; the start-element handler fills $aNominatRecords.
478 $aNominatRecords = array();
479 $hXMLParser = xml_parser_create();
480 xml_set_element_handler($hXMLParser, 'nominatimXMLStart', 'nominatimXMLEnd');
481 xml_parse($hXMLParser, $sXML, true);
482 xml_parser_free($hXMLParser);
// Second attempt when nothing was found: retry with only the part of the name
// before the first '(' or ','.
484 if (!isset($aNominatRecords[0])) {
485 $aNameParts = preg_split('#[(,]#', $aRecord['name']);
486 if (sizeof($aNameParts) > 1) {
487 $sNameURL = $sURL.'&q='.urlencode(trim($aNameParts[0]));
489 $sXML = file_get_contents($sNameURL);
491 $aNominatRecords = array();
492 $hXMLParser = xml_parser_create();
493 xml_set_element_handler($hXMLParser, 'nominatimXMLStart', 'nominatimXMLEnd');
494 xml_parse($hXMLParser, $sXML, true);
// NOTE(review): the trailing '#' on the next line starts an empty comment.
495 xml_parser_free($hXMLParser);#
499 // assume first is best/right
500 for ($i = 0; $i < sizeof($aNominatRecords); $i++) {
// Straight-line distance in degrees between the article and the search result.
501 $fDiff = ($aRecord['lat']-$aNominatRecords[$i]['LAT']) * ($aRecord['lat']-$aNominatRecords[$i]['LAT']);
502 $fDiff += ($aRecord['lon']-$aNominatRecords[$i]['LON']) * ($aRecord['lon']-$aNominatRecords[$i]['LON']);
503 $fDiff = sqrt($fDiff);
505 // If it was an unknown type base it on the rank of the found result
// NOTE(review): no condition restricting this to unknown types is visible here,
// so the rank-based value appears to replace $fMaxDist for every record - confirm.
506 $iRank = (int)$aNominatRecords[$i]['PLACE_RANK'];
507 if ($iRank <= 4) $fMaxDist = 2;
508 elseif ($iRank <= 8) $fMaxDist = 1;
509 elseif ($iRank <= 10) $fMaxDist = 0.8;
510 elseif ($iRank <= 12) $fMaxDist = 0.6;
511 elseif ($iRank <= 17) $fMaxDist = 0.2;
512 elseif ($iRank <= 18) $fMaxDist = 0.1;
513 elseif ($iRank <= 22) $fMaxDist = 0.02;
514 elseif ($iRank <= 26) $fMaxDist = 0.001;
515 else $fMaxDist = 0.001;
517 echo "-- FOUND \"".substr($aNominatRecords[$i]['DISPLAY_NAME'], 0, 50)."\", ".$aNominatRecords[$i]['CLASS'].", ".$aNominatRecords[$i]['TYPE'].", ".$aNominatRecords[$i]['PLACE_RANK'].", ".$aNominatRecords[$i]['OSM_TYPE']." (dist:$fDiff, max:$fMaxDist)\n";
// Results further away than the allowed distance are reported.
518 if ($fDiff > $fMaxDist) {
519 echo "-- Diff too big $fDiff (max: $fMaxDist)".$aRecord['lat'].','.$aNominatRecords[$i]['LAT'].' & '.$aRecord['lon'].','.$aNominatRecords[$i]['LON']." \n";
// Build the UPDATE that links the article to the OSM object.
// NOTE(review): no default case is visible for OSM_TYPE, and OSM_ID from the
// HTTP response is concatenated into the SQL unescaped.
521 $sSQL = "update wikipedia_article set osm_type=";
522 switch ($aNominatRecords[$i]['OSM_TYPE']) {
523 case 'relation': $sSQL .= "'R'"; break;
524 case 'way': $sSQL .= "'W'"; break;
525 case 'node': $sSQL .= "'N'"; break;
527 $sSQL .= ", osm_id=".$aNominatRecords[$i]['OSM_ID']." where language = '".pg_escape_string($aRecord['language'])."' and title = '".pg_escape_string($aRecord['title'])."'";