4 require_once(dirname(dirname(__FILE__)).'/settings/settings.php');
5 require_once(CONST_BasePath.'/lib/init-cmd.php');
// Raise the PHP memory limit for this script to 800M.
6 ini_set('memory_limit', '800M');
// Command-line option table: a description string followed by one array per
// option (long name, short name, numeric fields, value type, help text).
// NOTE(review): the statement that opens this table (presumably the
// $aCMDOptions assignment passed to getCmdOpt() below) is not visible here.
10 "Create and setup nominatim search system",
11 array('help', 'h', 0, 1, 0, 0, false, 'Show Help'),
12 array('quiet', 'q', 0, 1, 0, 0, 'bool', 'Quiet output'),
13 array('verbose', 'v', 0, 1, 0, 0, 'bool', 'Verbose output'),
15 array('create-tables', '', 0, 1, 0, 0, 'bool', 'Create wikipedia tables'),
16 array('parse-articles', '', 0, 1, 0, 0, 'bool', 'Parse wikipedia articles'),
17 array('link', '', 0, 1, 0, 0, 'bool', 'Try to link to existing OSM ids'),
// Parse the script arguments against the option table; later code reads the
// outcome from $aCMDResult (presumably filled by reference - confirm in
// lib/init-cmd.php).
19 getCmdOpt($_SERVER['argv'], $aCMDOptions, $aCMDResult, true, true);
// Developer test fixture: a sample wiki page (a Coord template followed by an
// amusement-park infobox) that is run through _parseWikipediaContent() and
// _templatesToProperties() and dumped with var_dump().
// NOTE(review): the heredoc terminator and any surrounding comment markers are
// not visible in this excerpt - confirm whether this snippet is live code.
22 $sTestPageText = <<<EOD
23 {{Coord|47|N|2|E|type:country_region:FR|display=title}}
24 {{ Infobox Amusement park
25 | name = Six Flags Great Adventure
26 | image = [[File:SixFlagsGreatAdventure logo.png]]
27 | caption = Six Flags Great Adventure logo
28 | location = [[Jackson, New Jersey|Jackson]]
29 | location2 = New Jersey
30 | location3 = United States
31 | address = 1 Six Flags Boulevard<ref name="drivedir"/>
32 | season = March/April through October/November
33 | opening_date = July 1, 1974
34 | previous_names = Great Adventure
36 | rides = 45 park admission rides
39 | owner = [[Six Flags]]
41 | homepage = [http://www.sixflags.com/parks/greatadventure/ Six Flags Great Adventure]
44 var_dump(_templatesToProperties(_parseWikipediaContent($sTestPageText)));
46 //| coordinates = {{Coord|40|08|16.65|N|74|26|26.69|W|region:US-NJ_type:landmark|display=inline,title}}
// Drop both wikipedia tables when the 'drop-tables' option is set.
// NOTE(review): no 'drop-tables' entry is visible in the option table above,
// and the key is read without isset() - confirm the option is declared.
55 if ($aCMDResult['drop-tables'])
57 $oDB->query('DROP TABLE wikipedia_article');
58 $oDB->query('DROP TABLE wikipedia_link');
// Create the wikipedia_article and wikipedia_link tables when 'create-tables'
// is set; the article table also gets a 2-dimensional SRID 4326 geometry
// column named 'location'. Only part of the SQL text is visible here.
62 if ($aCMDResult['create-tables']) {
64 CREATE TABLE wikipedia_article (
65 language text NOT NULL,
72 importance double precision,
74 osm_type character(1),
82 $oDB->query("SELECT AddGeometryColumn('wikipedia_article', 'location', 4326, 'GEOMETRY', 2)");
85 CREATE TABLE wikipedia_link (
/**
 * Convert a degrees/minutes/seconds coordinate into signed decimal degrees.
 *
 * Every numeric argument is cast to float, so numeric strings taken straight
 * from wiki template parameters are accepted and null or empty values count
 * as zero.
 *
 * @param mixed $iDegrees Whole (or fractional) degrees.
 * @param mixed $iMinutes Arc minutes; null or omitted counts as 0.
 * @param mixed $fSeconds Arc seconds; null or omitted counts as 0.
 * @param mixed $sNSEW    Hemisphere letter. 'S' and 'W' (any case, surrounding
 *                        whitespace ignored) give a negative result; anything
 *                        else, including null, gives a positive one.
 *
 * @return float Signed decimal degrees.
 */
function degreesAndMinutesToDecimal($iDegrees, $iMinutes = 0, $fSeconds = 0, $sNSEW = 'N')
{
    // Callers hand over null for template parameters that are absent; cast
    // before the string functions so PHP 8.1+ raises no deprecation notice.
    $sHemisphere = strtoupper(trim((string)$sNSEW));
    $iSign = ($sHemisphere === 'S' || $sHemisphere === 'W') ? -1 : 1;

    return $iSign * ((float)$iDegrees + (float)$iMinutes / 60 + (float)$fSeconds / 3600);
}
// Tokenise raw wiki markup and walk it with a stack-based state machine,
// collecting templates ({{...}}), links ([[...]]) and body text.
// Each collected template is array(name, params) where params holds the raw
// '|'-separated parameter strings, newest first (they are array_unshift()ed).
// NOTE(review): the case labels that select a branch per token, the
// initialisation of $sPageBody/$aLinks/$sLinkPage/$sLinkSyn and the return
// statement are not visible in this excerpt.
101 function _parseWikipediaContent($sPageText)
// Flatten the page to one line so the '.*?' patterns below can span what
// were line breaks.
103 $sPageText = str_replace("\n", ' ', $sPageText);
// Drop HTML comments and <math>...</math> sections (non-greedy match).
104 $sPageText = preg_replace('#<!--.*?-->#m', '', $sPageText);
105 $sPageText = preg_replace('#<math>.*?<\\/math>#m', '', $sPageText);
// Split on '{{', '}}', '[[', ']]' and '|'; PREG_SPLIT_DELIM_CAPTURE keeps the
// delimiters as their own elements so the loop below can react to them.
107 $aPageText = preg_split('#({{|}}|\\[\\[|\\]\\]|[|])#', $sPageText, -1, PREG_SPLIT_DELIM_CAPTURE);
109 $aPageProperties = array();
111 $aTemplates = array();
// Index 0 of both arrays is the innermost open template / current state.
114 $aTemplateStack = array();
115 $aState = array('body');
116 foreach ($aPageText as $i => $sPart) {
// Open a new template: push an empty (name, params) pair and enter 'template'.
119 array_unshift($aTemplateStack, array('', array()));
120 array_unshift($aState, 'template');
// Close the innermost template, if one is open, and record it.
123 if ($aState[0] == 'template' || $aState[0] == 'templateparam') {
124 $aTemplate = array_shift($aTemplateStack);
125 array_shift($aState);
127 $aTemplates[] = $aTemplate;
// Enter a link.
133 array_unshift($aState, 'link');
// Close a link: the display text defaults to the page name, and an 'Image:'
// prefix is stripped from the display text.
136 if ($aState[0] == 'link' || $aState[0] == 'linksynonim') {
137 if (!$sLinkSyn) $sLinkSyn = $sLinkPage;
138 if (substr($sLinkPage, 0, 6) == 'Image:') $sLinkSyn = substr($sLinkPage, 6);
140 $aLinks[] = array($sLinkPage, $sLinkSyn);
// Return to the enclosing state and hand the link text to it.
142 array_shift($aState);
143 switch ($aState[0]) {
145 $aTemplateStack[0][0] .= trim($sPart);
147 case 'templateparam':
148 $aTemplateStack[0][1][0] .= $sLinkSyn;
151 $sLinkPage .= trim($sPart);
157 $sPageBody .= $sLinkSyn;
// NOTE(review): $sPageName is not assigned anywhere in the visible part of
// this function.
160 var_dump($aState, $sPageName, $aTemplateStack, $sPart, $aPageText);
161 fail('unknown state');
166 if ($aState[0] == 'template' || $aState[0] == 'templateparam') {
167 // Create a new template parameter
168 $aState[0] = 'templateparam';
169 array_unshift($aTemplateStack[0][1], '');
// Inside a link the separator switches from page name to display text.
171 if ($aState[0] == 'link') $aState[0] = 'linksynonim';
// Plain text: append it to whatever the current state is collecting.
174 switch ($aState[0]) {
176 $aTemplateStack[0][0] .= trim($sPart);
178 case 'templateparam':
179 $aTemplateStack[0][1][0] .= $sPart;
182 $sLinkPage .= trim($sPart);
188 $sPageBody .= $sPart;
191 var_dump($aState, $aPageText);
192 fail('unknown state');
/**
 * Return a template parameter, or a default when the parameter is absent.
 *
 * @param array      $aParams  Parameter map of one template.
 * @param int|string $mKey     Parameter name or positional index.
 * @param mixed      $mDefault Value used when the parameter is not set.
 *
 * @return mixed
 */
function _wikiParamOrDefault($aParams, $mKey, $mDefault)
{
    return isset($aParams[$mKey]) ? $aParams[$mKey] : $mDefault;
}

/**
 * Tell whether a positional coord argument is a latitude hemisphere letter.
 *
 * @param string $sValue Positional template argument.
 *
 * @return bool True for 'N' or 'S' in any case.
 */
function _wikiIsLatitudeHemisphere($sValue)
{
    $sValue = strtoupper($sValue);

    return $sValue === 'N' || $sValue === 'S';
}

/**
 * Parse coordinate text such as "38° 25′ N" or "38° N to 40° 30′ N".
 *
 * A range is reduced to the mean of its two end points.
 *
 * @param string $sText        Parameter text, separated by plain spaces.
 * @param string $sHemispheres Accepted hemisphere letters: 'NS' or 'EW'.
 *
 * @return float|null Decimal degrees, or null when the text does not match.
 */
function _wikiDegreeTextToDecimal($sText, $sHemispheres)
{
    $sPoint = '([0-9]+)°( ([0-9]+)′)? (['.$sHemispheres.'])';

    if (preg_match('#^'.$sPoint.' to '.$sPoint.'#', $sText, $aMatch)) {
        return (degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4])
            + degreesAndMinutesToDecimal($aMatch[5], $aMatch[7], 0, $aMatch[8])) / 2;
    }
    if (preg_match('#^'.$sPoint.'#', $sText, $aMatch)) {
        return degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4]);
    }

    return null;
}

/**
 * Work out a latitude/longitude pair from the parameters of one template.
 *
 * The notations are tried in a fixed order and a later notation that yields
 * a complete pair replaces an earlier one. A pair is only ever produced
 * whole: incomplete input leaves the earlier result (or null) in place.
 *
 * @param string $sTemplateName Template name as written on the page.
 * @param array  $aParams       Named and positional template parameters.
 *
 * @return array|null array(latitude, longitude) in decimal degrees, or null.
 */
function _wikiTemplateLatLon($sTemplateName, $aParams)
{
    $aLatLon = null;

    // latd / latm / lats / latNS style
    if (isset($aParams['latd']) && isset($aParams['longd'])) {
        $aLatLon = array(
            degreesAndMinutesToDecimal(
                $aParams['latd'],
                _wikiParamOrDefault($aParams, 'latm', 0),
                _wikiParamOrDefault($aParams, 'lats', 0),
                _wikiParamOrDefault($aParams, 'latNS', 'N')
            ),
            degreesAndMinutesToDecimal(
                $aParams['longd'],
                _wikiParamOrDefault($aParams, 'longm', 0),
                _wikiParamOrDefault($aParams, 'longs', 0),
                _wikiParamOrDefault($aParams, 'longEW', 'E')
            ),
        );
    }

    // lat_degrees / long_degrees style. Both halves must be present; the
    // longitude half used to go unchecked because 'lat_degrees' was tested twice.
    if (isset($aParams['lat_degrees']) && isset($aParams['long_degrees'])) {
        $aLatLon = array(
            degreesAndMinutesToDecimal(
                $aParams['lat_degrees'],
                _wikiParamOrDefault($aParams, 'lat_minutes', 0),
                _wikiParamOrDefault($aParams, 'lat_seconds', 0),
                _wikiParamOrDefault($aParams, 'lat_direction', 'N')
            ),
            degreesAndMinutesToDecimal(
                $aParams['long_degrees'],
                _wikiParamOrDefault($aParams, 'long_minutes', 0),
                _wikiParamOrDefault($aParams, 'long_seconds', 0),
                _wikiParamOrDefault($aParams, 'long_direction', 'E')
            ),
        );
    }

    // plain decimal latitude / longitude
    if (isset($aParams['latitude']) && isset($aParams['longitude'])) {
        if (preg_match('#[0-9.]+#', $aParams['latitude']) && preg_match('#[0-9.]+#', $aParams['longitude'])) {
            $aLatLon = array((float)$aParams['latitude'], (float)$aParams['longitude']);
        }
    }

    // {{coord}}: the position of the N/S letter tells which notation is used.
    // Once a notation is recognised every argument it needs must be present,
    // otherwise nothing is taken from the template; falling through to
    // another notation would misread the values.
    if (strtolower($sTemplateName) === 'coord') {
        if (isset($aParams[3]) && _wikiIsLatitudeHemisphere($aParams[3])) {
            // d|m|s|N|d|m|s|E
            if (isset($aParams[0], $aParams[1], $aParams[2], $aParams[4], $aParams[5], $aParams[6], $aParams[7])) {
                $aLatLon = array(
                    degreesAndMinutesToDecimal($aParams[0], $aParams[1], $aParams[2], $aParams[3]),
                    degreesAndMinutesToDecimal($aParams[4], $aParams[5], $aParams[6], $aParams[7]),
                );
            }
        } elseif (isset($aParams[0]) && isset($aParams[1]) && isset($aParams[2]) && _wikiIsLatitudeHemisphere($aParams[2])) {
            // d|m|N|d|m|E
            if (isset($aParams[3], $aParams[4], $aParams[5])) {
                $aLatLon = array(
                    degreesAndMinutesToDecimal($aParams[0], $aParams[1], 0, $aParams[2]),
                    degreesAndMinutesToDecimal($aParams[3], $aParams[4], 0, $aParams[5]),
                );
            }
        } elseif (isset($aParams[0]) && isset($aParams[1]) && _wikiIsLatitudeHemisphere($aParams[1])) {
            // d|N|d|E
            if (isset($aParams[2], $aParams[3])) {
                $aLatLon = array(
                    (strtoupper($aParams[1]) === 'N' ? 1 : -1) * (float)$aParams[0],
                    (strtoupper($aParams[3]) === 'E' ? 1 : -1) * (float)$aParams[2],
                );
            }
        } elseif (isset($aParams[0]) && is_numeric($aParams[0]) && isset($aParams[1]) && is_numeric($aParams[1])) {
            // signed decimal lat|lon
            $aLatLon = array((float)$aParams[0], (float)$aParams[1]);
        }
    }

    // Textual Latitude / Longitude, optionally a range ("38° N to 40° N").
    if (isset($aParams['Latitude']) && isset($aParams['Longitude'])) {
        // The previous replacement swapped what reads as a space for a space.
        // Assumption: non-breaking spaces were meant, so both the entity and
        // the UTF-8 character are normalised here.
        $aNonBreaking = array('&nbsp;', "\xC2\xA0");
        $fLat = _wikiDegreeTextToDecimal(str_replace($aNonBreaking, ' ', $aParams['Latitude']), 'NS');
        $fLon = _wikiDegreeTextToDecimal(str_replace($aNonBreaking, ' ', $aParams['Longitude']), 'EW');
        // Accept the pair only as a whole so fLat never appears without fLon.
        if ($fLat !== null && $fLon !== null) {
            $aLatLon = array($fLat, $fLon);
        }
    }

    return $aLatLon;
}

/**
 * Derive page properties from the templates found on a wiki page.
 *
 * For every property the first template that supplies it wins. Keys that may
 * appear in the result: sOfficialName, iPopulation, sWebsite,
 * sTopLevelDomain, sInfoboxType, fLat and fLon (the last two always together).
 *
 * @param array $aTemplates List of array(template name, raw parameter strings),
 *                          the parameter strings ordered newest first.
 *
 * @return array Property map; empty when nothing could be extracted.
 */
function _templatesToProperties($aTemplates)
{
    $aPageProperties = array();
    $aPopulationKeys = array('population', 'population_total', 'population_urban', 'population_estimate');

    foreach ($aTemplates as $aTemplate) {
        // Parameters are stored newest first; restore page order and split
        // "name = value" pairs. Parameters without '=' stay positional.
        $aParams = array();
        foreach (array_reverse($aTemplate[1]) as $sParam) {
            $iPos = strpos($sParam, '=');
            if ($iPos === false) {
                $aParams[] = trim($sParam);
            } else {
                $aParams[trim(substr($sParam, 0, $iPos))] = trim(substr($sParam, $iPos + 1));
            }
        }

        if (!isset($aPageProperties['sOfficialName']) && isset($aParams['official_name']) && $aParams['official_name']) {
            $aPageProperties['sOfficialName'] = $aParams['official_name'];
        }

        // First population-like parameter that starts with a number wins;
        // thousands separators are removed before the integer cast.
        foreach ($aPopulationKeys as $sKey) {
            if (isset($aPageProperties['iPopulation'])) {
                break;
            }
            if (isset($aParams[$sKey]) && $aParams[$sKey] && preg_match('#^[0-9.,]+#', $aParams[$sKey])) {
                $aPageProperties['iPopulation'] = (int)str_replace(array(',', '.'), '', $aParams[$sKey]);
            }
        }

        if (!isset($aPageProperties['sWebsite']) && isset($aParams['website']) && $aParams['website']) {
            // Accepts "[url label]", "[url]" or a bare url and keeps the url.
            if (preg_match('#^\\[?([^ \\]]+)[^\\]]*\\]?$#', $aParams['website'], $aMatch)) {
                $aPageProperties['sWebsite'] = $aMatch[1];
                if (strpos($aPageProperties['sWebsite'], ':/'.'/') === false) {
                    $aPageProperties['sWebsite'] = 'http:/'.'/'.$aPageProperties['sWebsite'];
                }
            }
        }

        if (!isset($aPageProperties['sTopLevelDomain']) && isset($aParams['cctld']) && $aParams['cctld']) {
            $aPageProperties['sTopLevelDomain'] = str_replace(array('[', ']', '.'), '', $aParams['cctld']);
        }

        if (!isset($aPageProperties['sInfoboxType']) && strtolower(substr($aTemplate[0], 0, 7)) === 'infobox') {
            $aPageProperties['sInfoboxType'] = trim(substr($aTemplate[0], 8));
        }

        // Assume the first template with lots of params is the type (fallback for infobox)
        if (!isset($aPageProperties['sPossibleInfoboxType']) && sizeof($aParams) > 10) {
            $aPageProperties['sPossibleInfoboxType'] = trim($aTemplate[0]);
        }

        // Do we have a lat/lon?
        if (!isset($aPageProperties['fLat'])) {
            $aLatLon = _wikiTemplateLatLon($aTemplate[0], $aParams);
            if ($aLatLon !== null) {
                $aPageProperties['fLat'] = $aLatLon[0];
                $aPageProperties['fLon'] = $aLatLon[1];
            }
        }
    }

    // Without a real infobox, fall back to the guessed type, marked with '#'.
    if (isset($aPageProperties['sPossibleInfoboxType'])) {
        if (!isset($aPageProperties['sInfoboxType'])) {
            $aPageProperties['sInfoboxType'] = '#'.$aPageProperties['sPossibleInfoboxType'];
        }
        unset($aPageProperties['sPossibleInfoboxType']);
    }

    return $aPageProperties;
}
// Parse stored article text and build UPDATE statements that copy infobox
// type, population, website and coordinates into wikipedia_article.
// NOTE(review): the visible option table declares 'parse-articles', not
// 'parse-wikipedia' - confirm which key getCmdOpt() fills.
// NOTE(review): the option value is concatenated into the SQL below without
// escaping or an integer cast.
// NOTE(review): the statements that execute $sSQL are not visible in this excerpt.
308 if (isset($aCMDResult['parse-wikipedia'])) {
// Titles of namespace-0 pages whose page_id modulo 10 equals the option value
// and whose text mentions '{{Coord' or both 'lat' and 'lon'.
310 $aArticleNames = $oDB->getCol('select page_title from content where page_namespace = 0 and page_id %10 = '.$aCMDResult['parse-wikipedia'].' and (page_content ilike \'%{{Coord%\' or (page_content ilike \'%lat%\' and page_content ilike \'%lon%\'))');
311 // $aArticleNames = $oDB->getCol($sSQL = 'select page_title from content where page_namespace = 0 and (page_content ilike \'%{{Coord%\' or (page_content ilike \'%lat%\' and page_content ilike \'%lon%\')) and page_title in (\'Virginia\')');
312 foreach ($aArticleNames as $sArticleName) {
// Load the article text by title and extract its properties.
313 $sPageText = $oDB->getOne('select page_content from content where page_namespace = 0 and page_title = \''.pg_escape_string($sArticleName).'\'');
314 $aP = _templatesToProperties(_parseWikipediaContent($sPageText));
316 if (isset($aP['sInfoboxType'])) {
// Collapse runs of whitespace in the infobox type to single spaces.
317 $aP['sInfoboxType'] = preg_replace('#\\s+#', ' ', $aP['sInfoboxType']);
318 $sSQL = 'update wikipedia_article set ';
319 $sSQL .= 'infobox_type = \''.pg_escape_string($aP['sInfoboxType']).'\'';
320 $sSQL .= ' where language = \'en\' and title = \''.pg_escape_string($sArticleName).'\';';
323 if (isset($aP['iPopulation'])) {
324 $sSQL = 'update wikipedia_article set ';
325 $sSQL .= 'population = \''.pg_escape_string($aP['iPopulation']).'\'';
326 $sSQL .= ' where language = \'en\' and title = \''.pg_escape_string($sArticleName).'\';';
329 if (isset($aP['sWebsite'])) {
330 $sSQL = 'update wikipedia_article set ';
331 $sSQL .= 'website = \''.pg_escape_string($aP['sWebsite']).'\'';
332 $sSQL .= ' where language = \'en\' and title = \''.pg_escape_string($sArticleName).'\';';
// Coordinates are skipped only when latitude and longitude both compare
// (loosely) equal to '-0'. fLon is read without its own isset() check.
335 if (isset($aP['fLat']) && ($aP['fLat']!='-0' || $aP['fLon']!='-0')) {
336 if (!isset($aP['sInfoboxType'])) $aP['sInfoboxType'] = '';
// Progress line: title|infobox type|lat|lon
337 echo $sArticleName.'|'.$aP['sInfoboxType'].'|'.$aP['fLat'].'|'.$aP['fLon'] ."\n";
338 $sSQL = 'update wikipedia_article set ';
339 $sSQL .= 'lat = \''.pg_escape_string($aP['fLat']).'\',';
340 $sSQL .= 'lon = \''.pg_escape_string($aP['fLon']).'\'';
341 $sSQL .= ' where language = \'en\' and title = \''.pg_escape_string($sArticleName).'\';';
// XML start-element callback (registered with xml_set_element_handler() in
// the link section of this file). Appends the element's attribute array to the
// global $aNominatRecords list.
// NOTE(review): lines between the global declaration and the append are not
// visible here; the append may be restricted to particular element names.
348 function nominatimXMLStart($hParser, $sName, $aAttr)
350 global $aNominatRecords;
353 $aNominatRecords[] = $aAttr;
// XML end-element callback, registered together with nominatimXMLStart().
// NOTE(review): the body is not visible in this excerpt.
359 function nominatimXMLEnd($hParser, $sName)
364 if (isset($aCMDResult['link'])) {
366 $aWikiArticles = $oDB->getAll("select * from wikipedia_article where language = 'en' and lat is not null and osm_type is null and totalcount < 31 order by importance desc limit 200000");
368 // If you point this script at production OSM you will be blocked
369 $sNominatimBaseURL = 'http://SEVERNAME/search.php';
371 foreach ($aWikiArticles as $aRecord) {
372 $aRecord['name'] = str_replace('_', ' ', $aRecord['title']);
374 $sURL = $sNominatimBaseURL.'?format=xml&accept-language=en';
376 echo "\n-- ".$aRecord['name'].", ".$aRecord['infobox_type']."\n";
377 $fMaxDist = 0.0000001;
379 switch (strtolower($aRecord['infobox_type'])) {
380 case 'former country':
383 $fMaxDist = 60; // effectively turn it off
384 $sURL .= "&viewbox=".($aRecord['lon']-$fMaxDist).",".($aRecord['lat']+$fMaxDist).",".($aRecord['lon']+$fMaxDist).",".($aRecord['lat']-$fMaxDist);
390 $fMaxDist = 60; // effectively turn it off
391 $sURL .= "&featuretype=country";
392 $sURL .= "&viewbox=".($aRecord['lon']-$fMaxDist).",".($aRecord['lat']+$fMaxDist).",".($aRecord['lon']+$fMaxDist).",".($aRecord['lat']-$fMaxDist);
394 case 'prefecture japan':
395 $aRecord['name'] = trim(str_replace(' Prefecture', ' ', $aRecord['name']));
401 case 'u.s. state symbols':
403 case 'province or territory of canada':
404 case 'indian jurisdiction':
406 case 'french region':
407 case 'region of italy':
409 case '#australia state or territory':
410 case 'russian federal subject':
412 $sURL .= "&featuretype=state";
413 $sURL .= "&viewbox=".($aRecord['lon']-$fMaxDist).",".($aRecord['lat']+$fMaxDist).",".($aRecord['lon']+$fMaxDist).",".($aRecord['lat']-$fMaxDist);
415 case 'protected area':
417 $sURL .= "&nearlat=".$aRecord['lat'];
418 $sURL .= "&nearlon=".$aRecord['lon'];
419 $sURL .= "&viewbox=".($aRecord['lon']-$fMaxDist).",".($aRecord['lat']+$fMaxDist).",".($aRecord['lon']+$fMaxDist).",".($aRecord['lat']-$fMaxDist);
424 case 'french commune':
425 case 'italian comune':
427 case 'italian comune':
428 case 'australian place':
434 case 'russian inhabited locality':
435 case 'finnish municipality/land area':
436 case 'england county':
437 case 'israel municipality':
441 $sURL .= "&featuretype=settlement";
442 $sURL .= "&viewbox=".($aRecord['lon']-0.5).",".($aRecord['lat']+0.5).",".($aRecord['lon']+0.5).",".($aRecord['lat']-0.5);
445 case 'mountain pass':
450 $sURL .= "&viewbox=".($aRecord['lon']-0.5).",".($aRecord['lat']+0.5).",".($aRecord['lon']+0.5).",".($aRecord['lat']-0.5);
454 $aTypes = array('wreck');
455 $sURL .= "&viewbox=".($aRecord['lon']-0.01).",".($aRecord['lat']+0.01).",".($aRecord['lon']+0.01).",".($aRecord['lat']-0.01);
456 $sURL .= "&nearlat=".$aRecord['lat'];
457 $sURL .= "&nearlon=".$aRecord['lon'];
464 $sURL .= "&viewbox=".($aRecord['lon']-0.01).",".($aRecord['lat']+0.01).",".($aRecord['lon']+0.01).",".($aRecord['lat']-0.01);
465 $sURL .= "&bounded=1";
466 $sURL .= "&nearlat=".$aRecord['lat'];
467 $sURL .= "&nearlon=".$aRecord['lon'];
472 $sURL .= "&viewbox=".($aRecord['lon']-0.01).",".($aRecord['lat']+0.01).",".($aRecord['lon']+0.01).",".($aRecord['lat']-0.01);
473 // $sURL .= "&bounded=1";
474 $sURL .= "&nearlat=".$aRecord['lat'];
475 $sURL .= "&nearlon=".$aRecord['lon'];
476 echo "-- Unknown: ".$aRecord['infobox_type']."\n";
479 $sNameURL = $sURL.'&q='.urlencode($aRecord['name']);
482 $sXML = file_get_contents($sNameURL);
484 $aNominatRecords = array();
485 $hXMLParser = xml_parser_create();
486 xml_set_element_handler($hXMLParser, 'nominatimXMLStart', 'nominatimXMLEnd');
487 xml_parse($hXMLParser, $sXML, true);
488 xml_parser_free($hXMLParser);
490 if (!isset($aNominatRecords[0])) {
491 $aNameParts = preg_split('#[(,]#', $aRecord['name']);
492 if (sizeof($aNameParts) > 1) {
493 $sNameURL = $sURL.'&q='.urlencode(trim($aNameParts[0]));
495 $sXML = file_get_contents($sNameURL);
497 $aNominatRecords = array();
498 $hXMLParser = xml_parser_create();
499 xml_set_element_handler($hXMLParser, 'nominatimXMLStart', 'nominatimXMLEnd');
500 xml_parse($hXMLParser, $sXML, true);
501 xml_parser_free($hXMLParser);
505 // assume first is best/right
506 for ($i = 0; $i < sizeof($aNominatRecords); $i++) {
507 $fDiff = ($aRecord['lat']-$aNominatRecords[$i]['LAT']) * ($aRecord['lat']-$aNominatRecords[$i]['LAT']);
508 $fDiff += ($aRecord['lon']-$aNominatRecords[$i]['LON']) * ($aRecord['lon']-$aNominatRecords[$i]['LON']);
509 $fDiff = sqrt($fDiff);
511 // If it was an unknown type base it on the rank of the found result
512 $iRank = (int)$aNominatRecords[$i]['PLACE_RANK'];
513 if ($iRank <= 4) $fMaxDist = 2;
514 elseif ($iRank <= 8) $fMaxDist = 1;
515 elseif ($iRank <= 10) $fMaxDist = 0.8;
516 elseif ($iRank <= 12) $fMaxDist = 0.6;
517 elseif ($iRank <= 17) $fMaxDist = 0.2;
518 elseif ($iRank <= 18) $fMaxDist = 0.1;
519 elseif ($iRank <= 22) $fMaxDist = 0.02;
520 elseif ($iRank <= 26) $fMaxDist = 0.001;
521 else $fMaxDist = 0.001;
523 echo "-- FOUND \"".substr($aNominatRecords[$i]['DISPLAY_NAME'], 0, 50)."\", ".$aNominatRecords[$i]['CLASS'].", ".$aNominatRecords[$i]['TYPE'].", ".$aNominatRecords[$i]['PLACE_RANK'].", ".$aNominatRecords[$i]['OSM_TYPE']." (dist:$fDiff, max:$fMaxDist)\n";
524 if ($fDiff > $fMaxDist) {
525 echo "-- Diff too big $fDiff (max: $fMaxDist)".$aRecord['lat'].','.$aNominatRecords[$i]['LAT'].' & '.$aRecord['lon'].','.$aNominatRecords[$i]['LON']." \n";
527 $sSQL = "update wikipedia_article set osm_type=";
528 switch ($aNominatRecords[$i]['OSM_TYPE']) {
539 $sSQL .= ", osm_id=".$aNominatRecords[$i]['OSM_ID']." where language = '".pg_escape_string($aRecord['language'])."' and title = '".pg_escape_string($aRecord['title'])."'";