]> git.openstreetmap.org Git - nominatim.git/blob - utils/importWikipedia.php
Merge branch 'delete-us-postcode-without-name' of https://github.com/mtmail/Nominatim
[nominatim.git] / utils / importWikipedia.php
#!/usr/bin/php -Cq
<?php

// Load the project settings and the command-line bootstrap, then raise the
// memory limit for this script.
require_once(dirname(__DIR__).'/settings/settings.php');
require_once(CONST_BasePath.'/lib/init-cmd.php');
ini_set('memory_limit', '800M');

// Option table handed to getCmdOpt(): the first entry is the description
// text, every following array describes one command-line switch.
// NOTE(review): 'parse-articles' is declared here, but the script later tests
// $aCMDResult['parse-wikipedia'] - confirm which option name is intended.
$aCMDOptions = array(
    'Create and setup nominatim search system',

    // generic switches
    array('help', 'h', 0, 1, 0, 0, false, 'Show Help'),
    array('quiet', 'q', 0, 1, 0, 0, 'bool', 'Quiet output'),
    array('verbose', 'v', 0, 1, 0, 0, 'bool', 'Verbose output'),

    // actions
    array('create-tables', '', 0, 1, 0, 0, 'bool', 'Create wikipedia tables'),
    array('parse-articles', '', 0, 1, 0, 0, 'bool', 'Parse wikipedia articles'),
    array('link', '', 0, 1, 0, 0, 'bool', 'Try to link to existing OSM ids'),
);

// Parse argv into $aCMDResult.
getCmdOpt($_SERVER['argv'], $aCMDOptions, $aCMDResult, true, true);
19
20 /*
21 $sTestPageText = <<<EOD
22 {{Coord|47|N|2|E|type:country_region:FR|display=title}}
23 {{ Infobox Amusement park
24 | name = Six Flags Great Adventure
25 | image = [[File:SixFlagsGreatAdventure logo.png]]
26 | caption = Six Flags Great Adventure logo
27 | location = [[Jackson, New Jersey|Jackson]]
28 | location2 = New Jersey
29 | location3 = United States
30 | address = 1 Six Flags Boulevard<ref name="drivedir"/>
31 | season = March/April through October/November
32 | opening_date = July 1, 1974
33 | previous_names = Great Adventure
34 | area_acre = 2200
35 | rides = 45 park admission rides
36 | coasters = 12
37 | water_rides = 2
38 | owner = [[Six Flags]]
39 | general_manager = 
40 | homepage = [http://www.sixflags.com/parks/greatadventure/ Six Flags Great Adventure]
41 }}
42 EOD;
43 var_dump(_templatesToProperties(_parseWikipediaContent($sTestPageText)));
44 exit;
45 //| coordinates = {{Coord|40|08|16.65|N|74|26|26.69|W|region:US-NJ_type:landmark|display=inline,title}}
46 */
47 /*
48
49     $a = array();
50     $a[] = 'test';
51
52     $oDB &= getDB();
53
54     if ($aCMDResult['drop-tables'])
55     {
56         $oDB->query('DROP TABLE wikipedia_article');
57         $oDB->query('DROP TABLE wikipedia_link');
58     }
59 */
60
// --create-tables: create the wikipedia_article and wikipedia_link tables.
if ($aCMDResult['create-tables']) {
    // Obtain the database handle the same way the other sections of this script do.
    $oDB =& getDB();

    // The closing "EOD;" marker was missing here, which made this nowdoc
    // swallow the following PHP statements as part of the SQL text.
    $sSQL = <<<'EOD'
CREATE TABLE wikipedia_article (
    language text NOT NULL,
    title text NOT NULL,
    langcount integer,
    othercount integer,
    totalcount integer,
    lat double precision,
    lon double precision,
    importance double precision,
    title_en text,
    osm_type character(1),
    osm_id bigint,
    infobox_type text,
    population bigint,
    website text
);
EOD;
    $oDB->query($sSQL);

    // Add the geometry column "location" (SRID 4326, 2 dimensions) to the article table.
    $oDB->query("SELECT AddGeometryColumn('wikipedia_article', 'location', 4326, 'GEOMETRY', 2)");

    $sSQL = <<<'EOD'
CREATE TABLE wikipedia_link (
  from_id INTEGER,
  to_name text
  );
EOD;
    $oDB->query($sSQL);
}
92
/**
 * Convert a degrees / minutes / seconds coordinate to signed decimal degrees.
 *
 * @param mixed  $iDegrees Degrees part (cast to float).
 * @param mixed  $iMinutes Arc-minutes part (cast to float).
 * @param mixed  $fSeconds Arc-seconds part (cast to float).
 * @param string $sNSEW    Hemisphere letter; 'S' or 'W' (any case) negates the result.
 *
 * @return float Decimal degrees.
 */
function degreesAndMinutesToDecimal($iDegrees, $iMinutes=0, $fSeconds=0, $sNSEW='N')
{
    $sHemisphere = strtoupper($sNSEW);

    $iSign = 1;
    if ($sHemisphere == 'S' || $sHemisphere == 'W') {
        $iSign = -1;
    }

    $fMagnitude = (float)$iDegrees + (float)$iMinutes/60 + (float)$fSeconds/3600;

    return $iSign * $fMagnitude;
}
98
/**
 * Append a piece of text to whatever construct is currently being parsed.
 *
 * @param string $sText          Text to append.
 * @param string $sState         Current parser state ('template', 'templateparam',
 *                               'link', 'linksynonim' or 'body').
 * @param array  $aTemplateStack Open templates, innermost first (modified in place).
 * @param array  $aLinkStack     Open links as array(page, label), innermost first
 *                               (modified in place).
 *
 * @return void
 */
function _appendWikiText($sText, $sState, &$aTemplateStack, &$aLinkStack)
{
    switch ($sState) {
        case 'template':
            $aTemplateStack[0][0] .= trim($sText);
            break;
        case 'templateparam':
            $aTemplateStack[0][1][0] .= $sText;
            break;
        case 'link':
            $aLinkStack[0][0] .= trim($sText);
            break;
        case 'linksynonim':
            $aLinkStack[0][1] .= $sText;
            break;
        case 'body':
            // Plain page text is not part of the result.
            break;
        default:
            var_dump($sState, $aTemplateStack, $sText);
            fail('unknown state');
    }
}

/**
 * Extract every template ({{ ... }}) from a piece of wikitext.
 *
 * Newlines, HTML comments and <math> sections are removed first. The text is
 * then split on "{{", "}}", "[[", "]]" and "|" and fed through a small state
 * machine. Links ([[page|label]]) are replaced by their label (or the page
 * name when there is no label, or the file name for "Image:" links) in the
 * enclosing template name, template parameter or link.
 *
 * @param string $sPageText Raw wikitext of a page.
 *
 * @return array List of array(name, params) in the order the templates are
 *               closed. params holds the raw "key = value" / positional
 *               strings in REVERSE order of appearance.
 */
function _parseWikipediaContent($sPageText)
{
    $sPageText = str_replace("\n", ' ', $sPageText);
    $sPageText = preg_replace('#<!--.*?-->#m', '', $sPageText);
    $sPageText = preg_replace('#<math>.*?<\\/math>#m', '', $sPageText);

    $aPageText = preg_split('#({{|}}|\\[\\[|\\]\\]|[|])#', $sPageText, -1, PREG_SPLIT_DELIM_CAPTURE);

    $aTemplates = array();

    $aTemplateStack = array();
    // A stack (instead of a single pair of variables) keeps the outer link's
    // page and label intact while a nested link is being read.
    $aLinkStack = array();
    $aState = array('body');

    foreach ($aPageText as $sPart) {
        switch ($sPart) {
            case '{{':
                array_unshift($aTemplateStack, array('', array()));
                array_unshift($aState, 'template');
                break;
            case '}}':
                if ($aState[0] == 'template' || $aState[0] == 'templateparam') {
                    $aTemplates[] = array_shift($aTemplateStack);
                    array_shift($aState);
                }
                break;
            case '[[':
                array_unshift($aLinkStack, array('', ''));
                array_unshift($aState, 'link');
                break;
            case ']]':
                if ($aState[0] == 'link' || $aState[0] == 'linksynonim') {
                    list($sLinkPage, $sLinkSyn) = array_shift($aLinkStack);
                    if (!$sLinkSyn) {
                        $sLinkSyn = $sLinkPage;
                    }
                    if (substr($sLinkPage, 0, 6) == 'Image:') {
                        $sLinkSyn = substr($sLinkPage, 6);
                    }

                    array_shift($aState);
                    // Hand the visible link text (not the "]]" delimiter) to
                    // the construct that contains the link.
                    _appendWikiText($sLinkSyn, $aState[0], $aTemplateStack, $aLinkStack);
                }
                break;
            case '|':
                if ($aState[0] == 'template' || $aState[0] == 'templateparam') {
                    // Create a new template parameter
                    $aState[0] = 'templateparam';
                    array_unshift($aTemplateStack[0][1], '');
                }
                if ($aState[0] == 'link') {
                    $aState[0] = 'linksynonim';
                }
                break;
            default:
                _appendWikiText($sPart, $aState[0], $aTemplateStack, $aLinkStack);
                break;
        }
    }
    return $aTemplates;
}
205
/**
 * Return a template parameter, or a default when it is not present.
 *
 * @param array      $aParams  Parsed template parameters.
 * @param string|int $sKey     Parameter name or positional index.
 * @param mixed      $mDefault Value returned when the parameter is missing.
 *
 * @return mixed
 */
function _templateParam($aParams, $sKey, $mDefault)
{
    return isset($aParams[$sKey]) ? $aParams[$sKey] : $mDefault;
}

/**
 * Interpret the positional parameters of a {{Coord}} template.
 *
 * Recognised layouts: d|m|s|NS|d|m|s|EW, d|m|NS|d|m|EW, d|NS|d|EW and
 * lat|lon (both numeric). A layout whose hemisphere marker is found but
 * whose remaining values are missing is rejected instead of being read
 * with undefined parameters.
 *
 * @param array $aParams Parsed template parameters (positional ones under integer keys).
 *
 * @return array|null array(lat, lon) as floats, or null when no layout matches.
 */
function _coordParamsToLatLon($aParams)
{
    $fnIsNorthSouth = function ($iIndex) use ($aParams) {
        return isset($aParams[$iIndex])
            && in_array(strtoupper($aParams[$iIndex]), array('N', 'S'), true);
    };

    if ($fnIsNorthSouth(3)) {
        if (!isset($aParams[0], $aParams[1], $aParams[2], $aParams[4], $aParams[5], $aParams[6], $aParams[7])) {
            return null;
        }
        return array(
            degreesAndMinutesToDecimal($aParams[0], $aParams[1], $aParams[2], $aParams[3]),
            degreesAndMinutesToDecimal($aParams[4], $aParams[5], $aParams[6], $aParams[7]),
        );
    }
    if (isset($aParams[0]) && isset($aParams[1]) && $fnIsNorthSouth(2)) {
        if (!isset($aParams[3], $aParams[4], $aParams[5])) {
            return null;
        }
        return array(
            degreesAndMinutesToDecimal($aParams[0], $aParams[1], 0, $aParams[2]),
            degreesAndMinutesToDecimal($aParams[3], $aParams[4], 0, $aParams[5]),
        );
    }
    if (isset($aParams[0]) && $fnIsNorthSouth(1)) {
        if (!isset($aParams[2], $aParams[3])) {
            return null;
        }
        return array(
            (strtoupper($aParams[1]) == 'N' ? 1 : -1) * (float)$aParams[0],
            (strtoupper($aParams[3]) == 'E' ? 1 : -1) * (float)$aParams[2],
        );
    }
    if (isset($aParams[0]) && is_numeric($aParams[0]) && isset($aParams[1]) && is_numeric($aParams[1])) {
        return array((float)$aParams[0], (float)$aParams[1]);
    }
    return null;
}

/**
 * Derive page properties from the templates found on a wikipedia page.
 *
 * For every property the first template that supplies a usable value wins.
 *
 * @param array $aTemplates Templates as returned by _parseWikipediaContent():
 *                          a list of array(name, params) with params in
 *                          reverse order of appearance.
 *
 * @return array Any of the keys sOfficialName, iPopulation, sWebsite,
 *               sTopLevelDomain, sInfoboxType, fLat, fLon. sInfoboxType is
 *               prefixed with '#' when it was only guessed from the first
 *               template with more than 10 parameters.
 */
function _templatesToProperties($aTemplates)
{
    $aPageProperties = array();
    $aPopulationKeys = array('population', 'population_total', 'population_urban', 'population_estimate');

    foreach ($aTemplates as $aTemplate) {
        // Split "key = value" parameters; anything without '=' is positional.
        $aParams = array();
        foreach (array_reverse($aTemplate[1]) as $sParam) {
            $iPos = strpos($sParam, '=');
            if ($iPos === false) {
                $aParams[] = trim($sParam);
            } else {
                $aParams[trim(substr($sParam, 0, $iPos))] = trim(substr($sParam, $iPos+1));
            }
        }

        if (!isset($aPageProperties['sOfficialName']) && isset($aParams['official_name']) && $aParams['official_name']) {
            $aPageProperties['sOfficialName'] = $aParams['official_name'];
        }

        // Population: take the first parameter (in priority order) that starts with a number.
        foreach ($aPopulationKeys as $sKey) {
            if (isset($aPageProperties['iPopulation'])) {
                break;
            }
            if (isset($aParams[$sKey]) && $aParams[$sKey] && preg_match('#^[0-9.,]+#', $aParams[$sKey])) {
                $aPageProperties['iPopulation'] = (int)str_replace(array(',','.'), '', $aParams[$sKey]);
            }
        }

        if (!isset($aPageProperties['sWebsite']) && isset($aParams['website']) && $aParams['website']) {
            // Accepts a bare URL or an external link of the form "[url label]".
            if (preg_match('#^\\[?([^ \\]]+)[^\\]]*\\]?$#', $aParams['website'], $aMatch)) {
                $aPageProperties['sWebsite'] = $aMatch[1];
                if (strpos($aPageProperties['sWebsite'], '://') === false) {
                    $aPageProperties['sWebsite'] = 'http://'.$aPageProperties['sWebsite'];
                }
            }
        }

        if (!isset($aPageProperties['sTopLevelDomain']) && isset($aParams['cctld']) && $aParams['cctld']) {
            $aPageProperties['sTopLevelDomain'] = str_replace(array('[',']','.'), '', $aParams['cctld']);
        }

        if (!isset($aPageProperties['sInfoboxType']) && strtolower(substr($aTemplate[0], 0, 7)) === 'infobox') {
            $aPageProperties['sInfoboxType'] = trim(substr($aTemplate[0], 8));
        }

        // Assume the first template with lots of params is the type (fallback for infobox)
        if (!isset($aPageProperties['sPossibleInfoboxType']) && count($aParams) > 10) {
            $aPageProperties['sPossibleInfoboxType'] = trim($aTemplate[0]);
        }

        // do we have a lat/lon
        if (isset($aPageProperties['fLat'])) {
            continue;
        }

        if (isset($aParams['latd']) && isset($aParams['longd'])) {
            $aPageProperties['fLat'] = degreesAndMinutesToDecimal(
                $aParams['latd'],
                _templateParam($aParams, 'latm', 0),
                _templateParam($aParams, 'lats', 0),
                _templateParam($aParams, 'latNS', 'N')
            );
            $aPageProperties['fLon'] = degreesAndMinutesToDecimal(
                $aParams['longd'],
                _templateParam($aParams, 'longm', 0),
                _templateParam($aParams, 'longs', 0),
                _templateParam($aParams, 'longEW', 'E')
            );
        }

        // Both the latitude AND the longitude degrees must be present.
        if (isset($aParams['lat_degrees']) && isset($aParams['long_degrees'])) {
            $aPageProperties['fLat'] = degreesAndMinutesToDecimal(
                $aParams['lat_degrees'],
                _templateParam($aParams, 'lat_minutes', 0),
                _templateParam($aParams, 'lat_seconds', 0),
                _templateParam($aParams, 'lat_direction', 'N')
            );
            $aPageProperties['fLon'] = degreesAndMinutesToDecimal(
                $aParams['long_degrees'],
                _templateParam($aParams, 'long_minutes', 0),
                _templateParam($aParams, 'long_seconds', 0),
                _templateParam($aParams, 'long_direction', 'E')
            );
        }

        if (isset($aParams['latitude']) && isset($aParams['longitude'])) {
            if (preg_match('#[0-9.]+#', $aParams['latitude']) && preg_match('#[0-9.]+#', $aParams['longitude'])) {
                $aPageProperties['fLat'] = (float)$aParams['latitude'];
                $aPageProperties['fLon'] = (float)$aParams['longitude'];
            }
        }

        if (strtolower($aTemplate[0]) === 'coord') {
            $aLatLon = _coordParamsToLatLon($aParams);
            if ($aLatLon !== null) {
                $aPageProperties['fLat'] = $aLatLon[0];
                $aPageProperties['fLon'] = $aLatLon[1];
            }
        }

        if (isset($aParams['Latitude']) && isset($aParams['Longitude'])) {
            $sLatitude = str_replace('&nbsp;', ' ', $aParams['Latitude']);
            $sLongitude = str_replace('&nbsp;', ' ', $aParams['Longitude']);

            // A range ("A to B") is reduced to its midpoint.
            $fLat = null;
            if (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([NS]) to ([0-9]+)°( ([0-9]+)′)? ([NS])#', $sLatitude, $aMatch)) {
                $fLat = (degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4])
                        + degreesAndMinutesToDecimal($aMatch[5], $aMatch[7], 0, $aMatch[8])) / 2;
            } elseif (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([NS])#', $sLatitude, $aMatch)) {
                $fLat = degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4]);
            }

            $fLon = null;
            if (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([EW]) to ([0-9]+)°( ([0-9]+)′)? ([EW])#', $sLongitude, $aMatch)) {
                $fLon = (degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4])
                        + degreesAndMinutesToDecimal($aMatch[5], $aMatch[7], 0, $aMatch[8])) / 2;
            } elseif (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([EW])#', $sLongitude, $aMatch)) {
                $fLon = degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4]);
            }

            // Only accept the pair when both halves parsed, so that fLat is
            // never reported without a matching fLon.
            if ($fLat !== null && $fLon !== null) {
                $aPageProperties['fLat'] = $fLat;
                $aPageProperties['fLon'] = $fLon;
            }
        }
    }

    if (isset($aPageProperties['sPossibleInfoboxType'])) {
        if (!isset($aPageProperties['sInfoboxType'])) {
            $aPageProperties['sInfoboxType'] = '#'.$aPageProperties['sPossibleInfoboxType'];
        }
        unset($aPageProperties['sPossibleInfoboxType']);
    }
    return $aPageProperties;
}
349
// --parse-wikipedia N: parse the wikitext of every candidate article whose
// page_id % 10 equals N and store the extracted properties.
// NOTE(review): the option table at the top of this script declares
// 'parse-articles', not 'parse-wikipedia' - confirm which name is intended.
if (isset($aCMDResult['parse-wikipedia'])) {
    $oDB =& getDB();

    // The value is concatenated into SQL, so force it to an integer first.
    $iBucket = (int)$aCMDResult['parse-wikipedia'];

    $aArticleNames = $oDB->getCol('select page_title from content where page_namespace = 0 and page_id %10 = '.$iBucket.' and (page_content ilike \'%{{Coord%\' or (page_content ilike \'%lat%\' and page_content ilike \'%lon%\'))');

    foreach ($aArticleNames as $sArticleName) {
        $sEscapedTitle = pg_escape_string($sArticleName);

        $sPageText = $oDB->getOne('select page_content from content where page_namespace = 0 and page_title = \''.$sEscapedTitle.'\'');
        $aP = _templatesToProperties(_parseWikipediaContent($sPageText));

        $sUpdate = 'update wikipedia_article set ';
        $sWhere = ' where language = \'en\' and title = \''.$sEscapedTitle.'\';';

        if (isset($aP['sInfoboxType'])) {
            // Collapse runs of whitespace in the infobox type.
            $aP['sInfoboxType'] = preg_replace('#\\s+#', ' ', $aP['sInfoboxType']);
            $oDB->query($sUpdate.'infobox_type = \''.pg_escape_string($aP['sInfoboxType']).'\''.$sWhere);
        }
        if (isset($aP['iPopulation'])) {
            $oDB->query($sUpdate.'population = \''.pg_escape_string($aP['iPopulation']).'\''.$sWhere);
        }
        if (isset($aP['sWebsite'])) {
            $oDB->query($sUpdate.'website = \''.pg_escape_string($aP['sWebsite']).'\''.$sWhere);
        }

        // Store coordinates only when both are known and they are not 0/0.
        if (isset($aP['fLat']) && isset($aP['fLon']) && ($aP['fLat'] != 0 || $aP['fLon'] != 0)) {
            if (!isset($aP['sInfoboxType'])) {
                $aP['sInfoboxType'] = '';
            }
            echo $sArticleName.'|'.$aP['sInfoboxType'].'|'.$aP['fLat'].'|'.$aP['fLon'] ."\n";
            $sSQL = $sUpdate;
            $sSQL .= 'lat = \''.pg_escape_string($aP['fLat']).'\',';
            $sSQL .= 'lon = \''.pg_escape_string($aP['fLon']).'\'';
            $sSQL .= $sWhere;
            $oDB->query($sSQL);
        }
    }
}
394
/**
 * Start-element callback for the XML parser that reads search results.
 *
 * Appends the attribute array of every PLACE element to the global
 * $aNominatRecords list; all other elements are ignored.
 *
 * @param mixed  $hParser Parser handle (unused).
 * @param string $sName   Element name as reported by the parser.
 * @param array  $aAttr   Attributes of the element.
 *
 * @return void
 */
function nominatimXMLStart($hParser, $sName, $aAttr)
{
    global $aNominatRecords;

    if ($sName == 'PLACE') {
        $aNominatRecords[] = $aAttr;
    }
}
405
/**
 * End-element callback for the XML parser; intentionally does nothing.
 *
 * @param mixed  $hParser Parser handle (unused).
 * @param string $sName   Element name (unused).
 *
 * @return void
 */
function nominatimXMLEnd($hParser, $sName)
{
    // Nothing to do when an element closes.
}
409
410
// --link: try to match wikipedia articles that have coordinates but no OSM
// object yet against the results of a Nominatim search for the article name.
if (isset($aCMDResult['link'])) {
    $oDB =& getDB();
    $aWikiArticles = $oDB->getAll("select * from wikipedia_article where language = 'en' and lat is not null and osm_type is null and totalcount < 31 order by importance desc limit 200000");

    // If you point this script at production OSM you will be blocked
    $sNominatimBaseURL = 'http://SEVERNAME/search.php';

    // "&viewbox=" parameter for a square of +/- $fHalfSize degrees around the article.
    $fnViewbox = function ($aRecord, $fHalfSize) {
        return "&viewbox=".($aRecord['lon']-$fHalfSize).",".($aRecord['lat']+$fHalfSize).",".($aRecord['lon']+$fHalfSize).",".($aRecord['lat']-$fHalfSize);
    };
    // "&nearlat=...&nearlon=..." parameters for the article position.
    $fnNear = function ($aRecord) {
        return "&nearlat=".$aRecord['lat']."&nearlon=".$aRecord['lon'];
    };
    // Mapping from the OSM_TYPE attribute of a result to the one-letter code stored in the table.
    $aOsmTypeCodes = array('relation' => 'R', 'way' => 'W', 'node' => 'N');

    foreach ($aWikiArticles as $aRecord) {
        $aRecord['name'] = str_replace('_', ' ', $aRecord['title']);

        $sURL = $sNominatimBaseURL.'?format=xml&accept-language=en';

        echo "\n-- ".$aRecord['name'].", ".$aRecord['infobox_type']."\n";
        // Maximum accepted distance (in degrees) between article and search result.
        $fMaxDist = 0.0000001;
        // When true the distance limit is later derived from the rank of each result.
        $bUnknown = false;
        switch (strtolower((string)$aRecord['infobox_type'])) {
            case 'former country':
                continue 2;
            case 'sea':
                $fMaxDist = 60; // effectively turn it off
                $sURL .= $fnViewbox($aRecord, $fMaxDist);
                break;
            case 'country':
            case 'island':
            case 'islands':
            case 'continent':
                $fMaxDist = 60; // effectively turn it off
                $sURL .= "&featuretype=country";
                $sURL .= $fnViewbox($aRecord, $fMaxDist);
                break;
            case 'prefecture japan':
                $aRecord['name'] = trim(str_replace(' Prefecture', ' ', $aRecord['name']));
                // intentional fall through: handled like a state
            case 'state':
            case '#us state':
            case 'county':
            case 'u.s. state':
            case 'u.s. state symbols':
            case 'german state':
            case 'province or territory of canada':
            case 'indian jurisdiction':
            case 'province':
            case 'french region':
            case 'region of italy':
            case 'kommune':
            case '#australia state or territory':
            case 'russian federal subject':
                $fMaxDist = 4;
                $sURL .= "&featuretype=state";
                $sURL .= $fnViewbox($aRecord, $fMaxDist);
                break;
            case 'protected area':
                $fMaxDist = 1;
                $sURL .= $fnNear($aRecord);
                $sURL .= $fnViewbox($aRecord, $fMaxDist);
                break;
            case 'settlement':
                $bUnknown = true;
                // intentional fall through: searched like the other settlement types
            case 'french commune':
            case 'italian comune':
            case 'uk place':
            case 'australian place':
            case 'german place':
            case '#geobox':
            case 'u.s. county':
            case 'municipality':
            case 'city japan':
            case 'russian inhabited locality':
            case 'finnish municipality/land area':
            case 'england county':
            case 'israel municipality':
            case 'russian city':
            case 'city':
                $fMaxDist = 0.2;
                $sURL .= "&featuretype=settlement";
                $sURL .= $fnViewbox($aRecord, 0.5);
                break;
            case 'mountain':
            case 'mountain pass':
            case 'river':
            case 'lake':
            case 'airport':
                $fMaxDist = 0.2;
                $sURL .= $fnViewbox($aRecord, 0.5);
                // A break was missing here, so these types fell through into
                // the 'ship begin' settings below.
                break;
            case 'ship begin':
                $fMaxDist = 0.1;
                $sURL .= $fnViewbox($aRecord, 0.01);
                $sURL .= $fnNear($aRecord);
                break;
            case 'road':
            case 'university':
            case 'company':
            case 'department':
                $fMaxDist = 0.005;
                $sURL .= $fnViewbox($aRecord, 0.01);
                $sURL .= "&bounded=1";
                $sURL .= $fnNear($aRecord);
                break;
            default:
                $bUnknown = true;
                $fMaxDist = 0.005;
                $sURL .= $fnViewbox($aRecord, 0.01);
                $sURL .= $fnNear($aRecord);
                echo "-- Unknown: ".$aRecord['infobox_type']."\n";
                break;
        }

        // Search for the full name first; if that finds nothing and the name
        // contains "(" or ",", retry with only the part before it.
        $aQueries = array($aRecord['name']);
        $aNameParts = preg_split('#[(,]#', $aRecord['name']);
        if (count($aNameParts) > 1) {
            $aQueries[] = trim($aNameParts[0]);
        }

        $aNominatRecords = array();
        foreach ($aQueries as $sQuery) {
            $sNameURL = $sURL.'&q='.urlencode($sQuery);
            var_dump($sNameURL);

            // Filled by nominatimXMLStart() through the global variable.
            $aNominatRecords = array();

            $sXML = file_get_contents($sNameURL);
            if ($sXML === false) {
                echo "-- Request failed\n";
                continue;
            }

            $hXMLParser = xml_parser_create();
            xml_set_element_handler($hXMLParser, 'nominatimXMLStart', 'nominatimXMLEnd');
            xml_parse($hXMLParser, $sXML, true);
            xml_parser_free($hXMLParser);

            if (isset($aNominatRecords[0])) {
                break;
            }
        }

        // assume first is best/right
        foreach ($aNominatRecords as $aPlace) {
            $fLatDiff = $aRecord['lat'] - $aPlace['LAT'];
            $fLonDiff = $aRecord['lon'] - $aPlace['LON'];
            $fDiff = sqrt($fLatDiff * $fLatDiff + $fLonDiff * $fLonDiff);

            if ($bUnknown) {
                // If it was an unknown type base it on the rank of the found result
                $iRank = (int)$aPlace['PLACE_RANK'];
                if ($iRank <= 4) {
                    $fMaxDist = 2;
                } elseif ($iRank <= 8) {
                    $fMaxDist = 1;
                } elseif ($iRank <= 10) {
                    $fMaxDist = 0.8;
                } elseif ($iRank <= 12) {
                    $fMaxDist = 0.6;
                } elseif ($iRank <= 17) {
                    $fMaxDist = 0.2;
                } elseif ($iRank <= 18) {
                    $fMaxDist = 0.1;
                } elseif ($iRank <= 22) {
                    $fMaxDist = 0.02;
                } else {
                    $fMaxDist = 0.001;
                }
            }

            echo "-- FOUND \"".substr($aPlace['DISPLAY_NAME'],0,50)."\", ".$aPlace['CLASS'].", ".$aPlace['TYPE'].", ".$aPlace['PLACE_RANK'].", ".$aPlace['OSM_TYPE']." (dist:$fDiff, max:$fMaxDist)\n";

            if ($fDiff > $fMaxDist) {
                echo "-- Diff too big $fDiff (max: $fMaxDist)".$aRecord['lat'].','.$aPlace['LAT'].' & '.$aRecord['lon'].','.$aPlace['LON']." \n";
                continue;
            }

            // The result comes from a remote service: only build the SQL when
            // the type is one we can store and the id is a plain integer.
            if (!isset($aPlace['OSM_TYPE']) || !isset($aOsmTypeCodes[$aPlace['OSM_TYPE']])
                || !isset($aPlace['OSM_ID']) || !preg_match('#^-?[0-9]+$#', $aPlace['OSM_ID'])
            ) {
                echo "-- Skipping result without usable OSM type/id\n";
                continue;
            }

            $sSQL = "update wikipedia_article set osm_type='".$aOsmTypeCodes[$aPlace['OSM_TYPE']]."'";
            $sSQL .= ", osm_id=".$aPlace['OSM_ID']." where language = '".pg_escape_string($aRecord['language'])."' and title = '".pg_escape_string($aRecord['title'])."'";
            $oDB->query($sSQL);
            break;
        }
    }
}