]> git.openstreetmap.org Git - nominatim.git/blob - utils/importWikipedia.php
match languages such as ja_rm (or any other with underscore) properly
[nominatim.git] / utils / importWikipedia.php
1 #!/usr/bin/php -Cq
2 <?php
3
4 require_once(dirname(dirname(__FILE__)).'/settings/settings.php');
5 require_once(CONST_BasePath.'/lib/init-cmd.php');
// Raise PHP's memory limit to 800M for this run.
ini_set('memory_limit', '800M');

// Option table handed to getCmdOpt() below: the first entry is a free-text
// description, each following array describes one command line switch
// (long name, short name, four numeric settings, value type, help text).
// NOTE(review): the exact meaning of the numeric columns is defined by
// getCmdOpt() in lib/init-cmd.php, which is not visible here.
$aCMDOptions = array(
    'Create and setup nominatim search system',

    // Generic switches.
    array('help', 'h', 0, 1, 0, 0, false, 'Show Help'),
    array('quiet', 'q', 0, 1, 0, 0, 'bool', 'Quiet output'),
    array('verbose', 'v', 0, 1, 0, 0, 'bool', 'Verbose output'),

    // Actions offered by this script.
    array('create-tables', '', 0, 1, 0, 0, 'bool', 'Create wikipedia tables'),
    array('parse-articles', '', 0, 1, 0, 0, 'bool', 'Parse wikipedia articles'),
    array('link', '', 0, 1, 0, 0, 'bool', 'Try to link to existing OSM ids'),
);

// The rest of the script reads the parsed switches from $aCMDResult.
getCmdOpt($_SERVER['argv'], $aCMDOptions, $aCMDResult, true, true);
20
21 /*
22 $sTestPageText = <<<EOD
23 {{Coord|47|N|2|E|type:country_region:FR|display=title}}
24 {{ Infobox Amusement park
25 | name = Six Flags Great Adventure
26 | image = [[File:SixFlagsGreatAdventure logo.png]]
27 | caption = Six Flags Great Adventure logo
28 | location = [[Jackson, New Jersey|Jackson]]
29 | location2 = New Jersey
30 | location3 = United States
31 | address = 1 Six Flags Boulevard<ref name="drivedir"/>
32 | season = March/April through October/November
33 | opening_date = July 1, 1974
34 | previous_names = Great Adventure
35 | area_acre = 2200
36 | rides = 45 park admission rides
37 | coasters = 12
38 | water_rides = 2
39 | owner = [[Six Flags]]
40 | general_manager =
41 | homepage = [http://www.sixflags.com/parks/greatadventure/ Six Flags Great Adventure]
42 }}
43 EOD;
44 var_dump(_templatesToProperties(_parseWikipediaContent($sTestPageText)));
45 exit;
46 //| coordinates = {{Coord|40|08|16.65|N|74|26|26.69|W|region:US-NJ_type:landmark|display=inline,title}}
47 */
48 /*
49
50     $a = array();
51     $a[] = 'test';
52
53     $oDB &= getDB();
54
55     if ($aCMDResult['drop-tables'])
56     {
57         $oDB->query('DROP TABLE wikipedia_article');
58         $oDB->query('DROP TABLE wikipedia_link');
59     }
60 */
61
// --create-tables: create the wikipedia_article table (plus its PostGIS
// 'location' geometry column) and the wikipedia_link table.
if ($aCMDResult['create-tables']) {
    // This branch previously used $oDB without ever obtaining it.
    $oDB =& getDB();

    // Each nowdoc must be closed by its own "EOD;" line; without the first
    // terminator the PHP statements below became part of the SQL text.
    $sSQL = <<<'EOD'
CREATE TABLE wikipedia_article (
    language text NOT NULL,
    title text NOT NULL,
    langcount integer,
    othercount integer,
    totalcount integer,
    lat double precision,
    lon double precision,
    importance double precision,
    title_en text,
    osm_type character(1),
    osm_id bigint,
    infobox_type text,
    population bigint,
    website text
);
EOD;
    $oDB->query($sSQL);

    $oDB->query("SELECT AddGeometryColumn('wikipedia_article', 'location', 4326, 'GEOMETRY', 2)");

    $sSQL = <<<'EOD'
CREATE TABLE wikipedia_link (
  from_id INTEGER,
  to_name text
  );
EOD;
    $oDB->query($sSQL);
}
92
93
/**
 * Convert degrees / minutes / seconds plus a hemisphere letter into signed
 * decimal degrees.
 *
 * All numeric parts are cast to float, so numeric strings and null (treated
 * as 0) are accepted.
 *
 * @param mixed       $iDegrees Degrees part.
 * @param mixed       $iMinutes Minutes part (default 0).
 * @param mixed       $fSeconds Seconds part (default 0).
 * @param string|null $sNSEW    Hemisphere letter; 'S' or 'W' (any case)
 *                              yields a negative result, anything else
 *                              (including null) a positive one.
 *
 * @return float Decimal degrees.
 */
function degreesAndMinutesToDecimal($iDegrees, $iMinutes = 0, $fSeconds = 0, $sNSEW = 'N')
{
    // Callers pass null when a template has no direction field; cast first
    // so strtoupper() is never handed null (deprecated since PHP 8.1).
    $sDirection = strtoupper(trim((string)$sNSEW));
    $iSign = ($sDirection == 'S' || $sDirection == 'W') ? -1 : 1;

    return $iSign * ((float)$iDegrees + (float)$iMinutes/60 + (float)$fSeconds/3600);
}
99
100
/**
 * Extract all {{template}} invocations from a piece of wikitext.
 *
 * The text is split on the wiki delimiters "{{", "}}", "[[", "]]" and "|" and
 * fed through a small state machine. $aState is a stack whose top entry
 * ($aState[0]) is one of 'body', 'template', 'templateparam', 'link' or
 * 'linksynonim'.
 *
 * @param string|null $sPageText Raw wikitext of one page.
 *
 * @return array List of templates in the order their closing "}}" was seen.
 *               Each entry is array(name, params) where params holds the raw
 *               "key = value" / positional strings in REVERSE order of
 *               appearance (the newest parameter is at index 0).
 */
function _parseWikipediaContent($sPageText)
{
    // Work on a single line, without HTML comments and <math> blocks.
    // The cast keeps a missing page (null) from reaching str_replace().
    $sPageText = str_replace("\n", ' ', (string)$sPageText);
    $sPageText = preg_replace('#<!--.*?-->#m', '', $sPageText);
    $sPageText = preg_replace('#<math>.*?<\\/math>#m', '', $sPageText);

    // Delimiters are kept as their own array entries.
    $aPageText = preg_split('#({{|}}|\\[\\[|\\]\\]|[|])#', $sPageText, -1, PREG_SPLIT_DELIM_CAPTURE);

    $aTemplates = array();

    // Templates currently being read, innermost first.
    $aTemplateStack = array();
    $aState = array('body');
    foreach ($aPageText as $sPart) {
        switch ($sPart) {
            case '{{':
                array_unshift($aTemplateStack, array('', array()));
                array_unshift($aState, 'template');
                break;
            case '}}':
                // A stray "}}" outside of a template is ignored.
                if ($aState[0] == 'template' || $aState[0] == 'templateparam') {
                    $aTemplates[] = array_shift($aTemplateStack);
                    array_shift($aState);
                }
                break;
            case '[[':
                $sLinkPage = '';
                $sLinkSyn = '';
                array_unshift($aState, 'link');
                break;
            case ']]':
                if ($aState[0] == 'link' || $aState[0] == 'linksynonim') {
                    // The visible text of a link defaults to its target.
                    if (!$sLinkSyn) $sLinkSyn = $sLinkPage;
                    if (substr($sLinkPage, 0, 6) == 'Image:') $sLinkSyn = substr($sLinkPage, 6);

                    // Back to the enclosing state; hand the link text to it.
                    array_shift($aState);
                    switch ($aState[0]) {
                        case 'template':
                            // NOTE(review): $sPart is the literal "]]" here, so
                            // the delimiter (not the link text) is appended to
                            // the template name; same for the two link states
                            // below. Kept as-is - confirm whether intended.
                            $aTemplateStack[0][0] .= trim($sPart);
                            break;
                        case 'templateparam':
                            $aTemplateStack[0][1][0] .= $sLinkSyn;
                            break;
                        case 'link':
                            $sLinkPage .= trim($sPart);
                            break;
                        case 'linksynonim':
                            $sLinkSyn .= $sPart;
                            break;
                        case 'body':
                            // Body text is not collected.
                            break;
                        default:
                            var_dump($aState, $aTemplateStack, $sPart, $aPageText);
                            fail('unknown state');
                    }
                }
                break;
            case '|':
                if ($aState[0] == 'template' || $aState[0] == 'templateparam') {
                    // Create a new template parameter
                    $aState[0] = 'templateparam';
                    array_unshift($aTemplateStack[0][1], '');
                }
                if ($aState[0] == 'link') $aState[0] = 'linksynonim';
                break;
            default:
                // Plain text: append it to whatever is currently being read.
                switch ($aState[0]) {
                    case 'template':
                        $aTemplateStack[0][0] .= trim($sPart);
                        break;
                    case 'templateparam':
                        $aTemplateStack[0][1][0] .= $sPart;
                        break;
                    case 'link':
                        $sLinkPage .= trim($sPart);
                        break;
                    case 'linksynonim':
                        $sLinkSyn .= $sPart;
                        break;
                    case 'body':
                        // Body text is not collected.
                        break;
                    default:
                        var_dump($aState, $aPageText);
                        fail('unknown state');
                }
                break;
        }
    }
    return $aTemplates;
}
199
/**
 * Return $aParams[$sKey] when that parameter exists, otherwise $mDefault.
 *
 * @param array      $aParams  Parsed template parameters.
 * @param string|int $sKey     Parameter name or position.
 * @param mixed      $mDefault Value used when the parameter is absent.
 *
 * @return mixed
 */
function _templateParam($aParams, $sKey, $mDefault)
{
    return isset($aParams[$sKey]) ? $aParams[$sKey] : $mDefault;
}

/**
 * Parse a textual coordinate such as "12° 30′ N" or a range such as
 * "12° 30′ N to 14° N" (the range yields the mean of both ends).
 *
 * @param string $sText        Text to parse (must start with the coordinate).
 * @param string $sHemispheres Allowed hemisphere letters, 'NS' or 'EW'.
 *
 * @return float|null Decimal degrees, or null when the text does not match.
 */
function _parseDegreeText($sText, $sHemispheres)
{
    // degrees, optional " minutes′", space, hemisphere letter
    $sOne = '([0-9]+)°( ([0-9]+)′)? (['.$sHemispheres.'])';

    if (preg_match('#^'.$sOne.' to '.$sOne.'#', $sText, $aMatch)) {
        return (degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4])
            + degreesAndMinutesToDecimal($aMatch[5], $aMatch[7], 0, $aMatch[8])) / 2;
    }
    if (preg_match('#^'.$sOne.'#', $sText, $aMatch)) {
        return degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4]);
    }
    return null;
}

/**
 * Derive page properties from the templates found on a wiki page.
 *
 * For every property the first template that provides it wins. Possible keys
 * of the result: sOfficialName, iPopulation, sWebsite, sTopLevelDomain,
 * sInfoboxType, fLat, fLon. When no "Infobox ..." template exists, the name of
 * the first template with more than 10 parameters is used as sInfoboxType,
 * prefixed with '#'.
 *
 * @param array $aTemplates Templates as returned by _parseWikipediaContent():
 *                          array(name, params) with params in reverse order.
 *
 * @return array Associative array of the properties that could be found.
 */
function _templatesToProperties($aTemplates)
{
    // Population parameters, in order of preference.
    $aPopulationKeys = array('population', 'population_total', 'population_urban', 'population_estimate');

    $aPageProperties = array();
    foreach ($aTemplates as $aTemplate) {
        // Split "key = value" strings; parameters without '=' are positional.
        $aParams = array();
        foreach (array_reverse($aTemplate[1]) as $sParam) {
            $iPos = strpos($sParam, '=');
            if ($iPos === false) {
                $aParams[] = trim($sParam);
            } else {
                $aParams[trim(substr($sParam, 0, $iPos))] = trim(substr($sParam, $iPos+1));
            }
        }

        if (!isset($aPageProperties['sOfficialName']) && isset($aParams['official_name']) && $aParams['official_name']) {
            $aPageProperties['sOfficialName'] = $aParams['official_name'];
        }

        // Population: the value has to start with a number; thousands
        // separators are stripped before the integer cast.
        foreach ($aPopulationKeys as $sKey) {
            if (isset($aPageProperties['iPopulation'])) break;
            if (isset($aParams[$sKey]) && $aParams[$sKey] && preg_match('#^[0-9.,]+#', $aParams[$sKey])) {
                $aPageProperties['iPopulation'] = (int)str_replace(array(',', '.'), '', $aParams[$sKey]);
            }
        }

        // Website: accepts a bare URL or an external link "[url label]".
        if (!isset($aPageProperties['sWebsite']) && isset($aParams['website']) && $aParams['website']) {
            if (preg_match('#^\\[?([^ \\]]+)[^\\]]*\\]?$#', $aParams['website'], $aMatch)) {
                $aPageProperties['sWebsite'] = $aMatch[1];
                if (strpos($aPageProperties['sWebsite'], ':/'.'/') === false) {
                    $aPageProperties['sWebsite'] = 'http:/'.'/'.$aPageProperties['sWebsite'];
                }
            }
        }
        if (!isset($aPageProperties['sTopLevelDomain']) && isset($aParams['cctld']) && $aParams['cctld']) {
            $aPageProperties['sTopLevelDomain'] = str_replace(array('[', ']', '.'), '', $aParams['cctld']);
        }

        if (!isset($aPageProperties['sInfoboxType']) && strtolower(substr($aTemplate[0], 0, 7)) == 'infobox') {
            $aPageProperties['sInfoboxType'] = trim(substr($aTemplate[0], 8));
        }

        // Assume the first template with lots of params is the type (fallback for infobox)
        if (!isset($aPageProperties['sPossibleInfoboxType']) && count($aParams) > 10) {
            $aPageProperties['sPossibleInfoboxType'] = trim($aTemplate[0]);
        }

        // do we have a lat/lon
        if (!isset($aPageProperties['fLat'])) {
            if (isset($aParams['latd']) && isset($aParams['longd'])) {
                $aPageProperties['fLat'] = degreesAndMinutesToDecimal(
                    $aParams['latd'],
                    _templateParam($aParams, 'latm', 0),
                    _templateParam($aParams, 'lats', 0),
                    _templateParam($aParams, 'latNS', 'N')
                );
                $aPageProperties['fLon'] = degreesAndMinutesToDecimal(
                    $aParams['longd'],
                    _templateParam($aParams, 'longm', 0),
                    _templateParam($aParams, 'longs', 0),
                    _templateParam($aParams, 'longEW', 'N')
                );
            }
            // Both halves are required (the second test used to repeat
            // 'lat_degrees' and then read a missing 'long_degrees').
            if (isset($aParams['lat_degrees']) && isset($aParams['long_degrees'])) {
                $aPageProperties['fLat'] = degreesAndMinutesToDecimal(
                    $aParams['lat_degrees'],
                    _templateParam($aParams, 'lat_minutes', 0),
                    _templateParam($aParams, 'lat_seconds', 0),
                    _templateParam($aParams, 'lat_direction', 'N')
                );
                $aPageProperties['fLon'] = degreesAndMinutesToDecimal(
                    $aParams['long_degrees'],
                    _templateParam($aParams, 'long_minutes', 0),
                    _templateParam($aParams, 'long_seconds', 0),
                    _templateParam($aParams, 'long_direction', 'N')
                );
            }
            if (isset($aParams['latitude']) && isset($aParams['longitude'])) {
                if (preg_match('#[0-9.]+#', $aParams['latitude']) && preg_match('#[0-9.]+#', $aParams['longitude'])) {
                    $aPageProperties['fLat'] = (float)$aParams['latitude'];
                    $aPageProperties['fLon'] = (float)$aParams['longitude'];
                }
            }
            if (strtolower($aTemplate[0]) == 'coord') {
                // The position of the N/S letter tells which positional
                // layout is used. Incomplete templates are skipped instead of
                // reading parameters that do not exist.
                if (isset($aParams[3]) && (strtoupper($aParams[3]) == 'N' || strtoupper($aParams[3]) == 'S')) {
                    // deg | min | sec | N/S | deg | min | sec | E/W
                    if (isset($aParams[4]) && isset($aParams[5]) && isset($aParams[6]) && isset($aParams[7])) {
                        $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aParams[0], $aParams[1], $aParams[2], $aParams[3]);
                        $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aParams[4], $aParams[5], $aParams[6], $aParams[7]);
                    }
                } elseif (isset($aParams[0]) && isset($aParams[1]) && isset($aParams[2]) && (strtoupper($aParams[2]) == 'N' || strtoupper($aParams[2]) == 'S')) {
                    // deg | min | N/S | deg | min | E/W
                    if (isset($aParams[3]) && isset($aParams[4]) && isset($aParams[5])) {
                        $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aParams[0], $aParams[1], 0, $aParams[2]);
                        $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aParams[3], $aParams[4], 0, $aParams[5]);
                    }
                } elseif (isset($aParams[0]) && isset($aParams[1]) && (strtoupper($aParams[1]) == 'N' || strtoupper($aParams[1]) == 'S')) {
                    // deg | N/S | deg | E/W
                    if (isset($aParams[2]) && isset($aParams[3])) {
                        $aPageProperties['fLat'] = (strtoupper($aParams[1]) == 'N'?1:-1) * (float)$aParams[0];
                        $aPageProperties['fLon'] = (strtoupper($aParams[3]) == 'E'?1:-1) * (float)$aParams[2];
                    }
                } elseif (isset($aParams[0]) && is_numeric($aParams[0]) && isset($aParams[1]) && is_numeric($aParams[1])) {
                    // signed decimal lat | signed decimal lon
                    $aPageProperties['fLat'] = (float)$aParams[0];
                    $aPageProperties['fLon'] = (float)$aParams[1];
                }
            }
            if (isset($aParams['Latitude']) && isset($aParams['Longitude'])) {
                // Textual form, e.g. "12° 30′ N" or "12° N to 14° N".
                $fLat = _parseDegreeText(str_replace('&nbsp;', ' ', $aParams['Latitude']), 'NS');
                if ($fLat !== null) {
                    $aPageProperties['fLat'] = $fLat;
                }
                $fLon = _parseDegreeText(str_replace('&nbsp;', ' ', $aParams['Longitude']), 'EW');
                if ($fLon !== null) {
                    $aPageProperties['fLon'] = $fLon;
                }
            }
        }
    }

    // Fall back to the "template with many parameters" guess, marked by '#'.
    if (isset($aPageProperties['sPossibleInfoboxType'])) {
        if (!isset($aPageProperties['sInfoboxType'])) {
            $aPageProperties['sInfoboxType'] = '#'.$aPageProperties['sPossibleInfoboxType'];
        }
        unset($aPageProperties['sPossibleInfoboxType']);
    }
    return $aPageProperties;
}
307
// Parse the stored wikipedia page texts and copy infobox type, population,
// website and coordinates into wikipedia_article (English articles only).
// NOTE(review): the option table declares 'parse-articles', not
// 'parse-wikipedia', and the value is used as a number below - confirm how
// this branch is meant to be triggered.
if (isset($aCMDResult['parse-wikipedia'])) {
    $oDB =& getDB();

    // Only pages whose page_id modulo 10 equals the option value are handled
    // in one run. The cast keeps arbitrary text out of the SQL statement.
    $sSQL = 'select page_title from content where page_namespace = 0 and page_id %10 = ';
    $sSQL .= (int)$aCMDResult['parse-wikipedia'];
    $sSQL .= ' and (page_content ilike \'%{{Coord%\' or (page_content ilike \'%lat%\' and page_content ilike \'%lon%\'))';
    $aArticleNames = $oDB->getCol($sSQL);
    /* $aArticleNames = $oDB->getCol($sSQL = 'select page_title from content where page_namespace = 0
        and (page_content ilike \'%{{Coord%\' or (page_content ilike \'%lat%\'
        and page_content ilike \'%lon%\')) and page_title in (\'Virginia\')');
     */
    foreach ($aArticleNames as $sArticleName) {
        $sEscapedTitle = pg_escape_string($sArticleName);
        // Every update below targets the same row.
        $sWhere = ' where language = \'en\' and title = \''.$sEscapedTitle.'\';';

        $sPageText = $oDB->getOne('select page_content from content where page_namespace = 0 and page_title = \''.$sEscapedTitle.'\'');
        $aP = _templatesToProperties(_parseWikipediaContent($sPageText));

        if (isset($aP['sInfoboxType'])) {
            // Collapse runs of whitespace in the infobox name.
            $aP['sInfoboxType'] = preg_replace('#\\s+#', ' ', $aP['sInfoboxType']);
            $sSQL = 'update wikipedia_article set ';
            $sSQL .= 'infobox_type = \''.pg_escape_string($aP['sInfoboxType']).'\'';
            $oDB->query($sSQL.$sWhere);
        }
        if (isset($aP['iPopulation'])) {
            $sSQL = 'update wikipedia_article set ';
            $sSQL .= 'population = \''.pg_escape_string((string)$aP['iPopulation']).'\'';
            $oDB->query($sSQL.$sWhere);
        }
        if (isset($aP['sWebsite'])) {
            $sSQL = 'update wikipedia_article set ';
            $sSQL .= 'website = \''.pg_escape_string($aP['sWebsite']).'\'';
            $oDB->query($sSQL.$sWhere);
        }
        // A latitude can be found without a longitude (textual
        // Latitude/Longitude parameters), so both must be present. A position
        // of exactly 0/0 is treated as "no coordinate".
        if (isset($aP['fLat']) && isset($aP['fLon']) && ($aP['fLat'] != 0 || $aP['fLon'] != 0)) {
            if (!isset($aP['sInfoboxType'])) $aP['sInfoboxType'] = '';
            echo $sArticleName.'|'.$aP['sInfoboxType'].'|'.$aP['fLat'].'|'.$aP['fLon'] ."\n";
            $sSQL = 'update wikipedia_article set ';
            $sSQL .= 'lat = \''.pg_escape_string((string)$aP['fLat']).'\',';
            $sSQL .= 'lon = \''.pg_escape_string((string)$aP['fLon']).'\'';
            $oDB->query($sSQL.$sWhere);
        }
    }
}
352
353
/**
 * Start-element callback for the XML parser reading Nominatim responses.
 *
 * Appends the attribute array of every PLACE element to the global
 * $aNominatRecords list; all other elements are ignored.
 *
 * @param mixed  $hParser Parser handle (unused).
 * @param string $sName   Element name as reported by the parser.
 * @param array  $aAttr   Attributes of the element.
 *
 * @return void
 */
function nominatimXMLStart($hParser, $sName, $aAttr)
{
    global $aNominatRecords;

    if ($sName == 'PLACE') {
        $aNominatRecords[] = $aAttr;
    }
}
363
364
/**
 * End-element callback for the XML parser reading Nominatim responses.
 *
 * Intentionally does nothing; it only exists because the parser is given a
 * start/end handler pair.
 *
 * @param mixed  $hParser Parser handle (unused).
 * @param string $sName   Element name (unused).
 *
 * @return void
 */
function nominatimXMLEnd($hParser, $sName)
{
    // Nothing to do when an element closes.
}
368
369
/**
 * Download a Nominatim XML search result and collect the attributes of all
 * PLACE elements in it.
 *
 * @param string $sSearchURL Complete search URL including the query.
 *
 * @return array List of attribute arrays, one per place; empty when the
 *               request failed or nothing was found.
 */
function _fetchNominatimPlaces($sSearchURL)
{
    global $aNominatRecords;

    // nominatimXMLStart() appends to this global, so reset it per request.
    $aNominatRecords = array();

    $sXML = file_get_contents($sSearchURL);
    if ($sXML === false) {
        // Request failed (PHP already emitted a warning): treat as "no hits".
        return $aNominatRecords;
    }

    $hXMLParser = xml_parser_create();
    xml_set_element_handler($hXMLParser, 'nominatimXMLStart', 'nominatimXMLEnd');
    xml_parse($hXMLParser, $sXML, true);
    xml_parser_free($hXMLParser);

    return $aNominatRecords;
}

/**
 * Build the "&viewbox=" URL parameter for a square around an article.
 *
 * @param array $aRecord   Row with 'lat' and 'lon'.
 * @param float $fHalfSize Half of the box edge length in degrees.
 *
 * @return string
 */
function _nominatimViewbox($aRecord, $fHalfSize)
{
    return '&viewbox='.($aRecord['lon']-$fHalfSize).','.($aRecord['lat']+$fHalfSize).','.($aRecord['lon']+$fHalfSize).','.($aRecord['lat']-$fHalfSize);
}

// --link: search each located English article without an OSM link in Nominatim
// and store the first result that lies close enough to the article position.
if (isset($aCMDResult['link'])) {
    $oDB =& getDB();
    $aWikiArticles = $oDB->getAll("select * from wikipedia_article where language = 'en' and lat is not null and osm_type is null and totalcount < 31 order by importance desc limit 200000");

    // If you point this script at production OSM you will be blocked
    $sNominatimBaseURL = 'http://SEVERNAME/search.php';

    // Single-letter codes stored in wikipedia_article.osm_type.
    $aOsmTypeCodes = array('relation' => 'R', 'way' => 'W', 'node' => 'N');

    foreach ($aWikiArticles as $aRecord) {
        $aRecord['name'] = str_replace('_', ' ', $aRecord['title']);

        $sURL = $sNominatimBaseURL.'?format=xml&accept-language=en';

        echo "\n-- ".$aRecord['name'].', '.$aRecord['infobox_type']."\n";

        // $fMaxDist: largest accepted distance (in degrees) between article
        // and search result. $bUnknown: derive that distance from the rank of
        // each result instead.
        $fMaxDist = 0.0000001;
        $bUnknown = false;
        switch (strtolower($aRecord['infobox_type'])) {
            case 'former country':
                // Skip this article entirely.
                continue 2;
            case 'sea':
                $fMaxDist = 60; // effectively turn it off
                $sURL .= _nominatimViewbox($aRecord, $fMaxDist);
                break;
            case 'country':
            case 'island':
            case 'islands':
            case 'continent':
                $fMaxDist = 60; // effectively turn it off
                $sURL .= '&featuretype=country';
                $sURL .= _nominatimViewbox($aRecord, $fMaxDist);
                break;
            case 'prefecture japan':
                $aRecord['name'] = trim(str_replace(' Prefecture', ' ', $aRecord['name']));
                // intentionally no break
            case 'state':
            case '#us state':
            case 'county':
            case 'u.s. state':
            case 'u.s. state symbols':
            case 'german state':
            case 'province or territory of canada':
            case 'indian jurisdiction':
            case 'province':
            case 'french region':
            case 'region of italy':
            case 'kommune':
            case '#australia state or territory':
            case 'russian federal subject':
                $fMaxDist = 4;
                $sURL .= '&featuretype=state';
                $sURL .= _nominatimViewbox($aRecord, $fMaxDist);
                break;
            case 'protected area':
                $fMaxDist = 1;
                $sURL .= '&nearlat='.$aRecord['lat'];
                $sURL .= '&nearlon='.$aRecord['lon'];
                $sURL .= _nominatimViewbox($aRecord, $fMaxDist);
                break;
            case 'settlement':
                $bUnknown = true;
                // intentionally no break
            case 'french commune':
            case 'italian comune':
            case 'uk place':
            case 'australian place':
            case 'german place':
            case '#geobox':
            case 'u.s. county':
            case 'municipality':
            case 'city japan':
            case 'russian inhabited locality':
            case 'finnish municipality/land area':
            case 'england county':
            case 'israel municipality':
            case 'russian city':
            case 'city':
                $fMaxDist = 0.2;
                $sURL .= '&featuretype=settlement';
                $sURL .= _nominatimViewbox($aRecord, 0.5);
                break;
            case 'mountain':
            case 'mountain pass':
            case 'river':
            case 'lake':
            case 'airport':
                $fMaxDist = 0.2;
                $sURL .= _nominatimViewbox($aRecord, 0.5);
                break;
            case 'ship begin':
                $fMaxDist = 0.1;
                $sURL .= _nominatimViewbox($aRecord, 0.01);
                $sURL .= '&nearlat='.$aRecord['lat'];
                $sURL .= '&nearlon='.$aRecord['lon'];
                break;
            case 'road':
            case 'university':
            case 'company':
            case 'department':
                $fMaxDist = 0.005;
                $sURL .= _nominatimViewbox($aRecord, 0.01);
                $sURL .= '&bounded=1';
                $sURL .= '&nearlat='.$aRecord['lat'];
                $sURL .= '&nearlon='.$aRecord['lon'];
                break;
            default:
                $bUnknown = true;
                $fMaxDist = 0.005;
                $sURL .= _nominatimViewbox($aRecord, 0.01);
                // $sURL .= "&bounded=1";
                $sURL .= '&nearlat='.$aRecord['lat'];
                $sURL .= '&nearlon='.$aRecord['lon'];
                echo '-- Unknown: '.$aRecord['infobox_type']."\n";
                break;
        }

        // First try the full article name.
        $sNameURL = $sURL.'&q='.urlencode($aRecord['name']);
        var_dump($sNameURL);
        $aNominatRecords = _fetchNominatimPlaces($sNameURL);

        // No hit: retry with the part of the name before the first '(' or ','.
        if (!isset($aNominatRecords[0])) {
            $aNameParts = preg_split('#[(,]#', $aRecord['name']);
            if (count($aNameParts) > 1) {
                $sNameURL = $sURL.'&q='.urlencode(trim($aNameParts[0]));
                var_dump($sNameURL);
                $aNominatRecords = _fetchNominatimPlaces($sNameURL);
            }
        }

        // assume first is best/right
        foreach ($aNominatRecords as $aPlace) {
            // A result without a position cannot be compared.
            if (!isset($aPlace['LAT']) || !isset($aPlace['LON'])) {
                continue;
            }

            // Plain euclidean distance in degrees.
            $fLatDiff = $aRecord['lat']-$aPlace['LAT'];
            $fLonDiff = $aRecord['lon']-$aPlace['LON'];
            $fDiff = sqrt($fLatDiff * $fLatDiff + $fLonDiff * $fLonDiff);

            if ($bUnknown) {
                // If it was an unknown type base it on the rank of the found result
                $iRank = (int)$aPlace['PLACE_RANK'];
                if ($iRank <= 4) $fMaxDist = 2;
                elseif ($iRank <= 8) $fMaxDist = 1;
                elseif ($iRank <= 10) $fMaxDist = 0.8;
                elseif ($iRank <= 12) $fMaxDist = 0.6;
                elseif ($iRank <= 17) $fMaxDist = 0.2;
                elseif ($iRank <= 18) $fMaxDist = 0.1;
                elseif ($iRank <= 22) $fMaxDist = 0.02;
                else $fMaxDist = 0.001;
            }
            echo '-- FOUND "'.substr($aPlace['DISPLAY_NAME'], 0, 50);
            echo '", '.$aPlace['CLASS'].', '.$aPlace['TYPE'];
            echo ', '.$aPlace['PLACE_RANK'].', '.$aPlace['OSM_TYPE'];
            echo " (dist:$fDiff, max:$fMaxDist)\n";

            if ($fDiff > $fMaxDist) {
                echo "-- Diff too big $fDiff (max: $fMaxDist)".$aRecord['lat'].','.$aPlace['LAT'].' & '.$aRecord['lon'].','.$aPlace['LON']." \n";
                continue;
            }

            // Only store links with a known OSM type and a purely numeric id;
            // anything else would produce a broken UPDATE statement.
            if (!isset($aPlace['OSM_TYPE']) || !isset($aOsmTypeCodes[$aPlace['OSM_TYPE']])
                || !isset($aPlace['OSM_ID']) || !preg_match('#^-?[0-9]+$#', $aPlace['OSM_ID'])
            ) {
                continue;
            }

            $sSQL = 'update wikipedia_article set osm_type=';
            $sSQL .= "'".$aOsmTypeCodes[$aPlace['OSM_TYPE']]."'";
            $sSQL .= ', osm_id='.$aPlace['OSM_ID']." where language = '".pg_escape_string($aRecord['language'])."' and title = '".pg_escape_string($aRecord['title'])."'";
            $oDB->query($sSQL);

            // The first acceptable result wins.
            break;
        }
    }
}