]> git.openstreetmap.org Git - nominatim.git/blob - utils/importWikipedia.php
Merge pull request #1318 from mtmail/php-pdo
[nominatim.git] / utils / importWikipedia.php
1 <?php
2
3 require_once(CONST_BasePath.'/lib/init-cmd.php');
// Raise the PHP memory limit for this script to 800M.
ini_set('memory_limit', '800M');

// General switches shared with the other command line tools.
$aGeneralOptions = array(
    array('help', 'h', 0, 1, 0, 0, false, 'Show Help'),
    array('quiet', 'q', 0, 1, 0, 0, 'bool', 'Quiet output'),
    array('verbose', 'v', 0, 1, 0, 0, 'bool', 'Verbose output'),
);

// Actions this script can perform.
$aActionOptions = array(
    array('create-tables', '', 0, 1, 0, 0, 'bool', 'Create wikipedia tables'),
    array('parse-articles', '', 0, 1, 0, 0, 'bool', 'Parse wikipedia articles'),
    array('link', '', 0, 1, 0, 0, 'bool', 'Try to link to existing OSM ids'),
);

// The first entry is the description string, followed by one array per option.
$aCMDOptions = array_merge(
    array('Create and setup nominatim search system'),
    $aGeneralOptions,
    $aActionOptions
);

// Parse the command line; the result is written to $aCMDResult.
getCmdOpt($_SERVER['argv'], $aCMDOptions, $aCMDResult, true, true);
18
19 /*
20 $sTestPageText = <<<EOD
21 {{Coord|47|N|2|E|type:country_region:FR|display=title}}
22 {{ Infobox Amusement park
23 | name = Six Flags Great Adventure
24 | image = [[File:SixFlagsGreatAdventure logo.png]]
25 | caption = Six Flags Great Adventure logo
26 | location = [[Jackson, New Jersey|Jackson]]
27 | location2 = New Jersey
28 | location3 = United States
29 | address = 1 Six Flags Boulevard<ref name="drivedir"/>
30 | season = March/April through October/November
31 | opening_date = July 1, 1974
32 | previous_names = Great Adventure
33 | area_acre = 2200
34 | rides = 45 park admission rides
35 | coasters = 12
36 | water_rides = 2
37 | owner = [[Six Flags]]
38 | general_manager =
39 | homepage = [http://www.sixflags.com/parks/greatadventure/ Six Flags Great Adventure]
40 }}
41 EOD;
42 var_dump(_templatesToProperties(_parseWikipediaContent($sTestPageText)));
43 exit;
44 //| coordinates = {{Coord|40|08|16.65|N|74|26|26.69|W|region:US-NJ_type:landmark|display=inline,title}}
45 */
46 /*
47
48     $a = array();
49     $a[] = 'test';
50
51     $oDB = new Nominatim\DB();
52     $oDB->connect();
53
54     if ($aCMDResult['drop-tables'])
55     {
56         $oDB->query('DROP TABLE wikipedia_article');
57         $oDB->query('DROP TABLE wikipedia_link');
58     }
59 */
60
// --create-tables: create the wikipedia_article and wikipedia_link tables.
// !empty() keeps the original truthiness test but does not raise a notice
// when the option key is absent.
if (!empty($aCMDResult['create-tables'])) {
    // Open a connection the same way the other actions in this script do;
    // without this $oDB would be undefined here.
    $oDB = new Nominatim\DB();
    $oDB->connect();

    $sSQL = <<<'EOD'
CREATE TABLE wikipedia_article (
    language text NOT NULL,
    title text NOT NULL,
    langcount integer,
    othercount integer,
    totalcount integer,
    lat double precision,
    lon double precision,
    importance double precision,
    title_en text,
    osm_type character(1),
    osm_id bigint,
    infobox_type text,
    population bigint,
    website text
);
EOD;
    $oDB->query($sSQL);

    // Add the geometry column through AddGeometryColumn (SRID 4326, 2 dimensions).
    $oDB->query("SELECT AddGeometryColumn('wikipedia_article', 'location', 4326, 'GEOMETRY', 2)");

    $sSQL = <<<'EOD'
CREATE TABLE wikipedia_link (
  from_id INTEGER,
  to_name text
  );
EOD;
    $oDB->query($sSQL);
}
91
92
/**
 * Convert a coordinate given as degrees, minutes and seconds to decimal degrees.
 *
 * @param mixed $iDegrees Degrees (number or numeric string). A leading minus
 *                        sign negates the whole coordinate, so -33 degrees
 *                        30 minutes yields -33.5 rather than -32.5.
 * @param mixed $iMinutes Minutes; null and '' count as 0.
 * @param mixed $fSeconds Seconds; null and '' count as 0.
 * @param mixed $sNSEW    Hemisphere letter. 'S' and 'W' (any case, surrounding
 *                        whitespace ignored) negate the result; any other
 *                        value, including null, leaves the sign alone.
 *
 * @return float Decimal degrees.
 */
function degreesAndMinutesToDecimal($iDegrees, $iMinutes = 0, $fSeconds = 0, $sNSEW = 'N')
{
    // Cast before strtoupper(): callers pass null for a missing direction.
    $sHemisphere = strtoupper(trim((string)$sNSEW));
    $iSign = ($sHemisphere == 'S' || $sHemisphere == 'W') ? -1 : 1;

    // Detect the sign on the textual form so that "-0" is recognised as well.
    $sDegrees = trim((string)$iDegrees);
    if ($sDegrees !== '' && $sDegrees[0] == '-') {
        $iSign = -$iSign;
    }

    // Minutes and seconds extend the magnitude, so add them to the absolute degrees.
    $fMagnitude = abs((float)$sDegrees) + (float)$iMinutes/60 + (float)$fSeconds/3600;

    return $iSign * $fMagnitude;
}
98
99
/**
 * Extract all {{template}} invocations from a piece of wiki markup.
 *
 * Newlines are replaced by spaces and HTML comments and <math> sections are
 * removed. The text is then split on '{{', '}}', '[[', ']]' and '|' and fed
 * through a small state machine. Text outside of templates is discarded;
 * only the templates are returned.
 *
 * @param string $sPageText Raw wiki markup of a page.
 *
 * @return array List of templates in the order their closing '}}' was seen
 *               (inner templates before the one enclosing them). Each entry
 *               is array(name, params), where params holds the raw
 *               '|'-separated parameter strings in REVERSE order of
 *               appearance (the last parameter is at index 0).
 */
function _parseWikipediaContent($sPageText)
{
    $sPageText = str_replace("\n", ' ', $sPageText);
    $sPageText = preg_replace('#<!--.*?-->#m', '', $sPageText);
    $sPageText = preg_replace('#<math>.*?<\\/math>#m', '', $sPageText);

    // Keep the delimiters as tokens of their own.
    $aPageText = preg_split('#({{|}}|\\[\\[|\\]\\]|[|])#', $sPageText, -1, PREG_SPLIT_DELIM_CAPTURE);

    $aTemplates = array();

    // Both stacks grow at the front: index 0 is always the innermost element.
    $aTemplateStack = array();
    $aState = array('body');

    // Target and display text of the link currently being read.
    $sLinkPage = '';
    $sLinkSyn = '';

    foreach ($aPageText as $sPart) {
        switch ($sPart) {
            case '{{':
                array_unshift($aTemplateStack, array('', array()));
                array_unshift($aState, 'template');
                break;
            case '}}':
                // A stray '}}' outside of a template is ignored.
                if ($aState[0] == 'template' || $aState[0] == 'templateparam') {
                    $aTemplates[] = array_shift($aTemplateStack);
                    array_shift($aState);
                }
                break;
            case '[[':
                $sLinkPage = '';
                $sLinkSyn = '';
                array_unshift($aState, 'link');
                break;
            case ']]':
                if ($aState[0] == 'link' || $aState[0] == 'linksynonim') {
                    // Without an explicit display text the link target is shown.
                    if (!$sLinkSyn) $sLinkSyn = $sLinkPage;
                    if (substr($sLinkPage, 0, 6) == 'Image:') $sLinkSyn = substr($sLinkPage, 6);

                    // Leave the link and hand its text to the enclosing state.
                    array_shift($aState);
                    switch ($aState[0]) {
                        case 'template':
                            $aTemplateStack[0][0] .= trim($sPart);
                            break;
                        case 'templateparam':
                            $aTemplateStack[0][1][0] .= $sLinkSyn;
                            break;
                        case 'link':
                            $sLinkPage .= trim($sPart);
                            break;
                        case 'linksynonim':
                            $sLinkSyn .= $sPart;
                            break;
                        case 'body':
                            // Body text is not part of the result.
                            break;
                        default:
                            var_dump($aState, $aTemplateStack, $sPart, $aPageText);
                            fail('unknown state');
                    }
                }
                break;
            case '|':
                if ($aState[0] == 'template' || $aState[0] == 'templateparam') {
                    // Start a new template parameter (stored at the front).
                    $aState[0] = 'templateparam';
                    array_unshift($aTemplateStack[0][1], '');
                }
                if ($aState[0] == 'link') $aState[0] = 'linksynonim';
                break;
            default:
                switch ($aState[0]) {
                    case 'template':
                        $aTemplateStack[0][0] .= trim($sPart);
                        break;
                    case 'templateparam':
                        $aTemplateStack[0][1][0] .= $sPart;
                        break;
                    case 'link':
                        $sLinkPage .= trim($sPart);
                        break;
                    case 'linksynonim':
                        $sLinkSyn .= $sPart;
                        break;
                    case 'body':
                        // Body text is not part of the result.
                        break;
                    default:
                        var_dump($aState, $aPageText);
                        fail('unknown state');
                }
                break;
        }
    }

    return $aTemplates;
}
198
/**
 * Return $aParams[$sKey] when it is set, $mDefault otherwise.
 */
function _templateParam($aParams, $sKey, $mDefault)
{
    return isset($aParams[$sKey]) ? $aParams[$sKey] : $mDefault;
}

/**
 * Interpret the positional parameters of a {{coord}} template.
 *
 * Layouts are tried in this order: d|m|s|NS|d|m|s|EW, d|m|NS|d|m|EW,
 * d|NS|d|EW and plain decimal lat|lon. When a layout is recognised by its
 * hemisphere letter but parameters are missing, nothing is returned instead
 * of computing a coordinate from undefined values.
 *
 * @return array|null array(lat, lon) or null when no complete layout matches.
 */
function _coordParamsToLatLon($aParams)
{
    if (isset($aParams[3]) && (strtoupper($aParams[3]) == 'N' || strtoupper($aParams[3]) == 'S')) {
        if (!isset($aParams[0], $aParams[1], $aParams[2], $aParams[4], $aParams[5], $aParams[6], $aParams[7])) {
            return null;
        }
        return array(
            degreesAndMinutesToDecimal($aParams[0], $aParams[1], $aParams[2], $aParams[3]),
            degreesAndMinutesToDecimal($aParams[4], $aParams[5], $aParams[6], $aParams[7]),
        );
    }
    if (isset($aParams[0]) && isset($aParams[1]) && isset($aParams[2]) && (strtoupper($aParams[2]) == 'N' || strtoupper($aParams[2]) == 'S')) {
        if (!isset($aParams[3], $aParams[4], $aParams[5])) {
            return null;
        }
        return array(
            degreesAndMinutesToDecimal($aParams[0], $aParams[1], 0, $aParams[2]),
            degreesAndMinutesToDecimal($aParams[3], $aParams[4], 0, $aParams[5]),
        );
    }
    if (isset($aParams[0]) && isset($aParams[1]) && (strtoupper($aParams[1]) == 'N' || strtoupper($aParams[1]) == 'S')) {
        if (!isset($aParams[2], $aParams[3])) {
            return null;
        }
        return array(
            (strtoupper($aParams[1]) == 'N'?1:-1) * (float)$aParams[0],
            (strtoupper($aParams[3]) == 'E'?1:-1) * (float)$aParams[2],
        );
    }
    if (isset($aParams[0]) && is_numeric($aParams[0]) && isset($aParams[1]) && is_numeric($aParams[1])) {
        return array((float)$aParams[0], (float)$aParams[1]);
    }
    return null;
}

/**
 * Derive page properties from a list of parsed templates.
 *
 * For every property the first template providing it wins. Possible keys of
 * the result: sOfficialName, iPopulation, sWebsite, sTopLevelDomain,
 * sInfoboxType, fLat and fLon. fLat and fLon are always set together.
 *
 * @param array $aTemplates List of array(name, params); params are the raw
 *                          parameter strings in reverse order of appearance
 *                          (they are reversed back here).
 *
 * @return array Associative array of the properties that were found.
 */
function _templatesToProperties($aTemplates)
{
    $aPageProperties = array();

    // Parameters that may carry the population, in order of preference.
    $aPopulationKeys = array('population', 'population_total', 'population_urban', 'population_estimate');

    foreach ($aTemplates as $aTemplate) {
        // Split "key = value" parameters; parameters without '=' become positional.
        $aParams = array();
        foreach (array_reverse($aTemplate[1]) as $sParam) {
            $iPos = strpos($sParam, '=');
            if ($iPos === false) {
                $aParams[] = trim($sParam);
            } else {
                $aParams[trim(substr($sParam, 0, $iPos))] = trim(substr($sParam, $iPos+1));
            }
        }

        if (!isset($aPageProperties['sOfficialName']) && !empty($aParams['official_name'])) {
            $aPageProperties['sOfficialName'] = $aParams['official_name'];
        }

        foreach ($aPopulationKeys as $sKey) {
            if (isset($aPageProperties['iPopulation'])) break;
            if (!empty($aParams[$sKey]) && preg_match('#^[0-9.,]+#', $aParams[$sKey])) {
                // Drop thousands separators; the int cast cuts off trailing text.
                $aPageProperties['iPopulation'] = (int)str_replace(array(',', '.'), '', $aParams[$sKey]);
            }
        }

        if (!isset($aPageProperties['sWebsite']) && !empty($aParams['website'])) {
            // Accepts a bare URL as well as "[url label]".
            if (preg_match('#^\\[?([^ \\]]+)[^\\]]*\\]?$#', $aParams['website'], $aMatch)) {
                $aPageProperties['sWebsite'] = $aMatch[1];
                if (strpos($aPageProperties['sWebsite'], ':/'.'/') === false) {
                    $aPageProperties['sWebsite'] = 'http:/'.'/'.$aPageProperties['sWebsite'];
                }
            }
        }

        if (!isset($aPageProperties['sTopLevelDomain']) && !empty($aParams['cctld'])) {
            $aPageProperties['sTopLevelDomain'] = str_replace(array('[', ']', '.'), '', $aParams['cctld']);
        }

        if (!isset($aPageProperties['sInfoboxType']) && strtolower(substr($aTemplate[0], 0, 7)) == 'infobox') {
            $aPageProperties['sInfoboxType'] = trim((string)substr($aTemplate[0], 8));
        }

        // Assume the first template with lots of params is the type (fallback for infobox)
        if (!isset($aPageProperties['sPossibleInfoboxType']) && count($aParams) > 10) {
            $aPageProperties['sPossibleInfoboxType'] = trim($aTemplate[0]);
        }

        // Coordinates: only examined until one template has supplied them.
        // Within a template a later format overrides an earlier one.
        if (isset($aPageProperties['fLat'])) {
            continue;
        }

        if (isset($aParams['latd']) && isset($aParams['longd'])) {
            $aPageProperties['fLat'] = degreesAndMinutesToDecimal(
                $aParams['latd'],
                _templateParam($aParams, 'latm', 0),
                _templateParam($aParams, 'lats', 0),
                _templateParam($aParams, 'latNS', '')
            );
            $aPageProperties['fLon'] = degreesAndMinutesToDecimal(
                $aParams['longd'],
                _templateParam($aParams, 'longm', 0),
                _templateParam($aParams, 'longs', 0),
                _templateParam($aParams, 'longEW', '')
            );
        }

        // Both the latitude and the longitude degrees must be present.
        if (isset($aParams['lat_degrees']) && isset($aParams['long_degrees'])) {
            $aPageProperties['fLat'] = degreesAndMinutesToDecimal(
                $aParams['lat_degrees'],
                _templateParam($aParams, 'lat_minutes', 0),
                _templateParam($aParams, 'lat_seconds', 0),
                _templateParam($aParams, 'lat_direction', '')
            );
            $aPageProperties['fLon'] = degreesAndMinutesToDecimal(
                $aParams['long_degrees'],
                _templateParam($aParams, 'long_minutes', 0),
                _templateParam($aParams, 'long_seconds', 0),
                _templateParam($aParams, 'long_direction', '')
            );
        }

        if (isset($aParams['latitude']) && isset($aParams['longitude'])) {
            if (preg_match('#[0-9.]+#', $aParams['latitude']) && preg_match('#[0-9.]+#', $aParams['longitude'])) {
                $aPageProperties['fLat'] = (float)$aParams['latitude'];
                $aPageProperties['fLon'] = (float)$aParams['longitude'];
            }
        }

        if (strtolower($aTemplate[0]) == 'coord') {
            $aLatLon = _coordParamsToLatLon($aParams);
            if ($aLatLon !== null) {
                $aPageProperties['fLat'] = $aLatLon[0];
                $aPageProperties['fLon'] = $aLatLon[1];
            }
        }

        if (isset($aParams['Latitude']) && isset($aParams['Longitude'])) {
            // Textual form, either a single value or a "from ... to ..." range
            // of which the midpoint is taken.
            $sLatitude = str_replace('&nbsp;', ' ', $aParams['Latitude']);
            $sLongitude = str_replace('&nbsp;', ' ', $aParams['Longitude']);
            $fLat = null;
            $fLon = null;

            if (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([NS]) to ([0-9]+)°( ([0-9]+)′)? ([NS])#', $sLatitude, $aMatch)) {
                $fLat =
                    (degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4])
                    +degreesAndMinutesToDecimal($aMatch[5], $aMatch[7], 0, $aMatch[8])) / 2;
            } elseif (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([NS])#', $sLatitude, $aMatch)) {
                $fLat = degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4]);
            }

            if (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([EW]) to ([0-9]+)°( ([0-9]+)′)? ([EW])#', $sLongitude, $aMatch)) {
                $fLon =
                    (degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4])
                    +degreesAndMinutesToDecimal($aMatch[5], $aMatch[7], 0, $aMatch[8])) / 2;
            } elseif (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([EW])#', $sLongitude, $aMatch)) {
                $fLon = degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4]);
            }

            // Only accept the pair when both halves parsed, so that fLat is
            // never set without a matching fLon.
            if ($fLat !== null && $fLon !== null) {
                $aPageProperties['fLat'] = $fLat;
                $aPageProperties['fLon'] = $fLon;
            }
        }
    }

    // Fall back to the "possible" infobox type, marked with a leading '#'.
    if (isset($aPageProperties['sPossibleInfoboxType'])) {
        if (!isset($aPageProperties['sInfoboxType'])) {
            $aPageProperties['sInfoboxType'] = '#'.$aPageProperties['sPossibleInfoboxType'];
        }
        unset($aPageProperties['sPossibleInfoboxType']);
    }

    return $aPageProperties;
}
306
// --parse-wikipedia=N: parse the stored articles whose page_id % 10 equals N
// and copy infobox type, population, website and coordinates into
// wikipedia_article.
// NOTE(review): the option list at the top of this file declares
// 'parse-articles', not 'parse-wikipedia' - confirm which name is intended.
if (isset($aCMDResult['parse-wikipedia'])) {
    $oDB = new Nominatim\DB();
    $oDB->connect();

    // The value comes from the command line; force it to an integer before
    // it becomes part of the SQL statement.
    $iRemainder = (int)$aCMDResult['parse-wikipedia'];

    $sSQL = 'select page_title from content where page_namespace = 0 and page_id %10 = ';
    $sSQL .= $iRemainder;
    $sSQL .= ' and (page_content ilike \'%{{Coord%\' or (page_content ilike \'%lat%\' and page_content ilike \'%lon%\'))';
    $aArticleNames = $oDB->getCol($sSQL);
    /* $aArticleNames = $oDB->getCol($sSQL = 'select page_title from content where page_namespace = 0
        and (page_content ilike \'%{{Coord%\' or (page_content ilike \'%lat%\'
        and page_content ilike \'%lon%\')) and page_title in (\'Virginia\')');
     */
    foreach ($aArticleNames as $sArticleName) {
        $sQuotedTitle = '\''.pg_escape_string($sArticleName).'\'';
        // Every update below targets the English article with this title.
        $sWhere = ' where language = \'en\' and title = '.$sQuotedTitle.';';

        $sPageText = $oDB->getOne('select page_content from content where page_namespace = 0 and page_title = '.$sQuotedTitle);
        $aP = _templatesToProperties(_parseWikipediaContent($sPageText));

        if (isset($aP['sInfoboxType'])) {
            // Collapse runs of whitespace in the infobox type.
            $aP['sInfoboxType'] = preg_replace('#\\s+#', ' ', $aP['sInfoboxType']);
            $sSQL = 'update wikipedia_article set ';
            $sSQL .= 'infobox_type = \''.pg_escape_string($aP['sInfoboxType']).'\'';
            $oDB->query($sSQL.$sWhere);
        }
        if (isset($aP['iPopulation'])) {
            $sSQL = 'update wikipedia_article set ';
            $sSQL .= 'population = \''.pg_escape_string((string)$aP['iPopulation']).'\'';
            $oDB->query($sSQL.$sWhere);
        }
        if (isset($aP['sWebsite'])) {
            $sSQL = 'update wikipedia_article set ';
            $sSQL .= 'website = \''.pg_escape_string($aP['sWebsite']).'\'';
            $oDB->query($sSQL.$sWhere);
        }
        // Store coordinates only when both are present and they are not 0/0.
        if (isset($aP['fLat']) && isset($aP['fLon']) && ($aP['fLat'] != 0 || $aP['fLon'] != 0)) {
            $sInfoboxType = isset($aP['sInfoboxType']) ? $aP['sInfoboxType'] : '';
            echo $sArticleName.'|'.$sInfoboxType.'|'.$aP['fLat'].'|'.$aP['fLon'] ."\n";
            $sSQL = 'update wikipedia_article set ';
            $sSQL .= 'lat = \''.pg_escape_string((string)$aP['fLat']).'\',';
            $sSQL .= 'lon = \''.pg_escape_string((string)$aP['fLon']).'\'';
            $oDB->query($sSQL.$sWhere);
        }
    }
}
353
354
/**
 * Start-element callback for the XML parser.
 *
 * Appends the attributes of every PLACE element to the global
 * $aNominatRecords list; all other elements are ignored.
 *
 * @param mixed  $hParser Parser invoking the callback (unused).
 * @param string $sName   Element name.
 * @param array  $aAttr   Attributes of the element.
 */
function nominatimXMLStart($hParser, $sName, $aAttr)
{
    global $aNominatRecords;

    if ($sName == 'PLACE') {
        $aNominatRecords[] = $aAttr;
    }
}
364
365
/**
 * End-element callback for the XML parser.
 *
 * @param mixed  $hParser Parser invoking the callback (unused).
 * @param string $sName   Element name (unused).
 */
function nominatimXMLEnd($hParser, $sName)
{
    // Intentionally empty: nothing needs to happen when an element closes.
}
369
370
/**
 * Build the '&viewbox=' URL parameter for a box reaching $fSize degrees in
 * every direction around the record's lat/lon.
 */
function _viewboxParam($aRecord, $fSize)
{
    return '&viewbox='.($aRecord['lon']-$fSize).','.($aRecord['lat']+$fSize).','.($aRecord['lon']+$fSize).','.($aRecord['lat']-$fSize);
}

/**
 * Maximum accepted distance (in degrees) between article and search result
 * for articles of unknown type, based on the place rank of the result.
 */
function _maxDistanceForRank($iRank)
{
    if ($iRank <= 4) return 2;
    if ($iRank <= 8) return 1;
    if ($iRank <= 10) return 0.8;
    if ($iRank <= 12) return 0.6;
    if ($iRank <= 17) return 0.2;
    if ($iRank <= 18) return 0.1;
    if ($iRank <= 22) return 0.02;
    return 0.001;
}

/**
 * Fetch a search URL and return the attributes of all PLACE elements found
 * in the XML response.
 *
 * The records are collected through the global $aNominatRecords list that
 * nominatimXMLStart() fills. A failed download yields an empty list.
 */
function _fetchNominatimRecords($sURL)
{
    global $aNominatRecords;

    var_dump($sURL);
    $aNominatRecords = array();

    $sXML = file_get_contents($sURL);
    if ($sXML === false) {
        echo "-- Request failed\n";
        return array();
    }

    $hXMLParser = xml_parser_create();
    xml_set_element_handler($hXMLParser, 'nominatimXMLStart', 'nominatimXMLEnd');
    xml_parse($hXMLParser, $sXML, true);
    xml_parser_free($hXMLParser);

    return $aNominatRecords;
}

// --link: search for every not yet linked English article that has
// coordinates and store the OSM type/id of the first search result that
// lies close enough to the article's position.
if (isset($aCMDResult['link'])) {
    $oDB = new Nominatim\DB();
    $oDB->connect();

    $aWikiArticles = $oDB->getAll("select * from wikipedia_article where language = 'en' and lat is not null and osm_type is null and totalcount < 31 order by importance desc limit 200000");

    // If you point this script at production OSM you will be blocked
    $sNominatimBaseURL = 'http://SEVERNAME/search.php';

    // Letter stored in wikipedia_article.osm_type for each OSM object type.
    $aOsmTypeCodes = array('relation' => 'R', 'way' => 'W', 'node' => 'N');

    foreach ($aWikiArticles as $aRecord) {
        $aRecord['name'] = str_replace('_', ' ', $aRecord['title']);

        $sURL = $sNominatimBaseURL.'?format=xml&accept-language=en';
        $sNearParams = '&nearlat='.$aRecord['lat'].'&nearlon='.$aRecord['lon'];

        echo "\n-- ".$aRecord['name'].', '.$aRecord['infobox_type']."\n";
        $fMaxDist = 0.0000001;
        $bUnknown = false;

        // Choose search restrictions and the distance tolerance by infobox type.
        switch (strtolower((string)$aRecord['infobox_type'])) {
            case 'former country':
                continue 2;
            case 'sea':
                $fMaxDist = 60; // effectively turn it off
                $sURL .= _viewboxParam($aRecord, $fMaxDist);
                break;
            case 'country':
            case 'island':
            case 'islands':
            case 'continent':
                $fMaxDist = 60; // effectively turn it off
                $sURL .= '&featuretype=country';
                $sURL .= _viewboxParam($aRecord, $fMaxDist);
                break;
            case 'prefecture japan':
                $aRecord['name'] = trim(str_replace(' Prefecture', ' ', $aRecord['name']));
                // intentionally no break
            case 'state':
            case '#us state':
            case 'county':
            case 'u.s. state':
            case 'u.s. state symbols':
            case 'german state':
            case 'province or territory of canada':
            case 'indian jurisdiction':
            case 'province':
            case 'french region':
            case 'region of italy':
            case 'kommune':
            case '#australia state or territory':
            case 'russian federal subject':
                $fMaxDist = 4;
                $sURL .= '&featuretype=state';
                $sURL .= _viewboxParam($aRecord, $fMaxDist);
                break;
            case 'protected area':
                $fMaxDist = 1;
                $sURL .= $sNearParams;
                $sURL .= _viewboxParam($aRecord, $fMaxDist);
                break;
            case 'settlement':
                $bUnknown = true;
                // intentionally no break
            case 'french commune':
            case 'italian comune':
            case 'uk place':
            case 'australian place':
            case 'german place':
            case '#geobox':
            case 'u.s. county':
            case 'municipality':
            case 'city japan':
            case 'russian inhabited locality':
            case 'finnish municipality/land area':
            case 'england county':
            case 'israel municipality':
            case 'russian city':
            case 'city':
                $fMaxDist = 0.2;
                $sURL .= '&featuretype=settlement';
                $sURL .= _viewboxParam($aRecord, 0.5);
                break;
            case 'mountain':
            case 'mountain pass':
            case 'river':
            case 'lake':
            case 'airport':
                $fMaxDist = 0.2;
                $sURL .= _viewboxParam($aRecord, 0.5);
                break;
            case 'ship begin':
                $fMaxDist = 0.1;
                $sURL .= _viewboxParam($aRecord, 0.01);
                $sURL .= $sNearParams;
                break;
            case 'road':
            case 'university':
            case 'company':
            case 'department':
                $fMaxDist = 0.005;
                $sURL .= _viewboxParam($aRecord, 0.01);
                $sURL .= '&bounded=1';
                $sURL .= $sNearParams;
                break;
            default:
                $bUnknown = true;
                $fMaxDist = 0.005;
                $sURL .= _viewboxParam($aRecord, 0.01);
                // $sURL .= "&bounded=1";
                $sURL .= $sNearParams;
                echo '-- Unknown: '.$aRecord['infobox_type']."\n";
                break;
        }

        $aCandidates = _fetchNominatimRecords($sURL.'&q='.urlencode($aRecord['name']));

        if (!isset($aCandidates[0])) {
            // Retry with the part of the name before the first '(' or ','.
            $aNameParts = preg_split('#[(,]#', $aRecord['name']);
            if (count($aNameParts) > 1) {
                $aCandidates = _fetchNominatimRecords($sURL.'&q='.urlencode(trim($aNameParts[0])));
            }
        }

        // assume first is best/right
        foreach ($aCandidates as $aPlace) {
            // Attributes come from a remote response; fill in what is missing.
            $aPlace += array(
                'DISPLAY_NAME' => '',
                'CLASS' => '',
                'TYPE' => '',
                'PLACE_RANK' => '',
                'OSM_TYPE' => '',
                'OSM_ID' => '',
            );
            if (!isset($aPlace['LAT'], $aPlace['LON'])) {
                continue;
            }

            // Straight-line distance in degrees between article and result.
            $fLatDiff = $aRecord['lat'] - $aPlace['LAT'];
            $fLonDiff = $aRecord['lon'] - $aPlace['LON'];
            $fDiff = sqrt($fLatDiff * $fLatDiff + $fLonDiff * $fLonDiff);

            if ($bUnknown) {
                // If it was an unknown type base it on the rank of the found result
                $fMaxDist = _maxDistanceForRank((int)$aPlace['PLACE_RANK']);
            }

            echo '-- FOUND "'.substr($aPlace['DISPLAY_NAME'], 0, 50);
            echo '", '.$aPlace['CLASS'].', '.$aPlace['TYPE'];
            echo ', '.$aPlace['PLACE_RANK'].', '.$aPlace['OSM_TYPE'];
            echo " (dist:$fDiff, max:$fMaxDist)\n";

            if ($fDiff > $fMaxDist) {
                echo "-- Diff too big $fDiff (max: $fMaxDist)".$aRecord['lat'].','.$aPlace['LAT'].' & '.$aRecord['lon'].','.$aPlace['LON']." \n";
                continue;
            }

            // Never put unvalidated response data into the SQL statement.
            if (!isset($aOsmTypeCodes[$aPlace['OSM_TYPE']]) || !preg_match('#^-?[0-9]+$#', $aPlace['OSM_ID'])) {
                echo "-- Skipping result without usable OSM type/id\n";
                continue;
            }

            $sSQL = 'update wikipedia_article set osm_type=';
            $sSQL .= "'".$aOsmTypeCodes[$aPlace['OSM_TYPE']]."'";
            $sSQL .= ', osm_id='.$aPlace['OSM_ID']." where language = '".pg_escape_string($aRecord['language'])."' and title = '".pg_escape_string($aRecord['title'])."'";
            $oDB->query($sSQL);
            break;
        }
    }
}