]> git.openstreetmap.org Git - nominatim.git/blob - utils/importWikipedia.php
details: check for unknown object and icons
[nominatim.git] / utils / importWikipedia.php
#!/usr/bin/php -Cq
<?php

// Pull in the site settings first; they define CONST_BasePath, which the
// second include depends on.
require_once dirname(dirname(__FILE__)).'/settings/settings.php';
require_once CONST_BasePath.'/lib/init-cmd.php';

// Raise the memory ceiling for this run.
ini_set('memory_limit', '800M');

// Option table handed to getCmdOpt(). The leading string is presumably the
// description shown in the help output; each following array declares one
// command-line switch.
// NOTE(review): the parsing step further down reads
// $aCMDResult['parse-wikipedia'], which is not declared in this table
// ('parse-articles' is) - confirm which name is intended.
$aCMDOptions = array(
    "Create and setup nominatim search system",
    array('help', 'h', 0, 1, 0, 0, false, 'Show Help'),
    array('quiet', 'q', 0, 1, 0, 0, 'bool', 'Quiet output'),
    array('verbose', 'v', 0, 1, 0, 0, 'bool', 'Verbose output'),

    array('create-tables', '', 0, 1, 0, 0, 'bool', 'Create wikipedia tables'),
    array('parse-articles', '', 0, 1, 0, 0, 'bool', 'Parse wikipedia articles'),
    array('link', '', 0, 1, 0, 0, 'bool', 'Try to link to existing OSM ids'),
);

// Parse the command line into $aCMDResult.
getCmdOpt($_SERVER['argv'], $aCMDOptions, $aCMDResult, true, true);
20
21 /*
22 $sTestPageText = <<<EOD
23 {{Coord|47|N|2|E|type:country_region:FR|display=title}}
24 {{ Infobox Amusement park
25 | name = Six Flags Great Adventure
26 | image = [[File:SixFlagsGreatAdventure logo.png]]
27 | caption = Six Flags Great Adventure logo
28 | location = [[Jackson, New Jersey|Jackson]]
29 | location2 = New Jersey
30 | location3 = United States
31 | address = 1 Six Flags Boulevard<ref name="drivedir"/>
32 | season = March/April through October/November
33 | opening_date = July 1, 1974
34 | previous_names = Great Adventure
35 | area_acre = 2200
36 | rides = 45 park admission rides
37 | coasters = 12
38 | water_rides = 2
39 | owner = [[Six Flags]]
40 | general_manager = 
41 | homepage = [http://www.sixflags.com/parks/greatadventure/ Six Flags Great Adventure]
42 }}
43 EOD;
44 var_dump(_templatesToProperties(_parseWikipediaContent($sTestPageText)));
45 exit;
46 //| coordinates = {{Coord|40|08|16.65|N|74|26|26.69|W|region:US-NJ_type:landmark|display=inline,title}}
47 */
48 /*
49
50     $a = array();
51     $a[] = 'test';
52
53     $oDB &= getDB();
54
55     if ($aCMDResult['drop-tables'])
56     {
57         $oDB->query('DROP TABLE wikipedia_article');
58         $oDB->query('DROP TABLE wikipedia_link');
59     }
60 */
61
// --create-tables: create the two tables used by this script.
// !empty() is used so that a missing option does not raise an
// "undefined index" notice.
if (!empty($aCMDResult['create-tables'])) {
    // Obtain the connection here: no live code before this point assigns
    // $oDB (the only earlier assignment sits inside a commented-out block).
    $oDB =& getDB();

    // Each nowdoc must be terminated by its own "EOD;" line at column 0,
    // otherwise the PHP statements that follow become part of the SQL text.
    $sSQL = <<<'EOD'
CREATE TABLE wikipedia_article (
    language text NOT NULL,
    title text NOT NULL,
    langcount integer,
    othercount integer,
    totalcount integer,
    lat double precision,
    lon double precision,
    importance double precision,
    title_en text,
    osm_type character(1),
    osm_id bigint,
    infobox_type text,
    population bigint,
    website text
);
EOD;
    $oDB->query($sSQL);

    // Add a 2-dimensional geometry column "location" with SRID 4326.
    $oDB->query("SELECT AddGeometryColumn('wikipedia_article', 'location', 4326, 'GEOMETRY', 2)");

    $sSQL = <<<'EOD'
CREATE TABLE wikipedia_link (
  from_id INTEGER,
  to_name text
  );
EOD;
    $oDB->query($sSQL);
}
92
93
/**
 * Convert a degrees / minutes / seconds coordinate into signed decimal degrees.
 *
 * The result is negative for the 'S' and 'W' hemispheres (case-insensitive).
 * A negative degrees value flips the sign of the whole coordinate, so that
 * minutes and seconds always move the value away from zero
 * (-33 deg 30 min becomes -33.5, not -32.5).
 *
 * @param mixed $iDegrees Degrees (number or numeric string, may be negative).
 * @param mixed $iMinutes Minutes; null or missing counts as 0.
 * @param mixed $fSeconds Seconds; null or missing counts as 0.
 * @param mixed $sNSEW    Hemisphere letter; anything other than S/W is positive.
 *
 * @return float Decimal degrees.
 */
function degreesAndMinutesToDecimal($iDegrees, $iMinutes = 0, $fSeconds = 0, $sNSEW = 'N')
{
    // Callers pass null for absent template parameters; cast explicitly so
    // strtoupper() is never handed a null.
    $sNSEW = strtoupper((string)$sNSEW);
    $iSign = ($sNSEW == 'S' || $sNSEW == 'W') ? -1 : 1;

    $fDegrees = (float)$iDegrees;
    // A leading minus on the degrees applies to the whole coordinate. The
    // string check also catches "-0", whose float value is not below zero.
    $bNegative = $fDegrees < 0 || (is_string($iDegrees) && strpos(ltrim($iDegrees), '-') === 0);
    if ($bNegative) {
        $iSign = -$iSign;
        $fDegrees = abs($fDegrees);
    }

    return $iSign * ($fDegrees + (float)$iMinutes/60 + (float)$fSeconds/3600);
}
99
100
/**
 * Extract all template invocations ({{ ... }}) from a page's wikitext.
 *
 * Newlines are flattened to spaces, HTML comments and <math> sections are
 * removed, and the text is split on the delimiters "{{", "}}", "[[", "]]"
 * and "|". The pieces are then fed through a small state machine whose state
 * stack ($aState, top at index 0) tracks whether the current piece belongs
 * to the page body, a template name, a template parameter, a link target or
 * a link label.
 *
 * @param string $sPageText Wikitext of one page.
 *
 * @return array List of templates in the order their closing "}}" was seen.
 *               Each entry is array(0 => name, 1 => params); params holds the
 *               raw "key = value" / positional strings with the most recently
 *               seen parameter FIRST (callers reverse it).
 */
function _parseWikipediaContent($sPageText)
{
    $sPageText = str_replace("\n", ' ', $sPageText);
    $sPageText = preg_replace('#<!--.*?-->#m', '', $sPageText);
    $sPageText = preg_replace('#<math>.*?<\\/math>#m', '', $sPageText);

    $aPageText = preg_split('#({{|}}|\\[\\[|\\]\\]|[|])#', $sPageText, -1, PREG_SPLIT_DELIM_CAPTURE);

    $aTemplates = array();

    // Templates that have been opened but not yet closed, innermost first.
    $aTemplateStack = array();
    $aState = array('body');

    // Target and label of the link currently being read. Both are reset on
    // every "[[", so nested links share (and overwrite) these two variables.
    $sLinkPage = '';
    $sLinkSyn = '';

    foreach ($aPageText as $sPart) {
        switch ($sPart) {
            case '{{':
                array_unshift($aTemplateStack, array('', array()));
                array_unshift($aState, 'template');
                break;
            case '}}':
                // Only close a template when one is actually on top of the
                // state stack; a stray "}}" is ignored.
                if ($aState[0] == 'template' || $aState[0] == 'templateparam') {
                    $aTemplates[] = array_shift($aTemplateStack);
                    array_shift($aState);
                }
                break;
            case '[[':
                $sLinkPage = '';
                $sLinkSyn = '';
                array_unshift($aState, 'link');
                break;
            case ']]':
                if ($aState[0] == 'link' || $aState[0] == 'linksynonim') {
                    // A link without a label is displayed as its target; an
                    // "Image:" link is represented by the bare file name.
                    if (!$sLinkSyn) $sLinkSyn = $sLinkPage;
                    if (substr($sLinkPage, 0, 6) == 'Image:') $sLinkSyn = substr($sLinkPage, 6);

                    // Back to the enclosing context, which receives the link.
                    array_shift($aState);
                    switch ($aState[0]) {
                        case 'template':
                            // NOTE(review): this appends the "]]" delimiter
                            // itself (not the link text) to the template
                            // name; kept as-is, confirm whether intended.
                            $aTemplateStack[0][0] .= trim($sPart);
                            break;
                        case 'templateparam':
                            $aTemplateStack[0][1][0] .= $sLinkSyn;
                            break;
                        case 'link':
                            // NOTE(review): same as above for nested links.
                            $sLinkPage .= trim($sPart);
                            break;
                        case 'linksynonim':
                            $sLinkSyn .= $sPart;
                            break;
                        case 'body':
                            // Body text is not part of the result.
                            break;
                        default:
                            var_dump($aState, $aTemplateStack, $sPart, $aPageText);
                            fail('unknown state');
                    }
                }
                break;
            case '|':
                if ($aState[0] == 'template' || $aState[0] == 'templateparam') {
                    // Start a new template parameter (stored at index 0).
                    $aState[0] = 'templateparam';
                    array_unshift($aTemplateStack[0][1], '');
                }
                // Inside a link the first "|" switches from target to label.
                if ($aState[0] == 'link') $aState[0] = 'linksynonim';
                break;
            default:
                // Plain text: append it to whatever is currently being read.
                switch ($aState[0]) {
                    case 'template':
                        $aTemplateStack[0][0] .= trim($sPart);
                        break;
                    case 'templateparam':
                        $aTemplateStack[0][1][0] .= $sPart;
                        break;
                    case 'link':
                        $sLinkPage .= trim($sPart);
                        break;
                    case 'linksynonim':
                        $sLinkSyn .= $sPart;
                        break;
                    case 'body':
                        // Body text is not part of the result.
                        break;
                    default:
                        var_dump($aState, $aPageText);
                        fail('unknown state');
                }
                break;
        }
    }
    return $aTemplates;
}
199
/**
 * Return a template parameter, or null when the template does not have it.
 *
 * @param array  $aParams Parsed parameters of one template.
 * @param string $sKey    Parameter name.
 *
 * @return mixed
 */
function _templateParamOrNull($aParams, $sKey)
{
    return isset($aParams[$sKey]) ? $aParams[$sKey] : null;
}

/**
 * Derive article properties from the templates found on a page.
 *
 * For every property the first template that supplies a usable value wins.
 *
 * @param array $aTemplates Output of _parseWikipediaContent(): a list of
 *                          array(name, params), params most-recent-first.
 *
 * @return array Any subset of the keys sOfficialName, iPopulation, sWebsite,
 *               sTopLevelDomain, sInfoboxType, fLat and fLon. sInfoboxType
 *               starts with '#' when it was guessed from the first template
 *               having more than 10 parameters rather than from an
 *               "Infobox ..." template.
 */
function _templatesToProperties($aTemplates)
{
    // Parameter names that may carry a population figure, in priority order.
    $aPopulationKeys = array('population', 'population_total', 'population_urban', 'population_estimate');

    $aPageProperties = array();
    foreach ($aTemplates as $aTemplate) {
        // Restore source order, then split "key = value" pairs; parameters
        // without "=" are positional and get consecutive integer keys.
        $aParams = array();
        foreach (array_reverse($aTemplate[1]) as $sParam) {
            if (($iPos = strpos($sParam, '=')) === false) {
                $aParams[] = trim($sParam);
            } else {
                $aParams[trim(substr($sParam, 0, $iPos))] = trim(substr($sParam, $iPos+1));
            }
        }

        if (!isset($aPageProperties['sOfficialName']) && isset($aParams['official_name']) && $aParams['official_name']) {
            $aPageProperties['sOfficialName'] = $aParams['official_name'];
        }

        // Population: the value must start with digits or separators; the
        // separators "," and "." are stripped before the integer cast.
        foreach ($aPopulationKeys as $sKey) {
            if (isset($aPageProperties['iPopulation'])) break;
            if (isset($aParams[$sKey]) && $aParams[$sKey] && preg_match('#^[0-9.,]+#', $aParams[$sKey])) {
                $aPageProperties['iPopulation'] = (int)str_replace(array(',', '.'), '', $aParams[$sKey]);
            }
        }

        // Website: take the URL part of "[url label]" (brackets optional) and
        // prefix a scheme when there is none.
        if (!isset($aPageProperties['sWebsite']) && isset($aParams['website']) && $aParams['website']) {
            if (preg_match('#^\\[?([^ \\]]+)[^\\]]*\\]?$#', $aParams['website'], $aMatch)) {
                $aPageProperties['sWebsite'] = $aMatch[1];
                if (strpos($aPageProperties['sWebsite'], '://') === false) {
                    $aPageProperties['sWebsite'] = 'http://'.$aPageProperties['sWebsite'];
                }
            }
        }

        if (!isset($aPageProperties['sTopLevelDomain']) && isset($aParams['cctld']) && $aParams['cctld']) {
            $aPageProperties['sTopLevelDomain'] = str_replace(array('[', ']', '.'), '', $aParams['cctld']);
        }

        // "Infobox <type>": everything after the first 8 characters is the type.
        if (!isset($aPageProperties['sInfoboxType']) && strtolower(substr($aTemplate[0], 0, 7)) == 'infobox') {
            $aPageProperties['sInfoboxType'] = trim(substr($aTemplate[0], 8));
        }

        // Assume the first template with lots of params is the type (fallback for infobox)
        if (!isset($aPageProperties['sPossibleInfoboxType']) && sizeof($aParams) > 10) {
            $aPageProperties['sPossibleInfoboxType'] = trim($aTemplate[0]);
        }

        // Coordinates: skipped once an earlier template has supplied a latitude.
        if (isset($aPageProperties['fLat'])) continue;

        if (isset($aParams['latd']) && isset($aParams['longd'])) {
            $aPageProperties['fLat'] = degreesAndMinutesToDecimal(
                $aParams['latd'],
                _templateParamOrNull($aParams, 'latm'),
                _templateParamOrNull($aParams, 'lats'),
                _templateParamOrNull($aParams, 'latNS')
            );
            $aPageProperties['fLon'] = degreesAndMinutesToDecimal(
                $aParams['longd'],
                _templateParamOrNull($aParams, 'longm'),
                _templateParamOrNull($aParams, 'longs'),
                _templateParamOrNull($aParams, 'longEW')
            );
        }
        // Both halves are required; the longitude must be checked as well,
        // otherwise a template with only lat_degrees yields a longitude of 0.
        if (isset($aParams['lat_degrees']) && isset($aParams['long_degrees'])) {
            $aPageProperties['fLat'] = degreesAndMinutesToDecimal(
                $aParams['lat_degrees'],
                _templateParamOrNull($aParams, 'lat_minutes'),
                _templateParamOrNull($aParams, 'lat_seconds'),
                _templateParamOrNull($aParams, 'lat_direction')
            );
            $aPageProperties['fLon'] = degreesAndMinutesToDecimal(
                $aParams['long_degrees'],
                _templateParamOrNull($aParams, 'long_minutes'),
                _templateParamOrNull($aParams, 'long_seconds'),
                _templateParamOrNull($aParams, 'long_direction')
            );
        }
        if (isset($aParams['latitude']) && isset($aParams['longitude'])) {
            if (preg_match('#[0-9.]+#', $aParams['latitude']) && preg_match('#[0-9.]+#', $aParams['longitude'])) {
                $aPageProperties['fLat'] = (float)$aParams['latitude'];
                $aPageProperties['fLon'] = (float)$aParams['longitude'];
            }
        }
        if (strtolower($aTemplate[0]) == 'coord') {
            // The position of the N/S marker tells which layout is in use.
            // Each layout is only accepted when its longitude half is present
            // too; an incomplete template sets nothing.
            if (isset($aParams[3]) && (strtoupper($aParams[3]) == 'N' || strtoupper($aParams[3]) == 'S')) {
                // deg | min | sec | N/S | deg | min | sec | E/W
                if (isset($aParams[7])) {
                    $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aParams[0], $aParams[1], $aParams[2], $aParams[3]);
                    $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aParams[4], $aParams[5], $aParams[6], $aParams[7]);
                }
            } elseif (isset($aParams[0]) && isset($aParams[1]) && isset($aParams[2]) && (strtoupper($aParams[2]) == 'N' || strtoupper($aParams[2]) == 'S')) {
                // deg | min | N/S | deg | min | E/W
                if (isset($aParams[5])) {
                    $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aParams[0], $aParams[1], 0, $aParams[2]);
                    $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aParams[3], $aParams[4], 0, $aParams[5]);
                }
            } elseif (isset($aParams[0]) && isset($aParams[1]) && (strtoupper($aParams[1]) == 'N' || strtoupper($aParams[1]) == 'S')) {
                // deg | N/S | deg | E/W
                if (isset($aParams[3])) {
                    $aPageProperties['fLat'] = (strtoupper($aParams[1]) == 'N'?1:-1) * (float)$aParams[0];
                    $aPageProperties['fLon'] = (strtoupper($aParams[3]) == 'E'?1:-1) * (float)$aParams[2];
                }
            } elseif (isset($aParams[0]) && is_numeric($aParams[0]) && isset($aParams[1]) && is_numeric($aParams[1])) {
                // signed decimal lat | signed decimal lon
                $aPageProperties['fLat'] = (float)$aParams[0];
                $aPageProperties['fLon'] = (float)$aParams[1];
            }
        }
        if (isset($aParams['Latitude']) && isset($aParams['Longitude'])) {
            // Textual form, either a single value or a "A to B" range, in
            // which case the midpoint of the range is used.
            $aParams['Latitude'] = str_replace('&nbsp;', ' ', $aParams['Latitude']);
            $aParams['Longitude'] = str_replace('&nbsp;', ' ', $aParams['Longitude']);
            if (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([NS]) to ([0-9]+)°( ([0-9]+)′)? ([NS])#', $aParams['Latitude'], $aMatch)) {
                $aPageProperties['fLat'] =
                    (degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4])
                    +degreesAndMinutesToDecimal($aMatch[5], $aMatch[7], 0, $aMatch[8])) / 2;
            } elseif (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([NS])#', $aParams['Latitude'], $aMatch)) {
                $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4]);
            }

            if (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([EW]) to ([0-9]+)°( ([0-9]+)′)? ([EW])#', $aParams['Longitude'], $aMatch)) {
                $aPageProperties['fLon'] =
                    (degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4])
                    +degreesAndMinutesToDecimal($aMatch[5], $aMatch[7], 0, $aMatch[8])) / 2;
            } elseif (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([EW])#', $aParams['Longitude'], $aMatch)) {
                $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4]);
            }
        }
    }

    // Promote the guessed type (marked with '#') when no real infobox was seen.
    if (isset($aPageProperties['sPossibleInfoboxType'])) {
        if (!isset($aPageProperties['sInfoboxType'])) $aPageProperties['sInfoboxType'] = '#'.$aPageProperties['sPossibleInfoboxType'];
        unset($aPageProperties['sPossibleInfoboxType']);
    }
    return $aPageProperties;
}
307
// Parse article wikitext and store infobox type, population, website and
// coordinates on the matching wikipedia_article rows.
// NOTE(review): the option table at the top of this script declares
// 'parse-articles', not 'parse-wikipedia' - confirm which name is intended;
// with the table as it stands this branch may never run.
if (isset($aCMDResult['parse-wikipedia'])) {
    $oDB =& getDB();

    // The option value selects one tenth of the pages (page_id % 10). It
    // comes from the command line and is concatenated into SQL, so force it
    // to an integer first.
    $iPageBucket = (int)$aCMDResult['parse-wikipedia'];
    $aArticleNames = $oDB->getCol('select page_title from content where page_namespace = 0 and page_id %10 = '.$iPageBucket.' and (page_content ilike \'%{{Coord%\' or (page_content ilike \'%lat%\' and page_content ilike \'%lon%\'))');
    // $aArticleNames = $oDB->getCol($sSQL = 'select page_title from content where page_namespace = 0 and (page_content ilike \'%{{Coord%\' or (page_content ilike \'%lat%\' and page_content ilike \'%lon%\')) and page_title in (\'Virginia\')');

    // Table column => property key, written with one UPDATE per property.
    $aColumnProperties = array(
        'infobox_type' => 'sInfoboxType',
        'population' => 'iPopulation',
        'website' => 'sWebsite',
    );

    foreach ($aArticleNames as $sArticleName) {
        $sPageText = $oDB->getOne('select page_content from content where page_namespace = 0 and page_title = \''.pg_escape_string($sArticleName).'\'');
        $aP = _templatesToProperties(_parseWikipediaContent($sPageText));

        $sWhere = ' where language = \'en\' and title = \''.pg_escape_string($sArticleName).'\';';

        if (isset($aP['sInfoboxType'])) {
            // Collapse runs of whitespace in the type name.
            $aP['sInfoboxType'] = preg_replace('#\\s+#', ' ', $aP['sInfoboxType']);
        }
        foreach ($aColumnProperties as $sColumn => $sProperty) {
            if (!isset($aP[$sProperty])) continue;
            $sSQL = 'update wikipedia_article set ';
            $sSQL .= $sColumn.' = \''.pg_escape_string((string)$aP[$sProperty]).'\'';
            $sSQL .= $sWhere;
            $oDB->query($sSQL);
        }

        // Coordinates are written only as a complete pair (the parser can
        // produce a latitude without a longitude) and never for 0/0.
        if (isset($aP['fLat']) && isset($aP['fLon']) && ($aP['fLat']!='-0' || $aP['fLon']!='-0')) {
            if (!isset($aP['sInfoboxType'])) $aP['sInfoboxType'] = '';
            echo $sArticleName.'|'.$aP['sInfoboxType'].'|'.$aP['fLat'].'|'.$aP['fLon'] ."\n";
            $sSQL = 'update wikipedia_article set ';
            $sSQL .= 'lat = \''.pg_escape_string((string)$aP['fLat']).'\',';
            $sSQL .= 'lon = \''.pg_escape_string((string)$aP['fLon']).'\'';
            $sSQL .= $sWhere;
            $oDB->query($sSQL);
        }
    }
}
346
347
/**
 * Start-element callback for the XML parser used by the link step.
 *
 * Appends the attributes of every PLACE element to the global list
 * $aNominatRecords; all other elements are ignored.
 *
 * @param mixed  $hParser Parser invoking the callback (unused).
 * @param string $sName   Element name.
 * @param array  $aAttr   Attributes of the element.
 *
 * @return void
 */
function nominatimXMLStart($hParser, $sName, $aAttr)
{
    global $aNominatRecords;

    if ($sName == 'PLACE') {
        $aNominatRecords[] = $aAttr;
    }
}
357
358
/**
 * End-element callback for the XML parser used by the link step.
 *
 * Does nothing: all data of interest is taken from the start tags.
 *
 * @param mixed  $hParser Parser invoking the callback (unused).
 * @param string $sName   Element name (unused).
 *
 * @return void
 */
function nominatimXMLEnd($hParser, $sName)
{
    // Nothing to do on closing tags.
}
362
363
// --link: look up each geocoded but still unlinked article in a Nominatim
// instance and store the OSM type/id of the first result that lies close
// enough to the article's coordinates.
if (isset($aCMDResult['link'])) {
    $oDB =& getDB();
    $aWikiArticles = $oDB->getAll("select * from wikipedia_article where language = 'en' and lat is not null and osm_type is null and totalcount < 31 order by importance desc limit 200000");

    // If you point this script at production OSM you will be blocked
    $sNominatimBaseURL = 'http://SEVERNAME/search.php';

    // OSM object type as reported in the search result => one-letter code
    // stored in wikipedia_article.osm_type.
    $aOsmTypeCodes = array('relation' => 'R', 'way' => 'W', 'node' => 'N');

    foreach ($aWikiArticles as $aRecord) {
        $aRecord['name'] = str_replace('_', ' ', $aRecord['title']);

        $sURL = $sNominatimBaseURL.'?format=xml&accept-language=en';

        echo "\n-- ".$aRecord['name'].", ".$aRecord['infobox_type']."\n";

        // Search restrictions and the largest accepted distance (in degrees)
        // between article and result depend on the kind of object.
        $fMaxDist = 0.0000001;
        $bUnknown = false;
        switch (strtolower($aRecord['infobox_type'])) {
            case 'former country':
                // Skip this article altogether.
                continue 2;
            case 'sea':
                $fMaxDist = 60; // effectively turn it off
                $sURL .= "&viewbox=".($aRecord['lon']-$fMaxDist).",".($aRecord['lat']+$fMaxDist).",".($aRecord['lon']+$fMaxDist).",".($aRecord['lat']-$fMaxDist);
                break;
            case 'country':
            case 'island':
            case 'islands':
            case 'continent':
                $fMaxDist = 60; // effectively turn it off
                $sURL .= "&featuretype=country";
                $sURL .= "&viewbox=".($aRecord['lon']-$fMaxDist).",".($aRecord['lat']+$fMaxDist).",".($aRecord['lon']+$fMaxDist).",".($aRecord['lat']-$fMaxDist);
                break;
            case 'prefecture japan':
                $aRecord['name'] = trim(str_replace(' Prefecture', ' ', $aRecord['name']));
                // intentionally no break
            case 'state':
            case '#us state':
            case 'county':
            case 'u.s. state':
            case 'u.s. state symbols':
            case 'german state':
            case 'province or territory of canada':
            case 'indian jurisdiction':
            case 'province':
            case 'french region':
            case 'region of italy':
            case 'kommune':
            case '#australia state or territory':
            case 'russian federal subject':
                $fMaxDist = 4;
                $sURL .= "&featuretype=state";
                $sURL .= "&viewbox=".($aRecord['lon']-$fMaxDist).",".($aRecord['lat']+$fMaxDist).",".($aRecord['lon']+$fMaxDist).",".($aRecord['lat']-$fMaxDist);
                break;
            case 'protected area':
                $fMaxDist = 1;
                $sURL .= "&nearlat=".$aRecord['lat'];
                $sURL .= "&nearlon=".$aRecord['lon'];
                $sURL .= "&viewbox=".($aRecord['lon']-$fMaxDist).",".($aRecord['lat']+$fMaxDist).",".($aRecord['lon']+$fMaxDist).",".($aRecord['lat']-$fMaxDist);
                break;
            case 'settlement':
                // Too generic: the distance limit is taken from the rank of
                // each result further down.
                $bUnknown = true;
                // intentionally no break
            case 'french commune':
            case 'italian comune':
            case 'uk place':
            case 'australian place':
            case 'german place':
            case '#geobox':
            case 'u.s. county':
            case 'municipality':
            case 'city japan':
            case 'russian inhabited locality':
            case 'finnish municipality/land area':
            case 'england county':
            case 'israel municipality':
            case 'russian city':
            case 'city':
                $fMaxDist = 0.2;
                $sURL .= "&featuretype=settlement";
                $sURL .= "&viewbox=".($aRecord['lon']-0.5).",".($aRecord['lat']+0.5).",".($aRecord['lon']+0.5).",".($aRecord['lat']-0.5);
                break;
            case 'mountain':
            case 'mountain pass':
            case 'river':
            case 'lake':
            case 'airport':
                $fMaxDist = 0.2;
                $sURL .= "&viewbox=".($aRecord['lon']-0.5).",".($aRecord['lat']+0.5).",".($aRecord['lon']+0.5).",".($aRecord['lat']-0.5);
                break;
            case 'ship begin':
                $fMaxDist = 0.1;
                $sURL .= "&viewbox=".($aRecord['lon']-0.01).",".($aRecord['lat']+0.01).",".($aRecord['lon']+0.01).",".($aRecord['lat']-0.01);
                $sURL .= "&nearlat=".$aRecord['lat'];
                $sURL .= "&nearlon=".$aRecord['lon'];
                break;
            case 'road':
            case 'university':
            case 'company':
            case 'department':
                $fMaxDist = 0.005;
                $sURL .= "&viewbox=".($aRecord['lon']-0.01).",".($aRecord['lat']+0.01).",".($aRecord['lon']+0.01).",".($aRecord['lat']-0.01);
                $sURL .= "&bounded=1";
                $sURL .= "&nearlat=".$aRecord['lat'];
                $sURL .= "&nearlon=".$aRecord['lon'];
                break;
            default:
                $bUnknown = true;
                $fMaxDist = 0.005;
                $sURL .= "&viewbox=".($aRecord['lon']-0.01).",".($aRecord['lat']+0.01).",".($aRecord['lon']+0.01).",".($aRecord['lat']-0.01);
                // $sURL .= "&bounded=1";
                $sURL .= "&nearlat=".$aRecord['lat'];
                $sURL .= "&nearlon=".$aRecord['lon'];
                echo "-- Unknown: ".$aRecord['infobox_type']."\n";
                break;
        }

        // Search for the full name first. If that finds nothing and the name
        // carries a qualifier ("Foo (bar)", "Foo, Bar"), retry with the part
        // in front of the first "(" or ",".
        $aQueries = array($aRecord['name']);
        $aNameParts = preg_split('#[(,]#', $aRecord['name']);
        if (sizeof($aNameParts) > 1) $aQueries[] = trim($aNameParts[0]);

        $aNominatRecords = array();
        foreach ($aQueries as $sQuery) {
            $sNameURL = $sURL.'&q='.urlencode($sQuery);
            var_dump($sNameURL);

            // nominatimXMLStart() appends to the global $aNominatRecords.
            $aNominatRecords = array();
            $sXML = file_get_contents($sNameURL);
            if ($sXML === false) {
                echo "-- Request failed: ".$sNameURL."\n";
                continue;
            }

            $hXMLParser = xml_parser_create();
            xml_set_element_handler($hXMLParser, 'nominatimXMLStart', 'nominatimXMLEnd');
            xml_parse($hXMLParser, $sXML, true);
            xml_parser_free($hXMLParser);

            if (isset($aNominatRecords[0])) break;
        }

        // assume first is best/right
        foreach ($aNominatRecords as $aPlace) {
            // Straight-line distance in degrees between article and result.
            $fDiff = ($aRecord['lat']-$aPlace['LAT']) * ($aRecord['lat']-$aPlace['LAT']);
            $fDiff += ($aRecord['lon']-$aPlace['LON']) * ($aRecord['lon']-$aPlace['LON']);
            $fDiff = sqrt($fDiff);
            if ($bUnknown) {
                // If it was an unknown type base it on the rank of the found result
                $iRank = (int)$aPlace['PLACE_RANK'];
                if ($iRank <= 4) $fMaxDist = 2;
                elseif ($iRank <= 8) $fMaxDist = 1;
                elseif ($iRank <= 10) $fMaxDist = 0.8;
                elseif ($iRank <= 12) $fMaxDist = 0.6;
                elseif ($iRank <= 17) $fMaxDist = 0.2;
                elseif ($iRank <= 18) $fMaxDist = 0.1;
                elseif ($iRank <= 22) $fMaxDist = 0.02;
                else $fMaxDist = 0.001;
            }
            echo "-- FOUND \"".substr($aPlace['DISPLAY_NAME'], 0, 50)."\", ".$aPlace['CLASS'].", ".$aPlace['TYPE'].", ".$aPlace['PLACE_RANK'].", ".$aPlace['OSM_TYPE']." (dist:$fDiff, max:$fMaxDist)\n";
            if ($fDiff > $fMaxDist) {
                echo "-- Diff too big $fDiff (max: $fMaxDist)".$aRecord['lat'].','.$aPlace['LAT'].' & '.$aRecord['lon'].','.$aPlace['LON']." \n";
                continue;
            }

            // Type and id come from the remote response and go straight into
            // SQL: accept only the three known types and a purely numeric id,
            // otherwise move on to the next result.
            $sOsmType = isset($aPlace['OSM_TYPE']) ? $aPlace['OSM_TYPE'] : '';
            $sOsmId = isset($aPlace['OSM_ID']) ? $aPlace['OSM_ID'] : '';
            if (!isset($aOsmTypeCodes[$sOsmType]) || !preg_match('#^-?[0-9]+\\z#', $sOsmId)) {
                echo "-- Skipping result without a usable OSM type/id\n";
                continue;
            }

            $sSQL = "update wikipedia_article set osm_type='".$aOsmTypeCodes[$sOsmType]."'";
            $sSQL .= ", osm_id=".$sOsmId." where language = '".pg_escape_string($aRecord['language'])."' and title = '".pg_escape_string($aRecord['title'])."'";
            $oDB->query($sSQL);
            break;
        }
    }
}