]> git.openstreetmap.org Git - nominatim.git/blob - utils/importWikipedia.php
PSR2 fixes for /wikidata and /tests-php directories
[nominatim.git] / utils / importWikipedia.php
1 #!/usr/bin/php -Cq
2 <?php
3
// Bootstrap: load the site settings, then the shared command-line helpers.
// NOTE(review): CONST_BasePath is presumably defined by settings.php - confirm.
require_once(dirname(__DIR__).'/settings/settings.php');
require_once(CONST_BasePath.'/lib/init-cmd.php');

// Raise the memory ceiling for this script.
ini_set('memory_limit', '800M');

// Command-line option table handed to getCmdOpt(): the first entry is the
// usage headline, every following entry describes one option.
$aCMDOptions = array(
    "Create and setup nominatim search system",

    // Generic switches
    array('help', 'h', 0, 1, 0, 0, false, 'Show Help'),
    array('quiet', 'q', 0, 1, 0, 0, 'bool', 'Quiet output'),
    array('verbose', 'v', 0, 1, 0, 0, 'bool', 'Verbose output'),

    // Actions offered by this script
    array('create-tables', '', 0, 1, 0, 0, 'bool', 'Create wikipedia tables'),
    array('parse-articles', '', 0, 1, 0, 0, 'bool', 'Parse wikipedia articles'),
    array('link', '', 0, 1, 0, 0, 'bool', 'Try to link to existing OSM ids'),
);

// Parse argv; the result is delivered in $aCMDResult.
getCmdOpt($_SERVER['argv'], $aCMDOptions, $aCMDResult, true, true);
20
21 /*
22 $sTestPageText = <<<EOD
23 {{Coord|47|N|2|E|type:country_region:FR|display=title}}
24 {{ Infobox Amusement park
25 | name = Six Flags Great Adventure
26 | image = [[File:SixFlagsGreatAdventure logo.png]]
27 | caption = Six Flags Great Adventure logo
28 | location = [[Jackson, New Jersey|Jackson]]
29 | location2 = New Jersey
30 | location3 = United States
31 | address = 1 Six Flags Boulevard<ref name="drivedir"/>
32 | season = March/April through October/November
33 | opening_date = July 1, 1974
34 | previous_names = Great Adventure
35 | area_acre = 2200
36 | rides = 45 park admission rides
37 | coasters = 12
38 | water_rides = 2
39 | owner = [[Six Flags]]
40 | general_manager = 
41 | homepage = [http://www.sixflags.com/parks/greatadventure/ Six Flags Great Adventure]
42 }}
43 EOD;
44 var_dump(_templatesToProperties(_parseWikipediaContent($sTestPageText)));
45 exit;
46 //| coordinates = {{Coord|40|08|16.65|N|74|26|26.69|W|region:US-NJ_type:landmark|display=inline,title}}
47 */
48 /*
49
50     $a = array();
51     $a[] = 'test';
52
53     $oDB &= getDB();
54
55     if ($aCMDResult['drop-tables'])
56     {
57         $oDB->query('DROP TABLE wikipedia_article');
58         $oDB->query('DROP TABLE wikipedia_link');
59     }
60 */
61
// --create-tables: create the two tables this script works on.
// !empty() is used so that neither a missing key nor a false value triggers
// the branch (and no notice is raised when the key is absent).
if (!empty($aCMDResult['create-tables'])) {
    // The database handle must be obtained here; it is not set up earlier in
    // the script.
    $oDB =& getDB();

    // One row per wikipedia article, including the extracted properties and
    // the OSM object it has been linked to.
    $sSQL = <<<'EOD'
CREATE TABLE wikipedia_article (
    language text NOT NULL,
    title text NOT NULL,
    langcount integer,
    othercount integer,
    totalcount integer,
    lat double precision,
    lon double precision,
    importance double precision,
    title_en text,
    osm_type character(1),
    osm_id bigint,
    infobox_type text,
    population bigint,
    website text
);
EOD;
    $oDB->query($sSQL);

    // Add a 2-dimensional SRID 4326 geometry column named "location".
    $oDB->query("SELECT AddGeometryColumn('wikipedia_article', 'location', 4326, 'GEOMETRY', 2)");

    // Link table between articles.
    $sSQL = <<<'EOD'
CREATE TABLE wikipedia_link (
  from_id INTEGER,
  to_name text
  );
EOD;
    $oDB->query($sSQL);
}
92
/**
 * Convert a degrees/minutes/seconds coordinate into signed decimal degrees.
 *
 * @param mixed  $iDegrees Whole degrees (cast to float).
 * @param mixed  $iMinutes Minutes, counted as 1/60 of a degree.
 * @param mixed  $fSeconds Seconds, counted as 1/3600 of a degree.
 * @param string $sNSEW    Hemisphere letter, case-insensitive; 'S' and 'W'
 *                         make the result negative, anything else positive.
 *
 * @return float Decimal degrees.
 */
function degreesAndMinutesToDecimal($iDegrees, $iMinutes = 0, $fSeconds = 0, $sNSEW = 'N')
{
    $sHemisphere = strtoupper($sNSEW);

    // South and west are the negative half of each axis.
    $iSign = 1;
    if ($sHemisphere == 'S' || $sHemisphere == 'W') {
        $iSign = -1;
    }

    $fMagnitude = (float)$iDegrees + (float)$iMinutes/60 + (float)$fSeconds/3600;

    return $iSign * $fMagnitude;
}
98
/**
 * Extract the templates ({{ ... }}) contained in a piece of wikitext.
 *
 * Newlines are flattened to spaces and HTML comments and <math> sections are
 * removed. The text is then split on the markers '{{', '}}', '[[', ']]' and
 * '|' and walked with a small stack-based state machine.
 *
 * Each returned template is array(name, params). params holds the raw
 * parameter strings in REVERSE order of appearance (the most recent parameter
 * is always kept at index 0 while parsing). A nested template is emitted
 * before the template that contains it, and its text is not copied into the
 * outer template. A wiki link inside a parameter contributes its display text
 * (or its target when no display text is given).
 *
 * @param string $sPageText Raw wikitext.
 *
 * @return array List of array(string $sName, array $aRawParams).
 */
function _parseWikipediaContent($sPageText)
{
    $sPageText = str_replace("\n", ' ', $sPageText);
    $sPageText = preg_replace('#<!--.*?-->#m', '', $sPageText);
    $sPageText = preg_replace('#<math>.*?<\\/math>#m', '', $sPageText);

    // Keep the delimiters so that they show up as their own parts.
    $aPageText = preg_split('#({{|}}|\\[\\[|\\]\\]|[|])#', $sPageText, -1, PREG_SPLIT_DELIM_CAPTURE);

    $aTemplates = array();

    // Index 0 of both stacks is always the innermost open element.
    $aTemplateStack = array();
    $aState = array('body');

    // Target and display text of the link currently being read.
    $sLinkPage = '';
    $sLinkSyn = '';

    foreach ($aPageText as $sPart) {
        switch ($sPart) {
            case '{{':
                array_unshift($aTemplateStack, array('', array()));
                array_unshift($aState, 'template');
                break;
            case '}}':
                // A stray '}}' outside of a template is ignored.
                if ($aState[0] == 'template' || $aState[0] == 'templateparam') {
                    $aTemplates[] = array_shift($aTemplateStack);
                    array_shift($aState);
                }
                break;
            case '[[':
                $sLinkPage = '';
                $sLinkSyn = '';
                array_unshift($aState, 'link');
                break;
            case ']]':
                if ($aState[0] == 'link' || $aState[0] == 'linksynonim') {
                    if (!$sLinkSyn) $sLinkSyn = $sLinkPage;
                    if (substr($sLinkPage, 0, 6) == 'Image:') $sLinkSyn = substr($sLinkPage, 6);

                    // Back to whatever was being read before the link opened.
                    array_shift($aState);
                    switch ($aState[0]) {
                        case 'template':
                            // NOTE(review): this appends the literal ']]' to the
                            // template name rather than the link text - kept
                            // as in the original, confirm whether intended.
                            $aTemplateStack[0][0] .= trim($sPart);
                            break;
                        case 'templateparam':
                            $aTemplateStack[0][1][0] .= $sLinkSyn;
                            break;
                        case 'link':
                            $sLinkPage .= trim($sPart);
                            break;
                        case 'linksynonim':
                            $sLinkSyn .= $sPart;
                            break;
                        case 'body':
                            // Body text is not part of the result.
                            break;
                        default:
                            var_dump($aState, $aTemplateStack, $sPart, $aPageText);
                            fail('unknown state');
                    }
                }
                break;
            case '|':
                if ($aState[0] == 'template' || $aState[0] == 'templateparam') {
                    // Create a new template parameter
                    $aState[0] = 'templateparam';
                    array_unshift($aTemplateStack[0][1], '');
                }
                if ($aState[0] == 'link') $aState[0] = 'linksynonim';
                break;
            default:
                // Plain text: append it to whatever is currently open.
                switch ($aState[0]) {
                    case 'template':
                        $aTemplateStack[0][0] .= trim($sPart);
                        break;
                    case 'templateparam':
                        $aTemplateStack[0][1][0] .= $sPart;
                        break;
                    case 'link':
                        $sLinkPage .= trim($sPart);
                        break;
                    case 'linksynonim':
                        $sLinkSyn .= $sPart;
                        break;
                    case 'body':
                        // Body text is not part of the result.
                        break;
                    default:
                        var_dump($aState, $aPageText);
                        fail('unknown state');
                }
                break;
        }
    }
    return $aTemplates;
}
197
/**
 * Return $aParams[$mKey] if it is set, otherwise $mDefault.
 *
 * @param array      $aParams  Template parameters.
 * @param int|string $mKey     Parameter name or position.
 * @param mixed      $mDefault Value used when the parameter is missing.
 *
 * @return mixed
 */
function _templateParamOr($aParams, $mKey, $mDefault)
{
    return isset($aParams[$mKey]) ? $aParams[$mKey] : $mDefault;
}

/**
 * Tell whether a value is the hemisphere letter N or S (case-insensitive).
 *
 * @param string $sValue Value to test.
 *
 * @return bool
 */
function _isNorthOrSouth($sValue)
{
    $sValue = strtoupper($sValue);
    return $sValue === 'N' || $sValue === 'S';
}

/**
 * Read latitude/longitude from the positional parameters of a coord template.
 *
 * Supported layouts: d|m|s|NS|d|m|s|EW, d|m|NS|d|m|EW, d|NS|d|EW and lat|lon
 * as plain numbers. A layout is only used when all of its positional
 * parameters are present.
 *
 * @param array $aParams Template parameters keyed by position/name.
 *
 * @return array|null array(fLat, fLon), or null when nothing usable is found.
 */
function _coordTemplateToLatLon($aParams)
{
    if (isset($aParams[3]) && _isNorthOrSouth($aParams[3])) {
        if (!isset($aParams[0], $aParams[1], $aParams[2], $aParams[4], $aParams[5], $aParams[6], $aParams[7])) {
            return null;
        }
        return array(
            degreesAndMinutesToDecimal($aParams[0], $aParams[1], $aParams[2], $aParams[3]),
            degreesAndMinutesToDecimal($aParams[4], $aParams[5], $aParams[6], $aParams[7])
        );
    }
    if (isset($aParams[0], $aParams[1], $aParams[2]) && _isNorthOrSouth($aParams[2])) {
        if (!isset($aParams[3], $aParams[4], $aParams[5])) {
            return null;
        }
        return array(
            degreesAndMinutesToDecimal($aParams[0], $aParams[1], 0, $aParams[2]),
            degreesAndMinutesToDecimal($aParams[3], $aParams[4], 0, $aParams[5])
        );
    }
    if (isset($aParams[0], $aParams[1]) && _isNorthOrSouth($aParams[1])) {
        if (!isset($aParams[2], $aParams[3])) {
            return null;
        }
        return array(
            (strtoupper($aParams[1]) === 'N' ? 1 : -1) * (float)$aParams[0],
            (strtoupper($aParams[3]) === 'E' ? 1 : -1) * (float)$aParams[2]
        );
    }
    if (isset($aParams[0], $aParams[1]) && is_numeric($aParams[0]) && is_numeric($aParams[1])) {
        return array((float)$aParams[0], (float)$aParams[1]);
    }
    return null;
}

/**
 * Derive page properties from the templates found on a wikipedia page.
 *
 * Expects the output of _parseWikipediaContent(): a list of
 * array(name, raw params) with the raw params in reverse order. Each raw
 * param is split at its first '=' into name and value; params without '='
 * are stored positionally.
 *
 * The first template that supplies a value wins for each of these keys:
 *  - sOfficialName   from 'official_name'
 *  - iPopulation     from 'population', 'population_total',
 *                    'population_urban' or 'population_estimate' (the value
 *                    must start with digits; ',' and '.' are stripped)
 *  - sWebsite        from 'website' ('http://' is prepended when the value
 *                    has no scheme)
 *  - sTopLevelDomain from 'cctld'
 *  - sInfoboxType    from a template named 'Infobox ...'; if there is none,
 *                    '#' plus the name of the first template with more than
 *                    ten params
 *  - fLat / fLon     from various coordinate parameter conventions
 *
 * @param array $aTemplates Templates as returned by _parseWikipediaContent().
 *
 * @return array Map of the property keys listed above (only those found).
 */
function _templatesToProperties($aTemplates)
{
    $aPopulationKeys = array('population', 'population_total', 'population_urban', 'population_estimate');

    $aPageProperties = array();
    foreach ($aTemplates as $aTemplate) {
        // Restore document order and split "name = value" pairs.
        $aParams = array();
        foreach (array_reverse($aTemplate[1]) as $sParam) {
            $iPos = strpos($sParam, '=');
            if ($iPos === false) {
                $aParams[] = trim($sParam);
            } else {
                $aParams[trim(substr($sParam, 0, $iPos))] = trim(substr($sParam, $iPos+1));
            }
        }

        if (!isset($aPageProperties['sOfficialName']) && !empty($aParams['official_name'])) {
            $aPageProperties['sOfficialName'] = $aParams['official_name'];
        }

        // Population: keys are tried in order of preference.
        foreach ($aPopulationKeys as $sKey) {
            if (isset($aPageProperties['iPopulation'])) break;
            if (!empty($aParams[$sKey]) && preg_match('#^[0-9.,]+#', $aParams[$sKey])) {
                $aPageProperties['iPopulation'] = (int)str_replace(array(',', '.'), '', $aParams[$sKey]);
            }
        }

        if (!isset($aPageProperties['sWebsite']) && !empty($aParams['website'])) {
            // Accept a bare URL or an external link "[url label]".
            if (preg_match('#^\\[?([^ \\]]+)[^\\]]*\\]?$#', $aParams['website'], $aMatch)) {
                $aPageProperties['sWebsite'] = $aMatch[1];
                if (strpos($aPageProperties['sWebsite'], ':/'.'/') === false) {
                    $aPageProperties['sWebsite'] = 'http:/'.'/'.$aPageProperties['sWebsite'];
                }
            }
        }

        if (!isset($aPageProperties['sTopLevelDomain']) && !empty($aParams['cctld'])) {
            $aPageProperties['sTopLevelDomain'] = str_replace(array('[', ']', '.'), '', $aParams['cctld']);
        }

        if (!isset($aPageProperties['sInfoboxType']) && strtolower(substr($aTemplate[0], 0, 7)) === 'infobox') {
            $aPageProperties['sInfoboxType'] = trim(substr($aTemplate[0], 8));
        }

        // Assume the first template with lots of params is the type (fallback for infobox)
        if (!isset($aPageProperties['sPossibleInfoboxType']) && sizeof($aParams) > 10) {
            $aPageProperties['sPossibleInfoboxType'] = trim($aTemplate[0]);
        }

        // Coordinates: only looked for until a template has supplied a
        // latitude. Within one template a later convention overrides an
        // earlier one.
        if (isset($aPageProperties['fLat'])) {
            continue;
        }

        if (isset($aParams['latd']) && isset($aParams['longd'])) {
            $aPageProperties['fLat'] = degreesAndMinutesToDecimal(
                $aParams['latd'],
                _templateParamOr($aParams, 'latm', 0),
                _templateParamOr($aParams, 'lats', 0),
                _templateParamOr($aParams, 'latNS', 'N')
            );
            $aPageProperties['fLon'] = degreesAndMinutesToDecimal(
                $aParams['longd'],
                _templateParamOr($aParams, 'longm', 0),
                _templateParamOr($aParams, 'longs', 0),
                _templateParamOr($aParams, 'longEW', 'E')
            );
        }

        // Both halves must be present before either is read.
        if (isset($aParams['lat_degrees']) && isset($aParams['long_degrees'])) {
            $aPageProperties['fLat'] = degreesAndMinutesToDecimal(
                $aParams['lat_degrees'],
                _templateParamOr($aParams, 'lat_minutes', 0),
                _templateParamOr($aParams, 'lat_seconds', 0),
                _templateParamOr($aParams, 'lat_direction', 'N')
            );
            $aPageProperties['fLon'] = degreesAndMinutesToDecimal(
                $aParams['long_degrees'],
                _templateParamOr($aParams, 'long_minutes', 0),
                _templateParamOr($aParams, 'long_seconds', 0),
                _templateParamOr($aParams, 'long_direction', 'E')
            );
        }

        if (isset($aParams['latitude']) && isset($aParams['longitude'])) {
            if (preg_match('#[0-9.]+#', $aParams['latitude']) && preg_match('#[0-9.]+#', $aParams['longitude'])) {
                $aPageProperties['fLat'] = (float)$aParams['latitude'];
                $aPageProperties['fLon'] = (float)$aParams['longitude'];
            }
        }

        if (strtolower($aTemplate[0]) === 'coord') {
            $aLatLon = _coordTemplateToLatLon($aParams);
            if ($aLatLon !== null) {
                $aPageProperties['fLat'] = $aLatLon[0];
                $aPageProperties['fLon'] = $aLatLon[1];
            }
        }

        if (isset($aParams['Latitude']) && isset($aParams['Longitude'])) {
            // Textual form, either a single value or a "from to" range; for a
            // range the midpoint is used.
            $sLatitude = str_replace('&nbsp;', ' ', $aParams['Latitude']);
            $sLongitude = str_replace('&nbsp;', ' ', $aParams['Longitude']);

            if (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([NS]) to ([0-9]+)°( ([0-9]+)′)? ([NS])#', $sLatitude, $aMatch)) {
                $aPageProperties['fLat'] =
                    (degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4])
                    +degreesAndMinutesToDecimal($aMatch[5], $aMatch[7], 0, $aMatch[8])) / 2;
            } elseif (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([NS])#', $sLatitude, $aMatch)) {
                $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4]);
            }

            if (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([EW]) to ([0-9]+)°( ([0-9]+)′)? ([EW])#', $sLongitude, $aMatch)) {
                $aPageProperties['fLon'] =
                    (degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4])
                    +degreesAndMinutesToDecimal($aMatch[5], $aMatch[7], 0, $aMatch[8])) / 2;
            } elseif (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([EW])#', $sLongitude, $aMatch)) {
                $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4]);
            }
        }
    }

    // The "many params" guess is only a fallback and is marked with '#'.
    if (isset($aPageProperties['sPossibleInfoboxType'])) {
        if (!isset($aPageProperties['sInfoboxType'])) {
            $aPageProperties['sInfoboxType'] = '#'.$aPageProperties['sPossibleInfoboxType'];
        }
        unset($aPageProperties['sPossibleInfoboxType']);
    }
    return $aPageProperties;
}
305
// --parse-wikipedia: extract infobox type, population, website and
// coordinates from the stored article text and write them to
// wikipedia_article.
// NOTE(review): the option table near the top of this script declares
// 'parse-articles', not 'parse-wikipedia' - confirm which name is intended;
// as written this branch may never run.
if (isset($aCMDResult['parse-wikipedia'])) {
    $oDB =& getDB();

    // The option value selects the articles whose page_id modulo 10 equals
    // it. It is cast to int so that nothing but a number reaches the SQL.
    $iShard = (int)$aCMDResult['parse-wikipedia'];
    $aArticleNames = $oDB->getCol('select page_title from content where page_namespace = 0 and page_id %10 = '.$iShard.' and (page_content ilike \'%{{Coord%\' or (page_content ilike \'%lat%\' and page_content ilike \'%lon%\'))');
//      $aArticleNames = $oDB->getCol($sSQL = 'select page_title from content where page_namespace = 0 and (page_content ilike \'%{{Coord%\' or (page_content ilike \'%lat%\' and page_content ilike \'%lon%\')) and page_title in (\'Virginia\')');
    foreach ($aArticleNames as $sArticleName) {
        $sEscapedTitle = pg_escape_string($sArticleName);

        $sPageText = $oDB->getOne('select page_content from content where page_namespace = 0 and page_title = \''.$sEscapedTitle.'\'');
        $aP = _templatesToProperties(_parseWikipediaContent($sPageText));

        // Every update below targets the English row of this article.
        $sUpdate = 'update wikipedia_article set ';
        $sWhere = ' where language = \'en\' and title = \''.$sEscapedTitle.'\';';

        if (isset($aP['sInfoboxType'])) {
            // Collapse runs of whitespace in the infobox name.
            $aP['sInfoboxType'] = preg_replace('#\\s+#', ' ', $aP['sInfoboxType']);
            $oDB->query($sUpdate.'infobox_type = \''.pg_escape_string($aP['sInfoboxType']).'\''.$sWhere);
        }
        if (isset($aP['iPopulation'])) {
            $oDB->query($sUpdate.'population = \''.pg_escape_string($aP['iPopulation']).'\''.$sWhere);
        }
        if (isset($aP['sWebsite'])) {
            $oDB->query($sUpdate.'website = \''.pg_escape_string($aP['sWebsite']).'\''.$sWhere);
        }
        // A position where both latitude and longitude are zero is not stored.
        if (isset($aP['fLat']) && ($aP['fLat']!='-0' || $aP['fLon']!='-0')) {
            if (!isset($aP['sInfoboxType'])) $aP['sInfoboxType'] = '';
            echo $sArticleName.'|'.$aP['sInfoboxType'].'|'.$aP['fLat'].'|'.$aP['fLon'] ."\n";
            $sSQL = $sUpdate;
            $sSQL .= 'lat = \''.pg_escape_string($aP['fLat']).'\',';
            $sSQL .= 'lon = \''.pg_escape_string($aP['fLon']).'\'';
            $sSQL .= $sWhere;
            $oDB->query($sSQL);
        }
    }
}
344
/**
 * Start-element callback for the XML parser used on Nominatim search results.
 *
 * Appends the attributes of every PLACE element to the global list
 * $aNominatRecords; all other elements are ignored.
 *
 * @param mixed  $hParser XML parser invoking the callback (unused).
 * @param string $sName   Element name.
 * @param array  $aAttr   Attributes of the element.
 *
 * @return void
 */
function nominatimXMLStart($hParser, $sName, $aAttr)
{
    global $aNominatRecords;

    if ($sName == 'PLACE') {
        $aNominatRecords[] = $aAttr;
    }
}
354
/**
 * End-element callback for the XML parser used on Nominatim search results.
 *
 * Does nothing; it is registered together with nominatimXMLStart() via
 * xml_set_element_handler().
 *
 * @param mixed  $hParser XML parser invoking the callback (unused).
 * @param string $sName   Element name (unused).
 *
 * @return void
 */
function nominatimXMLEnd($hParser, $sName)
{
    // Nothing to do when an element closes.
}
358
359
/**
 * Build the "&viewbox=" query fragment for a square around an article.
 *
 * @param array $aRecord   Article row; 'lat' and 'lon' are read.
 * @param mixed $fHalfSize Half of the box edge, in degrees.
 *
 * @return string
 */
function _nominatimViewboxParam($aRecord, $fHalfSize)
{
    return "&viewbox=".($aRecord['lon']-$fHalfSize).",".($aRecord['lat']+$fHalfSize).",".($aRecord['lon']+$fHalfSize).",".($aRecord['lat']-$fHalfSize);
}

/**
 * Build the "&nearlat=...&nearlon=..." query fragment for an article.
 *
 * @param array $aRecord Article row; 'lat' and 'lon' are read.
 *
 * @return string
 */
function _nominatimNearParam($aRecord)
{
    return "&nearlat=".$aRecord['lat']."&nearlon=".$aRecord['lon'];
}

/**
 * Request a search URL and collect the attributes of all returned places.
 *
 * The URL is dumped before the request is made. The result is gathered by
 * nominatimXMLStart() in the global $aNominatRecords, which is reset here.
 *
 * @param string $sNameURL Complete search URL.
 *
 * @return array List of attribute arrays, one per PLACE element; empty when
 *               the request fails.
 */
function _fetchNominatimRecords($sNameURL)
{
    global $aNominatRecords;

    var_dump($sNameURL);
    $aNominatRecords = array();

    $sXML = file_get_contents($sNameURL);
    if ($sXML === false) {
        echo "-- Request failed: ".$sNameURL."\n";
        return $aNominatRecords;
    }

    $hXMLParser = xml_parser_create();
    xml_set_element_handler($hXMLParser, 'nominatimXMLStart', 'nominatimXMLEnd');
    xml_parse($hXMLParser, $sXML, true);
    xml_parser_free($hXMLParser);

    return $aNominatRecords;
}

// --link: try to find the OSM object for each article that has coordinates
// but no link yet, by searching for its name near its position.
if (isset($aCMDResult['link'])) {
    $oDB =& getDB();
    $aWikiArticles = $oDB->getAll("select * from wikipedia_article where language = 'en' and lat is not null and osm_type is null and totalcount < 31 order by importance desc limit 200000");

    // If you point this script at production OSM you will be blocked
    $sNominatimBaseURL = 'http://SEVERNAME/search.php';

    // OSM_TYPE values of the search result mapped to the stored one-letter code.
    $aOsmTypeCodes = array('relation' => 'R', 'way' => 'W', 'node' => 'N');

    foreach ($aWikiArticles as $aRecord) {
        $aRecord['name'] = str_replace('_', ' ', $aRecord['title']);

        $sURL = $sNominatimBaseURL.'?format=xml&accept-language=en';

        echo "\n-- ".$aRecord['name'].", ".$aRecord['infobox_type']."\n";

        // $fMaxDist is the largest accepted distance (in degrees) between the
        // article position and a search result. The infobox type decides both
        // that limit and the search restrictions.
        $fMaxDist = 0.0000001;
        $bUnknown = false;
        switch (strtolower($aRecord['infobox_type'])) {
            case 'former country':
                // Not linked at all; go on with the next article.
                continue 2;
            case 'sea':
                $fMaxDist = 60; // effectively turn it off
                $sURL .= _nominatimViewboxParam($aRecord, $fMaxDist);
                break;
            case 'country':
            case 'island':
            case 'islands':
            case 'continent':
                $fMaxDist = 60; // effectively turn it off
                $sURL .= "&featuretype=country";
                $sURL .= _nominatimViewboxParam($aRecord, $fMaxDist);
                break;
            case 'prefecture japan':
                $aRecord['name'] = trim(str_replace(' Prefecture', ' ', $aRecord['name']));
                // no break: continue with the state handling
            case 'state':
            case '#us state':
            case 'county':
            case 'u.s. state':
            case 'u.s. state symbols':
            case 'german state':
            case 'province or territory of canada':
            case 'indian jurisdiction':
            case 'province':
            case 'french region':
            case 'region of italy':
            case 'kommune':
            case '#australia state or territory':
            case 'russian federal subject':
                $fMaxDist = 4;
                $sURL .= "&featuretype=state";
                $sURL .= _nominatimViewboxParam($aRecord, $fMaxDist);
                break;
            case 'protected area':
                $fMaxDist = 1;
                $sURL .= _nominatimNearParam($aRecord);
                $sURL .= _nominatimViewboxParam($aRecord, $fMaxDist);
                break;
            case 'settlement':
                $bUnknown = true;
                // no break: search like any other populated place
            case 'french commune':
            case 'italian comune':
            case 'uk place':
            case 'australian place':
            case 'german place':
            case '#geobox':
            case 'u.s. county':
            case 'municipality':
            case 'city japan':
            case 'russian inhabited locality':
            case 'finnish municipality/land area':
            case 'england county':
            case 'israel municipality':
            case 'russian city':
            case 'city':
                $fMaxDist = 0.2;
                $sURL .= "&featuretype=settlement";
                $sURL .= _nominatimViewboxParam($aRecord, 0.5);
                break;
            case 'mountain':
            case 'mountain pass':
            case 'river':
            case 'lake':
            case 'airport':
                $fMaxDist = 0.2;
                $sURL .= _nominatimViewboxParam($aRecord, 0.5);
                // Without this break the 'ship begin' settings below would
                // overwrite the limit and append a second viewbox.
                break;
            case 'ship begin':
                $fMaxDist = 0.1;
                $sURL .= _nominatimViewboxParam($aRecord, 0.01);
                $sURL .= _nominatimNearParam($aRecord);
                break;
            case 'road':
            case 'university':
            case 'company':
            case 'department':
                $fMaxDist = 0.005;
                $sURL .= _nominatimViewboxParam($aRecord, 0.01);
                $sURL .= "&bounded=1";
                $sURL .= _nominatimNearParam($aRecord);
                break;
            default:
                $bUnknown = true;
                $fMaxDist = 0.005;
                $sURL .= _nominatimViewboxParam($aRecord, 0.01);
//              $sURL .= "&bounded=1";
                $sURL .= _nominatimNearParam($aRecord);
                echo "-- Unknown: ".$aRecord['infobox_type']."\n";
                break;
        }

        $aNominatRecords = _fetchNominatimRecords($sURL.'&q='.urlencode($aRecord['name']));

        if (!isset($aNominatRecords[0])) {
            // Nothing found: retry with only the part of the name before the
            // first '(' or ','.
            $aNameParts = preg_split('#[(,]#', $aRecord['name']);
            if (sizeof($aNameParts) > 1) {
                $aNominatRecords = _fetchNominatimRecords($sURL.'&q='.urlencode(trim($aNameParts[0])));
            }
        }

        // assume first is best/right
        foreach ($aNominatRecords as $aFound) {
            $fLatDiff = $aRecord['lat']-$aFound['LAT'];
            $fLonDiff = $aRecord['lon']-$aFound['LON'];
            $fDiff = sqrt($fLatDiff * $fLatDiff + $fLonDiff * $fLonDiff);

            if ($bUnknown) {
                // If it was an unknown type base it on the rank of the found result
                $iRank = (int)$aFound['PLACE_RANK'];
                if ($iRank <= 4) $fMaxDist = 2;
                elseif ($iRank <= 8) $fMaxDist = 1;
                elseif ($iRank <= 10) $fMaxDist = 0.8;
                elseif ($iRank <= 12) $fMaxDist = 0.6;
                elseif ($iRank <= 17) $fMaxDist = 0.2;
                elseif ($iRank <= 18) $fMaxDist = 0.1;
                elseif ($iRank <= 22) $fMaxDist = 0.02;
                else $fMaxDist = 0.001;
            }

            echo "-- FOUND \"".substr($aFound['DISPLAY_NAME'], 0, 50)."\", ".$aFound['CLASS'].", ".$aFound['TYPE'].", ".$aFound['PLACE_RANK'].", ".$aFound['OSM_TYPE']." (dist:$fDiff, max:$fMaxDist)\n";

            if ($fDiff > $fMaxDist) {
                echo "-- Diff too big $fDiff (max: $fMaxDist)".$aRecord['lat'].','.$aFound['LAT'].' & '.$aRecord['lon'].','.$aFound['LON']." \n";
                continue;
            }

            // Only write a link made of a known object type and a numeric
            // id; anything else would produce a broken statement.
            if (!isset($aFound['OSM_TYPE'], $aFound['OSM_ID'])
                || !isset($aOsmTypeCodes[$aFound['OSM_TYPE']])
                || !preg_match('#^-?[0-9]+$#', $aFound['OSM_ID'])
            ) {
                echo "-- Skipped: unusable OSM type or id\n";
                continue;
            }

            $sSQL = "update wikipedia_article set osm_type='".$aOsmTypeCodes[$aFound['OSM_TYPE']]."'";
            $sSQL .= ", osm_id=".$aFound['OSM_ID']." where language = '".pg_escape_string($aRecord['language'])."' and title = '".pg_escape_string($aRecord['title'])."'";
            $oDB->query($sSQL);
            break;
        }
    }
}