]> git.openstreetmap.org Git - nominatim.git/blob - utils/importWikipedia.php
22307844de5269ad1d6bfcd2b3e764c3f36b7d5b
[nominatim.git] / utils / importWikipedia.php
1 #!/usr/bin/php -Cq
2 <?php
3
4 require_once(dirname(dirname(__FILE__)).'/settings/settings.php');
5 require_once(CONST_BasePath.'/lib/init-cmd.php');
6 ini_set('memory_limit', '800M');
7
// Command line specification: the usage banner comes first, followed by one
// spec array per switch. The list is handed unchanged to getCmdOpt(), which
// fills $aCMDResult.
$aCMDOptions = array("Create and setup nominatim search system");

// General switches.
$aCMDOptions[] = array('help', 'h', 0, 1, 0, 0, false, 'Show Help');
$aCMDOptions[] = array('quiet', 'q', 0, 1, 0, 0, 'bool', 'Quiet output');
$aCMDOptions[] = array('verbose', 'v', 0, 1, 0, 0, 'bool', 'Verbose output');

// Actions this script can perform.
$aCMDOptions[] = array('create-tables', '', 0, 1, 0, 0, 'bool', 'Create wikipedia tables');
$aCMDOptions[] = array('parse-articles', '', 0, 1, 0, 0, 'bool', 'Parse wikipedia articles');
$aCMDOptions[] = array('link', '', 0, 1, 0, 0, 'bool', 'Try to link to existing OSM ids');

getCmdOpt($_SERVER['argv'], $aCMDOptions, $aCMDResult, true, true);
19
20 /*
21 $sTestPageText = <<<EOD
22 {{Coord|47|N|2|E|type:country_region:FR|display=title}}
23 {{ Infobox Amusement park
24 | name = Six Flags Great Adventure
25 | image = [[File:SixFlagsGreatAdventure logo.png]]
26 | caption = Six Flags Great Adventure logo
27 | location = [[Jackson, New Jersey|Jackson]]
28 | location2 = New Jersey
29 | location3 = United States
30 | address = 1 Six Flags Boulevard<ref name="drivedir"/>
31 | season = March/April through October/November
32 | opening_date = July 1, 1974
33 | previous_names = Great Adventure
34 | area_acre = 2200
35 | rides = 45 park admission rides
36 | coasters = 12
37 | water_rides = 2
38 | owner = [[Six Flags]]
39 | general_manager = 
40 | homepage = [http://www.sixflags.com/parks/greatadventure/ Six Flags Great Adventure]
41 }}
42 EOD;
43 var_dump(_templatesToProperties(_parseWikipediaContent($sTestPageText)));
44 exit;
45 //| coordinates = {{Coord|40|08|16.65|N|74|26|26.69|W|region:US-NJ_type:landmark|display=inline,title}}
46 */
47 /*
48
49     $a = array();
50     $a[] = 'test';
51
52     $oDB &= getDB();
53
54     if ($aCMDResult['drop-tables'])
55     {
56         $oDB->query('DROP TABLE wikipedia_article');
57         $oDB->query('DROP TABLE wikipedia_link');
58     }
59 */
60
// --create-tables: create the two tables that receive the wikipedia import.
if (!empty($aCMDResult['create-tables'])) {
    // The handle must be fetched here; no earlier live code assigns $oDB.
    $oDB =& getDB();

    // One row per wikipedia article, plus the data later derived from it
    // (position, infobox type, population, website, matched OSM object).
    $sSQL = <<<'EOD'
CREATE TABLE wikipedia_article (
    language text NOT NULL,
    title text NOT NULL,
    langcount integer,
    othercount integer,
    totalcount integer,
    lat double precision,
    lon double precision,
    importance double precision,
    title_en text,
    osm_type character(1),
    osm_id bigint,
    infobox_type text,
    population bigint,
    website text
);
EOD;
    $oDB->query($sSQL);

    $oDB->query("SELECT AddGeometryColumn('wikipedia_article', 'location', 4326, 'GEOMETRY', 2)");

    $sSQL = <<<'EOD'
CREATE TABLE wikipedia_link (
  from_id INTEGER,
  to_name text
  );
EOD;
    $oDB->query($sSQL);
}
91
/**
 * Convert a degrees/minutes/seconds coordinate into signed decimal degrees.
 *
 * Every numeric part is cast to float, so numeric strings are accepted and
 * null or an empty string counts as 0.
 *
 * @param mixed       $iDegrees Degrees part.
 * @param mixed       $iMinutes Minutes part (1/60 of a degree).
 * @param mixed       $fSeconds Seconds part (1/3600 of a degree).
 * @param string|null $sNSEW    Hemisphere letter, compared case-insensitively.
 *                              'S' and 'W' negate the result; any other value,
 *                              including null, leaves it positive.
 *
 * @return float Decimal degrees.
 */
function degreesAndMinutesToDecimal($iDegrees, $iMinutes=0, $fSeconds=0, $sNSEW='N')
{
    // Callers hand over missing template parameters as null. Cast first so
    // strtoupper() never receives null (deprecated since PHP 8.1).
    $sHemisphere = strtoupper((string)$sNSEW);
    $iSign = ($sHemisphere == 'S' || $sHemisphere == 'W') ? -1 : 1;

    return $iSign * ((float)$iDegrees + (float)$iMinutes/60 + (float)$fSeconds/3600);
}
97
/**
 * Extract the templates ("{{name|param|...}}") from wikipedia page markup.
 *
 * Newlines are turned into spaces and HTML comments and <math> elements are
 * removed. The text is then split on "{{", "}}", "[[", "]]" and "|" and fed
 * through a small state machine. Its state stack starts at 'body', pushes
 * 'template' or 'link' on an opening token, and switches to 'templateparam'
 * or 'linksynonim' once a "|" is seen inside.
 *
 * @param string $sPageText Raw wiki markup of one page.
 *
 * @return array List of array(name, params), in the order the closing "}}"
 *               of each template was met, so a nested template precedes the
 *               one containing it. params holds the raw, untrimmed parameter
 *               strings with the LAST parameter first. A link inside a
 *               parameter contributes its display text.
 */
function _parseWikipediaContent($sPageText)
{
    $sPageText = str_replace("\n", ' ', $sPageText);
    $sPageText = preg_replace('#<!--.*?-->#m', '', $sPageText);
    $sPageText = preg_replace('#<math>.*?<\\/math>#m', '', $sPageText);

    // Keep the delimiters so the loop below sees them as tokens of their own.
    $aPageText = preg_split('#({{|}}|\\[\\[|\\]\\]|[|])#', $sPageText, -1, PREG_SPLIT_DELIM_CAPTURE);

    $aTemplates = array();

    // Innermost open template is at index 0: array(name, params).
    $aTemplateStack = array();
    // Target and display text of the link currently being read.
    $sLinkPage = '';
    $sLinkSyn = '';
    // Innermost parser state is at index 0.
    $aState = array('body');

    foreach ($aPageText as $sPart) {
        switch ($sPart) {
        case '{{':
            array_unshift($aTemplateStack, array('', array()));
            array_unshift($aState, 'template');
            break;
        case '}}':
            // A "}}" that does not close a template is ignored.
            if ($aState[0] == 'template' || $aState[0] == 'templateparam') {
                $aTemplates[] = array_shift($aTemplateStack);
                array_shift($aState);
            }
            break;
        case '[[':
            $sLinkPage = '';
            $sLinkSyn = '';
            array_unshift($aState, 'link');
            break;
        case ']]':
            if ($aState[0] == 'link' || $aState[0] == 'linksynonim') {
                // Without an explicit display text the target is shown.
                if (!$sLinkSyn) $sLinkSyn = $sLinkPage;
                if (substr($sLinkPage, 0, 6) == 'Image:') $sLinkSyn = substr($sLinkPage, 6);

                // Back to whatever contained the link.
                array_shift($aState);
                switch ($aState[0]) {
                case 'template':
                    // NOTE(review): this appends the "]]" token itself, not
                    // the link text, to the template name. Kept as found.
                    $aTemplateStack[0][0] .= trim($sPart);
                    break;
                case 'templateparam':
                    $aTemplateStack[0][1][0] .= $sLinkSyn;
                    break;
                case 'link':
                    // NOTE(review): as above, this appends the "]]" token.
                    $sLinkPage .= trim($sPart);
                    break;
                case 'linksynonim':
                    $sLinkSyn .= $sPart;
                    break;
                case 'body':
                    // Body text is not collected.
                    break;
                default:
                    var_dump($aState, $aTemplateStack, $sPart, $aPageText);
                    fail('unknown state');
                }
            }
            break;
        case '|':
            if ($aState[0] == 'template' || $aState[0] == 'templateparam') {
                // Start a new template parameter at the front of the list.
                $aState[0] = 'templateparam';
                array_unshift($aTemplateStack[0][1], '');
            }
            if ($aState[0] == 'link') $aState[0] = 'linksynonim';
            break;
        default:
            // Plain text: append to whatever is currently being read.
            switch ($aState[0]) {
            case 'template':
                $aTemplateStack[0][0] .= trim($sPart);
                break;
            case 'templateparam':
                $aTemplateStack[0][1][0] .= $sPart;
                break;
            case 'link':
                $sLinkPage .= trim($sPart);
                break;
            case 'linksynonim':
                $sLinkSyn .= $sPart;
                break;
            case 'body':
                // Body text is not collected.
                break;
            default:
                var_dump($aState, $aPageText);
                fail('unknown state');
            }
            break;
        }
    }
    return $aTemplates;
}
196
/**
 * Return $aParams[$sKey] when it is set, otherwise $mDefault.
 */
function _templateParam($aParams, $sKey, $mDefault)
{
    return isset($aParams[$sKey]) ? $aParams[$sKey] : $mDefault;
}

/**
 * Read a position from the positional parameters of a {{Coord}} template.
 *
 * Recognised layouts: D|M|S|NS|D|M|S|EW, D|M|NS|D|M|EW, D|NS|D|EW and
 * decimal lat|lon.
 *
 * @return array|null array(lat, lon), or null when no layout matches or the
 *                    matching layout is missing some of its parameters.
 */
function _coordTemplateToLatLon($aParams)
{
    $aNorthSouth = array('N', 'S');

    if (isset($aParams[3]) && in_array(strtoupper($aParams[3]), $aNorthSouth, true)) {
        if (!isset($aParams[0], $aParams[1], $aParams[2], $aParams[4], $aParams[5], $aParams[6], $aParams[7])) return null;
        return array(
            degreesAndMinutesToDecimal($aParams[0], $aParams[1], $aParams[2], $aParams[3]),
            degreesAndMinutesToDecimal($aParams[4], $aParams[5], $aParams[6], $aParams[7]),
        );
    }
    if (isset($aParams[0]) && isset($aParams[1]) && isset($aParams[2]) && in_array(strtoupper($aParams[2]), $aNorthSouth, true)) {
        if (!isset($aParams[3], $aParams[4], $aParams[5])) return null;
        return array(
            degreesAndMinutesToDecimal($aParams[0], $aParams[1], 0, $aParams[2]),
            degreesAndMinutesToDecimal($aParams[3], $aParams[4], 0, $aParams[5]),
        );
    }
    if (isset($aParams[0]) && isset($aParams[1]) && in_array(strtoupper($aParams[1]), $aNorthSouth, true)) {
        if (!isset($aParams[2], $aParams[3])) return null;
        return array(
            (strtoupper($aParams[1]) == 'N'?1:-1) * (float)$aParams[0],
            (strtoupper($aParams[3]) == 'E'?1:-1) * (float)$aParams[2],
        );
    }
    if (isset($aParams[0]) && is_numeric($aParams[0]) && isset($aParams[1]) && is_numeric($aParams[1])) {
        return array((float)$aParams[0], (float)$aParams[1]);
    }
    return null;
}

/**
 * Derive page properties from the templates of one wikipedia page.
 *
 * For each property the first template that supplies it wins.
 *
 * @param array $aTemplates Output of _parseWikipediaContent(): a list of
 *                          array(name, params) with params in reverse order.
 *
 * @return array Any subset of the keys sOfficialName, iPopulation, sWebsite,
 *               sTopLevelDomain, sInfoboxType, fLat and fLon. sInfoboxType is
 *               prefixed with '#' when it was only guessed from the first
 *               template with more than ten parameters.
 */
function _templatesToProperties($aTemplates)
{
    $aPageProperties = array();
    foreach ($aTemplates as $aTemplate) {
        // Restore source order and split "key = value" parameters; parameters
        // without "=" stay positional.
        $aParams = array();
        foreach (array_reverse($aTemplate[1]) as $sParam) {
            $iPos = strpos($sParam, '=');
            if ($iPos === false) {
                $aParams[] = trim($sParam);
            } else {
                $aParams[trim(substr($sParam, 0, $iPos))] = trim(substr($sParam, $iPos+1));
            }
        }

        if (!isset($aPageProperties['sOfficialName']) && !empty($aParams['official_name'])) {
            $aPageProperties['sOfficialName'] = $aParams['official_name'];
        }

        // Population: the first of these parameters that starts with a number
        // is used. Thousands separators are stripped before the int cast.
        foreach (array('population', 'population_total', 'population_urban', 'population_estimate') as $sKey) {
            if (isset($aPageProperties['iPopulation'])) break;
            if (!empty($aParams[$sKey]) && preg_match('#^[0-9.,]+#', $aParams[$sKey])) {
                $aPageProperties['iPopulation'] = (int)str_replace(array(',','.'), '', $aParams[$sKey]);
            }
        }

        if (!isset($aPageProperties['sWebsite']) && !empty($aParams['website'])) {
            // Accept a bare URL or an external link "[url label]".
            if (preg_match('#^\\[?([^ \\]]+)[^\\]]*\\]?$#', $aParams['website'], $aMatch)) {
                $sWebsite = $aMatch[1];
                if (strpos($sWebsite, '://') === false) {
                    $sWebsite = 'http://'.$sWebsite;
                }
                $aPageProperties['sWebsite'] = $sWebsite;
            }
        }

        if (!isset($aPageProperties['sTopLevelDomain']) && !empty($aParams['cctld'])) {
            $aPageProperties['sTopLevelDomain'] = str_replace(array('[',']','.'),'', $aParams['cctld']);
        }

        if (!isset($aPageProperties['sInfoboxType']) && strtolower(substr($aTemplate[0],0,7)) == 'infobox') {
            $aPageProperties['sInfoboxType'] = trim((string)substr($aTemplate[0],8));
        }

        // Assume the first template with lots of params is the type (fallback for infobox)
        if (!isset($aPageProperties['sPossibleInfoboxType']) && count($aParams) > 10) {
            $aPageProperties['sPossibleInfoboxType'] = trim($aTemplate[0]);
        }

        // Position: skipped once an earlier template supplied one. Within one
        // template a later notation overrides an earlier one.
        if (!isset($aPageProperties['fLat'])) {
            if (isset($aParams['latd']) && isset($aParams['longd'])) {
                $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aParams['latd'], _templateParam($aParams, 'latm', 0), _templateParam($aParams, 'lats', 0), _templateParam($aParams, 'latNS', 'N'));
                $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aParams['longd'], _templateParam($aParams, 'longm', 0), _templateParam($aParams, 'longs', 0), _templateParam($aParams, 'longEW', 'E'));
            }
            // Both halves are required; the longitude half used to go unchecked.
            if (isset($aParams['lat_degrees']) && isset($aParams['long_degrees'])) {
                $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aParams['lat_degrees'], _templateParam($aParams, 'lat_minutes', 0), _templateParam($aParams, 'lat_seconds', 0), _templateParam($aParams, 'lat_direction', 'N'));
                $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aParams['long_degrees'], _templateParam($aParams, 'long_minutes', 0), _templateParam($aParams, 'long_seconds', 0), _templateParam($aParams, 'long_direction', 'E'));
            }
            if (isset($aParams['latitude']) && isset($aParams['longitude'])) {
                if (preg_match('#[0-9.]+#', $aParams['latitude']) && preg_match('#[0-9.]+#', $aParams['longitude'])) {
                    $aPageProperties['fLat'] = (float)$aParams['latitude'];
                    $aPageProperties['fLon'] = (float)$aParams['longitude'];
                }
            }
            if (strtolower($aTemplate[0]) == 'coord') {
                $aLatLon = _coordTemplateToLatLon($aParams);
                if ($aLatLon !== null) {
                    $aPageProperties['fLat'] = $aLatLon[0];
                    $aPageProperties['fLon'] = $aLatLon[1];
                }
            }
            if (isset($aParams['Latitude']) && isset($aParams['Longitude'])) {
                // Textual form, either a single value or a "from to" range
                // whose midpoint is taken.
                $sLatitude = str_replace('&nbsp;',' ',$aParams['Latitude']);
                $sLongitude = str_replace('&nbsp;',' ',$aParams['Longitude']);
                $fLat = null;
                $fLon = null;

                if (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([NS]) to ([0-9]+)°( ([0-9]+)′)? ([NS])#', $sLatitude, $aMatch)) {
                    $fLat =
                        (degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4])
                        +degreesAndMinutesToDecimal($aMatch[5], $aMatch[7], 0, $aMatch[8])) / 2;
                } elseif (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([NS])#', $sLatitude, $aMatch)) {
                    $fLat = degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4]);
                }

                if (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([EW]) to ([0-9]+)°( ([0-9]+)′)? ([EW])#', $sLongitude, $aMatch)) {
                    $fLon =
                        (degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4])
                        +degreesAndMinutesToDecimal($aMatch[5], $aMatch[7], 0, $aMatch[8])) / 2;
                } elseif (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([EW])#', $sLongitude, $aMatch)) {
                    $fLon = degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4]);
                }

                // Never publish half a position: callers read fLat and fLon
                // together.
                if ($fLat !== null && $fLon !== null) {
                    $aPageProperties['fLat'] = $fLat;
                    $aPageProperties['fLon'] = $fLon;
                }
            }
        }
    }

    // Fall back to the guessed type, marked with '#', when no infobox was seen.
    if (isset($aPageProperties['sPossibleInfoboxType'])) {
        if (!isset($aPageProperties['sInfoboxType'])) $aPageProperties['sInfoboxType'] = '#'.$aPageProperties['sPossibleInfoboxType'];
        unset($aPageProperties['sPossibleInfoboxType']);
    }
    return $aPageProperties;
}
304
// Parse the stored wikipedia page text and copy infobox type, population,
// website and position into wikipedia_article.
// NOTE(review): the option table declares 'parse-articles' (a bool), but this
// section is keyed on 'parse-wikipedia' and uses its value as the remainder of
// page_id % 10. Confirm which name and type are intended.
if (isset($aCMDResult['parse-wikipedia'])) {
    $oDB =& getDB();

    // The value comes from the command line and goes into SQL: force an int.
    $iShard = (int)$aCMDResult['parse-wikipedia'];
    $aArticleNames = $oDB->getCol('select page_title from content where page_namespace = 0 and page_id %10 = '.$iShard.' and (page_content ilike \'%{{Coord%\' or (page_content ilike \'%lat%\' and page_content ilike \'%lon%\'))');
//      $aArticleNames = $oDB->getCol($sSQL = 'select page_title from content where page_namespace = 0 and (page_content ilike \'%{{Coord%\' or (page_content ilike \'%lat%\' and page_content ilike \'%lon%\')) and page_title in (\'Virginia\')');
    foreach ($aArticleNames as $sArticleName) {
        $sPageText = $oDB->getOne('select page_content from content where page_namespace = 0 and page_title = \''.pg_escape_string($sArticleName).'\'');
        $aP = _templatesToProperties(_parseWikipediaContent($sPageText));

        // Collect "column = 'value'" fragments so one UPDATE per article
        // suffices.
        $aAssignments = array();

        if (isset($aP['sInfoboxType'])) {
            $aP['sInfoboxType'] = preg_replace('#\\s+#',' ',$aP['sInfoboxType']);
            $aAssignments[] = 'infobox_type = \''.pg_escape_string($aP['sInfoboxType']).'\'';
        }
        if (isset($aP['iPopulation'])) {
            $aAssignments[] = 'population = \''.pg_escape_string((string)$aP['iPopulation']).'\'';
        }
        if (isset($aP['sWebsite'])) {
            $aAssignments[] = 'website = \''.pg_escape_string($aP['sWebsite']).'\'';
        }
        // A position of exactly 0/0 is treated as "not set". Both halves must
        // be present before either is read.
        if (isset($aP['fLat']) && isset($aP['fLon']) && ($aP['fLat'] != 0 || $aP['fLon'] != 0)) {
            $sInfoboxType = isset($aP['sInfoboxType']) ? $aP['sInfoboxType'] : '';
            echo $sArticleName.'|'.$sInfoboxType.'|'.$aP['fLat'].'|'.$aP['fLon'] ."\n";
            $aAssignments[] = 'lat = \''.pg_escape_string((string)$aP['fLat']).'\'';
            $aAssignments[] = 'lon = \''.pg_escape_string((string)$aP['fLon']).'\'';
        }

        if ($aAssignments) {
            $sSQL = 'update wikipedia_article set '.join(',', $aAssignments);
            $sSQL .= ' where language = \'en\' and title = \''.pg_escape_string($sArticleName).'\';';
            $oDB->query($sSQL);
        }
    }
}
343
/**
 * Start-element handler for the XML parser reading a Nominatim response.
 *
 * Appends the attribute array of every PLACE element to the global
 * $aNominatRecords list; all other elements are ignored.
 *
 * @param mixed  $hParser Parser handle (unused).
 * @param string $sName   Element name as reported by the parser.
 * @param array  $aAttr   Attributes of the element.
 */
function nominatimXMLStart($hParser, $sName, $aAttr)
{
    global $aNominatRecords;

    if ($sName == 'PLACE') {
        $aNominatRecords[] = $aAttr;
    }
}
353
/**
 * End-element handler for the XML parser reading a Nominatim response.
 *
 * Deliberately does nothing: all data is taken from element attributes in
 * nominatimXMLStart().
 *
 * @param mixed  $hParser Parser handle (unused).
 * @param string $sName   Element name (unused).
 */
function nominatimXMLEnd($hParser, $sName)
{
    return;
}
357
358
/**
 * Build the "&viewbox=" query fragment for a square centred on the article
 * position and reaching $fHalfSize degrees in every direction.
 */
function _wikipediaViewbox($aRecord, $fHalfSize)
{
    return "&viewbox=".($aRecord['lon']-$fHalfSize).",".($aRecord['lat']+$fHalfSize).",".($aRecord['lon']+$fHalfSize).",".($aRecord['lat']-$fHalfSize);
}

/**
 * Build the "&nearlat=...&nearlon=..." query fragment for the article position.
 */
function _wikipediaNearPoint($aRecord)
{
    return "&nearlat=".$aRecord['lat']."&nearlon=".$aRecord['lon'];
}

/**
 * Fetch a Nominatim XML search result and return the attribute arrays of its
 * PLACE elements (collected through nominatimXMLStart()).
 *
 * @param string $sURL Complete search URL.
 *
 * @return array List of attribute arrays; empty when the request fails or
 *               nothing was found.
 */
function _fetchNominatimPlaces($sURL)
{
    global $aNominatRecords;

    var_dump($sURL);
    $aNominatRecords = array();

    $sXML = file_get_contents($sURL);
    if ($sXML === false) {
        echo "-- Request failed\n";
        return array();
    }

    $hXMLParser = xml_parser_create();
    xml_set_element_handler($hXMLParser, 'nominatimXMLStart', 'nominatimXMLEnd');
    xml_parse($hXMLParser, $sXML, true);
    xml_parser_free($hXMLParser);

    return $aNominatRecords;
}

// --link: search Nominatim for every positioned, not yet linked English
// article and store the first result that lies close enough to the article
// position as its OSM object.
if (!empty($aCMDResult['link'])) {
    $oDB =& getDB();
    $aWikiArticles = $oDB->getAll("select * from wikipedia_article where language = 'en' and lat is not null and osm_type is null and totalcount < 31 order by importance desc limit 200000");

    // If you point this script at production OSM you will be blocked
    $sNominatimBaseURL = 'http://SEVERNAME/search.php';

    // Nominatim osm_type attribute => one-letter code stored in the table.
    $aOsmTypeCodes = array('relation' => 'R', 'way' => 'W', 'node' => 'N');

    foreach ($aWikiArticles as $aRecord) {
        $aRecord['name'] = str_replace('_',' ',$aRecord['title']);

        $sURL = $sNominatimBaseURL.'?format=xml&accept-language=en';

        echo "\n-- ".$aRecord['name'].", ".$aRecord['infobox_type']."\n";

        // The infobox type decides how the search is restricted and how far
        // (in degrees) a result may lie from the article position.
        $fMaxDist = 0.0000001;
        $bUnknown = false;
        switch (strtolower((string)$aRecord['infobox_type'])) {
        case 'former country':
            continue 2;
        case 'sea':
            $fMaxDist = 60; // effectively turn it off
            $sURL .= _wikipediaViewbox($aRecord, $fMaxDist);
            break;
        case 'country':
        case 'island':
        case 'islands':
        case 'continent':
            $fMaxDist = 60; // effectively turn it off
            $sURL .= "&featuretype=country";
            $sURL .= _wikipediaViewbox($aRecord, $fMaxDist);
            break;
        case 'prefecture japan':
            $aRecord['name'] = trim(str_replace(' Prefecture',' ', $aRecord['name']));
            // intentional fall through: handled like a state
        case 'state':
        case '#us state':
        case 'county':
        case 'u.s. state':
        case 'u.s. state symbols':
        case 'german state':
        case 'province or territory of canada':
        case 'indian jurisdiction':
        case 'province':
        case 'french region':
        case 'region of italy':
        case 'kommune':
        case '#australia state or territory':
        case 'russian federal subject':
            $fMaxDist = 4;
            $sURL .= "&featuretype=state";
            $sURL .= _wikipediaViewbox($aRecord, $fMaxDist);
            break;
        case 'protected area':
            $fMaxDist = 1;
            $sURL .= _wikipediaNearPoint($aRecord);
            $sURL .= _wikipediaViewbox($aRecord, $fMaxDist);
            break;
        case 'settlement':
            $bUnknown = true;
            // intentional fall through: searched like a city, but the
            // distance limit is taken from the rank of the result
        case 'french commune':
        case 'italian comune':
        case 'uk place':
        case 'australian place':
        case 'german place':
        case '#geobox':
        case 'u.s. county':
        case 'municipality':
        case 'city japan':
        case 'russian inhabited locality':
        case 'finnish municipality/land area':
        case 'england county':
        case 'israel municipality':
        case 'russian city':
        case 'city':
            $fMaxDist = 0.2;
            $sURL .= "&featuretype=settlement";
            $sURL .= _wikipediaViewbox($aRecord, 0.5);
            break;
        case 'mountain':
        case 'mountain pass':
        case 'river':
        case 'lake':
        case 'airport':
            $fMaxDist = 0.2;
            $sURL .= _wikipediaViewbox($aRecord, 0.5);
            // Previously fell through into 'ship begin', adding a second,
            // much smaller viewbox and shrinking the distance limit.
            break;
        case 'ship begin':
            $fMaxDist = 0.1;
            $sURL .= _wikipediaViewbox($aRecord, 0.01);
            $sURL .= _wikipediaNearPoint($aRecord);
            break;
        case 'road':
        case 'university':
        case 'company':
        case 'department':
            $fMaxDist = 0.005;
            $sURL .= _wikipediaViewbox($aRecord, 0.01);
            $sURL .= "&bounded=1";
            $sURL .= _wikipediaNearPoint($aRecord);
            break;
        default:
            $bUnknown = true;
            $fMaxDist = 0.005;
            $sURL .= _wikipediaViewbox($aRecord, 0.01);
            $sURL .= _wikipediaNearPoint($aRecord);
            echo "-- Unknown: ".$aRecord['infobox_type']."\n";
            break;
        }

        $aPlaces = _fetchNominatimPlaces($sURL.'&q='.urlencode($aRecord['name']));
        if (!isset($aPlaces[0])) {
            // Nothing found: retry with the part of the title in front of the
            // first "(" or ",", if there is one.
            $aNameParts = preg_split('#[(,]#',$aRecord['name']);
            if (count($aNameParts) > 1) {
                $aPlaces = _fetchNominatimPlaces($sURL.'&q='.urlencode(trim($aNameParts[0])));
            }
        }

        // assume first is best/right
        foreach ($aPlaces as $aPlace) {
            if (!isset($aPlace['LAT'], $aPlace['LON'])) continue;

            // Plain euclidean distance in degrees.
            $fLatDiff = $aRecord['lat'] - $aPlace['LAT'];
            $fLonDiff = $aRecord['lon'] - $aPlace['LON'];
            $fDiff = sqrt($fLatDiff * $fLatDiff + $fLonDiff * $fLonDiff);

            if ($bUnknown) {
                // If it was an unknown type base it on the rank of the found result
                $iRank = (int)$aPlace['PLACE_RANK'];
                if ($iRank <= 4) $fMaxDist = 2;
                elseif ($iRank <= 8) $fMaxDist = 1;
                elseif ($iRank <= 10) $fMaxDist = 0.8;
                elseif ($iRank <= 12) $fMaxDist = 0.6;
                elseif ($iRank <= 17) $fMaxDist = 0.2;
                elseif ($iRank <= 18) $fMaxDist = 0.1;
                elseif ($iRank <= 22) $fMaxDist = 0.02;
                else $fMaxDist = 0.001;
            }
            echo "-- FOUND \"".substr($aPlace['DISPLAY_NAME'],0,50)."\", ".$aPlace['CLASS'].", ".$aPlace['TYPE'].", ".$aPlace['PLACE_RANK'].", ".$aPlace['OSM_TYPE']." (dist:$fDiff, max:$fMaxDist)\n";

            if ($fDiff > $fMaxDist) {
                echo "-- Diff too big $fDiff (max: $fMaxDist)".$aRecord['lat'].','.$aPlace['LAT'].' & '.$aRecord['lon'].','.$aPlace['LON']." \n";
                continue;
            }

            // The response comes from a remote service: only accept a known
            // object type and a purely numeric id before building SQL.
            if (!isset($aPlace['OSM_TYPE'], $aPlace['OSM_ID'])
                || !isset($aOsmTypeCodes[$aPlace['OSM_TYPE']])
                || !preg_match('#^-?[0-9]+$#', $aPlace['OSM_ID'])
            ) {
                echo "-- Skipping result without usable OSM type/id\n";
                continue;
            }

            $sSQL = "update wikipedia_article set osm_type='".$aOsmTypeCodes[$aPlace['OSM_TYPE']]."'";
            $sSQL .= ", osm_id=".$aPlace['OSM_ID']." where language = '".pg_escape_string($aRecord['language'])."' and title = '".pg_escape_string($aRecord['title'])."'";
            $oDB->query($sSQL);
            break;
        }
    }
}