]> git.openstreetmap.org Git - nominatim.git/blob - utils/importWikipedia.php
first draft of wikipedia article/importance import script
[nominatim.git] / utils / importWikipedia.php
1 #!/usr/bin/php -Cq
2 <?php
3
// Bootstrap the shared command-line environment (provides getCmdOpt(), getDB(), ...)
require_once(dirname(dirname(__FILE__)).'/lib/init-cmd.php');
// Raise the memory ceiling for this run
ini_set('memory_limit', '800M');

// Option table handed to getCmdOpt(): a description string followed by one row per option.
// NOTE(review): the description is presumably what getCmdOpt() prints as the help banner;
// it used to be the text of the setup script and did not describe this tool - confirm
// against lib/cmd.php.
$aCMDOptions = array(
    "Import wikipedia article data and link articles to OSM objects",
    array('help', 'h', 0, 1, 0, 0, false, 'Show Help'),
    array('quiet', 'q', 0, 1, 0, 0, 'bool', 'Quiet output'),
    array('verbose', 'v', 0, 1, 0, 0, 'bool', 'Verbose output'),

    array('create-tables', '', 0, 1, 0, 0, 'bool', 'Create wikipedia tables'),
    array('parse-articles', '', 0, 1, 0, 0, 'bool', 'Parse wikipedia articles'),
    array('link', '', 0, 1, 0, 0, 'bool', 'Try to link to existing OSM ids'),
);
// Parsed options end up in $aCMDResult, which every section below consults
getCmdOpt($_SERVER['argv'], $aCMDOptions, $aCMDResult, true, true);
18
19 /*
20 $sTestPageText = <<<EOD
21 {{Coord|47|N|2|E|type:country_region:FR|display=title}}
22 {{ Infobox Amusement park
23 | name = Six Flags Great Adventure
24 | image = [[File:SixFlagsGreatAdventure logo.png]]
25 | caption = Six Flags Great Adventure logo
26 | location = [[Jackson, New Jersey|Jackson]]
27 | location2 = New Jersey
28 | location3 = United States
29 | address = 1 Six Flags Boulevard<ref name="drivedir"/>
30 | season = March/April through October/November
31 | opening_date = July 1, 1974
32 | previous_names = Great Adventure
33 | area_acre = 2200
34 | rides = 45 park admission rides
35 | coasters = 12
36 | water_rides = 2
37 | owner = [[Six Flags]]
38 | general_manager = 
39 | homepage = [http://www.sixflags.com/parks/greatadventure/ Six Flags Great Adventure]
40 }}
41 EOD;
42 var_dump(_templatesToProperties(_parseWikipediaContent($sTestPageText)));
43 exit;
44 //| coordinates = {{Coord|40|08|16.65|N|74|26|26.69|W|region:US-NJ_type:landmark|display=inline,title}}
45 */
46 /*
47
48         $a = array();
49         $a[] = 'test';
50
51         $oDB &= getDB();
52
53         if ($aCMDResult['drop-tables'])
54         {
55                 $oDB->query('DROP TABLE wikipedia_article');
56                 $oDB->query('DROP TABLE wikipedia_link');
57         }
58 */
59
// --create-tables: create the two tables this script fills and reads.
if (!empty($aCMDResult['create-tables']))
{
    // The database handle was never initialised on this path (the only earlier
    // getDB() call sits inside a comment), so fetch it the same way the other
    // sections of this script do.
    $oDB =& getDB();

    // One row per wikipedia article. The nowdoc terminator below was missing,
    // which made the string swallow the PHP that follows it.
    $sSQL = <<<'EOD'
CREATE TABLE wikipedia_article (
    language text NOT NULL,
    title text NOT NULL,
    langcount integer,
    othercount integer,
    totalcount integer,
    lat double precision,
    lon double precision,
    importance double precision,
    title_en text,
    osm_type character(1),
    osm_id bigint,
    infobox_type text,
    population bigint,
    website text
);
EOD;
    $oDB->query($sSQL);

    // Adds a 2D geometry column named 'location' with SRID 4326 to the article table
    $oDB->query("SELECT AddGeometryColumn('wikipedia_article', 'location', 4326, 'GEOMETRY', 2)");

    // Link table: an integer from_id paired with a target name
    $sSQL = <<<'EOD'
CREATE TABLE wikipedia_link (
  from_id INTEGER,
  to_name text
  );
EOD;
    $oDB->query($sSQL);
}
91
/**
 * Convert a degrees / minutes / seconds coordinate to signed decimal degrees.
 *
 * Values are cast to float, so numeric strings are accepted and missing
 * (null) or non-numeric minutes/seconds count as zero.
 *
 * @param mixed $iDegrees Whole degrees; may be negative (also as the string "-0").
 * @param mixed $iMinutes Arc minutes.
 * @param mixed $fSeconds Arc seconds.
 * @param mixed $sNSEW    Direction letter, case-insensitive; 'S' and 'W' negate
 *                        the result, anything else (including null) does not.
 *
 * @return float Decimal degrees.
 */
function degreesAndMinutesToDecimal($iDegrees, $iMinutes=0, $fSeconds=0, $sNSEW='N')
{
    // Callers pass null for absent direction parameters, hence the string cast
    $sDirection = strtoupper(trim((string)$sNSEW));
    $iSign = ($sDirection == 'S' || $sDirection == 'W') ? -1 : 1;

    // The sign of the degrees belongs to the whole coordinate: -10 deg 30 min is -10.5,
    // not -9.5. "-0" needs the string test because (float)"-0" is not less than zero.
    $fDegrees = (float)$iDegrees;
    if ($fDegrees < 0 || (is_string($iDegrees) && substr(ltrim($iDegrees), 0, 1) == '-'))
    {
        $iSign = -$iSign;
    }

    return $iSign * (abs($fDegrees) + (float)$iMinutes/60 + (float)$fSeconds/3600);
}
97
/**
 * Extract every completed {{template}} from a piece of wikitext.
 *
 * The text is split on the tokens "{{", "}}", "[[", "]]" and "|" and fed through
 * a small state machine. Wiki links are reduced to their label (or page name when
 * there is no label; the file name for "Image:" links) and that label is appended
 * to whatever encloses the link.
 *
 * @param string $sPageText Raw wikitext of one page.
 *
 * @return array List of array(name, params) in the order the templates were closed.
 *               params holds the raw parameter strings, LAST parameter first.
 *               Templates that are never closed are not returned.
 */
function _parseWikipediaContent($sPageText)
{
    // Flatten to a single line, then drop HTML comments and <math> sections
    $sPageText = str_replace("\n", ' ', $sPageText);
    foreach (array('#<!--.*?-->#m', '#<math>.*?<\\/math>#m') as $sPattern)
    {
        // preg_replace() yields null on a PCRE error; keep the unstripped text then
        $sStripped = preg_replace($sPattern, '', $sPageText);
        if ($sStripped !== null) $sPageText = $sStripped;
    }

    $aPageText = preg_split('#({{|}}|\\[\\[|\\]\\]|[|])#', $sPageText, -1, PREG_SPLIT_DELIM_CAPTURE);
    if ($aPageText === false) return array();

    $aTemplates = array();

    // All three stacks keep their innermost entry at index 0
    $aTemplateStack = array();  // array(name, params newest-first)
    $aLinkStack = array();      // array(page, label)
    $aState = array('body');

    foreach ($aPageText as $sPart)
    {
        // Text this token contributes to the enclosing context, null for none
        $sText = null;

        switch ($sPart)
        {
        case '{{':
            array_unshift($aTemplateStack, array('', array()));
            array_unshift($aState, 'template');
            break;
        case '}}':
            // A stray "}}" outside a template is dropped
            if ($aState[0] == 'template' || $aState[0] == 'templateparam')
            {
                $aTemplates[] = array_shift($aTemplateStack);
                array_shift($aState);
            }
            break;
        case '[[':
            // Each link gets its own buffers so that a nested link cannot
            // overwrite what was collected for the outer one
            array_unshift($aLinkStack, array('', ''));
            array_unshift($aState, 'link');
            break;
        case ']]':
            if ($aState[0] == 'link' || $aState[0] == 'linksynonim')
            {
                list($sLinkPage, $sLinkSyn) = array_shift($aLinkStack);
                array_shift($aState);

                if (!$sLinkSyn) $sLinkSyn = $sLinkPage;
                if (substr($sLinkPage, 0, 6) == 'Image:') $sLinkSyn = substr($sLinkPage, 6);

                // Hand the label (not the "]]" token) to the enclosing context
                $sText = $sLinkSyn;
            }
            break;
        case '|':
            if ($aState[0] == 'template' || $aState[0] == 'templateparam')
            {
                // Create a new template parameter
                $aState[0] = 'templateparam';
                array_unshift($aTemplateStack[0][1], '');
            }
            elseif ($aState[0] == 'link')
            {
                $aState[0] = 'linksynonim';
            }
            break;
        default:
            $sText = $sPart;
            break;
        }

        if ($sText === null) continue;

        switch ($aState[0])
        {
        case 'template':
            $aTemplateStack[0][0] .= trim($sText);
            break;
        case 'templateparam':
            $aTemplateStack[0][1][0] .= $sText;
            break;
        case 'link':
            $aLinkStack[0][0] .= trim($sText);
            break;
        case 'linksynonim':
            $aLinkStack[0][1] .= $sText;
            break;
        case 'body':
            // Body text is not part of the return value, so it is not collected
            break;
        default:
            var_dump($aState, $aPageText);
            fail('unknown state');
        }
    }
    return $aTemplates;
}
204
/**
 * Read one degrees/minutes/seconds/direction coordinate from named parameters.
 * Only the degrees key has to exist; missing minutes and seconds count as 0 and
 * a missing direction leaves the sign unchanged.
 */
function _wikiParamsDMSToDecimal($aParams, $sDegKey, $sMinKey, $sSecKey, $sDirKey)
{
    return degreesAndMinutesToDecimal(
        $aParams[$sDegKey],
        isset($aParams[$sMinKey]) ? $aParams[$sMinKey] : 0,
        isset($aParams[$sSecKey]) ? $aParams[$sSecKey] : 0,
        isset($aParams[$sDirKey]) ? $aParams[$sDirKey] : ''
    );
}

/**
 * Decode the positional parameters of a "coord" template.
 * Returns array(lat, lon), or null when the parameters are incomplete or not recognised.
 */
function _wikiCoordTemplateToLatLon($aParams)
{
    // The position of the N/S letter tells how many numbers precede it:
    // 3 = deg|min|sec|N, 2 = deg|min|N, 1 = deg|N
    $iHemisphere = 0;
    foreach (array(3, 2, 1) as $iPos)
    {
        if (isset($aParams[$iPos]) && in_array(strtoupper($aParams[$iPos]), array('N', 'S'), true))
        {
            $iHemisphere = $iPos;
            break;
        }
    }

    if ($iHemisphere == 0)
    {
        // Plain signed decimals: lat|lon
        if (isset($aParams[0]) && is_numeric($aParams[0]) && isset($aParams[1]) && is_numeric($aParams[1]))
        {
            return array((float)$aParams[0], (float)$aParams[1]);
        }
        return null;
    }

    // The longitude must be present in the same layout as the latitude
    $iNeeded = 2 * ($iHemisphere + 1);
    for ($i = 0; $i < $iNeeded; $i++)
    {
        if (!isset($aParams[$i])) return null;
    }

    if ($iHemisphere == 3)
    {
        return array(
            degreesAndMinutesToDecimal($aParams[0], $aParams[1], $aParams[2], $aParams[3]),
            degreesAndMinutesToDecimal($aParams[4], $aParams[5], $aParams[6], $aParams[7]),
        );
    }
    if ($iHemisphere == 2)
    {
        return array(
            degreesAndMinutesToDecimal($aParams[0], $aParams[1], 0, $aParams[2]),
            degreesAndMinutesToDecimal($aParams[3], $aParams[4], 0, $aParams[5]),
        );
    }
    return array(
        (strtoupper($aParams[1]) == 'N'?1:-1) * (float)$aParams[0],
        (strtoupper($aParams[3]) == 'E'?1:-1) * (float)$aParams[2],
    );
}

/**
 * Parse a textual coordinate: either a single "D° M′ H" value or a range
 * "D° M′ H to D° M′ H" (minutes optional), H being one of the letters in
 * $sHemispheres. A range yields the midpoint. Returns null when nothing matches.
 */
function _wikiDegreeRangeToDecimal($sValue, $sHemispheres)
{
    $sPoint = '([0-9]+)°( ([0-9]+)′)? (['.$sHemispheres.'])';
    if (preg_match('#^'.$sPoint.' to '.$sPoint.'#', $sValue, $aMatch))
    {
        return (degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4])
            +degreesAndMinutesToDecimal($aMatch[5], $aMatch[7], 0, $aMatch[8])) / 2;
    }
    if (preg_match('#^'.$sPoint.'#', $sValue, $aMatch))
    {
        return degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4]);
    }
    return null;
}

/**
 * Condense the templates of one article into a flat property array.
 *
 * For every property the first template that provides it wins. Possible keys:
 * sOfficialName, iPopulation, sWebsite, sTopLevelDomain, sInfoboxType
 * (prefixed with '#' when guessed from a non-infobox template with more than
 * ten parameters), fLat and fLon. fLat and fLon are always set together.
 *
 * @param array $aTemplates Output of _parseWikipediaContent(): a list of
 *                          array(name, params) with params stored last-first.
 *
 * @return array Property array; keys without a value are absent.
 */
function _templatesToProperties($aTemplates)
{
    // Population parameters in order of preference
    $aPopulationKeys = array('population', 'population_total', 'population_urban', 'population_estimate');

    $aPageProperties = array();
    foreach ($aTemplates as $aTemplate)
    {
        // Restore source order and split "key = value" parameters; parameters
        // without '=' become positional entries
        $aParams = array();
        foreach (array_reverse($aTemplate[1]) as $sParam)
        {
            $iPos = strpos($sParam, '=');
            if ($iPos === false)
            {
                $aParams[] = trim($sParam);
            }
            else
            {
                $aParams[trim(substr($sParam, 0, $iPos))] = trim(substr($sParam, $iPos+1));
            }
        }

        if (!isset($aPageProperties['sOfficialName']) && !empty($aParams['official_name']))
        {
            $aPageProperties['sOfficialName'] = $aParams['official_name'];
        }

        foreach ($aPopulationKeys as $sKey)
        {
            if (isset($aPageProperties['iPopulation'])) break;
            if (!empty($aParams[$sKey]) && preg_match('#^[0-9.,]+#', $aParams[$sKey]))
            {
                // Thousands separators are removed; the cast stops at the first non-digit
                $aPageProperties['iPopulation'] = (int)str_replace(array(',','.'), '', $aParams[$sKey]);
            }
        }

        if (!isset($aPageProperties['sWebsite']) && !empty($aParams['website']))
        {
            // Accepts a bare URL or the "[url label]" external link form
            if (preg_match('#^\\[?([^ \\]]+)[^\\]]*\\]?$#', $aParams['website'], $aMatch))
            {
                $aPageProperties['sWebsite'] = $aMatch[1];
                if (strpos($aPageProperties['sWebsite'], '://') === false)
                {
                    $aPageProperties['sWebsite'] = 'http://'.$aPageProperties['sWebsite'];
                }
            }
        }

        if (!isset($aPageProperties['sTopLevelDomain']) && !empty($aParams['cctld']))
        {
            $aPageProperties['sTopLevelDomain'] = str_replace(array('[',']','.'),'', $aParams['cctld']);
        }

        if (!isset($aPageProperties['sInfoboxType']) && strtolower(substr($aTemplate[0],0,7)) == 'infobox')
        {
            $aPageProperties['sInfoboxType'] = trim(substr($aTemplate[0],8));
        }

        // Assume the first template with lots of params is the type (fallback for infobox)
        if (!isset($aPageProperties['sPossibleInfoboxType']) && count($aParams) > 10)
        {
            $aPageProperties['sPossibleInfoboxType'] = trim($aTemplate[0]);
        }

        // do we have a lat/lon
        if (!isset($aPageProperties['fLat']))
        {
            if (isset($aParams['latd']) && isset($aParams['longd']))
            {
                $aPageProperties['fLat'] = _wikiParamsDMSToDecimal($aParams, 'latd', 'latm', 'lats', 'latNS');
                $aPageProperties['fLon'] = _wikiParamsDMSToDecimal($aParams, 'longd', 'longm', 'longs', 'longEW');
            }
            // Both halves are required; the longitude used to go unchecked
            if (isset($aParams['lat_degrees']) && isset($aParams['long_degrees']))
            {
                $aPageProperties['fLat'] = _wikiParamsDMSToDecimal($aParams, 'lat_degrees', 'lat_minutes', 'lat_seconds', 'lat_direction');
                $aPageProperties['fLon'] = _wikiParamsDMSToDecimal($aParams, 'long_degrees', 'long_minutes', 'long_seconds', 'long_direction');
            }
            if (isset($aParams['latitude']) && isset($aParams['longitude']))
            {
                if (preg_match('#[0-9.]+#', $aParams['latitude']) && preg_match('#[0-9.]+#', $aParams['longitude']))
                {
                    $aPageProperties['fLat'] = (float)$aParams['latitude'];
                    $aPageProperties['fLon'] = (float)$aParams['longitude'];
                }
            }
            if (strtolower($aTemplate[0]) == 'coord')
            {
                $aLatLon = _wikiCoordTemplateToLatLon($aParams);
                if ($aLatLon !== null)
                {
                    list($aPageProperties['fLat'], $aPageProperties['fLon']) = $aLatLon;
                }
            }
            if (isset($aParams['Latitude']) && isset($aParams['Longitude']))
            {
                $fLat = _wikiDegreeRangeToDecimal(str_replace('&nbsp;',' ',$aParams['Latitude']), 'NS');
                $fLon = _wikiDegreeRangeToDecimal(str_replace('&nbsp;',' ',$aParams['Longitude']), 'EW');
                // Half a coordinate is of no use to the caller, so store both or neither
                if ($fLat !== null && $fLon !== null)
                {
                    $aPageProperties['fLat'] = $fLat;
                    $aPageProperties['fLon'] = $fLon;
                }
            }
        }
    }

    if (isset($aPageProperties['sPossibleInfoboxType']))
    {
        // '#' marks a type that was guessed rather than taken from an infobox template
        if (!isset($aPageProperties['sInfoboxType']))
        {
            $aPageProperties['sInfoboxType'] = '#'.$aPageProperties['sPossibleInfoboxType'];
        }
        unset($aPageProperties['sPossibleInfoboxType']);
    }
    return $aPageProperties;
}
348
// Parse article text and store what the templates reveal in wikipedia_article.
//
// This section used to be keyed only on 'parse-wikipedia', which the option
// table above never declares, so the declared --parse-articles flag did nothing.
// Both are honoured now: --parse-articles processes every candidate article,
// while a 'parse-wikipedia' value (if one is ever supplied) still restricts the
// run to the articles whose page_id modulo 10 equals that value.
$bParseAll = !empty($aCMDResult['parse-articles']);
$bParseShard = isset($aCMDResult['parse-wikipedia']);
if ($bParseAll || $bParseShard)
{
    $oDB =& getDB();

    $sSQL = 'select page_title from content where page_namespace = 0';
    if ($bParseShard)
    {
        // Cast so that nothing but a number can reach the statement
        $sSQL .= ' and page_id %10 = '.(int)$aCMDResult['parse-wikipedia'];
    }
    // Only pages that look like they carry coordinates are worth parsing
    $sSQL .= ' and (page_content ilike \'%{{Coord%\' or (page_content ilike \'%lat%\' and page_content ilike \'%lon%\'))';
    $aArticleNames = $oDB->getCol($sSQL);

    foreach($aArticleNames as $sArticleName)
    {
        $sTitleSQL = '\''.pg_escape_string($sArticleName).'\'';

        $sPageText = $oDB->getOne('select page_content from content where page_namespace = 0 and page_title = '.$sTitleSQL);
        $aP = _templatesToProperties(_parseWikipediaContent($sPageText));

        // Collect the column assignments and write them with a single UPDATE
        $aSet = array();
        if (isset($aP['sInfoboxType']))
        {
            $aP['sInfoboxType'] = preg_replace('#\\s+#',' ',$aP['sInfoboxType']);
            $aSet[] = 'infobox_type = \''.pg_escape_string($aP['sInfoboxType']).'\'';
        }
        if (isset($aP['iPopulation']))
        {
            $aSet[] = 'population = '.(int)$aP['iPopulation'];
        }
        if (isset($aP['sWebsite']))
        {
            $aSet[] = 'website = \''.pg_escape_string($aP['sWebsite']).'\'';
        }
        // Both halves must exist; the pair (0, 0) is skipped exactly as the
        // original comparison against '-0' skipped it
        if (isset($aP['fLat']) && isset($aP['fLon']) && ($aP['fLat'] != 0 || $aP['fLon'] != 0))
        {
            $sInfoboxType = isset($aP['sInfoboxType']) ? $aP['sInfoboxType'] : '';
            echo $sArticleName.'|'.$sInfoboxType.'|'.$aP['fLat'].'|'.$aP['fLon'] ."\n";
            $aSet[] = 'lat = \''.pg_escape_string((string)$aP['fLat']).'\'';
            $aSet[] = 'lon = \''.pg_escape_string((string)$aP['fLon']).'\'';
        }

        if ($aSet)
        {
            $sSQL = 'update wikipedia_article set '.join(',', $aSet);
            $sSQL .= ' where language = \'en\' and title = '.$sTitleSQL.';';
            $oDB->query($sSQL);
        }
    }
}
393
/**
 * Start-of-element callback registered with xml_set_element_handler().
 *
 * Appends the attribute array of every PLACE element to the global
 * $aNominatRecords list; all other elements are ignored.
 *
 * @param mixed  $hParser The XML parser invoking the callback (unused).
 * @param string $sName   Element name as reported by the parser.
 * @param array  $aAttr   Attributes of the element.
 */
function nominatimXMLStart($hParser, $sName, $aAttr)
{
    global $aNominatRecords;

    if ($sName == 'PLACE')
    {
        $aNominatRecords[] = $aAttr;
    }
}
404
/**
 * End-of-element callback registered with xml_set_element_handler().
 *
 * Intentionally does nothing: the records are collected entirely by the
 * start-of-element callback.
 *
 * @param mixed  $hParser The XML parser invoking the callback (unused).
 * @param string $sName   Name of the element being closed (unused).
 */
function nominatimXMLEnd($hParser, $sName)
{
    // nothing to do when an element closes
}
408
409
410         if (isset($aCMDResult['link']))
411         {
412                 $oDB =& getDB();
413                 $aWikiArticles = $oDB->getAll("select * from wikipedia_article where language = 'en' and lat is not null and osm_type is null and totalcount < 31 order by importance desc limit 200000");
414
415                 // If you point this script at production OSM you will be blocked
416                 $sNominatimBaseURL = 'http://SEVERNAME/search.php';
417
418                 foreach($aWikiArticles as $aRecord)
419                 {
420                         $aRecord['name'] = str_replace('_',' ',$aRecord['title']);
421
422                         $sURL = $sNominatimBaseURL.'?format=xml&accept-language=en';
423
424                         echo "\n-- ".$aRecord['name'].", ".$aRecord['infobox_type']."\n";
425                         $fMaxDist = 0.0000001;
426                         $bUnknown = false;
427                         switch(strtolower($aRecord['infobox_type']))
428                         {
429                         case 'former country':
430                                 continue 2;
431                         case 'sea':
432                                 $fMaxDist = 60; // effectively turn it off
433                                 $sURL .= "&viewbox=".($aRecord['lon']-$fMaxDist).",".($aRecord['lat']+$fMaxDist).",".($aRecord['lon']+$fMaxDist).",".($aRecord['lat']-$fMaxDist);
434                                 break;
435                         case 'country':
436                         case 'island':
437                         case 'islands':
438                         case 'continent':
439                                 $fMaxDist = 60; // effectively turn it off
440                                 $sURL .= "&featuretype=country";
441                                 $sURL .= "&viewbox=".($aRecord['lon']-$fMaxDist).",".($aRecord['lat']+$fMaxDist).",".($aRecord['lon']+$fMaxDist).",".($aRecord['lat']-$fMaxDist);
442                                 break;
443                         case 'prefecture japan':
444                                 $aRecord['name'] = trim(str_replace(' Prefecture',' ', $aRecord['name']));
445                         case 'state':
446                         case '#us state':
447                         case 'county':
448                         case 'u.s. state':
449                         case 'u.s. state symbols':
450                         case 'german state':
451                         case 'province or territory of canada';
452                         case 'indian jurisdiction';
453                         case 'province';
454                         case 'french region':
455                         case 'region of italy':
456                         case 'kommune':
457                         case '#australia state or territory':
458                         case 'russian federal subject':
459                                 $fMaxDist = 4;
460                                 $sURL .= "&featuretype=state";
461                                 $sURL .= "&viewbox=".($aRecord['lon']-$fMaxDist).",".($aRecord['lat']+$fMaxDist).",".($aRecord['lon']+$fMaxDist).",".($aRecord['lat']-$fMaxDist);
462                                 break;
463                         case 'protected area':
464                                 $fMaxDist = 1;
465                                 $sURL .= "&nearlat=".$aRecord['lat'];
466                                 $sURL .= "&nearlon=".$aRecord['lon'];
467                                 $sURL .= "&viewbox=".($aRecord['lon']-$fMaxDist).",".($aRecord['lat']+$fMaxDist).",".($aRecord['lon']+$fMaxDist).",".($aRecord['lat']-$fMaxDist);
468                                 break;
469                         case 'settlement':
470                                 $bUnknown = true;
471                         case 'french commune':
472                         case 'italian comune':
473                         case 'uk place':
474                         case 'italian comune':
475                         case 'australian place':
476                         case 'german place':
477                         case '#geobox':
478                         case 'u.s. county':
479                         case 'municipality':
480                         case 'city japan':
481                         case 'russian inhabited locality':
482                         case 'finnish municipality/land area':
483                         case 'england county':
484                         case 'israel municipality':
485                         case 'russian city':
486                         case 'city':
487                                 $fMaxDist = 0.2;
488                                 $sURL .= "&featuretype=settlement";
489                                 $sURL .= "&viewbox=".($aRecord['lon']-0.5).",".($aRecord['lat']+0.5).",".($aRecord['lon']+0.5).",".($aRecord['lat']-0.5);
490                                 break;
491                         case 'mountain':
492                         case 'mountain pass':
493                         case 'river':
494                         case 'lake':
495                         case 'airport':
496                                 $fMaxDist = 0.2;
497                                 $sURL .= "&viewbox=".($aRecord['lon']-0.5).",".($aRecord['lat']+0.5).",".($aRecord['lon']+0.5).",".($aRecord['lat']-0.5);
498
499                         case 'ship begin':
500                                 $fMaxDist = 0.1;
501                                 $aTypes = array('wreck');
502                                 $sURL .= "&viewbox=".($aRecord['lon']-0.01).",".($aRecord['lat']+0.01).",".($aRecord['lon']+0.01).",".($aRecord['lat']-0.01);
503                                 $sURL .= "&nearlat=".$aRecord['lat'];
504                                 $sURL .= "&nearlon=".$aRecord['lon'];
505                                 break;
506                         case 'road':
507                         case 'university':
508                         case 'company':
509                         case 'department':
510                                 $fMaxDist = 0.005;
511                                 $sURL .= "&viewbox=".($aRecord['lon']-0.01).",".($aRecord['lat']+0.01).",".($aRecord['lon']+0.01).",".($aRecord['lat']-0.01);
512                                 $sURL .= "&bounded=1";
513                                 $sURL .= "&nearlat=".$aRecord['lat'];
514                                 $sURL .= "&nearlon=".$aRecord['lon'];
515                                 break;
516                         default:
517                                 $bUnknown = true;
518                                 $fMaxDist = 0.005;
519                                 $sURL .= "&viewbox=".($aRecord['lon']-0.01).",".($aRecord['lat']+0.01).",".($aRecord['lon']+0.01).",".($aRecord['lat']-0.01);
520 //                              $sURL .= "&bounded=1";
521                                 $sURL .= "&nearlat=".$aRecord['lat'];
522                                 $sURL .= "&nearlon=".$aRecord['lon'];
523                                 echo "-- Unknown: ".$aRecord['infobox_type']."\n";
524                                 break;
525                         }
526                         $sNameURL = $sURL.'&q='.urlencode($aRecord['name']);
527
528                         var_Dump($sNameURL);
529                         $sXML = file_get_contents($sNameURL);
530
531                         $aNominatRecords = array();
532                         $hXMLParser = xml_parser_create();
533                         xml_set_element_handler($hXMLParser, 'nominatimXMLStart', 'nominatimXMLEnd');
534                         xml_parse($hXMLParser, $sXML, true);
535                         xml_parser_free($hXMLParser);
536
537                         if (!isset($aNominatRecords[0]))
538                         {
539                                 $aNameParts = preg_split('#[(,]#',$aRecord['name']);
540                                 if (sizeof($aNameParts) > 1)
541                                 {
542                                         $sNameURL = $sURL.'&q='.urlencode(trim($aNameParts[0]));
543                                         var_Dump($sNameURL);
544                                         $sXML = file_get_contents($sNameURL);
545
546                                         $aNominatRecords = array();
547                                         $hXMLParser = xml_parser_create();
548                                         xml_set_element_handler($hXMLParser, 'nominatimXMLStart', 'nominatimXMLEnd');
549                                         xml_parse($hXMLParser, $sXML, true);
550                                         xml_parser_free($hXMLParser);#
551                                 }
552                         }
553
554                         // assume first is best/right
555                         for($i = 0; $i < sizeof($aNominatRecords); $i++)
556                         {
557                                 $fDiff = ($aRecord['lat']-$aNominatRecords[$i]['LAT']) * ($aRecord['lat']-$aNominatRecords[$i]['LAT']);
558                                 $fDiff += ($aRecord['lon']-$aNominatRecords[$i]['LON']) * ($aRecord['lon']-$aNominatRecords[$i]['LON']);
559                                 $fDiff = sqrt($fDiff);
560                                 if ($bUnknown) {
561                                         // If it was an unknown type base it on the rank of the found result
562                                         $iRank = (int)$aNominatRecords[$i]['PLACE_RANK'];
563                                         if ($iRank <= 4) $fMaxDist = 2;
564                                         elseif ($iRank <= 8) $fMaxDist = 1;
565                                         elseif ($iRank <= 10) $fMaxDist = 0.8;
566                                         elseif ($iRank <= 12) $fMaxDist = 0.6;
567                                         elseif ($iRank <= 17) $fMaxDist = 0.2;
568                                         elseif ($iRank <= 18) $fMaxDist = 0.1;
569                                         elseif ($iRank <= 22) $fMaxDist = 0.02;
570                                         elseif ($iRank <= 26) $fMaxDist = 0.001;
571                                         else $fMaxDist = 0.001;
572                                 }
573                                 echo "-- FOUND \"".substr($aNominatRecords[$i]['DISPLAY_NAME'],0,50)."\", ".$aNominatRecords[$i]['CLASS'].", ".$aNominatRecords[$i]['TYPE'].", ".$aNominatRecords[$i]['PLACE_RANK'].", ".$aNominatRecords[$i]['OSM_TYPE']." (dist:$fDiff, max:$fMaxDist)\n";
574                                 if ($fDiff > $fMaxDist)
575                                 {
576                                         echo "-- Diff too big $fDiff (max: $fMaxDist)".$aRecord['lat'].','.$aNominatRecords[$i]['LAT'].' & '.$aRecord['lon'].','.$aNominatRecords[$i]['LON']." \n";
577                                 }
578                                 else
579                                 {
580                                         $sSQL = "update wikipedia_article set osm_type=";
581                                         switch($aNominatRecords[$i]['OSM_TYPE'])
582                                         {
583                                         case 'relation': $sSQL .= "'R'"; break;
584                                         case 'way': $sSQL .= "'W'"; break;
585                                         case 'node': $sSQL .= "'N'"; break;
586                                         }
587                                         $sSQL .= ", osm_id=".$aNominatRecords[$i]['OSM_ID']." where language = '".pg_escape_string($aRecord['language'])."' and title = '".pg_escape_string($aRecord['title'])."'";
588                                         $oDB->query($sSQL);
589                                         break;
590                                 }
591                         }
592                 }
593         }