]> git.openstreetmap.org Git - nominatim.git/blob - utils/importWikipedia.php
Merge pull request #499 from mtmail/travis-ci
[nominatim.git] / utils / importWikipedia.php
1 #!/usr/bin/php -Cq
2 <?php
3
4         require_once(dirname(dirname(__FILE__)).'/settings/settings.php');
5         require_once(CONST_BasePath.'/lib/init-cmd.php');
6         ini_set('memory_limit', '800M');
7
// Command line options of this script. The first entry is the usage text,
// every following row describes one option.
// NOTE(review): row layout presumed to be
//   name, short name, min count, max count, min params, max params, type, help
// as consumed by getCmdOpt() (defined outside this file) - confirm there.
$aCMDOptions = array(
    "Import wikipedia article data and link it to OSM objects",
    array('help', 'h', 0, 1, 0, 0, false, 'Show Help'),
    array('quiet', 'q', 0, 1, 0, 0, 'bool', 'Quiet output'),
    array('verbose', 'v', 0, 1, 0, 0, 'bool', 'Verbose output'),

    array('create-tables', '', 0, 1, 0, 0, 'bool', 'Create wikipedia tables'),
    array('parse-articles', '', 0, 1, 0, 0, 'bool', 'Parse wikipedia articles'),
    // The parsing section further down reads $aCMDResult['parse-wikipedia'] and
    // uses its value as "page_id % 10", so the option has to be declared here
    // and has to take one integer parameter.
    array('parse-wikipedia', '', 0, 1, 1, 1, 'int', 'Parse wikipedia articles whose page_id modulo 10 equals the given value (0-9)'),
    array('link', '', 0, 1, 0, 0, 'bool', 'Try to link to existing OSM ids'),
);
getCmdOpt($_SERVER['argv'], $aCMDOptions, $aCMDResult, true, true);
19
20 /*
21 $sTestPageText = <<<EOD
22 {{Coord|47|N|2|E|type:country_region:FR|display=title}}
23 {{ Infobox Amusement park
24 | name = Six Flags Great Adventure
25 | image = [[File:SixFlagsGreatAdventure logo.png]]
26 | caption = Six Flags Great Adventure logo
27 | location = [[Jackson, New Jersey|Jackson]]
28 | location2 = New Jersey
29 | location3 = United States
30 | address = 1 Six Flags Boulevard<ref name="drivedir"/>
31 | season = March/April through October/November
32 | opening_date = July 1, 1974
33 | previous_names = Great Adventure
34 | area_acre = 2200
35 | rides = 45 park admission rides
36 | coasters = 12
37 | water_rides = 2
38 | owner = [[Six Flags]]
39 | general_manager = 
40 | homepage = [http://www.sixflags.com/parks/greatadventure/ Six Flags Great Adventure]
41 }}
42 EOD;
43 var_dump(_templatesToProperties(_parseWikipediaContent($sTestPageText)));
44 exit;
45 //| coordinates = {{Coord|40|08|16.65|N|74|26|26.69|W|region:US-NJ_type:landmark|display=inline,title}}
46 */
47 /*
48
49         $a = array();
50         $a[] = 'test';
51
52         $oDB &= getDB();
53
54         if ($aCMDResult['drop-tables'])
55         {
56                 $oDB->query('DROP TABLE wikipedia_article');
57                 $oDB->query('DROP TABLE wikipedia_link');
58         }
59 */
60
// --create-tables: create the two tables the rest of this script works on.
if ($aCMDResult['create-tables'])
{
    // The connection must be opened here; nothing above this point assigns $oDB.
    $oDB =& getDB();

    $sSQL = <<<'EOD'
CREATE TABLE wikipedia_article (
    language text NOT NULL,
    title text NOT NULL,
    langcount integer,
    othercount integer,
    totalcount integer,
    lat double precision,
    lon double precision,
    importance double precision,
    title_en text,
    osm_type character(1),
    osm_id bigint,
    infobox_type text,
    population bigint,
    website text
);
EOD;
    // The nowdoc above needs its own terminator. Without it the statement ran
    // on until the terminator of the wikipedia_link nowdoc and swallowed the
    // query() calls in between.
    $oDB->query($sSQL);

    $oDB->query("SELECT AddGeometryColumn('wikipedia_article', 'location', 4326, 'GEOMETRY', 2)");

    $sSQL = <<<'EOD'
CREATE TABLE wikipedia_link (
  from_id INTEGER,
  to_name text
  );
EOD;
    $oDB->query($sSQL);
}
92
/**
 * Convert a degrees / minutes / seconds value into signed decimal degrees.
 *
 * All numeric arguments are cast to float, so numeric strings, empty strings
 * and null are accepted (the latter two count as 0).
 *
 * @param mixed $iDegrees Degrees. A negative value flips the sign of the
 *                        result; minutes and seconds always add to the
 *                        magnitude (-33 deg 30 min is -33.5, not -32.5).
 * @param mixed $iMinutes Minutes of arc.
 * @param mixed $fSeconds Seconds of arc.
 * @param mixed $sNSEW    Hemisphere letter, case-insensitive. 'S' and 'W'
 *                        negate the result, anything else (including null)
 *                        leaves it as it is.
 *
 * @return float Decimal degrees.
 */
function degreesAndMinutesToDecimal($iDegrees, $iMinutes=0, $fSeconds=0, $sNSEW='N')
{
    // Callers pass null for a missing hemisphere; cast before strtoupper().
    $sNSEW = strtoupper((string)$sNSEW);
    $fDegrees = (float)$iDegrees;

    $iSign = ($sNSEW == 'S' || $sNSEW == 'W') ? -1 : 1;
    if ($fDegrees < 0) $iSign = -$iSign;

    // Work on the magnitude so the fractional parts never pull a negative
    // degree value back towards zero.
    return $iSign * (abs($fDegrees) + (float)$iMinutes/60 + (float)$fSeconds/3600);
}
98
/**
 * Extract all templates from a piece of wiki markup.
 *
 * The text is split on the tokens "{{", "}}", "[[", "]]" and "|" and run
 * through a small state machine. Text outside of templates is discarded.
 * A link contributes its display text (the part after "|", or the page name
 * when there is none; for "Image:" links the file name) to whatever encloses
 * it: a template name, a template parameter or an outer link.
 *
 * @param string $sPageText Wiki markup of one page.
 *
 * @return array List of templates in the order their closing "}}" was seen,
 *               i.e. a nested template comes before the one enclosing it.
 *               Each entry is array(0 => template name, 1 => list of raw
 *               "key=value" / positional parameter strings, LAST parameter
 *               first).
 */
function _parseWikipediaContent($sPageText)
{
    // Put everything on one line so the non-greedy patterns below can match
    // across what used to be line breaks.
    $sPageText = str_replace("\n", ' ', (string)$sPageText);
    $sPageText = preg_replace('#<!--.*?-->#m', '', $sPageText);
    $sPageText = preg_replace('#<math>.*?<\\/math>#m', '', $sPageText);

    $aPageText = preg_split('#({{|}}|\\[\\[|\\]\\]|[|])#', $sPageText, -1, PREG_SPLIT_DELIM_CAPTURE);

    $aTemplates = array();

    // All three stacks have the innermost element at index 0.
    $aTemplateStack = array();   // open templates: array(name, params)
    $aLinkStack = array();       // open links: array(page, display text)
    $aState = array('body');

    foreach($aPageText as $sPart)
    {
        // Text to hand to the enclosing construct; null when the token only
        // changed the parser state.
        $sText = null;

        switch($sPart)
        {
        case '{{':
            array_unshift($aTemplateStack, array('', array()));
            array_unshift($aState, 'template');
            break;
        case '}}':
            if ($aState[0] == 'template' || $aState[0] == 'templateparam')
            {
                $aTemplates[] = array_shift($aTemplateStack);
                array_shift($aState);
            }
            break;
        case '[[':
            // Every link gets its own frame so that a nested link (e.g. in
            // an image caption) cannot overwrite the outer one.
            array_unshift($aLinkStack, array('', ''));
            array_unshift($aState, 'link');
            break;
        case ']]':
            if ($aState[0] == 'link' || $aState[0] == 'linksynonim')
            {
                list($sLinkPage, $sLinkSyn) = array_shift($aLinkStack);
                if (!$sLinkSyn) $sLinkSyn = $sLinkPage;
                if (substr($sLinkPage, 0, 6) == 'Image:') $sLinkSyn = substr($sLinkPage, 6);

                array_shift($aState);
                // The display text, not the "]]" token, belongs to the parent.
                $sText = (string)$sLinkSyn;
            }
            break;
        case '|':
            if ($aState[0] == 'template' || $aState[0] == 'templateparam')
            {
                // Create a new template parameter
                $aState[0] = 'templateparam';
                array_unshift($aTemplateStack[0][1], '');
            }
            elseif ($aState[0] == 'link')
            {
                $aState[0] = 'linksynonim';
            }
            break;
        default:
            $sText = $sPart;
            break;
        }

        if ($sText === null) continue;

        switch($aState[0])
        {
        case 'template':
            $aTemplateStack[0][0] .= trim($sText);
            break;
        case 'templateparam':
            $aTemplateStack[0][1][0] .= $sText;
            break;
        case 'link':
            $aLinkStack[0][0] .= trim($sText);
            break;
        case 'linksynonim':
            $aLinkStack[0][1] .= $sText;
            break;
        case 'body':
            // Plain page text is not part of the result.
            break;
        default:
            var_dump($aState, $aTemplateStack, $sPart, $aPageText);
            fail('unknown state');
        }
    }
    return $aTemplates;
}
205
/**
 * Return $aParams[$sKey] when it is set, $xDefault otherwise.
 */
function _templateParam($aParams, $sKey, $xDefault)
{
    return isset($aParams[$sKey]) ? $aParams[$sKey] : $xDefault;
}

/**
 * Tell whether the positional parameters 0 .. $iCount-1 are all set.
 */
function _hasPositionalParams($aParams, $iCount)
{
    for($i = 0; $i < $iCount; $i++)
    {
        if (!isset($aParams[$i])) return false;
    }
    return true;
}

/**
 * Derive page level properties from the templates of one page.
 *
 * For every property the first template that provides a usable value wins.
 *
 * @param array $aTemplates Templates as returned by _parseWikipediaContent():
 *                          array(0 => name, 1 => raw parameter strings, last
 *                          parameter first).
 *
 * @return array Map with any of the keys
 *               sOfficialName, iPopulation, sWebsite, sTopLevelDomain,
 *               sInfoboxType, fLat, fLon. sInfoboxType starts with '#' when
 *               it was not taken from an "Infobox ..." template but guessed
 *               from the first template with more than 10 parameters.
 */
function _templatesToProperties($aTemplates)
{
    $aPageProperties = array();
    foreach($aTemplates as $aTemplate)
    {
        // Bring the parameters back into document order and split them into
        // positional (numeric key) and named ("key=value") ones.
        $aParams = array();
        foreach(array_reverse($aTemplate[1]) as $sParam)
        {
            $iPos = strpos($sParam, '=');
            if ($iPos === false)
            {
                $aParams[] = trim($sParam);
            }
            else
            {
                $aParams[trim(substr($sParam, 0, $iPos))] = trim(substr($sParam, $iPos+1));
            }
        }

        if (!isset($aPageProperties['sOfficialName']) && !empty($aParams['official_name']))
        {
            $aPageProperties['sOfficialName'] = $aParams['official_name'];
        }

        // Population: the first of these parameters that starts with a number.
        foreach(array('population', 'population_total', 'population_urban', 'population_estimate') as $sKey)
        {
            if (isset($aPageProperties['iPopulation'])) break;
            if (!empty($aParams[$sKey]) && preg_match('#^[0-9.,]+#', $aParams[$sKey]))
            {
                $aPageProperties['iPopulation'] = (int)str_replace(array(',','.'), '', $aParams[$sKey]);
            }
        }

        if (!isset($aPageProperties['sWebsite']) && !empty($aParams['website']))
        {
            // Accept a bare URL as well as "[url label]".
            if (preg_match('#^\\[?([^ \\]]+)[^\\]]*\\]?$#', $aParams['website'], $aMatch))
            {
                $aPageProperties['sWebsite'] = $aMatch[1];
                if (strpos($aPageProperties['sWebsite'],':/'.'/') === false)
                {
                    $aPageProperties['sWebsite'] = 'http:/'.'/'.$aPageProperties['sWebsite'];
                }
            }
        }

        if (!isset($aPageProperties['sTopLevelDomain']) && !empty($aParams['cctld']))
        {
            $aPageProperties['sTopLevelDomain'] = str_replace(array('[',']','.'),'', $aParams['cctld']);
        }

        if (!isset($aPageProperties['sInfoboxType']) && strtolower(substr($aTemplate[0],0,7)) == 'infobox')
        {
            $aPageProperties['sInfoboxType'] = trim((string)substr($aTemplate[0],8));
        }

        // Assume the first template with lots of params is the type (fallback for infobox)
        if (!isset($aPageProperties['sPossibleInfoboxType']) && sizeof($aParams) > 10)
        {
            $aPageProperties['sPossibleInfoboxType'] = trim($aTemplate[0]);
        }

        // do we have a lat/lon
        if (isset($aPageProperties['fLat'])) continue;

        if (isset($aParams['latd']) && isset($aParams['longd']))
        {
            $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aParams['latd'], _templateParam($aParams, 'latm', 0), _templateParam($aParams, 'lats', 0), _templateParam($aParams, 'latNS', 'N'));
            $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aParams['longd'], _templateParam($aParams, 'longm', 0), _templateParam($aParams, 'longs', 0), _templateParam($aParams, 'longEW', 'E'));
        }
        // Both halves are required; long_degrees is read unconditionally below.
        if (isset($aParams['lat_degrees']) && isset($aParams['long_degrees']))
        {
            $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aParams['lat_degrees'], _templateParam($aParams, 'lat_minutes', 0), _templateParam($aParams, 'lat_seconds', 0), _templateParam($aParams, 'lat_direction', 'N'));
            $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aParams['long_degrees'], _templateParam($aParams, 'long_minutes', 0), _templateParam($aParams, 'long_seconds', 0), _templateParam($aParams, 'long_direction', 'E'));
        }
        if (isset($aParams['latitude']) && isset($aParams['longitude']))
        {
            if (preg_match('#[0-9.]+#', $aParams['latitude']) && preg_match('#[0-9.]+#', $aParams['longitude']))
            {
                $aPageProperties['fLat'] = (float)$aParams['latitude'];
                $aPageProperties['fLon'] = (float)$aParams['longitude'];
            }
        }
        if (strtolower($aTemplate[0]) == 'coord')
        {
            // The position of the N/S marker tells which notation is used.
            // A template that is too short for its notation is ignored
            // instead of being read past its end.
            if (isset($aParams[3]) && (strtoupper($aParams[3]) == 'N' || strtoupper($aParams[3]) == 'S'))
            {
                // deg|min|sec|N/S|deg|min|sec|E/W
                if (_hasPositionalParams($aParams, 8))
                {
                    $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aParams[0], $aParams[1], $aParams[2], $aParams[3]);
                    $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aParams[4], $aParams[5], $aParams[6], $aParams[7]);
                }
            }
            elseif (isset($aParams[0]) && isset($aParams[1]) && isset($aParams[2]) && (strtoupper($aParams[2]) == 'N' || strtoupper($aParams[2]) == 'S'))
            {
                // deg|min|N/S|deg|min|E/W
                if (_hasPositionalParams($aParams, 6))
                {
                    $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aParams[0], $aParams[1], 0, $aParams[2]);
                    $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aParams[3], $aParams[4], 0, $aParams[5]);
                }
            }
            elseif (isset($aParams[0]) && isset($aParams[1]) && (strtoupper($aParams[1]) == 'N' || strtoupper($aParams[1]) == 'S'))
            {
                // deg|N/S|deg|E/W
                if (_hasPositionalParams($aParams, 4))
                {
                    $aPageProperties['fLat'] = (strtoupper($aParams[1]) == 'N'?1:-1) * (float)$aParams[0];
                    $aPageProperties['fLon'] = (strtoupper($aParams[3]) == 'E'?1:-1) * (float)$aParams[2];
                }
            }
            elseif (isset($aParams[0]) && is_numeric($aParams[0]) && isset($aParams[1]) && is_numeric($aParams[1]))
            {
                // lat|lon in decimal degrees
                $aPageProperties['fLat'] = (float)$aParams[0];
                $aPageProperties['fLon'] = (float)$aParams[1];
            }
        }
        if (isset($aParams['Latitude']) && isset($aParams['Longitude']))
        {
            // Free text such as "10° 30′ N" or a range "10° N to 12° N";
            // for a range the mid point is used.
            $aParams['Latitude'] = str_replace('&nbsp;',' ',$aParams['Latitude']);
            $aParams['Longitude'] = str_replace('&nbsp;',' ',$aParams['Longitude']);
            if (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([NS]) to ([0-9]+)°( ([0-9]+)′)? ([NS])#', $aParams['Latitude'], $aMatch))
            {
                $aPageProperties['fLat'] =
                    (degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4])
                    +degreesAndMinutesToDecimal($aMatch[5], $aMatch[7], 0, $aMatch[8])) / 2;
            }
            elseif (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([NS])#', $aParams['Latitude'], $aMatch))
            {
                $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4]);
            }

            if (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([EW]) to ([0-9]+)°( ([0-9]+)′)? ([EW])#', $aParams['Longitude'], $aMatch))
            {
                $aPageProperties['fLon'] =
                    (degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4])
                    +degreesAndMinutesToDecimal($aMatch[5], $aMatch[7], 0, $aMatch[8])) / 2;
            }
            elseif (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([EW])#', $aParams['Longitude'], $aMatch))
            {
                $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4]);
            }
        }
    }

    // Only fall back to the guessed type when no real infobox was found.
    if (isset($aPageProperties['sPossibleInfoboxType']))
    {
        if (!isset($aPageProperties['sInfoboxType'])) $aPageProperties['sInfoboxType'] = '#'.$aPageProperties['sPossibleInfoboxType'];
        unset($aPageProperties['sPossibleInfoboxType']);
    }
    return $aPageProperties;
}
349
// --parse-wikipedia N: parse the content of all candidate articles whose
// page_id % 10 equals N and store what was found in wikipedia_article.
if (isset($aCMDResult['parse-wikipedia']))
{
    $oDB =& getDB();

    // The value ends up in the SQL text, make sure it is nothing but a number.
    $iPageIdRemainder = (int)$aCMDResult['parse-wikipedia'];

    $aArticleNames = $oDB->getCol('select page_title from content where page_namespace = 0 and page_id %10 = '.$iPageIdRemainder.' and (page_content ilike \'%{{Coord%\' or (page_content ilike \'%lat%\' and page_content ilike \'%lon%\'))');
    foreach($aArticleNames as $sArticleName)
    {
        $sPageText = $oDB->getOne('select page_content from content where page_namespace = 0 and page_title = \''.pg_escape_string($sArticleName).'\'');
        $aP = _templatesToProperties(_parseWikipediaContent($sPageText));

        // Collect the column assignments first so that each article needs
        // at most one UPDATE.
        $aAssignments = array();

        if (isset($aP['sInfoboxType']))
        {
            $aP['sInfoboxType'] = preg_replace('#\\s+#',' ',$aP['sInfoboxType']);
            $aAssignments[] = 'infobox_type = \''.pg_escape_string($aP['sInfoboxType']).'\'';
        }
        if (isset($aP['iPopulation']))
        {
            $aAssignments[] = 'population = \''.pg_escape_string((string)$aP['iPopulation']).'\'';
        }
        if (isset($aP['sWebsite']))
        {
            $aAssignments[] = 'website = \''.pg_escape_string($aP['sWebsite']).'\'';
        }

        // Coordinates are only stored as a complete pair and 0/0 is taken as
        // "nothing parsed". Latitude and longitude can be extracted
        // independently, so the longitude has to be checked as well.
        if (isset($aP['fLat']) && isset($aP['fLon']) && ($aP['fLat'] != 0 || $aP['fLon'] != 0))
        {
            $sInfoboxType = isset($aP['sInfoboxType']) ? $aP['sInfoboxType'] : '';
            echo $sArticleName.'|'.$sInfoboxType.'|'.$aP['fLat'].'|'.$aP['fLon'] ."\n";
            $aAssignments[] = 'lat = \''.pg_escape_string((string)$aP['fLat']).'\'';
            $aAssignments[] = 'lon = \''.pg_escape_string((string)$aP['fLon']).'\'';
        }

        if ($aAssignments)
        {
            $sSQL = 'update wikipedia_article set '.join(', ', $aAssignments);
            $sSQL .= ' where language = \'en\' and title = \''.pg_escape_string($sArticleName).'\';';
            $oDB->query($sSQL);
        }
    }
}
394
/**
 * Element start handler for the XML parser.
 *
 * Appends the attributes of every element named 'PLACE' to the global list
 * $aNominatRecords; all other elements are ignored.
 *
 * @param mixed  $hParser Parser invoking the handler (unused).
 * @param string $sName   Name of the element that was opened.
 * @param array  $aAttr   Attributes of the element.
 */
function nominatimXMLStart($hParser, $sName, $aAttr)
{
    global $aNominatRecords;

    if ($sName == 'PLACE')
    {
        $aNominatRecords[] = $aAttr;
    }
}
405
/**
 * Element end handler for the XML parser. Intentionally does nothing; it
 * only exists because a start and an end handler are registered together.
 *
 * @param mixed  $hParser Parser invoking the handler (unused).
 * @param string $sName   Name of the element that was closed (unused).
 */
function nominatimXMLEnd($hParser, $sName)
{
    return;
}
409
410
411         if (isset($aCMDResult['link']))
412         {
413                 $oDB =& getDB();
414                 $aWikiArticles = $oDB->getAll("select * from wikipedia_article where language = 'en' and lat is not null and osm_type is null and totalcount < 31 order by importance desc limit 200000");
415
416                 // If you point this script at production OSM you will be blocked
417                 $sNominatimBaseURL = 'http://SEVERNAME/search.php';
418
419                 foreach($aWikiArticles as $aRecord)
420                 {
421                         $aRecord['name'] = str_replace('_',' ',$aRecord['title']);
422
423                         $sURL = $sNominatimBaseURL.'?format=xml&accept-language=en';
424
425                         echo "\n-- ".$aRecord['name'].", ".$aRecord['infobox_type']."\n";
426                         $fMaxDist = 0.0000001;
427                         $bUnknown = false;
428                         switch(strtolower($aRecord['infobox_type']))
429                         {
430                         case 'former country':
431                                 continue 2;
432                         case 'sea':
433                                 $fMaxDist = 60; // effectively turn it off
434                                 $sURL .= "&viewbox=".($aRecord['lon']-$fMaxDist).",".($aRecord['lat']+$fMaxDist).",".($aRecord['lon']+$fMaxDist).",".($aRecord['lat']-$fMaxDist);
435                                 break;
436                         case 'country':
437                         case 'island':
438                         case 'islands':
439                         case 'continent':
440                                 $fMaxDist = 60; // effectively turn it off
441                                 $sURL .= "&featuretype=country";
442                                 $sURL .= "&viewbox=".($aRecord['lon']-$fMaxDist).",".($aRecord['lat']+$fMaxDist).",".($aRecord['lon']+$fMaxDist).",".($aRecord['lat']-$fMaxDist);
443                                 break;
444                         case 'prefecture japan':
445                                 $aRecord['name'] = trim(str_replace(' Prefecture',' ', $aRecord['name']));
446                         case 'state':
447                         case '#us state':
448                         case 'county':
449                         case 'u.s. state':
450                         case 'u.s. state symbols':
451                         case 'german state':
452                         case 'province or territory of canada';
453                         case 'indian jurisdiction';
454                         case 'province';
455                         case 'french region':
456                         case 'region of italy':
457                         case 'kommune':
458                         case '#australia state or territory':
459                         case 'russian federal subject':
460                                 $fMaxDist = 4;
461                                 $sURL .= "&featuretype=state";
462                                 $sURL .= "&viewbox=".($aRecord['lon']-$fMaxDist).",".($aRecord['lat']+$fMaxDist).",".($aRecord['lon']+$fMaxDist).",".($aRecord['lat']-$fMaxDist);
463                                 break;
464                         case 'protected area':
465                                 $fMaxDist = 1;
466                                 $sURL .= "&nearlat=".$aRecord['lat'];
467                                 $sURL .= "&nearlon=".$aRecord['lon'];
468                                 $sURL .= "&viewbox=".($aRecord['lon']-$fMaxDist).",".($aRecord['lat']+$fMaxDist).",".($aRecord['lon']+$fMaxDist).",".($aRecord['lat']-$fMaxDist);
469                                 break;
470                         case 'settlement':
471                                 $bUnknown = true;
472                         case 'french commune':
473                         case 'italian comune':
474                         case 'uk place':
475                         case 'italian comune':
476                         case 'australian place':
477                         case 'german place':
478                         case '#geobox':
479                         case 'u.s. county':
480                         case 'municipality':
481                         case 'city japan':
482                         case 'russian inhabited locality':
483                         case 'finnish municipality/land area':
484                         case 'england county':
485                         case 'israel municipality':
486                         case 'russian city':
487                         case 'city':
488                                 $fMaxDist = 0.2;
489                                 $sURL .= "&featuretype=settlement";
490                                 $sURL .= "&viewbox=".($aRecord['lon']-0.5).",".($aRecord['lat']+0.5).",".($aRecord['lon']+0.5).",".($aRecord['lat']-0.5);
491                                 break;
492                         case 'mountain':
493                         case 'mountain pass':
494                         case 'river':
495                         case 'lake':
496                         case 'airport':
497                                 $fMaxDist = 0.2;
498                                 $sURL .= "&viewbox=".($aRecord['lon']-0.5).",".($aRecord['lat']+0.5).",".($aRecord['lon']+0.5).",".($aRecord['lat']-0.5);
499
500                         case 'ship begin': // ship infoboxes: search a 0.01 degree box around the article position
501                                 $fMaxDist = 0.1; // largest accepted article-to-result distance, in degrees of lat/lon
502                                 $aTypes = array('wreck'); // NOTE(review): not read anywhere in the visible code; presumably meant to restrict accepted result types - confirm
503                                 $sURL .= "&viewbox=".($aRecord['lon']-0.01).",".($aRecord['lat']+0.01).",".($aRecord['lon']+0.01).",".($aRecord['lat']-0.01);
504                                 $sURL .= "&nearlat=".$aRecord['lat'];
505                                 $sURL .= "&nearlon=".$aRecord['lon'];
506                                 break;
507                         case 'road': // the following four infobox types share the same tight settings
508                         case 'university':
509                         case 'company':
510                         case 'department':
511                                 $fMaxDist = 0.005; // result must lie within 0.005 degrees of the article position
512                                 $sURL .= "&viewbox=".($aRecord['lon']-0.01).",".($aRecord['lat']+0.01).",".($aRecord['lon']+0.01).",".($aRecord['lat']-0.01);
513                                 $sURL .= "&bounded=1"; // restrict the search to the viewbox given above
514                                 $sURL .= "&nearlat=".$aRecord['lat'];
515                                 $sURL .= "&nearlon=".$aRecord['lon'];
516                                 break;
517                         default: // infobox type without dedicated handling
518                                 $bUnknown = true; // when set, the matching loop below replaces $fMaxDist per result, based on its place rank
519                                 $fMaxDist = 0.005;
520                                 $sURL .= "&viewbox=".($aRecord['lon']-0.01).",".($aRecord['lat']+0.01).",".($aRecord['lon']+0.01).",".($aRecord['lat']-0.01);
521 //                              $sURL .= "&bounded=1";
522                                 $sURL .= "&nearlat=".$aRecord['lat'];
523                                 $sURL .= "&nearlon=".$aRecord['lon'];
524                                 echo "-- Unknown: ".$aRecord['infobox_type']."\n"; // report the unhandled infobox type
525                                 break;
526                         }
527                         $sNameURL = $sURL.'&q='.urlencode($aRecord['name']); // first attempt: query with the full record name
528
529                         var_Dump($sNameURL); // prints the request URL
530                         $sXML = file_get_contents($sNameURL); // NOTE(review): returns false on failure; the result is not checked before parsing
531
532                         $aNominatRecords = array(); // reset before parsing; presumably filled by the element handlers, which are defined outside this view - TODO confirm
533                         $hXMLParser = xml_parser_create();
534                         xml_set_element_handler($hXMLParser, 'nominatimXMLStart', 'nominatimXMLEnd');
535                         xml_parse($hXMLParser, $sXML, true); // true: the whole document is passed in this single call
536                         xml_parser_free($hXMLParser);
537
538                         if (!isset($aNominatRecords[0])) // no result for the full name: retry with a shortened name
539                         {
540                                 $aNameParts = preg_split('#[(,]#',$aRecord['name']); // split the name at every '(' and ','
541                                 if (sizeof($aNameParts) > 1) // only retry when the name actually contained a separator
542                                 {
543                                         $sNameURL = $sURL.'&q='.urlencode(trim($aNameParts[0])); // query with just the text before the first separator
544                                         var_Dump($sNameURL);
545                                         $sXML = file_get_contents($sNameURL);
546
547                                         $aNominatRecords = array();
548                                         $hXMLParser = xml_parser_create();
549                                         xml_set_element_handler($hXMLParser, 'nominatimXMLStart', 'nominatimXMLEnd');
550                                         xml_parse($hXMLParser, $sXML, true);
551                                         xml_parser_free($hXMLParser);#
552                                 }
553                         }
554
555                         // results are assumed to be ordered best-first: link the first one that lies close enough to the article position
556                         for($i = 0; $i < sizeof($aNominatRecords); $i++)
557                         {
558                                 $fDiff = ($aRecord['lat']-$aNominatRecords[$i]['LAT']) * ($aRecord['lat']-$aNominatRecords[$i]['LAT']);
559                                 $fDiff += ($aRecord['lon']-$aNominatRecords[$i]['LON']) * ($aRecord['lon']-$aNominatRecords[$i]['LON']);
560                                 $fDiff = sqrt($fDiff); // straight-line distance between article and result, in degrees of lat/lon
561                                 if ($bUnknown) {
562                                         // If it was an unknown type base it on the rank of the found result
563                                         $iRank = (int)$aNominatRecords[$i]['PLACE_RANK'];
564                                         if ($iRank <= 4) $fMaxDist = 2;
565                                         elseif ($iRank <= 8) $fMaxDist = 1;
566                                         elseif ($iRank <= 10) $fMaxDist = 0.8;
567                                         elseif ($iRank <= 12) $fMaxDist = 0.6;
568                                         elseif ($iRank <= 17) $fMaxDist = 0.2;
569                                         elseif ($iRank <= 18) $fMaxDist = 0.1;
570                                         elseif ($iRank <= 22) $fMaxDist = 0.02;
571                                         elseif ($iRank <= 26) $fMaxDist = 0.001;
572                                         else $fMaxDist = 0.001;
573                                 }
574                                 echo "-- FOUND \"".substr($aNominatRecords[$i]['DISPLAY_NAME'],0,50)."\", ".$aNominatRecords[$i]['CLASS'].", ".$aNominatRecords[$i]['TYPE'].", ".$aNominatRecords[$i]['PLACE_RANK'].", ".$aNominatRecords[$i]['OSM_TYPE']." (dist:$fDiff, max:$fMaxDist)\n";
575                                 if ($fDiff > $fMaxDist)
576                                 {
577                                         echo "-- Diff too big $fDiff (max: $fMaxDist)".$aRecord['lat'].','.$aNominatRecords[$i]['LAT'].' & '.$aRecord['lon'].','.$aNominatRecords[$i]['LON']." \n";
578                                 }
579                                 else
580                                 {
581                                         $sSQL = "update wikipedia_article set osm_type="; // the single-letter type code is appended below
582                                         switch($aNominatRecords[$i]['OSM_TYPE']) {
583                                         case 'relation': $sSQL .= "'R'"; break;
584                                         case 'way': $sSQL .= "'W'"; break;
585                                         case 'node': $sSQL .= "'N'"; break;
586                                         default: continue 2; // missing or unknown OSM type would yield malformed SQL: skip this result and try the next one
587                                         }
588                                         $sSQL .= ", osm_id=".$aNominatRecords[$i]['OSM_ID']." where language = '".pg_escape_string($aRecord['language'])."' and title = '".pg_escape_string($aRecord['title'])."'";
589                                         $oDB->query($sSQL);
590                                         break; // article linked: ignore the remaining results
591                                 }
592                         }
593                 }
594         }