X-Git-Url: https://git.openstreetmap.org./rails.git/blobdiff_plain/9816c7dd0a6880589f2310f64b1acf672f4497e5..f5c5aacb209fd209c0b13203c0c7d0259635c671:/script/misc/update-wiki-pages?ds=sidebyside diff --git a/script/misc/update-wiki-pages b/script/misc/update-wiki-pages old mode 100644 new mode 100755 index 15dd1b6a0..d40708c96 --- a/script/misc/update-wiki-pages +++ b/script/misc/update-wiki-pages @@ -1,37 +1,25 @@ #!/usr/bin/env perl +use 5.010; use strict; use warnings; -use Pod::Usage (); -use Getopt::Long (); - -BEGIN { - eval "require MediaWiki::API; require YAML::XS;" or do { - print "You have to install some modules via CPAN to run this:\n"; - print " sudo cpanp MediaWiki::API YAML::XS\n"; - exit 1; - }; -} - +use Getopt::Long; +use Pod::Usage; use MediaWiki::API; +use Test::More qw(no_plan); use YAML::XS qw(Dump); -use Test::More 'no_plan'; =head1 NAME -update-wiki-pages - Screen-scrape the wiki for key/value wiki description pages +update-wiki-pages - Scrape the wiki for key/value wiki description pages =head1 SYNOPSIS perl script/misc/update-wiki-pages config/wiki_pages.yml -=head1 BUGS +Or with prove(1): -This will break if there are more than 500 key or value pages. Paging -needs to be implemenented. - -That or using a proper API or something (if it's there) or making a -direct query to the wiki database. + prove -e 'perl script/misc/update-wiki-pages' config/wiki_pages.yml =cut @@ -45,12 +33,17 @@ Getopt::Long::Parser->new( # On --help help() if $help; -help() unless $ARGV[0]; +my $out_file = $ARGV[0]; +$out_file //= 'config/wiki_pages.yml'; + +help() unless -f $out_file; # Get a API interface my $mw = MediaWiki::API->new(); ok($mw, "Got a MediaWiki API"); -$mw->{config}->{api_url} = 'http://wiki.openstreetmap.org/w/api.php'; +$mw->{config}->{api_url} = 'https://wiki.openstreetmap.org/w/api.php'; +$mw->{config}->{retries} = 5; +$mw->{config}->{retry_delay} = 30; # All our goodies my (%feature, %count); @@ -63,20 +56,27 @@ for my $lang ('', map { "${_}:" } qw[ Pt Fi De It HU Cz Fr RU Pl ]) { # Key pages ok(1, " Getting key pages"); my $cnt = stick_content_in_hash("key", "Template:${lang}KeyDescription", \%feature); + $cnt += stick_content_in_hash("key", "Template:${lang}Feature", \%feature); ok(1, " Got $cnt key pages"); $count{key} += $cnt; + # Key prefix pages + ok(1, " Getting key prefix pages"); + $cnt = stick_content_in_hash("key", "Template:${lang}KeyPrefixDescription", \%feature); + ok(1, " Got $cnt key prefix pages"); + $count{keyprefix} += $cnt; + # Value pages ok(1, " Getting value pages"); - my $cnt = stick_content_in_hash("tag", "Template:${lang}ValueDescription", \%feature); + $cnt = stick_content_in_hash("tag", "Template:${lang}ValueDescription", \%feature); ok(1, " Got $cnt value pages"); $count{value} += $cnt; } -ok(1, "Got a total of $count{$_} ${_}s") for qw[ key value ]; +ok(1, "Got a total of $count{$_} ${_}s") for qw[ key keyprefix value ]; # Dump to .yml file -open my $out, ">", $ARGV[0] or die "Can't open file '$ARGV[0]' supplied on the command line"; +open my $out, ">", $out_file or die "Can't open file '$out_file' supplied on the command line"; say $out "# THIS FILE IS AUTOGENERATED WITH THE script/misc/update-wiki-pages"; say $out "# PROGRAM DO NOT MANUALLY EDIT IT"; say $out ""; @@ -97,29 +97,54 @@ sub stick_content_in_hash }; my $count = 0; + + my $process_link = sub { + my $link = shift; + $count++; + ok(1, " ... got $count links") if $count % 200 == 0; + my $title = $link->{title}; + my $lang; + my $key_name; + if ($title =~ /^$ukey:(?.*?)$/) { + # English by default + $lang = "en"; + $key_name = $space_to_underscore->($+{key_name}); + } elsif ($title =~ /^(?[^:]+):$ukey:(?.*?)$/) { + $lang = lc $+{lang}; + $key_name = $space_to_underscore->($+{key_name}); + } + if ($lang && !exists($hash->{$lang}->{$key}->{$key_name})) { + $hash->{$lang}->{$key}->{$key_name} = $title; + } + }; + get_embeddedin( $title, sub { - my ($links) = @_; - my (@links) = @$links; - ok(1, " ... got " . scalar(@links) . " more links"); - for my $link (@links) { - $count++; - my $title = $link->{title}; - - if ($title =~ /^$ukey:(?.*?)$/) { - # English by default - $hash->{en}->{$key}->{ $space_to_underscore->($+{key_name}) } = $title; - } elsif ($title =~ /^(?[^:]+):$ukey:(?.*?)$/) { - $hash->{lc $+{lang}}->{$key}->{ $space_to_underscore->($+{key_name}) } = $title; + my $link = shift; + $process_link->($link); + get_redirects( + $link->{title}, + sub { + my $link = shift; + $process_link->($link) if exists($link->{redirect}); } - } + ); } ); return $count; } +sub process_list +{ + my $callback = shift; + my $links = shift; + for my $link (@$links) { + $callback->($link); + } +} + sub get_embeddedin { my ($title, $callback) = @_; @@ -135,7 +160,28 @@ sub get_embeddedin }, { max => '0', - hook => $callback, + hook => sub { process_list($callback, @_) }, + skip_encoding => 1, + } + ) || die $mw->{error}->{code} . ': ' . $mw->{error}->{details}; +} + +sub get_redirects +{ + my ($title, $callback) = @_; + my $articles = $mw->list( + { + action => 'query', + list => 'backlinks', + bltitle => $title, + blfilterredir => 'redirects', + # Doesn't work for De:* and anything non-en. Odd. + # einamespace => '0|8', + bllimit => '200', + }, + { + max => '0', + hook => sub { process_list($callback, @_) }, skip_encoding => 1, } ) || die $mw->{error}->{code} . ': ' . $mw->{error}->{details};