#!/usr/bin/env perl
+use 5.010;
use strict;
use warnings;
-use Pod::Usage ();
-use Getopt::Long ();
-
-BEGIN {
- eval "require MediaWiki::API; require YAML::XS;" or do {
- print "You have to install some modules via CPAN to run this:\n";
- print " sudo cpanp MediaWiki::API YAML::XS\n";
- exit 1;
- };
-}
-
+use Getopt::Long;
+use Pod::Usage;
use MediaWiki::API;
+use Test::More qw(no_plan);
use YAML::XS qw(Dump);
-use Test::More 'no_plan';
=head1 NAME
-update-wiki-pages - Screen-scrape the wiki for key/value wiki description pages
+update-wiki-pages - Scrape the wiki for key/value wiki description pages
=head1 SYNOPSIS
perl script/misc/update-wiki-pages config/wiki_pages.yml
-=head1 BUGS
+Or with prove(1):
-This will break if there are more than 500 key or value pages. Paging
-needs to be implemenented.
-
-That or using a proper API or something (if it's there) or making a
-direct query to the wiki database.
+ prove -e 'perl script/misc/update-wiki-pages' config/wiki_pages.yml
=cut
# On --help
help() if $help;
-help() unless $ARGV[0];
+my $out_file = $ARGV[0];
+$out_file //= 'config/wiki_pages.yml';
+
+help() unless -f $out_file;
# Get a API interface
my $mw = MediaWiki::API->new();
ok($mw, "Got a MediaWiki API");
-$mw->{config}->{api_url} = 'http://wiki.openstreetmap.org/w/api.php';
+$mw->{config}->{api_url} = 'https://wiki.openstreetmap.org/w/api.php';
+$mw->{config}->{retries} = 5;
+$mw->{config}->{retry_delay} = 30;
# All our goodies
my (%feature, %count);
# Key pages
ok(1, " Getting key pages");
my $cnt = stick_content_in_hash("key", "Template:${lang}KeyDescription", \%feature);
+ $cnt += stick_content_in_hash("key", "Template:${lang}Feature", \%feature);
ok(1, " Got $cnt key pages");
$count{key} += $cnt;
+ # Key prefix pages
+ ok(1, " Getting key prefix pages");
+ $cnt = stick_content_in_hash("key", "Template:${lang}KeyPrefixDescription", \%feature);
+ ok(1, " Got $cnt key prefix pages");
+ $count{keyprefix} += $cnt;
+
# Value pages
ok(1, " Getting value pages");
- my $cnt = stick_content_in_hash("tag", "Template:${lang}ValueDescription", \%feature);
+ $cnt = stick_content_in_hash("tag", "Template:${lang}ValueDescription", \%feature);
ok(1, " Got $cnt value pages");
$count{value} += $cnt;
}
-ok(1, "Got a total of $count{$_} ${_}s") for qw[ key value ];
+ok(1, "Got a total of $count{$_} ${_}s") for qw[ key keyprefix value ];
# Dump to .yml file
-open my $out, ">", $ARGV[0] or die "Can't open file '$ARGV[0]' supplied on the command line";
+open my $out, ">", $out_file or die "Can't open file '$out_file' supplied on the command line";
say $out "# THIS FILE IS AUTOGENERATED WITH THE script/misc/update-wiki-pages";
say $out "# PROGRAM DO NOT MANUALLY EDIT IT";
say $out "";
};
my $count = 0;
+
+ my $process_link = sub {
+ my $link = shift;
+ $count++;
+ ok(1, " ... got $count links") if $count % 200 == 0;
+ my $title = $link->{title};
+ my $lang;
+ my $key_name;
+ if ($title =~ /^$ukey:(?<key_name>.*?)$/) {
+ # English by default
+ $lang = "en";
+ $key_name = $space_to_underscore->($+{key_name});
+ } elsif ($title =~ /^(?<lang>[^:]+):$ukey:(?<key_name>.*?)$/) {
+ $lang = lc $+{lang};
+ $key_name = $space_to_underscore->($+{key_name});
+ }
+ if ($lang && !exists($hash->{$lang}->{$key}->{$key_name})) {
+ $hash->{$lang}->{$key}->{$key_name} = $title;
+ }
+ };
+
get_embeddedin(
$title,
sub {
- my ($links) = @_;
- my (@links) = @$links;
- ok(1, " ... got " . scalar(@links) . " more links");
- for my $link (@links) {
- $count++;
- my $title = $link->{title};
-
- if ($title =~ /^$ukey:(?<key_name>.*?)$/) {
- # English by default
- $hash->{en}->{$key}->{ $space_to_underscore->($+{key_name}) } = $title;
- } elsif ($title =~ /^(?<lang>[^:]+):$ukey:(?<key_name>.*?)$/) {
- $hash->{lc $+{lang}}->{$key}->{ $space_to_underscore->($+{key_name}) } = $title;
+ my $link = shift;
+ $process_link->($link);
+ get_redirects(
+ $link->{title},
+ sub {
+ my $link = shift;
+ $process_link->($link) if exists($link->{redirect});
}
- }
+ );
}
);
return $count;
}
+sub process_list
+{
+ my $callback = shift;
+ my $links = shift;
+ for my $link (@$links) {
+ $callback->($link);
+ }
+}
+
sub get_embeddedin
{
my ($title, $callback) = @_;
},
{
max => '0',
- hook => $callback,
+ hook => sub { process_list($callback, @_) },
+ skip_encoding => 1,
+ }
+ ) || die $mw->{error}->{code} . ': ' . $mw->{error}->{details};
+}
+
+sub get_redirects
+{
+ my ($title, $callback) = @_;
+ my $articles = $mw->list(
+ {
+ action => 'query',
+ list => 'backlinks',
+ bltitle => $title,
+ blfilterredir => 'redirects',
+ # Doesn't work for De:* and anything non-en. Odd.
+ # einamespace => '0|8',
+ bllimit => '200',
+ },
+ {
+ max => '0',
+ hook => sub { process_list($callback, @_) },
skip_encoding => 1,
}
) || die $mw->{error}->{code} . ': ' . $mw->{error}->{details};