package KelBot;
use strict;
use WWW::Mechanize;
use HTML::Entities;
use URI::Escape;
use XML::Simple;
use Carp;
use Encode;
use URI::Escape qw(uri_escape_utf8);
my $default_username = "";
my $default_password = "";
my $limit=5000;
sub new {
my $package = shift;
my $agent = shift || 'KelBot';
my $self = bless {}, $package;
$self->{mech} = WWW::Mechanize->new( cookie_jar => {}, onerror => \&Carp::carp );
$self->{mech}->agent("$agent");
$self->{host} = 'fr.wikipedia.org';
$self->{path} = 'w';
$self->{debug} = 0;
$self->{errstr} = '';
return $self;
}
sub _get {
my $self = shift;
my $page = shift;
my $action = shift || 'view';
my $extra = shift;
my $no_escape = shift || 0;
my $url =
"http://$self->{host}/$self->{path}/index.php?title=$page&action=$action";
$url .= $extra if $extra;
print STDERR "Retrieving $url\n" if $self->{debug};
my $res = $self->{mech}->get($url);
if ( $res->is_success() ) {
if ( $res->content =~ m/The action you have requested is limited to users in the group (.+)\./ ) {
my $group = $1;
$group =~ s/<.+?>//g;
$self->{errstr} = qq/Error requesting $page: You must be in the user group "$group"/;
carp $self->{errstr};
return 0;
} else {
return $res;
}
} else {
$self->{errstr} = "Error requesting $page: " . $res->status_line();
carp $self->{errstr};
return 0;
}
}
sub _get_api {
my $self = shift;
my $query = shift;
print STDERR "Retrieving http://$self->{host}/$self->{path}/api.php?$query\n"
if $self->{debug};
my $res =
$self->{mech}->get("http://$self->{host}/$self->{path}/api.php?$query");
if ( $res->is_success() ) {
return $res;
} else {
$self->{errstr} = "Error requesting api.php?$query: " . $res->status_line();
carp $self->{errstr};
return 0;
}
}
sub _put {
my $self = shift;
my $page = shift;
my $options = shift;
my $extra = shift;
my $res = $self->_get( $page, 'edit', $extra );
unless ($res) { return; }
if ( ( $res->content ) =~ m/<textarea .+?readonly='readonly'/ ) {
$self->{errstr} = "Error editing $page: Page is protected";
carp $self->{errstr};
return 0;
}
$res = $self->{mech}->submit_form( %{$options} );
return $res;
}
sub set_wiki {
my $self = shift;
$self->{host} = shift;
$self->{path} = shift;
print STDERR "Wiki set to http://$self->{host}/$self->{path}\n" if $self->{debug};
return 0;
}
sub login {
my $self = shift;
my $editor = shift || $default_username;
my $password = shift || $default_password;
my $cookies = ".perlwikipedia-$editor-cookies";
$self->{mech}->cookie_jar(
{ file => $cookies, autosave => 1 } );
if ( !defined $password ) {
$self->{mech}->{cookie_jar}->load($cookies);
my $cookies_exist = $self->{mech}->{cookie_jar}->as_string;
if ($cookies_exist) {
$self->{mech}->{cookie_jar}->load($cookies);
print STDERR "Loaded MediaWiki cookies from file $cookies\n" if $self->{debug};
return 0;
} else {
$self->{errstr} = "Cannot load MediaWiki cookies from file $cookies";
carp $self->{errstr};
return 1;
}
}
my $res = $self->_put(
'Special:Userlogin',
{
form_name => 'userlogin',
fields => {
wpName => $editor,
wpPassword => $password,
wpRemember => 1,
},
}
);
unless ($res) { return; }
my $content = $res->content();
my $login_status;
if ( $content =~ m/var wgUserName = "$editor"/ ) {
print STDERR qq/Login as "$editor" succeeded.\n/ if $self->{debug};
$login_status = 0;
} else {
if ( $content =~ m/There is no user by the name/ ) {
$self->{errstr} = qq/Login as "$editor" failed: User "$editor" does not exist/;
} elsif ( $content =~ m/Incorrect password entered/ ) {
$self->{errstr} = qq/Login as "$editor" failed: Bad password/;
} elsif ( $content =~ m/Password entered was blank/ ) {
$self->{errstr} = qq/Login as "$editor" failed: Blank password/;
}
$login_status = 1;
}
die "I can't log in." unless ($login_status eq '0');
return $login_status;
}
sub edit {
my $self = shift;
my $page = shift;
my $text = shift;
my $summary = shift;
my $is_minor = 1;
my $res;
# $text = encode( 'utf8', $text );
my $options = {
form_name => 'editform',
fields => {
wpSummary => $summary,
wpTextbox1 => $text,
},
};
$options->{fields}->{wpMinoredit} = 1 if ($is_minor);
$res = $self->_put($page, $options);
return $res;
}
sub get_history {
my $self = shift;
my $pagename = shift;
my $limit = shift || 5;
my $rvstartid = shift || '';
my $direction = shift;
my @return;
my @revisions;
if ( $limit > 50 ) {
$self->{errstr} = "Error requesting history for $pagename: Limit may not be set to values above 50";
carp $self->{errstr};
return 1;
}
my $query = "action=query&prop=revisions&titles=$pagename&rvlimit=$limit&rvprop=ids|timestamp|user|comment&format=xml";
if ( $rvstartid ) {
$query .= "&rvstartid=$rvstartid";
}
if ( $direction ) {
$query .= "&rvdir=$direction";
}
my $res = $self->_get_api($query);
unless ($res) { return 1; }
my $xml = XMLin( $res->content );
if ( ref( $xml->{query}->{pages}->{page}->{revisions}->{rev} ) eq "HASH" ) {
$revisions[0] = $xml->{query}->{pages}->{page}->{revisions}->{rev};
}
else {
@revisions = @{ $xml->{query}->{pages}->{page}->{revisions}->{rev} };
}
foreach my $hash ( @revisions ) {
my $revid = $hash->{revid};
my $user = $hash->{user};
my ( $timestamp_date, $timestamp_time ) = split( /T/, $hash->{timestamp} );
$timestamp_time=~s/Z$//;
my $comment = $hash->{comment};
push ( @return, {
revid => $revid,
user => $user,
timestamp_date => $timestamp_date,
timestamp_time => $timestamp_time,
comment => $comment,
} );
}
return @return;
}
sub get_text {
my $self = shift;
my $pagename = shift;
my $revid = shift || '';
my $section = shift || '';
my $recurse = shift || 0;
my $wikitext = '';
my $res;
$res = $self->_get( $pagename, 'edit', "&oldid=$revid§ion=$section" );
unless ($res) { return 1; }
if ($recurse) {
until ( $res->content =~ m/var wgAction = "edit"/ ) {
my $real_title;
if ( $res->content =~ m/var wgTitle = "(.+?)"/ ) {
$real_title = $1;
}
$res = $self->_get( $real_title, 'edit' );
}
}
if ( $res->content =~ /<textarea.+?\s?>(.+)<\/textarea>/s ) {
$wikitext = $1;
} else {
$self->{errstr} = "Could not get_text for $pagename!";
carp $self->{errstr};
}
return decode_entities($wikitext);
}
=item revert($pagename,$edit_summary,$old_revision_id)
Reverts the specified page to $old_revision_id, with an edit summary of $edit_summary.
=cut
sub revert {
my $self = shift;
my $pagename = shift;
my $summary = shift;
my $revid = shift;
return $self->_put(
$pagename,
{
form_name => 'editform',
fields => { wpSummary => $summary, },
},
"&oldid=$revid"
);
}
=item get_last($pagename,$username)
Returns the number of the last revision not made by $username.
=cut
sub get_last {
my $self = shift;
my $pagename = shift;
my $editor = shift;
my $revertto = 0;
my $res =
$self->_get_api( "action=query&prop=revisions&titles=$pagename&rvlimit=20&rvprop=ids|user&rvexcludeuser=$editor&format=xml" );
unless ($res) { return 1; }
my $xml = XMLin( $res->content );
if( ref( $xml->{query}->{pages}->{page}->{revisions}->{rev} ) eq 'ARRAY' ) {
$revertto = $xml->{query}->{pages}->{page}->{revisions}->{rev}[0]->{revid};
}
else {
$revertto = $xml->{query}->{pages}->{page}->{revisions}->{rev}->{revid};
}
return $revertto;
}
sub update_rc {
my $self = shift;
my $limit = shift || 5;
my @rc_table;
my $res =
$self->_get_api(
"action=query&list=recentchanges&rcnamespace=0&rclimit=$limit&format=xml");
unless ($res) { return 1; }
my $xml = XMLin( $res->content );
foreach my $hash ( @{ $xml->{query}->{recentchanges}->{rc} } ) {
my ( $timestamp_date, $timestamp_time ) = split( /T/, $hash->{timestamp} );
$timestamp_time =~ s/Z$//;
push( @rc_table, {
pagename => $hash->{title},
revid => $hash->{revid},
oldid => $hash->{old_revid},
timestamp_date => $timestamp_date,
timestamp_time => $timestamp_time,
}
);
}
return @rc_table;
}
sub last_contrib {
my $self = shift;
my $limit = shift || 5;
my @rc_table;
my $res =
$self->_get_api(
"action=query&list=usercontribs&format=xml&ucuser=".$default_username);
unless ($res) { return 1; }
my $xml = XMLin( $res->content );
foreach my $hash ( @{ $xml->{query}->{usercontribs}->{item} } ) {
my ( $timestamp_date, $timestamp_time ) = split( /T/, $hash->{timestamp} );
$timestamp_time =~ s/Z$//;
push( @rc_table, {
title => $hash->{title},
revid => $hash->{revid},
comment => $hash->{comment},
timestamp_date => $timestamp_date,
timestamp_time => $timestamp_time,
}
);
}
return @rc_table;
}
sub what_links_here {
my $self = shift;
my $article = shift;
my @links;
my $continue;
my $xml;
do {
my $res = $self->_get_api( "action=query&format=xml&blnamespace=0&blfilterredir=nonredirects&list=backlinks&bllimit=".$limit."&bltitle=".$article.($continue ? "&blcontinue=".$continue : "") );
unless ($res) { return 1; }
$xml = XMLin( $res->content );
foreach my $hash ( @{ $xml->{query}->{backlinks}->{bl} } ) {
push( @links, {
title => $hash->{title},
}
);
}
} while ($continue = $xml->{"query-continue"}->{backlinks}->{blcontinue} );
return @links;
}
sub embedded_in {
my $self = shift;
my $article = shift;
my @links;
my $continue;
my $xml;
do {
my $res = $self->_get_api( "action=query&format=xml&eifilterredir=nonredirects&list=embeddedin&eilimit=".$limit."&eititle=".$article.($continue ? "&eicontinue=".$continue : "") );
unless ($res) { return 1; }
$xml = XMLin( $res->content );
foreach my $hash ( @{ $xml->{query}->{embeddedin}->{ei} } ) {
push( @links, {
title => $hash->{title},
}
);
}
} while ($continue = $xml->{"query-continue"}->{embeddedin}->{eicontinue} );
return @links;
}
sub get_pages_in_category {
my $self = shift;
my $category = shift;
my @pages;
my $res = $self->_get( $category, 'view' );
unless ($res) { return 1; }
my $content = $res->content;
while ( $content =~ m{href="(?:[^"]+)/Category:[^"]+">([^<]*)</a></div>}ig )
{
push @pages, 'Category:' . $1;
}
while ( $content =~
m{<li><a href="(?:[^"]+)" title="([^"]+)">[^<]*</a></li>}ig ) {
push @pages, $1;
}
while ( my $res = $self->{mech}->follow_link( text => '200 suivants' ) ) {
sleep 1; #Cheap hack to make sure we don't bog down the server
my $content = $res->content;
while ( $content =~
m{<li><a href="(?:[^"]+)" title="([^"]+)">[^<]*</a></li>}ig ) {
push @pages, $1;
}
}
return @pages;
}
sub get_all_pages_in_category {
my $self = shift;
my $base_category = shift;
my @first = $self->get_pages_in_category($base_category);
my %data;
foreach my $page (@first) {
$data{$page} = '';
if ( $page =~ /^Category:/ ) {
my @pages = $self->get_all_pages_in_category($page);
foreach (@pages) {
$data{$_} = '';
}
}
}
return keys %data;
}
sub linksearch {
my $self = shift;
my $link = shift;
my @links;
my $res =
$self->_get( "Special:Linksearch", "edit", "&target=$link&limit=500" );
unless ($res) { return 1; }
my $content = $res->content;
while ( $content =~
m{<li><a href.+>(.+?)</a> linked from <a href.+>(.+)</a></li>}g ) {
push( @links, { link => $1, page => $2 } );
}
while ( my $res = $self->{mech}->follow_link( text => 'next 500' ) ) {
sleep 2;
my $content = $res->content;
while ( $content =~
m{<li><a href.+>(.+?)</a> linked from <a href=.+>(.+)</a></li>}g ) {
push( @links, { link => $1, page => $2 } );
}
}
return @links;
}
sub purge_page {
my $self = shift;
my $page = shift;
my $res = $self->_get( $page, 'purge' );
}
sub get_namespace_names {
my $self = shift;
my %return;
my $res = $self->_get_api("action=query&meta=siteinfo&siprop=namespaces&format=xml");
my $xml = XMLin( $res->content );
foreach my $id ( keys %{ $xml->{query}->{namespaces}->{ns} } ) {
$return{$id} = $xml->{query}->{namespaces}->{ns}->{$id}->{content};
}
return %return;
}
sub encyclo {
my $self = shift;
my @results;
foreach my $page (@_) {
if ($page =~ /:/ ) {
if ( $page =~ /modèle:/i || $page =~ /template:/i ) {
push(@results, $page);
}
} else {
push(@results, $page);
}
}
return @results;
}
1;