WWW::AllSportsJpとWWW::AllSportsJp::Old、作りました!

スポーツ写真サイト オールスポーツコミュニティから画像のURLを抽出するのに使いました。

Web::ScraperとWWW::Mechanize - ”improve it!”

と書いていたあれ。URLとパスワードを書いたYAMLを読み込ませると、そのイベントの写真(jpeg)のURLをYAMLにして出力します*1
とりあえず一段落してちゃんと動作しているようなのでソースを載せておきます。誰か添削してくれたら嬉しいなあ。

そういえば、YAML::SyckだとDumpしたときにUTF8フラグの処理がうまく行ってなかったみたいで文字化けしてしまいました。YAML::Dumpなら問題ありません。

使い方

use WWW::AllSportsJp;
WWW::AllSportsJp->new(shift)->do();

とか書いたperl.plを

$ perl perl.pl password.yml

とかやればOKです。

設定ファイル(YAML)

password.ymlはたとえばhttp://www.allsports.jp/event/00000628.htmlだったら、

name: Soccer
url: http://www.allsports.jp/event/00000628.html
form:
  name: unlockform
  input: ev_password
passwords:
  - name: All
    password: "1234"

と書きます。nameは自分のメモ用なので好きなように書いてください。

パスワードが複数あるときは

name: Soccer
url: http://www.allsports.jp/event/00000628.html
form:
  name: unlockform
  input: ev_password
passwords:
  - name: 1
    password: "1234"
  - name: 2
    password: "2345"

とすればOKです。

WWW::AllSportsJp::Oldはhttp://www2.allsports.jp/用です。
password.ymlはたとえばhttp://www2.allsports.jp/event_detail.php?ev_id=6149だったら、

name: H17 Inter High School - Archery
url: http://www2.allsports.jp/event_detail.php?ev_id=6149
form:
  name: loginform
  input: ev_password
passwords:
  - name: All
    password: "1234"

と書いてください。

パスワードがない場合は空欄でいけるはず。

WWW::AllSportsJp.pm

package WWW::AllSportsJp;

use warnings;
use strict;
use Carp;
use WWW::Mechanize;
use WWW::Mechanize::DecodedContent;
use Web::Scraper;
use YAML;
use Data::Dumper;
$Data::Dumper::Indent = 1;
use URI;
use Encode;
use encoding 'utf8';

use version; our $VERSION = qv('0.0.4');

use Perl6::Say;

# Constructor.
#
# Arguments: the class name and the path to the YAML config file that
# describes the event URL, form names, and passwords.
# Returns a blessed hash of working slots; nothing is loaded or
# fetched until do() is called.
#
# Note: the empty prototype "()" was removed — prototypes do not
# validate method arguments and would reject plain function calls.
sub new {
    my ($class, $file_name) = @_;
    return bless {
        yaml      => '',       # parsed config (filled in by do())
        mech      => undef,    # WWW::Mechanize instance (created in do())
        scraper   => undef,    # unused slot — kept for compatibility
        file_name => $file_name,
        save      => {},       # accumulated scrape results
        debug     => 0,        # when true, loops stop after one iteration
    }, $class;
}

# Run the full scrape pipeline:
#   1. load the YAML config named at construction time,
#   2. fetch the event page and authenticate with each password,
#   3. collect category -> list-page -> photo-page -> image URLs,
#   4. dump everything accumulated in $self->{save} to STDOUT as YAML.
# Aborts (exit 8) if the initial fetch or authentication fails.
sub do {
    my $self = shift;

    # Load the config YAML.
    my $file = $self->{file_name};
    say "Load Config YAML: $file";
    $self->{yaml} = YAML::LoadFile($file);
    say " -> OK";

    # Fetch the event top page.
    say "URL: $self->{yaml}->{url}";
    $self->{mech} = WWW::Mechanize->new( keep_alive => 4 );
    $self->{mech}->get( $self->{yaml}->{url} );
    $self->abort( Dumper $self->{mech}->response ) unless $self->{mech}->success;
    say " -> Accessing ...";

    # Scrape the event title / detail table.
    say "Get information";
    $self->getInformation;
    say " -> OK";

    # Submit the password form(s).
    say "Authorize";
    $self->unlock;
    say " -> OK";

    # Walk the three levels: category listings, per-photo pages,
    # then the actual image URLs.
    say "Get listing urls";
    $self->getListPages;
    say " -> OK";
    say YAML::Dump $self->{save}->{categories} if $self->{debug};

    say "Get page urls";
    $self->getPageUrls;
    say " -> OK";

    say "Get image urls";
    $self->getImageUrls;
    say " -> OK";

    say YAML::Dump $self->{save};
}

# Scrape the event title and the detail table (header/value pairs)
# from the currently loaded page into $self->{save}->{information}.
# The result hash has keys: title, types[] (th cells), values[] (td cells).
sub getInformation {
    my $self = shift;
    my $mech = $self->{mech};

    my $scraper = scraper {
        process 'span.abbreviate',        'title'    => 'TEXT';
        process 'div.festa_detail tr th', 'types[]'  => 'TEXT';
        process 'div.festa_detail tr td', 'values[]' => 'TEXT';
    };
    $self->{save}->{information} = $scraper->scrape( $mech->decoded_content, $mech->uri );
}

# Scrape the category blocks (one table per category under
# div#photo_sort) into $self->{save}->{categories}.  Each category
# carries a title and a list of {url, title} list-page links.
sub getListPages {
    my $self = shift;
    my $mech = $self->{mech};

    my $categories = scraper {
        process 'div#photo_sort table', 'categories[]' =>
            scraper {
                process 'table tr th',   'title'        => 'TEXT';
                process 'table tr li a', 'list_pages[]' => { url => '@href', title => 'TEXT' };
            };
    };

    # Fixed 2008-03-21: store only the categories array, not the
    # whole scrape result hash.
    $self->{save}->{categories} = $categories->scrape( $mech->decoded_content, $mech->uri )->{categories};
}

# Visit every list page of every category and scrape the individual
# photo-page entries (url / number / time) into the category's
# photo_pages array.  Progress counters are printed as we go; in
# debug mode only the first page of the first category is fetched.
sub getPageUrls {
    my $self = shift;
    my $mech = $self->{mech};
    my $categories = $self->{save}->{categories};

    my $scraper = scraper {
        process 'div.photo_list_box div.photo_box', 'items[]' =>
            scraper {
                # NOTE(review): an older markup revision used
                # 'table tr td a[href]' for the url selector.
                process 'p.detail a[href]', 'url'    => '@href';
                process 'p.num',            'number' => 'TEXT';
                process 'p.time',           'time'   => 'TEXT';
            };
    };

    my $i = 0;
    my $count = 0;
    foreach my $category ( @$categories ){
        print " $i ->";
        my $j = 0;
        foreach my $list_page ( @{$category->{list_pages}} ){
            print " $j";$j++;
            my $result;
            next if $list_page->{title} eq '';   # skip placeholder entries
            say "scrape $list_page->{url}" if $self->{debug};
            $mech->get( $list_page->{url} );
            if( $mech->success ){
                $result = $scraper->scrape( $mech->decoded_content, $mech->uri );
                push @{$categories->[$i]->{photo_pages}}, @{$result->{items}};
                $count += @{$result->{items}};
                say YAML::Dump $result->{items} if $self->{debug};
            }
            else{
                # Report the failed fetch but keep going with the rest.
                say Dumper $mech->response;
                next;
            }
            last if $self->{debug};
        }
        $i++;
        say " (total $count)";
        last if $self->{debug};
    }
    say " (total $count)";
}

# For every photo page collected by getPageUrls(), fetch the page and
# extract the real JPEG path from the "?pic=photo_..." query fragment.
# Results are stored both on each photo-page hash (image_url) and in
# the flat list $self->{save}->{image_urls}.
sub getImageUrls {
    my $self = shift;
    my $mech = $self->{mech};
    my $categories = $self->{save}->{categories};

    # Image paths are relative to the scheme+host of the event URL.
    # Capture into a lexical and abort on failure instead of silently
    # reading a possibly stale $1 when the URL doesn't match.
    my ($base_url) = $self->{yaml}->{url} =~ m|(https?://[^/]+)|;
    $self->abort("Cannot find base URL in $self->{yaml}->{url}\n") unless defined $base_url;
    say $base_url;

    my $count = 0;
    my $i = 0;
    foreach my $category ( @$categories ){
        print " $i ->";$i++;
        my $j = 0;
        foreach my $photo_page ( @{$category->{photo_pages}} ){
            print " $j";
            $mech->get( $photo_page->{url} );
            if( $mech->success ){
                if( $mech->decoded_content =~ /\?pic=(photo_[^"&]+)["&]/ ){
                    # NOTE(review): $j only advances when an image URL is
                    # found, so a failed page is overwritten by the next
                    # success — confirm this is intended.
                    $category->{photo_pages}->[$j++]->{image_url} = $base_url . "/photo/" . $1;
                    $self->{save}->{image_urls}->[$count++] = $base_url . "/photo/" . $1;
                }else{
                    say "fail: " . $photo_page->{url};
                }
            }
            last if $self->{debug};
        }
        say " (total $count)";
        last if $self->{debug};
    }
    say " (total $count)";
}

    
# Submit the event password form once per configured password.  The
# form name and input field name come from the YAML config; any
# failed submission aborts the run (exit 8).
sub unlock {
    my $self = shift;
    my $mech = $self->{mech};
    my $form = $self->{yaml}->{form};
    my $passwords = $self->{yaml}->{passwords};

    foreach my $password ( @$passwords ){
        say "$self->{yaml}->{name} $password->{name} $password->{password} ";
        $mech->submit_form(
            form_name => $form->{name},
            fields    => { $form->{input} => $password->{password} },
        );
        $self->abort(Dumper $mech->response) unless $mech->success;
        say " -> Successfully Authorized.";
        last if $self->{debug};
    }
}

# Print the given diagnostic output and terminate with exit code 8.
# Diagnostics go to STDERR so the YAML result dumped on STDOUT stays
# machine-readable.
sub abort {
    shift;    # discard $self / class name
    print {*STDERR} @_;
    exit 8;
}

1;
__END__

WWW::AllSportsJp::Old.pm

package WWW::AllSportsJp::Old;

use warnings;
use strict;
use Carp;
use WWW::Mechanize;
use WWW::Mechanize::DecodedContent;
use Web::Scraper;
use YAML;
use Data::Dumper;
$Data::Dumper::Indent = 1;
use URI;
use Encode;
use encoding 'utf8';
use base qw(WWW::AllSportsJp);

use version; our $VERSION = qv('0.0.4');

use Perl6::Say;

# Old-site variant: scrape the event title (h2) and the detail table
# cells into $self->{save}->{information}.  Overrides the selectors
# used by the parent class for the www2 markup.
sub getInformation {
    my $self = shift;
    my $mech = $self->{mech};

    my $scraper = scraper {
        process 'h2',                  'title'    => 'TEXT';
        process 'td.bc_bub',           'types[]'  => 'TEXT';
        process 'td.wk_bu,td.wk_bu_b', 'values[]' => 'TEXT';
    };
    $self->{save}->{information} = $scraper->scrape( $mech->decoded_content, $mech->uri );
}

# Old-site variant: build $self->{save}->{categories} from a flat
# sequence of <td> cells — category header cells and list-page link
# cells are interleaved, so we walk them in order and group each run
# of links under the most recent category header.  Finishes by
# collecting pagination links via getMoreListPages().
sub getListPages {
    my $self = shift;
    my $mech = $self->{mech};

    my $links = scraper {
        process 'table.wk_bu table tr td[valign="top"]', 'links[]' =>
            scraper {
                process 'td[align="right"]',           'category' => 'TEXT';
                process 'h3',                          'title'    => 'TEXT';
                process 'td[align!="center"] a[href]', 'url'      => '@href';
            };
    };

    my $tmp_links = $links->scrape( $mech->decoded_content, $mech->uri );

    my $categories = [];
    my $list_pages;    # alias to the current category's list_pages array
    my ($i,$j)=(0,0);
    foreach my $tmp_link (@{$tmp_links->{links}}){
        if( exists $tmp_link->{category} ){
            next if $tmp_link->{category} eq " ";   # skip blank header cells
            # say '' instead of say undef: identical output, but no
            # "uninitialized value" warning under 'use warnings'.
            say '' if $i != 0;
            print " $i ->";
            $categories->[$i] = {
                'title'       => $tmp_link->{category},
                'list_pages'  => [],
                'photo_pages' => [],
            };
            $list_pages = $categories->[$i++]->{list_pages};
            $j = 0;
        }elsif( exists $tmp_link->{url} ){
            # NOTE(review): a link cell appearing before the first
            # category header would autovivify a detached array and be
            # silently dropped — confirm the page always leads with a
            # header.
            print " $j";
            $list_pages->[$j++] = $tmp_link;
        }
    }
    say '';

    $self->{save}->{categories} = $categories;

    say " -> OK";
    say "More";
    $self->getMoreListPages;
}

# Fetch each list page found by getListPages() and scrape its
# pagination ("more") links, appending them to the category's
# list_pages with a "<parent title>-<page title>" label.
sub getMoreListPages {
    my $self = shift;
    my $mech = $self->{mech};
    my $categories = $self->{save}->{categories};

    my $more_pages = scraper {
        process 'table[width="800"] tr td.pdd_10lr table tr td.txt_p a.bld[href]', 'more_pages[]' =>
            scraper {
                process 'a', 'url'   => '@href';
                process 'a', 'title' => 'TEXT';
            };
    };

    my $h = 0;
    foreach my $category (@$categories){
        # Collect into a separate array so we never push onto the list
        # we are currently iterating over.
        my $new_list_pages = [];
        my ($i,$j) = (0,0);
        say " $h ->";$h++;
        foreach my $list_page ( @{$category->{list_pages}} ){
            print "   $i ->";$i++;
            $mech->get($list_page->{url});
            if( $mech->success ){
                my $result = $more_pages->scrape( $mech->decoded_content, $mech->uri );
                foreach my $page ( @{$result->{more_pages}} ){
                    print " $j";$j++;
                    push @$new_list_pages, {
                        'url'   => $page->{url},
                        'title' => $list_page->{title} . "-" . $page->{title},
                    };
                }
                # say '' instead of say undef: same output, no warning.
                say '';
                $j=0;
            }else{
                say Dumper $mech->response;
                next;
            }
        }
        push @{$category->{list_pages}}, @$new_list_pages;
    }
}

# Old-site variant: visit every list page and scrape the photo-page
# entries (url / number / time) into the category's photo_pages.
sub getPageUrls {
    my $self = shift;
    my $mech = $self->{mech};
    my $categories = $self->{save}->{categories};

    my $scraper = scraper {
        process 'div.skinList ul li.skinList_li div[style]', 'items[]' =>
            scraper {
                process 'div[align="center"] table[height="175"] tr td a[href]', 'url'    => '@href';
                process 'div a.bld a',                                           'number' => 'TEXT';
                # Fixed typo: attribute was misspelled "colof", which
                # matched nothing, so 'time' was always missing.
                process 'div font[color="gray"]',                                'time'   => 'TEXT';
            };
    };

    my $i = 0;
    my $count = 0;
    foreach my $category ( @$categories ){
        print " $i ->";
        my $j = 0;
        foreach my $list_page ( @{$category->{list_pages}} ){
            print " $j";$j++;
            my $result;
            next if $list_page->{title} eq '';   # skip placeholder entries
            $mech->get( $list_page->{url} );
            if( $mech->success ){
                $result = $scraper->scrape( $mech->decoded_content, $mech->uri );
                push @{$categories->[$i]->{photo_pages}}, @{$result->{items}};
                $count += @{$result->{items}};
            }
            else{
                say Dumper $mech->response;
            }
        }
        $i++;
        say " (total $count)";
    }
    say " (total $count)";
}

# Old-site variant: the image is embedded as a CSS background inside
# <noscript>, e.g. style="...url('...')...".  Fetch each photo page
# and pull the URL out of that style attribute.
sub getImageUrls {
    my $self = shift;
    my $mech = $self->{mech};
    my $categories = $self->{save}->{categories};

    my $scraper = scraper {
        process 'noscript table tr td[style]', 'style' => '@style';
    };

    my $count = 0;
    my $i = 0;
    foreach my $category ( @$categories ){
        print " $i ->";$i++;
        my $j = 0;
        foreach my $photo_page ( @{$category->{photo_pages}} ){
            print " $j";
            $mech->get( $photo_page->{url} );
            if( $mech->success ){
                my $result = $scraper->scrape( $mech->decoded_content, $mech->uri );
                # Guard with defined() so pages lacking the style cell
                # don't match against undef (same 'fail' branch, but no
                # uninitialized-value warning).
                if( defined $result->{style} && $result->{style} =~ /url\(\'(.*)\'\)/ ){
                    $category->{photo_pages}->[$j++]->{image_url} = $1;
                    $self->{save}->{image_urls}->[$count++] = $1;
                }else{
                    say 'fail';
                }
            }
        }
        say " (total $count)";
    }
    say " (total $count)";
}
    
# Old-site variant: submit the password form once per configured
# password; abort (exit 8) on any failed submission.
sub unlock {
    my $self = shift;
    my $mech = $self->{mech};
    my $form = $self->{yaml}->{form};
    my $passwords = $self->{yaml}->{passwords};

    foreach my $password ( @$passwords ){
        say "  $self->{yaml}->{name} $password->{name} $password->{password}";
        # NOTE(review): no form_name is passed here (unlike the parent
        # class), so Mechanize submits the page's current/first form —
        # confirm the old site has only one form.
        $mech->submit_form( fields => { $form->{input} => $password->{password} } );
        $self->abort(Dumper $mech->response) unless $mech->success;
        say "   -> Successfully Authorized";
    }
}

1;
__END__

*1:イベント名や撮影日時も出力されます。