これも、smart.fmで公開したスクリプトです。こちらの方は、特に引数などはありません。
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl
# Walks the Yahoo! Japan TOEIC daily test (Part 5) question by question and
# writes one CSV row per question to a CP932-encoded file.
# Takes no command-line arguments.
use strict;
use warnings;
use utf8;
use WWW::Mechanize;
use Text::CSV_XS;
use Web::Scraper;

# First question page of the series, and the CSV file that gets (over)written.
my $start_url   = 'http://stepup.yahoo.co.jp/english/toeictest/question.html?c=5&c1=1';
my $output_file = 'yahootoeic_part5.csv';

# Filter shared by the free-text fields: maps WAVE DASH (U+301C) to FULLWIDTH
# TILDE (U+FF5E) and MINUS SIGN (U+2212) to FULLWIDTH HYPHEN-MINUS (U+FF0D).
# Presumably needed because the output is encoded as CP932 -- TODO confirm.
my $swap_dashes = sub {tr/\x{301c}\x{2212}/\x{ff5e}\x{ff0d}/; $_;};

# Extraction rules for an answer page. Each rule stores its result under the
# given key of the hash returned by scrape().
my $scraper = scraper {
    # NOTE(review): if one of these two patterns ever fails to match, $1 is
    # not reset and may carry a value from an earlier match -- confirm that
    # the heading always has the "PartN - M" shape.
    process '//div[@id="answer"]/h2', 'part' => ['TEXT', sub {m/Part([1-7])/; $1;}];
    process '//div[@id="answer"]/h2', 'no' => ['TEXT', sub {m/Part[1-7] *- *([0-9]+)/; $1;}];
    process '//div[@id="a-question"]/dl[1]/dt', 'question' => 'TEXT';
    process '//div[@id="a-question"]/dl[1]/dd/ul/li', 'choices[]' => 'TEXT';
    process '//dl[@id="a-match"]/dd[2]', 'answer' => 'TEXT';
    process '//div[@id="a-sentence"]/dl[1]/dd', 'translation' => ['TEXT', $swap_dashes];
    process '//div[@id="a-sentence"]/dl[2]/dd', 'description' => ['TEXT', $swap_dashes];
};

my $csv = Text::CSV_XS->new({binary => 1})
    or die "Cannot create CSV writer: " . Text::CSV_XS->error_diag() . "\n";

# autocheck makes failed requests and missing links fatal instead of silent.
my $mech = WWW::Mechanize->new(autocheck => 1);
$mech->get($start_url);

open my $file, '>:encoding(cp932)', $output_file    # force output in CP932
    or die "Cannot open $output_file: $!";
print "$output_file: Now scraping...\n";

for (;;) {
    # Pause before each request so the site is not hit in a tight loop.
    sleep 1;

    # Choosing answer "A" leads to the answer page, which carries the
    # question, the choices, the correct answer and the explanation.
    $mech->follow_link(text => 'A', url_regex => qr/.*answer\.html.*/);
    my $scrap = $scraper->scrape($mech->content, $mech->uri);

    # A page without any choice items leaves 'choices' undefined; fall back
    # to an empty list instead of dying on the dereference.
    my @row = (
        $scrap->{part},
        $scrap->{no},
        $scrap->{question},
        @{ $scrap->{choices} || [] },
        $scrap->{answer},
        $scrap->{translation},
        $scrap->{description},
    );

    unless ($csv->combine(@row)) {
        my $diag = $csv->error_diag();
        die "Cannot build CSV row: $diag\n";
    }
    print {$file} $csv->string() . "\n"
        or die "Cannot write to $output_file: $!";

    sleep 1;

    # Stop when the answer page has no "next question" link.
    my $link = $mech->find_link(text => '次の問題', url_regex => qr/.*question\.html.*/);
    last unless defined $link;
    $mech->get($link);
}

# Buffered write errors only surface at close, so check it.
close $file or die "Cannot close $output_file: $!";

__END__

=head1 NAME

makecsv_yahootoeic5 - A script to download "Yahoo! Japan TOEIC Daily Test" as CSV files.

=head1 SYNOPSIS

makecsv_yahootoeic5

=head1 DESCRIPTION

This script writes the Part 5 questions of the Yahoo! Japan TOEIC Daily Mini
Test (L<http://stepup.yahoo.co.jp/english/toeictest/>) to a CSV file.

=head1 AUTHOR

en45masao

=cut
Gistに上げたコードをJavaScriptでblogに貼り付けられるのは便利ですが、やはり読み込みにちょっと時間がかかってしまいますね。なので、普段は「もっと読む」を使って隠しておくのがよさそうです。
0 件のコメント:
コメントを投稿