| 1 | #!/usr/bin/env perl |
|---|
| 2 | use strict; |
|---|
| 3 | use warnings; |
|---|
| 4 | use utf8; |
|---|
| 5 | |
|---|
| 6 | use DateTime; |
|---|
| 7 | use DateTime::Duration; |
|---|
| 8 | use Encode qw( decode ); |
|---|
| 9 | use URI; |
|---|
| 10 | use Web::Scraper 0.22; |
|---|
| 11 | use YAML; |
|---|
| 12 | |
|---|
| 13 | my $url = 'http://comiclist.jp/index.php?p=n'; |
|---|
| 14 | my $s = scraper { |
|---|
| 15 | process '//table[@class="list_font"]/tr[position()>1]', 'entry[]' => scraper { |
|---|
| 16 | process '/tr/td[1]', |
|---|
| 17 | date => [ |
|---|
| 18 | 'text', |
|---|
| 19 | sub { |
|---|
| 20 | m!^(.*?)/(.*?)$! or return; |
|---|
| 21 | return +{ month => $1, day => $2 }; |
|---|
| 22 | }, |
|---|
| 23 | \&mk_date, |
|---|
| 24 | ], |
|---|
| 25 | part => ['text', sub { s!^.*?/!!; $_ } ]; |
|---|
| 26 | process '/tr/td[2]', publisher => 'text'; |
|---|
| 27 | process '/tr/td[3]', title => 'text'; |
|---|
| 28 | process '/tr/td[4]', author => 'text'; |
|---|
| 29 | process '/tr/td[5]', price => 'text'; |
|---|
| 30 | process '/tr/td[6]', note => ['text', sub { s/^\s+|\s+$//g; $_ } ]; |
|---|
| 31 | result qw( date part publisher title author price note ); |
|---|
| 32 | }; |
|---|
| 33 | result qw( entry ); |
|---|
| 34 | }; |
|---|
| 35 | $s->user_agent->env_proxy; |
|---|
| 36 | |
|---|
| 37 | ### get the number of pages |
|---|
| 38 | my $res = $s->user_agent->get($url); |
|---|
| 39 | $res->is_success or die "GET $url failed: " . $res->status_line; |
|---|
| 40 | my $content = decode('cp932', $res->content); |
|---|
| 41 | my $num_pages = 0; |
|---|
| 42 | while($content =~ /<input[^>]+name="pg_num"[^>]+value="(\d+)"/sg) { |
|---|
| 43 | $num_pages = $1 if $num_pages < $1; |
|---|
| 44 | } |
|---|
| 45 | |
|---|
| 46 | ### get pages by POST method |
|---|
| 47 | my @entry; |
|---|
| 48 | for(my $i = 1; $i <= $num_pages; ++$i) { |
|---|
| 49 | my $res = $s->user_agent->post($url, [pg_num => $i]); |
|---|
| 50 | $res->is_success or die "POST $url failed: " . $res->status_line; |
|---|
| 51 | my $content = decode('cp932', $res->content); |
|---|
| 52 | push @entry, @{ $s->scrape($content) }; |
|---|
| 53 | } |
|---|
| 54 | |
|---|
| 55 | ### make body |
|---|
| 56 | for my $e (@entry) { |
|---|
| 57 | # 1. base: author, publisher, and price |
|---|
| 58 | $e->{body} = join ', ', map { $e->{$_} } qw( author publisher price ); |
|---|
| 59 | # 2. add part if exists |
|---|
| 60 | $e->{body} = join ', ', map { $e->{$_} } qw( part body ) unless $e->{part} =~ /^\d+$/; |
|---|
| 61 | # 3. add note if exists |
|---|
| 62 | $e->{body} = join ', ', map { $e->{$_} } qw( body note ) if $e->{note}; |
|---|
| 63 | # 4. delete unnecessary elements |
|---|
| 64 | delete $e->{$_} for qw( part publisher price note ); |
|---|
| 65 | } |
|---|
| 66 | |
|---|
| 67 | binmode STDOUT, ':utf8'; |
|---|
| 68 | print Dump +{ |
|---|
| 69 | title => 'comiclist.jp コミック発売予定一覧', |
|---|
| 70 | link => $url, |
|---|
| 71 | entry => \@entry, |
|---|
| 72 | }; |
|---|
| 73 | |
|---|
| 74 | sub mk_date { |
|---|
| 75 | return unless defined $_; |
|---|
| 76 | $_->{day} = &part_to_day($_->{day}); |
|---|
| 77 | my $date = &guess_year($_); |
|---|
| 78 | return $date->ymd; |
|---|
| 79 | } |
|---|
| 80 | |
|---|
| 81 | sub part_to_day { |
|---|
| 82 | my $part = shift; |
|---|
| 83 | return unless defined $part; |
|---|
| 84 | return $part if $part =~ /^\d+$/; |
|---|
| 85 | return 21 if $part =~ /下/; |
|---|
| 86 | return 11 if $part =~ /中/; |
|---|
| 87 | return 1; |
|---|
| 88 | } |
|---|
| 89 | |
|---|
| 90 | sub guess_year { |
|---|
| 91 | my $value = shift; |
|---|
| 92 | return unless defined $value; |
|---|
| 93 | my $now = DateTime->now(time_zone => 'Asia/Tokyo'); |
|---|
| 94 | my $this = $now->clone->truncate(to => 'year')->set(%$value); |
|---|
| 95 | my $last = $this->clone->subtract(years => 1); |
|---|
| 96 | my $next = $this->clone->add(years => 1); |
|---|
| 97 | my @date = sort { DateTime::Duration->compare($a->[1], $b->[1], $now) } |
|---|
| 98 | map { [$_->[0], $_->[1]->is_positive ? $_->[1] : $_->[1]->inverse ] } |
|---|
| 99 | map { [$_, $now - $_] } ($this, $last, $next); |
|---|
| 100 | $date[0]->[0]; |
|---|
| 101 | } |
|---|
| 102 | |
|---|
| 103 | __END__ |
|---|