root/lang/perl/plagger/assets/plugins/CustomFeed-Script/comiclist_jp.pl @ 38817

Revision 38817, 3.1 kB (checked in by sweetpotato, 2 years ago)

add plugins/CustomFeed-Script/comiclist_jp.pl

  • Property svn:executable set to *
Line 
1#!/usr/bin/env perl
2use strict;
3use warnings;
4use utf8;
5
6use DateTime;
7use DateTime::Duration;
8use Encode qw( decode );
9use URI;
10use Web::Scraper 0.22;
11use YAML;
12
13my $url = 'http://comiclist.jp/index.php?p=n';
14my $s = scraper {
15    process '//table[@class="list_font"]/tr[position()>1]', 'entry[]' => scraper {
16        process '/tr/td[1]',
17            date => [
18                'text',
19                sub {
20                    m!^(.*?)/(.*?)$! or return;
21                    return +{ month => $1, day => $2 };
22                },
23                \&mk_date,
24            ],
25            part => ['text', sub { s!^.*?/!!; $_ } ];
26        process '/tr/td[2]', publisher => 'text';
27        process '/tr/td[3]', title => 'text';
28        process '/tr/td[4]', author => 'text';
29        process '/tr/td[5]', price => 'text';
30        process '/tr/td[6]', note => ['text', sub { s/^\s+|\s+$//g; $_ } ];
31        result qw( date part publisher title author price note );
32    };
33    result qw( entry );
34};
35$s->user_agent->env_proxy;
36
37### get the number of pages
38my $res = $s->user_agent->get($url);
39$res->is_success or die "GET $url failed: " . $res->status_line;
40my $content = decode('cp932', $res->content);
41my $num_pages = 0;
42while($content =~ /<input[^>]+name="pg_num"[^>]+value="(\d+)"/sg) {
43    $num_pages = $1 if $num_pages < $1;
44}
45
46### get pages by POST method
47my @entry;
48for(my $i = 1; $i <= $num_pages; ++$i) {
49    my $res = $s->user_agent->post($url, [pg_num => $i]);
50    $res->is_success or die "POST $url failed: " . $res->status_line;
51    my $content = decode('cp932', $res->content);
52    push @entry, @{ $s->scrape($content) };
53}
54
55### make body
56for my $e (@entry) {
57    # 1. base: author, publisher, and price
58    $e->{body} = join ', ', map { $e->{$_} } qw( author publisher price );
59    # 2. add part if exists
60    $e->{body} = join ', ', map { $e->{$_} } qw( part body ) unless $e->{part} =~ /^\d+$/;
61    # 3. add note if exists
62    $e->{body} = join ', ', map { $e->{$_} } qw( body note ) if $e->{note};
63    # 4. delete unnecessary elements
64    delete $e->{$_} for qw( part publisher price note );
65}
66
67binmode STDOUT, ':utf8';
68print Dump +{
69    title => 'comiclist.jp コミック発売予定一覧',
70    link  => $url,
71    entry => \@entry,
72};
73
74sub mk_date {
75    return unless defined $_;
76    $_->{day} = &part_to_day($_->{day});
77    my $date = &guess_year($_);
78    return $date->ymd;
79}
80
81sub part_to_day {
82    my $part = shift;
83    return unless defined $part;
84    return $part if $part =~ /^\d+$/;
85    return 21 if $part =~ /下/;
86    return 11 if $part =~ /中/;
87    return 1;
88}
89
90sub guess_year {
91    my $value = shift;
92    return unless defined $value;
93    my $now  = DateTime->now(time_zone => 'Asia/Tokyo');
94    my $this = $now->clone->truncate(to => 'year')->set(%$value);
95    my $last = $this->clone->subtract(years => 1);
96    my $next = $this->clone->add(years => 1);
97    my @date = sort { DateTime::Duration->compare($a->[1], $b->[1], $now) }
98               map { [$_->[0], $_->[1]->is_positive ? $_->[1] : $_->[1]->inverse ] }
99               map { [$_, $now - $_] } ($this, $last, $next);
100    $date[0]->[0];
101}
102
103__END__
Note: See TracBrowser for help on using the browser.