root/lang/perl/plagger/assets/plugins/CustomFeed-Script/www_nhk_or_jp_gogaku_english_business1.pl

Revision 16141, 1.6 kB (checked in by poppen, 12 months ago)

import Web::Scraper script for http://www.nhk.or.jp/gogaku/english/business1/index.html

  • Property svn:executable set to *
Line 
1#!/usr/bin/env perl
2
3use strict;
4use warnings;
5
6use utf8;
7use Web::Scraper;
8use URI;
9use YAML;
10use DateTime;
11
# Page that is fetched and scraped; it is also stored as the feed's link.
my $page_uri = URI->new("http://www.nhk.or.jp/gogaku/english/business1/index.html");

# One enclosure per anchor; to_string() turns the extracted href into a
# plain string.
my $enclosure_scraper = scraper {
    process('//a', 'url' => ['@href', \&to_string]);
};

# One entry per audio-link block. The anchor text is used three times:
# verbatim as title and body, and run through mk_date() for the date.
my $entry_scraper = scraper {
    process('//a/text()', 'title' => 'TEXT');
    process('//a/text()', 'body'  => 'TEXT');
    process('//a/text()', 'date'  => ['TEXT', \&mk_date]);
    process('//a', 'enclosure[]'  => $enclosure_scraper);
};

# Feed-level fields, plus the list of entries.
# A stricter entry selector that was tried earlier and left disabled:
#   '//div[@id="eng-bus-audionow"]/div[@class="eng-bus-audnwlink" and p ]'
my $scraper = scraper {
    process('//h1/img', 'title' => '@alt');
    process('//h1/img', 'image' => '@src');
    process('//div[@id="con-procontent"]/p[1]', 'description' => 'TEXT');
    process(
        '//div[@id="eng-bus-audionow"]/div[@class="eng-bus-audnwlink"]',
        'entry[]' => $entry_scraper,
    );
};

my $result = $scraper->scrape($page_uri);
$result->{link} = $page_uri;

# Emit the scraped structure as YAML on a UTF-8 STDOUT.
binmode(STDOUT, ":utf8");
print Dump($result);
37
# Filter callback: flatten a URI-like object into a plain string.
#
# Parameters: the value to convert, taken from the first argument; when the
#             sub is called without arguments the current $_ is used instead,
#             so existing callers that only set $_ keep working.
# Returns:    the result of ->as_string for objects that provide it; any
#             other value (plain string, undef, unblessed reference) is
#             returned unchanged instead of dying.
sub to_string {
    my $value = @_ ? $_[0] : $_;

    # ->can dies on unblessed references; the eval turns that into "no".
    my $has_as_string = ref $value && eval { $value->can('as_string') };

    return $value unless $has_as_string;
    return $value->as_string;
}
41
# Filter callback: turn a Japanese "M月D日" label into a YYYY-MM-DD date.
#
# The text carries no year, so the month/day is tried in the current,
# previous and next year (relative to today in Asia/Tokyo) and the candidate
# with the fewest days between it and today wins. On a tie the current year
# is preferred, then the previous one.
#
# Parameters: $input - text searched for the first "<digits>月<digits>日".
# Returns:    the chosen date as a 'YYYY-MM-DD' string, or nothing (empty
#             list / undef) when the input is undef, contains no such label,
#             or the month/day is not a valid date in any candidate year.
sub mk_date {
    my ($input) = @_;
    return unless defined $input;

    my ($month, $day) = $input =~ m!(\d+)月(\d+)日!
        or return;

    my $today = DateTime->now(time_zone => 'Asia/Tokyo')->truncate(to => 'day');
    my $this_year = $today->year;

    my ($best, $best_distance);
    for my $year ($this_year, $this_year - 1, $this_year + 1) {
        # Build each candidate independently: a date such as Feb 29 exists
        # only in some years, and the constructor dies for the others.
        local $@;
        my $candidate = eval {
            DateTime->new(
                year      => $year,
                month     => $month,
                day       => $day,
                time_zone => 'Asia/Tokyo',
            );
        };
        next unless defined $candidate;

        # delta_days yields an absolute, days-only duration.
        my $distance = $today->delta_days($candidate)->in_units('days');

        # Strict comparison keeps the earlier candidate on ties.
        next if defined $best_distance && $distance >= $best_distance;
        ($best, $best_distance) = ($candidate, $distance);
    }

    return unless defined $best;
    return $best->ymd;
}
Note: See TracBrowser for help on using the browser.