root/lang/perl/WWW-CommentGetter/trunk/WWW-CommentGetter/lib/WWW/CommentGetter.pm @ 1957

Revision 1957, 3.1 kB (checked in by tokuhirom, 6 years ago)

CommentGetter: split siteinfo to sharedir.

Line 
1package WWW::CommentGetter;
2use strict;
3use warnings;
4use Carp;
5use HTTP::Response::Encoding;
6use Encode;
7use LWP::UserAgent;
8use HTML::TreeBuilder::XPath;
9use File::ShareDir 'module_file';
10use JSON::Any;
11
12our $VERSION = '0.01';
13
14sub new {
15    my ($class, ) = @_;
16    bless {}, $class;
17}
18
19sub _load_site_info {
20    my $self = shift;
21    $self->{__siteinfos} ||= do {
22        my $fname = module_file(__PACKAGE__, 'siteinfo.json');
23
24        open my $fh, '<', $fname or die $!;
25        my $src = do { local $/; join '', <$fh> };
26        close $fh;
27
28        JSON::Any->jsonToObj($src);
29    };
30}
31
32sub get {
33    my ($self, $url) = @_;
34
35    my $siteinfos = $self->_load_site_info;
36    for my $siteinfo (@$siteinfos) {
37        my $ret = scrape_by_siteinfo($url => $siteinfo);
38        return $ret if $ret;
39    }
40    return;
41}
42
43sub scrape_by_siteinfo {
44    my ($url, $siteinfo) = @_;
45
46    return unless $url =~ $siteinfo->{url};
47
48    my $ua = LWP::UserAgent->new(agent => __PACKAGE__);
49    my $res = $ua->get($url);
50    unless ($res->is_success) {
51        croak "cannot get $url";
52    }
53    my $content = decode $res->encoding, $res->content;
54
55    my $tree = HTML::TreeBuilder::XPath->new;
56    $tree->parse($content);
57
58    my $cnt = 1;
59    my @nodes = $tree->findnodes($siteinfo->{context});
60    my $stash = [];
61    for my $node (@nodes) {
62        my $s = {};
63        while (my ($key, $local_xpath) = each %{$siteinfo->{attributes}}) {
64            my $subattr = '';
65            if ($local_xpath =~ s/^substring\(([^,]+)([,\s\d]+)\)$/$1/) {
66                $subattr = $2;
67            }
68            warn "get $key by $local_xpath" if $ENV{DEBUG};
69            my $xpath = join '', $siteinfo->{context}, "[$cnt]", $local_xpath;
70            if ($subattr) {
71                $xpath = "substring(string($xpath)$subattr)";
72            }
73            warn $xpath if $ENV{DEBUG};
74            if ($subattr) {
75                $s->{$key} = trim($tree->findvalue($xpath));
76            } else {
77                $s->{$key} = as_text($tree->findnodes($xpath)->[0]);
78            }
79        }
80        push @$stash, $s;
81
82        $cnt++;
83    }
84
85    $tree = $tree->delete;
86
87    return $stash;
88}
89
90sub as_text {
91    my $element = shift;
92
93    if ($element->isTextNode) {
94        trim($element->string_value);
95    } else {
96        trim($element->as_text);
97    }
98}
99
100sub trim {
101    local $_ = shift;
102    s/^\s*//;
103    s/\s*$//;
104    return $_;
105}
106
1071;
108
109__END__
110
111=head1 NAME
112
113WWW::CommentGetter - scraping comments from blogs
114
115=head1 SYNOPSIS
116
117  use WWW::CommentGetter;
118  my $wcg = WWW::CommentGetter->new();
119  $wcg->get('http://blog.livedoor.jp/dankogai/archives/50935001.html');
120
121=head1 DESCRIPTION
122
123get the comment from blogs.
124
125=head1 METHODS
126
127=head2 new
128
129  my $wcg = WWW::CommentGetter->new('/path/to/assets/');
130
131constructer.
132
133=head2 get
134
135  $wcg->get('http://blog.livedoor.jp/dankogai/archives/50935001.html');
136
137get comments from permalink.
138
139=head1 AUTHOR
140
141Tokuhiro Matsuno <tokuhirom aaaatttt gmail dotottto commmmm>
142
143=head1 COPYRIGHT
144
145This program is free software; you can redistribute
146it and/or modify it under the same terms as Perl itself.
147
148The full text of the license can be found in the
149LICENSE file included with this module.
150
151=cut
Note: See TracBrowser for help on using the browser.