| 1 | package WWW::CommentGetter; |
|---|
| 2 | use strict; |
|---|
| 3 | use warnings; |
|---|
| 4 | use Carp; |
|---|
| 5 | use HTTP::Response::Encoding; |
|---|
| 6 | use Encode; |
|---|
| 7 | use LWP::UserAgent; |
|---|
| 8 | use HTML::TreeBuilder::XPath; |
|---|
| 9 | use File::ShareDir 'module_file'; |
|---|
| 10 | use JSON::Any; |
|---|
| 11 | |
|---|
| 12 | our $VERSION = '0.01'; |
|---|
| 13 | |
|---|
| 14 | sub new { |
|---|
| 15 | my ($class, ) = @_; |
|---|
| 16 | bless {}, $class; |
|---|
| 17 | } |
|---|
| 18 | |
|---|
| 19 | sub _load_site_info { |
|---|
| 20 | my $self = shift; |
|---|
| 21 | $self->{__siteinfos} ||= do { |
|---|
| 22 | my $fname = module_file(__PACKAGE__, 'siteinfo.json'); |
|---|
| 23 | |
|---|
| 24 | open my $fh, '<', $fname or die $!; |
|---|
| 25 | my $src = do { local $/; join '', <$fh> }; |
|---|
| 26 | close $fh; |
|---|
| 27 | |
|---|
| 28 | JSON::Any->jsonToObj($src); |
|---|
| 29 | }; |
|---|
| 30 | } |
|---|
| 31 | |
|---|
| 32 | sub get { |
|---|
| 33 | my ($self, $url) = @_; |
|---|
| 34 | |
|---|
| 35 | my $siteinfos = $self->_load_site_info; |
|---|
| 36 | for my $siteinfo (@$siteinfos) { |
|---|
| 37 | my $ret = scrape_by_siteinfo($url => $siteinfo); |
|---|
| 38 | return $ret if $ret; |
|---|
| 39 | } |
|---|
| 40 | return; |
|---|
| 41 | } |
|---|
| 42 | |
|---|
| 43 | sub scrape_by_siteinfo { |
|---|
| 44 | my ($url, $siteinfo) = @_; |
|---|
| 45 | |
|---|
| 46 | return unless $url =~ $siteinfo->{url}; |
|---|
| 47 | |
|---|
| 48 | my $ua = LWP::UserAgent->new(agent => __PACKAGE__); |
|---|
| 49 | my $res = $ua->get($url); |
|---|
| 50 | unless ($res->is_success) { |
|---|
| 51 | croak "cannot get $url"; |
|---|
| 52 | } |
|---|
| 53 | my $content = decode $res->encoding, $res->content; |
|---|
| 54 | |
|---|
| 55 | my $tree = HTML::TreeBuilder::XPath->new; |
|---|
| 56 | $tree->parse($content); |
|---|
| 57 | |
|---|
| 58 | my $cnt = 1; |
|---|
| 59 | my @nodes = $tree->findnodes($siteinfo->{context}); |
|---|
| 60 | my $stash = []; |
|---|
| 61 | for my $node (@nodes) { |
|---|
| 62 | my $s = {}; |
|---|
| 63 | while (my ($key, $local_xpath) = each %{$siteinfo->{attributes}}) { |
|---|
| 64 | my $subattr = ''; |
|---|
| 65 | if ($local_xpath =~ s/^substring\(([^,]+)([,\s\d]+)\)$/$1/) { |
|---|
| 66 | $subattr = $2; |
|---|
| 67 | } |
|---|
| 68 | warn "get $key by $local_xpath" if $ENV{DEBUG}; |
|---|
| 69 | my $xpath = join '', $siteinfo->{context}, "[$cnt]", $local_xpath; |
|---|
| 70 | if ($subattr) { |
|---|
| 71 | $xpath = "substring(string($xpath)$subattr)"; |
|---|
| 72 | } |
|---|
| 73 | warn $xpath if $ENV{DEBUG}; |
|---|
| 74 | if ($subattr) { |
|---|
| 75 | $s->{$key} = trim($tree->findvalue($xpath)); |
|---|
| 76 | } else { |
|---|
| 77 | $s->{$key} = as_text($tree->findnodes($xpath)->[0]); |
|---|
| 78 | } |
|---|
| 79 | } |
|---|
| 80 | push @$stash, $s; |
|---|
| 81 | |
|---|
| 82 | $cnt++; |
|---|
| 83 | } |
|---|
| 84 | |
|---|
| 85 | $tree = $tree->delete; |
|---|
| 86 | |
|---|
| 87 | return $stash; |
|---|
| 88 | } |
|---|
| 89 | |
|---|
| 90 | sub as_text { |
|---|
| 91 | my $element = shift; |
|---|
| 92 | |
|---|
| 93 | if ($element->isTextNode) { |
|---|
| 94 | trim($element->string_value); |
|---|
| 95 | } else { |
|---|
| 96 | trim($element->as_text); |
|---|
| 97 | } |
|---|
| 98 | } |
|---|
| 99 | |
|---|
| 100 | sub trim { |
|---|
| 101 | local $_ = shift; |
|---|
| 102 | s/^\s*//; |
|---|
| 103 | s/\s*$//; |
|---|
| 104 | return $_; |
|---|
| 105 | } |
|---|
| 106 | |
|---|
| 107 | 1; |
|---|
| 108 | |
|---|
| 109 | __END__ |
|---|
| 110 | |
|---|
| 111 | =head1 NAME |
|---|
| 112 | |
|---|
| 113 | WWW::CommentGetter - scraping comments from blogs |
|---|
| 114 | |
|---|
| 115 | =head1 SYNOPSIS |
|---|
| 116 | |
|---|
| 117 | use WWW::CommentGetter; |
|---|
| 118 | my $wcg = WWW::CommentGetter->new(); |
|---|
| 119 | $wcg->get('http://blog.livedoor.jp/dankogai/archives/50935001.html'); |
|---|
| 120 | |
|---|
| 121 | =head1 DESCRIPTION |
|---|
| 122 | |
|---|
| 123 | get the comment from blogs. |
|---|
| 124 | |
|---|
| 125 | =head1 METHODS |
|---|
| 126 | |
|---|
| 127 | =head2 new |
|---|
| 128 | |
|---|
| 129 | my $wcg = WWW::CommentGetter->new('/path/to/assets/'); |
|---|
| 130 | |
|---|
| 131 | constructer. |
|---|
| 132 | |
|---|
| 133 | =head2 get |
|---|
| 134 | |
|---|
| 135 | $wcg->get('http://blog.livedoor.jp/dankogai/archives/50935001.html'); |
|---|
| 136 | |
|---|
| 137 | get comments from permalink. |
|---|
| 138 | |
|---|
| 139 | =head1 AUTHOR |
|---|
| 140 | |
|---|
| 141 | Tokuhiro Matsuno <tokuhirom aaaatttt gmail dotottto commmmm> |
|---|
| 142 | |
|---|
| 143 | =head1 COPYRIGHT |
|---|
| 144 | |
|---|
| 145 | This program is free software; you can redistribute |
|---|
| 146 | it and/or modify it under the same terms as Perl itself. |
|---|
| 147 | |
|---|
| 148 | The full text of the license can be found in the |
|---|
| 149 | LICENSE file included with this module. |
|---|
| 150 | |
|---|
| 151 | =cut |
|---|