| | 24 | sub scrape_by_siteinfo { |
| | 25 | my ($url, $siteinfo) = @_; |
| | 26 | |
| | 27 | return unless $url =~ $siteinfo->{url}; |
| | 28 | |
| | 29 | my $ua = LWP::UserAgent->new; |
| | 30 | my $res = $ua->get($url); |
| | 31 | unless ($res->is_success) { |
| | 32 | croak "cannot get $url"; |
| | 33 | } |
| | 34 | my $content = decode $res->encoding, $res->content; |
| | 35 | |
| | 36 | my $tree = HTML::TreeBuilder::XPath->new; |
| | 37 | $tree->parse($content); |
| | 38 | |
| | 39 | my $cnt = 1; |
| | 40 | my @nodes = $tree->findnodes($siteinfo->{context}); |
| | 41 | my $stash = []; |
| | 42 | for my $node (@nodes) { |
| | 43 | my $s = {}; |
| | 44 | while (my ($key, $local_xpath) = each %{$siteinfo->{attributes}}) { |
| | 45 | my $xpath = join '', $siteinfo->{context}, "[$cnt]", $local_xpath; |
| | 46 | $s->{$key} = as_text($tree->findnodes($xpath)->[0]); |
| | 47 | } |
| | 48 | push @$stash, $s; |
| | 49 | |
| | 50 | $cnt++; |
| | 51 | } |
| | 52 | |
| | 53 | $tree = $tree->delete; |
| | 54 | |
| | 55 | return $stash; |
| | 56 | } |
| | 57 | |
| | 58 | sub as_text { |
| | 59 | my $element = shift; |
| | 60 | |
| | 61 | if ($element->isTextNode) { |
| | 62 | trim $element->string_value; |
| | 63 | } else { |
| | 64 | trim $element->as_text; |
| | 65 | } |
| | 66 | } |
| | 67 | |
| 26 | | sub scrape_by_siteinfo { |
| 27 | | my ($url, $siteinfo) = @_; |
| 28 | | |
| 29 | | return unless $url =~ $siteinfo->{url}; |
| 30 | | |
| 31 | | my $ret = scraper { |
| 32 | | process $siteinfo->{context}, 'comments[]' => scraper { |
| 33 | | my $x = shift; |
| 34 | | while (my ($key, $xpath) = each %{$siteinfo->{attributes}}) { |
| 35 | | process $xpath, $key, sub { |
| 36 | | my $element = shift; |
| 37 | | trim $element->as_text; |
| 38 | | }; |
| 39 | | } |
| 40 | | }; |
| 41 | | }->scrape(URI->new($url)); |
| 42 | | |
| 43 | | return [ @{ $ret->{comments} } ]; |
| 44 | | } |