| 1 | package Plagger::Plugin::Subscription::Parse;
|
|---|
| 2 |
|
|---|
| 3 | use strict;
|
|---|
| 4 | use warnings;
|
|---|
| 5 |
|
|---|
| 6 | use URI;
|
|---|
| 7 | use Encode ();
|
|---|
| 8 | use HTML::TreeBuilder::XPath;
|
|---|
| 9 | use HTML::Entities ();
|
|---|
| 10 | use Plagger::Util;
|
|---|
| 11 | use Plagger::Tag;
|
|---|
| 12 | use Plagger::Feed;
|
|---|
| 13 |
|
|---|
| 14 | use base qw( Plagger::Plugin );
|
|---|
| 15 |
|
|---|
| 16 | sub register {
|
|---|
| 17 | my ( $self, $c ) = @_;
|
|---|
| 18 | $c->register_hook(
|
|---|
| 19 | $self,
|
|---|
| 20 | 'subscription.load' => $self->can('load'),
|
|---|
| 21 | );
|
|---|
| 22 | }
|
|---|
| 23 |
|
|---|
| 24 | sub load {
|
|---|
| 25 | my ( $self, $c ) = @_;
|
|---|
| 26 |
|
|---|
| 27 | my $confs = $self->conf->{'subscribe'};
|
|---|
| 28 | $confs = [ $confs ] if ( ref $confs ne 'ARRAY' );
|
|---|
| 29 |
|
|---|
| 30 | my $default = $self->conf->{'default'};
|
|---|
| 31 | $default->{'xpath'} ||= {
|
|---|
| 32 | url => '//a/@href',
|
|---|
| 33 | title => '//a/@title',
|
|---|
| 34 | };
|
|---|
| 35 |
|
|---|
| 36 | for my $conf ( @{ $confs } ) {
|
|---|
| 37 | $conf = { target => $conf } if ( ! ref $conf );
|
|---|
| 38 |
|
|---|
| 39 | my $uri = $conf->{'target'}
|
|---|
| 40 | or $c->log( error => "Target url is missing." ) and next;
|
|---|
| 41 |
|
|---|
| 42 | Encode::_utf8_off( $uri );
|
|---|
| 43 |
|
|---|
| 44 | for my $name ( qw( xpath regexp after_hook ) ) {
|
|---|
| 45 | $conf->{$name} ||= $default->{$name};
|
|---|
| 46 | }
|
|---|
| 47 |
|
|---|
| 48 | my $meta = {};
|
|---|
| 49 | $meta->{'link'} = $conf->{'target'};
|
|---|
| 50 | for my $name ( qw( title meta tag ) ) {
|
|---|
| 51 | $meta->{$name} = $conf->{$name} || $default->{$name};
|
|---|
| 52 | }
|
|---|
| 53 |
|
|---|
| 54 | if ( $meta->{'tag'} && ! ref $meta->{'tag'} ) {
|
|---|
| 55 | my $tags = [ Plagger::Tag->parse( $meta->{'tag'} ) ];
|
|---|
| 56 | $meta->{'tag'} = $tags;
|
|---|
| 57 | }
|
|---|
| 58 |
|
|---|
| 59 | $c->log( info => "Parse $uri" );
|
|---|
| 60 | my $xhtml = Plagger::Util::load_uri( URI->new( $uri ), $self );
|
|---|
| 61 | my $title = Plagger::Util::extract_title( $xhtml );
|
|---|
| 62 | $meta->{'title'} ||= $title || $meta->{'link'};
|
|---|
| 63 |
|
|---|
| 64 | my $datas;
|
|---|
| 65 | if ( defined $conf->{'xpath'} ) {
|
|---|
| 66 | $datas = $self->find_node( $c, $conf->{'target'}, $xhtml, $conf->{'xpath'} )
|
|---|
| 67 | or next;
|
|---|
| 68 | }
|
|---|
| 69 | else {
|
|---|
| 70 | $datas = $self->parse( $c, $conf->{'target'}, $xhtml, $conf->{'regexp'} )
|
|---|
| 71 | or next;
|
|---|
| 72 | }
|
|---|
| 73 |
|
|---|
| 74 | if ( $conf->{'after_hook'} ) {
|
|---|
| 75 | for my $data ( @{ $datas } ) {
|
|---|
| 76 | eval $conf->{'after_hook'};
|
|---|
| 77 | $c->error( $@ ) if ( $@ );
|
|---|
| 78 | }
|
|---|
| 79 | }
|
|---|
| 80 |
|
|---|
| 81 | for my $feed ( $self->create_feed( $c, $datas, $meta ) ) {
|
|---|
| 82 | $c->subscription->add( $feed );
|
|---|
| 83 | }
|
|---|
| 84 | }
|
|---|
| 85 | }
|
|---|
| 86 |
|
|---|
| 87 | sub find_node {
|
|---|
| 88 | my ( $self, $c, $url, $xhtml, $conf ) = @_;
|
|---|
| 89 | if ( ! $conf->{'url'} ) {
|
|---|
| 90 | $c->log( error => "'url' XPath is not specified. :$url" );
|
|---|
| 91 | return;
|
|---|
| 92 | }
|
|---|
| 93 |
|
|---|
| 94 | my $tree = HTML::TreeBuilder::XPath->new;
|
|---|
| 95 | $tree->parse( $xhtml );
|
|---|
| 96 | $tree->eof;
|
|---|
| 97 |
|
|---|
| 98 | no warnings 'redefine';
|
|---|
| 99 | local *HTML::Element::_xml_escape = $self->can('xml_escape');
|
|---|
| 100 | use warnings;
|
|---|
| 101 |
|
|---|
| 102 | my $data = [];
|
|---|
| 103 |
|
|---|
| 104 | for my $capture ( keys %{ $conf } ) {
|
|---|
| 105 | next if ( ! $conf->{$capture} );
|
|---|
| 106 | my $index = 0;
|
|---|
| 107 | for my $child ( $tree->findnodes( $conf->{$capture} ) ) {
|
|---|
| 108 | my $value = ( $child->isElementNode ) ? $child->as_XML : $child->getValue ;
|
|---|
| 109 | $data->[$index] = {} if ( ! ref $data->[$index] );
|
|---|
| 110 | $data->[$index]->{$capture} = $value;
|
|---|
| 111 | $index++;
|
|---|
| 112 | }
|
|---|
| 113 | }
|
|---|
| 114 |
|
|---|
| 115 | return $data;
|
|---|
| 116 | }
|
|---|
| 117 |
|
|---|
| 118 | sub parse {
|
|---|
| 119 | my ( $self, $c, $url, $xhtml, $conf ) = @_;
|
|---|
| 120 |
|
|---|
| 121 | my @capture = split m{\s+}, $conf->{'capture'};
|
|---|
| 122 |
|
|---|
| 123 | if ( ! @capture ) {
|
|---|
| 124 | $c->log( error => "There is no available 'capture'. :$url" );
|
|---|
| 125 | return;
|
|---|
| 126 | }
|
|---|
| 127 |
|
|---|
| 128 | if ( ! $conf->{'extract'} ) {
|
|---|
| 129 | $c->log( error => "'extrace' is not specified. :$url" );
|
|---|
| 130 | return;
|
|---|
| 131 | }
|
|---|
| 132 |
|
|---|
| 133 | my $extract = Plagger::Util::decode_content( $conf->{'extract'} );
|
|---|
| 134 | my ( $cur_pos, $prev_pos ) = ( 0, 0 );
|
|---|
| 135 |
|
|---|
| 136 | my $data = [];
|
|---|
| 137 | my $index = 0;
|
|---|
| 138 |
|
|---|
| 139 | while ( 1 ) {
|
|---|
| 140 | if ( $xhtml =~ m{$extract}sg ) {
|
|---|
| 141 | $cur_pos = pos $xhtml;
|
|---|
| 142 | my $str = substr $xhtml, $prev_pos, length( $xhtml );
|
|---|
| 143 | if ( my @matched = ( $xhtml =~ m{$extract}s ) ) {
|
|---|
| 144 | for my $name ( @capture ) {
|
|---|
| 145 | $data->[$index] = {} if ( ! ref $data->[$index] );
|
|---|
| 146 | $data->[$index]->{$name} = shift @matched;
|
|---|
| 147 | }
|
|---|
| 148 | $index++;
|
|---|
| 149 | $prev_pos = $cur_pos;
|
|---|
| 150 | }
|
|---|
| 151 | else {
|
|---|
| 152 | last;
|
|---|
| 153 | }
|
|---|
| 154 | }
|
|---|
| 155 | else {
|
|---|
| 156 | last;
|
|---|
| 157 | }
|
|---|
| 158 | }
|
|---|
| 159 |
|
|---|
| 160 | return $data;
|
|---|
| 161 | }
|
|---|
| 162 |
|
|---|
| 163 | sub create_feed {
|
|---|
| 164 | my ( $self, $c, $datas, $meta ) = @_;
|
|---|
| 165 |
|
|---|
| 166 | my @feeds = ();
|
|---|
| 167 | for my $data ( @{ $datas } ) {
|
|---|
| 168 | my $feed = Plagger::Feed->new;
|
|---|
| 169 |
|
|---|
| 170 | $feed->url( URI->new_abs( $data->{'url'}, $meta->{'link'} ) );
|
|---|
| 171 | $feed->title( $data->{'title'} || $meta->{'title'} );
|
|---|
| 172 |
|
|---|
| 173 | $feed->link( $meta->{'link'} );
|
|---|
| 174 |
|
|---|
| 175 | $feed->meta( $meta->{'meta'} ) if ( $meta->{'meta'} );
|
|---|
| 176 | $feed->tags( $meta->{'tag'} ) if ( $meta->{'tag'} );
|
|---|
| 177 |
|
|---|
| 178 | push @feeds, $feed;
|
|---|
| 179 | }
|
|---|
| 180 |
|
|---|
| 181 | return @feeds;
|
|---|
| 182 | }
|
|---|
| 183 |
|
|---|
| 184 | sub xml_escape {
|
|---|
| 185 | for my $x ( @_ ) {
|
|---|
| 186 | $x = Plagger::Util::encode_xml( $x );
|
|---|
| 187 | }
|
|---|
| 188 | }
|
|---|
| 189 |
|
|---|
| 190 | 1;
|
|---|
| 191 | __END__
|
|---|
| 192 |
|
|---|
| 193 | =head1 NAME
|
|---|
| 194 |
|
|---|
| 195 | Plagger::Plugin::Subscription::Parse - Use XPath or Regexp to extract subscriptions from web pages
|
|---|
| 196 |
|
|---|
| 197 | =head1 SYNOPSIS
|
|---|
| 198 |
|
|---|
| 199 | - module: Subscription::Parse
|
|---|
| 200 | config:
|
|---|
| 201 | subscribe:
|
|---|
| 202 | - target: http://d.hatena.ne.jp/antipop/20050628/1119966355
|
|---|
| 203 | xpath:
|
|---|
| 204 | url: //ul[@class="xoxo" or @class="subscriptionlist"]//a/@href
|
|---|
| 205 | title: //ul[@class="xoxo" or @class="subscriptionlist"]//a/@title
|
|---|
| 206 |
|
|---|
| 207 | =head1 DESCRIPTION
|
|---|
| 208 |
|
|---|
| 209 | This plugin extracets subscriptions to XHTML content,
|
|---|
| 210 | using XPath or Regular expression to find links.
|
|---|
| 211 |
|
|---|
| 212 | =head1 CONFIG
|
|---|
| 213 |
|
|---|
| 214 | =head2 subscribe
|
|---|
| 215 |
|
|---|
| 216 | =over 7
|
|---|
| 217 |
|
|---|
| 218 | =item target
|
|---|
| 219 |
|
|---|
| 220 | URI of the analyzed web page
|
|---|
| 221 |
|
|---|
| 222 | =item xpath
|
|---|
| 223 |
|
|---|
| 224 | XPath expression for each data
|
|---|
| 225 |
|
|---|
| 226 | =over 3
|
|---|
| 227 |
|
|---|
| 228 | =item url
|
|---|
| 229 |
|
|---|
| 230 | XPath expression which extracts subscriptions URI out
|
|---|
| 231 |
|
|---|
| 232 | =item title
|
|---|
| 233 |
|
|---|
| 234 | XPath expression which extracts subscriptions title out
|
|---|
| 235 |
|
|---|
| 236 | =item other name
|
|---|
| 237 |
|
|---|
| 238 | When specified XPath expression besides the name of two above,
|
|---|
| 239 | it's stocked in C<$data->{'name'}>.
|
|---|
| 240 |
|
|---|
| 241 | =back
|
|---|
| 242 |
|
|---|
| 243 | =item regexp
|
|---|
| 244 |
|
|---|
| 245 | Regular expression to extract each data out.
|
|---|
| 246 |
|
|---|
| 247 | =over 2
|
|---|
| 248 |
|
|---|
| 249 | =item extract
|
|---|
| 250 |
|
|---|
| 251 | Regular expression to extract data out
|
|---|
| 252 |
|
|---|
| 253 | =item capture
|
|---|
| 254 |
|
|---|
| 255 | The name of text that matches to the regular expression is specified.
|
|---|
| 256 | It is delimited in space.
|
|---|
| 257 |
|
|---|
| 258 | Data is stocked C<$data->{$name}>.
|
|---|
| 259 |
|
|---|
| 260 | =back
|
|---|
| 261 |
|
|---|
| 262 | =item after_hook
|
|---|
| 263 |
|
|---|
| 264 | Perl Code to process the acquired data.
|
|---|
| 265 |
|
|---|
| 266 | =item title
|
|---|
| 267 |
|
|---|
| 268 | feed title
|
|---|
| 269 |
|
|---|
| 270 | =item meta
|
|---|
| 271 |
|
|---|
| 272 | feed meta data.
|
|---|
| 273 |
|
|---|
| 274 | =item tag
|
|---|
| 275 |
|
|---|
| 276 | feed tags.
|
|---|
| 277 |
|
|---|
| 278 | =back
|
|---|
| 279 |
|
|---|
| 280 | =head2 default
|
|---|
| 281 |
|
|---|
| 282 | Value of default when there is no individual specification
|
|---|
| 283 |
|
|---|
| 284 | =over 5
|
|---|
| 285 |
|
|---|
| 286 | =item xpath
|
|---|
| 287 |
|
|---|
| 288 | =item regexp
|
|---|
| 289 |
|
|---|
| 290 | =item after_hook
|
|---|
| 291 |
|
|---|
| 292 | =item meta
|
|---|
| 293 |
|
|---|
| 294 | =item tag
|
|---|
| 295 |
|
|---|
| 296 | =back
|
|---|
| 297 |
|
|---|
| 298 | =head1 AUTHOR
|
|---|
| 299 |
|
|---|
| 300 | Naoki Okamura (Nyarla,) E<lt>thotep@nyarla.netE<gt>
|
|---|
| 301 |
|
|---|
| 302 | =head1 LICENSE
|
|---|
| 303 |
|
|---|
| 304 | This Plug-in is free software; you can redistribute it and/or modify it under the same terms as Perl itself.
|
|---|
| 305 |
|
|---|
| 306 | =head1 SEE ALSO
|
|---|
| 307 |
|
|---|
| 308 | L<Plagger>
|
|---|
| 309 |
|
|---|
| 310 | =cut
|
|---|