root/lang/perl/plagger/lib/Plagger/Plugin/Subscription/Parse.pm

Revision 268, 6.8 kB (checked in by nyarla, 15 months ago)

lang/perl/plagger: I imported making plugin files.

Line 
1package Plagger::Plugin::Subscription::Parse;
2
3use strict;
4use warnings;
5
6use URI;
7use Encode ();
8use HTML::TreeBuilder::XPath;
9use HTML::Entities ();
10use Plagger::Util;
11use Plagger::Tag;
12use Plagger::Feed;
13
14use base qw( Plagger::Plugin );
15
16sub register {
17    my ( $self, $c ) = @_;
18    $c->register_hook(
19        $self,
20        'subscription.load' => $self->can('load'),
21    );
22}
23
24sub load {
25    my ( $self, $c ) = @_;
26
27    my $confs = $self->conf->{'subscribe'};
28       $confs = [ $confs ] if ( ref $confs ne 'ARRAY' );
29
30    my $default = $self->conf->{'default'};
31    $default->{'xpath'} ||= {
32        url     => '//a/@href',
33        title   => '//a/@title',
34    };
35
36    for my $conf ( @{ $confs } ) {
37        $conf = { target => $conf } if ( ! ref $conf );
38
39        my $uri = $conf->{'target'}
40            or $c->log( error => "Target url is missing." ) and next;
41
42        Encode::_utf8_off( $uri );
43
44        for my $name ( qw( xpath regexp after_hook ) ) {
45            $conf->{$name} ||= $default->{$name};
46        }
47
48        my $meta = {};
49        $meta->{'link'} = $conf->{'target'};
50        for my $name ( qw( title meta tag ) ) {
51            $meta->{$name} = $conf->{$name} || $default->{$name};
52        }
53
54        if ( $meta->{'tag'} && ! ref $meta->{'tag'} ) {
55            my $tags = [ Plagger::Tag->parse( $meta->{'tag'} ) ];
56            $meta->{'tag'} = $tags;
57        }
58
59        $c->log( info => "Parse $uri" );
60        my $xhtml = Plagger::Util::load_uri( URI->new( $uri ), $self );
61        my $title = Plagger::Util::extract_title( $xhtml );
62        $meta->{'title'} ||= $title || $meta->{'link'};
63
64        my $datas;
65        if ( defined $conf->{'xpath'} ) {
66            $datas = $self->find_node( $c, $conf->{'target'}, $xhtml, $conf->{'xpath'} )
67                or next;
68        }
69        else {
70            $datas = $self->parse( $c, $conf->{'target'}, $xhtml, $conf->{'regexp'} )
71                or next;
72        }
73
74        if ( $conf->{'after_hook'} ) {
75            for my $data ( @{ $datas } ) {
76                eval $conf->{'after_hook'};
77                $c->error( $@ ) if ( $@ );
78            }
79        }
80
81        for my $feed ( $self->create_feed( $c, $datas, $meta ) ) {
82            $c->subscription->add( $feed );
83        }
84    }
85}
86
87sub find_node {
88    my ( $self, $c, $url, $xhtml, $conf ) = @_;
89    if ( ! $conf->{'url'} ) {
90        $c->log( error => "'url' XPath is not specified. :$url" );
91        return;
92    }
93
94    my $tree = HTML::TreeBuilder::XPath->new;
95    $tree->parse( $xhtml );
96    $tree->eof;
97
98    no warnings 'redefine';
99    local *HTML::Element::_xml_escape = $self->can('xml_escape');
100    use warnings;
101
102    my $data = [];
103
104    for my $capture ( keys %{ $conf } ) {
105        next if ( ! $conf->{$capture} );
106        my $index = 0;
107        for my $child ( $tree->findnodes( $conf->{$capture} ) ) {
108            my $value = ( $child->isElementNode ) ? $child->as_XML : $child->getValue ;
109            $data->[$index] = {} if ( ! ref $data->[$index] );
110            $data->[$index]->{$capture} = $value;
111            $index++;
112        }
113    }
114
115    return $data;
116}
117
118sub parse {
119    my ( $self, $c, $url, $xhtml, $conf ) = @_;
120
121    my @capture = split m{\s+}, $conf->{'capture'};
122
123    if ( ! @capture ) {
124        $c->log( error => "There is no available 'capture'. :$url" );
125        return;
126    }
127
128    if ( ! $conf->{'extract'} ) {
129        $c->log( error => "'extrace' is not specified. :$url" );
130        return;
131    }
132
133    my $extract = Plagger::Util::decode_content( $conf->{'extract'} );
134    my ( $cur_pos, $prev_pos ) = ( 0, 0 );
135
136    my $data = [];
137    my $index = 0;
138
139    while ( 1 ) {
140        if ( $xhtml =~ m{$extract}sg ) {
141            $cur_pos = pos $xhtml;
142            my $str = substr $xhtml, $prev_pos, length( $xhtml );
143            if ( my @matched = ( $xhtml =~ m{$extract}s ) ) {
144                for my $name ( @capture ) {
145                    $data->[$index] = {} if ( ! ref $data->[$index] );
146                    $data->[$index]->{$name} = shift @matched;
147                }
148                $index++;
149                $prev_pos = $cur_pos;
150            }
151            else {
152                last;
153            }
154        }
155        else {
156            last;
157        }
158    }
159
160    return $data;
161}
162
163sub create_feed {
164    my ( $self, $c, $datas, $meta ) = @_;
165
166    my @feeds = ();
167    for my $data ( @{ $datas } ) {
168        my $feed = Plagger::Feed->new;
169
170        $feed->url( URI->new_abs( $data->{'url'}, $meta->{'link'} ) );
171        $feed->title( $data->{'title'} || $meta->{'title'} );
172
173        $feed->link( $meta->{'link'} );
174
175        $feed->meta( $meta->{'meta'} ) if ( $meta->{'meta'} );
176        $feed->tags( $meta->{'tag'} )  if ( $meta->{'tag'}  );
177
178        push @feeds, $feed;
179    }
180
181    return @feeds;
182}
183
184sub xml_escape {
185    for my $x ( @_ ) {
186        $x = Plagger::Util::encode_xml( $x );
187    }
188}
189
1901;
191__END__
192
193=head1 NAME
194
195Plagger::Plugin::Subscription::Parse - Use XPath or Regexp to extract subscriptions from web pages
196
197=head1 SYNOPSIS
198
199  - module: Subscription::Parse
200    config:
201      subscribe:
202        - target: http://d.hatena.ne.jp/antipop/20050628/1119966355
203          xpath:
204            url: //ul[@class="xoxo" or @class="subscriptionlist"]//a/@href
205            title: //ul[@class="xoxo" or @class="subscriptionlist"]//a/@title
206
207=head1 DESCRIPTION
208
209This plugin extracets subscriptions to XHTML content,
210using XPath or Regular expression to find links.
211
212=head1 CONFIG
213
214=head2 subscribe
215
216=over 7
217
218=item target
219
220URI of the analyzed web page
221
222=item xpath
223
224XPath expression for each data
225
226=over 3
227
228=item url
229
230XPath expression which extracts subscriptions URI out
231
232=item title
233
234XPath expression which extracts subscriptions title out
235
236=item other name
237
238When specified XPath expression besides the name of two above,
239it's stocked in C<$data->{'name'}>.
240
241=back
242
243=item regexp
244
245Regular expression to extract each data out.
246
247=over 2
248
249=item extract
250
251Regular expression to extract data out
252
253=item capture
254
255The name of text that matches to the regular expression is specified.
256It is delimited in space.
257
258Data is stocked C<$data->{$name}>.
259
260=back
261
262=item after_hook
263
264Perl Code to process the acquired data.
265
266=item title
267
268feed title
269
270=item meta
271
272feed meta data.
273
274=item tag
275
276feed tags.
277
278=back
279
280=head2 default
281
282Value of default when there is no individual specification
283
284=over 5
285
286=item xpath
287
288=item regexp
289
290=item after_hook
291
292=item meta
293
294=item tag
295
296=back
297
298=head1 AUTHOR
299
300Naoki Okamura (Nyarla,) E<lt>thotep@nyarla.netE<gt>
301
302=head1 LICENSE
303
304This Plug-in is free software; you can redistribute it and/or modify it under the same terms as Perl itself.
305
306=head1 SEE ALSO
307
308L<Plagger>
309
310=cut
Note: See TracBrowser for help on using the browser.