root/lang/perl/HTML-Feature/trunk/lib/HTML/Feature.pm @ 17770

Revision 17770, 8.7 kB (checked in by miki, 6 years ago)

import HTML::Feature

Line 
1package HTML::Feature;
2use strict;
3use warnings;
4use vars qw($VERSION $UserAgent $engine @EXPORT_OK);
5use Exporter qw(import);
6use Carp;
7use HTTP::Response::Encoding;
8use Encode::Guess;
9use List::Util qw(first);
10use Scalar::Util qw(blessed);
11use UNIVERSAL::require;
12use URI;
13
14$VERSION   = '2.00005';
15@EXPORT_OK = qw(feature);
16
17sub new {
18    my $class = shift;
19    my %arg   = @_;
20    $class = ref $class || $class;
21    my $self = bless \%arg, $class;
22    $self->{enc_type} ||= 'utf8';
23
24    return $self;
25}
26
27sub parse {
28    my $self = shift;
29    my $obj  = shift;
30
31    if (! $obj) {
32        croak('Usage: parse( $uri | $http_response | $html_ref )');
33    }
34
35    my $pkg  = blessed($obj);
36    if (! $pkg) {
37        if (my $ref = ref $obj) {
38            # if it's a scalar reference, then we've been passed a piece of
39            # HTML code.
40            if ($ref eq 'SCALAR') {
41                return $self->parse_html( $obj, @_ );
42            }
43
44            # Otherwise we don't know how to handle
45            croak('Usage: parse( $uri | $http_response | $html_ref )');
46        }
47
48        # We seemed to have an unblessed scalar. Assume it's a URI
49        $pkg = 'URI';
50        $obj = URI->new($obj);
51    }
52
53    # If it's an object, then we can handle URI or HTTP::Response
54    if ($pkg->isa('URI')) {
55        return $self->parse_url( $obj, @_ );
56    } elsif ($pkg->isa('HTTP::Response')) {
57        return $self->parse_response( $obj, @_ );
58    } else {
59        croak('Usage: parse( $uri | $http_response | $html_ref )');
60    }
61}
62
63sub parse_url {
64    my $self = shift;
65    my $url  = shift;
66    my $ua   = $self->_user_agent();
67    my $res  = $ua->get($url);
68    $self->parse_response($res, @_);
69}
70
71sub parse_response {
72    my $self = shift;
73    my $res  = shift;
74    my $content = $self->_decode_response($res);
75    $self->_run( \$content, @_ );
76}
77
78sub parse_html {
79    my $self = shift;
80    my $html = shift;
81    my $html_ref = ref $html ? $html : \$html;
82    $self->_decode_htmlref( $html_ref );
83    $self->_run( $html_ref, @_ );
84}
85
86sub engine
87{
88    my $self   = shift;
89    my $engine = $self->{engine_obj};
90    if(! $engine){
91        my $engine_module = $self->{engine} ? $self->{engine} : 'TagStructure';
92        my $class = __PACKAGE__ . '::Engine::' . $engine_module;
93        $class->require or die $@;
94        $engine = $class->new;
95        $self->{engine_obj} = $engine;
96    }   
97    return $engine;
98}
99
100sub _run {
101    my $self     = shift;
102    my $html_ref = shift;
103    my $opts     = shift || {};
104
105    local $self->{element_flag} = exists $opts->{element_flag} ? $opts->{element_flag} : $self->{element_flag};
106    $self->engine->run($self, $html_ref);
107}
108
109sub _decode_response
110{
111    my $self = shift;
112    my $res  = shift;
113
114    my @encoding = (
115        $res->encoding,
116        # XXX - falling back to latin-1 may be risky. See Data::Decode
117        # could be multiple because HTTP response and META might be different
118        ( $res->header('Content-Type') =~ /charset=([\w\-]+)/g ),
119        "latin-1",
120    );
121    my $encoding =
122        first { defined $_ && Encode::find_encoding($_) } @encoding;
123    return Encode::decode( $encoding, $res->content );
124}
125
126sub _decode_htmlref
127{
128    my $self = shift;
129    my $html_ref = shift;
130
131    local $Encode::Guess::NoUTFAutoGuess = 1;
132    my $guess =
133        Encode::Guess::guess_encoding( $$html_ref,
134            ( 'shiftjis', 'euc-jp', '7bit-jis', 'utf8' ) );
135    unless ( ref $guess ) {
136        $$html_ref = Encode::decode( "latin-1", $$html_ref );
137    } else {
138        eval { $$html_ref = $guess->decode($$html_ref); };
139    }
140}
141
142sub _user_agent {
143    my $self = shift;
144    require LWP::UserAgent;
145    $UserAgent ||= LWP::UserAgent->new();
146    $self->{http_proxy} and $UserAgent->proxy( ['http'], $self->{http_proxy} );
147    $self->{timeout} and $UserAgent->timeout( $self->{timeout} );
148    return $UserAgent;
149}
150
151sub feature {
152    my $self   = __PACKAGE__->new;
153    my $result = $self->parse(@_);
154    my %ret    = (
155        text    => $result->text,
156        title   => $result->title,
157        desc    => $result->desc,
158        element => $result->element
159    );
160    return wantarray ? %ret : $ret{text};
161}
162
163sub extract {
164    warn
165"HTML::Feature::extract() has been deprecated. Use HTML::Feature::parse() instead";
166    my $self   = shift;
167    my %args   = @_;
168    my $result = $self->parse( $args{string} ? \$args{string} : $args{url} );
169    my $ret    = {
170        title       => $result->title,
171        description => $result->desc,
172        block       => [ { contents => $result->text } ],
173    };
174    return $ret;
175}
176
1771;
178
179__END__
180
181=head1 NAME
182
183HTML::Feature - Extract Feature Sentences From HTML Documents
184
185=head1 SYNOPSIS
186
187    use HTML::Feature;
188
189    my $f = HTML::Feature->new(enc_type => 'utf8');
190    my $result = $f->parse('http://www.perl.com');
191
192    print "Title:"        , $result->title(), "\n";
193    print "Description:"  , $result->desc(),  "\n";
194    print "Featured Text:", $result->text(),  "\n";
195
196
197    # you can get a HTML::Element object
198 
199    my $f = HTML::Feature->new();
200    my $result = $f->parse('http://www.perl.com',{element_flag => 1});
201    print "HTML Element:",  $result->element->as_HTML, "\n";
202    $result->element_delete();
203
204
205    # a simpler method is,
206
207    use HTML::Feature qw(feature);
208    print scalar feature('http://www.perl.com');
209
210    # very simple!
211
212
213=head1 DESCRIPTION
214
215This module extracst blocks of feature sentences out of an HTML document.
216
217Unlike other modules that performs similar tasks, this module by default
218extracts blocks without using morphological analysis, and instead it uses
219simple statistics processing.
220
221Because of this, HTML::Feature has an advantage over other similar modules
222in that it can be applied to documents in any language.
223
224=head1 METHODS
225
226=head2 new()
227
228    my $f = HTML::Feature->new(%param);
229    my $f = HTML::Feature->new(
230        engine => $class, # backend engine module (default: 'TagStructure')
231        max_bytes => 5000, # max number of bytes per node to analyze (default: '')
232        min_bytes => 10, # minimum number of bytes per node to analyze (default is '')
233        enc_type => 'euc-jp', # encoding of return values (default: 'utf-8')
234        http_proxy => 'http://proxy:3128', # http proxy server (default: '')
235        timeout => 10, # set the timeout value in seconds. (default: 180)
236        element_flag => 1, # flag of HTML::Element object as returned value (default: '')
237   );
238
239Instantiates a new HTML::Feature object. Takes the following parameters
240
241=over 4
242
243=item engine
244
245Specifies the class name of the engine that you want to use.
246
247HTML::Feature is designed to accept different engines to change its behavior.
248If you want to customize the behavior of HTML::Feature, specify your own
249engine in this parameter
250
251=back
252
253The rest of the arguments are directly passed to the HTML::Feature::Engine
254object constructor.
255
256=head2 parse()
257
258    my $result = $f->parse($url);
259    # or
260    my $result = $f->parse($html_ref);
261    # or
262    my $result = $f->parse($http_response);
263
264Parses the given argument. The argument can be either a URL, a string of HTML
265(must be passed as a scalar reference), or an HTTP::Response object.
266HTML::Feature will detect and delegate to the appropriate method (see below)
267
268=head2 parse_url($url)
269
270Parses an URL. This method will use LWP::UserAgent to fetch the given url.
271
272=head2 parse_html($html)
273
274Parses a string containing HTML.
275
276=head2 parse_response($http_response)
277
278Parses an HTTP::Response object.
279
280=head2 extract()
281
282    $data = $f->extract(url => $url);
283    # or
284    $data = $f->extract(string => $html);
285
286HTML::Feature::extract() has been deprecated and exists for backwards compatiblity only. Use HTML::Feature::parse() instead.
287
288extract() extracts blocks of feature sentences from the given document,
289and returns a data structure like this:
290
291    $data = {
292        title => $title,
293        description => $desc,
294        block => [
295            {
296                contents => $contents,
297                score => $score
298            },
299            .
300            .
301        ]
302    }
303
304=head2 feature
305
306feature() is a simple wrapper that does new(), parse() in one step.
307If you do not require complex operations, simply calling this will suffice.
308In scalar context, it returns the feature text only. In list context,
309some more meta data will be returned as a hash.
310
311This function is exported on demand.
312
313    use HTML::Feature qw(feature);
314    print scalar feature($url);  # print featured text
315
316    my %data = feature($url); # wantarray(hash)
317    print $data{title};
318    print $data{desc};
319    print $data{text};
320
321
322=head1 AUTHOR
323
324Takeshi Miki <miki@cpan.org>
325
326Special thanks to Daisuke Maki
327
328=head1 COPYRIGHT AND LICENSE
329
330Copyright (C) 2007 Takeshi Miki This library is free software; you can redistribute it and/or modifyit under the same terms as Perl itself, either Perl version 5.8.8 or,at your option, any later version of Perl 5 you may have available.
331
332=cut
Note: See TracBrowser for help on using the browser.