| 1 | package HTML::Feature; |
|---|
| 2 | use strict; |
|---|
| 3 | use warnings; |
|---|
| 4 | use vars qw($VERSION $UserAgent $engine @EXPORT_OK); |
|---|
| 5 | use Exporter qw(import); |
|---|
| 6 | use Carp; |
|---|
| 7 | use HTTP::Response::Encoding; |
|---|
| 8 | use Encode::Guess; |
|---|
| 9 | use List::Util qw(first); |
|---|
| 10 | use Scalar::Util qw(blessed); |
|---|
| 11 | use UNIVERSAL::require; |
|---|
| 12 | use URI; |
|---|
| 13 | |
|---|
| 14 | $VERSION = '2.00005'; |
|---|
| 15 | @EXPORT_OK = qw(feature); |
|---|
| 16 | |
|---|
| 17 | sub new { |
|---|
| 18 | my $class = shift; |
|---|
| 19 | my %arg = @_; |
|---|
| 20 | $class = ref $class || $class; |
|---|
| 21 | my $self = bless \%arg, $class; |
|---|
| 22 | $self->{enc_type} ||= 'utf8'; |
|---|
| 23 | |
|---|
| 24 | return $self; |
|---|
| 25 | } |
|---|
| 26 | |
|---|
| 27 | sub parse { |
|---|
| 28 | my $self = shift; |
|---|
| 29 | my $obj = shift; |
|---|
| 30 | |
|---|
| 31 | if (! $obj) { |
|---|
| 32 | croak('Usage: parse( $uri | $http_response | $html_ref )'); |
|---|
| 33 | } |
|---|
| 34 | |
|---|
| 35 | my $pkg = blessed($obj); |
|---|
| 36 | if (! $pkg) { |
|---|
| 37 | if (my $ref = ref $obj) { |
|---|
| 38 | # if it's a scalar reference, then we've been passed a piece of |
|---|
| 39 | # HTML code. |
|---|
| 40 | if ($ref eq 'SCALAR') { |
|---|
| 41 | return $self->parse_html( $obj, @_ ); |
|---|
| 42 | } |
|---|
| 43 | |
|---|
| 44 | # Otherwise we don't know how to handle |
|---|
| 45 | croak('Usage: parse( $uri | $http_response | $html_ref )'); |
|---|
| 46 | } |
|---|
| 47 | |
|---|
| 48 | # We seemed to have an unblessed scalar. Assume it's a URI |
|---|
| 49 | $pkg = 'URI'; |
|---|
| 50 | $obj = URI->new($obj); |
|---|
| 51 | } |
|---|
| 52 | |
|---|
| 53 | # If it's an object, then we can handle URI or HTTP::Response |
|---|
| 54 | if ($pkg->isa('URI')) { |
|---|
| 55 | return $self->parse_url( $obj, @_ ); |
|---|
| 56 | } elsif ($pkg->isa('HTTP::Response')) { |
|---|
| 57 | return $self->parse_response( $obj, @_ ); |
|---|
| 58 | } else { |
|---|
| 59 | croak('Usage: parse( $uri | $http_response | $html_ref )'); |
|---|
| 60 | } |
|---|
| 61 | } |
|---|
| 62 | |
|---|
| 63 | sub parse_url { |
|---|
| 64 | my $self = shift; |
|---|
| 65 | my $url = shift; |
|---|
| 66 | my $ua = $self->_user_agent(); |
|---|
| 67 | my $res = $ua->get($url); |
|---|
| 68 | $self->parse_response($res, @_); |
|---|
| 69 | } |
|---|
| 70 | |
|---|
| 71 | sub parse_response { |
|---|
| 72 | my $self = shift; |
|---|
| 73 | my $res = shift; |
|---|
| 74 | my $content = $self->_decode_response($res); |
|---|
| 75 | $self->_run( \$content, @_ ); |
|---|
| 76 | } |
|---|
| 77 | |
|---|
| 78 | sub parse_html { |
|---|
| 79 | my $self = shift; |
|---|
| 80 | my $html = shift; |
|---|
| 81 | my $html_ref = ref $html ? $html : \$html; |
|---|
| 82 | $self->_decode_htmlref( $html_ref ); |
|---|
| 83 | $self->_run( $html_ref, @_ ); |
|---|
| 84 | } |
|---|
| 85 | |
|---|
| 86 | sub engine |
|---|
| 87 | { |
|---|
| 88 | my $self = shift; |
|---|
| 89 | my $engine = $self->{engine_obj}; |
|---|
| 90 | if(! $engine){ |
|---|
| 91 | my $engine_module = $self->{engine} ? $self->{engine} : 'TagStructure'; |
|---|
| 92 | my $class = __PACKAGE__ . '::Engine::' . $engine_module; |
|---|
| 93 | $class->require or die $@; |
|---|
| 94 | $engine = $class->new; |
|---|
| 95 | $self->{engine_obj} = $engine; |
|---|
| 96 | } |
|---|
| 97 | return $engine; |
|---|
| 98 | } |
|---|
| 99 | |
|---|
| 100 | sub _run { |
|---|
| 101 | my $self = shift; |
|---|
| 102 | my $html_ref = shift; |
|---|
| 103 | my $opts = shift || {}; |
|---|
| 104 | |
|---|
| 105 | local $self->{element_flag} = exists $opts->{element_flag} ? $opts->{element_flag} : $self->{element_flag}; |
|---|
| 106 | $self->engine->run($self, $html_ref); |
|---|
| 107 | } |
|---|
| 108 | |
|---|
| 109 | sub _decode_response |
|---|
| 110 | { |
|---|
| 111 | my $self = shift; |
|---|
| 112 | my $res = shift; |
|---|
| 113 | |
|---|
| 114 | my @encoding = ( |
|---|
| 115 | $res->encoding, |
|---|
| 116 | # XXX - falling back to latin-1 may be risky. See Data::Decode |
|---|
| 117 | # could be multiple because HTTP response and META might be different |
|---|
| 118 | ( $res->header('Content-Type') =~ /charset=([\w\-]+)/g ), |
|---|
| 119 | "latin-1", |
|---|
| 120 | ); |
|---|
| 121 | my $encoding = |
|---|
| 122 | first { defined $_ && Encode::find_encoding($_) } @encoding; |
|---|
| 123 | return Encode::decode( $encoding, $res->content ); |
|---|
| 124 | } |
|---|
| 125 | |
|---|
| 126 | sub _decode_htmlref |
|---|
| 127 | { |
|---|
| 128 | my $self = shift; |
|---|
| 129 | my $html_ref = shift; |
|---|
| 130 | |
|---|
| 131 | local $Encode::Guess::NoUTFAutoGuess = 1; |
|---|
| 132 | my $guess = |
|---|
| 133 | Encode::Guess::guess_encoding( $$html_ref, |
|---|
| 134 | ( 'shiftjis', 'euc-jp', '7bit-jis', 'utf8' ) ); |
|---|
| 135 | unless ( ref $guess ) { |
|---|
| 136 | $$html_ref = Encode::decode( "latin-1", $$html_ref ); |
|---|
| 137 | } else { |
|---|
| 138 | eval { $$html_ref = $guess->decode($$html_ref); }; |
|---|
| 139 | } |
|---|
| 140 | } |
|---|
| 141 | |
|---|
| 142 | sub _user_agent { |
|---|
| 143 | my $self = shift; |
|---|
| 144 | require LWP::UserAgent; |
|---|
| 145 | $UserAgent ||= LWP::UserAgent->new(); |
|---|
| 146 | $self->{http_proxy} and $UserAgent->proxy( ['http'], $self->{http_proxy} ); |
|---|
| 147 | $self->{timeout} and $UserAgent->timeout( $self->{timeout} ); |
|---|
| 148 | return $UserAgent; |
|---|
| 149 | } |
|---|
| 150 | |
|---|
| 151 | sub feature { |
|---|
| 152 | my $self = __PACKAGE__->new; |
|---|
| 153 | my $result = $self->parse(@_); |
|---|
| 154 | my %ret = ( |
|---|
| 155 | text => $result->text, |
|---|
| 156 | title => $result->title, |
|---|
| 157 | desc => $result->desc, |
|---|
| 158 | element => $result->element |
|---|
| 159 | ); |
|---|
| 160 | return wantarray ? %ret : $ret{text}; |
|---|
| 161 | } |
|---|
| 162 | |
|---|
| 163 | sub extract { |
|---|
| 164 | warn |
|---|
| 165 | "HTML::Feature::extract() has been deprecated. Use HTML::Feature::parse() instead"; |
|---|
| 166 | my $self = shift; |
|---|
| 167 | my %args = @_; |
|---|
| 168 | my $result = $self->parse( $args{string} ? \$args{string} : $args{url} ); |
|---|
| 169 | my $ret = { |
|---|
| 170 | title => $result->title, |
|---|
| 171 | description => $result->desc, |
|---|
| 172 | block => [ { contents => $result->text } ], |
|---|
| 173 | }; |
|---|
| 174 | return $ret; |
|---|
| 175 | } |
|---|
| 176 | |
|---|
| 177 | 1; |
|---|
| 178 | |
|---|
| 179 | __END__ |
|---|
| 180 | |
|---|
| 181 | =head1 NAME |
|---|
| 182 | |
|---|
| 183 | HTML::Feature - Extract Feature Sentences From HTML Documents |
|---|
| 184 | |
|---|
| 185 | =head1 SYNOPSIS |
|---|
| 186 | |
|---|
| 187 | use HTML::Feature; |
|---|
| 188 | |
|---|
| 189 | my $f = HTML::Feature->new(enc_type => 'utf8'); |
|---|
| 190 | my $result = $f->parse('http://www.perl.com'); |
|---|
| 191 | |
|---|
| 192 | print "Title:" , $result->title(), "\n"; |
|---|
| 193 | print "Description:" , $result->desc(), "\n"; |
|---|
| 194 | print "Featured Text:", $result->text(), "\n"; |
|---|
| 195 | |
|---|
| 196 | |
|---|
| 197 | # you can get a HTML::Element object |
|---|
| 198 | |
|---|
| 199 | my $f = HTML::Feature->new(); |
|---|
| 200 | my $result = $f->parse('http://www.perl.com',{element_flag => 1}); |
|---|
| 201 | print "HTML Element:", $result->element->as_HTML, "\n"; |
|---|
| 202 | $result->element_delete(); |
|---|
| 203 | |
|---|
| 204 | |
|---|
| 205 | # a simpler method is, |
|---|
| 206 | |
|---|
| 207 | use HTML::Feature qw(feature); |
|---|
| 208 | print scalar feature('http://www.perl.com'); |
|---|
| 209 | |
|---|
| 210 | # very simple! |
|---|
| 211 | |
|---|
| 212 | |
|---|
| 213 | =head1 DESCRIPTION |
|---|
| 214 | |
|---|
| 215 | This module extracst blocks of feature sentences out of an HTML document. |
|---|
| 216 | |
|---|
| 217 | Unlike other modules that performs similar tasks, this module by default |
|---|
| 218 | extracts blocks without using morphological analysis, and instead it uses |
|---|
| 219 | simple statistics processing. |
|---|
| 220 | |
|---|
| 221 | Because of this, HTML::Feature has an advantage over other similar modules |
|---|
| 222 | in that it can be applied to documents in any language. |
|---|
| 223 | |
|---|
| 224 | =head1 METHODS |
|---|
| 225 | |
|---|
| 226 | =head2 new() |
|---|
| 227 | |
|---|
| 228 | my $f = HTML::Feature->new(%param); |
|---|
| 229 | my $f = HTML::Feature->new( |
|---|
| 230 | engine => $class, # backend engine module (default: 'TagStructure') |
|---|
| 231 | max_bytes => 5000, # max number of bytes per node to analyze (default: '') |
|---|
| 232 | min_bytes => 10, # minimum number of bytes per node to analyze (default is '') |
|---|
| 233 | enc_type => 'euc-jp', # encoding of return values (default: 'utf-8') |
|---|
| 234 | http_proxy => 'http://proxy:3128', # http proxy server (default: '') |
|---|
| 235 | timeout => 10, # set the timeout value in seconds. (default: 180) |
|---|
| 236 | element_flag => 1, # flag of HTML::Element object as returned value (default: '') |
|---|
| 237 | ); |
|---|
| 238 | |
|---|
| 239 | Instantiates a new HTML::Feature object. Takes the following parameters |
|---|
| 240 | |
|---|
| 241 | =over 4 |
|---|
| 242 | |
|---|
| 243 | =item engine |
|---|
| 244 | |
|---|
| 245 | Specifies the class name of the engine that you want to use. |
|---|
| 246 | |
|---|
| 247 | HTML::Feature is designed to accept different engines to change its behavior. |
|---|
| 248 | If you want to customize the behavior of HTML::Feature, specify your own |
|---|
| 249 | engine in this parameter |
|---|
| 250 | |
|---|
| 251 | =back |
|---|
| 252 | |
|---|
| 253 | The rest of the arguments are directly passed to the HTML::Feature::Engine |
|---|
| 254 | object constructor. |
|---|
| 255 | |
|---|
| 256 | =head2 parse() |
|---|
| 257 | |
|---|
| 258 | my $result = $f->parse($url); |
|---|
| 259 | # or |
|---|
| 260 | my $result = $f->parse($html_ref); |
|---|
| 261 | # or |
|---|
| 262 | my $result = $f->parse($http_response); |
|---|
| 263 | |
|---|
| 264 | Parses the given argument. The argument can be either a URL, a string of HTML |
|---|
| 265 | (must be passed as a scalar reference), or an HTTP::Response object. |
|---|
| 266 | HTML::Feature will detect and delegate to the appropriate method (see below) |
|---|
| 267 | |
|---|
| 268 | =head2 parse_url($url) |
|---|
| 269 | |
|---|
| 270 | Parses an URL. This method will use LWP::UserAgent to fetch the given url. |
|---|
| 271 | |
|---|
| 272 | =head2 parse_html($html) |
|---|
| 273 | |
|---|
| 274 | Parses a string containing HTML. |
|---|
| 275 | |
|---|
| 276 | =head2 parse_response($http_response) |
|---|
| 277 | |
|---|
| 278 | Parses an HTTP::Response object. |
|---|
| 279 | |
|---|
| 280 | =head2 extract() |
|---|
| 281 | |
|---|
| 282 | $data = $f->extract(url => $url); |
|---|
| 283 | # or |
|---|
| 284 | $data = $f->extract(string => $html); |
|---|
| 285 | |
|---|
| 286 | HTML::Feature::extract() has been deprecated and exists for backwards compatiblity only. Use HTML::Feature::parse() instead. |
|---|
| 287 | |
|---|
| 288 | extract() extracts blocks of feature sentences from the given document, |
|---|
| 289 | and returns a data structure like this: |
|---|
| 290 | |
|---|
| 291 | $data = { |
|---|
| 292 | title => $title, |
|---|
| 293 | description => $desc, |
|---|
| 294 | block => [ |
|---|
| 295 | { |
|---|
| 296 | contents => $contents, |
|---|
| 297 | score => $score |
|---|
| 298 | }, |
|---|
| 299 | . |
|---|
| 300 | . |
|---|
| 301 | ] |
|---|
| 302 | } |
|---|
| 303 | |
|---|
| 304 | =head2 feature |
|---|
| 305 | |
|---|
| 306 | feature() is a simple wrapper that does new(), parse() in one step. |
|---|
| 307 | If you do not require complex operations, simply calling this will suffice. |
|---|
| 308 | In scalar context, it returns the feature text only. In list context, |
|---|
| 309 | some more meta data will be returned as a hash. |
|---|
| 310 | |
|---|
| 311 | This function is exported on demand. |
|---|
| 312 | |
|---|
| 313 | use HTML::Feature qw(feature); |
|---|
| 314 | print scalar feature($url); # print featured text |
|---|
| 315 | |
|---|
| 316 | my %data = feature($url); # wantarray(hash) |
|---|
| 317 | print $data{title}; |
|---|
| 318 | print $data{desc}; |
|---|
| 319 | print $data{text}; |
|---|
| 320 | |
|---|
| 321 | |
|---|
| 322 | =head1 AUTHOR |
|---|
| 323 | |
|---|
| 324 | Takeshi Miki <miki@cpan.org> |
|---|
| 325 | |
|---|
| 326 | Special thanks to Daisuke Maki |
|---|
| 327 | |
|---|
| 328 | =head1 COPYRIGHT AND LICENSE |
|---|
| 329 | |
|---|
| 330 | Copyright (C) 2007 Takeshi Miki This library is free software; you can redistribute it and/or modifyit under the same terms as Perl itself, either Perl version 5.8.8 or,at your option, any later version of Perl 5 you may have available. |
|---|
| 331 | |
|---|
| 332 | =cut |
|---|