Changeset 20765
- Timestamp:
- 10/05/08 12:26:32 (5 years ago)
- Location:
- lang/perl/HTML-Feature/trunk
- Files:
-
- 4 added
- 3 modified
-
Changes (modified) (1 diff)
-
lib/HTML/Feature.pm (modified) (12 diffs)
-
lib/HTML/Feature/Engine/TagStructure.pm (modified) (6 diffs)
-
t/03_run/engine.t (added)
-
t/03_run/ua.t (added)
-
t/regression (added)
-
t/regression/join_several_words.t (added)
Legend:
- Unmodified
- Added
- Removed
-
lang/perl/HTML-Feature/trunk/Changes
r17770 r20765 1 1 Revision history for Perl extension HTML::Feature. 2 3 4 2.00006 2008.10.05 5 - fix a bug that improperty joins of several words 6 - set 'user_agent' option 7 - an object of HTML::Feature::Engine can recognize the base_url 8 via context object ( $c->{base_url} ) 9 - add test codes 2 10 3 11 2.00005 2008.08.15 -
lang/perl/HTML-Feature/trunk/lib/HTML/Feature.pm
r17770 r20765 12 12 use URI; 13 13 14 $VERSION = '2.0000 5';14 $VERSION = '2.00006'; 15 15 @EXPORT_OK = qw(feature); 16 16 … … 29 29 my $obj = shift; 30 30 31 if ( ! $obj) {31 if ( !$obj ) { 32 32 croak('Usage: parse( $uri | $http_response | $html_ref )'); 33 33 } 34 34 35 my $pkg = blessed($obj); 36 if (! $pkg) { 37 if (my $ref = ref $obj) { 35 my $pkg = blessed($obj); 36 if ( !$pkg ) { 37 if ( my $ref = ref $obj ) { 38 38 39 # if it's a scalar reference, then we've been passed a piece of 39 # HTML code. 40 if ( $ref eq 'SCALAR') {40 # HTML code. 41 if ( $ref eq 'SCALAR' ) { 41 42 return $self->parse_html( $obj, @_ ); 42 43 } … … 52 53 53 54 # If it's an object, then we can handle URI or HTTP::Response 54 if ( $pkg->isa('URI')) {55 if ( $pkg->isa('URI') ) { 55 56 return $self->parse_url( $obj, @_ ); 56 } elsif ($pkg->isa('HTTP::Response')) { 57 } 58 elsif ( $pkg->isa('HTTP::Response') ) { 57 59 return $self->parse_response( $obj, @_ ); 58 } else { 60 } 61 else { 59 62 croak('Usage: parse( $uri | $http_response | $html_ref )'); 60 63 } … … 66 69 my $ua = $self->_user_agent(); 67 70 my $res = $ua->get($url); 68 $self->parse_response( $res, @_);71 $self->parse_response( $res, @_ ); 69 72 } 70 73 … … 72 75 my $self = shift; 73 76 my $res = shift; 77 $self->{base_url} = $res->base; 74 78 my $content = $self->_decode_response($res); 75 79 $self->_run( \$content, @_ ); … … 77 81 78 82 sub parse_html { 79 my $self = shift;80 my $html = shift;83 my $self = shift; 84 my $html = shift; 81 85 my $html_ref = ref $html ? $html : \$html; 82 $self->_decode_htmlref( $html_ref);86 $self->_decode_htmlref($html_ref); 83 87 $self->_run( $html_ref, @_ ); 84 88 } 85 89 86 sub engine 87 { 90 sub engine { 88 91 my $self = shift; 89 92 my $engine = $self->{engine_obj}; 90 if (! $engine){93 if ( !$engine ) { 91 94 my $engine_module = $self->{engine} ? $self->{engine} : 'TagStructure'; 92 95 my $class = __PACKAGE__ . '::Engine::' . $engine_module; … … 94 97 $engine = $class->new; 95 98 $self->{engine_obj} = $engine; 96 } 99 } 97 100 return $engine; 98 101 } … … 103 106 my $opts = shift || {}; 104 107 105 local $self->{element_flag} = exists $opts->{element_flag} ? $opts->{element_flag} : $self->{element_flag}; 106 $self->engine->run($self, $html_ref); 107 } 108 109 sub _decode_response 110 { 108 local $self->{element_flag} = 109 exists $opts->{element_flag} 110 ? $opts->{element_flag} 111 : $self->{element_flag}; 112 $self->engine->run( $self, $html_ref ); 113 } 114 115 sub _decode_response { 111 116 my $self = shift; 112 117 my $res = shift; … … 114 119 my @encoding = ( 115 120 $res->encoding, 121 116 122 # XXX - falling back to latin-1 may be risky. See Data::Decode 117 123 # could be multiple because HTTP response and META might be different … … 119 125 "latin-1", 120 126 ); 121 my $encoding = 122 first { defined $_ && Encode::find_encoding($_) } @encoding; 127 my $encoding = first { defined $_ && Encode::find_encoding($_) } @encoding; 123 128 return Encode::decode( $encoding, $res->content ); 124 129 } 125 130 126 sub _decode_htmlref 127 { 128 my $self = shift; 131 sub _decode_htmlref { 132 my $self = shift; 129 133 my $html_ref = shift; 130 134 131 135 local $Encode::Guess::NoUTFAutoGuess = 1; 132 136 my $guess = 133 Encode::Guess::guess_encoding( $$html_ref,134 ( 'shiftjis', 'euc-jp', '7bit-jis', 'utf8' ) );137 Encode::Guess::guess_encoding( $$html_ref, 138 ( 'shiftjis', 'euc-jp', '7bit-jis', 'utf8' ) ); 135 139 unless ( ref $guess ) { 136 140 $$html_ref = Encode::decode( "latin-1", $$html_ref ); 137 } else { 141 } 142 else { 138 143 eval { $$html_ref = $guess->decode($$html_ref); }; 139 144 } … … 144 149 require LWP::UserAgent; 145 150 $UserAgent ||= LWP::UserAgent->new(); 151 $self->{user_agent} and $UserAgent->agent( $self->{user_agent} ); 146 152 $self->{http_proxy} and $UserAgent->proxy( ['http'], $self->{http_proxy} ); 147 $self->{timeout} and $UserAgent->timeout( $self->{timeout} );153 $self->{timeout} and $UserAgent->timeout( $self->{timeout} ); 148 154 return $UserAgent; 149 155 } … … 232 238 min_bytes => 10, # minimum number of bytes per node to analyze (default is '') 233 239 enc_type => 'euc-jp', # encoding of return values (default: 'utf-8') 240 user_agent => 'my-agent-name', # LWP::UserAgent->agent (default: 'libwww-perl/#.##') 234 241 http_proxy => 'http://proxy:3128', # http proxy server (default: '') 235 242 timeout => 10, # set the timeout value in seconds. (default: 180) -
lang/perl/HTML-Feature/trunk/lib/HTML/Feature/Engine/TagStructure.pm
r17770 r20765 32 32 # control code ( 0x00 - 0x1F, and 0x7F on ascii) 33 33 for ( 0 .. 31 ) { 34 next if $_ == 10;# without NL(New Line) 34 35 my $control_code = '\x' . sprintf( "%x", $_ ); 35 36 $$html_ref =~ s{$control_code}{}xmg; … … 93 94 $node_hash{short_string_length} ||= 0; 94 95 $node_hash{text} ||= $text; 95 96 next if $node_hash{text} !~ /[^ ]+/;97 96 98 97 $data->[$i]->{text} = $node_hash{text}; … … 142 141 $_; 143 142 } ( 0 .. $i ); 144 $data->[ $sorted[0] ]->{text} =~ s/ $//s; 143 144 $data->[ $sorted[0] ]->{text} and $data->[ $sorted[0] ]->{text} =~ s/ $//s; 145 145 146 146 $result->text( $data->[ $sorted[0] ]->{text} ); … … 153 153 if ( $c->{enc_type} ) { 154 154 $result->title( Encode::encode( $c->{enc_type}, $result->title ) ); 155 $result->desc( Encode::encode( $c->{enc_type}, $result->desc ) );156 $result->text( Encode::encode( $c->{enc_type}, $result->text ) );155 $result->desc( Encode::encode( $c->{enc_type}, $result->desc ) ); 156 $result->text( Encode::encode( $c->{enc_type}, $result->text ) ); 157 157 } 158 158 … … 166 166 167 167 if ( ref $node ) { 168 if ( $node->tag =~ /p|br|hr|tr|ul|li|ol|dl|dd / ) {168 if ( $node->tag =~ /p|br|hr|tr|ul|li|ol|dl|dd|h[1-6]/ ) { 169 169 $node_hash_ref->{text} .= "\n"; 170 170 } … … 185 185 } 186 186 187 188 189 187 1; 190 188
![(please configure the [header_logo] section in trac.ini)](/share/chrome/site/your_project_logo.png)