Changeset 20765

Show
Ignore:
Timestamp:
10/05/08 12:26:32 (6 years ago)
Author:
miki
Message:

いくつかのバグを修正

Location:
lang/perl/HTML-Feature/trunk
Files:
4 added
3 modified

Legend:

Unmodified
Added
Removed
  • lang/perl/HTML-Feature/trunk/Changes

    r17770 r20765  
    11Revision history for Perl extension HTML::Feature. 
     2 
     3 
     42.00006 2008.10.05 
     5    - fix a bug that improperty joins of several words 
     6    - set 'user_agent' option 
     7    - an object of HTML::Feature::Engine can recognize the base_url 
     8      via context object ( $c->{base_url} ) 
     9    - add test codes 
    210 
    3112.00005 2008.08.15 
  • lang/perl/HTML-Feature/trunk/lib/HTML/Feature.pm

    r17770 r20765  
    1212use URI; 
    1313 
    14 $VERSION   = '2.00005'; 
     14$VERSION   = '2.00006'; 
    1515@EXPORT_OK = qw(feature); 
    1616 
     
    2929    my $obj  = shift; 
    3030 
    31     if (! $obj) { 
     31    if ( !$obj ) { 
    3232        croak('Usage: parse( $uri | $http_response | $html_ref )'); 
    3333    } 
    3434 
    35     my $pkg  = blessed($obj); 
    36     if (! $pkg) { 
    37         if (my $ref = ref $obj) { 
     35    my $pkg = blessed($obj); 
     36    if ( !$pkg ) { 
     37        if ( my $ref = ref $obj ) { 
     38 
    3839            # if it's a scalar reference, then we've been passed a piece of 
    39             # HTML code.  
    40             if ($ref eq 'SCALAR') { 
     40            # HTML code. 
     41            if ( $ref eq 'SCALAR' ) { 
    4142                return $self->parse_html( $obj, @_ ); 
    4243            } 
     
    5253 
    5354    # If it's an object, then we can handle URI or HTTP::Response 
    54     if ($pkg->isa('URI')) { 
     55    if ( $pkg->isa('URI') ) { 
    5556        return $self->parse_url( $obj, @_ ); 
    56     } elsif ($pkg->isa('HTTP::Response')) { 
     57    } 
     58    elsif ( $pkg->isa('HTTP::Response') ) { 
    5759        return $self->parse_response( $obj, @_ ); 
    58     } else { 
     60    } 
     61    else { 
    5962        croak('Usage: parse( $uri | $http_response | $html_ref )'); 
    6063    } 
     
    6669    my $ua   = $self->_user_agent(); 
    6770    my $res  = $ua->get($url); 
    68     $self->parse_response($res, @_); 
     71    $self->parse_response( $res, @_ ); 
    6972} 
    7073 
     
    7275    my $self = shift; 
    7376    my $res  = shift; 
     77    $self->{base_url} = $res->base; 
    7478    my $content = $self->_decode_response($res); 
    7579    $self->_run( \$content, @_ ); 
     
    7781 
    7882sub parse_html { 
    79     my $self = shift; 
    80     my $html = shift; 
     83    my $self     = shift; 
     84    my $html     = shift; 
    8185    my $html_ref = ref $html ? $html : \$html; 
    82     $self->_decode_htmlref( $html_ref ); 
     86    $self->_decode_htmlref($html_ref); 
    8387    $self->_run( $html_ref, @_ ); 
    8488} 
    8589 
    86 sub engine 
    87 { 
     90sub engine { 
    8891    my $self   = shift; 
    8992    my $engine = $self->{engine_obj}; 
    90     if(! $engine){ 
     93    if ( !$engine ) { 
    9194        my $engine_module = $self->{engine} ? $self->{engine} : 'TagStructure'; 
    9295        my $class = __PACKAGE__ . '::Engine::' . $engine_module; 
     
    9497        $engine = $class->new; 
    9598        $self->{engine_obj} = $engine; 
    96     }     
     99    } 
    97100    return $engine; 
    98101} 
     
    103106    my $opts     = shift || {}; 
    104107 
    105     local $self->{element_flag} = exists $opts->{element_flag} ? $opts->{element_flag} : $self->{element_flag}; 
    106     $self->engine->run($self, $html_ref); 
    107 } 
    108  
    109 sub _decode_response 
    110 { 
     108    local $self->{element_flag} = 
     109      exists $opts->{element_flag} 
     110      ? $opts->{element_flag} 
     111      : $self->{element_flag}; 
     112    $self->engine->run( $self, $html_ref ); 
     113} 
     114 
     115sub _decode_response { 
    111116    my $self = shift; 
    112117    my $res  = shift; 
     
    114119    my @encoding = ( 
    115120        $res->encoding, 
     121 
    116122        # XXX - falling back to latin-1 may be risky. See Data::Decode 
    117123        # could be multiple because HTTP response and META might be different 
     
    119125        "latin-1", 
    120126    ); 
    121     my $encoding = 
    122         first { defined $_ && Encode::find_encoding($_) } @encoding; 
     127    my $encoding = first { defined $_ && Encode::find_encoding($_) } @encoding; 
    123128    return Encode::decode( $encoding, $res->content ); 
    124129} 
    125130 
    126 sub _decode_htmlref 
    127 { 
    128     my $self = shift; 
     131sub _decode_htmlref { 
     132    my $self     = shift; 
    129133    my $html_ref = shift; 
    130134 
    131135    local $Encode::Guess::NoUTFAutoGuess = 1; 
    132136    my $guess = 
    133         Encode::Guess::guess_encoding( $$html_ref, 
    134             ( 'shiftjis', 'euc-jp', '7bit-jis', 'utf8' ) ); 
     137      Encode::Guess::guess_encoding( $$html_ref, 
     138        ( 'shiftjis', 'euc-jp', '7bit-jis', 'utf8' ) ); 
    135139    unless ( ref $guess ) { 
    136140        $$html_ref = Encode::decode( "latin-1", $$html_ref ); 
    137     } else { 
     141    } 
     142    else { 
    138143        eval { $$html_ref = $guess->decode($$html_ref); }; 
    139144    } 
     
    144149    require LWP::UserAgent; 
    145150    $UserAgent ||= LWP::UserAgent->new(); 
     151    $self->{user_agent} and $UserAgent->agent( $self->{user_agent} ); 
    146152    $self->{http_proxy} and $UserAgent->proxy( ['http'], $self->{http_proxy} ); 
    147     $self->{timeout} and $UserAgent->timeout( $self->{timeout} ); 
     153    $self->{timeout}    and $UserAgent->timeout( $self->{timeout} ); 
    148154    return $UserAgent; 
    149155} 
     
    232238        min_bytes => 10, # minimum number of bytes per node to analyze (default is '') 
    233239        enc_type => 'euc-jp', # encoding of return values (default: 'utf-8') 
     240        user_agent => 'my-agent-name', # LWP::UserAgent->agent (default: 'libwww-perl/#.##')  
    234241        http_proxy => 'http://proxy:3128', # http proxy server (default: '') 
    235242        timeout => 10, # set the timeout value in seconds. (default: 180) 
  • lang/perl/HTML-Feature/trunk/lib/HTML/Feature/Engine/TagStructure.pm

    r17770 r20765  
    3232    # control code ( 0x00 - 0x1F, and 0x7F on ascii) 
    3333    for ( 0 .. 31 ) { 
     34        next if $_ == 10;# without NL(New Line) 
    3435        my $control_code = '\x' . sprintf( "%x", $_ ); 
    3536        $$html_ref =~ s{$control_code}{}xmg; 
     
    9394        $node_hash{short_string_length} ||= 0; 
    9495        $node_hash{text}                ||= $text; 
    95  
    96         next if $node_hash{text} !~ /[^ ]+/; 
    9796 
    9897        $data->[$i]->{text} = $node_hash{text}; 
     
    142141        $_; 
    143142      } ( 0 .. $i ); 
    144     $data->[ $sorted[0] ]->{text} =~ s/ $//s; 
     143 
     144    $data->[ $sorted[0] ]->{text} and $data->[ $sorted[0] ]->{text} =~ s/ $//s; 
    145145 
    146146    $result->text( $data->[ $sorted[0] ]->{text} ); 
     
    153153    if ( $c->{enc_type} ) { 
    154154        $result->title( Encode::encode( $c->{enc_type}, $result->title ) ); 
    155         $result->desc( Encode::encode( $c->{enc_type},  $result->desc ) ); 
    156         $result->text( Encode::encode( $c->{enc_type},  $result->text ) ); 
     155        $result->desc( Encode::encode( $c->{enc_type}, $result->desc ) ); 
     156        $result->text( Encode::encode( $c->{enc_type}, $result->text ) ); 
    157157    } 
    158158 
     
    166166 
    167167    if ( ref $node ) { 
    168         if ( $node->tag =~ /p|br|hr|tr|ul|li|ol|dl|dd/ ) { 
     168        if ( $node->tag =~ /p|br|hr|tr|ul|li|ol|dl|dd|h[1-6]/ ) { 
    169169            $node_hash_ref->{text} .= "\n"; 
    170170        } 
     
    185185} 
    186186 
    187  
    188  
    1891871; 
    190188