Show
Ignore:
Timestamp:
10/27/08 18:20:34 (5 years ago)
Author:
kawa0117
Message:

HTML-TagParser?-0.16-subtree.diff patch from Juergen Weigert

Location:
lang/perl/HTML-TagParser/trunk
Files:
1 added
4 modified

Legend:

Unmodified
Added
Removed
  • lang/perl/HTML-TagParser/trunk/MANIFEST

    r22228 r22229  
    1818META.yml 
    1919make-dist.sh 
     20t/12_navigation.t 
     21t/08_nest.t 
    2022t/22_yahoo.t 
    2123t/11_invalid_attr.t 
     
    2729t/06_japanese.t 
    2830t/10_escape.t 
    29 t/08_nest.t 
    3031t/05_charset.t 
    3132t/04_fetch.t 
  • lang/perl/HTML-TagParser/trunk/README

    r22228 r22229  
    1010 
    1111    Parse a HTML source and find its first <form action=""> attribute's 
    12     value. 
     12    value and find all input elements belonging to this form. 
    1313 
    1414        my $src  = '<html><form action="hoge.cgi">...</form></html>'; 
     
    1616        my $elem = $html->getElementsByTagName( "form" ); 
    1717        print "<form action=\"", $elem->getAttribute("action"), "\">\n" if ref $elem; 
     18        my @first_inputs = $elem->subTree()->getElementsByTagName( "input" ); 
     19        my $form = $first_inputs[0]->getParent(); 
    1820 
    1921    Fetch a HTML file via HTTP, and display its all <a> elements and 
     
    101103    This method returns $elem's innerText without tags. 
    102104 
     105  $subhtml = $elem->subTree(); 
     106    This method returns a new object of class HTML::Parser, with all the 
     107    elements that are in the DOM hierarchy under $elem. 
     108 
     109  $elem = $elem->nextSibling(); 
     110    This method returns the next sibling within the same parent. It returns 
     111    undef when called on a closing tag or on the lastChild node of a 
     112    parentNode. 
     113 
     114  $elem = $elem->previousSibling(); 
     115    This method returns the previous sibling within the same parent. It 
     116    returns undef when called on the firstChild node of a parentNode. 
     117 
     118  $child_elem = $elem->firstChild(); 
     119    This method returns the first child node of $elem. It returns undef when 
     120    called on a closing tag element or on a non-container or empty container 
     121    element. 
     122 
     123  $child_elems = $elem->childNodes(); 
     124    This method creates an array of all child nodes of $elem and returns the 
     125    array by reference. It returns an empty array-ref [] whenever 
     126    firstChild() would return undef. 
     127 
     128  $child_elem = $elem->lastChild(); 
     129    This method returns the last child node of $elem. It returns undef 
     130    whenever firstChild() would return undef. 
     131 
     132  $parent = $elem->parentNode(); 
     133    This method returns the parent node of $elem. It returns undef when 
     134    called on root nodes. 
     135 
    103136  $attr = $elem->attributes(); 
    104137    This method returns a hash of $elem's all attributes. 
     
    106139  $value = $elem->getAttribute( $key ); 
    107140    This method returns the value of $elem's attributes which name is $key. 
     141 
     142BUGS 
     143    The HTML-Parser is simple. Methods innerText and subTree may be fooled 
     144    by nested tags or embedded javascript code. 
     145 
     146    The methods with 'Sibling', 'child' or 'Child' in their names do not 
     147    cache their results. The most expensive ones are lastChild() and 
     148    previousSibling(). parentNode() is also expensive, but only once. It 
     149    does caching. 
     150 
     151    The DOM tree is read-only, as this is just a parser. 
    108152 
    109153INTERNATIONALIZATION 
  • lang/perl/HTML-TagParser/trunk/lib/HTML/TagParser.pm

    r22228 r22229  
    1111    print "<title>", $elem->innerText(), "</title>\n" if ref $elem; 
    1212 
    13 Parse a HTML source and find its first <form action=""> attribute's value. 
     13Parse a HTML source and find its first <form action=""> attribute's value 
     14and find all input elements belonging to this form. 
    1415 
    1516    my $src  = '<html><form action="hoge.cgi">...</form></html>'; 
     
    1718    my $elem = $html->getElementsByTagName( "form" ); 
    1819    print "<form action=\"", $elem->getAttribute("action"), "\">\n" if ref $elem; 
     20    my @first_inputs = $elem->subTree()->getElementsByTagName( "input" ); 
     21    my $form = $first_inputs[0]->getParent(); 
    1922 
    2023Fetch a HTML file via HTTP, and display its all <a> elements and attributes. 
     
    121124This method returns $elem's innerText without tags. 
    122125 
     126=head2 $subhtml = $elem->subTree(); 
     127 
     128This method returns a new object of class HTML::Parser, 
     129with all the elements that are in the DOM hierarchy under $elem. 
     130 
     131=head2 $elem = $elem->nextSibling(); 
     132 
     133This method returns the next sibling within the same parent. 
     134It returns undef when called on a closing tag or on the lastChild node 
     135of a parentNode. 
     136 
     137=head2 $elem = $elem->previousSibling(); 
     138 
     139This method returns the previous sibling within the same parent. 
     140It returns undef when called on the firstChild node of a parentNode. 
     141 
     142=head2 $child_elem = $elem->firstChild(); 
     143 
     144This method returns the first child node of $elem. 
     145It returns undef when called on a closing tag element or on a  
     146non-container or empty container element. 
     147 
     148=head2 $child_elems = $elem->childNodes(); 
     149 
     150This method creates an array of all child nodes of $elem and returns the array by reference. 
     151It returns an empty array-ref [] whenever firstChild() would return undef. 
     152 
     153=head2 $child_elem = $elem->lastChild(); 
     154 
     155This method returns the last child node of $elem. 
     156It returns undef whenever firstChild() would return undef. 
     157 
     158=head2 $parent = $elem->parentNode(); 
     159 
     160This method returns the parent node of $elem. 
     161It returns undef when called on root nodes. 
     162 
    123163=head2 $attr = $elem->attributes(); 
    124164 
     
    128168 
    129169This method returns the value of $elem's attributes which name is $key. 
     170 
     171=head1 BUGS 
     172 
     173The HTML-Parser is simple. Methods innerText and subTree may be 
     174fooled by nested tags or embedded javascript code. 
     175 
     176The methods with 'Sibling', 'child' or 'Child' in their names do not cache their results. 
     177The most expensive ones are lastChild() and previousSibling(). 
     178parentNode() is also expensive, but only once. It does caching. 
     179 
     180The DOM tree is read-only, as this is just a parser. 
    130181 
    131182=head1 INTERNATIONALIZATION 
     
    158209 
    159210use vars qw( $VERSION ); 
    160 $VERSION = "0.16"; 
     211$VERSION = "0.16.1"; 
    161212 
    162213my $J2E        = {qw( jis ISO-2022-JP sjis Shift_JIS euc EUC-JP ucs2 UCS2 )}; 
     
    164215my $SEC_OF_DAY = 60 * 60 * 24; 
    165216 
     217#  [000]        '/' if closing tag. 
     218#  [001]        tagName 
     219#  [002]        attributes string (with trailing /, if self-closing tag). 
     220#  [003]        content until next (nested) tag. 
     221#  [004]        attributes hash cache. 
     222#  [005]        innerText combined strings cache. 
     223#  [006]        index of matching closing tag (or opening tag, if [000]=='/') 
     224#  [007]        index of parent (aka container) tag. 
     225#  
    166226sub new { 
    167227    my $package = shift; 
     
    331391 
    332392    my $tagname = $elem->[001]; 
     393    my $closing = HTML::TagParser::Util::find_closing($flat, $cur); 
    333394    my $list    = []; 
    334     for ( ; $cur < $#$flat ; $cur++ ) { 
     395    for ( ; $cur < $closing ; $cur++ ) { 
    335396        push( @$list, $flat->[$cur]->[003] ); 
    336         last if ( $flat->[ $cur + 1 ]->[001] eq $tagname ); 
    337397    } 
    338398    my $text = join( "", grep { $_ ne "" } @$list ); 
     
    343403} 
    344404 
     405sub subTree 
     406{ 
     407    my $self = shift; 
     408    my ( $flat, $cur ) = @$self; 
     409    my $elem = $flat->[$cur]; 
     410    return if $elem->[000];                         # </xxx> 
     411    my $closing = HTML::TagParser::Util::find_closing($flat, $cur); 
     412    my $list    = []; 
     413    while (++$cur < $closing)  
     414      { 
     415        push @$list, $flat->[$cur]; 
     416      } 
     417 
     418    # allow the getElement...() methods on the returned object. 
     419    return bless { flat => $list }, 'HTML::TagParser'; 
     420} 
     421 
     422 
     423sub nextSibling 
     424{ 
     425    my $self = shift; 
     426    my ( $flat, $cur ) = @$self; 
     427    my $elem = $flat->[$cur]; 
     428 
     429    return undef if $elem->[000];                         # </xxx> 
     430    my $closing = HTML::TagParser::Util::find_closing($flat, $cur); 
     431    my $next_s = $flat->[$closing+1]; 
     432    return undef unless $next_s; 
     433    return undef if $next_s->[000];     # parent's </xxx> 
     434    return HTML::TagParser::Element->new( $flat, $closing+1 ); 
     435} 
     436 
     437sub firstChild 
     438{ 
     439    my $self = shift; 
     440    my ( $flat, $cur ) = @$self; 
     441    my $elem = $flat->[$cur]; 
     442    return undef if $elem->[000];                         # </xxx> 
     443    my $closing = HTML::TagParser::Util::find_closing($flat, $cur); 
     444    return undef if $closing <= $cur+1;                 # no children here. 
     445    return HTML::TagParser::Element->new( $flat, $cur+1 ); 
     446} 
     447 
     448sub childNodes 
     449{ 
     450    my $self = shift; 
     451    my ( $flat, $cur ) = @$self; 
     452    my $child = firstChild($self); 
     453    return [] unless $child;    # an empty array is easier for our callers than undef 
     454    my @c = ( $child ); 
     455    while (defined ($child = nextSibling($child))) 
     456      { 
     457        push @c, $child; 
     458      } 
     459    return \@c; 
     460} 
     461 
     462sub lastChild 
     463{ 
     464    my $c = childNodes(@_); 
     465    return undef unless $c->[0]; 
     466    return $c->[-1]; 
     467} 
     468 
     469sub previousSibling 
     470{ 
     471    my $self = shift; 
     472    my ( $flat, $cur ) = @$self; 
     473     
     474    ## This one is expensive.  
     475    ## We use find_closing() which walks forward.  
     476    ## We'd need a find_opening() which walks backwards. 
     477    ## So we walk backwards one by one and consult find_closing() 
     478    ## until we find $cur-1 or $cur. 
     479 
     480    my $idx = $cur-1; 
     481    while ($idx >= 0) 
     482      { 
     483        if ($flat->[$idx][000] && defined($flat->[$idx][006])) 
     484          { 
     485            $idx = $flat->[$idx][006];  # use cache for backwards skipping 
     486            next; 
     487          } 
     488 
     489        my $closing = HTML::TagParser::Util::find_closing($flat, $idx); 
     490        return HTML::TagParser::Element->new( $flat, $idx ) 
     491          if defined $closing and ($closing == $cur || $closing == $cur-1); 
     492        $idx--; 
     493      } 
     494    return undef; 
     495} 
     496 
     497sub parentNode 
     498{ 
     499    my $self = shift; 
     500    my ( $flat, $cur ) = @$self; 
     501 
     502    return HTML::TagParser::Element->new( $flat, $flat->[$cur][007]) if $flat->[$cur][007];     # cache 
     503 
     504    ## 
     505    ## This one is very expensive.  
     506    ## We use previousSibling() to walk backwards, and 
     507    ## previousSibling() is expensive. 
     508    ## 
     509    my $ps = $self; 
     510    my $first = $self; 
     511 
     512    while (defined($ps = previousSibling($ps))) { $first = $ps; } 
     513 
     514    my $parent = $first->[1] - 1; 
     515    return undef if $parent < 0; 
     516    die "parent too short" if HTML::TagParser::Util::find_closing($flat, $parent) <= $cur; 
     517 
     518    $flat->[$cur][007] = $parent;       # cache 
     519    return HTML::TagParser::Element->new( $flat, $parent ) 
     520} 
     521 
     522## 
     523## feature:  
     524## self-closing tags have an additional attribute '/' => '/'. 
     525## 
    345526sub attributes { 
    346527    my $self = shift; 
     
    421602} 
    422603 
     604## returns 1 beyond the end, if not found. 
     605## returns undef if called on a </xxx> closing tag 
     606sub find_closing  
     607{ 
     608  my ($flat, $cur) = @_; 
     609 
     610  return $flat->[$cur][006]        if   $flat->[$cur][006];     # cache 
     611  return $flat->[$cur][006] = $cur if (($flat->[$cur][002]||'') =~ m{/$});    # self-closing 
     612 
     613  my $name = $flat->[$cur][001]; 
     614  my $pre_nest = 0;      
     615  ## count how many levels deep this type of tag is nested. 
     616  my $idx; 
     617  for ($idx = 0; $idx <= $cur; $idx++) 
     618    { 
     619      my $e = $flat->[$idx]; 
     620      next unless   $e->[001] eq $name; 
     621      next if     (($e->[002]||'') =~ m{/$});   # self-closing 
     622      $pre_nest += ($e->[000]) ? -1 : 1; 
     623      $pre_nest = 0 if $pre_nest < 0; 
     624      $idx = $e->[006]-1 if !$e->[000] && $e->[006];    # use caches for skipping forward. 
     625    } 
     626  my $last_idx = $#$flat; 
     627 
     628  ## we move last_idx closer, in case this container  
     629  ## has not all its subcontainers closed properly. 
     630  my $post_nest = 0; 
     631  for ($idx = $last_idx; $idx > $cur; $idx--) 
     632    { 
     633      my $e = $flat->[$idx]; 
     634      next unless    $e->[001] eq $name; 
     635      $last_idx = $idx-1;               # remember where a matching tag was 
     636      next if      (($e->[002]||'') =~ m{/$});  # self-closing 
     637      $post_nest -= ($e->[000]) ? -1 : 1; 
     638      $post_nest = 0 if $post_nest < 0; 
     639      last if $pre_nest <= $post_nest; 
     640      $idx = $e->[006]+1 if $e->[000] && defined $e->[006];     # use caches for skipping backwards. 
     641    } 
     642   
     643  my $nest = 1;         # we know it is not self-closing. start behind. 
     644 
     645  for ($idx = $cur+1; $idx <= $last_idx; $idx++) 
     646    { 
     647      my $e = $flat->[$idx]; 
     648      next unless    $e->[001] eq $name; 
     649      next if      (($e->[002]||'') =~ m{/$});  # self-closing 
     650      $nest      += ($e->[000]) ? -1 : 1; 
     651      if ($nest <= 0) 
     652        { 
     653          die "assert </xxx>" unless $e->[000]; 
     654          $e->[006] = $cur;     # point back to opening tag 
     655          return $flat->[$cur][006] = $idx; 
     656        } 
     657      $idx = $e->[006]-1 if !$e->[000] && $e->[006];    # use caches for skipping forward. 
     658    }  
     659 
     660  # not all closed, but cannot go further 
     661  return $flat->[$cur][006] = $last_idx+1;       
     662} 
     663 
    423664sub find_meta_charset { 
    424665    my $txtref = shift;    # reference 
  • lang/perl/HTML-TagParser/trunk/t/08_nest.t

    r22228 r22229  
    2929 
    3030        my $foo = $html->getElementById( "foo" ); 
    31         like( $foo->innerText(), qr/AAA/s, "foo" ); 
    32 #       like( $foo->innerText(), qr/AAA.*BBB.*CCC.*DDD.*EEE.*FFF/s, "foo" ); 
     31#       like( $foo->innerText(), qr/AAA/s, "foo" ); 
     32        like( $foo->innerText(), qr/AAA.*BBB.*CCC.*DDD.*EEE.*FFF/s, "foo" ); 
    3333 
    3434        my $bar = $html->getElementById( "bar" ); 
    35         like( $bar->innerText(), qr/BBB.*CCC.*DDD\W*$/s, "bar" ); 
    36 #       like( $bar->innerText(), qr/BBB.*CCC.*DDD.*EEE/s, "bar" ); 
     35#       like( $bar->innerText(), qr/BBB.*CCC.*DDD\W*$/s, "bar" ); 
     36        like( $bar->innerText(), qr/BBB.*CCC.*DDD.*EEE/s, "bar" ); 
    3737# ---------------------------------------------------------------- 
    3838;1;