Changeset 22229 for lang/perl/HTML-TagParser
- Timestamp:
- 10/27/08 18:20:34 (5 years ago)
- Location:
- lang/perl/HTML-TagParser/trunk
- Files:
-
- 1 added
- 4 modified
-
MANIFEST (modified) (2 diffs)
-
README (modified) (4 diffs)
-
lib/HTML/TagParser.pm (modified) (9 diffs)
-
t/08_nest.t (modified) (1 diff)
-
t/12_navigation.t (added)
Legend:
- Unmodified
- Added
- Removed
-
lang/perl/HTML-TagParser/trunk/MANIFEST
r22228 r22229 18 18 META.yml 19 19 make-dist.sh 20 t/12_navigation.t 21 t/08_nest.t 20 22 t/22_yahoo.t 21 23 t/11_invalid_attr.t … … 27 29 t/06_japanese.t 28 30 t/10_escape.t 29 t/08_nest.t30 31 t/05_charset.t 31 32 t/04_fetch.t -
lang/perl/HTML-TagParser/trunk/README
r22228 r22229 10 10 11 11 Parse a HTML source and find its first <form action=""> attribute's 12 value .12 value and find all input elements belonging to this form. 13 13 14 14 my $src = '<html><form action="hoge.cgi">...</form></html>'; … … 16 16 my $elem = $html->getElementsByTagName( "form" ); 17 17 print "<form action=\"", $elem->getAttribute("action"), "\">\n" if ref $elem; 18 my @first_inputs = $elem->subTree()->getElementsByTagName( "input" ); 19 my $form = $first_inputs[0]->getParent(); 18 20 19 21 Fetch a HTML file via HTTP, and display its all <a> elements and … … 101 103 This method returns $elem's innerText without tags. 102 104 105 $subhtml = $elem->subTree(); 106 This method returns a new object of class HTML::Parser, with all the 107 elements that are in the DOM hierarchy under $elem. 108 109 $elem = $elem->nextSibling(); 110 This method returns the next sibling within the same parent. It returns 111 undef when called on a closing tag or on the lastChild node of a 112 parentNode. 113 114 $elem = $elem->previousSibling(); 115 This method returns the previous sibling within the same parent. It 116 returns undef when called on the firstChild node of a parentNode. 117 118 $child_elem = $elem->firstChild(); 119 This method returns the first child node of $elem. It returns undef when 120 called on a closing tag element or on a non-container or empty container 121 element. 122 123 $child_elems = $elem->childNodes(); 124 This method creates an array of all child nodes of $elem and returns the 125 array by reference. It returns an empty array-ref [] whenever 126 firstChild() would return undef. 127 128 $child_elem = $elem->lastChild(); 129 This method returns the last child node of $elem. It returns undef 130 whenever firstChild() would return undef. 131 132 $parent = $elem->parentNode(); 133 This method returns the parent node of $elem. It returns undef when 134 called on root nodes. 135 103 136 $attr = $elem->attributes(); 104 137 This method returns a hash of $elem's all attributes. … … 106 139 $value = $elem->getAttribute( $key ); 107 140 This method returns the value of $elem's attributes which name is $key. 141 142 BUGS 143 The HTML-Parser is simple. Methods innerText and subTree may be fooled 144 by nested tags or embedded javascript code. 145 146 The methods with 'Sibling', 'child' or 'Child' in their names do not 147 cache their results. The most expensive ones are lastChild() and 148 previousSibling(). parentNode() is also expensive, but only once. It 149 does caching. 150 151 The DOM tree is read-only, as this is just a parser. 108 152 109 153 INTERNATIONALIZATION -
lang/perl/HTML-TagParser/trunk/lib/HTML/TagParser.pm
r22228 r22229 11 11 print "<title>", $elem->innerText(), "</title>\n" if ref $elem; 12 12 13 Parse a HTML source and find its first <form action=""> attribute's value. 13 Parse a HTML source and find its first <form action=""> attribute's value 14 and find all input elements belonging to this form. 14 15 15 16 my $src = '<html><form action="hoge.cgi">...</form></html>'; … … 17 18 my $elem = $html->getElementsByTagName( "form" ); 18 19 print "<form action=\"", $elem->getAttribute("action"), "\">\n" if ref $elem; 20 my @first_inputs = $elem->subTree()->getElementsByTagName( "input" ); 21 my $form = $first_inputs[0]->getParent(); 19 22 20 23 Fetch a HTML file via HTTP, and display its all <a> elements and attributes. … … 121 124 This method returns $elem's innerText without tags. 122 125 126 =head2 $subhtml = $elem->subTree(); 127 128 This method returns a new object of class HTML::Parser, 129 with all the elements that are in the DOM hierarchy under $elem. 130 131 =head2 $elem = $elem->nextSibling(); 132 133 This method returns the next sibling within the same parent. 134 It returns undef when called on a closing tag or on the lastChild node 135 of a parentNode. 136 137 =head2 $elem = $elem->previousSibling(); 138 139 This method returns the previous sibling within the same parent. 140 It returns undef when called on the firstChild node of a parentNode. 141 142 =head2 $child_elem = $elem->firstChild(); 143 144 This method returns the first child node of $elem. 145 It returns undef when called on a closing tag element or on a 146 non-container or empty container element. 147 148 =head2 $child_elems = $elem->childNodes(); 149 150 This method creates an array of all child nodes of $elem and returns the array by reference. 151 It returns an empty array-ref [] whenever firstChild() would return undef. 152 153 =head2 $child_elem = $elem->lastChild(); 154 155 This method returns the last child node of $elem. 156 It returns undef whenever firstChild() would return undef. 157 158 =head2 $parent = $elem->parentNode(); 159 160 This method returns the parent node of $elem. 161 It returns undef when called on root nodes. 162 123 163 =head2 $attr = $elem->attributes(); 124 164 … … 128 168 129 169 This method returns the value of $elem's attributes which name is $key. 170 171 =head1 BUGS 172 173 The HTML-Parser is simple. Methods innerText and subTree may be 174 fooled by nested tags or embedded javascript code. 175 176 The methods with 'Sibling', 'child' or 'Child' in their names do not cache their results. 177 The most expensive ones are lastChild() and previousSibling(). 178 parentNode() is also expensive, but only once. It does caching. 179 180 The DOM tree is read-only, as this is just a parser. 130 181 131 182 =head1 INTERNATIONALIZATION … … 158 209 159 210 use vars qw( $VERSION ); 160 $VERSION = "0.16 ";211 $VERSION = "0.16.1"; 161 212 162 213 my $J2E = {qw( jis ISO-2022-JP sjis Shift_JIS euc EUC-JP ucs2 UCS2 )}; … … 164 215 my $SEC_OF_DAY = 60 * 60 * 24; 165 216 217 # [000] '/' if closing tag. 218 # [001] tagName 219 # [002] attributes string (with trailing /, if self-closing tag). 220 # [003] content until next (nested) tag. 221 # [004] attributes hash cache. 222 # [005] innerText combined strings cache. 223 # [006] index of matching closing tag (or opening tag, if [000]=='/') 224 # [007] index of parent (aka container) tag. 225 # 166 226 sub new { 167 227 my $package = shift; … … 331 391 332 392 my $tagname = $elem->[001]; 393 my $closing = HTML::TagParser::Util::find_closing($flat, $cur); 333 394 my $list = []; 334 for ( ; $cur < $ #$flat; $cur++ ) {395 for ( ; $cur < $closing ; $cur++ ) { 335 396 push( @$list, $flat->[$cur]->[003] ); 336 last if ( $flat->[ $cur + 1 ]->[001] eq $tagname );337 397 } 338 398 my $text = join( "", grep { $_ ne "" } @$list ); … … 343 403 } 344 404 405 sub subTree 406 { 407 my $self = shift; 408 my ( $flat, $cur ) = @$self; 409 my $elem = $flat->[$cur]; 410 return if $elem->[000]; # </xxx> 411 my $closing = HTML::TagParser::Util::find_closing($flat, $cur); 412 my $list = []; 413 while (++$cur < $closing) 414 { 415 push @$list, $flat->[$cur]; 416 } 417 418 # allow the getElement...() methods on the returned object. 419 return bless { flat => $list }, 'HTML::TagParser'; 420 } 421 422 423 sub nextSibling 424 { 425 my $self = shift; 426 my ( $flat, $cur ) = @$self; 427 my $elem = $flat->[$cur]; 428 429 return undef if $elem->[000]; # </xxx> 430 my $closing = HTML::TagParser::Util::find_closing($flat, $cur); 431 my $next_s = $flat->[$closing+1]; 432 return undef unless $next_s; 433 return undef if $next_s->[000]; # parent's </xxx> 434 return HTML::TagParser::Element->new( $flat, $closing+1 ); 435 } 436 437 sub firstChild 438 { 439 my $self = shift; 440 my ( $flat, $cur ) = @$self; 441 my $elem = $flat->[$cur]; 442 return undef if $elem->[000]; # </xxx> 443 my $closing = HTML::TagParser::Util::find_closing($flat, $cur); 444 return undef if $closing <= $cur+1; # no children here. 445 return HTML::TagParser::Element->new( $flat, $cur+1 ); 446 } 447 448 sub childNodes 449 { 450 my $self = shift; 451 my ( $flat, $cur ) = @$self; 452 my $child = firstChild($self); 453 return [] unless $child; # an empty array is easier for our callers than undef 454 my @c = ( $child ); 455 while (defined ($child = nextSibling($child))) 456 { 457 push @c, $child; 458 } 459 return \@c; 460 } 461 462 sub lastChild 463 { 464 my $c = childNodes(@_); 465 return undef unless $c->[0]; 466 return $c->[-1]; 467 } 468 469 sub previousSibling 470 { 471 my $self = shift; 472 my ( $flat, $cur ) = @$self; 473 474 ## This one is expensive. 475 ## We use find_closing() which walks forward. 476 ## We'd need a find_opening() which walks backwards. 477 ## So we walk backwards one by one and consult find_closing() 478 ## until we find $cur-1 or $cur. 479 480 my $idx = $cur-1; 481 while ($idx >= 0) 482 { 483 if ($flat->[$idx][000] && defined($flat->[$idx][006])) 484 { 485 $idx = $flat->[$idx][006]; # use cache for backwards skipping 486 next; 487 } 488 489 my $closing = HTML::TagParser::Util::find_closing($flat, $idx); 490 return HTML::TagParser::Element->new( $flat, $idx ) 491 if defined $closing and ($closing == $cur || $closing == $cur-1); 492 $idx--; 493 } 494 return undef; 495 } 496 497 sub parentNode 498 { 499 my $self = shift; 500 my ( $flat, $cur ) = @$self; 501 502 return HTML::TagParser::Element->new( $flat, $flat->[$cur][007]) if $flat->[$cur][007]; # cache 503 504 ## 505 ## This one is very expensive. 506 ## We use previousSibling() to walk backwards, and 507 ## previousSibling() is expensive. 508 ## 509 my $ps = $self; 510 my $first = $self; 511 512 while (defined($ps = previousSibling($ps))) { $first = $ps; } 513 514 my $parent = $first->[1] - 1; 515 return undef if $parent < 0; 516 die "parent too short" if HTML::TagParser::Util::find_closing($flat, $parent) <= $cur; 517 518 $flat->[$cur][007] = $parent; # cache 519 return HTML::TagParser::Element->new( $flat, $parent ) 520 } 521 522 ## 523 ## feature: 524 ## self-closing tags have an additional attribute '/' => '/'. 525 ## 345 526 sub attributes { 346 527 my $self = shift; … … 421 602 } 422 603 604 ## returns 1 beyond the end, if not found. 605 ## returns undef if called on a </xxx> closing tag 606 sub find_closing 607 { 608 my ($flat, $cur) = @_; 609 610 return $flat->[$cur][006] if $flat->[$cur][006]; # cache 611 return $flat->[$cur][006] = $cur if (($flat->[$cur][002]||'') =~ m{/$}); # self-closing 612 613 my $name = $flat->[$cur][001]; 614 my $pre_nest = 0; 615 ## count how many levels deep this type of tag is nested. 616 my $idx; 617 for ($idx = 0; $idx <= $cur; $idx++) 618 { 619 my $e = $flat->[$idx]; 620 next unless $e->[001] eq $name; 621 next if (($e->[002]||'') =~ m{/$}); # self-closing 622 $pre_nest += ($e->[000]) ? -1 : 1; 623 $pre_nest = 0 if $pre_nest < 0; 624 $idx = $e->[006]-1 if !$e->[000] && $e->[006]; # use caches for skipping forward. 625 } 626 my $last_idx = $#$flat; 627 628 ## we move last_idx closer, in case this container 629 ## has not all its subcontainers closed properly. 630 my $post_nest = 0; 631 for ($idx = $last_idx; $idx > $cur; $idx--) 632 { 633 my $e = $flat->[$idx]; 634 next unless $e->[001] eq $name; 635 $last_idx = $idx-1; # remember where a matching tag was 636 next if (($e->[002]||'') =~ m{/$}); # self-closing 637 $post_nest -= ($e->[000]) ? -1 : 1; 638 $post_nest = 0 if $post_nest < 0; 639 last if $pre_nest <= $post_nest; 640 $idx = $e->[006]+1 if $e->[000] && defined $e->[006]; # use caches for skipping backwards. 641 } 642 643 my $nest = 1; # we know it is not self-closing. start behind. 644 645 for ($idx = $cur+1; $idx <= $last_idx; $idx++) 646 { 647 my $e = $flat->[$idx]; 648 next unless $e->[001] eq $name; 649 next if (($e->[002]||'') =~ m{/$}); # self-closing 650 $nest += ($e->[000]) ? -1 : 1; 651 if ($nest <= 0) 652 { 653 die "assert </xxx>" unless $e->[000]; 654 $e->[006] = $cur; # point back to opening tag 655 return $flat->[$cur][006] = $idx; 656 } 657 $idx = $e->[006]-1 if !$e->[000] && $e->[006]; # use caches for skipping forward. 658 } 659 660 # not all closed, but cannot go further 661 return $flat->[$cur][006] = $last_idx+1; 662 } 663 423 664 sub find_meta_charset { 424 665 my $txtref = shift; # reference -
lang/perl/HTML-TagParser/trunk/t/08_nest.t
r22228 r22229 29 29 30 30 my $foo = $html->getElementById( "foo" ); 31 like( $foo->innerText(), qr/AAA/s, "foo" );32 #like( $foo->innerText(), qr/AAA.*BBB.*CCC.*DDD.*EEE.*FFF/s, "foo" );31 # like( $foo->innerText(), qr/AAA/s, "foo" ); 32 like( $foo->innerText(), qr/AAA.*BBB.*CCC.*DDD.*EEE.*FFF/s, "foo" ); 33 33 34 34 my $bar = $html->getElementById( "bar" ); 35 like( $bar->innerText(), qr/BBB.*CCC.*DDD\W*$/s, "bar" );36 #like( $bar->innerText(), qr/BBB.*CCC.*DDD.*EEE/s, "bar" );35 # like( $bar->innerText(), qr/BBB.*CCC.*DDD\W*$/s, "bar" ); 36 like( $bar->innerText(), qr/BBB.*CCC.*DDD.*EEE/s, "bar" ); 37 37 # ---------------------------------------------------------------- 38 38 ;1;
![(please configure the [header_logo] section in trac.ini)](/share/chrome/site/your_project_logo.png)