- Timestamp:
- 10/28/08 01:59:43 (5 years ago)
- Location:
- lang/perl/HTML-TagParser/trunk
- Files:
-
- 19 modified
-
Makefile.PL (modified) (1 prop)
-
README (modified) (1 diff, 1 prop)
-
lib/HTML/TagParser.pm (modified) (23 diffs)
-
t/01_new.t (modified) (1 prop)
-
t/02_parse.t (modified) (1 prop)
-
t/03_open.t (modified) (1 prop)
-
t/04_fetch.t (modified) (1 prop)
-
t/05_charset.t (modified) (1 prop)
-
t/06_japanese.t (modified) (1 prop)
-
t/07_getelem.t (modified) (1 prop)
-
t/08_nest.t (modified) (1 prop)
-
t/09_broken.t (modified) (1 prop)
-
t/10_escape.t (modified) (1 prop)
-
t/11_invalid_attr.t (modified) (1 prop)
-
t/20_index-j.t (modified) (1 prop)
-
t/21_index-e.t (modified) (1 prop)
-
t/22_yahoo.t (modified) (1 prop)
-
t/23_flickr.t (modified) (1 prop)
-
t/sample/fetch-samples.sh (modified) (1 diff)
Legend:
- Unmodified
- Added
- Removed
-
lang/perl/HTML-TagParser/trunk/Makefile.PL
- Property svn:executable deleted
-
lang/perl/HTML-TagParser/trunk/README
- Property svn:executable deleted
r22229 r22252 164 164 165 165 COPYRIGHT AND LICENSE 166 Copyright (c) 2006-200 7Yusuke Kawasaki. All rights reserved. This166 Copyright (c) 2006-2008 Yusuke Kawasaki. All rights reserved. This 167 167 program is free software; you can redistribute it and/or modify it under 168 168 the same terms as Perl itself. -
lang/perl/HTML-TagParser/trunk/lib/HTML/TagParser.pm
r22229 r22252 66 66 67 67 If new() is called with a filename, 68 this method parses a local HTML file and returns its instance 68 this method parses a local HTML file and returns its instance 69 69 70 70 =head2 $html = HTML::TagParser->new( "<html>...snip...</html>" ); … … 143 143 144 144 This method returns the first child node of $elem. 145 It returns undef when called on a closing tag element or on a 145 It returns undef when called on a closing tag element or on a 146 146 non-container or empty container element. 147 147 … … 182 182 =head1 INTERNATIONALIZATION 183 183 184 This module natively understands the character encoding used in document 184 This module natively understands the character encoding used in document 185 185 by parsing its meta element. 186 186 187 187 <meta http-equiv="Content-Type" content="text/html; charset=Shift_JIS"> 188 188 189 The parsed document's encoding is converted 189 The parsed document's encoding is converted 190 190 as this class's fixed internal encoding "UTF-8". 191 191 … … 196 196 =head1 COPYRIGHT AND LICENSE 197 197 198 Copyright (c) 2006-200 7Yusuke Kawasaki. All rights reserved.199 This program is free software; you can redistribute it and/or 198 Copyright (c) 2006-2008 Yusuke Kawasaki. All rights reserved. 199 This program is free software; you can redistribute it and/or 200 200 modify it under the same terms as Perl itself. 201 201 … … 215 215 my $SEC_OF_DAY = 60 * 60 * 24; 216 216 217 # [000] '/' if closing tag.218 # [001] tagName219 # [002] attributes string (with trailing /, if self-closing tag).220 # [003] content until next (nested) tag.221 # [004] attributes hash cache.222 # [005] innerText combined strings cache.223 # [006] index of matching closing tag (or opening tag, if [000]=='/')224 # [007] index of parent (aka container) tag.225 # 217 # [000] '/' if closing tag. 218 # [001] tagName 219 # [002] attributes string (with trailing /, if self-closing tag). 220 # [003] content until next (nested) tag. 221 # [004] attributes hash cache. 222 # [005] innerText combined strings cache. 223 # [006] index of matching closing tag (or opening tag, if [000]=='/') 224 # [007] index of parent (aka container) tag. 225 # 226 226 sub new { 227 227 my $package = shift; … … 234 234 $self->fetch( $src, @_ ); 235 235 } 236 elsif ( $src !~ m#[ \<\>\|]# && -f $src ) {236 elsif ( $src !~ m#[<>|]# && -f $src ) { 237 237 $self->open($src); 238 238 } … … 397 397 } 398 398 my $text = join( "", grep { $_ ne "" } @$list ); 399 $text =~ s/^\s+//s; 400 $text =~ s/\s+$//s; 399 $text =~ s/^\s+|\s+$//sg; 401 400 # $text = "" if ( $cur == $#$flat ); # end of source 402 401 $elem->[005] = HTML::TagParser::Util::xml_unescape( $text ); … … 411 410 my $closing = HTML::TagParser::Util::find_closing($flat, $cur); 412 411 my $list = []; 413 while (++$cur < $closing) 412 while (++$cur < $closing) 414 413 { 415 414 push @$list, $flat->[$cur]; … … 431 430 my $next_s = $flat->[$closing+1]; 432 431 return undef unless $next_s; 433 return undef if $next_s->[000]; # parent's </xxx>432 return undef if $next_s->[000]; # parent's </xxx> 434 433 return HTML::TagParser::Element->new( $flat, $closing+1 ); 435 434 } … … 442 441 return undef if $elem->[000]; # </xxx> 443 442 my $closing = HTML::TagParser::Util::find_closing($flat, $cur); 444 return undef if $closing <= $cur+1; # no children here.443 return undef if $closing <= $cur+1; # no children here. 445 444 return HTML::TagParser::Element->new( $flat, $cur+1 ); 446 445 } … … 451 450 my ( $flat, $cur ) = @$self; 452 451 my $child = firstChild($self); 453 return [] unless $child; # an empty array is easier for our callers than undef452 return [] unless $child; # an empty array is easier for our callers than undef 454 453 my @c = ( $child ); 455 454 while (defined ($child = nextSibling($child))) … … 471 470 my $self = shift; 472 471 my ( $flat, $cur ) = @$self; 473 474 ## This one is expensive. 475 ## We use find_closing() which walks forward. 472 473 ## This one is expensive. 474 ## We use find_closing() which walks forward. 476 475 ## We'd need a find_opening() which walks backwards. 477 476 ## So we walk backwards one by one and consult find_closing() … … 482 481 { 483 482 if ($flat->[$idx][000] && defined($flat->[$idx][006])) 484 {485 $idx = $flat->[$idx][006];# use cache for backwards skipping486 next;487 }483 { 484 $idx = $flat->[$idx][006]; # use cache for backwards skipping 485 next; 486 } 488 487 489 488 my $closing = HTML::TagParser::Util::find_closing($flat, $idx); 490 return HTML::TagParser::Element->new( $flat, $idx )491 if defined $closing and ($closing == $cur || $closing == $cur-1);492 $idx--;489 return HTML::TagParser::Element->new( $flat, $idx ) 490 if defined $closing and ($closing == $cur || $closing == $cur-1); 491 $idx--; 493 492 } 494 493 return undef; … … 500 499 my ( $flat, $cur ) = @$self; 501 500 502 return HTML::TagParser::Element->new( $flat, $flat->[$cur][007]) if $flat->[$cur][007]; # cache501 return HTML::TagParser::Element->new( $flat, $flat->[$cur][007]) if $flat->[$cur][007]; # cache 503 502 504 503 ## 505 ## This one is very expensive. 504 ## This one is very expensive. 506 505 ## We use previousSibling() to walk backwards, and 507 506 ## previousSibling() is expensive. … … 516 515 die "parent too short" if HTML::TagParser::Util::find_closing($flat, $parent) <= $cur; 517 516 518 $flat->[$cur][007] = $parent; # cache517 $flat->[$cur][007] = $parent; # cache 519 518 return HTML::TagParser::Element->new( $flat, $parent ) 520 519 } 521 520 522 521 ## 523 ## feature: 522 ## feature: 524 523 ## self-closing tags have an additional attribute '/' => '/'. 525 524 ## … … 532 531 my $attr = {}; 533 532 while ( $elem->[002] =~ m{ 534 ([^\s \=\"\']+)(\s*=\s*(?:(")(.*?)"|(')(.*?)'|([^'"\s=]+)['"]*))?533 ([^\s="']+)(\s*=\s*(?:["']((?(?<=")(?:\\"|[^"])*?|(?:\\'|[^'])*?))["']|([^'"\s=]+)['"]*))? 535 534 }sgx ) { 536 535 my $key = $1; 537 536 my $test = $2; 538 my $val = ( $3 ? $4 : ( $5 ? $6 : $7 ));537 my $val = $3 || $4; 539 538 my $lckey = lc($key); 540 539 if ($test) { … … 576 575 577 576 sub html_to_flat { 578 my $txtref = shift; # reference579 my $flat = [];577 my $txtref = shift; # reference 578 my $flat = []; 580 579 pos($$txtref) = undef; # reset matching position 581 580 while ( $$txtref =~ m{ … … 583 582 ( / )? ( [^/!<>\s"'=]+ ) 584 583 ( (?:"[^"]*"|'[^']*'|[^"'<>])+ )? 585 | 584 | 586 585 (!-- .*? -- | ![^\-] .*? ) 587 586 ) > ([^<]*) … … 604 603 ## returns 1 beyond the end, if not found. 605 604 ## returns undef if called on a </xxx> closing tag 606 sub find_closing 605 sub find_closing 607 606 { 608 607 my ($flat, $cur) = @_; 609 608 610 return $flat->[$cur][006] if $flat->[$cur][006]; # cache609 return $flat->[$cur][006] if $flat->[$cur][006]; # cache 611 610 return $flat->[$cur][006] = $cur if (($flat->[$cur][002]||'') =~ m{/$}); # self-closing 612 611 613 612 my $name = $flat->[$cur][001]; 614 my $pre_nest = 0; 613 my $pre_nest = 0; 615 614 ## count how many levels deep this type of tag is nested. 616 615 my $idx; … … 619 618 my $e = $flat->[$idx]; 620 619 next unless $e->[001] eq $name; 621 next if (($e->[002]||'') =~ m{/$}); # self-closing620 next if (($e->[002]||'') =~ m{/$}); # self-closing 622 621 $pre_nest += ($e->[000]) ? -1 : 1; 623 622 $pre_nest = 0 if $pre_nest < 0; 624 $idx = $e->[006]-1 if !$e->[000] && $e->[006]; # use caches for skipping forward.623 $idx = $e->[006]-1 if !$e->[000] && $e->[006]; # use caches for skipping forward. 625 624 } 626 625 my $last_idx = $#$flat; 627 626 628 ## we move last_idx closer, in case this container 627 ## we move last_idx closer, in case this container 629 628 ## has not all its subcontainers closed properly. 630 629 my $post_nest = 0; … … 633 632 my $e = $flat->[$idx]; 634 633 next unless $e->[001] eq $name; 635 $last_idx = $idx-1; # remember where a matching tag was636 next if (($e->[002]||'') =~ m{/$}); # self-closing634 $last_idx = $idx-1; # remember where a matching tag was 635 next if (($e->[002]||'') =~ m{/$}); # self-closing 637 636 $post_nest -= ($e->[000]) ? -1 : 1; 638 637 $post_nest = 0 if $post_nest < 0; 639 638 last if $pre_nest <= $post_nest; 640 $idx = $e->[006]+1 if $e->[000] && defined $e->[006]; # use caches for skipping backwards.641 } 642 643 my $nest = 1; # we know it is not self-closing. start behind.639 $idx = $e->[006]+1 if $e->[000] && defined $e->[006]; # use caches for skipping backwards. 640 } 641 642 my $nest = 1; # we know it is not self-closing. start behind. 644 643 645 644 for ($idx = $cur+1; $idx <= $last_idx; $idx++) … … 647 646 my $e = $flat->[$idx]; 648 647 next unless $e->[001] eq $name; 649 next if (($e->[002]||'') =~ m{/$}); # self-closing648 next if (($e->[002]||'') =~ m{/$}); # self-closing 650 649 $nest += ($e->[000]) ? -1 : 1; 651 650 if ($nest <= 0) 652 651 { 653 die "assert </xxx>" unless $e->[000];654 $e->[006] = $cur;# point back to opening tag655 return $flat->[$cur][006] = $idx;656 }657 $idx = $e->[006]-1 if !$e->[000] && $e->[006]; # use caches for skipping forward.658 } 652 die "assert </xxx>" unless $e->[000]; 653 $e->[006] = $cur; # point back to opening tag 654 return $flat->[$cur][006] = $idx; 655 } 656 $idx = $e->[006]-1 if !$e->[000] && $e->[006]; # use caches for skipping forward. 657 } 659 658 660 659 # not all closed, but cannot go further 661 return $flat->[$cur][006] = $last_idx+1; 660 return $flat->[$cur][006] = $last_idx+1; 662 661 } 663 662 … … 665 664 my $txtref = shift; # reference 666 665 while ( $$txtref =~ m{ 667 <meta \s ((?: [^>]+\s )? http-equiv =['"]?Content-Type [^>]+ ) >666 <meta \s ((?: [^>]+\s )? http-equiv\s*=\s*['"]?Content-Type [^>]+ ) > 668 667 }sxgi ) { 669 668 my $args = $1; -
lang/perl/HTML-TagParser/trunk/t/01_new.t
- Property svn:executable deleted
-
lang/perl/HTML-TagParser/trunk/t/02_parse.t
- Property svn:executable deleted
-
lang/perl/HTML-TagParser/trunk/t/03_open.t
- Property svn:executable deleted
-
lang/perl/HTML-TagParser/trunk/t/04_fetch.t
- Property svn:executable deleted
-
lang/perl/HTML-TagParser/trunk/t/05_charset.t
- Property svn:executable deleted
-
lang/perl/HTML-TagParser/trunk/t/06_japanese.t
- Property svn:executable deleted
-
lang/perl/HTML-TagParser/trunk/t/07_getelem.t
- Property svn:executable deleted
-
lang/perl/HTML-TagParser/trunk/t/08_nest.t
- Property svn:executable deleted
-
lang/perl/HTML-TagParser/trunk/t/09_broken.t
- Property svn:executable deleted
-
lang/perl/HTML-TagParser/trunk/t/10_escape.t
- Property svn:executable deleted
-
lang/perl/HTML-TagParser/trunk/t/11_invalid_attr.t
- Property svn:executable deleted
-
lang/perl/HTML-TagParser/trunk/t/20_index-j.t
- Property svn:executable deleted
-
lang/perl/HTML-TagParser/trunk/t/21_index-e.t
- Property svn:executable deleted
-
lang/perl/HTML-TagParser/trunk/t/22_yahoo.t
- Property svn:executable deleted
-
lang/perl/HTML-TagParser/trunk/t/23_flickr.t
- Property svn:executable deleted
-
lang/perl/HTML-TagParser/trunk/t/sample/fetch-samples.sh
r22228 r22252 1 1 #!/bin/sh -v 2 2 3 # ISO-8859-1 3 # ISO-8859-1 4 4 wget -N http://www.kawa.net/xp/index-e.html 5 5
![(please configure the [header_logo] section in trac.ini)](/share/chrome/site/your_project_logo.png)