Changeset 22252 for lang

Show
Ignore:
Timestamp:
10/28/08 01:59:43 (5 years ago)
Author:
drry
Message:
  • fixed regexes.
  • removed subversion properties.
  • et cetera.
Location:
lang/perl/HTML-TagParser/trunk
Files:
19 modified

Legend:

Unmodified
Added
Removed
  • lang/perl/HTML-TagParser/trunk/Makefile.PL

    • Property svn:executable deleted
  • lang/perl/HTML-TagParser/trunk/README

    • Property svn:executable deleted
    r22229 r22252  
    164164 
    165165COPYRIGHT AND LICENSE 
    166     Copyright (c) 2006-2007 Yusuke Kawasaki. All rights reserved. This 
     166    Copyright (c) 2006-2008 Yusuke Kawasaki. All rights reserved. This 
    167167    program is free software; you can redistribute it and/or modify it under 
    168168    the same terms as Perl itself. 
  • lang/perl/HTML-TagParser/trunk/lib/HTML/TagParser.pm

    r22229 r22252  
    6666 
    6767If new() is called with a filename, 
    68 this method parses a local HTML file and returns its instance  
     68this method parses a local HTML file and returns its instance 
    6969 
    7070=head2 $html = HTML::TagParser->new( "<html>...snip...</html>" ); 
     
    143143 
    144144This method returns the first child node of $elem. 
    145 It returns undef when called on a closing tag element or on a  
     145It returns undef when called on a closing tag element or on a 
    146146non-container or empty container element. 
    147147 
     
    182182=head1 INTERNATIONALIZATION 
    183183 
    184 This module natively understands the character encoding used in document  
     184This module natively understands the character encoding used in document 
    185185by parsing its meta element. 
    186186 
    187187    <meta http-equiv="Content-Type" content="text/html; charset=Shift_JIS"> 
    188188 
    189 The parsed document's encoding is converted  
     189The parsed document's encoding is converted 
    190190as this class's fixed internal encoding "UTF-8". 
    191191 
     
    196196=head1 COPYRIGHT AND LICENSE 
    197197 
    198 Copyright (c) 2006-2007 Yusuke Kawasaki. All rights reserved. 
    199 This program is free software; you can redistribute it and/or  
     198Copyright (c) 2006-2008 Yusuke Kawasaki. All rights reserved. 
     199This program is free software; you can redistribute it and/or 
    200200modify it under the same terms as Perl itself. 
    201201 
     
    215215my $SEC_OF_DAY = 60 * 60 * 24; 
    216216 
    217 #  [000]        '/' if closing tag. 
    218 #  [001]        tagName 
    219 #  [002]        attributes string (with trailing /, if self-closing tag). 
    220 #  [003]        content until next (nested) tag. 
    221 #  [004]        attributes hash cache. 
    222 #  [005]        innerText combined strings cache. 
    223 #  [006]        index of matching closing tag (or opening tag, if [000]=='/') 
    224 #  [007]        index of parent (aka container) tag. 
    225 #  
     217#  [000]        '/' if closing tag. 
     218#  [001]        tagName 
     219#  [002]        attributes string (with trailing /, if self-closing tag). 
     220#  [003]        content until next (nested) tag. 
     221#  [004]        attributes hash cache. 
     222#  [005]        innerText combined strings cache. 
     223#  [006]        index of matching closing tag (or opening tag, if [000]=='/') 
     224#  [007]        index of parent (aka container) tag. 
     225# 
    226226sub new { 
    227227    my $package = shift; 
     
    234234        $self->fetch( $src, @_ ); 
    235235    } 
    236     elsif ( $src !~ m#[\<\>\|]# && -f $src ) { 
     236    elsif ( $src !~ m#[<>|]# && -f $src ) { 
    237237        $self->open($src); 
    238238    } 
     
    397397    } 
    398398    my $text = join( "", grep { $_ ne "" } @$list ); 
    399     $text =~ s/^\s+//s; 
    400     $text =~ s/\s+$//s; 
     399    $text =~ s/^\s+|\s+$//sg; 
    401400#   $text = "" if ( $cur == $#$flat );              # end of source 
    402401    $elem->[005] = HTML::TagParser::Util::xml_unescape( $text ); 
     
    411410    my $closing = HTML::TagParser::Util::find_closing($flat, $cur); 
    412411    my $list    = []; 
    413     while (++$cur < $closing)  
     412    while (++$cur < $closing) 
    414413      { 
    415414        push @$list, $flat->[$cur]; 
     
    431430    my $next_s = $flat->[$closing+1]; 
    432431    return undef unless $next_s; 
    433     return undef if $next_s->[000];     # parent's </xxx> 
     432    return undef if $next_s->[000];     # parent's </xxx> 
    434433    return HTML::TagParser::Element->new( $flat, $closing+1 ); 
    435434} 
     
    442441    return undef if $elem->[000];                         # </xxx> 
    443442    my $closing = HTML::TagParser::Util::find_closing($flat, $cur); 
    444     return undef if $closing <= $cur+1;                 # no children here. 
     443    return undef if $closing <= $cur+1;                 # no children here. 
    445444    return HTML::TagParser::Element->new( $flat, $cur+1 ); 
    446445} 
     
    451450    my ( $flat, $cur ) = @$self; 
    452451    my $child = firstChild($self); 
    453     return [] unless $child;    # an empty array is easier for our callers than undef 
     452    return [] unless $child;    # an empty array is easier for our callers than undef 
    454453    my @c = ( $child ); 
    455454    while (defined ($child = nextSibling($child))) 
     
    471470    my $self = shift; 
    472471    my ( $flat, $cur ) = @$self; 
    473      
    474     ## This one is expensive.  
    475     ## We use find_closing() which walks forward.  
     472 
     473    ## This one is expensive. 
     474    ## We use find_closing() which walks forward. 
    476475    ## We'd need a find_opening() which walks backwards. 
    477476    ## So we walk backwards one by one and consult find_closing() 
     
    482481      { 
    483482        if ($flat->[$idx][000] && defined($flat->[$idx][006])) 
    484           { 
    485             $idx = $flat->[$idx][006];  # use cache for backwards skipping 
    486             next; 
    487           } 
     483          { 
     484            $idx = $flat->[$idx][006];  # use cache for backwards skipping 
     485            next; 
     486          } 
    488487 
    489488        my $closing = HTML::TagParser::Util::find_closing($flat, $idx); 
    490         return HTML::TagParser::Element->new( $flat, $idx ) 
    491           if defined $closing and ($closing == $cur || $closing == $cur-1); 
    492         $idx--; 
     489        return HTML::TagParser::Element->new( $flat, $idx ) 
     490          if defined $closing and ($closing == $cur || $closing == $cur-1); 
     491        $idx--; 
    493492      } 
    494493    return undef; 
     
    500499    my ( $flat, $cur ) = @$self; 
    501500 
    502     return HTML::TagParser::Element->new( $flat, $flat->[$cur][007]) if $flat->[$cur][007];     # cache 
     501    return HTML::TagParser::Element->new( $flat, $flat->[$cur][007]) if $flat->[$cur][007];     # cache 
    503502 
    504503    ## 
    505     ## This one is very expensive.  
     504    ## This one is very expensive. 
    506505    ## We use previousSibling() to walk backwards, and 
    507506    ## previousSibling() is expensive. 
     
    516515    die "parent too short" if HTML::TagParser::Util::find_closing($flat, $parent) <= $cur; 
    517516 
    518     $flat->[$cur][007] = $parent;       # cache 
     517    $flat->[$cur][007] = $parent;       # cache 
    519518    return HTML::TagParser::Element->new( $flat, $parent ) 
    520519} 
    521520 
    522521## 
    523 ## feature:  
     522## feature: 
    524523## self-closing tags have an additional attribute '/' => '/'. 
    525524## 
     
    532531    my $attr = {}; 
    533532    while ( $elem->[002] =~ m{ 
    534         ([^\s\=\"\']+)(\s*=\s*(?:(")(.*?)"|(')(.*?)'|([^'"\s=]+)['"]*))? 
     533        ([^\s="']+)(\s*=\s*(?:["']((?(?<=")(?:\\"|[^"])*?|(?:\\'|[^'])*?))["']|([^'"\s=]+)['"]*))? 
    535534    }sgx ) { 
    536535        my $key  = $1; 
    537536        my $test = $2; 
    538         my $val  = ( $3 ? $4 : ( $5 ? $6 : $7 )); 
     537        my $val  = $3 || $4; 
    539538        my $lckey = lc($key); 
    540539        if ($test) { 
     
    576575 
    577576sub html_to_flat { 
    578     my $txtref  = shift;    # reference 
    579     my $flat = []; 
     577    my $txtref = shift;    # reference 
     578    my $flat   = []; 
    580579    pos($$txtref) = undef;  # reset matching position 
    581580    while ( $$txtref =~ m{ 
     
    583582            ( / )? ( [^/!<>\s"'=]+ ) 
    584583            ( (?:"[^"]*"|'[^']*'|[^"'<>])+ )? 
    585         |    
     584        | 
    586585            (!-- .*? -- | ![^\-] .*? ) 
    587586        ) > ([^<]*) 
     
    604603## returns 1 beyond the end, if not found. 
    605604## returns undef if called on a </xxx> closing tag 
    606 sub find_closing  
     605sub find_closing 
    607606{ 
    608607  my ($flat, $cur) = @_; 
    609608 
    610   return $flat->[$cur][006]        if   $flat->[$cur][006];     # cache 
     609  return $flat->[$cur][006]        if   $flat->[$cur][006];     # cache 
    611610  return $flat->[$cur][006] = $cur if (($flat->[$cur][002]||'') =~ m{/$});    # self-closing 
    612611 
    613612  my $name = $flat->[$cur][001]; 
    614   my $pre_nest = 0;      
     613  my $pre_nest = 0; 
    615614  ## count how many levels deep this type of tag is nested. 
    616615  my $idx; 
     
    619618      my $e = $flat->[$idx]; 
    620619      next unless   $e->[001] eq $name; 
    621       next if     (($e->[002]||'') =~ m{/$});   # self-closing 
     620      next if     (($e->[002]||'') =~ m{/$});   # self-closing 
    622621      $pre_nest += ($e->[000]) ? -1 : 1; 
    623622      $pre_nest = 0 if $pre_nest < 0; 
    624       $idx = $e->[006]-1 if !$e->[000] && $e->[006];    # use caches for skipping forward. 
     623      $idx = $e->[006]-1 if !$e->[000] && $e->[006];    # use caches for skipping forward. 
    625624    } 
    626625  my $last_idx = $#$flat; 
    627626 
    628   ## we move last_idx closer, in case this container  
     627  ## we move last_idx closer, in case this container 
    629628  ## has not all its subcontainers closed properly. 
    630629  my $post_nest = 0; 
     
    633632      my $e = $flat->[$idx]; 
    634633      next unless    $e->[001] eq $name; 
    635       $last_idx = $idx-1;               # remember where a matching tag was 
    636       next if      (($e->[002]||'') =~ m{/$});  # self-closing 
     634      $last_idx = $idx-1;               # remember where a matching tag was 
     635      next if      (($e->[002]||'') =~ m{/$});  # self-closing 
    637636      $post_nest -= ($e->[000]) ? -1 : 1; 
    638637      $post_nest = 0 if $post_nest < 0; 
    639638      last if $pre_nest <= $post_nest; 
    640       $idx = $e->[006]+1 if $e->[000] && defined $e->[006];     # use caches for skipping backwards. 
    641     } 
    642    
    643   my $nest = 1;         # we know it is not self-closing. start behind. 
     639      $idx = $e->[006]+1 if $e->[000] && defined $e->[006];     # use caches for skipping backwards. 
     640    } 
     641 
     642  my $nest = 1;         # we know it is not self-closing. start behind. 
    644643 
    645644  for ($idx = $cur+1; $idx <= $last_idx; $idx++) 
     
    647646      my $e = $flat->[$idx]; 
    648647      next unless    $e->[001] eq $name; 
    649       next if      (($e->[002]||'') =~ m{/$});  # self-closing 
     648      next if      (($e->[002]||'') =~ m{/$});  # self-closing 
    650649      $nest      += ($e->[000]) ? -1 : 1; 
    651650      if ($nest <= 0) 
    652651        { 
    653           die "assert </xxx>" unless $e->[000]; 
    654           $e->[006] = $cur;     # point back to opening tag 
    655           return $flat->[$cur][006] = $idx; 
    656         } 
    657       $idx = $e->[006]-1 if !$e->[000] && $e->[006];    # use caches for skipping forward. 
    658     }  
     652          die "assert </xxx>" unless $e->[000]; 
     653          $e->[006] = $cur;     # point back to opening tag 
     654          return $flat->[$cur][006] = $idx; 
     655        } 
     656      $idx = $e->[006]-1 if !$e->[000] && $e->[006];    # use caches for skipping forward. 
     657    } 
    659658 
    660659  # not all closed, but cannot go further 
    661   return $flat->[$cur][006] = $last_idx+1;       
     660  return $flat->[$cur][006] = $last_idx+1; 
    662661} 
    663662 
     
    665664    my $txtref = shift;    # reference 
    666665    while ( $$txtref =~ m{ 
    667         <meta \s ((?: [^>]+\s )? http-equiv=['"]?Content-Type [^>]+ ) > 
     666        <meta \s ((?: [^>]+\s )? http-equiv\s*=\s*['"]?Content-Type [^>]+ ) > 
    668667    }sxgi ) { 
    669668        my $args = $1; 
  • lang/perl/HTML-TagParser/trunk/t/01_new.t

    • Property svn:executable deleted
  • lang/perl/HTML-TagParser/trunk/t/02_parse.t

    • Property svn:executable deleted
  • lang/perl/HTML-TagParser/trunk/t/03_open.t

    • Property svn:executable deleted
  • lang/perl/HTML-TagParser/trunk/t/04_fetch.t

    • Property svn:executable deleted
  • lang/perl/HTML-TagParser/trunk/t/05_charset.t

    • Property svn:executable deleted
  • lang/perl/HTML-TagParser/trunk/t/06_japanese.t

    • Property svn:executable deleted
  • lang/perl/HTML-TagParser/trunk/t/07_getelem.t

    • Property svn:executable deleted
  • lang/perl/HTML-TagParser/trunk/t/08_nest.t

    • Property svn:executable deleted
  • lang/perl/HTML-TagParser/trunk/t/09_broken.t

    • Property svn:executable deleted
  • lang/perl/HTML-TagParser/trunk/t/10_escape.t

    • Property svn:executable deleted
  • lang/perl/HTML-TagParser/trunk/t/11_invalid_attr.t

    • Property svn:executable deleted
  • lang/perl/HTML-TagParser/trunk/t/20_index-j.t

    • Property svn:executable deleted
  • lang/perl/HTML-TagParser/trunk/t/21_index-e.t

    • Property svn:executable deleted
  • lang/perl/HTML-TagParser/trunk/t/22_yahoo.t

    • Property svn:executable deleted
  • lang/perl/HTML-TagParser/trunk/t/23_flickr.t

    • Property svn:executable deleted
  • lang/perl/HTML-TagParser/trunk/t/sample/fetch-samples.sh

    r22228 r22252  
    11#!/bin/sh -v 
    22 
    3 # ISO-8859-1  
     3# ISO-8859-1 
    44wget -N http://www.kawa.net/xp/index-e.html 
    55