Show
Ignore:
Timestamp:
09/17/09 16:58:57 (5 years ago)
Author:
dayflower
Message:

CLX をきちんとハンドリングするように修正した。

Location:
lang/perl/MSWord-ExtractContent/trunk
Files:
3 modified

Legend:

Unmodified
Added
Removed
  • lang/perl/MSWord-ExtractContent/trunk/lib/File/Extract/MSWord.pm

    r35406 r35409  
    3030__END__ 
    3131 
    32 =encoding utf-8 
    33  
    34 =for stopwords 
    35 MSWord 
    36  
    3732=head1 NAME 
    3833 
  • lang/perl/MSWord-ExtractContent/trunk/lib/MSWord/ExtractContent.pm

    r35406 r35409  
    300300        = substr $self->_table_stream, $self->{_fcClx}, $self->{_lcbClx}; 
    301301 
    302     $self->{_prm_encoding} = ord(substr $clx, 0, 1, q{}); 
     302    while (length $clx > 0) { 
     303        my $clxt = ord(substr $clx, 0, 1, q{}); 
     304 
     305        last  if $clxt == 2;    # plcfpcd 
     306 
     307        if ($clxt == 1) {       # grpprl => SKIP 
     308            my $skip = _get_short(substr $clx, 0, 2, q{}); 
     309 
     310            substr $clx, 0, $skip, q{}; 
     311        } 
     312        else { 
     313            croak "Unknown CLX block."; 
     314        } 
     315    } 
     316    croak "PCDs not found"  if length $clx <= 0; 
     317 
    303318 
    304319    my $length = _get_long(substr $clx, 0, 4, q{}); 
    305     if ($length != length $clx) { 
    306         carp "Unmatched PCD length."; 
    307     } 
    308  
    309     my $n = ( length($clx) - $LENGTH_CP )  /  ( $LENGTH_CP + $LENGTH_PCD ); 
     320 
     321    my $n = ( $length - $LENGTH_CP )  /  ( $LENGTH_CP + $LENGTH_PCD ); 
    310322    printf {*STDERR} "number of PCDs: %d\n", $n  if $DEBUG; 
    311323 
     
    5015131; 
    502514__END__ 
    503  
    504 =encoding utf-8 
    505  
    506 =for stopwords 
    507 functionalities 
    508 utf 
    509 msword 
    510 CHP CHaracter PAP PAragraph 
    511515 
    512516=head1 NAME 
     
    609613=head1 LIMITATIONS 
    610614 
     615Only supports Microsoft Word binary documents. 
     616Does not support Microsoft Word XML documents (.docx). 
     617 
    611618This module does not handle PAP (PAragraph Properties) and CHP (CHaracter 
    612619Properties), which define paragraph and character styles. 
  • lang/perl/MSWord-ExtractContent/trunk/t/97_podspell.t

    r35406 r35409  
    1111Nobuaki 
    1212ITO 
     13functionalities 
    1314utf 
     15docx 
     16msword 
     17CHP 
     18CHaracter 
     19PAP 
     20PAragraph