root/lang/perl/MSWord-ExtractContent/trunk/lib/MSWord/ExtractContent.pm @ 35409

Revision 35409, 13.8 kB (checked in by dayflower, 4 years ago)

CLX をきちんとハンドリングするように修正した。

Line 
1package MSWord::ExtractContent;
2
3use strict;
4use warnings;
5use 5.008_001;
6
7our $VERSION = '0.01';
8
9use Carp;
10use Encode ();
11use OLE::Storage_Lite;
12
13use Exporter 'import';
14
15our @EXPORT = qw(
16    extract_whole_contents_from_msword_file
17    extract_whole_contents_from_msword_data
18    extract_contents_from_msword_file
19    extract_contents_from_msword_data
20    extract_header_from_msword_file
21    extract_header_from_msword_data
22    extract_footnote_from_msword_file
23    extract_footnote_from_msword_data
24);
25
26our $DEBUG   = 0;
27
28sub extract_whole_contents_from_msword_file {
29    my $filename = shift;
30
31    return __PACKAGE__->from_file($filename)->whole_contents(@_);
32}
33
34sub extract_whole_contents_from_msword_data {
35    my $buffer = shift;
36
37    return __PACKAGE__->from_buffer($buffer)->whole_contents(@_);
38}
39
40sub extract_contents_from_msword_file {
41    my $filename = shift;
42
43    return __PACKAGE__->from_file($filename)->contents(@_);
44}
45
46sub extract_contents_from_msword_data {
47    my $buffer = shift;
48
49    return __PACKAGE__->from_buffer($buffer)->contents(@_);
50}
51
52sub extract_header_from_msword_file {
53    my $filename = shift;
54
55    return __PACKAGE__->from_file($filename)->header(@_);
56}
57
58sub extract_header_from_msword_data {
59    my $buffer = shift;
60
61    return __PACKAGE__->from_buffer($buffer)->header(@_);
62}
63
64sub extract_footnote_from_msword_file {
65    my $filename = shift;
66
67    return __PACKAGE__->from_file($filename)->footnote(@_);
68}
69
70sub extract_footnote_from_msword_data {
71    my $buffer = shift;
72
73    return __PACKAGE__->from_buffer($buffer)->footnote(@_);
74}
75
76sub from_file {
77    my $class    = shift;
78    my $filename = shift;
79
80    my $self = $class->_ctor();
81
82    if (! $self->_setup($filename)) {
83        return;
84    }
85
86    return $self;
87}
88
89sub from_buffer {
90    my $class  = shift;
91    my $buffer = shift;
92
93    my $self = $class->_ctor();
94
95    if (! $self->_setup(\$buffer)) {
96        return;
97    }
98
99    return $self;
100}
101
102sub _ctor {
103    my $class = shift;
104       $class = ref $class if ref $class;
105
106    my $self = bless {}, $class;
107
108    return $self;
109}
110
111sub whole_contents {
112    my $self = shift;
113
114    return $self->_retrieve_and_filter_text(0, -1, @_);
115}
116
117sub contents {
118    my $self = shift;
119
120    return $self->_retrieve_and_filter_text(0, $self->{_ccpText}, @_);
121}
122
123sub footnote {
124    my $self = shift;
125
126    my $offset = $self->{_ccpText};
127
128    return $self->_retrieve_and_filter_text($offset, $self->{_ccpFtn}, @_);
129}
130
131sub header {
132    my $self = shift;
133
134    my $offset = $self->{_ccpText} + $self->{_ccpFtn};
135
136    return $self->_retrieve_and_filter_text($offset, $self->{_ccpHdd}, @_);
137}
138
139sub _retrieve_and_filter_text {
140    my $self   = shift;
141    my $offset = shift;
142    my $length = shift;
143
144    my %option = (@_ && ref $_[0] eq 'HASH') ? %{$_[0]} : @_;
145
146    my $result = $self->_retrieve_substring($offset, $length);
147
148    if ($option{as_plaintext}) {
149        $result = _format_into_plain($result);
150    }
151
152    return $result;
153}
154
155our $PPS_NAME_WORDDOC           = 'WordDocument';
156our $PPS_NAME_TABLE_TMPL        = '%dTable';
157
158our $MAGIC_MSWORD               = 0xa5ec;
159our $NFIB_MSWORD6               = 101;
160
161our $OFFSET_FIB_IDENT           = 0x0000;
162our $OFFSET_FIB_FIB             = 0x0002;
163
164our $OFFSET_FIB_FLAGS           = 0x000a;
165our $OFFSET_FIB_FCCLX           = 0x01a2;
166our $OFFSET_FIB_LCBCLX          = 0x01a6;
167
168our $OFFSET_FIB_FCMIN           = 0x0018;
169our $OFFSET_FIB_FCMAC           = 0x001c;
170our $OFFSET_FIB_CBMAC           = 0x0040;
171
172our $MASK_FIBFLAG_COMPLEX       = 0x0004;
173our $MASK_FIBFLAG_ENCRYPTED     = 0x0100;
174our $MASK_FIBFLAG_WHICHTBLSTM   = 0x0200;
175
176our $LENGTH_CP  = 4;
177our $LENGTH_PCD = 8;
178
179our $OFFSET_FIB_CCP_MAP = {
180    ccpText     => 0x004c,
181    ccpFtn      => 0x0050,
182    ccpHdd      => 0x0054,
183    ccpMcr      => 0x0058,
184    ccpAtn      => 0x005c,
185    ccpEdn      => 0x0060,
186    ccpTxbx     => 0x0064,
187    ccpHdrTxbx  => 0x0068,
188};
189
190sub _setup {
191    my ($self, $filename) = @_;
192
193    my $oledoc = OLE::Storage_Lite->new($filename);
194    return  if ! $oledoc;
195
196    # load main stream
197    my $main_stream
198        = _retrieve_entry($oledoc, $PPS_NAME_WORDDOC);
199    return  if ! $main_stream;
200    $self->_main_stream($main_stream);
201
202    # parse FIB in main stream
203    if (! $self->_parse_fib()) {
204        return;
205    }
206
207    # load table stream
208    my $name_of_table
209        = sprintf $PPS_NAME_TABLE_TMPL,
210            ($self->{_flags}->{fWhichTblStm}) ? 1 : 0,
211        ;
212
213    my $table_stream
214        = _retrieve_entry($oledoc, $name_of_table);
215    return  if ! $table_stream;
216    $self->_table_stream($table_stream);
217
218    # parse piece table
219    if (! $self->_parse_piece_table()) {
220        return;
221    }
222
223    return $self;
224}
225
226sub _parse_fib {
227    my $self = shift;
228
229    # Header
230    $self->{_wIdent} = _get_short($self->_main_stream, $OFFSET_FIB_IDENT);
231    $self->{_nFib}   = _get_short($self->_main_stream, $OFFSET_FIB_FIB);
232
233    croak "Not a MSWord doc file"
234        if $self->{_wIdent} != $MAGIC_MSWORD;
235    croak "Unsupported version"
236        if $self->{_nFib} < $NFIB_MSWORD6;
237
238    # FIB flags
239    my $flags = _get_short($self->_main_stream, $OFFSET_FIB_FLAGS);
240
241    $self->{_flags}->{fComplex}
242        = ($flags & $MASK_FIBFLAG_COMPLEX) ? 1 : 0;
243
244    $self->{_flags}->{fEncrypted}
245        = ($flags & $MASK_FIBFLAG_ENCRYPTED) ? 1 : 0;
246    croak "Encrypted MSWord doc file is not supported"
247        if $self->{_flags}->{fEncrypted};
248
249    $self->{_flags}->{fWhichTblStm}
250        = ($flags & $MASK_FIBFLAG_WHICHTBLSTM) ? 1 : 0;
251
252    # fcMin, fcMac
253    $self->{_fcMin} = _get_long($self->_main_stream, $OFFSET_FIB_FCMIN);
254    $self->{_fcMac} = _get_long($self->_main_stream, $OFFSET_FIB_FCMAC);
255    $self->{_cbMac} = _get_long($self->_main_stream, $OFFSET_FIB_CBMAC);
256
257    # CLX
258    $self->{_fcClx}  = _get_long($self->_main_stream, $OFFSET_FIB_FCCLX);
259    $self->{_lcbClx} = _get_long($self->_main_stream, $OFFSET_FIB_LCBCLX);
260
261    # CCPs
262    $self->_parse_fib_ccps();
263
264    return $self;
265}
266
267sub _parse_fib_ccps {
268    my $self = shift;
269
270    while (my ($field, $offset) = each %{ $OFFSET_FIB_CCP_MAP }) {
271        $self->{'_' . $field} = _get_long($self->_main_stream, $offset);
272    }
273
274    return $self;
275}
276
277sub _parse_piece_table {
278    my $self = shift;
279
280    if ($self->{_lcbClx} <= 0) {
281        # pseudo piece table
282        my $ccpAll = 0;
283
284        foreach my $field (keys %{ $OFFSET_FIB_CCP_MAP }) {
285            $ccpAll += $self->{'_' . $field};
286        }
287
288        my $pcd = {
289            fc  => $self->{_fcMin},
290            cp  => 0,
291            ccp => $ccpAll,
292        };
293
294        $self->{_pcds} = [ $pcd ];
295
296        return $self;
297    }
298
299    my $clx
300        = substr $self->_table_stream, $self->{_fcClx}, $self->{_lcbClx};
301
302    while (length $clx > 0) {
303        my $clxt = ord(substr $clx, 0, 1, q{});
304
305        last  if $clxt == 2;    # plcfpcd
306
307        if ($clxt == 1) {       # grpprl => SKIP
308            my $skip = _get_short(substr $clx, 0, 2, q{});
309
310            substr $clx, 0, $skip, q{};
311        }
312        else {
313            croak "Unknown CLX block.";
314        }
315    }
316    croak "PCDs not found"  if length $clx <= 0;
317
318
319    my $length = _get_long(substr $clx, 0, 4, q{});
320
321    my $n = ( $length - $LENGTH_CP )  /  ( $LENGTH_CP + $LENGTH_PCD );
322    printf {*STDERR} "number of PCDs: %d\n", $n  if $DEBUG;
323
324    my @cps;
325    for (0 .. $n) {
326        my $cp = _get_long(substr $clx, 0, $LENGTH_CP, q{});
327        push @cps, $cp;
328    }
329
330    my @pcds;
331    for my $i (1 .. $n) {
332        my $pcd_data = substr $clx, 0, $LENGTH_PCD, q{};
333
334        my $fc = _get_long($pcd_data, 2);
335
336        my $pcd = {
337            fc  => $fc,
338            cp  => $cps[$i - 1],
339            ccp => $cps[$i] - $cps[$i - 1],
340        };
341
342        push @pcds, $pcd;
343    }
344
345    $self->{_pcds} = \@pcds;
346
347    return $self;
348}
349
350sub _retrieve_substring {
351    my ($self, $offset, $length) = @_;
352
353    if (! defined $length) {
354        $length = -1;
355    }
356
357    my $pcds = $self->{_pcds};
358
359    my $i = 0;
360    while ($i < @$pcds) {
361        last if $pcds->[$i]->{cp} > $offset;
362
363        $i ++;
364    }
365    $i --;
366    die 'could not find suitable heading piece'  if $i < 0;
367
368
369    my ($utf16, $ascii);
370
371    my $output = q{};
372
373    while ($length > 0 || $length < 0) {
374        my $pcd = $pcds->[$i];
375
376        my $len = $length;
377           $len = $pcd->{ccp}  if $pcd->{ccp} < $len || $len < 0;
378
379        if ($pcd->{fc} & 0x40000000) {
380            # cp1252
381            $ascii ||= Encode::find_encoding('CP1252');
382
383            my $fc = ($pcd->{fc} ^ 0x40000000) >> 1;
384
385            $fc += $offset;
386            $offset = 0;
387
388            my $piece = substr $self->_main_stream, $fc, $len;
389
390            $output .= $ascii->decode($piece);
391        }
392        else {
393            # utf-16le
394            $utf16 ||= Encode::find_encoding('UTF-16LE');
395
396            my $fc = $pcd->{fc};
397
398            $fc += $offset * 2;
399            $offset = 0;
400
401            my $piece = substr $self->_main_stream, $fc, $len * 2;
402
403            $output .= $utf16->decode($piece);
404        }
405
406        $length -= $len  if $length >= 0;
407
408        $i ++;
409        last if $i >= @$pcds;
410    }
411
412    return $output;
413}
414
415sub _main_stream {
416    my $self = shift;
417
418    if (@_) {
419        $self->{main_stream} = shift;
420    }
421
422    return $self->{main_stream};
423}
424
425sub _table_stream {
426    my $self = shift;
427
428    if (@_) {
429        $self->{table_stream} = shift;
430    }
431
432    return $self->{table_stream};
433}
434
435sub _retrieve_entry {
436    my ($oledoc, $label) = @_;
437
438    my $utf16 = Encode::find_encoding('UTF-16LE');
439
440    my $u_label = $utf16->encode($label);
441
442    my ($pps) = $oledoc->getPpsSearch([ $u_label ], 1);
443
444    return  if ! defined $pps;
445
446    return $pps->{Data};
447}
448
449sub _get_short {
450    my ($buffer, $offset) = @_;
451    $offset ||= 0;
452
453    return if ! defined $buffer;
454
455    my $bin = substr $buffer, $offset, 2;
456    return if length $bin < 2;
457
458    my ($value) = unpack 'v', $bin;
459
460    return $value;
461}
462
463sub _get_long {
464    my ($buffer, $offset) = @_;
465    $offset ||= 0;
466
467    return if ! defined $buffer;
468
469    my $bin = substr $buffer, $offset, 4;
470    return if length $bin < 4;
471
472    my ($value) = unpack 'V', $bin;
473
474    return $value;
475}
476
477sub _format_into_plain {
478    my $text = shift;
479
480    my %unimap = (
481        "\x0a" => "\n",         # ASIS: Line Feed
482        "\x09" => "\t",         # ASIS: Tab
483
484        "\x0d" => "\n",         # Paragraph ends; \n + U+2029?
485
486        "\x0b" => "\n",         # Hard line breaks
487
488        "\x2d" => "\x2d",       # ASIS: Breaking hyphens; U+2010?
489        "\x1f" => "\x{00ad}",   # Non-required hyphens (into Soft hyphen)
490        "\x1e" => "\x{2011}",   # Non-breaking hyphens
491
492        "\xa0" => "\xa0",       # ASIS: Non-breaking-spaces
493
494        "\x0c" => "\x0c",       # ASIS: Page breaks or Section marks
495
496        "\x0e" => "\x0e",       # ASIS: Column breaks
497
498        "\x13" => "",           # Field begin mark
499        "\x15" => "",           # Field end mark
500        "\x14" => "",           # Field separator
501
502        "\x07" => "\t",         # Cell mark or Row mark
503    );
504
505    # pseudo row mark detection
506    $text =~ s{ ( [\x07]* ) [\x07]{2} }{$1\n}gxmso;
507
508    $text =~ s{([\x00-\x1f])}{ $unimap{$1} || '' }egxmso;
509
510    return $text;
511}
512
5131;
514__END__
515
516=head1 NAME
517
518MSWord::ExtractContent - Extract text contents from MSWord document
519
520=head1 SYNOPSIS
521
522    use MSWord::ExtractContent;
523    use Encode;
524   
525    # load and parse MSWord file
526    my $word = MSWord::ExtractContent->from_file('sample.doc');
527    # output contents
528    print encode('UTF-8', $word->contents);
529    # can retrieve header
530    my $header = $word->header( as_plaintext => 1);
531   
532    # non-OO style
533    my $footnote = extract_contents_from_msword_file('sample.doc'));
534
535=head1 DESCRIPTION
536
537MSWord::ExtractContent is contents extractor for Microsoft Word document.
538
539=head1 METHODS
540
541=over 4
542
543=item CLASS-E<gt>from_file($filename)
544
545Parses specified MSWord document file and returns result object.
546
547=item CLASS-E<gt>from_buffer($buffer)
548
549Parses specified MSWord document data and returns result object.
550
551=back
552
553=head1 METHODS FOR RESULT OBJECT
554
555With following four methods, you can retrieve text contents from result
556object generated above methods.
557They produce text strings in Unicode string form, that means resulting text
558may be utf-8 flagged, so you can not output such text on console directly.
559(Recommended to use Encode::encode() with suitable encoding for output.)
560
561=over 4
562
563=item $doc-E<gt>contents([ $options ]);
564
565=item $doc-E<gt>header([ $options ]);
566
567=item $doc-E<gt>footnote([ $options ]);
568
569=item $doc-E<gt>whole_contents([ $options ]);
570
571=back
572
573=head1 OPTIONS FOR CONTENTS RETRIEVAL
574
575=over 4
576
577=item as_plaintext
578
579If this option is specified, contents retrieval methods described above
580filter some of special characters (such as cell mark) from resulting contents.
581
582=back
583
584=head1 FUNCTIONAL INTERFACE
585
586=over 4
587
588=item extract_contents_from_msword_file($filename, [ $options ]);
589
590=item extract_header_from_msword_file($filename, [ $options ]);
591
592=item extract_footnote_from_msword_file($filename, [ $options ]);
593
594=item extract_whole_contents_from_msword_file($filename, [ $options ]);
595
596=back
597
598Functionalities of following methods are same as above, but they parse
599document data buffers instead of parsing files.
600
601=over 4
602
603=item extract_contents_from_msword_data($buffer, [ $options ]);
604
605=item extract_header_from_msword_data($buffer, [ $options ]);
606
607=item extract_footnote_from_msword_data($buffer, [ $options ]);
608
609=item extract_whole_contents_from_msword_data($buffer, [ $options ]);
610
611=back
612
613=head1 LIMITATIONS
614
615Only support Microsoft Word binary document.
616Does not support Microsoft Word XML document (.docx).
617
618This module does not handle PAP (PAragraph Properties) and CHP (CHaracter
619Properties), that define paragraphs and characters style.
620Those styling information are required to determine functionalities of
621some of special characters (such as row mark, footnote reference, and etc).
622
623This module does not handle summary information stream in Word file.
624
625=head1 AUTHOR
626
627ITO Nobuaki E<lt>banb@cpan.orgE<gt>
628
629=head1 LICENSE
630
631This library is free software; you can redistribute it and/or modify
632it under the same terms as Perl itself.
633
634=head1 SEE ALSO
635
636=cut
Note: See TracBrowser for help on using the browser.