| 1 | package MSWord::ExtractContent; |
|---|
| 2 | |
|---|
| 3 | use strict; |
|---|
| 4 | use warnings; |
|---|
| 5 | use 5.008_001; |
|---|
| 6 | |
|---|
| 7 | our $VERSION = '0.01'; |
|---|
| 8 | |
|---|
| 9 | use Carp; |
|---|
| 10 | use Encode (); |
|---|
| 11 | use OLE::Storage_Lite; |
|---|
| 12 | |
|---|
| 13 | use Exporter 'import'; |
|---|
| 14 | |
|---|
| 15 | our @EXPORT = qw( |
|---|
| 16 | extract_whole_contents_from_msword_file |
|---|
| 17 | extract_whole_contents_from_msword_data |
|---|
| 18 | extract_contents_from_msword_file |
|---|
| 19 | extract_contents_from_msword_data |
|---|
| 20 | extract_header_from_msword_file |
|---|
| 21 | extract_header_from_msword_data |
|---|
| 22 | extract_footnote_from_msword_file |
|---|
| 23 | extract_footnote_from_msword_data |
|---|
| 24 | ); |
|---|
| 25 | |
|---|
| 26 | our $DEBUG = 0; |
|---|
| 27 | |
|---|
| 28 | sub extract_whole_contents_from_msword_file { |
|---|
| 29 | my $filename = shift; |
|---|
| 30 | |
|---|
| 31 | return __PACKAGE__->from_file($filename)->whole_contents(@_); |
|---|
| 32 | } |
|---|
| 33 | |
|---|
| 34 | sub extract_whole_contents_from_msword_data { |
|---|
| 35 | my $buffer = shift; |
|---|
| 36 | |
|---|
| 37 | return __PACKAGE__->from_buffer($buffer)->whole_contents(@_); |
|---|
| 38 | } |
|---|
| 39 | |
|---|
| 40 | sub extract_contents_from_msword_file { |
|---|
| 41 | my $filename = shift; |
|---|
| 42 | |
|---|
| 43 | return __PACKAGE__->from_file($filename)->contents(@_); |
|---|
| 44 | } |
|---|
| 45 | |
|---|
| 46 | sub extract_contents_from_msword_data { |
|---|
| 47 | my $buffer = shift; |
|---|
| 48 | |
|---|
| 49 | return __PACKAGE__->from_buffer($buffer)->contents(@_); |
|---|
| 50 | } |
|---|
| 51 | |
|---|
| 52 | sub extract_header_from_msword_file { |
|---|
| 53 | my $filename = shift; |
|---|
| 54 | |
|---|
| 55 | return __PACKAGE__->from_file($filename)->header(@_); |
|---|
| 56 | } |
|---|
| 57 | |
|---|
| 58 | sub extract_header_from_msword_data { |
|---|
| 59 | my $buffer = shift; |
|---|
| 60 | |
|---|
| 61 | return __PACKAGE__->from_buffer($buffer)->header(@_); |
|---|
| 62 | } |
|---|
| 63 | |
|---|
| 64 | sub extract_footnote_from_msword_file { |
|---|
| 65 | my $filename = shift; |
|---|
| 66 | |
|---|
| 67 | return __PACKAGE__->from_file($filename)->footnote(@_); |
|---|
| 68 | } |
|---|
| 69 | |
|---|
| 70 | sub extract_footnote_from_msword_data { |
|---|
| 71 | my $buffer = shift; |
|---|
| 72 | |
|---|
| 73 | return __PACKAGE__->from_buffer($buffer)->footnote(@_); |
|---|
| 74 | } |
|---|
| 75 | |
|---|
| 76 | sub from_file { |
|---|
| 77 | my $class = shift; |
|---|
| 78 | my $filename = shift; |
|---|
| 79 | |
|---|
| 80 | my $self = $class->_ctor(); |
|---|
| 81 | |
|---|
| 82 | if (! $self->_setup($filename)) { |
|---|
| 83 | return; |
|---|
| 84 | } |
|---|
| 85 | |
|---|
| 86 | return $self; |
|---|
| 87 | } |
|---|
| 88 | |
|---|
| 89 | sub from_buffer { |
|---|
| 90 | my $class = shift; |
|---|
| 91 | my $buffer = shift; |
|---|
| 92 | |
|---|
| 93 | my $self = $class->_ctor(); |
|---|
| 94 | |
|---|
| 95 | if (! $self->_setup(\$buffer)) { |
|---|
| 96 | return; |
|---|
| 97 | } |
|---|
| 98 | |
|---|
| 99 | return $self; |
|---|
| 100 | } |
|---|
| 101 | |
|---|
| 102 | sub _ctor { |
|---|
| 103 | my $class = shift; |
|---|
| 104 | $class = ref $class if ref $class; |
|---|
| 105 | |
|---|
| 106 | my $self = bless {}, $class; |
|---|
| 107 | |
|---|
| 108 | return $self; |
|---|
| 109 | } |
|---|
| 110 | |
|---|
| 111 | sub whole_contents { |
|---|
| 112 | my $self = shift; |
|---|
| 113 | |
|---|
| 114 | return $self->_retrieve_and_filter_text(0, -1, @_); |
|---|
| 115 | } |
|---|
| 116 | |
|---|
| 117 | sub contents { |
|---|
| 118 | my $self = shift; |
|---|
| 119 | |
|---|
| 120 | return $self->_retrieve_and_filter_text(0, $self->{_ccpText}, @_); |
|---|
| 121 | } |
|---|
| 122 | |
|---|
| 123 | sub footnote { |
|---|
| 124 | my $self = shift; |
|---|
| 125 | |
|---|
| 126 | my $offset = $self->{_ccpText}; |
|---|
| 127 | |
|---|
| 128 | return $self->_retrieve_and_filter_text($offset, $self->{_ccpFtn}, @_); |
|---|
| 129 | } |
|---|
| 130 | |
|---|
| 131 | sub header { |
|---|
| 132 | my $self = shift; |
|---|
| 133 | |
|---|
| 134 | my $offset = $self->{_ccpText} + $self->{_ccpFtn}; |
|---|
| 135 | |
|---|
| 136 | return $self->_retrieve_and_filter_text($offset, $self->{_ccpHdd}, @_); |
|---|
| 137 | } |
|---|
| 138 | |
|---|
| 139 | sub _retrieve_and_filter_text { |
|---|
| 140 | my $self = shift; |
|---|
| 141 | my $offset = shift; |
|---|
| 142 | my $length = shift; |
|---|
| 143 | |
|---|
| 144 | my %option = (@_ && ref $_[0] eq 'HASH') ? %{$_[0]} : @_; |
|---|
| 145 | |
|---|
| 146 | my $result = $self->_retrieve_substring($offset, $length); |
|---|
| 147 | |
|---|
| 148 | if ($option{as_plaintext}) { |
|---|
| 149 | $result = _format_into_plain($result); |
|---|
| 150 | } |
|---|
| 151 | |
|---|
| 152 | return $result; |
|---|
| 153 | } |
|---|
| 154 | |
|---|
| 155 | our $PPS_NAME_WORDDOC = 'WordDocument'; |
|---|
| 156 | our $PPS_NAME_TABLE_TMPL = '%dTable'; |
|---|
| 157 | |
|---|
| 158 | our $MAGIC_MSWORD = 0xa5ec; |
|---|
| 159 | our $NFIB_MSWORD6 = 101; |
|---|
| 160 | |
|---|
| 161 | our $OFFSET_FIB_IDENT = 0x0000; |
|---|
| 162 | our $OFFSET_FIB_FIB = 0x0002; |
|---|
| 163 | |
|---|
| 164 | our $OFFSET_FIB_FLAGS = 0x000a; |
|---|
| 165 | our $OFFSET_FIB_FCCLX = 0x01a2; |
|---|
| 166 | our $OFFSET_FIB_LCBCLX = 0x01a6; |
|---|
| 167 | |
|---|
| 168 | our $OFFSET_FIB_FCMIN = 0x0018; |
|---|
| 169 | our $OFFSET_FIB_FCMAC = 0x001c; |
|---|
| 170 | our $OFFSET_FIB_CBMAC = 0x0040; |
|---|
| 171 | |
|---|
| 172 | our $MASK_FIBFLAG_COMPLEX = 0x0004; |
|---|
| 173 | our $MASK_FIBFLAG_ENCRYPTED = 0x0100; |
|---|
| 174 | our $MASK_FIBFLAG_WHICHTBLSTM = 0x0200; |
|---|
| 175 | |
|---|
| 176 | our $LENGTH_CP = 4; |
|---|
| 177 | our $LENGTH_PCD = 8; |
|---|
| 178 | |
|---|
| 179 | our $OFFSET_FIB_CCP_MAP = { |
|---|
| 180 | ccpText => 0x004c, |
|---|
| 181 | ccpFtn => 0x0050, |
|---|
| 182 | ccpHdd => 0x0054, |
|---|
| 183 | ccpMcr => 0x0058, |
|---|
| 184 | ccpAtn => 0x005c, |
|---|
| 185 | ccpEdn => 0x0060, |
|---|
| 186 | ccpTxbx => 0x0064, |
|---|
| 187 | ccpHdrTxbx => 0x0068, |
|---|
| 188 | }; |
|---|
| 189 | |
|---|
| 190 | sub _setup { |
|---|
| 191 | my ($self, $filename) = @_; |
|---|
| 192 | |
|---|
| 193 | my $oledoc = OLE::Storage_Lite->new($filename); |
|---|
| 194 | return if ! $oledoc; |
|---|
| 195 | |
|---|
| 196 | # load main stream |
|---|
| 197 | my $main_stream |
|---|
| 198 | = _retrieve_entry($oledoc, $PPS_NAME_WORDDOC); |
|---|
| 199 | return if ! $main_stream; |
|---|
| 200 | $self->_main_stream($main_stream); |
|---|
| 201 | |
|---|
| 202 | # parse FIB in main stream |
|---|
| 203 | if (! $self->_parse_fib()) { |
|---|
| 204 | return; |
|---|
| 205 | } |
|---|
| 206 | |
|---|
| 207 | # load table stream |
|---|
| 208 | my $name_of_table |
|---|
| 209 | = sprintf $PPS_NAME_TABLE_TMPL, |
|---|
| 210 | ($self->{_flags}->{fWhichTblStm}) ? 1 : 0, |
|---|
| 211 | ; |
|---|
| 212 | |
|---|
| 213 | my $table_stream |
|---|
| 214 | = _retrieve_entry($oledoc, $name_of_table); |
|---|
| 215 | return if ! $table_stream; |
|---|
| 216 | $self->_table_stream($table_stream); |
|---|
| 217 | |
|---|
| 218 | # parse piece table |
|---|
| 219 | if (! $self->_parse_piece_table()) { |
|---|
| 220 | return; |
|---|
| 221 | } |
|---|
| 222 | |
|---|
| 223 | return $self; |
|---|
| 224 | } |
|---|
| 225 | |
|---|
| 226 | sub _parse_fib { |
|---|
| 227 | my $self = shift; |
|---|
| 228 | |
|---|
| 229 | # Header |
|---|
| 230 | $self->{_wIdent} = _get_short($self->_main_stream, $OFFSET_FIB_IDENT); |
|---|
| 231 | $self->{_nFib} = _get_short($self->_main_stream, $OFFSET_FIB_FIB); |
|---|
| 232 | |
|---|
| 233 | croak "Not a MSWord doc file" |
|---|
| 234 | if $self->{_wIdent} != $MAGIC_MSWORD; |
|---|
| 235 | croak "Unsupported version" |
|---|
| 236 | if $self->{_nFib} < $NFIB_MSWORD6; |
|---|
| 237 | |
|---|
| 238 | # FIB flags |
|---|
| 239 | my $flags = _get_short($self->_main_stream, $OFFSET_FIB_FLAGS); |
|---|
| 240 | |
|---|
| 241 | $self->{_flags}->{fComplex} |
|---|
| 242 | = ($flags & $MASK_FIBFLAG_COMPLEX) ? 1 : 0; |
|---|
| 243 | |
|---|
| 244 | $self->{_flags}->{fEncrypted} |
|---|
| 245 | = ($flags & $MASK_FIBFLAG_ENCRYPTED) ? 1 : 0; |
|---|
| 246 | croak "Encrypted MSWord doc file is not supported" |
|---|
| 247 | if $self->{_flags}->{fEncrypted}; |
|---|
| 248 | |
|---|
| 249 | $self->{_flags}->{fWhichTblStm} |
|---|
| 250 | = ($flags & $MASK_FIBFLAG_WHICHTBLSTM) ? 1 : 0; |
|---|
| 251 | |
|---|
| 252 | # fcMin, fcMac |
|---|
| 253 | $self->{_fcMin} = _get_long($self->_main_stream, $OFFSET_FIB_FCMIN); |
|---|
| 254 | $self->{_fcMac} = _get_long($self->_main_stream, $OFFSET_FIB_FCMAC); |
|---|
| 255 | $self->{_cbMac} = _get_long($self->_main_stream, $OFFSET_FIB_CBMAC); |
|---|
| 256 | |
|---|
| 257 | # CLX |
|---|
| 258 | $self->{_fcClx} = _get_long($self->_main_stream, $OFFSET_FIB_FCCLX); |
|---|
| 259 | $self->{_lcbClx} = _get_long($self->_main_stream, $OFFSET_FIB_LCBCLX); |
|---|
| 260 | |
|---|
| 261 | # CCPs |
|---|
| 262 | $self->_parse_fib_ccps(); |
|---|
| 263 | |
|---|
| 264 | return $self; |
|---|
| 265 | } |
|---|
| 266 | |
|---|
| 267 | sub _parse_fib_ccps { |
|---|
| 268 | my $self = shift; |
|---|
| 269 | |
|---|
| 270 | while (my ($field, $offset) = each %{ $OFFSET_FIB_CCP_MAP }) { |
|---|
| 271 | $self->{'_' . $field} = _get_long($self->_main_stream, $offset); |
|---|
| 272 | } |
|---|
| 273 | |
|---|
| 274 | return $self; |
|---|
| 275 | } |
|---|
| 276 | |
|---|
| 277 | sub _parse_piece_table { |
|---|
| 278 | my $self = shift; |
|---|
| 279 | |
|---|
| 280 | if ($self->{_lcbClx} <= 0) { |
|---|
| 281 | # pseudo piece table |
|---|
| 282 | my $ccpAll = 0; |
|---|
| 283 | |
|---|
| 284 | foreach my $field (keys %{ $OFFSET_FIB_CCP_MAP }) { |
|---|
| 285 | $ccpAll += $self->{'_' . $field}; |
|---|
| 286 | } |
|---|
| 287 | |
|---|
| 288 | my $pcd = { |
|---|
| 289 | fc => $self->{_fcMin}, |
|---|
| 290 | cp => 0, |
|---|
| 291 | ccp => $ccpAll, |
|---|
| 292 | }; |
|---|
| 293 | |
|---|
| 294 | $self->{_pcds} = [ $pcd ]; |
|---|
| 295 | |
|---|
| 296 | return $self; |
|---|
| 297 | } |
|---|
| 298 | |
|---|
| 299 | my $clx |
|---|
| 300 | = substr $self->_table_stream, $self->{_fcClx}, $self->{_lcbClx}; |
|---|
| 301 | |
|---|
| 302 | while (length $clx > 0) { |
|---|
| 303 | my $clxt = ord(substr $clx, 0, 1, q{}); |
|---|
| 304 | |
|---|
| 305 | last if $clxt == 2; # plcfpcd |
|---|
| 306 | |
|---|
| 307 | if ($clxt == 1) { # grpprl => SKIP |
|---|
| 308 | my $skip = _get_short(substr $clx, 0, 2, q{}); |
|---|
| 309 | |
|---|
| 310 | substr $clx, 0, $skip, q{}; |
|---|
| 311 | } |
|---|
| 312 | else { |
|---|
| 313 | croak "Unknown CLX block."; |
|---|
| 314 | } |
|---|
| 315 | } |
|---|
| 316 | croak "PCDs not found" if length $clx <= 0; |
|---|
| 317 | |
|---|
| 318 | |
|---|
| 319 | my $length = _get_long(substr $clx, 0, 4, q{}); |
|---|
| 320 | |
|---|
| 321 | my $n = ( $length - $LENGTH_CP ) / ( $LENGTH_CP + $LENGTH_PCD ); |
|---|
| 322 | printf {*STDERR} "number of PCDs: %d\n", $n if $DEBUG; |
|---|
| 323 | |
|---|
| 324 | my @cps; |
|---|
| 325 | for (0 .. $n) { |
|---|
| 326 | my $cp = _get_long(substr $clx, 0, $LENGTH_CP, q{}); |
|---|
| 327 | push @cps, $cp; |
|---|
| 328 | } |
|---|
| 329 | |
|---|
| 330 | my @pcds; |
|---|
| 331 | for my $i (1 .. $n) { |
|---|
| 332 | my $pcd_data = substr $clx, 0, $LENGTH_PCD, q{}; |
|---|
| 333 | |
|---|
| 334 | my $fc = _get_long($pcd_data, 2); |
|---|
| 335 | |
|---|
| 336 | my $pcd = { |
|---|
| 337 | fc => $fc, |
|---|
| 338 | cp => $cps[$i - 1], |
|---|
| 339 | ccp => $cps[$i] - $cps[$i - 1], |
|---|
| 340 | }; |
|---|
| 341 | |
|---|
| 342 | push @pcds, $pcd; |
|---|
| 343 | } |
|---|
| 344 | |
|---|
| 345 | $self->{_pcds} = \@pcds; |
|---|
| 346 | |
|---|
| 347 | return $self; |
|---|
| 348 | } |
|---|
| 349 | |
|---|
| 350 | sub _retrieve_substring { |
|---|
| 351 | my ($self, $offset, $length) = @_; |
|---|
| 352 | |
|---|
| 353 | if (! defined $length) { |
|---|
| 354 | $length = -1; |
|---|
| 355 | } |
|---|
| 356 | |
|---|
| 357 | my $pcds = $self->{_pcds}; |
|---|
| 358 | |
|---|
| 359 | my $i = 0; |
|---|
| 360 | while ($i < @$pcds) { |
|---|
| 361 | last if $pcds->[$i]->{cp} > $offset; |
|---|
| 362 | |
|---|
| 363 | $i ++; |
|---|
| 364 | } |
|---|
| 365 | $i --; |
|---|
| 366 | die 'could not find suitable heading piece' if $i < 0; |
|---|
| 367 | |
|---|
| 368 | |
|---|
| 369 | my ($utf16, $ascii); |
|---|
| 370 | |
|---|
| 371 | my $output = q{}; |
|---|
| 372 | |
|---|
| 373 | while ($length > 0 || $length < 0) { |
|---|
| 374 | my $pcd = $pcds->[$i]; |
|---|
| 375 | |
|---|
| 376 | my $len = $length; |
|---|
| 377 | $len = $pcd->{ccp} if $pcd->{ccp} < $len || $len < 0; |
|---|
| 378 | |
|---|
| 379 | if ($pcd->{fc} & 0x40000000) { |
|---|
| 380 | # cp1252 |
|---|
| 381 | $ascii ||= Encode::find_encoding('CP1252'); |
|---|
| 382 | |
|---|
| 383 | my $fc = ($pcd->{fc} ^ 0x40000000) >> 1; |
|---|
| 384 | |
|---|
| 385 | $fc += $offset; |
|---|
| 386 | $offset = 0; |
|---|
| 387 | |
|---|
| 388 | my $piece = substr $self->_main_stream, $fc, $len; |
|---|
| 389 | |
|---|
| 390 | $output .= $ascii->decode($piece); |
|---|
| 391 | } |
|---|
| 392 | else { |
|---|
| 393 | # utf-16le |
|---|
| 394 | $utf16 ||= Encode::find_encoding('UTF-16LE'); |
|---|
| 395 | |
|---|
| 396 | my $fc = $pcd->{fc}; |
|---|
| 397 | |
|---|
| 398 | $fc += $offset * 2; |
|---|
| 399 | $offset = 0; |
|---|
| 400 | |
|---|
| 401 | my $piece = substr $self->_main_stream, $fc, $len * 2; |
|---|
| 402 | |
|---|
| 403 | $output .= $utf16->decode($piece); |
|---|
| 404 | } |
|---|
| 405 | |
|---|
| 406 | $length -= $len if $length >= 0; |
|---|
| 407 | |
|---|
| 408 | $i ++; |
|---|
| 409 | last if $i >= @$pcds; |
|---|
| 410 | } |
|---|
| 411 | |
|---|
| 412 | return $output; |
|---|
| 413 | } |
|---|
| 414 | |
|---|
| 415 | sub _main_stream { |
|---|
| 416 | my $self = shift; |
|---|
| 417 | |
|---|
| 418 | if (@_) { |
|---|
| 419 | $self->{main_stream} = shift; |
|---|
| 420 | } |
|---|
| 421 | |
|---|
| 422 | return $self->{main_stream}; |
|---|
| 423 | } |
|---|
| 424 | |
|---|
| 425 | sub _table_stream { |
|---|
| 426 | my $self = shift; |
|---|
| 427 | |
|---|
| 428 | if (@_) { |
|---|
| 429 | $self->{table_stream} = shift; |
|---|
| 430 | } |
|---|
| 431 | |
|---|
| 432 | return $self->{table_stream}; |
|---|
| 433 | } |
|---|
| 434 | |
|---|
| 435 | sub _retrieve_entry { |
|---|
| 436 | my ($oledoc, $label) = @_; |
|---|
| 437 | |
|---|
| 438 | my $utf16 = Encode::find_encoding('UTF-16LE'); |
|---|
| 439 | |
|---|
| 440 | my $u_label = $utf16->encode($label); |
|---|
| 441 | |
|---|
| 442 | my ($pps) = $oledoc->getPpsSearch([ $u_label ], 1); |
|---|
| 443 | |
|---|
| 444 | return if ! defined $pps; |
|---|
| 445 | |
|---|
| 446 | return $pps->{Data}; |
|---|
| 447 | } |
|---|
| 448 | |
|---|
| 449 | sub _get_short { |
|---|
| 450 | my ($buffer, $offset) = @_; |
|---|
| 451 | $offset ||= 0; |
|---|
| 452 | |
|---|
| 453 | return if ! defined $buffer; |
|---|
| 454 | |
|---|
| 455 | my $bin = substr $buffer, $offset, 2; |
|---|
| 456 | return if length $bin < 2; |
|---|
| 457 | |
|---|
| 458 | my ($value) = unpack 'v', $bin; |
|---|
| 459 | |
|---|
| 460 | return $value; |
|---|
| 461 | } |
|---|
| 462 | |
|---|
| 463 | sub _get_long { |
|---|
| 464 | my ($buffer, $offset) = @_; |
|---|
| 465 | $offset ||= 0; |
|---|
| 466 | |
|---|
| 467 | return if ! defined $buffer; |
|---|
| 468 | |
|---|
| 469 | my $bin = substr $buffer, $offset, 4; |
|---|
| 470 | return if length $bin < 4; |
|---|
| 471 | |
|---|
| 472 | my ($value) = unpack 'V', $bin; |
|---|
| 473 | |
|---|
| 474 | return $value; |
|---|
| 475 | } |
|---|
| 476 | |
|---|
| 477 | sub _format_into_plain { |
|---|
| 478 | my $text = shift; |
|---|
| 479 | |
|---|
| 480 | my %unimap = ( |
|---|
| 481 | "\x0a" => "\n", # ASIS: Line Feed |
|---|
| 482 | "\x09" => "\t", # ASIS: Tab |
|---|
| 483 | |
|---|
| 484 | "\x0d" => "\n", # Paragraph ends; \n + U+2029? |
|---|
| 485 | |
|---|
| 486 | "\x0b" => "\n", # Hard line breaks |
|---|
| 487 | |
|---|
| 488 | "\x2d" => "\x2d", # ASIS: Breaking hyphens; U+2010? |
|---|
| 489 | "\x1f" => "\x{00ad}", # Non-required hyphens (into Soft hyphen) |
|---|
| 490 | "\x1e" => "\x{2011}", # Non-breaking hyphens |
|---|
| 491 | |
|---|
| 492 | "\xa0" => "\xa0", # ASIS: Non-breaking-spaces |
|---|
| 493 | |
|---|
| 494 | "\x0c" => "\x0c", # ASIS: Page breaks or Section marks |
|---|
| 495 | |
|---|
| 496 | "\x0e" => "\x0e", # ASIS: Column breaks |
|---|
| 497 | |
|---|
| 498 | "\x13" => "", # Field begin mark |
|---|
| 499 | "\x15" => "", # Field end mark |
|---|
| 500 | "\x14" => "", # Field separator |
|---|
| 501 | |
|---|
| 502 | "\x07" => "\t", # Cell mark or Row mark |
|---|
| 503 | ); |
|---|
| 504 | |
|---|
| 505 | # pseudo row mark detection |
|---|
| 506 | $text =~ s{ ( [\x07]* ) [\x07]{2} }{$1\n}gxmso; |
|---|
| 507 | |
|---|
| 508 | $text =~ s{([\x00-\x1f])}{ $unimap{$1} || '' }egxmso; |
|---|
| 509 | |
|---|
| 510 | return $text; |
|---|
| 511 | } |
|---|
| 512 | |
|---|
| 513 | 1; |
|---|
| 514 | __END__ |
|---|
| 515 | |
|---|
| 516 | =head1 NAME |
|---|
| 517 | |
|---|
| 518 | MSWord::ExtractContent - Extract text contents from MSWord document |
|---|
| 519 | |
|---|
| 520 | =head1 SYNOPSIS |
|---|
| 521 | |
|---|
| 522 | use MSWord::ExtractContent; |
|---|
| 523 | use Encode; |
|---|
| 524 | |
|---|
| 525 | # load and parse MSWord file |
|---|
| 526 | my $word = MSWord::ExtractContent->from_file('sample.doc'); |
|---|
| 527 | # output contents |
|---|
| 528 | print encode('UTF-8', $word->contents); |
|---|
| 529 | # can retrieve header |
|---|
| 530 | my $header = $word->header( as_plaintext => 1); |
|---|
| 531 | |
|---|
| 532 | # non-OO style |
|---|
| 533 | my $footnote = extract_contents_from_msword_file('sample.doc')); |
|---|
| 534 | |
|---|
| 535 | =head1 DESCRIPTION |
|---|
| 536 | |
|---|
| 537 | MSWord::ExtractContent is contents extractor for Microsoft Word document. |
|---|
| 538 | |
|---|
| 539 | =head1 METHODS |
|---|
| 540 | |
|---|
| 541 | =over 4 |
|---|
| 542 | |
|---|
| 543 | =item CLASS-E<gt>from_file($filename) |
|---|
| 544 | |
|---|
| 545 | Parses specified MSWord document file and returns result object. |
|---|
| 546 | |
|---|
| 547 | =item CLASS-E<gt>from_buffer($buffer) |
|---|
| 548 | |
|---|
| 549 | Parses specified MSWord document data and returns result object. |
|---|
| 550 | |
|---|
| 551 | =back |
|---|
| 552 | |
|---|
| 553 | =head1 METHODS FOR RESULT OBJECT |
|---|
| 554 | |
|---|
| 555 | With following four methods, you can retrieve text contents from result |
|---|
| 556 | object generated above methods. |
|---|
| 557 | They produce text strings in Unicode string form, that means resulting text |
|---|
| 558 | may be utf-8 flagged, so you can not output such text on console directly. |
|---|
| 559 | (Recommended to use Encode::encode() with suitable encoding for output.) |
|---|
| 560 | |
|---|
| 561 | =over 4 |
|---|
| 562 | |
|---|
| 563 | =item $doc-E<gt>contents([ $options ]); |
|---|
| 564 | |
|---|
| 565 | =item $doc-E<gt>header([ $options ]); |
|---|
| 566 | |
|---|
| 567 | =item $doc-E<gt>footnote([ $options ]); |
|---|
| 568 | |
|---|
| 569 | =item $doc-E<gt>whole_contents([ $options ]); |
|---|
| 570 | |
|---|
| 571 | =back |
|---|
| 572 | |
|---|
| 573 | =head1 OPTIONS FOR CONTENTS RETRIEVAL |
|---|
| 574 | |
|---|
| 575 | =over 4 |
|---|
| 576 | |
|---|
| 577 | =item as_plaintext |
|---|
| 578 | |
|---|
| 579 | If this option is specified, contents retrieval methods described above |
|---|
| 580 | filter some of special characters (such as cell mark) from resulting contents. |
|---|
| 581 | |
|---|
| 582 | =back |
|---|
| 583 | |
|---|
| 584 | =head1 FUNCTIONAL INTERFACE |
|---|
| 585 | |
|---|
| 586 | =over 4 |
|---|
| 587 | |
|---|
| 588 | =item extract_contents_from_msword_file($filename, [ $options ]); |
|---|
| 589 | |
|---|
| 590 | =item extract_header_from_msword_file($filename, [ $options ]); |
|---|
| 591 | |
|---|
| 592 | =item extract_footnote_from_msword_file($filename, [ $options ]); |
|---|
| 593 | |
|---|
| 594 | =item extract_whole_contents_from_msword_file($filename, [ $options ]); |
|---|
| 595 | |
|---|
| 596 | =back |
|---|
| 597 | |
|---|
| 598 | Functionalities of following methods are same as above, but they parse |
|---|
| 599 | document data buffers instead of parsing files. |
|---|
| 600 | |
|---|
| 601 | =over 4 |
|---|
| 602 | |
|---|
| 603 | =item extract_contents_from_msword_data($buffer, [ $options ]); |
|---|
| 604 | |
|---|
| 605 | =item extract_header_from_msword_data($buffer, [ $options ]); |
|---|
| 606 | |
|---|
| 607 | =item extract_footnote_from_msword_data($buffer, [ $options ]); |
|---|
| 608 | |
|---|
| 609 | =item extract_whole_contents_from_msword_data($buffer, [ $options ]); |
|---|
| 610 | |
|---|
| 611 | =back |
|---|
| 612 | |
|---|
| 613 | =head1 LIMITATIONS |
|---|
| 614 | |
|---|
| 615 | Only support Microsoft Word binary document. |
|---|
| 616 | Does not support Microsoft Word XML document (.docx). |
|---|
| 617 | |
|---|
| 618 | This module does not handle PAP (PAragraph Properties) and CHP (CHaracter |
|---|
| 619 | Properties), that define paragraphs and characters style. |
|---|
| 620 | Those styling information are required to determine functionalities of |
|---|
| 621 | some of special characters (such as row mark, footnote reference, and etc). |
|---|
| 622 | |
|---|
| 623 | This module does not handle summary information stream in Word file. |
|---|
| 624 | |
|---|
| 625 | =head1 AUTHOR |
|---|
| 626 | |
|---|
| 627 | ITO Nobuaki E<lt>banb@cpan.orgE<gt> |
|---|
| 628 | |
|---|
| 629 | =head1 LICENSE |
|---|
| 630 | |
|---|
| 631 | This library is free software; you can redistribute it and/or modify |
|---|
| 632 | it under the same terms as Perl itself. |
|---|
| 633 | |
|---|
| 634 | =head1 SEE ALSO |
|---|
| 635 | |
|---|
| 636 | =cut |
|---|