| 1 | package PDIC; |
|---|
| 2 | |
|---|
| 3 | use 5.008006; |
|---|
| 4 | use strict; |
|---|
| 5 | use warnings; |
|---|
| 6 | |
|---|
| 7 | use PDIC::Version; # major_version |
|---|
| 8 | use PDIC::Util::Binary qw(bcd_to_dec); |
|---|
| 9 | use PDIC::Util::LittleEndian; # get_XXX_value |
|---|
| 10 | |
|---|
| 11 | #use utf8; |
|---|
| 12 | use Encode; |
|---|
| 13 | use Encode::BOCU1; ## optional |
|---|
| 14 | |
|---|
| 15 | require Exporter; |
|---|
| 16 | |
|---|
| 17 | our @ISA = qw(Exporter); |
|---|
| 18 | |
|---|
| 19 | # Items to export into callers namespace by default. Note: do not export |
|---|
| 20 | # names by default without a very good reason. Use EXPORT_OK instead. |
|---|
| 21 | # Do not simply export all your public functions/methods/constants. |
|---|
| 22 | |
|---|
| 23 | # This allows declaration use PDIC ':all'; |
|---|
| 24 | # If you do not need this, moving things directly into @EXPORT or @EXPORT_OK |
|---|
| 25 | # will save memory. |
|---|
| 26 | our %EXPORT_TAGS = ( 'all' => [ qw( |
|---|
| 27 | |
|---|
| 28 | ) ] ); |
|---|
| 29 | |
|---|
| 30 | our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); |
|---|
| 31 | |
|---|
| 32 | our @EXPORT = qw( |
|---|
| 33 | |
|---|
| 34 | ); |
|---|
| 35 | |
|---|
| 36 | our $VERSION = '0.01'; |
|---|
| 37 | |
|---|
| 38 | # Preloaded methods go here. |
|---|
| 39 | |
|---|
| 40 | sub new { |
|---|
| 41 | my ($class, $pdicfile) = @_; |
|---|
| 42 | my $self = { |
|---|
| 43 | FILE => $pdicfile, |
|---|
| 44 | MAJOR_VERSION => &PDIC::Version::major_version($pdicfile), |
|---|
| 45 | HEADER => undef, |
|---|
| 46 | INDEX => undef, |
|---|
| 47 | INDEX_TREE => undef, |
|---|
| 48 | |
|---|
| 49 | INDEX_ABC_TABLE => undef, |
|---|
| 50 | INDEX_CACHE => undef ## memoize |
|---|
| 51 | }; |
|---|
| 52 | # my $proto = shift; |
|---|
| 53 | # my $class = ref($proto) || $proto; |
|---|
| 54 | # my ($pdicfile) = @_; |
|---|
| 55 | # my $pdicfile = shift; |
|---|
| 56 | |
|---|
| 57 | bless $self, $class; |
|---|
| 58 | |
|---|
| 59 | # if ($pdicfile) { |
|---|
| 60 | # $self{PDICFILE} = $pdicfile; |
|---|
| 61 | # $self{HEADER} = $self->load_header; |
|---|
| 62 | # } |
|---|
| 63 | return $self; |
|---|
| 64 | } |
|---|
| 65 | |
|---|
| 66 | sub file { |
|---|
| 67 | my $self = shift; |
|---|
| 68 | return $self->{FILE}; |
|---|
| 69 | } |
|---|
| 70 | sub header { |
|---|
| 71 | my $self = shift; |
|---|
| 72 | return %{$self->{HEADER}} if $self->{HEADER}; |
|---|
| 73 | return %{$self->load_header}; |
|---|
| 74 | } |
|---|
| 75 | sub version { |
|---|
| 76 | my $self = shift; |
|---|
| 77 | my %header = $self->header; |
|---|
| 78 | |
|---|
| 79 | &bcd_to_dec($header{'version'}) / 100; |
|---|
| 80 | } |
|---|
| 81 | sub major_version { |
|---|
| 82 | my $self = shift; |
|---|
| 83 | |
|---|
| 84 | $self->{MAJOR_VERSION}; |
|---|
| 85 | } |
|---|
| 86 | sub is_bocu { |
|---|
| 87 | my $self = shift; |
|---|
| 88 | my %header = $self->header; |
|---|
| 89 | |
|---|
| 90 | ($header{'dictype'} & 0x08 && $header{'os'} & 0x20) ? 1 : 0; |
|---|
| 91 | } |
|---|
| 92 | sub load_header { |
|---|
| 93 | my $self = shift; |
|---|
| 94 | |
|---|
| 95 | # return undef unless $self->{FILE}; |
|---|
| 96 | # return $self->{HEADER} if $self->{HEADER}; |
|---|
| 97 | |
|---|
| 98 | # |
|---|
| 99 | # get_pdic_header_buf($self->{pdicfile}); |
|---|
| 100 | # my $major_version = major_version($self->file); |
|---|
| 101 | # printf("major version = %d\n", $major_version); |
|---|
| 102 | |
|---|
| 103 | my $buf; |
|---|
| 104 | open FH, '<', $self->file or die "$self->file:$!"; |
|---|
| 105 | read FH, $buf, 256; |
|---|
| 106 | close FH; |
|---|
| 107 | |
|---|
| 108 | my %values; |
|---|
| 109 | # my @ar = unpack("a100a40ssssssSsssLCCClCSSSSSLlllCC9LCC43", $buf); |
|---|
| 110 | if ($self->major_version == 4) { |
|---|
| 111 | my @ar = unpack("a100a40vvvvvvvvvvVCCCVCvvvvvVVVVCC9VCC43", $buf); |
|---|
| 112 | %values = ( |
|---|
| 113 | headername => $ar[0], |
|---|
| 114 | dictitle => $ar[1], |
|---|
| 115 | version => $ar[2], |
|---|
| 116 | word => $ar[3], |
|---|
| 117 | ljapa => $ar[4], |
|---|
| 118 | block_size => $ar[5], |
|---|
| 119 | index_block => $ar[6], |
|---|
| 120 | header_size => $ar[7], |
|---|
| 121 | index_size => $ar[8], |
|---|
| 122 | empty_block => ($ar[9] == 0xffff) ? -1 : $ar[9], |
|---|
| 123 | nindex => $ar[10], |
|---|
| 124 | nblock => $ar[11], |
|---|
| 125 | nword => $ar[12], |
|---|
| 126 | dicorder => $ar[13], |
|---|
| 127 | dictype => $ar[14], |
|---|
| 128 | attrlen => $ar[15], |
|---|
| 129 | olenumber => $ar[16], |
|---|
| 130 | os => $ar[17], |
|---|
| 131 | lid_word => $ar[18], |
|---|
| 132 | lid_japa => $ar[19], |
|---|
| 133 | lid_exp => $ar[20], |
|---|
| 134 | lid_pron => $ar[21], |
|---|
| 135 | lid_other => $ar[22], |
|---|
| 136 | extheader => $ar[23], |
|---|
| 137 | empty_block2 => ($ar[24] == 0xffffffff) ? -1 : $ar[24], |
|---|
| 138 | nindex2 => $ar[25], |
|---|
| 139 | nblock2 => $ar[26], |
|---|
| 140 | index_blkbit => $ar[27], |
|---|
| 141 | reserved => $ar[28], |
|---|
| 142 | update_count => $ar[29], |
|---|
| 143 | charcode => $ar[30], |
|---|
| 144 | dummy => $ar[31] |
|---|
| 145 | ); |
|---|
| 146 | } elsif ($self->major_version == 5) { |
|---|
| 147 | my @ar = unpack("a100a40vvvvvvvvvvVCCCCVvvvvvCCVVVVC8VC4C8C32", $buf); |
|---|
| 148 | %values = ( |
|---|
| 149 | headername => $ar[0], |
|---|
| 150 | dictitle => $ar[1], |
|---|
| 151 | version => $ar[2], |
|---|
| 152 | word => $ar[3], |
|---|
| 153 | ljapa => $ar[4], |
|---|
| 154 | block_size => $ar[5], |
|---|
| 155 | index_block => $ar[6], |
|---|
| 156 | header_size => $ar[7], |
|---|
| 157 | index_size => $ar[8], |
|---|
| 158 | empty_block => ($ar[9] == 0xffff) ? -1 : $ar[9], |
|---|
| 159 | nindex => $ar[10], |
|---|
| 160 | nblock => $ar[11], |
|---|
| 161 | nword => $ar[12], |
|---|
| 162 | dicorder => $ar[13], |
|---|
| 163 | dictype => $ar[14], |
|---|
| 164 | attrlen => $ar[15], |
|---|
| 165 | # olenumber => $ar[16], |
|---|
| 166 | # os => $ar[17], |
|---|
| 167 | os => $ar[16], |
|---|
| 168 | olenumber => $ar[17], |
|---|
| 169 | |
|---|
| 170 | lid_word => $ar[18], |
|---|
| 171 | lid_japa => $ar[19], |
|---|
| 172 | lid_exp => $ar[20], |
|---|
| 173 | lid_pron => $ar[21], |
|---|
| 174 | lid_other => $ar[22], |
|---|
| 175 | |
|---|
| 176 | # extheader => $ar[23], |
|---|
| 177 | # empty_block2 => $ar[24], |
|---|
| 178 | # nindex2 => $ar[25], |
|---|
| 179 | # nblock2 => $ar[26], |
|---|
| 180 | # index_blkbit => $ar[27], |
|---|
| 181 | index_blkbit => $ar[23], |
|---|
| 182 | dummy0 => $ar[24], |
|---|
| 183 | extheader => $ar[25], |
|---|
| 184 | empty_block2 => ($ar[26] == 0xffffffff) ? -1 : $ar[26], |
|---|
| 185 | nindex2 => $ar[27], |
|---|
| 186 | nblock2 => $ar[28], |
|---|
| 187 | reserved => $ar[29], |
|---|
| 188 | update_count => $ar[30], |
|---|
| 189 | #charcode => $ar[30], |
|---|
| 190 | dummy00 => $ar[31], |
|---|
| 191 | dicident => $ar[32], |
|---|
| 192 | dummy => $ar[33] |
|---|
| 193 | ); |
|---|
| 194 | } else { |
|---|
| 195 | %values = (); |
|---|
| 196 | } |
|---|
| 197 | # print %values; |
|---|
| 198 | $self->{HEADER} = \%values; |
|---|
| 199 | \%values; |
|---|
| 200 | } |
|---|
| 201 | |
|---|
| 202 | sub index { |
|---|
| 203 | my $self = shift; |
|---|
| 204 | return @{$self->{INDEX}} if $self->{INDEX}; |
|---|
| 205 | return @{$self->load_index}; |
|---|
| 206 | } |
|---|
| 207 | sub load_index { |
|---|
| 208 | my $self = shift; |
|---|
| 209 | my %header = $self->header; |
|---|
| 210 | |
|---|
| 211 | open FH, '<', $self->file or die "$self->file:$!"; |
|---|
| 212 | seek FH, $header{'header_size'} + $header{'extheader'}, 0; |
|---|
| 213 | |
|---|
| 214 | my $index_block = $header{'index_block'}; |
|---|
| 215 | my $nindex = $header{'nindex2'}; |
|---|
| 216 | my $index_blkbit = $header{'index_blkbit'}; |
|---|
| 217 | my $index_blkbyte = ($header{'index_blkbit'} == 1) ? 4 : 2; |
|---|
| 218 | |
|---|
| 219 | # printf("number of indices: %d\n", $nindex); |
|---|
| 220 | # printf("index blkbit: %d\n", $index_blkbyte << 3); |
|---|
| 221 | # printf("index buffer size: %d\n", $header{'index_block'} << 8); |
|---|
| 222 | |
|---|
| 223 | my $buf = ''; |
|---|
| 224 | my $ofs = 0, my $n = 0; |
|---|
| 225 | my @ar = (); |
|---|
| 226 | for (my $i=0; $i<$index_block; $i++) { |
|---|
| 227 | my $buf2; |
|---|
| 228 | read FH, $buf2, 256; |
|---|
| 229 | $buf .= $buf2; |
|---|
| 230 | $buf .= '\0' x 256 if $i == $index_block - 1; |
|---|
| 231 | while (length($buf) >= 253) { |
|---|
| 232 | my $phys; |
|---|
| 233 | if ($index_blkbit == 1) { |
|---|
| 234 | $phys = get_ulong_value($buf); |
|---|
| 235 | } else { |
|---|
| 236 | $phys = get_ushort_value($buf); |
|---|
| 237 | } |
|---|
| 238 | $buf = substr($buf, $index_blkbyte); |
|---|
| 239 | $ofs += $index_blkbyte; |
|---|
| 240 | my $entry = get_cstring($buf); #unpack("Z*", $buf); |
|---|
| 241 | |
|---|
| 242 | # $ar[$n] = [$phys,$entry,$ofs]; |
|---|
| 243 | # $ar[$n] = [$phys,$entry]; |
|---|
| 244 | $ar[$n*2] = $phys; |
|---|
| 245 | $ar[$n*2+1] = $entry; |
|---|
| 246 | $n++; |
|---|
| 247 | # printf("%d (%d)(%d) %s\n", $n, $ofs, $phys, $entry); |
|---|
| 248 | |
|---|
| 249 | my $bytes = length($entry) + 1; |
|---|
| 250 | $buf = substr($buf, $bytes); |
|---|
| 251 | $ofs += $bytes; |
|---|
| 252 | |
|---|
| 253 | last if $n == $nindex; |
|---|
| 254 | } |
|---|
| 255 | last if $n == $nindex; |
|---|
| 256 | } |
|---|
| 257 | close FH; |
|---|
| 258 | |
|---|
| 259 | $self->{INDEX} = \@ar; |
|---|
| 260 | \@ar; |
|---|
| 261 | } |
|---|
| 262 | sub create_index_abc_table { |
|---|
| 263 | my $self = shift; |
|---|
| 264 | my @ar = (); |
|---|
| 265 | |
|---|
| 266 | my @index = $self->index; |
|---|
| 267 | # printf("index : %d .. %d\n", 0, $#index); |
|---|
| 268 | # printf(" (%s %s)\n", $index[0], $index[1]); |
|---|
| 269 | printf(" %d (%d:'%s') .. %d (%d:'%s')\n", |
|---|
| 270 | 0, $index[0], $index[1], |
|---|
| 271 | $#index-1, $index[$#index-1], $index[$#index]); |
|---|
| 272 | |
|---|
| 273 | my $first_char = ord($index[1]); # ord(first_word) |
|---|
| 274 | my $last_char = ord($index[$#index]); # ord(last_word) |
|---|
| 275 | for (my $i=0; $i<$first_char; $i++) { |
|---|
| 276 | $ar[$i*2] = $ar[$i*2+1] = -1; |
|---|
| 277 | } |
|---|
| 278 | for (my $i=$last_char+1; $i<256; $i++) { |
|---|
| 279 | $ar[$i*2] = $ar[$i*2+1] = -1; |
|---|
| 280 | } |
|---|
| 281 | for (my $i=$first_char; $i<=$last_char; $i++) { |
|---|
| 282 | my $c = chr($i); |
|---|
| 283 | my $ix = $self->index_search_peer($c); |
|---|
| 284 | my $c_ff = chr($i) . chr(255); |
|---|
| 285 | my $ix_ff = $self->index_search_peer($c_ff); |
|---|
| 286 | printf("%02x --> (%d .. %d)\n", $i, $ix, $ix_ff); |
|---|
| 287 | } |
|---|
| 288 | |
|---|
| 289 | # for (my $i=0; $i<256; $i++) { |
|---|
| 290 | # my $c = chr($i); |
|---|
| 291 | # my $ix = $self->index_search_peer($c); |
|---|
| 292 | # my $c_ff = chr($i) . chr(255); |
|---|
| 293 | # my $ix_ff = $self->index_search_peer($c_ff); |
|---|
| 294 | # printf("%02x --> (%d .. %d)\n", $i, $ix, $ix_ff); |
|---|
| 295 | # } |
|---|
| 296 | $self->{INDEX_ABC_TABLE} = \@ar; |
|---|
| 297 | \@ar; |
|---|
| 298 | } |
|---|
| 299 | |
|---|
| 300 | sub index_search_peer { |
|---|
| 301 | my ($self,$needle) = @_; |
|---|
| 302 | my @index = $self->index; |
|---|
| 303 | |
|---|
| 304 | for (my $i=$#index; $i>0; $i-=2) { |
|---|
| 305 | my $entry = $index[$i]; |
|---|
| 306 | next if $needle lt $entry; #skip until |
|---|
| 307 | # return [$index[$i-1],$index[$i]] if $entry le $needle; |
|---|
| 308 | # return $i if $entry le $needle; |
|---|
| 309 | return $i; |
|---|
| 310 | # return $index[$i-1] if $entry le $needle; |
|---|
| 311 | } |
|---|
| 312 | return -1; |
|---|
| 313 | } |
|---|
| 314 | sub index_search { |
|---|
| 315 | my ($self,$ref_cond) = @_; |
|---|
| 316 | my @index = $self->index; |
|---|
| 317 | |
|---|
| 318 | my %cond = %$ref_cond; |
|---|
| 319 | # while ((my $key, my $value) = each(%$ref_cond)) { |
|---|
| 320 | # printf("%s => %s\n", $key, $value); |
|---|
| 321 | # } |
|---|
| 322 | # print $cond{upper_limit} . "\n"; |
|---|
| 323 | |
|---|
| 324 | my $lower = index_search_peer($self,$cond{lower_limit}); |
|---|
| 325 | return () if $lower < 0; |
|---|
| 326 | return ( $index[$lower-1] ) unless $cond{upper_limit}; |
|---|
| 327 | |
|---|
| 328 | # my $needle_len = length($needle); |
|---|
| 329 | # my $upper_limit = substr($needle,0,$needle_len-1) . chr(ord(substr($needle,$needle_len-1,1))+1); |
|---|
| 330 | # my $upper_limit = ; |
|---|
| 331 | # print "$upper_limit\n"; |
|---|
| 332 | |
|---|
| 333 | my $upper = index_search_peer($self,$cond{upper_limit}); |
|---|
| 334 | my @ar = (); |
|---|
| 335 | for (my $i=$lower; $i<=$upper; $i+=2) { |
|---|
| 336 | push(@ar,$index[$i-1]); |
|---|
| 337 | } |
|---|
| 338 | return @ar; |
|---|
| 339 | } |
|---|
| 340 | |
|---|
| 341 | sub get_datablock_addr { |
|---|
| 342 | my ($self,$phys) = @_; |
|---|
| 343 | |
|---|
| 344 | return undef if $phys < 0; |
|---|
| 345 | |
|---|
| 346 | my %header = $self->header; |
|---|
| 347 | my $addr = $header{'header_size'} + $header{'extheader'} + |
|---|
| 348 | ($header{'index_block'} << 8) + ($phys << 8); |
|---|
| 349 | |
|---|
| 350 | # printf("%x\n", $addr); |
|---|
| 351 | $addr; |
|---|
| 352 | } |
|---|
| 353 | |
|---|
| 354 | sub fields_in_datablock { |
|---|
| 355 | my ($self,$phys,$ref_cond,$ref_result) = @_; |
|---|
| 356 | |
|---|
| 357 | my $is_aligned = ($self->major_version == 5) ? 1 : 0; |
|---|
| 358 | |
|---|
| 359 | my %cond; |
|---|
| 360 | if (defined($ref_cond)) { |
|---|
| 361 | %cond = %$ref_cond; |
|---|
| 362 | } else { |
|---|
| 363 | %cond = (); |
|---|
| 364 | } |
|---|
| 365 | |
|---|
| 366 | # my @result = @$ref_result; |
|---|
| 367 | my $result_count = 0; |
|---|
| 368 | |
|---|
| 369 | # printf("%d => %x\n", $phys, $self->get_datablock_addr($phys)); ## |
|---|
| 370 | |
|---|
| 371 | open FH, '<', $self->file or die "$self->file:$!"; |
|---|
| 372 | seek FH, $self->get_datablock_addr($phys), 0; |
|---|
| 373 | |
|---|
| 374 | my $buf; |
|---|
| 375 | read FH, $buf, 2; |
|---|
| 376 | my $used_blocks = get_ushort_value($buf,0); |
|---|
| 377 | |
|---|
| 378 | my $field_length_byte; |
|---|
| 379 | if ($used_blocks & 0x8000) { |
|---|
| 380 | $field_length_byte = 4; |
|---|
| 381 | $used_blocks &= 0x7fff; |
|---|
| 382 | } else { |
|---|
| 383 | $field_length_byte = 2; |
|---|
| 384 | } |
|---|
| 385 | # printf("block length = %d, field length byte = %d\n", $used_blocks, $field_length_byte); |
|---|
| 386 | if ($used_blocks == 0) { |
|---|
| 387 | print "Detected an emptyblock. "; |
|---|
| 388 | |
|---|
| 389 | read FH, $buf, 4; |
|---|
| 390 | my $next_emptyblock = get_ulong_value($buf); |
|---|
| 391 | # $buf = substr($buf,4); |
|---|
| 392 | printf("Next emptyblock at %x\n", $next_emptyblock); |
|---|
| 393 | close FH; |
|---|
| 394 | return 0; |
|---|
| 395 | } |
|---|
| 396 | |
|---|
| 397 | my $datablock_length = ($used_blocks << 8) - 2; |
|---|
| 398 | my $rest = $datablock_length; |
|---|
| 399 | # $block_length--; |
|---|
| 400 | # my $buf_rest = 254; |
|---|
| 401 | |
|---|
| 402 | # printf("%d + %d\n", $buf_rest, $block_length << 8); |
|---|
| 403 | my $entry_base = ''; |
|---|
| 404 | while ($rest > 0) { |
|---|
| 405 | # printf("[%d / %d]\n", $rest, $datablock_length); |
|---|
| 406 | # read a chunk |
|---|
| 407 | # read FH, $buf, $field_length_byte + 1; |
|---|
| 408 | if ($is_aligned) { |
|---|
| 409 | read FH, $buf, $field_length_byte + 2; |
|---|
| 410 | } else { |
|---|
| 411 | read FH, $buf, $field_length_byte + 1; |
|---|
| 412 | } |
|---|
| 413 | my $field_length = get_unsigned_value($buf,0,$field_length_byte); |
|---|
| 414 | last if $field_length == 0; |
|---|
| 415 | |
|---|
| 416 | my $ofs = $field_length_byte; |
|---|
| 417 | my $compressed_length = get_uchar_value($buf,$ofs++); |
|---|
| 418 | my $entry_attr; |
|---|
| 419 | if ($is_aligned) { |
|---|
| 420 | $entry_attr = get_uchar_value($buf,$ofs++); ## 5 |
|---|
| 421 | last unless $entry_attr & 0x80; |
|---|
| 422 | $entry_attr &= 0x7f; |
|---|
| 423 | } |
|---|
| 424 | # $rest -= $field_length_byte + 1; |
|---|
| 425 | $rest -= $ofs; |
|---|
| 426 | # printf("C-L:%d, attr=%x \n", $compressed_length, $entry_attr); |
|---|
| 427 | |
|---|
| 428 | # my $entry = unpack("Z*", $buf); |
|---|
| 429 | read FH, $buf, $field_length; |
|---|
| 430 | $ofs = 0; |
|---|
| 431 | my $compressed_entry = get_cstring($buf, $ofs); |
|---|
| 432 | $ofs += length($compressed_entry) + 1; |
|---|
| 433 | my $entry = substr($entry_base,0,$compressed_length) . $compressed_entry; |
|---|
| 434 | $entry_base = $entry; |
|---|
| 435 | unless ($is_aligned) { |
|---|
| 436 | $entry_attr = get_uchar_value($buf,$ofs++); |
|---|
| 437 | last unless $entry_attr & 0x80; |
|---|
| 438 | $entry_attr &= 0x7f; |
|---|
| 439 | } |
|---|
| 440 | |
|---|
| 441 | my $trans; |
|---|
| 442 | my %ext_contents = (); |
|---|
| 443 | if ($entry_attr & 0x10) { |
|---|
| 444 | # extended |
|---|
| 445 | $trans = get_cstring($buf,$ofs); |
|---|
| 446 | $ofs += length($trans) + 1; |
|---|
| 447 | |
|---|
| 448 | while (1) { |
|---|
| 449 | my $ext_attr = get_uchar_value($buf,$ofs++); |
|---|
| 450 | last if $ext_attr == 0x80; |
|---|
| 451 | |
|---|
| 452 | my $flags = ($ext_attr >> 8) & 0x07; |
|---|
| 453 | my $ext_content; |
|---|
| 454 | if ($flags == 0) { |
|---|
| 455 | # no flags |
|---|
| 456 | $ext_content = get_cstring($buf,$ofs); |
|---|
| 457 | $ofs += length($ext_content) + 1; |
|---|
| 458 | } elsif ($flags == 1) { |
|---|
| 459 | # BINARY_DATA |
|---|
| 460 | my $size = get_unsigned_value($buf,$ofs,$field_length_byte); |
|---|
| 461 | $ofs += $field_length_byte; |
|---|
| 462 | $ext_content = substr($buf,$ofs,$size); |
|---|
| 463 | $ofs += $size; |
|---|
| 464 | } elsif ($flags == 5) { |
|---|
| 465 | # COMPRESSED_DATA | BINARY_DATA |
|---|
| 466 | my $size = get_unsigned_value($buf,$ofs,$field_length_byte); |
|---|
| 467 | $ofs += $field_length_byte; |
|---|
| 468 | my $rawdata_length = get_uchar_value($buf,$ofs); |
|---|
| 469 | $ofs++; |
|---|
| 470 | my $rawdata = substr($buf,$ofs,$rawdata_length); |
|---|
| 471 | $ofs += $rawdata_length; |
|---|
| 472 | my $compressed_data = substr($buf,$ofs,$size-1-$rawdata_length); |
|---|
| 473 | |
|---|
| 474 | $ext_content = $rawdata; ## |
|---|
| 475 | } |
|---|
| 476 | |
|---|
| 477 | if ($ext_attr == 1) { |
|---|
| 478 | $ext_contents{example} = $ext_content; |
|---|
| 479 | } elsif ($ext_attr == 2) { |
|---|
| 480 | $ext_contents{pron} = $ext_content; |
|---|
| 481 | } elsif ($ext_attr == 4) { |
|---|
| 482 | $ext_contents{link} = $ext_content; |
|---|
| 483 | } else { |
|---|
| 484 | $ext_contents{$ext_attr} = $ext_content; |
|---|
| 485 | } |
|---|
| 486 | # push(@ext_contents, \($ext_attr,$ext_content)); |
|---|
| 487 | } |
|---|
| 488 | } else { |
|---|
| 489 | # normal |
|---|
| 490 | $trans = substr($buf,$ofs,$field_length - $ofs); |
|---|
| 491 | } |
|---|
| 492 | |
|---|
| 493 | # printf("field length : %d bytes\n", $field_length); |
|---|
| 494 | # printf("compressed_length : %d bytes\n", $compressed_length); |
|---|
| 495 | # printf("entry : %s\n", $entry); |
|---|
| 496 | # printf("entry attr : %x\n", $entry_attr); |
|---|
| 497 | # printf("trans : %s\n", $trans); |
|---|
| 498 | # printf("ext_contents : %d\n", $#ext_contents); |
|---|
| 499 | # printf("%s /// %s\n", $entry, $trans); |
|---|
| 500 | if ((%cond && $cond{lower_limit} le $entry && $entry le $cond{upper_limit}) |
|---|
| 501 | || (! %cond)) { |
|---|
| 502 | my %field = ( entry => $entry, |
|---|
| 503 | entry_attr => $entry_attr, |
|---|
| 504 | trans => $trans, |
|---|
| 505 | ext_contents => \%ext_contents |
|---|
| 506 | ); |
|---|
| 507 | push(@$ref_result, \%field); |
|---|
| 508 | $result_count++; |
|---|
| 509 | } |
|---|
| 510 | |
|---|
| 511 | # $buf = substr($buf, $field_length); |
|---|
| 512 | $rest -= $field_length; |
|---|
| 513 | # $ofs = 0; |
|---|
| 514 | } |
|---|
| 515 | close FH; |
|---|
| 516 | |
|---|
| 517 | $result_count; |
|---|
| 518 | } |
|---|
| 519 | |
|---|
| 520 | # |
|---|
| 521 | # render_field(\$field, $output_encoding [,$output_format]) |
|---|
| 522 | # |
|---|
| 523 | sub render_field { |
|---|
| 524 | my ($self,$ref_field,$output_encoding,$output_format) = @_; |
|---|
| 525 | # my $is_bocu = $self->is_bocu; |
|---|
| 526 | $output_format = '' unless $output_format; |
|---|
| 527 | |
|---|
| 528 | my %field = %{$ref_field}; |
|---|
| 529 | |
|---|
| 530 | my $entry = $field{entry}; |
|---|
| 531 | my $trans = $field{trans}; |
|---|
| 532 | |
|---|
| 533 | my %ext_contents = %{$field{ext_contents}}; |
|---|
| 534 | my $pron = ''; |
|---|
| 535 | my $example = ''; |
|---|
| 536 | if (keys(%ext_contents) > 0) { |
|---|
| 537 | $pron = $ext_contents{'pron'}; |
|---|
| 538 | $example = $ext_contents{'example'}; |
|---|
| 539 | } |
|---|
| 540 | |
|---|
| 541 | my $dict_encoding = ($self->is_bocu)? 'bocu1' : 'shiftjis'; |
|---|
| 542 | if ($dict_encoding ne $output_encoding) { |
|---|
| 543 | Encode::from_to($entry, $dict_encoding, $output_encoding); |
|---|
| 544 | Encode::from_to($trans, $dict_encoding, $output_encoding); |
|---|
| 545 | Encode::from_to($example, $dict_encoding, $output_encoding) if $example; |
|---|
| 546 | } |
|---|
| 547 | |
|---|
| 548 | if ($output_format eq 'PDIC-TEXT') { |
|---|
| 549 | $trans =~ s/\015\n/ \\ /g; |
|---|
| 550 | |
|---|
| 551 | printf("%s\n", $entry); |
|---|
| 552 | if ($example) { |
|---|
| 553 | printf("%s / %s\n", $trans, $example); |
|---|
| 554 | } else { |
|---|
| 555 | printf("%s\n", $trans); |
|---|
| 556 | } |
|---|
| 557 | } elsif ($output_format eq 'PDIC-1LINE') { |
|---|
| 558 | $trans =~ s/\015\n/ \\ /g; |
|---|
| 559 | if ($example) { |
|---|
| 560 | printf("%s /// %s / %s\n", $entry, $trans, $example); |
|---|
| 561 | } else { |
|---|
| 562 | printf("%s /// %s\n", $entry, $trans); |
|---|
| 563 | } |
|---|
| 564 | } elsif ($output_format eq 'TAB') { |
|---|
| 565 | $trans =~ s/\015\n/ \\ /g; |
|---|
| 566 | if ($example) { |
|---|
| 567 | printf("%s\t%s\t%s\n", $entry, $trans, $example); |
|---|
| 568 | } else { |
|---|
| 569 | printf("%s\t%s\n", $entry, $trans); |
|---|
| 570 | } |
|---|
| 571 | } elsif ($output_format eq 'CSV') { |
|---|
| 572 | # $trans =~ s/\015\n/ \\ /g; |
|---|
| 573 | $trans =~ s/"/\\"/g; |
|---|
| 574 | if ($example) { |
|---|
| 575 | printf("\"%s\",\"%s\",\"%s\"\n", $entry, $trans, $example); |
|---|
| 576 | } else { |
|---|
| 577 | printf("\"%s\",\"%s\"\n", $entry, $trans); |
|---|
| 578 | } |
|---|
| 579 | } else { |
|---|
| 580 | if ($pron) { |
|---|
| 581 | printf("%s [%s]\n", $entry, $pron); |
|---|
| 582 | } else { |
|---|
| 583 | printf("%s\n", $entry); |
|---|
| 584 | } |
|---|
| 585 | $trans =~ s/\015\n/\n\t/g; |
|---|
| 586 | printf("\t%s\n", $trans); |
|---|
| 587 | printf("\t%s\n", $example) if $example; |
|---|
| 588 | } |
|---|
| 589 | } |
|---|
| 590 | |
|---|
| 591 | ## |
|---|
| 592 | ## dump($pdic, $output_encoding, $output_format) |
|---|
| 593 | ## |
|---|
| 594 | sub dump { |
|---|
| 595 | my ($pdic,$output_encoding,$output_format) = @_; |
|---|
| 596 | my $cnt = 0; |
|---|
| 597 | |
|---|
| 598 | my @index = $pdic->index; |
|---|
| 599 | # $pdic->dump_index( @index ); |
|---|
| 600 | |
|---|
| 601 | for (my $i=0; $i<=$#index; $i+=2) { |
|---|
| 602 | my $phys = $index[$i]; |
|---|
| 603 | dump_datablock( $pdic, $phys, $output_encoding, $output_format ); |
|---|
| 604 | } |
|---|
| 605 | } |
|---|
| 606 | ## |
|---|
| 607 | ## dump_header($pdic) |
|---|
| 608 | ## |
|---|
| 609 | sub dump_header { |
|---|
| 610 | my $pdic = shift; |
|---|
| 611 | |
|---|
| 612 | my %header = $pdic->header; |
|---|
| 613 | while ((my $key, my $value) = each(%header)) { |
|---|
| 614 | printf("%s => %s\n", $key, $value); |
|---|
| 615 | } |
|---|
| 616 | } |
|---|
| 617 | ## |
|---|
| 618 | ## dump_index($pdic, $output_encoding) |
|---|
| 619 | ## |
|---|
| 620 | sub dump_index { |
|---|
| 621 | my ($pdic,$output_encoding) = @_; |
|---|
| 622 | unless ($output_encoding) { |
|---|
| 623 | $output_encoding = $pdic->is_bocu ? 'utf8' : 'shiftjis'; |
|---|
| 624 | } |
|---|
| 625 | |
|---|
| 626 | my @index = $pdic->index; |
|---|
| 627 | |
|---|
| 628 | for (my $i=0; $i<=$#index; $i+=2) { |
|---|
| 629 | my $entry = $index[$i+1]; |
|---|
| 630 | |
|---|
| 631 | my $dict_encoding = ($pdic->is_bocu)? 'bocu' : 'shiftjis'; |
|---|
| 632 | if ($dict_encoding ne $output_encoding) { |
|---|
| 633 | Encode::from_to($entry, $dict_encoding, $output_encoding); |
|---|
| 634 | } |
|---|
| 635 | printf("- phys=%d entry=\"%s\"\n", $index[$i], $entry); |
|---|
| 636 | } |
|---|
| 637 | } |
|---|
| 638 | ## |
|---|
| 639 | ## dump_datablock($pdic, $phys, $output_encoding, $output_format) |
|---|
| 640 | ## |
|---|
| 641 | sub dump_datablock { |
|---|
| 642 | my ($pdic,$phys,$output_encoding,$output_format) = @_; |
|---|
| 643 | |
|---|
| 644 | my @result = (); |
|---|
| 645 | my $cnt = 0; |
|---|
| 646 | |
|---|
| 647 | $pdic->fields_in_datablock( $phys, undef, \@result ); |
|---|
| 648 | $cnt += $#result + 1; |
|---|
| 649 | |
|---|
| 650 | foreach my $ref_field (@result) { |
|---|
| 651 | $pdic->render_field($ref_field,$output_encoding,$output_format); |
|---|
| 652 | } |
|---|
| 653 | } |
|---|
| 654 | |
|---|
| 655 | 1; |
|---|
| 656 | __END__ |
|---|
| 657 | # Below is stub documentation for your module. You'd better edit it! |
|---|
| 658 | |
|---|
| 659 | =head1 NAME |
|---|
| 660 | |
|---|
| 661 | PDIC - Perl extension for processing PDIC-format dictionary files |
|---|
| 662 | |
|---|
| 663 | =head1 SYNOPSIS |
|---|
| 664 | |
|---|
| 665 | use PDIC; |
|---|
| 666 | blah blah blah |
|---|
| 667 | |
|---|
| 668 | =head1 DESCRIPTION |
|---|
| 669 | |
|---|
| 670 | Stub documentation for PDIC, created by h2xs. It looks like the |
|---|
| 671 | author of the extension was negligent enough to leave the stub |
|---|
| 672 | unedited. |
|---|
| 673 | |
|---|
| 674 | Blah blah blah. |
|---|
| 675 | |
|---|
| 676 | =head2 EXPORT |
|---|
| 677 | |
|---|
| 678 | None by default. |
|---|
| 679 | |
|---|
| 680 | |
|---|
| 681 | =head1 SEE ALSO |
|---|
| 682 | |
|---|
| 683 | Mention other useful documentation such as the documentation of |
|---|
| 684 | related modules or operating system documentation (such as man pages |
|---|
| 685 | in UNIX), or any relevant external documentation such as RFCs or |
|---|
| 686 | standards. |
|---|
| 687 | |
|---|
| 688 | If you have a mailing list set up for your module, mention it here. |
|---|
| 689 | |
|---|
| 690 | If you have a web site set up for your module, mention it here. |
|---|
| 691 | |
|---|
| 692 | =head1 AUTHOR |
|---|
| 693 | |
|---|
| 694 | Naoya Tozuka E<lt>naoyat@naochan.comE<gt> |
|---|
| 695 | |
|---|
| 696 | =head1 COPYRIGHT AND LICENSE |
|---|
| 697 | |
|---|
| 698 | Copyright (C) 2006 by Naoya Tozuka.xs All Rights Reserved. |
|---|
| 699 | |
|---|
| 700 | This library is free software; you can redistribute it and/or modify |
|---|
| 701 | it under the same terms as Perl itself, either Perl version 5.8.6 or, |
|---|
| 702 | at your option, any later version of Perl 5 you may have available. |
|---|
| 703 | |
|---|
| 704 | |
|---|
| 705 | =cut |
|---|