Changeset 2743

Show
Ignore:
Timestamp:
12/07/07 11:07:48 (7 years ago)
Author:
naoya_t
Message:

r2730@localhost: naochan | 2007-12-07 10:22:04 +0900
optimized version

Location:
lang/perl/Encode-BOCU1/trunk
Files:
4 modified

Legend:

Unmodified
Added
Removed
  • lang/perl/Encode-BOCU1/trunk/Changes

    r2741 r2743  
    11Revision history for Perl extension Encode::BOCU1. 
    22 
    3 0.01  Mon Jul  3 04:30:00 2006 
     30.01  Mon Jun 29 12:00:00 2006 
    44        - original version 
    55 
     
    77        - optimized version 
    88 
     9 
  • lang/perl/Encode-BOCU1/trunk/blib/lib/Encode/BOCU1.pm

    r2722 r2743  
    11package Encode::BOCU1; 
     2 
     3use 5.008; 
    24use strict; 
     5use warnings; 
     6use Carp; 
     7 
    38use base qw(Encode::Encoding); 
    49 
    5 our $VERSION = '0.01'; 
     10our $VERSION = '0.02'; 
    611 
    712__PACKAGE__->Define('bocu1'); 
    813 
    914use Encode::Alias; 
    10 define_alias( qr/^bocu.1$/i => '"bocu1"'); # BOCU-1, Bocu_1, bocu.1, ... 
     15define_alias( qr/^bocu.1$/i => '"bocu1"'); 
    1116define_alias( qr/^bocu$/i => '"bocu1"'); 
     17 
    1218 
    1319# 
     
    1723    my ($obj, $str, $check) = @_; 
    1824    my $octet = utf8_to_bocu1($str); 
    19     $_[1] = '' if $check; # $this is what in-place edit means 
     25 
     26    $_[1] = '' if $check; 
    2027    return $octet; 
    2128} 
    22 sub decode ($$;$) { 
     29sub decode($$;$) { 
    2330    my ($obj, $octet, $check) = @_; 
    2431    my $str = bocu1_to_utf8($octet); 
     32 
    2533    $_[1] = '' if $check; 
    2634    return $str; 
     
    2836 
    2937# 
    30 # Subroutines 
    31 # 
    32 # based on the sample C code available on http://www.unicode.org/notes/tn6/ 
    33 # US Patent 6737994 "Binary-Ordered Compression For Unicode" by IBM 
     38# subroutines 
    3439# 
    3540my @bocu1_trail_to_byte = ( 
    36 #   0     1     2     3     4     5     6     7 
    37     0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11, 
    38 #   8     9     a     b     c     d     e     f 
    39     0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 
    40 #   10    11    12    13 
    41     0x1c, 0x1d, 0x1e, 0x1f ); 
     41#   0 - 19 (0x0 - 0x13) 
     42          0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 
     43    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,             0x1c, 0x1d, 0x1e, 0x1f, 
     44#   20 - 242 (0x14 - 0xf2) 
     45          0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 
     46    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 
     47    0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 
     48    0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, 
     49    0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 
     50    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 
     51    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 
     52    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 
     53    0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 
     54    0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 
     55    0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 
     56    0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 
     57    0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 
     58    0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff ); 
     59 
    4260my @bocu1_byte_to_trail = ( 
    43 #   0     1     2     3     4     5     6     7 
    44     -1,   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1, 
    45 #   8     9     a     b     c     d     e     f 
    46     -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1, 
    47 #   10    11    12    13    14    15    16    17 
    48     0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 
    49 #   18    19    1a    1b    1c    1d    1e    1f 
    50     0x0e, 0x0f, -1,   -1,   0x10, 0x11, 0x12, 0x13, 
    51 #   20 
    52     -1 ); 
    53  
    54 # Compute the next "previous" value for differencing from the current code point. 
    55 # 
    56 # @param $c : current code point, 0..0x10ffff 
    57 # @return "previous code point" state value 
    58 sub bocu1_prev { 
    59     my $c = shift; 
    60  
    61     if (0x3040 <= $c && $c <= 0x309f) { 
    62         # Hiragana is not 128-aligned 
    63         0x3070; 
    64     } elsif (0x4e00 <= $c && $c <= 0x9fa5) { 
    65         # CJI Unihan 
    66         0x7711; # 0x4e00 - -10513 
    67     } elsif (0xac00 <= $c && $c <= 0xd7a3) { 
    68         # Korean Hangul 
    69         0xc1d1; # (ac00 + d7a3)/2 
    70     } else { 
    71         # mostly small scripts 
    72         ($c & ~0x7f) + 0x40; 
    73     } 
    74 } 
    75  
    76 # Encode a difference -0x10ffff..0x10ffff in 1..4 bytes and return a BOCU-1 string 
    77 # 
    78 # The encoding favors small absolute differences with short encodings 
    79 # to compress runs of same-script characters. 
    80  
    81 # @param $diff : difference value -0x10ffff..0x10ffff 
    82 # @return $bocu1str : BOCU-1 string 
    83 sub pack_diff { 
    84     my $diff = shift; ## -1114111 (=-0x10ffff) .. 1114111 (=0x10ffff) 
    85     my ($lead,$count); 
    86  
    87     if ($diff >= -64) { 
    88         # mostly positive differences, and single-byte negative ones 
    89         if ($diff <= 63) { # -64 .. 63 
    90             # single byte 
    91             return chr(0x90 + $diff); # 0x50 .. 0xcf 
    92         } elsif ($diff <= 10512) { # 64 .. 10512 
    93             # two bytes 
    94             $diff -= 64; # 0 .. 10448 (= 43*243-1) 
    95             $lead = 0xd0; # 0xd0 .. 0xfa 
    96             $count = 1; 
    97         } elsif ($diff <= 187659) { # 10513 .. 187659 
    98             # three bytes 
    99             $diff -= 10513; # 0 .. 177146 (= 3*243*243-1) 
    100             $lead = 0xfb; # 0xfb .. 0xfd 
    101             $count = 2; 
    102         } else { # if ($diff <= 14536566) { # 187660 .. (14536566) 
    103 #            # four bytes 
    104 #            $diff -= 187660; # 0 .. 14348906 (1*243*243*243-1) 
    105 #            $lead = 0xfe; # 0xfe 
    106 #            $count = 3; 
     61#   0x00 - 0x20 
     62    -1,   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1, 
     63    0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, -1,   -1,   0x10, 0x11, 0x12, 0x13, 
     64    -1, 
     65#   0x21 - 0xff 
     66          0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 
     67    0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 
     68    0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41, 0x42, 
     69    0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 
     70    0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62, 
     71    0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 
     72    0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 
     73    0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 
     74    0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 
     75    0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 
     76    0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 
     77    0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 
     78    0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 
     79    0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2 ); 
     80 
     81sub bocu1_to_utf8 { 
     82    my $bocu1str = shift; 
     83    my @chars = unpack("C*", $bocu1str); 
     84 
     85    my $pc = 0x40; 
     86    my @codepoints; 
     87    for (my $i=0; $i<=$#chars; $i++) { 
     88        my $lead = $chars[$i]; 
     89        my $cp = 0; 
     90        my $diff = 0; 
     91        if ($lead <= 0x20) { 
     92            $cp = $lead; 
     93        } elsif ($lead == 0x21) { # 21 t1 t2 t3 
     94            my $t1 = $bocu1_byte_to_trail[$chars[++$i]]; 
     95            my $t2 = $bocu1_byte_to_trail[$chars[++$i]]; 
     96            my $t3 = $bocu1_byte_to_trail[$chars[++$i]]; 
     97            croak "illegal trail char" if $t1 < 0 || $t2 < 0 || $t3 < 0; 
     98            $diff = 14161247 + $t1 * 59049 + $t2 * 243 + $t3 
     99        } elsif ($lead < 0x25) { # [22-24] t1 t2 
     100            my $t1 = $bocu1_byte_to_trail[$chars[++$i]]; 
     101            my $t2 = $bocu1_byte_to_trail[$chars[++$i]]; 
     102            croak "illegal trail char" if $t1 < 0 || $t2 < 0; 
     103            $diff = -2195326 + $lead * 59049 + $t1 * 243 + $t2; 
     104        } elsif ($lead < 0x50) { # [25-4F] t1 
     105            my $t1 = $bocu1_byte_to_trail[$chars[++$i]]; 
     106            croak "illegal trail char" if $t1 < 0; 
     107            $diff = -19504 + $lead * 243 + $t1; 
     108        } elsif ($lead < 0xd0) { # [50-CF] 
     109            $diff = $lead - 0x90; 
     110        } elsif ($lead < 0xfb) { # [D0-FA] t1 
     111            my $t1 = $bocu1_byte_to_trail[$chars[++$i]]; 
     112            croak "illegal trail char" if $t1 < 0; 
     113            $diff = -50480 + $lead * 243 + $t1; 
     114        } elsif ($lead < 0xfe) { # [FB-FD] t1 t2 
     115            my $t1 = $bocu1_byte_to_trail[$chars[++$i]]; 
     116            my $t2 = $bocu1_byte_to_trail[$chars[++$i]]; 
     117            croak "illegal trail char" if $t1 < 0 || $t2 < 0; 
     118            $diff = -14810786 + $lead * 59049 + $t1 * 243 + $t2; 
     119        } elsif ($lead == 0xfe) { # FE t1 t2 t3 
     120            my $t1 = $bocu1_byte_to_trail[$chars[++$i]]; 
     121            my $t2 = $bocu1_byte_to_trail[$chars[++$i]]; 
     122            my $t3 = $bocu1_byte_to_trail[$chars[++$i]]; 
     123            croak "illegal trail char" if $t1 < 0 || $t2 < 0 || $t3 < 0; 
     124            $diff = 187660 + $t1 * 59049 + $t2 * 243 + $t3; 
     125        } elsif ($lead == 0xff) { 
     126            ## reset 
     127            $cp = 0; 
     128            $diff = 0; 
    107129        } 
    108     } else { # $diff < -64 
    109         # two- and four-byte negative differences 
    110         if ($diff >= -10513) { # -10513 .. -65 
    111             # two bytes 
    112             $diff -= -64; # -43*243 .. -1 
    113             $lead = 0x50; # 0x25 .. 0x4f 
    114             $count = 1; 
    115         } elsif ($diff >= -187660) { # -187660 .. -10514 
    116             # three bytes 
    117             $diff -= -10513; # -3*243*243 .. -1 
    118             $lead = 0x25; # 0x22 .. 0x24 
    119             $count = 2; 
    120         } else { # if ($diff >= -14536567) { # (-14536567) .. -187661 
    121             # four bytes 
    122             $diff -= -187660; # -1*243*243*243 .. -1 
    123             $lead = 0x22; # 0x21 
    124             $count = 3; 
     130 
     131        # codepoint, next pc 
     132        if ($lead <= 0x20) { 
     133            $pc = 0x40 if ($lead < 0x20); 
     134            push(@codepoints,$lead); 
     135        } elsif ($lead < 0xff) { 
     136            $cp = $pc + $diff; 
     137            $cp = 0 if $cp < 0; 
     138            push(@codepoints,$cp); 
     139            if ($cp < 0x20) { 
     140                $pc = 0x40; 
     141            } elsif ($cp == 0x20) { 
     142                # keep pc 
     143            } elsif (0x3040 <= $cp && $cp <= 0x309f) { 
     144                $pc = 0x3070; 
     145            } elsif (0x4e00 <= $cp && $cp <= 0x9fa5) { 
     146                $pc = 0x7711; 
     147            } elsif (0xac00 <= $cp && $cp <= 0xd7a3) { 
     148                $pc = 0xc1d1; 
     149            } else { 
     150                $pc = ($cp & ~0x7f) + 0x40; 
     151            } 
     152        } else { # 0xff : reset 
     153            $pc = 0x40; 
    125154        } 
    126155    } 
    127156 
    128     # calculate trail bytes 
    129     my $bocu1_str = ''; 
    130     for (my $i=$count; $i>0; $i--) { 
    131         my $trail = $diff % 243; 
    132         $diff = ($diff - $trail) / 243; 
    133          
    134         my $byte = $trail >= 20 ? $trail + 13 : $bocu1_trail_to_byte[$trail]; 
    135         $bocu1_str = chr($byte) . $bocu1_str; 
    136     } 
    137     chr($lead + $diff) . $bocu1_str; 
    138 } 
    139  
    140 # 
    141 # BOCU-1 encoder function. 
    142 # 
    143 # @param \$prev : reference to the integer that holds 
    144 #        the "previous code point" state; 
    145 #        the initial value should be 0 which 
    146 #        encode_bocu1() will set to the actual BOCU-1 initial state value 
    147 # @param $c : the code point to encode 
    148 # @return $bocu1str : BOCU-1 string 
    149 #         or undef if an error occurs 
    150 # 
    151 # @see pack_diff() 
    152 # 
    153 sub encode_bocu1 { 
    154     my ($ref_prev,$c) = @_; 
    155     if (!defined($ref_prev) || $c < 0 || $c > 0x10ffff) { 
    156         # ERROR : illegal argument 
    157         return undef; 
    158     } 
    159  
    160     my $prev = $$ref_prev; 
    161     if ($prev == 0) { 
    162         # lenient handling of initial value 0 
    163         $prev = $$ref_prev = 0x40; 
    164     } 
    165  
    166     if ($c <= 0x20) { 
    167         # 
    168         # ISO C0 control & space: 
    169         # Encode directly for MIME compatibility, 
    170         # and reset state except for space, to not disrupt compression. 
    171         # 
    172         if ($c != 0x20) { 
    173             $$ref_prev = 0x40; 
    174         } 
    175         return chr($c); 
    176     } 
    177  
    178     # 
    179     # all other Unicode code points $c==U+0021..U+10ffff 
    180     # are encoded with the difference $c - $prev 
    181     # 
    182     # a new prev is computed from $c, 
    183     # placed in the middle of a 0x80-block (for most small scripts) or 
    184     # in the middle of the Unihan and Hangul blocks 
    185     # to statistically minimize the following difference 
    186     # 
    187     $$ref_prev = &bocu1_prev($c); 
    188     &pack_diff($c - $prev); 
    189 } 
    190  
    191 # 
    192 # Function for BOCU-1 decoder; handles multi-byte lead bytes. 
    193 # 
    194 # @param \%rx : reference to the decoder state structure { prev, count, diff }. 
    195 # @param $b : lead byte; 
    196 #          0x21 <= $b <  0x50 
    197 #       or 0xd0 <= $b <= 0xfe 
    198 # @return -1 (state change only) 
    199 # 
    200 # @see decode_bocu1() 
    201 # 
    202 sub decode_bocu1_lead_byte { 
    203     my ($ref_rx, $b) = @_; 
    204     my ($c,$count); 
    205  
    206     if ($b >= 0x50) { 
    207         # positive difference 
    208         ## since d0 <= $b ... 
    209         if ($b < 0xfb) { # d0 .. fa 
    210             # two bytes 
    211             $c = ($b - 0xd0) *243 + 63 + 1; # ( .. 42)*243 + 64  # .. 10270+r 
    212             $count = 1; 
    213         } elsif ($b < 0xfe) { # fb fc fd 
    214             # three bytes 
    215             $c = ($b - 0xfb) *243*243 + 10512 + 1; # (0..2)*243*243 + 10513  # 10513 .. 128611+r 
    216             $count = 2; 
    217         } else { # fe 
    218             # four bytes 
    219             $c = 187659 + 1; # 3 *243*243 * 10512 + 1  # 187660 ..  
    220             $count = 3; 
    221         } 
    222     } else { 
    223         # negative difference 
    224         if ($b >= 0x25) { # 25 .. 4f 
    225             # two bytes 
    226             $c = ($b - 0x50) * 243 - 64; # (-43 .. -1)*243-64 = -10513 .. -307 
    227             $count = 1; 
    228         } elsif ($b > 0x21) { # 22 23 24 
    229             # three bytes 
    230             $c = ($b - 0x25) *243*243 - 10513; # (-3 .. -1)*243*243 - 10513 = -187660 .. -69562 
    231             $count = 2; 
     157    my $utf8str = pack("U*", @codepoints); 
     158    Encode::_utf8_on($utf8str); 
     159    $utf8str; 
     160} 
     161 
     162sub utf8_to_bocu1 { 
     163    my $utf8str = shift; 
     164 
     165    my @chars = unpack("U*", $utf8str); 
     166    my $bocu1str = '*' x $#chars; 
     167    $bocu1str = ''; 
     168    my $pc = 0x40; 
     169    for (my $i=0; $i<=$#chars; $i++) { 
     170        my $cp = $chars[$i]; 
     171        next if $i == 0 && $cp == 0xfeff; 
     172 
     173        croak "unsupported codepoint (>0x1fffff)." if $cp > 0x001fffff; 
     174        # cp -> diff -> bocu1 
     175        if ($cp <= 0x20) { 
     176            $bocu1str .= chr($cp); 
     177            $pc = 0x40 unless $cp == 0x20; 
    232178        } else { 
    233             # four bytes 
    234             $c = -243*243*243 - 187660; # -1*243*243*243 - 187660 = -14536567 
    235             $count = 3; 
     179            my $diff = $cp - $pc; 
     180            my $b; 
     181            if ($diff < -187660) { # [...,-187660) : 21 
     182                $diff -= -14536567; 
     183                my $t3 = $diff % 243; $diff = int($diff / 243); 
     184                my $t2 = $diff % 243; $diff = int($diff / 243); 
     185                my $t1 = $diff % 243; $diff = int($diff / 243); 
     186#               my $t0 = $diff; 
     187                $b = pack("C4", 0x21, $bocu1_trail_to_byte[$t1], $bocu1_trail_to_byte[$t2], $bocu1_trail_to_byte[$t3]); 
     188            } elsif ($diff < -10513) { # [-187660,-10513) : 22-24 
     189                $diff -= -187660; 
     190                my $t2 = $diff % 243; $diff = int($diff / 243); 
     191                my $t1 = $diff % 243; $diff = int($diff / 243); 
     192                my $t0 = $diff; 
     193                $b = pack("C3", (0x22 + $t0), $bocu1_trail_to_byte[$t1], $bocu1_trail_to_byte[$t2]); 
     194            } elsif ($diff < -64) { # [-10513,-64) : 25-4F 
     195                $diff -= -10513; 
     196                my $t1 = $diff % 243; $diff = int($diff / 243); 
     197                my $t0 = $diff; 
     198                $b = pack("C2", (0x25 + $t0), $bocu1_trail_to_byte[$t1]); 
     199            } elsif ($diff < 64) { # [-64,63) : 50-CF 
     200                $diff -= -64; 
     201                my $t0 = $diff; 
     202                $b = pack("C", (0x50 + $t0)); 
     203            } elsif ($diff < 10513) { # [64,10513) : D0-FA 
     204                $diff -= 64; 
     205                my $t1 = $diff % 243; $diff = int($diff / 243); 
     206                my $t0 = $diff; 
     207                $b = pack("C2", (0xd0 + $t0), $bocu1_trail_to_byte[$t1]); 
     208            } elsif ($diff < 187660) { # [10513,187660) : FB-FD 
     209                $diff -= 10513; 
     210                my $t2 = $diff % 243; $diff = int($diff / 243); 
     211                my $t1 = $diff % 243; $diff = int($diff / 243); 
     212                my $t0 = $diff; 
     213                $b = pack("C3", (0xfb + $t0), $bocu1_trail_to_byte[$t1], $bocu1_trail_to_byte[$t2]); 
     214            } else { # [187660,...) : FE 
     215                $diff -= 187660; 
     216                my $t3 = $diff % 243; $diff = int($diff / 243); 
     217                my $t2 = $diff % 243; $diff = int($diff / 243); 
     218                my $t1 = $diff % 243; $diff = int($diff / 243); 
     219#               my $t0 = $diff; 
     220                $b = pack("C4", 0xfe, $bocu1_trail_to_byte[$t1], $bocu1_trail_to_byte[$t2], $bocu1_trail_to_byte[$t3]); 
     221            } 
     222            $bocu1str .= $b; 
     223 
     224            # next pc 
     225            if (0x3040 <= $cp && $cp <= 0x309f) { 
     226                $pc = 0x3070; 
     227            } elsif (0x4e00 <= $cp && $cp <= 0x9fa5) { 
     228                $pc = 0x7711; 
     229            } elsif (0xac00 <= $cp && $cp <= 0xd7a3) { 
     230                $pc = 0xc1d1; 
     231            } else { 
     232                $pc = $cp & ~0x7f | 0x40; 
     233            } 
    236234        } 
    237235    } 
    238236 
    239     # set the state for decoding the trail byte(s) 
    240     $$ref_rx{diff} = $c; 
    241     $$ref_rx{count} = $count; 
    242  
    243     -1; 
    244 } 
    245  
    246 # 
    247 # Function for BOCU-1 decoder; handles multi-byte trail bytes. 
    248 # 
    249 # @param \%rx : reference to the decoder state structure 
    250 # @param $b : trail byte 
    251 # @return result value, same as decodeBocu1 
    252 # 
    253 # @see decode_bocu1() 
    254 # 
    255 sub decode_bocu1_trail_byte { 
    256     my ($ref_rx, $b) = @_; 
    257     my ($t, $c, $count); 
    258  
    259     if ($b <= 0x20) { 
    260         # skip some C0 controls and make the trail byte range contiguous 
    261         $t = $bocu1_byte_to_trail[$b]; 
    262         if ($t < 0) { 
    263             # illegal trail byte value 
    264             $$ref_rx{prev} = 0x40; 
    265             $$ref_rx{count} = 0; 
    266             return -99; 
    267         } 
    268     } else { 
    269         $t = $b - 13; # BOCU1_TRAIL_BYTE_OFFSET; 
    270     } 
    271  
    272     # add trail byte into difference and decrement count 
    273     $c = $$ref_rx{diff}; 
    274     $count = $$ref_rx{count}; 
    275  
    276     if ($count == 1) { 
    277         # final trail byte, deliver a code point 
    278         $c = $$ref_rx{prev} + $c + $t; 
    279         if (0 <= $c && $c <= 0x10ffff) { 
    280             # valid code point result 
    281             $$ref_rx{prev} = &bocu1_prev($c); 
    282             $$ref_rx{count} = 0; 
    283             return $c; 
    284         } else { 
    285             # illegal code point result 
    286             $$ref_rx{prev} = 0x40; 
    287             $$ref_rx{count} = 0; 
    288             return -99; 
    289         } 
    290     } 
    291  
    292     # intermediate trail byte 
    293     if ($count == 2) { 
    294         $$ref_rx{diff} = $c + $t * 243; 
    295     } else { # if ($count == 3) { 
    296         $$ref_rx{diff} = $c + $t * 243 * 243; 
    297     } 
    298     $$ref_rx{count} = $count - 1; 
    299     -1; 
    300 } 
    301  
    302 # 
    303 # BOCU-1 decoder function. 
    304 # 
    305 # @param \%rx : reference to the decoder state structure; 
    306 #        the initial values should be 0 which 
    307 #        decodeBocu1 will set to actual initial state values 
    308 # @param $b : an input byte 
    309 # @return 
    310 #      0..0x10ffff for a result code point 
    311 #      -1 if only the state changed without code point output 
    312 #     <-1 if an error occurs 
    313 # 
    314 sub decode_bocu1 { 
    315     my ($ref_rx, $b) = @_; 
    316     my ($prev, $c, $count); 
    317  
    318     return -99 unless defined($ref_rx); ## ERROR: illegal argument 
    319  
    320     $prev = $$ref_rx{prev}; 
    321     if ($prev == 0) { 
    322         # lenient handling of initial 0 values 
    323         $prev = $$ref_rx{prev} = 0x40; 
    324         $count = $$ref_rx{count} = 0; 
    325     } else { 
    326         $count = $$ref_rx{count}; 
    327     } 
    328  
    329     if ($count == 0) { 
    330         # byte in lead position 
    331         if ($b <= 0x20) { 
    332             # 
    333             # Direct-encoded C0 control code or space. 
    334             # Reset prev for C0 control codes but not for space. 
    335             # 
    336             if ($b != 0x20) { 
    337                 $$ref_rx{prev} = 0x40; 
    338             } 
    339             return $b; 
    340         } 
    341  
    342         # 
    343         # $b is a difference lead byte. 
    344         # 
    345         # Return a code point directly from a single-byte difference. 
    346         # 
    347         # For multi-byte difference lead bytes, set the decoder state 
    348         # with the partial difference value from the lead byte and 
    349         # with the number of trail bytes. 
    350         # 
    351         # For four-byte differences, the signedness also affects the 
    352         # first trail byte, which has special handling farther below. 
    353         # 
    354         if ($b >= 0x50 && $b < 0xd0) { # 50 .. cf 
    355             # single-byte difference 
    356             $c = $prev + ($b - 0x90); 
    357             $$ref_rx{prev} = &bocu1_prev($c); 
    358             return $c; 
    359         } elsif ($b == 0xff) { # BOCU1_RESET 
    360             # only reset the state, no code point 
    361             $$ref_rx{prev} = 0x40; 
    362             return -1; 
    363         } else { 
    364             return decode_bocu1_lead_byte($ref_rx, $b); 
    365         } 
    366     } else { 
    367         # trail byte in any position 
    368         return decode_bocu1_trail_byte($ref_rx, $b); 
    369     } 
    370 } 
    371  
    372 # 
    373 # Decode a BOCU-1 byte sequence to a UCS-4 codepoint stream. 
    374 # 
    375 # @param : $bocu1str : input BOCU-1 string 
    376 # @return : @codepoints : UCS-4 codepoint stream 
    377 # 
    378 sub bocu1_to_codepoints { 
    379     my $bocu1str = shift; 
    380     my @chars = unpack("C*", $bocu1str); 
    381  
    382     my @codepoints = (); 
    383     my %rx = ( prev => 0, count => 0, diff => 0 ); 
    384  
    385     for (my $i=0; $i<=$#chars; $i++) { 
    386         my $c = &decode_bocu1(\%rx, $chars[$i]); 
    387         if ($c < -1) { 
    388             ## ERROR: "error: readString detects encoding error at string index %ld\n", i 
    389             return -1; 
    390         } 
    391         if ($c >= 0) { 
    392             push(@codepoints, $c) 
    393         } 
    394     } 
    395     return @codepoints; 
    396 } 
    397  
    398 ### 
    399 sub utf8_to_bocu1 { 
    400     my $utf8str = shift; 
    401     my $bocu1str = ''; 
    402  
    403     my @codepoints = unpack("U*", $utf8str); 
    404     my $prev = 0; 
    405     for (my $i=0; $i<=$#codepoints; $i++) { 
    406         my $codepoint = $codepoints[$i]; 
    407         next if $codepoint == 0xfeff && $i == 0; 
    408  
    409         $bocu1str .= &encode_bocu1(\$prev, $codepoint); 
    410     } 
    411237    $bocu1str; 
    412 } 
    413  
    414 sub bocu1_to_utf8 { 
    415     my $bocu1str = shift; 
    416  
    417     my @codepoints = &bocu1_to_codepoints($bocu1str); 
    418     my $utf8str = pack("U*", @codepoints); 
    419  
    420     Encode::_utf8_on($utf8str); 
    421  
    422     $utf8str; 
    423238} 
    424239 
     
    434249use Encode::BOCU1; 
    435250 
    436 $string = 'Some UTF-8 text to convert' 
     251$string = 'Some text to convert... in UTF-8' 
    437252Encode::from_to($string,'utf8','bocu1'); 
    438253Encode::from_to($string,'bocu1','shiftjis'); 
     
    441256 
    442257BOCU-1 is a MIME-compatible application of the Binary Ordered Compression for Unicode 
    443 [BOCU] base algorithm developed and patented by IBM. 
     258[BOCU] base algorithm. 
     259http://www.unicode.org/notes/tn6/ 
     260http://icu.sourceforge.net/docs/papers/binary_ordered_compression_for_unicode.html 
    444261 
    445262Encode::BOCU1 enables to convert any encoding systems supported by Encode.pm 
    446263from/to BOCU-1 through UTF-8. 
    447264 
    448 =head1 SEE ALSO 
    449  
    450 http://www.unicode.org/notes/tn6/ 
    451 http://icu.sourceforge.net/docs/papers/binary_ordered_compression_for_unicode.html 
    452  
    453265=head1 COPYRIGHT AND LICENSE 
    454266 
    455 This is pure-perl port of "BOCU-1 Sample C Code" written by Markus W. Scherer on 2002jan24, 
    456 available from http://www.unicode.org/notes/tn6/. 
    457  
    458 Ported by Naoya Tozuka E<lt>naoyat@naochan.comE<gt> 
    459  
    460 As with the original C code, this port is licensed under the X license (ICU version). 
     267Copyright (C) 2006 Naoya Tozuka E<lt>naoyat@naochan.comE<gt> 
     268 
     269Based on pure-perl port of "Sample C Sources" on http://www.unicode.org/notes/tn6/. 
     270"Sample C Sources" are licensed under the X license (ICU version). 
     271This module is licensed under the same license. 
     272 
     273BOCU ("Binary-Ordered Compression For Unicode") is patent-protected technology of IBM. 
     274(US Patent 6737994) 
     275 
    461276ICU License : http://dev.icu-project.org/cgi-bin/viewcvs.cgi/*checkout*/icu/license.html 
    462277 
    463 BOCU "Binary-Ordered Compression For Unicode" is a patent-protected technology of IBM. 
    464 (US Patent 6737994) 
    465  
    466278=cut 
  • lang/perl/Encode-BOCU1/trunk/blib/man3/Encode::BOCU1.3pm

    r2722 r2743  
    137137use Encode::BOCU1; 
    138138.PP 
    139 $string = 'Some \s-1UTF\-8\s0 text to convert' 
     139$string = 'Some text to convert... in \s-1UTF\-8\s0' 
    140140Encode::from_to($string,'utf8','bocu1'); 
    141141Encode::from_to($string,'bocu1','shiftjis'); 
     
    143143.IX Header "DESCRIPTION" 
    144144\&\s-1BOCU\-1\s0 is a MIME-compatible application of the Binary Ordered Compression for Unicode 
    145 [\s-1BOCU\s0] base algorithm developed and patented by \s-1IBM\s0. 
     145[\s-1BOCU\s0] base algorithm. 
     146http://www.unicode.org/notes/tn6/ 
     147http://icu.sourceforge.net/docs/papers/binary_ordered_compression_for_unicode.html 
    146148.PP 
    147149Encode::BOCU1 enables to convert any encoding systems supported by Encode.pm 
    148150from/to \s-1BOCU\-1\s0 through \s-1UTF\-8\s0. 
    149 .SH "SEE ALSO" 
    150 .IX Header "SEE ALSO" 
    151 http://www.unicode.org/notes/tn6/ 
    152 http://icu.sourceforge.net/docs/papers/binary_ordered_compression_for_unicode.html 
    153151.SH "COPYRIGHT AND LICENSE" 
    154152.IX Header "COPYRIGHT AND LICENSE" 
    155 This is pure-perl port of \*(L"\s-1BOCU\-1\s0 Sample C Code\*(R" written by Markus W. Scherer on 2002jan24, 
    156 available from http://www.unicode.org/notes/tn6/. 
     153Copyright (C) 2006 Naoya Tozuka <naoyat@naochan.com> 
    157154.PP 
    158 Ported by Naoya Tozuka <naoyat@naochan.com> 
     155Based on pure-perl port of \*(L"Sample C Sources\*(R" on http://www.unicode.org/notes/tn6/. 
     156\&\*(L"Sample C Sources\*(R" are licensed under the X license (\s-1ICU\s0 version). 
     157This module is licensed under the same license. 
    159158.PP 
    160 As with the original C code, this port is licensed under the X license (\s-1ICU\s0 version). 
     159\&\s-1BOCU\s0 (\*(L"Binary\-Ordered Compression For Unicode\*(R") is patent-protected technology of \s-1IBM\s0. 
     160(\s-1US\s0 Patent 6737994) 
     161.PP 
    161162\&\s-1ICU\s0 License : http://dev.icu\-project.org/cgi\-bin/viewcvs.cgi/*checkout*/icu/license.html 
    162 .PP 
    163 \&\s-1BOCU\s0 \*(L"Binary\-Ordered Compression For Unicode\*(R" is a patent-protected technology of \s-1IBM\s0. 
    164 (\s-1US\s0 Patent 6737994) 
  • lang/perl/Encode-BOCU1/trunk/lib/Encode/BOCU1.pm

    r2722 r2743  
    11package Encode::BOCU1; 
     2 
     3use 5.008; 
    24use strict; 
     5use warnings; 
     6use Carp; 
     7 
    38use base qw(Encode::Encoding); 
    49 
    5 our $VERSION = '0.01'; 
     10our $VERSION = '0.02'; 
    611 
    712__PACKAGE__->Define('bocu1'); 
    813 
    914use Encode::Alias; 
    10 define_alias( qr/^bocu.1$/i => '"bocu1"'); # BOCU-1, Bocu_1, bocu.1, ... 
     15define_alias( qr/^bocu.1$/i => '"bocu1"'); 
    1116define_alias( qr/^bocu$/i => '"bocu1"'); 
     17 
    1218 
    1319# 
     
    1723    my ($obj, $str, $check) = @_; 
    1824    my $octet = utf8_to_bocu1($str); 
    19     $_[1] = '' if $check; # $this is what in-place edit means 
     25 
     26    $_[1] = '' if $check; 
    2027    return $octet; 
    2128} 
    22 sub decode ($$;$) { 
     29sub decode($$;$) { 
    2330    my ($obj, $octet, $check) = @_; 
    2431    my $str = bocu1_to_utf8($octet); 
     32 
    2533    $_[1] = '' if $check; 
    2634    return $str; 
     
    2836 
    2937# 
    30 # Subroutines 
    31 # 
    32 # based on the sample C code available on http://www.unicode.org/notes/tn6/ 
    33 # US Patent 6737994 "Binary-Ordered Compression For Unicode" by IBM 
     38# subroutines 
    3439# 
    3540my @bocu1_trail_to_byte = ( 
    36 #   0     1     2     3     4     5     6     7 
    37     0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11, 
    38 #   8     9     a     b     c     d     e     f 
    39     0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 
    40 #   10    11    12    13 
    41     0x1c, 0x1d, 0x1e, 0x1f ); 
     41#   0 - 19 (0x0 - 0x13) 
     42          0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 
     43    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,             0x1c, 0x1d, 0x1e, 0x1f, 
     44#   20 - 242 (0x14 - 0xf2) 
     45          0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 
     46    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 
     47    0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 
     48    0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, 
     49    0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 
     50    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 
     51    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 
     52    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 
     53    0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 
     54    0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 
     55    0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 
     56    0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 
     57    0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 
     58    0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff ); 
     59 
    4260my @bocu1_byte_to_trail = ( 
    43 #   0     1     2     3     4     5     6     7 
    44     -1,   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1, 
    45 #   8     9     a     b     c     d     e     f 
    46     -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1, 
    47 #   10    11    12    13    14    15    16    17 
    48     0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 
    49 #   18    19    1a    1b    1c    1d    1e    1f 
    50     0x0e, 0x0f, -1,   -1,   0x10, 0x11, 0x12, 0x13, 
    51 #   20 
    52     -1 ); 
    53  
    54 # Compute the next "previous" value for differencing from the current code point. 
    55 # 
    56 # @param $c : current code point, 0..0x10ffff 
    57 # @return "previous code point" state value 
    58 sub bocu1_prev { 
    59     my $c = shift; 
    60  
    61     if (0x3040 <= $c && $c <= 0x309f) { 
    62         # Hiragana is not 128-aligned 
    63         0x3070; 
    64     } elsif (0x4e00 <= $c && $c <= 0x9fa5) { 
    65         # CJI Unihan 
    66         0x7711; # 0x4e00 - -10513 
    67     } elsif (0xac00 <= $c && $c <= 0xd7a3) { 
    68         # Korean Hangul 
    69         0xc1d1; # (ac00 + d7a3)/2 
    70     } else { 
    71         # mostly small scripts 
    72         ($c & ~0x7f) + 0x40; 
    73     } 
    74 } 
    75  
    76 # Encode a difference -0x10ffff..0x10ffff in 1..4 bytes and return a BOCU-1 string 
    77 # 
    78 # The encoding favors small absolute differences with short encodings 
    79 # to compress runs of same-script characters. 
    80  
    81 # @param $diff : difference value -0x10ffff..0x10ffff 
    82 # @return $bocu1str : BOCU-1 string 
    83 sub pack_diff { 
    84     my $diff = shift; ## -1114111 (=-0x10ffff) .. 1114111 (=0x10ffff) 
    85     my ($lead,$count); 
    86  
    87     if ($diff >= -64) { 
    88         # mostly positive differences, and single-byte negative ones 
    89         if ($diff <= 63) { # -64 .. 63 
    90             # single byte 
    91             return chr(0x90 + $diff); # 0x50 .. 0xcf 
    92         } elsif ($diff <= 10512) { # 64 .. 10512 
    93             # two bytes 
    94             $diff -= 64; # 0 .. 10448 (= 43*243-1) 
    95             $lead = 0xd0; # 0xd0 .. 0xfa 
    96             $count = 1; 
    97         } elsif ($diff <= 187659) { # 10513 .. 187659 
    98             # three bytes 
    99             $diff -= 10513; # 0 .. 177146 (= 3*243*243-1) 
    100             $lead = 0xfb; # 0xfb .. 0xfd 
    101             $count = 2; 
    102         } else { # if ($diff <= 14536566) { # 187660 .. (14536566) 
    103 #            # four bytes 
    104 #            $diff -= 187660; # 0 .. 14348906 (1*243*243*243-1) 
    105 #            $lead = 0xfe; # 0xfe 
    106 #            $count = 3; 
     61#   0x00 - 0x20 
     62    -1,   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1, 
     63    0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, -1,   -1,   0x10, 0x11, 0x12, 0x13, 
     64    -1, 
     65#   0x21 - 0xff 
     66          0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 
     67    0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 
     68    0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41, 0x42, 
     69    0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 
     70    0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62, 
     71    0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 
     72    0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 
     73    0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 
     74    0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 
     75    0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 
     76    0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 
     77    0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 
     78    0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 
     79    0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2 ); 
     80 
     81sub bocu1_to_utf8 { 
     82    my $bocu1str = shift; 
     83    my @chars = unpack("C*", $bocu1str); 
     84 
     85    my $pc = 0x40; 
     86    my @codepoints; 
     87    for (my $i=0; $i<=$#chars; $i++) { 
     88        my $lead = $chars[$i]; 
     89        my $cp = 0; 
     90        my $diff = 0; 
     91        if ($lead <= 0x20) { 
     92            $cp = $lead; 
     93        } elsif ($lead == 0x21) { # 21 t1 t2 t3 
     94            my $t1 = $bocu1_byte_to_trail[$chars[++$i]]; 
     95            my $t2 = $bocu1_byte_to_trail[$chars[++$i]]; 
     96            my $t3 = $bocu1_byte_to_trail[$chars[++$i]]; 
     97            croak "illegal trail char" if $t1 < 0 || $t2 < 0 || $t3 < 0; 
     98            $diff = 14161247 + $t1 * 59049 + $t2 * 243 + $t3 
     99        } elsif ($lead < 0x25) { # [22-24] t1 t2 
     100            my $t1 = $bocu1_byte_to_trail[$chars[++$i]]; 
     101            my $t2 = $bocu1_byte_to_trail[$chars[++$i]]; 
     102            croak "illegal trail char" if $t1 < 0 || $t2 < 0; 
     103            $diff = -2195326 + $lead * 59049 + $t1 * 243 + $t2; 
     104        } elsif ($lead < 0x50) { # [25-4F] t1 
     105            my $t1 = $bocu1_byte_to_trail[$chars[++$i]]; 
     106            croak "illegal trail char" if $t1 < 0; 
     107            $diff = -19504 + $lead * 243 + $t1; 
     108        } elsif ($lead < 0xd0) { # [50-CF] 
     109            $diff = $lead - 0x90; 
     110        } elsif ($lead < 0xfb) { # [D0-FA] t1 
     111            my $t1 = $bocu1_byte_to_trail[$chars[++$i]]; 
     112            croak "illegal trail char" if $t1 < 0; 
     113            $diff = -50480 + $lead * 243 + $t1; 
     114        } elsif ($lead < 0xfe) { # [FB-FD] t1 t2 
     115            my $t1 = $bocu1_byte_to_trail[$chars[++$i]]; 
     116            my $t2 = $bocu1_byte_to_trail[$chars[++$i]]; 
     117            croak "illegal trail char" if $t1 < 0 || $t2 < 0; 
     118            $diff = -14810786 + $lead * 59049 + $t1 * 243 + $t2; 
     119        } elsif ($lead == 0xfe) { # FE t1 t2 t3 
     120            my $t1 = $bocu1_byte_to_trail[$chars[++$i]]; 
     121            my $t2 = $bocu1_byte_to_trail[$chars[++$i]]; 
     122            my $t3 = $bocu1_byte_to_trail[$chars[++$i]]; 
     123            croak "illegal trail char" if $t1 < 0 || $t2 < 0 || $t3 < 0; 
     124            $diff = 187660 + $t1 * 59049 + $t2 * 243 + $t3; 
     125        } elsif ($lead == 0xff) { 
     126            ## reset 
     127            $cp = 0; 
     128            $diff = 0; 
    107129        } 
    108     } else { # $diff < -64 
    109         # two- and four-byte negative differences 
    110         if ($diff >= -10513) { # -10513 .. -65 
    111             # two bytes 
    112             $diff -= -64; # -43*243 .. -1 
    113             $lead = 0x50; # 0x25 .. 0x4f 
    114             $count = 1; 
    115         } elsif ($diff >= -187660) { # -187660 .. -10514 
    116             # three bytes 
    117             $diff -= -10513; # -3*243*243 .. -1 
    118             $lead = 0x25; # 0x22 .. 0x24 
    119             $count = 2; 
    120         } else { # if ($diff >= -14536567) { # (-14536567) .. -187661 
    121             # four bytes 
    122             $diff -= -187660; # -1*243*243*243 .. -1 
    123             $lead = 0x22; # 0x21 
    124             $count = 3; 
     130 
     131        # codepoint, next pc 
     132        if ($lead <= 0x20) { 
     133            $pc = 0x40 if ($lead < 0x20); 
     134            push(@codepoints,$lead); 
     135        } elsif ($lead < 0xff) { 
     136            $cp = $pc + $diff; 
     137            $cp = 0 if $cp < 0; 
     138            push(@codepoints,$cp); 
     139            if ($cp < 0x20) { 
     140                $pc = 0x40; 
     141            } elsif ($cp == 0x20) { 
     142                # keep pc 
     143            } elsif (0x3040 <= $cp && $cp <= 0x309f) { 
     144                $pc = 0x3070; 
     145            } elsif (0x4e00 <= $cp && $cp <= 0x9fa5) { 
     146                $pc = 0x7711; 
     147            } elsif (0xac00 <= $cp && $cp <= 0xd7a3) { 
     148                $pc = 0xc1d1; 
     149            } else { 
     150                $pc = ($cp & ~0x7f) + 0x40; 
     151            } 
     152        } else { # 0xff : reset 
     153            $pc = 0x40; 
    125154        } 
    126155    } 
    127156 
    128     # calculate trail bytes 
    129     my $bocu1_str = ''; 
    130     for (my $i=$count; $i>0; $i--) { 
    131         my $trail = $diff % 243; 
    132         $diff = ($diff - $trail) / 243; 
    133          
    134         my $byte = $trail >= 20 ? $trail + 13 : $bocu1_trail_to_byte[$trail]; 
    135         $bocu1_str = chr($byte) . $bocu1_str; 
    136     } 
    137     chr($lead + $diff) . $bocu1_str; 
    138 } 
    139  
    140 # 
    141 # BOCU-1 encoder function. 
    142 # 
    143 # @param \$prev : reference to the integer that holds 
    144 #        the "previous code point" state; 
    145 #        the initial value should be 0 which 
    146 #        encode_bocu1() will set to the actual BOCU-1 initial state value 
    147 # @param $c : the code point to encode 
    148 # @return $bocu1str : BOCU-1 string 
    149 #         or undef if an error occurs 
    150 # 
    151 # @see pack_diff() 
    152 # 
    153 sub encode_bocu1 { 
    154     my ($ref_prev,$c) = @_; 
    155     if (!defined($ref_prev) || $c < 0 || $c > 0x10ffff) { 
    156         # ERROR : illegal argument 
    157         return undef; 
    158     } 
    159  
    160     my $prev = $$ref_prev; 
    161     if ($prev == 0) { 
    162         # lenient handling of initial value 0 
    163         $prev = $$ref_prev = 0x40; 
    164     } 
    165  
    166     if ($c <= 0x20) { 
    167         # 
    168         # ISO C0 control & space: 
    169         # Encode directly for MIME compatibility, 
    170         # and reset state except for space, to not disrupt compression. 
    171         # 
    172         if ($c != 0x20) { 
    173             $$ref_prev = 0x40; 
    174         } 
    175         return chr($c); 
    176     } 
    177  
    178     # 
    179     # all other Unicode code points $c==U+0021..U+10ffff 
    180     # are encoded with the difference $c - $prev 
    181     # 
    182     # a new prev is computed from $c, 
    183     # placed in the middle of a 0x80-block (for most small scripts) or 
    184     # in the middle of the Unihan and Hangul blocks 
    185     # to statistically minimize the following difference 
    186     # 
    187     $$ref_prev = &bocu1_prev($c); 
    188     &pack_diff($c - $prev); 
    189 } 
    190  
    191 # 
    192 # Function for BOCU-1 decoder; handles multi-byte lead bytes. 
    193 # 
    194 # @param \%rx : reference to the decoder state structure { prev, count, diff }. 
    195 # @param $b : lead byte; 
    196 #          0x21 <= $b <  0x50 
    197 #       or 0xd0 <= $b <= 0xfe 
    198 # @return -1 (state change only) 
    199 # 
    200 # @see decode_bocu1() 
    201 # 
    202 sub decode_bocu1_lead_byte { 
    203     my ($ref_rx, $b) = @_; 
    204     my ($c,$count); 
    205  
    206     if ($b >= 0x50) { 
    207         # positive difference 
    208         ## since d0 <= $b ... 
    209         if ($b < 0xfb) { # d0 .. fa 
    210             # two bytes 
    211             $c = ($b - 0xd0) *243 + 63 + 1; # ( .. 42)*243 + 64  # .. 10270+r 
    212             $count = 1; 
    213         } elsif ($b < 0xfe) { # fb fc fd 
    214             # three bytes 
    215             $c = ($b - 0xfb) *243*243 + 10512 + 1; # (0..2)*243*243 + 10513  # 10513 .. 128611+r 
    216             $count = 2; 
    217         } else { # fe 
    218             # four bytes 
    219             $c = 187659 + 1; # 3 *243*243 * 10512 + 1  # 187660 ..  
    220             $count = 3; 
    221         } 
    222     } else { 
    223         # negative difference 
    224         if ($b >= 0x25) { # 25 .. 4f 
    225             # two bytes 
    226             $c = ($b - 0x50) * 243 - 64; # (-43 .. -1)*243-64 = -10513 .. -307 
    227             $count = 1; 
    228         } elsif ($b > 0x21) { # 22 23 24 
    229             # three bytes 
    230             $c = ($b - 0x25) *243*243 - 10513; # (-3 .. -1)*243*243 - 10513 = -187660 .. -69562 
    231             $count = 2; 
     157    my $utf8str = pack("U*", @codepoints); 
     158    Encode::_utf8_on($utf8str); 
     159    $utf8str; 
     160} 
     161 
     162sub utf8_to_bocu1 { 
     163    my $utf8str = shift; 
     164 
     165    my @chars = unpack("U*", $utf8str); 
     166    my $bocu1str = '*' x $#chars; 
     167    $bocu1str = ''; 
     168    my $pc = 0x40; 
     169    for (my $i=0; $i<=$#chars; $i++) { 
     170        my $cp = $chars[$i]; 
     171        next if $i == 0 && $cp == 0xfeff; 
     172 
     173        croak "unsupported codepoint (>0x1fffff)." if $cp > 0x001fffff; 
     174        # cp -> diff -> bocu1 
     175        if ($cp <= 0x20) { 
     176            $bocu1str .= chr($cp); 
     177            $pc = 0x40 unless $cp == 0x20; 
    232178        } else { 
    233             # four bytes 
    234             $c = -243*243*243 - 187660; # -1*243*243*243 - 187660 = -14536567 
    235             $count = 3; 
     179            my $diff = $cp - $pc; 
     180            my $b; 
     181            if ($diff < -187660) { # [...,-187660) : 21 
     182                $diff -= -14536567; 
     183                my $t3 = $diff % 243; $diff = int($diff / 243); 
     184                my $t2 = $diff % 243; $diff = int($diff / 243); 
     185                my $t1 = $diff % 243; $diff = int($diff / 243); 
     186#               my $t0 = $diff; 
     187                $b = pack("C4", 0x21, $bocu1_trail_to_byte[$t1], $bocu1_trail_to_byte[$t2], $bocu1_trail_to_byte[$t3]); 
     188            } elsif ($diff < -10513) { # [-187660,-10513) : 22-24 
     189                $diff -= -187660; 
     190                my $t2 = $diff % 243; $diff = int($diff / 243); 
     191                my $t1 = $diff % 243; $diff = int($diff / 243); 
     192                my $t0 = $diff; 
     193                $b = pack("C3", (0x22 + $t0), $bocu1_trail_to_byte[$t1], $bocu1_trail_to_byte[$t2]); 
     194            } elsif ($diff < -64) { # [-10513,-64) : 25-4F 
     195                $diff -= -10513; 
     196                my $t1 = $diff % 243; $diff = int($diff / 243); 
     197                my $t0 = $diff; 
     198                $b = pack("C2", (0x25 + $t0), $bocu1_trail_to_byte[$t1]); 
     199            } elsif ($diff < 64) { # [-64,63) : 50-CF 
     200                $diff -= -64; 
     201                my $t0 = $diff; 
     202                $b = pack("C", (0x50 + $t0)); 
     203            } elsif ($diff < 10513) { # [64,10513) : D0-FA 
     204                $diff -= 64; 
     205                my $t1 = $diff % 243; $diff = int($diff / 243); 
     206                my $t0 = $diff; 
     207                $b = pack("C2", (0xd0 + $t0), $bocu1_trail_to_byte[$t1]); 
     208            } elsif ($diff < 187660) { # [10513,187660) : FB-FD 
     209                $diff -= 10513; 
     210                my $t2 = $diff % 243; $diff = int($diff / 243); 
     211                my $t1 = $diff % 243; $diff = int($diff / 243); 
     212                my $t0 = $diff; 
     213                $b = pack("C3", (0xfb + $t0), $bocu1_trail_to_byte[$t1], $bocu1_trail_to_byte[$t2]); 
     214            } else { # [187660,...) : FE 
     215                $diff -= 187660; 
     216                my $t3 = $diff % 243; $diff = int($diff / 243); 
     217                my $t2 = $diff % 243; $diff = int($diff / 243); 
     218                my $t1 = $diff % 243; $diff = int($diff / 243); 
     219#               my $t0 = $diff; 
     220                $b = pack("C4", 0xfe, $bocu1_trail_to_byte[$t1], $bocu1_trail_to_byte[$t2], $bocu1_trail_to_byte[$t3]); 
     221            } 
     222            $bocu1str .= $b; 
     223 
     224            # next pc 
     225            if (0x3040 <= $cp && $cp <= 0x309f) { 
     226                $pc = 0x3070; 
     227            } elsif (0x4e00 <= $cp && $cp <= 0x9fa5) { 
     228                $pc = 0x7711; 
     229            } elsif (0xac00 <= $cp && $cp <= 0xd7a3) { 
     230                $pc = 0xc1d1; 
     231            } else { 
     232                $pc = $cp & ~0x7f | 0x40; 
     233            } 
    236234        } 
    237235    } 
    238236 
    239     # set the state for decoding the trail byte(s) 
    240     $$ref_rx{diff} = $c; 
    241     $$ref_rx{count} = $count; 
    242  
    243     -1; 
    244 } 
    245  
    246 # 
    247 # Function for BOCU-1 decoder; handles multi-byte trail bytes. 
    248 # 
    249 # @param \%rx : reference to the decoder state structure 
    250 # @param $b : trail byte 
    251 # @return result value, same as decodeBocu1 
    252 # 
    253 # @see decode_bocu1() 
    254 # 
    255 sub decode_bocu1_trail_byte { 
    256     my ($ref_rx, $b) = @_; 
    257     my ($t, $c, $count); 
    258  
    259     if ($b <= 0x20) { 
    260         # skip some C0 controls and make the trail byte range contiguous 
    261         $t = $bocu1_byte_to_trail[$b]; 
    262         if ($t < 0) { 
    263             # illegal trail byte value 
    264             $$ref_rx{prev} = 0x40; 
    265             $$ref_rx{count} = 0; 
    266             return -99; 
    267         } 
    268     } else { 
    269         $t = $b - 13; # BOCU1_TRAIL_BYTE_OFFSET; 
    270     } 
    271  
    272     # add trail byte into difference and decrement count 
    273     $c = $$ref_rx{diff}; 
    274     $count = $$ref_rx{count}; 
    275  
    276     if ($count == 1) { 
    277         # final trail byte, deliver a code point 
    278         $c = $$ref_rx{prev} + $c + $t; 
    279         if (0 <= $c && $c <= 0x10ffff) { 
    280             # valid code point result 
    281             $$ref_rx{prev} = &bocu1_prev($c); 
    282             $$ref_rx{count} = 0; 
    283             return $c; 
    284         } else { 
    285             # illegal code point result 
    286             $$ref_rx{prev} = 0x40; 
    287             $$ref_rx{count} = 0; 
    288             return -99; 
    289         } 
    290     } 
    291  
    292     # intermediate trail byte 
    293     if ($count == 2) { 
    294         $$ref_rx{diff} = $c + $t * 243; 
    295     } else { # if ($count == 3) { 
    296         $$ref_rx{diff} = $c + $t * 243 * 243; 
    297     } 
    298     $$ref_rx{count} = $count - 1; 
    299     -1; 
    300 } 
    301  
    302 # 
    303 # BOCU-1 decoder function. 
    304 # 
    305 # @param \%rx : reference to the decoder state structure; 
    306 #        the initial values should be 0 which 
    307 #        decodeBocu1 will set to actual initial state values 
    308 # @param $b : an input byte 
    309 # @return 
    310 #      0..0x10ffff for a result code point 
    311 #      -1 if only the state changed without code point output 
    312 #     <-1 if an error occurs 
    313 # 
    314 sub decode_bocu1 { 
    315     my ($ref_rx, $b) = @_; 
    316     my ($prev, $c, $count); 
    317  
    318     return -99 unless defined($ref_rx); ## ERROR: illegal argument 
    319  
    320     $prev = $$ref_rx{prev}; 
    321     if ($prev == 0) { 
    322         # lenient handling of initial 0 values 
    323         $prev = $$ref_rx{prev} = 0x40; 
    324         $count = $$ref_rx{count} = 0; 
    325     } else { 
    326         $count = $$ref_rx{count}; 
    327     } 
    328  
    329     if ($count == 0) { 
    330         # byte in lead position 
    331         if ($b <= 0x20) { 
    332             # 
    333             # Direct-encoded C0 control code or space. 
    334             # Reset prev for C0 control codes but not for space. 
    335             # 
    336             if ($b != 0x20) { 
    337                 $$ref_rx{prev} = 0x40; 
    338             } 
    339             return $b; 
    340         } 
    341  
    342         # 
    343         # $b is a difference lead byte. 
    344         # 
    345         # Return a code point directly from a single-byte difference. 
    346         # 
    347         # For multi-byte difference lead bytes, set the decoder state 
    348         # with the partial difference value from the lead byte and 
    349         # with the number of trail bytes. 
    350         # 
    351         # For four-byte differences, the signedness also affects the 
    352         # first trail byte, which has special handling farther below. 
    353         # 
    354         if ($b >= 0x50 && $b < 0xd0) { # 50 .. cf 
    355             # single-byte difference 
    356             $c = $prev + ($b - 0x90); 
    357             $$ref_rx{prev} = &bocu1_prev($c); 
    358             return $c; 
    359         } elsif ($b == 0xff) { # BOCU1_RESET 
    360             # only reset the state, no code point 
    361             $$ref_rx{prev} = 0x40; 
    362             return -1; 
    363         } else { 
    364             return decode_bocu1_lead_byte($ref_rx, $b); 
    365         } 
    366     } else { 
    367         # trail byte in any position 
    368         return decode_bocu1_trail_byte($ref_rx, $b); 
    369     } 
    370 } 
    371  
    372 # 
    373 # Decode a BOCU-1 byte sequence to a UCS-4 codepoint stream. 
    374 # 
    375 # @param : $bocu1str : input BOCU-1 string 
    376 # @return : @codepoints : UCS-4 codepoint stream 
    377 # 
    378 sub bocu1_to_codepoints { 
    379     my $bocu1str = shift; 
    380     my @chars = unpack("C*", $bocu1str); 
    381  
    382     my @codepoints = (); 
    383     my %rx = ( prev => 0, count => 0, diff => 0 ); 
    384  
    385     for (my $i=0; $i<=$#chars; $i++) { 
    386         my $c = &decode_bocu1(\%rx, $chars[$i]); 
    387         if ($c < -1) { 
    388             ## ERROR: "error: readString detects encoding error at string index %ld\n", i 
    389             return -1; 
    390         } 
    391         if ($c >= 0) { 
    392             push(@codepoints, $c) 
    393         } 
    394     } 
    395     return @codepoints; 
    396 } 
    397  
    398 ### 
    399 sub utf8_to_bocu1 { 
    400     my $utf8str = shift; 
    401     my $bocu1str = ''; 
    402  
    403     my @codepoints = unpack("U*", $utf8str); 
    404     my $prev = 0; 
    405     for (my $i=0; $i<=$#codepoints; $i++) { 
    406         my $codepoint = $codepoints[$i]; 
    407         next if $codepoint == 0xfeff && $i == 0; 
    408  
    409         $bocu1str .= &encode_bocu1(\$prev, $codepoint); 
    410     } 
    411237    $bocu1str; 
    412 } 
    413  
    414 sub bocu1_to_utf8 { 
    415     my $bocu1str = shift; 
    416  
    417     my @codepoints = &bocu1_to_codepoints($bocu1str); 
    418     my $utf8str = pack("U*", @codepoints); 
    419  
    420     Encode::_utf8_on($utf8str); 
    421  
    422     $utf8str; 
    423238} 
    424239 
     
    434249use Encode::BOCU1; 
    435250 
    436 $string = 'Some UTF-8 text to convert' 
     251$string = 'Some text to convert... in UTF-8' 
    437252Encode::from_to($string,'utf8','bocu1'); 
    438253Encode::from_to($string,'bocu1','shiftjis'); 
     
    441256 
    442257BOCU-1 is a MIME-compatible application of the Binary Ordered Compression for Unicode 
    443 [BOCU] base algorithm developed and patented by IBM. 
     258[BOCU] base algorithm. 
     259http://www.unicode.org/notes/tn6/ 
     260http://icu.sourceforge.net/docs/papers/binary_ordered_compression_for_unicode.html 
    444261 
    445262Encode::BOCU1 enables to convert any encoding systems supported by Encode.pm 
    446263from/to BOCU-1 through UTF-8. 
    447264 
    448 =head1 SEE ALSO 
    449  
    450 http://www.unicode.org/notes/tn6/ 
    451 http://icu.sourceforge.net/docs/papers/binary_ordered_compression_for_unicode.html 
    452  
    453265=head1 COPYRIGHT AND LICENSE 
    454266 
    455 This is pure-perl port of "BOCU-1 Sample C Code" written by Markus W. Scherer on 2002jan24, 
    456 available from http://www.unicode.org/notes/tn6/. 
    457  
    458 Ported by Naoya Tozuka E<lt>naoyat@naochan.comE<gt> 
    459  
    460 As with the original C code, this port is licensed under the X license (ICU version). 
     267Copyright (C) 2006 Naoya Tozuka E<lt>naoyat@naochan.comE<gt> 
     268 
     269Based on pure-perl port of "Sample C Sources" on http://www.unicode.org/notes/tn6/. 
     270"Sample C Sources" are licensed under the X license (ICU version). 
     271This module is licensed under the same license. 
     272 
     273BOCU ("Binary-Ordered Compression For Unicode") is patent-protected technology of IBM. 
     274(US Patent 6737994) 
     275 
    461276ICU License : http://dev.icu-project.org/cgi-bin/viewcvs.cgi/*checkout*/icu/license.html 
    462277 
    463 BOCU "Binary-Ordered Compression For Unicode" is a patent-protected technology of IBM. 
    464 (US Patent 6737994) 
    465  
    466278=cut