root/lang/perl/Encode-BOCU1-XS/trunk/IBM_CODES/bocu1.c @ 2725

Revision 2725, 10.8 kB (checked in by naoya_t, 6 years ago)

r2712@localhost: naochan | 2007-12-07 09:44:14 +0900
Encode-BOCU1-XS initial import

Line 
/*
******************************************************************************
*
*   Copyright (C) 2002, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*   For licensing terms see the ICU X License:
*   http://oss.software.ibm.com/cvs/icu/~checkout~/icu/license.html
*
******************************************************************************
*   file name:  bocu1.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2002jan24
*   created by: Markus W. Scherer
*
*   This is a sample implementation of encoder and decoder functions for BOCU-1,
*   a MIME-compatible Binary Ordered Compression for Unicode.
*/
22
23#include <stdio.h>
24#include <string.h>
25
26/*
27 * Standard ICU header.
28 * - Includes inttypes.h or defines its types.
29 * - Defines UChar for UTF-16 as an unsigned 16-bit type (wchar_t or uint16_t).
30 * - Defines UTF* macros to handle reading and writing
31 *   of in-process UTF-8/16 strings.
32 */
33#include "unicode/utypes.h"
34
35#include "bocu1.h"
36
37/* BOCU-1 implementation functions ------------------------------------------ */
38
39/**
40 * Compute the next "previous" value for differencing
41 * from the current code point.
42 *
43 * @param c current code point, 0..0x10ffff
44 * @return "previous code point" state value
45 */
46U_INLINE int32_t
47bocu1Prev(int32_t c) {
48    /* compute new prev */
49    if(0x3040<=c && c<=0x309f) {
50        /* Hiragana is not 128-aligned */
51        return 0x3070;
52    } else if(0x4e00<=c && c<=0x9fa5) {
53        /* CJK Unihan */
54        return 0x4e00-BOCU1_REACH_NEG_2;
55    } else if(0xac00<=c && c<=0xd7a3) {
56        /* Korean Hangul */
57        return (0xd7a3+0xac00)/2;
58    } else {
59        /* mostly small scripts */
60        return (c&~0x7f)+BOCU1_ASCII_PREV;
61    }
62}
63
64/**
65 * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
66 * and return a packed integer with them.
67 *
68 * The encoding favors small absolut differences with short encodings
69 * to compress runs of same-script characters.
70 *
71 * @param diff difference value -0x10ffff..0x10ffff
72 * @return
73 *      0x010000zz for 1-byte sequence zz
74 *      0x0200yyzz for 2-byte sequence yy zz
75 *      0x03xxyyzz for 3-byte sequence xx yy zz
76 *      0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
77 */
78U_CFUNC int32_t
79packDiff(int32_t diff) {
80    int32_t result, m, lead, count, shift;
81
82    if(diff>=BOCU1_REACH_NEG_1) {
83        /* mostly positive differences, and single-byte negative ones */
84        if(diff<=BOCU1_REACH_POS_1) {
85            /* single byte */
86            return 0x01000000|(BOCU1_MIDDLE+diff);
87        } else if(diff<=BOCU1_REACH_POS_2) {
88            /* two bytes */
89            diff-=BOCU1_REACH_POS_1+1;
90            lead=BOCU1_START_POS_2;
91            count=1;
92        } else if(diff<=BOCU1_REACH_POS_3) {
93            /* three bytes */
94            diff-=BOCU1_REACH_POS_2+1;
95            lead=BOCU1_START_POS_3;
96            count=2;
97        } else {
98            /* four bytes */
99            diff-=BOCU1_REACH_POS_3+1;
100            lead=BOCU1_START_POS_4;
101            count=3;
102        }
103    } else {
104        /* two- and four-byte negative differences */
105        if(diff>=BOCU1_REACH_NEG_2) {
106            /* two bytes */
107            diff-=BOCU1_REACH_NEG_1;
108            lead=BOCU1_START_NEG_2;
109            count=1;
110        } else if(diff>=BOCU1_REACH_NEG_3) {
111            /* three bytes */
112            diff-=BOCU1_REACH_NEG_2;
113            lead=BOCU1_START_NEG_3;
114            count=2;
115        } else {
116            /* four bytes */
117            diff-=BOCU1_REACH_NEG_3;
118            lead=BOCU1_START_NEG_4;
119            count=3;
120        }
121    }
122
123    /* encode the length of the packed result */
124    if(count<3) {
125        result=(count+1)<<24;
126    } else /* count==3, MSB used for the lead byte */ {
127        result=0;
128    }
129
130    /* calculate trail bytes like digits in itoa() */
131    shift=0;
132    do {
133        NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
134        result|=BOCU1_TRAIL_TO_BYTE(m)<<shift;
135        shift+=8;
136    } while(--count>0);
137
138    /* add lead byte */
139    result|=(lead+diff)<<shift;
140
141    return result;
142}
143
144/**
145 * BOCU-1 encoder function.
146 *
147 * @param pPrev pointer to the integer that holds
148 *        the "previous code point" state;
149 *        the initial value should be 0 which
150 *        encodeBocu1 will set to the actual BOCU-1 initial state value
151 * @param c the code point to encode
152 * @return the packed 1/2/3/4-byte encoding, see packDiff(),
153 *         or 0 if an error occurs
154 *
155 * @see packDiff
156 */
157U_CFUNC int32_t
158encodeBocu1(int32_t *pPrev, int32_t c) {
159    int32_t prev;
160
161    if(pPrev==NULL || c<0 || c>0x10ffff) {
162        /* illegal argument */
163        return 0;
164    }
165
166    prev=*pPrev;
167    if(prev==0) {
168        /* lenient handling of initial value 0 */
169        prev=*pPrev=BOCU1_ASCII_PREV;
170    }
171
172    if(c<=0x20) {
173        /*
174         * ISO C0 control & space:
175         * Encode directly for MIME compatibility,
176         * and reset state except for space, to not disrupt compression.
177         */
178        if(c!=0x20) {
179            *pPrev=BOCU1_ASCII_PREV;
180        }
181        return 0x01000000|c;
182    }
183
184    /*
185     * all other Unicode code points c==U+0021..U+10ffff
186     * are encoded with the difference c-prev
187     *
188     * a new prev is computed from c,
189     * placed in the middle of a 0x80-block (for most small scripts) or
190     * in the middle of the Unihan and Hangul blocks
191     * to statistically minimize the following difference
192     */
193    *pPrev=bocu1Prev(c);
194    return packDiff(c-prev);
195}
196
197/**
198 * Function for BOCU-1 decoder; handles multi-byte lead bytes.
199 *
200 * @param pRx pointer to the decoder state structure
201 * @param b lead byte;
202 *          BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<=BOCU1_MAX_LEAD
203 * @return -1 (state change only)
204 *
205 * @see decodeBocu1
206 */
207static int32_t
208decodeBocu1LeadByte(Bocu1Rx *pRx, uint8_t b) {
209    int32_t c, count;
210
211    if(b>=BOCU1_START_NEG_2) {
212        /* positive difference */
213        if(b<BOCU1_START_POS_3) {
214            /* two bytes */
215            c=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
216            count=1;
217        } else if(b<BOCU1_START_POS_4) {
218            /* three bytes */
219            c=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;
220            count=2;
221        } else {
222            /* four bytes */
223            c=BOCU1_REACH_POS_3+1;
224            count=3;
225        }
226    } else {
227        /* negative difference */
228        if(b>=BOCU1_START_NEG_3) {
229            /* two bytes */
230            c=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
231            count=1;
232        } else if(b>BOCU1_MIN) {
233            /* three bytes */
234            c=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2;
235            count=2;
236        } else {
237            /* four bytes */
238            c=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
239            count=3;
240        }
241    }
242
243    /* set the state for decoding the trail byte(s) */
244    pRx->diff=c;
245    pRx->count=count;
246    return -1;
247}
248
249/**
250 * Function for BOCU-1 decoder; handles multi-byte trail bytes.
251 *
252 * @param pRx pointer to the decoder state structure
253 * @param b trail byte
254 * @return result value, same as decodeBocu1
255 *
256 * @see decodeBocu1
257 */
258static int32_t
259decodeBocu1TrailByte(Bocu1Rx *pRx, uint8_t b) {
260    int32_t t, c, count;
261
262    if(b<=0x20) {
263        /* skip some C0 controls and make the trail byte range contiguous */
264        t=bocu1ByteToTrail[b];
265        if(t<0) {
266            /* illegal trail byte value */
267            pRx->prev=BOCU1_ASCII_PREV;
268            pRx->count=0;
269            return -99;
270        }
271#if BOCU1_MAX_TRAIL<0xff
272    } else if(b>BOCU1_MAX_TRAIL) {
273        return -99;
274#endif
275    } else {
276        t=(int32_t)b-BOCU1_TRAIL_BYTE_OFFSET;
277    }
278
279    /* add trail byte into difference and decrement count */
280    c=pRx->diff;
281    count=pRx->count;
282
283    if(count==1) {
284        /* final trail byte, deliver a code point */
285        c=pRx->prev+c+t;
286        if(0<=c && c<=0x10ffff) {
287            /* valid code point result */
288            pRx->prev=bocu1Prev(c);
289            pRx->count=0;
290            return c;
291        } else {
292            /* illegal code point result */
293            pRx->prev=BOCU1_ASCII_PREV;
294            pRx->count=0;
295            return -99;
296        }
297    }
298
299    /* intermediate trail byte */
300    if(count==2) {
301        pRx->diff=c+t*BOCU1_TRAIL_COUNT;
302    } else /* count==3 */ {
303        pRx->diff=c+t*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT;
304    }
305    pRx->count=count-1;
306    return -1;
307}
308
309/**
310 * BOCU-1 decoder function.
311 *
312 * @param pRx pointer to the decoder state structure;
313 *        the initial values should be 0 which
314 *        decodeBocu1 will set to actual initial state values
315 * @param b an input byte
316 * @return
317 *      0..0x10ffff for a result code point
318 *      -1 if only the state changed without code point output
319 *     <-1 if an error occurs
320 */
321U_CFUNC int32_t
322decodeBocu1(Bocu1Rx *pRx, uint8_t b) {
323    int32_t prev, c, count;
324
325    if(pRx==NULL) {
326        /* illegal argument */
327        return -99;
328    }
329
330    prev=pRx->prev;
331    if(prev==0) {
332        /* lenient handling of initial 0 values */
333        prev=pRx->prev=BOCU1_ASCII_PREV;
334        count=pRx->count=0;
335    } else {
336        count=pRx->count;
337    }
338
339    if(count==0) {
340        /* byte in lead position */
341        if(b<=0x20) {
342            /*
343             * Direct-encoded C0 control code or space.
344             * Reset prev for C0 control codes but not for space.
345             */
346            if(b!=0x20) {
347                pRx->prev=BOCU1_ASCII_PREV;
348            }
349            return b;
350        }
351
352        /*
353         * b is a difference lead byte.
354         *
355         * Return a code point directly from a single-byte difference.
356         *
357         * For multi-byte difference lead bytes, set the decoder state
358         * with the partial difference value from the lead byte and
359         * with the number of trail bytes.
360         *
361         * For four-byte differences, the signedness also affects the
362         * first trail byte, which has special handling farther below.
363         */
364        if(b>=BOCU1_START_NEG_2 && b<BOCU1_START_POS_2) {
365            /* single-byte difference */
366            c=prev+((int32_t)b-BOCU1_MIDDLE);
367            pRx->prev=bocu1Prev(c);
368            return c;
369        } else if(b==BOCU1_RESET) {
370            /* only reset the state, no code point */
371            pRx->prev=BOCU1_ASCII_PREV;
372            return -1;
373        } else {
374            return decodeBocu1LeadByte(pRx, b);
375        }
376    } else {
377        /* trail byte in any position */
378        return decodeBocu1TrailByte(pRx, b);
379    }
380}
Note: See TracBrowser for help on using the browser.