| 109 | | UV bom; |
| 110 | | endian = (size == 4) ? 'N' : 'n'; |
| 111 | | bom = enc_unpack(aTHX_ &s,e,size,endian); |
| 112 | | if (bom != BOM_BE) { |
| 113 | | if (bom == BOM16LE) { |
| 114 | | endian = 'v'; |
| 115 | | } |
| 116 | | else if (bom == BOM32LE) { |
| 117 | | endian = 'V'; |
| 118 | | } |
| 119 | | else { |
| 120 | | croak("%"SVf":Unrecognised BOM %"UVxf, |
| 121 | | *hv_fetch((HV *)SvRV(obj),"Name",4,0), |
| 122 | | bom); |
| 123 | | } |
| 124 | | } |
| | 136 | UV bom; |
| | 137 | endian = (size == 4) ? 'N' : 'n'; |
| | 138 | bom = enc_unpack(aTHX_ &s,e,size,endian); |
| | 139 | if (bom != BOM_BE) { |
| | 140 | if (bom == BOM16LE) { |
| | 141 | endian = 'v'; |
| | 142 | } |
| | 143 | else if (bom == BOM32LE) { |
| | 144 | endian = 'V'; |
| | 145 | } |
| | 146 | else { |
| | 147 | croak("%"SVf":Unrecognised BOM %"UVxf, |
| | 148 | *hv_fetch((HV *)SvRV(obj),"Name",4,0), |
| | 149 | bom); |
| | 150 | } |
| | 151 | } |
| 133 | | UV ord = enc_unpack(aTHX_ &s,e,size,endian); |
| 134 | | U8 *d; |
| 135 | | if (issurrogate(ord)) { |
| 136 | | if (ucs2 || size == 4) { |
| 137 | | if (check) { |
| 138 | | croak("%"SVf":no surrogates allowed %"UVxf, |
| 139 | | *hv_fetch((HV *)SvRV(obj),"Name",4,0), |
| 140 | | ord); |
| 141 | | } |
| 142 | | if (s+size <= e) { |
| 143 | | /* skip the next one as well */ |
| 144 | | enc_unpack(aTHX_ &s,e,size,endian); |
| 145 | | } |
| 146 | | ord = FBCHAR; |
| 147 | | } |
| 148 | | else { |
| 149 | | UV lo; |
| 150 | | if (!isHiSurrogate(ord)) { |
| 151 | | if (check) { |
| 152 | | croak("%"SVf":Malformed HI surrogate %"UVxf, |
| 153 | | *hv_fetch((HV *)SvRV(obj),"Name",4,0), |
| 154 | | ord); |
| 155 | | } |
| 156 | | else { |
| 157 | | ord = FBCHAR; |
| 158 | | } |
| 159 | | } |
| 160 | | else { |
| 161 | | if (s+size > e) { |
| 162 | | /* Partial character */ |
| 163 | | s -= size; /* back up to 1st half */ |
| 164 | | break; /* And exit loop */ |
| 165 | | } |
| 166 | | lo = enc_unpack(aTHX_ &s,e,size,endian); |
| 167 | | if (!isLoSurrogate(lo)){ |
| 168 | | if (check) { |
| 169 | | croak("%"SVf":Malformed LO surrogate %"UVxf, |
| 170 | | *hv_fetch((HV *)SvRV(obj),"Name",4,0), |
| 171 | | ord); |
| 172 | | } |
| 173 | | else { |
| 174 | | ord = FBCHAR; |
| 175 | | } |
| 176 | | } |
| 177 | | else { |
| 178 | | ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00); |
| 179 | | } |
| 180 | | } |
| 181 | | } |
| 182 | | } |
| 183 | | |
| 184 | | if ((ord & 0xFFFE) == 0xFFFE || (ord >= 0xFDD0 && ord <= 0xFDEF)) { |
| 185 | | if (check) { |
| 186 | | croak("%"SVf":Unicode character %"UVxf" is illegal", |
| 187 | | *hv_fetch((HV *)SvRV(obj),"Name",4,0), |
| 188 | | ord); |
| 189 | | } else { |
| 190 | | ord = FBCHAR; |
| 191 | | } |
| 192 | | } |
| 193 | | |
| 194 | | d = (U8 *) SvGROW(result,SvCUR(result)+UTF8_MAXLEN+1); |
| 195 | | d = uvuni_to_utf8_flags(d+SvCUR(result), ord, 0); |
| 196 | | SvCUR_set(result,d - (U8 *)SvPVX(result)); |
| 197 | | } |
| | 169 | UV ord = enc_unpack(aTHX_ &s,e,size,endian); |
| | 170 | U8 *d; |
| | 171 | if (issurrogate(ord)) { |
| | 172 | if (ucs2 == -1) { |
| | 173 | ucs2 = SvTRUE(attr("ucs2", 4)); |
| | 174 | } |
| | 175 | if (ucs2 || size == 4) { |
| | 176 | if (check) { |
| | 177 | croak("%"SVf":no surrogates allowed %"UVxf, |
| | 178 | *hv_fetch((HV *)SvRV(obj),"Name",4,0), |
| | 179 | ord); |
| | 180 | } |
| | 181 | if (s+size <= e) { |
| | 182 | /* skip the next one as well */ |
| | 183 | enc_unpack(aTHX_ &s,e,size,endian); |
| | 184 | } |
| | 185 | ord = FBCHAR; |
| | 186 | } |
| | 187 | else { |
| | 188 | UV lo; |
| | 189 | if (!isHiSurrogate(ord)) { |
| | 190 | if (check) { |
| | 191 | croak("%"SVf":Malformed HI surrogate %"UVxf, |
| | 192 | *hv_fetch((HV *)SvRV(obj),"Name",4,0), |
| | 193 | ord); |
| | 194 | } |
| | 195 | else { |
| | 196 | ord = FBCHAR; |
| | 197 | } |
| | 198 | } |
| | 199 | else { |
| | 200 | if (s+size > e) { |
| | 201 | /* Partial character */ |
| | 202 | s -= size; /* back up to 1st half */ |
| | 203 | break; /* And exit loop */ |
| | 204 | } |
| | 205 | lo = enc_unpack(aTHX_ &s,e,size,endian); |
| | 206 | if (!isLoSurrogate(lo)) { |
| | 207 | if (check) { |
| | 208 | croak("%"SVf":Malformed LO surrogate %"UVxf, |
| | 209 | *hv_fetch((HV *)SvRV(obj),"Name",4,0), |
| | 210 | ord); |
| | 211 | } |
| | 212 | else { |
| | 213 | ord = FBCHAR; |
| | 214 | } |
| | 215 | } |
| | 216 | else { |
| | 217 | ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00); |
| | 218 | } |
| | 219 | } |
| | 220 | } |
| | 221 | } |
| | 222 | |
| | 223 | if ((ord & 0xFFFE) == 0xFFFE || (ord >= 0xFDD0 && ord <= 0xFDEF)) { |
| | 224 | if (check) { |
| | 225 | croak("%"SVf":Unicode character %"UVxf" is illegal", |
| | 226 | *hv_fetch((HV *)SvRV(obj),"Name",4,0), |
| | 227 | ord); |
| | 228 | } else { |
| | 229 | ord = FBCHAR; |
| | 230 | } |
| | 231 | } |
| | 232 | |
| | 233 | if (resultbuflen < SvCUR(result) + UTF8_MAXLEN + 1) { |
| | 234 | /* Do not allocate >8Mb more than the minimum needed. |
| | 235 | This prevents allocating too much in the rogue case of a large |
| | 236 | input consisting initially of long sequence uft8-byte unicode |
| | 237 | chars followed by single utf8-byte chars. */ |
| | 238 | STRLEN remaining = (e - s)/usize; |
| | 239 | STRLEN max_alloc = remaining + (8*1024*1024); |
| | 240 | STRLEN est_alloc = remaining * UTF8_MAXLEN; |
| | 241 | STRLEN newlen = SvLEN(result) + /* min(max_alloc, est_alloc) */ |
| | 242 | (est_alloc > max_alloc ? max_alloc : est_alloc); |
| | 243 | resultbuf = (U8 *) SvGROW(result, newlen); |
| | 244 | resultbuflen = SvLEN(result); |
| | 245 | } |
| | 246 | |
| | 247 | d = uvuni_to_utf8_flags(resultbuf+SvCUR(result), ord, 0); |
| | 248 | SvCUR_set(result, d - (U8 *)SvPVX(result)); |
| | 249 | } |
| | 250 | |
| 199 | | /* unlikely to happen because it's fixed-length -- dankogai */ |
| 200 | | if (check & ENCODE_WARN_ON_ERR){ |
| 201 | | Perl_warner(aTHX_ packWARN(WARN_UTF8),"%"SVf":Partial character", |
| 202 | | *hv_fetch((HV *)SvRV(obj),"Name",4,0)); |
| 203 | | } |
| 204 | | } |
| 205 | | if (check && !(check & ENCODE_LEAVE_SRC)){ |
| 206 | | if (s < e) { |
| 207 | | Move(s,SvPVX(str),e-s,U8); |
| 208 | | SvCUR_set(str,(e-s)); |
| 209 | | } |
| 210 | | else { |
| 211 | | SvCUR_set(str,0); |
| 212 | | } |
| 213 | | *SvEND(str) = '\0'; |
| 214 | | } |
| | 252 | /* unlikely to happen because it's fixed-length -- dankogai */ |
| | 253 | if (check & ENCODE_WARN_ON_ERR) { |
| | 254 | Perl_warner(aTHX_ packWARN(WARN_UTF8),"%"SVf":Partial character", |
| | 255 | *hv_fetch((HV *)SvRV(obj),"Name",4,0)); |
| | 256 | } |
| | 257 | } |
| | 258 | if (check && !(check & ENCODE_LEAVE_SRC)) { |
| | 259 | if (s < e) { |
| | 260 | Move(s,SvPVX(str),e-s,U8); |
| | 261 | SvCUR_set(str,(e-s)); |
| | 262 | } |
| | 263 | else { |
| | 264 | SvCUR_set(str,0); |
| | 265 | } |
| | 266 | *SvEND(str) = '\0'; |
| | 267 | } |
| | 268 | |
| | 269 | if (!temp_result) |
| | 270 | shrink_buffer(result); |
| | 271 | |
| 245 | | STRLEN len; |
| 246 | | UV ord = utf8n_to_uvuni(s, e-s, &len, 0); |
| 247 | | s += len; |
| 248 | | if (size != 4 && invalid_ucs2(ord)) { |
| 249 | | if (!issurrogate(ord)){ |
| 250 | | if (ucs2) { |
| 251 | | if (check) { |
| 252 | | croak("%"SVf":code point \"\\x{%"UVxf"}\" too high", |
| 253 | | *hv_fetch((HV *)SvRV(obj),"Name",4,0),ord); |
| 254 | | } |
| 255 | | enc_pack(aTHX_ result,size,endian,FBCHAR); |
| 256 | | }else{ |
| 257 | | UV hi = ((ord - 0x10000) >> 10) + 0xD800; |
| 258 | | UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00; |
| 259 | | enc_pack(aTHX_ result,size,endian,hi); |
| 260 | | enc_pack(aTHX_ result,size,endian,lo); |
| 261 | | } |
| 262 | | } |
| 263 | | else { |
| 264 | | /* not supposed to happen */ |
| 265 | | enc_pack(aTHX_ result,size,endian,FBCHAR); |
| 266 | | } |
| 267 | | } |
| 268 | | else { |
| 269 | | enc_pack(aTHX_ result,size,endian,ord); |
| 270 | | } |
| | 314 | STRLEN len; |
| | 315 | UV ord = utf8n_to_uvuni(s, e-s, &len, 0); |
| | 316 | s += len; |
| | 317 | if (size != 4 && invalid_ucs2(ord)) { |
| | 318 | if (!issurrogate(ord)) { |
| | 319 | if (ucs2 == -1) { |
| | 320 | ucs2 = SvTRUE(attr("ucs2", 4)); |
| | 321 | } |
| | 322 | if (ucs2) { |
| | 323 | if (check) { |
| | 324 | croak("%"SVf":code point \"\\x{%"UVxf"}\" too high", |
| | 325 | *hv_fetch((HV *)SvRV(obj),"Name",4,0),ord); |
| | 326 | } |
| | 327 | enc_pack(aTHX_ result,size,endian,FBCHAR); |
| | 328 | } else { |
| | 329 | UV hi = ((ord - 0x10000) >> 10) + 0xD800; |
| | 330 | UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00; |
| | 331 | enc_pack(aTHX_ result,size,endian,hi); |
| | 332 | enc_pack(aTHX_ result,size,endian,lo); |
| | 333 | } |
| | 334 | } |
| | 335 | else { |
| | 336 | /* not supposed to happen */ |
| | 337 | enc_pack(aTHX_ result,size,endian,FBCHAR); |
| | 338 | } |
| | 339 | } |
| | 340 | else { |
| | 341 | enc_pack(aTHX_ result,size,endian,ord); |
| | 342 | } |
| 273 | | /* UTF-8 partial char happens often on PerlIO. |
| 274 | | Since this is okay and normal, we do not warn. |
| 275 | | But this is critical when you choose to LEAVE_SRC |
| 276 | | in which case we die */ |
| 277 | | if (check & (ENCODE_DIE_ON_ERR|ENCODE_LEAVE_SRC)){ |
| 278 | | Perl_croak(aTHX_ "%"SVf":partial character is not allowed " |
| 279 | | "when CHECK = 0x%" UVuf, |
| 280 | | *hv_fetch((HV *)SvRV(obj),"Name",4,0), check); |
| 281 | | } |
| 282 | | |
| 283 | | } |
| 284 | | if (check && !(check & ENCODE_LEAVE_SRC)){ |
| 285 | | if (s < e) { |
| 286 | | Move(s,SvPVX(utf8),e-s,U8); |
| 287 | | SvCUR_set(utf8,(e-s)); |
| 288 | | } |
| 289 | | else { |
| 290 | | SvCUR_set(utf8,0); |
| 291 | | } |
| 292 | | *SvEND(utf8) = '\0'; |
| 293 | | } |
| | 345 | /* UTF-8 partial char happens often on PerlIO. |
| | 346 | Since this is okay and normal, we do not warn. |
| | 347 | But this is critical when you choose to LEAVE_SRC |
| | 348 | in which case we die */ |
| | 349 | if (check & (ENCODE_DIE_ON_ERR|ENCODE_LEAVE_SRC)) { |
| | 350 | Perl_croak(aTHX_ "%"SVf":partial character is not allowed " |
| | 351 | "when CHECK = 0x%" UVuf, |
| | 352 | *hv_fetch((HV *)SvRV(obj),"Name",4,0), check); |
| | 353 | } |
| | 354 | } |
| | 355 | if (check && !(check & ENCODE_LEAVE_SRC)) { |
| | 356 | if (s < e) { |
| | 357 | Move(s,SvPVX(utf8),e-s,U8); |
| | 358 | SvCUR_set(utf8,(e-s)); |
| | 359 | } |
| | 360 | else { |
| | 361 | SvCUR_set(utf8,0); |
| | 362 | } |
| | 363 | *SvEND(utf8) = '\0'; |
| | 364 | } |
| | 365 | |
| | 366 | if (!temp_result) |
| | 367 | shrink_buffer(result); |
| | 368 | |