Changeset 38929
- Timestamp:
- 05/22/11 08:21:44 (2 years ago)
- Location:
- lang/perl/Encode/trunk
- Files:
-
- 5 modified
-
Changes (modified) (1 diff)
-
Encode.pm (modified) (31 diffs)
-
Encode.xs (modified) (1 diff)
-
Unicode/Unicode.xs (modified) (1 diff)
-
lib/Encode/Alias.pm (modified) (2 diffs)
Legend:
- Unmodified
- Added
- Removed
-
lang/perl/Encode/trunk/Changes
r38757 r38929 1 1 # Revision history for Perl extension Encode. 2 2 # 3 # $Id: Changes,v 2.4 2 2010/12/31 22:48:48dankogai Exp dankogai $3 # $Id: Changes,v 2.43 2011/05/21 23:14:43 dankogai Exp dankogai $ 4 4 # 5 $Revision: 2.42 $ $Date: 2010/12/31 22:48:48 $ 5 $Revision: 2.43 $ $Date: 2011/05/21 23:14:43 $ 6 ! lib/Encode/Alias.pm 7 Addressed RT#68361: Encode::Bytes x-mac-... aliases missing 8 https://rt.cpan.org/Ticket/Display.html?id=68361 9 ! Encode.pm 10 Applied the 0001-Fix-typo-in-pod.patch 11 https://rt.cpan.org/Ticket/Update.html?id=64381 12 Addressed RT#65796 Deep recursion error finding invalid charset 13 https://rt.cpan.org/Ticket/Update.html?id=65796 14 Applied a jumbo doc patch by Tom Christiansen 15 Message-Id: <14795.1304618434@chthon> 16 17 2.42 2010/12/31 22:48:48 6 18 ! Encode.xs 7 19 ! Unicode/Unicode.xs -
lang/perl/Encode/trunk/Encode.pm
r38757 r38929 1 1 # 2 # $Id: Encode.pm,v 2.4 2 2010/12/31 22:48:10 dankogai Exp$2 # $Id: Encode.pm,v 2.43 2011/05/21 23:14:43 dankogai Exp dankogai $ 3 3 # 4 4 package Encode; 5 5 use strict; 6 6 use warnings; 7 our $VERSION = sprintf "%d.%02d", q$Revision: 2.4 2$ =~ /(\d+)/g;7 our $VERSION = sprintf "%d.%02d", q$Revision: 2.43 $ =~ /(\d+)/g; 8 8 sub DEBUG () { 0 } 9 9 use XSLoader (); … … 69 69 else { 70 70 %enc = %Encoding; 71 for my $mod ( map { m/::/ o? $_ : "Encode::$_" } @_ ) {71 for my $mod ( map { m/::/ ? $_ : "Encode::$_" } @_ ) { 72 72 DEBUG and warn $mod; 73 73 for my $enc ( keys %ExtModule ) { … … 101 101 sub getEncoding { 102 102 my ( $class, $name, $skip_external ) = @_; 103 104 $name =~ s/\s+//g; # https://rt.cpan.org/Ticket/Display.html?id=65796 103 105 104 106 ref($name) && $name->can('renew') and return $name; … … 335 337 bless { Name => "utf8" } => "Encode::utf8"; 336 338 $Encode::Encoding{"utf-8-strict"} = 337 bless { Name => "utf-8-strict", strict_utf8 => 1 } =>338 "Encode::utf8";339 bless { Name => "utf-8-strict", strict_utf8 => 1 } 340 => "Encode::utf8"; 339 341 } 340 342 } … … 346 348 =head1 NAME 347 349 348 Encode - character encodings 350 Encode - character encodings in Perl 349 351 350 352 =head1 SYNOPSIS … … 354 356 =head2 Table of Contents 355 357 356 Encode consists of a collection of modules whose details are too big357 to fit in one document. This PODitself explains the top-level APIs358 Encode consists of a collection of modules whose details are too extensive 359 to fit in one document. This one itself explains the top-level APIs 358 360 and general topics at a glance. For other topics and more details, 359 see the PODs below:361 see the documentation for these modules: 360 362 361 363 Name Description … … 372 374 =head1 DESCRIPTION 373 375 374 The C<Encode> module provides the interface s between Perl'sstrings376 The C<Encode> module provides the interface between Perl strings 375 377 and the rest of the system. 
Perl strings are sequences of 376 B<characters>.377 378 The repertoire of characters that Perl can represent is a t least that378 I<characters>. 379 380 The repertoire of characters that Perl can represent is a superset of those 379 381 defined by the Unicode Consortium. On most platforms the ordinal 380 values of the characters (as returned by C<ord(ch)>) is the "Unicode381 codepoint " for the character (the exceptions are those platforms where382 the legacy encoding is some variant of EBCDIC rather than a super -set383 of ASCII - see L<perlebcdic>).384 385 Traditionally, computer data has been moved around in 8-bit chunks 386 often called "bytes" . These chunks are also known as "octets" in387 networking standards. Perl is widely used to manipulate data of many 388 types - not only strings of characters representing human or computer 389 languages but also "binary" data being the machine's representation of 390 numbers, pixels in an image - orjust about anything.382 values of a character as returned by C<ord(I<S>)> is the I<Unicode 383 codepoint> for that character. The exceptions are platforms where 384 the legacy encoding is some variant of EBCDIC rather than a superset 385 of ASCII; see L<perlebcdic>. 386 387 During recent history, data is moved around a computer in 8-bit chunks, 388 often called "bytes" but also known as "octets" in standards documents. 389 Perl is widely used to manipulate data of many types: not only strings of 390 characters representing human or computer languages, but also "binary" 391 data, being the machine's representation of numbers, pixels in an image, or 392 just about anything. 391 393 392 394 When Perl is processing "binary data", the programmer wants Perl to 393 process "sequences of bytes". This is not a problem for Perl - asa395 process "sequences of bytes". This is not a problem for Perl: because a 394 396 byte has 256 possible values, it easily fits in Perl's much larger 395 397 "logical character". 
… … 401 403 =item * 402 404 403 I<character>: a character in the range 0 ..(2**32-1) (or more).404 (What Perl's strings are made of.) 405 I<character>: a character in the range 0 .. 2**32-1 (or more); 406 what Perl's strings are made of. 405 407 406 408 =item * 407 409 408 I<byte>: a character in the range 0..255 409 (A special case of a Perl character.) 410 I<byte>: a character in the range 0..255; 411 A special case of a Perl character. 410 412 411 413 =item * 412 414 413 I<octet>: 8 bits of data, with ordinal values 0..255 414 (Term for bytes passed to or from a non-Perl context, e.g. a disk file.) 415 I<octet>: 8 bits of data, with ordinal values 0..255; 416 Term for bytes passed to or from a non-Perl context, such as a disk file. 415 417 416 418 =back 417 419 418 =head1 PERL ENCODING API420 =head1 THE PERL ENCODING API 419 421 420 422 =over 2 421 423 422 =item $octets = encode(ENCODING, $string[, CHECK])423 424 Encodes a string from Perl's internal form into I<ENCODING> and returns425 a sequence of octets. ENCODING can be either a canonical name or 426 an alias. For encoding names and aliases, see L</"Defining Aliases">. 427 For CHECK, see L</"Handling Malformed Data">.428 429 For example, to convert a string from Perl's internal format to430 iso-8859-1 (also known as Latin1), 424 =item $octets = encode(ENCODING, STRING[, CHECK]) 425 426 Encodes the scalar value I<STRING> from Perl's internal form into 427 I<ENCODING> and returns a sequence of octets. I<ENCODING> can be either a 428 canonical name or an alias. For encoding names and aliases, see 429 L</"Defining Aliases">. For CHECK, see L</"Handling Malformed Data">. 430 431 For example, to convert a string from Perl's internal format into 432 ISO-8859-1, also known as Latin1: 431 433 432 434 $octets = encode("iso-8859-1", $string); 433 435 434 436 B<CAVEAT>: When you run C<$octets = encode("utf8", $string)>, then 435 $octets B<may not be equal to> $string. 
Though they both contain the 436 same data, the UTF8 flag for $octets is B<always> off. When you 437 encode anything, UTF8 flag of the result is always off, even when it 438 contains completely valid utf8 string. See L</"The UTF8 flag"> below. 439 440 If the $string is C<undef> then C<undef> is returned. 441 442 =item $string = decode(ENCODING, $octets [, CHECK]) 443 444 Decodes a sequence of octets assumed to be in I<ENCODING> into Perl's 445 internal form and returns the resulting string. As in encode(), 446 ENCODING can be either a canonical name or an alias. For encoding names 447 and aliases, see L</"Defining Aliases">. For CHECK, see 448 L</"Handling Malformed Data">. 449 450 For example, to convert ISO-8859-1 data to a string in Perl's internal format: 437 $octets I<might not be equal to> $string. Though both contain the 438 same data, the UTF8 flag for $octets is I<always> off. When you 439 encode anything, the UTF8 flag on the result is always off, even when it 440 contains a completely valid utf8 string. See L</"The UTF8 flag"> below. 441 442 If the $string is C<undef>, then C<undef> is returned. 443 444 =item $string = decode(ENCODING, OCTETS[, CHECK]) 445 446 This function returns the string that results from decoding the scalar 447 value I<OCTETS>, assumed to be a sequence of octets in I<ENCODING>, into 448 Perl's internal form. It returns the resulting string. As with encode(), 449 I<ENCODING> can be either a canonical name or an alias. For encoding names 450 and aliases, see L</"Defining Aliases">; for I<CHECK>, see L</"Handling 451 Malformed Data">. 452 453 For example, to convert ISO-8859-1 data into a string in Perl's 454 internal format: 451 455 452 456 $string = decode("iso-8859-1", $octets); 453 457 454 458 B<CAVEAT>: When you run C<$string = decode("utf8", $octets)>, then $string 455 B<may not be equal to> $octets.
Though they both contain the same data, 456 the UTF8 flag for $string is on unless $octets entirely consists of 457 ASCII data (or EBCDIC on EBCDIC machines). See L</"The UTF8 flag">459 I<might not be equal to> $octets. Though both contain the same data, the 460 UTF8 flag for $string is on unless $octets consists entirely of ASCII data 461 on ASCII machines or EBCDIC on EBCDIC machines. See L</"The UTF8 flag"> 458 462 below. 459 463 460 If the $string is C<undef> then C<undef> is returned.464 If the $string is C<undef>, then C<undef> is returned. 461 465 462 466 =item [$obj =] find_encoding(ENCODING) 463 467 464 Returns the I<encoding object> corresponding to ENCODING. Returns 465 undef if no matching ENCODING is find. 466 467 This object is what actually does the actual (en|de)coding. 468 Returns the I<encoding object> corresponding to I<ENCODING>. Returns 469 C<undef> if no matching I<ENCODING> is found. The returned object is 470 what does the actual encoding or decoding. 468 471 469 472 $utf8 = decode($name, $bytes); … … 471 474 is in fact 472 475 473 $utf8 = do{474 $obj = find_encoding($name);475 croak qq(encoding "$name" not found) unless ref $obj;476 $obj->decode($bytes)477 };476 $utf8 = do { 477 $obj = find_encoding($name); 478 croak qq(encoding "$name" not found) unless ref $obj; 479 $obj->decode($bytes); 480 }; 478 481 479 482 with more error checking. 480 483 481 Therefore you cansave time by reusing this object as follows;482 483 my $enc = find_encoding("iso-8859-1");484 while(<>){485 my $utf8 = $enc->decode($_);486 # and do someting with $utf8;487 }484 You can therefore save time by reusing this object as follows: 485 486 my $enc = find_encoding("iso-8859-1"); 487 while(<>) { 488 my $utf8 = $enc->decode($_); 489 ... # now do something with $utf8; 490 } 488 491 489 492 Besides C<< ->decode >> and C<< ->encode >>, other methods are 490 available as well.
For instance, C<< ->name >> returns the canonical 491 494 name of the encoding object. 492 495 … … 497 500 =item [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK]) 498 501 499 Converts B<in-place> data between two encodings. The data in $octets500 must be encoded as octets and notas characters in Perl's internal501 format. For example, to convert ISO-8859-1 data to Microsoft's CP1250502 Converts I<in-place> data between two encodings. The data in $octets 503 must be encoded as octets and I<not> as characters in Perl's internal 504 format. For example, to convert ISO-8859-1 data into Microsoft's CP1250 502 505 encoding: 503 506 … … 508 511 from_to($octets, "cp1250", "iso-8859-1"); 509 512 510 Note that because the conversion happens in place, the data to be511 converted cannot be a string constant ;it must be a scalar variable.512 513 from_to() returns the length of the converted string in octets on 514 success, I<undef> on error.515 516 B<CAVEAT>: The following operations look the same but are not quite so;513 Because the conversion happens in place, the data to be 514 converted cannot be a string constant: it must be a scalar variable. 515 516 from_to() returns the length of the converted string in octets on success, 517 and C<undef> on error. 518 519 B<CAVEAT>: The following operations may look the same, but are not: 517 520 518 521 from_to($data, "iso-8859-1", "utf8"); #1 519 522 $data = decode("iso-8859-1", $data); #2 520 523 521 Both #1 and #2 make $data consist of a completely valid UTF-8 string 522 but only #2 turns UTF8 flag on. #1 is equivalent to524 Both #1 and #2 make $data consist of a completely valid UTF-8 string, 525 but only #2 turns the UTF8 flag on. #1 is equivalent to: 523 526 524 527 $data = encode("utf8", decode("iso-8859-1", $data)); … … 526 529 See L</"The UTF8 flag"> below. 
527 530 528 Also note that 531 Also note that: 529 532 530 533 from_to($octets, $from, $to, $check); 531 534 532 is equivalent t o535 is equivalent to: 533 536 534 537 $octets = encode($to, decode($from, $octets), $check); 535 538 536 Yes, it does notrespect the $check during decoding. It is537 deliberately done that way. If you need minute control, C<decode>538 then C<encode> as follows; 539 Yes, it does I<not> respect the $check during decoding. It is 540 deliberately done that way. If you need minute control, use C<decode> 541 followed by C<encode> as follows: 539 542 540 543 $octets = encode($to, decode($from, $octets, $check_from), $check_to); … … 542 545 =item $octets = encode_utf8($string); 543 546 544 Equivalent to C<$octets = encode("utf8", $string);> The characters 545 that comprise $string are encoded in Perl's internal format and the 546 result is returned as a sequence of octets. All possible 547 characters have a UTF-8 representation so this function cannot fail. 548 547 Equivalent to C<$octets = encode("utf8", $string)>. The characters in 548 $string are encoded in Perl's internal format, and the result is returned 549 as a sequence of octets. Because all possible characters in Perl have a 550 (loose, not strict) UTF-8 representation, this function cannot fail. 549 551 550 552 =item $string = decode_utf8($octets [, CHECK]); 551 553 552 equivalent to C<$string = decode("utf8", $octets [, CHECK])>.553 The sequence of octets represented by 554 $octets is decoded from UTF-8 into a sequence of logical 555 characters. Not all sequences of octets form valid UTF-8 encodings, so 556 it is possible for this call to fail. For CHECK, see557 L</"Handling Malformed Data">.554 Equivalent to C<$string = decode("utf8", $octets [, CHECK])>. 555 The sequence of octets represented by $octets is decoded 556 from UTF-8 into a sequence of logical characters. 557 Because not all sequences of octets are valid UTF-8, 558 it is quite possible for this function to fail.
559 For CHECK, see L</"Handling Malformed Data">. 558 560 559 561 =back … … 564 566 @list = Encode->encodings(); 565 567 566 Returns a list of the canonical names of the available encodings that567 are loaded. To get a list of all available encodings including the 568 ones that are not loaded yet, say 568 Returns a list of canonical names of available encodings that have already 569 been loaded. To get a list of all available encodings including those that 570 have not yet been loaded, say: 569 571 570 572 @all_encodings = Encode->encodings(":all"); 571 573 572 Or you can give the name of a specific module .574 Or you can give the name of a specific module: 573 575 574 576 @with_jp = Encode->encodings("Encode::JP"); 575 577 576 When " ::" is not in the name, "Encode::" is assumed.578 When "C<::>" is not in the name, "C<Encode::>" is assumed. 577 579 578 580 @ebcdic = Encode->encodings("EBCDIC"); … … 587 589 use Encode; 588 590 use Encode::Alias; 589 define_alias( newName=> ENCODING);590 591 After that, newName can be used as an alias for ENCODING.592 ENCODINGmay be either the name of an encoding or an593 I<encoding object> 594 595 B ut before you do so, make sure the alias is nonexistent with591 define_alias(NEWNAME => ENCODING); 592 593 After that, I<NEWNAME> can be used as an alias for I<ENCODING>. 594 I<ENCODING> may be either the name of an encoding or an 595 I<encoding object>. 596 597 Before you do that, first make sure the alias is nonexistent using 596 598 C<resolve_alias()>, which returns the canonical name thereof. 597 i.e. 599 For example: 598 600 599 601 Encode::resolve_alias("latin1") eq "iso-8859-1" # true … … 602 604 603 605 resolve_alias() does not need C<use Encode::Alias>; it can be 604 exported via C<use Encode qw(resolve_alias)>.606 imported via C<use Encode qw(resolve_alias)>. 605 607 606 608 See L<Encode::Alias> for details.
… … 609 611 610 612 The canonical name of a given encoding does not necessarily agree with 611 IANA IANACharacter Set Registry, commonly seen as C<< Content-Type:612 text/plain; charset=I< whatever> >>. For most cases canonical names613 work but sometimes it does not (notably 'utf-8-strict').614 615 Therefore as of Encode version 2.21, a new method C<mime_name()> isadded.613 IANA Character Set Registry, commonly seen as C<< Content-Type: 614 text/plain; charset=I<WHATEVER> >>. For most cases, the canonical name 615 works, but sometimes it does not, most notably with "utf-8-strict". 616 617 As of C<Encode> version 2.21, a new method C<mime_name()> is thereforeadded. 616 618 617 619 use Encode; 618 my $enc = find_encoding( 'UTF-8');620 my $enc = find_encoding("UTF-8"); 619 621 warn $enc->name; # utf-8-strict 620 622 warn $enc->mime_name; # UTF-8 … … 624 626 =head1 Encoding via PerlIO 625 627 626 If your perl supports I<PerlIO> (which is the default), you can use a 627 PerlIO layer to decode and encode directly via a filehandle. The 628 following two examples are totally identical in their functionality. 629 630 # via PerlIO 631 open my $in, "<:encoding(shiftjis)", $infile or die; 632 open my $out, ">:encoding(euc-jp)", $outfile or die; 633 while(<$in>){ print $out $_; } 634 635 # via from_to 636 open my $in, "<", $infile or die; 637 open my $out, ">", $outfile or die; 638 while(<$in>){ 639 from_to($_, "shiftjis", "euc-jp", 1); 640 print $out $_; 641 } 642 643 Unfortunately, it may be that encodings are PerlIO-savvy. You can check 644 if your encoding is supported by PerlIO by calling the C<perlio_ok> 645 method. 646 647 Encode::perlio_ok("hz"); # False 648 find_encoding("euc-cn")->perlio_ok; # True where PerlIO is available 649 650 use Encode qw(perlio_ok); # exported upon request 628 If your perl supports C<PerlIO> (which is the default), you can use a 629 C<PerlIO> layer to decode and encode directly via a filehandle. 
The 630 following two examples are fully identical in functionality: 631 632 ### Version 1 via PerlIO 633 open(INPUT, "< :encoding(shiftjis)", $infile) 634 || die "Can't open < $infile for reading: $!"; 635 open(OUTPUT, "> :encoding(euc-jp)", $outfile) 636 || die "Can't open > $outfile for writing: $!"; 637 while (<INPUT>) { # auto decodes $_ 638 print OUTPUT; # auto encodes $_ 639 } 640 close(INPUT) || die "can't close $infile: $!"; 641 close(OUTPUT) || die "can't close $outfile: $!"; 642 643 ### Version 2 via from_to() 644 open(INPUT, "< :raw", $infile) 645 || die "Can't open < $infile for reading: $!"; 646 open(OUTPUT, "> :raw", $outfile) 647 || die "Can't open > $outfile for writing: $!"; 648 649 while (<INPUT>) { 650 from_to($_, "shiftjis", "euc-jp", 1); # switch encoding 651 print OUTPUT; # emit raw (but properly encoded) data 652 } 653 close(INPUT) || die "can't close $infile: $!"; 654 close(OUTPUT) || die "can't close $outfile: $!"; 655 656 In the first version above, you let the appropriate encoding layer 657 handle the conversion. In the second, you explicitly translate 658 from one encoding to the other. 659 660 Unfortunately, it may be that not all encodings are C<PerlIO>-savvy. You can check 661 to see whether your encoding is supported by C<PerlIO> by invoking the 662 C<perlio_ok> method on it: 663 664 Encode::perlio_ok("hz"); # false 665 find_encoding("euc-cn")->perlio_ok; # true wherever PerlIO is available 666 667 use Encode qw(perlio_ok); # imported upon request 651 668 perlio_ok("euc-jp") 652 669 653 Fortunately, all encodings that come with Encode core are PerlIO-savvy654 except for hz and ISO-2022-kr. Forgory details, see670 Fortunately, all encodings that come with C<Encode> core are C<PerlIO>-savvy 671 except for "hz" and "ISO-2022-kr". For the gory details, see 655 672 L<Encode::Encoding> and L<Encode::PerlIO>.
656 673 657 674 =head1 Handling Malformed Data 658 675 659 The optional I<CHECK> argument tells Encode what to do when it 660 encounters malformed data. Without CHECK, Encode::FB_DEFAULT ( == 0 ) 661 is assumed. 662 663 As of version 2.12 Encode supports coderef values for CHECK. See below. 676 The optional I<CHECK> argument tells C<Encode> what to do when 677 encountering malformed data. Without I<CHECK>, C<Encode::FB_DEFAULT> 678 (== 0) is assumed. 679 680 As of version 2.12, C<Encode> supports coderef values for C<CHECK>; 681 see below. 664 682 665 683 =over 2 … … 678 696 =item I<CHECK> = Encode::FB_DEFAULT ( == 0) 679 697 680 If I<CHECK> is 0, (en|de)code will put a I<substitution character> in681 place of a malformed character. When you encode, E<lt>subcharE<gt> 682 will be used. When you decode the code point C<0xFFFD> is used. If 683 the data is supposed to be UTF-8, an optional lexical warning 684 (category utf8)is given.698 If I<CHECK> is 0, encoding and decoding replace any malformed character 699 with a I<substitution character>. When you encode, I<SUBCHAR> is used. 700 When you decode, the Unicode REPLACEMENT CHARACTER, code point U+FFFD, is 701 used. If the data is supposed to be UTF-8, an optional lexical warning of 702 warning category C<"utf8"> is given. 685 703 686 704 =item I<CHECK> = Encode::FB_CROAK ( == 1) 687 705 688 If I<CHECK> is 1, methods will die on error immediatelywith an error689 message. Therefore, when I<CHECK> is set to 1, you should trap the690 e rror with eval{} unless you really want to let it die.706 If I<CHECK> is 1, methods immediately die with an error 707 message. Therefore, when I<CHECK> is 1, you should trap 708 exceptions with C<eval{}>, unless you really want to let it C<die>. 
691 709 692 710 =item I<CHECK> = Encode::FB_QUIET 693 711 694 If I<CHECK> is set to Encode::FB_QUIET, (en|de)code willimmediately712 If I<CHECK> is set to C<Encode::FB_QUIET>, encoding and decoding immediately 695 713 return the portion of the data that has been processed so far when an 696 error occurs. The data argument will beoverwritten with everything697 after that point (that is, the unprocessed part of data). This is698 handy when you have to call decoderepeatedly in the case where your714 error occurs. The data argument is overwritten with everything 715 after that point; that is, the unprocessed portion of the data. This is 716 handy when you have to call C<decode> repeatedly in the case where your 699 717 source data may contain partial multi-byte character sequences, 700 ( i.e. you are reading with a fixed-width buffer). Here is asample701 code t hat does exactly this:702 703 my $buffer = ''; my $string = '';704 while(read $fh, $buffer, 256, length($buffer)){705 $string .= decode($encoding, $buffer, Encode::FB_QUIET);706 # $buffer now contains the unprocessed partial character707 }718 (that is, you are reading with a fixed-width buffer). Here's some sample 719 code to do exactly that: 720 721 my($buffer, $string) = ("", ""); 722 while (read($fh, $buffer, 256, length($buffer))) { 723 $string .= decode($encoding, $buffer, Encode::FB_QUIET); 724 # $buffer now contains the unprocessed partial character 725 } 708 726 709 727 =item I<CHECK> = Encode::FB_WARN 710 728 711 This is the same as above, except that it warns on error. Handy when712 you are debugging the mode above.729 This is the same as C<FB_QUIET> above, except that instead of being silent 730 on errors, it issues a warning. This is handy for when you are debugging. 
713 731 714 732 =item perlqq mode (I<CHECK> = Encode::FB_PERLQQ) … … 718 736 =item XML charref mode (I<CHECK> = Encode::FB_XMLCREF) 719 737 720 For encodings that are implemented by Encode::XS, CHECK ==721 Encode::FB_PERLQQ turns (en|de)codeinto C<perlqq> fallback mode.722 723 When you decode, C<\xI<HH>> will be inserted for a malformed character,724 where I<HH> is the hex representation of the octet that could not be 725 decoded to utf8. And when you encode, C<\x{I<HHHH>}> will be inserted, 726 where I<HHHH> is the Unicode ID of the character that cannot be found 727 in the character repertoire of the encoding.728 729 HTML/XML character reference modes are about the same, in place of730 C<\x{I<HHHH>}>, HTML uses C<&#I<NNN>;> where I<NNN> is a decimal number and738 For encodings that are implemented by the C<Encode::XS> module, C<CHECK> C<==> 739 C<Encode::FB_PERLQQ> puts C<encode> and C<decode> into C<perlqq> fallback mode. 740 741 When you decode, C<\xI<HH>> is inserted for a malformed character, where 742 I<HH> is the hex representation of the octet that could not be decoded to 743 utf8. When you encode, C<\x{I<HHHH>}> will be inserted, where I<HHHH> is 744 the Unicode code point (in any number of hex digits) of the character that 745 cannot be found in the character repertoire of the encoding. 746 747 The HTML/XML character reference modes are about the same. In place of 748 C<\x{I<HHHH>}>, HTML uses C<&#I<NNN>;> where I<NNN> is a decimal number, and 731 749 XML uses C<&#xI<HHHH>;> where I<HHHH> is the hexadecimal number. 732 750 733 In Encode2.10 or later, C<LEAVE_SRC> is also implied.751 In C<Encode> 2.10 or later, C<LEAVE_SRC> is also implied. 734 752 735 753 =item The bitmask 736 754 737 These modes are a ctually set via a bitmask. Here is how the FB_XX738 constants are laid out. You can import the FB_XXconstants via739 C<use Encode qw(:fallbacks)> ;you can import the generic bitmask755 These modes are all actually set via a bitmask. 
Here is how the C<FB_I<XXX>> 756 constants are laid out. You can import the C<FB_I<XXX>> constants via 757 C<use Encode qw(:fallbacks)>, and you can import the generic bitmask 740 758 constants via C<use Encode qw(:fallback_all)>. 741 759 … … 755 773 =item Encode::LEAVE_SRC 756 774 757 If the C<Encode::LEAVE_SRC> bit is not set, but I<CHECK> is, then the second758 argument to C<encode()> or C<decode()> may be assigned to by the functions. If 759 you're not interested in this, then bitwise-or the bitmask with it.775 If the C<Encode::LEAVE_SRC> bit is I<not> set but I<CHECK> is set, then the 776 second argument to encode() or decode() will be overwritten in place. 777 If you're not interested in this, then bitwise-OR it with the bitmask. 760 778 761 779 =back … … 763 781 =head2 coderef for CHECK 764 782 765 As of Encode 2.12 CHECKcan also be a code reference which takes the766 ord value ofunmapped caharacter as an argument and returns a string767 that represents the fallback character. For instance ,783 As of C<Encode> 2.12, C<CHECK> can also be a code reference which takes the 784 ordinal value of the unmapped character as an argument and returns a string 785 that represents the fallback character. For instance: 768 786 769 787 $ascii = encode("ascii", $utf8, sub{ sprintf "<U+%04X>", shift }); 770 788 771 Acts like FB_PERLQQ but E<lt>U+I<XXXX>E<gt> is used instead of 772 \x{I<XXXX>}. 789 Acts like C<FB_PERLQQ> but E<lt>U+I<XXXX>E<gt> is used instead of C<\x{I<XXXX>}>. 773 790 774 791 =head1 Defining Encodings … … 777 794 778 795 use Encode qw(define_encoding); 779 define_encoding($object, 'canonicalName'[, alias...]);780 781 I< canonicalName> will be associated with I<$object>. The object796 define_encoding($object, CANONICAL_NAME [, alias...]); 797 798 I<CANONICAL_NAME> will be associated with I<$object>. The object 782 799 should provide the interface described in L<Encode::Encoding>.
783 If more than two arguments are provided thenadditional784 arguments are taken asaliases for I<$object>.785 786 See L<Encode::Encoding> for moredetails.800 If more than two arguments are provided, additional 801 arguments are considered aliases for I<$object>. 802 803 See L<Encode::Encoding> for details. 787 804 788 805 =head1 The UTF8 flag 789 806 790 Before the introduction of Unicode support in perl, The C<eq> operator807 Before the introduction of Unicode support in Perl, The C<eq> operator 791 808 just compared the strings represented by two scalars. Beginning with 792 perl 5.8, C<eq> compares two strings with simultaneous consideration of793 I<the UTF8 flag>. To explain why we made it so, I will quotepage 402 of794 C<Programming Perl, 3rd ed.>809 Perl 5.8, C<eq> compares two strings with simultaneous consideration of 810 I<the UTF8 flag>. To explain why we made it so, I quote from page 402 of 811 I<Programming Perl, 3rd ed.> 795 812 796 813 =over 2 … … 818 835 =back 819 836 820 Back when C<Programming Perl, 3rd ed.> was written, not even Perl 5.6.0 821 was born and many features documented in the book remained 822 unimplemented for a long time. Perl 5.8 corrected this and the introduction 823 of the UTF8 flag is one of them. You can think of this perl notion as of a 824 byte-oriented mode (UTF8 flag off) and a character-oriented mode (UTF8 825 flag on). 826 827 Here is how Encode takes care of the UTF8 flag. 837 When I<Programming Perl, 3rd ed.> was written, not even Perl 5.6.0 had been 838 born yet, many features documented in the book remained unimplemented for a 839 long time. Perl 5.8 corrected much of this, and the introduction of the 840 UTF8 flag is one of them. You can think of there being two fundamentally 841 different kinds of strings and string-operations in Perl: one a 842 byte-oriented mode for when the internal UTF8 flag is off, and the other a 843 character-oriented mode for when the internal UTF8 flag is on. 
844 845 Here is how C<Encode> handles the UTF8 flag. 828 846 829 847 =over 2 … … 831 849 =item * 832 850 833 When you encode, the resulting UTF8 flag is always off.851 When you I<encode>, the resulting UTF8 flag is always B<off>. 834 852 835 853 =item * 836 854 837 When you decode, the resulting UTF8 flag is on unless you can 838 unambiguously represent data. Here is the definition of 839 dis-ambiguity. 840 841 After C<$utf8 = decode('foo', $octet);>, 855 When you I<decode>, the resulting UTF8 flag is B<on>--I<unless> you can 856 unambiguously represent data. Here is what we mean by "unambiguously". 857 After C<$utf8 = decode("foo", $octet)>, 842 858 843 859 When $octet is... The UTF8 flag in $utf8 is … … 848 864 --------------------------------------------- 849 865 850 As you see, there is one exception , In ASCII. That way you can assume851 Goal #1. And with EncodeGoal #2 is assumed but you still have to be852 careful in such cases mentioned in B<CAVEAT> paragraphs.853 854 This UTF8 flag is not visible in perl scripts, exactly for the same855 reason you cannot (or you I<don't have to>) see if a scalar contains a 856 string, integer, or floatingpoint number. But you can still peek857 and poke these if you will. See the section below.866 As you see, there is one exception: in ASCII. That way you can assume 867 Goal #1. And with C<Encode>, Goal #2 is assumed but you still have to be 868 careful in the cases mentioned in the B<CAVEAT> paragraphs above. 869 870 This UTF8 flag is not visible in Perl scripts, exactly for the same reason 871 you cannot (or rather, you I<don't have to>) see whether a scalar contains 872 a string, an integer, or a floating-point number. But you can still peek 873 and poke these if you will. See the next section. 858 874 859 875 =back … … 862 878 863 879 The following API uses parts of Perl's internals in the current 864 implementation. As such, they are efficient but may change. 880 implementation. 
As such, they are efficient but may change in a future 881 release. 865 882 866 883 =over 2 … … 868 885 =item is_utf8(STRING [, CHECK]) 869 886 870 [INTERNAL] Tests whether the UTF8 flag is turned on in the STRING.871 If CHECK is true, also checks the data in STRING for beingwell-formed887 [INTERNAL] Tests whether the UTF8 flag is turned on in the I<STRING>. 888 If I<CHECK> is true, also checks whether I<STRING> contains well-formed 872 889 UTF-8. Returns true if successful, false otherwise. 873 890 874 As of perl 5.8.1, L<utf8> also has utf8::is_utf8().891 As of Perl 5.8.1, L<utf8> also has the C<utf8::is_utf8> function. 875 892 876 893 =item _utf8_on(STRING) 877 894 878 [INTERNAL] Turns on the UTF8 flag in STRING. The data in STRING is 879 B<not> checked for being well-formed UTF-8. Do not use unless you 880 B<know> that the STRING is well-formed UTF-8. Returns the previous 881 state of the UTF8 flag (so please don't treat the return value as 882 indicating success or failure), or C<undef> if STRING is not a string. 883 884 This function does not work on tainted values. 895 [INTERNAL] Turns the I<STRING>'s internal UTF8 flag B<on>. The I<STRING> 896 is I<not> checked for containing only well-formed UTF-8. Do not use this 897 unless you I<know with absolute certainty> that the STRING holds only 898 well-formed UTF-8. Returns the previous state of the UTF8 flag (so please 899 don't treat the return value as indicating success or failure), or C<undef> 900 if I<STRING> is not a string. 901 902 B<NOTE>: For security reasons, this function does not work on tainted values. 885 903 886 904 =item _utf8_off(STRING) 887 905 888 [INTERNAL] Turns off the UTF8 flag in STRING. Do not use frivolously. 889 Returns the previous state of the UTF8 flag (so please don't treat the 890 return value as indicating success or failure), or C<undef> if STRING is 891 not a string. 892 893 This function does not work on tainted values. 
906 [INTERNAL] Turns the I<STRING>'s internal UTF8 flag B<off>. Do not use 907 frivolously. Returns the previous state of the UTF8 flag, or C<undef> if 908 I<STRING> is not a string. Do not treat the return value as indicative of 909 success or failure, because that isn't what it means: it is only the 910 previous setting. 911 912 B<NOTE>: For security reasons, this function does not work on tainted values. 894 913 895 914 =back … … 901 920 computers, 0 .. 2**64-1) -- Programming Perl, 3rd ed. 902 921 903 That has been the perl's notion of UTF-8 but official UTF-8 is more 904 strict; Its ranges is much narrower (0 .. 10FFFF), some sequences are 905 not allowed (i.e. Those used in the surrogate pair, 0xFFFE, et al). 906 907 Now that is overruled by Larry Wall himself. 922 That has historically been Perl's notion of UTF-8, as that is how UTF-8 was 923 first conceived by Ken Thompson when he invented it. However, thanks to 924 later revisions to the applicable standards, official UTF-8 is now rather 925 stricter than that. For example, its range is much narrower (0 .. 0x10_FFFF 926 to cover only 21 bits instead of 32 or 64 bits) and some sequences 927 are not allowed, like those used in surrogate pairs, the 32 non-character 928 code points 0xFDD0 .. 0xFDEF, the last two code points in I<any> plane 929 (0xI<XX>_FFFE and 0xI<XX>_FFFF), all non-shortest encodings, etc. 930 931 The former default in which Perl would always use a loose interpretation of 932 UTF-8 has now been overruled: 908 933 909 934 From: Larry Wall <larry@wall.org> … … 912 937 Subject: Re: Make Encode.pm support the real UTF-8 913 938 Message-Id: <20041204025158.GA28754@wall.org> 914 939 915 940 On Fri, Dec 03, 2004 at 10:12:12PM +0000, Tim Bunce wrote: 916 941 : I've no problem with 'utf8' being perl's unrestricted uft8 encoding, 917 942 : but "UTF-8" is the name of the standard and should give the 918 943 : corresponding behaviour. 
919 944 920 945 For what it's worth, that's how I've always kept them straight in my 921 946 head. 922 947 923 948 Also for what it's worth, Perl 6 will mostly default to strict but 924 949 make it easy to switch back to lax. 925 950 926 951 Larry 927 952 928 Do you copy? As of Perl 5.8.7, B<UTF-8> means strict, official UTF-8 929 while B<utf8> means liberal, lax, version thereof. And Encode version 930 2.10 or later thus groks the difference between C<UTF-8> and C"utf8". 953 Got that? As of Perl 5.8.7, B<"UTF-8"> means UTF-8 in its current 954 sense, which is conservative and strict and security-conscious, whereas 955 B<"utf8"> means UTF-8 in its former sense, which was liberal and loose and 956 lax. C<Encode> version 2.10 or later thus groks this subtle but critically 957 important distinction between C<"UTF-8"> and C<"utf8">. 931 958 932 959 encode("utf8", "\x{FFFF_FFFF}", 1); # okay 933 960 encode("UTF-8", "\x{FFFF_FFFF}", 1); # croaks 934 961 935 C<UTF-8> in Encode is actually a canonical name for C<utf-8-strict>. 936 Yes, the hyphen between "UTF" and "8" is important. Without it Encode 937 goes "liberal" 962 In the C<Encode> module, C<"UTF-8"> is actually a canonical name for 963 C<"utf-8-strict">. That hyphen between the C<"UTF"> and the C<"8"> is 964 critical; without it, C<Encode> goes "liberal" and (perhaps overly-)permissive: 938 965 939 966 find_encoding("UTF-8")->name # is 'utf-8-strict' 940 967 find_encoding("utf-8")->name # ditto. names are case insensitive 941 find_encoding("utf_8")->name # ditto. "_" are treated as "-"968 find_encoding("utf_8")->name # ditto. "_" are treated as "-" 942 969 find_encoding("UTF8")->name # is 'utf8'. 943 970 944 The UTF8 flag is internally called UTF8, without a hyphen. It indicates945 whether a string is internally encoded as utf8, also without a hypen.971 Perl's internal UTF8 flag is called "UTF8", without a hyphen. It indicates 972 whether a string is internally encoded as "utf8", also without a hyphen. 
946 973 947 974 =head1 SEE ALSO … … 959 986 =head1 MAINTAINER 960 987 961 This project was originated by Nick Ing-Simmons and later maintained962 by Dan Kogai E<lt>dankogai@dan.co.jpE<gt>. See AUTHORS for a full 963 list of people involved. For any questions, use 964 E<lt>perl-unicode@perl.orgE<gt> sowe can all share.965 966 While Dan Kogai retains the copyright as a maintainer, thecredit967 should go to all those invol oved. See AUTHORS for those submitted968 codes.988 This project was originated by the late Nick Ing-Simmons and later 989 maintained by Dan Kogai I<< <dankogai@dan.co.jp> >>. See AUTHORS 990 for a full list of people involved. For any questions, send mail to 991 I<< <perl-unicode@perl.org> >> so that we can all share. 992 993 While Dan Kogai retains the copyright as a maintainer, credit 994 should go to all those involved. See AUTHORS for a list of those 995 who submitted code to the project. 969 996 970 997 =head1 COPYRIGHT 971 998 972 Copyright 2002-20 06 Dan Kogai E<lt>dankogai@dan.co.jpE<gt>999 Copyright 2002-2011 Dan Kogai I<< <dankogai@dan.co.jp> >>. 973 1000 974 1001 This library is free software; you can redistribute it and/or modify -
lang/perl/Encode/trunk/Encode.xs
r38757 r38929 1 1 /* 2 $Id: Encode.xs,v 2.20 2010/12/31 22:48:48 dankogai Exp dankogai$2 $Id: Encode.xs,v 2.20 2010/12/31 22:48:48 dankogai Exp $ 3 3 */ 4 4 -
lang/perl/Encode/trunk/Unicode/Unicode.xs
r38757 r38929 1 1 /* 2 $Id: Unicode.xs,v 2.7 2010/12/31 22:48:48 dankogai Exp dankogai$2 $Id: Unicode.xs,v 2.7 2010/12/31 22:48:48 dankogai Exp $ 3 3 */ 4 4 -
lang/perl/Encode/trunk/lib/Encode/Alias.pm
r38736 r38929 3 3 use warnings; 4 4 no warnings 'redefine'; 5 our $VERSION = do { my @r = ( q$Revision: 2.1 3$ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };5 our $VERSION = do { my @r = ( q$Revision: 2.14 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; 6 6 sub DEBUG () { 0 } 7 7 … … 207 207 # predefined in *.ucm; unneeded 208 208 # define_alias( qr/\bmacIcelandic$/i => '"macIceland"'); 209 define_alias( qr/^ mac_(.*)$/i => '"mac$1"' );209 define_alias( qr/^(?:x[_-])?mac[_-](.*)$/i => '"mac$1"' ); 210 210 # http://rt.cpan.org/Ticket/Display.html?id=36326 211 211 define_alias( qr/^macintosh$/i => '"MacRoman"' );
![(please configure the [header_logo] section in trac.ini)](/share/chrome/site/your_project_logo.png)