Changeset 38929

Show
Ignore:
Timestamp:
05/22/11 08:21:44 (2 years ago)
Author:
dankogai
Message:

VERSION 2.43

Location:
lang/perl/Encode/trunk
Files:
5 modified

Legend:

Unmodified
Added
Removed
  • lang/perl/Encode/trunk/Changes

    r38757 r38929  
    11# Revision history for Perl extension Encode. 
    22# 
    3 # $Id: Changes,v 2.42 2010/12/31 22:48:48 dankogai Exp dankogai $ 
     3# $Id: Changes,v 2.43 2011/05/21 23:14:43 dankogai Exp dankogai $ 
    44# 
    5 $Revision: 2.42 $ $Date: 2010/12/31 22:48:48 $ 
     5$Revision: 2.43 $ $Date: 2011/05/21 23:14:43 $ 
     6! lib/Encode/Alias.pm 
     7  Addressed RT#68361: Encode::Bytes x-mac-... aliases missing 
     8  https://rt.cpan.org/Ticket/Display.html?id=68361 
     9! Encode.pm 
     10  Applied the 0001-Fix-typo-in-pod.patch 
     11  https://rt.cpan.org/Ticket/Update.html?id=64381 
     12  Addressed RT#65796 Deep recursion error finding invalid charset 
     13  https://rt.cpan.org/Ticket/Update.html?id=65796 
     14  Applied a jumbo doc patch by Tom Christiansen 
     15  Message-Id: <14795.1304618434@chthon> 
     16 
     172.42 2010/12/31 22:48:48 
    618! Encode.xs 
    719! Unicode/Unicode.xs 
  • lang/perl/Encode/trunk/Encode.pm

    r38757 r38929  
    11# 
    2 # $Id: Encode.pm,v 2.42 2010/12/31 22:48:10 dankogai Exp $ 
     2# $Id: Encode.pm,v 2.43 2011/05/21 23:14:43 dankogai Exp dankogai $ 
    33# 
    44package Encode; 
    55use strict; 
    66use warnings; 
    7 our $VERSION = sprintf "%d.%02d", q$Revision: 2.42 $ =~ /(\d+)/g; 
     7our $VERSION = sprintf "%d.%02d", q$Revision: 2.43 $ =~ /(\d+)/g; 
    88sub DEBUG () { 0 } 
    99use XSLoader (); 
     
    6969    else { 
    7070        %enc = %Encoding; 
    71         for my $mod ( map { m/::/o ? $_ : "Encode::$_" } @_ ) { 
     71        for my $mod ( map { m/::/ ? $_ : "Encode::$_" } @_ ) { 
    7272            DEBUG and warn $mod; 
    7373            for my $enc ( keys %ExtModule ) { 
     
    101101sub getEncoding { 
    102102    my ( $class, $name, $skip_external ) = @_; 
     103 
     104    $name =~ s/\s+//g; # https://rt.cpan.org/Ticket/Display.html?id=65796 
    103105 
    104106    ref($name) && $name->can('renew') and return $name; 
     
    335337          bless { Name => "utf8" } => "Encode::utf8"; 
    336338        $Encode::Encoding{"utf-8-strict"} = 
    337           bless { Name => "utf-8-strict", strict_utf8 => 1 } => 
    338           "Encode::utf8"; 
     339          bless { Name => "utf-8-strict", strict_utf8 => 1 }  
     340            => "Encode::utf8"; 
    339341    } 
    340342} 
     
    346348=head1 NAME 
    347349 
    348 Encode - character encodings 
     350Encode - character encodings in Perl 
    349351 
    350352=head1 SYNOPSIS 
     
    354356=head2 Table of Contents 
    355357 
    356 Encode consists of a collection of modules whose details are too big 
    357 to fit in one document.  This POD itself explains the top-level APIs 
     358Encode consists of a collection of modules whose details are too extensive 
     359to fit in one document.  This one itself explains the top-level APIs 
    358360and general topics at a glance.  For other topics and more details, 
    359 see the PODs below: 
     361see the documentation for these modules: 
    360362 
    361363  Name                          Description 
     
    372374=head1 DESCRIPTION 
    373375 
    374 The C<Encode> module provides the interfaces between Perl's strings 
     376The C<Encode> module provides the interface between Perl strings 
    375377and the rest of the system.  Perl strings are sequences of 
    376 B<characters>. 
    377  
    378 The repertoire of characters that Perl can represent is at least that 
     378I<characters>. 
     379 
     380The repertoire of characters that Perl can represent is a superset of those 
    379381defined by the Unicode Consortium. On most platforms the ordinal 
    380 values of the characters (as returned by C<ord(ch)>) is the "Unicode 
    381 codepoint" for the character (the exceptions are those platforms where 
    382 the legacy encoding is some variant of EBCDIC rather than a super-set 
    383 of ASCII - see L<perlebcdic>). 
    384  
    385 Traditionally, computer data has been moved around in 8-bit chunks 
    386 often called "bytes". These chunks are also known as "octets" in 
    387 networking standards. Perl is widely used to manipulate data of many 
    388 types - not only strings of characters representing human or computer 
    389 languages but also "binary" data being the machine's representation of 
    390 numbers, pixels in an image - or just about anything. 
     382values of a character as returned by C<ord(I<S>)> is the I<Unicode 
     383codepoint> for that character. The exceptions are platforms where 
     384the legacy encoding is some variant of EBCDIC rather than a superset 
     385of ASCII; see L<perlebcdic>. 
     386 
     387During recent history, data is moved around a computer in 8-bit chunks, 
     388often called "bytes" but also known as "octets" in standards documents. 
     389Perl is widely used to manipulate data of many types: not only strings of 
     390characters representing human or computer languages, but also "binary" 
     391data, being the machine's representation of numbers, pixels in an image, or 
     392just about anything. 
    391393 
    392394When Perl is processing "binary data", the programmer wants Perl to 
    393 process "sequences of bytes". This is not a problem for Perl - as a 
     395process "sequences of bytes". This is not a problem for Perl: because a 
    394396byte has 256 possible values, it easily fits in Perl's much larger 
    395397"logical character". 
     
    401403=item * 
    402404 
    403 I<character>: a character in the range 0..(2**32-1) (or more). 
    404 (What Perl's strings are made of.) 
     405I<character>: a character in the range 0 .. 2**32-1 (or more); 
     406what Perl's strings are made of. 
    405407 
    406408=item * 
    407409 
    408 I<byte>: a character in the range 0..255 
    409 (A special case of a Perl character.) 
     410I<byte>: a character in the range 0..255; 
     411A special case of a Perl character. 
    410412 
    411413=item * 
    412414 
    413 I<octet>: 8 bits of data, with ordinal values 0..255 
    414 (Term for bytes passed to or from a non-Perl context, e.g. a disk file.) 
     415I<octet>: 8 bits of data, with ordinal values 0..255; 
     416Term for bytes passed to or from a non-Perl context, such as a disk file. 
    415417 
    416418=back 
    417419 
    418 =head1 PERL ENCODING API 
     420=head1 THE PERL ENCODING API 
    419421 
    420422=over 2 
    421423 
    422 =item $octets  = encode(ENCODING, $string [, CHECK]) 
    423  
    424 Encodes a string from Perl's internal form into I<ENCODING> and returns 
    425 a sequence of octets.  ENCODING can be either a canonical name or 
    426 an alias.  For encoding names and aliases, see L</"Defining Aliases">. 
    427 For CHECK, see L</"Handling Malformed Data">. 
    428  
    429 For example, to convert a string from Perl's internal format to 
    430 iso-8859-1 (also known as Latin1), 
     424=item $octets  = encode(ENCODING, STRING[, CHECK]) 
     425 
     426Encodes the scalar value I<STRING> from Perl's internal form into 
     427I<ENCODING> and returns a sequence of octets.  I<ENCODING> can be either a 
     428canonical name or an alias.  For encoding names and aliases, see 
     429L</"Defining Aliases">.  For CHECK, see L</"Handling Malformed Data">. 
     430 
     431For example, to convert a string from Perl's internal format into 
     432ISO-8859-1, also known as Latin1: 
    431433 
    432434  $octets = encode("iso-8859-1", $string); 
    433435 
    434436B<CAVEAT>: When you run C<$octets = encode("utf8", $string)>, then 
    435 $octets B<may not be equal to> $string.  Though they both contain the 
    436 same data, the UTF8 flag for $octets is B<always> off.  When you 
    437 encode anything, UTF8 flag of the result is always off, even when it 
    438 contains completely valid utf8 string. See L</"The UTF8 flag"> below. 
    439  
    440 If the $string is C<undef> then C<undef> is returned. 
    441  
    442 =item $string = decode(ENCODING, $octets [, CHECK]) 
    443  
    444 Decodes a sequence of octets assumed to be in I<ENCODING> into Perl's 
    445 internal form and returns the resulting string.  As in encode(), 
    446 ENCODING can be either a canonical name or an alias. For encoding names 
    447 and aliases, see L</"Defining Aliases">.  For CHECK, see 
    448 L</"Handling Malformed Data">. 
    449  
    450 For example, to convert ISO-8859-1 data to a string in Perl's internal format: 
     437$octets I<might not be equal to> $string.  Though both contain the 
     438same data, the UTF8 flag for $octets is I<always> off.  When you 
     439encode anything, the UTF8 flag on the result is always off, even when it 
     440contains a completely valid utf8 string. See L</"The UTF8 flag"> below. 
     441 
     442If the $string is C<undef>, then C<undef> is returned. 
     443 
     444=item $string = decode(ENCODING, OCTETS[, CHECK]) 
     445 
     446This function returns the string that results from decoding the scalar 
     447value I<OCTETS>, assumed to be a sequence of octets in I<ENCODING>, into 
     448Perl's internal form.  As with encode(), 
     449I<ENCODING> can be either a canonical name or an alias. For encoding names 
     450and aliases, see L</"Defining Aliases">; for I<CHECK>, see L</"Handling 
     451Malformed Data">. 
     452 
     453For example, to convert ISO-8859-1 data into a string in Perl's 
     454internal format: 
    451455 
    452456  $string = decode("iso-8859-1", $octets); 
    453457 
    454458B<CAVEAT>: When you run C<$string = decode("utf8", $octets)>, then $string 
    455 B<may not be equal to> $octets.  Though they both contain the same data, 
    456 the UTF8 flag for $string is on unless $octets entirely consists of 
    457 ASCII data (or EBCDIC on EBCDIC machines).  See L</"The UTF8 flag"> 
     459I<might not be equal to> $octets.  Though both contain the same data, the 
     460UTF8 flag for $string is on unless $octets consists entirely of ASCII data 
     461on ASCII machines or EBCDIC on EBCDIC machines.  See L</"The UTF8 flag"> 
    458462below. 
    459463 
    460 If the $string is C<undef> then C<undef> is returned. 
     464If the $string is C<undef>, then C<undef> is returned. 
    461465 
    462466=item [$obj =] find_encoding(ENCODING) 
    463467 
    464 Returns the I<encoding object> corresponding to ENCODING.  Returns 
    465 undef if no matching ENCODING is find. 
    466  
    467 This object is what actually does the actual (en|de)coding. 
     468Returns the I<encoding object> corresponding to I<ENCODING>.  Returns 
     469C<undef> if no matching I<ENCODING> is found.  The returned object is 
     470what does the actual encoding or decoding. 
    468471 
    469472  $utf8 = decode($name, $bytes); 
     
    471474is in fact 
    472475 
    473   $utf8 = do{ 
    474     $obj = find_encoding($name); 
    475     croak qq(encoding "$name" not found) unless ref $obj; 
    476     $obj->decode($bytes) 
    477   }; 
     476    $utf8 = do { 
     477        $obj = find_encoding($name); 
     478        croak qq(encoding "$name" not found) unless ref $obj; 
     479        $obj->decode($bytes); 
     480    }; 
    478481 
    479482with more error checking. 
    480483 
    481 Therefore you can save time by reusing this object as follows; 
    482  
    483   my $enc = find_encoding("iso-8859-1"); 
    484   while(<>){ 
    485      my $utf8 = $enc->decode($_); 
    486      # and do someting with $utf8; 
    487   } 
     484You can therefore save time by reusing this object as follows: 
     485 
     486    my $enc = find_encoding("iso-8859-1"); 
     487    while(<>) { 
     488        my $utf8 = $enc->decode($_); 
     489        ... # now do something with $utf8; 
     490    } 
    488491 
    489492Besides C<< ->decode >> and C<< ->encode >>, other methods are 
    490 available as well.  For instance, C<< -> name >> returns the canonical 
     493available as well.  For instance, C<< ->name >> returns the canonical 
    491494name of the encoding object. 
    492495 
     
    497500=item [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK]) 
    498501 
    499 Converts B<in-place> data between two encodings. The data in $octets 
    500 must be encoded as octets and not as characters in Perl's internal 
    501 format. For example, to convert ISO-8859-1 data to Microsoft's CP1250 
     502Converts I<in-place> data between two encodings. The data in $octets 
     503must be encoded as octets and I<not> as characters in Perl's internal 
     504format. For example, to convert ISO-8859-1 data into Microsoft's CP1250 
    502505encoding: 
    503506 
     
    508511  from_to($octets, "cp1250", "iso-8859-1"); 
    509512 
    510 Note that because the conversion happens in place, the data to be 
    511 converted cannot be a string constant; it must be a scalar variable. 
    512  
    513 from_to() returns the length of the converted string in octets on 
    514 success, I<undef> on error. 
    515  
    516 B<CAVEAT>: The following operations look the same but are not quite so; 
     513Because the conversion happens in place, the data to be 
     514converted cannot be a string constant: it must be a scalar variable. 
     515 
     516from_to() returns the length of the converted string in octets on success, 
     517and C<undef> on error. 
     518 
     519B<CAVEAT>: The following operations may look the same, but are not: 
    517520 
    518521  from_to($data, "iso-8859-1", "utf8"); #1 
    519522  $data = decode("iso-8859-1", $data);  #2 
    520523 
    521 Both #1 and #2 make $data consist of a completely valid UTF-8 string 
    522 but only #2 turns UTF8 flag on.  #1 is equivalent to 
     524Both #1 and #2 make $data consist of a completely valid UTF-8 string, 
     525but only #2 turns the UTF8 flag on.  #1 is equivalent to: 
    523526 
    524527  $data = encode("utf8", decode("iso-8859-1", $data)); 
     
    526529See L</"The UTF8 flag"> below. 
    527530 
    528 Also note that 
     531Also note that: 
    529532 
    530533  from_to($octets, $from, $to, $check); 
    531534 
    532 is equivalent to 
     535is equivalent to: 
    533536 
    534537  $octets = encode($to, decode($from, $octets), $check); 
    535538 
    536 Yes, it does not respect the $check during decoding.  It is 
    537 deliberately done that way.  If you need minute control, C<decode> 
    538 then C<encode> as follows; 
     539Yes, it does I<not> respect the $check during decoding.  It is 
     540deliberately done that way.  If you need minute control, use C<decode> 
     541followed by C<encode> as follows: 
    539542 
    540543  $octets = encode($to, decode($from, $octets, $check_from), $check_to); 
     
    542545=item $octets = encode_utf8($string); 
    543546 
    544 Equivalent to C<$octets = encode("utf8", $string);> The characters 
    545 that comprise $string are encoded in Perl's internal format and the 
    546 result is returned as a sequence of octets. All possible 
    547 characters have a UTF-8 representation so this function cannot fail. 
    548  
     547Equivalent to C<$octets = encode("utf8", $string)>.  The characters in 
     548$string are encoded in Perl's internal format, and the result is returned 
     549as a sequence of octets.  Because all possible characters in Perl have a 
     550(loose, not strict) UTF-8 representation, this function cannot fail. 
    549551 
    550552=item $string = decode_utf8($octets [, CHECK]); 
    551553 
    552 equivalent to C<$string = decode("utf8", $octets [, CHECK])>. 
    553 The sequence of octets represented by 
    554 $octets is decoded from UTF-8 into a sequence of logical 
    555 characters. Not all sequences of octets form valid UTF-8 encodings, so 
    556 it is possible for this call to fail.  For CHECK, see 
    557 L</"Handling Malformed Data">. 
     554Equivalent to C<$string = decode("utf8", $octets [, CHECK])>. 
     555The sequence of octets represented by $octets is decoded 
     556from UTF-8 into a sequence of logical characters. 
     557Because not all sequences of octets are valid UTF-8, 
     558it is quite possible for this function to fail. 
     559For CHECK, see L</"Handling Malformed Data">. 
    558560 
    559561=back 
     
    564566  @list = Encode->encodings(); 
    565567 
    566 Returns a list of the canonical names of the available encodings that 
    567 are loaded.  To get a list of all available encodings including the 
    568 ones that are not loaded yet, say 
     568Returns a list of canonical names of available encodings that have already 
     569been loaded.  To get a list of all available encodings including those that 
     570have not yet been loaded, say: 
    569571 
    570572  @all_encodings = Encode->encodings(":all"); 
    571573 
    572 Or you can give the name of a specific module. 
     574Or you can give the name of a specific module: 
    573575 
    574576  @with_jp = Encode->encodings("Encode::JP"); 
    575577 
    576 When "::" is not in the name, "Encode::" is assumed. 
     578When "C<::>" is not in the name, "C<Encode::>" is assumed. 
    577579 
    578580  @ebcdic = Encode->encodings("EBCDIC"); 
     
    587589  use Encode; 
    588590  use Encode::Alias; 
    589   define_alias(newName => ENCODING); 
    590  
    591 After that, newName can be used as an alias for ENCODING. 
    592 ENCODING may be either the name of an encoding or an 
    593 I<encoding object> 
    594  
    595 But before you do so, make sure the alias is nonexistent with 
     591  define_alias(NEWNAME => ENCODING); 
     592 
     593After that, I<NEWNAME> can be used as an alias for I<ENCODING>. 
     594I<ENCODING> may be either the name of an encoding or an 
     595I<encoding object>. 
     596 
     597Before you do that, first make sure the alias is nonexistent using 
    596598C<resolve_alias()>, which returns the canonical name thereof. 
    597 i.e. 
     599For example: 
    598600 
    599601  Encode::resolve_alias("latin1") eq "iso-8859-1" # true 
     
    602604 
    603605resolve_alias() does not need C<use Encode::Alias>; it can be 
    604 exported via C<use Encode qw(resolve_alias)>. 
     606imported via C<use Encode qw(resolve_alias)>. 
    605607 
    606608See L<Encode::Alias> for details. 
     
    609611 
    610612The canonical name of a given encoding does not necessarily agree with 
    611 IANA IANA Character Set Registry, commonly seen as C<< Content-Type: 
    612 text/plain; charset=I<whatever> >>.  For most cases canonical names 
    613 work but sometimes it does not (notably 'utf-8-strict'). 
    614  
    615 Therefore as of Encode version 2.21, a new method C<mime_name()> is added. 
     613IANA Character Set Registry, commonly seen as C<< Content-Type: 
     614text/plain; charset=I<WHATEVER> >>.  For most cases, the canonical name 
     615works, but sometimes it does not, most notably with "utf-8-strict". 
     616 
     617As of C<Encode> version 2.21, a new method C<mime_name()> is therefore added. 
    616618 
    617619  use Encode; 
    618   my $enc = find_encoding('UTF-8'); 
     620  my $enc = find_encoding("UTF-8"); 
    619621  warn $enc->name;      # utf-8-strict 
    620622  warn $enc->mime_name; # UTF-8 
     
    624626=head1 Encoding via PerlIO 
    625627 
    626 If your perl supports I<PerlIO> (which is the default), you can use a 
    627 PerlIO layer to decode and encode directly via a filehandle.  The 
    628 following two examples are totally identical in their functionality. 
    629  
    630   # via PerlIO 
    631   open my $in,  "<:encoding(shiftjis)", $infile  or die; 
    632   open my $out, ">:encoding(euc-jp)",   $outfile or die; 
    633   while(<$in>){ print $out $_; } 
    634  
    635   # via from_to 
    636   open my $in,  "<", $infile  or die; 
    637   open my $out, ">", $outfile or die; 
    638   while(<$in>){ 
    639     from_to($_, "shiftjis", "euc-jp", 1); 
    640     print $out $_; 
    641   } 
    642  
    643 Unfortunately, it may be that encodings are PerlIO-savvy.  You can check 
    644 if your encoding is supported by PerlIO by calling the C<perlio_ok> 
    645 method. 
    646  
    647   Encode::perlio_ok("hz");             # False 
    648   find_encoding("euc-cn")->perlio_ok;  # True where PerlIO is available 
    649  
    650   use Encode qw(perlio_ok);            # exported upon request 
     628If your perl supports C<PerlIO> (which is the default), you can use a 
     629C<PerlIO> layer to decode and encode directly via a filehandle.  The 
     630following two examples are fully identical in functionality: 
     631 
     632  ### Version 1 via PerlIO 
     633    open(INPUT,  "< :encoding(shiftjis)", $infile) 
     634        || die "Can't open < $infile for reading: $!"; 
     635    open(OUTPUT, "> :encoding(euc-jp)",  $outfile) 
     636        || die "Can't open > $outfile for writing: $!"; 
     637    while (<INPUT>) {   # auto decodes $_ 
     638        print OUTPUT;   # auto encodes $_ 
     639    } 
     640    close(INPUT)   || die "can't close $infile: $!"; 
     641    close(OUTPUT)  || die "can't close $outfile: $!"; 
     642 
     643  ### Version 2 via from_to() 
     644    open(INPUT,  "< :raw", $infile) 
     645        || die "Can't open < $infile for reading: $!"; 
     646    open(OUTPUT, "> :raw",  $outfile) 
     647        || die "Can't open > $outfile for writing: $!"; 
     648 
     649    while (<INPUT>) { 
     650        from_to($_, "shiftjis", "euc-jp", 1);  # switch encoding 
     651        print OUTPUT;   # emit raw (but properly encoded) data 
     652    } 
     653    close(INPUT)   || die "can't close $infile: $!"; 
     654    close(OUTPUT)  || die "can't close $outfile: $!"; 
     655 
     656In the first version above, you let the appropriate encoding layer 
     657handle the conversion.  In the second, you explicitly translate 
     658from one encoding to the other. 
     659 
     660Unfortunately, it may be that encodings are not C<PerlIO>-savvy.  You can check 
     661to see whether your encoding is supported by C<PerlIO> by invoking the 
     662C<perlio_ok> method on it: 
     663 
     664  Encode::perlio_ok("hz");             # false 
     665  find_encoding("euc-cn")->perlio_ok;  # true wherever PerlIO is available 
     666 
     667  use Encode qw(perlio_ok);            # imported upon request 
    651668  perlio_ok("euc-jp") 
    652669 
    653 Fortunately, all encodings that come with Encode core are PerlIO-savvy 
    654 except for hz and ISO-2022-kr.  For gory details, see 
     670Fortunately, all encodings that come with C<Encode> core are C<PerlIO>-savvy 
     671except for "hz" and "ISO-2022-kr".  For the gory details, see 
    655672L<Encode::Encoding> and L<Encode::PerlIO>. 
    656673 
    657674=head1 Handling Malformed Data 
    658675 
    659 The optional I<CHECK> argument tells Encode what to do when it 
    660 encounters malformed data.  Without CHECK, Encode::FB_DEFAULT ( == 0 ) 
    661 is assumed. 
    662  
    663 As of version 2.12 Encode supports coderef values for CHECK.  See below. 
     676The optional I<CHECK> argument tells C<Encode> what to do when 
     677encountering malformed data.  Without I<CHECK>, C<Encode::FB_DEFAULT> 
     678(== 0) is assumed. 
     679 
     680As of version 2.12, C<Encode> supports coderef values for C<CHECK>; 
     681see below. 
    664682 
    665683=over 2 
     
    678696=item I<CHECK> = Encode::FB_DEFAULT ( == 0) 
    679697 
    680 If I<CHECK> is 0, (en|de)code will put a I<substitution character> in 
    681 place of a malformed character.  When you encode, E<lt>subcharE<gt> 
    682 will be used.  When you decode the code point C<0xFFFD> is used.  If 
    683 the data is supposed to be UTF-8, an optional lexical warning 
    684 (category utf8) is given. 
     698If I<CHECK> is 0, encoding and decoding replace any malformed character 
     699with a I<substitution character>.  When you encode, I<SUBCHAR> is used. 
     700When you decode, the Unicode REPLACEMENT CHARACTER, code point U+FFFD, is 
     701used.  If the data is supposed to be UTF-8, an optional lexical warning of 
     702warning category C<"utf8"> is given. 
    685703 
    686704=item I<CHECK> = Encode::FB_CROAK ( == 1) 
    687705 
    688 If I<CHECK> is 1, methods will die on error immediately with an error 
    689 message.  Therefore, when I<CHECK> is set to 1,  you should trap the 
    690 error with eval{} unless you really want to let it die. 
     706If I<CHECK> is 1, methods immediately die with an error 
     707message.  Therefore, when I<CHECK> is 1, you should trap 
     708exceptions with C<eval{}>, unless you really want to let it C<die>. 
    691709 
    692710=item I<CHECK> = Encode::FB_QUIET 
    693711 
    694 If I<CHECK> is set to Encode::FB_QUIET, (en|de)code will immediately 
     712If I<CHECK> is set to C<Encode::FB_QUIET>, encoding and decoding immediately 
    695713return the portion of the data that has been processed so far when an 
    696 error occurs. The data argument will be overwritten with everything 
    697 after that point (that is, the unprocessed part of data).  This is 
    698 handy when you have to call decode repeatedly in the case where your 
     714error occurs. The data argument is overwritten with everything 
     715after that point; that is, the unprocessed portion of the data.  This is 
     716handy when you have to call C<decode> repeatedly in the case where your 
    699717source data may contain partial multi-byte character sequences, 
    700 (i.e. you are reading with a fixed-width buffer). Here is a sample 
    701 code that does exactly this: 
    702  
    703   my $buffer = ''; my $string = ''; 
    704   while(read $fh, $buffer, 256, length($buffer)){ 
    705     $string .= decode($encoding, $buffer, Encode::FB_QUIET); 
    706     # $buffer now contains the unprocessed partial character 
    707   } 
     718(that is, you are reading with a fixed-width buffer). Here's some sample 
     719code to do exactly that: 
     720 
     721    my($buffer, $string) = ("", ""); 
     722    while (read($fh, $buffer, 256, length($buffer))) { 
     723        $string .= decode($encoding, $buffer, Encode::FB_QUIET); 
     724        # $buffer now contains the unprocessed partial character 
     725    } 
    708726 
    709727=item I<CHECK> = Encode::FB_WARN 
    710728 
    711 This is the same as above, except that it warns on error.  Handy when 
    712 you are debugging the mode above. 
     729This is the same as C<FB_QUIET> above, except that instead of being silent 
     730on errors, it issues a warning.  This is handy for when you are debugging. 
    713731 
    714732=item perlqq mode (I<CHECK> = Encode::FB_PERLQQ) 
     
    718736=item XML charref mode (I<CHECK> = Encode::FB_XMLCREF) 
    719737 
    720 For encodings that are implemented by Encode::XS, CHECK == 
    721 Encode::FB_PERLQQ turns (en|de)code into C<perlqq> fallback mode. 
    722  
    723 When you decode, C<\xI<HH>> will be inserted for a malformed character, 
    724 where I<HH> is the hex representation of the octet  that could not be 
    725 decoded to utf8.  And when you encode, C<\x{I<HHHH>}> will be inserted, 
    726 where I<HHHH> is the Unicode ID of the character that cannot be found 
    727 in the character repertoire of the encoding. 
    728  
    729 HTML/XML character reference modes are about the same, in place of 
    730 C<\x{I<HHHH>}>, HTML uses C<&#I<NNN>;> where I<NNN> is a decimal number and 
     738For encodings that are implemented by the C<Encode::XS> module, C<CHECK> C<==> 
     739C<Encode::FB_PERLQQ> puts C<encode> and C<decode> into C<perlqq> fallback mode. 
     740 
     741When you decode, C<\xI<HH>> is inserted for a malformed character, where 
     742I<HH> is the hex representation of the octet that could not be decoded to 
     743utf8.  When you encode, C<\x{I<HHHH>}> will be inserted, where I<HHHH> is 
     744the Unicode code point (in any number of hex digits) of the character that 
     745cannot be found in the character repertoire of the encoding. 
     746 
     747The HTML/XML character reference modes are about the same. In place of 
     748C<\x{I<HHHH>}>, HTML uses C<&#I<NNN>;> where I<NNN> is a decimal number, and 
    731749XML uses C<&#xI<HHHH>;> where I<HHHH> is the hexadecimal number. 
    732750 
    733 In Encode 2.10 or later, C<LEAVE_SRC> is also implied. 
     751In C<Encode> 2.10 or later, C<LEAVE_SRC> is also implied. 
    734752 
    735753=item The bitmask 
    736754 
    737 These modes are actually set via a bitmask.  Here is how the FB_XX 
    738 constants are laid out.  You can import the FB_XX constants via 
    739 C<use Encode qw(:fallbacks)>; you can import the generic bitmask 
     755These modes are all actually set via a bitmask.  Here is how the C<FB_I<XXX>> 
     756constants are laid out.  You can import the C<FB_I<XXX>> constants via 
     757C<use Encode qw(:fallbacks)>, and you can import the generic bitmask 
    740758constants via C<use Encode qw(:fallback_all)>. 
    741759 
     
    755773=item Encode::LEAVE_SRC 
    756774 
    757 If the C<Encode::LEAVE_SRC> bit is not set, but I<CHECK> is, then the second 
    758 argument to C<encode()> or C<decode()> may be assigned to by the functions. If 
    759 you're not interested in this, then bitwise-or the bitmask with it. 
     775If the C<Encode::LEAVE_SRC> bit is I<not> set but I<CHECK> is set, then the 
     776second argument to encode() or decode() will be overwritten in place. 
     777If you're not interested in this, then bitwise-OR it with the bitmask. 
    760778 
    761779=back 
     
    763781=head2 coderef for CHECK 
    764782 
    765 As of Encode 2.12 CHECK can also be a code reference which takes the 
    766 ord value of unmapped caharacter as an argument and returns a string 
    767 that represents the fallback character.  For instance, 
     783As of C<Encode> 2.12, C<CHECK> can also be a code reference which takes the 
     784ordinal value of the unmapped character as an argument and returns a string 
     785that represents the fallback character.  For instance: 
    768786 
    769787  $ascii = encode("ascii", $utf8, sub{ sprintf "<U+%04X>", shift }); 
    770788 
    771 Acts like FB_PERLQQ but E<lt>U+I<XXXX>E<gt> is used instead of 
    772 \x{I<XXXX>}. 
     789Acts like C<FB_PERLQQ> but E<lt>U+I<XXXX>E<gt> is used instead of C<\x{I<XXXX>}>. 
    773790 
    774791=head1 Defining Encodings 
     
    777794 
    778795    use Encode qw(define_encoding); 
    779     define_encoding($object, 'canonicalName' [, alias...]); 
    780  
    781 I<canonicalName> will be associated with I<$object>.  The object 
     796    define_encoding($object, CANONICAL_NAME [, alias...]); 
     797 
     798I<CANONICAL_NAME> will be associated with I<$object>.  The object 
    782799should provide the interface described in L<Encode::Encoding>. 
    783 If more than two arguments are provided then additional 
    784 arguments are taken as aliases for I<$object>. 
    785  
    786 See L<Encode::Encoding> for more details. 
     800If more than two arguments are provided, additional 
     801arguments are considered aliases for I<$object>. 
     802 
     803See L<Encode::Encoding> for details. 
    787804 
    788805=head1 The UTF8 flag 
    789806 
    790 Before the introduction of Unicode support in perl, The C<eq> operator 
     807Before the introduction of Unicode support in Perl, the C<eq> operator 
    791808just compared the strings represented by two scalars. Beginning with 
    792 perl 5.8, C<eq> compares two strings with simultaneous consideration of 
    793 I<the UTF8 flag>. To explain why we made it so, I will quote page 402 of 
    794 C<Programming Perl, 3rd ed.> 
     809Perl 5.8, C<eq> compares two strings with simultaneous consideration of 
     810I<the UTF8 flag>. To explain why we made it so, I quote from page 402 of 
     811I<Programming Perl, 3rd ed.> 
    795812 
    796813=over 2 
     
    818835=back 
    819836 
    820 Back when C<Programming Perl, 3rd ed.> was written, not even Perl 5.6.0 
    821 was born and many features documented in the book remained 
    822 unimplemented for a long time.  Perl 5.8 corrected this and the introduction 
    823 of the UTF8 flag is one of them.  You can think of this perl notion as of a 
    824 byte-oriented mode (UTF8 flag off) and a character-oriented mode (UTF8 
    825 flag on). 
    826  
    827 Here is how Encode takes care of the UTF8 flag. 
     837When I<Programming Perl, 3rd ed.> was written, not even Perl 5.6.0 had been 
     838born yet, and many features documented in the book remained unimplemented for a 
     839long time.  Perl 5.8 corrected much of this, and the introduction of the 
     840UTF8 flag is one of them.  You can think of there being two fundamentally 
     841different kinds of strings and string-operations in Perl: one a 
     842byte-oriented mode for when the internal UTF8 flag is off, and the other a 
     843character-oriented mode for when the internal UTF8 flag is on. 
     844 
     845Here is how C<Encode> handles the UTF8 flag. 
    828846 
    829847=over 2 
     
    831849=item * 
    832850 
    833 When you encode, the resulting UTF8 flag is always off. 
     851When you I<encode>, the resulting UTF8 flag is always B<off>. 
    834852 
    835853=item * 
    836854 
    837 When you decode, the resulting UTF8 flag is on unless you can 
    838 unambiguously represent data.  Here is the definition of 
    839 dis-ambiguity. 
    840  
    841 After C<$utf8 = decode('foo', $octet);>, 
     855When you I<decode>, the resulting UTF8 flag is B<on>--I<unless> you can 
     856unambiguously represent data.  Here is what we mean by "unambiguously". 
     857After C<$utf8 = decode("foo", $octet)>, 
    842858 
    843859  When $octet is...   The UTF8 flag in $utf8 is 
     
    848864  --------------------------------------------- 
    849865 
    850 As you see, there is one exception, In ASCII.  That way you can assume 
    851 Goal #1.  And with Encode Goal #2 is assumed but you still have to be 
    852 careful in such cases mentioned in B<CAVEAT> paragraphs. 
    853  
    854 This UTF8 flag is not visible in perl scripts, exactly for the same 
    855 reason you cannot (or you I<don't have to>) see if a scalar contains a 
    856 string, integer, or floating point number.   But you can still peek 
    857 and poke these if you will.  See the section below. 
     866As you see, there is one exception: in ASCII.  That way you can assume 
     867Goal #1.  And with C<Encode>, Goal #2 is assumed but you still have to be 
     868careful in the cases mentioned in the B<CAVEAT> paragraphs above. 
     869 
     870This UTF8 flag is not visible in Perl scripts, exactly for the same reason 
     871you cannot (or rather, you I<don't have to>) see whether a scalar contains 
     872a string, an integer, or a floating-point number.   But you can still peek 
     873and poke these if you will.  See the next section. 
    858874 
    859875=back 
     
    862878 
    863879The following API uses parts of Perl's internals in the current 
    864 implementation.  As such, they are efficient but may change. 
     880implementation.  As such, they are efficient but may change in a future 
     881release. 
    865882 
    866883=over 2 
     
    868885=item is_utf8(STRING [, CHECK]) 
    869886 
    870 [INTERNAL] Tests whether the UTF8 flag is turned on in the STRING. 
    871 If CHECK is true, also checks the data in STRING for being well-formed 
     887[INTERNAL] Tests whether the UTF8 flag is turned on in the I<STRING>. 
     888If I<CHECK> is true, also checks whether I<STRING> contains well-formed 
    872889UTF-8.  Returns true if successful, false otherwise. 
    873890 
    874 As of perl 5.8.1, L<utf8> also has utf8::is_utf8(). 
     891As of Perl 5.8.1, L<utf8> also has the C<utf8::is_utf8> function. 
    875892 
    876893=item _utf8_on(STRING) 
    877894 
    878 [INTERNAL] Turns on the UTF8 flag in STRING.  The data in STRING is 
    879 B<not> checked for being well-formed UTF-8.  Do not use unless you 
    880 B<know> that the STRING is well-formed UTF-8.  Returns the previous 
    881 state of the UTF8 flag (so please don't treat the return value as 
    882 indicating success or failure), or C<undef> if STRING is not a string. 
    883  
    884 This function does not work on tainted values. 
     895[INTERNAL] Turns the I<STRING>'s internal UTF8 flag B<on>.  The I<STRING> 
     896is I<not> checked for containing only well-formed UTF-8.  Do not use this 
     897unless you I<know with absolute certainty> that the I<STRING> holds only 
     898well-formed UTF-8.  Returns the previous state of the UTF8 flag (so please 
     899don't treat the return value as indicating success or failure), or C<undef> 
     900if I<STRING> is not a string. 
     901 
     902B<NOTE>: For security reasons, this function does not work on tainted values. 
    885903 
    886904=item _utf8_off(STRING) 
    887905 
    888 [INTERNAL] Turns off the UTF8 flag in STRING.  Do not use frivolously. 
    889 Returns the previous state of the UTF8 flag (so please don't treat the 
    890 return value as indicating success or failure), or C<undef> if STRING is 
    891 not a string. 
    892  
    893 This function does not work on tainted values. 
     906[INTERNAL] Turns the I<STRING>'s internal UTF8 flag B<off>.  Do not use 
     907frivolously.  Returns the previous state of the UTF8 flag, or C<undef> if 
     908I<STRING> is not a string.  Do not treat the return value as indicative of 
     909success or failure, because that isn't what it means: it is only the 
     910previous setting. 
     911 
     912B<NOTE>: For security reasons, this function does not work on tainted values. 
    894913 
    895914=back 
     
    901920  computers, 0 .. 2**64-1) -- Programming Perl, 3rd ed. 
    902921 
    903 That has been the perl's notion of UTF-8 but official UTF-8 is more 
    904 strict; Its ranges is much narrower (0 .. 10FFFF), some sequences are 
    905 not allowed (i.e. Those used in the surrogate pair, 0xFFFE, et al). 
    906  
    907 Now that is overruled by Larry Wall himself. 
     922That has historically been Perl's notion of UTF-8, as that is how UTF-8 was 
     923first conceived by Ken Thompson when he invented it. However, thanks to 
     924later revisions to the applicable standards, official UTF-8 is now rather 
     925stricter than that. For example, its range is much narrower (0 .. 0x10_FFFF 
     926to cover only 21 bits instead of 32 or 64 bits) and some sequences 
     927are not allowed, like those used in surrogate pairs, the 32 non-character 
     928code points 0xFDD0 .. 0xFDEF, the last two code points in I<any> plane 
     929(0xI<XX>_FFFE and 0xI<XX>_FFFF), all non-shortest encodings, etc. 
     930 
     931The former default in which Perl would always use a loose interpretation of 
     932UTF-8 has now been overruled: 
    908933 
    909934  From: Larry Wall <larry@wall.org> 
     
    912937  Subject: Re: Make Encode.pm support the real UTF-8 
    913938  Message-Id: <20041204025158.GA28754@wall.org> 
    914    
     939 
    915940  On Fri, Dec 03, 2004 at 10:12:12PM +0000, Tim Bunce wrote: 
    916941  : I've no problem with 'utf8' being perl's unrestricted uft8 encoding, 
    917942  : but "UTF-8" is the name of the standard and should give the 
    918943  : corresponding behaviour. 
    919    
     944 
    920945  For what it's worth, that's how I've always kept them straight in my 
    921946  head. 
    922    
     947 
    923948  Also for what it's worth, Perl 6 will mostly default to strict but 
    924949  make it easy to switch back to lax. 
    925    
     950 
    926951  Larry 
    927952 
    928 Do you copy?  As of Perl 5.8.7, B<UTF-8> means strict, official UTF-8 
    929 while B<utf8> means liberal, lax, version thereof.  And Encode version 
    930 2.10 or later thus groks the difference between C<UTF-8> and C"utf8". 
     953Got that?  As of Perl 5.8.7, B<"UTF-8"> means UTF-8 in its current 
     954sense, which is conservative and strict and security-conscious, whereas 
     955B<"utf8"> means UTF-8 in its former sense, which was liberal and loose and 
     956lax.  C<Encode> version 2.10 or later thus groks this subtle but critically 
     957important distinction between C<"UTF-8"> and C<"utf8">. 
    931958 
    932959  encode("utf8",  "\x{FFFF_FFFF}", 1); # okay 
    933960  encode("UTF-8", "\x{FFFF_FFFF}", 1); # croaks 
    934961 
    935 C<UTF-8> in Encode is actually a canonical name for C<utf-8-strict>. 
    936 Yes, the hyphen between "UTF" and "8" is important.  Without it Encode 
    937 goes "liberal" 
     962In the C<Encode> module, C<"UTF-8"> is actually a canonical name for 
     963C<"utf-8-strict">.  That hyphen between the C<"UTF"> and the C<"8"> is 
     964critical; without it, C<Encode> goes "liberal" and (perhaps overly-)permissive: 
    938965 
    939966  find_encoding("UTF-8")->name # is 'utf-8-strict' 
    940967  find_encoding("utf-8")->name # ditto. names are case insensitive 
    941   find_encoding("utf_8")->name  # ditto. "_" are treated as "-" 
     968  find_encoding("utf_8")->name # ditto. "_" are treated as "-" 
    942969  find_encoding("UTF8")->name  # is 'utf8'. 
    943970 
    944 The UTF8 flag is internally called UTF8, without a hyphen. It indicates 
    945 whether a string is internally encoded as utf8, also without a hypen. 
     971Perl's internal UTF8 flag is called "UTF8", without a hyphen. It indicates 
     972whether a string is internally encoded as "utf8", also without a hyphen. 
    946973 
    947974=head1 SEE ALSO 
     
    959986=head1 MAINTAINER 
    960987 
    961 This project was originated by Nick Ing-Simmons and later maintained 
    962 by Dan Kogai E<lt>dankogai@dan.co.jpE<gt>.  See AUTHORS for a full 
    963 list of people involved.  For any questions, use 
    964 E<lt>perl-unicode@perl.orgE<gt> so we can all share. 
    965  
    966 While Dan Kogai retains the copyright as a maintainer, the credit 
    967 should go to all those involoved.  See AUTHORS for those submitted 
    968 codes. 
     988This project was originated by the late Nick Ing-Simmons and later 
     989maintained by Dan Kogai I<< <dankogai@dan.co.jp> >>.  See AUTHORS 
     990for a full list of people involved.  For any questions, send mail to 
     991I<< <perl-unicode@perl.org> >> so that we can all share. 
     992 
     993While Dan Kogai retains the copyright as a maintainer, credit 
     994should go to all those involved.  See AUTHORS for a list of those 
     995who submitted code to the project. 
    969996 
    970997=head1 COPYRIGHT 
    971998 
    972 Copyright 2002-2006 Dan Kogai E<lt>dankogai@dan.co.jpE<gt> 
     999Copyright 2002-2011 Dan Kogai I<< <dankogai@dan.co.jp> >>. 
    9731000 
    9741001This library is free software; you can redistribute it and/or modify 
  • lang/perl/Encode/trunk/Encode.xs

    r38757 r38929  
    11/* 
    2  $Id: Encode.xs,v 2.20 2010/12/31 22:48:48 dankogai Exp dankogai $ 
     2 $Id: Encode.xs,v 2.20 2010/12/31 22:48:48 dankogai Exp $ 
    33 */ 
    44 
  • lang/perl/Encode/trunk/Unicode/Unicode.xs

    r38757 r38929  
    11/* 
    2  $Id: Unicode.xs,v 2.7 2010/12/31 22:48:48 dankogai Exp dankogai $ 
     2 $Id: Unicode.xs,v 2.7 2010/12/31 22:48:48 dankogai Exp $ 
    33 */ 
    44 
  • lang/perl/Encode/trunk/lib/Encode/Alias.pm

    r38736 r38929  
    33use warnings; 
    44no warnings 'redefine'; 
    5 our $VERSION = do { my @r = ( q$Revision: 2.13 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; 
     5our $VERSION = do { my @r = ( q$Revision: 2.14 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; 
    66sub DEBUG () { 0 } 
    77 
     
    207207    # predefined in *.ucm; unneeded 
    208208    # define_alias( qr/\bmacIcelandic$/i => '"macIceland"'); 
    209     define_alias( qr/^mac_(.*)$/i => '"mac$1"' ); 
     209    define_alias( qr/^(?:x[_-])?mac[_-](.*)$/i => '"mac$1"' ); 
    210210    # http://rt.cpan.org/Ticket/Display.html?id=36326 
    211211    define_alias( qr/^macintosh$/i => '"MacRoman"' );