Changeset 29767

Show
Ignore:
Timestamp:
02/09/09 18:50:51 (12 months ago)
Author:
drry
Message:
  • fixed regexes.
  • et cetera.
Location:
lang/perl/Encode/trunk
Files:
3 modified

Legend:

Unmodified
Added
Removed
  • lang/perl/Encode/trunk/Unicode/Unicode.pm

    r1747 r29767  
    7070=head1 SYNOPSIS 
    7171 
    72     use Encode qw/encode decode/;  
     72    use Encode qw/encode decode/; 
    7373    $ucs2 = encode("UCS-2BE", $utf8); 
    7474    $utf8 = decode("UCS-2BE", $ucs2); 
     
    231231 
    232232Note this move has made \x{D800}-\x{DFFF} into a forbidden zone but 
    233 perl does not prohibit the use of characters within this range.  To perl,  
     233perl does not prohibit the use of characters within this range.  To perl, 
    234234every one of \x{0000_0000} up to \x{ffff_ffff} (*) is I<a character>. 
    235235 
     
    242242Unicode encodings simply croaks. 
    243243 
    244   % perl -MEncode -e '$_ = "\xfe\xff\xd8\xd9\xda\xdb\0\n"' \ 
    245          -e 'Encode::from_to($_, "utf16","shift_jis", 0); print' 
     244  % perl -MEncode -e'$_ = "\xfe\xff\xd8\xd9\xda\xdb\0\n"' \ 
     245         -e'Encode::from_to($_, "utf16","shift_jis", 0); print' 
    246246  UTF-16:Malformed LO surrogate d8d9 at /path/to/Encode.pm line 184. 
    247   % perl -MEncode -e '$a = "BOM missing"' \ 
    248          -e ' Encode::from_to($a, "utf16", "shift_jis", 0); print' 
     247  % perl -MEncode -e'$a = "BOM missing"' \ 
     248         -e' Encode::from_to($a, "utf16", "shift_jis", 0); print' 
    249249  UTF-16:Unrecognised BOM 424f at /path/to/Encode.pm line 184. 
    250250 
     
    265265 
    266266Ch. 15, pp. 403 of C<Programming Perl (3rd Edition)> 
    267 by Larry Wall, Tom Christiansen, Jon Orwant;  
     267by Larry Wall, Tom Christiansen, Jon Orwant; 
    268268O'Reilly & Associates; ISBN 0-596-00027-8 
    269269 
  • lang/perl/Encode/trunk/encoding.pm

    r7855 r29767  
    207207  # or you can even do this if your shell supports your native encoding 
    208208 
    209   perl -Mencoding=latin2 -e '...' # Feeling centrally European? 
    210   perl -Mencoding=euc-kr -e '...' # Or Korean? 
     209  perl -Mencoding=latin2 -e'...' # Feeling centrally European? 
     210  perl -Mencoding=euc-kr -e'...' # Or Korean? 
    211211 
    212212  # more control 
     
    332332or later fixes this problem. 
    333333 
    334 =item tr//  
     334=item tr// 
    335335 
    336336C<tr//> was overlooked by Perl 5 porters when they released perl 5.8.0 
     
    339339=item DATA pseudo-filehandle 
    340340 
    341 Another feature that was overlooked was C<DATA>.  
     341Another feature that was overlooked was C<DATA>. 
    342342 
    343343=back 
     
    349349=item use encoding [I<ENCNAME>] ; 
    350350 
    351 Sets the script encoding to I<ENCNAME>.  And unless ${^UNICODE}  
     351Sets the script encoding to I<ENCNAME>.  And unless ${^UNICODE} 
    352352exists and non-zero, PerlIO layers of STDIN and STDOUT are set to 
    353353":encoding(I<ENCNAME>)". 
     
    427427 
    428428The pragma is a per script, not a per block lexical.  Only the last 
    429 C<use encoding> or C<no encoding> matters, and it affects  
    430 B<the whole script>.  However, the <no encoding> pragma is supported and  
    431 B<use encoding> can appear as many times as you want in a given script.  
     429C<use encoding> or C<no encoding> matters, and it affects 
     430B<the whole script>.  However, the <no encoding> pragma is supported and 
     431B<use encoding> can appear as many times as you want in a given script. 
    432432The multiple use of this pragma is discouraged. 
    433433 
    434434By the same reason, the use this pragma inside modules is also 
    435 discouraged (though not as strongly discouraged as the case above.   
     435discouraged (though not as strongly discouraged as the case above. 
    436436See below). 
    437437 
     
    602602  $camel = "*non-ascii*"; 
    603603  binmode(STDOUT=>':encoding(utf8)'); # bang! 
    604   write;              # funny  
     604  write;              # funny 
    605605  print $camel, "\n"; # fine 
    606606 
     
    635635If 1. didn't work but we are under the locale pragma, the environment 
    636636variables LC_ALL and LANG (in that order) are matched for encodings 
    637 (the part after C<.>, if any), and if any found, that is used  
     637(the part after C<.>, if any), and if any found, that is used 
    638638as the default encoding for the open pragma. 
    639639 
     
    654654=head1 HISTORY 
    655655 
    656 This pragma first appeared in Perl 5.8.0.  For features that require  
     656This pragma first appeared in Perl 5.8.0.  For features that require 
    6576575.8.1 and better, see above. 
    658658 
  • lang/perl/Encode/trunk/lib/Encode/MIME/Header.pm

    r29396 r29767  
    4545 
    4646    # multi-line header to single line 
    47     $str =~ s/(?:\r|\n|\r\n)[ \t]+//gos; 
     47    $str =~ s/(?:\r\n|[\r\n)[ \t]+//gos; 
    4848 
    4949    1 while ( $str =~ 
    50         s/(\=\?[0-9A-Za-z\-_]+\?[Qq]\?)(.*?)\?\=\1(.*?)\?\=/$1$2$3\?\=/ ) 
     50        s/(=\?[-0-9A-Za-z_]+\?[Qq]\?)(.*?)\?=\1(.*?\?=)/$1$2$3/ ) 
    5151      ;    # Concat consecutive QP encoded mime headers 
    5252           # Fixes breaking inside multi-byte characters 
    5353 
    5454    $str =~ s{ 
    55         =\?                  # begin encoded word 
    56         ([0-9A-Za-z\-_]+) # charset (encoding) 
    57                 (?:\*[A-Za-z]{1,8}(?:-[A-Za-z]{1,8})*)? # language (RFC 2231) 
     55        =\?              # begin encoded word 
     56        ([-0-9A-Za-z_]+) # charset (encoding) 
     57        (?:\*[A-Za-z]{1,8}(?:-[A-Za-z]{1,8})*)? # language (RFC 2231) 
    5858        \?([QqBb])\?     # delimiter 
    5959        (.*?)            # Base64-encodede contents 
    60         \?=              # end encoded word       
    61         }{ 
    62         if    (uc($2) eq 'B'){ 
     60        \?=              # end encoded word 
     61    }{ 
     62        if      (uc($2) eq 'B'){ 
    6363            $obj->{decode_b} or croak qq(MIME "B" unsupported); 
    6464            decode_b($1, $3); 
    65         }elsif(uc($2) eq 'Q'){ 
     65        } elsif (uc($2) eq 'Q'){ 
    6666            $obj->{decode_q} or croak qq(MIME "Q" unsupported); 
    6767            decode_q($1, $3); 
    68         }else{ 
     68        } else { 
    6969            croak qq(MIME "$2" encoding is nonexistent!); 
    7070        } 
    71         }egox; 
     71    }egox; 
    7272    $_[1] = '' if $chk; 
    7373    return $str; 
     
    9595my $especials = 
    9696  join( '|' => map { quotemeta( chr($_) ) } 
    97       unpack( "C*", qq{()<>@,;:\"\'/[]?.=} ) ); 
     97      unpack( "C*", qq{()<>@,;:"'/[]?.=} ) ); 
    9898 
    9999my $re_encoded_word = qr{ 
    100        (?: 
    101     =\?               # begin encoded word 
    102     (?:[0-9A-Za-z\-_]+) # charset (encoding) 
    103         (?:\*\w+(?:-\w+)*)? # language (RFC 2231) 
    104     \?(?:[QqBb])\?      # delimiter 
    105     (?:.*?)             # Base64-encodede contents 
    106     \?=                 # end encoded word 
    107        ) 
    108       }xo; 
     100    =\?                # begin encoded word 
     101    (?:[-0-9A-Za-z_]+) # charset (encoding) 
     102    (?:\*[A-Za-z]{1,8}+(?:-[A-Za-z]{1,8})*)? # language (RFC 2231) 
     103    \?(?:[QqBb])\?     # delimiter 
     104    (?:.*?)            # Base64-encodede contents 
     105    \?=                # end encoded word 
     106}xo; 
    109107 
    110108my $re_especials = qr{$re_encoded_word|$especials}xo; 
     
    113111    my ( $obj, $str, $chk ) = @_; 
    114112    my @line = (); 
    115     for my $line ( split /\r|\n|\r\n/o, $str ) { 
     113    for my $line ( split /\r\n|[\r\n]/o, $str ) { 
    116114        my ( @word, @subline ); 
    117115        for my $word ( split /($re_especials)/o, $line ) { 
     
    177175    $chunk = encode_utf8($chunk); 
    178176    $chunk =~ s{ 
    179         ([^0-9A-Za-z]) 
    180            }{ 
    181            join("" => map {sprintf "=%02X", $_} unpack("C*", $1)) 
    182            }egox; 
     177            [^0-9A-Za-z] 
     178       }{ 
     179            join("" => map {sprintf "=%02X", $_} unpack("C*", $&)) 
     180       }egox; 
    183181    return HEAD . 'Q?' . $chunk . TAIL; 
    184182} 
     
    193191=head1 SYNOPSIS 
    194192 
    195     use Encode qw/encode decode/;  
     193    use Encode qw/encode decode/; 
    196194    $utf8   = decode('MIME-Header', $header); 
    197195    $header = encode('MIME-Header', $utf8); 
     
    238236 
    239237RFC 2047, L<http://www.faqs.org/rfcs/rfc2047.html> and many other 
    240 locations.  
     238locations. 
    241239 
    242240=cut