| 1 | # $Id: encoding.pm,v 2.7 2008/03/12 09:51:11 dankogai Exp $ |
|---|
| 2 | package encoding; |
|---|
| 3 | our $VERSION = '2.6_01'; |
|---|
| 4 | |
|---|
| 5 | use Encode; |
|---|
| 6 | use strict; |
|---|
| 7 | use warnings; |
|---|
| 8 | |
|---|
| 9 | sub DEBUG () { 0 } |
|---|
| 10 | |
|---|
| 11 | BEGIN { |
|---|
| 12 | if ( ord("A") == 193 ) { |
|---|
| 13 | require Carp; |
|---|
| 14 | Carp::croak("encoding: pragma does not support EBCDIC platforms"); |
|---|
| 15 | } |
|---|
| 16 | } |
|---|
| 17 | |
|---|
| 18 | our $HAS_PERLIO = 0; |
|---|
| 19 | eval { require PerlIO::encoding }; |
|---|
| 20 | unless ($@) { |
|---|
| 21 | $HAS_PERLIO = ( PerlIO::encoding->VERSION >= 0.02 ); |
|---|
| 22 | } |
|---|
| 23 | |
|---|
| 24 | sub _exception { |
|---|
| 25 | my $name = shift; |
|---|
| 26 | $] > 5.008 and return 0; # 5.8.1 or higher then no |
|---|
| 27 | my %utfs = map { $_ => 1 } |
|---|
| 28 | qw(utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE |
|---|
| 29 | UTF-32 UTF-32BE UTF-32LE); |
|---|
| 30 | $utfs{$name} or return 0; # UTFs or no |
|---|
| 31 | require Config; |
|---|
| 32 | Config->import(); |
|---|
| 33 | our %Config; |
|---|
| 34 | return $Config{perl_patchlevel} ? 0 : 1 # maintperl then no |
|---|
| 35 | } |
|---|
| 36 | |
|---|
| 37 | sub in_locale { $^H & ( $locale::hint_bits || 0 ) } |
|---|
| 38 | |
|---|
| 39 | sub _get_locale_encoding { |
|---|
| 40 | my $locale_encoding; |
|---|
| 41 | |
|---|
| 42 | # I18N::Langinfo isn't available everywhere |
|---|
| 43 | eval { |
|---|
| 44 | require I18N::Langinfo; |
|---|
| 45 | I18N::Langinfo->import(qw(langinfo CODESET)); |
|---|
| 46 | $locale_encoding = langinfo( CODESET() ); |
|---|
| 47 | }; |
|---|
| 48 | |
|---|
| 49 | my $country_language; |
|---|
| 50 | |
|---|
| 51 | no warnings 'uninitialized'; |
|---|
| 52 | |
|---|
| 53 | if ( (not $locale_encoding) && in_locale() ) { |
|---|
| 54 | if ( $ENV{LC_ALL} =~ /^([^.]+)\.([^.@]+)(@.*)?$/ ) { |
|---|
| 55 | ( $country_language, $locale_encoding ) = ( $1, $2 ); |
|---|
| 56 | } |
|---|
| 57 | elsif ( $ENV{LANG} =~ /^([^.]+)\.([^.@]+)(@.*)?$/ ) { |
|---|
| 58 | ( $country_language, $locale_encoding ) = ( $1, $2 ); |
|---|
| 59 | } |
|---|
| 60 | |
|---|
| 61 | # LANGUAGE affects only LC_MESSAGES only on glibc |
|---|
| 62 | } |
|---|
| 63 | elsif ( not $locale_encoding ) { |
|---|
| 64 | if ( $ENV{LC_ALL} =~ /\butf-?8\b/i |
|---|
| 65 | || $ENV{LANG} =~ /\butf-?8\b/i ) |
|---|
| 66 | { |
|---|
| 67 | $locale_encoding = 'utf8'; |
|---|
| 68 | } |
|---|
| 69 | |
|---|
| 70 | # Could do more heuristics based on the country and language |
|---|
| 71 | # parts of LC_ALL and LANG (the parts before the dot (if any)), |
|---|
| 72 | # since we have Locale::Country and Locale::Language available. |
|---|
| 73 | # TODO: get a database of Language -> Encoding mappings |
|---|
| 74 | # (the Estonian database at http://www.eki.ee/letter/ |
|---|
| 75 | # would be excellent!) --jhi |
|---|
| 76 | } |
|---|
| 77 | if ( defined $locale_encoding |
|---|
| 78 | && lc($locale_encoding) eq 'euc' |
|---|
| 79 | && defined $country_language ) |
|---|
| 80 | { |
|---|
| 81 | if ( $country_language =~ /^ja_JP|japan(?:ese)?$/i ) { |
|---|
| 82 | $locale_encoding = 'euc-jp'; |
|---|
| 83 | } |
|---|
| 84 | elsif ( $country_language =~ /^ko_KR|korean?$/i ) { |
|---|
| 85 | $locale_encoding = 'euc-kr'; |
|---|
| 86 | } |
|---|
| 87 | elsif ( $country_language =~ /^zh_CN|chin(?:a|ese)$/i ) { |
|---|
| 88 | $locale_encoding = 'euc-cn'; |
|---|
| 89 | } |
|---|
| 90 | elsif ( $country_language =~ /^zh_TW|taiwan(?:ese)?$/i ) { |
|---|
| 91 | $locale_encoding = 'euc-tw'; |
|---|
| 92 | } |
|---|
| 93 | else { |
|---|
| 94 | require Carp; |
|---|
| 95 | Carp::croak( |
|---|
| 96 | "encoding: Locale encoding '$locale_encoding' too ambiguous" |
|---|
| 97 | ); |
|---|
| 98 | } |
|---|
| 99 | } |
|---|
| 100 | |
|---|
| 101 | return $locale_encoding; |
|---|
| 102 | } |
|---|
| 103 | |
|---|
| 104 | sub import { |
|---|
| 105 | my $class = shift; |
|---|
| 106 | my $name = shift; |
|---|
| 107 | if ( $name eq ':_get_locale_encoding' ) { # used by lib/open.pm |
|---|
| 108 | my $caller = caller(); |
|---|
| 109 | { |
|---|
| 110 | no strict 'refs'; |
|---|
| 111 | *{"${caller}::_get_locale_encoding"} = \&_get_locale_encoding; |
|---|
| 112 | } |
|---|
| 113 | return; |
|---|
| 114 | } |
|---|
| 115 | $name = _get_locale_encoding() if $name eq ':locale'; |
|---|
| 116 | my %arg = @_; |
|---|
| 117 | $name = $ENV{PERL_ENCODING} unless defined $name; |
|---|
| 118 | my $enc = find_encoding($name); |
|---|
| 119 | unless ( defined $enc ) { |
|---|
| 120 | require Carp; |
|---|
| 121 | Carp::croak("encoding: Unknown encoding '$name'"); |
|---|
| 122 | } |
|---|
| 123 | $name = $enc->name; # canonize |
|---|
| 124 | unless ( $arg{Filter} ) { |
|---|
| 125 | DEBUG and warn "_exception($name) = ", _exception($name); |
|---|
| 126 | _exception($name) or ${^ENCODING} = $enc; |
|---|
| 127 | $HAS_PERLIO or return 1; |
|---|
| 128 | } |
|---|
| 129 | else { |
|---|
| 130 | defined( ${^ENCODING} ) and undef ${^ENCODING}; |
|---|
| 131 | |
|---|
| 132 | # implicitly 'use utf8' |
|---|
| 133 | require utf8; # to fetch $utf8::hint_bits; |
|---|
| 134 | $^H |= $utf8::hint_bits; |
|---|
| 135 | eval { |
|---|
| 136 | require Filter::Util::Call; |
|---|
| 137 | Filter::Util::Call->import; |
|---|
| 138 | filter_add( |
|---|
| 139 | sub { |
|---|
| 140 | my $status = filter_read(); |
|---|
| 141 | if ( $status > 0 ) { |
|---|
| 142 | $_ = $enc->decode( $_, 1 ); |
|---|
| 143 | DEBUG and warn $_; |
|---|
| 144 | } |
|---|
| 145 | $status; |
|---|
| 146 | } |
|---|
| 147 | ); |
|---|
| 148 | }; |
|---|
| 149 | $@ eq '' and DEBUG and warn "Filter installed"; |
|---|
| 150 | } |
|---|
| 151 | defined ${^UNICODE} and ${^UNICODE} != 0 and return 1; |
|---|
| 152 | for my $h (qw(STDIN STDOUT)) { |
|---|
| 153 | if ( $arg{$h} ) { |
|---|
| 154 | unless ( defined find_encoding( $arg{$h} ) ) { |
|---|
| 155 | require Carp; |
|---|
| 156 | Carp::croak( |
|---|
| 157 | "encoding: Unknown encoding for $h, '$arg{$h}'"); |
|---|
| 158 | } |
|---|
| 159 | eval { binmode( $h, ":raw :encoding($arg{$h})" ) }; |
|---|
| 160 | } |
|---|
| 161 | else { |
|---|
| 162 | unless ( exists $arg{$h} ) { |
|---|
| 163 | eval { |
|---|
| 164 | no warnings 'uninitialized'; |
|---|
| 165 | binmode( $h, ":raw :encoding($name)" ); |
|---|
| 166 | }; |
|---|
| 167 | } |
|---|
| 168 | } |
|---|
| 169 | if ($@) { |
|---|
| 170 | require Carp; |
|---|
| 171 | Carp::croak($@); |
|---|
| 172 | } |
|---|
| 173 | } |
|---|
| 174 | return 1; # I doubt if we need it, though |
|---|
| 175 | } |
|---|
| 176 | |
|---|
| 177 | sub unimport { |
|---|
| 178 | no warnings; |
|---|
| 179 | undef ${^ENCODING}; |
|---|
| 180 | if ($HAS_PERLIO) { |
|---|
| 181 | binmode( STDIN, ":raw" ); |
|---|
| 182 | binmode( STDOUT, ":raw" ); |
|---|
| 183 | } |
|---|
| 184 | else { |
|---|
| 185 | binmode(STDIN); |
|---|
| 186 | binmode(STDOUT); |
|---|
| 187 | } |
|---|
| 188 | if ( $INC{"Filter/Util/Call.pm"} ) { |
|---|
| 189 | eval { filter_del() }; |
|---|
| 190 | } |
|---|
| 191 | } |
|---|
| 192 | |
|---|
| 193 | 1; |
|---|
| 194 | __END__ |
|---|
| 195 | |
|---|
| 196 | =pod |
|---|
| 197 | |
|---|
| 198 | =head1 NAME |
|---|
| 199 | |
|---|
| 200 | encoding - allows you to write your script in non-ascii or non-utf8 |
|---|
| 201 | |
|---|
| 202 | =head1 SYNOPSIS |
|---|
| 203 | |
|---|
| 204 | use encoding "greek"; # Perl like Greek to you? |
|---|
| 205 | use encoding "euc-jp"; # Jperl! |
|---|
| 206 | |
|---|
| 207 | # or you can even do this if your shell supports your native encoding |
|---|
| 208 | |
|---|
| 209 | perl -Mencoding=latin2 -e '...' # Feeling centrally European? |
|---|
| 210 | perl -Mencoding=euc-kr -e '...' # Or Korean? |
|---|
| 211 | |
|---|
| 212 | # more control |
|---|
| 213 | |
|---|
| 214 | # A simple euc-cn => utf-8 converter |
|---|
| 215 | use encoding "euc-cn", STDOUT => "utf8"; while(<>){print}; |
|---|
| 216 | |
|---|
| 217 | # "no encoding;" supported (but not scoped!) |
|---|
| 218 | no encoding; |
|---|
| 219 | |
|---|
| 220 | # an alternate way, Filter |
|---|
| 221 | use encoding "euc-jp", Filter=>1; |
|---|
| 222 | # now you can use kanji identifiers -- in euc-jp! |
|---|
| 223 | |
|---|
| 224 | # switch on locale - |
|---|
| 225 | # note that this probably means that unless you have a complete control |
|---|
| 226 | # over the environments the application is ever going to be run, you should |
|---|
| 227 | # NOT use the feature of encoding pragma allowing you to write your script |
|---|
| 228 | # in any recognized encoding because changing locale settings will wreck |
|---|
| 229 | # the script; you can of course still use the other features of the pragma. |
|---|
| 230 | use encoding ':locale'; |
|---|
| 231 | |
|---|
| 232 | =head1 ABSTRACT |
|---|
| 233 | |
|---|
| 234 | Let's start with a bit of history: Perl 5.6.0 introduced Unicode |
|---|
| 235 | support. You could apply C<substr()> and regexes even to complex CJK |
|---|
| 236 | characters -- so long as the script was written in UTF-8. But back |
|---|
| 237 | then, text editors that supported UTF-8 were still rare and many users |
|---|
| 238 | instead chose to write scripts in legacy encodings, giving up a whole |
|---|
| 239 | new feature of Perl 5.6. |
|---|
| 240 | |
|---|
| 241 | Rewind to the future: starting from perl 5.8.0 with the B<encoding> |
|---|
| 242 | pragma, you can write your script in any encoding you like (so long |
|---|
| 243 | as the C<Encode> module supports it) and still enjoy Unicode support. |
|---|
| 244 | This pragma achieves that by doing the following: |
|---|
| 245 | |
|---|
| 246 | =over |
|---|
| 247 | |
|---|
| 248 | =item * |
|---|
| 249 | |
|---|
| 250 | Internally converts all literals (C<q//,qq//,qr//,qw///, qx//>) from |
|---|
| 251 | the encoding specified to utf8. In Perl 5.8.1 and later, literals in |
|---|
| 252 | C<tr///> and C<DATA> pseudo-filehandle are also converted. |
|---|
| 253 | |
|---|
| 254 | =item * |
|---|
| 255 | |
|---|
| 256 | Changing PerlIO layers of C<STDIN> and C<STDOUT> to the encoding |
|---|
| 257 | specified. |
|---|
| 258 | |
|---|
| 259 | =back |
|---|
| 260 | |
|---|
| 261 | =head2 Literal Conversions |
|---|
| 262 | |
|---|
| 263 | You can write code in EUC-JP as follows: |
|---|
| 264 | |
|---|
| 265 | my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji |
|---|
| 266 | #<-char-><-char-> # 4 octets |
|---|
| 267 | s/\bCamel\b/$Rakuda/; |
|---|
| 268 | |
|---|
| 269 | And with C<use encoding "euc-jp"> in effect, it is the same thing as |
|---|
| 270 | the code in UTF-8: |
|---|
| 271 | |
|---|
| 272 | my $Rakuda = "\x{99F1}\x{99DD}"; # two Unicode Characters |
|---|
| 273 | s/\bCamel\b/$Rakuda/; |
|---|
| 274 | |
|---|
| 275 | =head2 PerlIO layers for C<STD(IN|OUT)> |
|---|
| 276 | |
|---|
| 277 | The B<encoding> pragma also modifies the filehandle layers of |
|---|
| 278 | STDIN and STDOUT to the specified encoding. Therefore, |
|---|
| 279 | |
|---|
| 280 | use encoding "euc-jp"; |
|---|
| 281 | my $message = "Camel is the symbol of perl.\n"; |
|---|
| 282 | my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji |
|---|
| 283 | $message =~ s/\bCamel\b/$Rakuda/; |
|---|
| 284 | print $message; |
|---|
| 285 | |
|---|
| 286 | Will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n", |
|---|
| 287 | not "\x{99F1}\x{99DD} is the symbol of perl.\n". |
|---|
| 288 | |
|---|
| 289 | You can override this by giving extra arguments; see below. |
|---|
| 290 | |
|---|
| 291 | =head2 Implicit upgrading for byte strings |
|---|
| 292 | |
|---|
| 293 | By default, if strings operating under byte semantics and strings |
|---|
| 294 | with Unicode character data are concatenated, the new string will |
|---|
| 295 | be created by decoding the byte strings as I<ISO 8859-1 (Latin-1)>. |
|---|
| 296 | |
|---|
| 297 | The B<encoding> pragma changes this to use the specified encoding |
|---|
| 298 | instead. For example: |
|---|
| 299 | |
|---|
| 300 | use encoding 'utf8'; |
|---|
| 301 | my $string = chr(20000); # a Unicode string |
|---|
| 302 | utf8::encode($string); # now it's a UTF-8 encoded byte string |
|---|
| 303 | # concatenate with another Unicode string |
|---|
| 304 | print length($string . chr(20000)); |
|---|
| 305 | |
|---|
| 306 | Will print C<2>, because C<$string> is upgraded as UTF-8. Without |
|---|
| 307 | C<use encoding 'utf8';>, it will print C<4> instead, since C<$string> |
|---|
| 308 | is three octets when interpreted as Latin-1. |
|---|
| 309 | |
|---|
| 310 | =head2 Side effects |
|---|
| 311 | |
|---|
| 312 | If the C<encoding> pragma is in scope then the lengths returned are |
|---|
| 313 | calculated from the length of C<$/> in Unicode characters, which is not |
|---|
| 314 | always the same as the length of C<$/> in the native encoding. |
|---|
| 315 | |
|---|
| 316 | This pragma affects utf8::upgrade, but not utf8::downgrade. |
|---|
| 317 | |
|---|
| 318 | =head1 FEATURES THAT REQUIRE 5.8.1 |
|---|
| 319 | |
|---|
| 320 | Some of the features offered by this pragma requires perl 5.8.1. Most |
|---|
| 321 | of these are done by Inaba Hiroto. Any other features and changes |
|---|
| 322 | are good for 5.8.0. |
|---|
| 323 | |
|---|
| 324 | =over |
|---|
| 325 | |
|---|
| 326 | =item "NON-EUC" doublebyte encodings |
|---|
| 327 | |
|---|
| 328 | Because perl needs to parse script before applying this pragma, such |
|---|
| 329 | encodings as Shift_JIS and Big-5 that may contain '\' (BACKSLASH; |
|---|
| 330 | \x5c) in the second byte fails because the second byte may |
|---|
| 331 | accidentally escape the quoting character that follows. Perl 5.8.1 |
|---|
| 332 | or later fixes this problem. |
|---|
| 333 | |
|---|
| 334 | =item tr// |
|---|
| 335 | |
|---|
| 336 | C<tr//> was overlooked by Perl 5 porters when they released perl 5.8.0 |
|---|
| 337 | See the section below for details. |
|---|
| 338 | |
|---|
| 339 | =item DATA pseudo-filehandle |
|---|
| 340 | |
|---|
| 341 | Another feature that was overlooked was C<DATA>. |
|---|
| 342 | |
|---|
| 343 | =back |
|---|
| 344 | |
|---|
| 345 | =head1 USAGE |
|---|
| 346 | |
|---|
| 347 | =over 4 |
|---|
| 348 | |
|---|
| 349 | =item use encoding [I<ENCNAME>] ; |
|---|
| 350 | |
|---|
| 351 | Sets the script encoding to I<ENCNAME>. And unless ${^UNICODE} |
|---|
| 352 | exists and non-zero, PerlIO layers of STDIN and STDOUT are set to |
|---|
| 353 | ":encoding(I<ENCNAME>)". |
|---|
| 354 | |
|---|
| 355 | Note that STDERR WILL NOT be changed. |
|---|
| 356 | |
|---|
| 357 | Also note that non-STD file handles remain unaffected. Use C<use |
|---|
| 358 | open> or C<binmode> to change layers of those. |
|---|
| 359 | |
|---|
| 360 | If no encoding is specified, the environment variable L<PERL_ENCODING> |
|---|
| 361 | is consulted. If no encoding can be found, the error C<Unknown encoding |
|---|
| 362 | 'I<ENCNAME>'> will be thrown. |
|---|
| 363 | |
|---|
| 364 | =item use encoding I<ENCNAME> [ STDIN =E<gt> I<ENCNAME_IN> ...] ; |
|---|
| 365 | |
|---|
| 366 | You can also individually set encodings of STDIN and STDOUT via the |
|---|
| 367 | C<< STDIN => I<ENCNAME> >> form. In this case, you cannot omit the |
|---|
| 368 | first I<ENCNAME>. C<< STDIN => undef >> turns the IO transcoding |
|---|
| 369 | completely off. |
|---|
| 370 | |
|---|
| 371 | When ${^UNICODE} exists and non-zero, these options will completely |
|---|
| 372 | ignored. ${^UNICODE} is a variable introduced in perl 5.8.1. See |
|---|
| 373 | L<perlrun> see L<perlvar/"${^UNICODE}"> and L<perlrun/"-C"> for |
|---|
| 374 | details (perl 5.8.1 and later). |
|---|
| 375 | |
|---|
| 376 | =item use encoding I<ENCNAME> Filter=E<gt>1; |
|---|
| 377 | |
|---|
| 378 | This turns the encoding pragma into a source filter. While the |
|---|
| 379 | default approach just decodes interpolated literals (in qq() and |
|---|
| 380 | qr()), this will apply a source filter to the entire source code. See |
|---|
| 381 | L</"The Filter Option"> below for details. |
|---|
| 382 | |
|---|
| 383 | =item no encoding; |
|---|
| 384 | |
|---|
| 385 | Unsets the script encoding. The layers of STDIN, STDOUT are |
|---|
| 386 | reset to ":raw" (the default unprocessed raw stream of bytes). |
|---|
| 387 | |
|---|
| 388 | =back |
|---|
| 389 | |
|---|
| 390 | =head1 The Filter Option |
|---|
| 391 | |
|---|
| 392 | The magic of C<use encoding> is not applied to the names of |
|---|
| 393 | identifiers. In order to make C<${"\x{4eba}"}++> ($human++, where human |
|---|
| 394 | is a single Han ideograph) work, you still need to write your script |
|---|
| 395 | in UTF-8 -- or use a source filter. That's what 'Filter=>1' does. |
|---|
| 396 | |
|---|
| 397 | What does this mean? Your source code behaves as if it is written in |
|---|
| 398 | UTF-8 with 'use utf8' in effect. So even if your editor only supports |
|---|
| 399 | Shift_JIS, for example, you can still try examples in Chapter 15 of |
|---|
| 400 | C<Programming Perl, 3rd Ed.>. For instance, you can use UTF-8 |
|---|
| 401 | identifiers. |
|---|
| 402 | |
|---|
| 403 | This option is significantly slower and (as of this writing) non-ASCII |
|---|
| 404 | identifiers are not very stable WITHOUT this option and with the |
|---|
| 405 | source code written in UTF-8. |
|---|
| 406 | |
|---|
| 407 | =head2 Filter-related changes at Encode version 1.87 |
|---|
| 408 | |
|---|
| 409 | =over |
|---|
| 410 | |
|---|
| 411 | =item * |
|---|
| 412 | |
|---|
| 413 | The Filter option now sets STDIN and STDOUT like non-filter options. |
|---|
| 414 | And C<< STDIN=>I<ENCODING> >> and C<< STDOUT=>I<ENCODING> >> work like |
|---|
| 415 | non-filter version. |
|---|
| 416 | |
|---|
| 417 | =item * |
|---|
| 418 | |
|---|
| 419 | C<use utf8> is implicitly declared so you no longer have to C<use |
|---|
| 420 | utf8> to C<${"\x{4eba}"}++>. |
|---|
| 421 | |
|---|
| 422 | =back |
|---|
| 423 | |
|---|
| 424 | =head1 CAVEATS |
|---|
| 425 | |
|---|
| 426 | =head2 NOT SCOPED |
|---|
| 427 | |
|---|
| 428 | The pragma is a per script, not a per block lexical. Only the last |
|---|
| 429 | C<use encoding> or C<no encoding> matters, and it affects |
|---|
| 430 | B<the whole script>. However, the <no encoding> pragma is supported and |
|---|
| 431 | B<use encoding> can appear as many times as you want in a given script. |
|---|
| 432 | The multiple use of this pragma is discouraged. |
|---|
| 433 | |
|---|
| 434 | By the same reason, the use this pragma inside modules is also |
|---|
| 435 | discouraged (though not as strongly discouraged as the case above. |
|---|
| 436 | See below). |
|---|
| 437 | |
|---|
| 438 | If you still have to write a module with this pragma, be very careful |
|---|
| 439 | of the load order. See the codes below; |
|---|
| 440 | |
|---|
| 441 | # called module |
|---|
| 442 | package Module_IN_BAR; |
|---|
| 443 | use encoding "bar"; |
|---|
| 444 | # stuff in "bar" encoding here |
|---|
| 445 | 1; |
|---|
| 446 | |
|---|
| 447 | # caller script |
|---|
| 448 | use encoding "foo" |
|---|
| 449 | use Module_IN_BAR; |
|---|
| 450 | # surprise! use encoding "bar" is in effect. |
|---|
| 451 | |
|---|
| 452 | The best way to avoid this oddity is to use this pragma RIGHT AFTER |
|---|
| 453 | other modules are loaded. i.e. |
|---|
| 454 | |
|---|
| 455 | use Module_IN_BAR; |
|---|
| 456 | use encoding "foo"; |
|---|
| 457 | |
|---|
| 458 | =head2 DO NOT MIX MULTIPLE ENCODINGS |
|---|
| 459 | |
|---|
| 460 | Notice that only literals (string or regular expression) having only |
|---|
| 461 | legacy code points are affected: if you mix data like this |
|---|
| 462 | |
|---|
| 463 | \xDF\x{100} |
|---|
| 464 | |
|---|
| 465 | the data is assumed to be in (Latin 1 and) Unicode, not in your native |
|---|
| 466 | encoding. In other words, this will match in "greek": |
|---|
| 467 | |
|---|
| 468 | "\xDF" =~ /\x{3af}/ |
|---|
| 469 | |
|---|
| 470 | but this will not |
|---|
| 471 | |
|---|
| 472 | "\xDF\x{100}" =~ /\x{3af}\x{100}/ |
|---|
| 473 | |
|---|
| 474 | since the C<\xDF> (ISO 8859-7 GREEK SMALL LETTER IOTA WITH TONOS) on |
|---|
| 475 | the left will B<not> be upgraded to C<\x{3af}> (Unicode GREEK SMALL |
|---|
| 476 | LETTER IOTA WITH TONOS) because of the C<\x{100}> on the left. You |
|---|
| 477 | should not be mixing your legacy data and Unicode in the same string. |
|---|
| 478 | |
|---|
| 479 | This pragma also affects encoding of the 0x80..0xFF code point range: |
|---|
| 480 | normally characters in that range are left as eight-bit bytes (unless |
|---|
| 481 | they are combined with characters with code points 0x100 or larger, |
|---|
| 482 | in which case all characters need to become UTF-8 encoded), but if |
|---|
| 483 | the C<encoding> pragma is present, even the 0x80..0xFF range always |
|---|
| 484 | gets UTF-8 encoded. |
|---|
| 485 | |
|---|
| 486 | After all, the best thing about this pragma is that you don't have to |
|---|
| 487 | resort to \x{....} just to spell your name in a native encoding. |
|---|
| 488 | So feel free to put your strings in your encoding in quotes and |
|---|
| 489 | regexes. |
|---|
| 490 | |
|---|
| 491 | =head2 tr/// with ranges |
|---|
| 492 | |
|---|
| 493 | The B<encoding> pragma works by decoding string literals in |
|---|
| 494 | C<q//,qq//,qr//,qw///, qx//> and so forth. In perl 5.8.0, this |
|---|
| 495 | does not apply to C<tr///>. Therefore, |
|---|
| 496 | |
|---|
| 497 | use encoding 'euc-jp'; |
|---|
| 498 | #.... |
|---|
| 499 | $kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/; |
|---|
| 500 | # -------- -------- -------- -------- |
|---|
| 501 | |
|---|
| 502 | Does not work as |
|---|
| 503 | |
|---|
| 504 | $kana =~ tr/\x{3041}-\x{3093}/\x{30a1}-\x{30f3}/; |
|---|
| 505 | |
|---|
| 506 | =over |
|---|
| 507 | |
|---|
| 508 | =item Legend of characters above |
|---|
| 509 | |
|---|
| 510 | utf8 euc-jp charnames::viacode() |
|---|
| 511 | ----------------------------------------- |
|---|
| 512 | \x{3041} \xA4\xA1 HIRAGANA LETTER SMALL A |
|---|
| 513 | \x{3093} \xA4\xF3 HIRAGANA LETTER N |
|---|
| 514 | \x{30a1} \xA5\xA1 KATAKANA LETTER SMALL A |
|---|
| 515 | \x{30f3} \xA5\xF3 KATAKANA LETTER N |
|---|
| 516 | |
|---|
| 517 | =back |
|---|
| 518 | |
|---|
| 519 | This counterintuitive behavior has been fixed in perl 5.8.1. |
|---|
| 520 | |
|---|
| 521 | =head3 workaround to tr///; |
|---|
| 522 | |
|---|
| 523 | In perl 5.8.0, you can work around as follows; |
|---|
| 524 | |
|---|
| 525 | use encoding 'euc-jp'; |
|---|
| 526 | # .... |
|---|
| 527 | eval qq{ \$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/ }; |
|---|
| 528 | |
|---|
| 529 | Note the C<tr//> expression is surrounded by C<qq{}>. The idea behind |
|---|
| 530 | is the same as classic idiom that makes C<tr///> 'interpolate'. |
|---|
| 531 | |
|---|
| 532 | tr/$from/$to/; # wrong! |
|---|
| 533 | eval qq{ tr/$from/$to/ }; # workaround. |
|---|
| 534 | |
|---|
| 535 | Nevertheless, in case of B<encoding> pragma even C<q//> is affected so |
|---|
| 536 | C<tr///> not being decoded was obviously against the will of Perl5 |
|---|
| 537 | Porters so it has been fixed in Perl 5.8.1 or later. |
|---|
| 538 | |
|---|
| 539 | =head1 EXAMPLE - Greekperl |
|---|
| 540 | |
|---|
| 541 | use encoding "iso 8859-7"; |
|---|
| 542 | |
|---|
| 543 | # \xDF in ISO 8859-7 (Greek) is \x{3af} in Unicode. |
|---|
| 544 | |
|---|
| 545 | $a = "\xDF"; |
|---|
| 546 | $b = "\x{100}"; |
|---|
| 547 | |
|---|
| 548 | printf "%#x\n", ord($a); # will print 0x3af, not 0xdf |
|---|
| 549 | |
|---|
| 550 | $c = $a . $b; |
|---|
| 551 | |
|---|
| 552 | # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}". |
|---|
| 553 | |
|---|
| 554 | # chr() is affected, and ... |
|---|
| 555 | |
|---|
| 556 | print "mega\n" if ord(chr(0xdf)) == 0x3af; |
|---|
| 557 | |
|---|
| 558 | # ... ord() is affected by the encoding pragma ... |
|---|
| 559 | |
|---|
| 560 | print "tera\n" if ord(pack("C", 0xdf)) == 0x3af; |
|---|
| 561 | |
|---|
| 562 | # ... as are eq and cmp ... |
|---|
| 563 | |
|---|
| 564 | print "peta\n" if "\x{3af}" eq pack("C", 0xdf); |
|---|
| 565 | print "exa\n" if "\x{3af}" cmp pack("C", 0xdf) == 0; |
|---|
| 566 | |
|---|
| 567 | # ... but pack/unpack C are not affected, in case you still |
|---|
| 568 | # want to go back to your native encoding |
|---|
| 569 | |
|---|
| 570 | print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf; |
|---|
| 571 | |
|---|
| 572 | =head1 KNOWN PROBLEMS |
|---|
| 573 | |
|---|
| 574 | =over |
|---|
| 575 | |
|---|
| 576 | =item literals in regex that are longer than 127 bytes |
|---|
| 577 | |
|---|
| 578 | For native multibyte encodings (either fixed or variable length), |
|---|
| 579 | the current implementation of the regular expressions may introduce |
|---|
| 580 | recoding errors for regular expression literals longer than 127 bytes. |
|---|
| 581 | |
|---|
| 582 | =item EBCDIC |
|---|
| 583 | |
|---|
| 584 | The encoding pragma is not supported on EBCDIC platforms. |
|---|
| 585 | (Porters who are willing and able to remove this limitation are |
|---|
| 586 | welcome.) |
|---|
| 587 | |
|---|
| 588 | =item format |
|---|
| 589 | |
|---|
| 590 | This pragma doesn't work well with format because PerlIO does not |
|---|
| 591 | get along very well with it. When format contains non-ascii |
|---|
| 592 | characters it prints funny or gets "wide character warnings". |
|---|
| 593 | To understand it, try the code below. |
|---|
| 594 | |
|---|
| 595 | # Save this one in utf8 |
|---|
| 596 | # replace *non-ascii* with a non-ascii string |
|---|
| 597 | my $camel; |
|---|
| 598 | format STDOUT = |
|---|
| 599 | *non-ascii*@>>>>>>> |
|---|
| 600 | $camel |
|---|
| 601 | . |
|---|
| 602 | $camel = "*non-ascii*"; |
|---|
| 603 | binmode(STDOUT=>':encoding(utf8)'); # bang! |
|---|
| 604 | write; # funny |
|---|
| 605 | print $camel, "\n"; # fine |
|---|
| 606 | |
|---|
| 607 | Without binmode this happens to work but without binmode, print() |
|---|
| 608 | fails instead of write(). |
|---|
| 609 | |
|---|
| 610 | At any rate, the very use of format is questionable when it comes to |
|---|
| 611 | unicode characters since you have to consider such things as character |
|---|
| 612 | width (i.e. double-width for ideographs) and directions (i.e. BIDI for |
|---|
| 613 | Arabic and Hebrew). |
|---|
| 614 | |
|---|
| 615 | =item Thread safety |
|---|
| 616 | |
|---|
| 617 | C<use encoding ...> is not thread-safe (i.e., do not use in threaded |
|---|
| 618 | applications). |
|---|
| 619 | |
|---|
| 620 | =back |
|---|
| 621 | |
|---|
| 622 | =head2 The Logic of :locale |
|---|
| 623 | |
|---|
| 624 | The logic of C<:locale> is as follows: |
|---|
| 625 | |
|---|
| 626 | =over 4 |
|---|
| 627 | |
|---|
| 628 | =item 1. |
|---|
| 629 | |
|---|
| 630 | If the platform supports the langinfo(CODESET) interface, the codeset |
|---|
| 631 | returned is used as the default encoding for the open pragma. |
|---|
| 632 | |
|---|
| 633 | =item 2. |
|---|
| 634 | |
|---|
| 635 | If 1. didn't work but we are under the locale pragma, the environment |
|---|
| 636 | variables LC_ALL and LANG (in that order) are matched for encodings |
|---|
| 637 | (the part after C<.>, if any), and if any found, that is used |
|---|
| 638 | as the default encoding for the open pragma. |
|---|
| 639 | |
|---|
| 640 | =item 3. |
|---|
| 641 | |
|---|
| 642 | If 1. and 2. didn't work, the environment variables LC_ALL and LANG |
|---|
| 643 | (in that order) are matched for anything looking like UTF-8, and if |
|---|
| 644 | any found, C<:utf8> is used as the default encoding for the open |
|---|
| 645 | pragma. |
|---|
| 646 | |
|---|
| 647 | =back |
|---|
| 648 | |
|---|
| 649 | If your locale environment variables (LC_ALL, LC_CTYPE, LANG) |
|---|
| 650 | contain the strings 'UTF-8' or 'UTF8' (case-insensitive matching), |
|---|
| 651 | the default encoding of your STDIN, STDOUT, and STDERR, and of |
|---|
| 652 | B<any subsequent file open>, is UTF-8. |
|---|
| 653 | |
|---|
| 654 | =head1 HISTORY |
|---|
| 655 | |
|---|
| 656 | This pragma first appeared in Perl 5.8.0. For features that require |
|---|
| 657 | 5.8.1 and better, see above. |
|---|
| 658 | |
|---|
| 659 | The C<:locale> subpragma was implemented in 2.01, or Perl 5.8.6. |
|---|
| 660 | |
|---|
| 661 | =head1 SEE ALSO |
|---|
| 662 | |
|---|
| 663 | L<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>, |
|---|
| 664 | |
|---|
| 665 | Ch. 15 of C<Programming Perl (3rd Edition)> |
|---|
| 666 | by Larry Wall, Tom Christiansen, Jon Orwant; |
|---|
| 667 | O'Reilly & Associates; ISBN 0-596-00027-8 |
|---|
| 668 | |
|---|
| 669 | =cut |
|---|