| 1 | package Encode::MIME::Header; |
|---|
| 2 | use strict; |
|---|
| 3 | use warnings; |
|---|
| 4 | no warnings 'redefine'; |
|---|
| 5 | |
|---|
| 6 | our $VERSION = do { my @r = ( q$Revision: 2.11 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; |
|---|
| 7 | use Encode qw(find_encoding encode_utf8 decode_utf8); |
|---|
| 8 | use MIME::Base64; |
|---|
| 9 | use Carp; |
|---|
| 10 | |
|---|
| 11 | my %seed = ( |
|---|
| 12 | decode_b => '1', # decodes 'B' encoding ? |
|---|
| 13 | decode_q => '1', # decodes 'Q' encoding ? |
|---|
| 14 | encode => 'B', # encode with 'B' or 'Q' ? |
|---|
| 15 | bpl => 75, # bytes per line |
|---|
| 16 | ); |
|---|
| 17 | |
|---|
| 18 | $Encode::Encoding{'MIME-Header'} = |
|---|
| 19 | bless { %seed, Name => 'MIME-Header', } => __PACKAGE__; |
|---|
| 20 | |
|---|
| 21 | $Encode::Encoding{'MIME-B'} = bless { |
|---|
| 22 | %seed, |
|---|
| 23 | decode_q => 0, |
|---|
| 24 | Name => 'MIME-B', |
|---|
| 25 | } => __PACKAGE__; |
|---|
| 26 | |
|---|
| 27 | $Encode::Encoding{'MIME-Q'} = bless { |
|---|
| 28 | %seed, |
|---|
| 29 | decode_q => 1, |
|---|
| 30 | encode => 'Q', |
|---|
| 31 | Name => 'MIME-Q', |
|---|
| 32 | } => __PACKAGE__; |
|---|
| 33 | |
|---|
| 34 | use base qw(Encode::Encoding); |
|---|
| 35 | |
|---|
| 36 | sub needs_lines { 1 } |
|---|
| 37 | sub perlio_ok { 0 } |
|---|
| 38 | |
|---|
| 39 | sub decode($$;$) { |
|---|
| 40 | use utf8; |
|---|
| 41 | my ( $obj, $str, $chk ) = @_; |
|---|
| 42 | |
|---|
| 43 | # zap spaces between encoded words |
|---|
| 44 | $str =~ s/\?=\s+=\?/\?==\?/gos; |
|---|
| 45 | |
|---|
| 46 | # multi-line header to single line |
|---|
| 47 | $str =~ s/(?:\r\n|[\r\n])[ \t]//gos; |
|---|
| 48 | |
|---|
| 49 | 1 while ( $str =~ |
|---|
| 50 | s/(=\?[-0-9A-Za-z_]+\?[Qq]\?)(.*?)\?=\1(.*?\?=)/$1$2$3/ ) |
|---|
| 51 | ; # Concat consecutive QP encoded mime headers |
|---|
| 52 | # Fixes breaking inside multi-byte characters |
|---|
| 53 | |
|---|
| 54 | $str =~ s{ |
|---|
| 55 | =\? # begin encoded word |
|---|
| 56 | ([-0-9A-Za-z_]+) # charset (encoding) |
|---|
| 57 | (?:\*[A-Za-z]{1,8}(?:-[A-Za-z]{1,8})*)? # language (RFC 2231) |
|---|
| 58 | \?([QqBb])\? # delimiter |
|---|
| 59 | (.*?) # Base64-encodede contents |
|---|
| 60 | \?= # end encoded word |
|---|
| 61 | }{ |
|---|
| 62 | if (uc($2) eq 'B'){ |
|---|
| 63 | $obj->{decode_b} or croak qq(MIME "B" unsupported); |
|---|
| 64 | decode_b($1, $3, $chk); |
|---|
| 65 | } elsif (uc($2) eq 'Q'){ |
|---|
| 66 | $obj->{decode_q} or croak qq(MIME "Q" unsupported); |
|---|
| 67 | decode_q($1, $3, $chk); |
|---|
| 68 | } else { |
|---|
| 69 | croak qq(MIME "$2" encoding is nonexistent!); |
|---|
| 70 | } |
|---|
| 71 | }egox; |
|---|
| 72 | $_[1] = $str if $chk; |
|---|
| 73 | return $str; |
|---|
| 74 | } |
|---|
| 75 | |
|---|
| 76 | sub decode_b { |
|---|
| 77 | my $enc = shift; |
|---|
| 78 | my $d = find_encoding($enc) or croak qq(Unknown encoding "$enc"); |
|---|
| 79 | my $db64 = decode_base64(shift); |
|---|
| 80 | my $chk = shift; |
|---|
| 81 | return $d->name eq 'utf8' |
|---|
| 82 | ? Encode::decode_utf8($db64) |
|---|
| 83 | : $d->decode( $db64, $chk || Encode::FB_PERLQQ ); |
|---|
| 84 | } |
|---|
| 85 | |
|---|
| 86 | sub decode_q { |
|---|
| 87 | my ( $enc, $q, $chk ) = @_; |
|---|
| 88 | my $d = find_encoding($enc) or croak qq(Unknown encoding "$enc"); |
|---|
| 89 | $q =~ s/_/ /go; |
|---|
| 90 | $q =~ s/=([0-9A-Fa-f]{2})/pack("C", hex($1))/ego; |
|---|
| 91 | return $d->name eq 'utf8' |
|---|
| 92 | ? Encode::decode_utf8($q) |
|---|
| 93 | : $d->decode( $q, $chk || Encode::FB_PERLQQ ); |
|---|
| 94 | } |
|---|
| 95 | |
|---|
| 96 | my $especials = |
|---|
| 97 | join( '|' => map { quotemeta( chr($_) ) } |
|---|
| 98 | unpack( "C*", qq{()<>@,;:"'/[]?.=} ) ); |
|---|
| 99 | |
|---|
| 100 | my $re_encoded_word = qr{ |
|---|
| 101 | =\? # begin encoded word |
|---|
| 102 | (?:[-0-9A-Za-z_]+) # charset (encoding) |
|---|
| 103 | (?:\*[A-Za-z]{1,8}(?:-[A-Za-z]{1,8})*)? # language (RFC 2231) |
|---|
| 104 | \?(?:[QqBb])\? # delimiter |
|---|
| 105 | (?:.*?) # Base64-encodede contents |
|---|
| 106 | \?= # end encoded word |
|---|
| 107 | }xo; |
|---|
| 108 | |
|---|
| 109 | my $re_especials = qr{$re_encoded_word|$especials}xo; |
|---|
| 110 | |
|---|
| 111 | sub encode($$;$) { |
|---|
| 112 | my ( $obj, $str, $chk ) = @_; |
|---|
| 113 | my @line = (); |
|---|
| 114 | for my $line ( split /\r\n|[\r\n]/o, $str ) { |
|---|
| 115 | my ( @word, @subline ); |
|---|
| 116 | for my $word ( split /($re_especials)/o, $line ) { |
|---|
| 117 | if ( $word =~ /[^\x00-\x7f]/o |
|---|
| 118 | or $word =~ /^$re_encoded_word$/o ) |
|---|
| 119 | { |
|---|
| 120 | push @word, $obj->_encode($word); |
|---|
| 121 | } |
|---|
| 122 | else { |
|---|
| 123 | push @word, $word; |
|---|
| 124 | } |
|---|
| 125 | } |
|---|
| 126 | my $subline = ''; |
|---|
| 127 | for my $word (@word) { |
|---|
| 128 | use bytes (); |
|---|
| 129 | if ( bytes::length($subline) + bytes::length($word) > |
|---|
| 130 | $obj->{bpl} ) |
|---|
| 131 | { |
|---|
| 132 | push @subline, $subline; |
|---|
| 133 | $subline = ''; |
|---|
| 134 | } |
|---|
| 135 | $subline .= $word; |
|---|
| 136 | } |
|---|
| 137 | $subline and push @subline, $subline; |
|---|
| 138 | push @line, join( "\n " => @subline ); |
|---|
| 139 | } |
|---|
| 140 | $_[1] = '' if $chk; |
|---|
| 141 | return join( "\n", @line ); |
|---|
| 142 | } |
|---|
| 143 | |
|---|
| 144 | use constant HEAD => '=?UTF-8?'; |
|---|
| 145 | use constant TAIL => '?='; |
|---|
| 146 | use constant SINGLE => { B => \&_encode_b, Q => \&_encode_q, }; |
|---|
| 147 | |
|---|
| 148 | sub _encode { |
|---|
| 149 | my ( $o, $str ) = @_; |
|---|
| 150 | my $enc = $o->{encode}; |
|---|
| 151 | my $llen = ( $o->{bpl} - length(HEAD) - 2 - length(TAIL) ); |
|---|
| 152 | |
|---|
| 153 | # to coerce a floating-point arithmetics, the following contains |
|---|
| 154 | # .0 in numbers -- dankogai |
|---|
| 155 | $llen *= $enc eq 'B' ? 3.0 / 4.0 : 1.0 / 3.0; |
|---|
| 156 | my @result = (); |
|---|
| 157 | my $chunk = ''; |
|---|
| 158 | while ( length( my $chr = substr( $str, 0, 1, '' ) ) ) { |
|---|
| 159 | use bytes (); |
|---|
| 160 | if ( bytes::length($chunk) + bytes::length($chr) > $llen ) { |
|---|
| 161 | push @result, SINGLE->{$enc}($chunk); |
|---|
| 162 | $chunk = ''; |
|---|
| 163 | } |
|---|
| 164 | $chunk .= $chr; |
|---|
| 165 | } |
|---|
| 166 | length($chunk) and push @result, SINGLE->{$enc}($chunk); |
|---|
| 167 | return @result; |
|---|
| 168 | } |
|---|
| 169 | |
|---|
| 170 | sub _encode_b { |
|---|
| 171 | HEAD . 'B?' . encode_base64( encode_utf8(shift), '' ) . TAIL; |
|---|
| 172 | } |
|---|
| 173 | |
|---|
| 174 | sub _encode_q { |
|---|
| 175 | my $chunk = shift; |
|---|
| 176 | $chunk = encode_utf8($chunk); |
|---|
| 177 | $chunk =~ s{ |
|---|
| 178 | ([^0-9A-Za-z]) |
|---|
| 179 | }{ |
|---|
| 180 | join("" => map {sprintf "=%02X", $_} unpack("C*", $1)) |
|---|
| 181 | }egox; |
|---|
| 182 | return HEAD . 'Q?' . $chunk . TAIL; |
|---|
| 183 | } |
|---|
| 184 | |
|---|
| 185 | 1; |
|---|
| 186 | __END__ |
|---|
| 187 | |
|---|
| 188 | =head1 NAME |
|---|
| 189 | |
|---|
| 190 | Encode::MIME::Header -- MIME 'B' and 'Q' header encoding |
|---|
| 191 | |
|---|
| 192 | =head1 SYNOPSIS |
|---|
| 193 | |
|---|
| 194 | use Encode qw/encode decode/; |
|---|
| 195 | $utf8 = decode('MIME-Header', $header); |
|---|
| 196 | $header = encode('MIME-Header', $utf8); |
|---|
| 197 | |
|---|
| 198 | =head1 ABSTRACT |
|---|
| 199 | |
|---|
| 200 | This module implements RFC 2047 Mime Header Encoding. There are 3 |
|---|
| 201 | variant encoding names; C<MIME-Header>, C<MIME-B> and C<MIME-Q>. The |
|---|
| 202 | difference is described below |
|---|
| 203 | |
|---|
| 204 | decode() encode() |
|---|
| 205 | ---------------------------------------------- |
|---|
| 206 | MIME-Header Both B and Q =?UTF-8?B?....?= |
|---|
| 207 | MIME-B B only; Q croaks =?UTF-8?B?....?= |
|---|
| 208 | MIME-Q Q only; B croaks =?UTF-8?Q?....?= |
|---|
| 209 | |
|---|
| 210 | =head1 DESCRIPTION |
|---|
| 211 | |
|---|
| 212 | When you decode(=?I<encoding>?I<X>?I<ENCODED WORD>?=), I<ENCODED WORD> |
|---|
| 213 | is extracted and decoded for I<X> encoding (B for Base64, Q for |
|---|
| 214 | Quoted-Printable). Then the decoded chunk is fed to |
|---|
| 215 | decode(I<encoding>). So long as I<encoding> is supported by Encode, |
|---|
| 216 | any source encoding is fine. |
|---|
| 217 | |
|---|
| 218 | When you encode, it just encodes UTF-8 string with I<X> encoding then |
|---|
| 219 | quoted with =?UTF-8?I<X>?....?= . The parts that RFC 2047 forbids to |
|---|
| 220 | encode are left as is and long lines are folded within 76 bytes per |
|---|
| 221 | line. |
|---|
| 222 | |
|---|
| 223 | =head1 BUGS |
|---|
| 224 | |
|---|
| 225 | It would be nice to support encoding to non-UTF8, such as =?ISO-2022-JP? |
|---|
| 226 | and =?ISO-8859-1?= but that makes the implementation too complicated. |
|---|
| 227 | These days major mail agents all support =?UTF-8? so I think it is |
|---|
| 228 | just good enough. |
|---|
| 229 | |
|---|
| 230 | Due to popular demand, 'MIME-Header-ISO_2022_JP' was introduced by |
|---|
| 231 | Makamaka. Thre are still too many MUAs especially cellular phone |
|---|
| 232 | handsets which does not grok UTF-8. |
|---|
| 233 | |
|---|
| 234 | =head1 SEE ALSO |
|---|
| 235 | |
|---|
| 236 | L<Encode> |
|---|
| 237 | |
|---|
| 238 | RFC 2047, L<http://www.faqs.org/rfcs/rfc2047.html> and many other |
|---|
| 239 | locations. |
|---|
| 240 | |
|---|
| 241 | =cut |
|---|