| 1 | # -------------------------------------------------------------------- # |
|---|
| 2 | # EscapeUTF8.pm - Escape ab-normal charactors in UTF-8 |
|---|
| 3 | # Copyright (C) 2000-2004 Kawasaki Yuusuke <u-suke@kawa.net> |
|---|
| 4 | # -------------------------------------------------------------------- # |
|---|
| 5 | # 2004/10/24 EntityRef.pm ��б� |
|---|
| 6 | # 2004/11/07 POD �������ޤ�����EntityRef.pm ���� |
|---|
| 7 | # -------------------------------------------------------------------- # |
|---|
| 8 | package EscapeUTF8; |
|---|
| 9 | use strict; |
|---|
| 10 | use vars qw( $VERSION ); |
|---|
| 11 | $VERSION = "0.01"; |
|---|
| 12 | # -------------------------------------------------------------------- # |
|---|
| 13 | =head1 NAME |
|---|
| 14 | |
|---|
| 15 | EscapeUTF8.pm - Escape IBM extended Kanji and emoji in Shift_JIS |
|---|
| 16 | |
|---|
| 17 | =head1 SYNOPSIS |
|---|
| 18 | |
|---|
| 19 | use EscapeUTF8; |
|---|
| 20 | my $text = ""; |
|---|
| 21 | EscapeUTF8::escape( \$text, $ENV{HTTP_USER_AGENT} ); |
|---|
| 22 | EscapeUTF8::unescape( \$text, $ENV{HTTP_USER_AGENT} ); |
|---|
| 23 | |
|---|
| 24 | =head1 DESCRIPTION |
|---|
| 25 | |
|---|
| 26 | escape( UTF8_STRING, USER_AGENT ); |
|---|
| 27 | |
|---|
| 28 | ���⤷�ޤ������� |
|---|
| 29 | unescape( UTF8_STRING, USER_AGENT ); |
|---|
| 30 | |
|---|
| 31 | UTF8_STRING�ʥ����顼ʸ���Υ����������ޤ��ϥ����顼ʸ������ |
|---|
| 32 | &#xHHHH; ����˥��������פ��줿�ʬ��F-8 �Х��ʥ�������ޤ��� |
|---|
| 33 | |
|---|
| 34 | =head1 COPYRIGHT |
|---|
| 35 | |
|---|
| 36 | Copyright 2004 Kawasaki Yusuke <u-suke@kawa.net> |
|---|
| 37 | http://www.kawa.net/ |
|---|
| 38 | |
|---|
| 39 | =cut |
|---|
| 40 | # -------------------------------------------------------------------- # |
|---|
| 41 | # EntityRef.pm �����夹�� -------------------------------------------------------------------- # |
|---|
| 42 | # my $ENTITYREF; |
|---|
| 43 | # -------------------------------------------------------------------- # |
|---|
| 44 | # ���������סʲ��⤷�ʤ���# -------------------------------------------------------------------- # |
|---|
| 45 | sub escape { |
|---|
| 46 | my $src = \$_[0]; |
|---|
| 47 | my $ref = ref $$src ? $$src : $src; # ɬ���������������� |
|---|
| 48 | |
|---|
| 49 | # nothing to do. |
|---|
| 50 | |
|---|
| 51 | undef; |
|---|
| 52 | } |
|---|
| 53 | # -------------------------------------------------------------------- # |
|---|
| 54 | # UTF-8 ���&#ddddd; &#xHHHH; ɽ����������ʥ������� -------------------------------------------------------------------- # |
|---|
| 55 | sub unescape { |
|---|
| 56 | my $src = \$_[0]; |
|---|
| 57 | my $ref = ref $$src ? $$src : $src; # ɬ���������������� |
|---|
| 58 | |
|---|
| 59 | # # € ��λ��Ȥ�xHHHH; �����Ѵ����� if ( defined $EntityRef::VERSION ) { |
|---|
| 60 | # $ENTITYREF ||= new EntityRef(); |
|---|
| 61 | # if ( $ENTITYREF ) { |
|---|
| 62 | # $ENTITYREF->entity2hex( $ref ); |
|---|
| 63 | # } |
|---|
| 64 | # } |
|---|
| 65 | |
|---|
| 66 | # &#ddddd; ��#xHHHH; ����ʥ�������� $$ref =~ s{ |
|---|
| 67 | (\&\#(?:([0-9]{3,5})|x([0-9A-Fa-f]{4}));) |
|---|
| 68 | }{ |
|---|
| 69 | my $str = $1; |
|---|
| 70 | my $conv = &one_unescape_utf8( $2||hex($3) ); |
|---|
| 71 | $str = $conv if defined $conv; |
|---|
| 72 | $str; |
|---|
| 73 | }gex; |
|---|
| 74 | } |
|---|
| 75 | # -------------------------------------------------------------------- # |
|---|
| 76 | # Unicode ��椫��TF8 �Х��ʥ����������ʸ����# -------------------------------------------------------------------- # |
|---|
| 77 | sub one_unescape_utf8 { |
|---|
| 78 | my $code = shift; # Unicode ������ֹ� my $str = undef; |
|---|
| 79 | if ( $code < 0x0080 || $code >= 0xFFFE ) { |
|---|
| 80 | # ASCII �ʬ���£ϣ͡���£ͣФ�Ѵ����ʤ� |
|---|
| 81 | } elsif ( $code >= 0xE000 && $code <= 0xF8FF ) { |
|---|
| 82 | # Unicode �����������ʤ� |
|---|
| 83 | } elsif ( $code >= 63647 && $code <= 63996 ) { |
|---|
| 84 | # �ɥ��⳨ʸ��ift_JISɽ�����Υ�������Ѵ����ʤ� |
|---|
| 85 | } elsif ( $code <= 0x07FF ) { |
|---|
| 86 | # 0x0080��x07FF ��Ѵ����� $str = pack( "C*" => 0xC0|($code>>6),0x80|($code&0x3F)); |
|---|
| 87 | } elsif ( $code <= 0xFFFD ) { |
|---|
| 88 | # 0x0800��xFFFD ��Ѵ����� $str = pack( "C*" => 0xE0|($code>>12), |
|---|
| 89 | 0x80|(($code>>6)&0x3F), |
|---|
| 90 | 0x80|($code&0x3F)); |
|---|
| 91 | } |
|---|
| 92 | $str; |
|---|
| 93 | } |
|---|
| 94 | # -------------------------------------------------------------------- # |
|---|
| 95 | # ��� MIME �إå�ǥ�����# -------------------------------------------------------------------- # |
|---|
| 96 | sub mime_decode { |
|---|
| 97 | my $src = \$_[0]; |
|---|
| 98 | my $ref = ref $$src ? $$src : $src; # ɬ���������������� |
|---|
| 99 | &require_mime_base64(); # MIME::Base64 ��߹��� $$ref =~ s{ |
|---|
| 100 | \=\?UTF-8\?B\?([^\s\?]+)\?\= |
|---|
| 101 | }{ |
|---|
| 102 | MIME::Base64::decode_base64($1); |
|---|
| 103 | }iegx; |
|---|
| 104 | } |
|---|
| 105 | # -------------------------------------------------------------------- # |
|---|
| 106 | # ��� MIME �إå������# -------------------------------------------------------------------- # |
|---|
| 107 | sub mime_encode { |
|---|
| 108 | my $src = \$_[0]; |
|---|
| 109 | my $ref = ref $$src ? $$src : $src; # ɬ���������������� |
|---|
| 110 | &require_mime_base64(); # MIME::Base64 ��߹��� $$ref =~ s{ |
|---|
| 111 | ( \e\$B ((?:[\x21-\x7E][\x21-\x7E])+) \e\(B ) |
|---|
| 112 | }{ |
|---|
| 113 | "=?UTF-8?B?".MIME::Base64::encode_base64($1, "")."?="; |
|---|
| 114 | }egx; |
|---|
| 115 | } |
|---|
| 116 | # -------------------------------------------------------------------- # |
|---|
| 117 | sub require_mime_base64 { |
|---|
| 118 | if ( ! defined $MIME::Base64::VERSION ) { |
|---|
| 119 | eval 'require "MIME/Base64.pm";'; |
|---|
| 120 | } |
|---|
| 121 | if ( ! defined $MIME::Base64::VERSION ) { |
|---|
| 122 | die "MIME::Base64 is required for EscapeUTF8::mime_encode()\n"; |
|---|
| 123 | } |
|---|
| 124 | } |
|---|
| 125 | # -------------------------------------------------------------------- # |
|---|
| 126 | ;1; # End of the script. |
|---|
| 127 | # -------------------------------------------------------------------- # |
|---|