| 1 | package URI::Escape::XS; |
|---|
| 2 | # |
|---|
| 3 | # $Id: XS.pm,v 0.4 2009/01/16 08:26:52 dankogai Exp dankogai $ |
|---|
| 4 | # |
|---|
| 5 | use 5.008001; |
|---|
| 6 | use warnings; |
|---|
| 7 | use strict; |
|---|
| 8 | our $VERSION = sprintf "%d.%02d", q$Revision: 0.4 $ =~ /(\d+)/g; |
|---|
| 9 | |
|---|
| 10 | use base qw(Exporter); |
|---|
| 11 | our @EXPORT = qw(encodeURIComponent decodeURIComponent |
|---|
| 12 | encodeURIComponentIDN decodeURIComponentIDN); |
|---|
| 13 | our @EXPORT_OK = qw(uri_escape uri_unescape); |
|---|
| 14 | |
|---|
| 15 | require XSLoader; |
|---|
| 16 | XSLoader::load('URI::Escape::XS', $VERSION); |
|---|
| 17 | |
|---|
| 18 | |
|---|
| 19 | sub uri_unescape { |
|---|
| 20 | wantarray |
|---|
| 21 | ? map { decodeURIComponent($_) } @_ |
|---|
| 22 | : decodeURIComponent(shift) |
|---|
| 23 | } |
|---|
| 24 | |
|---|
| 25 | { |
|---|
| 26 | use bytes; |
|---|
| 27 | my %escapes = map { chr($_) => sprintf("%%%02X", $_) } (0..255); |
|---|
| 28 | my %regexp; |
|---|
| 29 | sub uri_escape { |
|---|
| 30 | return unless @_; |
|---|
| 31 | my ($text, $patn) = @_; |
|---|
| 32 | return undef unless defined $text; |
|---|
| 33 | $text .= ''; # RT#39344 -- force string |
|---|
| 34 | if (defined $patn){ |
|---|
| 35 | unless (exists $regexp{$patn}){ |
|---|
| 36 | my $re; |
|---|
| 37 | eval { |
|---|
| 38 | $re = qr/[$patn]/; |
|---|
| 39 | }; |
|---|
| 40 | if ($@){ |
|---|
| 41 | require Carp; |
|---|
| 42 | Carp::croak(__PACKAGE__, $@); |
|---|
| 43 | } |
|---|
| 44 | $regexp{$patn} = $re; |
|---|
| 45 | } |
|---|
| 46 | $text =~ s/($regexp{$patn})/$escapes{$1}/ge; |
|---|
| 47 | return $text; |
|---|
| 48 | } else { |
|---|
| 49 | return encodeURIComponent($text); |
|---|
| 50 | } |
|---|
| 51 | } |
|---|
| 52 | } |
|---|
| 53 | |
|---|
| 54 | eval { require Net::IDN::Encode }; |
|---|
| 55 | if ( !$@ ) { |
|---|
| 56 | require Encode; |
|---|
| 57 | *decodeURIComponentIDN = sub ($) { |
|---|
| 58 | my $uri = Encode::decode_utf8( decodeURIComponent(shift) ); |
|---|
| 59 | $uri =~ s{\A (https?://)([^/:]+)(:[\d]+)?(.*) } |
|---|
| 60 | { |
|---|
| 61 | $1 |
|---|
| 62 | . Net::IDN::Encode::domain_to_unicode($2) . ($3||'') |
|---|
| 63 | . $4; |
|---|
| 64 | }msex; |
|---|
| 65 | return $uri; |
|---|
| 66 | }; |
|---|
| 67 | |
|---|
| 68 | *encodeURIComponentIDN = sub ($) { |
|---|
| 69 | my $uri = shift; |
|---|
| 70 | $uri =~ s{\A (https?)://([^/:]+)(:[\d]+)?(.*) } |
|---|
| 71 | { |
|---|
| 72 | $1 . ":%2F%2F" |
|---|
| 73 | . Net::IDN::Encode::domain_to_ascii($2) . ($3||'') |
|---|
| 74 | . encodeURIComponent($4); |
|---|
| 75 | }msex; |
|---|
| 76 | return $uri; |
|---|
| 77 | }; |
|---|
| 78 | } |
|---|
| 79 | |
|---|
| 80 | 1; |
|---|
| 81 | __END__ |
|---|
| 82 | =encoding utf8 |
|---|
| 83 | |
|---|
| 84 | =head1 NAME |
|---|
| 85 | |
|---|
| 86 | URI::Escape::XS - Drop-In replacement for URI::Escape |
|---|
| 87 | |
|---|
| 88 | =head1 VERSION |
|---|
| 89 | |
|---|
| 90 | $Id: XS.pm,v 0.4 2009/01/16 08:26:52 dankogai Exp dankogai $ |
|---|
| 91 | |
|---|
| 92 | =cut |
|---|
| 93 | |
|---|
| 94 | =head1 SYNOPSIS |
|---|
| 95 | |
|---|
| 96 | # use it instead of URI::Escape |
|---|
| 97 | use URI::Escape::XS qw/uri_escape uri_unescape/; |
|---|
| 98 | $safe = uri_escape("10% is enough\n"); |
|---|
| 99 | $verysafe = uri_escape("foo", "\0-\377"); |
|---|
| 100 | $str = uri_unescape($safe); |
|---|
| 101 | |
|---|
| 102 | # or use encodeURIComponent and decodeURIComponent |
|---|
| 103 | use URI::Escape::XS; |
|---|
| 104 | $safe = encodeURIComponent("10% is enough\n"); |
|---|
| 105 | $str = decodeURIComponent("10%25%20is%20enough%0A"); |
|---|
| 106 | |
|---|
| 107 | # if you have CNet::IDN::Encode installed |
|---|
| 108 | $safe = encodeURIComponentIDN("http://弾.jp/dan/"); |
|---|
| 109 | $str = decodeURIComponentIDN("http:%2F%2Fxn--81t.jp%2Fdan%2F"); |
|---|
| 110 | |
|---|
| 111 | =head1 EXPORT |
|---|
| 112 | |
|---|
| 113 | =head2 by default |
|---|
| 114 | |
|---|
| 115 | L</encodeURIComponent> and L</decodeURIComponent> |
|---|
| 116 | |
|---|
| 117 | L</encodeURIComponentIDN> and L</decodeURIComponentIDN> if L<Net::IDN::Encode> is available |
|---|
| 118 | |
|---|
| 119 | =head2 on demand |
|---|
| 120 | |
|---|
| 121 | L</uri_escape> and L</uri_unescape> |
|---|
| 122 | |
|---|
| 123 | =head1 FUNCTIONS |
|---|
| 124 | |
|---|
| 125 | =head2 encodeURIComponent |
|---|
| 126 | |
|---|
| 127 | Does what JavaScript's encodeURIComponent does. |
|---|
| 128 | |
|---|
| 129 | $uri = encodeURIComponent("http://www.example.com/"); |
|---|
| 130 | # http%3A%2F%2Fwww.example.com%2F |
|---|
| 131 | |
|---|
| 132 | Note you cannot customize characters to escape. If you need to do so, |
|---|
| 133 | use L</uri_escape>. |
|---|
| 134 | |
|---|
| 135 | =head2 decodeURIComponent |
|---|
| 136 | |
|---|
| 137 | Does what JavaScript's decodeURIComponent does. |
|---|
| 138 | |
|---|
| 139 | $str = decodeURIComponent("http%3A%2F%2Fwww.example.com%2F"); |
|---|
| 140 | # http://www.example.com/ |
|---|
| 141 | |
|---|
| 142 | It decode not only %HH sequences but also %uHHHH sequences, with |
|---|
| 143 | surrogate pairs correctly decoded. |
|---|
| 144 | |
|---|
| 145 | $str = decodeURIComponent("%uD869%uDEB2%u5F3E%u0061"); |
|---|
| 146 | # \x{2A6B2}\x{5F3E}a |
|---|
| 147 | |
|---|
| 148 | This function UNCONDITIONALLY returns the decoded string with utf8 flag off. To get utf8-decoded string, use L<Encode> and |
|---|
| 149 | |
|---|
| 150 | decode_utf8(decodeURIComponent($uri)); |
|---|
| 151 | |
|---|
| 152 | This is the correct behavior because you cannot tell if the decoded |
|---|
| 153 | string actually contains UTF-8 decoded string, like ISO-8859-1 and |
|---|
| 154 | Shift_JIS. |
|---|
| 155 | |
|---|
| 156 | =head2 encodeURIComponentIDN |
|---|
| 157 | |
|---|
| 158 | Same as L</encodeURIComponent> except that the host part is encoded in |
|---|
| 159 | punycode. L<Net::IDN::Encode> is required to use this function. |
|---|
| 160 | |
|---|
| 161 | URIs with Internationalizing Domain Names require two encodings: |
|---|
| 162 | Punycode for host part and URI escape for the rest. |
|---|
| 163 | |
|---|
| 164 | Currently only FULL URIs with C<http:> or C<https:> are supported. |
|---|
| 165 | |
|---|
| 166 | =head2 decodeURIComponentIDN |
|---|
| 167 | |
|---|
| 168 | Same as L</decodeURIComponent> except that the host part is encoded in |
|---|
| 169 | punycode. L<Net::IDN::Encode> is required to use this function. |
|---|
| 170 | |
|---|
| 171 | =head2 uri_escape |
|---|
| 172 | |
|---|
| 173 | Does exactly the same as L<URI::Escape>::uri_escape() B<except> |
|---|
| 174 | when utf8-flagged string is fed. |
|---|
| 175 | |
|---|
| 176 | L<URI::Escape>::uri_escape() croak and urge you to |
|---|
| 177 | C<uri_escape_utf8()> but it is pointless because URI itself has no |
|---|
| 178 | such things as utf8 flag. The function in this module ALWAYS TREATS |
|---|
| 179 | the string as byte sequence. That way you can safely use this |
|---|
| 180 | function without worring about utf8 flags. |
|---|
| 181 | |
|---|
| 182 | Note this function is NOT EXPORTED by default. That way you can use |
|---|
| 183 | L<URI::Escape> and L<URI::Escape::XS> simultaneously. |
|---|
| 184 | |
|---|
| 185 | =head2 uri_unescape |
|---|
| 186 | |
|---|
| 187 | Does exactly the same as L<URI::Escape>::uri_escape() B<except> |
|---|
| 188 | when %uHHHH is fed. |
|---|
| 189 | |
|---|
| 190 | L<URI::Escape>::uri_unescape() simply ignores %uHHHH sequences while |
|---|
| 191 | the function in this module does decode it into the corresponding |
|---|
| 192 | UTF-8 B<byte sequence>. |
|---|
| 193 | |
|---|
| 194 | Like L<uri_escape>, this funciton is NOT EXPORTED by default. |
|---|
| 195 | |
|---|
| 196 | =head2 Note on the %uHHHH sequence |
|---|
| 197 | |
|---|
| 198 | With this module the resulting strings never have the utf8 flag on. |
|---|
| 199 | So if you want to decode it to perl utf8, You have to explicitly |
|---|
| 200 | decode via L<Encode>. Remember. URIs have always been a byte |
|---|
| 201 | sequence, not UTF-8 characters. |
|---|
| 202 | |
|---|
| 203 | If the %uHHHH sequence became standard, you could have safely told if a |
|---|
| 204 | given URI is in Unicode. But more fortunately than unfortunately, the |
|---|
| 205 | RFC proposal was rejected so you cannot tell which encoding is used |
|---|
| 206 | just by looking at the URI. |
|---|
| 207 | |
|---|
| 208 | L<http://en.wikipedia.org/wiki/Percent-encoding#Non-standard_implementations> |
|---|
| 209 | |
|---|
| 210 | I said fortunately because %uHHHH can be nasty for non-BMP characters. |
|---|
| 211 | Since each %uHHHH can hold one 16-bit value, you need a I<surrogate |
|---|
| 212 | pair> to represent it if it is U+10000 and above. |
|---|
| 213 | |
|---|
| 214 | In spite of that, there are a significant number of URIs with %uHHHH |
|---|
| 215 | escapes. Therefore this module supports decoding only. |
|---|
| 216 | |
|---|
| 217 | =head1 SPEED |
|---|
| 218 | |
|---|
| 219 | Since this module uses XS, it is really fast except for |
|---|
| 220 | uri_escape("noop"). |
|---|
| 221 | |
|---|
| 222 | Regexp which is used in L<URI::Escape> is really fast for non-matching |
|---|
| 223 | but slows down significantly when it has to replace string. |
|---|
| 224 | |
|---|
| 225 | =head2 BENCHMARK |
|---|
| 226 | |
|---|
| 227 | On Macbook Pro 2GHz, Perl 5.8.8. |
|---|
| 228 | |
|---|
| 229 | http://www.google.co.jp/search?q=%E5%B0%8F%E9%A3%BC%E5%BC%BE |
|---|
| 230 | ============================================================ |
|---|
| 231 | Unescape it |
|---|
| 232 | ----------- |
|---|
| 233 | U::E 58526/s -- -88% |
|---|
| 234 | U::E::XS 486968/s 732% -- |
|---|
| 235 | -------------- |
|---|
| 236 | Escape it back |
|---|
| 237 | -------------- |
|---|
| 238 | U::E 30046/s -- -78% |
|---|
| 239 | U::E::XS 136992/s 356% -- |
|---|
| 240 | |
|---|
| 241 | www.example.com |
|---|
| 242 | =============== |
|---|
| 243 | Unescape it |
|---|
| 244 | ----------- |
|---|
| 245 | Rate U::E U::E::XS |
|---|
| 246 | U::E 821972/s -- -4% |
|---|
| 247 | U::E::XS 854732/s 4% -- |
|---|
| 248 | -------------- |
|---|
| 249 | Escape it back |
|---|
| 250 | ------------- |
|---|
| 251 | U::E::XS 522969/s -- -7% |
|---|
| 252 | U::E 565112/s 8% -- |
|---|
| 253 | |
|---|
| 254 | =head1 AUTHOR |
|---|
| 255 | |
|---|
| 256 | Dan Kogai, C<< <dankogai at dan.co.jp> >> |
|---|
| 257 | |
|---|
| 258 | =head1 BUGS |
|---|
| 259 | |
|---|
| 260 | Please report any bugs or feature requests to |
|---|
| 261 | C<bug-uri-escape-xs at rt.cpan.org>, or through the web interface at |
|---|
| 262 | L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=URI-Escape-XS>. |
|---|
| 263 | I will be notified, and then you'll automatically be notified of progress on |
|---|
| 264 | your bug as I make changes. |
|---|
| 265 | |
|---|
| 266 | =head1 SUPPORT |
|---|
| 267 | |
|---|
| 268 | You can find documentation for this module with the perldoc command. |
|---|
| 269 | |
|---|
| 270 | perldoc URI::Escape::XS |
|---|
| 271 | |
|---|
| 272 | You can also look for information at: |
|---|
| 273 | |
|---|
| 274 | =over 4 |
|---|
| 275 | |
|---|
| 276 | =item * AnnoCPAN: Annotated CPAN documentation |
|---|
| 277 | |
|---|
| 278 | L<http://annocpan.org/dist/URI-Escape-XS> |
|---|
| 279 | |
|---|
| 280 | =item * CPAN Ratings |
|---|
| 281 | |
|---|
| 282 | L<http://cpanratings.perl.org/d/URI-Escape-XS> |
|---|
| 283 | |
|---|
| 284 | =item * RT: CPAN's request tracker |
|---|
| 285 | |
|---|
| 286 | L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=URI-Escape-XS> |
|---|
| 287 | |
|---|
| 288 | =item * Search CPAN |
|---|
| 289 | |
|---|
| 290 | L<http://search.cpan.org/dist/URI-Escape-XS> |
|---|
| 291 | |
|---|
| 292 | =back |
|---|
| 293 | |
|---|
| 294 | =head1 ACKNOWLEDGEMENTS |
|---|
| 295 | |
|---|
| 296 | Gisle Aas for L<URI::Escape> |
|---|
| 297 | |
|---|
| 298 | Koichi Taniguchi for L<URI::Escape::JavaScript> |
|---|
| 299 | |
|---|
| 300 | Claus Färber for L<Net::IDN::Encode> |
|---|
| 301 | |
|---|
| 302 | =head1 COPYRIGHT & LICENSE |
|---|
| 303 | |
|---|
| 304 | Copyright 2007-2008 Dan Kogai, all rights reserved. |
|---|
| 305 | |
|---|
| 306 | This program is free software; you can redistribute it and/or modify it |
|---|
| 307 | under the same terms as Perl itself. |
|---|
| 308 | |
|---|
| 309 | =cut |
|---|