root/lang/perl/URI-Escape-XS/trunk/lib/URI/Escape/XS.pm @ 28505

Revision 28505, 7.9 kB (checked in by dankogai, 4 years ago)

VERSION 0.04

Line 
1package URI::Escape::XS;
2#
3# $Id: XS.pm,v 0.3 2009/01/16 06:38:52 dankogai Exp dankogai $
4#
5use 5.008001;
6use warnings;
7use strict;
8our $VERSION = sprintf "%d.%02d", q$Revision: 0.3 $ =~ /(\d+)/g;
9
10use base qw(Exporter);
11our @EXPORT    = qw(encodeURIComponent decodeURIComponent
12                    encodeURIComponentIDN decodeURIComponentIDN);
13our @EXPORT_OK = qw(uri_escape uri_unescape);
14
15require XSLoader;
16XSLoader::load('URI::Escape::XS', $VERSION);
17
18
19sub uri_unescape {
20    wantarray
21        ? map { decodeURIComponent($_) } @_
22        : decodeURIComponent(shift)
23}
24
25{
26    use bytes;
27    my %escapes = map { chr($_) => sprintf("%%%02X", $_) } (0..255);
28    my %regexp;
29    sub uri_escape {
30        return unless @_;
31        my ($text, $patn) = @_;
32        return undef unless defined $text;
33        $text .= '';    # RT#39344 -- force string
34        if (defined $patn){
35            unless (exists $regexp{$patn}){
36                my $re;
37                eval {
38                    $re = qr/[$patn]/;
39                };
40                if ($@){
41                    require Carp;
42                    Carp::croak(__PACKAGE__, $@);
43                }
44                $regexp{$patn} = $re;
45            }
46            $text =~ s/($regexp{$patn})/$escapes{$1}/ge;
47            return $text;
48        } else {
49            return encodeURIComponent($text);
50        }
51    }
52}
53
54eval { require Net::IDN::Encode };
55if ( !$@ ) {
56    require Encode;
57    *decodeURIComponentIDN = sub ($) {
58        my $uri = Encode::decode_utf8( decodeURIComponent(shift) );
59        $uri =~ s{\A (https?://)([^/]+)(.*) }
60                 {
61                     $1
62                         . Net::IDN::Encode::domain_to_unicode($2)
63                             . $3;
64                 }msex;
65        return $uri;
66    };
67
68    *encodeURIComponentIDN = sub ($) {
69        my $uri = shift;
70        $uri =~ s{\A (https?)://([^/]+)(.*) }
71                 {
72                     $1 . ":%2F%2F"
73                         . Net::IDN::Encode::domain_to_ascii($2)
74                             . encodeURIComponent($3);
75                 }msex;
76        return $uri;
77    };
78}
79
801;
81__END__
82=encoding utf8
83
84=head1 NAME
85
86URI::Escape::XS - Drop-In replacement for URI::Escape
87
88=head1 VERSION
89
90$Id: XS.pm,v 0.3 2009/01/16 06:38:52 dankogai Exp dankogai $
91
92=cut
93
94=head1 SYNOPSIS
95
96    # use it instead of URI::Escape
97    use URI::Escape::XS qw/uri_escape uri_unescape/;
98    $safe = uri_escape("10% is enough\n");
99    $verysafe = uri_escape("foo", "\0-\377");
100    $str  = uri_unescape($safe);
101
102    # or use encodeURIComponent and decodeURIComponent
103    use URI::Escape::XS;
104    $safe = encodeURIComponent("10% is enough\n");
105    $str  = decodeURIComponent("10%25%20is%20enough%0A");
106
107    # if you have CNet::IDN::Encode installed
108    $safe = encodeURIComponentIDN("http://弾.jp/dan/")
109    $str  = decodeURIComponentIDN("http:%2F%2Fxn--81t.jp%2Fdan%2F");
110
111=head1 EXPORT
112
113=head2 by default
114
115L</encodeURIComponent> and L</decodeURIComponent>
116
117L</encodeURIComponentIDN> and L</decodeURIComponentIDN> if L<Net::IDN::Encode> is available
118
119=head2 on demand
120
121L</uri_escape> and L</uri_unescape>
122
123=head1 FUNCTIONS
124
125=head2 encodeURIComponent
126
127Does what JavaScript's encodeURIComponent does.
128
129  $uri = encodeURIComponent("http://www.example.com/");
130  # http%3A%2F%2Fwww.example.com%2F
131
132Note you cannot customize characters to escape.  If you need to do so,
133use L</uri_escape>.
134
135=head2 decodeURIComponent
136
137Does what JavaScript's decodeURIComponent does.
138
139  $str = decodeURIComponent("http%3A%2F%2Fwww.example.com%2F");
140  # http://www.example.com/
141
142It decode not only %HH sequences but also %uHHHH sequences, with
143surrogate pairs correctly decoded.
144
145  $str = decodeURIComponent("%uD869%uDEB2%u5F3E%u0061");
146  # \x{2A6B2}\x{5F3E}a
147
148This function UNCONDITIONALLY returns the decoded string with utf8 flag off.  To get utf8-decoded string, use L<Encode> and
149
150  decode_utf8(decodeURIComponent($uri));
151
152This is the correct behavior because you cannot tell if the decoded
153string actually contains UTF-8 decoded string, like ISO-8859-1 and
154Shift_JIS.
155
156=head2 encodeURIComponentIDN
157
158Same as L</encodeURIComponent> except that the host part is encoded in
159punycode.  L<Net::IDN::Encode> is required to use this function.
160
161URIs with Internationalizing Domain Names require two encodings:
162Punycode for host part and URI escape for the rest.
163
164Currently only FULL URIs with C<http:> or C<https:> are supported.
165
166=head2 decodeURIComponentIDN
167
168Same as L</decodeURIComponent> except that the host part is encoded in
169punycode.  L<Net::IDN::Encode> is required to use this function.
170
171=head2 uri_escape
172
173Does exactly the same as L<URI::Escape>::uri_escape() B<except>
174when utf8-flagged string is fed.
175
176L<URI::Escape>::uri_escape() croak and urge you to
177C<uri_escape_utf8()> but it is pointless because URI itself has no
178such things as utf8 flag.  The function in this module ALWAYS TREATS
179the string as byte sequence.  That way you can safely use this
180function without worring about utf8 flags.
181
182Note this function is NOT EXPORTED by default.  That way you can use
183L<URI::Escape> and L<URI::Escape::XS> simultaneously.
184
185=head2 uri_unescape
186
187Does exactly the same as L<URI::Escape>::uri_escape() B<except>
188when %uHHHH is fed.
189
190L<URI::Escape>::uri_unescape() simply ignores %uHHHH sequences while
191the function in this module does decode it into the corresponding
192UTF-8 B<byte sequence>.
193
194Like L<uri_escape>, this funciton is NOT EXPORTED by default.
195
196=head2 Note on the %uHHHH sequence
197
198With this module the resulting strings never have the utf8 flag on.
199So if you want to decode it to perl utf8, You have to explicitly
200decode via L<Encode>.  Remember.  URIs have always been a byte
201sequence, not UTF-8 characters.
202
203If the %uHHHH sequence became standard, you could have safely told if a
204given URI is in Unicode.  But more fortunately than unfortunately, the
205RFC proposal was rejected so you cannot tell which encoding is used
206just by looking at the URI.
207
208L<http://en.wikipedia.org/wiki/Percent-encoding#Non-standard_implementations>
209
210I said fortunately because %uHHHH can be nasty for non-BMP characters.
211Since each %uHHHH can hold one 16-bit value, you need a I<surrogate
212pair> to represent it if it is U+10000 and above.
213
214In spite of that, there are a significant number of URIs with %uHHHH
215escapes.  Therefore this module supports decoding only.
216
217=head1 SPEED
218
219Since this module uses XS, it is really fast except for
220uri_escape("noop").
221
222Regexp which is used in L<URI::Escape> is really fast for non-matching
223but slows down significantly when it has to replace string.
224
225=head2 BENCHMARK
226
227On Macbook Pro 2GHz, Perl 5.8.8.
228
229 http://www.google.co.jp/search?q=%E5%B0%8F%E9%A3%BC%E5%BC%BE
230 ============================================================
231 Unescape it
232 -----------
233 U::E      58526/s       --     -88%
234 U::E::XS 486968/s     732%       --
235 --------------
236 Escape it back
237 --------------
238 U::E      30046/s       --     -78%
239 U::E::XS 136992/s     356%       --
240
241 www.example.com
242 ===============
243 Unescape it
244 -----------
245               Rate     U::E U::E::XS
246  U::E     821972/s       --      -4%
247  U::E::XS 854732/s       4%       --
248 --------------
249 Escape it back
250 -------------
251 U::E::XS 522969/s       --      -7%
252 U::E     565112/s       8%       --
253
254=head1 AUTHOR
255
256Dan Kogai, C<< <dankogai at dan.co.jp> >>
257
258=head1 BUGS
259
260Please report any bugs or feature requests to
261C<bug-uri-escape-xs at rt.cpan.org>, or through the web interface at
262L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=URI-Escape-XS>.
263I will be notified, and then you'll automatically be notified of progress on
264your bug as I make changes.
265
266=head1 SUPPORT
267
268You can find documentation for this module with the perldoc command.
269
270    perldoc URI::Escape::XS
271
272You can also look for information at:
273
274=over 4
275
276=item * AnnoCPAN: Annotated CPAN documentation
277
278L<http://annocpan.org/dist/URI-Escape-XS>
279
280=item * CPAN Ratings
281
282L<http://cpanratings.perl.org/d/URI-Escape-XS>
283
284=item * RT: CPAN's request tracker
285
286L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=URI-Escape-XS>
287
288=item * Search CPAN
289
290L<http://search.cpan.org/dist/URI-Escape-XS>
291
292=back
293
294=head1 ACKNOWLEDGEMENTS
295
296Gisle Aas for L<URI::Escape>
297
298Koichi Taniguchi for L<URI::Escape::JavaScript>
299
300Claus Färber for L<Net::IDN::Encode>
301
302=head1 COPYRIGHT & LICENSE
303
304Copyright 2007-2008 Dan Kogai, all rights reserved.
305
306This program is free software; you can redistribute it and/or modify it
307under the same terms as Perl itself.
308
309=cut
Note: See TracBrowser for help on using the browser.