| 1 | =head1 NAME |
|---|
| 2 | |
|---|
| 3 | XML::TreePP -- Pure Perl implementation for parsing/writing xml files |
|---|
| 4 | |
|---|
| 5 | =head1 SYNOPSIS |
|---|
| 6 | |
|---|
| 7 | parse xml file into hash tree |
|---|
| 8 | |
|---|
| 9 | use XML::TreePP; |
|---|
| 10 | my $tpp = XML::TreePP->new(); |
|---|
| 11 | my $tree = $tpp->parsefile( "index.rdf" ); |
|---|
| 12 | print "Title: ", $tree->{"rdf:RDF"}->{item}->[0]->{title}, "\n"; |
|---|
| 13 | print "URL: ", $tree->{"rdf:RDF"}->{item}->[0]->{link}, "\n"; |
|---|
| 14 | |
|---|
| 15 | write xml as string from hash tree |
|---|
| 16 | |
|---|
| 17 | use XML::TreePP; |
|---|
| 18 | my $tpp = XML::TreePP->new(); |
|---|
| 19 | my $tree = { rss => { channel => { item => [ { |
|---|
| 20 | title => "The Perl Directory", |
|---|
| 21 | link => "http://www.perl.org/", |
|---|
| 22 | }, { |
|---|
| 23 | title => "The Comprehensive Perl Archive Network", |
|---|
| 24 | link => "http://cpan.perl.org/", |
|---|
| 25 | } ] } } }; |
|---|
| 26 | my $xml = $tpp->write( $tree ); |
|---|
| 27 | print $xml; |
|---|
| 28 | |
|---|
| 29 | get remote xml file with HTTP-GET and parse it into hash tree |
|---|
| 30 | |
|---|
| 31 | use XML::TreePP; |
|---|
| 32 | my $tpp = XML::TreePP->new(); |
|---|
| 33 | my $tree = $tpp->parsehttp( GET => "http://use.perl.org/index.rss" ); |
|---|
| 34 | print "Title: ", $tree->{"rdf:RDF"}->{channel}->{title}, "\n"; |
|---|
| 35 | print "URL: ", $tree->{"rdf:RDF"}->{channel}->{link}, "\n"; |
|---|
| 36 | |
|---|
| 37 | get remote xml file with HTTP-POST and parse it into hash tree |
|---|
| 38 | |
|---|
| 39 | use XML::TreePP; |
|---|
| 40 | my $tpp = XML::TreePP->new( force_array => [qw( item )] ); |
|---|
| 41 | my $cgiurl = "http://search.hatena.ne.jp/keyword"; |
|---|
| 42 | my $keyword = "ajax"; |
|---|
| 43 | my $cgiquery = "mode=rss2&word=".$keyword; |
|---|
| 44 | my $tree = $tpp->parsehttp( POST => $cgiurl, $cgiquery ); |
|---|
| 45 | print "Link: ", $tree->{rss}->{channel}->{item}->[0]->{link}, "\n"; |
|---|
| 46 | print "Desc: ", $tree->{rss}->{channel}->{item}->[0]->{description}, "\n"; |
|---|
| 47 | |
|---|
| 48 | =head1 DESCRIPTION |
|---|
| 49 | |
|---|
| 50 | XML::TreePP module parses XML file and expands it for a hash tree. |
|---|
| 51 | And also generate XML file from a hash tree. |
|---|
| 52 | This is a pure Perl implementation. |
|---|
| 53 | You can also download XML from remote web server |
|---|
| 54 | like XMLHttpRequest object at JavaScript language. |
|---|
| 55 | |
|---|
| 56 | =head1 EXAMPLES |
|---|
| 57 | |
|---|
| 58 | =head2 Parse XML file |
|---|
| 59 | |
|---|
| 60 | Sample XML source: |
|---|
| 61 | |
|---|
| 62 | <?xml version="1.0" encoding="UTF-8"?> |
|---|
| 63 | <family name="Kawasaki"> |
|---|
| 64 | <father>Yasuhisa</father> |
|---|
| 65 | <mother>Chizuko</mother> |
|---|
| 66 | <children> |
|---|
| 67 | <girl>Shiori</girl> |
|---|
| 68 | <boy>Yusuke</boy> |
|---|
| 69 | <boy>Kairi</boy> |
|---|
| 70 | </children> |
|---|
| 71 | </family> |
|---|
| 72 | |
|---|
| 73 | Sample program to read a xml file and dump it: |
|---|
| 74 | |
|---|
| 75 | use XML::TreePP; |
|---|
| 76 | use Data::Dumper; |
|---|
| 77 | my $tpp = XML::TreePP->new(); |
|---|
| 78 | my $tree = $tpp->parsefile( "family.xml" ); |
|---|
| 79 | my $text = Dumper( $tree ); |
|---|
| 80 | print $text; |
|---|
| 81 | |
|---|
| 82 | Result dumped: |
|---|
| 83 | |
|---|
| 84 | $VAR1 = { |
|---|
| 85 | 'family' => { |
|---|
| 86 | '-name' => 'Kawasaki', |
|---|
| 87 | 'father' => 'Yasuhisa', |
|---|
| 88 | 'mother' => 'Chizuko', |
|---|
| 89 | 'children' => { |
|---|
| 90 | 'girl' => 'Shiori' |
|---|
| 91 | 'boy' => [ |
|---|
| 92 | 'Yusuke', |
|---|
| 93 | 'Kairi' |
|---|
| 94 | ], |
|---|
| 95 | } |
|---|
| 96 | } |
|---|
| 97 | }; |
|---|
| 98 | |
|---|
| 99 | Details: |
|---|
| 100 | |
|---|
| 101 | print $tree->{family}->{father}; # the father's given name. |
|---|
| 102 | |
|---|
| 103 | The prefix '-' is added on every attributes' name. |
|---|
| 104 | |
|---|
| 105 | print $tree->{family}->{"-name"}; # the family name of the family |
|---|
| 106 | |
|---|
| 107 | The array is used because the family has two boys. |
|---|
| 108 | |
|---|
| 109 | print $tree->{family}->{children}->{boy}->[1]; # The second boy's name |
|---|
| 110 | print $tree->{family}->{children}->{girl}; # The girl's name |
|---|
| 111 | |
|---|
| 112 | =head2 Text node and attributes: |
|---|
| 113 | |
|---|
| 114 | If a element has both of a text node and attributes |
|---|
| 115 | or both of a text node and other child nodes, |
|---|
| 116 | value of a text node is moved to '#text' like child nodes. |
|---|
| 117 | |
|---|
| 118 | use XML::TreePP; |
|---|
| 119 | use Data::Dumper; |
|---|
| 120 | my $tpp = XML::TreePP->new(); |
|---|
| 121 | my $source = '<span class="author">Kawasaki Yusuke</span>'; |
|---|
| 122 | my $tree = $tpp->parse( $source ); |
|---|
| 123 | my $text = Dumper( $tree ); |
|---|
| 124 | print $text; |
|---|
| 125 | |
|---|
| 126 | The result dumped is following: |
|---|
| 127 | |
|---|
| 128 | $VAR1 = { |
|---|
| 129 | 'span' => { |
|---|
| 130 | '-class' => 'author', |
|---|
| 131 | '#text' => 'Kawasaki Yusuke' |
|---|
| 132 | } |
|---|
| 133 | }; |
|---|
| 134 | |
|---|
| 135 | The special node name of '#text' is used because this elements |
|---|
| 136 | has attribute(s) in addition to the text node. |
|---|
| 137 | |
|---|
| 138 | =head1 CONSTRUCTOR AND OPTIONS |
|---|
| 139 | |
|---|
| 140 | =head2 $tpp = XML::TreePP->new(); |
|---|
| 141 | |
|---|
| 142 | This constructor method returns a new XML::TreePP object. |
|---|
| 143 | |
|---|
| 144 | =head2 $tpp = XML::TreePP->new( %options ); |
|---|
| 145 | |
|---|
| 146 | Its first argument is a hash variable to set one or more options |
|---|
| 147 | like following: |
|---|
| 148 | |
|---|
| 149 | =head2 $tpp->set( option_name => $option_value ); |
|---|
| 150 | |
|---|
| 151 | This method sets a option value for "option_name". |
|---|
| 152 | If $option_value is not defined, its option is deleted. |
|---|
| 153 | Options below are available: |
|---|
| 154 | |
|---|
| 155 | =head2 $tpp->set( output_encoding => 'UTF-8' ); |
|---|
| 156 | |
|---|
| 157 | You can define a encoding of xml file generated by write/writefile |
|---|
| 158 | methods. On Perl 5.8.x and later, you can select it from every |
|---|
| 159 | encodings supported by Encode.pm. On Perl 5.6.x or before with |
|---|
| 160 | Jcode.pm, you can use 'Shift_JIS', 'EUC-JP', 'ISO-2022-JP' and |
|---|
| 161 | 'UTF-8'. The default value is 'UTF-8'. |
|---|
| 162 | |
|---|
| 163 | =head2 $tpp->set( force_array => [ 'rdf:li', 'item', '-xmlns' ] ); |
|---|
| 164 | |
|---|
| 165 | This option allows you to specify a list of element names which |
|---|
| 166 | should always be forced into an array representation |
|---|
| 167 | The default value is null, it means that context of the elements |
|---|
| 168 | will determine to make array or to keep it scalar. |
|---|
| 169 | |
|---|
| 170 | =head2 $tpp->set( first_out => [ 'link', 'title', '-type' ] ); |
|---|
| 171 | |
|---|
| 172 | This option allows you to specify a list of element/attribute |
|---|
| 173 | names which should always appears at first on output XML code. |
|---|
| 174 | The default value is null, it means alphabetical order is used. |
|---|
| 175 | |
|---|
| 176 | =head2 $tpp->set( last_out => [ 'items', 'item', 'entry' ] ); |
|---|
| 177 | |
|---|
| 178 | This option allows you to specify a list of element/attribute |
|---|
| 179 | names which should always appears at last on output XML code. |
|---|
| 180 | |
|---|
| 181 | =head2 $tpp->set( cdata_scalar_ref => 1 ); |
|---|
| 182 | |
|---|
| 183 | This option allows you to convert a cdata section into a reference |
|---|
| 184 | for scalar on parsing XML source. If this option is false, per |
|---|
| 185 | default, cdata section is converted into a scalar. |
|---|
| 186 | |
|---|
| 187 | =head2 $tpp->set( user_agent => 'Mozilla/4.0 (compatible; ...)' ); |
|---|
| 188 | |
|---|
| 189 | This option allows you to specify a HTTP_USER_AGENT string which |
|---|
| 190 | is used by parsehttp() method. |
|---|
| 191 | The default string is "XML-TreePP/#.##", where "#.##" is |
|---|
| 192 | substituted with the version number of this library. |
|---|
| 193 | |
|---|
| 194 | =head2 $tpp->set( attr_prefix => '@' ); |
|---|
| 195 | |
|---|
| 196 | This option allows you to specify a prefix character(s) which |
|---|
| 197 | is inserted before each attribute names. |
|---|
| 198 | The default character is '-'. |
|---|
| 199 | Or set '@' to access attribute values like E4X, ECMAScript for XML. |
|---|
| 200 | |
|---|
| 201 | =head2 $tpp->set( ignore_error => 1 ); |
|---|
| 202 | |
|---|
| 203 | This module calls Carp::croak function on an error per default. |
|---|
| 204 | This option makes all errors ignored and just return. |
|---|
| 205 | |
|---|
| 206 | =head2 $tpp->set( xml_decl => '' ); |
|---|
| 207 | |
|---|
| 208 | This module generates an XML declaration on writing an XML code per default. |
|---|
| 209 | This option forces to change or leave it. |
|---|
| 210 | |
|---|
| 211 | =head2 $tpp->get( "option_name" ); |
|---|
| 212 | |
|---|
| 213 | This method returns a current option value for "option_name". |
|---|
| 214 | |
|---|
| 215 | =head1 METHODS |
|---|
| 216 | |
|---|
| 217 | =head2 $tree = $tpp->parse( $source ); |
|---|
| 218 | |
|---|
| 219 | This method reads XML source and returns a hash tree converted. |
|---|
| 220 | The first argument is a scalar or a reference to a scalar. |
|---|
| 221 | |
|---|
| 222 | =head2 $tree = $tpp->parsefile( $file ); |
|---|
| 223 | |
|---|
| 224 | This method reads a XML file and returns a hash tree converted. |
|---|
| 225 | The first argument is a filename. |
|---|
| 226 | |
|---|
| 227 | =head2 $tree = $tpp->parsehttp( $method, $url, $body, $head ); |
|---|
| 228 | |
|---|
| 229 | This method receives a XML file from a remote server via HTTP and |
|---|
| 230 | returns a hash tree converted. |
|---|
| 231 | $method is a method of HTTP connection: GET/POST/PUT/DELETE |
|---|
| 232 | $url is an URI of a XML file. |
|---|
| 233 | $body is a request body when you use POST method. |
|---|
| 234 | $head is a request headers as a hash ref. |
|---|
| 235 | LWP::UserAgent module or HTTP::Lite module is required to fetch a file. |
|---|
| 236 | |
|---|
| 237 | =head2 $source = $tpp->write( $tree, $encode ); |
|---|
| 238 | |
|---|
| 239 | This method parses a hash tree and returns a XML source generated. |
|---|
| 240 | $tree is a referecen to a hash tree. |
|---|
| 241 | |
|---|
| 242 | =head2 $tpp->writefile( $file, $tree, $encode ); |
|---|
| 243 | |
|---|
| 244 | This method parses a hash tree and writes a XML source into a file. |
|---|
| 245 | $file is a filename to create. |
|---|
| 246 | $tree is a referecen to a hash tree. |
|---|
| 247 | |
|---|
| 248 | =head1 AUTHOR |
|---|
| 249 | |
|---|
| 250 | Yusuke Kawasaki, http://www.kawa.net/ |
|---|
| 251 | |
|---|
| 252 | =head1 COPYRIGHT AND LICENSE |
|---|
| 253 | |
|---|
| 254 | Copyright (c) 2006 Yusuke Kawasaki. All rights reserved. This program |
|---|
| 255 | is free software; you can redistribute it and/or modify it under the same |
|---|
| 256 | terms as Perl itself. |
|---|
| 257 | |
|---|
| 258 | =cut |
|---|
| 259 | |
|---|
| 260 | package XML::TreePP; |
|---|
| 261 | use strict; |
|---|
| 262 | use Carp; |
|---|
| 263 | use Symbol; |
|---|
| 264 | |
|---|
| 265 | use vars qw( $VERSION ); |
|---|
| 266 | $VERSION = '0.18'; |
|---|
| 267 | |
|---|
| 268 | my $XML_ENCODING = 'UTF-8'; |
|---|
| 269 | my $INTERNAL_ENCODING = 'UTF-8'; |
|---|
| 270 | my $USER_AGENT = 'XML-TreePP/'.$VERSION.' '; |
|---|
| 271 | my $ATTR_PREFIX = '-'; |
|---|
| 272 | |
|---|
| 273 | sub new { |
|---|
| 274 | my $package = shift; |
|---|
| 275 | my $self = {@_}; |
|---|
| 276 | bless $self, $package; |
|---|
| 277 | $self; |
|---|
| 278 | } |
|---|
| 279 | |
|---|
| 280 | sub die { |
|---|
| 281 | my $self = shift; |
|---|
| 282 | my $mess = shift; |
|---|
| 283 | return if $self->{ignore_error}; |
|---|
| 284 | Carp::croak $mess; |
|---|
| 285 | } |
|---|
| 286 | |
|---|
| 287 | sub warn { |
|---|
| 288 | my $self = shift; |
|---|
| 289 | my $mess = shift; |
|---|
| 290 | return if $self->{ignore_error}; |
|---|
| 291 | Carp::carp $mess; |
|---|
| 292 | } |
|---|
| 293 | |
|---|
| 294 | sub set { |
|---|
| 295 | my $self = shift; |
|---|
| 296 | my $key = shift; |
|---|
| 297 | my $val = shift; |
|---|
| 298 | if ( defined $val ) { |
|---|
| 299 | $self->{$key} = $val; |
|---|
| 300 | } |
|---|
| 301 | else { |
|---|
| 302 | delete $self->{$key}; |
|---|
| 303 | } |
|---|
| 304 | } |
|---|
| 305 | |
|---|
| 306 | sub get { |
|---|
| 307 | my $self = shift; |
|---|
| 308 | my $key = shift; |
|---|
| 309 | $self->{$key} if exists $self->{$key}; |
|---|
| 310 | } |
|---|
| 311 | |
|---|
| 312 | sub writefile { |
|---|
| 313 | my $self = shift; |
|---|
| 314 | my $file = shift; |
|---|
| 315 | my $tree = shift or return $self->die( 'Invalid tree' ); |
|---|
| 316 | my $encode = shift; |
|---|
| 317 | return $self->die( 'Invalid filename' ) unless defined $file; |
|---|
| 318 | my $text = $self->write( $tree, $encode ); |
|---|
| 319 | $self->write_raw_xml( $file, $text ); |
|---|
| 320 | } |
|---|
| 321 | |
|---|
| 322 | sub write { |
|---|
| 323 | my $self = shift; |
|---|
| 324 | my $tree = shift or return $self->die( 'Invalid tree' ); |
|---|
| 325 | my $from = $self->{internal_encoding} || $INTERNAL_ENCODING; |
|---|
| 326 | my $to = shift || $self->{output_encoding} || $XML_ENCODING; |
|---|
| 327 | my $decl = $self->{xml_decl}; |
|---|
| 328 | $decl = '<?xml version="1.0" encoding="' . $to . '" ?>' unless defined $decl; |
|---|
| 329 | if ( exists $self->{first_out} ) { |
|---|
| 330 | my $keys = $self->{first_out}; |
|---|
| 331 | $keys = [$keys] unless ref $keys; |
|---|
| 332 | $self->{__first_out} = { map { $_ => 1 } @$keys }; |
|---|
| 333 | } |
|---|
| 334 | if ( exists $self->{last_out} ) { |
|---|
| 335 | my $keys = $self->{last_out}; |
|---|
| 336 | $keys = [$keys] unless ref $keys; |
|---|
| 337 | $self->{__last_out} = { map { $_ => 1 } @$keys }; |
|---|
| 338 | } |
|---|
| 339 | my $text = $self->hash_to_xml( undef, $tree ); |
|---|
| 340 | if ( $from && $to ) { |
|---|
| 341 | my $stat = $self->encode_from_to( \$text, $from, $to ); |
|---|
| 342 | return $self->die( "Unsupported encoding: $to" ) unless $stat; |
|---|
| 343 | } |
|---|
| 344 | return $text if ( $decl eq '' ); |
|---|
| 345 | join( "\n", $decl, $text ); |
|---|
| 346 | } |
|---|
| 347 | |
|---|
| 348 | sub parsehttp { |
|---|
| 349 | my $self = shift; |
|---|
| 350 | |
|---|
| 351 | if ( exists $self->{user_agent} ) { |
|---|
| 352 | my $agent = $self->{user_agent}; |
|---|
| 353 | $agent .= $USER_AGENT if ( $agent =~ /\s$/s ); |
|---|
| 354 | $self->{__user_agent} = $agent if ( $agent ne '' ); |
|---|
| 355 | } else { |
|---|
| 356 | $self->{__user_agent} = $USER_AGENT; |
|---|
| 357 | } |
|---|
| 358 | |
|---|
| 359 | my $http = $self->{__http_module}; |
|---|
| 360 | unless ( $http ) { |
|---|
| 361 | $http = &find_http_module(); |
|---|
| 362 | $self->{__http_module} = $http; |
|---|
| 363 | } |
|---|
| 364 | |
|---|
| 365 | if ( $http eq 'LWP::UserAgent' ) { |
|---|
| 366 | return $self->parsehttp_lwp(@_); |
|---|
| 367 | } |
|---|
| 368 | elsif ( $http eq 'HTTP::Lite' ) { |
|---|
| 369 | return $self->parsehttp_lite(@_); |
|---|
| 370 | } |
|---|
| 371 | else { |
|---|
| 372 | return $self->die( "LWP::UserAgent or HTTP::Lite is required: $_[1]" ); |
|---|
| 373 | } |
|---|
| 374 | } |
|---|
| 375 | |
|---|
| 376 | sub find_http_module { |
|---|
| 377 | return 'LWP::UserAgent' if defined $LWP::UserAgent::VERSION; |
|---|
| 378 | return 'HTTP::Lite' if defined $HTTP::Lite::VERSION; |
|---|
| 379 | |
|---|
| 380 | local $@; |
|---|
| 381 | eval { require LWP::UserAgent; }; |
|---|
| 382 | return 'LWP::UserAgent' if defined $LWP::UserAgent::VERSION; |
|---|
| 383 | |
|---|
| 384 | eval { require HTTP::Lite; }; |
|---|
| 385 | return 'HTTP::Lite' if defined $HTTP::Lite::VERSION; |
|---|
| 386 | return; |
|---|
| 387 | } |
|---|
| 388 | |
|---|
| 389 | sub parsehttp_lwp { |
|---|
| 390 | my $self = shift; |
|---|
| 391 | my $method = shift or return $self->die( 'Invalid HTTP method' ); |
|---|
| 392 | my $url = shift or return $self->die( 'Invalid URL' ); |
|---|
| 393 | my $body = shift; |
|---|
| 394 | my $header = shift; |
|---|
| 395 | |
|---|
| 396 | my $ua = LWP::UserAgent->new(); |
|---|
| 397 | $ua->timeout(10); |
|---|
| 398 | $ua->env_proxy(); |
|---|
| 399 | |
|---|
| 400 | $ua->agent( $self->{__user_agent} ) if defined $self->{__user_agent}; |
|---|
| 401 | my $req = HTTP::Request->new( $method, $url ); |
|---|
| 402 | my $ct = 0; |
|---|
| 403 | if ( ref $header ) { |
|---|
| 404 | foreach my $field ( sort keys %$header ) { |
|---|
| 405 | my $value = $header->{$field}; |
|---|
| 406 | $req->header( $field => $value ); |
|---|
| 407 | $ct ++ if ( $field =~ /^Content-Type$/i ); |
|---|
| 408 | } |
|---|
| 409 | } |
|---|
| 410 | if ( defined $body && ! $ct ) { |
|---|
| 411 | $req->header( 'Content-Type' => 'application/x-www-form-urlencoded' ); |
|---|
| 412 | } |
|---|
| 413 | $req->content($body) if defined $body; |
|---|
| 414 | my $res = $ua->request($req); |
|---|
| 415 | return unless $res->is_success(); |
|---|
| 416 | my $text = $res->content(); |
|---|
| 417 | $self->parse( \$text ); |
|---|
| 418 | } |
|---|
| 419 | |
|---|
| 420 | sub parsehttp_lite { |
|---|
| 421 | my $self = shift; |
|---|
| 422 | my $method = shift or return $self->die( 'Invalid HTTP method' ); |
|---|
| 423 | my $url = shift or return $self->die( 'Invalid URL' ); |
|---|
| 424 | my $body = shift; |
|---|
| 425 | my $header = shift; |
|---|
| 426 | |
|---|
| 427 | my $http = HTTP::Lite->new(); |
|---|
| 428 | $http->method($method); |
|---|
| 429 | my $ua = 0; |
|---|
| 430 | if ( ref $header ) { |
|---|
| 431 | foreach my $field ( sort keys %$header ) { |
|---|
| 432 | my $value = $header->{$field}; |
|---|
| 433 | $http->add_req_header( $field, $value ); |
|---|
| 434 | $ua ++ if ( $field =~ /^User-Agent$/i ); |
|---|
| 435 | } |
|---|
| 436 | } |
|---|
| 437 | if ( defined $self->{__user_agent} && ! $ua ) { |
|---|
| 438 | $http->add_req_header( 'User-Agent', $self->{__user_agent} ); |
|---|
| 439 | } |
|---|
| 440 | $http->{content} = $body if defined $body; |
|---|
| 441 | $http->request($url) or return; |
|---|
| 442 | my $text = $http->body(); |
|---|
| 443 | $self->parse( \$text ); |
|---|
| 444 | } |
|---|
| 445 | |
|---|
| 446 | sub parsefile { |
|---|
| 447 | my $self = shift; |
|---|
| 448 | my $file = shift; |
|---|
| 449 | return $self->die( 'Invalid filename' ) unless defined $file; |
|---|
| 450 | my $text = $self->read_raw_xml($file); |
|---|
| 451 | $self->parse( \$text ); |
|---|
| 452 | } |
|---|
| 453 | |
|---|
| 454 | sub parse { |
|---|
| 455 | my $self = shift; |
|---|
| 456 | my $textref = ref $_[0] ? $_[0] : \$_[0]; |
|---|
| 457 | return $self->die( 'Invalid XML source' ) if ( ref($textref) ne 'SCALAR' ); |
|---|
| 458 | return $self->die( 'Null XML source' ) unless defined $$textref; |
|---|
| 459 | |
|---|
| 460 | my $to = $self->{internal_encoding} || $INTERNAL_ENCODING; |
|---|
| 461 | if ($to) { |
|---|
| 462 | my $from = &xml_decl_encoding($textref); |
|---|
| 463 | if ($from) { |
|---|
| 464 | my $stat = $self->encode_from_to( $textref, $from, $to ); |
|---|
| 465 | return $self->die( "Unsupported encoding: $from" ) unless $stat; |
|---|
| 466 | } |
|---|
| 467 | } |
|---|
| 468 | if ( exists $self->{force_array} ) { |
|---|
| 469 | my $force = $self->{force_array}; |
|---|
| 470 | $force = [$force] unless ref $force; |
|---|
| 471 | $self->{__force_array} = { map { $_ => 1 } @$force }; |
|---|
| 472 | } |
|---|
| 473 | my $flat = $self->xml_to_flat($textref); |
|---|
| 474 | my $tree = $self->flat_to_tree( $flat, '' ); |
|---|
| 475 | wantarray ? ( $tree, $$textref ) : $tree; |
|---|
| 476 | } |
|---|
| 477 | |
|---|
| 478 | sub xml_to_flat { |
|---|
| 479 | my $self = shift; |
|---|
| 480 | my $textref = shift; # reference |
|---|
| 481 | my $flat = []; |
|---|
| 482 | my $prefix = $self->{attr_prefix} || $ATTR_PREFIX; |
|---|
| 483 | while ( $$textref =~ m{ |
|---|
| 484 | ([^<]*) < |
|---|
| 485 | (( |
|---|
| 486 | \? ([^<>]*) \? |
|---|
| 487 | )|( |
|---|
| 488 | \!\[CDATA\[(.*?)\]\] |
|---|
| 489 | )|( |
|---|
| 490 | \!DOCTYPE\s+([^\[\]<>]*(?:\[.*?\]\s*)?) |
|---|
| 491 | )|( |
|---|
| 492 | \!--(.*?)-- |
|---|
| 493 | )|( |
|---|
| 494 | ([^\!\?\s<>](?:"[^"]*"|'[^']*'|[^"'<>])*) |
|---|
| 495 | )) |
|---|
| 496 | > ([^<]*) |
|---|
| 497 | }sxg ) { |
|---|
| 498 | my ( |
|---|
| 499 | $ahead, $match, $typePI, $contPI, $typeCDATA, |
|---|
| 500 | $contCDATA, $typeDocT, $contDocT, $typeCmnt, $contCmnt, |
|---|
| 501 | $typeElem, $contElem, $follow |
|---|
| 502 | ) |
|---|
| 503 | = ( $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13 ); |
|---|
| 504 | if ( defined $ahead && $ahead =~ /\S/ ) { |
|---|
| 505 | $self->warn( "Invalid string: [$ahead] before <$match>" ); |
|---|
| 506 | } |
|---|
| 507 | |
|---|
| 508 | if ($typeElem) { # Element |
|---|
| 509 | my $node = {}; |
|---|
| 510 | if ( $contElem =~ s#^/## ) { |
|---|
| 511 | $node->{endTag}++; |
|---|
| 512 | } |
|---|
| 513 | elsif ( $contElem =~ s#/$## ) { |
|---|
| 514 | # one line |
|---|
| 515 | } |
|---|
| 516 | else { |
|---|
| 517 | $node->{startTag}++; |
|---|
| 518 | } |
|---|
| 519 | $node->{tagName} = $1 if ( $contElem =~ s#^(\S+)\s*## ); |
|---|
| 520 | unless ( $node->{endTag} ) { |
|---|
| 521 | my $attr = {}; |
|---|
| 522 | while ( |
|---|
| 523 | $contElem =~ m/([^\s\=\"\']+)=(?:(")(.*?)"|'(.*?)')/sg ) |
|---|
| 524 | { |
|---|
| 525 | my $key = $1; |
|---|
| 526 | my $val = &xml_unescape( $2 ? $3 : $4 ); |
|---|
| 527 | $attr->{$prefix.$key} = $val; |
|---|
| 528 | } |
|---|
| 529 | $node->{attributes} = $attr if scalar keys %$attr; |
|---|
| 530 | } |
|---|
| 531 | push( @$flat, $node ); |
|---|
| 532 | } |
|---|
| 533 | elsif ($typeCDATA) { ## CDATASection |
|---|
| 534 | if ( exists $self->{cdata_scalar_ref} && $self->{cdata_scalar_ref} ) { |
|---|
| 535 | push( @$flat, \$contCDATA ); # as reference for scalar |
|---|
| 536 | } |
|---|
| 537 | else { |
|---|
| 538 | push( @$flat, $contCDATA ); # as scalar like text node |
|---|
| 539 | } |
|---|
| 540 | } |
|---|
| 541 | elsif ($typeCmnt) { # Comment (ignore) |
|---|
| 542 | } |
|---|
| 543 | elsif ($typeDocT) { # DocumentType (ignore) |
|---|
| 544 | } |
|---|
| 545 | elsif ($typePI) { # ProcessingInstruction (ignore) |
|---|
| 546 | } |
|---|
| 547 | else { |
|---|
| 548 | $self->warn( "Invalid Tag: <$match>" ); |
|---|
| 549 | } |
|---|
| 550 | if ( $follow =~ /\S/ ) { # text node |
|---|
| 551 | my $val = &xml_unescape($follow); |
|---|
| 552 | push( @$flat, $val ); |
|---|
| 553 | } |
|---|
| 554 | } |
|---|
| 555 | $flat; |
|---|
| 556 | } |
|---|
| 557 | |
|---|
| 558 | sub flat_to_tree { |
|---|
| 559 | my $self = shift; |
|---|
| 560 | my $source = shift; |
|---|
| 561 | my $parent = shift; |
|---|
| 562 | my $tree = {}; |
|---|
| 563 | my $text = []; |
|---|
| 564 | |
|---|
| 565 | while ( scalar @$source ) { |
|---|
| 566 | my $node = shift @$source; |
|---|
| 567 | if ( !ref $node || UNIVERSAL::isa( $node, "SCALAR" ) ) { |
|---|
| 568 | push( @$text, $node ); # cdata or text node |
|---|
| 569 | next; |
|---|
| 570 | } |
|---|
| 571 | my $name = $node->{tagName}; |
|---|
| 572 | if ( $node->{endTag} ) { |
|---|
| 573 | last if ( $parent eq $name ); |
|---|
| 574 | return $self->die( "Invalid tag sequence: <$parent></$name>" ); |
|---|
| 575 | } |
|---|
| 576 | my $elem = $node->{attributes}; |
|---|
| 577 | if ( $node->{startTag} ) { # recursive call |
|---|
| 578 | my $child = $self->flat_to_tree( $source, $name ); |
|---|
| 579 | if ( ref $elem && scalar keys %$elem ) { |
|---|
| 580 | if ( UNIVERSAL::isa( $child, "HASH" ) ) { |
|---|
| 581 | # some attributes and some child nodes |
|---|
| 582 | foreach my $key ( keys %$child ) { |
|---|
| 583 | $elem->{$key} = $child->{$key}; |
|---|
| 584 | } |
|---|
| 585 | } |
|---|
| 586 | elsif ( defined $child ) { |
|---|
| 587 | # some attributes and text node |
|---|
| 588 | $elem->{'#text'} = $child; |
|---|
| 589 | } |
|---|
| 590 | } |
|---|
| 591 | else { |
|---|
| 592 | # no attributes and text node or nothing |
|---|
| 593 | $elem = $child; |
|---|
| 594 | } |
|---|
| 595 | } |
|---|
| 596 | # next unless defined $elem; |
|---|
| 597 | $tree->{$name} ||= []; |
|---|
| 598 | push( @{ $tree->{$name} }, $elem ); |
|---|
| 599 | } |
|---|
| 600 | foreach my $key ( keys %$tree ) { |
|---|
| 601 | next if $self->{__force_array}->{$key}; |
|---|
| 602 | next if ( 1 < scalar @{ $tree->{$key} } ); |
|---|
| 603 | $tree->{$key} = shift @{ $tree->{$key} }; |
|---|
| 604 | } |
|---|
| 605 | if ( scalar @$text ) { |
|---|
| 606 | if ( scalar @$text == 1 ) { |
|---|
| 607 | $text = shift @$text; |
|---|
| 608 | } |
|---|
| 609 | elsif ( ! scalar grep {ref $_} @$text ) { |
|---|
| 610 | $text = join( '', @$text ); |
|---|
| 611 | } |
|---|
| 612 | else { |
|---|
| 613 | my $join = join( '', map {ref $_ ? $$_ : $_} @$text ); |
|---|
| 614 | $text = \$join; |
|---|
| 615 | } |
|---|
| 616 | if ( scalar keys %$tree ) { |
|---|
| 617 | # some child nodes and also text node |
|---|
| 618 | $tree->{'#text'} = $text; |
|---|
| 619 | } |
|---|
| 620 | else { |
|---|
| 621 | # only text node without child nodes |
|---|
| 622 | $tree = $text; |
|---|
| 623 | } |
|---|
| 624 | } |
|---|
| 625 | $tree; |
|---|
| 626 | } |
|---|
| 627 | |
|---|
| 628 | sub hash_to_xml { |
|---|
| 629 | my $self = shift; |
|---|
| 630 | my $name = shift; |
|---|
| 631 | my $hash = shift; |
|---|
| 632 | my $out = []; |
|---|
| 633 | my $attr = []; |
|---|
| 634 | my $allkeys = [ sort keys %$hash ]; |
|---|
| 635 | my $firstkeys = [ grep { $self->{__first_out}->{$_} } @$allkeys ] if ref $self->{__first_out}; |
|---|
| 636 | my $lastkeys = [ grep { $self->{__last_out}->{$_} } @$allkeys ] if ref $self->{__last_out}; |
|---|
| 637 | $allkeys = [ grep { !$self->{__first_out}->{$_} } @$allkeys ] if ref $self->{__first_out}; |
|---|
| 638 | $allkeys = [ grep { !$self->{__last_out}->{$_} } @$allkeys ] if ref $self->{__last_out}; |
|---|
| 639 | my $prefix = $self->{attr_prefix} || $ATTR_PREFIX; |
|---|
| 640 | $prefix = "\Q$prefix\E"; |
|---|
| 641 | |
|---|
| 642 | foreach my $loopkey ( $firstkeys, $allkeys, $lastkeys ) { |
|---|
| 643 | next unless ref $loopkey; |
|---|
| 644 | foreach my $key ( grep { !/^$prefix/ } @$loopkey ) { |
|---|
| 645 | my $val = $hash->{$key}; |
|---|
| 646 | if ( !defined $val ) { |
|---|
| 647 | push( @$out, "<$key />" ); |
|---|
| 648 | } |
|---|
| 649 | elsif ( UNIVERSAL::isa( $val, 'ARRAY' ) ) { |
|---|
| 650 | my $child = $self->array_to_xml( $key, $val ); |
|---|
| 651 | push( @$out, $child ); |
|---|
| 652 | } |
|---|
| 653 | elsif ( UNIVERSAL::isa( $val, 'SCALAR' ) ) { |
|---|
| 654 | my $child = $self->scalaref_to_cdata( $key, $val ); |
|---|
| 655 | push( @$out, $child ); |
|---|
| 656 | } |
|---|
| 657 | elsif ( ref $val ) { |
|---|
| 658 | my $child = $self->hash_to_xml( $key, $val ); |
|---|
| 659 | push( @$out, $child ); |
|---|
| 660 | } |
|---|
| 661 | else { |
|---|
| 662 | my $child = $self->scalar_to_xml( $key, $val ); |
|---|
| 663 | push( @$out, $child ); |
|---|
| 664 | } |
|---|
| 665 | } |
|---|
| 666 | foreach my $key ( grep { /^$prefix/ } @$loopkey ) { |
|---|
| 667 | my $name = ( $key =~ /^$prefix(.*)$/s )[0]; |
|---|
| 668 | my $val = &xml_escape( $hash->{$key} ); |
|---|
| 669 | push( @$attr, ' ' . $name . '="' . $val . '"' ); |
|---|
| 670 | } |
|---|
| 671 | } |
|---|
| 672 | my $jattr = join( '', @$attr ); |
|---|
| 673 | |
|---|
| 674 | # s/^(\s*<)/ $1/mg foreach @$out; # indent |
|---|
| 675 | my $text = join( '', @$out ); |
|---|
| 676 | if ( defined $name ) { |
|---|
| 677 | if ( scalar @$out ) { |
|---|
| 678 | $text = "<$name$jattr>$text</$name>\n"; |
|---|
| 679 | } |
|---|
| 680 | else { |
|---|
| 681 | $text = "<$name$jattr />\n"; |
|---|
| 682 | } |
|---|
| 683 | } |
|---|
| 684 | $text; |
|---|
| 685 | } |
|---|
| 686 | |
|---|
| 687 | sub array_to_xml { |
|---|
| 688 | my $self = shift; |
|---|
| 689 | my $name = shift; |
|---|
| 690 | my $array = shift; |
|---|
| 691 | my $out = []; |
|---|
| 692 | foreach my $val (@$array) { |
|---|
| 693 | if ( !defined $val ) { |
|---|
| 694 | push( @$out, "<$name />\n" ); |
|---|
| 695 | } |
|---|
| 696 | elsif ( UNIVERSAL::isa( $val, 'ARRAY' ) ) { |
|---|
| 697 | my $child = $self->array_to_xml( $name, $val ); |
|---|
| 698 | push( @$out, $child ); |
|---|
| 699 | } |
|---|
| 700 | elsif ( UNIVERSAL::isa( $val, 'SCALAR' ) ) { |
|---|
| 701 | my $child = $self->scalaref_to_cdata( $name, $val ); |
|---|
| 702 | push( @$out, $child ); |
|---|
| 703 | } |
|---|
| 704 | elsif ( ref $val ) { |
|---|
| 705 | my $child = $self->hash_to_xml( $name, $val ); |
|---|
| 706 | push( @$out, $child ); |
|---|
| 707 | } |
|---|
| 708 | else { |
|---|
| 709 | my $child = $self->scalar_to_xml( $name, $val ); |
|---|
| 710 | push( @$out, $child ); |
|---|
| 711 | } |
|---|
| 712 | } |
|---|
| 713 | |
|---|
| 714 | # s/^(\s*<)/ $1/mg foreach @$out; # indent |
|---|
| 715 | my $text = join( '', @$out ); |
|---|
| 716 | $text; |
|---|
| 717 | } |
|---|
| 718 | |
|---|
| 719 | sub scalaref_to_cdata { |
|---|
| 720 | my $self = shift; |
|---|
| 721 | my $name = shift; |
|---|
| 722 | my $ref = shift; |
|---|
| 723 | my $text = '<![CDATA[' . $$ref . ']]>'; |
|---|
| 724 | $text = "<$name>$text</$name>\n" if ( $name ne '#text' ); |
|---|
| 725 | $text; |
|---|
| 726 | } |
|---|
| 727 | |
|---|
| 728 | sub scalar_to_xml { |
|---|
| 729 | my $self = shift; |
|---|
| 730 | my $name = shift; |
|---|
| 731 | my $scalar = shift; |
|---|
| 732 | my $copy = $scalar; |
|---|
| 733 | my $text = &xml_escape($copy); |
|---|
| 734 | $text = "<$name>$text</$name>\n" if ( $name ne '#text' ); |
|---|
| 735 | $text; |
|---|
| 736 | } |
|---|
| 737 | |
|---|
| 738 | sub write_raw_xml { |
|---|
| 739 | my $self = shift; |
|---|
| 740 | my $file = shift; |
|---|
| 741 | my $fh = Symbol::gensym(); |
|---|
| 742 | open( $fh, ">$file" ) or return $self->die( "$! - $file" ); |
|---|
| 743 | print $fh @_; |
|---|
| 744 | close($fh); |
|---|
| 745 | } |
|---|
| 746 | |
|---|
| 747 | sub read_raw_xml { |
|---|
| 748 | my $self = shift; |
|---|
| 749 | my $file = shift; |
|---|
| 750 | my $fh = Symbol::gensym(); |
|---|
| 751 | open( $fh, $file ) or return $self->die( "$! - $file" ); |
|---|
| 752 | local $/ = undef; |
|---|
| 753 | my $text = <$fh>; |
|---|
| 754 | close($fh); |
|---|
| 755 | $text; |
|---|
| 756 | } |
|---|
| 757 | |
|---|
| 758 | sub xml_decl_encoding { |
|---|
| 759 | my $textref = shift; |
|---|
| 760 | return unless defined $$textref; |
|---|
| 761 | my $args = ( $$textref =~ /^\s*<\?xml(\s+\S.*)\?>/s )[0] or return; |
|---|
| 762 | my $getcode = ( $args =~ /\s+encoding="(.*?)"/ )[0] or return; |
|---|
| 763 | $getcode; |
|---|
| 764 | } |
|---|
| 765 | |
|---|
| 766 | sub encode_from_to { |
|---|
| 767 | my $self = shift; |
|---|
| 768 | my $txtref = shift or return; |
|---|
| 769 | my $from = shift or return; |
|---|
| 770 | my $to = shift or return; |
|---|
| 771 | return $to if ( uc($from) eq uc($to) ); |
|---|
| 772 | &load_encode() if ( $] > 5.008 ); |
|---|
| 773 | |
|---|
| 774 | unless ( defined $Encode::EUCJPMS::VERSION ) { |
|---|
| 775 | $from = 'EUC-JP' if ( $from =~ /\beuc-?jp-?(win|ms)$/i ); |
|---|
| 776 | $to = 'EUC-JP' if ( $to =~ /\beuc-?jp-?(win|ms)$/i ); |
|---|
| 777 | } |
|---|
| 778 | |
|---|
| 779 | if ( defined $Encode::VERSION ) { |
|---|
| 780 | my $check = ( $Encode::VERSION < 2.13 ) ? 0x400 : Encode::FB_XMLCREF(); |
|---|
| 781 | Encode::from_to( $$txtref, $from, $to, $check ); |
|---|
| 782 | } |
|---|
| 783 | elsif ( ( uc($from) eq 'ISO-8859-1' |
|---|
| 784 | || uc($from) eq 'US-ASCII' |
|---|
| 785 | || uc($from) eq 'LATIN-1' ) && uc($to) eq 'UTF-8' ) { |
|---|
| 786 | &latin1_to_utf8($txtref); |
|---|
| 787 | } |
|---|
| 788 | else { |
|---|
| 789 | my $jfrom = &get_jcode_name($from); |
|---|
| 790 | my $jto = &get_jcode_name($to); |
|---|
| 791 | return $to if ( uc($jfrom) eq uc($jto) ); |
|---|
| 792 | if ( $jfrom && $jto ) { |
|---|
| 793 | &load_jcode(); |
|---|
| 794 | if ( defined $Jcode::VERSION ) { |
|---|
| 795 | Jcode::convert( $txtref, $jto, $jfrom ); |
|---|
| 796 | } |
|---|
| 797 | else { |
|---|
| 798 | return $self->die( "Jcode.pm is required: $from to $to" ); |
|---|
| 799 | } |
|---|
| 800 | } |
|---|
| 801 | else { |
|---|
| 802 | return $self->die( "Encode.pm is required: $from to $to" ); |
|---|
| 803 | } |
|---|
| 804 | } |
|---|
| 805 | $to; |
|---|
| 806 | } |
|---|
| 807 | |
|---|
| 808 | sub load_jcode { |
|---|
| 809 | return if defined $Jcode::VERSION; |
|---|
| 810 | local $@; |
|---|
| 811 | eval { require Jcode; }; |
|---|
| 812 | } |
|---|
| 813 | |
|---|
| 814 | sub load_encode { |
|---|
| 815 | return if defined $Encode::VERSION; |
|---|
| 816 | local $@; |
|---|
| 817 | eval { require Encode; }; |
|---|
| 818 | } |
|---|
| 819 | |
|---|
| 820 | sub latin1_to_utf8 { |
|---|
| 821 | my $strref = shift; |
|---|
| 822 | $$strref =~ s{ |
|---|
| 823 | ([\x80-\xFF]) |
|---|
| 824 | }{ |
|---|
| 825 | pack( 'C2' => 0xC0|(ord($1)>>6),0x80|(ord($1)&0x3F) ) |
|---|
| 826 | }exg; |
|---|
| 827 | } |
|---|
| 828 | |
|---|
| 829 | sub get_jcode_name { |
|---|
| 830 | my $src = shift; |
|---|
| 831 | my $dst; |
|---|
| 832 | if ( $src =~ /^utf-?8$/i ) { |
|---|
| 833 | $dst = 'utf8'; |
|---|
| 834 | } |
|---|
| 835 | elsif ( $src =~ /^euc.*jp(-?(win|ms))?$/i ) { |
|---|
| 836 | $dst = 'euc'; |
|---|
| 837 | } |
|---|
| 838 | elsif ( $src =~ /^(shift.*jis|cp932|windows-31j)$/i ) { |
|---|
| 839 | $dst = 'sjis'; |
|---|
| 840 | } |
|---|
| 841 | elsif ( $src =~ /^iso-2022-jp/ ) { |
|---|
| 842 | $dst = 'jis'; |
|---|
| 843 | } |
|---|
| 844 | $dst; |
|---|
| 845 | } |
|---|
| 846 | |
|---|
| 847 | sub xml_escape { |
|---|
| 848 | my $str = shift; |
|---|
| 849 | # except for TAB(\x09),CR(\x0D),LF(\x0A) |
|---|
| 850 | $str =~ s{ |
|---|
| 851 | ([\x00-\x08\x0B\x0C\x0E-\x1F\x7F]) |
|---|
| 852 | }{ |
|---|
| 853 | sprintf( '&#%d;', ord($1) ); |
|---|
| 854 | }gex; |
|---|
| 855 | $str =~ s/&(?!#(\d+;|x[\dA-Fa-f]+;))/&/g; |
|---|
| 856 | $str =~ s/</</g; |
|---|
| 857 | $str =~ s/>/>/g; |
|---|
| 858 | $str =~ s/'/'/g; |
|---|
| 859 | $str =~ s/"/"/g; |
|---|
| 860 | $str; |
|---|
| 861 | } |
|---|
| 862 | |
|---|
| 863 | sub xml_unescape { |
|---|
| 864 | my $str = shift; |
|---|
| 865 | my $map = {qw( quot " lt < gt > apos ' amp & )}; |
|---|
| 866 | $str =~ s{ |
|---|
| 867 | (&(?:\#(\d+)|\#x([0-9a-fA-F]+)|(quot|lt|gt|apos|amp));) |
|---|
| 868 | }{ |
|---|
| 869 | $4 ? $map->{$4} : &char_deref($1,$2,$3); |
|---|
| 870 | }gex; |
|---|
| 871 | $str; |
|---|
| 872 | } |
|---|
| 873 | |
|---|
| 874 | sub char_deref { |
|---|
| 875 | my( $str, $dec, $hex ) = @_; |
|---|
| 876 | if ( defined $dec ) { |
|---|
| 877 | return &code_to_utf8( $dec ) if ( $dec < 256 ); |
|---|
| 878 | } |
|---|
| 879 | elsif ( defined $hex ) { |
|---|
| 880 | my $num = hex($hex); |
|---|
| 881 | return &code_to_utf8( $num ) if ( $num < 256 ); |
|---|
| 882 | } |
|---|
| 883 | return $str; |
|---|
| 884 | } |
|---|
| 885 | |
|---|
| 886 | sub code_to_utf8 { |
|---|
| 887 | my $code = shift; |
|---|
| 888 | if ( $code < 128 ) { |
|---|
| 889 | return pack( C => $code ); |
|---|
| 890 | } |
|---|
| 891 | elsif ( $code < 256 ) { |
|---|
| 892 | return pack( C2 => 0xC0|($code>>6), 0x80|($code&0x3F)); |
|---|
| 893 | } |
|---|
| 894 | elsif ( $code < 65536 ) { |
|---|
| 895 | return pack( C3 => 0xC0|($code>>12), 0x80|(($code>>6)&0x3F), 0x80|($code&0x3F)); |
|---|
| 896 | } |
|---|
| 897 | return shift if scalar @_; # default value |
|---|
| 898 | sprintf( '&#x%04X;', $code ); |
|---|
| 899 | } |
|---|
| 900 | |
|---|
| 901 | 1; |
|---|