root/lang/perl/XML-FromHTML/trunk/lib/XML/FromHTML.pm

Revision 30408, 2.5 kB (checked in by dankogai, 3 years ago)

XML::FromHTML - Converts HTML to XHTML

Line 
1#
2# $Id$
3#
4package XML::FromHTML;
5use warnings;
6use strict;
7our $VERSION = sprintf "%d.%02d", q$Revision: 0.2 $ =~ /(\d+)/g;
8
9use base 'Exporter';
10our @EXPORT = qw/html2xml/;
11
12use base 'HTML::Tidy';
13
# Constructor.  Builds an HTML::Tidy object configured to emit XHTML and
# reblesses it into the invoking class.
#
# Arguments: tidy options, given either as a flat key/value list (the
# original calling convention) or as a single hash reference (the form this
# code itself passes to HTML::Tidy->new).
#
# Returns: the new object, blessed into $class.
#
# The fixed options below are listed after the caller's, so they replace
# any caller-supplied value stored under the same key.
sub new {
    my $class = shift;

    # A lone hash reference used to be interpolated into the option list as
    # a bogus key, which shifted every following key/value pair and lost the
    # fixed options.  Unwrap it so both calling styles behave the same.
    my @user_opts = ( @_ == 1 && ref $_[0] eq 'HASH' ) ? %{ $_[0] } : @_;

    my $tidy = HTML::Tidy->new(
        {
            @user_opts,
            doctype          => 'omit',    # important for speed!
            indent           => 0,
            numeric_entities => 1,
            output_xhtml     => 1,
            tidy_mark        => 0,
            wrap             => 0,
        }
    );

    return bless $tidy, $class;
}
28
# Exported convenience function: runs an HTML string through the clean()
# method of a freshly constructed object of this package and returns the
# result of that call.
#
# Arguments (positional):
#   1. the HTML source
#   2. optional name of the encoding the source is in; when true, the
#      source is first converted from that encoding to 'utf8' with
#      Encode::from_to
sub html2xml {
    my $html     = shift;
    my $encoding = shift;

    # Encode is only loaded when an encoding is actually supplied.
    if ($encoding) {
        require Encode;
        Encode::from_to( $html, $encoding, 'utf8' );
    }

    my $converter = __PACKAGE__->new;
    return $converter->clean($html);
}
37
# When this file is the program being run (rather than a module loaded by
# another program), emit a sample conversion through warn.
warn html2xml(qq{<a href="http://www.dan.co.jp/">Dan Kogai</a>})
    if $0 eq __FILE__;
41
421; # End of XML::FromHTML
43
44=head1 NAME
45
46XML::FromHTML - Converts HTML to XHTML
47
48=head1 VERSION
49
50$Id$
51
52=head1 SYNOPSIS
53
54  use XML::FromHTML;
55  my $xhtml = XML::FromHTML->new->clean($html);
56  # or simply
57  my $xhtml = html2xml($html);
58  use XML::Simple;
59  my $obj = XMLin $xhtml;                           # works fine!
60  use XML::LibXML;
61  my $dom = XML::LibXML->new->parse_string($xhtml); # this, too!
62
63=head1 DESCRIPTION
64
65This module uses L<HTML::Tidy> just to turn HTML into XML.  That is all it does.
66
67=head1 EXPORT
68
69C<html2xml> by default.
70
71=head1 FUNCTIONS
72
73=head2 new
74
75Creates an XML::FromHTML object, whose parent class is L<HTML::Tidy>.
76
77=head2 html2xml
78
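79Just a shortcut for the call below.  If a second argument naming the source encoding is given, the input is first converted from that encoding to UTF-8 with L<Encode>.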
80
81  XML::FromHTML->new->clean(shift);
82
83=head1 AUTHOR
84
85Dan Kogai, C<< <dankogai at dan.co.jp> >>
86
87=head1 BUGS
88
89Please report any bugs or feature requests to C<bug-xml-fromhtml at rt.cpan.org>, or through
90the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=XML-FromHTML>.  I will be notified, and then you'll
91automatically be notified of progress on your bug as I make changes.
92
93=head1 SUPPORT
94
95You can find documentation for this module with the perldoc command.
96
97    perldoc XML::FromHTML
98
99You can also look for information at:
100
101=over 4
102
103=item * RT: CPAN's request tracker
104
105L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=XML-FromHTML>
106
107=item * AnnoCPAN: Annotated CPAN documentation
108
109L<http://annocpan.org/dist/XML-FromHTML>
110
111=item * CPAN Ratings
112
113L<http://cpanratings.perl.org/d/XML-FromHTML>
114
115=item * Search CPAN
116
117L<http://search.cpan.org/dist/XML-FromHTML/>
118
119=back
120
121=head1 ACKNOWLEDGEMENTS
122
123L<HTML::Tidy>, L<XML::Liberal>
124
125=head1 COPYRIGHT & LICENSE
126
127Copyright 2009 Dan Kogai, all rights reserved.
128
129This program is free software; you can redistribute it and/or modify it
130under the same terms as Perl itself.
131
132
133=cut
Note: See TracBrowser for help on using the browser.