root/lang/perl/HTML-Selector-XPath/trunk/lib/HTML/Selector/XPath.pm @ 1140

Revision 1140, 5.9 kB (checked in by miyagawa, 6 years ago)

r5478@rock (orig r1994): miyagawa | 2006-10-02 09:39:14 -0700
update CAVEATS re css selector validatoin

Line 
1package HTML::Selector::XPath;
2
3use strict;
4our $VERSION = '0.01';
5
6require Exporter;
7our @EXPORT_OK = qw(selector_to_xpath);
8*import = \&Exporter::import;
9
10sub selector_to_xpath {
11    __PACKAGE__->new(shift)->to_xpath;
12}
13
14my $reg = {
15    # tag name/id/class
16    element => qr/^([#.]?)([a-z0-9\\*_-]*)((\|)([a-z0-9\\*_-]*))?/i,
17    # attribute presence
18    attr1   => qr/^\[([^\]]*)\]/,
19    # attribute value match
20    attr2   => qr/^\[\s*([^~\|=\s]+)\s*([~\|]?=)\s*"([^"]+)"\s*\]/i,
21    attrN   => qr/^:not\((.*?)\)/i,
22    pseudo  => qr/^:([()a-z_-]+)/i,
23    # adjacency/direct descendance
24    combinator => qr/^(\s*[>+\s])/i,
25    # rule separator
26    comma => qr/^\s*,/i,
27};
28
29
30sub new {
31    my($class, $exp) = @_;
32    bless { expression => $exp }, $class;
33}
34
35sub selector {
36    my $self = shift;
37    $self->{expression} = shift if @_;
38    $self->{expression};
39}
40
41sub to_xpath {
42    my $self = shift;
43    my $rule = $self->{expression} or return;
44
45    my $index = 1;
46    my @parts = ("//", "*");
47    my $last_rule = '';
48    my @next_parts;
49
50    # Loop through each "unit" of the rule
51    while (length $rule && $rule ne $last_rule) {
52        $last_rule = $rule;
53
54        $rule =~ s/^\s*|\s*$//g;
55        last unless length $rule;
56
57        # Match elements
58        if ($rule =~ s/$reg->{element}//) {
59
60            # to add *[1]/self:: for follow-sibling
61            if (@next_parts) {
62                push @parts, @next_parts, (pop @parts);
63                $index += @next_parts;
64                @next_parts = ();
65            }
66
67            if ($1 eq '#') { # ID
68                push @parts, "[\@id='$2']";
69            } elsif ($1 eq '.') { # class
70                push @parts, "[contains(concat(' ', \@class, ' '), ' $2 ')]";
71            } else {
72                $parts[$index] = $5 || $2;
73            }
74        }
75
76        # Match attribute selectors
77        if ($rule =~ s/$reg->{attr2}//) {
78            # negation (e.g. [input!="text"]) isn't implemented in CSS, but include it anyway:
79            if ($2 eq '!=') {
80                push @parts, "[\@$1!='$3]";
81            } elsif ($2 eq '~=') { # substring attribute match
82                push @parts, "[contains(concat(' ', \@$1, ' '), ' $3 ')]";
83            } elsif ($2 eq '|=') {
84                push @parts, "[\@$1='$3' or starts-with(\@$1, '$3-')]";
85            } else { # exact match
86                push @parts, "[\@$1='$3']";
87            }
88        } else {
89            if ($rule =~ s/$reg->{attr1}//) {
90                push @parts, "[\@$1]";
91            }
92        }
93
94        # Match negation
95        if ($rule =~ s/$reg->{attrN}//) {
96            my $sub_rule = $1;
97            if ($sub_rule =~ s/$reg->{attr2}//) {
98                if ($2 eq '=') {
99                    push @parts, "[\@$1!='$3']";
100                } elsif ($2 eq '~=') {
101                    push @parts, ":not([contains(concat(' ', \@$1, ' '), ' $3 ')])";
102                } elsif ($2 eq '|=') {
103                    push @parts, ":not([\@$1='$3' or starts-with(\@$1, '$3-')])";
104                }
105            } elsif ($sub_rule =~ s/$reg->{attr1}//) {
106                push @parts, ":not([\@$1])";
107            }
108        }
109
110        # Ignore pseudoclasses/pseudoelements
111        while ($rule =~ s/$reg->{pseudo}//) {
112            if ( $1 eq 'first-child') {
113                $parts[$#parts] = '*[1]/self::' . $parts[$#parts];
114            } elsif ($1 =~ /^lang\(([\w\-]+)\)$/) {
115                push @parts, "[\@xml:lang='$1' or starts-with(\@xml:lang, '$1-')]";
116            }
117        }
118
119        # Match combinators (> and +)
120        if ($rule =~ s/$reg->{combinator}//) {
121            my $match = $1;
122            if ($match =~ />/) {
123                push @parts, "/";
124            } elsif ($match =~ /\+/) {
125                push @parts, "/following-sibling::";
126                @next_parts = ('*[1]/self::');
127            } else {
128                push @parts, "//";
129            }
130
131            # new context
132            $index = @parts;
133            push @parts, "*";
134        }
135
136        # Match commas
137        if ($rule =~ s/$reg->{comma}//) {
138            push @parts, " | ", "//", "*"; # ending one rule and beginning another
139            $index = @parts - 1;
140        }
141    }
142
143    return join '', @parts;
144}
145
1461;
147__END__
148
149=head1 NAME
150
151HTML::Selector::XPath - CSS Selector to XPath compiler
152
153=head1 SYNOPSIS
154
155  use HTML::Selector::XPath;
156
157  my $selector = HTML::Selector::XPath->new("li#main");
158  $selector->to_xpath; # //li[@id='main']
159
160  # functional interface
161  use HTML::Selector::Xpath 'selector_to_xpath';
162  my $xpath = selector_to_xpath('div.foo');
163
164=head1 DESCRIPTION
165
166HTML::Selector::XPath is a utility function to compile CSS2 selector
167to the equivalent XPath expression.
168
169=head1 CAVEATS
170
171=head2 NOT PSEUDO CLASS
172
173This module supports I<:first-child> and I<:lang> pseudo class, and a
174partial support for I<:not> CSS 3 pseudo class as well. When you use
175I<:not>, this module will produce the equivalent XPath expression
176I<:not()>, which is only available in XPath 2.0 implementation.
177
178So far as I have tested, I<:not()> is not available in Perl XPath
179modules like L<XML::LibXML> and L<HTML::Builder::XPath>.
180
181=head2 CSS SELECTOR VALIDATION
182
183This module doesn't validate if the original CSS Selector expression
184is valid. For example,
185
186  div.123foo
187
188is an invalid CSS selector (class names should not begin with
189numbers), but this module ignores that and tries to generate
190an equivalent XPath expression anyway.
191
192=head1 AUTHOR
193
194Tatsuhiko Miyagawa E<lt>miyagawa@bulknews.netE<gt>
195
196Most of the code is based on Joe Hewitt's getElementsBySelector.js on
197L<http://www.joehewitt.com/blog/2006-03-20.php> and Andrew Dupont's
198patch to Prototype.js on L<http://dev.rubyonrails.org/ticket/5171>,
199but slightly modified using CSS to XPath translation table per
200L<http://plasmasturm.org/log/444/>
201
202=head1 LICENSE
203
204This library is free software; you can redistribute it and/or modify
205it under the same terms as Perl itself.
206
207=head1 SEE ALSO
208
209L<http://www.w3.org/TR/REC-CSS2/selector.html>
210L<http://use.perl.org/~miyagawa/journal/31090>
211
212=cut
Note: See TracBrowser for help on using the browser.