root/lang/perl/Net-Twitter-Scraper/trunk/lib/Net/Twitter/Scraper.pm @ 14505

Revision 14505, 3.3 kB (checked in by hetappi, 5 years ago)

initial import

Line 
1package Net::Twitter::Scraper;
2
3use warnings;
4use strict;
5our $VERSION = '0.00';
6
7use Carp qw/croak/;
8use DateTime::Format::W3CDTF;
9use HTTP::Status;
10use LWP::UserAgent;
11use URI;
12use Web::Scraper;
13
# Construct a scraper object and, when both credentials are given, log in.
#
# Named options:
#   username - twitter.com user name or e-mail address
#   password - the matching password
#
# A false username or password (missing, empty or '0') is stored as undef.
# The login step runs only when both values are defined; a failed login
# propagates the exception raised by _login.
#
# Returns the new blessed object.
sub new {
  my($class, %opt) = @_;

  my $username = $opt{username} || undef;
  my $password = $opt{password} || undef;

  my $ua = LWP::UserAgent->new(
    cookie_jar => { },
    # The constructor option is spelled keep_alive.  The previous spelling
    # "keepalive" is not an option LWP::UserAgent knows, so persistent
    # connections were never actually enabled.
    keep_alive => 1,
    # POST must be redirectable so the login form submission can follow
    # the redirect that _login inspects.
    requests_redirectable => ['GET', 'POST']
  );

  my $self = bless {
    ua => $ua,
    username => $username,
    password => $password,
    is_login => 0,
    response_code => undef,
    response_message => undef,
  }, $class;

  $self->_login($username, $password)
    if defined $username && defined $password;

  return $self;
}
40
# Submit the credentials to the twitter.com session endpoint.
#
# The login flag is cleared before the attempt.  The attempt counts as
# successful only when the response is a success and the final request URI
# is exactly the home page; anything else croaks.
#
# Returns 1 (the new value of the login flag) on success.
sub _login {
  my($self, $username, $password) = @_;

  $self->{is_login} = 0;

  my %form = (username_or_email => $username, password => $password);
  my $resp = $self->{ua}->post('https://twitter.com/sessions', \%form);

  my $landed_home = $resp->is_success
    && $resp->request->uri->as_string eq 'https://twitter.com/home';
  croak "Can't login twitter.com" unless $landed_home;

  return $self->{is_login} = 1;
}
55
# Record an HTTP status code on the object together with the message that
# HTTP::Status associates with it.  Returns the stored message.
sub _set_status {
  my $self = shift;
  my $code = shift;

  $self->{response_code} = $code;

  my $message = HTTP::Status::status_message($code);
  return $self->{response_message} = $message;
}
62
# Parse a timestamp with DateTime::Format::W3CDTF and re-render it using
# the pattern '%a %b %d %H:%M:%S %z %Y'.  Plain function, not a method:
# it takes the string as its only argument.
sub _format_datetime {
  my $w3c_string = shift;

  my $parser = DateTime::Format::W3CDTF->new;
  return $parser->parse_datetime($w3c_string)
                ->strftime('%a %b %d %H:%M:%S %z %Y');
}
69
# Accessor: the status code last stored by _set_status (undef until then).
sub http_code {
  my $self = shift;
  return $self->{response_code};
}
75
# Accessor: the status message last stored by _set_status (undef until then).
sub http_message {
  my $self = shift;
  return $self->{response_message};
}
81
# Filter for the row's id attribute: keep only the digits that follow
# "status_".  Returns undef when the attribute is missing or does not
# contain that pattern, so a failed match can never hand back a stale $1
# left over from an earlier, unrelated match.
sub _extract_status_id {
  my($attr) = @_;

  return defined $attr && $attr =~ /status_([0-9]+)/ ? $1 : undef;
}

# Filter for the status text: strip leading and trailing whitespace.
# The /s flag lets the captured text contain newlines; without it a
# multi-line text would fail to match at all.
sub _trim_text {
  my($text) = @_;

  return undef unless defined $text;
  return $text =~ /\A\s*(.*?)\s*\z/s ? $1 : $text;
}

# TODO: since, since_id
#
# Scrape the logged-in account's archive pages.
#
# Arguments:
#   $args - optional hash reference; $args->{page} is the first archive
#           page to fetch (default 1) and must be a non-negative integer.
#
# Up to five consecutive pages are requested, starting at that page.
# Fetching stops early at the first page that yields no status rows.
#
# Returns a reference to an array of hashes with the keys created_at, id,
# source and text.  When a page cannot be scraped the status is set to 500
# and undef is returned; otherwise the status is set to 200.
#
# Croaks when the object is not logged in or when page is not an integer.
sub archive {
  my($self, $args) = @_;

  $self->{is_login}
    or croak "Can't call the operation without login";

  my $page = defined $args && defined $args->{page} ? $args->{page} : 1;
  # A non-numeric page used to make the page loop spin forever (string
  # increment never exceeds the numeric bound), so reject it up front.
  $page =~ /\A[0-9]+\z/
    or croak "page must be a non-negative integer";

  # Fields not scraped yet: favorited, in_reply_to_status_id,
  # in_reply_to_user_id, truncated, user.
  my $scraper = scraper {
    process '//table[@id="timeline"]/tr', 'statuses[]' => scraper {
      process '//abbr[@class="published"]',
        created_at => [ '@title', \&_format_datetime ];
      process '//.',
        id => [ '@id', \&_extract_status_id ];
      process '//span[@class="meta entry-meta"]/a[2]',
        source => sub { $_->as_HTML };
      process '//span[@class="entry-content"]',
        text => [ 'text', \&_trim_text ];
    };
  };
  # Reuse the logged-in agent so the session cookies are sent.
  $scraper->user_agent($self->{ua});

  my $url = 'http://twitter.com/account/archive';
  my $pages_per_call = 5;
  my @statuses;
  for my $i ($page .. $page + $pages_per_call - 1) {
    my $res;
    # The trailing 1 makes success detectable without relying on $@.
    my $ok = eval {
      $res = $scraper->scrape(URI->new($url . '?page=' . $i));
      1;
    };
    if (!$ok) {
      # Any scraping failure is reported as a generic server error.
      $self->_set_status(500);
      return undef;
    }

    # A page without rows has no "statuses" entry at all; dereferencing
    # it would die under strict.  It also means the archive is exhausted.
    my $found = $res->{statuses};
    last unless $found && @$found;
    push @statuses, @$found;
  }

  $self->_set_status(200);

  return \@statuses;
}
128
1291;
130
131=head1 NAME
132
133Net::Twitter::Scraper - Twitter Scraper
134
135=head1 SYNOPSIS
136
137  use YAML;
138  use Net::Twitter::Scraper;
139
140  my $tw = Net::Twitter::Scraper->new(
141    username => 'your id',
142    password => 'your pswd'
143  );
144
145  my $archive = $tw->archive();
146  binmode STDOUT, ':utf8';
147  print Dump $archive;
148
149=head1 DESCRIPTION
150
151
152=head1 SEE ALSO
153
154L<Net::Twitter>
155
156=head1 NOTES
157
158Alpha quality
159
160=head1 AUTHOR
161
162hetappi E<lt>hetappi.pm at gmail.comE<gt>
163
164=head1 COPYRIGHT AND LICENSE
165
166
167=cut
Note: See TracBrowser for help on using the browser.