Changeset 333
- Timestamp:
- 10/02/07 14:43:41 (6 years ago)
- Location:
- lang/perl/WWW-Mixi-Scraper/trunk
- Files:
-
- 29 modified
-
Changes (modified) (1 diff)
-
META.yml (modified) (1 diff)
-
lib/WWW/Mixi/Scraper.pm (modified) (2 diffs)
-
lib/WWW/Mixi/Scraper/Plugin/ListComment.pm (modified) (2 diffs)
-
lib/WWW/Mixi/Scraper/Plugin/ListDiary.pm (modified) (3 diffs)
-
lib/WWW/Mixi/Scraper/Plugin/NewBBS.pm (modified) (2 diffs)
-
lib/WWW/Mixi/Scraper/Plugin/NewFriendDiary.pm (modified) (2 diffs)
-
lib/WWW/Mixi/Scraper/Plugin/ShowFriend.pm (modified) (5 diffs)
-
lib/WWW/Mixi/Scraper/Plugin/ShowLog.pm (modified) (2 diffs)
-
lib/WWW/Mixi/Scraper/Plugin/ViewBBS.pm (modified) (3 diffs)
-
lib/WWW/Mixi/Scraper/Plugin/ViewDiary.pm (modified) (1 diff)
-
lib/WWW/Mixi/Scraper/Plugin/ViewEvent.pm (modified) (4 diffs)
-
lib/WWW/Mixi/Scraper/Plugin/ViewMessage.pm (modified) (1 diff)
-
lib/WWW/Mixi/Scraper/Utils.pm (modified) (1 diff)
-
t_live/lib/Utils.pm (modified) (1 diff)
-
t_live/list_comment.t (modified) (1 diff)
-
t_live/list_diary.t (modified) (1 diff)
-
t_live/list_message.t (modified) (1 diff)
-
t_live/new_bbs.t (modified) (1 diff)
-
t_live/new_friend_diary.t (modified) (1 diff)
-
t_live/new_music.t (modified) (1 diff)
-
t_live/new_video.t (modified) (1 diff)
-
t_live/show_calendar.t (modified) (1 diff)
-
t_live/show_friend.t (modified) (1 diff)
-
t_live/show_log.t (modified) (1 diff)
-
t_live/view_bbs.t (modified) (2 diffs)
-
t_live/view_diary.t (modified) (2 diffs)
-
t_live/view_event.t (modified) (1 diff)
-
t_live/view_message.t (modified) (1 diff)
Legend:
- Unmodified
- Added
- Removed
-
lang/perl/WWW-Mixi-Scraper/trunk/Changes
r4 r333 1 1 Revision history for WWW-Mixi-Scraper 2 2 3 0.07 not yet released 3 0.07 2007/10/02 4 - scraping rules are totally refactored due to the mixi's renewal. 5 now it uses id/class attributes everywhere, so scraping is much 6 easier. mixi++ (and k*z*b*r*++, who secretly sent me a patch) 7 - and now live tests dump their contents if you test verbosely. 4 8 - new plugin: NewMusic 5 9 -
lang/perl/WWW-Mixi-Scraper/trunk/META.yml
r4 r333 1 1 --- #YAML:1.0 2 2 name: WWW-Mixi-Scraper 3 version: 0.06 3 version: 0.07 4 4 abstract: yet another mixi scraper 5 5 license: perl -
lang/perl/WWW-Mixi-Scraper/trunk/lib/WWW/Mixi/Scraper.pm
r4 r333 4 4 use warnings; 5 5 6 our $VERSION = '0.06'; 6 our $VERSION = '0.07'; 7 7 8 8 use String::CamelCase qw( decamelize ); … … 93 93 WWW::Mixi has much longer history and is full-stack. The data it returns tends to be more complete, fine-tuned, and raw in many ways (including encoding). However, it tends to suffer from minor html changes as it heavily relies on regexes, and maybe it is too monolithic. 94 94 95 In contrast, WWW::Mixi::Scraper hopefully tends to survive minor html changes as it relies on XPath. And basically it uses decoded perl strings, not octets. It's smaller, and pluggable. However, its data is more or less pre-processed and tends to lose some aspects such as proper line breaks. Also, it may be easier to be polluted with garbages (partly because mixi doesn't rely much on CSS; it's hard to locate exact area to scrape by XPath). And it may be harder to understand and maintain XPath rules. 95 In contrast, WWW::Mixi::Scraper hopefully tends to survive minor html changes as it relies on XPath. And basically it uses decoded perl strings, not octets. It's smaller, and pluggable. However, its data is more or less pre-processed and tends to lose some aspects such as proper line breaks. Also, it may be easier to be polluted with garbages. And it may be harder to understand and maintain XPath rules. 96 96 97 97 Which to choose? It depends. For now, ::Scraper is too limited, but if all you want is rough data to tell you who updated, or what was updated, ::Scraper may be a good option. -
lang/perl/WWW-Mixi-Scraper/trunk/lib/WWW/Mixi/Scraper/Plugin/ListComment.pm
r4 r333 12 12 my %scraper; 13 13 $scraper{comments} = scraper { 14 process ' td[width="450"]',14 process 'dl>dd', 15 15 string => 'TEXT'; 16 process ' td[width="450"]>a',16 process 'dl>dd>a', 17 17 link => '@href', 18 18 subject => 'TEXT'; 19 process ' td[width="180"]',19 process 'dl>dt', 20 20 time => 'TEXT'; 21 21 result qw( string time link subject ); … … 23 23 24 24 $scraper{list} = scraper { 25 process ' tr[bgcolor="#FFFFFF"]',25 process 'div.listCommentArea>ul.entryList01>li', 26 26 'comments[]' => $scraper{comments}; 27 27 result qw( comments ); -
lang/perl/WWW-Mixi-Scraper/trunk/lib/WWW/Mixi/Scraper/Plugin/ListDiary.pm
r4 r333 24 24 25 25 $scraper{diaries} = scraper { 26 process ' td[nowrap]',26 process 'div.listDiaryTitle>dl>dd', 27 27 time => 'TEXT'; 28 process ' td[bgcolor="#FFF4E0"]>a',28 process 'div.listDiaryTitle>dl>dt>a', 29 29 link => '@href', 30 30 subject => 'TEXT'; 31 process ' td[bgcolor="#FFFFFF"]>table[cellpadding="3"]>tr>td[class="h120"]',31 process 'p', 32 32 description => 'TEXT'; 33 process ' td[bgcolor="#FFFFFF"]>table[cellpadding="3"]>tr>td[class="h120"]>table>tr>td>a>img',33 process 'div.diaryPhoto>a>img', 34 34 'images[]' => '@src'; 35 process ' td[align="right"]>a',35 process 'div.diaryEditMenu>ul>li', 36 36 'meta[]' => $scraper{meta}; 37 37 result qw( time link subject description images meta ); … … 39 39 40 40 $scraper{list} = scraper { 41 process ' table[width="525"]>tr',41 process 'div.listDiaryBlock', 42 42 'diaries[]' => $scraper{diaries}; 43 43 result qw( diaries ); … … 46 46 my $stash = $self->post_process($scraper{list}->scrape(\$html)); 47 47 48 my $tmp; 49 my @diaries; 50 foreach my $item ( @{ $stash } ) { 51 if ( $item->{time} ) { # meta 52 $tmp = { 53 time => $item->{time}, 54 link => $item->{link}, 55 subject => $item->{subject}, 56 }; 57 } 58 elsif ( $item->{description} ) { 59 $tmp->{description} = $item->{description}; 60 $tmp->{images} = $item->{images}; 61 } 62 elsif ( $item->{meta} ) { 63 foreach my $meta ( @{ $item->{meta} || [] } ) { 64 if ( ($meta->{href} || '') =~ /#(?:write|comment)$/ ) { 65 my ($count) = $meta->{text} =~ /\((\d+)\)/; 66 $tmp->{count} = $count; 67 } 48 foreach my $diary ( @{ $stash } ) { 49 my $meta = delete $diary->{meta}; 50 foreach my $item ( @{ $meta || [] } ) { 51 if ( ($item->{href} || '') =~ /#(?:write|comment)$/ ) { 52 my ($count) = $item->{text} =~ /(\d+)/; 53 $diary->{count} = $count; 68 54 } 69 push @diaries, $tmp;70 55 } 71 56 } 72 57 73 return \@diaries;58 return $stash; 74 59 } 75 60 -
lang/perl/WWW-Mixi-Scraper/trunk/lib/WWW/Mixi/Scraper/Plugin/NewBBS.pm
r4 r333 12 12 my %scraper; 13 13 $scraper{entries} = scraper { 14 process ' td[width="180"]',14 process 'dl>dt', 15 15 time => 'TEXT'; 16 process ' td[width="450"]>a',16 process 'dl>dd>a', 17 17 subject => 'TEXT', 18 18 link => '@href'; 19 process ' td[width="450"]',19 process 'dl>dd', 20 20 string => 'TEXT'; 21 21 result qw( string subject link time ); … … 23 23 24 24 $scraper{list} = scraper { 25 process ' tr[bgcolor="#FFFFFF"]',25 process 'div.newBbsArea>ul.entryList01>li', 26 26 'entries[]' => $scraper{entries}; 27 27 result qw( entries ); -
lang/perl/WWW-Mixi-Scraper/trunk/lib/WWW/Mixi/Scraper/Plugin/NewFriendDiary.pm
r4 r333 12 12 my %scraper; 13 13 $scraper{entries} = scraper { 14 process ' td[width="180"]',14 process 'dl>dt', 15 15 time => 'TEXT'; 16 process ' td[width="450"]>a',16 process 'dl>dd>a', 17 17 subject => 'TEXT', 18 18 link => '@href'; 19 process ' td[width="450"]',19 process 'dl>dd', 20 20 string => 'TEXT'; 21 21 result qw( string subject link time ); … … 23 23 24 24 $scraper{list} = scraper { 25 process ' tr[bgcolor="#FFFFFF"]',25 process 'ul.entryList01>li', 26 26 'entries[]' => $scraper{entries}; 27 27 result qw( entries ); -
lang/perl/WWW-Mixi-Scraper/trunk/lib/WWW/Mixi/Scraper/Plugin/ShowFriend.pm
r4 r333 5 5 use WWW::Mixi::Scraper::Plugin; 6 6 use WWW::Mixi::Scraper::Utils qw( _uri ); 7 use utf8; 7 8 8 9 validator {qw( id is_number )}; … … 22 23 my %scraper; 23 24 $scraper{items} = scraper { 24 process ' td[width="80"]',25 process 'dl>dt', 25 26 key => 'TEXT'; 26 process ' td[width!="80"]',27 process 'dl>dd', 27 28 value => 'TEXT'; 28 29 result qw( key value ); … … 30 31 31 32 $scraper{profile} = scraper { 32 process ' table[width="425"]>tr[bgcolor="#FFFFFF"]',33 process 'div#profile>ul>li', 33 34 'items[]' => $scraper{items}; 34 35 result qw( items ); … … 58 59 59 60 $scraper{outline} = scraper { 60 process 'table[bgcolor="#FEC977"]>tr>td[colspan="3"]', 61 'string[]' => 'TEXT'; 62 process 'table[width="270"]>tr>td[colspan="3"]>a', 61 process 'div#myProfile>div.contents01>h3', 62 'string' => 'TEXT'; 63 process 'div#myProfile>div.contents01>p.loginTime', 64 'description' => 'TEXT'; 65 process 'div#myProfile>p.friendPath>a', 63 66 'relations[]' => $scraper{relations}; 64 process ' table[width="250"]>tr>td>img[vspace="2"]',67 process 'div#myProfile>div.contents01>img', 65 68 image => '@src'; 66 result qw( image string relations ); 69 process 'div#localNavigation>ul.localNaviFriend>li.top>a', 70 link => '@href'; 71 result qw( image string relations description link ); 67 72 }; 68 73 … … 78 83 $stash->{relation} = shift @relations if @relations > 1; 79 84 80 foreach my $string (@{ delete $stash->{string} || [] }) { 81 if ( $string =~ /^(.+)\((\d+)\)\s+\(([^)]+)\)\s*$/ ) { 82 $stash->{name} = $1; 83 $stash->{count} = $2; 84 $stash->{description} = $3; 85 } 86 elsif ( $string =~ /^(.+)\((\d+)\)\s*$/ ) { # may be yourself 87 $stash->{name} = $1; 88 $stash->{count} = $2; 89 } 85 my $string = delete $stash->{string} || ''; 86 if ( $string =~ s/\((\d+)\)$// ) { 87 $stash->{name} = $string; 88 $stash->{count} = $1; 90 89 } 91 92 # XXX: this fails when you test with local files. 93 # In this case, we can scrape the link from the 'snavi' toolbar 94 # but it's ugly. 
95 $stash->{link} = $self->{uri}; 90 if ( $stash->{description} ) { 91 $stash->{description} =~ s/^(//; 92 $stash->{description} =~ s/)$//; 93 } 96 94 97 95 return $stash; -
lang/perl/WWW-Mixi-Scraper/trunk/lib/WWW/Mixi/Scraper/Plugin/ShowLog.pm
r4 r333 4 4 use warnings; 5 5 use WWW::Mixi::Scraper::Plugin; 6 7 validator {( page => 'is_number' )}; 6 8 7 9 sub scrape { … … 24 26 }; 25 27 26 return $self->post_process($scraper{list}->scrape(\$html)); 28 return $self->post_process($scraper{list}->scrape(\$html), \&_callback); 29 } 30 31 sub _callback { 32 my $item = shift; 33 my @parts = split /\s/, ($item->{time} || ''), 3; 34 $item->{time} = join ' ', @parts[0..1]; 27 35 } 28 36 -
lang/perl/WWW-Mixi-Scraper/trunk/lib/WWW/Mixi/Scraper/Plugin/ViewBBS.pm
r4 r333 4 4 use warnings; 5 5 use WWW::Mixi::Scraper::Plugin; 6 use WWW::Mixi::Scraper::Utils qw( _datetime _uri ); 6 7 7 8 validator {qw( … … 25 26 26 27 $scraper{topic} = scraper { 27 process ' table[bgcolor="#dfa473"]>tr>td[bgcolor="#ffd8b0"]',28 process 'dl[class="bbsList01 bbsDetail"]>dt>span.date', 28 29 time => 'TEXT'; 29 process ' table[bgcolor="#dfa473"]>tr>td[bgcolor="#fff4e0"]',30 process 'dl[class="bbsList01 bbsDetail"]>dt>span.titleSpan', 30 31 subject => 'TEXT'; 31 process ' table[bgcolor="#dfa473"]>tr>td[bgcolor="#fdf9f2"]>a',32 process 'dd.bbsContent>dl>dt>a', 32 33 name => 'TEXT', 33 34 name_link => '@href'; 34 process ' table[bgcolor="#dfa473"]>tr>td[bgcolor="#ffffff"]>table[width="500"]>tr>td[class="h120"]',35 process 'dd.bbsContent>dl>dd', 35 36 description => 'TEXT'; 36 process ' table[bgcolor="#dfa473"]>tr>td[bgcolor="#ffffff"]>table[width="500"]>tr>td[class="h120"]>table>tr>td[valign="middle"]',37 process 'dd.bbsContent>dl>dd>div.communityPhoto>table>tr>td', 37 38 'images[]' => $scraper{images}; 38 result qw( time subject description name name_link images ); 39 process 'div#localNavigation>ul.localNaviCommunity>li.top>a', 40 link => '@href'; 41 result qw( time subject description name name_link images link ); 39 42 }; 40 43 … … 42 45 my $stash = $self->post_process($scraper{topic}->scrape(\$html))->[0]; 43 46 44 # XXX: this fails when you test with local files.45 # However, this link cannot be extracted from the html,46 # at least as of writing this. 
ugh.47 $stash->{link} = $self->{uri};48 49 47 $scraper{comments} = scraper { 50 process 'tr', 51 string => 'TEXT'; 52 process 'tr[valign="top"]>td[nowrap]', 53 time => 'TEXT'; 54 process 'tr[valign="top"]>td[bgcolor="#fdf9f2"]>a', 48 process 'dt>a', 55 49 link => '@href', 56 50 name => 'TEXT'; 57 process ' td[bgcolor="#ffffff"]>table[cellpadding="5"]>tr>td[class="h120"]',51 process 'dd', 58 52 description => 'TEXT'; 59 result qw( string timelink name description );53 result qw( link name description ); 60 54 }; 61 55 62 56 $scraper{list} = scraper { 63 process 'table[cellpadding="3"]>tr', 57 process 'dl.commentList01>dt[class="commentDate clearfix"]>span.date', 58 'times[]' => 'TEXT'; 59 process 'dl.commentList01>dd>dl.commentContent01', 64 60 'comments[]' => $scraper{comments}; 65 result qw( comments );61 result qw( times comments ); 66 62 }; 67 63 68 my $stash_c = $self->post_process($scraper{list}->scrape(\$html)) ;64 my $stash_c = $self->post_process($scraper{list}->scrape(\$html))->[0]; 69 65 70 my $tmp; 71 my @comments; 72 foreach my $comment ( @{ $stash_c } ) { 73 next if !$comment->{string} || $comment->{string} =~ /^\s*$/s; 74 if ( $comment->{time} ) { # meta 75 $tmp = { 76 time => $comment->{time}, 77 name => $comment->{name}, 78 link => $comment->{link}, 79 }; 80 } 81 elsif ( $comment->{description} && $tmp->{time} ) { # body 82 $tmp->{description} = $comment->{description}; 83 push @comments, $tmp; 84 $tmp = {}; 85 } 66 my @comments = @{ $stash_c->{comments} || [] }; 67 my @times = @{ $stash_c->{times} || [] }; 68 foreach my $comment ( @comments ) { 69 $comment->{time} = _datetime( shift @times ); 70 $comment->{link} = _uri( $comment->{link} ); 86 71 } 87 72 $stash->{comments} = \@comments; -
lang/perl/WWW-Mixi-Scraper/trunk/lib/WWW/Mixi/Scraper/Plugin/ViewDiary.pm
r4 r333 22 22 }; 23 23 24 $scraper{diary _body} = scraper {25 process ' tr[valign="top"]>td[nowrap]',24 $scraper{diary} = scraper { 25 process 'div.viewDiaryBox>div.listDiaryTitle>dl>dd', 26 26 time => 'TEXT'; 27 process ' tr[valign="top"]>td[width="430"]',27 process 'div.viewDiaryBox>div.listDiaryTitle>dl>dt', 28 28 subject => 'TEXT'; 29 process 'tr>td>table[width="410"]>tr>td[class="h12"]', 29 process 'div.viewDiaryBox>div.listDiaryTitle>dl>dt>span', 30 string => 'TEXT'; 31 process 'div#diary_body', 30 32 description => 'TEXT'; 31 process ' tr>td>table[width="410"]>tr>td>table>tr>td[valign="middle"]',33 process 'div.diaryPhoto>table>tr>td', 32 34 'images[]' => $scraper{images}; 33 result qw( time subject description images ); 34 }; 35 36 $scraper{diary} = scraper { 37 process 'td[width="540"]>table[bgcolor="#F8A448"]>tr>td[colspan="2"]>table[cellpadding="3"]', 38 diary => $scraper{diary_body}; 39 result qw( diary ); 35 process 'div#localNavigation>ul.localNaviHome>li.top>a', 36 mylink => '@href'; 37 process 'div#localNavigation>ul.localNaviFriend>li.top>a', 38 link => '@href'; 39 result qw( time subject description images link mylink string ); 40 40 }; 41 41 42 42 my $stash = $self->post_process($scraper{diary}->scrape(\$html))->[0]; 43 $stash->{link} ||= delete $stash->{mylink}; 43 44 44 # XXX: this fails when you test with local files. 45 # However, this link cannot be extracted from the html, 46 # at least as of writing this. ugh. 
47 $stash->{link} = $self->{uri}; 45 my $string = delete $stash->{string} || ''; 46 $stash->{subject} =~ s/$string$//; 48 47 49 48 $scraper{comments} = scraper { 50 process 'tr', 51 string => 'TEXT'; 52 process 'td[nowrap]', 49 process 'dl.commentList01>dt>span.commentTitleDate', 53 50 time => 'TEXT'; 54 process ' td[width="430"]>table[width="410"]>tr>td>a',51 process 'dl.commentList01>dt>span.commentTitleName>a', 55 52 link => '@href', 56 53 name => 'TEXT'; 57 process ' td[bgcolor="#ffffff"]>table[cellpadding="5"]>tr>td[class="h12"]',54 process 'dl.commentList01>dd', 58 55 description => 'TEXT'; 59 result qw( stringtime link name description );56 result qw( time link name description ); 60 57 }; 61 58 62 59 $scraper{list} = scraper { 63 process ' a[name="comment"]+table>tr>td[colspan="2"]>table[cellpadding="3"]>tr',60 process 'div.diaryCommentbox', 64 61 'comments[]' => $scraper{comments}; 65 62 result qw( comments ); 66 63 }; 67 64 68 my $stash_c = $self->post_process($scraper{list}->scrape(\$html)); 69 70 my $tmp; 71 my @comments; 72 foreach my $comment ( @{ $stash_c } ) { 73 next if !$comment->{string} || $comment->{string} =~ /^\s*$/s; 74 if ( $comment->{time} ) { # meta 75 $tmp = { 76 time => $comment->{time}, 77 name => $comment->{name}, 78 link => $comment->{link}, 79 }; 80 } 81 else { # body 82 $tmp->{description} = $comment->{description}; 83 push @comments, $tmp; 84 } 85 } 86 $stash->{comments} = \@comments; 65 $stash->{comments} = $self->post_process($scraper{list}->scrape(\$html)); 87 66 88 67 return $stash; -
lang/perl/WWW-Mixi-Scraper/trunk/lib/WWW/Mixi/Scraper/Plugin/ViewEvent.pm
r4 r333 4 4 use warnings; 5 5 use WWW::Mixi::Scraper::Plugin; 6 use WWW::Mixi::Scraper::Utils qw( _uri _datetime ); 6 7 use utf8; 7 8 … … 24 25 }; 25 26 26 $scraper{topic} = scraper { 27 process 'td[rowspan]', 28 'time' => 'TEXT'; 29 process 'td[nowrap]', 30 'name' => 'TEXT'; 31 process 'td:not([align])', 32 'string' => 'TEXT'; 33 process 'td:not([rowspan])>a', 34 'link' => '@href'; 35 process 'td[colspan="2"]>table>tr>td[valign="middle"]', 36 'images[]' => $scraper{images}; 37 result qw( time name string link images ); 27 $scraper{infos} = scraper { 28 process 'dt', 29 name => 'TEXT'; 30 process 'dd', 31 string => 'TEXT'; 32 process 'dd>a', 33 link => '@href', 34 subject => 'TEXT'; 35 result qw( name string link subject ); 38 36 }; 39 37 40 $scraper{table} = scraper { 41 process 'table[bgcolor="#F8A448"]>tr>td[colspan="2"]>table[width="630"]>tr', 42 'topic[]' => $scraper{topic}; 43 result qw( topic ); 38 $scraper{topic} = scraper { 39 process 'dl.bbsList01>dt>span.date', 40 'time' => 'TEXT'; 41 process 'dl.bbsList01>dt[class="bbsTitle clearfix"]>span.titleSpan', 42 'subject' => 'TEXT'; 43 process 'dd.bbsContent>dl>dt>a', 44 'name' => 'TEXT', 45 'name_link' => '@href'; 46 process 'dd.bbsContent>dl>dd', 47 'description' => 'TEXT'; 48 process 'div.communityPhoto>table>tr>td', 49 'images[]' => $scraper{images}; 50 process 'dl.bbsList01>dd.bbsInfo>dl', 51 'infos[]' => $scraper{infos}; 52 result qw( time subject name name_link images infos description ); 44 53 }; 45 54 46 55 $scraper{comment_body} = scraper { 47 process 'td[rowspan]', 48 'time' => 'TEXT'; 49 process 'td[bgcolor="#FDF9F2"]>font>b', 50 'subject' => 'TEXT'; 51 process 'td[bgcolor="#FDF9F2"]>a', 56 process 'dl.commentContent01>dt>a', 52 57 'link' => '@href', 53 58 'name' => 'TEXT'; 54 process ' td[bgcolor="#FFFFFF"]>table>tr>td[width="500"]',59 process 'dl.commentContent01>dd', 55 60 'description' => 'TEXT'; 56 process ' td[bgcolor="#FFFFFF"]>table>tr>td[width="500"]>table>tr>td[valign="middle"]',61 process 
'dl.commentContent01>dd>table>tr>td', 57 62 'images[]' => $scraper{images}; 58 result qw( time name link subjectdescription images );63 result qw( link name description images ); 59 64 }; 60 65 61 66 $scraper{comment} = scraper { 62 process 'table[bgcolor="#DFB479"]>tr>td>table[width="630"]>tr', 67 process 'dl.commentList01>dt>span.date', 68 'dates[]' => 'TEXT'; 69 process 'dl.commentList01>dt>span.senderId', 70 'sender_ids[]' => 'TEXT'; 71 process 'dl.commentList01>dd', 63 72 'comments[]' => $scraper{comment_body}; 64 result 'comments';73 result qw( dates comments sender_ids ); 65 74 }; 66 75 67 my $stash = {}; 68 my $items = $self->post_process($scraper{table}->scrape(\$html)); 76 my $stash = $self->post_process($scraper{topic}->scrape(\$html))->[0]; 69 77 70 foreach my $item (@{ $items || [] }) { 71 if ( $item->{time} ) { 72 $stash->{time} = $item->{time}; 73 } 74 if ( $item->{images} ) { 75 $stash->{images} = $item->{images}; 76 } 77 78 next unless $item->{name}; 79 80 if ( $item->{name} eq 'タイトル' ) { 81 $stash->{subject} = $item->{string}; 82 } 78 foreach my $item (@{ $stash->{infos} || [] }) { 83 79 if ( $item->{name} eq '開催日時' ) { 84 80 $stash->{date} = $item->{string}; … … 90 86 $stash->{location} = $item->{string}; 91 87 } 92 if ( $item->{name} eq '詳細' ) {93 $stash->{description} = $item->{string};94 }95 if ( $item->{name} eq '企画者' ) {96 $stash->{name} = $item->{string};97 $stash->{name_link} = $item->{link};98 }99 88 if ( $item->{name} eq '参加者' ) { 100 my ($count, $subject) = $item->{string} =~ /(\d+人)\s+(\S+)/; 101 $stash->{list}->{count} = $count; 102 $stash->{list}->{link} = $item->{link}; 103 $stash->{list}->{subject} = $subject; 104 } 105 if ( $item->{name} eq '関連コミュニティ' ) { 106 $stash->{community}->{name} = $item->{string}; 107 $stash->{community}->{link} = $item->{link}; 89 $stash->{list}->{count} = $item->{string}; 90 $stash->{list}->{link} = _uri( $item->{link} ); 91 $stash->{list}->{subject} = $item->{subject}; 108 92 } 109 93 } … … 114 98 
$stash->{link} = $self->{uri}; 115 99 116 my $stash_c = $self->post_process($scraper{comment}->scrape(\$html)) ;100 my $stash_c = $self->post_process($scraper{comment}->scrape(\$html))->[0]; 117 101 118 my $tmp; 119 my @comments; 120 foreach my $comment (@{ $stash_c || [] }) { 121 next if !$comment->{description} && !$comment->{time}; 122 if ( $comment->{time} ) { # meta 123 $tmp = { 124 time => $comment->{time}, 125 name => $comment->{name}, 126 subject => $comment->{subject}, 127 link => $comment->{link}, 128 }; 129 } 130 if ( $comment->{description} ) { 131 $tmp->{description} = $comment->{description}; 132 $tmp->{images} = $comment->{images}; 133 push @comments, $tmp; 134 $tmp = {}; 102 my @dates = @{ $stash_c->{dates} || [] }; 103 my @sender_ids = @{ $stash_c->{sender_ids} || [] }; 104 my @comments = @{ $stash_c->{comments} || [] }; 105 foreach my $comment ( @comments ) { 106 $comment->{time} = _datetime( shift @dates ); 107 $comment->{subject} = shift @sender_ids; 108 $comment->{link} = _uri( $comment->{link} ); 109 110 if ( $comment->{images} ) { 111 foreach my $image ( @{ $comment->{images} || [] } ) { 112 $image->{link} = _uri( $image->{link} ); 113 $image->{thumb_link} = _uri( $image->{thumb_link} ); 114 } 135 115 } 136 116 } 117 137 118 $stash->{comments} = \@comments; 138 119 -
lang/perl/WWW-Mixi-Scraper/trunk/lib/WWW/Mixi/Scraper/Plugin/ViewMessage.pm
r4 r333 46 46 47 47 my $time = ( map { $_->{string} } grep { !$_->{table} } @{ $stash->{body} } )[0]; 48 $time =~ s/^.+://;48 $time =~ s/^.*(\d{4})\D+(\d{2})\D+(\d{2})\D+(\d{2})\D+(\d{2}).*$/$1\-$2\-$3 $4:$5/; 49 49 50 50 my $message = { -
lang/perl/WWW-Mixi-Scraper/trunk/lib/WWW/Mixi/Scraper/Utils.pm
r4 r333 17 17 18 18 sub _datetime { 19 my $string = shift; 19 my $date = shift; 20 20 21 unless ( defined $string ) { 21 unless ( defined $date ) { 22 22 warn "datetime is not defined"; return; 23 23 } 24 24 25 $string =~ s/^\s+//s; 26 my @string = split /\s+/s, $string; 27 my ($date, $time); 28 if ( $string[2] && $string[2] =~ /\d+:\d+/ ) { 29 $date = join "", @string[0,1]; 30 $time = $string[2]; 31 } 32 else { 33 $date = $string[0]; 34 $time = $string[1]; 25 my $time; 26 if ( $date =~ s/\s*(\d+:\d+(?::\d+)?)\s*$// ) { 27 $time = $1; 35 28 } 36 29 37 30 $date =~ s/\D/\-/g; 38 31 $date =~ s/\-+$//; 39 40 if ( $time ) { 41 $time =~ s/\D/:/g; 42 $time =~ s/:+$//; 43 } 44 32 45 33 return $time ? "$date $time" : $date; # should be DateTime object? -
lang/perl/WWW-Mixi-Scraper/trunk/t_live/lib/Utils.pm
r233 r333 125 125 _ok( $key, $item->{$key} ); 126 126 my $dt = $date_format->parse_datetime( $item->{$key} ); 127 Test::More::ok defined $dt ;127 Test::More::ok defined $dt, 'proper datetime'; 128 128 } 129 129 if ( $rule eq 'uri' ) { 130 130 _ok( $key, $item->{$key} ); 131 Test::More::ok ref $item->{$key} && $item->{$key}->isa('URI') ;131 Test::More::ok ref $item->{$key} && $item->{$key}->isa('URI'), 'proper uri'; 132 132 } 133 133 if ( ref $rule eq 'HASH' ) { -
lang/perl/WWW-Mixi-Scraper/trunk/t_live/list_comment.t
r233 r333 15 15 date_format('%Y-%m-%d %H:%M'); 16 16 17 run_tests('list_comment') or ok 'ignored';17 run_tests('list_comment') or ok 1, 'skipped: no tests'; 18 18 19 19 sub test { 20 20 my @items = $mixi->list_comment->parse(@_); 21 21 22 return ok 'skipped: no comments' unless @items;22 return ok 1, 'skipped: no comments' unless @items; 23 23 24 24 foreach my $item ( @items ) { -
lang/perl/WWW-Mixi-Scraper/trunk/t_live/list_diary.t
r233 r333 20 20 # date_format('%m-%d %H:%M'); 21 21 22 run_tests('list_diary') or ok 'ignored';22 run_tests('list_diary') or ok 1, 'skipped: no tests'; 23 23 24 24 sub test { 25 25 my @items = $mixi->list_diary->parse(@_); 26 26 27 return ok 'skipped: no diary' unless @items;27 return ok 1, 'skipped: no diary' unless @items; 28 28 29 29 foreach my $item ( @items ) { -
lang/perl/WWW-Mixi-Scraper/trunk/t_live/list_message.t
r233 r333 17 17 # date_format('%m-%d'); 18 18 19 run_tests('list_message') or ok 'ignored';19 run_tests('list_message') or ok 1, 'skipped: no tests'; 20 20 21 21 sub test { 22 22 my @items = $mixi->list_message->parse(@_) ; 23 23 24 return ok 'skipped: no messages' unless @items;24 return ok 1, 'skipped: no messages' unless @items; 25 25 26 26 foreach my $item ( @items ) { -
lang/perl/WWW-Mixi-Scraper/trunk/t_live/new_bbs.t
r233 r333 15 15 date_format('%Y-%m-%d %H:%M'); 16 16 17 run_tests('new_bbs') or ok 'ignored';17 run_tests('new_bbs') or ok 1, 'skipped: no tests'; 18 18 19 19 sub test { 20 20 my @items = $mixi->new_bbs->parse(@_); 21 21 22 return ok 'skipped: no new bbs entries' unless @items;22 return ok 1, 'skipped: no new bbs entries' unless @items; 23 23 24 24 foreach my $item ( @items ) { -
lang/perl/WWW-Mixi-Scraper/trunk/t_live/new_friend_diary.t
r233 r333 15 15 date_format('%Y-%m-%d %H:%M'); 16 16 17 run_tests('new_friend_diary') or ok 'ignored';17 run_tests('new_friend_diary') or ok 1, 'skipped: no tests'; 18 18 19 19 sub test { 20 20 my @items = $mixi->new_friend_diary->parse(@_); 21 21 22 return ok 'skipped: no new diary entries' unless @items;22 return ok 1, 'skipped: no new diary entries' unless @items; 23 23 24 24 foreach my $item ( @items ) { -
lang/perl/WWW-Mixi-Scraper/trunk/t_live/new_music.t
r233 r333 15 15 date_format('%Y-%m-%d %H:%M'); 16 16 17 run_tests('new_music') or ok 'ignored';17 run_tests('new_music') or ok 1, 'skipped: no tests'; 18 18 19 19 sub test { 20 20 my @items = $mixi->new_music->parse(@_); 21 21 22 return ok 'skipped: no new musics' unless @items;22 return ok 1, 'skipped: no new musics' unless @items; 23 23 24 24 foreach my $item ( @items ) { -
lang/perl/WWW-Mixi-Scraper/trunk/t_live/new_video.t
r233 r333 15 15 date_format('%Y-%m-%d %H:%M'); 16 16 17 run_tests('new_video') or ok 'ignored';17 run_tests('new_video') or ok 1, 'skipped: no tests'; 18 18 19 19 sub test { 20 20 my @items = $mixi->new_video->parse(@_); 21 21 22 return ok 'skipped: no new videos' unless @items;22 return ok 1, 'skipped: no new videos' unless @items; 23 23 24 24 foreach my $item ( @items ) { -
lang/perl/WWW-Mixi-Scraper/trunk/t_live/show_calendar.t
r233 r333 17 17 date_format('%Y-%m-%d'); 18 18 19 run_tests('show_calendar') or ok 'ignored';19 run_tests('show_calendar') or ok 1, 'skipped: no tests'; 20 20 21 21 sub test { 22 22 my @items = $mixi->show_calendar->parse(@_); 23 23 24 return ok 'skipped: no calendar items' unless @items;24 return ok 1, 'skipped: no calendar items' unless @items; 25 25 26 26 foreach my $item ( @items ) { -
lang/perl/WWW-Mixi-Scraper/trunk/t_live/show_friend.t
r233 r333 13 13 }; 14 14 15 run_tests('show_friend') or ok 'ignored';15 run_tests('show_friend') or ok 1, 'skipped: no tests'; 16 16 17 17 sub test { -
lang/perl/WWW-Mixi-Scraper/trunk/t_live/show_log.t
r233 r333 14 14 date_format('%Y-%m-%d %H:%M'); 15 15 16 run_tests('show_log') or ok 'ignored';16 run_tests('show_log') or ok 1, 'skipped: no tests'; 17 17 18 18 sub test { 19 19 my @items = $mixi->show_log->parse(@_); 20 20 21 return ok 'skipped: no logs' unless @items;21 return ok 1, 'skipped: no logs' unless @items; 22 22 23 23 foreach my $item ( @items ) { -
lang/perl/WWW-Mixi-Scraper/trunk/t_live/view_bbs.t
r233 r333 12 12 time => 'datetime', 13 13 name_link => 'uri', 14 link => 'uri _if_remote',15 comment => {14 link => 'uri', 15 comments => { 16 16 name => 'string', 17 17 description => 'string', … … 27 27 date_format('%Y-%m-%d %H:%M'); 28 28 29 run_tests('view_bbs') or ok 'ignored';29 run_tests('view_bbs') or ok 1, 'skipped: no tests'; 30 30 31 31 sub test { -
lang/perl/WWW-Mixi-Scraper/trunk/t_live/view_diary.t
r233 r333 10 10 description => 'string', 11 11 time => 'datetime', 12 link => 'uri _if_remote',12 link => 'uri', 13 13 # not yet implemented 14 14 # level => { … … 30 30 date_format('%Y-%m-%d %H:%M'); 31 31 32 run_tests('view_diary') or ok 'ignored';32 run_tests('view_diary') or ok 1, 'skipped: no tests'; 33 33 34 34 sub test { -
lang/perl/WWW-Mixi-Scraper/trunk/t_live/view_event.t
r233 r333 51 51 date_format('%Y-%m-%d %H:%M'); 52 52 53 run_tests('view_event') or ok 'ignored';53 run_tests('view_event') or ok 1, 'skipped: no tests'; 54 54 55 55 sub test { -
lang/perl/WWW-Mixi-Scraper/trunk/t_live/view_message.t
r233 r333 17 17 date_format('%Y-%m-%d %H:%M'); 18 18 19 run_tests('view_message') or ok 'ignored';19 run_tests('view_message') or ok 1, 'skipped: no tests'; 20 20 21 21 sub test {
![(please configure the [header_logo] section in trac.ini)](/share/chrome/site/your_project_logo.png)