Changeset 333

Show
Ignore:
Timestamp:
10/02/07 14:43:41 (6 years ago)
Author:
charsbar
Message:

lang/perl/WWW-Mixi-Scraper: merged 20071001_mixi_renewal branch and uploaded 0.07 to CPAN

Location:
lang/perl/WWW-Mixi-Scraper/trunk
Files:
29 modified

Legend:

Unmodified
Added
Removed
  • lang/perl/WWW-Mixi-Scraper/trunk/Changes

    r4 r333  
    11Revision history for WWW-Mixi-Scraper 
    22 
    3 0.07  not yet released 
     30.07  2007/10/02 
     4  - scraping rules are totally refactored due to the mixi's renewal. 
     5    now it uses id/class attributes everywhere, so scraping is much 
     6    easier. mixi++ (and k*z*b*r*++, who secretly sent me a patch) 
     7  - and now live tests dump their contents if you test verbosely. 
    48  - new plugin: NewMusic 
    59 
  • lang/perl/WWW-Mixi-Scraper/trunk/META.yml

    r4 r333  
    11--- #YAML:1.0 
    22name:                WWW-Mixi-Scraper 
    3 version:             0.06 
     3version:             0.07 
    44abstract:            yet another mixi scraper 
    55license:             perl 
  • lang/perl/WWW-Mixi-Scraper/trunk/lib/WWW/Mixi/Scraper.pm

    r4 r333  
    44use warnings; 
    55 
    6 our $VERSION = '0.06'; 
     6our $VERSION = '0.07'; 
    77 
    88use String::CamelCase qw( decamelize ); 
     
    9393WWW::Mixi has much longer history and is full-stack. The data it returns tends to be more complete, fine-tuned, and raw in many ways (including encoding). However, it tends to suffer from minor html changes as it heavily relies on regexes, and maybe it is too monolithic. 
    9494 
    95 In contrast, WWW::Mixi::Scraper hopefully tends to survive minor html changes as it relies on XPath. And basically it uses decoded perl strings, not octets. It's smaller, and pluggable. However, its data is more or less pre-processed and tends to lose some aspects such as proper line breaks. Also, it may be easier to be polluted with garbages (partly because mixi doesn't rely much on CSS; it's hard to locate exact area to scrape by XPath). And it may be harder to understand and maintain XPath rules. 
     95In contrast, WWW::Mixi::Scraper hopefully tends to survive minor html changes as it relies on XPath. And basically it uses decoded perl strings, not octets. It's smaller, and pluggable. However, its data is more or less pre-processed and tends to lose some aspects such as proper line breaks. Also, it may be easier to be polluted with garbages. And it may be harder to understand and maintain XPath rules. 
    9696 
    9797Which to choose? It depends. For now, ::Scraper is too limited, but if all you want is rough data to tell you who updated, or what was updated, ::Scraper may be a good option. 
  • lang/perl/WWW-Mixi-Scraper/trunk/lib/WWW/Mixi/Scraper/Plugin/ListComment.pm

    r4 r333  
    1212  my %scraper; 
    1313  $scraper{comments} = scraper { 
    14     process 'td[width="450"]', 
     14    process 'dl>dd', 
    1515      string => 'TEXT'; 
    16     process 'td[width="450"]>a', 
     16    process 'dl>dd>a', 
    1717      link => '@href', 
    1818      subject => 'TEXT'; 
    19     process 'td[width="180"]', 
     19    process 'dl>dt', 
    2020      time => 'TEXT'; 
    2121    result qw( string time link subject ); 
     
    2323 
    2424  $scraper{list} = scraper { 
    25     process 'tr[bgcolor="#FFFFFF"]', 
     25    process 'div.listCommentArea>ul.entryList01>li', 
    2626      'comments[]' => $scraper{comments}; 
    2727    result qw( comments ); 
  • lang/perl/WWW-Mixi-Scraper/trunk/lib/WWW/Mixi/Scraper/Plugin/ListDiary.pm

    r4 r333  
    2424 
    2525  $scraper{diaries} = scraper { 
    26     process 'td[nowrap]', 
     26    process 'div.listDiaryTitle>dl>dd', 
    2727      time => 'TEXT'; 
    28     process 'td[bgcolor="#FFF4E0"]>a', 
     28    process 'div.listDiaryTitle>dl>dt>a', 
    2929      link   => '@href', 
    3030      subject => 'TEXT'; 
    31     process 'td[bgcolor="#FFFFFF"]>table[cellpadding="3"]>tr>td[class="h120"]', 
     31    process 'p', 
    3232      description => 'TEXT'; 
    33     process 'td[bgcolor="#FFFFFF"]>table[cellpadding="3"]>tr>td[class="h120"]>table>tr>td>a>img', 
     33    process 'div.diaryPhoto>a>img', 
    3434      'images[]' => '@src'; 
    35     process 'td[align="right"]>a', 
     35    process 'div.diaryEditMenu>ul>li', 
    3636      'meta[]' => $scraper{meta}; 
    3737    result qw( time link subject description images meta ); 
     
    3939 
    4040  $scraper{list} = scraper { 
    41     process 'table[width="525"]>tr', 
     41    process 'div.listDiaryBlock', 
    4242      'diaries[]' => $scraper{diaries}; 
    4343    result qw( diaries ); 
     
    4646  my $stash = $self->post_process($scraper{list}->scrape(\$html)); 
    4747 
    48   my $tmp; 
    49   my @diaries; 
    50   foreach my $item ( @{ $stash } ) { 
    51     if ( $item->{time} ) {  # meta 
    52       $tmp = { 
    53         time    => $item->{time}, 
    54         link    => $item->{link}, 
    55         subject => $item->{subject}, 
    56       }; 
    57     } 
    58     elsif ( $item->{description} ) { 
    59       $tmp->{description} = $item->{description}; 
    60       $tmp->{images}      = $item->{images}; 
    61     } 
    62     elsif ( $item->{meta} ) { 
    63       foreach my $meta ( @{ $item->{meta} || [] } ) { 
    64         if ( ($meta->{href} || '') =~ /#(?:write|comment)$/ ) { 
    65           my ($count) = $meta->{text} =~ /\((\d+)\)/; 
    66           $tmp->{count} = $count; 
    67         } 
     48  foreach my $diary ( @{ $stash } ) { 
     49    my $meta = delete $diary->{meta}; 
     50    foreach my $item ( @{ $meta || [] } ) { 
     51      if ( ($item->{href} || '') =~ /#(?:write|comment)$/ ) { 
     52        my ($count) = $item->{text} =~ /(\d+)/; 
     53        $diary->{count} = $count; 
    6854      } 
    69       push @diaries, $tmp; 
    7055    } 
    7156  } 
    7257 
    73   return \@diaries; 
     58  return $stash; 
    7459} 
    7560 
  • lang/perl/WWW-Mixi-Scraper/trunk/lib/WWW/Mixi/Scraper/Plugin/NewBBS.pm

    r4 r333  
    1212  my %scraper; 
    1313  $scraper{entries} = scraper { 
    14     process 'td[width="180"]', 
     14    process 'dl>dt', 
    1515      time => 'TEXT'; 
    16     process 'td[width="450"]>a', 
     16    process 'dl>dd>a', 
    1717      subject => 'TEXT', 
    1818      link    => '@href'; 
    19     process 'td[width="450"]', 
     19    process 'dl>dd', 
    2020      string => 'TEXT'; 
    2121    result qw( string subject link time ); 
     
    2323 
    2424  $scraper{list} = scraper { 
    25     process 'tr[bgcolor="#FFFFFF"]', 
     25    process 'div.newBbsArea>ul.entryList01>li', 
    2626      'entries[]' => $scraper{entries}; 
    2727    result qw( entries ); 
  • lang/perl/WWW-Mixi-Scraper/trunk/lib/WWW/Mixi/Scraper/Plugin/NewFriendDiary.pm

    r4 r333  
    1212  my %scraper; 
    1313  $scraper{entries} = scraper { 
    14     process 'td[width="180"]', 
     14    process 'dl>dt', 
    1515      time => 'TEXT'; 
    16     process 'td[width="450"]>a', 
     16    process 'dl>dd>a', 
    1717      subject => 'TEXT', 
    1818      link    => '@href'; 
    19     process 'td[width="450"]', 
     19    process 'dl>dd', 
    2020      string => 'TEXT'; 
    2121    result qw( string subject link time ); 
     
    2323 
    2424  $scraper{list} = scraper { 
    25     process 'tr[bgcolor="#FFFFFF"]', 
     25    process 'ul.entryList01>li', 
    2626      'entries[]' => $scraper{entries}; 
    2727    result qw( entries ); 
  • lang/perl/WWW-Mixi-Scraper/trunk/lib/WWW/Mixi/Scraper/Plugin/ShowFriend.pm

    r4 r333  
    55use WWW::Mixi::Scraper::Plugin; 
    66use WWW::Mixi::Scraper::Utils qw( _uri ); 
     7use utf8; 
    78 
    89validator {qw( id is_number )}; 
     
    2223  my %scraper; 
    2324  $scraper{items} = scraper { 
    24     process 'td[width="80"]', 
     25    process 'dl>dt', 
    2526      key => 'TEXT'; 
    26     process 'td[width!="80"]', 
     27    process 'dl>dd', 
    2728      value => 'TEXT'; 
    2829    result qw( key value ); 
     
    3031 
    3132  $scraper{profile} = scraper { 
    32     process 'table[width="425"]>tr[bgcolor="#FFFFFF"]', 
     33    process 'div#profile>ul>li', 
    3334      'items[]' => $scraper{items}; 
    3435    result qw( items ); 
     
    5859 
    5960  $scraper{outline} = scraper { 
    60     process 'table[bgcolor="#FEC977"]>tr>td[colspan="3"]', 
    61       'string[]' => 'TEXT'; 
    62     process 'table[width="270"]>tr>td[colspan="3"]>a', 
     61    process 'div#myProfile>div.contents01>h3', 
     62      'string' => 'TEXT'; 
     63    process 'div#myProfile>div.contents01>p.loginTime', 
     64      'description' => 'TEXT'; 
     65    process 'div#myProfile>p.friendPath>a', 
    6366      'relations[]' => $scraper{relations}; 
    64     process 'table[width="250"]>tr>td>img[vspace="2"]', 
     67    process 'div#myProfile>div.contents01>img', 
    6568      image => '@src'; 
    66     result qw( image string relations ); 
     69    process 'div#localNavigation>ul.localNaviFriend>li.top>a', 
     70      link  => '@href'; 
     71    result qw( image string relations description link ); 
    6772  }; 
    6873 
     
    7883  $stash->{relation} = shift @relations if @relations > 1; 
    7984 
    80   foreach my $string (@{ delete $stash->{string} || [] }) { 
    81     if ( $string =~ /^(.+)\((\d+)\)\s+\(([^)]+)\)\s*$/ ) { 
    82       $stash->{name} = $1; 
    83       $stash->{count} = $2; 
    84       $stash->{description} = $3; 
    85     } 
    86     elsif ( $string =~ /^(.+)\((\d+)\)\s*$/ ) { # may be yourself 
    87       $stash->{name} = $1; 
    88       $stash->{count} = $2; 
    89     } 
     85  my $string = delete $stash->{string} || ''; 
     86  if ( $string =~ s/\((\d+)\)$// ) { 
     87    $stash->{name}  = $string; 
     88    $stash->{count} = $1; 
    9089  } 
    91  
    92   # XXX: this fails when you test with local files. 
    93   # In this case, we can scrape the link from the 'snavi' toolbar 
    94   # but it's ugly. 
    95   $stash->{link} = $self->{uri}; 
     90  if ( $stash->{description} ) { 
     91    $stash->{description} =~ s/^(//; 
     92    $stash->{description} =~ s/)$//; 
     93  } 
    9694 
    9795  return $stash; 
  • lang/perl/WWW-Mixi-Scraper/trunk/lib/WWW/Mixi/Scraper/Plugin/ShowLog.pm

    r4 r333  
    44use warnings; 
    55use WWW::Mixi::Scraper::Plugin; 
     6 
     7validator {( page => 'is_number' )}; 
    68 
    79sub scrape { 
     
    2426  }; 
    2527 
    26   return $self->post_process($scraper{list}->scrape(\$html)); 
     28  return $self->post_process($scraper{list}->scrape(\$html), \&_callback); 
     29} 
     30 
     31sub _callback { 
     32  my $item  = shift; 
     33  my @parts = split /\s/, ($item->{time} || ''), 3; 
     34  $item->{time} = join ' ', @parts[0..1]; 
    2735} 
    2836 
  • lang/perl/WWW-Mixi-Scraper/trunk/lib/WWW/Mixi/Scraper/Plugin/ViewBBS.pm

    r4 r333  
    44use warnings; 
    55use WWW::Mixi::Scraper::Plugin; 
     6use WWW::Mixi::Scraper::Utils qw( _datetime _uri ); 
    67 
    78validator {qw( 
     
    2526 
    2627  $scraper{topic} = scraper { 
    27     process 'table[bgcolor="#dfa473"]>tr>td[bgcolor="#ffd8b0"]', 
     28    process 'dl[class="bbsList01 bbsDetail"]>dt>span.date', 
    2829      time => 'TEXT'; 
    29     process 'table[bgcolor="#dfa473"]>tr>td[bgcolor="#fff4e0"]', 
     30    process 'dl[class="bbsList01 bbsDetail"]>dt>span.titleSpan', 
    3031      subject => 'TEXT'; 
    31     process 'table[bgcolor="#dfa473"]>tr>td[bgcolor="#fdf9f2"]>a', 
     32    process 'dd.bbsContent>dl>dt>a', 
    3233      name      => 'TEXT', 
    3334      name_link => '@href'; 
    34     process 'table[bgcolor="#dfa473"]>tr>td[bgcolor="#ffffff"]>table[width="500"]>tr>td[class="h120"]', 
     35    process 'dd.bbsContent>dl>dd', 
    3536      description => 'TEXT'; 
    36     process 'table[bgcolor="#dfa473"]>tr>td[bgcolor="#ffffff"]>table[width="500"]>tr>td[class="h120"]>table>tr>td[valign="middle"]', 
     37    process 'dd.bbsContent>dl>dd>div.communityPhoto>table>tr>td', 
    3738      'images[]' => $scraper{images}; 
    38     result qw( time subject description name name_link images ); 
     39    process 'div#localNavigation>ul.localNaviCommunity>li.top>a', 
     40      link => '@href'; 
     41    result qw( time subject description name name_link images link ); 
    3942  }; 
    4043 
     
    4245  my $stash = $self->post_process($scraper{topic}->scrape(\$html))->[0]; 
    4346 
    44   # XXX: this fails when you test with local files. 
    45   # However, this link cannot be extracted from the html, 
    46   # at least as of writing this. ugh. 
    47   $stash->{link} = $self->{uri}; 
    48  
    4947  $scraper{comments} = scraper { 
    50     process 'tr', 
    51       string => 'TEXT'; 
    52     process 'tr[valign="top"]>td[nowrap]', 
    53       time => 'TEXT'; 
    54     process 'tr[valign="top"]>td[bgcolor="#fdf9f2"]>a', 
     48    process 'dt>a', 
    5549      link => '@href', 
    5650      name => 'TEXT'; 
    57     process 'td[bgcolor="#ffffff"]>table[cellpadding="5"]>tr>td[class="h120"]', 
     51    process 'dd', 
    5852      description => 'TEXT'; 
    59     result qw( string time link name description ); 
     53    result qw( link name description ); 
    6054  }; 
    6155 
    6256  $scraper{list} = scraper { 
    63     process 'table[cellpadding="3"]>tr', 
     57    process 'dl.commentList01>dt[class="commentDate clearfix"]>span.date', 
     58      'times[]' => 'TEXT'; 
     59    process 'dl.commentList01>dd>dl.commentContent01', 
    6460      'comments[]' => $scraper{comments}; 
    65     result qw( comments ); 
     61    result qw( times comments ); 
    6662  }; 
    6763 
    68   my $stash_c = $self->post_process($scraper{list}->scrape(\$html)); 
     64  my $stash_c = $self->post_process($scraper{list}->scrape(\$html))->[0]; 
    6965 
    70   my $tmp; 
    71   my @comments; 
    72   foreach my $comment ( @{ $stash_c } ) { 
    73     next if !$comment->{string} || $comment->{string} =~ /^\s*$/s; 
    74     if ( $comment->{time} ) {  # meta 
    75       $tmp = { 
    76         time => $comment->{time}, 
    77         name => $comment->{name}, 
    78         link => $comment->{link}, 
    79       }; 
    80     } 
    81     elsif ( $comment->{description} && $tmp->{time} ) {  # body 
    82       $tmp->{description} = $comment->{description}; 
    83       push @comments, $tmp; 
    84       $tmp = {}; 
    85     } 
     66  my @comments = @{ $stash_c->{comments} || [] }; 
     67  my @times    = @{ $stash_c->{times} || [] }; 
     68  foreach my $comment ( @comments ) { 
     69    $comment->{time} = _datetime( shift @times ); 
     70    $comment->{link} = _uri( $comment->{link} ); 
    8671  } 
    8772  $stash->{comments} = \@comments; 
  • lang/perl/WWW-Mixi-Scraper/trunk/lib/WWW/Mixi/Scraper/Plugin/ViewDiary.pm

    r4 r333  
    2222  }; 
    2323 
    24   $scraper{diary_body} = scraper { 
    25     process 'tr[valign="top"]>td[nowrap]', 
     24  $scraper{diary} = scraper { 
     25    process 'div.viewDiaryBox>div.listDiaryTitle>dl>dd', 
    2626      time => 'TEXT'; 
    27     process 'tr[valign="top"]>td[width="430"]', 
     27    process 'div.viewDiaryBox>div.listDiaryTitle>dl>dt', 
    2828      subject => 'TEXT'; 
    29     process 'tr>td>table[width="410"]>tr>td[class="h12"]', 
     29    process 'div.viewDiaryBox>div.listDiaryTitle>dl>dt>span', 
     30      string => 'TEXT'; 
     31    process 'div#diary_body', 
    3032      description => 'TEXT'; 
    31     process 'tr>td>table[width="410"]>tr>td>table>tr>td[valign="middle"]', 
     33    process 'div.diaryPhoto>table>tr>td', 
    3234      'images[]' => $scraper{images}; 
    33     result qw( time subject description images ); 
    34   }; 
    35  
    36   $scraper{diary} = scraper { 
    37     process 'td[width="540"]>table[bgcolor="#F8A448"]>tr>td[colspan="2"]>table[cellpadding="3"]', 
    38       diary => $scraper{diary_body}; 
    39     result qw( diary ); 
     35    process 'div#localNavigation>ul.localNaviHome>li.top>a', 
     36      mylink => '@href'; 
     37    process 'div#localNavigation>ul.localNaviFriend>li.top>a', 
     38      link => '@href'; 
     39    result qw( time subject description images link mylink string ); 
    4040  }; 
    4141 
    4242  my $stash = $self->post_process($scraper{diary}->scrape(\$html))->[0]; 
     43  $stash->{link} ||= delete $stash->{mylink}; 
    4344 
    44   # XXX: this fails when you test with local files. 
    45   # However, this link cannot be extracted from the html, 
    46   # at least as of writing this. ugh. 
    47   $stash->{link} = $self->{uri}; 
     45  my $string = delete $stash->{string} || ''; 
     46  $stash->{subject} =~ s/$string$//; 
    4847 
    4948  $scraper{comments} = scraper { 
    50     process 'tr', 
    51       string => 'TEXT'; 
    52     process 'td[nowrap]', 
     49    process 'dl.commentList01>dt>span.commentTitleDate', 
    5350      time => 'TEXT'; 
    54     process 'td[width="430"]>table[width="410"]>tr>td>a', 
     51    process 'dl.commentList01>dt>span.commentTitleName>a', 
    5552      link => '@href', 
    5653      name => 'TEXT'; 
    57     process 'td[bgcolor="#ffffff"]>table[cellpadding="5"]>tr>td[class="h12"]', 
     54    process 'dl.commentList01>dd', 
    5855      description => 'TEXT'; 
    59     result qw( string time link name description ); 
     56    result qw( time link name description ); 
    6057  }; 
    6158 
    6259  $scraper{list} = scraper { 
    63     process 'a[name="comment"]+table>tr>td[colspan="2"]>table[cellpadding="3"]>tr', 
     60    process 'div.diaryCommentbox', 
    6461      'comments[]' => $scraper{comments}; 
    6562    result qw( comments ); 
    6663  }; 
    6764 
    68   my $stash_c = $self->post_process($scraper{list}->scrape(\$html)); 
    69  
    70   my $tmp; 
    71   my @comments; 
    72   foreach my $comment ( @{ $stash_c } ) { 
    73     next if !$comment->{string} || $comment->{string} =~ /^\s*$/s; 
    74     if ( $comment->{time} ) {  # meta 
    75       $tmp = { 
    76         time => $comment->{time}, 
    77         name => $comment->{name}, 
    78         link => $comment->{link}, 
    79       }; 
    80     } 
    81     else {  # body 
    82       $tmp->{description} = $comment->{description}; 
    83       push @comments, $tmp; 
    84     } 
    85   } 
    86   $stash->{comments} = \@comments; 
     65  $stash->{comments} = $self->post_process($scraper{list}->scrape(\$html)); 
    8766 
    8867  return $stash; 
  • lang/perl/WWW-Mixi-Scraper/trunk/lib/WWW/Mixi/Scraper/Plugin/ViewEvent.pm

    r4 r333  
    44use warnings; 
    55use WWW::Mixi::Scraper::Plugin; 
     6use WWW::Mixi::Scraper::Utils qw( _uri _datetime ); 
    67use utf8; 
    78 
     
    2425  }; 
    2526 
    26   $scraper{topic} = scraper { 
    27     process 'td[rowspan]', 
    28       'time' => 'TEXT'; 
    29     process 'td[nowrap]', 
    30       'name' => 'TEXT'; 
    31     process 'td:not([align])', 
    32       'string' => 'TEXT'; 
    33     process 'td:not([rowspan])>a', 
    34       'link' => '@href'; 
    35     process 'td[colspan="2"]>table>tr>td[valign="middle"]', 
    36       'images[]' => $scraper{images}; 
    37     result qw( time name string link images ); 
     27  $scraper{infos} = scraper { 
     28    process 'dt', 
     29      name => 'TEXT'; 
     30    process 'dd', 
     31      string => 'TEXT'; 
     32    process 'dd>a', 
     33      link    => '@href', 
     34      subject => 'TEXT'; 
     35    result qw( name string link subject ); 
    3836  }; 
    3937 
    40   $scraper{table} = scraper { 
    41     process 'table[bgcolor="#F8A448"]>tr>td[colspan="2"]>table[width="630"]>tr', 
    42       'topic[]' => $scraper{topic}; 
    43     result qw( topic ); 
     38  $scraper{topic} = scraper { 
     39    process 'dl.bbsList01>dt>span.date', 
     40      'time' => 'TEXT'; 
     41    process 'dl.bbsList01>dt[class="bbsTitle clearfix"]>span.titleSpan', 
     42      'subject' => 'TEXT'; 
     43    process 'dd.bbsContent>dl>dt>a', 
     44      'name'      => 'TEXT', 
     45      'name_link' => '@href'; 
     46    process 'dd.bbsContent>dl>dd', 
     47      'description' => 'TEXT'; 
     48    process 'div.communityPhoto>table>tr>td', 
     49      'images[]' => $scraper{images}; 
     50    process 'dl.bbsList01>dd.bbsInfo>dl', 
     51      'infos[]' => $scraper{infos}; 
     52    result qw( time subject name name_link images infos description ); 
    4453  }; 
    4554 
    4655  $scraper{comment_body} = scraper { 
    47     process 'td[rowspan]', 
    48       'time' => 'TEXT'; 
    49     process 'td[bgcolor="#FDF9F2"]>font>b', 
    50       'subject' => 'TEXT'; 
    51     process 'td[bgcolor="#FDF9F2"]>a', 
     56    process 'dl.commentContent01>dt>a', 
    5257      'link' => '@href', 
    5358      'name' => 'TEXT'; 
    54     process 'td[bgcolor="#FFFFFF"]>table>tr>td[width="500"]', 
     59    process 'dl.commentContent01>dd', 
    5560      'description' => 'TEXT'; 
    56     process 'td[bgcolor="#FFFFFF"]>table>tr>td[width="500"]>table>tr>td[valign="middle"]', 
     61    process 'dl.commentContent01>dd>table>tr>td', 
    5762      'images[]' => $scraper{images}; 
    58     result qw( time name link subject description images ); 
     63    result qw( link name description images ); 
    5964  }; 
    6065 
    6166  $scraper{comment} = scraper { 
    62     process 'table[bgcolor="#DFB479"]>tr>td>table[width="630"]>tr', 
     67    process 'dl.commentList01>dt>span.date', 
     68      'dates[]' => 'TEXT'; 
     69    process 'dl.commentList01>dt>span.senderId', 
     70      'sender_ids[]' => 'TEXT'; 
     71    process 'dl.commentList01>dd', 
    6372      'comments[]' => $scraper{comment_body}; 
    64     result 'comments'; 
     73    result qw( dates comments sender_ids ); 
    6574  }; 
    6675 
    67   my $stash = {}; 
    68   my $items = $self->post_process($scraper{table}->scrape(\$html)); 
     76  my $stash = $self->post_process($scraper{topic}->scrape(\$html))->[0]; 
    6977 
    70   foreach my $item (@{ $items || [] }) { 
    71     if ( $item->{time} ) { 
    72       $stash->{time} = $item->{time}; 
    73     } 
    74     if ( $item->{images} ) { 
    75       $stash->{images} = $item->{images}; 
    76     } 
    77  
    78     next unless $item->{name}; 
    79  
    80     if ( $item->{name} eq 'タイトル' ) { 
    81       $stash->{subject} = $item->{string}; 
    82     } 
     78  foreach my $item (@{ $stash->{infos} || [] }) { 
    8379    if ( $item->{name} eq '開催日時' ) { 
    8480      $stash->{date} = $item->{string}; 
     
    9086      $stash->{location} = $item->{string}; 
    9187    } 
    92     if ( $item->{name} eq '詳細' ) { 
    93       $stash->{description} = $item->{string}; 
    94     } 
    95     if ( $item->{name} eq '企画者' ) { 
    96       $stash->{name}      = $item->{string}; 
    97       $stash->{name_link} = $item->{link}; 
    98     } 
    9988    if ( $item->{name} eq '参加者' ) { 
    100       my ($count, $subject) = $item->{string} =~ /(\d+人)\s+(\S+)/; 
    101       $stash->{list}->{count}   = $count; 
    102       $stash->{list}->{link}    = $item->{link}; 
    103       $stash->{list}->{subject} = $subject; 
    104     } 
    105     if ( $item->{name} eq '関連コミュニティ' ) { 
    106       $stash->{community}->{name} = $item->{string}; 
    107       $stash->{community}->{link} = $item->{link}; 
     89      $stash->{list}->{count}   = $item->{string}; 
     90      $stash->{list}->{link}    = _uri( $item->{link} ); 
     91      $stash->{list}->{subject} = $item->{subject}; 
    10892    } 
    10993  } 
     
    11498  $stash->{link} = $self->{uri}; 
    11599 
    116   my $stash_c = $self->post_process($scraper{comment}->scrape(\$html)); 
     100  my $stash_c = $self->post_process($scraper{comment}->scrape(\$html))->[0]; 
    117101 
    118   my $tmp; 
    119   my @comments; 
    120   foreach my $comment (@{ $stash_c || [] }) { 
    121     next if !$comment->{description} && !$comment->{time}; 
    122     if ( $comment->{time} ) { # meta 
    123       $tmp = { 
    124         time    => $comment->{time}, 
    125         name    => $comment->{name}, 
    126         subject => $comment->{subject}, 
    127         link    => $comment->{link}, 
    128       }; 
    129     } 
    130     if ( $comment->{description} ) { 
    131       $tmp->{description} = $comment->{description}; 
    132       $tmp->{images}      = $comment->{images}; 
    133       push @comments, $tmp; 
    134       $tmp = {}; 
     102  my @dates      = @{ $stash_c->{dates} || [] }; 
     103  my @sender_ids = @{ $stash_c->{sender_ids} || [] }; 
     104  my @comments   = @{ $stash_c->{comments} || [] }; 
     105  foreach my $comment ( @comments ) { 
     106    $comment->{time}    = _datetime( shift @dates ); 
     107    $comment->{subject} = shift @sender_ids; 
     108    $comment->{link}    = _uri( $comment->{link} ); 
     109 
     110    if ( $comment->{images} ) { 
     111      foreach my $image ( @{ $comment->{images} || [] } ) { 
     112        $image->{link}       = _uri( $image->{link} ); 
     113        $image->{thumb_link} = _uri( $image->{thumb_link} ); 
     114      } 
    135115    } 
    136116  } 
     117 
    137118  $stash->{comments} = \@comments; 
    138119 
  • lang/perl/WWW-Mixi-Scraper/trunk/lib/WWW/Mixi/Scraper/Plugin/ViewMessage.pm

    r4 r333  
    4646 
    4747  my $time = ( map { $_->{string} } grep { !$_->{table} } @{ $stash->{body} } )[0]; 
    48   $time =~ s/^.+://; 
     48     $time =~ s/^.*(\d{4})\D+(\d{2})\D+(\d{2})\D+(\d{2})\D+(\d{2}).*$/$1\-$2\-$3 $4:$5/; 
    4949 
    5050  my $message = { 
  • lang/perl/WWW-Mixi-Scraper/trunk/lib/WWW/Mixi/Scraper/Utils.pm

    r4 r333  
    1717 
    1818sub _datetime { 
    19   my $string = shift; 
     19  my $date = shift; 
    2020 
    21   unless ( defined $string ) { 
     21  unless ( defined $date ) { 
    2222    warn "datetime is not defined"; return; 
    2323  } 
    2424 
    25   $string =~ s/^\s+//s; 
    26   my @string = split /\s+/s, $string; 
    27   my ($date, $time); 
    28   if ( $string[2] && $string[2] =~ /\d+:\d+/ ) { 
    29      $date = join "", @string[0,1]; 
    30      $time = $string[2]; 
    31   } 
    32   else { 
    33      $date = $string[0]; 
    34      $time = $string[1]; 
     25  my $time; 
     26  if ( $date =~ s/\s*(\d+:\d+(?::\d+)?)\s*$// ) { 
     27    $time = $1; 
    3528  } 
    3629 
    3730  $date =~ s/\D/\-/g; 
    3831  $date =~ s/\-+$//; 
    39  
    40   if ( $time ) { 
    41     $time =~ s/\D/:/g; 
    42     $time =~ s/:+$//; 
    43   } 
    4432 
    4533  return $time ? "$date $time" : $date; # should be DateTime object? 
  • lang/perl/WWW-Mixi-Scraper/trunk/t_live/lib/Utils.pm

    r233 r333  
    125125      _ok( $key, $item->{$key} ); 
    126126      my $dt = $date_format->parse_datetime( $item->{$key} ); 
    127       Test::More::ok defined $dt; 
     127      Test::More::ok defined $dt, 'proper datetime'; 
    128128    } 
    129129    if ( $rule eq 'uri' ) { 
    130130      _ok( $key, $item->{$key} ); 
    131       Test::More::ok ref $item->{$key} && $item->{$key}->isa('URI'); 
     131      Test::More::ok ref $item->{$key} && $item->{$key}->isa('URI'), 'proper uri'; 
    132132    } 
    133133    if ( ref $rule eq 'HASH' ) { 
  • lang/perl/WWW-Mixi-Scraper/trunk/t_live/list_comment.t

    r233 r333  
    1515date_format('%Y-%m-%d %H:%M'); 
    1616 
    17 run_tests('list_comment') or ok 'ignored'; 
     17run_tests('list_comment') or ok 1, 'skipped: no tests'; 
    1818 
    1919sub test { 
    2020  my @items = $mixi->list_comment->parse(@_); 
    2121 
    22   return ok 'skipped: no comments' unless @items; 
     22  return ok 1, 'skipped: no comments' unless @items; 
    2323 
    2424  foreach my $item ( @items ) { 
  • lang/perl/WWW-Mixi-Scraper/trunk/t_live/list_diary.t

    r233 r333  
    2020# date_format('%m-%d %H:%M'); 
    2121 
    22 run_tests('list_diary') or ok 'ignored'; 
     22run_tests('list_diary') or ok 1, 'skipped: no tests'; 
    2323 
    2424sub test { 
    2525  my @items = $mixi->list_diary->parse(@_); 
    2626 
    27   return ok 'skipped: no diary' unless @items; 
     27  return ok 1, 'skipped: no diary' unless @items; 
    2828 
    2929  foreach my $item ( @items ) { 
  • lang/perl/WWW-Mixi-Scraper/trunk/t_live/list_message.t

    r233 r333  
    1717# date_format('%m-%d'); 
    1818 
    19 run_tests('list_message') or ok 'ignored'; 
     19run_tests('list_message') or ok 1, 'skipped: no tests'; 
    2020 
    2121sub test { 
    2222  my @items = $mixi->list_message->parse(@_) ; 
    2323 
    24   return ok 'skipped: no messages' unless @items; 
     24  return ok 1, 'skipped: no messages' unless @items; 
    2525 
    2626  foreach my $item ( @items ) { 
  • lang/perl/WWW-Mixi-Scraper/trunk/t_live/new_bbs.t

    r233 r333  
    1515date_format('%Y-%m-%d %H:%M'); 
    1616 
    17 run_tests('new_bbs') or ok 'ignored'; 
     17run_tests('new_bbs') or ok 1, 'skipped: no tests'; 
    1818 
    1919sub test { 
    2020  my @items = $mixi->new_bbs->parse(@_); 
    2121 
    22   return ok 'skipped: no new bbs entries' unless @items; 
     22  return ok 1, 'skipped: no new bbs entries' unless @items; 
    2323 
    2424  foreach my $item ( @items ) { 
  • lang/perl/WWW-Mixi-Scraper/trunk/t_live/new_friend_diary.t

    r233 r333  
    1515date_format('%Y-%m-%d %H:%M'); 
    1616 
    17 run_tests('new_friend_diary') or ok 'ignored'; 
     17run_tests('new_friend_diary') or ok 1, 'skipped: no tests'; 
    1818 
    1919sub test { 
    2020  my @items = $mixi->new_friend_diary->parse(@_); 
    2121 
    22   return ok 'skipped: no new diary entries' unless @items; 
     22  return ok 1, 'skipped: no new diary entries' unless @items; 
    2323 
    2424  foreach my $item ( @items ) { 
  • lang/perl/WWW-Mixi-Scraper/trunk/t_live/new_music.t

    r233 r333  
    1515date_format('%Y-%m-%d %H:%M'); 
    1616 
    17 run_tests('new_music') or ok 'ignored'; 
     17run_tests('new_music') or ok 1, 'skipped: no tests'; 
    1818 
    1919sub test { 
    2020  my @items = $mixi->new_music->parse(@_); 
    2121 
    22   return ok 'skipped: no new musics' unless @items; 
     22  return ok 1, 'skipped: no new musics' unless @items; 
    2323 
    2424  foreach my $item ( @items ) { 
  • lang/perl/WWW-Mixi-Scraper/trunk/t_live/new_video.t

    r233 r333  
    1515date_format('%Y-%m-%d %H:%M'); 
    1616 
    17 run_tests('new_video') or ok 'ignored'; 
     17run_tests('new_video') or ok 1, 'skipped: no tests'; 
    1818 
    1919sub test { 
    2020  my @items = $mixi->new_video->parse(@_); 
    2121 
    22   return ok 'skipped: no new videos' unless @items; 
     22  return ok 1, 'skipped: no new videos' unless @items; 
    2323 
    2424  foreach my $item ( @items ) { 
  • lang/perl/WWW-Mixi-Scraper/trunk/t_live/show_calendar.t

    r233 r333  
    1717date_format('%Y-%m-%d'); 
    1818 
    19 run_tests('show_calendar') or ok 'ignored'; 
     19run_tests('show_calendar') or ok 1, 'skipped: no tests'; 
    2020 
    2121sub test { 
    2222  my @items = $mixi->show_calendar->parse(@_); 
    2323 
    24   return ok 'skipped: no calendar items' unless @items; 
     24  return ok 1, 'skipped: no calendar items' unless @items; 
    2525 
    2626  foreach my $item ( @items ) { 
  • lang/perl/WWW-Mixi-Scraper/trunk/t_live/show_friend.t

    r233 r333  
    1313}; 
    1414 
    15 run_tests('show_friend') or ok 'ignored'; 
     15run_tests('show_friend') or ok 1, 'skipped: no tests'; 
    1616 
    1717sub test { 
  • lang/perl/WWW-Mixi-Scraper/trunk/t_live/show_log.t

    r233 r333  
    1414date_format('%Y-%m-%d %H:%M'); 
    1515 
    16 run_tests('show_log') or ok 'ignored'; 
     16run_tests('show_log') or ok 1, 'skipped: no tests'; 
    1717 
    1818sub test { 
    1919  my @items = $mixi->show_log->parse(@_); 
    2020 
    21   return ok 'skipped: no logs' unless @items; 
     21  return ok 1, 'skipped: no logs' unless @items; 
    2222 
    2323  foreach my $item ( @items ) { 
  • lang/perl/WWW-Mixi-Scraper/trunk/t_live/view_bbs.t

    r233 r333  
    1212  time        => 'datetime', 
    1313  name_link   => 'uri', 
    14   link        => 'uri_if_remote', 
    15   comment => { 
     14  link        => 'uri', 
     15  comments => { 
    1616    name        => 'string', 
    1717    description => 'string', 
     
    2727date_format('%Y-%m-%d %H:%M'); 
    2828 
    29 run_tests('view_bbs') or ok 'ignored'; 
     29run_tests('view_bbs') or ok 1, 'skipped: no tests'; 
    3030 
    3131sub test { 
  • lang/perl/WWW-Mixi-Scraper/trunk/t_live/view_diary.t

    r233 r333  
    1010  description => 'string', 
    1111  time        => 'datetime', 
    12   link        => 'uri_if_remote', 
     12  link        => 'uri', 
    1313# not yet implemented 
    1414#  level => { 
     
    3030date_format('%Y-%m-%d %H:%M'); 
    3131 
    32 run_tests('view_diary') or ok 'ignored'; 
     32run_tests('view_diary') or ok 1, 'skipped: no tests'; 
    3333 
    3434sub test { 
  • lang/perl/WWW-Mixi-Scraper/trunk/t_live/view_event.t

    r233 r333  
    5151date_format('%Y-%m-%d %H:%M'); 
    5252 
    53 run_tests('view_event') or ok 'ignored'; 
     53run_tests('view_event') or ok 1, 'skipped: no tests'; 
    5454 
    5555sub test { 
  • lang/perl/WWW-Mixi-Scraper/trunk/t_live/view_message.t

    r233 r333  
    1717date_format('%Y-%m-%d %H:%M'); 
    1818 
    19 run_tests('view_message') or ok 'ignored'; 
     19run_tests('view_message') or ok 1, 'skipped: no tests'; 
    2020 
    2121sub test {