Index: /lang/perl/misc/adobeuserforumsjp2rss.pl
===================================================================
--- /lang/perl/misc/adobeuserforumsjp2rss.pl (revision 19917)
+++ /lang/perl/misc/adobeuserforumsjp2rss.pl (revision 19917)
@@ -0,0 +1,252 @@
+#!/usr/bin/perl
+ 
+use strict;
+use warnings;
+use DateTime;
+use DateTime::Format::W3CDTF;
+use DateTime::Format::Strptime;
+use Encode;
+use HTML::Entities;
+use Web::Scraper;
+use WWW::Mechanize;
+use URI;
+use utf8;
+use XML::RSS;
+#use Carp qw(fataslToBrowser);
+#use YAML::Syck;
+#use Data::Dumper;
+ 
+ 
+# 設定
+ 
+my $base_url   = 'http://forums.adobe.co.jp/';
+my $forums = [
+  {
+    product    => 'Illustrator',
+    source_url => 'http://forums.adobe.co.jp/cgi-bin/WebX?14@@.ef7bfff',
+    prefix     => 'ai',
+  },
+  {
+    product    => 'InDesign',
+    source_url => 'http://forums.adobe.co.jp/cgi-bin/WebX?14@@.ef7c001',
+    prefix     => 'id',
+  },
+  {
+    product    => 'Photoshop',
+    source_url => 'http://forums.adobe.co.jp/cgi-bin/WebX?14@@.ef7c00b',
+    prefix     => 'ps',
+  },
+  {
+    product    => 'Adobe Flash',
+    source_url => 'http://forums.adobe.co.jp/cgi-bin/WebX?14@@.31abad38',
+    prefix     => 'fl',
+  },
+  {
+    product    => 'Audition',
+    source_url => 'http://forums.adobe.co.jp/cgi-bin/WebX?14@@.16e68b40',
+    prefix     => 'au',
+  },
+];
+ 
+my $mech = WWW::Mechanize->new();
+$mech->agent_alias( 'Windows IE 6' );
+ 
+ 
+# メイン
+ 
+foreach my $item ( @$forums ) {
+  my $xml = generate_rss($item);
+  open my $fh, ">", "adobeforumsjp_$item->{prefix}.xml" or die $!;
+  binmode $fh => ':utf8';
+  print $fh $xml;
+  close $fh;
+}
+exit; # ｵﾜﾀ
+ 
+ 
+# アドビユーザフォーラムの特定のプロダクトページの
+# トピックリストを巡回してRSSを生成する
+sub generate_rss {
+  my $hash = shift;
+  my $soruce_url = $hash->{source_url};
+  
+  my $list = [];
+  
+  # トピック一覧ページからトピックリストを取得する
+  my $res_topic = get_topic( $soruce_url );
+  
+  # トピックリストを巡回し、RSS用のデータを加工
+  foreach my $item ( @{$res_topic->{topics}} ) {
+    # 新着エントリがついているトピックのみ巡回
+    if ( ( exists $item->{new}) && $item->{new} > 0) {
+      # トピックのURLを指定して、トピック内のスレッドを取得する
+      my $res_individual = individual( $item->{link} );
+      
+      # トピック内の親項目の処理
+      my $item_parent    = $res_individual->{parent}->[0];
+      next if !exists $item_parent->{date}; # dateが無い項目は飛ばす
+      $item_parent->{link}
+        = URI->new_abs( $item->{link}, $base_url );
+      push @$list, $item_parent;
+      
+      # トピック内のスレッドの処理
+      foreach my $item_child ( @{$res_individual->{children}} ) {
+        next if !exists $item_child->{date}; # dateが無い項目は飛ばす
+        $item_child->{title} = "Re[$item_child->{order}]:$item_parent->{title}";
+        $item_child->{link}  = URI->new_abs( $item_child->{link}, $base_url );
+        $item_child->{annotate} = $item_parent->{link};
+        push @$list, $item_child;
+      }
+    }
+  }
+  
+  my $temp_list = $list;
+  
+  # 日付文字列でソート
+  @{$list} = sort { $b->{date} cmp $a->{date} } @{$temp_list};
+  # 50件でトリミング
+  splice (@$list, 50) if $#{@$list} > 50;
+  
+  
+  # RSS 生成
+  
+  my $rss = XML::RSS->new;
+  $rss->add_module(
+    prefix => 'content',
+    uri    => 'http://purl.org/rss/1.0/modules/content/'
+  );
+  $rss->add_module(
+    prefix => 'annotate',
+    uri    => 'http://purl.org/rss/1.0/modules/annotate/',
+  );
+  $rss->channel(
+      title       => "アドビユーザフォーラム/$hash->{product}",
+      link        => $soruce_url,
+      description => "アドビユーザフォーラム/$hash->{product}の最新書き込み",
+      language    => 'ja',
+      dc => {
+        publisher    => 'forums.adobe.co.jp',
+        contributor  => 'DTPWiki.jp',
+      },
+      syn => {
+        updatePeriod    => 'hourly',
+        updateFrequency => '1',
+      },
+  );
+  for my $item ( @{$list} ) {
+    $rss->add_item(
+      link        => $item->{link},
+      title       => $item->{title},
+      description => encode_entities($item->{plaintext}, q(<>&"')),
+      dc => {
+        date      => $item->{date},
+        creator   => $item->{creator},
+      },
+      content => {
+        encoded => '<![CDATA[<p>'
+                   . $item->{description}
+                   . '</p>]]>',
+      },
+      annotate => exists $item->{annotate}
+                    ? { reference => $item->{annotate}, }
+                    : undef,
+    );
+  }
+  return $rss->as_string();
+}
+ 
+ 
+sub get_topic {
+  my $url = shift;
+  $mech->get($url);
+  my $s = scraper {
+    process '//tr[@bgcolor]',
+      'topics[]' => scraper {
+        process '//td[1]/a/img',
+        type => '@src',
+        process '//td[2]',
+        title => 'TEXT',
+        process '//td[2]/p/b/a',
+        link => '@href',
+        process '//td[3]/p/a',
+        creator => 'TEXT',
+        process '//td[5]/p/a/font',
+        new  => 'TEXT',
+        process '//td[6]/p/font',
+        update => 'TEXT',
+      },
+  };
+  my $res = $s->scrape( decode('cp932', $mech->content() ) );
+  return $res;
+}
+ 
+ 
+sub individual {
+  my $url = shift;
+  # 時刻文字列を解析する関数
+  my $datetime
+  = sub {
+      my $strp = DateTime::Format::Strptime->new(
+                   pattern => '- %Y年%m月%d日　%H:%M'
+                     # 文字列のパターンを指定
+                 );
+      my $dt = $strp->parse_datetime( $_->string_value )
+                    ->set_time_zone('Asia/Tokyo');
+      return DateTime::Format::W3CDTF->format_datetime($dt);
+  };
+  # 文字列の先頭空白を取り除く関数
+  my $trim
+  = sub { (my $s = $_->string_value)=~s/^\s+?//;return $s; };
+  
+  my $item = shift;
+  # mechでHTMLを取得
+  $mech->get( $url );
+  # mechで取得したコンテンツを内部UTF-8に変換
+  my $html = decode('cp932', $mech->content() );
+  # Web::Scraperが誤作動しないように、HTML内を置換
+  $html =~ s/\x0D\x0A|\x0D|\x0A/\n/g;
+  $html =~ s|<P>\n\n<!-- get rid of hr here\? -->|<blockquote>|mo;
+  $html =~ s|<HR SIZE="1" NOSHADE>|</blockquote><blockquote>|sgo;
+  $html =~ s|</FONT/>|</FONT>|g;
+  $html =~ s|<P>\n\n<!-- messages -->|</p></blockquote>|mo;
+  # Web::Scraper準備
+  my $s = scraper {
+    process '//html/body/table[4]',
+      'parent[]' => scraper {
+        process '//b[@class="size3"]',
+        title => 'TEXT',
+        process '//p/b[1]/img',
+        type  => '@src',
+        process '//p/b[1]/a',
+        creator => $trim,
+        process '//p/b[1]/font',
+        date  => $datetime,
+        process '//p/font[@size=3]',
+        description => 'HTML',
+        process '//p/font[@size=3]',
+        plaintext => 'TEXT',
+    };
+    process 'blockquote',
+      'children[]' => scraper {
+        process '//tr/td/b[1]/a/font/img',
+        type  => '@src',
+        process '//tr/td/b[1]/a/font',
+        creator => $trim,
+        process '//tr/td/b[1]/font',
+        date  => $datetime,
+        process '//tr/td/a',
+        order => 'HTML',
+        process '//tr/td/a',
+        link => '@href',
+        process '//font/p',
+        description => 'HTML',
+        process '//font/p',
+        plaintext => 'TEXT',
+      };
+  };
+  # スクレイプ実行
+  my $res = $s->scrape( $html );
+  return $res;
+}
+ 
+__END__
