| 1 | #!/usr/bin/env ruby -Ku
|
|---|
| 2 | $KCODE="u"
|
|---|
| 3 | # ====================================================================================
|
|---|
| 4 | # Twitter Archive Filter
|
|---|
| 5 | #
|
|---|
| 6 | # 1.0.1 => 2008/04/21 by Seasons
|
|---|
| 7 | # 1.0.0 => 2008/04/21 by Seasons
|
|---|
| 8 | #
|
|---|
| 9 | # Special Thanks!!
|
|---|
| 10 | # Twitter : @gan2
|
|---|
| 11 | #
|
|---|
| 12 | # mailto:keisuke@hata.biz
|
|---|
| 13 | # twitter:Seasons
|
|---|
| 14 | # ====================================================================================
|
|---|
| 15 |
|
|---|
| 16 | require 'rubygems'
|
|---|
| 17 | require 'scrapi'
|
|---|
| 18 | require 'pp'
|
|---|
| 19 | require 'net/http'
|
|---|
| 20 | require 'kconv'
|
|---|
| 21 | require 'optparse'
|
|---|
| 22 | require '.twitter_user_pass' #=> Twitter Username & Password
|
|---|
| 23 |
|
|---|
# Write progress output straight through instead of buffering it.
$stdout.sync = true

# Keep the stock puts reachable as _puts, then replace puts with a wrapper
# that flushes $stdout after every call.
alias :_puts :puts

# @brief : print like the stock puts, then flush $stdout
#
# @param : args - anything the stock puts accepts
#
# @ret   : the value of $stdout.flush
def puts(*args)
  _puts(*args)   # explicit parentheses avoid Ruby's ambiguous-splat warning
  $stdout.flush
end
|
|---|
| 31 |
|
|---|
#-------------------------------------------------------------------------------------
# System Config
#-------------------------------------------------------------------------------------
# Request path used as the first page to fetch when no user name is given.
# Frozen so the shared constant cannot be modified in place.
BASEPATH = '/account/archive'.freeze   #=> default: the archive page
#BASEPATH = '/home'.freeze             #=> use this instead to read the recent timeline
#-------------------------------------------------------------------------------------
|
|---|
| 38 |
|
|---|
| 39 | # *=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=
|
|---|
| 40 | #
|
|---|
| 41 | # Twitter Archive Filter
|
|---|
| 42 | #
|
|---|
| 43 | # *=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=
|
|---|
class TwitterArchiveFilter

  # Number of times a page that yielded no items is fetched again before
  # paging gives up.
  MAX_RETRIES = 3

  # ===========================================================================
  # @brief : set up the filter
  #
  # @param : keyword  - text interpolated into a case-insensitive Regexp
  #                     (regexp metacharacters keep their meaning);
  #                     nil disables keyword filtering
  # @param : pagenum  - paging stops once this many pages have been read;
  #                     a value the page counter never reaches (such as -1)
  #                     means no limit
  # @param : username - when set, "/<username>" is read without credentials;
  #                     when nil, BASEPATH is read with basic auth
  # @param : usebrace - when true the keyword must appear inside literal
  #                     square brackets, e.g. [vim]
  # ===========================================================================
  def initialize(keyword, pagenum, username, usebrace)
    @items    = []
    @username = username
    @usebrace = usebrace
    @pagenum  = pagenum
    @http     = Net::HTTP.new('twitter.com', 80)
    # Without a keyword there is nothing to filter on. Building a regexp
    # from nil would give // (matches everything) or, with usebrace,
    # /\[\]/ (matches only a literal "[]").
    @keyword_reg =
      if keyword.nil?
        nil
      elsif usebrace
        /\[#{keyword}\]/i
      else
        /#{keyword}/i
      end
  end

  # ===========================================================================
  # @brief : fetch the pages and keep the messages matching the keyword
  #
  # @param : none
  #
  # @ret   : the collected [message, time] pairs
  # ===========================================================================
  def filter
    getArchives
  end

  # ===========================================================================
  # @brief : dump twitter archives
  #
  # @param : output filename
  # @param : verbose - when true each line is also printed to stdout,
  #                    converted with String#tosjis
  #
  # @ret   : the collected [message, time] pairs
  #
  # @note
  #  Each line is written as "<time> : <message>".
  #  File.open is used rather than Kernel#open so that a file name starting
  #  with "|" is never run as a command.
  # ===========================================================================
  def dump(outputfilename, verbose)
    File.open(outputfilename, "w") do |f|
      @items.each do |msg, time|
        line = "#{time} : #{msg}"   #=> time : message
        f.puts line
        puts line.tosjis if verbose
      end
    end
  end

  # ===========================================================================
  # @brief : add archive items
  #
  # @param : archive item (responds to [:messages] and [:times])
  #
  # @ret   : none
  #
  # @note
  #  A nil item, or an item without messages, is ignored. Missing times
  #  leave the time half of each pair nil.
  # ===========================================================================
  def addItems(items)
    return unless items
    messages = items[:messages]
    return unless messages
    @items.concat messages.zip(items[:times] || [])
  end

  # ===========================================================================
  # @brief : get archives
  #
  # @param : none
  #
  # @ret   : the collected [message, time] pairs
  #
  # @note
  #  Follows the next-page link until there is none or @pagenum pages
  #  have been read. A page without messages and times is fetched again, at
  #  most MAX_RETRIES times in a row, after which paging stops.
  # ===========================================================================
  def getArchives
    nextlink = @username ? "/#{@username}" : BASEPATH
    count    = 0
    failures = 0
    while nextlink
      break if count == @pagenum
      html  = getPageArchive(nextlink)
      items = getItems(html)
      unless items && items[:messages] && items[:times]
        # Retry the same page, but never forever.
        failures += 1
        if failures > MAX_RETRIES
          $stderr.puts "No items found at #{nextlink} after #{MAX_RETRIES} retries; stopping."
          break
        end
        next
      end
      failures = 0
      addItems(items)
      nextlink = getNextLink(html)
      puts "GetPage [#{count += 1}]"
    end
    # keyword filter
    @items = @items.reject { |msg, _time| msg !~ @keyword_reg } if @keyword_reg
    @items
  end

  # ===========================================================================
  # @brief : get Messages & Times
  #
  # @param : html body
  #
  # @ret   : Twitter message & time
  #          ret = getItems(html)
  #          ret[:messages] => Messages
  #          ret[:times]    => Times
  #
  # @note
  #  NOTE(review): callers treat a nil result as "nothing scraped"; confirm
  #  against the scrAPI documentation.
  # ===========================================================================
  def getItems(html)
    scraper = Scraper.define do
      process 'td.content>span.entry-title' , "messages[]" => :text
      process 'td.content>span.meta>a>abbr.published' , "times[]" => "@title"
      result :messages , :times
    end
    scraper.scrape(html, :parser_options => {:char_encoding=>'utf8'})
  end

  # ===========================================================================
  # @brief : get next LinkPage
  #
  # @param : html body
  #
  # @ret   : Next url, or nil when the pagination link text does not
  #          contain "Older" (or no pagination link was scraped)
  # ===========================================================================
  def getNextLink(html)
    scraper = Scraper.define do
      process 'div.pagination>a' , :url => "@href" , :kind => :text
      result :url , :kind
    end
    links = scraper.scrape(html, :parser_options => {:char_encoding=>'utf8'})
    return nil unless links   # no pagination link means no next page
    links[:kind] =~ /Older/ ? links[:url] : nil
  end

  # ===========================================================================
  # @brief : fetch one page
  #
  # @param : get page(/account/archive)
  #
  # @ret   : result(html body), or "" when the request returned nothing
  #          html = getPageArchive(page)
  #
  # @note
  #  Basic auth with USERNAME / PASSWORD is sent only when no user name was
  #  given to the constructor.
  # ===========================================================================
  def getPageArchive(page)
    req = Net::HTTP::Get.new(page)
    req.basic_auth(USERNAME, PASSWORD) unless @username
    rs = @http.request(req)
    rs ? rs.body : ""
  end

  private :getArchives , :getItems , :getNextLink , :getPageArchive , :addItems

end
|
|---|
| 183 |
|
|---|
# Command-line entry point: parse the options, run the filter, write the log.
if $0 == __FILE__

  pagenum     = -1      # never equals the page counter, so no page limit
  keyword     = nil     # nil => no keyword filtering
  username    = nil     # nil => no user given on the command line
  verbose     = false
  usebrace    = false
  logfilename = 'archive.log'

  opt = OptionParser.new
  # The usage line names -p PAGENUM, the option that actually exists.
  opt.banner = "\nUsage: #{$0} -k KEYWORD -p PAGENUM\n ex) #{$0} -k vim -p 10 -b -l archive.log\n ex) #{$0} -k vim -p 10 -u Seasons"
  opt.on('-k', '--keyword=KEYWORD', String, 'keep only messages matching KEYWORD') do |key|
    keyword = key
  end
  # Zero or negative values are ignored, which leaves paging unlimited.
  opt.on('-p', '--pagenum=PAGENUM', Integer, 'stop after PAGENUM pages') do |page|
    pagenum = page if page > 0
  end
  opt.on('-l', '--logfile=LOGFILE', String, 'output file (default: archive.log)') do |filename|
    logfilename = filename
  end
  opt.on('-u', '--user=TWITTERUSERNAME', String, 'read the page of this user') do |user|
    username = user
  end
  # --breace is the original (misspelled) name, kept so existing command
  # lines continue to work.
  opt.on('-b', '--brace', '--breace', 'match the keyword as [KEYWORD]') do |brace_flg|
    usebrace = brace_flg
  end
  opt.on('-v', '--verbose', 'also print the matched lines') do |verbose_flg|
    verbose = verbose_flg
  end

  # Print the optional message and the help text to stderr, then exit 1.
  def opt.error(msg = nil)
    $stderr.puts msg if msg
    $stderr.puts help
    exit 1
  end

  begin
    opt.parse!
  rescue OptionParser::ParseError => err
    opt.error err.message
  end

  keyword_label =
    if keyword.nil?
      "*.*"
    elsif usebrace
      "[#{keyword}]"
    else
      keyword
    end

  puts "Keyword => " + keyword_label
  puts "PageNum => #{pagenum}"
  puts "LogFile => #{logfilename}"
  tw = TwitterArchiveFilter.new(keyword, pagenum, username, usebrace)
  tw.filter
  tw.dump(logfilename, verbose)
  puts "Succeed Twitter Archive!! > #{logfilename}"

end
|
|---|
| 222 |
|
|---|