| 1 | require 'rubygems' |
|---|
| 2 | require 'hpricot' |
|---|
| 3 | require 'open-uri' |
|---|
| 4 | |
|---|
| 5 | module Hatena |
|---|
| 6 | module Haiku |
|---|
# A single Hatena Haiku post.
#
# Plain value holder: every attribute is optional, defaults to nil when the
# matching key is absent from the options hash, and can be read and written
# through the generated accessors.
class Entry
  # Base URI that icon_uri builds profile icon addresses from.
  ICON_SERVER_URI = 'http://www.hatena.ne.jp/users/'.freeze

  attr_accessor :id, :keyword, :permalink, :timestamp, :body, :source

  # options - Hash of initial attribute values. Recognised keys are :id,
  #           :keyword, :permalink, :timestamp, :body and :source; any
  #           missing key leaves the attribute nil.
  def initialize(options = {})
    @id = options[:id]
    @keyword = options[:keyword]
    @permalink = options[:permalink]
    @timestamp = options[:timestamp]
    @body = options[:body]
    @source = options[:source]
  end

  # Builds the profile icon URI for this entry's id:
  #   <ICON_SERVER_URI>/<first two characters of id>/<id>/profile.gif
  #
  # Returns the URI as a String, or nil when id is nil or empty (an Entry
  # may legitimately be created without an id, so this must not raise).
  def icon_uri
    user_id = id.to_s
    return nil if user_id.empty?

    # File.join is used purely as a "/"-joiner; it does not touch the
    # "//" inside the scheme part of the base URI.
    File.join(ICON_SERVER_URI, user_id[0..1], user_id, 'profile.gif')
  end
end
|---|
| 24 | |
|---|
# Downloads Hatena Haiku listing pages one after another and turns the
# "div.entry" elements found on them into Entry objects.
#
# A Fetcher is stateful: the collected entries and the timestamp of the most
# recently accepted entry are kept on the instance and carried over between
# calls to #fetch.
class Fetcher
  # Prefix joined with each entry's timestamp link href to form its permalink.
  SERVER_ROOT_URI = 'http://h.hatena.ne.jp/'.freeze

  # options - Hash of settings:
  #   :interval            - value handed to sleep between two page
  #                          downloads (default 3).
  #   :debug               - when truthy, progress messages are written with
  #                          puts (default false).
  #   :max_entries_of_page - highest element index read from a page; index 0
  #                          is always skipped (default 20).
  def initialize(options = {})
    @interval = options[:interval] || 3
    @debug = options[:debug] || false
    @max_entries_of_page = options[:max_entries_of_page] || 20
    @entries = []
    @latest_fetched_at = Time.now
  end

  # Fetches consecutive pages of +uri+, starting at +start_page+, until a
  # page contributes no new entry.
  #
  # uri        - listing address (String or URI); the page number is
  #              appended as a "page" query parameter.
  # start_page - number of the first page to request (default 1).
  # block      - optional; called with every accepted Entry right after it
  #              has been parsed. Exceptions raised by the block propagate
  #              to the caller.
  #
  # Returns the Array of all entries collected by this instance so far
  # (including those from earlier calls). Errors raised while downloading
  # or parsing the URI are not rescued here.
  def fetch(uri, start_page = 1, &block)
    page = start_page

    loop do
      page_entries = fetch_page(uri, page, &block)
      break if page_entries.empty?

      debug_print "page #{page} was fetched."

      @entries += page_entries
      page += 1
      sleep @interval
    end

    debug_print "Total; #{@entries.size} entries."

    @entries
  end

  private

  # Downloads one page and returns the Array of entries accepted from it,
  # yielding each of them to the block given to #fetch.
  def fetch_page(uri, page)
    document = Hpricot(read_page(page_uri(uri, page)))

    # Element 0 is skipped, as in the original implementation.
    # NOTE(review): presumably the first div.entry is not a real post - confirm.
    # Slicing an empty result yields nil, hence the fallback to [].
    elements = document.search('div.entries/div.entry')[1..@max_entries_of_page] || []

    accepted = []
    elements.each do |element|
      # An element mentioning 'google_afc' ends the processing of this page.
      break if element.inner_html.include?('google_afc')

      entry = parse_entry(element)
      if entry.nil?
        # Unexpected markup: keep what was collected and give up on the
        # rest of this page.
        debug_print "page #{page}: stopped at an entry with unexpected markup."
        break
      end

      # Only entries strictly older than the last accepted one are taken.
      # NOTE(review): an entry sharing its timestamp with the previous one
      # is skipped as well - confirm this is intended.
      next if entry.timestamp >= @latest_fetched_at

      @latest_fetched_at = entry.timestamp

      yield entry if block_given?

      accepted << entry
    end
    accepted
  end

  # Builds the address of page +page+, using "&" instead of "?" when the
  # base address already carries a query string.
  def page_uri(uri, page)
    base = uri.to_s
    separator = base.include?('?') ? '&' : '?'
    "#{base}#{separator}page=#{page}"
  end

  # Downloads +fetch_uri+ through open-uri and returns the body as a String.
  #
  # URI#read is used instead of Kernel#open so that a "|command" string or a
  # local path can never be executed or opened, and so that no IO object is
  # left unclosed.
  def read_page(fetch_uri)
    # Object#untaint only exists on older Rubies, where it was needed to
    # pass the address on when running with a raised $SAFE level.
    fetch_uri = fetch_uri.untaint if fetch_uri.respond_to?(:untaint)

    target = URI.parse(fetch_uri)
    unless target.respond_to?(:read)
      raise ArgumentError, "unsupported URI for fetching: #{fetch_uri}"
    end

    target.read
  end

  # Converts one "div.entry" element into an Entry.
  # Returns nil when the title link, the timestamp link, its href or a
  # parsable timestamp is missing.
  def parse_entry(element)
    title_link = element.search('h2.title/a').last
    timestamp_link = element.search('span.timestamp/a').first
    return nil if title_link.nil? || timestamp_link.nil?

    href = timestamp_link[:href]
    return nil if href.nil?

    posted_at = parse_timestamp(timestamp_link.inner_html)
    return nil if posted_at.nil?

    Entry.new(
      :id => element.search('span.username/a').inner_html,
      :keyword => title_link.inner_html,
      :permalink => File.join(SERVER_ROOT_URI, href),
      :timestamp => posted_at,
      :body => element.search('div.body').inner_html.strip,
      :source => element.search('span.source/a').inner_html
    )
  end

  # Splits +text+ on "-", " " and ":" and passes the pieces to Time.local.
  # Returns a Time, or nil when Time.local rejects the pieces.
  def parse_timestamp(text)
    Time.local(*text.split(/[- :]/))
  rescue ArgumentError, TypeError
    nil
  end

  # Writes +content+ with puts when the :debug option was enabled.
  def debug_print(content)
    puts content if @debug
  end
end # class Fetcher
|---|
| 95 | end # module Haiku |
|---|
| 96 | end # module Hatena |
|---|
| 97 | |
|---|
# Command line entry point:
#   ARGV[0] - listing URI to fetch (defaults to the trashsuite user page)
#   ARGV[1] - value for the :interval option, converted with to_i (defaults to 3)
# Prints every fetched entry, then pretty-prints the last one and its icon URI.
if __FILE__ == $0
  require 'pp'

  uri = ARGV.shift || 'http://h.hatena.ne.jp/id/trashsuite/'
  interval = ARGV.shift || 3

  fetcher = Hatena::Haiku::Fetcher.new(
    :interval => interval.to_i,
    :debug => true
  )
  entries = fetcher.fetch(uri) do |entry|
    print "#{entry.timestamp}::#{entry.source}:: "
    puts entry.body[0..100]
  end

  # entries is empty when nothing could be fetched; calling icon_uri on the
  # resulting nil would raise NoMethodError.
  last_entry = entries.last
  if last_entry
    pp last_entry
    pp last_entry.icon_uri
  else
    warn 'No entries were fetched.'
  end
end
|---|