| 1 | # scrape.rb - k-tai emoji scraper |
|---|
| 2 | # |
|---|
| 3 | # Author:: MIZOGUCHI Coji <mizoguchi.coji at gmail.com> |
|---|
| 4 | # License:: Distribute under the same terms as Ruby |
|---|
| 5 | # |
|---|
| 6 | # $Id$ |
|---|
| 7 | # |
|---|
| 8 | require 'config/common.rb' |
|---|
| 9 | require 'rubygems' |
|---|
| 10 | require 'scrapi' |
|---|
| 11 | require 'open-uri' |
|---|
| 12 | require 'nkf' |
|---|
| 13 | $KCODE = 'utf-8' |
|---|
| 14 | |
|---|
# Downloads the pictogram ("emoji") images published on the DoCoMo, KDDI and
# SoftBank developer sites and stores them as 16x16 GIF files under
# <tt>EMOJI_DIR/<carrier>/</tt>.
#
# The external commands +convert+, +lha+ and +rm+ must be available on the
# PATH. The per-carrier output directories are not created here; they are
# expected to exist already.
module EmojiScraper
  # Root directory that receives one sub-directory of images per carrier.
  EMOJI_DIR = File.join(SSB::CONFIG[:public_dir], 'emoji')

  # Returns the directory that holds the images of +carrier+
  # (e.g. 'docomo', 'kddi', 'softbank').
  def self.emoji_dir(carrier)
    File.join(EMOJI_DIR, carrier)
  end

  # Scrapes the DoCoMo "basic" and "extention" pictograph tables and saves
  # every listed image as <code>.gif in the 'docomo' directory.
  def self.scrape_docomo
    base_uri =
      ['http://www.nttdocomo.co.jp/service/imode/make/content/pictograph/basic/',
       'http://www.nttdocomo.co.jp/service/imode/make/content/pictograph/extention/']

    # One table row: the code text in the third cell and the image source.
    emoji_scraper = Scraper.define do
      process 'td:nth-child(3) > span.txt', :code => :text
      process 'td > img', :uri => '@src'
      result :code, :uri
    end

    scraper = Scraper.define do
      array :emoji
      process 'tr.acenter', :emoji => emoji_scraper
      result :emoji
    end

    opt = { :char_encoding => 'utf8' }
    base_uri.each do |uri|
      # URI#read (open-uri) fetches the page and closes the connection itself;
      # NKF re-encodes the page to UTF-8 before it is parsed.
      html = NKF.nkf('-w', URI.parse(uri).read)

      # NOTE(review): `|| []` guards against a nil result, presumably returned
      # when no row matches - confirm against scrapi.
      rows = scraper.scrape(html, opt) || []

      # Rows lacking either the code or the image cannot be saved.
      usable = rows.reject { |row| row.nil? || row.code.nil? || row.uri.nil? }
      usable.each do |pict|
        fetch_and_convert(URI.parse(uri) + pict.uri, 'docomo', pict.code)
      end
    end
  end

  # Downloads the KDDI icon archive into the current working directory,
  # unpacks it with +lha+ and converts every <tt>icon_image/*.ai</tt> file
  # whose name contains a number into <number>.gif in the 'kddi' directory.
  # The archive and the extracted directory are removed afterwards, even when
  # a step raises.
  def self.scrape_kddi
    tmpfile = 'ezicon.lzh'
    begin
      URI.parse('http://www.au.kddi.com/ezfactory/tec/spec/lzh/icon_image.lzh').open do |archive|
        # Binary mode: the archive must be written byte-for-byte.
        File.open(tmpfile, 'wb') { |out| out.write(archive.read) }
      end

      run_command('lha', '-x', tmpfile)

      Dir.glob('icon_image/*.ai') do |src|
        # NOTE(review): `.+` demands at least one character between the digits
        # and ".ai", so a purely numeric name such as "12.ai" would capture
        # "1" - presumably the archive's names carry a suffix; confirm.
        match = src.match(/(\d+).+\.ai$/)
        next unless match

        puts src
        out_filename = File.join(emoji_dir('kddi'), match[1] + '.gif')
        run_command('convert', '-trim', '-geometry', '16x16', '+repage', src, out_filename)
      end
    ensure
      File.delete(tmpfile) if File.exist?(tmpfile)
      run_command('rm', '-Rf', 'icon_image')
    end
  end

  # Scrapes the six SoftBank pictogram pages (web_pic_01.html through
  # web_pic_06.html) and saves every listed image as <unicode>.gif in the
  # 'softbank' directory.
  def self.scrape_thirdforce
    base_uri = 'http://creation.mb.softbank.jp/web/'
    page = 'web_pic_%02d.html'

    # The scraper definitions do not depend on the page number, so they are
    # built once instead of once per page.
    pict_scraper = Scraper.define do
      process 'td:nth-child(2)[bgcolor="#FFFFFF"]', :unicode => :text
      process 'td > img', :image => '@src'
      result :unicode, :image
    end

    page_scraper = Scraper.define do
      process 'table[width="100%"] > tr', 'pictograms[]' => pict_scraper
      result :pictograms
    end

    1.upto(6) do |n|
      # NOTE(review): `|| []` guards against a nil result, presumably returned
      # when no row matches - confirm against scrapi.
      rows = page_scraper.scrape(URI.parse(base_uri + page % n)) || []

      # Rows lacking either the code or the image cannot be saved.
      usable = rows.reject { |row| row.nil? || row.unicode.nil? || row.image.nil? }
      usable.each do |pictinfo|
        fetch_and_convert(URI.parse(base_uri) + pictinfo.image, 'softbank', pictinfo.unicode)
      end
    end
  end

  # Runs the scrapers for all three carriers in turn.
  def self.run
    scrape_docomo
    scrape_kddi
    scrape_thirdforce
  end

  # Downloads +pict_uri+ (a URI object) into a temporary file inside the
  # directory of +carrier+, converts it with +convert+ into a 16x16 GIF with
  # white made transparent, named <basename>.gif, and prints
  # "<uri> => <file>" on success. The temporary file is always removed.
  # Returns the path of the GIF.
  def self.fetch_and_convert(pict_uri, carrier, basename)
    tmpfile  = File.join(emoji_dir(carrier), basename + '.tmp.gif')
    filename = File.join(emoji_dir(carrier), basename + '.gif')

    begin
      pict_uri.open do |img|
        # Binary mode: image data must be written byte-for-byte.
        File.open(tmpfile, 'wb') { |out| out.write(img.read) }
      end

      if run_command('convert', '-transparent', 'white', '-resize', '16x16', tmpfile, filename)
        puts pict_uri.to_s + " => " + filename
      end
    ensure
      File.delete(tmpfile) if File.exist?(tmpfile)
    end

    filename
  end
  private_class_method :fetch_and_convert

  # Runs an external command without going through a shell: every argument
  # reaches the program verbatim, so text taken from scraped pages cannot be
  # interpreted as shell syntax. Prints a warning on stderr when the command
  # cannot be started or exits unsuccessfully. Returns true on success,
  # false otherwise.
  def self.run_command(*argv)
    succeeded = system(*argv) ? true : false
    warn "command failed: #{argv.join(' ')}" unless succeeded
    succeeded
  end
  private_class_method :run_command
end
|---|
| 115 | |
|---|
# Run every carrier scraper only when this file is executed directly,
# not when it is loaded by another script.
EmojiScraper.run if __FILE__ == $0
|---|