root/lang/perl/Encode-JP-Mobile/trunk/tools/docomo-scrape.pl

Revision 4878, 1.4 kB (checked in by miyagawa, 11 months ago)

Updating Charnames to get English names by mapping KDDI/SoftBank to DoCoMo and 'borrowing' these names.

  • Property svn:executable set to *
Line 
#!/usr/bin/perl
#
# docomo-scrape.pl - scrape the NTT DoCoMo i-mode pictograph tables and print
# the collected rows to STDOUT as a YAML list.
#
# Every entry carries the keys number, image, sjis, unicode and name (read
# from the Japanese page) plus name_en (the name column of the row at the
# same position on the matching English page).
use strict;
use warnings;
use HTML::Selector::XPath 0.03;
use Web::Scraper;
use URI;
use YAML;

# Pages to scrape, as [ Japanese page, English page ] pairs.  Keeping the two
# languages side by side makes the pairing explicit instead of depending on
# the order of a flat list.
my @url_pairs = (
    [
        URI->new("http://www.nttdocomo.co.jp/service/imode/make/content/pictograph/basic/index.html"),
        URI->new("http://www.nttdocomo.co.jp/english/service/imode/make/content/pictograph/basic/index.html"),
    ],
    [
        URI->new("http://www.nttdocomo.co.jp/service/imode/make/content/pictograph/extention/index.html"),
        URI->new("http://www.nttdocomo.co.jp/english/service/imode/make/content/pictograph/extention/index.html"),
    ],
);

# Number of leading <tr> rows that are table headers rather than data.
my $HEADER_ROWS = 2;

# The extraction rules do not depend on the URL, so build the scraper once.
# One hash is produced per <tr>; columns are addressed by position.
my $scraper = scraper {
    process 'tr', 'characters[]', scraper {
        process 'td:nth-child(1)', 'number', 'TEXT';
        process 'td:nth-child(2) > img', 'image', [ '@src', sub { $_->as_string } ];
        process 'td:nth-child(3)', 'sjis', 'TEXT';
        process 'td:nth-child(5)', 'unicode', 'TEXT';
        process 'td:nth-child(6)', 'name',  'TEXT';
    };
};

# fetch_characters($uri)
#   Scrapes $uri and returns the list of row hashes with the header rows
#   removed.  Returns an empty list (and warns) when the page has no data
#   rows, rather than dying on an undefined 'characters' entry.
sub fetch_characters {
    my ($uri) = @_;

    my $scraped = $scraper->scrape($uri);
    my @rows    = @{ $scraped->{characters} || [] };

    # remove headers
    splice @rows, 0, $HEADER_ROWS;

    warn "no pictograph rows found at $uri\n" unless @rows;
    return @rows;
}

my @characters;
for my $pair (@url_pairs) {
    my ($ja_uri, $en_uri) = @$pair;

    my @ja_rows = fetch_characters($ja_uri);
    my @en_rows = fetch_characters($en_uri);

    # Rows are matched purely by position, so differing counts would attach
    # English names to the wrong characters.
    @ja_rows == @en_rows
        or die sprintf "ja/en count doesn't match: %d rows at %s, %d rows at %s\n",
            scalar @ja_rows, $ja_uri, scalar @en_rows, $en_uri;

    for my $idx (0 .. $#ja_rows) {
        $ja_rows[$idx]{name_en} = $en_rows[$idx]{name};
    }
    push @characters, @ja_rows;
}

binmode STDOUT, ':encoding(UTF-8)'
    or die "cannot set UTF-8 encoding on STDOUT: $!";
print Dump(\@characters);
Note: See TracBrowser for help on using the browser.