root/lang/perl/Encode-JP-Mobile/trunk/tools/softbank-scrape-autosjis.pl

Revision 5232, 0.9 kB (checked in by tokuhirom, 9 months ago)

r5459@skinny (orig r5199): chiba | 2008-01-22 01:06:57 +0900
remove debug line


  • Property svn:executable set to *
Line 
#!/usr/bin/perl
use strict;
use warnings;
use Path::Class;
use YAML;
use FindBin;

# Scrape a Unicode => Shift_JIS mapping out of the text dump of SoftBank's
# HTML specification PDF and print it to STDOUT as YAML.
#
# How to make 103-111-HTML_2.0.0.txt:
# 1. get PDF from http://www2.developers.softbankmobile.co.jp/dp/tool_dl/download.php?docid=120&companyid=
# 2. xdoc2txt -n 103-111-HTML_2.0.0.pdf > 103-111-HTML_2.0.0.txt
#  ref. http://www31.ocn.ne.jp/~h_ishida/xdoc2txt.html

# The only argument is the path of the text file produced by the steps above.
my $pdf_text_file = shift or die "Usage: softbank-scrape-autosjis.pl 103-111-HTML_2.0.0.txt";
my $pdf_fh = file($pdf_text_file)->openr;

# Key: hexadecimal code taken from the "&#x....;" entity (delimiters stripped).
# Value: the fourth whitespace-separated column of the same table row.
my %map;
while (my $line = <$pdf_fh>) {
    chomp $line;

    # Table rows of interest start with a five-digit decimal entity
    # ("&#NNNNN;") followed by a hexadecimal entity ("&#x....;").
    next if $line !~ /^&#\d\d\d\d\d;\s*&#x/;

    # Keep only rows with exactly four columns; anything else is skipped.
    my @codes = split /\s+/, $line;
    next if @codes != 4;

    my $unicode  = strip_entity_ref_mark($codes[1]);
    # NOTE(review): column four is presumably the auto-converted Shift_JIS
    # code - confirm against the PDF table layout.
    my $shiftjis = $codes[3];

    $map{ $unicode } = $shiftjis;
}
close $pdf_fh;

print Dump(\%map);

# Strip the delimiters from a hexadecimal numeric character reference.
#
# Takes one string (e.g. "&#xE001;") and returns a copy with a leading
# "&#x" and a trailing ";" removed (e.g. "E001"). Either delimiter may be
# absent; text that carries neither is returned unchanged. The caller's
# argument and the global $_ are left untouched.
sub strip_entity_ref_mark {
    my ($entity) = @_;

    # /g is required so that both the prefix and the suffix alternatives
    # are removed in a single substitution.
    $entity =~ s/(?:^&#x|;$)//g;

    return $entity;
}

Note: See TracBrowser for help on using the browser.