|
Revision 5232, 0.9 kB
(checked in by tokuhirom, 9 months ago)
|
r5459@skinny (orig r5199): chiba | 2008-01-22 01:06:57 +0900
remove debug line
|
-
Property svn:executable set to
*
|
| Line | |
|---|
| 1 | #!/usr/bin/perl |
|---|
| 2 | use strict; |
|---|
| 3 | use warnings; |
|---|
| 4 | use Path::Class; |
|---|
| 5 | use YAML; |
|---|
| 6 | use FindBin; |
|---|
| 7 | |
|---|
| 8 | # how to make 103-111-HTML_2.0.0.txt |
|---|
| 9 | # 1. get PDF from http://www2.developers.softbankmobile.co.jp/dp/tool_dl/download.php?docid=120&companyid= |
|---|
| 10 | # 2. xdoc2txt -n 103-111-HTML_2.0.0.pdf > 103-111-HTML_2.0.0.txt |
|---|
| 11 | # ref. http://www31.ocn.ne.jp/~h_ishida/xdoc2txt.html |
|---|
| 12 | |
|---|
# Read the text dump named on the command line, collect one
# unicode => shiftjis entry per matching row, and print the resulting
# table as YAML on STDOUT.
my $pdf_text_file = shift or die "Usage: softbank-scrape-autosjis.pl 103-111-HTML_2.0.0.txt";
my $pdf_fh = file($pdf_text_file)->openr;

my %map;
LINE:
while ( defined( my $line = <$pdf_fh> ) ) {
    chomp $line;

    # Keep only rows that open with a five-digit decimal entity
    # followed by a hexadecimal entity.
    next LINE unless $line =~ /^&#\d\d\d\d\d;\s*&#x/;

    # A usable row has exactly four whitespace-separated fields.
    my @codes = split /\s+/, $line;
    next LINE unless @codes == 4;

    # The second field is the hex entity (stored bare, without its
    # "&#x" / ";" wrapper); the fourth field is stored verbatim.
    my ( undef, $unicode_entity, undef, $shiftjis ) = @codes;
    $map{ strip_entity_ref_mark($unicode_entity) } = $shiftjis;
}
close $pdf_fh;

print Dump(\%map);
|---|
| 32 | |
|---|
| 33 | |
|---|
# strip_entity_ref_mark($entity)
#
# Remove the hexadecimal character-reference wrapper from $entity: a
# leading "&#x" and a trailing ";" are each deleted when present, so
# "&#xE001;" becomes "E001".  Text without those markers is returned
# unchanged.
#
# Works on a lexical copy instead of localizing the global $_, so the
# caller's topic variable is never touched.
#
# Returns the stripped string.
sub strip_entity_ref_mark {
    my $entity = shift;

    # "$" (not "\z") is kept on purpose: a ";" directly before a final
    # newline is still removed, exactly as before.
    $entity =~ s/(?:^&#x|;$)//g;

    return $entity;
}
|---|
| 39 | |
|---|