root/lang/perl/Geography-AddressExtract-Japan/trunk/lib/Geography/AddressExtract/Japan.pm @ 1572

Revision 1572, 3.3 kB (checked in by yappo, 6 years ago)

r10@haruna (orig r8): ko | 2006-06-21 21:15:08 +0900
Add address normalize lite.
Add 'ue' 'shita' prefix


Line 
1package Geography::AddressExtract::Japan;
2use strict;
3use warnings;
4use encoding "euc-jp";
5
6use Carp;
7use UNIVERSAL::require;
8
9use base qw( Class::Accessor::Fast );
10__PACKAGE__->mk_accessors( qw(opt regexp map addresses) );
11
12use Geography::AddressExtract::Japan::Address;
13
14our $VERSION = '0.00_02';
15
16sub new {
17    my($class, %opt) = @_;
18    my $self = bless { opt => { %opt } }, $class;
19    $self->init;
20    $self;
21}
22
23sub init {
24    my $self = shift;
25
26    for my $module (qw(city aza number dupe )) {
27        $self->{regexp}->{$module} =
28            $self->load_module('Regexp', $self->opt->{overload}->{regexp}->{$module} || ucfirst($module));
29    }
30    for my $module (qw(city)) {
31        $self->{map}->{$module} =
32            $self->load_module('Map', $self->opt->{overload}->{map}->{$module} || ucfirst($module));
33    }
34}
35
36sub load_module {
37    my($self, $type, $module) = @_;
38    $module = sprintf('Geography::AddressExtract::Japan::%s::%s', $type, $module) unless $module =~ /::/;
39    $module->require or croak $@;
40    $module->create;
41}
42
43sub extract {
44    my($proto, $data) = @_;
45    my $self =  ref $proto ? $proto : $proto->new;
46
47    $self->addresses([]);
48
49    $self->_extract($data, sprintf('(%s)\s*(%s)\s*(%s)', $self->regexp->{city}, $self->regexp->{aza}, $self->regexp->{number}));
50    $self->_extract($data, sprintf('(%s)\s*(%s)\P{Han}', $self->regexp->{city}, $self->regexp->{aza}));
51    $self->_extract($data, '(' . $self->regexp->{city} . ')');
52
53    $self->dedupe;
54
55    wantarray ? @{ $self->addresses } : $self->addresses;
56}
57
58sub _extract {
59    my($self, $data, $pattern) = @_;
60
61    while ($data =~ /$pattern/g) {
62        my %opt = (
63            index      => length($`),
64            match_text => $&,
65        );
66        $opt{city}   = $1 if $1;
67        $opt{aza}    = $2 if $2;
68        $opt{number} = $3 if $3;
69
70        $self->normalize($', \%opt);#');
71                         
72        push @{ $self->{addresses} }, Geography::AddressExtract::Japan::Address->new(%opt);
73    }
74}
75
76sub normalize {
77    my($self, $right, $opt) = @_;
78
79    if ($opt->{number} && $opt->{number} =~ /^([���������岼])/) {
80        my $prefix = $1;
81        if ($right =~ /^((?:��?)?[-���ݤΥ�?(?:(?:(?:[�����͸����Ȭ����)?[�����͸����Ȭ�塻]+|\d+)|[a-zA-Z����)��/) {
82            my $append = $1;
83            $opt->{aza} .= $prefix;
84            $opt->{number} =~ s/^$prefix//;
85            $opt->{number} .= $append;
86        }
87    }
88
89}
90
91sub dedupe {
92    my $self = shift;
93
94    return unless @{ $self->addresses };
95
96    # sort
97    $self->addresses( sort { $a->index <=> $b->index } @{ $self->addresses });
98
99    # index unique
100    my($last, @set, @addrs);
101    for my $addr (@{ $self->addresses }) {
102        if ($last) {
103            if ($last->index eq $addr->index) {
104                my $cur = @set ? shift @set : $last;
105                push @set, length $addr > length $cur ? $addr : $cur;
106            } else {
107                push @addrs, @set ? shift @set : $last;
108            }
109        }
110        $last = $addr;
111    }
112    push(@addrs, @set ? shift @set : $last) if $last;
113    $self->addresses([ @addrs ]);
114
115    # dupe unique
116    my @map;
117    @addrs = ();
118    for my $addr (@{ $self->addresses }) {
119        my $i = $addr->index;
120        unless ($map[$i]) {
121            for my $str (split //, $addr) {
122                $map[$i++] = $str;
123            }
124            push @addrs, $addr;
125        }
126    }
127    $self->addresses([ @addrs ]);
128}
129
130
1311;
132
133__END__
134
135AUTHOR: Kazuhiro Osawa ko@yappo.ne.jp
Note: See TracBrowser for help on using the browser.