| 1 | package Acme::Samurai::Base; |
|---|
| 2 | use strict; |
|---|
| 3 | use warnings; |
|---|
| 4 | use Class::Trigger; |
|---|
| 5 | use Encode; |
|---|
| 6 | use File::ShareDir 'module_file'; |
|---|
| 7 | use Text::MeCab; |
|---|
| 8 | |
|---|
| 9 | use base 'Class::Accessor::Fast'; |
|---|
| 10 | __PACKAGE__->mk_accessors(qw( text parts mecab_option )); |
|---|
| 11 | |
|---|
| 12 | my $encoding = Encode::find_encoding( Text::MeCab::ENCODING ); |
|---|
| 13 | |
|---|
| 14 | sub parts_push { |
|---|
| 15 | push @{ shift->parts }, @_; |
|---|
| 16 | } |
|---|
| 17 | |
|---|
| 18 | sub parts_pop { |
|---|
| 19 | pop @{ shift->parts }; |
|---|
| 20 | } |
|---|
| 21 | |
|---|
| 22 | sub mecab_new { |
|---|
| 23 | my $self = shift; |
|---|
| 24 | my $mecab = Text::MeCab->new({ |
|---|
| 25 | node_format => '%m,%H', |
|---|
| 26 | unk_format => '%m,%H', |
|---|
| 27 | bos_format => '%m,%H', |
|---|
| 28 | eos_format => '%m,%H', |
|---|
| 29 | userdic => module_file(ref $self, 'user.dic'), |
|---|
| 30 | %{ $self->mecab_option || {} }, |
|---|
| 31 | }); |
|---|
| 32 | } |
|---|
| 33 | |
|---|
| 34 | sub call { |
|---|
| 35 | my ($self, $method, @args) = @_; |
|---|
| 36 | |
|---|
| 37 | $self->call_trigger("pre.$method", @args); |
|---|
| 38 | |
|---|
| 39 | if ($self->can($method)) { |
|---|
| 40 | $self->$method(@args); |
|---|
| 41 | } |
|---|
| 42 | |
|---|
| 43 | $self->call_trigger("post.$method", @args); |
|---|
| 44 | } |
|---|
| 45 | |
|---|
| 46 | sub transform { |
|---|
| 47 | my ($self, $text) = @_; |
|---|
| 48 | |
|---|
| 49 | $self->text($text || ""); |
|---|
| 50 | $self->parts([]); |
|---|
| 51 | |
|---|
| 52 | $self->call('prepare'); |
|---|
| 53 | |
|---|
| 54 | my $mecab = $self->mecab_new; |
|---|
| 55 | |
|---|
| 56 | for my $part (split /(\s+)/, $self->text) { |
|---|
| 57 | if ($part =~ /\s/) { |
|---|
| 58 | $self->parts_push($part); |
|---|
| 59 | next; |
|---|
| 60 | } |
|---|
| 61 | foreach ( |
|---|
| 62 | my $node = $mecab->parse( $encoding->encode($part) ); |
|---|
| 63 | $node; |
|---|
| 64 | $node = $node->next |
|---|
| 65 | ) { |
|---|
| 66 | next if $node->stat_type =~ /BOS|EOS/; |
|---|
| 67 | $self->call('node_filter', $node->decoded_node($mecab)); |
|---|
| 68 | } |
|---|
| 69 | } |
|---|
| 70 | |
|---|
| 71 | $self->text(join "", @{ $self->parts }); |
|---|
| 72 | $self->call('finalize'); |
|---|
| 73 | $self->text; |
|---|
| 74 | } |
|---|
| 75 | |
|---|
| 76 | sub node_filter { |
|---|
| 77 | my ($self, $node) = @_; |
|---|
| 78 | my $part = $node->features->{extra} || $node->surface; |
|---|
| 79 | |
|---|
| 80 | if (my $sub = $self->can($node->features->{pos} . '_rule') ) { |
|---|
| 81 | my $ret = $sub->($self, $node); |
|---|
| 82 | $part = $ret if defined $ret; |
|---|
| 83 | } |
|---|
| 84 | |
|---|
| 85 | $self->parts_push($part); |
|---|
| 86 | } |
|---|
| 87 | |
|---|
| 88 | sub Text::MeCab::Node::stat_type { |
|---|
| 89 | { |
|---|
| 90 | 0 => 'MECAB_NOR_NODE', # normal |
|---|
| 91 | 1 => 'MECAB_UNK_NODE', # unknown |
|---|
| 92 | 2 => 'MECAB_BOS_NODE', # begin of sentence |
|---|
| 93 | 3 => 'MECAB_EOS_NODE', # end of sentence |
|---|
| 94 | }->{ |
|---|
| 95 | shift->stat() |
|---|
| 96 | }; |
|---|
| 97 | } |
|---|
| 98 | |
|---|
| 99 | our @dic_keys = qw( |
|---|
| 100 | pos category1 category2 category3 |
|---|
| 101 | inflect inflect_type original yomi pronounse |
|---|
| 102 | extra extra2 extra3 |
|---|
| 103 | ); |
|---|
| 104 | |
|---|
| 105 | sub Text::MeCab::Node::decoded_node { |
|---|
| 106 | my ($node, $mecab) = @_; |
|---|
| 107 | |
|---|
| 108 | my $format = $encoding->decode( $node->format($mecab) ); |
|---|
| 109 | my ($surface, @feature) = split /,/, $format; |
|---|
| 110 | |
|---|
| 111 | return Acme::Samurai::Base::Node->new({ |
|---|
| 112 | mecab => $mecab, |
|---|
| 113 | node => $node, |
|---|
| 114 | stat_type => $node->stat_type, |
|---|
| 115 | surface => $surface, |
|---|
| 116 | feature => join(",", @feature), |
|---|
| 117 | features => do { |
|---|
| 118 | my %_tmp; @_tmp{ @dic_keys } = @feature; |
|---|
| 119 | \%_tmp; |
|---|
| 120 | }, |
|---|
| 121 | }); |
|---|
| 122 | } |
|---|
| 123 | |
|---|
| 124 | |
|---|
| 125 | package Acme::Samurai::Base::Node; |
|---|
| 126 | use strict; |
|---|
| 127 | use Scalar::Util qw(weaken); |
|---|
| 128 | |
|---|
| 129 | use base 'Class::Accessor::Fast'; |
|---|
| 130 | __PACKAGE__->mk_accessors(qw( mecab node stat_type surface feature features )); |
|---|
| 131 | |
|---|
| 132 | sub new { |
|---|
| 133 | my $self = shift->SUPER::new(@_); |
|---|
| 134 | weaken $self->{mecab}; |
|---|
| 135 | $self; |
|---|
| 136 | } |
|---|
| 137 | |
|---|
| 138 | for my $sub (qw( next prev )) { |
|---|
| 139 | no strict 'refs'; ## no critic |
|---|
| 140 | *{__PACKAGE__ . "::${sub}_node"} = sub { |
|---|
| 141 | my $self = shift; |
|---|
| 142 | my $node = $self->node->$sub; |
|---|
| 143 | return $node->decoded_node($self->mecab) |
|---|
| 144 | if $node; |
|---|
| 145 | }; |
|---|
| 146 | } |
|---|
| 147 | |
|---|
| 148 | 1; |
|---|