| 1 | # $Id$ |
|---|
| 2 | # |
|---|
| 3 | # Copyright (c) 2006-2008 Daisuke Maki <daisuke@endeworks.jp> |
|---|
| 4 | # All rights reserved. |
|---|
| 5 | |
|---|
| 6 | package Text::MeCab; |
|---|
| 7 | use strict; |
|---|
| 8 | use warnings; |
|---|
| 9 | use 5.006; |
|---|
| 10 | use Exporter 'import'; |
|---|
| 11 | our ($VERSION, @ISA, %EXPORT_TAGS, @EXPORT_OK); |
|---|
# Compile-time setup: record the module version, load the compiled (XS)
# half of the module, and fill in the Exporter tables.
BEGIN
{
    $VERSION = '0.20000';

    # $] holds the version of the running perl. DynaLoader is used only when
    # it is not newer than 5.006; every other perl goes through XSLoader.
    if ($] <= 5.006) {
        require DynaLoader;
        @ISA = ('DynaLoader');
        __PACKAGE__->bootstrap;
    }
    else {
        require XSLoader;
        XSLoader::load(__PACKAGE__, $VERSION);
    }

    # The node-type constants are importable one by one or all together
    # through the ":all" tag.
    my @node_constants = qw(
        MECAB_NOR_NODE MECAB_UNK_NODE MECAB_BOS_NODE MECAB_EOS_NODE
    );
    %EXPORT_TAGS = (all => [ @node_constants ]);
    @EXPORT_OK   = @node_constants;
}
|---|
| 27 | |
|---|
# Switches that new() passes to mecab as bare flags (no "=value" part).
# Keys are the final "--dashed" spellings; every value is the marker 'bool'.
my %BOOLEAN_OPTIONS;
$BOOLEAN_OPTIONS{$_} = 'bool'
    for qw(--all-morphs --partial --allocate-sentence --version --help);
|---|
| 33 | |
|---|
# new(\%options)
# new(%options)
# new(@command_line_arguments)
#
# Builds an argv-style list (program name first) and hands it to the XS
# constructor _XS_new(), whose return value is returned unchanged.
#
# Named form: option names may use underscores or dashes ("node_format" or
# "node-format"); each becomes a "--dashed" switch.
#   * Options listed in %BOOLEAN_OPTIONS are emitted as a bare switch, and
#     only when their value is true, so all_morphs => 0 leaves the switch
#     off.
#   * Any other option is emitted as "--name=value"; options whose value is
#     undef are skipped instead of producing "--name=".
#
# Command line form: when the first argument is a plain string starting with
# "-", the arguments are passed through verbatim, as the SYNOPSIS documents
# (e.g. new("--userdic=/foo/bar/baz", "-P")).
#
# In both forms --allocate-sentence is always requested, exactly once.
sub new
{
    my $class = shift;

    my @argv = ('perl-TextMeCab');

    # Raw command line form. A named option can never legitimately start
    # with "-", because "--" is prepended to every name below.
    if (@_ && defined $_[0] && !ref $_[0] && $_[0] =~ /^-/) {
        push @argv, @_;
        # -C is assumed to be mecab's short spelling of --allocate-sentence.
        my $already_requested = grep {
            defined $_ && ($_ eq '--allocate-sentence' || $_ eq '-C')
        } @_;
        push @argv, '--allocate-sentence' unless $already_requested;
        return $class->_XS_new(\@argv);
    }

    my %opts = ref($_[0]) eq 'HASH' ? %{ $_[0] } : @_;

    # Normalise names first so "allocate_sentence" and "allocate-sentence"
    # collapse into a single switch instead of being emitted twice.
    my %value_for;
    for my $name (sort keys %opts) {
        (my $switch = $name) =~ s/_/-/g;
        $value_for{"--$switch"} = $opts{$name};
    }

    # Always forced on, whatever the caller asked for.
    $value_for{'--allocate-sentence'} = 1;

    # Sorted so the generated argument list is deterministic.
    for my $switch (sort keys %value_for) {
        my $value = $value_for{$switch};

        if (exists $BOOLEAN_OPTIONS{$switch}) {
            push @argv, $switch if $value;
        }
        elsif (defined $value) {
            push @argv, join('=', $switch, $value);
        }
    }

    return $class->_XS_new(\@argv);
}
|---|
| 55 | |
|---|
| 56 | 1; |
|---|
| 57 | |
|---|
| 58 | __END__ |
|---|
| 59 | |
|---|
| 60 | =head1 NAME |
|---|
| 61 | |
|---|
| 62 | Text::MeCab - Alternate Interface To libmecab |
|---|
| 63 | |
|---|
| 64 | =head1 SYNOPSIS |
|---|
| 65 | |
|---|
| 66 | use Text::MeCab; |
|---|
| 67 | my $mecab = Text::MeCab->new({ |
|---|
| 68 | rcfile => $rcfile, |
|---|
| 69 | dicdir => $dicdir, |
|---|
| 70 | userdic => $userdic, |
|---|
| 71 | lattice_level => $lattice_level, |
|---|
| 72 | all_morphs => $all_morphs, |
|---|
| 73 | output_format_type => $output_format_type, |
|---|
| 74 | partial => $partial, |
|---|
| 75 | node_format => $node_format, |
|---|
| 76 | unk_format => $unk_format, |
|---|
| 77 | bos_format => $bos_format, |
|---|
| 78 | eos_format => $eos_format, |
|---|
| 79 | input_buffer_size => $input_buffer_size, |
|---|
| 80 | allocate_sentence => $allocate_sentence, |
|---|
| 81 | nbest => $nbest, |
|---|
| 82 | theta => $theta, |
|---|
| 83 | }); |
|---|
| 84 | |
|---|
| 85 | for (my $node = $mecab->parse($text); $node; $node = $node->next) { |
|---|
| 86 | # See perldoc for Text::MeCab::Node for list of methods |
|---|
| 87 | print $node->surface, "\n"; |
|---|
| 88 | } |
|---|
| 89 | |
|---|
| 90 | # use constants |
|---|
| 91 | use Text::MeCab qw(:all); |
|---|
| 92 | use Text::MeCab qw(MECAB_NOR_NODE); |
|---|
| 93 | |
|---|
| 94 | # want to use command line arguments? |
|---|
| 95 | my $mecab = Text::MeCab->new("--userdic=/foo/bar/baz", "-P"); |
|---|
| 96 | |
|---|
| 97 | # check what mecab version we compiled against? |
|---|
| 98 | print "Compiled with ", &Text::MeCab::MECAB_VERSION, "\n"; |
|---|
| 99 | |
|---|
| 100 | =head1 DESCRIPTION |
|---|
| 101 | |
|---|
| 102 | libmecab (http://mecab.sourceforge.ne.jp) already has a perl interface built |
|---|
| 103 | with it, so why a new module? I just feel that while a subtle difference, |
|---|
| 104 | making the perl interface through a tied hash is just... weird. |
|---|
| 105 | |
|---|
| 106 | So Text::MeCab gives you a more natural, Perl-ish way to access libmecab! |
|---|
| 107 | |
|---|
| 108 | WARNING: Version 0.20000 has only been tested against libmecab 0.96. |
|---|
| 109 | |
|---|
| 110 | =head1 Text::MeCab AND FORMATS |
|---|
| 111 | |
|---|
| 112 | mecab allows users to specify an output format, via --*-format options. |
|---|
| 113 | These are respected ONLY if you use the format() method: |
|---|
| 114 | |
|---|
| 115 | my $mecab = Text::MeCab->new({ |
|---|
| 116 | output_format_type => "user", |
|---|
| 117 | node_format => "%m %pn" |
|---|
| 118 | }); |
|---|
| 119 | |
|---|
| 120 | for(my $node = $mecab->parse($text); $node; $node = $node->next) { |
|---|
| 121 | print $node->format($mecab); |
|---|
| 122 | } |
|---|
| 123 | |
|---|
| 124 | Note that you need to set the output_format_type parameter as well. |
|---|
| 125 | |
|---|
| 126 | =head1 Text::MeCab AND SCOPING |
|---|
| 127 | |
|---|
| 128 | [NOTE: The memory management issue has been changed since 0.09] |
|---|
| 129 | |
|---|
| 130 | libmecab's default behavior is such that when you analyze a text and get a |
|---|
| 131 | node back, that node is tied to the mecab "tagger" object that performed the |
|---|
| 132 | analysis. Therefore, when that tagger is destroyed via mecab_destroy(), |
|---|
| 133 | all nodes that are associated to it are freed as well. |
|---|
| 134 | |
|---|
| 135 | Text::MeCab defaults to the same behavior, so the following won't work: |
|---|
| 136 | |
|---|
| 137 | sub get_mecab_node { |
|---|
| 138 | my $mecab = Text::MeCab->new; |
|---|
| 139 | my $node = $mecab->parse($_[0]); |
|---|
| 140 | return $node; |
|---|
| 141 | } |
|---|
| 142 | |
|---|
| 143 | my $node = get_mecab_node($text); |
|---|
| 144 | |
|---|
| 145 | By the time get_mecab_node() returns, the Text::MeCab object is DESTROY'ed, |
|---|
| 146 | and so is $node (actually, the object exists, but it will complain when you |
|---|
| 147 | try to access the node's internals, because the C struct that was there |
|---|
| 148 | has already been freed). |
|---|
| 149 | |
|---|
| 150 | In such cases, use the dclone() method. This will copy the *entire* node |
|---|
| 151 | structure and create a new Text::MeCab::Node::Cloned instance. |
|---|
| 152 | |
|---|
| 153 | sub get_mecab_node { |
|---|
| 154 | my $mecab = Text::MeCab->new; |
|---|
| 155 | my $node = $mecab->parse($_[0]); |
|---|
| 156 | return $node->dclone(); |
|---|
| 157 | } |
|---|
| 158 | |
|---|
| 159 | The returned Text::MeCab::Node::Cloned object is exactly the same as |
|---|
| 160 | Text::MeCab::Node object on the surface. It just uses a different but |
|---|
| 161 | very similar C struct underneath. It is blessed into a different namespace |
|---|
| 162 | only because we need to use a different memory management strategy. |
|---|
| 163 | |
|---|
| 164 | Do be aware of the memory issue. You WILL use up twice as much memory. |
|---|
| 165 | |
|---|
| 166 | Also please note that if you try the first example, accessing the node |
|---|
| 167 | *WILL* result in a segfault. This is *NOT* a bug: it's a feature :) |
|---|
| 168 | While it is possible to control the memory management such that accessing |
|---|
| 169 | a field in a node that has already expired results in a legal croak(), |
|---|
| 170 | we do not go to such lengths to ensure this, because it will result in |
|---|
| 171 | a performance penalty. |
|---|
| 172 | |
|---|
| 173 | Just remember that unless you dclone() a node, then you are NOT allowed to |
|---|
| 174 | access it when the original tagger goes out of scope: |
|---|
| 175 | |
|---|
| 176 | { |
|---|
| 177 | my $mecab = Text::MeCab->new; |
|---|
| 178 | $node = $mecab->parse(...); |
|---|
| 179 | } |
|---|
| 180 | |
|---|
| 181 | $node->surface; # segfault!!!! |
|---|
| 182 | |
|---|
| 183 | Always remember to dclone() before doing this! |
|---|
| 184 | |
|---|
| 185 | =head1 PERFORMANCE |
|---|
| 186 | |
|---|
| 187 | Below is the result of running tools/benchmark.pl on my PowerBook: |
|---|
| 188 | |
|---|
| 189 | daisuke@beefcake Text-MeCab$ perl tools/benchmark.pl |
|---|
| 190 | Rate mecab text_mecab |
|---|
| 191 | mecab 5.53/s -- -63% |
|---|
| 192 | text_mecab 14.9/s 170% -- |
|---|
| 193 | |
|---|
| 194 | =head1 METHODS |
|---|
| 195 | |
|---|
| 196 | =head2 new HASHREF | LIST |
|---|
| 197 | |
|---|
| 198 | Creates a new Text::MeCab instance. |
|---|
| 199 | |
|---|
| 200 | You can either specify a hashref and use named parameters, or you can use the |
|---|
| 201 | exact command line arguments that the mecab command accepts. |
|---|
| 202 | |
|---|
| 203 | Below is the list of accepted named options. See the man page for mecab for |
|---|
| 204 | details about each option. |
|---|
| 205 | |
|---|
| 206 | =over 4 |
|---|
| 207 | |
|---|
| 208 | =item B<rcfile> |
|---|
| 209 | |
|---|
| 210 | =item B<dicdir> |
|---|
| 211 | |
|---|
| 212 | =item B<lattice_level> |
|---|
| 213 | |
|---|
| 214 | =item B<all_morphs> |
|---|
| 215 | |
|---|
| 216 | =item B<output_format_type> |
|---|
| 217 | |
|---|
| 218 | =item B<partial> |
|---|
| 219 | |
|---|
| 220 | =item B<node_format> |
|---|
| 221 | |
|---|
| 222 | =item B<unk_format> |
|---|
| 223 | |
|---|
| 224 | =item B<bos_format> |
|---|
| 225 | |
|---|
| 226 | =item B<eos_format> |
|---|
| 227 | |
|---|
| 228 | =item B<input_buffer_size> |
|---|
| 229 | |
|---|
| 230 | =item B<allocate_sentence> |
|---|
| 231 | |
|---|
| 232 | =item B<nbest> |
|---|
| 233 | |
|---|
| 234 | =item B<theta> |
|---|
| 235 | |
|---|
| 236 | =back |
|---|
| 237 | |
|---|
| 238 | =head2 parse SCALAR |
|---|
| 239 | |
|---|
| 240 | Parses the given text via mecab, and returns a Text::MeCab::Node object. |
|---|
| 241 | |
|---|
| 242 | =head2 ENCODING |
|---|
| 243 | |
|---|
| 244 | my $encoding = Text::MeCab::ENCODING; |
|---|
| 245 | |
|---|
| 246 | Returns the encoding of the underlying mecab library that was detected at |
|---|
| 247 | compile time. |
|---|
| 248 | |
|---|
| 249 | =head2 MECAB_VERSION |
|---|
| 250 | |
|---|
| 251 | The version number from libmecab's mecab_version() |
|---|
| 252 | |
|---|
| 253 | =head2 MECAB_TARGET_VERSION |
|---|
| 254 | |
|---|
| 255 | =head2 MECAB_TARGET_MAJOR_VERSION |
|---|
| 256 | |
|---|
| 257 | =head2 MECAB_TARGET_MINOR_VERSION |
|---|
| 258 | |
|---|
| 259 | The version number detected at compile time of Text::MeCab. |
|---|
| 260 | |
|---|
| 261 | =head2 MECAB_NOR_NODE |
|---|
| 262 | |
|---|
| 263 | =head2 MECAB_UNK_NODE |
|---|
| 264 | |
|---|
| 265 | =head2 MECAB_BOS_NODE |
|---|
| 266 | |
|---|
| 267 | =head2 MECAB_EOS_NODE |
|---|
| 268 | |
|---|
| 269 | =head2 MECAB_USR_DIC |
|---|
| 270 | |
|---|
| 271 | =head2 MECAB_SYS_DIC |
|---|
| 272 | |
|---|
| 273 | =head2 MECAB_UNK_DIC |
|---|
| 274 | |
|---|
| 275 | =head1 SEE ALSO |
|---|
| 276 | |
|---|
| 277 | http://mecab.sourceforge.ne.jp |
|---|
| 278 | |
|---|
| 279 | =head1 AUTHOR |
|---|
| 280 | |
|---|
| 281 | Copyright (c) 2006-2008 Daisuke Maki E<lt>daisuke@endeworks.jpE<gt> |
|---|
| 282 | All rights reserved. |
|---|
| 283 | |
|---|
| 284 | =cut |
|---|