root/lang/perl/Text-MeCab/trunk/lib/Text/MeCab.pm @ 4158

Revision 4158, 6.9 kB (checked in by daisuke, 5 years ago)

lang/perl/Text-MeCab?; don't subclass Exporter, use our, use perl 5.6

  • Property svn:keywords set to Id
Line 
1# $Id$
2#
3# Copyright (c) 2006-2008 Daisuke Maki <daisuke@endeworks.jp>
4# All rights reserved.
5
6package Text::MeCab;
7use strict;
8use warnings;
9use 5.006;
10use Exporter 'import';
11our ($VERSION, @ISA, %EXPORT_TAGS, @EXPORT_OK);
# Compile-time setup: record the module version, load the compiled (XS)
# half of Text::MeCab, and declare which constants may be imported.
BEGIN {
    # Must be assigned before loading, because the loader call below is
    # handed this value.
    $VERSION = '0.20000';

    if ($] <= 5.006) {
        # Fallback path for the oldest supported perl: inherit from
        # DynaLoader and bootstrap through it.
        require DynaLoader;
        @ISA = qw(DynaLoader);
        __PACKAGE__->bootstrap;
    }
    else {
        # Every newer perl goes through XSLoader.
        require XSLoader;
        XSLoader::load(__PACKAGE__, $VERSION);
    }

    # Constants offered for import, either by name or via the ':all' tag.
    my @node_constants = qw(
        MECAB_NOR_NODE
        MECAB_UNK_NODE
        MECAB_BOS_NODE
        MECAB_EOS_NODE
    );
    %EXPORT_TAGS = (all => [ @node_constants ]);

    # Everything reachable through a tag is also importable individually.
    @EXPORT_OK = map { @{$_} } values %EXPORT_TAGS;
}
27
# Long-form switches that are passed bare (without an "=value" part) when
# the constructor builds its argument list.  Keys are the fully normalized
# switch names, including the leading "--"; the value is always 'bool'.
my %BOOLEAN_OPTIONS;
$BOOLEAN_OPTIONS{$_} = 'bool'
    for qw(--all-morphs --partial --allocate-sentence --version --help);
33
# Create a new Text::MeCab instance.
#
# Accepted call styles:
#
#   Text::MeCab->new({ dicdir => $dir, all_morphs => 1 });   # hashref
#   Text::MeCab->new(dicdir => $dir, all_morphs => 1);       # key/value list
#   Text::MeCab->new("--userdic=/foo/bar/baz", "-P");        # raw switches
#
# Named options are normalized by turning "_" into "-" and prefixing "--".
# Options listed in %BOOLEAN_OPTIONS become a bare switch, and only when
# their value is true; every other option becomes "--name=value".  Named
# options whose value is undef are left out entirely.
#
# When the first argument is a plain string starting with "-", the whole
# argument list is taken to be ready-made command line switches and is
# passed through untouched (undef entries are dropped).
#
# In every case "--allocate-sentence" is forced on, exactly once, whatever
# the caller supplied.
#
# Returns whatever $class->_XS_new() returns for the assembled argument
# list (an array reference of strings).
sub new
{
    my $class = shift;

    # The leading element presumably plays the role of the program name
    # (argv[0]) for the underlying library -- TODO confirm against the XS.
    my @args = ('perl-TextMeCab');

    if (@_ && defined $_[0] && !ref $_[0] && $_[0] =~ /^-/) {
        # Raw command line style.  Remove any explicit long-form
        # allocate-sentence switch so it is not duplicated below.
        push @args,
            grep { defined($_) && $_ ne '--allocate-sentence' } @_;
    }
    else {
        my %named = ref($_[0]) eq 'HASH' ? %{ $_[0] } : @_;

        # Sorted so the generated argument list is deterministic.
        for my $name (sort keys %named) {
            my $value = $named{$name};

            (my $switch = "--$name") =~ tr/_/-/;

            # Always forced on below, regardless of spelling or value.
            next if $switch eq '--allocate-sentence';

            if (exists $BOOLEAN_OPTIONS{$switch}) {
                # A false value means "off": emit nothing rather than
                # switching the feature on.
                push @args, $switch if $value;
            }
            elsif (defined $value) {
                push @args, join('=', $switch, $value);
            }
        }
    }

    push @args, '--allocate-sentence';

    return $class->_XS_new(\@args);
}
55
561;
57
58__END__
59
60=head1 NAME
61
62Text::MeCab - Alternate Interface To libmecab
63
64=head1 SYNOPSIS
65
66  use Text::MeCab;
67  my $mecab = Text::MeCab->new({
68    rcfile             => $rcfile,
69    dicdir             => $dicdir,
70    userdic            => $userdic,
71    lattice_level      => $lattice_level,
72    all_morphs         => $all_morphs,
73    output_format_type => $output_format_type,
74    partial            => $partial,
75    node_format        => $node_format,
76    unk_format         => $unk_format,
77    bos_format         => $bos_format,
78    eos_format         => $eos_format,
79    input_buffer_size  => $input_buffer_size,
80    allocate_sentence  => $allocate_sentence,
81    nbest              => $nbest,
82    theta              => $theta,
83  });
84
85  for (my $node = $mecab->parse($text); $node; $node = $node->next) {
86     # See perldoc for Text::MeCab::Node for list of methods
87     print $node->surface, "\n";
88  }
89
90  # use constants
91  use Text::MeCab qw(:all);
92  use Text::MeCab qw(MECAB_NOR_NODE);
93
94  # want to use command line arguments?
95  my $mecab = Text::MeCab->new("--userdic=/foo/bar/baz", "-P");
96
97  # check what mecab version we compiled against?
98  print "Compiled with ", &Text::MeCab::MECAB_VERSION, "\n";
99
100=head1 DESCRIPTION
101
102libmecab (http://mecab.sourceforge.ne.jp) already has a perl interface built
103with it, so why a new module? I just feel that while a subtle difference,
104making the perl interface through a tied hash is just... weird.
105
106So Text::MeCab gives you a more natural, Perl-ish way to access libmecab!
107
108WARNING: Version 0.20000 has only been tested against libmecab 0.96.
109
110=head1 Text::MeCab AND FORMATS
111
112mecab allows users to specify an output format, via --*-format options.
113These are respected ONLY if you use the format() method:
114
115  my $mecab = Text::MeCab->new({
116    output_format_type => "user",
117    node_format => "%m %pn"
118  });
119
120  for(my $node = $mecab->parse($text); $node; $node = $node->next) {
121    print $node->format($mecab);
122  }
123
124Note that you need to set the output_format_type parameter as well.
125
126=head1 Text::MeCab AND SCOPING
127
128[NOTE: The memory management issue has been changed since 0.09]
129
130libmecab's default behavior is such that when you analyze a text and get a
131node back, that node is tied to the mecab "tagger" object that performed the
132analysis. Therefore, when that tagger is destroyed via mecab_destroy(),
133all nodes that are associated to it are freed as well.
134
135Text::MeCab defaults to the same behavior, so the following won't work:
136
137  sub get_mecab_node {
138     my $mecab = Text::MeCab->new;
139     my $node  = $mecab->parse($_[0]);
140     return $node;
141  }
142
143  my $node = get_mecab_node($text);
144
145By the time get_mecab_node() returns, the Text::MeCab object is DESTROY'ed,
146and so is $node (actually, the object exists, but it will complain when you
147try to access the node's internals, because the C struct that was there
148has already been freed).
149
150In such cases, use the dclone() method. This will copy the *entire* node
151structure and create a new Text::MeCab::Node::Cloned instance.
152
153  sub get_mecab_node {
154     my $mecab = Text::MeCab->new;
155     my $node  = $mecab->parse($_[0]);
156     return $node->dclone();
157  }
158
159The returned Text::MeCab::Node::Cloned object is exactly the same as
160Text::MeCab::Node object on the surface. It just uses a different but
161very similar C struct underneath. It is blessed into a different namespace
162only because we need to use a different memory management strategy.
163
164Do be aware of the memory issue. You WILL use up twice as much memory.
165
166Also please note that if you try the first example, accessing the node
167*WILL* result in a segfault. This is *NOT* a bug: it's a feature :)
168While it is possible to control the memory management such that accessing
169a field in a node that has already expired results in a legal croak(),
170we do not go to the length to ensure this, because it will result in
171a performance penalty.
172
173Just remember that unless you dclone() a node, then you are NOT allowed to
174access it when the original tagger goes out of scope:
175
176   {
177       my $mecab = Text::MeCab->new;
178       $node = $mecab->parse(...);
179   }
180
181   $node->surface; # segfault!!!!
182
183Always remember to dclone() before doing this!
184
185=head1 PERFORMANCE
186
187Below is the result of running tools/benchmark.pl on my PowerBook:
188
189  daisuke@beefcake Text-MeCab$ perl tools/benchmark.pl
190               Rate      mecab text_mecab
191  mecab      5.53/s         --       -63%
192  text_mecab 14.9/s       170%         --
193
194=head1 METHODS
195
196=head2 new HASHREF | LIST
197
198Creates a new Text::MeCab instance.
199
200You can either specify a hashref and use named parameters, or you can use the
201exact command line arguments that the mecab command accepts.
202
203Below is the list of accepted named options. See the man page for mecab for
204details about each option.
205
206=over 4
207
208=item B<rcfile>
209
210=item B<dicdir>
211
212=item B<lattice_level>
213
214=item B<all_morphs>
215
216=item B<output_format_type>
217
218=item B<partial>
219
220=item B<node_format>
221
222=item B<unk_format>
223
224=item B<bos_format>
225
226=item B<eos_format>
227
228=item B<input_buffer_size>
229
230=item B<allocate_sentence>
231
232=item B<nbest>
233
234=item B<theta>
235
236=back
237
238=head2 parse SCALAR
239
240Parses the given text via mecab, and returns a Text::MeCab::Node object.
241
242=head2 ENCODING
243
244  my $encoding = Text::MeCab::ENCODING
245
246Returns the encoding of the underlying mecab library that was detected at
247compile time.
248
249=head2 MECAB_VERSION
250
251The version number from libmecab's mecab_version()
252
253=head2 MECAB_TARGET_VERSION
254
255=head2 MECAB_TARGET_MAJOR_VERSION
256
257=head2 MECAB_TARGET_MINOR_VERSION
258
259The version number detected at compile time of Text::MeCab.
260
261=head2 MECAB_NOR_NODE
262
263=head2 MECAB_UNK_NODE
264
265=head2 MECAB_BOS_NODE
266
267=head2 MECAB_EOS_NODE
268
269=head2 MECAB_USR_DIC
270
271=head2 MECAB_SYS_DIC
272
273=head2 MECAB_UNK_DIC
274
275=head1 SEE ALSO
276
277http://mecab.sourceforge.ne.jp
278
279=head1 AUTHOR
280
281Copyright (c) 2006-2008 Daisuke Maki E<lt>daisuke@endeworks.jpE<gt>
282All rights reserved.
283
284=cut
Note: See TracBrowser for help on using the browser.