root/lang/perl/Text-MeCab/trunk/lib/Text/MeCab.pm @ 6718

Revision 6718, 7.0 kB (checked in by daisuke, 5 years ago)

ad license info

  • Property svn:keywords set to Id
Line 
1# $Id$
2#
3# Copyright (c) 2006-2008 Daisuke Maki <daisuke@endeworks.jp>
4# All rights reserved.
5
6package Text::MeCab;
7use strict;
8use warnings;
9use 5.006;
10use Exporter 'import';
11our ($VERSION, @ISA, %EXPORT_TAGS, @EXPORT_OK);
# Compile-time setup: record the module version, load the compiled half of
# the module, and publish the list of exportable node-type constants.
BEGIN
{
    $VERSION = '0.20005';

    if ($] <= 5.006) {
        # Interpreters that are not newer than 5.006 go through DynaLoader.
        require DynaLoader;
        @ISA = qw(DynaLoader);
        __PACKAGE__->bootstrap;
    }
    else {
        # Everything newer uses XSLoader, passing the version along.
        require XSLoader;
        XSLoader::load(__PACKAGE__, $VERSION);
    }

    # The ":all" tag is the only export tag; @EXPORT_OK is derived from the
    # tags so that the two can never disagree.
    my @node_constants = qw(
        MECAB_NOR_NODE
        MECAB_UNK_NODE
        MECAB_BOS_NODE
        MECAB_EOS_NODE
    );
    %EXPORT_TAGS = (all => \@node_constants);
    @EXPORT_OK   = map { @{ $EXPORT_TAGS{$_} } } keys %EXPORT_TAGS;
}
27
# Switches that new() passes on as a bare flag (no "=value" part).
# Keys are the final, dash-prefixed spelling; the value is always 'bool'.
my %BOOLEAN_OPTIONS;
$BOOLEAN_OPTIONS{$_} = 'bool'
    for qw(--all-morphs --partial --allocate-sentence --version --help);
33
# new(\%options) / new(%options)
#
# Builds a command-line style argument list from the given named options
# and hands it to _XS_new(), whose return value is returned unchanged.
#
# Option names may be written with underscores (all_morphs) or dashes
# (all-morphs); both spellings end up as the same "--all-morphs" switch.
#
#   * Options listed in %BOOLEAN_OPTIONS are emitted as a bare switch, and
#     only when their value is true. A false or undefined value means
#     "leave the switch off".
#   * Every other option is emitted as "--name=value". Options whose value
#     is undef are treated as "not specified" and are omitted.
#   * --allocate-sentence is always switched on, whatever the caller passed.
#
# Arguments are emitted in sorted order so the resulting list is the same
# on every run.
sub new
{
    my $class = shift;

    my %args = ref $_[0] eq 'HASH' ? %{$_[0]} : @_;

    # Normalise the names first so that "allocate_sentence" and
    # "allocate-sentence" collapse into one entry instead of producing the
    # same switch twice.
    my %options;
    for my $name (keys %args) {
        (my $option = $name) =~ s/_/-/g;
        $options{"--$option"} = $args{$name};
    }
    $options{'--allocate-sentence'} = 1;

    # First element is presumably the program name (argv[0]) expected by
    # the underlying library -- TODO confirm against the XS code.
    my @args = ('perl-TextMeCab');
    for my $option (sort keys %options) {
        my $value = $options{$option};

        if (exists $BOOLEAN_OPTIONS{$option}) {
            push @args, $option if $value;
        }
        elsif (defined $value) {
            push @args, join('=', $option, $value);
        }
    }

    return $class->_XS_new(\@args);
}
55
561;
57
58__END__
59
60=head1 NAME
61
62Text::MeCab - Alternate Interface To libmecab
63
64=head1 SYNOPSIS
65
66  use Text::MeCab;
67  my $mecab = Text::MeCab->new({
68    rcfile             => $rcfile,
69    dicdir             => $dicdir,
70    userdic            => $userdic,
71    lattice_level      => $lattice_level,
72    all_morphs         => $all_morphs,
73    output_format_type => $output_format_type,
74    partial            => $partial,
75    node_format        => $node_format,
76    unk_format         => $unk_format,
77    bos_format         => $bos_format,
78    eos_format         => $eos_format,
79    input_buffer_size  => $input_buffer_size,
80    allocate_sentence  => $allocate_sentence,
81    nbest              => $nbest,
82    theta              => $theta,
83  });
84
85  for (my $node = $mecab->parse($text); $node; $node = $node->next) {
86     # See perldoc for Text::MeCab::Node for list of methods
87     print $node->surface, "\n";
88  }
89
90  # use constants
91  use Text::MeCab qw(:all);
92  use Text::MeCab qw(MECAB_NOR_NODE);
93
94  # check what mecab version we compiled against?
95  print "Compiled with ", &Text::MeCab::MECAB_VERSION, "\n";
96
97=head1 DESCRIPTION
98
99libmecab (http://mecab.sourceforge.ne.jp) already has a perl interface built
100with it, so why a new module? I just feel that while a subtle difference,
101making the perl interface through a tied hash is just... weird.
102
103So Text::MeCab gives you a more natural, Perl-ish way to access libmecab!
104
105WARNING: Version 0.20000 has only been tested against libmecab 0.96.
106
107=head1 Text::MeCab AND FORMATS
108
109mecab allows users to specify an output format, via --*-format options.
110These are respected ONLY if you use the format() method:
111
112  my $mecab = Text::MeCab->new({
113    output_format_type => "user",
114    node_format => "%m %pn"
115  });
116
117  for(my $node = $mecab->parse($text); $node; $node = $node->next) {
118    print $node->format($mecab);
119  }
120
121Note that you need to set the output_format_type parameter as well.
122
123=head1 Text::MeCab AND SCOPING
124
125[NOTE: The memory management issue has been changed since 0.09]
126
127libmecab's default behavior is such that when you analyze a text and get a
128node back, that node is tied to the mecab "tagger" object that performed the
129analysis. Therefore, when that tagger is destroyed via mecab_destroy(),
130all nodes that are associated to it are freed as well.
131
132Text::MeCab defaults to the same behavior, so the following won't work:
133
134  sub get_mecab_node {
135     my $mecab = Text::MeCab->new;
136     my $node  = $mecab->parse($_[0]);
137     return $node;
138  }
139
140  my $node = get_mecab_node($text);
141
142By the time get_mecab_node() returns, the Text::MeCab object is DESTROY'ed,
143and so is $node (actually, the object exists, but it will complain when you
144try to access the node's internals, because the C struct that was there
145has already been freed).
146
147In such cases, use the dclone() method. This will copy the *entire* node
148structure and create a new Text::MeCab::Node::Cloned instance.
149
150  sub get_mecab_node {
151     my $mecab = Text::MeCab->new;
152     my $node  = $mecab->parse($_[0]);
153     return $node->dclone();
154  }
155
156The returned Text::MeCab::Node::Cloned object is exactly the same as
157Text::MeCab::Node object on the surface. It just uses a different but
158very similar C struct underneath. It is blessed into a different namespace
159only because we need to use a different memory management strategy.
160
161Do be aware of the memory issue. You WILL use up twice as much memory.
162
163Also please note that if you try the first example, accessing the node
164*WILL* result in a segfault. This is *NOT* a bug: it's a feature :)
165While it is possible to control the memory management such that accessing
166a field in a node that has already expired results in a legal croak(),
167we do not go to the length to ensure this, because it will result in
168a performance penalty.
169
170Just remember that unless you dclone() a node, then you are NOT allowed to
171access it when the original tagger goes out of scope:
172
173   {
174       my $mecab = Text::MeCab->new;
175       $node = $mecab->parse(...);
176   }
177
178   $node->surface; # segfault!!!!
179
180Always remember to dclone() before doing this!
181
182=head1 PERFORMANCE
183
184Below is the result of running tools/benchmark.pl on my PowerBook:
185
186  daisuke@beefcake Text-MeCab$ perl tools/benchmark.pl
187               Rate      mecab text_mecab
188  mecab      5.53/s         --       -63%
189  text_mecab 14.9/s       170%         --
190
191=head1 METHODS
192
193=head2 new HASHREF | LIST
194
195Creates a new Text::MeCab instance.
196
197You can either specify a hashref and use named parameters, or you can use the
198exact command line arguments that the mecab command accepts.
199
200Below is the list of accepted named options. See the man page for mecab for
201details about each option.
202
203=over 4
204
205=item B<rcfile>
206
207=item B<dicdir>
208
209=item B<lattice_level>
210
211=item B<all_morphs>
212
213=item B<output_format_type>
214
215=item B<partial>
216
217=item B<node_format>
218
219=item B<unk_format>
220
221=item B<bos_format>
222
223=item B<eos_format>
224
225=item B<input_buffer_size>
226
227=item B<allocate_sentence>
228
229=item B<nbest>
230
231=item B<theta>
232
233=back
234
235=head2 parse SCALAR
236
237Parses the given text via mecab, and returns a Text::MeCab::Node object.
238
239=head2 ENCODING
240
241  my $encoding = Text::MeCab::ENCODING
242
243Returns the encoding of the underlying mecab library that was detected at
244compile time.
245
246=head2 MECAB_VERSION
247
248The version number from libmecab's mecab_version()
249
250=head2 MECAB_TARGET_VERSION
251
252=head2 MECAB_TARGET_MAJOR_VERSION
253
254=head2 MECAB_TARGET_MINOR_VERSION
255
256The version number detected at compile time of Text::MeCab.
257
258=head2 MECAB_NOR_NODE
259
260=head2 MECAB_UNK_NODE
261
262=head2 MECAB_BOS_NODE
263
264=head2 MECAB_EOS_NODE
265
266=head2 MECAB_USR_DIC
267
268=head2 MECAB_SYS_DIC
269
270=head2 MECAB_UNK_DIC
271
272=head2 MECAB_CONFIG
273
274Path to mecab-config, if available.
275
276=head1 SEE ALSO
277
278http://mecab.sourceforge.ne.jp
279
280=head1 LICENSE
281
282This program is free software; you can redistribute it and/or modify it
283under the same terms as Perl itself.
284
285See http://www.perl.com/perl/misc/Artistic.html
286
287=head1 AUTHOR
288
289Copyright (c) 2006-2008 Daisuke Maki E<lt>daisuke@endeworks.jpE<gt>
290All rights reserved.
291
292=cut
Note: See TracBrowser for help on using the browser.