root/lang/perl/Mezasi/trunk/lib/Mezasi/Dictionary.pm @ 3139

Revision 3139, 8.1 kB (checked in by hiroyukim, 6 years ago)

import Mezasi

Line 
1package Mezasi::Dictionary;
2use strict;
3use warnings;
4use utf8;
5
6use Switch;
7use IO::File;
8use File::Copy;
9use List::MoreUtils qw/uniq/;
10use Mezasi::Trie;
11use Data::Dumper;
12
13our $TEXT_FILENAME = 'sixamo.txt';
14our $DIC_FILENAME  = 'sixamo.dic';
15our $WindowSize    = 500;
16
17sub new {
18    my ( $class, $dirname ) = @_;
19
20    my $self = bless  {
21        occur         => \my %occur,
22        rel           => \my %rel,
23        trie          => Mezasi::Trie->new,
24        dirname       => $dirname || undef,
25        text_filename => "@{[$dirname]}/@{[$TEXT_FILENAME]}",
26        dic_filename  => "@{[$dirname]}/@{[$DIC_FILENAME]}",
27        text          => \my @text,
28        line_num      => 0,
29    } , $class;
30
31    return $self;
32}
33
34sub load {
35    my ( $self, $dirname ) = @_;
36
37    my $dic = Dictionary->new($dirname);
38    $dic->load_text;
39    $dic->load_dictionary;
40
41    return $dic;
42}
43
44sub load_text {
45    my $self = shift;
46
47    return unless( -e $self->{text_filename} );
48
49    my $io = IO::File->new( $self->{text_filename} , "<:utf8" );   
50
51    while( my $line = <$io> ) {
52        chomp($line);
53
54        push @{$self->{text}}, $line ;
55    }
56}
57
58sub load_dictionary {
59    my $self = shift;
60
61    return unless( -e $self->{dic_filename} );
62
63    my @lines = IO::File->new($self->{dic_filename}, "<:utf8")->getlines;
64
65    my $line_num = shift @lines;
66    ($self->{line_num},) = ( $line_num =~ /line_num:\s*(.*)\s*$/i );
67
68    for my $line (@lines) {
69        chomp $line;
70
71        my ($word,$num,$sum,$occur) = split(/\t/, $line );
72
73        if( $occur ) {
74            $self->{occur}->{$word} = [ map { int $_ } split( /,/, $occur ) ];
75            $self->add_term($word);
76            $self->{rel}->{$word} = {};
77            $self->{rel}->{$word}->{num} = int $num;
78            $self->{rel}->{$word}->{num} = int $sum;
79        }
80    }
81}
82
83sub save_text {
84    my $self = shift;
85
86    my $tmp_file_name = "@{[$self->dirname]}/sixamo.tmp.@{[$$]}-@{[rand(100)]}";
87
88    my $fp = IO::File->new( $tmp_file_name, ">>:utf8" );
89
90    for my $line (@{$self->text}) {
91        print $fp $line;
92    }
93
94    File::Copy::move( $tmp_file_name , $self->{dic_filename} );
95}
96
97sub save_dictionary {
98    my $self = shift;
99
100    my $tmp_file_name = "@{[$self->dirname]}/sixamo.tmp.@{[$$]}-@{[rand(100)]}";
101
102    my $fp = IO::File->new( $tmp_file_name,">:utf8" );
103
104    print $fp $self->to_s;
105
106    File::Copy::move( $tmp_file_name , $self->{dic_filename} );
107
108}
109
110sub to_s {
111    my $self = shift;
112
113    my $result = '';
114    $result .= "line_num: @{[$self->line_num]}\n";
115    $result .= "\n";
116
117    %{$self->{occur}} = map { ( $_ =>  $self->{occur}->{$_} ) } grep { not ( scalar( @{$self->{occur}->{$_}} ) == 0 ) } keys %{$self->{occur}};
118   
119    while( my ( $key , $value ) = each %{$self->{occur}} ) {
120        $self->{oocur}->{$key} = [ map { $value->[$_] } (-100..-1) ] if( scalar( @{$value} ) > 100 );
121    }
122   
123    my @tmp;
124    for my $key ( sort { $a cmp $b } keys %{$self->{occur}} ) {
125        push @tmp , [ - scalar( @{$self->{occur}->{$key}} ) , $self->{rel}->{$key}->{num}, scalar( @{[split(//,$key)]} ) , $key ];
126    }
127
128    for my $key ( @tmp ) {
129        $result .= sprintf("%s\t\%s\t\%s\t%s\n",
130            $key,
131            $self->{ocuur}->{$key}->{num},
132            $self->{ocuur}->{$key}->{sum},
133            join(',', @{$self->{occur}->{$key}} )
134        );     
135    }
136
137    return $result;
138}
139
140sub learn_from_text {
141    my ($self, $progress ) = @_;
142
143    my $modified = undef;
144
145    my $read_size = 0;
146    my @buf_prev;
147    my $end_flag  = undef;
148    my $idx       = $self->{line_num};
149
150    while(1) {
151        my @buf;
152
153        if( $progress ) {
154            my $idx2 = $read_size / $WindowSize * $WindowSize ;
155
156            if( $idx2 % 100000 == 0 ) {
157                warn sprintf("\n%5dk", $idx2/1000 ) ;
158            }
159            elsif ( $idx2 % 20000 == 0 ) {
160                warn "*";
161            }
162            elsif ( $idx2 % 2000 == 0 ) {
163                warn ".";
164            }
165        }
166
167        my $tmp = $read_size;
168        while( $tmp/$WindowSize == $read_size/$WindowSize ) {
169            if( $idx >= scalar( @{$self->{text}} ) ) {
170                $end_flag = 1;
171                last;
172            }
173
174            push @buf , $self->{text}->[$idx] ;
175            $tmp += scalar( @{$self->{text}} ) ;
176            $idx++;
177        }
178
179        $read_size = $tmp;
180
181        last if $end_flag ;
182
183        if( scalar( @buf_prev ) > 0 ) {
184            $self->learn(  [ (@buf_prev , @buf) ] , $self->{line_num} );
185            $modified = 1;
186
187            $self->{line_num} += scalar( @buf_prev );
188        }
189
190        @buf_prev = @buf;
191    }
192
193    warn "\n" if $progress;
194
195    return $modified;
196}
197
198sub store_text {
199    my ($self, @lines) = @_;
200
201    my @ary;
202    for my $line (@lines) {
203        $line =~ s/\s+/ /;
204        push @ary, $line;
205    }
206
207    @{$self->{text}} = @ary;
208
209    my $fp = IO::File->new( $self->{text_filename} , ">>:utf8");
210    for my $line (@ary) {
211        chomp $line;
212
213        print $fp $line."\n";
214    }
215
216    $fp->close;
217
218    return;
219}
220
221sub learn {
222    my ($self, $lines, $idx ) = @_;
223
224    my @new_terms = Freq->extract_terms( $lines, 30 );
225   
226    for my $term ( @new_terms ) {
227        $self->add_term($term);
228    }
229
230    if( $idx ) {
231        my @words_all;
232
233        my $count=0;
234        for my $line ( @{$lines} ) {
235            my $num = $idx + 1;
236            my @words = @{$self->split_into_terms($line)};
237            @words_all = (@words_all,@words);
238
239            for my $term (@words) {
240                if( not  $self->{occur}->{$term}  || $num > $self->{occur}->{$term}->{-1} ) {
241                    push @{$self->{occur}->{$term}} , $num;
242                }
243            }
244        }
245
246        $self->weight_update(@words_all);
247
248        for my $term (@{$self->{terms}}) {
249            my $occur = $self->{occur}->{$term};
250            my $size  = scalar( keys %{$self->{occur}} );
251
252            if( $size < 4 && $size > 0 && $occur->{num} * $size * 150 < $idx ) {
253                $self->del_term($term);
254            }
255        }
256    }
257}
258
259sub split_into_keywords {
260    my ($self, $str ) = @_;
261
262    my @terms = @{$self->split_into_terms($str)};
263    my $result = {};
264    for my $word ( @terms ) {
265        $result->{$word} += $self->weight($word);   
266    }
267
268    return $result;
269}
270
271sub split_into_terms {
272    my ($self, $str, $num ) = @_;
273
274    return $self->{trie}->split_into_terms($str,$num);
275}
276
277sub weight_update {
278    my ($self, @words) = @_;
279    my $width = 20;
280
281    for my $term (@words) {
282        $self->{rel}->{$term} = {} unless defined $self->{rel}->{$term};
283    }
284
285    my $size = scalar @words;
286    for my $idx1 (0..($size-$width)) {
287        my $word1 = $words[$idx1];
288       
289        for my $idx2 ( ($idx1+1)..($idx1+$width) ) {
290            $self->{rel}->{$word1}->{num} += 1 if $word1 eq $words[$idx2];
291            $self->{rel}->{$word1}->{sum} += 1;
292        }
293    }
294
295    for my $idx1 ( 0..($width + 1 ) ) {
296        my $word1 = $words[$idx1];
297
298        if( $word1 ) {
299            for my $idx2 ( reverse ( 1..($idx1 -1)) ) {
300                $self->{rel}->{$word1}->{num} += 1 if $word1 eq $words[-$idx2];
301                $self->{rel}->{$word1}->{sum} += 1;
302            }
303        }
304    }
305
306    return;
307}
308
309sub weight {
310    my ($self, $word ) = @_;
311
312    if( not $self->{rel}->{$word} || ($self->{rel}->{$word}->{sum}||0) == 0 ) {
313        return 0;
314    }
315    else {
316        my $num = $self->{rel}->{$word}->{num} || 0;
317        my $sum = $self->{rel}->{$word}->{sum} || 0;
318
319        return $num/($sum*($sum+100));
320    }
321}
322
323sub lines {
324    my ($self, $word ) = @_;
325
326    return   $self->{occur}->{$word} || [];
327}
328
329sub terms {
330    my  $self = shift;
331
332    return ( keys %{$self->{occur}} );
333}
334
335sub add_term {
336    my ( $self, $str ) = @_;
337
338    $self->{occur}->{$str} = qw() unless $self->{occur}->{$str};
339
340    $self->{trie}->add($str);
341    $self->{rel}->{$str}  = {} unless $self->{rel}->{$str};
342
343    return;
344}
345
346sub del_term {
347    my ($self, $str ) = @_;
348
349    my @occur = $self->{occur}->{$str};
350
351    delete $self->{occur}->{$str};
352    delete $self->{trie}->{$str};
353    delete $self->{rel}->{$str};
354
355    my @tmp = $self->split_into_terms($str);
356
357    for my $word (@tmp) {
358        $self->{occur}->{$word} = [ sort { $a <=> $b } uniq ( @{$self->{occur}->{$word}} , @occur ) ];
359        $self->weight_update(@tmp) if( scalar( @tmp ) > 0 );
360    }
361
362    return;
363}
364
3651;
Note: See TracBrowser for help on using the browser.