| 1 | package Mezasi::Util; |
|---|
| 2 | use strict; |
|---|
| 3 | use warnings; |
|---|
| 4 | use utf8; |
|---|
| 5 | use base qw/Class::Accessor::Fast/; |
|---|
| 6 | |
|---|
| 7 | use Data::Dumper; |
|---|
| 8 | use List::Util qw/sum/; |
|---|
| 9 | use List::MoreUtils qw/uniq/; |
|---|
| 10 | use boolean qw/:all/; |
|---|
| 11 | use JSON::Syck; |
|---|
| 12 | |
|---|
| 13 | use Mezasi::Trie; |
|---|
| 14 | |
|---|
| 15 | our $MarkovKeySize = 1; |
|---|
| 16 | |
|---|
| 17 | sub markov { |
|---|
| 18 | my ( $self, $src, $keywords, $trie ) = @_; |
|---|
| 19 | |
|---|
| 20 | my $mar = $self->markov_generate( $src , $trie ); |
|---|
| 21 | my $result = $self->markov_select($mar, $keywords ); |
|---|
| 22 | |
|---|
| 23 | return $result; |
|---|
| 24 | } |
|---|
| 25 | |
|---|
| 26 | sub markov_generate { |
|---|
| 27 | my ( $self, $src, $trie ) = @_; |
|---|
| 28 | |
|---|
| 29 | return '' if( scalar( @{$src} ) == 0 ); |
|---|
| 30 | |
|---|
| 31 | my @ary = @{$trie->split_into_terms( join( "\n", @{$src} )."\n", 'true')}; |
|---|
| 32 | my $size = scalar @ary; |
|---|
| 33 | push @ary, grep {$_} map { $ary[$_] } (0..$MarkovKeySize); |
|---|
| 34 | |
|---|
| 35 | my %table; |
|---|
| 36 | for my $idx ( 0..$size - 1 ) { |
|---|
| 37 | my $key = JSON::Syck::Dump([ grep {$_} map { $ary[$_] } ($idx..($idx+$MarkovKeySize - 1)) ]); |
|---|
| 38 | $table{$key} = [] unless $table{$key}; |
|---|
| 39 | push @{$table{$key}},$ary[$idx + $MarkovKeySize ] ; |
|---|
| 40 | } |
|---|
| 41 | my %uniq; |
|---|
| 42 | my %backup; |
|---|
| 43 | |
|---|
| 44 | while( my ($key, $value ) = each %table ) { |
|---|
| 45 | if( scalar( @{$value} ) == 1 ) { |
|---|
| 46 | $uniq{$key} = $value->[0]; |
|---|
| 47 | } |
|---|
| 48 | else { |
|---|
| 49 | # dup の挙動を追うと新規を作ることになる。 |
|---|
| 50 | $backup{$key} = [ @{$table{$key}} ]; |
|---|
| 51 | } |
|---|
| 52 | } |
|---|
| 53 | my $key = JSON::Syck::Dump([ map { $ary[$_] } (0..($MarkovKeySize - 1 )) ]); |
|---|
| 54 | my $result = join('',@{JSON::Syck::Load($key)}); |
|---|
| 55 | for my $count ( 0..10000 ) { |
|---|
| 56 | my $str; |
|---|
| 57 | if( defined $uniq{$key} ) { |
|---|
| 58 | $str = $uniq{$key}; |
|---|
| 59 | } |
|---|
| 60 | else { |
|---|
| 61 | if( $table{$key} && scalar( @{$table{$key}} ) == 0 ) { |
|---|
| 62 | $table{$key} = ( ref $backup{$key} ) ? [@{$backup{$key}}] : []; |
|---|
| 63 | } |
|---|
| 64 | my $idx = rand( ( ref $table{$key} ) ? scalar @{$table{$key}} : 0 ); |
|---|
| 65 | my $str = $table{$key}->[$idx]; |
|---|
| 66 | |
|---|
| 67 | $table{$key}->[$idx] = undef; |
|---|
| 68 | $table{$key} = [ grep { $_ } @{$table{$key}} ]; |
|---|
| 69 | } |
|---|
| 70 | |
|---|
| 71 | $result .= $str || ''; |
|---|
| 72 | $key = JSON::Syck::Load($key); |
|---|
| 73 | push @{$key} , $str ; |
|---|
| 74 | $key = JSON::Syck::Dump([ map { $key->[$_] } (1..$MarkovKeySize ) ]) ; |
|---|
| 75 | } |
|---|
| 76 | |
|---|
| 77 | return $result; |
|---|
| 78 | } |
|---|
| 79 | |
|---|
| 80 | sub markov_split { |
|---|
| 81 | my ($self, $str ) = @_; |
|---|
| 82 | |
|---|
| 83 | my @result; |
|---|
| 84 | while( $str =~ /\A(.{25,}?)([。、.,]+|[?!.,]+[\s ])[ ]*/ ) { |
|---|
| 85 | my $m = $1; |
|---|
| 86 | my $m2 = $2; |
|---|
| 87 | my $post_match = $'; |
|---|
| 88 | if( $m2 ) { |
|---|
| 89 | $m2 =~ s/、/。/m; |
|---|
| 90 | $m2 =~ s/,/./m; |
|---|
| 91 | } |
|---|
| 92 | $m .= $m2; |
|---|
| 93 | push @result, $m; |
|---|
| 94 | $str = $post_match; |
|---|
| 95 | } |
|---|
| 96 | |
|---|
| 97 | if( scalar( @{[split(//,$str)]} ) > 0 ) { |
|---|
| 98 | push @result, $str; |
|---|
| 99 | } |
|---|
| 100 | return \@result; |
|---|
| 101 | } |
|---|
| 102 | |
|---|
| 103 | sub markov_select { |
|---|
| 104 | my ($self, $result, $keywords ) = @_; |
|---|
| 105 | |
|---|
| 106 | my @tmp = split(/\n/, $result) or qw(); |
|---|
| 107 | my @test = map { @{$self->markov_split($_)} } @tmp; |
|---|
| 108 | my @result_ary = uniq map { @{$self->markov_split($_)} } @tmp; |
|---|
| 109 | @result_ary = grep { $_ || $_ !~ /\0/ } @result_ary; |
|---|
| 110 | my %result_hash; |
|---|
| 111 | |
|---|
| 112 | my $trie = Mezasi::Trie->new([ keys %{$keywords}]); |
|---|
| 113 | |
|---|
| 114 | for my $str ( @result_ary ) { |
|---|
| 115 | my @terms = uniq @{$trie->split_into_terms($str)}; |
|---|
| 116 | $result_hash{$str} = ( sum map { $keywords->{$_} } @terms ) || 0; |
|---|
| 117 | } |
|---|
| 118 | |
|---|
| 119 | $result = $self->roulette_select(\%result_hash); |
|---|
| 120 | return $result || ''; |
|---|
| 121 | } |
|---|
| 122 | |
|---|
| 123 | sub roulette_select { |
|---|
| 124 | my ($self, $h ) = @_; |
|---|
| 125 | |
|---|
| 126 | return if( scalar( keys %$h ) == 0 ); |
|---|
| 127 | |
|---|
| 128 | my $sum = sum values %$h; |
|---|
| 129 | |
|---|
| 130 | if( $sum == 0 ) { |
|---|
| 131 | return $self->random_select( [ keys %$h ] ); |
|---|
| 132 | } |
|---|
| 133 | |
|---|
| 134 | my $r = int( rand * $sum ); |
|---|
| 135 | while( my ($key , $value ) = each %$h ) { |
|---|
| 136 | $r -= $value; |
|---|
| 137 | if( $r <= 0 ) { |
|---|
| 138 | return $key ; |
|---|
| 139 | } |
|---|
| 140 | } |
|---|
| 141 | |
|---|
| 142 | return $self->random_select( [ keys %$h ] ); |
|---|
| 143 | } |
|---|
| 144 | |
|---|
| 145 | sub random_select { |
|---|
| 146 | my ($self, $ary ) = @_; |
|---|
| 147 | |
|---|
| 148 | return $ary->[rand(scalar(@{$ary}))]; |
|---|
| 149 | } |
|---|
| 150 | |
|---|
| 151 | sub message_normalize { |
|---|
| 152 | my ($self, $str ) = @_; |
|---|
| 153 | |
|---|
| 154 | $str =~ s/「」//g; |
|---|
| 155 | $str =~ s/()//g; |
|---|
| 156 | $str =~ s/『』//g; |
|---|
| 157 | $str =~ s/\(\)//g; |
|---|
| 158 | |
|---|
| 159 | return $str; |
|---|
| 160 | } |
|---|
| 161 | |
|---|
| 162 | 1; |
|---|