Changeset 7884 for lang/perl/Acme-Shukugawa-Atom
- Timestamp: 03/13/08 15:41:34 (9 months ago)
- Location: lang/perl/Acme-Shukugawa-Atom/trunk
- Files: 2 modified
  - lib/Acme/Shukugawa/Atom.pm (modified) (7 diffs)
  - t/01_basic.t (modified) (2 diffs)
Legend:
- Unmodified
- Added
- Removed
-
lang/perl/Acme-Shukugawa-Atom/trunk/lib/Acme/Shukugawa/Atom.pm

```diff
--- lang/perl/Acme-Shukugawa-Atom/trunk/lib/Acme/Shukugawa/Atom.pm (r7882)
+++ lang/perl/Acme-Shukugawa-Atom/trunk/lib/Acme/Shukugawa/Atom.pm (r7884)
@@ -24,7 +24,10 @@
 # Special case handling -- this could be optimized further
 # put it in a sharefile later
-my (@SPECIAL, $EXCEPTION);
+my (@SPECIAL, $EXCEPTION, $RE_SMALL, $RE_SYLLABLE, $RE_NBAR);
 BEGIN
 {
+    $RE_SMALL = decode_utf8("[ャュョッー]");
+    $RE_SYLLABLE = decode_utf8("(?:.$RE_SMALL?)");
+    $RE_NBAR = decode_utf8("^ンー");
     @SPECIAL = (
         '小飼弾|(?i)dankogai|(?i)kogaidan' => 'ガイダンコ',
@@ -32,5 +35,5 @@
         '別に' => 'ジリサワゴネタ',
         '予約した' => 'バミった',
-        '[22][44]時|[00]時' => 'テッペン',
+        '[22][44]時|午前[00]時' => 'テッペン',
         '巨乳|胸(?:の|が)(大きい|でかい|デカイ)' => 'パイオツカイデー',
         '女性|女の人|お姉さん|おねーさん' => 'チャンネー',
@@ -70,16 +73,25 @@
 
     foreach (my $node = $mecab->parse($text); $node; $node = $node->next) {
+        next unless $node->surface;
         my $surface = decode_utf8($node->surface);
-        next unless $surface;
-        if ($surface =~ /^\p{InHiragana}+$/) {
+        my $feature = decode_utf8($node->feature);
+        my ($type, $yomi) = (split(/,/, $feature))[0,8];
+
+        if ($type eq '動詞' && $node->next) {
+            # 助動詞を計算に入れる
+            my $next_feature = decode_utf8($node->next->feature);
+            my ($next_type, $next_yomi) = (split(/,/, $next_feature))[0,8];
+            if ($next_type eq '助動詞') {
+                $yomi .= $next_yomi;
+                $node = $node->next;
+            }
+        }
+
+        if ($type =~ /副詞|助動詞|形容詞|接続詞|助詞/ && $surface =~ /^\p{InHiragana}+$/) {
             $ret .= $surface;
+        } elsif ($yomi) {
+            $ret .= $self->atomize($yomi) || $surface;
         } else {
-            my $feature = decode_utf8($node->feature);
-
-            if (my $yomi = (split(/,/, $feature))[8]) {
-                $ret .= $self->atomize($yomi) || $surface;
-            } else {
-                $ret .= $surface;
-            }
+            $ret .= $surface;
         }
     }
@@ -93,11 +105,8 @@
 # 寿司→シースー
 # ン、が最後だったらひっくり返さない
-my $small = decode_utf8("[ャュョッー]");
-my $syllable = decode_utf8("(?:.$small?)");
-my $nbar = decode_utf8("^ンー");
 sub apply_shisu_rule
 {
     my ($self, $yomi) = @_;
-    return $yomi if $yomi =~ s/^($syllable)($syllable)$/$2ー$1ー/;
+    return $yomi if $yomi =~ s/^($RE_SYLLABLE)($RE_SYLLABLE)$/$2ー$1ー/;
     return;
 }
@@ -109,6 +118,7 @@
     my ($self, $yomi) = @_;
 
-    if ($yomi =~ s/^(${syllable}[$nbar]?)([^$nbar].)$/$2$1/) {
-        $yomi =~ s/([^ー])$/$1ー/;
+    # warn "WAIHA $yomi";
+    if ($yomi =~ s/^(${RE_SYLLABLE}[$RE_NBAR]?)([^$RE_NBAR].)$/$2$1/) {
+        $yomi =~ s/(^.[^ー].*[^ー])$/$1ー/;
         return $yomi;
     }
@@ -116,4 +126,17 @@
 }
 
+# クリビツルール
+# びっくり→クリビツ
+sub apply_kuribitsu_rule
+{
+    my ($self, $yomi) = @_;
+
+    # warn "KURIBITSU $yomi";
+    if ($yomi =~ s/^(..)([^$RE_NBAR]${RE_SYLLABLE}$)/$2$1/) {
+        return $yomi;
+    }
+    return;
+}
+
 sub atomize
 {
@@ -122,17 +145,33 @@
 
     # Length
-    my $length = length($yomi);
-    $length -= ($yomi =~ /$small/g);
+    my $word_length = length($yomi);
+    my $length = $word_length - ($yomi =~ /$RE_SMALL/g);
+    if ($length == 3 && $yomi =~ s/^(${RE_SYLLABLE})ッ/${1}ツ/) {
+        # warn "Special rule!";
+        $length = 4;
+    }
+    my $done = 0;
+
+    # warn "$yomi LENGTH: $length";
     if ($length == 2) {
-        return $self->apply_shisu_rule($yomi);
+        my $tmp = $self->apply_shisu_rule($yomi);
+        if ($tmp) {
+            $yomi = $tmp;
+            $done = 1;
+        }
     }
 
     if ($length == 3) {
-        return $self->apply_waiha_rule($yomi);
-    }
-
-    my $done = 0;
+        my $tmp = $self->apply_waiha_rule($yomi);
+        if ($tmp) {
+            $yomi = $tmp;
+            $done = 1;
+        }
+    }
+
     if ($length == 4) { # 4 character words tend to have special xformation
-        if ($yomi =~ s/^(.ー)(..)$/$2$1/) {
+        my $tmp = $self->apply_kuribitsu_rule($yomi);
+        if ($tmp) {
+            $yomi = $tmp;
             $done = 1;
         }
```
lang/perl/Acme-Shukugawa-Atom/trunk/t/01_basic.t

```diff
--- lang/perl/Acme-Shukugawa-Atom/trunk/t/01_basic.t (r7882)
+++ lang/perl/Acme-Shukugawa-Atom/trunk/t/01_basic.t (r7884)
@@ -10,8 +10,7 @@
 my %data = (
     "六本木の胸の大きいお姉さんがいる店を予約した"
-        => "ギロッポンのパイオツカイデーチャンネーがいるセーミーをバミった",
+        => "ギロッポンのパイオツカイデーチャンネーがルーイーセーミーをバミった",
     "ハワイ" => "ワイハー",
-    "寿司" => "シースー",
-    "銀座" => "ザギン",
+    "銀座で午前0時に寿司行こう" => "ザギンでテッペンにシースーコウイー",
     "狼" => "カミオー",
 #    "鋏" => "サミハー", <- mecabの辞書にない?
@@ -19,5 +18,6 @@
         "チャンバーのチークーはどうしてカイデー?",
     "別にdankogaiはエヌジーというわけではない" =>
-        "ジリサワゴネタガイダンコはジーエヌーというわけではない"
+        "ジリサワゴネタガイダンコはジーエヌというケーワーではない",
+    "びっくり" => "クリビツ",
 );
 
```
![(please configure the [header_logo] section in trac.ini)](/share/chrome/site/your_project_logo.png)