Changeset 12244 for platform/tdiary/filter
- Timestamp:
- 05/23/08 17:24:47 (6 months ago)
- Location:
- platform/tdiary/filter
- Files:
-
- 2 modified
-
plugin/spambayes.rb (modified) (5 diffs)
-
spambayes.rb (modified) (12 diffs)
Legend:
- Unmodified
- Added
- Removed
-
platform/tdiary/filter/plugin/spambayes.rb
r10806 r12244 37 37 @conf = conf 38 38 filter_path = conf.filter_path || "#{PATH}/tdiary/filter" 39 require "#{filter_path}/spambayes" 39 require "#{filter_path}/spambayes" unless ::TDiary::Filter::SpambayesFilter.const_defined?(:Misc) 40 40 41 41 extend ::TDiary::Filter::SpambayesFilter::Misc 42 42 ::TDiary::Filter::SpambayesFilter::Misc.conf = conf 43 bayes_filter 43 44 end 44 45 … … 113 114 @conf[conf_log] = @cgi.params[conf_log][0] || nil 114 115 @conf[conf_mail] = @cgi.params[conf_mail][0] || nil 115 @conf[conf_threshold] = @cgi.params[conf_threshold][0] || nil 116 cpt_ham = (@cgi.params[conf_threshold_ham][0]||threshold_ham).to_f 117 cpt_spam = (@cgi.params[conf_threshold][0]||threshold).to_f 118 if 0<cpt_ham and cpt_ham<=cpt_spam and cpt_spam<1.0 119 @conf[conf_threshold] = cpt_spam 120 @conf[conf_threshold_ham] = cpt_ham 121 end 116 122 prm = @cgi.params[conf_filter][0] || "Plain" 117 123 @conf[conf_filter] ||= "Plain" … … 139 145 <li>#{Res.use_filter_to_referer} : <input type='checkbox' name='#{conf_for_referer}' #{@conf[conf_for_referer] ? "checked='checked'" : ""}> 140 146 <li>#{Res.save_error_log} : <input type='checkbox' name='#{conf_log}' #{@conf[conf_log] ? "checked='checked'" : ""}> 141 <li>#{Res.threshold} : <input type="text" name="#{conf_threshold}" value="#{threshold}"></li>147 <li>#{Res.threshold}: 0 <= Ham < <input type="text" name="#{conf_threshold_ham}" value="#{threshold_ham}"> <= Doubt <= <input type="text" name="#{conf_threshold}" value="#{threshold}"> < Spam <= 1.0</li> 142 148 <li>#{Res.receiver_addr} : <input type="text" name="#{conf_mail}" value="#{@conf[conf_mail]}"></li> 143 149 <li><select name='#{conf_filter}'> … … 452 458 spams = [] 453 459 hams = [] 454 processed = false455 460 @cgi.params.each do |k, v| 456 461 next unless k=~/^r([shd])(.*)$/ 457 processed = true458 462 type = $1 459 463 referer = Referer.from_html($2) … … 468 472 end 469 473 end 470 if processed 471 bayes_filter.save 472 ["ham", "spam", "doubt"].each do |k| 473 size = (@cgi.params[k[0, 1]+"size"][0]||"0").to_i 474 Referer.truncate_list(referer_cache(k), size) 475 end 476 477 PStore.new(referer_corpus).transaction do |db| 478 spams.concat(db['spam']||[]) 479 hams.concat(db['ham']||[]) 480 db["spam"] = spams.uniq 481 db["ham"] = hams.uniq 482 end 474 475 bayes_filter.save 476 ["ham", "spam", "doubt"].each do |k| 477 size = (@cgi.params[k[0, 1]+"size"][0]||"0").to_i 478 Referer.truncate_list(referer_cache(k), size) 479 end 480 481 PStore.new(referer_corpus).transaction do |db| 482 spams.concat(db['spam']||[]) 483 hams.concat(db['ham']||[]) 484 db["spam"] = spams.uniq 485 db["ham"] = hams.uniq 483 486 end 484 487 -
platform/tdiary/filter/spambayes.rb
r5705 r12244 3 3 4 4 require "bayes" 5 require "uri"6 5 7 6 module TDiary::Filter … … 48 47 def conf_mail; "#{PREFIX}.mail"; end 49 48 def conf_threshold; "#{PREFIX}.threshold"; end 49 def conf_threshold_ham; "#{PREFIX}.threshold_ham"; end 50 50 def conf_use; "#{PREFIX}.use"; end 51 51 def conf_log; "#{PREFIX}.log"; end … … 92 92 93 93 def bayes_db 94 "#{@conf.data_path}/bayes.db"94 "#{@conf.data_path}/bayes.db" 95 95 end 96 96 … … 103 103 case @conf[conf_filter] 104 104 when /graham/i 105 @bayes_filter ||= Bayes::PaulGraham.new(bayes_db )105 @bayes_filter ||= Bayes::PaulGraham.new(bayes_db, Bayes::CHARSET::UTF8) 106 106 else 107 @bayes_filter ||= Bayes::PlainBayes.new(bayes_db) 108 end 107 @bayes_filter ||= Bayes::PlainBayes.new(bayes_db, Bayes::CHARSET::UTF8) 108 end 109 convert_to_utf8 unless @bayes_filter.charset==Bayes::CHARSET::UTF8 109 110 @bayes_filter 111 end 112 113 def convert_to_utf8 114 require "bayes/convert" 115 require "kconv" 116 117 @bayes_filter.convert(Bayes::CHARSET::UTF8, Bayes::CHARSET::EUC) 118 @bayes_filter.save 119 comments = [] 120 ["S", "H", "D"].each do |c| 121 comments.concat(Dir["#{bayes_cache}/#{c}*"]) 122 end 123 ["S", "H"].each do |c| 124 comments.concat(Dir["#{corpus_path}/#{c}*"]) 125 end 126 comments.each do |f| 127 Comment.load(f).convert_to_utf8.save(f) 128 end 110 129 end 111 130 112 131 def threshold 113 132 (@conf[conf_threshold]||"0.95").to_f 133 end 134 135 def threshold_ham 136 (@conf[conf_threshold_ham]||"0.05").to_f 114 137 end 115 138 … … 118 141 path 119 142 else 120 @conf.base_url.sub(/\/*$/, '/') + (path||'') 121 end 122 end 123 124 def url2(path=nil) 125 if path && URI.parse(path).absolute? 126 path 127 else 128 base = URI.parse @conf.base_url 129 base.path = base.path.sub(%r{/*$}, '/') + (path || '') 130 base.to_s 143 File.join(@conf.base_url, (path||"")) 131 144 end 132 145 end … … 161 174 @remote_addr = cgi.remote_addr || "" 162 175 d = cgi.params['date'][0] || Time.now.strftime("%Y%m%d") 163 @diary_date = Time::local(*d.scan(/^(\d{4})(\d{2})(\d{2})$/)[0]) + 12*60*60 176 @diary_date = Time::local(*d.scan(/^(\d{4})(\d\d)(\d\d)$/)[0]) + 12*60*60 177 end 178 179 def convert_to_utf8 180 @name = @name.kconv(Kconv::UTF8, Kconv::EUC) 181 @body = @body.kconv(Kconv::UTF8, Kconv::EUC) 182 self 183 end 184 185 def save(filename) 186 open(filename, "w") do |f| 187 f.flock(File::LOCK_SH) 188 f.rewind 189 Marshal.dump(self, f) 190 end 164 191 end 165 192 … … 168 195 end 169 196 197 RE_URL = %r[(?:https?|ftp)://[a-zA-Z0-9;/?:@&=+$,\-_.!~*\'()%]+] 170 198 def token 171 199 r = TokenList.new … … 178 206 r.add_mail_addr(@mail, "M") 179 207 b = @body.dup 180 URI.extract(b, %w[http https ftp]) do |url| 181 r.add_url(url, "U") 208 b.gsub!(RE_URL) do |m| 209 r.add_url(m, "U") 210 "" 182 211 end 183 212 r.add_message(b) … … 269 298 270 299 def split_url 271 begin 272 url = URI.parse(@referer) 273 query = url.query 274 fragment = url.fragment 275 url.query = nil 276 url.fragment = nil 277 base = url.to_s 278 rescue 279 base, query, fragment = @referer.scan(/^(.*?)(?:\?([^#]*?)(?:#(.*))?)?$/)[0] 280 end 281 [base, query, fragment] 300 base, request, anchor = @referer.scan(/^(.*?)(?:\?(.*?)(?:\#(.*))?)?$/)[0] 282 301 end 283 302 … … 410 429 end 411 430 431 def ham?(tokens) 432 e = bayes_filter.estimate(tokens) || (threshold_ham+threshold)/2 433 case 434 when e<threshold_ham 435 true 436 when e>threshold 437 false 438 else 439 nil 440 end 441 end 442 412 443 def comment_filter(diary, comment) 413 444 return false if force_filtering? … … 420 451 ham_url = "Register as ham : #{base_url}confirm_ham" 421 452 422 e = bayes_filter.estimate(data.token) 423 case 424 when e == nil 425 r = false 426 tag = "DOUBT" 427 url = "#{spam_url}\n#{ham_url}" 428 when e>threshold 453 case ham?(data.token) 454 when true 455 r = true 456 tag = "HAM" 457 url = spam_url 458 when false 429 459 r = false 430 460 tag = "SPAM" 431 461 url = ham_url 432 462 else 433 r = true434 tag = " HAM"435 url = spam_url463 r = false 464 tag = "DOUBT" 465 url = "#{spam_url}\n#{ham_url}" 436 466 end 437 467 cn = tag[0,1]+data.cache_name 438 open("#{bayes_cache}/#{cn}", "w") do |f| 439 f.flock(File::LOCK_SH) 440 f.rewind 441 Marshal.dump(data, f) 442 end 468 data.save("#{bayes_cache}/#{cn}") 443 469 url.gsub!(/(\n|\z)/){";comment_id=#{cn}#$1"} 444 470 … … 487 513 referer = Referer.new(referer, ENV["REMOTE_ADDR"]) 488 514 token = referer.token 489 e = bayes_filter.estimate(token) 490 case 491 when e==nil 492 r = false 493 key = "doubt" 494 when e>threshold 515 case ham?(token) 516 when true 517 key = "ham" 518 when false 495 519 r = false 496 520 key = "spam" 497 521 else 498 key = "ham" 522 r = false 523 key = "doubt" 499 524 end 500 525 open(referer_cache(key), "a") do |f|
![(please configure the [header_logo] section in trac.ini)](/share/chrome/site/your_project_logo.png)