Changeset 14186 for lang/python
- Timestamp: 06/17/08 23:07:51 (5 months ago)
- Location: lang/python/googlebayes
- Files: 7 added, 2 modified
  - README_ja.txt (added)
  - googlebayes/learn.py (modified) (6 diffs)
  - sample.txt (added)
  - sample_ok.txt (added)
  - sample_spam.txt (added)
  - scripts (added)
  - scripts/gbbayes.py (added)
  - scripts/gblearn.py (added)
  - setup.py (modified) (1 diff)
Legend:
- Unmodified
- Added
- Removed
lang/python/googlebayes/googlebayes/learn.py
r14179 r14186 4 4 5 5 import re 6 import sys 6 7 import cPickle as pickle 7 8 from itertools import chain, groupby, imap … … 18 19 yield buf[i:i+n] 19 20 20 def addcount(dct, tpl):21 def _addcount(dct, tpl): 21 22 u""" 22 23 合計計算用辞書dct に、データ列tpl内の頻度を加算する。 … … 25 26 dct[k] = dct.get(k, 0) + len(list(g)) 26 27 27 def toprob(dct, docnum):28 def _toprob(dct, docnum): 28 29 u""" 29 30 頻度辞書を頻度確率辞書に変換する。 … … 35 36 return ans 36 37 37 def countngramprob(stream):38 def _countngramprob(stream): 38 39 u""" 39 40 bi-gram と、 tri-gramの確率辞書を計算して返します。 … … 51 52 bbuf.extend(set(ngram(line, 2))) 52 53 if linenum % BATCH == 0: 53 addcount(trigram, tbuf)54 addcount(bigram, bbuf)54 _addcount(trigram, tbuf) 55 _addcount(bigram, bbuf) 55 56 tbuf = [] 56 57 bbuf = [] 57 58 else: 58 addcount(trigram, tbuf)59 addcount(bigram, bbuf)60 return toprob(bigram, linenum),toprob(trigram, linenum)59 _addcount(trigram, tbuf) 60 _addcount(bigram, bbuf) 61 return _toprob(bigram, linenum), _toprob(trigram, linenum) 61 62 62 63 RSSURL = "http://blogsearch.google.co.jp/blogsearch_feeds?hl=ja&client=firefox-a&um=1&q=%s&lr=lang_ja&ie=utf-8&num=50&output=rss" 63 64 REMOVEHTML = re.compile(u"<[^>]+>") 64 def create_data_stream(kw):65 def _create_data_stream(kw): 65 66 u""" 66 67 キーワードから、blog検索を行い、その結果の概要文の列を返す。 … … 71 72 yield REMOVEHTML.sub(u"", ent.get("summary", u"")) 72 73 73 def calcbayes(okkw, spamkw):74 def _calcbayes(okkw, spamkw): 74 75 u""" 75 76 OKキーワード、spamキーワードをもとに、blog検索を行い、それぞれの文たちから 76 77 Bayes計算用の確率一覧を作成して返す。 77 78 """ 78 okprob = countngramprob(chain(*(imap(create_data_stream, okkw))))79 spamprob = countngramprob(chain(*(imap(create_data_stream, spamkw))))79 okprob = _countngramprob(chain(*(imap(_create_data_stream, okkw)))) 80 spamprob = _countngramprob(chain(*(imap(_create_data_stream, spamkw)))) 80 81 return okprob, spamprob 81 82 82 def test():83 for line in create_data_stream(u"Python"):84 print line["summary"].encode("utf-8")83 def learn(okwords, spamwords, fname): 84 u""" 85 
学習を行います。学習結果はファイルに保存されます。 85 86 86 def main(fname): 87 引数 88 okwords Unicodeで書かれた、OK語のリスト 89 spamwords Unicodeで書かれた、SPAM語のリスト 90 fname 学習結果を保存するファイル名 91 """ 87 92 fp = file(fname, "w") 88 okprob, spamprob = calcbayes( 89 [u"python", u"C++", u"ruby 開発", u"自然言語処理"], 90 [u"巨乳", u"熟女", u"ハメ撮り", u"オナニー", u"儲かる 情報商材"] 91 ) 93 okprob, spamprob = _calcbayes(okwords, spamwords) 92 94 pickle.dump(okprob, fp) 93 95 pickle.dump(spamprob, fp) 94 96 fp.close() 95 97 98 def _test(fname): 99 learn( 100 [u"python", u"C++", u"ruby 開発", u"自然言語処理"], 101 [u"巨乳", u"熟女", u"ハメ撮り", u"オナニー", u"儲かる 情報商材"], 102 fname 103 ) 104 96 105 if __name__ == "__main__": 97 import optparse 98 99 parser = optparse.OptionParser(u""" 100 学習データを作成します。 101 """) 102 parser.add_option("-o", "--output", dest="output", help=u"出力ファイル名。", default=None) 103 (options, args) = parser.parse_args() 104 105 #test() 106 main(options.output) 107 108 106 _test("sample.dat") -
lang/python/googlebayes/setup.py (r14179 → r14186)

    old  new
     10   10    author="Hiroshi Ayukawa",
     11   11    author_email="ayukawa.hiroshi@gmail.com",
     12       - packages=["googlebayes"]
          12  + packages=["googlebayes"],
          13  + scripts=["scripts/gblearn.py", "scripts/gbbayes.py"]
     13   14    )
     14   15
![(please configure the [header_logo] section in trac.ini)](/share/chrome/site/your_project_logo.png)