| 1 | #!/usr/local/Python25/bin/python |
|---|
| 2 | # _*_ coding: utf-8 _*_ |
|---|
| 3 | # Copyright (C) 2007 Ayukawa Hiroshi |
|---|
| 4 | from collections import defaultdict |
|---|
| 5 | import re |
|---|
| 6 | import sys |
|---|
| 7 | import MeCab |
|---|
| 8 | import math |
|---|
| 9 | |
|---|
| 10 | from yoyaku.engine import TopicSegmentation |
|---|
| 11 | from yoyaku.engine.HyokiyureData import HYOKIYURE |
|---|
| 12 | |
|---|
def sentence_generator(tagger, src, mecabencoding="utf-8"):
    """Yield the MeCab nodes of ``src`` one sentence at a time.

    ``src`` is handed to ``tagger.parseToNode`` and the resulting linked
    list is followed through each node's ``next`` attribute.  A sentence
    ends at a node whose surface equals the ideographic full stop encoded
    in ``mecabencoding``; that node is the last element of the yielded
    list.  Nodes left over after the last full stop are yielded as a final
    list.

    Args:
        tagger: object exposing ``parseToNode(src)``.
        src: text to analyse, already encoded for the tagger.
        mecabencoding: encoding used to build the full-stop marker that is
            compared against ``node.surface``.

    Yields:
        Lists of nodes, one list per sentence.
    """
    full_stop = u"。".encode(mecabencoding)

    def walk(current):
        # Follow the linked list until a falsy node ends it.
        while current:
            yield current
            current = current.next

    pending = []
    for item in walk(tagger.parseToNode(src)):
        pending.append(item)
        if item.surface != full_stop:
            continue
        yield pending
        pending = []
    if pending:
        yield pending
|---|
| 24 | |
|---|
def stopworddetector(mecabencoding="utf-8"):
    """Build a predicate telling whether a MeCab node is a usable term.

    Args:
        mecabencoding: encoding of ``node.surface`` and ``node.feature``.

    Returns:
        A function ``hantei(node)`` returning ``True`` when all of the
        following hold, ``False`` otherwise:

        * the surface is not empty;
        * the feature string starts with one of the part-of-speech tags
          noun, unknown or verb;
        * the surface is not written purely in hiragana;
        * the surface is not made only of digits, hyphens and periods;
        * the surface (ignoring any ideographic full stop) is not in the
          stop-word list.
    """
    # ASCII and full-width digits.  The full-width forms are written as
    # escapes so they cannot be folded into their ASCII look-alikes again.
    NUM_CHAR = re.compile(u"^[0-9\uff10-\uff19\\-\\.]+$")
    # Part-of-speech prefixes: noun, unknown, verb.
    CONTENT_POS = (
        u"名詞".encode(mecabencoding),
        u"未知".encode(mecabencoding),
        u"動詞".encode(mecabencoding),
    )
    HIRA = re.compile(u"^[ぁ-ん]+$")
    # Half-width and full-width parentheses are both listed.
    STOPWORD = frozenset([
        u"(", u"\uff08", u")", u"\uff09",
        u"等", u"同", u".", u"/", u"その後",
    ])

    def hantei(node):
        if not node.surface:
            return False
        if not node.feature.startswith(CONTENT_POS):
            return False
        w = node.surface.decode(mecabencoding, "ignore")
        if HIRA.match(w) or NUM_CHAR.match(w):
            return False
        return w.replace(u"。", u"") not in STOPWORD

    return hantei
|---|
| 40 | |
|---|
# Matches a string whose first character lies in the range 一-龠.
KANJI = re.compile(u"^[一-龠]")


def gettf(sentencenodes, swdetect, mecab_encoding):
    """Compute weighted term frequencies for one sentence.

    Only nodes accepted by ``swdetect`` contribute.  For such a node the
    term is the 7th comma-separated field of ``node.feature`` (presumably
    the base form in an IPA-style dictionary layout -- confirm); when that
    field is ``*`` or missing, the surface form is used instead.  The term
    is then mapped through ``HYOKIYURE``.

    Weights added per accepted node:

    * 0.5 to the term when the feature starts with the verb tag,
      otherwise 1;
    * 0.3 to every single character of the term matched by ``KANJI``.

    Args:
        sentencenodes: iterable of nodes with ``surface`` and ``feature``.
        swdetect: predicate deciding whether a node is counted.
        mecab_encoding: encoding of ``surface`` and ``feature``.

    Returns:
        A ``defaultdict`` mapping terms (and single characters) to their
        accumulated weight.
    """
    DOUSHI = u"動詞".encode(mecab_encoding)
    tf = defaultdict(int)
    for node in sentencenodes:
        if not swdetect(node):
            continue
        fields = node.feature.split(",")
        word = u"*"
        # Some feature strings may carry fewer than seven fields; treat
        # that like the "*" placeholder rather than raising IndexError.
        if len(fields) > 6:
            word = fields[6].decode(mecab_encoding, "ignore")
        if word == u"*":
            word = node.surface.decode(mecab_encoding, "ignore")
        word = HYOKIYURE.get(word, word)
        if node.feature.startswith(DOUSHI):
            tf[word] += 0.5
        else:
            tf[word] += 1
        for ch in word:
            if KANJI.match(ch):
                tf[ch] += 0.3
    return tf
|---|
| 64 | |
|---|
def test(window, me, ie, oe, evaluate_coeff):
    """Segment the text on standard input by topic and print the result.

    Non-empty stdin lines are stripped, terminated with an ideographic
    full stop if they lack one, concatenated and analysed with MeCab.
    Each sentence gets a term-frequency table from ``gettf``; every weight
    is then multiplied by ``log(n / df)``, where ``n`` is the number of
    sentences and ``df`` the number of sentences containing the term.
    The tables are passed to ``TopicSegmentation.segmentation`` and the
    sentences are printed, with a dashed rule before each sentence whose
    index appears in the returned segments.

    Args:
        window: forwarded to ``TopicSegmentation.segmentation``.
        me: encoding used for text exchanged with MeCab.
        ie: encoding of standard input.
        oe: encoding of the printed sentences.
        evaluate_coeff: forwarded to ``TopicSegmentation.segmentation``.
    """
    full_stop = u"。".encode(ie)
    pieces = []
    for raw in sys.stdin:
        raw = raw.strip()
        if not raw:
            continue
        if not raw.endswith(full_stop):
            raw += full_stop
        pieces.append(raw)
    body = "".join(pieces)

    tagger = MeCab.Tagger()
    # Re-encode from the input encoding to the one the tagger works in.
    mecab_text = body.decode(ie, "ignore").encode(me, "ignore")
    gen = sentence_generator(tagger, mecab_text, me)
    swdetect = stopworddetector(me)

    tfs = []
    lines = []
    for sentence in gen:
        tfs.append(gettf(sentence, swdetect, me))
        lines.append("".join([x.surface for x in sentence]))

    # Document frequency: in how many sentences each term occurs.
    df = defaultdict(float)
    for table in tfs:
        for term in table.keys():
            df[term] += 1.0

    total = float(len(tfs))
    for table in tfs:
        for term in table.keys():
            table[term] *= math.log(total / df[term])

    segs = TopicSegmentation.segmentation(tfs, window, evaluate_coeff)
    segs.sort()

    emit = sys.stdout.write
    banner = "=" * 20
    emit(banner + "\n")
    emit(" RESULT" + "\n")
    emit(banner + "\n")
    for index, text in enumerate(lines):
        if index in segs:
            emit("-" * 20 + "\n")
        emit(text.decode(me, "ignore").encode(oe, "ignore") + "\n")
|---|
| 104 | |
|---|
| 105 | |
|---|
if __name__ == "__main__":
    import optparse

    # Usage text, in English: "Splits Japanese text read from standard
    # input, one sentence per line, into topics."
    usage = u"""
標準入力から入力された、一行一文の日本語文テキストをトピックごとに分割します。
"""
    parser = optparse.OptionParser(usage)

    # (flags, keyword settings) for every command-line option.
    option_table = [
        (("-w", "--window"),
         dict(dest="window", help=u"sharpenmatrix作成時の幅係数",
              type="int", default=2)),
        (("-e", "--evaluate"),
         dict(dest="c", help=u"分割振幅係数",
              type="float", default=0.05)),
        (("-m", "--mecab-encoding"),
         dict(dest="mecab",
              help=u"mecab辞書のエンコーディング default: utf-8",
              default="utf-8")),
        (("-i", "--input-encoding"),
         dict(dest="input",
              help=u"入力のエンコーディング default: utf-8",
              default="utf-8")),
        (("-o", "--output-encoding"),
         dict(dest="output",
              help=u"出力のエンコーディング default: utf-8",
              default="utf-8")),
    ]
    for flags, settings in option_table:
        parser.add_option(*flags, **settings)

    (options, args) = parser.parse_args()

    test(options.window, options.mecab, options.input, options.output,
         options.c)
|---|
| 121 | |
|---|
| 122 | |
|---|