root/lang/python/yoyaku/yoyaku/engine/TopicSegmentationJP.py

Revision 11041, 4.3 kB (checked in by ayu, 4 years ago)
  • Property svn:executable set to *
Line 
1#!/usr/local/Python25/bin/python
2# _*_ coding: utf-8 _*_
3# Copyright (C)  2007 Ayukawa Hiroshi
4from collections import defaultdict
5import re
6import sys
7import MeCab
8import math
9
10from yoyaku.engine import TopicSegmentation
11from yoyaku.engine.HyokiyureData import HYOKIYURE
12   
def sentence_generator(tagger, src, mecabencoding="utf-8"):
    """Yield the MeCab nodes of src one sentence at a time.

    tagger        -- object whose parseToNode(src) returns the first node of
                     a linked list (each node exposes .surface and .next).
    src           -- text to analyse, passed to parseToNode unchanged.
    mecabencoding -- encoding of node.surface, used to encode the sentence
                     terminator it is compared against.

    Each yielded value is a list of consecutive nodes ending with the node
    whose surface is the full stop "。".  Nodes following the last full stop
    are yielded as a final sentence only if at least one of them has a
    non-empty surface.
    """
    MARU = u"。".encode(mecabencoding)
    node = tagger.parseToNode(src)
    nodes = []
    while node:
        nodes.append(node)
        if node.surface == MARU:
            yield nodes
            nodes = []
        node = node.next
    # MeCab terminates the node list with an end-of-sentence marker whose
    # surface is empty.  When the text ends with "。" that marker is the only
    # thing left here; yielding it would produce a phantom empty sentence
    # (an empty term vector and an inflated sentence count downstream).
    if any(rest.surface for rest in nodes):
        yield nodes
24
def stopworddetector(mecabencoding="utf-8"):
    """Build a predicate that decides whether a MeCab node is a usable term.

    mecabencoding -- encoding of the byte strings held in node.surface and
                     node.feature.

    Returns a function hantei(node) -> bool that is True only when the node
      * has a non-empty surface,
      * has a feature string starting with 名詞 (noun), 未知 (unknown) or
        動詞 (verb),
      * is not written purely in hiragana,
      * is not made up solely of digits, "-" and ".", and
      * is not in the stop-word list (a trailing "。" is ignored for this
        comparison).
    """
    # Half-width digits and their full-width counterparts (U+FF10-U+FF19).
    # The full-width forms are written as escapes so that they cannot be
    # folded into ASCII by a normalising editor or transfer.
    NUM_CHAR = re.compile(u"^[0-9\uff10-\uff19\\-\\.]+$")
    CONTENT_POS = tuple(
        pos.encode(mecabencoding) for pos in (u"名詞", u"未知", u"動詞")
    )
    HIRA = re.compile(u"^[ぁ-ん]+$")
    # Parentheses are listed in both half-width and full-width
    # (U+FF08 / U+FF09) form.
    STOPWORD = frozenset([
        u"(", u"\uff08", u")", u"\uff09",
        u"等", u"同", u".", u"/", u"その後",
    ])

    def hantei(node):
        if not node.surface:
            return False
        if not node.feature.startswith(CONTENT_POS):
            return False
        w = node.surface.decode(mecabencoding, "ignore")
        if HIRA.match(w) or NUM_CHAR.match(w):
            return False
        return w.replace(u"。", u"") not in STOPWORD

    return hantei
40
# Matches a string whose first character lies in the kanji range 一..龠.
KANJI = re.compile(u"^[一-龠]")


def gettf(sentencenodes, swdetect, mecab_encoding):
    """Compute weighted term frequencies for one sentence.

    sentencenodes  -- iterable of MeCab nodes (.surface and .feature are
                      byte strings in mecab_encoding).
    swdetect       -- predicate returning a true value for nodes that should
                      be counted.
    mecab_encoding -- encoding used to decode surface and feature text.

    Returns a defaultdict mapping unicode terms to their weight: a verb
    (feature starting with 動詞) adds 0.5 to its term; any other accepted
    node adds 1 to its term and a further 0.3 to every kanji character the
    term contains.
    """
    DOUSHI = u"動詞".encode(mecab_encoding)
    tf = defaultdict(int)
    for node in sentencenodes:
        if not swdetect(node):
            continue
        # The seventh feature field is used as the term (presumably the base
        # form in the IPA dictionary layout -- confirm for other
        # dictionaries).  Fall back to the surface when the field is "*" or
        # when the feature string is too short to contain it.
        fields = node.feature.split(",")
        word = u"*"
        if len(fields) > 6:
            word = fields[6].decode(mecab_encoding, "ignore")
        if word == u"*":
            word = node.surface.decode(mecab_encoding, "ignore")
        # HYOKIYURE maps a word to a replacement form (presumably unifying
        # spelling variants); words without an entry are kept as they are.
        word = HYOKIYURE.get(word, word)
        if node.feature.startswith(DOUSHI):
            tf[word] += 0.5
            continue
        tf[word] += 1
        for ch in word:
            if KANJI.match(ch):
                tf[ch] += 0.3
    return tf
64
65def test(window, me, ie, oe, evaluate_coeff):
66    lines = []
67    tfs = []
68    MARU = u"。".encode(ie)
69    for line in sys.stdin:
70        line = line.strip()
71        if line:
72            if not line.endswith(MARU):
73                line += MARU
74            lines.append(line)
75    body = "".join(lines)
76    tagger = MeCab.Tagger()
77    gen = sentence_generator(tagger, unicode(body, ie, "ignore").encode(me, "ignore"), me)
78    swdetect = stopworddetector(me)
79    lines = []
80    for sentence in gen:
81        tf = gettf(sentence, swdetect, me)
82        line = "".join([x.surface for x in sentence])
83        tfs.append(tf)
84        lines.append(line)
85    df = defaultdict(float)
86    for tf in tfs:
87        for k in tf.keys():
88            df[k] += 1.0
89    n = float(len(tfs))
90    for tf in tfs:
91        for k in tf.keys():
92            tf[k] *= math.log(n/df[k])
93    #for k in sorted(df.keys(), key=lambda x: df[x]):
94    #    print "DF", k.encode(oe, "ignore"), math.log(n/df[k])
95    segs = TopicSegmentation.segmentation(tfs, window, evaluate_coeff)
96    segs.sort()
97    print "="*20
98    print "      RESULT"
99    print "="*20
100    for i, line in enumerate(lines):
101        if i in segs:
102            print "-"*20
103        print unicode(line, me, "ignore").encode(oe, "ignore")
104   
105
if __name__ == "__main__":
    import optparse

    # Usage text shown by --help (Japanese: splits one-sentence-per-line
    # Japanese text read from standard input into topics).
    usage = u"""
    標準入力から入力された、一行一文の日本語文テキストをトピックごとに分割します。
    """
    # (flags, keyword settings) for every command-line option, in the order
    # they are registered.
    option_specs = [
        (("-w", "--window"),
         dict(dest="window", help=u"sharpenmatrix作成時の幅係数", type="int", default=2)),
        (("-e", "--evaluate"),
         dict(dest="c", help=u"分割振幅係数", type="float", default=0.05)),
        (("-m", "--mecab-encoding"),
         dict(dest="mecab", help=u"mecab辞書のエンコーディング default: utf-8", default="utf-8")),
        (("-i", "--input-encoding"),
         dict(dest="input", help=u"入力のエンコーディング default: utf-8", default="utf-8")),
        (("-o", "--output-encoding"),
         dict(dest="output", help=u"出力のエンコーディング default: utf-8", default="utf-8")),
    ]

    parser = optparse.OptionParser(usage)
    for flags, settings in option_specs:
        parser.add_option(*flags, **settings)

    options, args = parser.parse_args()

    test(
        options.window,
        options.mecab,
        options.input,
        options.output,
        options.c,
    )
121
122
Note: See TracBrowser for help on using the browser.