| 14 | | |
| 15 | | def printmatrix(name, mat): |
| 16 | | return |
| 17 | | print "-"*20 |
| 18 | | print name |
| 19 | | for i in range(len(mat)): |
| 20 | | for j in range(len(mat)): |
| 21 | | print "%.03f" % mat[i][j], |
| 22 | | print |
| 23 | | print "-"*20 |
| 24 | | |
| 25 | | def tfmatrix(tfs): |
| 26 | | ans = defaultdict(lambda : defaultdict(float)) |
| 27 | | tflen = [math.sqrt(sum([x*x for x in tf.values()])) for tf in tfs] |
| 28 | | for i in range(len(tfs)): |
| 29 | | for j in range(len(tfs)): |
| 30 | | if i < j: |
| 31 | | ans[i][j] = sum([tfs[i][x]*tfs[j][x] for x in set(tfs[i].keys())&set(tfs[j].keys())]) / (tflen[i]*tflen[j]) |
| 32 | | elif i > j: |
| 33 | | ans[i][j] = ans[j][i] |
| 34 | | else: |
| 35 | | ans[i][i] = 1.0 |
| 36 | | printmatrix("tfmatrix", ans) |
| 37 | | return ans |
| 38 | | |
| 39 | | def sharpen_matrix(matrix, window): |
| 40 | | size = len(matrix) |
| 41 | | newmatrix = defaultdict(lambda : defaultdict(float)) |
| 42 | | for i in range(size): |
| 43 | | for j in range(size): |
| 44 | | if i <= j: |
| 45 | | lx = i - window |
| 46 | | ux = i + window |
| 47 | | ly = j - window |
| 48 | | uy = j + window |
| 49 | | if lx < 0: lx = 0 |
| 50 | | if ux >= size: ux = size-1 |
| 51 | | if ly < 0: ly = 0 |
| 52 | | if uy >= size: uy = size-1 |
| 53 | | area = (ux - lx) * (uy - ly) |
| 54 | | num = 0 |
| 55 | | for x in range(lx, ux+1): |
| 56 | | for y in range(ly, uy+1): |
| 57 | | if x != i and y != j and matrix[x][y] < matrix[i][j]: |
| 58 | | num += 1 |
| 59 | | newmatrix[i][j] = num / float(area) |
| 60 | | if i > j: |
| 61 | | newmatrix[i][j] = newmatrix[j][i] |
| 62 | | printmatrix("sharpen_matrix", newmatrix) |
| 63 | | return newmatrix |
| 64 | | |
| 65 | | def precluster_matrix(r): |
| 66 | | s = defaultdict(lambda:defaultdict(float)) |
| 67 | | size = len(r) |
| 68 | | for i in range(size): |
| 69 | | s[i][i] = r[i][i] |
| 70 | | for i in range(size-1): |
| 71 | | s[i+1][i] = 2*r[i+1][i]+s[i][i]+s[i+1][i+1] |
| 72 | | s[i][i+1] = s[i+1][i] |
| 73 | | for j in range(2,size): |
| 74 | | for i in range(size-j): |
| 75 | | s[i+j][i] = 2*r[i+j][i]+s[i+j-1][i]+s[i+j][i+1]-s[i+j-1][i+1] |
| 76 | | s[i][i+j] = s[i+j][i] |
| 77 | | printmatrix("precluster_matrix", s) |
| 78 | | return s |
| 79 | | |
| 80 | | def density_of_submatrix(mat, sep): |
| 81 | | w = 0.0 |
| 82 | | a = 0.0 |
| 83 | | for i,j in zip(sep[:-1], sep[1:]): |
| 84 | | w += mat[i][j-1] |
| 85 | | a += (j-i)**2 |
| 86 | | return w / float(a) |
| 87 | | |
| 88 | | def split_matrix_rec(mat): |
| 89 | | MAX_TOPIC = 50 |
| 90 | | size = len(mat) |
| 91 | | D = [] |
| 92 | | ssep = [0, size] |
| 93 | | orgssep = [] |
| 94 | | D.append(mat[0][size-1]/float(size**2)) |
| 95 | | c = 1.2 |
| 96 | | def evaluate(s): |
| 97 | | a = sum(s) / float(len(s)) |
| 98 | | v = sum([(x-a)**2 for x in s])/float(len(s)) |
| 99 | | ans = a+c*math.sqrt(v) |
| 100 | | #print "evaluate", ans |
| 101 | | return ans |
| 102 | | dd = [D[-1]] |
| 103 | | while len(ssep) < size: |
| 104 | | #print "dd", dd |
| 105 | | #print "D", D |
| 106 | | #print "ssep", ssep |
| 107 | | maxp = 0 |
| 108 | | maxd = 0 |
| 109 | | maxsep = None |
| 110 | | for i in range(size): |
| 111 | | if i in ssep: continue |
| 112 | | tsep = ssep + [i] |
| 113 | | tsep.sort() |
| 114 | | d = density_of_submatrix(mat, tsep) |
| 115 | | #print "d",d |
| 116 | | if d > maxd: |
| 117 | | #print "max found!", d, i |
| 118 | | maxd = d |
| 119 | | maxp = i |
| 120 | | #print "splitting at ", maxp |
| 121 | | ssep.append(maxp) |
| 122 | | ssep.sort() |
| 123 | | orgssep.append(maxp) |
| 124 | | d = density_of_submatrix(mat, ssep) |
| 125 | | D.append(d) |
| 126 | | dd.append(d-D[-2]) |
| 127 | | eva = evaluate(dd) |
| 128 | | #print "eva", eva |
| 129 | | b = len(list(takewhile(lambda x: x >= eva, dd))) |
| 130 | | #print "dd", dd |
| 131 | | #print "D", D |
| 132 | | #print "ssep", ssep |
| 133 | | #print "orgssep", orgssep |
| 134 | | #print "orgssep", orgssep[:b] |
| 135 | | return orgssep[:b] |
| 136 | | |
| 137 | | def segmentation(tfs, sharpen_window=5): |
| 138 | | matrix = precluster_matrix(sharpen_matrix(tfmatrix(tfs), window=sharpen_window)) |
| 139 | | return split_matrix_rec(matrix) |