annotate spring_roc.py @ 29:41353488926c draft

"planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
author guerler
date Sun, 22 Nov 2020 14:15:24 +0000
parents
children b0e195a47df7
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
29
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
1 #! /usr/bin/env python
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
2 import argparse
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
3 import math
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
4 import random
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
5 from datetime import datetime
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
6
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
7 from matplotlib import pyplot as plt
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
8
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
9
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
10 def getIds(rawIds):
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
11 return rawIds.split("|")
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
12
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
13
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
14 def getCenterId(rawId):
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
15 elements = rawId.split("|")
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
16 if len(elements) > 1:
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
17 return elements[1]
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
18 return rawId
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
19
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
20
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
21 def getOrganism(rawId):
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
22 elements = rawId.split("_")
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
23 return elements[-1]
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
24
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
25
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
26 def getKey(a, b):
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
27 if a > b:
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
28 name = "%s_%s" % (a, b)
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
29 else:
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
30 name = "%s_%s" % (b, a)
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
31 return name
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
32
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
33
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
34 def getPercentage(rate, denominator):
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
35 if denominator > 0:
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
36 return 100.0 * rate / denominator
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
37 return 0.0
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
38
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
39
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
40 def getFilter(filterName):
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
41 print("Loading target organism(s)...")
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
42 filterSets = dict()
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
43 with open(filterName) as filterFile:
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
44 for line in filterFile:
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
45 columns = line.split()
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
46 for colIndex in [0, 1]:
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
47 if colIndex >= len(columns):
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
48 break
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
49 colEntry = columns[colIndex]
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
50 id = getCenterId(colEntry)
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
51 organism = getOrganism(colEntry)
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
52 if organism not in filterSets:
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
53 filterSets[organism] = set()
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
54 filterSets[organism].add(id)
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
55 print("Organism(s) in set: %s." % filterSets.keys())
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
56 return filterSets
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
57
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
58
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
59 def getReference(fileName, filterA=None, filterB=None, minScore=None, aCol=0,
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
60 bCol=1, scoreCol=-1, separator=None,
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
61 skipFirstLine=False, filterValues=list()):
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
62 index = dict()
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
63 count = 0
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
64 with open(fileName) as fp:
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
65 line = fp.readline()
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
66 if skipFirstLine:
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
67 line = fp.readline()
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
68 while line:
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
69 ls = line.split(separator)
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
70 if separator is not None:
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
71 aList = getIds(ls[aCol])
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
72 bList = getIds(ls[bCol])
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
73 else:
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
74 aList = [getCenterId(ls[aCol])]
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
75 bList = [getCenterId(ls[bCol])]
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
76 validEntry = False
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
77 for a in aList:
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
78 for b in bList:
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
79 skip = False
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
80 if a == "-" or b == "-":
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
81 skip = True
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
82 if filterA is not None:
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
83 if a not in filterA and b not in filterA:
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
84 skip = True
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
85 if filterB is not None:
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
86 if a not in filterB and b not in filterB:
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
87 skip = True
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
88 for f in filterValues:
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
89 if len(ls) > f[0]:
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
90 columnEntry = ls[f[0]].lower()
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
91 searchEntry = f[1].lower()
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
92 if columnEntry.find(searchEntry) == -1:
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
93 skip = True
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
94 if not skip:
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
95 name = getKey(a, b)
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
96 if name not in index:
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
97 validEntry = True
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
98 if scoreCol >= 0 and len(ls) > scoreCol:
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
99 score = float(ls[scoreCol])
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
100 skip = False
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
101 if minScore is not None:
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
102 if minScore > score:
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
103 return index, count
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
104 if not skip:
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
105 index[name] = score
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
106 else:
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
107 index[name] = 1.0
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
108 if validEntry:
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
109 count = count + 1
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
110 line = fp.readline()
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
111 return index, count
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
112
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
113
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
114 def getXY(prediction, positive, positiveCount, negative):
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
115 sortedPrediction = sorted(prediction.items(), key=lambda x: x[1],
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
116 reverse=True)
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
117 positiveTotal = positiveCount
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
118 negativeTotal = len(negative)
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
119 x = list([0])
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
120 y = list([0])
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
121 xMax = 0
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
122 topCount = 0
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
123 topMCC = 0.0
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
124 topPrecision = 0.0
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
125 topScore = 0.0
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
126 tp = 0
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
127 fp = 0
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
128 count = 0
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
129 for (name, score) in sortedPrediction:
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
130 found = False
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
131 if name in positive:
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
132 found = True
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
133 tp = tp + 1
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
134 if name in negative:
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
135 found = True
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
136 fp = fp + 1
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
137 precision = 0.0
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
138 if tp > 0 or fp > 0:
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
139 precision = tp / (tp + fp)
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
140 fn = positiveTotal - tp
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
141 tn = negativeTotal - fp
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
142 denom = (tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
143 if denom > 0.0:
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
144 mcc = (tp*tn-fp*fn)/math.sqrt(denom)
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
145 if mcc >= topMCC:
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
146 topMCC = mcc
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
147 topScore = score
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
148 topCount = count
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
149 topPrecision = precision
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
150 if found:
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
151 yValue = getPercentage(tp, tp + fn)
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
152 xValue = getPercentage(fp, fp + tn)
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
153 y.append(yValue)
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
154 x.append(xValue)
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
155 xMax = max(xValue, xMax)
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
156 count = count + 1
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
157 print("Top ranking prediction %s." % str(sortedPrediction[0]))
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
158 print("Total count of prediction set: %s (precision=%1.2f)." %
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
159 (topCount, topPrecision))
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
160 print("Total count of positive set: %s." % len(positive))
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
161 print("Total count of negative set: %s." % len(negative))
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
162 print("Matthews-Correlation-Coefficient: %s at Score >= %s." %
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
163 (round(topMCC, 2), topScore))
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
164 return x, y, xMax
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
165
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
166
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
167 def main(args):
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
168 # load source files
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
169 filterSets = getFilter(args.input)
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
170 filterKeys = list(filterSets.keys())
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
171 filterA = filterSets[filterKeys[0]]
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
172 if len(filterKeys) > 1:
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
173 filterB = filterSets[filterKeys[1]]
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
174 else:
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
175 filterB = filterA
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
176
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
177 # identify biogrid filter options
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
178 filterValues = []
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
179 if args.method:
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
180 filterValues.append([11, args.method])
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
181 if args.experiment:
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
182 filterValues.append([12, args.experiment])
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
183 if args.throughput:
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
184 filterValues.append([17, args.throughput])
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
185
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
186 # process biogrid database
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
187 print("Loading positive set from BioGRID file...")
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
188 positive, positiveCount = getReference(args.biogrid, aCol=23, bCol=26,
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
189 separator="\t", filterA=filterA,
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
190 filterB=filterB, skipFirstLine=True,
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
191 filterValues=filterValues)
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
192
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
193 # rescan biogrid database to identify set of putative interactions
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
194 if filterValues:
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
195 print("Filtered entries by (column, value): %s" % filterValues)
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
196 print("Loading putative set from BioGRID file...")
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
197 putative, putativeCount = getReference(args.biogrid, aCol=23, bCol=26,
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
198 separator="\t", filterA=filterA,
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
199 filterB=filterB,
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
200 skipFirstLine=True)
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
201 print("Found %s." % putativeCount)
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
202 else:
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
203 putative = positive
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
204
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
205 # process prediction file
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
206 print("Loading prediction file...")
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
207 prediction, _ = getReference(args.input, scoreCol=2)
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
208
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
209 # estimate background noise
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
210 print("Estimating background noise...")
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
211 negative = set()
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
212 filterAList = list(filterA)
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
213 filterBList = list(filterB)
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
214 negativeCount = positiveCount
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
215 negativeRequired = negativeCount
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
216 random.seed(datetime.now())
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
217 while negativeRequired > 0:
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
218 nameA = random.choice(filterAList)
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
219 nameB = random.choice(filterBList)
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
220 key = getKey(nameA, nameB)
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
221 if key not in putative and key not in negative:
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
222 negative.add(key)
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
223 negativeRequired = negativeRequired - 1
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
224
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
225 # create plot
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
226 print("Producing plot data...")
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
227 print("Total count in prediction file: %d." % len(prediction))
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
228 print("Total count in positive file: %d." % len(positive))
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
229 plt.ylabel('True Positive Rate (%)')
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
230 plt.xlabel('False Positive Rate (%)')
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
231 title = " vs. ".join(filterSets)
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
232 plt.suptitle(title)
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
233 if filterValues:
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
234 filterAttributes = list(map(lambda x: x[1], filterValues))
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
235 plt.title("BioGRID filters: %s" % filterAttributes, fontsize=10)
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
236 x, y, xMax = getXY(prediction, positive, positiveCount, negative)
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
237 plt.plot(x, y)
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
238 plt.plot([0, xMax], [0, xMax])
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
239 plt.savefig(args.output, format="png")
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
240
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
241
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
242 if __name__ == "__main__":
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
243 parser = argparse.ArgumentParser(description='Create ROC plot.')
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
244 parser.add_argument('-i', '--input', help='Input prediction file.',
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
245 required=True)
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
246 parser.add_argument('-b', '--biogrid', help='BioGRID interaction ' +
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
247 'database file', required=True)
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
248 parser.add_argument('-e', '--experiment', help='Type (physical/genetic)',
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
249 default="", required=False)
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
250 parser.add_argument('-t', '--throughput', help='Throughput (low/high)',
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
251 default="", required=False)
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
252 parser.add_argument('-m', '--method', help='Method e.g. Two-hybrid',
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
253 default="", required=False)
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
254 parser.add_argument('-o', '--output', help='Output (png)', required=True)
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
255 args = parser.parse_args()
41353488926c "planemo upload commit 1c0a60f98e36bccb6d6c85ff82a8d737a811b4d5"
guerler
parents:
diff changeset
256 main(args)