comparison spring_roc.py @ 34:b300ddbbf9d0 draft

"planemo upload commit 0410e2fadc4e9fc1df6010de7b3678154cbdfe62-dirty"
author guerler
date Tue, 24 Nov 2020 17:55:07 +0000
parents 3071750405c9
children 0bcc0a269916
comparison
equal deleted inserted replaced
33:f115fbf3ac63 34:b300ddbbf9d0
1 #! /usr/bin/env python 1 #! /usr/bin/env python
2 import argparse 2 import argparse
3 import math 3 import math
4 import random 4 import random
5 from os.path import isfile 5 from os.path import isfile
6 from datetime import datetime
7 6
8 from matplotlib import pyplot as plt 7 from matplotlib import pyplot as plt
9 8
10 9
11 def getIds(rawIds): 10 def getIds(rawIds):
205 204
206 # process prediction file 205 # process prediction file
207 print("Loading prediction file...") 206 print("Loading prediction file...")
208 prediction, _ = getReference(args.input, scoreCol=2) 207 prediction, _ = getReference(args.input, scoreCol=2)
209 208
210 # get subcellular locations from UniProt export 209 # determine negative set
211 locations = dict() 210 print("Identifying non-interacting pairs...")
212 if isfile(args.locations):
213 regions = list()
214 if args.regions:
215 regions = args.regions.split(",")
216 with open(args.locations) as locFile:
217 for line in locFile:
218 searchKey = "SUBCELLULAR LOCATION"
219 searchPos = line.find(searchKey)
220 if searchPos != -1:
221 uniId = line.split()[0]
222 locStart = searchPos + len(searchKey) + 1
223 locId = line[locStart:].split()[0]
224 if regions:
225 if locId not in regions:
226 continue
227 if uniId in filterA or uniId in filterB:
228 locations[uniId] = locId
229 print("Found %d subcellular locations." % (len(list(locations.keys()))))
230
231 # estimate background noise
232 print("Estimating background noise...")
233 negative = set() 211 negative = set()
234 filterAList = sorted(list(filterA)) 212 if isfile(args.negative):
235 filterBList = sorted(list(filterB)) 213 # load from explicit file
236 negativeRequired = positiveCount 214 with open(args.negative) as file:
237 random.seed(0) 215 for line in file:
238 totalAttempts = int(len(filterAList) * len(filterBList) / 2) 216 cols = line.split()
239 while totalAttempts > 0: 217 nameA = cols[0]
240 totalAttempts = totalAttempts - 1 218 nameB = cols[1]
241 nameA = random.choice(filterAList) 219 key = getKey(nameA, nameB)
242 nameB = random.choice(filterBList) 220 if key not in putative and key not in negative:
243 if locations: 221 negative.add(key)
244 if nameA not in locations or nameB not in locations: 222 else:
245 continue 223 # get subcellular locations from UniProt export
246 if locations[nameA] == locations[nameB]: 224 locations = dict()
247 continue 225 if isfile(args.locations):
248 key = getKey(nameA, nameB) 226 regions = list()
249 if key not in putative and key not in negative: 227 if args.regions:
250 negative.add(key) 228 regions = args.regions.split(",")
251 negativeRequired = negativeRequired - 1 229 with open(args.locations) as locFile:
252 if negativeRequired == 0: 230 for line in locFile:
253 break 231 searchKey = "SUBCELLULAR LOCATION"
232 searchPos = line.find(searchKey)
233 if searchPos != -1:
234 uniId = line.split()[0]
235 locStart = searchPos + len(searchKey) + 1
236 locId = line[locStart:].split()[0]
237 if regions:
238 if locId not in regions:
239 continue
240 if uniId in filterA or uniId in filterB:
241 locations[uniId] = locId
242 print("Found %d subcellular locations." % (len(list(locations.keys()))))
243
244 # randomly sample non-interacting pairs
245 filterAList = sorted(list(filterA))
246 filterBList = sorted(list(filterB))
247 negativeRequired = positiveCount
248 random.seed(0)
249 totalAttempts = int(len(filterAList) * len(filterBList) / 2)
250 while totalAttempts > 0:
251 totalAttempts = totalAttempts - 1
252 nameA = random.choice(filterAList)
253 nameB = random.choice(filterBList)
254 if locations:
255 if nameA not in locations or nameB not in locations:
256 continue
257 if locations[nameA] == locations[nameB]:
258 continue
259 key = getKey(nameA, nameB)
260 if key not in putative and key not in negative:
261 negative.add(key)
262 negativeRequired = negativeRequired - 1
263 if negativeRequired == 0:
264 break
254 265
255 # create plot 266 # create plot
256 print("Producing plot data...") 267 print("Producing plot data...")
257 print("Total count in prediction file: %d." % len(prediction)) 268 print("Total count in prediction file: %d." % len(prediction))
258 print("Total count in positive file: %d." % len(positive)) 269 print("Total count in positive file: %d." % len(positive))
269 plt.savefig(args.output, format="png") 280 plt.savefig(args.output, format="png")
270 281
271 282
272 if __name__ == "__main__": 283 if __name__ == "__main__":
273 parser = argparse.ArgumentParser(description='Create ROC plot.') 284 parser = argparse.ArgumentParser(description='Create ROC plot.')
274 parser.add_argument('-i', '--input', help='Input prediction file.', required=True) 285 parser.add_argument('-i', '--input', help='Input prediction file (2-columns).', required=True)
275 parser.add_argument('-b', '--biogrid', help='BioGRID interaction database file', required=True) 286 parser.add_argument('-b', '--biogrid', help='BioGRID interaction database file', required=True)
276 parser.add_argument('-l', '--locations', help='UniProt export table with subcellular locations', required=False) 287 parser.add_argument('-l', '--locations', help='UniProt export table with subcellular locations', default="", required=False)
277 parser.add_argument('-r', '--regions', help='Comma-separated regions', required=False) 288 parser.add_argument('-r', '--regions', help='Comma-separated regions', required=False)
289 parser.add_argument('-n', '--negative', help='Negative set (2-columns)', default="", required=False)
278 parser.add_argument('-e', '--experiment', help='Type (physical/genetic)', default="", required=False) 290 parser.add_argument('-e', '--experiment', help='Type (physical/genetic)', default="", required=False)
279 parser.add_argument('-t', '--throughput', help='Throughput (low/high)', default="", required=False) 291 parser.add_argument('-t', '--throughput', help='Throughput (low/high)', default="", required=False)
280 parser.add_argument('-m', '--method', help='Method e.g. Two-hybrid', default="", required=False) 292 parser.add_argument('-m', '--method', help='Method e.g. Two-hybrid', default="", required=False)
281 parser.add_argument('-o', '--output', help='Output (png)', required=True) 293 parser.add_argument('-o', '--output', help='Output (png)', required=True)
282 args = parser.parse_args() 294 args = parser.parse_args()