diff spring_roc.py @ 30:b0e195a47df7 draft

"planemo upload commit b306c715d8284c097220bc5c8461399fdc05eac7"
author guerler
date Tue, 24 Nov 2020 14:02:08 +0000
parents 41353488926c
children 3071750405c9
line wrap: on
line diff
--- a/spring_roc.py	Sun Nov 22 14:15:24 2020 +0000
+++ b/spring_roc.py	Tue Nov 24 14:02:08 2020 +0000
@@ -2,6 +2,7 @@
 import argparse
 import math
 import random
+from os.path import isfile
 from datetime import datetime
 
 from matplotlib import pyplot as plt
@@ -206,21 +207,45 @@
     print("Loading prediction file...")
     prediction, _ = getReference(args.input, scoreCol=2)
 
+    # get subcellular locations from UniProt export
+    locations = dict()
+    if isfile(args.locations):
+        with open(args.locations) as locFile:
+            for line in locFile:
+                searchKey = "SUBCELLULAR LOCATION"
+                searchPos = line.find(searchKey)
+                if searchPos != -1:
+                    uniId = line.split()[0]
+                    locStart = searchPos + len(searchKey) + 1
+                    locId = line[locStart:].split()[0]
+                    if locId in ["Nucleus", "Membrane", "Cytoplasm"]:
+                        if uniId in filterA and uniId in filterB:
+                            locations[uniId] = locId
+        print("Found %d subcellular locations." % (len(list(locations.keys()))))
+
     # estimate background noise
     print("Estimating background noise...")
     negative = set()
-    filterAList = list(filterA)
-    filterBList = list(filterB)
-    negativeCount = positiveCount
-    negativeRequired = negativeCount
-    random.seed(datetime.now())
-    while negativeRequired > 0:
+    filterAList = sorted(list(filterA))
+    filterBList = sorted(list(filterB))
+    negativeRequired = positiveCount
+    random.seed(0)
+    totalAttempts = int(len(filterAList) * len(filterBList) / 2)
+    while totalAttempts > 0:
+        totalAttempts = totalAttempts - 1
         nameA = random.choice(filterAList)
         nameB = random.choice(filterBList)
+        if locations:
+            if nameA not in locations or nameB not in locations:
+                continue
+            if locations[nameA] == locations[nameB]:
+                continue
         key = getKey(nameA, nameB)
         if key not in putative and key not in negative:
             negative.add(key)
             negativeRequired = negativeRequired - 1
+            if negativeRequired == 0:
+                break
 
     # create plot
     print("Producing plot data...")
@@ -241,16 +266,12 @@
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description='Create ROC plot.')
-    parser.add_argument('-i', '--input', help='Input prediction file.',
-                        required=True)
-    parser.add_argument('-b', '--biogrid', help='BioGRID interaction ' +
-                        'database file', required=True)
-    parser.add_argument('-e', '--experiment', help='Type (physical/genetic)',
-                        default="", required=False)
-    parser.add_argument('-t', '--throughput', help='Throughput (low/high)',
-                        default="", required=False)
-    parser.add_argument('-m', '--method', help='Method e.g. Two-hybrid',
-                        default="", required=False)
+    parser.add_argument('-i', '--input', help='Input prediction file.', required=True)
+    parser.add_argument('-b', '--biogrid', help='BioGRID interaction database file', required=True)
+    parser.add_argument('-l', '--locations', help='UniProt export table with subcellular locations', default="", required=False)  # "" rather than None: main() passes this to isfile()
+    parser.add_argument('-e', '--experiment', help='Type (physical/genetic)', default="", required=False)
+    parser.add_argument('-t', '--throughput', help='Throughput (low/high)', default="", required=False)
+    parser.add_argument('-m', '--method', help='Method e.g. Two-hybrid', default="", required=False)
     parser.add_argument('-o', '--output', help='Output (png)', required=True)
     args = parser.parse_args()
     main(args)