diff MSMS_Extractor.py @ 2:aa944e3a353c draft

planemo upload
author pravs
date Thu, 03 Aug 2017 13:53:09 -0400
parents 093015b1b904
children c2f8e3164537
line wrap: on
line diff
--- a/MSMS_Extractor.py	Thu Feb 16 11:56:45 2017 -0500
+++ b/MSMS_Extractor.py	Thu Aug 03 13:53:09 2017 -0400
@@ -4,6 +4,7 @@
 # University of Minnesota
 #
 #
+#
 
 def main():
     from pyteomics import mzml
@@ -15,9 +16,15 @@
     import pandas as pd
     from operator import itemgetter
     from itertools import groupby
-    if len(sys.argv) >= 5:
+    import random
+    
+    if len(sys.argv) >= 7:
         # Start of Reading Scans from PSM file
         # Creating dictionary of PSM file: key = filename key = list of scan numbers
+        
+        removeORretain = sys.argv[5].strip()
+        randomScans = int(sys.argv[6].strip())
+        
         ScanFile = sys.argv[2]
         spectrumTitleList = list(pd.read_csv(ScanFile, "\t")['Spectrum Title'])
         scanFileNumber = [[".".join(each.split(".")[:-3]), int(each.split(".")[-2:-1][0])] for each in spectrumTitleList]
@@ -34,7 +41,6 @@
         outPath = sys.argv[3]
         ##outFile = sys.argv[3].split("/")[-1]
         allScanList = []
-        
         # Read all scan numbers using indexedmzML/indexList/index/offset tags
         for k in mzml.read(inputPath).iterfind('indexedmzML/indexList/index/offset'):
             if re.search("scan=(\d+)", k['idRef']):
@@ -44,17 +50,28 @@
         
         fraction_name = sys.argv[4]
         if scanDict.has_key(fraction_name):
-            scan2remove = scanDict[fraction_name]
+            scansInList = scanDict[fraction_name]
         else:
-            scan2remove = []
-        scan2retain = list(set(allScanList) - set(scan2remove))
-        scan2retain.sort()
-        scansRemoved = list(set(allScanList) - set(scan2retain))
-        # scan2retain contains scans that is to be retained
+            scansInList = []
+        scansNotInList = list(set(allScanList) - set(scansInList))
         
+        if removeORretain == "remove":
+            scan2retain = scansNotInList
+            scan2retain.sort()
+            scansRemoved = scansInList
+            # scan2retain contains the scans that are to be retained
+            
+        elif removeORretain == "retain":
+            # Randomly select spectra (drawn with replacement, so duplicates are possible)
+            random_scans = list(map(lambda _: random.choice(scansNotInList), range(randomScans)))
+            
+            scan2retain = random_scans + scansInList
+            scan2retain.sort()
+            scansRemoved = list(set(allScanList) - set(scan2retain))
+            # scan2retain contains the scans that are to be retained
+            
         # Print Stats
         print >> sys.stdout,"Total number of Scan Numbers: %d" % len(list(set(allScanList)))
-        print >> sys.stdout,"Number of Scans to remove: %d" % len(list(set(scan2remove)))
         print >> sys.stdout,"Number of Scans retained: %d" % len(scan2retain)
         print >> sys.stdout,"Number of Scans removed: %d" % len(scansRemoved)