Repository 'msms_extractor'
hg clone https://toolshed.g2.bx.psu.edu/repos/pravs/msms_extractor

Changeset 2:aa944e3a353c (2017-08-03)
Previous changeset 1:f444e529363d (2017-02-16) Next changeset 3:e7c63cfef363 (2017-08-03)
Commit message:
planemo upload
modified:
MSMS_Extractor.py
MSMS_Extractor.xml
b
diff -r f444e529363d -r aa944e3a353c MSMS_Extractor.py
--- a/MSMS_Extractor.py Thu Feb 16 11:56:45 2017 -0500
+++ b/MSMS_Extractor.py Thu Aug 03 13:53:09 2017 -0400
[
@@ -4,6 +4,7 @@
 # University of Minnesota
 #
 #
+#
 
 def main():
     from pyteomics import mzml
@@ -15,9 +16,15 @@
     import pandas as pd
     from operator import itemgetter
     from itertools import groupby
-    if len(sys.argv) >= 5:
+    import random
+    
+    if len(sys.argv) >= 7:
         # Start of Reading Scans from PSM file
         # Creating dictionary of PSM file: key = filename, value = list of scan numbers
+        
+        removeORretain = sys.argv[5].strip()
+        randomScans = int(sys.argv[6].strip())
+        
         ScanFile = sys.argv[2]
         spectrumTitleList = list(pd.read_csv(ScanFile, "\t")['Spectrum Title'])
         scanFileNumber = [[".".join(each.split(".")[:-3]), int(each.split(".")[-2:-1][0])] for each in spectrumTitleList]
@@ -34,7 +41,6 @@
         outPath = sys.argv[3]
         ##outFile = sys.argv[3].split("/")[-1]
         allScanList = []
-        
         # Read all scan numbers using indexedmzML/indexList/index/offset tags
         for k in mzml.read(inputPath).iterfind('indexedmzML/indexList/index/offset'):
             if re.search("scan=(\d+)", k['idRef']):
@@ -44,17 +50,28 @@
         
         fraction_name = sys.argv[4]
         if scanDict.has_key(fraction_name):
-            scan2remove = scanDict[fraction_name]
+            scansInList = scanDict[fraction_name]
         else:
-            scan2remove = []
-        scan2retain = list(set(allScanList) - set(scan2remove))
-        scan2retain.sort()
-        scansRemoved = list(set(allScanList) - set(scan2retain))
-        # scan2retain contains scans that is to be retained
+            scansInList = []
+        scansNotInList = list(set(allScanList) - set(scansInList))
         
+        if removeORretain == "remove":
+            scan2retain = scansNotInList
+            scan2retain.sort()
+            scansRemoved = scansInList
+            # scan2retain contains the scans that are to be retained
+            
+        elif removeORretain == "retain":
+            # Randomly select spectra
+            random_scans = list(map(lambda _: random.choice(scansNotInList), range(randomScans)))
+            
+            scan2retain = random_scans + scansInList
+            scan2retain.sort()
+            scansRemoved = list(set(allScanList) - set(scan2retain))
+            # scan2retain contains the scans that are to be retained
+            
         # Print Stats
         print >> sys.stdout,"Total number of Scan Numbers: %d" % len(list(set(allScanList)))
-        print >> sys.stdout,"Number of Scans to remove: %d" % len(list(set(scan2remove)))
         print >> sys.stdout,"Number of Scans retained: %d" % len(scan2retain)
         print >> sys.stdout,"Number of Scans removed: %d" % len(scansRemoved)
         
b
diff -r f444e529363d -r aa944e3a353c MSMS_Extractor.xml
--- a/MSMS_Extractor.xml Thu Feb 16 11:56:45 2017 -0500
+++ b/MSMS_Extractor.xml Thu Aug 03 13:53:09 2017 -0400
[
@@ -1,11 +1,11 @@
 
-<tool id="MSMS_Extractor" name="MSMS_Extractor" version="1.0.0">
-  <description>Removes scans with identified PSMs from the mzML file(s).</description>
+<tool id="MSMS_Extractor" name="MSMS_Extractor" version="1.1.0">
+  <description>Extract scans based on PSM report from the mzML file(s).</description>
   <requirements>
       <requirement type="package" version="3.0.9016">proteowizard</requirement>
       <requirement type="package" version="3.4">pyteomics</requirement>
   </requirements>
-  <command interpreter="python"><![CDATA[MSMS_Extractor.py $spectrumfile $psmreportfile $output ${spectrumfile.name.rsplit('.',1)[0]}]]></command>
+  <command interpreter="python"><![CDATA[MSMS_Extractor.py $spectrumfile $psmreportfile $output ${spectrumfile.name.rsplit('.',1)[0]} $removeretain.doremoveretain $removeretain.num_random_scans]]></command>
   <inputs>
     <param name="spectrumfile" type="data" format="mzml">
       <label>Input mzML File</label>
@@ -13,6 +13,17 @@
     <param name="psmreportfile" type="data" format="tabular">
       <label>Input PSM Report File</label>
     </param>
+    
+    <conditional name="removeretain">
+        <param name="doremoveretain" type="boolean" truevalue="retain" falsevalue="remove" label="Remove or Retain the given Scans" help="Retain=Yes; Remove=No (default)" />
+        <when value="remove">
+            <param name="num_random_scans" type="hidden" value="0" />
+        </when>
+        <when value="retain">
+            <param name="num_random_scans" type="integer" label="Add N random scans in addition to those in the list" value="0" optional="false" />
+        </when>
+      </conditional>
+      
   </inputs>
 
   <outputs>
@@ -21,6 +32,16 @@
   
 
   <help>
-MSMS_Extractor reads scan numbers from the PSM report (scan numbers with identified PSM) and removes it from the mzML file.
+MSMS_Extractor reads scan numbers from the PSM report (scan numbers with an identified PSM) and gives the option to create a new mzML file, either with those scans or without those scans.
+
+Remove option:
+Creates a new mzml file with all the unidentified scans (removes those that are in the PSM report).
+
+Retain option:
+Creates a new mzML file with only those scans that are present in the PSM report. In addition, it has an option to add N randomly selected scans to the output mzML file.
+
+
+Please note: This tool currently works only with PeptideShaker-generated PSM report files.
+
   </help>
 </tool>