diff msms_extractor.py @ 0:4bef80b09854 draft default tip

"planemo upload commit 498440b547e1feaa6a81764b55ac8208626d70a8"
author galaxyp
date Tue, 29 Oct 2019 11:09:46 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/msms_extractor.py	Tue Oct 29 11:09:46 2019 -0400
@@ -0,0 +1,148 @@
+# 
+# Developed by Praveen Kumar
+# Galaxy-P Team (Griffin's Lab)
+# University of Minnesota
+#
+#
+#
+
+
+from pyteomics import mzml
+import os
+import sys
+import shutil
+import subprocess
+import re
+import pandas as pd
+from operator import itemgetter
+from itertools import groupby
+import random
+import argparse
+
+def main():
+    if len(sys.argv) >= 7:
+        parser = argparse.ArgumentParser()
+        parser.add_argument("msms", help="mzML File")
+        parser.add_argument("psm", help="PSM Report File")
+        parser.add_argument("out", help="Output filename")
+        parser.add_argument("filestring", help="MSMS File string as identifier")
+        parser.add_argument("remove_retain", help="Remove scans reported in the PSM report")
+        parser.add_argument("random", help="Random MSMS scans used with to use with --retain (default=0)", default=0)
+        args = parser.parse_args()
+        # Start of Reading Scans from PSM file
+        # Creating dictionary of PSM file: key = filename key = list of scan numbers
+        
+        # removeORretain = sys.argv[5].strip()
+        # randomScans = int(sys.argv[6].strip())
+        
+        removeORretain = args.remove_retain
+        randomScans = int(args.random)
+        
+        
+        # ScanFile = sys.argv[2]
+        ScanFile = args.psm
+        spectrumTitleList = list(pd.read_csv(ScanFile, "\t")['Spectrum Title'])
+        scanFileNumber = [[".".join(each.split(".")[:-3]), int(each.split(".")[-2:-1][0])] for each in spectrumTitleList]
+        scanDict = {}
+        for each in scanFileNumber:
+            if each[0] in scanDict.keys():
+                scanDict[each[0]].append(int(each[1]))
+            else:
+                scanDict[each[0]] = [int(each[1])]
+        # End of Reading Scans from PSM file
+        
+        # inputPath = sys.argv[1]
+        inputPath = args.msms
+        ##outPath = "/".join(sys.argv[3].split("/")[:-1])
+        # outPath = sys.argv[3]
+        outPath = args.out
+        ##outFile = sys.argv[3].split("/")[-1]
+        allScanList = []
+        # Read all scan numbers using indexedmzML/indexList/index/offset tags
+        for k in mzml.read(inputPath).iterfind('indexedmzML/indexList/index/offset'):
+            if re.search("scan=(\d+)", k['idRef']):
+                a = re.search("scan=(\d+)", k['idRef'])
+                allScanList.append(int(a.group(1)))
+        allScanList = list(set(allScanList))
+        # End of Reading mzML file
+        # fraction_name = sys.argv[4]
+        fraction_name = args.filestring
+        if fraction_name in scanDict.keys():
+            scansInList = scanDict[fraction_name]
+        else:
+            scansInList = []
+        scansNotInList = list(set(allScanList) - set(scansInList))
+        flag = 0
+        if removeORretain == "remove":
+            scan2retain = scansNotInList
+            scan2retain = list(set(scan2retain))
+            scan2retain.sort()
+            scansRemoved = scansInList
+            # scan2retain contains scans that is to be retained
+        elif removeORretain == "retain" and randomScans < len(scansNotInList):
+            # Randomly select spectra
+            random_scans = random.sample(scansNotInList, randomScans)
+            
+            scan2retain = random_scans + scansInList
+            scan2retain = list(set(scan2retain))
+            scan2retain.sort()
+            scansRemoved = list(set(allScanList) - set(scan2retain))
+            # scan2retain contains scans that is to be retained
+        else:
+            flag = 1
+            
+        if flag == 1:
+            scan2retain = scansInList
+            scan2retain = list(set(scan2retain))
+            scan2retain.sort()
+            scansRemoved = list(set(allScanList) - set(scan2retain))
+            
+            # scan2retain contains scans that is to be retained
+            print("ERROR: Number of Random Scans queried is more than available. The result has provided zero random scans.", file=sys.stdout)
+            print("Number of available scans for random selection: " + str(len(scansNotInList)), file=sys.stdout)
+            print("Try a number less than the available number. Thanks!!", file=sys.stdout)
+            print("Number of Scans retained: " + str(len(scan2retain)), file=sys.stdout)
+        else:
+            # Print Stats
+            print("Total number of Scan Numbers: " + str(len(list(set(allScanList)))), file=sys.stdout)
+            print("Number of Scans retained: " + str(len(scan2retain)), file=sys.stdout)
+            print("Number of Scans removed: " + str(len(scansRemoved)), file=sys.stdout)
+            
+        
+        # Identifying groups of continuous numbers in the scan2retain and creating scanString
+        scanString = ""
+        for a, b in groupby(enumerate(scan2retain), lambda x:x[1]-x[0]):
+            x = list(map(itemgetter(1), b))
+            scanString = scanString + "["+str(x[0])+","+str(x[-1])+"] "
+        # end identifying
+        # start create filter file
+        filter_file = open("filter.txt", "w")
+        filter_file.write("filter=scanNumber %s\n" % scanString)
+        filter_file.close()
+        # end create filter file 
+    
+        # Prepare command for msconvert
+        inputFile = fraction_name+".mzML"
+        os.symlink(inputPath,inputFile)
+        outFile = "filtered_"+fraction_name+".mzML"
+        # msconvert_command = "msconvert " + inputFile + " --filter " + "\"scanNumber " + scanString + " \" " + " --outfile " + outFile + " --mzML --zlib"
+        msconvert_command = "msconvert " + inputFile + " -c filter.txt " + " --outfile " + outFile + " --mzML --zlib"
+        
+        
+        # Run msconvert
+        try:
+            subprocess.check_output(msconvert_command, stderr=subprocess.STDOUT, shell=True)
+        except subprocess.CalledProcessError as e:
+            sys.stderr.write( "msconvert resulted in error: %s: %s" % ( e.returncode, e.output ))
+            sys.exit(e.returncode)
+        # Copy output to 
+        shutil.copyfile(outFile, outPath)
+    else:
+        print("Please contact the admin. Number of inputs are not sufficient to run the program.\n")
+
+if __name__ == "__main__":
+    main()
+    
+    
+    
+