comparison msms_extractor.py @ 0:4bef80b09854 draft default tip

"planemo upload commit 498440b547e1feaa6a81764b55ac8208626d70a8"
author galaxyp
date Tue, 29 Oct 2019 11:09:46 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4bef80b09854
1 #
2 # Developed by Praveen Kumar
3 # Galaxy-P Team (Griffin's Lab)
4 # University of Minnesota
5 #
6 #
7 #
8
9
10 from pyteomics import mzml
11 import os
12 import sys
13 import shutil
14 import subprocess
15 import re
16 import pandas as pd
17 from operator import itemgetter
18 from itertools import groupby
19 import random
20 import argparse
21
22 def main():
23 if len(sys.argv) >= 7:
24 parser = argparse.ArgumentParser()
25 parser.add_argument("msms", help="mzML File")
26 parser.add_argument("psm", help="PSM Report File")
27 parser.add_argument("out", help="Output filename")
28 parser.add_argument("filestring", help="MSMS File string as identifier")
29 parser.add_argument("remove_retain", help="Remove scans reported in the PSM report")
30 parser.add_argument("random", help="Random MSMS scans used with to use with --retain (default=0)", default=0)
31 args = parser.parse_args()
32 # Start of Reading Scans from PSM file
33 # Creating dictionary of PSM file: key = filename key = list of scan numbers
34
35 # removeORretain = sys.argv[5].strip()
36 # randomScans = int(sys.argv[6].strip())
37
38 removeORretain = args.remove_retain
39 randomScans = int(args.random)
40
41
42 # ScanFile = sys.argv[2]
43 ScanFile = args.psm
44 spectrumTitleList = list(pd.read_csv(ScanFile, "\t")['Spectrum Title'])
45 scanFileNumber = [[".".join(each.split(".")[:-3]), int(each.split(".")[-2:-1][0])] for each in spectrumTitleList]
46 scanDict = {}
47 for each in scanFileNumber:
48 if each[0] in scanDict.keys():
49 scanDict[each[0]].append(int(each[1]))
50 else:
51 scanDict[each[0]] = [int(each[1])]
52 # End of Reading Scans from PSM file
53
54 # inputPath = sys.argv[1]
55 inputPath = args.msms
56 ##outPath = "/".join(sys.argv[3].split("/")[:-1])
57 # outPath = sys.argv[3]
58 outPath = args.out
59 ##outFile = sys.argv[3].split("/")[-1]
60 allScanList = []
61 # Read all scan numbers using indexedmzML/indexList/index/offset tags
62 for k in mzml.read(inputPath).iterfind('indexedmzML/indexList/index/offset'):
63 if re.search("scan=(\d+)", k['idRef']):
64 a = re.search("scan=(\d+)", k['idRef'])
65 allScanList.append(int(a.group(1)))
66 allScanList = list(set(allScanList))
67 # End of Reading mzML file
68 # fraction_name = sys.argv[4]
69 fraction_name = args.filestring
70 if fraction_name in scanDict.keys():
71 scansInList = scanDict[fraction_name]
72 else:
73 scansInList = []
74 scansNotInList = list(set(allScanList) - set(scansInList))
75 flag = 0
76 if removeORretain == "remove":
77 scan2retain = scansNotInList
78 scan2retain = list(set(scan2retain))
79 scan2retain.sort()
80 scansRemoved = scansInList
81 # scan2retain contains scans that is to be retained
82 elif removeORretain == "retain" and randomScans < len(scansNotInList):
83 # Randomly select spectra
84 random_scans = random.sample(scansNotInList, randomScans)
85
86 scan2retain = random_scans + scansInList
87 scan2retain = list(set(scan2retain))
88 scan2retain.sort()
89 scansRemoved = list(set(allScanList) - set(scan2retain))
90 # scan2retain contains scans that is to be retained
91 else:
92 flag = 1
93
94 if flag == 1:
95 scan2retain = scansInList
96 scan2retain = list(set(scan2retain))
97 scan2retain.sort()
98 scansRemoved = list(set(allScanList) - set(scan2retain))
99
100 # scan2retain contains scans that is to be retained
101 print("ERROR: Number of Random Scans queried is more than available. The result has provided zero random scans.", file=sys.stdout)
102 print("Number of available scans for random selection: " + str(len(scansNotInList)), file=sys.stdout)
103 print("Try a number less than the available number. Thanks!!", file=sys.stdout)
104 print("Number of Scans retained: " + str(len(scan2retain)), file=sys.stdout)
105 else:
106 # Print Stats
107 print("Total number of Scan Numbers: " + str(len(list(set(allScanList)))), file=sys.stdout)
108 print("Number of Scans retained: " + str(len(scan2retain)), file=sys.stdout)
109 print("Number of Scans removed: " + str(len(scansRemoved)), file=sys.stdout)
110
111
112 # Identifying groups of continuous numbers in the scan2retain and creating scanString
113 scanString = ""
114 for a, b in groupby(enumerate(scan2retain), lambda x:x[1]-x[0]):
115 x = list(map(itemgetter(1), b))
116 scanString = scanString + "["+str(x[0])+","+str(x[-1])+"] "
117 # end identifying
118 # start create filter file
119 filter_file = open("filter.txt", "w")
120 filter_file.write("filter=scanNumber %s\n" % scanString)
121 filter_file.close()
122 # end create filter file
123
124 # Prepare command for msconvert
125 inputFile = fraction_name+".mzML"
126 os.symlink(inputPath,inputFile)
127 outFile = "filtered_"+fraction_name+".mzML"
128 # msconvert_command = "msconvert " + inputFile + " --filter " + "\"scanNumber " + scanString + " \" " + " --outfile " + outFile + " --mzML --zlib"
129 msconvert_command = "msconvert " + inputFile + " -c filter.txt " + " --outfile " + outFile + " --mzML --zlib"
130
131
132 # Run msconvert
133 try:
134 subprocess.check_output(msconvert_command, stderr=subprocess.STDOUT, shell=True)
135 except subprocess.CalledProcessError as e:
136 sys.stderr.write( "msconvert resulted in error: %s: %s" % ( e.returncode, e.output ))
137 sys.exit(e.returncode)
138 # Copy output to
139 shutil.copyfile(outFile, outPath)
140 else:
141 print("Please contact the admin. Number of inputs are not sufficient to run the program.\n")
142
143 if __name__ == "__main__":
144 main()
145
146
147
148