0
|
1 #
|
|
2 # Developed by Praveen Kumar
|
|
3 # Galaxy-P Team (Griffin's Lab)
|
|
4 # University of Minnesota
|
|
5 #
|
|
6 #
|
2
|
7 #
|
0
|
8
|
|
9 def main():
|
|
10 from pyteomics import mzml
|
|
11 import os
|
|
12 import sys
|
|
13 import shutil
|
|
14 import subprocess
|
|
15 import re
|
|
16 import pandas as pd
|
|
17 from operator import itemgetter
|
|
18 from itertools import groupby
|
2
|
19 import random
|
|
20
|
|
21 if len(sys.argv) >= 7:
|
0
|
22 # Start of Reading Scans from PSM file
|
|
23 # Creating dictionary of PSM file: key = filename key = list of scan numbers
|
2
|
24
|
|
25 removeORretain = sys.argv[5].strip()
|
|
26 randomScans = int(sys.argv[6].strip())
|
|
27
|
0
|
28 ScanFile = sys.argv[2]
|
|
29 spectrumTitleList = list(pd.read_csv(ScanFile, "\t")['Spectrum Title'])
|
|
30 scanFileNumber = [[".".join(each.split(".")[:-3]), int(each.split(".")[-2:-1][0])] for each in spectrumTitleList]
|
|
31 scanDict = {}
|
|
32 for each in scanFileNumber:
|
|
33 if scanDict.has_key(each[0]):
|
|
34 scanDict[each[0]].append(int(each[1]))
|
|
35 else:
|
|
36 scanDict[each[0]] = [int(each[1])]
|
|
37 # End of Reading Scans from PSM file
|
|
38
|
|
39 inputPath = sys.argv[1]
|
|
40 ##outPath = "/".join(sys.argv[3].split("/")[:-1])
|
|
41 outPath = sys.argv[3]
|
|
42 ##outFile = sys.argv[3].split("/")[-1]
|
|
43 allScanList = []
|
|
44 # Read all scan numbers using indexedmzML/indexList/index/offset tags
|
|
45 for k in mzml.read(inputPath).iterfind('indexedmzML/indexList/index/offset'):
|
|
46 if re.search("scan=(\d+)", k['idRef']):
|
|
47 a = re.search("scan=(\d+)", k['idRef'])
|
|
48 allScanList.append(int(a.group(1)))
|
5
|
49 allScanList = list(set(allScanList))
|
0
|
50 # End of Reading mzML file
|
|
51
|
|
52 fraction_name = sys.argv[4]
|
|
53 if scanDict.has_key(fraction_name):
|
2
|
54 scansInList = scanDict[fraction_name]
|
0
|
55 else:
|
2
|
56 scansInList = []
|
|
57 scansNotInList = list(set(allScanList) - set(scansInList))
|
5
|
58 flag = 0
|
2
|
59 if removeORretain == "remove":
|
|
60 scan2retain = scansNotInList
|
5
|
61 scan2retain = list(set(scan2retain))
|
2
|
62 scan2retain.sort()
|
|
63 scansRemoved = scansInList
|
|
64 # scan2retain contains scans that is to be retained
|
5
|
65 elif removeORretain == "retain" and randomScans < len(scansNotInList):
|
2
|
66 # Randomly select spectra
|
5
|
67 random_scans = random.sample(scansNotInList, randomScans)
|
2
|
68
|
|
69 scan2retain = random_scans + scansInList
|
5
|
70 scan2retain = list(set(scan2retain))
|
2
|
71 scan2retain.sort()
|
|
72 scansRemoved = list(set(allScanList) - set(scan2retain))
|
|
73 # scan2retain contains scans that is to be retained
|
5
|
74 else:
|
|
75 flag = 1
|
2
|
76
|
5
|
77 if flag == 1:
|
|
78 scan2retain = scansInList
|
|
79 scan2retain = list(set(scan2retain))
|
|
80 scan2retain.sort()
|
|
81 scansRemoved = list(set(allScanList) - set(scan2retain))
|
|
82
|
|
83 # scan2retain contains scans that is to be retained
|
|
84 print >> sys.stdout,"ERROR: Number of Random Scans queried is more than available. The result has provided zero random scans."
|
|
85 print >> sys.stdout,"Number of available scans for random selection: %d" % len(scansNotInList)
|
|
86 print >> sys.stdout,"Try a number less than the available number. Thanks!!"
|
|
87 print >> sys.stdout,"Number of Scans retained: %d" % len(scan2retain)
|
|
88 else:
|
|
89 # Print Stats
|
|
90 print >> sys.stdout,"Total number of Scan Numbers: %d" % len(list(set(allScanList)))
|
|
91 print >> sys.stdout,"Number of Scans retained: %d" % len(scan2retain)
|
|
92 print >> sys.stdout,"Number of Scans removed: %d" % len(scansRemoved)
|
0
|
93
|
|
94
|
|
95 # Identifying groups of continuous numbers in the scan2retain and creating scanString
|
|
96 scanString = ""
|
|
97 for a, b in groupby(enumerate(scan2retain), lambda(i,x):i-x):
|
|
98 x = map(itemgetter(1), b)
|
|
99 scanString = scanString + "["+str(x[0])+","+str(x[-1])+"] "
|
|
100 # end identifying
|
5
|
101
|
0
|
102 # start create filter file
|
|
103 filter_file = open("filter.txt", "w")
|
|
104 filter_file.write("filter=scanNumber %s\n" % scanString)
|
|
105 filter_file.close()
|
|
106 # end create filter file
|
5
|
107
|
0
|
108 # Prepare command for msconvert
|
|
109 inputFile = fraction_name+".mzML"
|
|
110 os.symlink(inputPath,inputFile)
|
|
111 outFile = "filtered_"+fraction_name+".mzML"
|
|
112 # msconvert_command = "msconvert " + inputFile + " --filter " + "\"scanNumber " + scanString + " \" " + " --outfile " + outFile + " --mzML --zlib"
|
|
113 msconvert_command = "msconvert " + inputFile + " -c filter.txt " + " --outfile " + outFile + " --mzML --zlib"
|
|
114
|
|
115
|
|
116 # Run msconvert
|
|
117 try:
|
|
118 subprocess.check_output(msconvert_command, stderr=subprocess.STDOUT, shell=True)
|
|
119 except subprocess.CalledProcessError as e:
|
|
120 sys.stderr.write( "msconvert resulted in error: %s: %s" % ( e.returncode, e.output ))
|
|
121 sys.exit(e.returncode)
|
|
122 # Copy output to
|
|
123 shutil.copyfile(outFile, outPath)
|
|
124 else:
|
|
125 print "Please contact the admin. Number of inputs are not sufficient to run the program.\n"
|
|
126
|
|
127 if __name__ == "__main__":
|
|
128 main()
|
|
129
|
|
130
|
|
131
|
|
132
|