0
|
1 """
|
|
2 Copyright (C) 2011 by Velitchka Mihaleva, Wageningen University
|
|
3
|
|
4 Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
5 of this software and associated documentation files (the "Software"), to deal
|
|
6 in the Software without restriction, including without limitation the rights
|
|
7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
8 copies of the Software, and to permit persons to whom the Software is
|
|
9 furnished to do so, subject to the following conditions:
|
|
10
|
|
11 The above copyright notice and this permission notice shall be included in
|
|
12 all copies or substantial portions of the Software.
|
|
13
|
|
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
20 THE SOFTWARE.
|
|
21 """
|
|
22
|
|
23 #Library functions definition
|
|
24 #----------Begin-------------
|
|
25 from numpy import array, linalg, ones
|
|
26 from numpy.polynomial import polynomial
|
|
27 import math
|
|
28 import pdfread
|
|
29 import sys
|
|
30
|
|
31
|
|
32 def calibrate(standards):
|
|
33 '''
|
|
34 Calculates the RT to RI conversion: RI = a + b*RT
|
|
35 @param standards: variable containing RI and RT data
|
|
36 '''
|
|
37 A = ones((len(standards['R.T.']), 2), dtype=float)
|
|
38 A[:, 0] = array(map(float, standards['R.T.']))
|
|
39 [coeff, res, r, s] = linalg.lstsq(A, array(map(float, standards['RI'])))
|
|
40
|
|
41 return coeff
|
|
42
|
|
43
|
|
44 def calibrate_poly(standards):
|
|
45 '''
|
|
46 Calculates the RT to RI conversion using a polynomial model
|
|
47 @param standards: variable containing RI and RT data
|
|
48 '''
|
|
49 retention_time = array(map(float, standards['R.T.']))
|
|
50 retention_index = array(map(float, standards['RI']))
|
|
51
|
|
52 # Fit a 3rd degree polynomial
|
|
53 fit = polynomial.polyfit(retention_time, retention_index, 3)[::-1]
|
|
54 return [fit[0], fit[1], fit[2], fit[3]]
|
|
55
|
|
56
|
|
57 def calculate_poly(retention_time, poly_cal):
|
|
58 '''
|
|
59 Converts a given retention time to retention index using the calculated polynomial model
|
|
60 @param retention_time: retention_time to convert to retention index
|
|
61 @param poly_cal: result from calculating regression
|
|
62 '''
|
|
63 # Calculates RI based on given retention_time using polynomial function
|
|
64 retention_time = array(map(float, retention_time))
|
|
65 if len(retention_time) > 1:
|
|
66 ri_exp = []
|
|
67 for i in retention_time:
|
|
68 ri_exp.append(poly_cal[0] * (i ** 3) + poly_cal[1] * (i ** 2) + (i * poly_cal[2]) + poly_cal[3])
|
|
69 return ri_exp
|
|
70 else:
|
|
71 return poly_cal[0] * (retention_time ** 3) + poly_cal[1] * (retention_time ** 2) + (retention_time * poly_cal[2]) + poly_cal[3]
|
|
72
|
|
73
|
|
74 def convert_rt(hit_list, coeff):
|
|
75 '''
|
|
76 Converts a given retention time to retention index using the linear model
|
|
77 @param hit_list: list holding the retention time
|
|
78 @param coeff: calculated coefficient (slope and intercept) using the linear model
|
|
79 '''
|
|
80 #Convert RT to RI
|
|
81 hit_list['RIexp'] = array(map(float, hit_list['R.T.'])) * coeff[0] + coeff[1]
|
|
82 return hit_list
|
|
83
|
|
84
|
|
85 def convert_rt_poly(hit_list, poly_cal):
|
|
86 '''
|
|
87 Calls the actual RT to RI converter and returns the updated list with added RIexp value
|
|
88 @param hit_list: result list containing the retention time
|
|
89 '''
|
|
90 hit_list['RIexp'] = array(map(float, calculate_poly(hit_list['R.T.'], poly_cal)))
|
|
91 return hit_list
|
|
92
|
|
93
|
|
94 def get_data(libdata, LabelCol):
|
|
95 '''
|
|
96 Retrieves datacolumns indicated by LabelCol from libdata (generic function)
|
|
97 Returns a dict with the requested column names as keys
|
|
98 @param libdata: file from which data is loaded
|
|
99 @param LabelCol: columns to retrieve
|
|
100 '''
|
|
101 from numpy import take
|
|
102 LibData = open(libdata, 'r').read().split('\n')
|
|
103
|
|
104 #Get the labels of the columns in the file
|
|
105 FirstLine = LibData.pop(0).split('\t')
|
|
106
|
|
107 FirstLine[-1] = FirstLine[-1].replace('\r', '')
|
|
108
|
|
109 # Create a temporate variable containing the all data
|
|
110 tmp_data = []
|
|
111 for ll in LibData:
|
|
112 if ll != '':
|
|
113 tmp_data.append(ll.split('\t'))
|
|
114
|
|
115 # Find the indices of the desired data
|
|
116 ind = []
|
|
117 try:
|
|
118 for key in LabelCol:
|
|
119 ind.append(FirstLine.index(key))
|
|
120 except:
|
|
121 print str(key) + " not found in first line of library (" + str(libdata) + ")"
|
|
122 print"the folowing items are found in the first line of the library:\n" + str(FirstLine)
|
|
123 sys.exit(1)
|
|
124 # Extract the desired data
|
|
125 data = []
|
|
126 for x in tmp_data:
|
|
127 data.append(take(array(x), ind))
|
|
128
|
|
129 # library_data = dict(zip(LabelCol,transpose(data)))
|
|
130 library_data = dict(zip(LabelCol, map(lambda *x: list(x), *data)))
|
|
131 return library_data
|
|
132
|
|
133
|
|
134 def rank_hit(hit_list, library_data, window):
|
|
135 '''
|
|
136 Computes the Rank and % relative error
|
|
137 @param hit_list: input data
|
|
138 @param library_data: library used for reading the RIsvr data
|
|
139 @param window: error window
|
|
140 '''
|
|
141 hit_match_ripred = []
|
|
142 hit_match_syn = []
|
|
143 # Convert 'Name' data to list in order to be indexed
|
|
144 # library_data['Name']=list(library_data['Name'])
|
|
145
|
|
146 for hit_cas, hit_name in zip(hit_list['CAS'], hit_list['Name']):
|
|
147 index = 0
|
|
148 if hit_cas != 'undef':
|
|
149 try:
|
|
150 index = library_data['CAS'].index(hit_cas.replace(' ', '').replace('-', ''))
|
|
151 except:
|
|
152 try:
|
|
153 index = library_data['Name'].index(hit_name.replace(' ', ''))
|
|
154 except:
|
|
155 # If for any reason the hit is not present
|
|
156 # in the mainlib library indicate this with -999
|
|
157 index = 0
|
|
158 else:
|
|
159 try:
|
|
160 index = library_data['Name'].index(hit_name.replace(' ', ''))
|
|
161 except:
|
|
162 # If for any reason the hit is not present
|
|
163 # in the mainlib library indicate this with -999
|
|
164 index = 0
|
|
165 if index != 0:
|
|
166 hit_match_ripred.append(float(library_data['RIsvr'][index]))
|
|
167 hit_match_syn.append(library_data['Synonyms'][index])
|
|
168 else:
|
|
169 hit_match_ripred.append(-999)
|
|
170 hit_match_syn.append('None')
|
|
171 hit_list['RIsvr'] = hit_match_ripred
|
|
172 hit_list['Synonyms'] = hit_match_syn
|
|
173
|
|
174 # Determine the relative difference between the experimental
|
|
175 # and the predicted RI
|
|
176 ri_err = []
|
|
177
|
|
178 for ri_exp, ri_qsar in zip(hit_list['RIexp'], hit_list['RIsvr']):
|
|
179 if int(ri_qsar) != -999:
|
|
180 ri_err.append(float(int(float(ri_qsar)) - int(float(ri_exp))) / int(float(ri_exp)) * 100)
|
|
181 else:
|
|
182 ri_err.append(-999)
|
|
183
|
|
184 # Define the rank of the hits
|
|
185 hit_rank = []
|
|
186
|
|
187 for tt in ri_err:
|
|
188 if tt == -999:
|
|
189 # The name of the hit generated with AMDIS did not match a name
|
|
190 # in the mainlib library
|
|
191 hit_rank.append(4)
|
|
192 else:
|
|
193 # Rank 1 - ri_err is within +/- window/2
|
|
194 if abs(tt) <= float(window) / 2:
|
|
195 hit_rank.append(1)
|
|
196 # Rank 2 - window/2 < ri_err <= window
|
|
197 if abs(tt) > float(window) / 2 and abs(tt) <= float(window):
|
|
198 hit_rank.append(2)
|
|
199 # Rank 3 - ri_err > window
|
|
200 if abs(tt) > float(window):
|
|
201 hit_rank.append(3)
|
|
202 hit_list['Rank'] = hit_rank
|
|
203 hit_list['%rel.err'] = ri_err
|
|
204 return hit_list
|
|
205
|
|
206 def print_to_file(hit_list, filename, keys_to_print, print_subsets=True):
|
|
207 '''
|
|
208 Writes output data to files (four output files are generated):
|
|
209 filename_ranked - the hits are ranked
|
|
210 filename_filter_in - only hits with rank 1 and 2 are retained
|
|
211 filename_filter_out - hits with rank 3 are filtered out
|
|
212 filename_filter_missed - hits with rank 4 - there was no match with the
|
|
213 library data
|
|
214 @param hit_list: a dictionary with the ranked hits
|
|
215 @param filename: the core of the output file names
|
|
216 @param keys_to_print: determines the order in which the data are printed
|
|
217 @param print_subsets:
|
|
218 '''
|
|
219 from numpy import take
|
|
220
|
|
221 out_ranked = open(filename["ranked"], 'w')
|
|
222 if (print_subsets):
|
|
223 out_in = open(filename["filter_in"], 'w')
|
|
224 out_out = open(filename["filter_out"], 'w')
|
|
225 out_missed = open(filename["missed"], 'w')
|
|
226
|
|
227 #Convert RIexp and RIsvr into integer for printing
|
|
228 hit_list['RIexp'] = map(int, map(math.ceil, hit_list['RIexp']))
|
|
229 hit_list['RIsvr'] = map(int, map(math.ceil, hit_list['RIsvr']))
|
|
230
|
|
231 #Establish the right order of the data to be printed
|
|
232 tmp_items = hit_list.items()
|
|
233 tmp_keys = hit_list.keys()
|
|
234 ind = []
|
|
235
|
|
236 for key in keys_to_print:
|
|
237 ind.append(tmp_keys.index(key))
|
|
238
|
|
239 #Print the labels of the columns
|
|
240 line = '\t'.join(take(array(tmp_keys), ind))
|
|
241 out_ranked.write('%s\n' % line)
|
|
242 if (print_subsets):
|
|
243 out_in.write('%s\n' % line)
|
|
244 out_out.write('%s\n' % line)
|
|
245 out_missed.write('%s\n' % line)
|
|
246
|
|
247 #Collect the data for each hit in the right order and print them
|
|
248 #in the output file
|
|
249 i = 0
|
|
250 for name in hit_list['Name']:
|
|
251 tt = []
|
|
252 for x in iter(tmp_items):
|
|
253 # trim the value if it is a string:
|
|
254 if isinstance(x[1][i], basestring):
|
|
255 tt.append(x[1][i].strip())
|
|
256 else:
|
|
257 tt.append(x[1][i])
|
|
258 tmp1 = take(array(tt), ind)
|
|
259 line = '\t'.join(tmp1.tolist())
|
|
260
|
|
261 out_ranked.write('%s\n' % line)
|
|
262 if(print_subsets):
|
|
263 if hit_list['Rank'][i] == 4:
|
|
264 out_missed.write('%s\n' % line)
|
|
265 if hit_list['Rank'][i] == 3:
|
|
266 out_out.write('%s\n' % line)
|
|
267 if hit_list['Rank'][i] == 1 or hit_list['Rank'][i] == 2:
|
|
268 out_in.write('%s\n' % line)
|
|
269
|
|
270 i = i + 1
|
|
271
|
|
272 #---------End--------------
|
|
273 def main():
|
|
274 #Ranking and filtering procedure
|
|
275 if len(sys.argv) < 2:
|
|
276 print "Usage:"
|
|
277 print "python RankFilter_GC-MS.py input \n"
|
|
278 print "input is a text file that specifies the names and the location"
|
|
279 print "of the files with the sample, compounds for calibration, library"
|
|
280 print "data, the core of the name ot the outputs, and the value of the"
|
|
281 print "window used for the filtering \n"
|
|
282
|
|
283 sys.exit(1)
|
|
284
|
|
285 #------Read the input file------
|
|
286 try:
|
|
287 ifile = open(sys.argv[1], 'r').read().split('\n')
|
|
288 except:
|
|
289 print sys.argv[1], " file can not be found"
|
|
290 sys.exit()
|
|
291
|
|
292 #Get the file names for the data
|
|
293 #labels - the type of input files
|
|
294 #filenames - the names of the input files
|
|
295 labels = []
|
|
296 filenames = []
|
|
297
|
|
298 for l in ifile:
|
|
299 l = l.strip()
|
|
300 if l != '':
|
|
301 labels.append(l.split('=')[0].replace(' ', '').replace('\r', ''))
|
|
302 filenames.append(l.split('=')[1].replace(' ', '').replace('\r', ''))
|
|
303
|
|
304 InputData = dict(zip(labels, filenames))
|
|
305
|
|
306 #this part checkes if the ouput option is set. The output files are saved as the output variable as prefix for the output files
|
|
307 #if the output is not found , each output file has to be selected by forehand. This comes in handy for pipeline tools as galaxy
|
|
308 print_subsets = True
|
|
309 NDIS_is_tabular = False
|
|
310
|
|
311 if 'output' in InputData:
|
|
312 output_files = {"ranked":InputData['output'] + "_ranked", \
|
|
313 "filter_in":InputData['output'] + "_filter_in", \
|
|
314 "filter_out":InputData['output'] + "filter_out", \
|
|
315 "missed":InputData['output'] + "_missed", \
|
|
316 "missed_parse_pdf":InputData['output'] + "_missed_parse_pdf"}
|
|
317 elif 'tabular' in InputData:
|
|
318 NDIS_is_tabular = True
|
|
319 if(not "onefile" in InputData):
|
|
320 output_files = {"ranked":InputData['ranked'], \
|
|
321 "filter_in":InputData['filter_in'], \
|
|
322 "filter_out":InputData['filter_out'], \
|
|
323 "missed":InputData['missed']}
|
|
324 else:
|
|
325 print_subsets = False
|
|
326 output_files = {"ranked":InputData['onefile']}
|
|
327 else:
|
|
328 output_files = {"ranked":InputData['ranked'], \
|
|
329 "filter_in":InputData['filter_in'], \
|
|
330 "filter_out":InputData['filter_out'], \
|
|
331 "missed":InputData['missed'], \
|
|
332 "missed_parse_pdf":InputData['missed_parse_pdf']}
|
|
333
|
|
334 #------Start with calibration------
|
|
335 #Check whether a file with data for the calibration is specified
|
|
336 #Specify which data to be read from the file with standard compounds
|
|
337 LabelColStand = ['Name', 'R.T.', 'RI']
|
|
338
|
|
339 if InputData['calibration'] != 'none':
|
|
340 #get the coeffiecients for the RT to RI convertion
|
|
341
|
|
342 try:
|
|
343 ifile = open(InputData['calibration'], 'r')
|
|
344 except:
|
|
345 print "file", InputData['calibration'], "can not be found"
|
|
346 sys.exit(1)
|
|
347
|
|
348 standards = get_data(InputData['calibration'], LabelColStand)
|
|
349 if InputData['model'] == 'linear':
|
|
350 coeff = calibrate(standards)
|
|
351 elif InputData['model'] == 'poly':
|
|
352 poly_cal = calibrate_poly(standards)
|
|
353 else:
|
|
354 print "error: model ", InputData['model'], " can not be found. Use 'linear' or 'poly' "
|
|
355 sys.exit(1)
|
|
356 else:
|
|
357 #No file has been specified for the calibration
|
|
358 #Use the default coefficients
|
|
359 print 'No file has been specified for the calibration'
|
|
360 print 'WARNING: the default coefficients will be used'
|
|
361 coeff = array([29.4327, 454.5260])
|
|
362
|
|
363 if InputData['analysis_type'] == 'AMDIS':
|
|
364 try:
|
|
365 AmdisOut = open(InputData['sample'], 'r')
|
|
366 print("open ok")
|
|
367 #Specify which data to be extracted from the AMDIS output table
|
|
368 #Weighted and Reverse are measure of matching between the experimental
|
|
369 #and the library spectra. The experimental spectrum is used as template
|
|
370 #for the calculation of Weighted, whereas for Reverse the template is the
|
|
371 #library spectrum
|
|
372 LabelCol = ['CAS', 'Name', 'R.T.', 'Weighted', 'Reverse', 'Purity']
|
|
373
|
|
374 #Get the data from the AMDIS output file
|
|
375 HitList = get_data(InputData['sample'], LabelCol)
|
|
376 #Remove '>' from the names
|
|
377 HitList['Name'] = [s.replace('>', '') for s in HitList['Name']]
|
|
378 except:
|
|
379 print "the file", InputData['sample'], "can not be found"
|
|
380 sys.exit(1)
|
|
381 if InputData['analysis_type'] == 'NIST':
|
|
382 #HitList_missed - a variable of type dictionary containing the hits with the symbol ";"
|
|
383 #in the name
|
|
384 if not NDIS_is_tabular:
|
|
385 print "Warning; NDIS is not tabular format, reading PDF..\n"
|
|
386 [HitList, HitList_missed] = pdfread.getPDF(InputData['sample'])
|
|
387 else:
|
|
388 HitList = pdfread.read_tabular(InputData['sample'])
|
|
389
|
|
390 #Convert RT to RI
|
|
391 if InputData['model'] == 'linear':
|
|
392 HitList = convert_rt(HitList, coeff)
|
|
393 if InputData['model'] == 'poly':
|
|
394 print "Executing convert_rt_poly().."
|
|
395 HitList = convert_rt_poly(HitList, poly_cal)
|
|
396
|
|
397 #------Read the library data with the predicted RI------
|
|
398 try:
|
|
399 LibData = open(InputData['lib_data'], 'r')
|
|
400 except:
|
|
401 print "the file", InputData['lib_data'], "can not be found"
|
|
402 sys.exit(1)
|
|
403
|
|
404 #Specify which data to be extracted from the library data file
|
|
405 LabelColLib = ['CAS', 'Name', 'RIsvr', 'Synonyms']
|
|
406 LibraryData = get_data(InputData['lib_data'], LabelColLib)
|
|
407
|
|
408 #------Match the hits with the library data and rank them------
|
|
409 if InputData['window'] != '':
|
|
410 HitList = rank_hit(HitList, LibraryData, InputData['window'])
|
|
411 else:
|
|
412 print "No value for the window used for the filtering is specified \n"
|
|
413 sys.exit(1)
|
|
414
|
|
415 #------Print the ranked and filtered hits------
|
|
416 #Specify which data to be printed
|
|
417 if InputData['analysis_type'] == 'AMDIS':
|
|
418 keys_to_print = ['R.T.', 'CAS', 'Name', 'Rank', 'RIexp', 'RIsvr', '%rel.err', 'Weighted', 'Reverse', 'Synonyms']
|
|
419 else:
|
|
420 keys_to_print = ['ID', 'R.T.', 'Name', 'CAS', 'Rank', 'RIexp', 'RIsvr', '%rel.err', 'Forward', 'Reverse', 'Synonyms', 'Library']
|
|
421
|
|
422 #skip this error output from reading a pdftotext file when file is tabular
|
|
423 if InputData['analysis_type'] == 'NIST' and not NDIS_is_tabular:
|
|
424 out_missed_pdf = open(output_files['missed_parse_pdf'], 'w')
|
|
425 for x, y in zip(HitList_missed['Missed Compounds'], HitList_missed['RT missed Compounds']):
|
|
426 out_missed_pdf.write('%s\n' % '\t'.join([y, x]))
|
|
427 out_missed_pdf.close()
|
|
428
|
|
429 print_to_file(HitList, output_files, keys_to_print, print_subsets)
|
|
430
|
|
431 if __name__ == '__main__':
|
|
432 main()
|