comparison rankfilter_GCMS/rankfilter.py @ 0:9d5f4f5f764b

Initial commit to toolshed
author pieter.lukasse@wur.nl
date Thu, 16 Jan 2014 13:10:00 +0100
1 """
2 Copyright (C) 2011 by Velitchka Mihaleva, Wageningen University
3
4 Permission is hereby granted, free of charge, to any person obtaining a copy
5 of this software and associated documentation files (the "Software"), to deal
6 in the Software without restriction, including without limitation the rights
7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 copies of the Software, and to permit persons to whom the Software is
9 furnished to do so, subject to the following conditions:
10
11 The above copyright notice and this permission notice shall be included in
12 all copies or substantial portions of the Software.
13
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 THE SOFTWARE.
21 """

#Library function definitions
#----------Begin-------------
from numpy import array, linalg, ones
from numpy.polynomial import polynomial
import math
import pdfread
import sys


def calibrate(standards):
    '''
    Calculates the RT to RI conversion: RI = a + b*RT
    @param standards: variable containing RI and RT data
    '''
    A = ones((len(standards['R.T.']), 2), dtype=float)
    A[:, 0] = array(map(float, standards['R.T.']))
    [coeff, res, r, s] = linalg.lstsq(A, array(map(float, standards['RI'])))

    return coeff


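# Usage sketch for calibrate() above (illustrative only, not part of the
# original tool; the retention times and indices below are made-up values):
#
#   standards = {'R.T.': ['5.20', '9.85', '14.10'], 'RI': ['800', '900', '1000']}
#   coeff = calibrate(standards)
#   # RI is then estimated as coeff[0] * RT + coeff[1], which is exactly how
#   # convert_rt() applies these coefficients below.

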
def calibrate_poly(standards):
    '''
    Calculates the RT to RI conversion using a polynomial model
    @param standards: variable containing RI and RT data
    '''
    retention_time = array(map(float, standards['R.T.']))
    retention_index = array(map(float, standards['RI']))

    # Fit a 3rd degree polynomial
    fit = polynomial.polyfit(retention_time, retention_index, 3)[::-1]
    return [fit[0], fit[1], fit[2], fit[3]]


def calculate_poly(retention_time, poly_cal):
    '''
    Converts a given retention time to retention index using the calculated polynomial model
    @param retention_time: retention_time to convert to retention index
    @param poly_cal: polynomial coefficients returned by calibrate_poly(), highest power first
    '''
    # Calculates RI based on given retention_time using polynomial function
    retention_time = array(map(float, retention_time))
    if len(retention_time) > 1:
        ri_exp = []
        for i in retention_time:
            ri_exp.append(poly_cal[0] * (i ** 3) + poly_cal[1] * (i ** 2) + (i * poly_cal[2]) + poly_cal[3])
        return ri_exp
    else:
        return poly_cal[0] * (retention_time ** 3) + poly_cal[1] * (retention_time ** 2) + (retention_time * poly_cal[2]) + poly_cal[3]


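# Usage sketch for calibrate_poly()/calculate_poly() above (illustrative only,
# not part of the original tool; the values are made up):
#
#   poly_cal = calibrate_poly(standards)     # [a3, a2, a1, a0], highest power first
#   ri = calculate_poly(['7.25'], poly_cal)  # evaluates a3*t**3 + a2*t**2 + a1*t + a0 at t = 7.25

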
def convert_rt(hit_list, coeff):
    '''
    Converts a given retention time to retention index using the linear model
    @param hit_list: dict holding the retention times
    @param coeff: calculated coefficients (slope and intercept) of the linear model
    '''
    #Convert RT to RI
    hit_list['RIexp'] = array(map(float, hit_list['R.T.'])) * coeff[0] + coeff[1]
    return hit_list


def convert_rt_poly(hit_list, poly_cal):
    '''
    Calls the actual RT to RI converter and returns the updated hit_list with added RIexp values
    @param hit_list: result dict containing the retention times
    @param poly_cal: polynomial coefficients returned by calibrate_poly()
    '''
    hit_list['RIexp'] = array(map(float, calculate_poly(hit_list['R.T.'], poly_cal)))
    return hit_list


def get_data(libdata, LabelCol):
    '''
    Retrieves the data columns indicated by LabelCol from libdata (generic function)
    Returns a dict with the requested column names as keys
    @param libdata: file from which data is loaded
    @param LabelCol: columns to retrieve
    '''
    from numpy import take
    LibData = open(libdata, 'r').read().split('\n')

    #Get the labels of the columns in the file
    FirstLine = LibData.pop(0).split('\t')

    FirstLine[-1] = FirstLine[-1].replace('\r', '')

    # Create a temporary variable containing all the data
    tmp_data = []
    for ll in LibData:
        if ll != '':
            tmp_data.append(ll.split('\t'))

    # Find the indices of the desired data
    ind = []
    try:
        for key in LabelCol:
            ind.append(FirstLine.index(key))
    except ValueError:
        print str(key) + " not found in first line of library (" + str(libdata) + ")"
        print "the following items are found in the first line of the library:\n" + str(FirstLine)
        sys.exit(1)
    # Extract the desired data
    data = []
    for x in tmp_data:
        data.append(take(array(x), ind))

    # library_data = dict(zip(LabelCol,transpose(data)))
    library_data = dict(zip(LabelCol, map(lambda *x: list(x), *data)))
    return library_data


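# Usage sketch for get_data() above (illustrative only; 'standards.txt' and its
# contents are hypothetical). The input is a tab-delimited text file whose first
# line holds the column labels, e.g.:
#
#   Name        R.T.    RI
#   dodecane    5.20    1200
#   tridecane   6.85    1300
#
#   standards = get_data('standards.txt', ['Name', 'R.T.', 'RI'])
#   # -> {'Name': ['dodecane', 'tridecane'], 'R.T.': ['5.20', '6.85'], 'RI': ['1200', '1300']}

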
def rank_hit(hit_list, library_data, window):
    '''
    Computes the Rank and % relative error
    @param hit_list: input data
    @param library_data: library used for reading the RIsvr data
    @param window: error window
    '''
    hit_match_ripred = []
    hit_match_syn = []
    # Convert 'Name' data to list in order to be indexed
    # library_data['Name']=list(library_data['Name'])

    for hit_cas, hit_name in zip(hit_list['CAS'], hit_list['Name']):
        index = 0
        if hit_cas != 'undef':
            try:
                index = library_data['CAS'].index(hit_cas.replace(' ', '').replace('-', ''))
            except ValueError:
                try:
                    index = library_data['Name'].index(hit_name.replace(' ', ''))
                except ValueError:
                    # If for any reason the hit is not present in the mainlib
                    # library, leave index at 0; RIsvr is set to -999 below
                    index = 0
        else:
            try:
                index = library_data['Name'].index(hit_name.replace(' ', ''))
            except ValueError:
                # If for any reason the hit is not present in the mainlib
                # library, leave index at 0; RIsvr is set to -999 below
                index = 0
        if index != 0:
            hit_match_ripred.append(float(library_data['RIsvr'][index]))
            hit_match_syn.append(library_data['Synonyms'][index])
        else:
            hit_match_ripred.append(-999)
            hit_match_syn.append('None')
    hit_list['RIsvr'] = hit_match_ripred
    hit_list['Synonyms'] = hit_match_syn

    # Determine the relative difference between the experimental
    # and the predicted RI
    ri_err = []

    for ri_exp, ri_qsar in zip(hit_list['RIexp'], hit_list['RIsvr']):
        if int(ri_qsar) != -999:
            ri_err.append(float(int(float(ri_qsar)) - int(float(ri_exp))) / int(float(ri_exp)) * 100)
        else:
            ri_err.append(-999)

    # Define the rank of the hits
    hit_rank = []

    for tt in ri_err:
        if tt == -999:
            # The name of the hit generated with AMDIS did not match a name
            # in the mainlib library
            hit_rank.append(4)
        else:
            # Rank 1 - ri_err is within +/- window/2
            if abs(tt) <= float(window) / 2:
                hit_rank.append(1)
            # Rank 2 - window/2 < ri_err <= window
            if abs(tt) > float(window) / 2 and abs(tt) <= float(window):
                hit_rank.append(2)
            # Rank 3 - ri_err > window
            if abs(tt) > float(window):
                hit_rank.append(3)
    hit_list['Rank'] = hit_rank
    hit_list['%rel.err'] = ri_err
    return hit_list

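# Illustration of the ranking rule in rank_hit() above (the window value of 10
# is an arbitrary example, not a recommended setting):
#
#   with window = 10, a hit with %rel.err of -3.1 gets Rank 1 (|err| <= 5),
#   one with 7.4 gets Rank 2 (5 < |err| <= 10), one with 12.0 gets Rank 3
#   (|err| > 10), and a hit without a library match gets Rank 4.
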
def print_to_file(hit_list, filename, keys_to_print, print_subsets=True):
    '''
    Writes output data to files (four output files are generated):
    ranked - all hits with their rank
    filter_in - only hits with rank 1 and 2 are retained
    filter_out - hits with rank 3 are filtered out
    missed - hits with rank 4, for which there was no match with the
    library data
    @param hit_list: a dictionary with the ranked hits
    @param filename: a dictionary with the output file names (keys 'ranked',
    'filter_in', 'filter_out' and 'missed')
    @param keys_to_print: determines the order in which the data are printed
    @param print_subsets: whether to also write the filter_in, filter_out and
    missed subset files
    '''
    from numpy import take

    out_ranked = open(filename["ranked"], 'w')
    if print_subsets:
        out_in = open(filename["filter_in"], 'w')
        out_out = open(filename["filter_out"], 'w')
        out_missed = open(filename["missed"], 'w')

    #Convert RIexp and RIsvr into integer for printing
    hit_list['RIexp'] = map(int, map(math.ceil, hit_list['RIexp']))
    hit_list['RIsvr'] = map(int, map(math.ceil, hit_list['RIsvr']))

    #Establish the right order of the data to be printed
    tmp_items = hit_list.items()
    tmp_keys = hit_list.keys()
    ind = []

    for key in keys_to_print:
        ind.append(tmp_keys.index(key))

    #Print the labels of the columns
    line = '\t'.join(take(array(tmp_keys), ind))
    out_ranked.write('%s\n' % line)
    if print_subsets:
        out_in.write('%s\n' % line)
        out_out.write('%s\n' % line)
        out_missed.write('%s\n' % line)

    #Collect the data for each hit in the right order and print them
    #in the output file
    i = 0
    for name in hit_list['Name']:
        tt = []
        for x in iter(tmp_items):
            # trim the value if it is a string:
            if isinstance(x[1][i], basestring):
                tt.append(x[1][i].strip())
            else:
                tt.append(x[1][i])
        tmp1 = take(array(tt), ind)
        line = '\t'.join(tmp1.tolist())

        out_ranked.write('%s\n' % line)
        if print_subsets:
            if hit_list['Rank'][i] == 4:
                out_missed.write('%s\n' % line)
            if hit_list['Rank'][i] == 3:
                out_out.write('%s\n' % line)
            if hit_list['Rank'][i] == 1 or hit_list['Rank'][i] == 2:
                out_in.write('%s\n' % line)

        i = i + 1

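# Example call for print_to_file() above (illustrative only; the file names are
# hypothetical and every key in keys_to_print must be present in hit_list):
#
#   out_files = {'ranked': 'out_ranked.txt', 'filter_in': 'out_filter_in.txt',
#                'filter_out': 'out_filter_out.txt', 'missed': 'out_missed.txt'}
#   print_to_file(hit_list, out_files, ['R.T.', 'Name', 'Rank', 'RIexp'], True)
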
#---------End--------------
def main():
    #Ranking and filtering procedure
    if len(sys.argv) < 2:
        print "Usage:"
        print "python RankFilter_GC-MS.py input \n"
        print "input is a text file that specifies the names and the location"
        print "of the files with the sample, compounds for calibration, library"
        print "data, the core of the name of the outputs, and the value of the"
        print "window used for the filtering \n"

        sys.exit(1)

    #------Read the input file------
    try:
        ifile = open(sys.argv[1], 'r').read().split('\n')
    except IOError:
        print sys.argv[1], " file cannot be found"
        sys.exit()

    #Get the file names for the data
    #labels - the type of input files
    #filenames - the names of the input files
    labels = []
    filenames = []

    for l in ifile:
        l = l.strip()
        if l != '':
            labels.append(l.split('=')[0].replace(' ', '').replace('\r', ''))
            filenames.append(l.split('=')[1].replace(' ', '').replace('\r', ''))

    InputData = dict(zip(labels, filenames))
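
    # Example of the expected input file for this script (illustrative only; the
    # file names and the window value are hypothetical). Each non-empty line is a
    # 'label = filename' pair, and the labels correspond to the InputData keys
    # used below:
    #
    #   sample = sample_amdis.txt
    #   calibration = standards.txt
    #   lib_data = library_ri.txt
    #   analysis_type = AMDIS
    #   model = linear
    #   window = 10
    #   output = results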

    #This part checks if the output option is set. The output files are then
    #saved using the output variable as prefix for the output file names.
    #If the output option is not set, each output file has to be specified
    #beforehand. This comes in handy for pipeline tools such as Galaxy.
    print_subsets = True
    NDIS_is_tabular = False

    if 'output' in InputData:
        output_files = {"ranked": InputData['output'] + "_ranked", \
                        "filter_in": InputData['output'] + "_filter_in", \
                        "filter_out": InputData['output'] + "_filter_out", \
                        "missed": InputData['output'] + "_missed", \
                        "missed_parse_pdf": InputData['output'] + "_missed_parse_pdf"}
    elif 'tabular' in InputData:
        NDIS_is_tabular = True
        if "onefile" not in InputData:
            output_files = {"ranked": InputData['ranked'], \
                            "filter_in": InputData['filter_in'], \
                            "filter_out": InputData['filter_out'], \
                            "missed": InputData['missed']}
        else:
            print_subsets = False
            output_files = {"ranked": InputData['onefile']}
    else:
        output_files = {"ranked": InputData['ranked'], \
                        "filter_in": InputData['filter_in'], \
                        "filter_out": InputData['filter_out'], \
                        "missed": InputData['missed'], \
                        "missed_parse_pdf": InputData['missed_parse_pdf']}

    #------Start with calibration------
    #Check whether a file with data for the calibration is specified
    #Specify which data are to be read from the file with standard compounds
    LabelColStand = ['Name', 'R.T.', 'RI']

    if InputData['calibration'] != 'none':
        #get the coefficients for the RT to RI conversion

        try:
            ifile = open(InputData['calibration'], 'r')
        except IOError:
            print "file", InputData['calibration'], "cannot be found"
            sys.exit(1)

        standards = get_data(InputData['calibration'], LabelColStand)
        if InputData['model'] == 'linear':
            coeff = calibrate(standards)
        elif InputData['model'] == 'poly':
            poly_cal = calibrate_poly(standards)
        else:
            print "error: model", InputData['model'], "is not supported. Use 'linear' or 'poly'"
            sys.exit(1)
    else:
        #No file has been specified for the calibration
        #Use the default coefficients
        print 'No file has been specified for the calibration'
        print 'WARNING: the default coefficients will be used'
        coeff = array([29.4327, 454.5260])

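    # Worked instance of the default linear calibration above (illustrative
    # only): with coeff = [29.4327, 454.5260], convert_rt() computes
    # RIexp = 29.4327 * RT + 454.5260, so RT = 10.0 gives RIexp of about 748.9.
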
    if InputData['analysis_type'] == 'AMDIS':
        try:
            AmdisOut = open(InputData['sample'], 'r')
            print("open ok")
            #Specify which data are to be extracted from the AMDIS output table.
            #Weighted and Reverse are measures of matching between the experimental
            #and the library spectra. The experimental spectrum is used as template
            #for the calculation of Weighted, whereas for Reverse the template is the
            #library spectrum
            LabelCol = ['CAS', 'Name', 'R.T.', 'Weighted', 'Reverse', 'Purity']

            #Get the data from the AMDIS output file
            HitList = get_data(InputData['sample'], LabelCol)
            #Remove '>' from the names
            HitList['Name'] = [s.replace('>', '') for s in HitList['Name']]
        except IOError:
            print "the file", InputData['sample'], "cannot be found"
            sys.exit(1)
    if InputData['analysis_type'] == 'NIST':
        #HitList_missed - a variable of type dictionary containing the hits
        #with the symbol ";" in the name
        if not NDIS_is_tabular:
            print "Warning: NDIS is not tabular format, reading PDF..\n"
            [HitList, HitList_missed] = pdfread.getPDF(InputData['sample'])
        else:
            HitList = pdfread.read_tabular(InputData['sample'])

    #Convert RT to RI
    if InputData['model'] == 'linear':
        HitList = convert_rt(HitList, coeff)
    if InputData['model'] == 'poly':
        print "Executing convert_rt_poly().."
        HitList = convert_rt_poly(HitList, poly_cal)

    #------Read the library data with the predicted RI------
    try:
        LibData = open(InputData['lib_data'], 'r')
    except IOError:
        print "the file", InputData['lib_data'], "cannot be found"
        sys.exit(1)

    #Specify which data are to be extracted from the library data file
    LabelColLib = ['CAS', 'Name', 'RIsvr', 'Synonyms']
    LibraryData = get_data(InputData['lib_data'], LabelColLib)

    #------Match the hits with the library data and rank them------
    if InputData['window'] != '':
        HitList = rank_hit(HitList, LibraryData, InputData['window'])
    else:
        print "No value for the window used for the filtering is specified \n"
        sys.exit(1)

    #------Print the ranked and filtered hits------
    #Specify which data are to be printed
    if InputData['analysis_type'] == 'AMDIS':
        keys_to_print = ['R.T.', 'CAS', 'Name', 'Rank', 'RIexp', 'RIsvr', '%rel.err', 'Weighted', 'Reverse', 'Synonyms']
    else:
        keys_to_print = ['ID', 'R.T.', 'Name', 'CAS', 'Rank', 'RIexp', 'RIsvr', '%rel.err', 'Forward', 'Reverse', 'Synonyms', 'Library']

    #Skip this error output from reading a pdftotext file when the file is tabular
    if InputData['analysis_type'] == 'NIST' and not NDIS_is_tabular:
        out_missed_pdf = open(output_files['missed_parse_pdf'], 'w')
        for x, y in zip(HitList_missed['Missed Compounds'], HitList_missed['RT missed Compounds']):
            out_missed_pdf.write('%s\n' % '\t'.join([y, x]))
        out_missed_pdf.close()

    print_to_file(HitList, output_files, keys_to_print, print_subsets)

if __name__ == '__main__':
    main()