Mercurial > repos > pieterlukasse > prims_metabolomics
comparison rankfilter_GCMS/rankfilter.py @ 0:9d5f4f5f764b
Initial commit to toolshed
author | pieter.lukasse@wur.nl |
---|---|
date | Thu, 16 Jan 2014 13:10:00 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:9d5f4f5f764b |
---|---|
1 """ | |
2 Copyright (C) 2011 by Velitchka Mihaleva, Wageningen University | |
3 | |
4 Permission is hereby granted, free of charge, to any person obtaining a copy | |
5 of this software and associated documentation files (the "Software"), to deal | |
6 in the Software without restriction, including without limitation the rights | |
7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
8 copies of the Software, and to permit persons to whom the Software is | |
9 furnished to do so, subject to the following conditions: | |
10 | |
11 The above copyright notice and this permission notice shall be included in | |
12 all copies or substantial portions of the Software. | |
13 | |
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
20 THE SOFTWARE. | |
21 """ | |
22 | |
23 #Library functions definition | |
24 #----------Begin------------- | |
25 from numpy import array, linalg, ones | |
26 from numpy.polynomial import polynomial | |
27 import math | |
28 import pdfread | |
29 import sys | |
30 | |
31 | |
def calibrate(standards):
    '''
    Calculates the RT to RI conversion: RI = a + b*RT
    Fits a straight line through the (RT, RI) calibration points with
    linear least squares.
    @param standards: dict holding the 'R.T.' and 'RI' calibration columns
                      (values may be strings; they are converted to float)
    @return: array [slope, intercept] such that RI = slope*RT + intercept
    '''
    # Design matrix: first column RT, second column ones (intercept term)
    A = ones((len(standards['R.T.']), 2), dtype=float)
    A[:, 0] = array([float(rt) for rt in standards['R.T.']])
    # Least-squares solution of A * coeff = RI
    [coeff, res, r, s] = linalg.lstsq(A, array([float(ri) for ri in standards['RI']]))

    return coeff
42 | |
43 | |
def calibrate_poly(standards):
    '''
    Calculates the RT to RI conversion using a 3rd-degree polynomial model.
    @param standards: dict holding the 'R.T.' and 'RI' calibration columns
                      (values may be strings; they are converted to float)
    @return: list of the four polynomial coefficients, highest degree first
             (the order expected by calculate_poly)
    '''
    retention_time = array([float(rt) for rt in standards['R.T.']])
    retention_index = array([float(ri) for ri in standards['RI']])

    # polyfit returns coefficients in ascending-degree order;
    # reverse them so the cubic term comes first
    fit = polynomial.polyfit(retention_time, retention_index, 3)[::-1]
    return [fit[0], fit[1], fit[2], fit[3]]
55 | |
56 | |
def calculate_poly(retention_time, poly_cal):
    '''
    Converts retention time(s) to retention index with the fitted
    3rd-degree polynomial.
    @param retention_time: iterable of retention times (strings or numbers)
    @param poly_cal: polynomial coefficients, highest degree first
    @return: list of RI values when more than one time is given,
             otherwise the element-wise result on the (size <= 1) array
    '''
    a3, a2, a1, a0 = poly_cal[0], poly_cal[1], poly_cal[2], poly_cal[3]
    times = array([float(value) for value in retention_time])
    if len(times) > 1:
        # Evaluate the cubic for every retention time
        return [a3 * (t ** 3) + a2 * (t ** 2) + (t * a1) + a0 for t in times]
    # Single (or empty) input: evaluate element-wise on the array
    return a3 * (times ** 3) + a2 * (times ** 2) + (times * a1) + a0
72 | |
73 | |
def convert_rt(hit_list, coeff):
    '''
    Converts the retention times to retention indices with the linear
    model RI = coeff[0]*RT + coeff[1] and stores them under 'RIexp'.
    @param hit_list: dict holding the retention times under 'R.T.'
    @param coeff: [slope, intercept] from the linear calibration
    @return: the same dict, with the 'RIexp' entry added
    '''
    retention_times = array([float(rt) for rt in hit_list['R.T.']])
    hit_list['RIexp'] = retention_times * coeff[0] + coeff[1]
    return hit_list
83 | |
84 | |
def convert_rt_poly(hit_list, poly_cal):
    '''
    Converts the retention times with the polynomial model and stores the
    result under 'RIexp'.
    @param hit_list: result dict containing the retention times under 'R.T.'
    @param poly_cal: polynomial coefficients, highest degree first
    @return: the same dict, with the 'RIexp' entry added
    '''
    converted = calculate_poly(hit_list['R.T.'], poly_cal)
    hit_list['RIexp'] = array([float(ri) for ri in converted])
    return hit_list
92 | |
93 | |
def get_data(libdata, LabelCol):
    '''
    Retrieves the data columns named in LabelCol from a tab-separated file
    (generic function).
    Returns a dict with the requested column names as keys and the column
    values as lists of strings.
    @param libdata: path of the file the data is loaded from
    @param LabelCol: names of the columns to retrieve; every name must
                     appear in the file's first (header) line
    '''
    from numpy import take
    # Read the whole file; the first line is the header with the labels.
    # 'with' guarantees the file handle is closed (the original leaked it).
    with open(libdata, 'r') as input_file:
        LibData = input_file.read().split('\n')

    #Get the labels of the columns in the file
    FirstLine = LibData.pop(0).split('\t')
    # Strip a possible Windows carriage return from the last label
    FirstLine[-1] = FirstLine[-1].replace('\r', '')

    # Collect all non-empty data rows, split into fields
    tmp_data = []
    for ll in LibData:
        if ll != '':
            tmp_data.append(ll.split('\t'))

    # Find the indices of the requested columns in the header
    ind = []
    try:
        for key in LabelCol:
            ind.append(FirstLine.index(key))
    except ValueError:
        # A requested label is missing from the header: report and abort
        # (sys.stdout.write keeps the exact output of the original prints)
        sys.stdout.write(str(key) + " not found in first line of library (" + str(libdata) + ")\n")
        sys.stdout.write("the folowing items are found in the first line of the library:\n" + str(FirstLine) + '\n')
        sys.exit(1)

    # Extract the requested fields of every row
    data = []
    for x in tmp_data:
        data.append(take(array(x), ind))

    # Transpose: one list per requested column, keyed by its label
    library_data = dict(zip(LabelCol, [list(col) for col in zip(*data)]))
    return library_data
132 | |
133 | |
def rank_hit(hit_list, library_data, window):
    '''
    Computes the Rank and % relative error for every hit.
    Each hit is looked up in the library (by CAS number first, then by
    name); matched hits get the library's predicted RI ('RIsvr') and
    synonyms, unmatched hits are marked with -999 / 'None'.
    @param hit_list: input data (needs 'CAS', 'Name' and 'RIexp')
    @param library_data: library dict with 'CAS', 'Name', 'RIsvr', 'Synonyms'
    @param window: error window used for the ranking
    @return: hit_list extended with 'RIsvr', 'Synonyms', 'Rank', '%rel.err'
    '''
    hit_match_ripred = []
    hit_match_syn = []

    for hit_cas, hit_name in zip(hit_list['CAS'], hit_list['Name']):
        # -1 marks "not found in the library". (BUG FIX: the original used
        # 0 as the not-found sentinel, which silently discarded legitimate
        # matches located at library index 0.)
        index = -1
        if hit_cas != 'undef':
            try:
                # Library CAS numbers are stored without spaces/dashes
                index = library_data['CAS'].index(hit_cas.replace(' ', '').replace('-', ''))
            except ValueError:
                try:
                    index = library_data['Name'].index(hit_name.replace(' ', ''))
                except ValueError:
                    index = -1
        else:
            try:
                index = library_data['Name'].index(hit_name.replace(' ', ''))
            except ValueError:
                index = -1
        if index != -1:
            hit_match_ripred.append(float(library_data['RIsvr'][index]))
            hit_match_syn.append(library_data['Synonyms'][index])
        else:
            # Hit not present in the mainlib library: mark with -999
            hit_match_ripred.append(-999)
            hit_match_syn.append('None')
    hit_list['RIsvr'] = hit_match_ripred
    hit_list['Synonyms'] = hit_match_syn

    # Determine the relative difference (%) between the experimental
    # and the predicted RI; unmatched hits keep the -999 marker
    ri_err = []

    for ri_exp, ri_qsar in zip(hit_list['RIexp'], hit_list['RIsvr']):
        if int(ri_qsar) != -999:
            ri_err.append(float(int(float(ri_qsar)) - int(float(ri_exp))) / int(float(ri_exp)) * 100)
        else:
            ri_err.append(-999)

    # Define the rank of the hits:
    #   1 - |err| <= window/2,  2 - window/2 < |err| <= window,
    #   3 - |err| > window,     4 - no library match
    hit_rank = []

    for tt in ri_err:
        if tt == -999:
            # The name of the hit generated with AMDIS did not match a name
            # in the mainlib library
            hit_rank.append(4)
        else:
            if abs(tt) <= float(window) / 2:
                hit_rank.append(1)
            if abs(tt) > float(window) / 2 and abs(tt) <= float(window):
                hit_rank.append(2)
            if abs(tt) > float(window):
                hit_rank.append(3)
    hit_list['Rank'] = hit_rank
    hit_list['%rel.err'] = ri_err
    return hit_list
205 | |
def print_to_file(hit_list, filename, keys_to_print, print_subsets=True):
    '''
    Writes output data to files (four output files are generated):
        filename_ranked - the hits are ranked
        filename_filter_in - only hits with rank 1 and 2 are retained
        filename_filter_out - hits with rank 3 are filtered out
        filename_filter_missed - hits with rank 4 - there was no match with
                                 the library data
    @param hit_list: a dictionary with the ranked hits
    @param filename: dict with the names of the output files
    @param keys_to_print: determines the order in which the data are printed
    @param print_subsets: when True also write the rank-based subset files
    '''
    from numpy import take

    out_ranked = open(filename["ranked"], 'w')
    if (print_subsets):
        out_in = open(filename["filter_in"], 'w')
        out_out = open(filename["filter_out"], 'w')
        out_missed = open(filename["missed"], 'w')

    #Convert RIexp and RIsvr into integer (rounded up) for printing
    hit_list['RIexp'] = [int(math.ceil(value)) for value in hit_list['RIexp']]
    hit_list['RIsvr'] = [int(math.ceil(value)) for value in hit_list['RIsvr']]

    #Establish the right order of the data to be printed.
    # NOTE: items() and keys() come from the same unmodified dict, so the
    # positions found in tmp_keys are valid for tmp_items as well.
    tmp_items = hit_list.items()
    tmp_keys = hit_list.keys()
    ind = [tmp_keys.index(key) for key in keys_to_print]

    #Print the labels of the columns
    line = '\t'.join(take(array(tmp_keys), ind))
    out_ranked.write('%s\n' % line)
    if (print_subsets):
        out_in.write('%s\n' % line)
        out_out.write('%s\n' % line)
        out_missed.write('%s\n' % line)

    #Collect the data for each hit in the right order and print them
    #in the output file
    i = 0
    for name in hit_list['Name']:
        tt = []
        for x in iter(tmp_items):
            # trim the value if it is a string:
            if isinstance(x[1][i], basestring):
                tt.append(x[1][i].strip())
            else:
                tt.append(x[1][i])
        tmp1 = take(array(tt), ind)
        line = '\t'.join(tmp1.tolist())

        out_ranked.write('%s\n' % line)
        if(print_subsets):
            # Rank 4: no library match; rank 3: outside the error window;
            # ranks 1 and 2: accepted hits
            if hit_list['Rank'][i] == 4:
                out_missed.write('%s\n' % line)
            if hit_list['Rank'][i] == 3:
                out_out.write('%s\n' % line)
            if hit_list['Rank'][i] == 1 or hit_list['Rank'][i] == 2:
                out_in.write('%s\n' % line)

        i = i + 1

    # Close all output files explicitly so the data is flushed to disk
    # (the original relied on interpreter shutdown to do this)
    out_ranked.close()
    if (print_subsets):
        out_in.close()
        out_out.close()
        out_missed.close()
271 | |
272 #---------End-------------- | |
273 def main(): | |
274 #Ranking and filtering procedure | |
275 if len(sys.argv) < 2: | |
276 print "Usage:" | |
277 print "python RankFilter_GC-MS.py input \n" | |
278 print "input is a text file that specifies the names and the location" | |
279 print "of the files with the sample, compounds for calibration, library" | |
280 print "data, the core of the name ot the outputs, and the value of the" | |
281 print "window used for the filtering \n" | |
282 | |
283 sys.exit(1) | |
284 | |
285 #------Read the input file------ | |
286 try: | |
287 ifile = open(sys.argv[1], 'r').read().split('\n') | |
288 except: | |
289 print sys.argv[1], " file can not be found" | |
290 sys.exit() | |
291 | |
292 #Get the file names for the data | |
293 #labels - the type of input files | |
294 #filenames - the names of the input files | |
295 labels = [] | |
296 filenames = [] | |
297 | |
298 for l in ifile: | |
299 l = l.strip() | |
300 if l != '': | |
301 labels.append(l.split('=')[0].replace(' ', '').replace('\r', '')) | |
302 filenames.append(l.split('=')[1].replace(' ', '').replace('\r', '')) | |
303 | |
304 InputData = dict(zip(labels, filenames)) | |
305 | |
306 #this part checkes if the ouput option is set. The output files are saved as the output variable as prefix for the output files | |
307 #if the output is not found , each output file has to be selected by forehand. This comes in handy for pipeline tools as galaxy | |
308 print_subsets = True | |
309 NDIS_is_tabular = False | |
310 | |
311 if 'output' in InputData: | |
312 output_files = {"ranked":InputData['output'] + "_ranked", \ | |
313 "filter_in":InputData['output'] + "_filter_in", \ | |
314 "filter_out":InputData['output'] + "filter_out", \ | |
315 "missed":InputData['output'] + "_missed", \ | |
316 "missed_parse_pdf":InputData['output'] + "_missed_parse_pdf"} | |
317 elif 'tabular' in InputData: | |
318 NDIS_is_tabular = True | |
319 if(not "onefile" in InputData): | |
320 output_files = {"ranked":InputData['ranked'], \ | |
321 "filter_in":InputData['filter_in'], \ | |
322 "filter_out":InputData['filter_out'], \ | |
323 "missed":InputData['missed']} | |
324 else: | |
325 print_subsets = False | |
326 output_files = {"ranked":InputData['onefile']} | |
327 else: | |
328 output_files = {"ranked":InputData['ranked'], \ | |
329 "filter_in":InputData['filter_in'], \ | |
330 "filter_out":InputData['filter_out'], \ | |
331 "missed":InputData['missed'], \ | |
332 "missed_parse_pdf":InputData['missed_parse_pdf']} | |
333 | |
334 #------Start with calibration------ | |
335 #Check whether a file with data for the calibration is specified | |
336 #Specify which data to be read from the file with standard compounds | |
337 LabelColStand = ['Name', 'R.T.', 'RI'] | |
338 | |
339 if InputData['calibration'] != 'none': | |
340 #get the coeffiecients for the RT to RI convertion | |
341 | |
342 try: | |
343 ifile = open(InputData['calibration'], 'r') | |
344 except: | |
345 print "file", InputData['calibration'], "can not be found" | |
346 sys.exit(1) | |
347 | |
348 standards = get_data(InputData['calibration'], LabelColStand) | |
349 if InputData['model'] == 'linear': | |
350 coeff = calibrate(standards) | |
351 elif InputData['model'] == 'poly': | |
352 poly_cal = calibrate_poly(standards) | |
353 else: | |
354 print "error: model ", InputData['model'], " can not be found. Use 'linear' or 'poly' " | |
355 sys.exit(1) | |
356 else: | |
357 #No file has been specified for the calibration | |
358 #Use the default coefficients | |
359 print 'No file has been specified for the calibration' | |
360 print 'WARNING: the default coefficients will be used' | |
361 coeff = array([29.4327, 454.5260]) | |
362 | |
363 if InputData['analysis_type'] == 'AMDIS': | |
364 try: | |
365 AmdisOut = open(InputData['sample'], 'r') | |
366 print("open ok") | |
367 #Specify which data to be extracted from the AMDIS output table | |
368 #Weighted and Reverse are measure of matching between the experimental | |
369 #and the library spectra. The experimental spectrum is used as template | |
370 #for the calculation of Weighted, whereas for Reverse the template is the | |
371 #library spectrum | |
372 LabelCol = ['CAS', 'Name', 'R.T.', 'Weighted', 'Reverse', 'Purity'] | |
373 | |
374 #Get the data from the AMDIS output file | |
375 HitList = get_data(InputData['sample'], LabelCol) | |
376 #Remove '>' from the names | |
377 HitList['Name'] = [s.replace('>', '') for s in HitList['Name']] | |
378 except: | |
379 print "the file", InputData['sample'], "can not be found" | |
380 sys.exit(1) | |
381 if InputData['analysis_type'] == 'NIST': | |
382 #HitList_missed - a variable of type dictionary containing the hits with the symbol ";" | |
383 #in the name | |
384 if not NDIS_is_tabular: | |
385 print "Warning; NDIS is not tabular format, reading PDF..\n" | |
386 [HitList, HitList_missed] = pdfread.getPDF(InputData['sample']) | |
387 else: | |
388 HitList = pdfread.read_tabular(InputData['sample']) | |
389 | |
390 #Convert RT to RI | |
391 if InputData['model'] == 'linear': | |
392 HitList = convert_rt(HitList, coeff) | |
393 if InputData['model'] == 'poly': | |
394 print "Executing convert_rt_poly().." | |
395 HitList = convert_rt_poly(HitList, poly_cal) | |
396 | |
397 #------Read the library data with the predicted RI------ | |
398 try: | |
399 LibData = open(InputData['lib_data'], 'r') | |
400 except: | |
401 print "the file", InputData['lib_data'], "can not be found" | |
402 sys.exit(1) | |
403 | |
404 #Specify which data to be extracted from the library data file | |
405 LabelColLib = ['CAS', 'Name', 'RIsvr', 'Synonyms'] | |
406 LibraryData = get_data(InputData['lib_data'], LabelColLib) | |
407 | |
408 #------Match the hits with the library data and rank them------ | |
409 if InputData['window'] != '': | |
410 HitList = rank_hit(HitList, LibraryData, InputData['window']) | |
411 else: | |
412 print "No value for the window used for the filtering is specified \n" | |
413 sys.exit(1) | |
414 | |
415 #------Print the ranked and filtered hits------ | |
416 #Specify which data to be printed | |
417 if InputData['analysis_type'] == 'AMDIS': | |
418 keys_to_print = ['R.T.', 'CAS', 'Name', 'Rank', 'RIexp', 'RIsvr', '%rel.err', 'Weighted', 'Reverse', 'Synonyms'] | |
419 else: | |
420 keys_to_print = ['ID', 'R.T.', 'Name', 'CAS', 'Rank', 'RIexp', 'RIsvr', '%rel.err', 'Forward', 'Reverse', 'Synonyms', 'Library'] | |
421 | |
422 #skip this error output from reading a pdftotext file when file is tabular | |
423 if InputData['analysis_type'] == 'NIST' and not NDIS_is_tabular: | |
424 out_missed_pdf = open(output_files['missed_parse_pdf'], 'w') | |
425 for x, y in zip(HitList_missed['Missed Compounds'], HitList_missed['RT missed Compounds']): | |
426 out_missed_pdf.write('%s\n' % '\t'.join([y, x])) | |
427 out_missed_pdf.close() | |
428 | |
429 print_to_file(HitList, output_files, keys_to_print, print_subsets) | |
430 | |
# Script entry point: run only when executed directly, not when imported
if __name__ == '__main__':
    main()