comparison library_lookup.py @ 0:9d5f4f5f764b

Initial commit to toolshed
author pieter.lukasse@wur.nl
date Thu, 16 Jan 2014 13:10:00 +0100
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:9d5f4f5f764b
1 '''
2 Logic for searching a Retention Index database file given output from NIST
3 '''
4 import match_library
5 import re
6 import sys
7 import csv
8
9 __author__ = "Marcel Kempenaar"
10 __contact__ = "brs@nbic.nl"
11 __copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre"
12 __license__ = "MIT"
13
14 def create_lookup_table(library_file, column_type_name, statphase):
15 '''
16 Creates a dictionary holding the contents of the library to be searched
17 @param library_file: library to read
18 @param column_type_name: the columns type name
19 @param statphase: the columns stationary phase
20 '''
21 (data, header) = match_library.read_library(library_file)
22 # Test for presence of required columns
23 if ('columntype' not in header or
24 'columnphasetype' not in header or
25 'cas' not in header):
26 raise IOError('Missing columns in ', library_file)
27
28 column_type_column = header.index("columntype")
29 statphase_column = header.index("columnphasetype")
30 cas_column = header.index("cas")
31
32 filtered_library = [line for line in data if line[column_type_column] == column_type_name
33 and line[statphase_column] == statphase]
34 lookup_dict = {}
35 for element in filtered_library:
36 # Here the cas_number is set to the numeric part of the cas_column value, so if the
37 # cas_column value is 'C1433' then cas_number will be '1433'
38 cas_number = str(re.findall(r'\d+', (element[cas_column]).strip())[0])
39 try:
40 lookup_dict[cas_number].append(element)
41 except KeyError:
42 lookup_dict[cas_number] = [element]
43 return lookup_dict
44
45
46 def _preferred(hits, pref, ctype, polar, model, method):
47 '''
48 Returns all entries in the lookup_dict that have the same column name, type and polarity
49 as given by the user, uses regression if selected given the model and method to use. The
50 regression is applied on the column with the best R-squared value in the model
51 @param hits: all entries in the lookup_dict for the given CAS number
52 @param pref: preferred GC-column, can be one or more names
53 @param ctype: column type (capillary etc.)
54 @param polar: polarity (polar / non-polar etc.)
55 @param model: data loaded from file containing regression models
56 @param method: supported regression method (i.e. poly(nomial) or linear)
57 '''
58 match = []
59 for column in pref:
60 for hit in hits:
61 if hit[4] == ctype and hit[5] == polar and hit[6] == column:
62 # Create copy of found hit since it will be altered downstream
63 match.extend(hit)
64 return match, False
65
66 # No hit found for current CAS number, return if not performing regression
67 if not model:
68 return False, False
69
70 # Perform regression
71 for column in pref:
72 if column not in model:
73 break
74 # Order regression candidates by R-squared value (last element)
75 order = sorted(model[column].items(), key=lambda col: col[1][-1])
76 # Create list of regression candidate column names
77 regress_columns = list(reversed([column for (column, _) in order]))
78 # Names of available columns
79 available = [hit[6] for hit in hits]
80
81 # TODO: combine Rsquared and number of datapoints to get the best regression match
82 '''
83 # Iterate regression columns (in order) and retrieve their models
84 models = {}
85 for col in regress_columns:
86 if col in available:
87 hit = list(hits[available.index(col)])
88 if hit[4] == ctype:
89 # models contains all model data including residuals [-2] and rsquared [-1]
90 models[pref[0]] = model[pref[0]][hit[6]]
91 # Get the combined maximum for residuals and rsquared
92 best_match = models[]
93 # Apply regression
94 if method == 'poly':
95 regressed = _apply_poly_regression(best_match, hit[6], float(hit[3]), model)
96 if regressed:
97 hit[3] = regressed
98 else:
99 return False, False
100 else:
101 hit[3] = _apply_linear_regression(best_match, hit[6], float(hit[3]), model)
102 match.extend(hit)
103 return match, hit[6]
104 '''
105
106 for col in regress_columns:
107 if col in available:
108 hit = list(hits[available.index(col)])
109 if hit[4] == ctype:
110 # Perform regression using a column for which regression is possible
111 if method == 'poly':
112 # Polynomial is only possible within a set border, if the RI falls outside
113 # of this border, skip this lookup
114 regressed = _apply_poly_regression(pref[0], hit[6], float(hit[3]), model)
115 if regressed:
116 hit[3] = regressed
117 else:
118 return False, False
119 else:
120 hit[3] = _apply_linear_regression(pref[0], hit[6], float(hit[3]), model)
121 match.extend(hit)
122 return match, hit[6]
123
124 return False, False
125
126
127
128 def default_hit(row, cas_nr, compound_id):
129 '''
130 This method will return a "default"/empty hit for cases where the
131 method _preferred() returns False (i.e. a RI could not be found
132 for the given cas nr, also not via regression.
133 '''
134 return [
135 #'CAS',
136 'C' + cas_nr,
137 #'NAME',
138 '',
139 #'FORMULA',
140 '',
141 #'RI',
142 '0.0',
143 #'Column.type',
144 '',
145 #'Column.phase.type',
146 '',
147 #'Column.name',
148 '',
149 #'phase.coding',
150 ' ',
151 #'CAS_column.Name',
152 '',
153 #'Centrotype', -> NOTE THAT compound_id is not ALWAYS centrotype...depends on MsClust algorithm used...for now only one MsClust algorithm is used so it is not an issue, but this should be updated/corrected once that changes
154 compound_id,
155 #'Regression.Column.Name',
156 '',
157 #'min',
158 '',
159 #'max',
160 '',
161 #'nr.duplicates',
162 '']
163
164
165 def format_result(lookup_dict, nist_tabular_filename, pref, ctype, polar, model, method):
166 '''
167 Looks up the compounds in the library lookup table and formats the results
168 @param lookup_dict: dictionary containing the library to be searched
169 @param nist_tabular_filename: NIST output file to be matched
170 @param pref: (list of) column-name(s) to look for
171 @param ctype: column type of interest
172 @param polar: polarity of the used column
173 @param model: data loaded from file containing regression models
174 @param method: supported regression method (i.e. poly(nomial) or linear)
175 '''
176 (nist_tabular_list, header_clean) = match_library.read_library(nist_tabular_filename)
177 # Retrieve indices of the CAS and compound_id columns (exit if not present)
178 try:
179 casi = header_clean.index("cas")
180 idi = header_clean.index("id")
181 except:
182 raise IOError("'CAS' or 'compound_id' not found in header of library file")
183
184 data = []
185 for row in nist_tabular_list:
186 casf = str(row[casi].replace('-', '').strip())
187 compound_id = str(row[idi].split('-')[0])
188 if casf in lookup_dict:
189 found_hit, regress = _preferred(lookup_dict[casf], pref, ctype, polar, model, method)
190 if found_hit:
191 # Keep cas nr as 'C'+ numeric part:
192 found_hit[0] = 'C' + casf
193 # Add compound id
194 found_hit.insert(9, compound_id)
195 # Add information on regression process
196 found_hit.insert(10, regress if regress else 'None')
197 # Replace column index references with actual number of duplicates
198 dups = len(found_hit[-1].split(','))
199 if dups > 1:
200 found_hit[-1] = str(dups + 1)
201 else:
202 found_hit[-1] = '0'
203 data.append(found_hit)
204 found_hit = ''
205 else:
206 data.append(default_hit(row, casf, compound_id))
207 else:
208 data.append(default_hit(row, casf, compound_id))
209
210 casf = ''
211 compound_id = ''
212 found_hit = []
213 dups = []
214 return data
215
216
217 def _save_data(content, outfile):
218 '''
219 Write to output file
220 @param content: content to write
221 @param outfile: file to write to
222 '''
223 # header
224 header = ['CAS',
225 'NAME',
226 'FORMULA',
227 'RI',
228 'Column.type',
229 'Column.phase.type',
230 'Column.name',
231 'phase.coding',
232 'CAS_column.Name',
233 'Centrotype',
234 'Regression.Column.Name',
235 'min',
236 'max',
237 'nr.duplicates']
238 output_handle = csv.writer(open(outfile, 'wb'), delimiter="\t")
239 output_handle.writerow(header)
240 for entry in content:
241 output_handle.writerow(entry)
242
243
244 def _read_model(model_file):
245 '''
246 Creates an easy to search dictionary for getting the regression parameters
247 for each valid combination of GC-columns
248 @param model_file: filename containing the regression models
249 '''
250 regress = list(csv.reader(open(model_file, 'rU'), delimiter='\t'))
251 if len(regress.pop(0)) > 9:
252 method = 'poly'
253 else:
254 method = 'linear'
255
256 model = {}
257 # Create new dictionary for each GC-column
258 for line in regress:
259 model[line[0]] = {}
260
261 # Add data
262 for line in regress:
263 if method == 'poly':
264 model[line[0]][line[1]] = [float(col) for col in line[2:11]]
265 else: # linear
266 model[line[0]][line[1]] = [float(col) for col in line[2:9]]
267
268 return model, method
269
270
271 def _apply_poly_regression(column1, column2, retention_index, model):
272 '''
273 Calculates a new retention index (RI) value using a given 3rd-degree polynomial
274 model based on data from GC columns 1 and 2
275 @param column1: name of the selected GC-column
276 @param column2: name of the GC-column to use for regression
277 @param retention_index: RI to convert
278 @param model: dictionary containing model information for all GC-columns
279 '''
280 coeff = model[column1][column2]
281 # If the retention index to convert is within range of the data the model is based on, perform regression
282 if coeff[4] < retention_index < coeff[5]:
283 return (coeff[3] * (retention_index ** 3) + coeff[2] * (retention_index ** 2) +
284 (retention_index * coeff[1]) + coeff[0])
285 else:
286 return False
287
288
289 def _apply_linear_regression(column1, column2, retention_index, model):
290 '''
291 Calculates a new retention index (RI) value using a given linear model based on data
292 from GC columns 1 and 2
293 @param column1: name of the selected GC-column
294 @param column2: name of the GC-column to use for regression
295 @param retention_index: RI to convert
296 @param model: dictionary containing model information for all GC-columns
297 '''
298 # TODO: No use of limits
299 coeff = model[column1][column2]
300 return coeff[1] * retention_index + coeff[0]
301
302
303 def main():
304 '''
305 Library Lookup main function
306 '''
307 library_file = sys.argv[1]
308 nist_tabular_filename = sys.argv[2]
309 ctype = sys.argv[3]
310 polar = sys.argv[4]
311 outfile = sys.argv[5]
312 pref = sys.argv[6:-1]
313 regress = sys.argv[-1]
314
315 if regress != 'False':
316 model, method = _read_model(regress)
317 else:
318 model, method = False, None
319
320 lookup_dict = create_lookup_table(library_file, ctype, polar)
321 data = format_result(lookup_dict, nist_tabular_filename, pref, ctype, polar, model, method)
322
323 _save_data(data, outfile)
324
325
326 if __name__ == "__main__":
327 main()