Mercurial > repos > eslerm > vkmz
comparison vkmz.py @ 0:0b8ddf650752 draft
planemo upload for repository https://github.com/HegemanLab/VKMZ commit 7c299d22bdce251ce599cd34df76919d297a7007-dirty
author | eslerm |
---|---|
date | Wed, 02 May 2018 18:31:06 -0400 |
parents | |
children | b02af8eb8e6e |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:0b8ddf650752 |
---|---|
1 ''' | |
2 based on the BMRB compound database which can be found at: | |
3 http://www.bmrb.wisc.edu/ftp/pub/bmrb/relational_tables/metabolomics/Chem_comp.csv | |
4 ''' | |
5 | |
6 import re | |
7 import argparse | |
8 import multiprocessing | |
9 from multiprocessing import Pool | |
10 import csv | |
11 | |
12 import numpy as np | |
13 import math | |
14 import pandas as pd | |
15 from plotly import __version__ | |
16 import plotly.offline as py | |
17 import plotly.graph_objs as go | |
18 | |
# --- command line interface and module-level configuration ---
parser = argparse.ArgumentParser()
inputSubparser = parser.add_subparsers(help='Select input type:', dest='input-type')
parse_tsv = inputSubparser.add_parser('tsv', help='Use tabular data as input.')
parse_tsv.add_argument('--input', '-i', required=True, help='Path to tabular file. Must include columns: sample ID, mz, polarity, intensity, & retention time.')
parse_tsv.add_argument('--no-plot', '-np', action='store_true', help='Disable plot generation.')
parse_xcms = inputSubparser.add_parser('xcms', help='Use XCMS data as input.')
parse_xcms.add_argument('--data-matrix', '-xd', required=True, nargs='?', type=str, help='Path to XCMS dataMatrix file.')
parse_xcms.add_argument('--sample-metadata', '-xs', required=True, nargs='?', type=str, help='Path to XCMS sampleMetadata file.')
parse_xcms.add_argument('--variable-metadata', '-xv', required=True, nargs='?', type=str, help='Path to XCMS variableMetadata file.')
parse_xcms.add_argument('--no-plot', '-n', action='store_true', help='Disable plot generation.')
parse_plot = inputSubparser.add_parser('plot', help='Only plot data.')
parse_plot.add_argument('--input', '-i', required=True, nargs='?', type=str, help='Path to VKMZ generated tabular file.')
# options shared by the tsv and xcms subcommands (the 'plot' subcommand has none of these)
for inputSubparser in [parse_tsv, parse_xcms]:
    inputSubparser.add_argument('--output', '-o', nargs='?', type=str, required=True, help='Specify output file path.')
    inputSubparser.add_argument('--error', '-e', nargs='?', type=float, required=True, help='Mass error of mass spectrometer in PPM')
    inputSubparser.add_argument('--database', '-d', nargs='?', default='databases/bmrb-light.tsv', help='Select database.')
    inputSubparser.add_argument('--directory', nargs='?', default='', type=str, help='Define directory of tool.')
    inputSubparser.add_argument('--no-adjustment', '-na', action='store_true', help='Use flag to turn off polarity based mass adjustment. This flag should always be used if reprocessing data generated by VKMZ.')
    inputSubparser.add_argument('--multiprocessing', '-m', action='store_true', help='Use flag to turn on multiprocessing.')
    inputSubparser.add_argument('--plottype', '-p', nargs='?', default='scatter-2d', choices=['scatter-2d', 'scatter-3d'], help='Select plot type.')
    inputSubparser.add_argument('--size', '-s', nargs='?', default=5, type=int, help='Set size of of dots. size+2*log(size*peak/(highest_peak/lowest_peak')
    inputSubparser.add_argument('--size-algorithm', '-a', nargs='?', default=0, type=int, choices=[0,1,2],help='Size algorithm selector. Algo 0: size, Algo 1: size+2*log(size*peak/(highest_peak/lowest_peak, Algo 2: size+2*size*peak/(highest_peak-lowest_peak)')
args = parser.parse_args()

# which subcommand ran: 'tsv', 'xcms', or 'plot'
vkInputType = getattr(args, "input-type")

# read inputs, arguments and define globals.
# bug fix: the 'plot' subcommand does not define these options, so a bare
# getattr(args, ...) raised AttributeError before any work was done; supply
# the subparser defaults so plot-only mode can start.
vkError = getattr(args, "error", None)

vkMultiprocessing = getattr(args, "multiprocessing", False)

vkNoAdjustment = getattr(args, "no_adjustment", False)

vkDatabaseFile = getattr(args, "database", 'databases/bmrb-light.tsv')
vkDirectory = getattr(args, "directory", '')

# known masses and their formulas, parallel lists sorted as stored in the
# database file (binary search in predict() relies on that ordering)
vkMass = []
vkFormula = []
try:
    with open(vkDirectory+vkDatabaseFile, 'r') as tsv:
        next(tsv)  # skip first row (header)
        for row in tsv:
            mass, formula = row.split()
            vkMass.append(mass)
            vkFormula.append(formula)
# bug fix: a missing or unreadable database file raised an uncaught IOError;
# report it the same way as a malformed one.
except (ValueError, IOError):
    print('The %s database could not be loaded.' % vkDatabaseFile)
vkMaxIndex = len(vkMass)-1

vkOutput = getattr(args, "output", "vkmz")

vkPlotType = getattr(args, 'plottype', 'scatter-2d')

vkSize = getattr(args, 'size', 5)

vkSizeAlgo = getattr(args, 'size_algorithm', 0)
# control predictions
def forecaster(vkInput):
    """Run featurePrediction over every feature, serially or in a Pool.

    vkInput: list of feature lists ([sample, polarity, mz, rt, intensity, []]).
    Returns the list of predicted features, dropping features for which
    featurePrediction returned None (no formula within error).
    """
    if vkMultiprocessing:
        # bug fixes: if Pool() itself raised, the finally block hit a
        # NameError on `pool`; and on any worker error vkOutputList was
        # unbound at the filtering step below.
        pool = None
        vkOutputList = []
        try:
            pool = Pool()
            vkOutputList = pool.map(featurePrediction, vkInput)
        except Exception as e:
            # typo fix in message: "multirpocessing" -> "multiprocessing"
            print("Error during multiprocessing: "+str(e))
        finally:
            if pool is not None:
                pool.close()
                pool.join()
    else:
        vkOutputList = map(featurePrediction, vkInput)
    # remove features without a prediction
    vkOutputList = [x for x in vkOutputList if x is not None]
    return(vkOutputList)
91 | |
# predict feature formulas and creates output list
def featurePrediction(feature):
    """Predict a molecular formula for one feature.

    feature: [sample, polarity, mz, rt, intensity, []].  Returns the feature
    extended with [predictions, delta, H:C, O:C, N:C], or None when no known
    mass lies within the mass-error window (forecaster filters the Nones).
    """
    if vkNoAdjustment:
        mass = feature[2]
    else:
        mass = adjust(feature[2], feature[1])  # mz & polarity
    # absolute mass tolerance from the instrument's ppm error
    uncertainty = mass * vkError / 1e6
    prediction = predict(mass, uncertainty, 0, vkMaxIndex)
    if prediction == -1:
        return None
    feature[2] = mass
    predictions = predictNeighbors(mass, uncertainty, prediction)
    feature[5] = predictions
    predictionClosest = predictions[0]
    formula = predictionClosest[1]
    # tokenize e.g. "C6H12O6" -> ['C','6','H','12','O','6']
    formulaList = re.findall('[A-Z][a-z]?|[0-9]+', formula)
    formulaDictionary = {'C': 0, 'H': 0, 'O': 0, 'N': 0}
    i = 0
    while i < len(formulaList):
        element = formulaList[i]
        if element in formulaDictionary:
            # an element symbol not followed by a number occurs once
            if i+1 == len(formulaList) or formulaList[i+1].isalpha():
                formulaDictionary[element] = 1
            else:
                # bug fix: store the count as int rather than the raw string
                formulaDictionary[element] = int(formulaList[i+1])
                i += 1  # skip the consumed count token
        i += 1
    predictionClosest.append(formulaDictionary)
    carbon = float(formulaDictionary['C'])
    # bug fix: a carbon-free formula crashed the run with ZeroDivisionError;
    # such features cannot be plotted on a Van Krevelen diagram, so skip them
    if carbon == 0:
        return None
    hc = float(formulaDictionary['H'])/carbon
    oc = float(formulaDictionary['O'])/carbon
    nc = float(formulaDictionary['N'])/carbon
    predictionClosestDelta = feature[5][0][2]
    feature += [predictionClosestDelta, hc, oc, nc]
    return(feature)
125 | |
# adjust observed mass to a neutral mass
def adjust(mass, polarity):
    """Neutralize an observed m/z by removing the charge carrier.

    Positive-mode ions gained a proton, negative-mode ions lost one;
    any other polarity string leaves the mass untouched.
    """
    proton = 1.007276  # proton mass used for the adjustment
    if polarity == 'positive':
        return mass + proton
    if polarity == 'negative':
        return mass - proton
    return mass
135 | |
# Binary search to match observed mass to known mass within error
# https://en.wikipedia.org/wiki/Binary_search_tree
def predict(mass, uncertainty, left, right):
    """Return an index into vkMass whose value is within `uncertainty`
    of `mass`, or -1 when no known mass matches.

    vkMass must be sorted ascending; left/right are inclusive bounds.
    """
    # bug fix: '/' yields a float under Python 3, making vkMass[mid] raise
    # TypeError; integer floor division is correct on both Python 2 and 3
    mid = ((right - left) // 2) + left
    if left <= mid <= right and mid <= vkMaxIndex:
        delta = float(vkMass[mid]) - mass
        if uncertainty >= abs(delta):
            return mid
        elif uncertainty < delta:
            # known mass too heavy: search the lower half
            return predict(mass, uncertainty, left, mid-1)
        else:
            # known mass too light: search the upper half
            return predict(mass, uncertainty, mid+1, right)
    return -1
149 | |
# find and sort known masses within error limit of observed mass
def predictNeighbors(mass, uncertainty, prediction):
    """Collect every database entry within `uncertainty` of `mass`.

    Starting from the matched index `prediction`, scan outward in both
    directions until the first entry outside the tolerance, then return
    [known_mass, formula, delta] triples sorted by |delta| (closest first).
    """
    matchDelta = float(vkMass[prediction]) - mass
    neighbors = [[vkMass[prediction], vkFormula[prediction], matchDelta]]
    # walk upward through heavier known masses
    index = prediction + 1
    while index <= vkMaxIndex:
        delta = float(vkMass[index]) - mass
        if abs(delta) > uncertainty:
            break
        neighbors.append([vkMass[index], vkFormula[index], delta])
        index += 1
    # walk downward through lighter known masses
    index = prediction - 1
    while index >= 0:
        delta = float(vkMass[index]) - mass
        if abs(delta) > uncertainty:
            break
        neighbors.append([vkMass[index], vkFormula[index], delta])
        index -= 1
    neighbors.sort(key=lambda entry: abs(entry[2]))
    return neighbors
173 | |
# write output file
def saveForcast(vkOutputList):
    """Write predicted features as tab-separated rows to vkOutput + '.tsv'.

    Each feature contributes: sample_id, polarity, mz, retention_time,
    intensity, predictions, delta, H:C, O:C, N:C.  (The trailing tab before
    the newline is kept for compatibility with existing consumers.)
    """
    try:
        with open(vkOutput+'.tsv', 'w') as f:
            f.writelines(str("sample_id\tpolarity\tmz\tretention_time\tintensity\tpredictions\tdelta\tH:C\tO:C\tN:C") + '\n')
            for feature in vkOutputList:
                f.writelines(feature[0]+'\t'+feature[1]+'\t'+str(feature[2])+'\t'+str(feature[3])+'\t'+str(feature[4])+'\t'+str(feature[5])+'\t'+str(feature[6])+'\t'+str(feature[7])+'\t'+str(feature[8])+'\t'+str(feature[9])+'\t'+'\n')
    except ValueError:
        # bug fix: the handler referenced an undefined name `filename`,
        # turning a write failure into a NameError
        print('"%s" could not be saved.' % (vkOutput+'.tsv'))
183 | |
def plotRatios(vkData):
    """Render an interactive Van Krevelen diagram with plotly.

    vkData rows: [sampleID, polarity, mz, rt, intensity, predictions,
    delta, hc, oc, nc].  Reads module globals vkSizeAlgo, vkSize and
    vkOutput; writes '<vkOutput>.html' and returns None.
    """
    # first pass over all rows to find global ranges for axes, the retention
    # time colorbar, and intensity-based marker sizing
    max_rt = 0
    # NOTE(review): min_intensity and max_nc are computed but never used
    # below — presumably intended for another size algorithm or the
    # 'scatter-3d' plot type; confirm before removing.
    min_intensity = 10.0**10
    max_intensity = 0.0
    max_hc = 0
    max_oc = 0
    max_nc = 0
    for row in vkData:
        if row[3] > max_rt:
            max_rt = row[3]
        intensity = float(row[4])
        if intensity < min_intensity:
            min_intensity = intensity
        if intensity > max_intensity:
            max_intensity = intensity
        if row[7] > max_hc:
            max_hc = row[7]
        if row[8] > max_oc:
            max_oc = row[8]
        if row[9] > max_nc:
            max_nc = row[9]
    labels = ['sampleID', 'polarity', 'mz', 'rt', 'intensity', 'predictions', 'delta', 'hc', 'oc', 'nc']
    df = pd.DataFrame.from_records(vkData, columns=labels)
    sampleIDs = df.sampleID.unique()
    data = []   # one Scatter trace per sample
    menus = []  # one dropdown button per sample toggling trace visibility
    i = 0
    for sampleID in sampleIDs:
        dfSample = df.loc[df['sampleID'] == sampleID]
        # marker size: algorithm 0 is a constant size; otherwise scale
        # linearly with intensity relative to the global maximum.
        # NOTE(review): --size-algorithm advertises algorithms 1 and 2 with
        # different formulas, but only this single non-constant branch is
        # implemented — confirm intent.
        if vkSizeAlgo == 0:
            size = dfSample.intensity.apply(lambda x: vkSize)
        else:
            size = dfSample.intensity.apply(lambda x: vkSize+4*vkSize*float(x)/max_intensity)
        trace = go.Scatter(
            x = dfSample.oc,
            y = dfSample.hc,
            # hover text built from the closest prediction: formula, mass, delta
            text = dfSample.predictions.apply(lambda x: "Prediction: "+str(x[0][1])+"<br>mz: " +str(x[0][0])+"<br>Delta: "+str(x[0][2])),
            line = dict(width = 0.5),
            mode = 'markers',
            marker = dict(
                size = size,
                color = dfSample.rt,  # color markers by retention time
                colorscale = 'Viridis',
                cmin = 0,
                cmax = max_rt,
                colorbar=dict(title='Retention Time (s)'),
                line = dict(width = 0.5),
                showscale = True
            ),
            opacity = 0.8
        )
        data.append(trace)
        # visibility mask: only the i-th sample's trace shown for this button
        vision = []
        j = 0
        while j < len(sampleIDs):
            if j != i:
                vision.append(False)
            else:
                vision.append(True)
            j += 1
        menu = dict(
            method = 'update',
            label = sampleID,
            args = [{'visible': vision}, {'title': sampleID}]
        )
        menus.append(menu)
        i += 1
    updatemenus = list([
        dict(
            active = -1,  # no sample pre-selected in the dropdown
            buttons = menus
        )
    ])
    layout = go.Layout(
        title = "Van Krevelen Diagram",
        showlegend = False,
        xaxis = dict(
            title = 'Oxygen to Carbon Ratio',
            zeroline = False,
            gridcolor = 'rgb(183,183,183)',
            showline = True,
            range = [0, max_oc]
        ),
        yaxis = dict(
            title = 'Hydrogen to Carbon Ratio',
            zeroline = False,
            gridcolor = 'rgb(183,183,183)',
            showline = True,
            range = [0, max_hc]
        ),
        margin = dict(r=0, b=100, l=100, t=100),
        updatemenus = updatemenus
    )
    fig = go.Figure(data=data, layout=layout)
    # write a standalone offline HTML file next to the tabular output
    py.plot(fig, auto_open=False, show_link=False, filename=vkOutput+'.html')
279 | |
def polaritySanitizer(sample_polarity):
    """Normalize a polarity label to 'positive' or 'negative'.

    Accepts common spellings case-insensitively ('pos', '+', 'NEG', ...);
    raises ValueError after printing a message for anything else.
    """
    label = sample_polarity.lower()
    if label in {'positive', 'pos', '+'}:
        return 'positive'
    if label in {'negative', 'neg', '-'}:
        return 'negative'
    print('A sample has an unknown polarity type: %s. Polarity in the XCMS sample metadata should be set to "negative" or "positive".' % sample_polarity)
    raise ValueError
289 | |
# main: dispatch on the chosen subcommand
if vkInputType == "tsv":
    # tabular input: one feature per row
    vkInput = []
    tsvFile = getattr(args, "input")
    try:
        with open(tsvFile, 'r') as f:
            next(f)  # skip header line
            tsvData = csv.reader(f, delimiter='\t')
            for row in tsvData:
                # [sample, polarity, mz, rt, intensity, predictions]
                vkInput.append([row[0], polaritySanitizer(row[1]), float(row[2]), float(row[3]), float(row[4]), []])
    except ValueError:
        print('The %s data file could not be read.' % tsvFile)
    vkData = forecaster(vkInput)
    saveForcast(vkData)
    # bug fix: --no-plot was parsed but never honored
    if not getattr(args, "no_plot", False):
        plotRatios(vkData)
elif vkInputType == "xcms":
    # XCMS input: three files joined by sample name and variable name
    vkInput = []
    xcmsSampleMetadataFile = getattr(args, "sample_metadata")
    try:
        polarity = {}  # sample name -> 'positive'/'negative'
        with open(xcmsSampleMetadataFile, 'r') as f:
            xcmsSampleMetadata = csv.reader(f, delimiter='\t')
            next(xcmsSampleMetadata, None)  # skip header
            for row in xcmsSampleMetadata:
                sample = row[0]
                sample_polarity = polaritySanitizer(row[2])
                polarity[sample] = sample_polarity
    except ValueError:
        print('The %s data file could not be read. Check that polarity is set to "negative" or "positive"' % xcmsSampleMetadataFile)
    xcmsVariableMetadataFile = getattr(args, "variable_metadata")
    try:
        mz = {}  # variable name -> m/z
        rt = {}  # variable name -> retention time
        variable_index = {}
        mz_index = int()
        rt_index = int()
        with open(xcmsVariableMetadataFile, 'r') as f:
            xcmsVariableMetadata = csv.reader(f, delimiter='\t')
            i = 0
            for row in xcmsVariableMetadata:
                if i != 0:
                    mz[row[0]] = float(row[mz_index])
                    rt[row[0]] = float(row[rt_index])
                else:
                    # header row: map column names to their positions
                    for column in row:
                        variable_index[column] = i
                        i += 1
                    mz_index = variable_index["mz"]
                    rt_index = variable_index["rt"]
    except ValueError:
        print('The %s data file could not be read.' % xcmsVariableMetadataFile)
    xcmsDataMatrixFile = getattr(args, "data_matrix")
    try:
        with open(xcmsDataMatrixFile, 'r') as f:
            xcmsDataMatrix = csv.reader(f, delimiter='\t')
            first_row = True
            for row in xcmsDataMatrix:
                if first_row:
                    sample_id = row  # header row holds sample names
                    first_row = False
                else:
                    # column 0 is the variable name; remaining cells are
                    # per-sample intensities
                    for i in range(1, len(row)):
                        intensity = row[i]
                        if intensity not in {'NA', '#DIV/0!', '0'}:
                            variable = row[0]
                            sample = sample_id[i]
                            # XCMS data may include empty columns
                            if sample != "":
                                vkInput.append([sample, polarity[sample], mz[variable], rt[variable], float(intensity), []])
    except ValueError:
        print('The %s data file could not be read.' % xcmsDataMatrixFile)
    vkData = forecaster(vkInput)
    saveForcast(vkData)
    # bug fix: --no-plot was parsed but never honored
    if not getattr(args, "no_plot", False):
        plotRatios(vkData)
else:
    # plot-only mode: re-read a VKMZ-generated tabular file
    import ast  # to parse the stringified predictions column back to a list
    vkData = []
    # bug fix: variable was assigned as `tsvPlotvFile` but read as
    # `tsvPlotFile`, so plot mode always crashed with NameError
    tsvPlotFile = getattr(args, "input")
    try:
        with open(tsvPlotFile, 'r') as f:
            next(f)  # skip header line
            plotData = csv.reader(f, delimiter='\t')
            for row in plotData:
                # bug fix: columns were shifted by one (delta/hc/oc/nc read
                # from the wrong fields and predictions rebuilt from the
                # intensity string); match saveForcast's column order
                vkData.append([row[0], row[1], float(row[2]), float(row[3]), float(row[4]), ast.literal_eval(row[5]), float(row[6]), float(row[7]), float(row[8]), float(row[9])])
    except ValueError:
        # bug fix: handler printed undefined name `tsvFile`
        print('The %s data file could not be read.' % tsvPlotFile)
    plotRatios(vkData)
381 |