comparison vkmz.py @ 0:0b8ddf650752 draft

planemo upload for repository https://github.com/HegemanLab/VKMZ commit 7c299d22bdce251ce599cd34df76919d297a7007-dirty
author eslerm
date Wed, 02 May 2018 18:31:06 -0400
parents
children b02af8eb8e6e
'''
Predictions are based on the BMRB compound database, which can be found at:
http://www.bmrb.wisc.edu/ftp/pub/bmrb/relational_tables/metabolomics/Chem_comp.csv
'''

import re
import ast  # used to parse saved prediction lists when replotting VKMZ output
import argparse
import multiprocessing
from multiprocessing import Pool
import csv

import numpy as np
import math
import pandas as pd
from plotly import __version__
import plotly.offline as py
import plotly.graph_objs as go

parser = argparse.ArgumentParser()
inputSubparser = parser.add_subparsers(help='Select input type:', dest='input-type')
parse_tsv = inputSubparser.add_parser('tsv', help='Use tabular data as input.')
parse_tsv.add_argument('--input', '-i', required=True, help='Path to tabular file. Must include columns: sample ID, polarity, mz, retention time, & intensity.')
parse_tsv.add_argument('--no-plot', '-np', action='store_true', help='Disable plot generation.')
parse_xcms = inputSubparser.add_parser('xcms', help='Use XCMS data as input.')
parse_xcms.add_argument('--data-matrix', '-xd', required=True, nargs='?', type=str, help='Path to XCMS dataMatrix file.')
parse_xcms.add_argument('--sample-metadata', '-xs', required=True, nargs='?', type=str, help='Path to XCMS sampleMetadata file.')
parse_xcms.add_argument('--variable-metadata', '-xv', required=True, nargs='?', type=str, help='Path to XCMS variableMetadata file.')
parse_xcms.add_argument('--no-plot', '-n', action='store_true', help='Disable plot generation.')
parse_plot = inputSubparser.add_parser('plot', help='Only plot data.')
parse_plot.add_argument('--input', '-i', required=True, nargs='?', type=str, help='Path to VKMZ generated tabular file.')
for subparser in [parse_tsv, parse_xcms]:
    subparser.add_argument('--output', '-o', nargs='?', type=str, required=True, help='Specify output file path.')
    subparser.add_argument('--error', '-e', nargs='?', type=float, required=True, help='Mass error of the mass spectrometer in PPM.')
    subparser.add_argument('--database', '-d', nargs='?', default='databases/bmrb-light.tsv', help='Select database.')
    subparser.add_argument('--directory', nargs='?', default='', type=str, help='Define the directory of the tool.')
    subparser.add_argument('--no-adjustment', '-na', action='store_true', help='Turn off polarity-based mass adjustment. This flag should always be used when reprocessing data generated by VKMZ.')
    subparser.add_argument('--multiprocessing', '-m', action='store_true', help='Turn on multiprocessing.')
    subparser.add_argument('--plottype', '-p', nargs='?', default='scatter-2d', choices=['scatter-2d', 'scatter-3d'], help='Select plot type.')
    subparser.add_argument('--size', '-s', nargs='?', default=5, type=int, help='Set the base size of plot markers.')
    subparser.add_argument('--size-algorithm', '-a', nargs='?', default=0, type=int, choices=[0, 1, 2], help='Marker size algorithm. 0: fixed size, 1: size+2*log(size*peak/(highest_peak/lowest_peak)), 2: size+2*size*peak/(highest_peak-lowest_peak)')
args = parser.parse_args()
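
# Example invocations (illustrative only; file names are hypothetical):
#   python vkmz.py tsv -i features.tsv -o results -e 5
#   python vkmz.py xcms -xd dataMatrix.tsv -xs sampleMetadata.tsv -xv variableMetadata.tsv -o results -e 3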

vkInputType = getattr(args, "input-type")

# read inputs, arguments and define globals
vkError = getattr(args, "error")

vkMultiprocessing = getattr(args, "multiprocessing")

vkNoAdjustment = getattr(args, "no_adjustment")

vkDatabaseFile = getattr(args, "database")
vkDirectory = getattr(args, "directory")

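# Each database row below the header is expected to hold a neutral monoisotopic
# mass and a molecular formula separated by whitespace, e.g. (hypothetical row):
#   180.063388    C6H12O6
# Masses are assumed to be sorted in ascending order so that predict() can
# binary search them.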
vkMass = []
vkFormula = []
try:
    with open(vkDirectory+vkDatabaseFile, 'r') as tsv:
        next(tsv) # skip first row
        for row in tsv:
            mass, formula = row.split()
            vkMass.append(mass)
            vkFormula.append(formula)
except ValueError:
    print('The %s database could not be loaded.' % vkDatabaseFile)
vkMaxIndex = len(vkMass)-1

vkOutput = getattr(args, "output")

vkPlotType = getattr(args, 'plottype')

vkSize = getattr(args, 'size')

vkSizeAlgo = getattr(args, 'size_algorithm')

# control predictions
def forecaster(vkInput):
    if vkMultiprocessing:
        pool = Pool()
        try:
            vkOutputList = pool.map(featurePrediction, vkInput)
        except Exception as e:
            print("Error during multiprocessing: "+str(e))
        finally:
            pool.close()
            pool.join()
    else:
        vkOutputList = map(featurePrediction, vkInput)
    vkOutputList = [x for x in vkOutputList if x is not None]
    return(vkOutputList)

# predict feature formulas and create output list
def featurePrediction(feature):
    if vkNoAdjustment:
        mass = feature[2]
    else:
        mass = adjust(feature[2], feature[1]) # mz & polarity
    uncertainty = mass * vkError / 1e6
    prediction = predict(mass, uncertainty, 0, vkMaxIndex)
    if prediction != -1: # a match was found within the mass error
        feature[2] = mass
        predictions = predictNeighbors(mass, uncertainty, prediction)
        feature[5] = predictions
        predictionClosest = predictions[0]
        formula = predictionClosest[1]
        # split formula into element and count tokens, e.g. 'C6H12O6' -> ['C','6','H','12','O','6']
        formulaList = re.findall('[A-Z][a-z]?|[0-9]+', formula)
        formulaDictionary = {'C':0,'H':0,'O':0,'N':0}
        i = 0
        while i < len(formulaList):
            if formulaList[i] in formulaDictionary:
                # if there is only one of this element
                if i+1 == len(formulaList) or formulaList[i+1].isalpha():
                    formulaDictionary[formulaList[i]] = 1
                else:
                    formulaDictionary[formulaList[i]] = formulaList[i+1]
                    i += 1
            i += 1
        predictionClosest.append(formulaDictionary)
        hc = float(formulaDictionary['H'])/float(formulaDictionary['C'])
        oc = float(formulaDictionary['O'])/float(formulaDictionary['C'])
        nc = float(formulaDictionary['N'])/float(formulaDictionary['C'])
        predictionClosestDelta = feature[5][0][2]
        feature += [predictionClosestDelta, hc, oc, nc]
        return(feature)

# adjust observed mass to a neutral mass
def adjust(mass, polarity):
    # value to adjust by
    proton = 1.007276
    if polarity == 'positive':
        mass += proton
    elif polarity == 'negative':
        mass -= proton
    return mass
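
# adjust() example: a feature observed at m/z 100.000000 in negative mode becomes
# 100.000000 - 1.007276 = 98.992724 before database matching.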

# Binary search to match observed mass to known mass within error
# https://en.wikipedia.org/wiki/Binary_search_tree
def predict(mass, uncertainty, left, right):
    mid = ((right - left) // 2) + left
    if left <= mid <= right and mid <= vkMaxIndex:
        delta = float(vkMass[mid]) - mass
        if uncertainty >= abs(delta):
            return mid
        elif uncertainty < delta:
            return predict(mass, uncertainty, left, mid-1)
        else:
            return predict(mass, uncertainty, mid+1, right)
    return -1
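
# predict() example (hypothetical values): with vkMass sorted in ascending order,
# predict(180.0634, 0.0009, 0, vkMaxIndex) returns the index of a database mass
# within +/-0.0009 of 180.0634, or -1 if no entry is that close.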

# find and sort known masses within error limit of observed mass
def predictNeighbors(mass, uncertainty, prediction):
    i = 0
    neighbors = [[vkMass[prediction],vkFormula[prediction],(float(vkMass[prediction])-mass)],]
    while prediction+i+1 <= vkMaxIndex:
        neighbor = prediction+i+1
        delta = float(vkMass[neighbor])-mass
        if uncertainty >= abs(delta):
            neighbors.append([vkMass[neighbor],vkFormula[neighbor],delta])
            i += 1
        else:
            break
    i = 0
    while prediction+i-1 >= 0:
        neighbor = prediction+i-1
        delta = float(vkMass[neighbor])-mass
        if uncertainty >= abs(delta):
            neighbors.append([vkMass[neighbor],vkFormula[neighbor],delta])
            i -= 1
        else:
            break
    neighbors = sorted(neighbors, key=(lambda neighbor: abs(neighbor[2])))
    return neighbors
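
# predictNeighbors() returns a list of [known_mass, formula, delta] entries
# sorted by abs(delta), closest match first; featurePrediction() keeps the whole
# list and uses the first entry for the element ratios.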

# write output file
def saveForecast(vkOutputList):
    try:
        with open(vkOutput+'.tsv', 'w') as f:
            f.write("sample_id\tpolarity\tmz\tretention_time\tintensity\tpredictions\tdelta\tH:C\tO:C\tN:C\n")
            for feature in vkOutputList:
                f.write(feature[0]+'\t'+feature[1]+'\t'+str(feature[2])+'\t'+str(feature[3])+'\t'+str(feature[4])+'\t'+str(feature[5])+'\t'+str(feature[6])+'\t'+str(feature[7])+'\t'+str(feature[8])+'\t'+str(feature[9])+'\n')
    except ValueError:
        print('"%s" could not be saved.' % (vkOutput+'.tsv'))

# plot H:C against O:C ratios as an interactive scatter plot with a per-sample dropdown
def plotRatios(vkData):
    max_rt = 0
    min_intensity = 10.0**10
    max_intensity = 0.0
    max_hc = 0
    max_oc = 0
    max_nc = 0
    for row in vkData:
        if row[3] > max_rt:
            max_rt = row[3]
        intensity = float(row[4])
        if intensity < min_intensity:
            min_intensity = intensity
        if intensity > max_intensity:
            max_intensity = intensity
        if row[7] > max_hc:
            max_hc = row[7]
        if row[8] > max_oc:
            max_oc = row[8]
        if row[9] > max_nc:
            max_nc = row[9]
    labels = ['sampleID', 'polarity', 'mz', 'rt', 'intensity', 'predictions', 'delta', 'hc', 'oc', 'nc']
    df = pd.DataFrame.from_records(vkData, columns=labels)
    sampleIDs = df.sampleID.unique()
    data = []
    menus = []
    i = 0
    for sampleID in sampleIDs:
        dfSample = df.loc[df['sampleID'] == sampleID]
        if vkSizeAlgo == 0:
            size = dfSample.intensity.apply(lambda x: vkSize)
        else:
            size = dfSample.intensity.apply(lambda x: vkSize+4*vkSize*float(x)/max_intensity)
        trace = go.Scatter(
            x = dfSample.oc,
            y = dfSample.hc,
            text = dfSample.predictions.apply(lambda x: "Prediction: "+str(x[0][1])+"<br>mz: "+str(x[0][0])+"<br>Delta: "+str(x[0][2])),
            line = dict(width = 0.5),
            mode = 'markers',
            marker = dict(
                size = size,
                color = dfSample.rt,
                colorscale = 'Viridis',
                cmin = 0,
                cmax = max_rt,
                colorbar = dict(title='Retention Time (s)'),
                line = dict(width = 0.5),
                showscale = True
            ),
            opacity = 0.8
        )
        data.append(trace)
        vision = []
        j = 0
        while j < len(sampleIDs):
            if j != i:
                vision.append(False)
            else:
                vision.append(True)
            j += 1
        menu = dict(
            method = 'update',
            label = sampleID,
            args = [{'visible': vision}, {'title': sampleID}]
        )
        menus.append(menu)
        i += 1
    updatemenus = [
        dict(
            active = -1,
            buttons = menus
        )
    ]
    layout = go.Layout(
        title = "Van Krevelen Diagram",
        showlegend = False,
        xaxis = dict(
            title = 'Oxygen to Carbon Ratio',
            zeroline = False,
            gridcolor = 'rgb(183,183,183)',
            showline = True,
            range = [0, max_oc]
        ),
        yaxis = dict(
            title = 'Hydrogen to Carbon Ratio',
            zeroline = False,
            gridcolor = 'rgb(183,183,183)',
            showline = True,
            range = [0, max_hc]
        ),
        margin = dict(r=0, b=100, l=100, t=100),
        updatemenus = updatemenus
    )
    fig = go.Figure(data=data, layout=layout)
    py.plot(fig, auto_open=False, show_link=False, filename=vkOutput+'.html')

# normalize polarity strings to 'positive' or 'negative'
def polaritySanitizer(sample_polarity):
    if sample_polarity.lower() in {'positive','pos','+'}:
        sample_polarity = 'positive'
    elif sample_polarity.lower() in {'negative', 'neg', '-'}:
        sample_polarity = 'negative'
    else:
        print('A sample has an unknown polarity type: %s. Polarity in the XCMS sample metadata should be set to "negative" or "positive".' % sample_polarity)
        raise ValueError
    return sample_polarity
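
# e.g. polaritySanitizer('Pos') returns 'positive' and polaritySanitizer('-')
# returns 'negative'; any other value is reported and raises ValueError.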

# main
if vkInputType == "tsv":
    vkInput = []
    tsvFile = getattr(args, "input")
    try:
        with open(tsvFile, 'r') as f:
            next(f) # skip header line
            tsvData = csv.reader(f, delimiter='\t')
            for row in tsvData:
                vkInput.append([row[0],polaritySanitizer(row[1]),float(row[2]),float(row[3]),float(row[4]),[]])
    except ValueError:
        print('The %s data file could not be read.' % tsvFile)
    vkData = forecaster(vkInput)
    saveForecast(vkData)
    if not getattr(args, "no_plot"):
        plotRatios(vkData)
elif vkInputType == "xcms":
    vkInput = []
    xcmsSampleMetadataFile = getattr(args, "sample_metadata")
    try:
        polarity = {}
        with open(xcmsSampleMetadataFile, 'r') as f:
            xcmsSampleMetadata = csv.reader(f, delimiter='\t')
            next(xcmsSampleMetadata, None) # skip header
            for row in xcmsSampleMetadata:
                sample = row[0]
                sample_polarity = polaritySanitizer(row[2])
                polarity[sample] = sample_polarity
    except ValueError:
        print('The %s data file could not be read. Check that polarity is set to "negative" or "positive".' % xcmsSampleMetadataFile)
    xcmsVariableMetadataFile = getattr(args, "variable_metadata")
    try:
        mz = {}
        rt = {}
        variable_index = {}
        mz_index = int()
        rt_index = int()
        with open(xcmsVariableMetadataFile, 'r') as f:
            xcmsVariableMetadata = csv.reader(f, delimiter='\t')
            i = 0
            for row in xcmsVariableMetadata:
                if i != 0:
                    mz[row[0]] = float(row[mz_index])
                    rt[row[0]] = float(row[rt_index])
                else:
                    for column in row:
                        variable_index[column] = i
                        i += 1
                    mz_index = variable_index["mz"]
                    rt_index = variable_index["rt"]
    except ValueError:
        print('The %s data file could not be read.' % xcmsVariableMetadataFile)
    xcmsDataMatrixFile = getattr(args, "data_matrix")
    try:
        with open(xcmsDataMatrixFile, 'r') as f:
            xcmsDataMatrix = csv.reader(f, delimiter='\t')
            first_row = True
            for row in xcmsDataMatrix:
                if first_row:
                    sample_id = row
                    first_row = False
                else:
                    i = 0
                    while i < len(row):
                        if i == 0:
                            i += 1
                        else:
                            intensity = row[i]
                            if intensity not in {'NA', '#DIV/0!', '0'}:
                                variable = row[0]
                                sample = sample_id[i]
                                # XCMS data may include empty columns
                                if sample != "":
                                    vkInput.append([sample, polarity[sample], mz[variable], rt[variable], float(intensity), []])
                            i += 1
    except ValueError:
        print('The %s data file could not be read.' % xcmsDataMatrixFile)
    vkData = forecaster(vkInput)
    saveForecast(vkData)
    if not getattr(args, "no_plot"):
        plotRatios(vkData)
else:
    vkData = []
    tsvPlotFile = getattr(args, "input")
    try:
        with open(tsvPlotFile, 'r') as f:
            next(f) # skip header line
            plotData = csv.reader(f, delimiter='\t')
            for row in plotData:
                # predictions were saved as the string repr of a list; parse it back
                vkData.append([row[0], row[1], float(row[2]), float(row[3]), float(row[4]), ast.literal_eval(row[5]), float(row[6]), float(row[7]), float(row[8]), float(row[9])])
    except ValueError:
        print('The %s data file could not be read.' % tsvPlotFile)
    plotRatios(vkData)