comparison vkmz.py @ 1:b02af8eb8e6e draft

planemo upload for repository https://github.com/HegemanLab/VKMZ commit 5e7a43415df3902b44b7623cb2c6ffb8845751ac
author eslerm
date Wed, 30 May 2018 13:17:32 -0400
parents 0b8ddf650752
children 04079c34452a
comparison
equal deleted inserted replaced
0:0b8ddf650752 1:b02af8eb8e6e
31 for inputSubparser in [parse_tsv, parse_xcms]: 31 for inputSubparser in [parse_tsv, parse_xcms]:
32 inputSubparser.add_argument('--output', '-o', nargs='?', type=str, required=True, help='Specify output file path.') 32 inputSubparser.add_argument('--output', '-o', nargs='?', type=str, required=True, help='Specify output file path.')
33 inputSubparser.add_argument('--error', '-e', nargs='?', type=float, required=True, help='Mass error of mass spectrometer in PPM') 33 inputSubparser.add_argument('--error', '-e', nargs='?', type=float, required=True, help='Mass error of mass spectrometer in PPM')
34 inputSubparser.add_argument('--database', '-d', nargs='?', default='databases/bmrb-light.tsv', help='Select database.') 34 inputSubparser.add_argument('--database', '-d', nargs='?', default='databases/bmrb-light.tsv', help='Select database.')
35 inputSubparser.add_argument('--directory', nargs='?', default='', type=str, help='Define directory of tool.') 35 inputSubparser.add_argument('--directory', nargs='?', default='', type=str, help='Define directory of tool.')
36 inputSubparser.add_argument('--polarity', '-p', choices=['positive','negative'], help='Force polarity mode. Ignore variables in input file.')
36 inputSubparser.add_argument('--no-adjustment', '-na', action='store_true', help='Use flag to turn off polarity based mass adjustment. This flag should always be used if reprocessing data generated by VKMZ.') 37 inputSubparser.add_argument('--no-adjustment', '-na', action='store_true', help='Use flag to turn off polarity based mass adjustment. This flag should always be used if reprocessing data generated by VKMZ.')
37 inputSubparser.add_argument('--multiprocessing', '-m', action='store_true', help='Use flag to turn on multiprocessing.') 38 inputSubparser.add_argument('--multiprocessing', '-m', action='store_true', help='Use flag to turn on multiprocessing.')
38 inputSubparser.add_argument('--plottype', '-p', nargs='?', default='scatter-2d', choices=['scatter-2d', 'scatter-3d'], help='Select plot type.') 39 inputSubparser.add_argument('--plottype', '-t', nargs='?', default='scatter-2d', choices=['scatter-2d', 'scatter-3d'], help='Select plot type.')
39 inputSubparser.add_argument('--size', '-s', nargs='?', default=5, type=int, help='Set size of of dots. size+2*log(size*peak/(highest_peak/lowest_peak') 40 inputSubparser.add_argument('--size', '-s', nargs='?', default=5, type=int, help='Set maxium size of plot symbols.')
40 inputSubparser.add_argument('--size-algorithm', '-a', nargs='?', default=0, type=int, choices=[0,1,2],help='Size algorithm selector. Algo 0: size, Algo 1: size+2*log(size*peak/(highest_peak/lowest_peak, Algo 2: size+2*size*peak/(highest_peak-lowest_peak)') 41 inputSubparser.add_argument('--size-algorithm', '-a', nargs='?', default=0, type=int, choices=[0,1], help="Symbol size algorithm selector. Algorithm 0 sets all symbols to the maxium size. Algorithm 2 determines a features symbol size by it's log intensity.")
41 args = parser.parse_args() 42 args = parser.parse_args()
42 43
43 vkInputType = getattr(args, "input-type") 44 vkInputType = getattr(args, "input-type")
44 45
45 # read inputs, arguments and define globals 46 # read inputs, arguments and define globals
46 vkError = getattr(args, "error") 47 vkError = getattr(args, "error")
48
49 vkPolarity = getattr(args, "polarity")
47 50
48 vkMultiprocessing = getattr(args, "multiprocessing") 51 vkMultiprocessing = getattr(args, "multiprocessing")
49 52
50 vkNoAdjustment = getattr(args, "no_adjustment") 53 vkNoAdjustment = getattr(args, "no_adjustment")
51 54
98 uncertainty = mass * vkError / 1e6 101 uncertainty = mass * vkError / 1e6
99 prediction = predict(mass, uncertainty, 0, vkMaxIndex) 102 prediction = predict(mass, uncertainty, 0, vkMaxIndex)
100 if prediction != -1: 103 if prediction != -1:
101 feature[2] = mass 104 feature[2] = mass
102 predictions = predictNeighbors(mass, uncertainty, prediction) 105 predictions = predictNeighbors(mass, uncertainty, prediction)
103 feature[5] = predictions 106 feature.append(predictions) # feature[5]
104 predictionClosest = predictions[0] 107 predictionClosest = predictions[0]
105 formula = predictionClosest[1] 108 formula = predictionClosest[1]
106 formulaList = re.findall('[A-Z][a-z]?|[0-9]+', formula) 109 formulaList = re.findall('[A-Z][a-z]?|[0-9]+', formula)
107 formulaDictionary = {'C':0,'H':0,'O':0,'N':0} 110 formulaDictionary = {'C':0,'H':0,'O':0,'N':0}
108 i = 0; 111 i = 0;
121 nc = float(formulaDictionary['N'])/float(formulaDictionary['C']) 124 nc = float(formulaDictionary['N'])/float(formulaDictionary['C'])
122 predictionClosestDelta = feature[5][0][2] 125 predictionClosestDelta = feature[5][0][2]
123 feature += [predictionClosestDelta, hc, oc, nc] 126 feature += [predictionClosestDelta, hc, oc, nc]
124 return(feature) 127 return(feature)
125 128
126 # adjust observed mass to a neutral mass 129 # adjust charged mass to a neutral mass
127 def adjust(mass, polarity): 130 def adjust(mass, polarity):
128 # value to adjust by 131 # value to adjust by
129 proton = 1.007276 132 proton = 1.007276
130 if polarity == 'positive': 133 if polarity == 'positive':
131 mass += proton 134 mass += proton
145 return predict(mass, uncertainty, left, mid-1) 148 return predict(mass, uncertainty, left, mid-1)
146 else: 149 else:
147 return predict(mass, uncertainty, mid+1, right) 150 return predict(mass, uncertainty, mid+1, right)
148 return -1 151 return -1
149 152
153 def plotData(vkData):
154 if vkSizeAlgo == 0:
155 for row in vkData:
156 row.append(vkSize)
157 else:
158 max_intensity = 0.0
159 for row in vkData:
160 intensity = row[4]
161 if intensity > max_intensity:
162 max_intensity = intensity
163 alpha = vkSize/math.log(max_intensity+1)
164 for row in vkData:
165 intensity = row[4]
166 row.append(alpha*math.log(intensity+1))
167 return vkData
168
150 # find and sort known masses within error limit of observed mass 169 # find and sort known masses within error limit of observed mass
151 def predictNeighbors(mass, uncertainty, prediction): 170 def predictNeighbors(mass, uncertainty, prediction):
152 i = 0 171 i = 0
153 neighbors = [[vkMass[prediction],vkFormula[prediction],(float(vkMass[prediction])-mass)],] 172 neighbors = [[vkMass[prediction],vkFormula[prediction],(float(vkMass[prediction])-mass)],]
154 while prediction+i+1 <= vkMaxIndex: 173 while prediction+i+1 <= vkMaxIndex:
171 neighbors = sorted(neighbors, key = (lambda delta: abs(delta[2]))) 190 neighbors = sorted(neighbors, key = (lambda delta: abs(delta[2])))
172 return neighbors 191 return neighbors
173 192
174 # write output file 193 # write output file
175 def saveForcast(vkOutputList): 194 def saveForcast(vkOutputList):
176 try: 195 try:
177 with open(vkOutput+'.tsv', 'w') as f: 196 with open(vkOutput+'.tsv', 'w') as f:
178 f.writelines(str("sample_id\tpolarity\tmz\tretention_time\tintensity\tpredictions\tdelta\tH:C\tO:C\tN:C") + '\n') 197 f.writelines(str("sample_id\tpolarity\tmz\tretention_time\tintensity\tpredictions\tdelta\tH:C\tO:C\tN:C\tsymbol_size") + '\n')
179 for feature in vkOutputList: 198 for feature in vkOutputList:
180 f.writelines(feature[0]+'\t'+feature[1]+'\t'+str(feature[2])+'\t'+str(feature[3])+'\t'+str(feature[4])+'\t'+str(feature[5])+'\t'+str(feature[6])+'\t'+str(feature[7])+'\t'+str(feature[8])+'\t'+str(feature[9])+'\t'+'\n') 199 f.writelines(feature[0]+'\t'+feature[1]+'\t'+str(feature[2])+'\t'+str(feature[3])+'\t'+str(feature[4])+'\t'+str(feature[5])+'\t'+str(feature[6])+'\t'+str(feature[7])+'\t'+str(feature[8])+'\t'+str(feature[9])+'\t'+str(feature[10])+'\t'+'\n')
181 except ValueError: 200 except ValueError:
182 print('"%s" could not be saved.' % filename) 201 print('"%s" could not be saved.' % filename)
183 202
184 def plotRatios(vkData): 203 def plotRatios(vkData):
185 max_rt = 0 204 max_rt = 0
186 min_intensity = 10.0**10
187 max_intensity = 0.0
188 max_hc = 0 205 max_hc = 0
189 max_oc = 0 206 max_oc = 0
190 max_nc = 0 207 max_nc = 0
191 for row in vkData: 208 for row in vkData:
192 if row[3] > max_rt: 209 if row[3] > max_rt:
193 max_rt = row[3] 210 max_rt = row[3]
194 intensity = float(row[4])
195 if intensity < min_intensity:
196 min_intensity = intensity
197 if intensity > max_intensity:
198 max_intensity = intensity
199 if row[7] > max_hc: 211 if row[7] > max_hc:
200 max_hc = row[7] 212 max_hc = row[7]
201 if row[8] > max_oc: 213 if row[8] > max_oc:
202 max_oc = row[8] 214 max_oc = row[8]
203 if row[9] > max_nc: 215 if row[9] > max_nc:
204 max_nc = row[9] 216 max_nc = row[9]
205 labels = ['sampleID', 'polarity', 'mz', 'rt', 'intensity', 'predictions', 'delta', 'hc', 'oc', 'nc'] 217 labels = ['sampleID', 'polarity', 'mz', 'rt', 'intensity', 'predictions', 'delta', 'hc', 'oc', 'nc', 'symbol_size']
206 df = pd.DataFrame.from_records(vkData, columns=labels) 218 df = pd.DataFrame.from_records(vkData, columns=labels)
207 sampleIDs = df.sampleID.unique() 219 sampleIDs = df.sampleID.unique()
208 data = [] 220 data = []
209 menus = [] 221 menus = []
210 i = 0 222 i = 0
211 for sampleID in sampleIDs: 223 for sampleID in sampleIDs:
212 dfSample = df.loc[df['sampleID'] == sampleID] 224 dfSample = df.loc[df['sampleID'] == sampleID]
213 if vkSizeAlgo == 0: 225 size = dfSample.symbol_size
214 size = dfSample.intensity.apply(lambda x: vkSize)
215 else:
216 size = dfSample.intensity.apply(lambda x: vkSize+4*vkSize*float(x)/max_intensity)
217 trace = go.Scatter( 226 trace = go.Scatter(
218 x = dfSample.oc, 227 x = dfSample.oc,
219 y = dfSample.hc, 228 y = dfSample.hc,
220 text = dfSample.predictions.apply(lambda x: "Prediction: "+str(x[0][1])+"<br>mz: " +str(x[0][0])+"<br>Delta: "+str(x[0][2])), 229 text = dfSample.predictions.apply(lambda x: "Prediction: "+str(x[0][1])+"<br>mz: " +str(x[0][0])+"<br>Delta: "+str(x[0][2])),
221 line = dict(width = 0.5), 230 line = dict(width = 0.5),
222 mode = 'markers', 231 mode = 'markers',
223 marker = dict( 232 marker = dict(
224 size = size, 233 size = size,
234 sizemode = "area",
225 color = dfSample.rt, 235 color = dfSample.rt,
226 colorscale = 'Viridis', 236 colorscale = 'Viridis',
227 cmin = 0, 237 cmin = 0,
228 cmax = max_rt, 238 cmax = max_rt,
229 colorbar=dict(title='Retention Time (s)'), 239 colorbar=dict(title='Retention Time (s)'),
294 try: 304 try:
295 with open(tsvFile, 'r') as f: 305 with open(tsvFile, 'r') as f:
296 next(f) # skip hearder line 306 next(f) # skip hearder line
297 tsvData = csv.reader(f, delimiter='\t') 307 tsvData = csv.reader(f, delimiter='\t')
298 for row in tsvData: 308 for row in tsvData:
299 vkInput.append([row[0],polaritySanitizer(row[1]),float(row[2]),float(row[3]),float(row[4]),[]]) 309 vkInput.append([row[0],polaritySanitizer(row[1]),float(row[2]),float(row[3]),float(row[4])])
300 except ValueError: 310 except ValueError:
301 print('The %s data file could not be read.' % tsvFile) 311 print('The %s data file could not be read.' % tsvFile)
302 vkData = forecaster(vkInput) 312 vkData = forecaster(vkInput)
313 vkData = plotData(vkData)
303 saveForcast(vkData) 314 saveForcast(vkData)
304 plotRatios(vkData) 315 plotRatios(vkData)
305 elif vkInputType == "xcms": 316 elif vkInputType == "xcms":
306 vkInput = [] 317 vkInput = []
307 xcmsSampleMetadataFile = getattr(args, "sample_metadata") 318 xcmsSampleMetadataFile = getattr(args, "sample_metadata")
310 with open(xcmsSampleMetadataFile, 'r') as f: 321 with open(xcmsSampleMetadataFile, 'r') as f:
311 xcmsSampleMetadata = csv.reader(f, delimiter='\t') 322 xcmsSampleMetadata = csv.reader(f, delimiter='\t')
312 next(xcmsSampleMetadata, None) # skip header 323 next(xcmsSampleMetadata, None) # skip header
313 for row in xcmsSampleMetadata: 324 for row in xcmsSampleMetadata:
314 sample = row[0] 325 sample = row[0]
315 sample_polarity = polaritySanitizer(row[2]) 326 if vkPolarity:
316 polarity[sample] = sample_polarity 327 polarity[sample] = vkPolarity
328 else:
329 sample_polarity = polaritySanitizer(row[2])
330 polarity[sample] = sample_polarity
317 except ValueError: 331 except ValueError:
318 print('The %s data file could not be read. Check that polarity is set to "negative" or "positive"' % xcmsSampleMetadataFile) 332 print('The %s data file could not be read. Check that polarity is set to "negative" or "positive"' % xcmsSampleMetadataFile)
319 xcmsVariableMetadataFile = getattr(args, "variable_metadata") 333 xcmsVariableMetadataFile = getattr(args, "variable_metadata")
320 try: 334 try:
321 mz = {} 335 mz = {}
357 if intensity not in {'NA', '#DIV/0!', '0'}: 371 if intensity not in {'NA', '#DIV/0!', '0'}:
358 variable = row[0] 372 variable = row[0]
359 sample = sample_id[i] 373 sample = sample_id[i]
360 # XCMS data may include empty columns 374 # XCMS data may include empty columns
361 if sample != "": 375 if sample != "":
362 vkInput.append([sample, polarity[sample], mz[variable], rt[variable], float(intensity), []]) 376 vkInput.append([sample, polarity[sample], mz[variable], rt[variable], float(intensity)])
363 i+=1 377 i+=1
364 except ValueError: 378 except ValueError:
365 print('The %s data file could not be read.' % xcmsDataMatrixFile) 379 print('The %s data file could not be read.' % xcmsDataMatrixFile)
366 vkData = forecaster(vkInput) 380 vkData = forecaster(vkInput)
381 vkData = plotData(vkData)
367 saveForcast(vkData) 382 saveForcast(vkData)
368 plotRatios(vkData) 383 plotRatios(vkData)
369 else: 384 else:
370 vkData = [] 385 vkData = []
371 tsvPlotvFile = getattr(args, "input") 386 tsvPlotvFile = getattr(args, "input")