Mercurial > repos > eslerm > vkmz
comparison vkmz.py @ 1:b02af8eb8e6e draft
planemo upload for repository https://github.com/HegemanLab/VKMZ commit 5e7a43415df3902b44b7623cb2c6ffb8845751ac
author | eslerm |
---|---|
date | Wed, 30 May 2018 13:17:32 -0400 |
parents | 0b8ddf650752 |
children | 04079c34452a |
comparison
equal
deleted
inserted
replaced
0:0b8ddf650752 | 1:b02af8eb8e6e |
---|---|
31 for inputSubparser in [parse_tsv, parse_xcms]: | 31 for inputSubparser in [parse_tsv, parse_xcms]: |
32 inputSubparser.add_argument('--output', '-o', nargs='?', type=str, required=True, help='Specify output file path.') | 32 inputSubparser.add_argument('--output', '-o', nargs='?', type=str, required=True, help='Specify output file path.') |
33 inputSubparser.add_argument('--error', '-e', nargs='?', type=float, required=True, help='Mass error of mass spectrometer in PPM') | 33 inputSubparser.add_argument('--error', '-e', nargs='?', type=float, required=True, help='Mass error of mass spectrometer in PPM') |
34 inputSubparser.add_argument('--database', '-d', nargs='?', default='databases/bmrb-light.tsv', help='Select database.') | 34 inputSubparser.add_argument('--database', '-d', nargs='?', default='databases/bmrb-light.tsv', help='Select database.') |
35 inputSubparser.add_argument('--directory', nargs='?', default='', type=str, help='Define directory of tool.') | 35 inputSubparser.add_argument('--directory', nargs='?', default='', type=str, help='Define directory of tool.') |
36 inputSubparser.add_argument('--polarity', '-p', choices=['positive','negative'], help='Force polarity mode. Ignore variables in input file.') | |
36 inputSubparser.add_argument('--no-adjustment', '-na', action='store_true', help='Use flag to turn off polarity based mass adjustment. This flag should always be used if reprocessing data generated by VKMZ.') | 37 inputSubparser.add_argument('--no-adjustment', '-na', action='store_true', help='Use flag to turn off polarity based mass adjustment. This flag should always be used if reprocessing data generated by VKMZ.') |
37 inputSubparser.add_argument('--multiprocessing', '-m', action='store_true', help='Use flag to turn on multiprocessing.') | 38 inputSubparser.add_argument('--multiprocessing', '-m', action='store_true', help='Use flag to turn on multiprocessing.') |
38 inputSubparser.add_argument('--plottype', '-p', nargs='?', default='scatter-2d', choices=['scatter-2d', 'scatter-3d'], help='Select plot type.') | 39 inputSubparser.add_argument('--plottype', '-t', nargs='?', default='scatter-2d', choices=['scatter-2d', 'scatter-3d'], help='Select plot type.') |
39 inputSubparser.add_argument('--size', '-s', nargs='?', default=5, type=int, help='Set size of of dots. size+2*log(size*peak/(highest_peak/lowest_peak') | 40 inputSubparser.add_argument('--size', '-s', nargs='?', default=5, type=int, help='Set maxium size of plot symbols.') |
40 inputSubparser.add_argument('--size-algorithm', '-a', nargs='?', default=0, type=int, choices=[0,1,2],help='Size algorithm selector. Algo 0: size, Algo 1: size+2*log(size*peak/(highest_peak/lowest_peak, Algo 2: size+2*size*peak/(highest_peak-lowest_peak)') | 41 inputSubparser.add_argument('--size-algorithm', '-a', nargs='?', default=0, type=int, choices=[0,1], help="Symbol size algorithm selector. Algorithm 0 sets all symbols to the maxium size. Algorithm 2 determines a features symbol size by it's log intensity.") |
41 args = parser.parse_args() | 42 args = parser.parse_args() |
42 | 43 |
43 vkInputType = getattr(args, "input-type") | 44 vkInputType = getattr(args, "input-type") |
44 | 45 |
45 # read inputs, arguments and define globals | 46 # read inputs, arguments and define globals |
46 vkError = getattr(args, "error") | 47 vkError = getattr(args, "error") |
48 | |
49 vkPolarity = getattr(args, "polarity") | |
47 | 50 |
48 vkMultiprocessing = getattr(args, "multiprocessing") | 51 vkMultiprocessing = getattr(args, "multiprocessing") |
49 | 52 |
50 vkNoAdjustment = getattr(args, "no_adjustment") | 53 vkNoAdjustment = getattr(args, "no_adjustment") |
51 | 54 |
98 uncertainty = mass * vkError / 1e6 | 101 uncertainty = mass * vkError / 1e6 |
99 prediction = predict(mass, uncertainty, 0, vkMaxIndex) | 102 prediction = predict(mass, uncertainty, 0, vkMaxIndex) |
100 if prediction != -1: | 103 if prediction != -1: |
101 feature[2] = mass | 104 feature[2] = mass |
102 predictions = predictNeighbors(mass, uncertainty, prediction) | 105 predictions = predictNeighbors(mass, uncertainty, prediction) |
103 feature[5] = predictions | 106 feature.append(predictions) # feature[5] |
104 predictionClosest = predictions[0] | 107 predictionClosest = predictions[0] |
105 formula = predictionClosest[1] | 108 formula = predictionClosest[1] |
106 formulaList = re.findall('[A-Z][a-z]?|[0-9]+', formula) | 109 formulaList = re.findall('[A-Z][a-z]?|[0-9]+', formula) |
107 formulaDictionary = {'C':0,'H':0,'O':0,'N':0} | 110 formulaDictionary = {'C':0,'H':0,'O':0,'N':0} |
108 i = 0; | 111 i = 0; |
121 nc = float(formulaDictionary['N'])/float(formulaDictionary['C']) | 124 nc = float(formulaDictionary['N'])/float(formulaDictionary['C']) |
122 predictionClosestDelta = feature[5][0][2] | 125 predictionClosestDelta = feature[5][0][2] |
123 feature += [predictionClosestDelta, hc, oc, nc] | 126 feature += [predictionClosestDelta, hc, oc, nc] |
124 return(feature) | 127 return(feature) |
125 | 128 |
126 # adjust observed mass to a neutral mass | 129 # adjust charged mass to a neutral mass |
127 def adjust(mass, polarity): | 130 def adjust(mass, polarity): |
128 # value to adjust by | 131 # value to adjust by |
129 proton = 1.007276 | 132 proton = 1.007276 |
130 if polarity == 'positive': | 133 if polarity == 'positive': |
131 mass += proton | 134 mass += proton |
145 return predict(mass, uncertainty, left, mid-1) | 148 return predict(mass, uncertainty, left, mid-1) |
146 else: | 149 else: |
147 return predict(mass, uncertainty, mid+1, right) | 150 return predict(mass, uncertainty, mid+1, right) |
148 return -1 | 151 return -1 |
149 | 152 |
153 def plotData(vkData): | |
154 if vkSizeAlgo == 0: | |
155 for row in vkData: | |
156 row.append(vkSize) | |
157 else: | |
158 max_intensity = 0.0 | |
159 for row in vkData: | |
160 intensity = row[4] | |
161 if intensity > max_intensity: | |
162 max_intensity = intensity | |
163 alpha = vkSize/math.log(max_intensity+1) | |
164 for row in vkData: | |
165 intensity = row[4] | |
166 row.append(alpha*math.log(intensity+1)) | |
167 return vkData | |
168 | |
150 # find and sort known masses within error limit of observed mass | 169 # find and sort known masses within error limit of observed mass |
151 def predictNeighbors(mass, uncertainty, prediction): | 170 def predictNeighbors(mass, uncertainty, prediction): |
152 i = 0 | 171 i = 0 |
153 neighbors = [[vkMass[prediction],vkFormula[prediction],(float(vkMass[prediction])-mass)],] | 172 neighbors = [[vkMass[prediction],vkFormula[prediction],(float(vkMass[prediction])-mass)],] |
154 while prediction+i+1 <= vkMaxIndex: | 173 while prediction+i+1 <= vkMaxIndex: |
171 neighbors = sorted(neighbors, key = (lambda delta: abs(delta[2]))) | 190 neighbors = sorted(neighbors, key = (lambda delta: abs(delta[2]))) |
172 return neighbors | 191 return neighbors |
173 | 192 |
174 # write output file | 193 # write output file |
175 def saveForcast(vkOutputList): | 194 def saveForcast(vkOutputList): |
176 try: | 195 try: |
177 with open(vkOutput+'.tsv', 'w') as f: | 196 with open(vkOutput+'.tsv', 'w') as f: |
178 f.writelines(str("sample_id\tpolarity\tmz\tretention_time\tintensity\tpredictions\tdelta\tH:C\tO:C\tN:C") + '\n') | 197 f.writelines(str("sample_id\tpolarity\tmz\tretention_time\tintensity\tpredictions\tdelta\tH:C\tO:C\tN:C\tsymbol_size") + '\n') |
179 for feature in vkOutputList: | 198 for feature in vkOutputList: |
180 f.writelines(feature[0]+'\t'+feature[1]+'\t'+str(feature[2])+'\t'+str(feature[3])+'\t'+str(feature[4])+'\t'+str(feature[5])+'\t'+str(feature[6])+'\t'+str(feature[7])+'\t'+str(feature[8])+'\t'+str(feature[9])+'\t'+'\n') | 199 f.writelines(feature[0]+'\t'+feature[1]+'\t'+str(feature[2])+'\t'+str(feature[3])+'\t'+str(feature[4])+'\t'+str(feature[5])+'\t'+str(feature[6])+'\t'+str(feature[7])+'\t'+str(feature[8])+'\t'+str(feature[9])+'\t'+str(feature[10])+'\t'+'\n') |
181 except ValueError: | 200 except ValueError: |
182 print('"%s" could not be saved.' % filename) | 201 print('"%s" could not be saved.' % filename) |
183 | 202 |
184 def plotRatios(vkData): | 203 def plotRatios(vkData): |
185 max_rt = 0 | 204 max_rt = 0 |
186 min_intensity = 10.0**10 | |
187 max_intensity = 0.0 | |
188 max_hc = 0 | 205 max_hc = 0 |
189 max_oc = 0 | 206 max_oc = 0 |
190 max_nc = 0 | 207 max_nc = 0 |
191 for row in vkData: | 208 for row in vkData: |
192 if row[3] > max_rt: | 209 if row[3] > max_rt: |
193 max_rt = row[3] | 210 max_rt = row[3] |
194 intensity = float(row[4]) | |
195 if intensity < min_intensity: | |
196 min_intensity = intensity | |
197 if intensity > max_intensity: | |
198 max_intensity = intensity | |
199 if row[7] > max_hc: | 211 if row[7] > max_hc: |
200 max_hc = row[7] | 212 max_hc = row[7] |
201 if row[8] > max_oc: | 213 if row[8] > max_oc: |
202 max_oc = row[8] | 214 max_oc = row[8] |
203 if row[9] > max_nc: | 215 if row[9] > max_nc: |
204 max_nc = row[9] | 216 max_nc = row[9] |
205 labels = ['sampleID', 'polarity', 'mz', 'rt', 'intensity', 'predictions', 'delta', 'hc', 'oc', 'nc'] | 217 labels = ['sampleID', 'polarity', 'mz', 'rt', 'intensity', 'predictions', 'delta', 'hc', 'oc', 'nc', 'symbol_size'] |
206 df = pd.DataFrame.from_records(vkData, columns=labels) | 218 df = pd.DataFrame.from_records(vkData, columns=labels) |
207 sampleIDs = df.sampleID.unique() | 219 sampleIDs = df.sampleID.unique() |
208 data = [] | 220 data = [] |
209 menus = [] | 221 menus = [] |
210 i = 0 | 222 i = 0 |
211 for sampleID in sampleIDs: | 223 for sampleID in sampleIDs: |
212 dfSample = df.loc[df['sampleID'] == sampleID] | 224 dfSample = df.loc[df['sampleID'] == sampleID] |
213 if vkSizeAlgo == 0: | 225 size = dfSample.symbol_size |
214 size = dfSample.intensity.apply(lambda x: vkSize) | |
215 else: | |
216 size = dfSample.intensity.apply(lambda x: vkSize+4*vkSize*float(x)/max_intensity) | |
217 trace = go.Scatter( | 226 trace = go.Scatter( |
218 x = dfSample.oc, | 227 x = dfSample.oc, |
219 y = dfSample.hc, | 228 y = dfSample.hc, |
220 text = dfSample.predictions.apply(lambda x: "Prediction: "+str(x[0][1])+"<br>mz: " +str(x[0][0])+"<br>Delta: "+str(x[0][2])), | 229 text = dfSample.predictions.apply(lambda x: "Prediction: "+str(x[0][1])+"<br>mz: " +str(x[0][0])+"<br>Delta: "+str(x[0][2])), |
221 line = dict(width = 0.5), | 230 line = dict(width = 0.5), |
222 mode = 'markers', | 231 mode = 'markers', |
223 marker = dict( | 232 marker = dict( |
224 size = size, | 233 size = size, |
234 sizemode = "area", | |
225 color = dfSample.rt, | 235 color = dfSample.rt, |
226 colorscale = 'Viridis', | 236 colorscale = 'Viridis', |
227 cmin = 0, | 237 cmin = 0, |
228 cmax = max_rt, | 238 cmax = max_rt, |
229 colorbar=dict(title='Retention Time (s)'), | 239 colorbar=dict(title='Retention Time (s)'), |
294 try: | 304 try: |
295 with open(tsvFile, 'r') as f: | 305 with open(tsvFile, 'r') as f: |
296 next(f) # skip hearder line | 306 next(f) # skip hearder line |
297 tsvData = csv.reader(f, delimiter='\t') | 307 tsvData = csv.reader(f, delimiter='\t') |
298 for row in tsvData: | 308 for row in tsvData: |
299 vkInput.append([row[0],polaritySanitizer(row[1]),float(row[2]),float(row[3]),float(row[4]),[]]) | 309 vkInput.append([row[0],polaritySanitizer(row[1]),float(row[2]),float(row[3]),float(row[4])]) |
300 except ValueError: | 310 except ValueError: |
301 print('The %s data file could not be read.' % tsvFile) | 311 print('The %s data file could not be read.' % tsvFile) |
302 vkData = forecaster(vkInput) | 312 vkData = forecaster(vkInput) |
313 vkData = plotData(vkData) | |
303 saveForcast(vkData) | 314 saveForcast(vkData) |
304 plotRatios(vkData) | 315 plotRatios(vkData) |
305 elif vkInputType == "xcms": | 316 elif vkInputType == "xcms": |
306 vkInput = [] | 317 vkInput = [] |
307 xcmsSampleMetadataFile = getattr(args, "sample_metadata") | 318 xcmsSampleMetadataFile = getattr(args, "sample_metadata") |
310 with open(xcmsSampleMetadataFile, 'r') as f: | 321 with open(xcmsSampleMetadataFile, 'r') as f: |
311 xcmsSampleMetadata = csv.reader(f, delimiter='\t') | 322 xcmsSampleMetadata = csv.reader(f, delimiter='\t') |
312 next(xcmsSampleMetadata, None) # skip header | 323 next(xcmsSampleMetadata, None) # skip header |
313 for row in xcmsSampleMetadata: | 324 for row in xcmsSampleMetadata: |
314 sample = row[0] | 325 sample = row[0] |
315 sample_polarity = polaritySanitizer(row[2]) | 326 if vkPolarity: |
316 polarity[sample] = sample_polarity | 327 polarity[sample] = vkPolarity |
328 else: | |
329 sample_polarity = polaritySanitizer(row[2]) | |
330 polarity[sample] = sample_polarity | |
317 except ValueError: | 331 except ValueError: |
318 print('The %s data file could not be read. Check that polarity is set to "negative" or "positive"' % xcmsSampleMetadataFile) | 332 print('The %s data file could not be read. Check that polarity is set to "negative" or "positive"' % xcmsSampleMetadataFile) |
319 xcmsVariableMetadataFile = getattr(args, "variable_metadata") | 333 xcmsVariableMetadataFile = getattr(args, "variable_metadata") |
320 try: | 334 try: |
321 mz = {} | 335 mz = {} |
357 if intensity not in {'NA', '#DIV/0!', '0'}: | 371 if intensity not in {'NA', '#DIV/0!', '0'}: |
358 variable = row[0] | 372 variable = row[0] |
359 sample = sample_id[i] | 373 sample = sample_id[i] |
360 # XCMS data may include empty columns | 374 # XCMS data may include empty columns |
361 if sample != "": | 375 if sample != "": |
362 vkInput.append([sample, polarity[sample], mz[variable], rt[variable], float(intensity), []]) | 376 vkInput.append([sample, polarity[sample], mz[variable], rt[variable], float(intensity)]) |
363 i+=1 | 377 i+=1 |
364 except ValueError: | 378 except ValueError: |
365 print('The %s data file could not be read.' % xcmsDataMatrixFile) | 379 print('The %s data file could not be read.' % xcmsDataMatrixFile) |
366 vkData = forecaster(vkInput) | 380 vkData = forecaster(vkInput) |
381 vkData = plotData(vkData) | |
367 saveForcast(vkData) | 382 saveForcast(vkData) |
368 plotRatios(vkData) | 383 plotRatios(vkData) |
369 else: | 384 else: |
370 vkData = [] | 385 vkData = [] |
371 tsvPlotvFile = getattr(args, "input") | 386 tsvPlotvFile = getattr(args, "input") |