Mercurial > repos > eslerm > vkmz
diff vkmz.py @ 6:35b984684450 draft
planemo upload for repository https://github.com/HegemanLab/VKMZ commit 5ef8d2b36eb35ff5aad5d5e9b78c38405fc95c1a
author | eslerm |
---|---|
date | Tue, 10 Jul 2018 17:58:35 -0400 |
parents | 04079c34452a |
children |
line wrap: on
line diff
--- a/vkmz.py Thu May 31 12:06:20 2018 -0400 +++ b/vkmz.py Tue Jul 10 17:58:35 2018 -0400 @@ -1,297 +1,30 @@ -''' -based on the BMRB compound database which can be found at: -http://www.bmrb.wisc.edu/ftp/pub/bmrb/relational_tables/metabolomics/Chem_comp.csv -''' - +import argparse +import csv +import math import re -import argparse -import multiprocessing -from multiprocessing import Pool -import csv - -import numpy as np -import math -import pandas as pd -from plotly import __version__ -import plotly.offline as py -import plotly.graph_objs as go parser = argparse.ArgumentParser() inputSubparser = parser.add_subparsers(help='Select input type:', dest='input-type') parse_tsv = inputSubparser.add_parser('tsv', help='Use tabular data as input.') parse_tsv.add_argument('--input', '-i', required=True, help='Path to tabular file. Must include columns: sample ID, mz, polarity, intensity, & retention time.') -parse_tsv.add_argument('--no-plot', '-np', action='store_true', help='Disable plot generation.') parse_xcms = inputSubparser.add_parser('xcms', help='Use XCMS data as input.') -parse_xcms.add_argument('--data-matrix', '-xd', required=True, nargs='?', type=str, help='Path to XCMS dataMatrix file.') -parse_xcms.add_argument('--sample-metadata', '-xs', required=True, nargs='?', type=str, help='Path to XCMS sampleMetadata file.') -parse_xcms.add_argument('--variable-metadata', '-xv', required=True, nargs='?', type=str, help='Path to XCMS variableMetadata file.') -parse_xcms.add_argument('--no-plot', '-n', action='store_true', help='Disable plot generation.') -parse_plot = inputSubparser.add_parser('plot', help='Only plot data.') -parse_plot.add_argument('--input', '-i', required=True, nargs='?', type=str, help='Path to VKMZ generated tabular file.') +parse_xcms.add_argument('--data-matrix', '-xd', required=True, nargs='?', type=str, help='Path to XCMS data matrix file.') +parse_xcms.add_argument('--sample-metadata', '-xs', required=True, nargs='?', type=str, help='Path to XCMS sample metadata file.') +parse_xcms.add_argument('--variable-metadata', '-xv', required=True, nargs='?', type=str, help='Path to XCMS variable metadata file.') for inputSubparser in [parse_tsv, parse_xcms]: inputSubparser.add_argument('--output', '-o', nargs='?', type=str, required=True, help='Specify output file path.') - inputSubparser.add_argument('--error', '-e', nargs='?', type=float, required=True, help='Mass error of mass spectrometer in PPM') - inputSubparser.add_argument('--database', '-d', nargs='?', default='databases/bmrb-light.tsv', help='Select database.') - inputSubparser.add_argument('--directory', nargs='?', default='', type=str, help='Define directory of tool.') - inputSubparser.add_argument('--polarity', '-p', choices=['positive','negative'], help='Force polarity mode. Ignore variables in input file.') - inputSubparser.add_argument('--no-adjustment', '-na', action='store_true', help='Use flag to turn off polarity based mass adjustment. This flag should always be used if reprocessing data generated by VKMZ.') - inputSubparser.add_argument('--unique', '-u', action='store_true', help='Set flag to only output features which have a single match.') - inputSubparser.add_argument('--multiprocessing', '-m', action='store_true', help='Set flag to turn on multiprocessing.') - inputSubparser.add_argument('--plottype', '-t', nargs='?', default='scatter-2d', choices=['scatter-2d', 'scatter-3d'], help='Select plot type.') - inputSubparser.add_argument('--size', '-s', nargs='?', default=5, type=int, help='Set maxium size of plot symbols.') - inputSubparser.add_argument('--size-algorithm', '-a', nargs='?', default=0, type=int, choices=[0,1], help="Symbol size algorithm selector. Algorithm 0 sets all symbols to the maxium size. Algorithm 2 determines a features symbol size by it's log intensity.") + inputSubparser.add_argument('--error', '-e', nargs='?', type=float, required=True, help='Mass error of mass spectrometer in parts-per-million.') + inputSubparser.add_argument('--database', '-db', nargs='?', default='databases/bmrb-light.tsv', help='Select database of known formula masses.') + inputSubparser.add_argument('--directory','-dir', nargs='?', default='', type=str, help='Define path of tool directory. Assumes relative path if unset.') + inputSubparser.add_argument('--polarity', '-p', choices=['positive','negative'], help='Force polarity mode to positive or negative. Overrides variables in input file.') + inputSubparser.add_argument('--neutral', '-n', action='store_true', help='Set neutral flag if masses in input data are neutral. No mass adjustmnet will be made.') + inputSubparser.add_argument('--unique', '-u', action='store_true', help='Set flag to remove features with multiple predictions.') args = parser.parse_args() -vkInputType = getattr(args, "input-type") - -# read inputs, arguments and define globals -vkError = getattr(args, "error") - -vkPolarity = getattr(args, "polarity") - -vkUnique = getattr(args, "unique") - -vkMultiprocessing = getattr(args, "multiprocessing") - -vkNoAdjustment = getattr(args, "no_adjustment") - -vkDatabaseFile = getattr(args, "database") -vkDirectory = getattr(args, "directory") - -vkMass = [] -vkFormula = [] -try: - with open(vkDirectory+vkDatabaseFile, 'r') as tsv: - next(tsv) # skip first row - for row in tsv: - mass, formula = row.split() - vkMass.append(mass) - vkFormula.append(formula) -except ValueError: - print('The %s database could not be loaded.' % vkDatabaseFile) -vkMaxIndex = len(vkMass)-1 - -vkOutput = getattr(args, "output") - -vkPlotType = getattr(args, 'plottype') - -vkSize = getattr(args, 'size') - -vkSizeAlgo = getattr(args, 'size_algorithm') - -# control predictions -def forecaster(vkInput): - if vkMultiprocessing: - try: - pool = Pool() - vkOutputList = pool.map(featurePrediction, vkInput) - except Exception as e: - print("Error during multirpocessing: "+str(e)) - finally: - pool.close() - pool.join() - else: - vkOutputList = map(featurePrediction, vkInput) - vkOutputList = [x for x in vkOutputList if x is not None] - return(vkOutputList) - -# predict feature formulas and creates output list -def featurePrediction(feature): - if vkNoAdjustment: - mass = feature[2] - else: - mass = adjust(feature[2], feature[1]) # mz & polarity - uncertainty = mass * vkError / 1e6 - prediction = predict(mass, uncertainty, 0, vkMaxIndex) - if prediction != -1: - feature[2] = mass - predictions = predictNeighbors(mass, uncertainty, prediction) - if vkUnique and len(predictions) > 1: - return - feature.append(predictions) # feature[5] - predictionClosest = predictions[0] - formula = predictionClosest[1] - formulaList = re.findall('[A-Z][a-z]?|[0-9]+', formula) - formulaDictionary = {'C':0,'H':0,'O':0,'N':0} - i = 0; - while i < len(formulaList): - if formulaList[i] in formulaDictionary: - # if there is only one of this element - if i+1 == len(formulaList) or formulaList[i+1].isalpha(): - formulaDictionary[formulaList[i]] = 1 - else: - formulaDictionary[formulaList[i]] = formulaList[i+1] - i+=1 - i+=1 - predictionClosest.append(formulaDictionary) - hc = float(formulaDictionary['H'])/float(formulaDictionary['C']) - oc = float(formulaDictionary['O'])/float(formulaDictionary['C']) - nc = float(formulaDictionary['N'])/float(formulaDictionary['C']) - predictionClosestDelta = feature[5][0][2] - feature += [predictionClosestDelta, hc, oc, nc] - return(feature) - -# adjust charged mass to a neutral mass -def adjust(mass, polarity): - # value to adjust by - proton = 1.007276 - if polarity == 'positive': - mass += proton - elif polarity == 'negative': - mass -= proton - return mass +# store input constants +INPUT_TYPE = getattr(args, "input-type") +POLARITY = getattr(args, "polarity") -# Binary search to match observed mass to known mass within error -# https://en.wikipedia.org/wiki/Binary_search_tree -def predict(mass, uncertainty, left, right): - mid = ((right - left) / 2) + left - if left <= mid <= right and mid <= vkMaxIndex: - delta = float(vkMass[mid]) - mass - if uncertainty >= abs(delta): - return mid - elif uncertainty < delta: - return predict(mass, uncertainty, left, mid-1) - else: - return predict(mass, uncertainty, mid+1, right) - return -1 - -def plotData(vkData): - if vkSizeAlgo == 0: - for row in vkData: - row.append(vkSize) - else: - max_intensity = 0.0 - for row in vkData: - intensity = row[4] - if intensity > max_intensity: - max_intensity = intensity - alpha = vkSize/math.log(max_intensity+1) - for row in vkData: - intensity = row[4] - row.append(alpha*math.log(intensity+1)) - return vkData - -# find and sort known masses within error limit of observed mass -def predictNeighbors(mass, uncertainty, prediction): - i = 0 - neighbors = [[vkMass[prediction],vkFormula[prediction],(float(vkMass[prediction])-mass)],] - while prediction+i+1 <= vkMaxIndex: - neighbor = prediction+i+1 - delta = float(vkMass[neighbor])-mass - if uncertainty >= abs(delta): - neighbors.append([vkMass[neighbor],vkFormula[neighbor],delta]) - i += 1 - else: - break - i = 0 - while prediction+i-1 >= 0: - neighbor = prediction+i-1 - delta = float(vkMass[neighbor])-mass - if uncertainty >= abs(delta): - neighbors.append([vkMass[neighbor],vkFormula[neighbor],(float(vkMass[neighbor])-mass)]) - i -= 1 - else: - break - neighbors = sorted(neighbors, key = (lambda delta: abs(delta[2]))) - return neighbors - -# write output file -def saveForcast(vkOutputList): - try: - with open(vkOutput+'.tsv', 'w') as f: - f.writelines(str("sample_id\tpolarity\tmz\tretention_time\tintensity\tpredictions\tdelta\tH:C\tO:C\tN:C\tsymbol_size") + '\n') - for feature in vkOutputList: - f.writelines(feature[0]+'\t'+feature[1]+'\t'+str(feature[2])+'\t'+str(feature[3])+'\t'+str(feature[4])+'\t'+str(feature[5])+'\t'+str(feature[6])+'\t'+str(feature[7])+'\t'+str(feature[8])+'\t'+str(feature[9])+'\t'+str(feature[10])+'\t'+'\n') - except ValueError: - print('"%s" could not be saved.' % filename) - -def plotRatios(vkData): - max_rt = 0 - max_hc = 0 - max_oc = 0 - max_nc = 0 - for row in vkData: - if row[3] > max_rt: - max_rt = row[3] - if row[7] > max_hc: - max_hc = row[7] - if row[8] > max_oc: - max_oc = row[8] - if row[9] > max_nc: - max_nc = row[9] - labels = ['sampleID', 'polarity', 'mz', 'rt', 'intensity', 'predictions', 'delta', 'hc', 'oc', 'nc', 'symbol_size'] - df = pd.DataFrame.from_records(vkData, columns=labels) - sampleIDs = df.sampleID.unique() - data = [] - menus = [] - i = 0 - for sampleID in sampleIDs: - dfSample = df.loc[df['sampleID'] == sampleID] - size = dfSample.symbol_size - trace = go.Scatter( - x = dfSample.oc, - y = dfSample.hc, - text = dfSample.predictions.apply(lambda x: "Prediction: "+str(x[0][1])+"<br>mz: " +str(x[0][0])+"<br>Delta: "+str(x[0][2])), - line = dict(width = 0.5), - mode = 'markers', - marker = dict( - size = size, - sizemode = "area", - color = dfSample.rt, - colorscale = 'Viridis', - cmin = 0, - cmax = max_rt, - colorbar=dict(title='Retention Time (s)'), - line = dict(width = 0.5), - showscale = True - ), - opacity = 0.8 - ) - data.append(trace) - vision = [] - j = 0 - while j < len(sampleIDs): - if j != i: - vision.append(False) - else: - vision.append(True) - j += 1 - menu = dict( - method = 'update', - label = sampleID, - args = [{'visible': vision}, {'title': sampleID}] - ) - menus.append(menu) - i += 1 - updatemenus = list([ - dict( - active = -1, - buttons = menus - ) - ]) - layout = go.Layout( - title = "Van Krevelen Diagram", - showlegend = False, - xaxis = dict( - title = 'Oxygen to Carbon Ratio', - zeroline = False, - gridcolor = 'rgb(183,183,183)', - showline = True, - range = [0, max_oc] - ), - yaxis = dict( - title = 'Hydrogen to Carbon Ratio', - zeroline = False, - gridcolor = 'rgb(183,183,183)', - showline = True, - range = [0, max_hc] - ), - margin = dict(r=0, b=100, l=100, t=100), - updatemenus = updatemenus - ) - fig = go.Figure(data=data, layout=layout) - py.plot(fig, auto_open=False, show_link=False, filename=vkOutput+'.html') - def polaritySanitizer(sample_polarity): if sample_polarity.lower() in {'positive','pos','+'}: sample_polarity = 'positive' @@ -302,9 +35,9 @@ raise ValueError return sample_polarity -# main -if vkInputType == "tsv": - vkInput = [] +# read input +vkInput = [] # each element is a feature from input +if INPUT_TYPE == "tsv": tsvFile = getattr(args, "input") try: with open(tsvFile, 'r') as f: @@ -314,12 +47,7 @@ vkInput.append([row[0],polaritySanitizer(row[1]),float(row[2]),float(row[3]),float(row[4])]) except ValueError: print('The %s data file could not be read.' % tsvFile) - vkData = forecaster(vkInput) - vkData = plotData(vkData) - saveForcast(vkData) - plotRatios(vkData) -elif vkInputType == "xcms": - vkInput = [] +else: # INPUT_TYPE == "xcms" xcmsSampleMetadataFile = getattr(args, "sample_metadata") try: polarity = {} @@ -328,8 +56,8 @@ next(xcmsSampleMetadata, None) # skip header for row in xcmsSampleMetadata: sample = row[0] - if vkPolarity: - polarity[sample] = vkPolarity + if POLARITY: + polarity[sample] = POLARITY else: sample_polarity = polaritySanitizer(row[2]) polarity[sample] = sample_polarity @@ -382,20 +110,130 @@ i+=1 except ValueError: print('The %s data file could not be read.' % xcmsDataMatrixFile) - vkData = forecaster(vkInput) - vkData = plotData(vkData) - saveForcast(vkData) - plotRatios(vkData) -else: - vkData = [] - tsvPlotvFile = getattr(args, "input") - try: - with open(tsvPlotFile, 'r') as f: - next(f) # skip header line - plotData = csv.reader(f, delimiter='\t') - for row in plotData: - vkData.append([row[0],row[1],float(row[2]),float(row[3]),float(row[4]),list(row[4]),float(row[5]),float(row[6]),float(row[7]),float(row[8])]) + +# store||generate remaining constants +OUTPUT = getattr(args, "output") +MASS_ERROR = getattr(args, "error") +UNIQUE = getattr(args, "unique") +NEUTRAL = getattr(args, "neutral") +DATABASE = getattr(args, "database") +DIRECTORY = getattr(args, "directory") +MASS = [] +FORMULA = [] +try: + with open(DIRECTORY+DATABASE, 'r') as tsv: + for row in tsv: + mass, formula = row.split() + MASS.append(mass) + FORMULA.append(formula) +except ValueError: + print('The %s database could not be loaded.' % DATABASE) +MAX_MASS_INDEX = len(MASS)-1 + +# adjust charged mass to a neutral mass +def adjust(mass, polarity): + # value to adjust by + proton = 1.007276 + if polarity == 'positive': + mass -= proton + else: # sanitized to negative + mass += proton + return mass + +# binary search to match a neutral mass to known mass within error +def predict(mass, uncertainty, left, right): + mid = ((right - left) / 2) + left + if left <= mid <= right and mid <= MAX_MASS_INDEX: + delta = float(MASS[mid]) - mass + if uncertainty >= abs(delta): + return mid + elif uncertainty < delta: + return predict(mass, uncertainty, left, mid-1) + else: + return predict(mass, uncertainty, mid+1, right) + return -1 + +# find and rank predictions which are adjacent to the index of an intial prediction +def predictNeighbors(mass, uncertainty, prediction): + i = 0 + neighbors = [[MASS[prediction],FORMULA[prediction],(float(MASS[prediction])-mass)],] + while prediction+i+1 <= MAX_MASS_INDEX: + neighbor = prediction+i+1 + delta = float(MASS[neighbor])-mass + if uncertainty >= abs(delta): + neighbors.append([MASS[neighbor],FORMULA[neighbor],delta]) + i += 1 + else: + break + i = 0 + while prediction+i-1 >= 0: + neighbor = prediction+i-1 + delta = float(MASS[neighbor])-mass + if uncertainty >= abs(delta): + neighbors.append([MASS[neighbor],FORMULA[neighbor],(float(MASS[neighbor])-mass)]) + i -= 1 + else: + break + neighbors = sorted(neighbors, key = (lambda delta: abs(delta[2]))) + return neighbors + +# predict formulas by the mass of a feature +def featurePrediction(feature): + if NEUTRAL: + mass = feature[2] + else: + mass = adjust(feature[2], feature[1]) # mz & polarity + uncertainty = mass * MASS_ERROR / 1e6 + prediction = predict(mass, uncertainty, 0, MAX_MASS_INDEX) + if prediction != -1: # else feature if forgotten + predictions = predictNeighbors(mass, uncertainty, prediction) + if UNIQUE and len(predictions) > 1: + return + feature.append(predictions) # feature[5] + formula = predictions[0][1] # formula of prediction with lowest abs(delta) + formulaList = re.findall('[A-Z][a-z]?|[0-9]+', formula) + formulaDictionary = {'C':0,'H':0,'O':0,'N':0} # other elements are easy to add + i = 0; + while i < len(formulaList): + if formulaList[i] in formulaDictionary: + # if there is only one of this element + if i+1 == len(formulaList) or formulaList[i+1].isalpha(): + formulaDictionary[formulaList[i]] = 1 + else: + formulaDictionary[formulaList[i]] = formulaList[i+1] + i+=1 + i+=1 + hc = float(formulaDictionary['H'])/float(formulaDictionary['C']) + oc = float(formulaDictionary['O'])/float(formulaDictionary['C']) + nc = float(formulaDictionary['N'])/float(formulaDictionary['C']) + feature += [hc, oc, nc] # feature[6], [7], [8] + return(feature) + +# write output file +def write(vkData): + json = '' + try: + # write tabular file and generate json for html output + with open(OUTPUT+'.tsv', 'w') as f: + f.writelines(str("sample_id\tpolarity\tmz\trt\tintensity\tpredictions\thc\toc\tnc") + '\n') + for feature in vkData: + f.writelines(feature[0]+'\t'+feature[1]+'\t'+str(feature[2])+'\t'+str(feature[3])+'\t'+str(feature[4])+'\t'+str(feature[5])+'\t'+str(feature[6])+'\t'+str(feature[7])+'\t'+str(feature[8])+'\n') + json += '{sample_id:\''+str(feature[0])+'\', polarity:\''+str(feature[1])+'\', mz:'+str(feature[2])+', rt:'+str(feature[3])+', intensity:'+str(feature[4])+', predictions:'+str(feature[5])+', hc:'+str(feature[6])+', oc:'+str(feature[7])+', nc:'+str(feature[8])+'},' + json = json[:-1] # remove final comma + # write html + try: + with open(DIRECTORY+'d3.html', 'r') as template, open(OUTPUT+'.html', 'w') as f: + for line in template: + line = re.sub('^var data.*$', 'var data = ['+json+']', line, flags=re.M) + f.write(line) + except ValueError: + print('"%s" could not be read or "%s" could not be written' % template, f) except ValueError: - print('The %s data file could not be read.' % tsvFile) - plotRatios(vkData) + print('"%s" could not be saved.' % filename) +# main +vkData = map(featurePrediction, vkInput) +vkData = [x for x in vkData if x is not None] +# sort by intensity so D3 draws largest symbols first +vkData.sort(key=lambda x: x[4], reverse=True) +write(vkData)