diff vkmz.py @ 0:0b8ddf650752 draft

planemo upload for repository https://github.com/HegemanLab/VKMZ commit 7c299d22bdce251ce599cd34df76919d297a7007-dirty
author eslerm
date Wed, 02 May 2018 18:31:06 -0400
parents
children b02af8eb8e6e
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vkmz.py	Wed May 02 18:31:06 2018 -0400
@@ -0,0 +1,381 @@
+'''
+VKMZ: formula predictions are based on the BMRB compound database, available at:
+http://www.bmrb.wisc.edu/ftp/pub/bmrb/relational_tables/metabolomics/Chem_comp.csv
+'''
+
+import re
+import argparse
+import ast
+import csv
+from multiprocessing import Pool
+
+import pandas as pd
+import plotly.offline as py
+import plotly.graph_objs as go
+ 
+parser = argparse.ArgumentParser()
+inputSubparser = parser.add_subparsers(help='Select input type:', dest='input-type')
+parse_tsv = inputSubparser.add_parser('tsv', help='Use tabular data as input.')
+parse_tsv.add_argument('--input', '-i', required=True, help='Path to tabular file. Columns must be ordered: sample ID, polarity, mz, retention time, intensity.')
+parse_tsv.add_argument('--no-plot', '-np', action='store_true', help='Disable plot generation.')
+parse_xcms = inputSubparser.add_parser('xcms', help='Use XCMS data as input.')
+parse_xcms.add_argument('--data-matrix', '-xd', required=True, nargs='?', type=str, help='Path to XCMS dataMatrix file.')
+parse_xcms.add_argument('--sample-metadata', '-xs', required=True, nargs='?', type=str, help='Path to XCMS sampleMetadata file.')
+parse_xcms.add_argument('--variable-metadata', '-xv', required=True, nargs='?', type=str, help='Path to XCMS variableMetadata file.')
+parse_xcms.add_argument('--no-plot', '-n', action='store_true', help='Disable plot generation.')
+parse_plot = inputSubparser.add_parser('plot', help='Only plot data.')
+parse_plot.add_argument('--input', '-i', required=True, nargs='?', type=str, help='Path to VKMZ generated tabular file.')
+# arguments shared by the tsv and xcms input types
+for inputSubparser in [parse_tsv, parse_xcms]:
+  inputSubparser.add_argument('--error',    '-e', nargs='?', type=float, required=True, help='Mass error of the mass spectrometer in PPM.')
+  inputSubparser.add_argument('--database', '-d', nargs='?', default='databases/bmrb-light.tsv', help='Select database of known formula masses.')
+  inputSubparser.add_argument('--directory', nargs='?', default='', type=str, help='Define the tool directory path.')
+  inputSubparser.add_argument('--no-adjustment', '-na', action='store_true', help='Use flag to turn off polarity based mass adjustment. This flag should always be used if reprocessing data generated by VKMZ.')
+  inputSubparser.add_argument('--multiprocessing', '-m', action='store_true', help='Use flag to turn on multiprocessing.')
+# arguments shared by all input types
+for inputSubparser in [parse_tsv, parse_xcms, parse_plot]:
+  inputSubparser.add_argument('--output',   '-o', nargs='?', type=str, required=True, help='Specify output file path.')
+  inputSubparser.add_argument('--plottype', '-p', nargs='?', default='scatter-2d', choices=['scatter-2d', 'scatter-3d'], help='Select plot type.')
+  inputSubparser.add_argument('--size',     '-s', nargs='?', default=5, type=int, help='Set the base size of plot markers.')
+  inputSubparser.add_argument('--size-algorithm', '-a', nargs='?', default=0, type=int, choices=[0,1,2], help='Select marker size algorithm: 0 draws constant-size markers, 1 and 2 scale marker size by intensity relative to the most intense feature.')
+args = parser.parse_args()
+
+vkInputType = getattr(args, "input-type")
+
+# read arguments and define globals
+# arguments that only exist for some input types fall back to defaults
+vkError = getattr(args, "error", None)
+
+vkMultiprocessing = getattr(args, "multiprocessing", False)
+
+vkNoAdjustment = getattr(args, "no_adjustment", False)
+
+vkDatabaseFile = getattr(args, "database", None)
+vkDirectory = getattr(args, "directory", '')
+
+# load known masses and formulas from the database, which is assumed to be sorted by mass
+vkMass = []
+vkFormula = []
+if vkDatabaseFile:
+  try:
+    with open(vkDirectory+vkDatabaseFile, 'r') as tsv:
+      next(tsv) # skip header row
+      for row in tsv:
+        mass, formula = row.split()
+        vkMass.append(mass)
+        vkFormula.append(formula)
+  except (IOError, ValueError):
+    print('The %s database could not be loaded.' % vkDatabaseFile)
+vkMaxIndex = len(vkMass)-1
+
+vkOutput = getattr(args, "output")
+
+vkPlotType = getattr(args, 'plottype')
+
+vkSize = getattr(args, 'size')
+
+vkSizeAlgo = getattr(args, 'size_algorithm')
+
+# map featurePrediction over all input features, in parallel if --multiprocessing is set
+def forecaster(vkInput):
+  if vkMultiprocessing:
+    try:
+      pool = Pool()
+      vkOutputList = pool.map(featurePrediction, vkInput)
+    except Exception as e:
+      print("Error during multirpocessing: "+str(e))
+    finally:
+      pool.close()
+      pool.join()
+  else:
+    vkOutputList = map(featurePrediction, vkInput)
+  vkOutputList = [x for x in vkOutputList if x is not None]
+  return(vkOutputList)
+
+# predict a feature's formula and append prediction data to the feature
+def featurePrediction(feature):
+  if vkNoAdjustment:
+    mass = feature[2]
+  else:
+    mass = adjust(feature[2], feature[1]) # mz & polarity
+  uncertainty = mass * vkError / 1e6
+  prediction = predict(mass, uncertainty, 0, vkMaxIndex)
+  if prediction != -1:
+    feature[2] = mass
+    predictions = predictNeighbors(mass, uncertainty, prediction)
+    feature[5] = predictions
+    predictionClosest = predictions[0]
+    formula = predictionClosest[1]
+    # tokenize the formula into element symbols and counts, e.g. C6H12O6 -> ['C','6','H','12','O','6']
+    formulaList = re.findall('[A-Z][a-z]?|[0-9]+', formula)
+    formulaDictionary = {'C':0,'H':0,'O':0,'N':0}
+    i = 0
+    while i < len(formulaList):
+      if formulaList[i] in formulaDictionary:
+        # an element symbol followed by another symbol, or at the end of the
+        # formula, occurs exactly once
+        if i+1 == len(formulaList) or formulaList[i+1].isalpha():
+          formulaDictionary[formulaList[i]] = 1
+        else:
+          formulaDictionary[formulaList[i]] = formulaList[i+1]
+          i+=1
+      i+=1
+    predictionClosest.append(formulaDictionary)
+    hc = float(formulaDictionary['H'])/float(formulaDictionary['C'])
+    oc = float(formulaDictionary['O'])/float(formulaDictionary['C'])
+    nc = float(formulaDictionary['N'])/float(formulaDictionary['C'])
+    predictionClosestDelta = feature[5][0][2]
+    feature += [predictionClosestDelta, hc, oc, nc]
+    return(feature)
+
+# adjust observed mass to a neutral mass
+def adjust(mass, polarity):
+  # proton mass in daltons
+  proton = 1.007276
+  if polarity == 'positive':
+    mass += proton
+  elif polarity == 'negative':
+    mass -= proton
+  return mass
+
+# binary search for a known mass within the error window of an observed mass
+# https://en.wikipedia.org/wiki/Binary_search_algorithm
+def predict(mass, uncertainty, left, right):
+  mid = ((right - left) // 2) + left
+  if left <= mid <= right and mid <= vkMaxIndex:
+    delta = float(vkMass[mid]) - mass
+    if uncertainty >= abs(delta):
+      return mid
+    elif uncertainty < delta:
+      return predict(mass, uncertainty, left, mid-1)
+    else:
+      return predict(mass, uncertainty, mid+1, right)
+  return -1
+  
+# find and sort known masses within error limit of observed mass
+def predictNeighbors(mass, uncertainty, prediction):
+  i = 0
+  neighbors = [[vkMass[prediction],vkFormula[prediction],(float(vkMass[prediction])-mass)],]
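+  # walk forward through the sorted mass list until a known mass falls outside the error window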
+  while prediction+i+1 <= vkMaxIndex:
+    neighbor = prediction+i+1
+    delta = float(vkMass[neighbor])-mass
+    if uncertainty >= abs(delta):
+      neighbors.append([vkMass[neighbor],vkFormula[neighbor],delta])
+      i += 1
+    else:
+      break
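+  # walk backward through the sorted mass list until a known mass falls outside the error window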
+  i = 0
+  while prediction+i-1 >= 0:
+    neighbor = prediction+i-1
+    delta = float(vkMass[neighbor])-mass
+    if uncertainty >= abs(delta):
+      neighbors.append([vkMass[neighbor],vkFormula[neighbor],delta])
+      i -= 1
+    else:
+      break
+  neighbors = sorted(neighbors, key = (lambda delta: abs(delta[2])))
+  return neighbors
+
+# write predicted features to a tabular output file
+def saveForecast(vkOutputList):
+  try:
+    with open(vkOutput+'.tsv', 'w') as f:
+      f.write("sample_id\tpolarity\tmz\tretention_time\tintensity\tpredictions\tdelta\tH:C\tO:C\tN:C\n")
+      for feature in vkOutputList:
+        f.write('\t'.join(str(x) for x in feature)+'\n')
+  except IOError:
+    print('"%s" could not be saved.' % vkOutput)
+
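+# plot a Van Krevelen diagram (O:C vs. H:C) with a dropdown menu to select a sample;
+# note that only the 2-D scatter is currently drawn, regardless of --plottype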
+def plotRatios(vkData):
+  max_rt = 0
+  min_intensity = 10.0**10
+  max_intensity = 0.0
+  max_hc = 0
+  max_oc = 0
+  max_nc = 0
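+  # scan all features for axis maxima and intensity bounds used to scale markers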
+  for row in vkData:
+    if row[3] > max_rt:
+      max_rt = row[3]
+    intensity = float(row[4])
+    if intensity < min_intensity:
+      min_intensity = intensity
+    if intensity > max_intensity:
+      max_intensity = intensity
+    if row[7] > max_hc:
+      max_hc = row[7]
+    if row[8] > max_oc:
+      max_oc = row[8]
+    if row[9] > max_nc:
+      max_nc = row[9]
+  labels = ['sampleID', 'polarity', 'mz', 'rt', 'intensity', 'predictions', 'delta', 'hc', 'oc', 'nc']
+  df = pd.DataFrame.from_records(vkData, columns=labels)
+  sampleIDs = df.sampleID.unique()
+  data = []
+  menus = []
+  i = 0
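+  # build one scatter trace and one dropdown button per sample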
+  for sampleID in sampleIDs:
+    dfSample = df.loc[df['sampleID'] == sampleID]
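+    # size algorithm 0 draws constant-size markers; algorithms 1 and 2 scale marker size by intensity relative to the most intense feature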
+    if vkSizeAlgo == 0:
+      size = dfSample.intensity.apply(lambda x: vkSize)
+    else:
+      size = dfSample.intensity.apply(lambda x: vkSize+4*vkSize*float(x)/max_intensity)
+    trace = go.Scatter(
+      x = dfSample.oc,
+      y = dfSample.hc,
+      text = dfSample.predictions.apply(lambda x: "Prediction: "+str(x[0][1])+"<br>mz: " +str(x[0][0])+"<br>Delta: "+str(x[0][2])),
+      line = dict(width = 0.5),
+      mode = 'markers',
+      marker = dict(
+        size = size,
+        color = dfSample.rt,
+        colorscale = 'Viridis',
+        cmin = 0,
+        cmax = max_rt,
+        colorbar=dict(title='Retention Time (s)'),
+        line = dict(width = 0.5),
+        showscale = True
+      ),
+      opacity = 0.8
+    )
+    data.append(trace)
+    vision = []
+    j = 0
+    while j < len(sampleIDs):
+      if j != i:
+        vision.append(False)
+      else:
+        vision.append(True)
+      j += 1
+    menu = dict(
+      method = 'update',
+      label = sampleID,
+      args = [{'visible': vision}, {'title': sampleID}]
+    )
+    menus.append(menu)
+    i += 1
+  updatemenus = list([
+    dict(
+      active = -1,
+      buttons = menus
+    )
+  ])
+  layout = go.Layout(
+    title = "Van Krevelen Diagram",
+    showlegend = False,
+    xaxis = dict(
+      title = 'Oxygen to Carbon Ratio',
+      zeroline = False,
+      gridcolor = 'rgb(183,183,183)',
+      showline = True,
+      range = [0, max_oc]
+    ),
+    yaxis = dict(
+      title = 'Hydrogen to Carbon Ratio',
+      zeroline = False,
+      gridcolor = 'rgb(183,183,183)',
+      showline = True,
+      range = [0, max_hc]
+    ),
+    margin = dict(r=0, b=100, l=100, t=100),
+    updatemenus = updatemenus
+  )     
+  fig = go.Figure(data=data, layout=layout)
+  py.plot(fig, auto_open=False, show_link=False, filename=vkOutput+'.html')
+ 
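+# normalize polarity strings from input metadata to 'positive' or 'negative'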
+def polaritySanitizer(sample_polarity):
+  if sample_polarity.lower() in {'positive','pos','+'}:
+    sample_polarity = 'positive'
+  elif sample_polarity.lower() in {'negative', 'neg', '-'}:
+    sample_polarity = 'negative'
+  else:
+    print('A sample has an unknown polarity type: %s. Polarity in the input metadata should be set to "negative" or "positive".' % sample_polarity)
+    raise ValueError
+  return sample_polarity
+
+# main
+if vkInputType == "tsv":
+  vkInput = []
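+  # each feature row becomes [sample id, polarity, mz, retention time, intensity, predictions]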
+  tsvFile = getattr(args, "input")
+  try:
+    with open(tsvFile, 'r') as f:
+      next(f) # skip header line
+      tsvData = csv.reader(f, delimiter='\t')
+      for row in tsvData:
+        vkInput.append([row[0],polaritySanitizer(row[1]),float(row[2]),float(row[3]),float(row[4]),[]])
+  except (IOError, ValueError):
+    print('The %s data file could not be read.' % tsvFile)
+  vkData = forecaster(vkInput)
+  saveForecast(vkData)
+  plotRatios(vkData)
+elif vkInputType == "xcms":
+  vkInput = []
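+  # XCMS input is split across three files: sample metadata (polarity), variable metadata (mz and retention time), and the data matrix (intensities)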
+  xcmsSampleMetadataFile = getattr(args, "sample_metadata")
+  try:
+    polarity = {}
+    with open(xcmsSampleMetadataFile, 'r') as f:
+      xcmsSampleMetadata = csv.reader(f, delimiter='\t')
+      next(xcmsSampleMetadata, None) # skip header
+      for row in xcmsSampleMetadata:
+        sample = row[0]
+        sample_polarity = polaritySanitizer(row[2])
+        polarity[sample] = sample_polarity
+  except (IOError, ValueError):
+    print('The %s data file could not be read. Check that polarity is set to "negative" or "positive".' % xcmsSampleMetadataFile)
+  xcmsVariableMetadataFile = getattr(args, "variable_metadata")
+  try:
+    mz = {}
+    rt = {}
+    variable_index = {}
+    mz_index = int()
+    rt_index = int()
+    with open(xcmsVariableMetadataFile, 'r') as f:
+      xcmsVariableMetadata = csv.reader(f, delimiter='\t')
+      i = 0
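+      # the first row is a header; record each column's index to locate "mz" and "rt"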
+      for row in xcmsVariableMetadata:
+        if i != 0:
+          mz[row[0]] = float(row[mz_index])
+          rt[row[0]] = float(row[rt_index])
+        else:
+          for column in row:
+            variable_index[column] = i
+            i += 1
+          mz_index = variable_index["mz"]
+          rt_index = variable_index["rt"]
+  except (IOError, ValueError):
+    print('The %s data file could not be read.' % xcmsVariableMetadataFile)
+  xcmsDataMatrixFile = getattr(args, "data_matrix")
+  try:
+    with open(xcmsDataMatrixFile, 'r') as f:
+      xcmsDataMatrix = csv.reader(f, delimiter='\t')
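+      # the header row holds sample names; each later row holds one variable's intensity per sample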
+      first_row = True
+      for row in xcmsDataMatrix:
+        if first_row:
+          sample_id = row
+          first_row = False
+        else: 
+          # column 0 holds the variable (feature) name; remaining columns hold per-sample intensities
+          i = 0
+          while(i < len(row)):
+            if i != 0:
+              intensity = row[i]
+              if intensity not in {'NA', '#DIV/0!', '0'}:
+                variable = row[0]
+                sample = sample_id[i]
+                # XCMS data may include empty columns
+                if sample != "":
+                  vkInput.append([sample, polarity[sample], mz[variable], rt[variable], float(intensity), []])
+            i+=1
+  except (IOError, ValueError):
+    print('The %s data file could not be read.' % xcmsDataMatrixFile)
+  vkData = forecaster(vkInput)
+  saveForecast(vkData)
+  plotRatios(vkData)
+else:
+  vkData = []
+  tsvPlotvFile = getattr(args, "input")
+  try:
+    with open(tsvPlotFile, 'r') as f:
+      next(f) # skip header line
+      plotData = csv.reader(f, delimiter='\t')
+      for row in plotData:
+        vkData.append([row[0],row[1],float(row[2]),float(row[3]),float(row[4]),list(row[4]),float(row[5]),float(row[6]),float(row[7]),float(row[8])])
+  except ValueError:
+    print('The %s data file could not be read.' % tsvFile)
+  plotRatios(vkData)
+