diff vkmz.py @ 1:b02af8eb8e6e draft

planemo upload for repository https://github.com/HegemanLab/VKMZ commit 5e7a43415df3902b44b7623cb2c6ffb8845751ac
author eslerm
date Wed, 30 May 2018 13:17:32 -0400
parents 0b8ddf650752
children 04079c34452a
line wrap: on
line diff
--- a/vkmz.py	Wed May 02 18:31:06 2018 -0400
+++ b/vkmz.py	Wed May 30 13:17:32 2018 -0400
@@ -33,11 +33,12 @@
   inputSubparser.add_argument('--error',    '-e', nargs='?', type=float, required=True, help='Mass error of mass spectrometer in PPM')
   inputSubparser.add_argument('--database', '-d', nargs='?', default='databases/bmrb-light.tsv', help='Select database.')
   inputSubparser.add_argument('--directory', nargs='?', default='', type=str, help='Define directory of tool.')
+  inputSubparser.add_argument('--polarity', '-p', choices=['positive','negative'], help='Force polarity mode. Ignore variables in input file.')
   inputSubparser.add_argument('--no-adjustment', '-na', action='store_true', help='Use flag to turn off polarity based mass adjustment. This flag should always be used if reprocessing data generated by VKMZ.')
   inputSubparser.add_argument('--multiprocessing', '-m', action='store_true', help='Use flag to turn on multiprocessing.')
-  inputSubparser.add_argument('--plottype', '-p', nargs='?', default='scatter-2d', choices=['scatter-2d', 'scatter-3d'], help='Select plot type.')
-  inputSubparser.add_argument('--size',     '-s', nargs='?', default=5, type=int, help='Set size of of dots. size+2*log(size*peak/(highest_peak/lowest_peak')
-  inputSubparser.add_argument('--size-algorithm', '-a', nargs='?', default=0, type=int, choices=[0,1,2],help='Size algorithm selector. Algo 0: size, Algo 1: size+2*log(size*peak/(highest_peak/lowest_peak, Algo 2: size+2*size*peak/(highest_peak-lowest_peak)')
+  inputSubparser.add_argument('--plottype', '-t', nargs='?', default='scatter-2d', choices=['scatter-2d', 'scatter-3d'], help='Select plot type.')
+  inputSubparser.add_argument('--size',     '-s', nargs='?', default=5, type=int, help='Set maxium size of plot symbols.')
+  inputSubparser.add_argument('--size-algorithm', '-a', nargs='?', default=0, type=int, choices=[0,1], help="Symbol size algorithm selector. Algorithm 0 sets all symbols to the maxium size. Algorithm 2 determines a features symbol size by it's log intensity.")
 args = parser.parse_args()
 
 vkInputType = getattr(args, "input-type")
@@ -45,6 +46,8 @@
 # read inputs, arguments and define globals
 vkError = getattr(args, "error")
 
+vkPolarity = getattr(args, "polarity")
+
 vkMultiprocessing = getattr(args, "multiprocessing")
 
 vkNoAdjustment = getattr(args, "no_adjustment")
@@ -100,7 +103,7 @@
   if prediction != -1:
     feature[2] = mass
     predictions = predictNeighbors(mass, uncertainty, prediction)
-    feature[5] = predictions
+    feature.append(predictions) # feature[5]
     predictionClosest = predictions[0]
     formula = predictionClosest[1]
     formulaList = re.findall('[A-Z][a-z]?|[0-9]+', formula)
@@ -123,7 +126,7 @@
     feature += [predictionClosestDelta, hc, oc, nc]
     return(feature)
 
-# adjust observed mass to a neutral mass
+# adjust charged mass to a neutral mass
 def adjust(mass, polarity):
   # value to adjust by
   proton = 1.007276
@@ -147,6 +150,22 @@
       return predict(mass, uncertainty, mid+1, right)
   return -1
   
+def plotData(vkData):
+  if vkSizeAlgo == 0:
+    for row in vkData:
+      row.append(vkSize)
+  else:
+    max_intensity = 0.0
+    for row in vkData:
+      intensity = row[4]
+      if intensity > max_intensity:
+        max_intensity = intensity   
+    alpha = vkSize/math.log(max_intensity+1)
+    for row in vkData:
+      intensity = row[4]
+      row.append(alpha*math.log(intensity+1))
+  return vkData
+
 # find and sort known masses within error limit of observed mass
 def predictNeighbors(mass, uncertainty, prediction):
   i = 0
@@ -173,36 +192,29 @@
 
 # write output file
 def saveForcast(vkOutputList):
-  try:
+  try: 
     with open(vkOutput+'.tsv', 'w') as f: 
-      f.writelines(str("sample_id\tpolarity\tmz\tretention_time\tintensity\tpredictions\tdelta\tH:C\tO:C\tN:C") + '\n')
+      f.writelines(str("sample_id\tpolarity\tmz\tretention_time\tintensity\tpredictions\tdelta\tH:C\tO:C\tN:C\tsymbol_size") + '\n')
       for feature in vkOutputList:
-       f.writelines(feature[0]+'\t'+feature[1]+'\t'+str(feature[2])+'\t'+str(feature[3])+'\t'+str(feature[4])+'\t'+str(feature[5])+'\t'+str(feature[6])+'\t'+str(feature[7])+'\t'+str(feature[8])+'\t'+str(feature[9])+'\t'+'\n')
+        f.writelines(feature[0]+'\t'+feature[1]+'\t'+str(feature[2])+'\t'+str(feature[3])+'\t'+str(feature[4])+'\t'+str(feature[5])+'\t'+str(feature[6])+'\t'+str(feature[7])+'\t'+str(feature[8])+'\t'+str(feature[9])+'\t'+str(feature[10])+'\t'+'\n')
   except ValueError:
     print('"%s" could not be saved.' % filename)
 
 def plotRatios(vkData):
   max_rt = 0
-  min_intensity = 10.0**10
-  max_intensity = 0.0
   max_hc = 0
   max_oc = 0
   max_nc = 0
   for row in vkData:
     if row[3] > max_rt:
       max_rt = row[3]
-    intensity = float(row[4])
-    if intensity < min_intensity:
-      min_intensity = intensity
-    if intensity > max_intensity:
-      max_intensity = intensity
     if row[7] > max_hc:
       max_hc = row[7]
     if row[8] > max_oc:
       max_oc = row[8]
     if row[9] > max_nc:
       max_nc = row[9]
-  labels = ['sampleID', 'polarity', 'mz', 'rt', 'intensity', 'predictions', 'delta', 'hc', 'oc', 'nc']
+  labels = ['sampleID', 'polarity', 'mz', 'rt', 'intensity', 'predictions', 'delta', 'hc', 'oc', 'nc', 'symbol_size']
   df = pd.DataFrame.from_records(vkData, columns=labels)
   sampleIDs = df.sampleID.unique()
   data = []
@@ -210,10 +222,7 @@
   i = 0
   for sampleID in sampleIDs:
     dfSample = df.loc[df['sampleID'] == sampleID]
-    if vkSizeAlgo == 0:
-      size = dfSample.intensity.apply(lambda x: vkSize)
-    else:
-      size = dfSample.intensity.apply(lambda x: vkSize+4*vkSize*float(x)/max_intensity)
+    size = dfSample.symbol_size
     trace = go.Scatter(
       x = dfSample.oc,
       y = dfSample.hc,
@@ -222,6 +231,7 @@
       mode = 'markers',
       marker = dict(
         size = size,
+        sizemode = "area",
         color = dfSample.rt,
         colorscale = 'Viridis',
         cmin = 0,
@@ -296,10 +306,11 @@
       next(f) # skip hearder line
       tsvData = csv.reader(f, delimiter='\t')
       for row in tsvData:
-        vkInput.append([row[0],polaritySanitizer(row[1]),float(row[2]),float(row[3]),float(row[4]),[]])
+        vkInput.append([row[0],polaritySanitizer(row[1]),float(row[2]),float(row[3]),float(row[4])])
   except ValueError:
     print('The %s data file could not be read.' % tsvFile)
   vkData = forecaster(vkInput)
+  vkData = plotData(vkData)
   saveForcast(vkData)
   plotRatios(vkData)
 elif vkInputType == "xcms":
@@ -312,8 +323,11 @@
       next(xcmsSampleMetadata, None) # skip header
       for row in xcmsSampleMetadata:
         sample = row[0]
-        sample_polarity = polaritySanitizer(row[2])
-        polarity[sample] = sample_polarity
+        if vkPolarity:
+          polarity[sample] = vkPolarity
+        else:
+          sample_polarity = polaritySanitizer(row[2])
+          polarity[sample] = sample_polarity
   except ValueError:
     print('The %s data file could not be read. Check that polarity is set to "negative" or "positive"' % xcmsSampleMetadataFile)
   xcmsVariableMetadataFile = getattr(args, "variable_metadata")
@@ -359,11 +373,12 @@
                 sample = sample_id[i]
                 # XCMS data may include empty columns
                 if sample != "":
-                  vkInput.append([sample, polarity[sample], mz[variable], rt[variable], float(intensity), []])
+                  vkInput.append([sample, polarity[sample], mz[variable], rt[variable], float(intensity)])
             i+=1
   except ValueError:
     print('The %s data file could not be read.' % xcmsDataMatrixFile)
   vkData = forecaster(vkInput)
+  vkData = plotData(vkData)
   saveForcast(vkData)
   plotRatios(vkData)
 else: