changeset 2:e03582f26617 draft

planemo upload for repository https://github.com/HegemanLab/w4mcorcov_galaxy_wrapper/tree/master commit 7682e8e7ae2bfb926d94b414b9a1649389f33582
author eschen42
date Sun, 12 Nov 2017 19:45:36 -0500
parents 0c2ad44b6c9c
children 5aaab36bc523
files w4mcorcov.xml w4mcorcov_calc.R w4mcorcov_wrapper.R
diffstat 3 files changed, 75 insertions(+), 31 deletions(-) [+]
line wrap: on
line diff
--- a/w4mcorcov.xml	Sun Oct 22 18:47:57 2017 -0400
+++ b/w4mcorcov.xml	Sun Nov 12 19:45:36 2017 -0500
@@ -1,4 +1,4 @@
-<tool id="w4mcorcov" name="OPLS-DA_Contrasts" version="0.98.3">
+<tool id="w4mcorcov" name="OPLS-DA_Contrasts" version="0.98.5">
 
   <description>OPLS-DA Contrasts of Univariate Results</description>
   
@@ -23,6 +23,7 @@
     levCSV '$levCSV'
     matchingC '$matchingC'
     labelFeatures '$labelFeatures'
+    labelOrthoFeatures '$labelOrthoFeatures'
     contrast_detail '$contrast_detail'
     contrast_corcov '$contrast_corcov'
     contrast_salience '$contrast_salience'
@@ -33,7 +34,7 @@
     <param name="sampleMetadata_in" label="Sample metadata file" type="data" format="tabular" help="Samples x metadata (tabular data - decimal: '.'; missing: NA; mode: character or numerical; separator: tab character)" />
     <param name="variableMetadata_in" label="Variable metadata file (ideally from Univariate)" type="data" format="tabular" help="Features x metadata (tabular data - decimal: '.'; missing: NA; mode: character or numerical; separator: tab character)" />
     <param name="facC" label="Factor of interest" type="text" help="REQUIRED - The name of the column of sampleMetadata corresponding to the qualitative variable used to define the contrasts.  Except when the 'Univariate Significance-test' is set to 'none', this also must be a portion of the column names in the variableMetadata file."/>
-    <param name="tesC" label="Univariate Significance-Test" type="select" help="Either 'none' or the name of the statistical test that was run by the 'Univariate' tool to produce the variableMetadata file; that name must also be a portion of the column names in that file.">
+    <param name="tesC" label="Univariate significance-test" type="select" help="Either 'none' or the name of the statistical test that was run by the 'Univariate' tool to produce the variableMetadata file; that name must also be a portion of the column names in that file.">
       <option value="none">none - Display all features from variableMetadata (rather than choosing a subset based on significance in univariate testing)</option>
       <option value="ttest">ttest - Student's t-test (parametric test, qualitative factor with exactly 2 levels)</option>
       <option value="anova">anova - Analysis of variance (parametric test, qualitative factor with more than 2 levels)</option>
@@ -47,7 +48,7 @@
       truevalue="TRUE"
       falsevalue="FALSE"
       label="Retain only pairwise-significant features"
-      help="When 'none' is chosen, all features are included in the analysis.  Otherwise, when this option is set to 'Yes', analysis will be performed including only features that differ significantly for the pair of levels being contrasted; when set to 'No', any feature that varies significantly across all levels will be included (i.e., exclude any feature that is not significantly different across all levels).  See examples below."/>
+      help="When 'none' is chosen as the test, all features are included in the analysis (i.e., this parameter is ignored).  Otherwise, when this option is set to 'Yes', analysis will be performed including only features that differ significantly for the pair of levels being contrasted; when set to 'No', any feature that varies significantly across all levels will be included (i.e., exclude any feature that is not significantly different across all levels).  See examples below."/>
     <param name="levCSV" label="Levels of interest" type="text" value = "*" help="Comma-separated level-names (or comma-less regular expressions to match level-names) to consider in analysis; must match at least two levels; levels must be non-numeric; may include wild cards or regular expressions.  Note that extra space characters will affect results - 'a,b' is correct, but 'a , b' is not and may fail or give different results.">
       <sanitizer>
         <valid initial="string.letters">
@@ -74,17 +75,25 @@
         </valid>
       </sanitizer>
     </param>
-    <param name="matchingC" label="Level-name matching" type="select" help="How to specify level-names generically (if at all).">
+    <param name="matchingC" label="Level-name matching" type="select" help="How to specify level-names generically. (See help below for details on using wild cards or regular expressions.)">
       <option value="none">do no generic matching (default)</option>
       <option value="wildcard" selected="true">use wild-cards for matching level-names</option>
       <option value="regex">use regular expressions for matching level-names</option>
     </param>
-    <param name="labelFeatures" type="text" value="3" label="Number of features having extreme loadings to label on cov-vs.-cor plot" help="Specify the number of features at each of the four loading-extremes that should be labelled (with the name of the feature) on the covariance-vs.-correlation plot; specify 'ALL' to label all features; this choice has no effect on the OPLS-DA loadings plot."/>
+    <param name="labelFeatures" type="text" value="3" label="How many features having extreme loadings should be labelled on cov-vs.-cor plot" help="Specify the number of features at each of the loading-extremes that should be labelled (with the name of the feature) on the covariance-vs.-correlation plot; specify 'ALL' to label all features or '0' to label no features; this choice has no effect on the OPLS-DA loadings plot."/>
+    <param
+      name="labelOrthoFeatures"
+      type="boolean"
+      checked="false"
+      truevalue="TRUE"
+      falsevalue="FALSE"
+      label="Label features having extreme orthogonal loadings"
+      help="When using the preceding parameter to label only features at the loading-extremes in the cor-vs.-cov plot, use 'no' here to label only features having extreme parallel loadings (loadp); this is the default.  Choose 'yes' to add labels also to features having extreme orthogonal loadings (both loado and loadp); this may clutter the plot."/>
   </inputs>
 
   <outputs>
     <!--
-      pdf1: summaries of each contrasts, clearly labeled by level=pair name
+      pdf1: summaries of each contrast, clearly labelled by level=pair name
         * first PCA score-plot
         * then PLS score-plot
         * then PLS S-PLOT; color in red features with VIP > 1; color in grey any non-pairwise-significant features, if these are included
@@ -120,7 +129,8 @@
       <param name="tesC" value="kruskal"/>
       <param name="facC" value="k10"/>
       <param name="pairSigFeatOnly" value="FALSE"/>
-      <param name="labelFeatures" value="TRUE"/>
+      <param name="labelFeatures" value="3"/>
+      <param name="labelOrthogonalFeatures" value="FALSE"/>
       <param name="levCSV" value="k[12],k[3-4]"/>
       <param name="matchingC" value="regex"/>
       <output name="contrast_corcov">
@@ -183,7 +193,8 @@
       <param name="tesC" value="kruskal"/>
       <param name="facC" value="k10"/>
       <param name="pairSigFeatOnly" value="TRUE"/>
-      <param name="labelFeatures" value="TRUE"/>
+      <param name="labelFeatures" value="3"/>
+      <param name="labelOrthogonalFeatures" value="TRUE"/>
       <param name="levCSV" value="k[12],k[3-4]"/>
       <param name="matchingC" value="regex"/>
       <output name="contrast_corcov">
@@ -244,7 +255,8 @@
       <param name="tesC" value="none"/>
       <param name="facC" value="k10"/>
       <param name="pairSigFeatOnly" value="TRUE"/>
-      <param name="labelFeatures" value="FALSE"/>
+      <param name="labelFeatures" value="3"/>
+      <param name="labelOrthogonalFeatures" value="FALSE"/>
       <param name="levCSV" value="k[12],k[3-4]"/>
       <param name="matchingC" value="regex"/>
       <output name="contrast_corcov">
@@ -333,8 +345,8 @@
 The W4M 'Univariate' tool (Thévenot *et al.*, 2015) adds the results of family-wise corrected pairwise significance-tests as columns of the **variableMetadata** dataset.
 For instance, suppose that you ran Kruskal-Wallis testing for a column named 'cluster' in sampleMetadata that has values 'k1' and 'k2' and at least one other value.
 
-- A column of variableMetadata would be labeled 'cluster_kruskal_sig' and would have values '1' and '0'; when the samples are grouped by 'cluster', '1' means that there is strong evidence against the hypothesis that there is no difference among the intensities for the feature across all sample-groups.
-- A column of variableMetadata would be labeled 'cluster_kruskal_k1.k2_sig' and would have values '1' and '0', where '1' means that there is significant evidence against the hypothesis that samples from sampleMetadata whose 'cluster' column contains 'k1' or 'k2' have the same intensity for that feature.
+- A column of variableMetadata would be labelled 'cluster_kruskal_sig' and would have values '1' and '0'; when the samples are grouped by 'cluster', '1' means that there is strong evidence against the hypothesis that there is no difference among the intensities for the feature across all sample-groups.
+- A column of variableMetadata would be labelled 'cluster_kruskal_k1.k2_sig' and would have values '1' and '0', where '1' means that there is significant evidence against the hypothesis that samples from sampleMetadata whose 'cluster' column contains 'k1' or 'k2' have the same intensity for that feature.
 
 The 'PLS-DA Contrasts' tool produces graphics and data for OPLS-DA contrasts of feature-intensities between significantly different pairs of factor-levels.  For each factor-level, the tool performs a contrast with all other factor-levels combined and then separately with each other factor-level.  
 
@@ -342,7 +354,14 @@
 
 Although this tool can be used in a purely exploratory manner by supplying the variableMetadata file without the columns added by the W4M 'Univariate' tool, **the preferred workflow is to use univariate testing to exclude features that are not significantly different and use OPLS-DA to visualize the differences identified in univariate testing** (Thévenot *et al.*, 2015); an appropriate exception would be to visualize contrasts of a specific list of metabolites.
 
-It must be stressed that there may be no *single* definitive computational approach to select features that are reliable biomarkers, especially from a small number of samples or experiments.  A few possible choices are examining extreme values on S-PLOTs, examining "variable importance in projection VIP for OPLS-DA" (Galindo-Prieto *et al.* 2014), and examining a feature's "selectivity ratio" (Rajalahti *et al.*, 2009).  In this spirit, this tool reports the S-PLOT covariance and correlation (Wiklund *op. cit.*) and VIP metrics, and it introduces an informal "salience" metric to flag features that may merit attention without dimensional reduction; future versions may add selectivity ratio.  
+It must be stressed that there may be no *single* definitive computational approach to select features that are reliable biomarkers, especially from a small number of samples or experiments.  A few possible choices are:
+
+- picking features with maximum loadings along the projection parallel to the predictor (loadp),
+- examining extreme values on S-PLOTs (for which covariance is linearly related to loadp),
+- examining "variable importance in projection VIP for OPLS-DA" (Galindo-Prieto *et al.* 2014), and
+- examining a feature's "selectivity ratio" (Rajalahti *et al.*, 2009).
+
+In this spirit, this tool reports the S-PLOT covariance and correlation (Wiklund *op. cit.*) and VIP metrics, and it introduces an informal "salience" metric to flag features that may merit attention without dimensional reduction; future versions may add selectivity ratio.  
 
 For a more systematic approach to biomarker identification, please consider the W4M 'biosigner' tool (Rinaudo *et al.* 2016), which applies three different identification metrics to the selection process.
 
@@ -419,6 +438,14 @@
   | Indicator of **how levels are to be specified generically** (if at all) - wild cards, regular expressions, or none (no generic matching).
   |
 
+[IN] Label how many extreme features
+  | Specify the number of features at each of the loading-extremes that should be labelled (with the name of the feature) on the covariance-vs.-correlation plot; specify 'ALL' to label all features; this choice has no effect on the OPLS-DA loadings plot.
+  |
+
+[IN] Label features with extreme loado
+  | If the previous parameter has limited the number of features to be labelled at each of the loading-extremes, then the extreme values for both loado and loadp will be labelled when this parameter is set to 'yes'; otherwise (in the default case) only extreme values for loadp will be labelled.  The default was chosen to make the plot less cluttered.
+  |
+
 [OUT] Contrast-detail output PDF
   | Several plots for each two-projection OPLS-DA analysis:
 
@@ -435,7 +462,7 @@
 - **featureID** - feature-identifier
 - **factorLevel1** - factor-level 1
 - **factorLevel2** - factor-level 2 (or "other" when contrasting factor-level 1 with all other levels)
-- **correlation** - correlation of the features projection explaining the difference between the features, < 0 when intensity for level 1 is greater (from formula in Supplement to Wiklund, *op. cit.*)
+- **correlation** - correlation of the features projection explaining the difference between the features, < 0 when intensity for level 1 is greater (from formula in Supplement to Wiklund, *op. cit.*).  Note that, for a given contrast, there is a linear relationship between 'loadp' and 'correlation'.
 - **covariance** - covariance of the features projection explaining the difference between the features, < 0 when intensity for level 1 is greater (from formula in *ibid.*)
 - **vip4p** - "variable importance in projection" to the predictive projection, VIP\ :subscript:`4,p` (Galindo-Prieto *op. cit.*)
 - **vip4o** - "variable importance in projection" to the orthogonal projection, VIP\ :subscript:`4,o` (*ibid.*)
@@ -624,11 +651,16 @@
 Release notes
 -------------
 
+0.98.5
+
+- bug fix: fit feature-labels within clipping region of cor-vs.-cov plot
+- new feature: optionally (and by default) suppress labels for features with extreme orthogonal loadings
+
 0.98.3
 
 - add support for two-level factors
 - add adjusted mz and rt to output tables
-- allow explicitly setting the number of features with extreme loadings to be labeled on the correlation vs. covariance plot
+- allow explicitly setting the number of features with extreme loadings to be labelled on the correlation vs. covariance plot
 - add loadings to corcov table
 
 0.98.2
@@ -638,6 +670,7 @@
 
   ]]></help>
   <citations>
+    <citation type="doi">10.5281/zenodo.1034784</citation>
     <!-- Galindo_Prieto_2014 Variable influence on projection (VIP) for OPLS -->
     <citation type="doi">10.1002/cem.2627</citation>
     <!-- Giacomoni_2014 W4M 2.5 -->
--- a/w4mcorcov_calc.R	Sun Oct 22 18:47:57 2017 -0400
+++ b/w4mcorcov_calc.R	Sun Nov 12 19:45:36 2017 -0500
@@ -7,8 +7,8 @@
 #### OPLS-DA
 algoC <- "nipals"
 
-do_detail_plot <- function(x_dataMatrix, x_predictor, x_is_match, x_algorithm, x_prefix, x_show_labels, x_progress = print, x_env) {
-  off <- function(x) if (x_show_labels == "0") x else 0
+do_detail_plot <- function(x_dataMatrix, x_predictor, x_is_match, x_algorithm, x_prefix, x_show_labels, x_show_loado_labels, x_progress = print, x_env) {
+  off <- function(x) if (x_show_labels == "0") 0 else x
   if (x_is_match && ncol(x_dataMatrix) > 0 && length(unique(x_predictor))> 1) {
     my_oplsda <- opls(
         x      = x_dataMatrix
@@ -34,7 +34,7 @@
         lim_x <- max(sapply(X=c(min_x, max_x), FUN=abs))
         covariance <- covariance / lim_x
         lim_x <- 1.2
-        main_label <- sprintf("%s for levels %s versus %s", x_prefix, fctr_lvl_1, fctr_lvl_2)
+        main_label <- sprintf("%s for level %s versus %s", x_prefix, fctr_lvl_1, fctr_lvl_2)
         main_cex <- min(1.0, 46.0/nchar(main_label))
         # "It is generally accepted that a variable should be selected if vj>1, [27–29],
         #   but a proper threshold between 0.83 and 1.21 can yield more relevant variables according to [28]."
@@ -50,8 +50,8 @@
           y = plus_cor
         , x = plus_cov
         , type="p"
-        , xlim=c(-lim_x, lim_x + off(0.1))
-        , ylim=c(-1.0 - off(0.1), 1.0)
+        , xlim=c( -lim_x - off(0.2), lim_x + off(0.2) )
+        , ylim=c( -1.0   - off(0.2), 1.0   + off(0.2) )
         , xlab = sprintf("relative covariance(feature,t1)")
         , ylab = sprintf("correlation(feature,t1)")
         , main = main_label
@@ -62,8 +62,8 @@
         )
         low_x <- -0.7 * lim_x
         high_x <- 0.7 * lim_x
-        text(x = low_x, y = -0.05, labels =  fctr_lvl_1)
-        text(x = high_x, y = 0.05, labels =  fctr_lvl_2)
+        text(x = low_x, y = -0.05, labels =  fctr_lvl_1, col = "blue")
+        text(x = high_x, y = 0.05, labels =  fctr_lvl_2, col = "red")
         if ( x_show_labels != "0" ) {
           my_loadp <- loadp
           my_loado <- loado
@@ -77,17 +77,22 @@
           n_labels <- min( n_labels, (1 + length(loadp)) / 2 )
           labels_to_show <- c(
             names(head(sort(my_loadp),n = n_labels))
-          , names(head(sort(my_loado),n = n_labels))
           , names(tail(sort(my_loadp),n = n_labels))
-          , names(tail(sort(my_loado),n = n_labels))
           )
+          if ( x_show_loado_labels ) {
+            labels_to_show <- c(
+              labels_to_show
+            , names(head(sort(my_loado),n = n_labels))
+            , names(tail(sort(my_loado),n = n_labels))
+            )
+          }
           labels <- unname(sapply( X = tsv1$featureID, FUN = function(x) if( x %in% labels_to_show ) x else "" ))
           text(
             y = plus_cor - 0.013
           , x = plus_cov + 0.020
-          , cex = 0.3
+          , cex = 0.4
           , labels = labels
-          , col = rgb(blue = blue, red = red, green = 0, alpha = 0.2 + 0.8 * alpha)
+          , col = rgb(blue = 0, red = 0, green = 0, alpha = 0.5) # rgb(blue = blue, red = red, green = 0, alpha = 0.2 + 0.8 * alpha)
           , srt = -30 # slant 30 degrees downward
           , adj = 0   # left-justified
           )
@@ -164,6 +169,7 @@
   # matchingC is one of { "none", "wildcard", "regex" }
   matchingC <- calc_env$matchingC
   labelFeatures <- calc_env$labelFeatures
+  labelOrthoFeatures <- calc_env$labelOrthoFeatures
 
   # arg/env checking
   if (!(facC %in% names(smpl_metadata))) {
@@ -297,6 +303,7 @@
         , x_algorithm   = algoC
         , x_prefix      = if (pairSigFeatOnly) "Significantly contrasting features" else "Significant features"
         , x_show_labels = labelFeatures
+        , x_show_loado_labels = labelOrthoFeatures
         , x_progress    = progress_action
         , x_env         = calc_env
         )
@@ -352,6 +359,7 @@
           , x_algorithm   = algoC
           , x_prefix      = if (pairSigFeatOnly) "Significantly contrasting features" else "Significant features"
           , x_show_labels = labelFeatures
+          , x_show_loado_labels = labelOrthoFeatures
           , x_progress    = progress_action
           , x_env         = calc_env
           )
@@ -404,6 +412,7 @@
               , x_algorithm   = algoC
               , x_prefix      = "Features"
               , x_show_labels = labelFeatures
+              , x_show_loado_labels = labelOrthoFeatures
               , x_progress    = progress_action
               , x_env         = calc_env
               )
@@ -448,6 +457,7 @@
             , x_algorithm   = algoC
             , x_prefix      = "Features"
             , x_show_labels = labelFeatures
+            , x_show_loado_labels = labelOrthoFeatures
             , x_progress    = progress_action
             , x_env         = calc_env
             )
--- a/w4mcorcov_wrapper.R	Sun Oct 22 18:47:57 2017 -0400
+++ b/w4mcorcov_wrapper.R	Sun Nov 12 19:45:36 2017 -0500
@@ -67,12 +67,13 @@
 
 # other parameters
 
-my_env$tesC            <- as.character(argVc["tesC"])
-my_env$facC            <- as.character(argVc["facC"])
-my_env$pairSigFeatOnly <- as.logical(argVc["pairSigFeatOnly"])
-my_env$levCSV          <- as.character(argVc["levCSV"])
-my_env$matchingC       <- as.character(argVc["matchingC"])
-my_env$labelFeatures   <- as.character(argVc["labelFeatures"]) # number of features to label at each extreme of the loadings or 'ALL'
+my_env$tesC               <- as.character(argVc["tesC"])
+my_env$facC               <- as.character(argVc["facC"])
+my_env$pairSigFeatOnly    <- as.logical(argVc["pairSigFeatOnly"])
+my_env$levCSV             <- as.character(argVc["levCSV"])
+my_env$matchingC          <- as.character(argVc["matchingC"])
+my_env$labelFeatures      <- as.character(argVc["labelFeatures"]) # number of features to label at each extreme of the loadings or 'ALL'
+my_env$labelOrthoFeatures <- as.logical(argVc["labelOrthoFeatures"])
 
 label_features <- my_env$labelFeatures
 labelfeatures_check <- TRUE