Mercurial > repos > eschen42 > w4mcorcov

diff w4mcorcov_calc.R @ 13:2ae2d26e3270 draft
planemo upload for repository https://github.com/HegemanLab/w4mcorcov_galaxy_wrapper/tree/master commit e89c652c0849eb1d5a1e6c9100c72c64a8d388b4
author: eschen42
date: Wed, 12 Dec 2018 09:20:02 -0500
parents: ddaf84e15d06
children: 90708fdbc22d
--- a/w4mcorcov_calc.R	Thu Nov 08 23:06:09 2018 -0500
+++ b/w4mcorcov_calc.R	Wed Dec 12 09:20:02 2018 -0500
@@ -1,12 +1,4 @@
-# center with 'colMeans()' - ref: http://gastonsanchez.com/visually-enforced/how-to/2014/01/15/Center-data-in-R/
-center_colmeans <- function(x) {
-  xcenter = colMeans(x)
-  x - rep(xcenter, rep.int(nrow(x), ncol(x)))
-}
-
-#### OPLS-DA
-algoC <- "nipals"
-
+# compute and output detail plots
 do_detail_plot <- function(
   x_dataMatrix
 , x_predictor
@@ -21,7 +13,7 @@
   off <- function(x) if (x_show_labels == "0") 0 else x
   if ( x_is_match
       && ncol(x_dataMatrix) > 0
-      && length(unique(x_predictor))> 1
+      && length(unique(x_predictor)) > 1
       && x_crossval_i < nrow(x_dataMatrix)
   ) {
     my_oplsda <- opls(
@@ -36,20 +28,22 @@
       , scaleC = "pareto" # data centered and pareto scaled here only. This line fixes issue #2.
       )
     # strip out variables having negligible variance
-    x_dataMatrix <- x_dataMatrix[,names(my_oplsda@vipVn), drop = FALSE]
+    x_dataMatrix <- x_dataMatrix[ , names(my_oplsda@vipVn), drop = FALSE]
     my_oplsda_suppLs_y_levels <- levels(as.factor(my_oplsda@suppLs$y))
 
     fctr_lvl_1 <- my_oplsda_suppLs_y_levels[1]
     fctr_lvl_2 <- my_oplsda_suppLs_y_levels[2]
     do_s_plot <- function(
-        x_env
-      , predictor_projection_x   = TRUE
-      , cplot_x      = FALSE
-      , cor_vs_cov_x = NULL
-      )
-    {
+      x_env
+    , predictor_projection_x   = TRUE
+    , cplot_x      = FALSE
+    , cor_vs_cov_x = NULL
+    ) {
       if (cplot_x) {
         cplot_y_correlation <- (x_env$cplot_y == "correlation")
+        default_lim_x <- 10
+      } else {
+        default_lim_x <- 1.2
       }
       if (is.null(cor_vs_cov_x)) {
         my_cor_vs_cov <- cor_vs_cov(
@@ -57,14 +51,17 @@
           , ropls_x    = my_oplsda
           , predictor_projection_x = predictor_projection_x
           , x_progress
+          , x_env
           )
       } else {
         my_cor_vs_cov <- cor_vs_cov_x
       }
-      # str(my_cor_vs_cov)
+
       if (is.null(my_cor_vs_cov) || sum(!is.na(my_cor_vs_cov$tsv1$covariance)) < 2) {
         if (is.null(cor_vs_cov_x)) {
-          x_progress("No cor_vs_cov data produced")
+          x_progress(
+            sprintf("No cor_vs_cov data produced for level %s versus %s", fctr_lvl_1, fctr_lvl_2)
+          )
         }
         plot(x=1, y=1, xaxt="n", yaxt="n", xlab="", ylab="", type="n")
         text(x=1, y=1, labels="too few covariance data")
@@ -76,216 +73,240 @@
           min_x <- min(covariance, na.rm = TRUE)
           max_x <- max(covariance, na.rm = TRUE)
           lim_x <- max(sapply(X=c(min_x, max_x), FUN=abs))
-          covariance <- covariance / lim_x
-          lim_x <- 1.2
-          # "It is generally accepted that a variable should be selected if vj>1, [27–29],
+
+          # Regarding using VIP as a guide to selecting a biomarker:
+          #   "It is generally accepted that a variable should be selected if vj>1, [27–29],
           #   but a proper threshold between 0.83 and 1.21 can yield more relevant variables according to [28]."
           #   (Mehmood 2012 doi:10.1186/1748-7188-6-27)
           plus_cor <- correlation
           plus_cov <- covariance
-          cex <- 0.65
-          which_projection <- if (projection == 1) "t1" else "o1"
-          which_loading <- if (projection == 1) "parallel" else "orthogonal"
-          if (projection == 1) { # predictor-projection
-            vipcp <- pmax(0, pmin(1,(vip4p-0.83)/(1.21-0.83)))
-            if (!cplot_x) { # S-plot predictor-projection
-              my_xlab <- "relative covariance(feature,t1)"
-              my_x <- plus_cov
-              my_ylab <- "correlation(feature,t1)"
-              my_y <- plus_cor
-            } else { # C-plot predictor-projection
-              my_xlab <- "variable importance in predictor-projection"
-              my_x <- vip4p
-              if (cplot_y_correlation) {
+          if (length(plus_cor) != 0 && length(plus_cor) != 0) {
+            cex <- 0.65
+            if (projection == 1) {
+              # predictor-projection
+              vipcp <- pmax(0, pmin(1, (vip4p - 0.83) / (1.21 - 0.83)))
+              if (!cplot_x) {
+                # S-plot predictor-projection
+                my_xlab <- "covariance(feature,t1)"
+                my_x <- plus_cov
                 my_ylab <- "correlation(feature,t1)"
                 my_y <- plus_cor
+                # X,Y limits for S-PLOT
+                my_xlim <- c( -lim_x, lim_x ) * (1.0 + off(0.3))
+                my_ylim <- c( -1.0, 1.0 ) * (1.0 + off(0.2) )
               } else {
-                my_ylab <- "relative covariance(feature,t1)"
-                my_y <- plus_cov
+                # C-plot predictor-projection
+                my_xlab <- "variable importance in predictor-projection"
+                my_x <- vip4p
+                if (cplot_y_correlation) {
+                  my_ylab <- "correlation(feature,t1)"
+                  my_y <- plus_cor
+                  my_ylim <- c( -1.0, 1.0 ) * (1.0 + off(0.2) )
+                } else {
+                  my_ylab <- "covariance(feature,t1)"
+                  my_y <- plus_cov
+                  my_ylim <- max(abs(plus_cov))
+                  my_ylim <- c( -my_ylim, my_ylim ) * (1.0 + off(0.2) )
+                }
+                # X,Y limits for C-plot
+                lim_x <- max(my_x, na.rm = TRUE) * 1.1
+                lim_x <- min(lim_x, default_lim_x)
+                my_xlim <- c( 0, lim_x ) # + off(0.2) )
               }
-            }
-            if (cplot_x) {
-              lim_x <- max(my_x, na.rm = TRUE) * 1.1
-              my_xlim <- c( 0, lim_x + off(0.2) )
+              my_load_distal <- loadp
+              my_load_proximal <- loado
+              red  <- as.numeric(correlation > 0) * vipcp
+              blue <- as.numeric(correlation < 0) * vipcp
+              alpha <- 0.1 + 0.4 * vipcp
+              red[is.na(red)] <- 0
+              blue[is.na(blue)] <- 0
+              alpha[is.na(alpha)] <- 0
+              my_col <- rgb(blue = blue, red = red, green = 0, alpha = alpha)
+              main_label <- sprintf("%s for level %s versus %s"
+                                   , x_prefix, fctr_lvl_1, fctr_lvl_2)
             } else {
-              my_xlim <- c( -lim_x - off(0.2), lim_x + off(0.2) )
-            }
-            my_ylim <- c( -1.0   - off(0.2), 1.0   + off(0.2) )
-            my_load_distal <- loadp
-            my_load_proximal <- loado
-            red  <- as.numeric(correlation > 0) * vipcp
-            blue <- as.numeric(correlation < 0) * vipcp
-            alpha <- 0.1 + 0.4 * vipcp
-            red[is.na(red)] <- 0
-            blue[is.na(blue)] <- 0
-            alpha[is.na(alpha)] <- 0
-            my_col <- rgb(blue = blue, red = red, green = 0, alpha = alpha)
-            main_label <- sprintf("%s for level %s versus %s"
-                                 , x_prefix, fctr_lvl_1, fctr_lvl_2)
-          } else { # orthogonal projection
-            vipco <- pmax(0, pmin(1,(vip4o-0.83)/(1.21-0.83)))
-            if (!cplot_x) {
-              my_xlab <- "relative covariance(feature,to1)"
-              my_x <- -plus_cov
-            } else {
-              my_xlab <- "variable importance in orthogonal projection"
-              my_x <- vip4o
-            }
-            if (!cplot_x) { # S-plot orthogonal projection
-              my_xlim <- c( -lim_x - off(0.2), lim_x + off(0.2) )
-              my_ylab <- "correlation(feature,to1)"
-              my_y <- plus_cor
-            } else { # C-plot orthogonal projection
-              lim_x <- max(my_x, na.rm = TRUE) * 1.1
-              my_xlim <- c( 0, lim_x + off(0.2) )
-              if (cplot_y_correlation) {
+              # orthogonal projection
+              vipco <- pmax(0, pmin(1, (vip4o - 0.83) / (1.21 - 0.83)))
+              if (!cplot_x) {
+                # S-PLOT orthogonal-projection
+                my_xlab <- "covariance(feature,to1)"
+                my_x <- -plus_cov
+                # X,Y limits for S-PLOT
+                my_xlim <- c( -lim_x, lim_x ) * (1.0 + off(0.3))
                 my_ylab <- "correlation(feature,to1)"
                 my_y <- plus_cor
+                my_ylim <- c( -1.0, 1.0 ) * (1.0 + off(0.2) )
               } else {
-                my_ylab <- "relative covariance(feature,to1)"
-                my_y <- plus_cov
+                # C-plot orthogonal-projection
+                my_xlab <- "variable importance in orthogonal projection"
+                my_x <- vip4o
+                # C-plot orthogonal projection
+                # X,Y limits for C-plot
+                lim_x <- max(my_x, na.rm = TRUE) * 1.1
+                my_xlim <- c( 0, lim_x ) # + off(0.2) )
+                if (cplot_y_correlation) {
+                  my_ylab <- "correlation(feature,to1)"
+                  my_y <- plus_cor
+                  my_ylim <- c( -1.0, 1.0 ) * (1.0 + off(0.2) )
+                } else {
+                  my_ylab <- "covariance(feature,to1)"
+                  my_y <- plus_cov
+                  my_ylim <- max(abs(plus_cov))
+                  my_ylim <- c( -my_ylim, my_ylim ) * (1.0 + off(0.2) )
+                }
               }
-            }
-            my_ylim <- c( -1.0   - off(0.2), 1.0   + off(0.2) )
-            my_load_distal <- loado
-            my_load_proximal <- loadp
-            alpha <- 0.1 + 0.4 * vipco
-            alpha[is.na(alpha)] <- 0
-            my_col <- rgb(blue = 0, red = 0, green = 0, alpha = alpha)
-            main_label <- sprintf(
-              "Features influencing orthogonal projection for %s versus %s"
-            , fctr_lvl_1, fctr_lvl_2)
-          }
-          main_cex <- min(1.0, 46.0/nchar(main_label))
-          my_feature_label_slant <- -30 # slant feature labels 30 degrees downward
-          my_pch <- sapply(X = cor_p_value, function(x) if (x < 0.01) 16 else if (x < 0.05) 17 else 18)
-          plot(
-            y = my_y
-          , x = my_x
-          , type = "p"
-          , xlim = my_xlim
-          , ylim = my_ylim
-          , xlab = my_xlab
-          , ylab = my_ylab
-          , main = main_label
-          , cex.main = main_cex
-          , cex = cex
-          , pch = my_pch
-          , col = my_col
-          )
-          low_x <- -0.7 * lim_x
-          high_x <- 0.7 * lim_x
-          if (projection == 1 && !cplot_x) {
-            text(x = low_x, y = -0.05, labels =  fctr_lvl_1, col = "blue")
-            text(x = high_x, y = 0.05, labels =  fctr_lvl_2, col = "red")
-          }
-          if ( x_show_labels != "0" ) {
-            names(my_load_distal) <- tsv1$featureID
-            names(my_load_proximal) <- tsv1$featureID
-            if ( x_show_labels == "ALL" ) {
-              n_labels <- length(my_load_distal)
-            } else {
-              n_labels <- as.numeric(x_show_labels)
+              my_load_distal <- loado
+              my_load_proximal <- loadp
+              alpha <- 0.1 + 0.4 * vipco
+              alpha[is.na(alpha)] <- 0
+              my_col <- rgb(blue = 0, red = 0, green = 0, alpha = alpha)
+              main_label <- sprintf(
+                "Features influencing orthogonal projection for %s versus %s"
+              , fctr_lvl_1, fctr_lvl_2)
             }
-            n_labels <- min( n_labels, (1 + length(my_load_distal)) / 2 )
-            labels_to_show <- c(
-              names(head(sort(my_load_distal),n = n_labels))
-            , names(tail(sort(my_load_distal),n = n_labels))
-            )
-            labels <- unname(
-              sapply(
-                X = tsv1$featureID
-              , FUN = function(x) if( x %in% labels_to_show ) x else ""
+            main_cex <- min(1.0, 46.0/nchar(main_label))
+            my_feature_label_slant <- -30 # slant feature labels 30 degrees downward
+            my_pch <- sapply(X = cor_p_value, function(x) if (x < 0.01) 16 else if (x < 0.05) 17 else 18)
+            if ( sum(is.infinite(my_xlim)) == 0 ) {
+              plot(
+                y = my_y
+              , x = my_x
+              , type = "p"
+              , xlim = my_xlim
+              , ylim = my_ylim
+              , xlab = my_xlab
+              , ylab = my_ylab
+              , main = main_label
+              , cex.main = main_cex
+              , cex = cex
+              , pch = my_pch
+              , col = my_col
               )
-            )
-            x_text_offset <- 0.024
-            y_text_off <- 0.017
-            if (!cplot_x) { # S-plot
-              y_text_offset <- if (projection == 1) -y_text_off else y_text_off
-            } else { # C-plot
-              y_text_offset <-
-                sapply(
-                  X = (my_y > 0)
-                , FUN = function(x) { if (x) y_text_off else -y_text_off }
+              low_x <- -0.7 * lim_x
+              high_x <- 0.7 * lim_x
+              if (projection == 1 && !cplot_x) {
+                text(x = low_x, y = -0.05, labels =  fctr_lvl_1, col = "blue")
+                text(x = high_x, y = 0.05, labels =  fctr_lvl_2, col = "red")
+              }
+              if ( x_show_labels != "0" ) {
+                names(my_load_distal) <- tsv1$featureID
+                names(my_load_proximal) <- tsv1$featureID
+                if ( x_show_labels == "ALL" ) {
+                  n_labels <- length(my_load_distal)
+                } else {
+                  n_labels <- as.numeric(x_show_labels)
+                }
+                n_labels <- min( n_labels, (1 + length(my_load_distal)) / 2 )
+                labels_to_show <- c(
+                  names(head(sort(my_load_distal), n = n_labels))
+                , names(tail(sort(my_load_distal), n = n_labels))
+                )
+                labels <- unname(
+                  sapply(
+                    X = tsv1$featureID
+                  , FUN = function(x) if ( x %in% labels_to_show ) x else ""
+                  )
                 )
-            }
-            label_features <- function(x_arg, y_arg, labels_arg, slant_arg) {
-              if (length(labels_arg) > 0) {
-                unique_slant <- unique(slant_arg)
-                if (length(unique_slant) == 1) {
-                  text(
-                    y = y_arg
-                  , x = x_arg + x_text_offset
-                  , cex = 0.4
-                  , labels = labels_arg
-                  , col = rgb(blue = 0, red = 0, green = 0, alpha = 0.5) # grey semi-transparent labels
-                  , srt = slant_arg
-                  , adj = 0   # left-justified
+                x_text_offset <- 0.024
+                y_text_off <- 0.017
+                if (!cplot_x) {
+                  # S-plot
+                  y_text_offset <- if (projection == 1) -y_text_off else y_text_off
+                } else {
+                  # C-plot
+                  y_text_offset <-
+                    sapply(
+                      X = (my_y > 0)
+                    , FUN = function(x) {
+                        if (x) y_text_off else -y_text_off
+                      }
+                    )
+                }
+                label_features <- function(x_arg, y_arg, labels_arg, slant_arg) {
+                  if (length(labels_arg) > 0) {
+                    unique_slant <- unique(slant_arg)
+                    if (length(unique_slant) == 1) {
+                      text(
+                        y = y_arg
+                      , x = x_arg + x_text_offset
+                      , cex = 0.4
+                      , labels = labels_arg
+                      , col = rgb(blue = 0, red = 0, green = 0, alpha = 0.5) # grey semi-transparent labels
+                      , srt = slant_arg
+                      , adj = 0   # left-justified
+                      )
+                    } else {
+                      for (slant in unique_slant) {
+                        text(
+                          y = y_arg[slant_arg == slant]
+                        , x = x_arg[slant_arg == slant] + x_text_offset
+                        , cex = 0.4
+                        , labels = labels_arg[slant_arg == slant]
+                        , col = rgb(blue = 0, red = 0, green = 0, alpha = 0.5) # grey semi-transparent labels
+                        , srt = slant
+                        , adj = 0   # left-justified
+                        )
+                      }
+                    }
+                  }
+                }
+                if (!cplot_x) {
+                  my_slant <- (if (projection == 1) 1 else -1) * my_feature_label_slant
+                } else {
+                  my_slant <- sapply(
+                                X = (my_y > 0)
+                              , FUN = function(x) if (x) 2 else -2
+                              ) * my_feature_label_slant
+                }
+                if (length(my_x) > 1) {
+                  label_features(
+                    x_arg      = my_x  [my_x > 0]
+                  , y_arg      = my_y  [my_x > 0] - y_text_offset
+                  , labels_arg = labels[my_x > 0]
+                  , slant_arg = (if (!cplot_x) -my_slant else (my_slant))
                   )
-                } else {
-                  for (slant in unique_slant) {
-                    text(
-                      y = y_arg[slant_arg == slant]
-                    , x = x_arg[slant_arg == slant] + x_text_offset
-                    , cex = 0.4
-                    , labels = labels_arg[slant_arg == slant]
-                    , col = rgb(blue = 0, red = 0, green = 0, alpha = 0.5) # grey semi-transparent labels
-                    , srt = slant
-                    , adj = 0   # left-justified
+                  if (!cplot_x) {
+                    label_features(
+                      x_arg      = my_x  [my_x < 0]
+                    , y_arg      = my_y  [my_x < 0] + y_text_offset
+                    , labels_arg = labels[my_x < 0]
+                    , slant_arg = my_slant
                     )
                   }
-                }
-              }
-            }
-            if (!cplot_x) {
-              my_slant <- (if (projection == 1) 1 else -1) * my_feature_label_slant
+                } else {
+                  if (!cplot_x) {
+                    my_slant <- (if (my_x > 1) -1 else 1) * my_slant
+                    my_y_arg <- my_y + (if (my_x > 1) -1 else 1) * y_text_offset
+                  } else {
+                    my_slant <- (if (my_y > 1) -1 else 1) * my_slant
+                    my_y_arg <- my_y + (if (my_y > 1) -1 else 1) * y_text_offset
+                  }
+                  label_features(
+                    x_arg = my_x
+                  , y_arg = my_y_arg
+                  , labels_arg = labels
+                  , slant_arg = my_slant
+                  )
+                } # end if (length(my_x) > 1)
+              } # end if ( x_show_labels != "0" )
             } else {
-              my_slant <- sapply(
-                            X = (my_y > 0)
-                          , FUN = function(x) if (x) 2 else -2
-                          ) * my_feature_label_slant
-            }
-            if (length(my_x) > 1) {
-              label_features(
-                x_arg      = my_x  [my_x > 0]
-              , y_arg      = my_y  [my_x > 0] - y_text_offset
-              , labels_arg = labels[my_x > 0]
-              , slant_arg = (if (!cplot_x) -my_slant else (my_slant))
-              )
-              if (!cplot_x) {
-                label_features(
-                  x_arg      = my_x  [my_x < 0]
-                , y_arg      = my_y  [my_x < 0] + y_text_offset
-                , labels_arg = labels[my_x < 0]
-                , slant_arg = my_slant
-                )
-              }
-            } else {
-              if (!cplot_x) {
-                my_slant <- (if (my_x > 1) -1 else 1) * my_slant
-                my_y_arg = my_y + (if (my_x > 1) -1 else 1) * y_text_offset
-              } else {
-                my_slant <- (if (my_y > 1) -1 else 1) * my_slant
-                my_y_arg = my_y + (if (my_y > 1) -1 else 1) * y_text_offset
-              }
-              label_features(
-                x_arg = my_x
-              , y_arg = my_y_arg
-              , labels_arg = labels
-              , slant_arg = my_slant
-              )
-            }
-          }
-        }
-      )
+              plot(x=1, y=1, xaxt="n", yaxt="n", xlab="", ylab="", type="n")
+              text(x=1, y=1, labels="no S-plot is possible")
+            } # end if (sum(is.infinte(my_xlim))==0)
+          } else {
+            plot(x=1, y=1, xaxt="n", yaxt="n", xlab="", ylab="", type="n")
+            text(x=1, y=1, labels="no S-plot is possible")
+          } # end if (length(plus_cor) != 0 && length(plus_cor) != 0)
+        } # end action
+      ) # end with( my_cor_vs_cov, action )
       return (my_cor_vs_cov)
-    }
+    } # end function do_s_plot
     my_cor_vs_cov <- do_s_plot(
       x_env = x_env
     , predictor_projection_x = TRUE
     , cplot_x = FALSE
     )
-    typeVc <- c("correlation",      # 1
+    typevc <- c("correlation",      # 1
                 "outlier",          # 2
                 "overview",         # 3
                 "permutation",      # 4
@@ -299,36 +320,37 @@
                 "xy-weight"         # 12
                )                    # [c(3,8,9)] # [c(4,3,8,9)]
     if ( length(my_oplsda@orthoVipVn) > 0 ) {
-      my_typevc <- typeVc[c(9,3,8)]
+      my_typevc <- typevc[c(9,3,8)]
     } else {
       my_typevc <- c("(dummy)","overview","(dummy)")
     }
     my_ortho_cor_vs_cov_exists <- FALSE
     for (my_type in my_typevc) {
-      if (my_type %in% typeVc) {
+      if (my_type %in% typevc) {
         tryCatch({
-          if ( my_type != "x-loading" ) {
-             plot(
-               x            = my_oplsda
-             , typeVc       = my_type
-             , parCexN      = 0.4
-             , parDevNewL   = FALSE
-             , parLayL      = TRUE
-             , parEllipsesL = TRUE
-             )
-             if (my_type == "overview") {
-               sub_label <- sprintf("%s versus %s", fctr_lvl_1, fctr_lvl_2)
-               title(sub = sub_label)
-             }
-          } else {
-            my_ortho_cor_vs_cov <- do_s_plot(
-              x_env = x_env
-            , predictor_projection_x = FALSE
-            , cplot_x = FALSE
-            )
-            my_ortho_cor_vs_cov_exists <- TRUE
+            if ( my_type != "x-loading" ) {
+               plot(
+                 x            = my_oplsda
+               , typeVc       = my_type
+               , parCexN      = 0.4
+               , parDevNewL   = FALSE
+               , parLayL      = TRUE
+               , parEllipsesL = TRUE
+               )
+               if (my_type == "overview") {
+                 sub_label <- sprintf("%s versus %s", fctr_lvl_1, fctr_lvl_2)
+                 title(sub = sub_label)
+               }
+            } else {
+              my_ortho_cor_vs_cov <- do_s_plot(
+                x_env = x_env
+              , predictor_projection_x = FALSE
+              , cplot_x = FALSE
+              )
+              my_ortho_cor_vs_cov_exists <- TRUE
+            }
           }
-        }, error = function(e) {
+        , error = function(e) {
           x_progress(
             sprintf(
               "factor level %s or %s may have only one sample - %s"
@@ -347,12 +369,17 @@
     cplot_o <- x_env$cplot_o
     if (cplot_p || cplot_o) {
       if (cplot_p) {
-        do_s_plot(
-          x_env = x_env
-        , predictor_projection_x = TRUE
-        , cplot_x = TRUE
-        , cor_vs_cov_x = my_cor_vs_cov
-        )
+        if (!is.null(my_cor_vs_cov)) {
+          do_s_plot(
+            x_env = x_env
+          , predictor_projection_x = TRUE
+          , cplot_x = TRUE
+          , cor_vs_cov_x = my_cor_vs_cov
+          )
+        } else {
+          plot(x=1, y=1, xaxt="n", yaxt="n", xlab="", ylab="", type="n")
+          text(x=1, y=1, labels="no predictor projection is possible")
+        }
         did_plots <- 1
       } else {
         did_plots <- 0
@@ -405,10 +432,10 @@
 
   # extract parameters from the environment
   vrbl_metadata <- calc_env$vrbl_metadata
-  vrbl_metadata_names <- vrbl_metadata[,1]
+  vrbl_metadata_names <- vrbl_metadata[, 1]
   smpl_metadata <- calc_env$smpl_metadata
   data_matrix <- calc_env$data_matrix
-  pairSigFeatOnly <- calc_env$pairSigFeatOnly
+  pair_significant_features_only <- calc_env$pairSigFeatOnly
   facC <- calc_env$facC
   tesC <- calc_env$tesC
   # extract the levels from the environment
@@ -433,22 +460,27 @@
   names(rt)      <- vrbl_metadata$variableMetadata
   rt_lookup      <- function(feature) unname(rt[feature])
 
-  # calculate salience_df as data.frame(feature, max_level, max_median, max_rcv, mean_median, salience, salient_rcv)
+  # calculate salience_df as data.frame(feature, max_level, max_median, salience_rcv, mean_median, salience, salience_rcv)
   salience_df <- calc_env$salience_df <- w4msalience(
     data_matrix    = data_matrix
   , sample_class   = smpl_metadata[,facC]
   , failure_action = failure_action
   )
   salience_tsv_action({
-    my_df <- data.frame(
-      featureID    = salience_df$feature
-    , salientLevel = salience_df$max_level
-    , salientRCV   = salience_df$salient_rcv
-    , salience     = salience_df$salience
-    , mz           = mz_lookup(salience_df$feature)
-    , rt           = rt_lookup(salience_df$feature)
-    )
-    my_df[order(-my_df$salience),]
+    with (
+      salience_df
+    , {
+      my_df <<- data.frame(
+        featureID       = feature
+      , salientLevel    = max_level
+      , salienceRCV     = salience_rcv
+      , relativeSalientDistance = relative_salient_distance
+      , salience        = salience
+      , mz              = mz_lookup(feature)
+      , rt              = rt_lookup(feature)
+      )
+    })
+    my_df[order(-my_df$relativeSalientDistance),]
   })
 
   # transform wildcards to regexen
@@ -560,8 +592,8 @@
           x_dataMatrix  = my_matrix
         , x_predictor   = predictor
         , x_is_match    = TRUE
-        , x_algorithm   = algoC
-        , x_prefix      = if (pairSigFeatOnly) {
+        , x_algorithm   = "nipals"
+        , x_prefix      = if (pair_significant_features_only) {
                             "Significantly contrasting features"
                           } else {
                             "Significant features"
@@ -627,15 +659,15 @@
                 }
               )
             col_selector <- vrbl_metadata_names[
-              if ( pairSigFeatOnly ) fully_significant else overall_significant
+              if (pair_significant_features_only) fully_significant else overall_significant
             ]
             my_matrix <- tdm[ chosen_samples, col_selector, drop = FALSE ]
             my_cor_cov <- do_detail_plot(
               x_dataMatrix  = my_matrix
             , x_predictor   = predictor
             , x_is_match    = is_match
-            , x_algorithm   = algoC
-            , x_prefix      = if (pairSigFeatOnly) {
+            , x_algorithm   = "nipals"
+            , x_prefix      = if (pair_significant_features_only) {
                                 "Significantly contrasting features"
                               } else {
                                 "Significant features"
@@ -668,7 +700,8 @@
         }
       }
     }
-  } else { # tesC == "none"
+  } else {
+    # tesC == "none"
     # find all the levels for factor facC in sampleMetadata
     level_union <- unique(sort(smpl_metadata_facC))
     # identify the selected levels for factor facC from sampleMetadata
@@ -686,7 +719,6 @@
             fctr_lvl_2 <- {
               if ( fctr_lvl_1 %in% completed )
                 return("DUMMY")
-              # strF(completed)
               completed <<- c(completed, fctr_lvl_1)
               setdiff(level_union, fctr_lvl_1)
             }
@@ -717,7 +749,7 @@
                   x_dataMatrix  = my_matrix
                 , x_predictor   = predictor
                 , x_is_match    = is_match
-                , x_algorithm   = algoC
+                , x_algorithm   = "nipals"
                 , x_prefix      = "Features"
                 , x_show_labels = labelFeatures
                 , x_progress    = progress_action
@@ -770,7 +802,7 @@
                 x_dataMatrix  = my_matrix
               , x_predictor   = predictor
               , x_is_match    = is_match
-              , x_algorithm   = algoC
+              , x_algorithm   = "nipals"
               , x_prefix      = "Features"
               , x_show_labels = labelFeatures
               , x_progress    = progress_action
@@ -804,7 +836,7 @@
   if (!did_plot) {
     failure_action(
       sprintf(
-        "bad parameter!  sampleMetadata must have at least two levels of factor '%s' matching '%s'"
+        "Did not plot. Does sampleMetadata have at least two levels of factor '%s' matching '%s' ?"
       , facC, originalLevCSV))
     return ( FALSE )
   }
@@ -821,12 +853,14 @@
 , ropls_x
 , predictor_projection_x = TRUE
 , x_progress = print
+, x_env
 ) {
   tryCatch({
-    return(
-      cor_vs_cov_try( matrix_x, ropls_x, predictor_projection_x, x_progress)
-    )
-  }, error = function(e) {
+      return(
+        cor_vs_cov_try( matrix_x, ropls_x, predictor_projection_x, x_progress, x_env)
+      )
+    }
+  , error = function(e) {
     x_progress(
       sprintf(
         "cor_vs_cov fatal error - %s"
@@ -842,7 +876,12 @@
 , ropls_x                       # an instance of ropls::opls
 , predictor_projection_x = TRUE # TRUE for predictor projection; FALSE for orthogonal projection
 , x_progress = print            # function to produce progress and error messages
+, x_env
 ) {
+  my_matrix_x <- matrix_x
+  my_matrix_x[my_matrix_x==0] <- NA
+  fdr_features <- x_env$fdr_features
+
   x_class <- class(ropls_x)
   if ( !( as.character(x_class) == "opls" ) ) {
     stop(
@@ -866,12 +905,12 @@
 
   # I used equations (1) and (2) from Wiklund 2008, doi:10.1021/ac0713510
   #   (and not from the supplement despite the statement that, for the NIPALS algorithm,
-  #   the equations from the supplement should be used) because of the definition of the 
+  #   the equations from the supplement should be used) because of the definition of the
   #   Pearson/Galton coefficient of correlation is defined as
   #   $$
   #      \rho_{X,Y}= \frac{\operatorname{cov}(X,Y)}{\sigma_X \sigma_Y}
   #   $$
-  #   as described (among other places) on Wikipedia at 
+  #   as described (among other places) on Wikipedia at
   #     https://en.wikipedia.org/wiki/Pearson_correlation_coefficient#For_a_population
   # The equations in the supplement said to use, for the predictive component t1,
   #      \rho_{t1,X_i}= \frac{\operatorname{cov}(t1,X_i)}{(\operatorname{mag}(t1))(\operatorname{mag}(X_i))}
@@ -879,112 +918,116 @@
   # perhaps my data are not centered exactly the same way that theirs were.
   # The correlations calculated here are in agreement with those calculated with the code from
   #   page 22 of https://cran.r-project.org/web/packages/muma/muma.pdf
-  # I did transform covariance to "relative covariance" (relative to the maximum value)
-  #   to keep the figures consistent with one another.
+
 
-  # count the features (one column for each sample)
-  Nfeatures <- ncol(matrix_x)
-  # count the samples (one row for each sample)
-  Nobservations <- nrow(matrix_x)
-  # a one-dimensional magnitude function (i.e., take the vector norm)
-  vector_norm <- function(one_dimensional) sqrt(sum(one_dimensional * one_dimensional))
-  # calculate the standard deviation for each feature 
-  sd_xi <- sapply(X = 1:Nfeatures, FUN = function(x) sd(matrix_x[,x]))
+  # count the features/variables (one column for each sample)
+  # count the features/variables (one column for each sample)
+  n_features <- ncol(my_matrix_x)
+  all_n_features <- x_env$fdr_features
+  if (length(grep("^[0-9][0-9]*$", all_n_features)) > 0) {
+    all_n_features <- as.integer(all_n_features)
+  } else {
+    all_n_features <- n_features
+  }
+  fdr_n_features <- max(n_features, all_n_features)
+  # print("n_features")
+  # print(n_features)
+
+  # count the samples/observations (one row for each sample)
+  n_observations <- nrow(my_matrix_x)
+
   # choose whether to plot the predictive score vector or orthogonal score vector
   if (predictor_projection_x)
-     score_matrix <- ropls_x@scoreMN
+     score_vector <- ropls_x@scoreMN
   else
-     score_matrix <- ropls_x@orthoScoreMN
-  # transpose the score (or orthoscore) vector for use as a premultiplier in covariance calculation
-  score_matrix_transposed <- t(score_matrix)
-  # compute the norm of the vector (i.e., the magnitude)
-  score_matrix_magnitude  <- vector_norm(score_matrix)
-  # compute the standard deviation of the vector
-  score_matrix_sd         <- sd(score_matrix)
-  # compute the relative covariance of each feature with the score vector
+     score_vector <- ropls_x@orthoScoreMN
+
+  # compute the covariance of each feature with the score vector
   result$covariance <-
-    score_matrix_transposed %*% matrix_x / ( score_matrix_magnitude * score_matrix_magnitude )
+    sapply(
+      X = 1:n_features
+    , FUN = function(i) {
+        cov(score_vector, my_matrix_x[ , i, drop = TRUE], use = "pairwise.complete.obs")
+      }
+    )
+  # access covariance by feature name
+  names(result$covariance) <- colnames(my_matrix_x)
+
   # compute the correlation of each feature with the score vector
   result$correlation <-
-    score_matrix_transposed %*% matrix_x / ( (Nobservations - 1) * ( score_matrix_sd * sd_xi ) )
-
-  # convert covariance and correlation from one-dimensional matrices to arrays of values, 
-  #   which are accessed by feature name below
-  p1     <- result$covariance  <- result$covariance [ 1, , drop = TRUE ]
-  # x_progress("strF(p1)")
-  # x_progress(strF(p1))
+    sapply(
+      X = 1:n_features
+    , FUN = function(i) {
+        cor(score_vector, my_matrix_x[ , i, drop = TRUE], use = "pairwise.complete.obs")
+      }
+    )
+  # access correlation by feature name
+  names(result$correlation) <- colnames(my_matrix_x)
 
-  pcorr1 <- result$correlation <- result$correlation[ 1, , drop = TRUE ]
-  # x_progress("pearson strF(pcorr1)")
-  # x_progress(strF(pcorr1))
-  # x_progress(typeof(pcorr1))
-  # x_progress(str(pcorr1))
-  
-  # # this is how to use Spearman correlation instead of pearson
-  # result$spearcor <- sapply(
-  #   X = 1:Nfeatures
-  # , FUN = function(i) {
-  #     stats::cor(
-  #       x = as.vector(score_matrix)
-  #     , y = as.vector(matrix_x[,i])
-  #     # , method = "spearman"
-  #     , method = "pearson"
-  #     )
-  #   }
-  # )
-  # names(result$spearcor) <- names(p1)
-  # pcorr1 <- result$spearcor
-  # x_progress("spearman strF(pcorr1)")
-  # x_progress(strF(pcorr1))
-  # x_progress(typeof(pcorr1))
-  # x_progress(str(pcorr1))
-  # pcorr1 <- result$correlation <- result$spearcor
+  # eliminate NAs in either correlation or covariance
+  nas <- is.na(result$correlation) | is.na(result$covariance)
+  featureID          <- names(ropls_x@vipVn)
+  featureID          <- featureID[!nas]
+  result$correlation <- result$correlation[!nas]
+  result$covariance  <- result$covariance[!nas]
+  n_features <- length(featureID)
 
-  # correl.ci(r, n, a = 0.05, rho = 0)
   correl_pci <- lapply(
-    X = 1:Nfeatures
-  , FUN = function(i) correl.ci(r = pcorr1[i], n = Nobservations)
+    X = 1:n_features
+  , FUN = function(i) {
+      correl.ci(
+        r = result$correlation[i]
+      , n_obs = n_observations
+      , n_vars = fdr_n_features
+      )
+    }
   )
   result$p_value_raw <- sapply(
-    X = 1:Nfeatures
+    X = 1:n_features
+  , FUN = function(i) correl_pci[[i]]$p.value.raw
+  )
+  result$p_value_raw[is.na(result$p_value_raw)] <- 1.0
+  result$p_value <- sapply(
+    X = 1:n_features
   , FUN = function(i) correl_pci[[i]]$p.value
   )
-  result$p_value_raw[is.na(result$p_value_raw)] <- 0.0
+  result$p_value[is.na(result$p_value)] <- 1.0
   result$ci_lower <- sapply(
-    X = 1:Nfeatures
-  , FUN = function(i) correl_pci[[i]]$CI['lower']
+    X = 1:n_features
+  , FUN = function(i) correl_pci[[i]]$CI["lower"]
   )
   result$ci_upper <- sapply(
-    X = 1:Nfeatures
-  , FUN = function(i) correl_pci[[i]]$CI['upper']
+    X = 1:n_features
+  , FUN = function(i) correl_pci[[i]]$CI["upper"]
   )
 
 
   # extract "variant 4 of Variable Influence on Projection for OPLS" (see Galindo_Prieto_2014, DOI 10.1002/cem.2627)
   #    Length = number of features; labels = feature identifiers.  (The same is true for $correlation and $covariance.)
-  result$vip4p     <- as.numeric(ropls_x@vipVn)
-  result$vip4o     <- as.numeric(ropls_x@orthoVipVn)
+  result$vip4p     <- as.numeric(ropls_x@vipVn)[!nas]
+  result$vip4o     <- as.numeric(ropls_x@orthoVipVn)[!nas]
   if (length(result$vip4o) == 0) result$vip4o <- NA
   # extract the loadings
-  result$loadp     <- as.numeric(ropls_x@loadingMN)
-  result$loado     <- as.numeric(ropls_x@orthoLoadingMN)
+  result$loadp     <- as.numeric(ropls_x@loadingMN)[!nas]
+  result$loado     <- as.numeric(ropls_x@orthoLoadingMN)[!nas]
   # get the level names
   level_names      <- sort(levels(as.factor(ropls_x@suppLs$y)))
   fctr_lvl_1       <- level_names[1]
   fctr_lvl_2       <- level_names[2]
-  feature_count    <- length(ropls_x@vipVn)
-  result$level1    <- rep.int(x = fctr_lvl_1, times = feature_count)
-  result$level2    <- rep.int(x = fctr_lvl_2, times = feature_count)
+  result$level1    <- rep.int(x = fctr_lvl_1, times = n_features)
+  result$level2    <- rep.int(x = fctr_lvl_2, times = n_features)
   greaterLevel <- sapply(
     X = result$correlation
-  , FUN = function(my_corr)
+  , FUN = function(my_corr) {
       tryCatch({
-          if ( is.nan( my_corr ) ) {
-            NA 
+          if ( is.na(my_corr) || ! is.numeric( my_corr ) ) {
+            NA
           } else {
             if ( my_corr < 0 ) fctr_lvl_1 else fctr_lvl_2
           }
-        }, error = function(e) {
+        }
+      , error = function(e) {
+          print(my_corr)
           x_progress(
             sprintf(
               "cor_vs_cov -> sapply:  error - substituting NA - %s"
@@ -992,16 +1035,11 @@
             )
           )
           NA
-        })
+        }
+      )
+    }
   )
 
-  # begin fixes for https://github.com/HegemanLab/w4mcorcov_galaxy_wrapper/issues/1
-  featureID          <- names(ropls_x@vipVn)
-  greaterLevel       <- greaterLevel[featureID]
-  result$correlation <- result$correlation[featureID]
-  result$covariance  <- result$covariance[featureID]
-  # end fixes for https://github.com/HegemanLab/w4mcorcov_galaxy_wrapper/issues/1
-
   # build a data frame to hold the content for the tab-separated values file
   tsv1 <- data.frame(
     featureID     = featureID
@@ -1016,8 +1054,8 @@
   , loadp         = result$loadp
   , loado         = result$loado
   , cor_p_val_raw = result$p_value_raw
-  , cor_p_value   = p.adjust(p = result$p_value_raw, method = "BY")
-  , cor_ci_lower  = result$ci_lower 
+  , cor_p_value   = result$p_value
+  , cor_ci_lower  = result$ci_lower
   , cor_ci_upper  = result$ci_upper
   )
   rownames(tsv1) <- tsv1$featureID
@@ -1039,7 +1077,7 @@
   tsv1 <- tsv1[!is.na(tsv1$covariance),]
   superresult$tsv1 <- tsv1
 
-  # # I did not include these but left them commentd out in case future 
+  # # I did not include these but left them commentd out in case future
   # #   consumers of this routine want to use it in currently unanticipated ways
   # result$superresult <- superresult
   # result$oplsda    <- ropls_x
@@ -1059,22 +1097,28 @@
 # which follows
 #   https://en.wikipedia.org/wiki/Fisher_transformation#Definition
 
-correl.ci <- function(r, n, a = 0.05, rho = 0) {
-  ## r is the calculated correlation coefficient for n pairs
+correl.ci <- function(r, n_obs, n_vars, a = 0.05, rho = 0) {
+  ## r is the calculated correlation coefficient for n_obs pairs of observations of one variable
   ## a is the significance level
   ## rho is the hypothesised correlation
   zh0 <- atanh(rho) # 0.5*log((1+rho)/(1-rho)), i.e., Fisher's z-transformation for Ho
   zh1 <- atanh(r)   # 0.5*log((1+r)/(1-r)), i.e., Fisher's z-transformation for H1
-  se <- (1 - r^2)/sqrt(n - 3) ## standard error for Fisher's z-transformation of Ho
+  se <- (1 - r^2)/sqrt(n_obs - 3) ## standard error for Fisher's z-transformation of Ho
   test <- (zh1 - zh0)/se ### test statistic
   pvalue <- 2*(1 - pnorm(abs(test))) ## p-value
-  zL <- zh1 - qnorm(1 - a/2)*se
-  zH <- zh1 + qnorm(1 - a/2)*se
-  fishL <- tanh(zL) # (exp(2*zL)-1)/(exp(2*zL)+1), i.e., lower confidence limit
-  fishH <- tanh(zH) # (exp(2*zH)-1)/(exp(2*zH)+1), i.e., upper confidence limit
-  CI <- c(fishL, fishH)
-  names(CI) <- c('lower', 'upper')
-  list(correlation = r, p.value = pvalue, CI = CI)
+  pvalue.adj <- p.adjust(p = pvalue, method = "BY", n = n_vars)
+  z_L <- zh1 - qnorm(1 - a/2)*se
+  z_h <- zh1 + qnorm(1 - a/2)*se
+  fish_l <- tanh(z_L) # (exp(2*z_l)-1)/(exp(2*z_l)+1), i.e., lower confidence limit
+  fish_h <- tanh(z_h) # (exp(2*z_h)-1)/(exp(2*z_h)+1), i.e., upper confidence limit
+  ci <- c(fish_l, fish_h)
+  names(ci) <- c("lower", "upper")
+  list(
+    correlation = r
+  , p.value.raw = pvalue
+  , p.value = pvalue.adj
+  , CI = ci
+  )
 }
 
-# vim: sw=2 ts=2 et :
+# vim: sw=2 ts=2 et ai :
author	eschen42
date	Wed, 12 Dec 2018 09:20:02 -0500
parents	ddaf84e15d06
children	90708fdbc22d