w4mcorcov: w4mcorcov_calc.R comparison

comparison w4mcorcov_calc.R @ 12:ddaf84e15d06 draft

planemo upload for repository https://github.com/HegemanLab/w4mcorcov_galaxy_wrapper/tree/master commit 6775c83b89d9d903c81a2229cdc200fc93538dfe-dirty

author	eschen42
date	Thu, 08 Nov 2018 23:06:09 -0500
parents	ddcc33ff3205
children	2ae2d26e3270

comparison

equal deleted inserted replaced

-:ddcc33ff3205
+:ddaf84e15d06
 , scaleC = "pareto" # data centered and pareto scaled here only. This line fixes issue #2.
 )
 # strip out variables having negligible variance
 x_dataMatrix <- x_dataMatrix[,names(my_oplsda@vipVn), drop = FALSE]
 my_oplsda_suppLs_y_levels <- levels(as.factor(my_oplsda@suppLs$y))
-# x_progress(strF(x_dataMatrix))
-# x_progress(strF(my_oplsda))
-#x_progress(head(my_oplsda_suppLs_y_levels))
-#x_progress(unique(my_oplsda_suppLs_y_levels))
 fctr_lvl_1 <- my_oplsda_suppLs_y_levels[1]
 fctr_lvl_2 <- my_oplsda_suppLs_y_levels[2]
 do_s_plot <- function(
 x_env
 , predictor_projection_x   = TRUE
 , cplot_x      = FALSE
 , cor_vs_cov_x = NULL
 )
 {
-#print(ls(x_env))               # "cplot_y" etc
-#print(str(x_env$cplot_y))      # chr "covariance"
 if (cplot_x) {
-#print(x_env$cplot_y)         # "covariance"
 cplot_y_correlation <- (x_env$cplot_y == "correlation")
-#print(cplot_y_correlation)   # FALSE
 }
 if (is.null(cor_vs_cov_x)) {
 my_cor_vs_cov <- cor_vs_cov(
 matrix_x   = x_dataMatrix
 , ropls_x    = my_oplsda
 , x_progress
 )
 } else {
 my_cor_vs_cov <- cor_vs_cov_x
 }
-# print("str(my_cor_vs_cov)")
 # str(my_cor_vs_cov)
 if (is.null(my_cor_vs_cov) || sum(!is.na(my_cor_vs_cov$tsv1$covariance)) < 2) {
 if (is.null(cor_vs_cov_x)) {
 x_progress("No cor_vs_cov data produced")
 }
 "Features influencing orthogonal projection for %s versus %s"
 , fctr_lvl_1, fctr_lvl_2)
 }
 main_cex <- min(1.0, 46.0/nchar(main_label))
 my_feature_label_slant <- -30 # slant feature labels 30 degrees downward
+my_pch <- sapply(X = cor_p_value, function(x) if (x < 0.01) 16 else if (x < 0.05) 17 else 18)
 plot(
 y = my_y
 , x = my_x
 , type = "p"
 , xlim = my_xlim
 , xlab = my_xlab
 , ylab = my_ylab
 , main = main_label
 , cex.main = main_cex
 , cex = cex
-, pch = 16
+, pch = my_pch
 , col = my_col
 )
 low_x <- -0.7 * lim_x
 high_x <- 0.7 * lim_x
 if (projection == 1 && !cplot_x) {
 X = (my_y > 0)
 , FUN = function(x) { if (x) y_text_off else -y_text_off }
 )
 }
 label_features <- function(x_arg, y_arg, labels_arg, slant_arg) {
-# print("str(x_arg)")
-# print(str(x_arg))
-# print("str(y_arg)")
-# print(str(y_arg))
-# print("str(labels_arg)")
-# print(str(labels_arg))
 if (length(labels_arg) > 0) {
 unique_slant <- unique(slant_arg)
 if (length(unique_slant) == 1) {
 text(
 y = y_arg
 return ( NULL )
 })
 }
 cor_vs_cov_try <- function(
-matrix_x
+matrix_x                      # rows are samples; columns, features
-, ropls_x
+, ropls_x                       # an instance of ropls::opls
-, predictor_projection_x = TRUE
+, predictor_projection_x = TRUE # TRUE for predictor projection; FALSE for orthogonal projection
-, x_progress = print
+, x_progress = print            # function to produce progress and error messages
 ) {
 x_class <- class(ropls_x)
-if ( !( as.character(x_class) == "opls" ) ) { # || !( attr(class(x_class),"package") == "ropls" ) )
+if ( !( as.character(x_class) == "opls" ) ) {
 stop(
 paste(
 "cor_vs_cov: Expected ropls_x to be of class ropls::opls but instead it was of class "
 , as.character(x_class)
 )
 )
 }
+if ( !ropls_x@suppLs$algoC == "nipals" ) {
+# suppLs$algoC - Character: algorithm used - "svd" for singular value decomposition; "nipals" for NIPALS
+stop(
+paste(
+"cor_vs_cov: Expected ropls::opls instance to have been computed by the NIPALS algorithm rather than "
+, ropls_x@suppLs$algoC
+)
+)
+}
 result <- list()
 result$projection <- projection <- if (predictor_projection_x) 1 else 2
-# suppLs$algoC - Character: algorithm used - "svd" for singular value decomposition; "nipals" for NIPALS
-if ( ropls_x@suppLs$algoC == "nipals") {
+# I used equations (1) and (2) from Wiklund 2008, doi:10.1021/ac0713510
-# Equations (1) and (2) from *Supplement to* Wiklund 2008, doi:10.1021/ac0713510
+#   (and not from the supplement despite the statement that, for the NIPALS algorithm,
-mag <- function(one_dimensional) sqrt(sum(one_dimensional * one_dimensional))
+#   the equations from the supplement should be used) because the
-mag_xi <- sapply(X = 1:ncol(matrix_x), FUN = function(x) mag(matrix_x[,x]))
+#   Pearson/Galton coefficient of correlation is defined as
-if (predictor_projection_x)
+#   $$
-score_matrix <- ropls_x@scoreMN
+#      \rho_{X,Y}= \frac{\operatorname{cov}(X,Y)}{\sigma_X \sigma_Y}
-else
+#   $$
-score_matrix <- ropls_x@orthoScoreMN
+#   as described (among other places) on Wikipedia at
-score_matrix_transposed <- t(score_matrix)
+#     https://en.wikipedia.org/wiki/Pearson_correlation_coefficient#For_a_population
-score_matrix_magnitude <- mag(score_matrix)
+# The equations in the supplement said to use, for the predictive component t1,
-result$covariance <-
+#      \rho_{t1,X_i}= \frac{\operatorname{cov}(t1,X_i)}{(\operatorname{mag}(t1))(\operatorname{mag}(X_i))}
-score_matrix_transposed %*% matrix_x / ( score_matrix_magnitude * score_matrix_magnitude )
+# but the results that I got were dramatically different from published results for S-PLOTs;
-result$correlation <-
+# perhaps my data are not centered exactly the same way that theirs were.
-score_matrix_transposed %*% matrix_x / ( score_matrix_magnitude * mag_xi )
+# The correlations calculated here are in agreement with those calculated with the code from
-} else {
+#   page 22 of https://cran.r-project.org/web/packages/muma/muma.pdf
-# WARNING - untested code - I don't have test data to exercise this branch
+# I did transform covariance to "relative covariance" (relative to the maximum value)
-# Equations (1) and (2) from Wiklund 2008, doi:10.1021/ac0713510
+#   to keep the figures consistent with one another.
-# scoreMN - Numerical matrix of x scores (T; dimensions: nrow(x) x predI) X = TP' + E; Y = TC' + F
-if (predictor_projection_x)
+# count the features (one column for each feature)
-score_matrix <- ropls_x@scoreMN
+Nfeatures <- ncol(matrix_x)
-else
+# count the samples (one row for each sample)
-score_matrix <- ropls_x@orthoScoreMN
+Nobservations <- nrow(matrix_x)
-score_matrix_transposed <- t(score_matrix)
+# a one-dimensional magnitude function (i.e., take the vector norm)
-cov_divisor <- nrow(matrix_x) - 1
+vector_norm <- function(one_dimensional) sqrt(sum(one_dimensional * one_dimensional))
-result$covariance <- sapply(
+# calculate the standard deviation for each feature
-X = 1:ncol(matrix_x)
+sd_xi <- sapply(X = 1:Nfeatures, FUN = function(x) sd(matrix_x[,x]))
-, FUN = function(x) score_matrix_transposed %*% matrix_x[,x] / cov_divisor
+# choose whether to plot the predictive score vector or orthogonal score vector
-)
+if (predictor_projection_x)
-score_sd <- sapply(
+score_matrix <- ropls_x@scoreMN
-X = 1:ncol(score_matrix)
+else
-, FUN = function(x) sd(score_matrix[,x])
+score_matrix <- ropls_x@orthoScoreMN
-)
+# transpose the score (or orthoscore) vector for use as a premultiplier in covariance calculation
-# xSdVn - Numerical vector: variable standard deviations of the 'x' matrix
+score_matrix_transposed <- t(score_matrix)
-xSdVn <- ropls_x@xSdVn
+# compute the norm of the vector (i.e., the magnitude)
-result$correlation <- sapply(
+score_matrix_magnitude  <- vector_norm(score_matrix)
-X = 1:ncol(matrix_x)
+# compute the standard deviation of the vector
-, FUN = function(x) {
+score_matrix_sd         <- sd(score_matrix)
-( score_matrix_transposed / score_sd ) %*% ( matrix_x[,x] / (xSdVn[x] * cov_divisor) )
+# compute the relative covariance of each feature with the score vector
-}
+result$covariance <-
-)
+score_matrix_transposed %*% matrix_x / ( score_matrix_magnitude * score_matrix_magnitude )
-}
+# compute the correlation of each feature with the score vector
-result$correlation <- result$correlation[ 1, , drop = TRUE ]
+result$correlation <-
-result$covariance  <- result$covariance [ 1, , drop = TRUE ]
+score_matrix_transposed %*% matrix_x / ( (Nobservations - 1) * ( score_matrix_sd * sd_xi ) )
-# Variant 4 of Variable Influence on Projection for OPLS from Galindo_Prieto_2014
+# convert covariance and correlation from one-dimensional matrices to arrays of values,
+#   which are accessed by feature name below
+p1     <- result$covariance  <- result$covariance [ 1, , drop = TRUE ]
+# x_progress("strF(p1)")
+# x_progress(strF(p1))
+pcorr1 <- result$correlation <- result$correlation[ 1, , drop = TRUE ]
+# x_progress("pearson strF(pcorr1)")
+# x_progress(strF(pcorr1))
+# x_progress(typeof(pcorr1))
+# x_progress(str(pcorr1))
+# # this is how to use Spearman correlation instead of pearson
+# result$spearcor <- sapply(
+#   X = 1:Nfeatures
+# , FUN = function(i) {
+#     stats::cor(
+#       x = as.vector(score_matrix)
+#     , y = as.vector(matrix_x[,i])
+#     # , method = "spearman"
+#     , method = "pearson"
+#     )
+#   }
+# )
+# names(result$spearcor) <- names(p1)
+# pcorr1 <- result$spearcor
+# x_progress("spearman strF(pcorr1)")
+# x_progress(strF(pcorr1))
+# x_progress(typeof(pcorr1))
+# x_progress(str(pcorr1))
+# pcorr1 <- result$correlation <- result$spearcor
+# correl.ci(r, n, a = 0.05, rho = 0)
+correl_pci <- lapply(
+X = 1:Nfeatures
+, FUN = function(i) correl.ci(r = pcorr1[i], n = Nobservations)
+)
+result$p_value_raw <- sapply(
+X = 1:Nfeatures
+, FUN = function(i) correl_pci[[i]]$p.value
+)
+result$p_value_raw[is.na(result$p_value_raw)] <- 0.0
+result$ci_lower <- sapply(
+X = 1:Nfeatures
+, FUN = function(i) correl_pci[[i]]$CI['lower']
+)
+result$ci_upper <- sapply(
+X = 1:Nfeatures
+, FUN = function(i) correl_pci[[i]]$CI['upper']
+)
+# extract "variant 4 of Variable Influence on Projection for OPLS" (see Galindo_Prieto_2014, DOI 10.1002/cem.2627)
 #    Length = number of features; labels = feature identifiers.  (The same is true for $correlation and $covariance.)
 result$vip4p     <- as.numeric(ropls_x@vipVn)
 result$vip4o     <- as.numeric(ropls_x@orthoVipVn)
+if (length(result$vip4o) == 0) result$vip4o <- NA
+# extract the loadings
 result$loadp     <- as.numeric(ropls_x@loadingMN)
 result$loado     <- as.numeric(ropls_x@orthoLoadingMN)
 # get the level names
 level_names      <- sort(levels(as.factor(ropls_x@suppLs$y)))
 fctr_lvl_1       <- level_names[1]
 fctr_lvl_2       <- level_names[2]
 feature_count    <- length(ropls_x@vipVn)
 result$level1    <- rep.int(x = fctr_lvl_1, times = feature_count)
 result$level2    <- rep.int(x = fctr_lvl_2, times = feature_count)
-superresult <- list()
-if (length(result$vip4o) == 0) result$vip4o <- NA
 greaterLevel <- sapply(
 X = result$correlation
 , FUN = function(my_corr)
 tryCatch({
 if ( is.nan( my_corr ) ) {
-print("my_corr is NaN")
 NA
 } else {
 if ( my_corr < 0 ) fctr_lvl_1 else fctr_lvl_2
 }
 }, error = function(e) {
 greaterLevel       <- greaterLevel[featureID]
 result$correlation <- result$correlation[featureID]
 result$covariance  <- result$covariance[featureID]
 # end fixes for https://github.com/HegemanLab/w4mcorcov_galaxy_wrapper/issues/1
+# build a data frame to hold the content for the tab-separated values file
 tsv1 <- data.frame(
-featureID           = featureID
+featureID     = featureID
-, factorLevel1        = result$level1
+, factorLevel1  = result$level1
-, factorLevel2        = result$level2
+, factorLevel2  = result$level2
-, greaterLevel        = greaterLevel
+, greaterLevel  = greaterLevel
-, projection          = result$projection
+, projection    = result$projection
-, correlation         = result$correlation
+, correlation   = result$correlation
-, covariance          = result$covariance
+, covariance    = result$covariance
-, vip4p               = result$vip4p
+, vip4p         = result$vip4p
-, vip4o               = result$vip4o
+, vip4o         = result$vip4o
-, loadp               = result$loadp
+, loadp         = result$loadp
-, loado               = result$loado
+, loado         = result$loado
-, row.names           = NULL
+, cor_p_val_raw = result$p_value_raw
+, cor_p_value   = p.adjust(p = result$p_value_raw, method = "BY")
+, cor_ci_lower  = result$ci_lower
+, cor_ci_upper  = result$ci_upper
 )
-tsv1 <- tsv1[!is.na(tsv1$correlation),]
+rownames(tsv1) <- tsv1$featureID
-tsv1 <- tsv1[!is.na(tsv1$covariance),]
-superresult$tsv1 <- tsv1
+# build the superresult, i.e., the result returned by this function
-rownames(superresult$tsv1) <- tsv1$featureID
+superresult <- list()
 superresult$projection <- result$projection
 superresult$covariance <- result$covariance
 superresult$correlation <- result$correlation
 superresult$vip4p <- result$vip4p
 superresult$vip4o <- result$vip4o
 superresult$loadp <- result$loadp
 superresult$loado <- result$loado
+superresult$cor_p_value <- tsv1$cor_p_value
 superresult$details <- result
-result$superresult <- superresult
-# Include thise in case future consumers of this routine want to use it in currently unanticipated ways
+# remove any rows having NA for covariance or correlation
-result$oplsda    <- ropls_x
+tsv1 <- tsv1[!is.na(tsv1$correlation),]
-result$predictor <- ropls_x@suppLs$y   # in case future consumers of this routine want to use it in currently unanticipated ways
+tsv1 <- tsv1[!is.na(tsv1$covariance),]
+superresult$tsv1 <- tsv1
+# # I did not include these but left them commented out in case future
+# #   consumers of this routine want to use it in currently unanticipated ways
+# result$superresult <- superresult
+# result$oplsda    <- ropls_x
+# result$predictor <- ropls_x@suppLs$y
 return (superresult)
 }
+# Code for correl.ci was adapted from correl function from:
+#   @book{
+#     Tsagris_2018,
+#     author = {Tsagris, Michail},
+#     year = {2018},
+#     link = {https://www.researchgate.net/publication/324363311_Multivariate_data_analysis_in_R},
+#     title = {Multivariate data analysis in R}
+#   }
+# which follows
+#   https://en.wikipedia.org/wiki/Fisher_transformation#Definition
+correl.ci <- function(r, n, a = 0.05, rho = 0) { # tests r against rho on Fisher's z scale; returns list(correlation, p.value, CI)
+## r is the calculated correlation coefficient for n pairs
+## a is the significance level; the confidence limits below use the quantile qnorm(1 - a/2)
+## rho is the hypothesised correlation (the value r is tested against; default 0)
+zh0 <- atanh(rho) # 0.5*log((1+rho)/(1-rho)), i.e., Fisher's z-transformation for Ho
+zh1 <- atanh(r)   # 0.5*log((1+r)/(1-r)), i.e., Fisher's z-transformation for H1
+se <- (1 - r^2)/sqrt(n - 3) ## standard error used for both the test statistic and the confidence limits; NOTE(review): the Fisher-transformation reference cited for this function gives 1/sqrt(n - 3) as the standard error of z -- confirm that the (1 - r^2) factor is intended
+test <- (zh1 - zh0)/se ### test statistic: difference of the two z values in units of se
+pvalue <- 2*(1 - pnorm(abs(test))) ## two-sided p-value from the standard normal distribution
+zL <- zh1 - qnorm(1 - a/2)*se # lower confidence limit on the z scale
+zH <- zh1 + qnorm(1 - a/2)*se # upper confidence limit on the z scale
+fishL <- tanh(zL) # (exp(2*zL)-1)/(exp(2*zL)+1), i.e., lower confidence limit back-transformed to the correlation scale
+fishH <- tanh(zH) # (exp(2*zH)-1)/(exp(2*zH)+1), i.e., upper confidence limit back-transformed to the correlation scale
+CI <- c(fishL, fishH)
+names(CI) <- c('lower', 'upper') # callers index the limits by name, e.g. CI['lower']
+list(correlation = r, p.value = pvalue, CI = CI) # r is returned unchanged alongside the p-value and the named limits
+}
 # vim: sw=2 ts=2 et :

Mercurial > repos > eschen42 > w4mcorcov

comparison w4mcorcov_calc.R @ 12:ddaf84e15d06 draft