comparison w4mcorcov_calc.R @ 12:ddaf84e15d06 draft

planemo upload for repository https://github.com/HegemanLab/w4mcorcov_galaxy_wrapper/tree/master commit 6775c83b89d9d903c81a2229cdc200fc93538dfe-dirty
author eschen42
date Thu, 08 Nov 2018 23:06:09 -0500
parents ddcc33ff3205
children 2ae2d26e3270
comparison
equal deleted inserted replaced
11:ddcc33ff3205 12:ddaf84e15d06
36 , scaleC = "pareto" # data centered and pareto scaled here only. This line fixes issue #2. 36 , scaleC = "pareto" # data centered and pareto scaled here only. This line fixes issue #2.
37 ) 37 )
38 # strip out variables having negligible variance 38 # strip out variables having negligible variance
39 x_dataMatrix <- x_dataMatrix[,names(my_oplsda@vipVn), drop = FALSE] 39 x_dataMatrix <- x_dataMatrix[,names(my_oplsda@vipVn), drop = FALSE]
40 my_oplsda_suppLs_y_levels <- levels(as.factor(my_oplsda@suppLs$y)) 40 my_oplsda_suppLs_y_levels <- levels(as.factor(my_oplsda@suppLs$y))
41 # x_progress(strF(x_dataMatrix)) 41
42 # x_progress(strF(my_oplsda))
43 #x_progress(head(my_oplsda_suppLs_y_levels))
44 #x_progress(unique(my_oplsda_suppLs_y_levels))
45 fctr_lvl_1 <- my_oplsda_suppLs_y_levels[1] 42 fctr_lvl_1 <- my_oplsda_suppLs_y_levels[1]
46 fctr_lvl_2 <- my_oplsda_suppLs_y_levels[2] 43 fctr_lvl_2 <- my_oplsda_suppLs_y_levels[2]
47 do_s_plot <- function( 44 do_s_plot <- function(
48 x_env 45 x_env
49 , predictor_projection_x = TRUE 46 , predictor_projection_x = TRUE
50 , cplot_x = FALSE 47 , cplot_x = FALSE
51 , cor_vs_cov_x = NULL 48 , cor_vs_cov_x = NULL
52 ) 49 )
53 { 50 {
54 #print(ls(x_env)) # "cplot_y" etc
55 #print(str(x_env$cplot_y)) # chr "covariance"
56 if (cplot_x) { 51 if (cplot_x) {
57 #print(x_env$cplot_y) # "covariance"
58 cplot_y_correlation <- (x_env$cplot_y == "correlation") 52 cplot_y_correlation <- (x_env$cplot_y == "correlation")
59 #print(cplot_y_correlation) # FALSE
60 } 53 }
61 if (is.null(cor_vs_cov_x)) { 54 if (is.null(cor_vs_cov_x)) {
62 my_cor_vs_cov <- cor_vs_cov( 55 my_cor_vs_cov <- cor_vs_cov(
63 matrix_x = x_dataMatrix 56 matrix_x = x_dataMatrix
64 , ropls_x = my_oplsda 57 , ropls_x = my_oplsda
66 , x_progress 59 , x_progress
67 ) 60 )
68 } else { 61 } else {
69 my_cor_vs_cov <- cor_vs_cov_x 62 my_cor_vs_cov <- cor_vs_cov_x
70 } 63 }
71 # print("str(my_cor_vs_cov)")
72 # str(my_cor_vs_cov) 64 # str(my_cor_vs_cov)
73 if (is.null(my_cor_vs_cov) || sum(!is.na(my_cor_vs_cov$tsv1$covariance)) < 2) { 65 if (is.null(my_cor_vs_cov) || sum(!is.na(my_cor_vs_cov$tsv1$covariance)) < 2) {
74 if (is.null(cor_vs_cov_x)) { 66 if (is.null(cor_vs_cov_x)) {
75 x_progress("No cor_vs_cov data produced") 67 x_progress("No cor_vs_cov data produced")
76 } 68 }
164 "Features influencing orthogonal projection for %s versus %s" 156 "Features influencing orthogonal projection for %s versus %s"
165 , fctr_lvl_1, fctr_lvl_2) 157 , fctr_lvl_1, fctr_lvl_2)
166 } 158 }
167 main_cex <- min(1.0, 46.0/nchar(main_label)) 159 main_cex <- min(1.0, 46.0/nchar(main_label))
168 my_feature_label_slant <- -30 # slant feature labels 30 degrees downward 160 my_feature_label_slant <- -30 # slant feature labels 30 degrees downward
161 my_pch <- sapply(X = cor_p_value, function(x) if (x < 0.01) 16 else if (x < 0.05) 17 else 18)
169 plot( 162 plot(
170 y = my_y 163 y = my_y
171 , x = my_x 164 , x = my_x
172 , type = "p" 165 , type = "p"
173 , xlim = my_xlim 166 , xlim = my_xlim
175 , xlab = my_xlab 168 , xlab = my_xlab
176 , ylab = my_ylab 169 , ylab = my_ylab
177 , main = main_label 170 , main = main_label
178 , cex.main = main_cex 171 , cex.main = main_cex
179 , cex = cex 172 , cex = cex
180 , pch = 16 173 , pch = my_pch
181 , col = my_col 174 , col = my_col
182 ) 175 )
183 low_x <- -0.7 * lim_x 176 low_x <- -0.7 * lim_x
184 high_x <- 0.7 * lim_x 177 high_x <- 0.7 * lim_x
185 if (projection == 1 && !cplot_x) { 178 if (projection == 1 && !cplot_x) {
215 X = (my_y > 0) 208 X = (my_y > 0)
216 , FUN = function(x) { if (x) y_text_off else -y_text_off } 209 , FUN = function(x) { if (x) y_text_off else -y_text_off }
217 ) 210 )
218 } 211 }
219 label_features <- function(x_arg, y_arg, labels_arg, slant_arg) { 212 label_features <- function(x_arg, y_arg, labels_arg, slant_arg) {
220 # print("str(x_arg)")
221 # print(str(x_arg))
222 # print("str(y_arg)")
223 # print(str(y_arg))
224 # print("str(labels_arg)")
225 # print(str(labels_arg))
226 if (length(labels_arg) > 0) { 213 if (length(labels_arg) > 0) {
227 unique_slant <- unique(slant_arg) 214 unique_slant <- unique(slant_arg)
228 if (length(unique_slant) == 1) { 215 if (length(unique_slant) == 1) {
229 text( 216 text(
230 y = y_arg 217 y = y_arg
849 return ( NULL ) 836 return ( NULL )
850 }) 837 })
851 } 838 }
852 839
853 cor_vs_cov_try <- function( 840 cor_vs_cov_try <- function(
854 matrix_x 841 matrix_x # rows are samples; columns, features
855 , ropls_x 842 , ropls_x # an instance of ropls::opls
856 , predictor_projection_x = TRUE 843 , predictor_projection_x = TRUE # TRUE for predictor projection; FALSE for orthogonal projection
857 , x_progress = print 844 , x_progress = print # function to produce progress and error messages
858 ) { 845 ) {
859 x_class <- class(ropls_x) 846 x_class <- class(ropls_x)
860 if ( !( as.character(x_class) == "opls" ) ) { # || !( attr(class(x_class),"package") == "ropls" ) ) 847 if ( !( as.character(x_class) == "opls" ) ) {
861 stop( 848 stop(
862 paste( 849 paste(
863 "cor_vs_cov: Expected ropls_x to be of class ropls::opls but instead it was of class " 850 "cor_vs_cov: Expected ropls_x to be of class ropls::opls but instead it was of class "
864 , as.character(x_class) 851 , as.character(x_class)
865 ) 852 )
866 ) 853 )
867 } 854 }
855 if ( !ropls_x@suppLs$algoC == "nipals" ) {
856 # suppLs$algoC - Character: algorithm used - "svd" for singular value decomposition; "nipals" for NIPALS
857 stop(
858 paste(
859 "cor_vs_cov: Expected ropls::opls instance to have been computed by the NIPALS algorithm rather than "
860 , ropls_x@suppLs$algoC
861 )
862 )
863 }
868 result <- list() 864 result <- list()
869 result$projection <- projection <- if (predictor_projection_x) 1 else 2 865 result$projection <- projection <- if (predictor_projection_x) 1 else 2
870 # suppLs$algoC - Character: algorithm used - "svd" for singular value decomposition; "nipals" for NIPALS 866
871 if ( ropls_x@suppLs$algoC == "nipals") { 867 # I used equations (1) and (2) from Wiklund 2008, doi:10.1021/ac0713510
872 # Equations (1) and (2) from *Supplement to* Wiklund 2008, doi:10.1021/ac0713510 868 # (and not from the supplement despite the statement that, for the NIPALS algorithm,
873 mag <- function(one_dimensional) sqrt(sum(one_dimensional * one_dimensional)) 869 # the equations from the supplement should be used) because of the definition of the
874 mag_xi <- sapply(X = 1:ncol(matrix_x), FUN = function(x) mag(matrix_x[,x])) 870 # Pearson/Galton coefficient of correlation is defined as
875 if (predictor_projection_x) 871 # $$
876 score_matrix <- ropls_x@scoreMN 872 # \rho_{X,Y}= \frac{\operatorname{cov}(X,Y)}{\sigma_X \sigma_Y}
877 else 873 # $$
878 score_matrix <- ropls_x@orthoScoreMN 874 # as described (among other places) on Wikipedia at
879 score_matrix_transposed <- t(score_matrix) 875 # https://en.wikipedia.org/wiki/Pearson_correlation_coefficient#For_a_population
880 score_matrix_magnitude <- mag(score_matrix) 876 # The equations in the supplement said to use, for the predictive component t1,
881 result$covariance <- 877 # \rho_{t1,X_i}= \frac{\operatorname{cov}(t1,X_i)}{(\operatorname{mag}(t1))(\operatorname{mag}(X_i))}
882 score_matrix_transposed %*% matrix_x / ( score_matrix_magnitude * score_matrix_magnitude ) 878 # but the results that I got were dramatically different from published results for S-PLOTs;
883 result$correlation <- 879 # perhaps my data are not centered exactly the same way that theirs were.
884 score_matrix_transposed %*% matrix_x / ( score_matrix_magnitude * mag_xi ) 880 # The correlations calculated here are in agreement with those calculated with the code from
885 } else { 881 # page 22 of https://cran.r-project.org/web/packages/muma/muma.pdf
886 # WARNING - untested code - I don't have test data to exercise this branch 882 # I did transform covariance to "relative covariance" (relative to the maximum value)
887 # Equations (1) and (2) from Wiklund 2008, doi:10.1021/ac0713510 883 # to keep the figures consistent with one another.
888 # scoreMN - Numerical matrix of x scores (T; dimensions: nrow(x) x predI) X = TP' + E; Y = TC' + F 884
889 if (predictor_projection_x) 885 # count the features (one column for each sample)
890 score_matrix <- ropls_x@scoreMN 886 Nfeatures <- ncol(matrix_x)
891 else 887 # count the samples (one row for each sample)
892 score_matrix <- ropls_x@orthoScoreMN 888 Nobservations <- nrow(matrix_x)
893 score_matrix_transposed <- t(score_matrix) 889 # a one-dimensional magnitude function (i.e., take the vector norm)
894 cov_divisor <- nrow(matrix_x) - 1 890 vector_norm <- function(one_dimensional) sqrt(sum(one_dimensional * one_dimensional))
895 result$covariance <- sapply( 891 # calculate the standard deviation for each feature
896 X = 1:ncol(matrix_x) 892 sd_xi <- sapply(X = 1:Nfeatures, FUN = function(x) sd(matrix_x[,x]))
897 , FUN = function(x) score_matrix_transposed %*% matrix_x[,x] / cov_divisor 893 # choose whether to plot the predictive score vector or orthogonal score vector
898 ) 894 if (predictor_projection_x)
899 score_sd <- sapply( 895 score_matrix <- ropls_x@scoreMN
900 X = 1:ncol(score_matrix) 896 else
901 , FUN = function(x) sd(score_matrix[,x]) 897 score_matrix <- ropls_x@orthoScoreMN
902 ) 898 # transpose the score (or orthoscore) vector for use as a premultiplier in covariance calculation
903 # xSdVn - Numerical vector: variable standard deviations of the 'x' matrix 899 score_matrix_transposed <- t(score_matrix)
904 xSdVn <- ropls_x@xSdVn 900 # compute the norm of the vector (i.e., the magnitude)
905 result$correlation <- sapply( 901 score_matrix_magnitude <- vector_norm(score_matrix)
906 X = 1:ncol(matrix_x) 902 # compute the standard deviation of the vector
907 , FUN = function(x) { 903 score_matrix_sd <- sd(score_matrix)
908 ( score_matrix_transposed / score_sd ) %*% ( matrix_x[,x] / (xSdVn[x] * cov_divisor) ) 904 # compute the relative covariance of each feature with the score vector
909 } 905 result$covariance <-
910 ) 906 score_matrix_transposed %*% matrix_x / ( score_matrix_magnitude * score_matrix_magnitude )
911 } 907 # compute the correlation of each feature with the score vector
912 result$correlation <- result$correlation[ 1, , drop = TRUE ] 908 result$correlation <-
913 result$covariance <- result$covariance [ 1, , drop = TRUE ] 909 score_matrix_transposed %*% matrix_x / ( (Nobservations - 1) * ( score_matrix_sd * sd_xi ) )
914 910
915 # Variant 4 of Variable Influence on Projection for OPLS from Galindo_Prieto_2014 911 # convert covariance and correlation from one-dimensional matrices to arrays of values,
912 # which are accessed by feature name below
913 p1 <- result$covariance <- result$covariance [ 1, , drop = TRUE ]
914 # x_progress("strF(p1)")
915 # x_progress(strF(p1))
916
917 pcorr1 <- result$correlation <- result$correlation[ 1, , drop = TRUE ]
918 # x_progress("pearson strF(pcorr1)")
919 # x_progress(strF(pcorr1))
920 # x_progress(typeof(pcorr1))
921 # x_progress(str(pcorr1))
922
923 # # this is how to use Spearman correlation instead of pearson
924 # result$spearcor <- sapply(
925 # X = 1:Nfeatures
926 # , FUN = function(i) {
927 # stats::cor(
928 # x = as.vector(score_matrix)
929 # , y = as.vector(matrix_x[,i])
930 # # , method = "spearman"
931 # , method = "pearson"
932 # )
933 # }
934 # )
935 # names(result$spearcor) <- names(p1)
936 # pcorr1 <- result$spearcor
937 # x_progress("spearman strF(pcorr1)")
938 # x_progress(strF(pcorr1))
939 # x_progress(typeof(pcorr1))
940 # x_progress(str(pcorr1))
941 # pcorr1 <- result$correlation <- result$spearcor
942
943 # correl.ci(r, n, a = 0.05, rho = 0)
944 correl_pci <- lapply(
945 X = 1:Nfeatures
946 , FUN = function(i) correl.ci(r = pcorr1[i], n = Nobservations)
947 )
948 result$p_value_raw <- sapply(
949 X = 1:Nfeatures
950 , FUN = function(i) correl_pci[[i]]$p.value
951 )
952 result$p_value_raw[is.na(result$p_value_raw)] <- 0.0
953 result$ci_lower <- sapply(
954 X = 1:Nfeatures
955 , FUN = function(i) correl_pci[[i]]$CI['lower']
956 )
957 result$ci_upper <- sapply(
958 X = 1:Nfeatures
959 , FUN = function(i) correl_pci[[i]]$CI['upper']
960 )
961
962
963 # extract "variant 4 of Variable Influence on Projection for OPLS" (see Galindo_Prieto_2014, DOI 10.1002/cem.2627)
916 # Length = number of features; labels = feature identifiers. (The same is true for $correlation and $covariance.) 964 # Length = number of features; labels = feature identifiers. (The same is true for $correlation and $covariance.)
917 result$vip4p <- as.numeric(ropls_x@vipVn) 965 result$vip4p <- as.numeric(ropls_x@vipVn)
918 result$vip4o <- as.numeric(ropls_x@orthoVipVn) 966 result$vip4o <- as.numeric(ropls_x@orthoVipVn)
967 if (length(result$vip4o) == 0) result$vip4o <- NA
968 # extract the loadings
919 result$loadp <- as.numeric(ropls_x@loadingMN) 969 result$loadp <- as.numeric(ropls_x@loadingMN)
920 result$loado <- as.numeric(ropls_x@orthoLoadingMN) 970 result$loado <- as.numeric(ropls_x@orthoLoadingMN)
921 # get the level names 971 # get the level names
922 level_names <- sort(levels(as.factor(ropls_x@suppLs$y))) 972 level_names <- sort(levels(as.factor(ropls_x@suppLs$y)))
923 fctr_lvl_1 <- level_names[1] 973 fctr_lvl_1 <- level_names[1]
924 fctr_lvl_2 <- level_names[2] 974 fctr_lvl_2 <- level_names[2]
925 feature_count <- length(ropls_x@vipVn) 975 feature_count <- length(ropls_x@vipVn)
926 result$level1 <- rep.int(x = fctr_lvl_1, times = feature_count) 976 result$level1 <- rep.int(x = fctr_lvl_1, times = feature_count)
927 result$level2 <- rep.int(x = fctr_lvl_2, times = feature_count) 977 result$level2 <- rep.int(x = fctr_lvl_2, times = feature_count)
928 superresult <- list()
929 if (length(result$vip4o) == 0) result$vip4o <- NA
930 greaterLevel <- sapply( 978 greaterLevel <- sapply(
931 X = result$correlation 979 X = result$correlation
932 , FUN = function(my_corr) 980 , FUN = function(my_corr)
933 tryCatch({ 981 tryCatch({
934 if ( is.nan( my_corr ) ) { 982 if ( is.nan( my_corr ) ) {
935 print("my_corr is NaN")
936 NA 983 NA
937 } else { 984 } else {
938 if ( my_corr < 0 ) fctr_lvl_1 else fctr_lvl_2 985 if ( my_corr < 0 ) fctr_lvl_1 else fctr_lvl_2
939 } 986 }
940 }, error = function(e) { 987 }, error = function(e) {
953 greaterLevel <- greaterLevel[featureID] 1000 greaterLevel <- greaterLevel[featureID]
954 result$correlation <- result$correlation[featureID] 1001 result$correlation <- result$correlation[featureID]
955 result$covariance <- result$covariance[featureID] 1002 result$covariance <- result$covariance[featureID]
956 # end fixes for https://github.com/HegemanLab/w4mcorcov_galaxy_wrapper/issues/1 1003 # end fixes for https://github.com/HegemanLab/w4mcorcov_galaxy_wrapper/issues/1
957 1004
1005 # build a data frame to hold the content for the tab-separated values file
958 tsv1 <- data.frame( 1006 tsv1 <- data.frame(
959 featureID = featureID 1007 featureID = featureID
960 , factorLevel1 = result$level1 1008 , factorLevel1 = result$level1
961 , factorLevel2 = result$level2 1009 , factorLevel2 = result$level2
962 , greaterLevel = greaterLevel 1010 , greaterLevel = greaterLevel
963 , projection = result$projection 1011 , projection = result$projection
964 , correlation = result$correlation 1012 , correlation = result$correlation
965 , covariance = result$covariance 1013 , covariance = result$covariance
966 , vip4p = result$vip4p 1014 , vip4p = result$vip4p
967 , vip4o = result$vip4o 1015 , vip4o = result$vip4o
968 , loadp = result$loadp 1016 , loadp = result$loadp
969 , loado = result$loado 1017 , loado = result$loado
970 , row.names = NULL 1018 , cor_p_val_raw = result$p_value_raw
1019 , cor_p_value = p.adjust(p = result$p_value_raw, method = "BY")
1020 , cor_ci_lower = result$ci_lower
1021 , cor_ci_upper = result$ci_upper
971 ) 1022 )
972 tsv1 <- tsv1[!is.na(tsv1$correlation),] 1023 rownames(tsv1) <- tsv1$featureID
973 tsv1 <- tsv1[!is.na(tsv1$covariance),] 1024
974 superresult$tsv1 <- tsv1 1025 # build the superresult, i.e., the result returned by this function
975 rownames(superresult$tsv1) <- tsv1$featureID 1026 superresult <- list()
976 superresult$projection <- result$projection 1027 superresult$projection <- result$projection
977 superresult$covariance <- result$covariance 1028 superresult$covariance <- result$covariance
978 superresult$correlation <- result$correlation 1029 superresult$correlation <- result$correlation
979 superresult$vip4p <- result$vip4p 1030 superresult$vip4p <- result$vip4p
980 superresult$vip4o <- result$vip4o 1031 superresult$vip4o <- result$vip4o
981 superresult$loadp <- result$loadp 1032 superresult$loadp <- result$loadp
982 superresult$loado <- result$loado 1033 superresult$loado <- result$loado
1034 superresult$cor_p_value <- tsv1$cor_p_value
983 superresult$details <- result 1035 superresult$details <- result
984 result$superresult <- superresult 1036
985 # Include thise in case future consumers of this routine want to use it in currently unanticipated ways 1037 # remove any rows having NA for covariance or correlation
986 result$oplsda <- ropls_x 1038 tsv1 <- tsv1[!is.na(tsv1$correlation),]
987 result$predictor <- ropls_x@suppLs$y # in case future consumers of this routine want to use it in currently unanticipated ways 1039 tsv1 <- tsv1[!is.na(tsv1$covariance),]
1040 superresult$tsv1 <- tsv1
1041
1042 # # I did not include these but left them commentd out in case future
1043 # # consumers of this routine want to use it in currently unanticipated ways
1044 # result$superresult <- superresult
1045 # result$oplsda <- ropls_x
1046 # result$predictor <- ropls_x@suppLs$y
1047
988 return (superresult) 1048 return (superresult)
989 } 1049 }
990 1050
1051 # Code for correl.ci was adapted from correl function from:
1052 # @book{
1053 # Tsagris_2018,
1054 # author = {Tsagris, Michail},
1055 # year = {2018},
1056 # link = {https://www.researchgate.net/publication/324363311_Multivariate_data_analysis_in_R},
1057 # title = {Multivariate data analysis in R}
1058 # }
1059 # which follows
1060 # https://en.wikipedia.org/wiki/Fisher_transformation#Definition
1061
1062 correl.ci <- function(r, n, a = 0.05, rho = 0) {
1063 ## r is the calculated correlation coefficient for n pairs
1064 ## a is the significance level
1065 ## rho is the hypothesised correlation
1066 zh0 <- atanh(rho) # 0.5*log((1+rho)/(1-rho)), i.e., Fisher's z-transformation for Ho
1067 zh1 <- atanh(r) # 0.5*log((1+r)/(1-r)), i.e., Fisher's z-transformation for H1
1068 se <- (1 - r^2)/sqrt(n - 3) ## standard error for Fisher's z-transformation of Ho
1069 test <- (zh1 - zh0)/se ### test statistic
1070 pvalue <- 2*(1 - pnorm(abs(test))) ## p-value
1071 zL <- zh1 - qnorm(1 - a/2)*se
1072 zH <- zh1 + qnorm(1 - a/2)*se
1073 fishL <- tanh(zL) # (exp(2*zL)-1)/(exp(2*zL)+1), i.e., lower confidence limit
1074 fishH <- tanh(zH) # (exp(2*zH)-1)/(exp(2*zH)+1), i.e., upper confidence limit
1075 CI <- c(fishL, fishH)
1076 names(CI) <- c('lower', 'upper')
1077 list(correlation = r, p.value = pvalue, CI = CI)
1078 }
1079
991 # vim: sw=2 ts=2 et : 1080 # vim: sw=2 ts=2 et :