Mercurial > repos > eschen42 > w4mcorcov
comparison w4mcorcov_salience.R @ 13:2ae2d26e3270 draft
planemo upload for repository https://github.com/HegemanLab/w4mcorcov_galaxy_wrapper/tree/master commit e89c652c0849eb1d5a1e6c9100c72c64a8d388b4
author | eschen42 |
---|---|
date | Wed, 12 Dec 2018 09:20:02 -0500 |
parents | 06c51af11531 |
children |
comparison
equal
deleted
inserted
replaced
12:ddaf84e15d06 | 13:2ae2d26e3270 |
---|---|
17 t_data_matrix <- t(data_matrix) | 17 t_data_matrix <- t(data_matrix) |
18 if ( !is.matrix(t_data_matrix) || !is.numeric(t_data_matrix) ) { | 18 if ( !is.matrix(t_data_matrix) || !is.numeric(t_data_matrix) ) { |
19 failure_action("w4msalience: Expected data_matrix to be a matrix (or data.frame) of numeric") | 19 failure_action("w4msalience: Expected data_matrix to be a matrix (or data.frame) of numeric") |
20 return (NULL) | 20 return (NULL) |
21 } | 21 } |
22 | |
23 feature_names <- colnames(t_data_matrix) | |
24 | |
22 n_features <- ncol(t_data_matrix) | 25 n_features <- ncol(t_data_matrix) |
23 n_features_plus_1 <- 1 + n_features | |
24 features <- colnames(t_data_matrix) | |
25 n_samples <- nrow(t_data_matrix) | 26 n_samples <- nrow(t_data_matrix) |
26 if ( length(sample_class) != n_samples ) { | 27 if ( length(sample_class) != n_samples ) { |
27 strF(data_matrix) | 28 strF(data_matrix) |
28 strF(sample_class) | 29 strF(sample_class) |
29 failure_action(sprintf("w4msalience: The data_matrix has %d samples but sample_class has %d", n_samples, length(sample_class))) | 30 failure_action( |
31 sprintf( | |
32 "w4msalience: The data_matrix has %d samples but sample_class has %d" | |
33 , n_samples | |
34 , length(sample_class) | |
35 ) | |
36 ) | |
30 return (NULL) | 37 return (NULL) |
31 } | 38 } |
32 # end sanity checks | 39 # end sanity checks |
33 | 40 |
34 # "For each feature, 'select sample_class, median(intensity) from feature group by sample_class'." | 41 # "For each feature, 'select sample_class, median(intensity) from feature group by sample_class'." |
37 x = as.data.frame(t_data_matrix) | 44 x = as.data.frame(t_data_matrix) |
38 , by = list(sample_class) | 45 , by = list(sample_class) |
39 , FUN = "median" | 46 , FUN = "median" |
40 ) | 47 ) |
41 | 48 |
42 # "For each feature, 'select sample_class, max(intensity) from feature group by sample_class'." | |
43 maxOfFeatureBySampleClassLevel <- aggregate( | |
44 x = as.data.frame(t_data_matrix) | |
45 , by = list(sample_class) | |
46 , FUN = "max" | |
47 ) | |
48 | |
49 # "For each feature, 'select sample_class, rcv(intensity) from feature group by sample_class'." | 49 # "For each feature, 'select sample_class, rcv(intensity) from feature group by sample_class'." |
50 # cv is less robust; deviation from normality degrades performance | 50 # cv is less robust; deviation from normality degrades performance |
51 # cv(x) == sd(x) / mean(x) | 51 # cv(x) == sd(x) / mean(x) |
52 # rcv is a "robust" coefficient of variation, expressed as a proportion | 52 # rcv is a "robust" coefficient of variation, expressed as a proportion |
53 # rcv(x) == mad(x) / median(x) | 53 # rcv(x) == mad(x) / median(x) |
54 madOfFeatureBySampleClassLevel <- aggregate( | 54 madOfFeatureBySampleClassLevel <- aggregate( |
55 x = as.data.frame(t_data_matrix) | 55 x = as.data.frame(t_data_matrix) |
56 , by = list(sample_class) | 56 , by = list(sample_class) |
57 , FUN = "mad" | 57 , FUN = "mad" |
58 ) | 58 ) |
59 rcvOfFeatureBySampleClassLevel <- as.matrix( | 59 |
60 madOfFeatureBySampleClassLevel[,2:n_features_plus_1] / medianOfFeatureBySampleClassLevel[,2:n_features_plus_1] | 60 # Note that `apply(X=array(1:10), MARGIN = 1, FUN = function(x) return(c(x,x^2)))` |
61 # produces a matrix with two rows and ten columns | |
62 | |
63 my_list <- apply( | |
64 X = array(1:n_features) | |
65 , MARGIN = 1 | |
66 , FUN = function(x) { | |
67 my_df <- data.frame( | |
68 median = medianOfFeatureBySampleClassLevel[ , 1 + x] | |
69 , mad = madOfFeatureBySampleClassLevel[ , 1 + x] | |
70 ) | |
71 my_df$salient_level <- medianOfFeatureBySampleClassLevel[ , 1] | |
72 my_df <- my_df[ order(my_df$median, decreasing = TRUE), ] | |
73 my_dist_df <- my_df[ 1:2, ] | |
74 # "robust coefficient of variation", i.e., | |
75 # mad(feature-intensity for class-level max_level) / median(feature-intensity for class-level max_level) | |
76 rcv_result <- my_dist_df$mad[1] / my_dist_df$median[1] | |
77 dist_result <- | |
78 ( my_dist_df$median[1] - my_dist_df$median[2] ) / | |
79 sqrt( my_dist_df$mad[1] * my_dist_df$mad[2] ) | |
80 if (is.infinite(dist_result) || is.nan(dist_result)) | |
81 dist_result <- 0 | |
82 mean_median <- mean(my_df$median) | |
83 salience_result <- if (mean_median > 0) my_df$median[1] / mean_median else 0 | |
84 return ( | |
85 data.frame( | |
86 dist_result = dist_result | |
87 , max_median = my_df$median[1] | |
88 , mean_median = mean_median | |
89 , salience_result = salience_result | |
90 , salient_level = my_df$salient_level[1] | |
91 , rcv_result = rcv_result | |
92 ) | |
93 ) | |
94 } | |
61 ) | 95 ) |
62 rcvOfFeatureBySampleClassLevel[is.nan(rcvOfFeatureBySampleClassLevel)] <- max(9999,max(rcvOfFeatureBySampleClassLevel, na.rm = TRUE)) | 96 results_matrix <- sapply(X = 1:n_features, FUN = function(i) my_list[[i]]) |
97 results_df <- as.data.frame(t(results_matrix)) | |
63 | 98 |
64 # "For each feature, 'select max(max_feature_intensity) from feature'." | 99 relative_salient_distance <- unlist(results_df$dist_result) |
65 maxApplyMaxOfFeatureBySampleClassLevel <- sapply( | 100 salience <- unlist(results_df$salience_result) |
66 X = 1:n_features | 101 salient_level <- unlist(results_df$salient_level) |
67 , FUN = function(i) { | 102 max_median <- unlist(results_df$max_median) |
68 match( | 103 mean_median <- unlist(results_df$mean_median) |
69 max(maxOfFeatureBySampleClassLevel[, i + 1]) | 104 rcv_result <- unlist(results_df$rcv_result) |
70 , maxOfFeatureBySampleClassLevel[, i + 1] | 105 |
71 ) | 106 salience_df <- |
72 } | 107 data.frame( |
108 # the feature name | |
109 feature = feature_names | |
110 # the name (or factor-level) of the class-level with the highest median intensity for the feature | |
111 , max_level = salient_level | |
112 # the median intensity for the feature and the level max_level | |
113 , max_median = max_median | |
114 # the distance between the maximum intensities for the feature at the two highest levels | |
115 , relative_salient_distance = relative_salient_distance | |
116 # the coefficient of variation (expressed as a proportion) for the intensity for the feature and the level max_level | |
117 , salience_rcv = rcv_result | |
118 # the mean of the medians of intensity for all class-levels for the feature | |
119 , mean_median = mean_median | |
120 # raw salience is the ratio of the most-prominent level to the mean of all levels for the feature | |
121 , salience = salience | |
122 # don't coerce strings to factors (this is a parameter for the data.frame constructor, not a column of the data.frame) | |
123 , stringsAsFactors = FALSE | |
73 ) | 124 ) |
74 | |
75 # "For each feature, 'select mean(median_feature_intensity) from feature'." | |
76 meanApplyMedianOfFeatureBySampleClassLevel <- sapply( | |
77 X = 1:n_features | |
78 , FUN = function(i) mean(medianOfFeatureBySampleClassLevel[, i + 1]) | |
79 ) | |
80 | |
81 # Compute the 'salience' for each feature, i.e., how salient the intensity of a feature | |
82 # is for one particular class-level relative to the intensity across all class-levels. | |
83 salience_df <- data.frame( | |
84 # the feature name | |
85 feature = features | |
86 # the name (or factor-level) of the class-level with the highest median intensity for the feature | |
87 , max_level = medianOfFeatureBySampleClassLevel[maxApplyMaxOfFeatureBySampleClassLevel,1] | |
88 # the median intensity for the feature and the level max_level | |
89 , max_median = sapply( | |
90 X = 1:n_features | |
91 , FUN = function(i) { | |
92 maxOfFeatureBySampleClassLevel[maxApplyMaxOfFeatureBySampleClassLevel[i], 1 + i] | |
93 } | |
94 ) | |
95 # the coefficient of variation (expressed as a proportion) for the intensity for the feature and the level max_level | |
96 , max_rcv = sapply( | |
97 X = 1:n_features | |
98 , FUN = function(i) { | |
99 rcvOfFeatureBySampleClassLevel[maxApplyMaxOfFeatureBySampleClassLevel[i], i] | |
100 } | |
101 ) | |
102 # the mean of the medians of intensity for all class-levels for the feature | |
103 , mean_median = meanApplyMedianOfFeatureBySampleClassLevel | |
104 # don't coerce strings to factors (this is a parameter for the data.frame constructor, not a column of the data.frame) | |
105 , stringsAsFactors = FALSE | |
106 ) | |
107 # raw salience is the ratio of the most-prominent level to the mean of all levels for the feature | |
108 salience_df$salience <- sapply( | |
109 X = 1:nrow(salience_df) | |
110 , FUN = function(i) with(salience_df[i,], if (mean_median > 0) { max_median / mean_median } else { 0 } ) | |
111 ) | |
112 # "robust coefficient of variation, i.e., mad(feature-intensity for class-level max_level) / median(feature-intensity for class-level max_level) | |
113 salience_df$salient_rcv <- salience_df$max_rcv | |
114 | 125 |
115 return (salience_df) | 126 return (salience_df) |
116 } | 127 } |
117 | 128 |