comparison w4mcorcov_salience.R @ 0:23f9fad4edfc draft

planemo upload for repository https://github.com/HegemanLab/w4mcorcov_galaxy_wrapper/tree/master commit bd26542b811de06c1a877337a2840a9f899c2b94
author eschen42
date Mon, 16 Oct 2017 14:56:52 -0400
parents
children 06c51af11531
comparison
equal deleted inserted replaced
-1:000000000000 0:23f9fad4edfc
1 w4msalience <- function(
2 data_matrix # a matrix of intensities; features as rows, and samples as columns
3 , sample_class # a vector of sample class-levels; length(sample_class) == ncol(data_matrix)
4 , failure_action = stop
5 ) {
6 library(stats)
7 # begin sanity checks
8 if ( !is.vector(sample_class) || !( is.character(sample_class) || is.factor(sample_class) ) ) {
9 failure_action("w4msalience: Expected sample_class to be a vector of characters of factor-levels")
10 return (NULL)
11 }
12 if ( !is.matrix(data_matrix) && !is.data.frame(data_matrix) ) {
13 failure_action("w4msalience: Expected data_matrix to be a matrix (or data.frame) of numeric")
14 return (NULL)
15 }
16 # transpose data_matrix so that columns are the features
17 t_data_matrix <- t(data_matrix)
18 if ( !is.matrix(t_data_matrix) || !is.numeric(t_data_matrix) ) {
19 failure_action("w4msalience: Expected data_matrix to be a matrix (or data.frame) of numeric")
20 return (NULL)
21 }
22 n_features <- ncol(t_data_matrix)
23 n_features_plus_1 <- 1 + n_features
24 features <- colnames(t_data_matrix)
25 n_samples <- nrow(t_data_matrix)
26 if ( length(sample_class) != n_samples ) {
27 strF(data_matrix)
28 strF(sample_class)
29 failure_action(sprintf("w4msalience: The data_matrix has %d samples but sample_class has %d", n_samples, length(sample_class)))
30 return (NULL)
31 }
32 # end sanity checks
33
34 # "For each feature, 'select sample_class, median(intensity) from feature group by sample_class'."
35 # The first column(s) of the result of aggregate has the classifier value(s) specified in the 'by' list.
36 medianOfFeatureBySampleClassLevel <- aggregate(
37 x = as.data.frame(t_data_matrix)
38 , by = list(sample_class)
39 , FUN = "median"
40 )
41
42 # "For each feature, 'select sample_class, max(intensity) from feature group by sample_class'."
43 maxOfFeatureBySampleClassLevel <- aggregate(
44 x = as.data.frame(t_data_matrix)
45 , by = list(sample_class)
46 , FUN = "max"
47 )
48
49 # "For each feature, 'select sample_class, rcv(intensity) from feature group by sample_class'."
50 # cv is less robust; deviation from normality degrades performance
51 # cv(x) == sd(x) / mean(x)
52 # rcv is a "robust" coefficient of variation, expressed as a proportion
53 # rcv(x) == mad(x) / median(x)
54 madOfFeatureBySampleClassLevel <- aggregate(
55 x = as.data.frame(t_data_matrix)
56 , by = list(sample_class)
57 , FUN = "mad"
58 )
59 rcvOfFeatureBySampleClassLevel <- as.matrix(
60 madOfFeatureBySampleClassLevel[,2:n_features_plus_1] / medianOfFeatureBySampleClassLevel[,2:n_features_plus_1]
61 )
62 rcvOfFeatureBySampleClassLevel[is.nan(rcvOfFeatureBySampleClassLevel)] <- max(9999,max(rcvOfFeatureBySampleClassLevel, na.rm = TRUE))
63
64 # "For each feature, 'select max(median_feature_intensity) from feature'."
65 maxApplyMedianOfFeatureBySampleClassLevel <- sapply(
66 X = 1:n_features
67 , FUN = function(i) {
68 match(
69 max(maxOfFeatureBySampleClassLevel[, i + 1])
70 , maxOfFeatureBySampleClassLevel[, i + 1]
71 )
72 }
73 )
74
75 # "For each feature, 'select mean(median_feature_intensity) from feature'."
76 meanApplyMedianOfFeatureBySampleClassLevel <- sapply(
77 X = 1:n_features
78 , FUN = function(i) mean(medianOfFeatureBySampleClassLevel[, i + 1])
79 )
80
81 # Compute the 'salience' for each feature, i.e., how salient the intensity of a feature
82 # is for one particular class-level relative to the intensity across all class-levels.
83 salience_df <- data.frame(
84 # the feature name
85 feature = features
86 # the name (or factor-level) of the class-level with the highest median intensity for the feature
87 , max_level = medianOfFeatureBySampleClassLevel[maxApplyMedianOfFeatureBySampleClassLevel,1]
88 # the median intensity for the feature and the level max_level
89 , max_median = sapply(
90 X = 1:n_features
91 , FUN = function(i) {
92 maxOfFeatureBySampleClassLevel[maxApplyMedianOfFeatureBySampleClassLevel[i], 1 + i]
93 }
94 )
95 # the coefficient of variation (expressed as a proportion) for the intensity for the feature and the level max_level
96 , max_rcv = sapply(
97 X = 1:n_features
98 , FUN = function(i) {
99 rcvOfFeatureBySampleClassLevel[maxApplyMedianOfFeatureBySampleClassLevel[i], i]
100 }
101 )
102 # the mean of the medians of intensity for all class-levels for the feature
103 , mean_median = meanApplyMedianOfFeatureBySampleClassLevel
104 # don't coerce strings to factors (this is a parameter for the data.frame constructor, not a column of the data.frame)
105 , stringsAsFactors = FALSE
106 )
107 # raw salience is the ratio of the most-prominent level to the mean of all levels for the feature
108 salience_df$salience <- sapply(
109 X = 1:nrow(salience_df)
110 , FUN = function(i) with(salience_df[i,], if (mean_median > 0) { max_median / mean_median } else { 0 } )
111 )
112 # "robust coefficient of variation, i.e., mad(feature-intensity for class-level max_level) / median(feature-intensity for class-level max_level)
113 salience_df$salient_rcv <- salience_df$max_rcv
114
115 return (salience_df)
116 }
117