Mercurial > repos > eschen42 > w4mcorcov
diff w4mcorcov_salience.R @ 13:2ae2d26e3270 draft
planemo upload for repository https://github.com/HegemanLab/w4mcorcov_galaxy_wrapper/tree/master commit e89c652c0849eb1d5a1e6c9100c72c64a8d388b4
author | eschen42 |
---|---|
date | Wed, 12 Dec 2018 09:20:02 -0500 |
parents | 06c51af11531 |
children |
line wrap: on
line diff
--- a/w4mcorcov_salience.R Thu Nov 08 23:06:09 2018 -0500 +++ b/w4mcorcov_salience.R Wed Dec 12 09:20:02 2018 -0500 @@ -19,14 +19,21 @@ failure_action("w4msalience: Expected data_matrix to be a matrix (or data.frame) of numeric") return (NULL) } + + feature_names <- colnames(t_data_matrix) + n_features <- ncol(t_data_matrix) - n_features_plus_1 <- 1 + n_features - features <- colnames(t_data_matrix) n_samples <- nrow(t_data_matrix) if ( length(sample_class) != n_samples ) { strF(data_matrix) strF(sample_class) - failure_action(sprintf("w4msalience: The data_matrix has %d samples but sample_class has %d", n_samples, length(sample_class))) + failure_action( + sprintf( + "w4msalience: The data_matrix has %d samples but sample_class has %d" + , n_samples + , length(sample_class) + ) + ) return (NULL) } # end sanity checks @@ -39,13 +46,6 @@ , FUN = "median" ) - # "For each feature, 'select sample_class, max(intensity) from feature group by sample_class'." - maxOfFeatureBySampleClassLevel <- aggregate( - x = as.data.frame(t_data_matrix) - , by = list(sample_class) - , FUN = "max" - ) - # "For each feature, 'select sample_class, rcv(intensity) from feature group by sample_class'." # cv is less robust; deviation from normality degrades performance # cv(x) == sd(x) / mean(x) @@ -56,61 +56,72 @@ , by = list(sample_class) , FUN = "mad" ) - rcvOfFeatureBySampleClassLevel <- as.matrix( - madOfFeatureBySampleClassLevel[,2:n_features_plus_1] / medianOfFeatureBySampleClassLevel[,2:n_features_plus_1] - ) - rcvOfFeatureBySampleClassLevel[is.nan(rcvOfFeatureBySampleClassLevel)] <- max(9999,max(rcvOfFeatureBySampleClassLevel, na.rm = TRUE)) - # "For each feature, 'select max(max_feature_intensity) from feature'." - maxApplyMaxOfFeatureBySampleClassLevel <- sapply( - X = 1:n_features - , FUN = function(i) { - match( - max(maxOfFeatureBySampleClassLevel[, i + 1]) - , maxOfFeatureBySampleClassLevel[, i + 1] - ) - } - ) - - # "For each feature, 'select mean(median_feature_intensity) from feature'." - meanApplyMedianOfFeatureBySampleClassLevel <- sapply( - X = 1:n_features - , FUN = function(i) mean(medianOfFeatureBySampleClassLevel[, i + 1]) - ) + # Note that `apply(X=array(1:10), MARGIN = 1, FUN = function(x) return(c(x,x^2)))` + # produces a matrix with two rows and ten columns - # Compute the 'salience' for each feature, i.e., how salient the intensity of a feature - # is for one particular class-level relative to the intensity across all class-levels. - salience_df <- data.frame( - # the feature name - feature = features - # the name (or factor-level) of the class-level with the highest median intensity for the feature - , max_level = medianOfFeatureBySampleClassLevel[maxApplyMaxOfFeatureBySampleClassLevel,1] - # the median intensity for the feature and the level max_level - , max_median = sapply( - X = 1:n_features - , FUN = function(i) { - maxOfFeatureBySampleClassLevel[maxApplyMaxOfFeatureBySampleClassLevel[i], 1 + i] - } + my_list <- apply( + X = array(1:n_features) + , MARGIN = 1 + , FUN = function(x) { + my_df <- data.frame( + median = medianOfFeatureBySampleClassLevel[ , 1 + x] + , mad = madOfFeatureBySampleClassLevel[ , 1 + x] + ) + my_df$salient_level <- medianOfFeatureBySampleClassLevel[ , 1] + my_df <- my_df[ order(my_df$median, decreasing = TRUE), ] + my_dist_df <- my_df[ 1:2, ] + # "robust coefficient of variation", i.e., + # mad(feature-intensity for class-level max_level) / median(feature-intensity for class-level max_level) + rcv_result <- my_dist_df$mad[1] / my_dist_df$median[1] + dist_result <- + ( my_dist_df$median[1] - my_dist_df$median[2] ) / + sqrt( my_dist_df$mad[1] * my_dist_df$mad[2] ) + if (is.infinite(dist_result) || is.nan(dist_result)) + dist_result <- 0 + mean_median <- mean(my_df$median) + salience_result <- if (mean_median > 0) my_df$median[1] / mean_median else 0 + return ( + data.frame( + dist_result = dist_result + , max_median = my_df$median[1] + , mean_median = mean_median + , salience_result = salience_result + , salient_level = my_df$salient_level[1] + , rcv_result = rcv_result + ) + ) + } + ) + results_matrix <- sapply(X = 1:n_features, FUN = function(i) my_list[[i]]) + results_df <- as.data.frame(t(results_matrix)) + + relative_salient_distance <- unlist(results_df$dist_result) + salience <- unlist(results_df$salience_result) + salient_level <- unlist(results_df$salient_level) + max_median <- unlist(results_df$max_median) + mean_median <- unlist(results_df$mean_median) + rcv_result <- unlist(results_df$rcv_result) + + salience_df <- + data.frame( + # the feature name + feature = feature_names + # the name (or factor-level) of the class-level with the highest median intensity for the feature + , max_level = salient_level + # the median intensity for the feature and the level max_level + , max_median = max_median + # the distance between the maximum intensities for the feature at the two highest levels + , relative_salient_distance = relative_salient_distance + # the coefficient of variation (expressed as a proportion) for the intensity for the feature and the level max_level + , salience_rcv = rcv_result + # the mean of the medians of intensity for all class-levels for the feature + , mean_median = mean_median + # raw salience is the ratio of the most-prominent level to the mean of all levels for the feature + , salience = salience + # don't coerce strings to factors (this is a parameter for the data.frame constructor, not a column of the data.frame) + , stringsAsFactors = FALSE ) - # the coefficient of variation (expressed as a proportion) for the intensity for the feature and the level max_level - , max_rcv = sapply( - X = 1:n_features - , FUN = function(i) { - rcvOfFeatureBySampleClassLevel[maxApplyMaxOfFeatureBySampleClassLevel[i], i] - } - ) - # the mean of the medians of intensity for all class-levels for the feature - , mean_median = meanApplyMedianOfFeatureBySampleClassLevel - # don't coerce strings to factors (this is a parameter for the data.frame constructor, not a column of the data.frame) - , stringsAsFactors = FALSE - ) - # raw salience is the ratio of the most-prominent level to the mean of all levels for the feature - salience_df$salience <- sapply( - X = 1:nrow(salience_df) - , FUN = function(i) with(salience_df[i,], if (mean_median > 0) { max_median / mean_median } else { 0 } ) - ) - # "robust coefficient of variation, i.e., mad(feature-intensity for class-level max_level) / median(feature-intensity for class-level max_level) - salience_df$salient_rcv <- salience_df$max_rcv return (salience_df) }