Mercurial > repos > eschen42 > w4mcorcov

diff w4mcorcov_salience.R @ 0:23f9fad4edfc draft
planemo upload for repository https://github.com/HegemanLab/w4mcorcov_galaxy_wrapper/tree/master commit bd26542b811de06c1a877337a2840a9f899c2b94
author: eschen42
date: Mon, 16 Oct 2017 14:56:52 -0400
children: 06c51af11531
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/w4mcorcov_salience.R	Mon Oct 16 14:56:52 2017 -0400
@@ -0,0 +1,117 @@
+w4msalience <- function(
+  data_matrix   # a matrix of intensities; features as rows, and samples as columns
+, sample_class  # a vector of sample class-levels; length(sample_class) == ncol(data_matrix)
+, failure_action = stop
+) {
+  library(stats)
+  # begin sanity checks
+  if ( !is.vector(sample_class) || !( is.character(sample_class) || is.factor(sample_class) ) ) {
+    failure_action("w4msalience:  Expected sample_class to be a vector of characters of factor-levels")
+    return (NULL)
+  }
+  if ( !is.matrix(data_matrix) && !is.data.frame(data_matrix) ) {
+    failure_action("w4msalience:  Expected data_matrix to be a matrix (or data.frame) of numeric")
+    return (NULL)
+  }
+  # transpose data_matrix so that columns are the features
+  t_data_matrix <- t(data_matrix)
+  if ( !is.matrix(t_data_matrix) || !is.numeric(t_data_matrix) ) {
+    failure_action("w4msalience:  Expected data_matrix to be a matrix (or data.frame) of numeric")
+    return (NULL)
+  }
+  n_features <- ncol(t_data_matrix)
+  n_features_plus_1 <- 1 + n_features
+  features   <- colnames(t_data_matrix)
+  n_samples  <- nrow(t_data_matrix)
+  if ( length(sample_class) != n_samples ) {
+    strF(data_matrix)
+    strF(sample_class)
+    failure_action(sprintf("w4msalience:  The data_matrix has %d samples but sample_class has %d", n_samples, length(sample_class)))
+    return (NULL)
+  }
+  # end sanity checks
+
+  # "For each feature, 'select sample_class, median(intensity) from feature group by sample_class'."
+  # The first column(s) of the result of aggregate has the classifier value(s) specified in the 'by' list.
+  medianOfFeatureBySampleClassLevel <- aggregate(
+      x = as.data.frame(t_data_matrix)
+    , by = list(sample_class)
+    , FUN = "median"
+    )
+
+  # "For each feature, 'select sample_class, max(intensity) from feature group by sample_class'."
+  maxOfFeatureBySampleClassLevel <- aggregate(
+      x = as.data.frame(t_data_matrix)
+    , by = list(sample_class)
+    , FUN = "max"
+    )
+
+  # "For each feature, 'select sample_class, rcv(intensity) from feature group by sample_class'."
+  #   cv is less robust; deviation from normality degrades performance
+  #     cv(x) == sd(x) / mean(x)
+  #   rcv is a "robust" coefficient of variation, expressed as a proportion
+  #     rcv(x) == mad(x) / median(x)
+  madOfFeatureBySampleClassLevel <- aggregate(
+      x = as.data.frame(t_data_matrix)
+    , by = list(sample_class)
+    , FUN = "mad"
+  )
+  rcvOfFeatureBySampleClassLevel <- as.matrix(
+    madOfFeatureBySampleClassLevel[,2:n_features_plus_1] / medianOfFeatureBySampleClassLevel[,2:n_features_plus_1]
+  )
+  rcvOfFeatureBySampleClassLevel[is.nan(rcvOfFeatureBySampleClassLevel)] <- max(9999,max(rcvOfFeatureBySampleClassLevel, na.rm = TRUE)) 
+
+  # "For each feature, 'select max(median_feature_intensity) from feature'."
+  maxApplyMedianOfFeatureBySampleClassLevel <- sapply(
+      X = 1:n_features
+    , FUN = function(i) {
+        match(
+          max(maxOfFeatureBySampleClassLevel[, i + 1])
+        , maxOfFeatureBySampleClassLevel[, i + 1]
+        )
+      }
+    )
+
+  # "For each feature, 'select mean(median_feature_intensity) from feature'."
+  meanApplyMedianOfFeatureBySampleClassLevel <- sapply(
+      X = 1:n_features
+    , FUN = function(i) mean(medianOfFeatureBySampleClassLevel[, i + 1])
+    )
+
+  # Compute the 'salience' for each feature, i.e., how salient the intensity of a feature
+  #   is for one particular class-level relative to the intensity across all class-levels.
+  salience_df <- data.frame(
+    # the feature name
+    feature = features
+    # the name (or factor-level) of the class-level with the highest median intensity for the feature
+  , max_level = medianOfFeatureBySampleClassLevel[maxApplyMedianOfFeatureBySampleClassLevel,1]
+    # the median intensity for the feature and the level max_level
+  , max_median = sapply(
+        X = 1:n_features
+      , FUN = function(i) {
+          maxOfFeatureBySampleClassLevel[maxApplyMedianOfFeatureBySampleClassLevel[i], 1 + i]
+        }
+    )
+    # the coefficient of variation (expressed as a proportion) for the intensity for the feature and the level max_level
+  , max_rcv = sapply(
+        X = 1:n_features
+      , FUN = function(i) {
+          rcvOfFeatureBySampleClassLevel[maxApplyMedianOfFeatureBySampleClassLevel[i], i]
+        }
+    )
+    # the mean of the medians of intensity for all class-levels for the feature
+  , mean_median = meanApplyMedianOfFeatureBySampleClassLevel
+    # don't coerce strings to factors (this is a parameter for the data.frame constructor, not a column of the data.frame)
+  , stringsAsFactors = FALSE
+  )
+  # raw salience is the ratio of the most-prominent level to the mean of all levels for the feature
+  salience_df$salience <- sapply(
+      X = 1:nrow(salience_df)
+    , FUN = function(i) with(salience_df[i,], if (mean_median > 0) { max_median / mean_median } else { 0 } )
+    )
+  # "robust coefficient of variation, i.e., mad(feature-intensity for class-level max_level) / median(feature-intensity for class-level max_level)
+  salience_df$salient_rcv <- salience_df$max_rcv
+
+  return (salience_df)
+}
+
author	eschen42
date	Mon, 16 Oct 2017 14:56:52 -0400
parents
children	06c51af11531