diff w4mcorcov_salience.R @ 13:2ae2d26e3270 draft

planemo upload for repository https://github.com/HegemanLab/w4mcorcov_galaxy_wrapper/tree/master commit e89c652c0849eb1d5a1e6c9100c72c64a8d388b4
author eschen42
date Wed, 12 Dec 2018 09:20:02 -0500
parents 06c51af11531
children
line wrap: on
line diff
--- a/w4mcorcov_salience.R	Thu Nov 08 23:06:09 2018 -0500
+++ b/w4mcorcov_salience.R	Wed Dec 12 09:20:02 2018 -0500
@@ -19,14 +19,21 @@
     failure_action("w4msalience:  Expected data_matrix to be a matrix (or data.frame) of numeric")
     return (NULL)
   }
+
+  feature_names <- colnames(t_data_matrix)
+
   n_features <- ncol(t_data_matrix)
-  n_features_plus_1 <- 1 + n_features
-  features   <- colnames(t_data_matrix)
   n_samples  <- nrow(t_data_matrix)
   if ( length(sample_class) != n_samples ) {
     strF(data_matrix)
     strF(sample_class)
-    failure_action(sprintf("w4msalience:  The data_matrix has %d samples but sample_class has %d", n_samples, length(sample_class)))
+    failure_action(
+      sprintf(
+        "w4msalience:  The data_matrix has %d samples but sample_class has %d"
+      , n_samples
+      , length(sample_class)
+      )
+    )
     return (NULL)
   }
   # end sanity checks
@@ -39,13 +46,6 @@
     , FUN = "median"
     )
 
-  # "For each feature, 'select sample_class, max(intensity) from feature group by sample_class'."
-  maxOfFeatureBySampleClassLevel <- aggregate(
-      x = as.data.frame(t_data_matrix)
-    , by = list(sample_class)
-    , FUN = "max"
-    )
-
   # "For each feature, 'select sample_class, rcv(intensity) from feature group by sample_class'."
   #   cv is less robust; deviation from normality degrades performance
   #     cv(x) == sd(x) / mean(x)
@@ -56,61 +56,72 @@
     , by = list(sample_class)
     , FUN = "mad"
   )
-  rcvOfFeatureBySampleClassLevel <- as.matrix(
-    madOfFeatureBySampleClassLevel[,2:n_features_plus_1] / medianOfFeatureBySampleClassLevel[,2:n_features_plus_1]
-  )
-  rcvOfFeatureBySampleClassLevel[is.nan(rcvOfFeatureBySampleClassLevel)] <- max(9999,max(rcvOfFeatureBySampleClassLevel, na.rm = TRUE)) 
 
-  # "For each feature, 'select max(max_feature_intensity) from feature'."
-  maxApplyMaxOfFeatureBySampleClassLevel <- sapply(
-      X = 1:n_features
-    , FUN = function(i) {
-        match(
-          max(maxOfFeatureBySampleClassLevel[, i + 1])
-        , maxOfFeatureBySampleClassLevel[, i + 1]
-        )
-      }
-    )
-
-  # "For each feature, 'select mean(median_feature_intensity) from feature'."
-  meanApplyMedianOfFeatureBySampleClassLevel <- sapply(
-      X = 1:n_features
-    , FUN = function(i) mean(medianOfFeatureBySampleClassLevel[, i + 1])
-    )
+  # Note that `apply(X=array(1:10), MARGIN = 1, FUN = function(x) return(c(x,x^2)))`
+  #   produces a matrix with two rows and ten columns
 
-  # Compute the 'salience' for each feature, i.e., how salient the intensity of a feature
-  #   is for one particular class-level relative to the intensity across all class-levels.
-  salience_df <- data.frame(
-    # the feature name
-    feature = features
-    # the name (or factor-level) of the class-level with the highest median intensity for the feature
-  , max_level = medianOfFeatureBySampleClassLevel[maxApplyMaxOfFeatureBySampleClassLevel,1]
-    # the median intensity for the feature and the level max_level
-  , max_median = sapply(
-        X = 1:n_features
-      , FUN = function(i) {
-          maxOfFeatureBySampleClassLevel[maxApplyMaxOfFeatureBySampleClassLevel[i], 1 + i]
-        }
+  my_list <- apply(
+    X = array(1:n_features)
+  , MARGIN = 1
+  , FUN = function(x) {
+      my_df <- data.frame(
+        median = medianOfFeatureBySampleClassLevel[ , 1 + x]
+      , mad = madOfFeatureBySampleClassLevel[ , 1 + x]
+      )
+      my_df$salient_level <- medianOfFeatureBySampleClassLevel[ , 1]
+      my_df <- my_df[ order(my_df$median, decreasing = TRUE), ]
+      my_dist_df <- my_df[  1:2, ]
+      # "robust coefficient of variation", i.e.,
+      #    mad(feature-intensity for class-level max_level) / median(feature-intensity for class-level max_level)
+      rcv_result <- my_dist_df$mad[1] / my_dist_df$median[1]
+      dist_result <-
+        ( my_dist_df$median[1] - my_dist_df$median[2] ) /
+        sqrt( my_dist_df$mad[1] * my_dist_df$mad[2] )
+      if (is.infinite(dist_result) || is.nan(dist_result))
+        dist_result <- 0
+      mean_median <- mean(my_df$median)
+      salience_result <- if (mean_median > 0) my_df$median[1] / mean_median else 0
+      return (
+        data.frame(
+          dist_result     = dist_result
+        , max_median      = my_df$median[1]
+        , mean_median     = mean_median
+        , salience_result = salience_result
+        , salient_level   = my_df$salient_level[1]
+        , rcv_result      = rcv_result
+        )
+      )
+    }
+  )
+  results_matrix  <- sapply(X = 1:n_features, FUN = function(i) my_list[[i]])
+  results_df <- as.data.frame(t(results_matrix))
+
+  relative_salient_distance <- unlist(results_df$dist_result)
+  salience <- unlist(results_df$salience_result)
+  salient_level <- unlist(results_df$salient_level)
+  max_median <- unlist(results_df$max_median)
+  mean_median <- unlist(results_df$mean_median)
+  rcv_result <- unlist(results_df$rcv_result)
+
+  salience_df <-
+    data.frame(
+      # the feature name
+      feature                   = feature_names
+      # the name (or factor-level) of the class-level with the highest median intensity for the feature
+    , max_level                 = salient_level
+      # the median intensity for the feature and the level max_level
+    , max_median                = max_median
+      # the distance between the maximum intensities for the feature at the two highest levels
+    , relative_salient_distance = relative_salient_distance
+      # the coefficient of variation (expressed as a proportion) for the intensity for the feature and the level max_level
+    , salience_rcv              = rcv_result
+      # the mean of the medians of intensity for all class-levels for the feature
+    , mean_median               = mean_median
+      # raw salience is the ratio of the most-prominent level to the mean of all levels for the feature
+    , salience                  = salience
+      # don't coerce strings to factors (this is a parameter for the data.frame constructor, not a column of the data.frame)
+    , stringsAsFactors = FALSE
     )
-    # the coefficient of variation (expressed as a proportion) for the intensity for the feature and the level max_level
-  , max_rcv = sapply(
-        X = 1:n_features
-      , FUN = function(i) {
-          rcvOfFeatureBySampleClassLevel[maxApplyMaxOfFeatureBySampleClassLevel[i], i]
-        }
-    )
-    # the mean of the medians of intensity for all class-levels for the feature
-  , mean_median = meanApplyMedianOfFeatureBySampleClassLevel
-    # don't coerce strings to factors (this is a parameter for the data.frame constructor, not a column of the data.frame)
-  , stringsAsFactors = FALSE
-  )
-  # raw salience is the ratio of the most-prominent level to the mean of all levels for the feature
-  salience_df$salience <- sapply(
-      X = 1:nrow(salience_df)
-    , FUN = function(i) with(salience_df[i,], if (mean_median > 0) { max_median / mean_median } else { 0 } )
-    )
-  # "robust coefficient of variation, i.e., mad(feature-intensity for class-level max_level) / median(feature-intensity for class-level max_level)
-  salience_df$salient_rcv <- salience_df$max_rcv
 
   return (salience_df)
 }