Mercurial > repos > eschen42 > w4mcorcov
view w4mcorcov_salience.R @ 13:2ae2d26e3270 draft
planemo upload for repository https://github.com/HegemanLab/w4mcorcov_galaxy_wrapper/tree/master commit e89c652c0849eb1d5a1e6c9100c72c64a8d388b4
author | eschen42 |
---|---|
date | Wed, 12 Dec 2018 09:20:02 -0500 |
parents | 06c51af11531 |
children |
line wrap: on
line source
w4msalience <- function( data_matrix # a matrix of intensities; features as rows, and samples as columns , sample_class # a vector of sample class-levels; length(sample_class) == ncol(data_matrix) , failure_action = stop ) { library(stats) # begin sanity checks if ( !is.vector(sample_class) || !( is.character(sample_class) || is.factor(sample_class) ) ) { failure_action("w4msalience: Expected sample_class to be a vector of characters of factor-levels") return (NULL) } if ( !is.matrix(data_matrix) && !is.data.frame(data_matrix) ) { failure_action("w4msalience: Expected data_matrix to be a matrix (or data.frame) of numeric") return (NULL) } # transpose data_matrix so that columns are the features t_data_matrix <- t(data_matrix) if ( !is.matrix(t_data_matrix) || !is.numeric(t_data_matrix) ) { failure_action("w4msalience: Expected data_matrix to be a matrix (or data.frame) of numeric") return (NULL) } feature_names <- colnames(t_data_matrix) n_features <- ncol(t_data_matrix) n_samples <- nrow(t_data_matrix) if ( length(sample_class) != n_samples ) { strF(data_matrix) strF(sample_class) failure_action( sprintf( "w4msalience: The data_matrix has %d samples but sample_class has %d" , n_samples , length(sample_class) ) ) return (NULL) } # end sanity checks # "For each feature, 'select sample_class, median(intensity) from feature group by sample_class'." # The first column(s) of the result of aggregate has the classifier value(s) specified in the 'by' list. medianOfFeatureBySampleClassLevel <- aggregate( x = as.data.frame(t_data_matrix) , by = list(sample_class) , FUN = "median" ) # "For each feature, 'select sample_class, rcv(intensity) from feature group by sample_class'." # cv is less robust; deviation from normality degrades performance # cv(x) == sd(x) / mean(x) # rcv is a "robust" coefficient of variation, expressed as a proportion # rcv(x) == mad(x) / median(x) madOfFeatureBySampleClassLevel <- aggregate( x = as.data.frame(t_data_matrix) , by = list(sample_class) , FUN = "mad" ) # Note that `apply(X=array(1:10), MARGIN = 1, FUN = function(x) return(c(x,x^2)))` # produces a matrix with two rows and ten columns my_list <- apply( X = array(1:n_features) , MARGIN = 1 , FUN = function(x) { my_df <- data.frame( median = medianOfFeatureBySampleClassLevel[ , 1 + x] , mad = madOfFeatureBySampleClassLevel[ , 1 + x] ) my_df$salient_level <- medianOfFeatureBySampleClassLevel[ , 1] my_df <- my_df[ order(my_df$median, decreasing = TRUE), ] my_dist_df <- my_df[ 1:2, ] # "robust coefficient of variation", i.e., # mad(feature-intensity for class-level max_level) / median(feature-intensity for class-level max_level) rcv_result <- my_dist_df$mad[1] / my_dist_df$median[1] dist_result <- ( my_dist_df$median[1] - my_dist_df$median[2] ) / sqrt( my_dist_df$mad[1] * my_dist_df$mad[2] ) if (is.infinite(dist_result) || is.nan(dist_result)) dist_result <- 0 mean_median <- mean(my_df$median) salience_result <- if (mean_median > 0) my_df$median[1] / mean_median else 0 return ( data.frame( dist_result = dist_result , max_median = my_df$median[1] , mean_median = mean_median , salience_result = salience_result , salient_level = my_df$salient_level[1] , rcv_result = rcv_result ) ) } ) results_matrix <- sapply(X = 1:n_features, FUN = function(i) my_list[[i]]) results_df <- as.data.frame(t(results_matrix)) relative_salient_distance <- unlist(results_df$dist_result) salience <- unlist(results_df$salience_result) salient_level <- unlist(results_df$salient_level) max_median <- unlist(results_df$max_median) mean_median <- unlist(results_df$mean_median) rcv_result <- unlist(results_df$rcv_result) salience_df <- data.frame( # the feature name feature = feature_names # the name (or factor-level) of the class-level with the highest median intensity for the feature , max_level = salient_level # the median intensity for the feature and the level max_level , max_median = max_median # the distance between the maximum intensities for the feature at the two highest levels , relative_salient_distance = relative_salient_distance # the coefficient of variation (expressed as a proportion) for the intensity for the feature and the level max_level , salience_rcv = rcv_result # the mean of the medians of intensity for all class-levels for the feature , mean_median = mean_median # raw salience is the ratio of the most-prominent level to the mean of all levels for the feature , salience = salience # don't coerce strings to factors (this is a parameter for the data.frame constructor, not a column of the data.frame) , stringsAsFactors = FALSE ) return (salience_df) }