comparison MannWhitney_DE.R @ 4:6916ac5a9ef0 draft default tip

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/gsc_mannwhitney_de commit c394391dcf541d91ee1dfdc0c3d80cd7a21942ff
author artbio
date Thu, 30 Nov 2023 02:03:53 +0000
parents 3d86c89f15bf
children
comparison
equal deleted inserted replaced
3:3d86c89f15bf 4:6916ac5a9ef0
1 #################### 1 # Perform a differential analysis between 2 groups of cells.
2 # Differential #
3 # analysis #
4 ####################
5
6 # Perform a differential analysis between 2
7 # groups of cells.
8 2
9 # Example of command 3 # Example of command
10 # Rscript MannWhitney_DE.R --input <input.tsv> --sep <tab> --colnames <TRUE> --metadata <signature.tsv> --column_name <rate> --fdr <0.01> --output <diff_analysis.tsv> 4 # Rscript MannWhitney_DE.R --input <input.tsv> --sep <tab> --colnames <TRUE> --metadata <signature.tsv> --column_name <rate> --fdr <0.01> --output <diff_analysis.tsv>
11 5
12 # load packages that are provided in the conda env 6 options(show.error.messages = FALSE,
13 options( show.error.messages=F, 7 error = function() {
14 error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) 8 cat(geterrmessage(), file = stderr())
9 q("no", 1, FALSE)
10 }
11 )
12
15 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") 13 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
16 warnings()
17 library(optparse)
18 14
19 #Arguments 15 suppressPackageStartupMessages({
20 option_list = list( 16 library(optparse)
17 })
18
19 sessionInfo()
20
21 option_list <- list(
21 make_option( 22 make_option(
22 "--input", 23 "--input",
23 default = NA, 24 default = NA,
24 type = 'character', 25 type = "character",
25 help = "Input file that contains log2(CPM +1) values" 26 help = "Input file that contains log2(CPM +1) values"
26 ), 27 ),
27 make_option( 28 make_option(
28 "--sep", 29 "--sep",
29 default = '\t', 30 default = "\t",
30 type = 'character', 31 type = "character",
31 help = "File separator [default : '%default' ]" 32 help = "File separator [default : '%default' ]"
32 ), 33 ),
33 make_option( 34 make_option(
34 "--colnames", 35 "--colnames",
35 default = TRUE, 36 default = TRUE,
36 type = 'logical', 37 type = "logical",
37 help = "Consider first line as header ? [default : '%default' ]" 38 help = "Consider first line as header ? [default : '%default' ]"
38 ), 39 ),
39 make_option( 40 make_option(
40 "--comparison_factor_file", 41 "--comparison_factor_file",
41 default = NA, 42 default = NA,
42 type = 'character', 43 type = "character",
43 help = " A two column table : cell identifiers and a comparison factor that split cells in two categories (high/low, HOM/HET,...)" 44 help = " A two column table : cell identifiers and a comparison factor that split cells in two categories (high/low, HOM/HET,...)"
44 ), 45 ),
45 make_option( 46 make_option(
46 "--factor1", 47 "--factor1",
47 type = 'character', 48 type = "character",
48 help = "level associated to the control condition in the factor file" 49 help = "level associated to the control condition in the factor file"
49 ), 50 ),
50 make_option( 51 make_option(
51 "--factor2", 52 "--factor2",
52 type = 'character', 53 type = "character",
53 help = "level associated to the test condition in the factor file" 54 help = "level associated to the test condition in the factor file"
54 ), 55 ),
55 make_option( 56 make_option(
56 "--fdr", 57 "--fdr",
57 default = 0.01, 58 default = 0.01,
58 type = 'numeric', 59 type = "numeric",
59 help = "FDR threshold [default : '%default' ]" 60 help = "FDR threshold [default : '%default' ]"
60 ), 61 ),
61 make_option( 62 make_option(
62 "--log", 63 "--log",
63 default=FALSE, 64 default = FALSE,
64 action="store_true", 65 action = "store_true",
65 type = 'logical', 66 type = "logical",
66 help = "Expression data are log-transformed [default : '%default' ]" 67 help = "Expression data are log-transformed [default : '%default' ]"
67 ), 68 ),
68 make_option( 69 make_option(
69 "--output", 70 "--output",
70 default = "results.tsv", 71 default = "results.tsv",
71 type = 'character', 72 type = "character",
72 help = "Output name [default : '%default' ]" 73 help = "Output name [default : '%default' ]"
73 ) 74 )
74 ) 75 )
75 76
76 opt = parse_args(OptionParser(option_list = option_list), 77 opt <- parse_args(OptionParser(option_list = option_list),
77 args = commandArgs(trailingOnly = TRUE)) 78 args = commandArgs(trailingOnly = TRUE))
78 79
79 if (opt$sep == "tab") {opt$sep = "\t"} 80 if (opt$sep == "tab") {
80 if (opt$sep == "comma") {opt$sep = ","} 81 opt$sep <- "\t"
82 }
83 if (opt$sep == "comma") {
84 opt$sep <- ","
85 }
81 86
82 #Open files 87 #Open files
83 data.counts <- read.table( 88 data.counts <- read.table(
84 opt$input, 89 opt$input,
85 h = opt$colnames, 90 h = opt$colnames,
86 row.names = 1, 91 row.names = 1,
87 sep = opt$sep, 92 sep = opt$sep,
88 check.names = F 93 check.names = FALSE
89 ) 94 )
90 95
91 metadata <- read.table( 96 metadata <- read.table(
92 opt$comparison_factor_file, 97 opt$comparison_factor_file,
93 header = TRUE, 98 header = TRUE,
94 stringsAsFactors = F, 99 stringsAsFactors = FALSE,
95 sep = "\t", 100 sep = "\t",
96 check.names = FALSE, 101 check.names = FALSE,
97 row.names = 1 102 row.names = 1
98 ) 103 )
99 104
100 metadata <- subset(metadata, rownames(metadata) %in% colnames(data.counts)) 105 metadata <- subset(metadata, rownames(metadata) %in% colnames(data.counts))
101 106
102 # Create two logical named vectors for each factor level of cell signature 107 # Create two logical named vectors for each factor level of cell signature
103 factor1_cells <- setNames(metadata[,1] == opt$factor1, rownames(metadata)) 108 factor1_cells <- setNames(metadata[, 1] == opt$factor1, rownames(metadata))
104 factor2_cells <- setNames(metadata[,1] == opt$factor2, rownames(metadata)) 109 factor2_cells <- setNames(metadata[, 1] == opt$factor2, rownames(metadata))
105 110
106 ## Mann-Whitney test (Two-sample Wilcoxon test) 111 ## Mann-Whitney test (Two-sample Wilcoxon test)
107 MW_test <- data.frame(t(apply(data.counts, 1, function(x) { 112 MW_test <- data.frame(t(apply(data.counts, 1, function(x) {
108 do.call("cbind", wilcox.test(x[names(factor1_cells)[factor1_cells]], x[names(factor2_cells)[factor2_cells]]))[, 1:2] 113 do.call("cbind", wilcox.test(x[names(factor1_cells)[factor1_cells]], x[names(factor2_cells)[factor2_cells]]))[, 1:2]
109 })), stringsAsFactors = F) 114 })), stringsAsFactors = FALSE)
110 115
111 # Benjamini-Hochberg correction and significativity 116 # Benjamini-Hochberg correction and significativity
112 MW_test$p.adjust <- p.adjust(as.numeric(MW_test$p.value), method = "BH" , n = nrow(MW_test)) 117 MW_test$p.adjust <- p.adjust(as.numeric(MW_test$p.value), method = "BH", n = nrow(MW_test))
113 # MW_test$Critical.value <- (rank(MW_test$p.value) / nrow(MW_test)) * opt$fdr
114 MW_test$Significant <- MW_test$p.adjust < opt$fdr 118 MW_test$Significant <- MW_test$p.adjust < opt$fdr
115 119
116 ## Descriptive Statistics Function 120 ## Descriptive Statistics Function
117 descriptive_stats <- function(InputData) { 121 descriptive_stats <- function(InputData) {
118 SummaryData = data.frame( 122 SummaryData <- data.frame(
119 mean = rowMeans(InputData), 123 mean = rowMeans(InputData),
120 SD = apply(InputData, 1, sd), 124 SD = apply(InputData, 1, sd),
121 Variance = apply(InputData, 1, var), 125 Variance = apply(InputData, 1, var),
122 Percentage_Detection = apply(InputData, 1, function(x, y = InputData) { 126 Percentage_Detection = apply(InputData, 1, function(x, y = InputData) {
123 (sum(x != 0) / ncol(y)) * 100 127 (sum(x != 0) / ncol(y)) * 100
124 }), 128 }),
125 mean_condition2 = rowMeans(InputData[,factor2_cells]), 129 mean_condition2 = rowMeans(InputData[, factor2_cells]),
126 mean_condition1 = rowMeans(InputData[, factor1_cells]) 130 mean_condition1 = rowMeans(InputData[, factor1_cells])
127 ) 131 )
128 if(opt$log) { 132 if (opt$log) {
129 SummaryData$log2FC <- SummaryData$mean_condition2 - SummaryData$mean_condition1 133 SummaryData$log2FC <- SummaryData$mean_condition2 - SummaryData$mean_condition1
130 } else { 134 } else {
131 SummaryData$log2FC <- log2(SummaryData$mean_condition2 / SummaryData$mean_condition1) 135 SummaryData$log2FC <- log2(SummaryData$mean_condition2 / SummaryData$mean_condition1)
132 } 136 }
133 return(SummaryData) 137 return(SummaryData)
134 } 138 }
135 139
136 gene_stats <- descriptive_stats(data.counts) 140 gene_stats <- descriptive_stats(data.counts)
137 141
138 results <- merge(gene_stats, MW_test, by = "row.names") 142 results <- merge(gene_stats, MW_test, by = "row.names")
139 colnames(results)[1] <- "genes" 143 colnames(results)[1] <- "genes"
140 144
141 ## Annotate Significant column 145 ## Annotate Significant column
142 results$Significant[results$Significant == T & !is.na(results$Significant)] <- ifelse(subset(results, Significant == T)$log2FC > 0, "UP", "DOWN") 146 results$Significant[results$Significant == TRUE & !is.na(results$Significant)] <- ifelse(subset(results, Significant == TRUE)$log2FC > 0, "UP", "DOWN")
143 results$Significant[results$Significant == F & !is.na(results$Significant)] <- "NS" 147 results$Significant[results$Significant == FALSE & !is.na(results$Significant)] <- "NS"
144 148
145 149
146 # Save files 150 # Save files
147 write.table( 151 write.table(
148 results[order(results$p.adjust),], 152 results[order(results$p.adjust), ],
149 opt$output, 153 opt$output,
150 sep = "\t", 154 sep = "\t",
151 quote = F, 155 quote = FALSE,
152 col.names = T, 156 col.names = TRUE,
153 row.names = F 157 row.names = FALSE
154 ) 158 )