comparison MannWhitney_DE.R @ 0:c67dba545a37 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/gsc_mannwhitney_de commit 09dcd74dbc01f448518cf3db3e646afb0675a6fe
author artbio
date Mon, 24 Jun 2019 13:39:39 -0400
parents
children 3d86c89f15bf
comparison
equal deleted inserted replaced
-1:000000000000 0:c67dba545a37
1 ####################
2 # Differential #
3 # analysis #
4 ####################
5
6 # Perform a differential analysis between 2
7 # groups of cells.
8
9 # Example of command
10 # Rscript MannWhitney_DE.R --input <input.tsv> --sep <tab> --colnames <TRUE> --metadata <signature.tsv> --column_name <rate> --fdr <0.01> --output <diff_analysis.tsv>
11
12 # load packages that are provided in the conda env
13 options( show.error.messages=F,
14 error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
15 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
16 warnings()
17 library(optparse)
18
19 #Arguments
20 option_list = list(
21 make_option(
22 "--input",
23 default = NA,
24 type = 'character',
25 help = "Input file that contains log2(CPM +1) values"
26 ),
27 make_option(
28 "--sep",
29 default = '\t',
30 type = 'character',
31 help = "File separator [default : '%default' ]"
32 ),
33 make_option(
34 "--colnames",
35 default = TRUE,
36 type = 'logical',
37 help = "Consider first line as header ? [default : '%default' ]"
38 ),
39 make_option(
40 "--comparison_factor_file",
41 default = NA,
42 type = 'character',
43 help = " A two column table : cell identifiers and a comparison factor that split cells in two categories (high/low, HOM/HET,...)"
44 ),
45 make_option(
46 "--factor1",
47 type = 'character',
48 help = "level associated to the control condition in the factor file"
49 ),
50 make_option(
51 "--factor2",
52 type = 'character',
53 help = "level associated to the test condition in the factor file"
54 ),
55 make_option(
56 "--fdr",
57 default = 0.01,
58 type = 'numeric',
59 help = "FDR threshold [default : '%default' ]"
60 ),
61 make_option(
62 "--log",
63 default=FALSE,
64 action="store_true",
65 type = 'logical',
66 help = "Expression data are log-transformed [default : '%default' ]"
67 ),
68 make_option(
69 "--output",
70 default = "results.tsv",
71 type = 'character',
72 help = "Output name [default : '%default' ]"
73 )
74 )
75
76 opt = parse_args(OptionParser(option_list = option_list),
77 args = commandArgs(trailingOnly = TRUE))
78
79 if (opt$sep == "tab") {opt$sep = "\t"}
80 if (opt$sep == "comma") {opt$sep = ","}
81
82 #Open files
83 data.counts <- read.table(
84 opt$input,
85 h = opt$colnames,
86 row.names = 1,
87 sep = opt$sep,
88 check.names = F
89 )
90
91 metadata <- read.table(
92 opt$comparison_factor_file,
93 header = TRUE,
94 stringsAsFactors = F,
95 sep = "\t",
96 check.names = FALSE,
97 row.names = 1
98 )
99
100 metadata <- subset(metadata, rownames(metadata) %in% colnames(data.counts))
101
102 # Create two logical named vectors for each factor level of cell signature
103 factor1_cells <- setNames(metadata[,1] == opt$factor1, rownames(metadata))
104 factor2_cells <- setNames(metadata[,1] == opt$factor2, rownames(metadata))
105
106 ## Mann-Whitney test (Two-sample Wilcoxon test)
107 MW_test <- data.frame(t(apply(data.counts, 1, function(x) {
108 do.call("cbind", wilcox.test(x[names(factor1_cells)[factor1_cells]], x[names(factor2_cells)[factor2_cells]]))[, 1:2]
109 })), stringsAsFactors = F)
110
111 # Benjamini-Hochberg correction and significativity
112 MW_test$p.adjust <- p.adjust(as.numeric(MW_test$p.value), method = "BH" , n = nrow(MW_test))
113 # MW_test$Critical.value <- (rank(MW_test$p.value) / nrow(MW_test)) * opt$fdr
114 MW_test$Significant <- MW_test$p.adjust < opt$fdr
115
116 ## Descriptive Statistics Function
117 descriptive_stats <- function(InputData) {
118 SummaryData = data.frame(
119 mean = rowMeans(InputData),
120 SD = apply(InputData, 1, sd),
121 Variance = apply(InputData, 1, var),
122 Percentage_Detection = apply(InputData, 1, function(x, y = InputData) {
123 (sum(x != 0) / ncol(y)) * 100
124 }),
125 mean_condition2 = rowMeans(InputData[,factor2_cells]),
126 mean_condition1 = rowMeans(InputData[, factor1_cells])
127 )
128 if(opt$log) {
129 SummaryData$log2FC <- SummaryData$mean_condition2 - SummaryData$mean_condition1
130 } else {
131 SummaryData$log2FC <- log2(SummaryData$mean_condition2 / SummaryData$mean_condition1)
132 }
133 return(SummaryData)
134 }
135
136 gene_stats <- descriptive_stats(data.counts)
137
138 results <- merge(gene_stats, MW_test, by = "row.names")
139 colnames(results)[1] <- "genes"
140
141 # Save files
142 write.table(
143 results,
144 opt$output,
145 sep = "\t",
146 quote = F,
147 col.names = T,
148 row.names = F
149 )