comparison cpm_tpm_rpk.R @ 4:be358a1ebf67 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/cpm_tpm_rpk commit a8486c89b7ddabfb1e814c2c42f6c04d3896904c
author artbio
date Thu, 05 Oct 2023 13:50:22 +0000
parents 8b1020c25f0f
children bcff1eb6fdb5
comparison
equal deleted inserted replaced
3:8b1020c25f0f 4:be358a1ebf67
1 if (length(commandArgs(TRUE)) == 0) { 1 if (length(commandArgs(TRUE)) == 0) {
2 system("Rscript cpm_tpm_rpk.R -h", intern = F) 2 system("Rscript cpm_tpm_rpk.R -h", intern = FALSE)
3 q("no") 3 q("no")
4 } 4 }
5 5
6 6
7 # load packages that are provided in the conda env 7 # load packages that are provided in the conda env
8 options( show.error.messages=F, 8 options(show.error.messages = FALSE,
9 error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) 9 error = function() {
10 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") 10 cat(geterrmessage(), file = stderr())
11 q("no", 1, FALSE)
12 }
13 )
14 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") # nolint⁠
11 warnings() 15 warnings()
12 library(optparse) 16 library(optparse)
13 library(ggplot2) 17 library(ggplot2)
14 library(reshape2) 18 library(reshape2)
15 library(Rtsne) 19 library(Rtsne) # nolint⁠
16 library(ggfortify) 20 library(ggfortify)
17 21
18 22
19 23
20 #Arguments 24 #Arguments
21 option_list = list( 25 option_list <- list(
22 make_option( 26 make_option(
23 c("-d", "--data"), 27 c("-d", "--data"),
24 default = NA, 28 default = NA,
25 type = 'character', 29 type = "character",
26 help = "Input file that contains count values to transform" 30 help = "Input file that contains count values to transform"
27 ), 31 ),
28 make_option( 32 make_option(
29 c("-t", "--type"), 33 c("-t", "--type"),
30 default = 'cpm', 34 default = "cpm",
31 type = 'character', 35 type = "character",
32 help = "Transformation type, either cpm, tpm, rpk or none[default : '%default' ]" 36 help = "Transformation type, either cpm, tpm, rpk or none[default : '%default' ]"
33 ), 37 ),
34 make_option( 38 make_option(
35 c("-s", "--sep"), 39 c("-s", "--sep"),
36 default = '\t', 40 default = "\t",
37 type = 'character', 41 type = "character",
38 help = "File separator [default : '%default' ]" 42 help = "File separator [default : '%default' ]"
39 ), 43 ),
40 make_option( 44 make_option(
41 c("-c", "--colnames"), 45 c("-c", "--colnames"),
42 default = TRUE, 46 default = TRUE,
43 type = 'logical', 47 type = "logical",
44 help = "Consider first line as header ? [default : '%default' ]" 48 help = "Consider first line as header ? [default : '%default' ]"
45 ), 49 ),
46 make_option( 50 make_option(
47 c("-f", "--gene"), 51 c("-f", "--gene"),
48 default = NA, 52 default = NA,
49 type = 'character', 53 type = "character",
50 help = "Two column of gene length file" 54 help = "Two column of gene length file"
51 ), 55 ),
52 make_option( 56 make_option(
53 c("-a", "--gene_sep"), 57 c("-a", "--gene_sep"),
54 default = '\t', 58 default = "\t",
55 type = 'character', 59 type = "character",
56 help = "Gene length file separator [default : '%default' ]" 60 help = "Gene length file separator [default : '%default' ]"
57 ), 61 ),
58 make_option( 62 make_option(
59 c("-b", "--gene_header"), 63 c("-b", "--gene_header"),
60 default = TRUE, 64 default = TRUE,
61 type = 'logical', 65 type = "logical",
62 help = "Consider first line of gene length as header ? [default : '%default' ]" 66 help = "Consider first line of gene length as header ? [default : '%default' ]"
63 ), 67 ),
64 make_option( 68 make_option(
65 c("-l", "--log"), 69 c("-l", "--log"),
66 default = FALSE, 70 default = FALSE,
67 type = 'logical', 71 type = "logical",
68 help = "Should be log transformed as well ? (log2(data +1)) [default : '%default' ]" 72 help = "Should be log transformed as well ? (log2(data +1)) [default : '%default' ]"
69 ), 73 ),
70 make_option( 74 make_option(
71 c("-o", "--out"), 75 c("-o", "--out"),
72 default = "res.tab", 76 default = "res.tab",
73 type = 'character', 77 type = "character",
74 help = "Output name [default : '%default' ]" 78 help = "Output name [default : '%default' ]"
75 ), 79 ),
76 make_option( 80 make_option(
77 "--visu", 81 "--visu",
78 default = FALSE, 82 default = FALSE,
79 type = 'logical', 83 type = "logical",
80 help = "performs T-SNE [default : '%default' ]" 84 help = "performs T-SNE [default : '%default' ]"
81 ), 85 ),
82 make_option( 86 make_option(
83 "--tsne_labels", 87 "--tsne_labels",
84 default = FALSE, 88 default = FALSE,
85 type = 'logical', 89 type = "logical",
86 help = "add labels to t-SNE plot [default : '%default' ]" 90 help = "add labels to t-SNE plot [default : '%default' ]"
87 ), 91 ),
88 make_option( 92 make_option(
89 "--seed", 93 "--seed",
90 default = 42, 94 default = 42,
91 type = 'integer', 95 type = "integer",
92 help = "Seed value for reproducibility [default : '%default' ]" 96 help = "Seed value for reproducibility [default : '%default' ]"
93 ), 97 ),
94 make_option( 98 make_option(
95 "--perp", 99 "--perp",
96 default = 5.0, 100 default = 5.0,
97 type = 'numeric', 101 type = "numeric",
98 help = "perplexity [default : '%default' ]" 102 help = "perplexity [default : '%default' ]"
99 ), 103 ),
100 make_option( 104 make_option(
101 "--theta", 105 "--theta",
102 default = 1.0, 106 default = 1.0,
103 type = 'numeric', 107 type = "numeric",
104 help = "theta [default : '%default' ]" 108 help = "theta [default : '%default' ]"
105 ), 109 ),
106 make_option( 110 make_option(
107 c("-D", "--tsne_out"), 111 c("-D", "--tsne_out"),
108 default = "tsne.pdf", 112 default = "tsne.pdf",
109 type = 'character', 113 type = "character",
110 help = "T-SNE pdf [default : '%default' ]" 114 help = "T-SNE pdf [default : '%default' ]"
111 ), 115 ),
112 make_option( 116 make_option(
113 "--pca_out", 117 "--pca_out",
114 default = "pca.pdf", 118 default = "pca.pdf",
115 type = 'character', 119 type = "character",
116 help = "PCA pdf [default : '%default' ]" 120 help = "PCA pdf [default : '%default' ]"
117 ) 121 )
118 122
119 ) 123 )
120 124
121 opt = parse_args(OptionParser(option_list = option_list), 125 opt <- parse_args(OptionParser(option_list = option_list),
122 args = commandArgs(trailingOnly = TRUE)) 126 args = commandArgs(trailingOnly = TRUE))
123 127
124 if (opt$data == "" & !(opt$help)) { 128 if (opt$data == "" && !(opt$help)) {
125 stop("At least one argument must be supplied (count data).\n", 129 stop("At least one argument must be supplied (count data).\n",
126 call. = FALSE) 130 call. = FALSE)
127 } else if ((opt$type == "tpm" | opt$type == "rpk") & opt$gene == "") { 131 } else if ((opt$type == "tpm" || opt$type == "rpk") && opt$gene == "") {
128 stop("At least two arguments must be supplied (count data and gene length file).\n", 132 stop("At least two arguments must be supplied (count data and gene length file).\n",
129 call. = FALSE) 133 call. = FALSE)
130 } else if (opt$type != "tpm" & opt$type != "rpk" & opt$type != "cpm" & opt$type != "none") { 134 } else if (opt$type != "tpm" && opt$type != "rpk" && opt$type != "cpm" && opt$type != "none") {
131 stop("Wrong transformation requested (--type option) must be : cpm, tpm or rpk.\n", 135 stop("Wrong transformation requested (--type option) must be : cpm, tpm or rpk.\n",
132 call. = FALSE) 136 call. = FALSE)
133 } 137 }
134 138
135 if (opt$sep == "tab") {opt$sep = "\t"} 139 if (opt$sep == "tab") {
136 if (opt$gene_sep == "tab") {opt$gene_sep = "\t"} 140 opt$sep <- "\t"
141 }
142 if (opt$gene_sep == "tab") {
143 opt$gene_sep <- "\t"
144 }
137 145
138 cpm <- function(count) { 146 cpm <- function(count) {
139 t(t(count) / colSums(count)) * 1000000 147 (count / colSums(count)) * 1000000
140 } 148 }
141 149
142 150
143 rpk <- function(count, length) { 151 rpk <- function(count, length) {
144 count / (length / 1000) 152 count / (length / 1000)
145 } 153 }
146 154
147 tpm <- function(count, length) { 155 tpm <- function(count, length) {
148 RPK = rpk(count, length) 156 rpk <- rpk(count, length)
149 perMillion_factor = colSums(RPK) / 1000000 157 per_million_factor <- colSums(rpk) / 1000000
150 TPM = RPK / perMillion_factor 158 tpm <- rpk / per_million_factor
151 return(TPM) 159 return(tpm)
152 } 160 }
153 161
154 data = read.table( 162 #### running code ####
163
164 data <- read.table(
155 opt$data, 165 opt$data,
156 check.names = FALSE, 166 check.names = FALSE,
157 header = opt$colnames, 167 header = opt$colnames,
158 row.names = 1, 168 row.names = 1,
159 sep = opt$sep 169 sep = opt$sep
160 ) 170 )
161 171
162 if (opt$type == "tpm" | opt$type == "rpk") { 172 if (opt$type == "tpm" || opt$type == "rpk") {
163 gene_length = as.data.frame( 173 gene_length <- as.data.frame(
164 read.table( 174 read.table(
165 opt$gene, 175 opt$gene,
166 h = opt$gene_header, 176 header = opt$gene_header,
167 row.names = 1, 177 row.names = 1,
168 sep = opt$gene_sep 178 sep = opt$gene_sep
169 ) 179 )
170 ) 180 )
171 gene_length = as.data.frame(gene_length[match(rownames(data), rownames(gene_length)), ], rownames(data)) 181 gene_length <- as.data.frame(gene_length[match(rownames(data), rownames(gene_length)), ], rownames(data))
172 } 182 }
173 183
174 184
175 if (opt$type == "cpm") 185 if (opt$type == "cpm")
176 res = cpm(data) 186 res <- cpm(data)
177 if (opt$type == "tpm") 187 if (opt$type == "tpm")
178 res = as.data.frame(apply(data, 2, tpm, length = gene_length), row.names = rownames(data)) 188 res <- as.data.frame(apply(data, 2, tpm, length = gene_length), row.names = rownames(data))
179 if (opt$type == "rpk") 189 if (opt$type == "rpk")
180 res = as.data.frame(apply(data, 2, rpk, length = gene_length), row.names = rownames(data)) 190 res <- as.data.frame(apply(data, 2, rpk, length = gene_length), row.names = rownames(data))
181 if (opt$type == "none") 191 if (opt$type == "none")
182 res = data 192 res <- data
183 colnames(res) = colnames(data) 193 colnames(res) <- colnames(data)
184
185 194
186 if (opt$log == TRUE) { 195 if (opt$log == TRUE) {
187 res = log2(res + 1) 196 res <- log2(res + 1)
188 } 197 }
198
199 if (opt$visu == TRUE) {
200 df <- res
201 # filter and transpose df for tsne and pca
202 df <- df[rowSums(df) != 0, ] # remove lines without information (with only 0 counts)
203 tdf <- t(df)
204 # make tsne and plot results
205 set.seed(opt$seed) ## Sets seed for reproducibility
206 tsne_out <- Rtsne(tdf, perplexity = opt$perp, theta = opt$theta)
207 embedding <- as.data.frame(tsne_out$Y)
208 embedding$Class <- as.factor(sub("Class_", "", rownames(tdf)))
209 gg_legend <- theme(legend.position = "none")
210 ggplot(embedding, aes(x = V1, y = V2)) +
211 geom_point(size = 1, color = "red") +
212 gg_legend +
213 xlab("") +
214 ylab("") +
215 ggtitle("t-SNE") +
216 if (opt$tsne_labels == TRUE) {
217 geom_text(aes(label = Class), hjust = -0.2, vjust = -0.5, size = 2.5, color = "darkblue")
218 }
219 ggsave(file = opt$tsne_out, device = "pdf")
220 # make PCA and plot result with ggfortify (autoplot)
221 tdf_pca <- prcomp(tdf, center = TRUE, scale. = TRUE)
222 if (opt$tsne_labels == TRUE) {
223 autoplot(tdf_pca, shape = FALSE, label = TRUE, label.size = 2.5, label.vjust = 1.2,
224 label.hjust = 1.2,
225 colour = "darkblue") +
226 geom_point(size = 1, color = "red") +
227 xlab(paste("PC1", summary(tdf_pca)$importance[2, 1] * 100, "%")) +
228 ylab(paste("PC2", summary(tdf_pca)$importance[2, 2] * 100, "%")) +
229 ggtitle("PCA")
230 ggsave(file = opt$pca_out, device = "pdf")
231 } else {
232 autoplot(tdf_pca, shape = TRUE, colour = "darkblue") +
233 geom_point(size = 1, color = "red") +
234 xlab(paste("PC1", summary(tdf_pca)$importance[2, 1] * 100, "%")) +
235 ylab(paste("PC2", summary(tdf_pca)$importance[2, 2] * 100, "%")) +
236 ggtitle("PCA")
237 ggsave(file = opt$pca_out, device = "pdf")
238 }
239 }
240
241 # at this stage, we select numeric columns and round theirs values to 8 decimals for cleaner output
242 is_num <- sapply(res, is.numeric)
243 res[is_num] <- lapply(res[is_num], round, 8)
189 244
190 write.table( 245 write.table(
191 cbind(Features = rownames(res), res), 246 cbind(Features = rownames(res), res),
192 opt$out, 247 opt$out,
193 col.names = opt$colnames, 248 col.names = opt$colnames,
194 row.names = F, 249 row.names = FALSE,
195 quote = F, 250 quote = FALSE,
196 sep = "\t" 251 sep = "\t"
197 ) 252 )
198
199 ##
200 if (opt$visu == TRUE) {
201 df = res
202 # filter and transpose df for tsne and pca
203 df = df[rowSums(df) != 0,] # remove lines without information (with only 0 counts)
204 tdf = t(df)
205 # make tsne and plot results
206 set.seed(opt$seed) ## Sets seed for reproducibility
207 tsne_out <- Rtsne(tdf, perplexity=opt$perp, theta=opt$theta) #
208 embedding <- as.data.frame(tsne_out$Y)
209 embedding$Class <- as.factor(sub("Class_", "", rownames(tdf)))
210 gg_legend = theme(legend.position="none")
211 ggplot(embedding, aes(x=V1, y=V2)) +
212 geom_point(size=1, color='red') +
213 gg_legend +
214 xlab("") +
215 ylab("") +
216 ggtitle('t-SNE') +
217 if (opt$tsne_labels == TRUE) {
218 geom_text(aes(label=Class),hjust=-0.2, vjust=-0.5, size=2.5, color='darkblue')
219 }
220 ggsave(file=opt$tsne_out, device="pdf")
221 # make PCA and plot result with ggfortify (autoplot)
222 tdf.pca <- prcomp(tdf, center = TRUE, scale. = T)
223 if (opt$tsne_labels == TRUE) {
224 autoplot(tdf.pca, shape=F, label=T, label.size=2.5, label.vjust=1.2,
225 label.hjust=1.2,
226 colour="darkblue") +
227 geom_point(size=1, color='red') +
228 xlab(paste("PC1",summary(tdf.pca)$importance[2,1]*100, "%")) +
229 ylab(paste("PC2",summary(tdf.pca)$importance[2,2]*100, "%")) +
230 ggtitle('PCA')
231 ggsave(file=opt$pca_out, device="pdf")
232 } else {
233 autoplot(tdf.pca, shape=T, colour="darkblue") +
234 geom_point(size=1, color='red') +
235 xlab(paste("PC1",summary(tdf.pca)$importance[2,1]*100, "%")) +
236 ylab(paste("PC2",summary(tdf.pca)$importance[2,2]*100, "%")) +
237 ggtitle('PCA')
238 ggsave(file=opt$pca_out, device="pdf")
239 }
240 }
241
242
243
244
245
246
247
248
249
250
251
252