comparison cpm_tpm_rpk.R @ 0:35d032c46a4e draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/cpm_tpm_rpk commit cc0fd23c039cc4a39c5e4e320b50666b7d9b6f65
author artbio
date Wed, 25 Jul 2018 13:05:17 -0400
parents
children b74bab5157c4
comparison
equal deleted inserted replaced
-1:000000000000 0:35d032c46a4e
1 if (length(commandArgs(TRUE)) == 0) {
2 system("Rscript cpm_tpm_rpk.R -h", intern = F)
3 q("no")
4 }
5
6 # Load necessary packages (install them if it's not the case)
7 requiredPackages = c('optparse')
8 for (p in requiredPackages) {
9 if (!require(p, character.only = TRUE, quietly = T)) {
10 install.packages(p)
11 }
12 suppressPackageStartupMessages(suppressMessages(library(p, character.only = TRUE)))
13 }
14
15
16 #Arguments
17 option_list = list(
18 make_option(
19 c("-d", "--data"),
20 default = NA,
21 type = 'character',
22 help = "Input file that contains count values to transform"
23 ),
24 make_option(
25 c("-t", "--type"),
26 default = 'cpm',
27 type = 'character',
28 help = "Transformation type, either cpm, tpm or rpk [default : '%default' ]"
29 ),
30 make_option(
31 c("-s", "--sep"),
32 default = '\t',
33 type = 'character',
34 help = "File separator [default : '%default' ]"
35 ),
36 make_option(
37 c("-c", "--colnames"),
38 default = TRUE,
39 type = 'logical',
40 help = "Consider first line as header ? [default : '%default' ]"
41 ),
42 make_option(
43 c("-f", "--gene"),
44 default = NA,
45 type = 'character',
46 help = "Two column of gene length file"
47 ),
48 make_option(
49 c("-a", "--gene_sep"),
50 default = '\t',
51 type = 'character',
52 help = "Gene length file separator [default : '%default' ]"
53 ),
54 make_option(
55 c("-b", "--gene_header"),
56 default = TRUE,
57 type = 'logical',
58 help = "Consider first line of gene length as header ? [default : '%default' ]"
59 ),
60 make_option(
61 c("-l", "--log"),
62 default = FALSE,
63 type = 'logical',
64 help = "Should be log transformed as well ? (log2(data +1)) [default : '%default' ]"
65 ),
66 make_option(
67 c("-o", "--out"),
68 default = "res.tab",
69 type = 'character',
70 help = "Output name [default : '%default' ]"
71 )
72 )
73
74
75 opt = parse_args(OptionParser(option_list = option_list),
76 args = commandArgs(trailingOnly = TRUE))
77
78 if (opt$data == "" & !(opt$help)) {
79 stop("At least one argument must be supplied (count data).\n",
80 call. = FALSE)
81 } else if ((opt$type == "tpm" | opt$type == "rpk") & opt$gene == "") {
82 stop("At least two arguments must be supplied (count data and gene length file).\n",
83 call. = FALSE)
84 } else if (opt$type != "tpm" & opt$type != "rpk" & opt$type != "cpm") {
85 stop("Wrong transformation requested (--type option) must be : cpm, tpm or rpk.\n",
86 call. = FALSE)
87 }
88
89 if (opt$sep == "tab") {opt$sep = "\t"}
90 if (opt$gene_sep == "tab") {opt$gene_sep = "\t"}
91
92 cpm <- function(count) {
93 t(t(count) / colSums(count)) * 1000000
94 }
95
96
97 rpk <- function(count, length) {
98 count / (length / 1000)
99 }
100
101 tpm <- function(count, length) {
102 RPK = rpk(count, length)
103 perMillion_factor = colSums(RPK) / 1000000
104 TPM = RPK / perMillion_factor
105 return(TPM)
106 }
107
108 data = read.table(
109 opt$data,
110 h = opt$colnames,
111 row.names = 1,
112 sep = opt$sep
113 )
114
115 if (opt$type == "tpm" | opt$type == "rpk") {
116 gene_length = as.data.frame(
117 read.table(
118 opt$gene,
119 h = opt$gene_header,
120 row.names = 1,
121 sep = opt$gene_sep
122 )
123 )
124 gene_length = as.data.frame(gene_length[match(rownames(data), rownames(gene_length)), ], rownames(data))
125 }
126
127
128 if (opt$type == "cpm")
129 res = cpm(data)
130 if (opt$type == "tpm")
131 res = as.data.frame(apply(data, 2, tpm, length = gene_length), row.names = rownames(data))
132 if (opt$type == "rpk")
133 res = as.data.frame(apply(data, 2, rpk, length = gene_length), row.names = rownames(data))
134 colnames(res) = colnames(data)
135
136
137 if (opt$log == TRUE) {
138 res = log2(res + 1)
139 }
140
141 write.table(
142 cbind(Features = rownames(res), res),
143 opt$out,
144 col.names = opt$colnames,
145 row.names = F,
146 quote = F,
147 sep = "\t"
148 )
149