comparison normalization.R @ 0:79f00bc83ecc draft default tip

planemo upload commit a2411926bebc2ca3bb31215899a9f18a67e59556
author vmarcon
date Thu, 18 Jan 2018 06:20:30 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:79f00bc83ecc
1 # R Script implementing different kind of normalisation
2 # Input : a file containing a table with numeric values
3 # except for the first column containing sample names
4 # and the first line containing variable names
5 # separator expected is <TAB>
6 #
7 # Normalization method :
8 # log, DESeq2, Rlog, Standard_score, Pareto, TSS, TSS+CLR, Pareto
9 #
10 # Ouptut : input table with values normalized according
11 # to the normalization procedure chosen
12 #-----------------------------------------------------------------
13 # Authors : luc.jouneau(at)inra.fr
14 # valentin.marcon(at)inra.fr
15 # Version : 0.9
16 # Date : 30/08/2017
17 #-----------------------------------------------------------------
18
19 normalization=function(
20 ##########################################################
21 # Function input
22 ##########################################################
23 #Possible values : "log", "DESeq2", "Rlog", "Standard_score", "Pareto", "TSS", "TSS_CLR"
24 transformation_method="Standard_score",
25 na_encoding="NA",
26 #Path to file containg table of values (separator="tab")
27 input_file="",
28 #Path to file produced after transformation
29 output_file="out/table_out.txt",
30 #Path to file containing messages for user if something bad happens
31 log_file="log/normalization_report.html",
32 #Boolean flag (0/1) indicating if variables are in line or in columns
33 variable_in_line="1") {
34
35 ##########################################################
36 # Read and verify data
37 ##########################################################
38 #1°) Checks valids for all modules
39 if (variable_in_line=="1") {
40 column_use="individual"
41 line_use="variable"
42 } else {
43 line_use="individual"
44 column_use="variable"
45 }
46 log_error=function(message="") {
47 cat("<HTML><HEAD><TITLE>Normalization report</TITLE></HEAD><BODY>\n",file=log_file,append=F,sep="")
48 cat("&#9888 An error occurred while trying to read your table.\n<BR>",file=log_file,append=T,sep="")
49 cat("Please check that:\n<BR>",file=log_file,append=T,sep="")
50 cat("<UL>\n",file=log_file,append=T,sep="")
51 cat(" <LI> the table you want to process contains the same number of columns for each line</LI>\n",file=log_file,append=T,sep="")
52 cat(" <LI> the first line of your table is a header line (specifying the name of each ",column_use,")</LI>\n",file=log_file,append=T,sep="")
53 cat(" <LI> the first column of your table specifies the name of each ",line_use,"</LI>\n",file=log_file,append=T,sep="")
54 cat(" <LI> both individual and variable names should be unique</LI>\n",file=log_file,append=T,sep="")
55 cat(" <LI> each value is separated from the other by a <B>TAB</B> character</LI>\n",file=log_file,append=T,sep="")
56 cat(" <LI> except for first line and first column, table should contain a numeric value</LI>\n",file=log_file,append=T,sep="")
57 cat(" <LI> this value may contain character '.' as decimal separator or '",na_encoding,"' for missing values</LI>\n",file=log_file,append=T,sep="")
58 cat("</UL>\n",file=log_file,append=T,sep="")
59 cat("-------<BR>\nError messages recieved :<BR><FONT color=red>\n",conditionMessage(message),"</FONT>\n",file=log_file,append=T,sep="")
60 cat("</BODY></HTML>\n",file=log_file,append=T,sep="")
61 q(save="no",status=1)
62 }
63
64 tab_in=tryCatch(
65 {
66 tab_in=read.table(file=input_file,sep="\t",header=T,quote="\"",na.strings=na_encoding,check.names=FALSE)
67 },
68 error=function(cond) {
69 log_error(message=cond)
70 return(NA)
71 },
72 warning=function(cond) {
73 log_error(message=cond)
74 return(NA)
75 },
76 finally={
77 #Do nothing special
78 }
79 )
80
81 if (ncol(tab_in)<2) {
82 log_error(simpleCondition("The table you want to normalize contains less than two columns."))
83 }
84
85 rn=as.character(tab_in[,1])
86 if (length(rn)!=length(unique(rn))) {
87 duplicated_rownames=table(rn)
88 duplicated_rownames=duplicated_rownames[duplicated_rownames>1]
89 duplicated_rownames=names(duplicated_rownames)
90 if (length(duplicated_rownames)>3) {
91 duplicated_rownames=c(duplicated_rownames[1:3],"...")
92 }
93 duplicated_rownames=paste(duplicated_rownames,collapse=", ")
94 log_error(simpleCondition(
95 paste("The table you want to normalize have duplicated values in the first column (",
96 line_use," names) - duplicated ",line_use," names : ",duplicated_rownames,sep="")
97 ))
98 }
99 tab=tab_in[,-1]
100 rownames(tab)=rn
101
102 #Check all columns are numeric
103 tab=as.matrix(tab)
104 cell.with.na=c()
105 for (i in 1:ncol(tab)) {
106 na.v1=is.na(tab[,i])
107 na.v2=is.na(as.numeric(tab[,i]))
108 if (sum(na.v1)!=sum(na.v2)) {
109 sel=which(na.v1!=na.v2)
110 sel=sel[1]
111 value=tab[sel,i]
112 log_error(simpleCondition(
113 paste("Column '",colnames(tab)[i],"' of your table contains non numeric values. Please check its content (on line #",sel," : value='",value,"').",sep="")
114 ))
115 }
116 if (length(cell.with.na)==0 & sum(na.v1)!=0) {
117 cell.with.na=c(i,which(na.v1)[1])
118 }
119 }
120
121 #2°) Checks only valid for normalization module
122 if (transformation_method %in% c("DESeq2","Rlog")) {
123 #Check there is no missing values
124 if (length(cell.with.na)!=0) {
125 log_error(simpleCondition(
126 paste("Column '",colnames(tab)[cell.with.na[1]],"' of your table contains missing values (see line #",cell.with.na[2],").\n",
127 transformation_method," normalization does not accept missing values. ",sep="")
128 ))
129 }
130 }
131 if (transformation_method %in% c("DESeq2","Rlog","TSS","TSS_CLR")) {
132 #Check values are integer
133 for (i in 1:ncol(tab)) {
134 if (!is.integer(tab[,i])) {
135 sel=which(!is.integer(tab[,i]))
136 sel=sel[1]
137 value=tab[sel,i]
138 log_error(simpleCondition(
139 paste("Column '",colnames(tab)[i],"' of your table contains non integer values.\n",
140 transformation_method," normalization only accepts integer values. ",
141 "Please check its content (on line #",sel," : value=",value,").",sep="")
142 ))
143 }
144 }
145 }
146
147 if (transformation_method %in% c("log","TSS","TSS_CLR","DESeq2","Rlog")) {
148 #Check values are positive
149 for (i in 1:ncol(tab)) {
150 if (sum(tab[,i]>=0 | is.na(tab[,i]))!=nrow(tab)) {
151 sel=which(tab[,i]<0)
152 sel=sel[1]
153 value=tab[sel,i]
154 log_error(simpleCondition(
155 paste("Column '",colnames(tab)[i],"' of your table contains negative values.\n",
156 transformation_method," normalization only accepts positive or null values. ",
157 "Please check its content (on line #",sel," : value=",value,").",sep="")
158 ))
159 }
160 }
161 }
162
163 ##########################################################
164 # End of data checks
165 ##########################################################
166
167 ### Transpose if variable are in line ###
168 if (variable_in_line=="1") {
169 #Transpose matrix
170 tab=t(tab)
171 }
172
173 ##########################################################
174 ### Value transformation
175 ##########################################################
176
177 #Avoid null values when there is a log transformation
178 na.replaced=c()
179 log.transformed=FALSE
180 if (transformation_method %in% c("log","TSS_CLR")) {
181 log.transformed=TRUE
182 for (idx_col in 1:ncol(tab)) {
183 sel=tab[,idx_col]==0
184 na.replaced=cbind(na.replaced,sel)
185 tab[sel,idx_col]=1e-2
186 }
187 }
188
189 ### log ###
190 if (transformation_method=="log") {
191 tab=log2(tab)
192 }
193
194 ### DESeq2 or Rlog ###
195 if (transformation_method %in% c("DESeq2","Rlog")) {
196 library(DESeq2)
197 n <- ncol(tab)
198 dds <- DESeqDataSetFromMatrix(tab,
199 colData = data.frame(condition = c("a", rep("b", n - 1))),
200 design = formula(~ condition))
201 colnames(dds) <- colnames(tab)
202 dds <- estimateSizeFactors(dds)
203 tab <- switch(transformation_method,
204 DESeq2 = counts(dds, normalized = TRUE),
205 Rlog = assay(rlogTransformation(dds))
206 )
207 }
208
209 ### Standard_score ###
210 if (transformation_method=="Standard_score") {
211 tab=scale(tab)
212 }
213
214 ### Pareto ###
215 if (transformation_method=="Pareto") {
216 tab.centered <- apply(tab, 2, function(x) x - mean(x,na.rm=TRUE))
217 tab.sc <- apply(tab.centered, 2, function(x) x/sqrt(sd(x,na.rm=TRUE)))
218 tab=tab.sc
219 }
220
221 ### TSS ###
222 if (transformation_method=="TSS") {
223 tab= t(apply(tab, 1, function(x) x/sum(x,na.rm=TRUE)))
224 }
225
226 ### TSS + CLR avec function de mixOmics ###
227 if (transformation_method=="TSS_CLR") {
228 #From http://stackoverflow.com/questions/2602583/geometric-mean-is-there-a-built-in
229 geometric.mean = function(x, na.rm=TRUE){
230 exp(sum(log(x[x > 0]), na.rm=na.rm) / length(x))
231 }
232 tab = t(apply(tab+1e-2,1,function(x) log(x/geometric.mean(x,na.rm=TRUE))))
233 }
234
235
236 #If there is a log transformation put 0 where there was NA
237 if (log.transformed) {
238 for (idx_col in 1:ncol(tab)) {
239 tab[na.replaced[,idx_col],idx_col]=0
240 }
241 }
242
243 #If there are missing values, replace it with NA_enconding
244 for (idx_col in 1:ncol(tab)) {
245 sel=is.na(tab[,idx_col])
246 tab[sel,idx_col]=na_encoding
247 }
248
249 ##########################################################
250 # Prepare and write output table
251 ##########################################################
252 if (variable_in_line=="1") {
253 #Transpose matrix again
254 tab=t(tab)
255 }
256
257 tab_out=cbind(rownames(tab),tab)
258 colnames(tab_out)[1]=colnames(tab_in)[1]
259
260 write.table(file=output_file,tab_out,sep="\t",row.names=F,quote=F)
261
262 ##########################################################
263 # Treatment successfull
264 ##########################################################
265 cat("<HTML><HEAD><TITLE>Normalization report</TITLE></HEAD><BODY>\n",file=log_file,append=F,sep="")
266 cat(paste("&#10132; You choose to apply the transformation method :",transformation_method,"<BR>"),file=log_file,append=F,sep="")
267 cat("&#10003; Your normalization process is successfull !<BR>",file=log_file,append=T,sep="")
268 cat("</BODY></HTML>\n",file=log_file,append=T,sep="")
269
270 q(save="no",status=0)
271
272 } # end of function
273
274 ##########################################################
275 # Test
276 ##########################################################
277 #Used for debug : LJO 6/3/2017
278 #normalization()
279 #setwd("H:/INRA/cati/groupe stats/Galaxy/normalisation")
280 #normalization(transformation_method="Standard_score",na_encoding="NA",input_file="datasets/valid - decathlon.txt",output_file="out/table_out.txt",log_file="log/normalization.html",variable_in_line="0")
281 #normalization(transformation_method="Pareto",na_encoding="NA",input_file="datasets/valid - decathlon.txt",output_file="out/table_out.txt",log_file="log/normalization.html",variable_in_line="1")
282