Mercurial > repos > vmarcon > normalization
comparison normalization.R @ 0:79f00bc83ecc draft default tip
planemo upload commit a2411926bebc2ca3bb31215899a9f18a67e59556
| author | vmarcon |
|---|---|
| date | Thu, 18 Jan 2018 06:20:30 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:79f00bc83ecc |
|---|---|
| 1 # R Script implementing different kind of normalisation | |
| 2 # Input : a file containing a table with numeric values | |
| 3 # except for the first column containing sample names | |
| 4 # and the first line containing variable names | |
| 5 # separator expected is <TAB> | |
| 6 # | |
| 7 # Normalization method : | |
| 8 # log, DESeq2, Rlog, Standard_score, Pareto, TSS, TSS+CLR, Pareto | |
| 9 # | |
| 10 # Ouptut : input table with values normalized according | |
| 11 # to the normalization procedure chosen | |
| 12 #----------------------------------------------------------------- | |
| 13 # Authors : luc.jouneau(at)inra.fr | |
| 14 # valentin.marcon(at)inra.fr | |
| 15 # Version : 0.9 | |
| 16 # Date : 30/08/2017 | |
| 17 #----------------------------------------------------------------- | |
| 18 | |
| 19 normalization=function( | |
| 20 ########################################################## | |
| 21 # Function input | |
| 22 ########################################################## | |
| 23 #Possible values : "log", "DESeq2", "Rlog", "Standard_score", "Pareto", "TSS", "TSS_CLR" | |
| 24 transformation_method="Standard_score", | |
| 25 na_encoding="NA", | |
| 26 #Path to file containg table of values (separator="tab") | |
| 27 input_file="", | |
| 28 #Path to file produced after transformation | |
| 29 output_file="out/table_out.txt", | |
| 30 #Path to file containing messages for user if something bad happens | |
| 31 log_file="log/normalization_report.html", | |
| 32 #Boolean flag (0/1) indicating if variables are in line or in columns | |
| 33 variable_in_line="1") { | |
| 34 | |
| 35 ########################################################## | |
| 36 # Read and verify data | |
| 37 ########################################################## | |
| 38 #1°) Checks valids for all modules | |
| 39 if (variable_in_line=="1") { | |
| 40 column_use="individual" | |
| 41 line_use="variable" | |
| 42 } else { | |
| 43 line_use="individual" | |
| 44 column_use="variable" | |
| 45 } | |
| 46 log_error=function(message="") { | |
| 47 cat("<HTML><HEAD><TITLE>Normalization report</TITLE></HEAD><BODY>\n",file=log_file,append=F,sep="") | |
| 48 cat("⚠ An error occurred while trying to read your table.\n<BR>",file=log_file,append=T,sep="") | |
| 49 cat("Please check that:\n<BR>",file=log_file,append=T,sep="") | |
| 50 cat("<UL>\n",file=log_file,append=T,sep="") | |
| 51 cat(" <LI> the table you want to process contains the same number of columns for each line</LI>\n",file=log_file,append=T,sep="") | |
| 52 cat(" <LI> the first line of your table is a header line (specifying the name of each ",column_use,")</LI>\n",file=log_file,append=T,sep="") | |
| 53 cat(" <LI> the first column of your table specifies the name of each ",line_use,"</LI>\n",file=log_file,append=T,sep="") | |
| 54 cat(" <LI> both individual and variable names should be unique</LI>\n",file=log_file,append=T,sep="") | |
| 55 cat(" <LI> each value is separated from the other by a <B>TAB</B> character</LI>\n",file=log_file,append=T,sep="") | |
| 56 cat(" <LI> except for first line and first column, table should contain a numeric value</LI>\n",file=log_file,append=T,sep="") | |
| 57 cat(" <LI> this value may contain character '.' as decimal separator or '",na_encoding,"' for missing values</LI>\n",file=log_file,append=T,sep="") | |
| 58 cat("</UL>\n",file=log_file,append=T,sep="") | |
| 59 cat("-------<BR>\nError messages recieved :<BR><FONT color=red>\n",conditionMessage(message),"</FONT>\n",file=log_file,append=T,sep="") | |
| 60 cat("</BODY></HTML>\n",file=log_file,append=T,sep="") | |
| 61 q(save="no",status=1) | |
| 62 } | |
| 63 | |
| 64 tab_in=tryCatch( | |
| 65 { | |
| 66 tab_in=read.table(file=input_file,sep="\t",header=T,quote="\"",na.strings=na_encoding,check.names=FALSE) | |
| 67 }, | |
| 68 error=function(cond) { | |
| 69 log_error(message=cond) | |
| 70 return(NA) | |
| 71 }, | |
| 72 warning=function(cond) { | |
| 73 log_error(message=cond) | |
| 74 return(NA) | |
| 75 }, | |
| 76 finally={ | |
| 77 #Do nothing special | |
| 78 } | |
| 79 ) | |
| 80 | |
| 81 if (ncol(tab_in)<2) { | |
| 82 log_error(simpleCondition("The table you want to normalize contains less than two columns.")) | |
| 83 } | |
| 84 | |
| 85 rn=as.character(tab_in[,1]) | |
| 86 if (length(rn)!=length(unique(rn))) { | |
| 87 duplicated_rownames=table(rn) | |
| 88 duplicated_rownames=duplicated_rownames[duplicated_rownames>1] | |
| 89 duplicated_rownames=names(duplicated_rownames) | |
| 90 if (length(duplicated_rownames)>3) { | |
| 91 duplicated_rownames=c(duplicated_rownames[1:3],"...") | |
| 92 } | |
| 93 duplicated_rownames=paste(duplicated_rownames,collapse=", ") | |
| 94 log_error(simpleCondition( | |
| 95 paste("The table you want to normalize have duplicated values in the first column (", | |
| 96 line_use," names) - duplicated ",line_use," names : ",duplicated_rownames,sep="") | |
| 97 )) | |
| 98 } | |
| 99 tab=tab_in[,-1] | |
| 100 rownames(tab)=rn | |
| 101 | |
| 102 #Check all columns are numeric | |
| 103 tab=as.matrix(tab) | |
| 104 cell.with.na=c() | |
| 105 for (i in 1:ncol(tab)) { | |
| 106 na.v1=is.na(tab[,i]) | |
| 107 na.v2=is.na(as.numeric(tab[,i])) | |
| 108 if (sum(na.v1)!=sum(na.v2)) { | |
| 109 sel=which(na.v1!=na.v2) | |
| 110 sel=sel[1] | |
| 111 value=tab[sel,i] | |
| 112 log_error(simpleCondition( | |
| 113 paste("Column '",colnames(tab)[i],"' of your table contains non numeric values. Please check its content (on line #",sel," : value='",value,"').",sep="") | |
| 114 )) | |
| 115 } | |
| 116 if (length(cell.with.na)==0 & sum(na.v1)!=0) { | |
| 117 cell.with.na=c(i,which(na.v1)[1]) | |
| 118 } | |
| 119 } | |
| 120 | |
| 121 #2°) Checks only valid for normalization module | |
| 122 if (transformation_method %in% c("DESeq2","Rlog")) { | |
| 123 #Check there is no missing values | |
| 124 if (length(cell.with.na)!=0) { | |
| 125 log_error(simpleCondition( | |
| 126 paste("Column '",colnames(tab)[cell.with.na[1]],"' of your table contains missing values (see line #",cell.with.na[2],").\n", | |
| 127 transformation_method," normalization does not accept missing values. ",sep="") | |
| 128 )) | |
| 129 } | |
| 130 } | |
| 131 if (transformation_method %in% c("DESeq2","Rlog","TSS","TSS_CLR")) { | |
| 132 #Check values are integer | |
| 133 for (i in 1:ncol(tab)) { | |
| 134 if (!is.integer(tab[,i])) { | |
| 135 sel=which(!is.integer(tab[,i])) | |
| 136 sel=sel[1] | |
| 137 value=tab[sel,i] | |
| 138 log_error(simpleCondition( | |
| 139 paste("Column '",colnames(tab)[i],"' of your table contains non integer values.\n", | |
| 140 transformation_method," normalization only accepts integer values. ", | |
| 141 "Please check its content (on line #",sel," : value=",value,").",sep="") | |
| 142 )) | |
| 143 } | |
| 144 } | |
| 145 } | |
| 146 | |
| 147 if (transformation_method %in% c("log","TSS","TSS_CLR","DESeq2","Rlog")) { | |
| 148 #Check values are positive | |
| 149 for (i in 1:ncol(tab)) { | |
| 150 if (sum(tab[,i]>=0 | is.na(tab[,i]))!=nrow(tab)) { | |
| 151 sel=which(tab[,i]<0) | |
| 152 sel=sel[1] | |
| 153 value=tab[sel,i] | |
| 154 log_error(simpleCondition( | |
| 155 paste("Column '",colnames(tab)[i],"' of your table contains negative values.\n", | |
| 156 transformation_method," normalization only accepts positive or null values. ", | |
| 157 "Please check its content (on line #",sel," : value=",value,").",sep="") | |
| 158 )) | |
| 159 } | |
| 160 } | |
| 161 } | |
| 162 | |
| 163 ########################################################## | |
| 164 # End of data checks | |
| 165 ########################################################## | |
| 166 | |
| 167 ### Transpose if variable are in line ### | |
| 168 if (variable_in_line=="1") { | |
| 169 #Transpose matrix | |
| 170 tab=t(tab) | |
| 171 } | |
| 172 | |
| 173 ########################################################## | |
| 174 ### Value transformation | |
| 175 ########################################################## | |
| 176 | |
| 177 #Avoid null values when there is a log transformation | |
| 178 na.replaced=c() | |
| 179 log.transformed=FALSE | |
| 180 if (transformation_method %in% c("log","TSS_CLR")) { | |
| 181 log.transformed=TRUE | |
| 182 for (idx_col in 1:ncol(tab)) { | |
| 183 sel=tab[,idx_col]==0 | |
| 184 na.replaced=cbind(na.replaced,sel) | |
| 185 tab[sel,idx_col]=1e-2 | |
| 186 } | |
| 187 } | |
| 188 | |
| 189 ### log ### | |
| 190 if (transformation_method=="log") { | |
| 191 tab=log2(tab) | |
| 192 } | |
| 193 | |
| 194 ### DESeq2 or Rlog ### | |
| 195 if (transformation_method %in% c("DESeq2","Rlog")) { | |
| 196 library(DESeq2) | |
| 197 n <- ncol(tab) | |
| 198 dds <- DESeqDataSetFromMatrix(tab, | |
| 199 colData = data.frame(condition = c("a", rep("b", n - 1))), | |
| 200 design = formula(~ condition)) | |
| 201 colnames(dds) <- colnames(tab) | |
| 202 dds <- estimateSizeFactors(dds) | |
| 203 tab <- switch(transformation_method, | |
| 204 DESeq2 = counts(dds, normalized = TRUE), | |
| 205 Rlog = assay(rlogTransformation(dds)) | |
| 206 ) | |
| 207 } | |
| 208 | |
| 209 ### Standard_score ### | |
| 210 if (transformation_method=="Standard_score") { | |
| 211 tab=scale(tab) | |
| 212 } | |
| 213 | |
| 214 ### Pareto ### | |
| 215 if (transformation_method=="Pareto") { | |
| 216 tab.centered <- apply(tab, 2, function(x) x - mean(x,na.rm=TRUE)) | |
| 217 tab.sc <- apply(tab.centered, 2, function(x) x/sqrt(sd(x,na.rm=TRUE))) | |
| 218 tab=tab.sc | |
| 219 } | |
| 220 | |
| 221 ### TSS ### | |
| 222 if (transformation_method=="TSS") { | |
| 223 tab= t(apply(tab, 1, function(x) x/sum(x,na.rm=TRUE))) | |
| 224 } | |
| 225 | |
| 226 ### TSS + CLR avec function de mixOmics ### | |
| 227 if (transformation_method=="TSS_CLR") { | |
| 228 #From http://stackoverflow.com/questions/2602583/geometric-mean-is-there-a-built-in | |
| 229 geometric.mean = function(x, na.rm=TRUE){ | |
| 230 exp(sum(log(x[x > 0]), na.rm=na.rm) / length(x)) | |
| 231 } | |
| 232 tab = t(apply(tab+1e-2,1,function(x) log(x/geometric.mean(x,na.rm=TRUE)))) | |
| 233 } | |
| 234 | |
| 235 | |
| 236 #If there is a log transformation put 0 where there was NA | |
| 237 if (log.transformed) { | |
| 238 for (idx_col in 1:ncol(tab)) { | |
| 239 tab[na.replaced[,idx_col],idx_col]=0 | |
| 240 } | |
| 241 } | |
| 242 | |
| 243 #If there are missing values, replace it with NA_enconding | |
| 244 for (idx_col in 1:ncol(tab)) { | |
| 245 sel=is.na(tab[,idx_col]) | |
| 246 tab[sel,idx_col]=na_encoding | |
| 247 } | |
| 248 | |
| 249 ########################################################## | |
| 250 # Prepare and write output table | |
| 251 ########################################################## | |
| 252 if (variable_in_line=="1") { | |
| 253 #Transpose matrix again | |
| 254 tab=t(tab) | |
| 255 } | |
| 256 | |
| 257 tab_out=cbind(rownames(tab),tab) | |
| 258 colnames(tab_out)[1]=colnames(tab_in)[1] | |
| 259 | |
| 260 write.table(file=output_file,tab_out,sep="\t",row.names=F,quote=F) | |
| 261 | |
| 262 ########################################################## | |
| 263 # Treatment successfull | |
| 264 ########################################################## | |
| 265 cat("<HTML><HEAD><TITLE>Normalization report</TITLE></HEAD><BODY>\n",file=log_file,append=F,sep="") | |
| 266 cat(paste("➔ You choose to apply the transformation method :",transformation_method,"<BR>"),file=log_file,append=F,sep="") | |
| 267 cat("✓ Your normalization process is successfull !<BR>",file=log_file,append=T,sep="") | |
| 268 cat("</BODY></HTML>\n",file=log_file,append=T,sep="") | |
| 269 | |
| 270 q(save="no",status=0) | |
| 271 | |
| 272 } # end of function | |
| 273 | |
| 274 ########################################################## | |
| 275 # Test | |
| 276 ########################################################## | |
| 277 #Used for debug : LJO 6/3/2017 | |
| 278 #normalization() | |
| 279 #setwd("H:/INRA/cati/groupe stats/Galaxy/normalisation") | |
| 280 #normalization(transformation_method="Standard_score",na_encoding="NA",input_file="datasets/valid - decathlon.txt",output_file="out/table_out.txt",log_file="log/normalization.html",variable_in_line="0") | |
| 281 #normalization(transformation_method="Pareto",na_encoding="NA",input_file="datasets/valid - decathlon.txt",output_file="out/table_out.txt",log_file="log/normalization.html",variable_in_line="1") | |
| 282 |
