Mercurial > repos > vmarcon > summary_statistics
comparison summary_statistics.R @ 0:46ddb0591d8b draft default tip
planemo upload commit a2411926bebc2ca3bb31215899a9f18a67e59556
author | vmarcon |
---|---|
date | Thu, 18 Jan 2018 07:44:37 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:46ddb0591d8b |
---|---|
1 ########################################################################### | |
2 # Quality controls and descriptive analysis plots # | |
3 ########################################################################### | |
4 # Authors: Melanie Petera # | |
5 ########################################################################### | |
6 # Description : This script allows various displays of data for quality # | |
7 # control and descriptive analysis. The input data is a matrix of # | |
8 # quantitative variables, and it returns chosen plots in png format # | |
9 # and a table with chosen statistics. # | |
10 ########################################################################### | |
11 # Specific R packages: # | |
12 # - edgeR (needed for MA plots) # | |
13 ########################################################################### | |
14 # Version 1 (06-06-2014): display boxplot, histogram, density plot, # | |
15 # MA plot, pairs plot, and return a table of chosen statistics # | |
16 # (mean, standard deviation, variance, median, quartiles, deciles) # | |
17 ########################################################################### | |
18 | |
desc_fct <- function(file.in, nacode, table_file, graph_file, stat, chosen.stat,
                     ploting, chosen.plot, log_file) {
  # Read a tab-separated table of quantitative variables, validate it, then
  # optionally write a table of descriptive statistics and a PDF of plots.
  # An HTML report is always written to log_file. On invalid input the report
  # describes the problem and the R session is terminated with status 1.
  #
  # Parameters:
  # - file.in: count matrix input (tab-separated, header line, first column
  #   holds the row names) [file name]
  # - nacode: missing value coding character
  # - table_file: results file containing table of chosen statistics [file name]
  # - graph_file: pdf file containing plots for chosen statistics [file name]
  # - stat: should statistics be calculated? (the string "T" enables them)
  # - chosen.stat: character listing the chosen statistics (comma-separated);
  #   recognised values: mean, sd, variance, median, quartile, decile
  # - ploting: should graphics be displayed? (the string "T" enables them)
  # - chosen.plot: character listing the chosen plots (comma-separated);
  #   recognised values: pairsplot, boxplot, histogram, density, MAplot
  # - log_file: a log file (HTML) [file name]

  ##########################################################
  # Read and verify data - - - - - - - - - - - -

  library(methods)

  # Write an HTML error report to log_file, then quit R with status 1.
  # `message` must be a condition object (its conditionMessage is reported).
  log_error <- function(message = "") {
    cat(
      "<HTML><HEAD><TITLE>Summary statistics report</TITLE></HEAD><BODY>\n",
      "⚠ An error occurred while trying to read your table.\n<BR>",
      "Please check that:\n<BR>",
      "<UL>\n",
      " <LI> the table you want to process contains the same number of columns for each line</LI>\n",
      " <LI> the first line of your table is a header line (specifying the name of each column)</LI>\n",
      " <LI> the first column of your table specifies the name of each line</LI>\n",
      " <LI> both individual and variable names should be unique</LI>\n",
      " <LI> each value is separated from the other by a <B>TAB</B> character</LI>\n",
      " <LI> except for first line and first column, table should contain a numeric value</LI>\n",
      " <LI> this value may contain character '.' as decimal separator or '", nacode, "' for missing values</LI>\n",
      "</UL>\n",
      "-------<BR>\nError messages recieved:<BR><FONT color=red>\n",
      conditionMessage(message), "</FONT>\n",
      "</BODY></HTML>\n",
      file = log_file, append = FALSE, sep = ""
    )
    q(save = "no", status = 1)
  }

  # Both errors and warnings raised while reading are treated as fatal.
  tab_in <- tryCatch(
    read.table(file.in, header = TRUE, na.strings = nacode, sep = "\t",
               check.names = FALSE, quote = "\""),
    error = function(cond) {
      log_error(message = cond)
      NA
    },
    warning = function(cond) {
      log_error(message = cond)
      NA
    }
  )

  if (ncol(tab_in) < 2) {
    log_error(simpleCondition("The table you want to use contains less than two columns."))
  }

  # Row names (first column) must be unique; report at most three offenders.
  rn <- as.character(tab_in[, 1])
  if (length(rn) != length(unique(rn))) {
    duplicated_rownames <- sort(unique(rn[duplicated(rn)]))
    if (length(duplicated_rownames) > 3) {
      duplicated_rownames <- c(duplicated_rownames[1:3], "...")
    }
    duplicated_rownames <- paste(duplicated_rownames, collapse = ", ")
    log_error(simpleCondition(
      paste0("The table you want to use has duplicated values in the first column",
             " (duplicated names: ", duplicated_rownames, ").")
    ))
  }

  # Check all columns are numerical: a cell is invalid when it is not missing
  # in the raw table but becomes NA once coerced to numeric.
  # drop = FALSE keeps a one-variable table two-dimensional.
  tab <- as.matrix(tab_in[, -1, drop = FALSE])
  for (i in seq_len(ncol(tab))) {
    na_raw <- is.na(tab[, i])
    # The coercion warning is expected for bad cells; they are reported below.
    na_num <- is.na(suppressWarnings(as.numeric(tab[, i])))
    if (sum(na_raw) != sum(na_num)) {
      sel <- which(na_raw != na_num)[1]
      value <- tab[sel, i]
      log_error(simpleCondition(
        paste0("Column '", colnames(tab)[i],
               "' of your table contains non numerical values. Please check its content (on line #",
               sel, ": value='", value, "').")
      ))
    }
  }

  Dataset <- tab_in
  # Variable columns only; stays a data frame even with a single variable.
  values <- Dataset[, -1, drop = FALSE]

  ##########################################################
  # Statistics table computation - - - - - - - - -

  report <- ""

  if (isTRUE(stat == "T") && length(chosen.stat) != 0) {

    stat.list <- strsplit(chosen.stat, ",")[[1]]
    # Zero-column matrix whose row names are the input column names; every
    # statistic is appended as a column whose first cell is its label.
    stat.res <- t(Dataset[0, , drop = FALSE])

    numdig <- 5

    if ("mean" %in% stat.list) {
      stat.res <- cbind(stat.res,
                        c("Mean", round(colMeans(values, na.rm = TRUE), digits = numdig)))
    }

    if ("sd" %in% stat.list) {
      colSd <- apply(values, 2, sd, na.rm = TRUE)
      stat.res <- cbind(stat.res, c("Std.Dev", round(colSd, digits = numdig)))
    }

    if ("variance" %in% stat.list) {
      colVar <- apply(values, 2, var, na.rm = TRUE)
      stat.res <- cbind(stat.res, c("Variance", round(colVar, digits = numdig)))
    }

    # The median is already part of the quartiles, so do not report it twice.
    if (("median" %in% stat.list) && !("quartile" %in% stat.list)) {
      colMed <- apply(values, 2, median, na.rm = TRUE)
      stat.res <- cbind(stat.res, c("Median", round(colMed, digits = numdig)))
    }

    if ("quartile" %in% stat.list) {
      # 5 x nb_variables matrix: min, Q1, median, Q3, max
      colQ <- round(apply(values, 2, quantile, na.rm = TRUE), digits = numdig)
      stat.res <- cbind(stat.res,
                        c("Min", colQ[1, ]), c("Q1", colQ[2, ]),
                        c("Median", colQ[3, ]), c("Q3", colQ[4, ]),
                        c("Max", colQ[5, ]))
    }

    if ("decile" %in% stat.list) {
      # nb_variables x 11 matrix (D0 .. D10), preceded by a label row
      colD <- round(t(apply(values, 2, quantile, na.rm = TRUE,
                            probs = seq(0, 1, 0.1))),
                    digits = numdig)
      colD <- rbind(paste("D", seq(0, 10, 1), sep = ""), colD)
      stat.res <- cbind(stat.res, colD)
    }

    write.table(stat.res, table_file, col.names = FALSE, sep = "\t", quote = FALSE)

    report <- paste(report, "➔ You choose to compute :", chosen.stat, "<BR>")

  } else {
    report <- paste(report, "➔ You don't choose any stats<BR>")
  }

  ##########################################################
  # Graphics generation - - - - - - - - - - - - -

  if (isTRUE(ploting == "T") && length(chosen.plot) != 0) {

    nb_graph_per_row <- 4
    nb_graph <- ncol(values)

    # Rows needed to hold one plot per variable, and the number of free slots
    # left on that grid (filled with empty plots so that each kind of plot
    # starts on a fresh page).
    nb_row <- ceiling(nb_graph / nb_graph_per_row)
    nb_empty_plot <- nb_row * nb_graph_per_row - nb_graph

    page_height <- 3.5 * nb_row

    pdf(file = graph_file, height = page_height)

    graph.list <- strsplit(chosen.plot, ",")[[1]]

    # Complete the current page with empty plots.
    fill_page <- function() {
      for (k in seq_len(nb_empty_plot)) {
        plot.new()
      }
    }

    # For the pair plot, we stick to the default layout
    if ("pairsplot" %in% graph.list) {
      if (nb_graph >= 2) {
        pairs(values)
      } else {
        # pairs() cannot draw a single variable
        report <- paste(report, "➔ Pairs plot skipped: at least two variables are required<BR>")
      }
    }

    # For the other plots, we have 4 plots per line
    par(mfrow = c(nb_row, nb_graph_per_row), mar = c(3, 3, 3, 1) + 0.1)

    if ("boxplot" %in% graph.list) {
      for (ech in seq_len(nb_graph)) {
        boxplot(values[, ech], main = colnames(values)[ech], xlab = NULL)
      }
      fill_page()
    }

    if ("histogram" %in% graph.list) {
      for (ech in seq_len(nb_graph)) {
        hist(values[, ech], main = colnames(values)[ech], xlab = NULL)
      }
      fill_page()
    }

    if ("density" %in% graph.list) {
      for (ech in seq_len(nb_graph)) {
        plot(density(values[, ech], na.rm = TRUE), main = colnames(values)[ech])
      }
      fill_page()
    }

    if ("MAplot" %in% graph.list) {
      if (min(values, na.rm = TRUE) < 0) {
        cat("\n----\nError: MAplot only available for positive variables\n----",
            file = log_file, append = TRUE, sep = "")
        q(save = "no", status = 1)
      }
      if (nb_graph >= 2) {
        library(limma)
        library(edgeR) # provides maPlot
        # One MA plot per unordered pair of variables
        for (ech in seq_len(nb_graph - 1)) {
          for (ech2 in (ech + 1):nb_graph) {
            temp.pair <- na.omit(values[, c(ech, ech2)])
            maPlot(temp.pair[, 1], temp.pair[, 2],
                   main = paste(colnames(values)[ech], "VS", colnames(values)[ech2]))
          }
        }
      } else {
        report <- paste(report, "➔ MA plot skipped: at least two variables are required<BR>")
      }
      # Do not complete page with empty plots for this plot because it
      # generates one graph per pair of variables
    }

    # Close pdf device
    dev.off()

    report <- paste(report, "➔ You choose to plot :", chosen.plot, "<BR>")
  } else {
    report <- paste(report, "➔ You don't choose any plot<BR>")
  }

  ##########################################################
  # Treatment successfull
  ##########################################################
  cat("<HTML><HEAD><TITLE>Summary statistics report</TITLE></HEAD><BODY>\n",
      file = log_file, append = FALSE, sep = "")
  cat(report, file = log_file, append = TRUE, sep = "")
  cat("✓ Your process is successfull!<BR>", file = log_file, append = TRUE, sep = "")
  cat("</BODY></HTML>\n", file = log_file, append = TRUE, sep = "")
} # end of function
262 | |
263 |