Mercurial > repos > pfrommolt > ngsrich
comparison NGSrich_0.5.5/R/summarize_enrichment.R @ 0:89ad0a9cca52 default tip
Uploaded
| author | pfrommolt |
|---|---|
| date | Mon, 21 Nov 2011 08:12:19 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:89ad0a9cca52 |
|---|---|
| 1 #!/usr/bin/Rscript | |
| 2 | |
| 3 infile=as.character(commandArgs()[6]) | |
| 4 outdir=as.character(commandArgs()[7]) | |
| 5 poor=as.numeric(commandArgs()[8]) | |
| 6 high=as.numeric(commandArgs()[9]) | |
| 7 | |
| 8 | |
| 9 outfile=paste(outdir,"/summary.html",sep="") | |
| 10 cutoff=c(poor,high) | |
| 11 | |
| 12 xmlfiles<-bedfiles<-character(0) | |
| 13 indirs<-as.character(readLines(infile)) | |
| 14 for(thisdir in indirs){ | |
| 15 thisdir=paste(thisdir,"/data",sep="") | |
| 16 xmlfiles=c(xmlfiles,list.files(thisdir,pattern=".xml",full.names=TRUE)) | |
| 17 bedfiles=c(bedfiles,list.files(thisdir,pattern=".bed",full.names=TRUE)) | |
| 18 } | |
| 19 nsamples<-length(indirs) | |
| 20 | |
| 21 samplenames<-character(0) | |
| 22 diffs<-matrix(nrow=5,ncol=nsamples) | |
| 23 averagecov<-stddevcov<-targetsize<-numreads_total<-numreads_target<-numeric(0) | |
| 24 sample1<-sample5<-sample10<-sample20<-sample30<-numeric(0) | |
| 25 | |
| 26 | |
| 27 ##Read XML input files | |
| 28 xmltag<-function(line){ | |
| 29 return(strsplit(strsplit(line,">")[[1]][2],"</")[[1]][1]) | |
| 30 } | |
| 31 i=1 | |
| 32 nextfold=0 | |
| 33 for(file in xmlfiles){ | |
| 34 numreads<-numeric(0) | |
| 35 inlines<-readLines(file) | |
| 36 for(line in inlines){ | |
| 37 if(length(grep("SampleName",line)>0)){samplenames<-c(samplenames,xmltag(line))} | |
| 38 if(length(grep("NumberReads",line)>0)){numreads<-c(numreads,as.numeric(xmltag(line)))} | |
| 39 if(length(grep("AvTargetCoverage",line)>0)){averagecov=c(averagecov,as.numeric(xmltag(line)))} | |
| 40 if(length(grep("SDTargetCoverage",line)>0)){stddevcov=c(stddevcov,as.numeric(xmltag(line)))} | |
| 41 if(length(grep("TargetSize",line)>0)){targetsize=as.numeric(xmltag(line))} | |
| 42 if(length(grep("<from1x>",line)>0)){nextfold=1} | |
| 43 else{ | |
| 44 if((nextfold==1) && length(grep("PercBases",line))>0){ | |
| 45 sample1<-c(sample1,as.numeric(xmltag(line))) | |
| 46 nextfold=0 | |
| 47 } | |
| 48 } | |
| 49 if(length(grep("<from5x>",line)>0)){nextfold=5} | |
| 50 else{ | |
| 51 if((nextfold==5) && length(grep("PercBases",line))>0){ | |
| 52 sample5<-c(sample5,as.numeric(xmltag(line))) | |
| 53 nextfold=0 | |
| 54 } | |
| 55 } | |
| 56 if(length(grep("<from10x>",line)>0)){nextfold=10} | |
| 57 else{ | |
| 58 if((nextfold==10) && length(grep("PercBases",line))>0){ | |
| 59 sample10<-c(sample10,as.numeric(xmltag(line))) | |
| 60 nextfold=0 | |
| 61 } | |
| 62 } | |
| 63 if(length(grep("<from20x>",line)>0)){nextfold=20} | |
| 64 else{ | |
| 65 if((nextfold==20) && length(grep("PercBases",line))>0){ | |
| 66 sample20<-c(sample20,as.numeric(xmltag(line))) | |
| 67 nextfold=0 | |
| 68 } | |
| 69 } | |
| 70 if(length(grep("<from30x>",line)>0)){nextfold=30} | |
| 71 else{ | |
| 72 if((nextfold==30) && length(grep("PercBases",line))>0){ | |
| 73 sample30<-c(sample30,as.numeric(xmltag(line))) | |
| 74 nextfold=0 | |
| 75 } | |
| 76 } | |
| 77 } | |
| 78 numreads_total<-c(numreads_total,numreads[1]) | |
| 79 numreads_target<-c(numreads_target,numreads[2]) | |
| 80 diffs[,i]<-c(sample30[i],sample20[i]-sample30[i],sample10[i]-sample20[i],sample5[i]-sample10[i],sample1[i]-sample5[i]) | |
| 81 i=i+1 | |
| 82 } | |
| 83 | |
| 84 | |
| 85 ##Create barplot | |
| 86 rownames(diffs)<-c("Covered 30x","Covered 20x","Covered 10x","Covered 5x","Covered 1x") | |
| 87 plot_samplenames<-samplenames | |
| 88 s=0 | |
| 89 for(sname in plot_samplenames){ | |
| 90 s=s+1 | |
| 91 if(nchar(sname)>10){plot_samplenames[s]<-paste("Sample ",s,"\n(table above)",sep="")} | |
| 92 } | |
| 93 colnames(diffs)<-plot_samplenames | |
| 94 png(file=paste(outdir,"/summary_plot.png",sep=""),width=66*(4+nsamples),height=400) | |
| 95 par(oma=c(1,0,0,1)) | |
| 96 barplot(diffs, | |
| 97 legend=TRUE, | |
| 98 col=c("darkblue","steelblue1","tomato1","tomato3","tomato4"), | |
| 99 xlim=c(0,4+nsamples),ylim=c(0,100), | |
| 100 ylab="Percentage",las=2 ) #, | |
| 101 # args.legend=list(x=nsamples+4,y=80)) | |
| 102 dev.off() | |
| 103 | |
| 104 | |
| 105 | |
| 106 ##Read BED files | |
| 107 sametarget=TRUE | |
| 108 thisbed<-read.table(bedfiles[1],stringsAsFactors=FALSE) | |
| 109 genes<-levels(as.factor(thisbed$V4)) | |
| 110 meancov<-matrix(nrow=length(genes),ncol=length(bedfiles)) | |
| 111 chr<-start<-end<-character(0) | |
| 112 | |
| 113 i=1 | |
| 114 for(file in bedfiles){ | |
| 115 thisbed<-read.table(file,stringsAsFactors=FALSE) | |
| 116 genes_thissample<-levels(as.factor(thisbed$V4)) | |
| 117 if(length(genes_thissample)!=length(genes)){sametarget=FALSE} | |
| 118 for(j in 1:length(genes)){ | |
| 119 thisgene<-thisbed[thisbed$V4==genes[j],] | |
| 120 chr[j]<-thisgene[1,1] | |
| 121 start[j]<-thisgene[1,2] | |
| 122 end[j]<-thisgene[1,3] | |
| 123 meancov[j,i]<-as.numeric(mean(thisgene[,5])) | |
| 124 } | |
| 125 i=i+1 | |
| 126 } | |
| 127 | |
| 128 meancov<-data.frame(chr,start,end,genes,meancov) | |
| 129 for(i in 1:nrow(meancov)){ | |
| 130 print(i) | |
| 131 meancov$ave[i]<-round(mean(as.numeric(meancov[i,4+(1:nsamples)])),4) | |
| 132 meancov$stddev[i]<-round(sd(as.numeric(meancov[i,4+(1:nsamples)])),4) | |
| 133 meancov$mini[i]<-round(min(as.numeric(meancov[i,4+(1:nsamples)])),4) | |
| 134 meancov$maxi[i]<-round(max(as.numeric(meancov[i,4+(1:nsamples)])),4) | |
| 135 } | |
| 136 meancov_sorted<-meancov[order(meancov$ave),] | |
| 137 poorly_all<-meancov_sorted[meancov_sorted$max<cutoff[1],] | |
| 138 | |
| 139 | |
| 140 ##Write HTML output | |
| 141 cat(file=outfile,paste( | |
| 142 "<html>\n", | |
| 143 "<head>\n", | |
| 144 "<title>Enrichment Performance</title>\n", | |
| 145 "<style type=\"text/css\">\n", | |
| 146 "body{font-family:sans-serif;}\n", | |
| 147 "h2,h3{color: darkblue;}\n", | |
| 148 "a{color: darkblue;}\n", | |
| 149 " table.output td{\n", | |
| 150 " padding: 4px; background-color: lightskyblue;\n", | |
| 151 " border: 1px solid #000; border-color: darkblue;\n", | |
| 152 "}\n", | |
| 153 "</Style>\n", | |
| 154 "</head>\n", | |
| 155 "<body>\n", | |
| 156 "<h2>Target Enrichment Performance</h2><br/>\n", | |
| 157 "<table class=\"output\">\n", | |
| 158 " <tr>\n", | |
| 159 " <td><b>Sample</b></td><td><b># Reads</b></td><td><b># On Target</b></td><td><b>Coverage Mean</b></td><td><b>Coverage Std Dev</b></td><td><b>Covered 1x</b></td><td><b>Covered 5x</b></td><td><b>Covered 10x</b></td><td><b>Covered 20x</b></td><td><b>Covered 30x</b></td><td><b>Details</b></td>\n", | |
| 160 "</tr>\n",sep="")) | |
| 161 for(i in 1:nsamples){ | |
| 162 cat(file=outfile,paste( | |
| 163 "<tr><td>",samplenames[i],"</td><td>",numreads_total[i],"</td>", | |
| 164 "<td>",numreads_target[i],"</td><td>",averagecov[i],"</td>", | |
| 165 "<td>",stddevcov[i],"</td><td>",sample1[i],"</td><td>",sample5[i],"</td>", | |
| 166 "<td>",sample10[i],"</td><td>",sample20[i],"</td><td>",sample30[i],"</td>", | |
| 167 "<td><a href=\"",rev(strsplit(indirs[i],"/")[[1]])[1],"/",samplenames[i],"_enrichment.html\">Show</a></td></tr>\n", | |
| 168 sep=""),append=TRUE) | |
| 169 } | |
| 170 cat(file=outfile,paste( | |
| 171 "</table>\n", | |
| 172 "<br/><br/>\n", | |
| 173 "<h2>Target Base Percentage Covered</h2>\n", | |
| 174 "<img src=\"summary_plot.png\"></img>\n",sep=""),append=TRUE) | |
| 175 if(!sametarget){ | |
| 176 warning("Target regions are not the same for all samples.") | |
| 177 } else{ | |
| 178 cat(file=outfile,paste( | |
| 179 "<h2>Poorly Covered in All Samples (Cutoff: ",cutoff[1],"x)</h2>\n",sep=""),append=TRUE) | |
| 180 if(nrow(poorly_all)==0){ | |
| 181 cat(file=outfile,"<p>Nothing found for this cutoff.</p>",append=TRUE) | |
| 182 } else{ | |
| 183 cat(file=outfile,paste( | |
| 184 "<table class=\"output\">\n", | |
| 185 "<tr>\n", | |
| 186 "<td><b>Target Region</b></td><td><b>Gene</b></td><td><b>Mean Across Samples</b></td><td><b>Std Dev Across Samples</b></td>\n",sep=""),append=TRUE) | |
| 187 for(i in 1:nrow(poorly_all)){ | |
| 188 cat(file=outfile,paste( | |
| 189 "<tr>\n", | |
| 190 "<td>",poorly_all$chr[i],":",poorly_all$start[i],"-",poorly_all$end[i],"</td>\n", | |
| 191 "<td>",poorly_all$genes[i],"</td>\n", | |
| 192 "<td>",poorly_all$ave[i],"</td>\n", | |
| 193 "<td>",poorly_all$stddev[i],"</td>\n", | |
| 194 "<tr>\n",sep=""),append=TRUE) | |
| 195 } | |
| 196 cat(file=outfile,"</table>\n",append=TRUE) | |
| 197 } | |
| 198 | |
| 199 meancov_sorted<-meancov[order(meancov$ave,decreasing=TRUE),] ##Highly Covered | |
| 200 highly_all<-meancov_sorted[meancov_sorted$min>cutoff[2],] | |
| 201 cat(file=outfile,paste( | |
| 202 "<h2>Highly Covered in All Samples (Cutoff: ",cutoff[2],"x)</h2>\n",sep=""),append=TRUE) | |
| 203 if(nrow(highly_all)==0){ | |
| 204 cat(file=outfile,"<p>Nothing found for this cutoff.</p>",append=TRUE) | |
| 205 } else{ | |
| 206 cat(file=outfile,paste("<table class=\"output\">\n", | |
| 207 "<tr>\n", | |
| 208 "<td><b>Target Region</b></td><td><b>Gene</b></td><td><b>Mean Across Sample</b></td><td><b>Std Dev Across Samples</b></td>\n",sep=""),append=TRUE) | |
| 209 for(i in 1:nrow(highly_all)){ | |
| 210 cat(file=outfile,paste( | |
| 211 "<tr>\n", | |
| 212 "<td>",highly_all$chr[i],":",highly_all$start[i],"-",highly_all$end[i],"</td>\n", | |
| 213 "<td>",highly_all$genes[i],"</td>\n", | |
| 214 "<td>",highly_all$ave[i],"</td>\n", | |
| 215 "<td>",highly_all$stddev[i],"</td>\n", | |
| 216 "<tr>\n",sep=""),append=TRUE) | |
| 217 } | |
| 218 cat(file=outfile,"</table>\n",append=TRUE) | |
| 219 } | |
| 220 | |
| 221 } | |
| 222 | |
| 223 cat(file=outfile,"</body>\n</html>\n",append=TRUE) |
