ngsrich: NGSrich_0.5.5/R/summarize_enrichment.R comparison

comparison NGSrich_0.5.5/R/summarize_enrichment.R @ 0:89ad0a9cca52 default tip

Uploaded

author	pfrommolt
date	Mon, 21 Nov 2011 08:12:19 -0500
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:89ad0a9cca52
+#!/usr/bin/Rscript
+infile=as.character(commandArgs()[6])
+outdir=as.character(commandArgs()[7])
+poor=as.numeric(commandArgs()[8])
+high=as.numeric(commandArgs()[9])
+outfile=paste(outdir,"/summary.html",sep="")
+cutoff=c(poor,high)
+xmlfiles<-bedfiles<-character(0)
+indirs<-as.character(readLines(infile))
+for(thisdir in indirs){
+thisdir=paste(thisdir,"/data",sep="")
+xmlfiles=c(xmlfiles,list.files(thisdir,pattern=".xml",full.names=TRUE))
+bedfiles=c(bedfiles,list.files(thisdir,pattern=".bed",full.names=TRUE))
+}
+nsamples<-length(indirs)
+samplenames<-character(0)
+diffs<-matrix(nrow=5,ncol=nsamples)
+averagecov<-stddevcov<-targetsize<-numreads_total<-numreads_target<-numeric(0)
+sample1<-sample5<-sample10<-sample20<-sample30<-numeric(0)
+##Read XML input files
+xmltag<-function(line){
+return(strsplit(strsplit(line,">")[[1]][2],"</")[[1]][1])
+}
+i=1
+nextfold=0
+for(file in xmlfiles){
+numreads<-numeric(0)
+inlines<-readLines(file)
+for(line in inlines){
+if(length(grep("SampleName",line)>0)){samplenames<-c(samplenames,xmltag(line))}
+if(length(grep("NumberReads",line)>0)){numreads<-c(numreads,as.numeric(xmltag(line)))}
+if(length(grep("AvTargetCoverage",line)>0)){averagecov=c(averagecov,as.numeric(xmltag(line)))}
+if(length(grep("SDTargetCoverage",line)>0)){stddevcov=c(stddevcov,as.numeric(xmltag(line)))}
+if(length(grep("TargetSize",line)>0)){targetsize=as.numeric(xmltag(line))}
+if(length(grep("<from1x>",line)>0)){nextfold=1}
+else{
+if((nextfold==1) && length(grep("PercBases",line))>0){
+sample1<-c(sample1,as.numeric(xmltag(line)))
+nextfold=0
+}
+}
+if(length(grep("<from5x>",line)>0)){nextfold=5}
+else{
+if((nextfold==5) && length(grep("PercBases",line))>0){
+sample5<-c(sample5,as.numeric(xmltag(line)))
+nextfold=0
+}
+}
+if(length(grep("<from10x>",line)>0)){nextfold=10}
+else{
+if((nextfold==10) && length(grep("PercBases",line))>0){
+sample10<-c(sample10,as.numeric(xmltag(line)))
+nextfold=0
+}
+}
+if(length(grep("<from20x>",line)>0)){nextfold=20}
+else{
+if((nextfold==20) && length(grep("PercBases",line))>0){
+sample20<-c(sample20,as.numeric(xmltag(line)))
+nextfold=0
+}
+}
+if(length(grep("<from30x>",line)>0)){nextfold=30}
+else{
+if((nextfold==30) && length(grep("PercBases",line))>0){
+sample30<-c(sample30,as.numeric(xmltag(line)))
+nextfold=0
+}
+}
+}
+numreads_total<-c(numreads_total,numreads[1])
+numreads_target<-c(numreads_target,numreads[2])
+diffs[,i]<-c(sample30[i],sample20[i]-sample30[i],sample10[i]-sample20[i],sample5[i]-sample10[i],sample1[i]-sample5[i])
+i=i+1
+}
+##Create barplot
+rownames(diffs)<-c("Covered 30x","Covered 20x","Covered 10x","Covered 5x","Covered 1x")
+plot_samplenames<-samplenames
+s=0
+for(sname in plot_samplenames){
+s=s+1
+if(nchar(sname)>10){plot_samplenames[s]<-paste("Sample ",s,"\n(table above)",sep="")}
+}
+colnames(diffs)<-plot_samplenames
+png(file=paste(outdir,"/summary_plot.png",sep=""),width=66*(4+nsamples),height=400)
+par(oma=c(1,0,0,1))
+barplot(diffs,
+	legend=TRUE,
+col=c("darkblue","steelblue1","tomato1","tomato3","tomato4"),
+xlim=c(0,4+nsamples),ylim=c(0,100),
+ylab="Percentage",las=2 ) #,
+#        args.legend=list(x=nsamples+4,y=80))
+dev.off()
+##Read BED files
+sametarget=TRUE
+thisbed<-read.table(bedfiles[1],stringsAsFactors=FALSE)
+genes<-levels(as.factor(thisbed$V4))
+meancov<-matrix(nrow=length(genes),ncol=length(bedfiles))
+chr<-start<-end<-character(0)
+i=1
+for(file in bedfiles){
+thisbed<-read.table(file,stringsAsFactors=FALSE)
+genes_thissample<-levels(as.factor(thisbed$V4))
+if(length(genes_thissample)!=length(genes)){sametarget=FALSE}
+for(j in 1:length(genes)){
+thisgene<-thisbed[thisbed$V4==genes[j],]
+chr[j]<-thisgene[1,1]
+start[j]<-thisgene[1,2]
+end[j]<-thisgene[1,3]
+meancov[j,i]<-as.numeric(mean(thisgene[,5]))
+}
+i=i+1
+}
+meancov<-data.frame(chr,start,end,genes,meancov)
+for(i in 1:nrow(meancov)){
+print(i)
+meancov$ave[i]<-round(mean(as.numeric(meancov[i,4+(1:nsamples)])),4)
+meancov$stddev[i]<-round(sd(as.numeric(meancov[i,4+(1:nsamples)])),4)
+meancov$mini[i]<-round(min(as.numeric(meancov[i,4+(1:nsamples)])),4)
+meancov$maxi[i]<-round(max(as.numeric(meancov[i,4+(1:nsamples)])),4)
+}
+meancov_sorted<-meancov[order(meancov$ave),]
+poorly_all<-meancov_sorted[meancov_sorted$max<cutoff[1],]
+##Write HTML output
+cat(file=outfile,paste(
+"<html>\n",
+"<head>\n",
+"<title>Enrichment Performance</title>\n",
+"<style type=\"text/css\">\n",
+"body{font-family:sans-serif;}\n",
+"h2,h3{color: darkblue;}\n",
+"a{color: darkblue;}\n",
+"   table.output td{\n",
+"   padding: 4px; background-color: lightskyblue;\n",
+"   border: 1px solid #000; border-color: darkblue;\n",
+"}\n",
+"</Style>\n",
+"</head>\n",
+"<body>\n",
+"<h2>Target Enrichment Performance</h2><br/>\n",
+"<table class=\"output\">\n",
+"  <tr>\n",
+"  <td><b>Sample</b></td><td><b># Reads</b></td><td><b># On Target</b></td><td><b>Coverage Mean</b></td><td><b>Coverage Std Dev</b></td><td><b>Covered 1x</b></td><td><b>Covered 5x</b></td><td><b>Covered 10x</b></td><td><b>Covered 20x</b></td><td><b>Covered 30x</b></td><td><b>Details</b></td>\n",
+"</tr>\n",sep=""))
+for(i in 1:nsamples){
+cat(file=outfile,paste(
+"<tr><td>",samplenames[i],"</td><td>",numreads_total[i],"</td>",
+"<td>",numreads_target[i],"</td><td>",averagecov[i],"</td>",
+"<td>",stddevcov[i],"</td><td>",sample1[i],"</td><td>",sample5[i],"</td>",
+"<td>",sample10[i],"</td><td>",sample20[i],"</td><td>",sample30[i],"</td>",
+"<td><a href=\"",rev(strsplit(indirs[i],"/")[[1]])[1],"/",samplenames[i],"_enrichment.html\">Show</a></td></tr>\n",
+sep=""),append=TRUE)
+}
+cat(file=outfile,paste(
+"</table>\n",
+"<br/><br/>\n",
+"<h2>Target Base Percentage Covered</h2>\n",
+"<img src=\"summary_plot.png\"></img>\n",sep=""),append=TRUE)
+if(!sametarget){
+	warning("Target regions are not the same for all samples.")
+} else{
+cat(file=outfile,paste(
+"<h2>Poorly Covered in All Samples (Cutoff: ",cutoff[1],"x)</h2>\n",sep=""),append=TRUE)
+if(nrow(poorly_all)==0){
+cat(file=outfile,"<p>Nothing found for this cutoff.</p>",append=TRUE)
+} else{
+cat(file=outfile,paste(
+"<table class=\"output\">\n",
+"<tr>\n",
+"<td><b>Target Region</b></td><td><b>Gene</b></td><td><b>Mean Across Samples</b></td><td><b>Std Dev Across Samples</b></td>\n",sep=""),append=TRUE)
+for(i in 1:nrow(poorly_all)){
+cat(file=outfile,paste(
+"<tr>\n",
+"<td>",poorly_all$chr[i],":",poorly_all$start[i],"-",poorly_all$end[i],"</td>\n",
+"<td>",poorly_all$genes[i],"</td>\n",
+"<td>",poorly_all$ave[i],"</td>\n",
+"<td>",poorly_all$stddev[i],"</td>\n",
+"<tr>\n",sep=""),append=TRUE)
+}
+cat(file=outfile,"</table>\n",append=TRUE)
+}
+meancov_sorted<-meancov[order(meancov$ave,decreasing=TRUE),] ##Highly Covered
+highly_all<-meancov_sorted[meancov_sorted$min>cutoff[2],]
+cat(file=outfile,paste(
+"<h2>Highly Covered in All Samples (Cutoff: ",cutoff[2],"x)</h2>\n",sep=""),append=TRUE)
+if(nrow(highly_all)==0){
+cat(file=outfile,"<p>Nothing found for this cutoff.</p>",append=TRUE)
+} else{
+	cat(file=outfile,paste("<table class=\"output\">\n",
+"<tr>\n",
+"<td><b>Target Region</b></td><td><b>Gene</b></td><td><b>Mean Across Sample</b></td><td><b>Std Dev Across Samples</b></td>\n",sep=""),append=TRUE)
+for(i in 1:nrow(highly_all)){
+cat(file=outfile,paste(
+"<tr>\n",
+"<td>",highly_all$chr[i],":",highly_all$start[i],"-",highly_all$end[i],"</td>\n",
+"<td>",highly_all$genes[i],"</td>\n",
+"<td>",highly_all$ave[i],"</td>\n",
+"<td>",highly_all$stddev[i],"</td>\n",
+"<tr>\n",sep=""),append=TRUE)
+}
+cat(file=outfile,"</table>\n",append=TRUE)
+}
+}
+cat(file=outfile,"</body>\n</html>\n",append=TRUE)

Mercurial > repos > pfrommolt > ngsrich

comparison NGSrich_0.5.5/R/summarize_enrichment.R @ 0:89ad0a9cca52 default tip