diff getGenomicScore.py @ 20:16ba480adf96

Uploaded
author xuebing
date Sat, 31 Mar 2012 08:31:22 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/getGenomicScore.py	Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,78 @@
+import random,string,os,sys
+
+    
+def getScore(intvfile,outfile,summary_type,bwfilepath,nbin,strand,outplot,span):
+    f = open(intvfile)
+    tmpsh = "".join(random.sample(string.letters+string.digits, 8))
+    tmpout = "".join(random.sample(string.letters+string.digits, 8))
+    tmp = open(tmpsh,'w')
+    if os.path.isdir(bwfilepath):
+        for line in f:
+            flds = line.strip().split('\t')
+            cmd = 'bigWigSummary -type='+summary_type+' '+bwfilepath+'/'+flds[0]+'.bw '+flds[0]+' '+flds[1]+' '+flds[2]+' '+nbin+' >> '+tmpout+' 2>>'+tmpout+'\n'
+            tmp.write(cmd)
+    else:
+        for line in f:
+            flds = line.strip().split('\t')
+            cmd = 'bigWigSummary -type='+summary_type+' '+bwfilepath+' '+flds[0]+' '+flds[1]+' '+flds[2]+' '+nbin+' >> '+tmpout+' 2>>'+tmpout+'\n'
+            tmp.write(cmd)
+    f.close()        
+    # remove blank lines
+    tmp.write("sed '/^$/d' "+tmpout+'>'+tmpout+".1\n")
+    tmp.write("sed '/^Can/d' "+tmpout+".1 >"+tmpout+".2\n")
+    # set n/a to 0
+    tmp.write("sed 's/n\/a/0/g' "+tmpout+".2 >"+tmpout+".3\n")
+    # replace text with 0
+    zeros = ''.join(['0\t']*int(nbin))
+    tmp.write("sed 's/^[a-zA-Z]/"+zeros+"/' "+tmpout+".3 >"+tmpout+".4\n")
+    # cut the first nbin columns
+    tmp.write("cut -f 1-"+nbin+" "+tmpout+".4 > "+tmpout+".5\n")     
+    tmp.write("paste "+intvfile+" "+tmpout+".5 >"+outfile+"\n")
+    tmp.close()
+    os.system('chmod +x '+tmpsh)
+    os.system('./'+tmpsh)
+    #os.system('rm '+tmpout+'*')
+    #os.system('rm '+tmpsh)
+
+    # strandness: need to reverse bins for - strand
+    if nbin > 1 and strand > 0:
+        strand = strand - 1 # python is 0 based
+        os.system('mv '+outfile+' '+tmpout)
+        f = open(tmpout)
+        out = open(outfile,'w')
+        for line in f:
+            flds=line.strip().split('\t')
+            if flds[strand] == '+':
+                out.write(line)
+            else:
+                scores = flds[-int(nbin):]
+                scores.reverse()
+                flds = flds[:-int(nbin)]+scores
+                out.write('\t'.join(flds)+'\n')
+        os.system('rm '+tmpout)
+        f.close()
+        out.close()
+    # plot
+    if int(nbin) > 1:
+        rscript = open(tmpsh,"w")
+        rscript.write("options(warn=-1)\n")
+        rscript.write("x <- read.table('"+outfile+"',sep='\t')\n")
+        rscript.write("x <- x[,(ncol(x)+1-"+nbin+"):ncol(x)]\n")
+        rscript.write("pdf('"+outplot+"')\n")
+        rscript.write("avg <- apply(x,2,mean)\n")
+        rscript.write("err <- apply(x,2,sd)/sqrt(nrow(x))\n")
+        rscript.write("ylim=c(min(avg-err),max(avg+err))\n")
+        rscript.write("xticks <- seq(ncol(x))-(1+ncol(x))/2\n")
+        if span >= 0.1:
+            rscript.write("avg = loess(avg~xticks,span="+str(span)+")$fitted\n")
+            rscript.write("err = loess(err~xticks,span="+str(span)+")$fitted\n")
+        rscript.write("par(cex=1.5)\n")
+        rscript.write("plot(xticks,avg,ylab='average conservation score',xlab='relative position (bin)',type='l',lwd=0,ylim=ylim)\n")   
+        rscript.write("polygon(c(xticks,rev(xticks)),c(avg+err,rev(avg-err)),col='slateblue1',border=NA)\n")
+        rscript.write("lines(xticks,avg,type='l',lwd=1)\n")   
+        rscript.write("dev.off()\n")
+        rscript.close()
+        os.system("R --vanilla < "+tmpsh)
+        os.system("rm "+tmpsh)
+
+getScore(sys.argv[1],sys.argv[2],sys.argv[3],sys.argv[4],sys.argv[5],int(sys.argv[6]),sys.argv[7],float(sys.argv[8]))