# HG changeset patch
# User xuebing
# Date 1333197082 14400
# Node ID 16ba480adf96ff8b16c80f071293ab0992c2d153
# Parent d325683ec36846f2d99d50723bbada93867b7cb8
Uploaded
diff -r d325683ec368 -r 16ba480adf96 StartGenometriCorr.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/StartGenometriCorr.xml Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,23 @@
+
+between two files of genomic intervals
+
+Start_GenometriCorr.R $config $query $reference $output_options $output
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+This tool determines the statistical relationship (if any) between two sets of genomic intervals. Output can be text only, plot (ECDF curves), or a more colorful graphic.
+
+
\ No newline at end of file
diff -r d325683ec368 -r 16ba480adf96 Start_GenometriCorr.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Start_GenometriCorr.R Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,105 @@
+# Start_GenometriCorr.R
+
+###################################################
+# #
+# command-line interface to GenometriCorr #
+# functions, for use with Galaxy. #
+# #
+###################################################
+
+capture.output <- function (result, pdffile, output_options)
+{
+ if(output_options != "stats")
+ {
+ pdf(file=pdffile, width=10, height=19, paper="special")
+
+ if (output_options != "vis") #need to do a plot
+ {
+ mymat <- matrix(ncol=3, nrow=4)
+ mymat[1,1] <- 1
+ mymat[1,2] <- 2
+ mymat[1,3] <- 3
+ mymat[2,1] <- 4
+ mymat[2,2] <- 5
+ mymat[2,3] <- 6
+ mymat[3,1] <- 7
+ mymat[3,2] <- 8
+ mymat[3,3] <- 9
+ mymat[4,1] <- 10
+ mymat[4,2] <- 11
+ mymat[4,3] <- 12
+
+ layout(mymat, heights=c(0.2,0.2,0.2,0.2))
+ plot(result, pdffile, make.new=FALSE)
+ }
+ if (output_options != "plot") #need to do the bigger graphic
+ {
+ mymat <- matrix(ncol=2, nrow=8)
+ mymat[1,1] <- 2
+ mymat[1,2] <- 3
+ mymat[2,1] <- 4
+ mymat[2,2] <- 4
+ mymat[3,1] <- 1
+ mymat[3,2] <- 1
+ mymat[4,1] <- 5
+ mymat[4,2] <- 6
+ mymat[5,1] <- 7
+ mymat[5,2] <- 7
+ mymat[6,1] <- 8
+ mymat[6,2] <- 9
+ mymat[7,1] <- 10
+ mymat[7,2] <- 10
+ mymat[8,1] <- 11
+ mymat[8,2] <- 12
+ layoutresults <- 3
+
+ layout(mymat, heights=c(0.05,0.05,0.15,0.15,0.15,0.15,0.15,0.15))
+ visualize(result, pdffile, make.new=FALSE)
+ }
+ dev.off()
+ }
+
+ if (output_options == "stats")
+ {
+ show(result)
+ }
+}
+
+
+
+# Reads the command line arguments
+args <- commandArgs(trailingOnly=T)
+
+suppressPackageStartupMessages(library('GenometriCorr', warn.conflicts=F, verbose=F))
+suppressPackageStartupMessages(library('graphics', warn.conflicts=F, verbose=F))
+suppressPackageStartupMessages(library('gdata', warn.conflicts=F, verbose=F))
+suppressPackageStartupMessages(library('gplots', warn.conflicts=F, verbose=F))
+suppressPackageStartupMessages(library('gtools', warn.conflicts=F, verbose=F))
+suppressPackageStartupMessages(library('caTools', warn.conflicts=F, verbose=F))
+suppressPackageStartupMessages(library('grid', warn.conflicts=F, verbose=F))
+
+
+
+# Variables
+query_file <- ""
+reference_file <- ""
+config_file <- ""
+output_options <- ""
+
+# Parse the command line arguments
+
+config_file <- args[1]
+query_file <- as.character(args[2])
+reference_file <- as.character(args[3])
+output_options <- args[4]
+pdffile <- args[5]
+
+conf<-new("GenometriCorrConfig",config_file)
+
+print('OK')
+
+result<-suppressWarnings(suppressPackageStartupMessages(GenometriCorr:::run.config(conf,query=query_file,reference=reference_file)))
+print('OK2')
+
+hideoutput <- capture.output(result, pdffile=args[5], output_options)
+
diff -r d325683ec368 -r 16ba480adf96 align2database.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/align2database.py Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,104 @@
+'''
+align mulitple bed to one bed
+python align_multiple.py hmChIP_mm9_peak_bed/mm9-GSE19999_PolII_P25_all.cod.bed hmChIP_mm9_peak_bed/ test.txt test.pdf 100 5000
+'''
+
+import os,sys,random
+def main():
+ queryfile = sys.argv[1]
+ inpath = sys.argv[2]
+ outputdata = sys.argv[3]
+ outputerr = sys.argv[4]
+ barplotpdf = sys.argv[5]
+ min_feat = sys.argv[6] # min features overlap
+ windowsize = sys.argv[7]
+ anchor = sys.argv[8]
+ span = sys.argv[9] # loess smooth parameter
+
+ inpath = inpath.rstrip('/')
+ #os.system('rm '+inpath+'/*tmp*')
+
+ infiles = os.listdir(inpath)
+
+ #print len(infiles),' files\n'
+ i = 0
+ for infile in infiles:
+ if 'tmp' in infile:
+ #os.system('rm '+inpath+'/'+infile)
+ continue
+ i = i +1
+ print i,infile
+ output = infile.split('/')[-1]+'-to-'+queryfile.split('/')[-1]#'.summary'
+ if anchor == 'database':
+ command = 'python /Users/xuebing/galaxy-dist/tools/mytools/alignr.py -b '+inpath+'/'+infile+' -a '+queryfile+' -o '+output+' --summary-only -q -w '+windowsize
+ else:
+ command = 'python /Users/xuebing/galaxy-dist/tools/mytools/alignr.py -a '+inpath+'/'+infile+' -b '+queryfile+' -o '+output+' --summary-only -q -w '+windowsize
+ #print command+'\n'
+ os.system(command)
+ print 'start visualization...'
+ # visualize
+ rscriptfile = 'f-'+str(random.random())+'.r'
+ r = open(rscriptfile,'w')
+ r.write("files <- dir('.','summary',full.name=T)\n")
+ #r.write("print(files)\n")
+ r.write("x <- read.table(files[1])\n")
+ r.write("err <- read.table(gsub('summary','standarderror',files[1]))\n")
+ r.write("for (filename in files[2:length(files)]){\n")
+ r.write(" x <- rbind(x,read.table(filename))\n")
+ r.write(" err <- rbind(err,read.table(gsub('summary','standarderror',filename)))\n")
+ r.write("}\n")
+ r.write("x <- x[x[,2] > "+min_feat+",]\n")
+ r.write("err <- err[err[,2] > "+min_feat+",]\n")
+ r.write("name <- as.character(x[,1])\n")
+ r.write("nfeat <- x[,2]\n")
+ r.write("x <- x[,3:ncol(x)]\n")
+ r.write("err <- err[,3:ncol(err)]\n")
+ r.write("for (i in 1:nrow(x)) {\n")
+ r.write(" name[i] <- strsplit(name[i],'-to-')[[1]][1]\n")
+ r.write("}\n")
+ # remove rows that have no variation, which cause problem in heatmap. This is the case when align to itself
+ r.write("toremove <- seq(nrow(x))\n")
+ r.write("for (i in 1:nrow(x)){\n")
+ r.write(" toremove[i] <- all(x[i,] == x[i,1])\n")
+ r.write("}\n")
+ r.write("x <- x[!toremove,]\n")
+ r.write("err <- err[!toremove,]\n")
+ r.write("name <- name[!toremove]\n")
+ r.write("nfeat <- nfeat[!toremove]\n")
+ r.write("write.table(cbind(name,nfeat,x),file='"+outputdata+"',sep ='\\t',quote=F,row.names=F,col.names=F)\n")
+ r.write("write.table(cbind(name,nfeat,err),file='"+outputerr+"',sep ='\\t',quote=F,row.names=F,col.names=F)\n")
+
+ r.write("pdf('"+barplotpdf+"')\n")
+ r.write("heatmap(t(scale(t(as.matrix(x,nc=ncol(x))))),Colv=NA,labRow=name,labCol=NA,margins=c(1,8),cexRow=0.02+1/log(nrow(x)),col=heat.colors(1000))\n")
+
+ if windowsize != '0' :
+ r.write("xticks <- seq(-"+windowsize+","+windowsize+",length.out=100)\n")
+ r.write("xlab <- 'relative position (bp)'\n")
+ else:
+ r.write("xticks <- seq(100)\n")
+ r.write("xlab <- 'relative position (bin)'\n")
+
+ #r.write("plot(xticks,colSums(t(scale(t(as.matrix(x,nc=ncol(x)))))),xlab='relative position (bp)',type='l',lwd=2,main='total signal')\n")
+ r.write("for (i in 1:nrow(x)) {\n")
+ r.write(" avg <- x[i,]/nfeat[i]\n")
+ #r.write(" maxv <- max(avg)\n")
+ #r.write(" minv <- min(avg)\n")
+ #r.write(" medv <- median(avg)\n")
+ #r.write(" if (maxv < "+fold+"*medv | minv*"+fold+">medv){next}\n")
+ #smooth
+ if float(span) >= 0.1:
+ r.write(" avg = loess(as.numeric(avg)~xticks,span="+span+")$fitted\n")
+ r.write(" err[i,] = loess(as.numeric(err[i,])~xticks,span="+span+")$fitted\n")
+ r.write(" par(cex=1.5)\n")
+ r.write(" plot(xticks,avg,ylab='average coverage',main=paste(name[i],'\n n=',nfeat[i],sep=''),xlab=xlab,type='l',lwd=1,ylim=c(min(avg-err[i,]),max(avg+err[i,])))\n")
+ r.write(" polygon(c(xticks,rev(xticks)),c(avg+err[i,],rev(avg-err[i,])),col='slateblue1',border=NA)\n")
+ r.write(" lines(xticks,avg,type='l',lwd=1)\n")
+ r.write("}\n")
+ r.write("dev.off()\n")
+ r.close()
+ os.system("R --vanilla < "+rscriptfile)
+ os.system('rm '+rscriptfile)
+ os.system('rm *.summary')
+ os.system('rm *.standarderror')
+
+main()
diff -r d325683ec368 -r 16ba480adf96 align2database.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/align2database.xml Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,54 @@
+
+ features
+ align2database.py $query $database $output_coverage $output_standarderror $output_plot $minfeat $windowsize $anchor $span> $outlog
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**Example output**
+
+.. image:: ./static/operation_icons/align_multiple2.png
+
+
+**What it does**
+
+This tool aligns a query interval set (such as ChIP peaks) to a database of features (such as other ChIP peaks or TSS/splice sites), calculates and plots the relative distance of database features to the query intervals. Currently two databases are available:
+
+-- **ChIP peaks** from 191 ChIP experiments (processed from hmChIP database, see individual peak/BED files in **Shared Data**)
+
+-- **Annotated gene features**, such as: TSS, TES, 5'ss, 3'ss, CDS start and end, miRNA seed matches, enhancers, CpG island, microsatellite, small RNA, poly A sites (3P-seq-tags), miRNA genes, and tRNA genes.
+
+Two output files are generated. One is the coverage/profile for each feature in the database that has a minimum overlap with the query set. The first two columns are feature name and the total number of overlapping intervals from the query. Column 3 to column 102 are coverage at each bin. The other file is an PDF file plotting both the heatmap for all features and the average coverage for each individual database feature.
+
+
+**How it works**
+
+For each interval/peak in the query file, a window (default 10,000bp) is created around the center of the interval and is divided into 100 bins. For each database feature set (such as Pol II peaks), the tool counts how many intervals in the database feature file overlap with each bin. The count is then averaged over all query intervals that have at least one hit in at least one bin. Overall the plotted 'average coverage' represnts the fraction of query features (only those with hits, number shown in individual plot title) that has database feature interval covering that bin. The extreme is when the database feature is the same as the query, then every query interval is covered at the center, the average coverage of the center bin will be 1.
+
+The heatmap is scaled for each row before clustering.
+
+
+
+
diff -r d325683ec368 -r 16ba480adf96 align2multiple.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/align2multiple.xml Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,109 @@
+
+ features
+ cat $script_file | R --vanilla --slave > $logfile
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ## Setup R error handling to go to stderr
+ cat('\n[',date(),'] Start running job\n')
+ options(warn=-1)
+ windowsize = as.integer("$windowsize")
+ labels = '$label'
+ ## align query to itself
+ cmd = 'python /Users/xuebing/galaxy-dist/tools/mytools/alignr.py -a $query -b $query -o $label-$label --profile-only -q -w $windowsize -n $nbins'
+ cat('\n[',date(),'] ',cmd,'\n')
+ system(cmd)
+ ## align other sets to query
+ #for $i,$s in enumerate( $series )
+ labels = c(labels,'$s.label.value')
+ cmd = 'python /Users/xuebing/galaxy-dist/tools/mytools/alignr.py -a $s.input.file_name -b $query -o $label-$s.label.value --profile-only -q -w $windowsize -n $nbins'
+ cat('\n[',date(),'] ',cmd,'\n')
+ system(cmd)
+ #end for
+ cat('\n[',date(),'] Read output\n')
+ ## read output of query2query
+ print(paste(labels[1],labels[1],sep='-'))
+ x = read.table(paste(labels[1],labels[1],sep='-'))
+ ids = as.character(x[,1])
+ nfeat = nrow(x)
+ x = as.matrix(x[,3:ncol(x)])
+ nbin = ncol(x)
+
+ ## a table mapping id to position
+ ind = list()
+ for (i in 1:nfeat){
+ ind[[ids[i]]] = i
+ }
+ ## read other output files
+ for (i in 2:length(labels)){
+ print(paste(labels[1],labels[i],sep='-'))
+ x0 = read.table(paste(labels[1],labels[i],sep='-'))
+ ids0 = as.character(x0[,1])
+ x0 = as.matrix(x0[,3:ncol(x0)])
+ x1 = matrix(0,nfeat,nbin)
+ for (j in 1:nrow(x0)){
+ #cat(j,'\t',ids0[j],'\t',ind[[ids0[j]]],'\n')
+ x1[ind[[ids0[j]]],] = x0[j,]
+ }
+ x = cbind(x,x1)
+ }
+ ## reorder
+ if ("${sort}" == "sort"){
+ cat('\n[',date(),'] Sort intervals\n')
+ for (i in rev(2:length(labels))){
+ x = x[order(x[,i*nbin-nbin/2]>0),]
+ }
+ }
+ png("${out_file1}")
+ ##par(mfrow=c(2,length(labels)),mar=c(1,1,4,1))
+ layout(matrix(seq(2*length(labels)),nrow=2,byrow=T),heights=c(1,5))
+ cat('\n[',date(),'] Plot summary\n')
+ par(mar=c(0,0,4,0)+0.1)
+ for (i in 1:length(labels)){
+ plot(colSums(x[,((i-1)*nbin+1):(i*nbin)]),type='l',axes=F,main=labels[i])
+ }
+ cat('\n[',date(),'] Plot heatmap\n')
+ par(mar=c(0,0,0,0)+0.1)
+ for (i in 1:length(labels)){
+ image(-t(log2(1+x[,((i-1)*nbin+1):(i*nbin)])),axes=F)
+ }
+ dev.off()
+ cat('\n[',date(),'] Finished\n')
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+This tool allows you to check the co-localization pattern of multiple interval sets. All interval sets are aligned to the center of the intervals in the query interval set.
+
+Each row represents a window of certain size around the center of one interval in the query set, such as ChIP peaks. Each heatmap shows the position of other features in the SAME window (the same rows in each heatmap represent the same interval/genomic position).
+
+
+The example below shows that of all Fox2 peaks, half of them are within 1kb of TSS. Of the half outside TSS, about one half has H3K4me1, two thirds of which are further depleted of H3K4me3.
+
+-----
+
+**Example**
+
+.. image:: ./static/images/align2multiple.png
+
+
+
diff -r d325683ec368 -r 16ba480adf96 aligndb-hg18-ensGene.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/aligndb-hg18-ensGene.loc.sample Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,24 @@
+TSS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/ensGene-hg18-TSS-1000+1000-noOverlap.bed
+TES-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/ensGene-hg18-TES-1000+1000-noOverlap.bed
+first5SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/ensGene-hg18-first5SS-1000+1000-noOverlap.bed
+first3SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/ensGene-hg18-first3SS-1000+1000-noOverlap.bed
+internal5SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/ensGene-hg18-internal5SS-1000+1000-noOverlap.bed
+internal3SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/ensGene-hg18-internal3SS-1000+1000-noOverlap.bed
+last5SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/ensGene-hg18-last5SS-1000+1000-noOverlap.bed
+last3SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/ensGene-hg18-last3SS-1000+1000-noOverlap.bed
+single5SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/ensGene-hg18-single5SS-1000+1000-noOverlap.bed
+single3SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/ensGene-hg18-single3SS-1000+1000-noOverlap.bed
+
+TSS /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/ensGene-hg18-TSS-1000+1000.bed
+TES /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/ensGene-hg18-TES-1000+1000.bed
+first5SS /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/ensGene-hg18-first5SS-1000+1000.bed
+first3SS /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/ensGene-hg18-first3SS-1000+1000.bed
+internal5SS /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/ensGene-hg18-internal5SS-1000+1000.bed
+internal3SS /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/ensGene-hg18-internal3SS-1000+1000.bed
+last5SS /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/ensGene-hg18-last5SS-1000+1000.bed
+last3SS /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/ensGene-hg18-last3SS-1000+1000.bed
+single5SS /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/ensGene-hg18-single5SS-1000+1000.bed
+single3SS /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/ensGene-hg18-single3SS-1000+1000.bed
+CDSstart /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/ensGene-hg18-CDSstart-1000+1000.bed
+CDSend /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/ensGene-hg18-CDSend-1000+1000.bed
+
diff -r d325683ec368 -r 16ba480adf96 aligndb-hg18-knownGene.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/aligndb-hg18-knownGene.loc.sample Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,24 @@
+TSS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/knownGene-hg18-TSS-1000+1000-noOverlap.bed
+TES-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/knownGene-hg18-TES-1000+1000-noOverlap.bed
+first5SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/knownGene-hg18-first5SS-1000+1000-noOverlap.bed
+first3SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/knownGene-hg18-first3SS-1000+1000-noOverlap.bed
+internal5SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/knownGene-hg18-internal5SS-1000+1000-noOverlap.bed
+internal3SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/knownGene-hg18-internal3SS-1000+1000-noOverlap.bed
+last5SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/knownGene-hg18-last5SS-1000+1000-noOverlap.bed
+last3SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/knownGene-hg18-last3SS-1000+1000-noOverlap.bed
+single5SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/knownGene-hg18-single5SS-1000+1000-noOverlap.bed
+single3SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/knownGene-hg18-single3SS-1000+1000-noOverlap.bed
+
+TSS /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/knownGene-hg18-TSS-1000+1000.bed
+TES /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/knownGene-hg18-TES-1000+1000.bed
+first5SS /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/knownGene-hg18-first5SS-1000+1000.bed
+first3SS /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/knownGene-hg18-first3SS-1000+1000.bed
+internal5SS /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/knownGene-hg18-internal5SS-1000+1000.bed
+internal3SS /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/knownGene-hg18-internal3SS-1000+1000.bed
+last5SS /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/knownGene-hg18-last5SS-1000+1000.bed
+last3SS /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/knownGene-hg18-last3SS-1000+1000.bed
+single5SS /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/knownGene-hg18-single5SS-1000+1000.bed
+single3SS /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/knownGene-hg18-single3SS-1000+1000.bed
+CDSstart /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/knownGene-hg18-CDSstart-1000+1000.bed
+CDSend /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/knownGene-hg18-CDSend-1000+1000.bed
+
diff -r d325683ec368 -r 16ba480adf96 aligndb-hg18-refGene.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/aligndb-hg18-refGene.loc.sample Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,24 @@
+TSS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/refGene-hg18-TSS-1000+1000-noOverlap.bed
+TES-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/refGene-hg18-TES-1000+1000-noOverlap.bed
+first5SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/refGene-hg18-first5SS-1000+1000-noOverlap.bed
+first3SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/refGene-hg18-first3SS-1000+1000-noOverlap.bed
+internal5SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/refGene-hg18-internal5SS-1000+1000-noOverlap.bed
+internal3SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/refGene-hg18-internal3SS-1000+1000-noOverlap.bed
+last5SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/refGene-hg18-last5SS-1000+1000-noOverlap.bed
+last3SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/refGene-hg18-last3SS-1000+1000-noOverlap.bed
+single5SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/refGene-hg18-single5SS-1000+1000-noOverlap.bed
+single3SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/refGene-hg18-single3SS-1000+1000-noOverlap.bed
+
+TSS /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/refGene-hg18-TSS-1000+1000.bed
+TES /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/refGene-hg18-TES-1000+1000.bed
+first5SS /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/refGene-hg18-first5SS-1000+1000.bed
+first3SS /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/refGene-hg18-first3SS-1000+1000.bed
+internal5SS /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/refGene-hg18-internal5SS-1000+1000.bed
+internal3SS /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/refGene-hg18-internal3SS-1000+1000.bed
+last5SS /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/refGene-hg18-last5SS-1000+1000.bed
+last3SS /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/refGene-hg18-last3SS-1000+1000.bed
+single5SS /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/refGene-hg18-single5SS-1000+1000.bed
+single3SS /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/refGene-hg18-single3SS-1000+1000.bed
+CDSstart /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/refGene-hg18-CDSstart-1000+1000.bed
+CDSend /Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/refGene-hg18-CDSend-1000+1000.bed
+
diff -r d325683ec368 -r 16ba480adf96 aligndb-mm9-ensGene .loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/aligndb-mm9-ensGene .loc.sample Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,24 @@
+TSS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/ensGene-mm9-TSS-1000+1000-noOverlap.bed
+TES-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/ensGene-mm9-TES-1000+1000-noOverlap.bed
+first5SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/ensGene-mm9-first5SS-1000+1000-noOverlap.bed
+first3SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/ensGene-mm9-first3SS-1000+1000-noOverlap.bed
+internal5SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/ensGene-mm9-internal5SS-1000+1000-noOverlap.bed
+internal3SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/ensGene-mm9-internal3SS-1000+1000-noOverlap.bed
+last5SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/ensGene-mm9-last5SS-1000+1000-noOverlap.bed
+last3SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/ensGene-mm9-last3SS-1000+1000-noOverlap.bed
+single5SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/ensGene-mm9-single5SS-1000+1000-noOverlap.bed
+single3SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/ensGene-mm9-single3SS-1000+1000-noOverlap.bed
+
+TSS /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/ensGene-mm9-TSS-1000+1000.bed
+TES /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/ensGene-mm9-TES-1000+1000.bed
+first5SS /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/ensGene-mm9-first5SS-1000+1000.bed
+first3SS /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/ensGene-mm9-first3SS-1000+1000.bed
+internal5SS /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/ensGene-mm9-internal5SS-1000+1000.bed
+internal3SS /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/ensGene-mm9-internal3SS-1000+1000.bed
+last5SS /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/ensGene-mm9-last5SS-1000+1000.bed
+last3SS /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/ensGene-mm9-last3SS-1000+1000.bed
+single5SS /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/ensGene-mm9-single5SS-1000+1000.bed
+single3SS /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/ensGene-mm9-single3SS-1000+1000.bed
+CDSstart /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/ensGene-mm9-CDSstart-1000+1000.bed
+CDSend /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/ensGene-mm9-CDSend-1000+1000.bed
+
diff -r d325683ec368 -r 16ba480adf96 aligndb-mm9-ensGene.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/aligndb-mm9-ensGene.loc.sample Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,24 @@
+TSS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/ensGene-mm9-TSS-1000+1000-noOverlap.bed
+TES-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/ensGene-mm9-TES-1000+1000-noOverlap.bed
+first5SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/ensGene-mm9-first5SS-1000+1000-noOverlap.bed
+first3SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/ensGene-mm9-first3SS-1000+1000-noOverlap.bed
+internal5SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/ensGene-mm9-internal5SS-1000+1000-noOverlap.bed
+internal3SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/ensGene-mm9-internal3SS-1000+1000-noOverlap.bed
+last5SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/ensGene-mm9-last5SS-1000+1000-noOverlap.bed
+last3SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/ensGene-mm9-last3SS-1000+1000-noOverlap.bed
+single5SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/ensGene-mm9-single5SS-1000+1000-noOverlap.bed
+single3SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/ensGene-mm9-single3SS-1000+1000-noOverlap.bed
+
+TSS /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/ensGene-mm9-TSS-1000+1000.bed
+TES /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/ensGene-mm9-TES-1000+1000.bed
+first5SS /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/ensGene-mm9-first5SS-1000+1000.bed
+first3SS /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/ensGene-mm9-first3SS-1000+1000.bed
+internal5SS /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/ensGene-mm9-internal5SS-1000+1000.bed
+internal3SS /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/ensGene-mm9-internal3SS-1000+1000.bed
+last5SS /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/ensGene-mm9-last5SS-1000+1000.bed
+last3SS /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/ensGene-mm9-last3SS-1000+1000.bed
+single5SS /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/ensGene-mm9-single5SS-1000+1000.bed
+single3SS /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/ensGene-mm9-single3SS-1000+1000.bed
+CDSstart /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/ensGene-mm9-CDSstart-1000+1000.bed
+CDSend /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/ensGene-mm9-CDSend-1000+1000.bed
+
diff -r d325683ec368 -r 16ba480adf96 aligndb-mm9-knownGene.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/aligndb-mm9-knownGene.loc.sample Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,24 @@
+TSS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/knownGene-mm9-TSS-1000+1000-noOverlap.bed
+TES-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/knownGene-mm9-TES-1000+1000-noOverlap.bed
+first5SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/knownGene-mm9-first5SS-1000+1000-noOverlap.bed
+first3SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/knownGene-mm9-first3SS-1000+1000-noOverlap.bed
+internal5SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/knownGene-mm9-internal5SS-1000+1000-noOverlap.bed
+internal3SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/knownGene-mm9-internal3SS-1000+1000-noOverlap.bed
+last5SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/knownGene-mm9-last5SS-1000+1000-noOverlap.bed
+last3SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/knownGene-mm9-last3SS-1000+1000-noOverlap.bed
+single5SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/knownGene-mm9-single5SS-1000+1000-noOverlap.bed
+single3SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/knownGene-mm9-single3SS-1000+1000-noOverlap.bed
+
+TSS /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/knownGene-mm9-TSS-1000+1000.bed
+TES /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/knownGene-mm9-TES-1000+1000.bed
+first5SS /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/knownGene-mm9-first5SS-1000+1000.bed
+first3SS /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/knownGene-mm9-first3SS-1000+1000.bed
+internal5SS /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/knownGene-mm9-internal5SS-1000+1000.bed
+internal3SS /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/knownGene-mm9-internal3SS-1000+1000.bed
+last5SS /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/knownGene-mm9-last5SS-1000+1000.bed
+last3SS /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/knownGene-mm9-last3SS-1000+1000.bed
+single5SS /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/knownGene-mm9-single5SS-1000+1000.bed
+single3SS /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/knownGene-mm9-single3SS-1000+1000.bed
+CDSstart /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/knownGene-mm9-CDSstart-1000+1000.bed
+CDSend /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/knownGene-mm9-CDSend-1000+1000.bed
+
diff -r d325683ec368 -r 16ba480adf96 aligndb-mm9-refGene.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/aligndb-mm9-refGene.loc.sample Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,23 @@
+TSS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/refGene-mm9-TSS-1000+1000-noOverlap.bed
+TES-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/refGene-mm9-TES-1000+1000-noOverlap.bed
+first5SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/refGene-mm9-first5SS-1000+1000-noOverlap.bed
+first3SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/refGene-mm9-first3SS-1000+1000-noOverlap.bed
+internal5SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/refGene-mm9-internal5SS-1000+1000-noOverlap.bed
+internal3SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/refGene-mm9-internal3SS-1000+1000-noOverlap.bed
+last5SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/refGene-mm9-last5SS-1000+1000-noOverlap.bed
+last3SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/refGene-mm9-last3SS-1000+1000-noOverlap.bed
+single5SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/refGene-mm9-single5SS-1000+1000-noOverlap.bed
+single3SS-filtered /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/refGene-mm9-single3SS-1000+1000-noOverlap.bed
+
+TSS /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/refGene-mm9-TSS-1000+1000.bed
+TES /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/refGene-mm9-TES-1000+1000.bed
+first5SS /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/refGene-mm9-first5SS-1000+1000.bed
+first3SS /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/refGene-mm9-first3SS-1000+1000.bed
+internal5SS /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/refGene-mm9-internal5SS-1000+1000.bed
+internal3SS /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/refGene-mm9-internal3SS-1000+1000.bed
+last5SS /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/refGene-mm9-last5SS-1000+1000.bed
+last3SS /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/refGene-mm9-last3SS-1000+1000.bed
+single5SS /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/refGene-mm9-single5SS-1000+1000.bed
+single3SS /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/refGene-mm9-single3SS-1000+1000.bed
+CDSstart /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/refGene-mm9-CDSstart-1000+1000.bed
+CDSend /Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation-all/refGene-mm9-CDSend-1000+1000.bed
diff -r d325683ec368 -r 16ba480adf96 alignr.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/alignr.py Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,353 @@
+'''
+the scripts takes two files as input, and compute the coverage of
+features in input 1 across features in input 2. Features in input 2 are
+divided into bins and coverage is computed for each bin.
+
+please check the help information by typing:
+
+ python align.py -h
+
+
+requirement:
+ please install the following tools first:
+ bedtools: for read/region overlapping, http://code.google.com/p/bedtools/
+
+'''
+
+import os,sys,os.path
+from optparse import OptionParser
+
+def lineCount(filename):
+ with open(filename) as f:
+ for i, l in enumerate(f):
+ pass
+ return i + 1
+
+def combineFilename(f1,f2):
+ '''
+ fuse two file names into one
+ '''
+ return f1.split('/')[-1]+'-'+f2.split('/')[-1]
+
+def checkFormat(filename1,filename2,input1format):
+ '''
+ check the format of input files
+ '''
+
+ # file1
+ # read the first line, see how many filds
+ ncol1 = 6
+ if input1format == "BED":
+ f = open(filename1)
+ line = f.readline().strip().split('\t')
+ ncol1 = len(line)
+ if ncol1 < 3:
+ print "ERROR: "+filename1+" has only "+str(ncol1)+" columns (>=3 required). Make sure it has NO header line and is TAB-delimited."
+ sys.exit(1)
+ f.close()
+
+ # file2
+ f = open(filename2)
+ line = f.readline().strip().split('\t')
+ ncol2 = len(line)
+ if ncol2 < 3:
+ print "ERROR: "+filename2+" has only "+str(ncol2)+" columns (>=3 required). Make sure it has NO header line and is TAB-delimited."
+ sys.exit(1)
+
+ return ncol1,ncol2
+
+
+def makeBed(filename,ncol):
+ '''
+ add up to 6 column
+ '''
+ f = open(filename)
+ outfile = filename+'.tmp.bed'
+ outf = open(outfile,'w')
+ if ncol == 3:
+ for line in f:
+ outf.write(line.strip()+'\t.\t0\t+\n')
+ elif ncol == 4:
+ for line in f:
+ outf.write(line.strip()+'\t0\t+\n')
+ if ncol == 5:
+ for line in f:
+ outf.write(line.strip()+'\t+\n')
+ f.close()
+ outf.close()
+ return outfile
+
+def makeWindow(filename,window):
+
+ outfile = filename+'-window='+str(window)+'.tmp.bed'
+ if not os.path.exists(outfile):
+ f=open(filename)
+ out = open(outfile,'w')
+ lines = f.readlines()
+ if 'track' in lines[0]:
+ del lines[0]
+ for line in lines:
+ flds = line.strip().split('\t')
+
+ #new position
+ center = (int(flds[1]) + int(flds[2]))/2
+ start = center - window
+ end = center + window
+ if start >= 0:
+ flds[1] = str(start)
+ flds[2] = str(end)
+ out.write('\t'.join(flds)+'\n')
+ f.close()
+ out.close()
+ return outfile
+
+def groupReadsMapped2aRegion(filename,ncol):
+ '''
+ read output from intersectBED
+ find all reads mapped to each region
+ '''
+ try:
+ f=open(filename)
+ #If filename cannot be opened, print an error message and exit
+ except IOError:
+ print "could not open",filename,"Are you sure this file exists?"
+ sys.exit(1)
+ lines = f.readlines()
+
+ allReadsStart = {}
+ allReadsEnd = {}
+ regionStrand = {}
+ regionStart = {}
+ regionEnd = {}
+
+ for line in lines:
+ flds = line.strip().split('\t')
+ key = '_'.join(flds[ncol:(ncol+4)])
+ if not allReadsStart.has_key(key):
+ allReadsStart[key] = list()
+ allReadsEnd[key] = list()
+ #print flds[ncol+0],flds[ncol+1],flds[ncol+2]
+ allReadsStart[key].append(int(flds[1]))
+ allReadsEnd[key].append(int(flds[2]))
+ regionStrand[key] = flds[ncol+5]
+ regionStart[key] = int(flds[ncol+1])
+ regionEnd[key] = int(flds[ncol+2])
+ return (allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd)
+
+
+def createRegionProfile(allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd,nbins):
+ '''
+ each region is divided into nbins
+ compute the number of reads covering each bin for each region
+ '''
+ RegionProfile = {}
+ nRead = {} # num of all reads in the region
+ for region in allReadsStart.keys():
+ RegionProfile[region] = [0]*nbins
+ nRead[region] = len(allReadsStart[region])
+ #print region,nRead[region],allReadsStart[region]
+ for i in range(nRead[region]):
+ RegionProfile[region] = updateRegionCount(RegionProfile[region],allReadsStart[region][i],allReadsEnd[region][i],regionStart[region],regionEnd[region],regionStrand[region],nbins)
+ return RegionProfile,nRead
+
+def updateRegionCount(RegionCount,readStart,readEnd,regionStart,regionEnd,strand,nbins):
+ '''
+ each region is divided into nbins,
+ add 1 to each bin covered by the read
+ '''
+ L = regionEnd-regionStart
+ start = int(nbins*(readStart-regionStart)/L)
+ end = int(nbins*(readEnd-regionStart)/L)
+ if start < 0:
+ start = 0
+ if end > nbins:
+ end = nbins
+ if strand == '-':
+ for i in range(start,end):
+ RegionCount[nbins-1-i] = RegionCount[nbins-1-i] + 1
+ else: # if the 6th column of the input is not strand, will treat as + strand by default
+ for i in range(start,end):
+ RegionCount[i] = RegionCount[i] + 1
+ return RegionCount
+
+def saveProfile(filename,Profile,nRegion):
+ out = open(filename,'w')
+ for regionType in Profile.keys():
+ #print Profile[regionType]
+ out.write(regionType+'\t'+str(nRegion[regionType])+'\t'+'\t'.join(map(str,Profile[regionType]))+'\n')
+
+def saveSummary(filename,Profile,nbin):
+ out = open(filename+'.summary','w')
+
+ nfeat = len(Profile)
+ summaryprofile = [0]*nbin
+ for regionType in Profile.keys():
+ for i in range(nbin):
+ summaryprofile[i] += Profile[regionType][i]
+ out.write(filename+'\t'+str(nfeat)+'\t'+'\t'.join(map(str,summaryprofile))+'\n')
+ out.close()
+ # calculate standard error
+ out = open(filename+'.standarderror','w')
+ sd = [0.0]*nbin
+ u = [0.0]*nbin
+ for i in range(nbin):
+ u[i] = float(summaryprofile[i])/nfeat
+ for regionType in Profile.keys():
+ sd[i] = sd[i] + (Profile[regionType][i] - u[i])**2
+ sd[i] = sd[i]**0.5 / nfeat
+ out.write(filename+'\t'+str(nfeat)+'\t'+'\t'.join(map(str,sd))+'\n')
+ out.close()
+
+def main():
+ usage = "usage: %prog [options] -a inputA -b inputB"
+ parser = OptionParser(usage)
+ parser.add_option("-a", dest="inputA",
+ help="(required) input file A, interval (first 3 columns are chrN, start and end) or BAM format. The script computes the depth of coverage of features in file A across the features in file B" )
+ parser.add_option("-b",dest="inputB",
+ help="(required) input file B, interval file" )
+ parser.add_option("-f",dest="aformat",default="BED",
+ help="Format of input file A. Can be BED (default) or BAM")
+ parser.add_option("-w",type='int',dest="window",
+ help="Generate new inputB by making a window of 2 x WINDOW bp (in total) flanking the center of each input feature" )
+ parser.add_option("-n", type="int", dest="nbins",default=100,
+ help="number of bins. Features in B are binned, and the coverage is computed for each bin. Default is 100")
+ parser.add_option("-s",action="store_true", dest="strandness",
+ help="enforce strandness: require overlapping on the same strand. Default is off")
+ parser.add_option("-p",action="store_true", dest="plot",default=False,
+ help="load existed intersectBed outputfile")
+ parser.add_option("-q", action="store_true", dest="quiet",default=False,
+ help="suppress output on screen")
+ parser.add_option("-o", dest="output_data",
+ help="(optional) output coverage file (txt) name." )
+ parser.add_option("-v", dest="output_plot",
+ help="(optional) output plot (pdf) file name." )
+ parser.add_option("-l", dest="plot_title", default="",
+ help="(optional) output title of the plot." )
+ parser.add_option("--ylim", dest="ylim", default="min,max",
+ help="(optional) ylim of the plot" )
+ parser.add_option("--summary-only", action="store_true", dest="summary_only",default=False,
+ help="save profile summary only (no data for individual features)")
+ parser.add_option("--compute-se", action="store_true", dest="compute_se",default=False,
+ help="compute and plot standard deviation for each bin. used when --summary-only is on")
+ parser.add_option("--profile-only", action="store_true", dest="profile_only",default=False,
+ help="save profile only (no plot)")
+ parser.add_option("--span", type="float", dest="span",default=0.1,
+ help="loess span smooth parameter, 0.1 ~ 1")
+
+ (options, args) = parser.parse_args()
+
+ if options.inputA == None or options.inputB == None:
+ parser.error("Please specify two input files!!")
+
+ if not options.quiet:
+ print "checking input file format..."
+
+ ncol,ncol2 = checkFormat(options.inputA ,options.inputB,options.aformat)
+
+ if ncol2 < 6:
+ options.inputB = makeBed(options.inputB,ncol2)
+ if not options.quiet:
+ print "fill up 6 columns"
+
+ if options.window > 0:
+ if not options.quiet:
+ print "making windows from "+options.inputB+"..."
+ options.inputB = makeWindow(options.inputB,options.window)
+
+ output = combineFilename(str(options.inputA),str(options.inputB))
+
+ if not options.plot:
+ if options.aformat == "BAM":
+ cmd = "intersectBed -abam "+str(options.inputA)+" -b "+str(options.inputB) + ' -bed -split '
+ else:
+ cmd = "intersectBed -a "+str(options.inputA)+" -b "+str(options.inputB)
+ if options.strandness:
+ cmd = cmd + ' -s'
+ cmd = cmd +" -wo > "+ output+'-intersect.tmp.bed'
+ if not options.quiet:
+ print "search for overlappings: "+cmd
+ status = os.system(cmd)
+ if status != 0:
+ sys.exit(1)
+
+
+ if not options.quiet:
+ print 'group reads mapped to the same region...'
+
+ allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd = groupReadsMapped2aRegion(output+'-intersect.tmp.bed',ncol)
+
+ if len(allReadsStart) == 0:
+ if not options.quiet:
+ print 'no overlap found!!'
+ os.system('rm *tmp.*')
+ sys.exit(1)
+
+ if not options.quiet:
+ print 'count number of reads mapped to each bin...'
+
+ RegionProfile,nRead = createRegionProfile(allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd,options.nbins)
+
+ if options.output_data == None:
+ options.output_data = output+'.txt'
+
+ if options.summary_only:
+ saveSummary(options.output_data,RegionProfile,options.nbins)
+
+ else:
+ saveProfile(options.output_data,RegionProfile,nRead)
+
+ if not options.quiet:
+ print 'results saved to: '+ options.output_data
+
+ if not (options.summary_only or options.profile_only ):
+ # visualize
+
+ if options.window < 1:
+ xlab = 'relative position (bins)'
+ else:
+ xlab = 'relative position (bp)'
+
+ if options.output_plot == None:
+ options.output_plot = output+'.pdf'
+
+ title = options.plot_title+'\n n = '+str(len(RegionProfile))
+
+ rscript = open("tmp.r","w")
+ rscript.write("x <- read.table('"+options.output_data+"')\n")
+ rscript.write("pdf('"+options.output_plot+"')\n")
+ rscript.write("avg <- colSums(x[,3:ncol(x)])/nrow(x)\n")
+ rscript.write("err <- sd(x[,3:ncol(x)])/sqrt(nrow(x))\n")
+
+ if options.window == 0:
+ rscript.write("xticks <- seq("+str(options.nbins)+")\n")
+ else:
+ rscript.write("xticks <- seq("+str(-options.window)+","+str(options.window)+",length.out="+str(options.nbins)+")\n")
+
+ if options.ylim != 'min,max':
+ rscript.write("ylim=c("+options.ylim+")\n")
+ else:
+ rscript.write("ylim=c(min(avg-err),max(avg+err))\n")
+ rscript.write("par(cex=1.5)\n")
+ #smooth
+ if options.span >= 0.1:
+ rscript.write("avg = loess(avg~xticks,span="+str(options.span)+")$fitted\n")
+ rscript.write("err = loess(err~xticks,span="+str(options.span)+")$fitted\n")
+ rscript.write("plot(xticks,avg,ylab='average coverage',main='"+title+"',xlab='"+xlab+"',type='l',lwd=0,ylim=ylim)\n")
+ rscript.write("polygon(c(xticks,rev(xticks)),c(avg+err,rev(avg-err)),col='slateblue1',border=NA)\n")
+ rscript.write("lines(xticks,avg,type='l',lwd=1)\n")
+ #rscript.write("xticks <- barplot(avg,names.arg=seq("+str(options.nbins)+"),ylab='average coverage',main='"+title+"',xlab='"+xlab+"',,ylim=c(min(avg-err),max(avg+err)))\n")
+ #rscript.write("arrows(xticks,avg+err, xticks, avg-err, angle=90, code=3, length=0.0,col='green')\n")
+ #rscript.write("lines(xticks,avg,lwd=2)\n")
+ #rscript.write("lines(xticks,avg-err,col='green')\n")
+ #rscript.write("lines(xticks,avg+err,col='green')\n")
+ rscript.write("dev.off()\n")
+ rscript.close()
+
+ os.system("R --vanilla < tmp.r")
+
+ # remove intermediate output
+ os.system('rm *tmp.*')
+
+
+if __name__ == "__main__":
+ main()
diff -r d325683ec368 -r 16ba480adf96 alignr.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/alignr.xml Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,142 @@
+
+ two interval sets
+ alignr.py -a $inputa -w $windowsize -n $nbins -o $output_data -v $output_plot $stranded -q -l $outputlabel --ylim=$ylim --span $span
+ #if $inputb_source_type.inputb_select == "user":
+ -b "$inputb"
+ #else:
+ -b "${inputb_source_type.selectedb.fields.value}"
+ #end if
+ #if $inputa_format.inputa_select == "BAM":
+ -f BAM
+ #end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool aligns two sets of intervals, finds overlaps, calculates and plots the coverage of the first set across the second set. Applications include:
+
+- check read distribution around TSS/poly A site/splice site/motif site/miRNA target site
+- check relative position/overlap of two lists of ChIP-seq peaks
+
+Two output files are generated. One is the coverage/profile for each interval in input 2. The first two columns are interval ID and the total number of overlapping intervals from input 1. Column 3 to column nbins+2 are coverage at each bin. The other file is an PDF file plotting the average coverage of each bin. To modify the visualization, please downlaod the coverage file and make your own plots.
+
+-----
+
+**Annotated features**
+
+Currently supports mouse genome build mm9 and human hg18. Each interval spans 1000bp upstream and 1000bp downstream of a feature such as TSS. Features with overlapping exons in the intronic/intergenic part of the 2000bp interval are removed.
+
+-----
+
+**Usage**
+
+ -h, --help show this help message and exit
+ -a INPUTA (required) input file A, BED-like (first 3 columns: chr, start, end) or BAM format. The
+ script computes the depth of coverage of features in file
+ A across the features in file B
+ -b INPUTB (required) input file B, BED format or MACS peak file.
+ Requires an unique name for each line in column 4
+ -m inputB is a MACS peak file.
+ -f AFORMAT Format of input file A. Can be BED (default) or BAM
+ -w WINDOW Generate new inputB by making a window of 2 x WINDOW bp
+ (in total) flanking the center of each input feature
+ -n NBINS number of bins. Features in B are binned, and the coverage
+ is computed for each bin. Default is 100
+ -s enforce strandness: require overlapping on the same
+ strand. Default is off
+ -p load existed intersectBed outputfile
+ -q suppress output on screen
+ -o OUTPUTPROFILE (optional) output profile name.
+ -v PLOTFILE (optional) plot file name
+
+
+
diff -r d325683ec368 -r 16ba480adf96 alignvis.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/alignvis.py Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,76 @@
+import sys,os
+
+infile = sys.argv[1]
+outfile = sys.argv[2]
+uselog = sys.argv[3]
+subset = sys.argv[4]
+reorder = sys.argv[5]
+color = sys.argv[6]
+scale = sys.argv[7] # rescale each row
+rscript = open('tmp.r','w')
+
+rscript.write("x <- read.table('"+infile+"')\n")
+rscript.write("nfeat <- nrow(x) \n")
+rscript.write("nbin <- ncol(x) - 2\n")
+rscript.write("totalcov <- x[,2]\n")
+rscript.write("x <- x[,3:ncol(x)]\n")
+
+if subset =='subset':
+ rscript.write("if (nfeat*nbin > 100000) {\n")
+ rscript.write(" nfeat2 <- as.integer(100000/nbin)\n")
+ rscript.write(" subind <- sample(seq(nfeat),nfeat2)\n")
+ rscript.write(" x <- x[subind,]\n")
+ rscript.write(" totalcov <- totalcov[subind]\n")
+ rscript.write("}\n")
+
+rscript.write("pdf('"+outfile+"')\n")
+
+if uselog == 'uselog':
+ rscript.write("x <- -(log(1+as.matrix(x,nc=ncol(x)-2)))\n")
+else:
+ rscript.write("x <- -as.matrix(x,nc=ncol(x)-2)\n")
+if scale == 'scale':
+ rscript.write("x <- scale(x)\n")
+if reorder == 'average':
+ rscript.write("hc <- hclust(dist(x),method= 'average')\n")
+ rscript.write("x <- x[hc$order,]\n")
+elif reorder == 'centroid':
+ rscript.write("hc <- hclust(dist(x),method= 'centroid')\n")
+ rscript.write("x <- x[hc$order,]\n")
+elif reorder == 'complete':
+ rscript.write("hc <- hclust(dist(x),method= 'complete')\n")
+ rscript.write("x <- x[hc$order,]\n")
+elif reorder == 'single':
+ rscript.write("hc <- hclust(dist(x),method= 'single')\n")
+ rscript.write("x <- x[hc$order,]\n")
+elif reorder == 'median':
+ rscript.write("hc <- hclust(dist(x),method= 'median')\n")
+ rscript.write("x <- x[hc$order,]\n")
+elif reorder == 'sort_by_total':
+ rscript.write("srt <- sort(totalcov,index.return=T)\n")
+ rscript.write("x <- x[srt$ix,]\n")
+elif reorder == 'sort_by_center':
+ rscript.write("srt <- sort(x[,as.integer(nbin/2)],index.return=T)\n")
+ rscript.write("x <- x[srt$ix,]\n")
+if color == 'heat':
+ rscript.write("colormap = heat.colors(1000)\n")
+elif color == 'topo':
+ rscript.write("colormap = topo.colors(1000)\n")
+elif color == 'rainbow':
+ rscript.write("colormap = rainbow(1000)\n")
+elif color == 'terrain':
+ rscript.write("colormap = terrain.colors(1000)\n")
+else:
+ rscript.write("colormap = gray.colors(1000)\n")
+
+#rscript.write("qt <- quantile(as.vector(x),probs=c(0.1,0.9))\n")
+#rscript.write("breaks <- c(min(as.vector(x)),seq(qt[1],qt[2],length.out=99),max(as.vector(x)))\n")
+#rscript.write("image(t(x),col=colormap,breaks=breaks,axes=F)\n")
+rscript.write("image(t(x),col=colormap,axes=F)\n")
+rscript.write("dev.off()\n")
+
+rscript.close()
+
+os.system("R --slave < tmp.r")
+os.system("rm tmp.r")
+
diff -r d325683ec368 -r 16ba480adf96 alignvis.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/alignvis.r Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,8 @@
+args <- commandArgs(TRUE)
+x <- read.table(args[1])
+pdf(args[2])
+#visualize the profile with heatmap
+srt <- sort(x[,2],index.return=T) # sort by total number of reads
+image(-t(log(as.matrix(x[srt$ix[1:nrow(x)],3:ncol(x)],nc=ncol(x)-2))),col=gray.colors(100))
+dev.off()
+
diff -r d325683ec368 -r 16ba480adf96 alignvis.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/alignvis.xml Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,42 @@
+
+ of align output
+ alignvis.py $input $output $uselog $subset $reorder $color $scale
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool generates a heatmap for output from 'align' tool. Each row is the color-coded coverage of a feature, and the features are sorted by the total coverage in the interval.
+
+**Example**
+
+.. image:: ./static/operation_icons/heatmap.png
+
+
+
diff -r d325683ec368 -r 16ba480adf96 bedClean.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bedClean.py Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,43 @@
+import sys
+
+def readChrSize(filename):
+ f = open(filename)
+ chrSize = {}
+ for line in f:
+ chrom,size = line.strip().split()
+ chrSize[chrom]=int(size)
+ f.close()
+ return chrSize
+
+def cleanFile(filename,chrSize,outfile):
+ f = open(filename)
+ out = open(outfile,'w')
+ i = 0
+ for line in f:
+ i = i + 1
+ flds = line.strip().split('\t')
+ if len(flds) < 3:
+ print 'line',i,'incomplete line:\n',line
+ elif chrSize.has_key(flds[0]):
+ if int(flds[1]) > int(flds[2]):
+ tmp = flds[1]
+ flds[1] = flds[2]
+ flds[2] = tmp
+ if int( flds[1]) < 0 or int(flds[2]) <0:
+ print 'line',i,'negative coordinates:\n',line
+ elif int(flds[2]) > chrSize[flds[0]]:
+ print 'line',i,'end larger than chr size:\n',line
+ else:
+ if flds[5] == '*':
+ flds[5] = '+'
+ print 'line',i,' strand * changed to +\n', line
+ out.write('\t'.join(flds)+'\n')
+ else:
+ print 'line',i,'chromosome',flds[0],'not found!\n',line
+ f.close()
+ out.close()
+
+if len(sys.argv) < 4:
+ print "python bedClean.py in.bed chrsizefile out.bed"
+ exit()
+cleanFile(sys.argv[1],readChrSize(sys.argv[2]),sys.argv[3])
diff -r d325683ec368 -r 16ba480adf96 bed_to_bam.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bed_to_bam.xml Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,19 @@
+
+ convert BED or GFF or VCF to BAM
+ bedToBam -i $input -g $genome $bed12 $mapq $ubam > $outfile
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r d325683ec368 -r 16ba480adf96 bedclean.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bedclean.xml Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,33 @@
+
+ remove off-chromosome lines
+ bedclean.py $input $genome $output > $log
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**Description**
+
+remove lines that are
+
+1. comment or track name lines
+
+2. on chr*_random
+
+3. or have negative coordinates
+
+4. or the end is larger than chromosome size
+
+
+
+
diff -r d325683ec368 -r 16ba480adf96 bedsort.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bedsort.xml Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,23 @@
+
+ a interval file by chr and start
+ head -n $skip $input > $output
+ && tail -n+`expr $skip + 1` $input | sort -k1,1 -k2,2g >> $output
+
+
+
+
+
+
+
+
+
+
+**Description**
+
+Unix command used::
+
+ head -n $skip $input > $output
+ tail -n+`expr $skip + 1` $input | sort -k1,1 -k2,2g >> $output
+
+
+
diff -r d325683ec368 -r 16ba480adf96 bigWigAverageOverBed.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bigWigAverageOverBed.xml Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,12 @@
+
+ average interval coverage
+ bigWigAverageOverBed $bw $bed $outtab -bedOut=$outbed 2> err
+
+
+
+
+
+
+
+
+
diff -r d325683ec368 -r 16ba480adf96 binaverage.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/binaverage.xml Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,74 @@
+
+ of numeric columns
+ cat $script_file | R --vanilla --slave > $out_log
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ## Setup R error handling to go to stderr
+ options(warn=-1)
+ source("/Users/xuebing/galaxy-dist/tools/mytools/cdf.r")
+ x = read.table("${input}",sep='\t')
+ x = x[,c($data_bin,$data_avg)]
+ label_avg = "${label_avg}"
+ label_bin = "${label_bin}"
+ if ("${log_bin}" == "logbin"){
+ x[,1] = log2(1+x[,1])
+ label_bin = paste('log2',label_bin)
+ }
+ if ("${log_avg}" == "logavg"){
+ x[,2] = log2(1+x[,2])
+ label_avg = paste('log2',label_avg)
+ }
+ res = binaverage(x,$nbin,"${bintype}")
+ attach(res)
+ for (i in 1:${nbin}){
+ print(paste(label_bin,labels[i],sep=':'))
+ print(summary(binned[[i]]))
+ }
+ pdf("${out_file}")
+ mycdf(binned,"${title}",labels,"$legendloc",label_avg,label_bin)
+ dev.off()
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+This tool generates barplot and CDF plot comparing data/rows in a numeric column that are binned by a second numeric column. The input should have at least two numeric columns. One of the column is used to group rows into bins, and then values in the other column are compared using barplot, CDF plot, and KS test.
+
+
+
diff -r d325683ec368 -r 16ba480adf96 binnedAverage.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/binnedAverage.py Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,77 @@
+'''
+get binned score of intervals,allow extension
+'''
+
+import os,sys,numpy,random,string
+
+from resize import *
+
+from bx.bbi.bigwig_file import BigWigFile
+
+def binning(x,n):
+ # make n bin of x
+ y = numpy.zeros(n,dtype=float)
+ if len(x) == 0:
+ return y
+ step = float(len(x))/n
+ for k in range(n):
+ i = int(step*k)
+ j = int(step*(k+1)) + 1
+ y[k] = x[i:j].mean()
+ #print i,j,k,y[k]
+ return y
+
+def getBinnedScore(bwfile,intvfile,outfile,outplot,nbin):
+ '''
+ get binned average and std
+ '''
+ fbw = open(bwfile)
+ bw = BigWigFile(file=fbw)
+ fin = open(intvfile)
+ out = open(outfile,'w')
+ zeros = '\t'.join(['0']*nbin)
+ for line in fin:
+ #chrom,start,end,name,score,strand
+ line = line.strip()
+ flds = line.split('\t')
+ #get the score at base resolution as an array
+ scores = bw.get_as_array(flds[0],int(flds[1]),int(flds[2]))
+ if scores == None:
+ print 'not found:\t',line
+ out.write(line+'\t'+zeros+'\n')
+ continue
+ # reverse if on minus strand
+ if flds[5] == '-':
+ scores = scores[::-1]
+ # no score = 0
+ scores = numpy.nan_to_num(scores)
+ # bin the data
+ binned = binning(scores,nbin)
+ out.write(line+'\t'+'\t'.join(map(str,binned))+'\n')
+ fin.close()
+ out.close()
+ # plot
+ if nbin > 1:
+ tmp = "".join(random.sample(string.letters+string.digits, 8))
+ rscript = open(tmp,"w")
+ rscript.write("options(warn=-1)\n")
+ rscript.write("x <- read.table('"+outfile+"',sep='\t')\n")
+ rscript.write("x <- x[,(ncol(x)+1-"+str(nbin)+"):ncol(x)]\n")
+ rscript.write("pdf('"+outplot+"')\n")
+ rscript.write("avg <- apply(x,2,mean)\n")
+ rscript.write("err <- apply(x,2,sd)/sqrt(nrow(x))\n")
+ rscript.write("print(avg)\n")
+ rscript.write("print(err)\n")
+ rscript.write("ylim=c(min(avg-err),max(avg+err))\n")
+ rscript.write("xticks <- seq(ncol(x))\n")
+ rscript.write("plot(xticks,avg,xlab='',ylab='average',type='l',lwd=0,ylim=ylim)\n")
+ rscript.write("polygon(c(xticks,rev(xticks)),c(avg+err,rev(avg-err)),col='lightgreen',border=NA)\n")
+ rscript.write("lines(xticks,avg,type='l',lwd=1)\n")
+ rscript.write("dev.off()\n")
+ rscript.close()
+ os.system("R --vanilla < "+tmp)
+ os.system("rm "+tmp)
+
+print sys.argv
+prog,bwfile,intvfile,nbin,outfile,outplot = sys.argv
+getBinnedScore(bwfile,intvfile,outfile,outplot,int(nbin))
diff -r d325683ec368 -r 16ba480adf96 bwBinavg.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bwBinavg.xml Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,44 @@
+
+ for intervals
+ getGenomicScore.py $input $output $score_type $bwfile $nbin $strand $outplot $span
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+Each interval is binned and the average base-resolution score/coverage/density in the bigwig file is added as new columns appended at the end of the original file .
+
+**Example**
+
+If your original data has the following format:
+
++-----+-----+---+------+
+|chrom|start|end|other2|
++-----+-----+---+------+
+
+and you choose to divide each interval into 3 bins and return the mean scores of each bin, your output will look like this:
+
++-----+-----+---+------+-----+-----+-----+
+|chrom|start|end|other2|mean1|mean2|mean3|
++-----+-----+---+------+-----+-----+-----+
+
+
+
+
diff -r d325683ec368 -r 16ba480adf96 closestBed.py
diff -r d325683ec368 -r 16ba480adf96 closestBed.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/closestBed.xml Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,32 @@
+
+ find closest features
+ closestBed -a $inputa -b $inputb $strandness -d $no -t $tie> $output_data
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This is a wrapper for closestBed.
+
+
+
+
diff -r d325683ec368 -r 16ba480adf96 collapseBed.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/collapseBed.py Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,58 @@
+'''
+collapse intervals
+'''
+
+def collapseInterval_strand(filename):
+ uniqintv = {}
+ data = {}
+ f = open(filename)
+ header = f.readline()
+ if 'chr' in header:
+ flds = header.strip().split('\t')
+ key = '\t'.join([flds[0],flds[1],flds[2],flds[5]])
+ uniqintv[key] = 1
+ data[key] = flds
+ for line in f:
+ flds = line.strip().split('\t')
+ key = '\t'.join([flds[0],flds[1],flds[2],flds[5]])
+ if uniqintv.has_key(key):
+ uniqintv[key] = uniqintv[key] + 1
+ else:
+ uniqintv[key] = 1
+ data[key] = flds
+ f.close()
+ for key in uniqintv.keys():
+ print '\t'.join(data[key]+[str(uniqintv[key])])
+ #flds = key.split('\t')
+ #print '\t'.join([flds[0],flds[1],flds[2],'.',str(uniqintv[key]),flds[3]])
+
+def collapseInterval(filename):
+ uniqintv = {}
+ data = {}
+ f = open(filename)
+ header = f.readline()
+ if 'chr' in header:
+ flds = header.strip().split('\t')
+ key = '\t'.join([flds[0],flds[1],flds[2]])
+ uniqintv[key] = 1
+ data[key] = flds
+ for line in f:
+ flds = line.strip().split('\t')
+ key = '\t'.join([flds[0],flds[1],flds[2]])
+ if uniqintv.has_key(key):
+ uniqintv[key] = uniqintv[key] + 1
+ else:
+ uniqintv[key] = 1
+ data[key] = flds
+ f.close()
+ for key in uniqintv.keys():
+ print '\t'.join(data[key]+[str(uniqintv[key])])
+ #flds = key.split('\t')
+ #print '\t'.join([flds[0],flds[1],flds[2],'.',str(uniqintv[key])])
+
+import sys
+
+if sys.argv[2] == 'strand':
+ collapseInterval_strand(sys.argv[1])
+else:
+ collapseInterval(sys.argv[1])
diff -r d325683ec368 -r 16ba480adf96 collapseBed.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/collapseBed.xml Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,19 @@
+
+ intervals
+ collapseBed2.py $input $strand $score > $outfile
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool collapses genomic intervals that have the same position (and strandness if specified) and output a set of unique intervals.
+
+
+
diff -r d325683ec368 -r 16ba480adf96 collapseBed2.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/collapseBed2.py Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,36 @@
+'''
+collapse intervals
+'''
+
+def collapseInterval_strand(filename,c_strand,c_score):
+ # keeping max column c
+ uniqintv = {}
+ data = {}
+ f = open(filename)
+ header = f.readline()
+ if 'chr' in header:
+ flds = header.strip().split('\t')
+ key = '\t'.join([flds[0],flds[1],flds[2],flds[c_strand]])
+ uniqintv[key] = float(flds[c_score])
+ data[key] = flds
+ for line in f:
+ flds = line.strip().split('\t')
+ key = '\t'.join([flds[0],flds[1],flds[2],flds[c_strand]])
+ if not uniqintv.has_key(key):
+ uniqintv[key] = float(flds[c_score])
+ data[key] = flds
+ elif uniqintv[key] < float(flds[c_score]):
+ uniqintv[key] = float(flds[c_score])
+ data[key] = flds
+
+ f.close()
+ for key in uniqintv.keys():
+ print '\t'.join(data[key])
+
+import sys
+
+if sys.argv[2] == '0':#ignore strand
+ sys.argv[2] = 1
+if sys.argv[3] == '0':# ignore score
+ sys.argv[3] = 2
+collapseInterval_strand(sys.argv[1],int(sys.argv[2])-1,int(sys.argv[3])-1)
diff -r d325683ec368 -r 16ba480adf96 collapseTab.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/collapseTab.py Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,37 @@
+'''
+collapse tabular files, with key columns, and max columns
+'''
+
+def collapseTab(filename,c_key,c_max):
+ # keeping rows with max value in column c_max
+ nCol = max(max(c_key),c_max)
+ c_max = c_max - 1
+ for i in range(len(c_key)):
+ c_key[i] = c_key[i] - 1
+ uniqintv = {}
+ data = {}
+ f = open(filename)
+ for line in f:
+ flds = line.strip().split('\t')
+ if len(flds) < nCol:
+ continue
+ key = ''
+ for i in c_key:
+ key = key + flds[i-1] # i is 1-based, python is 0-based
+ if not uniqintv.has_key(key):
+ uniqintv[key] = float(flds[c_max])
+ data[key] = flds
+ elif uniqintv[key] < float(flds[c_max]):
+ uniqintv[key] = float(flds[c_max])
+ data[key] = flds
+
+ f.close()
+ for key in uniqintv.keys():
+ print '\t'.join(data[key])
+
+import sys
+
+# convert string to number list
+c_key = map(int,sys.argv[2].split(','))
+c_max = int(sys.argv[3])
+collapseTab(sys.argv[1],c_key,c_max)
diff -r d325683ec368 -r 16ba480adf96 collapseTab.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/collapseTab.xml Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,19 @@
+
+ files
+ collapseTab.py $input $key $max > $outfile
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+Similar to 'Group' but returns the entire line.
+
+
+
diff -r d325683ec368 -r 16ba480adf96 endbias.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/endbias.py Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,52 @@
+'''
+usage:
+
+python endbias.py utr5-coverage utr3-coverage outputfile
+'''
+import sys,math
+
+def getCoverage(filename):
+ f = open(filename)
+ coverage = {}
+ for line in f:
+ flds = line.strip().split('\t')
+ score = float(flds[4])
+ name = (flds[0].split('utr'))[0].strip('_')
+ if coverage.has_key(name):
+ if score > coverage[name]:
+ coverage[name] = score
+ else:
+ coverage[name] = score
+ return coverage
+
+def endBias(filename,utr5,utr3):
+ out = open(filename,'w')
+ for txpt in utr5.keys():
+ if utr3.has_key(txpt):
+ out.write('\t'.join([txpt,str(utr5[txpt]),str(utr3[txpt]),str(math.log((1+utr5[txpt])/(1+utr3[txpt]),2))])+'\n')
+ out.close()
+
+
+utr5 = getCoverage(sys.argv[1])
+utr3 = getCoverage(sys.argv[2])
+endBias(sys.argv[3],utr5,utr3)
+
+'''
+
+utr5 = getCoverage('hmga2-utr5.coverage')
+utr3 = getCoverage('hmga2-utr3.coverage')
+logratio, cov5,cov3= endBias(utr5,utr3)
+2**pylab.median(logratio.values())
+
+log2utr5 = pylab.log2(pylab.array(cov5)+1)
+log2utr3 = pylab.log2(pylab.array(cov3)+1)
+
+pylab.plot(log2utr5,log2utr3,'bo')
+
+pylab.show()
+
+utr5 = getCoverage('control-utr5.coverage')
+utr3 = getCoverage('control-utr3.coverage')
+logratio, cov5,cov3= endBias(utr5,utr3)
+2**pylab.median(logratio.values())
+'''
diff -r d325683ec368 -r 16ba480adf96 endbias.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/endbias.xml Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,11 @@
+
+ of UTR coverage
+ endbias.py $input1 $input2 $output
+
+
+
+
+
+
+
+
diff -r d325683ec368 -r 16ba480adf96 genomeView.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/genomeView.xml Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,108 @@
+
+ plot and correlation
+ cat $script_file | R --vanilla --slave 2> err.log
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ## Setup R error handling to go to stderr
+ options(warn=-1)
+ source("/Users/xuebing/galaxy-dist/tools/mytools/genomeview.r")
+ genome = read.table( "${genome}")
+ uselog = as.character("${log}")
+ union = as.character("${union}")
+ resolution = as.integer("${resolution}")
+ cat('resolution=',resolution,'\n')
+ offset = caloffset(genome)
+ mcov = matrix(ncol=1,nrow=as.integer(offset[length(offset)] / resolution))
+ ## Open output PDF file
+ pdf( "${out_file1}" ,height=4,width=20)
+ labels = character(0)
+ ## Determine range of all series in the plot
+ #for $i, $s in enumerate( $series )
+ x = read.table( "${s.input.file_name}" )
+ res = coverage(x,genome,offset,resolution)
+ plotcov(res,genome,offset,"${s.label.value}",uselog)
+ labels = c(labels,"${s.label.value}")
+ attach(res)
+ mcov = cbind(mcov,cov)
+ detach(res)
+ #end for
+ dev.off()
+ pdf("${out_file2}")
+ mcov = mcov[,-1]
+ nSample = length(labels)
+ if (nSample > 1) {
+ if (union == 'union') {
+ cm = matrix(0,nrow=nSample,ncol=nSample)
+ for (i in 1:(nSample-1)) {
+ cm[i,i] = 1
+ for (j in (i+1):nSample){
+ cm[i,j] = union_correlation(mcov[,i],mcov[,j])
+ cm[j,i] = cm[i,j]
+ }
+ }
+ cm[nSample,nSample] = 1
+ } else {
+ cm = cor(mcov)
+ }
+ rm(mcov)
+ ##heatmap(-cm,margins=c(8,8),sym=T,scale='none',labRow=labels,labCol=labels)
+ ##heatmap2(cm,'none',TRUE,c(8,8),labels,labels)
+ x = cm
+ h = heatmap(-x,scale='none',sym=T,margins=c(8,8),labRow=labels,labRol=labels)
+ attach(h)
+ x = x[rowInd,colInd]
+ tx = numeric(0)
+ ty = numeric(0)
+ txt = character(0)
+ for (i in 1:nrow(x)){
+ for (j in 1:ncol(x)){
+ tx = c(tx,i)
+ ty = c(ty,ncol(x)-j+1)
+ txt = c(txt,round(x[i,j]*100)/100)
+ }
+ }
+ heatmap(-x,scale='none',sym=T,margins=c(8,8),labRow=labels[rowInd],labCol=labels[colInd],add.expr=text(tx,ty,txt,col='black'))
+ library(gplots)
+ heatmap.2(cm,margins=c(8,8),scale='none',key=TRUE,trace='none', symkey=T,symbreaks=T,col=bluered,labRow=labels,labCol=labels,symm=T)
+ }
+ dev.off()
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+This tool allows you to plot multiple intervals across all chromosomes at different resolution, and it also plots the correlation matrix if multiple intervals are provided.
+
+-----
+
+**Example**
+
+.. image:: ./static/images/correlationmatrix.png
+.. image:: ./static/images/wholegenome.png
+
+
+
diff -r d325683ec368 -r 16ba480adf96 genomeview-old2.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/genomeview-old2.r Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,52 @@
+
+caloffset = function(genome){
+ total_len = sum(as.numeric(genome[,2]))
+ offset = 0
+ for (i in 1:nrow(genome)) {
+ offset = c(offset,offset[i]+genome[i,2])
+ }
+ offset
+}
+
+coverage = function(intervals,genome,offset,resolution) {
+
+ nChr = length(offset) - 1
+ total_len = offset[nChr+1]
+ nbin = as.integer(total_len / resolution)
+ #cat('nbin=',nbin,'genomelen=',total_len,'\n')
+ cov = numeric(nbin)#coverage
+ col = numeric(nbin)#color
+ for (i in 1:nChr) {
+ d = x[x[,1]==as.character(genome[i,1]),2:3]
+ if (nrow(d) > 0){
+ #cat('dim(d)=',dim(d),'\n')
+ d = ceiling((d+offset[i])*nbin/total_len)
+ for (j in 1:nrow(d)){
+ cov[d[j,1]:d[j,2]] = cov[d[j,1]:d[j,2]] + 1
+ }
+ }
+ col[ceiling(offset[i]*nbin/total_len):ceiling(offset[i]*nbin/total_len)] = i
+ }
+ list(nbin=nbin,cov=cov)
+}
+
+# plot coverage
+# res = genomeView(x,genome,100000)
+plotcov = function(res,genome,offset,title,uselog) {
+ if (uselog == 'log'){
+ res$cov = log10(res$cov+1)
+ }
+ ymax = max(res$cov)
+ par(mar=c(5,5,5,1))
+ plot(seq(length(res$cov)),res$cov,type='h',cex=0.1,cex.axis=2,cex.lab=2,cex.main=3,col=res$col,xaxt='n',main=title,xlab='chromosome',ylab='coverage',frame.plot=F,ylim=c(0,ymax))
+ xticks = numeric(nrow(genome))
+ for (i in 1:nrow(genome)){
+ xticks[i] = (offset[i]+offset[i+1])/2*res$nbin/offset[length(offset)]
+ }
+ mtext(genome[,1],side=1,at=xticks,adj=1,las=2,col=seq(nrow(genome)))
+}
+
+union_correlation = function(x,y){
+ z = x>0 | y>0
+ cor(x[z],y[z])
+}
diff -r d325683ec368 -r 16ba480adf96 genomeview.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/genomeview.r Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,70 @@
+
+caloffset = function(genome){
+ total_len = sum(as.numeric(genome[,2]))
+ offset = 0
+ for (i in 1:nrow(genome)) {
+ offset = c(offset,offset[i]+genome[i,2])
+ }
+ offset
+}
+
+coverage = function(intervals,genome,offset,resolution) {
+
+ nChr = length(offset) - 1
+ total_len = offset[nChr+1]
+ nbin = as.integer(total_len / resolution)
+ cov = numeric(nbin)#coverage
+ col = numeric(nbin)#color
+ for (i in 1:nChr) {
+ d = x[x[,1]==as.character(genome[i,1]),2:3]
+ d = ceiling(((d[,1]+d[,2])/2+offset[i])*nbin/total_len)
+ t = table(d)
+ pos = as.numeric(row.names(t))
+ cov[pos] = cov[pos] + as.numeric(t)
+ col[pos] = i
+ }
+ list(nbin=nbin,cov=cov,col=col)
+}
+
+# plot coverage
+# res = genomeView(x,genome,100000)
+plotcov = function(res,genome,offset,title,uselog) {
+ if (uselog == 'log'){
+ res$cov = log10(res$cov+1)
+ }
+ ymax = max(res$cov)
+ #print(ymax)
+ par(mar=c(5,5,5,1))
+ plot(seq(length(res$cov)),res$cov,type='h',cex=0.1,cex.axis=2,cex.lab=2,cex.main=3,col=res$col,xaxt='n',main=title,xlab='chromosome',ylab='coverage',frame.plot=F,ylim=c(0,ymax))
+ xticks = numeric(nrow(genome))
+ for (i in 1:nrow(genome)){
+ xticks[i] = (offset[i]+offset[i+1])/2*res$nbin/offset[length(offset)]
+ }
+ mtext(genome[,1],side=1,at=xticks,adj=1,las=2,col=seq(nrow(genome)))
+}
+
+union_correlation = function(x,y){
+ z = x>0 | y>0
+ cor(x[z],y[z])
+}
+
+
+heatmap2 = function(x,scale,sym,margins,labRow,labCol){
+ h = heatmap(x,scale=scale,sym=sym,margins=margins,labRow=labRow,labCol=labCol)
+ x = x[h$rowInd,h$colInd]
+ tx = numeric(0)
+ ty = numeric(0)
+ txt = character(0)
+ for (i in 1:nrow(x)){
+ for (j in 1:ncol(x)){
+ tx <- c(tx,i)
+ ty <- c(ty,ncol(x)-j+1)
+ txt <- c(txt,format(x[i,j],digits=2,nsmall=2))
+ }
+ }
+ #heatmap(x,scale=scale,sym=sym,margins=margins,labRow=labRow[h$rowInd],labCol=labCol[h$colInd],add.expr=text(1:4,1:4,1:4))
+ cat(dim(tx))
+ text(tx,ty,txt)
+ heatmap(x,scale=scale,sym=sym,margins=margins,labRow=labRow[h$rowInd],labCol=labCol[h$colInd],add.expr=text(tx,ty,txt))
+
+}
diff -r d325683ec368 -r 16ba480adf96 genomeview_notused
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/genomeview_notused Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,45 @@
+
+caloffset = function(genome){
+ total_len = sum(as.numeric(genome[,2]))
+ offset = 0
+ for (i in 1:nrow(genome)) {
+ offset = c(offset,offset[i]+genome[i,2])
+ }
+ offset
+}
+
+coverage = function(intervals,genome,offset,resolution) {
+
+ nChr = length(offset) - 1
+ total_len = offset[nChr+1]
+ nbin = as.integer(total_len / resolution)
+
+ pos = numeric(0)
+ cov = numeric(0)#coverage
+ col = numeric(0)#color
+ for (i in 1:nChr) {
+ d = x[x[,1]==as.character(genome[i,1]),2:3]
+ d = ceiling(((d[,1]+d[,2])/2+offset[i])*nbin/total_len)
+ t = table(d)
+ pos = c(pos,as.numeric(row.names(t)))
+ cov = c(cov, as.numeric(t))
+ col = c(col,numeric(length(t))+i)
+ }
+ list(nbin=nbin,pos=pos,cov=cov,col=col)
+}
+
+# plot coverage
+# res = genomeView(x,genome,100000)
+plotcov = function(res,genome,offset,title,uselog) {
+ if (uselog == 'log'){
+ res$cov = log10(res$cov+1)
+ }
+ ymax = max(res$cov)
+ par(mar=c(5,5,5,1))
+ plot(res$pos,res$cov,type='h',cex=0.1,cex.axis=2,cex.lab=2,cex.main=3,col=res$col,xaxt='n',main=title,xlab='chromosome',ylab='coverage',frame.plot=F,xlim=c(0,res$nbin),ylim=c(0,ymax))
+ xticks = numeric(nrow(genome))
+ for (i in 1:nrow(genome)){
+ xticks[i] = (offset[i]+offset[i+1])/2*res$nbin/offset[length(offset)]
+ }
+ mtext(genome[,1],side=1,at=xticks,adj=1,las=2,col=seq(nrow(genome)))
+}
diff -r d325683ec368 -r 16ba480adf96 getGenomicScore.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/getGenomicScore.py Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,78 @@
+import random,string,os,sys
+
+
+def getScore(intvfile,outfile,summary_type,bwfilepath,nbin,strand,outplot,span):
+ f = open(intvfile)
+ tmpsh = "".join(random.sample(string.letters+string.digits, 8))
+ tmpout = "".join(random.sample(string.letters+string.digits, 8))
+ tmp = open(tmpsh,'w')
+ if os.path.isdir(bwfilepath):
+ for line in f:
+ flds = line.strip().split('\t')
+ cmd = 'bigWigSummary -type='+summary_type+' '+bwfilepath+'/'+flds[0]+'.bw '+flds[0]+' '+flds[1]+' '+flds[2]+' '+nbin+' >> '+tmpout+' 2>>'+tmpout+'\n'
+ tmp.write(cmd)
+ else:
+ for line in f:
+ flds = line.strip().split('\t')
+ cmd = 'bigWigSummary -type='+summary_type+' '+bwfilepath+' '+flds[0]+' '+flds[1]+' '+flds[2]+' '+nbin+' >> '+tmpout+' 2>>'+tmpout+'\n'
+ tmp.write(cmd)
+ f.close()
+ # remove blank lines
+ tmp.write("sed '/^$/d' "+tmpout+'>'+tmpout+".1\n")
+ tmp.write("sed '/^Can/d' "+tmpout+".1 >"+tmpout+".2\n")
+ # set n/a to 0
+ tmp.write("sed 's/n\/a/0/g' "+tmpout+".2 >"+tmpout+".3\n")
+ # replace text with 0
+ zeros = ''.join(['0\t']*int(nbin))
+ tmp.write("sed 's/^[a-zA-Z]/"+zeros+"/' "+tmpout+".3 >"+tmpout+".4\n")
+ # cut the first nbin columns
+ tmp.write("cut -f 1-"+nbin+" "+tmpout+".4 > "+tmpout+".5\n")
+ tmp.write("paste "+intvfile+" "+tmpout+".5 >"+outfile+"\n")
+ tmp.close()
+ os.system('chmod +x '+tmpsh)
+ os.system('./'+tmpsh)
+ #os.system('rm '+tmpout+'*')
+ #os.system('rm '+tmpsh)
+
+ # strandness: need to reverse bins for - strand
+ if nbin > 1 and strand > 0:
+ strand = strand - 1 # python is 0 based
+ os.system('mv '+outfile+' '+tmpout)
+ f = open(tmpout)
+ out = open(outfile,'w')
+ for line in f:
+ flds=line.strip().split('\t')
+ if flds[strand] == '+':
+ out.write(line)
+ else:
+ scores = flds[-int(nbin):]
+ scores.reverse()
+ flds = flds[:-int(nbin)]+scores
+ out.write('\t'.join(flds)+'\n')
+ os.system('rm '+tmpout)
+ f.close()
+ out.close()
+ # plot
+ if int(nbin) > 1:
+ rscript = open(tmpsh,"w")
+ rscript.write("options(warn=-1)\n")
+ rscript.write("x <- read.table('"+outfile+"',sep='\t')\n")
+ rscript.write("x <- x[,(ncol(x)+1-"+nbin+"):ncol(x)]\n")
+ rscript.write("pdf('"+outplot+"')\n")
+ rscript.write("avg <- apply(x,2,mean)\n")
+ rscript.write("err <- apply(x,2,sd)/sqrt(nrow(x))\n")
+ rscript.write("ylim=c(min(avg-err),max(avg+err))\n")
+ rscript.write("xticks <- seq(ncol(x))-(1+ncol(x))/2\n")
+ if span >= 0.1:
+ rscript.write("avg = loess(avg~xticks,span="+str(span)+")$fitted\n")
+ rscript.write("err = loess(err~xticks,span="+str(span)+")$fitted\n")
+ rscript.write("par(cex=1.5)\n")
+ rscript.write("plot(xticks,avg,ylab='average conservation score',xlab='relative position (bin)',type='l',lwd=0,ylim=ylim)\n")
+ rscript.write("polygon(c(xticks,rev(xticks)),c(avg+err,rev(avg-err)),col='slateblue1',border=NA)\n")
+ rscript.write("lines(xticks,avg,type='l',lwd=1)\n")
+ rscript.write("dev.off()\n")
+ rscript.close()
+ os.system("R --vanilla < "+tmpsh)
+ os.system("rm "+tmpsh)
+
+getScore(sys.argv[1],sys.argv[2],sys.argv[3],sys.argv[4],sys.argv[5],int(sys.argv[6]),sys.argv[7],float(sys.argv[8]))
diff -r d325683ec368 -r 16ba480adf96 intersectSig.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/intersectSig.py Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,90 @@
+'''
+find overlap and test signifiance
+'''
+
+import os,sys
+
+def lineCount(filename):
+ if os.stat(filename).st_size == 0:
+ return 0
+ with open(filename) as f:
+ for i, l in enumerate(f):
+ pass
+ print i
+ return i+1
+
+def intersect(fileA,fileB,outfile,fraction,reciprocal):
+ # return fileA intervals that overlap with interval in fileB
+ cmd = 'intersectBed -a '+fileA+' -b '+fileB + ' -u -wa -f '+fraction +' '+ reciprocal + '>'+outfile
+ #print cmd
+ os.system(cmd)
+
+def shuffle(fileA,fileB,genomefile,fraction,reciprocal,N):
+ # shuffle fileA N times, return the distribution of overlaps
+ nOverlap = []
+ for i in range(N):
+ # shuffle fileA using shuffleBed
+ #cmd = 'shuffleBed -i '+fileA+' -g '+genomefile +'>fileA.shuffled'
+ # using random_interval.py
+ cmd = 'python /Users/xuebing/galaxy-dist/tools/mytools/random_interval.py '+fileA+' fileA.shuffled across '+genomefile
+ os.system(cmd)
+ intersect('fileA.shuffled',fileB,'tmp',fraction,reciprocal)
+ nOverlap.append(lineCount('tmp'))
+ os.system('rm tmp')
+ os.system('rm fileA.shuffled')
+ return nOverlap
+
+def main():
+ fileA = sys.argv[1]
+ fileB = sys.argv[2]
+ outfile = sys.argv[3]
+ outplot = sys.argv[4]
+ outshuffle = sys.argv[5]
+ N = int(sys.argv[6]) # times to shuffle
+ genomefile = sys.argv[7]
+ fraction = sys.argv[8]
+ if len(sys.argv) == 10:
+ reciprocal = sys.argv[9] # can only be '-r'
+ else:
+ reciprocal = ''
+
+ #print sys.argv
+
+ # number of lines in input
+ nA = lineCount(fileA)
+ nB = lineCount(fileB)
+
+ # intersect on real data
+ intersect(fileA,fileB,outfile,fraction,reciprocal)
+ # number of overlaps
+ nOverlapReal = lineCount(outfile)
+
+ #print 'number of intervals in inputA that overlap with intervals in inputB:',nOverlapReal
+
+ # shuffle fileA to estimate background
+ nOverlapNull = shuffle(fileA,fileB,genomefile,fraction,reciprocal,N)
+ out = open(outshuffle,'w')
+ out.write("\t".join(map(str,nOverlapNull)))
+ out.close()
+
+ # plot histogram
+ rscript = open('tmp.r','w')
+ rscript.write("options(warn=-1)\n")
+ rscript.write("x0 <- "+str(nOverlapReal)+"\n")
+ rscript.write("x <- c("+','.join(map(str,nOverlapNull))+")\n")
+ rscript.write("library(MASS)\n")
+ rscript.write("pv <- min((1+sum(x>=x0))/length(x),(1+sum(x<=x0))/length(x))\n")
+ rscript.write("title <- paste('actual:chance = ',x0,':',format(mean(x),digits=1,nsmall=1),' = ',format(x0/mean(x),digits=1,nsmall=2),', p-value < ',pv,sep='')\n")
+ rscript.write("pdf('"+outplot+"')\n")
+ rscript.write("library(grid)\n")
+ rscript.write("library(VennDiagram)\n")
+ rscript.write("venn <- venn.diagram(x=list(A=1:"+str(nA)+",B="+str(nA-nOverlapReal+1)+":"+str(nA+nB-nOverlapReal)+"),filename=NULL,fill=c('red','blue'),col='transparent',alpha=0.5,label.col='black',cex=3,lwd=0,fontfamily='serif',fontface='bold',cat.col = c('red', 'blue'),cat.cex=3,cat.fontfamily='serif',cat.fontface='bold')\n")
+ rscript.write("grid.draw(venn)\n")
+ rscript.write("h <- hist(x,breaks=50,xlab='number of overlaps',ylab='frequency',main=title)\n")
+ rscript.write("plot(h$mids,h$counts,type='h',xlim=c(min(h$mids,x0),max(x0,h$mids)),ylim=c(0,max(h$counts)),xlab='number of overlaps',ylab='frequency',main=title)\n")
+ rscript.write("points(x0,0,col='red')\n")
+ rscript.write("dev.off()\n")
+ rscript.close()
+ os.system("R --vanilla < tmp.r")
+ os.system('rm tmp.r')
+main()
diff -r d325683ec368 -r 16ba480adf96 intersectSig.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/intersectSig.xml Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,30 @@
+
+ of two interval lists
+ intersectSig.py $fileA $fileB $outfile $outplot $outshuffle $n $genome $fraction $reciprocal
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool uses intersectBed to find intervals in the first dataset that overlap with intervals in the second dataset. To estimate the significance of the overlap, the first dataset is shuffled then intersect with the second dataset to generate a null distribution of the number of overlaps. The tool returns venn diagram plot, histogram of the null distribution, overlapped intervals from the first input, and the null distribution of overlaps.
+
+
+
+
diff -r d325683ec368 -r 16ba480adf96 intersectbed.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/intersectbed.xml Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,103 @@
+
+ intersect two interval sets
+ intersectBed -a $inputa -b $inputb $output_opt $strandness $r -f $f $split > $output_data
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This is a wrapper for intersecBed.
+
+ Program: intersectBed (v2.13.3)
+ Author: Aaron Quinlan (aaronquinlan@gmail.com)
+ Summary: Report overlaps between two feature files.
+
+Usage::
+
+ intersectBed [OPTIONS] -a (bed/gff/vcf) -b (bed/gff/vcf)
+
+Options::
+ -abam The A input file is in BAM format. Output will be BAM as well.
+
+ -ubam Write uncompressed BAM output. Default is to write compressed BAM.
+
+ -bed When using BAM input (-abam), write output as BED. The default
+ is to write output in BAM when using -abam.
+
+ -wa Write the original entry in A for each overlap.
+
+ -wb Write the original entry in B for each overlap.
+ - Useful for knowing _what_ A overlaps. Restricted by -f and -r.
+
+ -wo Write the original A and B entries plus the number of base
+ pairs of overlap between the two features.
+ - Overlaps restricted by -f and -r.
+ Only A features with overlap are reported.
+
+ -wao Write the original A and B entries plus the number of base
+ pairs of overlap between the two features.
+ - Overlapping features restricted by -f and -r.
+ However, A features w/o overlap are also reported
+ with a NULL B feature and overlap = 0.
+
+ -u Write the original A entry _once_ if _any_ overlaps found in B.
+ - In other words, just report the fact >=1 hit was found.
+ - Overlaps restricted by -f and -r.
+
+ -c For each entry in A, report the number of overlaps with B.
+ - Reports 0 for A entries that have no overlap with B.
+ - Overlaps restricted by -f and -r.
+
+ -v Only report those entries in A that have _no overlaps_ with B.
+ - Similar to "grep -v" (an homage).
+
+ -f Minimum overlap required as a fraction of A.
+ - Default is 1E-9 (i.e., 1bp).
+ - FLOAT (e.g. 0.50)
+
+ -r Require that the fraction overlap be reciprocal for A and B.
+ - In other words, if -f is 0.90 and -r is used, this requires
+ that B overlap 90% of A and A _also_ overlaps 90% of B.
+
+ -s Require same strandedness. That is, only report hits in B that
+ overlap A on the _same_ strand.
+ - By default, overlaps are reported without respect to strand.
+
+ -S Require different strandedness. That is, only report hits in B that
+ overlap A on the _opposite_ strand.
+ - By default, overlaps are reported without respect to strand.
+
+ -split Treat "split" BAM or BED12 entries as distinct BED intervals.
+
+ -sorted Use the "chromsweep" algorithm for sorted (-k1,1 -k2,2n) input
+ NOTE: this will trust, but not enforce that data is sorted. Caveat emptor.
+
+
+
+
diff -r d325683ec368 -r 16ba480adf96 intervalOverlap.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/intervalOverlap.py Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,82 @@
+'''
+find overlap and test signifiance
+'''
+
+import os,sys
+
+def lineCount(filename):
+ i = 0
+ with open(filename) as f:
+ for i, l in enumerate(f):
+ pass
+ return i + 1
+
+def intersect(fileA,fileB,outfile,fraction,reciprocal):
+ # return fileA intervals that overlap with interval in fileB
+ cmd = 'intersectBed -a '+fileA+' -b '+fileB + ' --wo -f '+fraction +' '+ reciprocal + '>'+outfile
+ #print cmd
+ os.system(cmd)
+
+def parseIntersect(filename):
+ # get number of overlapped A and B
+ nA = 0
+ nB = 0
+ return nA,nb
+
+def shuffle(fileA,fileB,genomefile,fraction,reciprocal,N):
+ # shuffle fileA N times, return the distribution of overlaps
+ nOverlap = []
+ for i in range(N):
+ # shuffle fileA using shuffleBed
+ #cmd = 'shuffleBed -i '+fileA+' -g '+genomefile +'>fileA.shuffled'
+ # using random_interval.py
+ cmd = 'python /Users/xuebing/galaxy-dist/tools/mytools/random_interval.py '+fileA+' fileA.shuffled across '+genomefile
+ os.system(cmd)
+ intersect('fileA.shuffled',fileB,'tmp',fraction,reciprocal)
+ nOverlap.append(lineCount('tmp'))
+ os.system('rm tmp')
+ os.system('rm fileA.shuffled')
+ return nOverlap
+
+def main():
+ fileA = sys.argv[1]
+ fileB = sys.argv[2]
+ outfile = sys.argv[3]
+ outplot = sys.argv[4]
+ N = int(sys.argv[5]) # times to shuffle
+ genomefile = sys.argv[6]
+ fraction = sys.argv[7]
+ if len(sys.argv) == 9:
+ reciprocal = sys.argv[8] # can only be '-r'
+ else:
+ reciprocal = ''
+
+ print sys.argv
+
+ # intersect on real data
+ intersect(fileA,fileB,outfile,fraction,reciprocal)
+ # number of overlaps
+ nOverlapReal = lineCount(outfile)
+
+ print 'number of intervals in inputA that overlap with intervals in inputB:',nOverlapReal
+
+ # shuffle fileA to estimate background
+ nOverlapNull = shuffle(fileA,fileB,genomefile,fraction,reciprocal,N)
+
+ # plot histogram
+ rscript = open('tmp.r','w')
+ rscript.write("x0 <- "+str(nOverlapReal)+"\n")
+ rscript.write("x <- c("+','.join(map(str,nOverlapNull))+")\n")
+ rscript.write("library(MASS)\n")
+ rscript.write("\n")
+ rscript.write("pv <- min((1+sum(x>=x0))/length(x),(1+sum(x<=x0))/length(x))\n")
+ rscript.write("title <- paste('actual:chance = ',x0,':',round(mean(x)),' = ',format(x0/mean(x),digits=1,nsmall=2),', p-value < ',pv,sep='')\n")
+ rscript.write("pdf('"+outplot+"')\n")
+ rscript.write("h <- hist(x,breaks=50,xlab='number of overlaps',ylab='frequency',main=title)\n")
+ rscript.write("plot(h$mids,h$counts,type='h',xlim=c(min(h$mids,x0),max(x0,h$mids)),ylim=c(0,max(h$counts)),xlab='number of overlaps',ylab='frequency',main=title)\n")
+ rscript.write("points(x0,0,col='red')\n")
+ rscript.write("dev.off()")
+ rscript.close()
+ os.system("R --vanilla < tmp.r")
+ os.system('rm tmp.r')
+main()
diff -r d325683ec368 -r 16ba480adf96 intervalSize.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/intervalSize.py Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,18 @@
+'''
+plot histogram of interval size
+'''
+
+import os,sys
+
+inputfile = sys.argv[1]
+outputfile = sys.argv[2]
+
+rf = open('tmp.r','w')
+rf.write("x <- read.table('"+inputfile+"')\n")
+rf.write("len <- x[,3]-x[,2]\n")
+rf.write("pdf('"+outputfile+"')\n")
+rf.write("hist(len,breaks=100,xlab='interval size',main=paste('mean=',mean(len),sep=''))\n")
+rf.write("dev.off()")
+rf.close()
+os.system("R --vanilla < tmp.r")
+os.system('rm tmp.r')
diff -r d325683ec368 -r 16ba480adf96 intervalSize.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/intervalSize.xml Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,17 @@
+
+ distribution
+ intervalSize.py $input $output
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool generates a histogram of the interval size.
+
+
+
diff -r d325683ec368 -r 16ba480adf96 makebigwig.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/makebigwig.sh Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,57 @@
+# use of output: move to public_html and visualize in ucsc genome browser with the following:
+# track name="xxx" color=0,0,255 type=bigWig bigDataUrl=http://rous.mit.edu/~wuxbl/xxx.bw
+
+if [ $# -lt 6 ]
+then
+ echo "./bigwig.sh infile outtag bam/bed sorted/none genome strand/none [-split]"
+ exit
+fi
+
+f=$1
+outf=$2
+extension=$3
+sorted=$4
+genome=$5
+strand=$6
+split=$7
+i=i
+if [ $extension = bam ]
+then
+ i=ibam
+ if [ $sorted != sorted ]
+ then
+ echo 'sorting bam file...=>' $f.sorted.bam
+ samtools sort $f $f.sorted
+ f=$f.sorted.bam
+ fi
+else
+ if [ $sorted != sorted ]
+ then
+ echo 'sorting bed file...=>' $f.sorted.bed
+ sort -k1,1 $f > $f.sorted.bed
+ f=$f.sorted.bed
+ fi
+fi
+
+ echo 'making bedgraph file...=>' $f.bedgraph
+ if [ $strand != strand ]
+ then
+ genomeCoverageBed -bg -$i $f -g $genome $split > $f.bedgraph
+ echo 'making bigwig file...=>' $outf.bw
+ bedGraphToBigWig $f.bedgraph $genome $outf
+ else
+ genomeCoverageBed -bg -$i $f -g $genome $split -strand + > $f+.bedgraph
+ genomeCoverageBed -bg -$i $f -g $genome $split -strand - > $f-.bedgraph
+ echo 'making bigwig file for + strand...=>' $outf+.bw
+ bedGraphToBigWig $f+.bedgraph $genome $outf+.bw
+ echo 'making bigwig file for - strand...=>' $outf-.bw
+ bedGraphToBigWig $f-.bedgraph $genome $outf-.bw
+ fi
+
+# remove intermediate files
+if [ $sorted != sorted ]
+ then
+ rm $f
+fi
+rm $f*.bedgraph
+
diff -r d325683ec368 -r 16ba480adf96 makebigwig.sh-old
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/makebigwig.sh-old Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,59 @@
+# make bigwig file for genome browser visulization
+# usage
+# makebigwig.sh bedorbam sorted genome strand -split
+# input file types: *.bed, *.bam
+
+# use of output: move to public_html and visualize in ucsc genome browser with the following:
+# track name="xxx" color=0,0,255 type=bigWig bigDataUrl=http://rous.mit.edu/~wuxbl/xxx.bw
+
+if [ $# -lt 6 ]
+then
+ echo "./makebigwig.sh infile outfile bedorbam sorted genome [-split -strand]"
+ exit
+fi
+
+f=$1
+outf=$2
+extension=$3
+sorted=$4
+genome=$5
+strand=$6
+split=$7
+i=i
+echo 'genome:' $genome
+echo 'strand:' $strand
+
+if [ $extension = bam ]
+then
+ i=ibam
+ if [ $sorted != sorted ]
+ then
+ echo 'sorting bam file...=>' $f.sorted.bam
+ samtools sort $f $f.sorted
+ f=$f.sorted.bam
+ fi
+else
+ if [ $sorted != sorted ]
+ then
+ echo 'sorting bed file...=>' $f.sorted.bed
+ sort -k1,1 -k2,2g $f > $f.sorted.bed
+ f=$f.sorted.bed
+ fi
+fi
+
+ echo 'making bedgraph file...=>' $f.bedgraph
+ if [ $strand != strand ]
+ then
+ genomeCoverageBed -bg -$i $f -g $genome $split > $f.bedgraph
+ echo 'making bigwig file...=>' $f.bw
+ bedGraphToBigWig $f.bedgraph $genome $outf
+ else
+ genomeCoverageBed -bg -$i $f -g $genome $split -strand + > $f+.bedgraph
+ genomeCoverageBed -bg -$i $f -g $genome $split -strand - > $f-.bedgraph
+ echo 'making bigwig file for + strand...' $f+.bw
+ bedGraphToBigWig $f+.bedgraph $genome $outf+
+ echo 'making bigwig file for - strand...=>' $f-.bw
+ bedGraphToBigWig $f-.bedgraph $genome $outf-
+ fi
+ rm $f
+
diff -r d325683ec368 -r 16ba480adf96 makebigwig.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/makebigwig.xml Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,37 @@
+
+ from BED or BAM
+ makebigwig.sh $input $outfile
+ #if $inputa_format.bedorbam == "bed":
+ bed
+ #else:
+ bam
+ #end if
+ $sorted $genome none $split >$log 2> $log
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r d325683ec368 -r 16ba480adf96 makewindow.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/makewindow.py Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,19 @@
+def makeWindow(filename,outfile,window):
+ window = window/2
+ f=open(filename)
+ out = open(outfile,'w')
+ for line in f:
+ flds = line.strip().split()
+ #new position
+ center = (int(flds[1]) + int(flds[2]))/2
+ start = center - window
+ end = center + window
+ if start >= 0:
+ flds[1] = str(start)
+ flds[2] = str(end)
+ out.write('\t'.join(flds)+'\n')
+ f.close()
+ out.close()
+
+import sys
+makeWindow(sys.argv[1],sys.argv[2],int(sys.argv[3]))
diff -r d325683ec368 -r 16ba480adf96 makewindow.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/makewindow.xml Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,18 @@
+
+ around interval center
+ makewindow.py $input $output $window
+
+
+
+
+
+
+
+
+
+**Description**
+
+For each interval in the input file, take the middle point, then extend each side windowsize/2 bps.
+
+
+
diff -r d325683ec368 -r 16ba480adf96 metaintv.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/metaintv.py Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,109 @@
+'''
+get binned score of intervals,allow extension
+'''
+
+import os,sys,numpy
+
+from resize import *
+
+from bx.bbi.bigwig_file import BigWigFile
+
+def binning(x,n):
+ # make n bin of x
+ y = numpy.zeros(n,dtype=float)
+ if len(x) == 0:
+ return y
+ step = float(len(x))/n
+ for k in range(n):
+ i = int(step*k)
+ j = int(step*(k+1)) + 1
+ y[k] = x[i:j].mean()
+ #print i,j,k,y[k]
+ return y
+
+def getBinnedScore(bwfile,intvfile,nbin):
+ '''
+ get binned average and std
+ '''
+ fbw = open(bwfile)
+ bw = BigWigFile(file=fbw)
+ fin = open(intvfile)
+ avg = numpy.zeros(nbin)
+ sqr = numpy.zeros(nbin)
+ N = 0
+ for line in fin:
+ #chrom,start,end,name,score,strand
+ flds = line.strip().split('\t')
+ #get the score at base resolution as an array
+ scores = bw.get_as_array(flds[0],int(flds[1]),int(flds[2]))
+ if scores == None:
+ print 'not found:\t',line
+ continue
+ N = N + 1
+ #print line,scores
+ # reverse if on minus strand
+ if flds[5] == '-':
+ scores = scores[::-1]
+ # no score = 0
+ scores = numpy.nan_to_num(scores)
+ # bin the data
+ binned = binning(scores,nbin)
+ avg = avg + binned
+ sqr = sqr + binned**2
+ # compute avg and std
+ avg = avg / N
+ err = ((sqr/N-avg**2)**0.5)/(N**0.5)
+ return avg,err
+
+def getExtendedBinScore(bwfile,intvfile,nbins,exts):
+ '''
+ nbins: n1,n2,n3
+ exts: l1,l2,l3,l4
+ '''
+ # make left extension
+ resize(intvfile,intvfile+'.tmp','start-'+str(exts[0]),'start+'+str(exts[1]),'stranded')
+ # compute binned average
+ avg,err = getBinnedScore(bwfile,intvfile+'.tmp',nbins[0])
+ # make center region
+ resize(intvfile,intvfile+'.tmp','start+'+str(exts[1]),'end-'+str(exts[2]),'stranded')
+ # compute binned average
+ avg1,err1 = getBinnedScore(bwfile,intvfile+'.tmp',nbins[1])
+ avg = numpy.concatenate((avg,avg1))
+ err = numpy.concatenate((err,err1))
+ # make right region
+ resize(intvfile,intvfile+'.tmp','end-'+str(exts[2]),'end+'+str(exts[3]),'stranded')
+ # compute binned average
+ avg2,err2 = getBinnedScore(bwfile,intvfile+'.tmp',nbins[2])
+ avg = numpy.concatenate((avg,avg2))
+ err = numpy.concatenate((err,err2))
+
+ return avg,err
+
+print sys.argv
+prog,bwfile,intvfile,nbin,outfile,outplot = sys.argv
+avg, err = getBinnedScore(bwfile,intvfile,int(nbin))
+out = open(outfile,'w')
+numpy.savetxt(out, avg, fmt='%.6f', delimiter=' ', newline=' ')
+out.write('\n')
+numpy.savetxt(out, err, fmt='%.6f', delimiter=' ', newline=' ')
+out.write('\n')
+out.close()
+
+# plot
+rscript = open("tmp.r","w")
+rscript.write("options(warn=-1)\n")
+rscript.write("x <- read.table('"+outfile+"')\n")
+rscript.write("pdf('"+outplot+"')\n")
+rscript.write("avg <- x[1,]\n")
+rscript.write("err <- x[2,]\n")
+rscript.write("print(x)\n")
+rscript.write("ylim=c(min(avg-err),max(avg+err))\n")
+rscript.write("xticks <- seq(ncol(x))\n")
+rscript.write("plot(xticks,avg,xlab='',ylab='average coverage',type='l',lwd=0,ylim=ylim)\n")
+rscript.write("polygon(c(xticks,rev(xticks)),c(avg+err,rev(avg-err)),col='lightgreen',border=NA)\n")
+rscript.write("lines(xticks,avg,type='l',lwd=1)\n")
+rscript.write("dev.off()\n")
+rscript.close()
+os.system("R --vanilla < tmp.r")
+os.system("rm tmp.r")
+
diff -r d325683ec368 -r 16ba480adf96 metaintv.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/metaintv.xml Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,36 @@
+
+ from bigwig
+ binnedAverage.py $bwfile $intvfile $nbin $outfile $outplot
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+Each interval is binned and the average base-resolution score/coverage/density in the bigwig file is added as new columns appended at the end of the original file .
+
+**Example**
+
+If your original data has the following format:
+
++-----+-----+---+------+
+|chrom|start|end|other2|
++-----+-----+---+------+
+
+and you choose to divide each interval into 3 bins and return the mean scores of each bin, your output will look like this:
+
++-----+-----+---+------+-----+-----+-----+
+|chrom|start|end|other2|mean1|mean2|mean3|
++-----+-----+---+------+-----+-----+-----+
+
+
+
+
diff -r d325683ec368 -r 16ba480adf96 metaintv2.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/metaintv2.py Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,109 @@
+'''
+get binned score of intervals,allow extension
+'''
+
+import os,sys,numpy
+
+from resize import *
+
+from bx.bbi.bigwig_file import BigWigFile
+
+def binning(x,n):
+ # make n bin of x
+ y = numpy.zeros(n,dtype=float)
+ if len(x) == 0:
+ return y
+ step = float(len(x))/n
+ for k in range(n):
+ i = int(step*k)
+ j = int(step*(k+1)) + 1
+ y[k] = x[i:j].mean()
+ #print i,j,k,y[k]
+ return y
+
+def getBinnedScore(bwfile,intvfile,nbin):
+ '''
+ get binned average and std
+ '''
+ fbw = open(bwfile)
+ bw = BigWigFile(file=fbw)
+ fin = open(intvfile)
+ avg = numpy.zeros(nbin)
+ sqr = numpy.zeros(nbin)
+ N = 0
+ for line in fin:
+ #chrom,start,end,name,score,strand
+ flds = line.strip().split('\t')
+ #get the score at base resolution as an array
+ scores = bw.get_as_array(flds[0],int(flds[1]),int(flds[2]))
+ if scores == None:
+ print 'not found:\t',line
+ continue
+ N = N + 1
+ #print line,scores
+ # reverse if on minus strand
+ if flds[5] == '-':
+ scores = scores[::-1]
+ # no score = 0
+ scores = numpy.nan_to_num(scores)
+ # bin the data
+ binned = binning(scores,nbin)
+ avg = avg + binned
+ sqr = sqr + binned**2
+ # compute avg and std
+ avg = avg / N
+ err = ((sqr/N-avg**2)**0.5)/(N**0.5)
+ return avg,err
+
+def getExtendedBinScore(bwfile,intvfile,nbins,exts):
+ '''
+ nbins: n1,n2,n3
+ exts: l1,l2,l3,l4
+ '''
+ # make left extension
+ resize(intvfile,intvfile+'.tmp','start-'+str(exts[0]),'start+'+str(exts[1]),'stranded')
+ # compute binned average
+ avg,err = getBinnedScore(bwfile,intvfile+'.tmp',nbins[0])
+ # make center region
+ resize(intvfile,intvfile+'.tmp','start+'+str(exts[1]),'end-'+str(exts[2]),'stranded')
+ # compute binned average
+ avg1,err1 = getBinnedScore(bwfile,intvfile+'.tmp',nbins[1])
+ avg = numpy.concatenate((avg,avg1))
+ err = numpy.concatenate((err,err1))
+ # make right region
+ resize(intvfile,intvfile+'.tmp','end-'+str(exts[2]),'end+'+str(exts[3]),'stranded')
+ # compute binned average
+ avg2,err2 = getBinnedScore(bwfile,intvfile+'.tmp',nbins[2])
+ avg = numpy.concatenate((avg,avg2))
+ err = numpy.concatenate((err,err2))
+
+ return avg,err
+
+print sys.argv
+bwfile,intvfile,exts,nbins,outfile,outplot = sys.argv
+avg, err = getExtendedBinScore(bwfile,intvfile,numpy.fromstring(nbins,sep=','),numpy.fromstring(exts,sep=','))
+out = open(outfile,'w')
+numpy.savetxt(out, avg, fmt='%.6f', delimiter=' ', newline=' ')
+out.write('\n')
+numpy.savetxt(out, err, fmt='%.6f', delimiter=' ', newline=' ')
+out.write('\n')
+out.close()
+
+# plot
+rscript = open("tmp.r","w")
+rscript.write("options(warn=-1)\n")
+rscript.write("x <- read.table('"+outfile+"')\n")
+rscript.write("pdf('"+outplot+"')\n")
+rscript.write("avg <- x[1,]\n")
+rscript.write("err <- x[2,]\n")
+rscript.write("print(x)\n")
+rscript.write("ylim=c(min(avg-err),max(avg+err))\n")
+rscript.write("xticks <- seq(ncol(x))\n")
+rscript.write("plot(xticks,avg,ylab='average coverage',type='l',lwd=0,ylim=ylim)\n")
+rscript.write("polygon(c(xticks,rev(xticks)),c(avg+err,rev(avg-err)),col='lightgreen',border=NA)\n")
+rscript.write("lines(xticks,avg,type='l',lwd=1)\n")
+rscript.write("dev.off()\n")
+rscript.close()
+os.system("R --vanilla < tmp.r")
+os.system("rm tmp.r")
+
diff -r d325683ec368 -r 16ba480adf96 metaintv3.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/metaintv3.py Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,109 @@
+'''
+get binned score of intervals,allow extension
+'''
+
+import os,sys,numpy
+
+from resize import *
+
+from bx.bbi.bigwig_file import BigWigFile
+
+def binning(x,n):
+ # make n bin of x
+ y = numpy.zeros(n,dtype=float)
+ if len(x) == 0:
+ return y
+ step = float(len(x))/n
+ for k in range(n):
+ i = int(step*k)
+ j = int(step*(k+1)) + 1
+ y[k] = x[i:j].mean()
+ #print i,j,k,y[k]
+ return y
+
+def getBinnedScore(bwfile,intvfile,nbin):
+ '''
+ get binned average and std
+ '''
+ fbw = open(bwfile)
+ bw = BigWigFile(file=fbw)
+ fin = open(intvfile)
+ avg = numpy.zeros(nbin)
+ sqr = numpy.zeros(nbin)
+ N = 0
+ for line in fin:
+ #chrom,start,end,name,score,strand
+ flds = line.strip().split('\t')
+ #get the score at base resolution as an array
+ scores = bw.get_as_array(flds[0],int(flds[1]),int(flds[2]))
+ if scores == None:
+ print 'not found:\t',line
+ continue
+ N = N + 1
+ #print line,scores
+ # reverse if on minus strand
+ if flds[5] == '-':
+ scores = scores[::-1]
+ # no score = 0
+ scores = numpy.nan_to_num(scores)
+ # bin the data
+ binned = binning(scores,nbin)
+ avg = avg + binned
+ sqr = sqr + binned**2
+ # compute avg and std
+ avg = avg / N
+ err = ((sqr/N-avg**2)**0.5)/(N**0.5)
+ return avg,err
+
+def getExtendedBinScore(bwfile,intvfile,nbins,exts):
+ '''
+ nbins: n1,n2,n3
+ exts: l1,l2,l3,l4
+ '''
+ # make left extension
+ resize(intvfile,intvfile+'.tmp','start-'+str(exts[0]),'start+'+str(exts[1]),'stranded')
+ # compute binned average
+ avg,err = getBinnedScore(bwfile,intvfile+'.tmp',nbins[0])
+ # make center region
+ resize(intvfile,intvfile+'.tmp','start+'+str(exts[1]),'end-'+str(exts[2]),'stranded')
+ # compute binned average
+ avg1,err1 = getBinnedScore(bwfile,intvfile+'.tmp',nbins[1])
+ avg = numpy.concatenate((avg,avg1))
+ err = numpy.concatenate((err,err1))
+ # make right region
+ resize(intvfile,intvfile+'.tmp','end-'+str(exts[2]),'end+'+str(exts[3]),'stranded')
+ # compute binned average
+ avg2,err2 = getBinnedScore(bwfile,intvfile+'.tmp',nbins[2])
+ avg = numpy.concatenate((avg,avg2))
+ err = numpy.concatenate((err,err2))
+
+ return avg,err
+
+print sys.argv
+bwfile,intvfile,exts,nbins,outfile,outplot = sys.argv
+avg, err = getExtendedBinScore(bwfile,intvfile,numpy.fromstring(nbins,sep=','),numpy.fromstring(exts,sep=','))
+out = open(outfile,'w')
+numpy.savetxt(out, avg, fmt='%.6f', delimiter=' ', newline=' ')
+out.write('\n')
+numpy.savetxt(out, err, fmt='%.6f', delimiter=' ', newline=' ')
+out.write('\n')
+out.close()
+
+# plot
+rscript = open("tmp.r","w")
+rscript.write("options(warn=-1)\n")
+rscript.write("x <- read.table('"+outfile+"')\n")
+rscript.write("pdf('"+outplot+"')\n")
+rscript.write("avg <- x[1,]\n")
+rscript.write("err <- x[2,]\n")
+rscript.write("print(x)\n")
+rscript.write("ylim=c(min(avg-err),max(avg+err))\n")
+rscript.write("xticks <- seq(ncol(x))\n")
+rscript.write("plot(xticks,avg,ylab='average coverage',type='l',lwd=0,ylim=ylim)\n")
+rscript.write("polygon(c(xticks,rev(xticks)),c(avg+err,rev(avg-err)),col='lightgreen',border=NA)\n")
+rscript.write("lines(xticks,avg,type='l',lwd=1)\n")
+rscript.write("dev.off()\n")
+rscript.close()
+os.system("R --vanilla < tmp.r")
+os.system("rm tmp.r")
+
diff -r d325683ec368 -r 16ba480adf96 metaintv_ext.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/metaintv_ext.py Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,128 @@
+'''
+get binned score of intervals,allow extension
+'''
+
+import os,sys,numpy
+import string, random
+
+from resize import *
+
+from bx.bbi.bigwig_file import BigWigFile
+
+def binning(x,n):
+ # make n bin of x
+ y = numpy.zeros(n,dtype=float)
+ if len(x) == 0:
+ return y
+ step = float(len(x))/n
+ for k in range(n):
+ i = int(step*k)
+ j = int(step*(k+1)) + 1
+ y[k] = x[i:j].mean()
+ #print i,j,k,y[k]
+ return y
+
+def getBinnedScore(bwfile,intvfile,nbin):
+ '''
+ get binned average and std
+ '''
+ fbw = open(bwfile)
+ bw = BigWigFile(file=fbw)
+ fin = open(intvfile)
+ avg = numpy.zeros(nbin)
+ sqr = numpy.zeros(nbin)
+ N = 0
+ for line in fin:
+ #print N
+ #chrom,start,end,name,score,strand
+ flds = line.strip().split('\t')
+ #get the score at base resolution as an array
+ scores = bw.get_as_array(flds[0],int(flds[1]),int(flds[2]))
+ if scores == None:
+ print 'not found:\t',N,line
+ continue
+ N = N + 1
+ #print line,scores
+ # reverse if on minus strand
+ if flds[5] == '-':
+ scores = scores[::-1]
+ # no score = 0
+ scores = numpy.nan_to_num(scores)
+ # bin the data
+ binned = binning(scores,nbin)
+ avg = avg + binned
+ sqr = sqr + binned**2
+ # compute avg and std
+ avg = avg / N
+ err = ((sqr/N-avg**2)**0.5)/(N**0.5)
+ return avg,err,N
+
+def getExtendedBinScore(bwfile,intvfile,nbins,exts):
+ '''
+ nbins: n1,n2,n3
+ exts: l1,l2,l3,l4
+ '''
+ avg = []
+ err = []
+ tmpfile = "".join(random.sample(string.letters+string.digits, 8))
+ if exts[0]>0 or exts[1]>0:
+ print 'make left extension'
+ resize(intvfile,tmpfile,'start-'+str(exts[0]),'start+'+str(exts[1]),'stranded')
+ print 'compute binned average'
+ avg,err,N = getBinnedScore(bwfile,tmpfile,nbins[0])
+ print 'regions used:',N
+ print 'make center region'
+ resize(intvfile,tmpfile,'start+'+str(exts[1]),'end-'+str(exts[2]),'stranded')
+ print 'compute binned average'
+ avg1,err1,N = getBinnedScore(bwfile,tmpfile,nbins[1])
+ print 'regions used:',N
+ avg = numpy.concatenate((avg,avg1))
+ err = numpy.concatenate((err,err1))
+ if exts[2]>0 or exts[3]>0:
+ print 'make right region'
+ resize(intvfile,tmpfile,'end-'+str(exts[2]),'end+'+str(exts[3]),'stranded')
+ print 'compute binned average'
+ avg2,err2,N = getBinnedScore(bwfile,tmpfile,nbins[2])
+ print 'regions used:',N
+ avg = numpy.concatenate((avg,avg2))
+ err = numpy.concatenate((err,err2))
+ os.system('rm '+tmpfile)
+ return avg,err
+
+prog,bwfile,intvfile,exts,nbins,outfile,outplot = sys.argv
+nbins = numpy.fromstring(nbins,dtype=int,sep=',')
+exts = numpy.fromstring(exts,dtype=int,sep=',')
+avg, err = getExtendedBinScore(bwfile,intvfile,nbins,exts)
+print 'save data'
+out = open(outfile,'w')
+numpy.savetxt(out, avg, fmt='%.6f', delimiter=' ', newline=' ')
+out.write('\n')
+numpy.savetxt(out, err, fmt='%.6f', delimiter=' ', newline=' ')
+out.write('\n')
+out.close()
+
+print 'plot'
+start = exts[0]*nbins[0]/(exts[0]+exts[1])
+end = nbins[0]+nbins[1]+exts[2]*nbins[2]/(exts[2]+exts[3])
+#print start,end
+rscript = open("tmp.r","w")
+rscript.write("options(warn=-1)\n")
+rscript.write("x <- read.table('"+outfile+"')\n")
+rscript.write("pdf('"+outplot+"')\n")
+rscript.write("avg <- x[1,]\n")
+rscript.write("err <- x[2,]\n")
+#rscript.write("print(x)\n")
+rscript.write("ylim=c(min(avg-err),max(avg+err))\n")
+rscript.write("xticks <- seq(ncol(x))\n")
+#rscript.write("print(xticks)\n")
+rscript.write("plot(xticks,avg,xlab='',ylab='average coverage',type='l',lwd=0,ylim=ylim,xaxt='n')\n")
+rscript.write("axis(1, at=c(min(xticks),"+str(start)+","+str(end)+",max(xticks)),labels=c(-"+str(exts[0])+",0,0,"+str(exts[3])+"), las=2)\n")
+rscript.write("polygon(c(xticks,rev(xticks)),c(avg+err,rev(avg-err)),col='lightgreen',border=NA)\n")
+rscript.write("lines(xticks,avg,type='l',lwd=1)\n")
+rscript.write("lines(c(min(xticks),max(xticks)),c(0,0),lwd=2)\n")
+rscript.write("lines(c("+str(start)+","+str(end)+"),c(0,0),lwd=10)\n")
+rscript.write("dev.off()\n")
+rscript.close()
+os.system("R --vanilla --slave < tmp.r")
+os.system("rm tmp.r")
+
diff -r d325683ec368 -r 16ba480adf96 metaintv_ext.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/metaintv_ext.xml Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,23 @@
+
+ from bigwig (allow extension)
+ metaintv_ext.py $bwfile $intvfile $exts $nbins $outfile $outplot > $outlog
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+To be added
+
+
+
diff -r d325683ec368 -r 16ba480adf96 phastCons.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/phastCons.xml Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,48 @@
+
+ phastCons or phyloP,vertebrate30way
+ getGenomicScore.py $input $output $score_type $score_path $nbin $strand $outplot $span
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+The score for each interval is added as a new column appended at the end of the original file .
+
+**Example**
+
+If your original data has the following format:
+
++-----+-----+---+------+
+|chrom|start|end|other2|
++-----+-----+---+------+
+
+and you choose to return the mean of phastCons scores, your output will look like this:
+
++-----+-----+---+------+----+
+|chrom|start|end|other2|mean|
++-----+-----+---+------+----+
+
+
+
+
diff -r d325683ec368 -r 16ba480adf96 random_interval.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/random_interval.py Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,96 @@
+'''
+simulate a random interval set that mimics the size and strand of a reference set
+'''
+
+def inferSizeFromRefBed(filename):
+ '''
+ read reference interval set, get chrom size information
+ '''
+ chrSize = {}
+ f = open(filename)
+ for line in f:
+ flds = line.strip().split('\t')
+ if not chrSize.has_key(flds[0]):
+ chrSize[flds[0]] = int(flds[2])
+ elif chrSize[flds[0]] < int(flds[2]):
+ chrSize[flds[0]] = int(flds[2])
+ f.close()
+ return chrSize
+
+def getChrSize(filename):
+ chrSize = {}
+ f = open(filename)
+ for line in f:
+ flds = line.strip().split()
+ if len(flds) >1:
+ chrSize[flds[0]] = int(flds[1])
+ f.close()
+ return chrSize
+
+def makeWeightedChrom(chrSize):
+ '''
+ make a list of chr_id, the freq is proportional to its length
+ '''
+
+ genome_len = 0
+
+ for chrom in chrSize:
+ chrom_len = chrSize[chrom]
+ genome_len += chrom_len
+
+ weighted_chrom = []
+ for chrom in chrSize:
+ weight = int(round(1000*float(chrSize[chrom])/genome_len))
+ weighted_chrom += [chrom]*weight
+
+ return weighted_chrom
+
+def randomIntervalWithinChrom(infile,outfile,chrSize):
+ '''
+ '''
+ fin = open(infile)
+ fout = open(outfile,'w')
+ n = 0
+ for line in fin:
+ n = n + 1
+ flds = line.strip().split('\t')
+ interval_size = int(flds[2]) - int(flds[1])
+ flds[1] = str(random.randint(0,chrSize[flds[0]]-interval_size))
+ flds[2] = str(int(flds[1])+interval_size)
+ fout.write('\t'.join(flds)+'\n')
+ fin.close()
+ fout.close()
+
+def randomIntervalAcrossChrom(infile,outfile,chrSize,weighted_chrom):
+ '''
+ '''
+ fin = open(infile)
+ fout = open(outfile,'w')
+ n = 0
+ for line in fin:
+ n = n + 1
+ flds = line.strip().split('\t')
+ interval_size = int(flds[2]) - int(flds[1])
+ # find a random chrom
+ flds[0] = weighted_chrom[random.randint(0, len(weighted_chrom) - 1)]
+ flds[1] = str(random.randint(0,chrSize[flds[0]]-interval_size))
+ flds[2] = str(int(flds[1])+interval_size)
+ fout.write('\t'.join(flds)+'\n')
+ fin.close()
+ fout.close()
+
+import sys,random
+def main():
+ # python random_interval.py test100.bed testout.bed across human.hg18.genome
+
+ reference_interval_file = sys.argv[1]
+ output_file = sys.argv[2]
+ across_or_within_chrom = sys.argv[3] # within or across
+ chrom_size_file = sys.argv[4]
+ chrSize = getChrSize(chrom_size_file)
+ print chrSize.keys()
+ if across_or_within_chrom == 'within':
+ randomIntervalWithinChrom(reference_interval_file,output_file,chrSize)
+ else:
+ randomIntervalAcrossChrom(reference_interval_file,output_file,chrSize,makeWeightedChrom(chrSize))
+main()
diff -r d325683ec368 -r 16ba480adf96 random_interval.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/random_interval.xml Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,45 @@
+
+ weight chromosome by length
+ random_interval.py $input $output $within $genome
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool will generate a set of intervals randomly distributed in the genome, mimicking the size distribution of the reference set. The same number of intervals are generated.
+
+
+**How it works**
+
+For each interval in the reference set, the script picks a random position as the new start in the genome, and then pick the end such that the size of the random interval is the same as the original one. The default setting is to move the interval to any chromosome, with the probability proportional to the size/length of the chromosome. You can have it pick a random position in the same chromosome, such that in the randomized set each chromosome has the same number of intervals as the reference set. The size of the chromosome can be either learned from the reference set (chromosome size = max(interval end)) or read from a chromosome size file. When learning from the reference set, only regions spanned by reference intervals are used to generate random intervals. Regions (may be an entire chromosome) not covered by the reference set will not appear in the output.
+
+**Chromosome size file**
+
+Chromosome size files for hg18,hg19,mm8,and mm9 can be found in 'Shared Data'. To use those files, select the correct one and import into to the history, then the file will be listed in the drop-down menu of this tool. You can also make your own chromosme size file: each line specifies the size of a chromosome (tab-delimited):
+
+chr1 92394392
+
+chr2 232342342
+
+
+You can use the following script from UCSC genome browser to download chromosome size files for other genomes:
+
+http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/fetchChromSizes
+
+
+
+
+
diff -r d325683ec368 -r 16ba480adf96 removeDuplicate.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/removeDuplicate.xml Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,10 @@
+
+ lines
+ cat $input | sort | uniq > $output
+
+
+
+
+
+
+
diff -r d325683ec368 -r 16ba480adf96 resize.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/resize.py Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,57 @@
+'''
+change start and end of interval files
+'''
+
+import sys
+
+def resize(infile,outfile,expr_start,expr_end,strand):
+ fin = open(infile)
+ fout = open(outfile,'w')
+ if expr_start[0:3] == 'end':
+ c1 = 2
+ n1 = int(expr_start[3:])
+ else:
+ c1 = 1
+ n1 = int(expr_start[5:])
+ if expr_end[0:3] == 'end':
+ c2 = 2
+ n2 = int(expr_end[3:])
+ else:
+ c2 = 1
+ n2 = int(expr_end[5:])
+ if strand == 'ignore':
+ for line in fin:
+ flds = line.strip().split('\t')
+ start = int(flds[c1]) + n1
+ if start >= 0:
+ end = int(flds[c2]) + n2
+ if end >= start:
+ flds[1] = str(start)
+ flds[2] = str(end)
+ fout.write('\t'.join(flds)+'\n')
+ else:# upstream downstream
+ for line in fin:
+ flds = line.strip().split('\t')
+ if flds[5] == '+':
+ start = int(flds[c1]) + n1
+ if start >= 0:
+ end = int(flds[c2]) + n2
+ if end >= start:
+ flds[1] = str(start)
+ flds[2] = str(end)
+ fout.write('\t'.join(flds)+'\n')
+ else: # on the - strand
+ start = int(flds[3-c2]) - n2 # end=end+n2
+ if start >= 0:
+ end = int(flds[3-c1]) - n1
+ if end >= start:
+ flds[1] = str(start)
+ flds[2] = str(end)
+ fout.write('\t'.join(flds)+'\n')
+ fin.close()
+ fout.close()
+
+if __name__ == "__main__":
+ resize(sys.argv[1],sys.argv[2],sys.argv[3],sys.argv[4],sys.argv[5])
+ # python resize.py in.bed out.bed start-3 end+5 both
+ # python resize.py expr_start expr_end strand(both/+/-)
diff -r d325683ec368 -r 16ba480adf96 resize.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/resize.xml Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,32 @@
+
+ intervals
+ resize.py $infile $outfile $expr_start $expr_end $strand
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool changes start and end of each row in an interval file. When strandness is enforced, chromosome start and end are treated as the 5' and 3' end for intervals on the '+' strand, and the opposite for intervals on the '-' strand. In the expression such as 'start=start-1000', 'start' and 'end' are interpreted as the 5' and 3' end, respectively, and the operator '+' and '-' means moving downsteam and upsteam, respectively. For example, when enforcing strandness,
+
+**start=start-1000**: extend 1000 bp on the 5' end (moving start upstream)
+
+**start=start+1000**: trancate 1000 bp on the 5' end (moving start downsteam)
+
+**end=end+1000**: extend 1000 bp on the 3' end (moving end downsteam)
+
+**end=start+1000**: moving the end to 1000 bp downsteam of the start (return the first 1000 bp on the 5' end)
+
+**end=start+1**: taking the 5' end of the interval
+
+**start=end-1**: taking the 3' end of the interval
+
+
+
diff -r d325683ec368 -r 16ba480adf96 shuffleBed.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/shuffleBed.py Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,107 @@
+'''
+simulate a random interval set that mimics the size and strand of a reference set
+'''
+
+def inferSizeFromRefBed(filename,header):
+ '''
+ read reference interval set, get chrom size information
+ '''
+ chrSize = {}
+ f = open(filename)
+ if header:
+ header = f.readline()
+ for line in f:
+ flds = line.strip().split('\t')
+ if not chrSize.has_key(flds[0]):
+ chrSize[flds[0]] = int(flds[2])
+ elif chrSize[flds[0]] < int(flds[2]):
+ chrSize[flds[0]] = int(flds[2])
+ f.close()
+ return chrSize
+
+def getChrSize(filename):
+ chrSize = {}
+ f = open(filename)
+ for line in f:
+ flds = line.strip().split('\t')
+ if len(flds) >1:
+ chrSize[flds[0]] = int(flds[1])
+ f.close()
+ return chrSize
+
+def makeWeightedChrom(chrSize):
+ '''
+ make a list of chr_id, the freq is proportional to its length
+ '''
+
+ genome_len = 0
+
+ for chrom in chrSize:
+ chrom_len = chrSize[chrom]
+ genome_len += chrom_len
+
+ weighted_chrom = []
+ for chrom in chrSize:
+ weight = int(round(1000*float(chrSize[chrom])/genome_len))
+ weighted_chrom += [chrom]*weight
+
+ return weighted_chrom
+
+def randomIntervalWithinChrom(infile,outfile,chrSize,header):
+ '''
+ '''
+ fin = open(infile)
+ if header:
+ header = fin.readline()
+ fout = open(outfile,'w')
+ n = 0
+ for line in fin:
+ n = n + 1
+ flds = line.strip().split('\t')
+ interval_size = int(flds[2]) - int(flds[1])
+ rstart = random.randint(0,chrSize[flds[0]]-interval_size)
+ fout.write(flds[0]+'\t'+str(rstart)+'\t'+str(rstart+interval_size)+'\t'+str(n)+'\t0\t+\n')
+ fin.close()
+ fout.close()
+
+def randomIntervalAcrossChrom(infile,outfile,chrSize,weighted_chrom,header):
+ '''
+ '''
+ fin = open(infile)
+ if header:
+ header = fin.readline()
+ fout = open(outfile,'w')
+ n = 0
+ for line in fin:
+ n = n + 1
+ flds = line.strip().split('\t')
+ interval_size = int(flds[2]) - int(flds[1])
+ # find a random chrom
+ flds[0] = weighted_chrom[random.randint(0, len(weighted_chrom) - 1)]
+ # random start in the chrom
+ rstart = random.randint(0,chrSize[flds[0]]-interval_size)
+ fout.write(flds[0]+'\t'+str(rstart)+'\t'+str(rstart+interval_size)+'\t'+str(n)+'\t0\t+\n')
+ fin.close()
+ fout.close()
+
+import sys,random
+def main():
+ # python random_interval.py test100.bed testout.bed across header human.hg18.genome
+
+ reference_interval_file = sys.argv[1]
+ output_file = sys.argv[2]
+ across_or_within_chrom = sys.argv[3] # within or across
+ if sys.argv[4] == 'header':
+ header = True
+ else:
+ header = False
+ if len(sys.argv) == 6:
+ chrom_size_file = sys.argv[5]
+ chrSize = getChrSize(chrom_size_file)
+ else:
+ chrSize = inferSizeFromRefBed(reference_interval_file,header)
+ if across_or_within_chrom == 'within':
+ randomIntervalWithinChrom(reference_interval_file,output_file,chrSize,header)
+ else:
+ randomIntervalAcrossChrom(reference_interval_file,output_file,chrSize,makeWeightedChrom(chrSize),header)
+main()
diff -r d325683ec368 -r 16ba480adf96 shuffleBed.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/shuffleBed.xml Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,43 @@
+
+ chromosome not weighted by length
+ shuffleBed -i $input -g $genome $chrom > $outfile
+ #if $limit.limit_select=="include":
+ -incl $limitfile
+ #else if $limit.limit_select=="exclude":
+ -excl $limitfile
+ #end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+Every chromosome are choosed with equal probability, regardless their size. Please use the tool 'random intervals' instead for general randomization.
+
+
+
diff -r d325683ec368 -r 16ba480adf96 spatial_proximity.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/spatial_proximity.py Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,36 @@
+
+import os,sys
+
+file1 = sys.argv[1]
+file2 = sys.argv[2]
+genome = sys.argv[3]
+outplot = sys.argv[4]
+outlog = sys.argv[5]
+outbed = sys.argv[6]
+
+strandness = ''
+if len(sys.argv) > 7:
+ strandness = sys.argv[7]
+
+# real distance
+cmd = 'closestBed -a '+file1+' -b '+file2 + ' '+strandness + ' -d -t first > '+outbed
+os.system(cmd)
+# shuffle
+cmd = 'shuffleBed -chrom -g '+genome+' -i '+file1+'> shuffled.bed'
+os.system(cmd)
+# shuffled distance
+cmd = 'closestBed -a shuffled.bed -b '+file2 + ' '+strandness + ' -d -t first > shuffled.dist'
+os.system(cmd)
+
+
+# test in R
+r = open('tmp.r','w')
+r.write("options(warn=-1)\n")
+r.write("source('/Users/xuebing/galaxy-dist/tools/mytools/cdf.r')\n")
+r.write("x = read.table('"+outbed+"',sep='\t')\n")
+r.write("y = read.table('shuffled.dist',sep='\t')\n")
+r.write("pdf('"+outplot+"')\n")
+r.write("mycdf(list(log10(1+x[,ncol(x)]),log10(1+y[,ncol(y)])),'spatial distance',c('real','shuffled'),'topleft','log10 distance','')\n")
+r.write("dev.off()\n")
+r.close()
+os.system("R --vanilla < tmp.r >"+outlog)
diff -r d325683ec368 -r 16ba480adf96 spatial_proximity.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/spatial_proximity.xml Sat Mar 31 08:31:22 2012 -0400
@@ -0,0 +1,32 @@
+
+ of two interval sets
+ spatial_proximity.py $inputa $inputb $genome $outplot $outlog $outbed $strandness
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+for each feature in the first interval set, find the closest in the second set, then compared the distance distribution to shuffled set 1.
+
+
+