Mercurial > repos > xuebing > sharplabtool
changeset 14:76e1b1b21cce default tip
Deleted selected files
| author | xuebing | 
|---|---|
| date | Tue, 13 Mar 2012 19:05:10 -0400 | 
| parents | 292186c14b08 | 
| children | |
| files | StartGenometriCorr.xml Start_GenometriCorr.R align2database.py align2database.xml align2multiple.xml alignr.py alignr.xml alignvis.py alignvis.r alignvis.xml altschulEriksonDinuclShuffle.py bedClean.py bed_to_bam.xml bedclean.py bedclean.xml bedsort.xml bigWigAverageOverBed.xml binaverage.xml binnedAverage.py bowtie2bed.pl bowtie2bed.xml bwBinavg.xml cdf.r cdf.xml closestBed.py closestBed.xml collapseBed.py collapseBed.xml collapseBed2.py collapseTab.py collapseTab.xml convertEnsembl.py convertEnsembl.xml dreme.xml endbias.py endbias.xml fasta-dinucleotide-shuffle.py fastamarkov.xml fastashuffle1.xml fastashuffle2.xml fastqdump.xml fimo2-old.xml fimo2.xml fimo2bed.py fimo2bed.xml genomeView.xml genomeview.r getGenomicScore.py headtail.xml intersectSig.py intersectSig.xml intersectbed.xml intervalOverlap.py intervalSize.py intervalSize.xml iupac2meme.xml makebigwig.sh makebigwig.xml makewindow.py makewindow.xml meme.xml memelogo.xml metaintv.py metaintv.xml metaintv2.py metaintv3.py metaintv_ext.py metaintv_ext.xml phastCons.xml plotmatrix.py plotmatrix.xml r_wrapper.sh random_interval.py random_interval.xml removeDuplicate.xml resize.py resize.xml revcompl.py revcompl.xml sampline.py sampline.xml seq2meme.py seq2meme.xml seqshuffle.py sequence.py shuffleBed.py shuffleBed.xml spatial_proximity.py spatial_proximity.xml splicesite.xml venn.xml | 
| diffstat | 90 files changed, 0 insertions(+), 6365 deletions(-) [+] | 
line wrap: on
 line diff
--- a/StartGenometriCorr.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,23 +0,0 @@ -<tool id="genometric_correlation" name="Genometric Correlation"> -<description>between two files of genomic intervals</description> -<command interpreter="Rscript --vanilla"> -Start_GenometriCorr.R $config $query $reference $output_options $output -</command> -<inputs> -<param format="text" name="config" type="data" label="Configuration file"/> -<param format="text" name="query" type="data" label="Query intervals file"/> -<param format="text" name="reference" type="data" label="Reference intervals file"/> -<param name="output_options" type="select" label="Type of output"> -<option value="plot">ECDF plots</option> -<option value="vis">Graphic visualization</option> -<option value="stats">Text output of statistics</option> -<option value="plot_vis">All</option> -</param> -</inputs> -<outputs> -<data name="output" format="pdf"/> -</outputs> -<help> -This tool determines the statistical relationship (if any) between two sets of genomic intervals. Output can be text only, plot (ECDF curves), or a more colorful graphic. -</help> -</tool> \ No newline at end of file
--- a/Start_GenometriCorr.R Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,105 +0,0 @@ -# Start_GenometriCorr.R - -################################################### -# # -# command-line interface to GenometriCorr # -# functions, for use with Galaxy. # -# # -################################################### - -capture.output <- function (result, pdffile, output_options) -{ - if(output_options != "stats") - { - pdf(file=pdffile, width=10, height=19, paper="special") - - if (output_options != "vis") #need to do a plot - { - mymat <- matrix(ncol=3, nrow=4) - mymat[1,1] <- 1 - mymat[1,2] <- 2 - mymat[1,3] <- 3 - mymat[2,1] <- 4 - mymat[2,2] <- 5 - mymat[2,3] <- 6 - mymat[3,1] <- 7 - mymat[3,2] <- 8 - mymat[3,3] <- 9 - mymat[4,1] <- 10 - mymat[4,2] <- 11 - mymat[4,3] <- 12 - - layout(mymat, heights=c(0.2,0.2,0.2,0.2)) - plot(result, pdffile, make.new=FALSE) - } - if (output_options != "plot") #need to do the bigger graphic - { - mymat <- matrix(ncol=2, nrow=8) - mymat[1,1] <- 2 - mymat[1,2] <- 3 - mymat[2,1] <- 4 - mymat[2,2] <- 4 - mymat[3,1] <- 1 - mymat[3,2] <- 1 - mymat[4,1] <- 5 - mymat[4,2] <- 6 - mymat[5,1] <- 7 - mymat[5,2] <- 7 - mymat[6,1] <- 8 - mymat[6,2] <- 9 - mymat[7,1] <- 10 - mymat[7,2] <- 10 - mymat[8,1] <- 11 - mymat[8,2] <- 12 - layoutresults <- 3 - - layout(mymat, heights=c(0.05,0.05,0.15,0.15,0.15,0.15,0.15,0.15)) - visualize(result, pdffile, make.new=FALSE) - } - dev.off() - } - - if (output_options == "stats") - { - show(result) - } -} - - - -# Reads the command line arguments -args <- commandArgs(trailingOnly=T) - -suppressPackageStartupMessages(library('GenometriCorr', warn.conflicts=F, verbose=F)) -suppressPackageStartupMessages(library('graphics', warn.conflicts=F, verbose=F)) -suppressPackageStartupMessages(library('gdata', warn.conflicts=F, verbose=F)) -suppressPackageStartupMessages(library('gplots', warn.conflicts=F, verbose=F)) -suppressPackageStartupMessages(library('gtools', warn.conflicts=F, verbose=F)) -suppressPackageStartupMessages(library('caTools', warn.conflicts=F, verbose=F)) -suppressPackageStartupMessages(library('grid', warn.conflicts=F, verbose=F)) - - - -# Variables -query_file <- "" -reference_file <- "" -config_file <- "" -output_options <- "" - -# Parse the command line arguments - -config_file <- args[1] -query_file <- as.character(args[2]) -reference_file <- as.character(args[3]) -output_options <- args[4] -pdffile <- args[5] - -conf<-new("GenometriCorrConfig",config_file) - -print('OK') - -result<-suppressWarnings(suppressPackageStartupMessages(GenometriCorr:::run.config(conf,query=query_file,reference=reference_file))) -print('OK2') - -hideoutput <- capture.output(result, pdffile=args[5], output_options) -
--- a/align2database.py Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,104 +0,0 @@ -''' -align mulitple bed to one bed -python align_multiple.py hmChIP_mm9_peak_bed/mm9-GSE19999_PolII_P25_all.cod.bed hmChIP_mm9_peak_bed/ test.txt test.pdf 100 5000 -''' - -import os,sys,random -def main(): - queryfile = sys.argv[1] - inpath = sys.argv[2] - outputdata = sys.argv[3] - outputerr = sys.argv[4] - barplotpdf = sys.argv[5] - min_feat = sys.argv[6] # min features overlap - windowsize = sys.argv[7] - anchor = sys.argv[8] - span = sys.argv[9] # loess smooth parameter - - inpath = inpath.rstrip('/') - #os.system('rm '+inpath+'/*tmp*') - - infiles = os.listdir(inpath) - - #print len(infiles),' files\n' - i = 0 - for infile in infiles: - if 'tmp' in infile: - #os.system('rm '+inpath+'/'+infile) - continue - i = i +1 - print i,infile - output = infile.split('/')[-1]+'-to-'+queryfile.split('/')[-1]#'.summary' - if anchor == 'database': - command = 'python /Users/xuebing/galaxy-dist/tools/mytools/alignr.py -b '+inpath+'/'+infile+' -a '+queryfile+' -o '+output+' --summary-only -q -w '+windowsize - else: - command = 'python /Users/xuebing/galaxy-dist/tools/mytools/alignr.py -a '+inpath+'/'+infile+' -b '+queryfile+' -o '+output+' --summary-only -q -w '+windowsize - #print command+'\n' - os.system(command) - print 'start visualization...' - # visualize - rscriptfile = 'f-'+str(random.random())+'.r' - r = open(rscriptfile,'w') - r.write("files <- dir('.','summary',full.name=T)\n") - #r.write("print(files)\n") - r.write("x <- read.table(files[1])\n") - r.write("err <- read.table(gsub('summary','standarderror',files[1]))\n") - r.write("for (filename in files[2:length(files)]){\n") - r.write(" x <- rbind(x,read.table(filename))\n") - r.write(" err <- rbind(err,read.table(gsub('summary','standarderror',filename)))\n") - r.write("}\n") - r.write("x <- x[x[,2] > "+min_feat+",]\n") - r.write("err <- err[err[,2] > "+min_feat+",]\n") - r.write("name <- as.character(x[,1])\n") - r.write("nfeat <- x[,2]\n") - r.write("x <- x[,3:ncol(x)]\n") - r.write("err <- err[,3:ncol(err)]\n") - r.write("for (i in 1:nrow(x)) {\n") - r.write(" name[i] <- strsplit(name[i],'-to-')[[1]][1]\n") - r.write("}\n") - # remove rows that have no variation, which cause problem in heatmap. This is the case when align to itself - r.write("toremove <- seq(nrow(x))\n") - r.write("for (i in 1:nrow(x)){\n") - r.write(" toremove[i] <- all(x[i,] == x[i,1])\n") - r.write("}\n") - r.write("x <- x[!toremove,]\n") - r.write("err <- err[!toremove,]\n") - r.write("name <- name[!toremove]\n") - r.write("nfeat <- nfeat[!toremove]\n") - r.write("write.table(cbind(name,nfeat,x),file='"+outputdata+"',sep ='\\t',quote=F,row.names=F,col.names=F)\n") - r.write("write.table(cbind(name,nfeat,err),file='"+outputerr+"',sep ='\\t',quote=F,row.names=F,col.names=F)\n") - - r.write("pdf('"+barplotpdf+"')\n") - r.write("heatmap(t(scale(t(as.matrix(x,nc=ncol(x))))),Colv=NA,labRow=name,labCol=NA,margins=c(1,8),cexRow=0.02+1/log(nrow(x)),col=heat.colors(1000))\n") - - if windowsize != '0' : - r.write("xticks <- seq(-"+windowsize+","+windowsize+",length.out=100)\n") - r.write("xlab <- 'relative position (bp)'\n") - else: - r.write("xticks <- seq(100)\n") - r.write("xlab <- 'relative position (bin)'\n") - - #r.write("plot(xticks,colSums(t(scale(t(as.matrix(x,nc=ncol(x)))))),xlab='relative position (bp)',type='l',lwd=2,main='total signal')\n") - r.write("for (i in 1:nrow(x)) {\n") - r.write(" avg <- x[i,]/nfeat[i]\n") - #r.write(" maxv <- max(avg)\n") - #r.write(" minv <- min(avg)\n") - #r.write(" medv <- median(avg)\n") - #r.write(" if (maxv < "+fold+"*medv | minv*"+fold+">medv){next}\n") - #smooth - if float(span) >= 0.1: - r.write(" avg = loess(as.numeric(avg)~xticks,span="+span+")$fitted\n") - r.write(" err[i,] = loess(as.numeric(err[i,])~xticks,span="+span+")$fitted\n") - r.write(" par(cex=1.5)\n") - r.write(" plot(xticks,avg,ylab='average coverage',main=paste(name[i],'\n n=',nfeat[i],sep=''),xlab=xlab,type='l',lwd=1,ylim=c(min(avg-err[i,]),max(avg+err[i,])))\n") - r.write(" polygon(c(xticks,rev(xticks)),c(avg+err[i,],rev(avg-err[i,])),col='slateblue1',border=NA)\n") - r.write(" lines(xticks,avg,type='l',lwd=1)\n") - r.write("}\n") - r.write("dev.off()\n") - r.close() - os.system("R --vanilla < "+rscriptfile) - os.system('rm '+rscriptfile) - os.system('rm *.summary') - os.system('rm *.standarderror') - -main()
--- a/align2database.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,54 +0,0 @@ -<tool id="align2database" name="align-to-database"> - <description> features </description> - <command interpreter="python"> align2database.py $query $database $output_coverage $output_standarderror $output_plot $minfeat $windowsize $anchor $span> $outlog </command> - <inputs> - <param name="query" type="data" format="interval" label="Query intervals" help= "keep it small (less than 1,000,000 lines)"/> - <param name="database" type="select" label="Feature database"> - <option value="/Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/feature_database" selected="true">All mm9 features (over 200)</option> - <option value="/Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/annotation">Annotated mm9 features</option> - <option value="/Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/CLIP">protein bound RNA (CLIP) mm9 </option> - <option value="/Users/xuebing/galaxy-dist/tool-data/aligndb/mm9/conservedmiRNAseedsite">conserved miRNA target sites mm9 </option> - <option value="/Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/all-feature">Human ChIP hmChIP database hg18</option> - <option value="/Users/xuebing/galaxy-dist/tool-data/aligndb/hg18/gene-feature">Human gene features hg18</option> - <option value="/Users/xuebing/galaxy-dist/tool-data/aligndb/hg19/conservedmiRNAseedsite">conserved miRNA target sites hg19 </option> - </param> - <param name="anchor" label="Anchor to query features" help="default anchoring to database featuers" type="boolean" truevalue="query" falsevalue="database" checked="False"/> - <param name="windowsize" size="10" type="integer" value="5000" label="Window size (-w)" help="will create new intervals of w bp flanking the original center. set to 0 will not change input interval size)"/> - <param name="minfeat" size="10" type="integer" value="100" label="Minimum number of query intervals hits" help="database features overlapping with too few query intervals are discarded"/> - <param name="span" size="10" type="float" value="0.1" label="loess span: smoothing parameter" help="value less then 0.1 disables smoothing"/> - <param name="outputlabel" size="80" type="text" label="Output label" value="test"/> - -</inputs> - <outputs> - <data format="txt" name="outlog" label="${outputlabel} (log)"/> - <data format="tabular" name="output_standarderror" label="${outputlabel} (standard error)"/> - <data format="tabular" name="output_coverage" label="${outputlabel} (coverage)"/> - <data format="pdf" name="output_plot" label="${outputlabel} (plot)"/> - </outputs> - <help> - -**Example output** - -.. image:: ./static/operation_icons/align_multiple2.png - - -**What it does** - -This tool aligns a query interval set (such as ChIP peaks) to a database of features (such as other ChIP peaks or TSS/splice sites), calculates and plots the relative distance of database features to the query intervals. Currently two databases are available: - --- **ChIP peaks** from 191 ChIP experiments (processed from hmChIP database, see individual peak/BED files in **Shared Data**) - --- **Annotated gene features**, such as: TSS, TES, 5'ss, 3'ss, CDS start and end, miRNA seed matches, enhancers, CpG island, microsatellite, small RNA, poly A sites (3P-seq-tags), miRNA genes, and tRNA genes. - -Two output files are generated. One is the coverage/profile for each feature in the database that has a minimum overlap with the query set. The first two columns are feature name and the total number of overlapping intervals from the query. Column 3 to column 102 are coverage at each bin. The other file is an PDF file plotting both the heatmap for all features and the average coverage for each individual database feature. - - -**How it works** - -For each interval/peak in the query file, a window (default 10,000bp) is created around the center of the interval and is divided into 100 bins. For each database feature set (such as Pol II peaks), the tool counts how many intervals in the database feature file overlap with each bin. The count is then averaged over all query intervals that have at least one hit in at least one bin. Overall the plotted 'average coverage' represnts the fraction of query features (only those with hits, number shown in individual plot title) that has database feature interval covering that bin. The extreme is when the database feature is the same as the query, then every query interval is covered at the center, the average coverage of the center bin will be 1. - -The heatmap is scaled for each row before clustering. - - </help> -</tool> -
--- a/align2multiple.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,109 +0,0 @@ -<tool id="align2multiple" name="align-to-multiple"> - <description>features</description> - <command>cat $script_file | R --vanilla --slave > $logfile </command> - <inputs> - <param name="query" type="data" format="interval" label="Query intervals" help= "keep it small (less than 1,000,000 lines)"/> - <param name="label" type="text" value="" size="30" label="Data Label"/> - <param name="windowsize" size="10" type="integer" value="5000" label="radius of the window" help="will create new intervals of w bp flanking the original center. set to 0 will not change input interval size)"/> - <param name="nbins" size="10" type="integer" value="20" label="Number of bins dividing the window"/> - <param name="sort" label="Sort intervals" help="Sort by the center of the first input, then the second input, then third..." type="boolean" truevalue="sort" falsevalue="none" checked="True"/> - <repeat name="series" title="input file"> - <param name="label" type="text" value="" size="30" label="Data Label"/> - <param name="input" type="data" format="interval" label="Dataset"/> - </repeat> - </inputs> - - <configfiles> - <configfile name="script_file"> - ## Setup R error handling to go to stderr - cat('\n[',date(),'] Start running job\n') - options(warn=-1) - windowsize = as.integer("$windowsize") - labels = '$label' - ## align query to itself - cmd = 'python /Users/xuebing/galaxy-dist/tools/mytools/alignr.py -a $query -b $query -o $label-$label --profile-only -q -w $windowsize -n $nbins' - cat('\n[',date(),'] ',cmd,'\n') - system(cmd) - ## align other sets to query - #for $i,$s in enumerate( $series ) - labels = c(labels,'$s.label.value') - cmd = 'python /Users/xuebing/galaxy-dist/tools/mytools/alignr.py -a $s.input.file_name -b $query -o $label-$s.label.value --profile-only -q -w $windowsize -n $nbins' - cat('\n[',date(),'] ',cmd,'\n') - system(cmd) - #end for - cat('\n[',date(),'] Read output\n') - ## read output of query2query - print(paste(labels[1],labels[1],sep='-')) - x = read.table(paste(labels[1],labels[1],sep='-')) - ids = as.character(x[,1]) - nfeat = nrow(x) - x = as.matrix(x[,3:ncol(x)]) - nbin = ncol(x) - - ## a table mapping id to position - ind = list() - for (i in 1:nfeat){ - ind[[ids[i]]] = i - } - ## read other output files - for (i in 2:length(labels)){ - print(paste(labels[1],labels[i],sep='-')) - x0 = read.table(paste(labels[1],labels[i],sep='-')) - ids0 = as.character(x0[,1]) - x0 = as.matrix(x0[,3:ncol(x0)]) - x1 = matrix(0,nfeat,nbin) - for (j in 1:nrow(x0)){ - #cat(j,'\t',ids0[j],'\t',ind[[ids0[j]]],'\n') - x1[ind[[ids0[j]]],] = x0[j,] - } - x = cbind(x,x1) - } - ## reorder - if ("${sort}" == "sort"){ - cat('\n[',date(),'] Sort intervals\n') - for (i in rev(2:length(labels))){ - x = x[order(x[,i*nbin-nbin/2]>0),] - } - } - png("${out_file1}") - ##par(mfrow=c(2,length(labels)),mar=c(1,1,4,1)) - layout(matrix(seq(2*length(labels)),nrow=2,byrow=T),heights=c(1,5)) - cat('\n[',date(),'] Plot summary\n') - par(mar=c(0,0,4,0)+0.1) - for (i in 1:length(labels)){ - plot(colSums(x[,((i-1)*nbin+1):(i*nbin)]),type='l',axes=F,main=labels[i]) - } - cat('\n[',date(),'] Plot heatmap\n') - par(mar=c(0,0,0,0)+0.1) - for (i in 1:length(labels)){ - image(-t(log2(1+x[,((i-1)*nbin+1):(i*nbin)])),axes=F) - } - dev.off() - cat('\n[',date(),'] Finished\n') - - </configfile> - </configfiles> - - <outputs> - <data format="txt" name="logfile" label="${tool.name} on ${on_string}: (log)" /> - <data format="png" name="out_file1" label="${tool.name} on ${on_string}: (plot)" /> - </outputs> - -<help> -.. class:: infomark - -This tool allows you to check the co-localization pattern of multiple interval sets. All interval sets are aligned to the center of the intervals in the query interval set. - -Each row represents a window of certain size around the center of one interval in the query set, such as ChIP peaks. Each heatmap shows the position of other features in the SAME window (the same rows in each heatmap represent the same interval/genomic position). - - -The example below shows that of all Fox2 peaks, half of them are within 1kb of TSS. Of the half outside TSS, about one half has H3K4me1, two thirds of which are further depleted of H3K4me3. - ------ - -**Example** - -.. image:: ./static/images/align2multiple.png - -</help> -</tool>
--- a/alignr.py Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,353 +0,0 @@ -''' -the scripts takes two files as input, and compute the coverage of -features in input 1 across features in input 2. Features in input 2 are -divided into bins and coverage is computed for each bin. - -please check the help information by typing: - - python align.py -h - - -requirement: - please install the following tools first: - bedtools: for read/region overlapping, http://code.google.com/p/bedtools/ - -''' - -import os,sys,os.path -from optparse import OptionParser - -def lineCount(filename): - with open(filename) as f: - for i, l in enumerate(f): - pass - return i + 1 - -def combineFilename(f1,f2): - ''' - fuse two file names into one - ''' - return f1.split('/')[-1]+'-'+f2.split('/')[-1] - -def checkFormat(filename1,filename2,input1format): - ''' - check the format of input files - ''' - - # file1 - # read the first line, see how many filds - ncol1 = 6 - if input1format == "BED": - f = open(filename1) - line = f.readline().strip().split('\t') - ncol1 = len(line) - if ncol1 < 3: - print "ERROR: "+filename1+" has only "+str(ncol1)+" columns (>=3 required). Make sure it has NO header line and is TAB-delimited." - sys.exit(1) - f.close() - - # file2 - f = open(filename2) - line = f.readline().strip().split('\t') - ncol2 = len(line) - if ncol2 < 3: - print "ERROR: "+filename2+" has only "+str(ncol2)+" columns (>=3 required). Make sure it has NO header line and is TAB-delimited." - sys.exit(1) - - return ncol1,ncol2 - - -def makeBed(filename,ncol): - ''' - add up to 6 column - ''' - f = open(filename) - outfile = filename+'.tmp.bed' - outf = open(outfile,'w') - if ncol == 3: - for line in f: - outf.write(line.strip()+'\t.\t0\t+\n') - elif ncol == 4: - for line in f: - outf.write(line.strip()+'\t0\t+\n') - if ncol == 5: - for line in f: - outf.write(line.strip()+'\t+\n') - f.close() - outf.close() - return outfile - -def makeWindow(filename,window): - - outfile = filename+'-window='+str(window)+'.tmp.bed' - if not os.path.exists(outfile): - f=open(filename) - out = open(outfile,'w') - lines = f.readlines() - if 'track' in lines[0]: - del lines[0] - for line in lines: - flds = line.strip().split('\t') - - #new position - center = (int(flds[1]) + int(flds[2]))/2 - start = center - window - end = center + window - if start >= 0: - flds[1] = str(start) - flds[2] = str(end) - out.write('\t'.join(flds)+'\n') - f.close() - out.close() - return outfile - -def groupReadsMapped2aRegion(filename,ncol): - ''' - read output from intersectBED - find all reads mapped to each region - ''' - try: - f=open(filename) - #If filename cannot be opened, print an error message and exit - except IOError: - print "could not open",filename,"Are you sure this file exists?" - sys.exit(1) - lines = f.readlines() - - allReadsStart = {} - allReadsEnd = {} - regionStrand = {} - regionStart = {} - regionEnd = {} - - for line in lines: - flds = line.strip().split('\t') - key = '_'.join(flds[ncol:(ncol+4)]) - if not allReadsStart.has_key(key): - allReadsStart[key] = list() - allReadsEnd[key] = list() - #print flds[ncol+0],flds[ncol+1],flds[ncol+2] - allReadsStart[key].append(int(flds[1])) - allReadsEnd[key].append(int(flds[2])) - regionStrand[key] = flds[ncol+5] - regionStart[key] = int(flds[ncol+1]) - regionEnd[key] = int(flds[ncol+2]) - return (allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd) - - -def createRegionProfile(allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd,nbins): - ''' - each region is divided into nbins - compute the number of reads covering each bin for each region - ''' - RegionProfile = {} - nRead = {} # num of all reads in the region - for region in allReadsStart.keys(): - RegionProfile[region] = [0]*nbins - nRead[region] = len(allReadsStart[region]) - #print region,nRead[region],allReadsStart[region] - for i in range(nRead[region]): - RegionProfile[region] = updateRegionCount(RegionProfile[region],allReadsStart[region][i],allReadsEnd[region][i],regionStart[region],regionEnd[region],regionStrand[region],nbins) - return RegionProfile,nRead - -def updateRegionCount(RegionCount,readStart,readEnd,regionStart,regionEnd,strand,nbins): - ''' - each region is divided into nbins, - add 1 to each bin covered by the read - ''' - L = regionEnd-regionStart - start = int(nbins*(readStart-regionStart)/L) - end = int(nbins*(readEnd-regionStart)/L) - if start < 0: - start = 0 - if end > nbins: - end = nbins - if strand == '-': - for i in range(start,end): - RegionCount[nbins-1-i] = RegionCount[nbins-1-i] + 1 - else: # if the 6th column of the input is not strand, will treat as + strand by default - for i in range(start,end): - RegionCount[i] = RegionCount[i] + 1 - return RegionCount - -def saveProfile(filename,Profile,nRegion): - out = open(filename,'w') - for regionType in Profile.keys(): - #print Profile[regionType] - out.write(regionType+'\t'+str(nRegion[regionType])+'\t'+'\t'.join(map(str,Profile[regionType]))+'\n') - -def saveSummary(filename,Profile,nbin): - out = open(filename+'.summary','w') - - nfeat = len(Profile) - summaryprofile = [0]*nbin - for regionType in Profile.keys(): - for i in range(nbin): - summaryprofile[i] += Profile[regionType][i] - out.write(filename+'\t'+str(nfeat)+'\t'+'\t'.join(map(str,summaryprofile))+'\n') - out.close() - # calculate standard error - out = open(filename+'.standarderror','w') - sd = [0.0]*nbin - u = [0.0]*nbin - for i in range(nbin): - u[i] = float(summaryprofile[i])/nfeat - for regionType in Profile.keys(): - sd[i] = sd[i] + (Profile[regionType][i] - u[i])**2 - sd[i] = sd[i]**0.5 / nfeat - out.write(filename+'\t'+str(nfeat)+'\t'+'\t'.join(map(str,sd))+'\n') - out.close() - -def main(): - usage = "usage: %prog [options] -a inputA -b inputB" - parser = OptionParser(usage) - parser.add_option("-a", dest="inputA", - help="(required) input file A, interval (first 3 columns are chrN, start and end) or BAM format. The script computes the depth of coverage of features in file A across the features in file B" ) - parser.add_option("-b",dest="inputB", - help="(required) input file B, interval file" ) - parser.add_option("-f",dest="aformat",default="BED", - help="Format of input file A. Can be BED (default) or BAM") - parser.add_option("-w",type='int',dest="window", - help="Generate new inputB by making a window of 2 x WINDOW bp (in total) flanking the center of each input feature" ) - parser.add_option("-n", type="int", dest="nbins",default=100, - help="number of bins. Features in B are binned, and the coverage is computed for each bin. Default is 100") - parser.add_option("-s",action="store_true", dest="strandness", - help="enforce strandness: require overlapping on the same strand. Default is off") - parser.add_option("-p",action="store_true", dest="plot",default=False, - help="load existed intersectBed outputfile") - parser.add_option("-q", action="store_true", dest="quiet",default=False, - help="suppress output on screen") - parser.add_option("-o", dest="output_data", - help="(optional) output coverage file (txt) name." ) - parser.add_option("-v", dest="output_plot", - help="(optional) output plot (pdf) file name." ) - parser.add_option("-l", dest="plot_title", default="", - help="(optional) output title of the plot." ) - parser.add_option("--ylim", dest="ylim", default="min,max", - help="(optional) ylim of the plot" ) - parser.add_option("--summary-only", action="store_true", dest="summary_only",default=False, - help="save profile summary only (no data for individual features)") - parser.add_option("--compute-se", action="store_true", dest="compute_se",default=False, - help="compute and plot standard deviation for each bin. used when --summary-only is on") - parser.add_option("--profile-only", action="store_true", dest="profile_only",default=False, - help="save profile only (no plot)") - parser.add_option("--span", type="float", dest="span",default=0.1, - help="loess span smooth parameter, 0.1 ~ 1") - - (options, args) = parser.parse_args() - - if options.inputA == None or options.inputB == None: - parser.error("Please specify two input files!!") - - if not options.quiet: - print "checking input file format..." - - ncol,ncol2 = checkFormat(options.inputA ,options.inputB,options.aformat) - - if ncol2 < 6: - options.inputB = makeBed(options.inputB,ncol2) - if not options.quiet: - print "fill up 6 columns" - - if options.window > 0: - if not options.quiet: - print "making windows from "+options.inputB+"..." - options.inputB = makeWindow(options.inputB,options.window) - - output = combineFilename(str(options.inputA),str(options.inputB)) - - if not options.plot: - if options.aformat == "BAM": - cmd = "intersectBed -abam "+str(options.inputA)+" -b "+str(options.inputB) + ' -bed -split ' - else: - cmd = "intersectBed -a "+str(options.inputA)+" -b "+str(options.inputB) - if options.strandness: - cmd = cmd + ' -s' - cmd = cmd +" -wo > "+ output+'-intersect.tmp.bed' - if not options.quiet: - print "search for overlappings: "+cmd - status = os.system(cmd) - if status != 0: - sys.exit(1) - - - if not options.quiet: - print 'group reads mapped to the same region...' - - allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd = groupReadsMapped2aRegion(output+'-intersect.tmp.bed',ncol) - - if len(allReadsStart) == 0: - if not options.quiet: - print 'no overlap found!!' - os.system('rm *tmp.*') - sys.exit(1) - - if not options.quiet: - print 'count number of reads mapped to each bin...' - - RegionProfile,nRead = createRegionProfile(allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd,options.nbins) - - if options.output_data == None: - options.output_data = output+'.txt' - - if options.summary_only: - saveSummary(options.output_data,RegionProfile,options.nbins) - - else: - saveProfile(options.output_data,RegionProfile,nRead) - - if not options.quiet: - print 'results saved to: '+ options.output_data - - if not (options.summary_only or options.profile_only ): - # visualize - - if options.window < 1: - xlab = 'relative position (bins)' - else: - xlab = 'relative position (bp)' - - if options.output_plot == None: - options.output_plot = output+'.pdf' - - title = options.plot_title+'\n n = '+str(len(RegionProfile)) - - rscript = open("tmp.r","w") - rscript.write("x <- read.table('"+options.output_data+"')\n") - rscript.write("pdf('"+options.output_plot+"')\n") - rscript.write("avg <- colSums(x[,3:ncol(x)])/nrow(x)\n") - rscript.write("err <- sd(x[,3:ncol(x)])/sqrt(nrow(x))\n") - - if options.window == 0: - rscript.write("xticks <- seq("+str(options.nbins)+")\n") - else: - rscript.write("xticks <- seq("+str(-options.window)+","+str(options.window)+",length.out="+str(options.nbins)+")\n") - - if options.ylim != 'min,max': - rscript.write("ylim=c("+options.ylim+")\n") - else: - rscript.write("ylim=c(min(avg-err),max(avg+err))\n") - rscript.write("par(cex=1.5)\n") - #smooth - if options.span >= 0.1: - rscript.write("avg = loess(avg~xticks,span="+str(options.span)+")$fitted\n") - rscript.write("err = loess(err~xticks,span="+str(options.span)+")$fitted\n") - rscript.write("plot(xticks,avg,ylab='average coverage',main='"+title+"',xlab='"+xlab+"',type='l',lwd=0,ylim=ylim)\n") - rscript.write("polygon(c(xticks,rev(xticks)),c(avg+err,rev(avg-err)),col='slateblue1',border=NA)\n") - rscript.write("lines(xticks,avg,type='l',lwd=1)\n") - #rscript.write("xticks <- barplot(avg,names.arg=seq("+str(options.nbins)+"),ylab='average coverage',main='"+title+"',xlab='"+xlab+"',,ylim=c(min(avg-err),max(avg+err)))\n") - #rscript.write("arrows(xticks,avg+err, xticks, avg-err, angle=90, code=3, length=0.0,col='green')\n") - #rscript.write("lines(xticks,avg,lwd=2)\n") - #rscript.write("lines(xticks,avg-err,col='green')\n") - #rscript.write("lines(xticks,avg+err,col='green')\n") - rscript.write("dev.off()\n") - rscript.close() - - os.system("R --vanilla < tmp.r") - - # remove intermediate output - os.system('rm *tmp.*') - - -if __name__ == "__main__": - main()
--- a/alignr.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,142 +0,0 @@ -<tool id="alignr" name="align"> - <description>two interval sets</description> - <command interpreter="python"> alignr.py -a $inputa -w $windowsize -n $nbins -o $output_data -v $output_plot $stranded -q -l $outputlabel --ylim=$ylim --span $span - #if $inputb_source_type.inputb_select == "user": - -b "$inputb" - #else: - -b "${inputb_source_type.selectedb.fields.value}" - #end if - #if $inputa_format.inputa_select == "BAM": - -f BAM - #end if - </command> - <inputs> - <conditional name="inputa_format"> - <param name="inputa_select" type="select" label="Select your first input format" > - <option value="BED" selected="true">BED-like (chrNum start end ...) </option> - <option value="BAM"> BAM</option> - </param> - <when value="BED"> - <param name="inputa" type="data" format="interval" label="Input file for the first interval set (-a)"/> - </when> - <when value="BAM"> - <param name="inputa" type="data" format="bam" label="Input file for the first interval set (-a)"/> - </when> - </conditional> - <conditional name="inputb_source_type"> - <param name="inputb_select" type="select" label="Input source for the second interval set"> - <option value="mm9ucsc" selected="true">mm9 ucsc knownGene annotations</option> - <option value="mm9refseq">mm9 refseq gene annotations</option> - <option value="mm9ensembl">mm9 ensembl gene annotations</option> - <option value="hg18ucsc" >hg18 ucsc knownGene annotations</option> - <option value="hg18refseq">hg18 refseq gene annotations</option> - <option value="hg18ensembl">hg18 ensembl gene annotations</option> - <option value="user">Dataset in Your History</option> - </param> - <when value="user"> - <param name="inputb" type="data" format="interval" label="Input file for the second interval set (-b)" /> - </when> - <when value="mm9ucsc"> - <param name="selectedb" type="select" label="Input for the second interval set (-b)" > - <options from_file="aligndb-mm9-knownGene.loc"> - <column name="name" index="0"/> - <column name="value" index="1"/> - </options> - </param> - </when> - <when value="mm9refseq"> - <param name="selectedb" type="select" label="Input for the second interval set (-b)" > - <options from_file="aligndb-mm9-refGene.loc"> - <column name="name" index="0"/> - <column name="value" index="1"/> - </options> - </param> - </when> - <when value="mm9ensembl"> - <param name="selectedb" type="select" label="Input for the second interval set (-b)" > - <options from_file="aligndb-mm9-ensGene.loc"> - <column name="name" index="0"/> - <column name="value" index="1"/> - </options> - </param> - </when> - <when value="hg18ucsc"> - <param name="selectedb" type="select" label="Input for the second interval set (-b)" > - <options from_file="aligndb-hg18-knownGene.loc"> - <column name="name" index="0"/> - <column name="value" index="1"/> - </options> - </param> - </when> - <when value="hg18refseq"> - <param name="selectedb" type="select" label="Input for the second interval set (-b)" > - <options from_file="aligndb-hg18-refGene.loc"> - <column name="name" index="0"/> - <column name="value" index="1"/> - </options> - </param> - </when> - <when value="hg18ensembl"> - <param name="selectedb" type="select" label="Input for the second interval set (-b)" > - <options from_file="aligndb-hg18-ensGene.loc"> - <column name="name" index="0"/> - <column name="value" index="1"/> - </options> - </param> - </when> - - </conditional> - <param name="windowsize" size="10" type="integer" value="0" label="Change input 2 interval size (-w)" help="will create new intervals of w bp flanking the original center. set to 0 will not change input interval size)"/> - <param name="nbins" size="10" type="integer" value="100" label="Number of bins dividing intervals in input 2(-n)"/> - <param name="span" size="10" type="float" value="0.1" label="loess span: smoothing parameter" help="value less then 0.1 disables smoothing"/> - <param name="stranded" label="Check if require overlap on the same strand (-s)" type="boolean" truevalue="-s" falsevalue="" checked="False"/> - <param name="outputlabel" size="80" type="text" label="Output label" value="test"/> - <param name="ylim" size="10" type="text" label="set ylim of the plot" value="min,max" help="e.g. 0,1 (default is the min and max of the signal)"/> - -</inputs> - <outputs> - <data format="tabular" name="output_data" label="${outputlabel} (data)"/> - <data format="pdf" name="output_plot" label="${outputlabel} (plot)"/> - </outputs> - <help> - -**What it does** - -This tool aligns two sets of intervals, finds overlaps, calculates and plots the coverage of the first set across the second set. Applications include: - -- check read distribution around TSS/poly A site/splice site/motif site/miRNA target site -- check relative position/overlap of two lists of ChIP-seq peaks - -Two output files are generated. One is the coverage/profile for each interval in input 2. The first two columns are interval ID and the total number of overlapping intervals from input 1. Column 3 to column nbins+2 are coverage at each bin. The other file is an PDF file plotting the average coverage of each bin. To modify the visualization, please downlaod the coverage file and make your own plots. - ------ - -**Annotated features** - -Currently supports mouse genome build mm9 and human hg18. Each interval spans 1000bp upstream and 1000bp downstream of a feature such as TSS. Features with overlapping exons in the intronic/intergenic part of the 2000bp interval are removed. - ------ - -**Usage** - - -h, --help show this help message and exit - -a INPUTA (required) input file A, BED-like (first 3 columns: chr, start, end) or BAM format. The - script computes the depth of coverage of features in file - A across the features in file B - -b INPUTB (required) input file B, BED format or MACS peak file. - Requires an unique name for each line in column 4 - -m inputB is a MACS peak file. - -f AFORMAT Format of input file A. Can be BED (default) or BAM - -w WINDOW Generate new inputB by making a window of 2 x WINDOW bp - (in total) flanking the center of each input feature - -n NBINS number of bins. Features in B are binned, and the coverage - is computed for each bin. Default is 100 - -s enforce strandness: require overlapping on the same - strand. Default is off - -p load existed intersectBed outputfile - -q suppress output on screen - -o OUTPUTPROFILE (optional) output profile name. - -v PLOTFILE (optional) plot file name - </help> -</tool> -
--- a/alignvis.py Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,76 +0,0 @@ -import sys,os - -infile = sys.argv[1] -outfile = sys.argv[2] -uselog = sys.argv[3] -subset = sys.argv[4] -reorder = sys.argv[5] -color = sys.argv[6] -scale = sys.argv[7] # rescale each row -rscript = open('tmp.r','w') - -rscript.write("x <- read.table('"+infile+"')\n") -rscript.write("nfeat <- nrow(x) \n") -rscript.write("nbin <- ncol(x) - 2\n") -rscript.write("totalcov <- x[,2]\n") -rscript.write("x <- x[,3:ncol(x)]\n") - -if subset =='subset': - rscript.write("if (nfeat*nbin > 100000) {\n") - rscript.write(" nfeat2 <- as.integer(100000/nbin)\n") - rscript.write(" subind <- sample(seq(nfeat),nfeat2)\n") - rscript.write(" x <- x[subind,]\n") - rscript.write(" totalcov <- totalcov[subind]\n") - rscript.write("}\n") - -rscript.write("pdf('"+outfile+"')\n") - -if uselog == 'uselog': - rscript.write("x <- -(log(1+as.matrix(x,nc=ncol(x)-2)))\n") -else: - rscript.write("x <- -as.matrix(x,nc=ncol(x)-2)\n") -if scale == 'scale': - rscript.write("x <- scale(x)\n") -if reorder == 'average': - rscript.write("hc <- hclust(dist(x),method= 'average')\n") - rscript.write("x <- x[hc$order,]\n") -elif reorder == 'centroid': - rscript.write("hc <- hclust(dist(x),method= 'centroid')\n") - rscript.write("x <- x[hc$order,]\n") -elif reorder == 'complete': - rscript.write("hc <- hclust(dist(x),method= 'complete')\n") - rscript.write("x <- x[hc$order,]\n") -elif reorder == 'single': - rscript.write("hc <- hclust(dist(x),method= 'single')\n") - rscript.write("x <- x[hc$order,]\n") -elif reorder == 'median': - rscript.write("hc <- hclust(dist(x),method= 'median')\n") - rscript.write("x <- x[hc$order,]\n") -elif reorder == 'sort_by_total': - rscript.write("srt <- sort(totalcov,index.return=T)\n") - rscript.write("x <- x[srt$ix,]\n") -elif reorder == 'sort_by_center': - rscript.write("srt <- sort(x[,as.integer(nbin/2)],index.return=T)\n") - rscript.write("x <- x[srt$ix,]\n") -if color == 'heat': - rscript.write("colormap = heat.colors(1000)\n") -elif color == 'topo': - rscript.write("colormap = topo.colors(1000)\n") -elif color == 'rainbow': - rscript.write("colormap = rainbow(1000)\n") -elif color == 'terrain': - rscript.write("colormap = terrain.colors(1000)\n") -else: - rscript.write("colormap = gray.colors(1000)\n") - -#rscript.write("qt <- quantile(as.vector(x),probs=c(0.1,0.9))\n") -#rscript.write("breaks <- c(min(as.vector(x)),seq(qt[1],qt[2],length.out=99),max(as.vector(x)))\n") -#rscript.write("image(t(x),col=colormap,breaks=breaks,axes=F)\n") -rscript.write("image(t(x),col=colormap,axes=F)\n") -rscript.write("dev.off()\n") - -rscript.close() - -os.system("R --slave < tmp.r") -os.system("rm tmp.r") -
--- a/alignvis.r Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,8 +0,0 @@ -args <- commandArgs(TRUE) -x <- read.table(args[1]) -pdf(args[2]) -#visualize the profile with heatmap -srt <- sort(x[,2],index.return=T) # sort by total number of reads -image(-t(log(as.matrix(x[srt$ix[1:nrow(x)],3:ncol(x)],nc=ncol(x)-2))),col=gray.colors(100)) -dev.off() -
--- a/alignvis.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,42 +0,0 @@ -<tool id="alignvis" name="heatmap"> - <description>of align output</description> - <command interpreter="python"> alignvis.py $input $output $uselog $subset $reorder $color $scale </command> - <inputs> - <param name="input" format="tabular" type="data" label="Original file"/> - <param name="uselog" label="log transform the data" type="boolean" truevalue="uselog" falsevalue="none" checked="True"/> - <param name="subset" label="sample a subset if the data is too large" type="boolean" truevalue="subset" falsevalue="none" checked="True"/> - <param name="scale" label="normalize by row/feature" type="boolean" truevalue="scale" falsevalue="none" checked="False"/> - <param name="reorder" type="select" label="reorder features (rows)"> - <option value="none" selected="true">None</option> - <option value="sort_by_sum">Sort row by sum</option> - <option value="sort_by_center">Sort row by center </option> - <option value="average">Cluster rows (average)</option> - <option value="median">Cluster rows (median) </option> - <option value="centroid">Cluster rows (centroid)</option> - <option value="complete">Cluster rows (complete)</option> - <option value="single">Cluster rows (single)</option> - </param> - - <param name="color" type="select" label="color scheme"> - <option value="heat" selected="true">heat</option> - <option value="gray">gray</option> - <option value="rainbow">rainbow</option> - <option value="topo">topo</option> - <option value="terrain">terrain</option> - </param> - </inputs> - <outputs> - <data format="pdf" name="output" /> - </outputs> - <help> - -**What it does** - -This tool generates a heatmap for output from 'align' tool. Each row is the color-coded coverage of a feature, and the features are sorted by the total coverage in the interval. - -**Example** - -.. image:: ./static/operation_icons/heatmap.png - - </help> -</tool>
--- a/altschulEriksonDinuclShuffle.py Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,150 +0,0 @@ -#! /usr/bin/env python - -# altschulEriksonDinuclShuffle.py -# P. Clote, Oct 2003 -# NOTE: One cannot use function "count(s,word)" to count the number -# of occurrences of dinucleotide word in string s, since the built-in -# function counts only nonoverlapping words, presumably in a left to -# right fashion. - - -import sys,string,random - - - -def computeCountAndLists(s): - #WARNING: Use of function count(s,'UU') returns 1 on word UUU - #since it apparently counts only nonoverlapping words UU - #For this reason, we work with the indices. - - #Initialize lists and mono- and dinucleotide dictionaries - List = {} #List is a dictionary of lists - List['A'] = []; List['C'] = []; - List['G'] = []; List['T'] = []; - nuclList = ["A","C","G","T"] - s = s.upper() - s = s.replace("U","T") - nuclCnt = {} #empty dictionary - dinuclCnt = {} #empty dictionary - for x in nuclList: - nuclCnt[x]=0 - dinuclCnt[x]={} - for y in nuclList: - dinuclCnt[x][y]=0 - - #Compute count and lists - nuclCnt[s[0]] = 1 - nuclTotal = 1 - dinuclTotal = 0 - for i in range(len(s)-1): - x = s[i]; y = s[i+1] - List[x].append( y ) - nuclCnt[y] += 1; nuclTotal += 1 - dinuclCnt[x][y] += 1; dinuclTotal += 1 - assert (nuclTotal==len(s)) - assert (dinuclTotal==len(s)-1) - return nuclCnt,dinuclCnt,List - - -def chooseEdge(x,dinuclCnt): - numInList = 0 - for y in ['A','C','G','T']: - numInList += dinuclCnt[x][y] - z = random.random() - denom=dinuclCnt[x]['A']+dinuclCnt[x]['C']+dinuclCnt[x]['G']+dinuclCnt[x]['T'] - numerator = dinuclCnt[x]['A'] - if z < float(numerator)/float(denom): - dinuclCnt[x]['A'] -= 1 - return 'A' - numerator += dinuclCnt[x]['C'] - if z < float(numerator)/float(denom): - dinuclCnt[x]['C'] -= 1 - return 'C' - numerator += dinuclCnt[x]['G'] - if z < float(numerator)/float(denom): - dinuclCnt[x]['G'] -= 1 - return 'G' - dinuclCnt[x]['T'] -= 1 - return 'T' - -def connectedToLast(edgeList,nuclList,lastCh): - D = {} - for x in nuclList: D[x]=0 - for edge in edgeList: - a = edge[0]; b = edge[1] - if b==lastCh: D[a]=1 - for i in range(2): - for edge in edgeList: - a = edge[0]; b = edge[1] - if D[b]==1: D[a]=1 - ok = 0 - for x in nuclList: - if x!=lastCh and D[x]==0: return 0 - return 1 - - - -def eulerian(s): - nuclCnt,dinuclCnt,List = computeCountAndLists(s) - #compute nucleotides appearing in s - nuclList = [] - for x in ["A","C","G","T"]: - if x in s: nuclList.append(x) - #compute numInList[x] = number of dinucleotides beginning with x - numInList = {} - for x in nuclList: - numInList[x]=0 - for y in nuclList: - numInList[x] += dinuclCnt[x][y] - #create dinucleotide shuffle L - firstCh = s[0] #start with first letter of s - lastCh = s[-1] - edgeList = [] - for x in nuclList: - if x!= lastCh: edgeList.append( [x,chooseEdge(x,dinuclCnt)] ) - ok = connectedToLast(edgeList,nuclList,lastCh) - return ok,edgeList,nuclList,lastCh - - -def shuffleEdgeList(L): - n = len(L); barrier = n - for i in range(n-1): - z = int(random.random() * barrier) - tmp = L[z] - L[z]= L[barrier-1] - L[barrier-1] = tmp - barrier -= 1 - return L - -def dinuclShuffle(s): - ok = 0 - while not ok: - ok,edgeList,nuclList,lastCh = eulerian(s) - nuclCnt,dinuclCnt,List = computeCountAndLists(s) - - #remove last edges from each vertex list, shuffle, then add back - #the removed edges at end of vertex lists. - for [x,y] in edgeList: List[x].remove(y) - for x in nuclList: shuffleEdgeList(List[x]) - for [x,y] in edgeList: List[x].append(y) - - #construct the eulerian path - L = [s[0]]; prevCh = s[0] - for i in range(len(s)-2): - ch = List[prevCh][0] - L.append( ch ) - del List[prevCh][0] - prevCh = ch - L.append(s[-1]) - t = string.join(L,"") - return t - - - -if __name__ == '__main__': - if len(sys.argv)!=3: - print "Usage: python altschulEriksonDinuclShuffle.py GCATCGA 5" - sys.exit(1) - s = sys.argv[1].upper() - for i in range(int(sys.argv[2])): - print dinuclShuffle(s)
--- a/bedClean.py Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,37 +0,0 @@ -import sys - -def readChrSize(filename): - f = open(filename) - chrSize = {} - for line in f: - chrom,size = line.strip().split() - chrSize[chrom]=int(size) - f.close() - return chrSize - -def cleanFile(filename,chrSize,outfile): - f = open(filename) - out = open(outfile,'w') - i = 0 - for line in f: - i = i + 1 - flds = line.strip().split('\t') - if len(flds) < 3: - print 'line',i,'incomplete line:\n',line - elif chrSize.has_key(flds[0]): - if int(flds[1]) > int(flds[2]): - tmp = flds[1] - flds[1] = flds[2] - flds[2] = tmp - if int( flds[1]) < 0 or int(flds[2]) <0: - print 'line',i,'negative coordinates:\n',line - elif int(flds[2]) > chrSize[flds[0]]: - print 'line',i,'end larger than chr size:\n',line - else: - out.write('\t'.join(flds)+'\n') - else: - print 'line',i,'chromosome',flds[0],'not found!\n',line - f.close() - out.close() - -cleanFile(sys.argv[1],readChrSize(sys.argv[2]),sys.argv[3])
--- a/bed_to_bam.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,19 +0,0 @@ -<tool id="bedToBam" name="bedToBam"> - <description>convert BED or GFF or VCF to BAM</description> - <command>bedToBam -i $input -g $genome $bed12 $mapq $ubam > $outfile </command> - <inputs> - <param name="input" format="bed,gff,vcf" type="data" label="Input file (BED,GFF,VCF)" help="BED files must be at least BED4 to be amenable to BAM (needs name field)"/> - <param name="genome" type="select" label="Select genome"> - <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/mouse.mm9.genome" selected="true">mm9</option> - <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/mouse.mm8.genome">mm8</option> - <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/human.hg18.genome">hg18</option> - <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/human.hg19.genome">hg19</option> - </param> - <param name="mapq" size="10" type="integer" value="255" label="Set the mappinq quality for the BAM records"/> - <param name="bed12" label="The BED file is in BED12 format" help="The BAM CIGAR string will reflect BED blocks" type="boolean" truevalue="-bed12" falsevalue="" checked="False"/> - <param name="ubam" label="Write uncompressed BAM output" help="Default is to write compressed BAM" type="boolean" truevalue="-ubam" falsevalue="" checked="False"/> - </inputs> - <outputs> - <data format="bam" name="outfile" /> - </outputs> -</tool>
--- a/bedclean.py Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,37 +0,0 @@ -import sys - -def readChrSize(filename): - f = open(filename) - chrSize = {} - for line in f: - chrom,size = line.strip().split() - chrSize[chrom]=int(size) - f.close() - return chrSize - -def cleanFile(filename,chrSize,outfile): - f = open(filename) - out = open(outfile,'w') - i = 0 - for line in f: - i = i + 1 - flds = line.strip().split('\t') - if len(flds) < 3: - print 'line',i,'incomplete line:\n',line - elif chrSize.has_key(flds[0]): - if int(flds[1]) > int(flds[2]): - tmp = flds[1] - flds[1] = flds[2] - flds[2] = tmp - if int( flds[1]) < 0 or int(flds[2]) <0: - print 'line',i,'negative coordinates:\n',line - elif int(flds[2]) > chrSize[flds[0]]: - print 'line',i,'end larger than chr size:\n',line - else: - out.write('\t'.join(flds)+'\n') - else: - print 'line',i,'chromosome',flds[0],'not found!\n',line - f.close() - out.close() - -cleanFile(sys.argv[1],readChrSize(sys.argv[2]),sys.argv[3])
--- a/bedclean.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,33 +0,0 @@ -<tool id="bedclean" name="clean interval"> - <description>remove off-chromosome lines</description> - <command interpreter="python">bedClean.py $input $genome $output > $log </command> - <inputs> - <param name="input" type="data" format="interval" label="Original interval file"/> - <param name="genome" type="select" label="Select genome"> - <option value="/media/data/Research/Project/galaxy/data/chrsize/mouse.mm9.genome" selected="true">mm9</option> - <option value="/media/data/Research/Project/galaxy/data/chrsize/mouse.mm8.genome">mm8</option> - <option value="/media/data/Research/Project/galaxy/data/chrsize/human.hg18.genome">hg18</option> - <option value="/media/data/Research/Project/galaxy/data/chrsize/human.hg19.genome">hg19</option> - </param> - </inputs> - <outputs> - <data format="input" name="output" label="${tool.name} on ${on_string} (interval)"/> - <data format="txt" name="log" label="${tool.name} on ${on_string} (log)"/> - </outputs> - <help> - -**Description** - -remove lines that are - -1. comment or track name lines - -2. on chr*_random - -3. or have negative coordinates - -4. or the end is larger than chromosome size - - - </help> -</tool>
--- a/bedsort.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,23 +0,0 @@ -<tool id="bedsort" name="sort"> - <description>a interval file by chr and start</description> - <command> head -n $skip $input > $output - && tail -n+`expr $skip + 1` $input | sort -k1,1 -k2,2g >> $output - </command> - <inputs> - <param name="input" type="data" format="bed" label="Input interval file"/> - <param name="skip" type="integer" value="0" label="top lines to skip" help="output directly, not sorted"/> - </inputs> - <outputs> - <data format="bed" name="output" /> - </outputs> - <help> - -**Description** - -Unix command used:: - - head -n $skip $input > $output - tail -n+`expr $skip + 1` $input | sort -k1,1 -k2,2g >> $output - - </help> -</tool>
--- a/bigWigAverageOverBed.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,12 +0,0 @@ -<tool id="bigWigAverageOverBed" name="bigWigAverageOverBed"> - <description>average interval coverage</description> - <command>bigWigAverageOverBed $bw $bed $outtab -bedOut=$outbed 2> err </command> - <inputs> - <param name="bw" format="bigwig" type="data" label="BigWig file"/> - <param name="bed" format="bed" type="data" label="Bed file"/> - </inputs> - <outputs> - <data format="tabular" name="outtab" label="${tool.name} on ${on_string} (tab)"/> - <data format="bed" name="outbed" label="${tool.name} on ${on_string} (bed)"/> - </outputs> -</tool>
--- a/binaverage.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,74 +0,0 @@ -<tool id="binaverage" name="bin and average"> - <description>of numeric columns</description> - <command>cat $script_file | R --vanilla --slave > $out_log </command> - <inputs> - <param name="input" type="data" format="tabular" label="Data file"/> - <param name="data_avg" type="integer" value="1" label="Column number of the data to average"/> - <param name="label_avg" type="text" value="label-avg" size="30" label="data label"/> - <param name="log_avg" label="log2 transform the data" type="boolean" truevalue="logavg" falsevalue="none" checked="False"/> - <param name="data_bin" type="integer" value="2" label="Column number of the data used to make bins"/> - <param name="label_bin" type="text" value="label-bin" size="30" label="data label"/> - <param name="log_bin" label="log2 transform the data" type="boolean" truevalue="logbin" falsevalue="none" checked="False"/> - <param name="nbin" type="integer" value="3" label="number of bins"/> - <param name="bintype" type="select" label="Bin by rank or by value" > - <option value="rank" selected="true">by rank: bins have the same number of data points</option> - <option value="value">by value: bins may have different number of data points</option> - </param> - <param name="legendloc" type="select" label="legend location on CDF plot" > - <option value="bottomright" selected="true">bottomright</option> - <option value="bottomleft">bottomleft</option> - <option value="bottom">bottom</option> - <option value="left">left</option> - <option value="topleft">topleft</option> - <option value="top">top</option> - <option value="topright">topright</option> - <option value="right">right</option> - <option value="center">center</option> - </param> - - <param name="title" type="text" value="bin-average" size="50" label="title of this analysis"/> - - </inputs> - - <configfiles> - <configfile name="script_file"> - ## Setup R error handling to go to stderr - options(warn=-1) - source("/Users/xuebing/galaxy-dist/tools/mytools/cdf.r") - x = read.table("${input}",sep='\t') - x = x[,c($data_bin,$data_avg)] - label_avg = "${label_avg}" - label_bin = "${label_bin}" - if ("${log_bin}" == "logbin"){ - x[,1] = log2(1+x[,1]) - label_bin = paste('log2',label_bin) - } - if ("${log_avg}" == "logavg"){ - x[,2] = log2(1+x[,2]) - label_avg = paste('log2',label_avg) - } - res = binaverage(x,$nbin,"${bintype}") - attach(res) - for (i in 1:${nbin}){ - print(paste(label_bin,labels[i],sep=':')) - print(summary(binned[[i]])) - } - pdf("${out_file}") - mycdf(binned,"${title}",labels,"$legendloc",label_avg,label_bin) - dev.off() - </configfile> - </configfiles> - - <outputs> - <data format="txt" name="out_log" label="${title}: (log)" /> - <data format="pdf" name="out_file" label="${title}: (plot)" /> - </outputs> - -<help> - -.. class:: infomark - -This tool generates barplot and CDF plot comparing data/rows in a numeric column that are binned by a second numeric column. The input should have at least two numeric columns. One of the column is used to group rows into bins, and then values in the other column are compared using barplot, CDF plot, and KS test. - -</help> -</tool>
--- a/binnedAverage.py Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,77 +0,0 @@ -''' -get binned score of intervals,allow extension -''' - -import os,sys,numpy,random,string - -from resize import * - -from bx.bbi.bigwig_file import BigWigFile - -def binning(x,n): - # make n bin of x - y = numpy.zeros(n,dtype=float) - if len(x) == 0: - return y - step = float(len(x))/n - for k in range(n): - i = int(step*k) - j = int(step*(k+1)) + 1 - y[k] = x[i:j].mean() - #print i,j,k,y[k] - return y - -def getBinnedScore(bwfile,intvfile,outfile,outplot,nbin): - ''' - get binned average and std - ''' - fbw = open(bwfile) - bw = BigWigFile(file=fbw) - fin = open(intvfile) - out = open(outfile,'w') - zeros = '\t'.join(['0']*nbin) - for line in fin: - #chrom,start,end,name,score,strand - line = line.strip() - flds = line.split('\t') - #get the score at base resolution as an array - scores = bw.get_as_array(flds[0],int(flds[1]),int(flds[2])) - if scores == None: - print 'not found:\t',line - out.write(line+'\t'+zeros+'\n') - continue - # reverse if on minus strand - if flds[5] == '-': - scores = scores[::-1] - # no score = 0 - scores = numpy.nan_to_num(scores) - # bin the data - binned = binning(scores,nbin) - out.write(line+'\t'+'\t'.join(map(str,binned))+'\n') - fin.close() - out.close() - # plot - if nbin > 1: - tmp = "".join(random.sample(string.letters+string.digits, 8)) - rscript = open(tmp,"w") - rscript.write("options(warn=-1)\n") - rscript.write("x <- read.table('"+outfile+"',sep='\t')\n") - rscript.write("x <- x[,(ncol(x)+1-"+str(nbin)+"):ncol(x)]\n") - rscript.write("pdf('"+outplot+"')\n") - rscript.write("avg <- apply(x,2,mean)\n") - rscript.write("err <- apply(x,2,sd)/sqrt(nrow(x))\n") - rscript.write("print(avg)\n") - rscript.write("print(err)\n") - rscript.write("ylim=c(min(avg-err),max(avg+err))\n") - rscript.write("xticks <- seq(ncol(x))\n") - rscript.write("plot(xticks,avg,xlab='',ylab='average',type='l',lwd=0,ylim=ylim)\n") - rscript.write("polygon(c(xticks,rev(xticks)),c(avg+err,rev(avg-err)),col='lightgreen',border=NA)\n") - rscript.write("lines(xticks,avg,type='l',lwd=1)\n") - rscript.write("dev.off()\n") - rscript.close() - os.system("R --vanilla < "+tmp) - os.system("rm "+tmp) - -print sys.argv -prog,bwfile,intvfile,nbin,outfile,outplot = sys.argv -getBinnedScore(bwfile,intvfile,outfile,outplot,int(nbin))
--- a/bowtie2bed.pl Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,45 +0,0 @@ -#!/usr/bin/perl - -# write bowtie output to bed file - -# perl bowtie2bed.pl s_5_trimmed.map outputfile 200 -# input -# inputfile -# extension - -$inputfile = $ARGV[0]; -$extension = $ARGV[2]; -$outputfile = $ARGV[1];#$inputfile.".extended_$extension"."bp.bed"; - -print "input file: $inputfile\n"; -print "output file: $outputfile\n"; -print "track name: $outputfile\n"; - -open (IN,$inputfile) or die $!; -open (OUT,">$outputfile") or die $!; - -print OUT "track name=$outputfile itemRgb=On\n"; - -while(<IN>) -{ - @flds = split/\t/; - $flds[0] =~ s/ /-/g;#substitute space to dash - - if ($flds[1] eq "+") - { - print OUT join("\t",$flds[2],$flds[3],$flds[3]+$extension+length($flds[4]),$flds[0],1,$flds[1],$flds[3],$flds[3]+length($flds[4]),"255,0,0","\n"); - } - else - { - - print OUT join("\t",$flds[2],max(0,$flds[3]-$extension),$flds[3]+length($flds[4]),$flds[0],1,$flds[1],$flds[3],$flds[3]+length($flds[4]),"0,255,0","\n"); - } -} -close(IN); -close(OUT); - -sub max() -{ - ($a,$b) = @_; - return $a>$b?$a:$b; -}
--- a/bowtie2bed.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,44 +0,0 @@ -<tool id="bowtie2bed" name="bowtie-to-bed"> - <description> converter and read extender</description> - <command interpreter="perl">bowtie2bed.pl $input $out_file1 $extendLength </command> - <inputs> - <param name="input" format="txt" type="data" label="Bowtie map result file"/> - <param name="extendLength" size="10" type="integer" value="200" label="Extend 3 primer (bp)"/> - </inputs> - <outputs> - <data format="bed" name="out_file1" /> - </outputs> - <tests> - <test> - <param name="input" value="test.map" ftype="TXT"/> - <param name="extendLength" value="200"/> - <output name="out_file1" file="testmap.bed"/> - </test> - </tests> - <help> - - -**What it does** - -This tool converts bowtie output map format file to bed format, with the option to extend the 3 primer end. - -- Sequence and quality information is lost after conversion -- The output contains a track name at the first row - ------ - -**Example** - -Converting the following bowtie mapped reads:: - - SRR073078.2 HWUSI-EAS465_8_1_1_524 length=36 - chr2 112499209 AGTGTGACTGCATCTCTTCCTTCGTGGGGCTNCAGT ... - SRR073078.3 HWUSI-EAS465_8_1_1_1054 length=36 + chr17 75877120 CCACNCCTCCTTTCAAAACACACTGCCAGGTGCGTC ... - -will result in:: - - track name=/home/xuebing/Research/galaxy/galaxy-dist/database/files/000/dataset_5.dat itemRgb=On - chr2 112499109 112499245 SRR073078.2-HWUSI-EAS465_8_1_1_524-length=36 1 - 112499209 112499245 0,255,0 - chr17 75877120 75877256 SRR073078.3-HWUSI-EAS465_8_1_1_1054-length=36 1 + 75877120 75877156 255,0,0 - - </help> -</tool>
--- a/bwBinavg.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,44 +0,0 @@ -<tool id="bwbinavg" name="bigwig summary"> - <description>for intervals</description> - <command interpreter="python">getGenomicScore.py $input $output $score_type $bwfile $nbin $strand $outplot $span</command> - <inputs> - <param name="input" format="interval" type="data" label="Interval file"/> - <param name="bwfile" format="bigwig" type="data" label="BigWig file"/> - <param name="score_type" type="select" label="Select score summary type" > - <option value="mean" selected="true">mean</option> - <option value="max">maximum</option> - <option value="min">minimum</option> - <option value="std">standard deviation</option> - <option value="coverage">coverage:fraction covered</option> - </param> - <param name="nbin" type="integer" value="1" label="number of bins"/> - <param name="strand" type="integer" value="0" label="Specify the strand column" help="leave 0 to ignore strand information. Only matters if using more than 1 bin"/> - <param name="span" size="10" type="float" value="0.1" label="loess span: smoothing parameter" help="value less then 0.1 disables smoothing"/> - </inputs> - <outputs> - <data format="pdf" name="outplot" label="${tool.name} on ${on_string}[plot]"/> - <data format="interval" name="output" label="${tool.name} on ${on_string}[data]"/> - </outputs> - <help> - -.. class:: infomark - -Each interval is binned and the average base-resolution score/coverage/density in the bigwig file is added as new columns appended at the end of the original file . - -**Example** - -If your original data has the following format: - -+-----+-----+---+------+ -|chrom|start|end|other2| -+-----+-----+---+------+ - -and you choose to divide each interval into 3 bins and return the mean scores of each bin, your output will look like this: - -+-----+-----+---+------+-----+-----+-----+ -|chrom|start|end|other2|mean1|mean2|mean3| -+-----+-----+---+------+-----+-----+-----+ - - -</help> -</tool>
--- a/cdf.r Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,103 +0,0 @@ -# bin and average -binaverage = function(x,nbin,rankORvalue){ -# use x[,1] to bin x[,2] - binned = list() - if (rankORvalue == 'value'){ - mi = min(x[,1]) - ma = max(x[,1]) - bins = seq(mi,ma,length.out=nbin+1) - bins[1] = bins[1] - abs(mi)/100 - bins[nbin+1] = bins[nbin+1] + abs(ma)/100 - for (i in 1:nbin){ - binned[[i]] = x[x[,1] >= bins[i] & x[,1] < bins[i+1],2] - } - bins[1] = bins[1] + abs(mi)/100 - bins[nbin+1] = bins[nbin+1] - abs(ma)/100 - } else { - x = x[order(x[,1]),] - step = round(nrow(x)/nbin) - bins = x[1,1] - for (i in 1:(nbin-1)){ - binned[[i]] = x[((i-1)*step+1):(i*step),2] - bins = c(bins,x[i*step+1,1]) - } - binned[[nbin]] = x[((nbin-1)*step+1):nrow(x),2] - bins[nbin+1] = x[nrow(x),1] - } -# bin lavel - labels = character(0) - for (i in 1:nbin){ - labels = c(labels,paste(format(bins[i],digits=2,nsmall=2),format(bins[i+1],digits=2,nsmall=2),sep='~')) - } - list(binned=binned,bins=bins,labels=labels) -} -#res = binaverage(x,3,'rank') - -# CDF plot and KS.test -mycdf = function(list,title,labels,legendposition,xlabel,legend_title){ - L = length(list) - - # barplot for mean and std - avg = numeric(L) - err = numeric(L) - for (i in 1:L){ - avg[i] = mean(list[[i]]) - err[i] = sd(list[[i]]) - } - #print(list[[1]]) - #print(list[[2]]) - #print(avg) - #print(err) - par(cex=1.5,mar=c(8,6,6,4)) - xticks <- barplot(avg,names.arg=labels,las=2,ylab=xlabel,main='mean and standard deviation',xlab=legend_title,ylim=c(0,max(avg+err))) - arrows(xticks,avg+err, xticks, avg-err, angle=90, code=3, length=0.0) - - if (L>1){ - # ks test - cat('\nKS test:\n') - cat('sample1\tsample2\tp-value\n') - cat('-------------------------------------------------\n') - for (i in 1:(L-1)){ - for (j in (i+1):L){ - cat(labels[i],'\t',labels[j],'\t') - ks = ks.test(list[[i]],list[[j]]) - pv = max(2.2e-16,ks$p.value) - pv = format(pv,digits=3,nsmall=2) - cat(pv,'\n') - } - } - cat('-------------------------------------------------\n') - } - if (L == 2){ - title = paste(title,'\np=',pv,sep='') - } - # cdf plot - listx = list() - listy = list() - mi = 1e10 - ma = 1e-10 - for (i in 1:L){ - mi = min(mi,list[[i]]) - ma = max(ma,list[[i]]) - } - for (i in 1:L){ - listx[[i]] = c(mi,listx[i],ma) - listy[[i]] = c(0,listy[i],1) - } - for (i in 1:L){ - mi = min(mi,list[[i]]) - ma = max(ma,list[[i]]) - listx[[i]] = sort(list[[i]]) - listy[[i]] = c(1:length(list[[i]]))/length(list[[i]]) - } -#par(xlog=(xlog=='xlog')) - plot(listx[[1]],listy[[1]],type='l',lty=1,lwd=2,col=2,main=title,xlab=xlabel,ylab='cumulative frequency') - for (i in 2:L){ - lines(listx[[i]],listy[[i]],type='l',lty=i,lwd=2,col=i+1) - } - # legend - for (i in 1:L){ - labels[i] = paste(labels[i],', n=',length(list[[i]]),sep='') - } - legend(legendposition,legend=labels,col=2:(L+1), lty=1:L,lwd=2, bty='n',title=legend_title) -}
--- a/cdf.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,78 +0,0 @@ -<tool id="cdf" name="CDF"> - <description>plot of multiple numeric columns</description> - <command>cat $script_file | R --vanilla --slave > $out_log </command> - <inputs> - <param name="title" type="text" value="CDF plot" size="30" label="Plot title"/> - <param name="xlabel" type="text" value="value" size="30" label="xlabel"/> - <param name="log" label="log2 transform the data" type="boolean" truevalue="log" falsevalue="none" checked="False"/> - <param name="zero" label="remove zero" type="boolean" truevalue="zero" falsevalue="none" checked="False"/> - <param name="legendloc" type="select" label="legend location on CDF plot" > - <option value="bottomright" selected="true">bottomright</option> - <option value="bottomleft">bottomleft</option> - <option value="bottom">bottom</option> - <option value="left">left</option> - <option value="topleft">topleft</option> - <option value="top">top</option> - <option value="topright">topright</option> - <option value="right">right</option> - <option value="center">center</option> - </param> - <repeat name="series" title="sample"> - <param name="label" type="text" value="" size="30" label="data label"/> - <param name="input" type="data" format="tabular" label="dataset"/> - <param name="column" type="integer" value="2" label="column number (-1 for last column)"/> - </repeat> - - </inputs> - - <configfiles> - <configfile name="script_file"> - ## Setup R error handling to go to stderr - options(warn=-1) - source("/Users/xuebing/galaxy-dist/tools/mytools/cdf.r") - uselog = as.character("${log}") - zero = as.character("${zero}") - title = as.character("${title}") - xlabel = as.character("${xlabel}") - if (uselog=='log'){ - xlabel = paste('log2',xlabel) - } - - labels = character(0) - x = list() - #for $i, $s in enumerate( $series ) - labels = c(labels,"${s.label.value}") - x0 = read.table( "${s.input.file_name}" ,sep='\t') - col = ${s.column.value} - if (col == -1) {col = ncol(x0)} - x0 = x0[,col] - if (zero == 'zero'){ - x0 = x0[x0 != 0] - } - if (uselog=='log'){ - x0=log2(1+x0) - } - print("${s.label.value}") - summary(x0) - x[[$i+1]] = x0 - #end for - pdf("${out_file}") - mycdf(x,title,labels,"${legendloc}",xlabel,'') - dev.off() - </configfile> - </configfiles> - - <outputs> - <data format="txt" name="out_log" label="${tool.name} on ${on_string}: (log)" /> - <data format="pdf" name="out_file" label="${tool.name} on ${on_string}: (plot)" /> - </outputs> - -<help> - -.. class:: infomark - -This tool generate barplot and CDF plot comparing multiple numeric columns in different files. - - -</help> -</tool>
--- a/closestBed.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,32 +0,0 @@ -<tool id="closestbed" name="closestBed"> - <description>find closest features</description> - <command>closestBed -a $inputa -b $inputb $strandness -d $no -t $tie> $output_data - </command> - <inputs> - <param name="inputa" type="data" format="interval,bam,bed,gff,vcf" label="Input A (-a)"/> - <param name="inputb" type="data" format="interval,bam,bed,gff,vcf" label="Input B (-b)"/> - <param name="strandness" type="select" label="Strand requirement" > - <option value="" selected="true"> none </option> - <option value="-s" > -s: closest feature on the same strand</option> - <option value="-S" > -S: closest feature on the opposite strand </option> - </param> - - <param name="no" label="Only look for non-overlaping features" type="boolean" truevalue="-no" falsevalue="" checked="False"/> - <param name="tie" type="select" label="Strand requirement" > - <option value="all" selected="true"> report all ties </option> - <option value="first" > report the first that occurred</option> - <option value="last" > report the last that occurred </option> - </param> - </inputs> - <outputs> - <data format="input" name="output_data"/> - </outputs> - <help> - -**What it does** - -This is a wrapper for closestBed. - - </help> -</tool> -
--- a/collapseBed.py Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,58 +0,0 @@ -''' -collapse intervals -''' - -def collapseInterval_strand(filename): - uniqintv = {} - data = {} - f = open(filename) - header = f.readline() - if 'chr' in header: - flds = header.strip().split('\t') - key = '\t'.join([flds[0],flds[1],flds[2],flds[5]]) - uniqintv[key] = 1 - data[key] = flds - for line in f: - flds = line.strip().split('\t') - key = '\t'.join([flds[0],flds[1],flds[2],flds[5]]) - if uniqintv.has_key(key): - uniqintv[key] = uniqintv[key] + 1 - else: - uniqintv[key] = 1 - data[key] = flds - f.close() - for key in uniqintv.keys(): - print '\t'.join(data[key]+[str(uniqintv[key])]) - #flds = key.split('\t') - #print '\t'.join([flds[0],flds[1],flds[2],'.',str(uniqintv[key]),flds[3]]) - -def collapseInterval(filename): - uniqintv = {} - data = {} - f = open(filename) - header = f.readline() - if 'chr' in header: - flds = header.strip().split('\t') - key = '\t'.join([flds[0],flds[1],flds[2]]) - uniqintv[key] = 1 - data[key] = flds - for line in f: - flds = line.strip().split('\t') - key = '\t'.join([flds[0],flds[1],flds[2]]) - if uniqintv.has_key(key): - uniqintv[key] = uniqintv[key] + 1 - else: - uniqintv[key] = 1 - data[key] = flds - f.close() - for key in uniqintv.keys(): - print '\t'.join(data[key]+[str(uniqintv[key])]) - #flds = key.split('\t') - #print '\t'.join([flds[0],flds[1],flds[2],'.',str(uniqintv[key])]) - -import sys - -if sys.argv[2] == 'strand': - collapseInterval_strand(sys.argv[1]) -else: - collapseInterval(sys.argv[1])
--- a/collapseBed.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,19 +0,0 @@ -<tool id="collapseBed" name="collapse"> - <description>intervals</description> - <command interpreter="python">collapseBed2.py $input $strand $score > $outfile </command> - <inputs> - <param name="input" format="interval" type="data" label="Original file"/> - <param name="strand" size="10" type="integer" value="6" label="strand column" help="set 0 to ignore strand information" /> - <param name="score" size="10" type="integer" value="5" label="for duplicate lines, keep the one with max value in column" help="set 0 to ignore score information" /> - </inputs> - <outputs> - <data format="input" name="outfile" /> - </outputs> - <help> - -**What it does** - -This tool collapses genomic intervals that have the same position (and strandness if specified) and output a set of unique intervals. - - </help> -</tool>
--- a/collapseBed2.py Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,36 +0,0 @@ -''' -collapse intervals -''' - -def collapseInterval_strand(filename,c_strand,c_score): - # keeping max column c - uniqintv = {} - data = {} - f = open(filename) - header = f.readline() - if 'chr' in header: - flds = header.strip().split('\t') - key = '\t'.join([flds[0],flds[1],flds[2],flds[c_strand]]) - uniqintv[key] = float(flds[c_score]) - data[key] = flds - for line in f: - flds = line.strip().split('\t') - key = '\t'.join([flds[0],flds[1],flds[2],flds[c_strand]]) - if not uniqintv.has_key(key): - uniqintv[key] = float(flds[c_score]) - data[key] = flds - elif uniqintv[key] < float(flds[c_score]): - uniqintv[key] = float(flds[c_score]) - data[key] = flds - - f.close() - for key in uniqintv.keys(): - print '\t'.join(data[key]) - -import sys - -if sys.argv[2] == '0':#ignore strand - sys.argv[2] = 1 -if sys.argv[3] == '0':# ignore score - sys.argv[3] = 2 -collapseInterval_strand(sys.argv[1],int(sys.argv[2])-1,int(sys.argv[3])-1)
--- a/collapseTab.py Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,37 +0,0 @@ -''' -collapse tabular files, with key columns, and max columns -''' - -def collapseTab(filename,c_key,c_max): - # keeping rows with max value in column c_max - nCol = max(max(c_key),c_max) - c_max = c_max - 1 - for i in range(len(c_key)): - c_key[i] = c_key[i] - 1 - uniqintv = {} - data = {} - f = open(filename) - for line in f: - flds = line.strip().split('\t') - if len(flds) < nCol: - continue - key = '' - for i in c_key: - key = key + flds[i-1] # i is 1-based, python is 0-based - if not uniqintv.has_key(key): - uniqintv[key] = float(flds[c_max]) - data[key] = flds - elif uniqintv[key] < float(flds[c_max]): - uniqintv[key] = float(flds[c_max]) - data[key] = flds - - f.close() - for key in uniqintv.keys(): - print '\t'.join(data[key]) - -import sys - -# convert string to number list -c_key = map(int,sys.argv[2].split(',')) -c_max = int(sys.argv[3]) -collapseTab(sys.argv[1],c_key,c_max)
--- a/collapseTab.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,19 +0,0 @@ -<tool id="collapseTab" name="collapse tabular"> - <description>files</description> - <command interpreter="python">collapseTab.py $input $key $max > $outfile </command> - <inputs> - <param name="input" format="tabular" type="data" label="Original file"/> - <param name="key" size="10" type="text" value="1,2,3" label="key column(s)" help="columns to define unique rows" /> - <param name="max" size="10" type="text" value="5" label="for lines with identical key, keep the one with max value in this column" help="need to be numeric" /> - </inputs> - <outputs> - <data format="input" name="outfile" /> - </outputs> - <help> - -**What it does** - -Similar to 'Group' but returns the entire line. - - </help> -</tool>
--- a/convertEnsembl.py Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,26 +0,0 @@ -''' -convert ensembl bed to ucsc -add chr to chromosome -1 = + --1 = - -''' - -import sys -f = open(sys.argv[1]) -out = open(sys.argv[2],'w') -skip = int(sys.argv[3]) - -for i in range(skip): - f.readline() - -for line in f: - flds = line.strip().split('\t') - flds[0] = 'chr'+flds[0] - if flds[5] == '1': - flds[5] = '+' - else: - flds[5] = '-' - out.write('\t'.join(flds)+'\n') -f.close() -out.close() -
--- a/convertEnsembl.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,18 +0,0 @@ -<tool id="convertens" name="convert ensembl"> - <description>to ucsc</description> - <command interpreter="python">convertEnsembl.py $input $output $skip </command> - <inputs> - <param name="input" format="interval" type="data" label="Original file"/> - <param name="skip" size="10" type="integer" value="0" label="Number of beginning lines to skip"/> - </inputs> - <outputs> - <data format="input" name="output" /> - </outputs> - <help> - -**What it does** - -This tool convert ensembl based interval file to ucsc format: add 'chr' to chromosome number (column 1), and replace '1' and '-1' with '+' and '-' in column 6, respectively. - - </help> -</tool>
--- a/dreme.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,50 +0,0 @@ -<tool id="dreme" name="DREME"> - <description>short motif discovery</description> - <command interpreter="python">/Users/xuebing/bin/dreme.py -p $input -png -e $ethresh - #if $background_select.bg_select == "fromfile": - -n "${bgfile}" - #end if - - && mv dreme_out/dreme.html ${html_outfile} - - && mv dreme_out/dreme.txt ${txt_outfile} - - && mv dreme_out/dreme.xml ${xml_outfile} - - && rm -rf dreme_out - - </command> - <inputs> - <param name="input" type="data" format="fasta" label="Sequence file (FASTA)"/> - <conditional name="background_select"> - <param name="bg_select" type="select" label="Background sequence" > - <option value="shuffle" selected="true">shuffle the orignal sequence</option> - <option value="fromfile">load from file</option> - </param> - <when value="fromfile"> - <param name="bgfile" type="data" format="fasta" label="Background sequence file (FASTA)"/> - </when> - </conditional> - - <param name="ethresh" size="10" type="float" value="0.05" label="E-value threshold"/> - </inputs> - <outputs> - - <data format="xml" name="xml_outfile" label="${tool.name} on ${on_string} (xml)"/> - <data format="txt" name="txt_outfile" label="${tool.name} on ${on_string} (motif)"/> - <data format="html" name="html_outfile" label="${tool.name} on ${on_string} (html)"/> - </outputs> - <help> - -**What it does** - -http://meme.sdsc.edu/meme/doc/dreme.html - -DREME (Discriminative Regular Expression Motif Elicitation) finds relatively short motifs (up to 8 bases) fast, and can perform discriminative motif discovery if given a negative set, consisting of sequences unlikely to contain a motif of interest that is however likely to be found in the main ("positive") sequence set. If you do not provide a negative set the program shuffles the positive set to provide a background (in the role of the negative set). - -The input to DREME is one or two sets of DNA sequences. The program uses a Fisher Exact Test to determine significance of each motif found in the postive set as compared with its representation in the negative set, using a significance threshold that may be set on the command line. - -DREME achieves its high speed by restricting its search to regular expressions based on the IUPAC alphabet representing bases and ambiguous characters, and by using a heuristic estimate of generalised motifs' statistical significance. - - </help> -</tool>
--- a/endbias.py Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,52 +0,0 @@ -''' -usage: - -python endbias.py utr5-coverage utr3-coverage outputfile -''' -import sys,math - -def getCoverage(filename): - f = open(filename) - coverage = {} - for line in f: - flds = line.strip().split('\t') - score = float(flds[4]) - name = (flds[0].split('utr'))[0].strip('_') - if coverage.has_key(name): - if score > coverage[name]: - coverage[name] = score - else: - coverage[name] = score - return coverage - -def endBias(filename,utr5,utr3): - out = open(filename,'w') - for txpt in utr5.keys(): - if utr3.has_key(txpt): - out.write('\t'.join([txpt,str(utr5[txpt]),str(utr3[txpt]),str(math.log((1+utr5[txpt])/(1+utr3[txpt]),2))])+'\n') - out.close() - - -utr5 = getCoverage(sys.argv[1]) -utr3 = getCoverage(sys.argv[2]) -endBias(sys.argv[3],utr5,utr3) - -''' - -utr5 = getCoverage('hmga2-utr5.coverage') -utr3 = getCoverage('hmga2-utr3.coverage') -logratio, cov5,cov3= endBias(utr5,utr3) -2**pylab.median(logratio.values()) - -log2utr5 = pylab.log2(pylab.array(cov5)+1) -log2utr3 = pylab.log2(pylab.array(cov3)+1) - -pylab.plot(log2utr5,log2utr3,'bo') - -pylab.show() - -utr5 = getCoverage('control-utr5.coverage') -utr3 = getCoverage('control-utr3.coverage') -logratio, cov5,cov3= endBias(utr5,utr3) -2**pylab.median(logratio.values()) -'''
--- a/endbias.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,11 +0,0 @@ -<tool id="endbias" name="bias"> - <description>of UTR coverage</description> - <command interpreter="python"> endbias.py $input1 $input2 $output </command> - <inputs> - <param name="input1" format="txt" type="data" label="5' UTR coverage" help="tabular output from bigWigAverageOverBed"/> - <param name="input2" format="txt" type="data" label="3' UTR coverage" help="tabular output from bigWigAverageOverBed"/> - </inputs> - <outputs> - <data format="tabular" name="output" /> - </outputs> -</tool>
--- a/fasta-dinucleotide-shuffle.py Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,223 +0,0 @@ -#!/usr/bin/python - -import sys, string, random -import sequence - -# -# turn on psyco to speed up by 3X -# -if __name__=='__main__': - try: - import psyco - #psyco.log() - psyco.full() - psyco_found = True - except ImportError: -# psyco_found = False - pass -# print >> sys.stderr, "psyco_found", psyco_found - - -# altschulEriksonDinuclShuffle.py -# P. Clote, Oct 2003 - -def computeCountAndLists(s): - - #Initialize lists and mono- and dinucleotide dictionaries - List = {} #List is a dictionary of lists - List['A'] = []; List['C'] = []; - List['G'] = []; List['T'] = []; - # FIXME: is this ok? - List['N'] = [] - nuclList = ["A","C","G","T","N"] - s = s.upper() - #s = s.replace("U","T") - nuclCnt = {} #empty dictionary - dinuclCnt = {} #empty dictionary - for x in nuclList: - nuclCnt[x]=0 - dinuclCnt[x]={} - for y in nuclList: - dinuclCnt[x][y]=0 - - #Compute count and lists - nuclCnt[s[0]] = 1 - nuclTotal = 1 - dinuclTotal = 0 - for i in range(len(s)-1): - x = s[i]; y = s[i+1] - List[x].append( y ) - nuclCnt[y] += 1; nuclTotal += 1 - dinuclCnt[x][y] += 1; dinuclTotal += 1 - assert (nuclTotal==len(s)) - assert (dinuclTotal==len(s)-1) - return nuclCnt,dinuclCnt,List - - -def chooseEdge(x,dinuclCnt): - z = random.random() - denom=dinuclCnt[x]['A']+dinuclCnt[x]['C']+dinuclCnt[x]['G']+dinuclCnt[x]['T']+dinuclCnt[x]['N'] - numerator = dinuclCnt[x]['A'] - if z < float(numerator)/float(denom): - dinuclCnt[x]['A'] -= 1 - return 'A' - numerator += dinuclCnt[x]['C'] - if z < float(numerator)/float(denom): - dinuclCnt[x]['C'] -= 1 - return 'C' - numerator += dinuclCnt[x]['G'] - if z < float(numerator)/float(denom): - dinuclCnt[x]['G'] -= 1 - return 'G' - numerator += dinuclCnt[x]['T'] - if z < float(numerator)/float(denom): - dinuclCnt[x]['T'] -= 1 - return 'T' - dinuclCnt[x]['N'] -= 1 - return 'N' - -def connectedToLast(edgeList,nuclList,lastCh): - D = {} - for x in nuclList: D[x]=0 - for edge in edgeList: - a = edge[0]; b = edge[1] - if b==lastCh: D[a]=1 - for i in range(3): - for edge in edgeList: - a = edge[0]; b = edge[1] - if D[b]==1: D[a]=1 - ok = 0 - for x in nuclList: - if x!=lastCh and D[x]==0: return 0 - return 1 - -def eulerian(s): - nuclCnt,dinuclCnt,List = computeCountAndLists(s) - #compute nucleotides appearing in s - nuclList = [] - for x in ["A","C","G","T","N"]: - if x in s: nuclList.append(x) - #create dinucleotide shuffle L - firstCh = s[0] #start with first letter of s - lastCh = s[-1] - edgeList = [] - for x in nuclList: - if x!= lastCh: edgeList.append( [x,chooseEdge(x,dinuclCnt)] ) - ok = connectedToLast(edgeList,nuclList,lastCh) - return ok,edgeList,nuclList,lastCh - - -def shuffleEdgeList(L): - n = len(L); barrier = n - for i in range(n-1): - z = int(random.random() * barrier) - tmp = L[z] - L[z]= L[barrier-1] - L[barrier-1] = tmp - barrier -= 1 - return L - -def dinuclShuffle(s): - ok = 0 - while not ok: - ok,edgeList,nuclList,lastCh = eulerian(s) - nuclCnt,dinuclCnt,List = computeCountAndLists(s) - - #remove last edges from each vertex list, shuffle, then add back - #the removed edges at end of vertex lists. - for [x,y] in edgeList: List[x].remove(y) - for x in nuclList: shuffleEdgeList(List[x]) - for [x,y] in edgeList: List[x].append(y) - - #construct the eulerian path - L = [s[0]]; prevCh = s[0] - for i in range(len(s)-2): - ch = List[prevCh][0] - L.append( ch ) - del List[prevCh][0] - prevCh = ch - L.append(s[-1]) - t = string.join(L,"") - return t - -def main(): - - # - # defaults - # - file_name = None - seed = 1 - copies = 1 - - # - # get command line arguments - # - usage = """USAGE: - %s [options] - - -f <filename> file name (required) - -t <tag> added to shuffled sequence names - -s <seed> random seed; default: %d - -c <n> make <n> shuffled copies of each sequence; default: %d - -h print this usage message - """ % (sys.argv[0], seed, copies) - - # no arguments: print usage - if len(sys.argv) == 1: - print >> sys.stderr, usage; sys.exit(1) - - tag = ""; - - # parse command line - i = 1 - while i < len(sys.argv): - arg = sys.argv[i] - if (arg == "-f"): - i += 1 - try: file_name = sys.argv[i] - except: print >> sys.stderr, usage; sys.exit(1) - elif (arg == "-t"): - i += 1 - try: tag = sys.argv[i] - except: print >> sys.stderr, usage; sys.exit(1) - elif (arg == "-s"): - i += 1 - try: seed = string.atoi(sys.argv[i]) - except: print >> sys.stderr, usage; sys.exit(1) - elif (arg == "-c"): - i += 1 - try: copies = string.atoi(sys.argv[i]) - except: print >> sys.stderr, usage; sys.exit(1) - elif (arg == "-h"): - print >> sys.stderr, usage; sys.exit(1) - else: - print >> sys.stderr, "Unknown command line argument: " + arg - sys.exit(1) - i += 1 - - # check that required arguments given - if (file_name == None): - print >> sys.stderr, usage; sys.exit(1) - - random.seed(seed) - - # read sequences - seqs = sequence.readFASTA(file_name,'Extended DNA') - - for s in seqs: - str = s.getString() - #FIXME altschul can't handle ambigs - name = s.getName() - - #print >> sys.stderr, ">%s" % name - - for i in range(copies): - - shuffledSeq = dinuclShuffle(str) - - if (copies == 1): - print >> sys.stdout, ">%s\n%s" % (name+tag, shuffledSeq) - else: - print >> sys.stdout, ">%s_%d\n%s" % (name+tag, i, shuffledSeq) - -if __name__ == '__main__': main()
--- a/fastamarkov.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,21 +0,0 @@ -<tool id="fastamarkov" name="background model"> - <description>of DNA sequence</description> - <command>cat $input | fasta-get-markov -m $m $norc > $output 2> err.txt - - </command> - <inputs> - <param name="input" type="data" format="fasta" label="Sequence file (FASTA)"/> - <param name="m" size="10" type="integer" value="0" label="Order of Markov model to use"/> - <param name="norc" label="Combine forward and reverse complement frequencies" type="boolean" truevalue="" falsevalue="-norc" checked="True"/> - </inputs> - <outputs> - <data format="txt" name="output" /> - </outputs> - <help> - -**What it does** - -This tool generates a markov model from a fasta sequence file. - - </help> -</tool>
--- a/fastashuffle1.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,17 +0,0 @@ -<tool id="seqshuffle" name="shuffle sequences"> - <description>preserving mono-nucleotide frequency</description> - <command>cat $input | fasta-shuffle-letters > $output </command> - <inputs> - <param name="input" type="data" format="fasta" label="Original FASTA sequence file"/> - </inputs> - <outputs> - <data format="fasta" name="output" /> - </outputs> - <help> - -**Description** - -shuffle the position of nucleotides in each sequence, preserving the frequency of nucleotides in that sequence, but do not preserve di-nucleotide frequency. - - </help> -</tool>
--- a/fastashuffle2.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,24 +0,0 @@ -<tool id="seqshuffle2" name="shuffle sequence"> - <description>preserving dinucleotide frequency</description> - <command interpreter="python">fasta-dinucleotide-shuffle.py -f $input -t $tag -c $n -s $seed > $output </command> - <inputs> - <param name="input" format="fasta" type="data" label="Original sequence file"/> - <param name="tag" type="text" size="40" value="-shuffled" label="tag added to shuffled sequence name"/> - <param name="n" type="integer" value="1" label="number of shuffled copies for each sequence"/> - <param name="seed" type="integer" value="1" label="random seed" help="the same seed gives the same random sequences"/> - </inputs> - <outputs> - <data format="fasta" name="output" /> - </outputs> - <help> - -**What it does** - -This tool shuffles the sequences in the input file but preserves the dinucleotide frequency of each sequence. - -The code implements the Altschul-Erikson dinucleotide shuffle algorithm, described in "Significance of nucleotide sequence alignments: A method for random sequence permutation that preserves dinucleotide and codon usage", S.F. Altschul and B.W. Erikson, Mol. Biol. Evol., 2(6):526--538, 1985. - -Code adapted from http://bioinformatics.bc.edu/clotelab/RNAdinucleotideShuffle/dinucleotideShuffle.html - - </help> -</tool>
--- a/fastqdump.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,18 +0,0 @@ -<tool id="fastqdump" name="fastq-dump"> - <description>convert SRA to FASTQ</description> - <command>/Users/xuebing/tools/sratoolkit.2.1.9-mac32/fastq-dump -A $input -M $minReadLen -Z > $out_file1 </command> - <inputs> - <param name="input" format="sra" type="data" label="Original file (SRA)"/> - <param name="minReadLen" size="10" type="integer" value="10" label="minimum read length to output"/> - </inputs> - <outputs> - <data format="fastq" name="out_file1" /> - </outputs> - <help> - -**What it does** - -This is a wrapper of the fastq-dump tool from sra-toolkit. See http://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?view=software - - </help> -</tool>
--- a/fimo2-old.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,63 +0,0 @@ -<tool id="fimo" name="motif search"> - <description>using FIMO</description> - <command> fimo - #if $background_select.bg_select == "fromfile": - -bgfile $bgfile - #end if - - $norc --output-pthresh $pth --verbosity 1 $motif $database - && mv fimo_out/fimo.html ${html_outfile} - - && mv fimo_out/fimo.txt ${txt_outfile} - - && rm -rf fimo_out - - </command> - <inputs> - <conditional name="motif_selection"> - <param name="motif_sel" type="select" label="Motif Source"> - <option value="user" selected="true">Use motif file in Your History</option> - <option value="known" >Known motif</option> - </param> - ##<when value="known"> - ## <param name="motif" type="select" label="Select motif"> - ## <option value="/Users/xuebing/galaxy-dist/tool-data/motif-database/5primerSpliceSite" selected="true">mouse 5 primer splice site</option> - ## <option value="/Users/xuebing/galaxy-dist/tool-data/motif-database/5primerSpliceSite">mouse 3 primer splice site</option> - ## <option value="/Users/xuebing/galaxy-dist/tool-data/motif-database/TATA-Box.meme">TATA box</option> - ## </param> - ##</when> - <when value="user"> - <param name="motif" type="data" format="txt" label="Motif file" help="created using the tool create-motif-file, or import from Shared Data"/> - </when> - </conditional> - - <param name="database" type="data" format="fasta" label="Sequence file (FASTA)"/> - - <conditional name="background_select"> - <param name="bg_select" type="select" label="Background model" > - <option value="uniform" selected="true">uniform</option> - <option value="fromfile">load from file</option> - </param> - <when value="fromfile"> - <param name="bgfile" type="data" format="txt" label="File for background model"/> - </when> - </conditional> - - <param name="pth" size="10" type="float" value="0.01" label="p-value threshold"/> - <param name="norc" label="Do not score the reverse complement DNA strand. Both strands are scored by default" type="boolean" truevalue="-norc" falsevalue="" checked="False"/> - </inputs> - <outputs> - <data format="html" name="html_outfile" label="${tool.name} on ${on_string} (html)"/> - <data format="txt" name="txt_outfile" label="${tool.name} on ${on_string} (txt)"/> - </outputs> - <help> - -**What it does** - -This tool uses FIMO to find matches of a motif in a fasta file. See more details: - -http://meme.sdsc.edu/meme/fimo-intro.html - - - </help> -</tool>
--- a/fimo2.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,47 +0,0 @@ -<tool id="fimo" name="motif search"> - <description>using FIMO</description> - <command> fimo - #if $background_select.bg_select == "fromfile": - -bgfile $bgfile - #end if - - $norc --max-stored-scores 5000000 --output-pthresh $pth --verbosity 1 $motif $database - && mv fimo_out/fimo.html ${html_outfile} - - && mv fimo_out/fimo.txt ${txt_outfile} - - && rm -rf fimo_out - - </command> - <inputs> - - <param name="motif" type="data" format="txt" label="Motif file" help="created using the tool create-motif-file, or import from Shared Data"/> - <param name="database" type="data" format="fasta" label="Sequence file (FASTA)"/> - - <conditional name="background_select"> - <param name="bg_select" type="select" label="Background model" > - <option value="uniform" selected="true">uniform</option> - <option value="fromfile">load from file</option> - </param> - <when value="fromfile"> - <param name="bgfile" type="data" format="txt" label="File for background model"/> - </when> - </conditional> - - <param name="pth" size="10" type="float" value="0.0001" label="p-value threshold"/> - <param name="norc" label="Do not score the reverse complement DNA strand. Both strands are scored by default" type="boolean" truevalue="-norc" falsevalue="" checked="False"/> - </inputs> - <outputs> - <data format="html" name="html_outfile" label="${tool.name} on ${on_string} (html)"/> - <data format="txt" name="txt_outfile" label="${tool.name} on ${on_string} (txt)"/> - </outputs> - <help> - -**What it does** - -This tool uses FIMO to find matches of a motif in a fasta file. See more details: - -http://meme.sdsc.edu/meme/fimo-intro.html - - </help> -</tool>
--- a/fimo2bed.py Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,46 +0,0 @@ -''' -#pattern name sequence name start stop score p-value q-value matched sequence -constitutive-donor mm9_chr1_39533592_39535592_- 1815 1823 12.032 4.26e-06 0.397 CAGGTAAGT -constitutive-donor mm9_chr1_59313750_59315750_+ 1889 1897 12.032 4.26e-06 0.397 CAGGTAAGT - -#pattern name sequence name start stop score p-value q-value matched sequence -constitutive-donor mm9_chr1_172019075_172021075_- 1947 1955 12.032 4.26e-06 0.843 CAGGTAAGT -constitutive-donor mm9_chr1_15300532_15302532_+ 156 164 12.032 4.26e-06 0.843 CAGGTAAGT -''' - -import sys - -def fimo2bed(filename,rc): - ''' - parse fimo output to make a bed file - rc: the sequence have been reverse complemented - ''' - f = open(filename) - header = f.readline() - for line in f: - pattern,posi,begin,stop,score,pv,qv,seq = line.strip().split('\t') - flds = posi.split('_') - start = flds[-3] - end = flds[-2] - strand = flds[-1] - chrom = '_'.join(flds[1:-3]) #'chrX_random' - if not rc: - if strand == '+': - start1 = str(int(start) + int(begin)-1) - end1 = str(int(start) + int(stop)) - print '\t'.join([chrom,start1,end1,seq,score,strand]) - else: - start1 = str(int(end) - int(stop)) - end1 = str(int(end) - int(begin)+1) - print '\t'.join([chrom,start1,end1,seq,score,strand]) - else: - if strand == '-': - start1 = str(int(start) + int(begin)-1) - end1 = str(int(start) + int(stop)) - print '\t'.join([chrom,start1,end1,seq,score,'+']) - else: - start1 = str(int(end) - int(stop)) - end1 = str(int(end) - int(begin)+1) - print '\t'.join([chrom,start1,end1,seq,score,'-']) - -fimo2bed(sys.argv[1],sys.argv[2]=='rc')
--- a/fimo2bed.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,19 +0,0 @@ -<tool id="fimo2bed" name="fimo-to-bed"> - <description>convert FIMO output to BED</description> - <command interpreter="python">fimo2bed.py $input $rc > $output</command> - <inputs> - <param name="input" format="txt" type="data" label="FIMO output file"/> - <param name="rc" label="Check if the sequences are reverse complement" type="boolean" truevalue="rc" falsevalue="none" checked="False"/> - </inputs> - <outputs> - <data format="bed" name="output" /> - </outputs> - <help> - - Only works if your original FIMO input fasta sequences have ids like:: - - mm9_chr15_99358448_99360448_- - - - </help> -</tool>
--- a/genomeView.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,108 +0,0 @@ -<tool id="genomeview" name="whole genome"> - <description>plot and correlation</description> - <command>cat $script_file | R --vanilla --slave 2> err.log </command> - <inputs> - <param name="genome" type="select" label="Select genome"> - <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/mouse.mm9.genome" selected="true">mm9</option> - <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/mouse.mm8.genome">mm8</option> - <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/human.hg18.genome">hg18</option> - <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/human.hg19.genome">hg19</option> - </param> - <param name="resolution" type="integer" label="resolution" value="5000" help="resolution in bps. It must be between 200 and 10,000,000"> - <validator type="in_range" max="1000000000" min="200" message="Resolution is out of range, Resolution has to be between 200 to 100000000" /> - </param> - <param name="log" label="plot the log" type="boolean" truevalue="log" falsevalue="none" checked="False"/> - <param name="union" label="compute correlation in union regions" help="ignore regions covered by neither interval sets. Recommended for sparse data under high resolution when most regions are empty" type="boolean" truevalue="union" falsevalue="none" checked="False"/> - <repeat name="series" title="input file"> - <param name="label" type="text" value="" size="30" label="Data Label"/> - <param name="input" type="data" format="interval" label="Dataset"/> - </repeat> - </inputs> - - <configfiles> - <configfile name="script_file"> - ## Setup R error handling to go to stderr - options(warn=-1) - source("/Users/xuebing/galaxy-dist/tools/mytools/genomeview.r") - genome = read.table( "${genome}") - uselog = as.character("${log}") - union = as.character("${union}") - resolution = as.integer("${resolution}") - cat('resolution=',resolution,'\n') - offset = caloffset(genome) - mcov = matrix(ncol=1,nrow=as.integer(offset[length(offset)] / resolution)) - ## Open output PDF file - pdf( "${out_file1}" ,height=4,width=20) - labels = character(0) - ## Determine range of all series in the plot - #for $i, $s in enumerate( $series ) - x = read.table( "${s.input.file_name}" ) - res = coverage(x,genome,offset,resolution) - plotcov(res,genome,offset,"${s.label.value}",uselog) - labels = c(labels,"${s.label.value}") - attach(res) - mcov = cbind(mcov,cov) - detach(res) - #end for - dev.off() - pdf("${out_file2}") - mcov = mcov[,-1] - nSample = length(labels) - if (nSample > 1) { - if (union == 'union') { - cm = matrix(0,nrow=nSample,ncol=nSample) - for (i in 1:(nSample-1)) { - cm[i,i] = 1 - for (j in (i+1):nSample){ - cm[i,j] = union_correlation(mcov[,i],mcov[,j]) - cm[j,i] = cm[i,j] - } - } - cm[nSample,nSample] = 1 - } else { - cm = cor(mcov) - } - rm(mcov) - ##heatmap(-cm,margins=c(8,8),sym=T,scale='none',labRow=labels,labCol=labels) - ##heatmap2(cm,'none',TRUE,c(8,8),labels,labels) - x = cm - h = heatmap(-x,scale='none',sym=T,margins=c(8,8),labRow=labels,labRol=labels) - attach(h) - x = x[rowInd,colInd] - tx = numeric(0) - ty = numeric(0) - txt = character(0) - for (i in 1:nrow(x)){ - for (j in 1:ncol(x)){ - tx = c(tx,i) - ty = c(ty,ncol(x)-j+1) - txt = c(txt,round(x[i,j]*100)/100) - } - } - heatmap(-x,scale='none',sym=T,margins=c(8,8),labRow=labels[rowInd],labCol=labels[colInd],add.expr=text(tx,ty,txt,col='black')) - library(gplots) - heatmap.2(cm,margins=c(8,8),scale='none',key=TRUE,trace='none', symkey=T,symbreaks=T,col=bluered,labRow=labels,labCol=labels,symm=T) - } - dev.off() - </configfile> - </configfiles> - - <outputs> - <data format="pdf" name="out_file1" label="${tool.name} on ${on_string}: (plot)" /> - <data format="pdf" name="out_file2" label="${tool.name} on ${on_string}: (correlation)" /> - </outputs> - -<help> -.. class:: infomark - -This tool allows you to plot multiple intervals across all chromosomes at different resolution, and it also plots the correlation matrix if multiple intervals are provided. - ------ - -**Example** - -.. image:: ./static/images/correlationmatrix.png -.. image:: ./static/images/wholegenome.png - -</help> -</tool>
--- a/genomeview.r Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,70 +0,0 @@ - -caloffset = function(genome){ - total_len = sum(as.numeric(genome[,2])) - offset = 0 - for (i in 1:nrow(genome)) { - offset = c(offset,offset[i]+genome[i,2]) - } - offset -} - -coverage = function(intervals,genome,offset,resolution) { - - nChr = length(offset) - 1 - total_len = offset[nChr+1] - nbin = as.integer(total_len / resolution) - cov = numeric(nbin)#coverage - col = numeric(nbin)#color - for (i in 1:nChr) { - d = x[x[,1]==as.character(genome[i,1]),2:3] - d = ceiling(((d[,1]+d[,2])/2+offset[i])*nbin/total_len) - t = table(d) - pos = as.numeric(row.names(t)) - cov[pos] = cov[pos] + as.numeric(t) - col[pos] = i - } - list(nbin=nbin,cov=cov,col=col) -} - -# plot coverage -# res = genomeView(x,genome,100000) -plotcov = function(res,genome,offset,title,uselog) { - if (uselog == 'log'){ - res$cov = log10(res$cov+1) - } - ymax = max(res$cov) - #print(ymax) - par(mar=c(5,5,5,1)) - plot(seq(length(res$cov)),res$cov,type='h',cex=0.1,cex.axis=2,cex.lab=2,cex.main=3,col=res$col,xaxt='n',main=title,xlab='chromosome',ylab='coverage',frame.plot=F,ylim=c(0,ymax)) - xticks = numeric(nrow(genome)) - for (i in 1:nrow(genome)){ - xticks[i] = (offset[i]+offset[i+1])/2*res$nbin/offset[length(offset)] - } - mtext(genome[,1],side=1,at=xticks,adj=1,las=2,col=seq(nrow(genome))) -} - -union_correlation = function(x,y){ - z = x>0 | y>0 - cor(x[z],y[z]) -} - - -heatmap2 = function(x,scale,sym,margins,labRow,labCol){ - h = heatmap(x,scale=scale,sym=sym,margins=margins,labRow=labRow,labCol=labCol) - x = x[h$rowInd,h$colInd] - tx = numeric(0) - ty = numeric(0) - txt = character(0) - for (i in 1:nrow(x)){ - for (j in 1:ncol(x)){ - tx <- c(tx,i) - ty <- c(ty,ncol(x)-j+1) - txt <- c(txt,format(x[i,j],digits=2,nsmall=2)) - } - } - #heatmap(x,scale=scale,sym=sym,margins=margins,labRow=labRow[h$rowInd],labCol=labCol[h$colInd],add.expr=text(1:4,1:4,1:4)) - cat(dim(tx)) - text(tx,ty,txt) - heatmap(x,scale=scale,sym=sym,margins=margins,labRow=labRow[h$rowInd],labCol=labCol[h$colInd],add.expr=text(tx,ty,txt)) - -}
--- a/getGenomicScore.py Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,79 +0,0 @@ - -import random,string,os,sys - - -def getScore(intvfile,outfile,summary_type,bwfilepath,nbin,strand,outplot,span): - f = open(intvfile) - tmpsh = "".join(random.sample(string.letters+string.digits, 8)) - tmpout = "".join(random.sample(string.letters+string.digits, 8)) - tmp = open(tmpsh,'w') - if os.path.isdir(bwfilepath): - for line in f: - flds = line.strip().split('\t') - cmd = 'bigWigSummary -type='+summary_type+' '+bwfilepath+'/'+flds[0]+'.bw '+flds[0]+' '+flds[1]+' '+flds[2]+' '+nbin+' >> '+tmpout+' 2>>'+tmpout+'\n' - tmp.write(cmd) - else: - for line in f: - flds = line.strip().split('\t') - cmd = 'bigWigSummary -type='+summary_type+' '+bwfilepath+' '+flds[0]+' '+flds[1]+' '+flds[2]+' '+nbin+' >> '+tmpout+' 2>>'+tmpout+'\n' - tmp.write(cmd) - f.close() - # remove blank lines - tmp.write("sed '/^$/d' "+tmpout+'>'+tmpout+".1\n") - tmp.write("sed '/^Can/d' "+tmpout+".1 >"+tmpout+".2\n") - # set n/a to 0 - tmp.write("sed 's/n\/a/0/g' "+tmpout+".2 >"+tmpout+".3\n") - # replace text with 0 - zeros = ''.join(['0\t']*int(nbin)) - tmp.write("sed 's/^[a-zA-Z]/"+zeros+"/' "+tmpout+".3 >"+tmpout+".4\n") - # cut the first nbin columns - tmp.write("cut -f 1-"+nbin+" "+tmpout+".4 > "+tmpout+".5\n") - tmp.write("paste "+intvfile+" "+tmpout+".5 >"+outfile+"\n") - tmp.close() - os.system('chmod +x '+tmpsh) - os.system('./'+tmpsh) - #os.system('rm '+tmpout+'*') - #os.system('rm '+tmpsh) - - # strandness: need to reverse bins for - strand - if nbin > 1 and strand > 0: - strand = strand - 1 # python is 0 based - os.system('mv '+outfile+' '+tmpout) - f = open(tmpout) - out = open(outfile,'w') - for line in f: - flds=line.strip().split('\t') - if flds[strand] == '+': - out.write(line) - else: - scores = flds[-int(nbin):] - scores.reverse() - flds = flds[:-int(nbin)]+scores - out.write('\t'.join(flds)+'\n') - os.system('rm '+tmpout) - f.close() - out.close() - # plot - if int(nbin) > 1: - rscript = open(tmpsh,"w") - rscript.write("options(warn=-1)\n") - rscript.write("x <- read.table('"+outfile+"',sep='\t')\n") - rscript.write("x <- x[,(ncol(x)+1-"+nbin+"):ncol(x)]\n") - rscript.write("pdf('"+outplot+"')\n") - rscript.write("avg <- apply(x,2,mean)\n") - rscript.write("err <- apply(x,2,sd)/sqrt(nrow(x))\n") - rscript.write("ylim=c(min(avg-err),max(avg+err))\n") - rscript.write("xticks <- seq(ncol(x))-(1+ncol(x))/2\n") - if span >= 0.1: - rscript.write("avg = loess(avg~xticks,span="+str(span)+")$fitted\n") - rscript.write("err = loess(err~xticks,span="+str(span)+")$fitted\n") - rscript.write("par(cex=1.5)\n") - rscript.write("plot(xticks,avg,ylab='average conservation score',xlab='relative position (bin)',type='l',lwd=0,ylim=ylim)\n") - rscript.write("polygon(c(xticks,rev(xticks)),c(avg+err,rev(avg-err)),col='slateblue1',border=NA)\n") - rscript.write("lines(xticks,avg,type='l',lwd=1)\n") - rscript.write("dev.off()\n") - rscript.close() - os.system("R --vanilla < "+tmpsh) - os.system("rm "+tmpsh) - -getScore(sys.argv[1],sys.argv[2],sys.argv[3],sys.argv[4],sys.argv[5],int(sys.argv[6]),sys.argv[7],float(sys.argv[8]))
--- a/headtail.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,30 +0,0 @@ -<tool id="headtail" name="head-or-tail"> - <description>of a file</description> - <command>$headortail -n $nline $input > $out_file1 </command> - <inputs> - <param name="input" format="txt" type="data" label="Original file"/> - <param name="nline" size="10" type="integer" value="10" label="Number of lines to output"/> - <param name="headortail" type="select" label="Head or Tail"> - <option value="head" selected="true">head</option> - <option value="tail">tail</option> - </param> - </inputs> - <outputs> - <data format="input" name="out_file1" /> - </outputs> - <tests> - <test> - <output name="out_file1" file="testmap.head"/> - <param name="input" value="test.map" ftype="TXT"/> - <param name="n" value="10"/> - <param name="headortail" value="head" /> - </test> - </tests> - <help> - -**What it does** - -This is a wrapper of the unix head/tail command, which is used to show lines at the beginning or at the end of a file. - - </help> -</tool>
--- a/intersectSig.py Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,90 +0,0 @@ -''' -find overlap and test signifiance -''' - -import os,sys - -def lineCount(filename): - if os.stat(filename).st_size == 0: - return 0 - with open(filename) as f: - for i, l in enumerate(f): - pass - print i - return i+1 - -def intersect(fileA,fileB,outfile,fraction,reciprocal): - # return fileA intervals that overlap with interval in fileB - cmd = 'intersectBed -a '+fileA+' -b '+fileB + ' -u -wa -f '+fraction +' '+ reciprocal + '>'+outfile - #print cmd - os.system(cmd) - -def shuffle(fileA,fileB,genomefile,fraction,reciprocal,N): - # shuffle fileA N times, return the distribution of overlaps - nOverlap = [] - for i in range(N): - # shuffle fileA using shuffleBed - #cmd = 'shuffleBed -i '+fileA+' -g '+genomefile +'>fileA.shuffled' - # using random_interval.py - cmd = 'python /Users/xuebing/galaxy-dist/tools/mytools/random_interval.py '+fileA+' fileA.shuffled across '+genomefile - os.system(cmd) - intersect('fileA.shuffled',fileB,'tmp',fraction,reciprocal) - nOverlap.append(lineCount('tmp')) - os.system('rm tmp') - os.system('rm fileA.shuffled') - return nOverlap - -def main(): - fileA = sys.argv[1] - fileB = sys.argv[2] - outfile = sys.argv[3] - outplot = sys.argv[4] - outshuffle = sys.argv[5] - N = int(sys.argv[6]) # times to shuffle - genomefile = sys.argv[7] - fraction = sys.argv[8] - if len(sys.argv) == 10: - reciprocal = sys.argv[9] # can only be '-r' - else: - reciprocal = '' - - #print sys.argv - - # number of lines in input - nA = lineCount(fileA) - nB = lineCount(fileB) - - # intersect on real data - intersect(fileA,fileB,outfile,fraction,reciprocal) - # number of overlaps - nOverlapReal = lineCount(outfile) - - #print 'number of intervals in inputA that overlap with intervals in inputB:',nOverlapReal - - # shuffle fileA to estimate background - nOverlapNull = shuffle(fileA,fileB,genomefile,fraction,reciprocal,N) - out = open(outshuffle,'w') - out.write("\t".join(map(str,nOverlapNull))) - out.close() - - # plot histogram - rscript = open('tmp.r','w') - rscript.write("options(warn=-1)\n") - rscript.write("x0 <- "+str(nOverlapReal)+"\n") - rscript.write("x <- c("+','.join(map(str,nOverlapNull))+")\n") - rscript.write("library(MASS)\n") - rscript.write("pv <- min((1+sum(x>=x0))/length(x),(1+sum(x<=x0))/length(x))\n") - rscript.write("title <- paste('actual:chance = ',x0,':',format(mean(x),digits=1,nsmall=1),' = ',format(x0/mean(x),digits=1,nsmall=2),', p-value < ',pv,sep='')\n") - rscript.write("pdf('"+outplot+"')\n") - rscript.write("library(grid)\n") - rscript.write("library(VennDiagram)\n") - rscript.write("venn <- venn.diagram(x=list(A=1:"+str(nA)+",B="+str(nA-nOverlapReal+1)+":"+str(nA+nB-nOverlapReal)+"),filename=NULL,fill=c('red','blue'),col='transparent',alpha=0.5,label.col='black',cex=3,lwd=0,fontfamily='serif',fontface='bold',cat.col = c('red', 'blue'),cat.cex=3,cat.fontfamily='serif',cat.fontface='bold')\n") - rscript.write("grid.draw(venn)\n") - rscript.write("h <- hist(x,breaks=50,xlab='number of overlaps',ylab='frequency',main=title)\n") - rscript.write("plot(h$mids,h$counts,type='h',xlim=c(min(h$mids,x0),max(x0,h$mids)),ylim=c(0,max(h$counts)),xlab='number of overlaps',ylab='frequency',main=title)\n") - rscript.write("points(x0,0,col='red')\n") - rscript.write("dev.off()\n") - rscript.close() - os.system("R --vanilla < tmp.r") - os.system('rm tmp.r') -main()
--- a/intersectSig.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,30 +0,0 @@ -<tool id="intersectsig" name="test overlap"> - <description>of two interval lists</description> - <command interpreter="python"> intersectSig.py $fileA $fileB $outfile $outplot $outshuffle $n $genome $fraction $reciprocal </command> - <inputs> - <param name="fileA" type="data" format="interval" label="Return intervals in file A" /> - <param name="fileB" type="data" format="interval" label="that overlap with intervals in file B" /> - <param name="genome" type="select" label="Select genome"> - <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/mouse.mm9.genome" selected="true">mm9</option> - <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/mouse.mm8.genome">mm8</option> - <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/human.hg18.genome">hg18</option> - <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/human.hg19.genome">hg19</option> - </param> - <param name="fraction" size="10" type="float" value="1e-9" label="Minimum overlap required as a fraction of interval in file A" help="Default is 1E-9 (i.e., 1bp)."/> - <param name="reciprocal" label="Require that the fraction overlap be reciprocal for A and B" type="boolean" truevalue="-r" falsevalue="" checked="False"/> - <param name="n" size="10" type="integer" value="100" label="Number of permutations to run" help="File A is shuffled this number of times and the number of random overlaps is used to estimate the null distribution and compute the p value"/> -</inputs> - <outputs> - <data format="interval" name="outfile" label="${tool.name} on ${on_string}:overlap"/> - <data format="txt" name="outshuffle" label="${tool.name} on ${on_string}:null"/> - <data format="pdf" name="outplot" label="${tool.name} on ${on_string}:plot"/> - </outputs> - <help> - -**What it does** - -This tool uses intersectBed to find intervals in the first dataset that overlap with intervals in the second dataset. To estimate the significance of the overlap, the first dataset is shuffled then intersect with the second dataset to generate a null distribution of the number of overlaps. The tool returns venn diagram plot, histogram of the null distribution, overlapped intervals from the first input, and the null distribution of overlaps. - - </help> -</tool> -
--- a/intersectbed.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,103 +0,0 @@ -<tool id="intersectbed" name="intersectBed"> - <description>intersect two interval sets</description> - <command> intersectBed -a $inputa -b $inputb $output_opt $strandness $r -f $f $split > $output_data - </command> - <inputs> - <param name="inputa" type="data" format="interval,bam,bed,gff,vcf" label="Input A (-a)"/> - <param name="inputb" type="data" format="interval,bam,bed,gff,vcf" label="Input B (-b)"/> - <param name="output_opt" type="select" label="Output style" > - <option value="-wa" selected="true"> -wa: entry in A that overlaps B</option> - <option value="-wb" > -wb: entry in B that overlaps A</option> - <option value="-wo" > -wo: A,B, and num bases overlap </option> - <option value="-wao" > -wao: A,B, and num bases overlap </option> - <option value="-u" > -u: A only </option> - <option value="-c" > -c: A, num B features overlap </option> - <option value="-v" > -v: A without overlap </option> - </param> - - <param name="f" size="10" type="float" value="1E-9" label="Minimum overlap required as a fraction of A"/> - - <param name="strandness" type="select" label="Strand requirement" > - <option value="" selected="true"> none </option> - <option value="-s" > -s: require overlap on the same strand</option> - <option value="-S" > -S: require overlap on the opposite strand </option> - </param> - - <param name="r" label="Require that the fraction overlap be reciprocal for A and B (-r)." type="boolean" truevalue="-r" falsevalue="" checked="False"/> - <param name="split" label="Treat'split' BAM or BED12 entries as distinct BED intervals (-split)." type="boolean" truevalue="-split" falsevalue="" checked="False"/></inputs> - <outputs> - <data format="bed" name="output_data"/> - </outputs> - <help> - -**What it does** - -This is a wrapper for intersecBed. - - Program: intersectBed (v2.13.3) - Author: Aaron Quinlan (aaronquinlan@gmail.com) - Summary: Report overlaps between two feature files. - -Usage:: - - intersectBed [OPTIONS] -a (bed/gff/vcf) -b (bed/gff/vcf) - -Options:: - -abam The A input file is in BAM format. Output will be BAM as well. - - -ubam Write uncompressed BAM output. Default is to write compressed BAM. - - -bed When using BAM input (-abam), write output as BED. The default - is to write output in BAM when using -abam. - - -wa Write the original entry in A for each overlap. - - -wb Write the original entry in B for each overlap. - - Useful for knowing _what_ A overlaps. Restricted by -f and -r. - - -wo Write the original A and B entries plus the number of base - pairs of overlap between the two features. - - Overlaps restricted by -f and -r. - Only A features with overlap are reported. - - -wao Write the original A and B entries plus the number of base - pairs of overlap between the two features. - - Overlapping features restricted by -f and -r. - However, A features w/o overlap are also reported - with a NULL B feature and overlap = 0. - - -u Write the original A entry _once_ if _any_ overlaps found in B. - - In other words, just report the fact >=1 hit was found. - - Overlaps restricted by -f and -r. - - -c For each entry in A, report the number of overlaps with B. - - Reports 0 for A entries that have no overlap with B. - - Overlaps restricted by -f and -r. - - -v Only report those entries in A that have _no overlaps_ with B. - - Similar to "grep -v" (an homage). - - -f Minimum overlap required as a fraction of A. - - Default is 1E-9 (i.e., 1bp). - - FLOAT (e.g. 0.50) - - -r Require that the fraction overlap be reciprocal for A and B. - - In other words, if -f is 0.90 and -r is used, this requires - that B overlap 90% of A and A _also_ overlaps 90% of B. - - -s Require same strandedness. That is, only report hits in B that - overlap A on the _same_ strand. - - By default, overlaps are reported without respect to strand. - - -S Require different strandedness. That is, only report hits in B that - overlap A on the _opposite_ strand. - - By default, overlaps are reported without respect to strand. - - -split Treat "split" BAM or BED12 entries as distinct BED intervals. - - -sorted Use the "chromsweep" algorithm for sorted (-k1,1 -k2,2n) input - NOTE: this will trust, but not enforce that data is sorted. Caveat emptor. - - </help> -</tool> -
--- a/intervalOverlap.py Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,82 +0,0 @@ -''' -find overlap and test signifiance -''' - -import os,sys - -def lineCount(filename): - i = 0 - with open(filename) as f: - for i, l in enumerate(f): - pass - return i + 1 - -def intersect(fileA,fileB,outfile,fraction,reciprocal): - # return fileA intervals that overlap with interval in fileB - cmd = 'intersectBed -a '+fileA+' -b '+fileB + ' --wo -f '+fraction +' '+ reciprocal + '>'+outfile - #print cmd - os.system(cmd) - -def parseIntersect(filename): - # get number of overlapped A and B - nA = 0 - nB = 0 - return nA,nb - -def shuffle(fileA,fileB,genomefile,fraction,reciprocal,N): - # shuffle fileA N times, return the distribution of overlaps - nOverlap = [] - for i in range(N): - # shuffle fileA using shuffleBed - #cmd = 'shuffleBed -i '+fileA+' -g '+genomefile +'>fileA.shuffled' - # using random_interval.py - cmd = 'python /Users/xuebing/galaxy-dist/tools/mytools/random_interval.py '+fileA+' fileA.shuffled across '+genomefile - os.system(cmd) - intersect('fileA.shuffled',fileB,'tmp',fraction,reciprocal) - nOverlap.append(lineCount('tmp')) - os.system('rm tmp') - os.system('rm fileA.shuffled') - return nOverlap - -def main(): - fileA = sys.argv[1] - fileB = sys.argv[2] - outfile = sys.argv[3] - outplot = sys.argv[4] - N = int(sys.argv[5]) # times to shuffle - genomefile = sys.argv[6] - fraction = sys.argv[7] - if len(sys.argv) == 9: - reciprocal = sys.argv[8] # can only be '-r' - else: - reciprocal = '' - - print sys.argv - - # intersect on real data - intersect(fileA,fileB,outfile,fraction,reciprocal) - # number of overlaps - nOverlapReal = lineCount(outfile) - - print 'number of intervals in inputA that overlap with intervals in inputB:',nOverlapReal - - # shuffle fileA to estimate background - nOverlapNull = shuffle(fileA,fileB,genomefile,fraction,reciprocal,N) - - # plot histogram - rscript = open('tmp.r','w') - rscript.write("x0 <- "+str(nOverlapReal)+"\n") - rscript.write("x <- c("+','.join(map(str,nOverlapNull))+")\n") - rscript.write("library(MASS)\n") - rscript.write("\n") - rscript.write("pv <- min((1+sum(x>=x0))/length(x),(1+sum(x<=x0))/length(x))\n") - rscript.write("title <- paste('actual:chance = ',x0,':',round(mean(x)),' = ',format(x0/mean(x),digits=1,nsmall=2),', p-value < ',pv,sep='')\n") - rscript.write("pdf('"+outplot+"')\n") - rscript.write("h <- hist(x,breaks=50,xlab='number of overlaps',ylab='frequency',main=title)\n") - rscript.write("plot(h$mids,h$counts,type='h',xlim=c(min(h$mids,x0),max(x0,h$mids)),ylim=c(0,max(h$counts)),xlab='number of overlaps',ylab='frequency',main=title)\n") - rscript.write("points(x0,0,col='red')\n") - rscript.write("dev.off()") - rscript.close() - os.system("R --vanilla < tmp.r") - os.system('rm tmp.r') -main()
--- a/intervalSize.py Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,18 +0,0 @@ -''' -plot histogram of interval size -''' - -import os,sys - -inputfile = sys.argv[1] -outputfile = sys.argv[2] - -rf = open('tmp.r','w') -rf.write("x <- read.table('"+inputfile+"')\n") -rf.write("len <- x[,3]-x[,2]\n") -rf.write("pdf('"+outputfile+"')\n") -rf.write("hist(len,breaks=100,xlab='interval size',main=paste('mean=',mean(len),sep=''))\n") -rf.write("dev.off()") -rf.close() -os.system("R --vanilla < tmp.r") -os.system('rm tmp.r')
--- a/intervalSize.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,17 +0,0 @@ -<tool id="intervalsize" name="interval size"> - <description>distribution</description> - <command interpreter="python">intervalSize.py $input $output</command> - <inputs> - <param name="input" format="txt" type="data" label="Plot the size distribution of the following file"/> - </inputs> - <outputs> - <data format="pdf" name="output" /> - </outputs> - <help> - -**What it does** - -This tool generates a histogram of the interval size. - - </help> -</tool>
--- a/iupac2meme.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,59 +0,0 @@ -<tool id="iupac2meme" name="create-motif-file"> - <description>from one sequence</description> - <command>iupac2meme - #if $background_select.bg_select == "fromfile": - -bg $bgfile - #end if - -numseqs $numseqs $logodds $motif > $output </command> - <inputs> - <param name="motif" size="20" type="text" value="AATAAA" label="motif sequence" help="IUPAC motif, such as ACGGWNYCGT"/> - <conditional name="background_select"> - <param name="bg_select" type="select" label="Background model" > - <option value="uniform" selected="true">uniform</option> - <option value="fromfile">load from file</option> - </param> - <when value="fromfile"> - <param name="bgfile" type="data" format="txt" label="File for background model"/> - </when> - </conditional> - - <param name="numseqs" size="10" type="integer" value="20" label="Number of sequences (-numseqs)" help="assume frequencies based on this many sequences; default: 20"/> - <param name="logodds" label="also output log-odds (PSSM)" help="output the log-odds (PSSM) and frequency (PSPM) motifs; default: PSPM motif only" type="boolean" truevalue="-logodds" falsevalue="" checked="False"/> - </inputs> - <outputs> - <data format="txt" name="output" label="$motif-meme"/> - </outputs> - <help> - -**Description** - -Convert an IUPAC motif into a MEME version 4 formatted file suitable for use with FIMO and other MEME Suite programs. - -See additional information: - -http://meme.sdsc.edu/meme/doc/iupac2meme.html - -**IUPAC code**:: - - Nucleotide Code: Base: - ---------------- ----- - A.................Adenine - C.................Cytosine - G.................Guanine - T (or U)..........Thymine (or Uracil) - R.................A or G - Y.................C or T - S.................G or C - W.................A or T - K.................G or T - M.................A or C - B.................C or G or T - D.................A or G or T - H.................A or C or T - V.................A or C or G - N.................any base - - - - </help> -</tool>
--- a/makebigwig.sh Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,57 +0,0 @@ -# use of output: move to public_html and visualize in ucsc genome browser with the following: -# track name="xxx" color=0,0,255 type=bigWig bigDataUrl=http://rous.mit.edu/~wuxbl/xxx.bw - -if [ $# -lt 6 ] -then - echo "./bigwig.sh infile outtag bam/bed sorted/none genome strand/none [-split]" - exit -fi - -f=$1 -outf=$2 -extension=$3 -sorted=$4 -genome=$5 -strand=$6 -split=$7 -i=i -if [ $extension = bam ] -then - i=ibam - if [ $sorted != sorted ] - then - echo 'sorting bam file...=>' $f.sorted.bam - samtools sort $f $f.sorted - f=$f.sorted.bam - fi -else - if [ $sorted != sorted ] - then - echo 'sorting bed file...=>' $f.sorted.bed - sort -k1,1 $f > $f.sorted.bed - f=$f.sorted.bed - fi -fi - - echo 'making bedgraph file...=>' $f.bedgraph - if [ $strand != strand ] - then - genomeCoverageBed -bg -$i $f -g $genome $split > $f.bedgraph - echo 'making bigwig file...=>' $outf.bw - bedGraphToBigWig $f.bedgraph $genome $outf - else - genomeCoverageBed -bg -$i $f -g $genome $split -strand + > $f+.bedgraph - genomeCoverageBed -bg -$i $f -g $genome $split -strand - > $f-.bedgraph - echo 'making bigwig file for + strand...=>' $outf+.bw - bedGraphToBigWig $f+.bedgraph $genome $outf+.bw - echo 'making bigwig file for - strand...=>' $outf-.bw - bedGraphToBigWig $f-.bedgraph $genome $outf-.bw - fi - -# remove intermediate files -if [ $sorted != sorted ] - then - rm $f -fi -rm $f*.bedgraph -
--- a/makebigwig.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,37 +0,0 @@ -<tool id="makebigwig" name="make bigwig"> - <description>from BED or BAM</description> - <command interpreter="sh"> makebigwig.sh $input $outfile - #if $inputa_format.bedorbam == "bed": - bed - #else: - bam - #end if - $sorted $genome none $split >$log 2> $log </command> - <inputs> - <conditional name="inputa_format"> - <param name="bedorbam" type="select" label="Select input format" > - <option value="bed" selected="true">BED</option> - <option value="bam"> BAM</option> - </param> - <when value="bed"> - <param name="input" type="data" format="bed" label="Input file"/> - </when> - <when value="bam"> - <param name="input" type="data" format="bam" label="Input file"/> - </when> - </conditional> - <param name="genome" type="select" label="Select genome"> - <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/mouse.mm9.genome" selected="true">mm9</option> - <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/mouse.mm8.genome">mm8</option> - <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/human.hg18.genome">hg18</option> - <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/human.hg19.genome">hg19</option> - </param> - <param name="sorted" label="Check if the input is sorted" type="boolean" truevalue="sorted" falsevalue="none" checked="False"/> - <param name="split" label="Split junctions" help="Treat 'split' BAM or BED12 entries as distinct BED intervals." type="boolean" truevalue="-split" falsevalue="" checked="False"/> - </inputs> - <outputs> - <data format="txt" name="log" label="makebigwig LOG" /> - <data format="bigwig" name="outfile" /> - - </outputs> -</tool>
--- a/makewindow.py Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,19 +0,0 @@ -def makeWindow(filename,outfile,window): - window = window/2 - f=open(filename) - out = open(outfile,'w') - for line in f: - flds = line.strip().split() - #new position - center = (int(flds[1]) + int(flds[2]))/2 - start = center - window - end = center + window - if start >= 0: - flds[1] = str(start) - flds[2] = str(end) - out.write('\t'.join(flds)+'\n') - f.close() - out.close() - -import sys -makeWindow(sys.argv[1],sys.argv[2],int(sys.argv[3]))
--- a/makewindow.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,18 +0,0 @@ -<tool id="makewindow" name="make-window"> - <description>around interval center </description> - <command interpreter="python"> makewindow.py $input $output $window </command> - <inputs> - <param name="input" type="data" format="interval" label="Input interval file"/> - <param name="window" type="integer" value="1000" label="window size (bps)" /> - </inputs> - <outputs> - <data format="input" name="output" /> - </outputs> - <help> - -**Description** - -For each interval in the input file, take the middle point, then extend each side windowsize/2 bps. - - </help> -</tool>
--- a/meme.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,349 +0,0 @@ -<tool id="meme_meme" name="MEME" version="1.0.0"> - <requirements><requirement type='package'>meme</requirement></requirements> - <description>motif discovery</description> - <command>meme "$input1" -o "${html_outfile.files_path}" - -nostatus - - ##-p 8 ##number of processors - - #if str( $options_type.options_type_selector ) == 'advanced': - -sf "${ str( $options_type.sf ).replace( ' ', '_' ) }" - -${options_type.alphabet_type.alphabet_type_selector} - -mod "${options_type.mod_type.mod_type_selector}" - -nmotifs "${options_type.nmotifs}" - -wnsites "${options_type.wnsites}" - -maxsize "${options_type.maxsize}" - - #if $options_type.evt < float('inf'): - -evt "${options_type.evt}" - #end if - - #if str( $options_type.mod_type.mod_type_selector ) != 'oops': - #if str( $options_type.mod_type.motif_occurrence_type.motif_occurrence_type_selector ) == 'nsites': - -nsites "${options_type.mod_type.motif_occurrence_type.nsites}" - #elif str( $options_type.mod_type.motif_occurrence_type.motif_occurrence_type_selector ) == 'min_max_sites': - -minsites "${options_type.mod_type.motif_occurrence_type.minsites}" -maxsites "${options_type.mod_type.motif_occurrence_type.maxsites}" - #end if - #end if - - #if str( $options_type.motif_width_type.motif_width_type_selector ) == 'exact': - -w "${options_type.motif_width_type.width}" - #else - -minw "${options_type.motif_width_type.minw}" -maxw "${options_type.motif_width_type.maxw}" - #end if - - #if str( $options_type.motif_trim_type.motif_trim_type_selector ) == 'nomatrim': - -nomatrim - #else - -wg "${options_type.motif_trim_type.wg}" -ws "${options_type.motif_trim_type.ws}" ${options_type.motif_trim_type.noendgaps} - #end if - - #if str( $options_type.bfile ) != 'None': - -bfile "${options_type.bfile}" - #end if - - #if str( $options_type.pspfile ) != 'None': - -psp "${options_type.pspfile}" - #end if - - #if str( $options_type.alphabet_type.alphabet_type_selector ) == "dna": - ${options_type.alphabet_type.revcomp} ${options_type.alphabet_type.pal} - #end if - - -maxiter "${options_type.maxiter}" -distance "${options_type.distance}" - - -prior "${options_type.alphabet_type.prior_type.prior_type_selector}" - #if str( $options_type.alphabet_type.prior_type.prior_type_selector ) != 'addone': - -b "${options_type.alphabet_type.prior_type.prior_b}" - #if str( $options_type.alphabet_type.prior_type.plib ) != 'None': - -plib "${options_type.alphabet_type.prior_type.plib}" - #end if - #end if - - #if str( $options_type.alphabet_type.spmap_type.spmap_type_selector ) == 'cons': - -cons "${options_type.alphabet_type.spmap_type.cons}" - #else - -spmap "${options_type.alphabet_type.spmap_type.spmap_type_selector}" - -spfuzz "${options_type.alphabet_type.spmap_type.spfuzz}" - #end if - - #if str( $options_type.branching_type.branching_type_selector ) == 'x_branch': - -x_branch -bfactor "${options_type.branching_type.bfactor}" -heapsize "${options_type.branching_type.heapsize}" - #end if - - ##-maxsize "1000000" ##remove hardcoded maxsize? should increase number of processors instead - - #end if - - 2>&1 || echo "Error running MEME." - - - && mv ${html_outfile.files_path}/meme.html ${html_outfile} - - && mv ${html_outfile.files_path}/meme.txt ${txt_outfile} - - && mv ${html_outfile.files_path}/meme.xml ${xml_outfile} - - </command> - <inputs> - <param format="fasta" name="input1" type="data" label="Sequences"/> - - <conditional name="options_type"> - <param name="options_type_selector" type="select" label="Options Configuration"> - <option value="basic" selected="true">Basic</option> - <option value="advanced">Advanced</option> - </param> - <when value="basic"> - <!-- do nothing here --> - </when> - <when value="advanced"> - - <param name="sf" type="text" value="Galaxy FASTA Input" label="Name of sequence set" /> - - <conditional name="alphabet_type"> - <param name="alphabet_type_selector" type="select" label="Sequence Alphabet"> - <option value="protein">Protein</option> - <option value="dna" selected="true">DNA</option> - </param> - <when value="protein"> - <conditional name="prior_type"> - <param name="prior_type_selector" type="select" label="Choice of prior"> - <option value="dirichlet">simple Dirichlet prior</option> - <option value="dmix" selected="true">mixture of Dirichlets prior</option> - <option value="mega">extremely low variance dmix</option> - <option value="megap">mega for all but last iteration of EM; dmix on last iteration</option> - <option value="addone">add +1 to each observed count</option> - </param> - <when value="dirichlet"> - <param name="prior_b" type="float" value="0.01" label="strength of prior on model parameters" /> - <param name="plib" type="data" format="txt" optional="True" label="Dirichlet prior file" /> - </when> - <when value="dmix"> - <param name="prior_b" type="float" value="0" label="strength of prior on model parameters" /> - <param name="plib" type="data" format="txt" optional="True" label="Dirichlet prior file" /> - </when> - <when value="mega"> - <param name="prior_b" type="float" value="0" label="strength of prior on model parameters" /> - <param name="plib" type="data" format="txt" optional="True" label="Dirichlet prior file" /> - </when> - <when value="megap"> - <param name="prior_b" type="float" value="0" label="strength of prior on model parameters" /> - <param name="plib" type="data" format="txt" optional="True" label="Dirichlet prior file" /> - </when> - <when value="addone"> - <!-- no values here? --> - </when> - </conditional> - <conditional name="spmap_type"> - <param name="spmap_type_selector" type="select" label="EM starting points"> - <option value="uni">uni</option> - <option value="pam" selected="true">pam</option> - <option value="cons">Use starting point from string</option> - </param> - <when value="uni"> - <param name="spfuzz" type="float" value="0.5" label="Fuzziness of the mapping" /> - </when> - <when value="pam"> - <param name="spfuzz" type="integer" value="120" label="Fuzziness of the mapping" /> - </when> - <when value="cons"> - <param name="cons" type="text" value="" label="Starting point from string" /> - </when> - </conditional> - </when> - <when value="dna"> - <param name="revcomp" label="Check reverse complement" type="boolean" truevalue="-revcomp" falsevalue="" checked="False"/> - <param name="pal" label="Check for palindromes" type="boolean" truevalue="-pal" falsevalue="" checked="False"/> - <conditional name="prior_type"> - <param name="prior_type_selector" type="select" label="Sequence Alphabet"> - <option value="dirichlet" selected="true">simple Dirichlet prior</option> - <option value="dmix">mixture of Dirichlets prior</option> - <option value="mega">extremely low variance dmix</option> - <option value="megap">mega for all but last iteration of EM; dmix on last iteration</option> - <option value="addone">add +1 to each observed count</option> - </param> - <when value="dirichlet"> - <param name="prior_b" type="float" value="0.01" label="strength of prior on model parameters" /> - <param name="plib" type="data" format="txt" optional="True" label="Dirichlet prior file" /> - </when> - <when value="dmix"> - <param name="prior_b" type="float" value="0" label="strength of prior on model parameters" /> - <param name="plib" type="data" format="txt" optional="True" label="Dirichlet prior file" /> - </when> - <when value="mega"> - <param name="prior_b" type="float" value="0" label="strength of prior on model parameters" /> - <param name="plib" type="data" format="txt" optional="True" label="Dirichlet prior file" /> - </when> - <when value="megap"> - <param name="prior_b" type="float" value="0" label="strength of prior on model parameters" /> - <param name="plib" type="data" format="txt" optional="True" label="Dirichlet prior file" /> - </when> - <when value="addone"> - <!-- no values here? --> - </when> - </conditional> - <conditional name="spmap_type"> - <param name="spmap_type_selector" type="select" label="EM starting points"> - <option value="uni" selected="true">uni</option> - <option value="pam">pam</option> - <option value="cons">Use starting point from string</option> - </param> - <when value="uni"> - <param name="spfuzz" type="float" value="0.5" label="Fuzziness of the mapping" /> - </when> - <when value="pam"> - <param name="spfuzz" type="integer" value="120" label="Fuzziness of the mapping" /> - </when> - <when value="cons"> - <param name="cons" type="text" value="" label="Starting point from string" /> - </when> - </conditional> - </when> - </conditional> - - <param name="nmotifs" type="integer" value="1" label="Number of different motifs to search" /> - <param name="maxsize" type="integer" value="1000000" label="Max number of characters in the sequence file"/> - <param name="evt" type="float" value="inf" label="E-value to stop looking for motifs" /> - <conditional name="mod_type"> - <param name="mod_type_selector" type="select" label="Expected motif distribution"> - <option value="oops">One Occurrence Per Sequence</option> - <option value="zoops" selected="true">Zero or One Occurrence Per Sequence</option> - <option value="anr">Any Number of Repetitions</option> - </param> - <when value="oops"> - <!-- no values here --> - </when> - <when value="zoops"> - <conditional name="motif_occurrence_type"> - <param name="motif_occurrence_type_selector" type="select" label="Number of motif occurrences"> - <option value="default" selected="true">Use defaults</option> - <option value="nsites">nsites</option> - <option value="min_max_sites">min and max sites</option> - </param> - <when value="default"> - <!-- no values here --> - </when> - <when value="nsites"> - <param name="nsites" type="integer" value="1" label="Search nsites number of occurrences" /> - </when> - <when value="min_max_sites"> - <param name="minsites" type="integer" value="1" label="minsites" /> - <param name="maxsites" type="integer" value="50" label="maxsites" /> - </when> - </conditional> - </when> - <when value="anr"> - <conditional name="motif_occurrence_type"> - <param name="motif_occurrence_type_selector" type="select" label="Number of motif occurrences"> - <option value="default" selected="true">Use defaults</option> - <option value="nsites">nsites</option> - <option value="min_max_sites">min and max sites</option> - </param> - <when value="default"> - <!-- no values here --> - </when> - <when value="nsites"> - <param name="nsites" type="integer" value="1" label="Search nsites number of occurrences" /> - </when> - <when value="min_max_sites"> - <param name="minsites" type="integer" value="1" label="minsites" /> - <param name="maxsites" type="integer" value="50" label="maxsites" /> - </when> - </conditional> - </when> - </conditional> - <param name="wnsites" type="float" value="0.8" label="Weight on the prior on nsites" /> - - <conditional name="motif_width_type"> - <param name="motif_width_type_selector" type="select" label="Motif width type"> - <option value="exact">Exact width</option> - <option value="range" selected="true">Specify a range</option> - </param> - <when value="exact"> - <param name="width" type="integer" value="10" label="Width of motif to search" /> - </when> - <when value="range"> - <param name="minw" type="integer" value="8" label="Min width of motif to search" /> - <param name="maxw" type="integer" value="50" label="Max width of motif to search" /> - </when> - </conditional> - - <conditional name="motif_trim_type"> - <param name="motif_trim_type_selector" type="select" label="Motif trim type"> - <option value="nomatrim">No motif trim</option> - <option value="trim" selected="true">Trim motif</option> - </param> - <when value="nomatrim"> - <!-- no values here --> - </when> - <when value="trim"> - <param name="wg" type="integer" value="11" label="Gap cost" /> - <param name="ws" type="integer" value="1" label="Space cost" /> - <param name="noendgaps" label="Do not penalize endgaps" type="boolean" truevalue="-noendgaps" falsevalue="" checked="False"/> - </when> - </conditional> - - <param name="bfile" type="data" format="txt" optional="True" label="Background Model" /> - <param name="pspfile" type="data" format="txt" optional="True" label="Position-Specific Prior" /> - - <param name="maxiter" type="integer" value="50" label="Number of iterations of EM to run" /> - <param name="distance" type="float" value="0.001" label="Convergence criterion" /> - - <conditional name="branching_type"> - <param name="branching_type_selector" type="select" label="x-branching type"> - <option value="x_branch">Perform x-branching</option> - <option value="no_x_branch" selected="true">No x-branching</option> - </param> - <when value="no_x_branch"> - <!-- no values here --> - </when> - <when value="x_branch"> - <param name="bfactor" type="integer" value="3" label="Number of iterations of branching" /> - <param name="heapsize" type="integer" value="64" label="Maximum number of heaps to use" /> - </when> - </conditional> - - </when> - </conditional> - - <param name="non_commercial_use" label="I certify that I am not using this tool for commercial purposes." type="boolean" truevalue="NON_COMMERCIAL_USE" falsevalue="COMMERCIAL_USE" checked="False"> - <validator type="expression" message="This tool is only available for non-commercial use.">value == True</validator> - </param> - - </inputs> - <outputs> - <data format="html" name="html_outfile" label="${tool.name} on ${on_string} (html)"/> - <data format="txt" name="txt_outfile" label="${tool.name} on ${on_string} (text)"/> - <data format="memexml" name="xml_outfile" label="${tool.name} on ${on_string} (xml)"/> - </outputs> - <tests> - <test> - <param name="input1" value="meme/meme/meme_input_1.fasta" ftype="fasta" dbkey="hg19"/> - <param name="options_type_selector" value="basic"/> - <param name="non_commercial_use" value="True"/> - <output name="html_outfile" file="meme/meme/meme_output_html_1.html" lines_diff="12"/> - <output name="txt_outfile" file="meme/meme/meme_output_txt_1.txt" lines_diff="12"/> - <output name="xml_outfile" file="meme/meme/meme_output_xml_1.xml" lines_diff="8"/> - </test> - </tests> - <help> - -.. class:: warningmark - -**WARNING: This tool is only available for non-commercial use. Use for educational, research and non-profit purposes is permitted. Before using, be sure to review, agree, and comply with the license.** - -If you want to specify sequence weights, you must include them at the top of your input FASTA file. - -.. class:: infomark - -**To cite MEME:** -Timothy L. Bailey and Charles Elkan, "Fitting a mixture model by expectation maximization to discover motifs in biopolymers", Proceedings of the Second International Conference on Intelligent Systems for Molecular Biology, pp. 28-36, AAAI Press, Menlo Park, California, 1994. - - -For detailed information on MEME, click here_. To view the license_. - -.. _here: http://meme.nbcr.net/meme/meme-intro.html -.. _license: http://meme.nbcr.net/meme/COPYRIGHT.html - - </help> -</tool>
--- a/memelogo.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,26 +0,0 @@ -<tool id="memelogo" name="motif logo"> - <description>of MEME motif</description> - <command>ceqlogo -i $input -o tmp.eps -t $title -x '' - && ps2pdf -dEPSCrop tmp.eps $output - </command> - <inputs> - <param name="input" type="data" format="txt" label="MEME motif file"/> - <param name="title" type='text' size="50" label="Title" value="motif1"/> - </inputs> - <outputs> - <data format="pdf" name="output" /> - </outputs> - <help> - -**Description** - -Generate sequence logo for MEME motif file. See details here: - -http://meme.sdsc.edu/meme/doc/ceqlogo.html - -**Example output** - -.. image:: ./static/images/memelogo.png - - </help> -</tool>
--- a/metaintv.py Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,109 +0,0 @@ -''' -get binned score of intervals,allow extension -''' - -import os,sys,numpy - -from resize import * - -from bx.bbi.bigwig_file import BigWigFile - -def binning(x,n): - # make n bin of x - y = numpy.zeros(n,dtype=float) - if len(x) == 0: - return y - step = float(len(x))/n - for k in range(n): - i = int(step*k) - j = int(step*(k+1)) + 1 - y[k] = x[i:j].mean() - #print i,j,k,y[k] - return y - -def getBinnedScore(bwfile,intvfile,nbin): - ''' - get binned average and std - ''' - fbw = open(bwfile) - bw = BigWigFile(file=fbw) - fin = open(intvfile) - avg = numpy.zeros(nbin) - sqr = numpy.zeros(nbin) - N = 0 - for line in fin: - #chrom,start,end,name,score,strand - flds = line.strip().split('\t') - #get the score at base resolution as an array - scores = bw.get_as_array(flds[0],int(flds[1]),int(flds[2])) - if scores == None: - print 'not found:\t',line - continue - N = N + 1 - #print line,scores - # reverse if on minus strand - if flds[5] == '-': - scores = scores[::-1] - # no score = 0 - scores = numpy.nan_to_num(scores) - # bin the data - binned = binning(scores,nbin) - avg = avg + binned - sqr = sqr + binned**2 - # compute avg and std - avg = avg / N - err = ((sqr/N-avg**2)**0.5)/(N**0.5) - return avg,err - -def getExtendedBinScore(bwfile,intvfile,nbins,exts): - ''' - nbins: n1,n2,n3 - exts: l1,l2,l3,l4 - ''' - # make left extension - resize(intvfile,intvfile+'.tmp','start-'+str(exts[0]),'start+'+str(exts[1]),'stranded') - # compute binned average - avg,err = getBinnedScore(bwfile,intvfile+'.tmp',nbins[0]) - # make center region - resize(intvfile,intvfile+'.tmp','start+'+str(exts[1]),'end-'+str(exts[2]),'stranded') - # compute binned average - avg1,err1 = getBinnedScore(bwfile,intvfile+'.tmp',nbins[1]) - avg = numpy.concatenate((avg,avg1)) - err = numpy.concatenate((err,err1)) - # make right region - resize(intvfile,intvfile+'.tmp','end-'+str(exts[2]),'end+'+str(exts[3]),'stranded') - # compute binned average - avg2,err2 = getBinnedScore(bwfile,intvfile+'.tmp',nbins[2]) - avg = numpy.concatenate((avg,avg2)) - err = numpy.concatenate((err,err2)) - - return avg,err - -print sys.argv -prog,bwfile,intvfile,nbin,outfile,outplot = sys.argv -avg, err = getBinnedScore(bwfile,intvfile,int(nbin)) -out = open(outfile,'w') -numpy.savetxt(out, avg, fmt='%.6f', delimiter=' ', newline=' ') -out.write('\n') -numpy.savetxt(out, err, fmt='%.6f', delimiter=' ', newline=' ') -out.write('\n') -out.close() - -# plot -rscript = open("tmp.r","w") -rscript.write("options(warn=-1)\n") -rscript.write("x <- read.table('"+outfile+"')\n") -rscript.write("pdf('"+outplot+"')\n") -rscript.write("avg <- x[1,]\n") -rscript.write("err <- x[2,]\n") -rscript.write("print(x)\n") -rscript.write("ylim=c(min(avg-err),max(avg+err))\n") -rscript.write("xticks <- seq(ncol(x))\n") -rscript.write("plot(xticks,avg,xlab='',ylab='average coverage',type='l',lwd=0,ylim=ylim)\n") -rscript.write("polygon(c(xticks,rev(xticks)),c(avg+err,rev(avg-err)),col='lightgreen',border=NA)\n") -rscript.write("lines(xticks,avg,type='l',lwd=1)\n") -rscript.write("dev.off()\n") -rscript.close() -os.system("R --vanilla < tmp.r") -os.system("rm tmp.r") -
--- a/metaintv.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,36 +0,0 @@ -<tool id="metaintv" name="binned-average"> - <description>from bigwig</description> - <command interpreter="python">binnedAverage.py $bwfile $intvfile $nbin $outfile $outplot </command> - <inputs> - <param name="intvfile" format="bed" type="data" label="BED file (require strand in column 6)"/> - <param name="bwfile" format="bigwig" type="data" label="BigWig file"/> - <param name="nbin" type="integer" value="20" label="number of bins"/> - - </inputs> - <outputs> - <data format="tabular" name="outfile" label="${tool.name} on ${on_string}[data]"/> - <data format="pdf" name="outplot" label="${tool.name} on ${on_string}[plot]"/> - </outputs> - <help> - -.. class:: infomark - -Each interval is binned and the average base-resolution score/coverage/density in the bigwig file is added as new columns appended at the end of the original file . - -**Example** - -If your original data has the following format: - -+-----+-----+---+------+ -|chrom|start|end|other2| -+-----+-----+---+------+ - -and you choose to divide each interval into 3 bins and return the mean scores of each bin, your output will look like this: - -+-----+-----+---+------+-----+-----+-----+ -|chrom|start|end|other2|mean1|mean2|mean3| -+-----+-----+---+------+-----+-----+-----+ - - -</help> -</tool>
--- a/metaintv2.py Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,109 +0,0 @@ -''' -get binned score of intervals,allow extension -''' - -import os,sys,numpy - -from resize import * - -from bx.bbi.bigwig_file import BigWigFile - -def binning(x,n): - # make n bin of x - y = numpy.zeros(n,dtype=float) - if len(x) == 0: - return y - step = float(len(x))/n - for k in range(n): - i = int(step*k) - j = int(step*(k+1)) + 1 - y[k] = x[i:j].mean() - #print i,j,k,y[k] - return y - -def getBinnedScore(bwfile,intvfile,nbin): - ''' - get binned average and std - ''' - fbw = open(bwfile) - bw = BigWigFile(file=fbw) - fin = open(intvfile) - avg = numpy.zeros(nbin) - sqr = numpy.zeros(nbin) - N = 0 - for line in fin: - #chrom,start,end,name,score,strand - flds = line.strip().split('\t') - #get the score at base resolution as an array - scores = bw.get_as_array(flds[0],int(flds[1]),int(flds[2])) - if scores == None: - print 'not found:\t',line - continue - N = N + 1 - #print line,scores - # reverse if on minus strand - if flds[5] == '-': - scores = scores[::-1] - # no score = 0 - scores = numpy.nan_to_num(scores) - # bin the data - binned = binning(scores,nbin) - avg = avg + binned - sqr = sqr + binned**2 - # compute avg and std - avg = avg / N - err = ((sqr/N-avg**2)**0.5)/(N**0.5) - return avg,err - -def getExtendedBinScore(bwfile,intvfile,nbins,exts): - ''' - nbins: n1,n2,n3 - exts: l1,l2,l3,l4 - ''' - # make left extension - resize(intvfile,intvfile+'.tmp','start-'+str(exts[0]),'start+'+str(exts[1]),'stranded') - # compute binned average - avg,err = getBinnedScore(bwfile,intvfile+'.tmp',nbins[0]) - # make center region - resize(intvfile,intvfile+'.tmp','start+'+str(exts[1]),'end-'+str(exts[2]),'stranded') - # compute binned average - avg1,err1 = getBinnedScore(bwfile,intvfile+'.tmp',nbins[1]) - avg = numpy.concatenate((avg,avg1)) - err = numpy.concatenate((err,err1)) - # make right region - resize(intvfile,intvfile+'.tmp','end-'+str(exts[2]),'end+'+str(exts[3]),'stranded') - # compute binned average - avg2,err2 = getBinnedScore(bwfile,intvfile+'.tmp',nbins[2]) - avg = numpy.concatenate((avg,avg2)) - err = numpy.concatenate((err,err2)) - - return avg,err - -print sys.argv -bwfile,intvfile,exts,nbins,outfile,outplot = sys.argv -avg, err = getExtendedBinScore(bwfile,intvfile,numpy.fromstring(nbins,sep=','),numpy.fromstring(exts,sep=',')) -out = open(outfile,'w') -numpy.savetxt(out, avg, fmt='%.6f', delimiter=' ', newline=' ') -out.write('\n') -numpy.savetxt(out, err, fmt='%.6f', delimiter=' ', newline=' ') -out.write('\n') -out.close() - -# plot -rscript = open("tmp.r","w") -rscript.write("options(warn=-1)\n") -rscript.write("x <- read.table('"+outfile+"')\n") -rscript.write("pdf('"+outplot+"')\n") -rscript.write("avg <- x[1,]\n") -rscript.write("err <- x[2,]\n") -rscript.write("print(x)\n") -rscript.write("ylim=c(min(avg-err),max(avg+err))\n") -rscript.write("xticks <- seq(ncol(x))\n") -rscript.write("plot(xticks,avg,ylab='average coverage',type='l',lwd=0,ylim=ylim)\n") -rscript.write("polygon(c(xticks,rev(xticks)),c(avg+err,rev(avg-err)),col='lightgreen',border=NA)\n") -rscript.write("lines(xticks,avg,type='l',lwd=1)\n") -rscript.write("dev.off()\n") -rscript.close() -os.system("R --vanilla < tmp.r") -os.system("rm tmp.r") -
--- a/metaintv3.py Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,109 +0,0 @@ -''' -get binned score of intervals,allow extension -''' - -import os,sys,numpy - -from resize import * - -from bx.bbi.bigwig_file import BigWigFile - -def binning(x,n): - # make n bin of x - y = numpy.zeros(n,dtype=float) - if len(x) == 0: - return y - step = float(len(x))/n - for k in range(n): - i = int(step*k) - j = int(step*(k+1)) + 1 - y[k] = x[i:j].mean() - #print i,j,k,y[k] - return y - -def getBinnedScore(bwfile,intvfile,nbin): - ''' - get binned average and std - ''' - fbw = open(bwfile) - bw = BigWigFile(file=fbw) - fin = open(intvfile) - avg = numpy.zeros(nbin) - sqr = numpy.zeros(nbin) - N = 0 - for line in fin: - #chrom,start,end,name,score,strand - flds = line.strip().split('\t') - #get the score at base resolution as an array - scores = bw.get_as_array(flds[0],int(flds[1]),int(flds[2])) - if scores == None: - print 'not found:\t',line - continue - N = N + 1 - #print line,scores - # reverse if on minus strand - if flds[5] == '-': - scores = scores[::-1] - # no score = 0 - scores = numpy.nan_to_num(scores) - # bin the data - binned = binning(scores,nbin) - avg = avg + binned - sqr = sqr + binned**2 - # compute avg and std - avg = avg / N - err = ((sqr/N-avg**2)**0.5)/(N**0.5) - return avg,err - -def getExtendedBinScore(bwfile,intvfile,nbins,exts): - ''' - nbins: n1,n2,n3 - exts: l1,l2,l3,l4 - ''' - # make left extension - resize(intvfile,intvfile+'.tmp','start-'+str(exts[0]),'start+'+str(exts[1]),'stranded') - # compute binned average - avg,err = getBinnedScore(bwfile,intvfile+'.tmp',nbins[0]) - # make center region - resize(intvfile,intvfile+'.tmp','start+'+str(exts[1]),'end-'+str(exts[2]),'stranded') - # compute binned average - avg1,err1 = getBinnedScore(bwfile,intvfile+'.tmp',nbins[1]) - avg = numpy.concatenate((avg,avg1)) - err = numpy.concatenate((err,err1)) - # make right region - resize(intvfile,intvfile+'.tmp','end-'+str(exts[2]),'end+'+str(exts[3]),'stranded') - # compute binned average - avg2,err2 = getBinnedScore(bwfile,intvfile+'.tmp',nbins[2]) - avg = numpy.concatenate((avg,avg2)) - err = numpy.concatenate((err,err2)) - - return avg,err - -print sys.argv -bwfile,intvfile,exts,nbins,outfile,outplot = sys.argv -avg, err = getExtendedBinScore(bwfile,intvfile,numpy.fromstring(nbins,sep=','),numpy.fromstring(exts,sep=',')) -out = open(outfile,'w') -numpy.savetxt(out, avg, fmt='%.6f', delimiter=' ', newline=' ') -out.write('\n') -numpy.savetxt(out, err, fmt='%.6f', delimiter=' ', newline=' ') -out.write('\n') -out.close() - -# plot -rscript = open("tmp.r","w") -rscript.write("options(warn=-1)\n") -rscript.write("x <- read.table('"+outfile+"')\n") -rscript.write("pdf('"+outplot+"')\n") -rscript.write("avg <- x[1,]\n") -rscript.write("err <- x[2,]\n") -rscript.write("print(x)\n") -rscript.write("ylim=c(min(avg-err),max(avg+err))\n") -rscript.write("xticks <- seq(ncol(x))\n") -rscript.write("plot(xticks,avg,ylab='average coverage',type='l',lwd=0,ylim=ylim)\n") -rscript.write("polygon(c(xticks,rev(xticks)),c(avg+err,rev(avg-err)),col='lightgreen',border=NA)\n") -rscript.write("lines(xticks,avg,type='l',lwd=1)\n") -rscript.write("dev.off()\n") -rscript.close() -os.system("R --vanilla < tmp.r") -os.system("rm tmp.r") -
--- a/metaintv_ext.py Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,128 +0,0 @@ -''' -get binned score of intervals,allow extension -''' - -import os,sys,numpy -import string, random - -from resize import * - -from bx.bbi.bigwig_file import BigWigFile - -def binning(x,n): - # make n bin of x - y = numpy.zeros(n,dtype=float) - if len(x) == 0: - return y - step = float(len(x))/n - for k in range(n): - i = int(step*k) - j = int(step*(k+1)) + 1 - y[k] = x[i:j].mean() - #print i,j,k,y[k] - return y - -def getBinnedScore(bwfile,intvfile,nbin): - ''' - get binned average and std - ''' - fbw = open(bwfile) - bw = BigWigFile(file=fbw) - fin = open(intvfile) - avg = numpy.zeros(nbin) - sqr = numpy.zeros(nbin) - N = 0 - for line in fin: - #print N - #chrom,start,end,name,score,strand - flds = line.strip().split('\t') - #get the score at base resolution as an array - scores = bw.get_as_array(flds[0],int(flds[1]),int(flds[2])) - if scores == None: - print 'not found:\t',N,line - continue - N = N + 1 - #print line,scores - # reverse if on minus strand - if flds[5] == '-': - scores = scores[::-1] - # no score = 0 - scores = numpy.nan_to_num(scores) - # bin the data - binned = binning(scores,nbin) - avg = avg + binned - sqr = sqr + binned**2 - # compute avg and std - avg = avg / N - err = ((sqr/N-avg**2)**0.5)/(N**0.5) - return avg,err,N - -def getExtendedBinScore(bwfile,intvfile,nbins,exts): - ''' - nbins: n1,n2,n3 - exts: l1,l2,l3,l4 - ''' - avg = [] - err = [] - tmpfile = "".join(random.sample(string.letters+string.digits, 8)) - if exts[0]>0 or exts[1]>0: - print 'make left extension' - resize(intvfile,tmpfile,'start-'+str(exts[0]),'start+'+str(exts[1]),'stranded') - print 'compute binned average' - avg,err,N = getBinnedScore(bwfile,tmpfile,nbins[0]) - print 'regions used:',N - print 'make center region' - resize(intvfile,tmpfile,'start+'+str(exts[1]),'end-'+str(exts[2]),'stranded') - print 'compute binned average' - avg1,err1,N = getBinnedScore(bwfile,tmpfile,nbins[1]) - print 'regions used:',N - avg = numpy.concatenate((avg,avg1)) - err = numpy.concatenate((err,err1)) - if exts[2]>0 or exts[3]>0: - print 'make right region' - resize(intvfile,tmpfile,'end-'+str(exts[2]),'end+'+str(exts[3]),'stranded') - print 'compute binned average' - avg2,err2,N = getBinnedScore(bwfile,tmpfile,nbins[2]) - print 'regions used:',N - avg = numpy.concatenate((avg,avg2)) - err = numpy.concatenate((err,err2)) - os.system('rm '+tmpfile) - return avg,err - -prog,bwfile,intvfile,exts,nbins,outfile,outplot = sys.argv -nbins = numpy.fromstring(nbins,dtype=int,sep=',') -exts = numpy.fromstring(exts,dtype=int,sep=',') -avg, err = getExtendedBinScore(bwfile,intvfile,nbins,exts) -print 'save data' -out = open(outfile,'w') -numpy.savetxt(out, avg, fmt='%.6f', delimiter=' ', newline=' ') -out.write('\n') -numpy.savetxt(out, err, fmt='%.6f', delimiter=' ', newline=' ') -out.write('\n') -out.close() - -print 'plot' -start = exts[0]*nbins[0]/(exts[0]+exts[1]) -end = nbins[0]+nbins[1]+exts[2]*nbins[2]/(exts[2]+exts[3]) -#print start,end -rscript = open("tmp.r","w") -rscript.write("options(warn=-1)\n") -rscript.write("x <- read.table('"+outfile+"')\n") -rscript.write("pdf('"+outplot+"')\n") -rscript.write("avg <- x[1,]\n") -rscript.write("err <- x[2,]\n") -#rscript.write("print(x)\n") -rscript.write("ylim=c(min(avg-err),max(avg+err))\n") -rscript.write("xticks <- seq(ncol(x))\n") -#rscript.write("print(xticks)\n") -rscript.write("plot(xticks,avg,xlab='',ylab='average coverage',type='l',lwd=0,ylim=ylim,xaxt='n')\n") -rscript.write("axis(1, at=c(min(xticks),"+str(start)+","+str(end)+",max(xticks)),labels=c(-"+str(exts[0])+",0,0,"+str(exts[3])+"), las=2)\n") -rscript.write("polygon(c(xticks,rev(xticks)),c(avg+err,rev(avg-err)),col='lightgreen',border=NA)\n") -rscript.write("lines(xticks,avg,type='l',lwd=1)\n") -rscript.write("lines(c(min(xticks),max(xticks)),c(0,0),lwd=2)\n") -rscript.write("lines(c("+str(start)+","+str(end)+"),c(0,0),lwd=10)\n") -rscript.write("dev.off()\n") -rscript.close() -os.system("R --vanilla --slave < tmp.r") -os.system("rm tmp.r") -
--- a/metaintv_ext.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,23 +0,0 @@ -<tool id="metaintv_ext" name="aggregrate binned-average"> - <description>from bigwig (allow extension)</description> - <command interpreter="python">metaintv_ext.py $bwfile $intvfile $exts $nbins $outfile $outplot > $outlog </command> - <inputs> - <param name="intvfile" format="interval" type="data" label="Interval file"/> - <param name="bwfile" format="bigwig" type="data" label="BigWig file"/> - <param name="exts" type="text" size="80" value="100,50,50,100" label="extensions"/> - <param name="nbins" type="text" size="80" value="10,10,10" label="number of bins"/> - - </inputs> - <outputs> - <data format="txt" name="outlog" label="${tool.name} on ${on_string}[log]"/> - <data format="tabular" name="outfile" label="${tool.name} on ${on_string}[data]"/> - <data format="pdf" name="outplot" label="${tool.name} on ${on_string}[plot]"/> - </outputs> - <help> - -.. class:: infomark - -To be added - -</help> -</tool>
--- a/phastCons.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,48 +0,0 @@ -<tool id="getScore" name="conservation"> - <description>phastCons or phyloP,vertebrate30way</description> - <command interpreter="python">getGenomicScore.py $input $output $score_type $score_path $nbin $strand $outplot $span</command> - <inputs> - <param name="input" format="interval" type="data" label="Interval file"/> - <param name="score_path" type="select" label="Select score" > - <option value="/Users/xuebing/galaxy-dist/tool-data/genome/mm8/phastcons" >mm8-phastCons17way</option> - <option value="/Users/xuebing/galaxy-dist/tool-data/genome/mm9/phastcon" selected="true">mm9-phastCons30way-vertebrate</option> - <option value="/Users/xuebing/galaxy-dist/tool-data/genome/mm9/phyloP30way">mm9-phyloP30way-vertebrate</option> - <option value="/Users/xuebing/galaxy-dist/tool-data/genome/hg18/phastCons28wayPlacMam">hg18-phastCons28wayPlacMam</option> </param> - <param name="score_type" type="select" label="Select score summary type" > - <option value="mean" selected="true">mean</option> - <option value="max">maximum</option> - <option value="min">minimum</option> - <option value="std">standard deviation</option> - <option value="coverage">coverage:fraction covered</option> - </param> - <param name="nbin" type="integer" value="1" label="number of bins"/> - <param name="span" size="10" type="float" value="0.1" label="loess span: smoothing parameter" help="value less then 0.1 disables smoothing"/> - <param name="strand" type="integer" value="0" label="Specify the strand column" help="leave 0 to ignore strand information. Only matters if using more than 1 bin"/> - </inputs> - <outputs> - <data format="pdf" name="outplot" label="${tool.name} on ${on_string}[plot]"/> - <data format="interval" name="output" label="${tool.name} on ${on_string}[data]"/> - </outputs> - <help> - -.. class:: infomark - -The score for each interval is added as a new column appended at the end of the original file . - -**Example** - -If your original data has the following format: - -+-----+-----+---+------+ -|chrom|start|end|other2| -+-----+-----+---+------+ - -and you choose to return the mean of phastCons scores, your output will look like this: - -+-----+-----+---+------+----+ -|chrom|start|end|other2|mean| -+-----+-----+---+------+----+ - - -</help> -</tool>
--- a/plotmatrix.py Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,77 +0,0 @@ -import sys,os - -infile = sys.argv[1] -outfile = sys.argv[2] -uselog = sys.argv[3] -subset = sys.argv[4] -reorder = sys.argv[5] -color = sys.argv[6] -scale = sys.argv[7] # rescale each row -cols = sys.argv[8] -rscript = open('tmp.r','w') - -rscript.write("x <- read.table('"+infile+"')\n") -rscript.write("x <- x[,c("+cols+")]\n") -rscript.write("nr <- nrow(x) \n") -rscript.write("nc <- ncol(x)\n") -rscript.write("rowsum <- apply(x,1,sum)\n") - -if subset =='subset': - rscript.write("if (nr*nc > 100000) {\n") - rscript.write(" nr2 <- as.integer(100000/nc)\n") - rscript.write(" subind <- sample(seq(nr),nr2)\n") - rscript.write(" x <- x[subind,]\n") - rscript.write(" rowsum <- rowsum[subind]\n") - rscript.write("}\n") - -rscript.write("pdf('"+outfile+"')\n") - -if uselog == 'uselog': - rscript.write("x <- -(log(as.matrix(x,nc=nc)))\n") -else: - rscript.write("x <- -as.matrix(x,nc=nc)\n") -if scale == 'scale': - rscript.write("x <- scale(x)\n") -if reorder == 'average': - rscript.write("hc <- hclust(dist(x),method= 'average')\n") - rscript.write("x <- x[hc$order,]\n") -elif reorder == 'centroid': - rscript.write("hc <- hclust(dist(x),method= 'centroid')\n") - rscript.write("x <- x[hc$order,]\n") -elif reorder == 'complete': - rscript.write("hc <- hclust(dist(x),method= 'complete')\n") - rscript.write("x <- x[hc$order,]\n") -elif reorder == 'single': - rscript.write("hc <- hclust(dist(x),method= 'single')\n") - rscript.write("x <- x[hc$order,]\n") -elif reorder == 'median': - rscript.write("hc <- hclust(dist(x),method= 'median')\n") - rscript.write("x <- x[hc$order,]\n") -elif reorder == 'sort_by_total': - rscript.write("srt <- sort(rowsum,index.return=T)\n") - rscript.write("x <- x[srt$ix,]\n") -elif reorder == 'sort_by_center': - rscript.write("srt <- sort(x[,as.integer(nc/2)],index.return=T)\n") - rscript.write("x <- x[srt$ix,]\n") -if color == 'heat': - rscript.write("colormap = heat.colors(1000)\n") -elif color == 'topo': - rscript.write("colormap = topo.colors(1000)\n") -elif color == 'rainbow': - rscript.write("colormap = rainbow(1000)\n") -elif color == 'terrain': - rscript.write("colormap = terrain.colors(1000)\n") -else: - rscript.write("colormap = gray.colors(1000)\n") - -#rscript.write("qt <- quantile(as.vector(x),probs=c(0.1,0.9))\n") -#rscript.write("breaks <- c(min(as.vector(x)),seq(qt[1],qt[2],length.out=99),max(as.vector(x)))\n") -#rscript.write("image(t(x),col=colormap,breaks=breaks,axes=F)\n") -rscript.write("image(t(x),col=colormap,axes=F)\n") -rscript.write("dev.off()\n") - -rscript.close() - -os.system("R --slave < tmp.r") -os.system("rm tmp.r") -
--- a/plotmatrix.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,43 +0,0 @@ -<tool id="plotmatrix" name="matrix-visualization"> - <description>with sorting and clustering)</description> - <command interpreter="python"> plotmatrix.py $input $output $uselog $subset $reorder $color $scale $cols</command> - <inputs> - <param name="input" format="tabular" type="data" label="Data file"/> - <param name="cols" type="text" value="1,3,8:10" label="data columns" help="e.g.: column 1, 3, 8,9,10"/> - <param name="uselog" label="log transform the data" type="boolean" truevalue="uselog" falsevalue="none" checked="True"/> - <param name="subset" label="sample a subset if the data is too large" type="boolean" truevalue="subset" falsevalue="none" checked="True"/> - <param name="scale" label="normalize by row" type="boolean" truevalue="scale" falsevalue="none" checked="False"/> - <param name="reorder" type="select" label="reorder features (rows)"> - <option value="none" selected="true">None</option> - <option value="sort_by_sum">Sort row by sum</option> - <option value="sort_by_center">Sort row by center </option> - <option value="average">Cluster rows (average)</option> - <option value="median">Cluster rows (median) </option> - <option value="centroid">Cluster rows (centroid)</option> - <option value="complete">Cluster rows (complete)</option> - <option value="single">Cluster rows (single)</option> - </param> - - <param name="color" type="select" label="color scheme"> - <option value="heat" selected="true">heat</option> - <option value="gray">gray</option> - <option value="rainbow">rainbow</option> - <option value="topo">topo</option> - <option value="terrain">terrain</option> - </param> - </inputs> - <outputs> - <data format="pdf" name="output" /> - </outputs> - <help> - -**What it does** - -This tool generates a heatmap for output from 'align' tool. Each row is the color-coded coverage of a feature, and the features are sorted by the total coverage in the interval. - -**Example** - -.. image:: ./static/operation_icons/heatmap.png - - </help> -</tool>
--- a/r_wrapper.sh Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,2 +0,0 @@ -#!/bin/sh -R --vanilla $* < $infile
--- a/random_interval.py Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,96 +0,0 @@ -''' -simulate a random interval set that mimics the size and strand of a reference set -''' - -def inferSizeFromRefBed(filename): - ''' - read reference interval set, get chrom size information - ''' - chrSize = {} - f = open(filename) - for line in f: - flds = line.strip().split('\t') - if not chrSize.has_key(flds[0]): - chrSize[flds[0]] = int(flds[2]) - elif chrSize[flds[0]] < int(flds[2]): - chrSize[flds[0]] = int(flds[2]) - f.close() - return chrSize - -def getChrSize(filename): - chrSize = {} - f = open(filename) - for line in f: - flds = line.strip().split() - if len(flds) >1: - chrSize[flds[0]] = int(flds[1]) - f.close() - return chrSize - -def makeWeightedChrom(chrSize): - ''' - make a list of chr_id, the freq is proportional to its length - ''' - - genome_len = 0 - - for chrom in chrSize: - chrom_len = chrSize[chrom] - genome_len += chrom_len - - weighted_chrom = [] - for chrom in chrSize: - weight = int(round(1000*float(chrSize[chrom])/genome_len)) - weighted_chrom += [chrom]*weight - - return weighted_chrom - -def randomIntervalWithinChrom(infile,outfile,chrSize): - ''' - ''' - fin = open(infile) - fout = open(outfile,'w') - n = 0 - for line in fin: - n = n + 1 - flds = line.strip().split('\t') - interval_size = int(flds[2]) - int(flds[1]) - flds[1] = str(random.randint(0,chrSize[flds[0]]-interval_size)) - flds[2] = str(int(flds[1])+interval_size) - fout.write('\t'.join(flds)+'\n') - fin.close() - fout.close() - -def randomIntervalAcrossChrom(infile,outfile,chrSize,weighted_chrom): - ''' - ''' - fin = open(infile) - fout = open(outfile,'w') - n = 0 - for line in fin: - n = n + 1 - flds = line.strip().split('\t') - interval_size = int(flds[2]) - int(flds[1]) - # find a random chrom - flds[0] = weighted_chrom[random.randint(0, len(weighted_chrom) - 1)] - flds[1] = str(random.randint(0,chrSize[flds[0]]-interval_size)) - flds[2] = str(int(flds[1])+interval_size) - fout.write('\t'.join(flds)+'\n') - fin.close() - fout.close() - -import sys,random -def main(): - # python random_interval.py test100.bed testout.bed across human.hg18.genome - - reference_interval_file = sys.argv[1] - output_file = sys.argv[2] - across_or_within_chrom = sys.argv[3] # within or across - chrom_size_file = sys.argv[4] - chrSize = getChrSize(chrom_size_file) - print chrSize.keys() - if across_or_within_chrom == 'within': - randomIntervalWithinChrom(reference_interval_file,output_file,chrSize) - else: - randomIntervalAcrossChrom(reference_interval_file,output_file,chrSize,makeWeightedChrom(chrSize)) -main()
--- a/random_interval.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,45 +0,0 @@ -<tool id="randominterval" name="shuffle intervals"> - <description>weight chromosome by length</description> - <command interpreter="python">random_interval.py $input $output $within $genome </command> - <inputs> - <param name="input" format="interval" type="data" label="reference interval file to mimic"/> - <param name="within" label="randomize within chromosome" help="If checked, for each original interval will move it to a random position in the SAME chromosome. The default is to move it to any chromosome (chance proportional to chromosome size)" type="boolean" truevalue="within" falsevalue="across" checked="False"/> - <param name="genome" type="select" label="Select genome"> - <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/mouse.mm9.genome" selected="true">mm9</option> - <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/mouse.mm8.genome">mm8</option> - <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/human.hg18.genome">hg18</option> - <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/human.hg19.genome">hg19</option> - </param> - </inputs> - <outputs> - <data format="interval" name="output" /> - </outputs> - <help> - - -**What it does** - -This tool will generate a set of intervals randomly distributed in the genome, mimicking the size distribution of the reference set. The same number of intervals are generated. - - -**How it works** - -For each interval in the reference set, the script picks a random position as the new start in the genome, and then pick the end such that the size of the random interval is the same as the original one. The default setting is to move the interval to any chromosome, with the probability proportional to the size/length of the chromosome. You can have it pick a random position in the same chromosome, such that in the randomized set each chromosome has the same number of intervals as the reference set. The size of the chromosome can be either learned from the reference set (chromosome size = max(interval end)) or read from a chromosome size file. When learning from the reference set, only regions spanned by reference intervals are used to generate random intervals. Regions (may be an entire chromosome) not covered by the reference set will not appear in the output. - -**Chromosome size file** - -Chromosome size files for hg18,hg19,mm8,and mm9 can be found in 'Shared Data'. To use those files, select the correct one and import into to the history, then the file will be listed in the drop-down menu of this tool. You can also make your own chromosme size file: each line specifies the size of a chromosome (tab-delimited): - -chr1 92394392 - -chr2 232342342 - - -You can use the following script from UCSC genome browser to download chromosome size files for other genomes: - -http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/fetchChromSizes - - - </help> - -</tool>
--- a/removeDuplicate.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,10 +0,0 @@ -<tool id="removeDuplicate" name="remove duplicate"> - <description>lines</description> - <command> cat $input | sort | uniq > $output </command> - <inputs> - <param name="input" format="txt" type="data" label="Original file"/> - </inputs> - <outputs> - <data format="input" name="output" /> - </outputs> -</tool>
--- a/resize.py Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,57 +0,0 @@ -''' -change start and end of interval files -''' - -import sys - -def resize(infile,outfile,expr_start,expr_end,strand): - fin = open(infile) - fout = open(outfile,'w') - if expr_start[0:3] == 'end': - c1 = 2 - n1 = int(expr_start[3:]) - else: - c1 = 1 - n1 = int(expr_start[5:]) - if expr_end[0:3] == 'end': - c2 = 2 - n2 = int(expr_end[3:]) - else: - c2 = 1 - n2 = int(expr_end[5:]) - if strand == 'ignore': - for line in fin: - flds = line.strip().split('\t') - start = int(flds[c1]) + n1 - if start >= 0: - end = int(flds[c2]) + n2 - if end >= start: - flds[1] = str(start) - flds[2] = str(end) - fout.write('\t'.join(flds)+'\n') - else:# upstream downstream - for line in fin: - flds = line.strip().split('\t') - if flds[5] == '+': - start = int(flds[c1]) + n1 - if start >= 0: - end = int(flds[c2]) + n2 - if end >= start: - flds[1] = str(start) - flds[2] = str(end) - fout.write('\t'.join(flds)+'\n') - else: # on the - strand - start = int(flds[3-c2]) - n2 # end=end+n2 - if start >= 0: - end = int(flds[3-c1]) - n1 - if end >= start: - flds[1] = str(start) - flds[2] = str(end) - fout.write('\t'.join(flds)+'\n') - fin.close() - fout.close() - -if __name__ == "__main__": - resize(sys.argv[1],sys.argv[2],sys.argv[3],sys.argv[4],sys.argv[5]) - # python resize.py in.bed out.bed start-3 end+5 both - # python resize.py <input.bed> <output.bed> expr_start expr_end strand(both/+/-)
--- a/resize.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,32 +0,0 @@ -<tool id="resize" name="resize"> - <description>intervals</description> - <command interpreter="python">resize.py $infile $outfile $expr_start $expr_end $strand </command> - <inputs> - <param name="infile" format="interval" type="data" label="Original file"/> - <param name="expr_start" size="20" type="text" value="start-0" label="start=" help="e.g. start+10, start-10, end-100"/> - <param name="expr_end" size="20" type="text" value="end+0" label="end=" help="e.g. end-100, start+10"/> - <param name="strand" label="Enforce strandness" type="boolean" truevalue="strand" falsevalue="ignore" checked="False"/> - </inputs> - <outputs> - <data format="input" name="outfile" /> - </outputs> - <help> - -**What it does** - -This tool changes start and end of each row in an interval file. When strandness is enforced, chromosome start and end are treated as the 5' and 3' end for intervals on the '+' strand, and the opposite for intervals on the '-' strand. In the expression such as 'start=start-1000', 'start' and 'end' are interpreted as the 5' and 3' end, respectively, and the operator '+' and '-' means moving downsteam and upsteam, respectively. For example, when enforcing strandness, - -**start=start-1000**: extend 1000 bp on the 5' end (moving start upstream) - -**start=start+1000**: trancate 1000 bp on the 5' end (moving start downsteam) - -**end=end+1000**: extend 1000 bp on the 3' end (moving end downsteam) - -**end=start+1000**: moving the end to 1000 bp downsteam of the start (return the first 1000 bp on the 5' end) - -**end=start+1**: taking the 5' end of the interval - -**start=end-1**: taking the 3' end of the interval - - </help> -</tool>
--- a/revcompl.py Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,84 +0,0 @@ -import sys - -compldna = {'A':'T', - 'C':'G', - 'G':'C', - 'T':'A', - 'U':'A', - 'M':'K', - 'K':'M', - 'W':'W', - 'S':'S', - 'R':'Y', - 'Y':'R', - 'N':'N'} -complrna = {'A':'U', - 'C':'G', - 'G':'C', - 'T':'A', - 'U':'A', - 'M':'K', - 'K':'M', - 'W':'W', - 'S':'S', - 'R':'Y', - 'Y':'R', - 'N':'N'} -def complement(seq,compl): - complseq = [compl[base] for base in seq] - return complseq - -def reverse_complement(seq,compl): - seq = list(seq) - seq.reverse() - return ''.join(complement(seq,compl)) - -def readFastaFile(infile,outfile,compl): - - fin = open(infile) - out = open(outfile,'w') - - currSeq='' - currSeqname=None - for line in fin: - if '>' in line: - if currSeqname !=None: - out.write(currSeqname+reverse_complement(currSeq,compl)+'\n') - currSeqname=None - currSeq='' - currSeqname=line - else: - currSeq=currSeq+line.strip().upper() - - if currSeqname!=None: - out.write(currSeqname+reverse_complement(currSeq,compl)+'\n') - - fin.close() - out.close() - -def readrawseq(infile,outfile,compl): - ''' - each line is a sequence - ''' - fin = open(infile) - out = open(outfile,'w') - for line in fin: - out.write(reverse_complement(line.strip().upper(),compl)+'\n') - fin.close() - out.close() - -def main(): - seqfile = sys.argv[1] - outfile = sys.argv[2] - fasta = sys.argv[3] - rna = sys.argv[4] - if rna == 'rna': - compl = complrna - else: - compl = compldna - if fasta == 'fasta': - readFastaFile(seqfile,outfile,compl) - else: - readrawseq(seqfile,outfile,compl) - -main()
--- a/revcompl.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,41 +0,0 @@ -<tool id="revcompl" name="reverse complement"> - <description>of DNA/RNA sequences</description> - <command interpreter="python">revcompl.py $input $output $fasta $rna </command> - <inputs> - <param name="input" format="txt" type="data" label="Original sequence file"/> - <param name="fasta" label="Check if input is fasta format" type="boolean" truevalue="fasta" falsevalue="txt" checked="False"/> - <param name="rna" label="Check if need to output as RNA sequences" type="boolean" truevalue="rna" falsevalue="dna" checked="False"/> - </inputs> - <outputs> - <data format="input" name="output" /> - </outputs> - <help> - -**What it does** - -This tool outputs reverse complementary of DNA/RNA sequences in the input file. The input can be fasta format or raw sequences (each line is a sequence). - -Degenerate nucleotides are supported. Here is the match table: - -A to T/U - -C to G - -G to C - -T/U to A - -M to K - -W to W - -S to S - -R to Y - -Y to R - -N to N - - </help> -</tool>
--- a/sampline.py Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,133 +0,0 @@ -#!/usr/bin/env python - -""" -Sampling random records from a file. Each record is defined by a fixed number of lines. - -Usage: sampline.py [options] - -Options: - -h, --help show this help message and exit - -r, --replacement Sampling with replacement - -i INPUT, --input=INPUT - Input file - -o OUTPUT, --output=OUTPUT - Output file - -k NSAMPLE, --nSample=NSAMPLE - (required) number of records to be sampled/output - -m RECSIZE, --recSize=RECSIZE - (default=1) number of lines spanned by each record - -n NSKIP, --nSkip=NSKIP - (default=0) number of comment lines to skip at the - beginning - -example: - python sampline.py -i test10000.fastq -o out.txt --nSample=5 --recSize=4 --nSkip=0 --replacement -""" - -import optparse, string, random,sys,math,itertools - -assert sys.version_info[:2] >= ( 2, 4 ) - -def main(): - - # Parse command line - parser = optparse.OptionParser( usage="%prog [options] " ) - parser.add_option( "-r", "--replacement", action="store_true", dest="replacement",default=False, - help="Sampling with replacement" ) - parser.add_option( "-i", "--input", dest="input", default=None, - help="Input file" ) - parser.add_option( "-o", "--output", dest="output", default=None, - help="Output file" ) - parser.add_option("-k","--nSample", type='int',dest="nSample",default=None, - help="(required) number of records to be sampled/output" ) - parser.add_option("-m","--recSize", type='int',dest="recSize",default=1, - help="(default=1) number of lines spanned by each record" ) - parser.add_option("-n","--nSkip", type='int',dest="nSkip",default=0, - help="(default=0) number of comment lines to skip at the beginning") - options, args = parser.parse_args() - #assert options.region in ( 'coding', 'utr3', 'utr5', 'transcribed' ), "Invalid region argument" - - sampline(options.input,options.output,options.nSample,options.recSize,options.nSkip,options.replacement) - -def sample_wr(population, k): - "Chooses k random elements (with replacement) from a population" - n = len(population) - _random, _int = random.random, int # speed hack - return [_int(_random() * n) for i in itertools.repeat(None, k)] - -# num of lines -def readinput(filename): - try: - f = open (filename) - except: - print >> sys.stderr, "can't open file "+str(filename) - sys.exit(0) - - nline = 0 - for line in f: - nline = nline + 1 - f.close() - return nline - -def sampline(infile,outfile,nSample,recSize,nSkip,replacement): - # sample nSample records from file - # each record contains recSize lines - # skip the top nSkip lines - - nLine = readinput(infile) - print 'num of lines in input: ',nLine - print 'avoid sampling the first ',nSkip,' lines' - print 'lines per record: ',recSize - - if (nLine-nSkip) % recSize: - print >> sys.stderr, "the number of lines is not dividable by record size!" - sys.exit(0) - - nTotalRecords = (nLine-nSkip) / recSize - print "total number of records: ",nTotalRecords - - if replacement or nTotalRecords < nSample: - sel = sample_wr(range(nTotalRecords),nSample) - else: - sel = random.sample(range(nTotalRecords),nSample) - - #print len(sel), sorted(sel) - - # output - try: - fout = open (outfile,'w') - except: - print >> sys.stderr, "can't open file "+str(outfile) - sys.exit(0) - fin = open(infile) - n = 0 # index of line - rec = "" # to store all content of a record - nrepeat = 0 # number of times a record is sampled - curr_rec = -1 - for line in fin: - if n < nSkip: - n = n + 1 - fout.write(line) - continue - - if not (n-nSkip) % recSize:# a new record - # print the previous sampled record - for i in range(nrepeat): - fout.write(rec) - curr_rec = (n-nSkip)/recSize - nrepeat = sel.count(curr_rec) - if nrepeat: # sampled - rec = line - #print curr_rec,nrepeat - elif (n-nSkip)/recSize == curr_rec: - rec = rec + line - n = n + 1 - # if the last record is selected - if curr_rec == nTotalRecords-1: - for i in range(nrepeat): - fout.write(rec) - fin.close() - fout.close() - - -if __name__ == "__main__": main()
--- a/sampline.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,108 +0,0 @@ -<tool id="sampline" name="sample"> - <description>records from a file</description> - <command interpreter="python">sampline.py --input=$input --output=$out_file1 --nSample=$nSample --recSize=$recSize --nSkip=$nSkip $replacement</command> - <inputs> - <param name="input" format="txt" type="data" label="Original file"/> - <param name="nSample" size="10" type="integer" value="100" label="Number of records to sample"/> - <param name="recSize" size="10" type="integer" value="1" label="Number of lines per record"/> - <param name="nSkip" size="10" type="integer" value="0" label="Number of top lines to output directly (without sampling)"/> - <param name="replacement" label="Sampling with replacement" type="boolean" truevalue="--replacement" falsevalue="" checked="False"/> - </inputs> - <outputs> - <data format="input" name="out_file1" /> - </outputs> - <tests> - <test> - <output name="out_file1" file="testmap.sampled"/> - <param name="input" value="test.map" ftype="TXT"/> - <param name="nSample" value="100"/> - <param name="recSize" value="1" /> - <param name="nSkip" value="0" /> - <param name="replacement" value=""/> - </test> - </tests> - <help> - -**What it does** - -This tool selects random records from a file. Each record is defined by a fixed number of lines. - -- When doing over-sampling, --replacement option is enforced by default. - ------ - -**Example 1: sampling from a BED file** - -parameters:: - - 1 line per record, sampling 5 lines, without replacement, output line 1 (track name) directly - -Input:: - - track name=test.bed - chr1 148078400 148078582 CCDS993.1_cds_0_0_chr1_148078401_r 0 - - chr11 116124407 116124501 CCDS8374.1_cds_0_0_chr11_116124408_r 0 - - chr15 41826029 41826196 CCDS10101.1_cds_0_0_chr15_41826030_f 0 + - chr16 142908 143003 CCDS10397.1_cds_0_0_chr16_142909_f 0 + - chr2 220229609 220230869 CCDS2443.1_cds_0_0_chr2_220229610_r 0 - - chr20 33579500 33579527 CCDS13256.1_cds_0_0_chr20_33579501_r 0 - - chr20 33593260 33593348 CCDS13257.1_cds_0_0_chr20_33593261_f 0 + - chr5 131621326 131621419 CCDS4152.1_cds_0_0_chr5_131621327_f 0 + - chr7 113660517 113660685 CCDS5760.1_cds_0_0_chr7_113660518_f 0 + - chrX 152648964 152649196 CCDS14733.1_cds_0_0_chrX_152648965_r 0 - - -Output:: - - track name=test.bed - chr11 116124407 116124501 CCDS8374.1_cds_0_0_chr11_116124408_r 0 - - chr16 142908 143003 CCDS10397.1_cds_0_0_chr16_142909_f 0 + - chr20 33579500 33579527 CCDS13256.1_cds_0_0_chr20_33579501_r 0 - - chr20 33593260 33593348 CCDS13257.1_cds_0_0_chr20_33593261_f 0 + - chr5 131621326 131621419 CCDS4152.1_cds_0_0_chr5_131621327_f 0 + - -**Example 2: sampling reads from a fastq file** - -parameters:: - - 4 line per record, sampling 3 records, without replacement - -Input:: - - @SRR066787.2496 WICMT-SOLEXA:8:1:28:2047 length=36 - NNANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN - +SRR066787.2496 WICMT-SOLEXA:8:1:28:2047 length=36 - !!%!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - @SRR066787.2497 WICMT-SOLEXA:8:1:28:463 length=36 - GTGATTAAGAAGAGACTGGCATCACTAAGGTGACAT - +SRR066787.2497 WICMT-SOLEXA:8:1:28:463 length=36 - @A=BBCBBAA@:@:@@@:,?AB:B?BB=*2:@=?AA - @SRR066787.2498 WICMT-SOLEXA:8:1:28:704 length=36 - GAACCCAATTTTCAAAGAAGTGTGACTGCTTGTTTC - +SRR066787.2498 WICMT-SOLEXA:8:1:28:704 length=36 - =?BAABBACCCCAA9>>A=>A?A;;@A>ABBABBB: - @SRR066787.2499 WICMT-SOLEXA:8:1:28:997 length=36 - CGACTTCAGGCTCTCGCTAGCCTTCGCTTGACTGAC - +SRR066787.2499 WICMT-SOLEXA:8:1:28:997 length=36 - BCCBCCB?A1ACAC>;@CCAAABB?8=BA>@?B?@: - @SRR066787.2500 WICMT-SOLEXA:8:1:28:582 length=36 - TCTCTCTCTTTCTCTCTCTCTCTCTCTCTCTCTCTC - +SRR066787.2500 WICMT-SOLEXA:8:1:28:582 length=36 - ?.?.=9C8CCC:BACBCBC?CCC@CBBBCBBACAC8 - -Output:: - - @SRR066787.2497 WICMT-SOLEXA:8:1:28:463 length=36 - GTGATTAAGAAGAGACTGGCATCACTAAGGTGACAT - +SRR066787.2497 WICMT-SOLEXA:8:1:28:463 length=36 - @A=BBCBBAA@:@:@@@:,?AB:B?BB=*2:@=?AA - @SRR066787.2499 WICMT-SOLEXA:8:1:28:997 length=36 - CGACTTCAGGCTCTCGCTAGCCTTCGCTTGACTGAC - +SRR066787.2499 WICMT-SOLEXA:8:1:28:997 length=36 - BCCBCCB?A1ACAC>;@CCAAABB?8=BA>@?B?@: - @SRR066787.2500 WICMT-SOLEXA:8:1:28:582 length=36 - TCTCTCTCTTTCTCTCTCTCTCTCTCTCTCTCTCTC - +SRR066787.2500 WICMT-SOLEXA:8:1:28:582 length=36 - ?.?.=9C8CCC:BACBCBC?CCC@CBBBCBBACAC8 - - </help> -</tool>
--- a/seq2meme.py Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,101 +0,0 @@ -# -*- coding: iso-8859-1 -*- -import random,sys,math - -#import pylab - -def readMotifFile(filename): - - try: - f=open(filename) - except IOError: - print "could not open",filename,"Are you sure this file exists?" - sys.exit(1) - - seqs=[] - maxL = 0 - for line in f: - if '>' in line or 'N' in line: - next - else: - seq = line.strip().upper() - if maxL < len(seq): - maxL = len(seq) - seqs.append(seq) - f.close() - - print len(seqs),'sequences loaded' - print 'max seq length:',maxL - for i in range(len(seqs)): - if len(seqs[i]) < maxL: - del seqs[i] - print len(seqs),'sequences with length = ',maxL - return seqs - - -def createWeightMatrix(seqs,psuedocont): - - motifWidth = len(seqs[0]) - weightMatrix = [] - for i in range(motifWidth): - weightMatrix.append({'A':psuedocont,'C':psuedocont,'G':psuedocont,'T':psuedocont}) - - #Use a for loop to iterate through all the sequences. For each sequence, begin at the start site in starts, and look at motifWidth bases. Count how many times each base appears in each position of the motif - #YOUR CODE HERE - for seq in seqs: - for pos in range(motifWidth): - weightMatrix[pos][seq[pos]] = weightMatrix[pos][seq[pos]] + 1.0 - - #Normalize your weight matrix (so that it contains probabilities rather than counts) - #Remember the added psuedocounts when you normalize! - for pos in range(motifWidth): - totalCount = sum(weightMatrix[pos].values()) - for letter in weightMatrix[pos].keys(): - weightMatrix[pos][letter] = weightMatrix[pos][letter]/totalCount - - #Return your weight matrix - return weightMatrix - -def printMemeFormat(weightMatrix,motifName,filename,nSite,background): - f = open(filename,'w') - f.write('MEME version 4.4\n\n') - - f.write('ALPHABET= ACGT\n\n') - - f.write('strands: + -\n\n') - - f.write('Background letter frequencies (from:\n') - f.write(background+'\n\n') - - f.write('MOTIF '+motifName+'\n\n') - - f.write('letter-probability matrix: alength= 4 '+'w= '+str(len(weightMatrix))+' nsites= '+str(nSite)+' E= 0\n') - for position in range(len(weightMatrix)): - probsThisPosition=weightMatrix[position] - f.write(' '+"%.6f" %(probsThisPosition['A'])+'\t '+"%.6f" %(probsThisPosition['C'])+'\t '+"%.6f" %(probsThisPosition['G'])+'\t '+"%.6f" %(probsThisPosition['T'])+'\t\n') - f.write('\n\n') - f.close() - -#get a two decimal-place string representation of a float f -def twoDecimal(f): - return "%.6f" %(f) - -def run(): - - #Get file name from command line - if len(sys.argv) < 3: - print "python seq2meme.py motif_fasta outputfile motifName psuedocont background" - sys.exit(1) - else: - motifFile=sys.argv[1] # - outFile=sys.argv[2] - motifName=sys.argv[3] - psuedocont = float(sys.argv[4]) - background=' '.join(sys.argv[5].strip().split(',')) - - motifs=readMotifFile(motifFile) - - #Create weight matrix - motifWeightMatrix=createWeightMatrix(motifs,psuedocont) - printMemeFormat(motifWeightMatrix,motifName,outFile,len(motifs),background) -run() -
--- a/seq2meme.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,21 +0,0 @@ -<tool id="seq2meme" name="create-motif-file"> - <description>from fasta file</description> - <command interpreter="python">seq2meme.py $input $output $motifName $psuedocont $background </command> - <inputs> - <param name="input" type="data" format="txt" label="Sequence file" help="all sequences should be the same length"/> - <param name="motifName" size="20" type="text" value="motif1" label="Motif name (no space allowed)"/> - <param name="psuedocont" size="10" type="float" value="1.0" label="Psuedocount"/> - <param name="background" size="40" type="text" value="A,0.25,C,0.25,G,0.25,T,0.25" label="Background frequency"/> - </inputs> - <outputs> - <data format="txt" name="output" label="$motifName-meme"/> - </outputs> - <help> - -**Description** - -Generate a MEME motif format file from a set of sequences. Input could be raw sequences (one sequence per line) or fasta format (one identifier line and one sequence line). - - - </help> -</tool>
--- a/seqshuffle.py Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,49 +0,0 @@ -import sys - -from altschulEriksonDinuclShuffle import * - -def readFastaFile(infile,outfile): - - fin = open(infile) - out = open(outfile,'w') - - currSeq='' - currSeqname=None - for line in fin: - if '>' in line: - if currSeqname !=None: - out.write(currSeqname+dinuclShuffle(currSeq)+'\n') - currSeqname=None - currSeq='' - currSeqname=line - else: - currSeq=currSeq+line.strip().upper() - - if currSeqname!=None: - out.write(currSeqname+dinuclShuffle(currSeq)+'\n') - - fin.close() - out.close() - -def readrawseq(infile,outfile): - ''' - each line is a sequence - ''' - fin = open(infile) - out = open(outfile,'w') - for line in fin: - out.write(dinuclShuffle(line.strip().upper())+'\n') - fin.close() - out.close() - -def main(): - seqfile = sys.argv[1] - outfile = sys.argv[2] - fasta = sys.argv[3] - - if fasta == 'fasta': - readFastaFile(seqfile,outfile) - else: - readrawseq(seqfile,outfile) - -main()
--- a/sequence.py Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,720 +0,0 @@ -#!@WHICHPYTHON@ - -import copy, string, sys - -#------------------ Alphabet ------------------- - -class Alphabet(object): - """Biological alphabet class. - This defines the set of symbols from which various objects can be built, e.g. sequences and motifs. - The symbol set is immutable and accessed as a tuple. - symstr: symbols in alphabet as either a tuple or string - complement: dictionary defining letters and their complement - """ - def __init__(self, symstr, complement = None): - """Construct an alphabet from a string or tuple of characters. - Lower case characters will be converted to upper case. - An optional mapping for complements may be provided. - Example: - >>> alpha = sequence.Alphabet('ACGTttga', {'A':'C', 'G':'T'}) - >>> alpha.getSymbols() - will construct the DNA alphabet and output: - ('A', 'C', 'G', 'T') - """ - symlst = [] - for s in [str(sym).upper()[0] for sym in symstr]: - if not s in symlst: - symlst.append(s) - self.symbols = tuple(symlst) - if complement != None: - # expand the mapping and check for contradictions - cmap = {} - for s in self.symbols: - c = complement.get(s, None) - if c != None: - if s in cmap and cmap[s] != c: - raise RuntimeError("Alphabet complement map " - "contains contradictory mapping") - cmap[s] = c - cmap[c] = s - # replace mapping with indicies - cimap = {} - for idx in range (len(self.symbols)): - s = self.symbols[idx] - if s in cmap: - cimap[cmap[s]] = idx - # create tuple - cidxlst = [] - for idx in range (len(self.symbols)): - cidxlst.append(cimap.get(self.symbols[idx], None)) - self.complements = tuple(cidxlst) - else: - self.complements = None - - def getSymbols(self): - """Retrieve a tuple with all symbols, immutable membership and order""" - return self.symbols - - def getComplements(self): - """Retrieve a tuple with all complement indicies, immutable""" - return self.complements - - def isValidSymbol(self, sym): - """Check if the symbol is a member of alphabet""" - return any([s==sym for s in self.symbols]) - - def getIndex(self, sym): - """Retrieve the index of the symbol (immutable)""" - for idx in range (len(self.symbols)): - if self.symbols[idx] == sym: - return idx - raise RuntimeError("Symbol " + sym + " does not exist in alphabet") - - def isComplementable(self): - return self.complements != None - - def getComplement(self, sym): - """Retrieve the complement of the symbol (immutable)""" - return self.symbols[self.complements[self.getIndex(sym)]]; - - def isValidString(self, symstr): - """Check if the string contains only symbols that belong to the alphabet""" - found = True - for sym in symstr: - if self.isValidSymbol(sym) == False: - return False - return True - - def getLen(self): - """Retrieve the number of symbols in (the length of) the alphabet""" - return len(self.symbols) - -# pre-defined alphabets that can be specified by their name -predefAlphabets = [ - ("DNA" , Alphabet('ACGT', {'A':'T', 'G':'C'})), - ("RNA" , Alphabet('ACGU')), - ("Extended DNA" , Alphabet('ACGTYRN')), - ("Protein" , Alphabet('ACDEFGHIKLMNPQRSTVWY')), - ("Extended Protein" , Alphabet('ACDEFGHIKLMNPQRSTVWYX')), - ("TM Labels" , Alphabet('MIO')) -] - -def getAlphabet(name): - """Retrieve a pre-defined alphabet by name. - Currently, "Protein", "DNA", "RNA", "Extended DNA", "Extended Protein" and "TM Labels" are available. - Example: - >>> alpha = sequence.getAlphabet('Protein') - >>> alpha.getSymbols() - will retrieve the 20 amino acid alphabet and output the tuple: - ('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y') - """ - for (xname, xalpha) in predefAlphabets: - if xname == name: - return xalpha - return None - -#------------------ Sequence ------------------- - -class Sequence(object): - """Biological sequence class. Sequence data is immutable. - - data: the sequence data as a tuple or string - alpha: the alphabet from which symbols are taken - name: the sequence name, if any - info: can contain additional sequence information apart from the name - """ - def __init__(self, sequence, alpha = None, name = "", seqinfo = ""): - """Create a sequence with sequence data. - Specifying the alphabet is optional, so is the name and info. - Example: - >>> myseq = sequence.Sequence('MVSAKKVPAIAMSFGVSF') - will create a sequence with name "", and assign one of the predefined alphabets on basis of what symbols were used. - >>> myseq.getAlphabet().getSymbols() - will most likely output the standard protein alphabet: - ('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y') - """ - if type(sequence) is str: - self.data = tuple(sequence.upper()) - elif type(sequence) is tuple: - self.data = sequence - elif type(sequence) is list: - self.data = tuple([s.upper() for s in sequence]) - else: - raise RuntimeError("Sequence data is not specified correctly: must be string or tuple") - # Resolve choice of alphabet - validAlphabet = False - if alpha == None: # Alphabet is not set, attempt to set it automatically... - for (xname, xalpha) in predefAlphabets: # Iterate through each predefined alphabet, in order - if xalpha.isValidString( self.data ): # This alphabet works, go with it - self.alpha = alpha = xalpha - validAlphabet = True - break - self.name = name - self.info = seqinfo - if validAlphabet == False: # we were either unsuccessful above or the alphabet was specified so test it - if type(alpha) is str: # check if name is a predefined alphabet - for (xname, xalpha) in predefAlphabets: # Iterate through each predefined alphabet, check for name - if (xname == alpha): - alpha = xalpha - break - if type(alpha) is Alphabet: # the alphabet is specified - if alpha.isValidString(self.data) == False: - raise RuntimeError("Invalid alphabet specified: "+"".join(alpha.getSymbols())+" is not compatible with sequence '"+"".join(self.data)+"'") - else: - self.alpha = alpha - else: - raise RuntimeError("Could not identify alphabet from sequence") - - #basic getters and setters for the class - def getName(self): - """Get the name of the sequence""" - return self.name - def getInfo(self): - """Get additional info of the sequence (e.g. from the defline in a FASTA file)""" - return self.info - def getAlphabet(self): - """Retrieve the alphabet that is assigned to this sequence""" - return self.alpha - def setName(self, name): - """Change the name of the sequence""" - self.name = name - def setAlphabet(self, alpha): - """Set the alphabet, throws an exception if it is not compatible with the sequence data""" - if type(alpha) is Alphabet: - if alpha.isValid( sequence ) == False: - raise RuntimeError( "Invalid alphabet specified" ) - #sequence functions - def getSequence(self): - """Retrieve the sequence data (a tuple of symbols)""" - return self.data - def getString(self): - """Retrieve the sequence data as a string (copy of actual data)""" - return "".join(self.data) - def getLen(self): - """Get the length of the sequence (number of symbols)""" - return len(self.data) - def getSite(self, position, length = 1): - """Retrieve a site in the sequence of desired length. - Note that positions go from 0 to length-1, and that if the requested site - extends beyond those the method throws an exception. - """ - if position >= 0 and position <= self.getLen() - length: - if length == 1: - return self.data[position] - else: - return self.data[position:position+length] - else: - raise RuntimeError( "Attempt to access invalid position in sequence "+self.getName() ) - - def nice(self): - """ A short description of the sequence """ - print self.getName(), ":", self.getLen() - -def readStrings(filename): - """ Read one or more lines of text from a file--for example an alignment. - Return as a list of strings. - filename: name of file - """ - txtlist = [] - f = open(filename) - for line in f.readlines(): - txtlist.extend(line.split()) - return txtlist - -def readFASTA(filename, alpha = None): - """ Read one or more sequences from a file in FASTA format. - filename: name of file to load sequences from - alpha: alphabet that is used (if left unspecified, an attempt is made to identify the alphabet for each individual sequence) - """ - seqlist = [] - seqname = None - seqinfo = None - seqdata = [] - fh = open(filename) - thisline = fh.readline() - while (thisline): - if (thisline[0] == '>'): # new sequence - if (seqname): # take care of the data that is already in the buffer before processing the new sequence - try: - seqnew = Sequence(seqdata, alpha, seqname, seqinfo) - seqlist.append(seqnew) - except RuntimeError, e: - print >> sys.stderr, "Warning: "+seqname+" is invalid (ignored): ", e - seqinfo = thisline[1:-1] # everything on the defline is "info" - seqname = seqinfo.split()[0] # up to first space - seqdata = [] - else: # pull out the sequence data - cleanline = thisline.split() - for line in cleanline: - seqdata.extend(tuple(line.strip('*'))) # sometimes a line ends with an asterisk in FASTA files - thisline = fh.readline() - - if (seqname): - try: - seqnew = Sequence(seqdata, alpha, seqname, seqinfo) - seqlist.append(seqnew) - except RuntimeError, e: - print >> sys.stderr, "Warning: " + seqname + " is invalid (ignored): ", e - else: - raise RuntimeError("No sequences on FASTA format found in this file") - fh.close() - return seqlist - -def _writeOneFASTA(sequence, filehandle): - """Write one sequence in FASTA format to an already open file""" - filehandle.write(">" + sequence.getName()+"\n") - data = sequence.getSequence() - lines = ( sequence.getLen() - 1) / 60 + 1 - for i in range(lines): - #note: python lets us get the last line (var length) free - #lineofseq = data[i*60 : (i+1)*60] + "\n" - lineofseq = "".join(data[i*60 : (i+1)*60]) + "\n" - filehandle.write(lineofseq) - -def writeFASTA(sequence, filename): - """Write a list (or a single) of sequences to a file in the FASTA format""" - fh = open(filename, "w") - if isinstance(sequence, Sequence): - _writeOneFASTA(sequence, fh) - else: - for seq in sequence: - if isinstance(seq, Sequence): - _writeOneFASTA(seq, fh) - else: - print >> sys.stderr, "Warning: could not write " + seq.getName() + " (ignored)." - fh.flush() - fh.close() - -#------------------ Distrib ------------------- - -class Distrib(object): - """Class for storing a multinomial probability distribution over the symbols in an alphabet""" - def __init__(self, alpha, pseudo_count = 0.0): - self.alpha = alpha - self.tot = pseudo_count * self.alpha.getLen() - self.cnt = [pseudo_count for _ in range( self.alpha.getLen() )] - - def __deepcopy__(self, memo): - dup = Distrib(self.alpha) - dup.tot = copy.deepcopy(self.tot, memo) - dup.cnt = copy.deepcopy(self.cnt, memo) - return dup - - def count(self, syms = None ): - """Count an observation of a symbol""" - if syms == None: - syms = self.alpha.getSymbols() - for sym in syms: - idx = self.alpha.getIndex( sym ) - self.cnt[idx] += 1.0 - self.tot += 1 - - def complement(self): - """Complement the counts, throw an error if this is impossible""" - if not self.alpha.isComplementable(): - raise RuntimeError("Attempt to complement a Distrib " - "based on a non-complementable alphabet.") - coms = self.alpha.getComplements() - new_count = [] - for idx in range(len(coms)): - cidx = coms[idx] - if cidx == None: - cidx = idx - new_count.append(self.cnt[cidx]) - self.cnt = new_count - return self - - def reset(self): - """Reset the distribution, that is, restart counting.""" - self.tot = 0 - self.cnt = [0.0 for _ in range( self.alpha.getLen() )] - - def getFreq(self, sym = None): - """Determine the probability distribution from the current counts. - The order in which probabilities are given follow the order of the symbols in the alphabet.""" - if self.tot > 0: - if sym == None: - freq = tuple([ y / self.tot for y in self.cnt ]) - return freq - else: - idx = self.alpha.getIndex( sym ) - return self.cnt[idx] / self.tot - return None - - def pretty(self): - """Retrieve the probabilites for all symbols and return as a pretty table (a list of text strings)""" - table = ["".join(["%4s " % s for s in self.alpha.getSymbols()])] - table.append("".join(["%3.2f " % y for y in Distrib.getFreq(self)])) - return table - - def getSymbols(self): - """Get the symbols in the alphabet in the same order as probabilities are given.""" - return self.alpha.getSymbols() - - def getAlphabet(self): - """Get the alphabet over which the distribution is defined.""" - return self.alpha - -#------------------ Motif (and subclasses) ------------------- - -class Motif(object): - """ Sequence motif class--defining a pattern that can be searched in sequences. - This class is not intended for direct use. Instead use and develop sub-classes (see below). - """ - def __init__(self, alpha): - self.len = 0 - self.alpha = alpha - - def getLen(self): - """Get the length of the motif""" - return self.len - - def getAlphabet(self): - """Get the alphabet that is used in the motif""" - return self.alpha - - def isAlphabet(self, seqstr): - """Check if the sequence can be processed by this motif""" - mystr = seqstr - if type(seqstr) is Sequence: - mystr = seqstr.getString() - return self.getAlphabet().isValidString(mystr) - -import re - -class RegExp(Motif): - """A motif class that defines the pattern in terms of a regular expression""" - def __init__(self, alpha, re_string): - Motif.__init__(self, alpha) - self.pattern = re.compile(re_string) - - def match(self, seq): - """Find matches to the motif in a specified sequence. - The method is a generator, hence subsequent hits can be retrieved using next(). - The returned result is a tuple (position, match-sequence, score), where score is - always 1.0 since a regular expression is either true or false (not returned). - """ - myseq = seq - if not type(seq) is Sequence: - myseq = Sequence(seq, self.alpha) - mystr = myseq.getString() - if not Motif.isAlphabet(self, mystr): - raise RuntimeError("Motif alphabet is not valid for sequence " + myseq.getName()) - for m in re.finditer(self.pattern, mystr): - yield (m.start(), m.group(), 1.0) - -import math, time - -# Variables used by the PWM for creating an EPS file -_colour_def = ( - "/black [0 0 0] def\n" - "/red [0.8 0 0] def\n" - "/green [0 0.5 0] def\n" - "/blue [0 0 0.8] def\n" - "/yellow [1 1 0] def\n" - "/purple [0.8 0 0.8] def\n" - "/magenta [1.0 0 1.0] def\n" - "/cyan [0 1.0 1.0] def\n" - "/pink [1.0 0.8 0.8] def\n" - "/turquoise [0.2 0.9 0.8] def\n" - "/orange [1 0.7 0] def\n" - "/lightred [0.8 0.56 0.56] def\n" - "/lightgreen [0.35 0.5 0.35] def\n" - "/lightblue [0.56 0.56 0.8] def\n" - "/lightyellow [1 1 0.71] def\n" - "/lightpurple [0.8 0.56 0.8] def\n" - "/lightmagenta [1.0 0.7 1.0] def\n" - "/lightcyan [0.7 1.0 1.0] def\n" - "/lightpink [1.0 0.9 0.9] def\n" - "/lightturquoise [0.81 0.9 0.89] def\n" - "/lightorange [1 0.91 0.7] def\n") -_colour_dict = ( - "/fullColourDict <<\n" - " (G) orange\n" - " (T) green\n" - " (C) blue\n" - " (A) red\n" - " (U) green\n" - ">> def\n" - "/mutedColourDict <<\n" - " (G) lightorange\n" - " (T) lightgreen\n" - " (C) lightblue\n" - " (A) lightred\n" - " (U) lightgreen\n" - ">> def\n" - "/colorDict fullColourDict def\n") - -_eps_defaults = { - 'LOGOTYPE': 'NA', - 'FONTSIZE': '12', - 'TITLEFONTSIZE': '12', - 'SMALLFONTSIZE': '6', - 'TOPMARGIN': '0.9', - 'BOTTOMMARGIN': '0.9', - 'YAXIS': 'true', - 'YAXISLABEL': 'bits', - 'XAXISLABEL': '', - 'TITLE': '', - 'ERRORBARFRACTION': '1.0', - 'SHOWINGBOX': 'false', - 'BARBITS': '2.0', - 'TICBITS': '1', - 'COLORDEF': _colour_def, - 'COLORDICT': _colour_dict, - 'SHOWENDS': 'false', - 'NUMBERING': 'true', - 'OUTLINE': 'false', -} -class PWM(Motif): - """This motif subclass defines a pattern in terms of a position weight matrix. - An alphabet must be provided. A pseudo-count to be added to each count is - optional. A uniform background distribution is used by default. - """ - def __init__(self, alpha): - Motif.__init__(self, alpha) # set alphabet of this multinomial distribution - self.background = Distrib(alpha) # the default background ... - self.background.count(alpha.getSymbols()) # ... is uniform - self.nsites = 0 - - def setFromAlignment(self, aligned, pseudo_count = 0.0): - """Set the probabilities in the PWM from an alignment. - The alignment is a list of equal-length strings (see readStrings), OR - a list of Sequence. - """ - self.cols = -1 - self.nsites = len(aligned) - seqs = [] - # Below we create a list of Sequence from the alignment, - # while doing some error checking, and figure out the number of columns - for s in aligned: - # probably a text string, so we make a nameless sequence from it - if not type(s) is Sequence: - s=Sequence(s, Motif.getAlphabet(self)) - else: - # it was a sequence, so we check that the alphabet in - # this motif will be able to process it - if not Motif.isAlphabet(self, s): - raise RuntimeError("Motif alphabet is not valid for sequence " + s.getName()) - if self.cols == -1: - self.cols = s.getLen() - elif self.cols != s.getLen(): - raise RuntimeError("Sequences in alignment are not of equal length") - seqs.append(s) - # The line below initializes the list of Distrib (one for each column of the alignment) - self.counts = [Distrib(Motif.getAlphabet(self), pseudo_count) for _ in range(self.cols)] - # Next, we do the counting, column by column - for c in range( self.cols ): # iterate through columns - for s in seqs: # iterate through rows - # determine the index of the symbol we find at this position (row, column c) - self.counts[c].count(s.getSite(c)) - # Update the length - self.len = self.cols - - def reverseComplement(self): - """Reverse complement the PWM""" - i = 0 - j = len(self.counts)-1 - while (i < j): - temp = self.counts[i]; - self.counts[i] = self.counts[j] - self.counts[j] = temp - self.counts[i].complement() - self.counts[j].complement() - i += 1; - j -= 1; - if i == j: - self.counts[i].complement() - return self - - def getNSites(self): - """Get the number of sites that made the PWM""" - return self.nsites - - def setBackground(self, distrib): - """Set the background distribution""" - if not distrib.getAlphabet() == Motif.getAlphabet(self): - raise RuntimeError("Incompatible alphabets") - self.background = distrib - - def getFreq(self, col = None, sym = None): - """Get the probabilities for all positions in the PWM (a list of Distribs)""" - if (col == None): - return [y.getFreq() for y in self.counts] - else: - return self.counts[col].getFreq(sym) - - def pretty(self): - """Retrieve the probabilites for all positions in the PWM as a pretty table (a list of text strings)""" - #table = ["".join(["%8s " % s for s in self.alpha.getSymbols()])] - table = [] - for row in PWM.getFreq(self): - table.append("".join(["%8.6f " % y for y in row])) - return table - - def logoddsPretty(self, bkg): - """Retrieve the (base-2) log-odds for all positions in the PWM as a pretty table (a list of text strings)""" - table = [] - for row in PWM.getFreq(self): - #table.append("".join(["%8.6f " % (math.log((row[i]+1e-6)/bkg[i])/math.log(2)) for i in range(len(row))])) - table.append("".join(["%8.6f " % (math.log((row[i])/bkg[i])/math.log(2)) for i in range(len(row))])) - #table.append("".join(["%8.6f " % row[i] for i in range(len(row))])) - return table - - - def consensus_sequence(self): - """ - Get the consensus sequence corresponding to a PWM. - Consensus sequence is the letter in each column - with the highest probability. - """ - consensus = "" - alphabet = Motif.getAlphabet(self).getSymbols() - for pos in range(self.cols): - best_letter = alphabet[0] - best_p = self.counts[pos].getFreq(best_letter) - for letter in alphabet[1:]: - p = self.counts[pos].getFreq(letter) - if p > best_p: - best_p = p - best_letter = letter - consensus += best_letter - return consensus - - - def consensus(self): - """ - Get the consensus corresponding to a PWM. - Consensus at each column of motif is a list of - characters with non-zero probabilities. - """ - consensus = [] - for pos in range(self.cols): - matches = [] - for letter in Motif.getAlphabet(self).getSymbols(): - p = self.counts[pos].getFreq(letter) - if p > 0: - matches += letter - consensus.append(matches) - return consensus - - - def getScore(self, seq, start): - """Score this particular list of symbols using the PFM (background needs to be set separately)""" - sum = 0.0 - seqdata = seq.getSequence()[start : start+self.cols] - for pos in range(len(seqdata)): - q = self.counts[pos].getFreq(seqdata[pos]) - if q == 0: - q = 0.0001 # to avoid log(0) == -Infinity - logodds = math.log(q / self.background.getFreq(seqdata[pos])) - sum += logodds - return sum - - def match(self, seq, _LOG0 = -10): - """Find matches to the motif in a specified sequence. - The method is a generator, hence subsequent hits can be retrieved using next(). - The returned result is a tuple (position, match-sequence, score). - The optional parameter _LOG0 specifies a lower bound on reported logodds scores. - """ - myseq = seq - if not type(seq) is Sequence: - myseq = Sequence(seq, self.alpha) - if not Motif.isAlphabet(self, myseq): - raise RuntimeError("Motif alphabet is not valid for sequence " + myseq.getName()) - for pos in range(myseq.getLen() - self.cols): - score = PWM.getScore(self, myseq, pos) - if score > _LOG0: - yield (pos, "".join(myseq.getSite(pos, self.cols)), score) - - def writeEPS(self, program, template_file, eps_fh, - timestamp = time.localtime()): - """Write out a DNA motif to EPS format.""" - small_dfmt = "%d.%m.%Y %H:%M" - full_dfmt = "%d.%m.%Y %H:%M:%S %Z" - small_date = time.strftime(small_dfmt, timestamp) - full_date = time.strftime(full_dfmt, timestamp) - points_per_cm = 72.0 / 2.54 - height = 4.5 - width = self.getLen() * 0.8 + 2 - width = min(30, width) - points_height = int(height * points_per_cm) - points_width = int(width * points_per_cm) - defaults = _eps_defaults.copy() - defaults['CREATOR'] = program - defaults['CREATIONDATE'] = full_date - defaults['LOGOHEIGHT'] = str(height) - defaults['LOGOWIDTH'] = str(width) - defaults['FINEPRINT'] = program + ' ' + small_date - defaults['CHARSPERLINE'] = str(self.getLen()) - defaults['BOUNDINGHEIGHT'] = str(points_height) - defaults['BOUNDINGWIDTH'] = str(points_width) - defaults['LOGOLINEHEIGHT'] = str(height) - with open(template_file, 'r') as template_fh: - m_var = re.compile("\{\$([A-Z]+)\}") - for line in template_fh: - last = 0 - match = m_var.search(line) - while (match): - if (last < match.start()): - prev = line[last:match.start()] - eps_fh.write(prev) - key = match.group(1) - if (key == "DATA"): - eps_fh.write("\nStartLine\n") - for pos in range(self.getLen()): - eps_fh.write("({0:d}) startstack\n".format(pos+1)) - stack = [] - # calculate the stack information content - alpha_ic = 2 - h = 0 - for sym in self.getAlphabet().getSymbols(): - freq = self.getFreq(pos, sym) - if (freq == 0): - continue - h -= (freq * math.log(freq, 2)) - stack_ic = alpha_ic - h - # calculate the heights of each symbol - for sym in self.getAlphabet().getSymbols(): - freq = self.getFreq(pos, sym) - if (freq == 0): - continue - stack.append((freq * stack_ic, sym)) - stack.sort(); - # output the symbols - for symh, sym in stack: - eps_fh.write(" {0:f} ({1:s}) numchar\n".format( - symh, sym)) - eps_fh.write("endstack\n\n") - eps_fh.write("EndLine\n") - elif (key in defaults): - eps_fh.write(defaults[key]) - else: - raise RuntimeError('Unknown variable "' + key + - '" in EPS template') - last = match.end(); - match = m_var.search(line, last) - if (last < len(line)): - eps_fh.write(line[last:]) - - -#------------------ Main method ------------------- -# Executed if you run this file from the operating system prompt, e.g. -# > python sequence.py - -if __name__=='__main__': - alpha = getAlphabet('Extended DNA') - #seqs = readFASTA('pos.fasta') - seqs = [] - aln = readStrings('tmp0') - #regexp = RegExp(alpha, '[AG]G.[DE]TT[AS].') - pwm = PWM(alpha) - pwm.setFromAlignment(aln) - for row in pwm.pretty(): - print row - for s in seqs: - print s.getName(), s.getLen(), s.getAlphabet().getSymbols() - for m in regexp.match( s ): - print "pos: %d pat: %s %4.2f" % (m[0], m[1], m[2]) - for m in pwm.match( s ): - print "pos: %d pat: %s %4.2f" % (m[0], m[1], m[2])
--- a/shuffleBed.py Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,107 +0,0 @@ -''' -simulate a random interval set that mimics the size and strand of a reference set -''' - -def inferSizeFromRefBed(filename,header): - ''' - read reference interval set, get chrom size information - ''' - chrSize = {} - f = open(filename) - if header: - header = f.readline() - for line in f: - flds = line.strip().split('\t') - if not chrSize.has_key(flds[0]): - chrSize[flds[0]] = int(flds[2]) - elif chrSize[flds[0]] < int(flds[2]): - chrSize[flds[0]] = int(flds[2]) - f.close() - return chrSize - -def getChrSize(filename): - chrSize = {} - f = open(filename) - for line in f: - flds = line.strip().split('\t') - if len(flds) >1: - chrSize[flds[0]] = int(flds[1]) - f.close() - return chrSize - -def makeWeightedChrom(chrSize): - ''' - make a list of chr_id, the freq is proportional to its length - ''' - - genome_len = 0 - - for chrom in chrSize: - chrom_len = chrSize[chrom] - genome_len += chrom_len - - weighted_chrom = [] - for chrom in chrSize: - weight = int(round(1000*float(chrSize[chrom])/genome_len)) - weighted_chrom += [chrom]*weight - - return weighted_chrom - -def randomIntervalWithinChrom(infile,outfile,chrSize,header): - ''' - ''' - fin = open(infile) - if header: - header = fin.readline() - fout = open(outfile,'w') - n = 0 - for line in fin: - n = n + 1 - flds = line.strip().split('\t') - interval_size = int(flds[2]) - int(flds[1]) - rstart = random.randint(0,chrSize[flds[0]]-interval_size) - fout.write(flds[0]+'\t'+str(rstart)+'\t'+str(rstart+interval_size)+'\t'+str(n)+'\t0\t+\n') - fin.close() - fout.close() - -def randomIntervalAcrossChrom(infile,outfile,chrSize,weighted_chrom,header): - ''' - ''' - fin = open(infile) - if header: - header = fin.readline() - fout = open(outfile,'w') - n = 0 - for line in fin: - n = n + 1 - flds = line.strip().split('\t') - interval_size = int(flds[2]) - int(flds[1]) - # find a random chrom - flds[0] = weighted_chrom[random.randint(0, len(weighted_chrom) - 1)] - # random start in the chrom - rstart = random.randint(0,chrSize[flds[0]]-interval_size) - fout.write(flds[0]+'\t'+str(rstart)+'\t'+str(rstart+interval_size)+'\t'+str(n)+'\t0\t+\n') - fin.close() - fout.close() - -import sys,random -def main(): - # python random_interval.py test100.bed testout.bed across header human.hg18.genome - - reference_interval_file = sys.argv[1] - output_file = sys.argv[2] - across_or_within_chrom = sys.argv[3] # within or across - if sys.argv[4] == 'header': - header = True - else: - header = False - if len(sys.argv) == 6: - chrom_size_file = sys.argv[5] - chrSize = getChrSize(chrom_size_file) - else: - chrSize = inferSizeFromRefBed(reference_interval_file,header) - if across_or_within_chrom == 'within': - randomIntervalWithinChrom(reference_interval_file,output_file,chrSize,header) - else: - randomIntervalAcrossChrom(reference_interval_file,output_file,chrSize,makeWeightedChrom(chrSize),header) -main()
--- a/shuffleBed.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,43 +0,0 @@ -<tool id="shufflebed" name="shuffleBed"> - <description>chromosome not weighted by length</description> - <command>shuffleBed -i $input -g $genome $chrom > $outfile - #if $limit.limit_select=="include": - -incl $limitfile - #else if $limit.limit_select=="exclude": - -excl $limitfile - #end if - </command> - <inputs> - <param name="input" format="bed,gff,vcf" type="data" label="Original intervals (BED/GFF/VCF)" /> - <param name="genome" type="select" label="Select genome"> - <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/mouse.mm9.genome" selected="true">mm9</option> - <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/mouse.mm8.genome">mm8</option> - <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/human.hg18.genome">hg18</option> - <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/human.hg19.genome">hg19</option> - </param> - <param name="chrom" label="keep intervals on the same chromosome?" type="boolean" truevalue="-chrom" falsevalue="" checked="False"/> - <conditional name="limit"> - <param name="limit_select" type="select" label="restrictions for the shuffling" help="Instead of randomly placing features in a genome, one can specify regions features should or should not be randomly placed (e.g. genes.bed or repeats.bed)."> - <option value="none" selected="true">None</option> - <option value="include">within specified regions</option> - <option value="exclude">outside specified regions</option> - </param> - <when value="include"> - <param name="limitfile" type="data" format="interval" label="specify regions"/> - </when> - <when value="exclude"> - <param name="limitfile" type="data" format="interval" label="specify regions"/> - </when> - </conditional> - </inputs> - <outputs> - <data format="input" name="outfile" /> - </outputs> - <help> - -.. class:: infomark - -Every chromosome are choosed with equal probability, regardless their size. Please use the tool 'random intervals' instead for general randomization. - - </help> -</tool>
--- a/spatial_proximity.py Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,36 +0,0 @@ - -import os,sys - -file1 = sys.argv[1] -file2 = sys.argv[2] -genome = sys.argv[3] -outplot = sys.argv[4] -outlog = sys.argv[5] -outbed = sys.argv[6] - -strandness = '' -if len(sys.argv) > 7: - strandness = sys.argv[7] - -# real distance -cmd = 'closestBed -a '+file1+' -b '+file2 + ' '+strandness + ' -d -t first > '+outbed -os.system(cmd) -# shuffle -cmd = 'shuffleBed -chrom -g '+genome+' -i '+file1+'> shuffled.bed' -os.system(cmd) -# shuffled distance -cmd = 'closestBed -a shuffled.bed -b '+file2 + ' '+strandness + ' -d -t first > shuffled.dist' -os.system(cmd) - - -# test in R -r = open('tmp.r','w') -r.write("options(warn=-1)\n") -r.write("source('/Users/xuebing/galaxy-dist/tools/mytools/cdf.r')\n") -r.write("x = read.table('"+outbed+"',sep='\t')\n") -r.write("y = read.table('shuffled.dist',sep='\t')\n") -r.write("pdf('"+outplot+"')\n") -r.write("mycdf(list(log10(1+x[,ncol(x)]),log10(1+y[,ncol(y)])),'spatial distance',c('real','shuffled'),'topleft','log10 distance','')\n") -r.write("dev.off()\n") -r.close() -os.system("R --vanilla < tmp.r >"+outlog)
--- a/spatial_proximity.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,32 +0,0 @@ -<tool id="spatialproximity" name="spatial proximity"> - <description>of two interval sets</description> - <command interpreter="python">spatial_proximity.py $inputa $inputb $genome $outplot $outlog $outbed $strandness - </command> - <inputs> - <param name="inputa" format="interval" type="data" label="Interval set 1" /> - <param name="inputb" format="interval" type="data" label="Interval set 2" /> - <param name="genome" type="select" label="Select genome"> - <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/mouse.mm9.genome" selected="true">mm9</option> - <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/mouse.mm8.genome">mm8</option> - <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/human.hg18.genome">hg18</option> - <option value="/Users/xuebing/galaxy-dist/tool-data/genome/chrsize/human.hg19.genome">hg19</option> - </param> - <param name="strandness" type="select" label="Strand requirement" > - <option value="" selected="true"> none </option> - <option value="-s" > -s: closest feature on the same strand</option> - <option value="-S" > -S: closest feature on the opposite strand </option> - </param> - </inputs> - <outputs> - <data format="input" name="outbed" label="${tool.name} on ${on_string}: (bed)" /> - <data format="pdf" name="outplot" label="${tool.name} on ${on_string}: (plot)" /> - <data format="txt" name="outlog" label="${tool.name} on ${on_string}: (log)" /> - </outputs> - <help> - -.. class:: infomark - -for each feature in the first interval set, find the closest in the second set, then compared the distance distribution to shuffled set 1. - - </help> -</tool>
--- a/splicesite.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,64 +0,0 @@ -<tool id="splicesite" name="splice site score"> - <description>using max entropy model</description> - <command interpreter="perl">$script $input > $out_file1 </command> - <inputs> - <param name="input" format="txt" type="data" label="Sequence file" help="fasta or raw sequence (one sequence per line)"/> - <param name="script" type="select" label="Select model"> - <option value="splicesitescore/score5.pl" selected="true">5' splice site</option> - <option value="splicesitescore/score3.pl">3' splice site</option> - </param> - </inputs> - <outputs> - <data format="tabular" name="out_file1" /> - </outputs> - <help> - -**What it does** - -This tool computes splice site scores using a max entropy model. See more details here: - -http://genes.mit.edu/burgelab/maxent/Xmaxentscan_scoreseq.html - ------ - -**Example input for 5' splice site sequence** - -3 exonic and 6 intronic nucleotides flanking the junction:: - - CTGGTGAGT - AAGGTACAG - -or fasta format:: - - >seq1 - CTGGTGAGT - >seq2 - AAGGTACAG - -Output:: - - CTGGTGAGT 10.10 - AAGGTACAG 8.04 - -or fasta format:: - - >seq1 CTGGTGAGT 10.10 - >seq2 AAGGTACAG 8.04 - - ------ - -**Example input for 3' splice site sequence** - -3 exonic and 20 intronic nucleotides flanking the junction:: - - CCTGCATCCTCTGTTCCCAGGTG - TTTCTTCCCTCCGGGAACAGTGG - -Output:: - - CCTGCATCCTCTGTTCCCAGGTG 10.47 - TTTCTTCCCTCCGGGAACAGTGG 6.22 - - </help> -</tool>
--- a/venn.xml Sat Mar 10 08:17:36 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,82 +0,0 @@ -<tool id="venn" name="Venn Diagram"> - <description>from summary counts</description> - <command interpreter="python">$script_file $labels $counts $output $script</command> - <inputs> - <param name="labels" type="text" value="A,B" size="50" label="Set Labels"/> - <param name="counts" type="text" value="30,10,20" size="80" label="Counts in each region" help="region order: two sets: A, B, AB; three sets: A,B,C,AB,BC,AC,ABC"/> - </inputs> - <configfiles> - <configfile name="script_file"> -import os -labels = '${labels}'.replace(' ','_').split(',') -counts = '${counts}'.replace(' ','').split(',') -counts = map(int,counts) -rscript = open('${script}','w') -rscript.write("options(warn=-1)\n") -rscript.write("pdf('"+"${output}"+"')\n") -rscript.write("library(grid)\n") -rscript.write("library(VennDiagram)\n") -if len(labels)==2: - for i in range(2): - counts[i+1] = counts[i+1]+counts[i] - rscript.write("venn =venn.diagram(\n\tx=list(\n\t\t"+labels[0]+"=c(1:"+str(counts[0])+","+str(counts[1]+1)+":"+str(counts[2])+"),\n\t\t"+labels[1]+"="+str(counts[0]+1)+":"+str(counts[2])+"),\n\tfilename=NULL,\n\tfill=c('red','blue'),\n\tcol='transparent',\n\talpha=0.5,\n\tlabel.col='black',\n\tcex=2,\n\tlwd=0,\n\tfontfamily='serif',\n\tfontface='bold',\n\tcat.col = c('red', 'blue'),\n\tcat.cex=2,\n\tcat.fontfamily='serif',\n\tcat.fontface='bold')\n") -else: - for i in range(6): - counts[i+1] = counts[i+1]+counts[i] - rscript.write("venn =venn.diagram(\n\tx=list(\n\t\t"+labels[0]+"=c(1:"+str(counts[0])+","+str(counts[2]+1)+":"+str(counts[3])+","+str(counts[4]+1)+":"+str(counts[6])+"),\n\t\t"+labels[1]+"=c("+str(counts[0]+1)+":"+str(counts[1])+","+str(counts[2]+1)+":"+str(counts[4])+","+str(counts[5]+1)+":"+str(counts[6])+"),\n\t\t"+labels[2]+"=c("+str(counts[1]+1)+":"+str(counts[2])+","+str(counts[3]+1)+":"+str(counts[6])+")),\n\tfilename=NULL,\n\tfill=c('red','blue','green'),\n\tcol='transparent',\n\talpha=0.5,\n\tlabel.col='black',\n\tcex=2,\n\tlwd=0,\n\tfontfamily='serif',\n\tfontface='bold',\n\tcat.col = c('red', 'blue','green'),\n\tcat.cex=2,\n\tcat.fontfamily='serif',\n\tcat.fontface='bold')\n") -rscript.write("grid.draw(venn)\n") -rscript.write("dev.off()\n") -rscript.close() -os.system("cat "+"${script}"+" | R --vanilla --slave") - </configfile> - </configfiles> - - <outputs> - <data format="txt" name="script" label="${tool.name} on ${on_string}: (script)" /> - <data format="pdf" name="output" label="${tool.name} on ${on_string}: (plot)" /> - - </outputs> - -<help> -.. class:: infomark - -This is a wrapper for R package VennDiagram. It allows you to plot two-set or three-set venn diagrams based on counts. The R script used to generate the plot is also in the output. - -Input: labels for sets and counts for each region in the diagram. - -A: A-only - -B: B-only - -C: C-only - -AB: in A and B but not C - -BC: in B and C but not A - -AC: in A and C but not B - -ABC: in A, B, and C - ------ - -**Example** - -Labels: X,Y - -Counts: 30,10,20 - - -.. image:: ./static/images/venn2.png - - -Labels: A,B,C - -Counts: 10,20,30,40,50,60,70 - - -.. image:: ./static/images/venn3.png - - -</help> -</tool>
