Mercurial > repos > marpiech > norwich_tools
changeset 7:77114c36b8ab draft default tip
planemo upload
author | marpiech |
---|---|
date | Mon, 29 Aug 2016 07:28:10 -0400 |
parents | 97fb58d6c0cb |
children | |
files | README.txt bam_to_bigwig.py bam_to_bigwig.xml sample.xml test-data/bam_to_bigwig_test.bam test-data/bam_to_bigwig_test.bigwig tool_dependencies.xml |
diffstat | 7 files changed, 236 insertions(+), 13 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.txt Mon Aug 29 07:28:10 2016 -0400 @@ -0,0 +1,48 @@ +Convert a BAM file into a BigWig coverage file. This can be used directly from +Galaxy for display at UCSC. The advantage over standard Wiggle format is that +the data is stored in a compressed format and can be retrieved by genome +region. This allows you to view regions of arbitrarily large Wiggle file data +at UCSC while avoiding the upload costs. + +History +------- + +v0.2.0 add a sort step after genomeCoverageBed which is required in some +instances otherwise bedGraphToBigWig will complain. This version also uses +Galaxy's dependency mechanism, added some tests, and updated some formatting. +By Lance Parsons. + +v0.1.1 passes the forgotten split argument and moves to using the new +sub-command enabled bedtools. Thanks to David Leader. + +As of v0.1.0, the Galaxy tools uses a revised bam_to_bigwig.py script using +genomeCoverageBed and bedGraphToBigWig - this approach allows gaps/skpis to +be excluded from the coverage calculation, which is important for RNA-Seq. + +Until v0.0.2, this Galaxy tool used the bam_to_wiggle.py script from +https://github.com/chapmanb/bcbb/blob/master/nextgen/scripts/bam_to_wiggle.py +which internally used pysam (and thus samtools) and wigToBigWig from UCSC. + +Requirements +------------ + +If you are installing this tool manually, place the Python script in the +same directory as the XML configuration file, or provide a soft link to it. +Ensure the following command line tools are on the system path: + +pysam - Python interface to samtools (http://code.google.com/p/pysam/) +genomeCoverageBed - part of BedTools (http://code.google.com/p/bedtools/) +bedGraphToBigWig - from UCSC (http://hgdownload.cse.ucsc.edu/admin/exe/) + +Credits +------- + +Original script by Brad Chapman, revisions from Peter Cock including the +switch to using genomeCoverageBed and bedGraphToBigWig based on the work +of Lance Parsons. + +License +------ + +The code is freely available under the MIT license: +http://www.opensource.org/licenses/mit-license.html
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bam_to_bigwig.py Mon Aug 29 07:28:10 2016 -0400 @@ -0,0 +1,122 @@ +#!/usr/bin/env python +"""Convert BAM files to BigWig file format in a specified region. + +Original version copyright Brad Chapman with revisions from Peter Cock +and ideas from Lance Parsons + +Usage: + bam_to_bigwig.py <BAM file> [--outfile=<output file name>] [--split] + +The --split argument is passed to bedtools genomecov + +The script requires: + pysam (http://code.google.com/p/pysam/) + bedtools genomecov (http://code.google.com/p/bedtools/) + bedGraphToBigWig from UCSC (http://hgdownload.cse.ucsc.edu/admin/exe/) +""" +import os +import sys +import subprocess +import tempfile +from optparse import OptionParser +from contextlib import contextmanager, closing + +import pysam + + +def main(bam_file, outfile=None, split=False): + config = {"program": {"ucsc_bedGraphToBigWig": ["bedGraphToBigWig"], + "bedtools_genomeCoverageBed": + ["bedtools", "genomecov"]}} + if outfile is None: + outfile = "%s.bigwig" % os.path.splitext(bam_file)[0] + if os.path.abspath(bam_file) == os.path.abspath(outfile): + sys.stderr.write("Bad arguments, " + "input and output files are the same.\n") + sys.exit(1) + if os.path.exists(outfile) and os.path.getsize(outfile) > 0: + sys.stderr.write("Warning, output file already exists.\n") + + sizes = get_sizes(bam_file, config) + print "Have %i references" % len(sizes) + if not sizes: + sys.stderr.write("Problem reading BAM header.\n") + sys.exit(1) + + # Use a temp file to avoid any possiblity of not having write permission + temp_handle = tempfile.NamedTemporaryFile(delete=False) + temp_file = temp_handle.name + with closing(temp_handle): + print "Calculating coverage..." + convert_to_graph(bam_file, split, config, temp_handle) + try: + print("Converting %i MB graph file to bigwig..." % + (os.path.getsize(temp_file) // (1024 * 1024))) + # Can't pipe this as stdin due to converter design, + # https://lists.soe.ucsc.edu/pipermail/genome/2011-March/025455.html + convert_to_bigwig(temp_file, sizes, config, outfile) + finally: + if os.path.isfile(temp_file): + os.remove(temp_file) + print "Done" + + +@contextmanager +def indexed_bam(bam_file, config): + if not os.path.exists(bam_file + ".bai"): + pysam.index(bam_file) + sam_reader = pysam.Samfile(bam_file, "rb") + yield sam_reader + sam_reader.close() + + +def get_sizes(bam_file, config): + with indexed_bam(bam_file, config) as work_bam: + sizes = zip(work_bam.references, work_bam.lengths) + return sizes + + +def convert_to_graph(bam_file, split, config, out_handle): + cl = config["program"]["bedtools_genomeCoverageBed"] + \ + ["-ibam", bam_file, "-bg"] + if split: + cl.append("-split") + new_env = os.environ.copy() + new_env['LC_COLLATE'] = 'C' + p1 = subprocess.Popen(cl, stdout=subprocess.PIPE) + p2 = subprocess.Popen(["sort", "-k1,1", "-k2,2n"], + env=new_env, + stdin=p1.stdout, + stdout=out_handle) + p1.stdout.close() + p2.communicate() + + +def convert_to_bigwig(bedgraph_file, chr_sizes, config, bw_file): + # This will be fine under Galaxy, but could use temp folder? + size_file = "%s-sizes.txt" % (os.path.splitext(bw_file)[0]) + with open(size_file, "w") as out_handle: + for chrom, size in chr_sizes: + out_handle.write("%s\t%s\n" % (chrom, size)) + try: + cl = config["program"]["ucsc_bedGraphToBigWig"] + \ + [bedgraph_file, size_file, bw_file] + subprocess.check_call(cl) + finally: + os.remove(size_file) + return bw_file + + +if __name__ == "__main__": + parser = OptionParser() + parser.add_option("-o", "--outfile", dest="outfile") + parser.add_option("-s", "--split", action="store_true", dest="split") + (options, args) = parser.parse_args() + if len(args) not in [1, 2]: + print "Incorrect arguments" + print __doc__ + sys.exit() + kwargs = dict( + outfile=options.outfile, + split=options.split) + main(*args, **kwargs)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bam_to_bigwig.xml Mon Aug 29 07:28:10 2016 -0400 @@ -0,0 +1,58 @@ +<tool id="bam_to_bigwig" name="BAM to BigWig" version="0.2.0"> + + <description>Calculates coverage from a BAM alignment file</description> + + <requirements> + <requirement type="package" version="0.8.3">pysam</requirement> + <requirement type="package" version="2.24">bedtools</requirement> + <requirement type="package" version="312">ucsc_tools</requirement> + </requirements> + + <command detect_errors="aggressive" interpreter="python"><![CDATA[ + bam_to_bigwig.py $align --outfile=$out --split + ]]></command> + + <inputs> + <param format="bam" name="align" type="data" label="BAM alignment file"/> + </inputs> + + <outputs> + <data format="bigwig" name="out" /> + </outputs> + + <tests> + <test> + <param name="align" value="bam_to_bigwig_test.bam"/> + <output name="out" file="bam_to_bigwig_test.bigwig"/> + </test> + </tests> + +<help><![CDATA[ +**What it does** + +Creates a coverage file in BigWig format, given a BAM alignment file. + +Gaps or skips (CIGAR D or N operators) are not counted towards the coverage +calculation, which is important when mapping RNA Seq reads to genes with +introns. + +**Input** + +A BAM alignment file. This needs to have the genome database build used in +alignment annotated. If your file has '?' for the database build, click on the +pencil icon to edit the alignment attributes, and specify the organism used to +align against. + +**Output** + +BigWig files can be loaded directly from Galaxy into the UCSC browser. They can +be loaded incrementally by UCSC, so a single file can be used to represent the +entire genome without having to upload the entire thing as a custom track. + +]]></help> + + <citations> + <citation type="doi">10.1093/bioinformatics/btp352</citation> + <citation type="doi">10.1093/bioinformatics/btq033</citation> + </citations> +</tool>
--- a/sample.xml Mon Aug 29 07:21:05 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,11 +0,0 @@ -<tool id="sam_filter_1" name="SAM Filter"> - <command interpreter="python"> - - </command> - <inputs> - - </inputs> - <outputs> - - </outputs> -</tool>
--- a/tool_dependencies.xml Mon Aug 29 07:21:05 2016 -0400 +++ b/tool_dependencies.xml Mon Aug 29 07:28:10 2016 -0400 @@ -1,6 +1,12 @@ <?xml version="1.0"?> <tool_dependency> - <package name="abyss" version="1.9.0"> - <repository changeset_revision="ded58f1da91c" name="package_abyss_1_9_0" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" /> + <package name="pysam" version="0.8.3"> + <repository changeset_revision="08db58be052a" name="package_python_2_7_pysam_0_8_3" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" /> + </package> + <package name="bedtools" version="2.24"> + <repository changeset_revision="39b86c1e267d" name="package_bedtools_2_24" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" /> + </package> + <package name="ucsc_tools" version="312"> + <repository changeset_revision="2d6bafd63401" name="package_ucsc_tools_312" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" /> </package> </tool_dependency>