changeset 2:e2edfa478eb4

Covert to use bedGraph intermediate with bedtools. Thanks to Peter Cock and Lance Parsons for the updates
author Brad Chapman <chapmanb@50mail.com>
date Wed, 05 Sep 2012 21:14:54 -0400
parents 0ff100a057ef
children 294e9dae5a9b
files bam_to_bigwig/README.txt bam_to_bigwig/bam_to_bigwig.py bam_to_bigwig/bam_to_bigwig.xml bam_to_bigwig/bam_to_wiggle.py
diffstat 4 files changed, 148 insertions(+), 135 deletions(-) [+]
line wrap: on
line diff
--- a/bam_to_bigwig/README.txt	Tue Jun 07 16:26:46 2011 -0400
+++ b/bam_to_bigwig/README.txt	Wed Sep 05 21:14:54 2012 -0400
@@ -4,15 +4,31 @@
 region. This allows you to view regions of arbitrarily large Wiggle file data
 at UCSC while avoiding the upload costs.
 
-The latest version of the bam_to_wiggle.py script is available at:
+History
+-------
+
+As of v0.1.0, the Galaxy tools uses a revised bam_to_bigwig.py script using
+genomeCoverageBed and bedGraphToBigWig - this approach allows gaps/skpis to
+be excluded from the coverage calculation, which is important for RNA-Seq.
 
+Until v0.0.2, this Galaxy tool used the bam_to_wiggle.py script from
 https://github.com/chapmanb/bcbb/blob/master/nextgen/scripts/bam_to_wiggle.py
+which internally used pysam (and thus samtools) and wigToBigWig from UCSC.
+
+Requirements
+------------
 
-Place the script in the same directory as the XML configuration file, or
-provide a soft link to it.
+If you are installing this tool manually, place the Python script in the
+same directory as the XML configuration file, or provide a soft link to it.
+Ensure the following command line tools are on the system path:
 
-This requires:
+pysam - Python interface to samtools (http://code.google.com/p/pysam/)
+genomeCoverageBed - part of BedTools (http://code.google.com/p/bedtools/)
+bedGraphToBigWig  - from UCSC (http://hgdownload.cse.ucsc.edu/admin/exe/)
 
-Python2, version 2.6 or better
-pysam (http://code.google.com/p/pysam/)
-wigToBigWig from UCSC (http://hgdownload.cse.ucsc.edu/admin/exe/)
+Credits
+-------
+
+Original script by Brad Chapman, revisions from Peter Cock including the
+switch to using genomeCoverageBed and bedGraphToBigWig based on the work
+of Lance Parsons.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bam_to_bigwig/bam_to_bigwig.py	Wed Sep 05 21:14:54 2012 -0400
@@ -0,0 +1,115 @@
+#!/usr/bin/env python
+#Original version copyright Brad Chapman with revisions from Peter Cock
+#and ideas from Lance Parsons
+"""Convert BAM files to BigWig file format in a specified region.
+
+Usage:
+    bam_to_wiggle.py <BAM file> [<YAML config>] [--outfile=<output file name>]
+
+The optional config file (not used by the Galaxy interface) is in YAML format
+and specifies the location of the binary tools.
+
+program:
+  bedtools_genomeCoverageBed: genomeCoverageBed
+  ucsc_bedGraphToBigWig: bedGraphToBigWig
+
+If not specified, these will be assumed to be present in the system path.
+
+The script requires:
+    pysam (http://code.google.com/p/pysam/)
+    genomeCoverageBed from BedTools (http://code.google.com/p/bedtools/)
+    bedGraphToBigWig from UCSC (http://hgdownload.cse.ucsc.edu/admin/exe/)
+If a configuration file is used, then PyYAML is also required (http://pyyaml.org/)
+"""
+import os
+import sys
+import subprocess
+import tempfile
+from optparse import OptionParser
+from contextlib import contextmanager, closing
+
+import pysam
+
+def main(bam_file, config_file=None, outfile=None, split=False):
+    if config_file:
+        import yaml
+        with open(config_file) as in_handle:
+            config = yaml.load(in_handle)
+    else:
+        config = {"program": {"ucsc_bedGraphToBigWig" : "bedGraphToBigWig",
+                              "bedtools_genomeCoverageBed" : "genomeCoverageBed"}}
+    if outfile is None:
+        outfile = "%s.bigwig" % os.path.splitext(bam_file)[0]
+    if os.path.abspath(bam_file) == os.path.abspath(outfile):
+        sys.stderr.write("Bad arguments, input and output files are the same.\n")
+        sys.exit(1)
+    if os.path.exists(outfile) and os.path.getsize(outfile) > 0:
+        sys.stderr.write("Warning, output file already exists.\n")
+
+    sizes = get_sizes(bam_file, config)
+    print "Have %i references" % len(sizes)
+    if not sizes:
+        sys.stderr.write("Problem reading BAM header.\n")
+        sys.exit(1)
+
+    #Use a temp file to avoid any possiblity of not having write permission
+    temp_handle = tempfile.NamedTemporaryFile(delete=False)
+    temp_file = temp_handle.name
+    with closing(temp_handle):
+        print "Calculating coverage..."
+        convert_to_graph(bam_file, split, config, temp_handle)
+    try:
+        print "Converting %i MB graph file to bigwig..." % (os.path.getsize(temp_file) // (1024*1024))
+        #Can't pipe this as stdin due to converter design,
+        #https://lists.soe.ucsc.edu/pipermail/genome/2011-March/025455.html
+        convert_to_bigwig(temp_file, sizes, config, outfile)
+    finally:
+        if os.path.isfile(temp_file):
+            os.remove(temp_file)
+    print "Done"
+
+@contextmanager
+def indexed_bam(bam_file, config):
+    if not os.path.exists(bam_file + ".bai"):
+        pysam.index(bam_file)
+    sam_reader = pysam.Samfile(bam_file, "rb")
+    yield sam_reader
+    sam_reader.close()
+
+def get_sizes(bam_file, config):
+    with indexed_bam(bam_file, config) as work_bam:
+        sizes = zip(work_bam.references, work_bam.lengths)
+    return sizes
+
+def convert_to_graph(bam_file, split, config, out_handle):
+    cl = [config["program"]["bedtools_genomeCoverageBed"], "-ibam", bam_file, "-bg"]
+    if split:
+        cl.append("-split")
+    subprocess.check_call(cl, stdout=out_handle)
+
+def convert_to_bigwig(bedgraph_file, chr_sizes, config, bw_file):
+    #This will be fine under Galaxy, but could use temp folder?
+    size_file = "%s-sizes.txt" % (os.path.splitext(bw_file)[0])
+    with open(size_file, "w") as out_handle:
+        for chrom, size in chr_sizes:
+            out_handle.write("%s\t%s\n" % (chrom, size))
+    try:
+        cl = [config["program"]["ucsc_bedGraphToBigWig"], bedgraph_file, size_file, bw_file]
+        subprocess.check_call(cl)
+    finally:
+        os.remove(size_file)
+    return bw_file
+
+if __name__ == "__main__":
+    parser = OptionParser()
+    parser.add_option("-o", "--outfile", dest="outfile")
+    parser.add_option("-s", "--split", action="store_true", dest="split")
+    (options, args) = parser.parse_args()
+    if len(args) not in [1, 2]:
+        print "Incorrect arguments"
+        print __doc__
+        sys.exit()
+    kwargs = dict(
+        outfile=options.outfile,
+        split=options.split)
+    main(*args, **kwargs)
--- a/bam_to_bigwig/bam_to_bigwig.xml	Tue Jun 07 16:26:46 2011 -0400
+++ b/bam_to_bigwig/bam_to_bigwig.xml	Wed Sep 05 21:14:54 2012 -0400
@@ -1,17 +1,23 @@
-<tool id="bam_to_bigwig" name="BAM to BigWig" version="0.0.2">
+<tool id="bam_to_bigwig" name="BAM to BigWig" version="0.1.0">
   <description>Calculates coverage from a BAM alignment file</description>
-  <command interpreter="python">bam_to_wiggle.py $align --outfile=$out</command>
+  <command interpreter="python">bam_to_bigwig.py $align --outfile=$out</command>
   <inputs>
     <param format="bam" name="align" type="data" label="BAM alignment file"/>
   </inputs>
   <outputs>
     <data format="bigwig" name="out" />
   </outputs>
-
+  <requirements>
+        <requirement type="python-module">pysam</requirement>
+        <requirement type="binary">genomeCoverageBed</requirement>
+        <requirement type="binary">bedGraphToBigWig</requirement>
+  </requirements>
 <help>
 **What it does**
 
-Creates a coverage file in BigWig format, given a BAM alignment file. 
+Creates a coverage file in BigWig format, given a BAM alignment file.
+
+Gaps or skips (CIGAR D or N operators) are not counted towards the coverage calculation, which is important when mapping RNA Seq reads to genes with introns.
 
 **Input**
 
@@ -21,5 +27,4 @@
 
 BigWig files can be loaded directly from Galaxy into the UCSC browser. They can be loaded incrementally by UCSC, so a single file can be used to represent the entire genome without having to upload the entire thing as a custom track.
 </help>
-
 </tool>
--- a/bam_to_bigwig/bam_to_wiggle.py	Tue Jun 07 16:26:46 2011 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,123 +0,0 @@
-#!/usr/bin/env python
-"""Convert BAM files to BigWig file format in a specified region.
-
-Usage:
-    bam_to_wiggle.py <BAM file> [<YAML config>]
-    [--outfile=<output file name>
-     --chrom=<chrom>
-     --start=<start>
-     --end=<end>]
-
-chrom start and end are optional, in which case they default to everything.
-
-The config file is in YAML format and specifies the location of the wigToBigWig
-program from UCSC:
-
-program:
-  ucsc_bigwig: wigToBigWig
-
-If not specified, these will be assumed to be present in the system path.
-
-The script requires:
-    pysam (http://code.google.com/p/pysam/)
-    wigToBigWig from UCSC (http://hgdownload.cse.ucsc.edu/admin/exe/)
-If a configuration file is used, then PyYAML is also required (http://pyyaml.org/)
-"""
-import os
-import sys
-import subprocess
-import tempfile
-from optparse import OptionParser
-from contextlib import contextmanager, closing
-
-import pysam
-
-def main(bam_file, config_file=None, chrom='all', start=0, end=None,
-         outfile=None):
-    if config_file:
-        import yaml
-        with open(config_file) as in_handle:
-            config = yaml.load(in_handle)
-    else:
-        config = {"program": {"ucsc_bigwig" : "wigToBigWig"}}
-    if outfile is None:
-        outfile = "%s.bigwig" % os.path.splitext(bam_file)[0]
-    if start > 0:
-        start = int(start) - 1
-    if end is not None:
-        end = int(end)
-    regions = [(chrom, start, end)]
-    if os.path.abspath(bam_file) == os.path.abspath(outfile):
-        sys.stderr.write("Bad arguments, input and output files are the same.\n")
-        sys.exit(1)
-    if not (os.path.exists(outfile) and os.path.getsize(outfile) > 0):
-        #Use a temp file to avoid any possiblity of not having write permission
-        out_handle = tempfile.NamedTemporaryFile(delete=False)
-        wig_file = out_handle.name
-        with closing(out_handle):
-            chr_sizes, wig_valid = write_bam_track(bam_file, regions, config, out_handle)
-        try:
-            if wig_valid:
-                convert_to_bigwig(wig_file, chr_sizes, config, outfile)
-        finally:
-            os.remove(wig_file)
-
-@contextmanager
-def indexed_bam(bam_file, config):
-    if not os.path.exists(bam_file + ".bai"):
-        pysam.index(bam_file)
-    sam_reader = pysam.Samfile(bam_file, "rb")
-    yield sam_reader
-    sam_reader.close()
-
-def write_bam_track(bam_file, regions, config, out_handle):
-    out_handle.write("track %s\n" % " ".join(["type=wiggle_0",
-        "name=%s" % os.path.splitext(os.path.split(bam_file)[-1])[0],
-        "visibility=full",
-        ]))
-    is_valid = False
-    with indexed_bam(bam_file, config) as work_bam:
-        sizes = zip(work_bam.references, work_bam.lengths)
-        if len(regions) == 1 and regions[0][0] == "all":
-            regions = [(name, 0, length) for name, length in sizes]
-        for chrom, start, end in regions:
-            if end is None and chrom in work_bam.references:
-                end = work_bam.lengths[work_bam.references.index(chrom)]
-            assert end is not None, "Could not find %s in header" % chrom
-            out_handle.write("variableStep chrom=%s\n" % chrom)
-            for col in work_bam.pileup(chrom, start, end):
-                out_handle.write("%s %s\n" % (col.pos+1, col.n))
-                is_valid = True
-    return sizes, is_valid
-
-def convert_to_bigwig(wig_file, chr_sizes, config, bw_file=None):
-    if not bw_file:
-        bw_file = "%s.bigwig" % (os.path.splitext(wig_file)[0])
-    size_file = "%s-sizes.txt" % (os.path.splitext(wig_file)[0])
-    with open(size_file, "w") as out_handle:
-        for chrom, size in chr_sizes:
-            out_handle.write("%s\t%s\n" % (chrom, size))
-    try:
-        cl = [config["program"]["ucsc_bigwig"], wig_file, size_file, bw_file]
-        subprocess.check_call(cl)
-    finally:
-        os.remove(size_file)
-    return bw_file
-
-if __name__ == "__main__":
-    parser = OptionParser()
-    parser.add_option("-o", "--outfile", dest="outfile")
-    parser.add_option("-c", "--chrom", dest="chrom")
-    parser.add_option("-s", "--start", dest="start")
-    parser.add_option("-e", "--end", dest="end")
-    (options, args) = parser.parse_args()
-    if len(args) not in [1, 2]:
-        print "Incorrect arguments"
-        print __doc__
-        sys.exit()
-    kwargs = dict(
-        outfile=options.outfile,
-        chrom=options.chrom or 'all',
-        start=options.start or 0,
-        end=options.end)
-    main(*args, **kwargs)