changeset 2:2b474ebbfc7d draft

Uploaded
author devteam
date Tue, 21 Apr 2015 17:37:49 -0400
parents 74a8d2d60258
children a4a10c7924d1
files macros.xml samtools_slice_bam.py samtools_slice_bam.xml test-data/bam-slice-input.bam test-data/bam-slice-test1.bam test-data/bam-slice-test2.bam test-data/bam-slice-test3.bam test-data/bam-slice.bed test-data/gatk/fake_phiX_reads_1.bam test-data/gatk/fake_phiX_variant_locations.bed tool_dependencies.xml
diffstat 11 files changed, 226 insertions(+), 112 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Tue Apr 21 17:37:49 2015 -0400
@@ -0,0 +1,70 @@
+<macros>
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="1.2">samtools</requirement>
+            <yield/>
+        </requirements>
+    </xml>
+    <xml name="citations">
+        <citations>
+            <citation type="bibtex">
+                @misc{SAM_def,
+                title={Definition of SAM/BAM format},
+                url = {https://samtools.github.io/hts-specs/SAMv1.pdf},}
+            </citation>
+            <citation type="doi">10.1093/bioinformatics/btp352</citation>
+            <citation type="doi">10.1093/bioinformatics/btr076</citation>
+            <citation type="doi">10.1093/bioinformatics/btr509</citation>
+            <citation type="bibtex">
+                @misc{Danecek_et_al,
+                Author={Danecek, P. and Schiffels, S. and Durbin, R.},
+                title={Multiallelic calling model in bcftools (-m)},
+                url = {http://samtools.github.io/bcftools/call-m.pdf},}
+            </citation> <!-- BibTeX separates multiple authors with "and", not commas -->
+            <citation type="bibtex">
+                @misc{Durbin_VCQC,
+                Author={Durbin, R.},
+                title={Segregation based metric for variant call QC},
+                url = {http://samtools.github.io/bcftools/rd-SegBias.pdf},}
+            </citation>
+            <citation type="bibtex">
+                @misc{Li_SamMath,
+                Author={Li, H.},
+                title={Mathematical Notes on SAMtools Algorithms},
+                url = {http://www.broadinstitute.org/gatk/media/docs/Samtools.pdf},}
+            </citation>
+            <citation type="bibtex">
+                @misc{SamTools_github,
+                title={SAMTools GitHub page},
+                url = {https://github.com/samtools/samtools},}
+            </citation>
+        </citations>
+    </xml>
+    <xml name="version_command">
+        <version_command>samtools --version | head -n 1 | awk '{ print $2 }'</version_command>
+    </xml>
+    <xml name="stdio">
+        <stdio>
+            <exit_code range="1:" level="fatal" description="Error" />
+        </stdio>
+    </xml>
+    <token name="@no-chrom-options@">
+-----
+
+.. class:: warningmark
+
+**No options available? How to re-detect metadata**
+
+If you see &quot;No options available&quot; in the &quot;**Select references (chromosomes and contigs) you would like to restrict bam to**&quot; drop-down, you need to re-detect metadata for the dataset you are trying to process. To do this, follow these steps:
+
+1. Click on the **pencil** icon adjacent to the dataset in the history
+2. A new menu will appear in the center pane of the interface
+3. Click **Datatype** tab
+4. Set **New Type** to **BAM**
+5. Click **Save**
+
+The metadata will be re-detected and you will be able to see the list of reference sequences in the &quot;**Select references (chromosomes and contigs) you would like to restrict bam to**&quot; drop-down.
+
+    </token>
+
+</macros>
--- a/samtools_slice_bam.py	Thu Mar 27 15:28:06 2014 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,75 +0,0 @@
-#!/usr/bin/env python
-#Dan Blankenberg
-
-"""
-A wrapper script for slicing a BAM file by provided BED file using SAMTools.
-%prog input_filename.sam output_filename.bam
-"""
-#TODO: Confirm that the sort is necessary e.g. if input regions are out of order
-
-
-import sys, optparse, os, tempfile, subprocess, shutil
-
-CHUNK_SIZE = 2**20 #1mb
-
-def cleanup_before_exit( tmp_dir ):
-    if tmp_dir and os.path.exists( tmp_dir ):
-        shutil.rmtree( tmp_dir )
-
-def __main__():
-    #Parse Command Line
-    parser = optparse.OptionParser()
-    (options, args) = parser.parse_args()
-    
-    assert len( args ) == 4, "Invalid command line: samtools_slice_bam.py input.bam input.bam.bai input.interval output.bam"
-    input_bam_filename, input_index_filename, input_interval_filename, output_bam_filename = args
-    
-    tmp_dir = tempfile.mkdtemp( prefix='tmp-samtools_slice_bam-' )
-    
-    tmp_input_bam_filename = os.path.join( tmp_dir, 'input_bam.bam' )
-    os.symlink( input_bam_filename, tmp_input_bam_filename )
-    os.symlink( input_index_filename, "%s.bai" % tmp_input_bam_filename )
-    
-    #Slice BAM
-    unsorted_bam_filename = os.path.join( tmp_dir, 'unsorted.bam' )
-    unsorted_stderr_filename = os.path.join( tmp_dir, 'unsorted.stderr' )
-    cmd = 'samtools view -b -L "%s" "%s" > "%s"' % ( input_interval_filename, tmp_input_bam_filename, unsorted_bam_filename )
-    proc = subprocess.Popen( args=cmd, stderr=open( unsorted_stderr_filename, 'wb' ), shell=True, cwd=tmp_dir )
-    return_code = proc.wait()
-    if return_code:
-        stderr_target = sys.stderr
-    else:
-        stderr_target = sys.stdout
-    stderr = open( unsorted_stderr_filename )
-    while True:
-        chunk = stderr.read( CHUNK_SIZE )
-        if chunk:
-            stderr_target.write( chunk )
-        else:
-            break
-    stderr.close()
-    
-    #sort sam, so indexing will not fail
-    #TODO: confirm if sorting is necessary (is original BAM order maintained, or does the output follow the order of input intervals?)
-    sorted_stderr_filename = os.path.join( tmp_dir, 'sorted.stderr' )
-    sorting_prefix = os.path.join( tmp_dir, 'sorted_bam' )
-    cmd = 'samtools sort -o "%s" "%s" > "%s"' % ( unsorted_bam_filename, sorting_prefix, output_bam_filename )
-    proc = subprocess.Popen( args=cmd, stderr=open( sorted_stderr_filename, 'wb' ), shell=True, cwd=tmp_dir )
-    return_code = proc.wait()
-    
-    if return_code:
-        stderr_target = sys.stderr
-    else:
-        stderr_target = sys.stdout
-    stderr = open( sorted_stderr_filename )
-    while True:
-        chunk = stderr.read( CHUNK_SIZE )
-        if chunk:
-            stderr_target.write( chunk )
-        else:
-            break
-    stderr.close()
-    
-    cleanup_before_exit( tmp_dir )
-
-if __name__=="__main__": __main__()
--- a/samtools_slice_bam.xml	Thu Mar 27 15:28:06 2014 -0400
+++ b/samtools_slice_bam.xml	Tue Apr 21 17:37:49 2015 -0400
@@ -1,40 +1,123 @@
-<tool id="samtools_slice_bam" name="Slice BAM" version="0.0.2">
-  <description>by provided regions</description>
-  <requirements>
-      <requirement type="package" version="0.1.19">samtools</requirement>
-  </requirements>
-  <command interpreter="python">samtools_slice_bam.py
-    "${input_bam}"
-    "${input_bam.metadata.bam_index}"
-    "${input_interval}"
-    "${output_bam}"
-  </command>
-  <inputs>
-    <param name="input_bam" type="data" format="bam" label="BAM file" />
-    <param name="input_interval" type="data" format="bed" label="BED file" />
-  </inputs>
-  <outputs>
-    <data format="bam" name="output_bam"/>
-  </outputs>
-  <tests>
-      <test>
-          <param name="input_bam" value="gatk/fake_phiX_reads_1.bam" ftype="bam" />
-          <param name="input_interval" value="gatk/fake_phiX_variant_locations.bed" ftype="bed" />
-          <output name="output_bam" file="gatk/fake_phiX_reads_1.bam" ftype="bam" />
-      </test>
-  </tests>
-  <help>
+<tool id="samtools_slice_bam" name="Slice" version="2.0">
+  <description>BAM by genomic regions</description>
+  <macros>
+    <import>macros.xml</import>
+  </macros>
+  <!-- <code file="samtools_slice_options.py"/> -->
+  <expand macro="requirements"></expand>
+  <expand macro="stdio"></expand>
+  <expand macro="version_command"></expand>
+    <command>
+<![CDATA[
+    ln -s "${input_bam}" temp_input.bam &&
+    ln -s "${input_bam.metadata.bam_index}" temp_input.bam.bai &&
+
+    #if str($slice_method.slice_method_selector) == "bed":
+    
+        samtools view -@ \${GALAXY_SLOTS:-1} -b -L "${input_interval}" -o unsorted_output.bam temp_input.bam && 
+
+    #elif str($slice_method.slice_method_selector) == "chr":
+
+        samtools view -@ \${GALAXY_SLOTS:-1} -b -o unsorted_output.bam temp_input.bam 
+        ${ ' '.join( map( lambda x:'"%s"' % ( x ), str( $slice_method.refs ).split(",") ) ) } &&
+
+    #elif str($slice_method.slice_method_selector) == "man":
+
+        samtools view -@ \${GALAXY_SLOTS:-1} -b -o unsorted_output.bam temp_input.bam 
+
+        #for $region in $slice_method.regions:
+            "${region.chrom}:${region.start}-${region.end}"
+        #end for
+
+        &&
+
+    #end if
+
+    samtools sort -O bam -T sorted -@ \${GALAXY_SLOTS:-1} -o "${output_bam}" unsorted_output.bam
+]]>
+    </command>
+    <inputs>
+        <param name="input_bam" format="bam" label="Select BAM dataset to slice" type="data" />
+        <conditional name="slice_method">
+            <param name="slice_method_selector" type="select" label="How do you want to slice your dataset?">
+                <option value="bed">using a list of intervals from a BED dataset</option>
+                <option value="chr">by chromosomes/contigs present in the BAM dataset</option>
+                <option value="man">by chromosomes/contigs and coordinates</option>
+            </param>
+            <when value="bed">
+                <param format="bed" label="BED file" name="input_interval" type="data" help="BED datasets can be obtained using &quot;Get Data -> UCSC Main&quot; datasource."/>
+            </when>
+            <when value="chr">
+                <param name="refs" type="select" optional="False" multiple="True" label="Select references (chromosomes and contigs) you would like to restrict bam to" help="Click and type in the box above to see options. You can select multiple entries. If &quot;No options available&quot; is displayed, you need to re-detect metadata on the input dataset. See help section below.">
+
+                    <!-- The options tagset below extracts reference names from bam file metadata -->
+                    <!-- This will not work with bam files with old style metadata. However, this -->
+                    <!-- can be easily fixed by re-detecting metadata on a bam dataset: click the -->
+                    <!-- pencil icon and set the datatype to "bam".                               -->
+                    <!-- This change has been committed in the following pull request:            -->
+                    <!-- https://github.com/galaxyproject/galaxy/pull/107                         -->
+
+                    <options>
+                        <filter type="data_meta" ref="input_bam" key="reference_names" />
+                    </options>
+                </param>
+            </when>
+            <when value="man">
+                <repeat name="regions" title="Regions" min="1">
+                    <param name="chrom" type="select" optional="False" label="Select references (chromosomes and contigs) you would like to restrict bam to" help="Select chromosome/contig from the list. If &quot;No options available&quot; is displayed, you need to re-detect metadata on the input dataset. See help section below.">
+
+                        <!-- See comments above -->
+
+                        <options>
+                            <filter type="data_meta" ref="input_bam" key="reference_names" />
+                        </options>
+                    </param>
+                    <param name="start" type="integer" min="1" value="1" label="Enter START coordinate (1-based)"/> <!-- default must not be below min -->
+                    <param name="end" type="integer" min="1" value="100" label="Enter END coordinate"/>
+                </repeat>
+
+
+            </when>
+        </conditional>
+
+    </inputs>
+    <outputs>
+        <data format="bam" name="output_bam" />
+    </outputs>
+    <tests>
+        <test>
+            <param ftype="bam" name="input_bam" value="bam-slice-input.bam" />
+            <param name="slice_method_selector" value="bed"/>
+            <param ftype="bed" name="input_interval" value="bam-slice.bed" />
+            <output file="bam-slice-test1.bam" ftype="bam" name="output_bam" />
+        </test>
+        <test>
+            <param ftype="bam" name="input_bam" value="bam-slice-input.bam" />
+            <param name="slice_method_selector" value="chr"/>
+            <param name="refs" value="chrM" />
+            <output file="bam-slice-test2.bam" ftype="bam" name="output_bam" />
+        </test>
+        <test>
+            <param ftype="bam" name="input_bam" value="bam-slice-input.bam" />
+            <param name="slice_method_selector" value="man"/>
+            <param name="chrom" value="chrM" />
+            <param name="start" value="1" />
+            <param name="end" value="1000" />
+            <output file="bam-slice-test3.bam" ftype="bam" name="output_bam" />
+        </test>
+    </tests>
+    <help>
+<![CDATA[
+
 **What it does**
 
- Accepts an input BAM file and an input BED file and creates an output BAM file containing only those alignments that overlap the provided BED intervals.
+Restricts (slices) an input BAM dataset to a list of intervals defined in a BED file, to individual chromosomes, or to a manually specified list of coordinates. BED datasets can be obtained from **Get Data -> UCSC Main**.
 
-------
-
-**Citation**
+This tool is based on ``samtools view`` command. 
 
-For the underlying tool, please cite `Li H, Handsaker B, Wysoker A, Fennell T, Ruan J, Homer N, Marth G, Abecasis G, Durbin R; 1000 Genome Project Data Processing Subgroup. The Sequence Alignment/Map format and SAMtools. Bioinformatics. 2009 Aug 15;25(16):2078-9. &lt;http://www.ncbi.nlm.nih.gov/pubmed/19505943&gt;`_
+@no-chrom-options@
 
-If you use this tool in Galaxy, please cite Blankenberg D, et al. *In preparation.*
-
+]]>
   </help>
+    <expand macro="citations"></expand>
 </tool>
Binary file test-data/bam-slice-input.bam has changed
Binary file test-data/bam-slice-test1.bam has changed
Binary file test-data/bam-slice-test2.bam has changed
Binary file test-data/bam-slice-test3.bam has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/bam-slice.bed	Tue Apr 21 17:37:49 2015 -0400
@@ -0,0 +1,38 @@
+chrM	5	1000	myInterval	0	+
+chrM	577	647	TRNF	0	+
+chrM	648	1601	RNR1	0	+
+chrM	1602	1670	TRNV	0	+
+chrM	1671	3229	RNR2	0	+
+chrM	3230	3304	TRNL1	0	+
+chrM	3307	4262	ND1	0	+
+chrM	4263	4331	TRNI	0	+
+chrM	4329	4400	TRNQ	0	-
+chrM	4402	4469	TRNM	0	+
+chrM	4470	5511	ND2	0	+
+chrM	5512	5579	TRNW	0	+
+chrM	5587	5655	TRNA	0	-
+chrM	5657	5729	TRNN	0	-
+chrM	5761	5826	TRNC	0	-
+chrM	5826	5891	TRNY	0	-
+chrM	5904	7445	COX1	0	+
+chrM	7446	7514	TRNS1	0	-
+chrM	7518	7585	TRND	0	+
+chrM	7586	8269	COX2	0	+
+chrM	8295	8364	TRNK	0	+
+chrM	8366	8572	ATP8	0	+
+chrM	8527	9207	ATP6	0	+
+chrM	9207	9990	COX3	0	+
+chrM	9991	10058	TRNG	0	+
+chrM	10059	10404	ND3	0	+
+chrM	10405	10469	TRNR	0	+
+chrM	10470	10766	ND4L	0	+
+chrM	10760	12137	ND4	0	+
+chrM	12138	12206	TRNH	0	+
+chrM	12207	12265	TRNS2	0	+
+chrM	12266	12336	TRNL2	0	+
+chrM	12337	14148	ND5	0	+
+chrM	14149	14673	ND6	0	-
+chrM	14674	14742	TRNE	0	-
+chrM	14747	15887	CYTB	0	+
+chrM	15888	15953	TRNT	0	+
+chrM	15956	16023	TRNP	0	-
Binary file test-data/gatk/fake_phiX_reads_1.bam has changed
--- a/test-data/gatk/fake_phiX_variant_locations.bed	Thu Mar 27 15:28:06 2014 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,2 +0,0 @@
-phiX174	1442	1443
-phiX174	1445	1446
--- a/tool_dependencies.xml	Thu Mar 27 15:28:06 2014 -0400
+++ b/tool_dependencies.xml	Tue Apr 21 17:37:49 2015 -0400
@@ -1,6 +1,6 @@
 <?xml version="1.0"?>
 <tool_dependency>
-    <package name="samtools" version="0.1.19">
-        <repository changeset_revision="1ef76f8d8e52" name="package_samtools_0_1_19" owner="devteam" toolshed="http://toolshed.g2.bx.psu.edu" />
+    <package name="samtools" version="1.2">
+        <repository changeset_revision="6eea04363026" name="package_samtools_1_2" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" />
     </package>
 </tool_dependency>