Galaxy |

Changeset 2:e29bc5c169bc (2015-03-20)

Previous changeset 1:c71dd035971e (2015-01-14) Next changeset 3:607ca4b95837 (2015-03-20)

Commit message:
Uploaded

modified:
bwa-mem.xml
bwa.xml
bwa_macros.xml
tool_dependencies.xml

added:
shed_upload.tar.gz
test-data/bwa-aln-test3.bam
test-data/bwa-mem-test2.bam

diff -r c71dd035971e -r e29bc5c169bc bwa-mem.xml
--- a/bwa-mem.xml Wed Jan 14 13:51:07 2015 -0500
+++ b/bwa-mem.xml Fri Mar 20 12:09:08 2015 -0400

[

b'@@ -1,31 +1,29 @@\n <?xml version="1.0"?>\n-<tool id="bwa_mem" name="BWA-MEM" version="0.1">\n- \n+<tool id="bwa_mem" name="Map with BWA-MEM" version="0.2.1">\n+ <description>- map medium and long reads (> 100 bp) against reference genome</description>\n <macros>\n <import>bwa_macros.xml</import>\n </macros>\n- \n <requirements>\n <requirement type="package" version="0.7.10.039ea20639">bwa</requirement>\n <requirement type="package" version="1.1">samtools</requirement>\n </requirements>\n- <description>- map medium and long reads (> 100 bp) against reference genome</description>\n+ <stdio>\n+ <exit_code range="1:" />\n+ </stdio>\n <command>\n- \n #set $reference_fasta_filename = "localref.fa"\n- \n+\n #if str( $reference_source.reference_source_selector ) == "history":\n- \n ln -s "${reference_source.ref_file}" "${reference_fasta_filename}" &&\n- \n+\n ## The following shell commands decide with of the BWA indexing algorithms (IS or BWTSW) will be run\n ## depending ob the size of the input FASTA dataset\n- \n (\n size=`stat -c %s "${reference_fasta_filename}" 2>/dev/null`; ## Linux\n- if [ $? -eq 0 ]; \n+ if [ $? -eq 0 ];\n then\n- if [ \\$size -lt 2000000000 ]; \n+ if [ "\\$size" -lt 2000000000 ];\n then\n bwa index -a is "${reference_fasta_filename}";\n echo "Generating BWA index with is algorithm";\n@@ -35,10 +33,10 @@\n fi;\n fi;\n \n- eval \\$(stat -s "${reference_fasta_filename}"); ## OSX\n- if [ $? 
-eq 0 ];\n+ eval \\$(stat -s "${reference_fasta_filename}" 2>/dev/null); ## OSX\n+ if [ -n "\\$st_size" ];\n then\n- if [ \\$st_size -lt 2000000000 ];\n+ if [ "\\$st_size" -lt 2000000000 ];\n then\n bwa index -a is "${reference_fasta_filename}";\n echo "Generating BWA index with is algorithm";\n@@ -48,31 +46,28 @@\n fi;\n fi;\n ) &&\n- \n+\n #else:\n #set $reference_fasta_filename = str( $reference_source.ref_file.fields.path )\n #end if\n- \n+\n ## Begin BWA-MEM command line\n- \n+\n bwa mem\n -t "\\${GALAXY_SLOTS:-1}"\n- -v 1 ## Verbosity is set to 1 (errors only) \n- \n+ -v 1 ## Verbosity is set to 1 (errors only)\n+\n #if str( $fastq_input.fastq_input_selector ) == "paired_iv": ## For interleaved fastq files set -p option\n -p\n #if str( $fastq_input.iset_stats ): ## check that insert statistics is used\n -I "${fastq_input.iset_stats}"\n #end if\n #end if\n- \n+\n #if str( $analysis_type.analysis_type_selector ) == "pacbio":\n- -x\n- \n+ -x pacbio\n #elif str( $analysis_type.analysis_type_selector ) == "full":\n- \n- #if str( $analysis_type.algorithmic_options.algorithmic_options_selector ) == "True": ## Algorithmic options\n- \n+ #if str( $analysis_type.algorithmic_options.algorithmic_options_selector ) == "set": ## Algorithmic options\n -k "${analysis_type.algorithmic_options.k}"\n -w "${analysis_type.algorithmic_options.w}"\n -d "${analysis_type.algorithmic_options.d}"\n@@ -85,22 +80,18 @@\n ${analysis_type.algorithmic_options.S}\n ${analysis_type.algorithmic_options.P}\n ${analysis_type.algorithmic_options.e}\n- \n #end if\n- \n- #if str( $analysis_type.scoring_options.scoring_options_selector '..b' <param name="Y" type="boolean" truevalue="-Y" falsevalue="" label="use soft clipping for supplementary alignments" help="-Y"/>\n- <param name="M" type="boolean" truevalue="-M" falsevalue="" label="mark shorter split hits as secondary" help="-M"/>\n+ <param name="T" type="integer" value="30" label="Minimum score to output" help="-T; default=30"/>\n+ <param 
name="h" type="integer" value="5" label="If there are less than THIS VALUE hits with score >80% of the max score, output them all in the XA tag" help="-h; default=5" />\n+ <param name="a" type="boolean" truevalue="-a" falsevalue="" label="Output all alignments for single-ends or unpaired paired-ends" help="-a; These alignments will be flagged as secondary alignments"/>\n+ <param name="C" type="boolean" truevalue="-C" falsevalue="" label="Append FASTA/FASTQ comment to BAM output" help="-C"/>\n+ <param name="V" type="boolean" truevalue="-V" falsevalue="" label="Output the reference FASTA header in the XR tag" help="-C"/>\n+ <param name="Y" type="boolean" truevalue="-Y" falsevalue="" label="Use soft clipping for supplementary alignments" help="-Y; By default, BWA-MEM uses soft clipping for the primary alignment and hard clipping for supplementary alignments" />\n+ <param name="M" type="boolean" truevalue="-M" falsevalue="" label="Mark shorter split hits of a chimeric alignment in the FLAG field as \'secondary alignment\' instead of \'supplementary alignment\'" help="-M; For Picard<1.96 compatibility" />\n </when>\n <when value="do_not_set">\n \n@@ -315,11 +279,11 @@\n </when>\n </conditional>\n </inputs>\n- \n+\n <outputs>\n <data format="bam" name="bam_output" label="${tool.name} on ${on_string} (mapped reads in BAM format)"/>\n </outputs>\n- \n+\n <tests>\n <test>\n <param name="reference_source_selector" value="history" />\n@@ -330,12 +294,19 @@\n <param name="analysis_type_selector" value="illumina"/>\n <output name="bam_output" ftype="bam" file="bwa-mem-test1.bam" lines_diff="2" />\n </test>\n+ <test>\n+ <param name="reference_source_selector" value="history" />\n+ <param name="ref_file" ftype="fasta" value="bwa-mem-mt-genome.fa"/>\n+ <param name="fastq_input_selector" value="paired"/>\n+ <param name="fastq_input1" ftype="fastqsanger" value="bwa-mem-fastq1.fq"/>\n+ <param name="fastq_input2" ftype="fastqsanger" value="bwa-mem-fastq2.fq"/>\n+ <param 
name="rg_selector" value="set"/>\n+ <param name="ID" value="rg1"/>\n+ <param name="analysis_type_selector" value="illumina"/>\n+ <output name="bam_output" ftype="bam" file="bwa-mem-test2.bam" lines_diff="2" />\n+ </test>\n </tests>\n- <stdio>\n- <exit_code range="1:" />\n- </stdio>\n <help>\n- \n **What is does**\n \n From http://arxiv.org/abs/1303.3997:\n@@ -358,7 +329,7 @@\n 1. *Simple Illumina mode*: The simplest possible bwa mem application in which it alignes single or paired-end data to reference using default parameters. It is equivalent to the following command: bwa mem <reference index> <fastq dataset1> [fastq dataset2]\n 2. *PacBio mode*: The mode adjusted specifically for mapping of long PacBio subreads. Equivalent to the following command: bwa mem -k17 -W40 -r10 -A1 -B1 -O1 -E1 -L0 <reference index> <PacBio dataset in fastq format>\n 3. *Full list of options*: Allows access to all options through Galaxy interface.\n- \n+\n ------\n \n **BWA MEM options**\n@@ -407,16 +378,12 @@\n specify the mean, standard deviation (10% of the mean if absent), max\n (4 sigma from the mean if absent) and min of the insert size distribution.\n FR orientation only. [inferred]\n- \n \n @dataset_collections@\n \n @RG@\n \n @info@\n-\n- \n- \n </help>\n <citations>\n <citation type="doi">10.1093/bioinformatics/btp324</citation>\n'

diff -r c71dd035971e -r e29bc5c169bc bwa.xml
--- a/bwa.xml Wed Jan 14 13:51:07 2015 -0500
+++ b/bwa.xml Fri Mar 20 12:09:08 2015 -0400

[

b'@@ -1,220 +1,10 @@\n <?xml version="1.0"?>\n-<tool id="bwa" name="BWA" version="0.1">\n- \n- <requirements>\n- <requirement type="package" version="0.7.10.039ea20639">bwa</requirement>\n- <requirement type="package" version="1.1">samtools</requirement>\n- </requirements>\n+<tool id="bwa" name="Map with BWA" version="0.1">\n <description>- map short reads (< 100 bp) against reference genome</description>\n- <command>\n- \n- #set $reference_fasta_filename = "localref.fa"\n- \n- #if str( $reference_source.reference_source_selector ) == "history":\n- \n- ln -s "${reference_source.ref_file}" "${reference_fasta_filename}" &&\n- \n- ## The following shell commands decide with of the BWA indexing algorithms (IS or BWTSW) will be run\n- ## depending ob the size of the input FASTA dataset\n- \n- (\n- size=`stat -c %s "${reference_fasta_filename}" 2>/dev/null`; ## Linux\n- if [ $? -eq 0 ]; \n- then\n- if [ \\$size -lt 2000000000 ]; \n- then\n- bwa index -a is "${reference_fasta_filename}";\n- else\n- bwa index -a bwtsw "${reference_fasta_filename}";\n- fi;\n- fi;\n-\n- eval \\$(stat -s "${reference_fasta_filename}"); ## OSX\n- if [ $? 
-eq 0 ];\n- then\n- if [ \\$st_size -lt 2000000000 ];\n- then\n- bwa index -a is "${reference_fasta_filename}";\n- echo "Generating BWA index with is algorithm";\n- else\n- bwa index -a bwtsw "${reference_fasta_filename}";\n- echo "Generating BWA index with bwtsw algorithm";\n- fi;\n- fi;\n- ) &&\n- \n- #else:\n- #set $reference_fasta_filename = str( $reference_source.ref_file.fields.path )\n- #end if\n- \n- ## Begin bwa command line\n- \n-####### Fastq paired\n- \n- #if str( $input_type.input_type_selector ) == "paired" or str( $input_type.input_type_selector ) == "paired_collection":\n- \n- bwa aln\n- -t "\\${GALAXY_SLOTS:-1}"\n- \n- @command_options@\n- \n- "${reference_fasta_filename}"\n- \n- #if str( $input_type.input_type_selector ) == "paired_collection":\n- "${input_type.fastq_input1.forward}"\n- #else\n- "${input_type.fastq_input1}"\n- #end if\n- \n- > first.sai &&\n- \n- bwa aln\n- -t "\\${GALAXY_SLOTS:-1}"\n- \n- @command_options@\n- \n- "${reference_fasta_filename}"\n- \n- #if str( $input_type.input_type_selector ) == "paired_collection":\n- "${input_type.fastq_input1.reverse}"\n- #else\n- "${input_type.fastq_input2}"\n- #end if\n-\n- > second.sai &&\n- \n- bwa sampe\n- \n- #if str( $input_type.adv_pe_options.adv_pe_options_selector) == "True":\n- \n- -a ${$input_type.adv_pe_options.a}\n- -o ${$input_type.adv_pe_options.o}\n- -n ${$input_type.adv_pe_options.n}\n- -N ${$input_type.adv_pe_options.N}\n- \n- #end if\n- \n- @read_group_options@\n- \n- #if str( $input_type.input_type_selector ) == "paired_collection":\n- \n- "${reference_fasta_filename}" first.sai second.sai "${input_type.fastq_input1.forward}" "${input_type.fastq_input1.reverse}"\n- \n- #else:\n- \n- "${reference_fasta_filename}" first.sai second.sai "${input_type.fastq_input1}" "${input_type.fastq_input2}"\n- \n- #end if\n- \n-####### Fastq single\n- \n- #elif str( $input_type.input_type_selector ) == "single":\n- \n- bwa aln\n- -t "\\${GALAXY_SLOTS:-1}"\n- \n- @command_options@\n- \n- 
"${reference_fasta_filename}"\n- "${input_type.fastq_input1}"\n- > first.sai &&\n- \n- bwa samse\n- \n- #if str( $input_type.adv_se_options.adv_'..b' integer, or the fraction of missing alignments given 2% uniform base error rate if float. In the latter case, the maximum edit distance is automatically chosen for different read lengths." help="aln -n; default=0.04"/>\n <param name="o" type="integer" value="1" label="maximum number or gap openings" help="aln -o; default=1"/>\n <param name="e" type="integer" value="-1" label="maximum number of gap extensions" help="aln -e; -1 disables long gaps and invokes k-difference mode; default=-1"/>\n@@ -395,15 +352,15 @@\n <param name="R" type="integer" value="30" label="stop searching when there are more than this value of equally best hits" help="aln -R; default=30"/>\n <param name="q" type="integer" value="0" label="quality threshold for read trimming down to 35bp" help="aln -q; default=0"/>\n <param name="B" type="integer" optional="True" label="length of barcode" help="aln -B; optional parameter"/>\n- <param name="L" type="float" optional="True" label="log-scaled gap penalty for long deletions" help="aln -L; optional parameter"/> \n+ <param name="L" type="float" optional="True" label="log-scaled gap penalty for long deletions" help="aln -L; optional parameter"/>\n </when>\n </conditional>\n </inputs>\n- \n+\n <outputs>\n <data format="bam" name="bam_output" label="${tool.name} on ${on_string} (mapped reads in BAM format)"/>\n </outputs>\n- \n+\n <tests>\n <test>\n <param name="reference_source_selector" value="history" />\n@@ -422,12 +379,19 @@\n <param name="analysis_type_selector" value="illumina"/>\n <output name="bam_output" ftype="bam" file="bwa-aln-test2.bam" lines_diff="2" />\n </test>\n+ <test>\n+ <param name="reference_source_selector" value="history" />\n+ <param name="ref_file" ftype="fasta" value="bwa-mem-mt-genome.fa"/>\n+ <param name="input_type_selector" value="paired"/>\n+ <param name="fastq_input1" 
ftype="fastqsanger" value="bwa-mem-fastq1.fq"/>\n+ <param name="fastq_input2" ftype="fastqsanger" value="bwa-mem-fastq2.fq"/>\n+ <param name="rg_selector" value="set"/>\n+ <param name="ID" value="rg1"/>\n+ <param name="analysis_type_selector" value="illumina"/>\n+ <output name="bam_output" ftype="bam" file="bwa-aln-test3.bam" lines_diff="2" />\n+ </test>\n </tests>\n- <stdio>\n- <exit_code range="1:" />\n- </stdio>\n <help>\n- \n **What is does**\n \n BWA is a software package for mapping low-divergent sequences against a large reference genome, such as the human genome. The bwa-aln algorithm is designed for Illumina sequence reads up to 100bp. For longer reads use BWA-MEM algorithm distributed as separate Galaxy tool.\n@@ -437,7 +401,7 @@\n - bwa aln - actual mapper placing reads onto the reference sequence\n - bwa samse - post-processor converting suffix array coordinates into genome coordinates in SAM format for single reads\n - bam sampe - post-processor for paired reads\n- \n+\n Galaxy implementation takes fastq or BAM (unaligned BAM) datasets as input and produces output in BAM (not SAM; in reality SAM produced by the bwa is converted to BAM on the fly by samtools view command) format, which can be further processed using various BAM utilities exiting in Galaxy (BAMTools, SAMTools, Picard).\n \n -----\n@@ -448,7 +412,7 @@\n \n 1. *Simple Illumina mode*: The simplest possible bwa mem application in which it alignes single or paired-end data to reference using default parameters. It is equivalent to the following command: bwa mem <reference index> <fastq dataset1> [fastq dataset2]\n 2. 
*Full list of options*: Allows access to all options through Galaxy interface.\n- \n+\n ------\n \n **bwa-aln options**\n@@ -490,14 +454,12 @@\n \n -n INT maximum hits to output for paired reads [3]\n -r STR read group header line [null]\n- \n \n @dataset_collections@\n \n @RG@\n \n @info@\n- \n </help>\n <citations>\n <citation type="doi">10.1093/bioinformatics/btp324</citation>\n'

diff -r c71dd035971e -r e29bc5c169bc bwa_macros.xml
--- a/bwa_macros.xml Wed Jan 14 13:51:07 2015 -0500
+++ b/bwa_macros.xml Fri Mar 20 12:09:08 2015 -0400

[

@@ -1,6 +1,37 @@
<macros>
+
+  <token name="@set_rg_string@">
+      #set $rg_string = "@RG\tID:" + str($rg.ID) + "\tSM:" + str($rg.SM) + "\tPL:" + str($rg.PL)
+      #if $rg.LB
+        #set $rg_string += "\tLB:$rg.LB"
+      #end if
+      #if $rg.CN
+        #set $rg_string += "\tCN:$rg.CN"
+      #end if
+      #if $rg.DS
+        #set $rg_string += "\tDS:$rg.DS"
+      #end if
+      #if $rg.DT
+        #set $rg_string += "\tDT:$rg.DT"
+      #end if
+      #if $rg.FO
+        #set $rg_string += "\tFO:$rg.FO"
+      #end if
+      #if $rg.KS
+        #set $rg_string += "\tKS:$rg.KS"
+      #end if
+      #if $rg.PG
+        #set $rg_string += "\tPG:$rg.PG"
+      #end if
+      #if str($rg.PI)
+        #set $rg_string += "\tPI:$rg.PI"
+      #end if
+      #if $rg.PU
+        #set $rg_string += "\tPU:$rg.PU"
+      #end if
+  </token>

-    <token name="@RG@">
+  <token name="@RG@">
-----

.. class:: warningmark
@@ -8,9 +39,9 @@
**Read Groups are Important!**

One of the recommended best practices in NGS analysis is adding read group information to BAM files. You can do this directly in the BWA interface using the
-**Specify readgroup information?** widget. If you are not familiar with readgroups you shold know that this is effectively a way to tag reads with an additional ID.
+**Specify read group information?** widget. If you are not familiar with read groups you should know that this is effectively a way to tag reads with an additional ID.
This allows you to combine BAM files from, for example, multiple BWA runs into a single dataset. This significantly simplifies downstream processing as
-instead of dealing with multiple datasets you only have to handle only one. This is possible because the readgroup information allows you to identify
+instead of dealing with multiple datasets you have to handle only one. This is possible because the read group information allows you to identify
data from different experiments even if they are combined in one file. Many downstream analysis tools such as variant callers (e.g., FreeBayes or Naive Variant Caller
present in Galaxy) are aware of read groups and will automatically generate calls for each individual sample even if they are combined within a single file.

@@ -51,8 +82,8 @@
  @RG     ID:FLOWCELL2.LANE4      PL:illumina     LB:LIB-KID-2 SM:KID      PI:400

Note the hierarchical relationship between read groups (unique for each lane) to libraries (sequenced on two lanes) and samples (across four lanes, two lanes for each library).
-    </token>
-    <token name="@info@">
+  </token>
+  <token name="@info@">
-----

.. class:: infomark
@@ -66,9 +97,9 @@
   3. https://github.com/lh3/bwa
   4. http://bio-bwa.sourceforge.net/

-    </token>
+  </token>

-    <token name="@dataset_collections@">
+  <token name="@dataset_collections@">
------

**Dataset collections - processing large numbers of datasets at once**
@@ -76,7 +107,43 @@
This will be added shortly

-    </token>
-
+  </token>
+  <xml name="readgroup_params">
+    <conditional name="rg">
+      <param name="rg_selector" type="select" label="Set read groups information?" help="-R; Specifying read group information can greatly simplify your downstream analyses by allowing combining multiple datasets. See help below for more details">
+        <option value="set">Set</option>
+        <option value="do_not_set" selected="True">Do not set</option>
+      </param>
+      <when value="set">
+        <param name="ID" type="text" value="" size="20" label="Read group identifier (ID)" help="This value must be unique among multiple samples in your experiment">
+          <validator type="empty_field" />
+        </param>
+        <param name="SM" type="text" value="" size="20" label="Read group sample name (SM)" help="This value should be descriptive. Use pool name where a pool is being sequenced" />
+        <param name="PL" type="select" label="Platform/technology used to produce the reads (PL)">
+          <option value="CAPILLARY">CAPILLARY</option>
+          <option value="LS454">LS454</option>
+          <option value="ILLUMINA">ILLUMINA</option>
+          <option value="SOLID">SOLID</option>
+          <option value="HELICOS">HELICOS</option>
+          <option value="IONTORRENT">IONTORRENT</option>
+          <option value="PACBIO">PACBIO</option>
+        </param>
+        <param name="LB" type="text" size="25" label="Library name (LB)" />
+        <param name="CN" type="text" size="25" label="Sequencing center that produced the read (CN)" />
+        <param name="DS" type="text" size="25" label="Description (DS)" />
+        <param name="DT" type="text" size="25" label="Date that run was produced (DT)" help="ISO8601 format date or date/time, like YYYY-MM-DD" />
+        <param name="FO" type="text" size="25" optional="true" label="Flow order (FO)" help="The array of nucleotide bases that correspond to the nucleotides used for each flow of each read. Multi-base flows are encoded in IUPAC format, and non-nucleotide flows by various other characters. Format: /\*|[ACMGRSVTWYHKDBN]+/">
+          <validator type="regex" message="Invalid flow order">\*|[ACMGRSVTWYHKDBN]+$</validator>
+        </param>
+        <param name="KS" type="text" size="25" label="The array of nucleotide bases that correspond to the key sequence of each read (KS)" />
+        <param name="PG" type="text" size="25" label="Programs used for processing the read group (PG)" />
+        <param name="PI" type="integer" optional="true" label="Predicted median insert size (PI)" />
+        <param name="PU" type="text" size="25" label="Platform unit (PU)" help="Unique identifier (e.g. flowcell-barcode.lane for Illumina or slide for SOLiD)" />
+      </when>
+      <when value="do_not_set">
+        
+      </when>
+    </conditional>
+  </xml>

</macros>

diff -r c71dd035971e -r e29bc5c169bc shed_upload.tar.gz

Binary file shed_upload.tar.gz has changed

diff -r c71dd035971e -r e29bc5c169bc test-data/bwa-aln-test3.bam

Binary file test-data/bwa-aln-test3.bam has changed

diff -r c71dd035971e -r e29bc5c169bc test-data/bwa-mem-test2.bam

Binary file test-data/bwa-mem-test2.bam has changed

diff -r c71dd035971e -r e29bc5c169bc tool_dependencies.xml
--- a/tool_dependencies.xml Wed Jan 14 13:51:07 2015 -0500
+++ b/tool_dependencies.xml Fri Mar 20 12:09:08 2015 -0400

@@ -1,9 +1,9 @@
<?xml version="1.0"?>
<tool_dependency>
     <package name="bwa" version="0.7.10.039ea20639">
-        <repository changeset_revision="5b9aca1e1c07" name="package_bwa_0_7_10_039ea20639" owner="devteam" toolshed="http://toolshed.g2.bx.psu.edu" />
+        <repository changeset_revision="5b9aca1e1c07" name="package_bwa_0_7_10_039ea20639" owner="devteam" toolshed="https://toolshed.g2.bx.psu.edu" />
     </package>
     <package name="samtools" version="1.1">
-        <repository changeset_revision="43f2fbec5d52" name="package_samtools_1_1" owner="iuc" toolshed="http://toolshed.g2.bx.psu.edu" />
+        <repository changeset_revision="43f2fbec5d52" name="package_samtools_1_1" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" />
      </package>
</tool_dependency>