diff bwa-mem2.xml @ 0:82217dccdbcf draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/bwa_mem2 commit 7998bbefd9bfd03bc0e92a922297b503832c0419"
author iuc
date Fri, 08 Oct 2021 10:19:48 +0000
parents
children b4a22d90cce9
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bwa-mem2.xml	Fri Oct 08 10:19:48 2021 +0000
@@ -0,0 +1,399 @@
+<tool id="bwa_mem2" name="BWA-MEM2"  version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.01">
+    <description>- map medium and long reads (&gt; 100 bp) against reference genome</description>
+    <macros>
+        <import>read_group_macros.xml</import>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements"/>
+    <expand macro="stdio"/>
+    <expand macro="xrefs"/>
+    <command><![CDATA[
+@pipefail@
+@set_reference_fasta_filename@
+
+## Begin BWA-MEM command line
+
+bwa-mem2 mem
+#if str( $output_sort ) == "unsorted":
+    -t 1
+#else
+    -t "\${GALAXY_SLOTS:-1}"
+#end if
+## Verbosity is set to 1 (errors only)
+-v 1
+
+#if str( $fastq_input.fastq_input_selector ) == "paired_iv":
+    ## For interleaved fastq files set -p option
+    -p
+    ## check that insert statistics is used
+    #if str( $fastq_input.iset_stats ):
+      -I '${fastq_input.iset_stats}'
+    #end if
+#end if
+
+#if str( $analysis_type.analysis_type_selector ) not in ["illumina", "full"]:
+    -x '$analysis_type.analysis_type_selector'
+#elif str( $analysis_type.analysis_type_selector ) == "full":
+    ## Algorithmic options
+    #if str( $analysis_type.algorithmic_options.algorithmic_options_selector ) == "set":
+        -k '${analysis_type.algorithmic_options.k}'
+        -w '${analysis_type.algorithmic_options.w}'
+        -d '${analysis_type.algorithmic_options.d}'
+        -r '${analysis_type.algorithmic_options.r}'
+        -y '${analysis_type.algorithmic_options.y}'
+        -c '${analysis_type.algorithmic_options.c}'
+        -D '${analysis_type.algorithmic_options.D}'
+        -W '${analysis_type.algorithmic_options.W}'
+        -m '${analysis_type.algorithmic_options.m}'
+        ${analysis_type.algorithmic_options.S}
+        ${analysis_type.algorithmic_options.P}
+        ${analysis_type.algorithmic_options.e}
+    #end if
+
+    ## Scoring options
+    #if str( $analysis_type.scoring_options.scoring_options_selector ) == "set":
+        -A '${analysis_type.scoring_options.A}'
+        -B '${analysis_type.scoring_options.B}'
+        -O '${analysis_type.scoring_options.O}'
+        -E '${analysis_type.scoring_options.E}'
+        -L '${analysis_type.scoring_options.L}'
+        -U '${analysis_type.scoring_options.U}'
+    #end if
+
+    ## IO options
+    #if str( $analysis_type.io_options.io_options_selector ) == "set":
+        -T '${analysis_type.io_options.T}'
+        -h '${analysis_type.io_options.h}'
+        ${analysis_type.io_options.a}
+        ${analysis_type.io_options.C}
+        ${analysis_type.io_options.V}
+        ${analysis_type.io_options.Y}
+        ${analysis_type.io_options.M}
+        ${analysis_type.io_options.five}
+        ${analysis_type.io_options.q}
+    #end if
+
+#end if
+
+## Handle read group options...
+@define_read_group_helpers@
+#if str( $fastq_input.fastq_input_selector ) == "paired":
+    #set $rg_auto_name = $read_group_name_default($fastq_input.fastq_input1, $fastq_input.fastq_input2)
+#else:
+    #set $rg_auto_name = $read_group_name_default($fastq_input.fastq_input1)
+#end if
+@set_use_rg_var@
+@set_read_group_vars@
+#if $use_rg
+    @set_rg_string@
+    -R '$rg_string'
+#end if
+
+#if str( $fastq_input.fastq_input_selector ) == "paired":
+    ## check that insert statistics is used
+    #if str( $fastq_input.iset_stats ):
+        -I '${fastq_input.iset_stats}'
+    #end if
+
+    '${reference_fasta_filename}'
+    '${fastq_input.fastq_input1}' '${fastq_input.fastq_input2}'
+#elif str( $fastq_input.fastq_input_selector ) == "paired_collection":
+    ## check that insert statistics is used
+    #if str( $fastq_input.iset_stats ):
+        -I '${fastq_input.iset_stats}'
+    #end if
+
+    '${reference_fasta_filename}'
+    '${fastq_input.fastq_input1.forward}' '${fastq_input.fastq_input1.reverse}'
+#else:
+    '${reference_fasta_filename}'
+    '${fastq_input.fastq_input1}'
+#end if
+
+#if str( $output_sort ) == "coordinate":
+        | samtools sort -@\${GALAXY_SLOTS:-2} -T "\${TMPDIR:-.}" -O bam -o '$bam_output'
+#elif str( $output_sort ) == "name":
+        | samtools sort -n -@\${GALAXY_SLOTS:-2} -T "\${TMPDIR:-.}" -O bam -o '$bam_output'
+#else
+        | samtools view -@ \${GALAXY_SLOTS:-2} -bS - -o '$bam_output'
+#end if
+
+
+    ]]></command>
+
+    <inputs>
+        <expand macro="reference_source_conditional" />
+        <conditional name="fastq_input">
+            <param name="fastq_input_selector" type="select" label="Single or Paired-end reads" help="Select between paired and single end data">
+                <option value="paired">Paired</option>
+                <option value="single">Single</option>
+                <option value="paired_collection">Paired Collection</option>
+                <option value="paired_iv">Paired Interleaved</option>
+            </param>
+            <when value="paired">
+                <param name="fastq_input1" type="data" format="fastqsanger,fastqsanger.gz,fasta" label="Select first set of reads" help="Specify dataset with forward reads"/>
+                <param name="fastq_input2" type="data" format="fastqsanger,fastqsanger.gz,fasta" label="Select second set of reads" help="Specify dataset with reverse reads"/>
+                <param name="iset_stats" type="text" optional="True" label="Enter mean, standard deviation, max, and min for insert lengths." help="-I; This parameter is only used for paired reads. Only mean is required while sd, max, and min will be inferred. Examples: both &quot;250&quot; and &quot;250,25&quot; will work while &quot;250,,10&quot; will not. See below for details.">
+                    <sanitizer invalid_char="">
+                        <valid initial="string.digits"><add value=","/> </valid>
+                    </sanitizer>
+                </param>
+            </when>
+            <when value="single">
+                <param name="fastq_input1" type="data" format="fastqsanger,fastqsanger.gz,fasta" label="Select fastq dataset" help="Specify dataset with single reads"/>
+            </when>
+            <when value="paired_collection">
+                <param name="fastq_input1" format="fastqsanger,fastqsanger.gz,fasta" type="data_collection" collection_type="paired" label="Select a paired collection" help="See help section for an explanation of dataset collections"/>
+                <param name="iset_stats" type="text" optional="True" label="Enter mean, standard deviation, max, and min for insert lengths." help="-I; This parameter is only used for paired reads. Only mean is required while sd, max, and min will be inferred. Examples: both &quot;250&quot; and &quot;250,25&quot; will work while &quot;250,,10&quot; will not. See below for details.">
+                    <sanitizer invalid_char="">
+                        <valid initial="string.digits"><add value=","/> </valid>
+                    </sanitizer>
+                </param>
+            </when>
+            <when value="paired_iv">
+                <param name="fastq_input1" type="data" format="fastqsanger,fastqsanger.gz,fasta" label="Select fastq dataset" help="Specify dataset with interleaved reads"/>
+                <param name="iset_stats" type="text" optional="True" label="Enter mean, standard deviation, max, and min for insert lengths." help="-I; This parameter is only used for paired reads. Only mean is required while sd, max, and min will be inferred. Examples: both &quot;250&quot; and &quot;250,25&quot; will work while &quot;250,,10&quot; will not. See below for details.">
+                    <sanitizer invalid_char="">
+                        <valid initial="string.digits"><add value=","/> </valid>
+                    </sanitizer>
+                </param>
+            </when>
+        </conditional>
+
+        <expand macro="read_group_conditional" />
+
+        <conditional name="analysis_type">
+            <param name="analysis_type_selector" type="select" label="Select analysis mode">
+                <option value="illumina">1.Simple Illumina mode</option>
+                <option value="pacbio">2.PacBio mode (-x pacbio)</option>
+                <option value="ont2d">3.Nanopore 2D-reads mode (-x ont2d)</option>
+                <option value="intractg">4.Intra-species contigs mode (-x intractg)</option>
+                <option value="full">5.Full list of options</option>
+            </param>
+            <when value="illumina">
+                <!-- do nothing -->
+            </when>
+            <when value="pacbio">
+                <!-- do nothing. all magic happens within <command> tag -->
+            </when>
+            <when value="ont2d">
+                <!-- do nothing. all magic happens within <command> tag -->
+            </when>
+            <when value="intractg">
+                <!-- do nothing. all magic happens within <command> tag -->
+            </when>
+            <when value="full">
+                <conditional name="algorithmic_options">
+                    <param name="algorithmic_options_selector" type="select" label="Set algorithmic options?" help="Sets -k, -w, -d, -r, -y, -c, -D, -W, -m, -S, -P, and -e options.">
+                        <option value="set">Set</option>
+                        <option value="do_not_set" selected="True">Do not set</option>
+                    </param>
+                    <when value="set">
+                        <param name="k" type="integer" value="19" label="Minimum seed length" help="-k; default=19"/>
+                        <param name="w" type="integer" value="100" label="Band width for banded alignment" help="-w; default=100"/>
+                        <param name="d" type="integer" value="100" label="Off-diagonal X-dropoff" help="-d; default=100"/>
+                        <param name="r" type="float" value="1.5" label="Look for internal seeds inside a seed longer than -k * THIS VALUE" help="-r; default=1.5; This is a key heuristic parameter for tuning the performance. Larger value yields fewer seeds, which leads to faster alignment speed but lower accuracy" />
+                        <param name="y" type="integer" value="20" label="Seed occurrence for the 3rd round seeding" help="-y; default=20" />
+                        <param name="c" type="integer" value="500" label="Skip seeds with more than that many occurrences" help="-c; default=500"/>
+                        <param name="D" type="float" value="0.5" label="Drop chains shorter than this fraction of the longest overlapping chain" help="-D; default=0.5"/>
+                        <param name="W" type="integer" value="0" label="Discard a chain if seeded bases shorter than THIS VALUE" help="-W; default=0"/>
+                        <param name="m" type="integer" value="50" label="Perform at most this many rounds of mate rescues for each read" help="-m; default=50"/>
+                        <param name="S" type="boolean" truevalue="-S" falsevalue="" label="Skip mate rescue" help="-S"/>
+                        <param name="P" type="boolean" truevalue="-P" falsevalue="" label="Skip pairing; mate rescue performed unless -S also in use" help="-P"/>
+                        <param name="e" type="boolean" truevalue="-e" falsevalue="" label="Discard full-length exact matches" help="-e"/>
+                    </when>
+                    <when value="do_not_set">
+                        <!-- do nothing -->
+                    </when>
+                </conditional>
+
+                <conditional name="scoring_options">
+                    <param name="scoring_options_selector" type="select" label="Set scoring options?" help="Sets -A, -B, -O, -E, -L, and -U options.">
+                        <option value="set">Set</option>
+                        <option value="do_not_set" selected="True">Do not set</option>
+                    </param>
+                    <when value="set">
+                        <param name="A" type="integer" value="1" label="Score for a sequence match" help="-A; scales options -T, -d, -B, -O, -E, -L, and -U unless overridden; default=1"/>
+                        <param name="B" type="integer" value="4" label="Penalty for a mismatch" help="-B; default=4"/>
+                        <param name="O" type="text" value="6,6" label="Gap open penalties for deletions and insertions" help="-O; default=6,6">
+                            <sanitizer invalid_char="">
+                                <valid initial="string.digits"><add value=","/> </valid>
+                            </sanitizer>
+                        </param>
+                        <param name="E" type="text" value="1,1" label="Gap extension penalties; a gap of size k cost &#39;-O + -E*k&#39;. If two numbers are specified, the first is the penalty of extending a deletion and the second for extending an insertion" help="-E; default=1,1">
+                            <sanitizer invalid_char="">
+                                <valid initial="string.digits"><add value=","/> </valid>
+                            </sanitizer>
+                        </param>
+                        <param name="L" type="text" value="5,5" label="Penalties for 5&#39;-end and 3&#39;-end clipping" help="-L; default=5,5; When performing Smith-Waterman extension, BWA-MEM keeps track of the best score reaching the end of query. If this score is larger than the best Smith-Waterman score minus the clipping penalty, clipping will not be applied. Note that in this case, the SAM AS tag reports the best Smith-Waterman score; clipping penalty is not deduced">
+                            <sanitizer invalid_char="">
+                                <valid initial="string.digits"><add value=","/> </valid>
+                            </sanitizer>
+                        </param>
+                        <param name="U" type="integer" value="17" label="Penalty for an unpaired read pair" help="-U; default=17"/>
+                    </when>
+                    <when value="do_not_set">
+                        <!-- do nothing -->
+                    </when>
+                </conditional>
+
+                <conditional name="io_options">
+                    <param name="io_options_selector" type="select" label="Set input/output options" help="Sets -T, -h, -a, -C, -V, -Y, and -M options.">
+                        <option value="set">Set</option>
+                        <option value="do_not_set" selected="True">Do not set</option>
+                    </param>
+                    <when value="set">
+                        <param name="five" argument="-5" type="boolean" truevalue="-5" falsevalue="" label="For split alignment, take alignment with smallest coordinate as primary" help="Useful for HiC data"/>
+                        <param argument="-q" type="boolean" truevalue="-q" falsevalue="" label="Don't lower MAPQ for split alignment" help="By default the MAPQ score of a supplementary alignment will be lowered to the primary alignment score."/>
+                        <param name="T" type="integer" value="30" label="Minimum score to output" help="-T; default=30"/>
+                        <param name="h" type="integer" value="5" label="If there are less than THIS VALUE hits with score &gt;80% of the max score, output them all in the XA tag" help="-h; default=5" />
+                        <param name="a" type="boolean" truevalue="-a" falsevalue="" label="Output all alignments for single-ends or unpaired paired-ends" help="-a; These alignments will be flagged as secondary alignments"/>
+                        <param name="C" type="boolean" truevalue="-C" falsevalue="" label="Append FASTA/FASTQ comment to BAM output" help="-C"/>
+                        <param name="V" type="boolean" truevalue="-V" falsevalue="" label="Output the reference FASTA header in the XR tag" help="-C"/>
+                        <param name="Y" type="boolean" truevalue="-Y" falsevalue="" label="Use soft clipping for supplementary alignments" help="-Y; By default, BWA-MEM uses soft clipping for the primary alignment and hard clipping for supplementary alignments" />
+                        <param name="M" type="boolean" truevalue="-M" falsevalue="" label="Mark shorter split hits of a chimeric alignment in the FLAG field as 'secondary alignment' instead of 'supplementary alignment'" help="-M; For Picard&lt;1.96 compatibility" />
+                    </when>
+                    <when value="do_not_set">
+                        <!-- do nothing -->
+                    </when>
+                </conditional>
+            </when>
+        </conditional>
+        <param name="output_sort" type="select" label="BAM sorting mode" help="The 'Not sorted' option can extend the run time of the tool significantly (cause it requires running on only a single thread).">
+            <option value="coordinate" selected="True">Sort by chromosomal coordinates</option>
+            <option value="name">Sort by read names  (i.e., the QNAME field) </option>
+            <option value="unsorted">Not sorted (sorted as input)</option>
+        </param>
+    </inputs>
+
+    <outputs>
+        <data format="bam" name="bam_output" label="${tool.name} on ${on_string} (mapped reads in BAM format)">
+            <expand macro="dbKeyActionsBwaMem" />
+            <change_format>
+                <when input="output_sort" value="name" format="qname_sorted.bam" />
+                <when input="output_sort" value="unsorted" format="qname_input_sorted.bam" />
+            </change_format>
+        </data>
+    </outputs>
+
+    <tests>
+        <test>
+            <param name="reference_source_selector" value="history" />
+            <param name="ref_file" ftype="fasta" value="bwa-mem-mt-genome.fa"/>
+            <param name="fastq_input_selector" value="paired"/>
+            <param name="fastq_input1" ftype="fastqsanger" value="bwa-mem-fastq1.fq"/>
+            <param name="fastq_input2" ftype="fastqsanger" value="bwa-mem-fastq2.fq"/>
+            <param name="analysis_type_selector" value="illumina"/>
+            <output name="bam_output" ftype="bam" file="bwa-mem-test1.bam" lines_diff="4" />
+        </test>
+        <test>
+            <param name="reference_source_selector" value="history" />
+            <param name="ref_file" ftype="fasta" value="bwa-mem-mt-genome.fa"/>
+            <param name="fastq_input_selector" value="single"/>
+            <param name="fastq_input1" ftype="fastqsanger" value="bwa-mem-fasta1.fa"/>
+            <param name="analysis_type_selector" value="illumina"/>
+            <output name="bam_output" ftype="bam" file="bwa-mem-test1-fasta.bam" lines_diff="4" />
+        </test>
+        <test>
+            <param name="reference_source_selector" value="history" />
+            <param name="ref_file" ftype="fasta" value="bwa-mem-mt-genome.fa"/>
+            <param name="fastq_input_selector" value="paired"/>
+            <param name="fastq_input1" ftype="fastqsanger.gz" value="bwa-mem-fastq1.fq.gz"/>
+            <param name="fastq_input2" ftype="fastqsanger" value="bwa-mem-fastq2.fq"/>
+            <param name="analysis_type_selector" value="illumina"/>
+            <output name="bam_output" ftype="bam" file="bwa-mem-test1.bam" lines_diff="4" />
+        </test>
+        <test>
+            <param name="reference_source_selector" value="history" />
+            <param name="ref_file" ftype="fasta" value="bwa-mem-mt-genome.fa"/>
+            <param name="index_a" value="is"/>
+            <param name="fastq_input_selector" value="paired"/>
+            <param name="fastq_input1" ftype="fastqsanger" value="bwa-mem-fastq1.fq"/>
+            <param name="fastq_input2" ftype="fastqsanger" value="bwa-mem-fastq2.fq"/>
+            <param name="rg_selector" value="set"/>
+            <param name="ID" value="rg1"/>
+            <param name="PL" value="CAPILLARY"/>
+            <param name="LB" value="AARDVARK-1" />
+            <param name="analysis_type_selector" value="illumina"/>
+            <output name="bam_output" ftype="bam" file="bwa-mem-test2.bam" lines_diff="4" />
+        </test>
+        <test>
+            <param name="reference_source_selector" value="history" />
+            <param name="ref_file" ftype="fasta" value="bwa-mem-mt-genome.fa"/>
+            <param name="fastq_input_selector" value="paired"/>
+            <param name="fastq_input1" ftype="fastqsanger" value="bwa-mem-fastq1.fq"/>
+            <param name="fastq_input2" ftype="fastqsanger" value="bwa-mem-fastq2.fq"/>
+            <param name="analysis_type_selector" value="illumina"/>
+            <param name="output_sort" value="unsorted"/>
+            <output name="bam_output" ftype="qname_input_sorted.bam" file="bwa-mem-test3.bam" lines_diff="4" />
+        </test>
+        <test>
+            <param name="reference_source_selector" value="history" />
+            <param name="ref_file" ftype="fasta" value="bwa-mem-mt-genome.fa"/>
+            <param name="fastq_input_selector" value="paired"/>
+            <param name="fastq_input1" ftype="fastqsanger" value="bwa-mem-fastq1.fq"/>
+            <param name="fastq_input2" ftype="fastqsanger" value="bwa-mem-fastq2.fq"/>
+            <param name="analysis_type_selector" value="illumina"/>
+            <param name="output_sort" value="name"/>
+            <output name="bam_output" ftype="qname_sorted.bam" file="bwa-mem-test4.bam" lines_diff="4" />
+        </test>
+        <test>
+            <param name="reference_source_selector" value="cached" />
+            <param name="ref_file" value="mtgenome"/>
+            <param name="fastq_input_selector" value="paired"/>
+            <param name="fastq_input1" ftype="fastqsanger" value="bwa-mem-fastq1.fq"/>
+            <param name="fastq_input2" ftype="fastqsanger" value="bwa-mem-fastq2.fq"/>
+            <param name="analysis_type_selector" value="illumina"/>
+            <output name="bam_output" ftype="bam" file="bwa-mem-test1.bam" lines_diff="4" />
+        </test>
+    </tests>
+    <help><![CDATA[
+**What is does**
+BWA-MEM2 is the new version of the bwa-mem algorithm in bwa. It produces alignment identical to bwa and is ~1.3-3.1x faster depending on the use-case, dataset and the running machine.
+The algorithm is robust to sequencing errors and applicable to a wide range of sequence lengths from 70bp to a few megabases.
+
+The Galaxy implementation takes fastq files as input and produces output in BAM format, which can be further processed using various BAM utilities exiting in Galaxy (BAMTools, SAMTools, Picard).
+
+-----
+
+**Indices: Selecting reference genomes for BWA**
+
+Galaxy wrapper for BWA allows you select between precomputed and user-defined indices for reference genomes using **Will you select a reference genome from your history or use a built-in index?** flag. This flag has two options:
+
+  1. **Use a built-in genome index** - when selected (this is default), Galaxy provides the user with **Select reference genome index** dropdown. Genomes listed in this dropdown have been pre-indexed with bwa index utility and are ready to be mapped against.
+  2. **Use a genome from the history and build index** - when selected, Galaxy provides the user with **Select reference genome sequence** dropdown. This dropdown is populated by all FASTA formatted files listed in your current history. If your genome of interest is uploaded into history it will be shown there. Selecting a genome from this dropdown will cause Galaxy to first transparently index it using `bwa index` command, and then run mapping with `bwa mem`.
+
+If your genome of interest is not listed here you have two choices:
+
+  1. Contact galaxy team using **Help->Support** link at the top of the interface and let us know that an index needs to be added
+  2. Upload your genome of interest as a FASTA file to Galaxy history and selected **Use a genome from the history and build index** option.
+
+-----
+
+**Galaxy-specific option**
+
+Galaxy allows four levels of control over bwa-mem options provided by **Select analysis mode** menu option. These are:
+
+  1. *Simple Illumina mode*: The simplest possible bwa mem application in which it alignes single or paired-end data to reference using default parameters. It is equivalent to the following command: bwa mem <reference index> <fastq dataset1> [fastq dataset2]
+  2. *PacBio mode*: The mode adjusted specifically for mapping of long PacBio subreads. Equivalent to the following command: bwa mem -k17 -W40 -r10 -A1 -B1 -O1 -E1 -L0  <reference index> <PacBio dataset in fastq format>
+  3. *Full list of options*: Allows access to all options through Galaxy interface.
+
+-----
+
+**Bam sorting mode**
+
+The generated bam files can be sorted according to three criteria: coordinates, names and input order.
+
+In coordinate sorted mode the reads are sorted by coordinates. It means that the reads from the beginning of the first chromosome are first in the file. 
+
+When sorted by read name, the file is sorted by the reference ID (i.e., the QNAME field). 
+
+Finally, the *No sorted (sorted as input)* option yield a BAM file in which the records are sorted in an order corresponding to the order of the reads in the original input file. This option requires using a single thread to perform the conversion from SAM to BAM format, so the runtime is extended.
+
+
+@RG@
+
+@info@
+    ]]></help>
+    <expand macro="citations" />
+</tool>