Mercurial > repos > wolma > mimodd_aln
view snap_caller.xml @ 0:d801b0675eb5 draft
planemo upload for repository https://github.com/wm75/mimodd_galaxy_wrappers commit b36048cd608ede0ec6f6559648525c9350caae34-dirty
author | wolma |
---|---|
date | Sat, 11 Nov 2017 18:18:54 -0500 |
parents | |
children | e76e813f615a |
line wrap: on
line source
<tool id="mimodd_align" name="MiModD Read Alignment" version="@MIMODD_WRAPPER_VERSION@">
  <!--
    Galaxy wrapper around "mimodd snap-batch": builds one quoted "snap ..."
    sub-command per entry of the "datasets" repeat and writes the alignments
    of all samples into a single SAM/BAM output dataset.
  -->
  <description>maps sequence reads to a reference genome using SNAP</description>
  <macros>
    <import>macros.xml</import>
    <!-- mandatory metadata source; used for the fastq input branches -->
    <macro name="require_metadata">
      <param name="header" type="data" format="sam" label="metadata source for this sample" />
    </macro>
    <!-- input dataset of format @FORMAT@ plus an optional metadata source; used for the SAM/BAM input branches -->
    <macro name="sam_bam_selector" token_format="sam">
      <param name="ifile" type="data" format="@FORMAT@" label="input file"/>
      <param name="header" type="data" format="sam" optional="true" label="(optional) metadata source for this sample" help="a SAM format dataset providing information about the sequences in the input data in its header; do NOT provide this dataset if the information is already part of your input dataset unless you want to have the original metadata overwritten. If needed, a metadata source dataset can be generated with the MiModD Run Annotation tool." />
    </macro>
  </macros>
  <expand macro="requirements" />
  <expand macro="stdio" />
  <expand macro="version_command" />
  <command><![CDATA[
  ## Currently Galaxy does not autoconvert collections of fastq.gz files.
  ## This tool wrapper fixes that by allowing fastq and fastq.gz as input
  ## collection formats.
  ## gz_input is then used as flag to indicate a fastq.gz input file

  mimodd snap-batch -s

  #if str($reference.source) == "cached":
    #set ref_genome = $reference.genome.fields.path
  #else:
    #set ref_genome = $reference.genome
  #end if

  #for $i in $datasets
    ## Reset the gz flag for EVERY dataset. If it were set only once before
    ## the loop, a single gzipped collection would make all subsequent
    ## datasets of the batch get "--iformat gz" as well.
    #set gz_input = False
    "snap ${i.mode_choose.mode} '$ref_genome'
    #if $str($i.mode_choose.mode) == "paired" and $str($i.mode_choose.input.iformat) == "fastq":
      #if $str($i.mode_choose.input.pe_source.type) == 'collection':
        ## PE input provided as a paired collection - if the forward
        ## dataset is gzipped we assume the reverse dataset is too.
        '${i.mode_choose.input.pe_source.input_data.forward}' '${i.mode_choose.input.pe_source.input_data.reverse}'
        #if $i.mode_choose.input.pe_source.input_data.forward.is_of_type('fastq.gz'):
          #set gz_input = True
        #end if
      #else
        ## PE input provided as separate fastq datasets
        '${i.mode_choose.input.pe_source.ifile1}' '${i.mode_choose.input.pe_source.ifile2}'
      #end if
    #else:
      ## Input is either SE data or not in fastq format =>
      ## only one input dataset
      '${i.mode_choose.input.ifile}'
    #end if
    #if $gz_input:
      ## a gzipped fastq input dataset was encountered
      --iformat gz
    #else
      --iformat ${i.mode_choose.input.iformat}
    #end if
    --ofile '$ofile'
    --oformat ${output_options.oformat}
    ${output_options.sort}
    ${output_options.explicit_mmatch_notation}
    --idx-seedsize $indexing.seedsize
    --idx-slack $indexing.slack
    --idx-overflow $indexing.overflow

    ## Sample-specific alignment options take precedence over the global
    ## ones of the matching mode (single/paired).
    ## NOTE(review): the checks below are truthiness tests, so a
    ## sample-specific value of 0 falls back to the global setting.
    #set $aln_spec = $i.mode_choose.aln_options
    #if $str($i.mode_choose.mode) == "paired":
      #set $aln_global = $alignment.paired
      #set $treat_overlaps = $aln_spec.discard_overlapping_mates or $aln_global.discard_overlapping_mates
      --spacing #if $aln_spec.sp_min then $aln_spec.sp_min else $aln_global.sp_min
      #if $aln_spec.sp_max then $aln_spec.sp_max else $aln_global.sp_max
    #else
      #set $aln_global = $alignment.single
      #set $treat_overlaps = ""
    #end if
    --maxseeds #if $aln_spec.maxseeds then $aln_spec.maxseeds else $aln_global.maxseeds
    --maxhits #if $aln_spec.maxhits then $aln_spec.maxhits else $aln_global.maxhits
    --clipping #if $aln_spec.clipping then $aln_spec.clipping else $aln_global.clipping
    --maxdist #if $aln_spec.maxdist then $aln_spec.maxdist else $aln_global.maxdist
    --confdiff #if $aln_spec.confdiff then $aln_spec.confdiff else $aln_global.confdiff
    --confadapt #if $aln_spec.confadpt then $aln_spec.confadpt else $aln_global.confadpt
    #if $i.mode_choose.input.header:
      --header '${i.mode_choose.input.header}'
    #end if
    --selectivity $output_options.selectivity
    #if $str($output_options.filter_output) != "off":
      --filter-output $output_options.filter_output
    #end if
    #if $treat_overlaps:
      --discard-overlapping-mates
      ## remove ',' (and possibly adjacent whitespace) and replace with ' '
      '#echo ("' '".join($treat_overlaps.replace(" ", "").split(',')))#'
    #end if
    --verbose"
  #end for
  ]]></command>

  <inputs>
    <conditional name="reference">
      <param name="source" type="select" label="Will you select a reference genome from your history or use a built-in genome?">
        <option value="cached">Use a built-in genome</option>
        <option value="history">Use a genome from my history</option>
      </param>
      <when value="cached">
        <param name="genome" type="select" label="reference genome" help="The fasta reference genome that SNAP should align reads against.">
          <options from_data_table="all_fasta" />
        </param>
      </when>
      <when value="history">
        <param name="genome" type="data" format="fasta" label="reference genome" help="The fasta reference genome that SNAP should align reads against."/>
      </when>
    </conditional>

    <section name="indexing" title="Parameters affecting reference genome indexing" expanded="false">
      <param name="seedsize" type="integer" value="20" label="seed size (default: 20)" help="Length of the seeds used in the reference genome hash table (SNAP index option -s)."/>
      <param name="slack" type="float" value="0.3" label="hash table slack size (default: 0.3)" help="Corresponds to the -h option of SNAP index."/>
      <param name="overflow" type="integer" min="1" max="1000" value="40" label="index overflow factor (default: 40)" help="Factor (between 1 and 1000) that controls the size of the index build overflow space. For certain genomes you may have to increase this value if you are getting a corresponding error from the tool." />
    </section>

    <!-- global alignment defaults; each dataset may override them in its own aln_options section -->
    <section name="alignment" title="Alignment parameters" expanded="false" help="The global alignment parameters in this section will be used for samples for which you do not provide their own sample-specific settings.">
      <section name="single" title="Parameters applied to single-end samples" help="These parameters will affect the alignments for any single-end sample for which you do not provide sample-specific settings.">
        <param name="maxdist" type="integer" value="8" label="edit distance (default: 8)" help="maximum edit distance allowed per read or pair (SNAP option -d); higher values allow more divergent alignments to be found, but increase the rate of misalignments."/>
        <param name="confdiff" type="integer" value="2" label="confidence threshold (default: 2)" help="Confidence threshold (SNAP option -c); the minimum edit distance difference between two alternate alignments required to reject the poorer alignment as suboptimal; higher values increase the rate of ambiguously aligned reads."/>
        <param name="confadpt" type="integer" value="7" label="adaptive confdiff behaviour (default: 7)" help="Specifies how many seeds of a read may be ignored (based on the maximum hits value above) before the confidence threshold above gets increased by one for that read (SNAP option -a); helps fine-tuning alignment accuracy in repetitive regions of the genome."/>
        <param name="maxseeds" type="integer" value="25" label="maximum seeds per read (default: 25)" help="Number of seeds to use per read (SNAP option -n) when trying to match it to the reference genome; higher numbers will increase the rate of aligned reads and reduce the rate of misalignments, but will reduce performance."/>
        <param name="maxhits" type="integer" value="250" label="maximum hits per seed (default: 250)" help="Maximum hits to consider per seed (SNAP option -h); don't use a seed region in the alignment process if it matches more than maxhits regions in the reference genome. Higher values reduce the rate of misalignments, but reduce performance."/>
        <param name="clipping" type="select" display="radio" label="read clipping (default: from back and front)" help="Specifies from which end of a read low-quality bases should be clipped (SNAP option -Cxx)">
          <option value="++">from back and front</option>
          <option value="x+">from back only</option>
          <option value="+x">from front only</option>
          <option value="xx">no clipping</option>
        </param>
      </section>
      <section name="paired" title="Parameters applied to paired-end samples" help="These parameters will affect the alignments for any paired-end sample for which you do not provide sample-specific settings.">
        <param name="sp_min" type="integer" value="100" label="minimum spacing to allow between paired ends (default: 100)" help="Corresponds to the first value of the SNAP option -s."/>
        <param name="sp_max" type="integer" value="10000" label="maximum spacing to allow between paired ends (default: 10000)" help="Corresponds to the second value of the SNAP option -s."/>
        <param name="discard_overlapping_mates" type="text" optional="true" label="discard overlapping read pairs of type" help="Consider overlapping mate pairs of the given orientation type(s) anomalous and discard them; allowed values: RF, FR, FF, RR; multiple types may be specified as a comma-separated list and ALL can be used as a shortcut for discarding all overlapping mate pairs; leave blank to retain all overlapping pairs." />
        <param name="maxdist" type="integer" value="8" label="edit distance (default: 8)" help="maximum edit distance allowed per read or pair (SNAP option -d); higher values allow more divergent alignments to be found, but increase the rate of misalignments."/>
        <param name="confdiff" type="integer" value="2" label="confidence threshold (default: 2)" help="Confidence threshold (SNAP option -c); the minimum edit distance difference between two alternate alignments required to reject the poorer alignment as suboptimal; higher values increase the rate of ambiguously aligned reads."/>
        <param name="confadpt" type="integer" value="7" label="adaptive confdiff behaviour (default: 7)" help="Specifies how many seeds of a read may be ignored (based on the maximum hits value above) before the confidence threshold above gets increased by one for that read (SNAP option -a); helps fine-tuning alignment accuracy in repetitive regions of the genome."/>
        <param name="maxseeds" type="integer" value="25" label="maximum seeds per read (default: 25)" help="Number of seeds to use per read (SNAP option -n) when trying to match it to the reference genome; higher numbers will increase the rate of aligned reads and reduce the rate of misalignments, but will reduce performance."/>
        <param name="maxhits" type="integer" value="250" label="maximum hits per seed (default: 250)" help="Maximum hits to consider per seed (SNAP option -h); don't use a seed region in the alignment process if it matches more than maxhits regions in the reference genome. Higher values reduce the rate of misalignments, but reduce performance."/>
        <param name="clipping" type="select" display="radio" label="read clipping (default: from back and front)" help="Specifies from which end of a read low-quality bases should be clipped (SNAP option -Cxx)">
          <option value="++">from back and front</option>
          <option value="x+">from back only</option>
          <option value="+x">from front only</option>
          <option value="xx">no clipping</option>
        </param>
      </section>
    </section>

    <!-- the "default" branch supplies hidden parameters so that the command template can reference the same names in both branches -->
    <conditional name="output_options">
      <param name="config" type="select" label="Output options" help="No matter how many input datasets you specify below and what their formats are, this tool will produce a single output file with the aligned reads from all samples. In this section you can configure some aspects of what the output should look like. Unless you have a really special usecase, you can (and probably should) just go with the default settings though.">
        <option value="default">Just go with the defaults</option>
        <option value="custom">Show detailed output options</option>
      </param>
      <when value="default">
        <param name="oformat" type="hidden" value="bam" />
        <param name="sort" type="hidden" value=""/>
        <param name="explicit_mmatch_notation" type="hidden" value=""/>
        <param name="filter_output" type="hidden" value="off"/>
        <param name="selectivity" type="hidden" value="1"/>
      </when>
      <when value="custom">
        <param name="oformat" type="select" display="radio" label="Output format">
          <option value="bam">BAM</option>
          <option value="sam">SAM</option>
        </param>
        <param name="sort" type="boolean" falsevalue="--no-sort" truevalue="" checked="true" label="Sort aligned reads in the output by coordinates" help="Turn off if you want to retain the read order of the input file(s) (mimodd snap option --no-sort)." />
        <param name="explicit_mmatch_notation" type="boolean" truevalue="-X" falsevalue="" checked="false" label="Use = and X to indicate matches/mismatches in CIGAR strings explicitly instead of using M for both" help="Warning: Downstream tools may still rely on the classic M notation! Turn this on at your own risk (mimodd snap option -X)." />
        <param name="selectivity" type="integer" min="1" value="1" label="selectivity (default: 1)" help="randomly choose 1/selectivity of the reads to score (SNAP option -S). The default of 1 indicates that all reads should be worked with." />
        <param name="filter_output" type="select" display="radio" label="filter output (default: no filtering)" help="filter output (SNAP option -F) to retain only specific classes of reads.">
          <option value="off">no filtering</option>
          <option value="a">aligned only</option>
          <option value="s">single-aligned only</option>
          <option value="u">unaligned only</option>
        </param>
      </when>
    </conditional>

    <!-- one entry per input sample; each entry becomes one "snap ..." sub-command -->
    <repeat name="datasets" title="datasets" default="1" min="1">
      <conditional name="mode_choose">
        <param name="mode" type="select" label="choose mode" help="Reads obtained from single-end sequencing runs should be aligned in 'single' mode, paired-end reads in 'paired' mode. **WARNING**: if the read input file is in SAM/BAM format, the current version of this tool will **not** verify the mode and may produce erroneous alignments with wrong settings!">
          <option value="single">single-end</option>
          <option value="paired">paired-end</option>
        </param>
        <when value="single">
          <conditional name="input">
            <param name="iformat" type="select" label="input file format">
              <option value="bam">BAM</option>
              <option value="sam">SAM</option>
              <option value="fastq">fastq</option>
            </param>
            <when value="bam">
              <expand macro="sam_bam_selector" format="bam" />
            </when>
            <when value="sam">
              <expand macro="sam_bam_selector" format="sam" />
            </when>
            <when value="fastq">
              <param name="ifile" type="data" format="fastq" label="input file"/>
              <expand macro="require_metadata" />
            </when>
          </conditional>
          <section name="aln_options" title="Alignment options for this sample" expanded="false" help="Any options you specify here will overwrite the global alignment settings defined for all single-end samples above.">
            <param name="maxdist" type="integer" optional="true" value="" label="edit distance" help="maximum edit distance allowed per read or pair (SNAP option -d); higher values allow more divergent alignments to be found, but increase the rate of misalignments."/>
            <param name="confdiff" type="integer" optional="true" value="" label="confidence threshold" help="Confidence threshold (SNAP option -c); the minimum edit distance difference between two alternate alignments required to reject the poorer alignment as suboptimal; higher values increase the rate of ambiguously aligned reads."/>
            <param name="confadpt" type="integer" optional="true" value="" label="adaptive confdiff behaviour" help="Specifies how many seeds of a read may be ignored (based on the maximum hits value above) before the confidence threshold above gets increased by one for that read (SNAP option -a); helps fine-tuning alignment accuracy in repetitive regions of the genome."/>
            <param name="maxseeds" type="integer" optional="true" value="" label="maximum seeds per read" help="Number of seeds to use per read (SNAP option -n) when trying to match it to the reference genome; higher numbers will increase the rate of aligned reads and reduce the rate of misalignments, but will reduce performance."/>
            <param name="maxhits" type="integer" optional="true" value="" label="maximum hits per seed" help="Maximum hits to consider per seed (SNAP option -h); don't use a seed region in the alignment process if it matches more than maxhits regions in the reference genome. Higher values reduce the rate of misalignments, but reduce performance."/>
            <param name="clipping" type="select" display="radio" label="read clipping (default: from back and front)" help="Specifies from which end of a read low-quality bases should be clipped (SNAP option -Cxx)">
              <option value="">use global setting</option>
              <option value="++">from back and front</option>
              <option value="x+">from back only</option>
              <option value="+x">from front only</option>
              <option value="xx">no clipping</option>
            </param>
          </section>
        </when>
        <when value="paired">
          <conditional name="input">
            <param name="iformat" type="select" label="input file format">
              <option value="bam">BAM</option>
              <option value="sam">SAM</option>
              <option value="fastq">fastq</option>
            </param>
            <when value="bam">
              <expand macro="sam_bam_selector" format="bam" />
            </when>
            <when value="sam">
              <expand macro="sam_bam_selector" format="sam" />
            </when>
            <when value="fastq">
              <conditional name="pe_source">
                <param name="type" type="select" label="the paired-end fastq input is provided as">
                  <option value="individual">Individual datasets</option>
                  <option value="collection">a Paired collection</option>
                </param>
                <when value="individual">
                  <param name="ifile1" type="data" format="fastq" label="inputfile with the first set of reads of paired-end data"/>
                  <param name="ifile2" type="data" format="fastq" label="inputfile with the second set of reads of paired-end data"/>
                </when>
                <when value="collection">
                  <param name="input_data" type="data_collection" collection_type="paired" format="fastq, fastq.gz" label="paired input dataset collection"/>
                </when>
              </conditional>
              <expand macro="require_metadata" />
            </when>
          </conditional>
          <section name="aln_options" title="Alignment options for this sample" expanded="false" help="Any options you specify here will overwrite the global alignment settings defined for all paired-end samples above.">
            <param name="sp_min" type="integer" optional="true" value="0" label="minimum spacing to allow between paired ends" help="Corresponds to the first value of the SNAP option -s."/>
            <param name="sp_max" type="integer" optional="true" value="0" label="maximum spacing to allow between paired ends" help="Corresponds to the second value of the SNAP option -s."/>
            <param name="discard_overlapping_mates" type="text" optional="true" value="" label="discard overlapping read pairs of type" help="Consider overlapping mate pairs of the given orientation type(s) anomalous and discard them; allowed values: RF, FR, FF, RR; multiple types may be specified as a comma-separated list and ALL can be used as a shortcut for discarding all overlapping mate pairs; leave blank to retain all overlapping pairs." />
            <param name="maxdist" type="integer" optional="true" value="0" label="edit distance" help="maximum edit distance allowed per read or pair (SNAP option -d); higher values allow more divergent alignments to be found, but increase the rate of misalignments."/>
            <param name="confdiff" type="integer" optional="true" value="" label="confidence threshold" help="Confidence threshold (SNAP option -c); the minimum edit distance difference between two alternate alignments required to reject the poorer alignment as suboptimal; higher values increase the rate of ambiguously aligned reads."/>
            <param name="confadpt" type="integer" optional="true" value="" label="adaptive confdiff behaviour" help="Specifies how many seeds of a read may be ignored (based on the maximum hits value above) before the confidence threshold above gets increased by one for that read (SNAP option -a); helps fine-tuning alignment accuracy in repetitive regions of the genome."/>
            <param name="maxseeds" type="integer" optional="true" value="" label="maximum seeds per read" help="Number of seeds to use per read (SNAP option -n) when trying to match it to the reference genome; higher numbers will increase the rate of aligned reads and reduce the rate of misalignments, but will reduce performance."/>
            <param name="maxhits" type="integer" optional="true" value="" label="maximum hits per seed" help="Maximum hits to consider per seed (SNAP option -h); don't use a seed region in the alignment process if it matches more than maxhits regions in the reference genome. Higher values reduce the rate of misalignments, but reduce performance."/>
            <param name="clipping" type="select" display="radio" label="read clipping (default: from back and front)" help="Specifies from which end of a read low-quality bases should be clipped (SNAP option -Cxx)">
              <option value="">use global setting</option>
              <option value="++">from back and front</option>
              <option value="x+">from back only</option>
              <option value="+x">from front only</option>
              <option value="xx">no clipping</option>
            </param>
          </section>
        </when>
      </conditional>
    </repeat>
  </inputs>

  <outputs>
    <!-- BAM by default; switched to SAM when output_options.oformat is "sam" -->
    <data name="ofile" format="bam" label="Aligned reads from MiModd ${tool.name} on ${on_string}">
      <change_format>
        <when input="output_options.oformat" value="sam" format="sam"/>
      </change_format>
      <actions>
        <!-- for built-in genomes, set the output's dbkey metadata from the matching all_fasta data table row -->
        <conditional name="reference.source">
          <when value="cached">
            <action type="metadata" name="dbkey">
              <option type="from_data_table" name="all_fasta" column="1" offset="0">
                <filter type="param_value" ref="reference.genome" column="0" />
              </option>
            </action>
          </when>
        </conditional>
      </actions>
    </data>
  </outputs>

  <tests>
    <!-- single-end BAM input with all defaults -->
    <test>
      <conditional name="reference">
        <param name="source" value="history" />
        <param name="genome" value="a.fa" />
      </conditional>
      <repeat name="datasets">
        <conditional name="mode_choose">
          <param name="mode" value="single" />
          <conditional name="input">
            <param name="iformat" value="bam" />
            <param name="ifile" value="a_part1.bam" />
          </conditional>
        </conditional>
      </repeat>
      <assert_command>
        <has_text text="--iformat bam" />
        <has_text text="--oformat bam" />
        <has_text text="--idx-seedsize 20" />
        <has_text text="--idx-slack 0.3" />
        <has_text text="--idx-overflow 40" />
        <has_text text="--maxseeds 25" />
        <has_text text="--maxhits 250" />
        <has_text text="--clipping ++" />
        <has_text text="--maxdist 8" />
        <has_text text="--confdiff 2" />
        <has_text text="--confadapt 7" />
        <has_text text="--selectivity 1" />
      </assert_command>
    </test>
    <!-- sample-specific maxdist must override the global default -->
    <test>
      <conditional name="reference">
        <param name="source" value="history" />
        <param name="genome" value="a.fa" />
      </conditional>
      <repeat name="datasets">
        <conditional name="mode_choose">
          <param name="mode" value="single" />
          <conditional name="input">
            <param name="iformat" value="bam" />
            <param name="ifile" value="a_part1.bam" />
          </conditional>
          <section name="aln_options">
            <param name="maxdist" value="7" />
          </section>
        </conditional>
      </repeat>
      <assert_command>
        <has_text text="--iformat bam" />
        <has_text text="--oformat bam" />
        <has_text text="--idx-seedsize 20" />
        <has_text text="--idx-slack 0.3" />
        <has_text text="--idx-overflow 40" />
        <has_text text="--maxseeds 25" />
        <has_text text="--maxhits 250" />
        <has_text text="--clipping ++" />
        <has_text text="--maxdist 7" />
        <has_text text="--confdiff 2" />
        <has_text text="--confadapt 7" />
        <has_text text="--selectivity 1" />
      </assert_command>
    </test>
  </tests>

  <help><![CDATA[
.. class:: infomark

   **What it does**

The tool aligns the sequenced reads in an arbitrary number of input datasets against a common reference genome and stores the results in a single, possibly multi-sample output dataset.

Internally, the tool uses the ultrafast, hashtable-based aligner SNAP (http://snap.cs.berkeley.edu).

----------

**Notes:**

*Input formats*

- The tool accepts SAM, BAM, fastq and fastq.gz input datasets of sequenced reads and supports both single-end and paired-end data.

  The recommended approach with MiModD is to store NGS datasets in SAM/BAM format with *Run Metadata* (see below) stored in the file header. You can use the *MiModD Run Annotation* and *MiModD Convert* tools to convert data from fastq format to SAM/BAM format while attaching run metadata to it.

  While alignments **directly from fastq format** are supported, this **is less reliable** due to less strict specifications of this format. If you find the tool complaining about malformed fastq input, it is likely that you can fix this problem by converting the data to SAM/BAM format first.

- If you wish to align paired-end data directly from fastq format, the mate sequence data has to be split over two datasets as is mostly standard today. If you have your paired-end data as a single dataset you may look into the *FASTQ splitter* and *FASTQ de-interlacer* tools for Galaxy, which are available from the `Fastq Manipulation category`_ of the Galaxy Tool Shed and may be able to convert your files to the expected format.

*Run Metadata*

- **Every input file requires accompanying Run Metadata!**

  Most importantly, this includes a *read-group ID* (an identifier of the sequencing run that produced the data) and a *sample name* (identifying the biological sample sequenced in the run).

- If an input dataset does not provide this information directly (fastq datasets never do; SAM/BAM datasets may provide it in their header), you need to specify a separate SAM/BAM dataset with an appropriate header as the source of the Run Metadata. You can use the *MiModD Run Annotation* tool to generate such a file.

- If a SAM/BAM input dataset already provides Run Metadata, you can still specify a different Run Metadata source, which will then overwrite the information already present in the input. This is useful, for example, to resolve read-group ID conflicts between multiple input datasets.

- Every input dataset can only contain reads from a single read-group. If you would like, for example, to realign the reads in a multi-sample SAM/BAM dataset, you should first use the *MiModD Sort* tool to sort the data by read names (this step is only necessary for paired-end data), then split the reads into new per-read-group datasets using the *MiModD Convert* tool.

- Several input datasets can declare identical read-group IDs and/or sample names.

  Identical read-group IDs mean that the datasets were produced in the same sequencing run, as is the case, for example, with partial fastq sequencing data. In the output dataset, the corresponding reads will be merged and it will not be possible to trace back their source.

  Identical sample names (but different read-group IDs) indicate that the same sample has been sequenced multiple times. In the output dataset, the corresponding reads will be tagged appropriately and tools like the *MiModD Variant Calling* tool will let you decide whether you want to treat them together or separately.

----------

**Tool Options**

The section *Alignment parameters* lets you configure global settings for the alignment job that will be applied to all input datasets. For each input dataset, however, you can overwrite some or all of these settings by specifying new values in the section *Alignment options for this sample*.

Some of the alignment parameters may have **big** effects on the alignment quality, but these effects are very dependent on the type of input sequences. You are strongly encouraged to consult the in-depth `tool documentation`_ for detailed explanations of the available options.

.. _Fastq Manipulation category: https://toolshed.g2.bx.psu.edu/repository/browse_repositories_in_category?id=310ff67d4caf6531
.. _recipe for using gzipped fastq files in Galaxy: http://mimodd.readthedocs.org/en/latest/recipes.html#use-gzipped-fastq-files-in-galaxy
.. _tool documentation: http://mimodd.readthedocs.io/en/@MIMODD_REAL_VERSION@/tool_doc.html#snap

@HELP_FOOTER@
  ]]></help>
  <expand macro="citations" />
</tool>