<!--
  snap_caller.xml @ 23:5db0545b9004 (draft)
  update to v0.1.7.3
  author: wolma
  date: Thu, 21 Jul 2016 03:55:49 -0400
  parents: c46406466625
-->

<tool id="read_alignment" name="SNAP Read Alignment" version="0.1.7.3">
  <!-- One-line summary of the tool. -->
  <description>Map sequence reads to a reference genome using SNAP</description>
  <!-- Import shared macro definitions from toolshed_macros.xml (that file is not part of this one). -->
  <macros>
    <import>toolshed_macros.xml</import>
  </macros>
  <!-- Expands the macro named "requirements"; presumably defined in the imported macro file (verify there). -->
  <expand macro="requirements" />
  <!-- Command used to query the version of the wrapped MiModD installation. -->
  <version_command>python3 -m MiModD version -q</version_command>
  <!--
    Cheetah template for the job command line.
    It issues a single "snap-batch" call and appends one double-quoted
    "snap ..." argument per entry of the "datasets" repeat. Every entry
    uses the same reference genome, the same output dataset and the same
    aligner settings taken from the "set" conditional; only the mode, the
    input file(s), the input format and the optional header file differ.
  -->
  <command> 
	python3 -m MiModD snap-batch -s
	## SNAP calls (considering different cases)
	## One quoted "snap <mode> <reference> <input(s)> <options>" string is emitted per dataset.

	#for $i in $datasets
		"snap ${i.mode_choose.mode} '$ref_genome'
		## paired-end fastq/gz input comes as two files (ifile1/ifile2); every other case uses a single ifile
		#if $str($i.mode_choose.mode) == "paired" and $str($i.mode_choose.input.iformat) in ("fastq", "gz"):
'${i.mode_choose.input.ifile1}' '${i.mode_choose.input.ifile2}'
		#else:
'${i.mode_choose.input.ifile}'
		#end if
--ofile '$outputfile' --iformat ${i.mode_choose.input.iformat} --oformat $oformat
--idx-seedsize '$set.seedsize'
--idx-slack '$set.slack' --maxseeds '$set.maxseeds' --maxhits '$set.maxhits' --clipping $set.clipping --maxdist '$set.maxdist' --confdiff '$set.confdiff' --confadapt '$set.confadpt'
		## the header file is only passed when one was selected for this dataset
		#if $i.mode_choose.input.header:
--header '${i.mode_choose.input.header}'
		#end if
		#if $str($i.mode_choose.mode) == "paired":
--spacing '$set.sp_min' '$set.sp_max'
		#end if
		## "off" is the hidden default-settings value; in "change settings" mode selectivity is an integer and is always passed on
		#if $str($set.selectivity) != "off":
--selectivity '$set.selectivity'
		#end if
		#if $str($set.filter_output) != "off":
--filter-output $set.filter_output
		#end if
		#if $str($set.sort) == "off":
--no-sort
		#end if
		#if $str($set.mmatch_notation) != "general":
-X
		#end if
		#if $set.discard_overlapping_mates:
--discard-overlapping-mates 
	## remove ',' (and possibly adjacent whitespace) and replace with ' '
	## NOTE(review): a trailing or doubled comma in the user input yields an empty quoted token here - confirm snap-batch tolerates that
	'#echo ("' '".join($set.discard_overlapping_mates.replace(" ", "").split(',')))#'
        #end if
--verbose
"							
	#end for
  </command>

  <inputs>
    <!-- mandatory arguments (and mode-conditionals) -->

    <!-- Reference genome shared by all datasets of one tool run. -->
    <param format="fasta" help="The fasta reference genome that SNAP should align reads against." label="reference genome" name="ref_genome" type="data" />

    <!--
      One entry per input dataset (at least one). Each entry selects the
      sequencing mode first and the input file format second; the format
      decides which file parameters (ifile or ifile1/ifile2, header) exist.
    -->
    <repeat default="1" min="1" name="datasets" title="datasets">
      <conditional name="mode_choose">
        <param help="Reads obtained from single-end sequencing runs should be aligned in 'single' mode, paired-end reads in 'paired' mode. **WARNING**: if the read input file is in SAM/BAM format, the current version of this tool will **not** verify the mode and may produce erroneous alignments with wrong settings!" label="choose mode" name="mode" type="select">
          <option value="single">single-end</option>
          <option value="paired">paired-end</option>
        </param>

        <when value="single">
          <conditional name="input">
            <param label="input file format" name="iformat" type="select">
              <option value="bam">BAM</option>
              <option value="sam">SAM</option>
              <option value="gz">gz</option>
              <option value="fastq">fastq</option>
            </param>
            <!-- SAM/BAM input: the separate header file is optional. -->
            <when value="bam">
              <param format="bam" label="input file" name="ifile" type="data" />
              <param format="sam" label="custom header file" name="header" optional="true" type="data" />
            </when>
            <when value="sam">
              <param format="sam" label="input file" name="ifile" type="data" />
              <param format="sam" label="custom header file" name="header" optional="true" type="data" />
            </when>
            <!-- fastq / gzipped fastq input: a header file is mandatory (the parameter is not optional). -->
            <when value="gz">
              <param label="input file" name="ifile" type="data" />
              <param format="sam" help="required" label="header file" name="header" type="data" />
            </when>
            <when value="fastq">
              <param format="fastq" label="input file" name="ifile" type="data" />
              <param format="sam" help="required" label="header file" name="header" type="data" />
            </when>
          </conditional>
        </when>
        <when value="paired">
          <conditional name="input">
            <param label="input file format" name="iformat" type="select">
              <option value="bam">BAM</option>
              <option value="sam">SAM</option>
              <option value="gz">gz</option>
              <option value="fastq">fastq</option>
            </param>
            <!-- SAM/BAM input: one file, the separate header file is optional. -->
            <when value="bam">
              <param format="bam" label="input file" name="ifile" type="data" />
              <param format="sam" label="custom header file" name="header" optional="true" type="data" />
            </when>
            <when value="sam">
              <param format="sam" label="input file" name="ifile" type="data" />
              <param format="sam" label="custom header file" name="header" optional="true" type="data" />
            </when>
            <!-- fastq / gzipped fastq input: two files (one per mate) plus a mandatory header file. -->
            <when value="fastq">
              <param format="fastq" label="inputfile with the first set of reads of paired-end data" name="ifile1" type="data" />
              <param format="fastq" label="inputfile with the second set of reads of paired-end data" name="ifile2" type="data" />
              <param format="sam" help="required" label="header file" name="header" type="data" />
            </when>
            <when value="gz">
              <param label="inputfile with the first set of reads of paired-end data" name="ifile1" type="data" />
              <param label="inputfile with the second set of reads of paired-end data" name="ifile2" type="data" />
              <param format="sam" help="required" label="header file" name="header" type="data" />
            </when>
          </conditional>
        </when>
      </conditional>
    </repeat>

    <param label="output file format" name="oformat" type="select">
      <option value="bam">BAM</option>
      <option value="sam">SAM</option>
    </param>

    <!-- optional arguments -->

    <!--
      Aligner settings, applied to all datasets alike. Both branches define
      the same parameter names so that the command template can reference
      them without checking which branch is active.
    -->
    <conditional name="set">
      <param help="This section lets you specify the detailed parameter settings for the SNAP aligner. Only change them if you know what you are doing, i.e., read the documentation first." label="further parameter settings" name="settings_mode" type="select">
        <option value="default">default settings</option>
        <option value="change">change settings</option>
      </param>

      <!-- default settings: hidden parameters carrying the default values -->

      <when value="default">
        <param name="seedsize" type="hidden" value="20" />
        <param name="slack" type="hidden" value="0.3" />
        <param name="sp_min" type="hidden" value="100" />
        <param name="sp_max" type="hidden" value="10000" />
        <param name="maxdist" type="hidden" value="8" />
        <param name="confdiff" type="hidden" value="2" />
        <param name="confadpt" type="hidden" value="7" />

        <param name="maxseeds" type="hidden" value="25" />
        <param name="maxhits" type="hidden" value="250" />
        <param name="clipping" type="hidden" value="++" />

        <!-- "off" / "general" / empty string make the command template omit the corresponding option -->
        <param name="selectivity" type="hidden" value="off" />
        <param name="filter_output" type="hidden" value="off" />
        <param name="sort" type="hidden" value="0" />
        <param name="mmatch_notation" type="hidden" value="general" />
        <param name="discard_overlapping_mates" type="hidden" value="" />
      </when>

      <!-- change settings: user-editable counterparts of the hidden defaults -->

      <when value="change">
        <param help="Length of the seeds used in the reference genome hash table (SNAP index option -s)." label="seed size (default: 20)" name="seedsize" type="integer" value="20" />
        <param help="Corresponds to the -h option of SNAP index." label="hash table slack size (default: 0.3)" name="slack" type="float" value="0.3" />

        <!-- paired-end specific options -->
        <param help="Corresponds to the first value of the SNAP option -s. Affects paired-end data only." label="minimum spacing to allow between paired ends (default: 100)" name="sp_min" type="integer" value="100" />
        <param help="Corresponds to the second value of the SNAP option -s. Affects paired-end data only." label="maximum spacing to allow between paired ends (default: 10000)" name="sp_max" type="integer" value="10000" />
        <!-- Free-text, comma-separated list; the command template splits it on ','. -->
        <param help="Consider overlapping mate pairs of the given orientation type(s) anomalous and discard them; allowed values: RF, FR, FF, RR; multiple types may be specified as a comma-separated list and ALL can be used as a shortcut for discarding all overlapping mate pairs; leave blank to retain all overlapping pairs. Affects paired-end data only." label="discard overlapping read pairs of type" name="discard_overlapping_mates" type="text" />

        <!-- general alignment options -->
        <param help="maximum edit distance allowed per read or pair (SNAP option -d); higher values allow more divergent alignments to be found, but increase the rate of misalignments." label="edit distance (default: 8)" name="maxdist" type="integer" value="8" />
        <param help="Maximum hits to consider per seed (SNAP option -h); don't use a seed region in the alignment process if it matches more than maxhits regions in the reference genome. Higher values reduce the rate of misalignments, but reduce performance." label="maximum hits per seed (default: 250)" name="maxhits" type="integer" value="250" />
        <param help="Confidence threshold (SNAP option -c); the minimum edit distance difference between two alternate alignments required to reject the poorer alignment as suboptimal; higher values increase the rate of ambiguously aligned reads." label="confidence threshold (default: 2)" name="confdiff" type="integer" value="2" />
        <param help="Specifies how many seeds of a read may be ignored (based on the maximum hits value above) before the confidence threshold above gets increased by one for that read; helps fine-tuning alignment accuracy in repetitive regions of the genome." label="adaptive confdiff behaviour (default: 7)" name="confadpt" type="integer" value="7" />
        <param help="Number of seeds to use per read (SNAP option -n) when trying to match it to the reference genome; higher numbers will increase the rate of aligned reads and reduce the rate of misalignments, but will reduce performance." label="maximum seeds per read (default: 25)" name="maxseeds" type="integer" value="25" />
        <param help="Specifies from which end of a read low-quality bases should be clipped (SNAP option -Cxx)" label="read clipping (default: from back and front)" name="clipping" type="select">
          <option value="++">from back and front</option>
          <option value="x+">from back only</option>
          <option value="+x">from front only</option>
          <option value="xx">no clipping</option>
        </param>
        <param help="randomly choose 1/selectivity of the reads to score (SNAP option -S). The tool uses the default of 1 (or a 0 setting) to indicate that all reads should be worked with." label="selectivity (default: 1)" name="selectivity" type="integer" value="1" />
        <param help="filter output (SNAP option -F) for certain classes of reads." label="filter output (default: no filtering)" name="filter_output" type="select">
          <option value="off">no filtering</option>
          <option value="a">aligned only</option>
          <option value="s">single-aligned only</option>
          <option value="u">unaligned only</option>
        </param>
        <param help="Sort the output file by alignment location (SNAP option --so)." label="output sorting (default: sort by read coordinates)" name="sort" type="select">
          <option value="0">sort by read coordinates</option>
          <option value="off">no sorting</option>
        </param>
        <param help="Indicates whether CIGAR strings in the generated SAM/BAM file should use M (alignment match) rather than = and X (sequence (mis-)match). Warning: Downstream variant calling based on samtools currently relies on the old-style M notation!!" label="CIGAR symbols for alignment matches/mismatches (default: M notation)" name="mmatch_notation" type="select">
          <option value="general">use M for both matches and mismatches</option>
          <option value="differentiate">use = for matches, X for mismatches</option>
        </param>
      </when>
    </conditional>
  </inputs>

<outputs>
  <!-- Single alignment result; BAM by default, switched to SAM when the "oformat" select is set to "sam". -->
  <data name="outputfile" format="bam" label="Aligned reads from MiModd ${tool.name} on ${on_string}">
    <change_format>
      <when input="oformat" value="sam" format="sam" />
    </change_format>
  </data>
</outputs>

<help>
.. class:: infomark

   **What it does**

The tool aligns the sequenced reads in an arbitrary number of input datasets against a common reference genome and stores the results in a single, possibly multi-sample output file. It supports a variety of different sequenced reads input formats, i.e., SAM, BAM, fastq and gzipped fastq, and both single-end and paired-end data.

Internally, the tool uses the ultrafast, hashtable-based aligner SNAP (http://snap.cs.berkeley.edu), hence its name. 

**Notes:**

1) In its standard configuration Galaxy will decompress any .gz files during their upload, so the option to align gzipped fastq input is useful only with customized Galaxy instances or by using linked files as explained in our `recipe for using gzipped fastq files in Galaxy`_ from the `MiModD user guide`_.

2) To use paired-end fastq data with the tool the read mate information needs to be split over two fastq files in corresponding order.

   **TIP:** If your paired-end data is arranged differently, you may look into the *fastq splitter* and *fastq de-interlacer* tools for Galaxy from the `Fastq Manipulation category`_ of the Galaxy Tool Shed to see if they can convert your files to the expected format.

3) The tool supports the alignment of reads from the same sequencing run, but distributed across several input files. 
   
   Generally, it expects the reads from each input dataset to belong to one read-group and will abort with an error message if any input dataset declares more than one read-group or sample name in its header. Different datasets, however, are allowed to contain reads from the same read-group (as indicated by matching read-group IDs and sample names in their headers), in which case the reads will be combined into one group in the output.

4) Read-group information is required for every input dataset!
 
   We generally recommend storing NGS datasets in SAM/BAM format with run metadata stored in the file header. You can use the *NGS Run Annotation* and *Convert* tools to convert data in fastq format to SAM/BAM with added run information.

   While it is not our recommended approach, you can, if you prefer it, align reads from fastq files or SAM/BAM files without header read-group information. To do so, you **must** specify a SAM file that provides the missing information in its header along with the input dataset. You can generate a SAM header file with the *NGS Run Annotation* tool.

   Optionally, a SAM header file can also be used to replace existing read-group information in a headered SAM/BAM input file. This can be used to resolve read-group ID conflicts between multiple input files at tool runtime.

5) The options available under *further parameter settings* can have **big** effects on the alignment quality. You are strongly encouraged to consult the `tool documentation`_ for detailed explanations of the available options.

6) Currently, you cannot configure aligner-specific options separately for specific input files from within this Galaxy tool. If you need this advanced level of control, you should use the command line tool ``mimodd snap-batch``.

.. _Fastq Manipulation category: https://toolshed.g2.bx.psu.edu/repository/browse_repositories_in_category?id=310ff67d4caf6531
.. _recipe for using gzipped fastq files in Galaxy: http://mimodd.readthedocs.org/en/latest/recipes.html#use-gzipped-fastq-files-in-galaxy
.. _MiModD user guide: http://mimodd.readthedocs.org/en/latest
.. _tool documentation: http://mimodd.readthedocs.org/en/latest/tool_doc.html#snap

</help>
</tool>