view FLASH.xml @ 2:6889442b27dc draft default tip

author aaronpetkau
date Sat, 04 Jul 2015 08:58:21 -0400
line wrap: on
line source

<tool id="FLASH" name="FLASH" version="1.3.0">
  <!-- Tool wrapper definition for FLASH. The version attribute above is the
       wrapper's own version; the FLASH package version is declared separately
       in the requirement element further down. -->
  <description>merge paired-end reads from fragments that are shorter than twice the length of reads</description>
  <command interpreter="bash"> $extendedFrags $notCombined1 $notCombined2 $interNotCombined $readsAndPairs $log_file -o out -t 4 
    ## Cheetah template for the wrapper arguments: the six output datasets come
    ## first, then the FLASH options, and finally the two input FASTQ files.
    ## NOTE(review): no script name is visible before the first argument on the
    ## line above, although the bash interpreter attribute suggests one is
    ## expected. Confirm against the repository copy of this file.

    ## Overlap bounds. The minimum is only passed when the user supplied one.
    #if $min_overlap
      -m $min_overlap
    #end if
    ## Pass -M exactly once. 250 mirrors the default shown on the form and is
    ## used only when the field has been cleared.
    #if $max_overlap
      -M $max_overlap
    #else
      -M 250
    #end if

    ## Layout of the reads that could not be merged. The default
    ## (Non-interleaved_fastq) needs no flag.
    #if $outputs.output_type == "Interleaved_fastq"
      --interleaved-output
    #elif $outputs.output_type == "tab"
      --tab-delimited-output
    #end if

    ## Advanced options are forwarded only when that section is selected and
    ## the individual field has a value.
    #if $options.options_select == "advanced"
      #if $options.max_mismatch_density
        -x $options.max_mismatch_density
      #end if
      #if $options.phred_offset
        -p $options.phred_offset
      #end if
      #if $options.read_length
        -r $options.read_length
      #end if
      #if $options.fragment_length
        -f $options.fragment_length
      #end if
      #if $options.fragment_stdev
        -s $options.fragment_stdev
      #end if
      ## Boolean parameters render their truevalue (the literal flag) when checked.
      #if $options.cap_mismatch_quals
        $options.cap_mismatch_quals
      #end if
      #if $options.quiet
        $options.quiet
      #end if
    #end if

    ## Input reads: either two individual datasets or the two halves of a
    ## paired collection.
    #if $input_type.sPaired == "paired"
      $input_type.pInput1 $input_type.pInput2
    #elif $input_type.sPaired == "collections"
      $input_type.fastq_collection.forward $input_type.fastq_collection.reverse
    #end if
  </command>

          <inputs>
            <!-- Source of the reads: two separate FASTQ datasets, or one
                 paired dataset collection. -->
            <conditional name="input_type">
              <param name="sPaired" type="select" label="Single Pair or Collection">
                <option value="collections">Paired-end Collections</option>
                <option value="paired">Paired-end</option>
              </param>
              <when value="paired">
                <param name="pInput1" type="data" format="fastq,fastqsanger,fastqillumina,fastqsolexa" label="Forward FASTQ file" help="Must have ASCII encoded quality scores"/>
                <param name="pInput2" type="data" format="fastq,fastqsanger,fastqillumina,fastqsolexa" label="Reverse FASTQ file" help="File format must match the Forward FASTQ file"/>
              </when>
              <when value="collections">
                <param name="fastq_collection" type="data_collection" label="Paired-end Fastq collection" help="" optional="false" format="txt" collection_type="paired" />
              </when>
            </conditional>

            <!-- Overlap bounds; both may be left empty. -->
            <param name="min_overlap" type="integer" label="Minimum overlap" optional="true"/>
            <param name="max_overlap" type="integer" label="Maximum overlap" value="250" optional="true"/>

            <!-- Output layout. The output filters test this value, so every
                 option gets an (empty) case. -->
            <conditional name="outputs">
              <param name="output_type" type="select" label="Output type">
                <option value="Non-interleaved_fastq">Non-interleaved fastq</option>
                <option value="Interleaved_fastq">Interleaved fastq</option>
                <option value="tab">Tab-delimited</option>
              </param>
              <when value="Non-interleaved_fastq"/>
              <when value="Interleaved_fastq"/>
              <when value="tab"/>
            </conditional>

            <!-- Optional tuning parameters, shown only in advanced mode. -->
            <conditional name="options">
              <param name="options_select" type="select" label="Options Type">
                <option value="basic">Basic</option>
                <option value="advanced">Advanced</option>
              </param>
              <when value="basic"/>
              <when value="advanced">
                <param name="max_mismatch_density" type="float" label="Maximum mismatch density" optional="true"/>
                <param name="phred_offset" type="select" label="Phred-offset" optional="true">
                  <option value="33">33</option>
                  <option value="64">64</option>
                </param>
                <param name="read_length" type="integer" label="Average read length" optional="true"/>
                <param name="fragment_length" type="integer" label="Fragment length" optional="true"/>
                <param name="fragment_stdev" type="integer" label="Fragment length standard deviation" optional="true"/>
                <param name="cap_mismatch_quals" type="boolean" label="Cap mismatch quality scores" truevalue="--cap-mismatch-quals" optional="true"/>
                <!-- Compression options are disabled; Phil says they aren't needed.
                <param name="compress" type="boolean" label="Compress output files with gzip" optional="true"/>
                <param name="compress_prog" type="text" label="Compression program" optional="true"/>
                <param name="compress_prog_args" type="text" label="Compression program arguments" optional="true"/>
                -->
                <param name="quiet" type="boolean" label="Do not print informational messages" truevalue="-q" optional="true"/>
              </when>
            </conditional>
          </inputs>
    <outputs>
      <!-- Each dataset is created only for the output type that produces it;
           the filters read the output_type selector of the "outputs"
           conditional. The log file is always created. -->
      <data format="fastqsanger" name="extendedFrags" label="Merged reads">
        <filter>outputs['output_type'] != "tab"</filter>
      </data>
      <data format="fastqsanger" name="notCombined1" label="Read 1 of mate pairs not merged">
        <filter>outputs['output_type'] == "Non-interleaved_fastq"</filter>
      </data>
      <data format="fastqsanger" name="notCombined2" label="Read 2 of mate pairs not merged">
        <filter>outputs['output_type'] == "Non-interleaved_fastq"</filter>
      </data>
      <data format="fastqsanger" name="interNotCombined" label="Interleaved non-combined pairs">
        <filter>outputs['output_type'] == "Interleaved_fastq"</filter>
      </data>
      <data format="tabular" name="readsAndPairs" label="Merged and non-merged pairs">
        <filter>outputs['output_type'] == "tab"</filter>
      </data>
      <data format="txt" name="log_file" label="Log file"/>
      <!-- Histogram outputs are currently disabled:
      <data format="txt" name="numericHistogram" label="Numeric histogram of merged read lengths"/>
      <data format="txt" name="visualHistogram" label="Visual histogram of merged read lengths"/>
      -->
    </outputs>
    <!-- External dependency: the FLASH package at the pinned version. -->
    <requirements>
      <requirement type="package" version="1.2.9">FLASH</requirement>
    </requirements>

FLASH (Fast Length Adjustment of SHort reads) is an accurate and fast tool
to merge paired-end reads that were generated from DNA fragments whose
lengths are shorter than twice the length of reads.  Merged read pairs result
in unpaired longer reads, which are generally more desired in genome
assembly and genome analysis processes.

Briefly, the FLASH algorithm considers all possible overlaps at or above a
minimum length between the reads in a pair and chooses the overlap that
results in the lowest mismatch density (proportion of mismatched bases in
the overlapped region).  Ties between multiple overlaps are broken by
considering quality scores at mismatch sites.  When building the merged
sequence, FLASH computes a consensus sequence in the overlapped region.
More details can be found in the original publication (Magoč and Salzberg,
Bioinformatics 27(21):2957-2963, 2011).

Limitations of FLASH include:
   - FLASH cannot merge paired-end reads that do not overlap.
   - FLASH cannot merge read pairs that have an outward orientation, either
     due to being "jumping" reads or due to excessive trimming.
   - FLASH is not designed for data that has a significant amount of indel
     errors (such as Sanger sequencing data).  It is best suited for Illumina
     reads.

                               MANDATORY INPUT

The most common input to FLASH is two FASTQ files containing read 1 and read 2
of each mate pair, respectively, in the same order.

Alternatively, you may provide one FASTQ file, which may be standard input,
containing paired-end reads in either interleaved FASTQ (see the
--interleaved-input option) or tab-delimited (see the --tab-delimited-input
option) format.  In all cases, gzip compressed input is autodetected.  Also,
in all cases, the PHRED offset is, by default, assumed to be 33; use the
--phred-offset option to change it.


The default output of FLASH consists of the following files:

   - out.extendedFrags.fastq      The merged reads.
   - out.notCombined_1.fastq      Read 1 of mate pairs that were not merged.
   - out.notCombined_2.fastq      Read 2 of mate pairs that were not merged.
   - out.hist                     Numeric histogram of merged read lengths.
   - out.histogram                Visual histogram of merged read lengths.

FLASH also logs informational messages to standard output.  These can be
redirected to a file, as in the following example:

  $ flash reads_1.fq reads_2.fq | tee flash.log

In addition, FLASH supports several features affecting the output:

   - Writing the merged reads directly to standard output (--to-stdout)
   - Writing gzip compressed output files (-z) or using an external
     compression program (--compress-prog)
   - Writing the uncombined read pairs in interleaved FASTQ format
   - Writing all output reads to a single file in tab-delimited format


  -m, --min-overlap=NUM   The minimum required overlap length between two
                          reads to provide a confident overlap.  Default:
                          10bp.

  -M, --max-overlap=NUM   Maximum overlap length expected in approximately
                          90% of read pairs.  It is by default set to 70bp,
                          which works well for 100bp reads generated from a
                          180bp library, assuming a normal distribution of
                          fragment lengths.  Overlaps longer than the maximum
                          overlap parameter are still considered as good
                          overlaps, but the mismatch density (explained below)
                          is calculated over the first max_overlap bases in
                          the overlapped region rather than the entire
                          overlap.  Default: 70bp, or calculated from the
                          specified read length, fragment length, and fragment
                          length standard deviation.

  -x, --max-mismatch-density=NUM
                          Maximum allowed ratio between the number of
                          mismatched base pairs and the overlap length.
                          Two reads will not be combined with a given overlap
                          if that overlap results in a mismatched base density
                          higher than this value.  Note: Any occurrence of an
                          'N' in either read is ignored and not counted
                          towards the mismatches or overlap length.  Our
                          experimental results suggest that higher values of
                          the maximum mismatch density yield larger
                          numbers of correctly merged read pairs but at
                          the expense of higher numbers of incorrectly
                          merged read pairs.  Default: 0.25.

  -p, --phred-offset=OFFSET
                          The smallest ASCII value of the characters used to
                          represent quality values of bases in FASTQ files.
                          It should be set to either 33, which corresponds
                          to the later Illumina platforms and Sanger
                          platforms, or 64, which corresponds to the
                          earlier Illumina platforms.  Default: 33.

  -r, --read-len=LEN

  -f, --fragment-len=LEN

  -s, --fragment-len-stddev=LEN
                          Average read length, fragment length, and fragment
                          standard deviation.  These are convenience parameters
                          only, as they are only used for calculating the
                          maximum overlap (--max-overlap) parameter.
                          The maximum overlap is calculated as the overlap of
                          average-length reads from an average-size fragment
                          plus 2.5 times the fragment length standard
                          deviation.  The default values are -r 100, -f 180,
                          and -s 18, so this works out to a maximum overlap of
                          65 bp.  If --max-overlap is specified, then the
                          specified value overrides the calculated value.

                          If you do not know the standard deviation of the
                          fragment library, you can probably assume that the
                          standard deviation is 10% of the average fragment
                          length.

  --cap-mismatch-quals    Cap quality scores assigned at mismatch locations
                          to 2.  This was the default behavior in FLASH v1.2.7
                          and earlier.  Later versions will instead calculate
                          such scores as the
                          absolute value of the difference in quality scores,
                          but at least 2.  Essentially, the new behavior
                          prevents a low quality base call that is likely a
                          sequencing error from significantly bringing down
                          the quality of a high quality, likely correct base
                          call.

  --interleaved-input     Instead of requiring files MATES_1.FASTQ and
                          MATES_2.FASTQ, allow a single file MATES.FASTQ that
                          has the paired-end reads interleaved.  Specify "-"
                          to read from standard input.

  --interleaved-output    Write the uncombined pairs in interleaved FASTQ
                          format.

  -I, --interleaved       Equivalent to specifying both --interleaved-input
                          and --interleaved-output.

  -Ti, --tab-delimited-input
                          Assume the input is in tab-delimited format
                          rather than FASTQ, in the format described below in
                          '--tab-delimited-output'.  In this mode you should
                          provide a single input file, each line of which must
                          contain either a read pair (5 fields) or a single
                          read (3 fields).  FLASH will try to combine the read
                          pairs.  Single reads will be written to the output
                          file as-is if also using --tab-delimited-output;
                          otherwise they will be ignored.  Note that you may
                          specify "-" as the input file to read the
                          tab-delimited data from standard input.

  -To, --tab-delimited-output
                          Write output in tab-delimited format (not FASTQ).
                          Each line will contain either a combined pair in the
                          format 'tag &lt;tab&gt; seq &lt;tab&gt; qual' or an uncombined
                          pair in the format 'tag &lt;tab&gt; seq_1 &lt;tab&gt; qual_1
                          &lt;tab&gt; seq_2 &lt;tab&gt; qual_2'.

  -o, --output-prefix=PREFIX
                          Prefix of output files.  Default: "out".

  -d, --output-directory=DIR
                          Path to directory for output files.  Default:
                          current working directory.

  -c, --to-stdout         Write the combined reads to standard output.  In
                          this mode, with FASTQ output (the default) the
                          uncombined reads are discarded.  With tab-delimited
                          output, uncombined reads are included in the
                          tab-delimited data written to standard output.
                          In both cases, histogram files are not written,
                          and informational messages are sent to standard
                          error rather than to standard output.

  --suffix=SUFFIX, --output-suffix=SUFFIX
                          Use SUFFIX as the suffix of the output files
                          after ".fastq".  A dot before the suffix is assumed,
                          unless an empty suffix is provided.  Default:
                          nothing; or 'gz' if -z is specified; or PROG if
                          --compress-prog=PROG is specified.

  -t, --threads=NTHREADS  Set the number of worker threads.  This is in
                          addition to the I/O threads.  Default: number of
                          processors.  Note: if you need FLASH's output to
                          appear deterministically or in the same order as
                          the original reads, you must specify -t 1.

  -q, --quiet             Do not print informational messages.

  -h, --help              Display this help and exit.

  -v, --version           Display version.