# HG changeset patch
# User aaronpetkau
# Date 1436014701 14400
# Node ID 6889442b27dcc691e48a52fbb3986afde59abd41
# Parent a444685f161ca877fc127adfbf63c5d7f2858e5d
Uploaded

diff -r a444685f161c -r 6889442b27dc FLASH.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/FLASH.sh Sat Jul 04 08:58:21 2015 -0400
@@ -0,0 +1,55 @@
+#!/bin/bash
+#
+# Galaxy wrapper script for FLASH.
+#
+# Usage: FLASH.sh MERGED NOT_COMBINED_1 NOT_COMBINED_2 INTER_NOT_COMBINED
+#                 READS_AND_PAIRS LOG_FILE [flash arguments...]
+#
+# The first six arguments are the destination paths for the flash outputs.
+# Every remaining argument is handed to flash unchanged. The out.* names
+# below match the "-o out" prefix that the tool command line passes.
+
+# Grab the output file destinations.
+merged_reads=$1
+shift
+not_combined_1=$1
+shift
+not_combined_2=$1
+shift
+inter_not_combined=$1
+shift
+reads_and_pairs=$1
+shift
+log_file=$1
+shift
+
+# Quote "$@" and the log path so arguments containing spaces stay intact.
+flash "$@" > "$log_file"
+flash_status=$?
+sleep 5 #sleep because phil says so
+
+# Move each output that flash produced to its destination. Not every file
+# is expected to exist on every run, so each move is guarded.
+if [ -f out.notCombined_2.fastq ];
+then
+  mv -- out.notCombined_2.fastq "$not_combined_2"
+fi
+if [ -f out.notCombined_1.fastq ];
+then
+  mv -- out.notCombined_1.fastq "$not_combined_1"
+fi
+if [ -f out.notCombined.fastq ];
+then
+  mv -- out.notCombined.fastq "$inter_not_combined"
+fi
+if [ -f out.readsAndPairs.tab ];
+then
+  mv -- out.readsAndPairs.tab "$reads_and_pairs"
+fi
+if [ -f out.extendedFrags.fastq ];
+then
+  mv -- out.extendedFrags.fastq "$merged_reads"
+fi
+
+# Report the flash exit status instead of always claiming success.
+exit "$flash_status"
diff -r a444685f161c -r 6889442b27dc FLASH.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/FLASH.xml Sat Jul 04 08:58:21 2015 -0400
@@ -0,0 +1,331 @@
+
+ merge paired-end reads from fragments that are shorter than twice the length of reads
+
+ FLASH.sh $extendedFrags $notCombined1 $notCombined2 $interNotCombined $readsAndPairs $log_file -o out -t 4
+ #if $min_overlap
+ -m $min_overlap
+ #end if
+ #if $max_overlap
+ -M $max_overlap
+ #else
+ -M 250
+ #end if
+ #if $outputs.output_type == "Interleaved_fastq"
+ --interleaved-output
+ #else if $outputs.output_type == "tab"
+ -To
+ #end if
+ #if $options.options_select == "advanced"
+ #if $options.max_mismatch_density
+ -x $options.max_mismatch_density
+ #end if
+ #if $options.phred_offset
+ -p $options.phred_offset
+ #end if
+ #if $options.read_length
+ -r $options.read_length
+ #end if
+ #if $options.fragment_length
+ -f $options.fragment_length
+ #end if
+ #if $options.fragment_stdev
+ -s
$options.fragment_stdev + #end if + #if $options.cap_mismatch_quals + $options.cap_mismatch_quals + #end if + #if $options.quiet + $options.quiet + #end if + #end if + + #if $input_type.sPaired == "paired": + $input_type.pInput1 $input_type.pInput2 + #elif $input_type.sPaired == "collections": + $input_type.fastq_collection.forward $input_type.fastq_collection.reverse + #end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + outputs['output_type'] != "tab" + + + outputs['output_type'] == "Non-interleaved_fastq" + + + outputs['output_type'] == "Non-interleaved_fastq" + + + outputs['output_type'] == "Interleaved_fastq" + + + outputs['output_type'] == "tab" + + + + + + FLASH + + +---------------------------------------------------------------------------- + DESCRIPTION +---------------------------------------------------------------------------- + +FLASH (Fast Length Adjustment of SHort reads) is an accurate and fast tool +to merge paired-end reads that were generated from DNA fragments whose +lengths are shorter than twice the length of reads. Merged read pairs result +in unpaired longer reads, which are generally more desired in genome +assembly and genome analysis processes. + +Briefly, the FLASH algorithm considers all possible overlaps at or above a +minimum length between the reads in a pair and chooses the overlap that +results in the lowest mismatch density (proportion of mismatched bases in +the overlapped region). Ties between multiple overlaps are broken by +considering quality scores at mismatch sites. When building the merged +sequence, FLASH computes a consensus sequence in the overlapped region. +More details can be found in the original publication +(http://bioinformatics.oxfordjournals.org/content/27/21/2957.full). + +Limitations of FLASH include: + - FLASH cannot merge paired-end reads that do not overlap. 
+ - FLASH cannot merge read pairs that have an outward orientation, either + due to being "jumping" reads or due to excessive trimming. + - FLASH is not designed for data that has a significant amount of indel + errors (such as Sanger sequencing data). It is best suited for Illumina + data. + +---------------------------------------------------------------------------- + MANDATORY INPUT +---------------------------------------------------------------------------- + +The most common input to FLASH is two FASTQ files containing read 1 and read 2 +of each mate pair, respectively, in the same order. + +Alternatively, you may provide one FASTQ file, which may be standard input, +containing paired-end reads in either interleaved FASTQ (see the +--interleaved-input option) or tab-delimited (see the --tab-delimited-input +option) format. In all cases, gzip compressed input is autodetected. Also, +in all cases, the PHRED offset is, by default, assumed to be 33; use the +--phred-offset option to change it. + +---------------------------------------------------------------------------- + OUTPUT +---------------------------------------------------------------------------- + +The default output of FLASH consists of the following files: + + - out.extendedFrags.fastq The merged reads. + - out.notCombined_1.fastq Read 1 of mate pairs that were not merged. + - out.notCombined_2.fastq Read 2 of mate pairs that were not merged. + - out.hist Numeric histogram of merged read lengths. + - out.histogram Visual histogram of merged read lengths. + +FLASH also logs informational messages to standard output. 
These can be +redirected to a file, as in the following example: + + $ flash reads_1.fq reads_2.fq | tee flash.log + +In addition, FLASH supports several features affecting the output: + + - Writing the merged reads directly to standard output (--to-stdout) + - Writing gzip compressed output files (-z) or using an external + compression program (--compress-prog) + - Writing the uncombined read pairs in interleaved FASTQ format + (--interleaved-output) + - Writing all output reads to a single file in tab-delimited format + (--tab-delimited-output) + +---------------------------------------------------------------------------- + OPTIONS +---------------------------------------------------------------------------- + + -m, --min-overlap=NUM The minimum required overlap length between two + reads to provide a confident overlap. Default: + 10bp. + + -M, --max-overlap=NUM Maximum overlap length expected in approximately + 90% of read pairs. It is by default set to 70bp, + which works well for 100bp reads generated from a + 180bp library, assuming a normal distribution of + fragment lengths. Overlaps longer than the maximum + overlap parameter are still considered as good + overlaps, but the mismatch density (explained below) + is calculated over the first max_overlap bases in + the overlapped region rather than the entire + overlap. Default: 70bp, or calculated from the + specified read length, fragment length, and fragment + length standard deviation. + + -x, --max-mismatch-density=NUM + Maximum allowed ratio between the number of + mismatched base pairs and the overlap length. + Two reads will not be combined with a given overlap + if that overlap results in a mismatched base density + higher than this value. Note: Any occurrence of an + 'N' in either read is ignored and not counted + towards the mismatches or overlap length. 
Our + experimental results suggest that higher values of + the maximum mismatch density yield larger + numbers of correctly merged read pairs but at + the expense of higher numbers of incorrectly + merged read pairs. Default: 0.25. + + -p, --phred-offset=OFFSET + The smallest ASCII value of the characters used to + represent quality values of bases in FASTQ files. + It should be set to either 33, which corresponds + to the later Illumina platforms and Sanger + platforms, or 64, which corresponds to the + earlier Illumina platforms. Default: 33. + + -r, --read-len=LEN + + -f, --fragment-len=LEN + + -s, --fragment-len-stddev=LEN + Average read length, fragment length, and fragment + standard deviation. These are convenience parameters + only, as they are only used for calculating the + maximum overlap (--max-overlap) parameter. + The maximum overlap is calculated as the overlap of + average-length reads from an average-size fragment + plus 2.5 times the fragment length standard + deviation. The default values are -r 100, -f 180, + and -s 18, so this works out to a maximum overlap of + 65 bp. If --max-overlap is specified, then the + specified value overrides the calculated value. + + If you do not know the standard deviation of the + fragment library, you can probably assume that the + standard deviation is 10% of the average fragment + length. + + --cap-mismatch-quals Cap quality scores assigned at mismatch locations + to 2. This was the default behavior in FLASH v1.2.7 + and earlier. Later versions will instead calculate + such scores as the + absolute value of the difference in quality scores, + but at least 2. Essentially, the new behavior + prevents a low quality base call that is likely a + sequencing error from significantly bringing down + the quality of a high quality, likely correct base + call. + + --interleaved-input Instead of requiring files MATES_1.FASTQ and + MATES_2.FASTQ, allow a single file MATES.FASTQ that + has the paired-end reads interleaved. 
Specify "-" + to read from standard input. + + --interleaved-output Write the uncombined pairs in interleaved FASTQ + format. + + -I, --interleaved Equivalent to specifying both --interleaved-input + and --interleaved-output. + + -Ti, --tab-delimited-input + Assume the input is in tab-delimited format + rather than FASTQ, in the format described below in + '--tab-delimited-output'. In this mode you should + provide a single input file, each line of which must + contain either a read pair (5 fields) or a single + read (3 fields). FLASH will try to combine the read + pairs. Single reads will be written to the output + file as-is if also using --tab-delimited-output; + otherwise they will be ignored. Note that you may + specify "-" as the input file to read the + tab-delimited data from standard input. + + -To, --tab-delimited-output + Write output in tab-delimited format (not FASTQ). + Each line will contain either a combined pair in the + format 'tag <tab> seq <tab> qual' or an uncombined + pair in the format 'tag <tab> seq_1 <tab> qual_1 + <tab> seq_2 <tab> qual_2'. + + -o, --output-prefix=PREFIX + Prefix of output files. Default: "out". + + -d, --output-directory=DIR + Path to directory for output files. Default: + current working directory. + + -c, --to-stdout + Write the combined reads to standard output. In + this mode, with FASTQ output (the default) the + uncombined reads are discarded. With tab-delimited + output, uncombined reads are included in the + tab-delimited data written to standard output. + In both cases, histogram files are not written, + and informational messages are sent to standard + error rather than to standard output. + + --suffix=SUFFIX, --output-suffix=SUFFIX + Use SUFFIX as the suffix of the output files + after ".fastq". A dot before the suffix is assumed, + unless an empty suffix is provided. Default: + nothing; or 'gz' if -z is specified; or PROG if + --compress-prog=PROG is specified. 
+ + -t, --threads=NTHREADS Set the number of worker threads. This is in + addition to the I/O threads. Default: number of + processors. Note: if you need FLASH's output to + appear deterministically or in the same order as + the original reads, you must specify -t 1 + (--threads=1). + + -q, --quiet Do not print informational messages. + + -h, --help Display this help and exit. + + -v, --version Display version. + + diff -r a444685f161c -r 6889442b27dc README --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README Sat Jul 04 08:58:21 2015 -0400 @@ -0,0 +1,106 @@ +Tool wrapper by Brian Yeo +brian.yeo@phac.aspc.gc.ca + + INTRODUCTION + +FLASH (Fast Length Adjustment of SHort reads) is an accurate and fast tool +to merge paired-end reads that were generated from DNA fragments whose +lengths are shorter than twice the length of reads. Merged read pairs result +in unpaired longer reads, which are generally more desired in genome +assembly and genome analysis processes. + +Briefly, the FLASH algorithm considers all possible overlaps at or above a +minimum length between the reads in a pair and chooses the overlap that +results in the lowest mismatch density (proportion of mismatched bases in +the overlapped region). Ties between multiple overlaps are broken by +considering quality scores at mismatch sites. When building the merged +sequence, FLASH computes a consensus sequence in the overlapped region. +More details can be found in the original publication +(http://bioinformatics.oxfordjournals.org/content/27/21/2957.full). + +Limitations of FLASH include: + - FLASH cannot merge paired-end reads that do not overlap. + - FLASH cannot merge read pairs that have an outward orientation, either + due to being "jumping" reads or due to excessive trimming. + - FLASH is not designed for data that has a significant amount of indel + errors (such as Sanger sequencing data). It is best suited for Illumina + data. 
+ + INSTALLATION + +On UNIX-compatible systems, including GNU/Linux and Mac OS X, you must compile +FLASH from source. The only dependency, other than functions that are expected +to be available in the C library, is the zlib data compression library. To +install FLASH, download the tarball, untar it, and compile the code using the +provided Makefile: + + $ tar xzf FLASH-1.2.9.tar.gz + $ cd FLASH-1.2.9 + $ make + +The executable file that is produced is named 'flash'. To run it from the +command line you must copy it to a location on your $PATH variable, or else run +it with a path including a directory, such as "./flash". + +FLASH also runs on Windows, and you can compile it on Windows using MinGW. +However, for convenience you may instead download a standalone Windows binary +from the SourceForge page (https://sourceforge.net/projects/flashpage/). + + USAGE + +Please compile FLASH and run `flash --help' to see command-line usage +information and information about input/output files. + + MULTITHREADING + +By default, FLASH uses multiple threads. There are "combiner" threads that do +the actual read combining, as well as up to 5 threads that are used for I/O (up +to 2 readers, up to 3 writers). The default number of combiner threads is the +number of processors; however, it can be adjusted with the -t option (long +option: --threads). + +When multiple combiner threads are used, the order of the combined and +uncombined reads in the output files will be nondeterministic. If you need to +enforce that the output reads appear in the same order as the input, you must +specify --threads=1. + + PERFORMANCE + +Since the FLASH algorithm considers each read pair independently, FLASH will, by +default, process read pairs in parallel. FLASH v1.2.9 and later also make use +of vector instructions available on modern x86 CPUs. Consequently, FLASH works +quite fast, even with low-cost computing resources. 
As an example, we ran FLASH +v1.2.9 on a laptop with a dual-core 2.3 GHz AMD x86_64 processor and it +processed one million 101-bp read pairs in 11.6 seconds with the default +parameters. Less than 2 MB of memory was used. Actual timing results will +vary, but they will depend primarily on the number of CPUs available, the speed +of each CPU, and on the I/O speed of reading the input files and writing the +output files. FLASH is designed to be scalable to dozens of processors, +although its speed may be limited by I/O in such cases. + + ACCURACY + +With reads' error rate of 1% or less, FLASH processes over 99% of read pairs +correctly. With error rate of 2%, FLASH processes over 98% of read pairs +correctly when default parameters are used. With more aggressive parameters +(i.e., -x 0.35), FLASH processes over 90% of read pairs correctly even when the +error rate is 5%. + + PUBLICATION + +Title: FLASH: fast length adjustment of short reads to improve genome assemblies +Authors: Tanja Magoč and Steven L. Salzberg +URL: http://bioinformatics.oxfordjournals.org/content/27/21/2957.full + + LICENSE + +FLASH is released under the GNU General Public License Version 3 or later (see +COPYING). + + COMMENTS/QUESTIONS/REQUESTS + +Send an e-mail to flash.comment@gmail.com + +Other versions are available from the SourceForge page: + +https://sourceforge.net/projects/flashpage/ diff -r a444685f161c -r 6889442b27dc tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Sat Jul 04 08:58:21 2015 -0400 @@ -0,0 +1,16 @@ + + + + + + http://sourceforge.net/projects/flashpage/files/FLASH-1.2.9.tar.gz/download + make + cp -r * $INSTALL_DIR + $INSTALL_DIR/flash + + $INSTALL_DIR + + + + +