Galaxy |

Changeset 8:ff058438080a (2014-02-05)

Previous changeset 7:95ddc2380130 (2013-11-28) Next changeset 9:1f1943b29266 (2014-02-05)

Commit message:
Version 0.8, supports SPAdes 3.0.0

added:
tools/spades_3_0/filter_spades_output.pl
tools/spades_3_0/filter_spades_output.xml
tools/spades_3_0/plot_spades_stats.xml
tools/spades_3_0/r_wrapper.sh
tools/spades_3_0/spades.pl
tools/spades_3_0/spades.xml
tools/spades_3_0/tool_dependencies.xml

removed:
tools/spades_2_5/filter_spades_output.pl
tools/spades_2_5/filter_spades_output.xml
tools/spades_2_5/plot_spades_stats.xml
tools/spades_2_5/r_wrapper.sh
tools/spades_2_5/spades.pl
tools/spades_2_5/spades.xml
tools/spades_2_5/tool_dependencies.xml

diff -r 95ddc2380130 -r ff058438080a tools/spades_2_5/filter_spades_output.pl
--- a/tools/spades_2_5/filter_spades_output.pl Thu Nov 28 05:29:32 2013 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

[

@@ -1,106 +0,0 @@
-#!/usr/bin/perl -w
-
-=head1 SYNOPSIS
-
-filter_spades_output.pl - Filters contigs or scaffolds based on contig length and coverage.
-
-=head1 USAGE
-
-filter_spades_output.pl [-c|--coverage-cutoff] [-l|--length-cutoff] [-o|--filtered-out out.fasta] -t|--tab stats.tab seqs.fasta
-
-=head1 INPUT
-
-=head2 [-c|--coverage-cutoff]
-
-Mininum coverage. Contigs with lower coverage will be discarded. Default 10.
-
-=head2 [-l|--length-cutoff]
-
-Mininum coverage. Smaller ontigs will be discarded. Default 500.
-
-=head2 -t|--tab stats.tab
-
-A tabular file, with three columns: contig name, length, and coverage:
-
-NODE_1 31438 24.5116
-NODE_2 31354 2316.96
-NODE_3 26948 82.3294
-
-Such a file is produced by spades.xml. Contigs should be in the same order as in the fasta file.
-
-=head2 [-o|--filtered-out out.fasta]
-
-If specified, filtered out sequences will be written to this file.
-
-=head2 seqs.fasta
-
-Sequences in fasta format. Start of IDs must match ids in the tabular file.
-
-=head1 OUTPUT
-
-A fasta file on stdout.
-
-=head1 AUTHOR
-
-Lionel Guy (lionel.guy@icm.uu.se)
-
-=head1 DATE
-
-Thu Aug 29 13:51:13 CEST 2013
-
-=cut
-
-# libraries
-use strict;
-use Getopt::Long;
-use Bio::SeqIO;
-
-my $coverage_co = 10;
-my $length_co = 500;
-my $out_filtered;
-my $tab_file;
-
-GetOptions(
-    'c|coverage-cutoff=s' => \$coverage_co,
-    'l|length-cutoff=s' => \$length_co,
-    'o|filtered-out=s' => \$out_filtered,
-    't|tab=s' => \$tab_file,
-);
-my $fasta_file = shift(@ARGV);
-die ("No tab file specified") unless ($tab_file);
-die ("No fasta file specified") unless ($fasta_file);
-
-## Read tab file, discard rows with comments
-open TAB, '<', $tab_file or die "$?";
-my @stats;
-while (<TAB>){
-    chomp;
-    push @stats, $_ unless (/^#/);
-}
-
-## Read fasta
-my $seq_in = Bio::SeqIO->new(-file => $fasta_file,
-      -format => 'fasta');
-my $seq_out = Bio::SeqIO->new(-fh => \*STDOUT,
-       -format => 'fasta');
-my $seq_out_filt = Bio::SeqIO->new(-file => ">$out_filtered",
-    -format => 'fasta') if ($out_filtered);
-while (my $seq = $seq_in->next_seq){
-    my $stat = shift @stats;
-    die "Less rows in tab than sequences in seq file" unless $stat;
-    my ($id_tab, $length, $coverage) = split(/\t+/, $stat);
-    die "id, length or coverate not defined at $stat\n"
- unless ($id_tab && $length && $coverage);
-    my $id_seq = $seq->id;
-    die "Unmatched ids $id_seq and $id_tab\n" unless ($id_seq =~ /^$id_tab/i);
-    if ($length >= $length_co && $coverage >= $coverage_co){
- $seq_out->write_seq($seq);
-    } elsif ($out_filtered){
- $seq_out_filt->write_seq($seq);
-    } else {
- # do nothing
-    }
-}
-die "More rows in tab than sequences in seq file" if (scalar(@stats) > 0);
-exit 0;
-

diff -r 95ddc2380130 -r ff058438080a tools/spades_2_5/filter_spades_output.xml
--- a/tools/spades_2_5/filter_spades_output.xml Thu Nov 28 05:29:32 2013 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

@@ -1,33 +0,0 @@
-<tool id="filter_spades_output" name="Filter SPAdes output" version="0.1">
-  <description>remove low coverage and short contigs/scaffolds</description>
-  <command interpreter="perl">filter_spades_output.pl
-  --coverage-cutoff $coverage_co
-  --length-cutoff $length_co
-  #if $keep_leftover
-    --filtered-out $filtered_out
-  #end if
-  --tab $stats_in
-  $fasta_in > $fasta_output
-  </command>
-
-  <inputs>
-    <param name="fasta_in" type="data" format="fasta" label="Sequences" help="Contigs or scaffolds. Make sure you input the corresponding stat file" />
-    <param name="stats_in" type="data" format="tabular" label="Contig stats" />
-    <param name="length_co" type="integer" value="1000" min="0" label="Length cut-off" help="Contigs with length under that value are shown in red" />
-    <param name="coverage_co" type="integer" value="10" min="0" label="Coverage cut-off" help="Contigs with length under that value are shown in red" />
-    <param name="keep_leftover" type="boolean" checked="false" label="Save filtered-out sequences?" />
-  </inputs>
-  <outputs>
-    <data format="fasta" name="fasta_output" label="Filtered sequences" />
-    <data format="fasta" name="filtered_out" label="Discarded sequences">
-      <filter>keep_leftover == "true"</filter>
-    </data>
-  </outputs>
-  <help>
-**What it does**
-
-Using the output of SPAdes (a fasta and a stats file, either from contigs or scaffolds), it filters the fasta files, discarding all sequences that are under a given length or under a given coverage.
-
-Typically, this is used to discard short contigs, or contaminations. To display a coverage vs. length plot, use the "SPAdes stats" tool in the same package.
-  </help>
-</tool>

diff -r 95ddc2380130 -r ff058438080a tools/spades_2_5/plot_spades_stats.xml
--- a/tools/spades_2_5/plot_spades_stats.xml Thu Nov 28 05:29:32 2013 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

[

@@ -1,80 +0,0 @@
-<tool id="plot_spades_stats" name="SPAdes stats" version="0.1">
-  <description>coverage vs. length plot</description>
-  <requirements>
-    <requirement type="package">R</requirement>
-  </requirements>
-  <command interpreter="bash">r_wrapper.sh $script_file</command>
-
-  <inputs>
-    <param name="input_scaffolds" type="data" format="tabular" label="Scaffold stats"/>
-    <param name="input_contigs" type="data" format="tabular" label="Contig stats"/>
-    <param name="length_co" type="integer" value="1000" min="0" label="Length cut-off" help="Contigs with length under that value are shown in red"/>
-    <param name="coverage_co" type="integer" value="10" min="0" label="Coverage cut-off" help="Contigs with length under that value are shown in red"/>
-  </inputs>
-  <configfiles>
-    <configfile name="script_file">
-## Setup R error handling to go to stderr
-options( show.error.messages=F,
-  error = function () {
-    cat( geterrmessage(), file=stderr() ); q( "no", 1, F )
-} )
-files = c("${input_contigs}", "${input_scaffolds}")
-types = c("Contigs", "Scaffolds")
-
-## Start plotting device
-png("${out_file}", w=500, h=1000)
-par(mfrow=c(2,1))
-
-## Loop over the two files
-for (i in 1:length(types)){
-  seqs = read.table(files[i], header=FALSE, comment.char="#")
-  colnames = c("name", "length", "coverage")
-  names(seqs) = colnames
-
-  ## Stats over all sequences
-  sl_all = sort(seqs\$length, decreasing=TRUE)
-  cs_all = cumsum(sl_all)
-  s_all = sum(seqs\$length)
-  n50_idx_all = which.min(sl_all[cs_all < 0.5*s_all])
-  n90_idx_all = which.min(sl_all[cs_all < 0.9*s_all])
-  n50_all = sl_all[n50_idx_all]
-  n90_all = sl_all[n90_idx_all]
-
-  ## Filter short seqs, redo stats
-  seqs_filt = seqs[seqs\$length >= ${length_co} & seqs\$coverage >= ${coverage_co},]
-  if (nrow(seqs_filt) > 0){
-    sl_filt = sort(seqs_filt\$length, decreasing=TRUE)
-    cs_filt = cumsum(sl_filt)
-    s_filt = sum(seqs_filt\$length)
-    n50_idx_filt = which.min(sl_filt[cs_filt < 0.5*s_filt])
-    n90_idx_filt = which.min(sl_filt[cs_filt < 0.9*s_filt])
-    n50_filt = sl_filt[n50_idx_filt]
-    n90_filt = sl_filt[n90_idx_filt]
-  }
-  seqs_bad = seqs[seqs\$length < ${length_co} | seqs\$coverage < ${coverage_co},]
-
-  ## Length vs coverage
-  plot(length~coverage, data=seqs, log="xy", type="n", main=paste(types[i], ": coverage vs. length", sep=""), xlab="Coverage", ylab="Length")
-  if (nrow(seqs_bad) > 0){
-    points(length~coverage, data=seqs_bad, cex=0.5, col="red")
-  }
-  if (nrow(seqs_filt) > 0){
-    points(length~coverage, data=seqs_filt, cex=0.5, col="black")
-  }
-  abline(v=${coverage_co}, h=${length_co}, lty=2, col=grey(0.3))
-  legend(x="topleft", legend=c("Before/after filtering", paste(c("N50: ", "N90: ", "Median cov.: "), c(n50_all, n90_all, round(median(seqs\$coverage))), rep("/", 3), c(n50_filt, n90_filt, round(median(seqs_filt\$coverage))), sep="")), cex=0.8)
-}
-dev.off()
-    </configfile>
-  </configfiles>
-  <outputs>
-    <data format="png" name="out_file" />
-  </outputs>
-  <help>
-**What it does**
-
-Using the output of SPAdes (a pair of fasta file and stat file for each of the contigs and scaffolds), it produces a coverage vs. contig plot. Each dot represent a contig/scaffold. Given a coverage and a length cutoff, sequences that do not meet those criteria are shown in red. Some statistics are also given (N50, N90, median contig/scaffold length) both before and after filtering.
-
-Use the "filter SPAdes output" tool to actually filter sequences.
-  </help>
-</tool>
\ No newline at end of file

diff -r 95ddc2380130 -r ff058438080a tools/spades_2_5/r_wrapper.sh
--- a/tools/spades_2_5/r_wrapper.sh Thu Nov 28 05:29:32 2013 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

@@ -1,23 +0,0 @@
-#!/bin/sh
-
-### Run R providing the R script in $1 as standard input and passing
-### the remaining arguments on the command line
-
-# Function that writes a message to stderr and exits
-function fail
-{
- echo "$@" >&2
- exit 1
-}
-
-# Ensure R executable is found
-which R > /dev/null || fail "'R' is required by this tool but was not found on path"
-
-# Extract first argument
-infile=$1; shift
-
-# Ensure the file exists
-test -f $infile || fail "R input file '$infile' does not exist"
-
-# Invoke R passing file named by first argument to stdin
-R --vanilla --slave $* < $infile

diff -r 95ddc2380130 -r ff058438080a tools/spades_2_5/spades.pl
--- a/tools/spades_2_5/spades.pl Thu Nov 28 05:29:32 2013 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

@@ -1,84 +0,0 @@
-#!/usr/bin/env perl
-## A wrapper script to call spades.py and collect its output
-use strict;
-use warnings;
-use File::Temp qw/ tempfile tempdir /;
-use File::Copy;
-use Getopt::Long;
-
-# Parse arguments
-my ($out_contigs_file,
-    $out_contigs_stats,
-    $out_scaffolds_file,
-    $out_scaffolds_stats,
-    $out_log_file,
-    @sysargs) = @ARGV;
-
-## GetOptions not compatible with parsing the rest of the arguments in an array.
-## Keeping the not-so-nice parse-in-one-go method, without named arguments.
-# GetOptions(
-#     'contigs-file=s'    => \$out_contigs_file,
-#     'contigs-stats=s'   => \$out_contigs_stats,
-#     'scaffolds-file=s'  => \$out_scaffolds_file,
-#     'scaffolds-stats=s' => \$out_scaffolds_stats,
-#     'out_log_file=s'    => \$out_log_file,
-# );
-
-# my @sysargs = @ARGV;
-
-# Create temporary folder to store files, delete after use
-#my $output_dir = tempdir( CLEANUP => 0 );
-my $output_dir = 'output_dir';
-# Link "dat" files as fastq, otherwise spades complains about file format
-
-# Create log handle
-open my $log, '>', $out_log_file or die "Cannot write to $out_log_file: $?\n";
-
-# Run program
-# To do: record time
-&runSpades(@sysargs);
-&collectOutput();
-&extractCoverageLength($out_contigs_file, $out_contigs_stats);
-&extractCoverageLength($out_scaffolds_file, $out_scaffolds_stats);
-print $log "Done\n";
-close $log;
-exit 0;
-
-# Run spades
-sub runSpades {
-    my $cmd = join(" ", @_) . " -o $output_dir";
-    my $return_code = system($cmd);
-    if ($return_code) {
- print $log "Failed with code $return_code\nCommand $cmd\nMessage: $?\n";
- die "Failed with code $return_code\nCommand $cmd\nMessage: $?\n";
-    }
-    return 0;
-}
-
-# Collect output
-sub collectOutput{
-    # To do: check that the files are there
-    # Collects output
-    move "$output_dir/contigs.fasta", $out_contigs_file;
-    move "$output_dir/scaffolds.fasta", $out_scaffolds_file;
-    open LOG, '<', "$output_dir/spades.log"
- or die "Cannot open log file $output_dir/spades.log: $?";
-    print $log $_ while (<LOG>);
-    return 0;
-}
-
-# Extract
-sub extractCoverageLength{
-    my ($in, $out) = @_;
-    open FASTA, '<', $in or die $!;
-    open TAB, '>', $out or die $!;
-    print TAB "#name\tlength\tcoverage\n";
-    while (<FASTA>){
- next unless /^>/;
- chomp;
- die "Not all elements found in $_\n" if (! m/^>NODE_(\d+)_length_(\d+)_cov_(\d+\.*\d*)_/);
- my ($n, $l, $cov) = ($1, $2, $3);
- print TAB "NODE_$n\t$l\t$cov\n";
-    }
-    close TAB;
-}

diff -r 95ddc2380130 -r ff058438080a tools/spades_2_5/spades.xml
--- a/tools/spades_2_5/spades.xml Thu Nov 28 05:29:32 2013 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

@@ -1,133 +0,0 @@
-<tool id="spades" name="spades" version="0.7">
-  <description>SPAdes genome assembler for regular and single-cell projects</description>
-  <requirements>
-    <requirement type="package" version="2.5.1">spades</requirement>
-  </requirements>
-  <command interpreter="perl">spades.pl
-     $out_contigs
-     $out_contig_stats
-     $out_scaffolds
-     $out_scaffold_stats
-     $out_log
-    ## A real command looks like: spades.py -k 21,33,55,77,99,127 --careful -1 Y.fastq.gz -2 X.fastq.gz -t 24 -o output
-    spades.py
-    ## Forces unzipped output, faster
-    --disable-gzip-output
-    $sc
-    $onlyassembler
-    $careful
-    ##$rectangles
-    -t \${GALAXY_SLOTS:-16}
-    -k "$kmers"
-    ##-i $iterations
-    ##--phred-offset
-    ## Sequence files
-    #for $i, $library in enumerate( $libraries )
-      #set num=$i+1
-      #if str( $library.lib_type ) == "paired_end":
-        #set prefix = 'pe'
-      #else:
-        #set prefix = 'mp'
-      #end if
-      --$prefix$num-$library.orientation
-      #for $file in $library.files
- #if $file.file_type.type == "separate"
-          --$prefix$num-1 fastq:$file.file_type.fwd_reads
-          --$prefix$num-2 fastq:$file.file_type.rev_reads
-        #elif $file.file_type.type == "interleaved"
-          --$prefix$num-12 fastq:$file.file_type.interleaved_reads
-        #elif $file.file_type.type == "unpaired"
-          --$prefix$num-s fastq:$file.file_type.unpaired_reads
-        #end if
-      #end for
-    #end for
-  </command>
-  <inputs>
-    <param name="sc" type="boolean" truevalue="--sc" falsevalue="" label="Single-cell?" help="This option is required for MDA (single-cell) data.">
-      <option value="false">No</option>
-      <option value="true">Yes</option>
-    </param>
-    <param name="onlyassembler" type="boolean" truevalue="--only-assembler" falsevalue="" checked="False" label="Run only assembly? (without read error correction)" />
-    <param name="careful" type="boolean" truevalue="--careful" falsevalue="" checked="True" label="Careful correction?" help="Tries to reduce number of mismatches and short indels. Also runs MismatchCorrector – a post processing tool, which uses BWA tool (comes with SPAdes)." />
-    
-    
-    <param name="kmers" type="text" label="K-mers to use, separated by commas" value="21,33,55" help="Comma-separated list of k-mer sizes to be used (all values must be odd, less than 128, listed in ascending order, and smaller than the read length). The default value is 21,33,55." >
-    </param>
-    
-    <repeat name="libraries" title="Libraries" min="1" help="It is not possible to specify only mate-pair libraries. Scaffolds are not produced if neither a paired-end nor a mate-pair library is provided.">
-      <param name="lib_type" type="select" label="Library type">
- <option value="paired_end">Paired-end / Single reads</option>
- <option value="mate_paired">Mate pairs</option>
-      </param>
-      <param name="orientation" type="select" label="Orientation">
- <option value="fr" selected="true">-> <- (fr)</option>
- <option value="rf"><- -> (rf)</option>
- <option value="ff">-> -> (ff)</option>
-      </param>
-      <repeat name="files" title="Files" min="1">
- <conditional name="file_type">
-   <param name="type" type="select" label="Select file format">
-     <option value="separate">Separate input files</option>
-     <option value="interleaved">Interleaved files</option>
-     <option value="unpaired">Unpaired/Single reads</option>
-   </param>
-   <when value="separate">
-     <param name="fwd_reads" type="data" format="fastq" label="Forward reads" help="FASTQ format" />
-     <param name="rev_reads" type="data" format="fastq" label="Reverse reads" help="FASTQ format" />
-   </when>
-   <when value="interleaved">
-     <param name="interleaved_reads" type="data" format="fastq" label="Interleaved paired reads" help="FASTQ format" />
-   </when>
-   <when value="unpaired">
-     <param name="unpaired_reads" type="data" format="fastq" label="Unpaired reads" help="FASTQ format" />
-   </when>
- </conditional>
-      </repeat>
-    </repeat>
-  </inputs>
-  <outputs>
-    <data name="out_contigs" format="fasta" label="SPAdes contigs (fasta)" />
-    <data name="out_contig_stats" format="tabular" label="SPAdes contig stats" />
-    <data name="out_scaffolds" format="fasta" label="SPAdes scaffolds (fasta)" />
-    <data name="out_scaffold_stats" format="tabular" label="SPAdes scaffold stats" />
-    <data name="out_log" format="txt" label="SPAdes log" />
-  </outputs>
-  
-  <help>
-**What it does**
-
-SPAdes – St. Petersburg genome assembler – is intended for both standard isolates and single-cell MDA bacteria assemblies. See http://bioinf.spbau.ru/en/spades for more details on SPAdes.
-
-This wrapper runs SPAdes 2.5.1, collects the output, and throws away all the temporary files. It also produces a tab file with contig names, length and coverage.
-
-**SPAdes citation**
-
-Anton Bankevich, Sergey Nurk, Dmitry Antipov, Alexey A. Gurevich, Mikhail Dvorkin, Alexander S. Kulikov, Valery M. Lesin, Sergey I. Nikolenko, Son Pham, Andrey D. Prjibelski, Alexey V. Pyshkin, Alexander V. Sirotkin, Nikolay Vyahhi, Glenn Tesler, Max A. Alekseyev, and Pavel A. Pevzner. Journal of Computational Biology. May 2012, 19(5): 455-477. doi:10.1089/cmb.2012.0021.
-
-**License**
-
-SPAdes is developed by and copyrighted to Saint-Petersburg Academic University, and is released under GPLv2.
-
-This wrapper is copyrighted by Lionel Guy, and is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
-
-This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with this program.  If not, see http://www.gnu.org/licenses/.
-
-** Acknowledgments **
-
-Anton Korobeynikov greatlty helped understanding how SPAdes work, and integrated handy features into SPAdes.
-
-Nicola Soranzo fixed bugs in the 0.6 version.
-  </help>
-</tool>

diff -r 95ddc2380130 -r ff058438080a tools/spades_2_5/tool_dependencies.xml
--- a/tools/spades_2_5/tool_dependencies.xml Thu Nov 28 05:29:32 2013 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

@@ -1,33 +0,0 @@
-<?xml version="1.0"?>
-<tool_dependency>
-    <package name="spades" version="2.5.1">
-        <install version="1.0">
-            <actions>
-                <action type="download_by_url">http://spades.bioinf.spbau.ru/release2.5.1/SPAdes-2.5.1-Linux.tar.gz</action>
-                
- <action type="make_directory">$INSTALL_DIR/bin</action>
- <action type="make_directory">$INSTALL_DIR/share</action>
- <action type="move_directory_files">
-   <source_directory>bin</source_directory>
-   <destination_directory>$INSTALL_DIR/bin</destination_directory>
- </action>
-                <action type="move_directory_files">
-   <source_directory>share</source_directory>
-   <destination_directory>$INSTALL_DIR/share</destination_directory>
- </action>
- 
- 
-                
-                
-                
-            </actions>
-        </install>
-        <readme>
-This installs SPAdes 2.5.1.
-
-See manual here http://spades.bioinf.spbau.ru/release2.5.1/manual.html
-See also here http://bioinf.spbau.ru/en/spades
-        </readme>
-    </package>
-</tool_dependency>
-

diff -r 95ddc2380130 -r ff058438080a tools/spades_3_0/filter_spades_output.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/spades_3_0/filter_spades_output.pl Wed Feb 05 05:19:03 2014 -0500

[

@@ -0,0 +1,106 @@
+#!/usr/bin/perl -w
+
+=head1 SYNOPSIS
+
+filter_spades_output.pl - Filters contigs or scaffolds based on contig length and coverage.
+
+=head1 USAGE
+
+filter_spades_output.pl [-c|--coverage-cutoff] [-l|--length-cutoff] [-o|--filtered-out out.fasta] -t|--tab stats.tab seqs.fasta
+
+=head1 INPUT
+
+=head2 [-c|--coverage-cutoff]
+
+Minimum coverage. Contigs with lower coverage will be discarded. Default 10.
+
+=head2 [-l|--length-cutoff]
+
+Minimum length. Shorter contigs will be discarded. Default 500.
+
+=head2 -t|--tab stats.tab
+
+A tabular file, with three columns: contig name, length, and coverage:
+
+NODE_1 31438 24.5116
+NODE_2 31354 2316.96
+NODE_3 26948 82.3294
+
+Such a file is produced by spades.xml. Contigs should be in the same order as in the fasta file.
+
+=head2 [-o|--filtered-out out.fasta]
+
+If specified, filtered out sequences will be written to this file.
+
+=head2 seqs.fasta
+
+Sequences in fasta format. Start of IDs must match ids in the tabular file.
+
+=head1 OUTPUT
+
+A fasta file on stdout.
+
+=head1 AUTHOR
+
+Lionel Guy (lionel.guy@icm.uu.se)
+
+=head1 DATE
+
+Thu Aug 29 13:51:13 CEST 2013
+
+=cut
+
+# libraries
+use strict;
+use Getopt::Long;
+use Bio::SeqIO;
+
+# Default cut-offs, overridable from the command line
+my $coverage_co = 10;
+my $length_co = 500;
+my $out_filtered;
+my $tab_file;
+
+# Abort on unknown or malformed options instead of carrying on with the
+# defaults; the two cut-offs are numeric, so let Getopt::Long validate them.
+GetOptions(
+    'c|coverage-cutoff=f' => \$coverage_co,
+    'l|length-cutoff=f' => \$length_co,
+    'o|filtered-out=s' => \$out_filtered,
+    't|tab=s' => \$tab_file,
+) or die "Invalid command-line options\n";
+my $fasta_file = shift(@ARGV);
+die ("No tab file specified") unless ($tab_file);
+die ("No fasta file specified") unless ($fasta_file);
+
+## Read tab file, discard comment rows and empty rows.
+## Rows are kept in file order: they are matched one-to-one with the
+## sequences of the fasta file further down.
+open my $tab_fh, '<', $tab_file or die "Cannot open $tab_file: $!\n";
+my @stats;
+while (my $row = <$tab_fh>){
+    chomp $row;
+    next if ($row =~ /^#/ || $row !~ /\S/);
+    push @stats, $row;
+}
+close $tab_fh;
+
+## Read fasta; sequences passing both cut-offs are written to stdout, the
+## others to the "filtered out" file if one was requested
+my $seq_in = Bio::SeqIO->new(-file => $fasta_file,
+      -format => 'fasta');
+my $seq_out = Bio::SeqIO->new(-fh => \*STDOUT,
+       -format => 'fasta');
+# Declared on its own line: the behaviour of "my ... if ..." is undefined
+my $seq_out_filt;
+$seq_out_filt = Bio::SeqIO->new(-file => ">$out_filtered",
+    -format => 'fasta') if ($out_filtered);
+while (my $seq = $seq_in->next_seq){
+    # Rows of the tab file are consumed in the same order as the sequences
+    my $stat = shift @stats;
+    die "Less rows in tab than sequences in seq file" unless defined $stat;
+    my ($id_tab, $length, $coverage) = split(/\t+/, $stat);
+    # Test for missing/empty fields rather than truth: a value of 0 is valid
+    die "id, length or coverate not defined at $stat\n"
+        if (grep { !defined $_ || $_ eq '' } ($id_tab, $length, $coverage));
+    my $id_seq = $seq->id;
+    # The tab id must be a prefix of the sequence id. It is quoted because it
+    # may contain regex metacharacters, and must not be followed by a digit,
+    # so that NODE_1 does not match NODE_10.
+    die "Unmatched ids $id_seq and $id_tab\n"
+        unless ($id_seq =~ /^\Q$id_tab\E(?!\d)/i);
+    if ($length >= $length_co && $coverage >= $coverage_co){
+        $seq_out->write_seq($seq);
+    } elsif ($out_filtered){
+        $seq_out_filt->write_seq($seq);
+    }
+}
+die "More rows in tab than sequences in seq file" if (scalar(@stats) > 0);
+exit 0;
+

diff -r 95ddc2380130 -r ff058438080a tools/spades_3_0/filter_spades_output.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/spades_3_0/filter_spades_output.xml Wed Feb 05 05:19:03 2014 -0500

@@ -0,0 +1,33 @@
+<tool id="filter_spades_output" name="Filter SPAdes output" version="0.1">
+  <description>remove low coverage and short contigs/scaffolds</description>
+  <!-- Kept sequences are written to stdout and redirected to $fasta_output;
+       discarded sequences are only written when keep_leftover is set. -->
+  <command interpreter="perl">filter_spades_output.pl
+  --coverage-cutoff $coverage_co
+  --length-cutoff $length_co
+  #if $keep_leftover
+    --filtered-out $filtered_out
+  #end if
+  --tab $stats_in
+  $fasta_in > $fasta_output
+  </command>
+
+  <inputs>
+    <param name="fasta_in" type="data" format="fasta" label="Sequences" help="Contigs or scaffolds. Make sure you input the corresponding stat file" />
+    <param name="stats_in" type="data" format="tabular" label="Contig stats" />
+    <param name="length_co" type="integer" value="1000" min="0" label="Length cut-off" help="Contigs shorter than this value are discarded" />
+    <param name="coverage_co" type="integer" value="10" min="0" label="Coverage cut-off" help="Contigs with coverage under this value are discarded" />
+    <param name="keep_leftover" type="boolean" checked="false" label="Save filtered-out sequences?" />
+  </inputs>
+  <outputs>
+    <data format="fasta" name="fasta_output" label="Filtered sequences" />
+    <data format="fasta" name="filtered_out" label="Discarded sequences">
+      <filter>keep_leftover == "true"</filter>
+    </data>
+  </outputs>
+  <help>
+**What it does**
+
+Using the output of SPAdes (a fasta and a stats file, either from contigs or scaffolds), it filters the fasta files, discarding all sequences that are under a given length or under a given coverage.
+
+Typically, this is used to discard short contigs, or contaminations. To display a coverage vs. length plot, use the "SPAdes stats" tool in the same package.
+  </help>
+</tool>

diff -r 95ddc2380130 -r ff058438080a tools/spades_3_0/plot_spades_stats.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/spades_3_0/plot_spades_stats.xml Wed Feb 05 05:19:03 2014 -0500

[

@@ -0,0 +1,80 @@
+<tool id="plot_spades_stats" name="SPAdes stats" version="0.1">
+  <description>coverage vs. length plot</description>
+  <requirements>
+    <requirement type="package">R</requirement>
+  </requirements>
+  <!-- The R script below is written to a config file and fed to R on stdin
+       by r_wrapper.sh. -->
+  <command interpreter="bash">r_wrapper.sh $script_file</command>
+
+  <inputs>
+    <param name="input_scaffolds" type="data" format="tabular" label="Scaffold stats"/>
+    <param name="input_contigs" type="data" format="tabular" label="Contig stats"/>
+    <param name="length_co" type="integer" value="1000" min="0" label="Length cut-off" help="Contigs with length under that value are shown in red"/>
+    <param name="coverage_co" type="integer" value="10" min="0" label="Coverage cut-off" help="Contigs with coverage under that value are shown in red"/>
+  </inputs>
+  <configfiles>
+    <configfile name="script_file">
+## Setup R error handling to go to stderr
+options( show.error.messages=F,
+  error = function () {
+    cat( geterrmessage(), file=stderr() ); q( "no", 1, F )
+} )
+files = c("${input_contigs}", "${input_scaffolds}")
+types = c("Contigs", "Scaffolds")
+
+## Nx statistic (N50 for fraction=0.5, N90 for fraction=0.9): length of the
+## shortest sequence among the longest ones that together cover at least the
+## given fraction of the total length. Returns NA if there is no sequence.
+nx = function (lengths, fraction) {
+  if (length(lengths) == 0) return(NA)
+  sorted = sort(lengths, decreasing=TRUE)
+  sorted[which(cumsum(sorted) >= fraction * sum(sorted))[1]]
+}
+
+## Start plotting device
+png("${out_file}", w=500, h=1000)
+par(mfrow=c(2,1))
+
+## Loop over the two files
+for (i in 1:length(types)){
+  seqs = read.table(files[i], header=FALSE, comment.char="#")
+  colnames = c("name", "length", "coverage")
+  names(seqs) = colnames
+
+  ## Stats over all sequences
+  n50_all = nx(seqs\$length, 0.5)
+  n90_all = nx(seqs\$length, 0.9)
+
+  ## Split into sequences passing both cut-offs and the others, redo stats.
+  ## The filtered stats are recomputed for each file (NA if nothing passes),
+  ## so values from the previous file are never reused.
+  seqs_filt = seqs[seqs\$length >= ${length_co} & seqs\$coverage >= ${coverage_co},]
+  seqs_bad = seqs[seqs\$length < ${length_co} | seqs\$coverage < ${coverage_co},]
+  n50_filt = nx(seqs_filt\$length, 0.5)
+  n90_filt = nx(seqs_filt\$length, 0.9)
+
+  ## Length vs coverage
+  plot(length~coverage, data=seqs, log="xy", type="n", main=paste(types[i], ": coverage vs. length", sep=""), xlab="Coverage", ylab="Length")
+  if (nrow(seqs_bad) > 0){
+    points(length~coverage, data=seqs_bad, cex=0.5, col="red")
+  }
+  if (nrow(seqs_filt) > 0){
+    points(length~coverage, data=seqs_filt, cex=0.5, col="black")
+  }
+  abline(v=${coverage_co}, h=${length_co}, lty=2, col=grey(0.3))
+  legend(x="topleft", legend=c("Before/after filtering", paste(c("N50: ", "N90: ", "Median cov.: "), c(n50_all, n90_all, round(median(seqs\$coverage))), rep("/", 3), c(n50_filt, n90_filt, round(median(seqs_filt\$coverage))), sep="")), cex=0.8)
+}
+dev.off()
+    </configfile>
+  </configfiles>
+  <outputs>
+    <data format="png" name="out_file" />
+  </outputs>
+  <help>
+**What it does**
+
+Using the output of SPAdes (a pair of fasta file and stat file for each of the contigs and scaffolds), it produces a coverage vs. length plot. Each dot represents a contig/scaffold. Given a coverage and a length cutoff, sequences that do not meet those criteria are shown in red. Some statistics are also given (N50, N90, median coverage) both before and after filtering.
+
+Use the "filter SPAdes output" tool to actually filter sequences.
+  </help>
+</tool>
\ No newline at end of file

diff -r 95ddc2380130 -r ff058438080a tools/spades_3_0/r_wrapper.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/spades_3_0/r_wrapper.sh Wed Feb 05 05:19:03 2014 -0500

@@ -0,0 +1,23 @@
+#!/bin/sh
+
+### Run R providing the R script in $1 as standard input and passing
+### the remaining arguments on the command line
+
+# Function that writes a message to stderr and exits with status 1.
+# Declared with the POSIX "name()" form: the "function" keyword is a
+# bash/ksh extension that a plain /bin/sh is not required to understand.
+fail()
+{
+ echo "$@" >&2
+ exit 1
+}
+
+# Ensure R executable is found
+command -v R > /dev/null 2>&1 || fail "'R' is required by this tool but was not found on path"
+
+# Extract first argument (checked first: "shift" fails without arguments)
+[ "$#" -ge 1 ] || fail "Usage: r_wrapper.sh script.R [R arguments...]"
+infile=$1; shift
+
+# Ensure the file exists (quoted, the path may contain spaces)
+test -f "$infile" || fail "R input file '$infile' does not exist"
+
+# Invoke R passing file named by first argument to stdin; "$@" hands over
+# the remaining arguments one by one, without word splitting
+R --vanilla --slave "$@" < "$infile"

diff -r 95ddc2380130 -r ff058438080a tools/spades_3_0/spades.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/spades_3_0/spades.pl Wed Feb 05 05:19:03 2014 -0500

@@ -0,0 +1,84 @@
+#!/usr/bin/env perl
+## A wrapper script to call spades.py and collect its output
+use strict;
+use warnings;
+use File::Temp qw/ tempfile tempdir /;
+use File::Copy;
+use Getopt::Long;
+
+# Parse arguments: five output paths, followed by the complete SPAdes
+# command line (program name and its options)
+my ($out_contigs_file,
+    $out_contigs_stats,
+    $out_scaffolds_file,
+    $out_scaffolds_stats,
+    $out_log_file,
+    @sysargs) = @ARGV;
+die "Usage: spades.pl contigs.fasta contigs.tab scaffolds.fasta "
+    . "scaffolds.tab log.txt spades_command [spades_options]\n"
+    unless (defined $out_log_file && @sysargs);
+
+## GetOptions not compatible with parsing the rest of the arguments in an array.
+## Keeping the not-so-nice parse-in-one-go method, without named arguments.
+# GetOptions(
+#     'contigs-file=s'    => \$out_contigs_file,
+#     'contigs-stats=s'   => \$out_contigs_stats,
+#     'scaffolds-file=s'  => \$out_scaffolds_file,
+#     'scaffolds-stats=s' => \$out_scaffolds_stats,
+#     'out_log_file=s'    => \$out_log_file,
+# );
+
+# my @sysargs = @ARGV;
+
+# SPAdes writes into a fixed folder, relative to the current directory
+# (the tempdir variant below is disabled)
+#my $output_dir = tempdir( CLEANUP => 0 );
+my $output_dir = 'output_dir';
+# NOTE(review): an older comment here mentioned linking "dat" files as fastq;
+# no linking is done anywhere in this script.
+
+# Create log handle; it is shared with the subroutines below
+open my $log, '>', $out_log_file or die "Cannot write to $out_log_file: $!\n";
+
+# Run program
+# To do: record time
+runSpades(@sysargs);
+collectOutput();
+extractCoverageLength($out_contigs_file, $out_contigs_stats);
+extractCoverageLength($out_scaffolds_file, $out_scaffolds_stats);
+print $log "Done\n";
+# Buffered write errors only show up when the handle is closed
+close $log or die "Cannot close $out_log_file: $!\n";
+exit 0;
+
+# Run spades
+# Arguments: the SPAdes command line, one word per element; "-o $output_dir"
+# is appended. Returns 0 on success; on failure the reason is written to the
+# log and the script dies.
+sub runSpades {
+    # List form of system(): the arguments were already split by the calling
+    # shell, so they are handed over as they are instead of being joined and
+    # parsed a second time (which breaks on spaces and metacharacters).
+    my @cmd = (@_, '-o', $output_dir);
+    my $status = system(@cmd);
+    if ($status != 0) {
+        # Decode the wait status: -1 means that the program was not started
+        my $reason = $status == -1   ? "could not be started: $!"
+                   : ($status & 127) ? sprintf("died with signal %d", $status & 127)
+                   :                   sprintf("exited with code %d", $status >> 8);
+        my $message = "Failed: command $reason\nCommand @cmd\n";
+        print $log $message;
+        die $message;
+    }
+    return 0;
+}
+
+# Collect output.
+# Moves contigs.fasta and scaffolds.fasta from $output_dir to the requested
+# output files, then appends $output_dir/spades.log to the wrapper log.
+# Returns 0; dies if spades.log cannot be opened.
+sub collectOutput{
+    # A file that cannot be moved is reported in the log instead of being
+    # silently ignored. It is not fatal, so the script still carries on
+    # exactly as it did when the result of move() was not checked.
+    for my $pair (["contigs.fasta",   $out_contigs_file],
+                  ["scaffolds.fasta", $out_scaffolds_file]) {
+        my ($name, $dest) = @$pair;
+        move("$output_dir/$name", $dest)
+            or print $log "Warning: could not move $output_dir/$name to $dest: $!\n";
+    }
+
+    # Append the SPAdes log; $! (not $?) holds the reason open() failed
+    open my $spades_log, '<', "$output_dir/spades.log"
+        or die "Cannot open log file $output_dir/spades.log: $!";
+    while (my $line = <$spades_log>) {
+        print $log $line;
+    }
+    close $spades_log;
+    return 0;
+}
+
+# Extract name, length and coverage from the headers of a fasta file and
+# write them to a tab-separated file, one line per sequence.
+# Arguments: $in  - fasta file whose headers start with
+#                   >NODE_<number>_length_<length>_cov_<coverage>
+#            $out - tabular file to create (overwritten if present)
+# Returns 1. Dies if a file cannot be opened, if the output cannot be
+# closed, or if a header does not match the expected pattern.
+sub extractCoverageLength{
+    my ($in, $out) = @_;
+    open my $fasta, '<', $in  or die "Cannot read $in: $!\n";
+    open my $tab,   '>', $out or die "Cannot write to $out: $!\n";
+    print $tab "#name\tlength\tcoverage\n";
+    while (my $header = <$fasta>) {
+        # Only header lines carry the information; skip sequence lines
+        next unless $header =~ /^>/;
+        chomp $header;
+        die "Not all elements found in $header\n"
+            if $header !~ m/^>NODE_(\d+)_length_(\d+)_cov_(\d+\.*\d*)/;
+        my ($node, $length, $coverage) = ($1, $2, $3);
+        print $tab "NODE_$node\t$length\t$coverage\n";
+    }
+    close $fasta;
+    # Buffered write errors only surface when the handle is closed
+    close $tab or die "Cannot close $out: $!\n";
+    return 1;
+}

diff -r 95ddc2380130 -r ff058438080a tools/spades_3_0/spades.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/spades_3_0/spades.xml Wed Feb 05 05:19:03 2014 -0500

b'@@ -0,0 +1,193 @@\n+<tool id="spades" name="spades" version="0.8">\n+ <description>SPAdes genome assembler for regular and single-cell projects</description>\n+ <requirements>\n+ <requirement type="package" version="3.0.0">spades</requirement>\n+ </requirements>\n+ <command interpreter="perl">spades.pl \n+ $out_contigs \n+ $out_contig_stats \n+ $out_scaffolds \n+ $out_scaffold_stats \n+ $out_log \n+ ## A real command looks like: spades.py -k 21,33,55,77,99,127 --careful -1 Y.fastq.gz -2 X.fastq.gz -t 24 -o output\n+ spades.py\n+ ## Forces unzipped output, faster\n+ --disable-gzip-output\n+ $sc\n+ $onlyassembler\n+ $careful\n+ -t \\${GALAXY_SLOTS:-16}\n+ -k "$kmers"\n+ $iontorrent\n+ ## Sequence files, libraries\n+ #for $i, $library in enumerate( $libraries )\n+ #set num=$i+1\n+ #if str( $library.lib_type ) == "paired_end":\n+ #set prefix = \'pe\'\n+ #else:\n+ #set prefix = \'mp\'\n+ #end if\n+ --$prefix$num-$library.orientation \n+ #for $file in $library.files\n+\t#if $file.file_type.type == "separate"\n+ --$prefix$num-1 fastq:$file.file_type.fwd_reads\n+ --$prefix$num-2 fastq:$file.file_type.rev_reads\n+ #elif $file.file_type.type == "interleaved"\n+ --$prefix$num-12 fastq:$file.file_type.interleaved_reads\n+ #elif $file.file_type.type == "unpaired"\n+ --$prefix$num-s fastq:$file.file_type.unpaired_reads\n+ #end if\n+ #end for\n+ #end for\n+ ## PacBio reads\n+ #for $i, $pacbiolib in enumerate( $pacbio )\n+ --pacbio fastq:$pacbiolib.pacbio_reads\n+ #end for\n+ ## Sanger\n+ #for $i, $sangerlib in enumerate( $sanger )\n+ --sanger $sangerlib.file_type.type:$sangerlib.file_type.sanger_reads\n+ #end for\n+ ## Contigs\n+ #for $i, $trustedcontigs in enumerate( $trustedcontigs )\n+ --trusted-contigs $trustedcontigs.file_type.type:$trustedcontigs.file_type.trusted_contigs\n+ #end for\n+ #for $i, $untrustedcontigs in enumerate( $untrustedcontigs )\n+ --untrusted-contigs $untrustedcontigs.file_type.type:$untrustedcontigs.file_type.untrusted_contigs\n+ #end for\n+ 
</command>\n+ <inputs>\n+ <param name="sc" type="boolean" truevalue="--sc" falsevalue="" label="Single-cell?" help="This option is required for MDA (single-cell) data.">\n+ <option value="false">No</option>\n+ <option value="true">Yes</option>\n+ </param>\n+ <param name="onlyassembler" type="boolean" truevalue="--only-assembler" falsevalue="" checked="False" label="Run only assembly? (without read error correction)" />\n+ <param name="careful" type="boolean" truevalue="--careful" falsevalue="" checked="True" label="Careful correction?" help="Tries to reduce number of mismatches and short indels. Also runs MismatchCorrector \xe2\x80\x93 a post processing tool, which uses BWA tool (comes with SPAdes)." />\n+ <param name="kmers" type="text" label="K-mers to use, separated by commas" value="21,33,55" help="Comma-separated list of k-mer sizes to be used (all values must be odd, less than 128, listed in ascending order, and smaller than the read length). The default value is 21,33,55." >\n+ </param>\n+ <param name="iontorrent" type="boolean" truevalue="--iontorrent" falsevalue="" checked="False" label="Libraries are IonTorrent reads?" />\n+ \n+ <repeat name="libraries" title="Libraries" min="1" help="It is not possible to specify only mate-pair libraries. 
Scaffolds are not produced if neither a paired-end nor a mate-pair library is provided.">\n+ <param name="lib_type" type="select" label="Library type">\n+\t<option value="paired_end">Paired-end / Single reads</option>\n+\t<option value="mate_paired">Mate pairs</option>\n+ </param>\n+ <param name="orientation" type="select" label="Orientation">\n+\t<option value="fr" selected="true">-> <- (fr)</option>\n+\t<option value="rf"><- -> (rf)</option>\n+\t<option value="ff">-> -> (ff)</option>\n+ </param>\n+ <repeat name="files" title="'..b' <conditional name="file_type">\n+\t<param name="type" type="select" label="Select file format">\n+\t <option value="fasta">fasta</option>\n+\t <option value="fastq">fastq</option>\n+\t</param>\n+\t<when value="fasta">\n+\t <param name="trusted_contigs" type="data" format="fasta" label="Trusted contigs" help="FASTA format" />\n+\t</when>\n+\t<when value="fastq">\n+\t <param name="trusted_contigs" type="data" format="fastq" label="Trusted contigs" help="FASTQ format" />\n+\t</when>\n+ </conditional>\n+ </repeat>\n+ <repeat name="untrustedcontigs" title="Untrusted contigs" help="Contigs of the same genome, quality of which is average or unknown. Contigs of poor quality can be used but may introduce errors in the assembly. 
This option is also not intended for contigs of the related species.">\n+ <conditional name="file_type">\n+\t<param name="type" type="select" label="Select file format">\n+\t <option value="fasta">fasta</option>\n+\t <option value="fastq">fastq</option>\n+\t</param>\n+\t<when value="fasta">\n+\t <param name="untrusted_contigs" type="data" format="fasta" label="Untrusted contigs" help="FASTA format" />\n+\t</when>\n+\t<when value="fastq">\n+\t <param name="untrusted_contigs" type="data" format="fastq" label="Untrusted contigsz" help="FASTQ format" />\n+\t</when>\n+ </conditional>\n+ </repeat>\n+ </inputs>\n+ <outputs>\n+ <data name="out_contigs" format="fasta" label="SPAdes contigs (fasta)" />\n+ <data name="out_contig_stats" format="tabular" label="SPAdes contig stats" />\n+ <data name="out_scaffolds" format="fasta" label="SPAdes scaffolds (fasta)" />\n+ <data name="out_scaffold_stats" format="tabular" label="SPAdes scaffold stats" />\n+ <data name="out_log" format="txt" label="SPAdes log" />\n+ </outputs>\n+ \n+ <help>\n+**What it does**\n+\n+SPAdes \xe2\x80\x93 St. Petersburg genome assembler \xe2\x80\x93 is intended for both standard isolates and single-cell MDA bacteria assemblies. See http://bioinf.spbau.ru/en/spades for more details on SPAdes.\n+\n+This wrapper runs SPAdes 3.0.0, collects the output, and throws away all the temporary files. It also produces a tab file with contig names, length and coverage. \n+\n+**SPAdes citation**\n+\n+Anton Bankevich, Sergey Nurk, Dmitry Antipov, Alexey A. Gurevich, Mikhail Dvorkin, Alexander S. Kulikov, Valery M. Lesin, Sergey I. Nikolenko, Son Pham, Andrey D. Prjibelski, Alexey V. Pyshkin, Alexander V. Sirotkin, Nikolay Vyahhi, Glenn Tesler, Max A. Alekseyev, and Pavel A. Pevzner. Journal of Computational Biology. May 2012, 19(5): 455-477. doi:10.1089/cmb.2012.0021. 
\n+ \n+**License**\n+\n+SPAdes is developed by and copyrighted to Saint-Petersburg Academic University, and is released under GPLv2.\n+\n+This wrapper is copyrighted by Lionel Guy, and is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.\n+\n+This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.\n+\n+You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/.\n+\n+**Acknowledgments**\n+\n+Anton Korobeynikov greatly helped in understanding how SPAdes works, and integrated handy features into SPAdes.\n+\n+Nicola Soranzo fixed various bugs.\n+ </help>\n+</tool>\n'

diff -r 95ddc2380130 -r ff058438080a tools/spades_3_0/tool_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/spades_3_0/tool_dependencies.xml Wed Feb 05 05:19:03 2014 -0500

@@ -0,0 +1,33 @@
+<?xml version="1.0"?>
+<tool_dependency>
+    <!-- name and version must match the <requirement> of spades.xml (package "spades", version 3.0.0) -->
+    <package name="spades" version="3.0.0">
+        <install version="1.0">
+            <actions>
+                <!-- Fetch the SPAdes 3.0.0 Linux archive -->
+                <action type="download_by_url">http://spades.bioinf.spbau.ru/release3.0.0/SPAdes-3.0.0-Linux.tar.gz</action>
+                <action type="make_directory">$INSTALL_DIR/bin</action>
+                <action type="make_directory">$INSTALL_DIR/share</action>
+                <action type="move_directory_files">
+                    <source_directory>bin</source_directory>
+                    <destination_directory>$INSTALL_DIR/bin</destination_directory>
+                </action>
+                <action type="move_directory_files">
+                    <source_directory>share</source_directory>
+                    <destination_directory>$INSTALL_DIR/share</destination_directory>
+                </action>
+                <!-- Put the installed bin directory on PATH so that the wrapper can call spades.py -->
+                <action type="set_environment">
+                    <environment_variable name="PATH" action="prepend_to">$INSTALL_DIR/bin</environment_variable>
+                </action>
+            </actions>
+        </install>
+        <readme>
+This installs SPAdes 3.0.0.
+
+See manual here http://spades.bioinf.spbau.ru/release3.0.0/manual.html
+See also here http://bioinf.spbau.ru/en/spades
+        </readme>
+    </package>
+</tool_dependency>
+