Mercurial > repos > lionelguy > spades
changeset 2:b5ce24f34dd7 draft
Uploaded
author | lionelguy |
---|---|
date | Thu, 05 Sep 2013 07:43:48 -0400 |
parents | 0f8b2da62d7d |
children | d82f18c76309 |
files | tools/spades_2_5/filter_spades_output.pl tools/spades_2_5/filter_spades_output.xml tools/spades_2_5/plot_spades_stats.xml tools/spades_2_5/r_wrapper.sh tools/spades_2_5/spades.pl tools/spades_2_5/spades.xml |
diffstat | 6 files changed, 304 insertions(+), 52 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/spades_2_5/filter_spades_output.pl Thu Sep 05 07:43:48 2013 -0400 @@ -0,0 +1,106 @@ +#!/usr/bin/perl -w + +=head1 SYNOPSIS + +filter_spades_output.pl - Filters contigs or scaffolds based on contig length and coverage. + +=head1 USAGE + +filter_spades_output.pl [-c|--coverage-cutoff] [-l|--length-cutoff] [-o|--filtered-out out.fasta] -t|--tab stats.tab seqs.fasta + +=head1 INPUT + +=head2 [-c|--coverage-cutoff] + +Mininum coverage. Contigs with lower coverage will be discarded. Default 10. + +=head2 [-l|--length-cutoff] + +Mininum coverage. Smaller ontigs will be discarded. Default 500. + +=head2 -t|--tab stats.tab + +A tabular file, with three columns: contig name, length, and coverage: + +NODE_1 31438 24.5116 +NODE_2 31354 2316.96 +NODE_3 26948 82.3294 + +Such a file is produced by spades.xml. Contigs should be in the same order as in the fasta file. + +=head2 [-o|--filtered-out out.fasta] + +If specified, filtered out sequences will be written to this file. + +=head2 seqs.fasta + +Sequences in fasta format. Start of IDs must match ids in the tabular file. + +=head1 OUTPUT + +A fasta file on stdout. + +=head1 AUTHOR + +Lionel Guy (lionel.guy@icm.uu.se) + +=head1 DATE + +Thu Aug 29 13:51:13 CEST 2013 + +=cut + +# libraries +use strict; +use Getopt::Long; +use Bio::SeqIO; + +my $coverage_co = 10; +my $length_co = 500; +my $out_filtered; +my $tab_file; + +GetOptions( + 'c|coverage-cutoff=s' => \$coverage_co, + 'l|length-cutoff=s' => \$length_co, + 'o|filtered-out=s' => \$out_filtered, + 't|tab=s' => \$tab_file, +); +my $fasta_file = shift(@ARGV); +die ("No tab file specified") unless ($tab_file); +die ("No fasta file specified") unless ($fasta_file); + +## Read tab file, discard rows with comments +open TAB, '<', $tab_file or die "$?"; +my @stats; +while (<TAB>){ + chomp; + push @stats, $_ unless (/^#/); +} + +## Read fasta +my $seq_in = Bio::SeqIO->new(-file => $fasta_file, + -format => 'fasta'); +my $seq_out = Bio::SeqIO->new(-fh => \*STDOUT, + -format => 'fasta'); +my $seq_out_filt = Bio::SeqIO->new(-file => ">$out_filtered", + -format => 'fasta') if ($out_filtered); +while (my $seq = $seq_in->next_seq){ + my $stat = shift @stats; + die "Less rows in tab than sequences in seq file" unless $stat; + my ($id_tab, $length, $coverage) = split(/\t+/, $stat); + die "id, length or coverate not defined at $stat\n" + unless ($id_tab && $length && $coverage); + my $id_seq = $seq->id; + die "Unmatched ids $id_seq and $id_tab\n" unless ($id_seq =~ /^$id_tab/i); + if ($length >= $length_co && $coverage >= $coverage_co){ + $seq_out->write_seq($seq); + } elsif ($out_filtered){ + $seq_out_filt->write_seq($seq); + } else { + # do nothing + } +} +die "More rows in tab than sequences in seq file" if (scalar(@stats) > 0); +exit 0; +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/spades_2_5/filter_spades_output.xml Thu Sep 05 07:43:48 2013 -0400 @@ -0,0 +1,33 @@ +<tool id="filter_spades_output" name="Filter SPAdes output" version="0.1"> + <description>remove low coverage and short contigs/scaffolds</description> + <command interpreter="perl">filter_spades_output.pl + --coverage-cutoff $coverage_co + --length-cutoff $length_co + #if $keep_leftover + --filtered-out $filtered_out + #end if + --tab $stats_in + $fasta_in > $fasta_output + </command> + + <inputs> + <param name="fasta_in" type="data" format="fasta" label="Sequences" help="Contigs or scaffolds. Make sure you input the corresponding stat file" /> + <param name="stats_in" type="data" format="tabular" label="Contig stats" /> + <param name="length_co" type="integer" value="1000" min="0" label="Length cut-off" help="Contigs with length under that value are shown in red" /> + <param name="coverage_co" type="integer" value="10" min="0" label="Coverage cut-off" help="Contigs with length under that value are shown in red" /> + <param name="keep_leftover" type="boolean" checked="false" label="Save filtered-out sequences?" /> + </inputs> + <outputs> + <data format="fasta" name="fasta_output" label="Filtered sequences" /> + <data format="fasta" name="filtered_out" label="Discarded sequences"> + <filter>keep_leftover == "true"</filter> + </data> + </outputs> + <help> +**What it does** + +Using the output of SPAdes (a fasta and stats file, either from contigs or scaffolds), it filters the fasta files, discarding all sequences that are under a given length or under a given coverage. + +Typically, this is used to discard short contigs, or contaminations. To dispaly a coverage vs. length plot, use the "SPAdes stats" tool in the same package. + </help> +</tool> \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/spades_2_5/plot_spades_stats.xml Thu Sep 05 07:43:48 2013 -0400 @@ -0,0 +1,80 @@ +<tool id="plot_spades_stats" name="SPAdes stats" version="0.1"> + <description>coverage vs. length plot</description> + <requirements> + <requirement type="package">R</requirement> + </requirements> + <command interpreter="bash">r_wrapper.sh $script_file</command> + + <inputs> + <param name="input_scaffolds" type="data" format="tabular" label="Scaffold stats"/> + <param name="input_contigs" type="data" format="tabular" label="Contig stats"/> + <param name="length_co" type="integer" value="1000" min="0" label="Length cut-off" help="Contigs with length under that value are shown in red"/> + <param name="coverage_co" type="integer" value="10" min="0" label="Coverage cut-off" help="Contigs with length under that value are shown in red"/> + </inputs> + <configfiles> + <configfile name="script_file"> +## Setup R error handling to go to stderr +options( show.error.messages=F, + error = function () { + cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) +} ) +files = c("${input_contigs}", "${input_scaffolds}") +types = c("Contigs", "Scaffolds") + +## Start plotting device +png("${out_file}", w=500, h=1000) +par(mfrow=c(2,1)) + +## Loop over the two files +for (i in 1:length(types)){ + seqs = read.table(files[i], header=FALSE, comment.char="#") + colnames = c("name", "length", "coverage") + names(seqs) = colnames + + ## Stats over all sequences + sl_all = sort(seqs\$length, decreasing=TRUE) + cs_all = cumsum(sl_all) + s_all = sum(seqs\$length) + n50_idx_all = which.min(sl_all[cs_all < 0.5*s_all]) + n90_idx_all = which.min(sl_all[cs_all < 0.9*s_all]) + n50_all = sl_all[n50_idx_all] + n90_all = sl_all[n90_idx_all] + + ## Filter short seqs, redo stats + seqs_filt = seqs[seqs\$length >= ${length_co} & seqs\$coverage >= ${coverage_co},] + if (nrow(seqs_filt) > 0){ + sl_filt = sort(seqs_filt\$length, decreasing=TRUE) + cs_filt = cumsum(sl_filt) + s_filt = sum(seqs_filt\$length) + n50_idx_filt = which.min(sl_filt[cs_filt < 0.5*s_filt]) + n90_idx_filt = which.min(sl_filt[cs_filt < 0.9*s_filt]) + n50_filt = sl_filt[n50_idx_filt] + n90_filt = sl_filt[n90_idx_filt] + } + seqs_bad = seqs[seqs\$length < ${length_co} | seqs\$coverage < ${coverage_co},] + + ## Length vs coverage + plot(length~coverage, data=seqs, log="xy", type="n", main=paste(types[i], ": coverage vs. length", sep=""), xlab="Coverage", ylab="Length") + if (nrow(seqs_bad) > 0){ + points(length~coverage, data=seqs_bad, cex=0.5, col="red") + } + if (nrow(seqs_filt) > 0){ + points(length~coverage, data=seqs_filt, cex=0.5, col="black") + } + abline(v=${coverage_co}, h=${length_co}, lty=2, col=grey(0.3)) + legend(x="topleft", legend=c("Before/after filtering", paste(c("N50: ", "N90: ", "Median cov.: "), c(n50_all, n90_all, round(median(seqs\$coverage))), rep("/", 3), c(n50_filt, n90_filt, round(median(seqs_filt\$coverage))), sep="")), cex=0.8) +} +dev.off() + </configfile> + </configfiles> + <outputs> + <data format="png" name="out_file" /> + </outputs> + <help> +**What it does** + +Using the output of SPAdes (a pair of fasta file and stat file for each of the contigs and scaffolds), it produces a coverage vs. contig plot. Each dot represent a contig/scaffold. Given a coverage and a length cutoff, sequences that do not meet those criteria are shown in red. Some statistics are also given (N50, N90, median contig/scaffold length) both before and after filtering. + +Use the "filter SPAdes output" tool to actually filter sequences. + </help> +</tool> \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/spades_2_5/r_wrapper.sh Thu Sep 05 07:43:48 2013 -0400 @@ -0,0 +1,23 @@ +#!/bin/sh + +### Run R providing the R script in $1 as standard input and passing +### the remaining arguments on the command line + +# Function that writes a message to stderr and exits +function fail +{ + echo "$@" >&2 + exit 1 +} + +# Ensure R executable is found +which R > /dev/null || fail "'R' is required by this tool but was not found on path" + +# Extract first argument +infile=$1; shift + +# Ensure the file exists +test -f $infile || fail "R input file '$infile' does not exist" + +# Invoke R passing file named by first argument to stdin +R --vanilla --slave $* < $infile
--- a/tools/spades_2_5/spades.pl Mon Aug 19 09:06:17 2013 -0400 +++ b/tools/spades_2_5/spades.pl Thu Sep 05 07:43:48 2013 -0400 @@ -28,7 +28,7 @@ # Create temporary folder to store files, delete after use #my $output_dir = tempdir( CLEANUP => 0 ); -my $output_dir = tempdir( CLEANUP => 1 ); +my $output_dir = 'output_dir'; # Link "dat" files as fastq, otherwise spades complains about file format # Create log handle @@ -72,13 +72,14 @@ my ($in, $out) = @_; open FASTA, '<', $in or die $!; open TAB, '>', $out or die $!; + print TAB "#name\tlength\tcoverage\n"; while (<FASTA>){ next unless /^>/; chomp; my @a = split(/\s/, $_); my ($NODE, $n, $LENGTH, $l, $COV, $cov) = split(/_/, $a[0]); die "Not all elements found in $_\n" unless ($n && $l && $cov); - print TAB "$n\t$l\t$cov\n"; + print TAB "NODE_$n\t$l\t$cov\n"; } close TAB; }
--- a/tools/spades_2_5/spades.xml Mon Aug 19 09:06:17 2013 -0400 +++ b/tools/spades_2_5/spades.xml Thu Sep 05 07:43:48 2013 -0400 @@ -1,4 +1,4 @@ -<tool id="spades" name="spades" version="0.4"> +<tool id="spades" name="spades" version="0.5"> <description>SPAdes genome assembler for regular and single-cell projects</description> <requirements> <requirement type="package" version="2.5.0">spades</requirement> @@ -11,70 +11,79 @@ $out_log ## A real command looks like: spades.py -k 21,33,55,77,99,127 --careful -1 Y.fastq.gz -2 X.fastq.gz -t 24 -o output spades.py - ## TODO: kmers, threads, other options (-sc for single-cell) - #if $sc == "true": - --sc - #end if - #if $careful == "true": - --careful - #end if - #if $rectangle == "true" - --rectangle - #end if + $sc + $onlyassembler + $careful + $rectangle -t $threads -k $kmers -i $iterations ##--phred-offset ## Sequence files - #for $i, $s in enumerate( $reads ) - #if $s.read_type.type == "pairedend" - -1 $s.read_type.fwd_reads - -2 $s.read_type.rev_reads - #elif $s.read_type.type == "interleaved" - --12 $s.read_type.interleaved_reads - #elif $s.read_type.type == "unpaired" - -s $s.read_type.unpaired_reads + #for $i, $library in enumerate( $libraries ) + #set num=$i+1 + #if str( $library.lib_type ) == "paired_end": + #set prefix = 'pe' + #else: + #set prefix = 'mp' #end if + --$prefix$num-$library.orientation + #for $file in $library.files + #if $file.file_type.type == "separate" + --$prefix$num-1 $file.file_type.fwd_reads + --$prefix$num-2 $file.file_type.rev_reads + #elif $file.file_type.type == "interleaved" + --$prefix$num-12 $file.file_type.interleaved_reads + #elif $file.file_type.type == "unpaired" + --$prefix$num-s $file.file_type.unpaired_reads + #end if + #end for #end for </command> <inputs> - <param name="sc" type="select" label="Single-cell?" help="This flag is required for MDA (single-cell) data."> + <param name="sc" type="boolean" truevalue="--sc" falsevalue="" label="Single-cell?" help="This option is required for MDA (single-cell) data."> <option value="false">No</option> <option value="true">Yes</option> </param> - <param name="careful" type="select" label="Careful correction?" help="Tries to reduce number of mismatches and short indels. Also runs MismatchCorrector – a post processing tool, which uses BWA tool (comes with SPAdes)."> - <option value="false">No</option> - <option value="true" selected="true">Yes</option> - </param> - <param name="rectangle" type="select" label="Use rectangle correction for repeat resolution?" help="Uses rectangle graph algorithm for repeat resolution stage instead of usual SPAdes repeat resolution module (experimental)."> - <option value="false" selected="true">No</option> - <option value="true">Yes</option> - </param> + <param name="onlyassembler" type="boolean" truevalue="--only-assembler" falsevalue="" checked="False" label="Run only assembly? (without read error correction)" /> + <param name="careful" type="boolean" truevalue="--careful" falsevalue="" checked="True" label="Careful correction?" help="Tries to reduce number of mismatches and short indels. Also runs MismatchCorrector – a post processing tool, which uses BWA tool (comes with SPAdes)." /> + <param name="rectangle" type="boolean" truevalue="--rectangle" falsevalue="" checked="False" label="Use rectangle correction for repeat resolution?" help="Uses rectangle graph algorithm for repeat resolution stage instead of usual SPAdes repeat resolution module (experimental, use at your own risks)." /> <param name="threads" type="integer" label="Number of threads to use" value="16"> </param> <param name="iterations" type="integer" label="Number of iterations for read error correction." value="1"> </param> - <param name="kmers" type="text" label="K-mers to use, separated by commas" value="21,33,55" help="Comma-separated list of k-mer sizes to be used (all values must be odd, less than 128 and listed in ascending order). The default value is 21,33,55." > + <param name="kmers" type="text" label="K-mers to use, separated by commas" value="21,33,55" help="Comma-separated list of k-mer sizes to be used (all values must be odd, less than 128, listed in ascending order, and smaller than the read length). The default value is 21,33,55." > </param> <!-- Reads --> - <repeat name="reads" title="Reads"> - <conditional name="read_type"> - <param name="type" type="select" label="Select type of reads"> - <option value="pairedend">Paired-end, separate inputs</option> - <option value="interleaved">Paired-end, interleaved</option> - <option value="unpaired">Unpaired reads</option> - </param> - <when value="pairedend"> - <param name="fwd_reads" type="data" format="fastq" label="Forward reads" help="FASTQ format" /> - <param name="rev_reads" type="data" format="fastq" label="Reverse reads" help="FASTQ format" /> - </when> - <when value="interleaved"> - <param name="interleaved_reads" type="data" format="fastq" label="Interleaved paired reads" help="FASTQ format" /> - </when> - <when value="unpaired"> - <param name="unpaired_reads" type="data" format="fastq" label="Unpaired reads" help="FASTQ format" /> - </when> - </conditional> + <repeat name="libraries" title="Libraries"> + <param name="lib_type" type="select" label="Library type"> + <option value="paired_end">Paired end / Single reads</option> + <option value="mate_paired">Mate pairs</option> + </param> + <param name="orientation" type="select" label="Orientation"> + <option value="fr" selected="true">-> <- (fr)</option> + <option value="rf"><- -> (rf)</option> + <option value="ff">-> -> (ff)</option> + </param> + <repeat name="files" title="Files"> + <conditional name="file_type"> + <param name="type" type="select" label="Select file format"> + <option value="separate">Separate input files</option> + <option value="interleaved">Interleaved files</option> + <option value="unpaired">Unpaired/Single reads</option> + </param> + <when value="separate"> + <param name="fwd_reads" type="data" format="fastq" label="Forward reads" help="FASTQ format" /> + <param name="rev_reads" type="data" format="fastq" label="Reverse reads" help="FASTQ format" /> + </when> + <when value="interleaved"> + <param name="interleaved_reads" type="data" format="fastq" label="Interleaved paired reads" help="FASTQ format" /> + </when> + <when value="unpaired"> + <param name="unpaired_reads" type="data" format="fastq" label="Unpaired reads" help="FASTQ format" /> + </when> + </conditional> + </repeat> </repeat> </inputs> <outputs> @@ -84,9 +93,9 @@ <data name="out_scaffold_stats" format="tabular" label="SPAdes scaffold stats" /> <data name="out_log" format="txt" label="SPAdes log" /> </outputs> - <tests> + <!-- <tests> <test> - <!-- Based on the tests coming along with SPAdes --> + <param name="sc" value="false" /> <param name="careful" value="false" /> <param name="rectangle" value="false" /> @@ -97,7 +106,7 @@ <param name="rev_reads" value="ecoli_1K_2.fq" ftype="fastq" /> <output name="out_contigs" file="reference_1K.fa" ftype="fasta" compare="re_match" lines_diff="1" /> </test> - </tests> + </tests> --> <help> **What it does**