# HG changeset patch # User lionelguy # Date 1378381428 14400 # Node ID b5ce24f34dd75ddef389fccf8b9dc2671340bd0f # Parent 0f8b2da62d7d6a3b718d66fe9ae5d83c15e974cf Uploaded diff -r 0f8b2da62d7d -r b5ce24f34dd7 tools/spades_2_5/filter_spades_output.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/spades_2_5/filter_spades_output.pl Thu Sep 05 07:43:48 2013 -0400 @@ -0,0 +1,106 @@ +#!/usr/bin/perl -w + +=head1 SYNOPSIS + +filter_spades_output.pl - Filters contigs or scaffolds based on contig length and coverage. + +=head1 USAGE + +filter_spades_output.pl [-c|--coverage-cutoff] [-l|--length-cutoff] [-o|--filtered-out out.fasta] -t|--tab stats.tab seqs.fasta + +=head1 INPUT + +=head2 [-c|--coverage-cutoff] + +Mininum coverage. Contigs with lower coverage will be discarded. Default 10. + +=head2 [-l|--length-cutoff] + +Mininum coverage. Smaller ontigs will be discarded. Default 500. + +=head2 -t|--tab stats.tab + +A tabular file, with three columns: contig name, length, and coverage: + +NODE_1 31438 24.5116 +NODE_2 31354 2316.96 +NODE_3 26948 82.3294 + +Such a file is produced by spades.xml. Contigs should be in the same order as in the fasta file. + +=head2 [-o|--filtered-out out.fasta] + +If specified, filtered out sequences will be written to this file. + +=head2 seqs.fasta + +Sequences in fasta format. Start of IDs must match ids in the tabular file. + +=head1 OUTPUT + +A fasta file on stdout. + +=head1 AUTHOR + +Lionel Guy (lionel.guy@icm.uu.se) + +=head1 DATE + +Thu Aug 29 13:51:13 CEST 2013 + +=cut + +# libraries +use strict; +use Getopt::Long; +use Bio::SeqIO; + +my $coverage_co = 10; +my $length_co = 500; +my $out_filtered; +my $tab_file; + +GetOptions( + 'c|coverage-cutoff=s' => \$coverage_co, + 'l|length-cutoff=s' => \$length_co, + 'o|filtered-out=s' => \$out_filtered, + 't|tab=s' => \$tab_file, +); +my $fasta_file = shift(@ARGV); +die ("No tab file specified") unless ($tab_file); +die ("No fasta file specified") unless ($fasta_file); + +## Read tab file, discard rows with comments +open TAB, '<', $tab_file or die "$?"; +my @stats; +while (){ + chomp; + push @stats, $_ unless (/^#/); +} + +## Read fasta +my $seq_in = Bio::SeqIO->new(-file => $fasta_file, + -format => 'fasta'); +my $seq_out = Bio::SeqIO->new(-fh => \*STDOUT, + -format => 'fasta'); +my $seq_out_filt = Bio::SeqIO->new(-file => ">$out_filtered", + -format => 'fasta') if ($out_filtered); +while (my $seq = $seq_in->next_seq){ + my $stat = shift @stats; + die "Less rows in tab than sequences in seq file" unless $stat; + my ($id_tab, $length, $coverage) = split(/\t+/, $stat); + die "id, length or coverate not defined at $stat\n" + unless ($id_tab && $length && $coverage); + my $id_seq = $seq->id; + die "Unmatched ids $id_seq and $id_tab\n" unless ($id_seq =~ /^$id_tab/i); + if ($length >= $length_co && $coverage >= $coverage_co){ + $seq_out->write_seq($seq); + } elsif ($out_filtered){ + $seq_out_filt->write_seq($seq); + } else { + # do nothing + } +} +die "More rows in tab than sequences in seq file" if (scalar(@stats) > 0); +exit 0; + diff -r 0f8b2da62d7d -r b5ce24f34dd7 tools/spades_2_5/filter_spades_output.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/spades_2_5/filter_spades_output.xml Thu Sep 05 07:43:48 2013 -0400 @@ -0,0 +1,33 @@ + + remove low coverage and short contigs/scaffolds + filter_spades_output.pl + --coverage-cutoff $coverage_co + --length-cutoff $length_co + #if $keep_leftover + --filtered-out $filtered_out + #end if + --tab $stats_in + $fasta_in > $fasta_output + + + + + + + + + + + + + keep_leftover == "true" + + + +**What it does** + +Using the output of SPAdes (a fasta and stats file, either from contigs or scaffolds), it filters the fasta files, discarding all sequences that are under a given length or under a given coverage. + +Typically, this is used to discard short contigs, or contaminations. To dispaly a coverage vs. length plot, use the "SPAdes stats" tool in the same package. + + \ No newline at end of file diff -r 0f8b2da62d7d -r b5ce24f34dd7 tools/spades_2_5/plot_spades_stats.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/spades_2_5/plot_spades_stats.xml Thu Sep 05 07:43:48 2013 -0400 @@ -0,0 +1,80 @@ + + coverage vs. length plot + + R + + r_wrapper.sh $script_file + + + + + + + + + +## Setup R error handling to go to stderr +options( show.error.messages=F, + error = function () { + cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) +} ) +files = c("${input_contigs}", "${input_scaffolds}") +types = c("Contigs", "Scaffolds") + +## Start plotting device +png("${out_file}", w=500, h=1000) +par(mfrow=c(2,1)) + +## Loop over the two files +for (i in 1:length(types)){ + seqs = read.table(files[i], header=FALSE, comment.char="#") + colnames = c("name", "length", "coverage") + names(seqs) = colnames + + ## Stats over all sequences + sl_all = sort(seqs\$length, decreasing=TRUE) + cs_all = cumsum(sl_all) + s_all = sum(seqs\$length) + n50_idx_all = which.min(sl_all[cs_all < 0.5*s_all]) + n90_idx_all = which.min(sl_all[cs_all < 0.9*s_all]) + n50_all = sl_all[n50_idx_all] + n90_all = sl_all[n90_idx_all] + + ## Filter short seqs, redo stats + seqs_filt = seqs[seqs\$length >= ${length_co} & seqs\$coverage >= ${coverage_co},] + if (nrow(seqs_filt) > 0){ + sl_filt = sort(seqs_filt\$length, decreasing=TRUE) + cs_filt = cumsum(sl_filt) + s_filt = sum(seqs_filt\$length) + n50_idx_filt = which.min(sl_filt[cs_filt < 0.5*s_filt]) + n90_idx_filt = which.min(sl_filt[cs_filt < 0.9*s_filt]) + n50_filt = sl_filt[n50_idx_filt] + n90_filt = sl_filt[n90_idx_filt] + } + seqs_bad = seqs[seqs\$length < ${length_co} | seqs\$coverage < ${coverage_co},] + + ## Length vs coverage + plot(length~coverage, data=seqs, log="xy", type="n", main=paste(types[i], ": coverage vs. length", sep=""), xlab="Coverage", ylab="Length") + if (nrow(seqs_bad) > 0){ + points(length~coverage, data=seqs_bad, cex=0.5, col="red") + } + if (nrow(seqs_filt) > 0){ + points(length~coverage, data=seqs_filt, cex=0.5, col="black") + } + abline(v=${coverage_co}, h=${length_co}, lty=2, col=grey(0.3)) + legend(x="topleft", legend=c("Before/after filtering", paste(c("N50: ", "N90: ", "Median cov.: "), c(n50_all, n90_all, round(median(seqs\$coverage))), rep("/", 3), c(n50_filt, n90_filt, round(median(seqs_filt\$coverage))), sep="")), cex=0.8) +} +dev.off() + + + + + + +**What it does** + +Using the output of SPAdes (a pair of fasta file and stat file for each of the contigs and scaffolds), it produces a coverage vs. contig plot. Each dot represent a contig/scaffold. Given a coverage and a length cutoff, sequences that do not meet those criteria are shown in red. Some statistics are also given (N50, N90, median contig/scaffold length) both before and after filtering. + +Use the "filter SPAdes output" tool to actually filter sequences. + + \ No newline at end of file diff -r 0f8b2da62d7d -r b5ce24f34dd7 tools/spades_2_5/r_wrapper.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/spades_2_5/r_wrapper.sh Thu Sep 05 07:43:48 2013 -0400 @@ -0,0 +1,23 @@ +#!/bin/sh + +### Run R providing the R script in $1 as standard input and passing +### the remaining arguments on the command line + +# Function that writes a message to stderr and exits +function fail +{ + echo "$@" >&2 + exit 1 +} + +# Ensure R executable is found +which R > /dev/null || fail "'R' is required by this tool but was not found on path" + +# Extract first argument +infile=$1; shift + +# Ensure the file exists +test -f $infile || fail "R input file '$infile' does not exist" + +# Invoke R passing file named by first argument to stdin +R --vanilla --slave $* < $infile diff -r 0f8b2da62d7d -r b5ce24f34dd7 tools/spades_2_5/spades.pl --- a/tools/spades_2_5/spades.pl Mon Aug 19 09:06:17 2013 -0400 +++ b/tools/spades_2_5/spades.pl Thu Sep 05 07:43:48 2013 -0400 @@ -28,7 +28,7 @@ # Create temporary folder to store files, delete after use #my $output_dir = tempdir( CLEANUP => 0 ); -my $output_dir = tempdir( CLEANUP => 1 ); +my $output_dir = 'output_dir'; # Link "dat" files as fastq, otherwise spades complains about file format # Create log handle @@ -72,13 +72,14 @@ my ($in, $out) = @_; open FASTA, '<', $in or die $!; open TAB, '>', $out or die $!; + print TAB "#name\tlength\tcoverage\n"; while (){ next unless /^>/; chomp; my @a = split(/\s/, $_); my ($NODE, $n, $LENGTH, $l, $COV, $cov) = split(/_/, $a[0]); die "Not all elements found in $_\n" unless ($n && $l && $cov); - print TAB "$n\t$l\t$cov\n"; + print TAB "NODE_$n\t$l\t$cov\n"; } close TAB; } diff -r 0f8b2da62d7d -r b5ce24f34dd7 tools/spades_2_5/spades.xml --- a/tools/spades_2_5/spades.xml Mon Aug 19 09:06:17 2013 -0400 +++ b/tools/spades_2_5/spades.xml Thu Sep 05 07:43:48 2013 -0400 @@ -1,4 +1,4 @@ - + SPAdes genome assembler for regular and single-cell projects spades @@ -11,70 +11,79 @@ $out_log ## A real command looks like: spades.py -k 21,33,55,77,99,127 --careful -1 Y.fastq.gz -2 X.fastq.gz -t 24 -o output spades.py - ## TODO: kmers, threads, other options (-sc for single-cell) - #if $sc == "true": - --sc - #end if - #if $careful == "true": - --careful - #end if - #if $rectangle == "true" - --rectangle - #end if + $sc + $onlyassembler + $careful + $rectangle -t $threads -k $kmers -i $iterations ##--phred-offset ## Sequence files - #for $i, $s in enumerate( $reads ) - #if $s.read_type.type == "pairedend" - -1 $s.read_type.fwd_reads - -2 $s.read_type.rev_reads - #elif $s.read_type.type == "interleaved" - --12 $s.read_type.interleaved_reads - #elif $s.read_type.type == "unpaired" - -s $s.read_type.unpaired_reads + #for $i, $library in enumerate( $libraries ) + #set num=$i+1 + #if str( $library.lib_type ) == "paired_end": + #set prefix = 'pe' + #else: + #set prefix = 'mp' #end if + --$prefix$num-$library.orientation + #for $file in $library.files + #if $file.file_type.type == "separate" + --$prefix$num-1 $file.file_type.fwd_reads + --$prefix$num-2 $file.file_type.rev_reads + #elif $file.file_type.type == "interleaved" + --$prefix$num-12 $file.file_type.interleaved_reads + #elif $file.file_type.type == "unpaired" + --$prefix$num-s $file.file_type.unpaired_reads + #end if + #end for #end for - + - - - - - - - - + + + - + - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -84,9 +93,9 @@ - + + @@ -97,7 +106,7 @@ - + --> **What it does**