Mercurial > repos > lionelguy > spades
diff tools/spades_3_5_0/filter_spades_output.pl @ 12:85c6121d92a5 draft
Uploaded
author | takadonet |
---|---|
date | Tue, 16 Dec 2014 12:57:49 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/spades_3_5_0/filter_spades_output.pl Tue Dec 16 12:57:49 2014 -0500 @@ -0,0 +1,106 @@ +#!/usr/bin/env perl -w + +=head1 SYNOPSIS + +filter_spades_output.pl - Filters contigs or scaffolds based on contig length and coverage. + +=head1 USAGE + +filter_spades_output.pl [-c|--coverage-cutoff] [-l|--length-cutoff] [-o|--filtered-out out.fasta] -t|--tab stats.tab seqs.fasta + +=head1 INPUT + +=head2 [-c|--coverage-cutoff] + +Mininum coverage. Contigs with lower coverage will be discarded. Default 10. + +=head2 [-l|--length-cutoff] + +Mininum coverage. Smaller ontigs will be discarded. Default 500. + +=head2 -t|--tab stats.tab + +A tabular file, with three columns: contig name, length, and coverage: + +NODE_1 31438 24.5116 +NODE_2 31354 2316.96 +NODE_3 26948 82.3294 + +Such a file is produced by spades.xml. Contigs should be in the same order as in the fasta file. + +=head2 [-o|--filtered-out out.fasta] + +If specified, filtered out sequences will be written to this file. + +=head2 seqs.fasta + +Sequences in fasta format. Start of IDs must match ids in the tabular file. + +=head1 OUTPUT + +A fasta file on stdout. + +=head1 AUTHOR + +Lionel Guy (lionel.guy@icm.uu.se) + +=head1 DATE + +Thu Aug 29 13:51:13 CEST 2013 + +=cut + +# libraries +use strict; +use Getopt::Long; +use Bio::SeqIO; + +my $coverage_co = 10; +my $length_co = 500; +my $out_filtered; +my $tab_file; + +GetOptions( + 'c|coverage-cutoff=s' => \$coverage_co, + 'l|length-cutoff=s' => \$length_co, + 'o|filtered-out=s' => \$out_filtered, + 't|tab=s' => \$tab_file, +); +my $fasta_file = shift(@ARGV); +die ("No tab file specified") unless ($tab_file); +die ("No fasta file specified") unless ($fasta_file); + +## Read tab file, discard rows with comments +open TAB, '<', $tab_file or die "$?"; +my @stats; +while (<TAB>){ + chomp; + push @stats, $_ unless (/^#/); +} + +## Read fasta +my $seq_in = Bio::SeqIO->new(-file => $fasta_file, + -format => 'fasta'); +my $seq_out = Bio::SeqIO->new(-fh => \*STDOUT, + -format => 'fasta'); +my $seq_out_filt = Bio::SeqIO->new(-file => ">$out_filtered", + -format => 'fasta') if ($out_filtered); +while (my $seq = $seq_in->next_seq){ + my $stat = shift @stats; + die "Less rows in tab than sequences in seq file" unless $stat; + my ($id_tab, $length, $coverage) = split(/\t+/, $stat); + die "id, length or coverate not defined at $stat\n" + unless ($id_tab && $length && $coverage); + my $id_seq = $seq->id; + die "Unmatched ids $id_seq and $id_tab\n" unless ($id_seq =~ /^$id_tab/i); + if ($length >= $length_co && $coverage >= $coverage_co){ + $seq_out->write_seq($seq); + } elsif ($out_filtered){ + $seq_out_filt->write_seq($seq); + } else { + # do nothing + } +} +die "More rows in tab than sequences in seq file" if (scalar(@stats) > 0); +exit 0; +