2
|
1 #!/usr/bin/perl -w
|
|
2
|
|
3 =head1 SYNOPSIS
|
|
4
|
|
5 filter_spades_output.pl - Filters contigs or scaffolds based on contig length and coverage.
|
|
6
|
|
7 =head1 USAGE
|
|
8
|
|
9 filter_spades_output.pl [-c|--coverage-cutoff] [-l|--length-cutoff] [-o|--filtered-out out.fasta] -t|--tab stats.tab seqs.fasta
|
|
10
|
|
11 =head1 INPUT
|
|
12
|
|
13 =head2 [-c|--coverage-cutoff]
|
|
14
|
|
15 Mininum coverage. Contigs with lower coverage will be discarded. Default 10.
|
|
16
|
|
17 =head2 [-l|--length-cutoff]
|
|
18
|
|
19 Mininum coverage. Smaller ontigs will be discarded. Default 500.
|
|
20
|
|
21 =head2 -t|--tab stats.tab
|
|
22
|
|
23 A tabular file, with three columns: contig name, length, and coverage:
|
|
24
|
|
25 NODE_1 31438 24.5116
|
|
26 NODE_2 31354 2316.96
|
|
27 NODE_3 26948 82.3294
|
|
28
|
|
29 Such a file is produced by spades.xml. Contigs should be in the same order as in the fasta file.
|
|
30
|
|
31 =head2 [-o|--filtered-out out.fasta]
|
|
32
|
|
33 If specified, filtered out sequences will be written to this file.
|
|
34
|
|
35 =head2 seqs.fasta
|
|
36
|
|
37 Sequences in fasta format. Start of IDs must match ids in the tabular file.
|
|
38
|
|
39 =head1 OUTPUT
|
|
40
|
|
41 A fasta file on stdout.
|
|
42
|
|
43 =head1 AUTHOR
|
|
44
|
|
45 Lionel Guy (lionel.guy@icm.uu.se)
|
|
46
|
|
47 =head1 DATE
|
|
48
|
|
49 Thu Aug 29 13:51:13 CEST 2013
|
|
50
|
|
51 =cut
|
|
52
|
|
53 # libraries
|
|
54 use strict;
|
|
55 use Getopt::Long;
|
|
56 use Bio::SeqIO;
|
|
57
|
|
58 my $coverage_co = 10;
|
|
59 my $length_co = 500;
|
|
60 my $out_filtered;
|
|
61 my $tab_file;
|
|
62
|
|
63 GetOptions(
|
|
64 'c|coverage-cutoff=s' => \$coverage_co,
|
|
65 'l|length-cutoff=s' => \$length_co,
|
|
66 'o|filtered-out=s' => \$out_filtered,
|
|
67 't|tab=s' => \$tab_file,
|
|
68 );
|
|
69 my $fasta_file = shift(@ARGV);
|
|
70 die ("No tab file specified") unless ($tab_file);
|
|
71 die ("No fasta file specified") unless ($fasta_file);
|
|
72
|
|
73 ## Read tab file, discard rows with comments
|
|
74 open TAB, '<', $tab_file or die "$?";
|
|
75 my @stats;
|
|
76 while (<TAB>){
|
|
77 chomp;
|
|
78 push @stats, $_ unless (/^#/);
|
|
79 }
|
|
80
|
|
81 ## Read fasta
|
|
82 my $seq_in = Bio::SeqIO->new(-file => $fasta_file,
|
|
83 -format => 'fasta');
|
|
84 my $seq_out = Bio::SeqIO->new(-fh => \*STDOUT,
|
|
85 -format => 'fasta');
|
|
86 my $seq_out_filt = Bio::SeqIO->new(-file => ">$out_filtered",
|
|
87 -format => 'fasta') if ($out_filtered);
|
|
88 while (my $seq = $seq_in->next_seq){
|
|
89 my $stat = shift @stats;
|
|
90 die "Less rows in tab than sequences in seq file" unless $stat;
|
|
91 my ($id_tab, $length, $coverage) = split(/\t+/, $stat);
|
|
92 die "id, length or coverate not defined at $stat\n"
|
|
93 unless ($id_tab && $length && $coverage);
|
|
94 my $id_seq = $seq->id;
|
|
95 die "Unmatched ids $id_seq and $id_tab\n" unless ($id_seq =~ /^$id_tab/i);
|
|
96 if ($length >= $length_co && $coverage >= $coverage_co){
|
|
97 $seq_out->write_seq($seq);
|
|
98 } elsif ($out_filtered){
|
|
99 $seq_out_filt->write_seq($seq);
|
|
100 } else {
|
|
101 # do nothing
|
|
102 }
|
|
103 }
|
|
104 die "More rows in tab than sequences in seq file" if (scalar(@stats) > 0);
|
|
105 exit 0;
|
|
106
|