Mercurial > repos > lionelguy > spades
comparison tools/spades_2_5/filter_spades_output.pl @ 2:b5ce24f34dd7 draft
Uploaded
author | lionelguy |
---|---|
date | Thu, 05 Sep 2013 07:43:48 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
1:0f8b2da62d7d | 2:b5ce24f34dd7 |
---|---|
1 #!/usr/bin/perl -w | |
2 | |
3 =head1 SYNOPSIS | |
4 | |
5 filter_spades_output.pl - Filters contigs or scaffolds based on contig length and coverage. | |
6 | |
7 =head1 USAGE | |
8 | |
9 filter_spades_output.pl [-c|--coverage-cutoff] [-l|--length-cutoff] [-o|--filtered-out out.fasta] -t|--tab stats.tab seqs.fasta | |
10 | |
11 =head1 INPUT | |
12 | |
13 =head2 [-c|--coverage-cutoff] | |
14 | |
15 Mininum coverage. Contigs with lower coverage will be discarded. Default 10. | |
16 | |
17 =head2 [-l|--length-cutoff] | |
18 | |
19 Mininum coverage. Smaller ontigs will be discarded. Default 500. | |
20 | |
21 =head2 -t|--tab stats.tab | |
22 | |
23 A tabular file, with three columns: contig name, length, and coverage: | |
24 | |
25 NODE_1 31438 24.5116 | |
26 NODE_2 31354 2316.96 | |
27 NODE_3 26948 82.3294 | |
28 | |
29 Such a file is produced by spades.xml. Contigs should be in the same order as in the fasta file. | |
30 | |
31 =head2 [-o|--filtered-out out.fasta] | |
32 | |
33 If specified, filtered out sequences will be written to this file. | |
34 | |
35 =head2 seqs.fasta | |
36 | |
37 Sequences in fasta format. Start of IDs must match ids in the tabular file. | |
38 | |
39 =head1 OUTPUT | |
40 | |
41 A fasta file on stdout. | |
42 | |
43 =head1 AUTHOR | |
44 | |
45 Lionel Guy (lionel.guy@icm.uu.se) | |
46 | |
47 =head1 DATE | |
48 | |
49 Thu Aug 29 13:51:13 CEST 2013 | |
50 | |
51 =cut | |
52 | |
53 # libraries | |
54 use strict; | |
55 use Getopt::Long; | |
56 use Bio::SeqIO; | |
57 | |
58 my $coverage_co = 10; | |
59 my $length_co = 500; | |
60 my $out_filtered; | |
61 my $tab_file; | |
62 | |
63 GetOptions( | |
64 'c|coverage-cutoff=s' => \$coverage_co, | |
65 'l|length-cutoff=s' => \$length_co, | |
66 'o|filtered-out=s' => \$out_filtered, | |
67 't|tab=s' => \$tab_file, | |
68 ); | |
69 my $fasta_file = shift(@ARGV); | |
70 die ("No tab file specified") unless ($tab_file); | |
71 die ("No fasta file specified") unless ($fasta_file); | |
72 | |
73 ## Read tab file, discard rows with comments | |
74 open TAB, '<', $tab_file or die "$?"; | |
75 my @stats; | |
76 while (<TAB>){ | |
77 chomp; | |
78 push @stats, $_ unless (/^#/); | |
79 } | |
80 | |
81 ## Read fasta | |
82 my $seq_in = Bio::SeqIO->new(-file => $fasta_file, | |
83 -format => 'fasta'); | |
84 my $seq_out = Bio::SeqIO->new(-fh => \*STDOUT, | |
85 -format => 'fasta'); | |
86 my $seq_out_filt = Bio::SeqIO->new(-file => ">$out_filtered", | |
87 -format => 'fasta') if ($out_filtered); | |
88 while (my $seq = $seq_in->next_seq){ | |
89 my $stat = shift @stats; | |
90 die "Less rows in tab than sequences in seq file" unless $stat; | |
91 my ($id_tab, $length, $coverage) = split(/\t+/, $stat); | |
92 die "id, length or coverate not defined at $stat\n" | |
93 unless ($id_tab && $length && $coverage); | |
94 my $id_seq = $seq->id; | |
95 die "Unmatched ids $id_seq and $id_tab\n" unless ($id_seq =~ /^$id_tab/i); | |
96 if ($length >= $length_co && $coverage >= $coverage_co){ | |
97 $seq_out->write_seq($seq); | |
98 } elsif ($out_filtered){ | |
99 $seq_out_filt->write_seq($seq); | |
100 } else { | |
101 # do nothing | |
102 } | |
103 } | |
104 die "More rows in tab than sequences in seq file" if (scalar(@stats) > 0); | |
105 exit 0; | |
106 |