annotate nml_filter_spades_repeats.pl @ 0:ddd1e15df88c draft

Uploaded
author aaronpetkau
date Sat, 04 Jul 2015 09:45:30 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
1 #!/usr/bin/env perl
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
2
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
3 use warnings;
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
4 use strict;
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
5 use Getopt::Long;
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
6 use Bio::SeqIO;
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
7 use Pod::Usage;
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
8
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
# Command-line options. Nothing is mandatory at parse time: the two required
# inputs (-i and -t) are verified right after parsing, and every other value
# that is still unset is given a default further down in the script.
my ($fasta_file, $tab_file, $coverage_co, $length_co, $repeat_co, $out_filtered, $out_repeats, $out_norepeats, $coverage_length_co, $summary_out, $filtered_repeats, $help);

# GetOptions returns false when it meets an unknown or malformed option.
# Stop with the usage text in that case instead of running with a command
# line that was only partially understood.
GetOptions(
    'c|coverage-cutoff=s'        => \$coverage_co,
    'l|length-cutoff=s'          => \$length_co,
    'e|coverage-length-cutoff=s' => \$coverage_length_co,
    # The POD advertises --repeat-cutoff; the original spelling
    # --repeat_cutoff is kept as an alias so existing callers keep working.
    'r|repeat-cutoff|repeat_cutoff=s' => \$repeat_co,
    'i|input=s'                  => \$fasta_file,
    't|tab=s'                    => \$tab_file,
    'f|filtered-out=s'           => \$out_filtered,
    'o|output-repeats=s'         => \$out_repeats,
    'u|output-norepeats=s'       => \$out_norepeats,
    'n|filtered-repeats=s'       => \$filtered_repeats,
    's|summary=s'                => \$summary_out,
    'h|help'                     => \$help
) or pod2usage(2);

# -h prints the full manual page and exits.
pod2usage(-verbose => 2) if ($help);

# Tell the user exactly which required input is missing before showing usage.
print "A fasta file is required. Please enter a fasta file using the -i flag.\n" if (!$fasta_file);
print "A spades tabs file is required. Please enter a tabs file using the -t flag\n" if (!$tab_file);
pod2usage(1) unless $fasta_file && $tab_file;
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
30
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
# Fill in defaults for every option the user did not supply.
#
# An option counts as "not supplied" only when it is undefined or the empty
# string. A plain truth test would also discard an explicit 0 (for example
# "-c 0" to disable the coverage filter, or "-l 0" to keep contigs of any
# length) and silently replace it with the default.
for my $cutoff (
    [ \$coverage_co,        0.33 ],    # minimum coverage, as a ratio of the average
    [ \$length_co,          1000 ],    # minimum contig length to keep
    [ \$coverage_length_co, 5000 ],    # minimum length to count towards the average
    [ \$repeat_co,          1.75 ],    # repeat threshold, as a ratio of the average
  )
{
    my ($value_ref, $default) = @{$cutoff};
    ${$value_ref} = $default unless (defined ${$value_ref} && length ${$value_ref});
}

# Output files always get a name; announce the ones that were defaulted so
# the user knows where the results went.
for my $output (
    [ \$out_filtered,     "Discarded_sequences.fasta",             "Discarded sequences" ],
    [ \$out_repeats,      "Filtered_sequences_with_repeats.fasta", "Filtered sequences with repeats" ],
    [ \$out_norepeats,    "Filtered_sequences_no_repeats.fasta",   "Filtered sequences without repeats" ],
    [ \$filtered_repeats, "Repeat_sequences.fasta",                "Repeat sequences" ],
  )
{
    my ($name_ref, $default, $label) = @{$output};
    next if (defined ${$name_ref} && length ${$name_ref});
    ${$name_ref} = $default;
    print "$label will be printed out to ${$name_ref}\n";
}
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
67
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
die ("No tab file specified") unless ($tab_file);
die ("No fasta file specified") unless ($fasta_file);

## Open every input and output handle up front so that a bad path aborts the
## run before any result file has been partially written.
## $! carries the operating-system error of a failed open; $? is the status
## of the last child process and says nothing about the failure.
open TAB, '<', $tab_file or die "Could not open tab file '$tab_file': $!\n";
open SEQIN, '<', $fasta_file or die "Could not open fasta file '$fasta_file': $!\n";
open SEQOUT_REP, '>', $out_repeats or die "Could not open file '$out_repeats' for writing: $!\n";
open SEQOUT_NOREP, '>', $out_norepeats or die "Could not open file '$out_norepeats' for writing: $!\n";
if ($out_filtered)
{
    open SEQOUT_FILT, '>', $out_filtered or die "Could not open file '$out_filtered' for writing: $!\n";
}
open SEQOUT_FILT_REP, '>', $filtered_repeats or die "Could not open file '$filtered_repeats' for writing: $!\n";
if ($summary_out)
{
    # The summary is optional; it is opened only when -s was given.
    open SUMMARY, '>', $summary_out or die "Could not open file '$summary_out' for writing: $!\n";
}
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
79
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
80
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
my $avg_coverage = 0;    # running sum first, then the mean coverage
my $num_contigs = 0;     # number of contigs that contribute to the mean
my $cutoff_coverage;     # contigs below this coverage are discarded
my $cutoff_repeats;      # contigs at or above this coverage are repeats
my @stats;               # data rows of the tab file, in file order

# Read the tab file, dropping comment rows (leading '#') and blank rows.
# A stray empty line, typically at the end of the file, is not a data row
# and must not abort the run.
while (my $row = <TAB>)
{
    chomp $row;
    next if ($row =~ /^#/);
    next if ($row !~ /\S/);
    push @stats, $row;
}

#Calculate average coverage.
# Only contigs of at least $coverage_length_co bases contribute to the mean.
foreach my $stat (@stats)
{
    my (undef, $length, $coverage) = split(/\t+/, $stat);
    # Test definedness first so that a row with a missing coverage column
    # produces this message rather than an "uninitialized value" warning.
    die "length or coverage not defined at $stat\n"
        unless ($length && defined $coverage && $coverage ne '' && $coverage >= 0);
    if ($length >= $coverage_length_co)
    {
        $avg_coverage = $avg_coverage + $coverage;
        $num_contigs++;
    }
}

# Without at least one qualifying contig the mean is undefined; say so
# instead of failing with "Illegal division by zero".
die "Cannot calculate average coverage: no contigs with length >= $coverage_length_co found in the tab file\n"
    unless ($num_contigs > 0);

$avg_coverage = $avg_coverage / $num_contigs;
$cutoff_coverage = $avg_coverage * $coverage_co;
$cutoff_repeats = $avg_coverage * $repeat_co;
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
110
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
# When a summary file was requested (-s), start it with the report title,
# the parameters in effect and the thresholds derived from the average
# coverage. The per-category header lists are appended at the end of the run.
if ($summary_out)
{
    my @preamble = (
        "Filter SPAdes repeats Results Summary\n======================================\n\n",
        "Paramaters used:\nLength cutoff for calcularing average cutoff: $coverage_length_co\nCoverage cutoff ratio: $coverage_co\nRepeat cutoff ratio: $repeat_co\nLength cutoff: $length_co\n\n",
        "Calculations:\nAverage coverage: $avg_coverage\nCoverage cutoff: $cutoff_coverage\nRepeat cutoff: $cutoff_repeats\n\nFile headers:\n",
    );
    foreach my $section (@preamble)
    {
        print SUMMARY $section;
    }
}
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
115
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
my $header;          # FASTA header of the record currently being processed
my $repeated = 0;    # true while the current record is classified as a repeat
my $valid = 0;       # true while the current record passed both cutoffs

#Summary strings:
my $discarded = "";
my $repeats = "";
my $filtered_rep = "";
my $filtered_norep = "";

# Stream the FASTA file. Each header line consumes the next row of the tab
# file (the two files must list the contigs in the same order) and decides
# where the record goes; the sequence lines that follow are copied to the
# same destination(s):
#   SEQOUT_REP       every contig that passes the length and coverage cutoffs
#   SEQOUT_NOREP     passing contigs that are not repeats
#   SEQOUT_FILT_REP  passing contigs that are repeats
#   SEQOUT_FILT      contigs that fail a cutoff
while (my $line = <SEQIN>)
{
    # A header starts with '>'; anchoring avoids treating a line that merely
    # contains '>' somewhere as the start of a new record.
    if ($line =~ /^>/)
    {
        chomp $line;
        $header = $line;

        my $stat = shift @stats;
        die "Less rows in tab than sequences in seq file" unless (defined $stat);
        my ($name, $length, $coverage) = split(/\t+/, $stat);
        die "name or length not defined at $stat\n" unless ($name && $length);
        die "coverage is not defined at $stat\n"
            unless (defined $coverage && $coverage ne '' && $coverage >= 0);
        # \Q..\E: the contig name is data, not a pattern, so characters such
        # as '.', '|' or '(' in it must be matched literally.
        die "Unmatched names $header and $name\n" unless ($header =~ /\Q$name\E/i);

        #Entry passes the length and coverage cutoffs?
        if ($length >= $length_co && $coverage >= $cutoff_coverage)
        {
            $valid = 1;
            #Repeats
            if ($coverage >= $cutoff_repeats)
            {
                # Estimated copy number, appended to the header in all outputs.
                my $num_repeats = int($coverage/$avg_coverage);
                $header = $header."(".$num_repeats." copies)";
                print SEQOUT_FILT_REP $header, "\n";
                $repeats = $repeats.$header."\n";
                $repeated = 1;
            }
            else
            {
                print SEQOUT_NOREP $header, "\n";
                $filtered_norep = $filtered_norep.$header."\n";
                $repeated = 0;
            }
            print SEQOUT_REP $header, "\n";
            $filtered_rep = $filtered_rep.$header."\n";
        }
        else
        {
            # Always clear the flag for a rejected record; otherwise its
            # sequence lines would be written under the previous record's
            # classification whenever no discard file is in use.
            $valid = 0;
            if ($out_filtered)
            {
                print SEQOUT_FILT $header,"\n";
                $discarded = $discarded.$header."\n";
            }
        }
    }
    else
    {
        # Sequence line: follow the decision taken for the current header.
        if ($valid)
        {
            print SEQOUT_REP $line;
            if ($repeated)
            {
                print SEQOUT_FILT_REP $line;
            }
            else
            {
                print SEQOUT_NOREP $line;
            }
        }
        elsif ($out_filtered)
        {
            print SEQOUT_FILT $line;
        }
    }

}
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
195
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
close TAB;
close SEQIN;

# Output is buffered, so a write failure (full disk, quota, ...) may only
# surface when the handle is closed. Check every write handle so that a
# truncated result file is never reported as success.
close SEQOUT_REP or die "Could not close $out_repeats: $!\n";
close SEQOUT_NOREP or die "Could not close $out_norepeats: $!\n";
if ($out_filtered)
{
    # The discard handle is opened only when a discard file name is set.
    close SEQOUT_FILT or die "Could not close $out_filtered: $!\n";
}
close SEQOUT_FILT_REP or die "Could not close $filtered_repeats: $!\n";


#Get summary info:
# Append the header lists collected while filtering, one section per output.
if ($summary_out)
{
    print SUMMARY "Filtered sequences (with repeats):\n$filtered_rep\n";
    print SUMMARY "Filtered sequences (no repeats):\n$filtered_norep\n";
    print SUMMARY "Repeat sequences:\n$repeats\n";
    if ($out_filtered)
    {
        print SUMMARY "Discarded sequences:\n$discarded\n";
    }

    close SUMMARY or die "Could not close $summary_out: $!\n";
}

# Every FASTA header consumed one tab row; rows left over mean the two
# inputs do not describe the same set of contigs.
die "More rows in stats file than sequences in the fasta file\n" if (scalar(@stats) > 0);
exit 0;
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
220
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
221
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
222 __END__
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
223
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
224
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
225
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
226 =head1 NAME
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
227
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
228 filter_spades_repeats.pl - Filters contigs or scaffolds based on contig length and detects contigs/scaffolds with very high coverage.
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
229
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
230
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
231
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
232 =head1 USAGE
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
233
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
234 filter_spades_repeats.pl -i <contigs/scaffolds input>
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
235 -t <stats input>
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
236 -o <output fasta with repeats>
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
237 -u <output fasta without repeats>
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
238
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
239 Optional:
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
240 -c <coverage cutoff ratio> (default: 0.33)
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
241 -l <length cutoff> (default: 1000)
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
242 -e <length cutoff for average coverage calculation> (default: 5000)
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
243 -r <repeat cutoff ratio> (default: 1.75)
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
244 -n <filtered repeated sequences>
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
245 -f <discarded sequences>
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
246 -s <output summary file>
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
247
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
248 For more information:
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
249 -h
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
250
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
251
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
252 =head1 INPUT
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
253
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
254 =over 8
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
255
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
256 =item B<-i>B<--input>
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
257
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
258 Contigs/Scaffolds fasta file.
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
259
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
260 =item B<-t>B<--tab>
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
261
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
262 The tabular output file from SPAdes. This file should have the following format:
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
263
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
264 #name length coverage
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
265
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
266 NODE_1 31438 24.5116
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
267
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
268 NODE_2 31354 2316.96
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
269
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
270 NODE_3 26948 82.3294
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
271
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
272 =item B<-o>B<--output-repeats>
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
273
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
274 Output fasta file including the contigs marked as repeated.
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
275
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
276 =item B<-u>B<--output-norepeats>
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
277
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
278 Output fasta file excluding the contigs marked as repeated.
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
279
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
280 =item B<-c>B<--coverage-cutoff>
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
281
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
282 Minimum coverage ratio.
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
283
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
284 coverage_threshold = average_coverage * minimum_coverage_ratio.
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
285
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
286 Any contigs/scaffolds with coverage below the coverage_threshold will be eliminated.
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
287
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
288 =item B<-l>B<--length-cutoff>
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
289
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
290 Minimum length. Contigs below this length will be eliminated.
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
291
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
292 =item B<-e>B<--coverage-length-cutoff>
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
293
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
294 Minimum length to use for average coverage calculations.
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
295
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
296 =item B<-r>B<--repeat_cutoff>
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
297
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
298 Minimum repeats ratio.
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
299
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
300 repeat_threshold = average_coverage * repeat_ratio.
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
301
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
302 Any contigs with coverage at or above this threshold will be considered to be repeated.
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
303
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
304
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
305 =item B<-f>B<--filtered-out>
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
306
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
307 Filtered out (discarded) sequences will be written to this file. If not specified, they are written to Discarded_sequences.fasta.
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
308
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
309 =item B<-s>B<--summary>
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
310
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
311 A summary of results
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
312
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
313 =back
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
314 =cut
ddd1e15df88c Uploaded
aaronpetkau
parents:
diff changeset
315