0
|
1 #!/usr/bin/env perl
|
|
2
|
|
3 use warnings;
|
|
4 use strict;
|
|
5 use Getopt::Long;
|
|
6 use Bio::SeqIO;
|
|
7 use Pod::Usage;
|
|
8
|
|
9 my ($fasta_file, $tab_file, $coverage_co, $length_co, $repeat_co, $out_filtered, $out_repeats, $out_norepeats,$coverage_length_co, $summary_out, $filtered_repeats, $help);
|
|
10
|
|
11 GetOptions(
|
|
12 'c|coverage-cutoff=s' => \$coverage_co,
|
|
13 'l|length-cutoff=s' => \$length_co,
|
|
14 'e|coverage-length-cutoff=s' => \$coverage_length_co,
|
|
15 'r|repeat_cutoff=s' => \$repeat_co,
|
|
16 'i|input=s' => \$fasta_file,
|
|
17 't|tab=s' => \$tab_file,
|
|
18 'f|filtered-out=s' => \$out_filtered,
|
|
19 'o|output-repeats=s' => \$out_repeats,
|
|
20 'u|output-norepeats=s' => \$out_norepeats,
|
|
21 'n|filtered-repeats=s' => \$filtered_repeats,
|
|
22 's|summary=s' => \$summary_out,
|
|
23 'h|help' => \$help
|
|
24 );
|
|
25
|
|
26 pod2usage(-verbose => 2) if ($help);
|
|
27 print "A fasta file is required. Please enter a fasta file using the -i flag.\n" if (!$fasta_file);
|
|
28 print "A spades tabs file is required. Please enter a tabs file using the -t flag\n" if (!$tab_file);
|
|
29 pod2usage(1) unless $fasta_file && $tab_file;
|
|
30
|
|
31 if (!$coverage_co)
|
|
32 {
|
|
33 $coverage_co = 0.33;
|
|
34 }
|
|
35 if (!$length_co)
|
|
36 {
|
|
37 $length_co = 1000;
|
|
38 }
|
|
39 if (!$coverage_length_co)
|
|
40 {
|
|
41 $coverage_length_co = 5000;
|
|
42 }
|
|
43 if (!$repeat_co)
|
|
44 {
|
|
45 $repeat_co = 1.75;
|
|
46 }
|
|
47 if (!$out_filtered)
|
|
48 {
|
|
49 $out_filtered = "Discarded_sequences.fasta";
|
|
50 print "Discarded sequences will be printed out to $out_filtered\n";
|
|
51 }
|
|
52 if (!$out_repeats)
|
|
53 {
|
|
54 $out_repeats = "Filtered_sequences_with_repeats.fasta";
|
|
55 print "Filtered sequences with repeats will be printed out to $out_repeats\n";
|
|
56 }
|
|
57 if (!$out_norepeats)
|
|
58 {
|
|
59 $out_norepeats = "Filtered_sequences_no_repeats.fasta";
|
|
60 print "Filtered sequences without repeats will be printed out to $out_norepeats\n";
|
|
61 }
|
|
62 if (!$filtered_repeats)
|
|
63 {
|
|
64 $filtered_repeats = "Repeat_sequences.fasta";
|
|
65 print "Repeat sequences will be printed out to $filtered_repeats\n";
|
|
66 }
|
|
67
|
|
68 die ("No tab file specified") unless ($tab_file);
|
|
69 die ("No fasta file specified") unless ($fasta_file);
|
|
70
|
|
71 ##Read tab file and discard rows with comments
|
|
72 open TAB, '<', $tab_file or die "Could not open tab file: $?";
|
|
73 open SEQIN, '<', $fasta_file or die "Could not open tab file: $?";
|
|
74 open SEQOUT_REP, '>', $out_repeats or die "Could not open file for writing: $?";
|
|
75 open SEQOUT_NOREP, '>', $out_norepeats or die "Could not open file for writing: $?";
|
|
76 open SEQOUT_FILT, '>', $out_filtered if ($out_filtered);
|
|
77 open SEQOUT_FILT_REP, '>', $filtered_repeats or die "Could not open file for writing: $?";
|
|
78 open SUMMARY, '>', $summary_out if ($summary_out);
|
|
79
|
|
80
|
|
81 my $avg_coverage = 0;
|
|
82 my $num_contigs = 0;
|
|
83 my $cutoff_coverage;
|
|
84 my $cutoff_repeats;
|
|
85 my @stats;
|
|
86
|
|
87
|
|
88 while (<TAB>)
|
|
89 {
|
|
90 chomp;
|
|
91 push @stats, $_ unless (/^#/);
|
|
92 }
|
|
93
|
|
94 #Calculate average coverage.
|
|
95 foreach my $stat(@stats)
|
|
96 {
|
|
97 my ($length, $coverage);
|
|
98 (undef,$length, $coverage) = split(/\t+/, $stat);
|
|
99 die "length or coverage not defined at $stat\n" unless ($length && ($coverage ne '' && $coverage >= 0));
|
|
100 if ($length >= $coverage_length_co)
|
|
101 {
|
|
102 $avg_coverage = $avg_coverage + $coverage;
|
|
103 $num_contigs++;
|
|
104 }
|
|
105 }
|
|
106
|
|
107 $avg_coverage = $avg_coverage / $num_contigs;
|
|
108 $cutoff_coverage = $avg_coverage * $coverage_co;
|
|
109 $cutoff_repeats = $avg_coverage * $repeat_co;
|
|
110
|
|
111 print SUMMARY "Filter SPAdes repeats Results Summary\n======================================\n\n" if ($summary_out);
|
|
112 print SUMMARY "Paramaters used:\nLength cutoff for calcularing average cutoff: $coverage_length_co\nCoverage cutoff ratio: $coverage_co\nRepeat cutoff ratio: $repeat_co\nLength cutoff: $length_co\n\n" if ($summary_out);
|
|
113
|
|
114 print SUMMARY "Calculations:\nAverage coverage: $avg_coverage\nCoverage cutoff: $cutoff_coverage\nRepeat cutoff: $cutoff_repeats\n\nFile headers:\n" if ($summary_out);
|
|
115
|
|
116 my ($header, $seq_id, $seq);
|
|
117 my $repeated = 0;
|
|
118 my $valid = 0;
|
|
119
|
|
120 #Summary strings:
|
|
121 my $discarded = "";
|
|
122 my $repeats = "";
|
|
123 my $filtered_rep = "";
|
|
124 my $filtered_norep = "";
|
|
125
|
|
126 while (my $line = <SEQIN>)
|
|
127 {
|
|
128 if ($line =~ />/)
|
|
129 {
|
|
130 chomp $line;
|
|
131 #Get the sequence name to compare against tab file
|
|
132 $header = $line;
|
|
133 $seq_id = $line =~ /(\w+)_length/;
|
|
134 $seq = "";
|
|
135
|
|
136 my $stat = shift @stats;
|
|
137 die "Less rows in tab than sequences in seq file" unless $stat;
|
|
138 my($name, $length, $coverage) = split(/\t+/, $stat);
|
|
139 die "name or length not defined at $stat\n" unless ($name && $length);
|
|
140 die "coverage is not defined at $stat\n" unless ($coverage ne '' && $coverage >= 0);
|
|
141 die "Unmatched names $header and $name\n" unless ($header =~ /$name/i);
|
|
142
|
|
143 #Entry passes the length and coverage cutoffs?
|
|
144 if ($length >= $length_co && $coverage >= $cutoff_coverage)
|
|
145 {
|
|
146 $valid = 1;
|
|
147 #Repeats
|
|
148 if ($coverage >= $cutoff_repeats)
|
|
149 {
|
|
150 my $num_repeats = int($coverage/$avg_coverage);
|
|
151 $header = $header."(".$num_repeats." copies)";
|
|
152 print SEQOUT_REP $header,"\n";
|
|
153 $filtered_rep = $filtered_rep.$header."\n";
|
|
154 print SEQOUT_FILT_REP $header, "\n";
|
|
155 $repeats = $repeats.$header."\n";
|
|
156 $repeated = 1;
|
|
157 }
|
|
158 else
|
|
159 {
|
|
160 print SEQOUT_REP $header, "\n";
|
|
161 $filtered_rep = $filtered_rep.$header."\n";
|
|
162 print SEQOUT_NOREP $header, "\n";
|
|
163 $filtered_norep = $filtered_norep.$header."\n";
|
|
164 $repeated = 0;
|
|
165 }
|
|
166 }
|
|
167 elsif ($out_filtered)
|
|
168 {
|
|
169 $valid = 0;
|
|
170 print SEQOUT_FILT $header,"\n";
|
|
171 $discarded = $discarded.$header."\n";
|
|
172 }
|
|
173 }
|
|
174 else
|
|
175 {
|
|
176 if ($valid)
|
|
177 {
|
|
178 print SEQOUT_REP $line;
|
|
179 if (!$repeated)
|
|
180 {
|
|
181 print SEQOUT_NOREP $line;
|
|
182 }
|
|
183 else
|
|
184 {
|
|
185 print SEQOUT_FILT_REP $line;
|
|
186 }
|
|
187 }
|
|
188 elsif ($out_filtered)
|
|
189 {
|
|
190 print SEQOUT_FILT $line;
|
|
191 }
|
|
192 }
|
|
193
|
|
194 }
|
|
195
|
|
196 close TAB;
|
|
197 close SEQIN;
|
|
198 close SEQOUT_REP;
|
|
199 close SEQOUT_NOREP;
|
|
200 close SEQOUT_FILT;
|
|
201 close SEQOUT_FILT_REP;
|
|
202
|
|
203
|
|
204 #Get summary info:
|
|
205 if ($summary_out)
|
|
206 {
|
|
207 print SUMMARY "Filtered sequences (with repeats):\n$filtered_rep\n";
|
|
208 print SUMMARY "Filtered sequences (no repeats):\n$filtered_norep\n";
|
|
209 print SUMMARY "Repeat sequences:\n$repeats\n";
|
|
210 if ($out_filtered)
|
|
211 {
|
|
212 print SUMMARY "Discarded sequences:\n$discarded\n";
|
|
213 }
|
|
214
|
|
215 close SUMMARY;
|
|
216 }
|
|
217
|
|
218 die "More rows in stats file than sequences in the fasta file\n" if (scalar(@stats) > 0);
|
|
219 exit 0;
|
|
220
|
|
221
|
|
222 __END__
|
|
223
|
|
224
|
|
225
|
|
226 =head1 NAME
|
|
227
|
|
228 filter_spades_repeats.pl - Filters contigs or scaffolds based on contig length and detects contigs/scaffolds with very high coverage.
|
|
229
|
|
230
|
|
231
|
|
232 =head1 USAGE
|
|
233
|
|
234 filter_spades_output.pl -i <contigs/scaffolds input>
|
|
235 -t <stats input>
|
|
236 -o <output fasta with repeats>
|
|
237 -u <output fasta without repeats>
|
|
238
|
|
239 Optional:
|
|
240 -c <coverage cutoff ratio> (default 0.33)
|
|
241 -l <length cutoff> (default: 1000)
|
|
242 -e <length cutoff for average coverage calculation> (default: 5000)
|
|
243 -r <repeat cutoff ratio> (default (1.75)
|
|
244 -n <filtered repeated sequences>
|
|
245 -f <discarded sequences>
|
|
246 -s <output summary file>
|
|
247
|
|
248 For more information:
|
|
249 -h
|
|
250
|
|
251
|
|
252 =head1 INPUT
|
|
253
|
|
254 =over 8
|
|
255
|
|
256 =item B<-i>B<--input>
|
|
257
|
|
258 Contigs/Scaffolds fasta file.
|
|
259
|
|
260 =item B<-t>B<--tab>
|
|
261
|
|
262 The tabular output file from SPAdes. This file should have the following format:
|
|
263
|
|
264 #name length coverage
|
|
265
|
|
266 NODE_1 31438 24.5116
|
|
267
|
|
268 NODE_2 31354 2316.96
|
|
269
|
|
270 NODE_3 26948 82.3294
|
|
271
|
|
272 =item B<-o>B<--output-repeats>
|
|
273
|
|
274 Output fasta file including the contigs marked as repeated.
|
|
275
|
|
276 =item B<-u>B<--output-norepeats>
|
|
277
|
|
278 Output fasta file excluding the contigs marked as repeated.
|
|
279
|
|
280 =item B<-c>B<--coverage-cutoff>
|
|
281
|
|
282 Mininum coverage ratio.
|
|
283
|
|
284 coverage_theshold = average_coverage * minimum_coverage_ratio.
|
|
285
|
|
286 Any contigs/scaffolds with coverage below the coverage_theshold will be eliminated.
|
|
287
|
|
288 =item B<-l>B<--length-cutoff>
|
|
289
|
|
290 Mininum length. Contigs below this length will be eliminated.
|
|
291
|
|
292 =item B<-e>B<--coverage-length-cutoff>
|
|
293
|
|
294 Minimum length to use for average coverage calculations.
|
|
295
|
|
296 =item B<-r>B<--repeat-cutoff>
|
|
297
|
|
298 Minimum repeats ratio.
|
|
299
|
|
300 repeat_threshold = average_coverage * repeat_ratio.
|
|
301
|
|
302 Any contigs with coverage below this threshold will be considered to be repeated
|
|
303
|
|
304
|
|
305 =item B<-f>B<--filtered-out>
|
|
306
|
|
307 If specified, filtered out sequences will be written to this file.
|
|
308
|
|
309 =item B<-s>B<--summary>
|
|
310
|
|
311 A summary of results
|
|
312
|
|
313 =back
|
|
314 =cut
|
|
315
|