comparison nml_filter_spades_repeats.pl @ 0:ddd1e15df88c draft

Uploaded
author aaronpetkau
date Sat, 04 Jul 2015 09:45:30 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:ddd1e15df88c
1 #!/usr/bin/env perl
2
3 use warnings;
4 use strict;
5 use Getopt::Long;
6 use Bio::SeqIO;
7 use Pod::Usage;
8
9 my ($fasta_file, $tab_file, $coverage_co, $length_co, $repeat_co, $out_filtered, $out_repeats, $out_norepeats,$coverage_length_co, $summary_out, $filtered_repeats, $help);
10
11 GetOptions(
12 'c|coverage-cutoff=s' => \$coverage_co,
13 'l|length-cutoff=s' => \$length_co,
14 'e|coverage-length-cutoff=s' => \$coverage_length_co,
15 'r|repeat_cutoff=s' => \$repeat_co,
16 'i|input=s' => \$fasta_file,
17 't|tab=s' => \$tab_file,
18 'f|filtered-out=s' => \$out_filtered,
19 'o|output-repeats=s' => \$out_repeats,
20 'u|output-norepeats=s' => \$out_norepeats,
21 'n|filtered-repeats=s' => \$filtered_repeats,
22 's|summary=s' => \$summary_out,
23 'h|help' => \$help
24 );
25
26 pod2usage(-verbose => 2) if ($help);
27 print "A fasta file is required. Please enter a fasta file using the -i flag.\n" if (!$fasta_file);
28 print "A spades tabs file is required. Please enter a tabs file using the -t flag\n" if (!$tab_file);
29 pod2usage(1) unless $fasta_file && $tab_file;
30
31 if (!$coverage_co)
32 {
33 $coverage_co = 0.33;
34 }
35 if (!$length_co)
36 {
37 $length_co = 1000;
38 }
39 if (!$coverage_length_co)
40 {
41 $coverage_length_co = 5000;
42 }
43 if (!$repeat_co)
44 {
45 $repeat_co = 1.75;
46 }
47 if (!$out_filtered)
48 {
49 $out_filtered = "Discarded_sequences.fasta";
50 print "Discarded sequences will be printed out to $out_filtered\n";
51 }
52 if (!$out_repeats)
53 {
54 $out_repeats = "Filtered_sequences_with_repeats.fasta";
55 print "Filtered sequences with repeats will be printed out to $out_repeats\n";
56 }
57 if (!$out_norepeats)
58 {
59 $out_norepeats = "Filtered_sequences_no_repeats.fasta";
60 print "Filtered sequences without repeats will be printed out to $out_norepeats\n";
61 }
62 if (!$filtered_repeats)
63 {
64 $filtered_repeats = "Repeat_sequences.fasta";
65 print "Repeat sequences will be printed out to $filtered_repeats\n";
66 }
67
68 die ("No tab file specified") unless ($tab_file);
69 die ("No fasta file specified") unless ($fasta_file);
70
71 ##Read tab file and discard rows with comments
72 open TAB, '<', $tab_file or die "Could not open tab file: $?";
73 open SEQIN, '<', $fasta_file or die "Could not open tab file: $?";
74 open SEQOUT_REP, '>', $out_repeats or die "Could not open file for writing: $?";
75 open SEQOUT_NOREP, '>', $out_norepeats or die "Could not open file for writing: $?";
76 open SEQOUT_FILT, '>', $out_filtered if ($out_filtered);
77 open SEQOUT_FILT_REP, '>', $filtered_repeats or die "Could not open file for writing: $?";
78 open SUMMARY, '>', $summary_out if ($summary_out);
79
80
81 my $avg_coverage = 0;
82 my $num_contigs = 0;
83 my $cutoff_coverage;
84 my $cutoff_repeats;
85 my @stats;
86
87
88 while (<TAB>)
89 {
90 chomp;
91 push @stats, $_ unless (/^#/);
92 }
93
94 #Calculate average coverage.
95 foreach my $stat(@stats)
96 {
97 my ($length, $coverage);
98 (undef,$length, $coverage) = split(/\t+/, $stat);
99 die "length or coverage not defined at $stat\n" unless ($length && ($coverage ne '' && $coverage >= 0));
100 if ($length >= $coverage_length_co)
101 {
102 $avg_coverage = $avg_coverage + $coverage;
103 $num_contigs++;
104 }
105 }
106
107 $avg_coverage = $avg_coverage / $num_contigs;
108 $cutoff_coverage = $avg_coverage * $coverage_co;
109 $cutoff_repeats = $avg_coverage * $repeat_co;
110
111 print SUMMARY "Filter SPAdes repeats Results Summary\n======================================\n\n" if ($summary_out);
112 print SUMMARY "Paramaters used:\nLength cutoff for calcularing average cutoff: $coverage_length_co\nCoverage cutoff ratio: $coverage_co\nRepeat cutoff ratio: $repeat_co\nLength cutoff: $length_co\n\n" if ($summary_out);
113
114 print SUMMARY "Calculations:\nAverage coverage: $avg_coverage\nCoverage cutoff: $cutoff_coverage\nRepeat cutoff: $cutoff_repeats\n\nFile headers:\n" if ($summary_out);
115
116 my ($header, $seq_id, $seq);
117 my $repeated = 0;
118 my $valid = 0;
119
120 #Summary strings:
121 my $discarded = "";
122 my $repeats = "";
123 my $filtered_rep = "";
124 my $filtered_norep = "";
125
126 while (my $line = <SEQIN>)
127 {
128 if ($line =~ />/)
129 {
130 chomp $line;
131 #Get the sequence name to compare against tab file
132 $header = $line;
133 $seq_id = $line =~ /(\w+)_length/;
134 $seq = "";
135
136 my $stat = shift @stats;
137 die "Less rows in tab than sequences in seq file" unless $stat;
138 my($name, $length, $coverage) = split(/\t+/, $stat);
139 die "name or length not defined at $stat\n" unless ($name && $length);
140 die "coverage is not defined at $stat\n" unless ($coverage ne '' && $coverage >= 0);
141 die "Unmatched names $header and $name\n" unless ($header =~ /$name/i);
142
143 #Entry passes the length and coverage cutoffs?
144 if ($length >= $length_co && $coverage >= $cutoff_coverage)
145 {
146 $valid = 1;
147 #Repeats
148 if ($coverage >= $cutoff_repeats)
149 {
150 my $num_repeats = int($coverage/$avg_coverage);
151 $header = $header."(".$num_repeats." copies)";
152 print SEQOUT_REP $header,"\n";
153 $filtered_rep = $filtered_rep.$header."\n";
154 print SEQOUT_FILT_REP $header, "\n";
155 $repeats = $repeats.$header."\n";
156 $repeated = 1;
157 }
158 else
159 {
160 print SEQOUT_REP $header, "\n";
161 $filtered_rep = $filtered_rep.$header."\n";
162 print SEQOUT_NOREP $header, "\n";
163 $filtered_norep = $filtered_norep.$header."\n";
164 $repeated = 0;
165 }
166 }
167 elsif ($out_filtered)
168 {
169 $valid = 0;
170 print SEQOUT_FILT $header,"\n";
171 $discarded = $discarded.$header."\n";
172 }
173 }
174 else
175 {
176 if ($valid)
177 {
178 print SEQOUT_REP $line;
179 if (!$repeated)
180 {
181 print SEQOUT_NOREP $line;
182 }
183 else
184 {
185 print SEQOUT_FILT_REP $line;
186 }
187 }
188 elsif ($out_filtered)
189 {
190 print SEQOUT_FILT $line;
191 }
192 }
193
194 }
195
196 close TAB;
197 close SEQIN;
198 close SEQOUT_REP;
199 close SEQOUT_NOREP;
200 close SEQOUT_FILT;
201 close SEQOUT_FILT_REP;
202
203
204 #Get summary info:
205 if ($summary_out)
206 {
207 print SUMMARY "Filtered sequences (with repeats):\n$filtered_rep\n";
208 print SUMMARY "Filtered sequences (no repeats):\n$filtered_norep\n";
209 print SUMMARY "Repeat sequences:\n$repeats\n";
210 if ($out_filtered)
211 {
212 print SUMMARY "Discarded sequences:\n$discarded\n";
213 }
214
215 close SUMMARY;
216 }
217
218 die "More rows in stats file than sequences in the fasta file\n" if (scalar(@stats) > 0);
219 exit 0;
220
221
222 __END__
223
224
225
226 =head1 NAME
227
228 filter_spades_repeats.pl - Filters contigs or scaffolds based on contig length and detects contigs/scaffolds with very high coverage.
229
230
231
232 =head1 USAGE
233
234 filter_spades_output.pl -i <contigs/scaffolds input>
235 -t <stats input>
236 -o <output fasta with repeats>
237 -u <output fasta without repeats>
238
239 Optional:
240 -c <coverage cutoff ratio> (default 0.33)
241 -l <length cutoff> (default: 1000)
242 -e <length cutoff for average coverage calculation> (default: 5000)
243 -r <repeat cutoff ratio> (default (1.75)
244 -n <filtered repeated sequences>
245 -f <discarded sequences>
246 -s <output summary file>
247
248 For more information:
249 -h
250
251
252 =head1 INPUT
253
254 =over 8
255
256 =item B<-i>B<--input>
257
258 Contigs/Scaffolds fasta file.
259
260 =item B<-t>B<--tab>
261
262 The tabular output file from SPAdes. This file should have the following format:
263
264 #name length coverage
265
266 NODE_1 31438 24.5116
267
268 NODE_2 31354 2316.96
269
270 NODE_3 26948 82.3294
271
272 =item B<-o>B<--output-repeats>
273
274 Output fasta file including the contigs marked as repeated.
275
276 =item B<-u>B<--output-norepeats>
277
278 Output fasta file excluding the contigs marked as repeated.
279
280 =item B<-c>B<--coverage-cutoff>
281
282 Mininum coverage ratio.
283
284 coverage_theshold = average_coverage * minimum_coverage_ratio.
285
286 Any contigs/scaffolds with coverage below the coverage_theshold will be eliminated.
287
288 =item B<-l>B<--length-cutoff>
289
290 Mininum length. Contigs below this length will be eliminated.
291
292 =item B<-e>B<--coverage-length-cutoff>
293
294 Minimum length to use for average coverage calculations.
295
296 =item B<-r>B<--repeat-cutoff>
297
298 Minimum repeats ratio.
299
300 repeat_threshold = average_coverage * repeat_ratio.
301
302 Any contigs with coverage below this threshold will be considered to be repeated
303
304
305 =item B<-f>B<--filtered-out>
306
307 If specified, filtered out sequences will be written to this file.
308
309 =item B<-s>B<--summary>
310
311 A summary of results
312
313 =back
314 =cut
315