Mercurial > repos > aaronpetkau > filter_spades_repeats
comparison nml_filter_spades_repeats.pl @ 0:ddd1e15df88c draft
Uploaded
author | aaronpetkau |
---|---|
date | Sat, 04 Jul 2015 09:45:30 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:ddd1e15df88c |
---|---|
1 #!/usr/bin/env perl | |
2 | |
3 use warnings; | |
4 use strict; | |
5 use Getopt::Long; | |
6 use Bio::SeqIO; | |
7 use Pod::Usage; | |
8 | |
9 my ($fasta_file, $tab_file, $coverage_co, $length_co, $repeat_co, $out_filtered, $out_repeats, $out_norepeats,$coverage_length_co, $summary_out, $filtered_repeats, $help); | |
10 | |
11 GetOptions( | |
12 'c|coverage-cutoff=s' => \$coverage_co, | |
13 'l|length-cutoff=s' => \$length_co, | |
14 'e|coverage-length-cutoff=s' => \$coverage_length_co, | |
15 'r|repeat_cutoff=s' => \$repeat_co, | |
16 'i|input=s' => \$fasta_file, | |
17 't|tab=s' => \$tab_file, | |
18 'f|filtered-out=s' => \$out_filtered, | |
19 'o|output-repeats=s' => \$out_repeats, | |
20 'u|output-norepeats=s' => \$out_norepeats, | |
21 'n|filtered-repeats=s' => \$filtered_repeats, | |
22 's|summary=s' => \$summary_out, | |
23 'h|help' => \$help | |
24 ); | |
25 | |
26 pod2usage(-verbose => 2) if ($help); | |
27 print "A fasta file is required. Please enter a fasta file using the -i flag.\n" if (!$fasta_file); | |
28 print "A spades tabs file is required. Please enter a tabs file using the -t flag\n" if (!$tab_file); | |
29 pod2usage(1) unless $fasta_file && $tab_file; | |
30 | |
31 if (!$coverage_co) | |
32 { | |
33 $coverage_co = 0.33; | |
34 } | |
35 if (!$length_co) | |
36 { | |
37 $length_co = 1000; | |
38 } | |
39 if (!$coverage_length_co) | |
40 { | |
41 $coverage_length_co = 5000; | |
42 } | |
43 if (!$repeat_co) | |
44 { | |
45 $repeat_co = 1.75; | |
46 } | |
47 if (!$out_filtered) | |
48 { | |
49 $out_filtered = "Discarded_sequences.fasta"; | |
50 print "Discarded sequences will be printed out to $out_filtered\n"; | |
51 } | |
52 if (!$out_repeats) | |
53 { | |
54 $out_repeats = "Filtered_sequences_with_repeats.fasta"; | |
55 print "Filtered sequences with repeats will be printed out to $out_repeats\n"; | |
56 } | |
57 if (!$out_norepeats) | |
58 { | |
59 $out_norepeats = "Filtered_sequences_no_repeats.fasta"; | |
60 print "Filtered sequences without repeats will be printed out to $out_norepeats\n"; | |
61 } | |
62 if (!$filtered_repeats) | |
63 { | |
64 $filtered_repeats = "Repeat_sequences.fasta"; | |
65 print "Repeat sequences will be printed out to $filtered_repeats\n"; | |
66 } | |
67 | |
68 die ("No tab file specified") unless ($tab_file); | |
69 die ("No fasta file specified") unless ($fasta_file); | |
70 | |
71 ##Read tab file and discard rows with comments | |
72 open TAB, '<', $tab_file or die "Could not open tab file: $?"; | |
73 open SEQIN, '<', $fasta_file or die "Could not open tab file: $?"; | |
74 open SEQOUT_REP, '>', $out_repeats or die "Could not open file for writing: $?"; | |
75 open SEQOUT_NOREP, '>', $out_norepeats or die "Could not open file for writing: $?"; | |
76 open SEQOUT_FILT, '>', $out_filtered if ($out_filtered); | |
77 open SEQOUT_FILT_REP, '>', $filtered_repeats or die "Could not open file for writing: $?"; | |
78 open SUMMARY, '>', $summary_out if ($summary_out); | |
79 | |
80 | |
81 my $avg_coverage = 0; | |
82 my $num_contigs = 0; | |
83 my $cutoff_coverage; | |
84 my $cutoff_repeats; | |
85 my @stats; | |
86 | |
87 | |
88 while (<TAB>) | |
89 { | |
90 chomp; | |
91 push @stats, $_ unless (/^#/); | |
92 } | |
93 | |
94 #Calculate average coverage. | |
95 foreach my $stat(@stats) | |
96 { | |
97 my ($length, $coverage); | |
98 (undef,$length, $coverage) = split(/\t+/, $stat); | |
99 die "length or coverage not defined at $stat\n" unless ($length && ($coverage ne '' && $coverage >= 0)); | |
100 if ($length >= $coverage_length_co) | |
101 { | |
102 $avg_coverage = $avg_coverage + $coverage; | |
103 $num_contigs++; | |
104 } | |
105 } | |
106 | |
107 $avg_coverage = $avg_coverage / $num_contigs; | |
108 $cutoff_coverage = $avg_coverage * $coverage_co; | |
109 $cutoff_repeats = $avg_coverage * $repeat_co; | |
110 | |
111 print SUMMARY "Filter SPAdes repeats Results Summary\n======================================\n\n" if ($summary_out); | |
112 print SUMMARY "Paramaters used:\nLength cutoff for calcularing average cutoff: $coverage_length_co\nCoverage cutoff ratio: $coverage_co\nRepeat cutoff ratio: $repeat_co\nLength cutoff: $length_co\n\n" if ($summary_out); | |
113 | |
114 print SUMMARY "Calculations:\nAverage coverage: $avg_coverage\nCoverage cutoff: $cutoff_coverage\nRepeat cutoff: $cutoff_repeats\n\nFile headers:\n" if ($summary_out); | |
115 | |
116 my ($header, $seq_id, $seq); | |
117 my $repeated = 0; | |
118 my $valid = 0; | |
119 | |
120 #Summary strings: | |
121 my $discarded = ""; | |
122 my $repeats = ""; | |
123 my $filtered_rep = ""; | |
124 my $filtered_norep = ""; | |
125 | |
126 while (my $line = <SEQIN>) | |
127 { | |
128 if ($line =~ />/) | |
129 { | |
130 chomp $line; | |
131 #Get the sequence name to compare against tab file | |
132 $header = $line; | |
133 $seq_id = $line =~ /(\w+)_length/; | |
134 $seq = ""; | |
135 | |
136 my $stat = shift @stats; | |
137 die "Less rows in tab than sequences in seq file" unless $stat; | |
138 my($name, $length, $coverage) = split(/\t+/, $stat); | |
139 die "name or length not defined at $stat\n" unless ($name && $length); | |
140 die "coverage is not defined at $stat\n" unless ($coverage ne '' && $coverage >= 0); | |
141 die "Unmatched names $header and $name\n" unless ($header =~ /$name/i); | |
142 | |
143 #Entry passes the length and coverage cutoffs? | |
144 if ($length >= $length_co && $coverage >= $cutoff_coverage) | |
145 { | |
146 $valid = 1; | |
147 #Repeats | |
148 if ($coverage >= $cutoff_repeats) | |
149 { | |
150 my $num_repeats = int($coverage/$avg_coverage); | |
151 $header = $header."(".$num_repeats." copies)"; | |
152 print SEQOUT_REP $header,"\n"; | |
153 $filtered_rep = $filtered_rep.$header."\n"; | |
154 print SEQOUT_FILT_REP $header, "\n"; | |
155 $repeats = $repeats.$header."\n"; | |
156 $repeated = 1; | |
157 } | |
158 else | |
159 { | |
160 print SEQOUT_REP $header, "\n"; | |
161 $filtered_rep = $filtered_rep.$header."\n"; | |
162 print SEQOUT_NOREP $header, "\n"; | |
163 $filtered_norep = $filtered_norep.$header."\n"; | |
164 $repeated = 0; | |
165 } | |
166 } | |
167 elsif ($out_filtered) | |
168 { | |
169 $valid = 0; | |
170 print SEQOUT_FILT $header,"\n"; | |
171 $discarded = $discarded.$header."\n"; | |
172 } | |
173 } | |
174 else | |
175 { | |
176 if ($valid) | |
177 { | |
178 print SEQOUT_REP $line; | |
179 if (!$repeated) | |
180 { | |
181 print SEQOUT_NOREP $line; | |
182 } | |
183 else | |
184 { | |
185 print SEQOUT_FILT_REP $line; | |
186 } | |
187 } | |
188 elsif ($out_filtered) | |
189 { | |
190 print SEQOUT_FILT $line; | |
191 } | |
192 } | |
193 | |
194 } | |
195 | |
196 close TAB; | |
197 close SEQIN; | |
198 close SEQOUT_REP; | |
199 close SEQOUT_NOREP; | |
200 close SEQOUT_FILT; | |
201 close SEQOUT_FILT_REP; | |
202 | |
203 | |
204 #Get summary info: | |
205 if ($summary_out) | |
206 { | |
207 print SUMMARY "Filtered sequences (with repeats):\n$filtered_rep\n"; | |
208 print SUMMARY "Filtered sequences (no repeats):\n$filtered_norep\n"; | |
209 print SUMMARY "Repeat sequences:\n$repeats\n"; | |
210 if ($out_filtered) | |
211 { | |
212 print SUMMARY "Discarded sequences:\n$discarded\n"; | |
213 } | |
214 | |
215 close SUMMARY; | |
216 } | |
217 | |
218 die "More rows in stats file than sequences in the fasta file\n" if (scalar(@stats) > 0); | |
219 exit 0; | |
220 | |
221 | |
222 __END__ | |
223 | |
224 | |
225 | |
226 =head1 NAME | |
227 | |
228 filter_spades_repeats.pl - Filters contigs or scaffolds based on contig length and detects contigs/scaffolds with very high coverage. | |
229 | |
230 | |
231 | |
232 =head1 USAGE | |
233 | |
234 filter_spades_output.pl -i <contigs/scaffolds input> | |
235 -t <stats input> | |
236 -o <output fasta with repeats> | |
237 -u <output fasta without repeats> | |
238 | |
239 Optional: | |
240 -c <coverage cutoff ratio> (default 0.33) | |
241 -l <length cutoff> (default: 1000) | |
242 -e <length cutoff for average coverage calculation> (default: 5000) | |
243 -r <repeat cutoff ratio> (default (1.75) | |
244 -n <filtered repeated sequences> | |
245 -f <discarded sequences> | |
246 -s <output summary file> | |
247 | |
248 For more information: | |
249 -h | |
250 | |
251 | |
252 =head1 INPUT | |
253 | |
254 =over 8 | |
255 | |
256 =item B<-i>B<--input> | |
257 | |
258 Contigs/Scaffolds fasta file. | |
259 | |
260 =item B<-t>B<--tab> | |
261 | |
262 The tabular output file from SPAdes. This file should have the following format: | |
263 | |
264 #name length coverage | |
265 | |
266 NODE_1 31438 24.5116 | |
267 | |
268 NODE_2 31354 2316.96 | |
269 | |
270 NODE_3 26948 82.3294 | |
271 | |
272 =item B<-o>B<--output-repeats> | |
273 | |
274 Output fasta file including the contigs marked as repeated. | |
275 | |
276 =item B<-u>B<--output-norepeats> | |
277 | |
278 Output fasta file excluding the contigs marked as repeated. | |
279 | |
280 =item B<-c>B<--coverage-cutoff> | |
281 | |
282 Mininum coverage ratio. | |
283 | |
284 coverage_theshold = average_coverage * minimum_coverage_ratio. | |
285 | |
286 Any contigs/scaffolds with coverage below the coverage_theshold will be eliminated. | |
287 | |
288 =item B<-l>B<--length-cutoff> | |
289 | |
290 Mininum length. Contigs below this length will be eliminated. | |
291 | |
292 =item B<-e>B<--coverage-length-cutoff> | |
293 | |
294 Minimum length to use for average coverage calculations. | |
295 | |
296 =item B<-r>B<--repeat-cutoff> | |
297 | |
298 Minimum repeats ratio. | |
299 | |
300 repeat_threshold = average_coverage * repeat_ratio. | |
301 | |
302 Any contigs with coverage below this threshold will be considered to be repeated | |
303 | |
304 | |
305 =item B<-f>B<--filtered-out> | |
306 | |
307 If specified, filtered out sequences will be written to this file. | |
308 | |
309 =item B<-s>B<--summary> | |
310 | |
311 A summary of results | |
312 | |
313 =back | |
314 =cut | |
315 |