Mercurial > repos > iss > eurl_vtec_wgs_pt
comparison scripts/filter_spades_repeats.pl @ 0:c6bab5103a14 draft
"planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
author | iss |
---|---|
date | Mon, 21 Mar 2022 15:23:09 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:c6bab5103a14 |
---|---|
1 #!/usr/bin/env perl | |
2 | |
3 use strict; | |
4 use Getopt::Long; | |
5 use Bio::SeqIO; | |
6 use Pod::Usage; | |
7 | |
8 my ($fasta_file, $tab_file, $coverage_co, $length_co, $repeat_co, $out_filtered, $out_repeats, $out_norepeats,$coverage_length_co, $summary_out, $filtered_repeats, $help); | |
9 | |
10 GetOptions( | |
11 'c|coverage-cutoff=s' => \$coverage_co, | |
12 'l|length-cutoff=s' => \$length_co, | |
13 'e|coverage-length-cutoff=s' => \$coverage_length_co, | |
14 'r|repeat_cutoff=s' => \$repeat_co, | |
15 'i|input=s' => \$fasta_file, | |
16 't|tab=s' => \$tab_file, | |
17 'f|filtered-out=s' => \$out_filtered, | |
18 'o|output-repeats=s' => \$out_repeats, | |
19 'u|output-norepeats=s' => \$out_norepeats, | |
20 'n|filtered-repeats=s' => \$filtered_repeats, | |
21 's|summary=s' => \$summary_out, | |
22 'h|help' => \$help | |
23 ); | |
24 | |
25 pod2usage(-verbose => 2) if ($help); | |
26 print "A fasta file is required. Please enter a fasta file using the -i flag.\n" if (!$fasta_file); | |
27 print "A spades tabs file is required. Please enter a tabs file using the -t flag\n" if (!$tab_file); | |
28 pod2usage(1) unless $fasta_file && $tab_file; | |
29 | |
30 if (!$coverage_co) | |
31 { | |
32 $coverage_co = 0.33; | |
33 } | |
34 if (!$length_co) | |
35 { | |
36 $length_co = 1000; | |
37 } | |
38 if (!$coverage_length_co) | |
39 { | |
40 $coverage_length_co = 5000; | |
41 } | |
42 if (!$repeat_co) | |
43 { | |
44 $repeat_co = 1.75; | |
45 } | |
46 if (!$out_filtered) | |
47 { | |
48 $out_filtered = "Discarded_sequences.fasta"; | |
49 print "Discarded sequences will be printed out to $out_filtered\n"; | |
50 } | |
51 if (!$out_repeats) | |
52 { | |
53 $out_repeats = "Filtered_sequences_with_repeats.fasta"; | |
54 print "Filtered sequences with repeats will be printed out to $out_repeats\n"; | |
55 } | |
56 if (!$out_norepeats) | |
57 { | |
58 $out_norepeats = "Filtered_sequences_no_repeats.fasta"; | |
59 print "Filtered sequences without repeats will be printed out to $out_norepeats\n"; | |
60 } | |
61 if (!$filtered_repeats) | |
62 { | |
63 $filtered_repeats = "Repeat_sequences.fasta"; | |
64 print "Repeat sequences will be printed out to $filtered_repeats\n"; | |
65 } | |
66 | |
67 die ("No tab file specified") unless ($tab_file); | |
68 die ("No fasta file specified") unless ($fasta_file); | |
69 | |
70 ##Read tab file and discard rows with comments | |
71 open TAB, '<', $tab_file or die "Could not open tab file: $?"; | |
72 open SEQIN, '<', $fasta_file or die "Could not open tab file: $?"; | |
73 open SEQOUT_REP, '>', $out_repeats or die "Could not open file for writing: $?"; | |
74 open SEQOUT_NOREP, '>', $out_norepeats or die "Could not open file for writing: $?"; | |
75 open SEQOUT_FILT, '>', $out_filtered if ($out_filtered); | |
76 open SEQOUT_FILT_REP, '>', $filtered_repeats or die "Could not open file for writing: $?"; | |
77 open SUMMARY, '>', $summary_out if ($summary_out); | |
78 | |
79 | |
80 my $avg_coverage = 0; | |
81 my $num_contigs = 0; | |
82 my $cutoff_coverage; | |
83 my $cutoff_repeats; | |
84 my @stats; | |
85 | |
86 | |
87 while (<TAB>) | |
88 { | |
89 chomp; | |
90 push @stats, $_ unless (/^#/); | |
91 } | |
92 | |
93 #Calculate average coverage. | |
94 foreach my $stat(@stats) | |
95 { | |
96 my ($length, $coverage); | |
97 (undef,$length, $coverage) = split(/\t+/, $stat); | |
98 die "length or coverage not defined at $stat\n" unless ($length && ($coverage ne '' && $coverage >= 0)); | |
99 if ($length >= $coverage_length_co) | |
100 { | |
101 $avg_coverage = $avg_coverage + $coverage; | |
102 $num_contigs++; | |
103 } | |
104 } | |
105 | |
106 $avg_coverage = $avg_coverage / $num_contigs; | |
107 $cutoff_coverage = $avg_coverage * $coverage_co; | |
108 $cutoff_repeats = $avg_coverage * $repeat_co; | |
109 | |
110 print SUMMARY "Filter SPAdes repeats Results Summary\n======================================\n\n" if ($summary_out); | |
111 print SUMMARY "Paramaters used:\nLength cutoff for calcularing average cutoff: $coverage_length_co\nCoverage cutoff ratio: $coverage_co\nRepeat cutoff ratio: $repeat_co\nLength cutoff: $length_co\n\n" if ($summary_out); | |
112 | |
113 print SUMMARY "Calculations:\nAverage coverage: $avg_coverage\nCoverage cutoff: $cutoff_coverage\nRepeat cutoff: $cutoff_repeats\n\nFile headers:\n" if ($summary_out); | |
114 | |
115 my ($header, $seq_id, $seq); | |
116 my $repeated = 0; | |
117 my $valid = 0; | |
118 | |
119 #Summary strings: | |
120 my $discarded = ""; | |
121 my $repeats = ""; | |
122 my $filtered_rep = ""; | |
123 my $filtered_norep = ""; | |
124 | |
125 while (my $line = <SEQIN>) | |
126 { | |
127 if ($line =~ />/) | |
128 { | |
129 chomp $line; | |
130 #Get the sequence name to compare against tab file | |
131 $header = $line; | |
132 $seq_id = $line =~ /(\w+)_length/; | |
133 $seq = ""; | |
134 | |
135 my $stat = shift @stats; | |
136 die "Less rows in tab than sequences in seq file" unless $stat; | |
137 my($name, $length, $coverage) = split(/\t+/, $stat); | |
138 die "name or length not defined at $stat\n" unless ($name && $length); | |
139 die "coverage is not defined at $stat\n" unless ($coverage ne '' && $coverage >= 0); | |
140 die "Unmatched names $header and $name\n" unless ($header =~ /$name/i); | |
141 | |
142 #Entry passes the length and coverage cutoffs? | |
143 if ($length >= $length_co && $coverage >= $cutoff_coverage) | |
144 { | |
145 $valid = 1; | |
146 #Repeats | |
147 if ($coverage >= $cutoff_repeats) | |
148 { | |
149 my $num_repeats = int($coverage/$avg_coverage); | |
150 $header = $header."(".$num_repeats." copies)"; | |
151 print SEQOUT_REP $header,"\n"; | |
152 $filtered_rep = $filtered_rep.$header."\n"; | |
153 print SEQOUT_FILT_REP $header, "\n"; | |
154 $repeats = $repeats.$header."\n"; | |
155 $repeated = 1; | |
156 } | |
157 else | |
158 { | |
159 print SEQOUT_REP $header, "\n"; | |
160 $filtered_rep = $filtered_rep.$header."\n"; | |
161 print SEQOUT_NOREP $header, "\n"; | |
162 $filtered_norep = $filtered_norep.$header."\n"; | |
163 $repeated = 0; | |
164 } | |
165 } | |
166 elsif ($out_filtered) | |
167 { | |
168 $valid = 0; | |
169 print SEQOUT_FILT $header,"\n"; | |
170 $discarded = $discarded.$header."\n"; | |
171 } | |
172 } | |
173 else | |
174 { | |
175 if ($valid) | |
176 { | |
177 print SEQOUT_REP $line; | |
178 if (!$repeated) | |
179 { | |
180 print SEQOUT_NOREP $line; | |
181 } | |
182 else | |
183 { | |
184 print SEQOUT_FILT_REP $line; | |
185 } | |
186 } | |
187 elsif ($out_filtered) | |
188 { | |
189 print SEQOUT_FILT $line; | |
190 } | |
191 } | |
192 | |
193 } | |
194 | |
195 close TAB; | |
196 close SEQIN; | |
197 close SEQOUT_REP; | |
198 close SEQOUT_NOREP; | |
199 close SEQOUT_FILT; | |
200 close SEQOUT_FILT_REP; | |
201 | |
202 | |
203 #Get summary info: | |
204 if ($summary_out) | |
205 { | |
206 print SUMMARY "Filtered sequences (with repeats):\n$filtered_rep\n"; | |
207 print SUMMARY "Filtered sequences (no repeats):\n$filtered_norep\n"; | |
208 print SUMMARY "Repeat sequences:\n$repeats\n"; | |
209 if ($out_filtered) | |
210 { | |
211 print SUMMARY "Discarded sequences:\n$discarded\n"; | |
212 } | |
213 | |
214 close SUMMARY; | |
215 } | |
216 | |
217 die "More rows in stats file than sequences in the fasta file\n" if (scalar(@stats) > 0); | |
218 exit 0; | |
219 | |
220 | |
221 __END__ | |
222 | |
223 | |
224 | |
225 =head1 NAME | |
226 | |
227 filter_spades_repeats.pl - Filters contigs or scaffolds based on contig length and detects contigs/scaffolds with very high coverage. | |
228 | |
229 | |
230 | |
231 =head1 USAGE | |
232 | |
233 filter_spades_output.pl -i <contigs/scaffolds input> | |
234 -t <stats input> | |
235 -o <output fasta with repeats> | |
236 -u <output fasta without repeats> | |
237 | |
238 Optional: | |
239 -c <coverage cutoff ratio> (default 0.33) | |
240 -l <length cutoff> (default: 1000) | |
241 -e <length cutoff for average coverage calculation> (default: 5000) | |
242 -r <repeat cutoff ratio> (default (1.75) | |
243 -n <filtered repeated sequences> | |
244 -f <discarded sequences> | |
245 -s <output summary file> | |
246 | |
247 For more information: | |
248 -h | |
249 | |
250 | |
251 =head1 INPUT | |
252 | |
253 =over 8 | |
254 | |
255 =item B<-i>B<--input> | |
256 | |
257 Contigs/Scaffolds fasta file. | |
258 | |
259 =item B<-t>B<--tab> | |
260 | |
261 The tabular output file from SPAdes. This file should have the following format: | |
262 | |
263 #name length coverage | |
264 | |
265 NODE_1 31438 24.5116 | |
266 | |
267 NODE_2 31354 2316.96 | |
268 | |
269 NODE_3 26948 82.3294 | |
270 | |
271 =item B<-o>B<--output-repeats> | |
272 | |
273 Output fasta file including the contigs marked as repeated. | |
274 | |
275 =item B<-u>B<--output-norepeats> | |
276 | |
277 Output fasta file excluding the contigs marked as repeated. | |
278 | |
279 =item B<-c>B<--coverage-cutoff> | |
280 | |
281 Mininum coverage ratio. | |
282 | |
283 coverage_theshold = average_coverage * minimum_coverage_ratio. | |
284 | |
285 Any contigs/scaffolds with coverage below the coverage_theshold will be eliminated. | |
286 | |
287 =item B<-l>B<--length-cutoff> | |
288 | |
289 Mininum length. Contigs below this length will be eliminated. | |
290 | |
291 =item B<-e>B<--coverage-length-cutoff> | |
292 | |
293 Minimum length to use for average coverage calculations. | |
294 | |
295 =item B<-r>B<--repeat-cutoff> | |
296 | |
297 Minimum repeats ratio. | |
298 | |
299 repeat_threshold = average_coverage * repeat_ratio. | |
300 | |
301 Any contigs with coverage below this threshold will be considered to be repeated | |
302 | |
303 | |
304 =item B<-f>B<--filtered-out> | |
305 | |
306 If specified, filtered out sequences will be written to this file. | |
307 | |
308 =item B<-s>B<--summary> | |
309 | |
310 A summary of results | |
311 | |
312 =back | |
313 =cut |