comparison filter_spades_repeats.pl @ 0:90957420cc07 draft

planemo upload for repository https://github.com/phac-nml/galaxy_tools/ commit 8ea19b9db8a5d861466adf3bf4e01928d3d1ca38
author nml
date Thu, 12 Oct 2017 15:04:45 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:90957420cc07
1 #!/usr/bin/env perl
2
3 use strict;
4 use Getopt::Long;
5 use Bio::SeqIO;
6 use Pod::Usage;
7
8 my ($fasta_file, $tab_file, $coverage_co, $length_co, $repeat_co, $out_filtered, $out_repeats, $out_norepeats,$coverage_length_co, $summary_out, $filtered_repeats, $help);
9
10 GetOptions(
11 'c|coverage-cutoff=s' => \$coverage_co,
12 'l|length-cutoff=s' => \$length_co,
13 'e|coverage-length-cutoff=s' => \$coverage_length_co,
14 'r|repeat_cutoff=s' => \$repeat_co,
15 'i|input=s' => \$fasta_file,
16 't|tab=s' => \$tab_file,
17 'f|filtered-out=s' => \$out_filtered,
18 'o|output-repeats=s' => \$out_repeats,
19 'u|output-norepeats=s' => \$out_norepeats,
20 'n|filtered-repeats=s' => \$filtered_repeats,
21 's|summary=s' => \$summary_out,
22 'h|help' => \$help
23 );
24
25 pod2usage(-verbose => 2) if ($help);
26 print "A fasta file is required. Please enter a fasta file using the -i flag.\n" if (!$fasta_file);
27 print "A spades tabs file is required. Please enter a tabs file using the -t flag\n" if (!$tab_file);
28 pod2usage(1) unless $fasta_file && $tab_file;
29
30 if (!$coverage_co)
31 {
32 $coverage_co = 0.33;
33 }
34 if (!$length_co)
35 {
36 $length_co = 1000;
37 }
38 if (!$coverage_length_co)
39 {
40 $coverage_length_co = 5000;
41 }
42 if (!$repeat_co)
43 {
44 $repeat_co = 1.75;
45 }
46 if (!$out_filtered)
47 {
48 $out_filtered = "Discarded_sequences.fasta";
49 print "Discarded sequences will be printed out to $out_filtered\n";
50 }
51 if (!$out_repeats)
52 {
53 $out_repeats = "Filtered_sequences_with_repeats.fasta";
54 print "Filtered sequences with repeats will be printed out to $out_repeats\n";
55 }
56 if (!$out_norepeats)
57 {
58 $out_norepeats = "Filtered_sequences_no_repeats.fasta";
59 print "Filtered sequences without repeats will be printed out to $out_norepeats\n";
60 }
61 if (!$filtered_repeats)
62 {
63 $filtered_repeats = "Repeat_sequences.fasta";
64 print "Repeat sequences will be printed out to $filtered_repeats\n";
65 }
66
67 die ("No tab file specified") unless ($tab_file);
68 die ("No fasta file specified") unless ($fasta_file);
69
70 ##Read tab file and discard rows with comments
71 open TAB, '<', $tab_file or die "Could not open tab file: $?";
72 open SEQIN, '<', $fasta_file or die "Could not open tab file: $?";
73 open SEQOUT_REP, '>', $out_repeats or die "Could not open file for writing: $?";
74 open SEQOUT_NOREP, '>', $out_norepeats or die "Could not open file for writing: $?";
75 open SEQOUT_FILT, '>', $out_filtered if ($out_filtered);
76 open SEQOUT_FILT_REP, '>', $filtered_repeats or die "Could not open file for writing: $?";
77 open SUMMARY, '>', $summary_out if ($summary_out);
78
79
80 my $avg_coverage = 0;
81 my $num_contigs = 0;
82 my $cutoff_coverage;
83 my $cutoff_repeats;
84 my @stats;
85
86
87 while (<TAB>)
88 {
89 chomp;
90 push @stats, $_ unless (/^#/);
91 }
92
93 #Calculate average coverage.
94 foreach my $stat(@stats)
95 {
96 my ($length, $coverage);
97 (undef,$length, $coverage) = split(/\t+/, $stat);
98 die "length or coverage not defined at $stat\n" unless ($length && ($coverage ne '' && $coverage >= 0));
99 if ($length >= $coverage_length_co)
100 {
101 $avg_coverage = $avg_coverage + $coverage;
102 $num_contigs++;
103 }
104 }
105
106 $avg_coverage = $avg_coverage / $num_contigs;
107 $cutoff_coverage = $avg_coverage * $coverage_co;
108 $cutoff_repeats = $avg_coverage * $repeat_co;
109
110 print SUMMARY "Filter SPAdes repeats Results Summary\n======================================\n\n" if ($summary_out);
111 print SUMMARY "Paramaters used:\nLength cutoff for calcularing average cutoff: $coverage_length_co\nCoverage cutoff ratio: $coverage_co\nRepeat cutoff ratio: $repeat_co\nLength cutoff: $length_co\n\n" if ($summary_out);
112
113 print SUMMARY "Calculations:\nAverage coverage: $avg_coverage\nCoverage cutoff: $cutoff_coverage\nRepeat cutoff: $cutoff_repeats\n\nFile headers:\n" if ($summary_out);
114
115 my ($header, $seq_id, $seq);
116 my $repeated = 0;
117 my $valid = 0;
118
119 #Summary strings:
120 my $discarded = "";
121 my $repeats = "";
122 my $filtered_rep = "";
123 my $filtered_norep = "";
124
125 while (my $line = <SEQIN>)
126 {
127 if ($line =~ />/)
128 {
129 chomp $line;
130 #Get the sequence name to compare against tab file
131 $header = $line;
132 $seq_id = $line =~ /(\w+)_length/;
133 $seq = "";
134
135 my $stat = shift @stats;
136 die "Less rows in tab than sequences in seq file" unless $stat;
137 my($name, $length, $coverage) = split(/\t+/, $stat);
138 die "name or length not defined at $stat\n" unless ($name && $length);
139 die "coverage is not defined at $stat\n" unless ($coverage ne '' && $coverage >= 0);
140 die "Unmatched names $header and $name\n" unless ($header =~ /$name/i);
141
142 #Entry passes the length and coverage cutoffs?
143 if ($length >= $length_co && $coverage >= $cutoff_coverage)
144 {
145 $valid = 1;
146 #Repeats
147 if ($coverage >= $cutoff_repeats)
148 {
149 my $num_repeats = int($coverage/$avg_coverage);
150 $header = $header."(".$num_repeats." copies)";
151 print SEQOUT_REP $header,"\n";
152 $filtered_rep = $filtered_rep.$header."\n";
153 print SEQOUT_FILT_REP $header, "\n";
154 $repeats = $repeats.$header."\n";
155 $repeated = 1;
156 }
157 else
158 {
159 print SEQOUT_REP $header, "\n";
160 $filtered_rep = $filtered_rep.$header."\n";
161 print SEQOUT_NOREP $header, "\n";
162 $filtered_norep = $filtered_norep.$header."\n";
163 $repeated = 0;
164 }
165 }
166 elsif ($out_filtered)
167 {
168 $valid = 0;
169 print SEQOUT_FILT $header,"\n";
170 $discarded = $discarded.$header."\n";
171 }
172 }
173 else
174 {
175 if ($valid)
176 {
177 print SEQOUT_REP $line;
178 if (!$repeated)
179 {
180 print SEQOUT_NOREP $line;
181 }
182 else
183 {
184 print SEQOUT_FILT_REP $line;
185 }
186 }
187 elsif ($out_filtered)
188 {
189 print SEQOUT_FILT $line;
190 }
191 }
192
193 }
194
195 close TAB;
196 close SEQIN;
197 close SEQOUT_REP;
198 close SEQOUT_NOREP;
199 close SEQOUT_FILT;
200 close SEQOUT_FILT_REP;
201
202
203 #Get summary info:
204 if ($summary_out)
205 {
206 print SUMMARY "Filtered sequences (with repeats):\n$filtered_rep\n";
207 print SUMMARY "Filtered sequences (no repeats):\n$filtered_norep\n";
208 print SUMMARY "Repeat sequences:\n$repeats\n";
209 if ($out_filtered)
210 {
211 print SUMMARY "Discarded sequences:\n$discarded\n";
212 }
213
214 close SUMMARY;
215 }
216
217 die "More rows in stats file than sequences in the fasta file\n" if (scalar(@stats) > 0);
218 exit 0;
219
220
221 __END__
222
223
224
225 =head1 NAME
226
227 filter_spades_repeats.pl - Filters contigs or scaffolds based on contig length and detects contigs/scaffolds with very high coverage.
228
229
230
231 =head1 USAGE
232
233 filter_spades_output.pl -i <contigs/scaffolds input>
234 -t <stats input>
235 -o <output fasta with repeats>
236 -u <output fasta without repeats>
237
238 Optional:
239 -c <coverage cutoff ratio> (default 0.33)
240 -l <length cutoff> (default: 1000)
241 -e <length cutoff for average coverage calculation> (default: 5000)
242 -r <repeat cutoff ratio> (default (1.75)
243 -n <filtered repeated sequences>
244 -f <discarded sequences>
245 -s <output summary file>
246
247 For more information:
248 -h
249
250
251 =head1 INPUT
252
253 =over 8
254
255 =item B<-i>B<--input>
256
257 Contigs/Scaffolds fasta file.
258
259 =item B<-t>B<--tab>
260
261 The tabular output file from SPAdes. This file should have the following format:
262
263 #name length coverage
264
265 NODE_1 31438 24.5116
266
267 NODE_2 31354 2316.96
268
269 NODE_3 26948 82.3294
270
271 =item B<-o>B<--output-repeats>
272
273 Output fasta file including the contigs marked as repeated.
274
275 =item B<-u>B<--output-norepeats>
276
277 Output fasta file excluding the contigs marked as repeated.
278
279 =item B<-c>B<--coverage-cutoff>
280
281 Mininum coverage ratio.
282
283 coverage_theshold = average_coverage * minimum_coverage_ratio.
284
285 Any contigs/scaffolds with coverage below the coverage_theshold will be eliminated.
286
287 =item B<-l>B<--length-cutoff>
288
289 Mininum length. Contigs below this length will be eliminated.
290
291 =item B<-e>B<--coverage-length-cutoff>
292
293 Minimum length to use for average coverage calculations.
294
295 =item B<-r>B<--repeat-cutoff>
296
297 Minimum repeats ratio.
298
299 repeat_threshold = average_coverage * repeat_ratio.
300
301 Any contigs with coverage below this threshold will be considered to be repeated
302
303
304 =item B<-f>B<--filtered-out>
305
306 If specified, filtered out sequences will be written to this file.
307
308 =item B<-s>B<--summary>
309
310 A summary of results
311
312 =back
313 =cut