Mercurial > repos > nml > filter_spades_repeats
view filter_spades_repeats.pl @ 1:0e3d2c8b1b23 draft default tip
planemo upload for repository https://github.com/phac-nml/galaxy_tools/ commit 3f9ae719338c7c8db81d645b8ee09727e2d9ce23
author | nml |
---|---|
date | Tue, 07 Nov 2017 11:52:52 -0500 |
parents | 90957420cc07 |
children |
line wrap: on
line source
#!/usr/bin/env perl use strict; use Getopt::Long; use Bio::SeqIO; use Pod::Usage; my ($fasta_file, $tab_file, $coverage_co, $length_co, $repeat_co, $out_filtered, $out_repeats, $out_norepeats,$coverage_length_co, $summary_out, $filtered_repeats, $help); GetOptions( 'c|coverage-cutoff=s' => \$coverage_co, 'l|length-cutoff=s' => \$length_co, 'e|coverage-length-cutoff=s' => \$coverage_length_co, 'r|repeat_cutoff=s' => \$repeat_co, 'i|input=s' => \$fasta_file, 't|tab=s' => \$tab_file, 'f|filtered-out=s' => \$out_filtered, 'o|output-repeats=s' => \$out_repeats, 'u|output-norepeats=s' => \$out_norepeats, 'n|filtered-repeats=s' => \$filtered_repeats, 's|summary=s' => \$summary_out, 'h|help' => \$help ); pod2usage(-verbose => 2) if ($help); print "A fasta file is required. Please enter a fasta file using the -i flag.\n" if (!$fasta_file); print "A spades tabs file is required. Please enter a tabs file using the -t flag\n" if (!$tab_file); pod2usage(1) unless $fasta_file && $tab_file; if (!$coverage_co) { $coverage_co = 0.33; } if (!$length_co) { $length_co = 1000; } if (!$coverage_length_co) { $coverage_length_co = 5000; } if (!$repeat_co) { $repeat_co = 1.75; } if (!$out_filtered) { $out_filtered = "Discarded_sequences.fasta"; print "Discarded sequences will be printed out to $out_filtered\n"; } if (!$out_repeats) { $out_repeats = "Filtered_sequences_with_repeats.fasta"; print "Filtered sequences with repeats will be printed out to $out_repeats\n"; } if (!$out_norepeats) { $out_norepeats = "Filtered_sequences_no_repeats.fasta"; print "Filtered sequences without repeats will be printed out to $out_norepeats\n"; } if (!$filtered_repeats) { $filtered_repeats = "Repeat_sequences.fasta"; print "Repeat sequences will be printed out to $filtered_repeats\n"; } die ("No tab file specified") unless ($tab_file); die ("No fasta file specified") unless ($fasta_file); ##Read tab file and discard rows with comments open TAB, '<', $tab_file or die "Could not open tab file: $?"; open SEQIN, '<', $fasta_file or die "Could not open tab file: $?"; open SEQOUT_REP, '>', $out_repeats or die "Could not open file for writing: $?"; open SEQOUT_NOREP, '>', $out_norepeats or die "Could not open file for writing: $?"; open SEQOUT_FILT, '>', $out_filtered if ($out_filtered); open SEQOUT_FILT_REP, '>', $filtered_repeats or die "Could not open file for writing: $?"; open SUMMARY, '>', $summary_out if ($summary_out); my $avg_coverage = 0; my $num_contigs = 0; my $cutoff_coverage; my $cutoff_repeats; my @stats; while (<TAB>) { chomp; push @stats, $_ unless (/^#/); } #Calculate average coverage. foreach my $stat(@stats) { my ($length, $coverage); (undef,$length, $coverage) = split(/\t+/, $stat); die "length or coverage not defined at $stat\n" unless ($length && ($coverage ne '' && $coverage >= 0)); if ($length >= $coverage_length_co) { $avg_coverage = $avg_coverage + $coverage; $num_contigs++; } } $avg_coverage = $avg_coverage / $num_contigs; $cutoff_coverage = $avg_coverage * $coverage_co; $cutoff_repeats = $avg_coverage * $repeat_co; print SUMMARY "Filter SPAdes repeats Results Summary\n======================================\n\n" if ($summary_out); print SUMMARY "Paramaters used:\nLength cutoff for calcularing average cutoff: $coverage_length_co\nCoverage cutoff ratio: $coverage_co\nRepeat cutoff ratio: $repeat_co\nLength cutoff: $length_co\n\n" if ($summary_out); print SUMMARY "Calculations:\nAverage coverage: $avg_coverage\nCoverage cutoff: $cutoff_coverage\nRepeat cutoff: $cutoff_repeats\n\nFile headers:\n" if ($summary_out); my ($header, $seq_id, $seq); my $repeated = 0; my $valid = 0; #Summary strings: my $discarded = ""; my $repeats = ""; my $filtered_rep = ""; my $filtered_norep = ""; while (my $line = <SEQIN>) { if ($line =~ />/) { chomp $line; #Get the sequence name to compare against tab file $header = $line; $seq_id = $line =~ /(\w+)_length/; $seq = ""; my $stat = shift @stats; die "Less rows in tab than sequences in seq file" unless $stat; my($name, $length, $coverage) = split(/\t+/, $stat); die "name or length not defined at $stat\n" unless ($name && $length); die "coverage is not defined at $stat\n" unless ($coverage ne '' && $coverage >= 0); die "Unmatched names $header and $name\n" unless ($header =~ /$name/i); #Entry passes the length and coverage cutoffs? if ($length >= $length_co && $coverage >= $cutoff_coverage) { $valid = 1; #Repeats if ($coverage >= $cutoff_repeats) { my $num_repeats = int($coverage/$avg_coverage); $header = $header."(".$num_repeats." copies)"; print SEQOUT_REP $header,"\n"; $filtered_rep = $filtered_rep.$header."\n"; print SEQOUT_FILT_REP $header, "\n"; $repeats = $repeats.$header."\n"; $repeated = 1; } else { print SEQOUT_REP $header, "\n"; $filtered_rep = $filtered_rep.$header."\n"; print SEQOUT_NOREP $header, "\n"; $filtered_norep = $filtered_norep.$header."\n"; $repeated = 0; } } elsif ($out_filtered) { $valid = 0; print SEQOUT_FILT $header,"\n"; $discarded = $discarded.$header."\n"; } } else { if ($valid) { print SEQOUT_REP $line; if (!$repeated) { print SEQOUT_NOREP $line; } else { print SEQOUT_FILT_REP $line; } } elsif ($out_filtered) { print SEQOUT_FILT $line; } } } close TAB; close SEQIN; close SEQOUT_REP; close SEQOUT_NOREP; close SEQOUT_FILT; close SEQOUT_FILT_REP; #Get summary info: if ($summary_out) { print SUMMARY "Filtered sequences (with repeats):\n$filtered_rep\n"; print SUMMARY "Filtered sequences (no repeats):\n$filtered_norep\n"; print SUMMARY "Repeat sequences:\n$repeats\n"; if ($out_filtered) { print SUMMARY "Discarded sequences:\n$discarded\n"; } close SUMMARY; } die "More rows in stats file than sequences in the fasta file\n" if (scalar(@stats) > 0); exit 0; __END__ =head1 NAME filter_spades_repeats.pl - Filters contigs or scaffolds based on contig length and detects contigs/scaffolds with very high coverage. =head1 USAGE filter_spades_output.pl -i <contigs/scaffolds input> -t <stats input> -o <output fasta with repeats> -u <output fasta without repeats> Optional: -c <coverage cutoff ratio> (default 0.33) -l <length cutoff> (default: 1000) -e <length cutoff for average coverage calculation> (default: 5000) -r <repeat cutoff ratio> (default (1.75) -n <filtered repeated sequences> -f <discarded sequences> -s <output summary file> For more information: -h =head1 INPUT =over 8 =item B<-i>B<--input> Contigs/Scaffolds fasta file. =item B<-t>B<--tab> The tabular output file from SPAdes. This file should have the following format: #name length coverage NODE_1 31438 24.5116 NODE_2 31354 2316.96 NODE_3 26948 82.3294 =item B<-o>B<--output-repeats> Output fasta file including the contigs marked as repeated. =item B<-u>B<--output-norepeats> Output fasta file excluding the contigs marked as repeated. =item B<-c>B<--coverage-cutoff> Mininum coverage ratio. coverage_theshold = average_coverage * minimum_coverage_ratio. Any contigs/scaffolds with coverage below the coverage_theshold will be eliminated. =item B<-l>B<--length-cutoff> Mininum length. Contigs below this length will be eliminated. =item B<-e>B<--coverage-length-cutoff> Minimum length to use for average coverage calculations. =item B<-r>B<--repeat-cutoff> Minimum repeats ratio. repeat_threshold = average_coverage * repeat_ratio. Any contigs with coverage below this threshold will be considered to be repeated =item B<-f>B<--filtered-out> If specified, filtered out sequences will be written to this file. =item B<-s>B<--summary> A summary of results =back =cut