Mercurial > repos > aaronpetkau > filter_spades_repeats
diff nml_filter_spades_repeats.pl @ 0:ddd1e15df88c draft
Uploaded
author | aaronpetkau |
---|---|
date | Sat, 04 Jul 2015 09:45:30 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/nml_filter_spades_repeats.pl Sat Jul 04 09:45:30 2015 -0400 @@ -0,0 +1,315 @@ +#!/usr/bin/env perl + +use warnings; +use strict; +use Getopt::Long; +use Bio::SeqIO; +use Pod::Usage; + +my ($fasta_file, $tab_file, $coverage_co, $length_co, $repeat_co, $out_filtered, $out_repeats, $out_norepeats,$coverage_length_co, $summary_out, $filtered_repeats, $help); + +GetOptions( + 'c|coverage-cutoff=s' => \$coverage_co, + 'l|length-cutoff=s' => \$length_co, + 'e|coverage-length-cutoff=s' => \$coverage_length_co, + 'r|repeat_cutoff=s' => \$repeat_co, + 'i|input=s' => \$fasta_file, + 't|tab=s' => \$tab_file, + 'f|filtered-out=s' => \$out_filtered, + 'o|output-repeats=s' => \$out_repeats, + 'u|output-norepeats=s' => \$out_norepeats, + 'n|filtered-repeats=s' => \$filtered_repeats, + 's|summary=s' => \$summary_out, + 'h|help' => \$help +); + +pod2usage(-verbose => 2) if ($help); +print "A fasta file is required. Please enter a fasta file using the -i flag.\n" if (!$fasta_file); +print "A spades tabs file is required. Please enter a tabs file using the -t flag\n" if (!$tab_file); +pod2usage(1) unless $fasta_file && $tab_file; + +if (!$coverage_co) +{ + $coverage_co = 0.33; +} +if (!$length_co) +{ + $length_co = 1000; +} +if (!$coverage_length_co) +{ + $coverage_length_co = 5000; +} +if (!$repeat_co) +{ + $repeat_co = 1.75; +} +if (!$out_filtered) +{ + $out_filtered = "Discarded_sequences.fasta"; + print "Discarded sequences will be printed out to $out_filtered\n"; +} +if (!$out_repeats) +{ + $out_repeats = "Filtered_sequences_with_repeats.fasta"; + print "Filtered sequences with repeats will be printed out to $out_repeats\n"; +} +if (!$out_norepeats) +{ + $out_norepeats = "Filtered_sequences_no_repeats.fasta"; + print "Filtered sequences without repeats will be printed out to $out_norepeats\n"; +} +if (!$filtered_repeats) +{ + $filtered_repeats = "Repeat_sequences.fasta"; + print "Repeat sequences will be printed out to $filtered_repeats\n"; +} + +die ("No tab file specified") unless ($tab_file); +die ("No fasta file specified") unless ($fasta_file); + +##Read tab file and discard rows with comments +open TAB, '<', $tab_file or die "Could not open tab file: $?"; +open SEQIN, '<', $fasta_file or die "Could not open tab file: $?"; +open SEQOUT_REP, '>', $out_repeats or die "Could not open file for writing: $?"; +open SEQOUT_NOREP, '>', $out_norepeats or die "Could not open file for writing: $?"; +open SEQOUT_FILT, '>', $out_filtered if ($out_filtered); +open SEQOUT_FILT_REP, '>', $filtered_repeats or die "Could not open file for writing: $?"; +open SUMMARY, '>', $summary_out if ($summary_out); + + +my $avg_coverage = 0; +my $num_contigs = 0; +my $cutoff_coverage; +my $cutoff_repeats; +my @stats; + + +while (<TAB>) +{ + chomp; + push @stats, $_ unless (/^#/); +} + +#Calculate average coverage. +foreach my $stat(@stats) +{ + my ($length, $coverage); + (undef,$length, $coverage) = split(/\t+/, $stat); + die "length or coverage not defined at $stat\n" unless ($length && ($coverage ne '' && $coverage >= 0)); + if ($length >= $coverage_length_co) + { + $avg_coverage = $avg_coverage + $coverage; + $num_contigs++; + } +} + +$avg_coverage = $avg_coverage / $num_contigs; +$cutoff_coverage = $avg_coverage * $coverage_co; +$cutoff_repeats = $avg_coverage * $repeat_co; + +print SUMMARY "Filter SPAdes repeats Results Summary\n======================================\n\n" if ($summary_out); +print SUMMARY "Paramaters used:\nLength cutoff for calcularing average cutoff: $coverage_length_co\nCoverage cutoff ratio: $coverage_co\nRepeat cutoff ratio: $repeat_co\nLength cutoff: $length_co\n\n" if ($summary_out); + +print SUMMARY "Calculations:\nAverage coverage: $avg_coverage\nCoverage cutoff: $cutoff_coverage\nRepeat cutoff: $cutoff_repeats\n\nFile headers:\n" if ($summary_out); + +my ($header, $seq_id, $seq); +my $repeated = 0; +my $valid = 0; + +#Summary strings: +my $discarded = ""; +my $repeats = ""; +my $filtered_rep = ""; +my $filtered_norep = ""; + +while (my $line = <SEQIN>) +{ + if ($line =~ />/) + { + chomp $line; + #Get the sequence name to compare against tab file + $header = $line; + $seq_id = $line =~ /(\w+)_length/; + $seq = ""; + + my $stat = shift @stats; + die "Less rows in tab than sequences in seq file" unless $stat; + my($name, $length, $coverage) = split(/\t+/, $stat); + die "name or length not defined at $stat\n" unless ($name && $length); + die "coverage is not defined at $stat\n" unless ($coverage ne '' && $coverage >= 0); + die "Unmatched names $header and $name\n" unless ($header =~ /$name/i); + + #Entry passes the length and coverage cutoffs? + if ($length >= $length_co && $coverage >= $cutoff_coverage) + { + $valid = 1; + #Repeats + if ($coverage >= $cutoff_repeats) + { + my $num_repeats = int($coverage/$avg_coverage); + $header = $header."(".$num_repeats." copies)"; + print SEQOUT_REP $header,"\n"; + $filtered_rep = $filtered_rep.$header."\n"; + print SEQOUT_FILT_REP $header, "\n"; + $repeats = $repeats.$header."\n"; + $repeated = 1; + } + else + { + print SEQOUT_REP $header, "\n"; + $filtered_rep = $filtered_rep.$header."\n"; + print SEQOUT_NOREP $header, "\n"; + $filtered_norep = $filtered_norep.$header."\n"; + $repeated = 0; + } + } + elsif ($out_filtered) + { + $valid = 0; + print SEQOUT_FILT $header,"\n"; + $discarded = $discarded.$header."\n"; + } + } + else + { + if ($valid) + { + print SEQOUT_REP $line; + if (!$repeated) + { + print SEQOUT_NOREP $line; + } + else + { + print SEQOUT_FILT_REP $line; + } + } + elsif ($out_filtered) + { + print SEQOUT_FILT $line; + } + } + +} + +close TAB; +close SEQIN; +close SEQOUT_REP; +close SEQOUT_NOREP; +close SEQOUT_FILT; +close SEQOUT_FILT_REP; + + +#Get summary info: +if ($summary_out) +{ + print SUMMARY "Filtered sequences (with repeats):\n$filtered_rep\n"; + print SUMMARY "Filtered sequences (no repeats):\n$filtered_norep\n"; + print SUMMARY "Repeat sequences:\n$repeats\n"; + if ($out_filtered) + { + print SUMMARY "Discarded sequences:\n$discarded\n"; + } + + close SUMMARY; +} + +die "More rows in stats file than sequences in the fasta file\n" if (scalar(@stats) > 0); +exit 0; + + +__END__ + + + +=head1 NAME + + filter_spades_repeats.pl - Filters contigs or scaffolds based on contig length and detects contigs/scaffolds with very high coverage. + + + +=head1 USAGE + + filter_spades_output.pl -i <contigs/scaffolds input> + -t <stats input> + -o <output fasta with repeats> + -u <output fasta without repeats> + + Optional: + -c <coverage cutoff ratio> (default 0.33) + -l <length cutoff> (default: 1000) + -e <length cutoff for average coverage calculation> (default: 5000) + -r <repeat cutoff ratio> (default (1.75) + -n <filtered repeated sequences> + -f <discarded sequences> + -s <output summary file> + + For more information: + -h + + +=head1 INPUT + +=over 8 + +=item B<-i>B<--input> + +Contigs/Scaffolds fasta file. + +=item B<-t>B<--tab> + +The tabular output file from SPAdes. This file should have the following format: + + #name length coverage + + NODE_1 31438 24.5116 + + NODE_2 31354 2316.96 + + NODE_3 26948 82.3294 + +=item B<-o>B<--output-repeats> + +Output fasta file including the contigs marked as repeated. + +=item B<-u>B<--output-norepeats> + +Output fasta file excluding the contigs marked as repeated. + +=item B<-c>B<--coverage-cutoff> + +Mininum coverage ratio. + + coverage_theshold = average_coverage * minimum_coverage_ratio. + +Any contigs/scaffolds with coverage below the coverage_theshold will be eliminated. + +=item B<-l>B<--length-cutoff> + +Mininum length. Contigs below this length will be eliminated. + +=item B<-e>B<--coverage-length-cutoff> + +Minimum length to use for average coverage calculations. + +=item B<-r>B<--repeat-cutoff> + +Minimum repeats ratio. + + repeat_threshold = average_coverage * repeat_ratio. + +Any contigs with coverage below this threshold will be considered to be repeated + + +=item B<-f>B<--filtered-out> + +If specified, filtered out sequences will be written to this file. + +=item B<-s>B<--summary> + +A summary of results + +=back +=cut +