# HG changeset patch # User aaronpetkau # Date 1436017530 14400 # Node ID ddd1e15df88cbb7711819d29ea117506ebad3e2a Uploaded diff -r 000000000000 -r ddd1e15df88c filter_spades_repeats.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filter_spades_repeats.xml Sat Jul 04 09:45:30 2015 -0400 @@ -0,0 +1,160 @@ + + Remove short and repeat contigs/scaffolds + + perl + + nml_filter_spades_repeats.pl -i $fasta_input -t $tab_input -c $cov_cutoff -r $rep_cutoff -l $len_cutoff -o $output_with_repeats -u $output_without_repeats -n $repeat_sequences_only -e $cov_len_cutoff -f $discarded_sequences -s $summary + + + + + + + + + + + + + + + + + + + + + + + + keep_leftover == "yes" + + + print_summary == "yes" + + + + + + + +================ +**What it does** +================ + +Using the output of SPAdes (a fasta and a stats file, either from contigs or scaffolds), it filters the fasta files, discarding all sequences that are under a given length or under a calculated coverage. Repeated contigs are detected based on coverage. + +-------------------------------------- + +========== +**Output** +========== + +- **Filtered sequences (with repeats)** + - Will contain the filtered contigs/scaffolds including the repeats. These are the sequences that passed the length and minumum coverage cutoffs. + - For workflows, this output is named **output_with_repeats** +- **Filtered sequences (no repeats)** + - Will contain the filtered contigs/scaffolds excluding the repeats. These are the sequences that passed the length, minimum coverage and repeat cutoffs. + - For workflows, this output is named **output_without_repeats** +- **Repeat sequences** + - Will contain the repeated contigs/scaffolds only. These are the sequences that were exluded for having high coverage (determined by the repeat cutoff). + - For workflows, this output is named **repeat_sequences_only** +- **Discarded sequences** + - If selected, will contain the discarded sequences. These are the sequences that fell below the length and minumum coverage cutoffs, and got discarded. + - For workflows, this output is named **discarded_sequences** +- **Results summary** : If selected, will contain a summary of all the results. + +------------------------------------------ + +============ +**Example** +============ + +Stats file input: +------------------ + + +------------+------------+------------+ + |#name |length |coverage | + +============+============+============+ + |NODE_1 |2500 |15.5 | + +------------+------------+------------+ + |NODE_2 |102 |3.0 | + +------------+------------+------------+ + |NODE_3 |1300 |50.0 | + +------------+------------+------------+ + |NODE_4 |1000 |2.3 | + +------------+------------+------------+ + |NODE_5 |5000 |14.3 | + +------------+------------+------------+ + |NODE_6 |450 |25.2 | + +------------+------------+------------+ + +User Inputs: +------------ + +- Coverage cut-off ratio = 0.33 +- Repeat cut-off ratio = 1.75 +- Length cut-off = 500 +- Length for average coverage calculation = 1000 + +Calculations: +------------- + +**Average coverage will be calculatd based on contigs with length >= 1000bp** + + +- Average coverage = 15.5 + 50.0 + 2.3 + 14.3 / 4 = 20.5 + +**Contigs that have coverage in the lower 1/3 of the average coverage will be eliminated.** + +- Coverage cut-off = 20.5 * 0.33 = 6.8 + +**Contigs with high coverage (larger than 1.75 times the average coverage) are considered to be repeated contigs.** + +- Repeat cut-off = 20.5 * 1.75 = 35.9 + +**Number of copies are calculated by dividing the sequence coverage by the average coverage.** + +- Number of repeats for NODE_3 = 50 / 20.5 = 2 copies + + +Output (in fasta format): +-------------------------- + +**Filtered sequences (with repeats)** + +:: + + >NODE_1 + >NODE_3 (2 copies) + >NODE_5 + +**Filtered sequences (no repeats)** + +:: + + >NODE_1 + >NODE_5 + +**Repeat sequences** + +:: + + >NODE_3 (2 copies) + +**Discarded sequences** + +:: + + >NODE_2 + >NODE_4 + >NODE_6 + +--------------------------------------- + + + + + + + diff -r 000000000000 -r ddd1e15df88c nml_filter_spades_repeats.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/nml_filter_spades_repeats.pl Sat Jul 04 09:45:30 2015 -0400 @@ -0,0 +1,315 @@ +#!/usr/bin/env perl + +use warnings; +use strict; +use Getopt::Long; +use Bio::SeqIO; +use Pod::Usage; + +my ($fasta_file, $tab_file, $coverage_co, $length_co, $repeat_co, $out_filtered, $out_repeats, $out_norepeats,$coverage_length_co, $summary_out, $filtered_repeats, $help); + +GetOptions( + 'c|coverage-cutoff=s' => \$coverage_co, + 'l|length-cutoff=s' => \$length_co, + 'e|coverage-length-cutoff=s' => \$coverage_length_co, + 'r|repeat_cutoff=s' => \$repeat_co, + 'i|input=s' => \$fasta_file, + 't|tab=s' => \$tab_file, + 'f|filtered-out=s' => \$out_filtered, + 'o|output-repeats=s' => \$out_repeats, + 'u|output-norepeats=s' => \$out_norepeats, + 'n|filtered-repeats=s' => \$filtered_repeats, + 's|summary=s' => \$summary_out, + 'h|help' => \$help +); + +pod2usage(-verbose => 2) if ($help); +print "A fasta file is required. Please enter a fasta file using the -i flag.\n" if (!$fasta_file); +print "A spades tabs file is required. Please enter a tabs file using the -t flag\n" if (!$tab_file); +pod2usage(1) unless $fasta_file && $tab_file; + +if (!$coverage_co) +{ + $coverage_co = 0.33; +} +if (!$length_co) +{ + $length_co = 1000; +} +if (!$coverage_length_co) +{ + $coverage_length_co = 5000; +} +if (!$repeat_co) +{ + $repeat_co = 1.75; +} +if (!$out_filtered) +{ + $out_filtered = "Discarded_sequences.fasta"; + print "Discarded sequences will be printed out to $out_filtered\n"; +} +if (!$out_repeats) +{ + $out_repeats = "Filtered_sequences_with_repeats.fasta"; + print "Filtered sequences with repeats will be printed out to $out_repeats\n"; +} +if (!$out_norepeats) +{ + $out_norepeats = "Filtered_sequences_no_repeats.fasta"; + print "Filtered sequences without repeats will be printed out to $out_norepeats\n"; +} +if (!$filtered_repeats) +{ + $filtered_repeats = "Repeat_sequences.fasta"; + print "Repeat sequences will be printed out to $filtered_repeats\n"; +} + +die ("No tab file specified") unless ($tab_file); +die ("No fasta file specified") unless ($fasta_file); + +##Read tab file and discard rows with comments +open TAB, '<', $tab_file or die "Could not open tab file: $?"; +open SEQIN, '<', $fasta_file or die "Could not open tab file: $?"; +open SEQOUT_REP, '>', $out_repeats or die "Could not open file for writing: $?"; +open SEQOUT_NOREP, '>', $out_norepeats or die "Could not open file for writing: $?"; +open SEQOUT_FILT, '>', $out_filtered if ($out_filtered); +open SEQOUT_FILT_REP, '>', $filtered_repeats or die "Could not open file for writing: $?"; +open SUMMARY, '>', $summary_out if ($summary_out); + + +my $avg_coverage = 0; +my $num_contigs = 0; +my $cutoff_coverage; +my $cutoff_repeats; +my @stats; + + +while () +{ + chomp; + push @stats, $_ unless (/^#/); +} + +#Calculate average coverage. +foreach my $stat(@stats) +{ + my ($length, $coverage); + (undef,$length, $coverage) = split(/\t+/, $stat); + die "length or coverage not defined at $stat\n" unless ($length && ($coverage ne '' && $coverage >= 0)); + if ($length >= $coverage_length_co) + { + $avg_coverage = $avg_coverage + $coverage; + $num_contigs++; + } +} + +$avg_coverage = $avg_coverage / $num_contigs; +$cutoff_coverage = $avg_coverage * $coverage_co; +$cutoff_repeats = $avg_coverage * $repeat_co; + +print SUMMARY "Filter SPAdes repeats Results Summary\n======================================\n\n" if ($summary_out); +print SUMMARY "Paramaters used:\nLength cutoff for calcularing average cutoff: $coverage_length_co\nCoverage cutoff ratio: $coverage_co\nRepeat cutoff ratio: $repeat_co\nLength cutoff: $length_co\n\n" if ($summary_out); + +print SUMMARY "Calculations:\nAverage coverage: $avg_coverage\nCoverage cutoff: $cutoff_coverage\nRepeat cutoff: $cutoff_repeats\n\nFile headers:\n" if ($summary_out); + +my ($header, $seq_id, $seq); +my $repeated = 0; +my $valid = 0; + +#Summary strings: +my $discarded = ""; +my $repeats = ""; +my $filtered_rep = ""; +my $filtered_norep = ""; + +while (my $line = ) +{ + if ($line =~ />/) + { + chomp $line; + #Get the sequence name to compare against tab file + $header = $line; + $seq_id = $line =~ /(\w+)_length/; + $seq = ""; + + my $stat = shift @stats; + die "Less rows in tab than sequences in seq file" unless $stat; + my($name, $length, $coverage) = split(/\t+/, $stat); + die "name or length not defined at $stat\n" unless ($name && $length); + die "coverage is not defined at $stat\n" unless ($coverage ne '' && $coverage >= 0); + die "Unmatched names $header and $name\n" unless ($header =~ /$name/i); + + #Entry passes the length and coverage cutoffs? + if ($length >= $length_co && $coverage >= $cutoff_coverage) + { + $valid = 1; + #Repeats + if ($coverage >= $cutoff_repeats) + { + my $num_repeats = int($coverage/$avg_coverage); + $header = $header."(".$num_repeats." copies)"; + print SEQOUT_REP $header,"\n"; + $filtered_rep = $filtered_rep.$header."\n"; + print SEQOUT_FILT_REP $header, "\n"; + $repeats = $repeats.$header."\n"; + $repeated = 1; + } + else + { + print SEQOUT_REP $header, "\n"; + $filtered_rep = $filtered_rep.$header."\n"; + print SEQOUT_NOREP $header, "\n"; + $filtered_norep = $filtered_norep.$header."\n"; + $repeated = 0; + } + } + elsif ($out_filtered) + { + $valid = 0; + print SEQOUT_FILT $header,"\n"; + $discarded = $discarded.$header."\n"; + } + } + else + { + if ($valid) + { + print SEQOUT_REP $line; + if (!$repeated) + { + print SEQOUT_NOREP $line; + } + else + { + print SEQOUT_FILT_REP $line; + } + } + elsif ($out_filtered) + { + print SEQOUT_FILT $line; + } + } + +} + +close TAB; +close SEQIN; +close SEQOUT_REP; +close SEQOUT_NOREP; +close SEQOUT_FILT; +close SEQOUT_FILT_REP; + + +#Get summary info: +if ($summary_out) +{ + print SUMMARY "Filtered sequences (with repeats):\n$filtered_rep\n"; + print SUMMARY "Filtered sequences (no repeats):\n$filtered_norep\n"; + print SUMMARY "Repeat sequences:\n$repeats\n"; + if ($out_filtered) + { + print SUMMARY "Discarded sequences:\n$discarded\n"; + } + + close SUMMARY; +} + +die "More rows in stats file than sequences in the fasta file\n" if (scalar(@stats) > 0); +exit 0; + + +__END__ + + + +=head1 NAME + + filter_spades_repeats.pl - Filters contigs or scaffolds based on contig length and detects contigs/scaffolds with very high coverage. + + + +=head1 USAGE + + filter_spades_output.pl -i + -t + -o + -u + + Optional: + -c (default 0.33) + -l (default: 1000) + -e (default: 5000) + -r (default (1.75) + -n + -f + -s + + For more information: + -h + + +=head1 INPUT + +=over 8 + +=item B<-i>B<--input> + +Contigs/Scaffolds fasta file. + +=item B<-t>B<--tab> + +The tabular output file from SPAdes. This file should have the following format: + + #name length coverage + + NODE_1 31438 24.5116 + + NODE_2 31354 2316.96 + + NODE_3 26948 82.3294 + +=item B<-o>B<--output-repeats> + +Output fasta file including the contigs marked as repeated. + +=item B<-u>B<--output-norepeats> + +Output fasta file excluding the contigs marked as repeated. + +=item B<-c>B<--coverage-cutoff> + +Mininum coverage ratio. + + coverage_theshold = average_coverage * minimum_coverage_ratio. + +Any contigs/scaffolds with coverage below the coverage_theshold will be eliminated. + +=item B<-l>B<--length-cutoff> + +Mininum length. Contigs below this length will be eliminated. + +=item B<-e>B<--coverage-length-cutoff> + +Minimum length to use for average coverage calculations. + +=item B<-r>B<--repeat-cutoff> + +Minimum repeats ratio. + + repeat_threshold = average_coverage * repeat_ratio. + +Any contigs with coverage below this threshold will be considered to be repeated + + +=item B<-f>B<--filtered-out> + +If specified, filtered out sequences will be written to this file. + +=item B<-s>B<--summary> + +A summary of results + +=back +=cut + diff -r 000000000000 -r ddd1e15df88c tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Sat Jul 04 09:45:30 2015 -0400 @@ -0,0 +1,7 @@ + + + + + + +