Mercurial > repos > aaronpetkau > filter_spades_repeats
changeset 1:27f6392e254f draft default tip
Deleted selected files
author | aaronpetkau |
---|---|
date | Sat, 04 Jul 2015 10:48:40 -0400 |
parents | ddd1e15df88c |
children | |
files | filter_spades_repeats.xml nml_filter_spades_repeats.pl tool_dependencies.xml |
diffstat | 3 files changed, 0 insertions(+), 482 deletions(-) [+] |
line wrap: on
line diff
--- a/filter_spades_repeats.xml Sat Jul 04 09:45:30 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,160 +0,0 @@ -<tool id="filter_spades_repeat" name="Filter SPAdes repeats" version="1.0.0"> - <description>Remove short and repeat contigs/scaffolds</description> - <requirements> - <requirement type="package" version="5.18.1">perl</requirement> - </requirements> - <command interpreter="perl">nml_filter_spades_repeats.pl -i $fasta_input -t $tab_input -c $cov_cutoff -r $rep_cutoff -l $len_cutoff -o $output_with_repeats -u $output_without_repeats -n $repeat_sequences_only -e $cov_len_cutoff -f $discarded_sequences -s $summary - </command> - - <inputs> - <param name="fasta_input" type="data" format="fasta" label="Contigs or scaffolds file" help="Contigs/Scaffolds output file from Spades" /> - <param name="tab_input" type="data" format="tabular" label="Stats file" help="Enter the corresponding stats file of the fasta file input above" /> - <param name="cov_cutoff" type="float" value="0.33" min="0" label="Coverage cut-off ratio" help="This is the average coverage ratio cutoff. For example: if the average coverage is 100 and a coverage cut-off ratio of 0.5 is used, then any contigs with coverage lower than 50 will be eliminated." /> - <param name="rep_cutoff" type="float" value="1.75" min="0" label="Repeat cut-off ratio" help="This is the coverage ratio cutoff to determine repeats in contigs. For exmaple: if the average coverage is 100 and a repeat cut-off ratio of 1.75 is used, then any contigs with coverage more than or equal to 175 will be marked as repeats." /> - <param name="len_cutoff" type="integer" value="1000" min="0" label="Length cut-off" help="Contigs with length under the chosen cut-off will be eliminated." /> - <param name="cov_len_cutoff" type="integer" value="5000" min="0" label="Length for average coverage calculation" help="Only contigs above this length will be used to calculate the average coverage." /> - <param name="keep_leftover" type="select" label="Print out a fasta file containing the discarded sequences?"> - <option value="yes">Yes</option> - <option value="no">No</option> - </param> - <param name="print_summary" type="select" label="Print out a summary of all the results?"> - <option value="yes">Yes</option> - <option value="no">No</option> - </param> - </inputs> - <outputs> - <data format="fasta" name="output_with_repeats" label="Filtered sequences (with repeats)" /> - <data format="fasta" name="output_without_repeats" label="Filtered sequences (no repeats)" /> - <data format="fasta" name="repeat_sequences_only" label="Repeat sequences" /> - <data format="fasta" name="discarded_sequences" label="Discarded sequences"> - <filter>keep_leftover == "yes"</filter> - </data> - <data format="txt" name="summary" label="Results summary"> - <filter>print_summary == "yes"</filter> - </data> - </outputs> - - - - - <help> -================ -**What it does** -================ - -Using the output of SPAdes (a fasta and a stats file, either from contigs or scaffolds), it filters the fasta files, discarding all sequences that are under a given length or under a calculated coverage. Repeated contigs are detected based on coverage. - --------------------------------------- - -========== -**Output** -========== - -- **Filtered sequences (with repeats)** - - Will contain the filtered contigs/scaffolds including the repeats. These are the sequences that passed the length and minumum coverage cutoffs. - - For workflows, this output is named **output_with_repeats** -- **Filtered sequences (no repeats)** - - Will contain the filtered contigs/scaffolds excluding the repeats. These are the sequences that passed the length, minimum coverage and repeat cutoffs. - - For workflows, this output is named **output_without_repeats** -- **Repeat sequences** - - Will contain the repeated contigs/scaffolds only. These are the sequences that were exluded for having high coverage (determined by the repeat cutoff). - - For workflows, this output is named **repeat_sequences_only** -- **Discarded sequences** - - If selected, will contain the discarded sequences. These are the sequences that fell below the length and minumum coverage cutoffs, and got discarded. - - For workflows, this output is named **discarded_sequences** -- **Results summary** : If selected, will contain a summary of all the results. - ------------------------------------------- - -============ -**Example** -============ - -Stats file input: ------------------- - - +------------+------------+------------+ - |#name |length |coverage | - +============+============+============+ - |NODE_1 |2500 |15.5 | - +------------+------------+------------+ - |NODE_2 |102 |3.0 | - +------------+------------+------------+ - |NODE_3 |1300 |50.0 | - +------------+------------+------------+ - |NODE_4 |1000 |2.3 | - +------------+------------+------------+ - |NODE_5 |5000 |14.3 | - +------------+------------+------------+ - |NODE_6 |450 |25.2 | - +------------+------------+------------+ - -User Inputs: ------------- - -- Coverage cut-off ratio = 0.33 -- Repeat cut-off ratio = 1.75 -- Length cut-off = 500 -- Length for average coverage calculation = 1000 - -Calculations: -------------- - -**Average coverage will be calculatd based on contigs with length >= 1000bp** - - -- Average coverage = 15.5 + 50.0 + 2.3 + 14.3 / 4 = 20.5 - -**Contigs that have coverage in the lower 1/3 of the average coverage will be eliminated.** - -- Coverage cut-off = 20.5 * 0.33 = 6.8 - -**Contigs with high coverage (larger than 1.75 times the average coverage) are considered to be repeated contigs.** - -- Repeat cut-off = 20.5 * 1.75 = 35.9 - -**Number of copies are calculated by dividing the sequence coverage by the average coverage.** - -- Number of repeats for NODE_3 = 50 / 20.5 = 2 copies - - -Output (in fasta format): --------------------------- - -**Filtered sequences (with repeats)** - -:: - - >NODE_1 - >NODE_3 (2 copies) - >NODE_5 - -**Filtered sequences (no repeats)** - -:: - - >NODE_1 - >NODE_5 - -**Repeat sequences** - -:: - - >NODE_3 (2 copies) - -**Discarded sequences** - -:: - - >NODE_2 - >NODE_4 - >NODE_6 - ---------------------------------------- - - - - -</help> - -</tool>
--- a/nml_filter_spades_repeats.pl Sat Jul 04 09:45:30 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,315 +0,0 @@ -#!/usr/bin/env perl - -use warnings; -use strict; -use Getopt::Long; -use Bio::SeqIO; -use Pod::Usage; - -my ($fasta_file, $tab_file, $coverage_co, $length_co, $repeat_co, $out_filtered, $out_repeats, $out_norepeats,$coverage_length_co, $summary_out, $filtered_repeats, $help); - -GetOptions( - 'c|coverage-cutoff=s' => \$coverage_co, - 'l|length-cutoff=s' => \$length_co, - 'e|coverage-length-cutoff=s' => \$coverage_length_co, - 'r|repeat_cutoff=s' => \$repeat_co, - 'i|input=s' => \$fasta_file, - 't|tab=s' => \$tab_file, - 'f|filtered-out=s' => \$out_filtered, - 'o|output-repeats=s' => \$out_repeats, - 'u|output-norepeats=s' => \$out_norepeats, - 'n|filtered-repeats=s' => \$filtered_repeats, - 's|summary=s' => \$summary_out, - 'h|help' => \$help -); - -pod2usage(-verbose => 2) if ($help); -print "A fasta file is required. Please enter a fasta file using the -i flag.\n" if (!$fasta_file); -print "A spades tabs file is required. Please enter a tabs file using the -t flag\n" if (!$tab_file); -pod2usage(1) unless $fasta_file && $tab_file; - -if (!$coverage_co) -{ - $coverage_co = 0.33; -} -if (!$length_co) -{ - $length_co = 1000; -} -if (!$coverage_length_co) -{ - $coverage_length_co = 5000; -} -if (!$repeat_co) -{ - $repeat_co = 1.75; -} -if (!$out_filtered) -{ - $out_filtered = "Discarded_sequences.fasta"; - print "Discarded sequences will be printed out to $out_filtered\n"; -} -if (!$out_repeats) -{ - $out_repeats = "Filtered_sequences_with_repeats.fasta"; - print "Filtered sequences with repeats will be printed out to $out_repeats\n"; -} -if (!$out_norepeats) -{ - $out_norepeats = "Filtered_sequences_no_repeats.fasta"; - print "Filtered sequences without repeats will be printed out to $out_norepeats\n"; -} -if (!$filtered_repeats) -{ - $filtered_repeats = "Repeat_sequences.fasta"; - print "Repeat sequences will be printed out to $filtered_repeats\n"; -} - -die ("No tab file specified") unless ($tab_file); -die ("No fasta file specified") unless ($fasta_file); - -##Read tab file and discard rows with comments -open TAB, '<', $tab_file or die "Could not open tab file: $?"; -open SEQIN, '<', $fasta_file or die "Could not open tab file: $?"; -open SEQOUT_REP, '>', $out_repeats or die "Could not open file for writing: $?"; -open SEQOUT_NOREP, '>', $out_norepeats or die "Could not open file for writing: $?"; -open SEQOUT_FILT, '>', $out_filtered if ($out_filtered); -open SEQOUT_FILT_REP, '>', $filtered_repeats or die "Could not open file for writing: $?"; -open SUMMARY, '>', $summary_out if ($summary_out); - - -my $avg_coverage = 0; -my $num_contigs = 0; -my $cutoff_coverage; -my $cutoff_repeats; -my @stats; - - -while (<TAB>) -{ - chomp; - push @stats, $_ unless (/^#/); -} - -#Calculate average coverage. -foreach my $stat(@stats) -{ - my ($length, $coverage); - (undef,$length, $coverage) = split(/\t+/, $stat); - die "length or coverage not defined at $stat\n" unless ($length && ($coverage ne '' && $coverage >= 0)); - if ($length >= $coverage_length_co) - { - $avg_coverage = $avg_coverage + $coverage; - $num_contigs++; - } -} - -$avg_coverage = $avg_coverage / $num_contigs; -$cutoff_coverage = $avg_coverage * $coverage_co; -$cutoff_repeats = $avg_coverage * $repeat_co; - -print SUMMARY "Filter SPAdes repeats Results Summary\n======================================\n\n" if ($summary_out); -print SUMMARY "Paramaters used:\nLength cutoff for calcularing average cutoff: $coverage_length_co\nCoverage cutoff ratio: $coverage_co\nRepeat cutoff ratio: $repeat_co\nLength cutoff: $length_co\n\n" if ($summary_out); - -print SUMMARY "Calculations:\nAverage coverage: $avg_coverage\nCoverage cutoff: $cutoff_coverage\nRepeat cutoff: $cutoff_repeats\n\nFile headers:\n" if ($summary_out); - -my ($header, $seq_id, $seq); -my $repeated = 0; -my $valid = 0; - -#Summary strings: -my $discarded = ""; -my $repeats = ""; -my $filtered_rep = ""; -my $filtered_norep = ""; - -while (my $line = <SEQIN>) -{ - if ($line =~ />/) - { - chomp $line; - #Get the sequence name to compare against tab file - $header = $line; - $seq_id = $line =~ /(\w+)_length/; - $seq = ""; - - my $stat = shift @stats; - die "Less rows in tab than sequences in seq file" unless $stat; - my($name, $length, $coverage) = split(/\t+/, $stat); - die "name or length not defined at $stat\n" unless ($name && $length); - die "coverage is not defined at $stat\n" unless ($coverage ne '' && $coverage >= 0); - die "Unmatched names $header and $name\n" unless ($header =~ /$name/i); - - #Entry passes the length and coverage cutoffs? - if ($length >= $length_co && $coverage >= $cutoff_coverage) - { - $valid = 1; - #Repeats - if ($coverage >= $cutoff_repeats) - { - my $num_repeats = int($coverage/$avg_coverage); - $header = $header."(".$num_repeats." copies)"; - print SEQOUT_REP $header,"\n"; - $filtered_rep = $filtered_rep.$header."\n"; - print SEQOUT_FILT_REP $header, "\n"; - $repeats = $repeats.$header."\n"; - $repeated = 1; - } - else - { - print SEQOUT_REP $header, "\n"; - $filtered_rep = $filtered_rep.$header."\n"; - print SEQOUT_NOREP $header, "\n"; - $filtered_norep = $filtered_norep.$header."\n"; - $repeated = 0; - } - } - elsif ($out_filtered) - { - $valid = 0; - print SEQOUT_FILT $header,"\n"; - $discarded = $discarded.$header."\n"; - } - } - else - { - if ($valid) - { - print SEQOUT_REP $line; - if (!$repeated) - { - print SEQOUT_NOREP $line; - } - else - { - print SEQOUT_FILT_REP $line; - } - } - elsif ($out_filtered) - { - print SEQOUT_FILT $line; - } - } - -} - -close TAB; -close SEQIN; -close SEQOUT_REP; -close SEQOUT_NOREP; -close SEQOUT_FILT; -close SEQOUT_FILT_REP; - - -#Get summary info: -if ($summary_out) -{ - print SUMMARY "Filtered sequences (with repeats):\n$filtered_rep\n"; - print SUMMARY "Filtered sequences (no repeats):\n$filtered_norep\n"; - print SUMMARY "Repeat sequences:\n$repeats\n"; - if ($out_filtered) - { - print SUMMARY "Discarded sequences:\n$discarded\n"; - } - - close SUMMARY; -} - -die "More rows in stats file than sequences in the fasta file\n" if (scalar(@stats) > 0); -exit 0; - - -__END__ - - - -=head1 NAME - - filter_spades_repeats.pl - Filters contigs or scaffolds based on contig length and detects contigs/scaffolds with very high coverage. - - - -=head1 USAGE - - filter_spades_output.pl -i <contigs/scaffolds input> - -t <stats input> - -o <output fasta with repeats> - -u <output fasta without repeats> - - Optional: - -c <coverage cutoff ratio> (default 0.33) - -l <length cutoff> (default: 1000) - -e <length cutoff for average coverage calculation> (default: 5000) - -r <repeat cutoff ratio> (default (1.75) - -n <filtered repeated sequences> - -f <discarded sequences> - -s <output summary file> - - For more information: - -h - - -=head1 INPUT - -=over 8 - -=item B<-i>B<--input> - -Contigs/Scaffolds fasta file. - -=item B<-t>B<--tab> - -The tabular output file from SPAdes. This file should have the following format: - - #name length coverage - - NODE_1 31438 24.5116 - - NODE_2 31354 2316.96 - - NODE_3 26948 82.3294 - -=item B<-o>B<--output-repeats> - -Output fasta file including the contigs marked as repeated. - -=item B<-u>B<--output-norepeats> - -Output fasta file excluding the contigs marked as repeated. - -=item B<-c>B<--coverage-cutoff> - -Mininum coverage ratio. - - coverage_theshold = average_coverage * minimum_coverage_ratio. - -Any contigs/scaffolds with coverage below the coverage_theshold will be eliminated. - -=item B<-l>B<--length-cutoff> - -Mininum length. Contigs below this length will be eliminated. - -=item B<-e>B<--coverage-length-cutoff> - -Minimum length to use for average coverage calculations. - -=item B<-r>B<--repeat-cutoff> - -Minimum repeats ratio. - - repeat_threshold = average_coverage * repeat_ratio. - -Any contigs with coverage below this threshold will be considered to be repeated - - -=item B<-f>B<--filtered-out> - -If specified, filtered out sequences will be written to this file. - -=item B<-s>B<--summary> - -A summary of results - -=back -=cut -
--- a/tool_dependencies.xml Sat Jul 04 09:45:30 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,7 +0,0 @@ -<?xml version="1.0"?> -<tool_dependency> - <repository name="package_perl_5_18" owner="iuc" > - <package name="perl" version="5.18.1" prior_installation_required="True" /> - </repository> -</tool_dependency> -