Mercurial > repos > aaronpetkau > filter_spades_repeats

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/filter_spades_repeats.xml	Sat Jul 04 09:45:30 2015 -0400
@@ -0,0 +1,160 @@
+<tool id="filter_spades_repeat" name="Filter SPAdes repeats" version="1.0.0">
+	<description>Remove short and repeat contigs/scaffolds</description>
+	<requirements>
+		<requirement type="package" version="5.18.1">perl</requirement>
+	</requirements>
+	<command interpreter="perl">nml_filter_spades_repeats.pl -i $fasta_input -t $tab_input -c $cov_cutoff -r $rep_cutoff -l $len_cutoff -o $output_with_repeats -u $output_without_repeats -n $repeat_sequences_only -e $cov_len_cutoff -f $discarded_sequences -s $summary
+	</command>
+
+	<inputs>
+		<param name="fasta_input" type="data" format="fasta" label="Contigs or scaffolds file" help="Contigs/Scaffolds output file from Spades" />
+		<param name="tab_input" type="data" format="tabular" label="Stats file" help="Enter the corresponding stats file of the fasta file input above" />
+		<param name="cov_cutoff" type="float" value="0.33" min="0" label="Coverage cut-off ratio" help="This is the average coverage ratio cutoff. For example: if the average coverage is 100 and a coverage cut-off ratio of 0.5 is used, then any contigs with coverage lower than 50 will be eliminated." />
+		<param name="rep_cutoff" type="float" value="1.75" min="0" label="Repeat cut-off ratio" help="This is the coverage ratio cutoff to determine repeats in contigs. For exmaple: if the average coverage is 100 and a repeat cut-off ratio of 1.75 is used, then any contigs with coverage more than or equal to 175 will be marked as repeats." />
+		<param name="len_cutoff" type="integer" value="1000" min="0" label="Length cut-off" help="Contigs with length under the chosen cut-off will be eliminated." />
+                <param name="cov_len_cutoff" type="integer" value="5000" min="0" label="Length for average coverage calculation" help="Only contigs above this length will be used to calculate the average coverage." />
+		<param name="keep_leftover" type="select" label="Print out a fasta file containing the discarded sequences?">
+			<option value="yes">Yes</option>
+			<option value="no">No</option>
+		</param>
+                <param name="print_summary" type="select" label="Print out a summary of all the results?">
+                        <option value="yes">Yes</option>
+                        <option value="no">No</option>
+               </param>
+	</inputs>
+	<outputs>
+		<data format="fasta" name="output_with_repeats" label="Filtered sequences (with repeats)" />
+		<data format="fasta" name="output_without_repeats" label="Filtered sequences (no repeats)" />
+                <data format="fasta" name="repeat_sequences_only" label="Repeat sequences" />
+		<data format="fasta" name="discarded_sequences" label="Discarded sequences">
+			<filter>keep_leftover == "yes"</filter>
+		</data>
+                <data format="txt" name="summary" label="Results summary">
+                        <filter>print_summary == "yes"</filter>
+               </data>
+	</outputs>
+
+
+
+
+	<help>
+================
+**What it does**
+================
+
+Using the output of SPAdes (a fasta and a stats file, either from contigs or scaffolds), it filters the fasta files, discarding all sequences that are under a given length or under a calculated coverage. Repeated contigs are detected based on coverage.
+
+--------------------------------------
+
+==========
+**Output**
+==========
+
+- **Filtered sequences (with repeats)**
+	- Will contain the filtered contigs/scaffolds including the repeats. These are the sequences that passed the length and minumum coverage cutoffs.
+	- For workflows, this output is named **output_with_repeats**
+- **Filtered sequences (no repeats)**
+	-  Will contain the filtered contigs/scaffolds excluding the repeats. These are the sequences that passed the length, minimum coverage and repeat cutoffs.
+	- For workflows, this output is named **output_without_repeats**
+- **Repeat sequences**
+	- Will contain the repeated contigs/scaffolds only. These are the sequences that were exluded for having high coverage (determined by the repeat cutoff).
+	- For workflows, this output is named **repeat_sequences_only**
+- **Discarded sequences**
+	- If selected, will contain the discarded sequences. These are the sequences that fell below the length and minumum coverage cutoffs, and got discarded.
+	- For workflows, this output is named **discarded_sequences**
+- **Results summary**  : If selected, will contain a summary of all the results.
+
+------------------------------------------
+
+============
+**Example**
+============
+
+Stats file input:
+------------------
+
+    +------------+------------+------------+
+    |#name       |length      |coverage    |
+    +============+============+============+
+    |NODE_1      |2500        |15.5        |
+    +------------+------------+------------+
+    |NODE_2      |102         |3.0         |
+    +------------+------------+------------+
+    |NODE_3      |1300        |50.0        |
+    +------------+------------+------------+
+    |NODE_4      |1000        |2.3         |
+    +------------+------------+------------+
+    |NODE_5      |5000        |14.3        |
+    +------------+------------+------------+
+    |NODE_6      |450         |25.2        |
+    +------------+------------+------------+
+
+User Inputs:
+------------
+
+- Coverage cut-off ratio = 0.33
+- Repeat cut-off ratio = 1.75
+- Length cut-off = 500
+- Length for average coverage calculation = 1000
+
+Calculations:
+-------------
+
+**Average coverage will be calculatd based on contigs with length >= 1000bp**
+
+
+- Average coverage = 15.5 + 50.0 + 2.3 + 14.3 / 4 = 20.5
+
+**Contigs that have coverage in the lower 1/3 of the average coverage will be eliminated.**
+
+- Coverage cut-off = 20.5 * 0.33 = 6.8
+
+**Contigs with high coverage (larger than 1.75 times the average coverage) are considered to be repeated contigs.**
+
+- Repeat cut-off = 20.5 * 1.75 = 35.9
+
+**Number of copies are calculated by dividing the sequence coverage by the average coverage.**
+
+- Number of repeats for NODE_3  = 50 / 20.5 = 2 copies
+
+
+Output (in fasta format):
+--------------------------
+
+**Filtered sequences (with repeats)**
+
+::
+
+	>NODE_1
+	>NODE_3 (2 copies)
+	>NODE_5
+
+**Filtered sequences (no repeats)**
+
+::
+
+	>NODE_1
+	>NODE_5
+
+**Repeat sequences**
+
+::
+
+      >NODE_3 (2 copies)
+
+**Discarded sequences**
+
+::
+
+	>NODE_2
+	>NODE_4
+    >NODE_6
+
+---------------------------------------
+
+
+
+
+</help>
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/nml_filter_spades_repeats.pl	Sat Jul 04 09:45:30 2015 -0400
@@ -0,0 +1,315 @@
+#!/usr/bin/env perl
+
+use warnings;
+use strict;
+use Getopt::Long;
+use Bio::SeqIO;
+use Pod::Usage;
+
+my ($fasta_file, $tab_file, $coverage_co, $length_co, $repeat_co, $out_filtered, $out_repeats, $out_norepeats,$coverage_length_co, $summary_out, $filtered_repeats, $help);
+
+GetOptions(
+	'c|coverage-cutoff=s'	       => \$coverage_co,
+	'l|length-cutoff=s'	       => \$length_co,
+        'e|coverage-length-cutoff=s'   => \$coverage_length_co,
+	'r|repeat_cutoff=s'	       => \$repeat_co,
+	'i|input=s'		       => \$fasta_file,
+	't|tab=s'		       => \$tab_file,
+	'f|filtered-out=s'	       => \$out_filtered,
+	'o|output-repeats=s'	       => \$out_repeats,
+	'u|output-norepeats=s'	       => \$out_norepeats,
+        'n|filtered-repeats=s'         => \$filtered_repeats,
+        's|summary=s'                  => \$summary_out,
+	'h|help'		       => \$help
+);
+
+pod2usage(-verbose => 2) if ($help);
+print "A fasta file is required. Please enter a fasta file using the -i flag.\n" if (!$fasta_file);
+print "A spades tabs file is required. Please enter a tabs file using the -t flag\n" if (!$tab_file);
+pod2usage(1) unless $fasta_file && $tab_file;
+
+if (!$coverage_co)
+{
+   $coverage_co = 0.33;
+}
+if (!$length_co)
+{
+   $length_co = 1000;
+}
+if (!$coverage_length_co)
+{
+   $coverage_length_co = 5000;
+}
+if (!$repeat_co)
+{
+   $repeat_co = 1.75;
+}
+if (!$out_filtered)
+{
+   $out_filtered = "Discarded_sequences.fasta";
+   print "Discarded sequences will be printed out to $out_filtered\n";
+}
+if (!$out_repeats)
+{
+   $out_repeats = "Filtered_sequences_with_repeats.fasta";
+   print "Filtered sequences with repeats will be printed out to $out_repeats\n";
+}
+if (!$out_norepeats)
+{
+   $out_norepeats = "Filtered_sequences_no_repeats.fasta";
+   print "Filtered sequences without repeats will be printed out to $out_norepeats\n";
+}
+if (!$filtered_repeats)
+{
+   $filtered_repeats = "Repeat_sequences.fasta";
+   print "Repeat sequences will be printed out to $filtered_repeats\n";
+}
+
+die ("No tab file specified") unless ($tab_file);
+die ("No fasta file specified") unless ($fasta_file);
+
+##Read tab file and discard rows with comments
+open TAB, '<', $tab_file or die "Could not open tab file: $?";
+open SEQIN, '<', $fasta_file or die "Could not open tab file: $?";
+open SEQOUT_REP, '>', $out_repeats or die "Could not open file for writing: $?";
+open SEQOUT_NOREP, '>', $out_norepeats or die "Could not open file for writing: $?";
+open SEQOUT_FILT, '>', $out_filtered if ($out_filtered);
+open SEQOUT_FILT_REP, '>', $filtered_repeats or die "Could not open file for writing: $?";
+open SUMMARY, '>', $summary_out if ($summary_out);
+
+
+my $avg_coverage = 0;
+my $num_contigs = 0;
+my $cutoff_coverage;
+my $cutoff_repeats;
+my @stats;
+
+
+while (<TAB>)
+{
+	chomp;
+	push @stats, $_ unless (/^#/);
+}
+
+#Calculate average coverage.
+foreach my $stat(@stats)
+{
+	my ($length, $coverage);
+	(undef,$length, $coverage) = split(/\t+/, $stat);
+        die "length or coverage not defined at $stat\n" unless ($length && ($coverage ne '' && $coverage >= 0));
+	if ($length >= $coverage_length_co)
+	{
+		$avg_coverage = $avg_coverage + $coverage;
+		$num_contigs++;
+	}
+}
+
+$avg_coverage = $avg_coverage / $num_contigs;
+$cutoff_coverage = $avg_coverage * $coverage_co;
+$cutoff_repeats = $avg_coverage * $repeat_co;
+
+print SUMMARY "Filter SPAdes repeats Results Summary\n======================================\n\n" if ($summary_out);
+print SUMMARY "Paramaters used:\nLength cutoff for calcularing average cutoff: $coverage_length_co\nCoverage cutoff ratio: $coverage_co\nRepeat cutoff ratio: $repeat_co\nLength cutoff: $length_co\n\n" if ($summary_out);
+
+print SUMMARY "Calculations:\nAverage coverage: $avg_coverage\nCoverage cutoff: $cutoff_coverage\nRepeat cutoff: $cutoff_repeats\n\nFile headers:\n" if ($summary_out);
+
+my ($header, $seq_id, $seq);
+my $repeated = 0;
+my $valid = 0;
+
+#Summary strings:
+my $discarded = "";
+my $repeats = "";
+my $filtered_rep = "";
+my $filtered_norep = "";
+
+while (my $line = <SEQIN>)
+{
+	if ($line =~ />/)
+	{
+		chomp $line;
+                #Get the sequence name to compare against tab file
+		$header = $line;
+		$seq_id = $line =~ /(\w+)_length/;
+		$seq = "";
+
+		my $stat = shift @stats;
+		die "Less rows in tab than sequences in seq file" unless $stat;
+		my($name, $length, $coverage) = split(/\t+/, $stat);
+		die "name or length not defined at $stat\n" unless ($name && $length);
+                die "coverage is not defined at $stat\n" unless ($coverage ne '' && $coverage >= 0);
+		die "Unmatched names $header and $name\n" unless ($header =~ /$name/i);
+
+               #Entry passes the length and coverage cutoffs?
+		if ($length >= $length_co && $coverage >= $cutoff_coverage)
+		{
+			$valid = 1;
+			#Repeats
+			if ($coverage >= $cutoff_repeats)
+			{
+				my $num_repeats = int($coverage/$avg_coverage);
+				$header = $header."(".$num_repeats." copies)";
+				print SEQOUT_REP $header,"\n";
+                                $filtered_rep = $filtered_rep.$header."\n";
+                                print SEQOUT_FILT_REP $header, "\n";
+                                $repeats = $repeats.$header."\n";
+				$repeated = 1;
+			}
+			else
+			{
+				print SEQOUT_REP $header, "\n";
+                                $filtered_rep = $filtered_rep.$header."\n";
+				print SEQOUT_NOREP $header, "\n";
+                                $filtered_norep = $filtered_norep.$header."\n";
+				$repeated = 0;
+			}
+		}
+		elsif ($out_filtered)
+		{
+			$valid = 0;
+			print SEQOUT_FILT $header,"\n";
+                        $discarded = $discarded.$header."\n";
+		}
+	}
+	else
+	{
+		if ($valid)
+		{
+			print SEQOUT_REP $line;
+			if (!$repeated)
+			{
+				print SEQOUT_NOREP $line;
+			}
+                        else
+                        {
+                              print SEQOUT_FILT_REP $line;
+                        }
+		}
+		elsif ($out_filtered)
+		{
+			print SEQOUT_FILT $line;
+		}
+	}
+
+}
+
+close TAB;
+close SEQIN;
+close SEQOUT_REP;
+close SEQOUT_NOREP;
+close SEQOUT_FILT;
+close SEQOUT_FILT_REP;
+
+
+#Get summary info:
+if ($summary_out)
+{
+   print SUMMARY "Filtered sequences (with repeats):\n$filtered_rep\n";
+   print SUMMARY "Filtered sequences (no repeats):\n$filtered_norep\n";
+   print SUMMARY "Repeat sequences:\n$repeats\n";
+   if ($out_filtered)
+   {
+      print SUMMARY "Discarded sequences:\n$discarded\n";
+   }
+
+   close SUMMARY;
+}
+
+die "More rows in stats file than sequences in the fasta file\n" if (scalar(@stats) > 0);
+exit 0;
+
+
+__END__
+
+
+
+=head1 NAME
+
+	filter_spades_repeats.pl - Filters contigs or scaffolds based on contig length and detects contigs/scaffolds with very high coverage.
+
+
+
+=head1 USAGE
+
+	filter_spades_output.pl -i <contigs/scaffolds input>
+                                -t <stats input>
+                                -o <output fasta with repeats>
+                                -u <output fasta without repeats>
+
+                                Optional:
+                                -c <coverage cutoff ratio> (default 0.33)
+				-l <length cutoff> (default: 1000)
+                                -e <length cutoff for average coverage calculation> (default: 5000)
+				-r <repeat cutoff ratio> (default (1.75)
+                                -n <filtered repeated sequences>
+                                -f <discarded sequences>
+                                -s <output summary file>
+
+                                For more information:
+                                -h
+
+
+=head1 INPUT
+
+=over 8
+
+=item B<-i>B<--input>
+
+Contigs/Scaffolds fasta file.
+
+=item B<-t>B<--tab>
+
+The tabular output file from SPAdes. This file should have the following format:
+
+      #name length   coverage
+
+      NODE_1   31438 24.5116
+
+      NODE_2   31354 2316.96
+
+      NODE_3   26948 82.3294
+
+=item B<-o>B<--output-repeats>
+
+Output fasta file including the contigs marked as repeated.
+
+=item B<-u>B<--output-norepeats>
+
+Output fasta file excluding the contigs marked as repeated.
+
+=item B<-c>B<--coverage-cutoff>
+
+Mininum coverage ratio.
+
+	coverage_theshold = average_coverage * minimum_coverage_ratio.
+
+Any contigs/scaffolds with coverage below the coverage_theshold will be eliminated.
+
+=item B<-l>B<--length-cutoff>
+
+Mininum length. Contigs below this length will be eliminated.
+
+=item B<-e>B<--coverage-length-cutoff>
+
+Minimum length to use for average coverage calculations.
+
+=item B<-r>B<--repeat-cutoff>
+
+Minimum repeats ratio.
+
+	repeat_threshold = average_coverage * repeat_ratio.
+
+Any contigs with coverage below this threshold will be considered to be repeated
+
+
+=item B<-f>B<--filtered-out>
+
+If specified, filtered out sequences will be written to this file.
+
+=item B<-s>B<--summary>
+
+A summary of results
+
+=back
+=cut
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Sat Jul 04 09:45:30 2015 -0400
@@ -0,0 +1,7 @@
+<?xml version="1.0"?>
+<tool_dependency>
+    <repository name="package_perl_5_18" owner="iuc" >
+        <package name="perl" version="5.18.1" prior_installation_required="True" />
+    </repository>
+</tool_dependency>
+