changeset 1:27f6392e254f draft default tip

Deleted selected files
author aaronpetkau
date Sat, 04 Jul 2015 10:48:40 -0400
parents ddd1e15df88c
children
files filter_spades_repeats.xml nml_filter_spades_repeats.pl tool_dependencies.xml
diffstat 3 files changed, 0 insertions(+), 482 deletions(-) [+]
line wrap: on
line diff
--- a/filter_spades_repeats.xml	Sat Jul 04 09:45:30 2015 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,160 +0,0 @@
-<tool id="filter_spades_repeat" name="Filter SPAdes repeats" version="1.0.0">
-	<description>Remove short and repeat contigs/scaffolds</description>
-	<requirements>
-		<requirement type="package" version="5.18.1">perl</requirement>
-	</requirements>
-	<command interpreter="perl">nml_filter_spades_repeats.pl -i $fasta_input -t $tab_input -c $cov_cutoff -r $rep_cutoff -l $len_cutoff -o $output_with_repeats -u $output_without_repeats -n $repeat_sequences_only -e $cov_len_cutoff -f $discarded_sequences -s $summary
-	</command>
-
-	<inputs>
-		<param name="fasta_input" type="data" format="fasta" label="Contigs or scaffolds file" help="Contigs/Scaffolds output file from Spades" />
-		<param name="tab_input" type="data" format="tabular" label="Stats file" help="Enter the corresponding stats file of the fasta file input above" />
-		<param name="cov_cutoff" type="float" value="0.33" min="0" label="Coverage cut-off ratio" help="This is the average coverage ratio cutoff. For example: if the average coverage is 100 and a coverage cut-off ratio of 0.5 is used, then any contigs with coverage lower than 50 will be eliminated." />
-		<param name="rep_cutoff" type="float" value="1.75" min="0" label="Repeat cut-off ratio" help="This is the coverage ratio cutoff to determine repeats in contigs. For exmaple: if the average coverage is 100 and a repeat cut-off ratio of 1.75 is used, then any contigs with coverage more than or equal to 175 will be marked as repeats." />
-		<param name="len_cutoff" type="integer" value="1000" min="0" label="Length cut-off" help="Contigs with length under the chosen cut-off will be eliminated." />
-                <param name="cov_len_cutoff" type="integer" value="5000" min="0" label="Length for average coverage calculation" help="Only contigs above this length will be used to calculate the average coverage." />
-		<param name="keep_leftover" type="select" label="Print out a fasta file containing the discarded sequences?">
-			<option value="yes">Yes</option>
-			<option value="no">No</option>
-		</param>
-                <param name="print_summary" type="select" label="Print out a summary of all the results?">
-                        <option value="yes">Yes</option>
-                        <option value="no">No</option>
-               </param>
-	</inputs>
-	<outputs>
-		<data format="fasta" name="output_with_repeats" label="Filtered sequences (with repeats)" />
-		<data format="fasta" name="output_without_repeats" label="Filtered sequences (no repeats)" />
-                <data format="fasta" name="repeat_sequences_only" label="Repeat sequences" />
-		<data format="fasta" name="discarded_sequences" label="Discarded sequences">
-			<filter>keep_leftover == "yes"</filter>
-		</data>
-                <data format="txt" name="summary" label="Results summary">
-                        <filter>print_summary == "yes"</filter>
-               </data>
-	</outputs>
-
-
-
-
-	<help>
-================
-**What it does**
-================
-
-Using the output of SPAdes (a fasta and a stats file, either from contigs or scaffolds), it filters the fasta files, discarding all sequences that are under a given length or under a calculated coverage. Repeated contigs are detected based on coverage.
-
---------------------------------------
-
-==========
-**Output**
-==========
-
-- **Filtered sequences (with repeats)** 
-	- Will contain the filtered contigs/scaffolds including the repeats. These are the sequences that passed the length and minumum coverage cutoffs.
-	- For workflows, this output is named **output_with_repeats**
-- **Filtered sequences (no repeats)**   
-	-  Will contain the filtered contigs/scaffolds excluding the repeats. These are the sequences that passed the length, minimum coverage and repeat cutoffs.
-	- For workflows, this output is named **output_without_repeats**
-- **Repeat sequences**                  
-	- Will contain the repeated contigs/scaffolds only. These are the sequences that were exluded for having high coverage (determined by the repeat cutoff).
-	- For workflows, this output is named **repeat_sequences_only**
-- **Discarded sequences**               
-	- If selected, will contain the discarded sequences. These are the sequences that fell below the length and minumum coverage cutoffs, and got discarded.
-	- For workflows, this output is named **discarded_sequences**
-- **Results summary**  : If selected, will contain a summary of all the results.
-  
-------------------------------------------
-
-============
-**Example**
-============
-
-Stats file input:
-------------------
-    
-    +------------+------------+------------+
-    |#name       |length      |coverage    |
-    +============+============+============+
-    |NODE_1      |2500        |15.5        |
-    +------------+------------+------------+
-    |NODE_2      |102         |3.0         |
-    +------------+------------+------------+
-    |NODE_3      |1300        |50.0        |
-    +------------+------------+------------+
-    |NODE_4      |1000        |2.3         |
-    +------------+------------+------------+
-    |NODE_5      |5000        |14.3        |
-    +------------+------------+------------+
-    |NODE_6      |450         |25.2        |
-    +------------+------------+------------+
-
-User Inputs:
-------------
-
-- Coverage cut-off ratio = 0.33 
-- Repeat cut-off ratio = 1.75  
-- Length cut-off = 500
-- Length for average coverage calculation = 1000
-
-Calculations:
--------------
-
-**Average coverage will be calculatd based on contigs with length >= 1000bp**
-
-
-- Average coverage = 15.5 + 50.0 + 2.3 + 14.3 / 4 = 20.5
-
-**Contigs that have coverage in the lower 1/3 of the average coverage will be eliminated.**
-
-- Coverage cut-off = 20.5 * 0.33 = 6.8
-
-**Contigs with high coverage (larger than 1.75 times the average coverage) are considered to be repeated contigs.**
-
-- Repeat cut-off = 20.5 * 1.75 = 35.9
-
-**Number of copies are calculated by dividing the sequence coverage by the average coverage.**
-
-- Number of repeats for NODE_3  = 50 / 20.5 = 2 copies
-
-
-Output (in fasta format):
---------------------------
-
-**Filtered sequences (with repeats)**
-
-::
-
-	>NODE_1
-	>NODE_3 (2 copies)
-	>NODE_5
-
-**Filtered sequences (no repeats)**
-
-::
-
-	>NODE_1
-	>NODE_5
-
-**Repeat sequences**
-
-::
-
-      >NODE_3 (2 copies)
-
-**Discarded sequences**
-
-::
-
-	>NODE_2
-	>NODE_4
-    >NODE_6
-
----------------------------------------
-
-
-
-
-</help>
-
-</tool>
--- a/nml_filter_spades_repeats.pl	Sat Jul 04 09:45:30 2015 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,315 +0,0 @@
-#!/usr/bin/env perl
-
-use warnings;
-use strict;
-use Getopt::Long;
-use Bio::SeqIO;
-use Pod::Usage;
-
-my ($fasta_file, $tab_file, $coverage_co, $length_co, $repeat_co, $out_filtered, $out_repeats, $out_norepeats,$coverage_length_co, $summary_out, $filtered_repeats, $help);
-
-GetOptions(
-	'c|coverage-cutoff=s'	       => \$coverage_co,
-	'l|length-cutoff=s'	       => \$length_co,
-        'e|coverage-length-cutoff=s'   => \$coverage_length_co,
-	'r|repeat_cutoff=s'	       => \$repeat_co, 
-	'i|input=s'		       => \$fasta_file,
-	't|tab=s'		       => \$tab_file,
-	'f|filtered-out=s'	       => \$out_filtered,
-	'o|output-repeats=s'	       => \$out_repeats,
-	'u|output-norepeats=s'	       => \$out_norepeats,
-        'n|filtered-repeats=s'         => \$filtered_repeats,
-        's|summary=s'                  => \$summary_out,
-	'h|help'		       => \$help
-);
-
-pod2usage(-verbose => 2) if ($help);
-print "A fasta file is required. Please enter a fasta file using the -i flag.\n" if (!$fasta_file);
-print "A spades tabs file is required. Please enter a tabs file using the -t flag\n" if (!$tab_file);
-pod2usage(1) unless $fasta_file && $tab_file;
-
-if (!$coverage_co)
-{
-   $coverage_co = 0.33;
-}
-if (!$length_co)
-{
-   $length_co = 1000;
-}
-if (!$coverage_length_co)
-{
-   $coverage_length_co = 5000;
-}
-if (!$repeat_co)
-{
-   $repeat_co = 1.75;
-}
-if (!$out_filtered)
-{
-   $out_filtered = "Discarded_sequences.fasta";
-   print "Discarded sequences will be printed out to $out_filtered\n";
-}
-if (!$out_repeats)
-{
-   $out_repeats = "Filtered_sequences_with_repeats.fasta";
-   print "Filtered sequences with repeats will be printed out to $out_repeats\n";
-}
-if (!$out_norepeats)
-{
-   $out_norepeats = "Filtered_sequences_no_repeats.fasta";
-   print "Filtered sequences without repeats will be printed out to $out_norepeats\n";
-}
-if (!$filtered_repeats)
-{
-   $filtered_repeats = "Repeat_sequences.fasta";
-   print "Repeat sequences will be printed out to $filtered_repeats\n";
-}
-
-die ("No tab file specified") unless ($tab_file);
-die ("No fasta file specified") unless ($fasta_file);
-
-##Read tab file and discard rows with comments
-open TAB, '<', $tab_file or die "Could not open tab file: $?";
-open SEQIN, '<', $fasta_file or die "Could not open tab file: $?";
-open SEQOUT_REP, '>', $out_repeats or die "Could not open file for writing: $?";
-open SEQOUT_NOREP, '>', $out_norepeats or die "Could not open file for writing: $?";
-open SEQOUT_FILT, '>', $out_filtered if ($out_filtered);
-open SEQOUT_FILT_REP, '>', $filtered_repeats or die "Could not open file for writing: $?";
-open SUMMARY, '>', $summary_out if ($summary_out);
-
-
-my $avg_coverage = 0;
-my $num_contigs = 0;
-my $cutoff_coverage;
-my $cutoff_repeats;
-my @stats;
-
-
-while (<TAB>)
-{
-	chomp;
-	push @stats, $_ unless (/^#/);
-}
-
-#Calculate average coverage.
-foreach my $stat(@stats)
-{
-	my ($length, $coverage);
-	(undef,$length, $coverage) = split(/\t+/, $stat);
-        die "length or coverage not defined at $stat\n" unless ($length && ($coverage ne '' && $coverage >= 0));
-	if ($length >= $coverage_length_co)
-	{
-		$avg_coverage = $avg_coverage + $coverage;
-		$num_contigs++;
-	}
-}
-
-$avg_coverage = $avg_coverage / $num_contigs;
-$cutoff_coverage = $avg_coverage * $coverage_co;
-$cutoff_repeats = $avg_coverage * $repeat_co;
-
-print SUMMARY "Filter SPAdes repeats Results Summary\n======================================\n\n" if ($summary_out);
-print SUMMARY "Paramaters used:\nLength cutoff for calcularing average cutoff: $coverage_length_co\nCoverage cutoff ratio: $coverage_co\nRepeat cutoff ratio: $repeat_co\nLength cutoff: $length_co\n\n" if ($summary_out);
-
-print SUMMARY "Calculations:\nAverage coverage: $avg_coverage\nCoverage cutoff: $cutoff_coverage\nRepeat cutoff: $cutoff_repeats\n\nFile headers:\n" if ($summary_out);
-
-my ($header, $seq_id, $seq); 
-my $repeated = 0;
-my $valid = 0;
-
-#Summary strings:
-my $discarded = "";
-my $repeats = "";
-my $filtered_rep = "";
-my $filtered_norep = "";
-
-while (my $line = <SEQIN>)
-{
-	if ($line =~ />/)
-	{
-		chomp $line;
-                #Get the sequence name to compare against tab file
-		$header = $line;
-		$seq_id = $line =~ /(\w+)_length/;
-		$seq = "";
-	
-		my $stat = shift @stats;
-		die "Less rows in tab than sequences in seq file" unless $stat;
-		my($name, $length, $coverage) = split(/\t+/, $stat);
-		die "name or length not defined at $stat\n" unless ($name && $length);
-                die "coverage is not defined at $stat\n" unless ($coverage ne '' && $coverage >= 0);
-		die "Unmatched names $header and $name\n" unless ($header =~ /$name/i);
-
-               #Entry passes the length and coverage cutoffs?
-		if ($length >= $length_co && $coverage >= $cutoff_coverage)
-		{
-			$valid = 1;
-			#Repeats
-			if ($coverage >= $cutoff_repeats)
-			{
-				my $num_repeats = int($coverage/$avg_coverage);
-				$header = $header."(".$num_repeats." copies)";
-				print SEQOUT_REP $header,"\n";
-                                $filtered_rep = $filtered_rep.$header."\n";
-                                print SEQOUT_FILT_REP $header, "\n";
-                                $repeats = $repeats.$header."\n";
-				$repeated = 1;
-			}
-			else
-			{
-				print SEQOUT_REP $header, "\n";
-                                $filtered_rep = $filtered_rep.$header."\n";
-				print SEQOUT_NOREP $header, "\n";
-                                $filtered_norep = $filtered_norep.$header."\n";
-				$repeated = 0;
-			}
-		}
-		elsif ($out_filtered)
-		{
-			$valid = 0;
-			print SEQOUT_FILT $header,"\n";
-                        $discarded = $discarded.$header."\n";
-		}
-	}
-	else
-	{
-		if ($valid)
-		{
-			print SEQOUT_REP $line;
-			if (!$repeated)
-			{
-				print SEQOUT_NOREP $line;
-			}
-                        else
-                        {
-                              print SEQOUT_FILT_REP $line;
-                        }
-		}
-		elsif ($out_filtered)
-		{
-			print SEQOUT_FILT $line;
-		}
-	}
-	
-}
-
-close TAB;
-close SEQIN;
-close SEQOUT_REP;
-close SEQOUT_NOREP;
-close SEQOUT_FILT;
-close SEQOUT_FILT_REP;
-
-
-#Get summary info:
-if ($summary_out)
-{
-   print SUMMARY "Filtered sequences (with repeats):\n$filtered_rep\n";
-   print SUMMARY "Filtered sequences (no repeats):\n$filtered_norep\n";
-   print SUMMARY "Repeat sequences:\n$repeats\n";
-   if ($out_filtered)
-   {
-      print SUMMARY "Discarded sequences:\n$discarded\n"; 
-   }
-
-   close SUMMARY;
-}
-
-die "More rows in stats file than sequences in the fasta file\n" if (scalar(@stats) > 0);
-exit 0;
-
-
-__END__
-
-
-
-=head1 NAME
-
-	filter_spades_repeats.pl - Filters contigs or scaffolds based on contig length and detects contigs/scaffolds with very high coverage.
-
-
-
-=head1 USAGE
-
-	filter_spades_output.pl -i <contigs/scaffolds input>
-                                -t <stats input>
-                                -o <output fasta with repeats>
-                                -u <output fasta without repeats>
-                                
-                                Optional:
-                                -c <coverage cutoff ratio> (default 0.33) 
-				-l <length cutoff> (default: 1000)
-                                -e <length cutoff for average coverage calculation> (default: 5000)
-				-r <repeat cutoff ratio> (default (1.75)
-                                -n <filtered repeated sequences>
-                                -f <discarded sequences>
-                                -s <output summary file>
-
-                                For more information:
-                                -h
-
-
-=head1 INPUT
-
-=over 8
-
-=item B<-i>B<--input>
-
-Contigs/Scaffolds fasta file.
-
-=item B<-t>B<--tab>
-
-The tabular output file from SPAdes. This file should have the following format:
-
-      #name length   coverage
-
-      NODE_1   31438 24.5116
-      
-      NODE_2   31354 2316.96
-
-      NODE_3   26948 82.3294
-
-=item B<-o>B<--output-repeats>
-
-Output fasta file including the contigs marked as repeated.
-
-=item B<-u>B<--output-norepeats>
-
-Output fasta file excluding the contigs marked as repeated.
-
-=item B<-c>B<--coverage-cutoff>
-
-Mininum coverage ratio. 
-
-	coverage_theshold = average_coverage * minimum_coverage_ratio.
-
-Any contigs/scaffolds with coverage below the coverage_theshold will be eliminated.
-
-=item B<-l>B<--length-cutoff>
-
-Mininum length. Contigs below this length will be eliminated.
-
-=item B<-e>B<--coverage-length-cutoff>
-
-Minimum length to use for average coverage calculations.
-
-=item B<-r>B<--repeat-cutoff>
-
-Minimum repeats ratio. 
-
-	repeat_threshold = average_coverage * repeat_ratio. 
-
-Any contigs with coverage below this threshold will be considered to be repeated
-
-
-=item B<-f>B<--filtered-out>
-
-If specified, filtered out sequences will be written to this file.
-
-=item B<-s>B<--summary>
-
-A summary of results
-
-=back
-=cut
-
--- a/tool_dependencies.xml	Sat Jul 04 09:45:30 2015 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,7 +0,0 @@
-<?xml version="1.0"?>
-<tool_dependency>
-    <repository name="package_perl_5_18" owner="iuc" >
-        <package name="perl" version="5.18.1" prior_installation_required="True" />
-    </repository>
-</tool_dependency>
-