changeset 1:16f1f3e2de42 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit 02d0ae7ac02425ef454d2e42a0513887596a3b4d"
author iuc
date Wed, 21 Apr 2021 09:10:46 +0000
parents 9c620a950d3a
children cd0874854f51
files fasta-stats.pl fasta-stats.xml test-data/ng50_out.txt
diffstat 3 files changed, 63 insertions(+), 11 deletions(-) [+]
line wrap: on
line diff
--- a/fasta-stats.pl	Thu Nov 22 04:16:35 2018 -0500
+++ b/fasta-stats.pl	Wed Apr 21 09:10:46 2021 +0000
@@ -8,6 +8,16 @@
 use warnings;
 use List::Util qw(sum min max);
 
+
+#Parameters
+my $file = shift;
+my $calc_ng50 = 0;
+my $genome_size = 0;
+if (scalar(@ARGV) > 0){
+  $genome_size = $ARGV[0];
+  $calc_ng50 = 1;
+}
+
 # stat storage
 
 my $n=0;
@@ -17,7 +27,10 @@
 
 # MAIN LOOP collecting sequences
 
-while (my $line = <ARGV>) {
+#open the file first
+open IN, $file or die{ "Couldn't open $file for reading\n$!" };
+
+while (my $line = <IN>) {
   chomp $line;
   if ($line =~ m/^\s*>/) {
     process($seq) if $n;
@@ -46,17 +59,15 @@
   $stat{'len_max'} = $len[-1];
   $stat{'len_median'} = $len[int(@len/2)];
   $stat{'len_mean'} = int( $stat{'num_bp'} / $stat{'num_seq'} ); 
+  
   # calculate n50
-
-  $stat{'len_N50'} = 0;
-  my $cum=0;
   my $thresh = int 0.5 * $stat{'num_bp'};
-  for my $i (0 .. $#len) {
-    $cum += $len[$i];
-    if ($cum >= $thresh) {
-      $stat{'len_N50'} = $len[$i];
-      last;
-    }
+  $stat{'len_N50'} = &calc_x50(@len, $thresh);
+  
+  #calculate NG50
+  if ($calc_ng50) {
+    my $thresh = int 0.5 * $genome_size * 1000000;
+    $stat{'len_NG50'} = &calc_x50(@len, $thresh);
   }
 }
 
@@ -87,3 +98,18 @@
   push @len, length($s);    
 }
 
+# N50/NG50 calculation sub
+
+sub calc_x50{
+  my @x = shift;
+  my $thresh = shift;
+  my $cum=0;
+  for my $i (0 .. $#x) {
+    $cum += $x[$i];
+    if ($cum >= $thresh) {
+      return $x[$i];
+    }
+  }
+  return 0;
+}
+
--- a/fasta-stats.xml	Thu Nov 22 04:16:35 2018 -0500
+++ b/fasta-stats.xml	Wed Apr 21 09:10:46 2021 +0000
@@ -1,4 +1,4 @@
-<tool id="fasta-stats" name="Fasta Statistics" version="1.0.1">
+<tool id="fasta-stats" name="Fasta Statistics" version="1.0.2">
     <description>Display summary statistics for a fasta file.</description>
     <requirements>
         <requirement type="package" version="5.26">perl</requirement>
@@ -6,11 +6,15 @@
     <command detect_errors="exit_code"><![CDATA[
         perl '${__tool_directory__}/fasta-stats.pl'
         '$dataset'
+        #if $genome_size:
+            $genome_size
+        #end if
         > '$stats'
         ]]>
     </command>
     <inputs>
         <param name="dataset" type="data" format="fasta" label="fasta or multifasta file" help="fasta dataset to get statistics for."/>
+        <param name="genome_size" type="float" optional="True" label="Genome size estimate (optional)" help="Estimate of the genome size in megabases (MB). If specified, NG50 will be calculated."/>
     </inputs>
     <outputs>
         <data name="stats" format="tabular" label="${tool.name} on ${on_string}: Fasta summary stats"/>
@@ -20,6 +24,11 @@
             <param name="dataset" value="test.fasta"/>
             <output name="stats" file="test_out.txt"/>
         </test>
+        <test>
+            <param name="dataset" value="test.fasta"/>
+            <param name="genome_size" value="5.0"/>
+            <output name="stats" file="ng50_out.txt"/>
+        </test>
     </tests>
     <help>
 **Fasta Stats**
@@ -36,6 +45,8 @@
 
     GC content in %
 
+    If an optional genome size estimate is specified, then the NG50 length will also be calculated. 
+
 ------
 
 Inputs:
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/ng50_out.txt	Wed Apr 21 09:10:46 2021 +0000
@@ -0,0 +1,15 @@
+GC_content	52.0
+len_N50	194780
+len_NG50	0
+len_max	194780
+len_mean	194780
+len_median	194780
+len_min	194780
+num_A	46297
+num_C	50626
+num_G	50678
+num_N	0
+num_T	47179
+num_bp	194780
+num_bp_not_N	194780
+num_seq	1