Repository 'fasta_stats'
hg clone https://toolshed.g2.bx.psu.edu/repos/iuc/fasta_stats

Changeset 0:9c620a950d3a (2018-11-22)
Next changeset 1:16f1f3e2de42 (2021-04-21)
Commit message:
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit d6a78405947a91659a4168ddb2f1534327f044cb
added:
fasta-stats.pl
fasta-stats.xml
test-data/test.fasta
test-data/test_out.txt
b
diff -r 000000000000 -r 9c620a950d3a fasta-stats.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fasta-stats.pl Thu Nov 22 04:16:35 2018 -0500
[
@@ -0,0 +1,89 @@
+#!/usr/bin/env perl
+
+# fasta-stats
+# written by torsten.seemann@monash.edu 
+# oct 2012
+
+use strict;
+use warnings;
+use List::Util qw(sum min max);
+
+# stat storage
+
+my $n=0;        # number of FASTA header lines seen so far
+my $seq = '';   # residues accumulated for the record currently being read
+my %stat;       # statistic name => value; printed as the final table
+my @len;        # length of every sequence handed to process()
+
+# MAIN LOOP collecting sequences
+#
+# Reads every line of the files named on the command line (or STDIN when
+# there are none). A header line (optional leading whitespace followed by
+# '>') flushes the record collected so far and starts a new one; any other
+# line is appended to the current record. Text that appears before the
+# first header is discarded when that header resets $seq.
+
+while (my $line = <ARGV>) {
+  # Remove the line terminator together with any other trailing whitespace.
+  # A plain chomp would leave the "\r" of CRLF files (and stray trailing
+  # blanks) inside the sequence, inflating every length and num_bp.
+  $line =~ s/\s+\z//;
+  if ($line =~ m/^\s*>/) {
+    process($seq) if $n;
+    $n++;
+    $seq='';
+  }
+  else {
+    $seq .= $line;
+  }
+}
+
+# the last record has no following header to flush it, so do it here
+process($seq) if $n;
+
+# sort length array (ascending, numerically)
+# (should use hash here for efficiency with huge no of short reads?)
+
+@len = sort { $a <=> $b } @len;
+
+# compute more stats
+
+$stat{'num_seq'} = scalar(@len);
+
+# the length statistics are only defined when at least one sequence was read
+if (@len) {
+  $stat{'num_bp'} = sum(@len);
+  $stat{'len_min'} = $len[0];
+  $stat{'len_max'} = $len[-1];
+  # with an even number of sequences this is the upper of the two middle values
+  $stat{'len_median'} = $len[int(@len/2)];
+  # the mean is truncated to a whole number
+  $stat{'len_mean'} = int( $stat{'num_bp'} / $stat{'num_seq'} );
+
+  # calculate n50: walking from the LONGEST sequence downwards, the length of
+  # the sequence at which the running total first covers at least half of
+  # all bases. Accumulating from the shortest sequence upwards gives a
+  # different (wrong) value, e.g. lengths 2,3,5 would report 3 instead of 5.
+
+  $stat{'len_N50'} = 0;
+  my $cum=0;
+  for my $length (reverse @len) {
+    $cum += $length;
+    # compare 2*cum with the total instead of cum with int(total/2), so an
+    # odd total is not rounded down (3,2,2 must give 2, not 3)
+    if (2 * $cum >= $stat{'num_bp'}) {
+      $stat{'len_N50'} = $length;
+      last;
+    }
+  }
+}
+
+#calculate GC content
+# The num_* counters only exist in %stat once a sequence has been processed,
+# so default each one to 0 here; this avoids "uninitialized value" warnings
+# on input without any sequence.
+my $gc_count = ($stat{'num_G'} // 0) + ($stat{'num_C'} // 0);
+my $at_count = ($stat{'num_A'} // 0) + ($stat{'num_T'} // 0);
+# num_bp_not_N counts A, C, G and T only (any other symbol is excluded too)
+$stat{'num_bp_not_N'} = $gc_count + $at_count;
+# GC content as a percentage of the A/C/G/T bases. Report 0 when there are
+# none instead of dying with "Illegal division by zero".
+$stat{'GC_content'} = $stat{'num_bp_not_N'}
+  ? $gc_count / $stat{'num_bp_not_N'}*100
+  : 0;
+
+# print stats as .tsv
+# One "name<TAB>value" line per statistic, ordered by name. GC_content is
+# shown with one decimal place; every other value is printed as stored.
+
+foreach my $key (sort keys %stat) {
+    my $format = ($key =~ m/GC_content/) ? "%s\t%0.1f\n" : "%s\t%s\n";
+    printf $format, $key, $stat{$key};
+}
+
+# run for each sequence
+
+# process($sequence)
+#   Adds the A/G/T/C/N counts of one sequence (upper- and lower-case alike)
+#   to the num_* entries of %stat and appends the sequence length to @len.
+sub process {
+  my($sequence) = @_;
+  # base composition: tr/// with an empty replacement list only counts
+  $stat{"num_A"} += ($sequence =~ tr/Aa//);
+  $stat{"num_G"} += ($sequence =~ tr/Gg//);
+  $stat{"num_T"} += ($sequence =~ tr/Tt//);
+  $stat{"num_C"} += ($sequence =~ tr/Cc//);
+  $stat{"num_N"} += ($sequence =~ tr/Nn//);
+  # keep list of all lengths encountered
+  push @len, length($sequence);
+}
+
b
diff -r 000000000000 -r 9c620a950d3a fasta-stats.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fasta-stats.xml Thu Nov 22 04:16:35 2018 -0500
[
@@ -0,0 +1,55 @@
+<tool id="fasta-stats" name="Fasta Statistics" version="1.0.1">
+    <description>Display summary statistics for a fasta file.</description>
+    <requirements>
+        <requirement type="package" version="5.26">perl</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+        perl '${__tool_directory__}/fasta-stats.pl'
+        '$dataset'
+        > '$stats'
+        ]]>
+    </command>
+    <inputs>
+        <param name="dataset" type="data" format="fasta" label="fasta or multifasta file" help="fasta dataset to get statistics for."/>
+    </inputs>
+    <outputs>
+        <data name="stats" format="tabular" label="${tool.name} on ${on_string}: Fasta summary stats"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="dataset" value="test.fasta"/>
+            <output name="stats" file="test_out.txt"/>
+        </test>
+    </tests>
+    <help>
+**Fasta Stats**
+Displays the summary statistics for a fasta file.
+
+------
+
+Outputs in tabular form:
+    Lengths: N50, min, max, median and mean (the mean is truncated to a whole number)
+
+    Number of bases: A, C, G, T, N, total (num_bp) and total of A+C+G+T (num_bp_not_N)
+
+    Number of sequences
+
+    GC content in %
+
+------
+
+Inputs:
+
+Fasta dataset
+    </help>
+    <citations>
+        <citation type="bibtex">
+@UNPUBLISHED{Seemann_Gladman2012,
+    author = {Torsten Seemann and Simon Gladman},
+    title = {Fasta Statistics: Display summary statistics for a fasta file.},
+    year = {2012},
+    url = {https://github.com/galaxyproject/tools-iuc},
+}
+        </citation>
+    </citations>
+</tool>
b
diff -r 000000000000 -r 9c620a950d3a test-data/test.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test.fasta Thu Nov 22 04:16:35 2018 -0500
b
b'@@ -0,0 +1,2784 @@\n+>gi|529218539|ref|NC_021976.1| Acetobacter pasteurianus 386B plasmid Apa386Bp1, complete sequence\n+GAAGGAAGCTTGGATCAGCTTCGTGTTTCCCGTGTGCCCCATTTCATTATGCAGTTCCTGAAGAGATCAT\n+CGTGGTTAGGTTGAGTCTCCGATCCGCCATGAGGGAACGTGTAGAATGTAGCGAGTGTCATGAACCATGA\n+CGAACATCGGGAAGAGCGGTAACCGCCAAGGACTGATGAGGTGTGAAGCCAACAGGCTGGTACCATACAG\n+CACATTACGATGTAGACGCTCGTGATCCGGCGATGAGGATATCGACAAGGCGCTTTGCACTAGTTTCCCA\n+ACCGGGGCCAGCCGCATAGTTGGAAACACCAGCCAGAGCACGAAGTAGGTCGAGTGGTTCGATCTCTGTG\n+CTGATATCTCTACTGGCAACTGCGGCTGCTACAAGTTTCTCAATCGCGTTCTGTAGAACCGTGCCAGACT\n+GGGCATAAAGTGCGCTCGTTCCTCCCATCAGCCCATTGAGGGCGGGGGCAATTACCTGCTTGGCAGCAAT\n+ATAATCTACAAACAGTCGCATCCATGCACGCAGTGCCTCCACGGGCGGATGCTGTGTGGCAAGAACTGGT\n+GCCGCGTCGGCTAGGCGGTCCAGTTCAGCGCGATAGACCGCCTCGATCAGAGCATCGCGATTAGGAAAAT\n+GGCGGTAAAGTGTTCCAATTCCAACTGCCGCACGACGGGCAACTTCATCCAAAGGGATATCGATTTCCCC\n+CGCCGTGAACGCGGCTTTGGCCACTTTAAGCAGGTTTTCCCTGTTACGTTGTCCATCGCTGCGCGGTCGG\n+CGGCGCGGCTTTTTTTCGTCGGTCACCGGTTTTCCTCTTGTTAAACGGAGTATGTCTCCGCATATACAAG\n+ACGGAGCCATGCTCCATATAACACTCCGAATGCCGCTTCGGCAAAAAACGGATGATCCTTTCAAGGAGTT\n+CCACCCATGACACAGACTTTTGGCGCGCGTTCGACGACAGAAGATGTCCTTTCCAGCGTGTCCCTAAAGG\n+GCAAACGTGTTCTCGTAACTGGCGTTTCTGCTGGTCTGGGGGTTGAAACCGCGCGTGCCCTTGCGGCTCA\n+TGGTGCCCAGGTAGTGGGGGCCGCGCGTAACCTGACAAAAGCAGAACACGCCACCGGACAGGTCCGTGCA\n+GATGCTGAACGTGGGGGAGGGACGTTTGAACTGATCGCCCTAGACCTCGCGGATCTGACCAGTGTCCGCG\n+CCTGCGCTGACCAATTGAACGCAACCGGTTTGCCTTTCGATCTGGTTATTGCCAATGCGGGTGTGATGGC\n+TACACCATTTAGTCATACGAAGGACGGATTTGAGACGCAGTTCGGCACCAACCACCTAGGGCACTTTGTT\n+CTTGTCAACCAGATAGCGGGGCTGATGCGCCCCGGTGCCCGACTGGTCAATGTCTCCTCGGCCGGACATC\n+GCTTTGCAGATGTTGATCTGAAAGACCCGAATTTCGAGCATACACCATACGATCCATTCATCGCTTATGG\n+ACGTTCTAAAACAGCCAATATCCTTTTTGCCGTTGCGTTTGACGCACGACATCGTACACGTGGTGTGCGT\n+GCTACAGCAGTGCATCCGGGTGGGATCATGACCGAACTGCCGCGCTATATGCAGGCAGGTGCCATTGAGG\n+CAATGGTGGCGGGGATCAATGAACAGGCTGCTGCAGAAGGAAAGCCACCATTTCAGTTTAAGACCATTCC\n+ACAAGGTGCAGCCACATCGGTCTGGGCAGGCGTGGTGGCGGAAGCTGATGCGGTCGGTGGGCATTACAGC\n+GAAGACTGCCATGTAAGCCCGGTTATTCCTGATGATCAGCTTCTTAGCTTGG
TCAGTGAAGGTGTGCGGG\n+CCTACGCGGTTGATCCTGTGCATGCCGAGGCGCTGTGGGCAAAAAGTGAGGAAATGGTTGGAGAAACATT\n+TTCCTGAAACATCCACATCCTTGTTTCTGGCATAACGGATAAGCCCGACATCCAGACAGACATTTTGCCG\n+ATATGTGTAGCTTGAGCGCAGGGGACGGAAAACGTTTTTCCTGCGCTTTTCGTGCAAGGATATTGATTGT\n+AGTGCGCCTTGTTGTTTTTGGTTTGCCTCTTCTTGTTGTGATCCTGAATTGGCTGTGGCCATTACCGCTG\n+CCTTTGGGCCTGAAGATGTTTGCAGCCGTTCTGATGATCATGGCCGCACTTTACCACTACTGGTCTCGGC\n+TTTCTTCGGGGTCCGTTTTCACCCCGGAATTCCCCCGGCCTGTTATCATCCTGTTTAATTGGGCGTTCGG\n+GGCCATCCTGTTTCTTACGCTGCTCCAGATTGCGCTGGATTTCGGTGCACTACTGGTTGTGCTGATGACG\n+TGGCAGCCAGTGCATATTCCAGTAGGTGCGCGGGGGGCTGCTGGAGGTATTGCTGGGGTGCTGGCCGCAA\n+TTGGCGTTGCCAATGCGCTACGTGTGCCACCGGTCAGGAACGTGGCAGTGACCATTCCCGGCCTGTCGCC\n+AGCGTTTGACGGGTATCGACTGGTCCAATTGACCGATCTGCATATTAGTCGGCTGTTTCCCGCCAACTGG\n+GCAAGGGCAGTTGTGGACCGTACCAATGCGATCGGTGCTGATATGATTGTCGTGACGGGTGATTTCATTG\n+ACGGCTCAGTAGCCATGCGCCGCGCGGACGTGGCTCCACTTGCACAATTACACGCGCCAGACGGTGTTTT\n+GGCTATCCCGGGCAATCATGAATATTACTTTGATTATACCGACTGGATGCGTCATCTGAAGGAATTGGGC\n+TTTCACATGCTACTGAACCGTCACACAGTCATCTCAAGGGGAGGGGCTGAGTTGGTCATTGCAGGTGTTA\n+CTGATCGGTCCGCACCTAGGCATGGACAGGCTGGCCCCAATCTGGCCGCGGCTCTCACAGATAGTCCCGA\n+AGGGGCCCCGATTGTGCTGCTTGATCACCAACCCGGTGATGCGCGTGCGGCGGCTACCCAAGGTATTGCT\n+CTTCAGTTGTCGGGACATACACATGGCGGGATGATCCTCGGTCTCGACCGTCTTGTTGCGCGCGGGAATA\n+ACGGATTTGTATCCGGGCGCTACGATGTGAATGGCATGACGCTTTACGTCAATAACGGTACCGGGCTGTG\n+GCCAGGCTTTGCGCTTCGGCTGGGCATACCGCCTGAGATCACCTGCTTTAGTTTGCGTGCGGGATAATAC\n+TGGGGTTTTATCAAGTGCTTCTTATAGCACATCTCGCCGCAACCGTGTTCCTTCTCATTTCTTGAGTTAT\n+AGCTTCCTGTAATCACGTGGGCTAACACCTGTTTCCCTGCGAAACACCTGTGCAAAATGGCTTGGGCTCT\n+TATAACCAATGGTCAGGGCGATCTCGATGATGGATTGGTCGCTCTGGCACAAAAGTTCCTTCGCATGTTC\n+AACACGTCGCCGCACAAACCAGCGTGACGGGCTTTGCCCCATGGTGTTGTGAAATGATCGGCTGAAATGA\n+AACCGGCCCATCCCGCACAAATCGGCCAGAACATCCAGATCAAATGGTTCGGCGAGATGTGCCTCCATAT\n+GATCAAGTGCCTTGCGAAGTTTCCAAGCTGGGAGCTGTGCAGGTTTGGGCGTGCGTATCACATGTATATT\n+TGCATAATTGCGCAAAAGGTGGATGGTCAGACTTTCCAAAAGTCCGTTCACAAATAAAGGACTGGCCAAC\n+AGAGGCTTCTGTAGCTCAGCCGTCAGACCTGTCAGAACGCCAGATATAAATGTATCCTGAACACCGGAAA\n+TATCATGC
ATACGGAGCCGCAATGGGTTAAGCCCCAAAGATCGGGCAGCGCGGTTCACCAAAGT'..b'AAAGCGTGTTGGCATATATCTGTATTATTCGTTCCACTTATGTGAGTTTCAGATGCCCAAAAATTCGT\n+CCTCATTGCCAGAAGACCGTCTTCCGGTTGCTGGCCTTCTGGCGCTCGCAATGACAGGGTTCCTCTGCAT\n+CATGACGGAGACACTTCCTGCAGGACTGCTGCCGGAAATAAGTGCTGGTCTGCGGGTATCTCCCGCTTAT\n+GCAGGACAGATGGTCACCGCCTATGCTGTAGGTTCCCTAAGTGCCGCCATTCCACTCACACTTGCTACAC\n+AAAGATGGCGGCGACGTACCGTGCTGCTTCTGGCCATCATTGGTTTTCTACTGTTCAATGCAGTTACTGC\n+GCTTTCACCCAATTATTGGCTGACACTCGCAGTCCGTTACAGTGCGGGCGCAGCAGCGGGATTGGCATGG\n+GCACTTCTGGCGGGATATGCGCGGCGCATGGTAACACGTACCCAGCAGGGGCGAGCGCTGGCTATTGCCA\n+TGGTTGGAACGCCAATCGCCTTATCCCTTGGTGTTCCTGCCGGAACCTGGTTGGGAGCCATTTTCGGATG\n+GCGGTTGGCATTTGGCATGATGTCAGTCTTTACATTTTTGCTGATTGGCTGGGTGCTGGTGACGGTTCCG\n+GATTTTTCGGGTTCTTTGCATACACAACGGCCAGCATTCCGACATGTTCTAACGACATCTGGCGTGCGAC\n+CCGTTCTGGGAGTGGTGATCCTCTGGATGCTCGCACATAATATCCTGTATACTTACATTGTACCTTTTCT\n+GATACCGGCAGGGCTGGCGGGGAAGGCTGATCTTGTGTTGCTGGTTTTTGGTGTTGCCGCATTGGGTGGT\n+ATTGCGCTGACAGGGCAACTGGTGGATCATGCCCTGCGTGAGGCAGTACTGGTCAGTCTGGCCGTGTTTG\n+CGGTGGTCTGTCTCGCTTTTATCATTGATGTGCACTCTCCAGTGGTTATCTGGGGAGGTGTGAGTATCTG\n+GGGCCTGACTTTTGGCGGTGCCGCAACGTTATTGCAGACGGCGCTGGCTGATTCAGCACAAGAAGGCGCA\n+GATGTCGCATTGTCCATGAATGTTGTGGCATGGAACAGCGCTATTGCAGGAGGCGGTCTATTGGGAGGTC\n+TTCTCCTGAACCTGTGGGGTGCAGCCTCTTTCCCATGGGTTATGATGGGTCTTCTGATCTGCGGTTTTGG\n+GATCGTATGGCATGCTCGCTCACACGGGTTTCCGCAAGGTCATCGACTTTTGCAATAGCATCAGCAAACC\n+CTGTGAACTTCAGCAATGGGGTGTCCGGGAAAGTCATTGTATTGTCAGCGGATCAGTCATCAACAGGTCT\n+TTATCAGGAATAAAGCCTATTTCCTGATAGCCTGCGTCTAGCTGCCGTGTATGGTCCAATAGGTTCAGGT\n+GTAATCATGAGCACGATCAGGACATTCCCGGAACTGTTACCTGGCTTTCAGTGGCAGGATATTGAGGTGG\n+AAGGTGTGCGGATCCGCACGGCCACAGGCGGGAAGGGGCCACCAGTGCTTCTGCTGCATGGTCATCCGCA\n+AATGCACCTGACATGGCACAAGGTTGCTCCTACACTGGCAGAACATTTTACCGTTGTAGCGCCGGATCTG\n+CGTGGCTATGGCGATAGCGCCAAGCCTGAAGGCGGGGCAGGTCACGCCAATTATTCCAAACGTGCGATGG\n+CTGCCGATCAGGTTGGCGTCATGAAGGTACTCGGGTTTGAACGGTTCAGGGTGGTCGGGCATGACCGGGG\n+AGGGCGGGTTGCGCATCGCATGGCGCTGGATACACCACAGGTAGTGGAAAAGCTGGTGCTGATTGATATT\n+GCACCAACAGCAACAATGTATGCCCACACCAATATGGAGTTTG
CTCGACGCTATTTCTGGTGGTTTTTTC\n+TGATCCAGCCCTATCCTCTGCCAGAAAAGCTGATTGATGGCGACCCAGATTTCTTTCTGGAAAACCATAT\n+TGCAGGACAGATCAAAATCCCGAGGTCAGTCGATCCCCGCGTAATGGCTGAATACCGCCGCTGCTATGCT\n+GACCCGACAATGCGCCATGCAGCTTGTGAGGATTATCGGGCTGCGGCTGGTATAGACCTTGAGCATGATG\n+CAGCCGATGCCAACAAACGGGTTACAGCGCCGTTACTGGCATTATGGGGTGCACGCGGCACAGTGGGCGC\n+ACTTTATGATGTTGTGGAGACATGGCGCAAAAAAGCGACTGACGTGCAGGGCCACGCAATTGATTGCGGG\n+CATAGTCCACAGGAAGAAGCACCTGAAGAATTTCTGCATCAGCTTAAAGAGTTTCTTTGAAATGGATTCC\n+AAAATGTACAGTCTTTTGGCCAGATGAGTTCGGCATACGAATAATAGGGATTTTTATCTCTGCACATTAG\n+GGCCCGTGTAAAGAAAACGTACTGAAATATGATGGAAACACTATGGCGACAGACCTTCTATCCCAATAAT\n+CCCGCATTGTCGTAATTCTCAATAATCTGGTCAAAAAGGCTGACATTCGAGCGGCTTCTTGCCATGAGCT\n+GTGCTCCAGATACTGCAGCAAAGATGGCTCGTGCACGTTCTATGTTTTGGGATGATGCCGGGTTTATCCT\n+AGAGATGATCGTAGTCAGCCAGGTCACATTGATATCGGTAAATGTCTGTATTTCGTGTTTTACAGCATCA\n+GGCAGATCATCATATTCAGCACTCATGAAACTCCCTAGACACAGACGATTACCGTTTTCGAGTGAACGAC\n+GAAAGATTTCCGGATAACGACGCAAGCATGTCAGAGAATCTGGAGTTTCCATCTGCATACTTTCCAGATT\n+AGCCTTGGTATCTTCCCAATAGCGTCGCGCAACTGCCATTCCAAGATCGGCTTTACTGGGAAAGTGATAA\n+TAAATGCTTGCAGGCTTAATTCCGACTTCCTTGGCAAGTTCACGGAAGTTCAGGCCGTTGTAACCACGGG\n+TCTGTGCAGCCTTTCTGGCTGCTGCCAAGATAGCTTCTCTGGAACTCTCATGCGTTATGTTCATTGTATT\n+GCGACTGTCTCCCTATGTGTTGGTAGGTAAGCGTTGACGCCCTGAAAGGGAAGGCATAACAAAATATACC\n+TACCAAGTGGTAGGTAGGCACATGACAGGCGGTCCAAAACAGTTGTTTTCATTAGGGGTGAGGCAATTCT\n+TATGAAAATCTATGATTGGCCTACAGGGCCTTATCCGGCACGTGTGCGGATTGCTCTGGCGGAAAAGCAG\n+ATGCTGTCCAGGATAAAGTTTGTCCAGATCAATCTGTGGAAAGGCGAACATAAGACAGCGGACTTCCGAA\n+CCAAGAACTATTCAGGCACGGTGCCGGTTCTCGAACTTGATGACGGAACCTTTCTTGCCGAATGCACTGC\n+GATCACAGTTTATCTTGATACGCTTGACGGGATTCCAACATTGACTGGCAGAACGCCACGTGAAAAGGGC\n+GTGATCCAGATGATGACCAAACGCGCTGAAATCGAAATGCTCGATGCTATCAGCATTTATTTCCATCATG\n+CGACGCCAGGCCTTGGCCCTAAAGTAGAGCTTTACCAGAACAGGGAATGGGGCCTTTACCAGCGCGATAA\n+GGCTTTACGCGGCATGCATTACTTTGATGCTCTCCTTAAAAAACAGCCTTTTATCGCAGGGGAGAATTTT\n+TCTATGGCAGATATTGCCGTGATTGGAGGCTTTATATTCGCCGCAGTCGTGAAACTGCCTATTCCTCACG\n+AGTGCAGTGCCCTTCTGGCGTGGTATGCAAGAATGCAGGAACGTCCAAGCGTTCGGGATCAGCTGGCAAC\n
+TGTCTCGCCATAACATTGAAACTCGTTGTGCTAAATGTAGCACGCAGCTTTTTTATGACTCTTAATGTGC\n+ACGCAGATGGACGTGAACTGCTAAATAAGGAGTGGAGGTA\n'
b
diff -r 000000000000 -r 9c620a950d3a test-data/test_out.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_out.txt Thu Nov 22 04:16:35 2018 -0500
b
@@ -0,0 +1,14 @@
+GC_content 52.0
+len_N50 194780
+len_max 194780
+len_mean 194780
+len_median 194780
+len_min 194780
+num_A 46297
+num_C 50626
+num_G 50678
+num_N 0
+num_T 47179
+num_bp 194780
+num_bp_not_N 194780
+num_seq 1