Next changeset 1:16f1f3e2de42 (2021-04-21) |
Commit message:
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit d6a78405947a91659a4168ddb2f1534327f044cb |
added:
fasta-stats.pl fasta-stats.xml test-data/test.fasta test-data/test_out.txt |
b |
diff -r 000000000000 -r 9c620a950d3a fasta-stats.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fasta-stats.pl Thu Nov 22 04:16:35 2018 -0500 |
[ |
#!/usr/bin/env perl

# fasta-stats
# written by torsten.seemann@monash.edu
# oct 2012
#
# Reads FASTA records from the files named on the command line (or STDIN
# when none are given) and prints summary statistics to STDOUT as
# "name<TAB>value" lines, sorted by name.

use strict;
use warnings;
use List::Util qw(sum min max);

# stat storage

# Base counters start at zero so that empty input still prints numeric values
# instead of tripping "uninitialized value" warnings.
my %stat = map { ( "num_$_" => 0 ) } qw(A C G T N);

# One entry per sequence: its length in residues.
my @len;

# Run the program only when executed as a script; when this file is loaded
# from other code, only the subs below are defined.
main() unless caller;

# Entry point: collect sequences, derive the statistics, print them as .tsv.
sub main {
    my $in_record = 0;     # true once the first '>' header has been seen
    my $seq       = '';    # residues of the record currently being read

    # MAIN LOOP collecting sequences
    while ( my $line = <ARGV> ) {
        if ( $line =~ m/^\s*>/ ) {
            process($seq) if $in_record;
            $in_record = 1;
            $seq       = '';
        }
        else {
            # Remove the newline and any other whitespace (e.g. the "\r" of
            # CRLF files) so it is not counted as sequence.
            $line =~ s/\s+//g;
            $seq .= $line;
        }
    }
    process($seq) if $in_record;

    # compute more stats
    %stat = ( %stat, length_stats(@len) );

    # calculate GC content
    $stat{'num_bp_not_N'} = sum( @stat{qw(num_A num_C num_G num_T)} );
    $stat{'GC_content'}   = gc_percent( \%stat );

    # print stats as .tsv
    for my $name ( sort keys %stat ) {
        if ( $name =~ m/GC_content/ ) {
            printf "%s\t%0.1f\n", $name, $stat{$name};
        }
        else {
            printf "%s\t%s\n", $name, $stat{$name};
        }
    }
    return;
}

# run for each sequence
#
# Adds the base composition of $s to the global counters in %stat and
# records its length in @len.
sub process {
    my ($s) = @_;

    # base composition
    my %count = base_counts($s);
    $stat{"num_$_"} += $count{$_} for keys %count;

    # keep list of all lengths encountered
    push @len, length($s);
    return;
}

# Returns a hash (A, C, G, T, N => count) of the case-insensitive number of
# occurrences of each base in $s. tr/// with an empty replacement list only
# counts: it always yields a number and leaves $s unchanged.
sub base_counts {
    my ($s) = @_;
    return (
        A => ( $s =~ tr/Aa// ),
        C => ( $s =~ tr/Cc// ),
        G => ( $s =~ tr/Gg// ),
        T => ( $s =~ tr/Tt// ),
        N => ( $s =~ tr/Nn// ),
    );
}

# Returns the N50 of the given lengths: walking from the longest sequence
# down, the length of the sequence at which the running total first reaches
# half of all bases. Returns 0 for an empty list or when the total is 0.
sub n50 {
    my @descending = sort { $b <=> $a } @_;
    my $total = sum(@descending) or return 0;

    my $running = 0;
    for my $length (@descending) {
        $running += $length;

        # Compare doubled values rather than against int(total / 2) so an
        # odd total is not rounded down.
        return $length if 2 * $running >= $total;
    }
    return 0;
}

# Returns a hash of length statistics for the given sequence lengths:
# num_seq and num_bp always; len_min, len_max, len_median, len_mean and
# len_N50 only when at least one length was given.
sub length_stats {
    my @sorted = sort { $a <=> $b } @_;
    my %s = ( num_seq => scalar(@sorted), num_bp => 0 );
    return %s unless @sorted;

    $s{'num_bp'}     = sum(@sorted);
    $s{'len_min'}    = $sorted[0];
    $s{'len_max'}    = $sorted[-1];
    $s{'len_median'} = $sorted[ int( scalar(@sorted) / 2 ) ];    # upper middle for even counts
    $s{'len_mean'}   = int( $s{'num_bp'} / $s{'num_seq'} );
    $s{'len_N50'}    = n50(@sorted);
    return %s;
}

# Returns G+C as a percentage of A+C+G+T, read from the num_A / num_C /
# num_G / num_T entries of the hash reference $count. Returns 0 when there
# are no A/C/G/T bases at all, instead of dividing by zero.
sub gc_percent {
    my ($count) = @_;
    my $acgt = $count->{'num_A'} + $count->{'num_C'}
             + $count->{'num_G'} + $count->{'num_T'};
    return 0 unless $acgt;
    return ( $count->{'num_G'} + $count->{'num_C'} ) / $acgt * 100;
}

1;
b |
diff -r 000000000000 -r 9c620a950d3a fasta-stats.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fasta-stats.xml Thu Nov 22 04:16:35 2018 -0500 |
[ |
@@ -0,0 +1,55 @@ +<tool id="fasta-stats" name="Fasta Statistics" version="1.0.1"> + <description>Display summary statistics for a fasta file.</description> + <requirements> + <requirement type="package" version="5.26">perl</requirement> + </requirements> + <command detect_errors="exit_code"><![CDATA[ + perl '${__tool_directory__}/fasta-stats.pl' + '$dataset' + > '$stats' + ]]> + </command> + <inputs> + <param name="dataset" type="data" format="fasta" label="fasta or multifasta file" help="fasta dataset to get statistics for."/> + </inputs> + <outputs> + <data name="stats" format="tabular" label="${tool.name} on ${on_string}: Fasta summary stats"/> + </outputs> + <tests> + <test> + <param name="dataset" value="test.fasta"/> + <output name="stats" file="test_out.txt"/> + </test> + </tests> + <help> +**Fasta Stats** +Displays the summary statistics for a fasta file. + +------ + +Outputs in tabular form: + Lengths: n50, min, max, median and average + + Number of base pairs: A, C, G, T, N, Total and Total_not_N + + Number of sequences + + GC content in % + +------ + +Inputs: + +Fasta dataset + </help> + <citations> + <citation type="bibtex"> +@UNPUBLISHED{Seemann_Gladman2012, + author = {Torsten Seemann and Simon Gladman}, + title = {Fasta Statistics: Display summary statistics for a fasta file.}, + year = {2012}, + url = {https://github.com/galaxyproject/tools-iuc}, +} + </citation> + </citations> +</tool> |
b |
diff -r 000000000000 -r 9c620a950d3a test-data/test.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test.fasta Thu Nov 22 04:16:35 2018 -0500 |
b |
b'@@ -0,0 +1,2784 @@\n+>gi|529218539|ref|NC_021976.1| Acetobacter pasteurianus 386B plasmid Apa386Bp1, complete sequence\n+GAAGGAAGCTTGGATCAGCTTCGTGTTTCCCGTGTGCCCCATTTCATTATGCAGTTCCTGAAGAGATCAT\n+CGTGGTTAGGTTGAGTCTCCGATCCGCCATGAGGGAACGTGTAGAATGTAGCGAGTGTCATGAACCATGA\n+CGAACATCGGGAAGAGCGGTAACCGCCAAGGACTGATGAGGTGTGAAGCCAACAGGCTGGTACCATACAG\n+CACATTACGATGTAGACGCTCGTGATCCGGCGATGAGGATATCGACAAGGCGCTTTGCACTAGTTTCCCA\n+ACCGGGGCCAGCCGCATAGTTGGAAACACCAGCCAGAGCACGAAGTAGGTCGAGTGGTTCGATCTCTGTG\n+CTGATATCTCTACTGGCAACTGCGGCTGCTACAAGTTTCTCAATCGCGTTCTGTAGAACCGTGCCAGACT\n+GGGCATAAAGTGCGCTCGTTCCTCCCATCAGCCCATTGAGGGCGGGGGCAATTACCTGCTTGGCAGCAAT\n+ATAATCTACAAACAGTCGCATCCATGCACGCAGTGCCTCCACGGGCGGATGCTGTGTGGCAAGAACTGGT\n+GCCGCGTCGGCTAGGCGGTCCAGTTCAGCGCGATAGACCGCCTCGATCAGAGCATCGCGATTAGGAAAAT\n+GGCGGTAAAGTGTTCCAATTCCAACTGCCGCACGACGGGCAACTTCATCCAAAGGGATATCGATTTCCCC\n+CGCCGTGAACGCGGCTTTGGCCACTTTAAGCAGGTTTTCCCTGTTACGTTGTCCATCGCTGCGCGGTCGG\n+CGGCGCGGCTTTTTTTCGTCGGTCACCGGTTTTCCTCTTGTTAAACGGAGTATGTCTCCGCATATACAAG\n+ACGGAGCCATGCTCCATATAACACTCCGAATGCCGCTTCGGCAAAAAACGGATGATCCTTTCAAGGAGTT\n+CCACCCATGACACAGACTTTTGGCGCGCGTTCGACGACAGAAGATGTCCTTTCCAGCGTGTCCCTAAAGG\n+GCAAACGTGTTCTCGTAACTGGCGTTTCTGCTGGTCTGGGGGTTGAAACCGCGCGTGCCCTTGCGGCTCA\n+TGGTGCCCAGGTAGTGGGGGCCGCGCGTAACCTGACAAAAGCAGAACACGCCACCGGACAGGTCCGTGCA\n+GATGCTGAACGTGGGGGAGGGACGTTTGAACTGATCGCCCTAGACCTCGCGGATCTGACCAGTGTCCGCG\n+CCTGCGCTGACCAATTGAACGCAACCGGTTTGCCTTTCGATCTGGTTATTGCCAATGCGGGTGTGATGGC\n+TACACCATTTAGTCATACGAAGGACGGATTTGAGACGCAGTTCGGCACCAACCACCTAGGGCACTTTGTT\n+CTTGTCAACCAGATAGCGGGGCTGATGCGCCCCGGTGCCCGACTGGTCAATGTCTCCTCGGCCGGACATC\n+GCTTTGCAGATGTTGATCTGAAAGACCCGAATTTCGAGCATACACCATACGATCCATTCATCGCTTATGG\n+ACGTTCTAAAACAGCCAATATCCTTTTTGCCGTTGCGTTTGACGCACGACATCGTACACGTGGTGTGCGT\n+GCTACAGCAGTGCATCCGGGTGGGATCATGACCGAACTGCCGCGCTATATGCAGGCAGGTGCCATTGAGG\n+CAATGGTGGCGGGGATCAATGAACAGGCTGCTGCAGAAGGAAAGCCACCATTTCAGTTTAAGACCATTCC\n+ACAAGGTGCAGCCACATCGGTCTGGGCAGGCGTGGTGGCGGAAGCTGATGCGGTCGGTGGGCATTACAGC\n+GAAGACTGCCATGTAAGCCCGGTTATTCCTGATGATCAGCTTCTTAGCTTGG
TCAGTGAAGGTGTGCGGG\n+CCTACGCGGTTGATCCTGTGCATGCCGAGGCGCTGTGGGCAAAAAGTGAGGAAATGGTTGGAGAAACATT\n+TTCCTGAAACATCCACATCCTTGTTTCTGGCATAACGGATAAGCCCGACATCCAGACAGACATTTTGCCG\n+ATATGTGTAGCTTGAGCGCAGGGGACGGAAAACGTTTTTCCTGCGCTTTTCGTGCAAGGATATTGATTGT\n+AGTGCGCCTTGTTGTTTTTGGTTTGCCTCTTCTTGTTGTGATCCTGAATTGGCTGTGGCCATTACCGCTG\n+CCTTTGGGCCTGAAGATGTTTGCAGCCGTTCTGATGATCATGGCCGCACTTTACCACTACTGGTCTCGGC\n+TTTCTTCGGGGTCCGTTTTCACCCCGGAATTCCCCCGGCCTGTTATCATCCTGTTTAATTGGGCGTTCGG\n+GGCCATCCTGTTTCTTACGCTGCTCCAGATTGCGCTGGATTTCGGTGCACTACTGGTTGTGCTGATGACG\n+TGGCAGCCAGTGCATATTCCAGTAGGTGCGCGGGGGGCTGCTGGAGGTATTGCTGGGGTGCTGGCCGCAA\n+TTGGCGTTGCCAATGCGCTACGTGTGCCACCGGTCAGGAACGTGGCAGTGACCATTCCCGGCCTGTCGCC\n+AGCGTTTGACGGGTATCGACTGGTCCAATTGACCGATCTGCATATTAGTCGGCTGTTTCCCGCCAACTGG\n+GCAAGGGCAGTTGTGGACCGTACCAATGCGATCGGTGCTGATATGATTGTCGTGACGGGTGATTTCATTG\n+ACGGCTCAGTAGCCATGCGCCGCGCGGACGTGGCTCCACTTGCACAATTACACGCGCCAGACGGTGTTTT\n+GGCTATCCCGGGCAATCATGAATATTACTTTGATTATACCGACTGGATGCGTCATCTGAAGGAATTGGGC\n+TTTCACATGCTACTGAACCGTCACACAGTCATCTCAAGGGGAGGGGCTGAGTTGGTCATTGCAGGTGTTA\n+CTGATCGGTCCGCACCTAGGCATGGACAGGCTGGCCCCAATCTGGCCGCGGCTCTCACAGATAGTCCCGA\n+AGGGGCCCCGATTGTGCTGCTTGATCACCAACCCGGTGATGCGCGTGCGGCGGCTACCCAAGGTATTGCT\n+CTTCAGTTGTCGGGACATACACATGGCGGGATGATCCTCGGTCTCGACCGTCTTGTTGCGCGCGGGAATA\n+ACGGATTTGTATCCGGGCGCTACGATGTGAATGGCATGACGCTTTACGTCAATAACGGTACCGGGCTGTG\n+GCCAGGCTTTGCGCTTCGGCTGGGCATACCGCCTGAGATCACCTGCTTTAGTTTGCGTGCGGGATAATAC\n+TGGGGTTTTATCAAGTGCTTCTTATAGCACATCTCGCCGCAACCGTGTTCCTTCTCATTTCTTGAGTTAT\n+AGCTTCCTGTAATCACGTGGGCTAACACCTGTTTCCCTGCGAAACACCTGTGCAAAATGGCTTGGGCTCT\n+TATAACCAATGGTCAGGGCGATCTCGATGATGGATTGGTCGCTCTGGCACAAAAGTTCCTTCGCATGTTC\n+AACACGTCGCCGCACAAACCAGCGTGACGGGCTTTGCCCCATGGTGTTGTGAAATGATCGGCTGAAATGA\n+AACCGGCCCATCCCGCACAAATCGGCCAGAACATCCAGATCAAATGGTTCGGCGAGATGTGCCTCCATAT\n+GATCAAGTGCCTTGCGAAGTTTCCAAGCTGGGAGCTGTGCAGGTTTGGGCGTGCGTATCACATGTATATT\n+TGCATAATTGCGCAAAAGGTGGATGGTCAGACTTTCCAAAAGTCCGTTCACAAATAAAGGACTGGCCAAC\n+AGAGGCTTCTGTAGCTCAGCCGTCAGACCTGTCAGAACGCCAGATATAAATGTATCCTGAACACCGGAAA\n+TATCATGC
ATACGGAGCCGCAATGGGTTAAGCCCCAAAGATCGGGCAGCGCGGTTCACCAAAGT'..b'AAAGCGTGTTGGCATATATCTGTATTATTCGTTCCACTTATGTGAGTTTCAGATGCCCAAAAATTCGT\n+CCTCATTGCCAGAAGACCGTCTTCCGGTTGCTGGCCTTCTGGCGCTCGCAATGACAGGGTTCCTCTGCAT\n+CATGACGGAGACACTTCCTGCAGGACTGCTGCCGGAAATAAGTGCTGGTCTGCGGGTATCTCCCGCTTAT\n+GCAGGACAGATGGTCACCGCCTATGCTGTAGGTTCCCTAAGTGCCGCCATTCCACTCACACTTGCTACAC\n+AAAGATGGCGGCGACGTACCGTGCTGCTTCTGGCCATCATTGGTTTTCTACTGTTCAATGCAGTTACTGC\n+GCTTTCACCCAATTATTGGCTGACACTCGCAGTCCGTTACAGTGCGGGCGCAGCAGCGGGATTGGCATGG\n+GCACTTCTGGCGGGATATGCGCGGCGCATGGTAACACGTACCCAGCAGGGGCGAGCGCTGGCTATTGCCA\n+TGGTTGGAACGCCAATCGCCTTATCCCTTGGTGTTCCTGCCGGAACCTGGTTGGGAGCCATTTTCGGATG\n+GCGGTTGGCATTTGGCATGATGTCAGTCTTTACATTTTTGCTGATTGGCTGGGTGCTGGTGACGGTTCCG\n+GATTTTTCGGGTTCTTTGCATACACAACGGCCAGCATTCCGACATGTTCTAACGACATCTGGCGTGCGAC\n+CCGTTCTGGGAGTGGTGATCCTCTGGATGCTCGCACATAATATCCTGTATACTTACATTGTACCTTTTCT\n+GATACCGGCAGGGCTGGCGGGGAAGGCTGATCTTGTGTTGCTGGTTTTTGGTGTTGCCGCATTGGGTGGT\n+ATTGCGCTGACAGGGCAACTGGTGGATCATGCCCTGCGTGAGGCAGTACTGGTCAGTCTGGCCGTGTTTG\n+CGGTGGTCTGTCTCGCTTTTATCATTGATGTGCACTCTCCAGTGGTTATCTGGGGAGGTGTGAGTATCTG\n+GGGCCTGACTTTTGGCGGTGCCGCAACGTTATTGCAGACGGCGCTGGCTGATTCAGCACAAGAAGGCGCA\n+GATGTCGCATTGTCCATGAATGTTGTGGCATGGAACAGCGCTATTGCAGGAGGCGGTCTATTGGGAGGTC\n+TTCTCCTGAACCTGTGGGGTGCAGCCTCTTTCCCATGGGTTATGATGGGTCTTCTGATCTGCGGTTTTGG\n+GATCGTATGGCATGCTCGCTCACACGGGTTTCCGCAAGGTCATCGACTTTTGCAATAGCATCAGCAAACC\n+CTGTGAACTTCAGCAATGGGGTGTCCGGGAAAGTCATTGTATTGTCAGCGGATCAGTCATCAACAGGTCT\n+TTATCAGGAATAAAGCCTATTTCCTGATAGCCTGCGTCTAGCTGCCGTGTATGGTCCAATAGGTTCAGGT\n+GTAATCATGAGCACGATCAGGACATTCCCGGAACTGTTACCTGGCTTTCAGTGGCAGGATATTGAGGTGG\n+AAGGTGTGCGGATCCGCACGGCCACAGGCGGGAAGGGGCCACCAGTGCTTCTGCTGCATGGTCATCCGCA\n+AATGCACCTGACATGGCACAAGGTTGCTCCTACACTGGCAGAACATTTTACCGTTGTAGCGCCGGATCTG\n+CGTGGCTATGGCGATAGCGCCAAGCCTGAAGGCGGGGCAGGTCACGCCAATTATTCCAAACGTGCGATGG\n+CTGCCGATCAGGTTGGCGTCATGAAGGTACTCGGGTTTGAACGGTTCAGGGTGGTCGGGCATGACCGGGG\n+AGGGCGGGTTGCGCATCGCATGGCGCTGGATACACCACAGGTAGTGGAAAAGCTGGTGCTGATTGATATT\n+GCACCAACAGCAACAATGTATGCCCACACCAATATGGAGTTTG
CTCGACGCTATTTCTGGTGGTTTTTTC\n+TGATCCAGCCCTATCCTCTGCCAGAAAAGCTGATTGATGGCGACCCAGATTTCTTTCTGGAAAACCATAT\n+TGCAGGACAGATCAAAATCCCGAGGTCAGTCGATCCCCGCGTAATGGCTGAATACCGCCGCTGCTATGCT\n+GACCCGACAATGCGCCATGCAGCTTGTGAGGATTATCGGGCTGCGGCTGGTATAGACCTTGAGCATGATG\n+CAGCCGATGCCAACAAACGGGTTACAGCGCCGTTACTGGCATTATGGGGTGCACGCGGCACAGTGGGCGC\n+ACTTTATGATGTTGTGGAGACATGGCGCAAAAAAGCGACTGACGTGCAGGGCCACGCAATTGATTGCGGG\n+CATAGTCCACAGGAAGAAGCACCTGAAGAATTTCTGCATCAGCTTAAAGAGTTTCTTTGAAATGGATTCC\n+AAAATGTACAGTCTTTTGGCCAGATGAGTTCGGCATACGAATAATAGGGATTTTTATCTCTGCACATTAG\n+GGCCCGTGTAAAGAAAACGTACTGAAATATGATGGAAACACTATGGCGACAGACCTTCTATCCCAATAAT\n+CCCGCATTGTCGTAATTCTCAATAATCTGGTCAAAAAGGCTGACATTCGAGCGGCTTCTTGCCATGAGCT\n+GTGCTCCAGATACTGCAGCAAAGATGGCTCGTGCACGTTCTATGTTTTGGGATGATGCCGGGTTTATCCT\n+AGAGATGATCGTAGTCAGCCAGGTCACATTGATATCGGTAAATGTCTGTATTTCGTGTTTTACAGCATCA\n+GGCAGATCATCATATTCAGCACTCATGAAACTCCCTAGACACAGACGATTACCGTTTTCGAGTGAACGAC\n+GAAAGATTTCCGGATAACGACGCAAGCATGTCAGAGAATCTGGAGTTTCCATCTGCATACTTTCCAGATT\n+AGCCTTGGTATCTTCCCAATAGCGTCGCGCAACTGCCATTCCAAGATCGGCTTTACTGGGAAAGTGATAA\n+TAAATGCTTGCAGGCTTAATTCCGACTTCCTTGGCAAGTTCACGGAAGTTCAGGCCGTTGTAACCACGGG\n+TCTGTGCAGCCTTTCTGGCTGCTGCCAAGATAGCTTCTCTGGAACTCTCATGCGTTATGTTCATTGTATT\n+GCGACTGTCTCCCTATGTGTTGGTAGGTAAGCGTTGACGCCCTGAAAGGGAAGGCATAACAAAATATACC\n+TACCAAGTGGTAGGTAGGCACATGACAGGCGGTCCAAAACAGTTGTTTTCATTAGGGGTGAGGCAATTCT\n+TATGAAAATCTATGATTGGCCTACAGGGCCTTATCCGGCACGTGTGCGGATTGCTCTGGCGGAAAAGCAG\n+ATGCTGTCCAGGATAAAGTTTGTCCAGATCAATCTGTGGAAAGGCGAACATAAGACAGCGGACTTCCGAA\n+CCAAGAACTATTCAGGCACGGTGCCGGTTCTCGAACTTGATGACGGAACCTTTCTTGCCGAATGCACTGC\n+GATCACAGTTTATCTTGATACGCTTGACGGGATTCCAACATTGACTGGCAGAACGCCACGTGAAAAGGGC\n+GTGATCCAGATGATGACCAAACGCGCTGAAATCGAAATGCTCGATGCTATCAGCATTTATTTCCATCATG\n+CGACGCCAGGCCTTGGCCCTAAAGTAGAGCTTTACCAGAACAGGGAATGGGGCCTTTACCAGCGCGATAA\n+GGCTTTACGCGGCATGCATTACTTTGATGCTCTCCTTAAAAAACAGCCTTTTATCGCAGGGGAGAATTTT\n+TCTATGGCAGATATTGCCGTGATTGGAGGCTTTATATTCGCCGCAGTCGTGAAACTGCCTATTCCTCACG\n+AGTGCAGTGCCCTTCTGGCGTGGTATGCAAGAATGCAGGAACGTCCAAGCGTTCGGGATCAGCTGGCAAC\n
+TGTCTCGCCATAACATTGAAACTCGTTGTGCTAAATGTAGCACGCAGCTTTTTTATGACTCTTAATGTGC\n+ACGCAGATGGACGTGAACTGCTAAATAAGGAGTGGAGGTA\n' |
b |
diff -r 000000000000 -r 9c620a950d3a test-data/test_out.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_out.txt Thu Nov 22 04:16:35 2018 -0500 |
b |
@@ -0,0 +1,14 @@ +GC_content 52.0 +len_N50 194780 +len_max 194780 +len_mean 194780 +len_median 194780 +len_min 194780 +num_A 46297 +num_C 50626 +num_G 50678 +num_N 0 +num_T 47179 +num_bp 194780 +num_bp_not_N 194780 +num_seq 1 |