comparison fasta-stats.pl @ 1:20ca2574216a draft default tip

Uploaded
author simon-gladman
date Tue, 25 Jun 2013 01:51:40 -0400
parents
children
comparison
equal deleted inserted replaced
0:153f7a414921 1:20ca2574216a
1 #!/usr/bin/env perl
2
3 # fasta-stats
4 # written by torsten.seemann@monash.edu
5 # oct 2012
6
7 use strict;
8 use warnings;
9 use List::Util qw(sum min max);
10
11 # stat storage
12
13 my $n=0;
14 my $seq = '';
15 my %stat;
16 my @len;
17
18 # MAIN LOOP collecting sequences
19
20 while (my $line = <ARGV>) {
21 chomp $line;
22 if ($line =~ m/^\s*>/) {
23 process($seq) if $n;
24 $n++;
25 $seq='';
26 }
27 else {
28 $seq .= $line;
29 }
30 }
31
32 process($seq) if $n;
33
34 # sort length array
35 # (should use hash here for efficiency with huge no of short reads?)
36
37 @len = sort { $a <=> $b } @len;
38
39 # compute more stats
40
41 $stat{'num_seq'} = scalar(@len);
42
43 if (@len) {
44 $stat{'num_bp'} = sum(@len);
45 $stat{'len_min'} = $len[0];
46 $stat{'len_max'} = $len[-1];
47 $stat{'len_median'} = $len[int(@len/2)];
48 $stat{'len_mean'} = int( $stat{'num_bp'} / $stat{'num_seq'} );
49 # calculate n50
50
51 $stat{'len_N50'} = 0;
52 my $cum=0;
53 my $thresh = int 0.5 * $stat{'num_bp'};
54 for my $i (0 .. $#len) {
55 $cum += $len[$i];
56 if ($cum >= $thresh) {
57 $stat{'len_N50'} = $len[$i];
58 last;
59 }
60 }
61 }
62
63 #calculate GC content
64 $stat{'num_bp_not_N'} = $stat{'num_G'} + $stat{'num_C'} + $stat{'num_A'} + $stat{'num_T'};
65 $stat{'GC_content'} = ($stat{'num_G'} + $stat{'num_C'}) / $stat{'num_bp_not_N'}*100;
66
67 # print stats as .tsv
68
69 for my $name (sort keys %stat) {
70 if ($name =~ m/GC_content/){
71 printf "%s\t%0.1f\n", $name, $stat{$name};
72 } else {
73 printf "%s\t%s\n", $name, $stat{$name};
74 }
75 }
76
77 # run for each sequence
78
79 sub process {
80 my($s) = @_;
81 # base composition
82 for my $x (qw(A G T C N)) {
83 my $count = $s =~ s/$x/$x/gi;
84 $stat{"num_$x"} += $count;
85 }
86 # keep list of all lengths encountered
87 push @len, length($s);
88 }
89