annotate FastaStats.pl @ 0:163892325845 draft default tip

Initial commit.
author galaxyp
date Fri, 10 May 2013 17:15:08 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
163892325845 Initial commit.
galaxyp
parents:
diff changeset
1 #!/usr/bin/perl
163892325845 Initial commit.
galaxyp
parents:
diff changeset
2
163892325845 Initial commit.
galaxyp
parents:
diff changeset
3 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
4 # FastaStats.pl
163892325845 Initial commit.
galaxyp
parents:
diff changeset
5 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
6 # =====================================================
163892325845 Initial commit.
galaxyp
parents:
diff changeset
7 # $Id: FastaStats.pl 6 2010-05-27 15:52:50Z pieter $
163892325845 Initial commit.
galaxyp
parents:
diff changeset
8 # $URL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/FastaStats.pl $
163892325845 Initial commit.
galaxyp
parents:
diff changeset
9 # $LastChangedDate: 2010-05-27 10:52:50 -0500 (Thu, 27 May 2010) $
163892325845 Initial commit.
galaxyp
parents:
diff changeset
10 # $LastChangedRevision: 6 $
163892325845 Initial commit.
galaxyp
parents:
diff changeset
11 # $LastChangedBy: pieter $
163892325845 Initial commit.
galaxyp
parents:
diff changeset
12 # =====================================================
163892325845 Initial commit.
galaxyp
parents:
diff changeset
13 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
14 # Counts the amount of sequences in a FASTA file
163892325845 Initial commit.
galaxyp
parents:
diff changeset
15 # as well as the amount of nucleotides / amino acids
163892325845 Initial commit.
galaxyp
parents:
diff changeset
16 # and the sueqence composition.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
17 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
18
163892325845 Initial commit.
galaxyp
parents:
diff changeset
19 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
20 # Initialize evironment
163892325845 Initial commit.
galaxyp
parents:
diff changeset
21 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
22 use strict;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
23 use Getopt::Std;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
24 use Log::Log4perl qw(:easy);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
25
163892325845 Initial commit.
galaxyp
parents:
diff changeset
26 my %log_levels = (
163892325845 Initial commit.
galaxyp
parents:
diff changeset
27 'ALL' => $ALL,
163892325845 Initial commit.
galaxyp
parents:
diff changeset
28 'TRACE' => $TRACE,
163892325845 Initial commit.
galaxyp
parents:
diff changeset
29 'DEBUG' => $DEBUG,
163892325845 Initial commit.
galaxyp
parents:
diff changeset
30 'INFO' => $INFO,
163892325845 Initial commit.
galaxyp
parents:
diff changeset
31 'WARN' => $WARN,
163892325845 Initial commit.
galaxyp
parents:
diff changeset
32 'ERROR' => $ERROR,
163892325845 Initial commit.
galaxyp
parents:
diff changeset
33 'FATAL' => $FATAL,
163892325845 Initial commit.
galaxyp
parents:
diff changeset
34 'OFF' => $OFF,
163892325845 Initial commit.
galaxyp
parents:
diff changeset
35 );
163892325845 Initial commit.
galaxyp
parents:
diff changeset
36
163892325845 Initial commit.
galaxyp
parents:
diff changeset
37 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
38 # Get options.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
39 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
40 my %opts;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
41 Getopt::Std::getopts('pi:o:l:', \%opts);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
42
163892325845 Initial commit.
galaxyp
parents:
diff changeset
43 my $fasta_file = $opts{'i'};
163892325845 Initial commit.
galaxyp
parents:
diff changeset
44 my $stats_file = $opts{'o'};
163892325845 Initial commit.
galaxyp
parents:
diff changeset
45 my $log_level = $opts{'l'};
163892325845 Initial commit.
galaxyp
parents:
diff changeset
46 my $get_positional_composition = $opts{'p'};
163892325845 Initial commit.
galaxyp
parents:
diff changeset
47
163892325845 Initial commit.
galaxyp
parents:
diff changeset
48 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
49 # Configure logging.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
50 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
51 # Provides default if user did not specify log level:
163892325845 Initial commit.
galaxyp
parents:
diff changeset
52 $log_level = (defined($log_level) ? $log_level : 'INFO');
163892325845 Initial commit.
galaxyp
parents:
diff changeset
53 # Reset log level to default if user specified illegal log level.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
54 $log_level = (defined($log_levels{$log_level}) ? $log_levels{$log_level} : $log_levels{'INFO'});
163892325845 Initial commit.
galaxyp
parents:
diff changeset
55 #Log::Log4perl->init('log4perl.properties');
163892325845 Initial commit.
galaxyp
parents:
diff changeset
56 Log::Log4perl->easy_init(
163892325845 Initial commit.
galaxyp
parents:
diff changeset
57 #{ level => $log_level,
163892325845 Initial commit.
galaxyp
parents:
diff changeset
58 # file => ">>FastaStats.log",
163892325845 Initial commit.
galaxyp
parents:
diff changeset
59 # layout => '%F{1}-%L-%M: %m%n' },
163892325845 Initial commit.
galaxyp
parents:
diff changeset
60 { level => $log_level,
163892325845 Initial commit.
galaxyp
parents:
diff changeset
61 file => "STDOUT",
163892325845 Initial commit.
galaxyp
parents:
diff changeset
62 layout => '%d L:%L %p> %m%n' },
163892325845 Initial commit.
galaxyp
parents:
diff changeset
63 );
163892325845 Initial commit.
galaxyp
parents:
diff changeset
64 my $logger = Log::Log4perl::get_logger();
163892325845 Initial commit.
galaxyp
parents:
diff changeset
65
163892325845 Initial commit.
galaxyp
parents:
diff changeset
66 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
67 # Check options.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
68 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
69 if ($fasta_file =~ /^$/) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
70 _Usage();
163892325845 Initial commit.
galaxyp
parents:
diff changeset
71 exit;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
72 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
73 unless (-f $fasta_file && -r $fasta_file) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
74 _Usage();
163892325845 Initial commit.
galaxyp
parents:
diff changeset
75 exit;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
76 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
77
163892325845 Initial commit.
galaxyp
parents:
diff changeset
78 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
79 # Start job.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
80 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
81 $logger->info('Starting...');
163892325845 Initial commit.
galaxyp
parents:
diff changeset
82 $logger->info('Using FASTA file: '. $fasta_file);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
83
163892325845 Initial commit.
galaxyp
parents:
diff changeset
84 my $sequence_count = 0;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
85 my %acid_composition_total;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
86 my %acid_composition_positional;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
87 my $position_offset;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
88
163892325845 Initial commit.
galaxyp
parents:
diff changeset
89 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
90 # Create filehandles.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
91 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
92 my $fasta_fh;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
93 my $stats_fh;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
94
163892325845 Initial commit.
galaxyp
parents:
diff changeset
95 eval {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
96 open($fasta_fh, "<$fasta_file");
163892325845 Initial commit.
galaxyp
parents:
diff changeset
97 open($stats_fh, ">$stats_file");
163892325845 Initial commit.
galaxyp
parents:
diff changeset
98 };
163892325845 Initial commit.
galaxyp
parents:
diff changeset
99 if ($@) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
100 $logger->fatal('Cannot create filehandle: ' . $@);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
101 exit;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
102 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
103
163892325845 Initial commit.
galaxyp
parents:
diff changeset
104 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
105 # Parse FASTA file.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
106 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
107 while (my $line = <$fasta_fh>) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
108
163892325845 Initial commit.
galaxyp
parents:
diff changeset
109 if ($line =~ /^>/i) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
110
163892325845 Initial commit.
galaxyp
parents:
diff changeset
111 $sequence_count++;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
112 $position_offset = 0; # reset position.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
113
163892325845 Initial commit.
galaxyp
parents:
diff changeset
114 } else {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
115
163892325845 Initial commit.
galaxyp
parents:
diff changeset
116 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
117 # Ignore:
163892325845 Initial commit.
galaxyp
parents:
diff changeset
118 # * white space
163892325845 Initial commit.
galaxyp
parents:
diff changeset
119 # * end of line (EOL) characters
163892325845 Initial commit.
galaxyp
parents:
diff changeset
120 # * lower- and/or uppercase (repeat) masking.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
121 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
122 my $seq_line = $line;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
123 $seq_line =~ s/[\s\n\r]+//g;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
124 next if ($seq_line eq '');
163892325845 Initial commit.
galaxyp
parents:
diff changeset
125 $seq_line = uc($seq_line);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
126
163892325845 Initial commit.
galaxyp
parents:
diff changeset
127 if ($seq_line =~ m/^([a-zA-Z*-]+)$/) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
128
163892325845 Initial commit.
galaxyp
parents:
diff changeset
129 my $seq = $1;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
130 my @acids = split(//, $seq);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
131
163892325845 Initial commit.
galaxyp
parents:
diff changeset
132 foreach my $acid(@acids) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
133
163892325845 Initial commit.
galaxyp
parents:
diff changeset
134 $acid_composition_total{$acid}++;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
135
163892325845 Initial commit.
galaxyp
parents:
diff changeset
136 if ($get_positional_composition) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
137
163892325845 Initial commit.
galaxyp
parents:
diff changeset
138 $acid_composition_positional{$acid}[$position_offset]++;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
139 $position_offset++;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
140
163892325845 Initial commit.
galaxyp
parents:
diff changeset
141 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
142 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
143
163892325845 Initial commit.
galaxyp
parents:
diff changeset
144 } else {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
145
163892325845 Initial commit.
galaxyp
parents:
diff changeset
146 $logger->warn('Weird line in FASTA file: ' . $line);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
147 exit;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
148
163892325845 Initial commit.
galaxyp
parents:
diff changeset
149 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
150 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
151 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
152
163892325845 Initial commit.
galaxyp
parents:
diff changeset
153 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
154 # Save stats.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
155 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
156 print($stats_fh 'Sequences' . "\t" . $sequence_count . "\n");
163892325845 Initial commit.
galaxyp
parents:
diff changeset
157 print($stats_fh 'Total acid composition:' . "\n");
163892325845 Initial commit.
galaxyp
parents:
diff changeset
158 my $total_acid_count = 0;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
159 foreach my $acid (sort(keys(%acid_composition_total))) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
160 my $acid_count = $acid_composition_total{$acid};
163892325845 Initial commit.
galaxyp
parents:
diff changeset
161 print($stats_fh 'Acid ' . $acid . "\t" . $acid_count . "\n");
163892325845 Initial commit.
galaxyp
parents:
diff changeset
162 $total_acid_count += $acid_count;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
163 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
164 if ($get_positional_composition) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
165 print($stats_fh 'Positional acid composition:' . "\n");
163892325845 Initial commit.
galaxyp
parents:
diff changeset
166 foreach my $acid (sort(keys(%acid_composition_positional))) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
167 print($stats_fh 'Acid ' . $acid);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
168 for my $inx (1 .. scalar(@{$acid_composition_positional{$acid}})) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
169 my $acid_count;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
170 if (defined($acid_composition_positional{$acid}[$inx-1])) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
171 $acid_count = $acid_composition_positional{$acid}[$inx-1];
163892325845 Initial commit.
galaxyp
parents:
diff changeset
172 } else {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
173 $acid_count = 0;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
174 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
175 print($stats_fh "\t" . $acid_count);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
176 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
177 print($stats_fh "\n");
163892325845 Initial commit.
galaxyp
parents:
diff changeset
178 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
179 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
180
163892325845 Initial commit.
galaxyp
parents:
diff changeset
181 print($stats_fh 'Total acids' . "\t" . $total_acid_count . "\n");
163892325845 Initial commit.
galaxyp
parents:
diff changeset
182
163892325845 Initial commit.
galaxyp
parents:
diff changeset
183 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
184 # Close filehandles.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
185 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
186 close($fasta_fh);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
187 close($stats_fh);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
188
163892325845 Initial commit.
galaxyp
parents:
diff changeset
189 $logger->info('Found ' . $total_acid_count . ' nucleotide/amino acids in ' . $sequence_count . ' sequences.');
163892325845 Initial commit.
galaxyp
parents:
diff changeset
190 $logger->info('Finished!');
163892325845 Initial commit.
galaxyp
parents:
diff changeset
191
163892325845 Initial commit.
galaxyp
parents:
diff changeset
192 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
193 ##
163892325845 Initial commit.
galaxyp
parents:
diff changeset
194 ### Subs.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
195 ##
163892325845 Initial commit.
galaxyp
parents:
diff changeset
196 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
197
163892325845 Initial commit.
galaxyp
parents:
diff changeset
198 sub _Usage {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
199
163892325845 Initial commit.
galaxyp
parents:
diff changeset
200 print "\n";
163892325845 Initial commit.
galaxyp
parents:
diff changeset
201 print "FastaStats.pl - Reports statistics on a FASTA file like \n";
163892325845 Initial commit.
galaxyp
parents:
diff changeset
202 print " * The number of sequences\n";
163892325845 Initial commit.
galaxyp
parents:
diff changeset
203 print " * The total number of (nucleotide|amino) acids\n";
163892325845 Initial commit.
galaxyp
parents:
diff changeset
204 print " * Sequence composition per (nucleotide|amino) acid\n";
163892325845 Initial commit.
galaxyp
parents:
diff changeset
205 print "\n";
163892325845 Initial commit.
galaxyp
parents:
diff changeset
206 print "Usage:\n";
163892325845 Initial commit.
galaxyp
parents:
diff changeset
207 print "\n";
163892325845 Initial commit.
galaxyp
parents:
diff changeset
208 print " FastaStats.pl options\n";
163892325845 Initial commit.
galaxyp
parents:
diff changeset
209 print "\n";
163892325845 Initial commit.
galaxyp
parents:
diff changeset
210 print "Available options are:\n";
163892325845 Initial commit.
galaxyp
parents:
diff changeset
211 print "\n";
163892325845 Initial commit.
galaxyp
parents:
diff changeset
212 print " -i [file] Input FASTA file.\n";
163892325845 Initial commit.
galaxyp
parents:
diff changeset
213 print " -o [file] Output stats file.\n";
163892325845 Initial commit.
galaxyp
parents:
diff changeset
214 print " -p Add positional stats to the output.\n";
163892325845 Initial commit.
galaxyp
parents:
diff changeset
215 print " This will count the occurrance of each AA on each position of all sequences.\n";
163892325845 Initial commit.
galaxyp
parents:
diff changeset
216 print " -l [LEVEL] Log4perl log level. One of: ALL, TRACE, DEBUG, INFO (default), WARN, ERROR, FATAL or OFF.\n";
163892325845 Initial commit.
galaxyp
parents:
diff changeset
217 print "\n";
163892325845 Initial commit.
galaxyp
parents:
diff changeset
218 exit;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
219
163892325845 Initial commit.
galaxyp
parents:
diff changeset
220 }