comparison FastaStats.pl @ 0:163892325845 draft default tip

Initial commit.
author galaxyp
date Fri, 10 May 2013 17:15:08 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:163892325845
1 #!/usr/bin/perl
2
3 #
4 # FastaStats.pl
5 #
6 # =====================================================
7 # $Id: FastaStats.pl 6 2010-05-27 15:52:50Z pieter $
8 # $URL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/FastaStats.pl $
9 # $LastChangedDate: 2010-05-27 10:52:50 -0500 (Thu, 27 May 2010) $
10 # $LastChangedRevision: 6 $
11 # $LastChangedBy: pieter $
12 # =====================================================
13 #
14 # Counts the amount of sequences in a FASTA file
15 # as well as the amount of nucleotides / amino acids
16 # and the sueqence composition.
17 #
18
19 #
20 # Initialize evironment
21 #
22 use strict;
23 use Getopt::Std;
24 use Log::Log4perl qw(:easy);
25
26 my %log_levels = (
27 'ALL' => $ALL,
28 'TRACE' => $TRACE,
29 'DEBUG' => $DEBUG,
30 'INFO' => $INFO,
31 'WARN' => $WARN,
32 'ERROR' => $ERROR,
33 'FATAL' => $FATAL,
34 'OFF' => $OFF,
35 );
36
37 #
38 # Get options.
39 #
40 my %opts;
41 Getopt::Std::getopts('pi:o:l:', \%opts);
42
43 my $fasta_file = $opts{'i'};
44 my $stats_file = $opts{'o'};
45 my $log_level = $opts{'l'};
46 my $get_positional_composition = $opts{'p'};
47
48 #
49 # Configure logging.
50 #
51 # Provides default if user did not specify log level:
52 $log_level = (defined($log_level) ? $log_level : 'INFO');
53 # Reset log level to default if user specified illegal log level.
54 $log_level = (defined($log_levels{$log_level}) ? $log_levels{$log_level} : $log_levels{'INFO'});
55 #Log::Log4perl->init('log4perl.properties');
56 Log::Log4perl->easy_init(
57 #{ level => $log_level,
58 # file => ">>FastaStats.log",
59 # layout => '%F{1}-%L-%M: %m%n' },
60 { level => $log_level,
61 file => "STDOUT",
62 layout => '%d L:%L %p> %m%n' },
63 );
64 my $logger = Log::Log4perl::get_logger();
65
66 #
67 # Check options.
68 #
69 if ($fasta_file =~ /^$/) {
70 _Usage();
71 exit;
72 }
73 unless (-f $fasta_file && -r $fasta_file) {
74 _Usage();
75 exit;
76 }
77
78 #
79 # Start job.
80 #
81 $logger->info('Starting...');
82 $logger->info('Using FASTA file: '. $fasta_file);
83
84 my $sequence_count = 0;
85 my %acid_composition_total;
86 my %acid_composition_positional;
87 my $position_offset;
88
89 #
90 # Create filehandles.
91 #
92 my $fasta_fh;
93 my $stats_fh;
94
95 eval {
96 open($fasta_fh, "<$fasta_file");
97 open($stats_fh, ">$stats_file");
98 };
99 if ($@) {
100 $logger->fatal('Cannot create filehandle: ' . $@);
101 exit;
102 }
103
104 #
105 # Parse FASTA file.
106 #
107 while (my $line = <$fasta_fh>) {
108
109 if ($line =~ /^>/i) {
110
111 $sequence_count++;
112 $position_offset = 0; # reset position.
113
114 } else {
115
116 #
117 # Ignore:
118 # * white space
119 # * end of line (EOL) characters
120 # * lower- and/or uppercase (repeat) masking.
121 #
122 my $seq_line = $line;
123 $seq_line =~ s/[\s\n\r]+//g;
124 next if ($seq_line eq '');
125 $seq_line = uc($seq_line);
126
127 if ($seq_line =~ m/^([a-zA-Z*-]+)$/) {
128
129 my $seq = $1;
130 my @acids = split(//, $seq);
131
132 foreach my $acid(@acids) {
133
134 $acid_composition_total{$acid}++;
135
136 if ($get_positional_composition) {
137
138 $acid_composition_positional{$acid}[$position_offset]++;
139 $position_offset++;
140
141 }
142 }
143
144 } else {
145
146 $logger->warn('Weird line in FASTA file: ' . $line);
147 exit;
148
149 }
150 }
151 }
152
153 #
154 # Save stats.
155 #
156 print($stats_fh 'Sequences' . "\t" . $sequence_count . "\n");
157 print($stats_fh 'Total acid composition:' . "\n");
158 my $total_acid_count = 0;
159 foreach my $acid (sort(keys(%acid_composition_total))) {
160 my $acid_count = $acid_composition_total{$acid};
161 print($stats_fh 'Acid ' . $acid . "\t" . $acid_count . "\n");
162 $total_acid_count += $acid_count;
163 }
164 if ($get_positional_composition) {
165 print($stats_fh 'Positional acid composition:' . "\n");
166 foreach my $acid (sort(keys(%acid_composition_positional))) {
167 print($stats_fh 'Acid ' . $acid);
168 for my $inx (1 .. scalar(@{$acid_composition_positional{$acid}})) {
169 my $acid_count;
170 if (defined($acid_composition_positional{$acid}[$inx-1])) {
171 $acid_count = $acid_composition_positional{$acid}[$inx-1];
172 } else {
173 $acid_count = 0;
174 }
175 print($stats_fh "\t" . $acid_count);
176 }
177 print($stats_fh "\n");
178 }
179 }
180
181 print($stats_fh 'Total acids' . "\t" . $total_acid_count . "\n");
182
183 #
184 # Close filehandles.
185 #
186 close($fasta_fh);
187 close($stats_fh);
188
189 $logger->info('Found ' . $total_acid_count . ' nucleotide/amino acids in ' . $sequence_count . ' sequences.');
190 $logger->info('Finished!');
191
192 #
193 ##
194 ### Subs.
195 ##
196 #
197
198 sub _Usage {
199
200 print "\n";
201 print "FastaStats.pl - Reports statistics on a FASTA file like \n";
202 print " * The number of sequences\n";
203 print " * The total number of (nucleotide|amino) acids\n";
204 print " * Sequence composition per (nucleotide|amino) acid\n";
205 print "\n";
206 print "Usage:\n";
207 print "\n";
208 print " FastaStats.pl options\n";
209 print "\n";
210 print "Available options are:\n";
211 print "\n";
212 print " -i [file] Input FASTA file.\n";
213 print " -o [file] Output stats file.\n";
214 print " -p Add positional stats to the output.\n";
215 print " This will count the occurrance of each AA on each position of all sequences.\n";
216 print " -l [LEVEL] Log4perl log level. One of: ALL, TRACE, DEBUG, INFO (default), WARN, ERROR, FATAL or OFF.\n";
217 print "\n";
218 exit;
219
220 }