view FastaStats.pl @ 0:163892325845 draft default tip

Initial commit.
author galaxyp
date Fri, 10 May 2013 17:15:08 -0400
parents
children
line wrap: on
line source

#!/usr/bin/perl

#
# FastaStats.pl
#
# =====================================================
# $Id: FastaStats.pl 6 2010-05-27 15:52:50Z pieter $
# $URL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/FastaStats.pl $
# $LastChangedDate: 2010-05-27 10:52:50 -0500 (Thu, 27 May 2010) $ 
# $LastChangedRevision: 6 $
# $LastChangedBy: pieter $
# =====================================================
# 
# Counts the amount of sequences in a FASTA file 
# as well as the amount of nucleotides / amino acids 
# and the sueqence composition.
#

#
# Initialize evironment
#
use strict;
use Getopt::Std;
use Log::Log4perl qw(:easy);

my %log_levels = (
	'ALL'	=> $ALL,
	'TRACE'	=> $TRACE,
	'DEBUG'	=> $DEBUG,
	'INFO'	=> $INFO,
	'WARN'	=> $WARN,
	'ERROR'	=> $ERROR,
	'FATAL'	=> $FATAL,
	'OFF'	=> $OFF,
);

#
# Get options.
#
my %opts;
Getopt::Std::getopts('pi:o:l:', \%opts);

my $fasta_file					= $opts{'i'};
my $stats_file  				= $opts{'o'};
my $log_level					= $opts{'l'};
my $get_positional_composition	= $opts{'p'};

#
# Configure logging.
#
# Provides default if user did not specify log level:
$log_level = (defined($log_level) ? $log_level : 'INFO');
# Reset log level to default if user specified illegal log level.
$log_level = (defined($log_levels{$log_level}) ? $log_levels{$log_level} : $log_levels{'INFO'});
#Log::Log4perl->init('log4perl.properties');
Log::Log4perl->easy_init(
	#{ level    => $log_level,
	#  file     => ">>FastaStats.log",
	#  layout   => '%F{1}-%L-%M: %m%n' },
	{ level    => $log_level,
	  file     => "STDOUT",
	  layout   => '%d L:%L %p> %m%n' },
);
my $logger = Log::Log4perl::get_logger();

#
# Check options.
#
if ($fasta_file =~ /^$/) {
	_Usage();
	exit;
}
unless (-f $fasta_file && -r $fasta_file) {
	_Usage();
	exit;
}

#
# Start job.
#
$logger->info('Starting...');
$logger->info('Using FASTA file: '. $fasta_file);

my $sequence_count	= 0;
my %acid_composition_total;
my %acid_composition_positional;
my $position_offset;

#
# Create filehandles.
#
my $fasta_fh;
my $stats_fh;

eval {
	open($fasta_fh, "<$fasta_file");
	open($stats_fh, ">$stats_file");
};
if ($@) {
	$logger->fatal('Cannot create filehandle: ' . $@);
	exit;
}

#
# Parse FASTA file.
#
while (my $line = <$fasta_fh>) {

	if ($line =~ /^>/i) {
		
		$sequence_count++;
		$position_offset = 0; # reset position. 
		
	} else {
		
		#
		# Ignore:
		#	* white space
		#	* end of line (EOL) characters 
		#	* lower- and/or uppercase (repeat) masking.
		#
		my $seq_line = $line;
		$seq_line =~ s/[\s\n\r]+//g;
		next if ($seq_line eq '');
		$seq_line = uc($seq_line);
		
		if ($seq_line =~ m/^([a-zA-Z*-]+)$/) {

			my $seq = $1;
			my @acids = split(//, $seq);

			foreach my $acid(@acids) {
				
				$acid_composition_total{$acid}++;
				
				if ($get_positional_composition) {
					
					$acid_composition_positional{$acid}[$position_offset]++;
					$position_offset++;
										
				}
			}

		} else {

			$logger->warn('Weird line in FASTA file: ' . $line);
			exit;

		}
	}
}

#
# Save stats.
#
print($stats_fh 'Sequences'	. "\t" . $sequence_count	. "\n");
print($stats_fh 'Total acid composition:' . "\n");
my $total_acid_count = 0;
foreach my $acid (sort(keys(%acid_composition_total))) {
	my $acid_count = $acid_composition_total{$acid};
	print($stats_fh 'Acid ' . $acid		. "\t" . $acid_count	. "\n");
	$total_acid_count += $acid_count;
}
if ($get_positional_composition) {
print($stats_fh 'Positional acid composition:' . "\n");
	foreach my $acid (sort(keys(%acid_composition_positional))) { 
		print($stats_fh 'Acid ' . $acid);
		for my $inx (1 .. scalar(@{$acid_composition_positional{$acid}})) {
			my $acid_count;
			if (defined($acid_composition_positional{$acid}[$inx-1])) {
				$acid_count = $acid_composition_positional{$acid}[$inx-1];
			} else {
				$acid_count = 0;
			}
			print($stats_fh "\t" . $acid_count);
		}
		print($stats_fh "\n");
	}
}

print($stats_fh 'Total acids'		. "\t" . $total_acid_count	. "\n");

#
# Close filehandles.
#
close($fasta_fh);
close($stats_fh);

$logger->info('Found ' . $total_acid_count . ' nucleotide/amino acids in ' . $sequence_count . ' sequences.');
$logger->info('Finished!');

#
##
### Subs.
##
#

sub _Usage {

	print "\n";
	print "FastaStats.pl - Reports statistics on a FASTA file like \n";
	print "                  * The number of sequences\n";
	print "                  * The total number of (nucleotide|amino) acids\n";
	print "                  * Sequence composition per (nucleotide|amino) acid\n";
	print "\n";
	print "Usage:\n";
	print "\n";
	print "   FastaStats.pl options\n";
	print "\n";
	print "Available options are:\n";
	print "\n";
	print "   -i [file]          Input FASTA file.\n";
	print "   -o [file]          Output stats file.\n";
	print "   -p                 Add positional stats to the output.\n";
	print "                      This will count the occurrance of each AA on each position of all sequences.\n";
	print "   -l [LEVEL]         Log4perl log level. One of: ALL, TRACE, DEBUG, INFO (default), WARN, ERROR, FATAL or OFF.\n";
	print "\n";
	exit;

}