Mercurial > repos > galaxyp > nbic_fasta

diff ProteinDigestor.pl @ 0:163892325845 draft default tip
Initial commit.
author: galaxyp
date: Fri, 10 May 2013 17:15:08 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ProteinDigestor.pl	Fri May 10 17:15:08 2013 -0400
@@ -0,0 +1,820 @@
+#!/usr/bin/perl -w
+
+###############################################################
+# ProteinDigestor.pl
+# 
+# =====================================================
+# $Id: ProteinDigestor.pl 76 2011-01-04 13:16:56Z pieter.neerincx@gmail.com $
+# $URL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/ProteinDigestor.pl $
+# $LastChangedDate: 2011-01-04 07:16:56 -0600 (Tue, 04 Jan 2011) $
+# $LastChangedRevision: 76 $
+# $LastChangedBy: pieter.neerincx@gmail.com $
+# =====================================================
+# 
+# (c) Dr. Ir. B. van Breukelen et al.
+# https://bioinformatics.chem.uu.nl
+# http://www.netherlandsproteomicscentre.eu/ 
+# b.vanbreukelen@pharm.uu.nl
+# 
+# Software to create peptide databases from a fasta
+# file of proteins. You can cut with several enzymes
+# select on size, PI, etc etc etc.
+#
+# TODO: put filters, enzymes, in seperate config file
+###############################################################
+
+#############################
+# Initialise environment
+#############################
+use strict;
+use Getopt::Std;
+use Log::Log4perl qw(:easy);
+
+#
+# Log levels for Log4perl.
+#
+my %log_levels = (
+	'ALL'   => $ALL,
+	'TRACE' => $TRACE,
+	'DEBUG' => $DEBUG,
+	'INFO'  => $INFO,
+	'WARN'  => $WARN,
+	'ERROR' => $ERROR,
+	'FATAL' => $FATAL,
+	'OFF'   => $OFF,
+);
+
+#
+# pK values array
+#
+my %cPk = (
+	'A'=>[3.55, 7.59, 0    , 0    , 0],
+	'B'=>[3.55, 7.50, 0    , 0    , 0],
+	'C'=>[3.55, 7.50, 9.00 , 9.00 , 9.00], 
+	'D'=>[4.55, 7.50, 4.05 , 4.05 , 4.05],
+	'E'=>[4.75, 7.70, 4.45 , 4.45 , 4.45],
+	'F'=>[3.55, 7.50, 0    , 0.   , 0],
+	'G'=>[3.55, 7.50, 0    , 0.   , 0],
+	'H'=>[3.55, 7.50, 5.98 , 5.98 , 5.98],
+ 	'I'=>[3.55, 7.50, 0    , 0.   , 0.],
+	'J'=>[0.00, 0.00, 0    , 0.   , 0.],
+	'K'=>[3.55, 7.50, 10.00, 10.00, 10.00],
+ 	'L'=>[3.55, 7.50, 0    , 0.   , 0.],
+ 	'M'=>[3.55, 7.00, 0    , 0.   , 0.],
+	'N'=>[3.55, 7.50, 0    , 0.   , 0.],
+	'O'=>[0.00, 0.00, 0    , 0.   , 0.],
+	'P'=>[3.55, 8.36, 0    , 0.   , 0.],
+	'Q'=>[3.55, 7.50, 0    , 0.   , 0.],
+	'R'=>[3.55, 7.50, 12.0 , 12.0 , 12.0],
+	'S'=>[3.55, 6.93, 0    , 0.   , 0.],
+	'T'=>[3.55, 6.82, 0    , 0.   , 0.],
+	'U'=>[0.00, 0.00, 0    , 0.   , 0.],
+	'V'=>[3.55, 7.44, 0    , 0.   , 0.],
+	'W'=>[3.55, 7.50, 0    , 0.   , 0.],
+	'X'=>[3.55, 7.50, 0    , 0.   , 0.],
+	'Y'=>[3.55, 7.50, 10.00, 10.00, 10.00],
+	'Z'=>[3.55, 7.50, 0    , 0.   , 0.]
+);
+		
+#
+# MW values array
+#
+my %cMWs = (
+	'A'=>89.09,
+	'B'=>132.65,
+	'C'=>121.15,
+	'D'=>133.1,
+	'E'=>147.13,
+	'F'=>165.19,
+	'G'=>75.07,
+	'H'=>155.16,
+	'I'=>131.18,
+	'J'=>0.00,
+	'K'=>146.19,
+	'L'=>131.18,
+	'M'=>149.22,
+	'N'=>132.12, 
+	'O'=>0.00,
+	'P'=>115.13,
+	'Q'=>146.15,
+	'R'=>174.21,
+	'S'=>105.09,
+	'T'=>119.12,
+	'U'=>168.05,
+	'V'=>117.15,
+	'W'=>204.22,
+	'X'=>129.26,
+	'Y'=>181.19,
+	'Z'=>146.73 
+);
+
+#  			
+# Digestion enzymes.
+#
+# For example Trypsin cuts c-terminal of K and R not followed by a P at position p1 = c:K,R:P=1
+#
+my %enzymes = (
+	'Trypsin'			=> 'c:K,R:P=1',
+	'Trypsin/P'			=> 'c:K,R',
+	'Chymotrypsin'		=> 'c:F,L,W,Y:P=1',
+	'Chymotrypsin/P'	=> 'c:F,L,W,Y',
+	'V8_E'				=> 'c:E,Z:P=1',
+	'V8_DE'				=> 'c:E,D,B,Z:P=1',
+	'LysC'				=> 'c:K:P=1',
+	'LysC/P'			=> 'c:K',
+	'LysN/P'			=> 'n:K'
+);
+
+my @filters = (
+#	"[R|K][R|K][R|K]..[S|T]."
+#	"[R|K][R|K][R|K].[S|T]." ,
+#	"[R|K][R|K]..[S|T].",
+#	"[R|K]..[S|T].",
+#	"[R|K][R|K].[S|T].",
+#	"[R|K].[S|T]."
+#	"[S|T].[R|K]"
+);
+		
+my %charges = ('A'=>0,	'B'=>0,	'C'=>0, 'D'=>-1, 'E'=>-1, 'F'=>0,'G'=>0, 'H'=>1, 'I'=>0,
+				'J'=>0, 'K'=>1,	'L'=>0, 'M'=>0,	'N'=>0, 'O'=>0,'P'=>0, 'Q'=>0, 'R'=>1,
+				'S'=>0, 'T'=>0,	'U'=>0, 'V'=>0,	'W'=>0, 'X'=>0,'Y'=>0, 'Z'=>0);
+					
+my %pept = ();	 #hashed array of peptides (HASH) and fasta headers (VALUE)		
+
+my $H20 = 18.015; #Mol. weight water
+
+my $PH_MIN = 0.0; #min pH value
+my $PH_MAX = 14.0; #max pH value
+my $MAXLOOP = 2000; # max number iterations
+my $EPSI = 0.0001; # desired presision  			
+
+#
+# Get options.
+#
+my %opts;
+Getopt::Std::getopts('i:o:e:r:p:m:n:c:l:sa', \%opts);
+
+my $input						= $opts{'i'};
+my $output						= $opts{'o'};
+my $enzyme						= $opts{'e'};
+my $enzyme_cleavage_site_rules	= $opts{'r'};
+my $partial_cleavage			= $opts{'s'};
+my $partial_cleavage_chance		= $opts{'p'};
+my $min_peptide_weight			= $opts{'m'};
+my $max_peptide_weight			= $opts{'n'};
+my $max_charge					= $opts{'c'};
+my $add_sequence_context		= $opts{'a'};
+my $log_level					= $opts{'l'};
+
+#
+# Configure logging.
+#
+# Provides default if user did not specify log level:
+$log_level = (defined($log_level) ? $log_level : 'INFO');
+
+# Reset log level to default if user specified illegal log level.
+$log_level = (
+	defined($log_levels{$log_level})
+	? $log_levels{$log_level}
+	: $log_levels{'INFO'});
+
+#Log::Log4perl->init('log4perl.properties');
+Log::Log4perl->easy_init(
+
+	#{ level    => $log_level,
+	#  file     => ">>ProteinDigestor.log",
+	#  layout   => '%F{1}-%L-%M: %m%n' },
+	{
+		level  => $log_level,
+		file   => "STDOUT",
+		layout => '%d L:%L %p> %m%n'
+	},
+);
+my $logger = Log::Log4perl::get_logger();
+
+#
+# Check user input.
+#
+unless (defined($input) && defined($output)) {
+	_Usage();
+	exit;
+}
+if ($input =~ /^$/ || $output =~ /^$/) {
+	_Usage();
+	exit;
+}
+if ($input eq $output) {
+	$logger->fatal('Output file is the same as the input file. Select a different file for the output.');
+	exit;
+}
+
+#
+# Check input file.
+#
+unless (-e $input && -f $input && -r $input) {
+	$logger->fatal('Cannot read from input file ' . $input . ': ' . $!);
+	exit;
+}
+
+#
+# Set additional defaults.
+#
+$partial_cleavage			= (defined($partial_cleavage)			? $partial_cleavage			: 0);			# Disabled by default
+$partial_cleavage_chance	= (defined($partial_cleavage_chance)	? $partial_cleavage_chance	: 0.1);			# 10% change by default
+$add_sequence_context		= (defined($add_sequence_context)		? $add_sequence_context		: 0);			# Disabled by default
+
+########################################################
+# MAIN PROGRAM
+########################################################
+
+$logger->info('Starting...');
+
+#
+# Get protease cleavage site specs.
+#
+unless (defined($enzyme_cleavage_site_rules)) {
+	
+	#
+	# Set default to Trypsin if the user did not specify an enzyme name nor a string with enzyme cleavage site rules.
+	#
+	$enzyme						= (defined($enzyme)						? $enzyme					: 'Trypsin');
+	
+	#
+	# Check for illegal enzymes.
+	#
+	unless (defined($enzymes{$enzyme})) {
+		$logger->fatal('Unkown enzyme ' . $enzyme . '.');
+		exit;
+	}
+	
+	#
+	# Fetch enzyme cleavage site rules.
+	#
+	$enzyme_cleavage_site_rules = $enzymes{$enzyme};
+	
+}
+
+$logger->info('Using enzyme cleavage site rules: ' . $enzyme_cleavage_site_rules);
+my @split_params = split(/:/, $enzyme_cleavage_site_rules);
+my $cut_term = $split_params[0];
+my @cut_aa = split(/,/, $split_params[1]);
+my @inhibition_aa;
+if (defined($split_params[2])) {
+	@inhibition_aa = split(/,/, $split_params[2]);
+}
+my $annotation = {};
+my $sequence = '';
+
+#
+# Create filehandles.
+#
+open(my $input_fh,   "$input");
+open(my $output_fh, ">$output");
+
+#
+# Write header to result file.
+#
+print($output_fh "Protein ID\tPeptide\tMW\tpI\tCharge\tnumber S\tnumber T\tnumber Y\n");
+
+#
+# Digest fasta files and store peptides in an array (hashed)
+#
+foreach my $line (<$input_fh>){
+	
+	if ($line =~ /^>/){
+		
+		#
+		# We found a new FASTA header.
+		#
+		
+		#
+		# Process the previously found protein if we already found one.
+		#
+		unless ($sequence eq '') {
+			_ProcessProtein();
+		}
+		
+		#
+		# Parse new FASTA header.
+		#
+		$line =~ s/[\r\n\f]+//g;
+		$annotation = _ParseHeader($line);
+		$sequence = '';
+		
+	} else {
+		
+		#
+		# Found a sequence line.
+		#
+		$line =~ s/[\t\s\r\n\f0-9\*\-]+//g;
+		$sequence .= $line;
+	}
+}
+
+#
+# Don't foget to process the last sequence!
+#
+_ProcessProtein();
+
+close($input_fh);
+close($output_fh);
+
+$logger->info('Finished!');
+
+#########################################################
+# SUBS
+#########################################################
+
+sub _ProcessProtein {
+	
+	my $id = ${$annotation}{'ID'};
+	$logger->debug('Accession/ID: ' . $id);
+	$logger->debug('Sequence: ' . $sequence . "\n");
+	my $total_length_sequence = length($sequence);
+	my $total_peptide_length = 0;
+	
+	my ($peptides) = _Digest($sequence, $annotation, \@cut_aa, $cut_term, \@inhibition_aa);
+	
+	#
+	# Process peptides.
+	#
+	foreach my $peptide_with_context (sort(keys(%{$peptides}))) {
+		
+		#
+		# Get peptide properties.
+		#
+		$logger->debug('Peptide with context: ' . $peptide_with_context);
+		my ($bare_peptide) = _ParsePeptide($peptide_with_context);
+		my $peplength = length($bare_peptide);				
+		my ($pepweight) = _MwCalc($bare_peptide);
+		
+		#
+		# Apply filters
+		#
+		# Filter 1 is on weight.
+		#
+		if (defined($min_peptide_weight)) {
+			
+			$logger->debug('Minimum weight filter:');
+			$logger->debug("\t" . 'Peptide weight: ' . $pepweight);
+			
+			if ($pepweight >= $min_peptide_weight) {
+				
+				$logger->debug("\t" . 'PASSED   - Weight is >= ' . $min_peptide_weight);
+				
+			} else {
+				
+				# Skip this one.
+				$logger->debug("\t" . 'REJECTED - Weight is < ' . $min_peptide_weight);
+				next;
+				
+			}
+		}
+		
+		if (defined($max_peptide_weight)) {
+			
+			$logger->debug('Maximum weight filter:');
+			$logger->debug("\t" . 'Peptide weight: ' . $pepweight);
+			
+			if ($pepweight <= $max_peptide_weight) {
+				
+				$logger->debug("\t" . 'PASSED   - Weight is <= ' . $max_peptide_weight);
+				
+			} else {
+				
+				# Skip this one.
+				$logger->debug("\t" . 'REJECTED - Weight is > ' . $max_peptide_weight);
+				next;
+				
+			}
+		}
+					
+		#
+		# Filter 2 is on sequence motifs.
+		#
+		if (scalar(@filters) > 0) {
+			
+			$logger->debug('Motif filter:');
+			my ($motif) = _ContainsMotif($bare_peptide, \@filters);
+			
+			if ($motif > 0) {
+				
+				$logger->debug("\t" . 'PASSED   - Peptide contains motif ' . $motif);
+				
+			} else {
+				
+				# Skip this one.
+				$logger->debug("\t" . 'REJECTED - Peptide does not contain any motif.');
+				next;
+				
+			}
+		}
+		
+		my ($pepPI)		= _PiCalc($bare_peptide);
+		my ($pepCharge)	= _ChargeCalc($bare_peptide);
+			
+		#
+		# Filter 3 is on the amount of charges.
+		#
+		if (defined($max_charge)) {
+			
+			$logger->debug('Charge filter:');
+			$logger->debug("\t" . 'Peptide charge: ' . $pepCharge);
+			
+			if (defined($max_charge) && $pepCharge <= $max_charge) {
+					
+				$logger->debug("\t" . 'PASSED   - Charge <= ' . $max_charge);
+				
+			} else {
+				
+				# Skip this one.
+				$logger->debug("\t" . 'REJECTED - Charge > ' . $max_charge);
+				next;
+				
+			}
+		}
+		
+		#
+		# If there is no partial digestion we can calculated the % coverage.
+		#
+		unless ($partial_cleavage) {
+			$total_peptide_length += length($bare_peptide);
+		}
+		
+		my ($residue_count) = _CountResiduesSTY($bare_peptide);
+		
+		#
+		# Store peptide.
+		#
+		my $pep_sequence;
+		
+		if ($add_sequence_context) {
+			$pep_sequence = $peptide_with_context;
+		} else {
+			$pep_sequence = $bare_peptide;
+		}
+		
+		print($output_fh $id . "\t" . $pep_sequence . "\t" . $pepweight . "\t" . $pepPI. "\t" . $pepCharge. "\t" . $residue_count . "\n");
+		
+	}
+	
+	#
+	# If partial digestion was not enabled, we can calculated the % coverage.
+	#
+	unless ($partial_cleavage) {
+			my $coverage = ($total_peptide_length / $total_length_sequence)*100;
+			$logger->debug("Sequence residues: $total_length_sequence | #peptide residues: $total_peptide_length | %Coverage: $coverage");            
+	}
+}
+
+sub _ParseHeader {
+	
+	my ($header) = @_;
+	my %annotation;
+	my $ids;
+	my $id;
+	my $description;
+	
+	$header=~ s/^>//;
+	
+	if ($header =~ m/([^\s]+)\s+([^\s]+.*)/) {
+		$ids = $1;
+		$description = $2;
+	} else {
+		$ids = $header;
+	}
+	
+	#
+	# Redimentory support for detecting multiple IDs: check for pipe sperated IDs.
+	#
+	if ($ids =~ m/([^\|])\|/) {
+		$id = $1;
+	} else {
+		$id = $ids;
+	}
+	
+	$annotation{'ID'} = $id;
+	$annotation{'DE'} = $description;
+	
+	return (\%annotation);
+	
+}
+
+sub _ContainsMotif{
+	
+	my ($peptide, $filters) = @_;
+	my $cnt = 0;
+	
+	foreach my $filter (@{$filters}){
+		
+		$cnt++;
+		if ($peptide =~ /$filter/){
+			return($cnt);
+		}	
+	
+	}
+	
+	return(0);
+
+}
+
+sub _ParsePeptide{
+	
+	my ($peptide_with_flanking_context) = @_;
+	
+	my @pep = split(/\./, $peptide_with_flanking_context);
+	my $bare_peptide = uc($pep[1]);
+	
+	return($bare_peptide);	
+	
+}
+
+sub _Digest{
+	
+	my ($sequence, $annotation, $enz_cutting, $cut_term, $enz_restrict) = @_;
+	
+	$sequence = uc($sequence);
+	my %pept = ();
+
+	# we start by scanning the sequence.. to find one of the cutting rulez not followed by a restict rule
+	# what is the sequence length
+	my @array = split(//, $sequence);
+	my $length = scalar(@array);
+	my $start_offset = 0;
+	my $offset_adjust_for_cleavage_terminus = 0;
+	if ($cut_term eq 'n') {
+		$offset_adjust_for_cleavage_terminus = 1;
+	}
+	
+	my $peptide = '';
+	
+	my $counter = 1;
+	if (!($partial_cleavage)) {
+		$partial_cleavage_chance = 0;
+	} else {
+		$counter = 100;
+	}
+	
+	#this is for incomplete cleavage... we repeat digest 100 times with a probability that the enzyme will not cut
+	for (my $cnt = 0; $cnt < $counter; $cnt++) {
+		
+		AA:for (my $aa_offset = 0 + $offset_adjust_for_cleavage_terminus; $aa_offset < ($length - 1 + $offset_adjust_for_cleavage_terminus); $aa_offset++) {
+			
+			my $this_aa = substr($sequence, $aa_offset, 1);
+			
+			#
+			# Check if protease recognizes this amino acid as cleavage site.
+			#
+			foreach my $cut_aa (@{$enz_cutting}) {
+				
+				if ($this_aa eq $cut_aa) {
+									
+					#
+					# Check if AA is not preceeded or followed by an AA that inhibits the protease... 
+					#
+					foreach my $inhibit (@{$enz_restrict}) {
+						 
+
+						my @inhibit_conditions = split(/=/, $inhibit);
+						my $inhibit_aa = $inhibit_conditions[0];
+						my $position_to_check = $inhibit_conditions[1];
+						
+						my $neighborhood_aa = substr($sequence, $aa_offset + $position_to_check, 1);
+												
+						if ($neighborhood_aa eq $inhibit_aa){
+							$logger->debug('No cleavage due to inhibition by ' . $inhibit_aa . ' at position ' . $position_to_check);
+							next AA;
+						}
+					}
+					
+					#
+					# Consider the possibility of not cutting when doing incomplete digestions!
+					#
+					my $random = 1;
+					if ($partial_cleavage){
+						$random = rand(1);
+					}
+					if ($random <= $partial_cleavage_chance){
+						$logger->debug('No cleavage due to incomplete digestion.');
+						next AA;
+					}
+					
+					#
+					# If no inhibition spoiled the cleavage, lets cut! 
+					#
+					my $cut_offset;
+					if ($cut_term eq 'c') {
+						$cut_offset = $aa_offset + 1;
+					} elsif ($cut_term eq 'n') {
+						$cut_offset = $aa_offset;
+					}
+					
+					# nice formatting of peptide
+					if ($start_offset == 0){
+						$peptide .="terminus.";
+						$peptide .= substr($sequence, $start_offset, $cut_offset - $start_offset).".";
+					}else{
+						$peptide .= substr($sequence, $start_offset - 3, 3).".";
+						$peptide .= substr($sequence, $start_offset, $cut_offset - $start_offset).".";
+					}
+					$peptide .= substr($sequence, $cut_offset, 3);
+					$start_offset = $cut_offset;
+					
+					# check if peptide already exists
+					# if so, concatenate this fasta header 
+					#if (exists($pept{$peptide})){
+					#	$fasta_header .= $pept{$peptide};
+					#}
+					$pept{$peptide} = ${$annotation}{'ID'};
+					$peptide = '';
+				}
+			}
+		}
+		
+		#
+		# The last peptide sequence !
+		#
+		$peptide = substr($sequence, $start_offset - 3, 3).".";
+		$peptide .= substr($sequence, $start_offset, $length);
+		$peptide .= ".terminus";
+		$pept{$peptide} = ${$annotation}{'ID'};
+		
+	} #COUNT
+	
+	return(\%pept);
+	
+}
+
+sub _PiCalc{
+	
+	my ($sequence) = @_;
+	
+	my $nTermResidue;
+	my $cTermResidue;
+	my $phMin;my $phMid ;my $phMax;
+	my $charge;
+	my $cter; my $nter; my $carg; my $chis; my $clys;my $casp;my $cglu;my $ccys;my $ctyr;
+	
+	my $composition = _GetComposition($sequence);
+	my $strlength = length($sequence);
+	$nTermResidue = substr($sequence, 0, 1);
+	for ($nTermResidue){s/[\s\n\r\t]//g;};
+	$cTermResidue = substr($sequence, $strlength - 1, 1);
+	for ($cTermResidue){s/[\s\n\r\t]//g;};
+	$phMin = $PH_MIN;
+	$phMax = $PH_MAX;
+	my $i;
+	
+	for ($i=0, $charge = 1.0; $i<$MAXLOOP && ($phMax-$phMin)>$EPSI; $i++){
+		$phMid = $phMin + ($phMax - $phMin) / 2;
+		$cter = 10**(-$cPk{$cTermResidue}->[0]) / (10**(-$cPk{$cTermResidue}->[0]) + 10**(-$phMid));	
+		$nter = 10**(-$phMid)/(10**(-$cPk{$nTermResidue}->[1])+10**(-$phMid));
+		
+		$carg = ${$composition}{R} * 10**(-$phMid) / (10**(-$cPk{'R'}->[2])+10**(-$phMid));
+		$chis = ${$composition}{H} * 10**(-$phMid) / (10**(-$cPk{'H'}->[2])+10**(-$phMid));
+		$clys = ${$composition}{K} * 10**(-$phMid) / (10**(-$cPk{'K'}->[2])+10**(-$phMid));
+		
+		$casp = ${$composition}{D} * 10**(-$cPk{'D'}->[2]) / (10**(-$cPk{'D'}->[2])+10**(-$phMid));
+		$cglu = ${$composition}{E} * 10**(-$cPk{'E'}->[2]) / (10**(-$cPk{'E'}->[2])+10**(-$phMid));
+		
+		$ccys = ${$composition}{C} * 10**(-$cPk{'C'}->[2]) / (10**(-$cPk{'C'}->[2])+10**(-$phMid));
+		$ctyr = ${$composition}{Y} * 10**(-$cPk{'Y'}->[2]) / (10**(-$cPk{'Y'}->[2])+10**(-$phMid));
+		
+		$charge = $carg + $clys + $chis + $nter - ($casp + $cglu + $ctyr + $ccys + $cter);
+		
+		if ($charge > 0.0){
+			$phMin = $phMid;	
+		}else{
+			$phMax = $phMid;
+		}
+	}
+	
+	return($phMid);
+	
+}
+ 
+sub _MwCalc{
+	
+	my ($sequence) = @_;
+	
+	my $mw;
+	my $strlength = length($sequence);
+	my $composition = _GetComposition($sequence);
+		
+	# subtract N-1 water molecules
+	$mw = - ($strlength -1) * $H20;
+	
+	foreach my $aa (sort keys %{$composition}){
+		$logger->trace('AA:' . $aa . ' | frequency: ' . ${$composition}{$aa} . ' | AA MW: ' . $cMWs{$aa});
+		$mw += ${$composition}{$aa} * $cMWs{$aa};	
+	}
+	
+	return($mw);
+	
+}
+
+sub _ChargeCalc{
+	
+	my ($sequence) = @_;
+	
+	my $charge;
+	my $composition = _GetComposition($sequence);
+	
+	foreach my $aa (sort(keys(%{$composition}))){
+		$charge += ${$composition}{$aa} * $charges{$aa};	
+	}
+	
+	return($charge);
+	
+}
+
+sub _CountResiduesSTY{
+	
+	my ($sequence) = @_;
+	
+	my $composition = _GetComposition($sequence);
+	my $sty_residues = ${$composition}{'S'} . "\t" . ${$composition}{'T'} . "\t" . ${$composition}{'Y'};
+	
+	return($sty_residues);
+	
+}
+
+sub _GetComposition {
+	
+	my ($sequence) = @_;	
+	
+	my $i;
+	my $c;
+	my %composition = ('A'=>0, 'B'=>0,'C'=>0, 'D'=>0, 'E'=>0, 'F'=>0,'G'=>0, 'H'=>0, 'I'=>0,
+				'J'=>0, 'K'=>0,'L'=>0, 'M'=>0, 'N'=>0, 'O'=>0,'P'=>0, 'Q'=>0, 'R'=>0,
+				'S'=>0, 'T'=>0,'U'=>0, 'V'=>0, 'W'=>0, 'X'=>0,'Y'=>0, 'Z'=>0);	
+	
+	#
+	# Compute aminoacid composition.
+	#
+	my $strlength = length($sequence);
+	for ($i=0; $i<$strlength; $i++){
+		$c = substr($sequence, $i, 1);
+		$composition{$c}++;
+	}
+	
+	return(\%composition);
+	
+}
+
+sub _Usage {
+	
+	my $longest_enzyme_name_length = 0;
+	foreach my $protease (keys(%enzymes)) {
+		if (length($protease) > $longest_enzyme_name_length) {
+			 $longest_enzyme_name_length = length($protease);
+		}
+	}
+	my $field_formatting = '%-' . ($longest_enzyme_name_length + 3) . 's';
+	
+	print "\n";
+	print "ProteinDigestor.pl - Digests protein sequences from a FASTA file\n";
+	print "\n";
+	print "Usage:\n";
+	print "\n";
+	print "   ProteinDigestor.pl options\n";
+	print "\n";
+	print "Available options are:\n";
+	print "\n";
+	print "   -i [file]          Input FASTA file.\n";
+	print "   -o [file]          Output stats file.\n";
+	print "\n";
+	print "                      You can either specify a protease this tool knows with -e\n";
+	print "                      or you can specify the cleavage rule for a protease this does not know (yet) with -r\n";
+	print "\n";
+	print "   -e [enzyme_name]   Digestion enzyme. Known enzymes and their cleavage rules:\n";
+	
+	foreach my $protease (sort(keys(%enzymes))) {
+		print '                         ' . sprintf("$field_formatting", $protease) . $enzymes{$protease} . "\n";
+	}
+	
+	print "   -r [enzyme_rule]   Protease cleavage site rule. This rule contains sperated by colons:\n";
+	print "                         * The terminus that indicates on which side of a recognized amino acid the protease will cleave: c or n.\n";
+	print "                         * The amino acids recognized by the protease.\n";
+	print "                           Multiple amino acids separated by a comma.\n";
+	print "                         * The amino acids that inhibit cleavage and their position relative to amino acids recognized by the protease.\n";
+	print '                           Amino acid and its relative position separated by an equals sign (=).' . "\n";
+	print "                           Multiple amino acids separated by a comma.\n";
+	print '                         For example \'c:K,R:P=1\' for Trypsin with proline inhibition.' . "\n";
+	print "   -s                 Semi for partial digestion.\n";
+	print "   -p [chance]        Number between 0 and 1 indicating the chance that a site will not be cleaved.\n";
+	print "   -m [weight]        Minimum weight for a peptide.\n";
+	print "   -n [weight]        Maximum weight for a peptide.\n";
+	print "   -c [int]           Maximum charge a peptide is allowed to have.\n";
+	print "   -a                 Add sequence context.\n";
+	print "                      This will add max 3 amino acids or the word terminus on each side of the peptides.\n";
+	print "                      Contexts and peptides are separated with a dot.\n";
+	print "                      Examples of peptides with a context:\n";
+	print "                         terminus.MSVSLSAK.ASH\n";
+	print "                         SAK.ASHLLCSSTR.VSL\n";
+	print "                         STR.VSLSPAVTSSSSSPVVRPALSSSTS.terminus\n";
+	print "   -l [LEVEL]         Log4perl log level. One of: ALL, TRACE, DEBUG, INFO (default), WARN, ERROR, FATAL or OFF.\n";
+	print "\n";
+	exit;
+	
+}
author	galaxyp
date	Fri, 10 May 2013 17:15:08 -0400
parents
children