Mercurial > repos > galaxyp > nbic_fasta
diff ProteinDigestor.pl @ 0:163892325845 draft default tip
Initial commit.
author | galaxyp |
---|---|
date | Fri, 10 May 2013 17:15:08 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ProteinDigestor.pl Fri May 10 17:15:08 2013 -0400 @@ -0,0 +1,820 @@ +#!/usr/bin/perl -w + +############################################################### +# ProteinDigestor.pl +# +# ===================================================== +# $Id: ProteinDigestor.pl 76 2011-01-04 13:16:56Z pieter.neerincx@gmail.com $ +# $URL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/ProteinDigestor.pl $ +# $LastChangedDate: 2011-01-04 07:16:56 -0600 (Tue, 04 Jan 2011) $ +# $LastChangedRevision: 76 $ +# $LastChangedBy: pieter.neerincx@gmail.com $ +# ===================================================== +# +# (c) Dr. Ir. B. van Breukelen et al. +# https://bioinformatics.chem.uu.nl +# http://www.netherlandsproteomicscentre.eu/ +# b.vanbreukelen@pharm.uu.nl +# +# Software to create peptide databases from a fasta +# file of proteins. You can cut with several enzymes +# select on size, PI, etc etc etc. +# +# TODO: put filters, enzymes, in seperate config file +############################################################### + +############################# +# Initialise environment +############################# +use strict; +use Getopt::Std; +use Log::Log4perl qw(:easy); + +# +# Log levels for Log4perl. +# +my %log_levels = ( + 'ALL' => $ALL, + 'TRACE' => $TRACE, + 'DEBUG' => $DEBUG, + 'INFO' => $INFO, + 'WARN' => $WARN, + 'ERROR' => $ERROR, + 'FATAL' => $FATAL, + 'OFF' => $OFF, +); + +# +# pK values array +# +my %cPk = ( + 'A'=>[3.55, 7.59, 0 , 0 , 0], + 'B'=>[3.55, 7.50, 0 , 0 , 0], + 'C'=>[3.55, 7.50, 9.00 , 9.00 , 9.00], + 'D'=>[4.55, 7.50, 4.05 , 4.05 , 4.05], + 'E'=>[4.75, 7.70, 4.45 , 4.45 , 4.45], + 'F'=>[3.55, 7.50, 0 , 0. , 0], + 'G'=>[3.55, 7.50, 0 , 0. , 0], + 'H'=>[3.55, 7.50, 5.98 , 5.98 , 5.98], + 'I'=>[3.55, 7.50, 0 , 0. , 0.], + 'J'=>[0.00, 0.00, 0 , 0. , 0.], + 'K'=>[3.55, 7.50, 10.00, 10.00, 10.00], + 'L'=>[3.55, 7.50, 0 , 0. , 0.], + 'M'=>[3.55, 7.00, 0 , 0. , 0.], + 'N'=>[3.55, 7.50, 0 , 0. , 0.], + 'O'=>[0.00, 0.00, 0 , 0. , 0.], + 'P'=>[3.55, 8.36, 0 , 0. , 0.], + 'Q'=>[3.55, 7.50, 0 , 0. , 0.], + 'R'=>[3.55, 7.50, 12.0 , 12.0 , 12.0], + 'S'=>[3.55, 6.93, 0 , 0. , 0.], + 'T'=>[3.55, 6.82, 0 , 0. , 0.], + 'U'=>[0.00, 0.00, 0 , 0. , 0.], + 'V'=>[3.55, 7.44, 0 , 0. , 0.], + 'W'=>[3.55, 7.50, 0 , 0. , 0.], + 'X'=>[3.55, 7.50, 0 , 0. , 0.], + 'Y'=>[3.55, 7.50, 10.00, 10.00, 10.00], + 'Z'=>[3.55, 7.50, 0 , 0. , 0.] +); + +# +# MW values array +# +my %cMWs = ( + 'A'=>89.09, + 'B'=>132.65, + 'C'=>121.15, + 'D'=>133.1, + 'E'=>147.13, + 'F'=>165.19, + 'G'=>75.07, + 'H'=>155.16, + 'I'=>131.18, + 'J'=>0.00, + 'K'=>146.19, + 'L'=>131.18, + 'M'=>149.22, + 'N'=>132.12, + 'O'=>0.00, + 'P'=>115.13, + 'Q'=>146.15, + 'R'=>174.21, + 'S'=>105.09, + 'T'=>119.12, + 'U'=>168.05, + 'V'=>117.15, + 'W'=>204.22, + 'X'=>129.26, + 'Y'=>181.19, + 'Z'=>146.73 +); + +# +# Digestion enzymes. +# +# For example Trypsin cuts c-terminal of K and R not followed by a P at position p1 = c:K,R:P=1 +# +my %enzymes = ( + 'Trypsin' => 'c:K,R:P=1', + 'Trypsin/P' => 'c:K,R', + 'Chymotrypsin' => 'c:F,L,W,Y:P=1', + 'Chymotrypsin/P' => 'c:F,L,W,Y', + 'V8_E' => 'c:E,Z:P=1', + 'V8_DE' => 'c:E,D,B,Z:P=1', + 'LysC' => 'c:K:P=1', + 'LysC/P' => 'c:K', + 'LysN/P' => 'n:K' +); + +my @filters = ( +# "[R|K][R|K][R|K]..[S|T]." +# "[R|K][R|K][R|K].[S|T]." , +# "[R|K][R|K]..[S|T].", +# "[R|K]..[S|T].", +# "[R|K][R|K].[S|T].", +# "[R|K].[S|T]." +# "[S|T].[R|K]" +); + +my %charges = ('A'=>0, 'B'=>0, 'C'=>0, 'D'=>-1, 'E'=>-1, 'F'=>0,'G'=>0, 'H'=>1, 'I'=>0, + 'J'=>0, 'K'=>1, 'L'=>0, 'M'=>0, 'N'=>0, 'O'=>0,'P'=>0, 'Q'=>0, 'R'=>1, + 'S'=>0, 'T'=>0, 'U'=>0, 'V'=>0, 'W'=>0, 'X'=>0,'Y'=>0, 'Z'=>0); + +my %pept = (); #hashed array of peptides (HASH) and fasta headers (VALUE) + +my $H20 = 18.015; #Mol. weight water + +my $PH_MIN = 0.0; #min pH value +my $PH_MAX = 14.0; #max pH value +my $MAXLOOP = 2000; # max number iterations +my $EPSI = 0.0001; # desired presision + +# +# Get options. +# +my %opts; +Getopt::Std::getopts('i:o:e:r:p:m:n:c:l:sa', \%opts); + +my $input = $opts{'i'}; +my $output = $opts{'o'}; +my $enzyme = $opts{'e'}; +my $enzyme_cleavage_site_rules = $opts{'r'}; +my $partial_cleavage = $opts{'s'}; +my $partial_cleavage_chance = $opts{'p'}; +my $min_peptide_weight = $opts{'m'}; +my $max_peptide_weight = $opts{'n'}; +my $max_charge = $opts{'c'}; +my $add_sequence_context = $opts{'a'}; +my $log_level = $opts{'l'}; + +# +# Configure logging. +# +# Provides default if user did not specify log level: +$log_level = (defined($log_level) ? $log_level : 'INFO'); + +# Reset log level to default if user specified illegal log level. +$log_level = ( + defined($log_levels{$log_level}) + ? $log_levels{$log_level} + : $log_levels{'INFO'}); + +#Log::Log4perl->init('log4perl.properties'); +Log::Log4perl->easy_init( + + #{ level => $log_level, + # file => ">>ProteinDigestor.log", + # layout => '%F{1}-%L-%M: %m%n' }, + { + level => $log_level, + file => "STDOUT", + layout => '%d L:%L %p> %m%n' + }, +); +my $logger = Log::Log4perl::get_logger(); + +# +# Check user input. +# +unless (defined($input) && defined($output)) { + _Usage(); + exit; +} +if ($input =~ /^$/ || $output =~ /^$/) { + _Usage(); + exit; +} +if ($input eq $output) { + $logger->fatal('Output file is the same as the input file. Select a different file for the output.'); + exit; +} + +# +# Check input file. +# +unless (-e $input && -f $input && -r $input) { + $logger->fatal('Cannot read from input file ' . $input . ': ' . $!); + exit; +} + +# +# Set additional defaults. +# +$partial_cleavage = (defined($partial_cleavage) ? $partial_cleavage : 0); # Disabled by default +$partial_cleavage_chance = (defined($partial_cleavage_chance) ? $partial_cleavage_chance : 0.1); # 10% change by default +$add_sequence_context = (defined($add_sequence_context) ? $add_sequence_context : 0); # Disabled by default + +######################################################## +# MAIN PROGRAM +######################################################## + +$logger->info('Starting...'); + +# +# Get protease cleavage site specs. +# +unless (defined($enzyme_cleavage_site_rules)) { + + # + # Set default to Trypsin if the user did not specify an enzyme name nor a string with enzyme cleavage site rules. + # + $enzyme = (defined($enzyme) ? $enzyme : 'Trypsin'); + + # + # Check for illegal enzymes. + # + unless (defined($enzymes{$enzyme})) { + $logger->fatal('Unkown enzyme ' . $enzyme . '.'); + exit; + } + + # + # Fetch enzyme cleavage site rules. + # + $enzyme_cleavage_site_rules = $enzymes{$enzyme}; + +} + +$logger->info('Using enzyme cleavage site rules: ' . $enzyme_cleavage_site_rules); +my @split_params = split(/:/, $enzyme_cleavage_site_rules); +my $cut_term = $split_params[0]; +my @cut_aa = split(/,/, $split_params[1]); +my @inhibition_aa; +if (defined($split_params[2])) { + @inhibition_aa = split(/,/, $split_params[2]); +} +my $annotation = {}; +my $sequence = ''; + +# +# Create filehandles. +# +open(my $input_fh, "$input"); +open(my $output_fh, ">$output"); + +# +# Write header to result file. +# +print($output_fh "Protein ID\tPeptide\tMW\tpI\tCharge\tnumber S\tnumber T\tnumber Y\n"); + +# +# Digest fasta files and store peptides in an array (hashed) +# +foreach my $line (<$input_fh>){ + + if ($line =~ /^>/){ + + # + # We found a new FASTA header. + # + + # + # Process the previously found protein if we already found one. + # + unless ($sequence eq '') { + _ProcessProtein(); + } + + # + # Parse new FASTA header. + # + $line =~ s/[\r\n\f]+//g; + $annotation = _ParseHeader($line); + $sequence = ''; + + } else { + + # + # Found a sequence line. + # + $line =~ s/[\t\s\r\n\f0-9\*\-]+//g; + $sequence .= $line; + } +} + +# +# Don't foget to process the last sequence! +# +_ProcessProtein(); + +close($input_fh); +close($output_fh); + +$logger->info('Finished!'); + +######################################################### +# SUBS +######################################################### + +sub _ProcessProtein { + + my $id = ${$annotation}{'ID'}; + $logger->debug('Accession/ID: ' . $id); + $logger->debug('Sequence: ' . $sequence . "\n"); + my $total_length_sequence = length($sequence); + my $total_peptide_length = 0; + + my ($peptides) = _Digest($sequence, $annotation, \@cut_aa, $cut_term, \@inhibition_aa); + + # + # Process peptides. + # + foreach my $peptide_with_context (sort(keys(%{$peptides}))) { + + # + # Get peptide properties. + # + $logger->debug('Peptide with context: ' . $peptide_with_context); + my ($bare_peptide) = _ParsePeptide($peptide_with_context); + my $peplength = length($bare_peptide); + my ($pepweight) = _MwCalc($bare_peptide); + + # + # Apply filters + # + # Filter 1 is on weight. + # + if (defined($min_peptide_weight)) { + + $logger->debug('Minimum weight filter:'); + $logger->debug("\t" . 'Peptide weight: ' . $pepweight); + + if ($pepweight >= $min_peptide_weight) { + + $logger->debug("\t" . 'PASSED - Weight is >= ' . $min_peptide_weight); + + } else { + + # Skip this one. + $logger->debug("\t" . 'REJECTED - Weight is < ' . $min_peptide_weight); + next; + + } + } + + if (defined($max_peptide_weight)) { + + $logger->debug('Maximum weight filter:'); + $logger->debug("\t" . 'Peptide weight: ' . $pepweight); + + if ($pepweight <= $max_peptide_weight) { + + $logger->debug("\t" . 'PASSED - Weight is <= ' . $max_peptide_weight); + + } else { + + # Skip this one. + $logger->debug("\t" . 'REJECTED - Weight is > ' . $max_peptide_weight); + next; + + } + } + + # + # Filter 2 is on sequence motifs. + # + if (scalar(@filters) > 0) { + + $logger->debug('Motif filter:'); + my ($motif) = _ContainsMotif($bare_peptide, \@filters); + + if ($motif > 0) { + + $logger->debug("\t" . 'PASSED - Peptide contains motif ' . $motif); + + } else { + + # Skip this one. + $logger->debug("\t" . 'REJECTED - Peptide does not contain any motif.'); + next; + + } + } + + my ($pepPI) = _PiCalc($bare_peptide); + my ($pepCharge) = _ChargeCalc($bare_peptide); + + # + # Filter 3 is on the amount of charges. + # + if (defined($max_charge)) { + + $logger->debug('Charge filter:'); + $logger->debug("\t" . 'Peptide charge: ' . $pepCharge); + + if (defined($max_charge) && $pepCharge <= $max_charge) { + + $logger->debug("\t" . 'PASSED - Charge <= ' . $max_charge); + + } else { + + # Skip this one. + $logger->debug("\t" . 'REJECTED - Charge > ' . $max_charge); + next; + + } + } + + # + # If there is no partial digestion we can calculated the % coverage. + # + unless ($partial_cleavage) { + $total_peptide_length += length($bare_peptide); + } + + my ($residue_count) = _CountResiduesSTY($bare_peptide); + + # + # Store peptide. + # + my $pep_sequence; + + if ($add_sequence_context) { + $pep_sequence = $peptide_with_context; + } else { + $pep_sequence = $bare_peptide; + } + + print($output_fh $id . "\t" . $pep_sequence . "\t" . $pepweight . "\t" . $pepPI. "\t" . $pepCharge. "\t" . $residue_count . "\n"); + + } + + # + # If partial digestion was not enabled, we can calculated the % coverage. + # + unless ($partial_cleavage) { + my $coverage = ($total_peptide_length / $total_length_sequence)*100; + $logger->debug("Sequence residues: $total_length_sequence | #peptide residues: $total_peptide_length | %Coverage: $coverage"); + } +} + +sub _ParseHeader { + + my ($header) = @_; + my %annotation; + my $ids; + my $id; + my $description; + + $header=~ s/^>//; + + if ($header =~ m/([^\s]+)\s+([^\s]+.*)/) { + $ids = $1; + $description = $2; + } else { + $ids = $header; + } + + # + # Redimentory support for detecting multiple IDs: check for pipe sperated IDs. + # + if ($ids =~ m/([^\|])\|/) { + $id = $1; + } else { + $id = $ids; + } + + $annotation{'ID'} = $id; + $annotation{'DE'} = $description; + + return (\%annotation); + +} + +sub _ContainsMotif{ + + my ($peptide, $filters) = @_; + my $cnt = 0; + + foreach my $filter (@{$filters}){ + + $cnt++; + if ($peptide =~ /$filter/){ + return($cnt); + } + + } + + return(0); + +} + +sub _ParsePeptide{ + + my ($peptide_with_flanking_context) = @_; + + my @pep = split(/\./, $peptide_with_flanking_context); + my $bare_peptide = uc($pep[1]); + + return($bare_peptide); + +} + +sub _Digest{ + + my ($sequence, $annotation, $enz_cutting, $cut_term, $enz_restrict) = @_; + + $sequence = uc($sequence); + my %pept = (); + + # we start by scanning the sequence.. to find one of the cutting rulez not followed by a restict rule + # what is the sequence length + my @array = split(//, $sequence); + my $length = scalar(@array); + my $start_offset = 0; + my $offset_adjust_for_cleavage_terminus = 0; + if ($cut_term eq 'n') { + $offset_adjust_for_cleavage_terminus = 1; + } + + my $peptide = ''; + + my $counter = 1; + if (!($partial_cleavage)) { + $partial_cleavage_chance = 0; + } else { + $counter = 100; + } + + #this is for incomplete cleavage... we repeat digest 100 times with a probability that the enzyme will not cut + for (my $cnt = 0; $cnt < $counter; $cnt++) { + + AA:for (my $aa_offset = 0 + $offset_adjust_for_cleavage_terminus; $aa_offset < ($length - 1 + $offset_adjust_for_cleavage_terminus); $aa_offset++) { + + my $this_aa = substr($sequence, $aa_offset, 1); + + # + # Check if protease recognizes this amino acid as cleavage site. + # + foreach my $cut_aa (@{$enz_cutting}) { + + if ($this_aa eq $cut_aa) { + + # + # Check if AA is not preceeded or followed by an AA that inhibits the protease... + # + foreach my $inhibit (@{$enz_restrict}) { + + + my @inhibit_conditions = split(/=/, $inhibit); + my $inhibit_aa = $inhibit_conditions[0]; + my $position_to_check = $inhibit_conditions[1]; + + my $neighborhood_aa = substr($sequence, $aa_offset + $position_to_check, 1); + + if ($neighborhood_aa eq $inhibit_aa){ + $logger->debug('No cleavage due to inhibition by ' . $inhibit_aa . ' at position ' . $position_to_check); + next AA; + } + } + + # + # Consider the possibility of not cutting when doing incomplete digestions! + # + my $random = 1; + if ($partial_cleavage){ + $random = rand(1); + } + if ($random <= $partial_cleavage_chance){ + $logger->debug('No cleavage due to incomplete digestion.'); + next AA; + } + + # + # If no inhibition spoiled the cleavage, lets cut! + # + my $cut_offset; + if ($cut_term eq 'c') { + $cut_offset = $aa_offset + 1; + } elsif ($cut_term eq 'n') { + $cut_offset = $aa_offset; + } + + # nice formatting of peptide + if ($start_offset == 0){ + $peptide .="terminus."; + $peptide .= substr($sequence, $start_offset, $cut_offset - $start_offset)."."; + }else{ + $peptide .= substr($sequence, $start_offset - 3, 3)."."; + $peptide .= substr($sequence, $start_offset, $cut_offset - $start_offset)."."; + } + $peptide .= substr($sequence, $cut_offset, 3); + $start_offset = $cut_offset; + + # check if peptide already exists + # if so, concatenate this fasta header + #if (exists($pept{$peptide})){ + # $fasta_header .= $pept{$peptide}; + #} + $pept{$peptide} = ${$annotation}{'ID'}; + $peptide = ''; + } + } + } + + # + # The last peptide sequence ! + # + $peptide = substr($sequence, $start_offset - 3, 3)."."; + $peptide .= substr($sequence, $start_offset, $length); + $peptide .= ".terminus"; + $pept{$peptide} = ${$annotation}{'ID'}; + + } #COUNT + + return(\%pept); + +} + +sub _PiCalc{ + + my ($sequence) = @_; + + my $nTermResidue; + my $cTermResidue; + my $phMin;my $phMid ;my $phMax; + my $charge; + my $cter; my $nter; my $carg; my $chis; my $clys;my $casp;my $cglu;my $ccys;my $ctyr; + + my $composition = _GetComposition($sequence); + my $strlength = length($sequence); + $nTermResidue = substr($sequence, 0, 1); + for ($nTermResidue){s/[\s\n\r\t]//g;}; + $cTermResidue = substr($sequence, $strlength - 1, 1); + for ($cTermResidue){s/[\s\n\r\t]//g;}; + $phMin = $PH_MIN; + $phMax = $PH_MAX; + my $i; + + for ($i=0, $charge = 1.0; $i<$MAXLOOP && ($phMax-$phMin)>$EPSI; $i++){ + $phMid = $phMin + ($phMax - $phMin) / 2; + $cter = 10**(-$cPk{$cTermResidue}->[0]) / (10**(-$cPk{$cTermResidue}->[0]) + 10**(-$phMid)); + $nter = 10**(-$phMid)/(10**(-$cPk{$nTermResidue}->[1])+10**(-$phMid)); + + $carg = ${$composition}{R} * 10**(-$phMid) / (10**(-$cPk{'R'}->[2])+10**(-$phMid)); + $chis = ${$composition}{H} * 10**(-$phMid) / (10**(-$cPk{'H'}->[2])+10**(-$phMid)); + $clys = ${$composition}{K} * 10**(-$phMid) / (10**(-$cPk{'K'}->[2])+10**(-$phMid)); + + $casp = ${$composition}{D} * 10**(-$cPk{'D'}->[2]) / (10**(-$cPk{'D'}->[2])+10**(-$phMid)); + $cglu = ${$composition}{E} * 10**(-$cPk{'E'}->[2]) / (10**(-$cPk{'E'}->[2])+10**(-$phMid)); + + $ccys = ${$composition}{C} * 10**(-$cPk{'C'}->[2]) / (10**(-$cPk{'C'}->[2])+10**(-$phMid)); + $ctyr = ${$composition}{Y} * 10**(-$cPk{'Y'}->[2]) / (10**(-$cPk{'Y'}->[2])+10**(-$phMid)); + + $charge = $carg + $clys + $chis + $nter - ($casp + $cglu + $ctyr + $ccys + $cter); + + if ($charge > 0.0){ + $phMin = $phMid; + }else{ + $phMax = $phMid; + } + } + + return($phMid); + +} + +sub _MwCalc{ + + my ($sequence) = @_; + + my $mw; + my $strlength = length($sequence); + my $composition = _GetComposition($sequence); + + # subtract N-1 water molecules + $mw = - ($strlength -1) * $H20; + + foreach my $aa (sort keys %{$composition}){ + $logger->trace('AA:' . $aa . ' | frequency: ' . ${$composition}{$aa} . ' | AA MW: ' . $cMWs{$aa}); + $mw += ${$composition}{$aa} * $cMWs{$aa}; + } + + return($mw); + +} + +sub _ChargeCalc{ + + my ($sequence) = @_; + + my $charge; + my $composition = _GetComposition($sequence); + + foreach my $aa (sort(keys(%{$composition}))){ + $charge += ${$composition}{$aa} * $charges{$aa}; + } + + return($charge); + +} + +sub _CountResiduesSTY{ + + my ($sequence) = @_; + + my $composition = _GetComposition($sequence); + my $sty_residues = ${$composition}{'S'} . "\t" . ${$composition}{'T'} . "\t" . ${$composition}{'Y'}; + + return($sty_residues); + +} + +sub _GetComposition { + + my ($sequence) = @_; + + my $i; + my $c; + my %composition = ('A'=>0, 'B'=>0,'C'=>0, 'D'=>0, 'E'=>0, 'F'=>0,'G'=>0, 'H'=>0, 'I'=>0, + 'J'=>0, 'K'=>0,'L'=>0, 'M'=>0, 'N'=>0, 'O'=>0,'P'=>0, 'Q'=>0, 'R'=>0, + 'S'=>0, 'T'=>0,'U'=>0, 'V'=>0, 'W'=>0, 'X'=>0,'Y'=>0, 'Z'=>0); + + # + # Compute aminoacid composition. + # + my $strlength = length($sequence); + for ($i=0; $i<$strlength; $i++){ + $c = substr($sequence, $i, 1); + $composition{$c}++; + } + + return(\%composition); + +} + +sub _Usage { + + my $longest_enzyme_name_length = 0; + foreach my $protease (keys(%enzymes)) { + if (length($protease) > $longest_enzyme_name_length) { + $longest_enzyme_name_length = length($protease); + } + } + my $field_formatting = '%-' . ($longest_enzyme_name_length + 3) . 's'; + + print "\n"; + print "ProteinDigestor.pl - Digests protein sequences from a FASTA file\n"; + print "\n"; + print "Usage:\n"; + print "\n"; + print " ProteinDigestor.pl options\n"; + print "\n"; + print "Available options are:\n"; + print "\n"; + print " -i [file] Input FASTA file.\n"; + print " -o [file] Output stats file.\n"; + print "\n"; + print " You can either specify a protease this tool knows with -e\n"; + print " or you can specify the cleavage rule for a protease this does not know (yet) with -r\n"; + print "\n"; + print " -e [enzyme_name] Digestion enzyme. Known enzymes and their cleavage rules:\n"; + + foreach my $protease (sort(keys(%enzymes))) { + print ' ' . sprintf("$field_formatting", $protease) . $enzymes{$protease} . "\n"; + } + + print " -r [enzyme_rule] Protease cleavage site rule. This rule contains sperated by colons:\n"; + print " * The terminus that indicates on which side of a recognized amino acid the protease will cleave: c or n.\n"; + print " * The amino acids recognized by the protease.\n"; + print " Multiple amino acids separated by a comma.\n"; + print " * The amino acids that inhibit cleavage and their position relative to amino acids recognized by the protease.\n"; + print ' Amino acid and its relative position separated by an equals sign (=).' . "\n"; + print " Multiple amino acids separated by a comma.\n"; + print ' For example \'c:K,R:P=1\' for Trypsin with proline inhibition.' . "\n"; + print " -s Semi for partial digestion.\n"; + print " -p [chance] Number between 0 and 1 indicating the chance that a site will not be cleaved.\n"; + print " -m [weight] Minimum weight for a peptide.\n"; + print " -n [weight] Maximum weight for a peptide.\n"; + print " -c [int] Maximum charge a peptide is allowed to have.\n"; + print " -a Add sequence context.\n"; + print " This will add max 3 amino acids or the word terminus on each side of the peptides.\n"; + print " Contexts and peptides are separated with a dot.\n"; + print " Examples of peptides with a context:\n"; + print " terminus.MSVSLSAK.ASH\n"; + print " SAK.ASHLLCSSTR.VSL\n"; + print " STR.VSLSPAVTSSSSSPVVRPALSSSTS.terminus\n"; + print " -l [LEVEL] Log4perl log level. One of: ALL, TRACE, DEBUG, INFO (default), WARN, ERROR, FATAL or OFF.\n"; + print "\n"; + exit; + +}