# HG changeset patch # User galaxyp # Date 1368220508 14400 # Node ID 16389232584566a2f6a19732868fa64bc24cb20b Initial commit. diff -r 000000000000 -r 163892325845 ConvertFastaHeaders.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ConvertFastaHeaders.pl Fri May 10 17:15:08 2013 -0400 @@ -0,0 +1,669 @@ +#!/usr/bin/perl + +# +# convertFastaHeaders.pl +# +# $Id: ConvertFastaHeaders.pl 44 2010-10-18 12:58:41Z pieter.neerincx@gmail.com $ +# $URL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/ConvertFastaHeaders.pl $ +# $LastChangedDate: 2010-10-18 07:58:41 -0500 (Mon, 18 Oct 2010) $ +# $LastChangedRevision: 44 $ +# $LastChangedBy: pieter.neerincx@gmail.com $ +# +# Converts sequence header of FASTA files (in various customisable ways). +# + +# +# Initialize evironment +# +use strict; +use Getopt::Std; +use Log::Log4perl qw(:easy); + +my %log_levels = ( + 'ALL' => $ALL, + 'TRACE' => $TRACE, + 'DEBUG' => $DEBUG, + 'INFO' => $INFO, + 'WARN' => $WARN, + 'ERROR' => $ERROR, + 'FATAL' => $FATAL, + 'OFF' => $OFF, +); + +# +# Get options. +# +my %opts; +Getopt::Std::getopts('i:o:l:e:f:n:a:p:', \%opts); +my $input = $opts{'i'}; +my $output = $opts{'o'}; +my $log_level = $opts{'l'}; +my $extension = $opts{'e'}; +my @x_fixes_array = split(/\s+/, $opts{'f'}); +my $new_x_fix = $opts{'n'}; +my $action = $opts{'a'}; +my $position = $opts{'p'}; +my %ids_to_delete; +my @new_id_order; + +# +# Configure logging. +# +# Provides default if user did not specify log level: +$log_level = (defined($log_level) ? $log_level : 'WARN'); +# Reset log level to default if user specified illegal log level. +$log_level = (defined($log_levels{$log_level}) ? $log_levels{$log_level} : $log_levels{'WARN'}); +#Log::Log4perl->init('log4perl.properties'); +Log::Log4perl->easy_init( + #{ level => $log_level, + # file => ">>ConvertFastaHeaders.log", + # layout => '%F{1}-%L-%M: %m%n' }, + { level => $log_level, + file => "STDERR", + layout => '%d L:%L %p> %m%n' }, +); +my $logger = Log::Log4perl::get_logger(); + +# +# Start the conversion process. +# +$logger->info("Starting..."); + +# +# Check user input. +# + +# Provides default if user did not specify action: +$action = (defined($action) ? $action : 'add'); + +# Check for valid action and action specific options. +if ($action eq 'add' || $action eq 'strip' || $action eq 'replace') { + + unless (scalar(@x_fixes_array) > 0) { + $logger->fatal('No prefixes or suffixes specified.'); + _Usage(); + } + + if ($action eq 'replace') { + unless (defined($new_x_fix) && $new_x_fix ne '') { + $logger->fatal('No new prefix or suffix specified to replace the existing ones.'); + _Usage(); + } + } + + # Provides default if user did not specify position: + $position = (defined($position) ? $position : 'prefix'); + # Check for valid position. + if ($action eq 'add' || $action eq 'strip') { + unless ($position eq 'prefix' || $position eq 'suffix') { + $logger->fatal('Illegal position specified. Must be \'prefix\' or \'suffix\'.'); + _Usage(); + } + } elsif ($action eq 'replace') { + unless ($position eq 'prefix' || $position eq 'suffix' || $position eq 'pre2suf' || $position eq 'suf2pre') { + $logger->fatal('Illegal position specified. Must be \'prefix\', \'suffix\', \'pre2suf\' or \'suf2pre\'.'); + _Usage(); + } + } + +} elsif ($action eq 'delete' || $action eq 'shuffle') { + + unless (defined($position) && $position ne '') { + $logger->fatal('No position specified.'); + _Usage(); + } + + my @id_indices = split(/,/, $position); + + # Check if the value is a number. + foreach my $index_number (@id_indices) { + + unless ($index_number =~ m/^[1-9][0-9]*$/) { + + $logger->fatal('Illegal character in position list. Must be a single positive integer or comma separated list of positive integers.'); + _Usage(); + + } + + if ($action eq 'delete') { + + $ids_to_delete{$index_number} = 'del'; + + } elsif ($action eq 'shuffle') { + + push(@new_id_order, $index_number); + + } + } + +} else { + $logger->fatal('Illegal action specified. Must be add, strip, replace, delete or shuffle.'); + _Usage(); +} + + +# Provides default if user did not specify log level: +$log_level = (defined($log_level) ? $log_level : 'WARN'); +# Reset log level to default if user specified illegal log level. +$log_level = (defined($log_levels{$log_level}) ? $log_levels{$log_level} : $log_levels{'WARN'}); + +# Provide default if user did not specify fasta filename extension: +$extension = (defined($extension) ? $extension : 'fa'); + +if ($input =~ /^$/ || $output =~ /^$/) { + # Indir and outdir cannot be empty. + _Usage(); +} +if ($input eq $output) { + $logger->fatal("Output dir/file is the same as the input dir/file. Please choose a different one."); + exit; +} + +# +# Check if input is a single file or a directory. +# +unless (-e $input && -r $input) { + + $logger->fatal("Input $input does not exist or is not readable: $!"); + exit; + +} else { + + if (-f $input) { + + # + # We've got an input file. + # + my $file; + if ($input =~ m/(.+\/)([^\/]+)$/) { + $file = $2; + } else { + $file = $input; + } + + $logger->info('Parsing ' . $file . "...\n"); + + _ConvertFastaHeaders($input, $output, $action, \@x_fixes_array, $new_x_fix, $position, \%ids_to_delete, \@new_id_order); + + $logger->info('Converted ' . $file); + + } else { + + # + # We've got an input directory. + # Assume the output is also a directory. + # Append trailing path separators if they was missing. + # + my $indir; + my $outdir; + unless ($input =~ m/\/$/) { + $input = $input .+ '/'; + } + unless ($output =~ m/\/$/) { + $output = $output .+ '/'; + } + # + # Make sure the input dir is a directory. + # + unless (-d $input) { + $logger->fatal("Input $input is not a file nor a directory: $!"); + exit; + } else { + $indir = $input; + $outdir = $output; + } + + # + # Get all FASTA files from the input dir. + # + my $files = _GetFiles($indir, $outdir, $extension); + + # + # Create the output directory if did not exist yet. + # + if (-e $outdir && -d $outdir) { + unless (-w $outdir) { + $logger->fatal("Cannot write to output directory $outdir. Check for permission errors, read-only file systems, etc."); + exit; + } + } else { + $logger->info("Creating output directory $outdir..."); + eval{mkdir($outdir);}; + if ($@) { + $logger->fatal("Cannot create output directory $outdir: $@"); + exit; + } + } + + # + # Convert FASTA files. + # + foreach my $file (@{$files}) { + + $logger->info('Parsing ' . $file . "...\n"); + + my $pathfrom = $indir .+ $file; + my $pathto = $outdir .+ $file; + + _ConvertFastaHeaders($input, $output, $action, \@x_fixes_array, $new_x_fix, $position, \%ids_to_delete, \@new_id_order); + + $logger->info('Converted ' . $file); + + } + } +} + +$logger->info('Finished!'); + +# +## +### Internal subs. +## +# + +sub _GetFiles { + + my ($indir, $outdir, $extension) = @_; + my @files; + + # + # Get the relative path to the outdir. + # Use this to remove it from the list of files/folders that need to be processed + # in case it's a subfolder of the input directory. + # + $outdir =~ m/\/([^\/]+)\/$/; + my $outdir_rel = $1; + + # + # Get and parse all files from the input dir. + # + eval{ + opendir (INDIR, $indir); + @files = grep { /.+\.$extension/i and not /^\..*/ and not /$outdir_rel/} readdir INDIR; + closedir INDIR; + }; + if ($@) { + $logger->fatal("Cannot read files from input directory $indir: $@"); + exit; + } + + return(\@files); +} + +sub _ConvertFastaHeaders { + + $logger->debug('_ConvertFastaHeaders sub'); + + my ($pathfrom, $pathto, $action, $x_fixes_array, $new_x_fix, $position, $ids_to_delete, $new_id_order) = @_; + + my $header_count = 0; + + #local($/) = "\n\n"; # set line seperator to a blank line + open(READ,"<$pathfrom") or die "\tcan't open input file $pathfrom: $!"; + open(SAVE,">$pathto") or die "\tcan't open output file $pathto: $!"; + while (my $line = ) { + + my $new_line; + + if ($line =~ /^>/) { + + # + # It's a header line. + # + $header_count++; + my $ids_string; + my $description; + my $line_end; + + if ($line =~ /^>([^\s]+)\s+(.+)([\n\r\f]+)/i) { + + # + # Header with descripton + # + $ids_string = $1; + $description = $2; + $line_end = $3; + + } elsif ($line =~ /^>([^\s]+)\s*([\n\r\f]+)/i) { + + # + # Header without descripton + # + $ids_string = $1; + $line_end = $2; + + } else { + + $logger->fatal("Malformed header line. Cannot find ID."); + exit; + + } + + my @ids = split(/\|/, $ids_string); + + if ($action eq 'strip') { + + $new_line = _StripFix($x_fixes_array, $ids_string, $description); + + } elsif ($action eq 'replace') { + + $new_line = _ReplaceFix($x_fixes_array, $new_x_fix, $position, \@ids, $description); + + } elsif ($action eq 'add') { + + $new_line = _AddFix($x_fixes_array, $position, \@ids, $description); + + } elsif ($action eq 'delete') { + + $new_line = _DeleteID($ids_to_delete, \@ids, $description); + + } elsif ($action eq 'shuffle') { + + $new_line = _ShuffleID($new_id_order, \@ids, $description); + + } + + unless (defined($new_line)) { + + $logger->fatal('Cannot convert header number: ' . $header_count); + $logger->fatal('Offending header line was: ' . $line); + exit; + + } + + $new_line .= $line_end; + + } elsif ($line =~ /^[\n\r\f]+$/) { + + # Skip blank line. + + } else { + + # + # It must be a sequence line. + # + $new_line = $line; + + } + + # Save (modified) line. + print SAVE $new_line or die "\tcan't save output to file $pathto: $!"; + + } + + close(READ); + close(SAVE); + +} + +sub _StripFix { + + my ($x_fixes_array, $ids_string, $description) = @_; + my $new_line; + + foreach my $x_fix (@{$x_fixes_array}) { + + $ids_string =~ s/$x_fix//g; + + } + + if (defined($description)) { + $new_line = '>' . $ids_string . ' ' . $description; + } else { + $new_line = '>' . $ids_string; + } + + return($new_line); + +} + +sub _ReplaceFix { + + my ($x_fixes_array, $new_x_fix, $position, $ids, $description) = @_; + my $new_line = '>'; + + for my $count (0 .. $#{$ids}) { + + my $id = ${$ids}[$count]; + my $stripped_id; + my $match = 0; + + if ($position eq 'prefix' || $position eq 'pre2suf') { + + foreach my $x_fix (@{$x_fixes_array}) { + + if ($id =~ m/^$x_fix(.+)/) { + + $stripped_id = $1; + $id = $stripped_id; + $match = 1; + + } + } + + } elsif ($position eq 'suffix' || $position eq 'suf2pre') { + + foreach my $x_fix (@{$x_fixes_array}) { + + if ($id =~ m/(.+)$x_fix$/) { + + $stripped_id = $1; + $id = $stripped_id; + $match = 1; + + } + } + + } else { + + $logger->fatal("Illegal or no position $position specified."); + exit; + + } + + if ($match) { + + # + # Append the new *fix. + # + if ($position eq 'prefix' || $position eq 'suf2pre') { + + $new_line .= $new_x_fix . $stripped_id . '|'; + + } elsif ($position eq 'pre2suf' || $position eq 'suffix') { + + $new_line .= $stripped_id . $new_x_fix . '|'; + + } + + } else { + + # + # Copy the ID unmodified to the result. + # + $new_line .= ${$ids}[$count] . '|'; + + } + } + + $new_line =~ s/\|$//; + if (defined($description)) { + $new_line .= ' ' . $description; + } + + return($new_line); + +} + +sub _AddFix { + + my ($x_fixes_array, $position, $ids, $description) = @_; + my $new_line = '>'; + + my $id_count = scalar(@{$ids}); + my $x_fix_count = scalar(@{$x_fixes_array}); + + unless ($id_count == $x_fix_count) { + $logger->fatal('Amount of pre- or suffixes specified (' . $x_fix_count . ') does not match with amount if IDs found ' . $id_count . ').'); + return(undef); + } + + for my $count (0 .. $#{$ids}) { + + if ($position eq 'prefix') { + + $new_line .= ${$x_fixes_array}[$count] . ${$ids}[$count] . '|'; + + } elsif ($position eq 'suffix') { + + $new_line .= ${$ids}[$count] . ${$x_fixes_array}[$count] . '|'; + + } + } + + $new_line =~ s/\|$//; + if (defined($description)) { + $new_line .= ' ' . $description; + } + + return($new_line); + +} + +sub _DeleteID { + + my ($ids_to_delete, $ids, $description) = @_; + my $new_line = '>'; + + $new_line = '>'; + + for my $offset (0 .. $#{$ids}) { + + my $index = $offset + 1; + + if (defined(${$ids_to_delete}{$index})) { + + # Skip (drop) this ID. + $logger->debug('Dropping ' . ${$ids}[$offset] . ' as it is ID number ' . $index . '.'); + + } else { + + $new_line .= ${$ids}[$offset] . '|'; + + } + } + + $new_line =~ s/\|$//; + if (defined($description)) { + $new_line .= ' ' . $description; + } + + return($new_line); + +} + +sub _ShuffleID { + + my ($new_id_order, $ids, $description) = @_; + my $new_line = '>'; + + my $id_count = scalar(@{$ids}); + my $new_id_order_item_count = scalar(@{$new_id_order}); + + unless ($id_count == $new_id_order_item_count) { + $logger->fatal('Amount of IDs specified to re-order (' . $new_id_order_item_count . ') does not match with amount if IDs found (' . $id_count . ').'); + return(undef); + } + + $new_line = '>'; + + foreach my $rank (@{$new_id_order}) { + + my $offset = $rank - 1; + $logger->debug('ID rank ' . $rank . ' = ' . ${$ids}[$offset] . '.'); + $new_line .= ${$ids}[$offset] . '|'; + $logger->debug('New header line now contains ' . $new_line . '.'); + + } + + $new_line =~ s/\|$//; + if (defined($description)) { + $new_line .= ' ' . $description; + } + + return($new_line); + +} + +sub _Usage { + + print "\n"; + print "ConvertFastaHeaders.pl - Converts sequence headers of FASTA files.\n"; + print "\n"; + print "Usage:\n"; + print "\n"; + print " ConvertFastaHeaders.pl options\n"; + print "\n"; + print "Available options are:\n"; + print "\n"; + print " -i [dir/file] Input can be a single FASTA file or a directory containing FASTA files.\n"; + print " -e [ext] File name extension for the FASTA files in case the input is a directory. (default = fa)\n"; + print " -o [dir/file] Output file or directory where the result(s) will be saved.\n"; + print " -a [action] Action must be one of 'add', 'strip', 'replace', 'delete' or 'shuffle'.\n"; + print " The actions 'delete' and 'shuffle' operate on complete sequence IDs with or without (database namespace) prefixes or suffixes.\n"; + print " The actions 'add', 'strip' and 'replace' operate on sequence ID prefixes or suffixes.\n"; + print " Note in case *fixes are added the order of the *fixes is important! (See below for examples.)\n"; + print " -p [position] Positon must be a comma separated list of numbers in case the action is 'delete' or 'shuffle'.\n"; + print " Position must be one of 'prefix' or 'suffix' when the action is 'add' or 'strip'.\n"; + print " In case the action is 'replace' the position can also be one of pre2suf or suf2pre \n"; + print " to replace a prefix with a suffix or vice versa.\n"; + print " -f '[*fix1 *fix2 *fixN]' Space separated list of prefixes or suffixes, which will be replaced in, added to or removed from pipe separated identifiers.\n"; + print " Note that in case of database namespace prefixes you must specify both the database name space and \n"; + print " the character to separate the namespace from the accession number as the prefix. (See below for examples.) \n"; + print " -n '[*fix]' A single new prefix or suffix to replace the *fixes specified with -f.\n"; + print " (Only required in case the action is 'replace'.)\n"; + print " -l [LEVEL] Log4perl log level. One of: ALL, TRACE, DEBUG, INFO (default), WARN, ERROR, FATAL or OFF.\n"; + print "\n"; + print "Examples:\n"; + print "\n"; + print " Adding prefixes\n"; + print " In this case the order of the *fixes specified with -f is important!\n"; + print " With -a add -p prefix -f 'UniProtAcc: UniProtID:', this header:\n"; + print " >P32234|128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)\n"; + print " will be converted into:\n"; + print " >UniProtAcc:P32234|UniProtID:128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)\n"; + print " Stripping prefixes\n"; + print " In this case the order of the *fixes specified with -f is not relevant.\n"; + print " With both -a strip -p prefix -f 'UniProtAcc: UniProtID:' or \n"; + print " with -a strip -p prefix -f 'UniProtID: UniProtAcc:', this header:\n"; + print " >UniProtAcc:P32234|UniProtID:128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)\n"; + print " will be converted into:\n"; + print " >P32234|128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)\n"; + print " Replacing prefixes with a suffix\n"; + print " In this case the order of the *fixes specified with -f is not relevant.\n"; + print " With -a replace -p pre2suf -f 'REV_' -n '_REV', this header:\n"; + print " >REV_P32234|128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)\n"; + print " will be converted into:\n"; + print " >P32234_REV|128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)\n"; + print " Deleting sequence identifiers\n"; + print " Supply a comma separated list of numbers for the ranks of the identifiers / accession numbers you want to remove.\n"; + print " Multiple identifiers must be separated with a pipe symbol.\n"; + print " With -a delete -p '1,3', this header:\n"; + print " >UniProtID:128UP_DROME|UniProtAcc:P32234|EMBL:AY069810 GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)\n"; + print " will be converted into:\n"; + print " >UniProtAcc:P32234 GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)\n"; + print " Changing the order of sequence identifiers\n"; + print " Supply a comma separated list of numbers for the new order of all the identifiers / accession numbers in a header.\n"; + print " Multiple identifiers must be separated with a pipe symbol.\n"; + print " Hence if your headers contain 4 pipe separated IDs and you only want to swap the order of the first and the second, \n"; + print " you will still need to specify the new (unchanged) order for number 3 and 4 too.\n"; + print " With -a shuffle -p '2,1,3', this header:\n"; + print " >UniProtID:128UP_DROME|UniProtAcc:P32234|EMBL:AY069810 GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)\n"; + print " will be converted into:\n"; + print " >UniProtAcc:P32234|UniProtID:128UP_DROME|EMBL:AY069810 GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)\n"; + print " Specifying only *2,1* as the New order for the IDs will not work, because this header contains 3 IDs, \n"; + print " so you'll have to include the (new) position for the third one as well.\n"; + print "\n"; + exit; + +} diff -r 000000000000 -r 163892325845 ConvertFastaHeaders.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ConvertFastaHeaders.xml Fri May 10 17:15:08 2013 -0400 @@ -0,0 +1,241 @@ + + + Converts sequence headers of FASTA files + + #if $mode.action == "add" #ConvertFastaHeaders.pl -i $input -o $output -p $mode.pos.position -a $mode.action -f "$mode.pos.xfixes" -l ERROR + #elif $mode.action == "strip" #ConvertFastaHeaders.pl -i $input -o $output -p $mode.pos.position -a $mode.action -f "$mode.pos.xfixes" -l ERROR + #elif $mode.action == "replace" #ConvertFastaHeaders.pl -i $input -o $output -p $mode.pos.position -a $mode.action -f "$mode.pos.xfixes" -n $mode.pos.newfix -l ERROR + #elif $mode.action == "delete" #ConvertFastaHeaders.pl -i $input -o $output -p $mode.position -a $mode.action -l ERROR + #elif $mode.action == "shuffle" #ConvertFastaHeaders.pl -i $input -o $output -p $mode.position -a $mode.action -l ERROR + #end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +.. class:: infomark + +**What it does** + +This tool converts the sequence headers (the description line starting with >) for a set of FASTA sequences. \ +Currently supported conversions: + + - Append prefixes like for example database namespace to identifiers / accession numbers. + - Append suffixes to identifiers / accession numbers. + - Remove prefixes like for example database namespace from identifiers / accession numbers. + - Remove suffixes from identifiers / accession numbers. + - Replace prefixes with a suffix or vice versa. + +----- + +**Examples** + +======================================================== +*Appending database namespace prefixes* +======================================================== + +Supply a space separated list of prefixes to add to pipe separated identifiers. \ +The prefixes must contain both the database namespace you want to add and the character used to separate the namespace from the identifier. \ +The order of the namespaces is important in this case! \ +For example with action *Add Labels*, label position *Prepend Prefixes* and prefixes *UniProtAcc: UniProtID:*, this header:: + + >P32234|128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly) + +will be converted into:: + + >UniProtAcc:P32234|UniProtID:128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly) + +======================================================== +*Removing database namespace prefixes* +======================================================== + +Supply a space separated list of namespaces to remove from pipe separated identifiers. \ +The prefixes must contain both the database namespace you want to remove and the character used to separate the namespace from the identifier. \ +The order of the namespaces is not relevant in this case. \ +For example with action *Remove Labels*, label position *Strip Prefixes* and \ +with prefixes *UniProtAcc: UniProtID:* or with prefixes *UniProtID: UniProtAcc:*, this header:: + + >UniProtAcc:P32234|UniProtID:128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly) + +will be converted into:: + + >P32234|128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly) + +======================================================== +*Replacing a prefix with a suffix* +======================================================== + +Supply the prefix to remove from pipe separated identifiers and a new suffix. \ +The order of the prefixes is not relevant in this case. \ +Optionally you can specify more than one prefix to remove by separating them with spaces. \ +You can specify only one new suffix though, so in case you specified more than one prefix they will all be replaced with the same new suffix. \ +In the following example we'll replace a *REV_* prefix to indicate a reversed sequence with a *_REV* suffix. \ +With action *Replace Labels*, label position *Replace prefixes with a suffix*, prefix *REV_* and new suffix *_REV*, this header:: + + >REV_P32234|128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly) + +will be converted into:: + + >P32234_REV|128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly) + +======================================================== +*Other \*fix replacements* +======================================================== + +You can also replace a prefix with a new prefix, a suffix with a new suffix or a suffix with a new prefix. \ +These replacements are all similar to the example above. \ +Hence you can specify multiple \*-fixes to be replaced and only one new \*-fix to replace the existing ones. + +======================================================== +*Deleting sequence identifiers* +======================================================== + +Supply a comma separated list of numbers for the ranks of the identifiers / accession numbers you want to remove. \ +Multiple identifiers must be separated with a pipe symbol. \ +For example with action *Delete*, and Ranks of IDs to delete set to *1,3*, this header:: + + >UniProtID:128UP_DROME|UniProtAcc:P32234|EMBL:AY069810 GTP-binding protein 128up - Drosophila melanogaster (Fruit fly) + +will be converted into:: + + >UniProtAcc:P32234 GTP-binding protein 128up - Drosophila melanogaster (Fruit fly) + +======================================================== +*Changing the order of sequence identifiers* +======================================================== + +Supply a comma separated list of numbers for the new order of all the identifiers / accession numbers in a header. \ +Multiple identifiers must be separated with a pipe symbol. \ +Hence if your headers contain 4 pipe separated IDs and you only want to swap the order of the first and the second, \ +you will still need to specify the new (unchanged) order for number 3 and 4 too. + +For example with action *Shuffle*, and New order for the IDs set to *2,1,3*, this header:: + + >UniProtID:128UP_DROME|UniProtAcc:P32234|EMBL:AY069810 GTP-binding protein 128up - Drosophila melanogaster (Fruit fly) + +will be converted into:: + + >UniProtAcc:P32234|UniProtID:128UP_DROME|EMBL:AY069810 GTP-binding protein 128up - Drosophila melanogaster (Fruit fly) + +Specifying only *2,1* as the New order for the IDs will not work, because this header contains 3 IDs, \ +so you'll have to include the (new) position for the third one as well. + +======================================================== +*Need another type of conversion?* +======================================================== + +Contact your local bioinformaticians to add other conversions... + + + diff -r 000000000000 -r 163892325845 ExtractCleavageSiteSequenceContext.svg --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ExtractCleavageSiteSequenceContext.svg Fri May 10 17:15:08 2013 -0400 @@ -0,0 +1,997 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -r 000000000000 -r 163892325845 ExtractCleavageSiteSequenceContext.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ExtractCleavageSiteSequenceContext.xml Fri May 10 17:15:08 2013 -0400 @@ -0,0 +1,218 @@ + + + by mapping peptides back to proteins and fetching the regions surrounding the peptide termini. + ExtractPeptideSequenceContext.pl --db $db --dbf FASTA --f $fragments --icol $icol --pcol $pcol $strip --cleo $cleo --ca '$ca' --ct $ct --n $n --c $c --pc '$pc' --ll WARN + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +.. role:: raw-html(raw) + :format: html + +.. class:: infomark + +**What it does** + +Map peptide sequences back to proteins and extract sequence contexts for cleavage sites. + +:raw-html:`<object data="static/images/nbic_gmr/ExtractCleavageSiteSequenceContext.svg" type="image/svg+xml" width="100%"/>` + +=================================================== +*Peptide sequences and their protein's identifiers* +=================================================== + +This file must contain at least peptides and accession numbers or IDs of the proteins the peptides were derived from. \ +The data must be in TAB delimited format and may contain other columns, which will be preserved in the output. \ +If a sequence context was found, it will be appended in a new column to the right of the existing columns. \ +When another sequence context was found for the same peptide, it will appended as an extra row in the output. +Protein accession numbers / IDs must be in the same format as was used in the FASTA file with protein sequences (database). \ +The only exception to this rule is that accession numbers / IDs may be optionally suffixed with the peptide\'s position in its protein between brackets. \ +For example: CLH1_HUMAN[1612-1620] will be matched to CLH1_HUMAN in a FASTA file with protein sequences. \ +Amino acids in the petide sequences must be in uppercase. + +=============================================== +*Protein sequences* +=============================================== + +Input file containing all protein sequences in FASTA format. \ +This tool will look for any type of protein ID in the first part of FASTA sequence headers up until the first white space. \ +Optionally multiple IDs may be present separated with pipe symbols (|) or semicolons (;). \ +Optionally IDs may be prefixed with a database namespace and a colon (:). \ +For example the accession number P32234 as well as the ID 128UP_DROME would be recognized in both this sequence header: + + >UniProtAcc:P32234|UniProtID:128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly) + +and in this one: + + >P32234|128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly) + +=================================================== +*N-terminal and C-terminal sequence context length* +=================================================== + +Integers specifying the length of the N-terminal and C-terminal sequence context to retrieve starting from the modification site. \ +Note that the width of a cleavage site is 0 amino acids. \ +When defaults are used for both the N-terminal and C-terminal sequence context lengths, \ +the total sequence context length for a cleavage site will be: +(N-terminal sequence context) + (C-terminal sequence context) = 5 + 5 = 10. + +=============================================== +*Cleavage amino acid and terminus* +=============================================== + +This tool assumes the peptides were derived from cutting with a proteolytic enzyme, \ +that cuts on the *cleavage terminal* side of all *cleavage amino acids*. \ +When the specificity of the used protease is unknown, \ +you may provide an asterisk (*) to retrieve sequence context for any cleavage site, \ +but in that case this tool will not filter non-specifically cleaved fragments, \ +that may be the result of processes other than protease activity. + +=============================================== +*Padding character* +=============================================== + +Optional padding character to fill N-terminal or C-terminal positions in the sequence context, \ +when the protein was too short to get a complete sequence context. \ +Defaults to - a.k.a. dash or alignment gap character. \ + +----- + +**Getting input data** + +.. _my folder utility: http://mascotinternal.chem.uu.nl/mascot/cgi/uu_myfolder.pl + +This tool requires \ +peptide sequences in TAB delimited format and \ +protein sequences from which the peptides were derived in FASTA format. \ +If your peptide sequences are not in TAB delimited format, you can convert from: + + - FASTA format using *FASTA manipulation* -> *FASTA-to-Tabular* + - A format using a different delimiter using *Text Manipulation* -> *Convert* + +When your peptides were derived from a mass spectrometry experiment and identified with a search engine like Mascot, Sequest, etc.,\ +please make sure you provide the same FASTA database for this tool as the one used for your search. +If you used Mascot hosted by the Biomolecular Mass Spectrometry and Proteomics Group @ Utrecht University, \ +you can use the `my folder utility`_ to download the FASTA databases from the Mascot server. + +----- + +**Examples** + +Example input for peptides identified with a Mascot search, \ +some with phosphorylated residues indicated by pS, pT or pY \ +and in TAB delimited format:: + + sequence score peptide mr mass delta (abs) mass delta (ppm) all protein matches + AGNAARDN 54.24 787.357254 -4.223E-5 -0.05334300253998803 H2A1B_HUMAN[67-74]; H2A1C_HUMAN[67-74]; H2A1D_HUMAN[67-74] + KLpSAAVVLI 11.48 912.600784 0.001608 1.7619971713721432 OSGI2_HUMAN[405-413] + RAGIKVpTVA 23.01 913.570892 6.283E-5 0.06786555979719196 PARK7_HUMAN[28-36] + KGGVVGIKVD 44.61 970.581146 -0.001214 -1.2507970147608864 ALDOA_HUMAN[101-110] + KIKELQAF 11.87 975.575287 0.003907 4.004816493470687 MMP20_HUMAN[71-78] + KIpSGpTVNIR 57.17 986.587265 -0.002761 -2.798536022051734 SYTC_HUMAN[681-689] + KLpYEALKF 17.54 1010.580032 0.004782 4.731935966057164 F105A_HUMAN[238-245] + KLDApSEpSLR 31.31 1017.545441 -0.002377 -2.3360136110127785 CLH1_HUMAN[1612-1620] + +=============================================== +*Appending cleavage site sequence contexts* +=============================================== + +With these options: + + - K as *cleavage amino acid* + - N-terminal as *cleavage terminus* + - c6 as *Protein identifier column* + - c1 as *Peptide sequence column* + - a suitable FASTA database with *Protein sequences* + - and everything else set to defaults + +the example above will generate a result like this:: + + AGNAARDN 54.24 787.357254 -4.223E-5 -0.05334300253998803 H2A1B_HUMAN[67-74]; H2A1C_HUMAN[67-74]; H2A1D_HUMAN[67-74] AARDNKKTRI + KLpSAAVVLI 11.48 912.600784 0.001608 1.7619971713721432 OSGI2_HUMAN[405-413] LKKIFKLSAA + KGGVVGIKVD 44.61 970.581146 -0.001214 -1.2507970147608864 ALDOA_HUMAN[101-110] QVIKSKGGVV + KGGVVGIKVD 44.61 970.581146 -0.001214 -1.2507970147608864 ALDOA_HUMAN[101-110] GIKVDKGVVP + KIKELQAF 11.87 975.575287 0.003907 4.004816493470687 MMP20_HUMAN[71-78] NSMIRKIKEL + KIpSGpTVNIR 57.17 986.587265 -0.002761 -2.798536022051734 SYTC_HUMAN[681-689] VGEKEKISGT + KLpYEALKF 17.54 1010.580032 0.004782 4.731935966057164 F105A_HUMAN[238-245] AILEYKLYEA + KLDApSEpSLR 31.31 1017.545441 -0.002377 -2.3360136110127785 CLH1_HUMAN[1612-1620] LTKVDKLDAS + KLDApSEpSLR 31.31 1017.545441 -0.002377 -2.3360136110127785 CLH1_HUMAN[1612-1620] SESLRKEEEQ + + +Note the header line was ignored and if peptides were derived from specific LysN cleavage, they will occur twice in the output: \ +once with the sequence context for the peptide\'s N-terminus and once for its C-terminus. + + + diff -r 000000000000 -r 163892325845 ExtractMiscleavageSiteSequenceContext.svg --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ExtractMiscleavageSiteSequenceContext.svg Fri May 10 17:15:08 2013 -0400 @@ -0,0 +1,1357 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -r 000000000000 -r 163892325845 ExtractMiscleavageSiteSequenceContext.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ExtractMiscleavageSiteSequenceContext.xml Fri May 10 17:15:08 2013 -0400 @@ -0,0 +1,211 @@ + + + by mapping peptides back to proteins and fetching the regions surrounding missed cleavage sites. + ExtractPeptideSequenceContext.pl --db $db --dbf FASTA --f $fragments --icol $icol --pcol $pcol $strip --miso $miso --ca $ca --ct $ct --n $n --c $c --pc '$pc' --ll WARN + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +.. role:: raw-html(raw) + :format: html + +.. class:: infomark + +**What it does** + +Map peptide sequences back to proteins and extract sequence contexts for miscleavage sites. + +:raw-html:`<object data="static/images/nbic_gmr/ExtractMiscleavageSiteSequenceContext.svg" type="image/svg+xml" width="100%"/>` + +=================================================== +*Peptide sequences and their protein's identifiers* +=================================================== + +This file must contain at least peptides and accession numbers or IDs of the proteins the peptides were derived from. \ +The data must be in TAB delimited format and may contain other columns, which will be preserved in the output. \ +If a sequence context was found, it will be appended in a new column to the right of the existing columns. \ +When another sequence context was found for the same peptide, it will appended as an extra row in the output. +Protein accession numbers / IDs must be in the same format as was used in the FASTA file with protein sequences (database). \ +The only exception to this rule is that accession numbers / IDs may be optionally suffixed with the peptide\'s position in its protein between brackets. \ +For example: CLH1_HUMAN[1612-1620] will be matched to CLH1_HUMAN in a FASTA file with protein sequences. \ +Amino acids in the petide sequences must be in uppercase. + +=============================================== +*Protein sequences* +=============================================== + +Input file containing all protein sequences in FASTA format. \ +This tool will look for any type of protein ID in the first part of FASTA sequence headers up until the first white space. \ +Optionally multiple IDs may be present separated with pipe symbols (|) or semicolons (;). \ +Optionally IDs may be prefixed with a database namespace and a colon (:). \ +For example the accession number P32234 as well as the ID 128UP_DROME would be recognized in both this sequence header: + + >UniProtAcc:P32234|UniProtID:128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly) + +and in this one: + + >P32234|128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly) + +=================================================== +*N-terminal and C-terminal sequence context length* +=================================================== + +Integers specifying the length of the N-terminal and C-terminal sequence context to retrieve starting from the modification site. \ +Note that the width of a miscleavage site is 0 amino acids. \ +When defaults are used for both the N-terminal and C-terminal sequence context lengths, \ +the total sequence context length for a miscleavage site will be: +(N-terminal sequence context) + (C-terminal sequence context) = 5 + 5 = 10. + +=============================================== +*Cleavage amino acid and terminus* +=============================================== + +This tool assumes the peptides were derived from cutting with a proteolytic enzyme, \ +that should have cut on the *cleavage terminal* side of all *cleavage amino acids*. \ + +=============================================== +*Padding character* +=============================================== + +Optional padding character to fill N-terminal or C-terminal positions in the sequence context, \ +when the protein was too short to get a complete sequence context. \ +Defaults to - a.k.a. dash or alignment gap character. \ + +----- + +**Getting input data** + +.. _my folder utility: http://mascotinternal.chem.uu.nl/mascot/cgi/uu_myfolder.pl + +This tool requires \ +peptide sequences in TAB delimited format and \ +protein sequences from which the peptides were derived in FASTA format. \ +If your peptide sequences are not in TAB delimited format, you can convert from: + + - FASTA format using *FASTA manipulation* -> *FASTA-to-Tabular* + - A format using a different delimiter using *Text Manipulation* -> *Convert* + +When your peptides were derived from a mass spectrometry experiment and identified with a search engine like Mascot, Sequest, etc.,\ +please make sure you provide the same FASTA database for this tool as the one used for your search. +If you used Mascot hosted by the Biomolecular Mass Spectrometry and Proteomics Group @ Utrecht University, \ +you can use the `my folder utility`_ to download the FASTA databases from the Mascot server. + +----- + +**Examples** + +Example input for peptides identified with a Mascot search, \ +some with phosphorylated residues indicated by pS, pT or pY \ +and in TAB delimited format:: + + sequence score peptide mr mass delta (abs) mass delta (ppm) all protein matches + AGNAARDN 54.24 787.357254 -4.223E-5 -0.05334300253998803 H2A1B_HUMAN[67-74]; H2A1C_HUMAN[67-74]; H2A1D_HUMAN[67-74] + KLpSAAVVLI 11.48 912.600784 0.001608 1.7619971713721432 OSGI2_HUMAN[405-413] + RAGIKVpTVA 23.01 913.570892 6.283E-5 0.06786555979719196 PARK7_HUMAN[28-36] + KGGVVGIKVD 44.61 970.581146 -0.001214 -1.2507970147608864 ALDOA_HUMAN[101-110] + KIKELQAF 11.87 975.575287 0.003907 4.004816493470687 MMP20_HUMAN[71-78] + KIpSGpTVNIR 57.17 986.587265 -0.002761 -2.798536022051734 SYTC_HUMAN[681-689] + KLpYEALKF 17.54 1010.580032 0.004782 4.731935966057164 F105A_HUMAN[238-245] + KLDApSEpSLR 31.31 1017.545441 -0.002377 -2.3360136110127785 CLH1_HUMAN[1612-1620] + +=============================================== +*Appending miscleavage site sequence contexts* +=============================================== + +With these options: + + - K as the *amino acid* the protease should have recognized + - N-terminal as the side of the recognized amino where the protease should have cleaved. + - c6 as *Protein identifier column* + - c1 as *Peptide sequence column* + - a suitable FASTA database with *Protein sequences* + - and everything else set to defaults + +the example above will generate a result like this:: + + RAGIKVpTVA 23.01 913.570892 6.283E-5 0.06786555979719196 PARK7_HUMAN[28-36] RRAGIKVTVA + KGGVVGIKVD 44.61 970.581146 -0.001214 -1.2507970147608864 ALDOA_HUMAN[101-110] GVVGIKVDKG + KIKELQAF 11.87 975.575287 0.003907 4.004816493470687 MMP20_HUMAN[71-78] MIRKIKELQA + KLpYEALKF 17.54 1010.580032 0.004782 4.731935966057164 F105A_HUMAN[238-245] LYEALKFIML + +Note the header line was ignored and if peptides have more than one miscleavage site they will occur more than once in the output. + + + diff -r 000000000000 -r 163892325845 ExtractModificationSiteSequenceContext.svg --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ExtractModificationSiteSequenceContext.svg Fri May 10 17:15:08 2013 -0400 @@ -0,0 +1,887 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -r 000000000000 -r 163892325845 ExtractModificationSiteSequenceContext.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ExtractModificationSiteSequenceContext.xml Fri May 10 17:15:08 2013 -0400 @@ -0,0 +1,183 @@ + + + by mapping modified amino acids in peptides back to proteins and fetching the sequence surrounding the modified sites. + ExtractPeptideSequenceContext.pl --db $db --dbf FASTA --f $fragments --icol $icol --pcol $pcol --s --modo $modo --ma '$ma' --n $n --c $c --pc '$pc' --ll ERROR + + + + + + + + + + + + + + + + + + + + + + +.. role:: raw-html(raw) + :format: html + +.. class:: infomark + +**What it does** + +Map peptide sequences back to proteins and extract sequence contexts for modification sites. + +:raw-html:`<object data="static/images/nbic_gmr/ExtractModificationSiteSequenceContext.svg" type="image/svg+xml" width="100%"/>` + + +=================================================== +*Peptide sequences and their protein's identifiers* +=================================================== + +This file must contain at least peptides and accession numbers or IDs of the proteins the peptides were derived from. \ +The data must be in TAB delimited format and may contain other columns, which will be preserved in the output. \ +If a sequence context was found, it will be appended in a new column to the right of the existing columns. \ +When another sequence context was found for the same peptide, it will appended as an extra row in the output. +Protein accession numbers / IDs must be in the same format as was used in the FASTA file with protein sequences (database). \ +The only exception to this rule is that accession numbers / IDs may be optionally suffixed with the peptide\'s position in its protein between brackets. \ +For example: CLH1_HUMAN[1612-1620] will be matched to CLH1_HUMAN in a FASTA file with protein sequences. \ +Amino acids in the petide sequences must be in uppercase. + +=============================================== +*Protein sequences* +=============================================== + +Input file containing all protein sequences in FASTA format. \ +This tool will look for any type of protein ID in the first part of FASTA sequence headers up until the first white space. \ +Optionally multiple IDs may be present separated with pipe symbols (|) or semicolons (;). \ +Optionally IDs may be prefixed with a database namespace and a colon (:). \ +For example the accession number P32234 as well as the ID 128UP_DROME would be recognized in both this sequence header: + + >UniProtAcc:P32234|UniProtID:128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly) + +and in this one: + + >P32234|128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly) + +=================================================== +*N-terminal and C-terminal sequence context length* +=================================================== + +Integers specifying the length of the N-terminal and C-terminal sequence context to retrieve starting from the modification site. \ +Note that the width of a modification site is 1 amino acid. \ +When defaults are used for both the N-terminal and C-terminal sequence context lengths, \ +the total sequence context length for a modification site will be: +(N-terminal sequence context) + (modified amino acid) + (C-terminal sequence context) = 5 + 1 + 5 = 11. + +=============================================== +*Modified amino acid* +=============================================== + +The amino acid must be specified in uppercase and the modification in lower case. \ +The order is not important. \ +Hence a phophorylated serine in a peptide sequence can be indicated with either pS or Sp, \ +but you cannot mix both pS and Sp in a single peptide sequence file. \ +You may provide an asterisk (*) instead of an upper case amino acid to retrieve sequence contexts \ +for the specified modification no matter what amino acid it was located on. \ +A modification may be specified with more than one lower case character, \ +so for example phosphoS or Sphospho can also be used for a phosphorylated serine. + +=============================================== +*Padding character* +=============================================== + +Optional padding character to fill N-terminal or C-terminal positions in the sequence context, \ +when the protein was too short to get a complete sequence context. \ +Defaults to - a.k.a. dash or alignment gap character. \ + +----- + +**Getting input data** + +.. _my folder utility: http://mascotinternal.chem.uu.nl/mascot/cgi/uu_myfolder.pl + +This tool requires \ +peptide sequences in TAB delimited format and \ +protein sequences from which the peptides were derived in FASTA format. \ +If your peptide sequences are not in TAB delimited format, you can convert from: + + - FASTA format using *FASTA manipulation* -> *FASTA-to-Tabular* + - A format using a different delimiter using *Text Manipulation* -> *Convert* + +When your peptides were derived from a mass spectrometry experiment and identified with a search engine like Mascot, Sequest, etc.,\ +please make sure you provide the same FASTA database for this tool as the one used for your search. +If you used Mascot hosted by the Biomolecular Mass Spectrometry and Proteomics Group @ Utrecht University, \ +you can use the `my folder utility`_ to download the FASTA databases from the Mascot server. + +----- + +**Examples** + +Example input for peptides identified with a Mascot search, \ +some with phosphorylated residues indicated by pS, pT or pY \ +and in TAB delimited format:: + + sequence score peptide mr mass delta (abs) mass delta (ppm) all protein matches + AGNAARDN 54.24 787.357254 -4.223E-5 -0.05334300253998803 H2A1B_HUMAN[67-74]; H2A1C_HUMAN[67-74]; H2A1D_HUMAN[67-74] + KLpSAAVVLI 11.48 912.600784 0.001608 1.7619971713721432 OSGI2_HUMAN[405-413] + RAGIKVpTVA 23.01 913.570892 6.283E-5 0.06786555979719196 PARK7_HUMAN[28-36] + KGGVVGIKVD 44.61 970.581146 -0.001214 -1.2507970147608864 ALDOA_HUMAN[101-110] + KIKELQAF 11.87 975.575287 0.003907 4.004816493470687 MMP20_HUMAN[71-78] + KIpSGpTVNIR 57.17 986.587265 -0.002761 -2.798536022051734 SYTC_HUMAN[681-689] + KLpYEALKF 17.54 1010.580032 0.004782 4.731935966057164 F105A_HUMAN[238-245] + KLDApSEpSLR 31.31 1017.545441 -0.002377 -2.3360136110127785 CLH1_HUMAN[1612-1620] + +=============================================== +*Appending modification site sequence contexts* +=============================================== + +With these options: + + - p\* as *modified amino acid* + - c6 as *Protein identifier column* + - c1 as *Peptide sequence column* + - a suitable FASTA database with *Protein sequences* + - and everything else set to defaults + +the example above will generate a result like this:: + + KLpSAAVVLI 11.48 912.600784 0.001608 1.7619971713721432 OSGI2_HUMAN[405-413] KIFKLSAAVVL + RAGIKVpTVA 23.01 913.570892 6.283E-5 0.06786555979719196 PARK7_HUMAN[28-36] AGIKVTVAGLA + KIpSGpTVNIR 57.17 986.587265 -0.002761 -2.798536022051734 SYTC_HUMAN[681-689] EKEKISGTVNI + KIpSGpTVNIR 57.17 986.587265 -0.002761 -2.798536022051734 SYTC_HUMAN[681-689] EKISGTVNIRT + KLpYEALKF 17.54 1010.580032 0.004782 4.731935966057164 F105A_HUMAN[238-245] LEYKLYEALKF + KLDApSEpSLR 31.31 1017.545441 -0.002377 -2.3360136110127785 CLH1_HUMAN[1612-1620] DKLDASESLRK + KLDApSEpSLR 31.31 1017.545441 -0.002377 -2.3360136110127785 CLH1_HUMAN[1612-1620] LDASESLRKEE + +Note the header line was ignored, peptides like AGNAARDN without any modified amino acids are absent from the output \ +and peptides like KLDApSEpSLR with more than one modified amino acid occur more than once in the output. + + + diff -r 000000000000 -r 163892325845 ExtractPeptideSequenceContext.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ExtractPeptideSequenceContext.pl Fri May 10 17:15:08 2013 -0400 @@ -0,0 +1,1744 @@ +#!/usr/bin/perl -w +# +# This script extracts subsequences from UniProtKB *.dat files or from *.fasta files +# based on a list of accession numbers / IDs and a sequence fragment (peptide). +# +# ===================================================== +# $Id: ExtractPeptideSequenceContext.pl 112 2011-03-01 19:50:23Z pieter.neerincx@gmail.com $ +# $HeadURL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/ExtractPeptideSequenceContext.pl $ +# $LastChangedDate: 2011-03-01 13:50:23 -0600 (Tue, 01 Mar 2011) $ +# $LastChangedRevision: 112 $ +# $LastChangedBy: pieter.neerincx@gmail.com $ +# ===================================================== +# + +# +# initialise environment +# +use strict; +use Getopt::Long; +use SWISS::Entry; # For parsing UniProtKB *.dat files. +use Log::Log4perl qw(:easy); + +my $ps = '/'; # Path Separator. +my %log_levels = ( + 'ALL' => $ALL, + 'TRACE' => $TRACE, + 'DEBUG' => $DEBUG, + 'INFO' => $INFO, + 'WARN' => $WARN, + 'ERROR' => $ERROR, + 'FATAL' => $FATAL, + 'OFF' => $OFF, +); + +# +# List of databases, which may contain versioned accession numbers, +# and the separation character used to join the accession number and +# the version number. +# +my %databases_with_optionally_versioned_ids = ( + 'IPI' => '.', + 'ipi' => '.', + 'UniProtKB/Swiss-Prot' => '.', + 'sp' => '.', + 'SP' => '.', + 'UniProtKB/TrEMBL' => '.', + 'TR' => '.', + 'tr' => '.', + 'Tr' => '.' +); + +my $db; +my $db_alt; +my $db_format; +my $index_id; # Key used to lookup sequences. Can be ID or Accession number (default). +my $fragments; +my $id_column; +my $peptide_column; +my $strip_lc; # For the sequence fragments in the fragments file. With strip_lc enabled we + # assume amino acids are in upper case and lower case characters represent modifications, + # which will be stripped off before searching for the fragments in the complete sequences. +my $cleavage_output; # Sequence contexts for cleavage sites. +my $miscleavage_output; # Sequence contexts for miscleavage sites. +my $modification_output; # Sequence contexts for modification sites. +my $peptide_output; # Sequence contexts for complete peptides. +my $cleavage_aa; # Assume the sequence fragments were derived from cutting at this amino acid. +my $cleavage_terminus; # Assume the sequence fragments were derived from cutting at this side of $cut_at_aa +my $modified_aa; +my $n_term_context_length; +my $c_term_context_length; +my $miscleavage_distance; # Minimal distance between a putative miscleavage site and the peptide termini. + # Uncleaved AAs closer to the peptide termini can be ignored with this parameter: + # A. To prevent overlap between cleavage and miscleavage peptide sequence contexts. + # B. To remove false positive miscleavage sites that cannot be cleaved, + # because one of the resulting fragments is too short. Sometimes proteases may need + # a minimal length surrounding the cleavage site to bind and cleave a peptide. +my $padding_character; # Used for padding if the protein sequence is too short for a long enough context. +my $log_level; + +Getopt::Long::GetOptions ( + 'db=s' => \$db, + 'dba:s' => \$db_alt, + 'dbf:s' => \$db_format, + 'k:s' => \$index_id, # Key used to lookup sequences. Can be ID or Accession number (default). + 'f=s' => \$fragments, + 'icol:i' => \$id_column, + 'pcol:i' => \$peptide_column, + 's' => \$strip_lc, # For the sequence fragments in the fragments file. With strip_lc enabled we + # assume amino acids are in upper case and lower case characters represent modifications, + # which will be stripped off before searching for the fragments in the complete sequences. + 'cleo:s' => \$cleavage_output, + 'miso:s' => \$miscleavage_output, + 'modo:s' => \$modification_output, + 'pepo:s' => \$peptide_output, + 'ca:s' => \$cleavage_aa, # Assume the sequence fragments were derived from cutting at this amino acid. + 'ct:s' => \$cleavage_terminus, # Assume the sequence fragments were derived from cutting at this side of $cut_at_aa + 'ma:s' => \$modified_aa, + 'n:i' => \$n_term_context_length, + 'c:i' => \$c_term_context_length, + 'mcd:i' => \$miscleavage_distance, # Minimal distance between a putative miscleavage site and the peptide termini. + # Uncleaved AAs closer to the peptide termini can be ignored with this parameter: + # A. To prevent overlap between cleavage and miscleavage peptide sequence contexts. + # B. To remove false positive miscleavage sites that cannot be cleaved, + # because one of the resulting fragments is too short. Sometimes proteases may need + # a minimal length surrounding the cleavage site to bind and cleave a peptide. + 'pc:s' => \$padding_character, + 'll:s' => \$log_level +); + +# +# TODO: 1. Handle --mcd param. Maybe should be extended to exclude also modification sites too close together... +# + +# +# Configure logging. +# +# Provides default if user did not specify log level: +$log_level = (defined($log_level) ? $log_level : 'INFO'); +# Reset log level to default if user specified illegal log level. +$log_level = (defined($log_levels{$log_level}) ? $log_levels{$log_level} : $log_levels{'INFO'}); +#Log::Log4perl->init('log4perl.properties'); +Log::Log4perl->easy_init( + #{ level => $log_level, + # file => ">>ExtractPeptideSequenceContext.log", + # layout => '%F{1}-%L-%M: %m%n' }, + { level => $log_level, + file => "STDOUT", + layout => '%d L:%L %p> %m%n' }, +); +my $logger = Log::Log4perl::get_logger(); + +# +# Check user input. +# +foreach my $input ($db, $db_alt, $fragments, $cleavage_output, $miscleavage_output, $modification_output) { + if (defined($input) && length($input) <= 0) { + $input = undef(); + } +} +unless (defined($db) && + defined($fragments) && + (defined($cleavage_output) || defined($miscleavage_output) || defined($modification_output) || defined($peptide_output)) +) { + $logger->fatal('You need to specify at least a DB & fragments file as inputs and at least one output file.'); + _Usage(); + exit; +} +if (defined($db_format)) { + unless ($db_format eq 'UniProtKB DAT' or $db_format eq 'FASTA') { + $logger->fatal('Database format in unsupported format.'); + $logger->fatal("\t" . 'Must be \'UniProtKB DAT\' or \'FASTA\'.'); + $logger->fatal("\t" . 'But --dbf was ' . $db_format . '.'); + exit; + } +} +if (defined($cleavage_aa)) { + unless ($cleavage_aa =~ m/^[*A-Z]$/) { + $logger->fatal('Cleavage amino acid in unsupported format.'); + $cleavage_aa = undef; + } +} +if (defined($cleavage_terminus)) { + unless ($cleavage_terminus =~ m/^[*CN]$/) { + $logger->fatal('Cleavage terminus in unsupported format.'); + $cleavage_terminus = undef; + } +} +if (defined($modified_aa)) { + unless ($modified_aa =~ m/^[*A-Z][a-z]+$/ || $modified_aa =~ m/^[a-z]+[*A-Z]$/) { + $logger->fatal('Modified amino acid in unsupported format.'); + $modified_aa = undef; + } +} +# +# We need info to extract: +# * cleavage site sequence contexts or +# * modification site sequence contexts or +# * peptide sequence contexts +# +if (defined($cleavage_output) || defined($miscleavage_output)) { + unless (defined($cleavage_aa) && defined($cleavage_terminus)) { + $logger->fatal('If you want to retrieve (mis)cleavage site sequence contexts, you must provide valid values for --ca and --ct.'); + $logger->debug('You specified --ca ' . $cleavage_aa . ' --ct ' . $cleavage_terminus . '.'); + _Usage(); + exit; + } +} +if (defined($modification_output)) { + unless (defined($modified_aa)) { + $logger->fatal('If you want to retrieve modifications site sequence contexts you must provide a valid value for --ma.'); + $logger->debug('You specified --ma ' . $modified_aa . '.'); + _Usage(); + exit; + } +} +if (defined($peptide_output)) { + # + # We can work with defaults for all params. + # +} + +# Make sure we don't overwrite the input. +foreach my $output ($cleavage_output, $miscleavage_output, $modification_output, $peptide_output) { + foreach my $input ($db, $db_alt, $fragments) { + if (defined($input) && defined($output) && $output eq $input) { + $logger->fatal('Output file ' . $output . ' is the same as input file ' . $input . '.'); + $logger->fatal("\t" . 'Please choose a different file for the output.'); + exit; + } + } +} +# Define defaults for optional arguments. +$index_id = (defined($index_id) && length($index_id) > 0 ? $index_id : 'AC'); +$id_column = (defined($id_column) && length($id_column) > 0 ? $id_column : 1); +$peptide_column = (defined($peptide_column) && length($peptide_column) > 0 ? $peptide_column : 2); +$n_term_context_length = (defined($n_term_context_length) && length($n_term_context_length) > 0 ? $n_term_context_length : 5); +$c_term_context_length = (defined($c_term_context_length) && length($c_term_context_length) > 0 ? $c_term_context_length : 5); +$padding_character = (defined($padding_character) ? $padding_character : '-'); +# Stripping of lowercase characters is disabled by default unless we search for modification site contexts in which case it must be enabled! +$strip_lc = (defined($strip_lc) ? $strip_lc : 0); +$strip_lc = (defined($modified_aa) ? 1 : $strip_lc); +unless (defined($miscleavage_distance) && $miscleavage_distance > 0) { + if ($n_term_context_length >= $c_term_context_length) { + $miscleavage_distance = $n_term_context_length; + } else { + $miscleavage_distance = $c_term_context_length; + } +} + + +# +# Check db files and fragments file. +# +unless (-e $db && -f $db && -r $db) { + $logger->fatal('Cannot read from database input file ' . $db . ': ' . $!); + exit; +} +if (defined($db_alt)) { + unless (-e $db_alt && -f $db_alt && -r $db_alt) { + $logger->fatal('Cannot read from alternative database input file ' . $db_alt . ': ' . $!); + exit; + } +} +unless (-e $fragments && -f $fragments && -r $fragments) { + $logger->fatal('Cannot read from fragments input file ' . $fragments . ': ' .$!); + exit; +} + +# +# Create lookup table(s) of the entire database input file(s). +# +my @db_lookup_table_refs; +(my $db_seqs, $index_id) = _CreateLookupHash($db, $index_id, $db_format); +my $seq_count = scalar(keys(%{$db_seqs})); +$logger->info('Number of sequences in database lookup table: ' . $seq_count); +push(@db_lookup_table_refs, $db_seqs); +# Optional backup DB file. +if (defined($db_alt)) { + (my $db_seqs_alt, $index_id) = _CreateLookupHash($db_alt, $index_id, $db_format); + my $seq_count_alt = scalar(keys(%{$db_seqs_alt})); + $logger->info('Number of sequences in backup database lookup table: ' . $seq_count_alt); + push(@db_lookup_table_refs, $db_seqs_alt); +} + +# +# Debugging loop +# +#foreach my $keyyy (keys(%{$db_seqs})) { +# $logger->fatal('Key: ' . $keyyy); +# $logger->fatal("\t" . 'AC: ' . ${$db_seqs}{$keyyy}{'AC'}); +# $logger->fatal("\t" . 'ID: ' . ${$db_seqs}{$keyyy}{'ID'}); +# $logger->fatal("\t" . 'SQ: ' . ${$db_seqs}{$keyyy}{'SQ'}); +#} + +# +# Lookup sequence fragment contexts and store results. +# +my ($counter_cleavages, $counter_miscleavages, $counter_modifications, $counter_peptides) = _ExtractSeqs(\@db_lookup_table_refs, + $fragments, $strip_lc, $id_column, $peptide_column, + $cleavage_output, $miscleavage_output, $modification_output, $peptide_output, + $cleavage_aa, $cleavage_terminus, $miscleavage_distance, + $n_term_context_length, $c_term_context_length, + $padding_character); +# +# Report result stats. +# +my $max_length = 0; +foreach my $counter ($counter_cleavages, $counter_miscleavages, $counter_modifications, $counter_peptides) { + $max_length = (length($counter) > $max_length) ? length($counter) : $max_length; +} +my $prefixed_counter = sprintf('%*s', $max_length, $counter_cleavages); +$logger->info('Extracted cleavage site sequence contexts: ' . $prefixed_counter . '.'); +$prefixed_counter = sprintf('%*s', $max_length, $counter_miscleavages); +$logger->info('Extracted miscleavage site sequence contexts: ' . $prefixed_counter . '.'); +$prefixed_counter = sprintf('%*s', $max_length, $counter_modifications); +$logger->info('Extracted modification site sequence contexts: ' . $prefixed_counter . '.'); +$prefixed_counter = sprintf('%*s', $max_length, $counter_peptides); +$logger->info('Extracted peptide sequence contexts: ' . $prefixed_counter . '.'); +$prefixed_counter = sprintf('%*s', $max_length, $counter_cleavages + $counter_miscleavages + $counter_modifications + $counter_peptides); +$logger->info('Total extracted sequence contexts: ' . $prefixed_counter . '.'); +$logger->info('Finished!'); + +# +## +### Internal subs. +## +# + +sub _CreateLookupHash { + + my ($file_path, $index_id, $file_type) = @_; + + my $file_path_fh; + my %seqs; + + if (defined($file_type)) { + $logger->info('Parsing ' . $file_type . ' file ' . $file_path . '...'); + } elsif ($file_path =~ m/($ps?)([^\.]+)\.dat$/i) { + $file_type = 'UniProtKB DAT'; + $logger->info('Parsing ' . $file_type . ' file ' . $file_path . '...'); + } elsif ($file_path =~ m/($ps?)([^\.]+)\.fa(sta)?$/i) { + $file_type = 'FASTA'; + # For FASTA files we don't know what to expect: + # IDs or ACcession numbers or a mix of both. + # (Re)set index_id type to AC to prevent adding redundant IDs to the results. + # Hence treat all identifiers stable and unstable as ACs. + # (Normally we default to ACcession numbers and if we would have found an AC + # and the input fragments file was using IDs, we would append the AC to the results.) + $index_id = 'AC'; + $logger->info('Parsing ' . $file_type . ' file ' . $file_path . '...'); + } else { + $file_type = 'FASTA'; + $index_id = 'AC'; + $logger->warn('Could not determine database file type based on the file name extension of ' . $file_path); + $logger->warn('Will assume it is a FASTA file and hope that is correct...'); + $logger->info('Parsing ' . $file_type . ' file ' . $file_path . '...'); + } + + if ($file_type eq 'UniProtKB DAT') { + + # + # Read an entire record delimited by // at a time. + # + local $/ = "\/\/\n"; + + eval { + open($file_path_fh, "<$file_path"); + }; + if ($@) { + $logger->fatal('Cannot read from database input file: ' . $@); + exit; + } + + while (<$file_path_fh>){ + + $logger->trace('Record:' . "\n$_\n"); + + my $entry = SWISS::Entry->fromText($_); + + my $id = $entry->ID; + my $acc = $entry->AC; + my $seq = $entry->SQ; + my $index_key; + + if ($index_id eq 'ID') { + $index_key = $id; + } elsif ($index_id eq 'AC') { + $index_key = $acc; + } else { + $logger->fatal('Unknown type of identifier used for indexing: ' . $index_id . '. Must be ID or AC.'); + exit; + } + + $logger->debug('Found accession ' . $acc); + $logger->trace('Found accession ' . $acc . ' with ID ' . $id . ' and seq ' . $seq); + + $seqs{$index_key}{'AC'} = $acc; + $seqs{$index_key}{'ID'} = $id; + $seqs{$index_key}{'SQ'} = $seq; + + } + + } elsif ($file_type eq 'FASTA') { + + my @header_ids = (); + my $seq = ''; + + eval { + open($file_path_fh, "<$file_path"); + }; + if ($@) { + $logger->fatal('Cannot read from database input file: ' . $@); + exit; + } + + LINE: while (my $line = <$file_path_fh>) { + + if ($line =~ m/^[\s\n\r]+$/) { + + # + # Skip blank line + # + $logger->debug('Found empty line in FASTA file.'); + next LINE; + + } elsif ($line =~ m/^[a-zA-Z]+/) { + + # + # It's a sequence line + # Remove all white space and new line characters. + # + $logger->debug('Found FASTA sequence line: ' . $line); + + $line =~ s/[\s\n\r]+//g; + $seq .= $line; + + } elsif ($line =~ m/^>/) { + + $logger->debug('Found new FASTA header line: ' . $line); + + if ((length($seq) > 0) && defined($header_ids[0])) { + + # + # Store the previously found sequence in the lookup table. + # + foreach my $header_id (@header_ids) { + + # + # We don't know if the ID we found is an accession number (stable ID) or a normal (unstable) ID + # -> Fill both the AC and ID fields with the same ID value. + # + $seqs{$header_id}{'AC'} = $header_id; + $seqs{$header_id}{'ID'} = $header_id; + $seqs{$header_id}{'SQ'} = $seq; + + $logger->debug('Stored ' . $header_id . ':'); + $logger->debug("\t" . 'SQ ' . $seq); + + } + + # + # Reset vars. + # + $seq = ''; + @header_ids = (); + + } + + # + # Check for the presence of some frequently occurring naming schemes: + # + # >IPI:CON_Trypsin|SWISS-PROT:P00761|TRYP_PIG Trypsin - Sus scrofa (Pig). + # >IPI:CON_IPI00174775.2|TREMBL:Q32MB2;Q86Y46 Tax_Id=9606 Gene_Symbol=KRT73 Keratin-73 + # >sp|Q42592|APXS_ARATH L-ascorbate peroxidase S, chloroplastic/mitochondrial; + # >jgi|Triad1|1|gw1.3.1.1 + # + # The FASTA header line can basically have any format. + # Therefore we try to see if the IDs we are looking for are present anywhere + # in the header line up until the first white space or new line. + # This should prevent matching annotation from other proteins in the description + # like a for example a 'best' BLAST hit that contains one of the IDs we + # are looking for. Hence, in such a case this sequence is *not* the one of + # the IDs we looking for, but similar to that one at best. + # + my @unfiltered_ids; + if ($line =~ m/^>((([^\s;|]+)[;|])*([^\s;|]+))[|;]?/i) { + + # + # Got multiple IDs + # + my $multiple_ids = $1; + @unfiltered_ids = split(/[\s;|]+/, $multiple_ids); + + } elsif ($line =~ m/^>([^\s]+)/i){ + + # + # Got a single ID + # + my $single_id = $1; + push(@unfiltered_ids, $single_id); + + } + + KEY:foreach my $unfiltered_id (@unfiltered_ids) { + + my $this_id = $unfiltered_id; + + # + # Check for RockerBox formattted IDs. + # + # Example of RockerBox ID format, which may contain + # * multiple IDs separated by semi-colons and + # * suffixed with the position of the peptide in the protein: + # YGR192C[123-142]; YJR009C[123-142] + if ($this_id =~ m/^([^\[\]]+)\[\d+-\d+\]$/i) { + $logger->trace('Protein ID in RockerBox format: ' . $this_id); + $this_id = $1; + $logger->trace("\t" . 'Using ID : ' . $this_id); + } + + # + # Check for DB namespace prefixes. + # + if ($this_id =~ m/([^:]+):([^:]+)/i) { + + # + # Namespace prefixed ID. + # + $logger->trace('Namespace prefixed ID: ' . $this_id); + + my $prefix = $1; + $this_id = $2; + + $logger->trace("\t" . 'Using ID: ' . $this_id); + + # + # For a selected list of "known" database namespace prefixes, + # check for versioned accession numbers and remove the version number. + # + if (defined($prefix) && defined($databases_with_optionally_versioned_ids{$prefix})) { + $logger->trace('Found prefix: ' . $prefix); + my $separator = $databases_with_optionally_versioned_ids{$prefix}; + $separator = '\\' . $separator; + if ($this_id =~ m/([^$separator]+)$separator\d+$/) { + $this_id = $1; + } + } + } + + $logger->debug('Found ID: ' . $this_id); + push(@header_ids, $this_id); + + } + +# if ($line =~ m/^>((([^\s\n\r:;|]+)[:]([^\s\n\r:|]+)[|;])*([^\s\n\r:;|]+)[:]([^\s\n\r:|]+))[|;]?(\s+(.+))?/i) { +# +# # +# # One or more namespace prefixed IDs. +# # +# my $concatenated_namespace_prefixed_ids = $1; +# $logger->debug('Found prefixed IDs in header: ' . $concatenated_namespace_prefixed_ids); +# +# # database_namespace = $3 && $5; +# # id = $4 && $6; +# # description = $8; +# my @namespace_prefixed_ids = split(/[|;]/, $concatenated_namespace_prefixed_ids); +# +# foreach my $prefixed_id (@namespace_prefixed_ids) { +# +# $logger->trace('Prefixed ID: ' . $prefixed_id); +# +# if ($prefixed_id =~ m/(([^\s\n\r:;|]+):)?([^\s\n\r:|]+)/i) { +# +# my $this_prefix = $2; +# my $this_id = $3; +# $logger->debug('Found ID: ' . $this_id); +# push(@header_ids, $this_id); +# +# # +# # For a selected list of "known" database namespace prefixes, +# # check for versioned accession numbers and - if found - add +# # both the versioned and the unversioned accession number +# # to the list of header IDs. +# # +# if (defined($this_prefix) && defined($databases_with_optionally_versioned_ids{$this_prefix})) { +# $logger->trace('Found prefix: ' . $this_prefix); +# my $separator = $databases_with_optionally_versioned_ids{$this_prefix}; +# $separator = '\\' . $separator; +# if ($this_id =~ m/([^$separator]+)$separator\d+$/) { +# my $this_unversioned_id = $1; +# $logger->debug('Found unversioned ID: ' . $this_unversioned_id); +# push(@header_ids, $this_unversioned_id); +# } +# } +# +# } else { +# +# $logger->warn( +# 'This should have been an optionally prefixed ID, ' +# . 'but failed to match the corresponding regex: ' +# . $prefixed_id); +# +# } +# } +# +# } elsif ($line =~ m/^>((([^\s\n\r:;|]+)[|;])*([^\s\n\r:;|]+))[|;]?(\s+(.+))?/i) { +# +# # +# # One or more unprefixed IDs. +# # +# my $concatenated_ids = $1; +# $logger->debug('Found unprefixed IDs in header: ' . $concatenated_ids); +# +# # id = $3 && $4; +# # description = $7; +# my @unprefixed_ids = split(/[|;]/, $concatenated_ids); +# +# foreach my $unprefixed_id (@unprefixed_ids) { +# +# $logger->debug('Found ID: ' . $unprefixed_id); +# push(@header_ids, $unprefixed_id); +# +# } +# +# } else { +# +# # +# # Something else. +# # +# # The FASTA header line can basically have any format, +# # so this is probably one of the less frequent occurring annotation schemes. +# # Therefore we try to see if the IDs we are looking for are present anywhere +# # in the header line up until the first white space or new line. +# # This may be tricky as we might match other annotation from other proteins +# # like a description from a 'best' BLAST hit that contains one of the IDs we +# # are looking for. Hence, in such a case this sequence is *not* the one of +# # the IDs we looking for, but similar to that one at best. +# # +# if ($line =~ m/>([^\n\r\s]+)/) { +# +# my $putative_id = $1; +# $logger->debug('Found putative ID in header: ' . $putative_id); +# push(@header_ids, $putative_id); +# +# } else { +# $logger->warn('Cannot identify IDs in this header: ' . $line); +# } +# } + } + } + + # + # Store the last found sequence in the lookup table. + # + foreach my $header_id (@header_ids) { + + # + # We don't know if the ID we found is an accession number (stable ID) or a normal (unstable) ID + # -> Fill both the AC and ID fields with the same ID value. + # + $seqs{$header_id}{'AC'} = $header_id; + $seqs{$header_id}{'ID'} = $header_id; + $seqs{$header_id}{'SQ'} = $seq; + + $logger->debug('Stored ' . $header_id . ':'); + $logger->debug("\t" . 'SQ ' . $seq); + + } + + # + # Reset vars. + # + $seq = ''; + @header_ids = (); + + } + + close($file_path_fh); + $logger->info('Created lookup list for database sequences.'); + + return (\%seqs, $index_id); + +} + +sub _ExtractSeqs { + + my ($db_lookup_table_refs, + $path_from, $strip_lc, $id_column, $peptide_column, + $cleavage_path_to, $miscleavage_path_to, $modification_path_to, $peptide_path_to, + $cleavage_aa, $cleavage_terminus, $miscleavage_distance, + $n_term_context_length, $c_term_context_length, + $padding_character) = @_; + + $logger->debug('Parsing ' . $path_from . '...'); + + my $extracted_seq_contexts_cle = 0; + my $extracted_seq_contexts_mis = 0; + my $extracted_seq_contexts_mod = 0; + my $extracted_seq_contexts_pep = 0; + my $path_from_fh; + my $cleavage_path_to_fh; + my $miscleavage_path_to_fh; + my $modification_path_to_fh; + my $peptide_path_to_fh; + + eval { + open($path_from_fh, "<$path_from"); + }; + if ($@) { + $logger->fatal('Cannot read from fragments input file: ' . $@); + exit; + } + if (defined($cleavage_path_to)) { + eval { + open($cleavage_path_to_fh, ">$cleavage_path_to"); + }; + if ($@) { + $logger->fatal('Cannot write to output file: ' . $@); + exit; + } + } + if (defined($miscleavage_path_to)) { + eval { + open($miscleavage_path_to_fh, ">$miscleavage_path_to"); + }; + if ($@) { + $logger->fatal('Cannot write to output file: ' . $@); + exit; + } + } + if (defined($modification_path_to)) { + eval { + open($modification_path_to_fh, ">$modification_path_to"); + }; + if ($@) { + $logger->fatal('Cannot write to output file: ' . $@); + exit; + } + } + if (defined($peptide_path_to)) { + eval { + open($peptide_path_to_fh, ">$peptide_path_to"); + }; + if ($@) { + $logger->fatal('Cannot write to output file: ' . $@); + exit; + } + } + + LINE:while (my $line = <$path_from_fh>) { + + # + # Remove (trailing) line end characters! + # + $line =~ s/[\n\r\f]+//g; + $logger->trace($line); + + my @column_values = split(/\t/, $line); + + # For debugging only. + #foreach my $val (@column_values) { + # $logger->debug('Col val: ' . $val); + #} + + my $id_column_index = $id_column - 1; + my $peptide_column_index = $peptide_column - 1; + my $unfiltered_key = $column_values[$id_column_index]; + my $key; + my @keys; + my $peptide = $column_values[$peptide_column_index]; + my $peptide_nonmod; + my $protein; + my $acc; + my $id; + my $index = -1; # An index < 0 indicates the peptide was not found in a protein sequence (yet). + my $length; + my $cleavage_peptide_contexts = []; + my $miscleavage_peptide_contexts = []; + my $modification_peptide_contexts = []; + my $peptide_contexts = []; + + # + # Check if $key and $peptide were present in this line. + # If not this may be a leading/trailing empty or meta-data line, + # that does not contain the correct amount of columns. + # + unless (defined($unfiltered_key) && defined($peptide)) { + $logger->warn('Skipping line: ' . $line); + $logger->warn("\t" . '(Peptide sequence and/or protein ID missing or incorrect amount of columns.)'); + next LINE; + } + + # + # Sanity check for peptide sequences. + # + unless ($peptide =~ m/([a-zA-Z]+)/) { + $logger->warn('Fragment file line in unexpected format. Cannot find peptide in: ' . $line); + $logger->debug('$peptide contains: ' . $peptide); + next LINE; + } + + if ($strip_lc) { + # Remove lower case characters representing modifications. + $peptide_nonmod = $peptide; + $peptide_nonmod =~ s/[a-z]+//g; + } else { + # Convert everything to uppercase. + $peptide_nonmod = uc($peptide); + } + + # + # Figure out in what format the Protein ID(s) were supplied. + # + # * Strip off optional DB namespace prefixes. + # * Handle optional accession number version suffixes. + # + if ($unfiltered_key =~ m/^((([^\s;|]+)[\s;|]+)*([^\s;|]+))[\s|;]?/i) { + + # + # Got multiple IDs + # + my $multiple_ids = $1; + @keys = split(/[\s;|]+/, $multiple_ids); + + } else { + + # + # Got a single ID + # + push(@keys, $unfiltered_key); + + } + + KEY:foreach $unfiltered_key (@keys) { + + $key = $unfiltered_key; + + # + # Check for RockerBox formattted IDs. + # + # Example of RockerBox ID format, which may contain + # * multiple IDs separated by semi-colons and + # * suffixed with the position of the peptide in the protein: + # YGR192C[123-142]; YJR009C[123-142] + if ($key =~ m/^([^\[]+)\[\d+-\d+\]$/i) { + $logger->trace('Protein ID in RockerBox format: ' . $key); + $key = $1; + $logger->trace("\t" . 'Using ID : ' . $key); + } + + # + # Check for DB namespace prefixes. + # + if ($key =~ m/([^:]+):([^:]+)/i) { + + # + # Namespace prefixed ID. + # + $logger->trace('Namespace prefixed ID: ' . $key); + + my $prefix = $1; + $key = $2; + + $logger->trace("\t" . 'Using ID: ' . $key); + + # + # For a selected list of "known" database namespace prefixes, + # check for versioned accession numbers and remove the version number. + # + if (defined($prefix) && defined($databases_with_optionally_versioned_ids{$prefix})) { + $logger->trace('Found prefix: ' . $prefix); + my $separator = $databases_with_optionally_versioned_ids{$prefix}; + $separator = '\\' . $separator; + if ($key =~ m/([^$separator]+)$separator\d+$/) { + $key = $1; + } + } + } + + $logger->debug('Found ID: ' . $key); + + # + # Sanity check for Protein ID. + # + unless ($key =~ m/^([A-Z0-9\._-]+)$/i) { + $logger->warn('Fragment file line in unexpected format. Cannot find sequence ID in:'); + $logger->warn("\t" . $line); + $logger->debug('$key contains: ' . $key); + next LINE; + } + + # + # Lookup info for the protein this peptide was derived from. + # + ($protein, $acc, $id, $index) = _LookupProteinInfo($key, $peptide_nonmod, $db_lookup_table_refs); + + # + # Check if the peptide (fragment) was found in this protein sequence. + # + if (defined($protein)) { + if ($index > -1) { + $logger->trace('Found peptide sequence in protein ' . $key); + last KEY; + } else { + $logger->warn('Cannot find peptide (' . $peptide_nonmod . ') in protein ' . $key); + $logger->warn("\t" . 'Will try other protein IDs for this peptide if available.'); + } + } else { + $logger->warn('Cannot find protein identifier ' . $key . ' in any of the supplied databases.'); + $logger->warn("\t" . 'Will try other protein IDs for this peptide if available.'); + } + } + + # + # Check if the peptide (fragment) was found in any of the associated protein sequences. + # + if (defined($protein)) { + + $logger->debug('Found protein ' . $key); + + if ($index > -1) { + + $logger->debug('Found peptide (' . $peptide_nonmod . ') in protein ' . $key); + + if (defined($cleavage_path_to)) { + ($cleavage_peptide_contexts) = _GetCleavageSequenceContexts($peptide_nonmod, $protein, $key, $index, + $cleavage_aa, $cleavage_terminus); + } + if (defined($miscleavage_path_to)) { + ($miscleavage_peptide_contexts) = _GetMiscleavageSequenceContexts($peptide_nonmod, $protein, $key, $index, + $cleavage_aa, $cleavage_terminus, $miscleavage_distance); + } + if (defined($modification_path_to)) { + ($modification_peptide_contexts) = _GetModificationSequenceContexts($peptide_nonmod, $protein, $key, $index, + $modified_aa, $peptide); + } + if (defined($peptide_path_to)) { + ($peptide_contexts) = _GetPeptideSequenceContexts($peptide_nonmod, $protein, $key, $index); + } + } else { + $logger->error('Cannot find peptide (' . $peptide_nonmod . ') in protein ' . $key); + $acc = 'NA'; + } + } else { + $logger->error('Cannot find protein "' . $unfiltered_key . '" in any of the supplied databases.'); + $acc = 'NA'; + } + + # + # Append the peptide contexts to the existing lines of the fragment file and save that new line to the output file. + # + $extracted_seq_contexts_cle = _SaveSequenceContexts($line, $acc, $cleavage_peptide_contexts, + $cleavage_path_to_fh, $extracted_seq_contexts_cle); + + $extracted_seq_contexts_mis = _SaveSequenceContexts($line, $acc, $miscleavage_peptide_contexts, + $miscleavage_path_to_fh, $extracted_seq_contexts_mis); + + $extracted_seq_contexts_mod = _SaveSequenceContexts($line, $acc, $modification_peptide_contexts, + $modification_path_to_fh, $extracted_seq_contexts_mod); + + $extracted_seq_contexts_pep = _SaveSequenceContexts($line, $acc, $peptide_contexts, + $peptide_path_to_fh, $extracted_seq_contexts_pep); + + } + + # + # Close file handles. + # + foreach my $fh ($path_from_fh, $cleavage_path_to_fh, $miscleavage_path_to_fh, $modification_path_to_fh, $peptide_path_to_fh) { + if (defined($fh)) { + close($fh); + } + } + + return($extracted_seq_contexts_cle, $extracted_seq_contexts_mis, $extracted_seq_contexts_mod, $extracted_seq_contexts_pep); + +} + +sub _LookupProteinInfo { + + my ($key, $peptide, $db_lookup_table_refs) = @_; + + my $protein_found_in_db = 0; + my $protein; + my $acc; + my $id; + my $index; + + DB_LOOKUP:foreach my $db_lookup_table_ref (@{$db_lookup_table_refs}) { + + if (defined(${$db_lookup_table_ref}{$key})) { + + $protein_found_in_db = 1; + $logger->debug('Found protein ' . $key . ' in this sequence DB.'); + + $protein = ${$db_lookup_table_ref}{$key}{'SQ'}; + $acc = ${$db_lookup_table_ref}{$key}{'AC'}; + $id = ${$db_lookup_table_ref}{$key}{'ID'}; + # + # Find peptide (fragment) in DB protein sequence. + # + $index = index($protein, $peptide); + + if ($index > -1) { + + # + # Found peptide! + # + $logger->debug('Found peptide ' . $peptide . ' at position ' . $index . ' in protein ' . $key); + last DB_LOOKUP; + + } + + # + # If the input was based on mass spectrometry data we cannot see + # the difference between I (Isolucene) and L (Leucine) as the both + # weigh the same. Let's try an I agnostic search by substituting all + # Is for Ls. + # + my $protein_l_only = $protein; + my $peptide_l_only = $peptide; + $protein_l_only =~ s/I/L/g; + $peptide_l_only =~ s/I/L/g; + $index = index($protein_l_only, $peptide_l_only); + + if ($index > -1) { + + # + # Found peptide! + # + $logger->debug('Found peptide ' . $peptide . ' at position ' . $index . ' in protein ' . $key); + $logger->debug("\t" . 'Had to treat I and L as the same amino acid in the DB search to get a match.'); + last DB_LOOKUP; + + } + + # + # Cannot find peptide :( + # + $protein = undef; + $acc = undef; + $id = undef; + + $logger->debug('Cannot find peptide (' . $peptide . ') in protein ' . $key); + $logger->debug("\t" . 'This may be the result of an updated protein sequence in the database.'); + $logger->debug("\t" . 'Will try to find the peptide in other databases (if available).'); + + if ($strip_lc) { + $logger->debug("\t" . '(Stripping of lowercase characters, indicating modifications in the peptide sequence, was enabled.)'); + } else { + $logger->debug("\t" . '(Stripping of lowercase characters, indicating modifications in the peptide sequence, was *not* enabled.)'); + } + + } else { + $logger->debug('Protein ' . $key . ' missing from this sequence DB. Will try other databases (if available).'); + } + } + + # + # Check if the peptide (fragment) was found in a DB protein sequence. + # + if ($protein_found_in_db == 1) { + unless ($index > -1) { + $logger->warn('Cannot find peptide (' . $peptide . ') in protein ' . $key); + $logger->warn("\t" . 'Will try to find this peptide (' . $peptide . ') using other associated protein identifiers (if available).'); + } + } else { + $logger->warn('Cannot find protein ' . $key . ' in any of the supplied databases.'); + } + + return($protein, $acc, $id, $index); +} + +sub _GetCleavageSequenceContexts { + + my ($peptide, $protein, $key, $index, $cleavage_aa, $cleavage_terminus) = @_; + + my %peptide_contexts; + $peptide_contexts{'n_term'}{'context'} = ''; + $peptide_contexts{'c_term'}{'context'} = ''; + my $max_context_length = $n_term_context_length + $c_term_context_length; + + foreach my $peptide_term (keys(%peptide_contexts)) { + + # + # Get the offset for the sequence contexts to retrieve from this peptide terminus + # or drop the context for this peptide terminus if it is also the protein terminus. + # + if ($peptide_term eq 'n_term') { + if ($index > 0) { + # This is an N-terminal peptide context. + my $offset = $index - $n_term_context_length; + $peptide_contexts{$peptide_term}{'offset'} = $offset; + $logger->debug('Offset = ' . $offset . ' for ' . $peptide_term . ' of peptide ' . $peptide . ' in protein ' . $key . '.'); + } else { + # Peptide N-terminus == Protein N-terminus. + $logger->warn('Peptide N-terminus = protein N-terminus. Dropping N-terminal sequence context for peptide ' . $peptide . + ' in protein ' . $key . '.'); + next; + } + } elsif ($peptide_term eq 'c_term') { + if (($index + length($peptide)) < length($protein)) { + # This is a C-terminal peptide context. + my $offset = ($index + length($peptide))- $n_term_context_length; + $peptide_contexts{$peptide_term}{'offset'} = $offset; + $logger->debug('Offset = ' . $offset . ' for ' . $peptide_term . ' of peptide ' . $peptide . ' in protein ' . $key . '.'); + } else { + # Peptide C-terminus == Protein C-terminus. + $logger->warn('Peptide C-terminus = protein C-terminus. Dropping C-terminal sequence context for peptide ' . $peptide . + ' in protein ' . $key . '.'); + next; + } + } + + # + # Check if DB protein sequence is long enough on n-terminal side + # to retrieve full length peptide contexts. + # (c-term length is checked later...) + # + if ($peptide_contexts{$peptide_term}{'offset'} < 0) { + + # + # Protein too short for full length context: Calculate offset adjustment. + # + my $offset = $peptide_contexts{$peptide_term}{'offset'}; + my $offset_adjustment = -($offset); + $peptide_contexts{$peptide_term}{'offset_adjustment'} = $offset_adjustment; + $logger->debug('Offset ' . $offset . ' for ' . $peptide_term . ' of peptide ' . $peptide . ' in protein ' . $key . ' less than zero'); + + if (length($padding_character) == 1) { + + # + # Add N-terminal padding characters for contexts that are too short on the N-terminal side. + # + my $n_term_padding = "$padding_character" x $offset_adjustment; + $peptide_contexts{$peptide_term}{'context'} = $n_term_padding; + $logger->debug("\t" . 'Will add N-terminal padding: ' . $n_term_padding); + + } else { + $logger->debug('Padding was disabled. Will not add n-term padding.'); + } + + $peptide_contexts{$peptide_term}{'offset'} = 0; + + } else { + + $peptide_contexts{$peptide_term}{'offset_adjustment'} = 0; + + } + + # + # Extract sequence context from DB protein sequence. + # + my $peptide_context = substr($protein, $peptide_contexts{$peptide_term}{'offset'}, + $max_context_length - $peptide_contexts{$peptide_term}{'offset_adjustment'}); + # append the context extracted from the protein sequence, + # because there may be already some N-terminal padding... + $peptide_contexts{$peptide_term}{'context'} .= $peptide_context; + + # + # Quality Control: Check enzyme specificity for enzymes with known (= user supplied) specs. + # + unless ($cleavage_aa eq '*') { + + $peptide_context = $peptide_contexts{$peptide_term}{'context'}; + my $p1 = substr($peptide_context, ($n_term_context_length -1), 1); + my $p1_prime = substr($peptide_context, $n_term_context_length, 1); + + if ($cleavage_terminus eq 'C') { + + if ($p1 eq $cleavage_aa) { + # Peptide fragment contains the AA recognised by the protease. + $logger->debug('Specific protease activity -> Retaining peptide context.'); + } else { + # Must be non-specific cleavage -> drop peptide context. + $logger->warn('Non-specific cleavage: dropping ' . $peptide_term . ' peptide context ' . $peptide_context . + ' for peptide ' . $peptide . ' in protein ' . $key); + $peptide_contexts{$peptide_term}{'context'} = ''; + } + + } elsif ($cleavage_terminus eq 'N') { + + if ($p1_prime eq $cleavage_aa) { + # Peptide fragment contains the AA recognised by the protease. + $logger->debug('Specific protease activity -> Retaining peptide context.'); + } else { + # Must be non-specific cleavage -> drop peptide context. + $logger->warn('Non-specific cleavage: dropping ' . $peptide_term . ' peptide context ' . $peptide_context . + ' for peptide ' . $peptide . ' in protein ' . $key); + $peptide_contexts{$peptide_term}{'context'} = ''; + } + + } elsif ($cleavage_terminus eq '*') { + + if (($p1 eq $cleavage_aa) || ($p1_prime eq $cleavage_aa)) { + # Peptide fragment contains the AA recognised by the protease. + $logger->debug('Specific protease activity -> Retaining peptide context.'); + } else { + # Must be non-specific cleavage -> drop peptide context. + $logger->warn('Non-specific cleavage: dropping ' . $peptide_term . ' peptide context ' . $peptide_context . + ' for peptide ' . $peptide . ' in protein ' . $key); + $peptide_contexts{$peptide_term}{'context'} = ''; + } + + } else { + $logger->fatal('Unknown cleavage terminus ' . $cleavage_terminus . '. Must be C, N or *.'); + exit; + } + } + } + + push(my @contexts, $peptide_contexts{'n_term'}{'context'}, $peptide_contexts{'c_term'}{'context'}); + + if (length($padding_character) == 1) { + _CheckSequenceContextLength(\@contexts, $peptide, $key, $max_context_length); + } else { + $logger->debug('Padding was disabled. Will not add c-term padding nor check sequence context length.'); + } + + return(\@contexts); + +} + +sub _GetMiscleavageSequenceContexts { + + my ($peptide, $protein, $key, $petide_index, $cleavage_aa, $cleavage_terminus, $miscleavage_distance) = @_; + + my @peptide_contexts; + my $max_context_length = $n_term_context_length + $c_term_context_length; + + # + # Find miscleavage sites in peptide. + # + # TODO: handle --mcd + # + my @amino_acids = split('', $peptide); + my $aa_index = 0; + # + # Drop first and last amino acids as these are from cleavage sites or from the protein termini. + # In either case they cannot represent a miscleavage site. + # + shift(@amino_acids); + pop(@amino_acids); + + foreach my $aa (@amino_acids) { + + $aa_index++; + + if ($aa eq $cleavage_aa) { + + # + # We found a miscleavage site. + # + $logger->debug('Found miscleavage site in peptide ' . $peptide . ' of protein ' . $key . '.'); + # + # Get the offset for the sequence context. + # + my $offset = $petide_index + $aa_index - $n_term_context_length; + $logger->debug("\t" . 'Offset = ' . $offset . '.'); + my $n_term_padding = ''; + my $offset_adjustment = 0; + + if ($cleavage_terminus eq 'C') { + + # + # Must increment the offset with 1 if the protease cuts C-terminal of a recognition site. + # + $offset++; + + } + + # + # Check if DB protein sequence is long enough on n-terminal side + # to retrieve full length peptide contexts. + # (c-term length is checked elsewhere.) + # + if ($offset < 0) { + + # + # Protein too short for full length context: Calculate offset adjustment. + # + $offset_adjustment = -($offset); + $logger->debug('Offset ' . $offset . ' for miscleavage context in peptide ' . $peptide . ' in protein ' . $key . ' less than zero'); + + if (length($padding_character) == 1) { + + # + # Add N-terminal padding characters for contexts that are too short on the N-terminal side. + # + $n_term_padding = "$padding_character" x $offset_adjustment; + $logger->debug("\t" . 'Will add N-terminal padding: ' . $n_term_padding); + + } else { + $logger->debug('Padding was disabled. Will not add n-term padding.'); + } + + # + # Reset offset for this context to protein start. + # + $offset = 0; + + } + + # + # Extract sequence context from DB protein sequence. + # + my $peptide_context = substr($protein, $offset, $max_context_length - $offset_adjustment); + $peptide_context = $n_term_padding . $peptide_context; + $logger->debug('Found miscleavage context: ' . $peptide_context . '.'); + + push(@peptide_contexts, $peptide_context); + } + } + + if (length($padding_character) == 1) { + _CheckSequenceContextLength(\@peptide_contexts, $peptide, $key, $max_context_length); + } else { + $logger->debug('Padding was disabled. Will not add c-term padding nor check sequence context length.'); + } + + return(\@peptide_contexts); + +} + +sub _GetModificationSequenceContexts { + + my ($peptide, $protein, $key, $petide_index, $modified_aa, $modified_peptide) = @_; + + my @peptide_contexts; + my $modified_aa_index_adjustment; + my @modified_aa_in_peptide_indices; + my $modified_aa_in_protein_index; + my $max_context_length = $n_term_context_length + 1 + $c_term_context_length; + #my $offset = 0; + my $modified_aa_in_modified_peptide_index; + my $modified_peptide_remainder; + my $processed_peptide_length = 0; + + # + # Find amino acid in the string identifying both the modified amino acid and the modification. + # + if ($modified_aa =~ m/^([a-z]+)[A-Z*]$/) { + + $modified_aa_index_adjustment = length($1); + + } elsif ($modified_aa =~ m/^[A-Z*][a-z]$/) { + + $modified_aa_index_adjustment = 0; + + } else { + + $logger->fatal('Modified amino acid was specified in an unsupported format: ' . $modified_aa . '.'); + exit; + + } + + # + # Find modified sites in peptide. + # + my $modified_aa_for_regex = $modified_aa; + $modified_aa_for_regex =~ s/\*/[A-Z]/; + $logger->debug('modified_aa for search: ' . $modified_aa_for_regex); + if ($modified_peptide =~ m/$modified_aa_for_regex/) { + $logger->debug("\t" . 'Peptide contains mods.'); + $modified_aa_in_modified_peptide_index = $-[0]; + $modified_peptide_remainder = $'; + my $modified_peptide_prefix = $`; + $logger->trace("\t\t" . 'Processed pep length = ' . $processed_peptide_length); + $processed_peptide_length += length($modified_peptide_prefix); + $logger->trace("\t\t" . 'Processed pep length = ' . $processed_peptide_length); + $processed_peptide_length += length($modified_aa); + $logger->trace("\t\t" . 'Processed pep length = ' . $processed_peptide_length); + $logger->trace("\t\t" . 'Index for mod :' . $modified_aa_in_modified_peptide_index); + } else { + $logger->debug('No modified amino acids found in this peptide.'); + } + + #my $modified_aa_in_modified_peptide_index = index($modified_peptide, $modified_aa); + + while (defined($modified_aa_in_modified_peptide_index)) { + + $modified_aa_in_modified_peptide_index += $modified_aa_index_adjustment; + $logger->debug('Found modified amino acid at position ' . ($modified_aa_in_modified_peptide_index + 1) . ' in modified peptide ' . $modified_peptide . ' of protein ' . $key . '.'); + + # + # Count all modifications of any type preceeding the found modified amino acid. + # + my $modified_n_term = substr($modified_peptide, 0, $modified_aa_in_modified_peptide_index); + my $n_term = $modified_n_term; + $n_term =~ s/[a-z]+//g; + my $mod_count = length($modified_n_term) - length($n_term); + + if ($mod_count >= 0) { + $logger->debug(' counted modifications preceeding amino acid = ' . $mod_count); + } else { + $logger->fatal(' counted modifications preceeding amino acid is negative: ' . $mod_count); + exit; + } + + # + # We need to substract the amount of mods preceeding the identified amino acid + # from the index to get the position of the amino acid in the unmodified peptide. + # + my $modified_aa_in_peptide_index = $modified_aa_in_modified_peptide_index - $mod_count; + $logger->debug(' and at position ' . ($modified_aa_in_peptide_index + 1) . ' in peptide ' . $peptide . ' of protein ' . $key . '.'); + + push(@modified_aa_in_peptide_indices, $modified_aa_in_peptide_index); + + # + # Search for additional mods in the modified peptide. + # + #$offset = $modified_aa_in_modified_peptide_index + 1; + #$modified_aa_in_modified_peptide_index = index($modified_peptide, $modified_aa, $offset); + if ($modified_peptide_remainder =~ m/$modified_aa_for_regex/) { + $logger->debug("\t" . 'Peptide contains more mods.'); + $modified_peptide_remainder = $'; + my $modified_peptide_prefix = $`; + $logger->trace("\t\t" . 'Processed pep length = ' . $processed_peptide_length); + $modified_aa_in_modified_peptide_index = $-[0] + $processed_peptide_length; + $logger->trace("\t\t" . 'Index for mod :' . $modified_aa_in_modified_peptide_index); + $processed_peptide_length += length($modified_peptide_prefix); + $logger->trace("\t\t" . 'Processed pep length = ' . $processed_peptide_length); + $processed_peptide_length += length($modified_aa); + $logger->trace("\t\t" . 'Processed pep length = ' . $processed_peptide_length); + } else { + $modified_aa_in_modified_peptide_index = undef(); + $logger->debug('No more modified amino acids found in this peptide.'); + } + } + + # + # Retrieve sequence contexts for modified stites from protein sequence + # + foreach my $modified_aa_in_peptide_index (@modified_aa_in_peptide_indices) { + + # + # Get the offset for the sequence context. + # + my $offset = $petide_index + $modified_aa_in_peptide_index - $n_term_context_length; + $logger->debug("\t" . 'Offset = ' . $offset . '.'); + my $n_term_padding = ''; + my $offset_adjustment = 0; + + # + # Check if DB protein sequence is long enough on n-terminal side + # to retrieve full length peptide contexts. + # (c-term length is checked elsewhere.) + # + if ($offset < 0) { + + # + # Protein too short for full length context: Calculate offset adjustment. + # + $offset_adjustment = -($offset); + $logger->debug('Offset ' . $offset . ' for modification context in peptide ' . $peptide . ' in protein ' . $key . ' less than zero'); + + if (length($padding_character) == 1) { + + # + # Add N-terminal padding characters for contexts that are too short on the N-terminal side. + # + $n_term_padding = "$padding_character" x $offset_adjustment; + $logger->debug("\t" . 'Will add N-terminal padding: ' . $n_term_padding); + + } else { + $logger->debug('Padding was disabled. Will not add n-term padding.'); + } + + # + # Reset offset for this context to protein start. + # + $offset = 0; + + } + + # + # Extract sequence context from DB protein sequence. + # + my $peptide_context = substr($protein, $offset, $max_context_length - $offset_adjustment); + $peptide_context = $n_term_padding . $peptide_context; + $logger->debug('Found modification context: ' . $peptide_context . '.'); + + push(@peptide_contexts, $peptide_context); + + } + + if (length($padding_character) == 1) { + _CheckSequenceContextLength(\@peptide_contexts, $peptide, $key, $max_context_length); + } else { + $logger->debug('Padding was disabled. Will not add c-term padding nor check sequence context length.'); + } + + return(\@peptide_contexts); + +} + +# +# TODO: pep contexts. +# +sub _GetPeptideSequenceContexts { + + my ($peptide, $protein, $key, $peptide_index) = @_; + + my @peptide_contexts; + my $max_context_length = $n_term_context_length + length($peptide) + $c_term_context_length; + + # + # Get the offset for the peptide sequence context. + # + my $offset = $peptide_index - $n_term_context_length; + $logger->debug("\t" . 'Offset = ' . $offset . '.'); + my $n_term_padding = ''; + my $offset_adjustment = 0; + + # + # Check if DB protein sequence is long enough on n-terminal side + # to retrieve full length peptide contexts. + # (c-term length is checked elsewhere.) + # + if ($offset < 0) { + + # + # Protein too short for full length context: Calculate offset adjustment. + # + $offset_adjustment = -($offset); + $logger->debug('Offset ' . $offset . ' for peptide context of peptide ' . $peptide . ' in protein ' . $key . ' less than zero'); + + if (length($padding_character) == 1) { + + # + # Add N-terminal padding characters for contexts that are too short on the N-terminal side. + # + $n_term_padding = "$padding_character" x $offset_adjustment; + $logger->debug("\t" . 'Will add N-terminal padding: ' . $n_term_padding); + + } else { + $logger->debug('Padding was disabled. Will not add n-term padding.'); + } + + # + # Reset offset for this context to protein start. + # + $offset = 0; + + } + + # + # Extract sequence context from DB protein sequence. + # + my $peptide_context = substr($protein, $offset, $max_context_length - $offset_adjustment); + $peptide_context = $n_term_padding . $peptide_context; + $logger->debug('Found modification context: ' . $peptide_context . '.'); + + push(@peptide_contexts, $peptide_context); + + if (length($padding_character) == 1) { + _CheckSequenceContextLength(\@peptide_contexts, $peptide, $key, $max_context_length); + } else { + $logger->debug('Padding was disabled. Will not add c-term padding nor check sequence context length.'); + } + + return(\@peptide_contexts); + +} + +sub _CheckSequenceContextLength { + + my ($contexts, $peptide, $key, $max_context_length) = @_; + + # + # If padding was enabled for sequence contexts < max sequence context length. + # + foreach my $peptide_context (@{$contexts}) { + + my $peptide_context_length = length($peptide_context); + + if ($peptide_context_length < $max_context_length) { + + # + # sequence context < max sequence context length -> append c-term padding. + # (n-term padding was already appended if necessary during sequence context lookup with _GetSequenceContext().) + # + my $c_term_padding = "$padding_character" x ($max_context_length - $peptide_context_length); + $logger->debug('Length of peptide context (' . $peptide_context_length . ') < max context length.'); + $logger->debug("\t" . 'Will add C-terminal padding: ' . $c_term_padding); + $peptide_context = $peptide_context . $c_term_padding; + + } + + # + # Sanity check for padded sequence contexts. + # + # (Checks if the combined effect of n-term and c-term padding worked well.) + # + $peptide_context_length = length($peptide_context); + unless ($peptide_context_length == $max_context_length) { + $logger->fatal('Length of peptide context too short or too long for peptide ' . $peptide . ' in protein ' . $key); + $logger->fatal("\t" . 'Context length is ' . $peptide_context_length . ', but should be ' . $max_context_length . '.'); + exit; + } + } + + # + # No need to return the modified (c-terminal-padded) contexts as we worked with a arrayref. + # + +} + +sub _SaveSequenceContexts { + + my ($line, $acc, $contexts, $path_to_fh, $extracted_seq_contexts) = @_; + + # + # Append the peptide contexts to the existing line of the fragment file and + # save that new line to the output file. + # By appending we keep all info from the original fragment file intact. + # + foreach my $context (@{$contexts}) { + + # + # Skip "empty"/"missing" contexts. + # May be the result of: + # 1. The peptide terminus being also the protein terminus. + # 2. Proteins missing from the database. + # 3. ... + # + if ($padding_character ne '') { + if ($context =~ m/^($padding_character)+$/) { + next; + } + } else { + if ($context eq '') { + next; + } + } + + my $new_line = $line; + + # 1. Append accession numbers if the fragment file used IDs. + if ($index_id eq 'ID') { + $new_line .= "\t" . $acc; + } + + # 2. Append sequence context. + $new_line .= "\t" . $context; + # 3. Append new line character. + $new_line .= "\n"; + # Save result to disk. + print($path_to_fh $new_line); + $extracted_seq_contexts++; + + } + + return($extracted_seq_contexts); + +} + +sub _Usage { + + + print STDOUT "\n" . + 'ExtractPeptideSequenceContext.pl: Extract sequence contexts of cleavage, miscleavage or modification sites ' . "\n" . + ' based on sequence fragments (like peptides experimentally identified with MSMS) ' . "\n" . + ' and protein sequences from a database in FASTA or UniProtKB file format.' . "\n" . + "\n" . + 'Usage:' . "\n" . + "\n" . + ' ExtractPeptideSequenceContext.pl options' . "\n" . + "\n" . + 'Available options are:' . "\n" . + "\n" . + ' --db [file] Input file containing all database sequences in either UniProtKB *.dat or *.fasta format.' . "\n" . + ' --dba [file] Optional alternative / backup input file containing database sequences in either UniProtKB *.dat or *.fasta format.' . "\n" . + ' This may be a slightly older or newer version of the database, which will be searched in case a ' . "\n" . + ' sequence ID was not found in the primary database file that was specified with --db [file].' . "\n" . + ' --dbf [format] Optional argument to specify the database sequence file format. Valid format specifications are:' . "\n" . + ' * \'UniProtKB DAT\' for UniProtKB DAT files.' . "\n" . + ' * \'FASTA\' for FASTA files.' . "\n" . + ' Only required if the database files do *not* have the standard file extensions: ' . "\n" . + ' *.dat for UniProtKB DAT files or ' . "\n" . + ' *.fa or *.fasta for FASTA files.' . "\n" . + ' --k [ID|AC] Unique identifier type used as key to index an UniProtKB *.dat database file.' . "\n" . + ' Must be either AC for a UniProt accession number (default) or ID for a UniProt identifier.' . "\n" . + ' Not required for *.fasta databases: This tool will look for any type of ID in the first part of FASTA ' . "\n" . + ' sequence headers up until the first white space.' . "\n" . + ' Optionally multiple IDs may be present separated with pipe symbols (|) or semicolons (;).' . "\n" . + ' Optionally IDs may be prefixed with a database namespace and a colon (:).' . "\n" . + ' For example the accession number P32234 as well as the ID 128UP_DROME would be recognised in both ' . "\n" . + ' this sequence header:' . "\n" . + ' >UniProtAcc:P32234|UniProtID:128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)' . "\n" . + ' and in this one:' . "\n" . + ' >P32234|128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)' . "\n" . + ' --f [file] Fragment file containing accession numbers or IDs and sequence fragments (peptides), ' . "\n" . + ' whose sequence context should be extracted from the database. This file' . "\n" . + ' * Must be tab delimited with one accession/ID plus one sequence fragment per line' . "\n" . + ' * Must contain accession numbers / IDs in the same format as the database file used.' . "\n" . + ' Exception: Optionally IDs may be suffixed with the peptide\'s position in its protein between brackets.' . "\n" . + ' For example: CLH1_HUMAN[1612-1620].' . "\n" . + ' * May contain other columns, which will be preserved in the output.' . "\n" . + ' * Is the basis for the output: if a sequence context was found, ' . "\n" . + ' it will be appended in another column to the right of the existing columns.' . "\n" . + ' --icol [int] Column in the tab delimited fragment file containing the protein identifiers / accession numbers.' . "\n" . + ' Default = 1.' . "\n" . + ' --pcol [int] Column in the tab delimited fragment file containing the peptide sequences.' . "\n" . + ' Default = 2.' . "\n" . + ' --s Strip lowercase characters from the sequence fragments in the fragment file before doing a database lookup.' . "\n" . + ' Amino acids are expected to be in uppercase. If not, the sequences are converted to uppercase ' . "\n" . + ' before searching for the fragment in the database. When lowercase characters represent ' . "\n" . + ' modifications instead of amino acids these need to be removed before searching the database.' . "\n" . + ' Default = 0 (disabled) except when a modified amino acid is specified with --ma, which will automatically enable --s.' . "\n" . + ' --cleo [file] Optional output file where the retrieved cleavage site sequence contexts will be saved.' . "\n" . + ' --miso [file] Optional output file where the retrieved miscleavage site sequence contexts will be saved.' . "\n" . + ' --modo [file] Optional output file where the retrieved modification site sequence contexts will be saved.' . "\n" . + ' --pepo [file] Optional output file where the retrieved peptide sequence contexts will be saved.' . "\n" . + ' (At least one output file must be specified with --cleo, --miso, --modo or --pepo).' . "\n" . + ' --ca [A-Z] Cleavage amino acid. Assume the sequence fragments were derived from cutting with a proteolytic enzyme, ' . "\n" . + ' that recognizes this amino acid as the position to cut.' . "\n" . + ' When the specificity of the protease used to generate the peptides in the fragments file is unknown,' . "\n" . + ' you may provide an asterisk (*) to retrieve sequence context for any cleavage,' . "\n" . + ' but in that case this tool will not filter non-specifically cleaved fragments,' . "\n" . + ' that may be the result of other processes than protease activity.' . "\n" . + ' --ct [C|N] Cleavage terminus. Assume the sequence fragments were derived from cutting with a proteolytic enzyme, ' . "\n" . + ' that cuts at the C or N terminus of the amino acid specified with the --ca [A-Z] parameter.' . "\n" . + ' When the specificity of the protease used to generate the peptides in the fragments file is unknown,' . "\n" . + ' you may provide an asterisk (*) to retrieve sequence context for any cleavage,' . "\n" . + ' but in that case this tool will not filter non-specifically cleaved fragments,' . "\n" . + ' that may be the result of other processes than protease activity.' . "\n" . + ' --ma [A-Za-z] Modified amino acid.' . "\n" . + ' The amino acid must be specified in uppercase and the modification in lower case.' . "\n" . + ' The order is not important.' . "\n" . + ' Hence a phophorylated serine in the fragments file can be indicated with either pS or Sp, ' . "\n" . + ' but you cannot mix both pS and Sp in a single fragments file.' . "\n" . + ' You may provide an asterisk (*) instead of an upper case amino acid to retrieve sequence contexts ' . "\n" . + ' for the specified modification no matter what amino acid it was located on.' . "\n" . + ' A modification may be specified with more than one lower case character, ' . "\n" . + ' so for example phosphoS or Sphospho can also be used for a phosphorylated serine.' . "\n" . + ' --n [int] Integer specifying the length of the N-terminal sequence context to retrieve starting from the ' . "\n" . + ' cleavage, miscleavage or modification site. Default = 5.' . "\n" . + ' --c [int] Integer specifying the length of the C-terminal sequence context to retrieve starting from the ' . "\n" . + ' cleavage, miscleavage or modification site. Default = 5.' . "\n" . + ' Please note that cleavage and miscleavage sites have zero width, ' . "\n" . + ' while modified amino acids will increment the sequence context lenght with 1.' . "\n" . + ' When defaults are used for both the N-terminal and C-terminal sequence context lengths,' . "\n" . + ' the total sequence context length for a' . "\n" . + ' * cleavage or miscleavage site will be:' . "\n" . + ' (N-terminal sequence context) + (C-terminal sequence context) = 5 + 5 = 10.' . "\n" . + ' * modification site will be:' . "\n" . + ' (N-terminal sequence context) + modified AA + (C-terminal sequence context) = 5 + 1 + 5 = 11.' . "\n" . +# ' --mcd [int] Minimal distance between a putative miscleavage site and the peptide termini.' . "\n" . +# ' Uncleaved amino acids closer to the peptide termini can be ignored with this parameter:' . "\n" . +# ' A. To prevent overlap between cleavage and miscleavage peptide sequence contexts.' . "\n" . +# ' B. To prevent false positive miscleavage sites due to sites that cannot be cleaved, ' . "\n" . +# ' because one of the resulting fragments is too short. (Sometimes proteases may need ' . "\n" . +# ' a minimal length surrounding the cleavage site to bind and cleave a peptide.)' . "\n" . +# ' Default = same as the longest *-terminal context length. Hence [--n [int]|--c [int]] (see above).' . "\n" . + ' --pc [char] Optional padding character to fill N-terminal or C-terminal positions in the sequence context, ' . "\n" . + ' when the protein was too short to get a complete sequence context.' . "\n" . + ' Default = - (a.k.a. dash or the alignment gap character.)' . "\n" . + ' To disable padding specify an empty string like this --pc \'\'.' . "\n" . + ' --ll [LEVEL] Log4perl log level. One of: ALL, TRACE, DEBUG, INFO (default), WARN, ERROR, FATAL or OFF.' . "\n" . + "\n"; + exit; + +} diff -r 000000000000 -r 163892325845 ExtractPeptideSequenceContext.svg --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ExtractPeptideSequenceContext.svg Fri May 10 17:15:08 2013 -0400 @@ -0,0 +1,539 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -r 000000000000 -r 163892325845 ExtractPeptideSequenceContext.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ExtractPeptideSequenceContext.xml Fri May 10 17:15:08 2013 -0400 @@ -0,0 +1,172 @@ + + + by mapping peptides back to proteins and extending them on both termini to include their sequence context. + ExtractPeptideSequenceContext.pl --db $db --dbf FASTA --f $fragments --icol $icol --pcol $pcol $strip --pepo $pepo --n $n --c $c --pc '$pc' --ll WARN + + + + + + + + + + + + + + + + + + + + + + + + + + +.. role:: raw-html(raw) + :format: html + +.. class:: infomark + +**What it does** + +Map peptide sequences back to proteins and extend the peptides on both termini to include their sequence context. + +:raw-html:`<object data="static/images/nbic_gmr/ExtractPeptideSequenceContext.svg" type="image/svg+xml" width="100%"/>` + +=================================================== +*Peptide sequences and their protein's identifiers* +=================================================== + +This file must contain at least peptides and accession numbers or IDs of the proteins the peptides were derived from. \ +The data must be in TAB delimited format and may contain other columns, which will be preserved in the output. \ +If a sequence context was found, it will be appended in a new column to the right of the existing columns. \ +When another sequence context was found for the same peptide, it will appended as an extra row in the output. +Protein accession numbers / IDs must be in the same format as was used in the FASTA file with protein sequences (database). \ +The only exception to this rule is that accession numbers / IDs may be optionally suffixed with the peptide\'s position in its protein between brackets. \ +For example: CLH1_HUMAN[1612-1620] will be matched to CLH1_HUMAN in a FASTA file with protein sequences. \ +Amino acids in the petide sequences must be in uppercase. + +=============================================== +*Protein sequences* +=============================================== + +Input file containing all protein sequences in FASTA format. \ +This tool will look for any type of protein ID in the first part of FASTA sequence headers up until the first white space. \ +Optionally multiple IDs may be present separated with pipe symbols (|) or semicolons (;). \ +Optionally IDs may be prefixed with a database namespace and a colon (:). \ +For example the accession number P32234 as well as the ID 128UP_DROME would be recognized in both this sequence header: + + >UniProtAcc:P32234|UniProtID:128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly) + +and in this one: + + >P32234|128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly) + +=================================================== +*N-terminal and C-terminal sequence context length* +=================================================== + +Integers specifying the length of the N-terminal and C-terminal sequence context to retrieve starting from the peptide termini. \ +So the total sequence context length for a peptide will be: +(N-terminal sequence context) + (length of the peptide) + (C-terminal sequence context). + +=============================================== +*Padding character* +=============================================== + +Optional padding character to fill N-terminal or C-terminal positions in the sequence context, \ +when the protein was too short to get a complete sequence context. \ +Defaults to - a.k.a. dash or alignment gap character. \ + +----- + +**Getting input data** + +.. _my folder utility: http://mascotinternal.chem.uu.nl/mascot/cgi/uu_myfolder.pl + +This tool requires \ +peptide sequences in TAB delimited format and \ +protein sequences from which the peptides were derived in FASTA format. \ +If your peptide sequences are not in TAB delimited format, you can convert from: + + - FASTA format using *FASTA manipulation* -> *FASTA-to-Tabular* + - A format using a different delimiter using *Text Manipulation* -> *Convert* + +When your peptides were derived from a mass spectrometry experiment and identified with a search engine like Mascot, Sequest, etc.,\ +please make sure you provide the same FASTA database for this tool as the one used for your search. +If you used Mascot hosted by the Biomolecular Mass Spectrometry and Proteomics Group @ Utrecht University, \ +you can use the `my folder utility`_ to download the FASTA databases from the Mascot server. + +----- + +**Examples** + +Example input for peptides identified with a Mascot search, \ +some with phosphorylated residues indicated by pS, pT or pY \ +and in TAB delimited format:: + + sequence score peptide mr mass delta (abs) mass delta (ppm) all protein matches + AGNAARDN 54.24 787.357254 -4.223E-5 -0.05334300253990 H2A1B_HUMAN[67-74] + KLpSAAVVLI 11.48 912.600784 0.001608 1.7619971713721432 OSGI2_HUMAN[405-413] + RAGIKVpTVA 23.01 913.570892 6.283E-5 0.06786555979719196 PARK7_HUMAN[28-36] + KGGVVGIKVD 44.61 970.581146 -0.001214 -1.2507970147608864 P04075[101-110] + KIKELQAF 11.87 975.575287 0.003907 4.00481649347068 O60882[71-78] + KIpSGpTVNIR 57.17 986.587265 -0.002761 -2.798536022051734 SYTC_HUMAN[681-689] + KLpYEALKF 17.54 1010.580032 0.004782 4.731935966057164 F105A_HUMAN[238-245] + KLDApSEpSLR 31.31 1017.545441 -0.002377 -2.3360136110127785 CLH1_HUMAN[1612-1620] + +=============================================== +*Appending peptide sequence contexts* +=============================================== + +With these options: + + - c6 as *Protein identifier column* + - c1 as *Peptide sequence column* + - 5 as *N-terminal sequence context length* + - 5 as *C-terminal sequence context length* + - a suitable FASTA database with *Protein sequences* + - and everything else set to defaults + +the example above will generate a result like this:: + + AGNAARDN 54.24 787.357254 -4.223E-5 -0.05334300253990 H2A1B_HUMAN[67-74] EILELAGNAARDNKKTRI + KLpSAAVVLI 11.48 912.600784 0.001608 1.7619971713721432 OSGI2_HUMAN[405-413] LKKIFKLSAAVVLIGSHPN + RAGIKVpTVA 23.01 913.570892 6.283E-5 0.06786555979719196 PARK7_HUMAN[28-36] VDVMRRAGIKVTVAGLAGK + KGGVVGIKVD 44.61 970.581146 -0.001214 -1.2507970147608864 P04075[101-110] QVIKSKGGVVGIKVDKGVVP + KIKELQAF 11.87 975.575287 0.003907 4.00481649347068 O60882[71-78] NSMIRKIKELQAFFGLQV + KIpSGpTVNIR 57.17 986.587265 -0.002761 -2.798536022051734 SYTC_HUMAN[681-689] VGEKEKISGTVNIRTRDNK + KLpYEALKF 17.54 1010.580032 0.004782 4.731935966057164 F105A_HUMAN[238-245] AILEYKLYEALKFIMLYQ + KLDApSEpSLR 31.31 1017.545441 -0.002377 -2.3360136110127785 CLH1_HUMAN[1612-1620] LTKVDKLDASESLRKEEEQ + +Note the header line was ignored. + + + diff -r 000000000000 -r 163892325845 ExtractSeqsFromFasta.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ExtractSeqsFromFasta.pl Fri May 10 17:15:08 2013 -0400 @@ -0,0 +1,397 @@ +#!/usr/bin/perl -w +# +# This script extracts sequences from a multi-sequence FASTA file +# based on a list of accession numbers / IDs +# +# ===================================================== +# $Id: ExtractSeqsFromFasta.pl 15 2010-07-08 18:07:59Z pieter $ +# $URL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/ExtractSeqsFromFasta.pl $ +# $LastChangedDate: 2010-07-08 13:07:59 -0500 (Thu, 08 Jul 2010) $ +# $LastChangedRevision: 15 $ +# $LastChangedBy: pieter $ +# ===================================================== + +# +# initialise environment +# +use strict; +use Getopt::Std; +use Log::Log4perl qw(:easy); + +my %log_levels = ( + 'ALL' => $ALL, + 'TRACE' => $TRACE, + 'DEBUG' => $DEBUG, + 'INFO' => $INFO, + 'WARN' => $WARN, + 'ERROR' => $ERROR, + 'FATAL' => $FATAL, + 'OFF' => $OFF, +); +my %match_logic_types = ( + 'literal' => 1, + 'regex' => 1, +); + +# +# Get options. +# +my %opts; +Getopt::Std::getopts('ul:i:o:f:m:', \%opts); + +my $input = $opts{'i'}; +my $output = $opts{'o'}; +my $filter = $opts{'f'}; +my $log_level = $opts{'l'}; +my $match_logic = $opts{'m'}; +my $ignore_accession_version = $opts{'u'}; + +# +# Configure logging. +# +# Provides default if user did not specify log level: +$log_level = (defined($log_level) ? $log_level : 'INFO'); + +# Reset log level to default if user specified illegal log level. +$log_level = ( + defined($log_levels{$log_level}) + ? $log_levels{$log_level} + : $log_levels{'INFO'}); + +#Log::Log4perl->init('log4perl.properties'); +Log::Log4perl->easy_init( + + #{ level => $log_level, + # file => ">>ExtractSeqsFromFasta.log", + # layout => '%F{1}-%L-%M: %m%n' }, + { + level => $log_level, + file => "STDOUT", + layout => '%d L:%L %p> %m%n' + }, +); +my $logger = Log::Log4perl::get_logger(); + +# +# Check user input. +# +unless (defined($input) && defined($output) && defined($filter)) { + _Usage(); + exit; +} +if ($input =~ /^$/ || $output =~ /^$/ || $filter =~ /^$/) { + _Usage(); + exit; +} +if ($input eq $output) { + $logger->fatal('Output file is the same as the input file. Select a different file for the output.'); + exit; +} + +# +# Check input and filter file. +# +unless (-e $input && -f $input && -r $input) { + $logger->fatal('Cannot read from input file ' . $input . ': ' . $!); + exit; +} +unless (-e $filter && -f $filter && -r $filter) { + $logger->fatal('Cannot read from filter file ' . $filter . ': ' . $!); + exit; +} +# +# Check match logic. +# +$match_logic = (defined($match_logic) ? $match_logic : 'literal'); +unless (defined($match_logic_types{$match_logic})) { + $logger->fatal('Unkown logic ' . $match_logic . ' specified for filtering of the input sequences.'); + exit; +} + +# +# Read accession numbers to search the FASTA file(s) for +# +my $wanted = _CreateLookupHash($filter, $match_logic); +my $seqs_to_extract = scalar(keys(%{$wanted})); +$logger->info('Number of sequences to search for: ' . $seqs_to_extract); + +# +# Extract seqs +# +my ($counter) = _ExtractSeqs($input, $output, $wanted, $match_logic); + +$logger->info('Extracted ' . $counter . ' sequences.'); +$logger->info('Finished!'); + +# +## +### Internal subs. +## +# + +sub _CreateLookupHash { + + my ($file_path, $match_logic) = @_; + + $logger->info('Parsing ' . $file_path . '...'); + + my %wanted; + my $file_path_fh; + + eval { + open($file_path_fh, "<$file_path"); + }; + if ($@) { + $logger->fatal('Cannot read ID file: ' . $@); + exit; + } + + LINE: while (my $line = <$file_path_fh>) { + + $line =~ s/[\r\n]+//g; + + if ($match_logic eq 'literal') { + + if ($line =~ m/([a-z0-9_\-\.]+)/i) { + + my $id = $1; + + if ($ignore_accession_version) { + + # + # Remove version from accession number if it was versioned. + # + if ($id =~ m/([^\.]+)\.(\d+)/) { + + $id = $1; + + } + } + + $logger->debug('Found accession/ID ' . $id); + $wanted{$id} = 1; + + } else { + $logger->warn('Accession/ID in unsupported format: ' . $line); + next; + } + + } elsif ($match_logic eq 'regex') { + + if ($line =~ m/([a-z0-9_\-\.\[\]:?^\$]+)/i) { + my $regex = $1; + $logger->debug('Found regex ' . $regex); + $wanted{$regex} = 1; + } else { + $logger->warn('Regex in unsupported format: ' . $line); + next; + } + + } + } + + close($file_path_fh); + $logger->info('Created ID lookup list.'); + + return (\%wanted); + +} + +sub _ExtractSeqs { + + my ($path_from, $path_to, $wanted, $match_logic) = @_; + + $logger->info('Parsing ' . $path_from . '...'); + + my $extracted_seqs = 0; + my $found = 0; + my $path_from_fh; + my $path_to_fh; + + eval { + open($path_from_fh, "<$path_from"); + }; + if ($@) { + $logger->fatal('Cannot read FASTA file: ' . $@); + exit; + } + eval { + open($path_to_fh, ">$path_to"); + }; + if ($@) { + $logger->fatal('Cannot write FASTA file: ' . $@); + exit; + } + + LINE: while (my $line = <$path_from_fh>) { + + if ($line =~ m/^>/) { + $logger->debug('Found header line: ' . $line); + $found = 0; + my @header_ids; + + # + # Check for the presence of some frequently occurring naming schemes: + # + # >IPI:CON_Trypsin|SWISS-PROT:P00761|TRYP_PIG Trypsin - Sus scrofa (Pig). + # >IPI:CON_IPI00174775.2|TREMBL:Q32MB2;Q86Y46 Tax_Id=9606 Gene_Symbol=KRT73 Keratin-73 + # >sp|Q42592|APXS_ARATH L-ascorbate peroxidase S, chloroplastic/mitochondrial; + # >jgi|Triad1|1|gw1.3.1.1 + # + if ($line =~ m/^>((([^\s\n\r:;|]+)[:]([^\s\n\r:|]+)[|;])*([^\s\n\r:;|]+)[:]([^\s\n\r:|]+))[|;]?(\s+(.+))?/i) { + + # + # One or more namespace prefixed IDs. + # + my $concatenated_namespace_prefixed_ids = $1; + $logger->debug('Found prefixed IDs in header: ' . $concatenated_namespace_prefixed_ids); + + # database_namespace = $3 && $5; + # id = $4 && $6; + # description = $8; + my @namespace_prefixed_ids = split(/[|;]/, $concatenated_namespace_prefixed_ids); + + foreach my $prefixed_id (@namespace_prefixed_ids) { + + if ($prefixed_id =~ m/([^\s\n\r:;|]+:)?([^\s\n\r:|]+)/i) { + + my $this_id = $2; + + $logger->debug('Found ID: ' . $this_id); + push(@header_ids, $this_id); + + } else { + + $logger->warn( + 'This should have been an optionally prefixed ID, ' + . 'but failed to match the corresponding regex: ' + . $prefixed_id); + + } + } + + } elsif ($line =~ m/^>((([^\s\n\r:;|]+)[|;])*([^\s\n\r:;|]+))[|;]?(\s+(.+))?/i) { + + # + # One or more unprefixed IDs. + # + my $concatenated_ids = $1; + $logger->debug('Found unprefixed IDs in header: ' . $concatenated_ids); + + # id = $3 && $4; + # description = $7; + my @unprefixed_ids = split(/[|;]/, $concatenated_ids); + + foreach my $unprefixed_id (@unprefixed_ids) { + + $logger->debug('Found ID: ' . $unprefixed_id); + push(@header_ids, $unprefixed_id); + + } + + } else { + + # + # Something else. + # + # The FASTA header line can basically have any format, + # so this is probably one of the less frequent occurring annotation schemes. + # Therefore we try to see if the IDs we are looking for are present anywhere + # in the header line up until the first white space or new line. + # This may be tricky as we might match other annotation from other proteins + # like a description from a 'best' BLAST hit that contains one of the IDs we + # are looking for. Hence, in such a case this sequence is *not* the one of + # the IDs we looking for, but similar to that one at best. + # + if ($line =~ m/>([^\n\r\s]+)/) { + + my $putative_id = $1; + $logger->debug('Found putative ID in header: ' . $putative_id); + push(@header_ids, $putative_id); + + } else { + $logger->warn('Cannot identify IDs in this header: ' . $line); + } + } + + + if ($ignore_accession_version) { + + for my $inx (0 .. $#header_ids) { + + if ($header_ids[$inx] =~ m/([^\.]+)\.(\d+)/) { + + my $this_unversioned_id = $1; + my $version_number = $2; + $logger->debug('Dropping version number (' . $version_number . ') for versioned ID: ' . $this_unversioned_id . '.'); + $header_ids[$inx] = $this_unversioned_id; + + } + } + } + + foreach my $id (@header_ids) { + $logger->debug('Checking if ID ' . $id . ' is in the list of sequences to extract.'); + + if ($match_logic eq 'literal') { + + if (${$wanted}{$id}) { + $logger->info('Literal bingo, preserving sequence with ID ' . $id); + $found = 1; + $extracted_seqs++; + last; + } + + } elsif ($match_logic eq 'regex') { + + foreach my $regex (keys(%{$wanted})) { + + if ($id =~ m/$regex/) { + $logger->info('Regex bingo, preserving sequence with ID ' . $id); + $found = 1; + $extracted_seqs++; + last; + } + } + } + } + } + if ($found) { + print($path_to_fh $line); + } + + } + + close($path_from_fh); + close($path_to_fh); + + return ($extracted_seqs); + +} + +sub _Usage { + + print STDERR "\n" + . "extractSeqsFromFasta.pl:\n" + . " Extracts sequences from a multi-sequence FASTA file.\n" . "\n" + . "Usage:\n" . "\n" + . " extractSeqsFromFasta.pl options\n" . "\n" + . "Available options are:\n" . "\n" + . " -i [file] Input file in FASTA format.\n" + . " -o [file] Output file in FASTA format where the extracted files will be saved.\n" + . " -f [file] Filter file containing accession numbers or IDs that shoud be extracted from the input.\n" + . " (One accession/ID per line)\n" + . " -m [string] Match logic that defines how the accession numbers or IDs from the filter file will be.\n" + . " matched to those in the FASTA file. Supported logic types:\n" + . " literal for exact matching.\n" + . " regex for fuzzy matching using simple regular expressions (in Perl regex syntax).\n" + . " -u Use unversioned accession numbers for matching (only with -m literal).\n" + . " If the FASTA input file contains a versioned accession number like this IPI00189968.5,\n" + . " running this tool without -u (default) IPI00189968 or IPI00189968.2 will not match IPI00189968.5, \n" + . " but with -u IPI00189968 or IPI00189968.2 will match IPI00189968.5 and the sequence will be extracted.\n" + . " Hence this allows for less strict matching, but is less fuzzy than matching with regexes.\n" + . " -l [LEVEL] Log4perl log level. One of: ALL, TRACE, DEBUG, INFO (default), WARN, ERROR, FATAL or OFF.\n" + . "\n"; + exit; + +} diff -r 000000000000 -r 163892325845 ExtractSeqsFromFasta.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ExtractSeqsFromFasta.xml Fri May 10 17:15:08 2013 -0400 @@ -0,0 +1,78 @@ + + + Extract sequences from a FASTA file based on a list of IDs + ExtractSeqsFromFasta.pl $ignore_accession_number_versions -f $identifiers -i $input -o $output -l WARN + + + + + + + + + + + +.. class:: infomark + +**What it does** + +This tool filters a set of FASTA sequences for certain identifiers (IDs) or accession numbers. \ +Only sequences whose ID or accession number is present in the supplied list will remain in the filtered FASTA output. \ +The list of IDs or accession numbers to filter for must be a flat text file with one ID or accession per line. + +This tool can match IDs with and without colon prefixed database namespaces in FASTA sequence header line. \ +Hence your FASTA header can contain both >UniProtKB:Q86Y46 ... or just plain >Q86Y46 ... . \ +Database namespace prefixes should not be present in the list of IDs that you want to extract sequences for. + +FASTA headers may contain multiple IDs separated with pipe symbols (|) or semi colons (;). \ +If multiple IDs are supplied these should not contain any white space as everything after the \ +first white space is considered to be the (optional) description, which will not be matched against the list \ +of IDs to extract. + +If your FASTA file contains versioned IDs / accessions, your list of IDs / accessions to extract must also contain \ +versioned IDs / accessions and the version numbers must match. + +----- + +**Example** + +If the FASTA header is this:: + + >IPI:CON_IPI00174775.2|TREMBL:Q32MB2;Q86Y46 Tax_Id=9606 Gene_Symbol=KRT73 Keratin-73 + +The following IDs / accession numbers will match this sequence header:: + + CON_IPI00174775.2 + Q32MB2 + Q86Y46 + +These will not match:: + + IPI:CON_IPI00174775.2 (prefix should be removed) + KRT73 (ID part of description and not part of list of IDs, + which is everything up until the first white space.) + +And finally these will not match unless *ignore accession number versions* is enabled:: + + CON_IPI00174775 (no version number, while FASTA file does contain versioned accession numbers) + CON_IPI00174775.1 (wrong version number) + + + diff -r 000000000000 -r 163892325845 FastaStats.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/FastaStats.pl Fri May 10 17:15:08 2013 -0400 @@ -0,0 +1,220 @@ +#!/usr/bin/perl + +# +# FastaStats.pl +# +# ===================================================== +# $Id: FastaStats.pl 6 2010-05-27 15:52:50Z pieter $ +# $URL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/FastaStats.pl $ +# $LastChangedDate: 2010-05-27 10:52:50 -0500 (Thu, 27 May 2010) $ +# $LastChangedRevision: 6 $ +# $LastChangedBy: pieter $ +# ===================================================== +# +# Counts the amount of sequences in a FASTA file +# as well as the amount of nucleotides / amino acids +# and the sueqence composition. +# + +# +# Initialize evironment +# +use strict; +use Getopt::Std; +use Log::Log4perl qw(:easy); + +my %log_levels = ( + 'ALL' => $ALL, + 'TRACE' => $TRACE, + 'DEBUG' => $DEBUG, + 'INFO' => $INFO, + 'WARN' => $WARN, + 'ERROR' => $ERROR, + 'FATAL' => $FATAL, + 'OFF' => $OFF, +); + +# +# Get options. +# +my %opts; +Getopt::Std::getopts('pi:o:l:', \%opts); + +my $fasta_file = $opts{'i'}; +my $stats_file = $opts{'o'}; +my $log_level = $opts{'l'}; +my $get_positional_composition = $opts{'p'}; + +# +# Configure logging. +# +# Provides default if user did not specify log level: +$log_level = (defined($log_level) ? $log_level : 'INFO'); +# Reset log level to default if user specified illegal log level. +$log_level = (defined($log_levels{$log_level}) ? $log_levels{$log_level} : $log_levels{'INFO'}); +#Log::Log4perl->init('log4perl.properties'); +Log::Log4perl->easy_init( + #{ level => $log_level, + # file => ">>FastaStats.log", + # layout => '%F{1}-%L-%M: %m%n' }, + { level => $log_level, + file => "STDOUT", + layout => '%d L:%L %p> %m%n' }, +); +my $logger = Log::Log4perl::get_logger(); + +# +# Check options. +# +if ($fasta_file =~ /^$/) { + _Usage(); + exit; +} +unless (-f $fasta_file && -r $fasta_file) { + _Usage(); + exit; +} + +# +# Start job. +# +$logger->info('Starting...'); +$logger->info('Using FASTA file: '. $fasta_file); + +my $sequence_count = 0; +my %acid_composition_total; +my %acid_composition_positional; +my $position_offset; + +# +# Create filehandles. +# +my $fasta_fh; +my $stats_fh; + +eval { + open($fasta_fh, "<$fasta_file"); + open($stats_fh, ">$stats_file"); +}; +if ($@) { + $logger->fatal('Cannot create filehandle: ' . $@); + exit; +} + +# +# Parse FASTA file. +# +while (my $line = <$fasta_fh>) { + + if ($line =~ /^>/i) { + + $sequence_count++; + $position_offset = 0; # reset position. + + } else { + + # + # Ignore: + # * white space + # * end of line (EOL) characters + # * lower- and/or uppercase (repeat) masking. + # + my $seq_line = $line; + $seq_line =~ s/[\s\n\r]+//g; + next if ($seq_line eq ''); + $seq_line = uc($seq_line); + + if ($seq_line =~ m/^([a-zA-Z*-]+)$/) { + + my $seq = $1; + my @acids = split(//, $seq); + + foreach my $acid(@acids) { + + $acid_composition_total{$acid}++; + + if ($get_positional_composition) { + + $acid_composition_positional{$acid}[$position_offset]++; + $position_offset++; + + } + } + + } else { + + $logger->warn('Weird line in FASTA file: ' . $line); + exit; + + } + } +} + +# +# Save stats. +# +print($stats_fh 'Sequences' . "\t" . $sequence_count . "\n"); +print($stats_fh 'Total acid composition:' . "\n"); +my $total_acid_count = 0; +foreach my $acid (sort(keys(%acid_composition_total))) { + my $acid_count = $acid_composition_total{$acid}; + print($stats_fh 'Acid ' . $acid . "\t" . $acid_count . "\n"); + $total_acid_count += $acid_count; +} +if ($get_positional_composition) { +print($stats_fh 'Positional acid composition:' . "\n"); + foreach my $acid (sort(keys(%acid_composition_positional))) { + print($stats_fh 'Acid ' . $acid); + for my $inx (1 .. scalar(@{$acid_composition_positional{$acid}})) { + my $acid_count; + if (defined($acid_composition_positional{$acid}[$inx-1])) { + $acid_count = $acid_composition_positional{$acid}[$inx-1]; + } else { + $acid_count = 0; + } + print($stats_fh "\t" . $acid_count); + } + print($stats_fh "\n"); + } +} + +print($stats_fh 'Total acids' . "\t" . $total_acid_count . "\n"); + +# +# Close filehandles. +# +close($fasta_fh); +close($stats_fh); + +$logger->info('Found ' . $total_acid_count . ' nucleotide/amino acids in ' . $sequence_count . ' sequences.'); +$logger->info('Finished!'); + +# +## +### Subs. +## +# + +sub _Usage { + + print "\n"; + print "FastaStats.pl - Reports statistics on a FASTA file like \n"; + print " * The number of sequences\n"; + print " * The total number of (nucleotide|amino) acids\n"; + print " * Sequence composition per (nucleotide|amino) acid\n"; + print "\n"; + print "Usage:\n"; + print "\n"; + print " FastaStats.pl options\n"; + print "\n"; + print "Available options are:\n"; + print "\n"; + print " -i [file] Input FASTA file.\n"; + print " -o [file] Output stats file.\n"; + print " -p Add positional stats to the output.\n"; + print " This will count the occurrance of each AA on each position of all sequences.\n"; + print " -l [LEVEL] Log4perl log level. One of: ALL, TRACE, DEBUG, INFO (default), WARN, ERROR, FATAL or OFF.\n"; + print "\n"; + exit; + +} diff -r 000000000000 -r 163892325845 FastaStats.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/FastaStats.xml Fri May 10 17:15:08 2013 -0400 @@ -0,0 +1,87 @@ + + + List statistics for sequences in a FASTA file + FastaStats.pl $get_positional_composition_stats -i $input -o $output -l WARN + + + + + + + + + + + + + + + +.. class:: infomark + +**What it does** + +This tool analyzes a collection of sequences in FASTA format and reports: \ + + - The total number of sequences. + - The total number of nucleotide or amino acids. + - The total frequency of nucleotide or amino acids. + - The positional frequency of nucleotide or amino acids (optional). + +----- + +**Example** + +If the FASTA sequence collection contains these two sequences:: + + >UniProtKB:Q42593 L-ascorbate peroxidase T, chloroplastic; + MSVSLSAASHLLCSSTRVSLSPAVTSSSSSPVVALSSSTSPHSLGSVASSSLFPHSSFVL + QKKHPINGTSTRMISPKCAASDAAQLISAKEDIKVLLRTKFCHPILVRLGWHDAGTYNKN + IEEWPLRGGANGSLRFEAELKHAANAGLLNALKLIQPLKDKYPNISYADLFQLASATAIE + EAGGPDIPMKYGRVDVVAPEQCPEEGRLPDAGPPSPADHLRDVFYRMGLDDKEIVALSGA + HTLGRARPDRSGWGKPETKYTKTGPGEAGGQSWTVKWLKFDNSYFKDIKEKRDDDLLVLP + TDAALFEDPSFKNYAEKYAEDVAAFFKDYAEAHAKLSNLGAKFDPPEGIVIENVPEKFVA + AKYSTGKKELSDSMKKKIRAEYEAIGGSPDKPLPTNYFLNIIIAIGVLVLLSTLFGGNNN + SDFSGF + >UniProtKB:A0MQ79 Ascorbate peroxidase; + MVKNYPVVSEEYLIAVDKAKKKLRGFIAEKNCAPLMLRLAWHSAGTFDQCSRTGGPFGTM + RFKAEQAHSANNGIDIAIRLLEPIKEQFPILSYADFYQLAGVVAVEVTGGPEVPFHPGRP + DKEEPPVEGRLPDAYKGSDHLRDVFIKQMGLSDQDIVALSGGHTLGRCHKERSGFEGPWT + ENPLIFDNSYFKELVCGERDGLLQLPSDKALLADPVFHPLVEKYAADEDAFFADYAEAHL + KLSELGFADA + +The reported stats (without optional positional acid frequencies) will be this:: + + Sequences 2 + Acid A 69 + Acid C 8 + Acid D 44 + Acid E 44 + Acid F 33 + Acid G 52 + Acid H 18 + Acid I 30 + Acid K 50 + Acid L 67 + Acid M 9 + Acid N 22 + Acid P 46 + Acid Q 13 + Acid R 26 + Acid S 57 + Acid T 23 + Acid V 37 + Acid W 7 + Acid Y 21 + Total acids 676 + + + diff -r 000000000000 -r 163892325845 GenerateDegenerateFasta.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/GenerateDegenerateFasta.pl Fri May 10 17:15:08 2013 -0400 @@ -0,0 +1,435 @@ +#!/usr/bin/perl -w +# +# This script generates a multi-sequence FASTA file +# with all possible combinations of sequences based +# on degenerate input sequences. +# +# ===================================================== +# $Id: GenerateDegenerateFasta.pl 67 2010-11-09 15:20:01Z pieter.neerincx@gmail.com $ +# $URL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/GenerateDegenerateFasta.pl $ +# $LastChangedDate: 2010-11-09 09:20:01 -0600 (Tue, 09 Nov 2010) $ +# $LastChangedRevision: 67 $ +# $LastChangedBy: pieter.neerincx@gmail.com $ +# ===================================================== + +# +# initialise environment +# +use strict; +use Getopt::Std; +use Log::Log4perl qw(:easy); + +my %log_levels = ( + 'ALL' => $ALL, + 'TRACE' => $TRACE, + 'DEBUG' => $DEBUG, + 'INFO' => $INFO, + 'WARN' => $WARN, + 'ERROR' => $ERROR, + 'FATAL' => $FATAL, + 'OFF' => $OFF, +); + +my %amino_acids = ( + 'A' => ['A'], + 'B' => ['D','N'], + 'C' => ['C'], + 'D' => ['D'], + 'E' => ['E'], + 'F' => ['F'], + 'G' => ['G'], + 'H' => ['H'], + 'I' => ['I'], + 'J' => ['I','L'], + 'K' => ['K'], + 'L' => ['L'], + 'M' => ['M'], + 'N' => ['N'], + 'O' => ['O'], + 'P' => ['P'], + 'Q' => ['Q'], + 'R' => ['R'], + 'S' => ['S'], + 'T' => ['T'], + 'U' => ['U'], + 'V' => ['V'], + 'W' => ['W'], + 'X' => ['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y'], # 20 regular Amino Acids. + 'X22' => ['A','C','D','E','F','G','H','I','K','L','M','N','O','P','Q','R','S','T','U','V','W','Y'], # 22 Amino Acids including the rare Selenocysteine and Pyrrolysine. + 'Y' => ['Y'], + 'Z' => ['E','Q'], +); + +my %dna = ( + 'A' => ['A'], + 'B' => ['C','G','T'], + 'C' => ['C'], + 'D' => ['A','G','T'], + 'G' => ['G'], + 'H' => ['A','C','T'], + 'K' => ['G','T'], + 'M' => ['A','C'], + 'N' => ['A','C','G','T'], + 'R' => ['A','G'], + 'S' => ['C','G'], + 'T' => ['T'], + 'V' => ['A','C','G'], + 'W' => ['A','T'], + 'Y' => ['C','T'], +); + +my %rna = ( + 'A' => ['A'], + 'B' => ['C','G','U'], + 'C' => ['C'], + 'D' => ['A','G','U'], + 'G' => ['G'], + 'H' => ['A','C','U'], + 'K' => ['G','U'], + 'M' => ['A','C'], + 'N' => ['A','C','G','U'], + 'R' => ['A','G'], + 'S' => ['C','G'], + 'U' => ['U'], + 'V' => ['A','C','G'], + 'W' => ['A','U'], + 'Y' => ['C','U'], +); + +# +# Get options. +# +my %opts; +Getopt::Std::getopts('i:p:s:t:o:x:l:', \%opts); + +my $input = $opts{'i'}; +my $prefix_column = $opts{'p'}; +my $sequence_column = $opts{'s'}; +my $acid_type = $opts{'t'}; +my $output = $opts{'o'}; +my $aa_x = $opts{'x'}; +my $log_level = $opts{'l'}; + +# +# Configure logging. +# +# Provides default if user did not specify log level: +$log_level = (defined($log_level) ? $log_level : 'INFO'); + +# Reset log level to default if user specified illegal log level. +$log_level = ( + defined($log_levels{$log_level}) + ? $log_levels{$log_level} + : $log_levels{'INFO'}); + +#Log::Log4perl->init('log4perl.properties'); +Log::Log4perl->easy_init( + + #{ level => $log_level, + # file => ">>GenerateDegenrateFasta.log", + # layout => '%F{1}-%L-%M: %m%n' }, + { + level => $log_level, + file => "STDOUT", + layout => '%d L:%L %p> %m%n' + }, +); +my $logger = Log::Log4perl::get_logger(); + +# +# Check user input. +# +unless (defined($input) && defined($output)) { + _Usage(); + exit; +} +if ($input =~ /^$/ || $output =~ /^$/) { + _Usage(); + exit; +} +if ($input eq $output) { + $logger->fatal('Output file is the same as the input file. Select a different file for the output.'); + exit; +} + +# +# Check input file. +# +unless (-e $input && -f $input && -r $input) { + $logger->fatal('Cannot read from input file ' . $input . ': ' . $!); + exit; +} + +$prefix_column = (defined($prefix_column) ? $prefix_column : 1); +unless ($prefix_column =~ m/^\d+$/ && $prefix_column > 0) { + $logger->fatal('Prefix column must be a positive integer.'); + exit; +} else { + $logger->debug('Prefix column = ' . $prefix_column); +} + +$sequence_column = (defined($sequence_column) ? $sequence_column : 2); +unless ($sequence_column =~ m/^\d+$/ && $sequence_column > 0) { + $logger->fatal('Sequence column must be a positive integer.'); + exit; +} else { + $logger->debug('Sequence column = ' . $sequence_column); +} + +$acid_type = (defined($acid_type) ? $acid_type : 'aa'); +unless ($acid_type eq 'aa' || $acid_type eq 'dna' || $acid_type eq 'rna') { + $logger->fatal('Illegal value for option -t. Value for acid type must be \'aa\' for amino acids, \'dna\' for deoxyribonucleic acids or \'rna\' for ribonucleic acids.'); + exit; +} + +my %acids; +if ($acid_type eq 'aa') { + $aa_x = (defined($aa_x) ? $aa_x : 20); + unless ($aa_x == 20 || $aa_x == 22) { + $logger->fatal('Illegal value for option -x. Value for amino acid \'X\' expansion must be 20 or 22.'); + exit; + } + %acids = %amino_acids; +} elsif ($acid_type eq 'dna') { + %acids = %dna; +} elsif ($acid_type eq 'rna') { + %acids = %rna; +} + +# +# Read degenerate sequences and their IDs from a tab delimited file +# +my $degenerate_sequences = _ParseInput($input, $prefix_column, $sequence_column); +my $degenerate_sequence_count = scalar(keys(%{$degenerate_sequences})); +$logger->info('Number of degenerate sequences: ' . $degenerate_sequence_count); + +# +# Generate FASTA DB with all possible permutations. +# +my ($counter) = _GenerateSeqs($degenerate_sequences, $output, $acid_type); + +$logger->info('Generated FASTA DB with ' . $counter . ' sequences.'); +$logger->info('Finished!'); + +# +## +### Internal subs. +## +# + +sub _ParseInput { + + my ($file_path, $prefix_column, $sequence_column) = @_; + + $logger->info('Parsing ' . $file_path . '...'); + + my %degenerate_sequences; + my $file_path_fh; + my $prefix_column_offset = $prefix_column - 1; + my $sequence_column_offset = $sequence_column - 1; + my $line_counter = 0; + + eval { + open($file_path_fh, "<$file_path"); + }; + if ($@) { + $logger->fatal('Cannot read input file: ' . $@); + exit; + } + + LINE: while (my $line = <$file_path_fh>) { + + $line =~ s/[\r\n]+//g; + $line_counter++; + + my @values = split("\t", $line); + + unless (defined($values[$prefix_column_offset])) { + $logger->fatal('Prefix missing on line ' . $line_counter . ' of the input file.'); + exit; + } + + unless (defined($values[$sequence_column_offset])) { + $logger->fatal('Sequence missing on line ' . $line_counter . ' of the input file.'); + exit; + } + + my $prefix = $values[$prefix_column_offset]; + my $degenerate_sequence = $values[$sequence_column_offset]; + + + unless ($prefix =~ m/[a-zA-Z0-9_:\.\-]/i) { + $logger->fatal('Prefix countains illegal characters on line ' . $line_counter . '. Prefix should contain only a-z A-Z 0-9 _ : . or -.'); + exit; + } + + unless ($degenerate_sequence =~ m/[a-zA-Z0-9_:\.\-]/i) { + $logger->fatal('Degenerate sequence countains illegal characters on line ' . $line_counter . '. Sequence should contain only a-z A-Z.'); + exit; + } + + $degenerate_sequences{$prefix} = $degenerate_sequence; + $logger->debug('Found degenerate sequence ' . $degenerate_sequence . ' with prefix ' . $prefix); + + } + + close($file_path_fh); + $logger->info('Parsed input.'); + + return (\%degenerate_sequences); + +} + +sub _GenerateSeqs { + + my ($degenerate_sequences, $path_to, $acid_type) = @_; + + $logger->info('Generating sequences...'); + + my $generated_seqs = 0; + my $path_to_fh; + + eval { + open($path_to_fh, ">$path_to"); + }; + if ($@) { + $logger->fatal('Cannot write FASTA file: ' . $@); + exit; + } + + DS:foreach my $prefix(sort(keys(%{$degenerate_sequences}))) { + + my $degenerate_sequence = ${$degenerate_sequences}{$prefix}; + $degenerate_sequence = uc($degenerate_sequence); + + $logger->debug("\t" . 'For ' . $prefix); + + my $suffix = 1; + my @degenerate_sequence_acids = split('', $degenerate_sequence); + my $new_sequences = []; + my $next_acid_count = 0; + + foreach my $next_acid (@degenerate_sequence_acids) { + + $next_acid_count++; + + unless (defined($acids{$next_acid})) { + $logger->error('Unknown (degenerate) acid ' . $next_acid . ' in input sequence ' . $prefix . ' (' . $degenerate_sequence . ').'); + $logger->warn("\t" . 'Skipping sequence ' . $prefix . '.'); + next DS; + } + + if ($acid_type eq 'aa') { + + if ($next_acid eq 'X') { + + # + # By default X will expand to the 20 most common amino acids, + # but if -x 22 was specified X will expand to all 22 amino acids + # including the very rare ones. + # + if ($aa_x == 22) { + + $next_acid = 'X22'; + + } + } + } + + $new_sequences = _GrowSequence ($new_sequences, $next_acid); + + my $sequence_count = scalar(@{$new_sequences}); + $logger->trace("\t\t" . $next_acid_count . ' Acid ' . $next_acid . ': ' . $sequence_count . ' sequences.'); + + } + + foreach my $new_sequence (@{$new_sequences}) { + + $generated_seqs++; + my $id = $prefix . '_' . $suffix; + + $logger->trace("\t\t\t" . $id . ' :: ' . $new_sequence); + + print($path_to_fh '>' . $id . "\n"); + # TODO: wrap long sequences over multiple lines. + print($path_to_fh $new_sequence . "\n"); + + $suffix++; + + } + } + + close($path_to_fh); + + return ($generated_seqs); + +} + +# +# Usage +# + +sub _GrowSequence { + + my ($sequences, $next_acid) = @_; + + my @larger_sequences; + + if (scalar(@{$sequences}) < 1) { + + foreach my $acid (@{$acids{$next_acid}}) { + + my $larger_sequence = $acid; + + push(@larger_sequences, $larger_sequence); + + } + + } else { + + foreach my $sequence (@{$sequences}) { + + foreach my $acid (@{$acids{$next_acid}}) { + + my $larger_sequence = $sequence . $acid; + + push(@larger_sequences, $larger_sequence); + + } + } + } + + return (\@larger_sequences); + +} + +sub _Usage { + + print STDERR "\n" + . 'GenerateDegenerateFasta.pl:' . "\n\n" + . ' Generates a multi-sequence FASTA file with all possible combinations of sequences ' . "\n" + . ' based on degenerate input sequences.' . "\n\n" + . 'Usage:' . "\n\n" + . ' GenerateDegenerateFasta.pl options' . "\n\n" + . 'Available options are:' . "\n\n" + . ' -i [file] Input file in tab delimited format.' . "\n" + . ' -p [number] Prefix column.' . "\n" + . ' Column from the input file containg strings that will be used as prefixes ' . "\n" + . ' to generate unique IDs for the FASTA sequences.' . "\n" + . ' -s [number] Sequence column.' . "\n" + . ' Column from the input file containg degenerate amino or nucleic acid sequences ' . "\n" + . ' that will be used to generate the FASTA sequences.' . "\n" + . ' For example RXX can be used to generate the amino acids sequences RAA, RAC, RAD ... RYY.' . "\n" + . ' -t [type] Acid type of the degenerate input sequences.' . "\n" + . ' Must be one of:' . "\n" + . ' aa - for amino acids' . "\n" + . ' dna - for deoxyribonucleic acids' . "\n" + . ' rna - for ribonucleic acids' . "\n" + . ' -o [file] Output file in FASTA format.' . "\n" + . ' -x [20|22] Indicates whether the degenerate amino acid X represents only the 20 most common amino acids (default).' . "\n" + . ' or all 22 amino acids including the rare Selenocysteine and Pyrrolysine.' . "\n" + . ' -l [LEVEL] Log4perl log level. One of: ALL, TRACE, DEBUG, INFO (default), WARN, ERROR, FATAL or OFF.' . "\n" + . "\n"; + exit; + +} diff -r 000000000000 -r 163892325845 GenerateDegenerateFasta.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/GenerateDegenerateFasta.xml Fri May 10 17:15:08 2013 -0400 @@ -0,0 +1,177 @@ + + + Creates a FASTA file with all possible sequences for degenerate sequences. + + #if $sequence_features.acid_type=="aa" #GenerateDegenerateFasta.pl -i $input -p $pcol -s $scol -t aa -o $output -x $sequence_features.xexpansion -l WARN + #elif $sequence_features.acid_type=="dna" #GenerateDegenerateFasta.pl -i $input -p $pcol -s $scol -t dna -o $output -l WARN + #elif $sequence_features.acid_type=="rna" #GenerateDegenerateFasta.pl -i $input -p $pcol -s $scol -t rna -o $output -l WARN + #end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +.. class:: infomark + +**What it does** + +This tool creates a multi-sequence FASTA file with all possible sequences based on degenerate input sequences. +The input must be a tab delimited file containing at least 2 columns. +One with the degenerate sequences and the other with a prefix that will be used to give each of generated sequences a unique identifier. +In addition to the prefix, the generated identifiers will contain an underscore followed by an incremented number. + +=================================================== +*Degenerate (wild card) amino acids* +=================================================== + +===================== =========================================== ==================================================================== +Amino Acid Expands to Comment +===================== =========================================== ==================================================================== +B D,N +J I,L +X A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y The 20 most common amino acids (default) or +X A,C,D,E,F,G,H,I,K,L,M,N,O,P,Q,R,S,T,U,V,W,Y All 22 amino acids including the rare Selenocysteine and Pyrrolysine +Z E,Q +===================== =========================================== ==================================================================== + +=================================================== +*Degenerate (wild card) deoxyribonucleic acids* +=================================================== + +===================== ================================ =============================================================================== +Deoxyribonucleic Acid Expands to Comment +===================== ================================ =============================================================================== +B C,G,T Not A; B follows A alphabetically +D A,G,T Not C; D follows C alphabetically +H A,C,T Not G; H follows G alphabetically +K G,T Keto +M A,C aMino +N A,C,G,T aNy +R A,G puRine +S C,G Strong interaction (3 H-bonds) +V A,C,G Not T (and not U); V follows U alphabetically +W A,T Weak interaction (2 H-bonds) +Y C,T pYrimidine +===================== ================================ =============================================================================== + +=================================================== +*Degenerate (wild card) ribonucleic acids* +=================================================== + +===================== ================================ =============================================================================== +Ribonucleic Acid Expands to Comment +===================== ================================ =============================================================================== +B C,G,U Not A; B follows A alphabetically +D A,G,U Not C; D follows C alphabetically +H A,C,U Not G; H follows G alphabetically +K G,U Keto +M A,C aMino +N A,C,G,U aNy +R A,G puRine +S C,G Strong interaction (3 H-bonds) +V A,C,G Not U; V follows U alphabetically +W A,U Weak interaction (2 H-bonds) +Y C,U pYrimidine +===================== ================================ =============================================================================== + +----- + +**Example** + +If the degenerate input contains these two peptides:: + + Seq1 AXY + Seq2 SJT + +The generated FASTA sequences will be this:: + + >Seq1_1 + AAY + >Seq1_2 + ACY + >Seq1_3 + ADY + >Seq1_4 + AEY + >Seq1_5 + AFY + >Seq1_6 + AGY + >Seq1_7 + AHY + >Seq1_8 + AIY + >Seq1_9 + AKY + >Seq1_10 + ALY + >Seq1_11 + AMY + >Seq1_12 + ANY + >Seq1_13 + APY + >Seq1_14 + AQY + >Seq1_15 + ARY + >Seq1_16 + ASY + >Seq1_17 + ATY + >Seq1_18 + AVY + >Seq1_19 + AWY + >Seq1_20 + AYY + >Seq2_1 + SIT + >Seq2_2 + SLT + + + diff -r 000000000000 -r 163892325845 LICENSE --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/LICENSE Fri May 10 17:15:08 2013 -0400 @@ -0,0 +1,23 @@ +Copyright (C) 2009-2013 by NBIC (www.nbic.nl) + +This license holds for all files within this directory/repository that do not +contain an explicit reference to another license. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. + diff -r 000000000000 -r 163892325845 ProteinDigestor.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ProteinDigestor.pl Fri May 10 17:15:08 2013 -0400 @@ -0,0 +1,820 @@ +#!/usr/bin/perl -w + +############################################################### +# ProteinDigestor.pl +# +# ===================================================== +# $Id: ProteinDigestor.pl 76 2011-01-04 13:16:56Z pieter.neerincx@gmail.com $ +# $URL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/ProteinDigestor.pl $ +# $LastChangedDate: 2011-01-04 07:16:56 -0600 (Tue, 04 Jan 2011) $ +# $LastChangedRevision: 76 $ +# $LastChangedBy: pieter.neerincx@gmail.com $ +# ===================================================== +# +# (c) Dr. Ir. B. van Breukelen et al. +# https://bioinformatics.chem.uu.nl +# http://www.netherlandsproteomicscentre.eu/ +# b.vanbreukelen@pharm.uu.nl +# +# Software to create peptide databases from a fasta +# file of proteins. You can cut with several enzymes +# select on size, PI, etc etc etc. +# +# TODO: put filters, enzymes, in seperate config file +############################################################### + +############################# +# Initialise environment +############################# +use strict; +use Getopt::Std; +use Log::Log4perl qw(:easy); + +# +# Log levels for Log4perl. +# +my %log_levels = ( + 'ALL' => $ALL, + 'TRACE' => $TRACE, + 'DEBUG' => $DEBUG, + 'INFO' => $INFO, + 'WARN' => $WARN, + 'ERROR' => $ERROR, + 'FATAL' => $FATAL, + 'OFF' => $OFF, +); + +# +# pK values array +# +my %cPk = ( + 'A'=>[3.55, 7.59, 0 , 0 , 0], + 'B'=>[3.55, 7.50, 0 , 0 , 0], + 'C'=>[3.55, 7.50, 9.00 , 9.00 , 9.00], + 'D'=>[4.55, 7.50, 4.05 , 4.05 , 4.05], + 'E'=>[4.75, 7.70, 4.45 , 4.45 , 4.45], + 'F'=>[3.55, 7.50, 0 , 0. , 0], + 'G'=>[3.55, 7.50, 0 , 0. , 0], + 'H'=>[3.55, 7.50, 5.98 , 5.98 , 5.98], + 'I'=>[3.55, 7.50, 0 , 0. , 0.], + 'J'=>[0.00, 0.00, 0 , 0. , 0.], + 'K'=>[3.55, 7.50, 10.00, 10.00, 10.00], + 'L'=>[3.55, 7.50, 0 , 0. , 0.], + 'M'=>[3.55, 7.00, 0 , 0. , 0.], + 'N'=>[3.55, 7.50, 0 , 0. , 0.], + 'O'=>[0.00, 0.00, 0 , 0. , 0.], + 'P'=>[3.55, 8.36, 0 , 0. , 0.], + 'Q'=>[3.55, 7.50, 0 , 0. , 0.], + 'R'=>[3.55, 7.50, 12.0 , 12.0 , 12.0], + 'S'=>[3.55, 6.93, 0 , 0. , 0.], + 'T'=>[3.55, 6.82, 0 , 0. , 0.], + 'U'=>[0.00, 0.00, 0 , 0. , 0.], + 'V'=>[3.55, 7.44, 0 , 0. , 0.], + 'W'=>[3.55, 7.50, 0 , 0. , 0.], + 'X'=>[3.55, 7.50, 0 , 0. , 0.], + 'Y'=>[3.55, 7.50, 10.00, 10.00, 10.00], + 'Z'=>[3.55, 7.50, 0 , 0. , 0.] +); + +# +# MW values array +# +my %cMWs = ( + 'A'=>89.09, + 'B'=>132.65, + 'C'=>121.15, + 'D'=>133.1, + 'E'=>147.13, + 'F'=>165.19, + 'G'=>75.07, + 'H'=>155.16, + 'I'=>131.18, + 'J'=>0.00, + 'K'=>146.19, + 'L'=>131.18, + 'M'=>149.22, + 'N'=>132.12, + 'O'=>0.00, + 'P'=>115.13, + 'Q'=>146.15, + 'R'=>174.21, + 'S'=>105.09, + 'T'=>119.12, + 'U'=>168.05, + 'V'=>117.15, + 'W'=>204.22, + 'X'=>129.26, + 'Y'=>181.19, + 'Z'=>146.73 +); + +# +# Digestion enzymes. +# +# For example Trypsin cuts c-terminal of K and R not followed by a P at position p1 = c:K,R:P=1 +# +my %enzymes = ( + 'Trypsin' => 'c:K,R:P=1', + 'Trypsin/P' => 'c:K,R', + 'Chymotrypsin' => 'c:F,L,W,Y:P=1', + 'Chymotrypsin/P' => 'c:F,L,W,Y', + 'V8_E' => 'c:E,Z:P=1', + 'V8_DE' => 'c:E,D,B,Z:P=1', + 'LysC' => 'c:K:P=1', + 'LysC/P' => 'c:K', + 'LysN/P' => 'n:K' +); + +my @filters = ( +# "[R|K][R|K][R|K]..[S|T]." +# "[R|K][R|K][R|K].[S|T]." , +# "[R|K][R|K]..[S|T].", +# "[R|K]..[S|T].", +# "[R|K][R|K].[S|T].", +# "[R|K].[S|T]." +# "[S|T].[R|K]" +); + +my %charges = ('A'=>0, 'B'=>0, 'C'=>0, 'D'=>-1, 'E'=>-1, 'F'=>0,'G'=>0, 'H'=>1, 'I'=>0, + 'J'=>0, 'K'=>1, 'L'=>0, 'M'=>0, 'N'=>0, 'O'=>0,'P'=>0, 'Q'=>0, 'R'=>1, + 'S'=>0, 'T'=>0, 'U'=>0, 'V'=>0, 'W'=>0, 'X'=>0,'Y'=>0, 'Z'=>0); + +my %pept = (); #hashed array of peptides (HASH) and fasta headers (VALUE) + +my $H20 = 18.015; #Mol. weight water + +my $PH_MIN = 0.0; #min pH value +my $PH_MAX = 14.0; #max pH value +my $MAXLOOP = 2000; # max number iterations +my $EPSI = 0.0001; # desired presision + +# +# Get options. +# +my %opts; +Getopt::Std::getopts('i:o:e:r:p:m:n:c:l:sa', \%opts); + +my $input = $opts{'i'}; +my $output = $opts{'o'}; +my $enzyme = $opts{'e'}; +my $enzyme_cleavage_site_rules = $opts{'r'}; +my $partial_cleavage = $opts{'s'}; +my $partial_cleavage_chance = $opts{'p'}; +my $min_peptide_weight = $opts{'m'}; +my $max_peptide_weight = $opts{'n'}; +my $max_charge = $opts{'c'}; +my $add_sequence_context = $opts{'a'}; +my $log_level = $opts{'l'}; + +# +# Configure logging. +# +# Provides default if user did not specify log level: +$log_level = (defined($log_level) ? $log_level : 'INFO'); + +# Reset log level to default if user specified illegal log level. +$log_level = ( + defined($log_levels{$log_level}) + ? $log_levels{$log_level} + : $log_levels{'INFO'}); + +#Log::Log4perl->init('log4perl.properties'); +Log::Log4perl->easy_init( + + #{ level => $log_level, + # file => ">>ProteinDigestor.log", + # layout => '%F{1}-%L-%M: %m%n' }, + { + level => $log_level, + file => "STDOUT", + layout => '%d L:%L %p> %m%n' + }, +); +my $logger = Log::Log4perl::get_logger(); + +# +# Check user input. +# +unless (defined($input) && defined($output)) { + _Usage(); + exit; +} +if ($input =~ /^$/ || $output =~ /^$/) { + _Usage(); + exit; +} +if ($input eq $output) { + $logger->fatal('Output file is the same as the input file. Select a different file for the output.'); + exit; +} + +# +# Check input file. +# +unless (-e $input && -f $input && -r $input) { + $logger->fatal('Cannot read from input file ' . $input . ': ' . $!); + exit; +} + +# +# Set additional defaults. +# +$partial_cleavage = (defined($partial_cleavage) ? $partial_cleavage : 0); # Disabled by default +$partial_cleavage_chance = (defined($partial_cleavage_chance) ? $partial_cleavage_chance : 0.1); # 10% change by default +$add_sequence_context = (defined($add_sequence_context) ? $add_sequence_context : 0); # Disabled by default + +######################################################## +# MAIN PROGRAM +######################################################## + +$logger->info('Starting...'); + +# +# Get protease cleavage site specs. +# +unless (defined($enzyme_cleavage_site_rules)) { + + # + # Set default to Trypsin if the user did not specify an enzyme name nor a string with enzyme cleavage site rules. + # + $enzyme = (defined($enzyme) ? $enzyme : 'Trypsin'); + + # + # Check for illegal enzymes. + # + unless (defined($enzymes{$enzyme})) { + $logger->fatal('Unkown enzyme ' . $enzyme . '.'); + exit; + } + + # + # Fetch enzyme cleavage site rules. + # + $enzyme_cleavage_site_rules = $enzymes{$enzyme}; + +} + +$logger->info('Using enzyme cleavage site rules: ' . $enzyme_cleavage_site_rules); +my @split_params = split(/:/, $enzyme_cleavage_site_rules); +my $cut_term = $split_params[0]; +my @cut_aa = split(/,/, $split_params[1]); +my @inhibition_aa; +if (defined($split_params[2])) { + @inhibition_aa = split(/,/, $split_params[2]); +} +my $annotation = {}; +my $sequence = ''; + +# +# Create filehandles. +# +open(my $input_fh, "$input"); +open(my $output_fh, ">$output"); + +# +# Write header to result file. +# +print($output_fh "Protein ID\tPeptide\tMW\tpI\tCharge\tnumber S\tnumber T\tnumber Y\n"); + +# +# Digest fasta files and store peptides in an array (hashed) +# +foreach my $line (<$input_fh>){ + + if ($line =~ /^>/){ + + # + # We found a new FASTA header. + # + + # + # Process the previously found protein if we already found one. + # + unless ($sequence eq '') { + _ProcessProtein(); + } + + # + # Parse new FASTA header. + # + $line =~ s/[\r\n\f]+//g; + $annotation = _ParseHeader($line); + $sequence = ''; + + } else { + + # + # Found a sequence line. + # + $line =~ s/[\t\s\r\n\f0-9\*\-]+//g; + $sequence .= $line; + } +} + +# +# Don't foget to process the last sequence! +# +_ProcessProtein(); + +close($input_fh); +close($output_fh); + +$logger->info('Finished!'); + +######################################################### +# SUBS +######################################################### + +sub _ProcessProtein { + + my $id = ${$annotation}{'ID'}; + $logger->debug('Accession/ID: ' . $id); + $logger->debug('Sequence: ' . $sequence . "\n"); + my $total_length_sequence = length($sequence); + my $total_peptide_length = 0; + + my ($peptides) = _Digest($sequence, $annotation, \@cut_aa, $cut_term, \@inhibition_aa); + + # + # Process peptides. + # + foreach my $peptide_with_context (sort(keys(%{$peptides}))) { + + # + # Get peptide properties. + # + $logger->debug('Peptide with context: ' . $peptide_with_context); + my ($bare_peptide) = _ParsePeptide($peptide_with_context); + my $peplength = length($bare_peptide); + my ($pepweight) = _MwCalc($bare_peptide); + + # + # Apply filters + # + # Filter 1 is on weight. + # + if (defined($min_peptide_weight)) { + + $logger->debug('Minimum weight filter:'); + $logger->debug("\t" . 'Peptide weight: ' . $pepweight); + + if ($pepweight >= $min_peptide_weight) { + + $logger->debug("\t" . 'PASSED - Weight is >= ' . $min_peptide_weight); + + } else { + + # Skip this one. + $logger->debug("\t" . 'REJECTED - Weight is < ' . $min_peptide_weight); + next; + + } + } + + if (defined($max_peptide_weight)) { + + $logger->debug('Maximum weight filter:'); + $logger->debug("\t" . 'Peptide weight: ' . $pepweight); + + if ($pepweight <= $max_peptide_weight) { + + $logger->debug("\t" . 'PASSED - Weight is <= ' . $max_peptide_weight); + + } else { + + # Skip this one. + $logger->debug("\t" . 'REJECTED - Weight is > ' . $max_peptide_weight); + next; + + } + } + + # + # Filter 2 is on sequence motifs. + # + if (scalar(@filters) > 0) { + + $logger->debug('Motif filter:'); + my ($motif) = _ContainsMotif($bare_peptide, \@filters); + + if ($motif > 0) { + + $logger->debug("\t" . 'PASSED - Peptide contains motif ' . $motif); + + } else { + + # Skip this one. + $logger->debug("\t" . 'REJECTED - Peptide does not contain any motif.'); + next; + + } + } + + my ($pepPI) = _PiCalc($bare_peptide); + my ($pepCharge) = _ChargeCalc($bare_peptide); + + # + # Filter 3 is on the amount of charges. + # + if (defined($max_charge)) { + + $logger->debug('Charge filter:'); + $logger->debug("\t" . 'Peptide charge: ' . $pepCharge); + + if (defined($max_charge) && $pepCharge <= $max_charge) { + + $logger->debug("\t" . 'PASSED - Charge <= ' . $max_charge); + + } else { + + # Skip this one. + $logger->debug("\t" . 'REJECTED - Charge > ' . $max_charge); + next; + + } + } + + # + # If there is no partial digestion we can calculated the % coverage. + # + unless ($partial_cleavage) { + $total_peptide_length += length($bare_peptide); + } + + my ($residue_count) = _CountResiduesSTY($bare_peptide); + + # + # Store peptide. + # + my $pep_sequence; + + if ($add_sequence_context) { + $pep_sequence = $peptide_with_context; + } else { + $pep_sequence = $bare_peptide; + } + + print($output_fh $id . "\t" . $pep_sequence . "\t" . $pepweight . "\t" . $pepPI. "\t" . $pepCharge. "\t" . $residue_count . "\n"); + + } + + # + # If partial digestion was not enabled, we can calculated the % coverage. + # + unless ($partial_cleavage) { + my $coverage = ($total_peptide_length / $total_length_sequence)*100; + $logger->debug("Sequence residues: $total_length_sequence | #peptide residues: $total_peptide_length | %Coverage: $coverage"); + } +} + +sub _ParseHeader { + + my ($header) = @_; + my %annotation; + my $ids; + my $id; + my $description; + + $header=~ s/^>//; + + if ($header =~ m/([^\s]+)\s+([^\s]+.*)/) { + $ids = $1; + $description = $2; + } else { + $ids = $header; + } + + # + # Redimentory support for detecting multiple IDs: check for pipe sperated IDs. + # + if ($ids =~ m/([^\|])\|/) { + $id = $1; + } else { + $id = $ids; + } + + $annotation{'ID'} = $id; + $annotation{'DE'} = $description; + + return (\%annotation); + +} + +sub _ContainsMotif{ + + my ($peptide, $filters) = @_; + my $cnt = 0; + + foreach my $filter (@{$filters}){ + + $cnt++; + if ($peptide =~ /$filter/){ + return($cnt); + } + + } + + return(0); + +} + +sub _ParsePeptide{ + + my ($peptide_with_flanking_context) = @_; + + my @pep = split(/\./, $peptide_with_flanking_context); + my $bare_peptide = uc($pep[1]); + + return($bare_peptide); + +} + +sub _Digest{ + + my ($sequence, $annotation, $enz_cutting, $cut_term, $enz_restrict) = @_; + + $sequence = uc($sequence); + my %pept = (); + + # we start by scanning the sequence.. to find one of the cutting rulez not followed by a restict rule + # what is the sequence length + my @array = split(//, $sequence); + my $length = scalar(@array); + my $start_offset = 0; + my $offset_adjust_for_cleavage_terminus = 0; + if ($cut_term eq 'n') { + $offset_adjust_for_cleavage_terminus = 1; + } + + my $peptide = ''; + + my $counter = 1; + if (!($partial_cleavage)) { + $partial_cleavage_chance = 0; + } else { + $counter = 100; + } + + #this is for incomplete cleavage... we repeat digest 100 times with a probability that the enzyme will not cut + for (my $cnt = 0; $cnt < $counter; $cnt++) { + + AA:for (my $aa_offset = 0 + $offset_adjust_for_cleavage_terminus; $aa_offset < ($length - 1 + $offset_adjust_for_cleavage_terminus); $aa_offset++) { + + my $this_aa = substr($sequence, $aa_offset, 1); + + # + # Check if protease recognizes this amino acid as cleavage site. + # + foreach my $cut_aa (@{$enz_cutting}) { + + if ($this_aa eq $cut_aa) { + + # + # Check if AA is not preceeded or followed by an AA that inhibits the protease... + # + foreach my $inhibit (@{$enz_restrict}) { + + + my @inhibit_conditions = split(/=/, $inhibit); + my $inhibit_aa = $inhibit_conditions[0]; + my $position_to_check = $inhibit_conditions[1]; + + my $neighborhood_aa = substr($sequence, $aa_offset + $position_to_check, 1); + + if ($neighborhood_aa eq $inhibit_aa){ + $logger->debug('No cleavage due to inhibition by ' . $inhibit_aa . ' at position ' . $position_to_check); + next AA; + } + } + + # + # Consider the possibility of not cutting when doing incomplete digestions! + # + my $random = 1; + if ($partial_cleavage){ + $random = rand(1); + } + if ($random <= $partial_cleavage_chance){ + $logger->debug('No cleavage due to incomplete digestion.'); + next AA; + } + + # + # If no inhibition spoiled the cleavage, lets cut! + # + my $cut_offset; + if ($cut_term eq 'c') { + $cut_offset = $aa_offset + 1; + } elsif ($cut_term eq 'n') { + $cut_offset = $aa_offset; + } + + # nice formatting of peptide + if ($start_offset == 0){ + $peptide .="terminus."; + $peptide .= substr($sequence, $start_offset, $cut_offset - $start_offset)."."; + }else{ + $peptide .= substr($sequence, $start_offset - 3, 3)."."; + $peptide .= substr($sequence, $start_offset, $cut_offset - $start_offset)."."; + } + $peptide .= substr($sequence, $cut_offset, 3); + $start_offset = $cut_offset; + + # check if peptide already exists + # if so, concatenate this fasta header + #if (exists($pept{$peptide})){ + # $fasta_header .= $pept{$peptide}; + #} + $pept{$peptide} = ${$annotation}{'ID'}; + $peptide = ''; + } + } + } + + # + # The last peptide sequence ! + # + $peptide = substr($sequence, $start_offset - 3, 3)."."; + $peptide .= substr($sequence, $start_offset, $length); + $peptide .= ".terminus"; + $pept{$peptide} = ${$annotation}{'ID'}; + + } #COUNT + + return(\%pept); + +} + +sub _PiCalc{ + + my ($sequence) = @_; + + my $nTermResidue; + my $cTermResidue; + my $phMin;my $phMid ;my $phMax; + my $charge; + my $cter; my $nter; my $carg; my $chis; my $clys;my $casp;my $cglu;my $ccys;my $ctyr; + + my $composition = _GetComposition($sequence); + my $strlength = length($sequence); + $nTermResidue = substr($sequence, 0, 1); + for ($nTermResidue){s/[\s\n\r\t]//g;}; + $cTermResidue = substr($sequence, $strlength - 1, 1); + for ($cTermResidue){s/[\s\n\r\t]//g;}; + $phMin = $PH_MIN; + $phMax = $PH_MAX; + my $i; + + for ($i=0, $charge = 1.0; $i<$MAXLOOP && ($phMax-$phMin)>$EPSI; $i++){ + $phMid = $phMin + ($phMax - $phMin) / 2; + $cter = 10**(-$cPk{$cTermResidue}->[0]) / (10**(-$cPk{$cTermResidue}->[0]) + 10**(-$phMid)); + $nter = 10**(-$phMid)/(10**(-$cPk{$nTermResidue}->[1])+10**(-$phMid)); + + $carg = ${$composition}{R} * 10**(-$phMid) / (10**(-$cPk{'R'}->[2])+10**(-$phMid)); + $chis = ${$composition}{H} * 10**(-$phMid) / (10**(-$cPk{'H'}->[2])+10**(-$phMid)); + $clys = ${$composition}{K} * 10**(-$phMid) / (10**(-$cPk{'K'}->[2])+10**(-$phMid)); + + $casp = ${$composition}{D} * 10**(-$cPk{'D'}->[2]) / (10**(-$cPk{'D'}->[2])+10**(-$phMid)); + $cglu = ${$composition}{E} * 10**(-$cPk{'E'}->[2]) / (10**(-$cPk{'E'}->[2])+10**(-$phMid)); + + $ccys = ${$composition}{C} * 10**(-$cPk{'C'}->[2]) / (10**(-$cPk{'C'}->[2])+10**(-$phMid)); + $ctyr = ${$composition}{Y} * 10**(-$cPk{'Y'}->[2]) / (10**(-$cPk{'Y'}->[2])+10**(-$phMid)); + + $charge = $carg + $clys + $chis + $nter - ($casp + $cglu + $ctyr + $ccys + $cter); + + if ($charge > 0.0){ + $phMin = $phMid; + }else{ + $phMax = $phMid; + } + } + + return($phMid); + +} + +sub _MwCalc{ + + my ($sequence) = @_; + + my $mw; + my $strlength = length($sequence); + my $composition = _GetComposition($sequence); + + # subtract N-1 water molecules + $mw = - ($strlength -1) * $H20; + + foreach my $aa (sort keys %{$composition}){ + $logger->trace('AA:' . $aa . ' | frequency: ' . ${$composition}{$aa} . ' | AA MW: ' . $cMWs{$aa}); + $mw += ${$composition}{$aa} * $cMWs{$aa}; + } + + return($mw); + +} + +sub _ChargeCalc{ + + my ($sequence) = @_; + + my $charge; + my $composition = _GetComposition($sequence); + + foreach my $aa (sort(keys(%{$composition}))){ + $charge += ${$composition}{$aa} * $charges{$aa}; + } + + return($charge); + +} + +sub _CountResiduesSTY{ + + my ($sequence) = @_; + + my $composition = _GetComposition($sequence); + my $sty_residues = ${$composition}{'S'} . "\t" . ${$composition}{'T'} . "\t" . ${$composition}{'Y'}; + + return($sty_residues); + +} + +sub _GetComposition { + + my ($sequence) = @_; + + my $i; + my $c; + my %composition = ('A'=>0, 'B'=>0,'C'=>0, 'D'=>0, 'E'=>0, 'F'=>0,'G'=>0, 'H'=>0, 'I'=>0, + 'J'=>0, 'K'=>0,'L'=>0, 'M'=>0, 'N'=>0, 'O'=>0,'P'=>0, 'Q'=>0, 'R'=>0, + 'S'=>0, 'T'=>0,'U'=>0, 'V'=>0, 'W'=>0, 'X'=>0,'Y'=>0, 'Z'=>0); + + # + # Compute aminoacid composition. + # + my $strlength = length($sequence); + for ($i=0; $i<$strlength; $i++){ + $c = substr($sequence, $i, 1); + $composition{$c}++; + } + + return(\%composition); + +} + +sub _Usage { + + my $longest_enzyme_name_length = 0; + foreach my $protease (keys(%enzymes)) { + if (length($protease) > $longest_enzyme_name_length) { + $longest_enzyme_name_length = length($protease); + } + } + my $field_formatting = '%-' . ($longest_enzyme_name_length + 3) . 's'; + + print "\n"; + print "ProteinDigestor.pl - Digests protein sequences from a FASTA file\n"; + print "\n"; + print "Usage:\n"; + print "\n"; + print " ProteinDigestor.pl options\n"; + print "\n"; + print "Available options are:\n"; + print "\n"; + print " -i [file] Input FASTA file.\n"; + print " -o [file] Output stats file.\n"; + print "\n"; + print " You can either specify a protease this tool knows with -e\n"; + print " or you can specify the cleavage rule for a protease this does not know (yet) with -r\n"; + print "\n"; + print " -e [enzyme_name] Digestion enzyme. Known enzymes and their cleavage rules:\n"; + + foreach my $protease (sort(keys(%enzymes))) { + print ' ' . sprintf("$field_formatting", $protease) . $enzymes{$protease} . "\n"; + } + + print " -r [enzyme_rule] Protease cleavage site rule. This rule contains sperated by colons:\n"; + print " * The terminus that indicates on which side of a recognized amino acid the protease will cleave: c or n.\n"; + print " * The amino acids recognized by the protease.\n"; + print " Multiple amino acids separated by a comma.\n"; + print " * The amino acids that inhibit cleavage and their position relative to amino acids recognized by the protease.\n"; + print ' Amino acid and its relative position separated by an equals sign (=).' . "\n"; + print " Multiple amino acids separated by a comma.\n"; + print ' For example \'c:K,R:P=1\' for Trypsin with proline inhibition.' . "\n"; + print " -s Semi for partial digestion.\n"; + print " -p [chance] Number between 0 and 1 indicating the chance that a site will not be cleaved.\n"; + print " -m [weight] Minimum weight for a peptide.\n"; + print " -n [weight] Maximum weight for a peptide.\n"; + print " -c [int] Maximum charge a peptide is allowed to have.\n"; + print " -a Add sequence context.\n"; + print " This will add max 3 amino acids or the word terminus on each side of the peptides.\n"; + print " Contexts and peptides are separated with a dot.\n"; + print " Examples of peptides with a context:\n"; + print " terminus.MSVSLSAK.ASH\n"; + print " SAK.ASHLLCSSTR.VSL\n"; + print " STR.VSLSPAVTSSSSSPVVRPALSSSTS.terminus\n"; + print " -l [LEVEL] Log4perl log level. One of: ALL, TRACE, DEBUG, INFO (default), WARN, ERROR, FATAL or OFF.\n"; + print "\n"; + exit; + +} diff -r 000000000000 -r 163892325845 ProteinDigestor.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ProteinDigestor.xml Fri May 10 17:15:08 2013 -0400 @@ -0,0 +1,93 @@ + + + In silico digestion of proteins into peptides + ProteinDigestor.pl -i $input -o $output -r '$rule' -l ERROR + + + + + + + + + + + + + + + + + + + + + +.. class:: infomark + +**What it does** + +This tool digests protein sequences from a FASTA file.\ +In addition some statistics are calculated for each peptide: + + * Molecular weight in Da (MW). + * Isoelectric point (pI) + * Net charge assuming K, R and H add +1 and D and E add -1 to the net charge of the peptide. + * The mount of potential phosphorilation sites (S, T or Y amino acids). + +**Getting input data** + +This tool requires protein sequences in FASTA format. \ +If your protein sequences are not in FASTA format, you'll have to convert it first.\ +For sequences in tab delimited files for example you can use the *FASTA manipulation* -> *Tabular-to-FASTA* tool. + +----- + +**Example** + +If for example the protein sequences in FASTA format would be this:: + + >FamousDatabase:Q42593Z L-ascorbate peroxidase T, chloroplastic; + MSVSLSAKASHLLCSSTRVSLSPAVTSSSSSPVVRPALSSSTS* + >FamousDatabase:A0MQ79Z Ascorbate peroxidase; + MVKPNYPVVSEEYLIAVDKAKKKLRGFIAEKNCAPLMLRL* + +and you would perform an *in silico* digest with Trypsin, the result will look like this:: + + Protein ID Peptide MW pI Charge number S number T number Y + FamousDatabase:Q42593Z ASHLLCSSTR 1074.225 8.30219268798828 2 3 1 0 + FamousDatabase:Q42593Z VSLSPAVTSSSSSPVVRPALSSSTS 2390.61 9.72000885009766 1 11 2 0 + FamousDatabase:Q42593Z MSVSLSAK 821.995 8.50011444091797 1 3 0 0 + FamousDatabase:A0MQ79Z NCAPLMLR 917.175 8.24974822998047 1 0 0 0 + FamousDatabase:A0MQ79Z K 146.19 8.75005340576172 1 0 0 0 + FamousDatabase:A0MQ79Z K 146.19 8.75005340576172 1 0 0 0 + FamousDatabase:A0MQ79Z LR 287.375 9.75002288818359 1 0 0 0 + FamousDatabase:A0MQ79Z GFIAEK 663.775 6.00136566162109 0 0 0 0 + FamousDatabase:A0MQ79Z L 131.18 5.52498626708984 0 0 0 0 + FamousDatabase:A0MQ79Z AK 217.265 8.79502105712891 1 0 0 0 + FamousDatabase:A0MQ79Z MVKPNYPVVSEEYLIAVDK 2194.59 4.67711639404297 -1 1 0 2 + + +======================================================== +*Need to digest with another protease?* +======================================================== + +Contact your local bioinformaticians to add other proteases... + + + diff -r 000000000000 -r 163892325845 README --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README Fri May 10 17:15:08 2013 -0400 @@ -0,0 +1,1 @@ +This tool shed contains the Galaxy-P fork of the NBIC FASTA tools. The original tools can be found in NBIC's SVN Galaxy repository: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/.