Mercurial > repos > galaxyp > nbic_fasta

diff ExtractPeptideSequenceContext.pl @ 0:163892325845 draft default tip
Initial commit.
author: galaxyp
date: Fri, 10 May 2013 17:15:08 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ExtractPeptideSequenceContext.pl	Fri May 10 17:15:08 2013 -0400
@@ -0,0 +1,1744 @@
+#!/usr/bin/perl -w
+# 
+# This script extracts subsequences from UniProtKB *.dat files or from *.fasta files
+# based on a list of accession numbers / IDs and a sequence fragment (peptide).
+#
+# =====================================================
+# $Id: ExtractPeptideSequenceContext.pl 112 2011-03-01 19:50:23Z pieter.neerincx@gmail.com $
+# $HeadURL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/ExtractPeptideSequenceContext.pl $
+# $LastChangedDate: 2011-03-01 13:50:23 -0600 (Tue, 01 Mar 2011) $ 
+# $LastChangedRevision: 112 $
+# $LastChangedBy: pieter.neerincx@gmail.com $
+# =====================================================
+#
+
+#
+# initialise environment
+#
+use strict;
+use Getopt::Long;
+use SWISS::Entry; # For parsing UniProtKB *.dat files.
+use Log::Log4perl qw(:easy);
+
+my $ps = '/'; # Path Separator.
+my %log_levels = (
+	'ALL'	=> $ALL,
+	'TRACE'	=> $TRACE,
+	'DEBUG'	=> $DEBUG,
+	'INFO'	=> $INFO,
+	'WARN'	=> $WARN,
+	'ERROR'	=> $ERROR,
+	'FATAL'	=> $FATAL,
+	'OFF'	=> $OFF,
+);
+
+#
+# List of databases, which may contain versioned accession numbers,
+# and the separation character used to join the accession number and
+# the version number. 
+#
+my %databases_with_optionally_versioned_ids = (
+	'IPI'					=> '.',
+	'ipi'					=> '.',
+	'UniProtKB/Swiss-Prot'	=> '.',
+	'sp'					=> '.',
+	'SP'					=> '.',
+	'UniProtKB/TrEMBL'		=> '.',
+	'TR'					=> '.',
+	'tr'					=> '.',
+	'Tr'					=> '.'
+);
+
+my $db;
+my $db_alt;
+my $db_format;
+my $index_id; 					# Key used to lookup sequences. Can be ID or Accession number (default).
+my $fragments;
+my $id_column;
+my $peptide_column;
+my $strip_lc;					# For the sequence fragments in the fragments file. With strip_lc enabled we 
+								# assume amino acids are in upper case and lower case characters represent modifications, 
+								# which will be stripped off before searching for the fragments in the complete sequences.
+my $cleavage_output;			# Sequence contexts for cleavage sites.
+my $miscleavage_output;			# Sequence contexts for miscleavage sites.
+my $modification_output;		# Sequence contexts for modification sites.
+my $peptide_output;				# Sequence contexts for complete peptides.
+my $cleavage_aa;				# Assume the sequence fragments were derived from cutting at this amino acid. 
+my $cleavage_terminus;			# Assume the sequence fragments were derived from cutting at this side of $cut_at_aa 
+my $modified_aa;		
+my $n_term_context_length;
+my $c_term_context_length;
+my $miscleavage_distance;		# Minimal distance between a putative miscleavage site and the peptide termini.
+								# Uncleaved AAs closer to the peptide termini can be ignored with this parameter:
+								#  A. To prevent overlap between cleavage and miscleavage peptide sequence contexts.
+								#  B. To remove false positive miscleavage sites that cannot be cleaved, 
+								#     because one of the resulting fragments is too short. Sometimes proteases may need 
+								#     a minimal length surrounding the cleavage site to bind and cleave a peptide.
+my $padding_character;			# Used for padding if the protein sequence is too short for a long enough context.
+my $log_level;
+
+Getopt::Long::GetOptions (
+	'db=s'		=> \$db,
+	'dba:s'		=> \$db_alt,
+	'dbf:s'		=> \$db_format,
+	'k:s'		=> \$index_id,				# Key used to lookup sequences. Can be ID or Accession number (default).
+	'f=s'		=> \$fragments,
+	'icol:i'	=> \$id_column,
+	'pcol:i'	=> \$peptide_column,
+	's'			=> \$strip_lc,				# For the sequence fragments in the fragments file. With strip_lc enabled we 
+											# assume amino acids are in upper case and lower case characters represent modifications, 
+											# which will be stripped off before searching for the fragments in the complete sequences.
+	'cleo:s'	=> \$cleavage_output,
+	'miso:s'	=> \$miscleavage_output,
+	'modo:s'	=> \$modification_output,
+	'pepo:s'	=> \$peptide_output,
+	'ca:s'		=> \$cleavage_aa,			# Assume the sequence fragments were derived from cutting at this amino acid. 
+	'ct:s'		=> \$cleavage_terminus,		# Assume the sequence fragments were derived from cutting at this side of $cut_at_aa 
+	'ma:s'		=> \$modified_aa,			
+	'n:i'		=> \$n_term_context_length,
+	'c:i'		=> \$c_term_context_length,
+	'mcd:i'		=> \$miscleavage_distance,	# Minimal distance between a putative miscleavage site and the peptide termini.
+											# Uncleaved AAs closer to the peptide termini can be ignored with this parameter:
+											#  A. To prevent overlap between cleavage and miscleavage peptide sequence contexts.
+											#  B. To remove false positive miscleavage sites that cannot be cleaved, 
+											#     because one of the resulting fragments is too short. Sometimes proteases may need 
+											#     a minimal length surrounding the cleavage site to bind and cleave a peptide.
+	'pc:s'		=> \$padding_character,
+	'll:s'		=> \$log_level
+);
+
+#
+# TODO: 1. Handle --mcd param. Maybe should be extended to exclude also modification sites too close together... 
+#
+
+#
+# Configure logging.
+#
+# Provides default if user did not specify log level:
+$log_level = (defined($log_level) ? $log_level : 'INFO');
+# Reset log level to default if user specified illegal log level.
+$log_level = (defined($log_levels{$log_level}) ? $log_levels{$log_level} : $log_levels{'INFO'});
+#Log::Log4perl->init('log4perl.properties');
+Log::Log4perl->easy_init(
+	#{ level    => $log_level,
+	#  file     => ">>ExtractPeptideSequenceContext.log",
+	#  layout   => '%F{1}-%L-%M: %m%n' },
+	{ level    => $log_level,
+	  file     => "STDOUT",
+	  layout   => '%d L:%L %p> %m%n' },
+);
+my $logger = Log::Log4perl::get_logger();
+
+#
+# Check user input.
+#
+foreach my $input ($db, $db_alt, $fragments, $cleavage_output, $miscleavage_output, $modification_output) {
+	if (defined($input) && length($input) <= 0) {
+		$input = undef();
+	}
+}
+unless (defined($db) && 
+        defined($fragments) && 
+        (defined($cleavage_output) || defined($miscleavage_output) || defined($modification_output) || defined($peptide_output))
+) {
+	$logger->fatal('You need to specify at least a DB & fragments file as inputs and at least one output file.');
+    _Usage();
+    exit;	
+}
+if (defined($db_format)) {
+	unless ($db_format eq 'UniProtKB DAT' or $db_format eq 'FASTA') {
+		$logger->fatal('Database format in unsupported format.');
+		$logger->fatal("\t" . 'Must be \'UniProtKB DAT\' or \'FASTA\'.');
+		$logger->fatal("\t" . 'But --dbf was ' . $db_format . '.');
+		exit;
+	}
+}
+if (defined($cleavage_aa)) {
+	unless ($cleavage_aa =~ m/^[*A-Z]$/) {
+		$logger->fatal('Cleavage amino acid in unsupported format.');
+		$cleavage_aa = undef;
+	}
+}
+if (defined($cleavage_terminus)) {
+	unless ($cleavage_terminus =~ m/^[*CN]$/) {
+		$logger->fatal('Cleavage terminus in unsupported format.');
+		$cleavage_terminus = undef;
+	}
+}
+if (defined($modified_aa)) {
+	unless ($modified_aa =~ m/^[*A-Z][a-z]+$/ || $modified_aa =~ m/^[a-z]+[*A-Z]$/) {
+		$logger->fatal('Modified amino acid in unsupported format.');
+		$modified_aa = undef;
+	}
+}
+#
+# We need info to extract:
+#  * cleavage site sequence contexts or 
+#  * modification site sequence contexts or 
+#  * peptide sequence contexts
+#
+if (defined($cleavage_output) || defined($miscleavage_output)) {
+	unless (defined($cleavage_aa) && defined($cleavage_terminus)) {
+		$logger->fatal('If you want to retrieve (mis)cleavage site sequence contexts, you must provide valid values for --ca and --ct.');
+		$logger->debug('You specified --ca ' . $cleavage_aa . ' --ct ' . $cleavage_terminus . '.');
+	    _Usage();
+	    exit;
+	}
+}
+if (defined($modification_output)) {
+	unless (defined($modified_aa)) {
+		$logger->fatal('If you want to retrieve modifications site sequence contexts you must provide a valid value for --ma.');
+		$logger->debug('You specified --ma ' . $modified_aa . '.');
+	    _Usage();
+	    exit;
+	}
+} 
+if (defined($peptide_output)) {
+	#
+	# We can work with defaults for all params.
+	#
+} 
+
+# Make sure we don't overwrite the input.
+foreach my $output ($cleavage_output, $miscleavage_output, $modification_output, $peptide_output) {
+	foreach my $input ($db, $db_alt, $fragments) {
+		if (defined($input) && defined($output) && $output eq $input) {
+		    $logger->fatal('Output file ' . $output . ' is the same as input file ' . $input . '.');
+		    $logger->fatal("\t" . 'Please choose a different file for the output.');
+		    exit;
+		}
+	}
+}
+# Define defaults for optional arguments.
+$index_id				= (defined($index_id) && length($index_id) > 0								? $index_id					: 'AC');
+$id_column				= (defined($id_column) && length($id_column) > 0							? $id_column				: 1);
+$peptide_column			= (defined($peptide_column) && length($peptide_column) > 0					? $peptide_column			: 2);
+$n_term_context_length	= (defined($n_term_context_length) && length($n_term_context_length) > 0	? $n_term_context_length	: 5);
+$c_term_context_length	= (defined($c_term_context_length) && length($c_term_context_length) > 0	? $c_term_context_length	: 5);
+$padding_character		= (defined($padding_character) 												? $padding_character		: '-');
+# Stripping of lowercase characters is disabled by default unless we search for modification site contexts in which case it must be enabled!  
+$strip_lc				= (defined($strip_lc)														? $strip_lc					: 0);
+$strip_lc				= (defined($modified_aa)													? 1							: $strip_lc);
+unless (defined($miscleavage_distance) && $miscleavage_distance > 0) {
+	if ($n_term_context_length >= $c_term_context_length) {
+		$miscleavage_distance = $n_term_context_length;
+	} else {
+		$miscleavage_distance = $c_term_context_length;
+	}
+}
+
+
+#
+# Check db files and fragments file.
+#
+unless (-e $db && -f $db && -r $db) {
+    $logger->fatal('Cannot read from database input file ' . $db . ': ' . $!);
+    exit;
+}
+if (defined($db_alt)) {
+	unless (-e $db_alt && -f $db_alt && -r $db_alt) {
+	    $logger->fatal('Cannot read from alternative database input file ' . $db_alt . ': ' . $!);
+	    exit;
+	}
+}
+unless (-e $fragments && -f $fragments && -r $fragments) {
+    $logger->fatal('Cannot read from fragments input file ' . $fragments . ': ' .$!);
+	exit;
+}
+
+#
+# Create lookup table(s) of the entire database input file(s).
+#
+my @db_lookup_table_refs;
+(my $db_seqs, $index_id) = _CreateLookupHash($db, $index_id, $db_format);
+my $seq_count = scalar(keys(%{$db_seqs}));
+$logger->info('Number of sequences in database lookup table: ' . $seq_count);
+push(@db_lookup_table_refs, $db_seqs);
+# Optional backup DB file.
+if (defined($db_alt)) {
+	(my $db_seqs_alt, $index_id) = _CreateLookupHash($db_alt, $index_id, $db_format);
+	my $seq_count_alt = scalar(keys(%{$db_seqs_alt}));
+	$logger->info('Number of sequences in backup database lookup table: ' . $seq_count_alt);
+	push(@db_lookup_table_refs, $db_seqs_alt);
+}
+
+#
+# Debugging loop
+#
+#foreach my $keyyy (keys(%{$db_seqs})) {
+#	$logger->fatal('Key: ' . $keyyy);
+#	$logger->fatal("\t" . 'AC: ' . ${$db_seqs}{$keyyy}{'AC'});
+#	$logger->fatal("\t" . 'ID: ' . ${$db_seqs}{$keyyy}{'ID'});
+#	$logger->fatal("\t" . 'SQ: ' . ${$db_seqs}{$keyyy}{'SQ'});
+#}
+
+#
+# Lookup sequence fragment contexts and store results.
+#
+my ($counter_cleavages, $counter_miscleavages, $counter_modifications, $counter_peptides) = _ExtractSeqs(\@db_lookup_table_refs, 
+								$fragments, $strip_lc, $id_column, $peptide_column, 
+								$cleavage_output, $miscleavage_output, $modification_output, $peptide_output, 
+								$cleavage_aa, $cleavage_terminus, $miscleavage_distance, 
+								$n_term_context_length, $c_term_context_length, 
+								$padding_character);
+#
+# Report result stats.
+#
+my $max_length = 0;
+foreach my $counter ($counter_cleavages, $counter_miscleavages, $counter_modifications, $counter_peptides) {
+	$max_length = (length($counter) > $max_length) ? length($counter) : $max_length;
+}
+my $prefixed_counter = sprintf('%*s', $max_length, $counter_cleavages);
+$logger->info('Extracted cleavage site sequence contexts:     ' . $prefixed_counter . '.');
+$prefixed_counter = sprintf('%*s', $max_length, $counter_miscleavages);
+$logger->info('Extracted miscleavage site sequence contexts:  ' . $prefixed_counter . '.');
+$prefixed_counter = sprintf('%*s', $max_length, $counter_modifications);
+$logger->info('Extracted modification site sequence contexts: ' . $prefixed_counter . '.');
+$prefixed_counter = sprintf('%*s', $max_length, $counter_peptides);
+$logger->info('Extracted peptide sequence contexts:           ' . $prefixed_counter . '.');
+$prefixed_counter = sprintf('%*s', $max_length, $counter_cleavages + $counter_miscleavages + $counter_modifications + $counter_peptides);
+$logger->info('Total extracted sequence contexts:             ' . $prefixed_counter . '.');
+$logger->info('Finished!');
+
+#
+##
+### Internal subs.
+##
+#
+
+sub _CreateLookupHash {
+	
+    my ($file_path, $index_id, $file_type) = @_;
+    
+    my $file_path_fh;
+	my %seqs;
+	
+    if (defined($file_type)) {
+    	$logger->info('Parsing ' . $file_type . ' file ' . $file_path . '...');    	
+    } elsif ($file_path =~ m/($ps?)([^\.]+)\.dat$/i) {
+    	$file_type = 'UniProtKB DAT';
+    	$logger->info('Parsing ' . $file_type . ' file ' . $file_path . '...');
+    } elsif ($file_path =~ m/($ps?)([^\.]+)\.fa(sta)?$/i) {
+    	$file_type = 'FASTA';
+    	# For FASTA files we don't know what to expect:
+    	# IDs or ACcession numbers or a mix of both.
+    	# (Re)set index_id type to AC to prevent adding redundant IDs to the results. 
+    	# Hence treat all identifiers stable and unstable as ACs.
+    	# (Normally we default to ACcession numbers and if we would have found an AC
+    	#  and the input fragments file was using IDs, we would append the AC to the results.)
+    	$index_id = 'AC';
+    	$logger->info('Parsing ' . $file_type . ' file ' . $file_path . '...');
+    } else {
+	    $file_type = 'FASTA';
+    	$index_id = 'AC';
+	    $logger->warn('Could not determine database file type based on the file name extension of ' . $file_path);
+	    $logger->warn('Will assume it is a FASTA file and hope that is correct...');
+		$logger->info('Parsing ' . $file_type . ' file ' . $file_path . '...');
+    }
+    
+	if ($file_type eq 'UniProtKB DAT') {
+		
+		#
+		# Read an entire record delimited by // at a time.
+		#
+		local $/ = "\/\/\n";
+		
+		eval {
+			open($file_path_fh, "<$file_path");
+		};
+		if ($@) {
+			$logger->fatal('Cannot read from database input file: ' . $@);
+			exit;
+		}
+		
+		while (<$file_path_fh>){
+			
+	        $logger->trace('Record:' . "\n$_\n");
+			
+			my $entry = SWISS::Entry->fromText($_);
+			
+			my $id  = $entry->ID;
+	    	my $acc = $entry->AC;
+	    	my $seq = $entry->SQ;
+	    	my $index_key;
+	    	
+	    	if ($index_id eq 'ID') {
+	    		$index_key = $id;
+	    	} elsif ($index_id eq 'AC') {
+	    		$index_key = $acc;
+	    	} else {
+				$logger->fatal('Unknown type of identifier used for indexing: ' . $index_id . '. Must be ID or AC.');
+				exit;
+	    	}
+			
+			$logger->debug('Found accession ' . $acc);
+	        $logger->trace('Found accession ' . $acc . ' with ID ' . $id . ' and seq ' . $seq);
+			
+			$seqs{$index_key}{'AC'} = $acc;
+			$seqs{$index_key}{'ID'} = $id;
+			$seqs{$index_key}{'SQ'} = $seq;
+			
+		}
+	    
+	} elsif ($file_type eq 'FASTA') {
+		
+		my @header_ids = ();
+		my $seq = '';
+		
+		eval {
+			open($file_path_fh, "<$file_path");
+		};
+		if ($@) {
+			$logger->fatal('Cannot read from database input file: ' . $@);
+			exit;
+		}
+		
+		LINE: while (my $line = <$file_path_fh>) {
+		
+			if ($line =~ m/^[\s\n\r]+$/) {
+				
+				#
+				# Skip blank line
+				#
+				$logger->debug('Found empty line in FASTA file.');
+				next LINE;
+				
+			} elsif ($line =~ m/^[a-zA-Z]+/) {
+				
+				#
+				# It's a sequence line
+				# Remove all white space and new line characters.
+				#
+				$logger->debug('Found FASTA sequence line: ' . $line);
+				
+				$line =~ s/[\s\n\r]+//g;
+				$seq .= $line;
+			
+			} elsif ($line =~ m/^>/) {
+				
+				$logger->debug('Found new FASTA header line: ' . $line);
+				
+				if ((length($seq) > 0) && defined($header_ids[0])) {
+					
+					#
+					# Store the previously found sequence in the lookup table.
+					#
+					foreach my $header_id (@header_ids) {
+						
+						#
+						# We don't know if the ID we found is an accession number (stable ID) or a normal (unstable) ID
+						#  -> Fill both the AC and ID fields with the same ID value. 
+						#
+						$seqs{$header_id}{'AC'} = $header_id;
+						$seqs{$header_id}{'ID'} = $header_id;
+						$seqs{$header_id}{'SQ'} = $seq;
+						
+						$logger->debug('Stored ' . $header_id . ':');
+						$logger->debug("\t" . 'SQ ' . $seq);
+						
+					}
+					
+					#
+					# Reset vars. 
+					#
+					$seq = '';
+					@header_ids = ();
+					
+				}
+				
+	            #
+	            # Check for the presence of some frequently occurring naming schemes:
+	            #
+	            # >IPI:CON_Trypsin|SWISS-PROT:P00761|TRYP_PIG Trypsin - Sus scrofa (Pig).
+	            # >IPI:CON_IPI00174775.2|TREMBL:Q32MB2;Q86Y46 Tax_Id=9606 Gene_Symbol=KRT73 Keratin-73
+	            # >sp|Q42592|APXS_ARATH L-ascorbate peroxidase S, chloroplastic/mitochondrial;
+	            # >jgi|Triad1|1|gw1.3.1.1
+	            #
+            	# The FASTA header line can basically have any format.
+            	# Therefore we try to see if the IDs we are looking for are present anywhere
+            	# in the header line up until the first white space or new line.
+            	# This should prevent matching annotation from other proteins in the description 
+            	# like a for example a 'best' BLAST hit that contains one of the IDs we
+            	# are looking for. Hence, in such a case this sequence is *not* the one of
+            	# the IDs we looking for, but similar to that one at best.
+            	#
+	            my @unfiltered_ids;
+				if ($line =~ m/^>((([^\s;|]+)[;|])*([^\s;|]+))[|;]?/i) {
+					
+					#
+					# Got multiple IDs
+					#
+					my $multiple_ids = $1;
+					@unfiltered_ids = split(/[\s;|]+/, $multiple_ids);
+					
+				} elsif ($line =~ m/^>([^\s]+)/i){
+					
+					#
+					# Got a single ID
+					#
+					my $single_id = $1;
+					push(@unfiltered_ids, $single_id);
+					
+				}
+				
+				KEY:foreach my $unfiltered_id (@unfiltered_ids) {
+					
+					my $this_id = $unfiltered_id;
+					
+					#
+					# Check for RockerBox formattted IDs.
+					#
+					# Example of RockerBox ID format, which may contain
+					#  * multiple IDs separated by semi-colons and 
+					#  * suffixed with the position of the peptide in the protein:
+					# YGR192C[123-142]; YJR009C[123-142]
+					if ($this_id =~ m/^([^\[\]]+)\[\d+-\d+\]$/i) {
+						$logger->trace('Protein ID in RockerBox format: ' . $this_id);
+						$this_id = $1;
+						$logger->trace("\t" . 'Using ID : ' . $this_id);			
+					}
+					
+					#
+					# Check for DB namespace prefixes.
+					#
+					if ($this_id =~ m/([^:]+):([^:]+)/i) {
+					
+						#
+						# Namespace prefixed ID.
+						#
+						$logger->trace('Namespace prefixed ID: ' . $this_id);
+										
+						my $prefix	= $1;
+						$this_id	= $2;
+						
+						$logger->trace("\t" . 'Using ID: ' . $this_id);
+							
+						#
+						# For a selected list of "known" database namespace prefixes,
+						# check for versioned accession numbers and remove the version number.
+						#
+						if (defined($prefix) && defined($databases_with_optionally_versioned_ids{$prefix})) {
+							$logger->trace('Found prefix: ' . $prefix);
+							my $separator = $databases_with_optionally_versioned_ids{$prefix};
+							$separator = '\\' . $separator;
+							if ($this_id =~ m/([^$separator]+)$separator\d+$/) {
+								$this_id = $1;
+							}
+						}
+					}
+					
+					$logger->debug('Found ID: ' . $this_id);
+					push(@header_ids, $this_id);				
+				
+				}
+								
+#				if ($line =~ m/^>((([^\s\n\r:;|]+)[:]([^\s\n\r:|]+)[|;])*([^\s\n\r:;|]+)[:]([^\s\n\r:|]+))[|;]?(\s+(.+))?/i) {
+#					
+#					#
+#					# One or more namespace prefixed IDs.
+#					#
+#					my $concatenated_namespace_prefixed_ids = $1;
+#					$logger->debug('Found prefixed IDs in header: ' . $concatenated_namespace_prefixed_ids);
+#					
+#					#   database_namespace	= $3 && $5;
+#					#   id					= $4 && $6;
+#					#   description			= $8;
+#					my @namespace_prefixed_ids = split(/[|;]/, $concatenated_namespace_prefixed_ids);
+#					
+#					foreach my $prefixed_id (@namespace_prefixed_ids) {
+#						
+#						$logger->trace('Prefixed ID: ' . $prefixed_id);
+#						
+#						if ($prefixed_id =~ m/(([^\s\n\r:;|]+):)?([^\s\n\r:|]+)/i) {
+#							
+#							my $this_prefix = $2;
+#							my $this_id = $3;
+#							$logger->debug('Found ID: ' . $this_id);
+#							push(@header_ids, $this_id);
+#							
+#							#
+#							# For a selected list of "known" database namespace prefixes,
+#							# check for versioned accession numbers and - if found - add 
+#							# both the versioned and the unversioned accession number 
+#							# to the list of header IDs.
+#							#
+#							if (defined($this_prefix) && defined($databases_with_optionally_versioned_ids{$this_prefix})) {
+#								$logger->trace('Found prefix: ' . $this_prefix);
+#								my $separator = $databases_with_optionally_versioned_ids{$this_prefix};
+#								$separator = '\\' . $separator;
+#								if ($this_id =~ m/([^$separator]+)$separator\d+$/) {
+#									my $this_unversioned_id = $1;
+#									$logger->debug('Found unversioned ID: ' . $this_unversioned_id);
+#									push(@header_ids, $this_unversioned_id);
+#								}
+#							}
+#							
+#						} else {
+#							
+#							$logger->warn(
+#								'This should have been an optionally prefixed ID, '
+#								  . 'but failed to match the corresponding regex: '
+#								  . $prefixed_id);
+#						
+#						}
+#					}
+#				
+#				} elsif ($line =~ m/^>((([^\s\n\r:;|]+)[|;])*([^\s\n\r:;|]+))[|;]?(\s+(.+))?/i) {
+#					
+#					#
+#					# One or more unprefixed IDs.
+#					#
+#					my $concatenated_ids = $1;
+#					$logger->debug('Found unprefixed IDs in header: ' . $concatenated_ids);
+#					
+#					#   id					= $3 && $4;
+#					#   description			= $7;
+#					my @unprefixed_ids = split(/[|;]/, $concatenated_ids);
+#					
+#					foreach my $unprefixed_id (@unprefixed_ids) {
+#						
+#						$logger->debug('Found ID: ' . $unprefixed_id);
+#						push(@header_ids, $unprefixed_id);
+#						
+#					}
+#					
+#				} else {
+#					
+#	            	#
+#	            	# Something else.
+#	            	#
+#	            	# The FASTA header line can basically have any format,
+#	            	# so this is probably one of the less frequent occurring annotation schemes.
+#	            	# Therefore we try to see if the IDs we are looking for are present anywhere
+#	            	# in the header line up until the first white space or new line.
+#	            	# This may be tricky as we might match other annotation from other proteins
+#	            	# like a description from a 'best' BLAST hit that contains one of the IDs we
+#	            	# are looking for. Hence, in such a case this sequence is *not* the one of
+#	            	# the IDs we looking for, but similar to that one at best.
+#	            	#
+#					if ($line =~ m/>([^\n\r\s]+)/) {
+#						
+#						my $putative_id = $1;
+#						$logger->debug('Found putative ID in header: ' . $putative_id);
+#						push(@header_ids, $putative_id);
+#						
+#					} else {
+#						$logger->warn('Cannot identify IDs in this header: ' . $line);
+#					}
+#				}
+			}
+		}
+		
+		#
+		# Store the last found sequence in the lookup table.
+		#
+		foreach my $header_id (@header_ids) {
+			
+			#
+			# We don't know if the ID we found is an accession number (stable ID) or a normal (unstable) ID
+			#  -> Fill both the AC and ID fields with the same ID value. 
+			#
+			$seqs{$header_id}{'AC'} = $header_id;
+			$seqs{$header_id}{'ID'} = $header_id;
+			$seqs{$header_id}{'SQ'} = $seq;
+			
+			$logger->debug('Stored ' . $header_id . ':');
+			$logger->debug("\t" . 'SQ ' . $seq);
+			
+		}
+		
+		#
+		# Reset vars. 
+		#
+		$seq = '';
+		@header_ids = ();
+		
+	}
+	
+	close($file_path_fh);
+	$logger->info('Created lookup list for database sequences.');
+	
+    return (\%seqs, $index_id);
+	
+}
+
+sub _ExtractSeqs {
+	
+    my ($db_lookup_table_refs, 
+    	$path_from, $strip_lc, $id_column, $peptide_column, 
+    	$cleavage_path_to, $miscleavage_path_to, $modification_path_to, $peptide_path_to, 
+    	$cleavage_aa, $cleavage_terminus, $miscleavage_distance, 
+		$n_term_context_length, $c_term_context_length, 
+		$padding_character) = @_;
+		
+    $logger->debug('Parsing ' . $path_from . '...');
+
+    my $extracted_seq_contexts_cle = 0;
+    my $extracted_seq_contexts_mis = 0;
+    my $extracted_seq_contexts_mod = 0;
+	my $extracted_seq_contexts_pep = 0;
+	my $path_from_fh;
+	my $cleavage_path_to_fh;
+	my $miscleavage_path_to_fh;
+	my $modification_path_to_fh;
+	my $peptide_path_to_fh;
+	
+	eval {
+		open($path_from_fh,	"<$path_from");
+	};
+	if ($@) {
+		$logger->fatal('Cannot read from fragments input file: ' . $@);
+		exit;
+	}
+	if (defined($cleavage_path_to)) {	
+		eval {
+			open($cleavage_path_to_fh, ">$cleavage_path_to");
+		};
+		if ($@) {
+			$logger->fatal('Cannot write to output file: ' . $@);
+			exit;
+		}
+	}
+	if (defined($miscleavage_path_to)) {
+		eval {
+			open($miscleavage_path_to_fh, ">$miscleavage_path_to");
+		};
+		if ($@) {
+			$logger->fatal('Cannot write to output file: ' . $@);
+			exit;
+		}
+	}
+	if (defined($modification_path_to)) {
+		eval {
+			open($modification_path_to_fh, ">$modification_path_to");
+		};
+		if ($@) {
+			$logger->fatal('Cannot write to output file: ' . $@);
+			exit;
+		}
+	}
+	if (defined($peptide_path_to)) {
+		eval {
+			open($peptide_path_to_fh, ">$peptide_path_to");
+		};
+		if ($@) {
+			$logger->fatal('Cannot write to output file: ' . $@);
+			exit;
+		}
+	}
+	
+    LINE:while (my $line = <$path_from_fh>) {
+		
+		#
+		# Remove (trailing) line end characters!
+		#
+		$line =~ s/[\n\r\f]+//g;
+		$logger->trace($line);
+		
+		my @column_values = split(/\t/, $line);
+		
+		# For debugging only.
+		#foreach my $val (@column_values) {
+		#	$logger->debug('Col val: ' . $val);
+		#}
+		
+		my $id_column_index			= $id_column - 1;
+		my $peptide_column_index	= $peptide_column - 1;
+		my $unfiltered_key			= $column_values[$id_column_index];
+		my $key;
+		my @keys;
+		my $peptide					= $column_values[$peptide_column_index];
+		my $peptide_nonmod;
+		my $protein;
+		my $acc;
+		my $id;
+		my $index = -1; # An index < 0 indicates the peptide was not found in a protein sequence (yet).
+		my $length;
+		my $cleavage_peptide_contexts = [];
+		my $miscleavage_peptide_contexts = [];
+		my $modification_peptide_contexts = [];
+		my $peptide_contexts = [];
+		
+		#
+		# Check if $key and $peptide were present in this line.
+		# If not this may be a leading/trailing empty or meta-data line, 
+		# that does not contain the correct amount of columns. 
+		#
+		unless (defined($unfiltered_key) && defined($peptide)) {
+			$logger->warn('Skipping line: ' . $line);
+			$logger->warn("\t" . '(Peptide sequence and/or protein ID missing or incorrect amount of columns.)');
+			next LINE;
+		}
+		
+		#
+		# Sanity check for peptide sequences.
+		#
+		unless ($peptide =~ m/([a-zA-Z]+)/) {
+        	$logger->warn('Fragment file line in unexpected format. Cannot find peptide in: ' . $line);
+        	$logger->debug('$peptide contains: ' . $peptide);
+            next LINE;
+		}
+		
+		if ($strip_lc) {
+			# Remove lower case characters representing modifications.
+			$peptide_nonmod = $peptide;
+			$peptide_nonmod =~ s/[a-z]+//g;
+		} else {
+			# Convert everything to uppercase.
+			$peptide_nonmod = uc($peptide);
+		}
+		
+		#
+		# Figure out in what format the Protein ID(s) were supplied.
+		#
+		#  * Strip off optional DB namespace prefixes.
+		#  * Handle optional accession number version suffixes.
+		#
+		if ($unfiltered_key =~ m/^((([^\s;|]+)[\s;|]+)*([^\s;|]+))[\s|;]?/i) {
+			
+			#
+			# Got multiple IDs
+			#
+			my $multiple_ids = $1;
+			@keys = split(/[\s;|]+/, $multiple_ids);
+			
+		} else {
+			
+			#
+			# Got a single ID
+			#
+			push(@keys, $unfiltered_key);
+			
+		}
+		
+		KEY:foreach $unfiltered_key (@keys) {
+			
+			$key = $unfiltered_key;
+			
+			#
+			# Check for RockerBox formattted IDs.
+			#
+			# Example of RockerBox ID format, which may contain
+			#  * multiple IDs separated by semi-colons and 
+			#  * suffixed with the position of the peptide in the protein:
+			# YGR192C[123-142]; YJR009C[123-142]
+			if ($key =~ m/^([^\[]+)\[\d+-\d+\]$/i) {
+				$logger->trace('Protein ID in RockerBox format: ' . $key);
+				$key = $1;
+				$logger->trace("\t" . 'Using ID : ' . $key);			
+			}
+			
+			#
+			# Check for DB namespace prefixes.
+			#
+			if ($key =~ m/([^:]+):([^:]+)/i) {
+			
+				#
+				# Namespace prefixed ID.
+				#
+				$logger->trace('Namespace prefixed ID: ' . $key);
+								
+				my $prefix	= $1;
+				$key		= $2;
+				
+				$logger->trace("\t" . 'Using ID: ' . $key);
+					
+				#
+				# For a selected list of "known" database namespace prefixes,
+				# check for versioned accession numbers and remove the version number.
+				#
+				if (defined($prefix) && defined($databases_with_optionally_versioned_ids{$prefix})) {
+					$logger->trace('Found prefix: ' . $prefix);
+					my $separator = $databases_with_optionally_versioned_ids{$prefix};
+					$separator = '\\' . $separator;
+					if ($key =~ m/([^$separator]+)$separator\d+$/) {
+						$key = $1;
+					}
+				}
+			}
+			
+			$logger->debug('Found ID: ' . $key);
+			
+			#
+			# Sanity check for Protein ID.
+			#
+			unless ($key =~ m/^([A-Z0-9\._-]+)$/i) {
+	        	$logger->warn('Fragment file line in unexpected format. Cannot find sequence ID in:');
+	         	$logger->warn("\t" . $line);
+	         	$logger->debug('$key contains: ' . $key);
+	            next LINE;
+			}
+		
+			#
+			# Lookup info for the protein this peptide was derived from.
+			#
+			($protein, $acc, $id, $index) = _LookupProteinInfo($key, $peptide_nonmod, $db_lookup_table_refs);
+			
+			#
+			# Check if the peptide (fragment) was found in this protein sequence.
+			#
+			if (defined($protein)) {
+				if ($index > -1) {
+					$logger->trace('Found peptide sequence in protein ' . $key);
+					last KEY;
+				} else {
+					$logger->warn('Cannot find peptide (' . $peptide_nonmod . ') in protein ' . $key);
+					$logger->warn("\t" . 'Will try other protein IDs for this peptide if available.');
+				}
+			} else {
+				$logger->warn('Cannot find protein identifier ' . $key . ' in any of the supplied databases.');
+				$logger->warn("\t" . 'Will try other protein IDs for this peptide if available.');
+			}
+		}
+		
+		#
+		# Check if the peptide (fragment) was found in any of the associated protein sequences.
+		#
+		if (defined($protein)) {
+			
+			$logger->debug('Found protein ' . $key);
+			
+			if ($index > -1) {
+				
+				$logger->debug('Found peptide (' . $peptide_nonmod . ') in protein ' . $key);
+				
+				if (defined($cleavage_path_to)) {
+					($cleavage_peptide_contexts) = _GetCleavageSequenceContexts($peptide_nonmod, $protein, $key, $index,
+																						 $cleavage_aa, $cleavage_terminus);	
+				}
+				if (defined($miscleavage_path_to)) {
+					($miscleavage_peptide_contexts) = _GetMiscleavageSequenceContexts($peptide_nonmod, $protein, $key, $index,
+																						 $cleavage_aa, $cleavage_terminus, $miscleavage_distance);
+				}
+				if (defined($modification_path_to)) {
+					($modification_peptide_contexts) = _GetModificationSequenceContexts($peptide_nonmod, $protein, $key, $index,
+																						 $modified_aa, $peptide);
+				}
+				if (defined($peptide_path_to)) {
+					($peptide_contexts) = _GetPeptideSequenceContexts($peptide_nonmod, $protein, $key, $index);
+				}
+			} else {
+				$logger->error('Cannot find peptide (' . $peptide_nonmod . ') in protein ' . $key);
+				$acc = 'NA';
+			}
+		} else {
+			$logger->error('Cannot find protein "' . $unfiltered_key . '" in any of the supplied databases.');
+			$acc = 'NA';
+		}
+		
+		#
+		# Append the peptide contexts to the existing lines of the fragment file and save that new line to the output file.
+		#	
+		$extracted_seq_contexts_cle = _SaveSequenceContexts($line, $acc, $cleavage_peptide_contexts,  
+															$cleavage_path_to_fh, $extracted_seq_contexts_cle);
+		
+		$extracted_seq_contexts_mis = _SaveSequenceContexts($line, $acc, $miscleavage_peptide_contexts,  
+															$miscleavage_path_to_fh, $extracted_seq_contexts_mis);
+		
+		$extracted_seq_contexts_mod = _SaveSequenceContexts($line, $acc, $modification_peptide_contexts,  
+															$modification_path_to_fh, $extracted_seq_contexts_mod);
+		
+		$extracted_seq_contexts_pep = _SaveSequenceContexts($line, $acc, $peptide_contexts,  
+															$peptide_path_to_fh, $extracted_seq_contexts_pep);
+		
+    }
+	
+	#
+	# Close file handles.
+    #
+    foreach my $fh ($path_from_fh, $cleavage_path_to_fh, $miscleavage_path_to_fh, $modification_path_to_fh, $peptide_path_to_fh) {
+	    if (defined($fh)) {
+	    	close($fh);
+	    }
+    }
+    
+    return($extracted_seq_contexts_cle, $extracted_seq_contexts_mis, $extracted_seq_contexts_mod, $extracted_seq_contexts_pep);
+    
+}
+
+sub _LookupProteinInfo {
+	
+	my ($key, $peptide, $db_lookup_table_refs) = @_;
+	
+	my $protein_found_in_db = 0;
+	my $protein;
+	my $acc;
+	my $id;
+	my $index;
+	
+	DB_LOOKUP:foreach my $db_lookup_table_ref (@{$db_lookup_table_refs}) {
+		
+		if (defined(${$db_lookup_table_ref}{$key})) {
+			
+			$protein_found_in_db = 1;
+			$logger->debug('Found protein ' . $key . ' in this sequence DB.');
+			
+			$protein = ${$db_lookup_table_ref}{$key}{'SQ'};
+			$acc     = ${$db_lookup_table_ref}{$key}{'AC'};
+			$id      = ${$db_lookup_table_ref}{$key}{'ID'};
+			#
+			# Find peptide (fragment) in DB protein sequence.
+			#
+			$index = index($protein, $peptide);
+	
+			if ($index > -1) {
+				
+				#
+				# Found peptide!
+				#
+				$logger->debug('Found peptide ' . $peptide . ' at position ' . $index . ' in protein ' . $key);
+				last DB_LOOKUP;
+			
+			}
+			
+			#
+			# If the input was based on mass spectrometry data we cannot see 
+			# the difference between I (Isolucene) and L (Leucine) as the both 
+			# weigh the same. Let's try an I agnostic search by substituting all 
+			# Is for Ls.
+			#
+			my $protein_l_only = $protein;
+			my $peptide_l_only = $peptide;
+			$protein_l_only =~ s/I/L/g; 
+			$peptide_l_only =~ s/I/L/g; 
+			$index = index($protein_l_only, $peptide_l_only);
+	
+			if ($index > -1) {
+				
+				#
+				# Found peptide!
+				#
+				$logger->debug('Found peptide ' . $peptide . ' at position ' . $index . ' in protein ' . $key);
+				$logger->debug("\t" . 'Had to treat I and L as the same amino acid in the DB search to get a match.');
+				last DB_LOOKUP;
+			
+			}
+			
+			#
+			# Cannot find peptide :(
+			#
+			$protein	= undef;
+			$acc		= undef;
+			$id			= undef;
+			
+			$logger->debug('Cannot find peptide (' . $peptide . ') in protein ' . $key);
+			$logger->debug("\t" . 'This may be the result of an updated protein sequence in the database.');
+			$logger->debug("\t" . 'Will try to find the peptide in other databases (if available).');
+		
+			if ($strip_lc) {
+				$logger->debug("\t" . '(Stripping of lowercase characters, indicating modifications in the peptide sequence, was enabled.)');
+			} else {
+				$logger->debug("\t" . '(Stripping of lowercase characters, indicating modifications in the peptide sequence, was *not* enabled.)');
+			}
+			
+		} else {
+			$logger->debug('Protein ' . $key . ' missing from this sequence DB. Will try other databases (if available).');
+		}
+	}
+	
+	#
+	# Check if the peptide (fragment) was found in a DB protein sequence.
+	#
+	if ($protein_found_in_db == 1) {
+		unless ($index > -1) {
+			$logger->warn('Cannot find peptide (' . $peptide . ') in protein ' . $key);
+			$logger->warn("\t" . 'Will try to find this peptide (' . $peptide . ') using other associated protein identifiers (if available).');
+		}
+	} else {
+		$logger->warn('Cannot find protein ' . $key . ' in any of the supplied databases.');
+	}
+	
+	return($protein, $acc, $id, $index);
+}
+
+sub _GetCleavageSequenceContexts {
+	
+	my ($peptide, $protein, $key, $index, $cleavage_aa, $cleavage_terminus) = @_;
+	
+	my %peptide_contexts;
+	$peptide_contexts{'n_term'}{'context'} = '';
+	$peptide_contexts{'c_term'}{'context'} = '';
+	my $max_context_length = $n_term_context_length + $c_term_context_length;
+	
+	foreach my $peptide_term (keys(%peptide_contexts)) {
+		
+		#
+		# Get the offset for the sequence contexts to retrieve from this peptide terminus
+		# or drop the context for this peptide terminus if it is also the protein terminus.
+		#
+		if ($peptide_term eq 'n_term') {
+			if ($index > 0) {
+				# This is an N-terminal peptide context.
+				my $offset = $index - $n_term_context_length;
+				$peptide_contexts{$peptide_term}{'offset'} = $offset;
+				$logger->debug('Offset = ' . $offset . ' for ' . $peptide_term . ' of peptide ' . $peptide . ' in protein ' . $key . '.');
+			} else {
+				# Peptide N-terminus == Protein N-terminus.
+				$logger->warn('Peptide N-terminus = protein N-terminus. Dropping N-terminal sequence context for peptide ' . $peptide . 
+							  ' in protein ' . $key . '.');
+				next;
+			}
+		} elsif ($peptide_term eq 'c_term') {
+			if (($index + length($peptide)) < length($protein)) {
+				# This is a C-terminal peptide context.
+				my $offset = ($index + length($peptide))- $n_term_context_length;
+				$peptide_contexts{$peptide_term}{'offset'} = $offset;
+				$logger->debug('Offset = ' . $offset . ' for ' . $peptide_term . ' of peptide ' . $peptide . ' in protein ' . $key . '.');
+			} else {
+				# Peptide C-terminus == Protein C-terminus.
+				$logger->warn('Peptide C-terminus = protein C-terminus. Dropping C-terminal sequence context for peptide ' . $peptide . 
+							  ' in protein ' . $key . '.');
+				next;
+			}
+		}
+
+		#
+		# Check if DB protein sequence is long enough on n-terminal side
+		# to retrieve full length peptide contexts.
+		# (c-term length is checked later...)
+		#
+		if ($peptide_contexts{$peptide_term}{'offset'} < 0) {
+			
+			#
+			# Protein too short for full length context: Calculate offset adjustment.
+			#
+			my $offset = $peptide_contexts{$peptide_term}{'offset'};
+			my $offset_adjustment = -($offset);
+			$peptide_contexts{$peptide_term}{'offset_adjustment'} = $offset_adjustment;
+			$logger->debug('Offset ' . $offset . ' for ' . $peptide_term . ' of peptide ' . $peptide . ' in protein ' . $key . ' less than zero');
+			
+			if (length($padding_character) == 1) {
+				
+				#
+				# Add N-terminal padding characters for contexts that are too short on the N-terminal side.
+				#
+				my $n_term_padding = "$padding_character" x $offset_adjustment;
+				$peptide_contexts{$peptide_term}{'context'} = $n_term_padding;
+				$logger->debug("\t" . 'Will add N-terminal padding: ' . $n_term_padding);
+							
+			} else {
+				$logger->debug('Padding was disabled. Will not add n-term padding.');
+			}
+			
+			$peptide_contexts{$peptide_term}{'offset'} = 0;
+		
+		} else {
+		
+			$peptide_contexts{$peptide_term}{'offset_adjustment'} = 0;
+		
+		}
+		
+		#
+		# Extract sequence context from DB protein sequence.
+		#
+		my $peptide_context = substr($protein, $peptide_contexts{$peptide_term}{'offset'}, 
+									$max_context_length - $peptide_contexts{$peptide_term}{'offset_adjustment'});
+		# append the context extracted from the protein sequence,
+		# because there may be already some N-terminal padding...
+		$peptide_contexts{$peptide_term}{'context'} .= $peptide_context;	
+		
+		#
+		# Quality Control: Check enzyme specificity for enzymes with known (= user supplied) specs.
+		#
+		unless ($cleavage_aa eq '*') {
+			
+			$peptide_context = $peptide_contexts{$peptide_term}{'context'};
+			my $p1 = substr($peptide_context, ($n_term_context_length -1), 1);
+			my $p1_prime = substr($peptide_context, $n_term_context_length, 1);
+			
+			if ($cleavage_terminus eq 'C') {
+				
+				if ($p1 eq $cleavage_aa) {
+					# Peptide fragment contains the AA recognised by the protease.
+					$logger->debug('Specific protease activity -> Retaining peptide context.');
+				} else {	
+					# Must be non-specific cleavage -> drop peptide context.
+					$logger->warn('Non-specific cleavage: dropping ' . $peptide_term . ' peptide context ' . $peptide_context . 
+								  ' for peptide ' . $peptide . ' in protein ' . $key);
+					$peptide_contexts{$peptide_term}{'context'} = '';
+				}
+				
+			} elsif ($cleavage_terminus eq 'N') {
+				
+				if ($p1_prime eq $cleavage_aa) {
+					# Peptide fragment contains the AA recognised by the protease.
+					$logger->debug('Specific protease activity -> Retaining peptide context.');
+				} else {	
+					# Must be non-specific cleavage -> drop peptide context.
+					$logger->warn('Non-specific cleavage: dropping ' . $peptide_term . ' peptide context ' . $peptide_context . 
+								  ' for peptide ' . $peptide . ' in protein ' . $key);
+					$peptide_contexts{$peptide_term}{'context'} = '';
+				}
+            
+			} elsif ($cleavage_terminus eq '*') {
+				
+				if (($p1 eq $cleavage_aa) || ($p1_prime eq $cleavage_aa)) {
+					# Peptide fragment contains the AA recognised by the protease.
+					$logger->debug('Specific protease activity -> Retaining peptide context.');
+				} else {	
+					# Must be non-specific cleavage -> drop peptide context.
+					$logger->warn('Non-specific cleavage: dropping ' . $peptide_term . ' peptide context ' . $peptide_context . 
+								  ' for peptide ' . $peptide . ' in protein ' . $key);
+					$peptide_contexts{$peptide_term}{'context'} = '';
+				}
+			
+			} else {
+				$logger->fatal('Unknown cleavage terminus ' . $cleavage_terminus . '. Must be C, N or *.');
+				exit;
+			}
+		}
+	}
+	
+	push(my @contexts, $peptide_contexts{'n_term'}{'context'}, $peptide_contexts{'c_term'}{'context'});
+	
+	if (length($padding_character) == 1) {
+		_CheckSequenceContextLength(\@contexts, $peptide, $key, $max_context_length);
+	} else {
+		$logger->debug('Padding was disabled. Will not add c-term padding nor check sequence context length.');
+	}
+	
+	return(\@contexts);
+
+}
+
+sub _GetMiscleavageSequenceContexts {
+	
+	my ($peptide, $protein, $key, $petide_index, $cleavage_aa, $cleavage_terminus, $miscleavage_distance) = @_;
+	
+	my @peptide_contexts;
+	my $max_context_length = $n_term_context_length + $c_term_context_length;
+	
+	#
+	# Find miscleavage sites in peptide.
+	#
+	# TODO: handle --mcd
+	#
+	my @amino_acids = split('', $peptide);
+	my $aa_index = 0;
+	#
+	# Drop first and last amino acids as these are from cleavage sites or from the protein termini.
+	# In either case they cannot represent a miscleavage site.
+	#
+	shift(@amino_acids);
+	pop(@amino_acids);
+
+	foreach my $aa (@amino_acids) {
+		
+		$aa_index++;
+		
+		if ($aa eq $cleavage_aa) {
+			
+			#
+			# We found a miscleavage site.
+			#
+			$logger->debug('Found miscleavage site in peptide ' . $peptide . ' of protein ' . $key . '.');
+			#
+			# Get the offset for the sequence context.
+			#
+			my $offset = $petide_index + $aa_index - $n_term_context_length;
+			$logger->debug("\t" . 'Offset = ' . $offset . '.');
+			my $n_term_padding = '';
+			my $offset_adjustment = 0;
+			
+			if ($cleavage_terminus eq 'C') {
+				
+				#
+				# Must increment the offset with 1 if the protease cuts C-terminal of a recognition site.
+				#
+				$offset++;
+				
+			}
+			
+			#
+			# Check if DB protein sequence is long enough on n-terminal side
+			# to retrieve full length peptide contexts.
+			# (c-term length is checked elsewhere.)
+			#
+			if ($offset < 0) {
+				
+				#
+				# Protein too short for full length context: Calculate offset adjustment.
+				#
+				$offset_adjustment = -($offset);
+				$logger->debug('Offset ' . $offset . ' for miscleavage context in peptide ' . $peptide . ' in protein ' . $key . ' less than zero');
+				
+				if (length($padding_character) == 1) {
+					
+					#
+					# Add N-terminal padding characters for contexts that are too short on the N-terminal side.
+					#
+					$n_term_padding = "$padding_character" x $offset_adjustment;
+					$logger->debug("\t" . 'Will add N-terminal padding: ' . $n_term_padding);
+								
+				} else {
+					$logger->debug('Padding was disabled. Will not add n-term padding.');
+				}
+				
+				#
+				# Reset offset for this context to protein start.
+				#
+				$offset = 0;
+				
+			}
+			
+			#
+			# Extract sequence context from DB protein sequence.
+			#
+			my $peptide_context = substr($protein, $offset, $max_context_length - $offset_adjustment);
+			$peptide_context = $n_term_padding . $peptide_context;
+			$logger->debug('Found miscleavage context: ' . $peptide_context . '.');
+								
+			push(@peptide_contexts, $peptide_context);
+		}
+	}
+	
+	if (length($padding_character) == 1) {
+		_CheckSequenceContextLength(\@peptide_contexts, $peptide, $key, $max_context_length);
+	} else {
+		$logger->debug('Padding was disabled. Will not add c-term padding nor check sequence context length.');
+	}
+	
+	return(\@peptide_contexts);
+	
+}
+
+sub _GetModificationSequenceContexts {
+	
+	my ($peptide, $protein, $key, $petide_index, $modified_aa, $modified_peptide) = @_;
+	
+	my @peptide_contexts;
+	my $modified_aa_index_adjustment;
+	my @modified_aa_in_peptide_indices;
+	my $modified_aa_in_protein_index;
+	my $max_context_length = $n_term_context_length + 1 + $c_term_context_length;
+	#my $offset = 0;
+	my $modified_aa_in_modified_peptide_index;
+	my $modified_peptide_remainder;
+	my $processed_peptide_length = 0;
+	
+	#
+	# Find amino acid in the string identifying both the modified amino acid and the modification. 
+	#
+	if ($modified_aa =~ m/^([a-z]+)[A-Z*]$/) {
+			
+		$modified_aa_index_adjustment = length($1);
+		
+	} elsif ($modified_aa =~ m/^[A-Z*][a-z]$/) {
+		
+		$modified_aa_index_adjustment = 0;
+		
+	} else {
+		
+		$logger->fatal('Modified amino acid was specified in an unsupported format: ' . $modified_aa . '.');
+		exit;
+	
+	}
+	
+	#
+	# Find modified sites in peptide.
+	#
+	my $modified_aa_for_regex = $modified_aa;
+	$modified_aa_for_regex =~ s/\*/[A-Z]/;
+	$logger->debug('modified_aa for search: ' . $modified_aa_for_regex);
+	if ($modified_peptide =~ m/$modified_aa_for_regex/) {
+		$logger->debug("\t" . 'Peptide contains mods.');
+		$modified_aa_in_modified_peptide_index = $-[0];
+		$modified_peptide_remainder = $';
+		my $modified_peptide_prefix = $`;
+		$logger->trace("\t\t" . 'Processed pep length = ' . $processed_peptide_length);
+		$processed_peptide_length += length($modified_peptide_prefix);
+		$logger->trace("\t\t" . 'Processed pep length = ' . $processed_peptide_length);
+		$processed_peptide_length += length($modified_aa);
+		$logger->trace("\t\t" . 'Processed pep length = ' . $processed_peptide_length);
+		$logger->trace("\t\t" . 'Index for mod :' . $modified_aa_in_modified_peptide_index);
+	} else {
+		$logger->debug('No modified amino acids found in this peptide.');
+	}
+	
+	#my $modified_aa_in_modified_peptide_index = index($modified_peptide, $modified_aa);
+	
+	while (defined($modified_aa_in_modified_peptide_index)) {
+
+    	$modified_aa_in_modified_peptide_index += $modified_aa_index_adjustment;
+    	$logger->debug('Found modified amino acid at position ' . ($modified_aa_in_modified_peptide_index + 1) . ' in modified peptide ' . $modified_peptide . ' of protein ' . $key . '.');
+    	
+    	#
+    	# Count all modifications of any type preceeding the found modified amino acid.
+    	#
+    	my $modified_n_term = substr($modified_peptide, 0, $modified_aa_in_modified_peptide_index);
+    	my $n_term = $modified_n_term;
+    	$n_term =~ s/[a-z]+//g;
+    	my $mod_count = length($modified_n_term) - length($n_term);
+    	
+    	if ($mod_count >= 0) {
+    		$logger->debug('                      counted modifications preceeding amino acid = ' . $mod_count);
+    	} else {
+    		$logger->fatal('                      counted modifications preceeding amino acid is negative: ' . $mod_count);
+    		exit;
+    	}
+    	
+    	#
+    	# We need to substract the amount of mods preceeding the identified amino acid 
+    	# from the index to get the position of the amino acid in the unmodified peptide.
+    	#
+    	my $modified_aa_in_peptide_index = $modified_aa_in_modified_peptide_index - $mod_count;
+    	$logger->debug('                      and at position ' . ($modified_aa_in_peptide_index + 1) . ' in peptide ' . $peptide . ' of protein ' . $key . '.');
+
+    	push(@modified_aa_in_peptide_indices, $modified_aa_in_peptide_index);
+    	
+		#
+		# Search for additional mods in the modified peptide.
+		#
+		#$offset = $modified_aa_in_modified_peptide_index + 1;
+		#$modified_aa_in_modified_peptide_index = index($modified_peptide, $modified_aa, $offset);
+		if ($modified_peptide_remainder =~ m/$modified_aa_for_regex/) {
+			$logger->debug("\t" . 'Peptide contains more mods.');
+			$modified_peptide_remainder = $';
+			my $modified_peptide_prefix = $`;
+			$logger->trace("\t\t" . 'Processed pep length = ' . $processed_peptide_length);
+			$modified_aa_in_modified_peptide_index = $-[0] + $processed_peptide_length;
+			$logger->trace("\t\t" . 'Index for mod :' . $modified_aa_in_modified_peptide_index);
+			$processed_peptide_length += length($modified_peptide_prefix);
+			$logger->trace("\t\t" . 'Processed pep length = ' . $processed_peptide_length);
+			$processed_peptide_length += length($modified_aa);
+			$logger->trace("\t\t" . 'Processed pep length = ' . $processed_peptide_length);
+		} else {
+			$modified_aa_in_modified_peptide_index = undef();
+			$logger->debug('No more modified amino acids found in this peptide.');
+		}		
+	}
+	
+	#
+	# Retrieve sequence contexts for modified stites from protein sequence
+	#
+	foreach my $modified_aa_in_peptide_index (@modified_aa_in_peptide_indices) {
+		
+		#
+		# Get the offset for the sequence context.
+		#
+		my $offset = $petide_index + $modified_aa_in_peptide_index - $n_term_context_length;
+		$logger->debug("\t" . 'Offset = ' . $offset . '.');
+		my $n_term_padding = '';
+		my $offset_adjustment = 0;
+		
+		#
+		# Check if DB protein sequence is long enough on n-terminal side
+		# to retrieve full length peptide contexts.
+		# (c-term length is checked elsewhere.)
+		#
+		if ($offset < 0) {
+			
+			#
+			# Protein too short for full length context: Calculate offset adjustment.
+			#
+			$offset_adjustment = -($offset);
+			$logger->debug('Offset ' . $offset . ' for modification context in peptide ' . $peptide . ' in protein ' . $key . ' less than zero');
+			
+			if (length($padding_character) == 1) {
+				
+				#
+				# Add N-terminal padding characters for contexts that are too short on the N-terminal side.
+				#
+				$n_term_padding = "$padding_character" x $offset_adjustment;
+				$logger->debug("\t" . 'Will add N-terminal padding: ' . $n_term_padding);
+							
+			} else {
+				$logger->debug('Padding was disabled. Will not add n-term padding.');
+			}
+			
+			#
+			# Reset offset for this context to protein start.
+			#
+			$offset = 0;
+			
+		}
+		
+		#
+		# Extract sequence context from DB protein sequence.
+		#
+		my $peptide_context = substr($protein, $offset, $max_context_length - $offset_adjustment);
+		$peptide_context = $n_term_padding . $peptide_context;
+		$logger->debug('Found modification context: ' . $peptide_context . '.');
+							
+		push(@peptide_contexts, $peptide_context);
+		
+	}
+	
+	if (length($padding_character) == 1) {
+		_CheckSequenceContextLength(\@peptide_contexts, $peptide, $key, $max_context_length);
+	} else {
+		$logger->debug('Padding was disabled. Will not add c-term padding nor check sequence context length.');
+	}
+	
+	return(\@peptide_contexts);
+	
+}
+
+#
+# TODO: pep contexts.
+#
+sub _GetPeptideSequenceContexts {
+	
+	my ($peptide, $protein, $key, $peptide_index) = @_;
+	
+	my @peptide_contexts;
+	my $max_context_length = $n_term_context_length + length($peptide) + $c_term_context_length;
+	
+	#
+	# Get the offset for the peptide sequence context.
+	#
+	my $offset = $peptide_index - $n_term_context_length;
+	$logger->debug("\t" . 'Offset = ' . $offset . '.');
+	my $n_term_padding = '';
+	my $offset_adjustment = 0;
+	
+	#
+	# Check if DB protein sequence is long enough on n-terminal side
+	# to retrieve full length peptide contexts.
+	# (c-term length is checked elsewhere.)
+	#
+	if ($offset < 0) {
+		
+		#
+		# Protein too short for full length context: Calculate offset adjustment.
+		#
+		$offset_adjustment = -($offset);
+		$logger->debug('Offset ' . $offset . ' for peptide context of peptide ' . $peptide . ' in protein ' . $key . ' less than zero');
+		
+		if (length($padding_character) == 1) {
+			
+			#
+			# Add N-terminal padding characters for contexts that are too short on the N-terminal side.
+			#
+			$n_term_padding = "$padding_character" x $offset_adjustment;
+			$logger->debug("\t" . 'Will add N-terminal padding: ' . $n_term_padding);
+						
+		} else {
+			$logger->debug('Padding was disabled. Will not add n-term padding.');
+		}
+		
+		#
+		# Reset offset for this context to protein start.
+		#
+		$offset = 0;
+		
+	}
+	
+	#
+	# Extract sequence context from DB protein sequence.
+	#
+	my $peptide_context = substr($protein, $offset, $max_context_length - $offset_adjustment);
+	$peptide_context = $n_term_padding . $peptide_context;
+	$logger->debug('Found modification context: ' . $peptide_context . '.');
+						
+	push(@peptide_contexts, $peptide_context);
+	
+	if (length($padding_character) == 1) {
+		_CheckSequenceContextLength(\@peptide_contexts, $peptide, $key, $max_context_length);
+	} else {
+		$logger->debug('Padding was disabled. Will not add c-term padding nor check sequence context length.');
+	}
+	
+	return(\@peptide_contexts);
+
+}
+
+sub _CheckSequenceContextLength {
+
+	my ($contexts, $peptide, $key, $max_context_length) = @_;
+	
+	#
+	# If padding was enabled for sequence contexts < max sequence context length.
+	#
+	foreach my $peptide_context (@{$contexts}) {
+		
+		my $peptide_context_length = length($peptide_context);
+		
+		if ($peptide_context_length < $max_context_length) {
+			
+			#
+			# sequence context < max sequence context length -> append c-term padding.
+			# (n-term padding was already appended if necessary during sequence context lookup with _GetSequenceContext().)
+			#
+			my $c_term_padding = "$padding_character" x ($max_context_length - $peptide_context_length);
+			$logger->debug('Length of peptide context (' . $peptide_context_length . ') < max context length.');
+			$logger->debug("\t" . 'Will add C-terminal padding: ' . $c_term_padding);
+			$peptide_context = $peptide_context . $c_term_padding;
+		
+		}
+		
+		#
+		# Sanity check for padded sequence contexts.
+		#
+		# (Checks if the combined effect of n-term and c-term padding worked well.)
+		#
+		$peptide_context_length = length($peptide_context);
+		unless ($peptide_context_length == $max_context_length) {
+			$logger->fatal('Length of peptide context too short or too long for peptide ' . $peptide . ' in protein ' . $key);
+			$logger->fatal("\t" . 'Context length is ' . $peptide_context_length . ', but should be ' . $max_context_length . '.');
+			exit;
+		}
+	}
+	
+	#
+	# No need to return the modified (c-terminal-padded) contexts as we worked with a arrayref.
+	#
+	
+}
+
+sub _SaveSequenceContexts {
+	
+	my ($line, $acc, $contexts, $path_to_fh, $extracted_seq_contexts) = @_;
+	
+	#
+	# Append the peptide contexts to the existing line of the fragment file and
+	# save that new line to the output file.
+	# By appending we keep all info from the original fragment file intact.
+	#
+	foreach my $context (@{$contexts}) {
+		
+		#
+		# Skip "empty"/"missing" contexts.
+		# May be the result of:
+		#  1. The peptide terminus being also the protein terminus.
+		#  2. Proteins missing from the database.
+		#  3. ...
+		#
+		if ($padding_character ne '') {
+			if ($context =~ m/^($padding_character)+$/) {
+				next;
+			}
+		} else {
+			if ($context eq '') {
+				next;
+			}
+		}
+		
+		my $new_line = $line;
+		
+		# 1. Append accession numbers if the fragment file used IDs.
+		if ($index_id eq 'ID') {
+			$new_line .= "\t" . $acc;
+		}
+		
+		# 2. Append sequence context.
+		$new_line .= "\t" . $context;
+		# 3. Append new line character.
+		$new_line .= "\n";
+		# Save result to disk.
+		print($path_to_fh $new_line);
+		$extracted_seq_contexts++;
+	
+	}
+	
+	return($extracted_seq_contexts);
+	
+}
+
+sub _Usage {
+	 
+               
+    print STDOUT	"\n" .
+    'ExtractPeptideSequenceContext.pl: Extract sequence contexts of cleavage, miscleavage or modification sites ' . "\n" .
+    '                                  based on sequence fragments (like peptides experimentally identified with MSMS) ' . "\n" .
+    '                                  and protein sequences from a database in FASTA or UniProtKB file format.' . "\n" .
+    "\n" .
+    'Usage:' . "\n" .
+    "\n" .
+    '   ExtractPeptideSequenceContext.pl options' . "\n" .
+    "\n" .
+    'Available options are:' . "\n" .
+    "\n" .
+    '   --db [file]    Input file containing all database sequences in either UniProtKB *.dat or *.fasta format.' . "\n" .
+    '   --dba [file]   Optional alternative / backup input file containing database sequences in either UniProtKB *.dat or *.fasta format.' . "\n" .
+    '                  This may be a slightly older or newer version of the database, which will be searched in case a ' . "\n" .
+    '                  sequence ID was not found in the primary database file that was specified with --db [file].' . "\n" .
+    '   --dbf [format] Optional argument to specify the database sequence file format. Valid format specifications are:' . "\n" .
+    '                    * \'UniProtKB DAT\' for UniProtKB DAT files.' . "\n" .
+    '                    * \'FASTA\'         for FASTA files.' . "\n" .
+    '                  Only required if the database files do *not* have the standard file extensions: ' . "\n" .
+    '                    *.dat for UniProtKB DAT files or ' . "\n" .
+    '                    *.fa or *.fasta for FASTA files.' . "\n" .
+    '   --k [ID|AC]    Unique identifier type used as key to index an UniProtKB *.dat database file.' . "\n" .
+    '                  Must be either AC for a UniProt accession number (default) or ID for a UniProt identifier.' . "\n" .
+    '                  Not required for *.fasta databases: This tool will look for any type of ID in the first part of FASTA ' . "\n" .
+    '                  sequence headers up until the first white space.' . "\n" .
+    '                  Optionally multiple IDs may be present separated with pipe symbols (|) or semicolons (;).' . "\n" .
+    '                  Optionally IDs may be prefixed with a database namespace and a colon (:).' . "\n" .
+    '                  For example the accession number P32234 as well as the ID 128UP_DROME would be recognised in both ' . "\n" .
+    '                  this sequence header:' . "\n" .
+    '                     >UniProtAcc:P32234|UniProtID:128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)' . "\n" .
+    '                  and in this one:' . "\n" .
+    '                     >P32234|128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)' . "\n" .
+    '   --f [file]     Fragment file containing accession numbers or IDs and sequence fragments (peptides), ' . "\n" .
+    '                  whose sequence context should be extracted from the database. This file' . "\n" .
+    '                    * Must be tab delimited with one accession/ID plus one sequence fragment per line' . "\n" .
+    '                    * Must contain accession numbers / IDs in the same format as the database file used.' . "\n" .
+    '                      Exception: Optionally IDs may be suffixed with the peptide\'s position in its protein between brackets.' . "\n" .
+	'                      For example: CLH1_HUMAN[1612-1620].' . "\n" .
+    '                    * May contain other columns, which will be preserved in the output.' . "\n" .
+    '                    * Is the basis for the output: if a sequence context was found, ' . "\n" .
+    '                      it will be appended in another column to the right of the existing columns.' . "\n" .
+	'   --icol [int]   Column in the tab delimited fragment file containing the protein identifiers / accession numbers.' . "\n" .
+	'                  Default = 1.' . "\n" .
+	'   --pcol [int]   Column in the tab delimited fragment file containing the peptide sequences.' . "\n" .
+	'                  Default = 2.' . "\n" .
+    '   --s            Strip lowercase characters from the sequence fragments in the fragment file before doing a database lookup.' . "\n" .
+    '                  Amino acids are expected to be in uppercase. If not, the sequences are converted to uppercase ' . "\n" .
+    '                  before searching for the fragment in the database. When lowercase characters represent ' . "\n" .
+    '                  modifications instead of amino acids these need to be removed before searching the database.' . "\n" .
+    '                  Default = 0 (disabled) except when a modified amino acid is specified with --ma, which will automatically enable --s.' . "\n" .
+    '   --cleo [file]  Optional output file where the retrieved cleavage site sequence contexts will be saved.' . "\n" .
+	'   --miso [file]  Optional output file where the retrieved miscleavage site sequence contexts will be saved.' . "\n" .
+	'   --modo [file]  Optional output file where the retrieved modification site sequence contexts will be saved.' . "\n" .
+	'   --pepo [file]  Optional output file where the retrieved peptide sequence contexts will be saved.' . "\n" .
+    '                  (At least one output file must be specified with --cleo, --miso, --modo or --pepo).' . "\n" .
+    '   --ca [A-Z]     Cleavage amino acid. Assume the sequence fragments were derived from cutting with a proteolytic enzyme, ' . "\n" .
+    '                  that recognizes this amino acid as the position to cut.' . "\n" .
+    '                  When the specificity of the protease used to generate the peptides in the fragments file is unknown,' . "\n" .
+    '                  you may provide an asterisk (*) to retrieve sequence context for any cleavage,' . "\n" .
+    '                  but in that case this tool will not filter non-specifically cleaved fragments,' . "\n" .
+    '                  that may be the result of other processes than protease activity.' . "\n" .
+    '   --ct [C|N]     Cleavage terminus. Assume the sequence fragments were derived from cutting with a proteolytic enzyme, ' . "\n" .
+    '                  that cuts at the C or N terminus of the amino acid specified with the --ca [A-Z] parameter.' . "\n" .
+    '                  When the specificity of the protease used to generate the peptides in the fragments file is unknown,' . "\n" .
+    '                  you may provide an asterisk (*) to retrieve sequence context for any cleavage,' . "\n" .
+    '                  but in that case this tool will not filter non-specifically cleaved fragments,' . "\n" .
+    '                  that may be the result of other processes than protease activity.' . "\n" .
+    '   --ma [A-Za-z]  Modified amino acid.' . "\n" .
+    '                  The amino acid must be specified in uppercase and the modification in lower case.' . "\n" .
+    '                  The order is not important.' . "\n" .
+    '                  Hence a phophorylated serine in the fragments file can be indicated with either pS or Sp, ' . "\n" .
+    '                  but you cannot mix both pS and Sp in a single fragments file.' . "\n" .
+    '                  You may provide an asterisk (*) instead of an upper case amino acid to retrieve sequence contexts ' . "\n" .
+    '                  for the specified modification no matter what amino acid it was located on.' . "\n" .
+    '                  A modification may be specified with more than one lower case character, ' . "\n" .
+    '                  so for example phosphoS or Sphospho can also be used for a phosphorylated serine.' . "\n" .
+    '   --n [int]      Integer specifying the length of the N-terminal sequence context to retrieve starting from the ' . "\n" .
+    '                  cleavage, miscleavage or modification site. Default = 5.' . "\n" .
+    '   --c [int]      Integer specifying the length of the C-terminal sequence context to retrieve starting from the ' . "\n" .
+    '                  cleavage, miscleavage or modification site. Default = 5.' . "\n" .
+    '                  Please note that cleavage and miscleavage sites have zero width, ' . "\n" .
+    '                  while modified amino acids will increment the sequence context lenght with 1.' . "\n" .
+    '                  When defaults are used for both the N-terminal and C-terminal sequence context lengths,' . "\n" .
+    '                  the total sequence context length for a' . "\n" .
+    '                    * cleavage or miscleavage site will be:' . "\n" .
+    '                      (N-terminal sequence context) + (C-terminal sequence context) = 5 + 5 = 10.' . "\n" .
+    '                    * modification site will be:' . "\n" .
+    '                      (N-terminal sequence context) + modified AA + (C-terminal sequence context) = 5 + 1 + 5 = 11.' . "\n" .
+#	'   --mcd [int]    Minimal distance between a putative miscleavage site and the peptide termini.' . "\n" .
+#	'                  Uncleaved amino acids closer to the peptide termini can be ignored with this parameter:' . "\n" .
+#	'                    A. To prevent overlap between cleavage and miscleavage peptide sequence contexts.' . "\n" .
+#	'                    B. To prevent false positive miscleavage sites due to sites that cannot be cleaved, ' . "\n" .
+#	'                       because one of the resulting fragments is too short. (Sometimes proteases may need ' . "\n" .
+#	'                       a minimal length surrounding the cleavage site to bind and cleave a peptide.)' . "\n" .
+#	'                  Default = same as the longest *-terminal context length. Hence [--n [int]|--c [int]] (see above).' . "\n" .
+    '   --pc [char]    Optional padding character to fill N-terminal or C-terminal positions in the sequence context, ' . "\n" .
+    '                  when the protein was too short to get a complete sequence context.' . "\n" .
+    '                  Default = - (a.k.a. dash or the alignment gap character.)' . "\n" .
+    '                  To disable padding specify an empty string like this --pc \'\'.' . "\n" .
+    '   --ll [LEVEL]   Log4perl log level. One of: ALL, TRACE, DEBUG, INFO (default), WARN, ERROR, FATAL or OFF.' . "\n" .
+    "\n";
+    exit;
+
+}
author	galaxyp
date	Fri, 10 May 2013 17:15:08 -0400
parents
children