Mercurial > repos > galaxyp > nbic_fasta
diff ExtractPeptideSequenceContext.pl @ 0:163892325845 draft default tip
Initial commit.
author | galaxyp |
---|---|
date | Fri, 10 May 2013 17:15:08 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ExtractPeptideSequenceContext.pl Fri May 10 17:15:08 2013 -0400 @@ -0,0 +1,1744 @@ +#!/usr/bin/perl -w +# +# This script extracts subsequences from UniProtKB *.dat files or from *.fasta files +# based on a list of accession numbers / IDs and a sequence fragment (peptide). +# +# ===================================================== +# $Id: ExtractPeptideSequenceContext.pl 112 2011-03-01 19:50:23Z pieter.neerincx@gmail.com $ +# $HeadURL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/ExtractPeptideSequenceContext.pl $ +# $LastChangedDate: 2011-03-01 13:50:23 -0600 (Tue, 01 Mar 2011) $ +# $LastChangedRevision: 112 $ +# $LastChangedBy: pieter.neerincx@gmail.com $ +# ===================================================== +# + +# +# initialise environment +# +use strict; +use Getopt::Long; +use SWISS::Entry; # For parsing UniProtKB *.dat files. +use Log::Log4perl qw(:easy); + +my $ps = '/'; # Path Separator. +my %log_levels = ( + 'ALL' => $ALL, + 'TRACE' => $TRACE, + 'DEBUG' => $DEBUG, + 'INFO' => $INFO, + 'WARN' => $WARN, + 'ERROR' => $ERROR, + 'FATAL' => $FATAL, + 'OFF' => $OFF, +); + +# +# List of databases, which may contain versioned accession numbers, +# and the separation character used to join the accession number and +# the version number. +# +my %databases_with_optionally_versioned_ids = ( + 'IPI' => '.', + 'ipi' => '.', + 'UniProtKB/Swiss-Prot' => '.', + 'sp' => '.', + 'SP' => '.', + 'UniProtKB/TrEMBL' => '.', + 'TR' => '.', + 'tr' => '.', + 'Tr' => '.' +); + +my $db; +my $db_alt; +my $db_format; +my $index_id; # Key used to lookup sequences. Can be ID or Accession number (default). +my $fragments; +my $id_column; +my $peptide_column; +my $strip_lc; # For the sequence fragments in the fragments file. With strip_lc enabled we + # assume amino acids are in upper case and lower case characters represent modifications, + # which will be stripped off before searching for the fragments in the complete sequences. +my $cleavage_output; # Sequence contexts for cleavage sites. +my $miscleavage_output; # Sequence contexts for miscleavage sites. +my $modification_output; # Sequence contexts for modification sites. +my $peptide_output; # Sequence contexts for complete peptides. +my $cleavage_aa; # Assume the sequence fragments were derived from cutting at this amino acid. +my $cleavage_terminus; # Assume the sequence fragments were derived from cutting at this side of $cut_at_aa +my $modified_aa; +my $n_term_context_length; +my $c_term_context_length; +my $miscleavage_distance; # Minimal distance between a putative miscleavage site and the peptide termini. + # Uncleaved AAs closer to the peptide termini can be ignored with this parameter: + # A. To prevent overlap between cleavage and miscleavage peptide sequence contexts. + # B. To remove false positive miscleavage sites that cannot be cleaved, + # because one of the resulting fragments is too short. Sometimes proteases may need + # a minimal length surrounding the cleavage site to bind and cleave a peptide. +my $padding_character; # Used for padding if the protein sequence is too short for a long enough context. +my $log_level; + +Getopt::Long::GetOptions ( + 'db=s' => \$db, + 'dba:s' => \$db_alt, + 'dbf:s' => \$db_format, + 'k:s' => \$index_id, # Key used to lookup sequences. Can be ID or Accession number (default). + 'f=s' => \$fragments, + 'icol:i' => \$id_column, + 'pcol:i' => \$peptide_column, + 's' => \$strip_lc, # For the sequence fragments in the fragments file. With strip_lc enabled we + # assume amino acids are in upper case and lower case characters represent modifications, + # which will be stripped off before searching for the fragments in the complete sequences. + 'cleo:s' => \$cleavage_output, + 'miso:s' => \$miscleavage_output, + 'modo:s' => \$modification_output, + 'pepo:s' => \$peptide_output, + 'ca:s' => \$cleavage_aa, # Assume the sequence fragments were derived from cutting at this amino acid. + 'ct:s' => \$cleavage_terminus, # Assume the sequence fragments were derived from cutting at this side of $cut_at_aa + 'ma:s' => \$modified_aa, + 'n:i' => \$n_term_context_length, + 'c:i' => \$c_term_context_length, + 'mcd:i' => \$miscleavage_distance, # Minimal distance between a putative miscleavage site and the peptide termini. + # Uncleaved AAs closer to the peptide termini can be ignored with this parameter: + # A. To prevent overlap between cleavage and miscleavage peptide sequence contexts. + # B. To remove false positive miscleavage sites that cannot be cleaved, + # because one of the resulting fragments is too short. Sometimes proteases may need + # a minimal length surrounding the cleavage site to bind and cleave a peptide. + 'pc:s' => \$padding_character, + 'll:s' => \$log_level +); + +# +# TODO: 1. Handle --mcd param. Maybe should be extended to exclude also modification sites too close together... +# + +# +# Configure logging. +# +# Provides default if user did not specify log level: +$log_level = (defined($log_level) ? $log_level : 'INFO'); +# Reset log level to default if user specified illegal log level. +$log_level = (defined($log_levels{$log_level}) ? $log_levels{$log_level} : $log_levels{'INFO'}); +#Log::Log4perl->init('log4perl.properties'); +Log::Log4perl->easy_init( + #{ level => $log_level, + # file => ">>ExtractPeptideSequenceContext.log", + # layout => '%F{1}-%L-%M: %m%n' }, + { level => $log_level, + file => "STDOUT", + layout => '%d L:%L %p> %m%n' }, +); +my $logger = Log::Log4perl::get_logger(); + +# +# Check user input. +# +foreach my $input ($db, $db_alt, $fragments, $cleavage_output, $miscleavage_output, $modification_output) { + if (defined($input) && length($input) <= 0) { + $input = undef(); + } +} +unless (defined($db) && + defined($fragments) && + (defined($cleavage_output) || defined($miscleavage_output) || defined($modification_output) || defined($peptide_output)) +) { + $logger->fatal('You need to specify at least a DB & fragments file as inputs and at least one output file.'); + _Usage(); + exit; +} +if (defined($db_format)) { + unless ($db_format eq 'UniProtKB DAT' or $db_format eq 'FASTA') { + $logger->fatal('Database format in unsupported format.'); + $logger->fatal("\t" . 'Must be \'UniProtKB DAT\' or \'FASTA\'.'); + $logger->fatal("\t" . 'But --dbf was ' . $db_format . '.'); + exit; + } +} +if (defined($cleavage_aa)) { + unless ($cleavage_aa =~ m/^[*A-Z]$/) { + $logger->fatal('Cleavage amino acid in unsupported format.'); + $cleavage_aa = undef; + } +} +if (defined($cleavage_terminus)) { + unless ($cleavage_terminus =~ m/^[*CN]$/) { + $logger->fatal('Cleavage terminus in unsupported format.'); + $cleavage_terminus = undef; + } +} +if (defined($modified_aa)) { + unless ($modified_aa =~ m/^[*A-Z][a-z]+$/ || $modified_aa =~ m/^[a-z]+[*A-Z]$/) { + $logger->fatal('Modified amino acid in unsupported format.'); + $modified_aa = undef; + } +} +# +# We need info to extract: +# * cleavage site sequence contexts or +# * modification site sequence contexts or +# * peptide sequence contexts +# +if (defined($cleavage_output) || defined($miscleavage_output)) { + unless (defined($cleavage_aa) && defined($cleavage_terminus)) { + $logger->fatal('If you want to retrieve (mis)cleavage site sequence contexts, you must provide valid values for --ca and --ct.'); + $logger->debug('You specified --ca ' . $cleavage_aa . ' --ct ' . $cleavage_terminus . '.'); + _Usage(); + exit; + } +} +if (defined($modification_output)) { + unless (defined($modified_aa)) { + $logger->fatal('If you want to retrieve modifications site sequence contexts you must provide a valid value for --ma.'); + $logger->debug('You specified --ma ' . $modified_aa . '.'); + _Usage(); + exit; + } +} +if (defined($peptide_output)) { + # + # We can work with defaults for all params. + # +} + +# Make sure we don't overwrite the input. +foreach my $output ($cleavage_output, $miscleavage_output, $modification_output, $peptide_output) { + foreach my $input ($db, $db_alt, $fragments) { + if (defined($input) && defined($output) && $output eq $input) { + $logger->fatal('Output file ' . $output . ' is the same as input file ' . $input . '.'); + $logger->fatal("\t" . 'Please choose a different file for the output.'); + exit; + } + } +} +# Define defaults for optional arguments. +$index_id = (defined($index_id) && length($index_id) > 0 ? $index_id : 'AC'); +$id_column = (defined($id_column) && length($id_column) > 0 ? $id_column : 1); +$peptide_column = (defined($peptide_column) && length($peptide_column) > 0 ? $peptide_column : 2); +$n_term_context_length = (defined($n_term_context_length) && length($n_term_context_length) > 0 ? $n_term_context_length : 5); +$c_term_context_length = (defined($c_term_context_length) && length($c_term_context_length) > 0 ? $c_term_context_length : 5); +$padding_character = (defined($padding_character) ? $padding_character : '-'); +# Stripping of lowercase characters is disabled by default unless we search for modification site contexts in which case it must be enabled! +$strip_lc = (defined($strip_lc) ? $strip_lc : 0); +$strip_lc = (defined($modified_aa) ? 1 : $strip_lc); +unless (defined($miscleavage_distance) && $miscleavage_distance > 0) { + if ($n_term_context_length >= $c_term_context_length) { + $miscleavage_distance = $n_term_context_length; + } else { + $miscleavage_distance = $c_term_context_length; + } +} + + +# +# Check db files and fragments file. +# +unless (-e $db && -f $db && -r $db) { + $logger->fatal('Cannot read from database input file ' . $db . ': ' . $!); + exit; +} +if (defined($db_alt)) { + unless (-e $db_alt && -f $db_alt && -r $db_alt) { + $logger->fatal('Cannot read from alternative database input file ' . $db_alt . ': ' . $!); + exit; + } +} +unless (-e $fragments && -f $fragments && -r $fragments) { + $logger->fatal('Cannot read from fragments input file ' . $fragments . ': ' .$!); + exit; +} + +# +# Create lookup table(s) of the entire database input file(s). +# +my @db_lookup_table_refs; +(my $db_seqs, $index_id) = _CreateLookupHash($db, $index_id, $db_format); +my $seq_count = scalar(keys(%{$db_seqs})); +$logger->info('Number of sequences in database lookup table: ' . $seq_count); +push(@db_lookup_table_refs, $db_seqs); +# Optional backup DB file. +if (defined($db_alt)) { + (my $db_seqs_alt, $index_id) = _CreateLookupHash($db_alt, $index_id, $db_format); + my $seq_count_alt = scalar(keys(%{$db_seqs_alt})); + $logger->info('Number of sequences in backup database lookup table: ' . $seq_count_alt); + push(@db_lookup_table_refs, $db_seqs_alt); +} + +# +# Debugging loop +# +#foreach my $keyyy (keys(%{$db_seqs})) { +# $logger->fatal('Key: ' . $keyyy); +# $logger->fatal("\t" . 'AC: ' . ${$db_seqs}{$keyyy}{'AC'}); +# $logger->fatal("\t" . 'ID: ' . ${$db_seqs}{$keyyy}{'ID'}); +# $logger->fatal("\t" . 'SQ: ' . ${$db_seqs}{$keyyy}{'SQ'}); +#} + +# +# Lookup sequence fragment contexts and store results. +# +my ($counter_cleavages, $counter_miscleavages, $counter_modifications, $counter_peptides) = _ExtractSeqs(\@db_lookup_table_refs, + $fragments, $strip_lc, $id_column, $peptide_column, + $cleavage_output, $miscleavage_output, $modification_output, $peptide_output, + $cleavage_aa, $cleavage_terminus, $miscleavage_distance, + $n_term_context_length, $c_term_context_length, + $padding_character); +# +# Report result stats. +# +my $max_length = 0; +foreach my $counter ($counter_cleavages, $counter_miscleavages, $counter_modifications, $counter_peptides) { + $max_length = (length($counter) > $max_length) ? length($counter) : $max_length; +} +my $prefixed_counter = sprintf('%*s', $max_length, $counter_cleavages); +$logger->info('Extracted cleavage site sequence contexts: ' . $prefixed_counter . '.'); +$prefixed_counter = sprintf('%*s', $max_length, $counter_miscleavages); +$logger->info('Extracted miscleavage site sequence contexts: ' . $prefixed_counter . '.'); +$prefixed_counter = sprintf('%*s', $max_length, $counter_modifications); +$logger->info('Extracted modification site sequence contexts: ' . $prefixed_counter . '.'); +$prefixed_counter = sprintf('%*s', $max_length, $counter_peptides); +$logger->info('Extracted peptide sequence contexts: ' . $prefixed_counter . '.'); +$prefixed_counter = sprintf('%*s', $max_length, $counter_cleavages + $counter_miscleavages + $counter_modifications + $counter_peptides); +$logger->info('Total extracted sequence contexts: ' . $prefixed_counter . '.'); +$logger->info('Finished!'); + +# +## +### Internal subs. +## +# + +sub _CreateLookupHash { + + my ($file_path, $index_id, $file_type) = @_; + + my $file_path_fh; + my %seqs; + + if (defined($file_type)) { + $logger->info('Parsing ' . $file_type . ' file ' . $file_path . '...'); + } elsif ($file_path =~ m/($ps?)([^\.]+)\.dat$/i) { + $file_type = 'UniProtKB DAT'; + $logger->info('Parsing ' . $file_type . ' file ' . $file_path . '...'); + } elsif ($file_path =~ m/($ps?)([^\.]+)\.fa(sta)?$/i) { + $file_type = 'FASTA'; + # For FASTA files we don't know what to expect: + # IDs or ACcession numbers or a mix of both. + # (Re)set index_id type to AC to prevent adding redundant IDs to the results. + # Hence treat all identifiers stable and unstable as ACs. + # (Normally we default to ACcession numbers and if we would have found an AC + # and the input fragments file was using IDs, we would append the AC to the results.) + $index_id = 'AC'; + $logger->info('Parsing ' . $file_type . ' file ' . $file_path . '...'); + } else { + $file_type = 'FASTA'; + $index_id = 'AC'; + $logger->warn('Could not determine database file type based on the file name extension of ' . $file_path); + $logger->warn('Will assume it is a FASTA file and hope that is correct...'); + $logger->info('Parsing ' . $file_type . ' file ' . $file_path . '...'); + } + + if ($file_type eq 'UniProtKB DAT') { + + # + # Read an entire record delimited by // at a time. + # + local $/ = "\/\/\n"; + + eval { + open($file_path_fh, "<$file_path"); + }; + if ($@) { + $logger->fatal('Cannot read from database input file: ' . $@); + exit; + } + + while (<$file_path_fh>){ + + $logger->trace('Record:' . "\n$_\n"); + + my $entry = SWISS::Entry->fromText($_); + + my $id = $entry->ID; + my $acc = $entry->AC; + my $seq = $entry->SQ; + my $index_key; + + if ($index_id eq 'ID') { + $index_key = $id; + } elsif ($index_id eq 'AC') { + $index_key = $acc; + } else { + $logger->fatal('Unknown type of identifier used for indexing: ' . $index_id . '. Must be ID or AC.'); + exit; + } + + $logger->debug('Found accession ' . $acc); + $logger->trace('Found accession ' . $acc . ' with ID ' . $id . ' and seq ' . $seq); + + $seqs{$index_key}{'AC'} = $acc; + $seqs{$index_key}{'ID'} = $id; + $seqs{$index_key}{'SQ'} = $seq; + + } + + } elsif ($file_type eq 'FASTA') { + + my @header_ids = (); + my $seq = ''; + + eval { + open($file_path_fh, "<$file_path"); + }; + if ($@) { + $logger->fatal('Cannot read from database input file: ' . $@); + exit; + } + + LINE: while (my $line = <$file_path_fh>) { + + if ($line =~ m/^[\s\n\r]+$/) { + + # + # Skip blank line + # + $logger->debug('Found empty line in FASTA file.'); + next LINE; + + } elsif ($line =~ m/^[a-zA-Z]+/) { + + # + # It's a sequence line + # Remove all white space and new line characters. + # + $logger->debug('Found FASTA sequence line: ' . $line); + + $line =~ s/[\s\n\r]+//g; + $seq .= $line; + + } elsif ($line =~ m/^>/) { + + $logger->debug('Found new FASTA header line: ' . $line); + + if ((length($seq) > 0) && defined($header_ids[0])) { + + # + # Store the previously found sequence in the lookup table. + # + foreach my $header_id (@header_ids) { + + # + # We don't know if the ID we found is an accession number (stable ID) or a normal (unstable) ID + # -> Fill both the AC and ID fields with the same ID value. + # + $seqs{$header_id}{'AC'} = $header_id; + $seqs{$header_id}{'ID'} = $header_id; + $seqs{$header_id}{'SQ'} = $seq; + + $logger->debug('Stored ' . $header_id . ':'); + $logger->debug("\t" . 'SQ ' . $seq); + + } + + # + # Reset vars. + # + $seq = ''; + @header_ids = (); + + } + + # + # Check for the presence of some frequently occurring naming schemes: + # + # >IPI:CON_Trypsin|SWISS-PROT:P00761|TRYP_PIG Trypsin - Sus scrofa (Pig). + # >IPI:CON_IPI00174775.2|TREMBL:Q32MB2;Q86Y46 Tax_Id=9606 Gene_Symbol=KRT73 Keratin-73 + # >sp|Q42592|APXS_ARATH L-ascorbate peroxidase S, chloroplastic/mitochondrial; + # >jgi|Triad1|1|gw1.3.1.1 + # + # The FASTA header line can basically have any format. + # Therefore we try to see if the IDs we are looking for are present anywhere + # in the header line up until the first white space or new line. + # This should prevent matching annotation from other proteins in the description + # like a for example a 'best' BLAST hit that contains one of the IDs we + # are looking for. Hence, in such a case this sequence is *not* the one of + # the IDs we looking for, but similar to that one at best. + # + my @unfiltered_ids; + if ($line =~ m/^>((([^\s;|]+)[;|])*([^\s;|]+))[|;]?/i) { + + # + # Got multiple IDs + # + my $multiple_ids = $1; + @unfiltered_ids = split(/[\s;|]+/, $multiple_ids); + + } elsif ($line =~ m/^>([^\s]+)/i){ + + # + # Got a single ID + # + my $single_id = $1; + push(@unfiltered_ids, $single_id); + + } + + KEY:foreach my $unfiltered_id (@unfiltered_ids) { + + my $this_id = $unfiltered_id; + + # + # Check for RockerBox formattted IDs. + # + # Example of RockerBox ID format, which may contain + # * multiple IDs separated by semi-colons and + # * suffixed with the position of the peptide in the protein: + # YGR192C[123-142]; YJR009C[123-142] + if ($this_id =~ m/^([^\[\]]+)\[\d+-\d+\]$/i) { + $logger->trace('Protein ID in RockerBox format: ' . $this_id); + $this_id = $1; + $logger->trace("\t" . 'Using ID : ' . $this_id); + } + + # + # Check for DB namespace prefixes. + # + if ($this_id =~ m/([^:]+):([^:]+)/i) { + + # + # Namespace prefixed ID. + # + $logger->trace('Namespace prefixed ID: ' . $this_id); + + my $prefix = $1; + $this_id = $2; + + $logger->trace("\t" . 'Using ID: ' . $this_id); + + # + # For a selected list of "known" database namespace prefixes, + # check for versioned accession numbers and remove the version number. + # + if (defined($prefix) && defined($databases_with_optionally_versioned_ids{$prefix})) { + $logger->trace('Found prefix: ' . $prefix); + my $separator = $databases_with_optionally_versioned_ids{$prefix}; + $separator = '\\' . $separator; + if ($this_id =~ m/([^$separator]+)$separator\d+$/) { + $this_id = $1; + } + } + } + + $logger->debug('Found ID: ' . $this_id); + push(@header_ids, $this_id); + + } + +# if ($line =~ m/^>((([^\s\n\r:;|]+)[:]([^\s\n\r:|]+)[|;])*([^\s\n\r:;|]+)[:]([^\s\n\r:|]+))[|;]?(\s+(.+))?/i) { +# +# # +# # One or more namespace prefixed IDs. +# # +# my $concatenated_namespace_prefixed_ids = $1; +# $logger->debug('Found prefixed IDs in header: ' . $concatenated_namespace_prefixed_ids); +# +# # database_namespace = $3 && $5; +# # id = $4 && $6; +# # description = $8; +# my @namespace_prefixed_ids = split(/[|;]/, $concatenated_namespace_prefixed_ids); +# +# foreach my $prefixed_id (@namespace_prefixed_ids) { +# +# $logger->trace('Prefixed ID: ' . $prefixed_id); +# +# if ($prefixed_id =~ m/(([^\s\n\r:;|]+):)?([^\s\n\r:|]+)/i) { +# +# my $this_prefix = $2; +# my $this_id = $3; +# $logger->debug('Found ID: ' . $this_id); +# push(@header_ids, $this_id); +# +# # +# # For a selected list of "known" database namespace prefixes, +# # check for versioned accession numbers and - if found - add +# # both the versioned and the unversioned accession number +# # to the list of header IDs. +# # +# if (defined($this_prefix) && defined($databases_with_optionally_versioned_ids{$this_prefix})) { +# $logger->trace('Found prefix: ' . $this_prefix); +# my $separator = $databases_with_optionally_versioned_ids{$this_prefix}; +# $separator = '\\' . $separator; +# if ($this_id =~ m/([^$separator]+)$separator\d+$/) { +# my $this_unversioned_id = $1; +# $logger->debug('Found unversioned ID: ' . $this_unversioned_id); +# push(@header_ids, $this_unversioned_id); +# } +# } +# +# } else { +# +# $logger->warn( +# 'This should have been an optionally prefixed ID, ' +# . 'but failed to match the corresponding regex: ' +# . $prefixed_id); +# +# } +# } +# +# } elsif ($line =~ m/^>((([^\s\n\r:;|]+)[|;])*([^\s\n\r:;|]+))[|;]?(\s+(.+))?/i) { +# +# # +# # One or more unprefixed IDs. +# # +# my $concatenated_ids = $1; +# $logger->debug('Found unprefixed IDs in header: ' . $concatenated_ids); +# +# # id = $3 && $4; +# # description = $7; +# my @unprefixed_ids = split(/[|;]/, $concatenated_ids); +# +# foreach my $unprefixed_id (@unprefixed_ids) { +# +# $logger->debug('Found ID: ' . $unprefixed_id); +# push(@header_ids, $unprefixed_id); +# +# } +# +# } else { +# +# # +# # Something else. +# # +# # The FASTA header line can basically have any format, +# # so this is probably one of the less frequent occurring annotation schemes. +# # Therefore we try to see if the IDs we are looking for are present anywhere +# # in the header line up until the first white space or new line. +# # This may be tricky as we might match other annotation from other proteins +# # like a description from a 'best' BLAST hit that contains one of the IDs we +# # are looking for. Hence, in such a case this sequence is *not* the one of +# # the IDs we looking for, but similar to that one at best. +# # +# if ($line =~ m/>([^\n\r\s]+)/) { +# +# my $putative_id = $1; +# $logger->debug('Found putative ID in header: ' . $putative_id); +# push(@header_ids, $putative_id); +# +# } else { +# $logger->warn('Cannot identify IDs in this header: ' . $line); +# } +# } + } + } + + # + # Store the last found sequence in the lookup table. + # + foreach my $header_id (@header_ids) { + + # + # We don't know if the ID we found is an accession number (stable ID) or a normal (unstable) ID + # -> Fill both the AC and ID fields with the same ID value. + # + $seqs{$header_id}{'AC'} = $header_id; + $seqs{$header_id}{'ID'} = $header_id; + $seqs{$header_id}{'SQ'} = $seq; + + $logger->debug('Stored ' . $header_id . ':'); + $logger->debug("\t" . 'SQ ' . $seq); + + } + + # + # Reset vars. + # + $seq = ''; + @header_ids = (); + + } + + close($file_path_fh); + $logger->info('Created lookup list for database sequences.'); + + return (\%seqs, $index_id); + +} + +sub _ExtractSeqs { + + my ($db_lookup_table_refs, + $path_from, $strip_lc, $id_column, $peptide_column, + $cleavage_path_to, $miscleavage_path_to, $modification_path_to, $peptide_path_to, + $cleavage_aa, $cleavage_terminus, $miscleavage_distance, + $n_term_context_length, $c_term_context_length, + $padding_character) = @_; + + $logger->debug('Parsing ' . $path_from . '...'); + + my $extracted_seq_contexts_cle = 0; + my $extracted_seq_contexts_mis = 0; + my $extracted_seq_contexts_mod = 0; + my $extracted_seq_contexts_pep = 0; + my $path_from_fh; + my $cleavage_path_to_fh; + my $miscleavage_path_to_fh; + my $modification_path_to_fh; + my $peptide_path_to_fh; + + eval { + open($path_from_fh, "<$path_from"); + }; + if ($@) { + $logger->fatal('Cannot read from fragments input file: ' . $@); + exit; + } + if (defined($cleavage_path_to)) { + eval { + open($cleavage_path_to_fh, ">$cleavage_path_to"); + }; + if ($@) { + $logger->fatal('Cannot write to output file: ' . $@); + exit; + } + } + if (defined($miscleavage_path_to)) { + eval { + open($miscleavage_path_to_fh, ">$miscleavage_path_to"); + }; + if ($@) { + $logger->fatal('Cannot write to output file: ' . $@); + exit; + } + } + if (defined($modification_path_to)) { + eval { + open($modification_path_to_fh, ">$modification_path_to"); + }; + if ($@) { + $logger->fatal('Cannot write to output file: ' . $@); + exit; + } + } + if (defined($peptide_path_to)) { + eval { + open($peptide_path_to_fh, ">$peptide_path_to"); + }; + if ($@) { + $logger->fatal('Cannot write to output file: ' . $@); + exit; + } + } + + LINE:while (my $line = <$path_from_fh>) { + + # + # Remove (trailing) line end characters! + # + $line =~ s/[\n\r\f]+//g; + $logger->trace($line); + + my @column_values = split(/\t/, $line); + + # For debugging only. + #foreach my $val (@column_values) { + # $logger->debug('Col val: ' . $val); + #} + + my $id_column_index = $id_column - 1; + my $peptide_column_index = $peptide_column - 1; + my $unfiltered_key = $column_values[$id_column_index]; + my $key; + my @keys; + my $peptide = $column_values[$peptide_column_index]; + my $peptide_nonmod; + my $protein; + my $acc; + my $id; + my $index = -1; # An index < 0 indicates the peptide was not found in a protein sequence (yet). + my $length; + my $cleavage_peptide_contexts = []; + my $miscleavage_peptide_contexts = []; + my $modification_peptide_contexts = []; + my $peptide_contexts = []; + + # + # Check if $key and $peptide were present in this line. + # If not this may be a leading/trailing empty or meta-data line, + # that does not contain the correct amount of columns. + # + unless (defined($unfiltered_key) && defined($peptide)) { + $logger->warn('Skipping line: ' . $line); + $logger->warn("\t" . '(Peptide sequence and/or protein ID missing or incorrect amount of columns.)'); + next LINE; + } + + # + # Sanity check for peptide sequences. + # + unless ($peptide =~ m/([a-zA-Z]+)/) { + $logger->warn('Fragment file line in unexpected format. Cannot find peptide in: ' . $line); + $logger->debug('$peptide contains: ' . $peptide); + next LINE; + } + + if ($strip_lc) { + # Remove lower case characters representing modifications. + $peptide_nonmod = $peptide; + $peptide_nonmod =~ s/[a-z]+//g; + } else { + # Convert everything to uppercase. + $peptide_nonmod = uc($peptide); + } + + # + # Figure out in what format the Protein ID(s) were supplied. + # + # * Strip off optional DB namespace prefixes. + # * Handle optional accession number version suffixes. + # + if ($unfiltered_key =~ m/^((([^\s;|]+)[\s;|]+)*([^\s;|]+))[\s|;]?/i) { + + # + # Got multiple IDs + # + my $multiple_ids = $1; + @keys = split(/[\s;|]+/, $multiple_ids); + + } else { + + # + # Got a single ID + # + push(@keys, $unfiltered_key); + + } + + KEY:foreach $unfiltered_key (@keys) { + + $key = $unfiltered_key; + + # + # Check for RockerBox formattted IDs. + # + # Example of RockerBox ID format, which may contain + # * multiple IDs separated by semi-colons and + # * suffixed with the position of the peptide in the protein: + # YGR192C[123-142]; YJR009C[123-142] + if ($key =~ m/^([^\[]+)\[\d+-\d+\]$/i) { + $logger->trace('Protein ID in RockerBox format: ' . $key); + $key = $1; + $logger->trace("\t" . 'Using ID : ' . $key); + } + + # + # Check for DB namespace prefixes. + # + if ($key =~ m/([^:]+):([^:]+)/i) { + + # + # Namespace prefixed ID. + # + $logger->trace('Namespace prefixed ID: ' . $key); + + my $prefix = $1; + $key = $2; + + $logger->trace("\t" . 'Using ID: ' . $key); + + # + # For a selected list of "known" database namespace prefixes, + # check for versioned accession numbers and remove the version number. + # + if (defined($prefix) && defined($databases_with_optionally_versioned_ids{$prefix})) { + $logger->trace('Found prefix: ' . $prefix); + my $separator = $databases_with_optionally_versioned_ids{$prefix}; + $separator = '\\' . $separator; + if ($key =~ m/([^$separator]+)$separator\d+$/) { + $key = $1; + } + } + } + + $logger->debug('Found ID: ' . $key); + + # + # Sanity check for Protein ID. + # + unless ($key =~ m/^([A-Z0-9\._-]+)$/i) { + $logger->warn('Fragment file line in unexpected format. Cannot find sequence ID in:'); + $logger->warn("\t" . $line); + $logger->debug('$key contains: ' . $key); + next LINE; + } + + # + # Lookup info for the protein this peptide was derived from. + # + ($protein, $acc, $id, $index) = _LookupProteinInfo($key, $peptide_nonmod, $db_lookup_table_refs); + + # + # Check if the peptide (fragment) was found in this protein sequence. + # + if (defined($protein)) { + if ($index > -1) { + $logger->trace('Found peptide sequence in protein ' . $key); + last KEY; + } else { + $logger->warn('Cannot find peptide (' . $peptide_nonmod . ') in protein ' . $key); + $logger->warn("\t" . 'Will try other protein IDs for this peptide if available.'); + } + } else { + $logger->warn('Cannot find protein identifier ' . $key . ' in any of the supplied databases.'); + $logger->warn("\t" . 'Will try other protein IDs for this peptide if available.'); + } + } + + # + # Check if the peptide (fragment) was found in any of the associated protein sequences. + # + if (defined($protein)) { + + $logger->debug('Found protein ' . $key); + + if ($index > -1) { + + $logger->debug('Found peptide (' . $peptide_nonmod . ') in protein ' . $key); + + if (defined($cleavage_path_to)) { + ($cleavage_peptide_contexts) = _GetCleavageSequenceContexts($peptide_nonmod, $protein, $key, $index, + $cleavage_aa, $cleavage_terminus); + } + if (defined($miscleavage_path_to)) { + ($miscleavage_peptide_contexts) = _GetMiscleavageSequenceContexts($peptide_nonmod, $protein, $key, $index, + $cleavage_aa, $cleavage_terminus, $miscleavage_distance); + } + if (defined($modification_path_to)) { + ($modification_peptide_contexts) = _GetModificationSequenceContexts($peptide_nonmod, $protein, $key, $index, + $modified_aa, $peptide); + } + if (defined($peptide_path_to)) { + ($peptide_contexts) = _GetPeptideSequenceContexts($peptide_nonmod, $protein, $key, $index); + } + } else { + $logger->error('Cannot find peptide (' . $peptide_nonmod . ') in protein ' . $key); + $acc = 'NA'; + } + } else { + $logger->error('Cannot find protein "' . $unfiltered_key . '" in any of the supplied databases.'); + $acc = 'NA'; + } + + # + # Append the peptide contexts to the existing lines of the fragment file and save that new line to the output file. + # + $extracted_seq_contexts_cle = _SaveSequenceContexts($line, $acc, $cleavage_peptide_contexts, + $cleavage_path_to_fh, $extracted_seq_contexts_cle); + + $extracted_seq_contexts_mis = _SaveSequenceContexts($line, $acc, $miscleavage_peptide_contexts, + $miscleavage_path_to_fh, $extracted_seq_contexts_mis); + + $extracted_seq_contexts_mod = _SaveSequenceContexts($line, $acc, $modification_peptide_contexts, + $modification_path_to_fh, $extracted_seq_contexts_mod); + + $extracted_seq_contexts_pep = _SaveSequenceContexts($line, $acc, $peptide_contexts, + $peptide_path_to_fh, $extracted_seq_contexts_pep); + + } + + # + # Close file handles. + # + foreach my $fh ($path_from_fh, $cleavage_path_to_fh, $miscleavage_path_to_fh, $modification_path_to_fh, $peptide_path_to_fh) { + if (defined($fh)) { + close($fh); + } + } + + return($extracted_seq_contexts_cle, $extracted_seq_contexts_mis, $extracted_seq_contexts_mod, $extracted_seq_contexts_pep); + +} + +sub _LookupProteinInfo { + + my ($key, $peptide, $db_lookup_table_refs) = @_; + + my $protein_found_in_db = 0; + my $protein; + my $acc; + my $id; + my $index; + + DB_LOOKUP:foreach my $db_lookup_table_ref (@{$db_lookup_table_refs}) { + + if (defined(${$db_lookup_table_ref}{$key})) { + + $protein_found_in_db = 1; + $logger->debug('Found protein ' . $key . ' in this sequence DB.'); + + $protein = ${$db_lookup_table_ref}{$key}{'SQ'}; + $acc = ${$db_lookup_table_ref}{$key}{'AC'}; + $id = ${$db_lookup_table_ref}{$key}{'ID'}; + # + # Find peptide (fragment) in DB protein sequence. + # + $index = index($protein, $peptide); + + if ($index > -1) { + + # + # Found peptide! + # + $logger->debug('Found peptide ' . $peptide . ' at position ' . $index . ' in protein ' . $key); + last DB_LOOKUP; + + } + + # + # If the input was based on mass spectrometry data we cannot see + # the difference between I (Isolucene) and L (Leucine) as the both + # weigh the same. Let's try an I agnostic search by substituting all + # Is for Ls. + # + my $protein_l_only = $protein; + my $peptide_l_only = $peptide; + $protein_l_only =~ s/I/L/g; + $peptide_l_only =~ s/I/L/g; + $index = index($protein_l_only, $peptide_l_only); + + if ($index > -1) { + + # + # Found peptide! + # + $logger->debug('Found peptide ' . $peptide . ' at position ' . $index . ' in protein ' . $key); + $logger->debug("\t" . 'Had to treat I and L as the same amino acid in the DB search to get a match.'); + last DB_LOOKUP; + + } + + # + # Cannot find peptide :( + # + $protein = undef; + $acc = undef; + $id = undef; + + $logger->debug('Cannot find peptide (' . $peptide . ') in protein ' . $key); + $logger->debug("\t" . 'This may be the result of an updated protein sequence in the database.'); + $logger->debug("\t" . 'Will try to find the peptide in other databases (if available).'); + + if ($strip_lc) { + $logger->debug("\t" . '(Stripping of lowercase characters, indicating modifications in the peptide sequence, was enabled.)'); + } else { + $logger->debug("\t" . '(Stripping of lowercase characters, indicating modifications in the peptide sequence, was *not* enabled.)'); + } + + } else { + $logger->debug('Protein ' . $key . ' missing from this sequence DB. Will try other databases (if available).'); + } + } + + # + # Check if the peptide (fragment) was found in a DB protein sequence. + # + if ($protein_found_in_db == 1) { + unless ($index > -1) { + $logger->warn('Cannot find peptide (' . $peptide . ') in protein ' . $key); + $logger->warn("\t" . 'Will try to find this peptide (' . $peptide . ') using other associated protein identifiers (if available).'); + } + } else { + $logger->warn('Cannot find protein ' . $key . ' in any of the supplied databases.'); + } + + return($protein, $acc, $id, $index); +} + +sub _GetCleavageSequenceContexts { + + my ($peptide, $protein, $key, $index, $cleavage_aa, $cleavage_terminus) = @_; + + my %peptide_contexts; + $peptide_contexts{'n_term'}{'context'} = ''; + $peptide_contexts{'c_term'}{'context'} = ''; + my $max_context_length = $n_term_context_length + $c_term_context_length; + + foreach my $peptide_term (keys(%peptide_contexts)) { + + # + # Get the offset for the sequence contexts to retrieve from this peptide terminus + # or drop the context for this peptide terminus if it is also the protein terminus. + # + if ($peptide_term eq 'n_term') { + if ($index > 0) { + # This is an N-terminal peptide context. + my $offset = $index - $n_term_context_length; + $peptide_contexts{$peptide_term}{'offset'} = $offset; + $logger->debug('Offset = ' . $offset . ' for ' . $peptide_term . ' of peptide ' . $peptide . ' in protein ' . $key . '.'); + } else { + # Peptide N-terminus == Protein N-terminus. + $logger->warn('Peptide N-terminus = protein N-terminus. Dropping N-terminal sequence context for peptide ' . $peptide . + ' in protein ' . $key . '.'); + next; + } + } elsif ($peptide_term eq 'c_term') { + if (($index + length($peptide)) < length($protein)) { + # This is a C-terminal peptide context. + my $offset = ($index + length($peptide))- $n_term_context_length; + $peptide_contexts{$peptide_term}{'offset'} = $offset; + $logger->debug('Offset = ' . $offset . ' for ' . $peptide_term . ' of peptide ' . $peptide . ' in protein ' . $key . '.'); + } else { + # Peptide C-terminus == Protein C-terminus. + $logger->warn('Peptide C-terminus = protein C-terminus. Dropping C-terminal sequence context for peptide ' . $peptide . + ' in protein ' . $key . '.'); + next; + } + } + + # + # Check if DB protein sequence is long enough on n-terminal side + # to retrieve full length peptide contexts. + # (c-term length is checked later...) + # + if ($peptide_contexts{$peptide_term}{'offset'} < 0) { + + # + # Protein too short for full length context: Calculate offset adjustment. + # + my $offset = $peptide_contexts{$peptide_term}{'offset'}; + my $offset_adjustment = -($offset); + $peptide_contexts{$peptide_term}{'offset_adjustment'} = $offset_adjustment; + $logger->debug('Offset ' . $offset . ' for ' . $peptide_term . ' of peptide ' . $peptide . ' in protein ' . $key . ' less than zero'); + + if (length($padding_character) == 1) { + + # + # Add N-terminal padding characters for contexts that are too short on the N-terminal side. + # + my $n_term_padding = "$padding_character" x $offset_adjustment; + $peptide_contexts{$peptide_term}{'context'} = $n_term_padding; + $logger->debug("\t" . 'Will add N-terminal padding: ' . $n_term_padding); + + } else { + $logger->debug('Padding was disabled. Will not add n-term padding.'); + } + + $peptide_contexts{$peptide_term}{'offset'} = 0; + + } else { + + $peptide_contexts{$peptide_term}{'offset_adjustment'} = 0; + + } + + # + # Extract sequence context from DB protein sequence. + # + my $peptide_context = substr($protein, $peptide_contexts{$peptide_term}{'offset'}, + $max_context_length - $peptide_contexts{$peptide_term}{'offset_adjustment'}); + # append the context extracted from the protein sequence, + # because there may be already some N-terminal padding... + $peptide_contexts{$peptide_term}{'context'} .= $peptide_context; + + # + # Quality Control: Check enzyme specificity for enzymes with known (= user supplied) specs. + # + unless ($cleavage_aa eq '*') { + + $peptide_context = $peptide_contexts{$peptide_term}{'context'}; + my $p1 = substr($peptide_context, ($n_term_context_length -1), 1); + my $p1_prime = substr($peptide_context, $n_term_context_length, 1); + + if ($cleavage_terminus eq 'C') { + + if ($p1 eq $cleavage_aa) { + # Peptide fragment contains the AA recognised by the protease. + $logger->debug('Specific protease activity -> Retaining peptide context.'); + } else { + # Must be non-specific cleavage -> drop peptide context. + $logger->warn('Non-specific cleavage: dropping ' . $peptide_term . ' peptide context ' . $peptide_context . + ' for peptide ' . $peptide . ' in protein ' . $key); + $peptide_contexts{$peptide_term}{'context'} = ''; + } + + } elsif ($cleavage_terminus eq 'N') { + + if ($p1_prime eq $cleavage_aa) { + # Peptide fragment contains the AA recognised by the protease. + $logger->debug('Specific protease activity -> Retaining peptide context.'); + } else { + # Must be non-specific cleavage -> drop peptide context. + $logger->warn('Non-specific cleavage: dropping ' . $peptide_term . ' peptide context ' . $peptide_context . + ' for peptide ' . $peptide . ' in protein ' . $key); + $peptide_contexts{$peptide_term}{'context'} = ''; + } + + } elsif ($cleavage_terminus eq '*') { + + if (($p1 eq $cleavage_aa) || ($p1_prime eq $cleavage_aa)) { + # Peptide fragment contains the AA recognised by the protease. + $logger->debug('Specific protease activity -> Retaining peptide context.'); + } else { + # Must be non-specific cleavage -> drop peptide context. + $logger->warn('Non-specific cleavage: dropping ' . $peptide_term . ' peptide context ' . $peptide_context . + ' for peptide ' . $peptide . ' in protein ' . $key); + $peptide_contexts{$peptide_term}{'context'} = ''; + } + + } else { + $logger->fatal('Unknown cleavage terminus ' . $cleavage_terminus . '. Must be C, N or *.'); + exit; + } + } + } + + push(my @contexts, $peptide_contexts{'n_term'}{'context'}, $peptide_contexts{'c_term'}{'context'}); + + if (length($padding_character) == 1) { + _CheckSequenceContextLength(\@contexts, $peptide, $key, $max_context_length); + } else { + $logger->debug('Padding was disabled. Will not add c-term padding nor check sequence context length.'); + } + + return(\@contexts); + +} + +sub _GetMiscleavageSequenceContexts { + + my ($peptide, $protein, $key, $petide_index, $cleavage_aa, $cleavage_terminus, $miscleavage_distance) = @_; + + my @peptide_contexts; + my $max_context_length = $n_term_context_length + $c_term_context_length; + + # + # Find miscleavage sites in peptide. + # + # TODO: handle --mcd + # + my @amino_acids = split('', $peptide); + my $aa_index = 0; + # + # Drop first and last amino acids as these are from cleavage sites or from the protein termini. + # In either case they cannot represent a miscleavage site. + # + shift(@amino_acids); + pop(@amino_acids); + + foreach my $aa (@amino_acids) { + + $aa_index++; + + if ($aa eq $cleavage_aa) { + + # + # We found a miscleavage site. + # + $logger->debug('Found miscleavage site in peptide ' . $peptide . ' of protein ' . $key . '.'); + # + # Get the offset for the sequence context. + # + my $offset = $petide_index + $aa_index - $n_term_context_length; + $logger->debug("\t" . 'Offset = ' . $offset . '.'); + my $n_term_padding = ''; + my $offset_adjustment = 0; + + if ($cleavage_terminus eq 'C') { + + # + # Must increment the offset with 1 if the protease cuts C-terminal of a recognition site. + # + $offset++; + + } + + # + # Check if DB protein sequence is long enough on n-terminal side + # to retrieve full length peptide contexts. + # (c-term length is checked elsewhere.) + # + if ($offset < 0) { + + # + # Protein too short for full length context: Calculate offset adjustment. + # + $offset_adjustment = -($offset); + $logger->debug('Offset ' . $offset . ' for miscleavage context in peptide ' . $peptide . ' in protein ' . $key . ' less than zero'); + + if (length($padding_character) == 1) { + + # + # Add N-terminal padding characters for contexts that are too short on the N-terminal side. + # + $n_term_padding = "$padding_character" x $offset_adjustment; + $logger->debug("\t" . 'Will add N-terminal padding: ' . $n_term_padding); + + } else { + $logger->debug('Padding was disabled. Will not add n-term padding.'); + } + + # + # Reset offset for this context to protein start. + # + $offset = 0; + + } + + # + # Extract sequence context from DB protein sequence. + # + my $peptide_context = substr($protein, $offset, $max_context_length - $offset_adjustment); + $peptide_context = $n_term_padding . $peptide_context; + $logger->debug('Found miscleavage context: ' . $peptide_context . '.'); + + push(@peptide_contexts, $peptide_context); + } + } + + if (length($padding_character) == 1) { + _CheckSequenceContextLength(\@peptide_contexts, $peptide, $key, $max_context_length); + } else { + $logger->debug('Padding was disabled. Will not add c-term padding nor check sequence context length.'); + } + + return(\@peptide_contexts); + +} + +sub _GetModificationSequenceContexts { + + my ($peptide, $protein, $key, $petide_index, $modified_aa, $modified_peptide) = @_; + + my @peptide_contexts; + my $modified_aa_index_adjustment; + my @modified_aa_in_peptide_indices; + my $modified_aa_in_protein_index; + my $max_context_length = $n_term_context_length + 1 + $c_term_context_length; + #my $offset = 0; + my $modified_aa_in_modified_peptide_index; + my $modified_peptide_remainder; + my $processed_peptide_length = 0; + + # + # Find amino acid in the string identifying both the modified amino acid and the modification. + # + if ($modified_aa =~ m/^([a-z]+)[A-Z*]$/) { + + $modified_aa_index_adjustment = length($1); + + } elsif ($modified_aa =~ m/^[A-Z*][a-z]$/) { + + $modified_aa_index_adjustment = 0; + + } else { + + $logger->fatal('Modified amino acid was specified in an unsupported format: ' . $modified_aa . '.'); + exit; + + } + + # + # Find modified sites in peptide. + # + my $modified_aa_for_regex = $modified_aa; + $modified_aa_for_regex =~ s/\*/[A-Z]/; + $logger->debug('modified_aa for search: ' . $modified_aa_for_regex); + if ($modified_peptide =~ m/$modified_aa_for_regex/) { + $logger->debug("\t" . 'Peptide contains mods.'); + $modified_aa_in_modified_peptide_index = $-[0]; + $modified_peptide_remainder = $'; + my $modified_peptide_prefix = $`; + $logger->trace("\t\t" . 'Processed pep length = ' . $processed_peptide_length); + $processed_peptide_length += length($modified_peptide_prefix); + $logger->trace("\t\t" . 'Processed pep length = ' . $processed_peptide_length); + $processed_peptide_length += length($modified_aa); + $logger->trace("\t\t" . 'Processed pep length = ' . $processed_peptide_length); + $logger->trace("\t\t" . 'Index for mod :' . $modified_aa_in_modified_peptide_index); + } else { + $logger->debug('No modified amino acids found in this peptide.'); + } + + #my $modified_aa_in_modified_peptide_index = index($modified_peptide, $modified_aa); + + while (defined($modified_aa_in_modified_peptide_index)) { + + $modified_aa_in_modified_peptide_index += $modified_aa_index_adjustment; + $logger->debug('Found modified amino acid at position ' . ($modified_aa_in_modified_peptide_index + 1) . ' in modified peptide ' . $modified_peptide . ' of protein ' . $key . '.'); + + # + # Count all modifications of any type preceeding the found modified amino acid. + # + my $modified_n_term = substr($modified_peptide, 0, $modified_aa_in_modified_peptide_index); + my $n_term = $modified_n_term; + $n_term =~ s/[a-z]+//g; + my $mod_count = length($modified_n_term) - length($n_term); + + if ($mod_count >= 0) { + $logger->debug(' counted modifications preceeding amino acid = ' . $mod_count); + } else { + $logger->fatal(' counted modifications preceeding amino acid is negative: ' . $mod_count); + exit; + } + + # + # We need to substract the amount of mods preceeding the identified amino acid + # from the index to get the position of the amino acid in the unmodified peptide. + # + my $modified_aa_in_peptide_index = $modified_aa_in_modified_peptide_index - $mod_count; + $logger->debug(' and at position ' . ($modified_aa_in_peptide_index + 1) . ' in peptide ' . $peptide . ' of protein ' . $key . '.'); + + push(@modified_aa_in_peptide_indices, $modified_aa_in_peptide_index); + + # + # Search for additional mods in the modified peptide. + # + #$offset = $modified_aa_in_modified_peptide_index + 1; + #$modified_aa_in_modified_peptide_index = index($modified_peptide, $modified_aa, $offset); + if ($modified_peptide_remainder =~ m/$modified_aa_for_regex/) { + $logger->debug("\t" . 'Peptide contains more mods.'); + $modified_peptide_remainder = $'; + my $modified_peptide_prefix = $`; + $logger->trace("\t\t" . 'Processed pep length = ' . $processed_peptide_length); + $modified_aa_in_modified_peptide_index = $-[0] + $processed_peptide_length; + $logger->trace("\t\t" . 'Index for mod :' . $modified_aa_in_modified_peptide_index); + $processed_peptide_length += length($modified_peptide_prefix); + $logger->trace("\t\t" . 'Processed pep length = ' . $processed_peptide_length); + $processed_peptide_length += length($modified_aa); + $logger->trace("\t\t" . 'Processed pep length = ' . $processed_peptide_length); + } else { + $modified_aa_in_modified_peptide_index = undef(); + $logger->debug('No more modified amino acids found in this peptide.'); + } + } + + # + # Retrieve sequence contexts for modified stites from protein sequence + # + foreach my $modified_aa_in_peptide_index (@modified_aa_in_peptide_indices) { + + # + # Get the offset for the sequence context. + # + my $offset = $petide_index + $modified_aa_in_peptide_index - $n_term_context_length; + $logger->debug("\t" . 'Offset = ' . $offset . '.'); + my $n_term_padding = ''; + my $offset_adjustment = 0; + + # + # Check if DB protein sequence is long enough on n-terminal side + # to retrieve full length peptide contexts. + # (c-term length is checked elsewhere.) + # + if ($offset < 0) { + + # + # Protein too short for full length context: Calculate offset adjustment. + # + $offset_adjustment = -($offset); + $logger->debug('Offset ' . $offset . ' for modification context in peptide ' . $peptide . ' in protein ' . $key . ' less than zero'); + + if (length($padding_character) == 1) { + + # + # Add N-terminal padding characters for contexts that are too short on the N-terminal side. + # + $n_term_padding = "$padding_character" x $offset_adjustment; + $logger->debug("\t" . 'Will add N-terminal padding: ' . $n_term_padding); + + } else { + $logger->debug('Padding was disabled. Will not add n-term padding.'); + } + + # + # Reset offset for this context to protein start. + # + $offset = 0; + + } + + # + # Extract sequence context from DB protein sequence. + # + my $peptide_context = substr($protein, $offset, $max_context_length - $offset_adjustment); + $peptide_context = $n_term_padding . $peptide_context; + $logger->debug('Found modification context: ' . $peptide_context . '.'); + + push(@peptide_contexts, $peptide_context); + + } + + if (length($padding_character) == 1) { + _CheckSequenceContextLength(\@peptide_contexts, $peptide, $key, $max_context_length); + } else { + $logger->debug('Padding was disabled. Will not add c-term padding nor check sequence context length.'); + } + + return(\@peptide_contexts); + +} + +# +# TODO: pep contexts. +# +sub _GetPeptideSequenceContexts { + + my ($peptide, $protein, $key, $peptide_index) = @_; + + my @peptide_contexts; + my $max_context_length = $n_term_context_length + length($peptide) + $c_term_context_length; + + # + # Get the offset for the peptide sequence context. + # + my $offset = $peptide_index - $n_term_context_length; + $logger->debug("\t" . 'Offset = ' . $offset . '.'); + my $n_term_padding = ''; + my $offset_adjustment = 0; + + # + # Check if DB protein sequence is long enough on n-terminal side + # to retrieve full length peptide contexts. + # (c-term length is checked elsewhere.) + # + if ($offset < 0) { + + # + # Protein too short for full length context: Calculate offset adjustment. + # + $offset_adjustment = -($offset); + $logger->debug('Offset ' . $offset . ' for peptide context of peptide ' . $peptide . ' in protein ' . $key . ' less than zero'); + + if (length($padding_character) == 1) { + + # + # Add N-terminal padding characters for contexts that are too short on the N-terminal side. + # + $n_term_padding = "$padding_character" x $offset_adjustment; + $logger->debug("\t" . 'Will add N-terminal padding: ' . $n_term_padding); + + } else { + $logger->debug('Padding was disabled. Will not add n-term padding.'); + } + + # + # Reset offset for this context to protein start. + # + $offset = 0; + + } + + # + # Extract sequence context from DB protein sequence. + # + my $peptide_context = substr($protein, $offset, $max_context_length - $offset_adjustment); + $peptide_context = $n_term_padding . $peptide_context; + $logger->debug('Found modification context: ' . $peptide_context . '.'); + + push(@peptide_contexts, $peptide_context); + + if (length($padding_character) == 1) { + _CheckSequenceContextLength(\@peptide_contexts, $peptide, $key, $max_context_length); + } else { + $logger->debug('Padding was disabled. Will not add c-term padding nor check sequence context length.'); + } + + return(\@peptide_contexts); + +} + +sub _CheckSequenceContextLength { + + my ($contexts, $peptide, $key, $max_context_length) = @_; + + # + # If padding was enabled for sequence contexts < max sequence context length. + # + foreach my $peptide_context (@{$contexts}) { + + my $peptide_context_length = length($peptide_context); + + if ($peptide_context_length < $max_context_length) { + + # + # sequence context < max sequence context length -> append c-term padding. + # (n-term padding was already appended if necessary during sequence context lookup with _GetSequenceContext().) + # + my $c_term_padding = "$padding_character" x ($max_context_length - $peptide_context_length); + $logger->debug('Length of peptide context (' . $peptide_context_length . ') < max context length.'); + $logger->debug("\t" . 'Will add C-terminal padding: ' . $c_term_padding); + $peptide_context = $peptide_context . $c_term_padding; + + } + + # + # Sanity check for padded sequence contexts. + # + # (Checks if the combined effect of n-term and c-term padding worked well.) + # + $peptide_context_length = length($peptide_context); + unless ($peptide_context_length == $max_context_length) { + $logger->fatal('Length of peptide context too short or too long for peptide ' . $peptide . ' in protein ' . $key); + $logger->fatal("\t" . 'Context length is ' . $peptide_context_length . ', but should be ' . $max_context_length . '.'); + exit; + } + } + + # + # No need to return the modified (c-terminal-padded) contexts as we worked with a arrayref. + # + +} + +sub _SaveSequenceContexts { + + my ($line, $acc, $contexts, $path_to_fh, $extracted_seq_contexts) = @_; + + # + # Append the peptide contexts to the existing line of the fragment file and + # save that new line to the output file. + # By appending we keep all info from the original fragment file intact. + # + foreach my $context (@{$contexts}) { + + # + # Skip "empty"/"missing" contexts. + # May be the result of: + # 1. The peptide terminus being also the protein terminus. + # 2. Proteins missing from the database. + # 3. ... + # + if ($padding_character ne '') { + if ($context =~ m/^($padding_character)+$/) { + next; + } + } else { + if ($context eq '') { + next; + } + } + + my $new_line = $line; + + # 1. Append accession numbers if the fragment file used IDs. + if ($index_id eq 'ID') { + $new_line .= "\t" . $acc; + } + + # 2. Append sequence context. + $new_line .= "\t" . $context; + # 3. Append new line character. + $new_line .= "\n"; + # Save result to disk. + print($path_to_fh $new_line); + $extracted_seq_contexts++; + + } + + return($extracted_seq_contexts); + +} + +sub _Usage { + + + print STDOUT "\n" . + 'ExtractPeptideSequenceContext.pl: Extract sequence contexts of cleavage, miscleavage or modification sites ' . "\n" . + ' based on sequence fragments (like peptides experimentally identified with MSMS) ' . "\n" . + ' and protein sequences from a database in FASTA or UniProtKB file format.' . "\n" . + "\n" . + 'Usage:' . "\n" . + "\n" . + ' ExtractPeptideSequenceContext.pl options' . "\n" . + "\n" . + 'Available options are:' . "\n" . + "\n" . + ' --db [file] Input file containing all database sequences in either UniProtKB *.dat or *.fasta format.' . "\n" . + ' --dba [file] Optional alternative / backup input file containing database sequences in either UniProtKB *.dat or *.fasta format.' . "\n" . + ' This may be a slightly older or newer version of the database, which will be searched in case a ' . "\n" . + ' sequence ID was not found in the primary database file that was specified with --db [file].' . "\n" . + ' --dbf [format] Optional argument to specify the database sequence file format. Valid format specifications are:' . "\n" . + ' * \'UniProtKB DAT\' for UniProtKB DAT files.' . "\n" . + ' * \'FASTA\' for FASTA files.' . "\n" . + ' Only required if the database files do *not* have the standard file extensions: ' . "\n" . + ' *.dat for UniProtKB DAT files or ' . "\n" . + ' *.fa or *.fasta for FASTA files.' . "\n" . + ' --k [ID|AC] Unique identifier type used as key to index an UniProtKB *.dat database file.' . "\n" . + ' Must be either AC for a UniProt accession number (default) or ID for a UniProt identifier.' . "\n" . + ' Not required for *.fasta databases: This tool will look for any type of ID in the first part of FASTA ' . "\n" . + ' sequence headers up until the first white space.' . "\n" . + ' Optionally multiple IDs may be present separated with pipe symbols (|) or semicolons (;).' . "\n" . + ' Optionally IDs may be prefixed with a database namespace and a colon (:).' . "\n" . + ' For example the accession number P32234 as well as the ID 128UP_DROME would be recognised in both ' . "\n" . + ' this sequence header:' . "\n" . + ' >UniProtAcc:P32234|UniProtID:128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)' . "\n" . + ' and in this one:' . "\n" . + ' >P32234|128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)' . "\n" . + ' --f [file] Fragment file containing accession numbers or IDs and sequence fragments (peptides), ' . "\n" . + ' whose sequence context should be extracted from the database. This file' . "\n" . + ' * Must be tab delimited with one accession/ID plus one sequence fragment per line' . "\n" . + ' * Must contain accession numbers / IDs in the same format as the database file used.' . "\n" . + ' Exception: Optionally IDs may be suffixed with the peptide\'s position in its protein between brackets.' . "\n" . + ' For example: CLH1_HUMAN[1612-1620].' . "\n" . + ' * May contain other columns, which will be preserved in the output.' . "\n" . + ' * Is the basis for the output: if a sequence context was found, ' . "\n" . + ' it will be appended in another column to the right of the existing columns.' . "\n" . + ' --icol [int] Column in the tab delimited fragment file containing the protein identifiers / accession numbers.' . "\n" . + ' Default = 1.' . "\n" . + ' --pcol [int] Column in the tab delimited fragment file containing the peptide sequences.' . "\n" . + ' Default = 2.' . "\n" . + ' --s Strip lowercase characters from the sequence fragments in the fragment file before doing a database lookup.' . "\n" . + ' Amino acids are expected to be in uppercase. If not, the sequences are converted to uppercase ' . "\n" . + ' before searching for the fragment in the database. When lowercase characters represent ' . "\n" . + ' modifications instead of amino acids these need to be removed before searching the database.' . "\n" . + ' Default = 0 (disabled) except when a modified amino acid is specified with --ma, which will automatically enable --s.' . "\n" . + ' --cleo [file] Optional output file where the retrieved cleavage site sequence contexts will be saved.' . "\n" . + ' --miso [file] Optional output file where the retrieved miscleavage site sequence contexts will be saved.' . "\n" . + ' --modo [file] Optional output file where the retrieved modification site sequence contexts will be saved.' . "\n" . + ' --pepo [file] Optional output file where the retrieved peptide sequence contexts will be saved.' . "\n" . + ' (At least one output file must be specified with --cleo, --miso, --modo or --pepo).' . "\n" . + ' --ca [A-Z] Cleavage amino acid. Assume the sequence fragments were derived from cutting with a proteolytic enzyme, ' . "\n" . + ' that recognizes this amino acid as the position to cut.' . "\n" . + ' When the specificity of the protease used to generate the peptides in the fragments file is unknown,' . "\n" . + ' you may provide an asterisk (*) to retrieve sequence context for any cleavage,' . "\n" . + ' but in that case this tool will not filter non-specifically cleaved fragments,' . "\n" . + ' that may be the result of other processes than protease activity.' . "\n" . + ' --ct [C|N] Cleavage terminus. Assume the sequence fragments were derived from cutting with a proteolytic enzyme, ' . "\n" . + ' that cuts at the C or N terminus of the amino acid specified with the --ca [A-Z] parameter.' . "\n" . + ' When the specificity of the protease used to generate the peptides in the fragments file is unknown,' . "\n" . + ' you may provide an asterisk (*) to retrieve sequence context for any cleavage,' . "\n" . + ' but in that case this tool will not filter non-specifically cleaved fragments,' . "\n" . + ' that may be the result of other processes than protease activity.' . "\n" . + ' --ma [A-Za-z] Modified amino acid.' . "\n" . + ' The amino acid must be specified in uppercase and the modification in lower case.' . "\n" . + ' The order is not important.' . "\n" . + ' Hence a phophorylated serine in the fragments file can be indicated with either pS or Sp, ' . "\n" . + ' but you cannot mix both pS and Sp in a single fragments file.' . "\n" . + ' You may provide an asterisk (*) instead of an upper case amino acid to retrieve sequence contexts ' . "\n" . + ' for the specified modification no matter what amino acid it was located on.' . "\n" . + ' A modification may be specified with more than one lower case character, ' . "\n" . + ' so for example phosphoS or Sphospho can also be used for a phosphorylated serine.' . "\n" . + ' --n [int] Integer specifying the length of the N-terminal sequence context to retrieve starting from the ' . "\n" . + ' cleavage, miscleavage or modification site. Default = 5.' . "\n" . + ' --c [int] Integer specifying the length of the C-terminal sequence context to retrieve starting from the ' . "\n" . + ' cleavage, miscleavage or modification site. Default = 5.' . "\n" . + ' Please note that cleavage and miscleavage sites have zero width, ' . "\n" . + ' while modified amino acids will increment the sequence context lenght with 1.' . "\n" . + ' When defaults are used for both the N-terminal and C-terminal sequence context lengths,' . "\n" . + ' the total sequence context length for a' . "\n" . + ' * cleavage or miscleavage site will be:' . "\n" . + ' (N-terminal sequence context) + (C-terminal sequence context) = 5 + 5 = 10.' . "\n" . + ' * modification site will be:' . "\n" . + ' (N-terminal sequence context) + modified AA + (C-terminal sequence context) = 5 + 1 + 5 = 11.' . "\n" . +# ' --mcd [int] Minimal distance between a putative miscleavage site and the peptide termini.' . "\n" . +# ' Uncleaved amino acids closer to the peptide termini can be ignored with this parameter:' . "\n" . +# ' A. To prevent overlap between cleavage and miscleavage peptide sequence contexts.' . "\n" . +# ' B. To prevent false positive miscleavage sites due to sites that cannot be cleaved, ' . "\n" . +# ' because one of the resulting fragments is too short. (Sometimes proteases may need ' . "\n" . +# ' a minimal length surrounding the cleavage site to bind and cleave a peptide.)' . "\n" . +# ' Default = same as the longest *-terminal context length. Hence [--n [int]|--c [int]] (see above).' . "\n" . + ' --pc [char] Optional padding character to fill N-terminal or C-terminal positions in the sequence context, ' . "\n" . + ' when the protein was too short to get a complete sequence context.' . "\n" . + ' Default = - (a.k.a. dash or the alignment gap character.)' . "\n" . + ' To disable padding specify an empty string like this --pc \'\'.' . "\n" . + ' --ll [LEVEL] Log4perl log level. One of: ALL, TRACE, DEBUG, INFO (default), WARN, ERROR, FATAL or OFF.' . "\n" . + "\n"; + exit; + +}