nbic_fasta: ExtractPeptideSequenceContext.pl annotate

annotate ExtractPeptideSequenceContext.pl @ 0:163892325845 draft default tip

Initial commit.

author	galaxyp
date	Fri, 10 May 2013 17:15:08 -0400
parents
children

rev	line source
0 163892325845 Initial commit. galaxyp parents: diff changeset	1 #!/usr/bin/perl -w
163892325845 Initial commit. galaxyp parents: diff changeset	2 #
163892325845 Initial commit. galaxyp parents: diff changeset	3 # This script extracts subsequences from UniProtKB .dat files or from .fasta files
163892325845 Initial commit. galaxyp parents: diff changeset	4 # based on a list of accession numbers / IDs and a sequence fragment (peptide).
163892325845 Initial commit. galaxyp parents: diff changeset	5 #
163892325845 Initial commit. galaxyp parents: diff changeset	6 # =====================================================
163892325845 Initial commit. galaxyp parents: diff changeset	7 # $Id: ExtractPeptideSequenceContext.pl 112 2011-03-01 19:50:23Z pieter.neerincx@gmail.com $
163892325845 Initial commit. galaxyp parents: diff changeset	8 # $HeadURL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/ExtractPeptideSequenceContext.pl $
163892325845 Initial commit. galaxyp parents: diff changeset	9 # $LastChangedDate: 2011-03-01 13:50:23 -0600 (Tue, 01 Mar 2011) $
163892325845 Initial commit. galaxyp parents: diff changeset	10 # $LastChangedRevision: 112 $
163892325845 Initial commit. galaxyp parents: diff changeset	11 # $LastChangedBy: pieter.neerincx@gmail.com $
163892325845 Initial commit. galaxyp parents: diff changeset	12 # =====================================================
163892325845 Initial commit. galaxyp parents: diff changeset	13 #
163892325845 Initial commit. galaxyp parents: diff changeset	14
163892325845 Initial commit. galaxyp parents: diff changeset	15 #
163892325845 Initial commit. galaxyp parents: diff changeset	16 # initialise environment
163892325845 Initial commit. galaxyp parents: diff changeset	17 #
163892325845 Initial commit. galaxyp parents: diff changeset	18 use strict;
163892325845 Initial commit. galaxyp parents: diff changeset	19 use Getopt::Long;
163892325845 Initial commit. galaxyp parents: diff changeset	20 use SWISS::Entry; # For parsing UniProtKB *.dat files.
163892325845 Initial commit. galaxyp parents: diff changeset	21 use Log::Log4perl qw(:easy);
163892325845 Initial commit. galaxyp parents: diff changeset	22
163892325845 Initial commit. galaxyp parents: diff changeset	23 my $ps = '/'; # Path Separator.
163892325845 Initial commit. galaxyp parents: diff changeset	24 my %log_levels = (
163892325845 Initial commit. galaxyp parents: diff changeset	25 'ALL' => $ALL,
163892325845 Initial commit. galaxyp parents: diff changeset	26 'TRACE' => $TRACE,
163892325845 Initial commit. galaxyp parents: diff changeset	27 'DEBUG' => $DEBUG,
163892325845 Initial commit. galaxyp parents: diff changeset	28 'INFO' => $INFO,
163892325845 Initial commit. galaxyp parents: diff changeset	29 'WARN' => $WARN,
163892325845 Initial commit. galaxyp parents: diff changeset	30 'ERROR' => $ERROR,
163892325845 Initial commit. galaxyp parents: diff changeset	31 'FATAL' => $FATAL,
163892325845 Initial commit. galaxyp parents: diff changeset	32 'OFF' => $OFF,
163892325845 Initial commit. galaxyp parents: diff changeset	33 );
163892325845 Initial commit. galaxyp parents: diff changeset	34
163892325845 Initial commit. galaxyp parents: diff changeset	35 #
163892325845 Initial commit. galaxyp parents: diff changeset	36 # List of databases, which may contain versioned accession numbers,
163892325845 Initial commit. galaxyp parents: diff changeset	37 # and the separation character used to join the accession number and
163892325845 Initial commit. galaxyp parents: diff changeset	38 # the version number.
163892325845 Initial commit. galaxyp parents: diff changeset	39 #
163892325845 Initial commit. galaxyp parents: diff changeset	40 my %databases_with_optionally_versioned_ids = (
163892325845 Initial commit. galaxyp parents: diff changeset	41 'IPI' => '.',
163892325845 Initial commit. galaxyp parents: diff changeset	42 'ipi' => '.',
163892325845 Initial commit. galaxyp parents: diff changeset	43 'UniProtKB/Swiss-Prot' => '.',
163892325845 Initial commit. galaxyp parents: diff changeset	44 'sp' => '.',
163892325845 Initial commit. galaxyp parents: diff changeset	45 'SP' => '.',
163892325845 Initial commit. galaxyp parents: diff changeset	46 'UniProtKB/TrEMBL' => '.',
163892325845 Initial commit. galaxyp parents: diff changeset	47 'TR' => '.',
163892325845 Initial commit. galaxyp parents: diff changeset	48 'tr' => '.',
163892325845 Initial commit. galaxyp parents: diff changeset	49 'Tr' => '.'
163892325845 Initial commit. galaxyp parents: diff changeset	50 );
163892325845 Initial commit. galaxyp parents: diff changeset	51
163892325845 Initial commit. galaxyp parents: diff changeset	52 my $db;
163892325845 Initial commit. galaxyp parents: diff changeset	53 my $db_alt;
163892325845 Initial commit. galaxyp parents: diff changeset	54 my $db_format;
163892325845 Initial commit. galaxyp parents: diff changeset	55 my $index_id; # Key used to lookup sequences. Can be ID or Accession number (default).
163892325845 Initial commit. galaxyp parents: diff changeset	56 my $fragments;
163892325845 Initial commit. galaxyp parents: diff changeset	57 my $id_column;
163892325845 Initial commit. galaxyp parents: diff changeset	58 my $peptide_column;
163892325845 Initial commit. galaxyp parents: diff changeset	59 my $strip_lc; # For the sequence fragments in the fragments file. With strip_lc enabled we
163892325845 Initial commit. galaxyp parents: diff changeset	60 # assume amino acids are in upper case and lower case characters represent modifications,
163892325845 Initial commit. galaxyp parents: diff changeset	61 # which will be stripped off before searching for the fragments in the complete sequences.
163892325845 Initial commit. galaxyp parents: diff changeset	62 my $cleavage_output; # Sequence contexts for cleavage sites.
163892325845 Initial commit. galaxyp parents: diff changeset	63 my $miscleavage_output; # Sequence contexts for miscleavage sites.
163892325845 Initial commit. galaxyp parents: diff changeset	64 my $modification_output; # Sequence contexts for modification sites.
163892325845 Initial commit. galaxyp parents: diff changeset	65 my $peptide_output; # Sequence contexts for complete peptides.
163892325845 Initial commit. galaxyp parents: diff changeset	66 my $cleavage_aa; # Assume the sequence fragments were derived from cutting at this amino acid.
163892325845 Initial commit. galaxyp parents: diff changeset	67 my $cleavage_terminus; # Assume the sequence fragments were derived from cutting at this side of $cut_at_aa
163892325845 Initial commit. galaxyp parents: diff changeset	68 my $modified_aa;
163892325845 Initial commit. galaxyp parents: diff changeset	69 my $n_term_context_length;
163892325845 Initial commit. galaxyp parents: diff changeset	70 my $c_term_context_length;
163892325845 Initial commit. galaxyp parents: diff changeset	71 my $miscleavage_distance; # Minimal distance between a putative miscleavage site and the peptide termini.
163892325845 Initial commit. galaxyp parents: diff changeset	72 # Uncleaved AAs closer to the peptide termini can be ignored with this parameter:
163892325845 Initial commit. galaxyp parents: diff changeset	73 # A. To prevent overlap between cleavage and miscleavage peptide sequence contexts.
163892325845 Initial commit. galaxyp parents: diff changeset	74 # B. To remove false positive miscleavage sites that cannot be cleaved,
163892325845 Initial commit. galaxyp parents: diff changeset	75 # because one of the resulting fragments is too short. Sometimes proteases may need
163892325845 Initial commit. galaxyp parents: diff changeset	76 # a minimal length surrounding the cleavage site to bind and cleave a peptide.
163892325845 Initial commit. galaxyp parents: diff changeset	77 my $padding_character; # Used for padding if the protein sequence is too short for a long enough context.
163892325845 Initial commit. galaxyp parents: diff changeset	78 my $log_level;
163892325845 Initial commit. galaxyp parents: diff changeset	79
163892325845 Initial commit. galaxyp parents: diff changeset	80 Getopt::Long::GetOptions (
163892325845 Initial commit. galaxyp parents: diff changeset	81 'db=s' => \$db,
163892325845 Initial commit. galaxyp parents: diff changeset	82 'dba:s' => \$db_alt,
163892325845 Initial commit. galaxyp parents: diff changeset	83 'dbf:s' => \$db_format,
163892325845 Initial commit. galaxyp parents: diff changeset	84 'k:s' => \$index_id, # Key used to lookup sequences. Can be ID or Accession number (default).
163892325845 Initial commit. galaxyp parents: diff changeset	85 'f=s' => \$fragments,
163892325845 Initial commit. galaxyp parents: diff changeset	86 'icol:i' => \$id_column,
163892325845 Initial commit. galaxyp parents: diff changeset	87 'pcol:i' => \$peptide_column,
163892325845 Initial commit. galaxyp parents: diff changeset	88 's' => \$strip_lc, # For the sequence fragments in the fragments file. With strip_lc enabled we
163892325845 Initial commit. galaxyp parents: diff changeset	89 # assume amino acids are in upper case and lower case characters represent modifications,
163892325845 Initial commit. galaxyp parents: diff changeset	90 # which will be stripped off before searching for the fragments in the complete sequences.
163892325845 Initial commit. galaxyp parents: diff changeset	91 'cleo:s' => \$cleavage_output,
163892325845 Initial commit. galaxyp parents: diff changeset	92 'miso:s' => \$miscleavage_output,
163892325845 Initial commit. galaxyp parents: diff changeset	93 'modo:s' => \$modification_output,
163892325845 Initial commit. galaxyp parents: diff changeset	94 'pepo:s' => \$peptide_output,
163892325845 Initial commit. galaxyp parents: diff changeset	95 'ca:s' => \$cleavage_aa, # Assume the sequence fragments were derived from cutting at this amino acid.
163892325845 Initial commit. galaxyp parents: diff changeset	96 'ct:s' => \$cleavage_terminus, # Assume the sequence fragments were derived from cutting at this side of $cut_at_aa
163892325845 Initial commit. galaxyp parents: diff changeset	97 'ma:s' => \$modified_aa,
163892325845 Initial commit. galaxyp parents: diff changeset	98 'n:i' => \$n_term_context_length,
163892325845 Initial commit. galaxyp parents: diff changeset	99 'c:i' => \$c_term_context_length,
163892325845 Initial commit. galaxyp parents: diff changeset	100 'mcd:i' => \$miscleavage_distance, # Minimal distance between a putative miscleavage site and the peptide termini.
163892325845 Initial commit. galaxyp parents: diff changeset	101 # Uncleaved AAs closer to the peptide termini can be ignored with this parameter:
163892325845 Initial commit. galaxyp parents: diff changeset	102 # A. To prevent overlap between cleavage and miscleavage peptide sequence contexts.
163892325845 Initial commit. galaxyp parents: diff changeset	103 # B. To remove false positive miscleavage sites that cannot be cleaved,
163892325845 Initial commit. galaxyp parents: diff changeset	104 # because one of the resulting fragments is too short. Sometimes proteases may need
163892325845 Initial commit. galaxyp parents: diff changeset	105 # a minimal length surrounding the cleavage site to bind and cleave a peptide.
163892325845 Initial commit. galaxyp parents: diff changeset	106 'pc:s' => \$padding_character,
163892325845 Initial commit. galaxyp parents: diff changeset	107 'll:s' => \$log_level
163892325845 Initial commit. galaxyp parents: diff changeset	108 );
163892325845 Initial commit. galaxyp parents: diff changeset	109
163892325845 Initial commit. galaxyp parents: diff changeset	110 #
163892325845 Initial commit. galaxyp parents: diff changeset	111 # TODO: 1. Handle --mcd param. Maybe should be extended to exclude also modification sites too close together...
163892325845 Initial commit. galaxyp parents: diff changeset	112 #
163892325845 Initial commit. galaxyp parents: diff changeset	113
163892325845 Initial commit. galaxyp parents: diff changeset	114 #
163892325845 Initial commit. galaxyp parents: diff changeset	115 # Configure logging.
163892325845 Initial commit. galaxyp parents: diff changeset	116 #
163892325845 Initial commit. galaxyp parents: diff changeset	117 # Provides default if user did not specify log level:
163892325845 Initial commit. galaxyp parents: diff changeset	118 $log_level = (defined($log_level) ? $log_level : 'INFO');
163892325845 Initial commit. galaxyp parents: diff changeset	119 # Reset log level to default if user specified illegal log level.
163892325845 Initial commit. galaxyp parents: diff changeset	120 $log_level = (defined($log_levels{$log_level}) ? $log_levels{$log_level} : $log_levels{'INFO'});
163892325845 Initial commit. galaxyp parents: diff changeset	121 #Log::Log4perl->init('log4perl.properties');
163892325845 Initial commit. galaxyp parents: diff changeset	122 Log::Log4perl->easy_init(
163892325845 Initial commit. galaxyp parents: diff changeset	123 #{ level => $log_level,
163892325845 Initial commit. galaxyp parents: diff changeset	124 # file => ">>ExtractPeptideSequenceContext.log",
163892325845 Initial commit. galaxyp parents: diff changeset	125 # layout => '%F{1}-%L-%M: %m%n' },
163892325845 Initial commit. galaxyp parents: diff changeset	126 { level => $log_level,
163892325845 Initial commit. galaxyp parents: diff changeset	127 file => "STDOUT",
163892325845 Initial commit. galaxyp parents: diff changeset	128 layout => '%d L:%L %p> %m%n' },
163892325845 Initial commit. galaxyp parents: diff changeset	129 );
163892325845 Initial commit. galaxyp parents: diff changeset	130 my $logger = Log::Log4perl::get_logger();
163892325845 Initial commit. galaxyp parents: diff changeset	131
163892325845 Initial commit. galaxyp parents: diff changeset	132 #
163892325845 Initial commit. galaxyp parents: diff changeset	133 # Check user input.
163892325845 Initial commit. galaxyp parents: diff changeset	134 #
163892325845 Initial commit. galaxyp parents: diff changeset	135 foreach my $input ($db, $db_alt, $fragments, $cleavage_output, $miscleavage_output, $modification_output) {
163892325845 Initial commit. galaxyp parents: diff changeset	136 if (defined($input) && length($input) <= 0) {
163892325845 Initial commit. galaxyp parents: diff changeset	137 $input = undef();
163892325845 Initial commit. galaxyp parents: diff changeset	138 }
163892325845 Initial commit. galaxyp parents: diff changeset	139 }
163892325845 Initial commit. galaxyp parents: diff changeset	140 unless (defined($db) &&
163892325845 Initial commit. galaxyp parents: diff changeset	141 defined($fragments) &&
163892325845 Initial commit. galaxyp parents: diff changeset	142 (defined($cleavage_output) \|\| defined($miscleavage_output) \|\| defined($modification_output) \|\| defined($peptide_output))
163892325845 Initial commit. galaxyp parents: diff changeset	143 ) {
163892325845 Initial commit. galaxyp parents: diff changeset	144 $logger->fatal('You need to specify at least a DB & fragments file as inputs and at least one output file.');
163892325845 Initial commit. galaxyp parents: diff changeset	145 _Usage();
163892325845 Initial commit. galaxyp parents: diff changeset	146 exit;
163892325845 Initial commit. galaxyp parents: diff changeset	147 }
163892325845 Initial commit. galaxyp parents: diff changeset	148 if (defined($db_format)) {
163892325845 Initial commit. galaxyp parents: diff changeset	149 unless ($db_format eq 'UniProtKB DAT' or $db_format eq 'FASTA') {
163892325845 Initial commit. galaxyp parents: diff changeset	150 $logger->fatal('Database format in unsupported format.');
163892325845 Initial commit. galaxyp parents: diff changeset	151 $logger->fatal("\t" . 'Must be \'UniProtKB DAT\' or \'FASTA\'.');
163892325845 Initial commit. galaxyp parents: diff changeset	152 $logger->fatal("\t" . 'But --dbf was ' . $db_format . '.');
163892325845 Initial commit. galaxyp parents: diff changeset	153 exit;
163892325845 Initial commit. galaxyp parents: diff changeset	154 }
163892325845 Initial commit. galaxyp parents: diff changeset	155 }
163892325845 Initial commit. galaxyp parents: diff changeset	156 if (defined($cleavage_aa)) {
163892325845 Initial commit. galaxyp parents: diff changeset	157 unless ($cleavage_aa =~ m/^[*A-Z]$/) {
163892325845 Initial commit. galaxyp parents: diff changeset	158 $logger->fatal('Cleavage amino acid in unsupported format.');
163892325845 Initial commit. galaxyp parents: diff changeset	159 $cleavage_aa = undef;
163892325845 Initial commit. galaxyp parents: diff changeset	160 }
163892325845 Initial commit. galaxyp parents: diff changeset	161 }
163892325845 Initial commit. galaxyp parents: diff changeset	162 if (defined($cleavage_terminus)) {
163892325845 Initial commit. galaxyp parents: diff changeset	163 unless ($cleavage_terminus =~ m/^[*CN]$/) {
163892325845 Initial commit. galaxyp parents: diff changeset	164 $logger->fatal('Cleavage terminus in unsupported format.');
163892325845 Initial commit. galaxyp parents: diff changeset	165 $cleavage_terminus = undef;
163892325845 Initial commit. galaxyp parents: diff changeset	166 }
163892325845 Initial commit. galaxyp parents: diff changeset	167 }
163892325845 Initial commit. galaxyp parents: diff changeset	168 if (defined($modified_aa)) {
163892325845 Initial commit. galaxyp parents: diff changeset	169 unless ($modified_aa =~ m/^[A-Z][a-z]+$/ \|\| $modified_aa =~ m/^[a-z]+[A-Z]$/) {
163892325845 Initial commit. galaxyp parents: diff changeset	170 $logger->fatal('Modified amino acid in unsupported format.');
163892325845 Initial commit. galaxyp parents: diff changeset	171 $modified_aa = undef;
163892325845 Initial commit. galaxyp parents: diff changeset	172 }
163892325845 Initial commit. galaxyp parents: diff changeset	173 }
163892325845 Initial commit. galaxyp parents: diff changeset	174 #
163892325845 Initial commit. galaxyp parents: diff changeset	175 # We need info to extract:
163892325845 Initial commit. galaxyp parents: diff changeset	176 # * cleavage site sequence contexts or
163892325845 Initial commit. galaxyp parents: diff changeset	177 # * modification site sequence contexts or
163892325845 Initial commit. galaxyp parents: diff changeset	178 # * peptide sequence contexts
163892325845 Initial commit. galaxyp parents: diff changeset	179 #
163892325845 Initial commit. galaxyp parents: diff changeset	180 if (defined($cleavage_output) \|\| defined($miscleavage_output)) {
163892325845 Initial commit. galaxyp parents: diff changeset	181 unless (defined($cleavage_aa) && defined($cleavage_terminus)) {
163892325845 Initial commit. galaxyp parents: diff changeset	182 $logger->fatal('If you want to retrieve (mis)cleavage site sequence contexts, you must provide valid values for --ca and --ct.');
163892325845 Initial commit. galaxyp parents: diff changeset	183 $logger->debug('You specified --ca ' . $cleavage_aa . ' --ct ' . $cleavage_terminus . '.');
163892325845 Initial commit. galaxyp parents: diff changeset	184 _Usage();
163892325845 Initial commit. galaxyp parents: diff changeset	185 exit;
163892325845 Initial commit. galaxyp parents: diff changeset	186 }
163892325845 Initial commit. galaxyp parents: diff changeset	187 }
163892325845 Initial commit. galaxyp parents: diff changeset	188 if (defined($modification_output)) {
163892325845 Initial commit. galaxyp parents: diff changeset	189 unless (defined($modified_aa)) {
163892325845 Initial commit. galaxyp parents: diff changeset	190 $logger->fatal('If you want to retrieve modifications site sequence contexts you must provide a valid value for --ma.');
163892325845 Initial commit. galaxyp parents: diff changeset	191 $logger->debug('You specified --ma ' . $modified_aa . '.');
163892325845 Initial commit. galaxyp parents: diff changeset	192 _Usage();
163892325845 Initial commit. galaxyp parents: diff changeset	193 exit;
163892325845 Initial commit. galaxyp parents: diff changeset	194 }
163892325845 Initial commit. galaxyp parents: diff changeset	195 }
163892325845 Initial commit. galaxyp parents: diff changeset	196 if (defined($peptide_output)) {
163892325845 Initial commit. galaxyp parents: diff changeset	197 #
163892325845 Initial commit. galaxyp parents: diff changeset	198 # We can work with defaults for all params.
163892325845 Initial commit. galaxyp parents: diff changeset	199 #
163892325845 Initial commit. galaxyp parents: diff changeset	200 }
163892325845 Initial commit. galaxyp parents: diff changeset	201
163892325845 Initial commit. galaxyp parents: diff changeset	202 # Make sure we don't overwrite the input.
163892325845 Initial commit. galaxyp parents: diff changeset	203 foreach my $output ($cleavage_output, $miscleavage_output, $modification_output, $peptide_output) {
163892325845 Initial commit. galaxyp parents: diff changeset	204 foreach my $input ($db, $db_alt, $fragments) {
163892325845 Initial commit. galaxyp parents: diff changeset	205 if (defined($input) && defined($output) && $output eq $input) {
163892325845 Initial commit. galaxyp parents: diff changeset	206 $logger->fatal('Output file ' . $output . ' is the same as input file ' . $input . '.');
163892325845 Initial commit. galaxyp parents: diff changeset	207 $logger->fatal("\t" . 'Please choose a different file for the output.');
163892325845 Initial commit. galaxyp parents: diff changeset	208 exit;
163892325845 Initial commit. galaxyp parents: diff changeset	209 }
163892325845 Initial commit. galaxyp parents: diff changeset	210 }
163892325845 Initial commit. galaxyp parents: diff changeset	211 }
163892325845 Initial commit. galaxyp parents: diff changeset	212 # Define defaults for optional arguments.
163892325845 Initial commit. galaxyp parents: diff changeset	213 $index_id = (defined($index_id) && length($index_id) > 0 ? $index_id : 'AC');
163892325845 Initial commit. galaxyp parents: diff changeset	214 $id_column = (defined($id_column) && length($id_column) > 0 ? $id_column : 1);
163892325845 Initial commit. galaxyp parents: diff changeset	215 $peptide_column = (defined($peptide_column) && length($peptide_column) > 0 ? $peptide_column : 2);
163892325845 Initial commit. galaxyp parents: diff changeset	216 $n_term_context_length = (defined($n_term_context_length) && length($n_term_context_length) > 0 ? $n_term_context_length : 5);
163892325845 Initial commit. galaxyp parents: diff changeset	217 $c_term_context_length = (defined($c_term_context_length) && length($c_term_context_length) > 0 ? $c_term_context_length : 5);
163892325845 Initial commit. galaxyp parents: diff changeset	218 $padding_character = (defined($padding_character) ? $padding_character : '-');
163892325845 Initial commit. galaxyp parents: diff changeset	219 # Stripping of lowercase characters is disabled by default unless we search for modification site contexts in which case it must be enabled!
163892325845 Initial commit. galaxyp parents: diff changeset	220 $strip_lc = (defined($strip_lc) ? $strip_lc : 0);
163892325845 Initial commit. galaxyp parents: diff changeset	221 $strip_lc = (defined($modified_aa) ? 1 : $strip_lc);
163892325845 Initial commit. galaxyp parents: diff changeset	222 unless (defined($miscleavage_distance) && $miscleavage_distance > 0) {
163892325845 Initial commit. galaxyp parents: diff changeset	223 if ($n_term_context_length >= $c_term_context_length) {
163892325845 Initial commit. galaxyp parents: diff changeset	224 $miscleavage_distance = $n_term_context_length;
163892325845 Initial commit. galaxyp parents: diff changeset	225 } else {
163892325845 Initial commit. galaxyp parents: diff changeset	226 $miscleavage_distance = $c_term_context_length;
163892325845 Initial commit. galaxyp parents: diff changeset	227 }
163892325845 Initial commit. galaxyp parents: diff changeset	228 }
163892325845 Initial commit. galaxyp parents: diff changeset	229
163892325845 Initial commit. galaxyp parents: diff changeset	230
163892325845 Initial commit. galaxyp parents: diff changeset	231 #
163892325845 Initial commit. galaxyp parents: diff changeset	232 # Check db files and fragments file.
163892325845 Initial commit. galaxyp parents: diff changeset	233 #
163892325845 Initial commit. galaxyp parents: diff changeset	234 unless (-e $db && -f $db && -r $db) {
163892325845 Initial commit. galaxyp parents: diff changeset	235 $logger->fatal('Cannot read from database input file ' . $db . ': ' . $!);
163892325845 Initial commit. galaxyp parents: diff changeset	236 exit;
163892325845 Initial commit. galaxyp parents: diff changeset	237 }
163892325845 Initial commit. galaxyp parents: diff changeset	238 if (defined($db_alt)) {
163892325845 Initial commit. galaxyp parents: diff changeset	239 unless (-e $db_alt && -f $db_alt && -r $db_alt) {
163892325845 Initial commit. galaxyp parents: diff changeset	240 $logger->fatal('Cannot read from alternative database input file ' . $db_alt . ': ' . $!);
163892325845 Initial commit. galaxyp parents: diff changeset	241 exit;
163892325845 Initial commit. galaxyp parents: diff changeset	242 }
163892325845 Initial commit. galaxyp parents: diff changeset	243 }
163892325845 Initial commit. galaxyp parents: diff changeset	244 unless (-e $fragments && -f $fragments && -r $fragments) {
163892325845 Initial commit. galaxyp parents: diff changeset	245 $logger->fatal('Cannot read from fragments input file ' . $fragments . ': ' .$!);
163892325845 Initial commit. galaxyp parents: diff changeset	246 exit;
163892325845 Initial commit. galaxyp parents: diff changeset	247 }
163892325845 Initial commit. galaxyp parents: diff changeset	248
163892325845 Initial commit. galaxyp parents: diff changeset	249 #
163892325845 Initial commit. galaxyp parents: diff changeset	250 # Create lookup table(s) of the entire database input file(s).
163892325845 Initial commit. galaxyp parents: diff changeset	251 #
163892325845 Initial commit. galaxyp parents: diff changeset	252 my @db_lookup_table_refs;
163892325845 Initial commit. galaxyp parents: diff changeset	253 (my $db_seqs, $index_id) = _CreateLookupHash($db, $index_id, $db_format);
163892325845 Initial commit. galaxyp parents: diff changeset	254 my $seq_count = scalar(keys(%{$db_seqs}));
163892325845 Initial commit. galaxyp parents: diff changeset	255 $logger->info('Number of sequences in database lookup table: ' . $seq_count);
163892325845 Initial commit. galaxyp parents: diff changeset	256 push(@db_lookup_table_refs, $db_seqs);
163892325845 Initial commit. galaxyp parents: diff changeset	257 # Optional backup DB file.
163892325845 Initial commit. galaxyp parents: diff changeset	258 if (defined($db_alt)) {
163892325845 Initial commit. galaxyp parents: diff changeset	259 (my $db_seqs_alt, $index_id) = _CreateLookupHash($db_alt, $index_id, $db_format);
163892325845 Initial commit. galaxyp parents: diff changeset	260 my $seq_count_alt = scalar(keys(%{$db_seqs_alt}));
163892325845 Initial commit. galaxyp parents: diff changeset	261 $logger->info('Number of sequences in backup database lookup table: ' . $seq_count_alt);
163892325845 Initial commit. galaxyp parents: diff changeset	262 push(@db_lookup_table_refs, $db_seqs_alt);
163892325845 Initial commit. galaxyp parents: diff changeset	263 }
163892325845 Initial commit. galaxyp parents: diff changeset	264
163892325845 Initial commit. galaxyp parents: diff changeset	265 #
163892325845 Initial commit. galaxyp parents: diff changeset	266 # Debugging loop
163892325845 Initial commit. galaxyp parents: diff changeset	267 #
163892325845 Initial commit. galaxyp parents: diff changeset	268 #foreach my $keyyy (keys(%{$db_seqs})) {
163892325845 Initial commit. galaxyp parents: diff changeset	269 # $logger->fatal('Key: ' . $keyyy);
163892325845 Initial commit. galaxyp parents: diff changeset	270 # $logger->fatal("\t" . 'AC: ' . ${$db_seqs}{$keyyy}{'AC'});
163892325845 Initial commit. galaxyp parents: diff changeset	271 # $logger->fatal("\t" . 'ID: ' . ${$db_seqs}{$keyyy}{'ID'});
163892325845 Initial commit. galaxyp parents: diff changeset	272 # $logger->fatal("\t" . 'SQ: ' . ${$db_seqs}{$keyyy}{'SQ'});
163892325845 Initial commit. galaxyp parents: diff changeset	273 #}
163892325845 Initial commit. galaxyp parents: diff changeset	274
163892325845 Initial commit. galaxyp parents: diff changeset	275 #
163892325845 Initial commit. galaxyp parents: diff changeset	276 # Lookup sequence fragment contexts and store results.
163892325845 Initial commit. galaxyp parents: diff changeset	277 #
163892325845 Initial commit. galaxyp parents: diff changeset	278 my ($counter_cleavages, $counter_miscleavages, $counter_modifications, $counter_peptides) = _ExtractSeqs(\@db_lookup_table_refs,
163892325845 Initial commit. galaxyp parents: diff changeset	279 $fragments, $strip_lc, $id_column, $peptide_column,
163892325845 Initial commit. galaxyp parents: diff changeset	280 $cleavage_output, $miscleavage_output, $modification_output, $peptide_output,
163892325845 Initial commit. galaxyp parents: diff changeset	281 $cleavage_aa, $cleavage_terminus, $miscleavage_distance,
163892325845 Initial commit. galaxyp parents: diff changeset	282 $n_term_context_length, $c_term_context_length,
163892325845 Initial commit. galaxyp parents: diff changeset	283 $padding_character);
163892325845 Initial commit. galaxyp parents: diff changeset	284 #
163892325845 Initial commit. galaxyp parents: diff changeset	285 # Report result stats.
163892325845 Initial commit. galaxyp parents: diff changeset	286 #
163892325845 Initial commit. galaxyp parents: diff changeset	287 my $max_length = 0;
163892325845 Initial commit. galaxyp parents: diff changeset	288 foreach my $counter ($counter_cleavages, $counter_miscleavages, $counter_modifications, $counter_peptides) {
163892325845 Initial commit. galaxyp parents: diff changeset	289 $max_length = (length($counter) > $max_length) ? length($counter) : $max_length;
163892325845 Initial commit. galaxyp parents: diff changeset	290 }
163892325845 Initial commit. galaxyp parents: diff changeset	291 my $prefixed_counter = sprintf('%*s', $max_length, $counter_cleavages);
163892325845 Initial commit. galaxyp parents: diff changeset	292 $logger->info('Extracted cleavage site sequence contexts: ' . $prefixed_counter . '.');
163892325845 Initial commit. galaxyp parents: diff changeset	293 $prefixed_counter = sprintf('%*s', $max_length, $counter_miscleavages);
163892325845 Initial commit. galaxyp parents: diff changeset	294 $logger->info('Extracted miscleavage site sequence contexts: ' . $prefixed_counter . '.');
163892325845 Initial commit. galaxyp parents: diff changeset	295 $prefixed_counter = sprintf('%*s', $max_length, $counter_modifications);
163892325845 Initial commit. galaxyp parents: diff changeset	296 $logger->info('Extracted modification site sequence contexts: ' . $prefixed_counter . '.');
163892325845 Initial commit. galaxyp parents: diff changeset	297 $prefixed_counter = sprintf('%*s', $max_length, $counter_peptides);
163892325845 Initial commit. galaxyp parents: diff changeset	298 $logger->info('Extracted peptide sequence contexts: ' . $prefixed_counter . '.');
163892325845 Initial commit. galaxyp parents: diff changeset	299 $prefixed_counter = sprintf('%*s', $max_length, $counter_cleavages + $counter_miscleavages + $counter_modifications + $counter_peptides);
163892325845 Initial commit. galaxyp parents: diff changeset	300 $logger->info('Total extracted sequence contexts: ' . $prefixed_counter . '.');
163892325845 Initial commit. galaxyp parents: diff changeset	301 $logger->info('Finished!');
163892325845 Initial commit. galaxyp parents: diff changeset	302
163892325845 Initial commit. galaxyp parents: diff changeset	303 #
163892325845 Initial commit. galaxyp parents: diff changeset	304 ##
163892325845 Initial commit. galaxyp parents: diff changeset	305 ### Internal subs.
163892325845 Initial commit. galaxyp parents: diff changeset	306 ##
163892325845 Initial commit. galaxyp parents: diff changeset	307 #
163892325845 Initial commit. galaxyp parents: diff changeset	308
163892325845 Initial commit. galaxyp parents: diff changeset	309 sub _CreateLookupHash {
163892325845 Initial commit. galaxyp parents: diff changeset	310
163892325845 Initial commit. galaxyp parents: diff changeset	311 my ($file_path, $index_id, $file_type) = @_;
163892325845 Initial commit. galaxyp parents: diff changeset	312
163892325845 Initial commit. galaxyp parents: diff changeset	313 my $file_path_fh;
163892325845 Initial commit. galaxyp parents: diff changeset	314 my %seqs;
163892325845 Initial commit. galaxyp parents: diff changeset	315
163892325845 Initial commit. galaxyp parents: diff changeset	316 if (defined($file_type)) {
163892325845 Initial commit. galaxyp parents: diff changeset	317 $logger->info('Parsing ' . $file_type . ' file ' . $file_path . '...');
163892325845 Initial commit. galaxyp parents: diff changeset	318 } elsif ($file_path =~ m/($ps?)([^\.]+)\.dat$/i) {
163892325845 Initial commit. galaxyp parents: diff changeset	319 $file_type = 'UniProtKB DAT';
163892325845 Initial commit. galaxyp parents: diff changeset	320 $logger->info('Parsing ' . $file_type . ' file ' . $file_path . '...');
163892325845 Initial commit. galaxyp parents: diff changeset	321 } elsif ($file_path =~ m/($ps?)([^\.]+)\.fa(sta)?$/i) {
163892325845 Initial commit. galaxyp parents: diff changeset	322 $file_type = 'FASTA';
163892325845 Initial commit. galaxyp parents: diff changeset	323 # For FASTA files we don't know what to expect:
163892325845 Initial commit. galaxyp parents: diff changeset	324 # IDs or ACcession numbers or a mix of both.
163892325845 Initial commit. galaxyp parents: diff changeset	325 # (Re)set index_id type to AC to prevent adding redundant IDs to the results.
163892325845 Initial commit. galaxyp parents: diff changeset	326 # Hence treat all identifiers stable and unstable as ACs.
163892325845 Initial commit. galaxyp parents: diff changeset	327 # (Normally we default to ACcession numbers and if we would have found an AC
163892325845 Initial commit. galaxyp parents: diff changeset	328 # and the input fragments file was using IDs, we would append the AC to the results.)
163892325845 Initial commit. galaxyp parents: diff changeset	329 $index_id = 'AC';
163892325845 Initial commit. galaxyp parents: diff changeset	330 $logger->info('Parsing ' . $file_type . ' file ' . $file_path . '...');
163892325845 Initial commit. galaxyp parents: diff changeset	331 } else {
163892325845 Initial commit. galaxyp parents: diff changeset	332 $file_type = 'FASTA';
163892325845 Initial commit. galaxyp parents: diff changeset	333 $index_id = 'AC';
163892325845 Initial commit. galaxyp parents: diff changeset	334 $logger->warn('Could not determine database file type based on the file name extension of ' . $file_path);
163892325845 Initial commit. galaxyp parents: diff changeset	335 $logger->warn('Will assume it is a FASTA file and hope that is correct...');
163892325845 Initial commit. galaxyp parents: diff changeset	336 $logger->info('Parsing ' . $file_type . ' file ' . $file_path . '...');
163892325845 Initial commit. galaxyp parents: diff changeset	337 }
163892325845 Initial commit. galaxyp parents: diff changeset	338
163892325845 Initial commit. galaxyp parents: diff changeset	339 if ($file_type eq 'UniProtKB DAT') {
163892325845 Initial commit. galaxyp parents: diff changeset	340
163892325845 Initial commit. galaxyp parents: diff changeset	341 #
163892325845 Initial commit. galaxyp parents: diff changeset	342 # Read an entire record delimited by // at a time.
163892325845 Initial commit. galaxyp parents: diff changeset	343 #
163892325845 Initial commit. galaxyp parents: diff changeset	344 local $/ = "\/\/\n";
163892325845 Initial commit. galaxyp parents: diff changeset	345
163892325845 Initial commit. galaxyp parents: diff changeset	346 eval {
163892325845 Initial commit. galaxyp parents: diff changeset	347 open($file_path_fh, "<$file_path");
163892325845 Initial commit. galaxyp parents: diff changeset	348 };
163892325845 Initial commit. galaxyp parents: diff changeset	349 if ($@) {
163892325845 Initial commit. galaxyp parents: diff changeset	350 $logger->fatal('Cannot read from database input file: ' . $@);
163892325845 Initial commit. galaxyp parents: diff changeset	351 exit;
163892325845 Initial commit. galaxyp parents: diff changeset	352 }
163892325845 Initial commit. galaxyp parents: diff changeset	353
163892325845 Initial commit. galaxyp parents: diff changeset	354 while (<$file_path_fh>){
163892325845 Initial commit. galaxyp parents: diff changeset	355
163892325845 Initial commit. galaxyp parents: diff changeset	356 $logger->trace('Record:' . "\n$_\n");
163892325845 Initial commit. galaxyp parents: diff changeset	357
163892325845 Initial commit. galaxyp parents: diff changeset	358 my $entry = SWISS::Entry->fromText($_);
163892325845 Initial commit. galaxyp parents: diff changeset	359
163892325845 Initial commit. galaxyp parents: diff changeset	360 my $id = $entry->ID;
163892325845 Initial commit. galaxyp parents: diff changeset	361 my $acc = $entry->AC;
163892325845 Initial commit. galaxyp parents: diff changeset	362 my $seq = $entry->SQ;
163892325845 Initial commit. galaxyp parents: diff changeset	363 my $index_key;
163892325845 Initial commit. galaxyp parents: diff changeset	364
163892325845 Initial commit. galaxyp parents: diff changeset	365 if ($index_id eq 'ID') {
163892325845 Initial commit. galaxyp parents: diff changeset	366 $index_key = $id;
163892325845 Initial commit. galaxyp parents: diff changeset	367 } elsif ($index_id eq 'AC') {
163892325845 Initial commit. galaxyp parents: diff changeset	368 $index_key = $acc;
163892325845 Initial commit. galaxyp parents: diff changeset	369 } else {
163892325845 Initial commit. galaxyp parents: diff changeset	370 $logger->fatal('Unknown type of identifier used for indexing: ' . $index_id . '. Must be ID or AC.');
163892325845 Initial commit. galaxyp parents: diff changeset	371 exit;
163892325845 Initial commit. galaxyp parents: diff changeset	372 }
163892325845 Initial commit. galaxyp parents: diff changeset	373
163892325845 Initial commit. galaxyp parents: diff changeset	374 $logger->debug('Found accession ' . $acc);
163892325845 Initial commit. galaxyp parents: diff changeset	375 $logger->trace('Found accession ' . $acc . ' with ID ' . $id . ' and seq ' . $seq);
163892325845 Initial commit. galaxyp parents: diff changeset	376
163892325845 Initial commit. galaxyp parents: diff changeset	377 $seqs{$index_key}{'AC'} = $acc;
163892325845 Initial commit. galaxyp parents: diff changeset	378 $seqs{$index_key}{'ID'} = $id;
163892325845 Initial commit. galaxyp parents: diff changeset	379 $seqs{$index_key}{'SQ'} = $seq;
163892325845 Initial commit. galaxyp parents: diff changeset	380
163892325845 Initial commit. galaxyp parents: diff changeset	381 }
163892325845 Initial commit. galaxyp parents: diff changeset	382
163892325845 Initial commit. galaxyp parents: diff changeset	383 } elsif ($file_type eq 'FASTA') {
163892325845 Initial commit. galaxyp parents: diff changeset	384
163892325845 Initial commit. galaxyp parents: diff changeset	385 my @header_ids = ();
163892325845 Initial commit. galaxyp parents: diff changeset	386 my $seq = '';
163892325845 Initial commit. galaxyp parents: diff changeset	387
163892325845 Initial commit. galaxyp parents: diff changeset	388 eval {
163892325845 Initial commit. galaxyp parents: diff changeset	389 open($file_path_fh, "<$file_path");
163892325845 Initial commit. galaxyp parents: diff changeset	390 };
163892325845 Initial commit. galaxyp parents: diff changeset	391 if ($@) {
163892325845 Initial commit. galaxyp parents: diff changeset	392 $logger->fatal('Cannot read from database input file: ' . $@);
163892325845 Initial commit. galaxyp parents: diff changeset	393 exit;
163892325845 Initial commit. galaxyp parents: diff changeset	394 }
163892325845 Initial commit. galaxyp parents: diff changeset	395
163892325845 Initial commit. galaxyp parents: diff changeset	396 LINE: while (my $line = <$file_path_fh>) {
163892325845 Initial commit. galaxyp parents: diff changeset	397
163892325845 Initial commit. galaxyp parents: diff changeset	398 if ($line =~ m/^[\s\n\r]+$/) {
163892325845 Initial commit. galaxyp parents: diff changeset	399
163892325845 Initial commit. galaxyp parents: diff changeset	400 #
163892325845 Initial commit. galaxyp parents: diff changeset	401 # Skip blank line
163892325845 Initial commit. galaxyp parents: diff changeset	402 #
163892325845 Initial commit. galaxyp parents: diff changeset	403 $logger->debug('Found empty line in FASTA file.');
163892325845 Initial commit. galaxyp parents: diff changeset	404 next LINE;
163892325845 Initial commit. galaxyp parents: diff changeset	405
163892325845 Initial commit. galaxyp parents: diff changeset	406 } elsif ($line =~ m/^[a-zA-Z]+/) {
163892325845 Initial commit. galaxyp parents: diff changeset	407
163892325845 Initial commit. galaxyp parents: diff changeset	408 #
163892325845 Initial commit. galaxyp parents: diff changeset	409 # It's a sequence line
163892325845 Initial commit. galaxyp parents: diff changeset	410 # Remove all white space and new line characters.
163892325845 Initial commit. galaxyp parents: diff changeset	411 #
163892325845 Initial commit. galaxyp parents: diff changeset	412 $logger->debug('Found FASTA sequence line: ' . $line);
163892325845 Initial commit. galaxyp parents: diff changeset	413
163892325845 Initial commit. galaxyp parents: diff changeset	414 $line =~ s/[\s\n\r]+//g;
163892325845 Initial commit. galaxyp parents: diff changeset	415 $seq .= $line;
163892325845 Initial commit. galaxyp parents: diff changeset	416
163892325845 Initial commit. galaxyp parents: diff changeset	417 } elsif ($line =~ m/^>/) {
163892325845 Initial commit. galaxyp parents: diff changeset	418
163892325845 Initial commit. galaxyp parents: diff changeset	419 $logger->debug('Found new FASTA header line: ' . $line);
163892325845 Initial commit. galaxyp parents: diff changeset	420
163892325845 Initial commit. galaxyp parents: diff changeset	421 if ((length($seq) > 0) && defined($header_ids[0])) {
163892325845 Initial commit. galaxyp parents: diff changeset	422
163892325845 Initial commit. galaxyp parents: diff changeset	423 #
163892325845 Initial commit. galaxyp parents: diff changeset	424 # Store the previously found sequence in the lookup table.
163892325845 Initial commit. galaxyp parents: diff changeset	425 #
163892325845 Initial commit. galaxyp parents: diff changeset	426 foreach my $header_id (@header_ids) {
163892325845 Initial commit. galaxyp parents: diff changeset	427
163892325845 Initial commit. galaxyp parents: diff changeset	428 #
163892325845 Initial commit. galaxyp parents: diff changeset	429 # We don't know if the ID we found is an accession number (stable ID) or a normal (unstable) ID
163892325845 Initial commit. galaxyp parents: diff changeset	430 # -> Fill both the AC and ID fields with the same ID value.
163892325845 Initial commit. galaxyp parents: diff changeset	431 #
163892325845 Initial commit. galaxyp parents: diff changeset	432 $seqs{$header_id}{'AC'} = $header_id;
163892325845 Initial commit. galaxyp parents: diff changeset	433 $seqs{$header_id}{'ID'} = $header_id;
163892325845 Initial commit. galaxyp parents: diff changeset	434 $seqs{$header_id}{'SQ'} = $seq;
163892325845 Initial commit. galaxyp parents: diff changeset	435
163892325845 Initial commit. galaxyp parents: diff changeset	436 $logger->debug('Stored ' . $header_id . ':');
163892325845 Initial commit. galaxyp parents: diff changeset	437 $logger->debug("\t" . 'SQ ' . $seq);
163892325845 Initial commit. galaxyp parents: diff changeset	438
163892325845 Initial commit. galaxyp parents: diff changeset	439 }
163892325845 Initial commit. galaxyp parents: diff changeset	440
163892325845 Initial commit. galaxyp parents: diff changeset	441 #
163892325845 Initial commit. galaxyp parents: diff changeset	442 # Reset vars.
163892325845 Initial commit. galaxyp parents: diff changeset	443 #
163892325845 Initial commit. galaxyp parents: diff changeset	444 $seq = '';
163892325845 Initial commit. galaxyp parents: diff changeset	445 @header_ids = ();
163892325845 Initial commit. galaxyp parents: diff changeset	446
163892325845 Initial commit. galaxyp parents: diff changeset	447 }
163892325845 Initial commit. galaxyp parents: diff changeset	448
163892325845 Initial commit. galaxyp parents: diff changeset	449 #
163892325845 Initial commit. galaxyp parents: diff changeset	450 # Check for the presence of some frequently occurring naming schemes:
163892325845 Initial commit. galaxyp parents: diff changeset	451 #
163892325845 Initial commit. galaxyp parents: diff changeset	452 # >IPI:CON_Trypsin\|SWISS-PROT:P00761\|TRYP_PIG Trypsin - Sus scrofa (Pig).
163892325845 Initial commit. galaxyp parents: diff changeset	453 # >IPI:CON_IPI00174775.2\|TREMBL:Q32MB2;Q86Y46 Tax_Id=9606 Gene_Symbol=KRT73 Keratin-73
163892325845 Initial commit. galaxyp parents: diff changeset	454 # >sp\|Q42592\|APXS_ARATH L-ascorbate peroxidase S, chloroplastic/mitochondrial;
163892325845 Initial commit. galaxyp parents: diff changeset	455 # >jgi\|Triad1\|1\|gw1.3.1.1
163892325845 Initial commit. galaxyp parents: diff changeset	456 #
163892325845 Initial commit. galaxyp parents: diff changeset	457 # The FASTA header line can basically have any format.
163892325845 Initial commit. galaxyp parents: diff changeset	458 # Therefore we try to see if the IDs we are looking for are present anywhere
163892325845 Initial commit. galaxyp parents: diff changeset	459 # in the header line up until the first white space or new line.
163892325845 Initial commit. galaxyp parents: diff changeset	460 # This should prevent matching annotation from other proteins in the description
163892325845 Initial commit. galaxyp parents: diff changeset	461 # like a for example a 'best' BLAST hit that contains one of the IDs we
163892325845 Initial commit. galaxyp parents: diff changeset	462 # are looking for. Hence, in such a case this sequence is not the one of
163892325845 Initial commit. galaxyp parents: diff changeset	463 # the IDs we looking for, but similar to that one at best.
163892325845 Initial commit. galaxyp parents: diff changeset	464 #
163892325845 Initial commit. galaxyp parents: diff changeset	465 my @unfiltered_ids;
163892325845 Initial commit. galaxyp parents: diff changeset	466 if ($line =~ m/^>((([^\s;\|]+)[;\|])*([^\s;\|]+))[\|;]?/i) {
163892325845 Initial commit. galaxyp parents: diff changeset	467
163892325845 Initial commit. galaxyp parents: diff changeset	468 #
163892325845 Initial commit. galaxyp parents: diff changeset	469 # Got multiple IDs
163892325845 Initial commit. galaxyp parents: diff changeset	470 #
163892325845 Initial commit. galaxyp parents: diff changeset	471 my $multiple_ids = $1;
163892325845 Initial commit. galaxyp parents: diff changeset	472 @unfiltered_ids = split(/[\s;\|]+/, $multiple_ids);
163892325845 Initial commit. galaxyp parents: diff changeset	473
163892325845 Initial commit. galaxyp parents: diff changeset	474 } elsif ($line =~ m/^>([^\s]+)/i){
163892325845 Initial commit. galaxyp parents: diff changeset	475
163892325845 Initial commit. galaxyp parents: diff changeset	476 #
163892325845 Initial commit. galaxyp parents: diff changeset	477 # Got a single ID
163892325845 Initial commit. galaxyp parents: diff changeset	478 #
163892325845 Initial commit. galaxyp parents: diff changeset	479 my $single_id = $1;
163892325845 Initial commit. galaxyp parents: diff changeset	480 push(@unfiltered_ids, $single_id);
163892325845 Initial commit. galaxyp parents: diff changeset	481
163892325845 Initial commit. galaxyp parents: diff changeset	482 }
163892325845 Initial commit. galaxyp parents: diff changeset	483
163892325845 Initial commit. galaxyp parents: diff changeset	484 KEY:foreach my $unfiltered_id (@unfiltered_ids) {
163892325845 Initial commit. galaxyp parents: diff changeset	485
163892325845 Initial commit. galaxyp parents: diff changeset	486 my $this_id = $unfiltered_id;
163892325845 Initial commit. galaxyp parents: diff changeset	487
163892325845 Initial commit. galaxyp parents: diff changeset	488 #
163892325845 Initial commit. galaxyp parents: diff changeset	489 # Check for RockerBox formattted IDs.
163892325845 Initial commit. galaxyp parents: diff changeset	490 #
163892325845 Initial commit. galaxyp parents: diff changeset	491 # Example of RockerBox ID format, which may contain
163892325845 Initial commit. galaxyp parents: diff changeset	492 # * multiple IDs separated by semi-colons and
163892325845 Initial commit. galaxyp parents: diff changeset	493 # * suffixed with the position of the peptide in the protein:
163892325845 Initial commit. galaxyp parents: diff changeset	494 # YGR192C[123-142]; YJR009C[123-142]
163892325845 Initial commit. galaxyp parents: diff changeset	495 if ($this_id =~ m/^([^\[\]]+)\[\d+-\d+\]$/i) {
163892325845 Initial commit. galaxyp parents: diff changeset	496 $logger->trace('Protein ID in RockerBox format: ' . $this_id);
163892325845 Initial commit. galaxyp parents: diff changeset	497 $this_id = $1;
163892325845 Initial commit. galaxyp parents: diff changeset	498 $logger->trace("\t" . 'Using ID : ' . $this_id);
163892325845 Initial commit. galaxyp parents: diff changeset	499 }
163892325845 Initial commit. galaxyp parents: diff changeset	500
163892325845 Initial commit. galaxyp parents: diff changeset	501 #
163892325845 Initial commit. galaxyp parents: diff changeset	502 # Check for DB namespace prefixes.
163892325845 Initial commit. galaxyp parents: diff changeset	503 #
163892325845 Initial commit. galaxyp parents: diff changeset	504 if ($this_id =~ m/([^:]+):([^:]+)/i) {
163892325845 Initial commit. galaxyp parents: diff changeset	505
163892325845 Initial commit. galaxyp parents: diff changeset	506 #
163892325845 Initial commit. galaxyp parents: diff changeset	507 # Namespace prefixed ID.
163892325845 Initial commit. galaxyp parents: diff changeset	508 #
163892325845 Initial commit. galaxyp parents: diff changeset	509 $logger->trace('Namespace prefixed ID: ' . $this_id);
163892325845 Initial commit. galaxyp parents: diff changeset	510
163892325845 Initial commit. galaxyp parents: diff changeset	511 my $prefix = $1;
163892325845 Initial commit. galaxyp parents: diff changeset	512 $this_id = $2;
163892325845 Initial commit. galaxyp parents: diff changeset	513
163892325845 Initial commit. galaxyp parents: diff changeset	514 $logger->trace("\t" . 'Using ID: ' . $this_id);
163892325845 Initial commit. galaxyp parents: diff changeset	515
163892325845 Initial commit. galaxyp parents: diff changeset	516 #
163892325845 Initial commit. galaxyp parents: diff changeset	517 # For a selected list of "known" database namespace prefixes,
163892325845 Initial commit. galaxyp parents: diff changeset	518 # check for versioned accession numbers and remove the version number.
163892325845 Initial commit. galaxyp parents: diff changeset	519 #
163892325845 Initial commit. galaxyp parents: diff changeset	520 if (defined($prefix) && defined($databases_with_optionally_versioned_ids{$prefix})) {
163892325845 Initial commit. galaxyp parents: diff changeset	521 $logger->trace('Found prefix: ' . $prefix);
163892325845 Initial commit. galaxyp parents: diff changeset	522 my $separator = $databases_with_optionally_versioned_ids{$prefix};
163892325845 Initial commit. galaxyp parents: diff changeset	523 $separator = '\\' . $separator;
163892325845 Initial commit. galaxyp parents: diff changeset	524 if ($this_id =~ m/([^$separator]+)$separator\d+$/) {
163892325845 Initial commit. galaxyp parents: diff changeset	525 $this_id = $1;
163892325845 Initial commit. galaxyp parents: diff changeset	526 }
163892325845 Initial commit. galaxyp parents: diff changeset	527 }
163892325845 Initial commit. galaxyp parents: diff changeset	528 }
163892325845 Initial commit. galaxyp parents: diff changeset	529
163892325845 Initial commit. galaxyp parents: diff changeset	530 $logger->debug('Found ID: ' . $this_id);
163892325845 Initial commit. galaxyp parents: diff changeset	531 push(@header_ids, $this_id);
163892325845 Initial commit. galaxyp parents: diff changeset	532
163892325845 Initial commit. galaxyp parents: diff changeset	533 }
163892325845 Initial commit. galaxyp parents: diff changeset	534
163892325845 Initial commit. galaxyp parents: diff changeset	535 # if ($line =~ m/^>((([^\s\n\r:;\|]+)[:]([^\s\n\r:\|]+)[\|;])*([^\s\n\r:;\|]+)[:]([^\s\n\r:\|]+))[\|;]?(\s+(.+))?/i) {
163892325845 Initial commit. galaxyp parents: diff changeset	536 #
163892325845 Initial commit. galaxyp parents: diff changeset	537 # #
163892325845 Initial commit. galaxyp parents: diff changeset	538 # # One or more namespace prefixed IDs.
163892325845 Initial commit. galaxyp parents: diff changeset	539 # #
163892325845 Initial commit. galaxyp parents: diff changeset	540 # my $concatenated_namespace_prefixed_ids = $1;
163892325845 Initial commit. galaxyp parents: diff changeset	541 # $logger->debug('Found prefixed IDs in header: ' . $concatenated_namespace_prefixed_ids);
163892325845 Initial commit. galaxyp parents: diff changeset	542 #
163892325845 Initial commit. galaxyp parents: diff changeset	543 # # database_namespace = $3 && $5;
163892325845 Initial commit. galaxyp parents: diff changeset	544 # # id = $4 && $6;
163892325845 Initial commit. galaxyp parents: diff changeset	545 # # description = $8;
163892325845 Initial commit. galaxyp parents: diff changeset	546 # my @namespace_prefixed_ids = split(/[\|;]/, $concatenated_namespace_prefixed_ids);
163892325845 Initial commit. galaxyp parents: diff changeset	547 #
163892325845 Initial commit. galaxyp parents: diff changeset	548 # foreach my $prefixed_id (@namespace_prefixed_ids) {
163892325845 Initial commit. galaxyp parents: diff changeset	549 #
163892325845 Initial commit. galaxyp parents: diff changeset	550 # $logger->trace('Prefixed ID: ' . $prefixed_id);
163892325845 Initial commit. galaxyp parents: diff changeset	551 #
163892325845 Initial commit. galaxyp parents: diff changeset	552 # if ($prefixed_id =~ m/(([^\s\n\r:;\|]+):)?([^\s\n\r:\|]+)/i) {
163892325845 Initial commit. galaxyp parents: diff changeset	553 #
163892325845 Initial commit. galaxyp parents: diff changeset	554 # my $this_prefix = $2;
163892325845 Initial commit. galaxyp parents: diff changeset	555 # my $this_id = $3;
163892325845 Initial commit. galaxyp parents: diff changeset	556 # $logger->debug('Found ID: ' . $this_id);
163892325845 Initial commit. galaxyp parents: diff changeset	557 # push(@header_ids, $this_id);
163892325845 Initial commit. galaxyp parents: diff changeset	558 #
163892325845 Initial commit. galaxyp parents: diff changeset	559 # #
163892325845 Initial commit. galaxyp parents: diff changeset	560 # # For a selected list of "known" database namespace prefixes,
163892325845 Initial commit. galaxyp parents: diff changeset	561 # # check for versioned accession numbers and - if found - add
163892325845 Initial commit. galaxyp parents: diff changeset	562 # # both the versioned and the unversioned accession number
163892325845 Initial commit. galaxyp parents: diff changeset	563 # # to the list of header IDs.
163892325845 Initial commit. galaxyp parents: diff changeset	564 # #
163892325845 Initial commit. galaxyp parents: diff changeset	565 # if (defined($this_prefix) && defined($databases_with_optionally_versioned_ids{$this_prefix})) {
163892325845 Initial commit. galaxyp parents: diff changeset	566 # $logger->trace('Found prefix: ' . $this_prefix);
163892325845 Initial commit. galaxyp parents: diff changeset	567 # my $separator = $databases_with_optionally_versioned_ids{$this_prefix};
163892325845 Initial commit. galaxyp parents: diff changeset	568 # $separator = '\\' . $separator;
163892325845 Initial commit. galaxyp parents: diff changeset	569 # if ($this_id =~ m/([^$separator]+)$separator\d+$/) {
163892325845 Initial commit. galaxyp parents: diff changeset	570 # my $this_unversioned_id = $1;
163892325845 Initial commit. galaxyp parents: diff changeset	571 # $logger->debug('Found unversioned ID: ' . $this_unversioned_id);
163892325845 Initial commit. galaxyp parents: diff changeset	572 # push(@header_ids, $this_unversioned_id);
163892325845 Initial commit. galaxyp parents: diff changeset	573 # }
163892325845 Initial commit. galaxyp parents: diff changeset	574 # }
163892325845 Initial commit. galaxyp parents: diff changeset	575 #
163892325845 Initial commit. galaxyp parents: diff changeset	576 # } else {
163892325845 Initial commit. galaxyp parents: diff changeset	577 #
163892325845 Initial commit. galaxyp parents: diff changeset	578 # $logger->warn(
163892325845 Initial commit. galaxyp parents: diff changeset	579 # 'This should have been an optionally prefixed ID, '
163892325845 Initial commit. galaxyp parents: diff changeset	580 # . 'but failed to match the corresponding regex: '
163892325845 Initial commit. galaxyp parents: diff changeset	581 # . $prefixed_id);
163892325845 Initial commit. galaxyp parents: diff changeset	582 #
163892325845 Initial commit. galaxyp parents: diff changeset	583 # }
163892325845 Initial commit. galaxyp parents: diff changeset	584 # }
163892325845 Initial commit. galaxyp parents: diff changeset	585 #
163892325845 Initial commit. galaxyp parents: diff changeset	586 # } elsif ($line =~ m/^>((([^\s\n\r:;\|]+)[\|;])*([^\s\n\r:;\|]+))[\|;]?(\s+(.+))?/i) {
163892325845 Initial commit. galaxyp parents: diff changeset	587 #
163892325845 Initial commit. galaxyp parents: diff changeset	588 # #
163892325845 Initial commit. galaxyp parents: diff changeset	589 # # One or more unprefixed IDs.
163892325845 Initial commit. galaxyp parents: diff changeset	590 # #
163892325845 Initial commit. galaxyp parents: diff changeset	591 # my $concatenated_ids = $1;
163892325845 Initial commit. galaxyp parents: diff changeset	592 # $logger->debug('Found unprefixed IDs in header: ' . $concatenated_ids);
163892325845 Initial commit. galaxyp parents: diff changeset	593 #
163892325845 Initial commit. galaxyp parents: diff changeset	594 # # id = $3 && $4;
163892325845 Initial commit. galaxyp parents: diff changeset	595 # # description = $7;
163892325845 Initial commit. galaxyp parents: diff changeset	596 # my @unprefixed_ids = split(/[\|;]/, $concatenated_ids);
163892325845 Initial commit. galaxyp parents: diff changeset	597 #
163892325845 Initial commit. galaxyp parents: diff changeset	598 # foreach my $unprefixed_id (@unprefixed_ids) {
163892325845 Initial commit. galaxyp parents: diff changeset	599 #
163892325845 Initial commit. galaxyp parents: diff changeset	600 # $logger->debug('Found ID: ' . $unprefixed_id);
163892325845 Initial commit. galaxyp parents: diff changeset	601 # push(@header_ids, $unprefixed_id);
163892325845 Initial commit. galaxyp parents: diff changeset	602 #
163892325845 Initial commit. galaxyp parents: diff changeset	603 # }
163892325845 Initial commit. galaxyp parents: diff changeset	604 #
163892325845 Initial commit. galaxyp parents: diff changeset	605 # } else {
163892325845 Initial commit. galaxyp parents: diff changeset	606 #
163892325845 Initial commit. galaxyp parents: diff changeset	607 # #
163892325845 Initial commit. galaxyp parents: diff changeset	608 # # Something else.
163892325845 Initial commit. galaxyp parents: diff changeset	609 # #
163892325845 Initial commit. galaxyp parents: diff changeset	610 # # The FASTA header line can basically have any format,
163892325845 Initial commit. galaxyp parents: diff changeset	611 # # so this is probably one of the less frequent occurring annotation schemes.
163892325845 Initial commit. galaxyp parents: diff changeset	612 # # Therefore we try to see if the IDs we are looking for are present anywhere
163892325845 Initial commit. galaxyp parents: diff changeset	613 # # in the header line up until the first white space or new line.
163892325845 Initial commit. galaxyp parents: diff changeset	614 # # This may be tricky as we might match other annotation from other proteins
163892325845 Initial commit. galaxyp parents: diff changeset	615 # # like a description from a 'best' BLAST hit that contains one of the IDs we
163892325845 Initial commit. galaxyp parents: diff changeset	616 # # are looking for. Hence, in such a case this sequence is not the one of
163892325845 Initial commit. galaxyp parents: diff changeset	617 # # the IDs we looking for, but similar to that one at best.
163892325845 Initial commit. galaxyp parents: diff changeset	618 # #
163892325845 Initial commit. galaxyp parents: diff changeset	619 # if ($line =~ m/>([^\n\r\s]+)/) {
163892325845 Initial commit. galaxyp parents: diff changeset	620 #
163892325845 Initial commit. galaxyp parents: diff changeset	621 # my $putative_id = $1;
163892325845 Initial commit. galaxyp parents: diff changeset	622 # $logger->debug('Found putative ID in header: ' . $putative_id);
163892325845 Initial commit. galaxyp parents: diff changeset	623 # push(@header_ids, $putative_id);
163892325845 Initial commit. galaxyp parents: diff changeset	624 #
163892325845 Initial commit. galaxyp parents: diff changeset	625 # } else {
163892325845 Initial commit. galaxyp parents: diff changeset	626 # $logger->warn('Cannot identify IDs in this header: ' . $line);
163892325845 Initial commit. galaxyp parents: diff changeset	627 # }
163892325845 Initial commit. galaxyp parents: diff changeset	628 # }
163892325845 Initial commit. galaxyp parents: diff changeset	629 }
163892325845 Initial commit. galaxyp parents: diff changeset	630 }
163892325845 Initial commit. galaxyp parents: diff changeset	631
163892325845 Initial commit. galaxyp parents: diff changeset	632 #
163892325845 Initial commit. galaxyp parents: diff changeset	633 # Store the last found sequence in the lookup table.
163892325845 Initial commit. galaxyp parents: diff changeset	634 #
163892325845 Initial commit. galaxyp parents: diff changeset	635 foreach my $header_id (@header_ids) {
163892325845 Initial commit. galaxyp parents: diff changeset	636
163892325845 Initial commit. galaxyp parents: diff changeset	637 #
163892325845 Initial commit. galaxyp parents: diff changeset	638 # We don't know if the ID we found is an accession number (stable ID) or a normal (unstable) ID
163892325845 Initial commit. galaxyp parents: diff changeset	639 # -> Fill both the AC and ID fields with the same ID value.
163892325845 Initial commit. galaxyp parents: diff changeset	640 #
163892325845 Initial commit. galaxyp parents: diff changeset	641 $seqs{$header_id}{'AC'} = $header_id;
163892325845 Initial commit. galaxyp parents: diff changeset	642 $seqs{$header_id}{'ID'} = $header_id;
163892325845 Initial commit. galaxyp parents: diff changeset	643 $seqs{$header_id}{'SQ'} = $seq;
163892325845 Initial commit. galaxyp parents: diff changeset	644
163892325845 Initial commit. galaxyp parents: diff changeset	645 $logger->debug('Stored ' . $header_id . ':');
163892325845 Initial commit. galaxyp parents: diff changeset	646 $logger->debug("\t" . 'SQ ' . $seq);
163892325845 Initial commit. galaxyp parents: diff changeset	647
163892325845 Initial commit. galaxyp parents: diff changeset	648 }
163892325845 Initial commit. galaxyp parents: diff changeset	649
163892325845 Initial commit. galaxyp parents: diff changeset	650 #
163892325845 Initial commit. galaxyp parents: diff changeset	651 # Reset vars.
163892325845 Initial commit. galaxyp parents: diff changeset	652 #
163892325845 Initial commit. galaxyp parents: diff changeset	653 $seq = '';
163892325845 Initial commit. galaxyp parents: diff changeset	654 @header_ids = ();
163892325845 Initial commit. galaxyp parents: diff changeset	655
163892325845 Initial commit. galaxyp parents: diff changeset	656 }
163892325845 Initial commit. galaxyp parents: diff changeset	657
163892325845 Initial commit. galaxyp parents: diff changeset	658 close($file_path_fh);
163892325845 Initial commit. galaxyp parents: diff changeset	659 $logger->info('Created lookup list for database sequences.');
163892325845 Initial commit. galaxyp parents: diff changeset	660
163892325845 Initial commit. galaxyp parents: diff changeset	661 return (\%seqs, $index_id);
163892325845 Initial commit. galaxyp parents: diff changeset	662
163892325845 Initial commit. galaxyp parents: diff changeset	663 }
163892325845 Initial commit. galaxyp parents: diff changeset	664
163892325845 Initial commit. galaxyp parents: diff changeset	665 sub _ExtractSeqs {
163892325845 Initial commit. galaxyp parents: diff changeset	666
163892325845 Initial commit. galaxyp parents: diff changeset	667 my ($db_lookup_table_refs,
163892325845 Initial commit. galaxyp parents: diff changeset	668 $path_from, $strip_lc, $id_column, $peptide_column,
163892325845 Initial commit. galaxyp parents: diff changeset	669 $cleavage_path_to, $miscleavage_path_to, $modification_path_to, $peptide_path_to,
163892325845 Initial commit. galaxyp parents: diff changeset	670 $cleavage_aa, $cleavage_terminus, $miscleavage_distance,
163892325845 Initial commit. galaxyp parents: diff changeset	671 $n_term_context_length, $c_term_context_length,
163892325845 Initial commit. galaxyp parents: diff changeset	672 $padding_character) = @_;
163892325845 Initial commit. galaxyp parents: diff changeset	673
163892325845 Initial commit. galaxyp parents: diff changeset	674 $logger->debug('Parsing ' . $path_from . '...');
163892325845 Initial commit. galaxyp parents: diff changeset	675
163892325845 Initial commit. galaxyp parents: diff changeset	676 my $extracted_seq_contexts_cle = 0;
163892325845 Initial commit. galaxyp parents: diff changeset	677 my $extracted_seq_contexts_mis = 0;
163892325845 Initial commit. galaxyp parents: diff changeset	678 my $extracted_seq_contexts_mod = 0;
163892325845 Initial commit. galaxyp parents: diff changeset	679 my $extracted_seq_contexts_pep = 0;
163892325845 Initial commit. galaxyp parents: diff changeset	680 my $path_from_fh;
163892325845 Initial commit. galaxyp parents: diff changeset	681 my $cleavage_path_to_fh;
163892325845 Initial commit. galaxyp parents: diff changeset	682 my $miscleavage_path_to_fh;
163892325845 Initial commit. galaxyp parents: diff changeset	683 my $modification_path_to_fh;
163892325845 Initial commit. galaxyp parents: diff changeset	684 my $peptide_path_to_fh;
163892325845 Initial commit. galaxyp parents: diff changeset	685
163892325845 Initial commit. galaxyp parents: diff changeset	686 eval {
163892325845 Initial commit. galaxyp parents: diff changeset	687 open($path_from_fh, "<$path_from");
163892325845 Initial commit. galaxyp parents: diff changeset	688 };
163892325845 Initial commit. galaxyp parents: diff changeset	689 if ($@) {
163892325845 Initial commit. galaxyp parents: diff changeset	690 $logger->fatal('Cannot read from fragments input file: ' . $@);
163892325845 Initial commit. galaxyp parents: diff changeset	691 exit;
163892325845 Initial commit. galaxyp parents: diff changeset	692 }
163892325845 Initial commit. galaxyp parents: diff changeset	693 if (defined($cleavage_path_to)) {
163892325845 Initial commit. galaxyp parents: diff changeset	694 eval {
163892325845 Initial commit. galaxyp parents: diff changeset	695 open($cleavage_path_to_fh, ">$cleavage_path_to");
163892325845 Initial commit. galaxyp parents: diff changeset	696 };
163892325845 Initial commit. galaxyp parents: diff changeset	697 if ($@) {
163892325845 Initial commit. galaxyp parents: diff changeset	698 $logger->fatal('Cannot write to output file: ' . $@);
163892325845 Initial commit. galaxyp parents: diff changeset	699 exit;
163892325845 Initial commit. galaxyp parents: diff changeset	700 }
163892325845 Initial commit. galaxyp parents: diff changeset	701 }
163892325845 Initial commit. galaxyp parents: diff changeset	702 if (defined($miscleavage_path_to)) {
163892325845 Initial commit. galaxyp parents: diff changeset	703 eval {
163892325845 Initial commit. galaxyp parents: diff changeset	704 open($miscleavage_path_to_fh, ">$miscleavage_path_to");
163892325845 Initial commit. galaxyp parents: diff changeset	705 };
163892325845 Initial commit. galaxyp parents: diff changeset	706 if ($@) {
163892325845 Initial commit. galaxyp parents: diff changeset	707 $logger->fatal('Cannot write to output file: ' . $@);
163892325845 Initial commit. galaxyp parents: diff changeset	708 exit;
163892325845 Initial commit. galaxyp parents: diff changeset	709 }
163892325845 Initial commit. galaxyp parents: diff changeset	710 }
163892325845 Initial commit. galaxyp parents: diff changeset	711 if (defined($modification_path_to)) {
163892325845 Initial commit. galaxyp parents: diff changeset	712 eval {
163892325845 Initial commit. galaxyp parents: diff changeset	713 open($modification_path_to_fh, ">$modification_path_to");
163892325845 Initial commit. galaxyp parents: diff changeset	714 };
163892325845 Initial commit. galaxyp parents: diff changeset	715 if ($@) {
163892325845 Initial commit. galaxyp parents: diff changeset	716 $logger->fatal('Cannot write to output file: ' . $@);
163892325845 Initial commit. galaxyp parents: diff changeset	717 exit;
163892325845 Initial commit. galaxyp parents: diff changeset	718 }
163892325845 Initial commit. galaxyp parents: diff changeset	719 }
163892325845 Initial commit. galaxyp parents: diff changeset	720 if (defined($peptide_path_to)) {
163892325845 Initial commit. galaxyp parents: diff changeset	721 eval {
163892325845 Initial commit. galaxyp parents: diff changeset	722 open($peptide_path_to_fh, ">$peptide_path_to");
163892325845 Initial commit. galaxyp parents: diff changeset	723 };
163892325845 Initial commit. galaxyp parents: diff changeset	724 if ($@) {
163892325845 Initial commit. galaxyp parents: diff changeset	725 $logger->fatal('Cannot write to output file: ' . $@);
163892325845 Initial commit. galaxyp parents: diff changeset	726 exit;
163892325845 Initial commit. galaxyp parents: diff changeset	727 }
163892325845 Initial commit. galaxyp parents: diff changeset	728 }
163892325845 Initial commit. galaxyp parents: diff changeset	729
163892325845 Initial commit. galaxyp parents: diff changeset	730 LINE:while (my $line = <$path_from_fh>) {
163892325845 Initial commit. galaxyp parents: diff changeset	731
163892325845 Initial commit. galaxyp parents: diff changeset	732 #
163892325845 Initial commit. galaxyp parents: diff changeset	733 # Remove (trailing) line end characters!
163892325845 Initial commit. galaxyp parents: diff changeset	734 #
163892325845 Initial commit. galaxyp parents: diff changeset	735 $line =~ s/[\n\r\f]+//g;
163892325845 Initial commit. galaxyp parents: diff changeset	736 $logger->trace($line);
163892325845 Initial commit. galaxyp parents: diff changeset	737
163892325845 Initial commit. galaxyp parents: diff changeset	738 my @column_values = split(/\t/, $line);
163892325845 Initial commit. galaxyp parents: diff changeset	739
163892325845 Initial commit. galaxyp parents: diff changeset	740 # For debugging only.
163892325845 Initial commit. galaxyp parents: diff changeset	741 #foreach my $val (@column_values) {
163892325845 Initial commit. galaxyp parents: diff changeset	742 # $logger->debug('Col val: ' . $val);
163892325845 Initial commit. galaxyp parents: diff changeset	743 #}
163892325845 Initial commit. galaxyp parents: diff changeset	744
163892325845 Initial commit. galaxyp parents: diff changeset	745 my $id_column_index = $id_column - 1;
163892325845 Initial commit. galaxyp parents: diff changeset	746 my $peptide_column_index = $peptide_column - 1;
163892325845 Initial commit. galaxyp parents: diff changeset	747 my $unfiltered_key = $column_values[$id_column_index];
163892325845 Initial commit. galaxyp parents: diff changeset	748 my $key;
163892325845 Initial commit. galaxyp parents: diff changeset	749 my @keys;
163892325845 Initial commit. galaxyp parents: diff changeset	750 my $peptide = $column_values[$peptide_column_index];
163892325845 Initial commit. galaxyp parents: diff changeset	751 my $peptide_nonmod;
163892325845 Initial commit. galaxyp parents: diff changeset	752 my $protein;
163892325845 Initial commit. galaxyp parents: diff changeset	753 my $acc;
163892325845 Initial commit. galaxyp parents: diff changeset	754 my $id;
163892325845 Initial commit. galaxyp parents: diff changeset	755 my $index = -1; # An index < 0 indicates the peptide was not found in a protein sequence (yet).
163892325845 Initial commit. galaxyp parents: diff changeset	756 my $length;
163892325845 Initial commit. galaxyp parents: diff changeset	757 my $cleavage_peptide_contexts = [];
163892325845 Initial commit. galaxyp parents: diff changeset	758 my $miscleavage_peptide_contexts = [];
163892325845 Initial commit. galaxyp parents: diff changeset	759 my $modification_peptide_contexts = [];
163892325845 Initial commit. galaxyp parents: diff changeset	760 my $peptide_contexts = [];
163892325845 Initial commit. galaxyp parents: diff changeset	761
163892325845 Initial commit. galaxyp parents: diff changeset	762 #
163892325845 Initial commit. galaxyp parents: diff changeset	763 # Check if $key and $peptide were present in this line.
163892325845 Initial commit. galaxyp parents: diff changeset	764 # If not this may be a leading/trailing empty or meta-data line,
163892325845 Initial commit. galaxyp parents: diff changeset	765 # that does not contain the correct amount of columns.
163892325845 Initial commit. galaxyp parents: diff changeset	766 #
163892325845 Initial commit. galaxyp parents: diff changeset	767 unless (defined($unfiltered_key) && defined($peptide)) {
163892325845 Initial commit. galaxyp parents: diff changeset	768 $logger->warn('Skipping line: ' . $line);
163892325845 Initial commit. galaxyp parents: diff changeset	769 $logger->warn("\t" . '(Peptide sequence and/or protein ID missing or incorrect amount of columns.)');
163892325845 Initial commit. galaxyp parents: diff changeset	770 next LINE;
163892325845 Initial commit. galaxyp parents: diff changeset	771 }
163892325845 Initial commit. galaxyp parents: diff changeset	772
163892325845 Initial commit. galaxyp parents: diff changeset	773 #
163892325845 Initial commit. galaxyp parents: diff changeset	774 # Sanity check for peptide sequences.
163892325845 Initial commit. galaxyp parents: diff changeset	775 #
163892325845 Initial commit. galaxyp parents: diff changeset	776 unless ($peptide =~ m/([a-zA-Z]+)/) {
163892325845 Initial commit. galaxyp parents: diff changeset	777 $logger->warn('Fragment file line in unexpected format. Cannot find peptide in: ' . $line);
163892325845 Initial commit. galaxyp parents: diff changeset	778 $logger->debug('$peptide contains: ' . $peptide);
163892325845 Initial commit. galaxyp parents: diff changeset	779 next LINE;
163892325845 Initial commit. galaxyp parents: diff changeset	780 }
163892325845 Initial commit. galaxyp parents: diff changeset	781
163892325845 Initial commit. galaxyp parents: diff changeset	782 if ($strip_lc) {
163892325845 Initial commit. galaxyp parents: diff changeset	783 # Remove lower case characters representing modifications.
163892325845 Initial commit. galaxyp parents: diff changeset	784 $peptide_nonmod = $peptide;
163892325845 Initial commit. galaxyp parents: diff changeset	785 $peptide_nonmod =~ s/[a-z]+//g;
163892325845 Initial commit. galaxyp parents: diff changeset	786 } else {
163892325845 Initial commit. galaxyp parents: diff changeset	787 # Convert everything to uppercase.
163892325845 Initial commit. galaxyp parents: diff changeset	788 $peptide_nonmod = uc($peptide);
163892325845 Initial commit. galaxyp parents: diff changeset	789 }
163892325845 Initial commit. galaxyp parents: diff changeset	790
163892325845 Initial commit. galaxyp parents: diff changeset	791 #
163892325845 Initial commit. galaxyp parents: diff changeset	792 # Figure out in what format the Protein ID(s) were supplied.
163892325845 Initial commit. galaxyp parents: diff changeset	793 #
163892325845 Initial commit. galaxyp parents: diff changeset	794 # * Strip off optional DB namespace prefixes.
163892325845 Initial commit. galaxyp parents: diff changeset	795 # * Handle optional accession number version suffixes.
163892325845 Initial commit. galaxyp parents: diff changeset	796 #
163892325845 Initial commit. galaxyp parents: diff changeset	797 if ($unfiltered_key =~ m/^((([^\s;\|]+)[\s;\|]+)*([^\s;\|]+))[\s\|;]?/i) {
163892325845 Initial commit. galaxyp parents: diff changeset	798
163892325845 Initial commit. galaxyp parents: diff changeset	799 #
163892325845 Initial commit. galaxyp parents: diff changeset	800 # Got multiple IDs
163892325845 Initial commit. galaxyp parents: diff changeset	801 #
163892325845 Initial commit. galaxyp parents: diff changeset	802 my $multiple_ids = $1;
163892325845 Initial commit. galaxyp parents: diff changeset	803 @keys = split(/[\s;\|]+/, $multiple_ids);
163892325845 Initial commit. galaxyp parents: diff changeset	804
163892325845 Initial commit. galaxyp parents: diff changeset	805 } else {
163892325845 Initial commit. galaxyp parents: diff changeset	806
163892325845 Initial commit. galaxyp parents: diff changeset	807 #
163892325845 Initial commit. galaxyp parents: diff changeset	808 # Got a single ID
163892325845 Initial commit. galaxyp parents: diff changeset	809 #
163892325845 Initial commit. galaxyp parents: diff changeset	810 push(@keys, $unfiltered_key);
163892325845 Initial commit. galaxyp parents: diff changeset	811
163892325845 Initial commit. galaxyp parents: diff changeset	812 }
163892325845 Initial commit. galaxyp parents: diff changeset	813
163892325845 Initial commit. galaxyp parents: diff changeset	814 KEY:foreach $unfiltered_key (@keys) {
163892325845 Initial commit. galaxyp parents: diff changeset	815
163892325845 Initial commit. galaxyp parents: diff changeset	816 $key = $unfiltered_key;
163892325845 Initial commit. galaxyp parents: diff changeset	817
163892325845 Initial commit. galaxyp parents: diff changeset	818 #
163892325845 Initial commit. galaxyp parents: diff changeset	819 # Check for RockerBox formattted IDs.
163892325845 Initial commit. galaxyp parents: diff changeset	820 #
163892325845 Initial commit. galaxyp parents: diff changeset	821 # Example of RockerBox ID format, which may contain
163892325845 Initial commit. galaxyp parents: diff changeset	822 # * multiple IDs separated by semi-colons and
163892325845 Initial commit. galaxyp parents: diff changeset	823 # * suffixed with the position of the peptide in the protein:
163892325845 Initial commit. galaxyp parents: diff changeset	824 # YGR192C[123-142]; YJR009C[123-142]
163892325845 Initial commit. galaxyp parents: diff changeset	825 if ($key =~ m/^([^\[]+)\[\d+-\d+\]$/i) {
163892325845 Initial commit. galaxyp parents: diff changeset	826 $logger->trace('Protein ID in RockerBox format: ' . $key);
163892325845 Initial commit. galaxyp parents: diff changeset	827 $key = $1;
163892325845 Initial commit. galaxyp parents: diff changeset	828 $logger->trace("\t" . 'Using ID : ' . $key);
163892325845 Initial commit. galaxyp parents: diff changeset	829 }
163892325845 Initial commit. galaxyp parents: diff changeset	830
163892325845 Initial commit. galaxyp parents: diff changeset	831 #
163892325845 Initial commit. galaxyp parents: diff changeset	832 # Check for DB namespace prefixes.
163892325845 Initial commit. galaxyp parents: diff changeset	833 #
163892325845 Initial commit. galaxyp parents: diff changeset	834 if ($key =~ m/([^:]+):([^:]+)/i) {
163892325845 Initial commit. galaxyp parents: diff changeset	835
163892325845 Initial commit. galaxyp parents: diff changeset	836 #
163892325845 Initial commit. galaxyp parents: diff changeset	837 # Namespace prefixed ID.
163892325845 Initial commit. galaxyp parents: diff changeset	838 #
163892325845 Initial commit. galaxyp parents: diff changeset	839 $logger->trace('Namespace prefixed ID: ' . $key);
163892325845 Initial commit. galaxyp parents: diff changeset	840
163892325845 Initial commit. galaxyp parents: diff changeset	841 my $prefix = $1;
163892325845 Initial commit. galaxyp parents: diff changeset	842 $key = $2;
163892325845 Initial commit. galaxyp parents: diff changeset	843
163892325845 Initial commit. galaxyp parents: diff changeset	844 $logger->trace("\t" . 'Using ID: ' . $key);
163892325845 Initial commit. galaxyp parents: diff changeset	845
163892325845 Initial commit. galaxyp parents: diff changeset	846 #
163892325845 Initial commit. galaxyp parents: diff changeset	847 # For a selected list of "known" database namespace prefixes,
163892325845 Initial commit. galaxyp parents: diff changeset	848 # check for versioned accession numbers and remove the version number.
163892325845 Initial commit. galaxyp parents: diff changeset	849 #
163892325845 Initial commit. galaxyp parents: diff changeset	850 if (defined($prefix) && defined($databases_with_optionally_versioned_ids{$prefix})) {
163892325845 Initial commit. galaxyp parents: diff changeset	851 $logger->trace('Found prefix: ' . $prefix);
163892325845 Initial commit. galaxyp parents: diff changeset	852 my $separator = $databases_with_optionally_versioned_ids{$prefix};
163892325845 Initial commit. galaxyp parents: diff changeset	853 $separator = '\\' . $separator;
163892325845 Initial commit. galaxyp parents: diff changeset	854 if ($key =~ m/([^$separator]+)$separator\d+$/) {
163892325845 Initial commit. galaxyp parents: diff changeset	855 $key = $1;
163892325845 Initial commit. galaxyp parents: diff changeset	856 }
163892325845 Initial commit. galaxyp parents: diff changeset	857 }
163892325845 Initial commit. galaxyp parents: diff changeset	858 }
163892325845 Initial commit. galaxyp parents: diff changeset	859
163892325845 Initial commit. galaxyp parents: diff changeset	860 $logger->debug('Found ID: ' . $key);
163892325845 Initial commit. galaxyp parents: diff changeset	861
163892325845 Initial commit. galaxyp parents: diff changeset	862 #
163892325845 Initial commit. galaxyp parents: diff changeset	863 # Sanity check for Protein ID.
163892325845 Initial commit. galaxyp parents: diff changeset	864 #
163892325845 Initial commit. galaxyp parents: diff changeset	865 unless ($key =~ m/^([A-Z0-9\._-]+)$/i) {
163892325845 Initial commit. galaxyp parents: diff changeset	866 $logger->warn('Fragment file line in unexpected format. Cannot find sequence ID in:');
163892325845 Initial commit. galaxyp parents: diff changeset	867 $logger->warn("\t" . $line);
163892325845 Initial commit. galaxyp parents: diff changeset	868 $logger->debug('$key contains: ' . $key);
163892325845 Initial commit. galaxyp parents: diff changeset	869 next LINE;
163892325845 Initial commit. galaxyp parents: diff changeset	870 }
163892325845 Initial commit. galaxyp parents: diff changeset	871
163892325845 Initial commit. galaxyp parents: diff changeset	872 #
163892325845 Initial commit. galaxyp parents: diff changeset	873 # Lookup info for the protein this peptide was derived from.
163892325845 Initial commit. galaxyp parents: diff changeset	874 #
163892325845 Initial commit. galaxyp parents: diff changeset	875 ($protein, $acc, $id, $index) = _LookupProteinInfo($key, $peptide_nonmod, $db_lookup_table_refs);
163892325845 Initial commit. galaxyp parents: diff changeset	876
163892325845 Initial commit. galaxyp parents: diff changeset	877 #
163892325845 Initial commit. galaxyp parents: diff changeset	878 # Check if the peptide (fragment) was found in this protein sequence.
163892325845 Initial commit. galaxyp parents: diff changeset	879 #
163892325845 Initial commit. galaxyp parents: diff changeset	880 if (defined($protein)) {
163892325845 Initial commit. galaxyp parents: diff changeset	881 if ($index > -1) {
163892325845 Initial commit. galaxyp parents: diff changeset	882 $logger->trace('Found peptide sequence in protein ' . $key);
163892325845 Initial commit. galaxyp parents: diff changeset	883 last KEY;
163892325845 Initial commit. galaxyp parents: diff changeset	884 } else {
163892325845 Initial commit. galaxyp parents: diff changeset	885 $logger->warn('Cannot find peptide (' . $peptide_nonmod . ') in protein ' . $key);
163892325845 Initial commit. galaxyp parents: diff changeset	886 $logger->warn("\t" . 'Will try other protein IDs for this peptide if available.');
163892325845 Initial commit. galaxyp parents: diff changeset	887 }
163892325845 Initial commit. galaxyp parents: diff changeset	888 } else {
163892325845 Initial commit. galaxyp parents: diff changeset	889 $logger->warn('Cannot find protein identifier ' . $key . ' in any of the supplied databases.');
163892325845 Initial commit. galaxyp parents: diff changeset	890 $logger->warn("\t" . 'Will try other protein IDs for this peptide if available.');
163892325845 Initial commit. galaxyp parents: diff changeset	891 }
163892325845 Initial commit. galaxyp parents: diff changeset	892 }
163892325845 Initial commit. galaxyp parents: diff changeset	893
163892325845 Initial commit. galaxyp parents: diff changeset	894 #
163892325845 Initial commit. galaxyp parents: diff changeset	895 # Check if the peptide (fragment) was found in any of the associated protein sequences.
163892325845 Initial commit. galaxyp parents: diff changeset	896 #
163892325845 Initial commit. galaxyp parents: diff changeset	897 if (defined($protein)) {
163892325845 Initial commit. galaxyp parents: diff changeset	898
163892325845 Initial commit. galaxyp parents: diff changeset	899 $logger->debug('Found protein ' . $key);
163892325845 Initial commit. galaxyp parents: diff changeset	900
163892325845 Initial commit. galaxyp parents: diff changeset	901 if ($index > -1) {
163892325845 Initial commit. galaxyp parents: diff changeset	902
163892325845 Initial commit. galaxyp parents: diff changeset	903 $logger->debug('Found peptide (' . $peptide_nonmod . ') in protein ' . $key);
163892325845 Initial commit. galaxyp parents: diff changeset	904
163892325845 Initial commit. galaxyp parents: diff changeset	905 if (defined($cleavage_path_to)) {
163892325845 Initial commit. galaxyp parents: diff changeset	906 ($cleavage_peptide_contexts) = _GetCleavageSequenceContexts($peptide_nonmod, $protein, $key, $index,
163892325845 Initial commit. galaxyp parents: diff changeset	907 $cleavage_aa, $cleavage_terminus);
163892325845 Initial commit. galaxyp parents: diff changeset	908 }
163892325845 Initial commit. galaxyp parents: diff changeset	909 if (defined($miscleavage_path_to)) {
163892325845 Initial commit. galaxyp parents: diff changeset	910 ($miscleavage_peptide_contexts) = _GetMiscleavageSequenceContexts($peptide_nonmod, $protein, $key, $index,
163892325845 Initial commit. galaxyp parents: diff changeset	911 $cleavage_aa, $cleavage_terminus, $miscleavage_distance);
163892325845 Initial commit. galaxyp parents: diff changeset	912 }
163892325845 Initial commit. galaxyp parents: diff changeset	913 if (defined($modification_path_to)) {
163892325845 Initial commit. galaxyp parents: diff changeset	914 ($modification_peptide_contexts) = _GetModificationSequenceContexts($peptide_nonmod, $protein, $key, $index,
163892325845 Initial commit. galaxyp parents: diff changeset	915 $modified_aa, $peptide);
163892325845 Initial commit. galaxyp parents: diff changeset	916 }
163892325845 Initial commit. galaxyp parents: diff changeset	917 if (defined($peptide_path_to)) {
163892325845 Initial commit. galaxyp parents: diff changeset	918 ($peptide_contexts) = _GetPeptideSequenceContexts($peptide_nonmod, $protein, $key, $index);
163892325845 Initial commit. galaxyp parents: diff changeset	919 }
163892325845 Initial commit. galaxyp parents: diff changeset	920 } else {
163892325845 Initial commit. galaxyp parents: diff changeset	921 $logger->error('Cannot find peptide (' . $peptide_nonmod . ') in protein ' . $key);
163892325845 Initial commit. galaxyp parents: diff changeset	922 $acc = 'NA';
163892325845 Initial commit. galaxyp parents: diff changeset	923 }
163892325845 Initial commit. galaxyp parents: diff changeset	924 } else {
163892325845 Initial commit. galaxyp parents: diff changeset	925 $logger->error('Cannot find protein "' . $unfiltered_key . '" in any of the supplied databases.');
163892325845 Initial commit. galaxyp parents: diff changeset	926 $acc = 'NA';
163892325845 Initial commit. galaxyp parents: diff changeset	927 }
163892325845 Initial commit. galaxyp parents: diff changeset	928
163892325845 Initial commit. galaxyp parents: diff changeset	929 #
163892325845 Initial commit. galaxyp parents: diff changeset	930 # Append the peptide contexts to the existing lines of the fragment file and save that new line to the output file.
163892325845 Initial commit. galaxyp parents: diff changeset	931 #
163892325845 Initial commit. galaxyp parents: diff changeset	932 $extracted_seq_contexts_cle = _SaveSequenceContexts($line, $acc, $cleavage_peptide_contexts,
163892325845 Initial commit. galaxyp parents: diff changeset	933 $cleavage_path_to_fh, $extracted_seq_contexts_cle);
163892325845 Initial commit. galaxyp parents: diff changeset	934
163892325845 Initial commit. galaxyp parents: diff changeset	935 $extracted_seq_contexts_mis = _SaveSequenceContexts($line, $acc, $miscleavage_peptide_contexts,
163892325845 Initial commit. galaxyp parents: diff changeset	936 $miscleavage_path_to_fh, $extracted_seq_contexts_mis);
163892325845 Initial commit. galaxyp parents: diff changeset	937
163892325845 Initial commit. galaxyp parents: diff changeset	938 $extracted_seq_contexts_mod = _SaveSequenceContexts($line, $acc, $modification_peptide_contexts,
163892325845 Initial commit. galaxyp parents: diff changeset	939 $modification_path_to_fh, $extracted_seq_contexts_mod);
163892325845 Initial commit. galaxyp parents: diff changeset	940
163892325845 Initial commit. galaxyp parents: diff changeset	941 $extracted_seq_contexts_pep = _SaveSequenceContexts($line, $acc, $peptide_contexts,
163892325845 Initial commit. galaxyp parents: diff changeset	942 $peptide_path_to_fh, $extracted_seq_contexts_pep);
163892325845 Initial commit. galaxyp parents: diff changeset	943
163892325845 Initial commit. galaxyp parents: diff changeset	944 }
163892325845 Initial commit. galaxyp parents: diff changeset	945
163892325845 Initial commit. galaxyp parents: diff changeset	946 #
163892325845 Initial commit. galaxyp parents: diff changeset	947 # Close file handles.
163892325845 Initial commit. galaxyp parents: diff changeset	948 #
163892325845 Initial commit. galaxyp parents: diff changeset	949 foreach my $fh ($path_from_fh, $cleavage_path_to_fh, $miscleavage_path_to_fh, $modification_path_to_fh, $peptide_path_to_fh) {
163892325845 Initial commit. galaxyp parents: diff changeset	950 if (defined($fh)) {
163892325845 Initial commit. galaxyp parents: diff changeset	951 close($fh);
163892325845 Initial commit. galaxyp parents: diff changeset	952 }
163892325845 Initial commit. galaxyp parents: diff changeset	953 }
163892325845 Initial commit. galaxyp parents: diff changeset	954
163892325845 Initial commit. galaxyp parents: diff changeset	955 return($extracted_seq_contexts_cle, $extracted_seq_contexts_mis, $extracted_seq_contexts_mod, $extracted_seq_contexts_pep);
163892325845 Initial commit. galaxyp parents: diff changeset	956
163892325845 Initial commit. galaxyp parents: diff changeset	957 }
163892325845 Initial commit. galaxyp parents: diff changeset	958
163892325845 Initial commit. galaxyp parents: diff changeset	959 sub _LookupProteinInfo {
163892325845 Initial commit. galaxyp parents: diff changeset	960
163892325845 Initial commit. galaxyp parents: diff changeset	961 my ($key, $peptide, $db_lookup_table_refs) = @_;
163892325845 Initial commit. galaxyp parents: diff changeset	962
163892325845 Initial commit. galaxyp parents: diff changeset	963 my $protein_found_in_db = 0;
163892325845 Initial commit. galaxyp parents: diff changeset	964 my $protein;
163892325845 Initial commit. galaxyp parents: diff changeset	965 my $acc;
163892325845 Initial commit. galaxyp parents: diff changeset	966 my $id;
163892325845 Initial commit. galaxyp parents: diff changeset	967 my $index;
163892325845 Initial commit. galaxyp parents: diff changeset	968
163892325845 Initial commit. galaxyp parents: diff changeset	969 DB_LOOKUP:foreach my $db_lookup_table_ref (@{$db_lookup_table_refs}) {
163892325845 Initial commit. galaxyp parents: diff changeset	970
163892325845 Initial commit. galaxyp parents: diff changeset	971 if (defined(${$db_lookup_table_ref}{$key})) {
163892325845 Initial commit. galaxyp parents: diff changeset	972
163892325845 Initial commit. galaxyp parents: diff changeset	973 $protein_found_in_db = 1;
163892325845 Initial commit. galaxyp parents: diff changeset	974 $logger->debug('Found protein ' . $key . ' in this sequence DB.');
163892325845 Initial commit. galaxyp parents: diff changeset	975
163892325845 Initial commit. galaxyp parents: diff changeset	976 $protein = ${$db_lookup_table_ref}{$key}{'SQ'};
163892325845 Initial commit. galaxyp parents: diff changeset	977 $acc = ${$db_lookup_table_ref}{$key}{'AC'};
163892325845 Initial commit. galaxyp parents: diff changeset	978 $id = ${$db_lookup_table_ref}{$key}{'ID'};
163892325845 Initial commit. galaxyp parents: diff changeset	979 #
163892325845 Initial commit. galaxyp parents: diff changeset	980 # Find peptide (fragment) in DB protein sequence.
163892325845 Initial commit. galaxyp parents: diff changeset	981 #
163892325845 Initial commit. galaxyp parents: diff changeset	982 $index = index($protein, $peptide);
163892325845 Initial commit. galaxyp parents: diff changeset	983
163892325845 Initial commit. galaxyp parents: diff changeset	984 if ($index > -1) {
163892325845 Initial commit. galaxyp parents: diff changeset	985
163892325845 Initial commit. galaxyp parents: diff changeset	986 #
163892325845 Initial commit. galaxyp parents: diff changeset	987 # Found peptide!
163892325845 Initial commit. galaxyp parents: diff changeset	988 #
163892325845 Initial commit. galaxyp parents: diff changeset	989 $logger->debug('Found peptide ' . $peptide . ' at position ' . $index . ' in protein ' . $key);
163892325845 Initial commit. galaxyp parents: diff changeset	990 last DB_LOOKUP;
163892325845 Initial commit. galaxyp parents: diff changeset	991
163892325845 Initial commit. galaxyp parents: diff changeset	992 }
163892325845 Initial commit. galaxyp parents: diff changeset	993
163892325845 Initial commit. galaxyp parents: diff changeset	994 #
163892325845 Initial commit. galaxyp parents: diff changeset	995 # If the input was based on mass spectrometry data we cannot see
163892325845 Initial commit. galaxyp parents: diff changeset	996 # the difference between I (Isolucene) and L (Leucine) as the both
163892325845 Initial commit. galaxyp parents: diff changeset	997 # weigh the same. Let's try an I agnostic search by substituting all
163892325845 Initial commit. galaxyp parents: diff changeset	998 # Is for Ls.
163892325845 Initial commit. galaxyp parents: diff changeset	999 #
163892325845 Initial commit. galaxyp parents: diff changeset	1000 my $protein_l_only = $protein;
163892325845 Initial commit. galaxyp parents: diff changeset	1001 my $peptide_l_only = $peptide;
163892325845 Initial commit. galaxyp parents: diff changeset	1002 $protein_l_only =~ s/I/L/g;
163892325845 Initial commit. galaxyp parents: diff changeset	1003 $peptide_l_only =~ s/I/L/g;
163892325845 Initial commit. galaxyp parents: diff changeset	1004 $index = index($protein_l_only, $peptide_l_only);
163892325845 Initial commit. galaxyp parents: diff changeset	1005
163892325845 Initial commit. galaxyp parents: diff changeset	1006 if ($index > -1) {
163892325845 Initial commit. galaxyp parents: diff changeset	1007
163892325845 Initial commit. galaxyp parents: diff changeset	1008 #
163892325845 Initial commit. galaxyp parents: diff changeset	1009 # Found peptide!
163892325845 Initial commit. galaxyp parents: diff changeset	1010 #
163892325845 Initial commit. galaxyp parents: diff changeset	1011 $logger->debug('Found peptide ' . $peptide . ' at position ' . $index . ' in protein ' . $key);
163892325845 Initial commit. galaxyp parents: diff changeset	1012 $logger->debug("\t" . 'Had to treat I and L as the same amino acid in the DB search to get a match.');
163892325845 Initial commit. galaxyp parents: diff changeset	1013 last DB_LOOKUP;
163892325845 Initial commit. galaxyp parents: diff changeset	1014
163892325845 Initial commit. galaxyp parents: diff changeset	1015 }
163892325845 Initial commit. galaxyp parents: diff changeset	1016
163892325845 Initial commit. galaxyp parents: diff changeset	1017 #
163892325845 Initial commit. galaxyp parents: diff changeset	1018 # Cannot find peptide :(
163892325845 Initial commit. galaxyp parents: diff changeset	1019 #
163892325845 Initial commit. galaxyp parents: diff changeset	1020 $protein = undef;
163892325845 Initial commit. galaxyp parents: diff changeset	1021 $acc = undef;
163892325845 Initial commit. galaxyp parents: diff changeset	1022 $id = undef;
163892325845 Initial commit. galaxyp parents: diff changeset	1023
163892325845 Initial commit. galaxyp parents: diff changeset	1024 $logger->debug('Cannot find peptide (' . $peptide . ') in protein ' . $key);
163892325845 Initial commit. galaxyp parents: diff changeset	1025 $logger->debug("\t" . 'This may be the result of an updated protein sequence in the database.');
163892325845 Initial commit. galaxyp parents: diff changeset	1026 $logger->debug("\t" . 'Will try to find the peptide in other databases (if available).');
163892325845 Initial commit. galaxyp parents: diff changeset	1027
163892325845 Initial commit. galaxyp parents: diff changeset	1028 if ($strip_lc) {
163892325845 Initial commit. galaxyp parents: diff changeset	1029 $logger->debug("\t" . '(Stripping of lowercase characters, indicating modifications in the peptide sequence, was enabled.)');
163892325845 Initial commit. galaxyp parents: diff changeset	1030 } else {
163892325845 Initial commit. galaxyp parents: diff changeset	1031 $logger->debug("\t" . '(Stripping of lowercase characters, indicating modifications in the peptide sequence, was not enabled.)');
163892325845 Initial commit. galaxyp parents: diff changeset	1032 }
163892325845 Initial commit. galaxyp parents: diff changeset	1033
163892325845 Initial commit. galaxyp parents: diff changeset	1034 } else {
163892325845 Initial commit. galaxyp parents: diff changeset	1035 $logger->debug('Protein ' . $key . ' missing from this sequence DB. Will try other databases (if available).');
163892325845 Initial commit. galaxyp parents: diff changeset	1036 }
163892325845 Initial commit. galaxyp parents: diff changeset	1037 }
163892325845 Initial commit. galaxyp parents: diff changeset	1038
163892325845 Initial commit. galaxyp parents: diff changeset	1039 #
163892325845 Initial commit. galaxyp parents: diff changeset	1040 # Check if the peptide (fragment) was found in a DB protein sequence.
163892325845 Initial commit. galaxyp parents: diff changeset	1041 #
163892325845 Initial commit. galaxyp parents: diff changeset	1042 if ($protein_found_in_db == 1) {
163892325845 Initial commit. galaxyp parents: diff changeset	1043 unless ($index > -1) {
163892325845 Initial commit. galaxyp parents: diff changeset	1044 $logger->warn('Cannot find peptide (' . $peptide . ') in protein ' . $key);
163892325845 Initial commit. galaxyp parents: diff changeset	1045 $logger->warn("\t" . 'Will try to find this peptide (' . $peptide . ') using other associated protein identifiers (if available).');
163892325845 Initial commit. galaxyp parents: diff changeset	1046 }
163892325845 Initial commit. galaxyp parents: diff changeset	1047 } else {
163892325845 Initial commit. galaxyp parents: diff changeset	1048 $logger->warn('Cannot find protein ' . $key . ' in any of the supplied databases.');
163892325845 Initial commit. galaxyp parents: diff changeset	1049 }
163892325845 Initial commit. galaxyp parents: diff changeset	1050
163892325845 Initial commit. galaxyp parents: diff changeset	1051 return($protein, $acc, $id, $index);
163892325845 Initial commit. galaxyp parents: diff changeset	1052 }
163892325845 Initial commit. galaxyp parents: diff changeset	1053
163892325845 Initial commit. galaxyp parents: diff changeset	1054 sub _GetCleavageSequenceContexts {
163892325845 Initial commit. galaxyp parents: diff changeset	1055
163892325845 Initial commit. galaxyp parents: diff changeset	1056 my ($peptide, $protein, $key, $index, $cleavage_aa, $cleavage_terminus) = @_;
163892325845 Initial commit. galaxyp parents: diff changeset	1057
163892325845 Initial commit. galaxyp parents: diff changeset	1058 my %peptide_contexts;
163892325845 Initial commit. galaxyp parents: diff changeset	1059 $peptide_contexts{'n_term'}{'context'} = '';
163892325845 Initial commit. galaxyp parents: diff changeset	1060 $peptide_contexts{'c_term'}{'context'} = '';
163892325845 Initial commit. galaxyp parents: diff changeset	1061 my $max_context_length = $n_term_context_length + $c_term_context_length;
163892325845 Initial commit. galaxyp parents: diff changeset	1062
163892325845 Initial commit. galaxyp parents: diff changeset	1063 foreach my $peptide_term (keys(%peptide_contexts)) {
163892325845 Initial commit. galaxyp parents: diff changeset	1064
163892325845 Initial commit. galaxyp parents: diff changeset	1065 #
163892325845 Initial commit. galaxyp parents: diff changeset	1066 # Get the offset for the sequence contexts to retrieve from this peptide terminus
163892325845 Initial commit. galaxyp parents: diff changeset	1067 # or drop the context for this peptide terminus if it is also the protein terminus.
163892325845 Initial commit. galaxyp parents: diff changeset	1068 #
163892325845 Initial commit. galaxyp parents: diff changeset	1069 if ($peptide_term eq 'n_term') {
163892325845 Initial commit. galaxyp parents: diff changeset	1070 if ($index > 0) {
163892325845 Initial commit. galaxyp parents: diff changeset	1071 # This is an N-terminal peptide context.
163892325845 Initial commit. galaxyp parents: diff changeset	1072 my $offset = $index - $n_term_context_length;
163892325845 Initial commit. galaxyp parents: diff changeset	1073 $peptide_contexts{$peptide_term}{'offset'} = $offset;
163892325845 Initial commit. galaxyp parents: diff changeset	1074 $logger->debug('Offset = ' . $offset . ' for ' . $peptide_term . ' of peptide ' . $peptide . ' in protein ' . $key . '.');
163892325845 Initial commit. galaxyp parents: diff changeset	1075 } else {
163892325845 Initial commit. galaxyp parents: diff changeset	1076 # Peptide N-terminus == Protein N-terminus.
163892325845 Initial commit. galaxyp parents: diff changeset	1077 $logger->warn('Peptide N-terminus = protein N-terminus. Dropping N-terminal sequence context for peptide ' . $peptide .
163892325845 Initial commit. galaxyp parents: diff changeset	1078 ' in protein ' . $key . '.');
163892325845 Initial commit. galaxyp parents: diff changeset	1079 next;
163892325845 Initial commit. galaxyp parents: diff changeset	1080 }
163892325845 Initial commit. galaxyp parents: diff changeset	1081 } elsif ($peptide_term eq 'c_term') {
163892325845 Initial commit. galaxyp parents: diff changeset	1082 if (($index + length($peptide)) < length($protein)) {
163892325845 Initial commit. galaxyp parents: diff changeset	1083 # This is a C-terminal peptide context.
163892325845 Initial commit. galaxyp parents: diff changeset	1084 my $offset = ($index + length($peptide))- $n_term_context_length;
163892325845 Initial commit. galaxyp parents: diff changeset	1085 $peptide_contexts{$peptide_term}{'offset'} = $offset;
163892325845 Initial commit. galaxyp parents: diff changeset	1086 $logger->debug('Offset = ' . $offset . ' for ' . $peptide_term . ' of peptide ' . $peptide . ' in protein ' . $key . '.');
163892325845 Initial commit. galaxyp parents: diff changeset	1087 } else {
163892325845 Initial commit. galaxyp parents: diff changeset	1088 # Peptide C-terminus == Protein C-terminus.
163892325845 Initial commit. galaxyp parents: diff changeset	1089 $logger->warn('Peptide C-terminus = protein C-terminus. Dropping C-terminal sequence context for peptide ' . $peptide .
163892325845 Initial commit. galaxyp parents: diff changeset	1090 ' in protein ' . $key . '.');
163892325845 Initial commit. galaxyp parents: diff changeset	1091 next;
163892325845 Initial commit. galaxyp parents: diff changeset	1092 }
163892325845 Initial commit. galaxyp parents: diff changeset	1093 }
163892325845 Initial commit. galaxyp parents: diff changeset	1094
163892325845 Initial commit. galaxyp parents: diff changeset	1095 #
163892325845 Initial commit. galaxyp parents: diff changeset	1096 # Check if DB protein sequence is long enough on n-terminal side
163892325845 Initial commit. galaxyp parents: diff changeset	1097 # to retrieve full length peptide contexts.
163892325845 Initial commit. galaxyp parents: diff changeset	1098 # (c-term length is checked later...)
163892325845 Initial commit. galaxyp parents: diff changeset	1099 #
163892325845 Initial commit. galaxyp parents: diff changeset	1100 if ($peptide_contexts{$peptide_term}{'offset'} < 0) {
163892325845 Initial commit. galaxyp parents: diff changeset	1101
163892325845 Initial commit. galaxyp parents: diff changeset	1102 #
163892325845 Initial commit. galaxyp parents: diff changeset	1103 # Protein too short for full length context: Calculate offset adjustment.
163892325845 Initial commit. galaxyp parents: diff changeset	1104 #
163892325845 Initial commit. galaxyp parents: diff changeset	1105 my $offset = $peptide_contexts{$peptide_term}{'offset'};
163892325845 Initial commit. galaxyp parents: diff changeset	1106 my $offset_adjustment = -($offset);
163892325845 Initial commit. galaxyp parents: diff changeset	1107 $peptide_contexts{$peptide_term}{'offset_adjustment'} = $offset_adjustment;
163892325845 Initial commit. galaxyp parents: diff changeset	1108 $logger->debug('Offset ' . $offset . ' for ' . $peptide_term . ' of peptide ' . $peptide . ' in protein ' . $key . ' less than zero');
163892325845 Initial commit. galaxyp parents: diff changeset	1109
163892325845 Initial commit. galaxyp parents: diff changeset	1110 if (length($padding_character) == 1) {
163892325845 Initial commit. galaxyp parents: diff changeset	1111
163892325845 Initial commit. galaxyp parents: diff changeset	1112 #
163892325845 Initial commit. galaxyp parents: diff changeset	1113 # Add N-terminal padding characters for contexts that are too short on the N-terminal side.
163892325845 Initial commit. galaxyp parents: diff changeset	1114 #
163892325845 Initial commit. galaxyp parents: diff changeset	1115 my $n_term_padding = "$padding_character" x $offset_adjustment;
163892325845 Initial commit. galaxyp parents: diff changeset	1116 $peptide_contexts{$peptide_term}{'context'} = $n_term_padding;
163892325845 Initial commit. galaxyp parents: diff changeset	1117 $logger->debug("\t" . 'Will add N-terminal padding: ' . $n_term_padding);
163892325845 Initial commit. galaxyp parents: diff changeset	1118
163892325845 Initial commit. galaxyp parents: diff changeset	1119 } else {
163892325845 Initial commit. galaxyp parents: diff changeset	1120 $logger->debug('Padding was disabled. Will not add n-term padding.');
163892325845 Initial commit. galaxyp parents: diff changeset	1121 }
163892325845 Initial commit. galaxyp parents: diff changeset	1122
163892325845 Initial commit. galaxyp parents: diff changeset	1123 $peptide_contexts{$peptide_term}{'offset'} = 0;
163892325845 Initial commit. galaxyp parents: diff changeset	1124
163892325845 Initial commit. galaxyp parents: diff changeset	1125 } else {
163892325845 Initial commit. galaxyp parents: diff changeset	1126
163892325845 Initial commit. galaxyp parents: diff changeset	1127 $peptide_contexts{$peptide_term}{'offset_adjustment'} = 0;
163892325845 Initial commit. galaxyp parents: diff changeset	1128
163892325845 Initial commit. galaxyp parents: diff changeset	1129 }
163892325845 Initial commit. galaxyp parents: diff changeset	1130
163892325845 Initial commit. galaxyp parents: diff changeset	1131 #
163892325845 Initial commit. galaxyp parents: diff changeset	1132 # Extract sequence context from DB protein sequence.
163892325845 Initial commit. galaxyp parents: diff changeset	1133 #
163892325845 Initial commit. galaxyp parents: diff changeset	1134 my $peptide_context = substr($protein, $peptide_contexts{$peptide_term}{'offset'},
163892325845 Initial commit. galaxyp parents: diff changeset	1135 $max_context_length - $peptide_contexts{$peptide_term}{'offset_adjustment'});
163892325845 Initial commit. galaxyp parents: diff changeset	1136 # append the context extracted from the protein sequence,
163892325845 Initial commit. galaxyp parents: diff changeset	1137 # because there may be already some N-terminal padding...
163892325845 Initial commit. galaxyp parents: diff changeset	1138 $peptide_contexts{$peptide_term}{'context'} .= $peptide_context;
163892325845 Initial commit. galaxyp parents: diff changeset	1139
163892325845 Initial commit. galaxyp parents: diff changeset	1140 #
163892325845 Initial commit. galaxyp parents: diff changeset	1141 # Quality Control: Check enzyme specificity for enzymes with known (= user supplied) specs.
163892325845 Initial commit. galaxyp parents: diff changeset	1142 #
163892325845 Initial commit. galaxyp parents: diff changeset	1143 unless ($cleavage_aa eq '*') {
163892325845 Initial commit. galaxyp parents: diff changeset	1144
163892325845 Initial commit. galaxyp parents: diff changeset	1145 $peptide_context = $peptide_contexts{$peptide_term}{'context'};
163892325845 Initial commit. galaxyp parents: diff changeset	1146 my $p1 = substr($peptide_context, ($n_term_context_length -1), 1);
163892325845 Initial commit. galaxyp parents: diff changeset	1147 my $p1_prime = substr($peptide_context, $n_term_context_length, 1);
163892325845 Initial commit. galaxyp parents: diff changeset	1148
163892325845 Initial commit. galaxyp parents: diff changeset	1149 if ($cleavage_terminus eq 'C') {
163892325845 Initial commit. galaxyp parents: diff changeset	1150
163892325845 Initial commit. galaxyp parents: diff changeset	1151 if ($p1 eq $cleavage_aa) {
163892325845 Initial commit. galaxyp parents: diff changeset	1152 # Peptide fragment contains the AA recognised by the protease.
163892325845 Initial commit. galaxyp parents: diff changeset	1153 $logger->debug('Specific protease activity -> Retaining peptide context.');
163892325845 Initial commit. galaxyp parents: diff changeset	1154 } else {
163892325845 Initial commit. galaxyp parents: diff changeset	1155 # Must be non-specific cleavage -> drop peptide context.
163892325845 Initial commit. galaxyp parents: diff changeset	1156 $logger->warn('Non-specific cleavage: dropping ' . $peptide_term . ' peptide context ' . $peptide_context .
163892325845 Initial commit. galaxyp parents: diff changeset	1157 ' for peptide ' . $peptide . ' in protein ' . $key);
163892325845 Initial commit. galaxyp parents: diff changeset	1158 $peptide_contexts{$peptide_term}{'context'} = '';
163892325845 Initial commit. galaxyp parents: diff changeset	1159 }
163892325845 Initial commit. galaxyp parents: diff changeset	1160
163892325845 Initial commit. galaxyp parents: diff changeset	1161 } elsif ($cleavage_terminus eq 'N') {
163892325845 Initial commit. galaxyp parents: diff changeset	1162
163892325845 Initial commit. galaxyp parents: diff changeset	1163 if ($p1_prime eq $cleavage_aa) {
163892325845 Initial commit. galaxyp parents: diff changeset	1164 # Peptide fragment contains the AA recognised by the protease.
163892325845 Initial commit. galaxyp parents: diff changeset	1165 $logger->debug('Specific protease activity -> Retaining peptide context.');
163892325845 Initial commit. galaxyp parents: diff changeset	1166 } else {
163892325845 Initial commit. galaxyp parents: diff changeset	1167 # Must be non-specific cleavage -> drop peptide context.
163892325845 Initial commit. galaxyp parents: diff changeset	1168 $logger->warn('Non-specific cleavage: dropping ' . $peptide_term . ' peptide context ' . $peptide_context .
163892325845 Initial commit. galaxyp parents: diff changeset	1169 ' for peptide ' . $peptide . ' in protein ' . $key);
163892325845 Initial commit. galaxyp parents: diff changeset	1170 $peptide_contexts{$peptide_term}{'context'} = '';
163892325845 Initial commit. galaxyp parents: diff changeset	1171 }
163892325845 Initial commit. galaxyp parents: diff changeset	1172
163892325845 Initial commit. galaxyp parents: diff changeset	1173 } elsif ($cleavage_terminus eq '*') {
163892325845 Initial commit. galaxyp parents: diff changeset	1174
163892325845 Initial commit. galaxyp parents: diff changeset	1175 if (($p1 eq $cleavage_aa) \|\| ($p1_prime eq $cleavage_aa)) {
163892325845 Initial commit. galaxyp parents: diff changeset	1176 # Peptide fragment contains the AA recognised by the protease.
163892325845 Initial commit. galaxyp parents: diff changeset	1177 $logger->debug('Specific protease activity -> Retaining peptide context.');
163892325845 Initial commit. galaxyp parents: diff changeset	1178 } else {
163892325845 Initial commit. galaxyp parents: diff changeset	1179 # Must be non-specific cleavage -> drop peptide context.
163892325845 Initial commit. galaxyp parents: diff changeset	1180 $logger->warn('Non-specific cleavage: dropping ' . $peptide_term . ' peptide context ' . $peptide_context .
163892325845 Initial commit. galaxyp parents: diff changeset	1181 ' for peptide ' . $peptide . ' in protein ' . $key);
163892325845 Initial commit. galaxyp parents: diff changeset	1182 $peptide_contexts{$peptide_term}{'context'} = '';
163892325845 Initial commit. galaxyp parents: diff changeset	1183 }
163892325845 Initial commit. galaxyp parents: diff changeset	1184
163892325845 Initial commit. galaxyp parents: diff changeset	1185 } else {
163892325845 Initial commit. galaxyp parents: diff changeset	1186 $logger->fatal('Unknown cleavage terminus ' . $cleavage_terminus . '. Must be C, N or *.');
163892325845 Initial commit. galaxyp parents: diff changeset	1187 exit;
163892325845 Initial commit. galaxyp parents: diff changeset	1188 }
163892325845 Initial commit. galaxyp parents: diff changeset	1189 }
163892325845 Initial commit. galaxyp parents: diff changeset	1190 }
163892325845 Initial commit. galaxyp parents: diff changeset	1191
163892325845 Initial commit. galaxyp parents: diff changeset	1192 push(my @contexts, $peptide_contexts{'n_term'}{'context'}, $peptide_contexts{'c_term'}{'context'});
163892325845 Initial commit. galaxyp parents: diff changeset	1193
163892325845 Initial commit. galaxyp parents: diff changeset	1194 if (length($padding_character) == 1) {
163892325845 Initial commit. galaxyp parents: diff changeset	1195 _CheckSequenceContextLength(\@contexts, $peptide, $key, $max_context_length);
163892325845 Initial commit. galaxyp parents: diff changeset	1196 } else {
163892325845 Initial commit. galaxyp parents: diff changeset	1197 $logger->debug('Padding was disabled. Will not add c-term padding nor check sequence context length.');
163892325845 Initial commit. galaxyp parents: diff changeset	1198 }
163892325845 Initial commit. galaxyp parents: diff changeset	1199
163892325845 Initial commit. galaxyp parents: diff changeset	1200 return(\@contexts);
163892325845 Initial commit. galaxyp parents: diff changeset	1201
163892325845 Initial commit. galaxyp parents: diff changeset	1202 }
163892325845 Initial commit. galaxyp parents: diff changeset	1203
163892325845 Initial commit. galaxyp parents: diff changeset	1204 sub _GetMiscleavageSequenceContexts {
163892325845 Initial commit. galaxyp parents: diff changeset	1205
163892325845 Initial commit. galaxyp parents: diff changeset	1206 my ($peptide, $protein, $key, $petide_index, $cleavage_aa, $cleavage_terminus, $miscleavage_distance) = @_;
163892325845 Initial commit. galaxyp parents: diff changeset	1207
163892325845 Initial commit. galaxyp parents: diff changeset	1208 my @peptide_contexts;
163892325845 Initial commit. galaxyp parents: diff changeset	1209 my $max_context_length = $n_term_context_length + $c_term_context_length;
163892325845 Initial commit. galaxyp parents: diff changeset	1210
163892325845 Initial commit. galaxyp parents: diff changeset	1211 #
163892325845 Initial commit. galaxyp parents: diff changeset	1212 # Find miscleavage sites in peptide.
163892325845 Initial commit. galaxyp parents: diff changeset	1213 #
163892325845 Initial commit. galaxyp parents: diff changeset	1214 # TODO: handle --mcd
163892325845 Initial commit. galaxyp parents: diff changeset	1215 #
163892325845 Initial commit. galaxyp parents: diff changeset	1216 my @amino_acids = split('', $peptide);
163892325845 Initial commit. galaxyp parents: diff changeset	1217 my $aa_index = 0;
163892325845 Initial commit. galaxyp parents: diff changeset	1218 #
163892325845 Initial commit. galaxyp parents: diff changeset	1219 # Drop first and last amino acids as these are from cleavage sites or from the protein termini.
163892325845 Initial commit. galaxyp parents: diff changeset	1220 # In either case they cannot represent a miscleavage site.
163892325845 Initial commit. galaxyp parents: diff changeset	1221 #
163892325845 Initial commit. galaxyp parents: diff changeset	1222 shift(@amino_acids);
163892325845 Initial commit. galaxyp parents: diff changeset	1223 pop(@amino_acids);
163892325845 Initial commit. galaxyp parents: diff changeset	1224
163892325845 Initial commit. galaxyp parents: diff changeset	1225 foreach my $aa (@amino_acids) {
163892325845 Initial commit. galaxyp parents: diff changeset	1226
163892325845 Initial commit. galaxyp parents: diff changeset	1227 $aa_index++;
163892325845 Initial commit. galaxyp parents: diff changeset	1228
163892325845 Initial commit. galaxyp parents: diff changeset	1229 if ($aa eq $cleavage_aa) {
163892325845 Initial commit. galaxyp parents: diff changeset	1230
163892325845 Initial commit. galaxyp parents: diff changeset	1231 #
163892325845 Initial commit. galaxyp parents: diff changeset	1232 # We found a miscleavage site.
163892325845 Initial commit. galaxyp parents: diff changeset	1233 #
163892325845 Initial commit. galaxyp parents: diff changeset	1234 $logger->debug('Found miscleavage site in peptide ' . $peptide . ' of protein ' . $key . '.');
163892325845 Initial commit. galaxyp parents: diff changeset	1235 #
163892325845 Initial commit. galaxyp parents: diff changeset	1236 # Get the offset for the sequence context.
163892325845 Initial commit. galaxyp parents: diff changeset	1237 #
163892325845 Initial commit. galaxyp parents: diff changeset	1238 my $offset = $petide_index + $aa_index - $n_term_context_length;
163892325845 Initial commit. galaxyp parents: diff changeset	1239 $logger->debug("\t" . 'Offset = ' . $offset . '.');
163892325845 Initial commit. galaxyp parents: diff changeset	1240 my $n_term_padding = '';
163892325845 Initial commit. galaxyp parents: diff changeset	1241 my $offset_adjustment = 0;
163892325845 Initial commit. galaxyp parents: diff changeset	1242
163892325845 Initial commit. galaxyp parents: diff changeset	1243 if ($cleavage_terminus eq 'C') {
163892325845 Initial commit. galaxyp parents: diff changeset	1244
163892325845 Initial commit. galaxyp parents: diff changeset	1245 #
163892325845 Initial commit. galaxyp parents: diff changeset	1246 # Must increment the offset with 1 if the protease cuts C-terminal of a recognition site.
163892325845 Initial commit. galaxyp parents: diff changeset	1247 #
163892325845 Initial commit. galaxyp parents: diff changeset	1248 $offset++;
163892325845 Initial commit. galaxyp parents: diff changeset	1249
163892325845 Initial commit. galaxyp parents: diff changeset	1250 }
163892325845 Initial commit. galaxyp parents: diff changeset	1251
163892325845 Initial commit. galaxyp parents: diff changeset	1252 #
163892325845 Initial commit. galaxyp parents: diff changeset	1253 # Check if DB protein sequence is long enough on n-terminal side
163892325845 Initial commit. galaxyp parents: diff changeset	1254 # to retrieve full length peptide contexts.
163892325845 Initial commit. galaxyp parents: diff changeset	1255 # (c-term length is checked elsewhere.)
163892325845 Initial commit. galaxyp parents: diff changeset	1256 #
163892325845 Initial commit. galaxyp parents: diff changeset	1257 if ($offset < 0) {
163892325845 Initial commit. galaxyp parents: diff changeset	1258
163892325845 Initial commit. galaxyp parents: diff changeset	1259 #
163892325845 Initial commit. galaxyp parents: diff changeset	1260 # Protein too short for full length context: Calculate offset adjustment.
163892325845 Initial commit. galaxyp parents: diff changeset	1261 #
163892325845 Initial commit. galaxyp parents: diff changeset	1262 $offset_adjustment = -($offset);
163892325845 Initial commit. galaxyp parents: diff changeset	1263 $logger->debug('Offset ' . $offset . ' for miscleavage context in peptide ' . $peptide . ' in protein ' . $key . ' less than zero');
163892325845 Initial commit. galaxyp parents: diff changeset	1264
163892325845 Initial commit. galaxyp parents: diff changeset	1265 if (length($padding_character) == 1) {
163892325845 Initial commit. galaxyp parents: diff changeset	1266
163892325845 Initial commit. galaxyp parents: diff changeset	1267 #
163892325845 Initial commit. galaxyp parents: diff changeset	1268 # Add N-terminal padding characters for contexts that are too short on the N-terminal side.
163892325845 Initial commit. galaxyp parents: diff changeset	1269 #
163892325845 Initial commit. galaxyp parents: diff changeset	1270 $n_term_padding = "$padding_character" x $offset_adjustment;
163892325845 Initial commit. galaxyp parents: diff changeset	1271 $logger->debug("\t" . 'Will add N-terminal padding: ' . $n_term_padding);
163892325845 Initial commit. galaxyp parents: diff changeset	1272
163892325845 Initial commit. galaxyp parents: diff changeset	1273 } else {
163892325845 Initial commit. galaxyp parents: diff changeset	1274 $logger->debug('Padding was disabled. Will not add n-term padding.');
163892325845 Initial commit. galaxyp parents: diff changeset	1275 }
163892325845 Initial commit. galaxyp parents: diff changeset	1276
163892325845 Initial commit. galaxyp parents: diff changeset	1277 #
163892325845 Initial commit. galaxyp parents: diff changeset	1278 # Reset offset for this context to protein start.
163892325845 Initial commit. galaxyp parents: diff changeset	1279 #
163892325845 Initial commit. galaxyp parents: diff changeset	1280 $offset = 0;
163892325845 Initial commit. galaxyp parents: diff changeset	1281
163892325845 Initial commit. galaxyp parents: diff changeset	1282 }
163892325845 Initial commit. galaxyp parents: diff changeset	1283
163892325845 Initial commit. galaxyp parents: diff changeset	1284 #
163892325845 Initial commit. galaxyp parents: diff changeset	1285 # Extract sequence context from DB protein sequence.
163892325845 Initial commit. galaxyp parents: diff changeset	1286 #
163892325845 Initial commit. galaxyp parents: diff changeset	1287 my $peptide_context = substr($protein, $offset, $max_context_length - $offset_adjustment);
163892325845 Initial commit. galaxyp parents: diff changeset	1288 $peptide_context = $n_term_padding . $peptide_context;
163892325845 Initial commit. galaxyp parents: diff changeset	1289 $logger->debug('Found miscleavage context: ' . $peptide_context . '.');
163892325845 Initial commit. galaxyp parents: diff changeset	1290
163892325845 Initial commit. galaxyp parents: diff changeset	1291 push(@peptide_contexts, $peptide_context);
163892325845 Initial commit. galaxyp parents: diff changeset	1292 }
163892325845 Initial commit. galaxyp parents: diff changeset	1293 }
163892325845 Initial commit. galaxyp parents: diff changeset	1294
163892325845 Initial commit. galaxyp parents: diff changeset	1295 if (length($padding_character) == 1) {
163892325845 Initial commit. galaxyp parents: diff changeset	1296 _CheckSequenceContextLength(\@peptide_contexts, $peptide, $key, $max_context_length);
163892325845 Initial commit. galaxyp parents: diff changeset	1297 } else {
163892325845 Initial commit. galaxyp parents: diff changeset	1298 $logger->debug('Padding was disabled. Will not add c-term padding nor check sequence context length.');
163892325845 Initial commit. galaxyp parents: diff changeset	1299 }
163892325845 Initial commit. galaxyp parents: diff changeset	1300
163892325845 Initial commit. galaxyp parents: diff changeset	1301 return(\@peptide_contexts);
163892325845 Initial commit. galaxyp parents: diff changeset	1302
163892325845 Initial commit. galaxyp parents: diff changeset	1303 }
163892325845 Initial commit. galaxyp parents: diff changeset	1304
163892325845 Initial commit. galaxyp parents: diff changeset	1305 sub _GetModificationSequenceContexts {
163892325845 Initial commit. galaxyp parents: diff changeset	1306
163892325845 Initial commit. galaxyp parents: diff changeset	1307 my ($peptide, $protein, $key, $petide_index, $modified_aa, $modified_peptide) = @_;
163892325845 Initial commit. galaxyp parents: diff changeset	1308
163892325845 Initial commit. galaxyp parents: diff changeset	1309 my @peptide_contexts;
163892325845 Initial commit. galaxyp parents: diff changeset	1310 my $modified_aa_index_adjustment;
163892325845 Initial commit. galaxyp parents: diff changeset	1311 my @modified_aa_in_peptide_indices;
163892325845 Initial commit. galaxyp parents: diff changeset	1312 my $modified_aa_in_protein_index;
163892325845 Initial commit. galaxyp parents: diff changeset	1313 my $max_context_length = $n_term_context_length + 1 + $c_term_context_length;
163892325845 Initial commit. galaxyp parents: diff changeset	1314 #my $offset = 0;
163892325845 Initial commit. galaxyp parents: diff changeset	1315 my $modified_aa_in_modified_peptide_index;
163892325845 Initial commit. galaxyp parents: diff changeset	1316 my $modified_peptide_remainder;
163892325845 Initial commit. galaxyp parents: diff changeset	1317 my $processed_peptide_length = 0;
163892325845 Initial commit. galaxyp parents: diff changeset	1318
163892325845 Initial commit. galaxyp parents: diff changeset	1319 #
163892325845 Initial commit. galaxyp parents: diff changeset	1320 # Find amino acid in the string identifying both the modified amino acid and the modification.
163892325845 Initial commit. galaxyp parents: diff changeset	1321 #
163892325845 Initial commit. galaxyp parents: diff changeset	1322 if ($modified_aa =~ m/^([a-z]+)[A-Z*]$/) {
163892325845 Initial commit. galaxyp parents: diff changeset	1323
163892325845 Initial commit. galaxyp parents: diff changeset	1324 $modified_aa_index_adjustment = length($1);
163892325845 Initial commit. galaxyp parents: diff changeset	1325
163892325845 Initial commit. galaxyp parents: diff changeset	1326 } elsif ($modified_aa =~ m/^[A-Z*][a-z]$/) {
163892325845 Initial commit. galaxyp parents: diff changeset	1327
163892325845 Initial commit. galaxyp parents: diff changeset	1328 $modified_aa_index_adjustment = 0;
163892325845 Initial commit. galaxyp parents: diff changeset	1329
163892325845 Initial commit. galaxyp parents: diff changeset	1330 } else {
163892325845 Initial commit. galaxyp parents: diff changeset	1331
163892325845 Initial commit. galaxyp parents: diff changeset	1332 $logger->fatal('Modified amino acid was specified in an unsupported format: ' . $modified_aa . '.');
163892325845 Initial commit. galaxyp parents: diff changeset	1333 exit;
163892325845 Initial commit. galaxyp parents: diff changeset	1334
163892325845 Initial commit. galaxyp parents: diff changeset	1335 }
163892325845 Initial commit. galaxyp parents: diff changeset	1336
163892325845 Initial commit. galaxyp parents: diff changeset	1337 #
163892325845 Initial commit. galaxyp parents: diff changeset	1338 # Find modified sites in peptide.
163892325845 Initial commit. galaxyp parents: diff changeset	1339 #
163892325845 Initial commit. galaxyp parents: diff changeset	1340 my $modified_aa_for_regex = $modified_aa;
163892325845 Initial commit. galaxyp parents: diff changeset	1341 $modified_aa_for_regex =~ s/\*/[A-Z]/;
163892325845 Initial commit. galaxyp parents: diff changeset	1342 $logger->debug('modified_aa for search: ' . $modified_aa_for_regex);
163892325845 Initial commit. galaxyp parents: diff changeset	1343 if ($modified_peptide =~ m/$modified_aa_for_regex/) {
163892325845 Initial commit. galaxyp parents: diff changeset	1344 $logger->debug("\t" . 'Peptide contains mods.');
163892325845 Initial commit. galaxyp parents: diff changeset	1345 $modified_aa_in_modified_peptide_index = $-[0];
163892325845 Initial commit. galaxyp parents: diff changeset	1346 $modified_peptide_remainder = $';
163892325845 Initial commit. galaxyp parents: diff changeset	1347 my $modified_peptide_prefix = $`;
163892325845 Initial commit. galaxyp parents: diff changeset	1348 $logger->trace("\t\t" . 'Processed pep length = ' . $processed_peptide_length);
163892325845 Initial commit. galaxyp parents: diff changeset	1349 $processed_peptide_length += length($modified_peptide_prefix);
163892325845 Initial commit. galaxyp parents: diff changeset	1350 $logger->trace("\t\t" . 'Processed pep length = ' . $processed_peptide_length);
163892325845 Initial commit. galaxyp parents: diff changeset	1351 $processed_peptide_length += length($modified_aa);
163892325845 Initial commit. galaxyp parents: diff changeset	1352 $logger->trace("\t\t" . 'Processed pep length = ' . $processed_peptide_length);
163892325845 Initial commit. galaxyp parents: diff changeset	1353 $logger->trace("\t\t" . 'Index for mod :' . $modified_aa_in_modified_peptide_index);
163892325845 Initial commit. galaxyp parents: diff changeset	1354 } else {
163892325845 Initial commit. galaxyp parents: diff changeset	1355 $logger->debug('No modified amino acids found in this peptide.');
163892325845 Initial commit. galaxyp parents: diff changeset	1356 }
163892325845 Initial commit. galaxyp parents: diff changeset	1357
163892325845 Initial commit. galaxyp parents: diff changeset	1358 #my $modified_aa_in_modified_peptide_index = index($modified_peptide, $modified_aa);
163892325845 Initial commit. galaxyp parents: diff changeset	1359
163892325845 Initial commit. galaxyp parents: diff changeset	1360 while (defined($modified_aa_in_modified_peptide_index)) {
163892325845 Initial commit. galaxyp parents: diff changeset	1361
163892325845 Initial commit. galaxyp parents: diff changeset	1362 $modified_aa_in_modified_peptide_index += $modified_aa_index_adjustment;
163892325845 Initial commit. galaxyp parents: diff changeset	1363 $logger->debug('Found modified amino acid at position ' . ($modified_aa_in_modified_peptide_index + 1) . ' in modified peptide ' . $modified_peptide . ' of protein ' . $key . '.');
163892325845 Initial commit. galaxyp parents: diff changeset	1364
163892325845 Initial commit. galaxyp parents: diff changeset	1365 #
163892325845 Initial commit. galaxyp parents: diff changeset	1366 # Count all modifications of any type preceeding the found modified amino acid.
163892325845 Initial commit. galaxyp parents: diff changeset	1367 #
163892325845 Initial commit. galaxyp parents: diff changeset	1368 my $modified_n_term = substr($modified_peptide, 0, $modified_aa_in_modified_peptide_index);
163892325845 Initial commit. galaxyp parents: diff changeset	1369 my $n_term = $modified_n_term;
163892325845 Initial commit. galaxyp parents: diff changeset	1370 $n_term =~ s/[a-z]+//g;
163892325845 Initial commit. galaxyp parents: diff changeset	1371 my $mod_count = length($modified_n_term) - length($n_term);
163892325845 Initial commit. galaxyp parents: diff changeset	1372
163892325845 Initial commit. galaxyp parents: diff changeset	1373 if ($mod_count >= 0) {
163892325845 Initial commit. galaxyp parents: diff changeset	1374 $logger->debug(' counted modifications preceeding amino acid = ' . $mod_count);
163892325845 Initial commit. galaxyp parents: diff changeset	1375 } else {
163892325845 Initial commit. galaxyp parents: diff changeset	1376 $logger->fatal(' counted modifications preceeding amino acid is negative: ' . $mod_count);
163892325845 Initial commit. galaxyp parents: diff changeset	1377 exit;
163892325845 Initial commit. galaxyp parents: diff changeset	1378 }
163892325845 Initial commit. galaxyp parents: diff changeset	1379
163892325845 Initial commit. galaxyp parents: diff changeset	1380 #
163892325845 Initial commit. galaxyp parents: diff changeset	1381 # We need to substract the amount of mods preceeding the identified amino acid
163892325845 Initial commit. galaxyp parents: diff changeset	1382 # from the index to get the position of the amino acid in the unmodified peptide.
163892325845 Initial commit. galaxyp parents: diff changeset	1383 #
163892325845 Initial commit. galaxyp parents: diff changeset	1384 my $modified_aa_in_peptide_index = $modified_aa_in_modified_peptide_index - $mod_count;
163892325845 Initial commit. galaxyp parents: diff changeset	1385 $logger->debug(' and at position ' . ($modified_aa_in_peptide_index + 1) . ' in peptide ' . $peptide . ' of protein ' . $key . '.');
163892325845 Initial commit. galaxyp parents: diff changeset	1386
163892325845 Initial commit. galaxyp parents: diff changeset	1387 push(@modified_aa_in_peptide_indices, $modified_aa_in_peptide_index);
163892325845 Initial commit. galaxyp parents: diff changeset	1388
163892325845 Initial commit. galaxyp parents: diff changeset	1389 #
163892325845 Initial commit. galaxyp parents: diff changeset	1390 # Search for additional mods in the modified peptide.
163892325845 Initial commit. galaxyp parents: diff changeset	1391 #
163892325845 Initial commit. galaxyp parents: diff changeset	1392 #$offset = $modified_aa_in_modified_peptide_index + 1;
163892325845 Initial commit. galaxyp parents: diff changeset	1393 #$modified_aa_in_modified_peptide_index = index($modified_peptide, $modified_aa, $offset);
163892325845 Initial commit. galaxyp parents: diff changeset	1394 if ($modified_peptide_remainder =~ m/$modified_aa_for_regex/) {
163892325845 Initial commit. galaxyp parents: diff changeset	1395 $logger->debug("\t" . 'Peptide contains more mods.');
163892325845 Initial commit. galaxyp parents: diff changeset	1396 $modified_peptide_remainder = $';
163892325845 Initial commit. galaxyp parents: diff changeset	1397 my $modified_peptide_prefix = $`;
163892325845 Initial commit. galaxyp parents: diff changeset	1398 $logger->trace("\t\t" . 'Processed pep length = ' . $processed_peptide_length);
163892325845 Initial commit. galaxyp parents: diff changeset	1399 $modified_aa_in_modified_peptide_index = $-[0] + $processed_peptide_length;
163892325845 Initial commit. galaxyp parents: diff changeset	1400 $logger->trace("\t\t" . 'Index for mod :' . $modified_aa_in_modified_peptide_index);
163892325845 Initial commit. galaxyp parents: diff changeset	1401 $processed_peptide_length += length($modified_peptide_prefix);
163892325845 Initial commit. galaxyp parents: diff changeset	1402 $logger->trace("\t\t" . 'Processed pep length = ' . $processed_peptide_length);
163892325845 Initial commit. galaxyp parents: diff changeset	1403 $processed_peptide_length += length($modified_aa);
163892325845 Initial commit. galaxyp parents: diff changeset	1404 $logger->trace("\t\t" . 'Processed pep length = ' . $processed_peptide_length);
163892325845 Initial commit. galaxyp parents: diff changeset	1405 } else {
163892325845 Initial commit. galaxyp parents: diff changeset	1406 $modified_aa_in_modified_peptide_index = undef();
163892325845 Initial commit. galaxyp parents: diff changeset	1407 $logger->debug('No more modified amino acids found in this peptide.');
163892325845 Initial commit. galaxyp parents: diff changeset	1408 }
163892325845 Initial commit. galaxyp parents: diff changeset	1409 }
163892325845 Initial commit. galaxyp parents: diff changeset	1410
163892325845 Initial commit. galaxyp parents: diff changeset	1411 #
163892325845 Initial commit. galaxyp parents: diff changeset	1412 # Retrieve sequence contexts for modified stites from protein sequence
163892325845 Initial commit. galaxyp parents: diff changeset	1413 #
163892325845 Initial commit. galaxyp parents: diff changeset	1414 foreach my $modified_aa_in_peptide_index (@modified_aa_in_peptide_indices) {
163892325845 Initial commit. galaxyp parents: diff changeset	1415
163892325845 Initial commit. galaxyp parents: diff changeset	1416 #
163892325845 Initial commit. galaxyp parents: diff changeset	1417 # Get the offset for the sequence context.
163892325845 Initial commit. galaxyp parents: diff changeset	1418 #
163892325845 Initial commit. galaxyp parents: diff changeset	1419 my $offset = $petide_index + $modified_aa_in_peptide_index - $n_term_context_length;
163892325845 Initial commit. galaxyp parents: diff changeset	1420 $logger->debug("\t" . 'Offset = ' . $offset . '.');
163892325845 Initial commit. galaxyp parents: diff changeset	1421 my $n_term_padding = '';
163892325845 Initial commit. galaxyp parents: diff changeset	1422 my $offset_adjustment = 0;
163892325845 Initial commit. galaxyp parents: diff changeset	1423
163892325845 Initial commit. galaxyp parents: diff changeset	1424 #
163892325845 Initial commit. galaxyp parents: diff changeset	1425 # Check if DB protein sequence is long enough on n-terminal side
163892325845 Initial commit. galaxyp parents: diff changeset	1426 # to retrieve full length peptide contexts.
163892325845 Initial commit. galaxyp parents: diff changeset	1427 # (c-term length is checked elsewhere.)
163892325845 Initial commit. galaxyp parents: diff changeset	1428 #
163892325845 Initial commit. galaxyp parents: diff changeset	1429 if ($offset < 0) {
163892325845 Initial commit. galaxyp parents: diff changeset	1430
163892325845 Initial commit. galaxyp parents: diff changeset	1431 #
163892325845 Initial commit. galaxyp parents: diff changeset	1432 # Protein too short for full length context: Calculate offset adjustment.
163892325845 Initial commit. galaxyp parents: diff changeset	1433 #
163892325845 Initial commit. galaxyp parents: diff changeset	1434 $offset_adjustment = -($offset);
163892325845 Initial commit. galaxyp parents: diff changeset	1435 $logger->debug('Offset ' . $offset . ' for modification context in peptide ' . $peptide . ' in protein ' . $key . ' less than zero');
163892325845 Initial commit. galaxyp parents: diff changeset	1436
163892325845 Initial commit. galaxyp parents: diff changeset	1437 if (length($padding_character) == 1) {
163892325845 Initial commit. galaxyp parents: diff changeset	1438
163892325845 Initial commit. galaxyp parents: diff changeset	1439 #
163892325845 Initial commit. galaxyp parents: diff changeset	1440 # Add N-terminal padding characters for contexts that are too short on the N-terminal side.
163892325845 Initial commit. galaxyp parents: diff changeset	1441 #
163892325845 Initial commit. galaxyp parents: diff changeset	1442 $n_term_padding = "$padding_character" x $offset_adjustment;
163892325845 Initial commit. galaxyp parents: diff changeset	1443 $logger->debug("\t" . 'Will add N-terminal padding: ' . $n_term_padding);
163892325845 Initial commit. galaxyp parents: diff changeset	1444
163892325845 Initial commit. galaxyp parents: diff changeset	1445 } else {
163892325845 Initial commit. galaxyp parents: diff changeset	1446 $logger->debug('Padding was disabled. Will not add n-term padding.');
163892325845 Initial commit. galaxyp parents: diff changeset	1447 }
163892325845 Initial commit. galaxyp parents: diff changeset	1448
163892325845 Initial commit. galaxyp parents: diff changeset	1449 #
163892325845 Initial commit. galaxyp parents: diff changeset	1450 # Reset offset for this context to protein start.
163892325845 Initial commit. galaxyp parents: diff changeset	1451 #
163892325845 Initial commit. galaxyp parents: diff changeset	1452 $offset = 0;
163892325845 Initial commit. galaxyp parents: diff changeset	1453
163892325845 Initial commit. galaxyp parents: diff changeset	1454 }
163892325845 Initial commit. galaxyp parents: diff changeset	1455
163892325845 Initial commit. galaxyp parents: diff changeset	1456 #
163892325845 Initial commit. galaxyp parents: diff changeset	1457 # Extract sequence context from DB protein sequence.
163892325845 Initial commit. galaxyp parents: diff changeset	1458 #
163892325845 Initial commit. galaxyp parents: diff changeset	1459 my $peptide_context = substr($protein, $offset, $max_context_length - $offset_adjustment);
163892325845 Initial commit. galaxyp parents: diff changeset	1460 $peptide_context = $n_term_padding . $peptide_context;
163892325845 Initial commit. galaxyp parents: diff changeset	1461 $logger->debug('Found modification context: ' . $peptide_context . '.');
163892325845 Initial commit. galaxyp parents: diff changeset	1462
163892325845 Initial commit. galaxyp parents: diff changeset	1463 push(@peptide_contexts, $peptide_context);
163892325845 Initial commit. galaxyp parents: diff changeset	1464
163892325845 Initial commit. galaxyp parents: diff changeset	1465 }
163892325845 Initial commit. galaxyp parents: diff changeset	1466
163892325845 Initial commit. galaxyp parents: diff changeset	1467 if (length($padding_character) == 1) {
163892325845 Initial commit. galaxyp parents: diff changeset	1468 _CheckSequenceContextLength(\@peptide_contexts, $peptide, $key, $max_context_length);
163892325845 Initial commit. galaxyp parents: diff changeset	1469 } else {
163892325845 Initial commit. galaxyp parents: diff changeset	1470 $logger->debug('Padding was disabled. Will not add c-term padding nor check sequence context length.');
163892325845 Initial commit. galaxyp parents: diff changeset	1471 }
163892325845 Initial commit. galaxyp parents: diff changeset	1472
163892325845 Initial commit. galaxyp parents: diff changeset	1473 return(\@peptide_contexts);
163892325845 Initial commit. galaxyp parents: diff changeset	1474
163892325845 Initial commit. galaxyp parents: diff changeset	1475 }
163892325845 Initial commit. galaxyp parents: diff changeset	1476
163892325845 Initial commit. galaxyp parents: diff changeset	1477 #
163892325845 Initial commit. galaxyp parents: diff changeset	1478 # TODO: pep contexts.
163892325845 Initial commit. galaxyp parents: diff changeset	1479 #
163892325845 Initial commit. galaxyp parents: diff changeset	1480 sub _GetPeptideSequenceContexts {
163892325845 Initial commit. galaxyp parents: diff changeset	1481
163892325845 Initial commit. galaxyp parents: diff changeset	1482 my ($peptide, $protein, $key, $peptide_index) = @_;
163892325845 Initial commit. galaxyp parents: diff changeset	1483
163892325845 Initial commit. galaxyp parents: diff changeset	1484 my @peptide_contexts;
163892325845 Initial commit. galaxyp parents: diff changeset	1485 my $max_context_length = $n_term_context_length + length($peptide) + $c_term_context_length;
163892325845 Initial commit. galaxyp parents: diff changeset	1486
163892325845 Initial commit. galaxyp parents: diff changeset	1487 #
163892325845 Initial commit. galaxyp parents: diff changeset	1488 # Get the offset for the peptide sequence context.
163892325845 Initial commit. galaxyp parents: diff changeset	1489 #
163892325845 Initial commit. galaxyp parents: diff changeset	1490 my $offset = $peptide_index - $n_term_context_length;
163892325845 Initial commit. galaxyp parents: diff changeset	1491 $logger->debug("\t" . 'Offset = ' . $offset . '.');
163892325845 Initial commit. galaxyp parents: diff changeset	1492 my $n_term_padding = '';
163892325845 Initial commit. galaxyp parents: diff changeset	1493 my $offset_adjustment = 0;
163892325845 Initial commit. galaxyp parents: diff changeset	1494
163892325845 Initial commit. galaxyp parents: diff changeset	1495 #
163892325845 Initial commit. galaxyp parents: diff changeset	1496 # Check if DB protein sequence is long enough on n-terminal side
163892325845 Initial commit. galaxyp parents: diff changeset	1497 # to retrieve full length peptide contexts.
163892325845 Initial commit. galaxyp parents: diff changeset	1498 # (c-term length is checked elsewhere.)
163892325845 Initial commit. galaxyp parents: diff changeset	1499 #
163892325845 Initial commit. galaxyp parents: diff changeset	1500 if ($offset < 0) {
163892325845 Initial commit. galaxyp parents: diff changeset	1501
163892325845 Initial commit. galaxyp parents: diff changeset	1502 #
163892325845 Initial commit. galaxyp parents: diff changeset	1503 # Protein too short for full length context: Calculate offset adjustment.
163892325845 Initial commit. galaxyp parents: diff changeset	1504 #
163892325845 Initial commit. galaxyp parents: diff changeset	1505 $offset_adjustment = -($offset);
163892325845 Initial commit. galaxyp parents: diff changeset	1506 $logger->debug('Offset ' . $offset . ' for peptide context of peptide ' . $peptide . ' in protein ' . $key . ' less than zero');
163892325845 Initial commit. galaxyp parents: diff changeset	1507
163892325845 Initial commit. galaxyp parents: diff changeset	1508 if (length($padding_character) == 1) {
163892325845 Initial commit. galaxyp parents: diff changeset	1509
163892325845 Initial commit. galaxyp parents: diff changeset	1510 #
163892325845 Initial commit. galaxyp parents: diff changeset	1511 # Add N-terminal padding characters for contexts that are too short on the N-terminal side.
163892325845 Initial commit. galaxyp parents: diff changeset	1512 #
163892325845 Initial commit. galaxyp parents: diff changeset	1513 $n_term_padding = "$padding_character" x $offset_adjustment;
163892325845 Initial commit. galaxyp parents: diff changeset	1514 $logger->debug("\t" . 'Will add N-terminal padding: ' . $n_term_padding);
163892325845 Initial commit. galaxyp parents: diff changeset	1515
163892325845 Initial commit. galaxyp parents: diff changeset	1516 } else {
163892325845 Initial commit. galaxyp parents: diff changeset	1517 $logger->debug('Padding was disabled. Will not add n-term padding.');
163892325845 Initial commit. galaxyp parents: diff changeset	1518 }
163892325845 Initial commit. galaxyp parents: diff changeset	1519
163892325845 Initial commit. galaxyp parents: diff changeset	1520 #
163892325845 Initial commit. galaxyp parents: diff changeset	1521 # Reset offset for this context to protein start.
163892325845 Initial commit. galaxyp parents: diff changeset	1522 #
163892325845 Initial commit. galaxyp parents: diff changeset	1523 $offset = 0;
163892325845 Initial commit. galaxyp parents: diff changeset	1524
163892325845 Initial commit. galaxyp parents: diff changeset	1525 }
163892325845 Initial commit. galaxyp parents: diff changeset	1526
163892325845 Initial commit. galaxyp parents: diff changeset	1527 #
163892325845 Initial commit. galaxyp parents: diff changeset	1528 # Extract sequence context from DB protein sequence.
163892325845 Initial commit. galaxyp parents: diff changeset	1529 #
163892325845 Initial commit. galaxyp parents: diff changeset	1530 my $peptide_context = substr($protein, $offset, $max_context_length - $offset_adjustment);
163892325845 Initial commit. galaxyp parents: diff changeset	1531 $peptide_context = $n_term_padding . $peptide_context;
163892325845 Initial commit. galaxyp parents: diff changeset	1532 $logger->debug('Found modification context: ' . $peptide_context . '.');
163892325845 Initial commit. galaxyp parents: diff changeset	1533
163892325845 Initial commit. galaxyp parents: diff changeset	1534 push(@peptide_contexts, $peptide_context);
163892325845 Initial commit. galaxyp parents: diff changeset	1535
163892325845 Initial commit. galaxyp parents: diff changeset	1536 if (length($padding_character) == 1) {
163892325845 Initial commit. galaxyp parents: diff changeset	1537 _CheckSequenceContextLength(\@peptide_contexts, $peptide, $key, $max_context_length);
163892325845 Initial commit. galaxyp parents: diff changeset	1538 } else {
163892325845 Initial commit. galaxyp parents: diff changeset	1539 $logger->debug('Padding was disabled. Will not add c-term padding nor check sequence context length.');
163892325845 Initial commit. galaxyp parents: diff changeset	1540 }
163892325845 Initial commit. galaxyp parents: diff changeset	1541
163892325845 Initial commit. galaxyp parents: diff changeset	1542 return(\@peptide_contexts);
163892325845 Initial commit. galaxyp parents: diff changeset	1543
163892325845 Initial commit. galaxyp parents: diff changeset	1544 }
163892325845 Initial commit. galaxyp parents: diff changeset	1545
163892325845 Initial commit. galaxyp parents: diff changeset	1546 sub _CheckSequenceContextLength {
163892325845 Initial commit. galaxyp parents: diff changeset	1547
163892325845 Initial commit. galaxyp parents: diff changeset	1548 my ($contexts, $peptide, $key, $max_context_length) = @_;
163892325845 Initial commit. galaxyp parents: diff changeset	1549
163892325845 Initial commit. galaxyp parents: diff changeset	1550 #
163892325845 Initial commit. galaxyp parents: diff changeset	1551 # If padding was enabled for sequence contexts < max sequence context length.
163892325845 Initial commit. galaxyp parents: diff changeset	1552 #
163892325845 Initial commit. galaxyp parents: diff changeset	1553 foreach my $peptide_context (@{$contexts}) {
163892325845 Initial commit. galaxyp parents: diff changeset	1554
163892325845 Initial commit. galaxyp parents: diff changeset	1555 my $peptide_context_length = length($peptide_context);
163892325845 Initial commit. galaxyp parents: diff changeset	1556
163892325845 Initial commit. galaxyp parents: diff changeset	1557 if ($peptide_context_length < $max_context_length) {
163892325845 Initial commit. galaxyp parents: diff changeset	1558
163892325845 Initial commit. galaxyp parents: diff changeset	1559 #
163892325845 Initial commit. galaxyp parents: diff changeset	1560 # sequence context < max sequence context length -> append c-term padding.
163892325845 Initial commit. galaxyp parents: diff changeset	1561 # (n-term padding was already appended if necessary during sequence context lookup with _GetSequenceContext().)
163892325845 Initial commit. galaxyp parents: diff changeset	1562 #
163892325845 Initial commit. galaxyp parents: diff changeset	1563 my $c_term_padding = "$padding_character" x ($max_context_length - $peptide_context_length);
163892325845 Initial commit. galaxyp parents: diff changeset	1564 $logger->debug('Length of peptide context (' . $peptide_context_length . ') < max context length.');
163892325845 Initial commit. galaxyp parents: diff changeset	1565 $logger->debug("\t" . 'Will add C-terminal padding: ' . $c_term_padding);
163892325845 Initial commit. galaxyp parents: diff changeset	1566 $peptide_context = $peptide_context . $c_term_padding;
163892325845 Initial commit. galaxyp parents: diff changeset	1567
163892325845 Initial commit. galaxyp parents: diff changeset	1568 }
163892325845 Initial commit. galaxyp parents: diff changeset	1569
163892325845 Initial commit. galaxyp parents: diff changeset	1570 #
163892325845 Initial commit. galaxyp parents: diff changeset	1571 # Sanity check for padded sequence contexts.
163892325845 Initial commit. galaxyp parents: diff changeset	1572 #
163892325845 Initial commit. galaxyp parents: diff changeset	1573 # (Checks if the combined effect of n-term and c-term padding worked well.)
163892325845 Initial commit. galaxyp parents: diff changeset	1574 #
163892325845 Initial commit. galaxyp parents: diff changeset	1575 $peptide_context_length = length($peptide_context);
163892325845 Initial commit. galaxyp parents: diff changeset	1576 unless ($peptide_context_length == $max_context_length) {
163892325845 Initial commit. galaxyp parents: diff changeset	1577 $logger->fatal('Length of peptide context too short or too long for peptide ' . $peptide . ' in protein ' . $key);
163892325845 Initial commit. galaxyp parents: diff changeset	1578 $logger->fatal("\t" . 'Context length is ' . $peptide_context_length . ', but should be ' . $max_context_length . '.');
163892325845 Initial commit. galaxyp parents: diff changeset	1579 exit;
163892325845 Initial commit. galaxyp parents: diff changeset	1580 }
163892325845 Initial commit. galaxyp parents: diff changeset	1581 }
163892325845 Initial commit. galaxyp parents: diff changeset	1582
163892325845 Initial commit. galaxyp parents: diff changeset	1583 #
163892325845 Initial commit. galaxyp parents: diff changeset	1584 # No need to return the modified (c-terminal-padded) contexts as we worked with a arrayref.
163892325845 Initial commit. galaxyp parents: diff changeset	1585 #
163892325845 Initial commit. galaxyp parents: diff changeset	1586
163892325845 Initial commit. galaxyp parents: diff changeset	1587 }
163892325845 Initial commit. galaxyp parents: diff changeset	1588
163892325845 Initial commit. galaxyp parents: diff changeset	1589 sub _SaveSequenceContexts {
163892325845 Initial commit. galaxyp parents: diff changeset	1590
163892325845 Initial commit. galaxyp parents: diff changeset	1591 my ($line, $acc, $contexts, $path_to_fh, $extracted_seq_contexts) = @_;
163892325845 Initial commit. galaxyp parents: diff changeset	1592
163892325845 Initial commit. galaxyp parents: diff changeset	1593 #
163892325845 Initial commit. galaxyp parents: diff changeset	1594 # Append the peptide contexts to the existing line of the fragment file and
163892325845 Initial commit. galaxyp parents: diff changeset	1595 # save that new line to the output file.
163892325845 Initial commit. galaxyp parents: diff changeset	1596 # By appending we keep all info from the original fragment file intact.
163892325845 Initial commit. galaxyp parents: diff changeset	1597 #
163892325845 Initial commit. galaxyp parents: diff changeset	1598 foreach my $context (@{$contexts}) {
163892325845 Initial commit. galaxyp parents: diff changeset	1599
163892325845 Initial commit. galaxyp parents: diff changeset	1600 #
163892325845 Initial commit. galaxyp parents: diff changeset	1601 # Skip "empty"/"missing" contexts.
163892325845 Initial commit. galaxyp parents: diff changeset	1602 # May be the result of:
163892325845 Initial commit. galaxyp parents: diff changeset	1603 # 1. The peptide terminus being also the protein terminus.
163892325845 Initial commit. galaxyp parents: diff changeset	1604 # 2. Proteins missing from the database.
163892325845 Initial commit. galaxyp parents: diff changeset	1605 # 3. ...
163892325845 Initial commit. galaxyp parents: diff changeset	1606 #
163892325845 Initial commit. galaxyp parents: diff changeset	1607 if ($padding_character ne '') {
163892325845 Initial commit. galaxyp parents: diff changeset	1608 if ($context =~ m/^($padding_character)+$/) {
163892325845 Initial commit. galaxyp parents: diff changeset	1609 next;
163892325845 Initial commit. galaxyp parents: diff changeset	1610 }
163892325845 Initial commit. galaxyp parents: diff changeset	1611 } else {
163892325845 Initial commit. galaxyp parents: diff changeset	1612 if ($context eq '') {
163892325845 Initial commit. galaxyp parents: diff changeset	1613 next;
163892325845 Initial commit. galaxyp parents: diff changeset	1614 }
163892325845 Initial commit. galaxyp parents: diff changeset	1615 }
163892325845 Initial commit. galaxyp parents: diff changeset	1616
163892325845 Initial commit. galaxyp parents: diff changeset	1617 my $new_line = $line;
163892325845 Initial commit. galaxyp parents: diff changeset	1618
163892325845 Initial commit. galaxyp parents: diff changeset	1619 # 1. Append accession numbers if the fragment file used IDs.
163892325845 Initial commit. galaxyp parents: diff changeset	1620 if ($index_id eq 'ID') {
163892325845 Initial commit. galaxyp parents: diff changeset	1621 $new_line .= "\t" . $acc;
163892325845 Initial commit. galaxyp parents: diff changeset	1622 }
163892325845 Initial commit. galaxyp parents: diff changeset	1623
163892325845 Initial commit. galaxyp parents: diff changeset	1624 # 2. Append sequence context.
163892325845 Initial commit. galaxyp parents: diff changeset	1625 $new_line .= "\t" . $context;
163892325845 Initial commit. galaxyp parents: diff changeset	1626 # 3. Append new line character.
163892325845 Initial commit. galaxyp parents: diff changeset	1627 $new_line .= "\n";
163892325845 Initial commit. galaxyp parents: diff changeset	1628 # Save result to disk.
163892325845 Initial commit. galaxyp parents: diff changeset	1629 print($path_to_fh $new_line);
163892325845 Initial commit. galaxyp parents: diff changeset	1630 $extracted_seq_contexts++;
163892325845 Initial commit. galaxyp parents: diff changeset	1631
163892325845 Initial commit. galaxyp parents: diff changeset	1632 }
163892325845 Initial commit. galaxyp parents: diff changeset	1633
163892325845 Initial commit. galaxyp parents: diff changeset	1634 return($extracted_seq_contexts);
163892325845 Initial commit. galaxyp parents: diff changeset	1635
163892325845 Initial commit. galaxyp parents: diff changeset	1636 }
163892325845 Initial commit. galaxyp parents: diff changeset	1637
163892325845 Initial commit. galaxyp parents: diff changeset	1638 sub _Usage {
163892325845 Initial commit. galaxyp parents: diff changeset	1639
163892325845 Initial commit. galaxyp parents: diff changeset	1640
163892325845 Initial commit. galaxyp parents: diff changeset	1641 print STDOUT "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1642 'ExtractPeptideSequenceContext.pl: Extract sequence contexts of cleavage, miscleavage or modification sites ' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1643 ' based on sequence fragments (like peptides experimentally identified with MSMS) ' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1644 ' and protein sequences from a database in FASTA or UniProtKB file format.' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1645 "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1646 'Usage:' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1647 "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1648 ' ExtractPeptideSequenceContext.pl options' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1649 "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1650 'Available options are:' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1651 "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1652 ' --db [file] Input file containing all database sequences in either UniProtKB .dat or .fasta format.' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1653 ' --dba [file] Optional alternative / backup input file containing database sequences in either UniProtKB .dat or .fasta format.' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1654 ' This may be a slightly older or newer version of the database, which will be searched in case a ' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1655 ' sequence ID was not found in the primary database file that was specified with --db [file].' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1656 ' --dbf [format] Optional argument to specify the database sequence file format. Valid format specifications are:' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1657 ' * \'UniProtKB DAT\' for UniProtKB DAT files.' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1658 ' * \'FASTA\' for FASTA files.' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1659 ' Only required if the database files do not have the standard file extensions: ' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1660 ' *.dat for UniProtKB DAT files or ' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1661 ' .fa or .fasta for FASTA files.' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1662 ' --k [ID\|AC] Unique identifier type used as key to index an UniProtKB *.dat database file.' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1663 ' Must be either AC for a UniProt accession number (default) or ID for a UniProt identifier.' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1664 ' Not required for *.fasta databases: This tool will look for any type of ID in the first part of FASTA ' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1665 ' sequence headers up until the first white space.' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1666 ' Optionally multiple IDs may be present separated with pipe symbols (\|) or semicolons (;).' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1667 ' Optionally IDs may be prefixed with a database namespace and a colon (:).' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1668 ' For example the accession number P32234 as well as the ID 128UP_DROME would be recognised in both ' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1669 ' this sequence header:' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1670 ' >UniProtAcc:P32234\|UniProtID:128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1671 ' and in this one:' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1672 ' >P32234\|128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1673 ' --f [file] Fragment file containing accession numbers or IDs and sequence fragments (peptides), ' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1674 ' whose sequence context should be extracted from the database. This file' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1675 ' * Must be tab delimited with one accession/ID plus one sequence fragment per line' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1676 ' * Must contain accession numbers / IDs in the same format as the database file used.' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1677 ' Exception: Optionally IDs may be suffixed with the peptide\'s position in its protein between brackets.' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1678 ' For example: CLH1_HUMAN[1612-1620].' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1679 ' * May contain other columns, which will be preserved in the output.' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1680 ' * Is the basis for the output: if a sequence context was found, ' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1681 ' it will be appended in another column to the right of the existing columns.' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1682 ' --icol [int] Column in the tab delimited fragment file containing the protein identifiers / accession numbers.' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1683 ' Default = 1.' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1684 ' --pcol [int] Column in the tab delimited fragment file containing the peptide sequences.' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1685 ' Default = 2.' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1686 ' --s Strip lowercase characters from the sequence fragments in the fragment file before doing a database lookup.' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1687 ' Amino acids are expected to be in uppercase. If not, the sequences are converted to uppercase ' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1688 ' before searching for the fragment in the database. When lowercase characters represent ' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1689 ' modifications instead of amino acids these need to be removed before searching the database.' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1690 ' Default = 0 (disabled) except when a modified amino acid is specified with --ma, which will automatically enable --s.' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1691 ' --cleo [file] Optional output file where the retrieved cleavage site sequence contexts will be saved.' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1692 ' --miso [file] Optional output file where the retrieved miscleavage site sequence contexts will be saved.' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1693 ' --modo [file] Optional output file where the retrieved modification site sequence contexts will be saved.' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1694 ' --pepo [file] Optional output file where the retrieved peptide sequence contexts will be saved.' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1695 ' (At least one output file must be specified with --cleo, --miso, --modo or --pepo).' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1696 ' --ca [A-Z] Cleavage amino acid. Assume the sequence fragments were derived from cutting with a proteolytic enzyme, ' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1697 ' that recognizes this amino acid as the position to cut.' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1698 ' When the specificity of the protease used to generate the peptides in the fragments file is unknown,' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1699 ' you may provide an asterisk (*) to retrieve sequence context for any cleavage,' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1700 ' but in that case this tool will not filter non-specifically cleaved fragments,' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1701 ' that may be the result of other processes than protease activity.' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1702 ' --ct [C\|N] Cleavage terminus. Assume the sequence fragments were derived from cutting with a proteolytic enzyme, ' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1703 ' that cuts at the C or N terminus of the amino acid specified with the --ca [A-Z] parameter.' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1704 ' When the specificity of the protease used to generate the peptides in the fragments file is unknown,' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1705 ' you may provide an asterisk (*) to retrieve sequence context for any cleavage,' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1706 ' but in that case this tool will not filter non-specifically cleaved fragments,' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1707 ' that may be the result of other processes than protease activity.' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1708 ' --ma [A-Za-z] Modified amino acid.' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1709 ' The amino acid must be specified in uppercase and the modification in lower case.' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1710 ' The order is not important.' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1711 ' Hence a phophorylated serine in the fragments file can be indicated with either pS or Sp, ' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1712 ' but you cannot mix both pS and Sp in a single fragments file.' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1713 ' You may provide an asterisk (*) instead of an upper case amino acid to retrieve sequence contexts ' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1714 ' for the specified modification no matter what amino acid it was located on.' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1715 ' A modification may be specified with more than one lower case character, ' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1716 ' so for example phosphoS or Sphospho can also be used for a phosphorylated serine.' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1717 ' --n [int] Integer specifying the length of the N-terminal sequence context to retrieve starting from the ' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1718 ' cleavage, miscleavage or modification site. Default = 5.' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1719 ' --c [int] Integer specifying the length of the C-terminal sequence context to retrieve starting from the ' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1720 ' cleavage, miscleavage or modification site. Default = 5.' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1721 ' Please note that cleavage and miscleavage sites have zero width, ' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1722 ' while modified amino acids will increment the sequence context lenght with 1.' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1723 ' When defaults are used for both the N-terminal and C-terminal sequence context lengths,' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1724 ' the total sequence context length for a' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1725 ' * cleavage or miscleavage site will be:' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1726 ' (N-terminal sequence context) + (C-terminal sequence context) = 5 + 5 = 10.' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1727 ' * modification site will be:' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1728 ' (N-terminal sequence context) + modified AA + (C-terminal sequence context) = 5 + 1 + 5 = 11.' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1729 # ' --mcd [int] Minimal distance between a putative miscleavage site and the peptide termini.' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1730 # ' Uncleaved amino acids closer to the peptide termini can be ignored with this parameter:' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1731 # ' A. To prevent overlap between cleavage and miscleavage peptide sequence contexts.' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1732 # ' B. To prevent false positive miscleavage sites due to sites that cannot be cleaved, ' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1733 # ' because one of the resulting fragments is too short. (Sometimes proteases may need ' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1734 # ' a minimal length surrounding the cleavage site to bind and cleave a peptide.)' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1735 # ' Default = same as the longest *-terminal context length. Hence [--n [int]\|--c [int]] (see above).' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1736 ' --pc [char] Optional padding character to fill N-terminal or C-terminal positions in the sequence context, ' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1737 ' when the protein was too short to get a complete sequence context.' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1738 ' Default = - (a.k.a. dash or the alignment gap character.)' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1739 ' To disable padding specify an empty string like this --pc \'\'.' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1740 ' --ll [LEVEL] Log4perl log level. One of: ALL, TRACE, DEBUG, INFO (default), WARN, ERROR, FATAL or OFF.' . "\n" .
163892325845 Initial commit. galaxyp parents: diff changeset	1741 "\n";
163892325845 Initial commit. galaxyp parents: diff changeset	1742 exit;
163892325845 Initial commit. galaxyp parents: diff changeset	1743
163892325845 Initial commit. galaxyp parents: diff changeset	1744 }

Mercurial > repos > galaxyp > nbic_fasta

annotate ExtractPeptideSequenceContext.pl @ 0:163892325845 draft default tip