0
|
1 #!/usr/bin/perl -w
|
|
2 #
|
|
3 # This script extracts subsequences from UniProtKB *.dat files or from *.fasta files
|
|
4 # based on a list of accession numbers / IDs and a sequence fragment (peptide).
|
|
5 #
|
|
6 # =====================================================
|
|
7 # $Id: ExtractPeptideSequenceContext.pl 112 2011-03-01 19:50:23Z pieter.neerincx@gmail.com $
|
|
8 # $HeadURL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/ExtractPeptideSequenceContext.pl $
|
|
9 # $LastChangedDate: 2011-03-01 13:50:23 -0600 (Tue, 01 Mar 2011) $
|
|
10 # $LastChangedRevision: 112 $
|
|
11 # $LastChangedBy: pieter.neerincx@gmail.com $
|
|
12 # =====================================================
|
|
13 #
|
|
14
|
|
15 #
|
|
16 # initialise environment
|
|
17 #
|
|
18 use strict;
|
|
19 use Getopt::Long;
|
|
20 use SWISS::Entry; # For parsing UniProtKB *.dat files.
|
|
21 use Log::Log4perl qw(:easy);
|
|
22
|
|
23 my $ps = '/'; # Path Separator.
|
|
24 my %log_levels = (
|
|
25 'ALL' => $ALL,
|
|
26 'TRACE' => $TRACE,
|
|
27 'DEBUG' => $DEBUG,
|
|
28 'INFO' => $INFO,
|
|
29 'WARN' => $WARN,
|
|
30 'ERROR' => $ERROR,
|
|
31 'FATAL' => $FATAL,
|
|
32 'OFF' => $OFF,
|
|
33 );
|
|
34
|
|
35 #
|
|
36 # List of databases, which may contain versioned accession numbers,
|
|
37 # and the separation character used to join the accession number and
|
|
38 # the version number.
|
|
39 #
|
|
40 my %databases_with_optionally_versioned_ids = (
|
|
41 'IPI' => '.',
|
|
42 'ipi' => '.',
|
|
43 'UniProtKB/Swiss-Prot' => '.',
|
|
44 'sp' => '.',
|
|
45 'SP' => '.',
|
|
46 'UniProtKB/TrEMBL' => '.',
|
|
47 'TR' => '.',
|
|
48 'tr' => '.',
|
|
49 'Tr' => '.'
|
|
50 );
|
|
51
|
|
52 my $db;
|
|
53 my $db_alt;
|
|
54 my $db_format;
|
|
55 my $index_id; # Key used to lookup sequences. Can be ID or Accession number (default).
|
|
56 my $fragments;
|
|
57 my $id_column;
|
|
58 my $peptide_column;
|
|
59 my $strip_lc; # For the sequence fragments in the fragments file. With strip_lc enabled we
|
|
60 # assume amino acids are in upper case and lower case characters represent modifications,
|
|
61 # which will be stripped off before searching for the fragments in the complete sequences.
|
|
62 my $cleavage_output; # Sequence contexts for cleavage sites.
|
|
63 my $miscleavage_output; # Sequence contexts for miscleavage sites.
|
|
64 my $modification_output; # Sequence contexts for modification sites.
|
|
65 my $peptide_output; # Sequence contexts for complete peptides.
|
|
66 my $cleavage_aa; # Assume the sequence fragments were derived from cutting at this amino acid.
|
|
67 my $cleavage_terminus; # Assume the sequence fragments were derived from cutting at this side of $cut_at_aa
|
|
68 my $modified_aa;
|
|
69 my $n_term_context_length;
|
|
70 my $c_term_context_length;
|
|
71 my $miscleavage_distance; # Minimal distance between a putative miscleavage site and the peptide termini.
|
|
72 # Uncleaved AAs closer to the peptide termini can be ignored with this parameter:
|
|
73 # A. To prevent overlap between cleavage and miscleavage peptide sequence contexts.
|
|
74 # B. To remove false positive miscleavage sites that cannot be cleaved,
|
|
75 # because one of the resulting fragments is too short. Sometimes proteases may need
|
|
76 # a minimal length surrounding the cleavage site to bind and cleave a peptide.
|
|
77 my $padding_character; # Used for padding if the protein sequence is too short for a long enough context.
|
|
78 my $log_level;
|
|
79
|
|
80 Getopt::Long::GetOptions (
|
|
81 'db=s' => \$db,
|
|
82 'dba:s' => \$db_alt,
|
|
83 'dbf:s' => \$db_format,
|
|
84 'k:s' => \$index_id, # Key used to lookup sequences. Can be ID or Accession number (default).
|
|
85 'f=s' => \$fragments,
|
|
86 'icol:i' => \$id_column,
|
|
87 'pcol:i' => \$peptide_column,
|
|
88 's' => \$strip_lc, # For the sequence fragments in the fragments file. With strip_lc enabled we
|
|
89 # assume amino acids are in upper case and lower case characters represent modifications,
|
|
90 # which will be stripped off before searching for the fragments in the complete sequences.
|
|
91 'cleo:s' => \$cleavage_output,
|
|
92 'miso:s' => \$miscleavage_output,
|
|
93 'modo:s' => \$modification_output,
|
|
94 'pepo:s' => \$peptide_output,
|
|
95 'ca:s' => \$cleavage_aa, # Assume the sequence fragments were derived from cutting at this amino acid.
|
|
96 'ct:s' => \$cleavage_terminus, # Assume the sequence fragments were derived from cutting at this side of $cut_at_aa
|
|
97 'ma:s' => \$modified_aa,
|
|
98 'n:i' => \$n_term_context_length,
|
|
99 'c:i' => \$c_term_context_length,
|
|
100 'mcd:i' => \$miscleavage_distance, # Minimal distance between a putative miscleavage site and the peptide termini.
|
|
101 # Uncleaved AAs closer to the peptide termini can be ignored with this parameter:
|
|
102 # A. To prevent overlap between cleavage and miscleavage peptide sequence contexts.
|
|
103 # B. To remove false positive miscleavage sites that cannot be cleaved,
|
|
104 # because one of the resulting fragments is too short. Sometimes proteases may need
|
|
105 # a minimal length surrounding the cleavage site to bind and cleave a peptide.
|
|
106 'pc:s' => \$padding_character,
|
|
107 'll:s' => \$log_level
|
|
108 );
|
|
109
|
|
110 #
|
|
111 # TODO: 1. Handle --mcd param. Maybe should be extended to exclude also modification sites too close together...
|
|
112 #
|
|
113
|
|
114 #
|
|
115 # Configure logging.
|
|
116 #
|
|
117 # Provides default if user did not specify log level:
|
|
118 $log_level = (defined($log_level) ? $log_level : 'INFO');
|
|
119 # Reset log level to default if user specified illegal log level.
|
|
120 $log_level = (defined($log_levels{$log_level}) ? $log_levels{$log_level} : $log_levels{'INFO'});
|
|
121 #Log::Log4perl->init('log4perl.properties');
|
|
122 Log::Log4perl->easy_init(
|
|
123 #{ level => $log_level,
|
|
124 # file => ">>ExtractPeptideSequenceContext.log",
|
|
125 # layout => '%F{1}-%L-%M: %m%n' },
|
|
126 { level => $log_level,
|
|
127 file => "STDOUT",
|
|
128 layout => '%d L:%L %p> %m%n' },
|
|
129 );
|
|
130 my $logger = Log::Log4perl::get_logger();
|
|
131
|
|
132 #
|
|
133 # Check user input.
|
|
134 #
|
|
135 foreach my $input ($db, $db_alt, $fragments, $cleavage_output, $miscleavage_output, $modification_output) {
|
|
136 if (defined($input) && length($input) <= 0) {
|
|
137 $input = undef();
|
|
138 }
|
|
139 }
|
|
140 unless (defined($db) &&
|
|
141 defined($fragments) &&
|
|
142 (defined($cleavage_output) || defined($miscleavage_output) || defined($modification_output) || defined($peptide_output))
|
|
143 ) {
|
|
144 $logger->fatal('You need to specify at least a DB & fragments file as inputs and at least one output file.');
|
|
145 _Usage();
|
|
146 exit;
|
|
147 }
|
|
148 if (defined($db_format)) {
|
|
149 unless ($db_format eq 'UniProtKB DAT' or $db_format eq 'FASTA') {
|
|
150 $logger->fatal('Database format in unsupported format.');
|
|
151 $logger->fatal("\t" . 'Must be \'UniProtKB DAT\' or \'FASTA\'.');
|
|
152 $logger->fatal("\t" . 'But --dbf was ' . $db_format . '.');
|
|
153 exit;
|
|
154 }
|
|
155 }
|
|
156 if (defined($cleavage_aa)) {
|
|
157 unless ($cleavage_aa =~ m/^[*A-Z]$/) {
|
|
158 $logger->fatal('Cleavage amino acid in unsupported format.');
|
|
159 $cleavage_aa = undef;
|
|
160 }
|
|
161 }
|
|
162 if (defined($cleavage_terminus)) {
|
|
163 unless ($cleavage_terminus =~ m/^[*CN]$/) {
|
|
164 $logger->fatal('Cleavage terminus in unsupported format.');
|
|
165 $cleavage_terminus = undef;
|
|
166 }
|
|
167 }
|
|
168 if (defined($modified_aa)) {
|
|
169 unless ($modified_aa =~ m/^[*A-Z][a-z]+$/ || $modified_aa =~ m/^[a-z]+[*A-Z]$/) {
|
|
170 $logger->fatal('Modified amino acid in unsupported format.');
|
|
171 $modified_aa = undef;
|
|
172 }
|
|
173 }
|
|
174 #
|
|
175 # We need info to extract:
|
|
176 # * cleavage site sequence contexts or
|
|
177 # * modification site sequence contexts or
|
|
178 # * peptide sequence contexts
|
|
179 #
|
|
180 if (defined($cleavage_output) || defined($miscleavage_output)) {
|
|
181 unless (defined($cleavage_aa) && defined($cleavage_terminus)) {
|
|
182 $logger->fatal('If you want to retrieve (mis)cleavage site sequence contexts, you must provide valid values for --ca and --ct.');
|
|
183 $logger->debug('You specified --ca ' . $cleavage_aa . ' --ct ' . $cleavage_terminus . '.');
|
|
184 _Usage();
|
|
185 exit;
|
|
186 }
|
|
187 }
|
|
188 if (defined($modification_output)) {
|
|
189 unless (defined($modified_aa)) {
|
|
190 $logger->fatal('If you want to retrieve modifications site sequence contexts you must provide a valid value for --ma.');
|
|
191 $logger->debug('You specified --ma ' . $modified_aa . '.');
|
|
192 _Usage();
|
|
193 exit;
|
|
194 }
|
|
195 }
|
|
196 if (defined($peptide_output)) {
|
|
197 #
|
|
198 # We can work with defaults for all params.
|
|
199 #
|
|
200 }
|
|
201
|
|
202 # Make sure we don't overwrite the input.
|
|
203 foreach my $output ($cleavage_output, $miscleavage_output, $modification_output, $peptide_output) {
|
|
204 foreach my $input ($db, $db_alt, $fragments) {
|
|
205 if (defined($input) && defined($output) && $output eq $input) {
|
|
206 $logger->fatal('Output file ' . $output . ' is the same as input file ' . $input . '.');
|
|
207 $logger->fatal("\t" . 'Please choose a different file for the output.');
|
|
208 exit;
|
|
209 }
|
|
210 }
|
|
211 }
|
|
212 # Define defaults for optional arguments.
|
|
213 $index_id = (defined($index_id) && length($index_id) > 0 ? $index_id : 'AC');
|
|
214 $id_column = (defined($id_column) && length($id_column) > 0 ? $id_column : 1);
|
|
215 $peptide_column = (defined($peptide_column) && length($peptide_column) > 0 ? $peptide_column : 2);
|
|
216 $n_term_context_length = (defined($n_term_context_length) && length($n_term_context_length) > 0 ? $n_term_context_length : 5);
|
|
217 $c_term_context_length = (defined($c_term_context_length) && length($c_term_context_length) > 0 ? $c_term_context_length : 5);
|
|
218 $padding_character = (defined($padding_character) ? $padding_character : '-');
|
|
219 # Stripping of lowercase characters is disabled by default unless we search for modification site contexts in which case it must be enabled!
|
|
220 $strip_lc = (defined($strip_lc) ? $strip_lc : 0);
|
|
221 $strip_lc = (defined($modified_aa) ? 1 : $strip_lc);
|
|
222 unless (defined($miscleavage_distance) && $miscleavage_distance > 0) {
|
|
223 if ($n_term_context_length >= $c_term_context_length) {
|
|
224 $miscleavage_distance = $n_term_context_length;
|
|
225 } else {
|
|
226 $miscleavage_distance = $c_term_context_length;
|
|
227 }
|
|
228 }
|
|
229
|
|
230
|
|
231 #
|
|
232 # Check db files and fragments file.
|
|
233 #
|
|
234 unless (-e $db && -f $db && -r $db) {
|
|
235 $logger->fatal('Cannot read from database input file ' . $db . ': ' . $!);
|
|
236 exit;
|
|
237 }
|
|
238 if (defined($db_alt)) {
|
|
239 unless (-e $db_alt && -f $db_alt && -r $db_alt) {
|
|
240 $logger->fatal('Cannot read from alternative database input file ' . $db_alt . ': ' . $!);
|
|
241 exit;
|
|
242 }
|
|
243 }
|
|
244 unless (-e $fragments && -f $fragments && -r $fragments) {
|
|
245 $logger->fatal('Cannot read from fragments input file ' . $fragments . ': ' .$!);
|
|
246 exit;
|
|
247 }
|
|
248
|
|
249 #
|
|
250 # Create lookup table(s) of the entire database input file(s).
|
|
251 #
|
|
252 my @db_lookup_table_refs;
|
|
253 (my $db_seqs, $index_id) = _CreateLookupHash($db, $index_id, $db_format);
|
|
254 my $seq_count = scalar(keys(%{$db_seqs}));
|
|
255 $logger->info('Number of sequences in database lookup table: ' . $seq_count);
|
|
256 push(@db_lookup_table_refs, $db_seqs);
|
|
257 # Optional backup DB file.
|
|
258 if (defined($db_alt)) {
|
|
259 (my $db_seqs_alt, $index_id) = _CreateLookupHash($db_alt, $index_id, $db_format);
|
|
260 my $seq_count_alt = scalar(keys(%{$db_seqs_alt}));
|
|
261 $logger->info('Number of sequences in backup database lookup table: ' . $seq_count_alt);
|
|
262 push(@db_lookup_table_refs, $db_seqs_alt);
|
|
263 }
|
|
264
|
|
265 #
|
|
266 # Debugging loop
|
|
267 #
|
|
268 #foreach my $keyyy (keys(%{$db_seqs})) {
|
|
269 # $logger->fatal('Key: ' . $keyyy);
|
|
270 # $logger->fatal("\t" . 'AC: ' . ${$db_seqs}{$keyyy}{'AC'});
|
|
271 # $logger->fatal("\t" . 'ID: ' . ${$db_seqs}{$keyyy}{'ID'});
|
|
272 # $logger->fatal("\t" . 'SQ: ' . ${$db_seqs}{$keyyy}{'SQ'});
|
|
273 #}
|
|
274
|
|
275 #
|
|
276 # Lookup sequence fragment contexts and store results.
|
|
277 #
|
|
278 my ($counter_cleavages, $counter_miscleavages, $counter_modifications, $counter_peptides) = _ExtractSeqs(\@db_lookup_table_refs,
|
|
279 $fragments, $strip_lc, $id_column, $peptide_column,
|
|
280 $cleavage_output, $miscleavage_output, $modification_output, $peptide_output,
|
|
281 $cleavage_aa, $cleavage_terminus, $miscleavage_distance,
|
|
282 $n_term_context_length, $c_term_context_length,
|
|
283 $padding_character);
|
|
284 #
|
|
285 # Report result stats.
|
|
286 #
|
|
287 my $max_length = 0;
|
|
288 foreach my $counter ($counter_cleavages, $counter_miscleavages, $counter_modifications, $counter_peptides) {
|
|
289 $max_length = (length($counter) > $max_length) ? length($counter) : $max_length;
|
|
290 }
|
|
291 my $prefixed_counter = sprintf('%*s', $max_length, $counter_cleavages);
|
|
292 $logger->info('Extracted cleavage site sequence contexts: ' . $prefixed_counter . '.');
|
|
293 $prefixed_counter = sprintf('%*s', $max_length, $counter_miscleavages);
|
|
294 $logger->info('Extracted miscleavage site sequence contexts: ' . $prefixed_counter . '.');
|
|
295 $prefixed_counter = sprintf('%*s', $max_length, $counter_modifications);
|
|
296 $logger->info('Extracted modification site sequence contexts: ' . $prefixed_counter . '.');
|
|
297 $prefixed_counter = sprintf('%*s', $max_length, $counter_peptides);
|
|
298 $logger->info('Extracted peptide sequence contexts: ' . $prefixed_counter . '.');
|
|
299 $prefixed_counter = sprintf('%*s', $max_length, $counter_cleavages + $counter_miscleavages + $counter_modifications + $counter_peptides);
|
|
300 $logger->info('Total extracted sequence contexts: ' . $prefixed_counter . '.');
|
|
301 $logger->info('Finished!');
|
|
302
|
|
303 #
|
|
304 ##
|
|
305 ### Internal subs.
|
|
306 ##
|
|
307 #
|
|
308
|
|
309 sub _CreateLookupHash {
|
|
310
|
|
311 my ($file_path, $index_id, $file_type) = @_;
|
|
312
|
|
313 my $file_path_fh;
|
|
314 my %seqs;
|
|
315
|
|
316 if (defined($file_type)) {
|
|
317 $logger->info('Parsing ' . $file_type . ' file ' . $file_path . '...');
|
|
318 } elsif ($file_path =~ m/($ps?)([^\.]+)\.dat$/i) {
|
|
319 $file_type = 'UniProtKB DAT';
|
|
320 $logger->info('Parsing ' . $file_type . ' file ' . $file_path . '...');
|
|
321 } elsif ($file_path =~ m/($ps?)([^\.]+)\.fa(sta)?$/i) {
|
|
322 $file_type = 'FASTA';
|
|
323 # For FASTA files we don't know what to expect:
|
|
324 # IDs or ACcession numbers or a mix of both.
|
|
325 # (Re)set index_id type to AC to prevent adding redundant IDs to the results.
|
|
326 # Hence treat all identifiers stable and unstable as ACs.
|
|
327 # (Normally we default to ACcession numbers and if we would have found an AC
|
|
328 # and the input fragments file was using IDs, we would append the AC to the results.)
|
|
329 $index_id = 'AC';
|
|
330 $logger->info('Parsing ' . $file_type . ' file ' . $file_path . '...');
|
|
331 } else {
|
|
332 $file_type = 'FASTA';
|
|
333 $index_id = 'AC';
|
|
334 $logger->warn('Could not determine database file type based on the file name extension of ' . $file_path);
|
|
335 $logger->warn('Will assume it is a FASTA file and hope that is correct...');
|
|
336 $logger->info('Parsing ' . $file_type . ' file ' . $file_path . '...');
|
|
337 }
|
|
338
|
|
339 if ($file_type eq 'UniProtKB DAT') {
|
|
340
|
|
341 #
|
|
342 # Read an entire record delimited by // at a time.
|
|
343 #
|
|
344 local $/ = "\/\/\n";
|
|
345
|
|
346 eval {
|
|
347 open($file_path_fh, "<$file_path");
|
|
348 };
|
|
349 if ($@) {
|
|
350 $logger->fatal('Cannot read from database input file: ' . $@);
|
|
351 exit;
|
|
352 }
|
|
353
|
|
354 while (<$file_path_fh>){
|
|
355
|
|
356 $logger->trace('Record:' . "\n$_\n");
|
|
357
|
|
358 my $entry = SWISS::Entry->fromText($_);
|
|
359
|
|
360 my $id = $entry->ID;
|
|
361 my $acc = $entry->AC;
|
|
362 my $seq = $entry->SQ;
|
|
363 my $index_key;
|
|
364
|
|
365 if ($index_id eq 'ID') {
|
|
366 $index_key = $id;
|
|
367 } elsif ($index_id eq 'AC') {
|
|
368 $index_key = $acc;
|
|
369 } else {
|
|
370 $logger->fatal('Unknown type of identifier used for indexing: ' . $index_id . '. Must be ID or AC.');
|
|
371 exit;
|
|
372 }
|
|
373
|
|
374 $logger->debug('Found accession ' . $acc);
|
|
375 $logger->trace('Found accession ' . $acc . ' with ID ' . $id . ' and seq ' . $seq);
|
|
376
|
|
377 $seqs{$index_key}{'AC'} = $acc;
|
|
378 $seqs{$index_key}{'ID'} = $id;
|
|
379 $seqs{$index_key}{'SQ'} = $seq;
|
|
380
|
|
381 }
|
|
382
|
|
383 } elsif ($file_type eq 'FASTA') {
|
|
384
|
|
385 my @header_ids = ();
|
|
386 my $seq = '';
|
|
387
|
|
388 eval {
|
|
389 open($file_path_fh, "<$file_path");
|
|
390 };
|
|
391 if ($@) {
|
|
392 $logger->fatal('Cannot read from database input file: ' . $@);
|
|
393 exit;
|
|
394 }
|
|
395
|
|
396 LINE: while (my $line = <$file_path_fh>) {
|
|
397
|
|
398 if ($line =~ m/^[\s\n\r]+$/) {
|
|
399
|
|
400 #
|
|
401 # Skip blank line
|
|
402 #
|
|
403 $logger->debug('Found empty line in FASTA file.');
|
|
404 next LINE;
|
|
405
|
|
406 } elsif ($line =~ m/^[a-zA-Z]+/) {
|
|
407
|
|
408 #
|
|
409 # It's a sequence line
|
|
410 # Remove all white space and new line characters.
|
|
411 #
|
|
412 $logger->debug('Found FASTA sequence line: ' . $line);
|
|
413
|
|
414 $line =~ s/[\s\n\r]+//g;
|
|
415 $seq .= $line;
|
|
416
|
|
417 } elsif ($line =~ m/^>/) {
|
|
418
|
|
419 $logger->debug('Found new FASTA header line: ' . $line);
|
|
420
|
|
421 if ((length($seq) > 0) && defined($header_ids[0])) {
|
|
422
|
|
423 #
|
|
424 # Store the previously found sequence in the lookup table.
|
|
425 #
|
|
426 foreach my $header_id (@header_ids) {
|
|
427
|
|
428 #
|
|
429 # We don't know if the ID we found is an accession number (stable ID) or a normal (unstable) ID
|
|
430 # -> Fill both the AC and ID fields with the same ID value.
|
|
431 #
|
|
432 $seqs{$header_id}{'AC'} = $header_id;
|
|
433 $seqs{$header_id}{'ID'} = $header_id;
|
|
434 $seqs{$header_id}{'SQ'} = $seq;
|
|
435
|
|
436 $logger->debug('Stored ' . $header_id . ':');
|
|
437 $logger->debug("\t" . 'SQ ' . $seq);
|
|
438
|
|
439 }
|
|
440
|
|
441 #
|
|
442 # Reset vars.
|
|
443 #
|
|
444 $seq = '';
|
|
445 @header_ids = ();
|
|
446
|
|
447 }
|
|
448
|
|
449 #
|
|
450 # Check for the presence of some frequently occurring naming schemes:
|
|
451 #
|
|
452 # >IPI:CON_Trypsin|SWISS-PROT:P00761|TRYP_PIG Trypsin - Sus scrofa (Pig).
|
|
453 # >IPI:CON_IPI00174775.2|TREMBL:Q32MB2;Q86Y46 Tax_Id=9606 Gene_Symbol=KRT73 Keratin-73
|
|
454 # >sp|Q42592|APXS_ARATH L-ascorbate peroxidase S, chloroplastic/mitochondrial;
|
|
455 # >jgi|Triad1|1|gw1.3.1.1
|
|
456 #
|
|
457 # The FASTA header line can basically have any format.
|
|
458 # Therefore we try to see if the IDs we are looking for are present anywhere
|
|
459 # in the header line up until the first white space or new line.
|
|
460 # This should prevent matching annotation from other proteins in the description
|
|
461 # like a for example a 'best' BLAST hit that contains one of the IDs we
|
|
462 # are looking for. Hence, in such a case this sequence is *not* the one of
|
|
463 # the IDs we looking for, but similar to that one at best.
|
|
464 #
|
|
465 my @unfiltered_ids;
|
|
466 if ($line =~ m/^>((([^\s;|]+)[;|])*([^\s;|]+))[|;]?/i) {
|
|
467
|
|
468 #
|
|
469 # Got multiple IDs
|
|
470 #
|
|
471 my $multiple_ids = $1;
|
|
472 @unfiltered_ids = split(/[\s;|]+/, $multiple_ids);
|
|
473
|
|
474 } elsif ($line =~ m/^>([^\s]+)/i){
|
|
475
|
|
476 #
|
|
477 # Got a single ID
|
|
478 #
|
|
479 my $single_id = $1;
|
|
480 push(@unfiltered_ids, $single_id);
|
|
481
|
|
482 }
|
|
483
|
|
484 KEY:foreach my $unfiltered_id (@unfiltered_ids) {
|
|
485
|
|
486 my $this_id = $unfiltered_id;
|
|
487
|
|
488 #
|
|
489 # Check for RockerBox formattted IDs.
|
|
490 #
|
|
491 # Example of RockerBox ID format, which may contain
|
|
492 # * multiple IDs separated by semi-colons and
|
|
493 # * suffixed with the position of the peptide in the protein:
|
|
494 # YGR192C[123-142]; YJR009C[123-142]
|
|
495 if ($this_id =~ m/^([^\[\]]+)\[\d+-\d+\]$/i) {
|
|
496 $logger->trace('Protein ID in RockerBox format: ' . $this_id);
|
|
497 $this_id = $1;
|
|
498 $logger->trace("\t" . 'Using ID : ' . $this_id);
|
|
499 }
|
|
500
|
|
501 #
|
|
502 # Check for DB namespace prefixes.
|
|
503 #
|
|
504 if ($this_id =~ m/([^:]+):([^:]+)/i) {
|
|
505
|
|
506 #
|
|
507 # Namespace prefixed ID.
|
|
508 #
|
|
509 $logger->trace('Namespace prefixed ID: ' . $this_id);
|
|
510
|
|
511 my $prefix = $1;
|
|
512 $this_id = $2;
|
|
513
|
|
514 $logger->trace("\t" . 'Using ID: ' . $this_id);
|
|
515
|
|
516 #
|
|
517 # For a selected list of "known" database namespace prefixes,
|
|
518 # check for versioned accession numbers and remove the version number.
|
|
519 #
|
|
520 if (defined($prefix) && defined($databases_with_optionally_versioned_ids{$prefix})) {
|
|
521 $logger->trace('Found prefix: ' . $prefix);
|
|
522 my $separator = $databases_with_optionally_versioned_ids{$prefix};
|
|
523 $separator = '\\' . $separator;
|
|
524 if ($this_id =~ m/([^$separator]+)$separator\d+$/) {
|
|
525 $this_id = $1;
|
|
526 }
|
|
527 }
|
|
528 }
|
|
529
|
|
530 $logger->debug('Found ID: ' . $this_id);
|
|
531 push(@header_ids, $this_id);
|
|
532
|
|
533 }
|
|
534
|
|
535 # if ($line =~ m/^>((([^\s\n\r:;|]+)[:]([^\s\n\r:|]+)[|;])*([^\s\n\r:;|]+)[:]([^\s\n\r:|]+))[|;]?(\s+(.+))?/i) {
|
|
536 #
|
|
537 # #
|
|
538 # # One or more namespace prefixed IDs.
|
|
539 # #
|
|
540 # my $concatenated_namespace_prefixed_ids = $1;
|
|
541 # $logger->debug('Found prefixed IDs in header: ' . $concatenated_namespace_prefixed_ids);
|
|
542 #
|
|
543 # # database_namespace = $3 && $5;
|
|
544 # # id = $4 && $6;
|
|
545 # # description = $8;
|
|
546 # my @namespace_prefixed_ids = split(/[|;]/, $concatenated_namespace_prefixed_ids);
|
|
547 #
|
|
548 # foreach my $prefixed_id (@namespace_prefixed_ids) {
|
|
549 #
|
|
550 # $logger->trace('Prefixed ID: ' . $prefixed_id);
|
|
551 #
|
|
552 # if ($prefixed_id =~ m/(([^\s\n\r:;|]+):)?([^\s\n\r:|]+)/i) {
|
|
553 #
|
|
554 # my $this_prefix = $2;
|
|
555 # my $this_id = $3;
|
|
556 # $logger->debug('Found ID: ' . $this_id);
|
|
557 # push(@header_ids, $this_id);
|
|
558 #
|
|
559 # #
|
|
560 # # For a selected list of "known" database namespace prefixes,
|
|
561 # # check for versioned accession numbers and - if found - add
|
|
562 # # both the versioned and the unversioned accession number
|
|
563 # # to the list of header IDs.
|
|
564 # #
|
|
565 # if (defined($this_prefix) && defined($databases_with_optionally_versioned_ids{$this_prefix})) {
|
|
566 # $logger->trace('Found prefix: ' . $this_prefix);
|
|
567 # my $separator = $databases_with_optionally_versioned_ids{$this_prefix};
|
|
568 # $separator = '\\' . $separator;
|
|
569 # if ($this_id =~ m/([^$separator]+)$separator\d+$/) {
|
|
570 # my $this_unversioned_id = $1;
|
|
571 # $logger->debug('Found unversioned ID: ' . $this_unversioned_id);
|
|
572 # push(@header_ids, $this_unversioned_id);
|
|
573 # }
|
|
574 # }
|
|
575 #
|
|
576 # } else {
|
|
577 #
|
|
578 # $logger->warn(
|
|
579 # 'This should have been an optionally prefixed ID, '
|
|
580 # . 'but failed to match the corresponding regex: '
|
|
581 # . $prefixed_id);
|
|
582 #
|
|
583 # }
|
|
584 # }
|
|
585 #
|
|
586 # } elsif ($line =~ m/^>((([^\s\n\r:;|]+)[|;])*([^\s\n\r:;|]+))[|;]?(\s+(.+))?/i) {
|
|
587 #
|
|
588 # #
|
|
589 # # One or more unprefixed IDs.
|
|
590 # #
|
|
591 # my $concatenated_ids = $1;
|
|
592 # $logger->debug('Found unprefixed IDs in header: ' . $concatenated_ids);
|
|
593 #
|
|
594 # # id = $3 && $4;
|
|
595 # # description = $7;
|
|
596 # my @unprefixed_ids = split(/[|;]/, $concatenated_ids);
|
|
597 #
|
|
598 # foreach my $unprefixed_id (@unprefixed_ids) {
|
|
599 #
|
|
600 # $logger->debug('Found ID: ' . $unprefixed_id);
|
|
601 # push(@header_ids, $unprefixed_id);
|
|
602 #
|
|
603 # }
|
|
604 #
|
|
605 # } else {
|
|
606 #
|
|
607 # #
|
|
608 # # Something else.
|
|
609 # #
|
|
610 # # The FASTA header line can basically have any format,
|
|
611 # # so this is probably one of the less frequent occurring annotation schemes.
|
|
612 # # Therefore we try to see if the IDs we are looking for are present anywhere
|
|
613 # # in the header line up until the first white space or new line.
|
|
614 # # This may be tricky as we might match other annotation from other proteins
|
|
615 # # like a description from a 'best' BLAST hit that contains one of the IDs we
|
|
616 # # are looking for. Hence, in such a case this sequence is *not* the one of
|
|
617 # # the IDs we looking for, but similar to that one at best.
|
|
618 # #
|
|
619 # if ($line =~ m/>([^\n\r\s]+)/) {
|
|
620 #
|
|
621 # my $putative_id = $1;
|
|
622 # $logger->debug('Found putative ID in header: ' . $putative_id);
|
|
623 # push(@header_ids, $putative_id);
|
|
624 #
|
|
625 # } else {
|
|
626 # $logger->warn('Cannot identify IDs in this header: ' . $line);
|
|
627 # }
|
|
628 # }
|
|
629 }
|
|
630 }
|
|
631
|
|
632 #
|
|
633 # Store the last found sequence in the lookup table.
|
|
634 #
|
|
635 foreach my $header_id (@header_ids) {
|
|
636
|
|
637 #
|
|
638 # We don't know if the ID we found is an accession number (stable ID) or a normal (unstable) ID
|
|
639 # -> Fill both the AC and ID fields with the same ID value.
|
|
640 #
|
|
641 $seqs{$header_id}{'AC'} = $header_id;
|
|
642 $seqs{$header_id}{'ID'} = $header_id;
|
|
643 $seqs{$header_id}{'SQ'} = $seq;
|
|
644
|
|
645 $logger->debug('Stored ' . $header_id . ':');
|
|
646 $logger->debug("\t" . 'SQ ' . $seq);
|
|
647
|
|
648 }
|
|
649
|
|
650 #
|
|
651 # Reset vars.
|
|
652 #
|
|
653 $seq = '';
|
|
654 @header_ids = ();
|
|
655
|
|
656 }
|
|
657
|
|
658 close($file_path_fh);
|
|
659 $logger->info('Created lookup list for database sequences.');
|
|
660
|
|
661 return (\%seqs, $index_id);
|
|
662
|
|
663 }
|
|
664
|
|
665 sub _ExtractSeqs {
|
|
666
|
|
667 my ($db_lookup_table_refs,
|
|
668 $path_from, $strip_lc, $id_column, $peptide_column,
|
|
669 $cleavage_path_to, $miscleavage_path_to, $modification_path_to, $peptide_path_to,
|
|
670 $cleavage_aa, $cleavage_terminus, $miscleavage_distance,
|
|
671 $n_term_context_length, $c_term_context_length,
|
|
672 $padding_character) = @_;
|
|
673
|
|
674 $logger->debug('Parsing ' . $path_from . '...');
|
|
675
|
|
676 my $extracted_seq_contexts_cle = 0;
|
|
677 my $extracted_seq_contexts_mis = 0;
|
|
678 my $extracted_seq_contexts_mod = 0;
|
|
679 my $extracted_seq_contexts_pep = 0;
|
|
680 my $path_from_fh;
|
|
681 my $cleavage_path_to_fh;
|
|
682 my $miscleavage_path_to_fh;
|
|
683 my $modification_path_to_fh;
|
|
684 my $peptide_path_to_fh;
|
|
685
|
|
686 eval {
|
|
687 open($path_from_fh, "<$path_from");
|
|
688 };
|
|
689 if ($@) {
|
|
690 $logger->fatal('Cannot read from fragments input file: ' . $@);
|
|
691 exit;
|
|
692 }
|
|
693 if (defined($cleavage_path_to)) {
|
|
694 eval {
|
|
695 open($cleavage_path_to_fh, ">$cleavage_path_to");
|
|
696 };
|
|
697 if ($@) {
|
|
698 $logger->fatal('Cannot write to output file: ' . $@);
|
|
699 exit;
|
|
700 }
|
|
701 }
|
|
702 if (defined($miscleavage_path_to)) {
|
|
703 eval {
|
|
704 open($miscleavage_path_to_fh, ">$miscleavage_path_to");
|
|
705 };
|
|
706 if ($@) {
|
|
707 $logger->fatal('Cannot write to output file: ' . $@);
|
|
708 exit;
|
|
709 }
|
|
710 }
|
|
711 if (defined($modification_path_to)) {
|
|
712 eval {
|
|
713 open($modification_path_to_fh, ">$modification_path_to");
|
|
714 };
|
|
715 if ($@) {
|
|
716 $logger->fatal('Cannot write to output file: ' . $@);
|
|
717 exit;
|
|
718 }
|
|
719 }
|
|
720 if (defined($peptide_path_to)) {
|
|
721 eval {
|
|
722 open($peptide_path_to_fh, ">$peptide_path_to");
|
|
723 };
|
|
724 if ($@) {
|
|
725 $logger->fatal('Cannot write to output file: ' . $@);
|
|
726 exit;
|
|
727 }
|
|
728 }
|
|
729
|
|
730 LINE:while (my $line = <$path_from_fh>) {
|
|
731
|
|
732 #
|
|
733 # Remove (trailing) line end characters!
|
|
734 #
|
|
735 $line =~ s/[\n\r\f]+//g;
|
|
736 $logger->trace($line);
|
|
737
|
|
738 my @column_values = split(/\t/, $line);
|
|
739
|
|
740 # For debugging only.
|
|
741 #foreach my $val (@column_values) {
|
|
742 # $logger->debug('Col val: ' . $val);
|
|
743 #}
|
|
744
|
|
745 my $id_column_index = $id_column - 1;
|
|
746 my $peptide_column_index = $peptide_column - 1;
|
|
747 my $unfiltered_key = $column_values[$id_column_index];
|
|
748 my $key;
|
|
749 my @keys;
|
|
750 my $peptide = $column_values[$peptide_column_index];
|
|
751 my $peptide_nonmod;
|
|
752 my $protein;
|
|
753 my $acc;
|
|
754 my $id;
|
|
755 my $index = -1; # An index < 0 indicates the peptide was not found in a protein sequence (yet).
|
|
756 my $length;
|
|
757 my $cleavage_peptide_contexts = [];
|
|
758 my $miscleavage_peptide_contexts = [];
|
|
759 my $modification_peptide_contexts = [];
|
|
760 my $peptide_contexts = [];
|
|
761
|
|
762 #
|
|
763 # Check if $key and $peptide were present in this line.
|
|
764 # If not this may be a leading/trailing empty or meta-data line,
|
|
765 # that does not contain the correct amount of columns.
|
|
766 #
|
|
767 unless (defined($unfiltered_key) && defined($peptide)) {
|
|
768 $logger->warn('Skipping line: ' . $line);
|
|
769 $logger->warn("\t" . '(Peptide sequence and/or protein ID missing or incorrect amount of columns.)');
|
|
770 next LINE;
|
|
771 }
|
|
772
|
|
773 #
|
|
774 # Sanity check for peptide sequences.
|
|
775 #
|
|
776 unless ($peptide =~ m/([a-zA-Z]+)/) {
|
|
777 $logger->warn('Fragment file line in unexpected format. Cannot find peptide in: ' . $line);
|
|
778 $logger->debug('$peptide contains: ' . $peptide);
|
|
779 next LINE;
|
|
780 }
|
|
781
|
|
782 if ($strip_lc) {
|
|
783 # Remove lower case characters representing modifications.
|
|
784 $peptide_nonmod = $peptide;
|
|
785 $peptide_nonmod =~ s/[a-z]+//g;
|
|
786 } else {
|
|
787 # Convert everything to uppercase.
|
|
788 $peptide_nonmod = uc($peptide);
|
|
789 }
|
|
790
|
|
791 #
|
|
792 # Figure out in what format the Protein ID(s) were supplied.
|
|
793 #
|
|
794 # * Strip off optional DB namespace prefixes.
|
|
795 # * Handle optional accession number version suffixes.
|
|
796 #
|
|
797 if ($unfiltered_key =~ m/^((([^\s;|]+)[\s;|]+)*([^\s;|]+))[\s|;]?/i) {
|
|
798
|
|
799 #
|
|
800 # Got multiple IDs
|
|
801 #
|
|
802 my $multiple_ids = $1;
|
|
803 @keys = split(/[\s;|]+/, $multiple_ids);
|
|
804
|
|
805 } else {
|
|
806
|
|
807 #
|
|
808 # Got a single ID
|
|
809 #
|
|
810 push(@keys, $unfiltered_key);
|
|
811
|
|
812 }
|
|
813
|
|
814 KEY:foreach $unfiltered_key (@keys) {
|
|
815
|
|
816 $key = $unfiltered_key;
|
|
817
|
|
818 #
|
|
819 # Check for RockerBox formattted IDs.
|
|
820 #
|
|
821 # Example of RockerBox ID format, which may contain
|
|
822 # * multiple IDs separated by semi-colons and
|
|
823 # * suffixed with the position of the peptide in the protein:
|
|
824 # YGR192C[123-142]; YJR009C[123-142]
|
|
825 if ($key =~ m/^([^\[]+)\[\d+-\d+\]$/i) {
|
|
826 $logger->trace('Protein ID in RockerBox format: ' . $key);
|
|
827 $key = $1;
|
|
828 $logger->trace("\t" . 'Using ID : ' . $key);
|
|
829 }
|
|
830
|
|
831 #
|
|
832 # Check for DB namespace prefixes.
|
|
833 #
|
|
834 if ($key =~ m/([^:]+):([^:]+)/i) {
|
|
835
|
|
836 #
|
|
837 # Namespace prefixed ID.
|
|
838 #
|
|
839 $logger->trace('Namespace prefixed ID: ' . $key);
|
|
840
|
|
841 my $prefix = $1;
|
|
842 $key = $2;
|
|
843
|
|
844 $logger->trace("\t" . 'Using ID: ' . $key);
|
|
845
|
|
846 #
|
|
847 # For a selected list of "known" database namespace prefixes,
|
|
848 # check for versioned accession numbers and remove the version number.
|
|
849 #
|
|
850 if (defined($prefix) && defined($databases_with_optionally_versioned_ids{$prefix})) {
|
|
851 $logger->trace('Found prefix: ' . $prefix);
|
|
852 my $separator = $databases_with_optionally_versioned_ids{$prefix};
|
|
853 $separator = '\\' . $separator;
|
|
854 if ($key =~ m/([^$separator]+)$separator\d+$/) {
|
|
855 $key = $1;
|
|
856 }
|
|
857 }
|
|
858 }
|
|
859
|
|
860 $logger->debug('Found ID: ' . $key);
|
|
861
|
|
862 #
|
|
863 # Sanity check for Protein ID.
|
|
864 #
|
|
865 unless ($key =~ m/^([A-Z0-9\._-]+)$/i) {
|
|
866 $logger->warn('Fragment file line in unexpected format. Cannot find sequence ID in:');
|
|
867 $logger->warn("\t" . $line);
|
|
868 $logger->debug('$key contains: ' . $key);
|
|
869 next LINE;
|
|
870 }
|
|
871
|
|
872 #
|
|
873 # Lookup info for the protein this peptide was derived from.
|
|
874 #
|
|
875 ($protein, $acc, $id, $index) = _LookupProteinInfo($key, $peptide_nonmod, $db_lookup_table_refs);
|
|
876
|
|
877 #
|
|
878 # Check if the peptide (fragment) was found in this protein sequence.
|
|
879 #
|
|
880 if (defined($protein)) {
|
|
881 if ($index > -1) {
|
|
882 $logger->trace('Found peptide sequence in protein ' . $key);
|
|
883 last KEY;
|
|
884 } else {
|
|
885 $logger->warn('Cannot find peptide (' . $peptide_nonmod . ') in protein ' . $key);
|
|
886 $logger->warn("\t" . 'Will try other protein IDs for this peptide if available.');
|
|
887 }
|
|
888 } else {
|
|
889 $logger->warn('Cannot find protein identifier ' . $key . ' in any of the supplied databases.');
|
|
890 $logger->warn("\t" . 'Will try other protein IDs for this peptide if available.');
|
|
891 }
|
|
892 }
|
|
893
|
|
894 #
|
|
895 # Check if the peptide (fragment) was found in any of the associated protein sequences.
|
|
896 #
|
|
897 if (defined($protein)) {
|
|
898
|
|
899 $logger->debug('Found protein ' . $key);
|
|
900
|
|
901 if ($index > -1) {
|
|
902
|
|
903 $logger->debug('Found peptide (' . $peptide_nonmod . ') in protein ' . $key);
|
|
904
|
|
905 if (defined($cleavage_path_to)) {
|
|
906 ($cleavage_peptide_contexts) = _GetCleavageSequenceContexts($peptide_nonmod, $protein, $key, $index,
|
|
907 $cleavage_aa, $cleavage_terminus);
|
|
908 }
|
|
909 if (defined($miscleavage_path_to)) {
|
|
910 ($miscleavage_peptide_contexts) = _GetMiscleavageSequenceContexts($peptide_nonmod, $protein, $key, $index,
|
|
911 $cleavage_aa, $cleavage_terminus, $miscleavage_distance);
|
|
912 }
|
|
913 if (defined($modification_path_to)) {
|
|
914 ($modification_peptide_contexts) = _GetModificationSequenceContexts($peptide_nonmod, $protein, $key, $index,
|
|
915 $modified_aa, $peptide);
|
|
916 }
|
|
917 if (defined($peptide_path_to)) {
|
|
918 ($peptide_contexts) = _GetPeptideSequenceContexts($peptide_nonmod, $protein, $key, $index);
|
|
919 }
|
|
920 } else {
|
|
921 $logger->error('Cannot find peptide (' . $peptide_nonmod . ') in protein ' . $key);
|
|
922 $acc = 'NA';
|
|
923 }
|
|
924 } else {
|
|
925 $logger->error('Cannot find protein "' . $unfiltered_key . '" in any of the supplied databases.');
|
|
926 $acc = 'NA';
|
|
927 }
|
|
928
|
|
929 #
|
|
930 # Append the peptide contexts to the existing lines of the fragment file and save that new line to the output file.
|
|
931 #
|
|
932 $extracted_seq_contexts_cle = _SaveSequenceContexts($line, $acc, $cleavage_peptide_contexts,
|
|
933 $cleavage_path_to_fh, $extracted_seq_contexts_cle);
|
|
934
|
|
935 $extracted_seq_contexts_mis = _SaveSequenceContexts($line, $acc, $miscleavage_peptide_contexts,
|
|
936 $miscleavage_path_to_fh, $extracted_seq_contexts_mis);
|
|
937
|
|
938 $extracted_seq_contexts_mod = _SaveSequenceContexts($line, $acc, $modification_peptide_contexts,
|
|
939 $modification_path_to_fh, $extracted_seq_contexts_mod);
|
|
940
|
|
941 $extracted_seq_contexts_pep = _SaveSequenceContexts($line, $acc, $peptide_contexts,
|
|
942 $peptide_path_to_fh, $extracted_seq_contexts_pep);
|
|
943
|
|
944 }
|
|
945
|
|
946 #
|
|
947 # Close file handles.
|
|
948 #
|
|
949 foreach my $fh ($path_from_fh, $cleavage_path_to_fh, $miscleavage_path_to_fh, $modification_path_to_fh, $peptide_path_to_fh) {
|
|
950 if (defined($fh)) {
|
|
951 close($fh);
|
|
952 }
|
|
953 }
|
|
954
|
|
955 return($extracted_seq_contexts_cle, $extracted_seq_contexts_mis, $extracted_seq_contexts_mod, $extracted_seq_contexts_pep);
|
|
956
|
|
957 }
|
|
958
|
|
959 sub _LookupProteinInfo {
|
|
960
|
|
961 my ($key, $peptide, $db_lookup_table_refs) = @_;
|
|
962
|
|
963 my $protein_found_in_db = 0;
|
|
964 my $protein;
|
|
965 my $acc;
|
|
966 my $id;
|
|
967 my $index;
|
|
968
|
|
969 DB_LOOKUP:foreach my $db_lookup_table_ref (@{$db_lookup_table_refs}) {
|
|
970
|
|
971 if (defined(${$db_lookup_table_ref}{$key})) {
|
|
972
|
|
973 $protein_found_in_db = 1;
|
|
974 $logger->debug('Found protein ' . $key . ' in this sequence DB.');
|
|
975
|
|
976 $protein = ${$db_lookup_table_ref}{$key}{'SQ'};
|
|
977 $acc = ${$db_lookup_table_ref}{$key}{'AC'};
|
|
978 $id = ${$db_lookup_table_ref}{$key}{'ID'};
|
|
979 #
|
|
980 # Find peptide (fragment) in DB protein sequence.
|
|
981 #
|
|
982 $index = index($protein, $peptide);
|
|
983
|
|
984 if ($index > -1) {
|
|
985
|
|
986 #
|
|
987 # Found peptide!
|
|
988 #
|
|
989 $logger->debug('Found peptide ' . $peptide . ' at position ' . $index . ' in protein ' . $key);
|
|
990 last DB_LOOKUP;
|
|
991
|
|
992 }
|
|
993
|
|
994 #
|
|
995 # If the input was based on mass spectrometry data we cannot see
|
|
996 # the difference between I (Isolucene) and L (Leucine) as the both
|
|
997 # weigh the same. Let's try an I agnostic search by substituting all
|
|
998 # Is for Ls.
|
|
999 #
|
|
1000 my $protein_l_only = $protein;
|
|
1001 my $peptide_l_only = $peptide;
|
|
1002 $protein_l_only =~ s/I/L/g;
|
|
1003 $peptide_l_only =~ s/I/L/g;
|
|
1004 $index = index($protein_l_only, $peptide_l_only);
|
|
1005
|
|
1006 if ($index > -1) {
|
|
1007
|
|
1008 #
|
|
1009 # Found peptide!
|
|
1010 #
|
|
1011 $logger->debug('Found peptide ' . $peptide . ' at position ' . $index . ' in protein ' . $key);
|
|
1012 $logger->debug("\t" . 'Had to treat I and L as the same amino acid in the DB search to get a match.');
|
|
1013 last DB_LOOKUP;
|
|
1014
|
|
1015 }
|
|
1016
|
|
1017 #
|
|
1018 # Cannot find peptide :(
|
|
1019 #
|
|
1020 $protein = undef;
|
|
1021 $acc = undef;
|
|
1022 $id = undef;
|
|
1023
|
|
1024 $logger->debug('Cannot find peptide (' . $peptide . ') in protein ' . $key);
|
|
1025 $logger->debug("\t" . 'This may be the result of an updated protein sequence in the database.');
|
|
1026 $logger->debug("\t" . 'Will try to find the peptide in other databases (if available).');
|
|
1027
|
|
1028 if ($strip_lc) {
|
|
1029 $logger->debug("\t" . '(Stripping of lowercase characters, indicating modifications in the peptide sequence, was enabled.)');
|
|
1030 } else {
|
|
1031 $logger->debug("\t" . '(Stripping of lowercase characters, indicating modifications in the peptide sequence, was *not* enabled.)');
|
|
1032 }
|
|
1033
|
|
1034 } else {
|
|
1035 $logger->debug('Protein ' . $key . ' missing from this sequence DB. Will try other databases (if available).');
|
|
1036 }
|
|
1037 }
|
|
1038
|
|
1039 #
|
|
1040 # Check if the peptide (fragment) was found in a DB protein sequence.
|
|
1041 #
|
|
1042 if ($protein_found_in_db == 1) {
|
|
1043 unless ($index > -1) {
|
|
1044 $logger->warn('Cannot find peptide (' . $peptide . ') in protein ' . $key);
|
|
1045 $logger->warn("\t" . 'Will try to find this peptide (' . $peptide . ') using other associated protein identifiers (if available).');
|
|
1046 }
|
|
1047 } else {
|
|
1048 $logger->warn('Cannot find protein ' . $key . ' in any of the supplied databases.');
|
|
1049 }
|
|
1050
|
|
1051 return($protein, $acc, $id, $index);
|
|
1052 }
|
|
1053
|
|
1054 sub _GetCleavageSequenceContexts {
|
|
1055
|
|
1056 my ($peptide, $protein, $key, $index, $cleavage_aa, $cleavage_terminus) = @_;
|
|
1057
|
|
1058 my %peptide_contexts;
|
|
1059 $peptide_contexts{'n_term'}{'context'} = '';
|
|
1060 $peptide_contexts{'c_term'}{'context'} = '';
|
|
1061 my $max_context_length = $n_term_context_length + $c_term_context_length;
|
|
1062
|
|
1063 foreach my $peptide_term (keys(%peptide_contexts)) {
|
|
1064
|
|
1065 #
|
|
1066 # Get the offset for the sequence contexts to retrieve from this peptide terminus
|
|
1067 # or drop the context for this peptide terminus if it is also the protein terminus.
|
|
1068 #
|
|
1069 if ($peptide_term eq 'n_term') {
|
|
1070 if ($index > 0) {
|
|
1071 # This is an N-terminal peptide context.
|
|
1072 my $offset = $index - $n_term_context_length;
|
|
1073 $peptide_contexts{$peptide_term}{'offset'} = $offset;
|
|
1074 $logger->debug('Offset = ' . $offset . ' for ' . $peptide_term . ' of peptide ' . $peptide . ' in protein ' . $key . '.');
|
|
1075 } else {
|
|
1076 # Peptide N-terminus == Protein N-terminus.
|
|
1077 $logger->warn('Peptide N-terminus = protein N-terminus. Dropping N-terminal sequence context for peptide ' . $peptide .
|
|
1078 ' in protein ' . $key . '.');
|
|
1079 next;
|
|
1080 }
|
|
1081 } elsif ($peptide_term eq 'c_term') {
|
|
1082 if (($index + length($peptide)) < length($protein)) {
|
|
1083 # This is a C-terminal peptide context.
|
|
1084 my $offset = ($index + length($peptide))- $n_term_context_length;
|
|
1085 $peptide_contexts{$peptide_term}{'offset'} = $offset;
|
|
1086 $logger->debug('Offset = ' . $offset . ' for ' . $peptide_term . ' of peptide ' . $peptide . ' in protein ' . $key . '.');
|
|
1087 } else {
|
|
1088 # Peptide C-terminus == Protein C-terminus.
|
|
1089 $logger->warn('Peptide C-terminus = protein C-terminus. Dropping C-terminal sequence context for peptide ' . $peptide .
|
|
1090 ' in protein ' . $key . '.');
|
|
1091 next;
|
|
1092 }
|
|
1093 }
|
|
1094
|
|
1095 #
|
|
1096 # Check if DB protein sequence is long enough on n-terminal side
|
|
1097 # to retrieve full length peptide contexts.
|
|
1098 # (c-term length is checked later...)
|
|
1099 #
|
|
1100 if ($peptide_contexts{$peptide_term}{'offset'} < 0) {
|
|
1101
|
|
1102 #
|
|
1103 # Protein too short for full length context: Calculate offset adjustment.
|
|
1104 #
|
|
1105 my $offset = $peptide_contexts{$peptide_term}{'offset'};
|
|
1106 my $offset_adjustment = -($offset);
|
|
1107 $peptide_contexts{$peptide_term}{'offset_adjustment'} = $offset_adjustment;
|
|
1108 $logger->debug('Offset ' . $offset . ' for ' . $peptide_term . ' of peptide ' . $peptide . ' in protein ' . $key . ' less than zero');
|
|
1109
|
|
1110 if (length($padding_character) == 1) {
|
|
1111
|
|
1112 #
|
|
1113 # Add N-terminal padding characters for contexts that are too short on the N-terminal side.
|
|
1114 #
|
|
1115 my $n_term_padding = "$padding_character" x $offset_adjustment;
|
|
1116 $peptide_contexts{$peptide_term}{'context'} = $n_term_padding;
|
|
1117 $logger->debug("\t" . 'Will add N-terminal padding: ' . $n_term_padding);
|
|
1118
|
|
1119 } else {
|
|
1120 $logger->debug('Padding was disabled. Will not add n-term padding.');
|
|
1121 }
|
|
1122
|
|
1123 $peptide_contexts{$peptide_term}{'offset'} = 0;
|
|
1124
|
|
1125 } else {
|
|
1126
|
|
1127 $peptide_contexts{$peptide_term}{'offset_adjustment'} = 0;
|
|
1128
|
|
1129 }
|
|
1130
|
|
1131 #
|
|
1132 # Extract sequence context from DB protein sequence.
|
|
1133 #
|
|
1134 my $peptide_context = substr($protein, $peptide_contexts{$peptide_term}{'offset'},
|
|
1135 $max_context_length - $peptide_contexts{$peptide_term}{'offset_adjustment'});
|
|
1136 # append the context extracted from the protein sequence,
|
|
1137 # because there may be already some N-terminal padding...
|
|
1138 $peptide_contexts{$peptide_term}{'context'} .= $peptide_context;
|
|
1139
|
|
1140 #
|
|
1141 # Quality Control: Check enzyme specificity for enzymes with known (= user supplied) specs.
|
|
1142 #
|
|
1143 unless ($cleavage_aa eq '*') {
|
|
1144
|
|
1145 $peptide_context = $peptide_contexts{$peptide_term}{'context'};
|
|
1146 my $p1 = substr($peptide_context, ($n_term_context_length -1), 1);
|
|
1147 my $p1_prime = substr($peptide_context, $n_term_context_length, 1);
|
|
1148
|
|
1149 if ($cleavage_terminus eq 'C') {
|
|
1150
|
|
1151 if ($p1 eq $cleavage_aa) {
|
|
1152 # Peptide fragment contains the AA recognised by the protease.
|
|
1153 $logger->debug('Specific protease activity -> Retaining peptide context.');
|
|
1154 } else {
|
|
1155 # Must be non-specific cleavage -> drop peptide context.
|
|
1156 $logger->warn('Non-specific cleavage: dropping ' . $peptide_term . ' peptide context ' . $peptide_context .
|
|
1157 ' for peptide ' . $peptide . ' in protein ' . $key);
|
|
1158 $peptide_contexts{$peptide_term}{'context'} = '';
|
|
1159 }
|
|
1160
|
|
1161 } elsif ($cleavage_terminus eq 'N') {
|
|
1162
|
|
1163 if ($p1_prime eq $cleavage_aa) {
|
|
1164 # Peptide fragment contains the AA recognised by the protease.
|
|
1165 $logger->debug('Specific protease activity -> Retaining peptide context.');
|
|
1166 } else {
|
|
1167 # Must be non-specific cleavage -> drop peptide context.
|
|
1168 $logger->warn('Non-specific cleavage: dropping ' . $peptide_term . ' peptide context ' . $peptide_context .
|
|
1169 ' for peptide ' . $peptide . ' in protein ' . $key);
|
|
1170 $peptide_contexts{$peptide_term}{'context'} = '';
|
|
1171 }
|
|
1172
|
|
1173 } elsif ($cleavage_terminus eq '*') {
|
|
1174
|
|
1175 if (($p1 eq $cleavage_aa) || ($p1_prime eq $cleavage_aa)) {
|
|
1176 # Peptide fragment contains the AA recognised by the protease.
|
|
1177 $logger->debug('Specific protease activity -> Retaining peptide context.');
|
|
1178 } else {
|
|
1179 # Must be non-specific cleavage -> drop peptide context.
|
|
1180 $logger->warn('Non-specific cleavage: dropping ' . $peptide_term . ' peptide context ' . $peptide_context .
|
|
1181 ' for peptide ' . $peptide . ' in protein ' . $key);
|
|
1182 $peptide_contexts{$peptide_term}{'context'} = '';
|
|
1183 }
|
|
1184
|
|
1185 } else {
|
|
1186 $logger->fatal('Unknown cleavage terminus ' . $cleavage_terminus . '. Must be C, N or *.');
|
|
1187 exit;
|
|
1188 }
|
|
1189 }
|
|
1190 }
|
|
1191
|
|
1192 push(my @contexts, $peptide_contexts{'n_term'}{'context'}, $peptide_contexts{'c_term'}{'context'});
|
|
1193
|
|
1194 if (length($padding_character) == 1) {
|
|
1195 _CheckSequenceContextLength(\@contexts, $peptide, $key, $max_context_length);
|
|
1196 } else {
|
|
1197 $logger->debug('Padding was disabled. Will not add c-term padding nor check sequence context length.');
|
|
1198 }
|
|
1199
|
|
1200 return(\@contexts);
|
|
1201
|
|
1202 }
|
|
1203
|
|
1204 sub _GetMiscleavageSequenceContexts {
|
|
1205
|
|
1206 my ($peptide, $protein, $key, $petide_index, $cleavage_aa, $cleavage_terminus, $miscleavage_distance) = @_;
|
|
1207
|
|
1208 my @peptide_contexts;
|
|
1209 my $max_context_length = $n_term_context_length + $c_term_context_length;
|
|
1210
|
|
1211 #
|
|
1212 # Find miscleavage sites in peptide.
|
|
1213 #
|
|
1214 # TODO: handle --mcd
|
|
1215 #
|
|
1216 my @amino_acids = split('', $peptide);
|
|
1217 my $aa_index = 0;
|
|
1218 #
|
|
1219 # Drop first and last amino acids as these are from cleavage sites or from the protein termini.
|
|
1220 # In either case they cannot represent a miscleavage site.
|
|
1221 #
|
|
1222 shift(@amino_acids);
|
|
1223 pop(@amino_acids);
|
|
1224
|
|
1225 foreach my $aa (@amino_acids) {
|
|
1226
|
|
1227 $aa_index++;
|
|
1228
|
|
1229 if ($aa eq $cleavage_aa) {
|
|
1230
|
|
1231 #
|
|
1232 # We found a miscleavage site.
|
|
1233 #
|
|
1234 $logger->debug('Found miscleavage site in peptide ' . $peptide . ' of protein ' . $key . '.');
|
|
1235 #
|
|
1236 # Get the offset for the sequence context.
|
|
1237 #
|
|
1238 my $offset = $petide_index + $aa_index - $n_term_context_length;
|
|
1239 $logger->debug("\t" . 'Offset = ' . $offset . '.');
|
|
1240 my $n_term_padding = '';
|
|
1241 my $offset_adjustment = 0;
|
|
1242
|
|
1243 if ($cleavage_terminus eq 'C') {
|
|
1244
|
|
1245 #
|
|
1246 # Must increment the offset with 1 if the protease cuts C-terminal of a recognition site.
|
|
1247 #
|
|
1248 $offset++;
|
|
1249
|
|
1250 }
|
|
1251
|
|
1252 #
|
|
1253 # Check if DB protein sequence is long enough on n-terminal side
|
|
1254 # to retrieve full length peptide contexts.
|
|
1255 # (c-term length is checked elsewhere.)
|
|
1256 #
|
|
1257 if ($offset < 0) {
|
|
1258
|
|
1259 #
|
|
1260 # Protein too short for full length context: Calculate offset adjustment.
|
|
1261 #
|
|
1262 $offset_adjustment = -($offset);
|
|
1263 $logger->debug('Offset ' . $offset . ' for miscleavage context in peptide ' . $peptide . ' in protein ' . $key . ' less than zero');
|
|
1264
|
|
1265 if (length($padding_character) == 1) {
|
|
1266
|
|
1267 #
|
|
1268 # Add N-terminal padding characters for contexts that are too short on the N-terminal side.
|
|
1269 #
|
|
1270 $n_term_padding = "$padding_character" x $offset_adjustment;
|
|
1271 $logger->debug("\t" . 'Will add N-terminal padding: ' . $n_term_padding);
|
|
1272
|
|
1273 } else {
|
|
1274 $logger->debug('Padding was disabled. Will not add n-term padding.');
|
|
1275 }
|
|
1276
|
|
1277 #
|
|
1278 # Reset offset for this context to protein start.
|
|
1279 #
|
|
1280 $offset = 0;
|
|
1281
|
|
1282 }
|
|
1283
|
|
1284 #
|
|
1285 # Extract sequence context from DB protein sequence.
|
|
1286 #
|
|
1287 my $peptide_context = substr($protein, $offset, $max_context_length - $offset_adjustment);
|
|
1288 $peptide_context = $n_term_padding . $peptide_context;
|
|
1289 $logger->debug('Found miscleavage context: ' . $peptide_context . '.');
|
|
1290
|
|
1291 push(@peptide_contexts, $peptide_context);
|
|
1292 }
|
|
1293 }
|
|
1294
|
|
1295 if (length($padding_character) == 1) {
|
|
1296 _CheckSequenceContextLength(\@peptide_contexts, $peptide, $key, $max_context_length);
|
|
1297 } else {
|
|
1298 $logger->debug('Padding was disabled. Will not add c-term padding nor check sequence context length.');
|
|
1299 }
|
|
1300
|
|
1301 return(\@peptide_contexts);
|
|
1302
|
|
1303 }
|
|
1304
|
|
1305 sub _GetModificationSequenceContexts {
|
|
1306
|
|
1307 my ($peptide, $protein, $key, $petide_index, $modified_aa, $modified_peptide) = @_;
|
|
1308
|
|
1309 my @peptide_contexts;
|
|
1310 my $modified_aa_index_adjustment;
|
|
1311 my @modified_aa_in_peptide_indices;
|
|
1312 my $modified_aa_in_protein_index;
|
|
1313 my $max_context_length = $n_term_context_length + 1 + $c_term_context_length;
|
|
1314 #my $offset = 0;
|
|
1315 my $modified_aa_in_modified_peptide_index;
|
|
1316 my $modified_peptide_remainder;
|
|
1317 my $processed_peptide_length = 0;
|
|
1318
|
|
1319 #
|
|
1320 # Find amino acid in the string identifying both the modified amino acid and the modification.
|
|
1321 #
|
|
1322 if ($modified_aa =~ m/^([a-z]+)[A-Z*]$/) {
|
|
1323
|
|
1324 $modified_aa_index_adjustment = length($1);
|
|
1325
|
|
1326 } elsif ($modified_aa =~ m/^[A-Z*][a-z]$/) {
|
|
1327
|
|
1328 $modified_aa_index_adjustment = 0;
|
|
1329
|
|
1330 } else {
|
|
1331
|
|
1332 $logger->fatal('Modified amino acid was specified in an unsupported format: ' . $modified_aa . '.');
|
|
1333 exit;
|
|
1334
|
|
1335 }
|
|
1336
|
|
1337 #
|
|
1338 # Find modified sites in peptide.
|
|
1339 #
|
|
1340 my $modified_aa_for_regex = $modified_aa;
|
|
1341 $modified_aa_for_regex =~ s/\*/[A-Z]/;
|
|
1342 $logger->debug('modified_aa for search: ' . $modified_aa_for_regex);
|
|
1343 if ($modified_peptide =~ m/$modified_aa_for_regex/) {
|
|
1344 $logger->debug("\t" . 'Peptide contains mods.');
|
|
1345 $modified_aa_in_modified_peptide_index = $-[0];
|
|
1346 $modified_peptide_remainder = $';
|
|
1347 my $modified_peptide_prefix = $`;
|
|
1348 $logger->trace("\t\t" . 'Processed pep length = ' . $processed_peptide_length);
|
|
1349 $processed_peptide_length += length($modified_peptide_prefix);
|
|
1350 $logger->trace("\t\t" . 'Processed pep length = ' . $processed_peptide_length);
|
|
1351 $processed_peptide_length += length($modified_aa);
|
|
1352 $logger->trace("\t\t" . 'Processed pep length = ' . $processed_peptide_length);
|
|
1353 $logger->trace("\t\t" . 'Index for mod :' . $modified_aa_in_modified_peptide_index);
|
|
1354 } else {
|
|
1355 $logger->debug('No modified amino acids found in this peptide.');
|
|
1356 }
|
|
1357
|
|
1358 #my $modified_aa_in_modified_peptide_index = index($modified_peptide, $modified_aa);
|
|
1359
|
|
1360 while (defined($modified_aa_in_modified_peptide_index)) {
|
|
1361
|
|
1362 $modified_aa_in_modified_peptide_index += $modified_aa_index_adjustment;
|
|
1363 $logger->debug('Found modified amino acid at position ' . ($modified_aa_in_modified_peptide_index + 1) . ' in modified peptide ' . $modified_peptide . ' of protein ' . $key . '.');
|
|
1364
|
|
1365 #
|
|
1366 # Count all modifications of any type preceeding the found modified amino acid.
|
|
1367 #
|
|
1368 my $modified_n_term = substr($modified_peptide, 0, $modified_aa_in_modified_peptide_index);
|
|
1369 my $n_term = $modified_n_term;
|
|
1370 $n_term =~ s/[a-z]+//g;
|
|
1371 my $mod_count = length($modified_n_term) - length($n_term);
|
|
1372
|
|
1373 if ($mod_count >= 0) {
|
|
1374 $logger->debug(' counted modifications preceeding amino acid = ' . $mod_count);
|
|
1375 } else {
|
|
1376 $logger->fatal(' counted modifications preceeding amino acid is negative: ' . $mod_count);
|
|
1377 exit;
|
|
1378 }
|
|
1379
|
|
1380 #
|
|
1381 # We need to substract the amount of mods preceeding the identified amino acid
|
|
1382 # from the index to get the position of the amino acid in the unmodified peptide.
|
|
1383 #
|
|
1384 my $modified_aa_in_peptide_index = $modified_aa_in_modified_peptide_index - $mod_count;
|
|
1385 $logger->debug(' and at position ' . ($modified_aa_in_peptide_index + 1) . ' in peptide ' . $peptide . ' of protein ' . $key . '.');
|
|
1386
|
|
1387 push(@modified_aa_in_peptide_indices, $modified_aa_in_peptide_index);
|
|
1388
|
|
1389 #
|
|
1390 # Search for additional mods in the modified peptide.
|
|
1391 #
|
|
1392 #$offset = $modified_aa_in_modified_peptide_index + 1;
|
|
1393 #$modified_aa_in_modified_peptide_index = index($modified_peptide, $modified_aa, $offset);
|
|
1394 if ($modified_peptide_remainder =~ m/$modified_aa_for_regex/) {
|
|
1395 $logger->debug("\t" . 'Peptide contains more mods.');
|
|
1396 $modified_peptide_remainder = $';
|
|
1397 my $modified_peptide_prefix = $`;
|
|
1398 $logger->trace("\t\t" . 'Processed pep length = ' . $processed_peptide_length);
|
|
1399 $modified_aa_in_modified_peptide_index = $-[0] + $processed_peptide_length;
|
|
1400 $logger->trace("\t\t" . 'Index for mod :' . $modified_aa_in_modified_peptide_index);
|
|
1401 $processed_peptide_length += length($modified_peptide_prefix);
|
|
1402 $logger->trace("\t\t" . 'Processed pep length = ' . $processed_peptide_length);
|
|
1403 $processed_peptide_length += length($modified_aa);
|
|
1404 $logger->trace("\t\t" . 'Processed pep length = ' . $processed_peptide_length);
|
|
1405 } else {
|
|
1406 $modified_aa_in_modified_peptide_index = undef();
|
|
1407 $logger->debug('No more modified amino acids found in this peptide.');
|
|
1408 }
|
|
1409 }
|
|
1410
|
|
1411 #
|
|
1412 # Retrieve sequence contexts for modified stites from protein sequence
|
|
1413 #
|
|
1414 foreach my $modified_aa_in_peptide_index (@modified_aa_in_peptide_indices) {
|
|
1415
|
|
1416 #
|
|
1417 # Get the offset for the sequence context.
|
|
1418 #
|
|
1419 my $offset = $petide_index + $modified_aa_in_peptide_index - $n_term_context_length;
|
|
1420 $logger->debug("\t" . 'Offset = ' . $offset . '.');
|
|
1421 my $n_term_padding = '';
|
|
1422 my $offset_adjustment = 0;
|
|
1423
|
|
1424 #
|
|
1425 # Check if DB protein sequence is long enough on n-terminal side
|
|
1426 # to retrieve full length peptide contexts.
|
|
1427 # (c-term length is checked elsewhere.)
|
|
1428 #
|
|
1429 if ($offset < 0) {
|
|
1430
|
|
1431 #
|
|
1432 # Protein too short for full length context: Calculate offset adjustment.
|
|
1433 #
|
|
1434 $offset_adjustment = -($offset);
|
|
1435 $logger->debug('Offset ' . $offset . ' for modification context in peptide ' . $peptide . ' in protein ' . $key . ' less than zero');
|
|
1436
|
|
1437 if (length($padding_character) == 1) {
|
|
1438
|
|
1439 #
|
|
1440 # Add N-terminal padding characters for contexts that are too short on the N-terminal side.
|
|
1441 #
|
|
1442 $n_term_padding = "$padding_character" x $offset_adjustment;
|
|
1443 $logger->debug("\t" . 'Will add N-terminal padding: ' . $n_term_padding);
|
|
1444
|
|
1445 } else {
|
|
1446 $logger->debug('Padding was disabled. Will not add n-term padding.');
|
|
1447 }
|
|
1448
|
|
1449 #
|
|
1450 # Reset offset for this context to protein start.
|
|
1451 #
|
|
1452 $offset = 0;
|
|
1453
|
|
1454 }
|
|
1455
|
|
1456 #
|
|
1457 # Extract sequence context from DB protein sequence.
|
|
1458 #
|
|
1459 my $peptide_context = substr($protein, $offset, $max_context_length - $offset_adjustment);
|
|
1460 $peptide_context = $n_term_padding . $peptide_context;
|
|
1461 $logger->debug('Found modification context: ' . $peptide_context . '.');
|
|
1462
|
|
1463 push(@peptide_contexts, $peptide_context);
|
|
1464
|
|
1465 }
|
|
1466
|
|
1467 if (length($padding_character) == 1) {
|
|
1468 _CheckSequenceContextLength(\@peptide_contexts, $peptide, $key, $max_context_length);
|
|
1469 } else {
|
|
1470 $logger->debug('Padding was disabled. Will not add c-term padding nor check sequence context length.');
|
|
1471 }
|
|
1472
|
|
1473 return(\@peptide_contexts);
|
|
1474
|
|
1475 }
|
|
1476
|
|
1477 #
|
|
1478 # TODO: pep contexts.
|
|
1479 #
|
|
1480 sub _GetPeptideSequenceContexts {
|
|
1481
|
|
1482 my ($peptide, $protein, $key, $peptide_index) = @_;
|
|
1483
|
|
1484 my @peptide_contexts;
|
|
1485 my $max_context_length = $n_term_context_length + length($peptide) + $c_term_context_length;
|
|
1486
|
|
1487 #
|
|
1488 # Get the offset for the peptide sequence context.
|
|
1489 #
|
|
1490 my $offset = $peptide_index - $n_term_context_length;
|
|
1491 $logger->debug("\t" . 'Offset = ' . $offset . '.');
|
|
1492 my $n_term_padding = '';
|
|
1493 my $offset_adjustment = 0;
|
|
1494
|
|
1495 #
|
|
1496 # Check if DB protein sequence is long enough on n-terminal side
|
|
1497 # to retrieve full length peptide contexts.
|
|
1498 # (c-term length is checked elsewhere.)
|
|
1499 #
|
|
1500 if ($offset < 0) {
|
|
1501
|
|
1502 #
|
|
1503 # Protein too short for full length context: Calculate offset adjustment.
|
|
1504 #
|
|
1505 $offset_adjustment = -($offset);
|
|
1506 $logger->debug('Offset ' . $offset . ' for peptide context of peptide ' . $peptide . ' in protein ' . $key . ' less than zero');
|
|
1507
|
|
1508 if (length($padding_character) == 1) {
|
|
1509
|
|
1510 #
|
|
1511 # Add N-terminal padding characters for contexts that are too short on the N-terminal side.
|
|
1512 #
|
|
1513 $n_term_padding = "$padding_character" x $offset_adjustment;
|
|
1514 $logger->debug("\t" . 'Will add N-terminal padding: ' . $n_term_padding);
|
|
1515
|
|
1516 } else {
|
|
1517 $logger->debug('Padding was disabled. Will not add n-term padding.');
|
|
1518 }
|
|
1519
|
|
1520 #
|
|
1521 # Reset offset for this context to protein start.
|
|
1522 #
|
|
1523 $offset = 0;
|
|
1524
|
|
1525 }
|
|
1526
|
|
1527 #
|
|
1528 # Extract sequence context from DB protein sequence.
|
|
1529 #
|
|
1530 my $peptide_context = substr($protein, $offset, $max_context_length - $offset_adjustment);
|
|
1531 $peptide_context = $n_term_padding . $peptide_context;
|
|
1532 $logger->debug('Found modification context: ' . $peptide_context . '.');
|
|
1533
|
|
1534 push(@peptide_contexts, $peptide_context);
|
|
1535
|
|
1536 if (length($padding_character) == 1) {
|
|
1537 _CheckSequenceContextLength(\@peptide_contexts, $peptide, $key, $max_context_length);
|
|
1538 } else {
|
|
1539 $logger->debug('Padding was disabled. Will not add c-term padding nor check sequence context length.');
|
|
1540 }
|
|
1541
|
|
1542 return(\@peptide_contexts);
|
|
1543
|
|
1544 }
|
|
1545
|
|
1546 sub _CheckSequenceContextLength {
|
|
1547
|
|
1548 my ($contexts, $peptide, $key, $max_context_length) = @_;
|
|
1549
|
|
1550 #
|
|
1551 # If padding was enabled for sequence contexts < max sequence context length.
|
|
1552 #
|
|
1553 foreach my $peptide_context (@{$contexts}) {
|
|
1554
|
|
1555 my $peptide_context_length = length($peptide_context);
|
|
1556
|
|
1557 if ($peptide_context_length < $max_context_length) {
|
|
1558
|
|
1559 #
|
|
1560 # sequence context < max sequence context length -> append c-term padding.
|
|
1561 # (n-term padding was already appended if necessary during sequence context lookup with _GetSequenceContext().)
|
|
1562 #
|
|
1563 my $c_term_padding = "$padding_character" x ($max_context_length - $peptide_context_length);
|
|
1564 $logger->debug('Length of peptide context (' . $peptide_context_length . ') < max context length.');
|
|
1565 $logger->debug("\t" . 'Will add C-terminal padding: ' . $c_term_padding);
|
|
1566 $peptide_context = $peptide_context . $c_term_padding;
|
|
1567
|
|
1568 }
|
|
1569
|
|
1570 #
|
|
1571 # Sanity check for padded sequence contexts.
|
|
1572 #
|
|
1573 # (Checks if the combined effect of n-term and c-term padding worked well.)
|
|
1574 #
|
|
1575 $peptide_context_length = length($peptide_context);
|
|
1576 unless ($peptide_context_length == $max_context_length) {
|
|
1577 $logger->fatal('Length of peptide context too short or too long for peptide ' . $peptide . ' in protein ' . $key);
|
|
1578 $logger->fatal("\t" . 'Context length is ' . $peptide_context_length . ', but should be ' . $max_context_length . '.');
|
|
1579 exit;
|
|
1580 }
|
|
1581 }
|
|
1582
|
|
1583 #
|
|
1584 # No need to return the modified (c-terminal-padded) contexts as we worked with a arrayref.
|
|
1585 #
|
|
1586
|
|
1587 }
|
|
1588
|
|
1589 sub _SaveSequenceContexts {
|
|
1590
|
|
1591 my ($line, $acc, $contexts, $path_to_fh, $extracted_seq_contexts) = @_;
|
|
1592
|
|
1593 #
|
|
1594 # Append the peptide contexts to the existing line of the fragment file and
|
|
1595 # save that new line to the output file.
|
|
1596 # By appending we keep all info from the original fragment file intact.
|
|
1597 #
|
|
1598 foreach my $context (@{$contexts}) {
|
|
1599
|
|
1600 #
|
|
1601 # Skip "empty"/"missing" contexts.
|
|
1602 # May be the result of:
|
|
1603 # 1. The peptide terminus being also the protein terminus.
|
|
1604 # 2. Proteins missing from the database.
|
|
1605 # 3. ...
|
|
1606 #
|
|
1607 if ($padding_character ne '') {
|
|
1608 if ($context =~ m/^($padding_character)+$/) {
|
|
1609 next;
|
|
1610 }
|
|
1611 } else {
|
|
1612 if ($context eq '') {
|
|
1613 next;
|
|
1614 }
|
|
1615 }
|
|
1616
|
|
1617 my $new_line = $line;
|
|
1618
|
|
1619 # 1. Append accession numbers if the fragment file used IDs.
|
|
1620 if ($index_id eq 'ID') {
|
|
1621 $new_line .= "\t" . $acc;
|
|
1622 }
|
|
1623
|
|
1624 # 2. Append sequence context.
|
|
1625 $new_line .= "\t" . $context;
|
|
1626 # 3. Append new line character.
|
|
1627 $new_line .= "\n";
|
|
1628 # Save result to disk.
|
|
1629 print($path_to_fh $new_line);
|
|
1630 $extracted_seq_contexts++;
|
|
1631
|
|
1632 }
|
|
1633
|
|
1634 return($extracted_seq_contexts);
|
|
1635
|
|
1636 }
|
|
1637
|
|
1638 sub _Usage {
|
|
1639
|
|
1640
|
|
1641 print STDOUT "\n" .
|
|
1642 'ExtractPeptideSequenceContext.pl: Extract sequence contexts of cleavage, miscleavage or modification sites ' . "\n" .
|
|
1643 ' based on sequence fragments (like peptides experimentally identified with MSMS) ' . "\n" .
|
|
1644 ' and protein sequences from a database in FASTA or UniProtKB file format.' . "\n" .
|
|
1645 "\n" .
|
|
1646 'Usage:' . "\n" .
|
|
1647 "\n" .
|
|
1648 ' ExtractPeptideSequenceContext.pl options' . "\n" .
|
|
1649 "\n" .
|
|
1650 'Available options are:' . "\n" .
|
|
1651 "\n" .
|
|
1652 ' --db [file] Input file containing all database sequences in either UniProtKB *.dat or *.fasta format.' . "\n" .
|
|
1653 ' --dba [file] Optional alternative / backup input file containing database sequences in either UniProtKB *.dat or *.fasta format.' . "\n" .
|
|
1654 ' This may be a slightly older or newer version of the database, which will be searched in case a ' . "\n" .
|
|
1655 ' sequence ID was not found in the primary database file that was specified with --db [file].' . "\n" .
|
|
1656 ' --dbf [format] Optional argument to specify the database sequence file format. Valid format specifications are:' . "\n" .
|
|
1657 ' * \'UniProtKB DAT\' for UniProtKB DAT files.' . "\n" .
|
|
1658 ' * \'FASTA\' for FASTA files.' . "\n" .
|
|
1659 ' Only required if the database files do *not* have the standard file extensions: ' . "\n" .
|
|
1660 ' *.dat for UniProtKB DAT files or ' . "\n" .
|
|
1661 ' *.fa or *.fasta for FASTA files.' . "\n" .
|
|
1662 ' --k [ID|AC] Unique identifier type used as key to index an UniProtKB *.dat database file.' . "\n" .
|
|
1663 ' Must be either AC for a UniProt accession number (default) or ID for a UniProt identifier.' . "\n" .
|
|
1664 ' Not required for *.fasta databases: This tool will look for any type of ID in the first part of FASTA ' . "\n" .
|
|
1665 ' sequence headers up until the first white space.' . "\n" .
|
|
1666 ' Optionally multiple IDs may be present separated with pipe symbols (|) or semicolons (;).' . "\n" .
|
|
1667 ' Optionally IDs may be prefixed with a database namespace and a colon (:).' . "\n" .
|
|
1668 ' For example the accession number P32234 as well as the ID 128UP_DROME would be recognised in both ' . "\n" .
|
|
1669 ' this sequence header:' . "\n" .
|
|
1670 ' >UniProtAcc:P32234|UniProtID:128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)' . "\n" .
|
|
1671 ' and in this one:' . "\n" .
|
|
1672 ' >P32234|128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)' . "\n" .
|
|
1673 ' --f [file] Fragment file containing accession numbers or IDs and sequence fragments (peptides), ' . "\n" .
|
|
1674 ' whose sequence context should be extracted from the database. This file' . "\n" .
|
|
1675 ' * Must be tab delimited with one accession/ID plus one sequence fragment per line' . "\n" .
|
|
1676 ' * Must contain accession numbers / IDs in the same format as the database file used.' . "\n" .
|
|
1677 ' Exception: Optionally IDs may be suffixed with the peptide\'s position in its protein between brackets.' . "\n" .
|
|
1678 ' For example: CLH1_HUMAN[1612-1620].' . "\n" .
|
|
1679 ' * May contain other columns, which will be preserved in the output.' . "\n" .
|
|
1680 ' * Is the basis for the output: if a sequence context was found, ' . "\n" .
|
|
1681 ' it will be appended in another column to the right of the existing columns.' . "\n" .
|
|
1682 ' --icol [int] Column in the tab delimited fragment file containing the protein identifiers / accession numbers.' . "\n" .
|
|
1683 ' Default = 1.' . "\n" .
|
|
1684 ' --pcol [int] Column in the tab delimited fragment file containing the peptide sequences.' . "\n" .
|
|
1685 ' Default = 2.' . "\n" .
|
|
1686 ' --s Strip lowercase characters from the sequence fragments in the fragment file before doing a database lookup.' . "\n" .
|
|
1687 ' Amino acids are expected to be in uppercase. If not, the sequences are converted to uppercase ' . "\n" .
|
|
1688 ' before searching for the fragment in the database. When lowercase characters represent ' . "\n" .
|
|
1689 ' modifications instead of amino acids these need to be removed before searching the database.' . "\n" .
|
|
1690 ' Default = 0 (disabled) except when a modified amino acid is specified with --ma, which will automatically enable --s.' . "\n" .
|
|
1691 ' --cleo [file] Optional output file where the retrieved cleavage site sequence contexts will be saved.' . "\n" .
|
|
1692 ' --miso [file] Optional output file where the retrieved miscleavage site sequence contexts will be saved.' . "\n" .
|
|
1693 ' --modo [file] Optional output file where the retrieved modification site sequence contexts will be saved.' . "\n" .
|
|
1694 ' --pepo [file] Optional output file where the retrieved peptide sequence contexts will be saved.' . "\n" .
|
|
1695 ' (At least one output file must be specified with --cleo, --miso, --modo or --pepo).' . "\n" .
|
|
1696 ' --ca [A-Z] Cleavage amino acid. Assume the sequence fragments were derived from cutting with a proteolytic enzyme, ' . "\n" .
|
|
1697 ' that recognizes this amino acid as the position to cut.' . "\n" .
|
|
1698 ' When the specificity of the protease used to generate the peptides in the fragments file is unknown,' . "\n" .
|
|
1699 ' you may provide an asterisk (*) to retrieve sequence context for any cleavage,' . "\n" .
|
|
1700 ' but in that case this tool will not filter non-specifically cleaved fragments,' . "\n" .
|
|
1701 ' that may be the result of other processes than protease activity.' . "\n" .
|
|
1702 ' --ct [C|N] Cleavage terminus. Assume the sequence fragments were derived from cutting with a proteolytic enzyme, ' . "\n" .
|
|
1703 ' that cuts at the C or N terminus of the amino acid specified with the --ca [A-Z] parameter.' . "\n" .
|
|
1704 ' When the specificity of the protease used to generate the peptides in the fragments file is unknown,' . "\n" .
|
|
1705 ' you may provide an asterisk (*) to retrieve sequence context for any cleavage,' . "\n" .
|
|
1706 ' but in that case this tool will not filter non-specifically cleaved fragments,' . "\n" .
|
|
1707 ' that may be the result of other processes than protease activity.' . "\n" .
|
|
1708 ' --ma [A-Za-z] Modified amino acid.' . "\n" .
|
|
1709 ' The amino acid must be specified in uppercase and the modification in lower case.' . "\n" .
|
|
1710 ' The order is not important.' . "\n" .
|
|
1711 ' Hence a phophorylated serine in the fragments file can be indicated with either pS or Sp, ' . "\n" .
|
|
1712 ' but you cannot mix both pS and Sp in a single fragments file.' . "\n" .
|
|
1713 ' You may provide an asterisk (*) instead of an upper case amino acid to retrieve sequence contexts ' . "\n" .
|
|
1714 ' for the specified modification no matter what amino acid it was located on.' . "\n" .
|
|
1715 ' A modification may be specified with more than one lower case character, ' . "\n" .
|
|
1716 ' so for example phosphoS or Sphospho can also be used for a phosphorylated serine.' . "\n" .
|
|
1717 ' --n [int] Integer specifying the length of the N-terminal sequence context to retrieve starting from the ' . "\n" .
|
|
1718 ' cleavage, miscleavage or modification site. Default = 5.' . "\n" .
|
|
1719 ' --c [int] Integer specifying the length of the C-terminal sequence context to retrieve starting from the ' . "\n" .
|
|
1720 ' cleavage, miscleavage or modification site. Default = 5.' . "\n" .
|
|
1721 ' Please note that cleavage and miscleavage sites have zero width, ' . "\n" .
|
|
1722 ' while modified amino acids will increment the sequence context lenght with 1.' . "\n" .
|
|
1723 ' When defaults are used for both the N-terminal and C-terminal sequence context lengths,' . "\n" .
|
|
1724 ' the total sequence context length for a' . "\n" .
|
|
1725 ' * cleavage or miscleavage site will be:' . "\n" .
|
|
1726 ' (N-terminal sequence context) + (C-terminal sequence context) = 5 + 5 = 10.' . "\n" .
|
|
1727 ' * modification site will be:' . "\n" .
|
|
1728 ' (N-terminal sequence context) + modified AA + (C-terminal sequence context) = 5 + 1 + 5 = 11.' . "\n" .
|
|
1729 # ' --mcd [int] Minimal distance between a putative miscleavage site and the peptide termini.' . "\n" .
|
|
1730 # ' Uncleaved amino acids closer to the peptide termini can be ignored with this parameter:' . "\n" .
|
|
1731 # ' A. To prevent overlap between cleavage and miscleavage peptide sequence contexts.' . "\n" .
|
|
1732 # ' B. To prevent false positive miscleavage sites due to sites that cannot be cleaved, ' . "\n" .
|
|
1733 # ' because one of the resulting fragments is too short. (Sometimes proteases may need ' . "\n" .
|
|
1734 # ' a minimal length surrounding the cleavage site to bind and cleave a peptide.)' . "\n" .
|
|
1735 # ' Default = same as the longest *-terminal context length. Hence [--n [int]|--c [int]] (see above).' . "\n" .
|
|
1736 ' --pc [char] Optional padding character to fill N-terminal or C-terminal positions in the sequence context, ' . "\n" .
|
|
1737 ' when the protein was too short to get a complete sequence context.' . "\n" .
|
|
1738 ' Default = - (a.k.a. dash or the alignment gap character.)' . "\n" .
|
|
1739 ' To disable padding specify an empty string like this --pc \'\'.' . "\n" .
|
|
1740 ' --ll [LEVEL] Log4perl log level. One of: ALL, TRACE, DEBUG, INFO (default), WARN, ERROR, FATAL or OFF.' . "\n" .
|
|
1741 "\n";
|
|
1742 exit;
|
|
1743
|
|
1744 }
|