Mercurial > repos > galaxyp > nbic_fasta
comparison ExtractPeptideSequenceContext.pl @ 0:163892325845 draft default tip
Initial commit.
author | galaxyp |
---|---|
date | Fri, 10 May 2013 17:15:08 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:163892325845 |
---|---|
1 #!/usr/bin/perl -w | |
2 # | |
3 # This script extracts subsequences from UniProtKB *.dat files or from *.fasta files | |
4 # based on a list of accession numbers / IDs and a sequence fragment (peptide). | |
5 # | |
6 # ===================================================== | |
7 # $Id: ExtractPeptideSequenceContext.pl 112 2011-03-01 19:50:23Z pieter.neerincx@gmail.com $ | |
8 # $HeadURL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/ExtractPeptideSequenceContext.pl $ | |
9 # $LastChangedDate: 2011-03-01 13:50:23 -0600 (Tue, 01 Mar 2011) $ | |
10 # $LastChangedRevision: 112 $ | |
11 # $LastChangedBy: pieter.neerincx@gmail.com $ | |
12 # ===================================================== | |
13 # | |
14 | |
15 # | |
16 # initialise environment | |
17 # | |
18 use strict; | |
19 use Getopt::Long; | |
20 use SWISS::Entry; # For parsing UniProtKB *.dat files. | |
21 use Log::Log4perl qw(:easy); | |
22 | |
23 my $ps = '/'; # Path Separator. | |
24 my %log_levels = ( | |
25 'ALL' => $ALL, | |
26 'TRACE' => $TRACE, | |
27 'DEBUG' => $DEBUG, | |
28 'INFO' => $INFO, | |
29 'WARN' => $WARN, | |
30 'ERROR' => $ERROR, | |
31 'FATAL' => $FATAL, | |
32 'OFF' => $OFF, | |
33 ); | |
34 | |
35 # | |
36 # List of databases, which may contain versioned accession numbers, | |
37 # and the separation character used to join the accession number and | |
38 # the version number. | |
39 # | |
40 my %databases_with_optionally_versioned_ids = ( | |
41 'IPI' => '.', | |
42 'ipi' => '.', | |
43 'UniProtKB/Swiss-Prot' => '.', | |
44 'sp' => '.', | |
45 'SP' => '.', | |
46 'UniProtKB/TrEMBL' => '.', | |
47 'TR' => '.', | |
48 'tr' => '.', | |
49 'Tr' => '.' | |
50 ); | |
51 | |
52 my $db; | |
53 my $db_alt; | |
54 my $db_format; | |
55 my $index_id; # Key used to lookup sequences. Can be ID or Accession number (default). | |
56 my $fragments; | |
57 my $id_column; | |
58 my $peptide_column; | |
59 my $strip_lc; # For the sequence fragments in the fragments file. With strip_lc enabled we | |
60 # assume amino acids are in upper case and lower case characters represent modifications, | |
61 # which will be stripped off before searching for the fragments in the complete sequences. | |
62 my $cleavage_output; # Sequence contexts for cleavage sites. | |
63 my $miscleavage_output; # Sequence contexts for miscleavage sites. | |
64 my $modification_output; # Sequence contexts for modification sites. | |
65 my $peptide_output; # Sequence contexts for complete peptides. | |
66 my $cleavage_aa; # Assume the sequence fragments were derived from cutting at this amino acid. | |
67 my $cleavage_terminus; # Assume the sequence fragments were derived from cutting at this side of $cut_at_aa | |
68 my $modified_aa; | |
69 my $n_term_context_length; | |
70 my $c_term_context_length; | |
71 my $miscleavage_distance; # Minimal distance between a putative miscleavage site and the peptide termini. | |
72 # Uncleaved AAs closer to the peptide termini can be ignored with this parameter: | |
73 # A. To prevent overlap between cleavage and miscleavage peptide sequence contexts. | |
74 # B. To remove false positive miscleavage sites that cannot be cleaved, | |
75 # because one of the resulting fragments is too short. Sometimes proteases may need | |
76 # a minimal length surrounding the cleavage site to bind and cleave a peptide. | |
77 my $padding_character; # Used for padding if the protein sequence is too short for a long enough context. | |
78 my $log_level; | |
79 | |
80 Getopt::Long::GetOptions ( | |
81 'db=s' => \$db, | |
82 'dba:s' => \$db_alt, | |
83 'dbf:s' => \$db_format, | |
84 'k:s' => \$index_id, # Key used to lookup sequences. Can be ID or Accession number (default). | |
85 'f=s' => \$fragments, | |
86 'icol:i' => \$id_column, | |
87 'pcol:i' => \$peptide_column, | |
88 's' => \$strip_lc, # For the sequence fragments in the fragments file. With strip_lc enabled we | |
89 # assume amino acids are in upper case and lower case characters represent modifications, | |
90 # which will be stripped off before searching for the fragments in the complete sequences. | |
91 'cleo:s' => \$cleavage_output, | |
92 'miso:s' => \$miscleavage_output, | |
93 'modo:s' => \$modification_output, | |
94 'pepo:s' => \$peptide_output, | |
95 'ca:s' => \$cleavage_aa, # Assume the sequence fragments were derived from cutting at this amino acid. | |
96 'ct:s' => \$cleavage_terminus, # Assume the sequence fragments were derived from cutting at this side of $cut_at_aa | |
97 'ma:s' => \$modified_aa, | |
98 'n:i' => \$n_term_context_length, | |
99 'c:i' => \$c_term_context_length, | |
100 'mcd:i' => \$miscleavage_distance, # Minimal distance between a putative miscleavage site and the peptide termini. | |
101 # Uncleaved AAs closer to the peptide termini can be ignored with this parameter: | |
102 # A. To prevent overlap between cleavage and miscleavage peptide sequence contexts. | |
103 # B. To remove false positive miscleavage sites that cannot be cleaved, | |
104 # because one of the resulting fragments is too short. Sometimes proteases may need | |
105 # a minimal length surrounding the cleavage site to bind and cleave a peptide. | |
106 'pc:s' => \$padding_character, | |
107 'll:s' => \$log_level | |
108 ); | |
109 | |
110 # | |
111 # TODO: 1. Handle --mcd param. Maybe should be extended to exclude also modification sites too close together... | |
112 # | |
113 | |
114 # | |
115 # Configure logging. | |
116 # | |
117 # Provides default if user did not specify log level: | |
118 $log_level = (defined($log_level) ? $log_level : 'INFO'); | |
119 # Reset log level to default if user specified illegal log level. | |
120 $log_level = (defined($log_levels{$log_level}) ? $log_levels{$log_level} : $log_levels{'INFO'}); | |
121 #Log::Log4perl->init('log4perl.properties'); | |
122 Log::Log4perl->easy_init( | |
123 #{ level => $log_level, | |
124 # file => ">>ExtractPeptideSequenceContext.log", | |
125 # layout => '%F{1}-%L-%M: %m%n' }, | |
126 { level => $log_level, | |
127 file => "STDOUT", | |
128 layout => '%d L:%L %p> %m%n' }, | |
129 ); | |
130 my $logger = Log::Log4perl::get_logger(); | |
131 | |
132 # | |
133 # Check user input. | |
134 # | |
135 foreach my $input ($db, $db_alt, $fragments, $cleavage_output, $miscleavage_output, $modification_output) { | |
136 if (defined($input) && length($input) <= 0) { | |
137 $input = undef(); | |
138 } | |
139 } | |
140 unless (defined($db) && | |
141 defined($fragments) && | |
142 (defined($cleavage_output) || defined($miscleavage_output) || defined($modification_output) || defined($peptide_output)) | |
143 ) { | |
144 $logger->fatal('You need to specify at least a DB & fragments file as inputs and at least one output file.'); | |
145 _Usage(); | |
146 exit; | |
147 } | |
148 if (defined($db_format)) { | |
149 unless ($db_format eq 'UniProtKB DAT' or $db_format eq 'FASTA') { | |
150 $logger->fatal('Database format in unsupported format.'); | |
151 $logger->fatal("\t" . 'Must be \'UniProtKB DAT\' or \'FASTA\'.'); | |
152 $logger->fatal("\t" . 'But --dbf was ' . $db_format . '.'); | |
153 exit; | |
154 } | |
155 } | |
156 if (defined($cleavage_aa)) { | |
157 unless ($cleavage_aa =~ m/^[*A-Z]$/) { | |
158 $logger->fatal('Cleavage amino acid in unsupported format.'); | |
159 $cleavage_aa = undef; | |
160 } | |
161 } | |
162 if (defined($cleavage_terminus)) { | |
163 unless ($cleavage_terminus =~ m/^[*CN]$/) { | |
164 $logger->fatal('Cleavage terminus in unsupported format.'); | |
165 $cleavage_terminus = undef; | |
166 } | |
167 } | |
168 if (defined($modified_aa)) { | |
169 unless ($modified_aa =~ m/^[*A-Z][a-z]+$/ || $modified_aa =~ m/^[a-z]+[*A-Z]$/) { | |
170 $logger->fatal('Modified amino acid in unsupported format.'); | |
171 $modified_aa = undef; | |
172 } | |
173 } | |
174 # | |
175 # We need info to extract: | |
176 # * cleavage site sequence contexts or | |
177 # * modification site sequence contexts or | |
178 # * peptide sequence contexts | |
179 # | |
180 if (defined($cleavage_output) || defined($miscleavage_output)) { | |
181 unless (defined($cleavage_aa) && defined($cleavage_terminus)) { | |
182 $logger->fatal('If you want to retrieve (mis)cleavage site sequence contexts, you must provide valid values for --ca and --ct.'); | |
183 $logger->debug('You specified --ca ' . $cleavage_aa . ' --ct ' . $cleavage_terminus . '.'); | |
184 _Usage(); | |
185 exit; | |
186 } | |
187 } | |
188 if (defined($modification_output)) { | |
189 unless (defined($modified_aa)) { | |
190 $logger->fatal('If you want to retrieve modifications site sequence contexts you must provide a valid value for --ma.'); | |
191 $logger->debug('You specified --ma ' . $modified_aa . '.'); | |
192 _Usage(); | |
193 exit; | |
194 } | |
195 } | |
196 if (defined($peptide_output)) { | |
197 # | |
198 # We can work with defaults for all params. | |
199 # | |
200 } | |
201 | |
202 # Make sure we don't overwrite the input. | |
203 foreach my $output ($cleavage_output, $miscleavage_output, $modification_output, $peptide_output) { | |
204 foreach my $input ($db, $db_alt, $fragments) { | |
205 if (defined($input) && defined($output) && $output eq $input) { | |
206 $logger->fatal('Output file ' . $output . ' is the same as input file ' . $input . '.'); | |
207 $logger->fatal("\t" . 'Please choose a different file for the output.'); | |
208 exit; | |
209 } | |
210 } | |
211 } | |
212 # Define defaults for optional arguments. | |
213 $index_id = (defined($index_id) && length($index_id) > 0 ? $index_id : 'AC'); | |
214 $id_column = (defined($id_column) && length($id_column) > 0 ? $id_column : 1); | |
215 $peptide_column = (defined($peptide_column) && length($peptide_column) > 0 ? $peptide_column : 2); | |
216 $n_term_context_length = (defined($n_term_context_length) && length($n_term_context_length) > 0 ? $n_term_context_length : 5); | |
217 $c_term_context_length = (defined($c_term_context_length) && length($c_term_context_length) > 0 ? $c_term_context_length : 5); | |
218 $padding_character = (defined($padding_character) ? $padding_character : '-'); | |
219 # Stripping of lowercase characters is disabled by default unless we search for modification site contexts in which case it must be enabled! | |
220 $strip_lc = (defined($strip_lc) ? $strip_lc : 0); | |
221 $strip_lc = (defined($modified_aa) ? 1 : $strip_lc); | |
222 unless (defined($miscleavage_distance) && $miscleavage_distance > 0) { | |
223 if ($n_term_context_length >= $c_term_context_length) { | |
224 $miscleavage_distance = $n_term_context_length; | |
225 } else { | |
226 $miscleavage_distance = $c_term_context_length; | |
227 } | |
228 } | |
229 | |
230 | |
231 # | |
232 # Check db files and fragments file. | |
233 # | |
234 unless (-e $db && -f $db && -r $db) { | |
235 $logger->fatal('Cannot read from database input file ' . $db . ': ' . $!); | |
236 exit; | |
237 } | |
238 if (defined($db_alt)) { | |
239 unless (-e $db_alt && -f $db_alt && -r $db_alt) { | |
240 $logger->fatal('Cannot read from alternative database input file ' . $db_alt . ': ' . $!); | |
241 exit; | |
242 } | |
243 } | |
244 unless (-e $fragments && -f $fragments && -r $fragments) { | |
245 $logger->fatal('Cannot read from fragments input file ' . $fragments . ': ' .$!); | |
246 exit; | |
247 } | |
248 | |
249 # | |
250 # Create lookup table(s) of the entire database input file(s). | |
251 # | |
252 my @db_lookup_table_refs; | |
253 (my $db_seqs, $index_id) = _CreateLookupHash($db, $index_id, $db_format); | |
254 my $seq_count = scalar(keys(%{$db_seqs})); | |
255 $logger->info('Number of sequences in database lookup table: ' . $seq_count); | |
256 push(@db_lookup_table_refs, $db_seqs); | |
257 # Optional backup DB file. | |
258 if (defined($db_alt)) { | |
259 (my $db_seqs_alt, $index_id) = _CreateLookupHash($db_alt, $index_id, $db_format); | |
260 my $seq_count_alt = scalar(keys(%{$db_seqs_alt})); | |
261 $logger->info('Number of sequences in backup database lookup table: ' . $seq_count_alt); | |
262 push(@db_lookup_table_refs, $db_seqs_alt); | |
263 } | |
264 | |
265 # | |
266 # Debugging loop | |
267 # | |
268 #foreach my $keyyy (keys(%{$db_seqs})) { | |
269 # $logger->fatal('Key: ' . $keyyy); | |
270 # $logger->fatal("\t" . 'AC: ' . ${$db_seqs}{$keyyy}{'AC'}); | |
271 # $logger->fatal("\t" . 'ID: ' . ${$db_seqs}{$keyyy}{'ID'}); | |
272 # $logger->fatal("\t" . 'SQ: ' . ${$db_seqs}{$keyyy}{'SQ'}); | |
273 #} | |
274 | |
275 # | |
276 # Lookup sequence fragment contexts and store results. | |
277 # | |
278 my ($counter_cleavages, $counter_miscleavages, $counter_modifications, $counter_peptides) = _ExtractSeqs(\@db_lookup_table_refs, | |
279 $fragments, $strip_lc, $id_column, $peptide_column, | |
280 $cleavage_output, $miscleavage_output, $modification_output, $peptide_output, | |
281 $cleavage_aa, $cleavage_terminus, $miscleavage_distance, | |
282 $n_term_context_length, $c_term_context_length, | |
283 $padding_character); | |
284 # | |
285 # Report result stats. | |
286 # | |
287 my $max_length = 0; | |
288 foreach my $counter ($counter_cleavages, $counter_miscleavages, $counter_modifications, $counter_peptides) { | |
289 $max_length = (length($counter) > $max_length) ? length($counter) : $max_length; | |
290 } | |
291 my $prefixed_counter = sprintf('%*s', $max_length, $counter_cleavages); | |
292 $logger->info('Extracted cleavage site sequence contexts: ' . $prefixed_counter . '.'); | |
293 $prefixed_counter = sprintf('%*s', $max_length, $counter_miscleavages); | |
294 $logger->info('Extracted miscleavage site sequence contexts: ' . $prefixed_counter . '.'); | |
295 $prefixed_counter = sprintf('%*s', $max_length, $counter_modifications); | |
296 $logger->info('Extracted modification site sequence contexts: ' . $prefixed_counter . '.'); | |
297 $prefixed_counter = sprintf('%*s', $max_length, $counter_peptides); | |
298 $logger->info('Extracted peptide sequence contexts: ' . $prefixed_counter . '.'); | |
299 $prefixed_counter = sprintf('%*s', $max_length, $counter_cleavages + $counter_miscleavages + $counter_modifications + $counter_peptides); | |
300 $logger->info('Total extracted sequence contexts: ' . $prefixed_counter . '.'); | |
301 $logger->info('Finished!'); | |
302 | |
303 # | |
304 ## | |
305 ### Internal subs. | |
306 ## | |
307 # | |
308 | |
309 sub _CreateLookupHash { | |
310 | |
311 my ($file_path, $index_id, $file_type) = @_; | |
312 | |
313 my $file_path_fh; | |
314 my %seqs; | |
315 | |
316 if (defined($file_type)) { | |
317 $logger->info('Parsing ' . $file_type . ' file ' . $file_path . '...'); | |
318 } elsif ($file_path =~ m/($ps?)([^\.]+)\.dat$/i) { | |
319 $file_type = 'UniProtKB DAT'; | |
320 $logger->info('Parsing ' . $file_type . ' file ' . $file_path . '...'); | |
321 } elsif ($file_path =~ m/($ps?)([^\.]+)\.fa(sta)?$/i) { | |
322 $file_type = 'FASTA'; | |
323 # For FASTA files we don't know what to expect: | |
324 # IDs or ACcession numbers or a mix of both. | |
325 # (Re)set index_id type to AC to prevent adding redundant IDs to the results. | |
326 # Hence treat all identifiers stable and unstable as ACs. | |
327 # (Normally we default to ACcession numbers and if we would have found an AC | |
328 # and the input fragments file was using IDs, we would append the AC to the results.) | |
329 $index_id = 'AC'; | |
330 $logger->info('Parsing ' . $file_type . ' file ' . $file_path . '...'); | |
331 } else { | |
332 $file_type = 'FASTA'; | |
333 $index_id = 'AC'; | |
334 $logger->warn('Could not determine database file type based on the file name extension of ' . $file_path); | |
335 $logger->warn('Will assume it is a FASTA file and hope that is correct...'); | |
336 $logger->info('Parsing ' . $file_type . ' file ' . $file_path . '...'); | |
337 } | |
338 | |
339 if ($file_type eq 'UniProtKB DAT') { | |
340 | |
341 # | |
342 # Read an entire record delimited by // at a time. | |
343 # | |
344 local $/ = "\/\/\n"; | |
345 | |
346 eval { | |
347 open($file_path_fh, "<$file_path"); | |
348 }; | |
349 if ($@) { | |
350 $logger->fatal('Cannot read from database input file: ' . $@); | |
351 exit; | |
352 } | |
353 | |
354 while (<$file_path_fh>){ | |
355 | |
356 $logger->trace('Record:' . "\n$_\n"); | |
357 | |
358 my $entry = SWISS::Entry->fromText($_); | |
359 | |
360 my $id = $entry->ID; | |
361 my $acc = $entry->AC; | |
362 my $seq = $entry->SQ; | |
363 my $index_key; | |
364 | |
365 if ($index_id eq 'ID') { | |
366 $index_key = $id; | |
367 } elsif ($index_id eq 'AC') { | |
368 $index_key = $acc; | |
369 } else { | |
370 $logger->fatal('Unknown type of identifier used for indexing: ' . $index_id . '. Must be ID or AC.'); | |
371 exit; | |
372 } | |
373 | |
374 $logger->debug('Found accession ' . $acc); | |
375 $logger->trace('Found accession ' . $acc . ' with ID ' . $id . ' and seq ' . $seq); | |
376 | |
377 $seqs{$index_key}{'AC'} = $acc; | |
378 $seqs{$index_key}{'ID'} = $id; | |
379 $seqs{$index_key}{'SQ'} = $seq; | |
380 | |
381 } | |
382 | |
383 } elsif ($file_type eq 'FASTA') { | |
384 | |
385 my @header_ids = (); | |
386 my $seq = ''; | |
387 | |
388 eval { | |
389 open($file_path_fh, "<$file_path"); | |
390 }; | |
391 if ($@) { | |
392 $logger->fatal('Cannot read from database input file: ' . $@); | |
393 exit; | |
394 } | |
395 | |
396 LINE: while (my $line = <$file_path_fh>) { | |
397 | |
398 if ($line =~ m/^[\s\n\r]+$/) { | |
399 | |
400 # | |
401 # Skip blank line | |
402 # | |
403 $logger->debug('Found empty line in FASTA file.'); | |
404 next LINE; | |
405 | |
406 } elsif ($line =~ m/^[a-zA-Z]+/) { | |
407 | |
408 # | |
409 # It's a sequence line | |
410 # Remove all white space and new line characters. | |
411 # | |
412 $logger->debug('Found FASTA sequence line: ' . $line); | |
413 | |
414 $line =~ s/[\s\n\r]+//g; | |
415 $seq .= $line; | |
416 | |
417 } elsif ($line =~ m/^>/) { | |
418 | |
419 $logger->debug('Found new FASTA header line: ' . $line); | |
420 | |
421 if ((length($seq) > 0) && defined($header_ids[0])) { | |
422 | |
423 # | |
424 # Store the previously found sequence in the lookup table. | |
425 # | |
426 foreach my $header_id (@header_ids) { | |
427 | |
428 # | |
429 # We don't know if the ID we found is an accession number (stable ID) or a normal (unstable) ID | |
430 # -> Fill both the AC and ID fields with the same ID value. | |
431 # | |
432 $seqs{$header_id}{'AC'} = $header_id; | |
433 $seqs{$header_id}{'ID'} = $header_id; | |
434 $seqs{$header_id}{'SQ'} = $seq; | |
435 | |
436 $logger->debug('Stored ' . $header_id . ':'); | |
437 $logger->debug("\t" . 'SQ ' . $seq); | |
438 | |
439 } | |
440 | |
441 # | |
442 # Reset vars. | |
443 # | |
444 $seq = ''; | |
445 @header_ids = (); | |
446 | |
447 } | |
448 | |
449 # | |
450 # Check for the presence of some frequently occurring naming schemes: | |
451 # | |
452 # >IPI:CON_Trypsin|SWISS-PROT:P00761|TRYP_PIG Trypsin - Sus scrofa (Pig). | |
453 # >IPI:CON_IPI00174775.2|TREMBL:Q32MB2;Q86Y46 Tax_Id=9606 Gene_Symbol=KRT73 Keratin-73 | |
454 # >sp|Q42592|APXS_ARATH L-ascorbate peroxidase S, chloroplastic/mitochondrial; | |
455 # >jgi|Triad1|1|gw1.3.1.1 | |
456 # | |
457 # The FASTA header line can basically have any format. | |
458 # Therefore we try to see if the IDs we are looking for are present anywhere | |
459 # in the header line up until the first white space or new line. | |
460 # This should prevent matching annotation from other proteins in the description | |
461 # like a for example a 'best' BLAST hit that contains one of the IDs we | |
462 # are looking for. Hence, in such a case this sequence is *not* the one of | |
463 # the IDs we looking for, but similar to that one at best. | |
464 # | |
465 my @unfiltered_ids; | |
466 if ($line =~ m/^>((([^\s;|]+)[;|])*([^\s;|]+))[|;]?/i) { | |
467 | |
468 # | |
469 # Got multiple IDs | |
470 # | |
471 my $multiple_ids = $1; | |
472 @unfiltered_ids = split(/[\s;|]+/, $multiple_ids); | |
473 | |
474 } elsif ($line =~ m/^>([^\s]+)/i){ | |
475 | |
476 # | |
477 # Got a single ID | |
478 # | |
479 my $single_id = $1; | |
480 push(@unfiltered_ids, $single_id); | |
481 | |
482 } | |
483 | |
484 KEY:foreach my $unfiltered_id (@unfiltered_ids) { | |
485 | |
486 my $this_id = $unfiltered_id; | |
487 | |
488 # | |
489 # Check for RockerBox formattted IDs. | |
490 # | |
491 # Example of RockerBox ID format, which may contain | |
492 # * multiple IDs separated by semi-colons and | |
493 # * suffixed with the position of the peptide in the protein: | |
494 # YGR192C[123-142]; YJR009C[123-142] | |
495 if ($this_id =~ m/^([^\[\]]+)\[\d+-\d+\]$/i) { | |
496 $logger->trace('Protein ID in RockerBox format: ' . $this_id); | |
497 $this_id = $1; | |
498 $logger->trace("\t" . 'Using ID : ' . $this_id); | |
499 } | |
500 | |
501 # | |
502 # Check for DB namespace prefixes. | |
503 # | |
504 if ($this_id =~ m/([^:]+):([^:]+)/i) { | |
505 | |
506 # | |
507 # Namespace prefixed ID. | |
508 # | |
509 $logger->trace('Namespace prefixed ID: ' . $this_id); | |
510 | |
511 my $prefix = $1; | |
512 $this_id = $2; | |
513 | |
514 $logger->trace("\t" . 'Using ID: ' . $this_id); | |
515 | |
516 # | |
517 # For a selected list of "known" database namespace prefixes, | |
518 # check for versioned accession numbers and remove the version number. | |
519 # | |
520 if (defined($prefix) && defined($databases_with_optionally_versioned_ids{$prefix})) { | |
521 $logger->trace('Found prefix: ' . $prefix); | |
522 my $separator = $databases_with_optionally_versioned_ids{$prefix}; | |
523 $separator = '\\' . $separator; | |
524 if ($this_id =~ m/([^$separator]+)$separator\d+$/) { | |
525 $this_id = $1; | |
526 } | |
527 } | |
528 } | |
529 | |
530 $logger->debug('Found ID: ' . $this_id); | |
531 push(@header_ids, $this_id); | |
532 | |
533 } | |
534 | |
535 # if ($line =~ m/^>((([^\s\n\r:;|]+)[:]([^\s\n\r:|]+)[|;])*([^\s\n\r:;|]+)[:]([^\s\n\r:|]+))[|;]?(\s+(.+))?/i) { | |
536 # | |
537 # # | |
538 # # One or more namespace prefixed IDs. | |
539 # # | |
540 # my $concatenated_namespace_prefixed_ids = $1; | |
541 # $logger->debug('Found prefixed IDs in header: ' . $concatenated_namespace_prefixed_ids); | |
542 # | |
543 # # database_namespace = $3 && $5; | |
544 # # id = $4 && $6; | |
545 # # description = $8; | |
546 # my @namespace_prefixed_ids = split(/[|;]/, $concatenated_namespace_prefixed_ids); | |
547 # | |
548 # foreach my $prefixed_id (@namespace_prefixed_ids) { | |
549 # | |
550 # $logger->trace('Prefixed ID: ' . $prefixed_id); | |
551 # | |
552 # if ($prefixed_id =~ m/(([^\s\n\r:;|]+):)?([^\s\n\r:|]+)/i) { | |
553 # | |
554 # my $this_prefix = $2; | |
555 # my $this_id = $3; | |
556 # $logger->debug('Found ID: ' . $this_id); | |
557 # push(@header_ids, $this_id); | |
558 # | |
559 # # | |
560 # # For a selected list of "known" database namespace prefixes, | |
561 # # check for versioned accession numbers and - if found - add | |
562 # # both the versioned and the unversioned accession number | |
563 # # to the list of header IDs. | |
564 # # | |
565 # if (defined($this_prefix) && defined($databases_with_optionally_versioned_ids{$this_prefix})) { | |
566 # $logger->trace('Found prefix: ' . $this_prefix); | |
567 # my $separator = $databases_with_optionally_versioned_ids{$this_prefix}; | |
568 # $separator = '\\' . $separator; | |
569 # if ($this_id =~ m/([^$separator]+)$separator\d+$/) { | |
570 # my $this_unversioned_id = $1; | |
571 # $logger->debug('Found unversioned ID: ' . $this_unversioned_id); | |
572 # push(@header_ids, $this_unversioned_id); | |
573 # } | |
574 # } | |
575 # | |
576 # } else { | |
577 # | |
578 # $logger->warn( | |
579 # 'This should have been an optionally prefixed ID, ' | |
580 # . 'but failed to match the corresponding regex: ' | |
581 # . $prefixed_id); | |
582 # | |
583 # } | |
584 # } | |
585 # | |
586 # } elsif ($line =~ m/^>((([^\s\n\r:;|]+)[|;])*([^\s\n\r:;|]+))[|;]?(\s+(.+))?/i) { | |
587 # | |
588 # # | |
589 # # One or more unprefixed IDs. | |
590 # # | |
591 # my $concatenated_ids = $1; | |
592 # $logger->debug('Found unprefixed IDs in header: ' . $concatenated_ids); | |
593 # | |
594 # # id = $3 && $4; | |
595 # # description = $7; | |
596 # my @unprefixed_ids = split(/[|;]/, $concatenated_ids); | |
597 # | |
598 # foreach my $unprefixed_id (@unprefixed_ids) { | |
599 # | |
600 # $logger->debug('Found ID: ' . $unprefixed_id); | |
601 # push(@header_ids, $unprefixed_id); | |
602 # | |
603 # } | |
604 # | |
605 # } else { | |
606 # | |
607 # # | |
608 # # Something else. | |
609 # # | |
610 # # The FASTA header line can basically have any format, | |
611 # # so this is probably one of the less frequent occurring annotation schemes. | |
612 # # Therefore we try to see if the IDs we are looking for are present anywhere | |
613 # # in the header line up until the first white space or new line. | |
614 # # This may be tricky as we might match other annotation from other proteins | |
615 # # like a description from a 'best' BLAST hit that contains one of the IDs we | |
616 # # are looking for. Hence, in such a case this sequence is *not* the one of | |
617 # # the IDs we looking for, but similar to that one at best. | |
618 # # | |
619 # if ($line =~ m/>([^\n\r\s]+)/) { | |
620 # | |
621 # my $putative_id = $1; | |
622 # $logger->debug('Found putative ID in header: ' . $putative_id); | |
623 # push(@header_ids, $putative_id); | |
624 # | |
625 # } else { | |
626 # $logger->warn('Cannot identify IDs in this header: ' . $line); | |
627 # } | |
628 # } | |
629 } | |
630 } | |
631 | |
632 # | |
633 # Store the last found sequence in the lookup table. | |
634 # | |
635 foreach my $header_id (@header_ids) { | |
636 | |
637 # | |
638 # We don't know if the ID we found is an accession number (stable ID) or a normal (unstable) ID | |
639 # -> Fill both the AC and ID fields with the same ID value. | |
640 # | |
641 $seqs{$header_id}{'AC'} = $header_id; | |
642 $seqs{$header_id}{'ID'} = $header_id; | |
643 $seqs{$header_id}{'SQ'} = $seq; | |
644 | |
645 $logger->debug('Stored ' . $header_id . ':'); | |
646 $logger->debug("\t" . 'SQ ' . $seq); | |
647 | |
648 } | |
649 | |
650 # | |
651 # Reset vars. | |
652 # | |
653 $seq = ''; | |
654 @header_ids = (); | |
655 | |
656 } | |
657 | |
658 close($file_path_fh); | |
659 $logger->info('Created lookup list for database sequences.'); | |
660 | |
661 return (\%seqs, $index_id); | |
662 | |
663 } | |
664 | |
665 sub _ExtractSeqs { | |
666 | |
667 my ($db_lookup_table_refs, | |
668 $path_from, $strip_lc, $id_column, $peptide_column, | |
669 $cleavage_path_to, $miscleavage_path_to, $modification_path_to, $peptide_path_to, | |
670 $cleavage_aa, $cleavage_terminus, $miscleavage_distance, | |
671 $n_term_context_length, $c_term_context_length, | |
672 $padding_character) = @_; | |
673 | |
674 $logger->debug('Parsing ' . $path_from . '...'); | |
675 | |
676 my $extracted_seq_contexts_cle = 0; | |
677 my $extracted_seq_contexts_mis = 0; | |
678 my $extracted_seq_contexts_mod = 0; | |
679 my $extracted_seq_contexts_pep = 0; | |
680 my $path_from_fh; | |
681 my $cleavage_path_to_fh; | |
682 my $miscleavage_path_to_fh; | |
683 my $modification_path_to_fh; | |
684 my $peptide_path_to_fh; | |
685 | |
686 eval { | |
687 open($path_from_fh, "<$path_from"); | |
688 }; | |
689 if ($@) { | |
690 $logger->fatal('Cannot read from fragments input file: ' . $@); | |
691 exit; | |
692 } | |
693 if (defined($cleavage_path_to)) { | |
694 eval { | |
695 open($cleavage_path_to_fh, ">$cleavage_path_to"); | |
696 }; | |
697 if ($@) { | |
698 $logger->fatal('Cannot write to output file: ' . $@); | |
699 exit; | |
700 } | |
701 } | |
702 if (defined($miscleavage_path_to)) { | |
703 eval { | |
704 open($miscleavage_path_to_fh, ">$miscleavage_path_to"); | |
705 }; | |
706 if ($@) { | |
707 $logger->fatal('Cannot write to output file: ' . $@); | |
708 exit; | |
709 } | |
710 } | |
711 if (defined($modification_path_to)) { | |
712 eval { | |
713 open($modification_path_to_fh, ">$modification_path_to"); | |
714 }; | |
715 if ($@) { | |
716 $logger->fatal('Cannot write to output file: ' . $@); | |
717 exit; | |
718 } | |
719 } | |
720 if (defined($peptide_path_to)) { | |
721 eval { | |
722 open($peptide_path_to_fh, ">$peptide_path_to"); | |
723 }; | |
724 if ($@) { | |
725 $logger->fatal('Cannot write to output file: ' . $@); | |
726 exit; | |
727 } | |
728 } | |
729 | |
730 LINE:while (my $line = <$path_from_fh>) { | |
731 | |
732 # | |
733 # Remove (trailing) line end characters! | |
734 # | |
735 $line =~ s/[\n\r\f]+//g; | |
736 $logger->trace($line); | |
737 | |
738 my @column_values = split(/\t/, $line); | |
739 | |
740 # For debugging only. | |
741 #foreach my $val (@column_values) { | |
742 # $logger->debug('Col val: ' . $val); | |
743 #} | |
744 | |
745 my $id_column_index = $id_column - 1; | |
746 my $peptide_column_index = $peptide_column - 1; | |
747 my $unfiltered_key = $column_values[$id_column_index]; | |
748 my $key; | |
749 my @keys; | |
750 my $peptide = $column_values[$peptide_column_index]; | |
751 my $peptide_nonmod; | |
752 my $protein; | |
753 my $acc; | |
754 my $id; | |
755 my $index = -1; # An index < 0 indicates the peptide was not found in a protein sequence (yet). | |
756 my $length; | |
757 my $cleavage_peptide_contexts = []; | |
758 my $miscleavage_peptide_contexts = []; | |
759 my $modification_peptide_contexts = []; | |
760 my $peptide_contexts = []; | |
761 | |
762 # | |
763 # Check if $key and $peptide were present in this line. | |
764 # If not this may be a leading/trailing empty or meta-data line, | |
765 # that does not contain the correct amount of columns. | |
766 # | |
767 unless (defined($unfiltered_key) && defined($peptide)) { | |
768 $logger->warn('Skipping line: ' . $line); | |
769 $logger->warn("\t" . '(Peptide sequence and/or protein ID missing or incorrect amount of columns.)'); | |
770 next LINE; | |
771 } | |
772 | |
773 # | |
774 # Sanity check for peptide sequences. | |
775 # | |
776 unless ($peptide =~ m/([a-zA-Z]+)/) { | |
777 $logger->warn('Fragment file line in unexpected format. Cannot find peptide in: ' . $line); | |
778 $logger->debug('$peptide contains: ' . $peptide); | |
779 next LINE; | |
780 } | |
781 | |
782 if ($strip_lc) { | |
783 # Remove lower case characters representing modifications. | |
784 $peptide_nonmod = $peptide; | |
785 $peptide_nonmod =~ s/[a-z]+//g; | |
786 } else { | |
787 # Convert everything to uppercase. | |
788 $peptide_nonmod = uc($peptide); | |
789 } | |
790 | |
791 # | |
792 # Figure out in what format the Protein ID(s) were supplied. | |
793 # | |
794 # * Strip off optional DB namespace prefixes. | |
795 # * Handle optional accession number version suffixes. | |
796 # | |
797 if ($unfiltered_key =~ m/^((([^\s;|]+)[\s;|]+)*([^\s;|]+))[\s|;]?/i) { | |
798 | |
799 # | |
800 # Got multiple IDs | |
801 # | |
802 my $multiple_ids = $1; | |
803 @keys = split(/[\s;|]+/, $multiple_ids); | |
804 | |
805 } else { | |
806 | |
807 # | |
808 # Got a single ID | |
809 # | |
810 push(@keys, $unfiltered_key); | |
811 | |
812 } | |
813 | |
814 KEY:foreach $unfiltered_key (@keys) { | |
815 | |
816 $key = $unfiltered_key; | |
817 | |
818 # | |
819 # Check for RockerBox formattted IDs. | |
820 # | |
821 # Example of RockerBox ID format, which may contain | |
822 # * multiple IDs separated by semi-colons and | |
823 # * suffixed with the position of the peptide in the protein: | |
824 # YGR192C[123-142]; YJR009C[123-142] | |
825 if ($key =~ m/^([^\[]+)\[\d+-\d+\]$/i) { | |
826 $logger->trace('Protein ID in RockerBox format: ' . $key); | |
827 $key = $1; | |
828 $logger->trace("\t" . 'Using ID : ' . $key); | |
829 } | |
830 | |
831 # | |
832 # Check for DB namespace prefixes. | |
833 # | |
834 if ($key =~ m/([^:]+):([^:]+)/i) { | |
835 | |
836 # | |
837 # Namespace prefixed ID. | |
838 # | |
839 $logger->trace('Namespace prefixed ID: ' . $key); | |
840 | |
841 my $prefix = $1; | |
842 $key = $2; | |
843 | |
844 $logger->trace("\t" . 'Using ID: ' . $key); | |
845 | |
846 # | |
847 # For a selected list of "known" database namespace prefixes, | |
848 # check for versioned accession numbers and remove the version number. | |
849 # | |
850 if (defined($prefix) && defined($databases_with_optionally_versioned_ids{$prefix})) { | |
851 $logger->trace('Found prefix: ' . $prefix); | |
852 my $separator = $databases_with_optionally_versioned_ids{$prefix}; | |
853 $separator = '\\' . $separator; | |
854 if ($key =~ m/([^$separator]+)$separator\d+$/) { | |
855 $key = $1; | |
856 } | |
857 } | |
858 } | |
859 | |
860 $logger->debug('Found ID: ' . $key); | |
861 | |
862 # | |
863 # Sanity check for Protein ID. | |
864 # | |
865 unless ($key =~ m/^([A-Z0-9\._-]+)$/i) { | |
866 $logger->warn('Fragment file line in unexpected format. Cannot find sequence ID in:'); | |
867 $logger->warn("\t" . $line); | |
868 $logger->debug('$key contains: ' . $key); | |
869 next LINE; | |
870 } | |
871 | |
872 # | |
873 # Lookup info for the protein this peptide was derived from. | |
874 # | |
875 ($protein, $acc, $id, $index) = _LookupProteinInfo($key, $peptide_nonmod, $db_lookup_table_refs); | |
876 | |
877 # | |
878 # Check if the peptide (fragment) was found in this protein sequence. | |
879 # | |
880 if (defined($protein)) { | |
881 if ($index > -1) { | |
882 $logger->trace('Found peptide sequence in protein ' . $key); | |
883 last KEY; | |
884 } else { | |
885 $logger->warn('Cannot find peptide (' . $peptide_nonmod . ') in protein ' . $key); | |
886 $logger->warn("\t" . 'Will try other protein IDs for this peptide if available.'); | |
887 } | |
888 } else { | |
889 $logger->warn('Cannot find protein identifier ' . $key . ' in any of the supplied databases.'); | |
890 $logger->warn("\t" . 'Will try other protein IDs for this peptide if available.'); | |
891 } | |
892 } | |
893 | |
894 # | |
895 # Check if the peptide (fragment) was found in any of the associated protein sequences. | |
896 # | |
897 if (defined($protein)) { | |
898 | |
899 $logger->debug('Found protein ' . $key); | |
900 | |
901 if ($index > -1) { | |
902 | |
903 $logger->debug('Found peptide (' . $peptide_nonmod . ') in protein ' . $key); | |
904 | |
905 if (defined($cleavage_path_to)) { | |
906 ($cleavage_peptide_contexts) = _GetCleavageSequenceContexts($peptide_nonmod, $protein, $key, $index, | |
907 $cleavage_aa, $cleavage_terminus); | |
908 } | |
909 if (defined($miscleavage_path_to)) { | |
910 ($miscleavage_peptide_contexts) = _GetMiscleavageSequenceContexts($peptide_nonmod, $protein, $key, $index, | |
911 $cleavage_aa, $cleavage_terminus, $miscleavage_distance); | |
912 } | |
913 if (defined($modification_path_to)) { | |
914 ($modification_peptide_contexts) = _GetModificationSequenceContexts($peptide_nonmod, $protein, $key, $index, | |
915 $modified_aa, $peptide); | |
916 } | |
917 if (defined($peptide_path_to)) { | |
918 ($peptide_contexts) = _GetPeptideSequenceContexts($peptide_nonmod, $protein, $key, $index); | |
919 } | |
920 } else { | |
921 $logger->error('Cannot find peptide (' . $peptide_nonmod . ') in protein ' . $key); | |
922 $acc = 'NA'; | |
923 } | |
924 } else { | |
925 $logger->error('Cannot find protein "' . $unfiltered_key . '" in any of the supplied databases.'); | |
926 $acc = 'NA'; | |
927 } | |
928 | |
929 # | |
930 # Append the peptide contexts to the existing lines of the fragment file and save that new line to the output file. | |
931 # | |
932 $extracted_seq_contexts_cle = _SaveSequenceContexts($line, $acc, $cleavage_peptide_contexts, | |
933 $cleavage_path_to_fh, $extracted_seq_contexts_cle); | |
934 | |
935 $extracted_seq_contexts_mis = _SaveSequenceContexts($line, $acc, $miscleavage_peptide_contexts, | |
936 $miscleavage_path_to_fh, $extracted_seq_contexts_mis); | |
937 | |
938 $extracted_seq_contexts_mod = _SaveSequenceContexts($line, $acc, $modification_peptide_contexts, | |
939 $modification_path_to_fh, $extracted_seq_contexts_mod); | |
940 | |
941 $extracted_seq_contexts_pep = _SaveSequenceContexts($line, $acc, $peptide_contexts, | |
942 $peptide_path_to_fh, $extracted_seq_contexts_pep); | |
943 | |
944 } | |
945 | |
946 # | |
947 # Close file handles. | |
948 # | |
949 foreach my $fh ($path_from_fh, $cleavage_path_to_fh, $miscleavage_path_to_fh, $modification_path_to_fh, $peptide_path_to_fh) { | |
950 if (defined($fh)) { | |
951 close($fh); | |
952 } | |
953 } | |
954 | |
955 return($extracted_seq_contexts_cle, $extracted_seq_contexts_mis, $extracted_seq_contexts_mod, $extracted_seq_contexts_pep); | |
956 | |
957 } | |
958 | |
959 sub _LookupProteinInfo { | |
960 | |
961 my ($key, $peptide, $db_lookup_table_refs) = @_; | |
962 | |
963 my $protein_found_in_db = 0; | |
964 my $protein; | |
965 my $acc; | |
966 my $id; | |
967 my $index; | |
968 | |
969 DB_LOOKUP:foreach my $db_lookup_table_ref (@{$db_lookup_table_refs}) { | |
970 | |
971 if (defined(${$db_lookup_table_ref}{$key})) { | |
972 | |
973 $protein_found_in_db = 1; | |
974 $logger->debug('Found protein ' . $key . ' in this sequence DB.'); | |
975 | |
976 $protein = ${$db_lookup_table_ref}{$key}{'SQ'}; | |
977 $acc = ${$db_lookup_table_ref}{$key}{'AC'}; | |
978 $id = ${$db_lookup_table_ref}{$key}{'ID'}; | |
979 # | |
980 # Find peptide (fragment) in DB protein sequence. | |
981 # | |
982 $index = index($protein, $peptide); | |
983 | |
984 if ($index > -1) { | |
985 | |
986 # | |
987 # Found peptide! | |
988 # | |
989 $logger->debug('Found peptide ' . $peptide . ' at position ' . $index . ' in protein ' . $key); | |
990 last DB_LOOKUP; | |
991 | |
992 } | |
993 | |
994 # | |
995 # If the input was based on mass spectrometry data we cannot see | |
996 # the difference between I (Isolucene) and L (Leucine) as the both | |
997 # weigh the same. Let's try an I agnostic search by substituting all | |
998 # Is for Ls. | |
999 # | |
1000 my $protein_l_only = $protein; | |
1001 my $peptide_l_only = $peptide; | |
1002 $protein_l_only =~ s/I/L/g; | |
1003 $peptide_l_only =~ s/I/L/g; | |
1004 $index = index($protein_l_only, $peptide_l_only); | |
1005 | |
1006 if ($index > -1) { | |
1007 | |
1008 # | |
1009 # Found peptide! | |
1010 # | |
1011 $logger->debug('Found peptide ' . $peptide . ' at position ' . $index . ' in protein ' . $key); | |
1012 $logger->debug("\t" . 'Had to treat I and L as the same amino acid in the DB search to get a match.'); | |
1013 last DB_LOOKUP; | |
1014 | |
1015 } | |
1016 | |
1017 # | |
1018 # Cannot find peptide :( | |
1019 # | |
1020 $protein = undef; | |
1021 $acc = undef; | |
1022 $id = undef; | |
1023 | |
1024 $logger->debug('Cannot find peptide (' . $peptide . ') in protein ' . $key); | |
1025 $logger->debug("\t" . 'This may be the result of an updated protein sequence in the database.'); | |
1026 $logger->debug("\t" . 'Will try to find the peptide in other databases (if available).'); | |
1027 | |
1028 if ($strip_lc) { | |
1029 $logger->debug("\t" . '(Stripping of lowercase characters, indicating modifications in the peptide sequence, was enabled.)'); | |
1030 } else { | |
1031 $logger->debug("\t" . '(Stripping of lowercase characters, indicating modifications in the peptide sequence, was *not* enabled.)'); | |
1032 } | |
1033 | |
1034 } else { | |
1035 $logger->debug('Protein ' . $key . ' missing from this sequence DB. Will try other databases (if available).'); | |
1036 } | |
1037 } | |
1038 | |
1039 # | |
1040 # Check if the peptide (fragment) was found in a DB protein sequence. | |
1041 # | |
1042 if ($protein_found_in_db == 1) { | |
1043 unless ($index > -1) { | |
1044 $logger->warn('Cannot find peptide (' . $peptide . ') in protein ' . $key); | |
1045 $logger->warn("\t" . 'Will try to find this peptide (' . $peptide . ') using other associated protein identifiers (if available).'); | |
1046 } | |
1047 } else { | |
1048 $logger->warn('Cannot find protein ' . $key . ' in any of the supplied databases.'); | |
1049 } | |
1050 | |
1051 return($protein, $acc, $id, $index); | |
1052 } | |
1053 | |
1054 sub _GetCleavageSequenceContexts { | |
1055 | |
1056 my ($peptide, $protein, $key, $index, $cleavage_aa, $cleavage_terminus) = @_; | |
1057 | |
1058 my %peptide_contexts; | |
1059 $peptide_contexts{'n_term'}{'context'} = ''; | |
1060 $peptide_contexts{'c_term'}{'context'} = ''; | |
1061 my $max_context_length = $n_term_context_length + $c_term_context_length; | |
1062 | |
1063 foreach my $peptide_term (keys(%peptide_contexts)) { | |
1064 | |
1065 # | |
1066 # Get the offset for the sequence contexts to retrieve from this peptide terminus | |
1067 # or drop the context for this peptide terminus if it is also the protein terminus. | |
1068 # | |
1069 if ($peptide_term eq 'n_term') { | |
1070 if ($index > 0) { | |
1071 # This is an N-terminal peptide context. | |
1072 my $offset = $index - $n_term_context_length; | |
1073 $peptide_contexts{$peptide_term}{'offset'} = $offset; | |
1074 $logger->debug('Offset = ' . $offset . ' for ' . $peptide_term . ' of peptide ' . $peptide . ' in protein ' . $key . '.'); | |
1075 } else { | |
1076 # Peptide N-terminus == Protein N-terminus. | |
1077 $logger->warn('Peptide N-terminus = protein N-terminus. Dropping N-terminal sequence context for peptide ' . $peptide . | |
1078 ' in protein ' . $key . '.'); | |
1079 next; | |
1080 } | |
1081 } elsif ($peptide_term eq 'c_term') { | |
1082 if (($index + length($peptide)) < length($protein)) { | |
1083 # This is a C-terminal peptide context. | |
1084 my $offset = ($index + length($peptide))- $n_term_context_length; | |
1085 $peptide_contexts{$peptide_term}{'offset'} = $offset; | |
1086 $logger->debug('Offset = ' . $offset . ' for ' . $peptide_term . ' of peptide ' . $peptide . ' in protein ' . $key . '.'); | |
1087 } else { | |
1088 # Peptide C-terminus == Protein C-terminus. | |
1089 $logger->warn('Peptide C-terminus = protein C-terminus. Dropping C-terminal sequence context for peptide ' . $peptide . | |
1090 ' in protein ' . $key . '.'); | |
1091 next; | |
1092 } | |
1093 } | |
1094 | |
1095 # | |
1096 # Check if DB protein sequence is long enough on n-terminal side | |
1097 # to retrieve full length peptide contexts. | |
1098 # (c-term length is checked later...) | |
1099 # | |
1100 if ($peptide_contexts{$peptide_term}{'offset'} < 0) { | |
1101 | |
1102 # | |
1103 # Protein too short for full length context: Calculate offset adjustment. | |
1104 # | |
1105 my $offset = $peptide_contexts{$peptide_term}{'offset'}; | |
1106 my $offset_adjustment = -($offset); | |
1107 $peptide_contexts{$peptide_term}{'offset_adjustment'} = $offset_adjustment; | |
1108 $logger->debug('Offset ' . $offset . ' for ' . $peptide_term . ' of peptide ' . $peptide . ' in protein ' . $key . ' less than zero'); | |
1109 | |
1110 if (length($padding_character) == 1) { | |
1111 | |
1112 # | |
1113 # Add N-terminal padding characters for contexts that are too short on the N-terminal side. | |
1114 # | |
1115 my $n_term_padding = "$padding_character" x $offset_adjustment; | |
1116 $peptide_contexts{$peptide_term}{'context'} = $n_term_padding; | |
1117 $logger->debug("\t" . 'Will add N-terminal padding: ' . $n_term_padding); | |
1118 | |
1119 } else { | |
1120 $logger->debug('Padding was disabled. Will not add n-term padding.'); | |
1121 } | |
1122 | |
1123 $peptide_contexts{$peptide_term}{'offset'} = 0; | |
1124 | |
1125 } else { | |
1126 | |
1127 $peptide_contexts{$peptide_term}{'offset_adjustment'} = 0; | |
1128 | |
1129 } | |
1130 | |
1131 # | |
1132 # Extract sequence context from DB protein sequence. | |
1133 # | |
1134 my $peptide_context = substr($protein, $peptide_contexts{$peptide_term}{'offset'}, | |
1135 $max_context_length - $peptide_contexts{$peptide_term}{'offset_adjustment'}); | |
1136 # append the context extracted from the protein sequence, | |
1137 # because there may be already some N-terminal padding... | |
1138 $peptide_contexts{$peptide_term}{'context'} .= $peptide_context; | |
1139 | |
1140 # | |
1141 # Quality Control: Check enzyme specificity for enzymes with known (= user supplied) specs. | |
1142 # | |
1143 unless ($cleavage_aa eq '*') { | |
1144 | |
1145 $peptide_context = $peptide_contexts{$peptide_term}{'context'}; | |
1146 my $p1 = substr($peptide_context, ($n_term_context_length -1), 1); | |
1147 my $p1_prime = substr($peptide_context, $n_term_context_length, 1); | |
1148 | |
1149 if ($cleavage_terminus eq 'C') { | |
1150 | |
1151 if ($p1 eq $cleavage_aa) { | |
1152 # Peptide fragment contains the AA recognised by the protease. | |
1153 $logger->debug('Specific protease activity -> Retaining peptide context.'); | |
1154 } else { | |
1155 # Must be non-specific cleavage -> drop peptide context. | |
1156 $logger->warn('Non-specific cleavage: dropping ' . $peptide_term . ' peptide context ' . $peptide_context . | |
1157 ' for peptide ' . $peptide . ' in protein ' . $key); | |
1158 $peptide_contexts{$peptide_term}{'context'} = ''; | |
1159 } | |
1160 | |
1161 } elsif ($cleavage_terminus eq 'N') { | |
1162 | |
1163 if ($p1_prime eq $cleavage_aa) { | |
1164 # Peptide fragment contains the AA recognised by the protease. | |
1165 $logger->debug('Specific protease activity -> Retaining peptide context.'); | |
1166 } else { | |
1167 # Must be non-specific cleavage -> drop peptide context. | |
1168 $logger->warn('Non-specific cleavage: dropping ' . $peptide_term . ' peptide context ' . $peptide_context . | |
1169 ' for peptide ' . $peptide . ' in protein ' . $key); | |
1170 $peptide_contexts{$peptide_term}{'context'} = ''; | |
1171 } | |
1172 | |
1173 } elsif ($cleavage_terminus eq '*') { | |
1174 | |
1175 if (($p1 eq $cleavage_aa) || ($p1_prime eq $cleavage_aa)) { | |
1176 # Peptide fragment contains the AA recognised by the protease. | |
1177 $logger->debug('Specific protease activity -> Retaining peptide context.'); | |
1178 } else { | |
1179 # Must be non-specific cleavage -> drop peptide context. | |
1180 $logger->warn('Non-specific cleavage: dropping ' . $peptide_term . ' peptide context ' . $peptide_context . | |
1181 ' for peptide ' . $peptide . ' in protein ' . $key); | |
1182 $peptide_contexts{$peptide_term}{'context'} = ''; | |
1183 } | |
1184 | |
1185 } else { | |
1186 $logger->fatal('Unknown cleavage terminus ' . $cleavage_terminus . '. Must be C, N or *.'); | |
1187 exit; | |
1188 } | |
1189 } | |
1190 } | |
1191 | |
1192 push(my @contexts, $peptide_contexts{'n_term'}{'context'}, $peptide_contexts{'c_term'}{'context'}); | |
1193 | |
1194 if (length($padding_character) == 1) { | |
1195 _CheckSequenceContextLength(\@contexts, $peptide, $key, $max_context_length); | |
1196 } else { | |
1197 $logger->debug('Padding was disabled. Will not add c-term padding nor check sequence context length.'); | |
1198 } | |
1199 | |
1200 return(\@contexts); | |
1201 | |
1202 } | |
1203 | |
1204 sub _GetMiscleavageSequenceContexts { | |
1205 | |
1206 my ($peptide, $protein, $key, $petide_index, $cleavage_aa, $cleavage_terminus, $miscleavage_distance) = @_; | |
1207 | |
1208 my @peptide_contexts; | |
1209 my $max_context_length = $n_term_context_length + $c_term_context_length; | |
1210 | |
1211 # | |
1212 # Find miscleavage sites in peptide. | |
1213 # | |
1214 # TODO: handle --mcd | |
1215 # | |
1216 my @amino_acids = split('', $peptide); | |
1217 my $aa_index = 0; | |
1218 # | |
1219 # Drop first and last amino acids as these are from cleavage sites or from the protein termini. | |
1220 # In either case they cannot represent a miscleavage site. | |
1221 # | |
1222 shift(@amino_acids); | |
1223 pop(@amino_acids); | |
1224 | |
1225 foreach my $aa (@amino_acids) { | |
1226 | |
1227 $aa_index++; | |
1228 | |
1229 if ($aa eq $cleavage_aa) { | |
1230 | |
1231 # | |
1232 # We found a miscleavage site. | |
1233 # | |
1234 $logger->debug('Found miscleavage site in peptide ' . $peptide . ' of protein ' . $key . '.'); | |
1235 # | |
1236 # Get the offset for the sequence context. | |
1237 # | |
1238 my $offset = $petide_index + $aa_index - $n_term_context_length; | |
1239 $logger->debug("\t" . 'Offset = ' . $offset . '.'); | |
1240 my $n_term_padding = ''; | |
1241 my $offset_adjustment = 0; | |
1242 | |
1243 if ($cleavage_terminus eq 'C') { | |
1244 | |
1245 # | |
1246 # Must increment the offset with 1 if the protease cuts C-terminal of a recognition site. | |
1247 # | |
1248 $offset++; | |
1249 | |
1250 } | |
1251 | |
1252 # | |
1253 # Check if DB protein sequence is long enough on n-terminal side | |
1254 # to retrieve full length peptide contexts. | |
1255 # (c-term length is checked elsewhere.) | |
1256 # | |
1257 if ($offset < 0) { | |
1258 | |
1259 # | |
1260 # Protein too short for full length context: Calculate offset adjustment. | |
1261 # | |
1262 $offset_adjustment = -($offset); | |
1263 $logger->debug('Offset ' . $offset . ' for miscleavage context in peptide ' . $peptide . ' in protein ' . $key . ' less than zero'); | |
1264 | |
1265 if (length($padding_character) == 1) { | |
1266 | |
1267 # | |
1268 # Add N-terminal padding characters for contexts that are too short on the N-terminal side. | |
1269 # | |
1270 $n_term_padding = "$padding_character" x $offset_adjustment; | |
1271 $logger->debug("\t" . 'Will add N-terminal padding: ' . $n_term_padding); | |
1272 | |
1273 } else { | |
1274 $logger->debug('Padding was disabled. Will not add n-term padding.'); | |
1275 } | |
1276 | |
1277 # | |
1278 # Reset offset for this context to protein start. | |
1279 # | |
1280 $offset = 0; | |
1281 | |
1282 } | |
1283 | |
1284 # | |
1285 # Extract sequence context from DB protein sequence. | |
1286 # | |
1287 my $peptide_context = substr($protein, $offset, $max_context_length - $offset_adjustment); | |
1288 $peptide_context = $n_term_padding . $peptide_context; | |
1289 $logger->debug('Found miscleavage context: ' . $peptide_context . '.'); | |
1290 | |
1291 push(@peptide_contexts, $peptide_context); | |
1292 } | |
1293 } | |
1294 | |
1295 if (length($padding_character) == 1) { | |
1296 _CheckSequenceContextLength(\@peptide_contexts, $peptide, $key, $max_context_length); | |
1297 } else { | |
1298 $logger->debug('Padding was disabled. Will not add c-term padding nor check sequence context length.'); | |
1299 } | |
1300 | |
1301 return(\@peptide_contexts); | |
1302 | |
1303 } | |
1304 | |
1305 sub _GetModificationSequenceContexts { | |
1306 | |
1307 my ($peptide, $protein, $key, $petide_index, $modified_aa, $modified_peptide) = @_; | |
1308 | |
1309 my @peptide_contexts; | |
1310 my $modified_aa_index_adjustment; | |
1311 my @modified_aa_in_peptide_indices; | |
1312 my $modified_aa_in_protein_index; | |
1313 my $max_context_length = $n_term_context_length + 1 + $c_term_context_length; | |
1314 #my $offset = 0; | |
1315 my $modified_aa_in_modified_peptide_index; | |
1316 my $modified_peptide_remainder; | |
1317 my $processed_peptide_length = 0; | |
1318 | |
1319 # | |
1320 # Find amino acid in the string identifying both the modified amino acid and the modification. | |
1321 # | |
1322 if ($modified_aa =~ m/^([a-z]+)[A-Z*]$/) { | |
1323 | |
1324 $modified_aa_index_adjustment = length($1); | |
1325 | |
1326 } elsif ($modified_aa =~ m/^[A-Z*][a-z]$/) { | |
1327 | |
1328 $modified_aa_index_adjustment = 0; | |
1329 | |
1330 } else { | |
1331 | |
1332 $logger->fatal('Modified amino acid was specified in an unsupported format: ' . $modified_aa . '.'); | |
1333 exit; | |
1334 | |
1335 } | |
1336 | |
1337 # | |
1338 # Find modified sites in peptide. | |
1339 # | |
1340 my $modified_aa_for_regex = $modified_aa; | |
1341 $modified_aa_for_regex =~ s/\*/[A-Z]/; | |
1342 $logger->debug('modified_aa for search: ' . $modified_aa_for_regex); | |
1343 if ($modified_peptide =~ m/$modified_aa_for_regex/) { | |
1344 $logger->debug("\t" . 'Peptide contains mods.'); | |
1345 $modified_aa_in_modified_peptide_index = $-[0]; | |
1346 $modified_peptide_remainder = $'; | |
1347 my $modified_peptide_prefix = $`; | |
1348 $logger->trace("\t\t" . 'Processed pep length = ' . $processed_peptide_length); | |
1349 $processed_peptide_length += length($modified_peptide_prefix); | |
1350 $logger->trace("\t\t" . 'Processed pep length = ' . $processed_peptide_length); | |
1351 $processed_peptide_length += length($modified_aa); | |
1352 $logger->trace("\t\t" . 'Processed pep length = ' . $processed_peptide_length); | |
1353 $logger->trace("\t\t" . 'Index for mod :' . $modified_aa_in_modified_peptide_index); | |
1354 } else { | |
1355 $logger->debug('No modified amino acids found in this peptide.'); | |
1356 } | |
1357 | |
1358 #my $modified_aa_in_modified_peptide_index = index($modified_peptide, $modified_aa); | |
1359 | |
1360 while (defined($modified_aa_in_modified_peptide_index)) { | |
1361 | |
1362 $modified_aa_in_modified_peptide_index += $modified_aa_index_adjustment; | |
1363 $logger->debug('Found modified amino acid at position ' . ($modified_aa_in_modified_peptide_index + 1) . ' in modified peptide ' . $modified_peptide . ' of protein ' . $key . '.'); | |
1364 | |
1365 # | |
1366 # Count all modifications of any type preceeding the found modified amino acid. | |
1367 # | |
1368 my $modified_n_term = substr($modified_peptide, 0, $modified_aa_in_modified_peptide_index); | |
1369 my $n_term = $modified_n_term; | |
1370 $n_term =~ s/[a-z]+//g; | |
1371 my $mod_count = length($modified_n_term) - length($n_term); | |
1372 | |
1373 if ($mod_count >= 0) { | |
1374 $logger->debug(' counted modifications preceeding amino acid = ' . $mod_count); | |
1375 } else { | |
1376 $logger->fatal(' counted modifications preceeding amino acid is negative: ' . $mod_count); | |
1377 exit; | |
1378 } | |
1379 | |
1380 # | |
1381 # We need to substract the amount of mods preceeding the identified amino acid | |
1382 # from the index to get the position of the amino acid in the unmodified peptide. | |
1383 # | |
1384 my $modified_aa_in_peptide_index = $modified_aa_in_modified_peptide_index - $mod_count; | |
1385 $logger->debug(' and at position ' . ($modified_aa_in_peptide_index + 1) . ' in peptide ' . $peptide . ' of protein ' . $key . '.'); | |
1386 | |
1387 push(@modified_aa_in_peptide_indices, $modified_aa_in_peptide_index); | |
1388 | |
1389 # | |
1390 # Search for additional mods in the modified peptide. | |
1391 # | |
1392 #$offset = $modified_aa_in_modified_peptide_index + 1; | |
1393 #$modified_aa_in_modified_peptide_index = index($modified_peptide, $modified_aa, $offset); | |
1394 if ($modified_peptide_remainder =~ m/$modified_aa_for_regex/) { | |
1395 $logger->debug("\t" . 'Peptide contains more mods.'); | |
1396 $modified_peptide_remainder = $'; | |
1397 my $modified_peptide_prefix = $`; | |
1398 $logger->trace("\t\t" . 'Processed pep length = ' . $processed_peptide_length); | |
1399 $modified_aa_in_modified_peptide_index = $-[0] + $processed_peptide_length; | |
1400 $logger->trace("\t\t" . 'Index for mod :' . $modified_aa_in_modified_peptide_index); | |
1401 $processed_peptide_length += length($modified_peptide_prefix); | |
1402 $logger->trace("\t\t" . 'Processed pep length = ' . $processed_peptide_length); | |
1403 $processed_peptide_length += length($modified_aa); | |
1404 $logger->trace("\t\t" . 'Processed pep length = ' . $processed_peptide_length); | |
1405 } else { | |
1406 $modified_aa_in_modified_peptide_index = undef(); | |
1407 $logger->debug('No more modified amino acids found in this peptide.'); | |
1408 } | |
1409 } | |
1410 | |
1411 # | |
1412 # Retrieve sequence contexts for modified stites from protein sequence | |
1413 # | |
1414 foreach my $modified_aa_in_peptide_index (@modified_aa_in_peptide_indices) { | |
1415 | |
1416 # | |
1417 # Get the offset for the sequence context. | |
1418 # | |
1419 my $offset = $petide_index + $modified_aa_in_peptide_index - $n_term_context_length; | |
1420 $logger->debug("\t" . 'Offset = ' . $offset . '.'); | |
1421 my $n_term_padding = ''; | |
1422 my $offset_adjustment = 0; | |
1423 | |
1424 # | |
1425 # Check if DB protein sequence is long enough on n-terminal side | |
1426 # to retrieve full length peptide contexts. | |
1427 # (c-term length is checked elsewhere.) | |
1428 # | |
1429 if ($offset < 0) { | |
1430 | |
1431 # | |
1432 # Protein too short for full length context: Calculate offset adjustment. | |
1433 # | |
1434 $offset_adjustment = -($offset); | |
1435 $logger->debug('Offset ' . $offset . ' for modification context in peptide ' . $peptide . ' in protein ' . $key . ' less than zero'); | |
1436 | |
1437 if (length($padding_character) == 1) { | |
1438 | |
1439 # | |
1440 # Add N-terminal padding characters for contexts that are too short on the N-terminal side. | |
1441 # | |
1442 $n_term_padding = "$padding_character" x $offset_adjustment; | |
1443 $logger->debug("\t" . 'Will add N-terminal padding: ' . $n_term_padding); | |
1444 | |
1445 } else { | |
1446 $logger->debug('Padding was disabled. Will not add n-term padding.'); | |
1447 } | |
1448 | |
1449 # | |
1450 # Reset offset for this context to protein start. | |
1451 # | |
1452 $offset = 0; | |
1453 | |
1454 } | |
1455 | |
1456 # | |
1457 # Extract sequence context from DB protein sequence. | |
1458 # | |
1459 my $peptide_context = substr($protein, $offset, $max_context_length - $offset_adjustment); | |
1460 $peptide_context = $n_term_padding . $peptide_context; | |
1461 $logger->debug('Found modification context: ' . $peptide_context . '.'); | |
1462 | |
1463 push(@peptide_contexts, $peptide_context); | |
1464 | |
1465 } | |
1466 | |
1467 if (length($padding_character) == 1) { | |
1468 _CheckSequenceContextLength(\@peptide_contexts, $peptide, $key, $max_context_length); | |
1469 } else { | |
1470 $logger->debug('Padding was disabled. Will not add c-term padding nor check sequence context length.'); | |
1471 } | |
1472 | |
1473 return(\@peptide_contexts); | |
1474 | |
1475 } | |
1476 | |
1477 # | |
1478 # TODO: pep contexts. | |
1479 # | |
1480 sub _GetPeptideSequenceContexts { | |
1481 | |
1482 my ($peptide, $protein, $key, $peptide_index) = @_; | |
1483 | |
1484 my @peptide_contexts; | |
1485 my $max_context_length = $n_term_context_length + length($peptide) + $c_term_context_length; | |
1486 | |
1487 # | |
1488 # Get the offset for the peptide sequence context. | |
1489 # | |
1490 my $offset = $peptide_index - $n_term_context_length; | |
1491 $logger->debug("\t" . 'Offset = ' . $offset . '.'); | |
1492 my $n_term_padding = ''; | |
1493 my $offset_adjustment = 0; | |
1494 | |
1495 # | |
1496 # Check if DB protein sequence is long enough on n-terminal side | |
1497 # to retrieve full length peptide contexts. | |
1498 # (c-term length is checked elsewhere.) | |
1499 # | |
1500 if ($offset < 0) { | |
1501 | |
1502 # | |
1503 # Protein too short for full length context: Calculate offset adjustment. | |
1504 # | |
1505 $offset_adjustment = -($offset); | |
1506 $logger->debug('Offset ' . $offset . ' for peptide context of peptide ' . $peptide . ' in protein ' . $key . ' less than zero'); | |
1507 | |
1508 if (length($padding_character) == 1) { | |
1509 | |
1510 # | |
1511 # Add N-terminal padding characters for contexts that are too short on the N-terminal side. | |
1512 # | |
1513 $n_term_padding = "$padding_character" x $offset_adjustment; | |
1514 $logger->debug("\t" . 'Will add N-terminal padding: ' . $n_term_padding); | |
1515 | |
1516 } else { | |
1517 $logger->debug('Padding was disabled. Will not add n-term padding.'); | |
1518 } | |
1519 | |
1520 # | |
1521 # Reset offset for this context to protein start. | |
1522 # | |
1523 $offset = 0; | |
1524 | |
1525 } | |
1526 | |
1527 # | |
1528 # Extract sequence context from DB protein sequence. | |
1529 # | |
1530 my $peptide_context = substr($protein, $offset, $max_context_length - $offset_adjustment); | |
1531 $peptide_context = $n_term_padding . $peptide_context; | |
1532 $logger->debug('Found modification context: ' . $peptide_context . '.'); | |
1533 | |
1534 push(@peptide_contexts, $peptide_context); | |
1535 | |
1536 if (length($padding_character) == 1) { | |
1537 _CheckSequenceContextLength(\@peptide_contexts, $peptide, $key, $max_context_length); | |
1538 } else { | |
1539 $logger->debug('Padding was disabled. Will not add c-term padding nor check sequence context length.'); | |
1540 } | |
1541 | |
1542 return(\@peptide_contexts); | |
1543 | |
1544 } | |
1545 | |
1546 sub _CheckSequenceContextLength { | |
1547 | |
1548 my ($contexts, $peptide, $key, $max_context_length) = @_; | |
1549 | |
1550 # | |
1551 # If padding was enabled for sequence contexts < max sequence context length. | |
1552 # | |
1553 foreach my $peptide_context (@{$contexts}) { | |
1554 | |
1555 my $peptide_context_length = length($peptide_context); | |
1556 | |
1557 if ($peptide_context_length < $max_context_length) { | |
1558 | |
1559 # | |
1560 # sequence context < max sequence context length -> append c-term padding. | |
1561 # (n-term padding was already appended if necessary during sequence context lookup with _GetSequenceContext().) | |
1562 # | |
1563 my $c_term_padding = "$padding_character" x ($max_context_length - $peptide_context_length); | |
1564 $logger->debug('Length of peptide context (' . $peptide_context_length . ') < max context length.'); | |
1565 $logger->debug("\t" . 'Will add C-terminal padding: ' . $c_term_padding); | |
1566 $peptide_context = $peptide_context . $c_term_padding; | |
1567 | |
1568 } | |
1569 | |
1570 # | |
1571 # Sanity check for padded sequence contexts. | |
1572 # | |
1573 # (Checks if the combined effect of n-term and c-term padding worked well.) | |
1574 # | |
1575 $peptide_context_length = length($peptide_context); | |
1576 unless ($peptide_context_length == $max_context_length) { | |
1577 $logger->fatal('Length of peptide context too short or too long for peptide ' . $peptide . ' in protein ' . $key); | |
1578 $logger->fatal("\t" . 'Context length is ' . $peptide_context_length . ', but should be ' . $max_context_length . '.'); | |
1579 exit; | |
1580 } | |
1581 } | |
1582 | |
1583 # | |
1584 # No need to return the modified (c-terminal-padded) contexts as we worked with a arrayref. | |
1585 # | |
1586 | |
1587 } | |
1588 | |
1589 sub _SaveSequenceContexts { | |
1590 | |
1591 my ($line, $acc, $contexts, $path_to_fh, $extracted_seq_contexts) = @_; | |
1592 | |
1593 # | |
1594 # Append the peptide contexts to the existing line of the fragment file and | |
1595 # save that new line to the output file. | |
1596 # By appending we keep all info from the original fragment file intact. | |
1597 # | |
1598 foreach my $context (@{$contexts}) { | |
1599 | |
1600 # | |
1601 # Skip "empty"/"missing" contexts. | |
1602 # May be the result of: | |
1603 # 1. The peptide terminus being also the protein terminus. | |
1604 # 2. Proteins missing from the database. | |
1605 # 3. ... | |
1606 # | |
1607 if ($padding_character ne '') { | |
1608 if ($context =~ m/^($padding_character)+$/) { | |
1609 next; | |
1610 } | |
1611 } else { | |
1612 if ($context eq '') { | |
1613 next; | |
1614 } | |
1615 } | |
1616 | |
1617 my $new_line = $line; | |
1618 | |
1619 # 1. Append accession numbers if the fragment file used IDs. | |
1620 if ($index_id eq 'ID') { | |
1621 $new_line .= "\t" . $acc; | |
1622 } | |
1623 | |
1624 # 2. Append sequence context. | |
1625 $new_line .= "\t" . $context; | |
1626 # 3. Append new line character. | |
1627 $new_line .= "\n"; | |
1628 # Save result to disk. | |
1629 print($path_to_fh $new_line); | |
1630 $extracted_seq_contexts++; | |
1631 | |
1632 } | |
1633 | |
1634 return($extracted_seq_contexts); | |
1635 | |
1636 } | |
1637 | |
1638 sub _Usage { | |
1639 | |
1640 | |
1641 print STDOUT "\n" . | |
1642 'ExtractPeptideSequenceContext.pl: Extract sequence contexts of cleavage, miscleavage or modification sites ' . "\n" . | |
1643 ' based on sequence fragments (like peptides experimentally identified with MSMS) ' . "\n" . | |
1644 ' and protein sequences from a database in FASTA or UniProtKB file format.' . "\n" . | |
1645 "\n" . | |
1646 'Usage:' . "\n" . | |
1647 "\n" . | |
1648 ' ExtractPeptideSequenceContext.pl options' . "\n" . | |
1649 "\n" . | |
1650 'Available options are:' . "\n" . | |
1651 "\n" . | |
1652 ' --db [file] Input file containing all database sequences in either UniProtKB *.dat or *.fasta format.' . "\n" . | |
1653 ' --dba [file] Optional alternative / backup input file containing database sequences in either UniProtKB *.dat or *.fasta format.' . "\n" . | |
1654 ' This may be a slightly older or newer version of the database, which will be searched in case a ' . "\n" . | |
1655 ' sequence ID was not found in the primary database file that was specified with --db [file].' . "\n" . | |
1656 ' --dbf [format] Optional argument to specify the database sequence file format. Valid format specifications are:' . "\n" . | |
1657 ' * \'UniProtKB DAT\' for UniProtKB DAT files.' . "\n" . | |
1658 ' * \'FASTA\' for FASTA files.' . "\n" . | |
1659 ' Only required if the database files do *not* have the standard file extensions: ' . "\n" . | |
1660 ' *.dat for UniProtKB DAT files or ' . "\n" . | |
1661 ' *.fa or *.fasta for FASTA files.' . "\n" . | |
1662 ' --k [ID|AC] Unique identifier type used as key to index an UniProtKB *.dat database file.' . "\n" . | |
1663 ' Must be either AC for a UniProt accession number (default) or ID for a UniProt identifier.' . "\n" . | |
1664 ' Not required for *.fasta databases: This tool will look for any type of ID in the first part of FASTA ' . "\n" . | |
1665 ' sequence headers up until the first white space.' . "\n" . | |
1666 ' Optionally multiple IDs may be present separated with pipe symbols (|) or semicolons (;).' . "\n" . | |
1667 ' Optionally IDs may be prefixed with a database namespace and a colon (:).' . "\n" . | |
1668 ' For example the accession number P32234 as well as the ID 128UP_DROME would be recognised in both ' . "\n" . | |
1669 ' this sequence header:' . "\n" . | |
1670 ' >UniProtAcc:P32234|UniProtID:128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)' . "\n" . | |
1671 ' and in this one:' . "\n" . | |
1672 ' >P32234|128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)' . "\n" . | |
1673 ' --f [file] Fragment file containing accession numbers or IDs and sequence fragments (peptides), ' . "\n" . | |
1674 ' whose sequence context should be extracted from the database. This file' . "\n" . | |
1675 ' * Must be tab delimited with one accession/ID plus one sequence fragment per line' . "\n" . | |
1676 ' * Must contain accession numbers / IDs in the same format as the database file used.' . "\n" . | |
1677 ' Exception: Optionally IDs may be suffixed with the peptide\'s position in its protein between brackets.' . "\n" . | |
1678 ' For example: CLH1_HUMAN[1612-1620].' . "\n" . | |
1679 ' * May contain other columns, which will be preserved in the output.' . "\n" . | |
1680 ' * Is the basis for the output: if a sequence context was found, ' . "\n" . | |
1681 ' it will be appended in another column to the right of the existing columns.' . "\n" . | |
1682 ' --icol [int] Column in the tab delimited fragment file containing the protein identifiers / accession numbers.' . "\n" . | |
1683 ' Default = 1.' . "\n" . | |
1684 ' --pcol [int] Column in the tab delimited fragment file containing the peptide sequences.' . "\n" . | |
1685 ' Default = 2.' . "\n" . | |
1686 ' --s Strip lowercase characters from the sequence fragments in the fragment file before doing a database lookup.' . "\n" . | |
1687 ' Amino acids are expected to be in uppercase. If not, the sequences are converted to uppercase ' . "\n" . | |
1688 ' before searching for the fragment in the database. When lowercase characters represent ' . "\n" . | |
1689 ' modifications instead of amino acids these need to be removed before searching the database.' . "\n" . | |
1690 ' Default = 0 (disabled) except when a modified amino acid is specified with --ma, which will automatically enable --s.' . "\n" . | |
1691 ' --cleo [file] Optional output file where the retrieved cleavage site sequence contexts will be saved.' . "\n" . | |
1692 ' --miso [file] Optional output file where the retrieved miscleavage site sequence contexts will be saved.' . "\n" . | |
1693 ' --modo [file] Optional output file where the retrieved modification site sequence contexts will be saved.' . "\n" . | |
1694 ' --pepo [file] Optional output file where the retrieved peptide sequence contexts will be saved.' . "\n" . | |
1695 ' (At least one output file must be specified with --cleo, --miso, --modo or --pepo).' . "\n" . | |
1696 ' --ca [A-Z] Cleavage amino acid. Assume the sequence fragments were derived from cutting with a proteolytic enzyme, ' . "\n" . | |
1697 ' that recognizes this amino acid as the position to cut.' . "\n" . | |
1698 ' When the specificity of the protease used to generate the peptides in the fragments file is unknown,' . "\n" . | |
1699 ' you may provide an asterisk (*) to retrieve sequence context for any cleavage,' . "\n" . | |
1700 ' but in that case this tool will not filter non-specifically cleaved fragments,' . "\n" . | |
1701 ' that may be the result of other processes than protease activity.' . "\n" . | |
1702 ' --ct [C|N] Cleavage terminus. Assume the sequence fragments were derived from cutting with a proteolytic enzyme, ' . "\n" . | |
1703 ' that cuts at the C or N terminus of the amino acid specified with the --ca [A-Z] parameter.' . "\n" . | |
1704 ' When the specificity of the protease used to generate the peptides in the fragments file is unknown,' . "\n" . | |
1705 ' you may provide an asterisk (*) to retrieve sequence context for any cleavage,' . "\n" . | |
1706 ' but in that case this tool will not filter non-specifically cleaved fragments,' . "\n" . | |
1707 ' that may be the result of other processes than protease activity.' . "\n" . | |
1708 ' --ma [A-Za-z] Modified amino acid.' . "\n" . | |
1709 ' The amino acid must be specified in uppercase and the modification in lower case.' . "\n" . | |
1710 ' The order is not important.' . "\n" . | |
1711 ' Hence a phophorylated serine in the fragments file can be indicated with either pS or Sp, ' . "\n" . | |
1712 ' but you cannot mix both pS and Sp in a single fragments file.' . "\n" . | |
1713 ' You may provide an asterisk (*) instead of an upper case amino acid to retrieve sequence contexts ' . "\n" . | |
1714 ' for the specified modification no matter what amino acid it was located on.' . "\n" . | |
1715 ' A modification may be specified with more than one lower case character, ' . "\n" . | |
1716 ' so for example phosphoS or Sphospho can also be used for a phosphorylated serine.' . "\n" . | |
1717 ' --n [int] Integer specifying the length of the N-terminal sequence context to retrieve starting from the ' . "\n" . | |
1718 ' cleavage, miscleavage or modification site. Default = 5.' . "\n" . | |
1719 ' --c [int] Integer specifying the length of the C-terminal sequence context to retrieve starting from the ' . "\n" . | |
1720 ' cleavage, miscleavage or modification site. Default = 5.' . "\n" . | |
1721 ' Please note that cleavage and miscleavage sites have zero width, ' . "\n" . | |
1722 ' while modified amino acids will increment the sequence context lenght with 1.' . "\n" . | |
1723 ' When defaults are used for both the N-terminal and C-terminal sequence context lengths,' . "\n" . | |
1724 ' the total sequence context length for a' . "\n" . | |
1725 ' * cleavage or miscleavage site will be:' . "\n" . | |
1726 ' (N-terminal sequence context) + (C-terminal sequence context) = 5 + 5 = 10.' . "\n" . | |
1727 ' * modification site will be:' . "\n" . | |
1728 ' (N-terminal sequence context) + modified AA + (C-terminal sequence context) = 5 + 1 + 5 = 11.' . "\n" . | |
1729 # ' --mcd [int] Minimal distance between a putative miscleavage site and the peptide termini.' . "\n" . | |
1730 # ' Uncleaved amino acids closer to the peptide termini can be ignored with this parameter:' . "\n" . | |
1731 # ' A. To prevent overlap between cleavage and miscleavage peptide sequence contexts.' . "\n" . | |
1732 # ' B. To prevent false positive miscleavage sites due to sites that cannot be cleaved, ' . "\n" . | |
1733 # ' because one of the resulting fragments is too short. (Sometimes proteases may need ' . "\n" . | |
1734 # ' a minimal length surrounding the cleavage site to bind and cleave a peptide.)' . "\n" . | |
1735 # ' Default = same as the longest *-terminal context length. Hence [--n [int]|--c [int]] (see above).' . "\n" . | |
1736 ' --pc [char] Optional padding character to fill N-terminal or C-terminal positions in the sequence context, ' . "\n" . | |
1737 ' when the protein was too short to get a complete sequence context.' . "\n" . | |
1738 ' Default = - (a.k.a. dash or the alignment gap character.)' . "\n" . | |
1739 ' To disable padding specify an empty string like this --pc \'\'.' . "\n" . | |
1740 ' --ll [LEVEL] Log4perl log level. One of: ALL, TRACE, DEBUG, INFO (default), WARN, ERROR, FATAL or OFF.' . "\n" . | |
1741 "\n"; | |
1742 exit; | |
1743 | |
1744 } |