0
|
1 #!/usr/bin/perl -w
|
|
2
|
|
3 ###############################################################
|
|
4 # ProteinDigestor.pl
|
|
5 #
|
|
6 # =====================================================
|
|
7 # $Id: ProteinDigestor.pl 76 2011-01-04 13:16:56Z pieter.neerincx@gmail.com $
|
|
8 # $URL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/ProteinDigestor.pl $
|
|
9 # $LastChangedDate: 2011-01-04 07:16:56 -0600 (Tue, 04 Jan 2011) $
|
|
10 # $LastChangedRevision: 76 $
|
|
11 # $LastChangedBy: pieter.neerincx@gmail.com $
|
|
12 # =====================================================
|
|
13 #
|
|
14 # (c) Dr. Ir. B. van Breukelen et al.
|
|
15 # https://bioinformatics.chem.uu.nl
|
|
16 # http://www.netherlandsproteomicscentre.eu/
|
|
17 # b.vanbreukelen@pharm.uu.nl
|
|
18 #
|
|
19 # Software to create peptide databases from a fasta
|
|
20 # file of proteins. You can cut with several enzymes
|
|
21 # select on size, PI, etc etc etc.
|
|
22 #
|
|
23 # TODO: put filters, enzymes, in seperate config file
|
|
24 ###############################################################
|
|
25
|
|
26 #############################
|
|
27 # Initialise environment
|
|
28 #############################
|
|
29 use strict;
|
|
30 use Getopt::Std;
|
|
31 use Log::Log4perl qw(:easy);
|
|
32
|
|
33 #
|
|
34 # Log levels for Log4perl.
|
|
35 #
|
|
36 my %log_levels = (
|
|
37 'ALL' => $ALL,
|
|
38 'TRACE' => $TRACE,
|
|
39 'DEBUG' => $DEBUG,
|
|
40 'INFO' => $INFO,
|
|
41 'WARN' => $WARN,
|
|
42 'ERROR' => $ERROR,
|
|
43 'FATAL' => $FATAL,
|
|
44 'OFF' => $OFF,
|
|
45 );
|
|
46
|
|
47 #
|
|
48 # pK values array
|
|
49 #
|
|
50 my %cPk = (
|
|
51 'A'=>[3.55, 7.59, 0 , 0 , 0],
|
|
52 'B'=>[3.55, 7.50, 0 , 0 , 0],
|
|
53 'C'=>[3.55, 7.50, 9.00 , 9.00 , 9.00],
|
|
54 'D'=>[4.55, 7.50, 4.05 , 4.05 , 4.05],
|
|
55 'E'=>[4.75, 7.70, 4.45 , 4.45 , 4.45],
|
|
56 'F'=>[3.55, 7.50, 0 , 0. , 0],
|
|
57 'G'=>[3.55, 7.50, 0 , 0. , 0],
|
|
58 'H'=>[3.55, 7.50, 5.98 , 5.98 , 5.98],
|
|
59 'I'=>[3.55, 7.50, 0 , 0. , 0.],
|
|
60 'J'=>[0.00, 0.00, 0 , 0. , 0.],
|
|
61 'K'=>[3.55, 7.50, 10.00, 10.00, 10.00],
|
|
62 'L'=>[3.55, 7.50, 0 , 0. , 0.],
|
|
63 'M'=>[3.55, 7.00, 0 , 0. , 0.],
|
|
64 'N'=>[3.55, 7.50, 0 , 0. , 0.],
|
|
65 'O'=>[0.00, 0.00, 0 , 0. , 0.],
|
|
66 'P'=>[3.55, 8.36, 0 , 0. , 0.],
|
|
67 'Q'=>[3.55, 7.50, 0 , 0. , 0.],
|
|
68 'R'=>[3.55, 7.50, 12.0 , 12.0 , 12.0],
|
|
69 'S'=>[3.55, 6.93, 0 , 0. , 0.],
|
|
70 'T'=>[3.55, 6.82, 0 , 0. , 0.],
|
|
71 'U'=>[0.00, 0.00, 0 , 0. , 0.],
|
|
72 'V'=>[3.55, 7.44, 0 , 0. , 0.],
|
|
73 'W'=>[3.55, 7.50, 0 , 0. , 0.],
|
|
74 'X'=>[3.55, 7.50, 0 , 0. , 0.],
|
|
75 'Y'=>[3.55, 7.50, 10.00, 10.00, 10.00],
|
|
76 'Z'=>[3.55, 7.50, 0 , 0. , 0.]
|
|
77 );
|
|
78
|
|
79 #
|
|
80 # MW values array
|
|
81 #
|
|
82 my %cMWs = (
|
|
83 'A'=>89.09,
|
|
84 'B'=>132.65,
|
|
85 'C'=>121.15,
|
|
86 'D'=>133.1,
|
|
87 'E'=>147.13,
|
|
88 'F'=>165.19,
|
|
89 'G'=>75.07,
|
|
90 'H'=>155.16,
|
|
91 'I'=>131.18,
|
|
92 'J'=>0.00,
|
|
93 'K'=>146.19,
|
|
94 'L'=>131.18,
|
|
95 'M'=>149.22,
|
|
96 'N'=>132.12,
|
|
97 'O'=>0.00,
|
|
98 'P'=>115.13,
|
|
99 'Q'=>146.15,
|
|
100 'R'=>174.21,
|
|
101 'S'=>105.09,
|
|
102 'T'=>119.12,
|
|
103 'U'=>168.05,
|
|
104 'V'=>117.15,
|
|
105 'W'=>204.22,
|
|
106 'X'=>129.26,
|
|
107 'Y'=>181.19,
|
|
108 'Z'=>146.73
|
|
109 );
|
|
110
|
|
111 #
|
|
112 # Digestion enzymes.
|
|
113 #
|
|
114 # For example Trypsin cuts c-terminal of K and R not followed by a P at position p1 = c:K,R:P=1
|
|
115 #
|
|
116 my %enzymes = (
|
|
117 'Trypsin' => 'c:K,R:P=1',
|
|
118 'Trypsin/P' => 'c:K,R',
|
|
119 'Chymotrypsin' => 'c:F,L,W,Y:P=1',
|
|
120 'Chymotrypsin/P' => 'c:F,L,W,Y',
|
|
121 'V8_E' => 'c:E,Z:P=1',
|
|
122 'V8_DE' => 'c:E,D,B,Z:P=1',
|
|
123 'LysC' => 'c:K:P=1',
|
|
124 'LysC/P' => 'c:K',
|
|
125 'LysN/P' => 'n:K'
|
|
126 );
|
|
127
|
|
128 my @filters = (
|
|
129 # "[R|K][R|K][R|K]..[S|T]."
|
|
130 # "[R|K][R|K][R|K].[S|T]." ,
|
|
131 # "[R|K][R|K]..[S|T].",
|
|
132 # "[R|K]..[S|T].",
|
|
133 # "[R|K][R|K].[S|T].",
|
|
134 # "[R|K].[S|T]."
|
|
135 # "[S|T].[R|K]"
|
|
136 );
|
|
137
|
|
138 my %charges = ('A'=>0, 'B'=>0, 'C'=>0, 'D'=>-1, 'E'=>-1, 'F'=>0,'G'=>0, 'H'=>1, 'I'=>0,
|
|
139 'J'=>0, 'K'=>1, 'L'=>0, 'M'=>0, 'N'=>0, 'O'=>0,'P'=>0, 'Q'=>0, 'R'=>1,
|
|
140 'S'=>0, 'T'=>0, 'U'=>0, 'V'=>0, 'W'=>0, 'X'=>0,'Y'=>0, 'Z'=>0);
|
|
141
|
|
142 my %pept = (); #hashed array of peptides (HASH) and fasta headers (VALUE)
|
|
143
|
|
144 my $H20 = 18.015; #Mol. weight water
|
|
145
|
|
146 my $PH_MIN = 0.0; #min pH value
|
|
147 my $PH_MAX = 14.0; #max pH value
|
|
148 my $MAXLOOP = 2000; # max number iterations
|
|
149 my $EPSI = 0.0001; # desired presision
|
|
150
|
|
151 #
|
|
152 # Get options.
|
|
153 #
|
|
154 my %opts;
|
|
155 Getopt::Std::getopts('i:o:e:r:p:m:n:c:l:sa', \%opts);
|
|
156
|
|
157 my $input = $opts{'i'};
|
|
158 my $output = $opts{'o'};
|
|
159 my $enzyme = $opts{'e'};
|
|
160 my $enzyme_cleavage_site_rules = $opts{'r'};
|
|
161 my $partial_cleavage = $opts{'s'};
|
|
162 my $partial_cleavage_chance = $opts{'p'};
|
|
163 my $min_peptide_weight = $opts{'m'};
|
|
164 my $max_peptide_weight = $opts{'n'};
|
|
165 my $max_charge = $opts{'c'};
|
|
166 my $add_sequence_context = $opts{'a'};
|
|
167 my $log_level = $opts{'l'};
|
|
168
|
|
169 #
|
|
170 # Configure logging.
|
|
171 #
|
|
172 # Provides default if user did not specify log level:
|
|
173 $log_level = (defined($log_level) ? $log_level : 'INFO');
|
|
174
|
|
175 # Reset log level to default if user specified illegal log level.
|
|
176 $log_level = (
|
|
177 defined($log_levels{$log_level})
|
|
178 ? $log_levels{$log_level}
|
|
179 : $log_levels{'INFO'});
|
|
180
|
|
181 #Log::Log4perl->init('log4perl.properties');
|
|
182 Log::Log4perl->easy_init(
|
|
183
|
|
184 #{ level => $log_level,
|
|
185 # file => ">>ProteinDigestor.log",
|
|
186 # layout => '%F{1}-%L-%M: %m%n' },
|
|
187 {
|
|
188 level => $log_level,
|
|
189 file => "STDOUT",
|
|
190 layout => '%d L:%L %p> %m%n'
|
|
191 },
|
|
192 );
|
|
193 my $logger = Log::Log4perl::get_logger();
|
|
194
|
|
195 #
|
|
196 # Check user input.
|
|
197 #
|
|
198 unless (defined($input) && defined($output)) {
|
|
199 _Usage();
|
|
200 exit;
|
|
201 }
|
|
202 if ($input =~ /^$/ || $output =~ /^$/) {
|
|
203 _Usage();
|
|
204 exit;
|
|
205 }
|
|
206 if ($input eq $output) {
|
|
207 $logger->fatal('Output file is the same as the input file. Select a different file for the output.');
|
|
208 exit;
|
|
209 }
|
|
210
|
|
211 #
|
|
212 # Check input file.
|
|
213 #
|
|
214 unless (-e $input && -f $input && -r $input) {
|
|
215 $logger->fatal('Cannot read from input file ' . $input . ': ' . $!);
|
|
216 exit;
|
|
217 }
|
|
218
|
|
219 #
|
|
220 # Set additional defaults.
|
|
221 #
|
|
222 $partial_cleavage = (defined($partial_cleavage) ? $partial_cleavage : 0); # Disabled by default
|
|
223 $partial_cleavage_chance = (defined($partial_cleavage_chance) ? $partial_cleavage_chance : 0.1); # 10% change by default
|
|
224 $add_sequence_context = (defined($add_sequence_context) ? $add_sequence_context : 0); # Disabled by default
|
|
225
|
|
226 ########################################################
|
|
227 # MAIN PROGRAM
|
|
228 ########################################################
|
|
229
|
|
230 $logger->info('Starting...');
|
|
231
|
|
232 #
|
|
233 # Get protease cleavage site specs.
|
|
234 #
|
|
235 unless (defined($enzyme_cleavage_site_rules)) {
|
|
236
|
|
237 #
|
|
238 # Set default to Trypsin if the user did not specify an enzyme name nor a string with enzyme cleavage site rules.
|
|
239 #
|
|
240 $enzyme = (defined($enzyme) ? $enzyme : 'Trypsin');
|
|
241
|
|
242 #
|
|
243 # Check for illegal enzymes.
|
|
244 #
|
|
245 unless (defined($enzymes{$enzyme})) {
|
|
246 $logger->fatal('Unkown enzyme ' . $enzyme . '.');
|
|
247 exit;
|
|
248 }
|
|
249
|
|
250 #
|
|
251 # Fetch enzyme cleavage site rules.
|
|
252 #
|
|
253 $enzyme_cleavage_site_rules = $enzymes{$enzyme};
|
|
254
|
|
255 }
|
|
256
|
|
257 $logger->info('Using enzyme cleavage site rules: ' . $enzyme_cleavage_site_rules);
|
|
258 my @split_params = split(/:/, $enzyme_cleavage_site_rules);
|
|
259 my $cut_term = $split_params[0];
|
|
260 my @cut_aa = split(/,/, $split_params[1]);
|
|
261 my @inhibition_aa;
|
|
262 if (defined($split_params[2])) {
|
|
263 @inhibition_aa = split(/,/, $split_params[2]);
|
|
264 }
|
|
265 my $annotation = {};
|
|
266 my $sequence = '';
|
|
267
|
|
268 #
|
|
269 # Create filehandles.
|
|
270 #
|
|
271 open(my $input_fh, "$input");
|
|
272 open(my $output_fh, ">$output");
|
|
273
|
|
274 #
|
|
275 # Write header to result file.
|
|
276 #
|
|
277 print($output_fh "Protein ID\tPeptide\tMW\tpI\tCharge\tnumber S\tnumber T\tnumber Y\n");
|
|
278
|
|
279 #
|
|
280 # Digest fasta files and store peptides in an array (hashed)
|
|
281 #
|
|
282 foreach my $line (<$input_fh>){
|
|
283
|
|
284 if ($line =~ /^>/){
|
|
285
|
|
286 #
|
|
287 # We found a new FASTA header.
|
|
288 #
|
|
289
|
|
290 #
|
|
291 # Process the previously found protein if we already found one.
|
|
292 #
|
|
293 unless ($sequence eq '') {
|
|
294 _ProcessProtein();
|
|
295 }
|
|
296
|
|
297 #
|
|
298 # Parse new FASTA header.
|
|
299 #
|
|
300 $line =~ s/[\r\n\f]+//g;
|
|
301 $annotation = _ParseHeader($line);
|
|
302 $sequence = '';
|
|
303
|
|
304 } else {
|
|
305
|
|
306 #
|
|
307 # Found a sequence line.
|
|
308 #
|
|
309 $line =~ s/[\t\s\r\n\f0-9\*\-]+//g;
|
|
310 $sequence .= $line;
|
|
311 }
|
|
312 }
|
|
313
|
|
314 #
|
|
315 # Don't foget to process the last sequence!
|
|
316 #
|
|
317 _ProcessProtein();
|
|
318
|
|
319 close($input_fh);
|
|
320 close($output_fh);
|
|
321
|
|
322 $logger->info('Finished!');
|
|
323
|
|
324 #########################################################
|
|
325 # SUBS
|
|
326 #########################################################
|
|
327
|
|
328 sub _ProcessProtein {
|
|
329
|
|
330 my $id = ${$annotation}{'ID'};
|
|
331 $logger->debug('Accession/ID: ' . $id);
|
|
332 $logger->debug('Sequence: ' . $sequence . "\n");
|
|
333 my $total_length_sequence = length($sequence);
|
|
334 my $total_peptide_length = 0;
|
|
335
|
|
336 my ($peptides) = _Digest($sequence, $annotation, \@cut_aa, $cut_term, \@inhibition_aa);
|
|
337
|
|
338 #
|
|
339 # Process peptides.
|
|
340 #
|
|
341 foreach my $peptide_with_context (sort(keys(%{$peptides}))) {
|
|
342
|
|
343 #
|
|
344 # Get peptide properties.
|
|
345 #
|
|
346 $logger->debug('Peptide with context: ' . $peptide_with_context);
|
|
347 my ($bare_peptide) = _ParsePeptide($peptide_with_context);
|
|
348 my $peplength = length($bare_peptide);
|
|
349 my ($pepweight) = _MwCalc($bare_peptide);
|
|
350
|
|
351 #
|
|
352 # Apply filters
|
|
353 #
|
|
354 # Filter 1 is on weight.
|
|
355 #
|
|
356 if (defined($min_peptide_weight)) {
|
|
357
|
|
358 $logger->debug('Minimum weight filter:');
|
|
359 $logger->debug("\t" . 'Peptide weight: ' . $pepweight);
|
|
360
|
|
361 if ($pepweight >= $min_peptide_weight) {
|
|
362
|
|
363 $logger->debug("\t" . 'PASSED - Weight is >= ' . $min_peptide_weight);
|
|
364
|
|
365 } else {
|
|
366
|
|
367 # Skip this one.
|
|
368 $logger->debug("\t" . 'REJECTED - Weight is < ' . $min_peptide_weight);
|
|
369 next;
|
|
370
|
|
371 }
|
|
372 }
|
|
373
|
|
374 if (defined($max_peptide_weight)) {
|
|
375
|
|
376 $logger->debug('Maximum weight filter:');
|
|
377 $logger->debug("\t" . 'Peptide weight: ' . $pepweight);
|
|
378
|
|
379 if ($pepweight <= $max_peptide_weight) {
|
|
380
|
|
381 $logger->debug("\t" . 'PASSED - Weight is <= ' . $max_peptide_weight);
|
|
382
|
|
383 } else {
|
|
384
|
|
385 # Skip this one.
|
|
386 $logger->debug("\t" . 'REJECTED - Weight is > ' . $max_peptide_weight);
|
|
387 next;
|
|
388
|
|
389 }
|
|
390 }
|
|
391
|
|
392 #
|
|
393 # Filter 2 is on sequence motifs.
|
|
394 #
|
|
395 if (scalar(@filters) > 0) {
|
|
396
|
|
397 $logger->debug('Motif filter:');
|
|
398 my ($motif) = _ContainsMotif($bare_peptide, \@filters);
|
|
399
|
|
400 if ($motif > 0) {
|
|
401
|
|
402 $logger->debug("\t" . 'PASSED - Peptide contains motif ' . $motif);
|
|
403
|
|
404 } else {
|
|
405
|
|
406 # Skip this one.
|
|
407 $logger->debug("\t" . 'REJECTED - Peptide does not contain any motif.');
|
|
408 next;
|
|
409
|
|
410 }
|
|
411 }
|
|
412
|
|
413 my ($pepPI) = _PiCalc($bare_peptide);
|
|
414 my ($pepCharge) = _ChargeCalc($bare_peptide);
|
|
415
|
|
416 #
|
|
417 # Filter 3 is on the amount of charges.
|
|
418 #
|
|
419 if (defined($max_charge)) {
|
|
420
|
|
421 $logger->debug('Charge filter:');
|
|
422 $logger->debug("\t" . 'Peptide charge: ' . $pepCharge);
|
|
423
|
|
424 if (defined($max_charge) && $pepCharge <= $max_charge) {
|
|
425
|
|
426 $logger->debug("\t" . 'PASSED - Charge <= ' . $max_charge);
|
|
427
|
|
428 } else {
|
|
429
|
|
430 # Skip this one.
|
|
431 $logger->debug("\t" . 'REJECTED - Charge > ' . $max_charge);
|
|
432 next;
|
|
433
|
|
434 }
|
|
435 }
|
|
436
|
|
437 #
|
|
438 # If there is no partial digestion we can calculated the % coverage.
|
|
439 #
|
|
440 unless ($partial_cleavage) {
|
|
441 $total_peptide_length += length($bare_peptide);
|
|
442 }
|
|
443
|
|
444 my ($residue_count) = _CountResiduesSTY($bare_peptide);
|
|
445
|
|
446 #
|
|
447 # Store peptide.
|
|
448 #
|
|
449 my $pep_sequence;
|
|
450
|
|
451 if ($add_sequence_context) {
|
|
452 $pep_sequence = $peptide_with_context;
|
|
453 } else {
|
|
454 $pep_sequence = $bare_peptide;
|
|
455 }
|
|
456
|
|
457 print($output_fh $id . "\t" . $pep_sequence . "\t" . $pepweight . "\t" . $pepPI. "\t" . $pepCharge. "\t" . $residue_count . "\n");
|
|
458
|
|
459 }
|
|
460
|
|
461 #
|
|
462 # If partial digestion was not enabled, we can calculated the % coverage.
|
|
463 #
|
|
464 unless ($partial_cleavage) {
|
|
465 my $coverage = ($total_peptide_length / $total_length_sequence)*100;
|
|
466 $logger->debug("Sequence residues: $total_length_sequence | #peptide residues: $total_peptide_length | %Coverage: $coverage");
|
|
467 }
|
|
468 }
|
|
469
|
|
470 sub _ParseHeader {
|
|
471
|
|
472 my ($header) = @_;
|
|
473 my %annotation;
|
|
474 my $ids;
|
|
475 my $id;
|
|
476 my $description;
|
|
477
|
|
478 $header=~ s/^>//;
|
|
479
|
|
480 if ($header =~ m/([^\s]+)\s+([^\s]+.*)/) {
|
|
481 $ids = $1;
|
|
482 $description = $2;
|
|
483 } else {
|
|
484 $ids = $header;
|
|
485 }
|
|
486
|
|
487 #
|
|
488 # Redimentory support for detecting multiple IDs: check for pipe sperated IDs.
|
|
489 #
|
|
490 if ($ids =~ m/([^\|])\|/) {
|
|
491 $id = $1;
|
|
492 } else {
|
|
493 $id = $ids;
|
|
494 }
|
|
495
|
|
496 $annotation{'ID'} = $id;
|
|
497 $annotation{'DE'} = $description;
|
|
498
|
|
499 return (\%annotation);
|
|
500
|
|
501 }
|
|
502
|
|
503 sub _ContainsMotif{
|
|
504
|
|
505 my ($peptide, $filters) = @_;
|
|
506 my $cnt = 0;
|
|
507
|
|
508 foreach my $filter (@{$filters}){
|
|
509
|
|
510 $cnt++;
|
|
511 if ($peptide =~ /$filter/){
|
|
512 return($cnt);
|
|
513 }
|
|
514
|
|
515 }
|
|
516
|
|
517 return(0);
|
|
518
|
|
519 }
|
|
520
|
|
521 sub _ParsePeptide{
|
|
522
|
|
523 my ($peptide_with_flanking_context) = @_;
|
|
524
|
|
525 my @pep = split(/\./, $peptide_with_flanking_context);
|
|
526 my $bare_peptide = uc($pep[1]);
|
|
527
|
|
528 return($bare_peptide);
|
|
529
|
|
530 }
|
|
531
|
|
532 sub _Digest{
|
|
533
|
|
534 my ($sequence, $annotation, $enz_cutting, $cut_term, $enz_restrict) = @_;
|
|
535
|
|
536 $sequence = uc($sequence);
|
|
537 my %pept = ();
|
|
538
|
|
539 # we start by scanning the sequence.. to find one of the cutting rulez not followed by a restict rule
|
|
540 # what is the sequence length
|
|
541 my @array = split(//, $sequence);
|
|
542 my $length = scalar(@array);
|
|
543 my $start_offset = 0;
|
|
544 my $offset_adjust_for_cleavage_terminus = 0;
|
|
545 if ($cut_term eq 'n') {
|
|
546 $offset_adjust_for_cleavage_terminus = 1;
|
|
547 }
|
|
548
|
|
549 my $peptide = '';
|
|
550
|
|
551 my $counter = 1;
|
|
552 if (!($partial_cleavage)) {
|
|
553 $partial_cleavage_chance = 0;
|
|
554 } else {
|
|
555 $counter = 100;
|
|
556 }
|
|
557
|
|
558 #this is for incomplete cleavage... we repeat digest 100 times with a probability that the enzyme will not cut
|
|
559 for (my $cnt = 0; $cnt < $counter; $cnt++) {
|
|
560
|
|
561 AA:for (my $aa_offset = 0 + $offset_adjust_for_cleavage_terminus; $aa_offset < ($length - 1 + $offset_adjust_for_cleavage_terminus); $aa_offset++) {
|
|
562
|
|
563 my $this_aa = substr($sequence, $aa_offset, 1);
|
|
564
|
|
565 #
|
|
566 # Check if protease recognizes this amino acid as cleavage site.
|
|
567 #
|
|
568 foreach my $cut_aa (@{$enz_cutting}) {
|
|
569
|
|
570 if ($this_aa eq $cut_aa) {
|
|
571
|
|
572 #
|
|
573 # Check if AA is not preceeded or followed by an AA that inhibits the protease...
|
|
574 #
|
|
575 foreach my $inhibit (@{$enz_restrict}) {
|
|
576
|
|
577
|
|
578 my @inhibit_conditions = split(/=/, $inhibit);
|
|
579 my $inhibit_aa = $inhibit_conditions[0];
|
|
580 my $position_to_check = $inhibit_conditions[1];
|
|
581
|
|
582 my $neighborhood_aa = substr($sequence, $aa_offset + $position_to_check, 1);
|
|
583
|
|
584 if ($neighborhood_aa eq $inhibit_aa){
|
|
585 $logger->debug('No cleavage due to inhibition by ' . $inhibit_aa . ' at position ' . $position_to_check);
|
|
586 next AA;
|
|
587 }
|
|
588 }
|
|
589
|
|
590 #
|
|
591 # Consider the possibility of not cutting when doing incomplete digestions!
|
|
592 #
|
|
593 my $random = 1;
|
|
594 if ($partial_cleavage){
|
|
595 $random = rand(1);
|
|
596 }
|
|
597 if ($random <= $partial_cleavage_chance){
|
|
598 $logger->debug('No cleavage due to incomplete digestion.');
|
|
599 next AA;
|
|
600 }
|
|
601
|
|
602 #
|
|
603 # If no inhibition spoiled the cleavage, lets cut!
|
|
604 #
|
|
605 my $cut_offset;
|
|
606 if ($cut_term eq 'c') {
|
|
607 $cut_offset = $aa_offset + 1;
|
|
608 } elsif ($cut_term eq 'n') {
|
|
609 $cut_offset = $aa_offset;
|
|
610 }
|
|
611
|
|
612 # nice formatting of peptide
|
|
613 if ($start_offset == 0){
|
|
614 $peptide .="terminus.";
|
|
615 $peptide .= substr($sequence, $start_offset, $cut_offset - $start_offset).".";
|
|
616 }else{
|
|
617 $peptide .= substr($sequence, $start_offset - 3, 3).".";
|
|
618 $peptide .= substr($sequence, $start_offset, $cut_offset - $start_offset).".";
|
|
619 }
|
|
620 $peptide .= substr($sequence, $cut_offset, 3);
|
|
621 $start_offset = $cut_offset;
|
|
622
|
|
623 # check if peptide already exists
|
|
624 # if so, concatenate this fasta header
|
|
625 #if (exists($pept{$peptide})){
|
|
626 # $fasta_header .= $pept{$peptide};
|
|
627 #}
|
|
628 $pept{$peptide} = ${$annotation}{'ID'};
|
|
629 $peptide = '';
|
|
630 }
|
|
631 }
|
|
632 }
|
|
633
|
|
634 #
|
|
635 # The last peptide sequence !
|
|
636 #
|
|
637 $peptide = substr($sequence, $start_offset - 3, 3).".";
|
|
638 $peptide .= substr($sequence, $start_offset, $length);
|
|
639 $peptide .= ".terminus";
|
|
640 $pept{$peptide} = ${$annotation}{'ID'};
|
|
641
|
|
642 } #COUNT
|
|
643
|
|
644 return(\%pept);
|
|
645
|
|
646 }
|
|
647
|
|
648 sub _PiCalc{
|
|
649
|
|
650 my ($sequence) = @_;
|
|
651
|
|
652 my $nTermResidue;
|
|
653 my $cTermResidue;
|
|
654 my $phMin;my $phMid ;my $phMax;
|
|
655 my $charge;
|
|
656 my $cter; my $nter; my $carg; my $chis; my $clys;my $casp;my $cglu;my $ccys;my $ctyr;
|
|
657
|
|
658 my $composition = _GetComposition($sequence);
|
|
659 my $strlength = length($sequence);
|
|
660 $nTermResidue = substr($sequence, 0, 1);
|
|
661 for ($nTermResidue){s/[\s\n\r\t]//g;};
|
|
662 $cTermResidue = substr($sequence, $strlength - 1, 1);
|
|
663 for ($cTermResidue){s/[\s\n\r\t]//g;};
|
|
664 $phMin = $PH_MIN;
|
|
665 $phMax = $PH_MAX;
|
|
666 my $i;
|
|
667
|
|
668 for ($i=0, $charge = 1.0; $i<$MAXLOOP && ($phMax-$phMin)>$EPSI; $i++){
|
|
669 $phMid = $phMin + ($phMax - $phMin) / 2;
|
|
670 $cter = 10**(-$cPk{$cTermResidue}->[0]) / (10**(-$cPk{$cTermResidue}->[0]) + 10**(-$phMid));
|
|
671 $nter = 10**(-$phMid)/(10**(-$cPk{$nTermResidue}->[1])+10**(-$phMid));
|
|
672
|
|
673 $carg = ${$composition}{R} * 10**(-$phMid) / (10**(-$cPk{'R'}->[2])+10**(-$phMid));
|
|
674 $chis = ${$composition}{H} * 10**(-$phMid) / (10**(-$cPk{'H'}->[2])+10**(-$phMid));
|
|
675 $clys = ${$composition}{K} * 10**(-$phMid) / (10**(-$cPk{'K'}->[2])+10**(-$phMid));
|
|
676
|
|
677 $casp = ${$composition}{D} * 10**(-$cPk{'D'}->[2]) / (10**(-$cPk{'D'}->[2])+10**(-$phMid));
|
|
678 $cglu = ${$composition}{E} * 10**(-$cPk{'E'}->[2]) / (10**(-$cPk{'E'}->[2])+10**(-$phMid));
|
|
679
|
|
680 $ccys = ${$composition}{C} * 10**(-$cPk{'C'}->[2]) / (10**(-$cPk{'C'}->[2])+10**(-$phMid));
|
|
681 $ctyr = ${$composition}{Y} * 10**(-$cPk{'Y'}->[2]) / (10**(-$cPk{'Y'}->[2])+10**(-$phMid));
|
|
682
|
|
683 $charge = $carg + $clys + $chis + $nter - ($casp + $cglu + $ctyr + $ccys + $cter);
|
|
684
|
|
685 if ($charge > 0.0){
|
|
686 $phMin = $phMid;
|
|
687 }else{
|
|
688 $phMax = $phMid;
|
|
689 }
|
|
690 }
|
|
691
|
|
692 return($phMid);
|
|
693
|
|
694 }
|
|
695
|
|
696 sub _MwCalc{
|
|
697
|
|
698 my ($sequence) = @_;
|
|
699
|
|
700 my $mw;
|
|
701 my $strlength = length($sequence);
|
|
702 my $composition = _GetComposition($sequence);
|
|
703
|
|
704 # subtract N-1 water molecules
|
|
705 $mw = - ($strlength -1) * $H20;
|
|
706
|
|
707 foreach my $aa (sort keys %{$composition}){
|
|
708 $logger->trace('AA:' . $aa . ' | frequency: ' . ${$composition}{$aa} . ' | AA MW: ' . $cMWs{$aa});
|
|
709 $mw += ${$composition}{$aa} * $cMWs{$aa};
|
|
710 }
|
|
711
|
|
712 return($mw);
|
|
713
|
|
714 }
|
|
715
|
|
716 sub _ChargeCalc{
|
|
717
|
|
718 my ($sequence) = @_;
|
|
719
|
|
720 my $charge;
|
|
721 my $composition = _GetComposition($sequence);
|
|
722
|
|
723 foreach my $aa (sort(keys(%{$composition}))){
|
|
724 $charge += ${$composition}{$aa} * $charges{$aa};
|
|
725 }
|
|
726
|
|
727 return($charge);
|
|
728
|
|
729 }
|
|
730
|
|
731 sub _CountResiduesSTY{
|
|
732
|
|
733 my ($sequence) = @_;
|
|
734
|
|
735 my $composition = _GetComposition($sequence);
|
|
736 my $sty_residues = ${$composition}{'S'} . "\t" . ${$composition}{'T'} . "\t" . ${$composition}{'Y'};
|
|
737
|
|
738 return($sty_residues);
|
|
739
|
|
740 }
|
|
741
|
|
742 sub _GetComposition {
|
|
743
|
|
744 my ($sequence) = @_;
|
|
745
|
|
746 my $i;
|
|
747 my $c;
|
|
748 my %composition = ('A'=>0, 'B'=>0,'C'=>0, 'D'=>0, 'E'=>0, 'F'=>0,'G'=>0, 'H'=>0, 'I'=>0,
|
|
749 'J'=>0, 'K'=>0,'L'=>0, 'M'=>0, 'N'=>0, 'O'=>0,'P'=>0, 'Q'=>0, 'R'=>0,
|
|
750 'S'=>0, 'T'=>0,'U'=>0, 'V'=>0, 'W'=>0, 'X'=>0,'Y'=>0, 'Z'=>0);
|
|
751
|
|
752 #
|
|
753 # Compute aminoacid composition.
|
|
754 #
|
|
755 my $strlength = length($sequence);
|
|
756 for ($i=0; $i<$strlength; $i++){
|
|
757 $c = substr($sequence, $i, 1);
|
|
758 $composition{$c}++;
|
|
759 }
|
|
760
|
|
761 return(\%composition);
|
|
762
|
|
763 }
|
|
764
|
|
765 sub _Usage {
|
|
766
|
|
767 my $longest_enzyme_name_length = 0;
|
|
768 foreach my $protease (keys(%enzymes)) {
|
|
769 if (length($protease) > $longest_enzyme_name_length) {
|
|
770 $longest_enzyme_name_length = length($protease);
|
|
771 }
|
|
772 }
|
|
773 my $field_formatting = '%-' . ($longest_enzyme_name_length + 3) . 's';
|
|
774
|
|
775 print "\n";
|
|
776 print "ProteinDigestor.pl - Digests protein sequences from a FASTA file\n";
|
|
777 print "\n";
|
|
778 print "Usage:\n";
|
|
779 print "\n";
|
|
780 print " ProteinDigestor.pl options\n";
|
|
781 print "\n";
|
|
782 print "Available options are:\n";
|
|
783 print "\n";
|
|
784 print " -i [file] Input FASTA file.\n";
|
|
785 print " -o [file] Output stats file.\n";
|
|
786 print "\n";
|
|
787 print " You can either specify a protease this tool knows with -e\n";
|
|
788 print " or you can specify the cleavage rule for a protease this does not know (yet) with -r\n";
|
|
789 print "\n";
|
|
790 print " -e [enzyme_name] Digestion enzyme. Known enzymes and their cleavage rules:\n";
|
|
791
|
|
792 foreach my $protease (sort(keys(%enzymes))) {
|
|
793 print ' ' . sprintf("$field_formatting", $protease) . $enzymes{$protease} . "\n";
|
|
794 }
|
|
795
|
|
796 print " -r [enzyme_rule] Protease cleavage site rule. This rule contains sperated by colons:\n";
|
|
797 print " * The terminus that indicates on which side of a recognized amino acid the protease will cleave: c or n.\n";
|
|
798 print " * The amino acids recognized by the protease.\n";
|
|
799 print " Multiple amino acids separated by a comma.\n";
|
|
800 print " * The amino acids that inhibit cleavage and their position relative to amino acids recognized by the protease.\n";
|
|
801 print ' Amino acid and its relative position separated by an equals sign (=).' . "\n";
|
|
802 print " Multiple amino acids separated by a comma.\n";
|
|
803 print ' For example \'c:K,R:P=1\' for Trypsin with proline inhibition.' . "\n";
|
|
804 print " -s Semi for partial digestion.\n";
|
|
805 print " -p [chance] Number between 0 and 1 indicating the chance that a site will not be cleaved.\n";
|
|
806 print " -m [weight] Minimum weight for a peptide.\n";
|
|
807 print " -n [weight] Maximum weight for a peptide.\n";
|
|
808 print " -c [int] Maximum charge a peptide is allowed to have.\n";
|
|
809 print " -a Add sequence context.\n";
|
|
810 print " This will add max 3 amino acids or the word terminus on each side of the peptides.\n";
|
|
811 print " Contexts and peptides are separated with a dot.\n";
|
|
812 print " Examples of peptides with a context:\n";
|
|
813 print " terminus.MSVSLSAK.ASH\n";
|
|
814 print " SAK.ASHLLCSSTR.VSL\n";
|
|
815 print " STR.VSLSPAVTSSSSSPVVRPALSSSTS.terminus\n";
|
|
816 print " -l [LEVEL] Log4perl log level. One of: ALL, TRACE, DEBUG, INFO (default), WARN, ERROR, FATAL or OFF.\n";
|
|
817 print "\n";
|
|
818 exit;
|
|
819
|
|
820 }
|