Mercurial > repos > galaxyp > nbic_fasta
comparison ProteinDigestor.pl @ 0:163892325845 draft default tip
Initial commit.
author | galaxyp |
---|---|
date | Fri, 10 May 2013 17:15:08 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:163892325845 |
---|---|
1 #!/usr/bin/perl -w | |
2 | |
3 ############################################################### | |
4 # ProteinDigestor.pl | |
5 # | |
6 # ===================================================== | |
7 # $Id: ProteinDigestor.pl 76 2011-01-04 13:16:56Z pieter.neerincx@gmail.com $ | |
8 # $URL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/ProteinDigestor.pl $ | |
9 # $LastChangedDate: 2011-01-04 07:16:56 -0600 (Tue, 04 Jan 2011) $ | |
10 # $LastChangedRevision: 76 $ | |
11 # $LastChangedBy: pieter.neerincx@gmail.com $ | |
12 # ===================================================== | |
13 # | |
14 # (c) Dr. Ir. B. van Breukelen et al. | |
15 # https://bioinformatics.chem.uu.nl | |
16 # http://www.netherlandsproteomicscentre.eu/ | |
17 # b.vanbreukelen@pharm.uu.nl | |
18 # | |
19 # Software to create peptide databases from a fasta | |
20 # file of proteins. You can cut with several enzymes | |
21 # select on size, PI, etc etc etc. | |
22 # | |
23 # TODO: put filters, enzymes, in seperate config file | |
24 ############################################################### | |
25 | |
26 ############################# | |
27 # Initialise environment | |
28 ############################# | |
29 use strict; | |
30 use Getopt::Std; | |
31 use Log::Log4perl qw(:easy); | |
32 | |
33 # | |
34 # Log levels for Log4perl. | |
35 # | |
36 my %log_levels = ( | |
37 'ALL' => $ALL, | |
38 'TRACE' => $TRACE, | |
39 'DEBUG' => $DEBUG, | |
40 'INFO' => $INFO, | |
41 'WARN' => $WARN, | |
42 'ERROR' => $ERROR, | |
43 'FATAL' => $FATAL, | |
44 'OFF' => $OFF, | |
45 ); | |
46 | |
47 # | |
48 # pK values array | |
49 # | |
50 my %cPk = ( | |
51 'A'=>[3.55, 7.59, 0 , 0 , 0], | |
52 'B'=>[3.55, 7.50, 0 , 0 , 0], | |
53 'C'=>[3.55, 7.50, 9.00 , 9.00 , 9.00], | |
54 'D'=>[4.55, 7.50, 4.05 , 4.05 , 4.05], | |
55 'E'=>[4.75, 7.70, 4.45 , 4.45 , 4.45], | |
56 'F'=>[3.55, 7.50, 0 , 0. , 0], | |
57 'G'=>[3.55, 7.50, 0 , 0. , 0], | |
58 'H'=>[3.55, 7.50, 5.98 , 5.98 , 5.98], | |
59 'I'=>[3.55, 7.50, 0 , 0. , 0.], | |
60 'J'=>[0.00, 0.00, 0 , 0. , 0.], | |
61 'K'=>[3.55, 7.50, 10.00, 10.00, 10.00], | |
62 'L'=>[3.55, 7.50, 0 , 0. , 0.], | |
63 'M'=>[3.55, 7.00, 0 , 0. , 0.], | |
64 'N'=>[3.55, 7.50, 0 , 0. , 0.], | |
65 'O'=>[0.00, 0.00, 0 , 0. , 0.], | |
66 'P'=>[3.55, 8.36, 0 , 0. , 0.], | |
67 'Q'=>[3.55, 7.50, 0 , 0. , 0.], | |
68 'R'=>[3.55, 7.50, 12.0 , 12.0 , 12.0], | |
69 'S'=>[3.55, 6.93, 0 , 0. , 0.], | |
70 'T'=>[3.55, 6.82, 0 , 0. , 0.], | |
71 'U'=>[0.00, 0.00, 0 , 0. , 0.], | |
72 'V'=>[3.55, 7.44, 0 , 0. , 0.], | |
73 'W'=>[3.55, 7.50, 0 , 0. , 0.], | |
74 'X'=>[3.55, 7.50, 0 , 0. , 0.], | |
75 'Y'=>[3.55, 7.50, 10.00, 10.00, 10.00], | |
76 'Z'=>[3.55, 7.50, 0 , 0. , 0.] | |
77 ); | |
78 | |
79 # | |
80 # MW values array | |
81 # | |
82 my %cMWs = ( | |
83 'A'=>89.09, | |
84 'B'=>132.65, | |
85 'C'=>121.15, | |
86 'D'=>133.1, | |
87 'E'=>147.13, | |
88 'F'=>165.19, | |
89 'G'=>75.07, | |
90 'H'=>155.16, | |
91 'I'=>131.18, | |
92 'J'=>0.00, | |
93 'K'=>146.19, | |
94 'L'=>131.18, | |
95 'M'=>149.22, | |
96 'N'=>132.12, | |
97 'O'=>0.00, | |
98 'P'=>115.13, | |
99 'Q'=>146.15, | |
100 'R'=>174.21, | |
101 'S'=>105.09, | |
102 'T'=>119.12, | |
103 'U'=>168.05, | |
104 'V'=>117.15, | |
105 'W'=>204.22, | |
106 'X'=>129.26, | |
107 'Y'=>181.19, | |
108 'Z'=>146.73 | |
109 ); | |
110 | |
111 # | |
112 # Digestion enzymes. | |
113 # | |
114 # For example Trypsin cuts c-terminal of K and R not followed by a P at position p1 = c:K,R:P=1 | |
115 # | |
116 my %enzymes = ( | |
117 'Trypsin' => 'c:K,R:P=1', | |
118 'Trypsin/P' => 'c:K,R', | |
119 'Chymotrypsin' => 'c:F,L,W,Y:P=1', | |
120 'Chymotrypsin/P' => 'c:F,L,W,Y', | |
121 'V8_E' => 'c:E,Z:P=1', | |
122 'V8_DE' => 'c:E,D,B,Z:P=1', | |
123 'LysC' => 'c:K:P=1', | |
124 'LysC/P' => 'c:K', | |
125 'LysN/P' => 'n:K' | |
126 ); | |
127 | |
128 my @filters = ( | |
129 # "[R|K][R|K][R|K]..[S|T]." | |
130 # "[R|K][R|K][R|K].[S|T]." , | |
131 # "[R|K][R|K]..[S|T].", | |
132 # "[R|K]..[S|T].", | |
133 # "[R|K][R|K].[S|T].", | |
134 # "[R|K].[S|T]." | |
135 # "[S|T].[R|K]" | |
136 ); | |
137 | |
138 my %charges = ('A'=>0, 'B'=>0, 'C'=>0, 'D'=>-1, 'E'=>-1, 'F'=>0,'G'=>0, 'H'=>1, 'I'=>0, | |
139 'J'=>0, 'K'=>1, 'L'=>0, 'M'=>0, 'N'=>0, 'O'=>0,'P'=>0, 'Q'=>0, 'R'=>1, | |
140 'S'=>0, 'T'=>0, 'U'=>0, 'V'=>0, 'W'=>0, 'X'=>0,'Y'=>0, 'Z'=>0); | |
141 | |
142 my %pept = (); #hashed array of peptides (HASH) and fasta headers (VALUE) | |
143 | |
144 my $H20 = 18.015; #Mol. weight water | |
145 | |
146 my $PH_MIN = 0.0; #min pH value | |
147 my $PH_MAX = 14.0; #max pH value | |
148 my $MAXLOOP = 2000; # max number iterations | |
149 my $EPSI = 0.0001; # desired presision | |
150 | |
151 # | |
152 # Get options. | |
153 # | |
154 my %opts; | |
155 Getopt::Std::getopts('i:o:e:r:p:m:n:c:l:sa', \%opts); | |
156 | |
157 my $input = $opts{'i'}; | |
158 my $output = $opts{'o'}; | |
159 my $enzyme = $opts{'e'}; | |
160 my $enzyme_cleavage_site_rules = $opts{'r'}; | |
161 my $partial_cleavage = $opts{'s'}; | |
162 my $partial_cleavage_chance = $opts{'p'}; | |
163 my $min_peptide_weight = $opts{'m'}; | |
164 my $max_peptide_weight = $opts{'n'}; | |
165 my $max_charge = $opts{'c'}; | |
166 my $add_sequence_context = $opts{'a'}; | |
167 my $log_level = $opts{'l'}; | |
168 | |
169 # | |
170 # Configure logging. | |
171 # | |
172 # Provides default if user did not specify log level: | |
173 $log_level = (defined($log_level) ? $log_level : 'INFO'); | |
174 | |
175 # Reset log level to default if user specified illegal log level. | |
176 $log_level = ( | |
177 defined($log_levels{$log_level}) | |
178 ? $log_levels{$log_level} | |
179 : $log_levels{'INFO'}); | |
180 | |
181 #Log::Log4perl->init('log4perl.properties'); | |
182 Log::Log4perl->easy_init( | |
183 | |
184 #{ level => $log_level, | |
185 # file => ">>ProteinDigestor.log", | |
186 # layout => '%F{1}-%L-%M: %m%n' }, | |
187 { | |
188 level => $log_level, | |
189 file => "STDOUT", | |
190 layout => '%d L:%L %p> %m%n' | |
191 }, | |
192 ); | |
193 my $logger = Log::Log4perl::get_logger(); | |
194 | |
195 # | |
196 # Check user input. | |
197 # | |
198 unless (defined($input) && defined($output)) { | |
199 _Usage(); | |
200 exit; | |
201 } | |
202 if ($input =~ /^$/ || $output =~ /^$/) { | |
203 _Usage(); | |
204 exit; | |
205 } | |
206 if ($input eq $output) { | |
207 $logger->fatal('Output file is the same as the input file. Select a different file for the output.'); | |
208 exit; | |
209 } | |
210 | |
211 # | |
212 # Check input file. | |
213 # | |
214 unless (-e $input && -f $input && -r $input) { | |
215 $logger->fatal('Cannot read from input file ' . $input . ': ' . $!); | |
216 exit; | |
217 } | |
218 | |
219 # | |
220 # Set additional defaults. | |
221 # | |
222 $partial_cleavage = (defined($partial_cleavage) ? $partial_cleavage : 0); # Disabled by default | |
223 $partial_cleavage_chance = (defined($partial_cleavage_chance) ? $partial_cleavage_chance : 0.1); # 10% change by default | |
224 $add_sequence_context = (defined($add_sequence_context) ? $add_sequence_context : 0); # Disabled by default | |
225 | |
226 ######################################################## | |
227 # MAIN PROGRAM | |
228 ######################################################## | |
229 | |
230 $logger->info('Starting...'); | |
231 | |
232 # | |
233 # Get protease cleavage site specs. | |
234 # | |
235 unless (defined($enzyme_cleavage_site_rules)) { | |
236 | |
237 # | |
238 # Set default to Trypsin if the user did not specify an enzyme name nor a string with enzyme cleavage site rules. | |
239 # | |
240 $enzyme = (defined($enzyme) ? $enzyme : 'Trypsin'); | |
241 | |
242 # | |
243 # Check for illegal enzymes. | |
244 # | |
245 unless (defined($enzymes{$enzyme})) { | |
246 $logger->fatal('Unkown enzyme ' . $enzyme . '.'); | |
247 exit; | |
248 } | |
249 | |
250 # | |
251 # Fetch enzyme cleavage site rules. | |
252 # | |
253 $enzyme_cleavage_site_rules = $enzymes{$enzyme}; | |
254 | |
255 } | |
256 | |
257 $logger->info('Using enzyme cleavage site rules: ' . $enzyme_cleavage_site_rules); | |
258 my @split_params = split(/:/, $enzyme_cleavage_site_rules); | |
259 my $cut_term = $split_params[0]; | |
260 my @cut_aa = split(/,/, $split_params[1]); | |
261 my @inhibition_aa; | |
262 if (defined($split_params[2])) { | |
263 @inhibition_aa = split(/,/, $split_params[2]); | |
264 } | |
265 my $annotation = {}; | |
266 my $sequence = ''; | |
267 | |
268 # | |
269 # Create filehandles. | |
270 # | |
271 open(my $input_fh, "$input"); | |
272 open(my $output_fh, ">$output"); | |
273 | |
274 # | |
275 # Write header to result file. | |
276 # | |
277 print($output_fh "Protein ID\tPeptide\tMW\tpI\tCharge\tnumber S\tnumber T\tnumber Y\n"); | |
278 | |
279 # | |
280 # Digest fasta files and store peptides in an array (hashed) | |
281 # | |
282 foreach my $line (<$input_fh>){ | |
283 | |
284 if ($line =~ /^>/){ | |
285 | |
286 # | |
287 # We found a new FASTA header. | |
288 # | |
289 | |
290 # | |
291 # Process the previously found protein if we already found one. | |
292 # | |
293 unless ($sequence eq '') { | |
294 _ProcessProtein(); | |
295 } | |
296 | |
297 # | |
298 # Parse new FASTA header. | |
299 # | |
300 $line =~ s/[\r\n\f]+//g; | |
301 $annotation = _ParseHeader($line); | |
302 $sequence = ''; | |
303 | |
304 } else { | |
305 | |
306 # | |
307 # Found a sequence line. | |
308 # | |
309 $line =~ s/[\t\s\r\n\f0-9\*\-]+//g; | |
310 $sequence .= $line; | |
311 } | |
312 } | |
313 | |
314 # | |
315 # Don't foget to process the last sequence! | |
316 # | |
317 _ProcessProtein(); | |
318 | |
319 close($input_fh); | |
320 close($output_fh); | |
321 | |
322 $logger->info('Finished!'); | |
323 | |
324 ######################################################### | |
325 # SUBS | |
326 ######################################################### | |
327 | |
328 sub _ProcessProtein { | |
329 | |
330 my $id = ${$annotation}{'ID'}; | |
331 $logger->debug('Accession/ID: ' . $id); | |
332 $logger->debug('Sequence: ' . $sequence . "\n"); | |
333 my $total_length_sequence = length($sequence); | |
334 my $total_peptide_length = 0; | |
335 | |
336 my ($peptides) = _Digest($sequence, $annotation, \@cut_aa, $cut_term, \@inhibition_aa); | |
337 | |
338 # | |
339 # Process peptides. | |
340 # | |
341 foreach my $peptide_with_context (sort(keys(%{$peptides}))) { | |
342 | |
343 # | |
344 # Get peptide properties. | |
345 # | |
346 $logger->debug('Peptide with context: ' . $peptide_with_context); | |
347 my ($bare_peptide) = _ParsePeptide($peptide_with_context); | |
348 my $peplength = length($bare_peptide); | |
349 my ($pepweight) = _MwCalc($bare_peptide); | |
350 | |
351 # | |
352 # Apply filters | |
353 # | |
354 # Filter 1 is on weight. | |
355 # | |
356 if (defined($min_peptide_weight)) { | |
357 | |
358 $logger->debug('Minimum weight filter:'); | |
359 $logger->debug("\t" . 'Peptide weight: ' . $pepweight); | |
360 | |
361 if ($pepweight >= $min_peptide_weight) { | |
362 | |
363 $logger->debug("\t" . 'PASSED - Weight is >= ' . $min_peptide_weight); | |
364 | |
365 } else { | |
366 | |
367 # Skip this one. | |
368 $logger->debug("\t" . 'REJECTED - Weight is < ' . $min_peptide_weight); | |
369 next; | |
370 | |
371 } | |
372 } | |
373 | |
374 if (defined($max_peptide_weight)) { | |
375 | |
376 $logger->debug('Maximum weight filter:'); | |
377 $logger->debug("\t" . 'Peptide weight: ' . $pepweight); | |
378 | |
379 if ($pepweight <= $max_peptide_weight) { | |
380 | |
381 $logger->debug("\t" . 'PASSED - Weight is <= ' . $max_peptide_weight); | |
382 | |
383 } else { | |
384 | |
385 # Skip this one. | |
386 $logger->debug("\t" . 'REJECTED - Weight is > ' . $max_peptide_weight); | |
387 next; | |
388 | |
389 } | |
390 } | |
391 | |
392 # | |
393 # Filter 2 is on sequence motifs. | |
394 # | |
395 if (scalar(@filters) > 0) { | |
396 | |
397 $logger->debug('Motif filter:'); | |
398 my ($motif) = _ContainsMotif($bare_peptide, \@filters); | |
399 | |
400 if ($motif > 0) { | |
401 | |
402 $logger->debug("\t" . 'PASSED - Peptide contains motif ' . $motif); | |
403 | |
404 } else { | |
405 | |
406 # Skip this one. | |
407 $logger->debug("\t" . 'REJECTED - Peptide does not contain any motif.'); | |
408 next; | |
409 | |
410 } | |
411 } | |
412 | |
413 my ($pepPI) = _PiCalc($bare_peptide); | |
414 my ($pepCharge) = _ChargeCalc($bare_peptide); | |
415 | |
416 # | |
417 # Filter 3 is on the amount of charges. | |
418 # | |
419 if (defined($max_charge)) { | |
420 | |
421 $logger->debug('Charge filter:'); | |
422 $logger->debug("\t" . 'Peptide charge: ' . $pepCharge); | |
423 | |
424 if (defined($max_charge) && $pepCharge <= $max_charge) { | |
425 | |
426 $logger->debug("\t" . 'PASSED - Charge <= ' . $max_charge); | |
427 | |
428 } else { | |
429 | |
430 # Skip this one. | |
431 $logger->debug("\t" . 'REJECTED - Charge > ' . $max_charge); | |
432 next; | |
433 | |
434 } | |
435 } | |
436 | |
437 # | |
438 # If there is no partial digestion we can calculated the % coverage. | |
439 # | |
440 unless ($partial_cleavage) { | |
441 $total_peptide_length += length($bare_peptide); | |
442 } | |
443 | |
444 my ($residue_count) = _CountResiduesSTY($bare_peptide); | |
445 | |
446 # | |
447 # Store peptide. | |
448 # | |
449 my $pep_sequence; | |
450 | |
451 if ($add_sequence_context) { | |
452 $pep_sequence = $peptide_with_context; | |
453 } else { | |
454 $pep_sequence = $bare_peptide; | |
455 } | |
456 | |
457 print($output_fh $id . "\t" . $pep_sequence . "\t" . $pepweight . "\t" . $pepPI. "\t" . $pepCharge. "\t" . $residue_count . "\n"); | |
458 | |
459 } | |
460 | |
461 # | |
462 # If partial digestion was not enabled, we can calculated the % coverage. | |
463 # | |
464 unless ($partial_cleavage) { | |
465 my $coverage = ($total_peptide_length / $total_length_sequence)*100; | |
466 $logger->debug("Sequence residues: $total_length_sequence | #peptide residues: $total_peptide_length | %Coverage: $coverage"); | |
467 } | |
468 } | |
469 | |
470 sub _ParseHeader { | |
471 | |
472 my ($header) = @_; | |
473 my %annotation; | |
474 my $ids; | |
475 my $id; | |
476 my $description; | |
477 | |
478 $header=~ s/^>//; | |
479 | |
480 if ($header =~ m/([^\s]+)\s+([^\s]+.*)/) { | |
481 $ids = $1; | |
482 $description = $2; | |
483 } else { | |
484 $ids = $header; | |
485 } | |
486 | |
487 # | |
488 # Redimentory support for detecting multiple IDs: check for pipe sperated IDs. | |
489 # | |
490 if ($ids =~ m/([^\|])\|/) { | |
491 $id = $1; | |
492 } else { | |
493 $id = $ids; | |
494 } | |
495 | |
496 $annotation{'ID'} = $id; | |
497 $annotation{'DE'} = $description; | |
498 | |
499 return (\%annotation); | |
500 | |
501 } | |
502 | |
503 sub _ContainsMotif{ | |
504 | |
505 my ($peptide, $filters) = @_; | |
506 my $cnt = 0; | |
507 | |
508 foreach my $filter (@{$filters}){ | |
509 | |
510 $cnt++; | |
511 if ($peptide =~ /$filter/){ | |
512 return($cnt); | |
513 } | |
514 | |
515 } | |
516 | |
517 return(0); | |
518 | |
519 } | |
520 | |
521 sub _ParsePeptide{ | |
522 | |
523 my ($peptide_with_flanking_context) = @_; | |
524 | |
525 my @pep = split(/\./, $peptide_with_flanking_context); | |
526 my $bare_peptide = uc($pep[1]); | |
527 | |
528 return($bare_peptide); | |
529 | |
530 } | |
531 | |
532 sub _Digest{ | |
533 | |
534 my ($sequence, $annotation, $enz_cutting, $cut_term, $enz_restrict) = @_; | |
535 | |
536 $sequence = uc($sequence); | |
537 my %pept = (); | |
538 | |
539 # we start by scanning the sequence.. to find one of the cutting rulez not followed by a restict rule | |
540 # what is the sequence length | |
541 my @array = split(//, $sequence); | |
542 my $length = scalar(@array); | |
543 my $start_offset = 0; | |
544 my $offset_adjust_for_cleavage_terminus = 0; | |
545 if ($cut_term eq 'n') { | |
546 $offset_adjust_for_cleavage_terminus = 1; | |
547 } | |
548 | |
549 my $peptide = ''; | |
550 | |
551 my $counter = 1; | |
552 if (!($partial_cleavage)) { | |
553 $partial_cleavage_chance = 0; | |
554 } else { | |
555 $counter = 100; | |
556 } | |
557 | |
558 #this is for incomplete cleavage... we repeat digest 100 times with a probability that the enzyme will not cut | |
559 for (my $cnt = 0; $cnt < $counter; $cnt++) { | |
560 | |
561 AA:for (my $aa_offset = 0 + $offset_adjust_for_cleavage_terminus; $aa_offset < ($length - 1 + $offset_adjust_for_cleavage_terminus); $aa_offset++) { | |
562 | |
563 my $this_aa = substr($sequence, $aa_offset, 1); | |
564 | |
565 # | |
566 # Check if protease recognizes this amino acid as cleavage site. | |
567 # | |
568 foreach my $cut_aa (@{$enz_cutting}) { | |
569 | |
570 if ($this_aa eq $cut_aa) { | |
571 | |
572 # | |
573 # Check if AA is not preceeded or followed by an AA that inhibits the protease... | |
574 # | |
575 foreach my $inhibit (@{$enz_restrict}) { | |
576 | |
577 | |
578 my @inhibit_conditions = split(/=/, $inhibit); | |
579 my $inhibit_aa = $inhibit_conditions[0]; | |
580 my $position_to_check = $inhibit_conditions[1]; | |
581 | |
582 my $neighborhood_aa = substr($sequence, $aa_offset + $position_to_check, 1); | |
583 | |
584 if ($neighborhood_aa eq $inhibit_aa){ | |
585 $logger->debug('No cleavage due to inhibition by ' . $inhibit_aa . ' at position ' . $position_to_check); | |
586 next AA; | |
587 } | |
588 } | |
589 | |
590 # | |
591 # Consider the possibility of not cutting when doing incomplete digestions! | |
592 # | |
593 my $random = 1; | |
594 if ($partial_cleavage){ | |
595 $random = rand(1); | |
596 } | |
597 if ($random <= $partial_cleavage_chance){ | |
598 $logger->debug('No cleavage due to incomplete digestion.'); | |
599 next AA; | |
600 } | |
601 | |
602 # | |
603 # If no inhibition spoiled the cleavage, lets cut! | |
604 # | |
605 my $cut_offset; | |
606 if ($cut_term eq 'c') { | |
607 $cut_offset = $aa_offset + 1; | |
608 } elsif ($cut_term eq 'n') { | |
609 $cut_offset = $aa_offset; | |
610 } | |
611 | |
612 # nice formatting of peptide | |
613 if ($start_offset == 0){ | |
614 $peptide .="terminus."; | |
615 $peptide .= substr($sequence, $start_offset, $cut_offset - $start_offset)."."; | |
616 }else{ | |
617 $peptide .= substr($sequence, $start_offset - 3, 3)."."; | |
618 $peptide .= substr($sequence, $start_offset, $cut_offset - $start_offset)."."; | |
619 } | |
620 $peptide .= substr($sequence, $cut_offset, 3); | |
621 $start_offset = $cut_offset; | |
622 | |
623 # check if peptide already exists | |
624 # if so, concatenate this fasta header | |
625 #if (exists($pept{$peptide})){ | |
626 # $fasta_header .= $pept{$peptide}; | |
627 #} | |
628 $pept{$peptide} = ${$annotation}{'ID'}; | |
629 $peptide = ''; | |
630 } | |
631 } | |
632 } | |
633 | |
634 # | |
635 # The last peptide sequence ! | |
636 # | |
637 $peptide = substr($sequence, $start_offset - 3, 3)."."; | |
638 $peptide .= substr($sequence, $start_offset, $length); | |
639 $peptide .= ".terminus"; | |
640 $pept{$peptide} = ${$annotation}{'ID'}; | |
641 | |
642 } #COUNT | |
643 | |
644 return(\%pept); | |
645 | |
646 } | |
647 | |
648 sub _PiCalc{ | |
649 | |
650 my ($sequence) = @_; | |
651 | |
652 my $nTermResidue; | |
653 my $cTermResidue; | |
654 my $phMin;my $phMid ;my $phMax; | |
655 my $charge; | |
656 my $cter; my $nter; my $carg; my $chis; my $clys;my $casp;my $cglu;my $ccys;my $ctyr; | |
657 | |
658 my $composition = _GetComposition($sequence); | |
659 my $strlength = length($sequence); | |
660 $nTermResidue = substr($sequence, 0, 1); | |
661 for ($nTermResidue){s/[\s\n\r\t]//g;}; | |
662 $cTermResidue = substr($sequence, $strlength - 1, 1); | |
663 for ($cTermResidue){s/[\s\n\r\t]//g;}; | |
664 $phMin = $PH_MIN; | |
665 $phMax = $PH_MAX; | |
666 my $i; | |
667 | |
668 for ($i=0, $charge = 1.0; $i<$MAXLOOP && ($phMax-$phMin)>$EPSI; $i++){ | |
669 $phMid = $phMin + ($phMax - $phMin) / 2; | |
670 $cter = 10**(-$cPk{$cTermResidue}->[0]) / (10**(-$cPk{$cTermResidue}->[0]) + 10**(-$phMid)); | |
671 $nter = 10**(-$phMid)/(10**(-$cPk{$nTermResidue}->[1])+10**(-$phMid)); | |
672 | |
673 $carg = ${$composition}{R} * 10**(-$phMid) / (10**(-$cPk{'R'}->[2])+10**(-$phMid)); | |
674 $chis = ${$composition}{H} * 10**(-$phMid) / (10**(-$cPk{'H'}->[2])+10**(-$phMid)); | |
675 $clys = ${$composition}{K} * 10**(-$phMid) / (10**(-$cPk{'K'}->[2])+10**(-$phMid)); | |
676 | |
677 $casp = ${$composition}{D} * 10**(-$cPk{'D'}->[2]) / (10**(-$cPk{'D'}->[2])+10**(-$phMid)); | |
678 $cglu = ${$composition}{E} * 10**(-$cPk{'E'}->[2]) / (10**(-$cPk{'E'}->[2])+10**(-$phMid)); | |
679 | |
680 $ccys = ${$composition}{C} * 10**(-$cPk{'C'}->[2]) / (10**(-$cPk{'C'}->[2])+10**(-$phMid)); | |
681 $ctyr = ${$composition}{Y} * 10**(-$cPk{'Y'}->[2]) / (10**(-$cPk{'Y'}->[2])+10**(-$phMid)); | |
682 | |
683 $charge = $carg + $clys + $chis + $nter - ($casp + $cglu + $ctyr + $ccys + $cter); | |
684 | |
685 if ($charge > 0.0){ | |
686 $phMin = $phMid; | |
687 }else{ | |
688 $phMax = $phMid; | |
689 } | |
690 } | |
691 | |
692 return($phMid); | |
693 | |
694 } | |
695 | |
696 sub _MwCalc{ | |
697 | |
698 my ($sequence) = @_; | |
699 | |
700 my $mw; | |
701 my $strlength = length($sequence); | |
702 my $composition = _GetComposition($sequence); | |
703 | |
704 # subtract N-1 water molecules | |
705 $mw = - ($strlength -1) * $H20; | |
706 | |
707 foreach my $aa (sort keys %{$composition}){ | |
708 $logger->trace('AA:' . $aa . ' | frequency: ' . ${$composition}{$aa} . ' | AA MW: ' . $cMWs{$aa}); | |
709 $mw += ${$composition}{$aa} * $cMWs{$aa}; | |
710 } | |
711 | |
712 return($mw); | |
713 | |
714 } | |
715 | |
716 sub _ChargeCalc{ | |
717 | |
718 my ($sequence) = @_; | |
719 | |
720 my $charge; | |
721 my $composition = _GetComposition($sequence); | |
722 | |
723 foreach my $aa (sort(keys(%{$composition}))){ | |
724 $charge += ${$composition}{$aa} * $charges{$aa}; | |
725 } | |
726 | |
727 return($charge); | |
728 | |
729 } | |
730 | |
731 sub _CountResiduesSTY{ | |
732 | |
733 my ($sequence) = @_; | |
734 | |
735 my $composition = _GetComposition($sequence); | |
736 my $sty_residues = ${$composition}{'S'} . "\t" . ${$composition}{'T'} . "\t" . ${$composition}{'Y'}; | |
737 | |
738 return($sty_residues); | |
739 | |
740 } | |
741 | |
742 sub _GetComposition { | |
743 | |
744 my ($sequence) = @_; | |
745 | |
746 my $i; | |
747 my $c; | |
748 my %composition = ('A'=>0, 'B'=>0,'C'=>0, 'D'=>0, 'E'=>0, 'F'=>0,'G'=>0, 'H'=>0, 'I'=>0, | |
749 'J'=>0, 'K'=>0,'L'=>0, 'M'=>0, 'N'=>0, 'O'=>0,'P'=>0, 'Q'=>0, 'R'=>0, | |
750 'S'=>0, 'T'=>0,'U'=>0, 'V'=>0, 'W'=>0, 'X'=>0,'Y'=>0, 'Z'=>0); | |
751 | |
752 # | |
753 # Compute aminoacid composition. | |
754 # | |
755 my $strlength = length($sequence); | |
756 for ($i=0; $i<$strlength; $i++){ | |
757 $c = substr($sequence, $i, 1); | |
758 $composition{$c}++; | |
759 } | |
760 | |
761 return(\%composition); | |
762 | |
763 } | |
764 | |
765 sub _Usage { | |
766 | |
767 my $longest_enzyme_name_length = 0; | |
768 foreach my $protease (keys(%enzymes)) { | |
769 if (length($protease) > $longest_enzyme_name_length) { | |
770 $longest_enzyme_name_length = length($protease); | |
771 } | |
772 } | |
773 my $field_formatting = '%-' . ($longest_enzyme_name_length + 3) . 's'; | |
774 | |
775 print "\n"; | |
776 print "ProteinDigestor.pl - Digests protein sequences from a FASTA file\n"; | |
777 print "\n"; | |
778 print "Usage:\n"; | |
779 print "\n"; | |
780 print " ProteinDigestor.pl options\n"; | |
781 print "\n"; | |
782 print "Available options are:\n"; | |
783 print "\n"; | |
784 print " -i [file] Input FASTA file.\n"; | |
785 print " -o [file] Output stats file.\n"; | |
786 print "\n"; | |
787 print " You can either specify a protease this tool knows with -e\n"; | |
788 print " or you can specify the cleavage rule for a protease this does not know (yet) with -r\n"; | |
789 print "\n"; | |
790 print " -e [enzyme_name] Digestion enzyme. Known enzymes and their cleavage rules:\n"; | |
791 | |
792 foreach my $protease (sort(keys(%enzymes))) { | |
793 print ' ' . sprintf("$field_formatting", $protease) . $enzymes{$protease} . "\n"; | |
794 } | |
795 | |
796 print " -r [enzyme_rule] Protease cleavage site rule. This rule contains sperated by colons:\n"; | |
797 print " * The terminus that indicates on which side of a recognized amino acid the protease will cleave: c or n.\n"; | |
798 print " * The amino acids recognized by the protease.\n"; | |
799 print " Multiple amino acids separated by a comma.\n"; | |
800 print " * The amino acids that inhibit cleavage and their position relative to amino acids recognized by the protease.\n"; | |
801 print ' Amino acid and its relative position separated by an equals sign (=).' . "\n"; | |
802 print " Multiple amino acids separated by a comma.\n"; | |
803 print ' For example \'c:K,R:P=1\' for Trypsin with proline inhibition.' . "\n"; | |
804 print " -s Semi for partial digestion.\n"; | |
805 print " -p [chance] Number between 0 and 1 indicating the chance that a site will not be cleaved.\n"; | |
806 print " -m [weight] Minimum weight for a peptide.\n"; | |
807 print " -n [weight] Maximum weight for a peptide.\n"; | |
808 print " -c [int] Maximum charge a peptide is allowed to have.\n"; | |
809 print " -a Add sequence context.\n"; | |
810 print " This will add max 3 amino acids or the word terminus on each side of the peptides.\n"; | |
811 print " Contexts and peptides are separated with a dot.\n"; | |
812 print " Examples of peptides with a context:\n"; | |
813 print " terminus.MSVSLSAK.ASH\n"; | |
814 print " SAK.ASHLLCSSTR.VSL\n"; | |
815 print " STR.VSLSPAVTSSSSSPVVRPALSSSTS.terminus\n"; | |
816 print " -l [LEVEL] Log4perl log level. One of: ALL, TRACE, DEBUG, INFO (default), WARN, ERROR, FATAL or OFF.\n"; | |
817 print "\n"; | |
818 exit; | |
819 | |
820 } |