Mercurial > repos > cpt > cpt_psm_prep
comparison lib/CPT/Analysis/TerL.pm @ 1:d724f34e671d draft default tip
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author | cpt |
---|---|
date | Mon, 05 Jun 2023 02:50:07 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:e4de0a0e90c8 | 1:d724f34e671d |
---|---|
1 package CPT::Analysis::TerL; | |
2 | |
3 # ABSTRACT: Guess phage packaging strategy based on homology to terminases (TerL) of phages with known packaging strategies | |
4 | |
5 use strict; | |
6 use warnings; | |
7 use Data::Dumper; | |
8 use autodie; | |
9 use Moose; | |
10 use Bio::SearchIO; | |
11 use File::ShareDir; | |
12 use File::Spec; | |
13 | |
14 has 'hmmer_evalue_cutoff' => ( is => 'rw', isa => 'Int' ); | |
15 has 'blast_evalue_cutoff' => ( is => 'rw', isa => 'Int' ); | |
16 has 'blast_dice_cutoff' => ( is => 'rw', isa => 'Int' ); | |
17 | |
18 has 'search_hmmer' => ( is => 'rw', isa => 'Bool' ); | |
19 has 'search_blast' => ( is => 'rw', isa => 'Bool' ); | |
20 | |
21 has 'data_dir' => ( is => 'rw', isa => 'Any' ); | |
22 | |
23 has 'awk_string' => ( | |
24 is => 'ro', | |
25 isa => 'Str', | |
26 default => | |
27 'BEGIN{print "#row,query id,subject id,evalue,dice" } {qid=$1;sid=$2;percent_identical=$3;query_length=$4;subject_length=$5;evalue=$6;dice=(2*percent_identical*subject_length/ ( subject_length + query_length ) );printf("%d,%s,%s,%s,%s\n",FNR, qid, sid, dice, evalue);}' | |
28 ); | |
29 | |
30 has 'input_file' => ( is => 'rw', isa => 'Str' ); | |
31 | |
32 my @column_max = ( 20, 10, 20, 10, 20, 60 ); | |
33 my %hits; | |
34 | |
35 sub run { | |
36 my ( $self, %data ) = @_; | |
37 | |
38 #$self->prepare(%data); | |
39 my $dir = '/galaxy/tool-data/' # File::ShareDir::dist_dir('CPT-Analysis-TerL'); | |
40 $self->data_dir($dir); | |
41 $self->input_file( $data{input_file} ); | |
42 | |
43 if ( $self->search_hmmer() ) { | |
44 $self->hmmer(); | |
45 } | |
46 if ( $self->search_blast() ) { | |
47 $self->blast($self); | |
48 } | |
49 | |
50 return $self->guess(); | |
51 } | |
52 | |
53 sub hmmer { | |
54 my ($self) = @_; | |
55 | |
56 use Term::ProgressBar 2.00; | |
57 my $progress = Term::ProgressBar->new( | |
58 { | |
59 name => 'HMMER', | |
60 count => 100, | |
61 ETA => 'linear', | |
62 } | |
63 ); | |
64 $progress->max_update_rate(1); | |
65 | |
66 my $hmmer_db_dir = | |
67 File::Spec->catdir( $self->data_dir(), 'db', 'hmmer' ); | |
68 my @hmmer_dbs = glob( File::Spec->catfile( $hmmer_db_dir, "*.hmm" ) ); | |
69 | |
70 my $i = 0; | |
71 foreach (@hmmer_dbs) { | |
72 $progress->update( 100 * ( $i++ ) / scalar @hmmer_dbs ); | |
73 my $db_name = substr( $_, rindex( $_, '/' ) + 1, -4 ); | |
74 my $output_filename = sprintf( | |
75 '%s.%s.out', | |
76 substr( | |
77 $self->input_file(), 0, | |
78 rindex( $self->input_file(), '.' ) | |
79 ), | |
80 $db_name | |
81 ); | |
82 my $query = sprintf( 'hmmsearch %s %s > %s', | |
83 $_, $self->input_file(), $output_filename ); | |
84 system($query); | |
85 | |
86 my $in = Bio::SearchIO->new( | |
87 -format => 'hmmer', | |
88 -file => $output_filename | |
89 ); | |
90 while ( my $result = $in->next_result ) { | |
91 while ( my $hit = $result->next_hit ) { | |
92 while ( my $hsp = $hit->next_hsp ) { | |
93 my ( $from, $to ) = | |
94 ( $result->query_name, $hit->name ); | |
95 unless ( $hits{$to}{$from} ) { | |
96 $hits{$to}{$from} = []; | |
97 } | |
98 push( | |
99 $hits{$to}{$from}, | |
100 { | |
101 'type' => 'hmmer', | |
102 'data' => { | |
103 'evalue' => ( | |
104 $hsp | |
105 ->evalue | |
106 eq '0' | |
107 ? '0.0' | |
108 : $hsp | |
109 ->evalue | |
110 ), | |
111 } | |
112 } | |
113 ); | |
114 } | |
115 } | |
116 } | |
117 unlink($output_filename); | |
118 } | |
119 $progress->update(100); | |
120 } | |
121 | |
122 sub blast { | |
123 my ($self) = @_; | |
124 | |
125 use Term::ProgressBar 2.00; | |
126 my $progress = Term::ProgressBar->new( | |
127 { | |
128 name => 'Blast', | |
129 count => 100, | |
130 ETA => 'linear', | |
131 } | |
132 ); | |
133 $progress->max_update_rate(1); | |
134 | |
135 my $blast_db_dir = | |
136 File::Spec->catdir( $self->data_dir(), 'db', 'blast' ); | |
137 my @blast_dbs = | |
138 map { substr( $_, 0, -4 ) } | |
139 glob( File::Spec->catfile( $blast_db_dir, "*.phr" ) ); | |
140 | |
141 my $i = 0; | |
142 | |
143 #my %hits; | |
144 | |
145 foreach my $blast_db (@blast_dbs) { | |
146 $progress->update( 100 * ( $i++ ) / scalar(@blast_dbs) ); | |
147 my $output_str = | |
148 substr( $blast_db, rindex( $blast_db, '/' ) + 1 ) . '.csv'; | |
149 my $query = sprintf( | |
150 'psiblast -query %s -db %s -evalue %s -outfmt "6 qseqid sseqid pident qlen slen evalue" | awk \'%s\' > %s', | |
151 $self->input_file(), $blast_db, '1e-5', | |
152 $self->awk_string(), $output_str ); | |
153 system($query); | |
154 open( my $tmpfh, '<', $output_str ); | |
155 while (<$tmpfh>) { | |
156 chomp $_; | |
157 if ( $_ !~ /^#/ ) { | |
158 my @line = split( /,/, $_ ); | |
159 unless ( $hits{ $line[1] }{ $line[2] } ) { | |
160 $hits{ $line[1] }{ $line[2] } = []; | |
161 } | |
162 push( | |
163 $hits{ $line[1] }{ $line[2] }, | |
164 { | |
165 'type' => 'psiblast', | |
166 'data' => { | |
167 'evalue' => $line[4], | |
168 'dice' => $line[3], | |
169 } | |
170 } | |
171 ); | |
172 } | |
173 } | |
174 close($tmpfh); | |
175 unlink($output_str); | |
176 | |
177 } | |
178 $progress->update(100); | |
179 } | |
180 | |
181 sub guess { | |
182 my ($self) = @_; | |
183 | |
184 open( my $groupings_fh, | |
185 '<', | |
186 File::Spec->catfile( $self->data_dir(), 'groupings.tsv' ) ); | |
187 | |
188 # Load groupings.tsv into memory | |
189 my %data; | |
190 while (<$groupings_fh>) { | |
191 if ( $_ !~ /^#/ ) { | |
192 chomp $_; | |
193 my ( $major, $minor, $hit ) = split( /\t/, $_ ); | |
194 unless ( $data{$major}{$minor} ) { | |
195 $data{$major}{$minor} = []; | |
196 } | |
197 push( $data{$major}{$minor}, $hit ); | |
198 } | |
199 } | |
200 | |
201 # Create a reverse lookup table | |
202 my %rdata; | |
203 foreach my $i ( keys %data ) { | |
204 foreach my $j ( keys %{ $data{$i} } ) { | |
205 foreach my $k ( @{ $data{$i}{$j} } ) { | |
206 if ( defined($k) ) { | |
207 $rdata{$k} = [ $i, $j ]; | |
208 } | |
209 else { | |
210 print "$i $j\n"; | |
211 } | |
212 } | |
213 } | |
214 } | |
215 | |
216 # Table printing stuff | |
217 my @header = ( | |
218 'Major Category', | |
219 'Major hits', | |
220 'Minor Category', | |
221 'Minor hits', | |
222 'Analysis', | |
223 'Evidence Type', | |
224 'Evidence' | |
225 ); | |
226 my %output; | |
227 | |
228 # Loop across the input keys | |
229 foreach my $input_key ( keys %hits ) { | |
230 my %guesses; | |
231 my %guess_evidence; | |
232 | |
233 # And across all of the hits that the query hit to | |
234 foreach my $against ( keys %{ $hits{$input_key} } ) { | |
235 | |
236 # We look at the evidence | |
237 my @evidence = @{ $hits{$input_key}{$against} }; | |
238 | |
239 my ( $type_major, $type_minor ); | |
240 if ( $rdata{$against} ) { | |
241 ( $type_major, $type_minor ) = | |
242 @{ $rdata{$against} }; | |
243 } | |
244 else { | |
245 ( $type_major, $type_minor ) = ( | |
246 substr( | |
247 $against, 0, | |
248 rindex( $against, '_' ) | |
249 ), | |
250 substr( | |
251 $against, | |
252 rindex( $against, '_' ) + 1 | |
253 ) | |
254 ); | |
255 } | |
256 | |
257 # Prepare hashes. | |
258 unless ( $guesses{$type_major} ) { | |
259 $guesses{$type_major} = (); | |
260 } | |
261 unless ( $guesses{$type_major}{$type_minor} ) { | |
262 $guesses{$type_major}{$type_minor} = (); | |
263 } | |
264 | |
265 # Loop across the evidence | |
266 foreach my $piece_of_evidence (@evidence) { | |
267 | |
268 # Here is an example piece of evidence | |
269 # 'GK_Gilmour_Gene43' => { | |
270 # 'SP18' => [ | |
271 # { | |
272 # 'data' => { | |
273 # 'evalue' => '2e-08', | |
274 # 'dice' => '23.8627' | |
275 # }, | |
276 # 'type' => 'psiblast' | |
277 # } | |
278 # ], | |
279 # | |
280 if ( $type_major !~ /subject i/ ) { | |
281 my %piece = %{$piece_of_evidence}; | |
282 if ( | |
283 $self->validate_evidence( | |
284 $piece_of_evidence) > 0 | |
285 ) | |
286 { | |
287 $guess_evidence{$type_major}++; | |
288 $guess_evidence{ | |
289 "$type_major$type_minor" | |
290 }++; | |
291 | |
292 # If it's not defined, set up sub arrays. | |
293 unless ( $guesses{$type_major} | |
294 {$type_minor} | |
295 { $piece{'type'} } ) | |
296 { | |
297 if ( $piece{'type'} | |
298 eq 'psiblast' ) | |
299 { | |
300 $guesses{ | |
301 $type_major | |
302 }{$type_minor} | |
303 { | |
304 $piece{ | |
305 'type' | |
306 } | |
307 } | |
308 = { | |
309 'evalue' | |
310 => [], | |
311 'dice' | |
312 => [] | |
313 }; | |
314 } | |
315 else { | |
316 $guesses{ | |
317 $type_major | |
318 }{$type_minor} | |
319 { | |
320 $piece{ | |
321 'type' | |
322 } | |
323 } | |
324 = { 'evalue' | |
325 => [] | |
326 }; | |
327 } | |
328 } | |
329 | |
330 # If it's zero, correct to zero. | |
331 if ( $piece{'data'}{'evalue'} | |
332 eq '0.0' ) | |
333 { | |
334 $piece{'data'}{'evalue'} | |
335 = '0.0'; | |
336 } | |
337 | |
338 # Add our evalue | |
339 push( | |
340 $guesses{$type_major} | |
341 {$type_minor} | |
342 { $piece{'type'} } | |
343 {'evalue'}, | |
344 $piece{'data'}{'evalue'} | |
345 ); | |
346 | |
347 # And if psiblast, add dice | |
348 if ( $piece{'type'} eq | |
349 'psiblast' ) | |
350 { | |
351 push( | |
352 $guesses{ | |
353 $type_major | |
354 }{$type_minor} | |
355 { | |
356 $piece{ | |
357 'type' | |
358 } | |
359 }{'dice'}, | |
360 $piece{'data'} | |
361 {'dice'} | |
362 ); | |
363 } | |
364 } | |
365 } | |
366 } | |
367 } | |
368 | |
369 my @output_sheet; | |
370 | |
371 foreach my $major ( keys %guesses ) { | |
372 if ( $guess_evidence{$major} ) { | |
373 foreach my $minor ( keys %{ $guesses{$major} } ) | |
374 { | |
375 if ( $guess_evidence{"$major$minor"} ) { | |
376 foreach my $evidence_category ( | |
377 keys %{ | |
378 $guesses{$major} | |
379 {$minor} | |
380 } | |
381 ) | |
382 { | |
383 if ( $evidence_category | |
384 ne 'evidence' ) | |
385 { | |
386 # things like evalue, dice | |
387 my %hits = %{ | |
388 $guesses{ | |
389 $major | |
390 }{ | |
391 $minor | |
392 }{ | |
393 $evidence_category | |
394 } | |
395 }; | |
396 foreach | |
397 my $subtype ( | |
398 keys | |
399 %hits ) | |
400 { # should be evalue, dice | |
401 push( | |
402 @output_sheet, | |
403 [ | |
404 $major, | |
405 $guess_evidence{ | |
406 $major | |
407 } | |
408 , | |
409 $minor, | |
410 $guess_evidence{ | |
411 "$major$minor" | |
412 } | |
413 , | |
414 $evidence_category, | |
415 $subtype, | |
416 join | |
417 ( | |
418 ',', | |
419 @{ | |
420 $hits{ | |
421 $subtype | |
422 } | |
423 } | |
424 ) | |
425 ] | |
426 ); | |
427 } | |
428 } | |
429 } | |
430 } | |
431 } | |
432 } | |
433 } | |
434 | |
435 if ( !scalar @output_sheet ) { | |
436 @output_sheet = ( ['No evidence above threshold'], ); | |
437 } | |
438 $output{$input_key} = { | |
439 header => \@header, | |
440 data => \@output_sheet, | |
441 }; | |
442 } | |
443 return \%output; | |
444 } | |
445 | |
446 sub validate_evidence { | |
447 my ( $self, $piece_of_evidence ) = @_; | |
448 my %piece = %{$piece_of_evidence}; | |
449 | |
450 #my ($self, $type, $subtype, $value) = @_; | |
451 # { | |
452 # 'data' => { | |
453 # 'evalue' => '2e-08', | |
454 # 'dice' => '23.8627' | |
455 # }, | |
456 # 'type' => 'psiblast' | |
457 # } | |
458 if ( $piece{type} eq 'hmmer' ) { | |
459 my $value = $piece{data}{evalue}; | |
460 if ( $value eq '0.0' || $value eq '0' ) { | |
461 return 1; | |
462 } | |
463 elsif ( !defined($value) || $value eq '' ) { | |
464 return 0; | |
465 } | |
466 else { | |
467 return ( log($value) < $self->hmmer_evalue_cutoff() ) | |
468 ; # -64 | |
469 } | |
470 } | |
471 elsif ( $piece{type} eq 'psiblast' ) { | |
472 my $evalue = $piece{data}{evalue}; | |
473 my $dice = $piece{data}{dice}; | |
474 | |
475 my $evalue_return = 0; | |
476 if ( $evalue eq '0.0' || $evalue eq '0' ) { | |
477 $evalue_return = 1; | |
478 } | |
479 else { | |
480 $evalue_return = | |
481 ( log($evalue) < $self->blast_evalue_cutoff() ); #-140 | |
482 } | |
483 | |
484 return $evalue_return | |
485 && ( $dice > $self->blast_dice_cutoff() ); # 30 | |
486 } | |
487 } | |
488 | |
489 1; | |
490 | |
491 __END__ | |
492 | |
493 =pod | |
494 | |
495 =encoding UTF-8 | |
496 | |
497 =head1 NAME | |
498 | |
499 CPT::Analysis::TerL - Guess phage packaging strategy based on homology to terminases (TerL) of phages with known packaging strategies | |
500 | |
501 =head1 VERSION | |
502 | |
503 version 1.96 | |
504 | |
505 =head1 AUTHOR | |
506 | |
507 Eric Rasche <rasche.eric@yandex.ru> | |
508 | |
509 =head1 COPYRIGHT AND LICENSE | |
510 | |
511 This software is Copyright (c) 2014 by Eric Rasche. | |
512 | |
513 This is free software, licensed under: | |
514 | |
515 The GNU General Public License, Version 3, June 2007 | |
516 | |
517 =cut |