Mercurial > repos > cpt > cpt_psm_prep
comparison lib/CPT/Analysis/TerL.pm @ 1:d724f34e671d draft default tip
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
| author | cpt |
|---|---|
| date | Mon, 05 Jun 2023 02:50:07 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 0:e4de0a0e90c8 | 1:d724f34e671d |
|---|---|
| 1 package CPT::Analysis::TerL; | |
| 2 | |
| 3 # ABSTRACT: Guess phage packaging strategy based on homology to terminases (TerL) of phages with known packaging strategies | |
| 4 | |
| 5 use strict; | |
| 6 use warnings; | |
| 7 use Data::Dumper; | |
| 8 use autodie; | |
| 9 use Moose; | |
| 10 use Bio::SearchIO; | |
| 11 use File::ShareDir; | |
| 12 use File::Spec; | |
| 13 | |
| 14 has 'hmmer_evalue_cutoff' => ( is => 'rw', isa => 'Int' ); | |
| 15 has 'blast_evalue_cutoff' => ( is => 'rw', isa => 'Int' ); | |
| 16 has 'blast_dice_cutoff' => ( is => 'rw', isa => 'Int' ); | |
| 17 | |
| 18 has 'search_hmmer' => ( is => 'rw', isa => 'Bool' ); | |
| 19 has 'search_blast' => ( is => 'rw', isa => 'Bool' ); | |
| 20 | |
| 21 has 'data_dir' => ( is => 'rw', isa => 'Any' ); | |
| 22 | |
| 23 has 'awk_string' => ( | |
| 24 is => 'ro', | |
| 25 isa => 'Str', | |
| 26 default => | |
| 27 'BEGIN{print "#row,query id,subject id,evalue,dice" } {qid=$1;sid=$2;percent_identical=$3;query_length=$4;subject_length=$5;evalue=$6;dice=(2*percent_identical*subject_length/ ( subject_length + query_length ) );printf("%d,%s,%s,%s,%s\n",FNR, qid, sid, dice, evalue);}' | |
| 28 ); | |
| 29 | |
| 30 has 'input_file' => ( is => 'rw', isa => 'Str' ); | |
| 31 | |
| 32 my @column_max = ( 20, 10, 20, 10, 20, 60 ); | |
| 33 my %hits; | |
| 34 | |
| 35 sub run { | |
| 36 my ( $self, %data ) = @_; | |
| 37 | |
| 38 #$self->prepare(%data); | |
| 39 my $dir = '/galaxy/tool-data/' # File::ShareDir::dist_dir('CPT-Analysis-TerL'); | |
| 40 $self->data_dir($dir); | |
| 41 $self->input_file( $data{input_file} ); | |
| 42 | |
| 43 if ( $self->search_hmmer() ) { | |
| 44 $self->hmmer(); | |
| 45 } | |
| 46 if ( $self->search_blast() ) { | |
| 47 $self->blast($self); | |
| 48 } | |
| 49 | |
| 50 return $self->guess(); | |
| 51 } | |
| 52 | |
| 53 sub hmmer { | |
| 54 my ($self) = @_; | |
| 55 | |
| 56 use Term::ProgressBar 2.00; | |
| 57 my $progress = Term::ProgressBar->new( | |
| 58 { | |
| 59 name => 'HMMER', | |
| 60 count => 100, | |
| 61 ETA => 'linear', | |
| 62 } | |
| 63 ); | |
| 64 $progress->max_update_rate(1); | |
| 65 | |
| 66 my $hmmer_db_dir = | |
| 67 File::Spec->catdir( $self->data_dir(), 'db', 'hmmer' ); | |
| 68 my @hmmer_dbs = glob( File::Spec->catfile( $hmmer_db_dir, "*.hmm" ) ); | |
| 69 | |
| 70 my $i = 0; | |
| 71 foreach (@hmmer_dbs) { | |
| 72 $progress->update( 100 * ( $i++ ) / scalar @hmmer_dbs ); | |
| 73 my $db_name = substr( $_, rindex( $_, '/' ) + 1, -4 ); | |
| 74 my $output_filename = sprintf( | |
| 75 '%s.%s.out', | |
| 76 substr( | |
| 77 $self->input_file(), 0, | |
| 78 rindex( $self->input_file(), '.' ) | |
| 79 ), | |
| 80 $db_name | |
| 81 ); | |
| 82 my $query = sprintf( 'hmmsearch %s %s > %s', | |
| 83 $_, $self->input_file(), $output_filename ); | |
| 84 system($query); | |
| 85 | |
| 86 my $in = Bio::SearchIO->new( | |
| 87 -format => 'hmmer', | |
| 88 -file => $output_filename | |
| 89 ); | |
| 90 while ( my $result = $in->next_result ) { | |
| 91 while ( my $hit = $result->next_hit ) { | |
| 92 while ( my $hsp = $hit->next_hsp ) { | |
| 93 my ( $from, $to ) = | |
| 94 ( $result->query_name, $hit->name ); | |
| 95 unless ( $hits{$to}{$from} ) { | |
| 96 $hits{$to}{$from} = []; | |
| 97 } | |
| 98 push( | |
| 99 $hits{$to}{$from}, | |
| 100 { | |
| 101 'type' => 'hmmer', | |
| 102 'data' => { | |
| 103 'evalue' => ( | |
| 104 $hsp | |
| 105 ->evalue | |
| 106 eq '0' | |
| 107 ? '0.0' | |
| 108 : $hsp | |
| 109 ->evalue | |
| 110 ), | |
| 111 } | |
| 112 } | |
| 113 ); | |
| 114 } | |
| 115 } | |
| 116 } | |
| 117 unlink($output_filename); | |
| 118 } | |
| 119 $progress->update(100); | |
| 120 } | |
| 121 | |
| 122 sub blast { | |
| 123 my ($self) = @_; | |
| 124 | |
| 125 use Term::ProgressBar 2.00; | |
| 126 my $progress = Term::ProgressBar->new( | |
| 127 { | |
| 128 name => 'Blast', | |
| 129 count => 100, | |
| 130 ETA => 'linear', | |
| 131 } | |
| 132 ); | |
| 133 $progress->max_update_rate(1); | |
| 134 | |
| 135 my $blast_db_dir = | |
| 136 File::Spec->catdir( $self->data_dir(), 'db', 'blast' ); | |
| 137 my @blast_dbs = | |
| 138 map { substr( $_, 0, -4 ) } | |
| 139 glob( File::Spec->catfile( $blast_db_dir, "*.phr" ) ); | |
| 140 | |
| 141 my $i = 0; | |
| 142 | |
| 143 #my %hits; | |
| 144 | |
| 145 foreach my $blast_db (@blast_dbs) { | |
| 146 $progress->update( 100 * ( $i++ ) / scalar(@blast_dbs) ); | |
| 147 my $output_str = | |
| 148 substr( $blast_db, rindex( $blast_db, '/' ) + 1 ) . '.csv'; | |
| 149 my $query = sprintf( | |
| 150 'psiblast -query %s -db %s -evalue %s -outfmt "6 qseqid sseqid pident qlen slen evalue" | awk \'%s\' > %s', | |
| 151 $self->input_file(), $blast_db, '1e-5', | |
| 152 $self->awk_string(), $output_str ); | |
| 153 system($query); | |
| 154 open( my $tmpfh, '<', $output_str ); | |
| 155 while (<$tmpfh>) { | |
| 156 chomp $_; | |
| 157 if ( $_ !~ /^#/ ) { | |
| 158 my @line = split( /,/, $_ ); | |
| 159 unless ( $hits{ $line[1] }{ $line[2] } ) { | |
| 160 $hits{ $line[1] }{ $line[2] } = []; | |
| 161 } | |
| 162 push( | |
| 163 $hits{ $line[1] }{ $line[2] }, | |
| 164 { | |
| 165 'type' => 'psiblast', | |
| 166 'data' => { | |
| 167 'evalue' => $line[4], | |
| 168 'dice' => $line[3], | |
| 169 } | |
| 170 } | |
| 171 ); | |
| 172 } | |
| 173 } | |
| 174 close($tmpfh); | |
| 175 unlink($output_str); | |
| 176 | |
| 177 } | |
| 178 $progress->update(100); | |
| 179 } | |
| 180 | |
| 181 sub guess { | |
| 182 my ($self) = @_; | |
| 183 | |
| 184 open( my $groupings_fh, | |
| 185 '<', | |
| 186 File::Spec->catfile( $self->data_dir(), 'groupings.tsv' ) ); | |
| 187 | |
| 188 # Load groupings.tsv into memory | |
| 189 my %data; | |
| 190 while (<$groupings_fh>) { | |
| 191 if ( $_ !~ /^#/ ) { | |
| 192 chomp $_; | |
| 193 my ( $major, $minor, $hit ) = split( /\t/, $_ ); | |
| 194 unless ( $data{$major}{$minor} ) { | |
| 195 $data{$major}{$minor} = []; | |
| 196 } | |
| 197 push( $data{$major}{$minor}, $hit ); | |
| 198 } | |
| 199 } | |
| 200 | |
| 201 # Create a reverse lookup table | |
| 202 my %rdata; | |
| 203 foreach my $i ( keys %data ) { | |
| 204 foreach my $j ( keys %{ $data{$i} } ) { | |
| 205 foreach my $k ( @{ $data{$i}{$j} } ) { | |
| 206 if ( defined($k) ) { | |
| 207 $rdata{$k} = [ $i, $j ]; | |
| 208 } | |
| 209 else { | |
| 210 print "$i $j\n"; | |
| 211 } | |
| 212 } | |
| 213 } | |
| 214 } | |
| 215 | |
| 216 # Table printing stuff | |
| 217 my @header = ( | |
| 218 'Major Category', | |
| 219 'Major hits', | |
| 220 'Minor Category', | |
| 221 'Minor hits', | |
| 222 'Analysis', | |
| 223 'Evidence Type', | |
| 224 'Evidence' | |
| 225 ); | |
| 226 my %output; | |
| 227 | |
| 228 # Loop across the input keys | |
| 229 foreach my $input_key ( keys %hits ) { | |
| 230 my %guesses; | |
| 231 my %guess_evidence; | |
| 232 | |
| 233 # And across all of the hits that the query hit to | |
| 234 foreach my $against ( keys %{ $hits{$input_key} } ) { | |
| 235 | |
| 236 # We look at the evidence | |
| 237 my @evidence = @{ $hits{$input_key}{$against} }; | |
| 238 | |
| 239 my ( $type_major, $type_minor ); | |
| 240 if ( $rdata{$against} ) { | |
| 241 ( $type_major, $type_minor ) = | |
| 242 @{ $rdata{$against} }; | |
| 243 } | |
| 244 else { | |
| 245 ( $type_major, $type_minor ) = ( | |
| 246 substr( | |
| 247 $against, 0, | |
| 248 rindex( $against, '_' ) | |
| 249 ), | |
| 250 substr( | |
| 251 $against, | |
| 252 rindex( $against, '_' ) + 1 | |
| 253 ) | |
| 254 ); | |
| 255 } | |
| 256 | |
| 257 # Prepare hashes. | |
| 258 unless ( $guesses{$type_major} ) { | |
| 259 $guesses{$type_major} = (); | |
| 260 } | |
| 261 unless ( $guesses{$type_major}{$type_minor} ) { | |
| 262 $guesses{$type_major}{$type_minor} = (); | |
| 263 } | |
| 264 | |
| 265 # Loop across the evidence | |
| 266 foreach my $piece_of_evidence (@evidence) { | |
| 267 | |
| 268 # Here is an example piece of evidence | |
| 269 # 'GK_Gilmour_Gene43' => { | |
| 270 # 'SP18' => [ | |
| 271 # { | |
| 272 # 'data' => { | |
| 273 # 'evalue' => '2e-08', | |
| 274 # 'dice' => '23.8627' | |
| 275 # }, | |
| 276 # 'type' => 'psiblast' | |
| 277 # } | |
| 278 # ], | |
| 279 # | |
| 280 if ( $type_major !~ /subject i/ ) { | |
| 281 my %piece = %{$piece_of_evidence}; | |
| 282 if ( | |
| 283 $self->validate_evidence( | |
| 284 $piece_of_evidence) > 0 | |
| 285 ) | |
| 286 { | |
| 287 $guess_evidence{$type_major}++; | |
| 288 $guess_evidence{ | |
| 289 "$type_major$type_minor" | |
| 290 }++; | |
| 291 | |
| 292 # If it's not defined, set up sub arrays. | |
| 293 unless ( $guesses{$type_major} | |
| 294 {$type_minor} | |
| 295 { $piece{'type'} } ) | |
| 296 { | |
| 297 if ( $piece{'type'} | |
| 298 eq 'psiblast' ) | |
| 299 { | |
| 300 $guesses{ | |
| 301 $type_major | |
| 302 }{$type_minor} | |
| 303 { | |
| 304 $piece{ | |
| 305 'type' | |
| 306 } | |
| 307 } | |
| 308 = { | |
| 309 'evalue' | |
| 310 => [], | |
| 311 'dice' | |
| 312 => [] | |
| 313 }; | |
| 314 } | |
| 315 else { | |
| 316 $guesses{ | |
| 317 $type_major | |
| 318 }{$type_minor} | |
| 319 { | |
| 320 $piece{ | |
| 321 'type' | |
| 322 } | |
| 323 } | |
| 324 = { 'evalue' | |
| 325 => [] | |
| 326 }; | |
| 327 } | |
| 328 } | |
| 329 | |
| 330 # If it's zero, correct to zero. | |
| 331 if ( $piece{'data'}{'evalue'} | |
| 332 eq '0.0' ) | |
| 333 { | |
| 334 $piece{'data'}{'evalue'} | |
| 335 = '0.0'; | |
| 336 } | |
| 337 | |
| 338 # Add our evalue | |
| 339 push( | |
| 340 $guesses{$type_major} | |
| 341 {$type_minor} | |
| 342 { $piece{'type'} } | |
| 343 {'evalue'}, | |
| 344 $piece{'data'}{'evalue'} | |
| 345 ); | |
| 346 | |
| 347 # And if psiblast, add dice | |
| 348 if ( $piece{'type'} eq | |
| 349 'psiblast' ) | |
| 350 { | |
| 351 push( | |
| 352 $guesses{ | |
| 353 $type_major | |
| 354 }{$type_minor} | |
| 355 { | |
| 356 $piece{ | |
| 357 'type' | |
| 358 } | |
| 359 }{'dice'}, | |
| 360 $piece{'data'} | |
| 361 {'dice'} | |
| 362 ); | |
| 363 } | |
| 364 } | |
| 365 } | |
| 366 } | |
| 367 } | |
| 368 | |
| 369 my @output_sheet; | |
| 370 | |
| 371 foreach my $major ( keys %guesses ) { | |
| 372 if ( $guess_evidence{$major} ) { | |
| 373 foreach my $minor ( keys %{ $guesses{$major} } ) | |
| 374 { | |
| 375 if ( $guess_evidence{"$major$minor"} ) { | |
| 376 foreach my $evidence_category ( | |
| 377 keys %{ | |
| 378 $guesses{$major} | |
| 379 {$minor} | |
| 380 } | |
| 381 ) | |
| 382 { | |
| 383 if ( $evidence_category | |
| 384 ne 'evidence' ) | |
| 385 { | |
| 386 # things like evalue, dice | |
| 387 my %hits = %{ | |
| 388 $guesses{ | |
| 389 $major | |
| 390 }{ | |
| 391 $minor | |
| 392 }{ | |
| 393 $evidence_category | |
| 394 } | |
| 395 }; | |
| 396 foreach | |
| 397 my $subtype ( | |
| 398 keys | |
| 399 %hits ) | |
| 400 { # should be evalue, dice | |
| 401 push( | |
| 402 @output_sheet, | |
| 403 [ | |
| 404 $major, | |
| 405 $guess_evidence{ | |
| 406 $major | |
| 407 } | |
| 408 , | |
| 409 $minor, | |
| 410 $guess_evidence{ | |
| 411 "$major$minor" | |
| 412 } | |
| 413 , | |
| 414 $evidence_category, | |
| 415 $subtype, | |
| 416 join | |
| 417 ( | |
| 418 ',', | |
| 419 @{ | |
| 420 $hits{ | |
| 421 $subtype | |
| 422 } | |
| 423 } | |
| 424 ) | |
| 425 ] | |
| 426 ); | |
| 427 } | |
| 428 } | |
| 429 } | |
| 430 } | |
| 431 } | |
| 432 } | |
| 433 } | |
| 434 | |
| 435 if ( !scalar @output_sheet ) { | |
| 436 @output_sheet = ( ['No evidence above threshold'], ); | |
| 437 } | |
| 438 $output{$input_key} = { | |
| 439 header => \@header, | |
| 440 data => \@output_sheet, | |
| 441 }; | |
| 442 } | |
| 443 return \%output; | |
| 444 } | |
| 445 | |
| 446 sub validate_evidence { | |
| 447 my ( $self, $piece_of_evidence ) = @_; | |
| 448 my %piece = %{$piece_of_evidence}; | |
| 449 | |
| 450 #my ($self, $type, $subtype, $value) = @_; | |
| 451 # { | |
| 452 # 'data' => { | |
| 453 # 'evalue' => '2e-08', | |
| 454 # 'dice' => '23.8627' | |
| 455 # }, | |
| 456 # 'type' => 'psiblast' | |
| 457 # } | |
| 458 if ( $piece{type} eq 'hmmer' ) { | |
| 459 my $value = $piece{data}{evalue}; | |
| 460 if ( $value eq '0.0' || $value eq '0' ) { | |
| 461 return 1; | |
| 462 } | |
| 463 elsif ( !defined($value) || $value eq '' ) { | |
| 464 return 0; | |
| 465 } | |
| 466 else { | |
| 467 return ( log($value) < $self->hmmer_evalue_cutoff() ) | |
| 468 ; # -64 | |
| 469 } | |
| 470 } | |
| 471 elsif ( $piece{type} eq 'psiblast' ) { | |
| 472 my $evalue = $piece{data}{evalue}; | |
| 473 my $dice = $piece{data}{dice}; | |
| 474 | |
| 475 my $evalue_return = 0; | |
| 476 if ( $evalue eq '0.0' || $evalue eq '0' ) { | |
| 477 $evalue_return = 1; | |
| 478 } | |
| 479 else { | |
| 480 $evalue_return = | |
| 481 ( log($evalue) < $self->blast_evalue_cutoff() ); #-140 | |
| 482 } | |
| 483 | |
| 484 return $evalue_return | |
| 485 && ( $dice > $self->blast_dice_cutoff() ); # 30 | |
| 486 } | |
| 487 } | |
| 488 | |
| 489 1; | |
| 490 | |
| 491 __END__ | |
| 492 | |
| 493 =pod | |
| 494 | |
| 495 =encoding UTF-8 | |
| 496 | |
| 497 =head1 NAME | |
| 498 | |
| 499 CPT::Analysis::TerL - Guess phage packaging strategy based on homology to terminases (TerL) of phages with known packaging strategies | |
| 500 | |
| 501 =head1 VERSION | |
| 502 | |
| 503 version 1.96 | |
| 504 | |
| 505 =head1 AUTHOR | |
| 506 | |
| 507 Eric Rasche <rasche.eric@yandex.ru> | |
| 508 | |
| 509 =head1 COPYRIGHT AND LICENSE | |
| 510 | |
| 511 This software is Copyright (c) 2014 by Eric Rasche. | |
| 512 | |
| 513 This is free software, licensed under: | |
| 514 | |
| 515 The GNU General Public License, Version 3, June 2007 | |
| 516 | |
| 517 =cut |
