Mercurial > repos > cpt > cpt_psm_prep
diff lib/CPT/Analysis/TerL.pm @ 1:d724f34e671d draft default tip
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author | cpt |
---|---|
date | Mon, 05 Jun 2023 02:50:07 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lib/CPT/Analysis/TerL.pm Mon Jun 05 02:50:07 2023 +0000 @@ -0,0 +1,517 @@ +package CPT::Analysis::TerL; + +# ABSTRACT: Guess phage packaging strategy based on homology to terminases (TerL) of phages with known packaging strategies + +use strict; +use warnings; +use Data::Dumper; +use autodie; +use Moose; +use Bio::SearchIO; +use File::ShareDir; +use File::Spec; + +has 'hmmer_evalue_cutoff' => ( is => 'rw', isa => 'Int' ); +has 'blast_evalue_cutoff' => ( is => 'rw', isa => 'Int' ); +has 'blast_dice_cutoff' => ( is => 'rw', isa => 'Int' ); + +has 'search_hmmer' => ( is => 'rw', isa => 'Bool' ); +has 'search_blast' => ( is => 'rw', isa => 'Bool' ); + +has 'data_dir' => ( is => 'rw', isa => 'Any' ); + +has 'awk_string' => ( + is => 'ro', + isa => 'Str', + default => +'BEGIN{print "#row,query id,subject id,evalue,dice" } {qid=$1;sid=$2;percent_identical=$3;query_length=$4;subject_length=$5;evalue=$6;dice=(2*percent_identical*subject_length/ ( subject_length + query_length ) );printf("%d,%s,%s,%s,%s\n",FNR, qid, sid, dice, evalue);}' +); + +has 'input_file' => ( is => 'rw', isa => 'Str' ); + +my @column_max = ( 20, 10, 20, 10, 20, 60 ); +my %hits; + +sub run { + my ( $self, %data ) = @_; + + #$self->prepare(%data); + my $dir = '/galaxy/tool-data/' # File::ShareDir::dist_dir('CPT-Analysis-TerL'); + $self->data_dir($dir); + $self->input_file( $data{input_file} ); + + if ( $self->search_hmmer() ) { + $self->hmmer(); + } + if ( $self->search_blast() ) { + $self->blast($self); + } + + return $self->guess(); +} + +sub hmmer { + my ($self) = @_; + + use Term::ProgressBar 2.00; + my $progress = Term::ProgressBar->new( + { + name => 'HMMER', + count => 100, + ETA => 'linear', + } + ); + $progress->max_update_rate(1); + + my $hmmer_db_dir = + File::Spec->catdir( $self->data_dir(), 'db', 'hmmer' ); + my @hmmer_dbs = glob( File::Spec->catfile( $hmmer_db_dir, "*.hmm" ) ); + + my $i = 0; + foreach (@hmmer_dbs) { + $progress->update( 100 * ( $i++ ) / scalar @hmmer_dbs ); + my $db_name = substr( $_, rindex( $_, '/' ) + 1, -4 ); + my $output_filename = sprintf( + '%s.%s.out', + substr( + $self->input_file(), 0, + rindex( $self->input_file(), '.' ) + ), + $db_name + ); + my $query = sprintf( 'hmmsearch %s %s > %s', + $_, $self->input_file(), $output_filename ); + system($query); + + my $in = Bio::SearchIO->new( + -format => 'hmmer', + -file => $output_filename + ); + while ( my $result = $in->next_result ) { + while ( my $hit = $result->next_hit ) { + while ( my $hsp = $hit->next_hsp ) { + my ( $from, $to ) = + ( $result->query_name, $hit->name ); + unless ( $hits{$to}{$from} ) { + $hits{$to}{$from} = []; + } + push( + $hits{$to}{$from}, + { + 'type' => 'hmmer', + 'data' => { + 'evalue' => ( + $hsp + ->evalue + eq '0' + ? '0.0' + : $hsp + ->evalue + ), + } + } + ); + } + } + } + unlink($output_filename); + } + $progress->update(100); +} + +sub blast { + my ($self) = @_; + + use Term::ProgressBar 2.00; + my $progress = Term::ProgressBar->new( + { + name => 'Blast', + count => 100, + ETA => 'linear', + } + ); + $progress->max_update_rate(1); + + my $blast_db_dir = + File::Spec->catdir( $self->data_dir(), 'db', 'blast' ); + my @blast_dbs = + map { substr( $_, 0, -4 ) } + glob( File::Spec->catfile( $blast_db_dir, "*.phr" ) ); + + my $i = 0; + + #my %hits; + + foreach my $blast_db (@blast_dbs) { + $progress->update( 100 * ( $i++ ) / scalar(@blast_dbs) ); + my $output_str = + substr( $blast_db, rindex( $blast_db, '/' ) + 1 ) . '.csv'; + my $query = sprintf( +'psiblast -query %s -db %s -evalue %s -outfmt "6 qseqid sseqid pident qlen slen evalue" | awk \'%s\' > %s', + $self->input_file(), $blast_db, '1e-5', + $self->awk_string(), $output_str ); + system($query); + open( my $tmpfh, '<', $output_str ); + while (<$tmpfh>) { + chomp $_; + if ( $_ !~ /^#/ ) { + my @line = split( /,/, $_ ); + unless ( $hits{ $line[1] }{ $line[2] } ) { + $hits{ $line[1] }{ $line[2] } = []; + } + push( + $hits{ $line[1] }{ $line[2] }, + { + 'type' => 'psiblast', + 'data' => { + 'evalue' => $line[4], + 'dice' => $line[3], + } + } + ); + } + } + close($tmpfh); + unlink($output_str); + + } + $progress->update(100); +} + +sub guess { + my ($self) = @_; + + open( my $groupings_fh, + '<', + File::Spec->catfile( $self->data_dir(), 'groupings.tsv' ) ); + + # Load groupings.tsv into memory + my %data; + while (<$groupings_fh>) { + if ( $_ !~ /^#/ ) { + chomp $_; + my ( $major, $minor, $hit ) = split( /\t/, $_ ); + unless ( $data{$major}{$minor} ) { + $data{$major}{$minor} = []; + } + push( $data{$major}{$minor}, $hit ); + } + } + + # Create a reverse lookup table + my %rdata; + foreach my $i ( keys %data ) { + foreach my $j ( keys %{ $data{$i} } ) { + foreach my $k ( @{ $data{$i}{$j} } ) { + if ( defined($k) ) { + $rdata{$k} = [ $i, $j ]; + } + else { + print "$i $j\n"; + } + } + } + } + + # Table printing stuff + my @header = ( + 'Major Category', + 'Major hits', + 'Minor Category', + 'Minor hits', + 'Analysis', + 'Evidence Type', + 'Evidence' + ); + my %output; + + # Loop across the input keys + foreach my $input_key ( keys %hits ) { + my %guesses; + my %guess_evidence; + + # And across all of the hits that the query hit to + foreach my $against ( keys %{ $hits{$input_key} } ) { + + # We look at the evidence + my @evidence = @{ $hits{$input_key}{$against} }; + + my ( $type_major, $type_minor ); + if ( $rdata{$against} ) { + ( $type_major, $type_minor ) = + @{ $rdata{$against} }; + } + else { + ( $type_major, $type_minor ) = ( + substr( + $against, 0, + rindex( $against, '_' ) + ), + substr( + $against, + rindex( $against, '_' ) + 1 + ) + ); + } + + # Prepare hashes. + unless ( $guesses{$type_major} ) { + $guesses{$type_major} = (); + } + unless ( $guesses{$type_major}{$type_minor} ) { + $guesses{$type_major}{$type_minor} = (); + } + + # Loop across the evidence + foreach my $piece_of_evidence (@evidence) { + + # Here is an example piece of evidence + # 'GK_Gilmour_Gene43' => { + # 'SP18' => [ + # { + # 'data' => { + # 'evalue' => '2e-08', + # 'dice' => '23.8627' + # }, + # 'type' => 'psiblast' + # } + # ], + # + if ( $type_major !~ /subject i/ ) { + my %piece = %{$piece_of_evidence}; + if ( + $self->validate_evidence( + $piece_of_evidence) > 0 + ) + { + $guess_evidence{$type_major}++; + $guess_evidence{ + "$type_major$type_minor" + }++; + + # If it's not defined, set up sub arrays. + unless ( $guesses{$type_major} + {$type_minor} + { $piece{'type'} } ) + { + if ( $piece{'type'} + eq 'psiblast' ) + { + $guesses{ + $type_major + }{$type_minor} + { + $piece{ +'type' + } + } + = { + 'evalue' + => [], + 'dice' + => [] + }; + } + else { + $guesses{ + $type_major + }{$type_minor} + { + $piece{ +'type' + } + } + = { 'evalue' + => [] + }; + } + } + + # If it's zero, correct to zero. + if ( $piece{'data'}{'evalue'} + eq '0.0' ) + { + $piece{'data'}{'evalue'} + = '0.0'; + } + + # Add our evalue + push( + $guesses{$type_major} + {$type_minor} + { $piece{'type'} } + {'evalue'}, + $piece{'data'}{'evalue'} + ); + + # And if psiblast, add dice + if ( $piece{'type'} eq + 'psiblast' ) + { + push( + $guesses{ + $type_major + }{$type_minor} + { + $piece{ +'type' + } + }{'dice'}, + $piece{'data'} + {'dice'} + ); + } + } + } + } + } + + my @output_sheet; + + foreach my $major ( keys %guesses ) { + if ( $guess_evidence{$major} ) { + foreach my $minor ( keys %{ $guesses{$major} } ) + { + if ( $guess_evidence{"$major$minor"} ) { + foreach my $evidence_category ( + keys %{ + $guesses{$major} + {$minor} + } + ) + { + if ( $evidence_category + ne 'evidence' ) + { + # things like evalue, dice + my %hits = %{ + $guesses{ + $major + }{ + $minor + }{ + $evidence_category + } + }; + foreach + my $subtype ( + keys + %hits ) + { # should be evalue, dice + push( + @output_sheet, + [ + $major, + $guess_evidence{ + $major + } + , + $minor, + $guess_evidence{ +"$major$minor" + } + , + $evidence_category, + $subtype, + join + ( +',', + @{ + $hits{ + $subtype + } + } + ) + ] + ); + } + } + } + } + } + } + } + + if ( !scalar @output_sheet ) { + @output_sheet = ( ['No evidence above threshold'], ); + } + $output{$input_key} = { + header => \@header, + data => \@output_sheet, + }; + } + return \%output; +} + +sub validate_evidence { + my ( $self, $piece_of_evidence ) = @_; + my %piece = %{$piece_of_evidence}; + + #my ($self, $type, $subtype, $value) = @_; + # { + # 'data' => { + # 'evalue' => '2e-08', + # 'dice' => '23.8627' + # }, + # 'type' => 'psiblast' + # } + if ( $piece{type} eq 'hmmer' ) { + my $value = $piece{data}{evalue}; + if ( $value eq '0.0' || $value eq '0' ) { + return 1; + } + elsif ( !defined($value) || $value eq '' ) { + return 0; + } + else { + return ( log($value) < $self->hmmer_evalue_cutoff() ) + ; # -64 + } + } + elsif ( $piece{type} eq 'psiblast' ) { + my $evalue = $piece{data}{evalue}; + my $dice = $piece{data}{dice}; + + my $evalue_return = 0; + if ( $evalue eq '0.0' || $evalue eq '0' ) { + $evalue_return = 1; + } + else { + $evalue_return = + ( log($evalue) < $self->blast_evalue_cutoff() ); #-140 + } + + return $evalue_return + && ( $dice > $self->blast_dice_cutoff() ); # 30 + } +} + +1; + +__END__ + +=pod + +=encoding UTF-8 + +=head1 NAME + +CPT::Analysis::TerL - Guess phage packaging strategy based on homology to terminases (TerL) of phages with known packaging strategies + +=head1 VERSION + +version 1.96 + +=head1 AUTHOR + +Eric Rasche <rasche.eric@yandex.ru> + +=head1 COPYRIGHT AND LICENSE + +This software is Copyright (c) 2014 by Eric Rasche. + +This is free software, licensed under: + + The GNU General Public License, Version 3, June 2007 + +=cut