Mercurial repository: ucsb-phylogenetics / osiris_phylogenetics
diff orthologs/ucsb_hamster/hamstrsearch_local-hmmer3.pl @ 0:5b9a38ec4a39 draft default tip
First commit of old repositories
author:   osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu>
date:     Tue, 11 Mar 2014 12:19:13 -0700
parents:
children:
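The help text embedded in the script below documents the command line. For orientation, an invocation of the Galaxy-wrapped script might look like the following; every file, set, and taxon name here is a hypothetical placeholder, and DROME stands in for one of the five-letter reference-species abbreviations required by -refspec:

    perl hamstrsearch_local-hmmer3.pl -sequence_file=my_ests.fasta -est -hmmset=my_core_orthologs -taxon=My_species -refspec=DROME -representative -eval_limit=0.01 -galaxyout=hits_aa.tab -2galaxyout=hits_cds.tab

All of these options correspond to entries in the GetOptions() block of the script; -galaxyout and -2galaxyout name the tab-delimited amino-acid and CDS tables that Galaxy collects.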
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/orthologs/ucsb_hamster/hamstrsearch_local-hmmer3.pl Tue Mar 11 12:19:13 2014 -0700
@@ -0,0 +1,922 @@
+#!/usr/bin/perl -w
+use strict;
+
+use FindBin;
+use lib "$FindBin::Bin/lib";
+use Getopt::Long;
+use Bio::SearchIO;
+use Bio::Search::Hit::BlastHit;
+use run_genewise;
+
+# PROGRAMNAME: hamstrsearch_local.pl
+
+# Copyright (C) 2009 INGO EBERSBERGER, ingo.ebersberger@univie.ac.at
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published
+# by the Free Software Foundation; either version 3 of the License
+# or any later version.
+
+# This program is distributed in the hope that it will be useful
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program; If not, see http://www.gnu.org/licenses
+
+# PROGRAM DESCRIPTION: This is the relevant version!
+
+# DATE: Wed Dec 19 10:41:09 CEST 2007
+
+
+######################## start main #############################
+my $version = "\nhamstrsearch_local-v1.0.pl\nGalaxy implementation by Todd H. Oakley and Roger Ngo, UCSB\n";
+print "$version\n";
+### EDIT THE FOLLOWING LINES TO CUSTOMIZE YOUR SCRIPT
+## note: all the variables can also be set via the command line
+my $pid = $$;
+my $prog = 'hmmsearch'; #program for the hmm search
+my $alignmentprog = 'clustalw';
+
+
+#PATH SETTINGS
+my $hmmpath = '.';
+my $blastpath = '.'; #Uses Galaxy working directory
+my $tmpdir = 'tmp_' . $pid;
+my $eval = 1; # eval cutoff for the hmm search
+my $logfile = "hamstrsearch_" . $pid . '.log';
+my $hmm_dir = 'hmm_dir';
+my $fa_dir = '';
+##############################
+my $help;
+my $seq2store_file='';
+my $cds2store_file='';
+my $hmm;
+my @hmms;
+my $fa;
+my $fafile;
+my @seqs2store;
+my @cds2store;
+my $ep2eg;
+my $estfile;
+my $aln;
+my $idfile;
+my $taxon_check = 0;
+my $hmmset;
+my $hmmsearch_dir;
+my $dbfile = ''; # the file hmmsearch is run against
+my $dbfile_short;
+my $taxon_file;
+my $refspec_string;
+my @refspec = qw();
+my @primer_taxa;
+my $refspec_name = '';
+my $taxon_global;
+my $fileobj;
+my $fa_dir_neu = '';
+my $gwrefprot;
+my $seqtype;
+my $align;
+my $rep;
+my $estflag;
+my $proteinflag;
+my $refseq;
+my $refspec_final = '';
+my $concat;
+my $seqs2store_file;
+
+#####determine the hostname#######
+my $hostname = `hostname`;
+chomp $hostname;
+print "hostname is $hostname\n";
+
+#################################
+if (@ARGV==0) {
+  $help = 1;
+}
+## help message
+my $helpmessage = "
+This program is freely distributed under a GPL. See -version for more info
+Copyright (c) GRL limited: portions of the code are from separate copyrights
+
+\nUSAGE: hamstrsearch_local.pl -sequence_file=<> -hmmset=<> -taxon=<> -refspec=<> [-est|-protein] [-hmm=<>] [-representative] [-h]
+
+OPTIONS:
+
+-sequence_file:
+  path and name of the file containing the sequences hmmer is run against
+  Per default, this file should be in the data directory.
+-est:
+  set this flag if you are searching in ESTs
+-protein:
+  set this flag if you are searching in protein sequences
+-hmmset:
+  specifies the name of the core-ortholog set.
+  The program will look for the files in the default directory 'core-orthologs' unless you specify
+  a different one.
+-taxon:
+  You need to specify a default taxon name from which your ESTs or protein sequences are derived.
+-refspec:
+  sets the reference species. Note, it has to be a species that contributed sequences
+  to the hmms you are using. NO DEFAULT IS SET! For a list of possible reference
+  taxa you can have a look at the speclist.txt file in the default core-ortholog sets
+  that come with this distribution. Please use the 5 letter abreviations. If you choose
+  to use core-orthologs were not every taxon is represented in all core-orthologs, you
+  can provide a comma-separated list with the preferred refspec first. The lower-ranking
+  reference species will only be used if a certain gene is not present in the preferred
+  refspecies due to alternative paths in the transitive closure to define
+  the core-orthologs.
+  CURRENTLY NO CHECK IS IMPLEMENTED!
+  NOTE: A BLAST-DB FOR THE REFERENCE SPECIES IS REQUIRED!
+-eval_limit=<>
+  This options allows to set the e-value cut-off for the HMM search.
+  DEFAULT: 1
+-hmm:
+  option to provide only a single hmm to be used for the search.
+  Note, this file has to end with .hmm
+
+### the following options should only be used when you chose to alter the default structure of the
+### hamstrsearch_local directories. Currently, this has not been extensively tested.
+-fasta_file:
+  path and name of the file containing the core-ortholog sequences
+  you don't have to use this option when you
+-hmmpath:
+  sets the path to the hmm_dir. By default this is set to the current directory.
+-blastpath:
+  sets the path where the blast-dbs are located. Per default this is ../blast_dir
+  Note, the program expects the structure blastpath/refspec/refspec_prot.
+  To overrule this structure you will have to edit the script.
+  \n\n";
+GetOptions ("h" => \$help,
+            "hmm=s" => \$hmm,
+            "est" => \$estflag,
+            "protein"=> \$proteinflag,
+            "sequence_file=s" => \$dbfile,
+            "fasta_file=s" => \$fafile,
+            "hmmset=s" => \$hmmset,
+            "hmmpath=s" => \$hmmpath,
+            "taxon_file=s" => \$taxon_file,
+            "taxon=s" => \$taxon_global,
+            "eval_limit=s" => \$eval,
+            "refspec=s" => \$refspec_string,
+            "estfile=s" => \$estfile,
+            "representative" => \$rep,
+            "blastpath=s" => \$blastpath,
+            "galaxyout=s" => \$seqs2store_file,
+            "2galaxyout=s" => \$cds2store_file);
+
+if ($help) {
+  print $helpmessage;
+  exit;
+}
+
+## 1) check if all information is available to run HaMStR
+my ($check, @log) = &checkInput();
+if ($check == 0) {
+  print join "\n", @log;
+  print "$helpmessage";
+  exit;
+}
+else {
+  open (OUT, ">$logfile") or die "could not open logfile $logfile\n";
+  print OUT join "\n", @log;
+  close OUT;
+}
+### read in of the core-ortholog sequences
+my $co_seqs = parseSeqfile("$fafile");
+
+## 2) loop through the hmms
+## process each hmm file separately
+for (my $i = 0; $i < @hmms; $i++) {
+  $fileobj = undef;
+  my @seqs = qw();
+  my @newseqs = qw();## var to contain the sequences to be added to the orthologous cluster
+  my @newcds = qw();
+  my $hmm = $hmms[$i];
+  my $hmmout = $hmm;
+  $hmmout =~ s/\.hmm/\.out/;
+  ## 3) run the hmm search
+  if (!(-e "$hmmsearch_dir/$hmmout")) {
+    print "now running $prog using $hmm\n";
+    !`$prog $hmm_dir/$hmm $dbfile >$hmmsearch_dir/$hmmout` or die "Problem running hmmsearch\n";
+  }
+  else {
+    print "an hmmresult $hmmout already exists.
Using this one!\n"; + print "NOTE: in Galaxy the hmm results are stored in the directory of the dataset\n"; + } + + ## 4) process the hmm search result + my $hitcount = 0; + ## 4a) loop through the individual results + ## now the modified version for hmmer3 comes + my ($query_name, @results) = parseHmmer3($hmmout, $hmmsearch_dir); + if (! @results) { + print "no hit found for $query_name\n"; + next; + } + chomp $query_name; + print "Results for $query_name\n"; + for (my $k = 0; $k < @results; $k++) { + my $hitname = $results[$k]; + print "$hitname\n"; + my $keep = 0; + my $hitseq = ''; + $refseq = ''; + ## 4b) test for the reciprocity criterion fulfilled + ($keep, $hitname, $hitseq, $refspec_final, $refseq) = &check4reciprocity($query_name, $hitname, @refspec); + if ($keep == 1) { + ## blast search with the hmm hit identifies the core-ortholog sequence of the reference species + ## check for the taxon from the taxon_file.txt. + my $taxon = ''; + if ($taxon_check){ + if ($taxon_check == 1) { + $taxon = &getTaxon($hitname); + } + elsif ($taxon_check == 2) { + $taxon = $taxon_global; + } + } + ## put the info about the hits into an object for later post-processing + $fileobj->{$taxon}->{prot}->[$hitcount] = $hitseq; + $fileobj->{$taxon}->{ids}->[$hitcount] = $hitname; + $fileobj->{$taxon}->{refseq} = $refseq; + $hitcount++; + } + else { + print "match to different protein from $refspec_final\n"; + } + } + ## 5) do the rest only if at least one hit was obtained + if (defined $fileobj) { + ## 5a) if the hits are derived from ESTs, get the best ORF + if ($estflag) { + $fileobj = &predictORF(); + } + ## 5b) if the user has chosen to postprocess the results + if ($rep) { + &processHits($refseq, $concat); + } + ## 6) prepare the output + my @taxa = keys(%$fileobj); + for (my $i = 0; $i< @taxa; $i++) { + if ($rep) { +# push @newseqs, ">$query_name|$taxa[$i]|$fileobj->{$taxa[$i]}->{refid}"; +#Rearrange Order for Galaxy - want species first for phylotable format convert pipes to tabs + push @newseqs, ">$taxa[$i]\t$query_name\t$fileobj->{$taxa[$i]}->{refid}"; + push @newseqs, $fileobj->{$taxa[$i]}->{refprot}; + if ($estflag) { +# push @newcds, ">$query_name|$taxa[$i]|$fileobj->{$taxa[$i]}->{refid}"; +#Rearrange Order for Galaxy - want species first for phylotable format + push @newcds, ">$taxa[$i]\t$query_name\t$fileobj->{$taxa[$i]}->{refid}"; + push @newcds, $fileobj->{$taxa[$i]}->{refcds}; + } + } + else { + my $idobj = $fileobj->{$taxa[$i]}->{ids}; + my $protobj = $fileobj->{$taxa[$i]}->{prot}; + my $cdsobj = $fileobj->{$taxa[$i]}->{cds}; + for (my $j = 0; $j < @$idobj; $j++) { +# push @newseqs, ">$query_name|$taxa[$i]|$idobj->[$j]"; +#Rearrange Order for Galaxy - want species first for phylotable format also tabs not pipe + push @newseqs, ">$taxa[$i]\t$query_name\t$idobj->[$j]"; + push @newseqs, $protobj->[$j]; + if ($estflag) { +# push @newcds, ">$query_name|$taxa[$i]|$idobj->[$j]"; +#Rearrange Order for Galaxy - want species first for phylotable format + push @newcds, ">$taxa[$i]\t$query_name\t$idobj->[$j]"; + push @newcds, $cdsobj->[$j]; + } + } + } + my $refs = $co_seqs->{$query_name}; + for (keys %$refs) { + my $line = ">$query_name|$_|" . $refs->{$_}->{seqid} . "\n" . $refs->{$_}->{seq}; + push @seqs, $line; + } + chomp @seqs; + print "\n"; + @seqs = (@seqs, @newseqs); + open (OUT, ">$fa_dir_neu/$query_name.fa"); + print OUT join "\n", @seqs; + print OUT "\n"; + close OUT; + for (my $i = 0; $i < @newseqs; $i+= 2) { +# my $line = $newseqs[$i] . "|" . 
$newseqs[$i+1]; +#Galaxy uses tabs not pipes + my $line = $newseqs[$i] . "\t" . $newseqs[$i+1]; + $line =~ s/>//; + push @seqs2store, $line; + if ($estflag) { +#Galaxy uses tabs not pipes +# my $cdsline = $newcds[$i] . "|" . $newcds[$i+1]; + my $cdsline = $newcds[$i] . "\t" . $newcds[$i+1]; + $cdsline =~ s/>//; + push @cds2store, $cdsline; + } + } + } + } +} +if (@seqs2store > 0) { + open (OUT, ">$seqs2store_file") or die "failed to open output SEQS file\n"; + print OUT join "\n", @seqs2store; + print OUT "\n"; + close OUT; + if ($estflag) { + open (OUT, ">$cds2store_file") or die "failed to open output CDS file\n"; + print OUT join "\n", @cds2store; + print OUT "\n"; + close OUT; + } +} +else { + open (OUT, ">$seqs2store_file") or die "failed to open output SEQS file\n"; + print OUT "no hits found\n"; +} +exit; +##################### start sub ############### +####### checkInput performs a number of checks whether sufficient information +### and all data are available to run HaMStR +sub checkInput { + my @log; + my $check = 1; + $dbfile_short = $dbfile; + $dbfile_short =~ s/\..*//; + ## 1) check for filetype + print "Checking for filetype:\t"; + if (!defined $estflag and !defined $proteinflag) { + push @log, "please determine the sequence type. Choose between the options -EST or -protein"; + print "failed\n"; + $check = 0; + } + else { + if ($estflag) { + $estfile = $dbfile; + $dbfile = "$dbfile.tc"; + push @log, "HaMStR will run on the ESTs in $estfile"; + push @log, "Translating ESTs"; + if (!(-e "$dbfile")) { + print "translating $estfile, this may take a while\n"; + `translate.pl -in=$estfile -out=$dbfile`; + open (LOG, "$logfile") or die "could not open logfile $logfile\n"; + my @info = <LOG>; + @log = (@log, @info); + close LOG; + } + else { + push @log, "Translated file already exists, using this one\n"; + } + if (! -e "$dbfile") { + push @log, "The translation of $estfile failed. Check the script translate.pl"; + print "failed\n"; + $check = 0; + } + } + else { + ## file type is protein + print "succeeded\n"; + } + } + ## 2) Check for presence of blastall + print "Checking for the blast program\t"; + if (!(`blastall`)) { + push @log, "could not execute blastall. Please check if this program is installed and executable"; + print "failed\n"; + $check = 0; + } + else { + push @log, "check for blastall succeeded"; + print "succeeded\n"; + } + ## 3) Check for presence of hmmsearch + print "Checking for hmmsearch\t"; + if (! `$prog -h`) { + push @log, "could not execute $prog. Please check if this program is installed and executable"; + print "failed\n"; + $check = 0; + } + else { + push @log, "check for $prog succeeded\n"; + print "succeeded\n"; + } + ## 4) Check for reference taxon + print "Checking for reference species and blast-dbs\t"; + if (!(defined $refspec_string)) { + push @log, "Please provide a reference species for the reblast!"; + print "failed\n"; + $check = 0; + } + else { + push @log, "Reference species for the re-blast: $refspec_string"; + @refspec = split /,/, $refspec_string; + $refspec_name = $refspec[0]; + print "succeeded\n"; + } + ## 5) Check for presence of the required blast dbs + print "checking for blast-dbs:\t"; + for (my $i = 0; $i < @refspec; $i++) { + my $blastpathtmp = "$blastpath/$refspec[$i]/$refspec[$i]" . "_prot"; + if (! (-e "$blastpathtmp.pin")) { + push @log, "please edit the blastpath. 
Could not find $blastpathtmp"; + print "$blastpathtmp failed\n"; + $check = 0; + } + else { + push @log, "check for $blastpathtmp succeeded"; + print "succeeded\n"; + } + } + ## 6) Check for presence of the directory structure + print "checking for presence of the hmm files:\t"; + if (!(defined $hmmset)) { + $hmmpath = '.'; + $hmmset = 'manual'; + } + else { + $hmmpath = "$hmmpath/$hmmset"; + $fafile = "$hmmpath/$hmmset" . '.fa'; + } + $hmm_dir = "$hmmpath/$hmm_dir"; + $hmmsearch_dir = $dbfile_short . '_' . $hmmset; + +#CHANGED FOR GALAXY DIRECTORY + # $fa_dir_neu = 'fa_dir_' . $dbfile_short . '_' . $hmmset . '_' . $refspec_name; + $fa_dir_neu = $dbfile_short . '_' . $hmmset . '_' . $refspec_name; + ## 7) check for the presence of the hmm-files and the fasta-file + if (!(-e "$hmm_dir")) { + push @log, "Could not find $hmm_dir"; + print "failed\n"; + $check = 0; + } + else { + if (defined $hmm) { + if (! -e "$hmm_dir/$hmm") { + push @log, "$hmm has been defined but could not be found in $hmm_dir/$hmm"; + $check = 0; + } + else { + push @log, "$hmm has been found"; + if ($hmm =~ /\.hmm$/) { + @hmms = ($hmm); + } + } + } + else { + push @log, "running HaMStR with all hmms in $hmm_dir"; + @hmms = `ls $hmm_dir`; + } + chomp @hmms; + print "succeeded\n"; + } + + ## 8) Test for presence of the fasta file containing the sequences of the core-ortholog cluster + print "checking for presence of the core-ortholog file:\t"; + if (defined $fafile) { + if (! -e "$fafile") { + push @log, "Could not find the file $fafile"; + print "failed\n"; + $check = 0; + } + else { + push @log, "check for $fafile succeeded"; + print "succeeded\n"; + } + } + else { + push @log, "Please provide path and name of fasta file containing the core-ortholog sequences"; + $check = 0; + print "failed\n"; + } + ## 9) Checks for the taxon_file + print "testing whether the taxon has been determined:\t"; + if (!(defined $taxon_file) or (!(-e "$taxon_file"))) { + if (defined $taxon_global) { + push @log, "using default taxon $taxon_global for all sequences"; + print "succeeded\n"; + $taxon_check = 2; + } + else { + push @log, "No taxon_file found. Please provide a global taxon name using the option -taxon"; + print "failed\n"; + $check = 0; + } + } + else { + push @log, "using the file $taxon_file as taxon_file"; + print "succeeded\n"; + $taxon_check = 1; + } + ## 10) Set the file where the matched seqs are found +#CHANGED BY THO FOR GALAXY TO ALLOW DETERMINATION OF OUTPUT FILE Made INPUT Option +# $seqs2store_file = 'hamstrsearch_' . $dbfile_short . '_' . $hmmset . '.out'; +# $cds2store_file = 'hamstrsearch_' . $dbfile_short . '_' . $hmmset . '_cds.out'; + + ## 11) apply the evalue-cut-off to the hmmsearch program + $prog = $prog . " -E $eval"; + push @log, "hmmsearch: $prog"; + + ## 12) setting up the directories where the output files will be put into. + if ($check == 1) { + if (!(-e "$hmmsearch_dir")) { + `mkdir $hmmsearch_dir`; + } + if (!(-e "$fa_dir_neu")) { + `mkdir $fa_dir_neu`; + } + if (!(-e "$tmpdir")) { + `mkdir $tmpdir`; + } + } + return ($check, @log); +} +################# +## check4reciprocity is the second major part of the program. It checks +## whether the protein sequence that has been identified by the hmmsearch +## identifies in turn the protein from the reference taxon that was used to +## build the hmm. 
+sub check4reciprocity { + my ($query_name, $hitname, @refspec) = @_; + my $searchdb; + ## get the sequence from the db_file + my $hitseq = `grep -m 1 -A 1 ">$hitname" $dbfile | tail -n 1`; + if (!defined $hitseq) { + print "could not retrieve a sequence for $hitname. Skipping...\n"; + return(0, '', '', ''); + } + else { + ## now get the sequence used for building the hmm + my @original; + my $refspec_final; + for (my $i = 0; $i < @refspec; $i++) { + @original = `grep "^>$query_name|$refspec[$i]" $fafile |sed -e "s/.*$refspec[$i]\|//"`; + chomp @original; + + if (@original > 0) { + $refspec_final = $refspec[$i]; + $searchdb = "$blastpath/$refspec_final/$refspec_final" . "_prot"; + last; + } + else { + print "original sequence not be found with grepping for ^>$query_name|$refspec[$i]. Proceeding with next refspec\n"; + } + } + if (@original == 0) { + print "original sequence not be found\n"; + return (0, '', '', $refspec_final); + } + print "REFSPEC is $refspec_final\n"; + ## continue with the blast + chomp $hitseq; +# $hitname =~ s/\|.*//; + ## now run the blast + open (OUT, ">$tmpdir/$$.fa") or die "could not open out for writing\n"; + print OUT ">$hitname\n$hitseq"; + close OUT; + !`blastall -p blastp -d $searchdb -v 10 -b 10 -i $tmpdir/$$.fa -o $tmpdir/$$.blast` or die "Problem running blast\n"; + ## now parse the best blast hit + my @hits = &getBestBlasthit("$tmpdir/$$.blast"); + if (@hits > 0) { + + print "hmm-seq: ", join "\t", @original , "\n"; + ## now loop through the best hits with the same evalue and check whether + ## among these I find the same seq as in $original + for (my $i = 0; $i <@hits; $i++) { + print "blast-hit: $hits[$i]"; + ## now loop through all the refspec-sequences in the hmm file + for (my $j = 0; $j < @original; $j++) { + if ($original[$j] eq $hits[$i]) { + print "\tHit\n"; + my ($refseq) = `grep -A 1 "$query_name|$refspec_final|$original[$j]" $fafile |tail -n 1`; + return (1, $hitname, $hitseq, $refspec_final, $refseq); + } + else { + print "\nnot hitting $original[$j]\n"; + } + } + } + ### if we end up here, we didn't find a hit that matches to the original sequence + ### in the top hits with the same eval + return (0, '', '', $refspec_final); + } + else { + print "no hit obtained\n"; + return(0, '', '', $refspec_final); + } + } +} +############# +sub getBestBlasthit { + my @hits; + my ($file) = @_; + my $searchio = Bio::SearchIO->new(-file => $file, + -format => 'blast', + -report_type => 'blastp') or die "parse failed"; + while( my $result = $searchio->next_result ){ + my $count = 0; + my $sig; + my $sig_old; + while( my $hit = $result->next_hit){ + ## now I enter all top hits having the same evalue into the result + $sig = $hit->score; + if (!defined $sig_old) { + $sig_old = $sig; + } + if ($sig == $sig_old) { + push @hits, $hit->accession; + } + else { + last; + } + } + } + return(@hits); +} +################## +sub getTaxon { + my ($hitname) = @_; +# my $q = "select name from taxon t, est_project e, est_info i, annotation_neu a where a.id = $hitname and a.contig_id = i.contig_id and i.project_id = e.project_id and e.taxon_id = t.taxon_id"; + if ($hitname =~ /\D/) { + $hitname =~ s/_.*//; + } + my $taxon = `grep -m 1 "^$hitname," $taxon_file | sed -e 's/^.*,//'`; + chomp $taxon; + $taxon =~ s/^[0-9]+,//; + $taxon =~ s/\s*$//; + $taxon =~ s/\s/_/g; + if ($taxon) { + return ($taxon); + } + else { + return(); + } +} +############### +sub processHits { + my ($concat) = @_; + ## 1) align all hit sequences for a taxon against the reference species + my @taxa = 
keys(%$fileobj); + for (my $i = 0; $i < @taxa; $i++) { + &orfRanking($taxa[$i]); + } +} + +################ +sub predictORF { + my $fileobj_new; +# my ($refseq) = @_; + my @taxa = keys(%$fileobj); + for (my $i = 0; $i < @taxa; $i++) { + my $protobj = $fileobj->{$taxa[$i]}->{prot}; + my $idobj = $fileobj->{$taxa[$i]}->{ids}; + my $refseq = $fileobj->{$taxa[$i]}->{refseq}; + my @ids = @$idobj; + for (my $j = 0; $j < @ids; $j++) { + ## determine the reading frame + my ($rf) = $ids[$j] =~ /.*_RF([0-9]+)/; + print "rf is $rf\n"; + $ids[$j] =~ s/_RF.*//; +# my $est = `grep -A 1 "$ids[$j]" $estfile |tail -n 1`; +################new grep command from version 8 to fix bug + my $est = `grep -A 1 ">$ids[$j]\\b" $estfile |tail -n 1`; + if (! $est) { + die "error in retrieval of est sequence for $ids[$j] in subroutine processHits\n"; + } + ## the EST is translated in rev complement + if ($rf > 3) { + $est = revComp($est); + } + + my $gw = run_genewise->new($est, $refseq, "$tmpdir"); + my $translation = $gw->translation; + my $cds = $gw->codons; + $translation =~ s/[-!]//g; + $fileobj_new->{$taxa[$i]}->{ids}->[$j] = $ids[$j]; + $fileobj_new->{$taxa[$i]}->{prot}->[$j] = $translation; + $fileobj_new->{$taxa[$i]}->{cds}->[$j] = $cds; + $fileobj_new->{$taxa[$i]}->{refseq} = $refseq; + } + } + return($fileobj_new); +} +############################ +sub orfRanking { + my ($spec) = @_; + my $result; + my $refprot; + my $refcds; + my @toalign; + my $protobj = $fileobj->{$spec}->{prot}; + my $idobj = $fileobj->{$spec}->{ids}; + my $refcluster; ## variables to take the cluster and its id for later analysis + my $refid; + if (@$protobj == 1) { + ## nothing to chose from + $refprot = $protobj->[0]; + $refcds = $fileobj->{$spec}->{cds}->[0]; + my $length = length($refprot); + $refid = $idobj->[0] . "-" . 
$length; + } + else { + ## more than one cluster + push @toalign, ">$refspec_final"; + push @toalign, $fileobj->{$spec}->{refseq}; + ## now walk through all the contigs + for (my $i = 0; $i < @$protobj; $i++) { + my @testseq = (">$idobj->[$i]", $protobj->[$i]); + @testseq = (@testseq, @toalign); + open (OUT, ">$tmpdir/$pid.ref.fa") or die "could not open file for writing refseqs\n"; + print OUT join "\n", @testseq; + close OUT; + ## run clustalw + !(`$alignmentprog $tmpdir/$pid.ref.fa -output=fasta -outfile=$tmpdir/$pid.ref.aln 2>&1 >$tmpdir/$pid.ref.log`) or die "error running clustalw\n"; + ## get the alignment score + $result->[$i]->{score} = `grep "Alignment Score" $tmpdir/$pid.ref.log |sed -e 's/[^0-9]//g'`; + if (!$result->[$i]->{score}) { + die "error in determining alignment score\n"; + } + chomp $result->[$i]->{score}; + ## get the aligned sequence + open (ALN, "$tmpdir/$pid.ref.aln") or die "failed to open alignment file\n"; + my @aln = <ALN>; + close ALN; + my $aseq = extractSeq($idobj->[$i], @aln); + ## remove the terminal gaps + $aseq =~ s/-*$//; + $result->[$i]->{aend} = length $aseq; + my ($head) = $aseq =~ /^(-*).*/; + ($result->[$i]->{astart}) = length($head)+1; + } + ### the results for all seqs has been gathered, now order them + $result = sortRef($result); + ($refprot, $refcds, $refid) = &determineRef($result,$spec); + } + $fileobj->{$spec}->{refprot} = $refprot; + $fileobj->{$spec}->{refcds} = $refcds; + $fileobj->{$spec}->{refid} = $refid; + return(); +} +########################### +sub sortRef { + my $result = shift; + my @sort; + for (my $i = 0; $i < @$result; $i++) { + push @sort, "$i,$result->[$i]->{astart},$result->[$i]->{aend},$result->[$i]->{score}"; + } + open (OUT, ">$tmpdir/$pid.sort") or die "failed to write for sorting\n"; + print OUT join "\n", @sort; + close OUT; + `sort -n -t ',' -k 2 $tmpdir/$pid.sort >$tmpdir/$pid.sort.out`; + @sort = `less $tmpdir/$pid.sort`; + chomp @sort; + $result = undef; + for (my $i = 0; $i < @sort; $i++) { + ($result->[$i]->{id}, $result->[$i]->{start}, $result->[$i]->{end}, $result->[$i]->{score}) = split ',', $sort[$i]; + } + return($result); +} +######################## +sub determineRef { + my ($result, $spec) = @_; + my $lastend = 0; + my $lastscore = 0; + my $final; + my $count = 0; + my $id = ''; + for (my $i = 0; $i < @$result; $i++) { + if ($result->[$i]->{start} < $lastend or $lastend == 0) { + if ($result->[$i]->{score} > $lastscore) { + $lastend = $result->[$i]->{end}; + $lastscore = $result->[$i]->{score}; + $id = $result->[$i]->{id}; + } + } + elsif ($result->[$i]->{start} > $lastend) { + ## a new part of the alignment is covered. Fix the results obtained so far + $final->[$count]->{id} = $id; + $lastend = $result->[$i]->{end}; + $id = $result->[$i]->{id}; + $count++; + } + } + $final->[$count]->{id} = $id; + ## now concatenate the results + my $refprot = ''; + my $refid = ''; + my $refcds = ''; + for (my $i = 0; $i < @$final; $i++) { + my $seq = $fileobj->{$spec}->{prot}->[$final->[$i]->{id}]; + my $cdsseq = $fileobj->{$spec}->{cds}->[$final->[$i]->{id}]; + my $length = length($seq); + $refid .= "$fileobj->{$spec}->{ids}->[$final->[$i]->{id}]-$length" . 
"PP"; + $refprot .= $seq; + if ($estflag) { + $refcds .= $cdsseq; + } + } + $refid =~ s/PP$//; + return($refprot, $refcds, $refid); +} +############################# +sub extractSeq { + my ($id, @aln) = @_; + my $seq = ''; + my $start = 0; + for (my $i = 0; $i < @aln; $i++) { + if ($aln[$i] =~ $id) { + $start = 1; + } + elsif ($aln[$i] =~ />/ and $start == 1) { + last; + } + elsif ($start == 1) { + $seq .= $aln[$i]; + } + } + $seq =~ s/\s//g; + return ($seq); +} +############################## +sub revComp { + my ($seq) = @_; + $seq =~ tr/AGCTYRKMWSagct/TCGARYMKWSTCGA/; + $seq = reverse($seq); + return($seq); +} +############################## +sub parseHmmer3 { + my ($file, $path) = @_; + if (!defined $path) { + $path = '.'; + } + open (IN, "$path/$file") or die "failed to open $file\n"; + my @data = <IN>; + close IN; + ### extract the hits + my @hit; + my $start = 0; + my $stop = 0; + my $i = 0; + for (my $i = 0; $i < @data; $i++) { + if (!($data[$i] =~ /\S/)) { + next; + } + else { + if ($data[$i] =~ /Scores for complete sequence/) { + $start = 1; + $i += 4; + } + elsif (($data[$i] =~ /inclusion threshold/) or ($data[$i] =~ /Domain and alignment/)) { + last; + } + if ($start == 1 and $stop == 0) { + $data[$i] =~ s/^\s+//; + my @list = split /\s+/, $data[$i]; + push @hit, $list[8]; + $start = 0; #Added by THO + } + } + } + ### get the query_id + my ($query) = grep /^Query:/, @data; + $query =~ s/^Query:\s+//; + $query =~ s/\s.*//; + if (defined $hit[0]) { + chomp @hit; + return ($query, @hit); + } + else { + return ($query); + } +} +##################### +sub parseSeqfile { + my $seqref; + my $id; + my $spec; + my $seqid; + my $seq; + my $file = shift; + open (IN, "$file") or die "failed to open $file\n"; + my @seqs = <IN>; + close IN; + chomp @seqs; + for (my $i = 0; $i < @seqs; $i++) { + if ($seqs[$i] =~ />/) { + $seqs[$i] =~ s/>//; + if (defined $id and defined $seq) { + $seqref->{$id}->{$spec}->{seqid} = $seqid; + $seqref->{$id}->{$spec}->{seq} = $seq; + $seq = undef; + } + ($id, $spec, $seqid) = split (/\|/, $seqs[$i]); + } + else { + $seq .= $seqs[$i]; + } + } + if (defined $id and defined $seq) { + $seqref->{$id}->{$spec}->{seqid} = $seqid; + $seqref->{$id}->{$spec}->{seq} = $seq; + $seq = undef; + } + return ($seqref); +}