Mercurial > repos > clustalomega > clustalomega
diff clustalomega/clustal-omega-0.2.0/src/hhalign/hhhitlist-C.h @ 0:ff1768533a07
Migrated tool version 0.2 from old tool shed archive to new tool shed repository
author | clustalomega |
---|---|
date | Tue, 07 Jun 2011 17:04:25 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/clustalomega/clustal-omega-0.2.0/src/hhalign/hhhitlist-C.h Tue Jun 07 17:04:25 2011 -0400 @@ -0,0 +1,3198 @@ +/* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */ + +/********************************************************************* + * Clustal Omega - Multiple sequence alignment + * + * Copyright (C) 2010 University College Dublin + * + * Clustal-Omega is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This file is part of Clustal-Omega. + * + ********************************************************************/ + +/* + * RCS $Id: hhhitlist-C.h 199 2011-02-21 18:24:49Z fabian $ + */ + +// hhhitlist.C + +#ifndef MAIN +#define MAIN +#include <iostream> // cin, cout, cerr +#include <fstream> // ofstream, ifstream +#include <stdio.h> // printf +#include <stdlib.h> // exit +#include <string> // strcmp, strstr +#include <math.h> // sqrt, pow +#include <limits.h> // INT_MIN +#include <float.h> // FLT_MIN +#include <time.h> // clock +#include <ctype.h> // islower, isdigit etc +using std::ios; +using std::ifstream; +using std::ofstream; +using std::cout; +using std::cerr; +using std::endl; +#include "util-C.h" // imax, fmax, iround, iceil, ifloor, strint, strscn, strcut, substr, uprstr, uprchr, Basename etc. +#include "list.h" // list data structure +#include "hash.h" // hash data structure +#include "hhdecl-C.h" // constants, class +#include "hhutil-C.h" // imax, fmax, iround, iceil, ifloor, strint, strscn, strcut, substr, uprstr, uprchr, Basename etc. +#include "hhhmm.h" // class HMM +#include "hhalignment.h" // class Alignment +#include "hhhit.h" +#include "hhhalfalignment.h" +#include "hhfullalignment.h" +#endif + + +////////////////////////////////////////////////////////////////////////////// +////////////////////////////////////////////////////////////////////////////// +//// Methods of class HitList +////////////////////////////////////////////////////////////////////////////// +////////////////////////////////////////////////////////////////////////////// + + + +////////////////////////////////////////////////////////////////////////////// +/** + * @brief Print summary listing of hits + */ +void +HitList::PrintHitList(HMM& q, char* outfile) +{ + Hit hit; + int nhits=0; + char str[NAMELEN]=""; + + FILE* outf=NULL; + if (strcmp(outfile,"stdout")) + { + outf=fopen(outfile,"w"); + if (!outf) OpenFileError(outfile); + } + else + outf = stdout; + + + fprintf(outf,"Query %s\n",q.longname); +// fprintf(outf,"Family %s\n",q.fam); + fprintf(outf,"Match_columns %i\n",q.L); + fprintf(outf,"No_of_seqs %i out of %i\n",q.N_filtered,q.N_in); + fprintf(outf,"Neff %-4.1f\n",q.Neff_HMM); + fprintf(outf,"Searched_HMMs %i\n",N_searched); + + // Print date stamp + time_t* tp=new(time_t); + *tp=time(NULL); + fprintf(outf,"Date %s",ctime(tp)); + delete (tp); (tp) = NULL; + + // Print command line + fprintf(outf,"Command "); + for (int i=0; i<par.argc; i++) + if (strlen(par.argv[i])<=par.maxdbstrlen) + fprintf(outf,"%s ",par.argv[i]); + else + fprintf(outf,"<%i characters> ",(int)strlen(par.argv[i])); + fprintf(outf,"\n\n"); + +#ifdef WINDOWS + if (par.trans) + fprintf(outf," No Hit Prob E-trans E-value Score SS Cols Query HMM Template HMM\n"); + else + fprintf(outf," No Hit Prob E-value P-value Score SS Cols Query HMM Template HMM\n"); +#else + if (par.trans) + fprintf(outf," No Hit Prob E-trans E-value Score SS Cols Query HMM Template HMM\n"); + else + fprintf(outf," No Hit Prob E-value P-value Score SS Cols Query HMM Template HMM\n"); +#endif + + Reset(); + while (!End()) // print hit list + { + hit = ReadNext(); + if (nhits>=par.Z) break; //max number of lines reached? + if (nhits>=par.z && hit.Probab < par.p) break; + if (nhits>=par.z && hit.Eval > par.E) continue; +// if (hit.matched_cols <=1) continue; // adding this might get to intransparent... analogous statement in PrintAlignments + nhits++; + sprintf(str,"%3i %-30.30s ",nhits,hit.longname); + + +#ifdef WINDOWS + if (par.trans) // Transitive scoring + fprintf(outf,"%-34.34s %5.1f %8.2G %8.2G %6.1f %5.1f %4i ",str,hit.Probab,hit.E1val,hit.Eval,hit.score,hit.score_ss,hit.matched_cols); + else // Normal scoring + fprintf(outf,"%-34.34s %5.1f %8.2G %8.2G %6.1f %5.1f %4i ",str,hit.Probab,hit.Eval,hit.Pval,hit.score,hit.score_ss,hit.matched_cols); +#else + if (par.trans) // Transitive scoring + fprintf(outf,"%-34.34s %5.1f %7.2G %7.2G %6.1f %5.1f %4i ",str,hit.Probab,hit.E1val,hit.Eval,hit.score,hit.score_ss,hit.matched_cols); + else // Normal scoring + fprintf(outf,"%-34.34s %5.1f %7.2G %7.2G %6.1f %5.1f %4i ",str,hit.Probab,hit.Eval,hit.Pval,hit.score,hit.score_ss,hit.matched_cols); +#endif + + sprintf(str,"%4i-%-4i ",hit.i1,hit.i2); + fprintf(outf,"%-10.10s",str); + sprintf(str,"%4i-%-4i",hit.j1,hit.j2); + fprintf(outf,"%-9.9s(%i)\n",str,hit.L); + } //end print hit list + fprintf(outf,"\n"); + if (strcmp(outfile,"stdout")) fclose(outf); +} + + + +////////////////////////////////////////////////////////////////////////////// +/** + * @brief Print alignments of query sequences against hit sequences + */ +void +HitList::PrintAlignments( + + +#ifdef CLUSTALO + char **ppcFirstProf, char **ppcSecndProf, +#endif + HMM& q, char* outfile, char outformat) +{ + Hit hit; + FullAlignment qt_ali(par.nseqdis+10); // maximum 10 annotation (pseudo) sequences (ss_dssp, sa_dssp, ss_pred, ss_conf, consens,...) + int nhits=0; + +#ifndef CLUSTALO_NOFILE + FILE* outf=NULL; + if (strcmp(outfile,"stdout")) + { + if (outformat==0) + outf=fopen(outfile,"a"); //append to summary hitlist + else + outf=fopen(outfile,"w"); //open for writing + if (!outf) OpenFileError(outfile); + } + else + outf = stdout; +#endif + + Reset(); + while (!End()) // print hit list + { + if (nhits>=par.B) break; //max number of lines reached? + hit = ReadNext(); + if (nhits>=par.b && hit.Probab < par.p) break; + if (nhits>=par.b && hit.Eval > par.E) continue; + // // adding this might get to intransparent... + // // analogous statement in PrintHitlist and hhalign.C + // if (hit.matched_cols <=1) continue; + nhits++; + + // Build double alignment of query against template sequences + qt_ali.Build(q,hit); + +#ifndef CLUSTALO + // Print out alignment + if (outformat==0) // HHR format + { + fprintf(outf,"No %-3i\n",nhits); + qt_ali.PrintHeader(outf,q,hit); + qt_ali.PrintHHR(outf,hit); + } + else if (outformat==1) // FASTA format + { + fprintf(outf,"# No %-3i\n",nhits); + qt_ali.PrintFASTA(outf,hit); + } + else if(outformat==2) // A2M format + { + fprintf(outf,"# No %-3i\n",nhits); + qt_ali.PrintA2M(outf,hit); + } + else // A3m format + { + fprintf(outf,"# No %-3i\n",nhits); + qt_ali.PrintA3M(outf,hit); + } +#else + qt_ali.OverWriteSeqs(ppcFirstProf, ppcSecndProf); +#endif + + qt_ali.FreeMemory(); + } +#ifndef CLUSTALO_NOFILE + if (strcmp(outfile,"stdout")) fclose(outf); +#endif +} + + + + + +//////////////////////////////////////////////////////////////////////////// +/** + * @brief Return the ROC_5 score for optimization + * (changed 28.3.08 by Michael & Johannes) + */ +void +HitList::Optimize(HMM& q, char* buffer) +{ + const int NFAM =5; // calculate ROC_5 score + const int NSFAM=5; // calculate ROC_5 score + int roc=0; // ROC score + int fam=0; // number of hits from same family (at current threshold) + int not_fam=0; // number of hits not from same family + int sfam=0; // number of hits from same suporfamily (at current threshold) + int not_sfam=0; // number of hits not from same superfamily + Hit hit; + + SortList(); + Reset(); + while (!End()) + { + hit = ReadNext(); + if (!strcmp(hit.fam,q.fam)) fam++; // query and template from same superfamily? => positive + else if (not_fam<NFAM) // query and template from different family? => negative + { + not_fam++; + roc += fam; + } + if (!strcmp(hit.sfam,q.sfam)) sfam++; // query and template from same superfamily? => positive + else if (not_sfam<NSFAM) // query and template from different superfamily? => negative + { + not_sfam++; + roc += sfam; + } +// printf("qfam=%s tfam=%s qsfam=%s tsfam=%s fam=%-2i not_fam=%3i sfam=%-3i not_sfam=%-5i roc=%-3i\n",q.fam,hit.fam,q.sfam,hit.sfam,fam,not_fam,sfam,not_sfam,roc); + } + + // Write ROC score to file or stdout + FILE* buf=NULL; + if (strcmp(par.buffer,"stdout")) + { + buf=fopen(buffer,"w"); + if (!buf) OpenFileError(par.buffer); + } + else + buf = stdout; + + fprintf(buf,"%f\n",float(roc)/float(fam*NFAM+sfam*NSFAM)); // must be between 0 and 1 + if (v>=2) printf("ROC=%f\n",float(roc)/float(fam*NFAM+sfam*NSFAM)); + fclose(buf); +} + + + +////////////////////////////////////////////////////////////////////////////// +/** + * @brief Print score distribution into file score_dist + */ +void +HitList::PrintScoreFile(HMM& q) +{ + int i=0, n; + FILE* scoref=NULL; + Hit hit; + Hash<int> twice(10000); // make sure only one hit per HMM is listed + twice.Null(-1); + + if (strcmp(par.scorefile,"stdout")) + { + scoref=fopen(par.scorefile,"w"); + if (!scoref) + {cerr<<endl<<"WARNING from "<<par.argv[0]<<": could not open \'"<<par.scorefile<<"\'\n"; return;} + } + else + scoref = stdout; + Reset(); + fprintf(scoref,"NAME %s\n",q.longname); + fprintf(scoref,"FAM %s\n",q.fam); + fprintf(scoref,"FILE %s\n",q.file); + fprintf(scoref,"LENG %i\n",q.L); + fprintf(scoref,"\n"); +//fprintf(scoref,"TARGET REL LEN COL LOG-PVA S-TOT MS NALI\n"); + +//For hhformat, the PROBAB field has to start at position 41 !! +// ----+----1----+----2----+----3----+----4----+---- + fprintf(scoref,"TARGET FAMILY REL LEN COL LOG-PVA S-AASS PROBAB SCORE\n"); + // d153l__ 5 185 185 287.82 464.22 100.00 + // d1qsaa2 3 168 124 145.55 239.22 57.36 + while (!End()) + { + i++; + hit = ReadNext(); + if (twice[hit.name]==1) continue; // better hit with same HMM has been listed already + twice.Add(hit.name,1); + //if template and query are from the same superfamily + if (!strcmp(hit.name,q.name)) n=5; + else if (!strcmp(hit.fam,q.fam)) n=4; + else if (!strcmp(hit.sfam,q.sfam)) n=3; + else if (!strcmp(hit.fold,q.fold)) n=2; + else if (!strcmp(hit.cl,q.cl)) n=1; + else n=0; + fprintf(scoref,"%-10s %-10s %1i %3i %3i %s %7.2f %6.2f %7.2f\n",hit.name,hit.fam,n,hit.L,hit.matched_cols,sprintg(-1.443*hit.logPval,7),-hit.score_aass,hit.Probab,hit.score); + } + fclose(scoref); +} + + +inline double +logPvalue_HHblast(double s, double corr) +{ + return -s*(1.0-0.5*corr) + (1.0-corr)*log(1.0+s); +// return -s*(1.0-0.5*corr) + log( 1.0+(1.0-corr)*s ); +// return -s*(1.0-0.5*corr) + log( 1.0+(1.0-corr)*(1.0-0.5*corr)*s ); +} + +inline double +Pvalue_HHblast(double s, double corr) +{ + return exp(-s*(1.0-0.5*corr)) * pow(1.0+s,1.0-corr); +// return exp(-s*(1.0-0.5*corr)) * ( 1.0+(1.0-corr)*s ); +// return exp(-s*(1.0-0.5*corr)) * ( 1.0+(1.0-corr)*(1.0-0.5*corr)*s ); +} + +inline double +logLikelihood_HHblast(double s, double corr) +{ + if (s<0.0) { s=0.0; if (corr<1E-5) corr=1E-5; else if (corr>0.99999) corr=0.99999; } + else { if (corr<0.0) corr=0.0; else if (corr>1.0) corr=1.0; } + return -s*(1.0-0.5*corr) - corr*log(1.0+s) + log(s*(1.0-0.5*corr)+0.5*corr); + // return -s*(1.0-0.5*corr) + log( s*(1.0-corr)*(1.0-0.5*corr)+0.5*corr ); + // return -s*(1.0-0.5*corr) + log((s*(1.0-corr)*(1.0-0.5*corr)+corr)*(1.0-0.5*corr)); +} + +///////////////////////////////////////////////////////////////////////////////////// +/** + * @brief Evaluate the *negative* log likelihood for the order statistic of the uniform distribution + * for the best 10% of hits (vertex v = (corr,offset) ) + * The k'th order statistic for X~Uniform is p:=X^(k)~Beta(k,n-k+1) = const*p^(k-1)*(1-p)^(n-k) + * Needed to fit the correlation and score offset in HHblast +*/ +double +HitList::RankOrderFitCorr(double* v) +{ + double sum=0.0; +// printf("%8.2G %8.2G %i\n",v[0],v[1],Nprof); + int i1 = imin(Nprof,imax(50,int(0.05*Nprof))); + for (int i=0; i<i1; i++) + { + double p = Pvalue_HHblast(score[i]+v[1],v[0]); +// sum -= (1.0-double(i)/double(i1)) * weight[i] * ( double(i)*log(p) + (Nprof-i-1.0)*log(1.0-p) ); + float diff = p-(float(i)+1.0)/(Nprof+1.0); + sum += (1.0-double(i)/double(i1)) * weight[i]*diff*diff*(Nprof+1.0)*(Nprof+1.0)*(Nprof+2.0)/(i+10.0)/(Nprof-i); +// printf("%-3i Pval=%7.5f Preal=%7.5f diff=%7.5f sum=%7.5f\n",i,p,float(i+1)/(1.0+Nprof),diff,sum); + } + return sum; +} + +/** + * @brief Static wrapper-function for calling the nonstatic member function RankOrderFitCorr() + * ( see http://www.newty.de/fpt/callback.html#member ) + */ +double +HitList::RankOrderFitCorr_static(void* pt2hitlist, double* v) +{ + HitList* mySelf = (HitList*) pt2hitlist; // explicitly cast to a pointer to Hitlist + return mySelf->RankOrderFitCorr(v); // call member function +} + +///////////////////////////////////////////////////////////////////////////////////// +/** + * @brief Evaluate the *negative* log likelihood of the data at the vertex v = (corr,offset) + * Needed to fit the correlation and score offset in HHblast + */ +double +HitList::LogLikelihoodCorr(double* v) +{ + double sum=0.0; +// printf("%8.2G %8.2G %i\n",v[0],v[1],Nprof); + for (int i=0; i<Nprof; i++) + { + sum -= weight[i]*logLikelihood_HHblast(score[i]+v[1],v[0]); +// printf("%-3i Pval=%7.5f Preal=%7.5f diff=%7.5f rmsd=%7.5f sum=%7.5f\n",i,Pvalue_HHblast(score[i],v[0]),float(i)/(1.0+Nprof),x,sqrt(sum/sumw),sum); + } + return sum; +} + +/** + * @brief Static wrapper-function for calling the nonstatic member function LogLikelihoodCorr() + * ( see http://www.newty.de/fpt/callback.html#member ) + */ +double +HitList::LogLikelihoodCorr_static(void* pt2hitlist, double* v) +{ + HitList* mySelf = (HitList*) pt2hitlist; // explicitly cast to a pointer to Hitlist + return mySelf->LogLikelihoodCorr(v); // call member function +} + +///////////////////////////////////////////////////////////////////////////////////// +/** + * @brief Evaluate the *negative* log likelihood of the data at the vertex v = (lamda,mu) + * p(s) = lamda * exp{ -exp[-lamda*(s-mu)] - lamda*(s-mu) } = lamda * exp( -exp(-x) - x) + */ +double +HitList::LogLikelihoodEVD(double* v) +{ + double sum=0.0, sumw=0.0; + for (int i=0; i<Nprof; i++) + { + double x = v[0]*(score[i]-v[1]); + sum += weight[i]*(exp(-x)+x); + sumw += weight[i]; + } + return sum - sumw*log(v[0]); +} + +/** + * @brief Static wrapper-function for calling the nonstatic member function LogLikelihoodEVD() + * ( see http://www.newty.de/fpt/callback.html#member ) + */ +double +HitList::LogLikelihoodEVD_static(void* pt2hitlist, double* v) +{ + HitList* mySelf = (HitList*) pt2hitlist; // explicitly cast to a pointer to Hitlist + return mySelf->LogLikelihoodEVD(v); // call member function +} + +///////////////////////////////////////////////////////////////////////////////////// +/** + * @brief Subroutine to FindMin: try new point given by highest point ihigh and fac and replace ihigh if it is lower + */ +double +HitList::TryPoint(const int ndim, double* p, double* y, double* psum, int ihigh, double fac, double (*Func)(void* pt2hitlist, double* v)) +{ + // New point p_try = p_c + fac*(p_high-p_c), + // where p_c = ( sum_i (p_i) - p_high)/ndim is the center of ndim other points + // => p_try = fac1*sum_i(p_i) + fac2*p_high + double fac1=(1.-fac)/ndim; + double fac2=fac-fac1; + double ptry[ndim]; //new point to try out + double ytry; //function value of new point + int j; //index for the ndim parameters + + for (j=0; j<ndim; j++) + ptry[j]=psum[j]*fac1+p[ihigh*ndim+j]*fac2; + ytry = (*Func)(this,ptry); + if (ytry<=y[ihigh]) + { +// if (v>=4) printf("Trying: %-7.3f %-7.3f %-7.3f -> accept\n",ptry[0],ptry[1],ytry); + y[ihigh]=ytry; + for (j=0; j<ndim; j++) + { + psum[j] += ptry[j]-p[ihigh*ndim+j]; //update psum[j] + p[ihigh*ndim+j]=ptry[j]; //replace p[ihigh] with ptry + } //Note: ihigh is now not highest point anymore! + } +// else if (v>=4) printf("Trying: %-7.3f %-7.3f %-7.3f -> reject\n",ptry[0],ptry[1],ytry); + + return ytry; +} + + + +///////////////////////////////////////////////////////////////////////////////////// +/** + * @brief Find minimum with simplex method of Nelder and Mead (1965) + */ +float +HitList::FindMin(const int ndim, double* p, double* y, double tol, int& nfunc, double (*Func)(void* pt2hitlist, double* v)) +{ + const int MAXNFUNC=99; //maximum allowed number of function evaluations + int ihigh; //index of highest point on simplex + int inext; //index of second highest point on simplex + int ilow; //index of lowest point on simplex + int i; //index for the ndim+1 points + int j; //index for the ndim parameters + double rtol; //tolerance: difference of function value between highest and lowest point of simplex + double temp; //dummy + double ytry; //function value of trial point + double psum[ndim]; //psum[j] = j'th coordinate of sum vector (sum over all vertex vectors) + + nfunc=0; //number of function evaluations =0 + //Calculate sum vector psum[j] + for (j=0; j<ndim; j++) + { + psum[j]=p[j]; + for (i=1; i<ndim+1; i++) + psum[j]+=p[i*ndim+j]; + } + + // Repeat finding better points in simplex until rtol<tol + while(1) + { + // Find indices for highest, next highest and lowest point + ilow=0; + if (y[0]>y[1]) {inext=1; ihigh=0;} else {inext=0; ihigh=1;} + for (i=0; i<ndim+1; i++) + { + if (y[i]<=y[ilow]) ilow=i; + if (y[i]>y[ihigh]) {inext=ihigh; ihigh=i;} + else if (y[i]>y[inext] && i!= ihigh) inext=i; + } + + // If tolerance in y is smaller than tol swap lowest point to index 0 and break -> return + rtol = 2.*fabs(y[ihigh]-y[ilow]) / (fabs(y[ihigh])+fabs(y[ilow])+1E-10); + if (rtol<tol) + { + temp=y[ilow]; y[ilow]=y[0]; y[0]=temp; + for (j=0; j<ndim; j++) + { + temp=p[ilow*ndim+j]; p[ilow*ndim+j]=p[j]; p[j]=temp; + } + break; + } + + // Max number of function evaluations exceeded? + if (nfunc>=MAXNFUNC ) + { + if (v) fprintf(stderr,"\nWARNING: maximum likelihood fit of score distribution did not converge.\n"); + return 1; + } + + nfunc+=2; + // Point-reflect highest point on the center of gravity p_c of the other ndim points of the simplex + if (v>=3) printf("%3i %-7.3f %-7.3f %-12.8f %-9.3E\n",nfunc,p[ilow*ndim],p[ilow*ndim+1],y[ilow],rtol); +// if (v>=2) printf(" %3i %-9.3E %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f\n",nfunc,rtol,p[ilow*ndim],p[ilow*ndim+1],y[ilow],p[inext*ndim],p[inext*ndim+1],y[inext],p[ihigh*ndim],p[ihigh*ndim+1],y[ihigh]); + ytry = TryPoint(ndim,p,y,psum,ihigh,-1.0,Func); //reflect highest point on p_c + + if (ytry<=y[ilow]) + { + ytry = TryPoint(ndim,p,y,psum,ihigh,2.0,Func); //expand: try new point 2x further away from p_c +// if (v>=2) printf("Expanded: %3i %-9.3E %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f\n",nfunc,rtol,p[ilow*ndim],p[ilow*ndim+1],y[ilow],p[inext*ndim],p[inext*ndim+1],y[inext],p[ihigh*ndim],p[ihigh*ndim+1],y[ihigh]); + } + else if (ytry>=y[inext]) + { + // The new point is worse than the second worst point + temp=y[ihigh]; + ytry=TryPoint(ndim,p,y,psum,ihigh,0.5,Func); //contract simplex by 0.5 along (p_high-p_c +// if (v>=2) printf("Compressed:%3i %-9.3E %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f\n",nfunc,rtol,p[ilow*ndim],p[ilow*ndim+1],y[ilow],p[inext*ndim],p[inext*ndim+1],y[inext],p[ihigh*ndim],p[ihigh*ndim+1],y[ihigh]); + if (ytry>=temp) + { + // Trial point is larger than worst point => contract simplex by 0.5 towards lowest point + for (i=0; i<ndim+1; i++) + { + if (i!=ilow) + { + for (j=0; j<ndim; j++) + p[i*ndim+j]=0.5*(p[i*ndim+j]+p[ilow+j]); + y[i] = (*Func)(this,p+i*ndim); +// y[i] = (*Func)(p+i*ndim); + } + } + nfunc+=ndim; +// if (v>=2) printf("Contracted:%3i %-9.3E %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f\n",nfunc,rtol,p[ilow*ndim],p[ilow*ndim+1],y[ilow],p[inext*ndim],p[inext*ndim+1],y[inext],p[ihigh*ndim],p[ihigh*ndim+1],y[ihigh]); + + //Calculate psum[j] + for (j=0; j<ndim; j++) + { + psum[j]=p[j]; + for (i=1; i<ndim+1; i++) + psum[j]+=p[i*ndim+j]; + } + } + } + else nfunc--; + } + return (float)rtol; +} + + + +///////////////////////////////////////////////////////////////////////////////////// +/** + * @brief Do a maximum likelihod fit of the scores with an EV distribution with parameters lamda and mu + */ +void +HitList::MaxLikelihoodEVD(HMM& q, int nbest) +{ + double tol=1E-6; // Maximum relative tolerance when minimizing -log(P)/N (~likelihood) + static char first_call=1; + static Hash<int> size_fam(MAXPROF/10); // Hash counts number of HMMs in family + static Hash<int> size_sfam(MAXPROF/10); // Hash counts number of families in superfamily + Hash<int> excluded(50); // Hash containing names of superfamilies to be excluded from fit + size_fam.Null(0); // Set int value to return when no data can be retrieved + size_sfam.Null(0); // Set int value to return when no data can be retrieved + excluded.Null(0); // Set int value to return when no data can be retrieved + Hit hit; + + double mu; // EVD[mu,lam](x) = exp(-exp(-(x-mu)/lam)) = P(score<=x) + double vertex[2*3]; // three vertices of the simplex in lamda-mu plane + double yvertex[3]; // log likelihood values at the three vertices of the simplex + int nfunc=0; // number of function calls + double sum_weights=0.0; + float sum_scores=0.0; + float rtol; + + if (first_call==1) + { + first_call=0; + // Count how many HMMs are in each family; set number of multiple hits per template nrep + if (v>=4) printf(" count number of profiles in each family and families in each superfamily ...\n"); + Reset(); + while (!End()) + { + hit = ReadNext(); + if (!size_fam.Contains(hit.fam)) (*size_sfam(hit.sfam))++; //Add one to hash element for superfamily + (*size_fam(hit.fam))++; //Add one to hash element for family + // printf("size(%s)=%i name=%s\n",hit.fam,*size_fam(hit.fam),hit.name) + } + fams=size_fam.Size(); + sfams=size_sfam.Size(); + if (v>=3) + printf("%-3i HMMs from %i families and %i superfamilies searched. Found %i hits\n",N_searched,fams,sfams,Size()); + } + + // Query has SCOP family identifier? + if (q.fam && q.fam[0]>='a' && q.fam[0]<='k' && q.fam[1]=='.') + { + char sfamid[NAMELEN]; + char* ptr_in_fam=q.fam; + while ((ptr_in_fam=strwrd(sfamid,ptr_in_fam,'-'))) + { + char* ptr=strrchr(sfamid,'.'); + if (ptr) *ptr='\0'; + excluded.Add(sfamid); +// fprintf(stderr,"Exclude SCOP superfamily %s ptr_in_fam='%s'\n",sfamid,ptr_in_fam); + } + } + // Exclude best superfamilies from fit + else if (nbest>0) + { + if (sfams<97+nbest) return; + + // Find the nbest best-scoring superfamilies for exclusion from first ML fit + if (v>=4) printf(" find %i best-scoring superfamilies to exclude from first fit ...\n",nbest); + hit = Smallest(); + excluded.Add(hit.sfam); +// printf("Exclude in first round: %s %8.2f %s\n",hit.name,hit.score_aass,hit.sfam); + while (excluded.Size()<nbest) + { + Reset(); + while (!End() && excluded.Contains(ReadNext().sfam)) ; + hit=ReadCurrent(); + while (!End()) + { + if (ReadNext()<hit && !excluded.Contains(ReadCurrent().sfam)) + hit=ReadCurrent(); + } + excluded.Add(hit.sfam); +// printf("Exclude in first round: %s %8.2f %s %i %i\n",hit.name,hit.score_aass,hit.sfam,excluded.Size(),excluded.Contains(hit.sfam)); + } + tol = 0.01/size_sfam.Size(); // tol=1/N would lead to delta(log-likelihood)~1 (where N ~ number of superfamilies) since (1+1/N)^N = e + } + else + { + // Find the best-scoring superfamilies from first fit for exclusion from second ML fit + if (v>=4) printf(" find best-scoring superfamilies to exclude from second fit ...\n"); + Reset(); + while (!End()) + { + hit = ReadNext(); + if (hit.Eval < 0.05) excluded.Add(hit.sfam); // changed from 0.5 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + } + tol = 0.001/size_sfam.Size(); // tol=1/N would lead to delta(log-likelihood)~1 (where N ~ number of superfamilies) since (1+1/N)^N = e + } + + // Put scores into score[] and weights into weight[] + if (v>=3) printf(" generate scores and weights array for ML fitting ...\n"); + Nprof=0; + Reset(); + while (!End()) + { + hit = ReadNext(); + if (hit.irep > 1) continue; //Use only best hit per template + if (Nprof>=MAXPROF) break; + + char sfamid[NAMELEN]; + char* ptr_in_fam=hit.fam; + while ((ptr_in_fam=strwrd(sfamid,ptr_in_fam,'-'))) + { + char* ptr=strrchr(sfamid,'.'); + if (ptr) *ptr='\0'; + if (excluded.Contains(sfamid)) break; //HMM is among superfamilies to be excluded + } + if (excluded.Contains(sfamid)) { + if (v>=3) fprintf(stderr,"Exclude hit %s (family %s contains %s)\n",hit.name,hit.fam,sfamid); + continue; + } +// ScopID(hit.cl,hit.fold,hit.sfam,hit.fam); //Get scop superfamily code for template +// if (*hit.sfam=='\0' || excluded.Contains(hit.sfam)) continue; // skip HMM + + score[Nprof] = hit.score; + weight[Nprof]=1./size_fam[hit.fam]/size_sfam[hit.sfam]; + sum_scores +=hit.score*weight[Nprof]; + sum_weights+=weight[Nprof]; + + //DEBUG +// if (v>=4) printf("%-10.10s %-12.12s %-3i %-12.12s %-3i %6.4f %6.4f %7.1f\n",hit.name,hit.fam,size_fam[hit.fam],hit.sfam,size_sfam[hit.sfam],1./size_fam[hit.fam]/size_sfam[hit.sfam],sum,hit.score); + Nprof++; + } + //DEBUG + if (v>=3) + printf("%i hits used for score distribution\n",Nprof); + // for (int i=0; i<Nprof; i++) printf("%3i score=%8.3f weight=%7.5f\n",i,score[i],weight[i]); + + // Set simplex vertices and function values + mu = sum_scores/sum_weights - 0.584/LAMDA; + if (par.loc) // fit only in local mode; in global mode use fixed value LAMDA and mu mean score + { + double (*Func)(void*, double*); + Func = HitList::LogLikelihoodEVD_static; + + if (nbest>0) {vertex[0]=LAMDA; vertex[1]=mu;} /////////////////////////////////////////// DEBUG + else {vertex[0]=q.lamda; vertex[1]=mu;} + vertex[2]=vertex[0]+0.1; vertex[3]=vertex[1]; + vertex[4]=vertex[0]; vertex[5]=vertex[1]+0.2; + yvertex[0]=Func(this,vertex ); + yvertex[1]=Func(this,vertex+2); + yvertex[2]=Func(this,vertex+4); + + // Find lam and mu that minimize negative log likelihood of data + if (v>=3) printf("Fitting to EVD by maximum likelihood...\niter lamda mu -log(P)/N tol\n"); + rtol = FindMin(2,vertex,yvertex,tol,nfunc,Func); + if (v>=3) printf("%3i %-7.3f %-7.2f %-7.3f %-7.1E\n\n",nfunc,vertex[0],vertex[1],yvertex[0]-(1.5772-log(vertex[0])),rtol); +// printf("HHsearch lamda=%-6.3f mu=%-6.3f\n",vertex[0],vertex[1]); + } + else + { + vertex[0]=LAMDA_GLOB; vertex[1]=mu; + } + + // Set lamda and mu of profile + q.lamda = vertex[0]; + q.mu = vertex[1]; + + // Set P-values and E-values + // CHECK UPDATE FROM score=-logpval to score=-logpval+SSSCORE2NATLOG*score_ss !!!! + Reset(); + while (!End()) + { + hit = ReadNext(); + + // Calculate total score in raw score units: P-value = 1- exp(-exp(-lamda*(Saa-mu))) + hit.weight=1./size_fam[hit.fam]/size_sfam[hit.sfam]; // needed for transitive scoring + hit.logPval = logPvalue(hit.score,vertex); + hit.Pval=Pvalue(hit.score,vertex); + hit.Eval=exp(hit.logPval+log(N_searched)); +// hit.score_aass = hit.logPval/0.45-3.0 - hit.score_ss; // median(lamda)~0.45, median(mu)~4.0 in EVDs for scop20.1.63 HMMs + hit.score_aass = -q.lamda*(hit.score-q.mu)/0.45-3.0 - fmin(hit.score_ss,fmax(0.0,0.5*hit.score-5.0)); // median(lamda)~0.45, median(mu)~3.0 in EVDs for scop20.1.63 HMMs + hit.Probab = Probab(hit); + if (nbest>0 && par.loc) // correct length correction (not needed for second round of fitting, since lamda very similar) + if (par.idummy==0) //////////////////////////////////////////// + hit.score += log(q.L*hit.L)*(1/LAMDA-1/vertex[0]); + hit.score_sort = hit.score_aass; + Overwrite(hit); // copy hit object into current position of hitlist + + if (nbest==0 && par.trans==1) // if in transitive scoring mode (weights file given) + TransitiveScoring(); + else if (nbest==0 && par.trans==2) // if in transitive scoring mode (weights file given) + TransitiveScoring2(); + else if (nbest==0 && par.trans==3) // if in transitive scoring mode (weights file given) + TransitiveScoring3(); + else if (nbest==0 && par.trans==4) // if in transitive scoring mode (weights file given) + TransitiveScoring4(); + } +} + + +///////////////////////////////////////////////////////////////////////////////////// +/** + * @brief Calculate correlation and score offset for HHblast composite E-values + */ +void +HitList::CalculateHHblastCorrelation(HMM& q) +{ + int nfunc=0; // number of function calls + double tol; // Maximum relative tolerance when minimizing -log(P)/N (~likelihood) + double vertex[2*3]; // three vertices of the simplex in lamda-mu plane + double yvertex[3]; // log likelihood values at the three vertices of the simplex + Hit hit; + Hash<int> excluded(50); // Hash containing names of superfamilies to be excluded from fit + excluded.Null(0); // Set int value to return when no data can be retrieved + + // Set sum of HHsearch and PSI-BLAST score for calculating correlation + Reset(); + while (!End()) + { + hit = ReadNext(); + hit.score_sort = hit.logPval + blast_logPvals->Show(hit.name); // if template not in hash, return log Pval = 0, i.e. Pvalue = 1! + Overwrite(hit); // copy hit object into current position of hitlist + } + + // Query has SCOP family identifier? + if (q.fam && q.fam[0]>='a' && q.fam[0]<='k' && q.fam[1]=='.') + { + char sfamid[NAMELEN]; + char* ptr_in_fam=q.fam; + while ((ptr_in_fam=strwrd(sfamid,ptr_in_fam,'-'))) + { + char* ptr=strrchr(sfamid,'.'); + if (ptr) *ptr='\0'; + excluded.Add(sfamid); + fprintf(stderr,"Exclude SCOP superfamily %s ptr_in_fam='%s'\n",sfamid,ptr_in_fam); + } + } + + // Resort list by sum of log P-values + ResortList(); // use InsertSort to resort list according to sum of minus-log-Pvalues + Nprof=0; + Reset(); + ReadNext(); // skip best hit + while (!End()) + { + hit = ReadNext(); + if (hit.irep>=2) continue; // use only best alignments +// if (hit.Eval<0.005) {if (v>=3) printf("Fitting HHblast correlation coefficient: skipping %s with Evalue=%9.1g\n",hit.name,hit.Eval); continue;} + if (Nprof>=MAXPROF) break; + + char sfamid[NAMELEN]; + char* ptr_in_fam=hit.fam; + while ((ptr_in_fam=strwrd(sfamid,ptr_in_fam,'-'))) + { + char* ptr=strrchr(sfamid,'.'); + if (ptr) *ptr='\0'; + if (excluded.Contains(sfamid)) break; //HMM is among superfamilies to be excluded + } + if (excluded.Contains(sfamid)) { + if (v>=1) fprintf(stderr,"Exclude hit %s (family %s contains %s)\n",hit.name,hit.fam,sfamid); + continue; + } + score[Nprof] = -hit.score_sort; + weight[Nprof] = 1.0; // = hit.weight; +// printf("%3i %-12.12s %7.3f + %7.3f = %7.3f \n",Nprof,hit.name,hit.logPval,blast_logPvals->Show(hit.name),-hit.score_sort); ////////////////////// + printf("%3i %7.3f %7.3f\n",Nprof,hit.Pval,exp(blast_logPvals->Show(hit.name))); ////////////////////// + Nprof++; + } + + // Fit correlation + vertex[0]=0.5; vertex[1]=0.2; + vertex[2]=vertex[0]+0.2; vertex[3]=vertex[1]; + vertex[4]=vertex[0]; vertex[5]=vertex[1]+0.2; + + yvertex[0]=RankOrderFitCorr(vertex ); + yvertex[1]=RankOrderFitCorr(vertex+2); + yvertex[2]=RankOrderFitCorr(vertex+4); +// yvertex[0]=LogLikelihoodCorr(vertex ); +// yvertex[1]=LogLikelihoodCorr(vertex+2); +// yvertex[2]=LogLikelihoodCorr(vertex+4); + tol = 1e-6; + v=3;////////////////////////////////// + // Find correlation and offset that minimize mean square deviation of reported composite Pvalues from actual + if (v>=2) printf("Fitting correlation coefficient for HHblast...\niter corr offset logLikelihood tol\n"); + float rtol = FindMin(2,vertex,yvertex,tol,nfunc, HitList::RankOrderFitCorr_static); + if (v>=2) printf("%3i %-7.3f %-7.2f %-7.3f %-7.1E\n\n",nfunc,vertex[0],vertex[1],yvertex[0],rtol); + if (vertex[0]<0) vertex[0]=0.0; + + // Print correlation and offset for profile + printf("HHblast correlation=%-6.3f score offset=%-6.3f\n",vertex[0],vertex[1]); + v=2;////////////////////////////////// +} + + +/** + * @brief Calculate HHblast composite E-values + */ +inline double +corr_HHblast(float Nq, float Nt) +{ + return 0.5; +} + +/** + * @brief Calculate HHblast composite E-values + */ +inline double +offset_HHblast(float Nq, float Nt) +{ + return 0.0; +} + +////////////////////////////////////////////////////////////////////////////// +/** + * @brief Calculate HHblast composite E-values + */ +void +HitList::CalculateHHblastEvalues(HMM& q) +{ + Hit hit; + float corr, offset; // correlation coefficient and offset for calculating composite HHblast P-values + + Reset(); + while (!End()) + { + hit = ReadNext(); + corr = corr_HHblast(q.Neff_HMM,hit.Neff_HMM); + offset = offset_HHblast(q.Neff_HMM,hit.Neff_HMM); + hit.score_sort = hit.logPval + blast_logPvals->Show(hit.name); + hit.logPval = logPvalue_HHblast(-hit.score_sort+offset,corr); // overwrite logPval from HHsearch with composite logPval from HHblast + hit.Pval = Pvalue_HHblast(-hit.score_sort+offset,corr); // overwrite P-value from HHsearch with composite P-value from HHblast + hit.Eval = exp(hit.logPval+log(N_searched)); // overwrite E-value from HHsearch with composite E-value from HHblast + hit.Probab = Probab(hit); + Overwrite(hit); // copy hit object into current position of hitlist + } + ResortList(); // use InsertSort to resort list according to sum of minus-log-Pvalues +} + + +////////////////////////////////////////////////////////////////////////////// +/** + * @brief Read file generated by blastpgp (default output) and store P-values in hash + */ +void +HitList::ReadBlastFile(HMM& q) +{ + char line[LINELEN]=""; // input line + int Ndb; // number of sequences in database + int Ldb=0; // size of database in number of amino acids + char* templ; + int i; + if (!blast_logPvals) { blast_logPvals = new(Hash<float>); blast_logPvals->New(16381,0); } + + FILE* blaf = NULL; + if (!strcmp(par.blafile,"stdin")) blaf=stdin; + else + { + blaf = fopen(par.blafile,"rb"); + if (!blaf) OpenFileError(par.blafile); + } + + // Read number of sequences and size of database + while (fgetline(line,LINELEN-1,blaf) && !strstr(line,"sequences;")); + if (!strstr(line,"sequences;")) FormatError(par.blafile,"No 'Database:' string found."); + char* ptr=line; + Ndb = strint(ptr); + if (Ndb==INT_MIN) FormatError(par.blafile,"No integer for number of sequences in database found."); + while ((i=strint(ptr))>INT_MIN) Ldb = 1000*Ldb + i; + if (Ldb==0) FormatError(par.blafile,"No integer for size of database found."); + printf("\nNumber of sequences in database = %i Size of database = %i\n",Ndb,Ldb); + + // Read all E-values and sequence lengths + while (fgetline(line,LINELEN-1,blaf)) + { + if (line[0]=='>') + { + // Read template name + templ = new(char[255]); + ptr = line+1; + strwrd(templ,ptr); + if (!blast_logPvals->Contains(templ)) // store logPval only for best HSP with template + { + // Read length + while (fgetline(line,LINELEN-1,blaf) && !strstr(line,"Length =")); + ptr = line+18; + int length = strint(ptr); + // Read E-value + fgetline(line,LINELEN-1,blaf); + fgetline(line,LINELEN-1,blaf); + float EvalDB; // E-value[seq-db] = Evalue for comparison Query vs. database, from PSI-BLAST + float EvalQT; // E-value[seq-seq] = Evalue for comparison Query vs. template (seq-seq) + double logPval; + ptr = strstr(line+20,"Expect ="); + if (!ptr) FormatError(par.blafile,"No 'Expect =' string found."); + if (sscanf(ptr+8,"%g",&EvalDB)<1) + { + ptr[7]='1'; + if (sscanf(ptr+7,"%g",&EvalDB)<1) + FormatError(par.blafile,"No Evalue found after 'Expect ='."); + } + // Calculate P-value[seq-seq] = 1 - exp(-E-value[seq-seq]) = 1 - exp(-Lt/Ldb*E-value[seq-db]) + EvalQT = length/double(Ldb)*double(EvalDB); + if (EvalQT>1E-3) logPval = log(1.0-exp(-EvalQT)); else logPval=log(double(EvalQT)+1.0E-99); + blast_logPvals->Add(templ,logPval); + printf("template=%-10.10s length=%-3i EvalDB=%8.2g EvalQT=%8.2g P-value=%8.2g log Pval=%8.2g\n",templ,length,EvalDB,EvalQT,exp(logPval),logPval); + } + else { + delete[] templ; templ = NULL; + } + } + } + fclose(blaf); +} + + +///////////////////////////////////////////////////////////////////////////////////// +/** + * @brief Calculate output of hidden neural network units + */ +inline float +calc_hidden_output(const float* weights, const float* bias, float Lqnorm, float Ltnorm, float Nqnorm, float Ntnorm) +{ + float res; + // Calculate activation of hidden unit = sum of all inputs * weights + bias + res = Lqnorm*weights[0] + Ltnorm*weights[1] + Nqnorm*weights[2] + Ntnorm*weights[3] + *bias; + res = 1.0 / (1.0 + exp(-(res ))); // logistic function + return res; +} + +//////////////////////////////////////////////////////////////////////////////////// +/** + * @brief Neural network regressions of lamda for EVD + */ +inline float +lamda_NN(float Lqnorm, float Ltnorm, float Nqnorm, float Ntnorm) +{ + const int inputs = 4; + const int hidden = 4; + const float biases[] = {-0.73195, -1.43792, -1.18839, -3.01141}; // bias for all hidden units + const float weights[] = { // Weights for the neural networks (column = start unit, row = end unit) + -0.52356, -3.37650, 1.12984, -0.46796, + -4.71361, 0.14166, 1.66807, 0.16383, + -0.94895, -1.24358, -1.20293, 0.95434, + -0.00318, 0.53022, -0.04914, -0.77046, + 2.45630, 3.02905, 2.53803, 2.64379 + }; + float lamda=0.0; + for (int h = 0; h<hidden; h++) { + lamda += calc_hidden_output( weights+inputs*h, biases+h, Lqnorm,Ltnorm,Nqnorm,Ntnorm ) * weights[hidden*inputs+h]; + } + return lamda; +} + +//////////////////////////////////////////////////////////////////////////////////// +/** + * @brief Neural network regressions of mu for EVD + */ +inline float +mu_NN(float Lqnorm, float Ltnorm, float Nqnorm, float Ntnorm) +{ + const int inputs = 4; + const int hidden = 6; + const float biases[] = {-4.25264, -3.63484, -5.86653, -4.78472, -2.76356, -2.21580}; // bias for all hidden units + const float weights[] = { // Weights for the neural networks (column = start unit, row = end unit) + 1.96172, 1.07181, -7.41256, 0.26471, + 0.84643, 1.46777, -1.04800, -0.51425, + 1.42697, 1.99927, 0.64647, 0.27834, + 1.34216, 1.64064, 0.35538, -8.08311, + 2.30046, 1.31700, -0.46435, -0.46803, + 0.90090, -3.53067, 0.59212, 1.47503, + -1.26036, 1.52812, 1.58413, -1.90409, 0.92803, -0.66871 + }; + float mu=0.0; + for (int h = 0; h<hidden; h++) { + mu += calc_hidden_output( weights+inputs*h, biases+h, Lqnorm,Ltnorm,Nqnorm,Ntnorm ) * weights[hidden*inputs+h]; + } + return 20.0*mu; +} + +////////////////////////////////////////////////////////////////////////////// +/** + * @brief Calculate Pvalues as a function of query and template lengths and diversities + */ +void +HitList::CalculatePvalues(HMM& q) +{ + Hit hit; + float lamda=0.4, mu=3.0; + const float log1000=log(1000.0); + + if (par.idummy!=2) + { + printf("WARNING: idummy should have been ==2 (no length correction)\n"); + exit(4); + } + + if(N_searched==0) N_searched=1; + if (v>=2) + printf("Calculate Pvalues as a function of query and template lengths and diversities...\n"); + Reset(); + while (!End()) + { + hit = ReadNext(); + + if (par.loc) + { + lamda = lamda_NN( log(q.L)/log1000, log(hit.L)/log1000, q.Neff_HMM/10.0, hit.Neff_HMM/10.0 ); + mu = mu_NN( log(q.L)/log1000, log(hit.L)/log1000, q.Neff_HMM/10.0, hit.Neff_HMM/10.0 ); +// if (v>=3 && nhits++<20) +// printf("hit=%-10.10s Lq=%-4i Lt=%-4i Nq=%5.2f Nt=%5.2f => lamda=%-6.3f mu=%-6.3f\n",hit.name,q.L,hit.L,q.Neff_HMM,hit.Neff_HMM,lamda,mu); + } + else + { + printf("WARNING: global calibration not yet implemented!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); + } + hit.logPval = logPvalue(hit.score,lamda,mu); + hit.Pval = Pvalue(hit.score,lamda,mu); + hit.Eval=exp(hit.logPval+log(N_searched)); +// hit.score_aass = hit.logPval/LAMDA-3.0 - hit.score_ss; // median(lamda)~0.45, median(mu)~3.0 in EVDs for scop20.1.63 HMMs + // P-value = 1- exp(-exp(-lamda*(Saa-mu))) => -lamda*(Saa-mu) = log(-log(1-Pvalue)) + hit.score_aass = (hit.logPval<-10.0? hit.logPval : log(-log(1-hit.Pval)) )/0.45 - fmin(lamda*hit.score_ss,fmax(0.0,0.2*(hit.score-8.0)))/0.45 - 3.0; + hit.score_sort = hit.score_aass; + hit.Probab = Probab(hit); + Overwrite(hit); + } + SortList(); + Reset(); + return; +} + +////////////////////////////////////////////////////////////////////////////// +/** + * @brief Calculate Pvalues from calibration of 0: query HMM, 1:template HMMs, 2: both + */ +void +HitList::GetPvalsFromCalibration(HMM& q) +{ + Hit hit; + char warn=0; + if(N_searched==0) N_searched=1; + if (v>=2) + { + switch (par.calm) + { + case 0: + printf("Using lamda=%-5.3f and mu=%-5.2f from calibrated query HMM %s. \n",q.lamda,q.mu,q.name); + printf("Note that HMMs need to be recalibrated when changing HMM-HMM alignment options.\n"); + break; + case 1: + printf("Using score distribution parameters lamda and mu from database HMMs \n"); + break; + case 2: + printf("Combining score distribution parameters lamda and mu from query and database HMMs\n"); + printf("Note that HMMs need to be recalibrated when changing HMM-HMM alignment options.\n"); + break; + } + } + Reset(); + while (!End()) + { + hit = ReadNext(); + if (par.calm==0 || (hit.logPvalt==0) ) + { + hit.logPval = logPvalue(hit.score,q.lamda,q.mu); + hit.Pval = Pvalue(hit.score,q.lamda,q.mu); + if (par.calm>0 && warn++<1 && v>=1) + printf("Warning: some template HMM (e.g. %s) are not calibrated. Using query calibration.\n",hit.name); + } + else if (par.calm==1) + { + hit.logPval = hit.logPvalt; + hit.Pval = hit.Pvalt; + } + else if (par.calm==2) + { + hit.logPval = 0.5*( logPvalue(hit.score,q.lamda,q.mu) + hit.logPvalt); + hit.Pval = sqrt( Pvalue(hit.score,q.lamda,q.mu) * hit.Pvalt); + if (v>=5) printf("Score: %7.1f lamda: %7.1f mu: %7.1f P-values: query-calibrated: %8.2G template-calibrated: %8.2G geometric mean: %8.2G\n",hit.score,q.lamda,q.mu,Pvalue(hit.score,q.lamda,q.mu),hit.Pvalt,hit.Pval); + } + + hit.Eval=exp(hit.logPval+log(N_searched)); +// hit.score_aass = hit.logPval/LAMDA-3.0 - hit.score_ss; // median(lamda)~0.45, median(mu)~3.0 in EVDs for scop20.1.63 HMMs + // P-value = 1- exp(-exp(-lamda*(Saa-mu))) => -lamda*(Saa-mu) = log(-log(1-Pvalue)) + hit.score_aass = (hit.logPval<-10.0? hit.logPval : log(-log(1-hit.Pval)) ) / 0.45-3.0 - fmin(hit.score_ss,fmax(0.0,0.5*hit.score-5.0)); + hit.score_sort = hit.score_aass; + hit.Probab = Probab(hit); + Overwrite(hit); + } + SortList(); + Reset(); + return; +} + + + + + + + + + +////////////////////////////////////////////////////////////////////////////// +////////////////////////////////////////////////////////////////////////////// +////////////////////////////////////////////////////////////////////////////// +// Transitive scoring +////////////////////////////////////////////////////////////////////////////// +////////////////////////////////////////////////////////////////////////////// +////////////////////////////////////////////////////////////////////////////// + + + + + + + +///////////////////////////////////////////////////////////////////////////////////// +/** + * @brief Calculate P-values and Probabilities from transitive scoring over whole database + */ +void +HitList::TransitiveScoring() +{ + void PrintMatrix(float** V, int N); + void PrintMatrix(double** V, int N); + + float** Z; // matrix of intra-db Z-scores Z_kl + float** C; // covariance matrix for Z_k: C_kl = sum_m=1^N (Z_km * Z_lm) + char** fold; // fold name of HMM k + char** fam; // family of HMM k + float* Prob; // probability of HMM k + float* Zq; // Zq[k] = Z-score between query and database HMM k + float* Ztq; // Ztq[k] = transitive Z-score from query to database HMM k: Ztq[k] = sum_l[ w_ql * Z_lk] / normalization_q + float* Zrq; // Zrq[k] = transitive Z-score from database HMM k to query: Zrq[k] = sum_l[ w_kl * Z_lq] / normalization_k + float* w; // unnormalized weight matrix; w[l] is w_ql or w_kl, respectively + int* ll; // ll[m] is the m'th index l for which Z_lq, Z_lk > Zmin_trans + int N; // dimension of weight matrix is NxN + int M; // number of HMMs l with Z_ql>Ztrans_min (or Z_lk>Ztrans_min, respectively) + int k,l,m,n; // indices for database HMMs + char name[NAMELEN]; + Hash<int> index(MAXPROF+7); // index{name} = index of HMM name in {1,...,N} + index.Null(-1); // Set int value to return when no data can be retrieved + Hash<int> excluded(13); // Hash containing names of superfamilies to be excluded from fit + excluded.Null(0); // Set int value to return when no data can be retrieved + Hit hit; + size_t unused; /* disable fread gcc warning */ + + // Read weights matrix W with index hash and names array + fprintf(stderr,"Reading in weights file\n"); + FILE* wfile = fopen(par.wfile,"rb"); + if (v>=1 && wfile==NULL) + { + fprintf(stderr,"Error: %s could not be opened: (N_searched=%i) ",par.wfile,N_searched); + perror("fopen"); + fprintf(stderr,"Skipping caclulation of transitive P-values\n"); + par.trans=0; + return; + } + unused = fread(&N,sizeof(int),1,wfile); // read matrix dimension (i.e. number of HMMs in database) + if (v>=1 && N!=N_searched) + { + fprintf(stderr,"Error: Number %i of HMMs in weight file is different from number %i of HMMs in searched databases. \n",N,N_searched); + fprintf(stderr,"Skipping caclulation of transitive P-values\n"); + par.trans=0; + return; + } + if (v>=2) fprintf(stderr,"Calculating transitive P-values for %i HMMs\n",N); + // Read names of HMMs (to specify mapping of HMM to matrix indices) + for (k=0; k<N; k++) + { + unused = fread(name,sizeof(char),IDLEN,wfile); + index.Add(name,k); + } + // Read symmetric Z-scores matrix + Z = new(float*[N]); + for (k=0; k<N; k++) + { + Z[k] = new(float[N]); + for (l=0; l<k; l++) Z[k][l] = Z[l][k]; + unused = fread(Z[k]+k,sizeof(float),N-k,wfile); + } + // Read symmetric covariance matrix + C = new(float*[N]); + for (k=0; k<N; k++) + { + C[k] = new(float[N]); + for (l=0; l<k; l++) C[k][l] = C[l][k]; + unused = fread(C[k]+k,sizeof(float),N-k,wfile); + } + fclose(wfile); + + // Allocate memory + Zq = new(float[N]); + Ztq = new(float[N]); + Zrq = new(float[N]); + fold = new(char*[N]); + fam = new(char*[N]); + Prob = new(float[N]); + ll = new(int[N]); + w = new(float[N]); + + // Transform P-values to normally distributed Z-scores and store in Zq vector + fprintf(stderr,"Transform P-values to Z-scores\n"); + float Zmax_neg = Score2Z( -log(MINEVALEXCL) + log(N_searched) ); // calculate Z-score corresponding to E-value MINEVALEXCL + float Zmin_trans = Score2Z( -log(par.Emax_trans) + log(N_searched) ); // calculate Z-score corresponding to E-value par.Emax_trans + printf("Zmax = %6.2f Zmin = %6.2f \n",Zmax_neg,Zmin_trans); + + Reset(); + while (!End()) + { + hit = ReadNext(); + if (hit.irep>1) continue; + k = index.Show(hit.name); + if (k<0) {fprintf(stderr,"Error: no index found in weights file for domain %s\n",hit.name); exit(1);} + if (hit.logPvalt<0) + Zq[k] = 0.5*Score2Z(fabs(hit.logPval)) + 0.5*Score2Z(fabs(hit.logPvalt)); // Zq[k] = 0.5*(Zkq + Zqk) + else + Zq[k] = Score2Z(fabs(hit.logPval)); // Zq[k] = Zqk +// printf("%4i %-10.10s logPvalt=%9g Zq=%9f\n",k,hit.name,hit.logPvalt,Zq[k]); +// if (isnan(Zq[k])) { +// fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n"); +// printf("%4i %-10.10s logPval=%9g logPvalt=%9g Zq=%9f\n",k,hit.name,hit.logPval,hit.logPvalt,Zq[k]); +// par.trans=0; +// return; +// } + if (Zq[k]>Zmax_neg) excluded.Add(hit.fold); + fold[k] = new(char[IDLEN]); + fam[k] = new(char[IDLEN]); + strcpy(fold[k],hit.fold); + strcpy(fam[k],hit.fam); + weight[k] = hit.weight; + Prob[k] = hit.Probab; + } + + if (v>=3) + { + excluded.Reset(); + while (!excluded.End()) + { + excluded.ReadNext(name); + printf("Excluded fold %s from fitting to Ztq\n",name); + } + } + + + //////////////////////////////////////////////////////////////// + // Calculate transitive score (query->l) Zt[l] + + // Construct vector ll of indices l for which Z_lq > Zmin_trans + m = 0; + for (l=0; l<N; l++) + if (Zq[l]>=Zmin_trans) ll[m++]=l; + M = m; // number of indices l for which Z_lq,Z_lk > Zmin_trans + +// for (m=0; m<M; m++) +// fprintf(stderr,"m=%-4i l=%-4i %-10.10s Zq[l]=%7f\n",m,ll[m],fam[ll[m]],Zq[ll[m]]); + + if (M<=1) + for (k=0; k<N; k++) Ztq[k]=0.0; + else + { + // Generate submatrix of C for indices l for which Z_lq,Z_lk > Zmin_trans + double** Csub = new(double*[M]); + double** Cinv = new(double*[M]); + for (m=0; m<M; m++) + { + Csub[m] = new(double[M]); + Cinv[m] = new(double[M]); + for (n=0; n<M; n++) + Csub[m][n] = double(C[ll[m]][ll[n]]); + } + + if (v>=3) + { + fprintf(stderr,"Covariance matrix\n"); + PrintMatrix(Csub,M); + } + + // Invert Csub + fprintf(stderr,"Calculate inverse of covariance submatrix\n"); + InvertMatrix(Cinv,Csub,M); + + if (v>=3) + { + fprintf(stderr,"Inverse covariance matrix\n"); + PrintMatrix(Cinv,M); + } + + // Calculate weights w[l] + for (m=0; m<M; m++) + { + double sum = 0.0; + for (n=0; n<M; n++) + sum += 1.0 * Cinv[m][n]; + w[m] = fmax(sum,0.0); + } + for (l=0; l<M; l++){ + delete[](Cinv[l]); (Cinv[l]) = NULL; + } + delete[](Cinv); (Cinv) = NULL; + + // Calculate Ztq[k] for all HMMs k + fprintf(stderr,"Calculate Ztq vector of transitive Z-scores\n"); + float norm = NormalizationFactor(Csub,w,M); + for (k=0; k<N; k++) + { + double sumZ = 0.0; + for (m=0; m<M; m++) + sumZ += w[m] * Z[ll[m]][k]; + Ztq[k] = sumZ/norm; + } + + for (l=0; l<M; l++){ + delete[](Csub[l]); (Csub[l]) = NULL; + } + delete[](Csub); (Csub) = NULL; + } + + //////////////////////////////////////////////////////////////// + // Calculate reverse transitive score (l->query-) Zrq[l] + + fprintf(stderr,"Calculate Zrq vector of transitive Z-scores\n"); + for (k=0; k<N; k++) + { + // Construct vector ll of indices l for which Z_lk > Zmin_tran + m = 0; + for (l=0; l<N; l++) + if (Z[l][k]+Z[k][l]>=2*Zmin_trans) ll[m++]=l; + int M = m; // number of indices l for which Z_lq,Z_lk > Zmin_tran + + +// fprintf(stderr,"\nfam[k]: %s\n",fam[k]); +// for (m=0; m<M; m++) +// printf(stderr,"m=%-4i k=%-4i l=%-4i %-10.10s Zq[l]=%7f Z_lk=%7f \n",m,k,ll[m],fold[ll[m]],Zq[ll[m]],Z[k][ll[m]]); + + if (M<=1) + { + Zrq[k] = Zq[k]; + } + else + { + // Generate submatrix of C for indices l for which Z_lq,Z_lk > Zmin_trans + double** Csub = new(double*[M]); + for (m=0; m<M; m++) + { + Csub[m] = new(double[M]); + for (n=0; n<M; n++) + Csub[m][n] = double(C[ll[m]][ll[n]]); + } +// fprintf(stderr,"Covariance matrix\n"); +// PrintMatrix(Csub,M); + + if (M==2) + { + for (m=0; m<M; m++) w[m] = 1.0/M; + } + else + { + + double** Cinv = new(double*[M]); + for (m=0; m<M; m++) Cinv[m] = new(double[M]); + + // Invert Csub + InvertMatrix(Cinv,Csub,M); + + // fprintf(stderr,"Inverse covariance matrix\n"); + // PrintMatrix(Cinv,M); + + // Calculate weights w[l] + for (m=0; m<M; m++) + { + double sum = 0.0; + for (n=0; n<M; n++) + sum += 1.0 * Cinv[m][n]; + w[m] = fmax(sum,0.0); + } + +// for (m=0; m<M; m++) fprintf(stderr,"w[%i]=%8.2g\n",m,w[m]); + + for (l=0; l<M; l++){ + delete[](Cinv[l]); (Cinv[l]) = NULL; + } + delete[](Cinv); (Cinv) = NULL; + } + + // Calculate Zrq[k] and normalize + float norm = NormalizationFactor(Csub,w,M); + double sumZ = 0.0; + for (m=0; m<M; m++) + sumZ += w[m] * Zq[ll[m]]; + Zrq[k] = sumZ/norm; + + for (l=0; l<M; l++){ + delete[](Csub[l]); (Csub[l]) = NULL; + } + delete[](Csub); (Csub) = NULL; + } + +// fprintf(stderr,"\nZq[k]=%8.2g Zq1[k]=%8.2g\n",Zq[k],Zrq[k]); + } + + // Total Z-score = weighted sum over original Z-score, forward transitive and reverse transitive Z-score + for (k=0; k<N; k++) + { + float Zqtot = Zq[k] + par.wtrans*(Ztq[k]+Zrq[k]); +// if (isnan(Zqtot)) +// { +// fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n"); +// printf("%4i %-10.10s Zq=%6.2f Ztq=%6.2f Zrq=%6.2f Zqtot=%6.2f\n",k,fam[k],Zq[k],Ztq[k],Zrq[k],Zqtot); +// par.trans=0; +// return; +// } + if (v>=2 && Zq[k] + Zqtot > 2*Zmin_trans) { + printf("%4i %-10.10s Zq=%6.2f Ztq=%6.2f Zrq=%6.2f -> Zqtot=%6.2f\n",k,fam[k],Zq[k],Ztq[k],Zrq[k],Zqtot); + } + Ztq[k] = Zqtot; + } + + // Calculate mean and standard deviation of Z1q + fprintf(stderr,"Calculate mean and standard deviation of Ztq\n"); + double sumw=0.0; + double sumZ=0.0; + double sumZ2=0.0; + for (k=0; k<N; k++) + { + if (excluded.Contains(fold[k])) continue; + sumw += weight[k]; + sumZ += weight[k]*Ztq[k]; + sumZ2 += weight[k]*Ztq[k]*Ztq[k]; +// if (isnan(sumZ)) +// { +// fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n"); +// printf("%4i %-10.10s Zq=%9f Zrq=%9f Ztq=%9f\n",k,fam[k],Zq[k],Zrq[k],Ztq[k]); +// par.trans=0; +// return; +// } + } + float mu = sumZ/sumw; + float sigma = sqrt(sumZ2/sumw-mu*mu); + if (v>=2) printf("mu(Ztq)=%6.3f sigma(Ztq)=%6.2f\n",mu,sigma); + sigma *= 1.01;// correct different fitting of EVD and normal variables + + // Normalize Ztq and calculate P1-values + fprintf(stderr,"Normalize Ztq and calculate P1-values\n"); + Reset(); + while (!End()) + { + hit = ReadNext(); + hit.logPval = -Z2Score((Ztq[index.Show(hit.name)]-mu)/sigma); + hit.E1val = N_searched*(hit.logPval<-100.0? 0.0 : exp(hit.logPval)); + // P-value = 1- exp(-exp(-lamda*(Saa-mu))) => -lamda*(Saa-mu) = log(-log(1-Pvalue)) + hit.score_aass = (hit.logPval<-10.0? hit.logPval : log(-log(1-exp(hit.logPval))) ) / 0.45-3.0 - hit.score_ss; + hit.Probab = Probab(hit); + hit.score_sort = hit.logPval; + Overwrite(hit); // copy hit object into current position of hitlist + } + + for (k=0; k<N; k++){ + delete[](Z[k]); (Z[k]) = NULL; + } + for (k=0; k<N; k++){ + delete[](C[k]); (C[k]) = NULL; + } + for (k=0; k<N; k++){ + delete[](fold[k]); (fold[k]) = NULL; + } + for (k=0; k<N; k++){ + delete[](fam[k]); (fam[k]) = NULL; + } + delete[](C); (C) = NULL; + delete[](Z); (Z) = NULL; + delete[](fold); (fold) = NULL; + delete[](fam); (fam) = NULL; + delete[](Prob); (Prob) = NULL; + delete[](ll); (ll) = NULL; + delete[](Zq); (Zq) = NULL; + delete[](Ztq); (Ztq) = NULL; +} + + + +////////////////////////////////////////////////////////////////////////////// +/** + * @brief Calculate P-values and Probabilities from transitive scoring over whole database + */ +void +HitList::TransitiveScoring2() +{ + void PrintMatrix(float** V, int N); + void PrintMatrix(double** V, int N); + + float** Z; // matrix of intra-db Z-scores Z_kl + float** C; // covariance matrix for Z_k: C_kl = sum_m=1^N (Z_km * Z_lm) + char** fold; // fold name of HMM k + char** fam; // family of HMM k + float* Prob; // probability of HMM k + float* Zq; // Zq[k] = Z-score between query and database HMM k + float* Ztq; // Ztq[k] = transitive Z-score from query to database HMM k: Ztq[k] = sum_l[ w_ql * Z_lk] / normalization_q + float* Zrq; // Zrq[k] = transitive Z-score from database HMM k to query: Zrq[k] = sum_l[ w_kl * Z_lq] / normalization_k + float* w; // unnormalized weight matrix; w[l] is w_ql or w_kl, respectively + int* ll; // ll[m] is the m'th index l for which Z_lq, Z_lk > Zmin_trans + int N; // dimension of weight matrix is NxN + int M; // number of HMMs l with Z_ql>Ztrans_min (or Z_lk>Ztrans_min, respectively) + int k,l,m,n; // indices for database HMMs + char name[NAMELEN]; + Hash<int> index(MAXPROF+7); // index{name} = index of HMM name in {1,...,N} + index.Null(-1); // Set int value to return when no data can be retrieved + Hash<int> excluded(13); // Hash containing names of superfamilies to be excluded from fit + excluded.Null(0); // Set int value to return when no data can be retrieved + Hit hit; + size_t unused; /* disable fread gcc warning */ + + // Read weights matrix W with index hash and names array + fprintf(stderr,"Reading in weights file\n"); + FILE* wfile = fopen(par.wfile,"rb"); + if (v>=1 && wfile==NULL) + { + fprintf(stderr,"Error: %s could not be opened: (N_searched=%i) ",par.wfile,N_searched); + perror("fopen"); + fprintf(stderr,"Skipping caclulation of transitive P-values\n"); + par.trans=0; + return; + } + unused = fread(&N,sizeof(int),1,wfile); // read matrix dimension (i.e. number of HMMs in database) + if (v>=1 && N!=N_searched) + { + fprintf(stderr,"Error: Number %i of HMMs in weight file is different from number %i of HMMs in searched databases. \n",N,N_searched); + fprintf(stderr,"Skipping caclulation of transitive P-values\n"); + par.trans=0; + return; + } + if (v>=2) fprintf(stderr,"Calculating transitive P-values for %i HMMs\n",N); + // Read names of HMMs (to specify mapping of HMM to matrix indices) + for (k=0; k<N; k++) + { + unused = fread(name,sizeof(char),IDLEN,wfile); + index.Add(name,k); + } + // Read symmetric Z-scores matrix + Z = new(float*[N]); + for (k=0; k<N; k++) + { + Z[k] = new(float[N]); + for (l=0; l<k; l++) Z[k][l] = Z[l][k]; + unused = fread(Z[k]+k,sizeof(float),N-k,wfile); + } + // Read symmetric covariance matrix + C = new(float*[N]); + for (k=0; k<N; k++) + { + C[k] = new(float[N]); + for (l=0; l<k; l++) C[k][l] = C[l][k]; + unused = fread(C[k]+k,sizeof(float),N-k,wfile); + } + fclose(wfile); + + // Allocate memory + Zq = new(float[N]); + Ztq = new(float[N]); + Zrq = new(float[N]); + fold = new(char*[N]); + fam = new(char*[N]); + Prob = new(float[N]); + ll = new(int[N]); + w = new(float[N]); + + // Transform P-values to normally distributed Z-scores and store in Zq vector + fprintf(stderr,"Transform P-values to Z-scores\n"); + float Zmax_neg = Score2Z( -log(MINEVALEXCL) + log(N_searched) ); // calculate Z-score corresponding to E-value MINEVALEXCL + float Zmin_trans = Score2Z( -log(par.Emax_trans) + log(N_searched) ); // calculate Z-score corresponding to E-value par.Emax_trans + printf("Zmax = %6.2f Zmin = %6.2f \n",Zmax_neg,Zmin_trans); + + Reset(); + while (!End()) + { + hit = ReadNext(); + if (hit.irep>1) continue; + k = index.Show(hit.name); + if (k<0) {fprintf(stderr,"Error: no index found in weights file for domain %s\n",hit.name); exit(1);} + if (hit.logPvalt<0) + Zq[k] = 0.5*Score2Z(fabs(hit.logPval)) + 0.5*Score2Z(fabs(hit.logPvalt)); // Zq[k] = 0.5*(Zkq + Zqk) + else + Zq[k] = Score2Z(fabs(hit.logPval)); // Zq[k] = Zqk +// printf("%4i %-10.10s logPvalt=%9g Zq=%9f\n",k,hit.name,hit.logPvalt,Zq[k]); +// if (isnan(Zq[k])) +// { +// fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n"); +// printf("%4i %-10.10s logPval=%9g logPvalt=%9g Zq=%9f\n",k,hit.name,hit.logPval,hit.logPvalt,Zq[k]); +// par.trans=0; +// return; +// } + if (Zq[k]>Zmax_neg) excluded.Add(hit.fold); + fold[k] = new(char[IDLEN]); + fam[k] = new(char[IDLEN]); + strcpy(fold[k],hit.fold); + strcpy(fam[k],hit.fam); + weight[k] = hit.weight; + Prob[k] = hit.Probab; + } + + if (v>=3) + { + excluded.Reset(); + while (!excluded.End()) + { + excluded.ReadNext(name); + printf("Excluded fold %s from fitting to Ztq\n",name); + } + } + + + //////////////////////////////////////////////////////////////// + // Calculate transitive score (query->l) Zt[l] + + // Construct vector ll of indices l for which Z_lq > Zmin_trans + m = 0; + for (l=0; l<N; l++) + if (Zq[l]>=Zmin_trans) ll[m++]=l; + M = m; // number of indices l for which Z_lq,Z_lk > Zmin_trans + +// for (m=0; m<M; m++) +// fprintf(stderr,"m=%-4i l=%-4i %-10.10s Zq[l]=%7f\n",m,ll[m],fam[ll[m]],Zq[ll[m]]); + + if (M<=1) + for (k=0; k<N; k++) Ztq[k]=0.0; + else + { + // Generate submatrix of C for indices l for which Z_lq,Z_lk > Zmin_trans + double** Csub = new(double*[M]); + double** Cinv = new(double*[M]); + for (m=0; m<M; m++) + { + Csub[m] = new(double[M]); + Cinv[m] = new(double[M]); + for (n=0; n<M; n++) + Csub[m][n] = double(C[ll[m]][ll[n]]); + } + + if (v>=3) + { + fprintf(stderr,"Covariance matrix\n"); + PrintMatrix(Csub,M); + } + +// // Invert Csub +// fprintf(stderr,"Calculate inverse of covariance submatrix\n"); +// InvertMatrix(Cinv,Csub,M); + +// if (v>=3) +// { +// fprintf(stderr,"Inverse covariance matrix\n"); +// PrintMatrix(Cinv,M); +// } + + + // Calculate weights w[l] + for (m=0; m<M; m++) + { + double sum = 0.0; + for (n=0; n<M; n++) + sum += 1.0 * Csub[m][n]; + printf("w[%4i] = %-8.5f\n",ll[m],1.0/sum); + w[m] = (sum>0? Zq[ll[m]] / sum : 0.0); + } + for (l=0; l<M; l++){ + delete[](Cinv[l]); (Cinv[l]) = NULL; + } + delete[](Cinv); (Cinv) = NULL; + + // Calculate Ztq[k] for all HMMs k + fprintf(stderr,"Calculate Ztq vector of transitive Z-scores\n"); + float norm = NormalizationFactor(Csub,w,M); + for (k=0; k<N; k++) + { + double sumZ = 0.0; + for (m=0; m<M; m++) + sumZ += w[m] * Z[ll[m]][k]; + Ztq[k] = sumZ/norm; + } + + for (l=0; l<M; l++){ + delete[](Csub[l]); (Csub[l]) = NULL; + } + delete[](Csub); (Csub) = NULL; + } + + //////////////////////////////////////////////////////////////// + // Calculate reverse transitive score (l->query-) Zrq[l] + + fprintf(stderr,"Calculate Zrq vector of transitive Z-scores\n"); + for (k=0; k<N; k++) + { + // Construct vector ll of indices l for which Z_lk > Zmin_tran + m = 0; + for (l=0; l<N; l++) + if (Z[l][k]+Z[k][l]>=2*Zmin_trans) ll[m++]=l; + int M = m; // number of indices l for which Z_lq,Z_lk > Zmin_tran + + +// fprintf(stderr,"\nfam[k]: %s\n",fam[k]); +// for (m=0; m<M; m++) +// printf(stderr,"m=%-4i k=%-4i l=%-4i %-10.10s Zq[l]=%7f Z_lk=%7f \n",m,k,ll[m],fold[ll[m]],Zq[ll[m]],Z[k][ll[m]]); + + if (M<=1) + { + Zrq[k] = Zq[k]; + } + else + { + // Generate submatrix of C for indices l for which Z_lq,Z_lk > Zmin_trans + double** Csub = new(double*[M]); + for (m=0; m<M; m++) + { + Csub[m] = new(double[M]); + for (n=0; n<M; n++) + Csub[m][n] = double(C[ll[m]][ll[n]]); + } +// fprintf(stderr,"Covariance matrix\n"); +// PrintMatrix(Csub,M); + + if (M<=2) + { + for (m=0; m<M; m++) w[m] = 1.0/M; + } + else + { + + double** Cinv = new(double*[M]); + for (m=0; m<M; m++) Cinv[m] = new(double[M]); + +// // Invert Csub +// InvertMatrix(Cinv,Csub,M); + +// // fprintf(stderr,"Inverse covariance matrix\n"); +// // PrintMatrix(Cinv,M); + + // Calculate weights w[l] + for (m=0; m<M; m++) + { + double sum = 0.0; + for (n=0; n<M; n++) + sum += 1.0 * Csub[m][n]; + w[m] = (sum>0? Z[ll[m]][k] / sum : 0.0); + } + +// for (m=0; m<M; m++) fprintf(stderr,"w[%i]=%8.2g\n",m,w[m]); + + for (l=0; l<M; l++){ + delete[](Cinv[l]); (Cinv[l]) = NULL; + } + delete[](Cinv); (Cinv) = NULL; + } + + // Calculate Zrq[k] and normalize + float norm = NormalizationFactor(Csub,w,M); + double sumZ = 0.0; + for (m=0; m<M; m++) + sumZ += w[m] * Zq[ll[m]]; + Zrq[k] = sumZ/norm; + + for (l=0; l<M; l++){ + delete[](Csub[l]); (Csub[l]) = NULL; + } + delete[](Csub); (Csub) = NULL; + } + +// fprintf(stderr,"\nZq[k]=%8.2g Zq1[k]=%8.2g\n",Zq[k],Zrq[k]); + } + + // Total Z-score = weighted sum over original Z-score, forward transitive and reverse transitive Z-score + for (k=0; k<N; k++) + { + float Zqtot = Zq[k] + par.wtrans*(Ztq[k]+Zrq[k]); +// if (isnan(Zqtot)) +// { +// fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n"); +// printf("%4i %-10.10s Zq=%6.2f Ztq=%6.2f Zrq=%6.2f Zqtot=%6.2f\n",k,fam[k],Zq[k],Ztq[k],Zrq[k],Zqtot); +// par.trans=0; +// return; +// } + if (v>=2 && Zq[k] + Zqtot > 2*Zmin_trans) { + printf("%4i %-10.10s Zq=%6.2f Ztq=%6.2f Zrq=%6.2f -> Zqtot=%6.2f\n",k,fam[k],Zq[k],Ztq[k],Zrq[k],Zqtot); + } + Ztq[k] = Zqtot; + } + + // Calculate mean and standard deviation of Z1q + fprintf(stderr,"Calculate mean and standard deviation of Ztq\n"); + double sumw=0.0; + double sumZ=0.0; + double sumZ2=0.0; + for (k=0; k<N; k++) + { + if (excluded.Contains(fold[k])) continue; + sumw += weight[k]; + sumZ += weight[k]*Ztq[k]; + sumZ2 += weight[k]*Ztq[k]*Ztq[k]; +// if (isnan(sumZ)) +// { +// fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n"); +// printf("%4i %-10.10s Zq=%9f Zrq=%9f Ztq=%9f\n",k,fam[k],Zq[k],Zrq[k],Ztq[k]); +// par.trans=0; +// return; +// } + } + float mu = sumZ/sumw; + float sigma = sqrt(sumZ2/sumw-mu*mu); + if (v>=2) printf("mu(Ztq)=%6.3f sigma(Ztq)=%6.2f\n",mu,sigma); + sigma *= 1.01;// correct different fitting of EVD and normal variables + + // Normalize Ztq and calculate P1-values + fprintf(stderr,"Normalize Ztq and calculate P1-values\n"); + Reset(); + while (!End()) + { + hit = ReadNext(); + hit.logPval = -Z2Score((Ztq[index.Show(hit.name)]-mu)/sigma); + hit.E1val = N_searched*(hit.logPval<-100? 0.0 : exp(hit.logPval)); + // P-value = 1- exp(-exp(-lamda*(Saa-mu))) => -lamda*(Saa-mu) = log(-log(1-Pvalue)) + hit.score_aass = (hit.logPval<-10.0? hit.logPval : log(-log(1-exp(hit.logPval))) ) / 0.45-3.0 - hit.score_ss; + hit.Probab = Probab(hit); + hit.score_sort = hit.logPval; + Overwrite(hit); // copy hit object into current position of hitlist + } + + for (k=0; k<N; k++){ + delete[](Z[k]); (Z[k]) = NULL; + } + for (k=0; k<N; k++){ + delete[](C[k]); (C[k]) = NULL; + } + for (k=0; k<N; k++){ + delete[](fold[k]); (fold[k]) = NULL; + } + for (k=0; k<N; k++){ + delete[](fam[k]); (fam[k]) = NULL; + } + delete[](C); (C) = NULL; + delete[](Z); (Z) = NULL; + delete[](fold); (fold) = NULL; + delete[](fam); (fam) = NULL; + delete[](Prob); (Prob) = NULL; + delete[](ll); (ll) = NULL; + delete[](Zq); (Zq) = NULL; + delete[](Ztq); (Ztq) = NULL; +} + + +///////////////////////////////////////////////////////////////////////////////////// +/** + * @brief Calculate P-values and Probabilities from transitive scoring over whole database + * Like TransitiveScoring(), + * but in transitive scoring, Z1_qk = sum_l w_l*Z_lk, use all l:E_ql<=E_qk + * and in reverse scoring, Z1_kr = sum_l w_l*Z_lq, use all l:E_kl<=E_kq + */ +void +HitList::TransitiveScoring3() +{ + void PrintMatrix(float** V, int N); + void PrintMatrix(double** V, int N); + + float** Z; // matrix of intra-db Z-scores Z_kl + float** C; // covariance matrix for Z_k: C_kl = sum_m=1^N (Z_km * Z_lm) + char** fold; // fold name of HMM k + char** fam; // family of HMM k + float* Prob; // probability of HMM k + float* Zq; // Zq[k] = Z-score between query and database HMM k + float* Ztq; // Ztq[k] = transitive Z-score from query to database HMM k: Ztq[k] = sum_l[ w_ql * Z_lk] / normalization_q + float* Zrq; // Zrq[k] = transitive Z-score from database HMM k to query: Zrq[k] = sum_l[ w_kl * Z_lq] / normalization_k + float* w; // unnormalized weight matrix; w[l] is w_ql or w_kl, respectively + int* ll; // ll[m] is the m'th index l for which Z_lq, Z_lk > Zmin_trans + int N; // dimension of weight matrix is NxN + int M; // number of HMMs l with Z_ql>Ztrans_min (or Z_lk>Ztrans_min, respectively) + int k,l,m,n; // indices for database HMMs + char name[NAMELEN]; + Hash<int> index(MAXPROF+7); // index{name} = index of HMM name in {1,...,N} + index.Null(-1); // Set int value to return when no data can be retrieved + Hash<int> excluded(13); // Hash containing names of superfamilies to be excluded from fit + excluded.Null(0); // Set int value to return when no data can be retrieved + Hit hit; + size_t unused; /* disable fread gcc warning */ + + // Read weights matrix W with index hash and names array + fprintf(stderr,"Reading in weights file\n"); + FILE* wfile = fopen(par.wfile,"rb"); + if (v>=1 && wfile==NULL) + { + fprintf(stderr,"Error: %s could not be opened: (N_searched=%i) ",par.wfile,N_searched); + perror("fopen"); + fprintf(stderr,"Skipping caclulation of transitive P-values\n"); + par.trans=0; + return; + } + unused = fread(&N,sizeof(int),1,wfile); // read matrix dimension (i.e. number of HMMs in database) + if (v>=1 && N!=N_searched) + { + fprintf(stderr,"Error: Number %i of HMMs in weight file is different from number %i of HMMs in searched databases. \n",N,N_searched); + fprintf(stderr,"Skipping caclulation of transitive P-values\n"); + par.trans=0; + return; + } + if (v>=2) fprintf(stderr,"Calculating transitive P-values for %i HMMs\n",N); + // Read names of HMMs (to specify mapping of HMM to matrix indices) + for (k=0; k<N; k++) + { + unused = fread(name,sizeof(char),IDLEN,wfile); + index.Add(name,k); + } + // Read symmetric Z-scores matrix + Z = new(float*[N]); + for (k=0; k<N; k++) + { + Z[k] = new(float[N]); + for (l=0; l<k; l++) Z[k][l] = Z[l][k]; + unused = fread(Z[k]+k,sizeof(float),N-k,wfile); + } + // Read symmetric covariance matrix + C = new(float*[N]); + for (k=0; k<N; k++) + { + C[k] = new(float[N]); + for (l=0; l<k; l++) C[k][l] = C[l][k]; + unused = fread(C[k]+k,sizeof(float),N-k,wfile); + } + fclose(wfile); + + // Allocate memory + Zq = new(float[N]); + Ztq = new(float[N]); + Zrq = new(float[N]); + fold = new(char*[N]); + fam = new(char*[N]); + Prob = new(float[N]); + ll = new(int[N]); + w = new(float[N]); + + // Transform P-values to normally distributed Z-scores and store in Zq vector + fprintf(stderr,"Transform P-values to Z-scores\n"); + float Zmax_neg = Score2Z( -log(MINEVALEXCL) + log(N_searched) ); // calculate Z-score corresponding to E-value MINEVALEXCL + float Zmin_trans = Score2Z( -log(par.Emax_trans) + log(N_searched) ); // calculate Z-score corresponding to E-value par.Emax_trans + printf("Zmax = %6.2f Zmin = %6.2f \n",Zmax_neg,Zmin_trans); + + Reset(); + while (!End()) + { + hit = ReadNext(); + if (hit.irep>1) continue; + k = index.Show(hit.name); + if (k<0) {fprintf(stderr,"Error: no index found in weights file for domain %s\n",hit.name); exit(1);} + if (hit.logPvalt<0) + Zq[k] = 0.5*Score2Z(fabs(hit.logPval)) + 0.5*Score2Z(fabs(hit.logPvalt)); // Zq[k] = 0.5*(Zkq + Zqk) + else + Zq[k] = Score2Z(fabs(hit.logPval)); // Zq[k] = Zqk +// printf("%4i %-10.10s logPvalt=%9g Zq=%9f\n",k,hit.name,hit.logPvalt,Zq[k]); +// if (isnan(Zq[k])) +// { +// fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n"); +// printf("%4i %-10.10s logPval=%9g logPvalt=%9g Zq=%9f\n",k,hit.name,hit.logPval,hit.logPvalt,Zq[k]); +// par.trans=0; +// return; +// } + if (Zq[k]>Zmax_neg) excluded.Add(hit.fold); + fold[k] = new(char[IDLEN]); + fam[k] = new(char[IDLEN]); + strcpy(fold[k],hit.fold); + strcpy(fam[k],hit.fam); + weight[k] = hit.weight; + Prob[k] = hit.Probab; + } + + if (v>=3) + { + excluded.Reset(); + while (!excluded.End()) + { + excluded.ReadNext(name); + printf("Excluded fold %s from fitting to Ztq\n",name); + } + } + + + //////////////////////////////////////////////////////////////// + // Calculate transitive score (query->l) Ztq[l] + + fprintf(stderr,"Calculate Ztq vector of transitive Z-scores\n"); + for (k=0; k<N; k++) + { + // Construct vector ll of indices l for which Z_lq OR Z_lk >= max(Z_kq,Zmin_trans) + float Zmink = fmax(Zq[k],Zmin_trans); + for (m=l=0; l<N; l++) + if (Zq[l]>=Zmink) ll[m++]=l; + M = m; // number of indices l for which Z_lq OR Z_lk >= max(Z_kq,Zmin_trans) + +// for (m=0; m<M; m++) +// fprintf(stderr,"m=%-4i l=%-4i %-10.10s Zq[l]=%7f\n",m,ll[m],fam[ll[m]],Zq[ll[m]]); + + if (M<=1) + { + Ztq[k]=Zq[k]; + } + else + { + // Generate submatrix of C for indices l for which Z_lq,Z_lk > Zmin_trans + double** Csub = new(double*[M]); + double** Cinv = new(double*[M]); + for (m=0; m<M; m++) + { + Csub[m] = new(double[M]); + Cinv[m] = new(double[M]); + for (n=0; n<M; n++) + Csub[m][n] = double(C[ll[m]][ll[n]]); + } + +// fprintf(stderr,"Covariance matrix\n"); +// PrintMatrix(Csub,M); + + // Invert Csub +// fprintf(stderr,"Calculate inverse of covariance submatrix\n"); + InvertMatrix(Cinv,Csub,M); + +// fprintf(stderr,"Inverse covariance matrix\n"); +// PrintMatrix(Cinv,M); + + // Calculate weights w[l] + for (m=0; m<M; m++) + { + double sum = 0.0; + for (n=0; n<M; n++) + sum += 1.0 * Cinv[m][n]; // signal ~ sum_l w_l*Z_lq ! + w[m] = fmax(sum,0.0); + } + for (l=0; l<M; l++){ + delete[](Cinv[l]); (Cinv[l]) = NULL; + } + delete[](Cinv); (Cinv) = NULL; + + // Calculate Ztq[k] + float norm = NormalizationFactor(Csub,w,M); + double sumZ = 0.0; + for (m=0; m<M; m++) + sumZ += w[m] * fmin(Zq[ll[m]],Z[ll[m]][k]); +// sumZ += w[m] * Z[ll[m]][k]; + Ztq[k] = sumZ/norm; + + for (l=0; l<M; l++){ + delete[](Csub[l]); (Csub[l]) = NULL; + } + delete[](Csub); (Csub) = NULL; + } + } + + //////////////////////////////////////////////////////////////// + // Calculate reverse transitive score (l->query-) Zrq[l] + + fprintf(stderr,"Calculate Zrq vector of transitive Z-scores\n"); + for (k=0; k<N; k++) + { + // Construct vector ll of indices l for which Z_lk > Zmin_tran + float Zmink = fmax(Zq[k],Zmin_trans); + for (m=l=0; l<N; l++) + if (Z[l][k]>=Zmink) ll[m++]=l; + int M = m; // number of indices l for which Z_lq,Z_lk > Zmin_tran + + +// fprintf(stderr,"\nfam[k]: %s\n",fam[k]); +// for (m=0; m<M; m++) +// printf(stderr,"m=%-4i k=%-4i l=%-4i %-10.10s Zq[l]=%7f Z_lk=%7f \n",m,k,ll[m],fold[ll[m]],Zq[ll[m]],Z[k][ll[m]]); + + if (M<=1) + { + Zrq[k] = Zq[k]; + } + else + { + // Generate submatrix of C for indices l for which Z_lq,Z_lk > Zmin_trans + double** Csub = new(double*[M]); + for (m=0; m<M; m++) + { + Csub[m] = new(double[M]); + for (n=0; n<M; n++) + Csub[m][n] = double(C[ll[m]][ll[n]]); + } +// fprintf(stderr,"Covariance matrix\n"); +// PrintMatrix(Csub,M); + + if (M==2) + { + for (m=0; m<M; m++) w[m] = 1.0/M; + } + else + { + + double** Cinv = new(double*[M]); + for (m=0; m<M; m++) Cinv[m] = new(double[M]); + + // Invert Csub + InvertMatrix(Cinv,Csub,M); + + // fprintf(stderr,"Inverse covariance matrix\n"); + // PrintMatrix(Cinv,M); + + // Calculate weights w[l] + for (m=0; m<M; m++) + { + double sum = 0.0; + for (n=0; n<M; n++) + sum += 1.0 * Cinv[m][n]; // signal ~ sum_l w_l*Z_lq ! + w[m] = fmax(sum,0.0); + } +// for (m=0; m<M; m++) fprintf(stderr,"w[%i]=%8.2g\n",m,w[m]); + for (l=0; l<M; l++){ + delete[](Cinv[l]); (Cinv[l]) = NULL; + } + delete[](Cinv); (Cinv) = NULL; + } + + // Calculate Zrq[k] and normalize + float norm = NormalizationFactor(Csub,w,M); + double sumZ = 0.0; + for (m=0; m<M; m++) + sumZ += w[m] * fmin(Zq[ll[m]],Z[ll[m]][k]); +// sumZ += w[m] * Zq[ll[m]]; + Zrq[k] = sumZ/norm; + + for (l=0; l<M; l++){ + delete[](Csub[l]); (Csub[l]) = NULL; + } + delete[](Csub); (Csub) = NULL; + } + +// fprintf(stderr,"\nZq[k]=%8.2g Zq1[k]=%8.2g\n",Zq[k],Zrq[k]); + } + + // Total Z-score = weighted sum over original Z-score, forward transitive and reverse transitive Z-score + for (k=0; k<N; k++) + { + + float Zqtot = Zq[k] + par.wtrans*(Ztq[k]+Zrq[k]); +// if (isnan(Zqtot)) +// { +// fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n"); +// printf("%4i %-10.10s Zq=%6.2f Ztq=%6.2f Zrq=%6.2f -> Zqtot=%6.2f\n",k,fam[k],Zq[k],Ztq[k],Zrq[k],Zqtot); +// par.trans=0; +// return; +// } + if (v>=3 && Zqtot > 2*Zmin_trans) { + printf("%4i %-10.10s Zq=%6.2f Ztq=%6.2f Zrq=%6.2f -> Zqtot=%6.2f\n",k,fam[k],Zq[k],Ztq[k],Zrq[k],Zqtot); + } + Ztq[k] = Zqtot; + } + + // Calculate mean and standard deviation of Z1q + fprintf(stderr,"Calculate mean and standard deviation of Ztq\n"); + double sumw=0.0; + double sumZ=0.0; + double sumZ2=0.0; + for (k=0; k<N; k++) + { + if (excluded.Contains(fold[k])) continue; + sumw += weight[k]; + sumZ += weight[k]*Ztq[k]; + sumZ2 += weight[k]*Ztq[k]*Ztq[k]; +// if (isnan(sumZ)) +// { +// fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n"); +// printf("%4i %-10.10s Zq=%9f Zrq=%9f Ztq=%9f\n",k,fam[k],Zq[k],Zrq[k],Ztq[k]); +// par.trans=0; +// return; +// } + } + float mu = sumZ/sumw; + float sigma = sqrt(sumZ2/sumw-mu*mu); + if (v>=2) printf("mu(Ztq)=%6.3f sigma(Ztq)=%6.2f\n",mu,sigma); + sigma *= 1.01;// correct different fitting of EVD and normal variables + + // Normalize Ztq and calculate P1-values + fprintf(stderr,"Normalize Ztq and calculate P1-values\n"); + Reset(); + while (!End()) + { + hit = ReadNext(); + hit.logPval = -Z2Score((Ztq[index.Show(hit.name)]-mu)/sigma); + hit.E1val = N_searched*(hit.logPval<-100? 0.0 : exp(hit.logPval)); + // P-value = 1- exp(-exp(-lamda*(Saa-mu))) => -lamda*(Saa-mu) = log(-log(1-Pvalue)) + hit.score_aass = (hit.logPval<-10.0? hit.logPval : log(-log(1-exp(hit.logPval))) ) / 0.45-3.0 - hit.score_ss; + hit.Probab = Probab(hit); + hit.score_sort = hit.logPval; + Overwrite(hit); // copy hit object into current position of hitlist + } + + for (k=0; k<N; k++){ + delete[](Z[k]); (Z[k]) = NULL; + } + for (k=0; k<N; k++){ + delete[](C[k]); (C[k]) = NULL; + } + for (k=0; k<N; k++){ + delete[](fold[k]); (fold[k]) = NULL; + } + for (k=0; k<N; k++){ + delete[](fam[k]); (fam[k]) = NULL; + } + delete[](C); (C) = NULL; + delete[](Z); (Z) = NULL; + delete[](fold); (fold) = NULL; + delete[](fam); (fam) = NULL; + delete[](Prob); (Prob) = NULL; + delete[](ll); (ll) = NULL; + delete[](Zq); (Zq) = NULL; + delete[](Ztq); (Ztq) = NULL; + +} + + +///////////////////////////////////////////////////////////////////////////////////// +/** + * @brief Calculate P-values and Probabilities from transitive scoring over whole database + * Best tested scheme. Use fmin(Zq[ll[m]],Z[ll[m]][k]) + * and fast approximation for weights (not inverse covariance matrix) + */ +void +HitList::TransitiveScoring4() +{ + void PrintMatrix(float** V, int N); + void PrintMatrix(double** V, int N); + + float** Z; // matrix of intra-db Z-scores Z_kl + float** C; // covariance matrix for Z_k: C_kl = sum_m=1^N (Z_km * Z_lm) + char** fold; // fold name of HMM k + char** fam; // family of HMM k + float* Prob; // probability of HMM k + float* Zq; // Zq[k] = Z-score between query and database HMM k + float* Ztq; // Ztq[k] = transitive Z-score from query to database HMM k: Ztq[k] = sum_l[ w_ql * Z_lk] / normalization_q + float* Zrq; // Zrq[k] = transitive Z-score from database HMM k to query: Zrq[k] = sum_l[ w_kl * Z_lq] / normalization_k + float* w; // unnormalized weight matrix; w[l] is w_ql or w_kl, respectively + int* ll; // ll[m] is the m'th index l for which Z_lq, Z_lk > Zmin_trans + int N; // dimension of weight matrix is NxN + int M; // number of HMMs l with Z_ql>Ztrans_min (or Z_lk>Ztrans_min, respectively) + int k,l,m,n; // indices for database HMMs + char name[NAMELEN]; + Hash<int> index(MAXPROF+7); // index{name} = index of HMM name in {1,...,N} + index.Null(-1); // Set int value to return when no data can be retrieved + Hash<int> excluded(13); // Hash containing names of superfamilies to be excluded from fit + excluded.Null(0); // Set int value to return when no data can be retrieved + Hit hit; + size_t unused; /* disable fread gcc warning */ + + // Read weights matrix W with index hash and names array + fprintf(stderr,"Reading in weights file\n"); + FILE* wfile = fopen(par.wfile,"rb"); + if (v>=1 && wfile==NULL) + { + fprintf(stderr,"Error: %s could not be opened: (N_searched=%i) ",par.wfile,N_searched); + perror("fopen"); + fprintf(stderr,"Skipping caclulation of transitive P-values\n"); + par.trans=0; + return; + } + unused = fread(&N,sizeof(int),1,wfile); // read matrix dimension (i.e. number of HMMs in database) + if (v>=1 && N!=N_searched) + { + fprintf(stderr,"Error: Number %i of HMMs in weight file is different from number %i of HMMs in searched databases. \n",N,N_searched); + fprintf(stderr,"Skipping caclulation of transitive P-values\n"); + par.trans=0; + return; + } + if (v>=2) fprintf(stderr,"Calculating transitive P-values for %i HMMs\n",N); + // Read names of HMMs (to specify mapping of HMM to matrix indices) + for (k=0; k<N; k++) + { + unused = fread(name,sizeof(char),IDLEN,wfile); + index.Add(name,k); + } + // Read symmetric Z-scores matrix + Z = new(float*[N]); + for (k=0; k<N; k++) + { + Z[k] = new(float[N]); + for (l=0; l<k; l++) Z[k][l] = Z[l][k]; + unused = fread(Z[k]+k,sizeof(float),N-k,wfile); + } + // Read symmetric covariance matrix + C = new(float*[N]); + for (k=0; k<N; k++) + { + C[k] = new(float[N]); + for (l=0; l<k; l++) C[k][l] = C[l][k]; + unused = fread(C[k]+k,sizeof(float),N-k,wfile); + } + fclose(wfile); + + // Allocate memory + Zq = new(float[N]); + Ztq = new(float[N]); + Zrq = new(float[N]); + fold = new(char*[N]); + fam = new(char*[N]); + Prob = new(float[N]); + ll = new(int[N]); + w = new(float[N]); + + // Transform P-values to normally distributed Z-scores and store in Zq vector + fprintf(stderr,"Transform P-values to Z-scores\n"); + float Zmax_neg = Score2Z( -log(MINEVALEXCL) + log(N_searched) ); // calculate Z-score corresponding to E-value MINEVALEXCL + float Zmin_trans = Score2Z( -log(par.Emax_trans) + log(N_searched) ); // calculate Z-score corresponding to E-value par.Emax_trans + printf("Zmax = %6.2f Zmin = %6.2f \n",Zmax_neg,Zmin_trans); + + Reset(); + while (!End()) + { + hit = ReadNext(); + if (hit.irep>1) continue; + k = index.Show(hit.name); + if (k<0) {fprintf(stderr,"Error: no index found in weights file for domain %s\n",hit.name); exit(1);} + if (hit.logPvalt<0) + Zq[k] = 0.5*Score2Z(fabs(hit.logPval)) + 0.5*Score2Z(fabs(hit.logPvalt)); // Zq[k] = 0.5*(Zkq + Zqk) + else + Zq[k] = Score2Z(fabs(hit.logPval)); // Zq[k] = Zqk +// printf("%4i %-10.10s logPvalt=%9g Zq=%9f\n",k,hit.name,hit.logPvalt,Zq[k]); +// if (isnan(Zq[k])) { +// fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n"); +// printf("%4i %-10.10s logPval=%9g logPvalt=%9g Zq=%9f\n",k,hit.name,hit.logPval,hit.logPvalt,Zq[k]); +// par.trans=0; +// return; +// } + if (Zq[k]>Zmax_neg) excluded.Add(hit.fold); + fold[k] = new(char[IDLEN]); + fam[k] = new(char[IDLEN]); + strcpy(fold[k],hit.fold); + strcpy(fam[k],hit.fam); + weight[k] = hit.weight; + Prob[k] = hit.Probab; + } + + if (v>=3) + { + excluded.Reset(); + while (!excluded.End()) + { + excluded.ReadNext(name); + printf("Excluded fold %s from fitting to Ztq\n",name); + } + } + + //////////////////////////////////////////////////////////////// + // Calculate transitive score (query->l) Zt[l] + + // Construct vector ll of indices l for which Z_lq > Zmin_trans + m = 0; + for (l=0; l<N; l++) + if (Zq[l]>=Zmin_trans) ll[m++]=l; + M = m; // number of indices l for which Z_lq,Z_lk > Zmin_trans + +// for (m=0; m<M; m++) +// fprintf(stderr,"m=%-4i l=%-4i %-10.10s Zq[l]=%7f\n",m,ll[m],fam[ll[m]],Zq[ll[m]]); + + if (M<=1) + for (k=0; k<N; k++) Ztq[k]=0.0; + else + { + // Generate submatrix of C for indices l for which Z_lq,Z_lk > Zmin_trans + double** Csub = new(double*[M]); + for (m=0; m<M; m++) + { + Csub[m] = new(double[M]); + for (n=0; n<M; n++) + Csub[m][n] = double(C[ll[m]][ll[n]]); + } + + if (v>=3) + { + fprintf(stderr,"Covariance matrix\n"); + PrintMatrix(Csub,M); + } + + + // Calculate weights w[l] + for (m=0; m<M; m++) + { + double sum = 0.0; + for (n=0; n<M; n++) + sum += fmax(0.0,Csub[m][n]); + printf("w[%4i] = %-8.5f\n",ll[m],1.0/sum); + w[m] = 1.0/sum; + } + + // Calculate Ztq[k] for all HMMs k + fprintf(stderr,"Calculate Ztq vector of transitive Z-scores\n"); + float norm = NormalizationFactor(Csub,w,M); + for (k=0; k<N; k++) + { + double sumZ = 0.0; + for (m=0; m<M; m++) + sumZ += w[m] * fmin(Zq[ll[m]],Z[ll[m]][k]); + Ztq[k] = sumZ/norm; + } + + for (l=0; l<M; l++){ + delete[](Csub[l]); (Csub[l]) = NULL; + } + delete[](Csub); (Csub) = NULL; + } + + //////////////////////////////////////////////////////////////// + // Calculate reverse transitive score (l->query-) Zrq[l] + + fprintf(stderr,"Calculate Zrq vector of transitive Z-scores\n"); + for (k=0; k<N; k++) + { + // Construct vector ll of indices l for which Z_lk > Zmin_tran + m = 0; + for (l=0; l<N; l++) + if (Z[k][l]>=Zmin_trans) ll[m++]=l; + int M = m; // number of indices l for which Z_lq,Z_lk > Zmin_tran + + +// fprintf(stderr,"\nfam[k]: %s\n",fam[k]); +// for (m=0; m<M; m++) +// printf(stderr,"m=%-4i k=%-4i l=%-4i %-10.10s Zq[l]=%7f Z_lk=%7f \n",m,k,ll[m],fold[ll[m]],Zq[ll[m]],Z[k][ll[m]]); + + if (M<=1) + { + Zrq[k] = Zq[k]; + } + else + { + // Generate submatrix of C for indices l for which Z_lq,Z_lk > Zmin_trans + double** Csub = new(double*[M]); + for (m=0; m<M; m++) + { + Csub[m] = new(double[M]); + for (n=0; n<M; n++) + Csub[m][n] = double(C[ll[m]][ll[n]]); + } +// fprintf(stderr,"Covariance matrix\n"); +// PrintMatrix(Csub,M); + + // Calculate weights w[l] + for (m=0; m<M; m++) + { + double sum = 0.0; + for (n=0; n<M; n++) + sum += fmax(0.0,Csub[m][n]); + w[m] = 1.0/sum; + } + +// for (m=0; m<M; m++) fprintf(stderr,"w[%i]=%8.2g\n",m,w[m]); + + + // Calculate Zrq[k] and normalize + float norm = NormalizationFactor(Csub,w,M); + double sumZ = 0.0; + for (m=0; m<M; m++) + sumZ += w[m] * fmin(Zq[ll[m]],Z[ll[m]][k]); + Zrq[k] = sumZ/norm; + + for (l=0; l<M; l++){ + delete[](Csub[l]); (Csub[l]) = NULL; + } + delete[](Csub); (Csub) = NULL; + } + +// fprintf(stderr,"\nZq[k]=%8.2g Zq1[k]=%8.2g\n",Zq[k],Zrq[k]); + } + + // Total Z-score = weighted sum over original Z-score, forward transitive and reverse transitive Z-score + for (k=0; k<N; k++) + { + float Zqtot = Zq[k] + par.wtrans*(Ztq[k]+Zrq[k]); +// if (isnan(Zqtot)) +// { +// fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n"); +// printf("%4i %-10.10s Zq=%6.2f Ztq=%6.2f Zrq=%6.2f Zqtot=%6.2f\n",k,fam[k],Zq[k],Ztq[k],Zrq[k],Zqtot); +// par.trans=0; +// return; +// } + if (v>=3 && Zq[k] + Zqtot > 2*Zmin_trans) { + printf("%4i %-10.10s Zq=%6.2f Ztq=%6.2f Zrq=%6.2f -> Zqtot=%6.2f\n",k,fam[k],Zq[k],Ztq[k],Zrq[k],Zqtot); + } + Ztq[k] = Zqtot; + } + + // Calculate mean and standard deviation of Z1q + fprintf(stderr,"Calculate mean and standard deviation of Ztq\n"); + double sumw=0.0; + double sumZ=0.0; + double sumZ2=0.0; + for (k=0; k<N; k++) + { + if (excluded.Contains(fold[k])) continue; + sumw += weight[k]; + sumZ += weight[k]*Ztq[k]; + sumZ2 += weight[k]*Ztq[k]*Ztq[k]; +// if (isnan(sumZ)) +// { +// fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n"); +// printf("%4i %-10.10s Zq=%9f Zrq=%9f Ztq=%9f\n",k,fam[k],Zq[k],Zrq[k],Ztq[k]); +// par.trans=0; +// return; +// } + } + float mu = sumZ/sumw; + float sigma = sqrt(sumZ2/sumw-mu*mu); + if (v>=2) printf("mu(Ztq)=%6.3f sigma(Ztq)=%6.2f\n",mu,sigma); + sigma *= 1.01;// correct different fitting of EVD and normal variables + + // Normalize Ztq and calculate P1-values + fprintf(stderr,"Normalize Ztq and calculate P1-values\n"); + Reset(); + while (!End()) + { + hit = ReadNext(); + hit.logPval = -Z2Score((Ztq[index.Show(hit.name)]-mu)/sigma); + hit.E1val = N_searched*(hit.logPval<-100? 0.0 : exp(hit.logPval)); + // P-value = 1- exp(-exp(-lamda*(Saa-mu))) => -lamda*(Saa-mu) = log(-log(1-Pvalue)) + hit.score_aass = (hit.logPval<-10.0? hit.logPval : log(-log(1-exp(hit.logPval))) ) / 0.45-3.0 - hit.score_ss; + hit.Probab = Probab(hit); + hit.score_sort = hit.logPval; + Overwrite(hit); // copy hit object into current position of hitlist + } + + for (k=0; k<N; k++){ + delete[](Z[k]); (Z[k]) = NULL; + } + for (k=0; k<N; k++){ + delete[](C[k]); (C[k]) = NULL; + } + for (k=0; k<N; k++){ + delete[](fold[k]); (fold[k]) = NULL; + } + for (k=0; k<N; k++){ + delete[](fam[k]); (fam[k]) = NULL; + } + delete[](C); (C) = NULL; + delete[](Z); (Z) = NULL; + delete[](fold); (fold) = NULL; + delete[](fam); (fam) = NULL; + delete[](Prob); (Prob) = NULL; + delete[](ll); (ll) = NULL; + delete[](Zq); (Zq) = NULL; + delete[](Ztq); (Ztq) = NULL; +} + + +///////////////////////////////////////////////////////////////////////////////////// +/** + * @brief Score2Z transforms the -log(P-value) score into a Z-score for 0 < S + * Score2Z(S) = sqrt(2)*dierfc(2*e^(-S)), where dierfc is the inverse of the complementary error function + */ +double +HitList::Score2Z(double S) +{ + double s, t, u, w, x, y, z; + if (S<=0) return double(-100000); + y = ( S>200 ? 0.0 : 2.0*exp(-S) ); + if (y > 1) + { + z = (S<1e-6? 2*S : 2-y); + w = 0.916461398268964 - log(z); + } + else + { + z = y; + w = 0.916461398268964 - (0.69314718056-S); + } + + u = sqrt(w); + s = (log(u) + 0.488826640273108) / w; + t = 1 / (u + 0.231729200323405); + + x = u * (1 - s * (s * 0.124610454613712 + 0.5)) - + ((((-0.0728846765585675 * t + 0.269999308670029) * t + + 0.150689047360223) * t + 0.116065025341614) * t + + 0.499999303439796) * t; + t = 3.97886080735226 / (x + 3.97886080735226); + u = t - 0.5; + s = (((((((((0.00112648096188977922 * u + + 1.05739299623423047e-4) * u - 0.00351287146129100025) * u - + 7.71708358954120939e-4) * u + 0.00685649426074558612) * u + + 0.00339721910367775861) * u - 0.011274916933250487) * u - + 0.0118598117047771104) * u + 0.0142961988697898018) * u + + 0.0346494207789099922) * u + 0.00220995927012179067; + s = ((((((((((((s * u - 0.0743424357241784861) * u - + 0.105872177941595488) * u + 0.0147297938331485121) * u + + 0.316847638520135944) * u + 0.713657635868730364) * u + + 1.05375024970847138) * u + 1.21448730779995237) * u + + 1.16374581931560831) * u + 0.956464974744799006) * u + + 0.686265948274097816) * u + 0.434397492331430115) * u + + 0.244044510593190935) * t - + (z==0? 0: z * exp(x * x - 0.120782237635245222)); + x += s * (x * s + 1); + if (y > 1) { + x = -x; + } + return double (1.41421356237*x); +} + +///////////////////////////////////////////////////////////////////////////////////////////////////////// +/** + * @brief Z2Score transforms the Z-score into a -log(P-value) value + * Z2Score(Z) = log(2) - log( erfc(Z/sqrt(2)) ) , where derfc is the complementary error function + */ +double +HitList::Z2Score(double Z) +{ + double t, u, x, y; + x = 0.707106781188*Z; + if (x>10) return 0.69314718056 - (-x*x - log( (1-0.5/x/x)/x/1.772453851) ); + t = 3.97886080735226 / (fabs(x) + 3.97886080735226); + u = t - 0.5; + y = (((((((((0.00127109764952614092 * u + 1.19314022838340944e-4) * u - + 0.003963850973605135) * u - 8.70779635317295828e-4) * u + + 0.00773672528313526668) * u + 0.00383335126264887303) * u - + 0.0127223813782122755) * u - 0.0133823644533460069) * u + + 0.0161315329733252248) * u + 0.0390976845588484035) * u + + 0.00249367200053503304; + y = ((((((((((((y * u - 0.0838864557023001992) * u - + 0.119463959964325415) * u + 0.0166207924969367356) * u + + 0.357524274449531043) * u + 0.805276408752910567) * u + + 1.18902982909273333) * u + 1.37040217682338167) * u + + 1.31314653831023098) * u + 1.07925515155856677) * u + + 0.774368199119538609) * u + 0.490165080585318424) * u + + 0.275374741597376782) * t * (x>10? 0.0 : exp(-x * x)); + return 0.69314718056 - log( x < 0 ? 2 - y : y ); +} + + +///////////////////////////////////////////////////////////////////////////////////////////////////////// +/** + * @brief + */ +void +PrintMatrix(float** V, int N) +{ + int k,l; + for (k=0; k<N; k++) + { + fprintf(stderr,"k=%4i \n",k); + for (l=0; l<N; l++) + { + fprintf(stderr,"%4i:%6.3f ",l,V[k][l]); + if ((l+1)%10==0) fprintf(stderr,"\n"); + } + fprintf(stderr,"\n"); + } + fprintf(stderr,"\n"); +} + +///////////////////////////////////////////////////////////////////////////////////////////////////////// +/** + * @brief + */ +void +PrintMatrix(double** V, int N) +{ + int k,l; + for (k=0; k<N; k++) + { + fprintf(stderr,"k=%4i \n",k); + for (l=0; l<N; l++) + { + fprintf(stderr,"%4i:%6.3f ",l,V[k][l]); + if ((l+1)%10==0) fprintf(stderr,"\n"); + } + fprintf(stderr,"\n"); + } + fprintf(stderr,"\n"); +} + +///////////////////////////////////////////////////////////////////////////////////////////////////////// +/** + * @brief + */ +void +HitList::Normalize(float* Ztq, char** fold, Hash<int>& excluded) +{ + double sumw=0.0; + double sumZ=0.0; + double sumZ2=0.0; + for (int k=0; k<N_searched; k++) + { + if (excluded.Contains(fold[k])) continue; + sumw += weight[k]; + sumZ += weight[k]*Ztq[k]; + sumZ2 += weight[k]*Ztq[k]*Ztq[k]; + } + float mu = sumZ/sumw; + float sigma = sqrt(sumZ2/sumw-mu*mu); + printf("Transitive score Ztq: mu=%8.3g sigma=%8.3g\n",mu,sigma); + for (int k=0; k<N_searched; k++) Ztq[k] = (Ztq[k]-mu)/sigma; + return; +} + +///////////////////////////////////////////////////////////////////////////////////////////////////////// +/** + * @brief Calculate standard deviation of Z1 = sum_m [ w_m * Z_m ], where Csub_mn = cov(Z_m,Z_n) + */ +float +HitList::NormalizationFactor(double** Csub, float* w, int M) + { + double sum=0.0; + for (int m=0; m<M; m++) + { + double summ=0.0; + for (int n=0; n<M; n++) summ += Csub[m][n]*w[n]; + sum += w[m]*summ; + } + return sqrt(sum); + } + +///////////////////////////////////////////////////////////////////////////////////////////////////////// +/** + * @brief Calculate inverse of matrix A and store result in B + */ +void +HitList::InvertMatrix(double** B, double** A, int N) +{ + if (N==0) + { + printf("Error: InvertMatrix called with matrix of dimension 0\n"); + exit(6); + } + if (N==1) + { + B[0][0] = (A[0][0]==0.0? 0 :1.0/A[0][0]); + return; + } + + int k,l,m; + double** V = new(double*[N]); + double* s = new(double[N]); + for (k=0; k<N; k++) V[k] = new(double[N]); + + // Copy original matrix A into B since B will be overwritten by SVD() + for (k=0; k<N; k++) + for (l=0; l<N; l++) + B[k][l] = A[k][l]; + + SVD(B, N, s, V); // U replaces B on output; s[] contains singluar values + + // Calculate inverse of A: A^-1 = V * diag(1/s) * U^t + double** U = B; + // Calculate V[k][m] -> V[k][m] *diag(1/s) + for (k=0; k<N; k++) + for (m=0; m<N; m++) + if (s[m]!=0.0) V[k][m] /= s[m]; else V[k][m] = 0.0; + // Calculate V[k][l] -> (V * U^t)_kl + for (k=0; k<N; k++) + { + if (v>=4 && k%100==0) printf("%i\n",k); + for (l=0; l<N; l++) + { + s[l] = 0.0; // use s[] as temporary memory to avoid overwriting B[k][] as long as it is needed + for (m=0; m<N; m++) + s[l] += V[k][m]*U[l][m]; + } + for (l=0; l<N; l++) V[k][l]=s[l]; + } + for (k=0; k<N; k++) + for (l=0; l<N; l++) + B[k][l] = V[k][l]; + + for (k=0; k<N; k++){ + delete[](V[k]); (V[k]) = NULL; + } + delete[](V); (V) = NULL; + return; +} + + +///////////////////////////////////////////////////////////////////////////////////////////////////////// +/** + * @brief + */ +void +HitList::TransposeMatrix(double** V, int N) +{ + int k,l; + for (k=0; k<N; k++) // transpose Z for efficiency of ensuing matrix multiplication + for (l=0; l<k; l++) + { + double buf = V[k][l]; + V[k][l] = V[l][k]; + V[l][k] = buf; + } +} + +///////////////////////////////////////////////////////////////////////////////////////////////////////// +static double sqrarg; +#define SQR(a) ((sqrarg=(a)) == 0.0 ? 0.0 : sqrarg*sqrarg) +static double maxarg1,maxarg2; +#define FMAX(a,b) (maxarg1=(a),maxarg2=(b),(maxarg1) > (maxarg2) ? (maxarg1) : (maxarg2)) +static int iminarg1,iminarg2; +#define IMIN(a,b) (iminarg1=(a),iminarg2=(b),(iminarg1) < (iminarg2) ? (iminarg1) : (iminarg2)) +#define SIGN(a,b) ((b) >= 0.0 ? fabs(a) : -fabs(a)) + +/** + * @brief This is a version of the Golub and Reinsch algorithm for singular value decomposition for a quadratic + * (n x n) matrix A. It is sped up by transposing A amd V matrices at various places in the algorithm. + * On a 400x400 matrix it runs in 1.6 s or 2.3 times faster than the original (n x m) version. + * On a 4993x4993 matrix it runs in 2h03 or 4.5 times faster than the original (n x m) version. + * + * Given a matrix a[0..n-1][0..n-1], this routine computes its singular value decomposition, A = U · W · V^t . + * The matrix U replaces a on output. The diagonal matrix of singular values W is out-put as a vector w[0..n-1]. + * The matrix V (not the transpose V^t) is output as V[0..n-1][0..n-1] ./ + */ +void +HitList::SVD(double **A, int n, double w[], double **V) +{ + int m=n; // in general algorithm A is an (m x n) matrix instead of (n x n) + + double pythag(double a, double b); + int flag,i,its,j,jj,k,l=1,nm=1; + double anorm,c,f,g,h,s,scale,x,y,z,*rv1; + rv1=new(double[n]); + g=scale=anorm=0.0; + + // Householder reduction to bidiagonal form. + if (v>=5) printf("\nHouseholder reduction to bidiagonal form\n"); + for (i=0;i<n;i++) { + if (v>=4 && i%100==0) printf("i=%i\n",i); + if (v>=4) fprintf(stderr,"."); + l=i+1; + rv1[i]=scale*g; + g=s=scale=0.0; + if (i < m) { + for (k=i;k<m;k++) scale += fabs(A[k][i]); + if (scale) { + for (k=i;k<m;k++) { + A[k][i] /= scale; + s += A[k][i]*A[k][i]; + } + f=A[i][i]; + g = -SIGN(sqrt(s),f); + h=f*g-s; + A[i][i]=f-g; + for (j=l;j<n;j++) { + for (s=0.0,k=i;k<m;k++) s += A[k][i]*A[k][j]; + f=s/h; + for (k=i;k<m;k++) A[k][j] += f*A[k][i]; + } + for (k=i;k<m;k++) A[k][i] *= scale; + } + } + w[i]=scale *g; + g=s=scale=0.0; + if (i < m && i != n-1) { + for (k=l;k<n;k++) scale += fabs(A[i][k]); + if (scale) { + for (k=l;k<n;k++) { + A[i][k] /= scale; + s += A[i][k]*A[i][k]; + } + f=A[i][l]; + g = -SIGN(sqrt(s),f); + h=f*g-s; + A[i][l]=f-g; + for (k=l;k<n;k++) rv1[k]=A[i][k]/h; + for (j=l;j<m;j++) { + for (s=0.0,k=l;k<n;k++) s += A[j][k]*A[i][k]; + for (k=l;k<n;k++) A[j][k] += s*rv1[k]; + } + for (k=l;k<n;k++) A[i][k] *= scale; + } + } + anorm=FMAX(anorm,(fabs(w[i])+fabs(rv1[i]))); + } + // Accumulation of right-hand transformations. + if (v>=5) printf("\nAccumulation of right-hand transformations\n"); + TransposeMatrix(V,n); + for (i=n-1;i>=0;i--) { + if (v>=4 && i%100==0) printf("i=%i\n",i); + if (v>=4) fprintf(stderr,"."); + if (i < n-1) { + if (g) { + // Double division to avoid possible underflow. + for (j=l;j<n;j++) + V[i][j]=(A[i][j]/A[i][l])/g; + for (j=l;j<n;j++) { + for (s=0.0,k=l;k<n;k++) s += A[i][k]*V[j][k]; + for (k=l;k<n;k++) V[j][k] += s*V[i][k]; + } + } + for (j=l;j<n;j++) V[j][i]=V[i][j]=0.0; + } + V[i][i]=1.0; + g=rv1[i]; + l=i; + } + // Accumulation of left-hand transformations. + if (v>=5) printf("\nAccumulation of left-hand transformations\n"); + TransposeMatrix(A,n); + for (i=IMIN(m,n)-1;i>=0;i--) { + if (v>=4 && i%100==0) printf("i=%i\n",i); + if (v>=4) fprintf(stderr,"."); + l=i+1; + g=w[i]; + for (j=l;j<n;j++) A[j][i]=0.0; + if (g) { + g=1.0/g; + for (j=l;j<n;j++) { + for (s=0.0,k=l;k<m;k++) s += A[i][k]*A[j][k]; + f=(s/A[i][i])*g; + for (k=i;k<m;k++) A[j][k] += f*A[i][k]; + } + for (j=i;j<m;j++) A[i][j] *= g; + } else for (j=i;j<m;j++) A[i][j]=0.0; + ++A[i][i]; + } + + // Diagonalization of the bidiagonal form: Loop over singular values, and over allowed iterations. + if (v>=5) printf("\nDiagonalization of the bidiagonal form\n"); + for (k=n-1;k>=0;k--) { + if (v>=4 && k%100==0) printf("k=%i\n",k); + if (v>=4) fprintf(stderr,"."); + for (its=1;its<=30;its++) { + flag=1; + // Test for splitting. Note that rv1[1] is always zero. + for (l=k;l>=0;l--) { + nm=l-1; + if ((double)(fabs(rv1[l])+anorm) == anorm) { + flag=0; + break; + } + if ((double)(fabs(w[nm])+anorm) == anorm) break; + } + if (flag) { + // Cancellation of rv1[l], if l > 1. + c=0.0; + s=1.0; + for (i=l;i<=k;i++) { + f=s*rv1[i]; + rv1[i]=c*rv1[i]; + if ((double)(fabs(f)+anorm) == anorm) break; + g=w[i]; + h=pythag(f,g); + w[i]=h; + h=1.0/h; + c=g*h; + s = -f*h; + for (j=0;j<m;j++) { + y=A[nm][j]; + z=A[i][j]; + A[nm][j]=y*c+z*s; + A[i][j]=z*c-y*s; + } + } + } + z=w[k]; + // Convergence. + if (l == k) { + // Singular value is made nonnegative. + if (z < 0.0) { + w[k] = -z; + for (j=0;j<n;j++) V[k][j] = -V[k][j]; + } + break; + } + if (its == 30) {printf("Error in SVD: no convergence in 30 iterations\n"); exit(7);} + // Shift from bottom 2-by-2 minor. + x=w[l]; + nm=k-1; + y=w[nm]; + g=rv1[nm]; + h=rv1[k]; + f=((y-z)*(y+z)+(g-h)*(g+h))/(2.0*h*y); + g=pythag(f,1.0); + f=((x-z)*(x+z)+h*((y/(f+SIGN(g,f)))-h))/x; + // Next QR transformation: + c=s=1.0; + for (j=l;j<=nm;j++) { + i=j+1; + g=rv1[i]; + y=w[i]; + h=s*g; + g=c*g; + z=pythag(f,h); + rv1[j]=z; + c=f/z; + s=h/z; + f=x*c+g*s; + g = g*c-x*s; + h=y*s; + y *= c; + for (jj=0;jj<n;jj++) { + x=V[j][jj]; + z=V[i][jj]; + V[j][jj]=x*c+z*s; + V[i][jj]=z*c-x*s; + } + z=pythag(f,h); + // Rotation can be arbitrary if z = 0. + w[j]=z; + if (z) { + z=1.0/z; + c=f*z; + s=h*z; + } + f=c*g+s*y; + x=c*y-s*g; + + for (jj=0;jj<m;jj++) { + y=A[j][jj]; + z=A[i][jj]; + A[j][jj]=y*c+z*s; + A[i][jj]=z*c-y*s; + } + } + rv1[l]=0.0; + rv1[k]=f; + w[k]=x; + } + } + TransposeMatrix(V,n); + TransposeMatrix(A,n); + delete[](rv1); (rv1) = NULL; +} + +/** + * @brief Computes (a2 + b2 )^1/2 without destructive underflow or overflow. + */ +double +pythag(double a, double b) +{ + double absa,absb; + absa=fabs(a); + absb=fabs(b); + if (absa > absb) + return absa*sqrt(1.0+SQR(absb/absa)); + else + return (absb == 0.0 ? 0.0 : absb*sqrt(1.0+SQR(absa/absb))); +} + + +/* @* HitList::ClobberGlobal(void) + */ +void +HitList::ClobberGlobal(void){ + + + /* @<variables local to HitList::ClobberGlobal@> */ + class List<Hit>::ListEl<Hit> *pvIter = head; + + /* NOTE: no free/delete-ing of data to be done here + hitlist only holds a shallow copy of hit; + hit is being cleared off properly. + just reset everything to 0/0.0/NULL. + The only important thing to do at this stage + is to attach head and tail and set size = 0 + (FS, 2010-02-18) + + NOTE: I only ever saw 1 (one) in-between element, + but there may ctually be a real linked list + of more than 1 element (FS, 2010-02-18) + */ + + // printf("POINTER:\t%p\t=HEAD\n", head); + while (pvIter->next != tail){ + + // printf("POINTER:\t%p->\t%p\n", pvIter, pvIter->next); + pvIter = pvIter->next; + +#if 1 + pvIter->data.longname = pvIter->data.name = + pvIter->data.file = pvIter->data.dbfile = NULL; + pvIter->data.sname = NULL; + pvIter->data.seq = NULL; + pvIter->data.self = 0; + pvIter->data.i = pvIter->data.j = NULL; + pvIter->data.states = NULL; + pvIter->data.S = pvIter->data.S_ss = pvIter->data.P_posterior = NULL; + pvIter->data.Xcons = NULL; + pvIter->data.sum_of_probs = 0.0; + pvIter->data.Neff_HMM = 0.0; + pvIter->data.score_ss = pvIter->data.Pval = pvIter->data.logPval = + pvIter->data.Eval = pvIter->data.Probab = pvIter->data.Pforward = 0.0; + pvIter->data.nss_conf = pvIter->data.nfirst = + pvIter->data.i1 = pvIter->data.i2 = pvIter->data.j1 = pvIter->data.j2 = + pvIter->data.matched_cols = pvIter->data.ssm1 = pvIter->data.ssm2 = 0; +#endif + } + // printf("POINTER:\t\t\t%p=TAIL\n", tail); + + + head->next = tail; + tail->prev = head; + size = 0; + + /* @= */ + return; + +} /* this is the end of HitList::ClobberGlobal() */ + + +/* + * EOF hhhitlist-C.h + */