comparison clustalomega/clustal-omega-0.2.0/src/hhalign/hhhitlist-C.h @ 0:ff1768533a07

Migrated tool version 0.2 from old tool shed archive to new tool shed repository
author clustalomega
date Tue, 07 Jun 2011 17:04:25 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:ff1768533a07
1 /* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2
3 /*********************************************************************
4 * Clustal Omega - Multiple sequence alignment
5 *
6 * Copyright (C) 2010 University College Dublin
7 *
8 * Clustal-Omega is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
12 *
13 * This file is part of Clustal-Omega.
14 *
15 ********************************************************************/
16
17 /*
18 * RCS $Id: hhhitlist-C.h 199 2011-02-21 18:24:49Z fabian $
19 */
20
21 // hhhitlist.C
22
23 #ifndef MAIN
24 #define MAIN
25 #include <iostream> // cin, cout, cerr
26 #include <fstream> // ofstream, ifstream
27 #include <stdio.h> // printf
28 #include <stdlib.h> // exit
29 #include <string> // strcmp, strstr
30 #include <math.h> // sqrt, pow
31 #include <limits.h> // INT_MIN
32 #include <float.h> // FLT_MIN
33 #include <time.h> // clock
34 #include <ctype.h> // islower, isdigit etc
35 using std::ios;
36 using std::ifstream;
37 using std::ofstream;
38 using std::cout;
39 using std::cerr;
40 using std::endl;
41 #include "util-C.h" // imax, fmax, iround, iceil, ifloor, strint, strscn, strcut, substr, uprstr, uprchr, Basename etc.
42 #include "list.h" // list data structure
43 #include "hash.h" // hash data structure
44 #include "hhdecl-C.h" // constants, class
45 #include "hhutil-C.h" // imax, fmax, iround, iceil, ifloor, strint, strscn, strcut, substr, uprstr, uprchr, Basename etc.
46 #include "hhhmm.h" // class HMM
47 #include "hhalignment.h" // class Alignment
48 #include "hhhit.h"
49 #include "hhhalfalignment.h"
50 #include "hhfullalignment.h"
51 #endif
52
53
54 //////////////////////////////////////////////////////////////////////////////
55 //////////////////////////////////////////////////////////////////////////////
56 //// Methods of class HitList
57 //////////////////////////////////////////////////////////////////////////////
58 //////////////////////////////////////////////////////////////////////////////
59
60
61
62 //////////////////////////////////////////////////////////////////////////////
63 /**
64 * @brief Print summary listing of hits
65 */
66 void
67 HitList::PrintHitList(HMM& q, char* outfile)
68 {
69 Hit hit;
70 int nhits=0;
71 char str[NAMELEN]="";
72
73 FILE* outf=NULL;
74 if (strcmp(outfile,"stdout"))
75 {
76 outf=fopen(outfile,"w");
77 if (!outf) OpenFileError(outfile);
78 }
79 else
80 outf = stdout;
81
82
83 fprintf(outf,"Query %s\n",q.longname);
84 // fprintf(outf,"Family %s\n",q.fam);
85 fprintf(outf,"Match_columns %i\n",q.L);
86 fprintf(outf,"No_of_seqs %i out of %i\n",q.N_filtered,q.N_in);
87 fprintf(outf,"Neff %-4.1f\n",q.Neff_HMM);
88 fprintf(outf,"Searched_HMMs %i\n",N_searched);
89
90 // Print date stamp
91 time_t* tp=new(time_t);
92 *tp=time(NULL);
93 fprintf(outf,"Date %s",ctime(tp));
94 delete (tp); (tp) = NULL;
95
96 // Print command line
97 fprintf(outf,"Command ");
98 for (int i=0; i<par.argc; i++)
99 if (strlen(par.argv[i])<=par.maxdbstrlen)
100 fprintf(outf,"%s ",par.argv[i]);
101 else
102 fprintf(outf,"<%i characters> ",(int)strlen(par.argv[i]));
103 fprintf(outf,"\n\n");
104
105 #ifdef WINDOWS
106 if (par.trans)
107 fprintf(outf," No Hit Prob E-trans E-value Score SS Cols Query HMM Template HMM\n");
108 else
109 fprintf(outf," No Hit Prob E-value P-value Score SS Cols Query HMM Template HMM\n");
110 #else
111 if (par.trans)
112 fprintf(outf," No Hit Prob E-trans E-value Score SS Cols Query HMM Template HMM\n");
113 else
114 fprintf(outf," No Hit Prob E-value P-value Score SS Cols Query HMM Template HMM\n");
115 #endif
116
117 Reset();
118 while (!End()) // print hit list
119 {
120 hit = ReadNext();
121 if (nhits>=par.Z) break; //max number of lines reached?
122 if (nhits>=par.z && hit.Probab < par.p) break;
123 if (nhits>=par.z && hit.Eval > par.E) continue;
124 // if (hit.matched_cols <=1) continue; // adding this might get to intransparent... analogous statement in PrintAlignments
125 nhits++;
126 sprintf(str,"%3i %-30.30s ",nhits,hit.longname);
127
128
129 #ifdef WINDOWS
130 if (par.trans) // Transitive scoring
131 fprintf(outf,"%-34.34s %5.1f %8.2G %8.2G %6.1f %5.1f %4i ",str,hit.Probab,hit.E1val,hit.Eval,hit.score,hit.score_ss,hit.matched_cols);
132 else // Normal scoring
133 fprintf(outf,"%-34.34s %5.1f %8.2G %8.2G %6.1f %5.1f %4i ",str,hit.Probab,hit.Eval,hit.Pval,hit.score,hit.score_ss,hit.matched_cols);
134 #else
135 if (par.trans) // Transitive scoring
136 fprintf(outf,"%-34.34s %5.1f %7.2G %7.2G %6.1f %5.1f %4i ",str,hit.Probab,hit.E1val,hit.Eval,hit.score,hit.score_ss,hit.matched_cols);
137 else // Normal scoring
138 fprintf(outf,"%-34.34s %5.1f %7.2G %7.2G %6.1f %5.1f %4i ",str,hit.Probab,hit.Eval,hit.Pval,hit.score,hit.score_ss,hit.matched_cols);
139 #endif
140
141 sprintf(str,"%4i-%-4i ",hit.i1,hit.i2);
142 fprintf(outf,"%-10.10s",str);
143 sprintf(str,"%4i-%-4i",hit.j1,hit.j2);
144 fprintf(outf,"%-9.9s(%i)\n",str,hit.L);
145 } //end print hit list
146 fprintf(outf,"\n");
147 if (strcmp(outfile,"stdout")) fclose(outf);
148 }
149
150
151
152 //////////////////////////////////////////////////////////////////////////////
153 /**
154 * @brief Print alignments of query sequences against hit sequences
155 */
156 void
157 HitList::PrintAlignments(
158
159
160 #ifdef CLUSTALO
161 char **ppcFirstProf, char **ppcSecndProf,
162 #endif
163 HMM& q, char* outfile, char outformat)
164 {
165 Hit hit;
166 FullAlignment qt_ali(par.nseqdis+10); // maximum 10 annotation (pseudo) sequences (ss_dssp, sa_dssp, ss_pred, ss_conf, consens,...)
167 int nhits=0;
168
169 #ifndef CLUSTALO_NOFILE
170 FILE* outf=NULL;
171 if (strcmp(outfile,"stdout"))
172 {
173 if (outformat==0)
174 outf=fopen(outfile,"a"); //append to summary hitlist
175 else
176 outf=fopen(outfile,"w"); //open for writing
177 if (!outf) OpenFileError(outfile);
178 }
179 else
180 outf = stdout;
181 #endif
182
183 Reset();
184 while (!End()) // print hit list
185 {
186 if (nhits>=par.B) break; //max number of lines reached?
187 hit = ReadNext();
188 if (nhits>=par.b && hit.Probab < par.p) break;
189 if (nhits>=par.b && hit.Eval > par.E) continue;
190 // // adding this might get to intransparent...
191 // // analogous statement in PrintHitlist and hhalign.C
192 // if (hit.matched_cols <=1) continue;
193 nhits++;
194
195 // Build double alignment of query against template sequences
196 qt_ali.Build(q,hit);
197
198 #ifndef CLUSTALO
199 // Print out alignment
200 if (outformat==0) // HHR format
201 {
202 fprintf(outf,"No %-3i\n",nhits);
203 qt_ali.PrintHeader(outf,q,hit);
204 qt_ali.PrintHHR(outf,hit);
205 }
206 else if (outformat==1) // FASTA format
207 {
208 fprintf(outf,"# No %-3i\n",nhits);
209 qt_ali.PrintFASTA(outf,hit);
210 }
211 else if(outformat==2) // A2M format
212 {
213 fprintf(outf,"# No %-3i\n",nhits);
214 qt_ali.PrintA2M(outf,hit);
215 }
216 else // A3m format
217 {
218 fprintf(outf,"# No %-3i\n",nhits);
219 qt_ali.PrintA3M(outf,hit);
220 }
221 #else
222 qt_ali.OverWriteSeqs(ppcFirstProf, ppcSecndProf);
223 #endif
224
225 qt_ali.FreeMemory();
226 }
227 #ifndef CLUSTALO_NOFILE
228 if (strcmp(outfile,"stdout")) fclose(outf);
229 #endif
230 }
231
232
233
234
235
236 ////////////////////////////////////////////////////////////////////////////
237 /**
238 * @brief Return the ROC_5 score for optimization
239 * (changed 28.3.08 by Michael & Johannes)
240 */
241 void
242 HitList::Optimize(HMM& q, char* buffer)
243 {
244 const int NFAM =5; // calculate ROC_5 score
245 const int NSFAM=5; // calculate ROC_5 score
246 int roc=0; // ROC score
247 int fam=0; // number of hits from same family (at current threshold)
248 int not_fam=0; // number of hits not from same family
249 int sfam=0; // number of hits from same suporfamily (at current threshold)
250 int not_sfam=0; // number of hits not from same superfamily
251 Hit hit;
252
253 SortList();
254 Reset();
255 while (!End())
256 {
257 hit = ReadNext();
258 if (!strcmp(hit.fam,q.fam)) fam++; // query and template from same superfamily? => positive
259 else if (not_fam<NFAM) // query and template from different family? => negative
260 {
261 not_fam++;
262 roc += fam;
263 }
264 if (!strcmp(hit.sfam,q.sfam)) sfam++; // query and template from same superfamily? => positive
265 else if (not_sfam<NSFAM) // query and template from different superfamily? => negative
266 {
267 not_sfam++;
268 roc += sfam;
269 }
270 // printf("qfam=%s tfam=%s qsfam=%s tsfam=%s fam=%-2i not_fam=%3i sfam=%-3i not_sfam=%-5i roc=%-3i\n",q.fam,hit.fam,q.sfam,hit.sfam,fam,not_fam,sfam,not_sfam,roc);
271 }
272
273 // Write ROC score to file or stdout
274 FILE* buf=NULL;
275 if (strcmp(par.buffer,"stdout"))
276 {
277 buf=fopen(buffer,"w");
278 if (!buf) OpenFileError(par.buffer);
279 }
280 else
281 buf = stdout;
282
283 fprintf(buf,"%f\n",float(roc)/float(fam*NFAM+sfam*NSFAM)); // must be between 0 and 1
284 if (v>=2) printf("ROC=%f\n",float(roc)/float(fam*NFAM+sfam*NSFAM));
285 fclose(buf);
286 }
287
288
289
290 //////////////////////////////////////////////////////////////////////////////
291 /**
292 * @brief Print score distribution into file score_dist
293 */
294 void
295 HitList::PrintScoreFile(HMM& q)
296 {
297 int i=0, n;
298 FILE* scoref=NULL;
299 Hit hit;
300 Hash<int> twice(10000); // make sure only one hit per HMM is listed
301 twice.Null(-1);
302
303 if (strcmp(par.scorefile,"stdout"))
304 {
305 scoref=fopen(par.scorefile,"w");
306 if (!scoref)
307 {cerr<<endl<<"WARNING from "<<par.argv[0]<<": could not open \'"<<par.scorefile<<"\'\n"; return;}
308 }
309 else
310 scoref = stdout;
311 Reset();
312 fprintf(scoref,"NAME %s\n",q.longname);
313 fprintf(scoref,"FAM %s\n",q.fam);
314 fprintf(scoref,"FILE %s\n",q.file);
315 fprintf(scoref,"LENG %i\n",q.L);
316 fprintf(scoref,"\n");
317 //fprintf(scoref,"TARGET REL LEN COL LOG-PVA S-TOT MS NALI\n");
318
319 //For hhformat, the PROBAB field has to start at position 41 !!
320 // ----+----1----+----2----+----3----+----4----+----
321 fprintf(scoref,"TARGET FAMILY REL LEN COL LOG-PVA S-AASS PROBAB SCORE\n");
322 // d153l__ 5 185 185 287.82 464.22 100.00
323 // d1qsaa2 3 168 124 145.55 239.22 57.36
324 while (!End())
325 {
326 i++;
327 hit = ReadNext();
328 if (twice[hit.name]==1) continue; // better hit with same HMM has been listed already
329 twice.Add(hit.name,1);
330 //if template and query are from the same superfamily
331 if (!strcmp(hit.name,q.name)) n=5;
332 else if (!strcmp(hit.fam,q.fam)) n=4;
333 else if (!strcmp(hit.sfam,q.sfam)) n=3;
334 else if (!strcmp(hit.fold,q.fold)) n=2;
335 else if (!strcmp(hit.cl,q.cl)) n=1;
336 else n=0;
337 fprintf(scoref,"%-10s %-10s %1i %3i %3i %s %7.2f %6.2f %7.2f\n",hit.name,hit.fam,n,hit.L,hit.matched_cols,sprintg(-1.443*hit.logPval,7),-hit.score_aass,hit.Probab,hit.score);
338 }
339 fclose(scoref);
340 }
341
342
343 inline double
344 logPvalue_HHblast(double s, double corr)
345 {
346 return -s*(1.0-0.5*corr) + (1.0-corr)*log(1.0+s);
347 // return -s*(1.0-0.5*corr) + log( 1.0+(1.0-corr)*s );
348 // return -s*(1.0-0.5*corr) + log( 1.0+(1.0-corr)*(1.0-0.5*corr)*s );
349 }
350
351 inline double
352 Pvalue_HHblast(double s, double corr)
353 {
354 return exp(-s*(1.0-0.5*corr)) * pow(1.0+s,1.0-corr);
355 // return exp(-s*(1.0-0.5*corr)) * ( 1.0+(1.0-corr)*s );
356 // return exp(-s*(1.0-0.5*corr)) * ( 1.0+(1.0-corr)*(1.0-0.5*corr)*s );
357 }
358
359 inline double
360 logLikelihood_HHblast(double s, double corr)
361 {
362 if (s<0.0) { s=0.0; if (corr<1E-5) corr=1E-5; else if (corr>0.99999) corr=0.99999; }
363 else { if (corr<0.0) corr=0.0; else if (corr>1.0) corr=1.0; }
364 return -s*(1.0-0.5*corr) - corr*log(1.0+s) + log(s*(1.0-0.5*corr)+0.5*corr);
365 // return -s*(1.0-0.5*corr) + log( s*(1.0-corr)*(1.0-0.5*corr)+0.5*corr );
366 // return -s*(1.0-0.5*corr) + log((s*(1.0-corr)*(1.0-0.5*corr)+corr)*(1.0-0.5*corr));
367 }
368
369 /////////////////////////////////////////////////////////////////////////////////////
370 /**
371 * @brief Evaluate the *negative* log likelihood for the order statistic of the uniform distribution
372 * for the best 10% of hits (vertex v = (corr,offset) )
373 * The k'th order statistic for X~Uniform is p:=X^(k)~Beta(k,n-k+1) = const*p^(k-1)*(1-p)^(n-k)
374 * Needed to fit the correlation and score offset in HHblast
375 */
376 double
377 HitList::RankOrderFitCorr(double* v)
378 {
379 double sum=0.0;
380 // printf("%8.2G %8.2G %i\n",v[0],v[1],Nprof);
381 int i1 = imin(Nprof,imax(50,int(0.05*Nprof)));
382 for (int i=0; i<i1; i++)
383 {
384 double p = Pvalue_HHblast(score[i]+v[1],v[0]);
385 // sum -= (1.0-double(i)/double(i1)) * weight[i] * ( double(i)*log(p) + (Nprof-i-1.0)*log(1.0-p) );
386 float diff = p-(float(i)+1.0)/(Nprof+1.0);
387 sum += (1.0-double(i)/double(i1)) * weight[i]*diff*diff*(Nprof+1.0)*(Nprof+1.0)*(Nprof+2.0)/(i+10.0)/(Nprof-i);
388 // printf("%-3i Pval=%7.5f Preal=%7.5f diff=%7.5f sum=%7.5f\n",i,p,float(i+1)/(1.0+Nprof),diff,sum);
389 }
390 return sum;
391 }
392
393 /**
394 * @brief Static wrapper-function for calling the nonstatic member function RankOrderFitCorr()
395 * ( see http://www.newty.de/fpt/callback.html#member )
396 */
397 double
398 HitList::RankOrderFitCorr_static(void* pt2hitlist, double* v)
399 {
400 HitList* mySelf = (HitList*) pt2hitlist; // explicitly cast to a pointer to Hitlist
401 return mySelf->RankOrderFitCorr(v); // call member function
402 }
403
404 /////////////////////////////////////////////////////////////////////////////////////
405 /**
406 * @brief Evaluate the *negative* log likelihood of the data at the vertex v = (corr,offset)
407 * Needed to fit the correlation and score offset in HHblast
408 */
409 double
410 HitList::LogLikelihoodCorr(double* v)
411 {
412 double sum=0.0;
413 // printf("%8.2G %8.2G %i\n",v[0],v[1],Nprof);
414 for (int i=0; i<Nprof; i++)
415 {
416 sum -= weight[i]*logLikelihood_HHblast(score[i]+v[1],v[0]);
417 // printf("%-3i Pval=%7.5f Preal=%7.5f diff=%7.5f rmsd=%7.5f sum=%7.5f\n",i,Pvalue_HHblast(score[i],v[0]),float(i)/(1.0+Nprof),x,sqrt(sum/sumw),sum);
418 }
419 return sum;
420 }
421
422 /**
423 * @brief Static wrapper-function for calling the nonstatic member function LogLikelihoodCorr()
424 * ( see http://www.newty.de/fpt/callback.html#member )
425 */
426 double
427 HitList::LogLikelihoodCorr_static(void* pt2hitlist, double* v)
428 {
429 HitList* mySelf = (HitList*) pt2hitlist; // explicitly cast to a pointer to Hitlist
430 return mySelf->LogLikelihoodCorr(v); // call member function
431 }
432
433 /////////////////////////////////////////////////////////////////////////////////////
434 /**
435 * @brief Evaluate the *negative* log likelihood of the data at the vertex v = (lamda,mu)
436 * p(s) = lamda * exp{ -exp[-lamda*(s-mu)] - lamda*(s-mu) } = lamda * exp( -exp(-x) - x)
437 */
438 double
439 HitList::LogLikelihoodEVD(double* v)
440 {
441 double sum=0.0, sumw=0.0;
442 for (int i=0; i<Nprof; i++)
443 {
444 double x = v[0]*(score[i]-v[1]);
445 sum += weight[i]*(exp(-x)+x);
446 sumw += weight[i];
447 }
448 return sum - sumw*log(v[0]);
449 }
450
451 /**
452 * @brief Static wrapper-function for calling the nonstatic member function LogLikelihoodEVD()
453 * ( see http://www.newty.de/fpt/callback.html#member )
454 */
455 double
456 HitList::LogLikelihoodEVD_static(void* pt2hitlist, double* v)
457 {
458 HitList* mySelf = (HitList*) pt2hitlist; // explicitly cast to a pointer to Hitlist
459 return mySelf->LogLikelihoodEVD(v); // call member function
460 }
461
462 /////////////////////////////////////////////////////////////////////////////////////
463 /**
464 * @brief Subroutine to FindMin: try new point given by highest point ihigh and fac and replace ihigh if it is lower
465 */
466 double
467 HitList::TryPoint(const int ndim, double* p, double* y, double* psum, int ihigh, double fac, double (*Func)(void* pt2hitlist, double* v))
468 {
469 // New point p_try = p_c + fac*(p_high-p_c),
470 // where p_c = ( sum_i (p_i) - p_high)/ndim is the center of ndim other points
471 // => p_try = fac1*sum_i(p_i) + fac2*p_high
472 double fac1=(1.-fac)/ndim;
473 double fac2=fac-fac1;
474 double ptry[ndim]; //new point to try out
475 double ytry; //function value of new point
476 int j; //index for the ndim parameters
477
478 for (j=0; j<ndim; j++)
479 ptry[j]=psum[j]*fac1+p[ihigh*ndim+j]*fac2;
480 ytry = (*Func)(this,ptry);
481 if (ytry<=y[ihigh])
482 {
483 // if (v>=4) printf("Trying: %-7.3f %-7.3f %-7.3f -> accept\n",ptry[0],ptry[1],ytry);
484 y[ihigh]=ytry;
485 for (j=0; j<ndim; j++)
486 {
487 psum[j] += ptry[j]-p[ihigh*ndim+j]; //update psum[j]
488 p[ihigh*ndim+j]=ptry[j]; //replace p[ihigh] with ptry
489 } //Note: ihigh is now not highest point anymore!
490 }
491 // else if (v>=4) printf("Trying: %-7.3f %-7.3f %-7.3f -> reject\n",ptry[0],ptry[1],ytry);
492
493 return ytry;
494 }
495
496
497
498 /////////////////////////////////////////////////////////////////////////////////////
499 /**
500 * @brief Find minimum with simplex method of Nelder and Mead (1965)
501 */
502 float
503 HitList::FindMin(const int ndim, double* p, double* y, double tol, int& nfunc, double (*Func)(void* pt2hitlist, double* v))
504 {
505 const int MAXNFUNC=99; //maximum allowed number of function evaluations
506 int ihigh; //index of highest point on simplex
507 int inext; //index of second highest point on simplex
508 int ilow; //index of lowest point on simplex
509 int i; //index for the ndim+1 points
510 int j; //index for the ndim parameters
511 double rtol; //tolerance: difference of function value between highest and lowest point of simplex
512 double temp; //dummy
513 double ytry; //function value of trial point
514 double psum[ndim]; //psum[j] = j'th coordinate of sum vector (sum over all vertex vectors)
515
516 nfunc=0; //number of function evaluations =0
517 //Calculate sum vector psum[j]
518 for (j=0; j<ndim; j++)
519 {
520 psum[j]=p[j];
521 for (i=1; i<ndim+1; i++)
522 psum[j]+=p[i*ndim+j];
523 }
524
525 // Repeat finding better points in simplex until rtol<tol
526 while(1)
527 {
528 // Find indices for highest, next highest and lowest point
529 ilow=0;
530 if (y[0]>y[1]) {inext=1; ihigh=0;} else {inext=0; ihigh=1;}
531 for (i=0; i<ndim+1; i++)
532 {
533 if (y[i]<=y[ilow]) ilow=i;
534 if (y[i]>y[ihigh]) {inext=ihigh; ihigh=i;}
535 else if (y[i]>y[inext] && i!= ihigh) inext=i;
536 }
537
538 // If tolerance in y is smaller than tol swap lowest point to index 0 and break -> return
539 rtol = 2.*fabs(y[ihigh]-y[ilow]) / (fabs(y[ihigh])+fabs(y[ilow])+1E-10);
540 if (rtol<tol)
541 {
542 temp=y[ilow]; y[ilow]=y[0]; y[0]=temp;
543 for (j=0; j<ndim; j++)
544 {
545 temp=p[ilow*ndim+j]; p[ilow*ndim+j]=p[j]; p[j]=temp;
546 }
547 break;
548 }
549
550 // Max number of function evaluations exceeded?
551 if (nfunc>=MAXNFUNC )
552 {
553 if (v) fprintf(stderr,"\nWARNING: maximum likelihood fit of score distribution did not converge.\n");
554 return 1;
555 }
556
557 nfunc+=2;
558 // Point-reflect highest point on the center of gravity p_c of the other ndim points of the simplex
559 if (v>=3) printf("%3i %-7.3f %-7.3f %-12.8f %-9.3E\n",nfunc,p[ilow*ndim],p[ilow*ndim+1],y[ilow],rtol);
560 // if (v>=2) printf(" %3i %-9.3E %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f\n",nfunc,rtol,p[ilow*ndim],p[ilow*ndim+1],y[ilow],p[inext*ndim],p[inext*ndim+1],y[inext],p[ihigh*ndim],p[ihigh*ndim+1],y[ihigh]);
561 ytry = TryPoint(ndim,p,y,psum,ihigh,-1.0,Func); //reflect highest point on p_c
562
563 if (ytry<=y[ilow])
564 {
565 ytry = TryPoint(ndim,p,y,psum,ihigh,2.0,Func); //expand: try new point 2x further away from p_c
566 // if (v>=2) printf("Expanded: %3i %-9.3E %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f\n",nfunc,rtol,p[ilow*ndim],p[ilow*ndim+1],y[ilow],p[inext*ndim],p[inext*ndim+1],y[inext],p[ihigh*ndim],p[ihigh*ndim+1],y[ihigh]);
567 }
568 else if (ytry>=y[inext])
569 {
570 // The new point is worse than the second worst point
571 temp=y[ihigh];
572 ytry=TryPoint(ndim,p,y,psum,ihigh,0.5,Func); //contract simplex by 0.5 along (p_high-p_c
573 // if (v>=2) printf("Compressed:%3i %-9.3E %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f\n",nfunc,rtol,p[ilow*ndim],p[ilow*ndim+1],y[ilow],p[inext*ndim],p[inext*ndim+1],y[inext],p[ihigh*ndim],p[ihigh*ndim+1],y[ihigh]);
574 if (ytry>=temp)
575 {
576 // Trial point is larger than worst point => contract simplex by 0.5 towards lowest point
577 for (i=0; i<ndim+1; i++)
578 {
579 if (i!=ilow)
580 {
581 for (j=0; j<ndim; j++)
582 p[i*ndim+j]=0.5*(p[i*ndim+j]+p[ilow+j]);
583 y[i] = (*Func)(this,p+i*ndim);
584 // y[i] = (*Func)(p+i*ndim);
585 }
586 }
587 nfunc+=ndim;
588 // if (v>=2) printf("Contracted:%3i %-9.3E %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f\n",nfunc,rtol,p[ilow*ndim],p[ilow*ndim+1],y[ilow],p[inext*ndim],p[inext*ndim+1],y[inext],p[ihigh*ndim],p[ihigh*ndim+1],y[ihigh]);
589
590 //Calculate psum[j]
591 for (j=0; j<ndim; j++)
592 {
593 psum[j]=p[j];
594 for (i=1; i<ndim+1; i++)
595 psum[j]+=p[i*ndim+j];
596 }
597 }
598 }
599 else nfunc--;
600 }
601 return (float)rtol;
602 }
603
604
605
606 /////////////////////////////////////////////////////////////////////////////////////
607 /**
608 * @brief Do a maximum likelihod fit of the scores with an EV distribution with parameters lamda and mu
609 */
610 void
611 HitList::MaxLikelihoodEVD(HMM& q, int nbest)
612 {
613 double tol=1E-6; // Maximum relative tolerance when minimizing -log(P)/N (~likelihood)
614 static char first_call=1;
615 static Hash<int> size_fam(MAXPROF/10); // Hash counts number of HMMs in family
616 static Hash<int> size_sfam(MAXPROF/10); // Hash counts number of families in superfamily
617 Hash<int> excluded(50); // Hash containing names of superfamilies to be excluded from fit
618 size_fam.Null(0); // Set int value to return when no data can be retrieved
619 size_sfam.Null(0); // Set int value to return when no data can be retrieved
620 excluded.Null(0); // Set int value to return when no data can be retrieved
621 Hit hit;
622
623 double mu; // EVD[mu,lam](x) = exp(-exp(-(x-mu)/lam)) = P(score<=x)
624 double vertex[2*3]; // three vertices of the simplex in lamda-mu plane
625 double yvertex[3]; // log likelihood values at the three vertices of the simplex
626 int nfunc=0; // number of function calls
627 double sum_weights=0.0;
628 float sum_scores=0.0;
629 float rtol;
630
631 if (first_call==1)
632 {
633 first_call=0;
634 // Count how many HMMs are in each family; set number of multiple hits per template nrep
635 if (v>=4) printf(" count number of profiles in each family and families in each superfamily ...\n");
636 Reset();
637 while (!End())
638 {
639 hit = ReadNext();
640 if (!size_fam.Contains(hit.fam)) (*size_sfam(hit.sfam))++; //Add one to hash element for superfamily
641 (*size_fam(hit.fam))++; //Add one to hash element for family
642 // printf("size(%s)=%i name=%s\n",hit.fam,*size_fam(hit.fam),hit.name)
643 }
644 fams=size_fam.Size();
645 sfams=size_sfam.Size();
646 if (v>=3)
647 printf("%-3i HMMs from %i families and %i superfamilies searched. Found %i hits\n",N_searched,fams,sfams,Size());
648 }
649
650 // Query has SCOP family identifier?
651 if (q.fam && q.fam[0]>='a' && q.fam[0]<='k' && q.fam[1]=='.')
652 {
653 char sfamid[NAMELEN];
654 char* ptr_in_fam=q.fam;
655 while ((ptr_in_fam=strwrd(sfamid,ptr_in_fam,'-')))
656 {
657 char* ptr=strrchr(sfamid,'.');
658 if (ptr) *ptr='\0';
659 excluded.Add(sfamid);
660 // fprintf(stderr,"Exclude SCOP superfamily %s ptr_in_fam='%s'\n",sfamid,ptr_in_fam);
661 }
662 }
663 // Exclude best superfamilies from fit
664 else if (nbest>0)
665 {
666 if (sfams<97+nbest) return;
667
668 // Find the nbest best-scoring superfamilies for exclusion from first ML fit
669 if (v>=4) printf(" find %i best-scoring superfamilies to exclude from first fit ...\n",nbest);
670 hit = Smallest();
671 excluded.Add(hit.sfam);
672 // printf("Exclude in first round: %s %8.2f %s\n",hit.name,hit.score_aass,hit.sfam);
673 while (excluded.Size()<nbest)
674 {
675 Reset();
676 while (!End() && excluded.Contains(ReadNext().sfam)) ;
677 hit=ReadCurrent();
678 while (!End())
679 {
680 if (ReadNext()<hit && !excluded.Contains(ReadCurrent().sfam))
681 hit=ReadCurrent();
682 }
683 excluded.Add(hit.sfam);
684 // printf("Exclude in first round: %s %8.2f %s %i %i\n",hit.name,hit.score_aass,hit.sfam,excluded.Size(),excluded.Contains(hit.sfam));
685 }
686 tol = 0.01/size_sfam.Size(); // tol=1/N would lead to delta(log-likelihood)~1 (where N ~ number of superfamilies) since (1+1/N)^N = e
687 }
688 else
689 {
690 // Find the best-scoring superfamilies from first fit for exclusion from second ML fit
691 if (v>=4) printf(" find best-scoring superfamilies to exclude from second fit ...\n");
692 Reset();
693 while (!End())
694 {
695 hit = ReadNext();
696 if (hit.Eval < 0.05) excluded.Add(hit.sfam); // changed from 0.5 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
697 }
698 tol = 0.001/size_sfam.Size(); // tol=1/N would lead to delta(log-likelihood)~1 (where N ~ number of superfamilies) since (1+1/N)^N = e
699 }
700
701 // Put scores into score[] and weights into weight[]
702 if (v>=3) printf(" generate scores and weights array for ML fitting ...\n");
703 Nprof=0;
704 Reset();
705 while (!End())
706 {
707 hit = ReadNext();
708 if (hit.irep > 1) continue; //Use only best hit per template
709 if (Nprof>=MAXPROF) break;
710
711 char sfamid[NAMELEN];
712 char* ptr_in_fam=hit.fam;
713 while ((ptr_in_fam=strwrd(sfamid,ptr_in_fam,'-')))
714 {
715 char* ptr=strrchr(sfamid,'.');
716 if (ptr) *ptr='\0';
717 if (excluded.Contains(sfamid)) break; //HMM is among superfamilies to be excluded
718 }
719 if (excluded.Contains(sfamid)) {
720 if (v>=3) fprintf(stderr,"Exclude hit %s (family %s contains %s)\n",hit.name,hit.fam,sfamid);
721 continue;
722 }
723 // ScopID(hit.cl,hit.fold,hit.sfam,hit.fam); //Get scop superfamily code for template
724 // if (*hit.sfam=='\0' || excluded.Contains(hit.sfam)) continue; // skip HMM
725
726 score[Nprof] = hit.score;
727 weight[Nprof]=1./size_fam[hit.fam]/size_sfam[hit.sfam];
728 sum_scores +=hit.score*weight[Nprof];
729 sum_weights+=weight[Nprof];
730
731 //DEBUG
732 // if (v>=4) printf("%-10.10s %-12.12s %-3i %-12.12s %-3i %6.4f %6.4f %7.1f\n",hit.name,hit.fam,size_fam[hit.fam],hit.sfam,size_sfam[hit.sfam],1./size_fam[hit.fam]/size_sfam[hit.sfam],sum,hit.score);
733 Nprof++;
734 }
735 //DEBUG
736 if (v>=3)
737 printf("%i hits used for score distribution\n",Nprof);
738 // for (int i=0; i<Nprof; i++) printf("%3i score=%8.3f weight=%7.5f\n",i,score[i],weight[i]);
739
740 // Set simplex vertices and function values
741 mu = sum_scores/sum_weights - 0.584/LAMDA;
742 if (par.loc) // fit only in local mode; in global mode use fixed value LAMDA and mu mean score
743 {
744 double (*Func)(void*, double*);
745 Func = HitList::LogLikelihoodEVD_static;
746
747 if (nbest>0) {vertex[0]=LAMDA; vertex[1]=mu;} /////////////////////////////////////////// DEBUG
748 else {vertex[0]=q.lamda; vertex[1]=mu;}
749 vertex[2]=vertex[0]+0.1; vertex[3]=vertex[1];
750 vertex[4]=vertex[0]; vertex[5]=vertex[1]+0.2;
751 yvertex[0]=Func(this,vertex );
752 yvertex[1]=Func(this,vertex+2);
753 yvertex[2]=Func(this,vertex+4);
754
755 // Find lam and mu that minimize negative log likelihood of data
756 if (v>=3) printf("Fitting to EVD by maximum likelihood...\niter lamda mu -log(P)/N tol\n");
757 rtol = FindMin(2,vertex,yvertex,tol,nfunc,Func);
758 if (v>=3) printf("%3i %-7.3f %-7.2f %-7.3f %-7.1E\n\n",nfunc,vertex[0],vertex[1],yvertex[0]-(1.5772-log(vertex[0])),rtol);
759 // printf("HHsearch lamda=%-6.3f mu=%-6.3f\n",vertex[0],vertex[1]);
760 }
761 else
762 {
763 vertex[0]=LAMDA_GLOB; vertex[1]=mu;
764 }
765
766 // Set lamda and mu of profile
767 q.lamda = vertex[0];
768 q.mu = vertex[1];
769
770 // Set P-values and E-values
771 // CHECK UPDATE FROM score=-logpval to score=-logpval+SSSCORE2NATLOG*score_ss !!!!
772 Reset();
773 while (!End())
774 {
775 hit = ReadNext();
776
777 // Calculate total score in raw score units: P-value = 1- exp(-exp(-lamda*(Saa-mu)))
778 hit.weight=1./size_fam[hit.fam]/size_sfam[hit.sfam]; // needed for transitive scoring
779 hit.logPval = logPvalue(hit.score,vertex);
780 hit.Pval=Pvalue(hit.score,vertex);
781 hit.Eval=exp(hit.logPval+log(N_searched));
782 // hit.score_aass = hit.logPval/0.45-3.0 - hit.score_ss; // median(lamda)~0.45, median(mu)~4.0 in EVDs for scop20.1.63 HMMs
783 hit.score_aass = -q.lamda*(hit.score-q.mu)/0.45-3.0 - fmin(hit.score_ss,fmax(0.0,0.5*hit.score-5.0)); // median(lamda)~0.45, median(mu)~3.0 in EVDs for scop20.1.63 HMMs
784 hit.Probab = Probab(hit);
785 if (nbest>0 && par.loc) // correct length correction (not needed for second round of fitting, since lamda very similar)
786 if (par.idummy==0) ////////////////////////////////////////////
787 hit.score += log(q.L*hit.L)*(1/LAMDA-1/vertex[0]);
788 hit.score_sort = hit.score_aass;
789 Overwrite(hit); // copy hit object into current position of hitlist
790
791 if (nbest==0 && par.trans==1) // if in transitive scoring mode (weights file given)
792 TransitiveScoring();
793 else if (nbest==0 && par.trans==2) // if in transitive scoring mode (weights file given)
794 TransitiveScoring2();
795 else if (nbest==0 && par.trans==3) // if in transitive scoring mode (weights file given)
796 TransitiveScoring3();
797 else if (nbest==0 && par.trans==4) // if in transitive scoring mode (weights file given)
798 TransitiveScoring4();
799 }
800 }
801
802
803 /////////////////////////////////////////////////////////////////////////////////////
804 /**
805 * @brief Calculate correlation and score offset for HHblast composite E-values
806 */
807 void
808 HitList::CalculateHHblastCorrelation(HMM& q)
809 {
810 int nfunc=0; // number of function calls
811 double tol; // Maximum relative tolerance when minimizing -log(P)/N (~likelihood)
812 double vertex[2*3]; // three vertices of the simplex in lamda-mu plane
813 double yvertex[3]; // log likelihood values at the three vertices of the simplex
814 Hit hit;
815 Hash<int> excluded(50); // Hash containing names of superfamilies to be excluded from fit
816 excluded.Null(0); // Set int value to return when no data can be retrieved
817
818 // Set sum of HHsearch and PSI-BLAST score for calculating correlation
819 Reset();
820 while (!End())
821 {
822 hit = ReadNext();
823 hit.score_sort = hit.logPval + blast_logPvals->Show(hit.name); // if template not in hash, return log Pval = 0, i.e. Pvalue = 1!
824 Overwrite(hit); // copy hit object into current position of hitlist
825 }
826
827 // Query has SCOP family identifier?
828 if (q.fam && q.fam[0]>='a' && q.fam[0]<='k' && q.fam[1]=='.')
829 {
830 char sfamid[NAMELEN];
831 char* ptr_in_fam=q.fam;
832 while ((ptr_in_fam=strwrd(sfamid,ptr_in_fam,'-')))
833 {
834 char* ptr=strrchr(sfamid,'.');
835 if (ptr) *ptr='\0';
836 excluded.Add(sfamid);
837 fprintf(stderr,"Exclude SCOP superfamily %s ptr_in_fam='%s'\n",sfamid,ptr_in_fam);
838 }
839 }
840
841 // Resort list by sum of log P-values
842 ResortList(); // use InsertSort to resort list according to sum of minus-log-Pvalues
843 Nprof=0;
844 Reset();
845 ReadNext(); // skip best hit
846 while (!End())
847 {
848 hit = ReadNext();
849 if (hit.irep>=2) continue; // use only best alignments
850 // if (hit.Eval<0.005) {if (v>=3) printf("Fitting HHblast correlation coefficient: skipping %s with Evalue=%9.1g\n",hit.name,hit.Eval); continue;}
851 if (Nprof>=MAXPROF) break;
852
853 char sfamid[NAMELEN];
854 char* ptr_in_fam=hit.fam;
855 while ((ptr_in_fam=strwrd(sfamid,ptr_in_fam,'-')))
856 {
857 char* ptr=strrchr(sfamid,'.');
858 if (ptr) *ptr='\0';
859 if (excluded.Contains(sfamid)) break; //HMM is among superfamilies to be excluded
860 }
861 if (excluded.Contains(sfamid)) {
862 if (v>=1) fprintf(stderr,"Exclude hit %s (family %s contains %s)\n",hit.name,hit.fam,sfamid);
863 continue;
864 }
865 score[Nprof] = -hit.score_sort;
866 weight[Nprof] = 1.0; // = hit.weight;
867 // printf("%3i %-12.12s %7.3f + %7.3f = %7.3f \n",Nprof,hit.name,hit.logPval,blast_logPvals->Show(hit.name),-hit.score_sort); //////////////////////
868 printf("%3i %7.3f %7.3f\n",Nprof,hit.Pval,exp(blast_logPvals->Show(hit.name))); //////////////////////
869 Nprof++;
870 }
871
872 // Fit correlation
873 vertex[0]=0.5; vertex[1]=0.2;
874 vertex[2]=vertex[0]+0.2; vertex[3]=vertex[1];
875 vertex[4]=vertex[0]; vertex[5]=vertex[1]+0.2;
876
877 yvertex[0]=RankOrderFitCorr(vertex );
878 yvertex[1]=RankOrderFitCorr(vertex+2);
879 yvertex[2]=RankOrderFitCorr(vertex+4);
880 // yvertex[0]=LogLikelihoodCorr(vertex );
881 // yvertex[1]=LogLikelihoodCorr(vertex+2);
882 // yvertex[2]=LogLikelihoodCorr(vertex+4);
883 tol = 1e-6;
884 v=3;//////////////////////////////////
885 // Find correlation and offset that minimize mean square deviation of reported composite Pvalues from actual
886 if (v>=2) printf("Fitting correlation coefficient for HHblast...\niter corr offset logLikelihood tol\n");
887 float rtol = FindMin(2,vertex,yvertex,tol,nfunc, HitList::RankOrderFitCorr_static);
888 if (v>=2) printf("%3i %-7.3f %-7.2f %-7.3f %-7.1E\n\n",nfunc,vertex[0],vertex[1],yvertex[0],rtol);
889 if (vertex[0]<0) vertex[0]=0.0;
890
891 // Print correlation and offset for profile
892 printf("HHblast correlation=%-6.3f score offset=%-6.3f\n",vertex[0],vertex[1]);
893 v=2;//////////////////////////////////
894 }
895
896
897 /**
898 * @brief Calculate HHblast composite E-values
899 */
900 inline double
901 corr_HHblast(float Nq, float Nt)
902 {
903 return 0.5;
904 }
905
906 /**
907 * @brief Calculate HHblast composite E-values
908 */
909 inline double
910 offset_HHblast(float Nq, float Nt)
911 {
912 return 0.0;
913 }
914
915 //////////////////////////////////////////////////////////////////////////////
916 /**
917 * @brief Calculate HHblast composite E-values
918 */
919 void
920 HitList::CalculateHHblastEvalues(HMM& q)
921 {
922 Hit hit;
923 float corr, offset; // correlation coefficient and offset for calculating composite HHblast P-values
924
925 Reset();
926 while (!End())
927 {
928 hit = ReadNext();
929 corr = corr_HHblast(q.Neff_HMM,hit.Neff_HMM);
930 offset = offset_HHblast(q.Neff_HMM,hit.Neff_HMM);
931 hit.score_sort = hit.logPval + blast_logPvals->Show(hit.name);
932 hit.logPval = logPvalue_HHblast(-hit.score_sort+offset,corr); // overwrite logPval from HHsearch with composite logPval from HHblast
933 hit.Pval = Pvalue_HHblast(-hit.score_sort+offset,corr); // overwrite P-value from HHsearch with composite P-value from HHblast
934 hit.Eval = exp(hit.logPval+log(N_searched)); // overwrite E-value from HHsearch with composite E-value from HHblast
935 hit.Probab = Probab(hit);
936 Overwrite(hit); // copy hit object into current position of hitlist
937 }
938 ResortList(); // use InsertSort to resort list according to sum of minus-log-Pvalues
939 }
940
941
942 //////////////////////////////////////////////////////////////////////////////
943 /**
944 * @brief Read file generated by blastpgp (default output) and store P-values in hash
945 */
946 void
947 HitList::ReadBlastFile(HMM& q)
948 {
949 char line[LINELEN]=""; // input line
950 int Ndb; // number of sequences in database
951 int Ldb=0; // size of database in number of amino acids
952 char* templ;
953 int i;
954 if (!blast_logPvals) { blast_logPvals = new(Hash<float>); blast_logPvals->New(16381,0); }
955
956 FILE* blaf = NULL;
957 if (!strcmp(par.blafile,"stdin")) blaf=stdin;
958 else
959 {
960 blaf = fopen(par.blafile,"rb");
961 if (!blaf) OpenFileError(par.blafile);
962 }
963
964 // Read number of sequences and size of database
965 while (fgetline(line,LINELEN-1,blaf) && !strstr(line,"sequences;"));
966 if (!strstr(line,"sequences;")) FormatError(par.blafile,"No 'Database:' string found.");
967 char* ptr=line;
968 Ndb = strint(ptr);
969 if (Ndb==INT_MIN) FormatError(par.blafile,"No integer for number of sequences in database found.");
970 while ((i=strint(ptr))>INT_MIN) Ldb = 1000*Ldb + i;
971 if (Ldb==0) FormatError(par.blafile,"No integer for size of database found.");
972 printf("\nNumber of sequences in database = %i Size of database = %i\n",Ndb,Ldb);
973
974 // Read all E-values and sequence lengths
975 while (fgetline(line,LINELEN-1,blaf))
976 {
977 if (line[0]=='>')
978 {
979 // Read template name
980 templ = new(char[255]);
981 ptr = line+1;
982 strwrd(templ,ptr);
983 if (!blast_logPvals->Contains(templ)) // store logPval only for best HSP with template
984 {
985 // Read length
986 while (fgetline(line,LINELEN-1,blaf) && !strstr(line,"Length ="));
987 ptr = line+18;
988 int length = strint(ptr);
989 // Read E-value
990 fgetline(line,LINELEN-1,blaf);
991 fgetline(line,LINELEN-1,blaf);
992 float EvalDB; // E-value[seq-db] = Evalue for comparison Query vs. database, from PSI-BLAST
993 float EvalQT; // E-value[seq-seq] = Evalue for comparison Query vs. template (seq-seq)
994 double logPval;
995 ptr = strstr(line+20,"Expect =");
996 if (!ptr) FormatError(par.blafile,"No 'Expect =' string found.");
997 if (sscanf(ptr+8,"%g",&EvalDB)<1)
998 {
999 ptr[7]='1';
1000 if (sscanf(ptr+7,"%g",&EvalDB)<1)
1001 FormatError(par.blafile,"No Evalue found after 'Expect ='.");
1002 }
1003 // Calculate P-value[seq-seq] = 1 - exp(-E-value[seq-seq]) = 1 - exp(-Lt/Ldb*E-value[seq-db])
1004 EvalQT = length/double(Ldb)*double(EvalDB);
1005 if (EvalQT>1E-3) logPval = log(1.0-exp(-EvalQT)); else logPval=log(double(EvalQT)+1.0E-99);
1006 blast_logPvals->Add(templ,logPval);
1007 printf("template=%-10.10s length=%-3i EvalDB=%8.2g EvalQT=%8.2g P-value=%8.2g log Pval=%8.2g\n",templ,length,EvalDB,EvalQT,exp(logPval),logPval);
1008 }
1009 else {
1010 delete[] templ; templ = NULL;
1011 }
1012 }
1013 }
1014 fclose(blaf);
1015 }
1016
1017
1018 /////////////////////////////////////////////////////////////////////////////////////
1019 /**
1020 * @brief Calculate output of hidden neural network units
1021 */
1022 inline float
1023 calc_hidden_output(const float* weights, const float* bias, float Lqnorm, float Ltnorm, float Nqnorm, float Ntnorm)
1024 {
1025 float res;
1026 // Calculate activation of hidden unit = sum of all inputs * weights + bias
1027 res = Lqnorm*weights[0] + Ltnorm*weights[1] + Nqnorm*weights[2] + Ntnorm*weights[3] + *bias;
1028 res = 1.0 / (1.0 + exp(-(res ))); // logistic function
1029 return res;
1030 }
1031
1032 ////////////////////////////////////////////////////////////////////////////////////
1033 /**
1034 * @brief Neural network regressions of lamda for EVD
1035 */
1036 inline float
1037 lamda_NN(float Lqnorm, float Ltnorm, float Nqnorm, float Ntnorm)
1038 {
1039 const int inputs = 4;
1040 const int hidden = 4;
1041 const float biases[] = {-0.73195, -1.43792, -1.18839, -3.01141}; // bias for all hidden units
1042 const float weights[] = { // Weights for the neural networks (column = start unit, row = end unit)
1043 -0.52356, -3.37650, 1.12984, -0.46796,
1044 -4.71361, 0.14166, 1.66807, 0.16383,
1045 -0.94895, -1.24358, -1.20293, 0.95434,
1046 -0.00318, 0.53022, -0.04914, -0.77046,
1047 2.45630, 3.02905, 2.53803, 2.64379
1048 };
1049 float lamda=0.0;
1050 for (int h = 0; h<hidden; h++) {
1051 lamda += calc_hidden_output( weights+inputs*h, biases+h, Lqnorm,Ltnorm,Nqnorm,Ntnorm ) * weights[hidden*inputs+h];
1052 }
1053 return lamda;
1054 }
1055
1056 ////////////////////////////////////////////////////////////////////////////////////
1057 /**
1058 * @brief Neural network regressions of mu for EVD
1059 */
1060 inline float
1061 mu_NN(float Lqnorm, float Ltnorm, float Nqnorm, float Ntnorm)
1062 {
1063 const int inputs = 4;
1064 const int hidden = 6;
1065 const float biases[] = {-4.25264, -3.63484, -5.86653, -4.78472, -2.76356, -2.21580}; // bias for all hidden units
1066 const float weights[] = { // Weights for the neural networks (column = start unit, row = end unit)
1067 1.96172, 1.07181, -7.41256, 0.26471,
1068 0.84643, 1.46777, -1.04800, -0.51425,
1069 1.42697, 1.99927, 0.64647, 0.27834,
1070 1.34216, 1.64064, 0.35538, -8.08311,
1071 2.30046, 1.31700, -0.46435, -0.46803,
1072 0.90090, -3.53067, 0.59212, 1.47503,
1073 -1.26036, 1.52812, 1.58413, -1.90409, 0.92803, -0.66871
1074 };
1075 float mu=0.0;
1076 for (int h = 0; h<hidden; h++) {
1077 mu += calc_hidden_output( weights+inputs*h, biases+h, Lqnorm,Ltnorm,Nqnorm,Ntnorm ) * weights[hidden*inputs+h];
1078 }
1079 return 20.0*mu;
1080 }
1081
1082 //////////////////////////////////////////////////////////////////////////////
1083 /**
1084 * @brief Calculate Pvalues as a function of query and template lengths and diversities
1085 */
1086 void
1087 HitList::CalculatePvalues(HMM& q)
1088 {
1089 Hit hit;
1090 float lamda=0.4, mu=3.0;
1091 const float log1000=log(1000.0);
1092
1093 if (par.idummy!=2)
1094 {
1095 printf("WARNING: idummy should have been ==2 (no length correction)\n");
1096 exit(4);
1097 }
1098
1099 if(N_searched==0) N_searched=1;
1100 if (v>=2)
1101 printf("Calculate Pvalues as a function of query and template lengths and diversities...\n");
1102 Reset();
1103 while (!End())
1104 {
1105 hit = ReadNext();
1106
1107 if (par.loc)
1108 {
1109 lamda = lamda_NN( log(q.L)/log1000, log(hit.L)/log1000, q.Neff_HMM/10.0, hit.Neff_HMM/10.0 );
1110 mu = mu_NN( log(q.L)/log1000, log(hit.L)/log1000, q.Neff_HMM/10.0, hit.Neff_HMM/10.0 );
1111 // if (v>=3 && nhits++<20)
1112 // printf("hit=%-10.10s Lq=%-4i Lt=%-4i Nq=%5.2f Nt=%5.2f => lamda=%-6.3f mu=%-6.3f\n",hit.name,q.L,hit.L,q.Neff_HMM,hit.Neff_HMM,lamda,mu);
1113 }
1114 else
1115 {
1116 printf("WARNING: global calibration not yet implemented!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
1117 }
1118 hit.logPval = logPvalue(hit.score,lamda,mu);
1119 hit.Pval = Pvalue(hit.score,lamda,mu);
1120 hit.Eval=exp(hit.logPval+log(N_searched));
1121 // hit.score_aass = hit.logPval/LAMDA-3.0 - hit.score_ss; // median(lamda)~0.45, median(mu)~3.0 in EVDs for scop20.1.63 HMMs
1122 // P-value = 1- exp(-exp(-lamda*(Saa-mu))) => -lamda*(Saa-mu) = log(-log(1-Pvalue))
1123 hit.score_aass = (hit.logPval<-10.0? hit.logPval : log(-log(1-hit.Pval)) )/0.45 - fmin(lamda*hit.score_ss,fmax(0.0,0.2*(hit.score-8.0)))/0.45 - 3.0;
1124 hit.score_sort = hit.score_aass;
1125 hit.Probab = Probab(hit);
1126 Overwrite(hit);
1127 }
1128 SortList();
1129 Reset();
1130 return;
1131 }
1132
1133 //////////////////////////////////////////////////////////////////////////////
1134 /**
1135 * @brief Calculate Pvalues from calibration of 0: query HMM, 1:template HMMs, 2: both
1136 */
1137 void
1138 HitList::GetPvalsFromCalibration(HMM& q)
1139 {
1140 Hit hit;
1141 char warn=0;
1142 if(N_searched==0) N_searched=1;
1143 if (v>=2)
1144 {
1145 switch (par.calm)
1146 {
1147 case 0:
1148 printf("Using lamda=%-5.3f and mu=%-5.2f from calibrated query HMM %s. \n",q.lamda,q.mu,q.name);
1149 printf("Note that HMMs need to be recalibrated when changing HMM-HMM alignment options.\n");
1150 break;
1151 case 1:
1152 printf("Using score distribution parameters lamda and mu from database HMMs \n");
1153 break;
1154 case 2:
1155 printf("Combining score distribution parameters lamda and mu from query and database HMMs\n");
1156 printf("Note that HMMs need to be recalibrated when changing HMM-HMM alignment options.\n");
1157 break;
1158 }
1159 }
1160 Reset();
1161 while (!End())
1162 {
1163 hit = ReadNext();
1164 if (par.calm==0 || (hit.logPvalt==0) )
1165 {
1166 hit.logPval = logPvalue(hit.score,q.lamda,q.mu);
1167 hit.Pval = Pvalue(hit.score,q.lamda,q.mu);
1168 if (par.calm>0 && warn++<1 && v>=1)
1169 printf("Warning: some template HMM (e.g. %s) are not calibrated. Using query calibration.\n",hit.name);
1170 }
1171 else if (par.calm==1)
1172 {
1173 hit.logPval = hit.logPvalt;
1174 hit.Pval = hit.Pvalt;
1175 }
1176 else if (par.calm==2)
1177 {
1178 hit.logPval = 0.5*( logPvalue(hit.score,q.lamda,q.mu) + hit.logPvalt);
1179 hit.Pval = sqrt( Pvalue(hit.score,q.lamda,q.mu) * hit.Pvalt);
1180 if (v>=5) printf("Score: %7.1f lamda: %7.1f mu: %7.1f P-values: query-calibrated: %8.2G template-calibrated: %8.2G geometric mean: %8.2G\n",hit.score,q.lamda,q.mu,Pvalue(hit.score,q.lamda,q.mu),hit.Pvalt,hit.Pval);
1181 }
1182
1183 hit.Eval=exp(hit.logPval+log(N_searched));
1184 // hit.score_aass = hit.logPval/LAMDA-3.0 - hit.score_ss; // median(lamda)~0.45, median(mu)~3.0 in EVDs for scop20.1.63 HMMs
1185 // P-value = 1- exp(-exp(-lamda*(Saa-mu))) => -lamda*(Saa-mu) = log(-log(1-Pvalue))
1186 hit.score_aass = (hit.logPval<-10.0? hit.logPval : log(-log(1-hit.Pval)) ) / 0.45-3.0 - fmin(hit.score_ss,fmax(0.0,0.5*hit.score-5.0));
1187 hit.score_sort = hit.score_aass;
1188 hit.Probab = Probab(hit);
1189 Overwrite(hit);
1190 }
1191 SortList();
1192 Reset();
1193 return;
1194 }
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204 //////////////////////////////////////////////////////////////////////////////
1205 //////////////////////////////////////////////////////////////////////////////
1206 //////////////////////////////////////////////////////////////////////////////
1207 // Transitive scoring
1208 //////////////////////////////////////////////////////////////////////////////
1209 //////////////////////////////////////////////////////////////////////////////
1210 //////////////////////////////////////////////////////////////////////////////
1211
1212
1213
1214
1215
1216
1217
1218 /////////////////////////////////////////////////////////////////////////////////////
1219 /**
1220 * @brief Calculate P-values and Probabilities from transitive scoring over whole database
1221 */
1222 void
1223 HitList::TransitiveScoring()
1224 {
1225 void PrintMatrix(float** V, int N);
1226 void PrintMatrix(double** V, int N);
1227
1228 float** Z; // matrix of intra-db Z-scores Z_kl
1229 float** C; // covariance matrix for Z_k: C_kl = sum_m=1^N (Z_km * Z_lm)
1230 char** fold; // fold name of HMM k
1231 char** fam; // family of HMM k
1232 float* Prob; // probability of HMM k
1233 float* Zq; // Zq[k] = Z-score between query and database HMM k
1234 float* Ztq; // Ztq[k] = transitive Z-score from query to database HMM k: Ztq[k] = sum_l[ w_ql * Z_lk] / normalization_q
1235 float* Zrq; // Zrq[k] = transitive Z-score from database HMM k to query: Zrq[k] = sum_l[ w_kl * Z_lq] / normalization_k
1236 float* w; // unnormalized weight matrix; w[l] is w_ql or w_kl, respectively
1237 int* ll; // ll[m] is the m'th index l for which Z_lq, Z_lk > Zmin_trans
1238 int N; // dimension of weight matrix is NxN
1239 int M; // number of HMMs l with Z_ql>Ztrans_min (or Z_lk>Ztrans_min, respectively)
1240 int k,l,m,n; // indices for database HMMs
1241 char name[NAMELEN];
1242 Hash<int> index(MAXPROF+7); // index{name} = index of HMM name in {1,...,N}
1243 index.Null(-1); // Set int value to return when no data can be retrieved
1244 Hash<int> excluded(13); // Hash containing names of superfamilies to be excluded from fit
1245 excluded.Null(0); // Set int value to return when no data can be retrieved
1246 Hit hit;
1247 size_t unused; /* disable fread gcc warning */
1248
1249 // Read weights matrix W with index hash and names array
1250 fprintf(stderr,"Reading in weights file\n");
1251 FILE* wfile = fopen(par.wfile,"rb");
1252 if (v>=1 && wfile==NULL)
1253 {
1254 fprintf(stderr,"Error: %s could not be opened: (N_searched=%i) ",par.wfile,N_searched);
1255 perror("fopen");
1256 fprintf(stderr,"Skipping caclulation of transitive P-values\n");
1257 par.trans=0;
1258 return;
1259 }
1260 unused = fread(&N,sizeof(int),1,wfile); // read matrix dimension (i.e. number of HMMs in database)
1261 if (v>=1 && N!=N_searched)
1262 {
1263 fprintf(stderr,"Error: Number %i of HMMs in weight file is different from number %i of HMMs in searched databases. \n",N,N_searched);
1264 fprintf(stderr,"Skipping caclulation of transitive P-values\n");
1265 par.trans=0;
1266 return;
1267 }
1268 if (v>=2) fprintf(stderr,"Calculating transitive P-values for %i HMMs\n",N);
1269 // Read names of HMMs (to specify mapping of HMM to matrix indices)
1270 for (k=0; k<N; k++)
1271 {
1272 unused = fread(name,sizeof(char),IDLEN,wfile);
1273 index.Add(name,k);
1274 }
1275 // Read symmetric Z-scores matrix
1276 Z = new(float*[N]);
1277 for (k=0; k<N; k++)
1278 {
1279 Z[k] = new(float[N]);
1280 for (l=0; l<k; l++) Z[k][l] = Z[l][k];
1281 unused = fread(Z[k]+k,sizeof(float),N-k,wfile);
1282 }
1283 // Read symmetric covariance matrix
1284 C = new(float*[N]);
1285 for (k=0; k<N; k++)
1286 {
1287 C[k] = new(float[N]);
1288 for (l=0; l<k; l++) C[k][l] = C[l][k];
1289 unused = fread(C[k]+k,sizeof(float),N-k,wfile);
1290 }
1291 fclose(wfile);
1292
1293 // Allocate memory
1294 Zq = new(float[N]);
1295 Ztq = new(float[N]);
1296 Zrq = new(float[N]);
1297 fold = new(char*[N]);
1298 fam = new(char*[N]);
1299 Prob = new(float[N]);
1300 ll = new(int[N]);
1301 w = new(float[N]);
1302
1303 // Transform P-values to normally distributed Z-scores and store in Zq vector
1304 fprintf(stderr,"Transform P-values to Z-scores\n");
1305 float Zmax_neg = Score2Z( -log(MINEVALEXCL) + log(N_searched) ); // calculate Z-score corresponding to E-value MINEVALEXCL
1306 float Zmin_trans = Score2Z( -log(par.Emax_trans) + log(N_searched) ); // calculate Z-score corresponding to E-value par.Emax_trans
1307 printf("Zmax = %6.2f Zmin = %6.2f \n",Zmax_neg,Zmin_trans);
1308
1309 Reset();
1310 while (!End())
1311 {
1312 hit = ReadNext();
1313 if (hit.irep>1) continue;
1314 k = index.Show(hit.name);
1315 if (k<0) {fprintf(stderr,"Error: no index found in weights file for domain %s\n",hit.name); exit(1);}
1316 if (hit.logPvalt<0)
1317 Zq[k] = 0.5*Score2Z(fabs(hit.logPval)) + 0.5*Score2Z(fabs(hit.logPvalt)); // Zq[k] = 0.5*(Zkq + Zqk)
1318 else
1319 Zq[k] = Score2Z(fabs(hit.logPval)); // Zq[k] = Zqk
1320 // printf("%4i %-10.10s logPvalt=%9g Zq=%9f\n",k,hit.name,hit.logPvalt,Zq[k]);
1321 // if (isnan(Zq[k])) {
1322 // fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n");
1323 // printf("%4i %-10.10s logPval=%9g logPvalt=%9g Zq=%9f\n",k,hit.name,hit.logPval,hit.logPvalt,Zq[k]);
1324 // par.trans=0;
1325 // return;
1326 // }
1327 if (Zq[k]>Zmax_neg) excluded.Add(hit.fold);
1328 fold[k] = new(char[IDLEN]);
1329 fam[k] = new(char[IDLEN]);
1330 strcpy(fold[k],hit.fold);
1331 strcpy(fam[k],hit.fam);
1332 weight[k] = hit.weight;
1333 Prob[k] = hit.Probab;
1334 }
1335
1336 if (v>=3)
1337 {
1338 excluded.Reset();
1339 while (!excluded.End())
1340 {
1341 excluded.ReadNext(name);
1342 printf("Excluded fold %s from fitting to Ztq\n",name);
1343 }
1344 }
1345
1346
1347 ////////////////////////////////////////////////////////////////
1348 // Calculate transitive score (query->l) Zt[l]
1349
1350 // Construct vector ll of indices l for which Z_lq > Zmin_trans
1351 m = 0;
1352 for (l=0; l<N; l++)
1353 if (Zq[l]>=Zmin_trans) ll[m++]=l;
1354 M = m; // number of indices l for which Z_lq,Z_lk > Zmin_trans
1355
1356 // for (m=0; m<M; m++)
1357 // fprintf(stderr,"m=%-4i l=%-4i %-10.10s Zq[l]=%7f\n",m,ll[m],fam[ll[m]],Zq[ll[m]]);
1358
1359 if (M<=1)
1360 for (k=0; k<N; k++) Ztq[k]=0.0;
1361 else
1362 {
1363 // Generate submatrix of C for indices l for which Z_lq,Z_lk > Zmin_trans
1364 double** Csub = new(double*[M]);
1365 double** Cinv = new(double*[M]);
1366 for (m=0; m<M; m++)
1367 {
1368 Csub[m] = new(double[M]);
1369 Cinv[m] = new(double[M]);
1370 for (n=0; n<M; n++)
1371 Csub[m][n] = double(C[ll[m]][ll[n]]);
1372 }
1373
1374 if (v>=3)
1375 {
1376 fprintf(stderr,"Covariance matrix\n");
1377 PrintMatrix(Csub,M);
1378 }
1379
1380 // Invert Csub
1381 fprintf(stderr,"Calculate inverse of covariance submatrix\n");
1382 InvertMatrix(Cinv,Csub,M);
1383
1384 if (v>=3)
1385 {
1386 fprintf(stderr,"Inverse covariance matrix\n");
1387 PrintMatrix(Cinv,M);
1388 }
1389
1390 // Calculate weights w[l]
1391 for (m=0; m<M; m++)
1392 {
1393 double sum = 0.0;
1394 for (n=0; n<M; n++)
1395 sum += 1.0 * Cinv[m][n];
1396 w[m] = fmax(sum,0.0);
1397 }
1398 for (l=0; l<M; l++){
1399 delete[](Cinv[l]); (Cinv[l]) = NULL;
1400 }
1401 delete[](Cinv); (Cinv) = NULL;
1402
1403 // Calculate Ztq[k] for all HMMs k
1404 fprintf(stderr,"Calculate Ztq vector of transitive Z-scores\n");
1405 float norm = NormalizationFactor(Csub,w,M);
1406 for (k=0; k<N; k++)
1407 {
1408 double sumZ = 0.0;
1409 for (m=0; m<M; m++)
1410 sumZ += w[m] * Z[ll[m]][k];
1411 Ztq[k] = sumZ/norm;
1412 }
1413
1414 for (l=0; l<M; l++){
1415 delete[](Csub[l]); (Csub[l]) = NULL;
1416 }
1417 delete[](Csub); (Csub) = NULL;
1418 }
1419
1420 ////////////////////////////////////////////////////////////////
1421 // Calculate reverse transitive score (l->query-) Zrq[l]
1422
1423 fprintf(stderr,"Calculate Zrq vector of transitive Z-scores\n");
1424 for (k=0; k<N; k++)
1425 {
1426 // Construct vector ll of indices l for which Z_lk > Zmin_tran
1427 m = 0;
1428 for (l=0; l<N; l++)
1429 if (Z[l][k]+Z[k][l]>=2*Zmin_trans) ll[m++]=l;
1430 int M = m; // number of indices l for which Z_lq,Z_lk > Zmin_tran
1431
1432
1433 // fprintf(stderr,"\nfam[k]: %s\n",fam[k]);
1434 // for (m=0; m<M; m++)
1435 // printf(stderr,"m=%-4i k=%-4i l=%-4i %-10.10s Zq[l]=%7f Z_lk=%7f \n",m,k,ll[m],fold[ll[m]],Zq[ll[m]],Z[k][ll[m]]);
1436
1437 if (M<=1)
1438 {
1439 Zrq[k] = Zq[k];
1440 }
1441 else
1442 {
1443 // Generate submatrix of C for indices l for which Z_lq,Z_lk > Zmin_trans
1444 double** Csub = new(double*[M]);
1445 for (m=0; m<M; m++)
1446 {
1447 Csub[m] = new(double[M]);
1448 for (n=0; n<M; n++)
1449 Csub[m][n] = double(C[ll[m]][ll[n]]);
1450 }
1451 // fprintf(stderr,"Covariance matrix\n");
1452 // PrintMatrix(Csub,M);
1453
1454 if (M==2)
1455 {
1456 for (m=0; m<M; m++) w[m] = 1.0/M;
1457 }
1458 else
1459 {
1460
1461 double** Cinv = new(double*[M]);
1462 for (m=0; m<M; m++) Cinv[m] = new(double[M]);
1463
1464 // Invert Csub
1465 InvertMatrix(Cinv,Csub,M);
1466
1467 // fprintf(stderr,"Inverse covariance matrix\n");
1468 // PrintMatrix(Cinv,M);
1469
1470 // Calculate weights w[l]
1471 for (m=0; m<M; m++)
1472 {
1473 double sum = 0.0;
1474 for (n=0; n<M; n++)
1475 sum += 1.0 * Cinv[m][n];
1476 w[m] = fmax(sum,0.0);
1477 }
1478
1479 // for (m=0; m<M; m++) fprintf(stderr,"w[%i]=%8.2g\n",m,w[m]);
1480
1481 for (l=0; l<M; l++){
1482 delete[](Cinv[l]); (Cinv[l]) = NULL;
1483 }
1484 delete[](Cinv); (Cinv) = NULL;
1485 }
1486
1487 // Calculate Zrq[k] and normalize
1488 float norm = NormalizationFactor(Csub,w,M);
1489 double sumZ = 0.0;
1490 for (m=0; m<M; m++)
1491 sumZ += w[m] * Zq[ll[m]];
1492 Zrq[k] = sumZ/norm;
1493
1494 for (l=0; l<M; l++){
1495 delete[](Csub[l]); (Csub[l]) = NULL;
1496 }
1497 delete[](Csub); (Csub) = NULL;
1498 }
1499
1500 // fprintf(stderr,"\nZq[k]=%8.2g Zq1[k]=%8.2g\n",Zq[k],Zrq[k]);
1501 }
1502
1503 // Total Z-score = weighted sum over original Z-score, forward transitive and reverse transitive Z-score
1504 for (k=0; k<N; k++)
1505 {
1506 float Zqtot = Zq[k] + par.wtrans*(Ztq[k]+Zrq[k]);
1507 // if (isnan(Zqtot))
1508 // {
1509 // fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n");
1510 // printf("%4i %-10.10s Zq=%6.2f Ztq=%6.2f Zrq=%6.2f Zqtot=%6.2f\n",k,fam[k],Zq[k],Ztq[k],Zrq[k],Zqtot);
1511 // par.trans=0;
1512 // return;
1513 // }
1514 if (v>=2 && Zq[k] + Zqtot > 2*Zmin_trans) {
1515 printf("%4i %-10.10s Zq=%6.2f Ztq=%6.2f Zrq=%6.2f -> Zqtot=%6.2f\n",k,fam[k],Zq[k],Ztq[k],Zrq[k],Zqtot);
1516 }
1517 Ztq[k] = Zqtot;
1518 }
1519
1520 // Calculate mean and standard deviation of Z1q
1521 fprintf(stderr,"Calculate mean and standard deviation of Ztq\n");
1522 double sumw=0.0;
1523 double sumZ=0.0;
1524 double sumZ2=0.0;
1525 for (k=0; k<N; k++)
1526 {
1527 if (excluded.Contains(fold[k])) continue;
1528 sumw += weight[k];
1529 sumZ += weight[k]*Ztq[k];
1530 sumZ2 += weight[k]*Ztq[k]*Ztq[k];
1531 // if (isnan(sumZ))
1532 // {
1533 // fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n");
1534 // printf("%4i %-10.10s Zq=%9f Zrq=%9f Ztq=%9f\n",k,fam[k],Zq[k],Zrq[k],Ztq[k]);
1535 // par.trans=0;
1536 // return;
1537 // }
1538 }
1539 float mu = sumZ/sumw;
1540 float sigma = sqrt(sumZ2/sumw-mu*mu);
1541 if (v>=2) printf("mu(Ztq)=%6.3f sigma(Ztq)=%6.2f\n",mu,sigma);
1542 sigma *= 1.01;// correct different fitting of EVD and normal variables
1543
1544 // Normalize Ztq and calculate P1-values
1545 fprintf(stderr,"Normalize Ztq and calculate P1-values\n");
1546 Reset();
1547 while (!End())
1548 {
1549 hit = ReadNext();
1550 hit.logPval = -Z2Score((Ztq[index.Show(hit.name)]-mu)/sigma);
1551 hit.E1val = N_searched*(hit.logPval<-100.0? 0.0 : exp(hit.logPval));
1552 // P-value = 1- exp(-exp(-lamda*(Saa-mu))) => -lamda*(Saa-mu) = log(-log(1-Pvalue))
1553 hit.score_aass = (hit.logPval<-10.0? hit.logPval : log(-log(1-exp(hit.logPval))) ) / 0.45-3.0 - hit.score_ss;
1554 hit.Probab = Probab(hit);
1555 hit.score_sort = hit.logPval;
1556 Overwrite(hit); // copy hit object into current position of hitlist
1557 }
1558
1559 for (k=0; k<N; k++){
1560 delete[](Z[k]); (Z[k]) = NULL;
1561 }
1562 for (k=0; k<N; k++){
1563 delete[](C[k]); (C[k]) = NULL;
1564 }
1565 for (k=0; k<N; k++){
1566 delete[](fold[k]); (fold[k]) = NULL;
1567 }
1568 for (k=0; k<N; k++){
1569 delete[](fam[k]); (fam[k]) = NULL;
1570 }
1571 delete[](C); (C) = NULL;
1572 delete[](Z); (Z) = NULL;
1573 delete[](fold); (fold) = NULL;
1574 delete[](fam); (fam) = NULL;
1575 delete[](Prob); (Prob) = NULL;
1576 delete[](ll); (ll) = NULL;
1577 delete[](Zq); (Zq) = NULL;
1578 delete[](Ztq); (Ztq) = NULL;
1579 }
1580
1581
1582
1583 //////////////////////////////////////////////////////////////////////////////
1584 /**
1585 * @brief Calculate P-values and Probabilities from transitive scoring over whole database
1586 */
1587 void
1588 HitList::TransitiveScoring2()
1589 {
1590 void PrintMatrix(float** V, int N);
1591 void PrintMatrix(double** V, int N);
1592
1593 float** Z; // matrix of intra-db Z-scores Z_kl
1594 float** C; // covariance matrix for Z_k: C_kl = sum_m=1^N (Z_km * Z_lm)
1595 char** fold; // fold name of HMM k
1596 char** fam; // family of HMM k
1597 float* Prob; // probability of HMM k
1598 float* Zq; // Zq[k] = Z-score between query and database HMM k
1599 float* Ztq; // Ztq[k] = transitive Z-score from query to database HMM k: Ztq[k] = sum_l[ w_ql * Z_lk] / normalization_q
1600 float* Zrq; // Zrq[k] = transitive Z-score from database HMM k to query: Zrq[k] = sum_l[ w_kl * Z_lq] / normalization_k
1601 float* w; // unnormalized weight matrix; w[l] is w_ql or w_kl, respectively
1602 int* ll; // ll[m] is the m'th index l for which Z_lq, Z_lk > Zmin_trans
1603 int N; // dimension of weight matrix is NxN
1604 int M; // number of HMMs l with Z_ql>Ztrans_min (or Z_lk>Ztrans_min, respectively)
1605 int k,l,m,n; // indices for database HMMs
1606 char name[NAMELEN];
1607 Hash<int> index(MAXPROF+7); // index{name} = index of HMM name in {1,...,N}
1608 index.Null(-1); // Set int value to return when no data can be retrieved
1609 Hash<int> excluded(13); // Hash containing names of superfamilies to be excluded from fit
1610 excluded.Null(0); // Set int value to return when no data can be retrieved
1611 Hit hit;
1612 size_t unused; /* disable fread gcc warning */
1613
1614 // Read weights matrix W with index hash and names array
1615 fprintf(stderr,"Reading in weights file\n");
1616 FILE* wfile = fopen(par.wfile,"rb");
1617 if (v>=1 && wfile==NULL)
1618 {
1619 fprintf(stderr,"Error: %s could not be opened: (N_searched=%i) ",par.wfile,N_searched);
1620 perror("fopen");
1621 fprintf(stderr,"Skipping caclulation of transitive P-values\n");
1622 par.trans=0;
1623 return;
1624 }
1625 unused = fread(&N,sizeof(int),1,wfile); // read matrix dimension (i.e. number of HMMs in database)
1626 if (v>=1 && N!=N_searched)
1627 {
1628 fprintf(stderr,"Error: Number %i of HMMs in weight file is different from number %i of HMMs in searched databases. \n",N,N_searched);
1629 fprintf(stderr,"Skipping caclulation of transitive P-values\n");
1630 par.trans=0;
1631 return;
1632 }
1633 if (v>=2) fprintf(stderr,"Calculating transitive P-values for %i HMMs\n",N);
1634 // Read names of HMMs (to specify mapping of HMM to matrix indices)
1635 for (k=0; k<N; k++)
1636 {
1637 unused = fread(name,sizeof(char),IDLEN,wfile);
1638 index.Add(name,k);
1639 }
1640 // Read symmetric Z-scores matrix
1641 Z = new(float*[N]);
1642 for (k=0; k<N; k++)
1643 {
1644 Z[k] = new(float[N]);
1645 for (l=0; l<k; l++) Z[k][l] = Z[l][k];
1646 unused = fread(Z[k]+k,sizeof(float),N-k,wfile);
1647 }
1648 // Read symmetric covariance matrix
1649 C = new(float*[N]);
1650 for (k=0; k<N; k++)
1651 {
1652 C[k] = new(float[N]);
1653 for (l=0; l<k; l++) C[k][l] = C[l][k];
1654 unused = fread(C[k]+k,sizeof(float),N-k,wfile);
1655 }
1656 fclose(wfile);
1657
1658 // Allocate memory
1659 Zq = new(float[N]);
1660 Ztq = new(float[N]);
1661 Zrq = new(float[N]);
1662 fold = new(char*[N]);
1663 fam = new(char*[N]);
1664 Prob = new(float[N]);
1665 ll = new(int[N]);
1666 w = new(float[N]);
1667
1668 // Transform P-values to normally distributed Z-scores and store in Zq vector
1669 fprintf(stderr,"Transform P-values to Z-scores\n");
1670 float Zmax_neg = Score2Z( -log(MINEVALEXCL) + log(N_searched) ); // calculate Z-score corresponding to E-value MINEVALEXCL
1671 float Zmin_trans = Score2Z( -log(par.Emax_trans) + log(N_searched) ); // calculate Z-score corresponding to E-value par.Emax_trans
1672 printf("Zmax = %6.2f Zmin = %6.2f \n",Zmax_neg,Zmin_trans);
1673
1674 Reset();
1675 while (!End())
1676 {
1677 hit = ReadNext();
1678 if (hit.irep>1) continue;
1679 k = index.Show(hit.name);
1680 if (k<0) {fprintf(stderr,"Error: no index found in weights file for domain %s\n",hit.name); exit(1);}
1681 if (hit.logPvalt<0)
1682 Zq[k] = 0.5*Score2Z(fabs(hit.logPval)) + 0.5*Score2Z(fabs(hit.logPvalt)); // Zq[k] = 0.5*(Zkq + Zqk)
1683 else
1684 Zq[k] = Score2Z(fabs(hit.logPval)); // Zq[k] = Zqk
1685 // printf("%4i %-10.10s logPvalt=%9g Zq=%9f\n",k,hit.name,hit.logPvalt,Zq[k]);
1686 // if (isnan(Zq[k]))
1687 // {
1688 // fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n");
1689 // printf("%4i %-10.10s logPval=%9g logPvalt=%9g Zq=%9f\n",k,hit.name,hit.logPval,hit.logPvalt,Zq[k]);
1690 // par.trans=0;
1691 // return;
1692 // }
1693 if (Zq[k]>Zmax_neg) excluded.Add(hit.fold);
1694 fold[k] = new(char[IDLEN]);
1695 fam[k] = new(char[IDLEN]);
1696 strcpy(fold[k],hit.fold);
1697 strcpy(fam[k],hit.fam);
1698 weight[k] = hit.weight;
1699 Prob[k] = hit.Probab;
1700 }
1701
1702 if (v>=3)
1703 {
1704 excluded.Reset();
1705 while (!excluded.End())
1706 {
1707 excluded.ReadNext(name);
1708 printf("Excluded fold %s from fitting to Ztq\n",name);
1709 }
1710 }
1711
1712
1713 ////////////////////////////////////////////////////////////////
1714 // Calculate transitive score (query->l) Zt[l]
1715
1716 // Construct vector ll of indices l for which Z_lq > Zmin_trans
1717 m = 0;
1718 for (l=0; l<N; l++)
1719 if (Zq[l]>=Zmin_trans) ll[m++]=l;
1720 M = m; // number of indices l for which Z_lq,Z_lk > Zmin_trans
1721
1722 // for (m=0; m<M; m++)
1723 // fprintf(stderr,"m=%-4i l=%-4i %-10.10s Zq[l]=%7f\n",m,ll[m],fam[ll[m]],Zq[ll[m]]);
1724
1725 if (M<=1)
1726 for (k=0; k<N; k++) Ztq[k]=0.0;
1727 else
1728 {
1729 // Generate submatrix of C for indices l for which Z_lq,Z_lk > Zmin_trans
1730 double** Csub = new(double*[M]);
1731 double** Cinv = new(double*[M]);
1732 for (m=0; m<M; m++)
1733 {
1734 Csub[m] = new(double[M]);
1735 Cinv[m] = new(double[M]);
1736 for (n=0; n<M; n++)
1737 Csub[m][n] = double(C[ll[m]][ll[n]]);
1738 }
1739
1740 if (v>=3)
1741 {
1742 fprintf(stderr,"Covariance matrix\n");
1743 PrintMatrix(Csub,M);
1744 }
1745
1746 // // Invert Csub
1747 // fprintf(stderr,"Calculate inverse of covariance submatrix\n");
1748 // InvertMatrix(Cinv,Csub,M);
1749
1750 // if (v>=3)
1751 // {
1752 // fprintf(stderr,"Inverse covariance matrix\n");
1753 // PrintMatrix(Cinv,M);
1754 // }
1755
1756
1757 // Calculate weights w[l]
1758 for (m=0; m<M; m++)
1759 {
1760 double sum = 0.0;
1761 for (n=0; n<M; n++)
1762 sum += 1.0 * Csub[m][n];
1763 printf("w[%4i] = %-8.5f\n",ll[m],1.0/sum);
1764 w[m] = (sum>0? Zq[ll[m]] / sum : 0.0);
1765 }
1766 for (l=0; l<M; l++){
1767 delete[](Cinv[l]); (Cinv[l]) = NULL;
1768 }
1769 delete[](Cinv); (Cinv) = NULL;
1770
1771 // Calculate Ztq[k] for all HMMs k
1772 fprintf(stderr,"Calculate Ztq vector of transitive Z-scores\n");
1773 float norm = NormalizationFactor(Csub,w,M);
1774 for (k=0; k<N; k++)
1775 {
1776 double sumZ = 0.0;
1777 for (m=0; m<M; m++)
1778 sumZ += w[m] * Z[ll[m]][k];
1779 Ztq[k] = sumZ/norm;
1780 }
1781
1782 for (l=0; l<M; l++){
1783 delete[](Csub[l]); (Csub[l]) = NULL;
1784 }
1785 delete[](Csub); (Csub) = NULL;
1786 }
1787
1788 ////////////////////////////////////////////////////////////////
1789 // Calculate reverse transitive score (l->query-) Zrq[l]
1790
1791 fprintf(stderr,"Calculate Zrq vector of transitive Z-scores\n");
1792 for (k=0; k<N; k++)
1793 {
1794 // Construct vector ll of indices l for which Z_lk > Zmin_tran
1795 m = 0;
1796 for (l=0; l<N; l++)
1797 if (Z[l][k]+Z[k][l]>=2*Zmin_trans) ll[m++]=l;
1798 int M = m; // number of indices l for which Z_lq,Z_lk > Zmin_tran
1799
1800
1801 // fprintf(stderr,"\nfam[k]: %s\n",fam[k]);
1802 // for (m=0; m<M; m++)
1803 // printf(stderr,"m=%-4i k=%-4i l=%-4i %-10.10s Zq[l]=%7f Z_lk=%7f \n",m,k,ll[m],fold[ll[m]],Zq[ll[m]],Z[k][ll[m]]);
1804
1805 if (M<=1)
1806 {
1807 Zrq[k] = Zq[k];
1808 }
1809 else
1810 {
1811 // Generate submatrix of C for indices l for which Z_lq,Z_lk > Zmin_trans
1812 double** Csub = new(double*[M]);
1813 for (m=0; m<M; m++)
1814 {
1815 Csub[m] = new(double[M]);
1816 for (n=0; n<M; n++)
1817 Csub[m][n] = double(C[ll[m]][ll[n]]);
1818 }
1819 // fprintf(stderr,"Covariance matrix\n");
1820 // PrintMatrix(Csub,M);
1821
1822 if (M<=2)
1823 {
1824 for (m=0; m<M; m++) w[m] = 1.0/M;
1825 }
1826 else
1827 {
1828
1829 double** Cinv = new(double*[M]);
1830 for (m=0; m<M; m++) Cinv[m] = new(double[M]);
1831
1832 // // Invert Csub
1833 // InvertMatrix(Cinv,Csub,M);
1834
1835 // // fprintf(stderr,"Inverse covariance matrix\n");
1836 // // PrintMatrix(Cinv,M);
1837
1838 // Calculate weights w[l]
1839 for (m=0; m<M; m++)
1840 {
1841 double sum = 0.0;
1842 for (n=0; n<M; n++)
1843 sum += 1.0 * Csub[m][n];
1844 w[m] = (sum>0? Z[ll[m]][k] / sum : 0.0);
1845 }
1846
1847 // for (m=0; m<M; m++) fprintf(stderr,"w[%i]=%8.2g\n",m,w[m]);
1848
1849 for (l=0; l<M; l++){
1850 delete[](Cinv[l]); (Cinv[l]) = NULL;
1851 }
1852 delete[](Cinv); (Cinv) = NULL;
1853 }
1854
1855 // Calculate Zrq[k] and normalize
1856 float norm = NormalizationFactor(Csub,w,M);
1857 double sumZ = 0.0;
1858 for (m=0; m<M; m++)
1859 sumZ += w[m] * Zq[ll[m]];
1860 Zrq[k] = sumZ/norm;
1861
1862 for (l=0; l<M; l++){
1863 delete[](Csub[l]); (Csub[l]) = NULL;
1864 }
1865 delete[](Csub); (Csub) = NULL;
1866 }
1867
1868 // fprintf(stderr,"\nZq[k]=%8.2g Zq1[k]=%8.2g\n",Zq[k],Zrq[k]);
1869 }
1870
1871 // Total Z-score = weighted sum over original Z-score, forward transitive and reverse transitive Z-score
1872 for (k=0; k<N; k++)
1873 {
1874 float Zqtot = Zq[k] + par.wtrans*(Ztq[k]+Zrq[k]);
1875 // if (isnan(Zqtot))
1876 // {
1877 // fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n");
1878 // printf("%4i %-10.10s Zq=%6.2f Ztq=%6.2f Zrq=%6.2f Zqtot=%6.2f\n",k,fam[k],Zq[k],Ztq[k],Zrq[k],Zqtot);
1879 // par.trans=0;
1880 // return;
1881 // }
1882 if (v>=2 && Zq[k] + Zqtot > 2*Zmin_trans) {
1883 printf("%4i %-10.10s Zq=%6.2f Ztq=%6.2f Zrq=%6.2f -> Zqtot=%6.2f\n",k,fam[k],Zq[k],Ztq[k],Zrq[k],Zqtot);
1884 }
1885 Ztq[k] = Zqtot;
1886 }
1887
1888 // Calculate mean and standard deviation of Z1q
1889 fprintf(stderr,"Calculate mean and standard deviation of Ztq\n");
1890 double sumw=0.0;
1891 double sumZ=0.0;
1892 double sumZ2=0.0;
1893 for (k=0; k<N; k++)
1894 {
1895 if (excluded.Contains(fold[k])) continue;
1896 sumw += weight[k];
1897 sumZ += weight[k]*Ztq[k];
1898 sumZ2 += weight[k]*Ztq[k]*Ztq[k];
1899 // if (isnan(sumZ))
1900 // {
1901 // fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n");
1902 // printf("%4i %-10.10s Zq=%9f Zrq=%9f Ztq=%9f\n",k,fam[k],Zq[k],Zrq[k],Ztq[k]);
1903 // par.trans=0;
1904 // return;
1905 // }
1906 }
1907 float mu = sumZ/sumw;
1908 float sigma = sqrt(sumZ2/sumw-mu*mu);
1909 if (v>=2) printf("mu(Ztq)=%6.3f sigma(Ztq)=%6.2f\n",mu,sigma);
1910 sigma *= 1.01;// correct different fitting of EVD and normal variables
1911
1912 // Normalize Ztq and calculate P1-values
1913 fprintf(stderr,"Normalize Ztq and calculate P1-values\n");
1914 Reset();
1915 while (!End())
1916 {
1917 hit = ReadNext();
1918 hit.logPval = -Z2Score((Ztq[index.Show(hit.name)]-mu)/sigma);
1919 hit.E1val = N_searched*(hit.logPval<-100? 0.0 : exp(hit.logPval));
1920 // P-value = 1- exp(-exp(-lamda*(Saa-mu))) => -lamda*(Saa-mu) = log(-log(1-Pvalue))
1921 hit.score_aass = (hit.logPval<-10.0? hit.logPval : log(-log(1-exp(hit.logPval))) ) / 0.45-3.0 - hit.score_ss;
1922 hit.Probab = Probab(hit);
1923 hit.score_sort = hit.logPval;
1924 Overwrite(hit); // copy hit object into current position of hitlist
1925 }
1926
1927 for (k=0; k<N; k++){
1928 delete[](Z[k]); (Z[k]) = NULL;
1929 }
1930 for (k=0; k<N; k++){
1931 delete[](C[k]); (C[k]) = NULL;
1932 }
1933 for (k=0; k<N; k++){
1934 delete[](fold[k]); (fold[k]) = NULL;
1935 }
1936 for (k=0; k<N; k++){
1937 delete[](fam[k]); (fam[k]) = NULL;
1938 }
1939 delete[](C); (C) = NULL;
1940 delete[](Z); (Z) = NULL;
1941 delete[](fold); (fold) = NULL;
1942 delete[](fam); (fam) = NULL;
1943 delete[](Prob); (Prob) = NULL;
1944 delete[](ll); (ll) = NULL;
1945 delete[](Zq); (Zq) = NULL;
1946 delete[](Ztq); (Ztq) = NULL;
1947 }
1948
1949
1950 /////////////////////////////////////////////////////////////////////////////////////
1951 /**
1952 * @brief Calculate P-values and Probabilities from transitive scoring over whole database
1953 * Like TransitiveScoring(),
1954 * but in transitive scoring, Z1_qk = sum_l w_l*Z_lk, use all l:E_ql<=E_qk
1955 * and in reverse scoring, Z1_kr = sum_l w_l*Z_lq, use all l:E_kl<=E_kq
1956 */
1957 void
1958 HitList::TransitiveScoring3()
1959 {
1960 void PrintMatrix(float** V, int N);
1961 void PrintMatrix(double** V, int N);
1962
1963 float** Z; // matrix of intra-db Z-scores Z_kl
1964 float** C; // covariance matrix for Z_k: C_kl = sum_m=1^N (Z_km * Z_lm)
1965 char** fold; // fold name of HMM k
1966 char** fam; // family of HMM k
1967 float* Prob; // probability of HMM k
1968 float* Zq; // Zq[k] = Z-score between query and database HMM k
1969 float* Ztq; // Ztq[k] = transitive Z-score from query to database HMM k: Ztq[k] = sum_l[ w_ql * Z_lk] / normalization_q
1970 float* Zrq; // Zrq[k] = transitive Z-score from database HMM k to query: Zrq[k] = sum_l[ w_kl * Z_lq] / normalization_k
1971 float* w; // unnormalized weight matrix; w[l] is w_ql or w_kl, respectively
1972 int* ll; // ll[m] is the m'th index l for which Z_lq, Z_lk > Zmin_trans
1973 int N; // dimension of weight matrix is NxN
1974 int M; // number of HMMs l with Z_ql>Ztrans_min (or Z_lk>Ztrans_min, respectively)
1975 int k,l,m,n; // indices for database HMMs
1976 char name[NAMELEN];
1977 Hash<int> index(MAXPROF+7); // index{name} = index of HMM name in {1,...,N}
1978 index.Null(-1); // Set int value to return when no data can be retrieved
1979 Hash<int> excluded(13); // Hash containing names of superfamilies to be excluded from fit
1980 excluded.Null(0); // Set int value to return when no data can be retrieved
1981 Hit hit;
1982 size_t unused; /* disable fread gcc warning */
1983
1984 // Read weights matrix W with index hash and names array
1985 fprintf(stderr,"Reading in weights file\n");
1986 FILE* wfile = fopen(par.wfile,"rb");
1987 if (v>=1 && wfile==NULL)
1988 {
1989 fprintf(stderr,"Error: %s could not be opened: (N_searched=%i) ",par.wfile,N_searched);
1990 perror("fopen");
1991 fprintf(stderr,"Skipping caclulation of transitive P-values\n");
1992 par.trans=0;
1993 return;
1994 }
1995 unused = fread(&N,sizeof(int),1,wfile); // read matrix dimension (i.e. number of HMMs in database)
1996 if (v>=1 && N!=N_searched)
1997 {
1998 fprintf(stderr,"Error: Number %i of HMMs in weight file is different from number %i of HMMs in searched databases. \n",N,N_searched);
1999 fprintf(stderr,"Skipping caclulation of transitive P-values\n");
2000 par.trans=0;
2001 return;
2002 }
2003 if (v>=2) fprintf(stderr,"Calculating transitive P-values for %i HMMs\n",N);
2004 // Read names of HMMs (to specify mapping of HMM to matrix indices)
2005 for (k=0; k<N; k++)
2006 {
2007 unused = fread(name,sizeof(char),IDLEN,wfile);
2008 index.Add(name,k);
2009 }
2010 // Read symmetric Z-scores matrix
2011 Z = new(float*[N]);
2012 for (k=0; k<N; k++)
2013 {
2014 Z[k] = new(float[N]);
2015 for (l=0; l<k; l++) Z[k][l] = Z[l][k];
2016 unused = fread(Z[k]+k,sizeof(float),N-k,wfile);
2017 }
2018 // Read symmetric covariance matrix
2019 C = new(float*[N]);
2020 for (k=0; k<N; k++)
2021 {
2022 C[k] = new(float[N]);
2023 for (l=0; l<k; l++) C[k][l] = C[l][k];
2024 unused = fread(C[k]+k,sizeof(float),N-k,wfile);
2025 }
2026 fclose(wfile);
2027
2028 // Allocate memory
2029 Zq = new(float[N]);
2030 Ztq = new(float[N]);
2031 Zrq = new(float[N]);
2032 fold = new(char*[N]);
2033 fam = new(char*[N]);
2034 Prob = new(float[N]);
2035 ll = new(int[N]);
2036 w = new(float[N]);
2037
2038 // Transform P-values to normally distributed Z-scores and store in Zq vector
2039 fprintf(stderr,"Transform P-values to Z-scores\n");
2040 float Zmax_neg = Score2Z( -log(MINEVALEXCL) + log(N_searched) ); // calculate Z-score corresponding to E-value MINEVALEXCL
2041 float Zmin_trans = Score2Z( -log(par.Emax_trans) + log(N_searched) ); // calculate Z-score corresponding to E-value par.Emax_trans
2042 printf("Zmax = %6.2f Zmin = %6.2f \n",Zmax_neg,Zmin_trans);
2043
2044 Reset();
2045 while (!End())
2046 {
2047 hit = ReadNext();
2048 if (hit.irep>1) continue;
2049 k = index.Show(hit.name);
2050 if (k<0) {fprintf(stderr,"Error: no index found in weights file for domain %s\n",hit.name); exit(1);}
2051 if (hit.logPvalt<0)
2052 Zq[k] = 0.5*Score2Z(fabs(hit.logPval)) + 0.5*Score2Z(fabs(hit.logPvalt)); // Zq[k] = 0.5*(Zkq + Zqk)
2053 else
2054 Zq[k] = Score2Z(fabs(hit.logPval)); // Zq[k] = Zqk
2055 // printf("%4i %-10.10s logPvalt=%9g Zq=%9f\n",k,hit.name,hit.logPvalt,Zq[k]);
2056 // if (isnan(Zq[k]))
2057 // {
2058 // fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n");
2059 // printf("%4i %-10.10s logPval=%9g logPvalt=%9g Zq=%9f\n",k,hit.name,hit.logPval,hit.logPvalt,Zq[k]);
2060 // par.trans=0;
2061 // return;
2062 // }
2063 if (Zq[k]>Zmax_neg) excluded.Add(hit.fold);
2064 fold[k] = new(char[IDLEN]);
2065 fam[k] = new(char[IDLEN]);
2066 strcpy(fold[k],hit.fold);
2067 strcpy(fam[k],hit.fam);
2068 weight[k] = hit.weight;
2069 Prob[k] = hit.Probab;
2070 }
2071
2072 if (v>=3)
2073 {
2074 excluded.Reset();
2075 while (!excluded.End())
2076 {
2077 excluded.ReadNext(name);
2078 printf("Excluded fold %s from fitting to Ztq\n",name);
2079 }
2080 }
2081
2082
2083 ////////////////////////////////////////////////////////////////
2084 // Calculate transitive score (query->l) Ztq[l]
2085
2086 fprintf(stderr,"Calculate Ztq vector of transitive Z-scores\n");
2087 for (k=0; k<N; k++)
2088 {
2089 // Construct vector ll of indices l for which Z_lq OR Z_lk >= max(Z_kq,Zmin_trans)
2090 float Zmink = fmax(Zq[k],Zmin_trans);
2091 for (m=l=0; l<N; l++)
2092 if (Zq[l]>=Zmink) ll[m++]=l;
2093 M = m; // number of indices l for which Z_lq OR Z_lk >= max(Z_kq,Zmin_trans)
2094
2095 // for (m=0; m<M; m++)
2096 // fprintf(stderr,"m=%-4i l=%-4i %-10.10s Zq[l]=%7f\n",m,ll[m],fam[ll[m]],Zq[ll[m]]);
2097
2098 if (M<=1)
2099 {
2100 Ztq[k]=Zq[k];
2101 }
2102 else
2103 {
2104 // Generate submatrix of C for indices l for which Z_lq,Z_lk > Zmin_trans
2105 double** Csub = new(double*[M]);
2106 double** Cinv = new(double*[M]);
2107 for (m=0; m<M; m++)
2108 {
2109 Csub[m] = new(double[M]);
2110 Cinv[m] = new(double[M]);
2111 for (n=0; n<M; n++)
2112 Csub[m][n] = double(C[ll[m]][ll[n]]);
2113 }
2114
2115 // fprintf(stderr,"Covariance matrix\n");
2116 // PrintMatrix(Csub,M);
2117
2118 // Invert Csub
2119 // fprintf(stderr,"Calculate inverse of covariance submatrix\n");
2120 InvertMatrix(Cinv,Csub,M);
2121
2122 // fprintf(stderr,"Inverse covariance matrix\n");
2123 // PrintMatrix(Cinv,M);
2124
2125 // Calculate weights w[l]
2126 for (m=0; m<M; m++)
2127 {
2128 double sum = 0.0;
2129 for (n=0; n<M; n++)
2130 sum += 1.0 * Cinv[m][n]; // signal ~ sum_l w_l*Z_lq !
2131 w[m] = fmax(sum,0.0);
2132 }
2133 for (l=0; l<M; l++){
2134 delete[](Cinv[l]); (Cinv[l]) = NULL;
2135 }
2136 delete[](Cinv); (Cinv) = NULL;
2137
2138 // Calculate Ztq[k]
2139 float norm = NormalizationFactor(Csub,w,M);
2140 double sumZ = 0.0;
2141 for (m=0; m<M; m++)
2142 sumZ += w[m] * fmin(Zq[ll[m]],Z[ll[m]][k]);
2143 // sumZ += w[m] * Z[ll[m]][k];
2144 Ztq[k] = sumZ/norm;
2145
2146 for (l=0; l<M; l++){
2147 delete[](Csub[l]); (Csub[l]) = NULL;
2148 }
2149 delete[](Csub); (Csub) = NULL;
2150 }
2151 }
2152
2153 ////////////////////////////////////////////////////////////////
2154 // Calculate reverse transitive score (l->query-) Zrq[l]
2155
2156 fprintf(stderr,"Calculate Zrq vector of transitive Z-scores\n");
2157 for (k=0; k<N; k++)
2158 {
2159 // Construct vector ll of indices l for which Z_lk > Zmin_tran
2160 float Zmink = fmax(Zq[k],Zmin_trans);
2161 for (m=l=0; l<N; l++)
2162 if (Z[l][k]>=Zmink) ll[m++]=l;
2163 int M = m; // number of indices l for which Z_lq,Z_lk > Zmin_tran
2164
2165
2166 // fprintf(stderr,"\nfam[k]: %s\n",fam[k]);
2167 // for (m=0; m<M; m++)
2168 // printf(stderr,"m=%-4i k=%-4i l=%-4i %-10.10s Zq[l]=%7f Z_lk=%7f \n",m,k,ll[m],fold[ll[m]],Zq[ll[m]],Z[k][ll[m]]);
2169
2170 if (M<=1)
2171 {
2172 Zrq[k] = Zq[k];
2173 }
2174 else
2175 {
2176 // Generate submatrix of C for indices l for which Z_lq,Z_lk > Zmin_trans
2177 double** Csub = new(double*[M]);
2178 for (m=0; m<M; m++)
2179 {
2180 Csub[m] = new(double[M]);
2181 for (n=0; n<M; n++)
2182 Csub[m][n] = double(C[ll[m]][ll[n]]);
2183 }
2184 // fprintf(stderr,"Covariance matrix\n");
2185 // PrintMatrix(Csub,M);
2186
2187 if (M==2)
2188 {
2189 for (m=0; m<M; m++) w[m] = 1.0/M;
2190 }
2191 else
2192 {
2193
2194 double** Cinv = new(double*[M]);
2195 for (m=0; m<M; m++) Cinv[m] = new(double[M]);
2196
2197 // Invert Csub
2198 InvertMatrix(Cinv,Csub,M);
2199
2200 // fprintf(stderr,"Inverse covariance matrix\n");
2201 // PrintMatrix(Cinv,M);
2202
2203 // Calculate weights w[l]
2204 for (m=0; m<M; m++)
2205 {
2206 double sum = 0.0;
2207 for (n=0; n<M; n++)
2208 sum += 1.0 * Cinv[m][n]; // signal ~ sum_l w_l*Z_lq !
2209 w[m] = fmax(sum,0.0);
2210 }
2211 // for (m=0; m<M; m++) fprintf(stderr,"w[%i]=%8.2g\n",m,w[m]);
2212 for (l=0; l<M; l++){
2213 delete[](Cinv[l]); (Cinv[l]) = NULL;
2214 }
2215 delete[](Cinv); (Cinv) = NULL;
2216 }
2217
2218 // Calculate Zrq[k] and normalize
2219 float norm = NormalizationFactor(Csub,w,M);
2220 double sumZ = 0.0;
2221 for (m=0; m<M; m++)
2222 sumZ += w[m] * fmin(Zq[ll[m]],Z[ll[m]][k]);
2223 // sumZ += w[m] * Zq[ll[m]];
2224 Zrq[k] = sumZ/norm;
2225
2226 for (l=0; l<M; l++){
2227 delete[](Csub[l]); (Csub[l]) = NULL;
2228 }
2229 delete[](Csub); (Csub) = NULL;
2230 }
2231
2232 // fprintf(stderr,"\nZq[k]=%8.2g Zq1[k]=%8.2g\n",Zq[k],Zrq[k]);
2233 }
2234
2235 // Total Z-score = weighted sum over original Z-score, forward transitive and reverse transitive Z-score
2236 for (k=0; k<N; k++)
2237 {
2238
2239 float Zqtot = Zq[k] + par.wtrans*(Ztq[k]+Zrq[k]);
2240 // if (isnan(Zqtot))
2241 // {
2242 // fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n");
2243 // printf("%4i %-10.10s Zq=%6.2f Ztq=%6.2f Zrq=%6.2f -> Zqtot=%6.2f\n",k,fam[k],Zq[k],Ztq[k],Zrq[k],Zqtot);
2244 // par.trans=0;
2245 // return;
2246 // }
2247 if (v>=3 && Zqtot > 2*Zmin_trans) {
2248 printf("%4i %-10.10s Zq=%6.2f Ztq=%6.2f Zrq=%6.2f -> Zqtot=%6.2f\n",k,fam[k],Zq[k],Ztq[k],Zrq[k],Zqtot);
2249 }
2250 Ztq[k] = Zqtot;
2251 }
2252
2253 // Calculate mean and standard deviation of Z1q
2254 fprintf(stderr,"Calculate mean and standard deviation of Ztq\n");
2255 double sumw=0.0;
2256 double sumZ=0.0;
2257 double sumZ2=0.0;
2258 for (k=0; k<N; k++)
2259 {
2260 if (excluded.Contains(fold[k])) continue;
2261 sumw += weight[k];
2262 sumZ += weight[k]*Ztq[k];
2263 sumZ2 += weight[k]*Ztq[k]*Ztq[k];
2264 // if (isnan(sumZ))
2265 // {
2266 // fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n");
2267 // printf("%4i %-10.10s Zq=%9f Zrq=%9f Ztq=%9f\n",k,fam[k],Zq[k],Zrq[k],Ztq[k]);
2268 // par.trans=0;
2269 // return;
2270 // }
2271 }
2272 float mu = sumZ/sumw;
2273 float sigma = sqrt(sumZ2/sumw-mu*mu);
2274 if (v>=2) printf("mu(Ztq)=%6.3f sigma(Ztq)=%6.2f\n",mu,sigma);
2275 sigma *= 1.01;// correct different fitting of EVD and normal variables
2276
2277 // Normalize Ztq and calculate P1-values
2278 fprintf(stderr,"Normalize Ztq and calculate P1-values\n");
2279 Reset();
2280 while (!End())
2281 {
2282 hit = ReadNext();
2283 hit.logPval = -Z2Score((Ztq[index.Show(hit.name)]-mu)/sigma);
2284 hit.E1val = N_searched*(hit.logPval<-100? 0.0 : exp(hit.logPval));
2285 // P-value = 1- exp(-exp(-lamda*(Saa-mu))) => -lamda*(Saa-mu) = log(-log(1-Pvalue))
2286 hit.score_aass = (hit.logPval<-10.0? hit.logPval : log(-log(1-exp(hit.logPval))) ) / 0.45-3.0 - hit.score_ss;
2287 hit.Probab = Probab(hit);
2288 hit.score_sort = hit.logPval;
2289 Overwrite(hit); // copy hit object into current position of hitlist
2290 }
2291
2292 for (k=0; k<N; k++){
2293 delete[](Z[k]); (Z[k]) = NULL;
2294 }
2295 for (k=0; k<N; k++){
2296 delete[](C[k]); (C[k]) = NULL;
2297 }
2298 for (k=0; k<N; k++){
2299 delete[](fold[k]); (fold[k]) = NULL;
2300 }
2301 for (k=0; k<N; k++){
2302 delete[](fam[k]); (fam[k]) = NULL;
2303 }
2304 delete[](C); (C) = NULL;
2305 delete[](Z); (Z) = NULL;
2306 delete[](fold); (fold) = NULL;
2307 delete[](fam); (fam) = NULL;
2308 delete[](Prob); (Prob) = NULL;
2309 delete[](ll); (ll) = NULL;
2310 delete[](Zq); (Zq) = NULL;
2311 delete[](Ztq); (Ztq) = NULL;
2312
2313 }
2314
2315
2316 /////////////////////////////////////////////////////////////////////////////////////
2317 /**
2318 * @brief Calculate P-values and Probabilities from transitive scoring over whole database
2319 * Best tested scheme. Use fmin(Zq[ll[m]],Z[ll[m]][k])
2320 * and fast approximation for weights (not inverse covariance matrix)
2321 */
2322 void
2323 HitList::TransitiveScoring4()
2324 {
2325 void PrintMatrix(float** V, int N);
2326 void PrintMatrix(double** V, int N);
2327
2328 float** Z; // matrix of intra-db Z-scores Z_kl
2329 float** C; // covariance matrix for Z_k: C_kl = sum_m=1^N (Z_km * Z_lm)
2330 char** fold; // fold name of HMM k
2331 char** fam; // family of HMM k
2332 float* Prob; // probability of HMM k
2333 float* Zq; // Zq[k] = Z-score between query and database HMM k
2334 float* Ztq; // Ztq[k] = transitive Z-score from query to database HMM k: Ztq[k] = sum_l[ w_ql * Z_lk] / normalization_q
2335 float* Zrq; // Zrq[k] = transitive Z-score from database HMM k to query: Zrq[k] = sum_l[ w_kl * Z_lq] / normalization_k
2336 float* w; // unnormalized weight matrix; w[l] is w_ql or w_kl, respectively
2337 int* ll; // ll[m] is the m'th index l for which Z_lq, Z_lk > Zmin_trans
2338 int N; // dimension of weight matrix is NxN
2339 int M; // number of HMMs l with Z_ql>Ztrans_min (or Z_lk>Ztrans_min, respectively)
2340 int k,l,m,n; // indices for database HMMs
2341 char name[NAMELEN];
2342 Hash<int> index(MAXPROF+7); // index{name} = index of HMM name in {1,...,N}
2343 index.Null(-1); // Set int value to return when no data can be retrieved
2344 Hash<int> excluded(13); // Hash containing names of superfamilies to be excluded from fit
2345 excluded.Null(0); // Set int value to return when no data can be retrieved
2346 Hit hit;
2347 size_t unused; /* disable fread gcc warning */
2348
2349 // Read weights matrix W with index hash and names array
2350 fprintf(stderr,"Reading in weights file\n");
2351 FILE* wfile = fopen(par.wfile,"rb");
2352 if (v>=1 && wfile==NULL)
2353 {
2354 fprintf(stderr,"Error: %s could not be opened: (N_searched=%i) ",par.wfile,N_searched);
2355 perror("fopen");
2356 fprintf(stderr,"Skipping caclulation of transitive P-values\n");
2357 par.trans=0;
2358 return;
2359 }
2360 unused = fread(&N,sizeof(int),1,wfile); // read matrix dimension (i.e. number of HMMs in database)
2361 if (v>=1 && N!=N_searched)
2362 {
2363 fprintf(stderr,"Error: Number %i of HMMs in weight file is different from number %i of HMMs in searched databases. \n",N,N_searched);
2364 fprintf(stderr,"Skipping caclulation of transitive P-values\n");
2365 par.trans=0;
2366 return;
2367 }
2368 if (v>=2) fprintf(stderr,"Calculating transitive P-values for %i HMMs\n",N);
2369 // Read names of HMMs (to specify mapping of HMM to matrix indices)
2370 for (k=0; k<N; k++)
2371 {
2372 unused = fread(name,sizeof(char),IDLEN,wfile);
2373 index.Add(name,k);
2374 }
2375 // Read symmetric Z-scores matrix
2376 Z = new(float*[N]);
2377 for (k=0; k<N; k++)
2378 {
2379 Z[k] = new(float[N]);
2380 for (l=0; l<k; l++) Z[k][l] = Z[l][k];
2381 unused = fread(Z[k]+k,sizeof(float),N-k,wfile);
2382 }
2383 // Read symmetric covariance matrix
2384 C = new(float*[N]);
2385 for (k=0; k<N; k++)
2386 {
2387 C[k] = new(float[N]);
2388 for (l=0; l<k; l++) C[k][l] = C[l][k];
2389 unused = fread(C[k]+k,sizeof(float),N-k,wfile);
2390 }
2391 fclose(wfile);
2392
2393 // Allocate memory
2394 Zq = new(float[N]);
2395 Ztq = new(float[N]);
2396 Zrq = new(float[N]);
2397 fold = new(char*[N]);
2398 fam = new(char*[N]);
2399 Prob = new(float[N]);
2400 ll = new(int[N]);
2401 w = new(float[N]);
2402
2403 // Transform P-values to normally distributed Z-scores and store in Zq vector
2404 fprintf(stderr,"Transform P-values to Z-scores\n");
2405 float Zmax_neg = Score2Z( -log(MINEVALEXCL) + log(N_searched) ); // calculate Z-score corresponding to E-value MINEVALEXCL
2406 float Zmin_trans = Score2Z( -log(par.Emax_trans) + log(N_searched) ); // calculate Z-score corresponding to E-value par.Emax_trans
2407 printf("Zmax = %6.2f Zmin = %6.2f \n",Zmax_neg,Zmin_trans);
2408
2409 Reset();
2410 while (!End())
2411 {
2412 hit = ReadNext();
2413 if (hit.irep>1) continue;
2414 k = index.Show(hit.name);
2415 if (k<0) {fprintf(stderr,"Error: no index found in weights file for domain %s\n",hit.name); exit(1);}
2416 if (hit.logPvalt<0)
2417 Zq[k] = 0.5*Score2Z(fabs(hit.logPval)) + 0.5*Score2Z(fabs(hit.logPvalt)); // Zq[k] = 0.5*(Zkq + Zqk)
2418 else
2419 Zq[k] = Score2Z(fabs(hit.logPval)); // Zq[k] = Zqk
2420 // printf("%4i %-10.10s logPvalt=%9g Zq=%9f\n",k,hit.name,hit.logPvalt,Zq[k]);
2421 // if (isnan(Zq[k])) {
2422 // fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n");
2423 // printf("%4i %-10.10s logPval=%9g logPvalt=%9g Zq=%9f\n",k,hit.name,hit.logPval,hit.logPvalt,Zq[k]);
2424 // par.trans=0;
2425 // return;
2426 // }
2427 if (Zq[k]>Zmax_neg) excluded.Add(hit.fold);
2428 fold[k] = new(char[IDLEN]);
2429 fam[k] = new(char[IDLEN]);
2430 strcpy(fold[k],hit.fold);
2431 strcpy(fam[k],hit.fam);
2432 weight[k] = hit.weight;
2433 Prob[k] = hit.Probab;
2434 }
2435
2436 if (v>=3)
2437 {
2438 excluded.Reset();
2439 while (!excluded.End())
2440 {
2441 excluded.ReadNext(name);
2442 printf("Excluded fold %s from fitting to Ztq\n",name);
2443 }
2444 }
2445
2446 ////////////////////////////////////////////////////////////////
2447 // Calculate transitive score (query->l) Zt[l]
2448
2449 // Construct vector ll of indices l for which Z_lq > Zmin_trans
2450 m = 0;
2451 for (l=0; l<N; l++)
2452 if (Zq[l]>=Zmin_trans) ll[m++]=l;
2453 M = m; // number of indices l for which Z_lq,Z_lk > Zmin_trans
2454
2455 // for (m=0; m<M; m++)
2456 // fprintf(stderr,"m=%-4i l=%-4i %-10.10s Zq[l]=%7f\n",m,ll[m],fam[ll[m]],Zq[ll[m]]);
2457
2458 if (M<=1)
2459 for (k=0; k<N; k++) Ztq[k]=0.0;
2460 else
2461 {
2462 // Generate submatrix of C for indices l for which Z_lq,Z_lk > Zmin_trans
2463 double** Csub = new(double*[M]);
2464 for (m=0; m<M; m++)
2465 {
2466 Csub[m] = new(double[M]);
2467 for (n=0; n<M; n++)
2468 Csub[m][n] = double(C[ll[m]][ll[n]]);
2469 }
2470
2471 if (v>=3)
2472 {
2473 fprintf(stderr,"Covariance matrix\n");
2474 PrintMatrix(Csub,M);
2475 }
2476
2477
2478 // Calculate weights w[l]
2479 for (m=0; m<M; m++)
2480 {
2481 double sum = 0.0;
2482 for (n=0; n<M; n++)
2483 sum += fmax(0.0,Csub[m][n]);
2484 printf("w[%4i] = %-8.5f\n",ll[m],1.0/sum);
2485 w[m] = 1.0/sum;
2486 }
2487
2488 // Calculate Ztq[k] for all HMMs k
2489 fprintf(stderr,"Calculate Ztq vector of transitive Z-scores\n");
2490 float norm = NormalizationFactor(Csub,w,M);
2491 for (k=0; k<N; k++)
2492 {
2493 double sumZ = 0.0;
2494 for (m=0; m<M; m++)
2495 sumZ += w[m] * fmin(Zq[ll[m]],Z[ll[m]][k]);
2496 Ztq[k] = sumZ/norm;
2497 }
2498
2499 for (l=0; l<M; l++){
2500 delete[](Csub[l]); (Csub[l]) = NULL;
2501 }
2502 delete[](Csub); (Csub) = NULL;
2503 }
2504
2505 ////////////////////////////////////////////////////////////////
2506 // Calculate reverse transitive score (l->query-) Zrq[l]
2507
2508 fprintf(stderr,"Calculate Zrq vector of transitive Z-scores\n");
2509 for (k=0; k<N; k++)
2510 {
2511 // Construct vector ll of indices l for which Z_lk > Zmin_tran
2512 m = 0;
2513 for (l=0; l<N; l++)
2514 if (Z[k][l]>=Zmin_trans) ll[m++]=l;
2515 int M = m; // number of indices l for which Z_lq,Z_lk > Zmin_tran
2516
2517
2518 // fprintf(stderr,"\nfam[k]: %s\n",fam[k]);
2519 // for (m=0; m<M; m++)
2520 // printf(stderr,"m=%-4i k=%-4i l=%-4i %-10.10s Zq[l]=%7f Z_lk=%7f \n",m,k,ll[m],fold[ll[m]],Zq[ll[m]],Z[k][ll[m]]);
2521
2522 if (M<=1)
2523 {
2524 Zrq[k] = Zq[k];
2525 }
2526 else
2527 {
2528 // Generate submatrix of C for indices l for which Z_lq,Z_lk > Zmin_trans
2529 double** Csub = new(double*[M]);
2530 for (m=0; m<M; m++)
2531 {
2532 Csub[m] = new(double[M]);
2533 for (n=0; n<M; n++)
2534 Csub[m][n] = double(C[ll[m]][ll[n]]);
2535 }
2536 // fprintf(stderr,"Covariance matrix\n");
2537 // PrintMatrix(Csub,M);
2538
2539 // Calculate weights w[l]
2540 for (m=0; m<M; m++)
2541 {
2542 double sum = 0.0;
2543 for (n=0; n<M; n++)
2544 sum += fmax(0.0,Csub[m][n]);
2545 w[m] = 1.0/sum;
2546 }
2547
2548 // for (m=0; m<M; m++) fprintf(stderr,"w[%i]=%8.2g\n",m,w[m]);
2549
2550
2551 // Calculate Zrq[k] and normalize
2552 float norm = NormalizationFactor(Csub,w,M);
2553 double sumZ = 0.0;
2554 for (m=0; m<M; m++)
2555 sumZ += w[m] * fmin(Zq[ll[m]],Z[ll[m]][k]);
2556 Zrq[k] = sumZ/norm;
2557
2558 for (l=0; l<M; l++){
2559 delete[](Csub[l]); (Csub[l]) = NULL;
2560 }
2561 delete[](Csub); (Csub) = NULL;
2562 }
2563
2564 // fprintf(stderr,"\nZq[k]=%8.2g Zq1[k]=%8.2g\n",Zq[k],Zrq[k]);
2565 }
2566
2567 // Total Z-score = weighted sum over original Z-score, forward transitive and reverse transitive Z-score
2568 for (k=0; k<N; k++)
2569 {
2570 float Zqtot = Zq[k] + par.wtrans*(Ztq[k]+Zrq[k]);
2571 // if (isnan(Zqtot))
2572 // {
2573 // fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n");
2574 // printf("%4i %-10.10s Zq=%6.2f Ztq=%6.2f Zrq=%6.2f Zqtot=%6.2f\n",k,fam[k],Zq[k],Ztq[k],Zrq[k],Zqtot);
2575 // par.trans=0;
2576 // return;
2577 // }
2578 if (v>=3 && Zq[k] + Zqtot > 2*Zmin_trans) {
2579 printf("%4i %-10.10s Zq=%6.2f Ztq=%6.2f Zrq=%6.2f -> Zqtot=%6.2f\n",k,fam[k],Zq[k],Ztq[k],Zrq[k],Zqtot);
2580 }
2581 Ztq[k] = Zqtot;
2582 }
2583
2584 // Calculate mean and standard deviation of Z1q
2585 fprintf(stderr,"Calculate mean and standard deviation of Ztq\n");
2586 double sumw=0.0;
2587 double sumZ=0.0;
2588 double sumZ2=0.0;
2589 for (k=0; k<N; k++)
2590 {
2591 if (excluded.Contains(fold[k])) continue;
2592 sumw += weight[k];
2593 sumZ += weight[k]*Ztq[k];
2594 sumZ2 += weight[k]*Ztq[k]*Ztq[k];
2595 // if (isnan(sumZ))
2596 // {
2597 // fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n");
2598 // printf("%4i %-10.10s Zq=%9f Zrq=%9f Ztq=%9f\n",k,fam[k],Zq[k],Zrq[k],Ztq[k]);
2599 // par.trans=0;
2600 // return;
2601 // }
2602 }
2603 float mu = sumZ/sumw;
2604 float sigma = sqrt(sumZ2/sumw-mu*mu);
2605 if (v>=2) printf("mu(Ztq)=%6.3f sigma(Ztq)=%6.2f\n",mu,sigma);
2606 sigma *= 1.01;// correct different fitting of EVD and normal variables
2607
2608 // Normalize Ztq and calculate P1-values
2609 fprintf(stderr,"Normalize Ztq and calculate P1-values\n");
2610 Reset();
2611 while (!End())
2612 {
2613 hit = ReadNext();
2614 hit.logPval = -Z2Score((Ztq[index.Show(hit.name)]-mu)/sigma);
2615 hit.E1val = N_searched*(hit.logPval<-100? 0.0 : exp(hit.logPval));
2616 // P-value = 1- exp(-exp(-lamda*(Saa-mu))) => -lamda*(Saa-mu) = log(-log(1-Pvalue))
2617 hit.score_aass = (hit.logPval<-10.0? hit.logPval : log(-log(1-exp(hit.logPval))) ) / 0.45-3.0 - hit.score_ss;
2618 hit.Probab = Probab(hit);
2619 hit.score_sort = hit.logPval;
2620 Overwrite(hit); // copy hit object into current position of hitlist
2621 }
2622
2623 for (k=0; k<N; k++){
2624 delete[](Z[k]); (Z[k]) = NULL;
2625 }
2626 for (k=0; k<N; k++){
2627 delete[](C[k]); (C[k]) = NULL;
2628 }
2629 for (k=0; k<N; k++){
2630 delete[](fold[k]); (fold[k]) = NULL;
2631 }
2632 for (k=0; k<N; k++){
2633 delete[](fam[k]); (fam[k]) = NULL;
2634 }
2635 delete[](C); (C) = NULL;
2636 delete[](Z); (Z) = NULL;
2637 delete[](fold); (fold) = NULL;
2638 delete[](fam); (fam) = NULL;
2639 delete[](Prob); (Prob) = NULL;
2640 delete[](ll); (ll) = NULL;
2641 delete[](Zq); (Zq) = NULL;
2642 delete[](Ztq); (Ztq) = NULL;
2643 }
2644
2645
2646 /////////////////////////////////////////////////////////////////////////////////////
2647 /**
2648 * @brief Score2Z transforms the -log(P-value) score into a Z-score for 0 < S
2649 * Score2Z(S) = sqrt(2)*dierfc(2*e^(-S)), where dierfc is the inverse of the complementary error function
2650 */
2651 double
2652 HitList::Score2Z(double S)
2653 {
2654 double s, t, u, w, x, y, z;
2655 if (S<=0) return double(-100000);
2656 y = ( S>200 ? 0.0 : 2.0*exp(-S) );
2657 if (y > 1)
2658 {
2659 z = (S<1e-6? 2*S : 2-y);
2660 w = 0.916461398268964 - log(z);
2661 }
2662 else
2663 {
2664 z = y;
2665 w = 0.916461398268964 - (0.69314718056-S);
2666 }
2667
2668 u = sqrt(w);
2669 s = (log(u) + 0.488826640273108) / w;
2670 t = 1 / (u + 0.231729200323405);
2671
2672 x = u * (1 - s * (s * 0.124610454613712 + 0.5)) -
2673 ((((-0.0728846765585675 * t + 0.269999308670029) * t +
2674 0.150689047360223) * t + 0.116065025341614) * t +
2675 0.499999303439796) * t;
2676 t = 3.97886080735226 / (x + 3.97886080735226);
2677 u = t - 0.5;
2678 s = (((((((((0.00112648096188977922 * u +
2679 1.05739299623423047e-4) * u - 0.00351287146129100025) * u -
2680 7.71708358954120939e-4) * u + 0.00685649426074558612) * u +
2681 0.00339721910367775861) * u - 0.011274916933250487) * u -
2682 0.0118598117047771104) * u + 0.0142961988697898018) * u +
2683 0.0346494207789099922) * u + 0.00220995927012179067;
2684 s = ((((((((((((s * u - 0.0743424357241784861) * u -
2685 0.105872177941595488) * u + 0.0147297938331485121) * u +
2686 0.316847638520135944) * u + 0.713657635868730364) * u +
2687 1.05375024970847138) * u + 1.21448730779995237) * u +
2688 1.16374581931560831) * u + 0.956464974744799006) * u +
2689 0.686265948274097816) * u + 0.434397492331430115) * u +
2690 0.244044510593190935) * t -
2691 (z==0? 0: z * exp(x * x - 0.120782237635245222));
2692 x += s * (x * s + 1);
2693 if (y > 1) {
2694 x = -x;
2695 }
2696 return double (1.41421356237*x);
2697 }
2698
2699 /////////////////////////////////////////////////////////////////////////////////////////////////////////
2700 /**
2701 * @brief Z2Score transforms the Z-score into a -log(P-value) value
2702 * Z2Score(Z) = log(2) - log( erfc(Z/sqrt(2)) ) , where derfc is the complementary error function
2703 */
2704 double
2705 HitList::Z2Score(double Z)
2706 {
2707 double t, u, x, y;
2708 x = 0.707106781188*Z;
2709 if (x>10) return 0.69314718056 - (-x*x - log( (1-0.5/x/x)/x/1.772453851) );
2710 t = 3.97886080735226 / (fabs(x) + 3.97886080735226);
2711 u = t - 0.5;
2712 y = (((((((((0.00127109764952614092 * u + 1.19314022838340944e-4) * u -
2713 0.003963850973605135) * u - 8.70779635317295828e-4) * u +
2714 0.00773672528313526668) * u + 0.00383335126264887303) * u -
2715 0.0127223813782122755) * u - 0.0133823644533460069) * u +
2716 0.0161315329733252248) * u + 0.0390976845588484035) * u +
2717 0.00249367200053503304;
2718 y = ((((((((((((y * u - 0.0838864557023001992) * u -
2719 0.119463959964325415) * u + 0.0166207924969367356) * u +
2720 0.357524274449531043) * u + 0.805276408752910567) * u +
2721 1.18902982909273333) * u + 1.37040217682338167) * u +
2722 1.31314653831023098) * u + 1.07925515155856677) * u +
2723 0.774368199119538609) * u + 0.490165080585318424) * u +
2724 0.275374741597376782) * t * (x>10? 0.0 : exp(-x * x));
2725 return 0.69314718056 - log( x < 0 ? 2 - y : y );
2726 }
2727
2728
2729 /////////////////////////////////////////////////////////////////////////////////////////////////////////
2730 /**
2731 * @brief
2732 */
2733 void
2734 PrintMatrix(float** V, int N)
2735 {
2736 int k,l;
2737 for (k=0; k<N; k++)
2738 {
2739 fprintf(stderr,"k=%4i \n",k);
2740 for (l=0; l<N; l++)
2741 {
2742 fprintf(stderr,"%4i:%6.3f ",l,V[k][l]);
2743 if ((l+1)%10==0) fprintf(stderr,"\n");
2744 }
2745 fprintf(stderr,"\n");
2746 }
2747 fprintf(stderr,"\n");
2748 }
2749
2750 /////////////////////////////////////////////////////////////////////////////////////////////////////////
2751 /**
2752 * @brief
2753 */
2754 void
2755 PrintMatrix(double** V, int N)
2756 {
2757 int k,l;
2758 for (k=0; k<N; k++)
2759 {
2760 fprintf(stderr,"k=%4i \n",k);
2761 for (l=0; l<N; l++)
2762 {
2763 fprintf(stderr,"%4i:%6.3f ",l,V[k][l]);
2764 if ((l+1)%10==0) fprintf(stderr,"\n");
2765 }
2766 fprintf(stderr,"\n");
2767 }
2768 fprintf(stderr,"\n");
2769 }
2770
2771 /////////////////////////////////////////////////////////////////////////////////////////////////////////
2772 /**
2773 * @brief
2774 */
2775 void
2776 HitList::Normalize(float* Ztq, char** fold, Hash<int>& excluded)
2777 {
2778 double sumw=0.0;
2779 double sumZ=0.0;
2780 double sumZ2=0.0;
2781 for (int k=0; k<N_searched; k++)
2782 {
2783 if (excluded.Contains(fold[k])) continue;
2784 sumw += weight[k];
2785 sumZ += weight[k]*Ztq[k];
2786 sumZ2 += weight[k]*Ztq[k]*Ztq[k];
2787 }
2788 float mu = sumZ/sumw;
2789 float sigma = sqrt(sumZ2/sumw-mu*mu);
2790 printf("Transitive score Ztq: mu=%8.3g sigma=%8.3g\n",mu,sigma);
2791 for (int k=0; k<N_searched; k++) Ztq[k] = (Ztq[k]-mu)/sigma;
2792 return;
2793 }
2794
2795 /////////////////////////////////////////////////////////////////////////////////////////////////////////
2796 /**
2797 * @brief Calculate standard deviation of Z1 = sum_m [ w_m * Z_m ], where Csub_mn = cov(Z_m,Z_n)
2798 */
2799 float
2800 HitList::NormalizationFactor(double** Csub, float* w, int M)
2801 {
2802 double sum=0.0;
2803 for (int m=0; m<M; m++)
2804 {
2805 double summ=0.0;
2806 for (int n=0; n<M; n++) summ += Csub[m][n]*w[n];
2807 sum += w[m]*summ;
2808 }
2809 return sqrt(sum);
2810 }
2811
2812 /////////////////////////////////////////////////////////////////////////////////////////////////////////
2813 /**
2814 * @brief Calculate inverse of matrix A and store result in B
2815 */
2816 void
2817 HitList::InvertMatrix(double** B, double** A, int N)
2818 {
2819 if (N==0)
2820 {
2821 printf("Error: InvertMatrix called with matrix of dimension 0\n");
2822 exit(6);
2823 }
2824 if (N==1)
2825 {
2826 B[0][0] = (A[0][0]==0.0? 0 :1.0/A[0][0]);
2827 return;
2828 }
2829
2830 int k,l,m;
2831 double** V = new(double*[N]);
2832 double* s = new(double[N]);
2833 for (k=0; k<N; k++) V[k] = new(double[N]);
2834
2835 // Copy original matrix A into B since B will be overwritten by SVD()
2836 for (k=0; k<N; k++)
2837 for (l=0; l<N; l++)
2838 B[k][l] = A[k][l];
2839
2840 SVD(B, N, s, V); // U replaces B on output; s[] contains singluar values
2841
2842 // Calculate inverse of A: A^-1 = V * diag(1/s) * U^t
2843 double** U = B;
2844 // Calculate V[k][m] -> V[k][m] *diag(1/s)
2845 for (k=0; k<N; k++)
2846 for (m=0; m<N; m++)
2847 if (s[m]!=0.0) V[k][m] /= s[m]; else V[k][m] = 0.0;
2848 // Calculate V[k][l] -> (V * U^t)_kl
2849 for (k=0; k<N; k++)
2850 {
2851 if (v>=4 && k%100==0) printf("%i\n",k);
2852 for (l=0; l<N; l++)
2853 {
2854 s[l] = 0.0; // use s[] as temporary memory to avoid overwriting B[k][] as long as it is needed
2855 for (m=0; m<N; m++)
2856 s[l] += V[k][m]*U[l][m];
2857 }
2858 for (l=0; l<N; l++) V[k][l]=s[l];
2859 }
2860 for (k=0; k<N; k++)
2861 for (l=0; l<N; l++)
2862 B[k][l] = V[k][l];
2863
2864 for (k=0; k<N; k++){
2865 delete[](V[k]); (V[k]) = NULL;
2866 }
2867 delete[](V); (V) = NULL;
2868 return;
2869 }
2870
2871
2872 /////////////////////////////////////////////////////////////////////////////////////////////////////////
2873 /**
2874 * @brief
2875 */
2876 void
2877 HitList::TransposeMatrix(double** V, int N)
2878 {
2879 int k,l;
2880 for (k=0; k<N; k++) // transpose Z for efficiency of ensuing matrix multiplication
2881 for (l=0; l<k; l++)
2882 {
2883 double buf = V[k][l];
2884 V[k][l] = V[l][k];
2885 V[l][k] = buf;
2886 }
2887 }
2888
2889 /////////////////////////////////////////////////////////////////////////////////////////////////////////
2890 static double sqrarg;
2891 #define SQR(a) ((sqrarg=(a)) == 0.0 ? 0.0 : sqrarg*sqrarg)
2892 static double maxarg1,maxarg2;
2893 #define FMAX(a,b) (maxarg1=(a),maxarg2=(b),(maxarg1) > (maxarg2) ? (maxarg1) : (maxarg2))
2894 static int iminarg1,iminarg2;
2895 #define IMIN(a,b) (iminarg1=(a),iminarg2=(b),(iminarg1) < (iminarg2) ? (iminarg1) : (iminarg2))
2896 #define SIGN(a,b) ((b) >= 0.0 ? fabs(a) : -fabs(a))
2897
2898 /**
2899 * @brief This is a version of the Golub and Reinsch algorithm for singular value decomposition for a quadratic
2900 * (n x n) matrix A. It is sped up by transposing A amd V matrices at various places in the algorithm.
2901 * On a 400x400 matrix it runs in 1.6 s or 2.3 times faster than the original (n x m) version.
2902 * On a 4993x4993 matrix it runs in 2h03 or 4.5 times faster than the original (n x m) version.
2903 *
2904 * Given a matrix a[0..n-1][0..n-1], this routine computes its singular value decomposition, A = U · W · V^t .
2905 * The matrix U replaces a on output. The diagonal matrix of singular values W is out-put as a vector w[0..n-1].
2906 * The matrix V (not the transpose V^t) is output as V[0..n-1][0..n-1] ./
2907 */
2908 void
2909 HitList::SVD(double **A, int n, double w[], double **V)
2910 {
2911 int m=n; // in general algorithm A is an (m x n) matrix instead of (n x n)
2912
2913 double pythag(double a, double b);
2914 int flag,i,its,j,jj,k,l=1,nm=1;
2915 double anorm,c,f,g,h,s,scale,x,y,z,*rv1;
2916 rv1=new(double[n]);
2917 g=scale=anorm=0.0;
2918
2919 // Householder reduction to bidiagonal form.
2920 if (v>=5) printf("\nHouseholder reduction to bidiagonal form\n");
2921 for (i=0;i<n;i++) {
2922 if (v>=4 && i%100==0) printf("i=%i\n",i);
2923 if (v>=4) fprintf(stderr,".");
2924 l=i+1;
2925 rv1[i]=scale*g;
2926 g=s=scale=0.0;
2927 if (i < m) {
2928 for (k=i;k<m;k++) scale += fabs(A[k][i]);
2929 if (scale) {
2930 for (k=i;k<m;k++) {
2931 A[k][i] /= scale;
2932 s += A[k][i]*A[k][i];
2933 }
2934 f=A[i][i];
2935 g = -SIGN(sqrt(s),f);
2936 h=f*g-s;
2937 A[i][i]=f-g;
2938 for (j=l;j<n;j++) {
2939 for (s=0.0,k=i;k<m;k++) s += A[k][i]*A[k][j];
2940 f=s/h;
2941 for (k=i;k<m;k++) A[k][j] += f*A[k][i];
2942 }
2943 for (k=i;k<m;k++) A[k][i] *= scale;
2944 }
2945 }
2946 w[i]=scale *g;
2947 g=s=scale=0.0;
2948 if (i < m && i != n-1) {
2949 for (k=l;k<n;k++) scale += fabs(A[i][k]);
2950 if (scale) {
2951 for (k=l;k<n;k++) {
2952 A[i][k] /= scale;
2953 s += A[i][k]*A[i][k];
2954 }
2955 f=A[i][l];
2956 g = -SIGN(sqrt(s),f);
2957 h=f*g-s;
2958 A[i][l]=f-g;
2959 for (k=l;k<n;k++) rv1[k]=A[i][k]/h;
2960 for (j=l;j<m;j++) {
2961 for (s=0.0,k=l;k<n;k++) s += A[j][k]*A[i][k];
2962 for (k=l;k<n;k++) A[j][k] += s*rv1[k];
2963 }
2964 for (k=l;k<n;k++) A[i][k] *= scale;
2965 }
2966 }
2967 anorm=FMAX(anorm,(fabs(w[i])+fabs(rv1[i])));
2968 }
2969 // Accumulation of right-hand transformations.
2970 if (v>=5) printf("\nAccumulation of right-hand transformations\n");
2971 TransposeMatrix(V,n);
2972 for (i=n-1;i>=0;i--) {
2973 if (v>=4 && i%100==0) printf("i=%i\n",i);
2974 if (v>=4) fprintf(stderr,".");
2975 if (i < n-1) {
2976 if (g) {
2977 // Double division to avoid possible underflow.
2978 for (j=l;j<n;j++)
2979 V[i][j]=(A[i][j]/A[i][l])/g;
2980 for (j=l;j<n;j++) {
2981 for (s=0.0,k=l;k<n;k++) s += A[i][k]*V[j][k];
2982 for (k=l;k<n;k++) V[j][k] += s*V[i][k];
2983 }
2984 }
2985 for (j=l;j<n;j++) V[j][i]=V[i][j]=0.0;
2986 }
2987 V[i][i]=1.0;
2988 g=rv1[i];
2989 l=i;
2990 }
2991 // Accumulation of left-hand transformations.
2992 if (v>=5) printf("\nAccumulation of left-hand transformations\n");
2993 TransposeMatrix(A,n);
2994 for (i=IMIN(m,n)-1;i>=0;i--) {
2995 if (v>=4 && i%100==0) printf("i=%i\n",i);
2996 if (v>=4) fprintf(stderr,".");
2997 l=i+1;
2998 g=w[i];
2999 for (j=l;j<n;j++) A[j][i]=0.0;
3000 if (g) {
3001 g=1.0/g;
3002 for (j=l;j<n;j++) {
3003 for (s=0.0,k=l;k<m;k++) s += A[i][k]*A[j][k];
3004 f=(s/A[i][i])*g;
3005 for (k=i;k<m;k++) A[j][k] += f*A[i][k];
3006 }
3007 for (j=i;j<m;j++) A[i][j] *= g;
3008 } else for (j=i;j<m;j++) A[i][j]=0.0;
3009 ++A[i][i];
3010 }
3011
3012 // Diagonalization of the bidiagonal form: Loop over singular values, and over allowed iterations.
3013 if (v>=5) printf("\nDiagonalization of the bidiagonal form\n");
3014 for (k=n-1;k>=0;k--) {
3015 if (v>=4 && k%100==0) printf("k=%i\n",k);
3016 if (v>=4) fprintf(stderr,".");
3017 for (its=1;its<=30;its++) {
3018 flag=1;
3019 // Test for splitting. Note that rv1[1] is always zero.
3020 for (l=k;l>=0;l--) {
3021 nm=l-1;
3022 if ((double)(fabs(rv1[l])+anorm) == anorm) {
3023 flag=0;
3024 break;
3025 }
3026 if ((double)(fabs(w[nm])+anorm) == anorm) break;
3027 }
3028 if (flag) {
3029 // Cancellation of rv1[l], if l > 1.
3030 c=0.0;
3031 s=1.0;
3032 for (i=l;i<=k;i++) {
3033 f=s*rv1[i];
3034 rv1[i]=c*rv1[i];
3035 if ((double)(fabs(f)+anorm) == anorm) break;
3036 g=w[i];
3037 h=pythag(f,g);
3038 w[i]=h;
3039 h=1.0/h;
3040 c=g*h;
3041 s = -f*h;
3042 for (j=0;j<m;j++) {
3043 y=A[nm][j];
3044 z=A[i][j];
3045 A[nm][j]=y*c+z*s;
3046 A[i][j]=z*c-y*s;
3047 }
3048 }
3049 }
3050 z=w[k];
3051 // Convergence.
3052 if (l == k) {
3053 // Singular value is made nonnegative.
3054 if (z < 0.0) {
3055 w[k] = -z;
3056 for (j=0;j<n;j++) V[k][j] = -V[k][j];
3057 }
3058 break;
3059 }
3060 if (its == 30) {printf("Error in SVD: no convergence in 30 iterations\n"); exit(7);}
3061 // Shift from bottom 2-by-2 minor.
3062 x=w[l];
3063 nm=k-1;
3064 y=w[nm];
3065 g=rv1[nm];
3066 h=rv1[k];
3067 f=((y-z)*(y+z)+(g-h)*(g+h))/(2.0*h*y);
3068 g=pythag(f,1.0);
3069 f=((x-z)*(x+z)+h*((y/(f+SIGN(g,f)))-h))/x;
3070 // Next QR transformation:
3071 c=s=1.0;
3072 for (j=l;j<=nm;j++) {
3073 i=j+1;
3074 g=rv1[i];
3075 y=w[i];
3076 h=s*g;
3077 g=c*g;
3078 z=pythag(f,h);
3079 rv1[j]=z;
3080 c=f/z;
3081 s=h/z;
3082 f=x*c+g*s;
3083 g = g*c-x*s;
3084 h=y*s;
3085 y *= c;
3086 for (jj=0;jj<n;jj++) {
3087 x=V[j][jj];
3088 z=V[i][jj];
3089 V[j][jj]=x*c+z*s;
3090 V[i][jj]=z*c-x*s;
3091 }
3092 z=pythag(f,h);
3093 // Rotation can be arbitrary if z = 0.
3094 w[j]=z;
3095 if (z) {
3096 z=1.0/z;
3097 c=f*z;
3098 s=h*z;
3099 }
3100 f=c*g+s*y;
3101 x=c*y-s*g;
3102
3103 for (jj=0;jj<m;jj++) {
3104 y=A[j][jj];
3105 z=A[i][jj];
3106 A[j][jj]=y*c+z*s;
3107 A[i][jj]=z*c-y*s;
3108 }
3109 }
3110 rv1[l]=0.0;
3111 rv1[k]=f;
3112 w[k]=x;
3113 }
3114 }
3115 TransposeMatrix(V,n);
3116 TransposeMatrix(A,n);
3117 delete[](rv1); (rv1) = NULL;
3118 }
3119
3120 /**
3121 * @brief Computes (a2 + b2 )^1/2 without destructive underflow or overflow.
3122 */
3123 double
3124 pythag(double a, double b)
3125 {
3126 double absa,absb;
3127 absa=fabs(a);
3128 absb=fabs(b);
3129 if (absa > absb)
3130 return absa*sqrt(1.0+SQR(absb/absa));
3131 else
3132 return (absb == 0.0 ? 0.0 : absb*sqrt(1.0+SQR(absa/absb)));
3133 }
3134
3135
3136 /* @* HitList::ClobberGlobal(void)
3137 */
3138 void
3139 HitList::ClobberGlobal(void){
3140
3141
3142 /* @<variables local to HitList::ClobberGlobal@> */
3143 class List<Hit>::ListEl<Hit> *pvIter = head;
3144
3145 /* NOTE: no free/delete-ing of data to be done here
3146 hitlist only holds a shallow copy of hit;
3147 hit is being cleared off properly.
3148 just reset everything to 0/0.0/NULL.
3149 The only important thing to do at this stage
3150 is to attach head and tail and set size = 0
3151 (FS, 2010-02-18)
3152
3153 NOTE: I only ever saw 1 (one) in-between element,
3154 but there may ctually be a real linked list
3155 of more than 1 element (FS, 2010-02-18)
3156 */
3157
3158 // printf("POINTER:\t%p\t=HEAD\n", head);
3159 while (pvIter->next != tail){
3160
3161 // printf("POINTER:\t%p->\t%p\n", pvIter, pvIter->next);
3162 pvIter = pvIter->next;
3163
3164 #if 1
3165 pvIter->data.longname = pvIter->data.name =
3166 pvIter->data.file = pvIter->data.dbfile = NULL;
3167 pvIter->data.sname = NULL;
3168 pvIter->data.seq = NULL;
3169 pvIter->data.self = 0;
3170 pvIter->data.i = pvIter->data.j = NULL;
3171 pvIter->data.states = NULL;
3172 pvIter->data.S = pvIter->data.S_ss = pvIter->data.P_posterior = NULL;
3173 pvIter->data.Xcons = NULL;
3174 pvIter->data.sum_of_probs = 0.0;
3175 pvIter->data.Neff_HMM = 0.0;
3176 pvIter->data.score_ss = pvIter->data.Pval = pvIter->data.logPval =
3177 pvIter->data.Eval = pvIter->data.Probab = pvIter->data.Pforward = 0.0;
3178 pvIter->data.nss_conf = pvIter->data.nfirst =
3179 pvIter->data.i1 = pvIter->data.i2 = pvIter->data.j1 = pvIter->data.j2 =
3180 pvIter->data.matched_cols = pvIter->data.ssm1 = pvIter->data.ssm2 = 0;
3181 #endif
3182 }
3183 // printf("POINTER:\t\t\t%p=TAIL\n", tail);
3184
3185
3186 head->next = tail;
3187 tail->prev = head;
3188 size = 0;
3189
3190 /* @= */
3191 return;
3192
3193 } /* this is the end of HitList::ClobberGlobal() */
3194
3195
3196 /*
3197 * EOF hhhitlist-C.h
3198 */