Mercurial > repos > clustalomega > clustalomega
comparison clustalomega/clustal-omega-0.2.0/src/hhalign/hhhitlist-C.h @ 0:ff1768533a07
Migrated tool version 0.2 from old tool shed archive to new tool shed repository
author | clustalomega |
---|---|
date | Tue, 07 Jun 2011 17:04:25 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:ff1768533a07 |
---|---|
1 /* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */ | |
2 | |
3 /********************************************************************* | |
4 * Clustal Omega - Multiple sequence alignment | |
5 * | |
6 * Copyright (C) 2010 University College Dublin | |
7 * | |
8 * Clustal-Omega is free software; you can redistribute it and/or | |
9 * modify it under the terms of the GNU General Public License as | |
10 * published by the Free Software Foundation; either version 2 of the | |
11 * License, or (at your option) any later version. | |
12 * | |
13 * This file is part of Clustal-Omega. | |
14 * | |
15 ********************************************************************/ | |
16 | |
17 /* | |
18 * RCS $Id: hhhitlist-C.h 199 2011-02-21 18:24:49Z fabian $ | |
19 */ | |
20 | |
21 // hhhitlist.C | |
22 | |
23 #ifndef MAIN | |
24 #define MAIN | |
25 #include <iostream> // cin, cout, cerr | |
26 #include <fstream> // ofstream, ifstream | |
27 #include <stdio.h> // printf | |
28 #include <stdlib.h> // exit | |
29 #include <string> // strcmp, strstr | |
30 #include <math.h> // sqrt, pow | |
31 #include <limits.h> // INT_MIN | |
32 #include <float.h> // FLT_MIN | |
33 #include <time.h> // clock | |
34 #include <ctype.h> // islower, isdigit etc | |
35 using std::ios; | |
36 using std::ifstream; | |
37 using std::ofstream; | |
38 using std::cout; | |
39 using std::cerr; | |
40 using std::endl; | |
41 #include "util-C.h" // imax, fmax, iround, iceil, ifloor, strint, strscn, strcut, substr, uprstr, uprchr, Basename etc. | |
42 #include "list.h" // list data structure | |
43 #include "hash.h" // hash data structure | |
44 #include "hhdecl-C.h" // constants, class | |
45 #include "hhutil-C.h" // imax, fmax, iround, iceil, ifloor, strint, strscn, strcut, substr, uprstr, uprchr, Basename etc. | |
46 #include "hhhmm.h" // class HMM | |
47 #include "hhalignment.h" // class Alignment | |
48 #include "hhhit.h" | |
49 #include "hhhalfalignment.h" | |
50 #include "hhfullalignment.h" | |
51 #endif | |
52 | |
53 | |
54 ////////////////////////////////////////////////////////////////////////////// | |
55 ////////////////////////////////////////////////////////////////////////////// | |
56 //// Methods of class HitList | |
57 ////////////////////////////////////////////////////////////////////////////// | |
58 ////////////////////////////////////////////////////////////////////////////// | |
59 | |
60 | |
61 | |
62 ////////////////////////////////////////////////////////////////////////////// | |
63 /** | |
64 * @brief Print summary listing of hits | |
65 */ | |
66 void | |
67 HitList::PrintHitList(HMM& q, char* outfile) | |
68 { | |
69 Hit hit; | |
70 int nhits=0; | |
71 char str[NAMELEN]=""; | |
72 | |
73 FILE* outf=NULL; | |
74 if (strcmp(outfile,"stdout")) | |
75 { | |
76 outf=fopen(outfile,"w"); | |
77 if (!outf) OpenFileError(outfile); | |
78 } | |
79 else | |
80 outf = stdout; | |
81 | |
82 | |
83 fprintf(outf,"Query %s\n",q.longname); | |
84 // fprintf(outf,"Family %s\n",q.fam); | |
85 fprintf(outf,"Match_columns %i\n",q.L); | |
86 fprintf(outf,"No_of_seqs %i out of %i\n",q.N_filtered,q.N_in); | |
87 fprintf(outf,"Neff %-4.1f\n",q.Neff_HMM); | |
88 fprintf(outf,"Searched_HMMs %i\n",N_searched); | |
89 | |
90 // Print date stamp | |
91 time_t* tp=new(time_t); | |
92 *tp=time(NULL); | |
93 fprintf(outf,"Date %s",ctime(tp)); | |
94 delete (tp); (tp) = NULL; | |
95 | |
96 // Print command line | |
97 fprintf(outf,"Command "); | |
98 for (int i=0; i<par.argc; i++) | |
99 if (strlen(par.argv[i])<=par.maxdbstrlen) | |
100 fprintf(outf,"%s ",par.argv[i]); | |
101 else | |
102 fprintf(outf,"<%i characters> ",(int)strlen(par.argv[i])); | |
103 fprintf(outf,"\n\n"); | |
104 | |
105 #ifdef WINDOWS | |
106 if (par.trans) | |
107 fprintf(outf," No Hit Prob E-trans E-value Score SS Cols Query HMM Template HMM\n"); | |
108 else | |
109 fprintf(outf," No Hit Prob E-value P-value Score SS Cols Query HMM Template HMM\n"); | |
110 #else | |
111 if (par.trans) | |
112 fprintf(outf," No Hit Prob E-trans E-value Score SS Cols Query HMM Template HMM\n"); | |
113 else | |
114 fprintf(outf," No Hit Prob E-value P-value Score SS Cols Query HMM Template HMM\n"); | |
115 #endif | |
116 | |
117 Reset(); | |
118 while (!End()) // print hit list | |
119 { | |
120 hit = ReadNext(); | |
121 if (nhits>=par.Z) break; //max number of lines reached? | |
122 if (nhits>=par.z && hit.Probab < par.p) break; | |
123 if (nhits>=par.z && hit.Eval > par.E) continue; | |
124 // if (hit.matched_cols <=1) continue; // adding this might get to intransparent... analogous statement in PrintAlignments | |
125 nhits++; | |
126 sprintf(str,"%3i %-30.30s ",nhits,hit.longname); | |
127 | |
128 | |
129 #ifdef WINDOWS | |
130 if (par.trans) // Transitive scoring | |
131 fprintf(outf,"%-34.34s %5.1f %8.2G %8.2G %6.1f %5.1f %4i ",str,hit.Probab,hit.E1val,hit.Eval,hit.score,hit.score_ss,hit.matched_cols); | |
132 else // Normal scoring | |
133 fprintf(outf,"%-34.34s %5.1f %8.2G %8.2G %6.1f %5.1f %4i ",str,hit.Probab,hit.Eval,hit.Pval,hit.score,hit.score_ss,hit.matched_cols); | |
134 #else | |
135 if (par.trans) // Transitive scoring | |
136 fprintf(outf,"%-34.34s %5.1f %7.2G %7.2G %6.1f %5.1f %4i ",str,hit.Probab,hit.E1val,hit.Eval,hit.score,hit.score_ss,hit.matched_cols); | |
137 else // Normal scoring | |
138 fprintf(outf,"%-34.34s %5.1f %7.2G %7.2G %6.1f %5.1f %4i ",str,hit.Probab,hit.Eval,hit.Pval,hit.score,hit.score_ss,hit.matched_cols); | |
139 #endif | |
140 | |
141 sprintf(str,"%4i-%-4i ",hit.i1,hit.i2); | |
142 fprintf(outf,"%-10.10s",str); | |
143 sprintf(str,"%4i-%-4i",hit.j1,hit.j2); | |
144 fprintf(outf,"%-9.9s(%i)\n",str,hit.L); | |
145 } //end print hit list | |
146 fprintf(outf,"\n"); | |
147 if (strcmp(outfile,"stdout")) fclose(outf); | |
148 } | |
149 | |
150 | |
151 | |
152 ////////////////////////////////////////////////////////////////////////////// | |
153 /** | |
154 * @brief Print alignments of query sequences against hit sequences | |
155 */ | |
156 void | |
157 HitList::PrintAlignments( | |
158 | |
159 | |
160 #ifdef CLUSTALO | |
161 char **ppcFirstProf, char **ppcSecndProf, | |
162 #endif | |
163 HMM& q, char* outfile, char outformat) | |
164 { | |
165 Hit hit; | |
166 FullAlignment qt_ali(par.nseqdis+10); // maximum 10 annotation (pseudo) sequences (ss_dssp, sa_dssp, ss_pred, ss_conf, consens,...) | |
167 int nhits=0; | |
168 | |
169 #ifndef CLUSTALO_NOFILE | |
170 FILE* outf=NULL; | |
171 if (strcmp(outfile,"stdout")) | |
172 { | |
173 if (outformat==0) | |
174 outf=fopen(outfile,"a"); //append to summary hitlist | |
175 else | |
176 outf=fopen(outfile,"w"); //open for writing | |
177 if (!outf) OpenFileError(outfile); | |
178 } | |
179 else | |
180 outf = stdout; | |
181 #endif | |
182 | |
183 Reset(); | |
184 while (!End()) // print hit list | |
185 { | |
186 if (nhits>=par.B) break; //max number of lines reached? | |
187 hit = ReadNext(); | |
188 if (nhits>=par.b && hit.Probab < par.p) break; | |
189 if (nhits>=par.b && hit.Eval > par.E) continue; | |
190 // // adding this might get to intransparent... | |
191 // // analogous statement in PrintHitlist and hhalign.C | |
192 // if (hit.matched_cols <=1) continue; | |
193 nhits++; | |
194 | |
195 // Build double alignment of query against template sequences | |
196 qt_ali.Build(q,hit); | |
197 | |
198 #ifndef CLUSTALO | |
199 // Print out alignment | |
200 if (outformat==0) // HHR format | |
201 { | |
202 fprintf(outf,"No %-3i\n",nhits); | |
203 qt_ali.PrintHeader(outf,q,hit); | |
204 qt_ali.PrintHHR(outf,hit); | |
205 } | |
206 else if (outformat==1) // FASTA format | |
207 { | |
208 fprintf(outf,"# No %-3i\n",nhits); | |
209 qt_ali.PrintFASTA(outf,hit); | |
210 } | |
211 else if(outformat==2) // A2M format | |
212 { | |
213 fprintf(outf,"# No %-3i\n",nhits); | |
214 qt_ali.PrintA2M(outf,hit); | |
215 } | |
216 else // A3m format | |
217 { | |
218 fprintf(outf,"# No %-3i\n",nhits); | |
219 qt_ali.PrintA3M(outf,hit); | |
220 } | |
221 #else | |
222 qt_ali.OverWriteSeqs(ppcFirstProf, ppcSecndProf); | |
223 #endif | |
224 | |
225 qt_ali.FreeMemory(); | |
226 } | |
227 #ifndef CLUSTALO_NOFILE | |
228 if (strcmp(outfile,"stdout")) fclose(outf); | |
229 #endif | |
230 } | |
231 | |
232 | |
233 | |
234 | |
235 | |
236 //////////////////////////////////////////////////////////////////////////// | |
237 /** | |
238 * @brief Return the ROC_5 score for optimization | |
239 * (changed 28.3.08 by Michael & Johannes) | |
240 */ | |
241 void | |
242 HitList::Optimize(HMM& q, char* buffer) | |
243 { | |
244 const int NFAM =5; // calculate ROC_5 score | |
245 const int NSFAM=5; // calculate ROC_5 score | |
246 int roc=0; // ROC score | |
247 int fam=0; // number of hits from same family (at current threshold) | |
248 int not_fam=0; // number of hits not from same family | |
249 int sfam=0; // number of hits from same suporfamily (at current threshold) | |
250 int not_sfam=0; // number of hits not from same superfamily | |
251 Hit hit; | |
252 | |
253 SortList(); | |
254 Reset(); | |
255 while (!End()) | |
256 { | |
257 hit = ReadNext(); | |
258 if (!strcmp(hit.fam,q.fam)) fam++; // query and template from same superfamily? => positive | |
259 else if (not_fam<NFAM) // query and template from different family? => negative | |
260 { | |
261 not_fam++; | |
262 roc += fam; | |
263 } | |
264 if (!strcmp(hit.sfam,q.sfam)) sfam++; // query and template from same superfamily? => positive | |
265 else if (not_sfam<NSFAM) // query and template from different superfamily? => negative | |
266 { | |
267 not_sfam++; | |
268 roc += sfam; | |
269 } | |
270 // printf("qfam=%s tfam=%s qsfam=%s tsfam=%s fam=%-2i not_fam=%3i sfam=%-3i not_sfam=%-5i roc=%-3i\n",q.fam,hit.fam,q.sfam,hit.sfam,fam,not_fam,sfam,not_sfam,roc); | |
271 } | |
272 | |
273 // Write ROC score to file or stdout | |
274 FILE* buf=NULL; | |
275 if (strcmp(par.buffer,"stdout")) | |
276 { | |
277 buf=fopen(buffer,"w"); | |
278 if (!buf) OpenFileError(par.buffer); | |
279 } | |
280 else | |
281 buf = stdout; | |
282 | |
283 fprintf(buf,"%f\n",float(roc)/float(fam*NFAM+sfam*NSFAM)); // must be between 0 and 1 | |
284 if (v>=2) printf("ROC=%f\n",float(roc)/float(fam*NFAM+sfam*NSFAM)); | |
285 fclose(buf); | |
286 } | |
287 | |
288 | |
289 | |
290 ////////////////////////////////////////////////////////////////////////////// | |
291 /** | |
292 * @brief Print score distribution into file score_dist | |
293 */ | |
294 void | |
295 HitList::PrintScoreFile(HMM& q) | |
296 { | |
297 int i=0, n; | |
298 FILE* scoref=NULL; | |
299 Hit hit; | |
300 Hash<int> twice(10000); // make sure only one hit per HMM is listed | |
301 twice.Null(-1); | |
302 | |
303 if (strcmp(par.scorefile,"stdout")) | |
304 { | |
305 scoref=fopen(par.scorefile,"w"); | |
306 if (!scoref) | |
307 {cerr<<endl<<"WARNING from "<<par.argv[0]<<": could not open \'"<<par.scorefile<<"\'\n"; return;} | |
308 } | |
309 else | |
310 scoref = stdout; | |
311 Reset(); | |
312 fprintf(scoref,"NAME %s\n",q.longname); | |
313 fprintf(scoref,"FAM %s\n",q.fam); | |
314 fprintf(scoref,"FILE %s\n",q.file); | |
315 fprintf(scoref,"LENG %i\n",q.L); | |
316 fprintf(scoref,"\n"); | |
317 //fprintf(scoref,"TARGET REL LEN COL LOG-PVA S-TOT MS NALI\n"); | |
318 | |
319 //For hhformat, the PROBAB field has to start at position 41 !! | |
320 // ----+----1----+----2----+----3----+----4----+---- | |
321 fprintf(scoref,"TARGET FAMILY REL LEN COL LOG-PVA S-AASS PROBAB SCORE\n"); | |
322 // d153l__ 5 185 185 287.82 464.22 100.00 | |
323 // d1qsaa2 3 168 124 145.55 239.22 57.36 | |
324 while (!End()) | |
325 { | |
326 i++; | |
327 hit = ReadNext(); | |
328 if (twice[hit.name]==1) continue; // better hit with same HMM has been listed already | |
329 twice.Add(hit.name,1); | |
330 //if template and query are from the same superfamily | |
331 if (!strcmp(hit.name,q.name)) n=5; | |
332 else if (!strcmp(hit.fam,q.fam)) n=4; | |
333 else if (!strcmp(hit.sfam,q.sfam)) n=3; | |
334 else if (!strcmp(hit.fold,q.fold)) n=2; | |
335 else if (!strcmp(hit.cl,q.cl)) n=1; | |
336 else n=0; | |
337 fprintf(scoref,"%-10s %-10s %1i %3i %3i %s %7.2f %6.2f %7.2f\n",hit.name,hit.fam,n,hit.L,hit.matched_cols,sprintg(-1.443*hit.logPval,7),-hit.score_aass,hit.Probab,hit.score); | |
338 } | |
339 fclose(scoref); | |
340 } | |
341 | |
342 | |
343 inline double | |
344 logPvalue_HHblast(double s, double corr) | |
345 { | |
346 return -s*(1.0-0.5*corr) + (1.0-corr)*log(1.0+s); | |
347 // return -s*(1.0-0.5*corr) + log( 1.0+(1.0-corr)*s ); | |
348 // return -s*(1.0-0.5*corr) + log( 1.0+(1.0-corr)*(1.0-0.5*corr)*s ); | |
349 } | |
350 | |
351 inline double | |
352 Pvalue_HHblast(double s, double corr) | |
353 { | |
354 return exp(-s*(1.0-0.5*corr)) * pow(1.0+s,1.0-corr); | |
355 // return exp(-s*(1.0-0.5*corr)) * ( 1.0+(1.0-corr)*s ); | |
356 // return exp(-s*(1.0-0.5*corr)) * ( 1.0+(1.0-corr)*(1.0-0.5*corr)*s ); | |
357 } | |
358 | |
359 inline double | |
360 logLikelihood_HHblast(double s, double corr) | |
361 { | |
362 if (s<0.0) { s=0.0; if (corr<1E-5) corr=1E-5; else if (corr>0.99999) corr=0.99999; } | |
363 else { if (corr<0.0) corr=0.0; else if (corr>1.0) corr=1.0; } | |
364 return -s*(1.0-0.5*corr) - corr*log(1.0+s) + log(s*(1.0-0.5*corr)+0.5*corr); | |
365 // return -s*(1.0-0.5*corr) + log( s*(1.0-corr)*(1.0-0.5*corr)+0.5*corr ); | |
366 // return -s*(1.0-0.5*corr) + log((s*(1.0-corr)*(1.0-0.5*corr)+corr)*(1.0-0.5*corr)); | |
367 } | |
368 | |
369 ///////////////////////////////////////////////////////////////////////////////////// | |
370 /** | |
371 * @brief Evaluate the *negative* log likelihood for the order statistic of the uniform distribution | |
372 * for the best 10% of hits (vertex v = (corr,offset) ) | |
373 * The k'th order statistic for X~Uniform is p:=X^(k)~Beta(k,n-k+1) = const*p^(k-1)*(1-p)^(n-k) | |
374 * Needed to fit the correlation and score offset in HHblast | |
375 */ | |
376 double | |
377 HitList::RankOrderFitCorr(double* v) | |
378 { | |
379 double sum=0.0; | |
380 // printf("%8.2G %8.2G %i\n",v[0],v[1],Nprof); | |
381 int i1 = imin(Nprof,imax(50,int(0.05*Nprof))); | |
382 for (int i=0; i<i1; i++) | |
383 { | |
384 double p = Pvalue_HHblast(score[i]+v[1],v[0]); | |
385 // sum -= (1.0-double(i)/double(i1)) * weight[i] * ( double(i)*log(p) + (Nprof-i-1.0)*log(1.0-p) ); | |
386 float diff = p-(float(i)+1.0)/(Nprof+1.0); | |
387 sum += (1.0-double(i)/double(i1)) * weight[i]*diff*diff*(Nprof+1.0)*(Nprof+1.0)*(Nprof+2.0)/(i+10.0)/(Nprof-i); | |
388 // printf("%-3i Pval=%7.5f Preal=%7.5f diff=%7.5f sum=%7.5f\n",i,p,float(i+1)/(1.0+Nprof),diff,sum); | |
389 } | |
390 return sum; | |
391 } | |
392 | |
393 /** | |
394 * @brief Static wrapper-function for calling the nonstatic member function RankOrderFitCorr() | |
395 * ( see http://www.newty.de/fpt/callback.html#member ) | |
396 */ | |
397 double | |
398 HitList::RankOrderFitCorr_static(void* pt2hitlist, double* v) | |
399 { | |
400 HitList* mySelf = (HitList*) pt2hitlist; // explicitly cast to a pointer to Hitlist | |
401 return mySelf->RankOrderFitCorr(v); // call member function | |
402 } | |
403 | |
404 ///////////////////////////////////////////////////////////////////////////////////// | |
405 /** | |
406 * @brief Evaluate the *negative* log likelihood of the data at the vertex v = (corr,offset) | |
407 * Needed to fit the correlation and score offset in HHblast | |
408 */ | |
409 double | |
410 HitList::LogLikelihoodCorr(double* v) | |
411 { | |
412 double sum=0.0; | |
413 // printf("%8.2G %8.2G %i\n",v[0],v[1],Nprof); | |
414 for (int i=0; i<Nprof; i++) | |
415 { | |
416 sum -= weight[i]*logLikelihood_HHblast(score[i]+v[1],v[0]); | |
417 // printf("%-3i Pval=%7.5f Preal=%7.5f diff=%7.5f rmsd=%7.5f sum=%7.5f\n",i,Pvalue_HHblast(score[i],v[0]),float(i)/(1.0+Nprof),x,sqrt(sum/sumw),sum); | |
418 } | |
419 return sum; | |
420 } | |
421 | |
422 /** | |
423 * @brief Static wrapper-function for calling the nonstatic member function LogLikelihoodCorr() | |
424 * ( see http://www.newty.de/fpt/callback.html#member ) | |
425 */ | |
426 double | |
427 HitList::LogLikelihoodCorr_static(void* pt2hitlist, double* v) | |
428 { | |
429 HitList* mySelf = (HitList*) pt2hitlist; // explicitly cast to a pointer to Hitlist | |
430 return mySelf->LogLikelihoodCorr(v); // call member function | |
431 } | |
432 | |
433 ///////////////////////////////////////////////////////////////////////////////////// | |
434 /** | |
435 * @brief Evaluate the *negative* log likelihood of the data at the vertex v = (lamda,mu) | |
436 * p(s) = lamda * exp{ -exp[-lamda*(s-mu)] - lamda*(s-mu) } = lamda * exp( -exp(-x) - x) | |
437 */ | |
438 double | |
439 HitList::LogLikelihoodEVD(double* v) | |
440 { | |
441 double sum=0.0, sumw=0.0; | |
442 for (int i=0; i<Nprof; i++) | |
443 { | |
444 double x = v[0]*(score[i]-v[1]); | |
445 sum += weight[i]*(exp(-x)+x); | |
446 sumw += weight[i]; | |
447 } | |
448 return sum - sumw*log(v[0]); | |
449 } | |
450 | |
451 /** | |
452 * @brief Static wrapper-function for calling the nonstatic member function LogLikelihoodEVD() | |
453 * ( see http://www.newty.de/fpt/callback.html#member ) | |
454 */ | |
455 double | |
456 HitList::LogLikelihoodEVD_static(void* pt2hitlist, double* v) | |
457 { | |
458 HitList* mySelf = (HitList*) pt2hitlist; // explicitly cast to a pointer to Hitlist | |
459 return mySelf->LogLikelihoodEVD(v); // call member function | |
460 } | |
461 | |
462 ///////////////////////////////////////////////////////////////////////////////////// | |
463 /** | |
464 * @brief Subroutine to FindMin: try new point given by highest point ihigh and fac and replace ihigh if it is lower | |
465 */ | |
466 double | |
467 HitList::TryPoint(const int ndim, double* p, double* y, double* psum, int ihigh, double fac, double (*Func)(void* pt2hitlist, double* v)) | |
468 { | |
469 // New point p_try = p_c + fac*(p_high-p_c), | |
470 // where p_c = ( sum_i (p_i) - p_high)/ndim is the center of ndim other points | |
471 // => p_try = fac1*sum_i(p_i) + fac2*p_high | |
472 double fac1=(1.-fac)/ndim; | |
473 double fac2=fac-fac1; | |
474 double ptry[ndim]; //new point to try out | |
475 double ytry; //function value of new point | |
476 int j; //index for the ndim parameters | |
477 | |
478 for (j=0; j<ndim; j++) | |
479 ptry[j]=psum[j]*fac1+p[ihigh*ndim+j]*fac2; | |
480 ytry = (*Func)(this,ptry); | |
481 if (ytry<=y[ihigh]) | |
482 { | |
483 // if (v>=4) printf("Trying: %-7.3f %-7.3f %-7.3f -> accept\n",ptry[0],ptry[1],ytry); | |
484 y[ihigh]=ytry; | |
485 for (j=0; j<ndim; j++) | |
486 { | |
487 psum[j] += ptry[j]-p[ihigh*ndim+j]; //update psum[j] | |
488 p[ihigh*ndim+j]=ptry[j]; //replace p[ihigh] with ptry | |
489 } //Note: ihigh is now not highest point anymore! | |
490 } | |
491 // else if (v>=4) printf("Trying: %-7.3f %-7.3f %-7.3f -> reject\n",ptry[0],ptry[1],ytry); | |
492 | |
493 return ytry; | |
494 } | |
495 | |
496 | |
497 | |
498 ///////////////////////////////////////////////////////////////////////////////////// | |
499 /** | |
500 * @brief Find minimum with simplex method of Nelder and Mead (1965) | |
501 */ | |
502 float | |
503 HitList::FindMin(const int ndim, double* p, double* y, double tol, int& nfunc, double (*Func)(void* pt2hitlist, double* v)) | |
504 { | |
505 const int MAXNFUNC=99; //maximum allowed number of function evaluations | |
506 int ihigh; //index of highest point on simplex | |
507 int inext; //index of second highest point on simplex | |
508 int ilow; //index of lowest point on simplex | |
509 int i; //index for the ndim+1 points | |
510 int j; //index for the ndim parameters | |
511 double rtol; //tolerance: difference of function value between highest and lowest point of simplex | |
512 double temp; //dummy | |
513 double ytry; //function value of trial point | |
514 double psum[ndim]; //psum[j] = j'th coordinate of sum vector (sum over all vertex vectors) | |
515 | |
516 nfunc=0; //number of function evaluations =0 | |
517 //Calculate sum vector psum[j] | |
518 for (j=0; j<ndim; j++) | |
519 { | |
520 psum[j]=p[j]; | |
521 for (i=1; i<ndim+1; i++) | |
522 psum[j]+=p[i*ndim+j]; | |
523 } | |
524 | |
525 // Repeat finding better points in simplex until rtol<tol | |
526 while(1) | |
527 { | |
528 // Find indices for highest, next highest and lowest point | |
529 ilow=0; | |
530 if (y[0]>y[1]) {inext=1; ihigh=0;} else {inext=0; ihigh=1;} | |
531 for (i=0; i<ndim+1; i++) | |
532 { | |
533 if (y[i]<=y[ilow]) ilow=i; | |
534 if (y[i]>y[ihigh]) {inext=ihigh; ihigh=i;} | |
535 else if (y[i]>y[inext] && i!= ihigh) inext=i; | |
536 } | |
537 | |
538 // If tolerance in y is smaller than tol swap lowest point to index 0 and break -> return | |
539 rtol = 2.*fabs(y[ihigh]-y[ilow]) / (fabs(y[ihigh])+fabs(y[ilow])+1E-10); | |
540 if (rtol<tol) | |
541 { | |
542 temp=y[ilow]; y[ilow]=y[0]; y[0]=temp; | |
543 for (j=0; j<ndim; j++) | |
544 { | |
545 temp=p[ilow*ndim+j]; p[ilow*ndim+j]=p[j]; p[j]=temp; | |
546 } | |
547 break; | |
548 } | |
549 | |
550 // Max number of function evaluations exceeded? | |
551 if (nfunc>=MAXNFUNC ) | |
552 { | |
553 if (v) fprintf(stderr,"\nWARNING: maximum likelihood fit of score distribution did not converge.\n"); | |
554 return 1; | |
555 } | |
556 | |
557 nfunc+=2; | |
558 // Point-reflect highest point on the center of gravity p_c of the other ndim points of the simplex | |
559 if (v>=3) printf("%3i %-7.3f %-7.3f %-12.8f %-9.3E\n",nfunc,p[ilow*ndim],p[ilow*ndim+1],y[ilow],rtol); | |
560 // if (v>=2) printf(" %3i %-9.3E %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f\n",nfunc,rtol,p[ilow*ndim],p[ilow*ndim+1],y[ilow],p[inext*ndim],p[inext*ndim+1],y[inext],p[ihigh*ndim],p[ihigh*ndim+1],y[ihigh]); | |
561 ytry = TryPoint(ndim,p,y,psum,ihigh,-1.0,Func); //reflect highest point on p_c | |
562 | |
563 if (ytry<=y[ilow]) | |
564 { | |
565 ytry = TryPoint(ndim,p,y,psum,ihigh,2.0,Func); //expand: try new point 2x further away from p_c | |
566 // if (v>=2) printf("Expanded: %3i %-9.3E %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f\n",nfunc,rtol,p[ilow*ndim],p[ilow*ndim+1],y[ilow],p[inext*ndim],p[inext*ndim+1],y[inext],p[ihigh*ndim],p[ihigh*ndim+1],y[ihigh]); | |
567 } | |
568 else if (ytry>=y[inext]) | |
569 { | |
570 // The new point is worse than the second worst point | |
571 temp=y[ihigh]; | |
572 ytry=TryPoint(ndim,p,y,psum,ihigh,0.5,Func); //contract simplex by 0.5 along (p_high-p_c | |
573 // if (v>=2) printf("Compressed:%3i %-9.3E %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f\n",nfunc,rtol,p[ilow*ndim],p[ilow*ndim+1],y[ilow],p[inext*ndim],p[inext*ndim+1],y[inext],p[ihigh*ndim],p[ihigh*ndim+1],y[ihigh]); | |
574 if (ytry>=temp) | |
575 { | |
576 // Trial point is larger than worst point => contract simplex by 0.5 towards lowest point | |
577 for (i=0; i<ndim+1; i++) | |
578 { | |
579 if (i!=ilow) | |
580 { | |
581 for (j=0; j<ndim; j++) | |
582 p[i*ndim+j]=0.5*(p[i*ndim+j]+p[ilow+j]); | |
583 y[i] = (*Func)(this,p+i*ndim); | |
584 // y[i] = (*Func)(p+i*ndim); | |
585 } | |
586 } | |
587 nfunc+=ndim; | |
588 // if (v>=2) printf("Contracted:%3i %-9.3E %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f %-7.3f\n",nfunc,rtol,p[ilow*ndim],p[ilow*ndim+1],y[ilow],p[inext*ndim],p[inext*ndim+1],y[inext],p[ihigh*ndim],p[ihigh*ndim+1],y[ihigh]); | |
589 | |
590 //Calculate psum[j] | |
591 for (j=0; j<ndim; j++) | |
592 { | |
593 psum[j]=p[j]; | |
594 for (i=1; i<ndim+1; i++) | |
595 psum[j]+=p[i*ndim+j]; | |
596 } | |
597 } | |
598 } | |
599 else nfunc--; | |
600 } | |
601 return (float)rtol; | |
602 } | |
603 | |
604 | |
605 | |
606 ///////////////////////////////////////////////////////////////////////////////////// | |
607 /** | |
608 * @brief Do a maximum likelihod fit of the scores with an EV distribution with parameters lamda and mu | |
609 */ | |
610 void | |
611 HitList::MaxLikelihoodEVD(HMM& q, int nbest) | |
612 { | |
613 double tol=1E-6; // Maximum relative tolerance when minimizing -log(P)/N (~likelihood) | |
614 static char first_call=1; | |
615 static Hash<int> size_fam(MAXPROF/10); // Hash counts number of HMMs in family | |
616 static Hash<int> size_sfam(MAXPROF/10); // Hash counts number of families in superfamily | |
617 Hash<int> excluded(50); // Hash containing names of superfamilies to be excluded from fit | |
618 size_fam.Null(0); // Set int value to return when no data can be retrieved | |
619 size_sfam.Null(0); // Set int value to return when no data can be retrieved | |
620 excluded.Null(0); // Set int value to return when no data can be retrieved | |
621 Hit hit; | |
622 | |
623 double mu; // EVD[mu,lam](x) = exp(-exp(-(x-mu)/lam)) = P(score<=x) | |
624 double vertex[2*3]; // three vertices of the simplex in lamda-mu plane | |
625 double yvertex[3]; // log likelihood values at the three vertices of the simplex | |
626 int nfunc=0; // number of function calls | |
627 double sum_weights=0.0; | |
628 float sum_scores=0.0; | |
629 float rtol; | |
630 | |
631 if (first_call==1) | |
632 { | |
633 first_call=0; | |
634 // Count how many HMMs are in each family; set number of multiple hits per template nrep | |
635 if (v>=4) printf(" count number of profiles in each family and families in each superfamily ...\n"); | |
636 Reset(); | |
637 while (!End()) | |
638 { | |
639 hit = ReadNext(); | |
640 if (!size_fam.Contains(hit.fam)) (*size_sfam(hit.sfam))++; //Add one to hash element for superfamily | |
641 (*size_fam(hit.fam))++; //Add one to hash element for family | |
642 // printf("size(%s)=%i name=%s\n",hit.fam,*size_fam(hit.fam),hit.name) | |
643 } | |
644 fams=size_fam.Size(); | |
645 sfams=size_sfam.Size(); | |
646 if (v>=3) | |
647 printf("%-3i HMMs from %i families and %i superfamilies searched. Found %i hits\n",N_searched,fams,sfams,Size()); | |
648 } | |
649 | |
650 // Query has SCOP family identifier? | |
651 if (q.fam && q.fam[0]>='a' && q.fam[0]<='k' && q.fam[1]=='.') | |
652 { | |
653 char sfamid[NAMELEN]; | |
654 char* ptr_in_fam=q.fam; | |
655 while ((ptr_in_fam=strwrd(sfamid,ptr_in_fam,'-'))) | |
656 { | |
657 char* ptr=strrchr(sfamid,'.'); | |
658 if (ptr) *ptr='\0'; | |
659 excluded.Add(sfamid); | |
660 // fprintf(stderr,"Exclude SCOP superfamily %s ptr_in_fam='%s'\n",sfamid,ptr_in_fam); | |
661 } | |
662 } | |
663 // Exclude best superfamilies from fit | |
664 else if (nbest>0) | |
665 { | |
666 if (sfams<97+nbest) return; | |
667 | |
668 // Find the nbest best-scoring superfamilies for exclusion from first ML fit | |
669 if (v>=4) printf(" find %i best-scoring superfamilies to exclude from first fit ...\n",nbest); | |
670 hit = Smallest(); | |
671 excluded.Add(hit.sfam); | |
672 // printf("Exclude in first round: %s %8.2f %s\n",hit.name,hit.score_aass,hit.sfam); | |
673 while (excluded.Size()<nbest) | |
674 { | |
675 Reset(); | |
676 while (!End() && excluded.Contains(ReadNext().sfam)) ; | |
677 hit=ReadCurrent(); | |
678 while (!End()) | |
679 { | |
680 if (ReadNext()<hit && !excluded.Contains(ReadCurrent().sfam)) | |
681 hit=ReadCurrent(); | |
682 } | |
683 excluded.Add(hit.sfam); | |
684 // printf("Exclude in first round: %s %8.2f %s %i %i\n",hit.name,hit.score_aass,hit.sfam,excluded.Size(),excluded.Contains(hit.sfam)); | |
685 } | |
686 tol = 0.01/size_sfam.Size(); // tol=1/N would lead to delta(log-likelihood)~1 (where N ~ number of superfamilies) since (1+1/N)^N = e | |
687 } | |
688 else | |
689 { | |
690 // Find the best-scoring superfamilies from first fit for exclusion from second ML fit | |
691 if (v>=4) printf(" find best-scoring superfamilies to exclude from second fit ...\n"); | |
692 Reset(); | |
693 while (!End()) | |
694 { | |
695 hit = ReadNext(); | |
696 if (hit.Eval < 0.05) excluded.Add(hit.sfam); // changed from 0.5 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | |
697 } | |
698 tol = 0.001/size_sfam.Size(); // tol=1/N would lead to delta(log-likelihood)~1 (where N ~ number of superfamilies) since (1+1/N)^N = e | |
699 } | |
700 | |
701 // Put scores into score[] and weights into weight[] | |
702 if (v>=3) printf(" generate scores and weights array for ML fitting ...\n"); | |
703 Nprof=0; | |
704 Reset(); | |
705 while (!End()) | |
706 { | |
707 hit = ReadNext(); | |
708 if (hit.irep > 1) continue; //Use only best hit per template | |
709 if (Nprof>=MAXPROF) break; | |
710 | |
711 char sfamid[NAMELEN]; | |
712 char* ptr_in_fam=hit.fam; | |
713 while ((ptr_in_fam=strwrd(sfamid,ptr_in_fam,'-'))) | |
714 { | |
715 char* ptr=strrchr(sfamid,'.'); | |
716 if (ptr) *ptr='\0'; | |
717 if (excluded.Contains(sfamid)) break; //HMM is among superfamilies to be excluded | |
718 } | |
719 if (excluded.Contains(sfamid)) { | |
720 if (v>=3) fprintf(stderr,"Exclude hit %s (family %s contains %s)\n",hit.name,hit.fam,sfamid); | |
721 continue; | |
722 } | |
723 // ScopID(hit.cl,hit.fold,hit.sfam,hit.fam); //Get scop superfamily code for template | |
724 // if (*hit.sfam=='\0' || excluded.Contains(hit.sfam)) continue; // skip HMM | |
725 | |
726 score[Nprof] = hit.score; | |
727 weight[Nprof]=1./size_fam[hit.fam]/size_sfam[hit.sfam]; | |
728 sum_scores +=hit.score*weight[Nprof]; | |
729 sum_weights+=weight[Nprof]; | |
730 | |
731 //DEBUG | |
732 // if (v>=4) printf("%-10.10s %-12.12s %-3i %-12.12s %-3i %6.4f %6.4f %7.1f\n",hit.name,hit.fam,size_fam[hit.fam],hit.sfam,size_sfam[hit.sfam],1./size_fam[hit.fam]/size_sfam[hit.sfam],sum,hit.score); | |
733 Nprof++; | |
734 } | |
735 //DEBUG | |
736 if (v>=3) | |
737 printf("%i hits used for score distribution\n",Nprof); | |
738 // for (int i=0; i<Nprof; i++) printf("%3i score=%8.3f weight=%7.5f\n",i,score[i],weight[i]); | |
739 | |
740 // Set simplex vertices and function values | |
741 mu = sum_scores/sum_weights - 0.584/LAMDA; | |
742 if (par.loc) // fit only in local mode; in global mode use fixed value LAMDA and mu mean score | |
743 { | |
744 double (*Func)(void*, double*); | |
745 Func = HitList::LogLikelihoodEVD_static; | |
746 | |
747 if (nbest>0) {vertex[0]=LAMDA; vertex[1]=mu;} /////////////////////////////////////////// DEBUG | |
748 else {vertex[0]=q.lamda; vertex[1]=mu;} | |
749 vertex[2]=vertex[0]+0.1; vertex[3]=vertex[1]; | |
750 vertex[4]=vertex[0]; vertex[5]=vertex[1]+0.2; | |
751 yvertex[0]=Func(this,vertex ); | |
752 yvertex[1]=Func(this,vertex+2); | |
753 yvertex[2]=Func(this,vertex+4); | |
754 | |
755 // Find lam and mu that minimize negative log likelihood of data | |
756 if (v>=3) printf("Fitting to EVD by maximum likelihood...\niter lamda mu -log(P)/N tol\n"); | |
757 rtol = FindMin(2,vertex,yvertex,tol,nfunc,Func); | |
758 if (v>=3) printf("%3i %-7.3f %-7.2f %-7.3f %-7.1E\n\n",nfunc,vertex[0],vertex[1],yvertex[0]-(1.5772-log(vertex[0])),rtol); | |
759 // printf("HHsearch lamda=%-6.3f mu=%-6.3f\n",vertex[0],vertex[1]); | |
760 } | |
761 else | |
762 { | |
763 vertex[0]=LAMDA_GLOB; vertex[1]=mu; | |
764 } | |
765 | |
766 // Set lamda and mu of profile | |
767 q.lamda = vertex[0]; | |
768 q.mu = vertex[1]; | |
769 | |
770 // Set P-values and E-values | |
771 // CHECK UPDATE FROM score=-logpval to score=-logpval+SSSCORE2NATLOG*score_ss !!!! | |
772 Reset(); | |
773 while (!End()) | |
774 { | |
775 hit = ReadNext(); | |
776 | |
777 // Calculate total score in raw score units: P-value = 1- exp(-exp(-lamda*(Saa-mu))) | |
778 hit.weight=1./size_fam[hit.fam]/size_sfam[hit.sfam]; // needed for transitive scoring | |
779 hit.logPval = logPvalue(hit.score,vertex); | |
780 hit.Pval=Pvalue(hit.score,vertex); | |
781 hit.Eval=exp(hit.logPval+log(N_searched)); | |
782 // hit.score_aass = hit.logPval/0.45-3.0 - hit.score_ss; // median(lamda)~0.45, median(mu)~4.0 in EVDs for scop20.1.63 HMMs | |
783 hit.score_aass = -q.lamda*(hit.score-q.mu)/0.45-3.0 - fmin(hit.score_ss,fmax(0.0,0.5*hit.score-5.0)); // median(lamda)~0.45, median(mu)~3.0 in EVDs for scop20.1.63 HMMs | |
784 hit.Probab = Probab(hit); | |
785 if (nbest>0 && par.loc) // correct length correction (not needed for second round of fitting, since lamda very similar) | |
786 if (par.idummy==0) //////////////////////////////////////////// | |
787 hit.score += log(q.L*hit.L)*(1/LAMDA-1/vertex[0]); | |
788 hit.score_sort = hit.score_aass; | |
789 Overwrite(hit); // copy hit object into current position of hitlist | |
790 | |
791 if (nbest==0 && par.trans==1) // if in transitive scoring mode (weights file given) | |
792 TransitiveScoring(); | |
793 else if (nbest==0 && par.trans==2) // if in transitive scoring mode (weights file given) | |
794 TransitiveScoring2(); | |
795 else if (nbest==0 && par.trans==3) // if in transitive scoring mode (weights file given) | |
796 TransitiveScoring3(); | |
797 else if (nbest==0 && par.trans==4) // if in transitive scoring mode (weights file given) | |
798 TransitiveScoring4(); | |
799 } | |
800 } | |
801 | |
802 | |
803 ///////////////////////////////////////////////////////////////////////////////////// | |
804 /** | |
805 * @brief Calculate correlation and score offset for HHblast composite E-values | |
806 */ | |
807 void | |
808 HitList::CalculateHHblastCorrelation(HMM& q) | |
809 { | |
810 int nfunc=0; // number of function calls | |
811 double tol; // Maximum relative tolerance when minimizing -log(P)/N (~likelihood) | |
812 double vertex[2*3]; // three vertices of the simplex in lamda-mu plane | |
813 double yvertex[3]; // log likelihood values at the three vertices of the simplex | |
814 Hit hit; | |
815 Hash<int> excluded(50); // Hash containing names of superfamilies to be excluded from fit | |
816 excluded.Null(0); // Set int value to return when no data can be retrieved | |
817 | |
818 // Set sum of HHsearch and PSI-BLAST score for calculating correlation | |
819 Reset(); | |
820 while (!End()) | |
821 { | |
822 hit = ReadNext(); | |
823 hit.score_sort = hit.logPval + blast_logPvals->Show(hit.name); // if template not in hash, return log Pval = 0, i.e. Pvalue = 1! | |
824 Overwrite(hit); // copy hit object into current position of hitlist | |
825 } | |
826 | |
827 // Query has SCOP family identifier? | |
828 if (q.fam && q.fam[0]>='a' && q.fam[0]<='k' && q.fam[1]=='.') | |
829 { | |
830 char sfamid[NAMELEN]; | |
831 char* ptr_in_fam=q.fam; | |
832 while ((ptr_in_fam=strwrd(sfamid,ptr_in_fam,'-'))) | |
833 { | |
834 char* ptr=strrchr(sfamid,'.'); | |
835 if (ptr) *ptr='\0'; | |
836 excluded.Add(sfamid); | |
837 fprintf(stderr,"Exclude SCOP superfamily %s ptr_in_fam='%s'\n",sfamid,ptr_in_fam); | |
838 } | |
839 } | |
840 | |
841 // Resort list by sum of log P-values | |
842 ResortList(); // use InsertSort to resort list according to sum of minus-log-Pvalues | |
843 Nprof=0; | |
844 Reset(); | |
845 ReadNext(); // skip best hit | |
846 while (!End()) | |
847 { | |
848 hit = ReadNext(); | |
849 if (hit.irep>=2) continue; // use only best alignments | |
850 // if (hit.Eval<0.005) {if (v>=3) printf("Fitting HHblast correlation coefficient: skipping %s with Evalue=%9.1g\n",hit.name,hit.Eval); continue;} | |
851 if (Nprof>=MAXPROF) break; | |
852 | |
853 char sfamid[NAMELEN]; | |
854 char* ptr_in_fam=hit.fam; | |
855 while ((ptr_in_fam=strwrd(sfamid,ptr_in_fam,'-'))) | |
856 { | |
857 char* ptr=strrchr(sfamid,'.'); | |
858 if (ptr) *ptr='\0'; | |
859 if (excluded.Contains(sfamid)) break; //HMM is among superfamilies to be excluded | |
860 } | |
861 if (excluded.Contains(sfamid)) { | |
862 if (v>=1) fprintf(stderr,"Exclude hit %s (family %s contains %s)\n",hit.name,hit.fam,sfamid); | |
863 continue; | |
864 } | |
865 score[Nprof] = -hit.score_sort; | |
866 weight[Nprof] = 1.0; // = hit.weight; | |
867 // printf("%3i %-12.12s %7.3f + %7.3f = %7.3f \n",Nprof,hit.name,hit.logPval,blast_logPvals->Show(hit.name),-hit.score_sort); ////////////////////// | |
868 printf("%3i %7.3f %7.3f\n",Nprof,hit.Pval,exp(blast_logPvals->Show(hit.name))); ////////////////////// | |
869 Nprof++; | |
870 } | |
871 | |
872 // Fit correlation | |
873 vertex[0]=0.5; vertex[1]=0.2; | |
874 vertex[2]=vertex[0]+0.2; vertex[3]=vertex[1]; | |
875 vertex[4]=vertex[0]; vertex[5]=vertex[1]+0.2; | |
876 | |
877 yvertex[0]=RankOrderFitCorr(vertex ); | |
878 yvertex[1]=RankOrderFitCorr(vertex+2); | |
879 yvertex[2]=RankOrderFitCorr(vertex+4); | |
880 // yvertex[0]=LogLikelihoodCorr(vertex ); | |
881 // yvertex[1]=LogLikelihoodCorr(vertex+2); | |
882 // yvertex[2]=LogLikelihoodCorr(vertex+4); | |
883 tol = 1e-6; | |
884 v=3;////////////////////////////////// | |
885 // Find correlation and offset that minimize mean square deviation of reported composite Pvalues from actual | |
886 if (v>=2) printf("Fitting correlation coefficient for HHblast...\niter corr offset logLikelihood tol\n"); | |
887 float rtol = FindMin(2,vertex,yvertex,tol,nfunc, HitList::RankOrderFitCorr_static); | |
888 if (v>=2) printf("%3i %-7.3f %-7.2f %-7.3f %-7.1E\n\n",nfunc,vertex[0],vertex[1],yvertex[0],rtol); | |
889 if (vertex[0]<0) vertex[0]=0.0; | |
890 | |
891 // Print correlation and offset for profile | |
892 printf("HHblast correlation=%-6.3f score offset=%-6.3f\n",vertex[0],vertex[1]); | |
893 v=2;////////////////////////////////// | |
894 } | |
895 | |
896 | |
897 /** | |
898 * @brief Calculate HHblast composite E-values | |
899 */ | |
900 inline double | |
901 corr_HHblast(float Nq, float Nt) | |
902 { | |
903 return 0.5; | |
904 } | |
905 | |
906 /** | |
907 * @brief Calculate HHblast composite E-values | |
908 */ | |
909 inline double | |
910 offset_HHblast(float Nq, float Nt) | |
911 { | |
912 return 0.0; | |
913 } | |
914 | |
915 ////////////////////////////////////////////////////////////////////////////// | |
916 /** | |
917 * @brief Calculate HHblast composite E-values | |
918 */ | |
919 void | |
920 HitList::CalculateHHblastEvalues(HMM& q) | |
921 { | |
922 Hit hit; | |
923 float corr, offset; // correlation coefficient and offset for calculating composite HHblast P-values | |
924 | |
925 Reset(); | |
926 while (!End()) | |
927 { | |
928 hit = ReadNext(); | |
929 corr = corr_HHblast(q.Neff_HMM,hit.Neff_HMM); | |
930 offset = offset_HHblast(q.Neff_HMM,hit.Neff_HMM); | |
931 hit.score_sort = hit.logPval + blast_logPvals->Show(hit.name); | |
932 hit.logPval = logPvalue_HHblast(-hit.score_sort+offset,corr); // overwrite logPval from HHsearch with composite logPval from HHblast | |
933 hit.Pval = Pvalue_HHblast(-hit.score_sort+offset,corr); // overwrite P-value from HHsearch with composite P-value from HHblast | |
934 hit.Eval = exp(hit.logPval+log(N_searched)); // overwrite E-value from HHsearch with composite E-value from HHblast | |
935 hit.Probab = Probab(hit); | |
936 Overwrite(hit); // copy hit object into current position of hitlist | |
937 } | |
938 ResortList(); // use InsertSort to resort list according to sum of minus-log-Pvalues | |
939 } | |
940 | |
941 | |
942 ////////////////////////////////////////////////////////////////////////////// | |
943 /** | |
944 * @brief Read file generated by blastpgp (default output) and store P-values in hash | |
945 */ | |
946 void | |
947 HitList::ReadBlastFile(HMM& q) | |
948 { | |
949 char line[LINELEN]=""; // input line | |
950 int Ndb; // number of sequences in database | |
951 int Ldb=0; // size of database in number of amino acids | |
952 char* templ; | |
953 int i; | |
954 if (!blast_logPvals) { blast_logPvals = new(Hash<float>); blast_logPvals->New(16381,0); } | |
955 | |
956 FILE* blaf = NULL; | |
957 if (!strcmp(par.blafile,"stdin")) blaf=stdin; | |
958 else | |
959 { | |
960 blaf = fopen(par.blafile,"rb"); | |
961 if (!blaf) OpenFileError(par.blafile); | |
962 } | |
963 | |
964 // Read number of sequences and size of database | |
965 while (fgetline(line,LINELEN-1,blaf) && !strstr(line,"sequences;")); | |
966 if (!strstr(line,"sequences;")) FormatError(par.blafile,"No 'Database:' string found."); | |
967 char* ptr=line; | |
968 Ndb = strint(ptr); | |
969 if (Ndb==INT_MIN) FormatError(par.blafile,"No integer for number of sequences in database found."); | |
970 while ((i=strint(ptr))>INT_MIN) Ldb = 1000*Ldb + i; | |
971 if (Ldb==0) FormatError(par.blafile,"No integer for size of database found."); | |
972 printf("\nNumber of sequences in database = %i Size of database = %i\n",Ndb,Ldb); | |
973 | |
974 // Read all E-values and sequence lengths | |
975 while (fgetline(line,LINELEN-1,blaf)) | |
976 { | |
977 if (line[0]=='>') | |
978 { | |
979 // Read template name | |
980 templ = new(char[255]); | |
981 ptr = line+1; | |
982 strwrd(templ,ptr); | |
983 if (!blast_logPvals->Contains(templ)) // store logPval only for best HSP with template | |
984 { | |
985 // Read length | |
986 while (fgetline(line,LINELEN-1,blaf) && !strstr(line,"Length =")); | |
987 ptr = line+18; | |
988 int length = strint(ptr); | |
989 // Read E-value | |
990 fgetline(line,LINELEN-1,blaf); | |
991 fgetline(line,LINELEN-1,blaf); | |
992 float EvalDB; // E-value[seq-db] = Evalue for comparison Query vs. database, from PSI-BLAST | |
993 float EvalQT; // E-value[seq-seq] = Evalue for comparison Query vs. template (seq-seq) | |
994 double logPval; | |
995 ptr = strstr(line+20,"Expect ="); | |
996 if (!ptr) FormatError(par.blafile,"No 'Expect =' string found."); | |
997 if (sscanf(ptr+8,"%g",&EvalDB)<1) | |
998 { | |
999 ptr[7]='1'; | |
1000 if (sscanf(ptr+7,"%g",&EvalDB)<1) | |
1001 FormatError(par.blafile,"No Evalue found after 'Expect ='."); | |
1002 } | |
1003 // Calculate P-value[seq-seq] = 1 - exp(-E-value[seq-seq]) = 1 - exp(-Lt/Ldb*E-value[seq-db]) | |
1004 EvalQT = length/double(Ldb)*double(EvalDB); | |
1005 if (EvalQT>1E-3) logPval = log(1.0-exp(-EvalQT)); else logPval=log(double(EvalQT)+1.0E-99); | |
1006 blast_logPvals->Add(templ,logPval); | |
1007 printf("template=%-10.10s length=%-3i EvalDB=%8.2g EvalQT=%8.2g P-value=%8.2g log Pval=%8.2g\n",templ,length,EvalDB,EvalQT,exp(logPval),logPval); | |
1008 } | |
1009 else { | |
1010 delete[] templ; templ = NULL; | |
1011 } | |
1012 } | |
1013 } | |
1014 fclose(blaf); | |
1015 } | |
1016 | |
1017 | |
1018 ///////////////////////////////////////////////////////////////////////////////////// | |
1019 /** | |
1020 * @brief Calculate output of hidden neural network units | |
1021 */ | |
1022 inline float | |
1023 calc_hidden_output(const float* weights, const float* bias, float Lqnorm, float Ltnorm, float Nqnorm, float Ntnorm) | |
1024 { | |
1025 float res; | |
1026 // Calculate activation of hidden unit = sum of all inputs * weights + bias | |
1027 res = Lqnorm*weights[0] + Ltnorm*weights[1] + Nqnorm*weights[2] + Ntnorm*weights[3] + *bias; | |
1028 res = 1.0 / (1.0 + exp(-(res ))); // logistic function | |
1029 return res; | |
1030 } | |
1031 | |
1032 //////////////////////////////////////////////////////////////////////////////////// | |
1033 /** | |
1034 * @brief Neural network regressions of lamda for EVD | |
1035 */ | |
1036 inline float | |
1037 lamda_NN(float Lqnorm, float Ltnorm, float Nqnorm, float Ntnorm) | |
1038 { | |
1039 const int inputs = 4; | |
1040 const int hidden = 4; | |
1041 const float biases[] = {-0.73195, -1.43792, -1.18839, -3.01141}; // bias for all hidden units | |
1042 const float weights[] = { // Weights for the neural networks (column = start unit, row = end unit) | |
1043 -0.52356, -3.37650, 1.12984, -0.46796, | |
1044 -4.71361, 0.14166, 1.66807, 0.16383, | |
1045 -0.94895, -1.24358, -1.20293, 0.95434, | |
1046 -0.00318, 0.53022, -0.04914, -0.77046, | |
1047 2.45630, 3.02905, 2.53803, 2.64379 | |
1048 }; | |
1049 float lamda=0.0; | |
1050 for (int h = 0; h<hidden; h++) { | |
1051 lamda += calc_hidden_output( weights+inputs*h, biases+h, Lqnorm,Ltnorm,Nqnorm,Ntnorm ) * weights[hidden*inputs+h]; | |
1052 } | |
1053 return lamda; | |
1054 } | |
1055 | |
1056 //////////////////////////////////////////////////////////////////////////////////// | |
1057 /** | |
1058 * @brief Neural network regressions of mu for EVD | |
1059 */ | |
1060 inline float | |
1061 mu_NN(float Lqnorm, float Ltnorm, float Nqnorm, float Ntnorm) | |
1062 { | |
1063 const int inputs = 4; | |
1064 const int hidden = 6; | |
1065 const float biases[] = {-4.25264, -3.63484, -5.86653, -4.78472, -2.76356, -2.21580}; // bias for all hidden units | |
1066 const float weights[] = { // Weights for the neural networks (column = start unit, row = end unit) | |
1067 1.96172, 1.07181, -7.41256, 0.26471, | |
1068 0.84643, 1.46777, -1.04800, -0.51425, | |
1069 1.42697, 1.99927, 0.64647, 0.27834, | |
1070 1.34216, 1.64064, 0.35538, -8.08311, | |
1071 2.30046, 1.31700, -0.46435, -0.46803, | |
1072 0.90090, -3.53067, 0.59212, 1.47503, | |
1073 -1.26036, 1.52812, 1.58413, -1.90409, 0.92803, -0.66871 | |
1074 }; | |
1075 float mu=0.0; | |
1076 for (int h = 0; h<hidden; h++) { | |
1077 mu += calc_hidden_output( weights+inputs*h, biases+h, Lqnorm,Ltnorm,Nqnorm,Ntnorm ) * weights[hidden*inputs+h]; | |
1078 } | |
1079 return 20.0*mu; | |
1080 } | |
1081 | |
1082 ////////////////////////////////////////////////////////////////////////////// | |
1083 /** | |
1084 * @brief Calculate Pvalues as a function of query and template lengths and diversities | |
1085 */ | |
1086 void | |
1087 HitList::CalculatePvalues(HMM& q) | |
1088 { | |
1089 Hit hit; | |
1090 float lamda=0.4, mu=3.0; | |
1091 const float log1000=log(1000.0); | |
1092 | |
1093 if (par.idummy!=2) | |
1094 { | |
1095 printf("WARNING: idummy should have been ==2 (no length correction)\n"); | |
1096 exit(4); | |
1097 } | |
1098 | |
1099 if(N_searched==0) N_searched=1; | |
1100 if (v>=2) | |
1101 printf("Calculate Pvalues as a function of query and template lengths and diversities...\n"); | |
1102 Reset(); | |
1103 while (!End()) | |
1104 { | |
1105 hit = ReadNext(); | |
1106 | |
1107 if (par.loc) | |
1108 { | |
1109 lamda = lamda_NN( log(q.L)/log1000, log(hit.L)/log1000, q.Neff_HMM/10.0, hit.Neff_HMM/10.0 ); | |
1110 mu = mu_NN( log(q.L)/log1000, log(hit.L)/log1000, q.Neff_HMM/10.0, hit.Neff_HMM/10.0 ); | |
1111 // if (v>=3 && nhits++<20) | |
1112 // printf("hit=%-10.10s Lq=%-4i Lt=%-4i Nq=%5.2f Nt=%5.2f => lamda=%-6.3f mu=%-6.3f\n",hit.name,q.L,hit.L,q.Neff_HMM,hit.Neff_HMM,lamda,mu); | |
1113 } | |
1114 else | |
1115 { | |
1116 printf("WARNING: global calibration not yet implemented!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); | |
1117 } | |
1118 hit.logPval = logPvalue(hit.score,lamda,mu); | |
1119 hit.Pval = Pvalue(hit.score,lamda,mu); | |
1120 hit.Eval=exp(hit.logPval+log(N_searched)); | |
1121 // hit.score_aass = hit.logPval/LAMDA-3.0 - hit.score_ss; // median(lamda)~0.45, median(mu)~3.0 in EVDs for scop20.1.63 HMMs | |
1122 // P-value = 1- exp(-exp(-lamda*(Saa-mu))) => -lamda*(Saa-mu) = log(-log(1-Pvalue)) | |
1123 hit.score_aass = (hit.logPval<-10.0? hit.logPval : log(-log(1-hit.Pval)) )/0.45 - fmin(lamda*hit.score_ss,fmax(0.0,0.2*(hit.score-8.0)))/0.45 - 3.0; | |
1124 hit.score_sort = hit.score_aass; | |
1125 hit.Probab = Probab(hit); | |
1126 Overwrite(hit); | |
1127 } | |
1128 SortList(); | |
1129 Reset(); | |
1130 return; | |
1131 } | |
1132 | |
1133 ////////////////////////////////////////////////////////////////////////////// | |
1134 /** | |
1135 * @brief Calculate Pvalues from calibration of 0: query HMM, 1:template HMMs, 2: both | |
1136 */ | |
1137 void | |
1138 HitList::GetPvalsFromCalibration(HMM& q) | |
1139 { | |
1140 Hit hit; | |
1141 char warn=0; | |
1142 if(N_searched==0) N_searched=1; | |
1143 if (v>=2) | |
1144 { | |
1145 switch (par.calm) | |
1146 { | |
1147 case 0: | |
1148 printf("Using lamda=%-5.3f and mu=%-5.2f from calibrated query HMM %s. \n",q.lamda,q.mu,q.name); | |
1149 printf("Note that HMMs need to be recalibrated when changing HMM-HMM alignment options.\n"); | |
1150 break; | |
1151 case 1: | |
1152 printf("Using score distribution parameters lamda and mu from database HMMs \n"); | |
1153 break; | |
1154 case 2: | |
1155 printf("Combining score distribution parameters lamda and mu from query and database HMMs\n"); | |
1156 printf("Note that HMMs need to be recalibrated when changing HMM-HMM alignment options.\n"); | |
1157 break; | |
1158 } | |
1159 } | |
1160 Reset(); | |
1161 while (!End()) | |
1162 { | |
1163 hit = ReadNext(); | |
1164 if (par.calm==0 || (hit.logPvalt==0) ) | |
1165 { | |
1166 hit.logPval = logPvalue(hit.score,q.lamda,q.mu); | |
1167 hit.Pval = Pvalue(hit.score,q.lamda,q.mu); | |
1168 if (par.calm>0 && warn++<1 && v>=1) | |
1169 printf("Warning: some template HMM (e.g. %s) are not calibrated. Using query calibration.\n",hit.name); | |
1170 } | |
1171 else if (par.calm==1) | |
1172 { | |
1173 hit.logPval = hit.logPvalt; | |
1174 hit.Pval = hit.Pvalt; | |
1175 } | |
1176 else if (par.calm==2) | |
1177 { | |
1178 hit.logPval = 0.5*( logPvalue(hit.score,q.lamda,q.mu) + hit.logPvalt); | |
1179 hit.Pval = sqrt( Pvalue(hit.score,q.lamda,q.mu) * hit.Pvalt); | |
1180 if (v>=5) printf("Score: %7.1f lamda: %7.1f mu: %7.1f P-values: query-calibrated: %8.2G template-calibrated: %8.2G geometric mean: %8.2G\n",hit.score,q.lamda,q.mu,Pvalue(hit.score,q.lamda,q.mu),hit.Pvalt,hit.Pval); | |
1181 } | |
1182 | |
1183 hit.Eval=exp(hit.logPval+log(N_searched)); | |
1184 // hit.score_aass = hit.logPval/LAMDA-3.0 - hit.score_ss; // median(lamda)~0.45, median(mu)~3.0 in EVDs for scop20.1.63 HMMs | |
1185 // P-value = 1- exp(-exp(-lamda*(Saa-mu))) => -lamda*(Saa-mu) = log(-log(1-Pvalue)) | |
1186 hit.score_aass = (hit.logPval<-10.0? hit.logPval : log(-log(1-hit.Pval)) ) / 0.45-3.0 - fmin(hit.score_ss,fmax(0.0,0.5*hit.score-5.0)); | |
1187 hit.score_sort = hit.score_aass; | |
1188 hit.Probab = Probab(hit); | |
1189 Overwrite(hit); | |
1190 } | |
1191 SortList(); | |
1192 Reset(); | |
1193 return; | |
1194 } | |
1195 | |
1196 | |
1197 | |
1198 | |
1199 | |
1200 | |
1201 | |
1202 | |
1203 | |
1204 ////////////////////////////////////////////////////////////////////////////// | |
1205 ////////////////////////////////////////////////////////////////////////////// | |
1206 ////////////////////////////////////////////////////////////////////////////// | |
1207 // Transitive scoring | |
1208 ////////////////////////////////////////////////////////////////////////////// | |
1209 ////////////////////////////////////////////////////////////////////////////// | |
1210 ////////////////////////////////////////////////////////////////////////////// | |
1211 | |
1212 | |
1213 | |
1214 | |
1215 | |
1216 | |
1217 | |
1218 ///////////////////////////////////////////////////////////////////////////////////// | |
1219 /** | |
1220 * @brief Calculate P-values and Probabilities from transitive scoring over whole database | |
1221 */ | |
1222 void | |
1223 HitList::TransitiveScoring() | |
1224 { | |
1225 void PrintMatrix(float** V, int N); | |
1226 void PrintMatrix(double** V, int N); | |
1227 | |
1228 float** Z; // matrix of intra-db Z-scores Z_kl | |
1229 float** C; // covariance matrix for Z_k: C_kl = sum_m=1^N (Z_km * Z_lm) | |
1230 char** fold; // fold name of HMM k | |
1231 char** fam; // family of HMM k | |
1232 float* Prob; // probability of HMM k | |
1233 float* Zq; // Zq[k] = Z-score between query and database HMM k | |
1234 float* Ztq; // Ztq[k] = transitive Z-score from query to database HMM k: Ztq[k] = sum_l[ w_ql * Z_lk] / normalization_q | |
1235 float* Zrq; // Zrq[k] = transitive Z-score from database HMM k to query: Zrq[k] = sum_l[ w_kl * Z_lq] / normalization_k | |
1236 float* w; // unnormalized weight matrix; w[l] is w_ql or w_kl, respectively | |
1237 int* ll; // ll[m] is the m'th index l for which Z_lq, Z_lk > Zmin_trans | |
1238 int N; // dimension of weight matrix is NxN | |
1239 int M; // number of HMMs l with Z_ql>Ztrans_min (or Z_lk>Ztrans_min, respectively) | |
1240 int k,l,m,n; // indices for database HMMs | |
1241 char name[NAMELEN]; | |
1242 Hash<int> index(MAXPROF+7); // index{name} = index of HMM name in {1,...,N} | |
1243 index.Null(-1); // Set int value to return when no data can be retrieved | |
1244 Hash<int> excluded(13); // Hash containing names of superfamilies to be excluded from fit | |
1245 excluded.Null(0); // Set int value to return when no data can be retrieved | |
1246 Hit hit; | |
1247 size_t unused; /* disable fread gcc warning */ | |
1248 | |
1249 // Read weights matrix W with index hash and names array | |
1250 fprintf(stderr,"Reading in weights file\n"); | |
1251 FILE* wfile = fopen(par.wfile,"rb"); | |
1252 if (v>=1 && wfile==NULL) | |
1253 { | |
1254 fprintf(stderr,"Error: %s could not be opened: (N_searched=%i) ",par.wfile,N_searched); | |
1255 perror("fopen"); | |
1256 fprintf(stderr,"Skipping caclulation of transitive P-values\n"); | |
1257 par.trans=0; | |
1258 return; | |
1259 } | |
1260 unused = fread(&N,sizeof(int),1,wfile); // read matrix dimension (i.e. number of HMMs in database) | |
1261 if (v>=1 && N!=N_searched) | |
1262 { | |
1263 fprintf(stderr,"Error: Number %i of HMMs in weight file is different from number %i of HMMs in searched databases. \n",N,N_searched); | |
1264 fprintf(stderr,"Skipping caclulation of transitive P-values\n"); | |
1265 par.trans=0; | |
1266 return; | |
1267 } | |
1268 if (v>=2) fprintf(stderr,"Calculating transitive P-values for %i HMMs\n",N); | |
1269 // Read names of HMMs (to specify mapping of HMM to matrix indices) | |
1270 for (k=0; k<N; k++) | |
1271 { | |
1272 unused = fread(name,sizeof(char),IDLEN,wfile); | |
1273 index.Add(name,k); | |
1274 } | |
1275 // Read symmetric Z-scores matrix | |
1276 Z = new(float*[N]); | |
1277 for (k=0; k<N; k++) | |
1278 { | |
1279 Z[k] = new(float[N]); | |
1280 for (l=0; l<k; l++) Z[k][l] = Z[l][k]; | |
1281 unused = fread(Z[k]+k,sizeof(float),N-k,wfile); | |
1282 } | |
1283 // Read symmetric covariance matrix | |
1284 C = new(float*[N]); | |
1285 for (k=0; k<N; k++) | |
1286 { | |
1287 C[k] = new(float[N]); | |
1288 for (l=0; l<k; l++) C[k][l] = C[l][k]; | |
1289 unused = fread(C[k]+k,sizeof(float),N-k,wfile); | |
1290 } | |
1291 fclose(wfile); | |
1292 | |
1293 // Allocate memory | |
1294 Zq = new(float[N]); | |
1295 Ztq = new(float[N]); | |
1296 Zrq = new(float[N]); | |
1297 fold = new(char*[N]); | |
1298 fam = new(char*[N]); | |
1299 Prob = new(float[N]); | |
1300 ll = new(int[N]); | |
1301 w = new(float[N]); | |
1302 | |
1303 // Transform P-values to normally distributed Z-scores and store in Zq vector | |
1304 fprintf(stderr,"Transform P-values to Z-scores\n"); | |
1305 float Zmax_neg = Score2Z( -log(MINEVALEXCL) + log(N_searched) ); // calculate Z-score corresponding to E-value MINEVALEXCL | |
1306 float Zmin_trans = Score2Z( -log(par.Emax_trans) + log(N_searched) ); // calculate Z-score corresponding to E-value par.Emax_trans | |
1307 printf("Zmax = %6.2f Zmin = %6.2f \n",Zmax_neg,Zmin_trans); | |
1308 | |
1309 Reset(); | |
1310 while (!End()) | |
1311 { | |
1312 hit = ReadNext(); | |
1313 if (hit.irep>1) continue; | |
1314 k = index.Show(hit.name); | |
1315 if (k<0) {fprintf(stderr,"Error: no index found in weights file for domain %s\n",hit.name); exit(1);} | |
1316 if (hit.logPvalt<0) | |
1317 Zq[k] = 0.5*Score2Z(fabs(hit.logPval)) + 0.5*Score2Z(fabs(hit.logPvalt)); // Zq[k] = 0.5*(Zkq + Zqk) | |
1318 else | |
1319 Zq[k] = Score2Z(fabs(hit.logPval)); // Zq[k] = Zqk | |
1320 // printf("%4i %-10.10s logPvalt=%9g Zq=%9f\n",k,hit.name,hit.logPvalt,Zq[k]); | |
1321 // if (isnan(Zq[k])) { | |
1322 // fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n"); | |
1323 // printf("%4i %-10.10s logPval=%9g logPvalt=%9g Zq=%9f\n",k,hit.name,hit.logPval,hit.logPvalt,Zq[k]); | |
1324 // par.trans=0; | |
1325 // return; | |
1326 // } | |
1327 if (Zq[k]>Zmax_neg) excluded.Add(hit.fold); | |
1328 fold[k] = new(char[IDLEN]); | |
1329 fam[k] = new(char[IDLEN]); | |
1330 strcpy(fold[k],hit.fold); | |
1331 strcpy(fam[k],hit.fam); | |
1332 weight[k] = hit.weight; | |
1333 Prob[k] = hit.Probab; | |
1334 } | |
1335 | |
1336 if (v>=3) | |
1337 { | |
1338 excluded.Reset(); | |
1339 while (!excluded.End()) | |
1340 { | |
1341 excluded.ReadNext(name); | |
1342 printf("Excluded fold %s from fitting to Ztq\n",name); | |
1343 } | |
1344 } | |
1345 | |
1346 | |
1347 //////////////////////////////////////////////////////////////// | |
1348 // Calculate transitive score (query->l) Zt[l] | |
1349 | |
1350 // Construct vector ll of indices l for which Z_lq > Zmin_trans | |
1351 m = 0; | |
1352 for (l=0; l<N; l++) | |
1353 if (Zq[l]>=Zmin_trans) ll[m++]=l; | |
1354 M = m; // number of indices l for which Z_lq,Z_lk > Zmin_trans | |
1355 | |
1356 // for (m=0; m<M; m++) | |
1357 // fprintf(stderr,"m=%-4i l=%-4i %-10.10s Zq[l]=%7f\n",m,ll[m],fam[ll[m]],Zq[ll[m]]); | |
1358 | |
1359 if (M<=1) | |
1360 for (k=0; k<N; k++) Ztq[k]=0.0; | |
1361 else | |
1362 { | |
1363 // Generate submatrix of C for indices l for which Z_lq,Z_lk > Zmin_trans | |
1364 double** Csub = new(double*[M]); | |
1365 double** Cinv = new(double*[M]); | |
1366 for (m=0; m<M; m++) | |
1367 { | |
1368 Csub[m] = new(double[M]); | |
1369 Cinv[m] = new(double[M]); | |
1370 for (n=0; n<M; n++) | |
1371 Csub[m][n] = double(C[ll[m]][ll[n]]); | |
1372 } | |
1373 | |
1374 if (v>=3) | |
1375 { | |
1376 fprintf(stderr,"Covariance matrix\n"); | |
1377 PrintMatrix(Csub,M); | |
1378 } | |
1379 | |
1380 // Invert Csub | |
1381 fprintf(stderr,"Calculate inverse of covariance submatrix\n"); | |
1382 InvertMatrix(Cinv,Csub,M); | |
1383 | |
1384 if (v>=3) | |
1385 { | |
1386 fprintf(stderr,"Inverse covariance matrix\n"); | |
1387 PrintMatrix(Cinv,M); | |
1388 } | |
1389 | |
1390 // Calculate weights w[l] | |
1391 for (m=0; m<M; m++) | |
1392 { | |
1393 double sum = 0.0; | |
1394 for (n=0; n<M; n++) | |
1395 sum += 1.0 * Cinv[m][n]; | |
1396 w[m] = fmax(sum,0.0); | |
1397 } | |
1398 for (l=0; l<M; l++){ | |
1399 delete[](Cinv[l]); (Cinv[l]) = NULL; | |
1400 } | |
1401 delete[](Cinv); (Cinv) = NULL; | |
1402 | |
1403 // Calculate Ztq[k] for all HMMs k | |
1404 fprintf(stderr,"Calculate Ztq vector of transitive Z-scores\n"); | |
1405 float norm = NormalizationFactor(Csub,w,M); | |
1406 for (k=0; k<N; k++) | |
1407 { | |
1408 double sumZ = 0.0; | |
1409 for (m=0; m<M; m++) | |
1410 sumZ += w[m] * Z[ll[m]][k]; | |
1411 Ztq[k] = sumZ/norm; | |
1412 } | |
1413 | |
1414 for (l=0; l<M; l++){ | |
1415 delete[](Csub[l]); (Csub[l]) = NULL; | |
1416 } | |
1417 delete[](Csub); (Csub) = NULL; | |
1418 } | |
1419 | |
1420 //////////////////////////////////////////////////////////////// | |
1421 // Calculate reverse transitive score (l->query-) Zrq[l] | |
1422 | |
1423 fprintf(stderr,"Calculate Zrq vector of transitive Z-scores\n"); | |
1424 for (k=0; k<N; k++) | |
1425 { | |
1426 // Construct vector ll of indices l for which Z_lk > Zmin_tran | |
1427 m = 0; | |
1428 for (l=0; l<N; l++) | |
1429 if (Z[l][k]+Z[k][l]>=2*Zmin_trans) ll[m++]=l; | |
1430 int M = m; // number of indices l for which Z_lq,Z_lk > Zmin_tran | |
1431 | |
1432 | |
1433 // fprintf(stderr,"\nfam[k]: %s\n",fam[k]); | |
1434 // for (m=0; m<M; m++) | |
1435 // printf(stderr,"m=%-4i k=%-4i l=%-4i %-10.10s Zq[l]=%7f Z_lk=%7f \n",m,k,ll[m],fold[ll[m]],Zq[ll[m]],Z[k][ll[m]]); | |
1436 | |
1437 if (M<=1) | |
1438 { | |
1439 Zrq[k] = Zq[k]; | |
1440 } | |
1441 else | |
1442 { | |
1443 // Generate submatrix of C for indices l for which Z_lq,Z_lk > Zmin_trans | |
1444 double** Csub = new(double*[M]); | |
1445 for (m=0; m<M; m++) | |
1446 { | |
1447 Csub[m] = new(double[M]); | |
1448 for (n=0; n<M; n++) | |
1449 Csub[m][n] = double(C[ll[m]][ll[n]]); | |
1450 } | |
1451 // fprintf(stderr,"Covariance matrix\n"); | |
1452 // PrintMatrix(Csub,M); | |
1453 | |
1454 if (M==2) | |
1455 { | |
1456 for (m=0; m<M; m++) w[m] = 1.0/M; | |
1457 } | |
1458 else | |
1459 { | |
1460 | |
1461 double** Cinv = new(double*[M]); | |
1462 for (m=0; m<M; m++) Cinv[m] = new(double[M]); | |
1463 | |
1464 // Invert Csub | |
1465 InvertMatrix(Cinv,Csub,M); | |
1466 | |
1467 // fprintf(stderr,"Inverse covariance matrix\n"); | |
1468 // PrintMatrix(Cinv,M); | |
1469 | |
1470 // Calculate weights w[l] | |
1471 for (m=0; m<M; m++) | |
1472 { | |
1473 double sum = 0.0; | |
1474 for (n=0; n<M; n++) | |
1475 sum += 1.0 * Cinv[m][n]; | |
1476 w[m] = fmax(sum,0.0); | |
1477 } | |
1478 | |
1479 // for (m=0; m<M; m++) fprintf(stderr,"w[%i]=%8.2g\n",m,w[m]); | |
1480 | |
1481 for (l=0; l<M; l++){ | |
1482 delete[](Cinv[l]); (Cinv[l]) = NULL; | |
1483 } | |
1484 delete[](Cinv); (Cinv) = NULL; | |
1485 } | |
1486 | |
1487 // Calculate Zrq[k] and normalize | |
1488 float norm = NormalizationFactor(Csub,w,M); | |
1489 double sumZ = 0.0; | |
1490 for (m=0; m<M; m++) | |
1491 sumZ += w[m] * Zq[ll[m]]; | |
1492 Zrq[k] = sumZ/norm; | |
1493 | |
1494 for (l=0; l<M; l++){ | |
1495 delete[](Csub[l]); (Csub[l]) = NULL; | |
1496 } | |
1497 delete[](Csub); (Csub) = NULL; | |
1498 } | |
1499 | |
1500 // fprintf(stderr,"\nZq[k]=%8.2g Zq1[k]=%8.2g\n",Zq[k],Zrq[k]); | |
1501 } | |
1502 | |
1503 // Total Z-score = weighted sum over original Z-score, forward transitive and reverse transitive Z-score | |
1504 for (k=0; k<N; k++) | |
1505 { | |
1506 float Zqtot = Zq[k] + par.wtrans*(Ztq[k]+Zrq[k]); | |
1507 // if (isnan(Zqtot)) | |
1508 // { | |
1509 // fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n"); | |
1510 // printf("%4i %-10.10s Zq=%6.2f Ztq=%6.2f Zrq=%6.2f Zqtot=%6.2f\n",k,fam[k],Zq[k],Ztq[k],Zrq[k],Zqtot); | |
1511 // par.trans=0; | |
1512 // return; | |
1513 // } | |
1514 if (v>=2 && Zq[k] + Zqtot > 2*Zmin_trans) { | |
1515 printf("%4i %-10.10s Zq=%6.2f Ztq=%6.2f Zrq=%6.2f -> Zqtot=%6.2f\n",k,fam[k],Zq[k],Ztq[k],Zrq[k],Zqtot); | |
1516 } | |
1517 Ztq[k] = Zqtot; | |
1518 } | |
1519 | |
1520 // Calculate mean and standard deviation of Z1q | |
1521 fprintf(stderr,"Calculate mean and standard deviation of Ztq\n"); | |
1522 double sumw=0.0; | |
1523 double sumZ=0.0; | |
1524 double sumZ2=0.0; | |
1525 for (k=0; k<N; k++) | |
1526 { | |
1527 if (excluded.Contains(fold[k])) continue; | |
1528 sumw += weight[k]; | |
1529 sumZ += weight[k]*Ztq[k]; | |
1530 sumZ2 += weight[k]*Ztq[k]*Ztq[k]; | |
1531 // if (isnan(sumZ)) | |
1532 // { | |
1533 // fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n"); | |
1534 // printf("%4i %-10.10s Zq=%9f Zrq=%9f Ztq=%9f\n",k,fam[k],Zq[k],Zrq[k],Ztq[k]); | |
1535 // par.trans=0; | |
1536 // return; | |
1537 // } | |
1538 } | |
1539 float mu = sumZ/sumw; | |
1540 float sigma = sqrt(sumZ2/sumw-mu*mu); | |
1541 if (v>=2) printf("mu(Ztq)=%6.3f sigma(Ztq)=%6.2f\n",mu,sigma); | |
1542 sigma *= 1.01;// correct different fitting of EVD and normal variables | |
1543 | |
1544 // Normalize Ztq and calculate P1-values | |
1545 fprintf(stderr,"Normalize Ztq and calculate P1-values\n"); | |
1546 Reset(); | |
1547 while (!End()) | |
1548 { | |
1549 hit = ReadNext(); | |
1550 hit.logPval = -Z2Score((Ztq[index.Show(hit.name)]-mu)/sigma); | |
1551 hit.E1val = N_searched*(hit.logPval<-100.0? 0.0 : exp(hit.logPval)); | |
1552 // P-value = 1- exp(-exp(-lamda*(Saa-mu))) => -lamda*(Saa-mu) = log(-log(1-Pvalue)) | |
1553 hit.score_aass = (hit.logPval<-10.0? hit.logPval : log(-log(1-exp(hit.logPval))) ) / 0.45-3.0 - hit.score_ss; | |
1554 hit.Probab = Probab(hit); | |
1555 hit.score_sort = hit.logPval; | |
1556 Overwrite(hit); // copy hit object into current position of hitlist | |
1557 } | |
1558 | |
1559 for (k=0; k<N; k++){ | |
1560 delete[](Z[k]); (Z[k]) = NULL; | |
1561 } | |
1562 for (k=0; k<N; k++){ | |
1563 delete[](C[k]); (C[k]) = NULL; | |
1564 } | |
1565 for (k=0; k<N; k++){ | |
1566 delete[](fold[k]); (fold[k]) = NULL; | |
1567 } | |
1568 for (k=0; k<N; k++){ | |
1569 delete[](fam[k]); (fam[k]) = NULL; | |
1570 } | |
1571 delete[](C); (C) = NULL; | |
1572 delete[](Z); (Z) = NULL; | |
1573 delete[](fold); (fold) = NULL; | |
1574 delete[](fam); (fam) = NULL; | |
1575 delete[](Prob); (Prob) = NULL; | |
1576 delete[](ll); (ll) = NULL; | |
1577 delete[](Zq); (Zq) = NULL; | |
1578 delete[](Ztq); (Ztq) = NULL; | |
1579 } | |
1580 | |
1581 | |
1582 | |
1583 ////////////////////////////////////////////////////////////////////////////// | |
1584 /** | |
1585 * @brief Calculate P-values and Probabilities from transitive scoring over whole database | |
1586 */ | |
1587 void | |
1588 HitList::TransitiveScoring2() | |
1589 { | |
1590 void PrintMatrix(float** V, int N); | |
1591 void PrintMatrix(double** V, int N); | |
1592 | |
1593 float** Z; // matrix of intra-db Z-scores Z_kl | |
1594 float** C; // covariance matrix for Z_k: C_kl = sum_m=1^N (Z_km * Z_lm) | |
1595 char** fold; // fold name of HMM k | |
1596 char** fam; // family of HMM k | |
1597 float* Prob; // probability of HMM k | |
1598 float* Zq; // Zq[k] = Z-score between query and database HMM k | |
1599 float* Ztq; // Ztq[k] = transitive Z-score from query to database HMM k: Ztq[k] = sum_l[ w_ql * Z_lk] / normalization_q | |
1600 float* Zrq; // Zrq[k] = transitive Z-score from database HMM k to query: Zrq[k] = sum_l[ w_kl * Z_lq] / normalization_k | |
1601 float* w; // unnormalized weight matrix; w[l] is w_ql or w_kl, respectively | |
1602 int* ll; // ll[m] is the m'th index l for which Z_lq, Z_lk > Zmin_trans | |
1603 int N; // dimension of weight matrix is NxN | |
1604 int M; // number of HMMs l with Z_ql>Ztrans_min (or Z_lk>Ztrans_min, respectively) | |
1605 int k,l,m,n; // indices for database HMMs | |
1606 char name[NAMELEN]; | |
1607 Hash<int> index(MAXPROF+7); // index{name} = index of HMM name in {1,...,N} | |
1608 index.Null(-1); // Set int value to return when no data can be retrieved | |
1609 Hash<int> excluded(13); // Hash containing names of superfamilies to be excluded from fit | |
1610 excluded.Null(0); // Set int value to return when no data can be retrieved | |
1611 Hit hit; | |
1612 size_t unused; /* disable fread gcc warning */ | |
1613 | |
1614 // Read weights matrix W with index hash and names array | |
1615 fprintf(stderr,"Reading in weights file\n"); | |
1616 FILE* wfile = fopen(par.wfile,"rb"); | |
1617 if (v>=1 && wfile==NULL) | |
1618 { | |
1619 fprintf(stderr,"Error: %s could not be opened: (N_searched=%i) ",par.wfile,N_searched); | |
1620 perror("fopen"); | |
1621 fprintf(stderr,"Skipping caclulation of transitive P-values\n"); | |
1622 par.trans=0; | |
1623 return; | |
1624 } | |
1625 unused = fread(&N,sizeof(int),1,wfile); // read matrix dimension (i.e. number of HMMs in database) | |
1626 if (v>=1 && N!=N_searched) | |
1627 { | |
1628 fprintf(stderr,"Error: Number %i of HMMs in weight file is different from number %i of HMMs in searched databases. \n",N,N_searched); | |
1629 fprintf(stderr,"Skipping caclulation of transitive P-values\n"); | |
1630 par.trans=0; | |
1631 return; | |
1632 } | |
1633 if (v>=2) fprintf(stderr,"Calculating transitive P-values for %i HMMs\n",N); | |
1634 // Read names of HMMs (to specify mapping of HMM to matrix indices) | |
1635 for (k=0; k<N; k++) | |
1636 { | |
1637 unused = fread(name,sizeof(char),IDLEN,wfile); | |
1638 index.Add(name,k); | |
1639 } | |
1640 // Read symmetric Z-scores matrix | |
1641 Z = new(float*[N]); | |
1642 for (k=0; k<N; k++) | |
1643 { | |
1644 Z[k] = new(float[N]); | |
1645 for (l=0; l<k; l++) Z[k][l] = Z[l][k]; | |
1646 unused = fread(Z[k]+k,sizeof(float),N-k,wfile); | |
1647 } | |
1648 // Read symmetric covariance matrix | |
1649 C = new(float*[N]); | |
1650 for (k=0; k<N; k++) | |
1651 { | |
1652 C[k] = new(float[N]); | |
1653 for (l=0; l<k; l++) C[k][l] = C[l][k]; | |
1654 unused = fread(C[k]+k,sizeof(float),N-k,wfile); | |
1655 } | |
1656 fclose(wfile); | |
1657 | |
1658 // Allocate memory | |
1659 Zq = new(float[N]); | |
1660 Ztq = new(float[N]); | |
1661 Zrq = new(float[N]); | |
1662 fold = new(char*[N]); | |
1663 fam = new(char*[N]); | |
1664 Prob = new(float[N]); | |
1665 ll = new(int[N]); | |
1666 w = new(float[N]); | |
1667 | |
1668 // Transform P-values to normally distributed Z-scores and store in Zq vector | |
1669 fprintf(stderr,"Transform P-values to Z-scores\n"); | |
1670 float Zmax_neg = Score2Z( -log(MINEVALEXCL) + log(N_searched) ); // calculate Z-score corresponding to E-value MINEVALEXCL | |
1671 float Zmin_trans = Score2Z( -log(par.Emax_trans) + log(N_searched) ); // calculate Z-score corresponding to E-value par.Emax_trans | |
1672 printf("Zmax = %6.2f Zmin = %6.2f \n",Zmax_neg,Zmin_trans); | |
1673 | |
1674 Reset(); | |
1675 while (!End()) | |
1676 { | |
1677 hit = ReadNext(); | |
1678 if (hit.irep>1) continue; | |
1679 k = index.Show(hit.name); | |
1680 if (k<0) {fprintf(stderr,"Error: no index found in weights file for domain %s\n",hit.name); exit(1);} | |
1681 if (hit.logPvalt<0) | |
1682 Zq[k] = 0.5*Score2Z(fabs(hit.logPval)) + 0.5*Score2Z(fabs(hit.logPvalt)); // Zq[k] = 0.5*(Zkq + Zqk) | |
1683 else | |
1684 Zq[k] = Score2Z(fabs(hit.logPval)); // Zq[k] = Zqk | |
1685 // printf("%4i %-10.10s logPvalt=%9g Zq=%9f\n",k,hit.name,hit.logPvalt,Zq[k]); | |
1686 // if (isnan(Zq[k])) | |
1687 // { | |
1688 // fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n"); | |
1689 // printf("%4i %-10.10s logPval=%9g logPvalt=%9g Zq=%9f\n",k,hit.name,hit.logPval,hit.logPvalt,Zq[k]); | |
1690 // par.trans=0; | |
1691 // return; | |
1692 // } | |
1693 if (Zq[k]>Zmax_neg) excluded.Add(hit.fold); | |
1694 fold[k] = new(char[IDLEN]); | |
1695 fam[k] = new(char[IDLEN]); | |
1696 strcpy(fold[k],hit.fold); | |
1697 strcpy(fam[k],hit.fam); | |
1698 weight[k] = hit.weight; | |
1699 Prob[k] = hit.Probab; | |
1700 } | |
1701 | |
1702 if (v>=3) | |
1703 { | |
1704 excluded.Reset(); | |
1705 while (!excluded.End()) | |
1706 { | |
1707 excluded.ReadNext(name); | |
1708 printf("Excluded fold %s from fitting to Ztq\n",name); | |
1709 } | |
1710 } | |
1711 | |
1712 | |
1713 //////////////////////////////////////////////////////////////// | |
1714 // Calculate transitive score (query->l) Zt[l] | |
1715 | |
1716 // Construct vector ll of indices l for which Z_lq > Zmin_trans | |
1717 m = 0; | |
1718 for (l=0; l<N; l++) | |
1719 if (Zq[l]>=Zmin_trans) ll[m++]=l; | |
1720 M = m; // number of indices l for which Z_lq,Z_lk > Zmin_trans | |
1721 | |
1722 // for (m=0; m<M; m++) | |
1723 // fprintf(stderr,"m=%-4i l=%-4i %-10.10s Zq[l]=%7f\n",m,ll[m],fam[ll[m]],Zq[ll[m]]); | |
1724 | |
1725 if (M<=1) | |
1726 for (k=0; k<N; k++) Ztq[k]=0.0; | |
1727 else | |
1728 { | |
1729 // Generate submatrix of C for indices l for which Z_lq,Z_lk > Zmin_trans | |
1730 double** Csub = new(double*[M]); | |
1731 double** Cinv = new(double*[M]); | |
1732 for (m=0; m<M; m++) | |
1733 { | |
1734 Csub[m] = new(double[M]); | |
1735 Cinv[m] = new(double[M]); | |
1736 for (n=0; n<M; n++) | |
1737 Csub[m][n] = double(C[ll[m]][ll[n]]); | |
1738 } | |
1739 | |
1740 if (v>=3) | |
1741 { | |
1742 fprintf(stderr,"Covariance matrix\n"); | |
1743 PrintMatrix(Csub,M); | |
1744 } | |
1745 | |
1746 // // Invert Csub | |
1747 // fprintf(stderr,"Calculate inverse of covariance submatrix\n"); | |
1748 // InvertMatrix(Cinv,Csub,M); | |
1749 | |
1750 // if (v>=3) | |
1751 // { | |
1752 // fprintf(stderr,"Inverse covariance matrix\n"); | |
1753 // PrintMatrix(Cinv,M); | |
1754 // } | |
1755 | |
1756 | |
1757 // Calculate weights w[l] | |
1758 for (m=0; m<M; m++) | |
1759 { | |
1760 double sum = 0.0; | |
1761 for (n=0; n<M; n++) | |
1762 sum += 1.0 * Csub[m][n]; | |
1763 printf("w[%4i] = %-8.5f\n",ll[m],1.0/sum); | |
1764 w[m] = (sum>0? Zq[ll[m]] / sum : 0.0); | |
1765 } | |
1766 for (l=0; l<M; l++){ | |
1767 delete[](Cinv[l]); (Cinv[l]) = NULL; | |
1768 } | |
1769 delete[](Cinv); (Cinv) = NULL; | |
1770 | |
1771 // Calculate Ztq[k] for all HMMs k | |
1772 fprintf(stderr,"Calculate Ztq vector of transitive Z-scores\n"); | |
1773 float norm = NormalizationFactor(Csub,w,M); | |
1774 for (k=0; k<N; k++) | |
1775 { | |
1776 double sumZ = 0.0; | |
1777 for (m=0; m<M; m++) | |
1778 sumZ += w[m] * Z[ll[m]][k]; | |
1779 Ztq[k] = sumZ/norm; | |
1780 } | |
1781 | |
1782 for (l=0; l<M; l++){ | |
1783 delete[](Csub[l]); (Csub[l]) = NULL; | |
1784 } | |
1785 delete[](Csub); (Csub) = NULL; | |
1786 } | |
1787 | |
1788 //////////////////////////////////////////////////////////////// | |
1789 // Calculate reverse transitive score (l->query-) Zrq[l] | |
1790 | |
1791 fprintf(stderr,"Calculate Zrq vector of transitive Z-scores\n"); | |
1792 for (k=0; k<N; k++) | |
1793 { | |
1794 // Construct vector ll of indices l for which Z_lk > Zmin_tran | |
1795 m = 0; | |
1796 for (l=0; l<N; l++) | |
1797 if (Z[l][k]+Z[k][l]>=2*Zmin_trans) ll[m++]=l; | |
1798 int M = m; // number of indices l for which Z_lq,Z_lk > Zmin_tran | |
1799 | |
1800 | |
1801 // fprintf(stderr,"\nfam[k]: %s\n",fam[k]); | |
1802 // for (m=0; m<M; m++) | |
1803 // printf(stderr,"m=%-4i k=%-4i l=%-4i %-10.10s Zq[l]=%7f Z_lk=%7f \n",m,k,ll[m],fold[ll[m]],Zq[ll[m]],Z[k][ll[m]]); | |
1804 | |
1805 if (M<=1) | |
1806 { | |
1807 Zrq[k] = Zq[k]; | |
1808 } | |
1809 else | |
1810 { | |
1811 // Generate submatrix of C for indices l for which Z_lq,Z_lk > Zmin_trans | |
1812 double** Csub = new(double*[M]); | |
1813 for (m=0; m<M; m++) | |
1814 { | |
1815 Csub[m] = new(double[M]); | |
1816 for (n=0; n<M; n++) | |
1817 Csub[m][n] = double(C[ll[m]][ll[n]]); | |
1818 } | |
1819 // fprintf(stderr,"Covariance matrix\n"); | |
1820 // PrintMatrix(Csub,M); | |
1821 | |
1822 if (M<=2) | |
1823 { | |
1824 for (m=0; m<M; m++) w[m] = 1.0/M; | |
1825 } | |
1826 else | |
1827 { | |
1828 | |
1829 double** Cinv = new(double*[M]); | |
1830 for (m=0; m<M; m++) Cinv[m] = new(double[M]); | |
1831 | |
1832 // // Invert Csub | |
1833 // InvertMatrix(Cinv,Csub,M); | |
1834 | |
1835 // // fprintf(stderr,"Inverse covariance matrix\n"); | |
1836 // // PrintMatrix(Cinv,M); | |
1837 | |
1838 // Calculate weights w[l] | |
1839 for (m=0; m<M; m++) | |
1840 { | |
1841 double sum = 0.0; | |
1842 for (n=0; n<M; n++) | |
1843 sum += 1.0 * Csub[m][n]; | |
1844 w[m] = (sum>0? Z[ll[m]][k] / sum : 0.0); | |
1845 } | |
1846 | |
1847 // for (m=0; m<M; m++) fprintf(stderr,"w[%i]=%8.2g\n",m,w[m]); | |
1848 | |
1849 for (l=0; l<M; l++){ | |
1850 delete[](Cinv[l]); (Cinv[l]) = NULL; | |
1851 } | |
1852 delete[](Cinv); (Cinv) = NULL; | |
1853 } | |
1854 | |
1855 // Calculate Zrq[k] and normalize | |
1856 float norm = NormalizationFactor(Csub,w,M); | |
1857 double sumZ = 0.0; | |
1858 for (m=0; m<M; m++) | |
1859 sumZ += w[m] * Zq[ll[m]]; | |
1860 Zrq[k] = sumZ/norm; | |
1861 | |
1862 for (l=0; l<M; l++){ | |
1863 delete[](Csub[l]); (Csub[l]) = NULL; | |
1864 } | |
1865 delete[](Csub); (Csub) = NULL; | |
1866 } | |
1867 | |
1868 // fprintf(stderr,"\nZq[k]=%8.2g Zq1[k]=%8.2g\n",Zq[k],Zrq[k]); | |
1869 } | |
1870 | |
1871 // Total Z-score = weighted sum over original Z-score, forward transitive and reverse transitive Z-score | |
1872 for (k=0; k<N; k++) | |
1873 { | |
1874 float Zqtot = Zq[k] + par.wtrans*(Ztq[k]+Zrq[k]); | |
1875 // if (isnan(Zqtot)) | |
1876 // { | |
1877 // fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n"); | |
1878 // printf("%4i %-10.10s Zq=%6.2f Ztq=%6.2f Zrq=%6.2f Zqtot=%6.2f\n",k,fam[k],Zq[k],Ztq[k],Zrq[k],Zqtot); | |
1879 // par.trans=0; | |
1880 // return; | |
1881 // } | |
1882 if (v>=2 && Zq[k] + Zqtot > 2*Zmin_trans) { | |
1883 printf("%4i %-10.10s Zq=%6.2f Ztq=%6.2f Zrq=%6.2f -> Zqtot=%6.2f\n",k,fam[k],Zq[k],Ztq[k],Zrq[k],Zqtot); | |
1884 } | |
1885 Ztq[k] = Zqtot; | |
1886 } | |
1887 | |
1888 // Calculate mean and standard deviation of Z1q | |
1889 fprintf(stderr,"Calculate mean and standard deviation of Ztq\n"); | |
1890 double sumw=0.0; | |
1891 double sumZ=0.0; | |
1892 double sumZ2=0.0; | |
1893 for (k=0; k<N; k++) | |
1894 { | |
1895 if (excluded.Contains(fold[k])) continue; | |
1896 sumw += weight[k]; | |
1897 sumZ += weight[k]*Ztq[k]; | |
1898 sumZ2 += weight[k]*Ztq[k]*Ztq[k]; | |
1899 // if (isnan(sumZ)) | |
1900 // { | |
1901 // fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n"); | |
1902 // printf("%4i %-10.10s Zq=%9f Zrq=%9f Ztq=%9f\n",k,fam[k],Zq[k],Zrq[k],Ztq[k]); | |
1903 // par.trans=0; | |
1904 // return; | |
1905 // } | |
1906 } | |
1907 float mu = sumZ/sumw; | |
1908 float sigma = sqrt(sumZ2/sumw-mu*mu); | |
1909 if (v>=2) printf("mu(Ztq)=%6.3f sigma(Ztq)=%6.2f\n",mu,sigma); | |
1910 sigma *= 1.01;// correct different fitting of EVD and normal variables | |
1911 | |
1912 // Normalize Ztq and calculate P1-values | |
1913 fprintf(stderr,"Normalize Ztq and calculate P1-values\n"); | |
1914 Reset(); | |
1915 while (!End()) | |
1916 { | |
1917 hit = ReadNext(); | |
1918 hit.logPval = -Z2Score((Ztq[index.Show(hit.name)]-mu)/sigma); | |
1919 hit.E1val = N_searched*(hit.logPval<-100? 0.0 : exp(hit.logPval)); | |
1920 // P-value = 1- exp(-exp(-lamda*(Saa-mu))) => -lamda*(Saa-mu) = log(-log(1-Pvalue)) | |
1921 hit.score_aass = (hit.logPval<-10.0? hit.logPval : log(-log(1-exp(hit.logPval))) ) / 0.45-3.0 - hit.score_ss; | |
1922 hit.Probab = Probab(hit); | |
1923 hit.score_sort = hit.logPval; | |
1924 Overwrite(hit); // copy hit object into current position of hitlist | |
1925 } | |
1926 | |
1927 for (k=0; k<N; k++){ | |
1928 delete[](Z[k]); (Z[k]) = NULL; | |
1929 } | |
1930 for (k=0; k<N; k++){ | |
1931 delete[](C[k]); (C[k]) = NULL; | |
1932 } | |
1933 for (k=0; k<N; k++){ | |
1934 delete[](fold[k]); (fold[k]) = NULL; | |
1935 } | |
1936 for (k=0; k<N; k++){ | |
1937 delete[](fam[k]); (fam[k]) = NULL; | |
1938 } | |
1939 delete[](C); (C) = NULL; | |
1940 delete[](Z); (Z) = NULL; | |
1941 delete[](fold); (fold) = NULL; | |
1942 delete[](fam); (fam) = NULL; | |
1943 delete[](Prob); (Prob) = NULL; | |
1944 delete[](ll); (ll) = NULL; | |
1945 delete[](Zq); (Zq) = NULL; | |
1946 delete[](Ztq); (Ztq) = NULL; | |
1947 } | |
1948 | |
1949 | |
1950 ///////////////////////////////////////////////////////////////////////////////////// | |
1951 /** | |
1952 * @brief Calculate P-values and Probabilities from transitive scoring over whole database | |
1953 * Like TransitiveScoring(), | |
1954 * but in transitive scoring, Z1_qk = sum_l w_l*Z_lk, use all l:E_ql<=E_qk | |
1955 * and in reverse scoring, Z1_kr = sum_l w_l*Z_lq, use all l:E_kl<=E_kq | |
1956 */ | |
1957 void | |
1958 HitList::TransitiveScoring3() | |
1959 { | |
1960 void PrintMatrix(float** V, int N); | |
1961 void PrintMatrix(double** V, int N); | |
1962 | |
1963 float** Z; // matrix of intra-db Z-scores Z_kl | |
1964 float** C; // covariance matrix for Z_k: C_kl = sum_m=1^N (Z_km * Z_lm) | |
1965 char** fold; // fold name of HMM k | |
1966 char** fam; // family of HMM k | |
1967 float* Prob; // probability of HMM k | |
1968 float* Zq; // Zq[k] = Z-score between query and database HMM k | |
1969 float* Ztq; // Ztq[k] = transitive Z-score from query to database HMM k: Ztq[k] = sum_l[ w_ql * Z_lk] / normalization_q | |
1970 float* Zrq; // Zrq[k] = transitive Z-score from database HMM k to query: Zrq[k] = sum_l[ w_kl * Z_lq] / normalization_k | |
1971 float* w; // unnormalized weight matrix; w[l] is w_ql or w_kl, respectively | |
1972 int* ll; // ll[m] is the m'th index l for which Z_lq, Z_lk > Zmin_trans | |
1973 int N; // dimension of weight matrix is NxN | |
1974 int M; // number of HMMs l with Z_ql>Ztrans_min (or Z_lk>Ztrans_min, respectively) | |
1975 int k,l,m,n; // indices for database HMMs | |
1976 char name[NAMELEN]; | |
1977 Hash<int> index(MAXPROF+7); // index{name} = index of HMM name in {1,...,N} | |
1978 index.Null(-1); // Set int value to return when no data can be retrieved | |
1979 Hash<int> excluded(13); // Hash containing names of superfamilies to be excluded from fit | |
1980 excluded.Null(0); // Set int value to return when no data can be retrieved | |
1981 Hit hit; | |
1982 size_t unused; /* disable fread gcc warning */ | |
1983 | |
1984 // Read weights matrix W with index hash and names array | |
1985 fprintf(stderr,"Reading in weights file\n"); | |
1986 FILE* wfile = fopen(par.wfile,"rb"); | |
1987 if (v>=1 && wfile==NULL) | |
1988 { | |
1989 fprintf(stderr,"Error: %s could not be opened: (N_searched=%i) ",par.wfile,N_searched); | |
1990 perror("fopen"); | |
1991 fprintf(stderr,"Skipping caclulation of transitive P-values\n"); | |
1992 par.trans=0; | |
1993 return; | |
1994 } | |
1995 unused = fread(&N,sizeof(int),1,wfile); // read matrix dimension (i.e. number of HMMs in database) | |
1996 if (v>=1 && N!=N_searched) | |
1997 { | |
1998 fprintf(stderr,"Error: Number %i of HMMs in weight file is different from number %i of HMMs in searched databases. \n",N,N_searched); | |
1999 fprintf(stderr,"Skipping caclulation of transitive P-values\n"); | |
2000 par.trans=0; | |
2001 return; | |
2002 } | |
2003 if (v>=2) fprintf(stderr,"Calculating transitive P-values for %i HMMs\n",N); | |
2004 // Read names of HMMs (to specify mapping of HMM to matrix indices) | |
2005 for (k=0; k<N; k++) | |
2006 { | |
2007 unused = fread(name,sizeof(char),IDLEN,wfile); | |
2008 index.Add(name,k); | |
2009 } | |
2010 // Read symmetric Z-scores matrix | |
2011 Z = new(float*[N]); | |
2012 for (k=0; k<N; k++) | |
2013 { | |
2014 Z[k] = new(float[N]); | |
2015 for (l=0; l<k; l++) Z[k][l] = Z[l][k]; | |
2016 unused = fread(Z[k]+k,sizeof(float),N-k,wfile); | |
2017 } | |
2018 // Read symmetric covariance matrix | |
2019 C = new(float*[N]); | |
2020 for (k=0; k<N; k++) | |
2021 { | |
2022 C[k] = new(float[N]); | |
2023 for (l=0; l<k; l++) C[k][l] = C[l][k]; | |
2024 unused = fread(C[k]+k,sizeof(float),N-k,wfile); | |
2025 } | |
2026 fclose(wfile); | |
2027 | |
2028 // Allocate memory | |
2029 Zq = new(float[N]); | |
2030 Ztq = new(float[N]); | |
2031 Zrq = new(float[N]); | |
2032 fold = new(char*[N]); | |
2033 fam = new(char*[N]); | |
2034 Prob = new(float[N]); | |
2035 ll = new(int[N]); | |
2036 w = new(float[N]); | |
2037 | |
2038 // Transform P-values to normally distributed Z-scores and store in Zq vector | |
2039 fprintf(stderr,"Transform P-values to Z-scores\n"); | |
2040 float Zmax_neg = Score2Z( -log(MINEVALEXCL) + log(N_searched) ); // calculate Z-score corresponding to E-value MINEVALEXCL | |
2041 float Zmin_trans = Score2Z( -log(par.Emax_trans) + log(N_searched) ); // calculate Z-score corresponding to E-value par.Emax_trans | |
2042 printf("Zmax = %6.2f Zmin = %6.2f \n",Zmax_neg,Zmin_trans); | |
2043 | |
2044 Reset(); | |
2045 while (!End()) | |
2046 { | |
2047 hit = ReadNext(); | |
2048 if (hit.irep>1) continue; | |
2049 k = index.Show(hit.name); | |
2050 if (k<0) {fprintf(stderr,"Error: no index found in weights file for domain %s\n",hit.name); exit(1);} | |
2051 if (hit.logPvalt<0) | |
2052 Zq[k] = 0.5*Score2Z(fabs(hit.logPval)) + 0.5*Score2Z(fabs(hit.logPvalt)); // Zq[k] = 0.5*(Zkq + Zqk) | |
2053 else | |
2054 Zq[k] = Score2Z(fabs(hit.logPval)); // Zq[k] = Zqk | |
2055 // printf("%4i %-10.10s logPvalt=%9g Zq=%9f\n",k,hit.name,hit.logPvalt,Zq[k]); | |
2056 // if (isnan(Zq[k])) | |
2057 // { | |
2058 // fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n"); | |
2059 // printf("%4i %-10.10s logPval=%9g logPvalt=%9g Zq=%9f\n",k,hit.name,hit.logPval,hit.logPvalt,Zq[k]); | |
2060 // par.trans=0; | |
2061 // return; | |
2062 // } | |
2063 if (Zq[k]>Zmax_neg) excluded.Add(hit.fold); | |
2064 fold[k] = new(char[IDLEN]); | |
2065 fam[k] = new(char[IDLEN]); | |
2066 strcpy(fold[k],hit.fold); | |
2067 strcpy(fam[k],hit.fam); | |
2068 weight[k] = hit.weight; | |
2069 Prob[k] = hit.Probab; | |
2070 } | |
2071 | |
2072 if (v>=3) | |
2073 { | |
2074 excluded.Reset(); | |
2075 while (!excluded.End()) | |
2076 { | |
2077 excluded.ReadNext(name); | |
2078 printf("Excluded fold %s from fitting to Ztq\n",name); | |
2079 } | |
2080 } | |
2081 | |
2082 | |
2083 //////////////////////////////////////////////////////////////// | |
2084 // Calculate transitive score (query->l) Ztq[l] | |
2085 | |
2086 fprintf(stderr,"Calculate Ztq vector of transitive Z-scores\n"); | |
2087 for (k=0; k<N; k++) | |
2088 { | |
2089 // Construct vector ll of indices l for which Z_lq OR Z_lk >= max(Z_kq,Zmin_trans) | |
2090 float Zmink = fmax(Zq[k],Zmin_trans); | |
2091 for (m=l=0; l<N; l++) | |
2092 if (Zq[l]>=Zmink) ll[m++]=l; | |
2093 M = m; // number of indices l for which Z_lq OR Z_lk >= max(Z_kq,Zmin_trans) | |
2094 | |
2095 // for (m=0; m<M; m++) | |
2096 // fprintf(stderr,"m=%-4i l=%-4i %-10.10s Zq[l]=%7f\n",m,ll[m],fam[ll[m]],Zq[ll[m]]); | |
2097 | |
2098 if (M<=1) | |
2099 { | |
2100 Ztq[k]=Zq[k]; | |
2101 } | |
2102 else | |
2103 { | |
2104 // Generate submatrix of C for indices l for which Z_lq,Z_lk > Zmin_trans | |
2105 double** Csub = new(double*[M]); | |
2106 double** Cinv = new(double*[M]); | |
2107 for (m=0; m<M; m++) | |
2108 { | |
2109 Csub[m] = new(double[M]); | |
2110 Cinv[m] = new(double[M]); | |
2111 for (n=0; n<M; n++) | |
2112 Csub[m][n] = double(C[ll[m]][ll[n]]); | |
2113 } | |
2114 | |
2115 // fprintf(stderr,"Covariance matrix\n"); | |
2116 // PrintMatrix(Csub,M); | |
2117 | |
2118 // Invert Csub | |
2119 // fprintf(stderr,"Calculate inverse of covariance submatrix\n"); | |
2120 InvertMatrix(Cinv,Csub,M); | |
2121 | |
2122 // fprintf(stderr,"Inverse covariance matrix\n"); | |
2123 // PrintMatrix(Cinv,M); | |
2124 | |
2125 // Calculate weights w[l] | |
2126 for (m=0; m<M; m++) | |
2127 { | |
2128 double sum = 0.0; | |
2129 for (n=0; n<M; n++) | |
2130 sum += 1.0 * Cinv[m][n]; // signal ~ sum_l w_l*Z_lq ! | |
2131 w[m] = fmax(sum,0.0); | |
2132 } | |
2133 for (l=0; l<M; l++){ | |
2134 delete[](Cinv[l]); (Cinv[l]) = NULL; | |
2135 } | |
2136 delete[](Cinv); (Cinv) = NULL; | |
2137 | |
2138 // Calculate Ztq[k] | |
2139 float norm = NormalizationFactor(Csub,w,M); | |
2140 double sumZ = 0.0; | |
2141 for (m=0; m<M; m++) | |
2142 sumZ += w[m] * fmin(Zq[ll[m]],Z[ll[m]][k]); | |
2143 // sumZ += w[m] * Z[ll[m]][k]; | |
2144 Ztq[k] = sumZ/norm; | |
2145 | |
2146 for (l=0; l<M; l++){ | |
2147 delete[](Csub[l]); (Csub[l]) = NULL; | |
2148 } | |
2149 delete[](Csub); (Csub) = NULL; | |
2150 } | |
2151 } | |
2152 | |
2153 //////////////////////////////////////////////////////////////// | |
2154 // Calculate reverse transitive score (l->query-) Zrq[l] | |
2155 | |
2156 fprintf(stderr,"Calculate Zrq vector of transitive Z-scores\n"); | |
2157 for (k=0; k<N; k++) | |
2158 { | |
2159 // Construct vector ll of indices l for which Z_lk > Zmin_tran | |
2160 float Zmink = fmax(Zq[k],Zmin_trans); | |
2161 for (m=l=0; l<N; l++) | |
2162 if (Z[l][k]>=Zmink) ll[m++]=l; | |
2163 int M = m; // number of indices l for which Z_lq,Z_lk > Zmin_tran | |
2164 | |
2165 | |
2166 // fprintf(stderr,"\nfam[k]: %s\n",fam[k]); | |
2167 // for (m=0; m<M; m++) | |
2168 // printf(stderr,"m=%-4i k=%-4i l=%-4i %-10.10s Zq[l]=%7f Z_lk=%7f \n",m,k,ll[m],fold[ll[m]],Zq[ll[m]],Z[k][ll[m]]); | |
2169 | |
2170 if (M<=1) | |
2171 { | |
2172 Zrq[k] = Zq[k]; | |
2173 } | |
2174 else | |
2175 { | |
2176 // Generate submatrix of C for indices l for which Z_lq,Z_lk > Zmin_trans | |
2177 double** Csub = new(double*[M]); | |
2178 for (m=0; m<M; m++) | |
2179 { | |
2180 Csub[m] = new(double[M]); | |
2181 for (n=0; n<M; n++) | |
2182 Csub[m][n] = double(C[ll[m]][ll[n]]); | |
2183 } | |
2184 // fprintf(stderr,"Covariance matrix\n"); | |
2185 // PrintMatrix(Csub,M); | |
2186 | |
2187 if (M==2) | |
2188 { | |
2189 for (m=0; m<M; m++) w[m] = 1.0/M; | |
2190 } | |
2191 else | |
2192 { | |
2193 | |
2194 double** Cinv = new(double*[M]); | |
2195 for (m=0; m<M; m++) Cinv[m] = new(double[M]); | |
2196 | |
2197 // Invert Csub | |
2198 InvertMatrix(Cinv,Csub,M); | |
2199 | |
2200 // fprintf(stderr,"Inverse covariance matrix\n"); | |
2201 // PrintMatrix(Cinv,M); | |
2202 | |
2203 // Calculate weights w[l] | |
2204 for (m=0; m<M; m++) | |
2205 { | |
2206 double sum = 0.0; | |
2207 for (n=0; n<M; n++) | |
2208 sum += 1.0 * Cinv[m][n]; // signal ~ sum_l w_l*Z_lq ! | |
2209 w[m] = fmax(sum,0.0); | |
2210 } | |
2211 // for (m=0; m<M; m++) fprintf(stderr,"w[%i]=%8.2g\n",m,w[m]); | |
2212 for (l=0; l<M; l++){ | |
2213 delete[](Cinv[l]); (Cinv[l]) = NULL; | |
2214 } | |
2215 delete[](Cinv); (Cinv) = NULL; | |
2216 } | |
2217 | |
2218 // Calculate Zrq[k] and normalize | |
2219 float norm = NormalizationFactor(Csub,w,M); | |
2220 double sumZ = 0.0; | |
2221 for (m=0; m<M; m++) | |
2222 sumZ += w[m] * fmin(Zq[ll[m]],Z[ll[m]][k]); | |
2223 // sumZ += w[m] * Zq[ll[m]]; | |
2224 Zrq[k] = sumZ/norm; | |
2225 | |
2226 for (l=0; l<M; l++){ | |
2227 delete[](Csub[l]); (Csub[l]) = NULL; | |
2228 } | |
2229 delete[](Csub); (Csub) = NULL; | |
2230 } | |
2231 | |
2232 // fprintf(stderr,"\nZq[k]=%8.2g Zq1[k]=%8.2g\n",Zq[k],Zrq[k]); | |
2233 } | |
2234 | |
2235 // Total Z-score = weighted sum over original Z-score, forward transitive and reverse transitive Z-score | |
2236 for (k=0; k<N; k++) | |
2237 { | |
2238 | |
2239 float Zqtot = Zq[k] + par.wtrans*(Ztq[k]+Zrq[k]); | |
2240 // if (isnan(Zqtot)) | |
2241 // { | |
2242 // fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n"); | |
2243 // printf("%4i %-10.10s Zq=%6.2f Ztq=%6.2f Zrq=%6.2f -> Zqtot=%6.2f\n",k,fam[k],Zq[k],Ztq[k],Zrq[k],Zqtot); | |
2244 // par.trans=0; | |
2245 // return; | |
2246 // } | |
2247 if (v>=3 && Zqtot > 2*Zmin_trans) { | |
2248 printf("%4i %-10.10s Zq=%6.2f Ztq=%6.2f Zrq=%6.2f -> Zqtot=%6.2f\n",k,fam[k],Zq[k],Ztq[k],Zrq[k],Zqtot); | |
2249 } | |
2250 Ztq[k] = Zqtot; | |
2251 } | |
2252 | |
2253 // Calculate mean and standard deviation of Z1q | |
2254 fprintf(stderr,"Calculate mean and standard deviation of Ztq\n"); | |
2255 double sumw=0.0; | |
2256 double sumZ=0.0; | |
2257 double sumZ2=0.0; | |
2258 for (k=0; k<N; k++) | |
2259 { | |
2260 if (excluded.Contains(fold[k])) continue; | |
2261 sumw += weight[k]; | |
2262 sumZ += weight[k]*Ztq[k]; | |
2263 sumZ2 += weight[k]*Ztq[k]*Ztq[k]; | |
2264 // if (isnan(sumZ)) | |
2265 // { | |
2266 // fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n"); | |
2267 // printf("%4i %-10.10s Zq=%9f Zrq=%9f Ztq=%9f\n",k,fam[k],Zq[k],Zrq[k],Ztq[k]); | |
2268 // par.trans=0; | |
2269 // return; | |
2270 // } | |
2271 } | |
2272 float mu = sumZ/sumw; | |
2273 float sigma = sqrt(sumZ2/sumw-mu*mu); | |
2274 if (v>=2) printf("mu(Ztq)=%6.3f sigma(Ztq)=%6.2f\n",mu,sigma); | |
2275 sigma *= 1.01;// correct different fitting of EVD and normal variables | |
2276 | |
2277 // Normalize Ztq and calculate P1-values | |
2278 fprintf(stderr,"Normalize Ztq and calculate P1-values\n"); | |
2279 Reset(); | |
2280 while (!End()) | |
2281 { | |
2282 hit = ReadNext(); | |
2283 hit.logPval = -Z2Score((Ztq[index.Show(hit.name)]-mu)/sigma); | |
2284 hit.E1val = N_searched*(hit.logPval<-100? 0.0 : exp(hit.logPval)); | |
2285 // P-value = 1- exp(-exp(-lamda*(Saa-mu))) => -lamda*(Saa-mu) = log(-log(1-Pvalue)) | |
2286 hit.score_aass = (hit.logPval<-10.0? hit.logPval : log(-log(1-exp(hit.logPval))) ) / 0.45-3.0 - hit.score_ss; | |
2287 hit.Probab = Probab(hit); | |
2288 hit.score_sort = hit.logPval; | |
2289 Overwrite(hit); // copy hit object into current position of hitlist | |
2290 } | |
2291 | |
2292 for (k=0; k<N; k++){ | |
2293 delete[](Z[k]); (Z[k]) = NULL; | |
2294 } | |
2295 for (k=0; k<N; k++){ | |
2296 delete[](C[k]); (C[k]) = NULL; | |
2297 } | |
2298 for (k=0; k<N; k++){ | |
2299 delete[](fold[k]); (fold[k]) = NULL; | |
2300 } | |
2301 for (k=0; k<N; k++){ | |
2302 delete[](fam[k]); (fam[k]) = NULL; | |
2303 } | |
2304 delete[](C); (C) = NULL; | |
2305 delete[](Z); (Z) = NULL; | |
2306 delete[](fold); (fold) = NULL; | |
2307 delete[](fam); (fam) = NULL; | |
2308 delete[](Prob); (Prob) = NULL; | |
2309 delete[](ll); (ll) = NULL; | |
2310 delete[](Zq); (Zq) = NULL; | |
2311 delete[](Ztq); (Ztq) = NULL; | |
2312 | |
2313 } | |
2314 | |
2315 | |
2316 ///////////////////////////////////////////////////////////////////////////////////// | |
2317 /** | |
2318 * @brief Calculate P-values and Probabilities from transitive scoring over whole database | |
2319 * Best tested scheme. Use fmin(Zq[ll[m]],Z[ll[m]][k]) | |
2320 * and fast approximation for weights (not inverse covariance matrix) | |
2321 */ | |
2322 void | |
2323 HitList::TransitiveScoring4() | |
2324 { | |
2325 void PrintMatrix(float** V, int N); | |
2326 void PrintMatrix(double** V, int N); | |
2327 | |
2328 float** Z; // matrix of intra-db Z-scores Z_kl | |
2329 float** C; // covariance matrix for Z_k: C_kl = sum_m=1^N (Z_km * Z_lm) | |
2330 char** fold; // fold name of HMM k | |
2331 char** fam; // family of HMM k | |
2332 float* Prob; // probability of HMM k | |
2333 float* Zq; // Zq[k] = Z-score between query and database HMM k | |
2334 float* Ztq; // Ztq[k] = transitive Z-score from query to database HMM k: Ztq[k] = sum_l[ w_ql * Z_lk] / normalization_q | |
2335 float* Zrq; // Zrq[k] = transitive Z-score from database HMM k to query: Zrq[k] = sum_l[ w_kl * Z_lq] / normalization_k | |
2336 float* w; // unnormalized weight matrix; w[l] is w_ql or w_kl, respectively | |
2337 int* ll; // ll[m] is the m'th index l for which Z_lq, Z_lk > Zmin_trans | |
2338 int N; // dimension of weight matrix is NxN | |
2339 int M; // number of HMMs l with Z_ql>Ztrans_min (or Z_lk>Ztrans_min, respectively) | |
2340 int k,l,m,n; // indices for database HMMs | |
2341 char name[NAMELEN]; | |
2342 Hash<int> index(MAXPROF+7); // index{name} = index of HMM name in {1,...,N} | |
2343 index.Null(-1); // Set int value to return when no data can be retrieved | |
2344 Hash<int> excluded(13); // Hash containing names of superfamilies to be excluded from fit | |
2345 excluded.Null(0); // Set int value to return when no data can be retrieved | |
2346 Hit hit; | |
2347 size_t unused; /* disable fread gcc warning */ | |
2348 | |
2349 // Read weights matrix W with index hash and names array | |
2350 fprintf(stderr,"Reading in weights file\n"); | |
2351 FILE* wfile = fopen(par.wfile,"rb"); | |
2352 if (v>=1 && wfile==NULL) | |
2353 { | |
2354 fprintf(stderr,"Error: %s could not be opened: (N_searched=%i) ",par.wfile,N_searched); | |
2355 perror("fopen"); | |
2356 fprintf(stderr,"Skipping caclulation of transitive P-values\n"); | |
2357 par.trans=0; | |
2358 return; | |
2359 } | |
2360 unused = fread(&N,sizeof(int),1,wfile); // read matrix dimension (i.e. number of HMMs in database) | |
2361 if (v>=1 && N!=N_searched) | |
2362 { | |
2363 fprintf(stderr,"Error: Number %i of HMMs in weight file is different from number %i of HMMs in searched databases. \n",N,N_searched); | |
2364 fprintf(stderr,"Skipping caclulation of transitive P-values\n"); | |
2365 par.trans=0; | |
2366 return; | |
2367 } | |
2368 if (v>=2) fprintf(stderr,"Calculating transitive P-values for %i HMMs\n",N); | |
2369 // Read names of HMMs (to specify mapping of HMM to matrix indices) | |
2370 for (k=0; k<N; k++) | |
2371 { | |
2372 unused = fread(name,sizeof(char),IDLEN,wfile); | |
2373 index.Add(name,k); | |
2374 } | |
2375 // Read symmetric Z-scores matrix | |
2376 Z = new(float*[N]); | |
2377 for (k=0; k<N; k++) | |
2378 { | |
2379 Z[k] = new(float[N]); | |
2380 for (l=0; l<k; l++) Z[k][l] = Z[l][k]; | |
2381 unused = fread(Z[k]+k,sizeof(float),N-k,wfile); | |
2382 } | |
2383 // Read symmetric covariance matrix | |
2384 C = new(float*[N]); | |
2385 for (k=0; k<N; k++) | |
2386 { | |
2387 C[k] = new(float[N]); | |
2388 for (l=0; l<k; l++) C[k][l] = C[l][k]; | |
2389 unused = fread(C[k]+k,sizeof(float),N-k,wfile); | |
2390 } | |
2391 fclose(wfile); | |
2392 | |
2393 // Allocate memory | |
2394 Zq = new(float[N]); | |
2395 Ztq = new(float[N]); | |
2396 Zrq = new(float[N]); | |
2397 fold = new(char*[N]); | |
2398 fam = new(char*[N]); | |
2399 Prob = new(float[N]); | |
2400 ll = new(int[N]); | |
2401 w = new(float[N]); | |
2402 | |
2403 // Transform P-values to normally distributed Z-scores and store in Zq vector | |
2404 fprintf(stderr,"Transform P-values to Z-scores\n"); | |
2405 float Zmax_neg = Score2Z( -log(MINEVALEXCL) + log(N_searched) ); // calculate Z-score corresponding to E-value MINEVALEXCL | |
2406 float Zmin_trans = Score2Z( -log(par.Emax_trans) + log(N_searched) ); // calculate Z-score corresponding to E-value par.Emax_trans | |
2407 printf("Zmax = %6.2f Zmin = %6.2f \n",Zmax_neg,Zmin_trans); | |
2408 | |
2409 Reset(); | |
2410 while (!End()) | |
2411 { | |
2412 hit = ReadNext(); | |
2413 if (hit.irep>1) continue; | |
2414 k = index.Show(hit.name); | |
2415 if (k<0) {fprintf(stderr,"Error: no index found in weights file for domain %s\n",hit.name); exit(1);} | |
2416 if (hit.logPvalt<0) | |
2417 Zq[k] = 0.5*Score2Z(fabs(hit.logPval)) + 0.5*Score2Z(fabs(hit.logPvalt)); // Zq[k] = 0.5*(Zkq + Zqk) | |
2418 else | |
2419 Zq[k] = Score2Z(fabs(hit.logPval)); // Zq[k] = Zqk | |
2420 // printf("%4i %-10.10s logPvalt=%9g Zq=%9f\n",k,hit.name,hit.logPvalt,Zq[k]); | |
2421 // if (isnan(Zq[k])) { | |
2422 // fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n"); | |
2423 // printf("%4i %-10.10s logPval=%9g logPvalt=%9g Zq=%9f\n",k,hit.name,hit.logPval,hit.logPvalt,Zq[k]); | |
2424 // par.trans=0; | |
2425 // return; | |
2426 // } | |
2427 if (Zq[k]>Zmax_neg) excluded.Add(hit.fold); | |
2428 fold[k] = new(char[IDLEN]); | |
2429 fam[k] = new(char[IDLEN]); | |
2430 strcpy(fold[k],hit.fold); | |
2431 strcpy(fam[k],hit.fam); | |
2432 weight[k] = hit.weight; | |
2433 Prob[k] = hit.Probab; | |
2434 } | |
2435 | |
2436 if (v>=3) | |
2437 { | |
2438 excluded.Reset(); | |
2439 while (!excluded.End()) | |
2440 { | |
2441 excluded.ReadNext(name); | |
2442 printf("Excluded fold %s from fitting to Ztq\n",name); | |
2443 } | |
2444 } | |
2445 | |
2446 //////////////////////////////////////////////////////////////// | |
2447 // Calculate transitive score (query->l) Zt[l] | |
2448 | |
2449 // Construct vector ll of indices l for which Z_lq > Zmin_trans | |
2450 m = 0; | |
2451 for (l=0; l<N; l++) | |
2452 if (Zq[l]>=Zmin_trans) ll[m++]=l; | |
2453 M = m; // number of indices l for which Z_lq,Z_lk > Zmin_trans | |
2454 | |
2455 // for (m=0; m<M; m++) | |
2456 // fprintf(stderr,"m=%-4i l=%-4i %-10.10s Zq[l]=%7f\n",m,ll[m],fam[ll[m]],Zq[ll[m]]); | |
2457 | |
2458 if (M<=1) | |
2459 for (k=0; k<N; k++) Ztq[k]=0.0; | |
2460 else | |
2461 { | |
2462 // Generate submatrix of C for indices l for which Z_lq,Z_lk > Zmin_trans | |
2463 double** Csub = new(double*[M]); | |
2464 for (m=0; m<M; m++) | |
2465 { | |
2466 Csub[m] = new(double[M]); | |
2467 for (n=0; n<M; n++) | |
2468 Csub[m][n] = double(C[ll[m]][ll[n]]); | |
2469 } | |
2470 | |
2471 if (v>=3) | |
2472 { | |
2473 fprintf(stderr,"Covariance matrix\n"); | |
2474 PrintMatrix(Csub,M); | |
2475 } | |
2476 | |
2477 | |
2478 // Calculate weights w[l] | |
2479 for (m=0; m<M; m++) | |
2480 { | |
2481 double sum = 0.0; | |
2482 for (n=0; n<M; n++) | |
2483 sum += fmax(0.0,Csub[m][n]); | |
2484 printf("w[%4i] = %-8.5f\n",ll[m],1.0/sum); | |
2485 w[m] = 1.0/sum; | |
2486 } | |
2487 | |
2488 // Calculate Ztq[k] for all HMMs k | |
2489 fprintf(stderr,"Calculate Ztq vector of transitive Z-scores\n"); | |
2490 float norm = NormalizationFactor(Csub,w,M); | |
2491 for (k=0; k<N; k++) | |
2492 { | |
2493 double sumZ = 0.0; | |
2494 for (m=0; m<M; m++) | |
2495 sumZ += w[m] * fmin(Zq[ll[m]],Z[ll[m]][k]); | |
2496 Ztq[k] = sumZ/norm; | |
2497 } | |
2498 | |
2499 for (l=0; l<M; l++){ | |
2500 delete[](Csub[l]); (Csub[l]) = NULL; | |
2501 } | |
2502 delete[](Csub); (Csub) = NULL; | |
2503 } | |
2504 | |
2505 //////////////////////////////////////////////////////////////// | |
2506 // Calculate reverse transitive score (l->query-) Zrq[l] | |
2507 | |
2508 fprintf(stderr,"Calculate Zrq vector of transitive Z-scores\n"); | |
2509 for (k=0; k<N; k++) | |
2510 { | |
2511 // Construct vector ll of indices l for which Z_lk > Zmin_tran | |
2512 m = 0; | |
2513 for (l=0; l<N; l++) | |
2514 if (Z[k][l]>=Zmin_trans) ll[m++]=l; | |
2515 int M = m; // number of indices l for which Z_lq,Z_lk > Zmin_tran | |
2516 | |
2517 | |
2518 // fprintf(stderr,"\nfam[k]: %s\n",fam[k]); | |
2519 // for (m=0; m<M; m++) | |
2520 // printf(stderr,"m=%-4i k=%-4i l=%-4i %-10.10s Zq[l]=%7f Z_lk=%7f \n",m,k,ll[m],fold[ll[m]],Zq[ll[m]],Z[k][ll[m]]); | |
2521 | |
2522 if (M<=1) | |
2523 { | |
2524 Zrq[k] = Zq[k]; | |
2525 } | |
2526 else | |
2527 { | |
2528 // Generate submatrix of C for indices l for which Z_lq,Z_lk > Zmin_trans | |
2529 double** Csub = new(double*[M]); | |
2530 for (m=0; m<M; m++) | |
2531 { | |
2532 Csub[m] = new(double[M]); | |
2533 for (n=0; n<M; n++) | |
2534 Csub[m][n] = double(C[ll[m]][ll[n]]); | |
2535 } | |
2536 // fprintf(stderr,"Covariance matrix\n"); | |
2537 // PrintMatrix(Csub,M); | |
2538 | |
2539 // Calculate weights w[l] | |
2540 for (m=0; m<M; m++) | |
2541 { | |
2542 double sum = 0.0; | |
2543 for (n=0; n<M; n++) | |
2544 sum += fmax(0.0,Csub[m][n]); | |
2545 w[m] = 1.0/sum; | |
2546 } | |
2547 | |
2548 // for (m=0; m<M; m++) fprintf(stderr,"w[%i]=%8.2g\n",m,w[m]); | |
2549 | |
2550 | |
2551 // Calculate Zrq[k] and normalize | |
2552 float norm = NormalizationFactor(Csub,w,M); | |
2553 double sumZ = 0.0; | |
2554 for (m=0; m<M; m++) | |
2555 sumZ += w[m] * fmin(Zq[ll[m]],Z[ll[m]][k]); | |
2556 Zrq[k] = sumZ/norm; | |
2557 | |
2558 for (l=0; l<M; l++){ | |
2559 delete[](Csub[l]); (Csub[l]) = NULL; | |
2560 } | |
2561 delete[](Csub); (Csub) = NULL; | |
2562 } | |
2563 | |
2564 // fprintf(stderr,"\nZq[k]=%8.2g Zq1[k]=%8.2g\n",Zq[k],Zrq[k]); | |
2565 } | |
2566 | |
2567 // Total Z-score = weighted sum over original Z-score, forward transitive and reverse transitive Z-score | |
2568 for (k=0; k<N; k++) | |
2569 { | |
2570 float Zqtot = Zq[k] + par.wtrans*(Ztq[k]+Zrq[k]); | |
2571 // if (isnan(Zqtot)) | |
2572 // { | |
2573 // fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n"); | |
2574 // printf("%4i %-10.10s Zq=%6.2f Ztq=%6.2f Zrq=%6.2f Zqtot=%6.2f\n",k,fam[k],Zq[k],Ztq[k],Zrq[k],Zqtot); | |
2575 // par.trans=0; | |
2576 // return; | |
2577 // } | |
2578 if (v>=3 && Zq[k] + Zqtot > 2*Zmin_trans) { | |
2579 printf("%4i %-10.10s Zq=%6.2f Ztq=%6.2f Zrq=%6.2f -> Zqtot=%6.2f\n",k,fam[k],Zq[k],Ztq[k],Zrq[k],Zqtot); | |
2580 } | |
2581 Ztq[k] = Zqtot; | |
2582 } | |
2583 | |
2584 // Calculate mean and standard deviation of Z1q | |
2585 fprintf(stderr,"Calculate mean and standard deviation of Ztq\n"); | |
2586 double sumw=0.0; | |
2587 double sumZ=0.0; | |
2588 double sumZ2=0.0; | |
2589 for (k=0; k<N; k++) | |
2590 { | |
2591 if (excluded.Contains(fold[k])) continue; | |
2592 sumw += weight[k]; | |
2593 sumZ += weight[k]*Ztq[k]; | |
2594 sumZ2 += weight[k]*Ztq[k]*Ztq[k]; | |
2595 // if (isnan(sumZ)) | |
2596 // { | |
2597 // fprintf(stderr,"Error: a floating point exception occurred. Skipping transitive scoring\n"); | |
2598 // printf("%4i %-10.10s Zq=%9f Zrq=%9f Ztq=%9f\n",k,fam[k],Zq[k],Zrq[k],Ztq[k]); | |
2599 // par.trans=0; | |
2600 // return; | |
2601 // } | |
2602 } | |
2603 float mu = sumZ/sumw; | |
2604 float sigma = sqrt(sumZ2/sumw-mu*mu); | |
2605 if (v>=2) printf("mu(Ztq)=%6.3f sigma(Ztq)=%6.2f\n",mu,sigma); | |
2606 sigma *= 1.01;// correct different fitting of EVD and normal variables | |
2607 | |
2608 // Normalize Ztq and calculate P1-values | |
2609 fprintf(stderr,"Normalize Ztq and calculate P1-values\n"); | |
2610 Reset(); | |
2611 while (!End()) | |
2612 { | |
2613 hit = ReadNext(); | |
2614 hit.logPval = -Z2Score((Ztq[index.Show(hit.name)]-mu)/sigma); | |
2615 hit.E1val = N_searched*(hit.logPval<-100? 0.0 : exp(hit.logPval)); | |
2616 // P-value = 1- exp(-exp(-lamda*(Saa-mu))) => -lamda*(Saa-mu) = log(-log(1-Pvalue)) | |
2617 hit.score_aass = (hit.logPval<-10.0? hit.logPval : log(-log(1-exp(hit.logPval))) ) / 0.45-3.0 - hit.score_ss; | |
2618 hit.Probab = Probab(hit); | |
2619 hit.score_sort = hit.logPval; | |
2620 Overwrite(hit); // copy hit object into current position of hitlist | |
2621 } | |
2622 | |
2623 for (k=0; k<N; k++){ | |
2624 delete[](Z[k]); (Z[k]) = NULL; | |
2625 } | |
2626 for (k=0; k<N; k++){ | |
2627 delete[](C[k]); (C[k]) = NULL; | |
2628 } | |
2629 for (k=0; k<N; k++){ | |
2630 delete[](fold[k]); (fold[k]) = NULL; | |
2631 } | |
2632 for (k=0; k<N; k++){ | |
2633 delete[](fam[k]); (fam[k]) = NULL; | |
2634 } | |
2635 delete[](C); (C) = NULL; | |
2636 delete[](Z); (Z) = NULL; | |
2637 delete[](fold); (fold) = NULL; | |
2638 delete[](fam); (fam) = NULL; | |
2639 delete[](Prob); (Prob) = NULL; | |
2640 delete[](ll); (ll) = NULL; | |
2641 delete[](Zq); (Zq) = NULL; | |
2642 delete[](Ztq); (Ztq) = NULL; | |
2643 } | |
2644 | |
2645 | |
2646 ///////////////////////////////////////////////////////////////////////////////////// | |
2647 /** | |
2648 * @brief Score2Z transforms the -log(P-value) score into a Z-score for 0 < S | |
2649 * Score2Z(S) = sqrt(2)*dierfc(2*e^(-S)), where dierfc is the inverse of the complementary error function | |
2650 */ | |
2651 double | |
2652 HitList::Score2Z(double S) | |
2653 { | |
2654 double s, t, u, w, x, y, z; | |
2655 if (S<=0) return double(-100000); | |
2656 y = ( S>200 ? 0.0 : 2.0*exp(-S) ); | |
2657 if (y > 1) | |
2658 { | |
2659 z = (S<1e-6? 2*S : 2-y); | |
2660 w = 0.916461398268964 - log(z); | |
2661 } | |
2662 else | |
2663 { | |
2664 z = y; | |
2665 w = 0.916461398268964 - (0.69314718056-S); | |
2666 } | |
2667 | |
2668 u = sqrt(w); | |
2669 s = (log(u) + 0.488826640273108) / w; | |
2670 t = 1 / (u + 0.231729200323405); | |
2671 | |
2672 x = u * (1 - s * (s * 0.124610454613712 + 0.5)) - | |
2673 ((((-0.0728846765585675 * t + 0.269999308670029) * t + | |
2674 0.150689047360223) * t + 0.116065025341614) * t + | |
2675 0.499999303439796) * t; | |
2676 t = 3.97886080735226 / (x + 3.97886080735226); | |
2677 u = t - 0.5; | |
2678 s = (((((((((0.00112648096188977922 * u + | |
2679 1.05739299623423047e-4) * u - 0.00351287146129100025) * u - | |
2680 7.71708358954120939e-4) * u + 0.00685649426074558612) * u + | |
2681 0.00339721910367775861) * u - 0.011274916933250487) * u - | |
2682 0.0118598117047771104) * u + 0.0142961988697898018) * u + | |
2683 0.0346494207789099922) * u + 0.00220995927012179067; | |
2684 s = ((((((((((((s * u - 0.0743424357241784861) * u - | |
2685 0.105872177941595488) * u + 0.0147297938331485121) * u + | |
2686 0.316847638520135944) * u + 0.713657635868730364) * u + | |
2687 1.05375024970847138) * u + 1.21448730779995237) * u + | |
2688 1.16374581931560831) * u + 0.956464974744799006) * u + | |
2689 0.686265948274097816) * u + 0.434397492331430115) * u + | |
2690 0.244044510593190935) * t - | |
2691 (z==0? 0: z * exp(x * x - 0.120782237635245222)); | |
2692 x += s * (x * s + 1); | |
2693 if (y > 1) { | |
2694 x = -x; | |
2695 } | |
2696 return double (1.41421356237*x); | |
2697 } | |
2698 | |
2699 ///////////////////////////////////////////////////////////////////////////////////////////////////////// | |
2700 /** | |
2701 * @brief Z2Score transforms the Z-score into a -log(P-value) value | |
2702 * Z2Score(Z) = log(2) - log( erfc(Z/sqrt(2)) ) , where derfc is the complementary error function | |
2703 */ | |
2704 double | |
2705 HitList::Z2Score(double Z) | |
2706 { | |
2707 double t, u, x, y; | |
2708 x = 0.707106781188*Z; | |
2709 if (x>10) return 0.69314718056 - (-x*x - log( (1-0.5/x/x)/x/1.772453851) ); | |
2710 t = 3.97886080735226 / (fabs(x) + 3.97886080735226); | |
2711 u = t - 0.5; | |
2712 y = (((((((((0.00127109764952614092 * u + 1.19314022838340944e-4) * u - | |
2713 0.003963850973605135) * u - 8.70779635317295828e-4) * u + | |
2714 0.00773672528313526668) * u + 0.00383335126264887303) * u - | |
2715 0.0127223813782122755) * u - 0.0133823644533460069) * u + | |
2716 0.0161315329733252248) * u + 0.0390976845588484035) * u + | |
2717 0.00249367200053503304; | |
2718 y = ((((((((((((y * u - 0.0838864557023001992) * u - | |
2719 0.119463959964325415) * u + 0.0166207924969367356) * u + | |
2720 0.357524274449531043) * u + 0.805276408752910567) * u + | |
2721 1.18902982909273333) * u + 1.37040217682338167) * u + | |
2722 1.31314653831023098) * u + 1.07925515155856677) * u + | |
2723 0.774368199119538609) * u + 0.490165080585318424) * u + | |
2724 0.275374741597376782) * t * (x>10? 0.0 : exp(-x * x)); | |
2725 return 0.69314718056 - log( x < 0 ? 2 - y : y ); | |
2726 } | |
2727 | |
2728 | |
2729 ///////////////////////////////////////////////////////////////////////////////////////////////////////// | |
2730 /** | |
2731 * @brief | |
2732 */ | |
2733 void | |
2734 PrintMatrix(float** V, int N) | |
2735 { | |
2736 int k,l; | |
2737 for (k=0; k<N; k++) | |
2738 { | |
2739 fprintf(stderr,"k=%4i \n",k); | |
2740 for (l=0; l<N; l++) | |
2741 { | |
2742 fprintf(stderr,"%4i:%6.3f ",l,V[k][l]); | |
2743 if ((l+1)%10==0) fprintf(stderr,"\n"); | |
2744 } | |
2745 fprintf(stderr,"\n"); | |
2746 } | |
2747 fprintf(stderr,"\n"); | |
2748 } | |
2749 | |
2750 ///////////////////////////////////////////////////////////////////////////////////////////////////////// | |
2751 /** | |
2752 * @brief | |
2753 */ | |
2754 void | |
2755 PrintMatrix(double** V, int N) | |
2756 { | |
2757 int k,l; | |
2758 for (k=0; k<N; k++) | |
2759 { | |
2760 fprintf(stderr,"k=%4i \n",k); | |
2761 for (l=0; l<N; l++) | |
2762 { | |
2763 fprintf(stderr,"%4i:%6.3f ",l,V[k][l]); | |
2764 if ((l+1)%10==0) fprintf(stderr,"\n"); | |
2765 } | |
2766 fprintf(stderr,"\n"); | |
2767 } | |
2768 fprintf(stderr,"\n"); | |
2769 } | |
2770 | |
2771 ///////////////////////////////////////////////////////////////////////////////////////////////////////// | |
2772 /** | |
2773 * @brief | |
2774 */ | |
2775 void | |
2776 HitList::Normalize(float* Ztq, char** fold, Hash<int>& excluded) | |
2777 { | |
2778 double sumw=0.0; | |
2779 double sumZ=0.0; | |
2780 double sumZ2=0.0; | |
2781 for (int k=0; k<N_searched; k++) | |
2782 { | |
2783 if (excluded.Contains(fold[k])) continue; | |
2784 sumw += weight[k]; | |
2785 sumZ += weight[k]*Ztq[k]; | |
2786 sumZ2 += weight[k]*Ztq[k]*Ztq[k]; | |
2787 } | |
2788 float mu = sumZ/sumw; | |
2789 float sigma = sqrt(sumZ2/sumw-mu*mu); | |
2790 printf("Transitive score Ztq: mu=%8.3g sigma=%8.3g\n",mu,sigma); | |
2791 for (int k=0; k<N_searched; k++) Ztq[k] = (Ztq[k]-mu)/sigma; | |
2792 return; | |
2793 } | |
2794 | |
2795 ///////////////////////////////////////////////////////////////////////////////////////////////////////// | |
2796 /** | |
2797 * @brief Calculate standard deviation of Z1 = sum_m [ w_m * Z_m ], where Csub_mn = cov(Z_m,Z_n) | |
2798 */ | |
2799 float | |
2800 HitList::NormalizationFactor(double** Csub, float* w, int M) | |
2801 { | |
2802 double sum=0.0; | |
2803 for (int m=0; m<M; m++) | |
2804 { | |
2805 double summ=0.0; | |
2806 for (int n=0; n<M; n++) summ += Csub[m][n]*w[n]; | |
2807 sum += w[m]*summ; | |
2808 } | |
2809 return sqrt(sum); | |
2810 } | |
2811 | |
2812 ///////////////////////////////////////////////////////////////////////////////////////////////////////// | |
2813 /** | |
2814 * @brief Calculate inverse of matrix A and store result in B | |
2815 */ | |
2816 void | |
2817 HitList::InvertMatrix(double** B, double** A, int N) | |
2818 { | |
2819 if (N==0) | |
2820 { | |
2821 printf("Error: InvertMatrix called with matrix of dimension 0\n"); | |
2822 exit(6); | |
2823 } | |
2824 if (N==1) | |
2825 { | |
2826 B[0][0] = (A[0][0]==0.0? 0 :1.0/A[0][0]); | |
2827 return; | |
2828 } | |
2829 | |
2830 int k,l,m; | |
2831 double** V = new(double*[N]); | |
2832 double* s = new(double[N]); | |
2833 for (k=0; k<N; k++) V[k] = new(double[N]); | |
2834 | |
2835 // Copy original matrix A into B since B will be overwritten by SVD() | |
2836 for (k=0; k<N; k++) | |
2837 for (l=0; l<N; l++) | |
2838 B[k][l] = A[k][l]; | |
2839 | |
2840 SVD(B, N, s, V); // U replaces B on output; s[] contains singluar values | |
2841 | |
2842 // Calculate inverse of A: A^-1 = V * diag(1/s) * U^t | |
2843 double** U = B; | |
2844 // Calculate V[k][m] -> V[k][m] *diag(1/s) | |
2845 for (k=0; k<N; k++) | |
2846 for (m=0; m<N; m++) | |
2847 if (s[m]!=0.0) V[k][m] /= s[m]; else V[k][m] = 0.0; | |
2848 // Calculate V[k][l] -> (V * U^t)_kl | |
2849 for (k=0; k<N; k++) | |
2850 { | |
2851 if (v>=4 && k%100==0) printf("%i\n",k); | |
2852 for (l=0; l<N; l++) | |
2853 { | |
2854 s[l] = 0.0; // use s[] as temporary memory to avoid overwriting B[k][] as long as it is needed | |
2855 for (m=0; m<N; m++) | |
2856 s[l] += V[k][m]*U[l][m]; | |
2857 } | |
2858 for (l=0; l<N; l++) V[k][l]=s[l]; | |
2859 } | |
2860 for (k=0; k<N; k++) | |
2861 for (l=0; l<N; l++) | |
2862 B[k][l] = V[k][l]; | |
2863 | |
2864 for (k=0; k<N; k++){ | |
2865 delete[](V[k]); (V[k]) = NULL; | |
2866 } | |
2867 delete[](V); (V) = NULL; | |
2868 return; | |
2869 } | |
2870 | |
2871 | |
2872 ///////////////////////////////////////////////////////////////////////////////////////////////////////// | |
2873 /** | |
2874 * @brief | |
2875 */ | |
2876 void | |
2877 HitList::TransposeMatrix(double** V, int N) | |
2878 { | |
2879 int k,l; | |
2880 for (k=0; k<N; k++) // transpose Z for efficiency of ensuing matrix multiplication | |
2881 for (l=0; l<k; l++) | |
2882 { | |
2883 double buf = V[k][l]; | |
2884 V[k][l] = V[l][k]; | |
2885 V[l][k] = buf; | |
2886 } | |
2887 } | |
2888 | |
2889 ///////////////////////////////////////////////////////////////////////////////////////////////////////// | |
2890 static double sqrarg; | |
2891 #define SQR(a) ((sqrarg=(a)) == 0.0 ? 0.0 : sqrarg*sqrarg) | |
2892 static double maxarg1,maxarg2; | |
2893 #define FMAX(a,b) (maxarg1=(a),maxarg2=(b),(maxarg1) > (maxarg2) ? (maxarg1) : (maxarg2)) | |
2894 static int iminarg1,iminarg2; | |
2895 #define IMIN(a,b) (iminarg1=(a),iminarg2=(b),(iminarg1) < (iminarg2) ? (iminarg1) : (iminarg2)) | |
2896 #define SIGN(a,b) ((b) >= 0.0 ? fabs(a) : -fabs(a)) | |
2897 | |
2898 /** | |
2899 * @brief This is a version of the Golub and Reinsch algorithm for singular value decomposition for a quadratic | |
2900 * (n x n) matrix A. It is sped up by transposing A amd V matrices at various places in the algorithm. | |
2901 * On a 400x400 matrix it runs in 1.6 s or 2.3 times faster than the original (n x m) version. | |
2902 * On a 4993x4993 matrix it runs in 2h03 or 4.5 times faster than the original (n x m) version. | |
2903 * | |
2904 * Given a matrix a[0..n-1][0..n-1], this routine computes its singular value decomposition, A = U · W · V^t . | |
2905 * The matrix U replaces a on output. The diagonal matrix of singular values W is out-put as a vector w[0..n-1]. | |
2906 * The matrix V (not the transpose V^t) is output as V[0..n-1][0..n-1] ./ | |
2907 */ | |
2908 void | |
2909 HitList::SVD(double **A, int n, double w[], double **V) | |
2910 { | |
2911 int m=n; // in general algorithm A is an (m x n) matrix instead of (n x n) | |
2912 | |
2913 double pythag(double a, double b); | |
2914 int flag,i,its,j,jj,k,l=1,nm=1; | |
2915 double anorm,c,f,g,h,s,scale,x,y,z,*rv1; | |
2916 rv1=new(double[n]); | |
2917 g=scale=anorm=0.0; | |
2918 | |
2919 // Householder reduction to bidiagonal form. | |
2920 if (v>=5) printf("\nHouseholder reduction to bidiagonal form\n"); | |
2921 for (i=0;i<n;i++) { | |
2922 if (v>=4 && i%100==0) printf("i=%i\n",i); | |
2923 if (v>=4) fprintf(stderr,"."); | |
2924 l=i+1; | |
2925 rv1[i]=scale*g; | |
2926 g=s=scale=0.0; | |
2927 if (i < m) { | |
2928 for (k=i;k<m;k++) scale += fabs(A[k][i]); | |
2929 if (scale) { | |
2930 for (k=i;k<m;k++) { | |
2931 A[k][i] /= scale; | |
2932 s += A[k][i]*A[k][i]; | |
2933 } | |
2934 f=A[i][i]; | |
2935 g = -SIGN(sqrt(s),f); | |
2936 h=f*g-s; | |
2937 A[i][i]=f-g; | |
2938 for (j=l;j<n;j++) { | |
2939 for (s=0.0,k=i;k<m;k++) s += A[k][i]*A[k][j]; | |
2940 f=s/h; | |
2941 for (k=i;k<m;k++) A[k][j] += f*A[k][i]; | |
2942 } | |
2943 for (k=i;k<m;k++) A[k][i] *= scale; | |
2944 } | |
2945 } | |
2946 w[i]=scale *g; | |
2947 g=s=scale=0.0; | |
2948 if (i < m && i != n-1) { | |
2949 for (k=l;k<n;k++) scale += fabs(A[i][k]); | |
2950 if (scale) { | |
2951 for (k=l;k<n;k++) { | |
2952 A[i][k] /= scale; | |
2953 s += A[i][k]*A[i][k]; | |
2954 } | |
2955 f=A[i][l]; | |
2956 g = -SIGN(sqrt(s),f); | |
2957 h=f*g-s; | |
2958 A[i][l]=f-g; | |
2959 for (k=l;k<n;k++) rv1[k]=A[i][k]/h; | |
2960 for (j=l;j<m;j++) { | |
2961 for (s=0.0,k=l;k<n;k++) s += A[j][k]*A[i][k]; | |
2962 for (k=l;k<n;k++) A[j][k] += s*rv1[k]; | |
2963 } | |
2964 for (k=l;k<n;k++) A[i][k] *= scale; | |
2965 } | |
2966 } | |
2967 anorm=FMAX(anorm,(fabs(w[i])+fabs(rv1[i]))); | |
2968 } | |
2969 // Accumulation of right-hand transformations. | |
2970 if (v>=5) printf("\nAccumulation of right-hand transformations\n"); | |
2971 TransposeMatrix(V,n); | |
2972 for (i=n-1;i>=0;i--) { | |
2973 if (v>=4 && i%100==0) printf("i=%i\n",i); | |
2974 if (v>=4) fprintf(stderr,"."); | |
2975 if (i < n-1) { | |
2976 if (g) { | |
2977 // Double division to avoid possible underflow. | |
2978 for (j=l;j<n;j++) | |
2979 V[i][j]=(A[i][j]/A[i][l])/g; | |
2980 for (j=l;j<n;j++) { | |
2981 for (s=0.0,k=l;k<n;k++) s += A[i][k]*V[j][k]; | |
2982 for (k=l;k<n;k++) V[j][k] += s*V[i][k]; | |
2983 } | |
2984 } | |
2985 for (j=l;j<n;j++) V[j][i]=V[i][j]=0.0; | |
2986 } | |
2987 V[i][i]=1.0; | |
2988 g=rv1[i]; | |
2989 l=i; | |
2990 } | |
2991 // Accumulation of left-hand transformations. | |
2992 if (v>=5) printf("\nAccumulation of left-hand transformations\n"); | |
2993 TransposeMatrix(A,n); | |
2994 for (i=IMIN(m,n)-1;i>=0;i--) { | |
2995 if (v>=4 && i%100==0) printf("i=%i\n",i); | |
2996 if (v>=4) fprintf(stderr,"."); | |
2997 l=i+1; | |
2998 g=w[i]; | |
2999 for (j=l;j<n;j++) A[j][i]=0.0; | |
3000 if (g) { | |
3001 g=1.0/g; | |
3002 for (j=l;j<n;j++) { | |
3003 for (s=0.0,k=l;k<m;k++) s += A[i][k]*A[j][k]; | |
3004 f=(s/A[i][i])*g; | |
3005 for (k=i;k<m;k++) A[j][k] += f*A[i][k]; | |
3006 } | |
3007 for (j=i;j<m;j++) A[i][j] *= g; | |
3008 } else for (j=i;j<m;j++) A[i][j]=0.0; | |
3009 ++A[i][i]; | |
3010 } | |
3011 | |
3012 // Diagonalization of the bidiagonal form: Loop over singular values, and over allowed iterations. | |
3013 if (v>=5) printf("\nDiagonalization of the bidiagonal form\n"); | |
3014 for (k=n-1;k>=0;k--) { | |
3015 if (v>=4 && k%100==0) printf("k=%i\n",k); | |
3016 if (v>=4) fprintf(stderr,"."); | |
3017 for (its=1;its<=30;its++) { | |
3018 flag=1; | |
3019 // Test for splitting. Note that rv1[1] is always zero. | |
3020 for (l=k;l>=0;l--) { | |
3021 nm=l-1; | |
3022 if ((double)(fabs(rv1[l])+anorm) == anorm) { | |
3023 flag=0; | |
3024 break; | |
3025 } | |
3026 if ((double)(fabs(w[nm])+anorm) == anorm) break; | |
3027 } | |
3028 if (flag) { | |
3029 // Cancellation of rv1[l], if l > 1. | |
3030 c=0.0; | |
3031 s=1.0; | |
3032 for (i=l;i<=k;i++) { | |
3033 f=s*rv1[i]; | |
3034 rv1[i]=c*rv1[i]; | |
3035 if ((double)(fabs(f)+anorm) == anorm) break; | |
3036 g=w[i]; | |
3037 h=pythag(f,g); | |
3038 w[i]=h; | |
3039 h=1.0/h; | |
3040 c=g*h; | |
3041 s = -f*h; | |
3042 for (j=0;j<m;j++) { | |
3043 y=A[nm][j]; | |
3044 z=A[i][j]; | |
3045 A[nm][j]=y*c+z*s; | |
3046 A[i][j]=z*c-y*s; | |
3047 } | |
3048 } | |
3049 } | |
3050 z=w[k]; | |
3051 // Convergence. | |
3052 if (l == k) { | |
3053 // Singular value is made nonnegative. | |
3054 if (z < 0.0) { | |
3055 w[k] = -z; | |
3056 for (j=0;j<n;j++) V[k][j] = -V[k][j]; | |
3057 } | |
3058 break; | |
3059 } | |
3060 if (its == 30) {printf("Error in SVD: no convergence in 30 iterations\n"); exit(7);} | |
3061 // Shift from bottom 2-by-2 minor. | |
3062 x=w[l]; | |
3063 nm=k-1; | |
3064 y=w[nm]; | |
3065 g=rv1[nm]; | |
3066 h=rv1[k]; | |
3067 f=((y-z)*(y+z)+(g-h)*(g+h))/(2.0*h*y); | |
3068 g=pythag(f,1.0); | |
3069 f=((x-z)*(x+z)+h*((y/(f+SIGN(g,f)))-h))/x; | |
3070 // Next QR transformation: | |
3071 c=s=1.0; | |
3072 for (j=l;j<=nm;j++) { | |
3073 i=j+1; | |
3074 g=rv1[i]; | |
3075 y=w[i]; | |
3076 h=s*g; | |
3077 g=c*g; | |
3078 z=pythag(f,h); | |
3079 rv1[j]=z; | |
3080 c=f/z; | |
3081 s=h/z; | |
3082 f=x*c+g*s; | |
3083 g = g*c-x*s; | |
3084 h=y*s; | |
3085 y *= c; | |
3086 for (jj=0;jj<n;jj++) { | |
3087 x=V[j][jj]; | |
3088 z=V[i][jj]; | |
3089 V[j][jj]=x*c+z*s; | |
3090 V[i][jj]=z*c-x*s; | |
3091 } | |
3092 z=pythag(f,h); | |
3093 // Rotation can be arbitrary if z = 0. | |
3094 w[j]=z; | |
3095 if (z) { | |
3096 z=1.0/z; | |
3097 c=f*z; | |
3098 s=h*z; | |
3099 } | |
3100 f=c*g+s*y; | |
3101 x=c*y-s*g; | |
3102 | |
3103 for (jj=0;jj<m;jj++) { | |
3104 y=A[j][jj]; | |
3105 z=A[i][jj]; | |
3106 A[j][jj]=y*c+z*s; | |
3107 A[i][jj]=z*c-y*s; | |
3108 } | |
3109 } | |
3110 rv1[l]=0.0; | |
3111 rv1[k]=f; | |
3112 w[k]=x; | |
3113 } | |
3114 } | |
3115 TransposeMatrix(V,n); | |
3116 TransposeMatrix(A,n); | |
3117 delete[](rv1); (rv1) = NULL; | |
3118 } | |
3119 | |
3120 /** | |
3121 * @brief Computes (a2 + b2 )^1/2 without destructive underflow or overflow. | |
3122 */ | |
3123 double | |
3124 pythag(double a, double b) | |
3125 { | |
3126 double absa,absb; | |
3127 absa=fabs(a); | |
3128 absb=fabs(b); | |
3129 if (absa > absb) | |
3130 return absa*sqrt(1.0+SQR(absb/absa)); | |
3131 else | |
3132 return (absb == 0.0 ? 0.0 : absb*sqrt(1.0+SQR(absa/absb))); | |
3133 } | |
3134 | |
3135 | |
3136 /* @* HitList::ClobberGlobal(void) | |
3137 */ | |
3138 void | |
3139 HitList::ClobberGlobal(void){ | |
3140 | |
3141 | |
3142 /* @<variables local to HitList::ClobberGlobal@> */ | |
3143 class List<Hit>::ListEl<Hit> *pvIter = head; | |
3144 | |
3145 /* NOTE: no free/delete-ing of data to be done here | |
3146 hitlist only holds a shallow copy of hit; | |
3147 hit is being cleared off properly. | |
3148 just reset everything to 0/0.0/NULL. | |
3149 The only important thing to do at this stage | |
3150 is to attach head and tail and set size = 0 | |
3151 (FS, 2010-02-18) | |
3152 | |
3153 NOTE: I only ever saw 1 (one) in-between element, | |
3154 but there may ctually be a real linked list | |
3155 of more than 1 element (FS, 2010-02-18) | |
3156 */ | |
3157 | |
3158 // printf("POINTER:\t%p\t=HEAD\n", head); | |
3159 while (pvIter->next != tail){ | |
3160 | |
3161 // printf("POINTER:\t%p->\t%p\n", pvIter, pvIter->next); | |
3162 pvIter = pvIter->next; | |
3163 | |
3164 #if 1 | |
3165 pvIter->data.longname = pvIter->data.name = | |
3166 pvIter->data.file = pvIter->data.dbfile = NULL; | |
3167 pvIter->data.sname = NULL; | |
3168 pvIter->data.seq = NULL; | |
3169 pvIter->data.self = 0; | |
3170 pvIter->data.i = pvIter->data.j = NULL; | |
3171 pvIter->data.states = NULL; | |
3172 pvIter->data.S = pvIter->data.S_ss = pvIter->data.P_posterior = NULL; | |
3173 pvIter->data.Xcons = NULL; | |
3174 pvIter->data.sum_of_probs = 0.0; | |
3175 pvIter->data.Neff_HMM = 0.0; | |
3176 pvIter->data.score_ss = pvIter->data.Pval = pvIter->data.logPval = | |
3177 pvIter->data.Eval = pvIter->data.Probab = pvIter->data.Pforward = 0.0; | |
3178 pvIter->data.nss_conf = pvIter->data.nfirst = | |
3179 pvIter->data.i1 = pvIter->data.i2 = pvIter->data.j1 = pvIter->data.j2 = | |
3180 pvIter->data.matched_cols = pvIter->data.ssm1 = pvIter->data.ssm2 = 0; | |
3181 #endif | |
3182 } | |
3183 // printf("POINTER:\t\t\t%p=TAIL\n", tail); | |
3184 | |
3185 | |
3186 head->next = tail; | |
3187 tail->prev = head; | |
3188 size = 0; | |
3189 | |
3190 /* @= */ | |
3191 return; | |
3192 | |
3193 } /* this is the end of HitList::ClobberGlobal() */ | |
3194 | |
3195 | |
3196 /* | |
3197 * EOF hhhitlist-C.h | |
3198 */ |