comparison clustalomega/clustal-omega-1.0.2/src/hhalign/hhhmm-C.h @ 1:bc707542e5de

Uploaded
author clustalomega
date Thu, 21 Jul 2011 13:35:08 -0400
parents
children
comparison
equal deleted inserted replaced
0:ff1768533a07 1:bc707542e5de
1 /* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2
3 /*********************************************************************
4 * Clustal Omega - Multiple sequence alignment
5 *
6 * Copyright (C) 2010 University College Dublin
7 *
8 * Clustal-Omega is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
12 *
13 * This file is part of Clustal-Omega.
14 *
15 ********************************************************************/
16
17 /*
18 * RCS $Id: hhhmm-C.h 224 2011-03-23 12:13:33Z fabian $
19 */
20
21
22 // hhhmm.C
23
24 #ifndef MAIN
25 #define MAIN
26 #include <iostream> // cin, cout, cerr
27 #include <fstream> // ofstream, ifstream
28 #include <stdio.h> // printf
29 using std::cout;
30 using std::cerr;
31 using std::endl;
32 using std::ios;
33 using std::ifstream;
34 using std::ofstream;
35 #include <stdlib.h> // exit
36 #include <string> // strcmp, strstr
37 #include <math.h> // sqrt, pow
38 #include <limits.h> // INT_MIN
39 #include <float.h> // FLT_MIN
40 #include <time.h> // clock
41 #include <ctype.h> // islower, isdigit etc
42 #include "util-C.h" // imax, fmax, iround, iceil, ifloor, strint, strscn, strcut, substr, uprstr, uprchr, Basename etc.
43 #include "list.h" // list data structure
44 #include "hash.h" // hash data structure
45 #include "hhdecl-C.h"
46 #include "hhutil-C.h" // imax, fmax, iround, iceil, ifloor, strint, strscn, strcut, substr, uprstr, uprchr, Basename etc.
47 #endif
48
49 // #ifndef WNLIB
50 // #define WNLIB
51 // #include "wnconj.h" // Will Naylor's wnlib for optimization in C
52 // #endif
53
54 //////////////////////////////////////////////////////////////////////////////
55 //// Class HMM
56 //////////////////////////////////////////////////////////////////////////////
57
58 //////////////////////////////////////////////////////////////////////////////
59 // Object constructor
60 //////////////////////////////////////////////////////////////////////////////
61 HMM::HMM(int maxseqdis, int maxres)
62 {
63 sname = new char*[maxseqdis]; // names of stored sequences
64 for (int i = 0; i < maxseqdis; i++){ sname[i] = NULL;}
65 seq = new char*[maxseqdis]; // residues of stored sequences (first at pos 1!)
66 for (int i = 0; i < maxseqdis; i++){ seq[i] = NULL;}
67 Neff_M = new float[maxres]; // Neff_M[i] = diversity of subalignment of seqs that have residue in col i
68 Neff_I = new float[maxres]; // Neff_I[i] = diversity of subalignment of seqs that have insert in col i
69 Neff_D = new float[maxres]; // Neff_D[i] = diversity of subalignment of seqs that have delete in col i
70 longname = new char[DESCLEN]; // Full name of first sequence of original alignment (NAME field)
71 ss_dssp = new char[maxres]; // secondary structure determined by dssp 0:- 1:H 2:E 3:C 4:S 5:T 6:G 7:B
72 sa_dssp = new char[maxres]; // solvent accessibility state determined by dssp 0:- 1:A (absolutely buried) 2:B 3:C 4:D 5:E (exposed)
73 ss_pred = new char[maxres]; // predicted secondary structure 0:- 1:H 2:E 3:C
74 ss_conf = new char[maxres]; // confidence value of prediction 0:- 1:0 ... 10:9
75 Xcons = NULL; // create only when needed: consensus sequence in internal representation (A=0 R=1 N=2 D=3 ...)
76 l = new int[maxres]; // l[i] = pos. of j'th match state in aligment
77 /* FS introduced sentinel, NULL terminates loop in destructor, FS, r221->222 */
78 f = new float*[maxres+1]; f[maxres] = NULL; // f[i][a] = prob of finding amino acid a in column i WITHOUT pseudocounts
79 g = new float*[maxres+1]; g[maxres] = NULL; // f[i][a] = prob of finding amino acid a in column i WITH pseudocounts
80 p = new float*[maxres+1]; p[maxres] = NULL; // p[i][a] = prob of finding amino acid a in column i WITH OPTIMUM pseudocounts
81 tr = new float*[maxres+1]; tr[maxres] = NULL; // log2 of transition probabilities M2M M2I M2D I2M I2I D2M D2D M2M_GAPOPEN GAPOPEN GAPEXTD
82 // tr_lin = new float*[maxres]; // linear transition probabilities M2M M2I M2D I2M I2I D2M D2D M2M_GAPOPEN GAPOPEN GAPEXTD
83 for (int i=0; i<maxres; i++) {f[i]=new(float[NAA+3]);}
84 for (int i=0; i<maxres; i++) {g[i]=new(float[NAA]);}
85 for (int i=0; i<maxres; i++) {p[i]=new(float[NAA]);}
86 for (int i=0; i<maxres; i++) {tr[i]=new(float[NTRANS]);}
87 // for (int i=0; i<maxres; i++) tr_lin[i]=new(float[NTRANS]);
88 L=0;
89 Neff_HMM=0;
90 n_display=N_in=N_filtered=0;
91 nss_dssp=nsa_dssp=nss_pred=nss_conf=nfirst=ncons=-1;
92 // lamda_hash.New(37,0.0); // Set size and NULL element for hash
93 // mu_hash.New(37,0.0); // Set size and NULL element for hash
94 lamda=0.0; mu=0.0;
95 name[0]=longname[0]=fam[0]='\0';
96 trans_lin=0; // transition probs in log space
97 }
98
99
100 //////////////////////////////////////////////////////////////////////////////
101 // Object destructor
102 //////////////////////////////////////////////////////////////////////////////
103 HMM::~HMM()
104 {
105 //Delete name and seq matrices
106 if (NULL != sname){
107 for (int k=0; (k < n_display) && (NULL != sname[k]); k++){
108 delete[] sname[k]; sname[k] = NULL;
109 }
110 delete[] sname; sname = NULL;
111 }
112 if (NULL != seq){
113 for (int k=0; (k < n_display) && (NULL != seq[k]); k++){
114 delete[] seq[k]; seq[k] = NULL;
115 }
116 delete[] seq; seq = NULL;
117 }
118 delete[] Neff_M; Neff_M = NULL;
119 delete[] Neff_D; Neff_D = NULL;
120 delete[] Neff_I; Neff_I = NULL;
121 delete[] longname; longname = NULL;
122 delete[] ss_dssp; ss_dssp = NULL;
123 delete[] sa_dssp; sa_dssp = NULL;
124 delete[] ss_pred; ss_pred = NULL;
125 delete[] ss_conf; ss_conf = NULL;
126 delete[] Xcons; Xcons = NULL;
127 delete[] l; l = NULL;
128 for (int i=0; i</*MAXRES*/par.maxResLen; i++){
129 if (f[i]){
130 delete[] f[i]; f[i] = NULL;
131 }
132 else break;
133 }
134 for (int i=0; i</*MAXRES*/par.maxResLen; i++){
135 if (g[i]){
136 delete[] g[i]; g[i] = NULL;
137 }
138 else break;
139 }
140 for (int i=0; i</*MAXRES*/par.maxResLen; i++){
141 if (p[i]){
142 delete[] p[i]; p[i] = NULL;
143 }
144 else break;
145 }
146 for (int i=0; i</*MAXRES*/par.maxResLen; i++){
147 if (tr[i]){
148 delete[] tr[i]; tr[i] = NULL;
149 }
150 else break;
151 }
152 // for (int i=0; i</*MAXRES*/par.maxResLen; i++) if (tr_lin[i]) delete[] tr_lin[i]; else break;
153 delete[] f; f = NULL;
154 delete[] g; g = NULL;
155 delete[] p; p = NULL;
156 delete[] tr; tr = NULL;
157 // delete[] tr_lin;
158 }
159
160 //////////////////////////////////////////////////////////////////////////////
161 // Deep-copy constructor
162 //////////////////////////////////////////////////////////////////////////////
163 HMM& HMM::operator=(HMM& q)
164 {
165 L=q.L;
166 for (int i=0; i<=L+1; ++i)
167 {
168 for (int a=0; a<NAA; ++a)
169 {
170 f[i][a]=q.f[i][a];
171 g[i][a]=q.g[i][a];
172 p[i][a]=q.p[i][a];
173 }
174 for (int a=0; a<NTRANS; ++a)
175 tr[i][a]=q.tr[i][a];
176 ss_dssp[i]=q.ss_dssp[i];
177 sa_dssp[i]=q.sa_dssp[i];
178 ss_pred[i]=q.ss_pred[i];
179 ss_conf[i]=q.ss_conf[i];
180 l[i]=q.l[i];
181 }
182 if (q.Xcons)
183 for (int i=0; i<=L+1; ++i)
184 Xcons[i] =q.Xcons[i];
185
186 n_display=q.n_display;
187 for (int k=0; k<n_display; k++) {
188 sname[k]=new(char[strlen(q.sname[k])+1]);
189 if (!sname[k]) MemoryError("array of names for sequences to display");
190 strcpy(sname[k],q.sname[k]);
191 }
192 for (int k=0; k<n_display; k++) {
193 seq[k]=new(char[strlen(q.seq[k])+1]);
194 if (!seq[k]) MemoryError("array of names for sequences to display");
195 strcpy(seq[k],q.seq[k]);
196 }
197 ncons=q.ncons;
198 nfirst=q.nfirst;
199 nss_dssp=q.nss_dssp;
200 nsa_dssp=q.nsa_dssp;
201 nss_pred=q.nss_pred;
202 nss_conf=q.nss_conf;
203
204 for (int i=0; i<=L+1; ++i) Neff_M[i]=q.Neff_M[i];
205 for (int i=0; i<=L+1; ++i) Neff_I[i]=q.Neff_I[i];
206 for (int i=0; i<=L+1; ++i) Neff_D[i]=q.Neff_D[i];
207 Neff_HMM=q.Neff_HMM;
208
209 strcpy(longname,q.longname);
210 strcpy(name,q.name);
211 strcpy(fam,q.fam);
212 strcpy(sfam,q.sfam);
213 strcpy(fold,q.fold);
214 strcpy(cl,q.cl);
215 strcpy(file,q.file);
216
217 lamda=q.lamda;
218 mu=q.mu;
219
220 for (int a=0; a<NAA; ++a) pav[a]=q.pav[a];
221 N_in=q.N_in;
222 N_filtered=q.N_filtered;
223 trans_lin=q.trans_lin;
224 return (HMM&) (*this);
225 }
226
227
228 ///////////////////////////////////////////////////////////////////////////////
229 /**
230 * @brief Read an HMM from an HHsearch .hhm file; return 0 at end of file
231 */
232 int
233 HMM::Read(FILE* dbf, char* path)
234 {
235 char line[LINELEN]=""; // input line
236 char str3[8]="",str4[8]=""; // first 3 and 4 letters of input line
237 char* ptr; // pointer for string manipulation
238 int i=0; // index for match state (first=1)
239 int a; // amino acid index
240 static int warn=0;
241
242 trans_lin=0;
243 L=0;
244 Neff_HMM=0;
245 n_display=N_in=N_filtered=0;
246 nss_dssp=nsa_dssp=nss_pred=nss_conf=nfirst=ncons=-1;
247 lamda=mu=0.0;
248 trans_lin=0; // transition probs in log space
249 name[0]=longname[0]=fam[0]='\0';
250 //If at the end of while-loop L is still 0 then we have reached end of db file
251
252 //Do not delete name and seq vectors because their adresses are transferred to hitlist as part of a hit!!
253
254 while (fgetline(line,LINELEN-1,dbf) && !(line[0]=='/' && line[1]=='/'))
255 {
256
257 if (strscn(line)==NULL) continue; // skip lines that contain only white space
258 substr(str3,line,0,2); // copy the first three characters into str3
259 substr(str4,line,0,3); // copy the first four characters into str4
260
261 if (!strncmp("HH",line,2)) continue;
262
263 if (!strcmp("NAME",str4))
264 {
265 ptr=strscn(line+4); //advance to first non-white-space character
266 if (ptr)
267 {
268 strncpy(longname,ptr,DESCLEN-1); //copy full name to longname
269 longname[DESCLEN-1]='\0';
270 strncpy(name,ptr,NAMELEN-1); //copy longname to name...
271 strcut(name); //...cut after first word...
272 }
273 else
274 {
275 strcpy(longname,"undefined");
276 strcpy(name,"undefined");
277 }
278 if (v>=4) cout<<"Reading in HMM "<<name<<":\n";
279 }
280
281 else if (!strcmp("FAM",str3))
282 {
283 ptr=strscn(line+3); //advance to first non-white-space character
284 if (ptr) strncpy(fam,ptr,IDLEN-1); else strcpy(fam,""); //copy family name to basename
285 ScopID(cl,fold,sfam,fam); //get scop classification from basename (e.g. a.1.2.3.4)
286 }
287
288 else if (!strcmp("FILE",str4))
289 {
290 if (path) strncpy(file,path,NAMELEN-1); else *file='\0'; // copy path to file variable
291 ptr=strscn(line+4); //advance to first non-white-space character
292 if (ptr)
293 strncat(file,ptr,NAMELEN-1-strlen(file)); // append file name read from file to path
294 else strcat(file,"*");
295 }
296
297 else if (!strcmp("LENG",str4))
298 {
299 ptr=line+4;
300 L=strint(ptr); //read next integer (number of match states)
301 }
302 else if (!strcmp("FILT",str4) || !strcmp("NSEQ",str4))
303 {
304 ptr=line+4;
305 N_filtered=strint(ptr); //read next integer: number of sequences after filtering
306 N_in=strint(ptr); //read next integer: number of sequences in alignment
307 }
308
309 else if (!strcmp("NEFF",str4) || !strcmp("NAA",str3)) sscanf(line+6,"%f",&Neff_HMM);
310
311 else if (!strcmp("EVD",str3))
312 {
313 // char key[IDLEN];
314 sscanf(line+6,"%f %f",&lamda,&mu);
315 // sscanf(line+22,"%s",key);
316 // lamda_hash.Add(key,lamda);
317 // mu_hash.Add(key,mu);
318 }
319
320 else if (!strcmp("DESC",str4)) continue;
321 else if (!strcmp("COM",str3)) continue;
322 else if (!strcmp("DATE",str4)) continue;
323
324 /////////////////////////////////////////////////////////////////////////////////////
325 // Read template sequences that should get displayed in output alignments
326 else if (!strcmp("SEQ",str3))
327 {
328 //char cur_seq[MAXCOL]=""; //Sequence currently read in
329 char *cur_seq = new(char[par.maxColCnt]); //Sequence currently read in
330 int k; // sequence index; start with -1; after reading name of n'th sequence-> k=n
331 int h; // index for character in input line
332 int l=1; // index of character in sequence seq[k]
333 int i=1; // index of match states in ss_dssp[i] and ss_pred[i] sequence
334 int n_seq=0; // number of sequences to be displayed EXCLUDING ss sequences
335 cur_seq[0]='-'; // overwrite '\0' character at beginning to be able to do strcpy(*,cur_seq)
336 k=-1;
337 while (fgetline(line,LINELEN-1,dbf) && line[0]!='#')
338 {
339 if (v>=4) cout<<"Read from file:"<<line<<"\n"; //DEBUG
340 if (line[0]=='>') //line contains sequence name
341 {
342 if (k>=MAXSEQDIS-1) //maximum number of allowable sequences exceeded
343 {while (fgetline(line,LINELEN-1,dbf) && line[0]!='#'); break;}
344 k++;
345 if (!strncmp(line,">ss_dssp",8)) nss_dssp=k;
346 else if (!strncmp(line,">sa_dssp",8)) nsa_dssp=k;
347 else if (!strncmp(line,">ss_pred",8)) nss_pred=k;
348 else if (!strncmp(line,">ss_conf",8)) nss_conf=k;
349 else if (!strncmp(line,">Cons-",6) || !strncmp(line,">Consensus",10)) ncons=k;
350 else
351 {
352 if (nfirst==-1) nfirst=k;
353 if (n_seq>=par.nseqdis)
354 {while (fgetline(line,LINELEN-1,dbf) && line[0]!='#'); k--; break;}
355 n_seq++;
356 }
357
358 //If this is not the first sequence then store residues of previous sequence
359 if (k>0) {
360 seq[k-1]=new(char[strlen(cur_seq)+1]);
361 if (!seq[k-1]) MemoryError("array of sequences to display");
362 strcpy(seq[k-1],cur_seq);
363 }
364
365 // store sequence name
366 strcut(line+1); //find next white-space character and overwrite it with end-of-string character
367 sname[k] = new (char[strlen(line+1)+1]); //+1 for terminating '\0'
368 if (!sname[k]) MemoryError("array of names for sequences to display");
369 strcpy(sname[k],line+1); //store sequence name in **name
370 l=1; i=1;
371 }
372 else //line contains sequence residues
373 {
374 if (k==-1)
375 {
376 cerr<<endl<<"WARNING: Ignoring following line while reading HMM"<<name<<":\n\'"<<line<<"\'\n";
377 continue;
378 }
379
380 h=0; //counts characters in current line
381
382 // Check whether all characters are correct; store into cur_seq
383 if (k==nss_dssp) // lines with dssp secondary structure states (. - H E C S T G B)
384 {
385 while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1)
386 {
387 if (ss2i(line[h])>=0 && line[h]!='.')
388 {
389 char c=ss2ss(line[h]);
390 cur_seq[l]=c;
391 if (c!='.' && !(c>='a' && c<='z')) ss_dssp[i++]=ss2i(c);
392 l++;
393 }
394 else if (v && ss2i(line[h])==-2)
395 cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<h<<" in line '"<<line<<"' of HMM "<<name<<"\n";
396 h++;
397 }
398 }
399 if (k==nsa_dssp) // lines with dssp secondary solvent accessibility (- A B C D E)
400 {
401 while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1)
402 {
403 if (sa2i(line[h])>=0)
404 {
405 char c=line[h];
406 cur_seq[l]=c;
407 if (c!='.' && !(c>='a' && c<='z')) sa_dssp[i++]=sa2i(c);
408 l++;
409 }
410 else if (v && sa2i(line[h])==-2)
411 cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<h<<" in line '"<<line<<"' of HMM "<<name<<"\n";
412 h++;
413 }
414 }
415 else if (k==nss_pred) // lines with predicted secondary structure (. - H E C)
416 {
417 while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1)
418 {
419 if (ss2i(line[h])>=0 && ss2i(line[h])<=3 && line[h]!='.')
420 {
421 char c=ss2ss(line[h]);
422 cur_seq[l]=c;
423 if (c!='.' && !(c>='a' && c<='z')) ss_pred[i++]=ss2i(c);
424 l++;
425 }
426 else if (v && ss2i(line[h])==-2)
427 cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<h<<" in line '"<<line<<"' of HMM "<<name<<"\n";
428 h++;
429 }
430 }
431 else if (k==nss_conf) // lines with confidence values should contain only 0-9, '-', or '.'
432 {
433 while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1)
434 {
435 if (line[h]=='-' || (line[h]>='0' && line[h]<='9'))
436 {
437 cur_seq[l]=line[h];
438 ss_conf[l]=cf2i(line[h]);
439 l++;
440 }
441 else if (v && cf2i(line[h])==-2)
442 cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<h<<" in line '"<<line<<"' of HMM "<<name<<"\n";
443 h++;
444 }
445 }
446 else // normal line containing residues
447 {
448 while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1)
449 {
450 if (aa2i(line[h])>=0 && line[h]!='.') // ignore '.' and white-space characters ' ', \t and \n (aa2i()==-1)
451 {cur_seq[l]=line[h]; l++;}
452 else if (aa2i(line[h])==-2 && v)
453 cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<h<<" in line '"<<line<<"' of HMM "<<name<<"\n";
454 h++;
455 }
456 }
457 cur_seq[l]='\0'; //Ensure that cur_seq ends with a '\0' character
458
459 } //end else
460 } //while(getline)
461 //If this is not the first sequence some residues have already been read in
462 if (k>=0) {
463 seq[k]=new(char[strlen(cur_seq)+1]);
464 if (!seq[k]) MemoryError("array of sequences to display");
465 strcpy(seq[k],cur_seq);
466 }
467 n_display=k+1;
468
469 // DEBUG
470 if (v>=4)
471 {
472 printf("nss_dssp=%i nsa_dssp=%i nss_pred=%i nss_conf=%i nfirst=%i\n",nss_dssp,nsa_dssp,nss_pred,nss_conf,nfirst);
473 for (k=0; k<n_display; k++)
474 {
475 int j;
476 cout<<">"<<sname[k]<<"(k="<<k<<")\n";
477 if (k==nss_dssp) {for (j=1; j<=L; j++) cout<<char(i2ss(ss_dssp[j]));}
478 else if (k==nsa_dssp) {for (j=1; j<=L; j++) cout<<char(i2sa(sa_dssp[j]));}
479 else if (k==nss_pred) {for (j=1; j<=L; j++) cout<<char(i2ss(ss_pred[j]));}
480 else if (k==nss_conf) {for (j=1; j<=L; j++) cout<<int(ss_conf[j]-1);}
481 else {for (j=1; j<=L; j++) cout<<seq[k][j];}
482 cout<<"\n";
483 }
484 }
485
486 } //end if("SEQ")
487
488 /////////////////////////////////////////////////////////////////////////////////////
489 // Read average amino acid frequencies for HMM
490 else if (!strcmp("FREQ",str4))
491 {
492 fprintf(stderr,"Error: hhm file has obsolete format.\n");
493 fprintf(stderr,"Please use hhmake version > 1.1 to generate hhm files.\n");
494 exit(1);
495 }
496
497 else if (!strcmp("AVER",str4)) {} // AVER line scrapped
498 else if (!strcmp("NULL",str4))
499 {
500 ptr=line+4;
501 for (a=0; a<20 && ptr; ++a)
502 //s2[a]: transform amino acids Sorted by alphabet -> internal numbers for amino acids
503 pb[s2a[a]] = (float) fpow2(float(-strinta(ptr))/HMMSCALE);
504 if (!ptr) return Warning(dbf,line,name);
505 if (v>=4)
506 {
507 printf("\nNULL ");
508 for (a=0; a<20; ++a) printf("%5.1f ",100.*pb[s2a[a]]);
509 printf("\n");
510 }
511 }
512
513 /////////////////////////////////////////////////////////////////////////////////////
514 // Read transition probabilities from start state
515 else if (!strcmp("HMM",str3))
516 {
517 fgetline(line,LINELEN-1,dbf); // Skip line with amino acid labels
518 fgetline(line,LINELEN-1,dbf); // Skip line with transition labels
519 ptr=line;
520 for (a=0; a<=D2D && ptr; ++a)
521 tr[0][a] = float(-strinta(ptr))/HMMSCALE; //store transition probabilites as log2 values
522 // strinta returns next integer in string and puts ptr to first char
523 // after the integer. Returns -99999 if '*' is found.
524 // ptr is set to 0 if no integer is found after ptr.
525 Neff_M[0] = float(strinta(ptr))/HMMSCALE; // Read eff. number of sequences with M->? transition
526 Neff_I[0] = float(strinta(ptr))/HMMSCALE; // Read eff. number of sequences with I->? transition
527 Neff_D[0] = float(strinta(ptr))/HMMSCALE; // Read eff. number of sequences with D->? transition
528 if (!ptr) return Warning(dbf,line,name);
529
530 /////////////////////////////////////////////////////////////////////////////////////
531 // Read columns of HMM
532 int next_i=0; // index of next column
533 while (fgetline(line,LINELEN-2,dbf) && !(line[0]=='/' && line[1]=='/') && line[0]!='#')
534 {
535 if (strscn(line)==NULL) continue; // skip lines that contain only white space
536
537 // Read in AA probabilities
538 ptr=line+1;
539 int prev_i = next_i;
540 next_i = strint(ptr); ++i;
541 if (v && next_i!=prev_i+1)
542 if (++warn<=5)
543 {
544 cerr<<endl<<"WARNING: in HMM "<<name<<" state "<<prev_i<<" is followed by state "<<next_i<<"\n";
545 if (warn==5) cerr<<endl<<"WARNING: further warnings while reading HMMs will be suppressed.\n";
546 }
547 if (i>L)
548 {
549 cerr<<endl<<"WARNING: in HMM "<<name<<" there are more columns than the stated length "<<L<<". Skipping HMM\n";
550 return 2;
551 }
552 if (i>=/*MAXRES*/par.maxResLen-2)
553 {
554 fgetline(line,LINELEN-1,dbf); // Skip line
555 continue;
556 }
557
558 for (a=0; a<20 && ptr; ++a)
559 // f[i][s2a[a]] = (float)pow(2.,float(-strinta(ptr))/HMMSCALE);
560 f[i][s2a[a]] = fpow2(float(-strinta(ptr))/HMMSCALE); // speed-up ~5 s for 10000 SCOP domains
561
562 //s2a[a]: transform amino acids Sorted by alphabet -> internal numbers for amino acids
563 l[i]=strint(ptr);
564 if (!ptr) return Warning(dbf,line,name);
565 if (v>=4)
566 {
567 printf("%s",line);
568 printf("%6i ",i);
569 for (a=0; a<20; ++a) printf("%5.1f ",100*f[i][s2a[a]]);
570 printf("%5i",l[i]);
571 printf("\n");
572 }
573
574 // Read transition probabilities
575 fgetline(line,LINELEN-1,dbf); // Skip line with amino acid labels
576 if (line[0]!=' ' && line[0]!='\t') return Warning(dbf,line,name);
577 ptr=line;
578 for (a=0; a<=D2D && ptr; ++a)
579 tr[i][a] = float(-strinta(ptr))/HMMSCALE; //store transition prob's as log2-values
580 Neff_M[i] = float(strinta(ptr))/HMMSCALE; // Read eff. number of sequences with M->? transition
581 Neff_I[i] = float(strinta(ptr))/HMMSCALE; // Read eff. number of sequences with I->? transition
582 Neff_D[i] = float(strinta(ptr))/HMMSCALE; // Read eff. number of sequences with D->? transition
583 if (!ptr) return Warning(dbf,line,name);
584 if (v>=4)
585 {
586 printf(" ");
587 for (a=0; a<=D2D; ++a) printf("%5.1f ",100*fpow2(tr[i][a]));
588 printf("%5.1f %5.1f %5.1f \n",Neff_M[i],Neff_I[i],Neff_D[i]);
589 }
590 }
591 if (line[0]=='/' && line[1]=='/') break;
592 }
593 else if (v) cerr<<endl<<"WARNING: Ignoring line\n\'"<<line<<"\'\nin HMM "<<name<<"\n";
594
595 } //while(getline)
596
597 if (L==0) return 0; //End of db file -> stop reading in
598
599 // Set coefficients of EVD (= 0.0 if not calibrated for these parameters)
600 // lamda = lamda_hash.Show(par.Key());
601 // mu = mu_hash.Show(par.Key());
602 if (lamda && v>=3) printf("HMM %s is already calibrated: lamda=%-5.3f, mu=%-5.2f\n",name,lamda,mu);
603
604 if (v && i!=L) cerr<<endl<<"Warning: in HMM "<<name<<" there are only "<<i<<" columns while the stated length is "<<L<<"\n";
605 if (v && i>/*MAXRES*/par.maxResLen-2) {i=/*MAXRES*/par.maxResLen-2; cerr<<endl<<"WARNING: maximum number "<</*MAXRES*/par.maxResLen-2<<" of residues exceeded while reading HMM "<<name<<"\n";}
606 if (v && !i) cerr<<endl<<"WARNING: HMM "<<name<<" contains no match states. Check the alignment that gave rise to this HMM.\n";
607 if (v>=2) cout<<"Read in HMM "<<name<<" with "<<L<<" match states and effective number of sequences = "<<Neff_HMM<<"\n";
608 L = i;
609
610 // Set emission probabilities of zero'th (begin) state and L+1st (end) state to background probabilities
611 for (a=0; a<20; ++a) f[0][a]=f[L+1][a]=pb[a];
612 Neff_M[L+1]=1.0f;
613 Neff_I[L+1]=Neff_D[L+1]=0.0f;
614
615 return 1; //return status: ok
616
617 } /* this is the end of HMM::Read() */
618
619
620 /////////////////////////////////////////////////////////////////////////////////////
621 /**
622 * @brief Read an HMM from a HMMer .hmm file; return 0 at end of file
623 */
624 int
625 HMM::ReadHMMer(FILE* dbf, char* filestr)
626 {
627 char line[LINELEN]=""; // input line
628 char desc[DESCLEN]=""; // description of family
629 char str4[5]=""; // first 4 letters of input line
630 char* ptr; // pointer for string manipulation
631 int i=0; // index for match state (first=1)
632 int a; // amino acid index
633 char dssp=0; // 1 if a consensus SS has been found in the transition prob lines
634 char annot=0; // 1 if at least one annotation character in insert lines is ne '-' or ' '
635 int k=0; // index for seq[k]
636 static char ignore_hmmer_cal = 0;
637 char* annotchr; // consensus amino acids in ASCII format, or, in HMMER format, the reference annotation character in insert line
638 //annotchr = new char[MAXRES]; // consensus amino acids in ASCII format, or, in HMMER format, the reference annotation character in insert line
639 annotchr = new char[par.maxResLen]; // consensus amino acids in ASCII format, or, in HMMER format, the reference annotation character in insert line
640 static int warn=0;
641 int iAlpha = 20; /* size of alphabet, default is protein = 20 */
642 double dAlphaInv = 1.00 / (double)(iAlpha); /* weight of AA */
643
644 trans_lin=0;
645 L=0;
646 Neff_HMM=0;
647 n_display=N_in=N_filtered=0;
648 nss_dssp=nsa_dssp=nss_pred=nss_conf=nfirst=ncons=-1;
649 lamda=mu=0.0;
650 trans_lin=0; // transition probs in log space
651 name[0]=longname[0]=desc[0]=fam[0]='\0';
652 //If at the end of while-loop L is still 0 then we have reached end of db file
653
654 // Do not delete name and seq vectors because their adresses are transferred to hitlist as part of a hit!!
655
656 while (fgetline(line,LINELEN-1,dbf) && !(line[0]=='/' && line[1]=='/'))
657 {
658
659 if (strscn(line)==NULL) continue; // skip lines that contain only white space
660 if (!strncmp("HMMER",line,5)) continue;
661
662 substr(str4,line,0,3); // copy the first four characters into str4
663
664 if (!strcmp("NAME",str4) && name[0]=='\0')
665 {
666 ptr=strscn(line+4); // advance to first non-white-space character
667 strncpy(name,ptr,NAMELEN-1); // copy full name to name
668 strcut(name); // ...cut after first word...
669 if (v>=4) cout<<"Reading in HMM "<<name<<":\n";
670 }
671
672 else if (!strcmp("ACC ",str4))
673 {
674 ptr=strscn(line+4); // advance to first non-white-space character
675 strncpy(longname,ptr,DESCLEN-1); // copy Accession id to longname...
676 }
677
678 else if (!strcmp("DESC",str4))
679 {
680 ptr=strscn(line+4); // advance to first non-white-space character
681 if (ptr)
682 {
683 strncpy(desc,ptr,DESCLEN-1); // copy description to name...
684 desc[DESCLEN-1]='\0';
685 strcut(ptr); // ...cut after first word...
686 }
687 if (!ptr || ptr[1]!='.' || strchr(ptr+3,'.')==NULL) strcpy(fam,""); else strcpy(fam,ptr); // could not find two '.' in name?
688 }
689
690 else if (!strcmp("LENG",str4))
691 {
692 ptr=line+4;
693 L=strint(ptr); //read next integer (number of match states)
694 }
695
696 else if (!strcmp("ALPH",str4)) {
697
698 ptr=strscn(line+4);
699
700 if (0 == strcmp(ptr, "Amino")){
701 iAlpha = 20;
702 }
703 else if (0 == strcmp(ptr, "Nucleic")){
704 iAlpha = 4;
705 printf("%s:%s:%d: WARNING: HMM reading does not work for DNA/RNA\n",
706 __FUNCTION__, __FILE__, __LINE__);
707 }
708 else {
709 return Warning(dbf,line,name);
710 }
711 dAlphaInv = 1.00 / (double)(iAlpha);
712 //continue;
713 }
714 else if (!strcmp("RF ",str4)) continue;
715 else if (!strcmp("CS ",str4)) continue;
716 else if (!strcmp("MAP ",str4)) continue;
717 else if (!strcmp("COM ",str4)) continue;
718 else if (!strcmp("NSEQ",str4))
719 {
720 ptr=line+4;
721 N_in=N_filtered=strint(ptr); //read next integer: number of sequences after filtering
722 }
723
724 else if (!strcmp("DATE",str4)) continue;
725 else if (!strncmp("CKSUM ",line,5)) continue;
726 else if (!strcmp("GA ",str4)) continue;
727 else if (!strcmp("TC ",str4)) continue;
728 else if (!strcmp("NC ",str4)) continue;
729
730 else if (!strncmp("SADSS",line,5))
731 {
732 if (nsa_dssp<0)
733 {
734 nsa_dssp=k++;
735 seq[nsa_dssp] = new(char[/*MAXRES*/par.maxResLen+2]);
736 sname[nsa_dssp] = new(char[15]);
737 strcpy(seq[nsa_dssp]," ");
738 strcpy(sname[nsa_dssp],"sa_dssp");
739
740 }
741 ptr=strscn(line+5);
742 if (ptr)
743 {
744 strcut(ptr);
745 if (strlen(seq[nsa_dssp])+strlen(ptr)>=(unsigned)(/*MAXRES*/par.maxResLen))
746 printf("\nWARNING: HMM %s has SADSS records with more than %i residues.\n",name,/*MAXRES*/par.maxResLen);
747 else strcat(seq[nsa_dssp],ptr);
748 }
749 }
750
751 else if (!strncmp("SSPRD",line,5))
752 {
753 if (nss_pred<0)
754 {
755 nss_pred=k++;
756 seq[nss_pred] = new(char[/*MAXRES*/par.maxResLen+2]);
757 sname[nss_pred] = new(char[15]);
758 strcpy(seq[nss_pred]," ");
759 strcpy(sname[nss_pred],"ss_pred");
760
761 }
762 ptr=strscn(line+5);
763 if (ptr)
764 {
765 strcut(ptr);
766 if (strlen(seq[nss_pred])+strlen(ptr)>=(unsigned)(/*MAXRES*/par.maxResLen))
767 printf("\nWARNING: HMM %s has SSPRD records with more than %i residues.\n",name,/*MAXRES*/par.maxResLen);
768 else strcat(seq[nss_pred],ptr);
769 }
770 }
771
772 else if (!strncmp("SSCON",line,5))
773 {
774 if (nss_conf<0)
775 {
776 nss_conf=k++;
777 seq[nss_conf] = new(char[/*MAXRES*/par.maxResLen+2]);
778 sname[nss_conf] = new(char[15]);
779 strcpy(seq[nss_conf]," ");
780 strcpy(sname[nss_conf],"ss_conf");
781 }
782 ptr=strscn(line+5);
783 if (ptr)
784 {
785 strcut(ptr);
786 if (strlen(seq[nss_conf])+strlen(ptr)>=(unsigned)(/*MAXRES*/par.maxResLen))
787 printf("\nWARNING: HMM %s has SSPRD records with more than %i residues.\n",name,/*MAXRES*/par.maxResLen);
788 else strcat(seq[nss_conf],ptr);
789 }
790 }
791
792 else if (!strncmp("SSCIT",line,5)) continue;
793 else if (!strcmp("XT ",str4)) continue;
794 else if (!strcmp("NULT",str4)) continue;
795
796 else if (!strcmp("NULE",str4))
797 {
798 ptr=line+4;
799 for (a=0; (a < iAlpha) && ptr; ++a){
800 /* FIXME: FS introduced alphabet size (was '20')
801 and dAlphaInv (was '0.05' = 1/20) */
802 //s2a[a]: transform amino acids Sorted by alphabet -> internal numbers for amino acids
803 pb[s2a[a]] = (float) dAlphaInv * fpow2(float(strinta(ptr,-99999))/HMMSCALE); /* dAlphaInv */
804 }
805 for (a = iAlpha; a < 20; a++){
806 pb[s2a[a]] = 0.0;
807 }
808 if (!ptr) return Warning(dbf,line,name);
809 if (v>=4)
810 {
811 printf("\nNULL ");
812 for (a=0; a<iAlpha; ++a) { /* FIXME: FS introduced iAlpha, was '20' */
813 printf("%5.1f ",100.*pb[s2a[a]]);
814 }
815 printf("\n");
816 }
817 }
818
819 else if (!strcmp("EVD ",str4))
820 {
821 char* ptr=line+4;
822 ptr = strscn(ptr);
823 sscanf(ptr,"%f",&lamda);
824 ptr = strscn(ptr);
825 sscanf(ptr,"%f",&mu);
826 if (lamda<0)
827 {
828 if (v>=2 && ignore_hmmer_cal==0)
829 cerr<<endl<<"Warning: some HMMs have been calibrated with HMMER's 'hmmcalibrate'. These calibrations will be ignored\n";
830 ignore_hmmer_cal=1;
831 mu = lamda = 0.0;
832 }
833 }
834
835 /////////////////////////////////////////////////////////////////////////////////////
836 // Read transition probabilities from start state
837 else if (!strncmp("HMM",line,3))
838 {
839 fgetline(line,LINELEN-1,dbf); // Skip line with amino acid labels
840 fgetline(line,LINELEN-1,dbf); // Skip line with transition labels
841 ptr=line;
842 for (a=0; a<=M2D && ptr; ++a)
843 tr[0][a] = float(strinta(ptr,-99999))/HMMSCALE; //store transition probabilites as log2 values
844 // strinta returns next integer in string and puts ptr to first char
845 // after the integer. Returns -99999 if '*' is found.
846 // ptr is set to 0 if no integer is found after ptr.
847 tr[0][I2M] = tr[0][D2M] = 0.0;
848 tr[0][I2I] = tr[0][D2D] = -99999.0;
849 if (!ptr) return Warning(dbf,line,name);
850 if (v>=4)
851 {
852 printf(" ");
853 for (a=0; a<=D2D && ptr; ++a) printf("%5.1f ",100*fpow2(tr[i][a]));
854 printf("\n");
855 }
856
857 // Prepare to store DSSP states (if there are none, delete afterwards)
858 nss_dssp=k++;
859 seq[nss_dssp] = new(char[/*MAXRES*/par.maxResLen+2]);
860 sname[nss_dssp] = new(char[15]);
861 strcpy(sname[nss_dssp],"ss_dssp");
862
863 /////////////////////////////////////////////////////////////////////////////////////
864 // Read columns of HMM
865 int next_i=0; // index of next column
866 while (fgetline(line,LINELEN-1,dbf) && !(line[0]=='/' && line[1]=='/') && line[0]!='#')
867 {
868 if (strscn(line)==NULL) continue; // skip lines that contain only white space
869
870 // Read in AA probabilities
871 ptr=line;
872 int prev_i = next_i;
873 next_i = strint(ptr); ++i;
874 if (v && next_i!=prev_i+1)
875 if (++warn<5)
876 {
877 cerr<<endl<<"WARNING: in HMM "<<name<<" state "<<prev_i<<" is followed by state "<<next_i<<"\n";
878 if (warn==5) cerr<<endl<<"WARNING: further warnings while reading HMMs will be suppressed.\n";
879 }
880 if (i>L)
881 {
882 cerr<<endl<<"Error: in HMM "<<name<<" there are more columns than the stated length "<<L<<"\n";
883 return 2;
884 }
885 if (i>L && v)
886 cerr<<endl<<"WARNING: in HMM "<<name<<" there are more columns than the stated length "<<L<<"\n";
887 if (i>=/*MAXRES*/par.maxResLen-2)
888 {
889 fgetline(line,LINELEN-1,dbf); // Skip two lines
890 fgetline(line,LINELEN-1,dbf);
891 continue;
892 }
893
894 for (a=0; (a<iAlpha) && ptr; ++a){ /* FIXME: FS introduced iAlpha, was '20' */
895 f[i][s2a[a]] = (float) pb[s2a[a]]*fpow2(float(strinta(ptr,-99999))/HMMSCALE);
896 //s2a[a]: transform amino acids Sorted by alphabet -> internal numbers for amino acids
897 }
898 for (a = iAlpha; a < 20; a++){
899 f[i][s2a[a]] = 0.0;
900 }
901 if (!ptr) return Warning(dbf,line,name);
902 if (v>=4)
903 {
904 printf("%6i ",i);
905 for (a=0; a<iAlpha; ++a) { /* FIXME: FS introduced iAlpha, was '20' */
906 printf("%5.1f ",100*f[i][s2a[a]]);
907 }
908 printf("\n");
909 }
910
911 // Read insert emission line
912 fgetline(line,LINELEN-1,dbf);
913 ptr = strscn(line);
914 if (!ptr) return Warning(dbf,line,name);
915 annotchr[i]=uprchr(*ptr);
916 if (*ptr!='-' && *ptr!=' ') annot=1;
917
918 // Read annotation character and seven transition probabilities
919 fgetline(line,LINELEN-1,dbf);
920 ptr = strscn(line);
921 switch (*ptr)
922 {
923 case 'H':
924 ss_dssp[i]=1;
925 seq[nss_dssp][i]=*ptr;
926 dssp=1;
927 break;
928 case 'E':
929 ss_dssp[i]=2;
930 seq[nss_dssp][i]=*ptr;
931 dssp=1;
932 break;
933 case 'C':
934 ss_dssp[i]=3;
935 seq[nss_dssp][i]=*ptr;
936 dssp=1;
937 break;
938 case 'S':
939 ss_dssp[i]=4;
940 seq[nss_dssp][i]=*ptr;
941 dssp=1;
942 break;
943 case 'T':
944 ss_dssp[i]=5;
945 seq[nss_dssp][i]=*ptr;
946 dssp=1;
947 break;
948 case 'G':
949 ss_dssp[i]=6;
950 seq[nss_dssp][i]=*ptr;
951 dssp=1;
952 break;
953 case 'B':
954 ss_dssp[i]=7;
955 seq[nss_dssp][i]=*ptr;
956 dssp=1;
957 break;
958 case 'I':
959 dssp=1;
960 case '~':
961 ss_dssp[i]=3;
962 seq[nss_dssp][i]=*ptr;
963 break;
964 case '-':
965 default:
966 ss_dssp[i]=0;
967 seq[nss_dssp][i]=*ptr;
968 break;
969
970 }
971
972 ptr+=2;
973 for (a=0; a<=D2D && ptr; ++a)
974 tr[i][a] = float(strinta(ptr,-99999))/HMMSCALE; //store transition prob's as log2-values
975 if (!ptr) return Warning(dbf,line,name);
976 if (v>=4)
977 {
978 printf(" ");
979 for (a=0; a<=D2D; ++a) printf("%5.1f ",100*fpow2(tr[i][a]));
980 printf("\n");
981 }
982 }
983
984 if (line[0]=='/' && line[1]=='/') break;
985
986 } /* strncmp("HMM") */
987
988 } //while(getline)
989
990 if (L==0) return 0; //End of db file -> stop reading in
991
992 // Set coefficients of EVD (= 0.0 if not calibrated for these parameters)
993 // lamda = lamda_hash.Show(par.Key());
994 // mu = mu_hash.Show(par.Key());
995 if (lamda && v>=2) printf("HMM %s is already calibrated: lamda=%-5.3f, mu=%-5.2f\n",name,lamda,mu);
996
997 if (v && i!=L) cerr<<endl<<"Warning: in HMM "<<name<<" there are only "<<i<<" columns while the stated length is "<<L<<"\n";
998 if (v && i>=/*MAXRES*/par.maxResLen-2) {i=/*MAXRES*/par.maxResLen-2; cerr<<endl<<"WARNING: maximum number "<</*MAXRES*/par.maxResLen-2<<" of residues exceeded while reading HMM "<<name<<"\n";}
999 if (v && !i) cerr<<endl<<"WARNING: HMM "<<name<<" contains no match states. Check the alignment that gave rise to this HMM.\n";
1000 L = i;
1001
1002 if (strlen(longname)>0) strcat(longname," ");
1003 strncat(longname,name,DESCLEN-strlen(longname)-1); // longname = ACC NAME DESC
1004 if (strlen(name)>0) strcat(longname," ");
1005 strncat(longname,desc,DESCLEN-strlen(longname)-1);
1006 longname[DESCLEN-1]='\0';
1007 ScopID(cl,fold,sfam,fam);// get scop classification from basename (e.g. a.1.2.3.4)
1008 RemoveExtension(file,filestr); // copy name of dbfile without extension into 'file'
1009
1010 // Secondary structure
1011 if (!dssp)
1012 {
1013 // remove dssp sequence
1014 // memory that had been allocated in case ss_dssp was given needs to be freed
1015 delete[] seq[nss_dssp]; seq[nss_dssp] = NULL;
1016 // memory that had been allocated in case ss_dssp was given needs to be freed
1017 delete[] sname[nss_dssp]; sname[nss_dssp] = NULL;
1018 nss_dssp=-1;
1019 k--;
1020 }
1021 if (nss_pred>=0)
1022 {
1023 for (i=1; i<=L; ++i) ss_pred[i] = ss2i(seq[nss_pred][i]);
1024 if (nss_conf>=0)
1025 for (i=1; i<=L; ++i) ss_conf[i] = cf2i(seq[nss_conf][i]);
1026 else
1027 for (i=1; i<=L; ++i) ss_conf[i] = 5;
1028 }
1029
1030 // Copy query (first sequence) and consensus residues?
1031 if (par.showcons)
1032 {
1033 sname[k]=new(char[10]);
1034 strcpy(sname[k],"Consensus");
1035 sname[k+1]=new(char[strlen(longname)+1]);
1036 strcpy(sname[k+1],longname);
1037 seq[k]=new(char[L+2]);
1038 seq[k][0]=' ';
1039 seq[k][L+1]='\0';
1040 seq[k+1]=new(char[L+2]);
1041 seq[k+1][0]=' ';
1042 seq[k+1][L+1]='\0';
1043 for (i=1; i<=L; ++i)
1044 {
1045 float pmax=0.0;
1046 int amax=0;
1047 for (a=0; a<NAA; ++a)
1048 if (f[i][a]>pmax) {amax=a; pmax=f[i][a];}
1049 if (pmax>0.6) seq[k][i]=i2aa(amax);
1050 else if (pmax>0.4) seq[k][i]=lwrchr(i2aa(amax));
1051 else seq[k][i]='x';
1052 seq[k+1][i]=i2aa(amax);
1053 }
1054 ncons=k++; // nfirst is set later!
1055 }
1056 else
1057 {
1058 sname[k]=new(char[strlen(longname)+1]);
1059 /* FIXME valgrind says bytes get lost here during hmm iteration --
1060 fixed in HMM::ClobberGlobal(), I (FS) think */
1061 strcpy(sname[k],longname);
1062 seq[k]=new(char[L+2]);
1063 seq[k][0]=' ';
1064 seq[k][L+1]='\0';
1065 }
1066
1067 if (annot) // read in some annotation characters?
1068 {
1069 annotchr[0]=' ';
1070 annotchr[L+1]='\0';
1071 strcpy(seq[k],annotchr); // overwrite the consensus sequence with the annotation characters
1072 }
1073 else if (!par.showcons) // we have not yet calculated the consensus, but we need it now as query (first sequence)
1074 {
1075 /* FIXME: FS set ncons=k
1076 don't understand why it is not set but seem to need it */
1077 ncons = k;
1078 for (i=1; i<=L; ++i)
1079 {
1080 float pmax=0.0;
1081 int amax=0;
1082 for (a=0; a<NAA; ++a)
1083 if (f[i][a]>pmax) {amax=a; pmax=f[i][a];}
1084 seq[k][i]=i2aa(amax);
1085 }
1086 }
1087 // printf("%i query name=%s seq=%s\n",n,sname[n],seq[n]);
1088 nfirst=k++;
1089
1090 n_display=k;
1091
1092 // Calculate overall Neff_HMM
1093 Neff_HMM=0;
1094 for (i=1; i<=L; ++i)
1095 {
1096 float S=0.0;
1097 for (a=0; a<iAlpha; ++a) { /* FIXME: FS introduced iAlpha, was '20' */
1098 if (f[i][a]>1E-10) S-=f[i][a]*fast_log2(f[i][a]);
1099 }
1100 Neff_HMM+=(float) fpow2(S);
1101 }
1102 Neff_HMM/=L;
1103 for (i=0; i<=L; ++i) Neff_M[i] = Neff_I[i] = Neff_D[i] = 10.0; // to add only little additional pseudocounts!
1104 if (v>=2)
1105 cout<<"Read in HMM "<<name<<" with "<<L<<" match states and effective number of sequences = "<<Neff_HMM<<"\n";
1106
1107 // Set emission probabilities of zero'th (begin) state and L+1st (end) state to background probabilities
1108 for (a=0; a<iAlpha; ++a) { /* FIXME: FS introduced iAlpha, was '20' */
1109 f[0][a]=f[L+1][a]=pb[a];
1110 }
1111 delete[] annotchr; annotchr = NULL;
1112
1113 return 1; //return status: ok
1114
1115 } /* this is the end of HMM::ReadHMMer() */
1116
1117
1118
1119 /////////////////////////////////////////////////////////////////////////////////////
1120 /**
1121 * @brief Read an HMM from a HMMER3 .hmm file; return 0 at end of file
1122 */
1123 int
1124 HMM::ReadHMMer3(FILE* dbf, char* filestr)
1125 {
1126 char line[LINELEN]=""; // input line
1127 char desc[DESCLEN]=""; // description of family
1128 char str4[5]=""; // first 4 letters of input line
1129 char* ptr; // pointer for string manipulation
1130 int i=0; // index for match state (first=1)
1131 int a; // amino acid index
1132 char dssp=0; // 1 if a consensus SS has been found in the transition prob lines
1133 char annot=0; // 1 if at least one annotation character in insert lines is ne '-' or ' '
1134 int k=0; // index for seq[k]
1135 char* annotchr; // consensus amino acids in ASCII format, or, in HMMER format, the reference annotation character in insert line
1136 //annotchr = new char[MAXRES]; // consensus amino acids in ASCII format, or, in HMMER format, the reference annotation character in insert line
1137 annotchr = new char[par.maxResLen]; // consensus amino acids in ASCII format, or, in HMMER format, the reference annotation character in insert line
1138 static int warn=0;
1139 int iAlpha = 20; /* size of alphabet, default is protein = 20 */
1140 double dAlphaInv = 1.00 / (double)(iAlpha); /* weight of AA */
1141
1142 trans_lin=0;
1143 L=0;
1144 Neff_HMM=0;
1145 n_seqs=n_display=N_in=N_filtered=0;
1146 nss_dssp=nsa_dssp=nss_pred=nss_conf=nfirst=ncons=-1;
1147 lamda=mu=0.0;
1148 trans_lin=0; // transition probs in log space
1149 name[0]=longname[0]=desc[0]=fam[0]='\0';
1150 //If at the end of while-loop L is still 0 then we have reached end of db file
1151
1152 // Do not delete name and seq vectors because their adresses are transferred to hitlist as part of a hit!!
1153
1154
1155 while (fgetline(line,LINELEN-1,dbf) && !(line[0]=='/' && line[1]=='/'))
1156 {
1157
1158 if (strscn(line)==NULL) continue; // skip lines that contain only white space
1159 if (!strncmp("HMMER",line,5)) continue;
1160
1161 substr(str4,line,0,3); // copy the first four characters into str4
1162
1163 if (!strcmp("NAME",str4) && name[0]=='\0')
1164 {
1165 ptr=strscn(line+4); // advance to first non-white-space character
1166 strncpy(name,ptr,NAMELEN-1); // copy full name to name
1167 strcut(name); // ...cut after first word...
1168 if (v>=4) cout<<"Reading in HMM "<<name<<":\n";
1169 }
1170
1171 else if (!strcmp("ACC ",str4))
1172 {
1173 ptr=strscn(line+4); // advance to first non-white-space character
1174 strncpy(longname,ptr,DESCLEN-1); // copy Accession id to longname...
1175 }
1176
1177 else if (!strcmp("DESC",str4))
1178 {
1179 ptr=strscn(line+4); // advance to first non-white-space character
1180 if (ptr)
1181 {
1182 strncpy(desc,ptr,DESCLEN-1); // copy description to name...
1183 desc[DESCLEN-1]='\0';
1184 strcut(ptr); // ...cut after first word...
1185 }
1186 if (!ptr || ptr[1]!='.' || strchr(ptr+3,'.')==NULL) strcpy(fam,""); else strcpy(fam,ptr); // could not find two '.' in name?
1187 }
1188
1189 else if (!strcmp("LENG",str4))
1190 {
1191 ptr=line+4;
1192 L=strint(ptr); //read next integer (number of match states)
1193 }
1194
1195 else if (!strcmp("ALPH",str4)) {
1196
1197 ptr=strscn(line+4);
1198
1199 if (0 == strcmp(ptr, "amino")){
1200 iAlpha = 20;
1201 }
1202 else if (0 == strcmp(ptr, "Nucleic")){
1203 iAlpha = 4;
1204 printf("%s:%s:%d: WARNING: HMM reading does not work for DNA/RNA\n",
1205 __FUNCTION__, __FILE__, __LINE__);
1206 }
1207 else {
1208 return Warning(dbf,line,name);
1209 }
1210 dAlphaInv = 1.00 / (double)(iAlpha);
1211 //continue;
1212 }
1213 else if (!strcmp("RF ",str4)) continue;
1214 else if (!strcmp("CS ",str4)) continue;
1215 else if (!strcmp("MAP ",str4)) continue;
1216 else if (!strcmp("COM ",str4)) continue;
1217 else if (!strcmp("NSEQ",str4))
1218 {
1219 ptr=line+4;
1220 N_in=N_filtered=strint(ptr); //read next integer: number of sequences after filtering
1221 }
1222
1223 else if (!strcmp("DATE",str4)) continue;
1224 else if (!strncmp("CKSUM ",line,5)) continue;
1225 else if (!strcmp("GA ",str4)) continue;
1226 else if (!strcmp("TC ",str4)) continue;
1227 else if (!strcmp("NC ",str4)) continue;
1228
1229 //////////////////////////////////////////////////////////////////////////////////////////////////////
1230 // Still needed???
1231
1232 else if (!strncmp("SADSS",line,5))
1233 {
1234 if (nsa_dssp<0)
1235 {
1236 nsa_dssp=k++;
1237 seq[nsa_dssp] = new(char[/*MAXRES*/par.maxResLen+2]);
1238 sname[nsa_dssp] = new(char[15]);
1239 strcpy(seq[nsa_dssp]," ");
1240 strcpy(sname[nsa_dssp],"sa_dssp");
1241
1242 }
1243 ptr=strscn(line+5);
1244 if (ptr)
1245 {
1246 strcut(ptr);
1247 if (strlen(seq[nsa_dssp])+strlen(ptr)>=(unsigned)(/*MAXRES*/par.maxResLen))
1248 printf("\nWARNING: HMM %s has SADSS records with more than %i residues.\n",name,/*MAXRES*/par.maxResLen);
1249 else strcat(seq[nsa_dssp],ptr);
1250 }
1251 }
1252
1253 else if (!strncmp("SSPRD",line,5))
1254 {
1255 if (nss_pred<0)
1256 {
1257 nss_pred=k++;
1258 seq[nss_pred] = new(char[/*MAXRES*/par.maxResLen+2]);
1259 sname[nss_pred] = new(char[15]);
1260 strcpy(seq[nss_pred]," ");
1261 strcpy(sname[nss_pred],"ss_pred");
1262
1263 }
1264 ptr=strscn(line+5);
1265 if (ptr)
1266 {
1267 strcut(ptr);
1268 if (strlen(seq[nss_pred])+strlen(ptr)>=(unsigned)(/*MAXRES*/par.maxResLen))
1269 printf("\nWARNING: HMM %s has SSPRD records with more than %i residues.\n",name,/*MAXRES*/par.maxResLen);
1270 else strcat(seq[nss_pred],ptr);
1271 }
1272 }
1273
1274 else if (!strncmp("SSCON",line,5))
1275 {
1276 if (nss_conf<0)
1277 {
1278 nss_conf=k++;
1279 seq[nss_conf] = new(char[/*MAXRES*/par.maxResLen+2]);
1280 sname[nss_conf] = new(char[15]);
1281 strcpy(seq[nss_conf]," ");
1282 strcpy(sname[nss_conf],"ss_conf");
1283 }
1284 ptr=strscn(line+5);
1285 if (ptr)
1286 {
1287 strcut(ptr);
1288 if (strlen(seq[nss_conf])+strlen(ptr)>=(unsigned)(/*MAXRES*/par.maxResLen))
1289 printf("\nWARNING: HMM %s has SSPRD records with more than %i residues.\n",name,/*MAXRES*/par.maxResLen);
1290 else strcat(seq[nss_conf],ptr);
1291 }
1292 }
1293
1294 else if (!strncmp("SSCIT",line,5)) continue;
1295 else if (!strcmp("XT ",str4)) continue;
1296 //////////////////////////////////////////////////////////////////////////////////////////////////////
1297 else if (!strncmp("STATS LOCAL",line,11)) continue;
1298
1299 else if (!strcmp("EFFN",str4))
1300 {
1301 ptr=line+4;
1302 float effn = strflt(ptr);
1303 // Calculate Neff_HMM by using f(x) = ax^0.1 + bx^0.5 + cx + d (fitted with scop25 dataset)
1304 Neff_HMM = -1.403534 * pow(effn, 0.1) + 4.428118 * pow(effn, 0.5) - 0.2885410 * effn - 1.108568;
1305 }
1306
1307 /////////////////////////////////////////////////////////////////////////////////////
1308 // Read transition probabilities from start state
1309 else if (!strncmp("HMM",line,3))
1310 {
1311 fgetline(line,LINELEN-1,dbf); // Skip line with amino acid labels
1312 fgetline(line,LINELEN-1,dbf); // Skip line with transition labels
1313 ptr=strscn(line);
1314
1315 if (!strncmp("COMPO",ptr,5))
1316 {
1317 ptr=ptr+5;
1318 for (a=0; a<20 && ptr; ++a)
1319 //s2a[a]: transform amino acids Sorted by alphabet -> internal numbers for amino acids
1320 pb[s2a[a]] = (float) exp(-1.0*strflta(ptr,99999));
1321 if (!ptr) return Warning(dbf,line,name);
1322 if (v>=4)
1323 {
1324 printf("\nNULL ");
1325 for (a=0; a<20; ++a) printf("%6.3g ",100.*pb[s2a[a]]);
1326 printf("\n");
1327 }
1328 fgetline(line,LINELEN-1,dbf); // Read next line
1329 }
1330
1331 fgetline(line,LINELEN-1,dbf); // Skip line with 0-states insert probabilities
1332
1333 ptr = strscn(line);
1334 for (a=0; a<=D2D && ptr; ++a)
1335 tr[0][a] = log2((float) exp(-1.0*strflta(ptr,99999))); //store transition probabilites as log2 values
1336 // strinta returns next integer in string and puts ptr to first char
1337 // after the integer. Returns -99999 if '*' is found.
1338 // ptr is set to 0 if no integer is found after ptr.
1339 if (!ptr) return Warning(dbf,line,name);
1340 if (v>=4)
1341 {
1342 printf(" ");
1343 for (a=0; a<=D2D && ptr; ++a) printf("%6.3g ",100*fpow2(tr[i][a]));
1344 printf("\n");
1345 }
1346
1347 // Prepare to store DSSP states (if there are none, delete afterwards)
1348 nss_dssp=k++;
1349 seq[nss_dssp] = new(char[/*MAXRES*/par.maxResLen+2]);
1350 sname[nss_dssp] = new(char[15]);
1351 strcpy(sname[nss_dssp],"ss_dssp");
1352
1353 /////////////////////////////////////////////////////////////////////////////////////
1354 // Read columns of HMM
1355 int next_i=0; // index of next column
1356 while (fgetline(line,LINELEN-1,dbf) && !(line[0]=='/' && line[1]=='/') && line[0]!='#')
1357 {
1358 if (strscn(line)==NULL) continue; // skip lines that contain only white space
1359
1360 // Read in AA probabilities
1361 ptr=line;
1362 int prev_i = next_i;
1363 next_i = strint(ptr); ++i;
1364 if (v && next_i!=prev_i+1)
1365 if (++warn<5)
1366 {
1367 cerr<<endl<<"WARNING: in HMM "<<name<<" state "<<prev_i<<" is followed by state "<<next_i<<"\n";
1368 if (warn==5) cerr<<endl<<"WARNING: further warnings while reading HMMs will be suppressed.\n";
1369 }
1370 if (i>L)
1371 {
1372 cerr<<endl<<"Error: in HMM "<<name<<" there are more columns than the stated length "<<L<<"\n";
1373 return 2;
1374 }
1375 if (i>L && v)
1376 cerr<<endl<<"WARNING: in HMM "<<name<<" there are more columns than the stated length "<<L<<"\n";
1377 if (i>=/*MAXRES*/par.maxResLen-2)
1378 {
1379 fgetline(line,LINELEN-1,dbf); // Skip two lines
1380 fgetline(line,LINELEN-1,dbf);
1381 continue;
1382 }
1383
1384 for (a=0; a<iAlpha && ptr; ++a){ /* FIXME: FS introduced iAlpha, was '20' */
1385 f[i][s2a[a]] = (float) exp(-1.0*strflta(ptr,99999));
1386 //s2a[a]: transform amino acids Sorted by alphabet -> internal numbers for amino acids
1387 }
1388 for (a = iAlpha; a < 20; a++){
1389 f[i][s2a[a]] = 0.0;
1390 }
1391 if (!ptr) return Warning(dbf,line,name);
1392 if (v>=4)
1393 {
1394 printf("%6i ",i);
1395 for (a=0; a<iAlpha; ++a) printf("%6.3g ",100*f[i][s2a[a]]);
1396 printf("\n");
1397 }
1398
1399 // Ignore MAP annotation
1400 ptr = strscn(line); //find next word
1401 ptr = strscn_ws(line); // ignore word
1402
1403 // Read RF and CS annotation
1404 ptr = strscn(line);
1405 if (!ptr) return Warning(dbf,line,name);
1406 annotchr[i]=uprchr(*ptr);
1407 if (*ptr!='-' && *ptr!=' ') annot=1;
1408
1409 ptr = strscn(line);
1410 switch (*ptr)
1411 {
1412 case 'H':
1413 ss_dssp[i]=1;
1414 seq[nss_dssp][i]=*ptr;
1415 dssp=1;
1416 break;
1417 case 'E':
1418 ss_dssp[i]=2;
1419 seq[nss_dssp][i]=*ptr;
1420 dssp=1;
1421 break;
1422 case 'C':
1423 ss_dssp[i]=3;
1424 seq[nss_dssp][i]=*ptr;
1425 dssp=1;
1426 break;
1427 case 'S':
1428 ss_dssp[i]=4;
1429 seq[nss_dssp][i]=*ptr;
1430 dssp=1;
1431 break;
1432 case 'T':
1433 ss_dssp[i]=5;
1434 seq[nss_dssp][i]=*ptr;
1435 dssp=1;
1436 break;
1437 case 'G':
1438 ss_dssp[i]=6;
1439 seq[nss_dssp][i]=*ptr;
1440 dssp=1;
1441 break;
1442 case 'B':
1443 ss_dssp[i]=7;
1444 seq[nss_dssp][i]=*ptr;
1445 dssp=1;
1446 break;
1447 case 'I':
1448 dssp=1;
1449 case '~':
1450 ss_dssp[i]=3;
1451 seq[nss_dssp][i]=*ptr;
1452 break;
1453 case '-': // no SS available from any template
1454 case '.': // no clear consensus SS structure
1455 case 'X': // no clear consensus SS structure
1456 ss_dssp[i]=0;
1457 seq[nss_dssp][i]='-';
1458 break;
1459 default:
1460 ss_dssp[i]=0;
1461 seq[nss_dssp][i]=*ptr;
1462 break;
1463 }
1464
1465 // Read insert emission line
1466 fgetline(line,LINELEN-1,dbf);
1467
1468 // Read seven transition probabilities
1469 fgetline(line,LINELEN-1,dbf);
1470
1471 ptr+=2;
1472 for (a=0; a<=D2D && ptr; ++a)
1473 tr[i][a] = log2((float) exp(-1.0*strflta(ptr,99999))); //store transition prob's as log2-values
1474 if (!ptr) return Warning(dbf,line,name);
1475 if (v>=4)
1476 {
1477 printf(" ");
1478 for (a=0; a<=D2D; ++a) printf("%6.3g ",100*fpow2(tr[i][a]));
1479 printf("\n");
1480 }
1481 }
1482
1483 if (line[0]=='/' && line[1]=='/') break;
1484
1485 } /* strncmp("HMM") */
1486
1487 } //while(getline)
1488
1489 if (L==0) return 0; //End of db file -> stop reading in
1490
1491 if (v && i!=L) cerr<<endl<<"Warning: in HMM "<<name<<" there are only "<<i<<" columns while the stated length is "<<L<<"\n";
1492 if (v && i>=/*MAXRES*/par.maxResLen-2) {i=/*MAXRES*/par.maxResLen-2; cerr<<endl<<"WARNING: maximum number "<</*MAXRES*/par.maxResLen-2<<" of residues exceeded while reading HMM "<<name<<"\n";}
1493 if (v && !i) cerr<<endl<<"WARNING: HMM "<<name<<" contains no match states. Check the alignment that gave rise to this HMM.\n";
1494 L = i;
1495
1496 if (strlen(longname)>0) strcat(longname," ");
1497 strncat(longname,name,DESCLEN-strlen(longname)-1); // longname = ACC NAME DESC
1498 if (strlen(name)>0) strcat(longname," ");
1499 strncat(longname,desc,DESCLEN-strlen(longname)-1);
1500 longname[DESCLEN-1]='\0';
1501 ScopID(cl,fold,sfam,fam);// get scop classification from basename (e.g. a.1.2.3.4)
1502 RemoveExtension(file,filestr); // copy name of dbfile without extension into 'file'
1503
1504 // Secondary structure
1505 if (!dssp)
1506 {
1507 // remove dssp sequence
1508 delete[] seq[nss_dssp]; // memory that had been allocated in case ss_dssp was given needs to be freed
1509 delete[] sname[nss_dssp]; // memory that had been allocated in case ss_dssp was given needs to be freed
1510 nss_dssp=-1;
1511 k--;
1512 }
1513 else { seq[nss_dssp][0]='-'; seq[nss_dssp][L+1]='\0'; }
1514
1515 if (nss_pred>=0)
1516 {
1517 for (i=1; i<=L; ++i) ss_pred[i] = ss2i(seq[nss_pred][i]);
1518 if (nss_conf>=0)
1519 for (i=1; i<=L; ++i) ss_conf[i] = cf2i(seq[nss_conf][i]);
1520 else
1521 for (i=1; i<=L; ++i) ss_conf[i] = 5;
1522 }
1523
1524 // Copy query (first sequence) and consensus residues?
1525 if (par.showcons)
1526 {
1527 sname[k]=new(char[10]);
1528 strcpy(sname[k],"Consensus");
1529 sname[k+1]=new(char[strlen(longname)+1]);
1530 strcpy(sname[k+1],longname);
1531 seq[k]=new(char[L+2]);
1532 seq[k][0]=' ';
1533 seq[k][L+1]='\0';
1534 seq[k+1]=new(char[L+2]);
1535 seq[k+1][0]=' ';
1536 seq[k+1][L+1]='\0';
1537 for (i=1; i<=L; ++i)
1538 {
1539 float pmax=0.0;
1540 int amax=0;
1541 for (a=0; a<NAA; ++a)
1542 if (f[i][a]>pmax) {amax=a; pmax=f[i][a];}
1543 if (pmax>0.6) seq[k][i]=i2aa(amax);
1544 else if (pmax>0.4) seq[k][i]=lwrchr(i2aa(amax));
1545 else seq[k][i]='x';
1546 seq[k+1][i]=i2aa(amax);
1547 }
1548 ncons=k++; // nfirst is set later!
1549 }
1550 else
1551 {
1552 sname[k]=new(char[strlen(longname)+1]);
1553 strcpy(sname[k],longname);
1554 seq[k]=new(char[L+2]);
1555 seq[k][0]=' ';
1556 seq[k][L+1]='\0';
1557 }
1558
1559 if (annot) // read in some annotation characters?
1560 {
1561 annotchr[0]=' ';
1562 annotchr[L+1]='\0';
1563 strcpy(seq[k],annotchr); // overwrite the consensus sequence with the annotation characters
1564 }
1565 else if (!par.showcons) // we have not yet calculated the consensus, but we need it now as query (first sequence)
1566 {
1567 for (i=1; i<=L; ++i)
1568 {
1569 float pmax=0.0;
1570 int amax=0;
1571 for (a=0; a<NAA; ++a)
1572 if (f[i][a]>pmax) {amax=a; pmax=f[i][a];}
1573 seq[k][i]=i2aa(amax);
1574 }
1575 }
1576 // printf("%i query name=%s seq=%s\n",n,sname[n],seq[n]);
1577 nfirst=k++;
1578
1579 n_display=k;
1580 n_seqs=k;
1581
1582 // If no effektive number of sequences is given, calculate Neff_HMM by given profile
1583 if (Neff_HMM == 0) {
1584 for (i=1; i<=L; ++i)
1585 {
1586 float S=0.0;
1587 for (a=0; a<20; ++a)
1588 if (f[i][a]>1E-10) S-=f[i][a]*fast_log2(f[i][a]);
1589 Neff_HMM+=(float) fpow2(S);
1590 }
1591 Neff_HMM/=L;
1592 }
1593
1594 for (i=0; i<=L; ++i) Neff_M[i] = Neff_I[i] = Neff_D[i] = 10.0; // to add only little additional pseudocounts!
1595 Neff_M[L+1]=1.0f;
1596 Neff_I[L+1]=Neff_D[L+1]=0.0f;
1597
1598 if (v>=2)
1599 cout<<"Read in HMM "<<name<<" with "<<L<<" match states and effective number of sequences = "<<Neff_HMM<<"\n";
1600
1601 ///////////////////////////////////////////////////////////////////
1602
1603 // Set emission probabilities of zero'th (begin) state and L+1st (end) state to background probabilities
1604 for (a=0; a<20; ++a) f[0][a]=f[L+1][a]=pb[a];
1605 delete[] annotchr;
1606
1607 has_pseudocounts=true;
1608
1609 return 1; //return status: ok
1610
1611
1612 } /* this is the end of HMM::ReadHMMer3() */
1613
1614
1615 //////////////////////////////////////////////////////////////////////////////
1616 /**
1617 * @brief Add transition pseudocounts to HMM (and calculate lin-space transition probs)
1618 */
1619 void
1620 HMM::AddTransitionPseudocounts(float gapd, float gape, float gapf, float gapg, float gaph, float gapi, float gapb)
1621 {
1622 int i; //position in alignment
1623 float sum;
1624 float pM2M, pM2I, pM2D, pI2I, pI2M, pD2D, pD2M;
1625 float p0,p1,p2;
1626 if (par.gapb<=0) return;
1627 if (trans_lin==1) {fprintf(stderr,"Error: Adding transition pseudocounts to linear representation of %s not allowed. Please report this error to the HHsearch developers.\n",name); exit(6);}
1628 if (trans_lin==2) {fprintf(stderr,"Error: Adding transition pseudocounts twice is %s not allowed. Please report this error to the HHsearch developers.\n",name); exit(6);}
1629 trans_lin=2;
1630
1631 // Calculate pseudocount transition probabilities
1632 pM2D=pM2I=gapd*0.0286; //a-priori probability for inserts and deletions
1633 pM2M=1-pM2D-pM2I;
1634 // gape=0 -> pI2I=0 gape=1 -> pI2I=0.75 gape=inf -> pI2I=1.
1635 pI2I=1.0*gape/(gape-1+1.0/0.75);
1636 pI2M=1-pI2I;
1637 // gape=0 -> pD2D=0 gape=1 -> pD2D=0.75 gape=inf -> pD2D=1.
1638 pD2D=1.0*gape/(gape-1+1.0/0.75);
1639 pD2M=1-pD2D;
1640
1641 for (i=0; i<=L; ++i) //for all columns in HMM
1642 {
1643 // Transitions from M state
1644 p0 = (Neff_M[i]-1)*fpow2(tr[i][M2M]) + gapb*pM2M;
1645 p1 = (Neff_M[i]-1)*fpow2(tr[i][M2D]) + gapb*pM2D;
1646 p2 = (Neff_M[i]-1)*fpow2(tr[i][M2I]) + gapb*pM2I;
1647 if (i==0) p1=p2=0; //from M(0) no transition to D(1) and I(0) possible
1648 if (i==L) p1=p2=0; //from M(L) no transition to D(L+1) and I(L+1) possible
1649 sum = p0+p1+p2+FLT_MIN;
1650
1651 // p0 = p0/sum ;
1652 // p1 = pow(p1/sum,gapf);
1653 // p2 = pow(p2/sum,gapg);
1654 // sum = p0+p1+p2+FLT_MIN;
1655 // tr[i][M2M] = fast_log2(p0/sum);
1656 // tr[i][M2D] = fast_log2(p1/sum);
1657 // tr[i][M2I] = fast_log2(p2/sum);
1658
1659 tr[i][M2M] = fast_log2(p0/sum);
1660 tr[i][M2D] = fast_log2(p1/sum)*gapf;
1661 tr[i][M2I] = fast_log2(p2/sum)*gapg;
1662
1663 // Transitions from I state
1664 p0 = Neff_I[i]*fpow2(tr[i][I2M]) + gapb*pI2M;
1665 p1 = Neff_I[i]*fpow2(tr[i][I2I]) + gapb*pI2I;
1666 sum = p0+p1+FLT_MIN;
1667
1668 // p0 = pow(p0/sum,gapg);
1669 // p1 = pow(p1/sum,gapi);
1670 // sum = p0+p1+FLT_MIN;
1671 // tr[i][I2M] = fast_log2(p0/sum);
1672 // tr[i][I2I] = fast_log2(p1/sum);
1673
1674 tr[i][I2M] = fast_log2(p0/sum);
1675 tr[i][I2I] = fast_log2(p1/sum)*gapi;
1676
1677 // Transitions from D state
1678 p0 = Neff_D[i]*fpow2(tr[i][D2M]) + gapb*pD2M;
1679 p1 = Neff_D[i]*fpow2(tr[i][D2D]) + gapb*pD2D;
1680 if (i==L) p1=0; //from D(L) no transition to D(L+1) possible
1681 sum = p0+p1+FLT_MIN;
1682
1683 // p0 = pow(p0/sum,gapf);
1684 // p1 = pow(p1/sum,gaph);
1685 // sum = p0+p1+FLT_MIN;
1686 // tr[i][D2M] = fast_log2(p0/sum);
1687 // tr[i][D2D] = fast_log2(p1/sum);
1688
1689 tr[i][D2M] = fast_log2(p0/sum);
1690 tr[i][D2D] = fast_log2(p1/sum)*gaph;
1691
1692 // SS-dependent gap penalties
1693 tr[i][M2M_GAPOPEN]=tr[i][M2M];
1694 tr[i][GAPOPEN]=0.0;
1695 tr[i][GAPEXTD]=0.0;
1696 }
1697
1698 if (v>=4)
1699 {
1700 printf("\nPseudocount transition probabilities:\n");
1701 printf("pM2M=%4.1f%%, pM2I=%4.1f%%, pM2D=%4.1f%%, ",100*pM2M,100*pM2I,100*pM2D);
1702 printf("pI2M=%4.1f%%, pI2I=%4.1f%%, ",100*pI2M,100*pI2I);
1703 printf("pD2M=%4.1f%%, pD2D=%4.1f%% ",100*pD2M,100*pD2D);
1704 printf("tau = %4.1f%%\n\n",100.*gapb/(Neff_HMM-1+gapb));
1705 printf("Listing transition probabilities WITH pseudocounts:\n");
1706 printf(" i dssp pred sacc M->M M->I M->D I->M I->I D->M D->D\n");
1707
1708 for (i=1; i<=L; ++i) //for all columns in HMM
1709 {
1710 printf("%4i %1c %1c %1c %6.3f %6.3f %6.3f ",i,i2ss(ss_dssp[i]),i2ss(ss_pred[i]),i2sa(sa_dssp[i]),fpow2(tr[i][M2M]),fpow2(tr[i][M2I]),fpow2(tr[i][M2D]));
1711 printf("%6.3f %6.3f ",fpow2(tr[i][I2M]),fpow2(tr[i][I2I]));
1712 printf("%6.3f %6.3f ",fpow2(tr[i][D2M]),fpow2(tr[i][D2D]));
1713 printf("%1i %2i %1i\n",ss_pred[i],ss_conf[i],ss_dssp[i]);
1714 }
1715 printf("\n");
1716 printf("nss_dssp=%i nss_pred=%i\n",nss_dssp,nss_pred);
1717 }
1718 return;
1719 }
1720
1721
1722 //////////////////////////////////////////////////////////////////////////////
1723 /**
1724 * @brief Use secondary structure-dependent gap penalties
1725 * on top of the HMM transition penalties
1726 */
1727 void
1728 HMM::UseSecStrucDependentGapPenalties()
1729 {
1730 int i; // column in HMM
1731 int ii;
1732 //unsigned char iis[MAXRES]; // inside-integer array
1733 unsigned char iis[par.maxResLen]; // inside-integer array
1734 float d; // Additional penalty for opening gap whithin SS element
1735 float e; // Additional penalty for extending gap whithin SS element
1736
1737 // Determine inside-integers:
1738 // CCSTCCCHHHHHHHHHHHCCCCCEEEEECCSBGGGCCCCEECC
1739 // 0000000123444432100000012210000000000001000
1740 ii=0;
1741 for (i=0; i<=L; ++i) // forward run
1742 {
1743 if (ss_dssp[i]==1 || ss_dssp[i]==2) {ii+=(ii<par.ssgapi);} else ii=0;
1744 iis[i]=ii;
1745 } for (i=0; i<=L; ++i)
1746 ii=0;
1747 iis[0]=iis[L]=0;
1748 for (i=L; i>=0; i--) // backward run
1749 {
1750 if (ss_dssp[i]==1 || ss_dssp[i]==2) {ii+=(ii<par.ssgapi);} else ii=0;
1751 iis[i-1]=imin(ii,iis[i-1]);
1752 }
1753
1754 // Add SS-dependent gap penalties to HMM transition penalties
1755 for (i=0; i<=L; ++i) //for all columns in HMM
1756 {
1757 d=-iis[i]*par.ssgapd;
1758 e=-iis[i]*par.ssgape;
1759 tr[i][GAPOPEN]=d;
1760 tr[i][GAPEXTD]=e;
1761 tr[i][M2M_GAPOPEN]+=d;
1762 tr[i][M2I]+=d;
1763 tr[i][I2M]+=d;
1764 tr[i][I2I]+=e;
1765 tr[i][M2D]+=d;
1766 tr[i][D2M]+=d;
1767 tr[i][D2D]+=e;
1768 }
1769
1770 if (v>=3)
1771 {
1772 printf("Col SS II\n");
1773 for (i=0; i<=L; ++i) printf("%3i %c %2i\n",i,i2ss(ss_dssp[i]),iis[i]);
1774 }
1775 return;
1776 }
1777
1778 /////////////////////////////////////////////////////////////////////////////////////
1779 /**
1780 * @brief Generate an amino acid frequency matrix g[][] with full pseudocount admixture (tau=1)
1781 */
1782 void
1783 HMM::PreparePseudocounts()
1784 {
1785 for (int i=0; i<=L+1; ++i)
1786 for (int a=0; a<20; ++a)
1787 g[i][a] = // produces fast code
1788 R[a][0]*f[i][0] +R[a][1]*f[i][1] +R[a][2]*f[i][2] +R[a][3]*f[i][3] +R[a][4]*f[i][4]
1789 +R[a][5]*f[i][5] +R[a][6]*f[i][6] +R[a][7]*f[i][7] +R[a][8]*f[i][8] +R[a][9]*f[i][9]
1790 +R[a][10]*f[i][10]+R[a][11]*f[i][11]+R[a][12]*f[i][12]+R[a][13]*f[i][13]+R[a][14]*f[i][14]
1791 +R[a][15]*f[i][15]+R[a][16]*f[i][16]+R[a][17]*f[i][17]+R[a][18]*f[i][18]+R[a][19]*f[i][19];
1792 }
1793
1794 /////////////////////////////////////////////////////////////////////////////////////
1795 /**
1796 * @brief Add amino acid pseudocounts to HMM and calculate average protein aa probabilities pav[a]
1797 * Pseudocounts: t.p[i][a] = (1-tau)*f[i][a] + tau*g[i][a]
1798 */
1799 void
1800 HMM::AddAminoAcidPseudocounts(char pcm, float pca, float pcb, float pcc)
1801 {
1802 int i; //position in HMM
1803 int a; //amino acid (0..19)
1804 float sum;
1805 float tau; //tau = pseudocount admixture
1806
1807 for (a=0; a<20; ++a) pav[a]=pb[a]*100.0f/Neff_HMM; // initialize vector of average aa freqs with pseudocounts
1808
1809 // Calculate amino acid frequencies p[i][a] = (1-tau(i))*f[i][a] + tau(i)*g[i][a]
1810 switch (pcm)
1811 {
1812 case 0: //no pseudocounts whatsoever: tau=0
1813 for (i=1; i<=L; ++i)
1814 for (a=0; a<20; ++a)
1815 pav[a] += ( p[i][a]=f[i][a] );
1816 break;
1817 case 1: //constant pseudocounts (for optimization): tau = pca
1818 tau = pca;
1819 for (i=1; i<=L; ++i)
1820 for (a=0; a<20; ++a)
1821 pav[a] += ( p[i][a] = (1.-tau)*f[i][a] + tau * g[i][a] );
1822 break;
1823 case 2: //divergence-dependent pseudocounts
1824 case 4: //divergence-dependent pseudocounts and rate matrix rescaling
1825 if (par.pcc==1.0f)
1826 for (i=1; i<=L; ++i)
1827 {
1828 tau = fmin(1.0, pca/(1. + Neff_M[i]/pcb ) );
1829 for (a=0; a<20; ++a)
1830 pav[a] += ( p[i][a] = (1.-tau)*f[i][a] + tau * g[i][a] );
1831 }
1832 else
1833 for (i=1; i<=L; ++i)
1834 {
1835 tau = fmin(1.0, pca/(1. + pow((Neff_M[i])/pcb,pcc)));
1836 for (a=0; a<20; ++a)
1837 pav[a] += ( p[i][a] = (1.-tau)*f[i][a] + tau * g[i][a] );
1838 }
1839 break;
1840 case 3: // constant-divergence pseudocounts
1841 for (i=1; i<=L; ++i)
1842 {
1843 float x = Neff_M[i]/pcb;
1844 pca = 0.793 + 0.048*(pcb-10.0);
1845 tau = fmax(0.0, pca*(1-x + pcc*x*(1-x)) );
1846 for (a=0; a<20; ++a)
1847 pav[a] += ( p[i][a] = (1.-tau)*f[i][a] + tau * g[i][a] );
1848 }
1849 if (v>=2) { printf("Divergence before / after addition of amino acid pseudocounts: %5.2f / %5.2f\n",Neff_HMM, CalcNeff()); }
1850 break;
1851 } //end switch (pcm)
1852
1853
1854 // Normalize vector of average aa frequencies pav[a]
1855 NormalizeTo1(pav,NAA);
1856
1857 for (a=0; a<20; ++a)
1858 p[0][a] = p[L+1][a] = pav[a];
1859
1860 // DEBUGGING output
1861 if (v>=3)
1862 {
1863 switch (pcm)
1864 {
1865 case 0:
1866 cout<<"No pseudocounts added (-pcm 0)\n";
1867 return;
1868 case 1:
1869 cout<<"Adding constant AA pseudocount admixture of "<<pca<<" to HMM "<<name<<"\n";
1870 break;
1871 case 2:
1872 cout<<"Adding divergence-dependent AA pseudocounts (-pcm 2) with admixture of "
1873 <<pca/(1.+pow((Neff_HMM-1.)/pcb,pcc))<<" to HMM "<<name<<"\n";
1874 break;
1875 } //end switch (pcm)
1876 cout<<"\nAverage amino acid frequencies WITH pseudocounts in HMM: \nProf: ";
1877 for (a=0; a<20; ++a) printf("%4.1f ",100*pav[a]);
1878 cout<<"\n";
1879 if (v>=4)
1880 {
1881 cout<<"\nAmino acid frequencies WITHOUT pseudocounts:\n A R N D C Q E G H I L K M F P S T W Y V\n";
1882 for (i=1; i<=L; ++i)
1883 {
1884 printf("%3i: ",i);
1885 sum=0;
1886 for (a=0; a<20; ++a)
1887 {
1888 sum+=f[i][a];
1889 printf("%4.1f ",100*f[i][a]);
1890 }
1891 printf(" sum=%5.3f\n",sum);
1892 }
1893 cout<<"\nAmino acid frequencies WITH pseudocounts:\n A R N D C Q E G H I L K M F P S T W Y V\n";
1894 for (i=1; i<=L; ++i)
1895 {
1896 printf("%3i: ",i);
1897 sum=0;
1898 for (a=0; a<20; ++a)
1899 {
1900 sum+=p[i][a];
1901 printf("%4.1f ",100*p[i][a]);
1902 }
1903 printf(" sum=%5.3f\n",sum);
1904 }
1905 }
1906 }
1907 return;
1908 }
1909
1910
1911 /////////////////////////////////////////////////////////////////////////////////////
1912 /**
1913 * @brief Factor Null model into HMM t
1914 */
1915 void
1916 HMM::IncludeNullModelInHMM(HMM& q, HMM& t)
1917 {
1918
1919 int i,j; //query and template match state indices
1920 int a; //amino acid index
1921
1922 switch (par.columnscore)
1923 {
1924 default:
1925 case 0: // Null model with background prob. from database
1926 for (a=0; a<20; ++a) pnul[a]=pb[a];
1927 break;
1928
1929 case 1: // Null model with background prob. equal average from query and template
1930 for (a=0; a<20; ++a) pnul[a]=0.5*(q.pav[a]+t.pav[a]);
1931 break;
1932
1933 case 2: // Null model with background prob. from template protein
1934 for (a=0; a<20; ++a) pnul[a]=t.pav[a];
1935 break;
1936
1937 case 3: // Null model with background prob. from query protein
1938 for (a=0; a<20; ++a) pnul[a]=q.pav[a];
1939 break;
1940
1941 case 4: // Null model with background prob. equal average from query and template
1942 for (a=0; a<20; ++a) pnul[a]=sqrt(q.pav[a]*t.pav[a]);
1943 break;
1944
1945 case 10: // Separated column scoring for Stochastic Backtracing (STILL USED??)
1946 for (i=0; i<=q.L+1; ++i)
1947 {
1948 float sum = 0.0;
1949 for (a=0; a<20; ++a) sum += pb[a]*q.p[i][a];
1950 sum = 1.0/sqrt(sum);
1951 for (a=0; a<20; ++a) q.p[i][a]*=sum;
1952 }
1953 for (j=0; j<=t.L+1; j++)
1954 {
1955 float sum = 0.0;
1956 for (a=0; a<20; ++a) sum += pb[a]*t.p[j][a];
1957 sum = 1.0/sqrt(sum);
1958 for (a=0; a<20; ++a) t.p[j][a]*=sum;
1959 }
1960 break;
1961
1962 case 11: // log co-emission probability (no null model)
1963 for (a=0; a<20; ++a) pnul[a]=0.05;
1964 break;
1965
1966 }
1967
1968 // !!!!! ATTENTION!!!!!!! after this t.p is not the same as after adding pseudocounts !!!
1969 //Introduce amino acid weights into template (for all but SOP scores)
1970 if (par.columnscore!=10)
1971 for (a=0; a<20; ++a)
1972 for (j=0; j<=t.L+1; j++)
1973 t.p[j][a]/=pnul[a];
1974
1975 if (v>=5)
1976 {
1977 cout<<"\nAverage amino acid frequencies\n";
1978 cout<<" A R N D C Q E G H I L K M F P S T W Y V\n";
1979 cout<<"Q: ";
1980 for (a=0; a<20; ++a) printf("%4.1f ",100*q.pav[a]);
1981 cout<<"\nT: ";
1982 for (a=0; a<20; ++a) printf("%4.1f ",100*t.pav[a]);
1983 cout<<"\nNull: ";
1984 for (a=0; a<20; ++a) printf("%4.1f ",100*pnul[a]);
1985 cout<<"\npb: ";
1986 for (a=0; a<20; ++a) printf("%4.1f ",100*pb[a]);
1987 }
1988
1989
1990 return;
1991 }
1992
1993
1994 /////////////////////////////////////////////////////////////////////////////////////
1995 /**
1996 * @brief Write HMM to output file
1997 */
1998 void
1999 HMM::WriteToFile(char* outfile)
2000 {
2001 const int SEQLEN=100; // number of residues per line for sequences to be displayed
2002 int i,a;
2003
2004 if (trans_lin) {fprintf(stderr,"Error: Writing transition pseudocounts in linear representation not allowed. Please report this error to the HHsearch developers.\n"); exit(6);}
2005
2006 FILE *outf=NULL;
2007 if (strcmp(outfile,"stdout"))
2008 {
2009 if (par.append) outf=fopen(outfile,"a"); else outf=fopen(outfile,"w");
2010 if (!outf) OpenFileError(outfile);
2011 }
2012 else
2013 outf = stdout;
2014 if (v>=2) cout<<"Writing HMM to "<<outfile<<"\n";
2015
2016 // fprintf(outf,"HHsearch HHM format 1.5\n");
2017 fprintf(outf,"HHsearch 1.5\n"); // format specification
2018 fprintf(outf,"NAME %s\n",longname); // name of first sequence
2019 fprintf(outf,"FAM %s\n",fam); // family name
2020 char file_nopath[NAMELEN];
2021 RemovePath(file_nopath,file);
2022 fprintf(outf,"FILE %s\n",file_nopath); // base name of alignment file
2023
2024 // Print command line
2025 fprintf(outf,"COM ");
2026 for (int i=0; i<par.argc; i++)
2027 if (strlen(par.argv[i])<=100)
2028 fprintf(outf,"%s ",par.argv[i]);
2029 else
2030 fprintf(outf,"<%i characters> ",(int)strlen(par.argv[i]));
2031 fprintf(outf,"\n");
2032
2033 // print out date stamp
2034 time_t* tp=new(time_t);
2035 *tp=time(NULL);
2036 fprintf(outf,"DATE %s",ctime(tp));
2037 delete tp; tp = NULL; /* really? FS */
2038
2039 // Print out some statistics of alignment
2040 fprintf(outf,"LENG %i match states, %i columns in multiple alignment\n",L,l[L]);
2041 fprintf(outf,"FILT %i out of %i sequences passed filter (-id %i -cov %i -qid %i -qsc %.2f -diff %i)\n",N_filtered,N_in,par.max_seqid,par.coverage,par.qid,par.qsc,par.Ndiff);
2042 fprintf(outf,"NEFF %-4.1f\n",Neff_HMM);
2043
2044 // Print selected sequences from alignment (including secondary structure and confidence values, if known)
2045 fprintf(outf,"SEQ\n");
2046 for (int n=0; n<n_display; n++)
2047 {
2048 fprintf(outf,">%s\n",sname[n]);
2049 //first sequence character starts at 1; 0 not used.
2050 for(unsigned int j=0; j<strlen(seq[n]+1); j+=SEQLEN) fprintf(outf,"%-.*s\n",SEQLEN,seq[n]+1+j);
2051 }
2052 fprintf(outf,"#\n");
2053
2054 // print null model background probabilities from substitution matrix
2055 fprintf(outf,"NULL ");
2056 for (a=0; a<20; ++a) fout(outf,-iround(fast_log2(pb[s2a[a]])*HMMSCALE ));
2057 fprintf(outf,"\n");
2058
2059 // print table header line with amino acids
2060 fprintf(outf,"HMM ");
2061 for (a=0; a<20; ++a) fprintf(outf,"%1c\t",i2aa(s2a[a]));
2062 fprintf(outf,"\n");
2063
2064 // print table header line with state transitions
2065 fprintf(outf," M->M\tM->I\tM->D\tI->M\tI->I\tD->M\tD->D\tNeff\tNeff_I\tNeff_D\n");
2066
2067 // print out transition probabilities from begin state (virtual match state)
2068 fprintf(outf," ");
2069 for (a=0; a<=D2D; ++a) fout(outf,-iround(tr[0][a]*HMMSCALE));
2070 fout(outf,iround(Neff_M[0]*HMMSCALE));
2071 fout(outf,iround(Neff_I[0]*HMMSCALE));
2072 fout(outf,iround(Neff_D[0]*HMMSCALE));
2073 fprintf(outf,"\n");
2074
2075 // Start loop for printing HMM columns
2076 int h=1;
2077 for (i=1; i<=L; ++i)
2078 {
2079
2080 while(islower(seq[nfirst][h]) && seq[nfirst][h]) h++;
2081 fprintf(outf,"%1c %-4i ",seq[nfirst][h++],i);
2082
2083 // Print emission probabilities for match state
2084 for (a=0; a<20; ++a) fout(outf,-iround(fast_log2(p[i][s2a[a]])*HMMSCALE ));
2085 fprintf(outf,"%-i",l[i]);
2086 fprintf(outf,"\n");
2087
2088 // Print transition probabilities
2089 fprintf(outf," ");
2090 for (a=0; a<=D2D; ++a) fout(outf,-iround(tr[i][a]*HMMSCALE));
2091 fout(outf,iround(Neff_M[i]*HMMSCALE));
2092 fout(outf,iround(Neff_I[i]*HMMSCALE));
2093 fout(outf,iround(Neff_D[i]*HMMSCALE));
2094 fprintf(outf,"\n\n");
2095 } // end for(i)-loop for printing HMM columns
2096
2097 fprintf(outf,"//\n");
2098 fclose(outf);
2099 }
2100
2101 /////////////////////////////////////////////////////////////////////////////////////
2102 /**
2103 * @brief Write HMM to output file
2104 */
2105 void
2106 HMM::InsertCalibration(char* infile)
2107 {
2108 char* line = new(char[LINELEN]); // input line
2109 char** lines = new(char*[3*L+100000]);
2110 int nline=0;
2111 int l;
2112 char done=0; // inserted new 'EVD mu sigma' line?
2113
2114 // Read from infile all lines and insert the EVD line with lamda and mu coefficients
2115 ifstream inf;
2116 inf.open(infile, ios::in);
2117 if (!inf) OpenFileError(infile);
2118 if (v>=2) cout<<"Recording calibration coefficients in "<<infile<<"\n";
2119
2120 while (inf.getline(line,LINELEN) && !(line[0]=='/' && line[1]=='/') && nline<2*/*MAXRES*/par.maxResLen)
2121 {
2122
2123 // Found an EVD lamda mu line? -> remove
2124 while (!done && !strncmp(line,"EVD ",3) && !(line[0]=='/' && line[1]=='/') && nline<2*/*MAXRES*/par.maxResLen)
2125 inf.getline(line,LINELEN);
2126 if ((line[0]=='/' && line[1]=='/') || nline>=2*/*MAXRES*/par.maxResLen)
2127 {fprintf(stderr,"Error: wrong format in %s. Expecting hhm format\n",infile); exit(1);}
2128
2129 // Found the SEQ line? -> insert calibration before this line
2130 if (!done && (!strncmp("SEQ",line,3) || !strncmp("HMM",line,3)) && (isspace(line[3]) || line[3]=='\0'))
2131 {
2132 done=1;
2133 lines[nline]=new(char[128]);
2134 if (!lines[nline]) MemoryError("space to read in HHM file for calibration");
2135 sprintf(lines[nline],"EVD %-7.4f %-7.4f",lamda,mu);
2136 nline++;
2137 }
2138 lines[nline]=new(char[strlen(line)+1]);
2139 if (!lines[nline]) MemoryError("space to read in HHM file for calibration");
2140 strcpy (lines[nline],line);
2141 nline++;
2142 }
2143 inf.close();
2144
2145 // Write to infile all lines
2146 FILE* infout=fopen(infile,"w");
2147 if (!infout) {
2148 cerr<<endl<<"WARNING in "<<program_name<<": no calibration coefficients written to "<<infile<<":\n";
2149 cerr<<"Could not open file for writing.\n";
2150 return;
2151 }
2152 for (l=0; l<nline; l++) {
2153 fprintf(infout,"%s\n",lines[l]);
2154 delete[] lines[l]; lines[l] = NULL;
2155 }
2156 fprintf(infout,"//\n");
2157 fclose(infout);
2158 delete[] line; line = NULL;
2159 delete[] lines; lines = NULL;
2160 return;
2161 }
2162
2163 /////////////////////////////////////////////////////////////////////////////////////
2164 /**
2165 * @brief Write HMM to output file in HMMER format
2166 */
2167 void
2168 HMM::WriteToFileHMMER(char* outfile)
2169 {
2170 const int INTSCALE=1000; //scaling factor in HMMER files
2171 const float pBD=0.50;
2172 const int LOG2pBD=iround(fast_log2(pBD)*INTSCALE);
2173 const int LOG2pBM=iround(fast_log2(1-pBD)*INTSCALE);
2174 const float pJB=1.0/350;
2175 const int LOG2pJB=iround(fast_log2(pJB)*INTSCALE);
2176 const int LOG2pJJ=iround(fast_log2(1-pJB)*INTSCALE);
2177 const float pEJ=0.5;
2178 const int LOG2pEJ=iround(fast_log2(pEJ)*INTSCALE);
2179 const int LOG2pEC=iround(fast_log2(1-pEJ)*INTSCALE);
2180 char c;
2181 int i,a;
2182
2183 if (trans_lin) {fprintf(stderr,"Error: Writing transition pseudocounts in linear representation not allowed. Please report this error to the HHsearch developers.\n"); exit(6);}
2184
2185 FILE *outf=NULL;
2186 if (strcmp(outfile,"stdout"))
2187 {
2188 if (par.append) outf=fopen(outfile,"a"); else outf=fopen(outfile,"w");
2189 if (!outf) OpenFileError(outfile);
2190 }
2191 else
2192 outf = stdout;
2193 if (v>=2) cout<<"Writing HMM to "<<outfile<<"\n";
2194
2195 fprintf(outf,"HMMER2.0 [hhmake %s]\n",VERSION_AND_DATE);
2196 fprintf(outf,"NAME %s\n",file); // base name of alignment file
2197 fprintf(outf,"DESC %s\n",longname);
2198 fprintf(outf,"LENG %i\n",L);
2199 fprintf(outf,"ALPH Amino\n"); // amino acid seuqences (not DNA)
2200 fprintf(outf,"RF yes\n"); // reference annotation flag
2201 fprintf(outf,"CS yes\n"); // consensus structure annotation flag
2202 fprintf(outf,"MAP yes\n"); // write MA column number after each line of aa probabilities
2203
2204 fprintf(outf,"COM "); // print out command line
2205 for (i=0; i<=par.argc-1; ++i) fprintf(outf,"%s ",par.argv[i]); fprintf(outf,"\n");
2206
2207 fprintf(outf,"NSEQ %i\n",N_filtered); // print number of sequences after filtering
2208
2209 // Date stamp
2210 time_t* tp=new(time_t);
2211 *tp=time(NULL);
2212 fprintf(outf,"DATE %s",ctime(tp));
2213 delete tp; tp = NULL; /* really? FS */
2214
2215 // Print out secondary structure
2216 if (nss_dssp>=0)
2217 fprintf(outf,"SSDSS %s\n",seq[nss_dssp]);
2218 if (nsa_dssp>=0)
2219 fprintf(outf,"SADSS %s\n",seq[nsa_dssp]);
2220 if (nss_pred>=0)
2221 fprintf(outf,"SSPRD %s\n",seq[nss_pred]);
2222 if (nss_conf>=0)
2223 fprintf(outf,"SSCNF %s\n",seq[nss_conf]);
2224
2225
2226 // Special Plan7 transitions that control repeated detection of profile HMM within sequence
2227 fprintf(outf,"XT %6i %6i %6i %6i %6i %6i %6i %6i\n",LOG2pJB,LOG2pJJ,LOG2pEC,LOG2pEJ,LOG2pJB,LOG2pJJ,LOG2pJB,LOG2pJJ);
2228 fprintf(outf,"NULT -4 -8455\n");
2229
2230
2231 // Null model background probabilities from substitution matrix
2232 fprintf(outf,"NULE ");
2233 for (a=0; a<20; ++a)
2234 {
2235 float lg2=fast_log2(pb[s2a[a]]*20.0);
2236 if (lg2<-99.999) fprintf(outf," *"); else fprintf(outf," %6i",iround(lg2*INTSCALE));
2237 }
2238 fprintf(outf,"\n");
2239
2240 // Table header line with amino acids
2241 fprintf(outf,"HMM ");
2242 for (a=0; a<20; ++a) fprintf(outf," %1c ",i2aa(s2a[a]));
2243 fprintf(outf,"\n");
2244
2245 // Table header line with state transitions
2246 fprintf(outf," m->m m->i m->d i->m i->i d->m d->d b->m m->e\n");
2247
2248 // Transition probabilities from begin state
2249 fprintf(outf," %6i * %6i\n",LOG2pBM,LOG2pBD);
2250
2251 // Start loop for printing HMM columns
2252 int h=1, hss=1;
2253 for (i=1; i<=L; ++i)
2254 {
2255
2256 // Emission probabilities for match state
2257 fprintf(outf," %5i",i);
2258 for (a=0; a<20; ++a) fprintf(outf," %6i",imax(-9999,iround(fast_log2(p[i][s2a[a]]/pb[s2a[a]])*INTSCALE)));
2259 fprintf(outf," %5i",l[i]);
2260 fprintf(outf,"\n");
2261
2262 // Emission probabilities (relative to null model) for insert state
2263 while(islower(seq[nfirst][h]) && seq[nfirst][h]) h++;
2264 if (i==L)
2265 fprintf(outf," %1c * * * * * * * * * * * * * * * * * * * *\n",seq[nfirst][h++]);
2266 else
2267 fprintf(outf," %1c 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",seq[nfirst][h++]);
2268
2269 // Transition probabilities
2270 if (nss_dssp>=0)
2271 {
2272 while(islower(seq[nss_dssp][hss]) && seq[nss_dssp][hss]) hss++;
2273 c=seq[nss_dssp][hss++];
2274 }
2275 else c=' ';
2276 fprintf(outf," %1c",c);
2277 if (i==1)
2278 {
2279 for (a=0; a<=D2D; ++a) fprintf(outf," %6i",imax(-9999,iround(tr[i][a]*INTSCALE)));
2280 fprintf(outf," %6i *\n",LOG2pBM);
2281 }
2282 else if (i==L)
2283 {
2284 for (a=0; a<=D2D; ++a) fprintf(outf," *");
2285 fprintf(outf," * 0\n");
2286 }
2287 else
2288 {
2289 for (a=0; a<=D2D; ++a) fprintf(outf," %6i",imax(-9999,iround(tr[i][a]*INTSCALE)));
2290 fprintf(outf," * *\n");
2291 }
2292 }
2293
2294 fprintf(outf,"//\n");
2295 fclose(outf);
2296 }
2297
2298
2299 /////////////////////////////////////////////////////////////////////////////////////
2300 /**
2301 * @brief Transform log to lin transition probs
2302 */
2303 void
2304 HMM::Log2LinTransitionProbs(float beta)
2305 {
2306 if (trans_lin==1) return;
2307 trans_lin=1;
2308 for (int i=0; i<=L; ++i)
2309 {
2310 for (int a=0; a<NTRANS; ++a)
2311 tr[i][a] = fpow2(beta*tr[i][a]);
2312 /* FIXME valgrind says: "Conditional jump or move depends on
2313 * uninitialised value(s)" when using hmm iteration
2314 */
2315 }
2316 }
2317
2318
2319 /**
2320 * @brief Set query columns in His-tags etc to Null model distribution
2321 */
2322 void
2323 HMM::NeutralizeTags()
2324 {
2325 char* qseq = seq[nfirst];
2326 char* pt;
2327 int a,i;
2328
2329 if (NULL == qseq){
2330 return;
2331 }
2332
2333 // Neutralize His tag
2334 if ( (pt=strstr(qseq,"HHHHH")) )
2335 {
2336 int i0 = pt-qseq+1;
2337 if (v>=2) printf("Neutralized His-tag at position %i\n",i0);
2338 for (i=imax(i0-5,1); i<i0; ++i) // neutralize leading 5 columns
2339 for (a=0; a<NAA; ++a) p[i][a]=pb[a];
2340 for (; (*pt)!='H'; ++i,++pt) // neutralize His columns
2341 for (a=0; a<NAA; ++a) p[i][a]=pb[a];
2342 i0=i;
2343 for (; i<imin(i0+5,L+1); ++i) // neutralize trailing 5 columns
2344 for (a=0; a<NAA; ++a) p[i][a]=pb[a];
2345 if (v>=3) printf("start:%i end:%i\n",imax(i0-5,1),i-1);
2346 }
2347
2348 // Neutralize C-myc tag
2349 if ( (pt=strstr(qseq,"EQKLISEEDL")) )
2350 {
2351 if (v>=2) printf("Neutralized C-myc-tag at position %i\n",int(pt-qseq)+1);
2352 for (i=pt-qseq+1; i<=pt-qseq+10; ++i)
2353 for (a=0; a<NAA; ++a) p[i][a]=pb[a];
2354 }
2355 // Neutralize FLAG tag
2356 if ( (pt=strstr(qseq,"DYKDDDDK")) )
2357 {
2358 if (v>=2) printf("Neutralized FLAG-tag at position %i\n",int(pt-qseq)+1);
2359 for (i=pt-qseq+1; i<=pt-qseq+8; ++i)
2360 for (a=0; a<NAA; ++a) p[i][a]=pb[a];
2361 }
2362 }
2363
2364
2365
2366 /////////////////////////////////////////////////////////////////////////////////////
2367 /**
2368 * @brief Calculate effective number of sequences using profiles INCLUDING pseudocounts
2369 */
2370 float
2371 HMM::CalcNeff()
2372 {
2373 float Neff=0;
2374 for (int i=1; i<=L; ++i)
2375 for (int a=0; a<20; ++a)
2376 if (p[i][a]>1E-10) Neff-=p[i][a]*fast_log2(p[i][a]);
2377 return fpow2(Neff/L);
2378 }
2379
2380
2381 /////////////////////////////////////////////////////////////////////////////////////
2382 /**
2383 * @brief Calculate consensus of HMM (needed to merge HMMs later)
2384 */
2385 void
2386 HMM::CalculateConsensus()
2387 {
2388 int i; // position in query
2389 int a; // amino acid
2390 if (!Xcons) Xcons = new char[/*MAXRES*/par.maxResLen+2];
2391 for (i=1; i<=L; ++i)
2392 {
2393 float max=f[i][0]-pb[0];
2394 for (a=1; a<20; ++a)
2395 if (f[i][a]-pb[a]>max) Xcons[i]=a;
2396 }
2397 Xcons[0]=Xcons[L+1]=ENDGAP;
2398 }
2399
2400 // /////////////////////////////////////////////////////////////////////////////////////
2401 // // Store linear transition probabilities
2402 // /////////////////////////////////////////////////////////////////////////////////////
2403 // void HMM::StoreLinearTransitionProbs()
2404 // {
2405 // int i; // position in query
2406 // for (i=0; i<=L+1; ++i) if (!tr_lin[i]) tr_lin[i] = new(float[NTRANS]);
2407 // for (i=0; i<=L+1; ++i)
2408 // {
2409 // tr_lin[i][M2M] = fpow2(tr[i][M2M]);
2410 // tr_lin[i][M2I] = fpow2(tr[i][M2I]);
2411 // tr_lin[i][M2D] = fpow2(tr[i][M2D]);
2412 // tr_lin[i][D2M] = fpow2(tr[i][M2D]);
2413 // tr_lin[i][D2D] = fpow2(tr[i][D2D]);
2414 // tr_lin[i][I2M] = fpow2(tr[i][I2M]);
2415 // tr_lin[i][I2I] = fpow2(tr[i][I2I]);
2416 // }
2417 // }
2418
2419
2420 // #define Weff(Neff) (1.0+par.neffa*(Neff-1.0)+(par.neffb-4.0*par.neffa)/16.0*(Neff-1.0)*(Neff-1.0))
2421
2422 // /////////////////////////////////////////////////////////////////////////////////////
2423 // // Initialize f[i][a] with query HMM
2424 // /////////////////////////////////////////////////////////////////////////////////////
2425 // void HMM::MergeQueryHMM(HMM& q, float wk[])
2426 // {
2427 // int i; // position in query
2428 // int a; // amino acid
2429 // float Weff_M, Weff_D, Weff_I;
2430 // for (i=1; i<=L; i++)
2431 // {
2432 // Weff_M = Weff(q.Neff_M[i]-1.0);
2433 // Weff_D = Weff(q.Neff_D[i]-1.0);
2434 // Weff_I = Weff(q.Neff_I[i]-1.0);
2435 // for (a=0; a<20; a++) f[i][a] = q.f[i][a]*wk[i]*Weff_M;
2436 // tr_lin[i][M2M] = q.tr_lin[i][M2M]*wk[i]*Weff_M;
2437 // tr_lin[i][M2I] = q.tr_lin[i][M2I]*wk[i]*Weff_M;
2438 // tr_lin[i][M2D] = q.tr_lin[i][M2D]*wk[i]*Weff_M;
2439 // tr_lin[i][D2M] = q.tr_lin[i][D2M]*wk[i]*Weff_D;
2440 // tr_lin[i][D2D] = q.tr_lin[i][D2D]*wk[i]*Weff_D;
2441 // tr_lin[i][I2M] = q.tr_lin[i][I2M]*wk[i]*Weff_I;
2442 // tr_lin[i][I2I] = q.tr_lin[i][I2I]*wk[i]*Weff_I;
2443 // }
2444 // }
2445
2446
2447
2448 // /////////////////////////////////////////////////////////////////////////////////////
2449 // // Normalize probabilities in total merged super-HMM
2450 // /////////////////////////////////////////////////////////////////////////////////////
2451 // void HMM::NormalizeHMMandTransitionsLin2Log()
2452 // {
2453 // int i; // position in query
2454 // int a; // amino acid
2455 // for (i=0; i<=L+1; i++)
2456 // {
2457 // float sum=0.0;
2458 // for (a=0; a<20; a++) sum += f[i][a];
2459 // for (a=0; a<20; a++) f[i][a]/=sum;
2460 // sum = tr_lin[i][M2M] + tr_lin[i][M2I] + tr_lin[i][M2D];
2461 // tr_lin[i][M2M] /= sum;
2462 // tr_lin[i][M2I] /= sum;
2463 // tr_lin[i][M2D] /= sum;
2464 // tr[i][M2M] = fast_log2(tr_lin[i][M2M]);
2465 // tr[i][M2I] = fast_log2(tr_lin[i][M2I]);
2466 // tr[i][M2D] = fast_log2(tr_lin[i][M2D]);
2467 // sum = tr_lin[i][D2M] + tr_lin[i][D2D];
2468 // tr_lin[i][D2M] /= sum;
2469 // tr_lin[i][D2D] /= sum;
2470 // tr[i][D2M] = fast_log2(tr_lin[i][D2M]);
2471 // tr[i][D2D] = fast_log2(tr_lin[i][D2D]);
2472 // sum = tr_lin[i][I2M] + tr_lin[i][I2I];
2473 // tr_lin[i][I2M] /= sum;
2474 // tr_lin[i][I2I] /= sum;
2475 // tr[i][I2M] = fast_log2(tr_lin[i][I2M]);
2476 // tr[i][I2I] = fast_log2(tr_lin[i][I2I]);
2477 // }
2478 // }
2479
2480
2481 // UNCOMMENT TO ACTIVATE COMPOSITIONALLY BIASED PSEUDOCOUNTS BY RESCALING THE RATE MATRIX
2482
2483 // /////////////////////////////////////////////////////////////////////////////////////
2484 // //// Function to minimize
2485 // /////////////////////////////////////////////////////////////////////////////////////
2486 // double RescaleMatrixFunc(double x[])
2487 // {
2488 // double sum=0.0;
2489 // for (int a=0; a<20; ++a)
2490 // {
2491 // double za=0.0;
2492 // for (int b=0; b<20; ++b) za+=P[a][b]*x[b];
2493 // sum += (x[a]*za-qav[a])*(x[a]*za-qav[a]);
2494 // }
2495 // return sum;
2496 // }
2497
2498 // /////////////////////////////////////////////////////////////////////////////////////
2499 // //// Gradient of function to minimize
2500 // /////////////////////////////////////////////////////////////////////////////////////
2501 // void RescaleMatrixFuncGrad(double grad[], double x[])
2502 // {
2503 // double z[20] = {0.0};
2504 // double w[20];
2505 // double tmp;
2506 // for (int a=0; a<20; ++a)
2507 // for (int b=0; b<20; ++b) z[a] += P[a][b]*x[b];
2508
2509 // for (int a=0; a<20; ++a) w[a] = x[a]*z[a]-qav[a];
2510 // for (int a=0; a<20; ++a)
2511 // {
2512 // tmp = w[a]*z[a];
2513 // for (int b=0; b<20; ++b) tmp += P[a][b]*x[b]*w[b];
2514 // grad[a] = 2.0*tmp;
2515 // }
2516 // return;
2517 // }
2518
2519
2520 // /////////////////////////////////////////////////////////////////////////////////////
2521 // //// Rescale a substitution matrix to biased aa frequencies in global vector qav[a]
2522 // /////////////////////////////////////////////////////////////////////////////////////
2523 // void HMM::RescaleMatrix()
2524 // {
2525 // int a,b;
2526 // int code;
2527 // double x[21]; // scaling factor
2528 // double val_min;
2529 // const int len=20;
2530 // const int max_iterations=50;
2531
2532 // if (v>=2) printf("Adjusting rate matrix to query amino acid composition ...\n");
2533
2534 // // Put amino acid frequencies into global array (needed to call WNLIB's conjugate gradient method)
2535 // for (a=0; a<20; ++a) qav[a] = pav[a];
2536
2537 // // Initialize scaling factors x[a]
2538 // for (a=0; a<20; ++a) x[a]=pow(qav[a]/pb[a],0.73); // Initialize
2539
2540 // // Call conjugate gradient minimization method from WNLIB
2541 // wn_conj_gradient_method(&code,&val_min,x,len,&RescaleMatrixFunc,&RescaleMatrixFuncGrad,max_iterations);
2542
2543
2544 // // Calculate z[a] = sum_b Pab*xb
2545 // float sum_err=0.0f;
2546 // float sum = 0.0f;
2547 // for (a=0; a<20; ++a)
2548 // {
2549 // float za=0.0f; // za = sum_b Pab*xb
2550 // for (b=0; b<20; ++b) za+=P[a][b]*x[b];
2551 // sum_err += (x[a]*za/qav[a]-1)*(x[a]*za/qav[a]-1);
2552 // sum += x[a]*za;
2553 // }
2554 // if (sum_err>1e-3 & v>=1) fprintf(stderr,"WARNING: adjusting rate matrix by CG resulted in residual error of %5.3f.\n",sum_err);
2555
2556 // // Rescale rate matrix
2557 // for (a=0; a<20; ++a)
2558 // for (b=0; b<20; ++b)
2559 // {
2560 // P[a][b] *= x[a]*x[b]/sum;
2561 // R[a][b] = P[a][b]/qav[b];
2562 // }
2563
2564 // // How well approximated?
2565 // if (v>=3)
2566 // {
2567 // // Calculate z[a] = sum_b Pab*xb
2568 // float z[21];
2569 // for (a=0; a<20; ++a)
2570 // for (z[a]=0.0, b=0; b<20; ++b) z[a]+=P[a][b];
2571 // printf("Adjust A R N D C Q E G H I L K M F P S T W Y V\nErr? ");
2572 // for (a=0; a<20; ++a) printf("%4.0f ",1000*z[a]/qav[a]);
2573 // cout<<endl<<"xa ";
2574 // for (a=0; a<20; ++a) fprintf(stdout,"%4.2f ",x[a]);
2575 // cout<<endl;
2576 // }
2577
2578 // // Evaluate sequence identity underlying substitution matrix
2579 // if (v>=3)
2580 // {
2581 // float id=0.0f;
2582 // float entropy=0.0f;
2583 // float entropy_qav=0.0f;
2584 // float mut_info=0.0f;
2585 // for (a=0; a<20; ++a) id += P[a][a];
2586 // for (a=0; a<20; ++a) entropy_qav-=qav[a]*fast_log2(qav[a]);
2587 // for (a=0; a<20; ++a)
2588 // for (b=0; b<20; ++b)
2589 // {
2590 // entropy-=P[a][b]*fast_log2(R[a][b]);
2591 // mut_info += P[a][b]*fast_log2(P[a][b]/qav[a]/qav[b]);
2592 // }
2593
2594 // fprintf(stdout,"Rescaling rate matrix: sequence identity = %2.0f%%; entropy per column = %4.2f bits (out of %4.2f); mutual information = %4.2f bits\n",100*id,entropy,entropy_qav,mut_info);
2595 // }
2596 // return;
2597 // }
2598
2599
2600 /* @* HMM::ClobberGlobal (eg, q,t)
2601 */
2602 void
2603 HMM::ClobberGlobal(void){
2604
2605 for (int i = 0; i < n_display; i++){
2606 if (sname[i]){
2607 delete[] sname[i]; sname[i] = NULL;
2608 }
2609 if (seq[i]){
2610 delete[] seq[i]; seq[i] = NULL;
2611 }
2612 }
2613 Neff_M[0] = Neff_I[0] = Neff_D[0] = 0.0;
2614 longname[0] = '\0'; file[0] = '\0';
2615 ss_dssp[0] = sa_dssp[0] = ss_pred[0] = ss_conf[0] = '\0';
2616 Xcons = NULL;
2617 l[0] = 0;
2618 L = 0;
2619 Neff_HMM = 0;
2620 n_display = N_in = N_filtered = 0;
2621 nss_dssp = nsa_dssp = nss_pred = nss_conf = nfirst = ncons = -1;
2622 lamda = 0.0; mu = 0.0;
2623 name[0] = longname[0] = fam[0] = '\0';
2624
2625 for (int i = 0; i < NAA; i++){
2626 pav[i] = 0;
2627 }
2628
2629 /* @= */
2630 return;
2631
2632 } /* this is the end of ClobberGlobal() */
2633
2634
2635 /*
2636 * EOF hhhmm-C.h
2637 */