Mercurial > repos > clustalomega > clustalomega
comparison clustalomega/clustal-omega-1.0.2/src/hhalign/hhhmm-C.h @ 1:bc707542e5de
Uploaded
author | clustalomega |
---|---|
date | Thu, 21 Jul 2011 13:35:08 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:ff1768533a07 | 1:bc707542e5de |
---|---|
1 /* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */ | |
2 | |
3 /********************************************************************* | |
4 * Clustal Omega - Multiple sequence alignment | |
5 * | |
6 * Copyright (C) 2010 University College Dublin | |
7 * | |
8 * Clustal-Omega is free software; you can redistribute it and/or | |
9 * modify it under the terms of the GNU General Public License as | |
10 * published by the Free Software Foundation; either version 2 of the | |
11 * License, or (at your option) any later version. | |
12 * | |
13 * This file is part of Clustal-Omega. | |
14 * | |
15 ********************************************************************/ | |
16 | |
17 /* | |
18 * RCS $Id: hhhmm-C.h 224 2011-03-23 12:13:33Z fabian $ | |
19 */ | |
20 | |
21 | |
22 // hhhmm.C | |
23 | |
24 #ifndef MAIN | |
25 #define MAIN | |
26 #include <iostream> // cin, cout, cerr | |
27 #include <fstream> // ofstream, ifstream | |
28 #include <stdio.h> // printf | |
29 using std::cout; | |
30 using std::cerr; | |
31 using std::endl; | |
32 using std::ios; | |
33 using std::ifstream; | |
34 using std::ofstream; | |
35 #include <stdlib.h> // exit | |
36 #include <string> // strcmp, strstr | |
37 #include <math.h> // sqrt, pow | |
38 #include <limits.h> // INT_MIN | |
39 #include <float.h> // FLT_MIN | |
40 #include <time.h> // clock | |
41 #include <ctype.h> // islower, isdigit etc | |
42 #include "util-C.h" // imax, fmax, iround, iceil, ifloor, strint, strscn, strcut, substr, uprstr, uprchr, Basename etc. | |
43 #include "list.h" // list data structure | |
44 #include "hash.h" // hash data structure | |
45 #include "hhdecl-C.h" | |
46 #include "hhutil-C.h" // imax, fmax, iround, iceil, ifloor, strint, strscn, strcut, substr, uprstr, uprchr, Basename etc. | |
47 #endif | |
48 | |
49 // #ifndef WNLIB | |
50 // #define WNLIB | |
51 // #include "wnconj.h" // Will Naylor's wnlib for optimization in C | |
52 // #endif | |
53 | |
54 ////////////////////////////////////////////////////////////////////////////// | |
55 //// Class HMM | |
56 ////////////////////////////////////////////////////////////////////////////// | |
57 | |
58 ////////////////////////////////////////////////////////////////////////////// | |
59 // Object constructor | |
60 ////////////////////////////////////////////////////////////////////////////// | |
61 HMM::HMM(int maxseqdis, int maxres) | |
62 { | |
63 sname = new char*[maxseqdis]; // names of stored sequences | |
64 for (int i = 0; i < maxseqdis; i++){ sname[i] = NULL;} | |
65 seq = new char*[maxseqdis]; // residues of stored sequences (first at pos 1!) | |
66 for (int i = 0; i < maxseqdis; i++){ seq[i] = NULL;} | |
67 Neff_M = new float[maxres]; // Neff_M[i] = diversity of subalignment of seqs that have residue in col i | |
68 Neff_I = new float[maxres]; // Neff_I[i] = diversity of subalignment of seqs that have insert in col i | |
69 Neff_D = new float[maxres]; // Neff_D[i] = diversity of subalignment of seqs that have delete in col i | |
70 longname = new char[DESCLEN]; // Full name of first sequence of original alignment (NAME field) | |
71 ss_dssp = new char[maxres]; // secondary structure determined by dssp 0:- 1:H 2:E 3:C 4:S 5:T 6:G 7:B | |
72 sa_dssp = new char[maxres]; // solvent accessibility state determined by dssp 0:- 1:A (absolutely buried) 2:B 3:C 4:D 5:E (exposed) | |
73 ss_pred = new char[maxres]; // predicted secondary structure 0:- 1:H 2:E 3:C | |
74 ss_conf = new char[maxres]; // confidence value of prediction 0:- 1:0 ... 10:9 | |
75 Xcons = NULL; // create only when needed: consensus sequence in internal representation (A=0 R=1 N=2 D=3 ...) | |
76 l = new int[maxres]; // l[i] = pos. of j'th match state in aligment | |
77 /* FS introduced sentinel, NULL terminates loop in destructor, FS, r221->222 */ | |
78 f = new float*[maxres+1]; f[maxres] = NULL; // f[i][a] = prob of finding amino acid a in column i WITHOUT pseudocounts | |
79 g = new float*[maxres+1]; g[maxres] = NULL; // f[i][a] = prob of finding amino acid a in column i WITH pseudocounts | |
80 p = new float*[maxres+1]; p[maxres] = NULL; // p[i][a] = prob of finding amino acid a in column i WITH OPTIMUM pseudocounts | |
81 tr = new float*[maxres+1]; tr[maxres] = NULL; // log2 of transition probabilities M2M M2I M2D I2M I2I D2M D2D M2M_GAPOPEN GAPOPEN GAPEXTD | |
82 // tr_lin = new float*[maxres]; // linear transition probabilities M2M M2I M2D I2M I2I D2M D2D M2M_GAPOPEN GAPOPEN GAPEXTD | |
83 for (int i=0; i<maxres; i++) {f[i]=new(float[NAA+3]);} | |
84 for (int i=0; i<maxres; i++) {g[i]=new(float[NAA]);} | |
85 for (int i=0; i<maxres; i++) {p[i]=new(float[NAA]);} | |
86 for (int i=0; i<maxres; i++) {tr[i]=new(float[NTRANS]);} | |
87 // for (int i=0; i<maxres; i++) tr_lin[i]=new(float[NTRANS]); | |
88 L=0; | |
89 Neff_HMM=0; | |
90 n_display=N_in=N_filtered=0; | |
91 nss_dssp=nsa_dssp=nss_pred=nss_conf=nfirst=ncons=-1; | |
92 // lamda_hash.New(37,0.0); // Set size and NULL element for hash | |
93 // mu_hash.New(37,0.0); // Set size and NULL element for hash | |
94 lamda=0.0; mu=0.0; | |
95 name[0]=longname[0]=fam[0]='\0'; | |
96 trans_lin=0; // transition probs in log space | |
97 } | |
98 | |
99 | |
100 ////////////////////////////////////////////////////////////////////////////// | |
101 // Object destructor | |
102 ////////////////////////////////////////////////////////////////////////////// | |
103 HMM::~HMM() | |
104 { | |
105 //Delete name and seq matrices | |
106 if (NULL != sname){ | |
107 for (int k=0; (k < n_display) && (NULL != sname[k]); k++){ | |
108 delete[] sname[k]; sname[k] = NULL; | |
109 } | |
110 delete[] sname; sname = NULL; | |
111 } | |
112 if (NULL != seq){ | |
113 for (int k=0; (k < n_display) && (NULL != seq[k]); k++){ | |
114 delete[] seq[k]; seq[k] = NULL; | |
115 } | |
116 delete[] seq; seq = NULL; | |
117 } | |
118 delete[] Neff_M; Neff_M = NULL; | |
119 delete[] Neff_D; Neff_D = NULL; | |
120 delete[] Neff_I; Neff_I = NULL; | |
121 delete[] longname; longname = NULL; | |
122 delete[] ss_dssp; ss_dssp = NULL; | |
123 delete[] sa_dssp; sa_dssp = NULL; | |
124 delete[] ss_pred; ss_pred = NULL; | |
125 delete[] ss_conf; ss_conf = NULL; | |
126 delete[] Xcons; Xcons = NULL; | |
127 delete[] l; l = NULL; | |
128 for (int i=0; i</*MAXRES*/par.maxResLen; i++){ | |
129 if (f[i]){ | |
130 delete[] f[i]; f[i] = NULL; | |
131 } | |
132 else break; | |
133 } | |
134 for (int i=0; i</*MAXRES*/par.maxResLen; i++){ | |
135 if (g[i]){ | |
136 delete[] g[i]; g[i] = NULL; | |
137 } | |
138 else break; | |
139 } | |
140 for (int i=0; i</*MAXRES*/par.maxResLen; i++){ | |
141 if (p[i]){ | |
142 delete[] p[i]; p[i] = NULL; | |
143 } | |
144 else break; | |
145 } | |
146 for (int i=0; i</*MAXRES*/par.maxResLen; i++){ | |
147 if (tr[i]){ | |
148 delete[] tr[i]; tr[i] = NULL; | |
149 } | |
150 else break; | |
151 } | |
152 // for (int i=0; i</*MAXRES*/par.maxResLen; i++) if (tr_lin[i]) delete[] tr_lin[i]; else break; | |
153 delete[] f; f = NULL; | |
154 delete[] g; g = NULL; | |
155 delete[] p; p = NULL; | |
156 delete[] tr; tr = NULL; | |
157 // delete[] tr_lin; | |
158 } | |
159 | |
160 ////////////////////////////////////////////////////////////////////////////// | |
161 // Deep-copy constructor | |
162 ////////////////////////////////////////////////////////////////////////////// | |
163 HMM& HMM::operator=(HMM& q) | |
164 { | |
165 L=q.L; | |
166 for (int i=0; i<=L+1; ++i) | |
167 { | |
168 for (int a=0; a<NAA; ++a) | |
169 { | |
170 f[i][a]=q.f[i][a]; | |
171 g[i][a]=q.g[i][a]; | |
172 p[i][a]=q.p[i][a]; | |
173 } | |
174 for (int a=0; a<NTRANS; ++a) | |
175 tr[i][a]=q.tr[i][a]; | |
176 ss_dssp[i]=q.ss_dssp[i]; | |
177 sa_dssp[i]=q.sa_dssp[i]; | |
178 ss_pred[i]=q.ss_pred[i]; | |
179 ss_conf[i]=q.ss_conf[i]; | |
180 l[i]=q.l[i]; | |
181 } | |
182 if (q.Xcons) | |
183 for (int i=0; i<=L+1; ++i) | |
184 Xcons[i] =q.Xcons[i]; | |
185 | |
186 n_display=q.n_display; | |
187 for (int k=0; k<n_display; k++) { | |
188 sname[k]=new(char[strlen(q.sname[k])+1]); | |
189 if (!sname[k]) MemoryError("array of names for sequences to display"); | |
190 strcpy(sname[k],q.sname[k]); | |
191 } | |
192 for (int k=0; k<n_display; k++) { | |
193 seq[k]=new(char[strlen(q.seq[k])+1]); | |
194 if (!seq[k]) MemoryError("array of names for sequences to display"); | |
195 strcpy(seq[k],q.seq[k]); | |
196 } | |
197 ncons=q.ncons; | |
198 nfirst=q.nfirst; | |
199 nss_dssp=q.nss_dssp; | |
200 nsa_dssp=q.nsa_dssp; | |
201 nss_pred=q.nss_pred; | |
202 nss_conf=q.nss_conf; | |
203 | |
204 for (int i=0; i<=L+1; ++i) Neff_M[i]=q.Neff_M[i]; | |
205 for (int i=0; i<=L+1; ++i) Neff_I[i]=q.Neff_I[i]; | |
206 for (int i=0; i<=L+1; ++i) Neff_D[i]=q.Neff_D[i]; | |
207 Neff_HMM=q.Neff_HMM; | |
208 | |
209 strcpy(longname,q.longname); | |
210 strcpy(name,q.name); | |
211 strcpy(fam,q.fam); | |
212 strcpy(sfam,q.sfam); | |
213 strcpy(fold,q.fold); | |
214 strcpy(cl,q.cl); | |
215 strcpy(file,q.file); | |
216 | |
217 lamda=q.lamda; | |
218 mu=q.mu; | |
219 | |
220 for (int a=0; a<NAA; ++a) pav[a]=q.pav[a]; | |
221 N_in=q.N_in; | |
222 N_filtered=q.N_filtered; | |
223 trans_lin=q.trans_lin; | |
224 return (HMM&) (*this); | |
225 } | |
226 | |
227 | |
228 /////////////////////////////////////////////////////////////////////////////// | |
229 /** | |
230 * @brief Read an HMM from an HHsearch .hhm file; return 0 at end of file | |
231 */ | |
232 int | |
233 HMM::Read(FILE* dbf, char* path) | |
234 { | |
235 char line[LINELEN]=""; // input line | |
236 char str3[8]="",str4[8]=""; // first 3 and 4 letters of input line | |
237 char* ptr; // pointer for string manipulation | |
238 int i=0; // index for match state (first=1) | |
239 int a; // amino acid index | |
240 static int warn=0; | |
241 | |
242 trans_lin=0; | |
243 L=0; | |
244 Neff_HMM=0; | |
245 n_display=N_in=N_filtered=0; | |
246 nss_dssp=nsa_dssp=nss_pred=nss_conf=nfirst=ncons=-1; | |
247 lamda=mu=0.0; | |
248 trans_lin=0; // transition probs in log space | |
249 name[0]=longname[0]=fam[0]='\0'; | |
250 //If at the end of while-loop L is still 0 then we have reached end of db file | |
251 | |
252 //Do not delete name and seq vectors because their adresses are transferred to hitlist as part of a hit!! | |
253 | |
254 while (fgetline(line,LINELEN-1,dbf) && !(line[0]=='/' && line[1]=='/')) | |
255 { | |
256 | |
257 if (strscn(line)==NULL) continue; // skip lines that contain only white space | |
258 substr(str3,line,0,2); // copy the first three characters into str3 | |
259 substr(str4,line,0,3); // copy the first four characters into str4 | |
260 | |
261 if (!strncmp("HH",line,2)) continue; | |
262 | |
263 if (!strcmp("NAME",str4)) | |
264 { | |
265 ptr=strscn(line+4); //advance to first non-white-space character | |
266 if (ptr) | |
267 { | |
268 strncpy(longname,ptr,DESCLEN-1); //copy full name to longname | |
269 longname[DESCLEN-1]='\0'; | |
270 strncpy(name,ptr,NAMELEN-1); //copy longname to name... | |
271 strcut(name); //...cut after first word... | |
272 } | |
273 else | |
274 { | |
275 strcpy(longname,"undefined"); | |
276 strcpy(name,"undefined"); | |
277 } | |
278 if (v>=4) cout<<"Reading in HMM "<<name<<":\n"; | |
279 } | |
280 | |
281 else if (!strcmp("FAM",str3)) | |
282 { | |
283 ptr=strscn(line+3); //advance to first non-white-space character | |
284 if (ptr) strncpy(fam,ptr,IDLEN-1); else strcpy(fam,""); //copy family name to basename | |
285 ScopID(cl,fold,sfam,fam); //get scop classification from basename (e.g. a.1.2.3.4) | |
286 } | |
287 | |
288 else if (!strcmp("FILE",str4)) | |
289 { | |
290 if (path) strncpy(file,path,NAMELEN-1); else *file='\0'; // copy path to file variable | |
291 ptr=strscn(line+4); //advance to first non-white-space character | |
292 if (ptr) | |
293 strncat(file,ptr,NAMELEN-1-strlen(file)); // append file name read from file to path | |
294 else strcat(file,"*"); | |
295 } | |
296 | |
297 else if (!strcmp("LENG",str4)) | |
298 { | |
299 ptr=line+4; | |
300 L=strint(ptr); //read next integer (number of match states) | |
301 } | |
302 else if (!strcmp("FILT",str4) || !strcmp("NSEQ",str4)) | |
303 { | |
304 ptr=line+4; | |
305 N_filtered=strint(ptr); //read next integer: number of sequences after filtering | |
306 N_in=strint(ptr); //read next integer: number of sequences in alignment | |
307 } | |
308 | |
309 else if (!strcmp("NEFF",str4) || !strcmp("NAA",str3)) sscanf(line+6,"%f",&Neff_HMM); | |
310 | |
311 else if (!strcmp("EVD",str3)) | |
312 { | |
313 // char key[IDLEN]; | |
314 sscanf(line+6,"%f %f",&lamda,&mu); | |
315 // sscanf(line+22,"%s",key); | |
316 // lamda_hash.Add(key,lamda); | |
317 // mu_hash.Add(key,mu); | |
318 } | |
319 | |
320 else if (!strcmp("DESC",str4)) continue; | |
321 else if (!strcmp("COM",str3)) continue; | |
322 else if (!strcmp("DATE",str4)) continue; | |
323 | |
324 ///////////////////////////////////////////////////////////////////////////////////// | |
325 // Read template sequences that should get displayed in output alignments | |
326 else if (!strcmp("SEQ",str3)) | |
327 { | |
328 //char cur_seq[MAXCOL]=""; //Sequence currently read in | |
329 char *cur_seq = new(char[par.maxColCnt]); //Sequence currently read in | |
330 int k; // sequence index; start with -1; after reading name of n'th sequence-> k=n | |
331 int h; // index for character in input line | |
332 int l=1; // index of character in sequence seq[k] | |
333 int i=1; // index of match states in ss_dssp[i] and ss_pred[i] sequence | |
334 int n_seq=0; // number of sequences to be displayed EXCLUDING ss sequences | |
335 cur_seq[0]='-'; // overwrite '\0' character at beginning to be able to do strcpy(*,cur_seq) | |
336 k=-1; | |
337 while (fgetline(line,LINELEN-1,dbf) && line[0]!='#') | |
338 { | |
339 if (v>=4) cout<<"Read from file:"<<line<<"\n"; //DEBUG | |
340 if (line[0]=='>') //line contains sequence name | |
341 { | |
342 if (k>=MAXSEQDIS-1) //maximum number of allowable sequences exceeded | |
343 {while (fgetline(line,LINELEN-1,dbf) && line[0]!='#'); break;} | |
344 k++; | |
345 if (!strncmp(line,">ss_dssp",8)) nss_dssp=k; | |
346 else if (!strncmp(line,">sa_dssp",8)) nsa_dssp=k; | |
347 else if (!strncmp(line,">ss_pred",8)) nss_pred=k; | |
348 else if (!strncmp(line,">ss_conf",8)) nss_conf=k; | |
349 else if (!strncmp(line,">Cons-",6) || !strncmp(line,">Consensus",10)) ncons=k; | |
350 else | |
351 { | |
352 if (nfirst==-1) nfirst=k; | |
353 if (n_seq>=par.nseqdis) | |
354 {while (fgetline(line,LINELEN-1,dbf) && line[0]!='#'); k--; break;} | |
355 n_seq++; | |
356 } | |
357 | |
358 //If this is not the first sequence then store residues of previous sequence | |
359 if (k>0) { | |
360 seq[k-1]=new(char[strlen(cur_seq)+1]); | |
361 if (!seq[k-1]) MemoryError("array of sequences to display"); | |
362 strcpy(seq[k-1],cur_seq); | |
363 } | |
364 | |
365 // store sequence name | |
366 strcut(line+1); //find next white-space character and overwrite it with end-of-string character | |
367 sname[k] = new (char[strlen(line+1)+1]); //+1 for terminating '\0' | |
368 if (!sname[k]) MemoryError("array of names for sequences to display"); | |
369 strcpy(sname[k],line+1); //store sequence name in **name | |
370 l=1; i=1; | |
371 } | |
372 else //line contains sequence residues | |
373 { | |
374 if (k==-1) | |
375 { | |
376 cerr<<endl<<"WARNING: Ignoring following line while reading HMM"<<name<<":\n\'"<<line<<"\'\n"; | |
377 continue; | |
378 } | |
379 | |
380 h=0; //counts characters in current line | |
381 | |
382 // Check whether all characters are correct; store into cur_seq | |
383 if (k==nss_dssp) // lines with dssp secondary structure states (. - H E C S T G B) | |
384 { | |
385 while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1) | |
386 { | |
387 if (ss2i(line[h])>=0 && line[h]!='.') | |
388 { | |
389 char c=ss2ss(line[h]); | |
390 cur_seq[l]=c; | |
391 if (c!='.' && !(c>='a' && c<='z')) ss_dssp[i++]=ss2i(c); | |
392 l++; | |
393 } | |
394 else if (v && ss2i(line[h])==-2) | |
395 cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<h<<" in line '"<<line<<"' of HMM "<<name<<"\n"; | |
396 h++; | |
397 } | |
398 } | |
399 if (k==nsa_dssp) // lines with dssp secondary solvent accessibility (- A B C D E) | |
400 { | |
401 while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1) | |
402 { | |
403 if (sa2i(line[h])>=0) | |
404 { | |
405 char c=line[h]; | |
406 cur_seq[l]=c; | |
407 if (c!='.' && !(c>='a' && c<='z')) sa_dssp[i++]=sa2i(c); | |
408 l++; | |
409 } | |
410 else if (v && sa2i(line[h])==-2) | |
411 cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<h<<" in line '"<<line<<"' of HMM "<<name<<"\n"; | |
412 h++; | |
413 } | |
414 } | |
415 else if (k==nss_pred) // lines with predicted secondary structure (. - H E C) | |
416 { | |
417 while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1) | |
418 { | |
419 if (ss2i(line[h])>=0 && ss2i(line[h])<=3 && line[h]!='.') | |
420 { | |
421 char c=ss2ss(line[h]); | |
422 cur_seq[l]=c; | |
423 if (c!='.' && !(c>='a' && c<='z')) ss_pred[i++]=ss2i(c); | |
424 l++; | |
425 } | |
426 else if (v && ss2i(line[h])==-2) | |
427 cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<h<<" in line '"<<line<<"' of HMM "<<name<<"\n"; | |
428 h++; | |
429 } | |
430 } | |
431 else if (k==nss_conf) // lines with confidence values should contain only 0-9, '-', or '.' | |
432 { | |
433 while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1) | |
434 { | |
435 if (line[h]=='-' || (line[h]>='0' && line[h]<='9')) | |
436 { | |
437 cur_seq[l]=line[h]; | |
438 ss_conf[l]=cf2i(line[h]); | |
439 l++; | |
440 } | |
441 else if (v && cf2i(line[h])==-2) | |
442 cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<h<<" in line '"<<line<<"' of HMM "<<name<<"\n"; | |
443 h++; | |
444 } | |
445 } | |
446 else // normal line containing residues | |
447 { | |
448 while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1) | |
449 { | |
450 if (aa2i(line[h])>=0 && line[h]!='.') // ignore '.' and white-space characters ' ', \t and \n (aa2i()==-1) | |
451 {cur_seq[l]=line[h]; l++;} | |
452 else if (aa2i(line[h])==-2 && v) | |
453 cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<h<<" in line '"<<line<<"' of HMM "<<name<<"\n"; | |
454 h++; | |
455 } | |
456 } | |
457 cur_seq[l]='\0'; //Ensure that cur_seq ends with a '\0' character | |
458 | |
459 } //end else | |
460 } //while(getline) | |
461 //If this is not the first sequence some residues have already been read in | |
462 if (k>=0) { | |
463 seq[k]=new(char[strlen(cur_seq)+1]); | |
464 if (!seq[k]) MemoryError("array of sequences to display"); | |
465 strcpy(seq[k],cur_seq); | |
466 } | |
467 n_display=k+1; | |
468 | |
469 // DEBUG | |
470 if (v>=4) | |
471 { | |
472 printf("nss_dssp=%i nsa_dssp=%i nss_pred=%i nss_conf=%i nfirst=%i\n",nss_dssp,nsa_dssp,nss_pred,nss_conf,nfirst); | |
473 for (k=0; k<n_display; k++) | |
474 { | |
475 int j; | |
476 cout<<">"<<sname[k]<<"(k="<<k<<")\n"; | |
477 if (k==nss_dssp) {for (j=1; j<=L; j++) cout<<char(i2ss(ss_dssp[j]));} | |
478 else if (k==nsa_dssp) {for (j=1; j<=L; j++) cout<<char(i2sa(sa_dssp[j]));} | |
479 else if (k==nss_pred) {for (j=1; j<=L; j++) cout<<char(i2ss(ss_pred[j]));} | |
480 else if (k==nss_conf) {for (j=1; j<=L; j++) cout<<int(ss_conf[j]-1);} | |
481 else {for (j=1; j<=L; j++) cout<<seq[k][j];} | |
482 cout<<"\n"; | |
483 } | |
484 } | |
485 | |
486 } //end if("SEQ") | |
487 | |
488 ///////////////////////////////////////////////////////////////////////////////////// | |
489 // Read average amino acid frequencies for HMM | |
490 else if (!strcmp("FREQ",str4)) | |
491 { | |
492 fprintf(stderr,"Error: hhm file has obsolete format.\n"); | |
493 fprintf(stderr,"Please use hhmake version > 1.1 to generate hhm files.\n"); | |
494 exit(1); | |
495 } | |
496 | |
497 else if (!strcmp("AVER",str4)) {} // AVER line scrapped | |
498 else if (!strcmp("NULL",str4)) | |
499 { | |
500 ptr=line+4; | |
501 for (a=0; a<20 && ptr; ++a) | |
502 //s2[a]: transform amino acids Sorted by alphabet -> internal numbers for amino acids | |
503 pb[s2a[a]] = (float) fpow2(float(-strinta(ptr))/HMMSCALE); | |
504 if (!ptr) return Warning(dbf,line,name); | |
505 if (v>=4) | |
506 { | |
507 printf("\nNULL "); | |
508 for (a=0; a<20; ++a) printf("%5.1f ",100.*pb[s2a[a]]); | |
509 printf("\n"); | |
510 } | |
511 } | |
512 | |
513 ///////////////////////////////////////////////////////////////////////////////////// | |
514 // Read transition probabilities from start state | |
515 else if (!strcmp("HMM",str3)) | |
516 { | |
517 fgetline(line,LINELEN-1,dbf); // Skip line with amino acid labels | |
518 fgetline(line,LINELEN-1,dbf); // Skip line with transition labels | |
519 ptr=line; | |
520 for (a=0; a<=D2D && ptr; ++a) | |
521 tr[0][a] = float(-strinta(ptr))/HMMSCALE; //store transition probabilites as log2 values | |
522 // strinta returns next integer in string and puts ptr to first char | |
523 // after the integer. Returns -99999 if '*' is found. | |
524 // ptr is set to 0 if no integer is found after ptr. | |
525 Neff_M[0] = float(strinta(ptr))/HMMSCALE; // Read eff. number of sequences with M->? transition | |
526 Neff_I[0] = float(strinta(ptr))/HMMSCALE; // Read eff. number of sequences with I->? transition | |
527 Neff_D[0] = float(strinta(ptr))/HMMSCALE; // Read eff. number of sequences with D->? transition | |
528 if (!ptr) return Warning(dbf,line,name); | |
529 | |
530 ///////////////////////////////////////////////////////////////////////////////////// | |
531 // Read columns of HMM | |
532 int next_i=0; // index of next column | |
533 while (fgetline(line,LINELEN-2,dbf) && !(line[0]=='/' && line[1]=='/') && line[0]!='#') | |
534 { | |
535 if (strscn(line)==NULL) continue; // skip lines that contain only white space | |
536 | |
537 // Read in AA probabilities | |
538 ptr=line+1; | |
539 int prev_i = next_i; | |
540 next_i = strint(ptr); ++i; | |
541 if (v && next_i!=prev_i+1) | |
542 if (++warn<=5) | |
543 { | |
544 cerr<<endl<<"WARNING: in HMM "<<name<<" state "<<prev_i<<" is followed by state "<<next_i<<"\n"; | |
545 if (warn==5) cerr<<endl<<"WARNING: further warnings while reading HMMs will be suppressed.\n"; | |
546 } | |
547 if (i>L) | |
548 { | |
549 cerr<<endl<<"WARNING: in HMM "<<name<<" there are more columns than the stated length "<<L<<". Skipping HMM\n"; | |
550 return 2; | |
551 } | |
552 if (i>=/*MAXRES*/par.maxResLen-2) | |
553 { | |
554 fgetline(line,LINELEN-1,dbf); // Skip line | |
555 continue; | |
556 } | |
557 | |
558 for (a=0; a<20 && ptr; ++a) | |
559 // f[i][s2a[a]] = (float)pow(2.,float(-strinta(ptr))/HMMSCALE); | |
560 f[i][s2a[a]] = fpow2(float(-strinta(ptr))/HMMSCALE); // speed-up ~5 s for 10000 SCOP domains | |
561 | |
562 //s2a[a]: transform amino acids Sorted by alphabet -> internal numbers for amino acids | |
563 l[i]=strint(ptr); | |
564 if (!ptr) return Warning(dbf,line,name); | |
565 if (v>=4) | |
566 { | |
567 printf("%s",line); | |
568 printf("%6i ",i); | |
569 for (a=0; a<20; ++a) printf("%5.1f ",100*f[i][s2a[a]]); | |
570 printf("%5i",l[i]); | |
571 printf("\n"); | |
572 } | |
573 | |
574 // Read transition probabilities | |
575 fgetline(line,LINELEN-1,dbf); // Skip line with amino acid labels | |
576 if (line[0]!=' ' && line[0]!='\t') return Warning(dbf,line,name); | |
577 ptr=line; | |
578 for (a=0; a<=D2D && ptr; ++a) | |
579 tr[i][a] = float(-strinta(ptr))/HMMSCALE; //store transition prob's as log2-values | |
580 Neff_M[i] = float(strinta(ptr))/HMMSCALE; // Read eff. number of sequences with M->? transition | |
581 Neff_I[i] = float(strinta(ptr))/HMMSCALE; // Read eff. number of sequences with I->? transition | |
582 Neff_D[i] = float(strinta(ptr))/HMMSCALE; // Read eff. number of sequences with D->? transition | |
583 if (!ptr) return Warning(dbf,line,name); | |
584 if (v>=4) | |
585 { | |
586 printf(" "); | |
587 for (a=0; a<=D2D; ++a) printf("%5.1f ",100*fpow2(tr[i][a])); | |
588 printf("%5.1f %5.1f %5.1f \n",Neff_M[i],Neff_I[i],Neff_D[i]); | |
589 } | |
590 } | |
591 if (line[0]=='/' && line[1]=='/') break; | |
592 } | |
593 else if (v) cerr<<endl<<"WARNING: Ignoring line\n\'"<<line<<"\'\nin HMM "<<name<<"\n"; | |
594 | |
595 } //while(getline) | |
596 | |
597 if (L==0) return 0; //End of db file -> stop reading in | |
598 | |
599 // Set coefficients of EVD (= 0.0 if not calibrated for these parameters) | |
600 // lamda = lamda_hash.Show(par.Key()); | |
601 // mu = mu_hash.Show(par.Key()); | |
602 if (lamda && v>=3) printf("HMM %s is already calibrated: lamda=%-5.3f, mu=%-5.2f\n",name,lamda,mu); | |
603 | |
604 if (v && i!=L) cerr<<endl<<"Warning: in HMM "<<name<<" there are only "<<i<<" columns while the stated length is "<<L<<"\n"; | |
605 if (v && i>/*MAXRES*/par.maxResLen-2) {i=/*MAXRES*/par.maxResLen-2; cerr<<endl<<"WARNING: maximum number "<</*MAXRES*/par.maxResLen-2<<" of residues exceeded while reading HMM "<<name<<"\n";} | |
606 if (v && !i) cerr<<endl<<"WARNING: HMM "<<name<<" contains no match states. Check the alignment that gave rise to this HMM.\n"; | |
607 if (v>=2) cout<<"Read in HMM "<<name<<" with "<<L<<" match states and effective number of sequences = "<<Neff_HMM<<"\n"; | |
608 L = i; | |
609 | |
610 // Set emission probabilities of zero'th (begin) state and L+1st (end) state to background probabilities | |
611 for (a=0; a<20; ++a) f[0][a]=f[L+1][a]=pb[a]; | |
612 Neff_M[L+1]=1.0f; | |
613 Neff_I[L+1]=Neff_D[L+1]=0.0f; | |
614 | |
615 return 1; //return status: ok | |
616 | |
617 } /* this is the end of HMM::Read() */ | |
618 | |
619 | |
620 ///////////////////////////////////////////////////////////////////////////////////// | |
621 /** | |
622 * @brief Read an HMM from a HMMer .hmm file; return 0 at end of file | |
623 */ | |
624 int | |
625 HMM::ReadHMMer(FILE* dbf, char* filestr) | |
626 { | |
627 char line[LINELEN]=""; // input line | |
628 char desc[DESCLEN]=""; // description of family | |
629 char str4[5]=""; // first 4 letters of input line | |
630 char* ptr; // pointer for string manipulation | |
631 int i=0; // index for match state (first=1) | |
632 int a; // amino acid index | |
633 char dssp=0; // 1 if a consensus SS has been found in the transition prob lines | |
634 char annot=0; // 1 if at least one annotation character in insert lines is ne '-' or ' ' | |
635 int k=0; // index for seq[k] | |
636 static char ignore_hmmer_cal = 0; | |
637 char* annotchr; // consensus amino acids in ASCII format, or, in HMMER format, the reference annotation character in insert line | |
638 //annotchr = new char[MAXRES]; // consensus amino acids in ASCII format, or, in HMMER format, the reference annotation character in insert line | |
639 annotchr = new char[par.maxResLen]; // consensus amino acids in ASCII format, or, in HMMER format, the reference annotation character in insert line | |
640 static int warn=0; | |
641 int iAlpha = 20; /* size of alphabet, default is protein = 20 */ | |
642 double dAlphaInv = 1.00 / (double)(iAlpha); /* weight of AA */ | |
643 | |
644 trans_lin=0; | |
645 L=0; | |
646 Neff_HMM=0; | |
647 n_display=N_in=N_filtered=0; | |
648 nss_dssp=nsa_dssp=nss_pred=nss_conf=nfirst=ncons=-1; | |
649 lamda=mu=0.0; | |
650 trans_lin=0; // transition probs in log space | |
651 name[0]=longname[0]=desc[0]=fam[0]='\0'; | |
652 //If at the end of while-loop L is still 0 then we have reached end of db file | |
653 | |
654 // Do not delete name and seq vectors because their adresses are transferred to hitlist as part of a hit!! | |
655 | |
656 while (fgetline(line,LINELEN-1,dbf) && !(line[0]=='/' && line[1]=='/')) | |
657 { | |
658 | |
659 if (strscn(line)==NULL) continue; // skip lines that contain only white space | |
660 if (!strncmp("HMMER",line,5)) continue; | |
661 | |
662 substr(str4,line,0,3); // copy the first four characters into str4 | |
663 | |
664 if (!strcmp("NAME",str4) && name[0]=='\0') | |
665 { | |
666 ptr=strscn(line+4); // advance to first non-white-space character | |
667 strncpy(name,ptr,NAMELEN-1); // copy full name to name | |
668 strcut(name); // ...cut after first word... | |
669 if (v>=4) cout<<"Reading in HMM "<<name<<":\n"; | |
670 } | |
671 | |
672 else if (!strcmp("ACC ",str4)) | |
673 { | |
674 ptr=strscn(line+4); // advance to first non-white-space character | |
675 strncpy(longname,ptr,DESCLEN-1); // copy Accession id to longname... | |
676 } | |
677 | |
678 else if (!strcmp("DESC",str4)) | |
679 { | |
680 ptr=strscn(line+4); // advance to first non-white-space character | |
681 if (ptr) | |
682 { | |
683 strncpy(desc,ptr,DESCLEN-1); // copy description to name... | |
684 desc[DESCLEN-1]='\0'; | |
685 strcut(ptr); // ...cut after first word... | |
686 } | |
687 if (!ptr || ptr[1]!='.' || strchr(ptr+3,'.')==NULL) strcpy(fam,""); else strcpy(fam,ptr); // could not find two '.' in name? | |
688 } | |
689 | |
690 else if (!strcmp("LENG",str4)) | |
691 { | |
692 ptr=line+4; | |
693 L=strint(ptr); //read next integer (number of match states) | |
694 } | |
695 | |
696 else if (!strcmp("ALPH",str4)) { | |
697 | |
698 ptr=strscn(line+4); | |
699 | |
700 if (0 == strcmp(ptr, "Amino")){ | |
701 iAlpha = 20; | |
702 } | |
703 else if (0 == strcmp(ptr, "Nucleic")){ | |
704 iAlpha = 4; | |
705 printf("%s:%s:%d: WARNING: HMM reading does not work for DNA/RNA\n", | |
706 __FUNCTION__, __FILE__, __LINE__); | |
707 } | |
708 else { | |
709 return Warning(dbf,line,name); | |
710 } | |
711 dAlphaInv = 1.00 / (double)(iAlpha); | |
712 //continue; | |
713 } | |
714 else if (!strcmp("RF ",str4)) continue; | |
715 else if (!strcmp("CS ",str4)) continue; | |
716 else if (!strcmp("MAP ",str4)) continue; | |
717 else if (!strcmp("COM ",str4)) continue; | |
718 else if (!strcmp("NSEQ",str4)) | |
719 { | |
720 ptr=line+4; | |
721 N_in=N_filtered=strint(ptr); //read next integer: number of sequences after filtering | |
722 } | |
723 | |
724 else if (!strcmp("DATE",str4)) continue; | |
725 else if (!strncmp("CKSUM ",line,5)) continue; | |
726 else if (!strcmp("GA ",str4)) continue; | |
727 else if (!strcmp("TC ",str4)) continue; | |
728 else if (!strcmp("NC ",str4)) continue; | |
729 | |
730 else if (!strncmp("SADSS",line,5)) | |
731 { | |
732 if (nsa_dssp<0) | |
733 { | |
734 nsa_dssp=k++; | |
735 seq[nsa_dssp] = new(char[/*MAXRES*/par.maxResLen+2]); | |
736 sname[nsa_dssp] = new(char[15]); | |
737 strcpy(seq[nsa_dssp]," "); | |
738 strcpy(sname[nsa_dssp],"sa_dssp"); | |
739 | |
740 } | |
741 ptr=strscn(line+5); | |
742 if (ptr) | |
743 { | |
744 strcut(ptr); | |
745 if (strlen(seq[nsa_dssp])+strlen(ptr)>=(unsigned)(/*MAXRES*/par.maxResLen)) | |
746 printf("\nWARNING: HMM %s has SADSS records with more than %i residues.\n",name,/*MAXRES*/par.maxResLen); | |
747 else strcat(seq[nsa_dssp],ptr); | |
748 } | |
749 } | |
750 | |
751 else if (!strncmp("SSPRD",line,5)) | |
752 { | |
753 if (nss_pred<0) | |
754 { | |
755 nss_pred=k++; | |
756 seq[nss_pred] = new(char[/*MAXRES*/par.maxResLen+2]); | |
757 sname[nss_pred] = new(char[15]); | |
758 strcpy(seq[nss_pred]," "); | |
759 strcpy(sname[nss_pred],"ss_pred"); | |
760 | |
761 } | |
762 ptr=strscn(line+5); | |
763 if (ptr) | |
764 { | |
765 strcut(ptr); | |
766 if (strlen(seq[nss_pred])+strlen(ptr)>=(unsigned)(/*MAXRES*/par.maxResLen)) | |
767 printf("\nWARNING: HMM %s has SSPRD records with more than %i residues.\n",name,/*MAXRES*/par.maxResLen); | |
768 else strcat(seq[nss_pred],ptr); | |
769 } | |
770 } | |
771 | |
772 else if (!strncmp("SSCON",line,5)) | |
773 { | |
774 if (nss_conf<0) | |
775 { | |
776 nss_conf=k++; | |
777 seq[nss_conf] = new(char[/*MAXRES*/par.maxResLen+2]); | |
778 sname[nss_conf] = new(char[15]); | |
779 strcpy(seq[nss_conf]," "); | |
780 strcpy(sname[nss_conf],"ss_conf"); | |
781 } | |
782 ptr=strscn(line+5); | |
783 if (ptr) | |
784 { | |
785 strcut(ptr); | |
786 if (strlen(seq[nss_conf])+strlen(ptr)>=(unsigned)(/*MAXRES*/par.maxResLen)) | |
787 printf("\nWARNING: HMM %s has SSPRD records with more than %i residues.\n",name,/*MAXRES*/par.maxResLen); | |
788 else strcat(seq[nss_conf],ptr); | |
789 } | |
790 } | |
791 | |
792 else if (!strncmp("SSCIT",line,5)) continue; | |
793 else if (!strcmp("XT ",str4)) continue; | |
794 else if (!strcmp("NULT",str4)) continue; | |
795 | |
796 else if (!strcmp("NULE",str4)) | |
797 { | |
798 ptr=line+4; | |
799 for (a=0; (a < iAlpha) && ptr; ++a){ | |
800 /* FIXME: FS introduced alphabet size (was '20') | |
801 and dAlphaInv (was '0.05' = 1/20) */ | |
802 //s2a[a]: transform amino acids Sorted by alphabet -> internal numbers for amino acids | |
803 pb[s2a[a]] = (float) dAlphaInv * fpow2(float(strinta(ptr,-99999))/HMMSCALE); /* dAlphaInv */ | |
804 } | |
805 for (a = iAlpha; a < 20; a++){ | |
806 pb[s2a[a]] = 0.0; | |
807 } | |
808 if (!ptr) return Warning(dbf,line,name); | |
809 if (v>=4) | |
810 { | |
811 printf("\nNULL "); | |
812 for (a=0; a<iAlpha; ++a) { /* FIXME: FS introduced iAlpha, was '20' */ | |
813 printf("%5.1f ",100.*pb[s2a[a]]); | |
814 } | |
815 printf("\n"); | |
816 } | |
817 } | |
818 | |
819 else if (!strcmp("EVD ",str4)) | |
820 { | |
821 char* ptr=line+4; | |
822 ptr = strscn(ptr); | |
823 sscanf(ptr,"%f",&lamda); | |
824 ptr = strscn(ptr); | |
825 sscanf(ptr,"%f",&mu); | |
826 if (lamda<0) | |
827 { | |
828 if (v>=2 && ignore_hmmer_cal==0) | |
829 cerr<<endl<<"Warning: some HMMs have been calibrated with HMMER's 'hmmcalibrate'. These calibrations will be ignored\n"; | |
830 ignore_hmmer_cal=1; | |
831 mu = lamda = 0.0; | |
832 } | |
833 } | |
834 | |
835 ///////////////////////////////////////////////////////////////////////////////////// | |
836 // Read transition probabilities from start state | |
837 else if (!strncmp("HMM",line,3)) | |
838 { | |
839 fgetline(line,LINELEN-1,dbf); // Skip line with amino acid labels | |
840 fgetline(line,LINELEN-1,dbf); // Skip line with transition labels | |
841 ptr=line; | |
842 for (a=0; a<=M2D && ptr; ++a) | |
843 tr[0][a] = float(strinta(ptr,-99999))/HMMSCALE; //store transition probabilites as log2 values | |
844 // strinta returns next integer in string and puts ptr to first char | |
845 // after the integer. Returns -99999 if '*' is found. | |
846 // ptr is set to 0 if no integer is found after ptr. | |
847 tr[0][I2M] = tr[0][D2M] = 0.0; | |
848 tr[0][I2I] = tr[0][D2D] = -99999.0; | |
849 if (!ptr) return Warning(dbf,line,name); | |
850 if (v>=4) | |
851 { | |
852 printf(" "); | |
853 for (a=0; a<=D2D && ptr; ++a) printf("%5.1f ",100*fpow2(tr[i][a])); | |
854 printf("\n"); | |
855 } | |
856 | |
857 // Prepare to store DSSP states (if there are none, delete afterwards) | |
858 nss_dssp=k++; | |
859 seq[nss_dssp] = new(char[/*MAXRES*/par.maxResLen+2]); | |
860 sname[nss_dssp] = new(char[15]); | |
861 strcpy(sname[nss_dssp],"ss_dssp"); | |
862 | |
863 ///////////////////////////////////////////////////////////////////////////////////// | |
864 // Read columns of HMM | |
865 int next_i=0; // index of next column | |
866 while (fgetline(line,LINELEN-1,dbf) && !(line[0]=='/' && line[1]=='/') && line[0]!='#') | |
867 { | |
868 if (strscn(line)==NULL) continue; // skip lines that contain only white space | |
869 | |
870 // Read in AA probabilities | |
871 ptr=line; | |
872 int prev_i = next_i; | |
873 next_i = strint(ptr); ++i; | |
874 if (v && next_i!=prev_i+1) | |
875 if (++warn<5) | |
876 { | |
877 cerr<<endl<<"WARNING: in HMM "<<name<<" state "<<prev_i<<" is followed by state "<<next_i<<"\n"; | |
878 if (warn==5) cerr<<endl<<"WARNING: further warnings while reading HMMs will be suppressed.\n"; | |
879 } | |
880 if (i>L) | |
881 { | |
882 cerr<<endl<<"Error: in HMM "<<name<<" there are more columns than the stated length "<<L<<"\n"; | |
883 return 2; | |
884 } | |
885 if (i>L && v) | |
886 cerr<<endl<<"WARNING: in HMM "<<name<<" there are more columns than the stated length "<<L<<"\n"; | |
887 if (i>=/*MAXRES*/par.maxResLen-2) | |
888 { | |
889 fgetline(line,LINELEN-1,dbf); // Skip two lines | |
890 fgetline(line,LINELEN-1,dbf); | |
891 continue; | |
892 } | |
893 | |
894 for (a=0; (a<iAlpha) && ptr; ++a){ /* FIXME: FS introduced iAlpha, was '20' */ | |
895 f[i][s2a[a]] = (float) pb[s2a[a]]*fpow2(float(strinta(ptr,-99999))/HMMSCALE); | |
896 //s2a[a]: transform amino acids Sorted by alphabet -> internal numbers for amino acids | |
897 } | |
898 for (a = iAlpha; a < 20; a++){ | |
899 f[i][s2a[a]] = 0.0; | |
900 } | |
901 if (!ptr) return Warning(dbf,line,name); | |
902 if (v>=4) | |
903 { | |
904 printf("%6i ",i); | |
905 for (a=0; a<iAlpha; ++a) { /* FIXME: FS introduced iAlpha, was '20' */ | |
906 printf("%5.1f ",100*f[i][s2a[a]]); | |
907 } | |
908 printf("\n"); | |
909 } | |
910 | |
911 // Read insert emission line | |
912 fgetline(line,LINELEN-1,dbf); | |
913 ptr = strscn(line); | |
914 if (!ptr) return Warning(dbf,line,name); | |
915 annotchr[i]=uprchr(*ptr); | |
916 if (*ptr!='-' && *ptr!=' ') annot=1; | |
917 | |
918 // Read annotation character and seven transition probabilities | |
919 fgetline(line,LINELEN-1,dbf); | |
920 ptr = strscn(line); | |
921 switch (*ptr) | |
922 { | |
923 case 'H': | |
924 ss_dssp[i]=1; | |
925 seq[nss_dssp][i]=*ptr; | |
926 dssp=1; | |
927 break; | |
928 case 'E': | |
929 ss_dssp[i]=2; | |
930 seq[nss_dssp][i]=*ptr; | |
931 dssp=1; | |
932 break; | |
933 case 'C': | |
934 ss_dssp[i]=3; | |
935 seq[nss_dssp][i]=*ptr; | |
936 dssp=1; | |
937 break; | |
938 case 'S': | |
939 ss_dssp[i]=4; | |
940 seq[nss_dssp][i]=*ptr; | |
941 dssp=1; | |
942 break; | |
943 case 'T': | |
944 ss_dssp[i]=5; | |
945 seq[nss_dssp][i]=*ptr; | |
946 dssp=1; | |
947 break; | |
948 case 'G': | |
949 ss_dssp[i]=6; | |
950 seq[nss_dssp][i]=*ptr; | |
951 dssp=1; | |
952 break; | |
953 case 'B': | |
954 ss_dssp[i]=7; | |
955 seq[nss_dssp][i]=*ptr; | |
956 dssp=1; | |
957 break; | |
958 case 'I': | |
959 dssp=1; | |
960 case '~': | |
961 ss_dssp[i]=3; | |
962 seq[nss_dssp][i]=*ptr; | |
963 break; | |
964 case '-': | |
965 default: | |
966 ss_dssp[i]=0; | |
967 seq[nss_dssp][i]=*ptr; | |
968 break; | |
969 | |
970 } | |
971 | |
972 ptr+=2; | |
973 for (a=0; a<=D2D && ptr; ++a) | |
974 tr[i][a] = float(strinta(ptr,-99999))/HMMSCALE; //store transition prob's as log2-values | |
975 if (!ptr) return Warning(dbf,line,name); | |
976 if (v>=4) | |
977 { | |
978 printf(" "); | |
979 for (a=0; a<=D2D; ++a) printf("%5.1f ",100*fpow2(tr[i][a])); | |
980 printf("\n"); | |
981 } | |
982 } | |
983 | |
984 if (line[0]=='/' && line[1]=='/') break; | |
985 | |
986 } /* strncmp("HMM") */ | |
987 | |
988 } //while(getline) | |
989 | |
990 if (L==0) return 0; //End of db file -> stop reading in | |
991 | |
992 // Set coefficients of EVD (= 0.0 if not calibrated for these parameters) | |
993 // lamda = lamda_hash.Show(par.Key()); | |
994 // mu = mu_hash.Show(par.Key()); | |
995 if (lamda && v>=2) printf("HMM %s is already calibrated: lamda=%-5.3f, mu=%-5.2f\n",name,lamda,mu); | |
996 | |
997 if (v && i!=L) cerr<<endl<<"Warning: in HMM "<<name<<" there are only "<<i<<" columns while the stated length is "<<L<<"\n"; | |
998 if (v && i>=/*MAXRES*/par.maxResLen-2) {i=/*MAXRES*/par.maxResLen-2; cerr<<endl<<"WARNING: maximum number "<</*MAXRES*/par.maxResLen-2<<" of residues exceeded while reading HMM "<<name<<"\n";} | |
999 if (v && !i) cerr<<endl<<"WARNING: HMM "<<name<<" contains no match states. Check the alignment that gave rise to this HMM.\n"; | |
1000 L = i; | |
1001 | |
1002 if (strlen(longname)>0) strcat(longname," "); | |
1003 strncat(longname,name,DESCLEN-strlen(longname)-1); // longname = ACC NAME DESC | |
1004 if (strlen(name)>0) strcat(longname," "); | |
1005 strncat(longname,desc,DESCLEN-strlen(longname)-1); | |
1006 longname[DESCLEN-1]='\0'; | |
1007 ScopID(cl,fold,sfam,fam);// get scop classification from basename (e.g. a.1.2.3.4) | |
1008 RemoveExtension(file,filestr); // copy name of dbfile without extension into 'file' | |
1009 | |
1010 // Secondary structure | |
1011 if (!dssp) | |
1012 { | |
1013 // remove dssp sequence | |
1014 // memory that had been allocated in case ss_dssp was given needs to be freed | |
1015 delete[] seq[nss_dssp]; seq[nss_dssp] = NULL; | |
1016 // memory that had been allocated in case ss_dssp was given needs to be freed | |
1017 delete[] sname[nss_dssp]; sname[nss_dssp] = NULL; | |
1018 nss_dssp=-1; | |
1019 k--; | |
1020 } | |
1021 if (nss_pred>=0) | |
1022 { | |
1023 for (i=1; i<=L; ++i) ss_pred[i] = ss2i(seq[nss_pred][i]); | |
1024 if (nss_conf>=0) | |
1025 for (i=1; i<=L; ++i) ss_conf[i] = cf2i(seq[nss_conf][i]); | |
1026 else | |
1027 for (i=1; i<=L; ++i) ss_conf[i] = 5; | |
1028 } | |
1029 | |
1030 // Copy query (first sequence) and consensus residues? | |
1031 if (par.showcons) | |
1032 { | |
1033 sname[k]=new(char[10]); | |
1034 strcpy(sname[k],"Consensus"); | |
1035 sname[k+1]=new(char[strlen(longname)+1]); | |
1036 strcpy(sname[k+1],longname); | |
1037 seq[k]=new(char[L+2]); | |
1038 seq[k][0]=' '; | |
1039 seq[k][L+1]='\0'; | |
1040 seq[k+1]=new(char[L+2]); | |
1041 seq[k+1][0]=' '; | |
1042 seq[k+1][L+1]='\0'; | |
1043 for (i=1; i<=L; ++i) | |
1044 { | |
1045 float pmax=0.0; | |
1046 int amax=0; | |
1047 for (a=0; a<NAA; ++a) | |
1048 if (f[i][a]>pmax) {amax=a; pmax=f[i][a];} | |
1049 if (pmax>0.6) seq[k][i]=i2aa(amax); | |
1050 else if (pmax>0.4) seq[k][i]=lwrchr(i2aa(amax)); | |
1051 else seq[k][i]='x'; | |
1052 seq[k+1][i]=i2aa(amax); | |
1053 } | |
1054 ncons=k++; // nfirst is set later! | |
1055 } | |
1056 else | |
1057 { | |
1058 sname[k]=new(char[strlen(longname)+1]); | |
1059 /* FIXME valgrind says bytes get lost here during hmm iteration -- | |
1060 fixed in HMM::ClobberGlobal(), I (FS) think */ | |
1061 strcpy(sname[k],longname); | |
1062 seq[k]=new(char[L+2]); | |
1063 seq[k][0]=' '; | |
1064 seq[k][L+1]='\0'; | |
1065 } | |
1066 | |
1067 if (annot) // read in some annotation characters? | |
1068 { | |
1069 annotchr[0]=' '; | |
1070 annotchr[L+1]='\0'; | |
1071 strcpy(seq[k],annotchr); // overwrite the consensus sequence with the annotation characters | |
1072 } | |
1073 else if (!par.showcons) // we have not yet calculated the consensus, but we need it now as query (first sequence) | |
1074 { | |
1075 /* FIXME: FS set ncons=k | |
1076 don't understand why it is not set but seem to need it */ | |
1077 ncons = k; | |
1078 for (i=1; i<=L; ++i) | |
1079 { | |
1080 float pmax=0.0; | |
1081 int amax=0; | |
1082 for (a=0; a<NAA; ++a) | |
1083 if (f[i][a]>pmax) {amax=a; pmax=f[i][a];} | |
1084 seq[k][i]=i2aa(amax); | |
1085 } | |
1086 } | |
1087 // printf("%i query name=%s seq=%s\n",n,sname[n],seq[n]); | |
1088 nfirst=k++; | |
1089 | |
1090 n_display=k; | |
1091 | |
1092 // Calculate overall Neff_HMM | |
1093 Neff_HMM=0; | |
1094 for (i=1; i<=L; ++i) | |
1095 { | |
1096 float S=0.0; | |
1097 for (a=0; a<iAlpha; ++a) { /* FIXME: FS introduced iAlpha, was '20' */ | |
1098 if (f[i][a]>1E-10) S-=f[i][a]*fast_log2(f[i][a]); | |
1099 } | |
1100 Neff_HMM+=(float) fpow2(S); | |
1101 } | |
1102 Neff_HMM/=L; | |
1103 for (i=0; i<=L; ++i) Neff_M[i] = Neff_I[i] = Neff_D[i] = 10.0; // to add only little additional pseudocounts! | |
1104 if (v>=2) | |
1105 cout<<"Read in HMM "<<name<<" with "<<L<<" match states and effective number of sequences = "<<Neff_HMM<<"\n"; | |
1106 | |
1107 // Set emission probabilities of zero'th (begin) state and L+1st (end) state to background probabilities | |
1108 for (a=0; a<iAlpha; ++a) { /* FIXME: FS introduced iAlpha, was '20' */ | |
1109 f[0][a]=f[L+1][a]=pb[a]; | |
1110 } | |
1111 delete[] annotchr; annotchr = NULL; | |
1112 | |
1113 return 1; //return status: ok | |
1114 | |
1115 } /* this is the end of HMM::ReadHMMer() */ | |
1116 | |
1117 | |
1118 | |
1119 ///////////////////////////////////////////////////////////////////////////////////// | |
1120 /** | |
1121 * @brief Read an HMM from a HMMER3 .hmm file; return 0 at end of file | |
1122 */ | |
1123 int | |
1124 HMM::ReadHMMer3(FILE* dbf, char* filestr) | |
1125 { | |
1126 char line[LINELEN]=""; // input line | |
1127 char desc[DESCLEN]=""; // description of family | |
1128 char str4[5]=""; // first 4 letters of input line | |
1129 char* ptr; // pointer for string manipulation | |
1130 int i=0; // index for match state (first=1) | |
1131 int a; // amino acid index | |
1132 char dssp=0; // 1 if a consensus SS has been found in the transition prob lines | |
1133 char annot=0; // 1 if at least one annotation character in insert lines is ne '-' or ' ' | |
1134 int k=0; // index for seq[k] | |
1135 char* annotchr; // consensus amino acids in ASCII format, or, in HMMER format, the reference annotation character in insert line | |
1136 //annotchr = new char[MAXRES]; // consensus amino acids in ASCII format, or, in HMMER format, the reference annotation character in insert line | |
1137 annotchr = new char[par.maxResLen]; // consensus amino acids in ASCII format, or, in HMMER format, the reference annotation character in insert line | |
1138 static int warn=0; | |
1139 int iAlpha = 20; /* size of alphabet, default is protein = 20 */ | |
1140 double dAlphaInv = 1.00 / (double)(iAlpha); /* weight of AA */ | |
1141 | |
1142 trans_lin=0; | |
1143 L=0; | |
1144 Neff_HMM=0; | |
1145 n_seqs=n_display=N_in=N_filtered=0; | |
1146 nss_dssp=nsa_dssp=nss_pred=nss_conf=nfirst=ncons=-1; | |
1147 lamda=mu=0.0; | |
1148 trans_lin=0; // transition probs in log space | |
1149 name[0]=longname[0]=desc[0]=fam[0]='\0'; | |
1150 //If at the end of while-loop L is still 0 then we have reached end of db file | |
1151 | |
1152 // Do not delete name and seq vectors because their adresses are transferred to hitlist as part of a hit!! | |
1153 | |
1154 | |
1155 while (fgetline(line,LINELEN-1,dbf) && !(line[0]=='/' && line[1]=='/')) | |
1156 { | |
1157 | |
1158 if (strscn(line)==NULL) continue; // skip lines that contain only white space | |
1159 if (!strncmp("HMMER",line,5)) continue; | |
1160 | |
1161 substr(str4,line,0,3); // copy the first four characters into str4 | |
1162 | |
1163 if (!strcmp("NAME",str4) && name[0]=='\0') | |
1164 { | |
1165 ptr=strscn(line+4); // advance to first non-white-space character | |
1166 strncpy(name,ptr,NAMELEN-1); // copy full name to name | |
1167 strcut(name); // ...cut after first word... | |
1168 if (v>=4) cout<<"Reading in HMM "<<name<<":\n"; | |
1169 } | |
1170 | |
1171 else if (!strcmp("ACC ",str4)) | |
1172 { | |
1173 ptr=strscn(line+4); // advance to first non-white-space character | |
1174 strncpy(longname,ptr,DESCLEN-1); // copy Accession id to longname... | |
1175 } | |
1176 | |
1177 else if (!strcmp("DESC",str4)) | |
1178 { | |
1179 ptr=strscn(line+4); // advance to first non-white-space character | |
1180 if (ptr) | |
1181 { | |
1182 strncpy(desc,ptr,DESCLEN-1); // copy description to name... | |
1183 desc[DESCLEN-1]='\0'; | |
1184 strcut(ptr); // ...cut after first word... | |
1185 } | |
1186 if (!ptr || ptr[1]!='.' || strchr(ptr+3,'.')==NULL) strcpy(fam,""); else strcpy(fam,ptr); // could not find two '.' in name? | |
1187 } | |
1188 | |
1189 else if (!strcmp("LENG",str4)) | |
1190 { | |
1191 ptr=line+4; | |
1192 L=strint(ptr); //read next integer (number of match states) | |
1193 } | |
1194 | |
1195 else if (!strcmp("ALPH",str4)) { | |
1196 | |
1197 ptr=strscn(line+4); | |
1198 | |
1199 if (0 == strcmp(ptr, "amino")){ | |
1200 iAlpha = 20; | |
1201 } | |
1202 else if (0 == strcmp(ptr, "Nucleic")){ | |
1203 iAlpha = 4; | |
1204 printf("%s:%s:%d: WARNING: HMM reading does not work for DNA/RNA\n", | |
1205 __FUNCTION__, __FILE__, __LINE__); | |
1206 } | |
1207 else { | |
1208 return Warning(dbf,line,name); | |
1209 } | |
1210 dAlphaInv = 1.00 / (double)(iAlpha); | |
1211 //continue; | |
1212 } | |
1213 else if (!strcmp("RF ",str4)) continue; | |
1214 else if (!strcmp("CS ",str4)) continue; | |
1215 else if (!strcmp("MAP ",str4)) continue; | |
1216 else if (!strcmp("COM ",str4)) continue; | |
1217 else if (!strcmp("NSEQ",str4)) | |
1218 { | |
1219 ptr=line+4; | |
1220 N_in=N_filtered=strint(ptr); //read next integer: number of sequences after filtering | |
1221 } | |
1222 | |
1223 else if (!strcmp("DATE",str4)) continue; | |
1224 else if (!strncmp("CKSUM ",line,5)) continue; | |
1225 else if (!strcmp("GA ",str4)) continue; | |
1226 else if (!strcmp("TC ",str4)) continue; | |
1227 else if (!strcmp("NC ",str4)) continue; | |
1228 | |
1229 ////////////////////////////////////////////////////////////////////////////////////////////////////// | |
1230 // Still needed??? | |
1231 | |
1232 else if (!strncmp("SADSS",line,5)) | |
1233 { | |
1234 if (nsa_dssp<0) | |
1235 { | |
1236 nsa_dssp=k++; | |
1237 seq[nsa_dssp] = new(char[/*MAXRES*/par.maxResLen+2]); | |
1238 sname[nsa_dssp] = new(char[15]); | |
1239 strcpy(seq[nsa_dssp]," "); | |
1240 strcpy(sname[nsa_dssp],"sa_dssp"); | |
1241 | |
1242 } | |
1243 ptr=strscn(line+5); | |
1244 if (ptr) | |
1245 { | |
1246 strcut(ptr); | |
1247 if (strlen(seq[nsa_dssp])+strlen(ptr)>=(unsigned)(/*MAXRES*/par.maxResLen)) | |
1248 printf("\nWARNING: HMM %s has SADSS records with more than %i residues.\n",name,/*MAXRES*/par.maxResLen); | |
1249 else strcat(seq[nsa_dssp],ptr); | |
1250 } | |
1251 } | |
1252 | |
1253 else if (!strncmp("SSPRD",line,5)) | |
1254 { | |
1255 if (nss_pred<0) | |
1256 { | |
1257 nss_pred=k++; | |
1258 seq[nss_pred] = new(char[/*MAXRES*/par.maxResLen+2]); | |
1259 sname[nss_pred] = new(char[15]); | |
1260 strcpy(seq[nss_pred]," "); | |
1261 strcpy(sname[nss_pred],"ss_pred"); | |
1262 | |
1263 } | |
1264 ptr=strscn(line+5); | |
1265 if (ptr) | |
1266 { | |
1267 strcut(ptr); | |
1268 if (strlen(seq[nss_pred])+strlen(ptr)>=(unsigned)(/*MAXRES*/par.maxResLen)) | |
1269 printf("\nWARNING: HMM %s has SSPRD records with more than %i residues.\n",name,/*MAXRES*/par.maxResLen); | |
1270 else strcat(seq[nss_pred],ptr); | |
1271 } | |
1272 } | |
1273 | |
1274 else if (!strncmp("SSCON",line,5)) | |
1275 { | |
1276 if (nss_conf<0) | |
1277 { | |
1278 nss_conf=k++; | |
1279 seq[nss_conf] = new(char[/*MAXRES*/par.maxResLen+2]); | |
1280 sname[nss_conf] = new(char[15]); | |
1281 strcpy(seq[nss_conf]," "); | |
1282 strcpy(sname[nss_conf],"ss_conf"); | |
1283 } | |
1284 ptr=strscn(line+5); | |
1285 if (ptr) | |
1286 { | |
1287 strcut(ptr); | |
1288 if (strlen(seq[nss_conf])+strlen(ptr)>=(unsigned)(/*MAXRES*/par.maxResLen)) | |
1289 printf("\nWARNING: HMM %s has SSPRD records with more than %i residues.\n",name,/*MAXRES*/par.maxResLen); | |
1290 else strcat(seq[nss_conf],ptr); | |
1291 } | |
1292 } | |
1293 | |
1294 else if (!strncmp("SSCIT",line,5)) continue; | |
1295 else if (!strcmp("XT ",str4)) continue; | |
1296 ////////////////////////////////////////////////////////////////////////////////////////////////////// | |
1297 else if (!strncmp("STATS LOCAL",line,11)) continue; | |
1298 | |
1299 else if (!strcmp("EFFN",str4)) | |
1300 { | |
1301 ptr=line+4; | |
1302 float effn = strflt(ptr); | |
1303 // Calculate Neff_HMM by using f(x) = ax^0.1 + bx^0.5 + cx + d (fitted with scop25 dataset) | |
1304 Neff_HMM = -1.403534 * pow(effn, 0.1) + 4.428118 * pow(effn, 0.5) - 0.2885410 * effn - 1.108568; | |
1305 } | |
1306 | |
1307 ///////////////////////////////////////////////////////////////////////////////////// | |
1308 // Read transition probabilities from start state | |
1309 else if (!strncmp("HMM",line,3)) | |
1310 { | |
1311 fgetline(line,LINELEN-1,dbf); // Skip line with amino acid labels | |
1312 fgetline(line,LINELEN-1,dbf); // Skip line with transition labels | |
1313 ptr=strscn(line); | |
1314 | |
1315 if (!strncmp("COMPO",ptr,5)) | |
1316 { | |
1317 ptr=ptr+5; | |
1318 for (a=0; a<20 && ptr; ++a) | |
1319 //s2a[a]: transform amino acids Sorted by alphabet -> internal numbers for amino acids | |
1320 pb[s2a[a]] = (float) exp(-1.0*strflta(ptr,99999)); | |
1321 if (!ptr) return Warning(dbf,line,name); | |
1322 if (v>=4) | |
1323 { | |
1324 printf("\nNULL "); | |
1325 for (a=0; a<20; ++a) printf("%6.3g ",100.*pb[s2a[a]]); | |
1326 printf("\n"); | |
1327 } | |
1328 fgetline(line,LINELEN-1,dbf); // Read next line | |
1329 } | |
1330 | |
1331 fgetline(line,LINELEN-1,dbf); // Skip line with 0-states insert probabilities | |
1332 | |
1333 ptr = strscn(line); | |
1334 for (a=0; a<=D2D && ptr; ++a) | |
1335 tr[0][a] = log2((float) exp(-1.0*strflta(ptr,99999))); //store transition probabilites as log2 values | |
1336 // strinta returns next integer in string and puts ptr to first char | |
1337 // after the integer. Returns -99999 if '*' is found. | |
1338 // ptr is set to 0 if no integer is found after ptr. | |
1339 if (!ptr) return Warning(dbf,line,name); | |
1340 if (v>=4) | |
1341 { | |
1342 printf(" "); | |
1343 for (a=0; a<=D2D && ptr; ++a) printf("%6.3g ",100*fpow2(tr[i][a])); | |
1344 printf("\n"); | |
1345 } | |
1346 | |
1347 // Prepare to store DSSP states (if there are none, delete afterwards) | |
1348 nss_dssp=k++; | |
1349 seq[nss_dssp] = new(char[/*MAXRES*/par.maxResLen+2]); | |
1350 sname[nss_dssp] = new(char[15]); | |
1351 strcpy(sname[nss_dssp],"ss_dssp"); | |
1352 | |
1353 ///////////////////////////////////////////////////////////////////////////////////// | |
1354 // Read columns of HMM | |
1355 int next_i=0; // index of next column | |
1356 while (fgetline(line,LINELEN-1,dbf) && !(line[0]=='/' && line[1]=='/') && line[0]!='#') | |
1357 { | |
1358 if (strscn(line)==NULL) continue; // skip lines that contain only white space | |
1359 | |
1360 // Read in AA probabilities | |
1361 ptr=line; | |
1362 int prev_i = next_i; | |
1363 next_i = strint(ptr); ++i; | |
1364 if (v && next_i!=prev_i+1) | |
1365 if (++warn<5) | |
1366 { | |
1367 cerr<<endl<<"WARNING: in HMM "<<name<<" state "<<prev_i<<" is followed by state "<<next_i<<"\n"; | |
1368 if (warn==5) cerr<<endl<<"WARNING: further warnings while reading HMMs will be suppressed.\n"; | |
1369 } | |
1370 if (i>L) | |
1371 { | |
1372 cerr<<endl<<"Error: in HMM "<<name<<" there are more columns than the stated length "<<L<<"\n"; | |
1373 return 2; | |
1374 } | |
1375 if (i>L && v) | |
1376 cerr<<endl<<"WARNING: in HMM "<<name<<" there are more columns than the stated length "<<L<<"\n"; | |
1377 if (i>=/*MAXRES*/par.maxResLen-2) | |
1378 { | |
1379 fgetline(line,LINELEN-1,dbf); // Skip two lines | |
1380 fgetline(line,LINELEN-1,dbf); | |
1381 continue; | |
1382 } | |
1383 | |
1384 for (a=0; a<iAlpha && ptr; ++a){ /* FIXME: FS introduced iAlpha, was '20' */ | |
1385 f[i][s2a[a]] = (float) exp(-1.0*strflta(ptr,99999)); | |
1386 //s2a[a]: transform amino acids Sorted by alphabet -> internal numbers for amino acids | |
1387 } | |
1388 for (a = iAlpha; a < 20; a++){ | |
1389 f[i][s2a[a]] = 0.0; | |
1390 } | |
1391 if (!ptr) return Warning(dbf,line,name); | |
1392 if (v>=4) | |
1393 { | |
1394 printf("%6i ",i); | |
1395 for (a=0; a<iAlpha; ++a) printf("%6.3g ",100*f[i][s2a[a]]); | |
1396 printf("\n"); | |
1397 } | |
1398 | |
1399 // Ignore MAP annotation | |
1400 ptr = strscn(line); //find next word | |
1401 ptr = strscn_ws(line); // ignore word | |
1402 | |
1403 // Read RF and CS annotation | |
1404 ptr = strscn(line); | |
1405 if (!ptr) return Warning(dbf,line,name); | |
1406 annotchr[i]=uprchr(*ptr); | |
1407 if (*ptr!='-' && *ptr!=' ') annot=1; | |
1408 | |
1409 ptr = strscn(line); | |
1410 switch (*ptr) | |
1411 { | |
1412 case 'H': | |
1413 ss_dssp[i]=1; | |
1414 seq[nss_dssp][i]=*ptr; | |
1415 dssp=1; | |
1416 break; | |
1417 case 'E': | |
1418 ss_dssp[i]=2; | |
1419 seq[nss_dssp][i]=*ptr; | |
1420 dssp=1; | |
1421 break; | |
1422 case 'C': | |
1423 ss_dssp[i]=3; | |
1424 seq[nss_dssp][i]=*ptr; | |
1425 dssp=1; | |
1426 break; | |
1427 case 'S': | |
1428 ss_dssp[i]=4; | |
1429 seq[nss_dssp][i]=*ptr; | |
1430 dssp=1; | |
1431 break; | |
1432 case 'T': | |
1433 ss_dssp[i]=5; | |
1434 seq[nss_dssp][i]=*ptr; | |
1435 dssp=1; | |
1436 break; | |
1437 case 'G': | |
1438 ss_dssp[i]=6; | |
1439 seq[nss_dssp][i]=*ptr; | |
1440 dssp=1; | |
1441 break; | |
1442 case 'B': | |
1443 ss_dssp[i]=7; | |
1444 seq[nss_dssp][i]=*ptr; | |
1445 dssp=1; | |
1446 break; | |
1447 case 'I': | |
1448 dssp=1; | |
1449 case '~': | |
1450 ss_dssp[i]=3; | |
1451 seq[nss_dssp][i]=*ptr; | |
1452 break; | |
1453 case '-': // no SS available from any template | |
1454 case '.': // no clear consensus SS structure | |
1455 case 'X': // no clear consensus SS structure | |
1456 ss_dssp[i]=0; | |
1457 seq[nss_dssp][i]='-'; | |
1458 break; | |
1459 default: | |
1460 ss_dssp[i]=0; | |
1461 seq[nss_dssp][i]=*ptr; | |
1462 break; | |
1463 } | |
1464 | |
1465 // Read insert emission line | |
1466 fgetline(line,LINELEN-1,dbf); | |
1467 | |
1468 // Read seven transition probabilities | |
1469 fgetline(line,LINELEN-1,dbf); | |
1470 | |
1471 ptr+=2; | |
1472 for (a=0; a<=D2D && ptr; ++a) | |
1473 tr[i][a] = log2((float) exp(-1.0*strflta(ptr,99999))); //store transition prob's as log2-values | |
1474 if (!ptr) return Warning(dbf,line,name); | |
1475 if (v>=4) | |
1476 { | |
1477 printf(" "); | |
1478 for (a=0; a<=D2D; ++a) printf("%6.3g ",100*fpow2(tr[i][a])); | |
1479 printf("\n"); | |
1480 } | |
1481 } | |
1482 | |
1483 if (line[0]=='/' && line[1]=='/') break; | |
1484 | |
1485 } /* strncmp("HMM") */ | |
1486 | |
1487 } //while(getline) | |
1488 | |
1489 if (L==0) return 0; //End of db file -> stop reading in | |
1490 | |
1491 if (v && i!=L) cerr<<endl<<"Warning: in HMM "<<name<<" there are only "<<i<<" columns while the stated length is "<<L<<"\n"; | |
1492 if (v && i>=/*MAXRES*/par.maxResLen-2) {i=/*MAXRES*/par.maxResLen-2; cerr<<endl<<"WARNING: maximum number "<</*MAXRES*/par.maxResLen-2<<" of residues exceeded while reading HMM "<<name<<"\n";} | |
1493 if (v && !i) cerr<<endl<<"WARNING: HMM "<<name<<" contains no match states. Check the alignment that gave rise to this HMM.\n"; | |
1494 L = i; | |
1495 | |
1496 if (strlen(longname)>0) strcat(longname," "); | |
1497 strncat(longname,name,DESCLEN-strlen(longname)-1); // longname = ACC NAME DESC | |
1498 if (strlen(name)>0) strcat(longname," "); | |
1499 strncat(longname,desc,DESCLEN-strlen(longname)-1); | |
1500 longname[DESCLEN-1]='\0'; | |
1501 ScopID(cl,fold,sfam,fam);// get scop classification from basename (e.g. a.1.2.3.4) | |
1502 RemoveExtension(file,filestr); // copy name of dbfile without extension into 'file' | |
1503 | |
1504 // Secondary structure | |
1505 if (!dssp) | |
1506 { | |
1507 // remove dssp sequence | |
1508 delete[] seq[nss_dssp]; // memory that had been allocated in case ss_dssp was given needs to be freed | |
1509 delete[] sname[nss_dssp]; // memory that had been allocated in case ss_dssp was given needs to be freed | |
1510 nss_dssp=-1; | |
1511 k--; | |
1512 } | |
1513 else { seq[nss_dssp][0]='-'; seq[nss_dssp][L+1]='\0'; } | |
1514 | |
1515 if (nss_pred>=0) | |
1516 { | |
1517 for (i=1; i<=L; ++i) ss_pred[i] = ss2i(seq[nss_pred][i]); | |
1518 if (nss_conf>=0) | |
1519 for (i=1; i<=L; ++i) ss_conf[i] = cf2i(seq[nss_conf][i]); | |
1520 else | |
1521 for (i=1; i<=L; ++i) ss_conf[i] = 5; | |
1522 } | |
1523 | |
1524 // Copy query (first sequence) and consensus residues? | |
1525 if (par.showcons) | |
1526 { | |
1527 sname[k]=new(char[10]); | |
1528 strcpy(sname[k],"Consensus"); | |
1529 sname[k+1]=new(char[strlen(longname)+1]); | |
1530 strcpy(sname[k+1],longname); | |
1531 seq[k]=new(char[L+2]); | |
1532 seq[k][0]=' '; | |
1533 seq[k][L+1]='\0'; | |
1534 seq[k+1]=new(char[L+2]); | |
1535 seq[k+1][0]=' '; | |
1536 seq[k+1][L+1]='\0'; | |
1537 for (i=1; i<=L; ++i) | |
1538 { | |
1539 float pmax=0.0; | |
1540 int amax=0; | |
1541 for (a=0; a<NAA; ++a) | |
1542 if (f[i][a]>pmax) {amax=a; pmax=f[i][a];} | |
1543 if (pmax>0.6) seq[k][i]=i2aa(amax); | |
1544 else if (pmax>0.4) seq[k][i]=lwrchr(i2aa(amax)); | |
1545 else seq[k][i]='x'; | |
1546 seq[k+1][i]=i2aa(amax); | |
1547 } | |
1548 ncons=k++; // nfirst is set later! | |
1549 } | |
1550 else | |
1551 { | |
1552 sname[k]=new(char[strlen(longname)+1]); | |
1553 strcpy(sname[k],longname); | |
1554 seq[k]=new(char[L+2]); | |
1555 seq[k][0]=' '; | |
1556 seq[k][L+1]='\0'; | |
1557 } | |
1558 | |
1559 if (annot) // read in some annotation characters? | |
1560 { | |
1561 annotchr[0]=' '; | |
1562 annotchr[L+1]='\0'; | |
1563 strcpy(seq[k],annotchr); // overwrite the consensus sequence with the annotation characters | |
1564 } | |
1565 else if (!par.showcons) // we have not yet calculated the consensus, but we need it now as query (first sequence) | |
1566 { | |
1567 for (i=1; i<=L; ++i) | |
1568 { | |
1569 float pmax=0.0; | |
1570 int amax=0; | |
1571 for (a=0; a<NAA; ++a) | |
1572 if (f[i][a]>pmax) {amax=a; pmax=f[i][a];} | |
1573 seq[k][i]=i2aa(amax); | |
1574 } | |
1575 } | |
1576 // printf("%i query name=%s seq=%s\n",n,sname[n],seq[n]); | |
1577 nfirst=k++; | |
1578 | |
1579 n_display=k; | |
1580 n_seqs=k; | |
1581 | |
1582 // If no effektive number of sequences is given, calculate Neff_HMM by given profile | |
1583 if (Neff_HMM == 0) { | |
1584 for (i=1; i<=L; ++i) | |
1585 { | |
1586 float S=0.0; | |
1587 for (a=0; a<20; ++a) | |
1588 if (f[i][a]>1E-10) S-=f[i][a]*fast_log2(f[i][a]); | |
1589 Neff_HMM+=(float) fpow2(S); | |
1590 } | |
1591 Neff_HMM/=L; | |
1592 } | |
1593 | |
1594 for (i=0; i<=L; ++i) Neff_M[i] = Neff_I[i] = Neff_D[i] = 10.0; // to add only little additional pseudocounts! | |
1595 Neff_M[L+1]=1.0f; | |
1596 Neff_I[L+1]=Neff_D[L+1]=0.0f; | |
1597 | |
1598 if (v>=2) | |
1599 cout<<"Read in HMM "<<name<<" with "<<L<<" match states and effective number of sequences = "<<Neff_HMM<<"\n"; | |
1600 | |
1601 /////////////////////////////////////////////////////////////////// | |
1602 | |
1603 // Set emission probabilities of zero'th (begin) state and L+1st (end) state to background probabilities | |
1604 for (a=0; a<20; ++a) f[0][a]=f[L+1][a]=pb[a]; | |
1605 delete[] annotchr; | |
1606 | |
1607 has_pseudocounts=true; | |
1608 | |
1609 return 1; //return status: ok | |
1610 | |
1611 | |
1612 } /* this is the end of HMM::ReadHMMer3() */ | |
1613 | |
1614 | |
1615 ////////////////////////////////////////////////////////////////////////////// | |
1616 /** | |
1617 * @brief Add transition pseudocounts to HMM (and calculate lin-space transition probs) | |
1618 */ | |
1619 void | |
1620 HMM::AddTransitionPseudocounts(float gapd, float gape, float gapf, float gapg, float gaph, float gapi, float gapb) | |
1621 { | |
1622 int i; //position in alignment | |
1623 float sum; | |
1624 float pM2M, pM2I, pM2D, pI2I, pI2M, pD2D, pD2M; | |
1625 float p0,p1,p2; | |
1626 if (par.gapb<=0) return; | |
1627 if (trans_lin==1) {fprintf(stderr,"Error: Adding transition pseudocounts to linear representation of %s not allowed. Please report this error to the HHsearch developers.\n",name); exit(6);} | |
1628 if (trans_lin==2) {fprintf(stderr,"Error: Adding transition pseudocounts twice is %s not allowed. Please report this error to the HHsearch developers.\n",name); exit(6);} | |
1629 trans_lin=2; | |
1630 | |
1631 // Calculate pseudocount transition probabilities | |
1632 pM2D=pM2I=gapd*0.0286; //a-priori probability for inserts and deletions | |
1633 pM2M=1-pM2D-pM2I; | |
1634 // gape=0 -> pI2I=0 gape=1 -> pI2I=0.75 gape=inf -> pI2I=1. | |
1635 pI2I=1.0*gape/(gape-1+1.0/0.75); | |
1636 pI2M=1-pI2I; | |
1637 // gape=0 -> pD2D=0 gape=1 -> pD2D=0.75 gape=inf -> pD2D=1. | |
1638 pD2D=1.0*gape/(gape-1+1.0/0.75); | |
1639 pD2M=1-pD2D; | |
1640 | |
1641 for (i=0; i<=L; ++i) //for all columns in HMM | |
1642 { | |
1643 // Transitions from M state | |
1644 p0 = (Neff_M[i]-1)*fpow2(tr[i][M2M]) + gapb*pM2M; | |
1645 p1 = (Neff_M[i]-1)*fpow2(tr[i][M2D]) + gapb*pM2D; | |
1646 p2 = (Neff_M[i]-1)*fpow2(tr[i][M2I]) + gapb*pM2I; | |
1647 if (i==0) p1=p2=0; //from M(0) no transition to D(1) and I(0) possible | |
1648 if (i==L) p1=p2=0; //from M(L) no transition to D(L+1) and I(L+1) possible | |
1649 sum = p0+p1+p2+FLT_MIN; | |
1650 | |
1651 // p0 = p0/sum ; | |
1652 // p1 = pow(p1/sum,gapf); | |
1653 // p2 = pow(p2/sum,gapg); | |
1654 // sum = p0+p1+p2+FLT_MIN; | |
1655 // tr[i][M2M] = fast_log2(p0/sum); | |
1656 // tr[i][M2D] = fast_log2(p1/sum); | |
1657 // tr[i][M2I] = fast_log2(p2/sum); | |
1658 | |
1659 tr[i][M2M] = fast_log2(p0/sum); | |
1660 tr[i][M2D] = fast_log2(p1/sum)*gapf; | |
1661 tr[i][M2I] = fast_log2(p2/sum)*gapg; | |
1662 | |
1663 // Transitions from I state | |
1664 p0 = Neff_I[i]*fpow2(tr[i][I2M]) + gapb*pI2M; | |
1665 p1 = Neff_I[i]*fpow2(tr[i][I2I]) + gapb*pI2I; | |
1666 sum = p0+p1+FLT_MIN; | |
1667 | |
1668 // p0 = pow(p0/sum,gapg); | |
1669 // p1 = pow(p1/sum,gapi); | |
1670 // sum = p0+p1+FLT_MIN; | |
1671 // tr[i][I2M] = fast_log2(p0/sum); | |
1672 // tr[i][I2I] = fast_log2(p1/sum); | |
1673 | |
1674 tr[i][I2M] = fast_log2(p0/sum); | |
1675 tr[i][I2I] = fast_log2(p1/sum)*gapi; | |
1676 | |
1677 // Transitions from D state | |
1678 p0 = Neff_D[i]*fpow2(tr[i][D2M]) + gapb*pD2M; | |
1679 p1 = Neff_D[i]*fpow2(tr[i][D2D]) + gapb*pD2D; | |
1680 if (i==L) p1=0; //from D(L) no transition to D(L+1) possible | |
1681 sum = p0+p1+FLT_MIN; | |
1682 | |
1683 // p0 = pow(p0/sum,gapf); | |
1684 // p1 = pow(p1/sum,gaph); | |
1685 // sum = p0+p1+FLT_MIN; | |
1686 // tr[i][D2M] = fast_log2(p0/sum); | |
1687 // tr[i][D2D] = fast_log2(p1/sum); | |
1688 | |
1689 tr[i][D2M] = fast_log2(p0/sum); | |
1690 tr[i][D2D] = fast_log2(p1/sum)*gaph; | |
1691 | |
1692 // SS-dependent gap penalties | |
1693 tr[i][M2M_GAPOPEN]=tr[i][M2M]; | |
1694 tr[i][GAPOPEN]=0.0; | |
1695 tr[i][GAPEXTD]=0.0; | |
1696 } | |
1697 | |
1698 if (v>=4) | |
1699 { | |
1700 printf("\nPseudocount transition probabilities:\n"); | |
1701 printf("pM2M=%4.1f%%, pM2I=%4.1f%%, pM2D=%4.1f%%, ",100*pM2M,100*pM2I,100*pM2D); | |
1702 printf("pI2M=%4.1f%%, pI2I=%4.1f%%, ",100*pI2M,100*pI2I); | |
1703 printf("pD2M=%4.1f%%, pD2D=%4.1f%% ",100*pD2M,100*pD2D); | |
1704 printf("tau = %4.1f%%\n\n",100.*gapb/(Neff_HMM-1+gapb)); | |
1705 printf("Listing transition probabilities WITH pseudocounts:\n"); | |
1706 printf(" i dssp pred sacc M->M M->I M->D I->M I->I D->M D->D\n"); | |
1707 | |
1708 for (i=1; i<=L; ++i) //for all columns in HMM | |
1709 { | |
1710 printf("%4i %1c %1c %1c %6.3f %6.3f %6.3f ",i,i2ss(ss_dssp[i]),i2ss(ss_pred[i]),i2sa(sa_dssp[i]),fpow2(tr[i][M2M]),fpow2(tr[i][M2I]),fpow2(tr[i][M2D])); | |
1711 printf("%6.3f %6.3f ",fpow2(tr[i][I2M]),fpow2(tr[i][I2I])); | |
1712 printf("%6.3f %6.3f ",fpow2(tr[i][D2M]),fpow2(tr[i][D2D])); | |
1713 printf("%1i %2i %1i\n",ss_pred[i],ss_conf[i],ss_dssp[i]); | |
1714 } | |
1715 printf("\n"); | |
1716 printf("nss_dssp=%i nss_pred=%i\n",nss_dssp,nss_pred); | |
1717 } | |
1718 return; | |
1719 } | |
1720 | |
1721 | |
1722 ////////////////////////////////////////////////////////////////////////////// | |
1723 /** | |
1724 * @brief Use secondary structure-dependent gap penalties | |
1725 * on top of the HMM transition penalties | |
1726 */ | |
1727 void | |
1728 HMM::UseSecStrucDependentGapPenalties() | |
1729 { | |
1730 int i; // column in HMM | |
1731 int ii; | |
1732 //unsigned char iis[MAXRES]; // inside-integer array | |
1733 unsigned char iis[par.maxResLen]; // inside-integer array | |
1734 float d; // Additional penalty for opening gap whithin SS element | |
1735 float e; // Additional penalty for extending gap whithin SS element | |
1736 | |
1737 // Determine inside-integers: | |
1738 // CCSTCCCHHHHHHHHHHHCCCCCEEEEECCSBGGGCCCCEECC | |
1739 // 0000000123444432100000012210000000000001000 | |
1740 ii=0; | |
1741 for (i=0; i<=L; ++i) // forward run | |
1742 { | |
1743 if (ss_dssp[i]==1 || ss_dssp[i]==2) {ii+=(ii<par.ssgapi);} else ii=0; | |
1744 iis[i]=ii; | |
1745 } for (i=0; i<=L; ++i) | |
1746 ii=0; | |
1747 iis[0]=iis[L]=0; | |
1748 for (i=L; i>=0; i--) // backward run | |
1749 { | |
1750 if (ss_dssp[i]==1 || ss_dssp[i]==2) {ii+=(ii<par.ssgapi);} else ii=0; | |
1751 iis[i-1]=imin(ii,iis[i-1]); | |
1752 } | |
1753 | |
1754 // Add SS-dependent gap penalties to HMM transition penalties | |
1755 for (i=0; i<=L; ++i) //for all columns in HMM | |
1756 { | |
1757 d=-iis[i]*par.ssgapd; | |
1758 e=-iis[i]*par.ssgape; | |
1759 tr[i][GAPOPEN]=d; | |
1760 tr[i][GAPEXTD]=e; | |
1761 tr[i][M2M_GAPOPEN]+=d; | |
1762 tr[i][M2I]+=d; | |
1763 tr[i][I2M]+=d; | |
1764 tr[i][I2I]+=e; | |
1765 tr[i][M2D]+=d; | |
1766 tr[i][D2M]+=d; | |
1767 tr[i][D2D]+=e; | |
1768 } | |
1769 | |
1770 if (v>=3) | |
1771 { | |
1772 printf("Col SS II\n"); | |
1773 for (i=0; i<=L; ++i) printf("%3i %c %2i\n",i,i2ss(ss_dssp[i]),iis[i]); | |
1774 } | |
1775 return; | |
1776 } | |
1777 | |
1778 ///////////////////////////////////////////////////////////////////////////////////// | |
1779 /** | |
1780 * @brief Generate an amino acid frequency matrix g[][] with full pseudocount admixture (tau=1) | |
1781 */ | |
1782 void | |
1783 HMM::PreparePseudocounts() | |
1784 { | |
1785 for (int i=0; i<=L+1; ++i) | |
1786 for (int a=0; a<20; ++a) | |
1787 g[i][a] = // produces fast code | |
1788 R[a][0]*f[i][0] +R[a][1]*f[i][1] +R[a][2]*f[i][2] +R[a][3]*f[i][3] +R[a][4]*f[i][4] | |
1789 +R[a][5]*f[i][5] +R[a][6]*f[i][6] +R[a][7]*f[i][7] +R[a][8]*f[i][8] +R[a][9]*f[i][9] | |
1790 +R[a][10]*f[i][10]+R[a][11]*f[i][11]+R[a][12]*f[i][12]+R[a][13]*f[i][13]+R[a][14]*f[i][14] | |
1791 +R[a][15]*f[i][15]+R[a][16]*f[i][16]+R[a][17]*f[i][17]+R[a][18]*f[i][18]+R[a][19]*f[i][19]; | |
1792 } | |
1793 | |
1794 ///////////////////////////////////////////////////////////////////////////////////// | |
1795 /** | |
1796 * @brief Add amino acid pseudocounts to HMM and calculate average protein aa probabilities pav[a] | |
1797 * Pseudocounts: t.p[i][a] = (1-tau)*f[i][a] + tau*g[i][a] | |
1798 */ | |
1799 void | |
1800 HMM::AddAminoAcidPseudocounts(char pcm, float pca, float pcb, float pcc) | |
1801 { | |
1802 int i; //position in HMM | |
1803 int a; //amino acid (0..19) | |
1804 float sum; | |
1805 float tau; //tau = pseudocount admixture | |
1806 | |
1807 for (a=0; a<20; ++a) pav[a]=pb[a]*100.0f/Neff_HMM; // initialize vector of average aa freqs with pseudocounts | |
1808 | |
1809 // Calculate amino acid frequencies p[i][a] = (1-tau(i))*f[i][a] + tau(i)*g[i][a] | |
1810 switch (pcm) | |
1811 { | |
1812 case 0: //no pseudocounts whatsoever: tau=0 | |
1813 for (i=1; i<=L; ++i) | |
1814 for (a=0; a<20; ++a) | |
1815 pav[a] += ( p[i][a]=f[i][a] ); | |
1816 break; | |
1817 case 1: //constant pseudocounts (for optimization): tau = pca | |
1818 tau = pca; | |
1819 for (i=1; i<=L; ++i) | |
1820 for (a=0; a<20; ++a) | |
1821 pav[a] += ( p[i][a] = (1.-tau)*f[i][a] + tau * g[i][a] ); | |
1822 break; | |
1823 case 2: //divergence-dependent pseudocounts | |
1824 case 4: //divergence-dependent pseudocounts and rate matrix rescaling | |
1825 if (par.pcc==1.0f) | |
1826 for (i=1; i<=L; ++i) | |
1827 { | |
1828 tau = fmin(1.0, pca/(1. + Neff_M[i]/pcb ) ); | |
1829 for (a=0; a<20; ++a) | |
1830 pav[a] += ( p[i][a] = (1.-tau)*f[i][a] + tau * g[i][a] ); | |
1831 } | |
1832 else | |
1833 for (i=1; i<=L; ++i) | |
1834 { | |
1835 tau = fmin(1.0, pca/(1. + pow((Neff_M[i])/pcb,pcc))); | |
1836 for (a=0; a<20; ++a) | |
1837 pav[a] += ( p[i][a] = (1.-tau)*f[i][a] + tau * g[i][a] ); | |
1838 } | |
1839 break; | |
1840 case 3: // constant-divergence pseudocounts | |
1841 for (i=1; i<=L; ++i) | |
1842 { | |
1843 float x = Neff_M[i]/pcb; | |
1844 pca = 0.793 + 0.048*(pcb-10.0); | |
1845 tau = fmax(0.0, pca*(1-x + pcc*x*(1-x)) ); | |
1846 for (a=0; a<20; ++a) | |
1847 pav[a] += ( p[i][a] = (1.-tau)*f[i][a] + tau * g[i][a] ); | |
1848 } | |
1849 if (v>=2) { printf("Divergence before / after addition of amino acid pseudocounts: %5.2f / %5.2f\n",Neff_HMM, CalcNeff()); } | |
1850 break; | |
1851 } //end switch (pcm) | |
1852 | |
1853 | |
1854 // Normalize vector of average aa frequencies pav[a] | |
1855 NormalizeTo1(pav,NAA); | |
1856 | |
1857 for (a=0; a<20; ++a) | |
1858 p[0][a] = p[L+1][a] = pav[a]; | |
1859 | |
1860 // DEBUGGING output | |
1861 if (v>=3) | |
1862 { | |
1863 switch (pcm) | |
1864 { | |
1865 case 0: | |
1866 cout<<"No pseudocounts added (-pcm 0)\n"; | |
1867 return; | |
1868 case 1: | |
1869 cout<<"Adding constant AA pseudocount admixture of "<<pca<<" to HMM "<<name<<"\n"; | |
1870 break; | |
1871 case 2: | |
1872 cout<<"Adding divergence-dependent AA pseudocounts (-pcm 2) with admixture of " | |
1873 <<pca/(1.+pow((Neff_HMM-1.)/pcb,pcc))<<" to HMM "<<name<<"\n"; | |
1874 break; | |
1875 } //end switch (pcm) | |
1876 cout<<"\nAverage amino acid frequencies WITH pseudocounts in HMM: \nProf: "; | |
1877 for (a=0; a<20; ++a) printf("%4.1f ",100*pav[a]); | |
1878 cout<<"\n"; | |
1879 if (v>=4) | |
1880 { | |
1881 cout<<"\nAmino acid frequencies WITHOUT pseudocounts:\n A R N D C Q E G H I L K M F P S T W Y V\n"; | |
1882 for (i=1; i<=L; ++i) | |
1883 { | |
1884 printf("%3i: ",i); | |
1885 sum=0; | |
1886 for (a=0; a<20; ++a) | |
1887 { | |
1888 sum+=f[i][a]; | |
1889 printf("%4.1f ",100*f[i][a]); | |
1890 } | |
1891 printf(" sum=%5.3f\n",sum); | |
1892 } | |
1893 cout<<"\nAmino acid frequencies WITH pseudocounts:\n A R N D C Q E G H I L K M F P S T W Y V\n"; | |
1894 for (i=1; i<=L; ++i) | |
1895 { | |
1896 printf("%3i: ",i); | |
1897 sum=0; | |
1898 for (a=0; a<20; ++a) | |
1899 { | |
1900 sum+=p[i][a]; | |
1901 printf("%4.1f ",100*p[i][a]); | |
1902 } | |
1903 printf(" sum=%5.3f\n",sum); | |
1904 } | |
1905 } | |
1906 } | |
1907 return; | |
1908 } | |
1909 | |
1910 | |
1911 ///////////////////////////////////////////////////////////////////////////////////// | |
1912 /** | |
1913 * @brief Factor Null model into HMM t | |
1914 */ | |
1915 void | |
1916 HMM::IncludeNullModelInHMM(HMM& q, HMM& t) | |
1917 { | |
1918 | |
1919 int i,j; //query and template match state indices | |
1920 int a; //amino acid index | |
1921 | |
1922 switch (par.columnscore) | |
1923 { | |
1924 default: | |
1925 case 0: // Null model with background prob. from database | |
1926 for (a=0; a<20; ++a) pnul[a]=pb[a]; | |
1927 break; | |
1928 | |
1929 case 1: // Null model with background prob. equal average from query and template | |
1930 for (a=0; a<20; ++a) pnul[a]=0.5*(q.pav[a]+t.pav[a]); | |
1931 break; | |
1932 | |
1933 case 2: // Null model with background prob. from template protein | |
1934 for (a=0; a<20; ++a) pnul[a]=t.pav[a]; | |
1935 break; | |
1936 | |
1937 case 3: // Null model with background prob. from query protein | |
1938 for (a=0; a<20; ++a) pnul[a]=q.pav[a]; | |
1939 break; | |
1940 | |
1941 case 4: // Null model with background prob. equal average from query and template | |
1942 for (a=0; a<20; ++a) pnul[a]=sqrt(q.pav[a]*t.pav[a]); | |
1943 break; | |
1944 | |
1945 case 10: // Separated column scoring for Stochastic Backtracing (STILL USED??) | |
1946 for (i=0; i<=q.L+1; ++i) | |
1947 { | |
1948 float sum = 0.0; | |
1949 for (a=0; a<20; ++a) sum += pb[a]*q.p[i][a]; | |
1950 sum = 1.0/sqrt(sum); | |
1951 for (a=0; a<20; ++a) q.p[i][a]*=sum; | |
1952 } | |
1953 for (j=0; j<=t.L+1; j++) | |
1954 { | |
1955 float sum = 0.0; | |
1956 for (a=0; a<20; ++a) sum += pb[a]*t.p[j][a]; | |
1957 sum = 1.0/sqrt(sum); | |
1958 for (a=0; a<20; ++a) t.p[j][a]*=sum; | |
1959 } | |
1960 break; | |
1961 | |
1962 case 11: // log co-emission probability (no null model) | |
1963 for (a=0; a<20; ++a) pnul[a]=0.05; | |
1964 break; | |
1965 | |
1966 } | |
1967 | |
1968 // !!!!! ATTENTION!!!!!!! after this t.p is not the same as after adding pseudocounts !!! | |
1969 //Introduce amino acid weights into template (for all but SOP scores) | |
1970 if (par.columnscore!=10) | |
1971 for (a=0; a<20; ++a) | |
1972 for (j=0; j<=t.L+1; j++) | |
1973 t.p[j][a]/=pnul[a]; | |
1974 | |
1975 if (v>=5) | |
1976 { | |
1977 cout<<"\nAverage amino acid frequencies\n"; | |
1978 cout<<" A R N D C Q E G H I L K M F P S T W Y V\n"; | |
1979 cout<<"Q: "; | |
1980 for (a=0; a<20; ++a) printf("%4.1f ",100*q.pav[a]); | |
1981 cout<<"\nT: "; | |
1982 for (a=0; a<20; ++a) printf("%4.1f ",100*t.pav[a]); | |
1983 cout<<"\nNull: "; | |
1984 for (a=0; a<20; ++a) printf("%4.1f ",100*pnul[a]); | |
1985 cout<<"\npb: "; | |
1986 for (a=0; a<20; ++a) printf("%4.1f ",100*pb[a]); | |
1987 } | |
1988 | |
1989 | |
1990 return; | |
1991 } | |
1992 | |
1993 | |
1994 ///////////////////////////////////////////////////////////////////////////////////// | |
1995 /** | |
1996 * @brief Write HMM to output file | |
1997 */ | |
1998 void | |
1999 HMM::WriteToFile(char* outfile) | |
2000 { | |
2001 const int SEQLEN=100; // number of residues per line for sequences to be displayed | |
2002 int i,a; | |
2003 | |
2004 if (trans_lin) {fprintf(stderr,"Error: Writing transition pseudocounts in linear representation not allowed. Please report this error to the HHsearch developers.\n"); exit(6);} | |
2005 | |
2006 FILE *outf=NULL; | |
2007 if (strcmp(outfile,"stdout")) | |
2008 { | |
2009 if (par.append) outf=fopen(outfile,"a"); else outf=fopen(outfile,"w"); | |
2010 if (!outf) OpenFileError(outfile); | |
2011 } | |
2012 else | |
2013 outf = stdout; | |
2014 if (v>=2) cout<<"Writing HMM to "<<outfile<<"\n"; | |
2015 | |
2016 // fprintf(outf,"HHsearch HHM format 1.5\n"); | |
2017 fprintf(outf,"HHsearch 1.5\n"); // format specification | |
2018 fprintf(outf,"NAME %s\n",longname); // name of first sequence | |
2019 fprintf(outf,"FAM %s\n",fam); // family name | |
2020 char file_nopath[NAMELEN]; | |
2021 RemovePath(file_nopath,file); | |
2022 fprintf(outf,"FILE %s\n",file_nopath); // base name of alignment file | |
2023 | |
2024 // Print command line | |
2025 fprintf(outf,"COM "); | |
2026 for (int i=0; i<par.argc; i++) | |
2027 if (strlen(par.argv[i])<=100) | |
2028 fprintf(outf,"%s ",par.argv[i]); | |
2029 else | |
2030 fprintf(outf,"<%i characters> ",(int)strlen(par.argv[i])); | |
2031 fprintf(outf,"\n"); | |
2032 | |
2033 // print out date stamp | |
2034 time_t* tp=new(time_t); | |
2035 *tp=time(NULL); | |
2036 fprintf(outf,"DATE %s",ctime(tp)); | |
2037 delete tp; tp = NULL; /* really? FS */ | |
2038 | |
2039 // Print out some statistics of alignment | |
2040 fprintf(outf,"LENG %i match states, %i columns in multiple alignment\n",L,l[L]); | |
2041 fprintf(outf,"FILT %i out of %i sequences passed filter (-id %i -cov %i -qid %i -qsc %.2f -diff %i)\n",N_filtered,N_in,par.max_seqid,par.coverage,par.qid,par.qsc,par.Ndiff); | |
2042 fprintf(outf,"NEFF %-4.1f\n",Neff_HMM); | |
2043 | |
2044 // Print selected sequences from alignment (including secondary structure and confidence values, if known) | |
2045 fprintf(outf,"SEQ\n"); | |
2046 for (int n=0; n<n_display; n++) | |
2047 { | |
2048 fprintf(outf,">%s\n",sname[n]); | |
2049 //first sequence character starts at 1; 0 not used. | |
2050 for(unsigned int j=0; j<strlen(seq[n]+1); j+=SEQLEN) fprintf(outf,"%-.*s\n",SEQLEN,seq[n]+1+j); | |
2051 } | |
2052 fprintf(outf,"#\n"); | |
2053 | |
2054 // print null model background probabilities from substitution matrix | |
2055 fprintf(outf,"NULL "); | |
2056 for (a=0; a<20; ++a) fout(outf,-iround(fast_log2(pb[s2a[a]])*HMMSCALE )); | |
2057 fprintf(outf,"\n"); | |
2058 | |
2059 // print table header line with amino acids | |
2060 fprintf(outf,"HMM "); | |
2061 for (a=0; a<20; ++a) fprintf(outf,"%1c\t",i2aa(s2a[a])); | |
2062 fprintf(outf,"\n"); | |
2063 | |
2064 // print table header line with state transitions | |
2065 fprintf(outf," M->M\tM->I\tM->D\tI->M\tI->I\tD->M\tD->D\tNeff\tNeff_I\tNeff_D\n"); | |
2066 | |
2067 // print out transition probabilities from begin state (virtual match state) | |
2068 fprintf(outf," "); | |
2069 for (a=0; a<=D2D; ++a) fout(outf,-iround(tr[0][a]*HMMSCALE)); | |
2070 fout(outf,iround(Neff_M[0]*HMMSCALE)); | |
2071 fout(outf,iround(Neff_I[0]*HMMSCALE)); | |
2072 fout(outf,iround(Neff_D[0]*HMMSCALE)); | |
2073 fprintf(outf,"\n"); | |
2074 | |
2075 // Start loop for printing HMM columns | |
2076 int h=1; | |
2077 for (i=1; i<=L; ++i) | |
2078 { | |
2079 | |
2080 while(islower(seq[nfirst][h]) && seq[nfirst][h]) h++; | |
2081 fprintf(outf,"%1c %-4i ",seq[nfirst][h++],i); | |
2082 | |
2083 // Print emission probabilities for match state | |
2084 for (a=0; a<20; ++a) fout(outf,-iround(fast_log2(p[i][s2a[a]])*HMMSCALE )); | |
2085 fprintf(outf,"%-i",l[i]); | |
2086 fprintf(outf,"\n"); | |
2087 | |
2088 // Print transition probabilities | |
2089 fprintf(outf," "); | |
2090 for (a=0; a<=D2D; ++a) fout(outf,-iround(tr[i][a]*HMMSCALE)); | |
2091 fout(outf,iround(Neff_M[i]*HMMSCALE)); | |
2092 fout(outf,iround(Neff_I[i]*HMMSCALE)); | |
2093 fout(outf,iround(Neff_D[i]*HMMSCALE)); | |
2094 fprintf(outf,"\n\n"); | |
2095 } // end for(i)-loop for printing HMM columns | |
2096 | |
2097 fprintf(outf,"//\n"); | |
2098 fclose(outf); | |
2099 } | |
2100 | |
2101 ///////////////////////////////////////////////////////////////////////////////////// | |
2102 /** | |
2103 * @brief Write HMM to output file | |
2104 */ | |
2105 void | |
2106 HMM::InsertCalibration(char* infile) | |
2107 { | |
2108 char* line = new(char[LINELEN]); // input line | |
2109 char** lines = new(char*[3*L+100000]); | |
2110 int nline=0; | |
2111 int l; | |
2112 char done=0; // inserted new 'EVD mu sigma' line? | |
2113 | |
2114 // Read from infile all lines and insert the EVD line with lamda and mu coefficients | |
2115 ifstream inf; | |
2116 inf.open(infile, ios::in); | |
2117 if (!inf) OpenFileError(infile); | |
2118 if (v>=2) cout<<"Recording calibration coefficients in "<<infile<<"\n"; | |
2119 | |
2120 while (inf.getline(line,LINELEN) && !(line[0]=='/' && line[1]=='/') && nline<2*/*MAXRES*/par.maxResLen) | |
2121 { | |
2122 | |
2123 // Found an EVD lamda mu line? -> remove | |
2124 while (!done && !strncmp(line,"EVD ",3) && !(line[0]=='/' && line[1]=='/') && nline<2*/*MAXRES*/par.maxResLen) | |
2125 inf.getline(line,LINELEN); | |
2126 if ((line[0]=='/' && line[1]=='/') || nline>=2*/*MAXRES*/par.maxResLen) | |
2127 {fprintf(stderr,"Error: wrong format in %s. Expecting hhm format\n",infile); exit(1);} | |
2128 | |
2129 // Found the SEQ line? -> insert calibration before this line | |
2130 if (!done && (!strncmp("SEQ",line,3) || !strncmp("HMM",line,3)) && (isspace(line[3]) || line[3]=='\0')) | |
2131 { | |
2132 done=1; | |
2133 lines[nline]=new(char[128]); | |
2134 if (!lines[nline]) MemoryError("space to read in HHM file for calibration"); | |
2135 sprintf(lines[nline],"EVD %-7.4f %-7.4f",lamda,mu); | |
2136 nline++; | |
2137 } | |
2138 lines[nline]=new(char[strlen(line)+1]); | |
2139 if (!lines[nline]) MemoryError("space to read in HHM file for calibration"); | |
2140 strcpy (lines[nline],line); | |
2141 nline++; | |
2142 } | |
2143 inf.close(); | |
2144 | |
2145 // Write to infile all lines | |
2146 FILE* infout=fopen(infile,"w"); | |
2147 if (!infout) { | |
2148 cerr<<endl<<"WARNING in "<<program_name<<": no calibration coefficients written to "<<infile<<":\n"; | |
2149 cerr<<"Could not open file for writing.\n"; | |
2150 return; | |
2151 } | |
2152 for (l=0; l<nline; l++) { | |
2153 fprintf(infout,"%s\n",lines[l]); | |
2154 delete[] lines[l]; lines[l] = NULL; | |
2155 } | |
2156 fprintf(infout,"//\n"); | |
2157 fclose(infout); | |
2158 delete[] line; line = NULL; | |
2159 delete[] lines; lines = NULL; | |
2160 return; | |
2161 } | |
2162 | |
2163 ///////////////////////////////////////////////////////////////////////////////////// | |
2164 /** | |
2165 * @brief Write HMM to output file in HMMER format | |
2166 */ | |
2167 void | |
2168 HMM::WriteToFileHMMER(char* outfile) | |
2169 { | |
2170 const int INTSCALE=1000; //scaling factor in HMMER files | |
2171 const float pBD=0.50; | |
2172 const int LOG2pBD=iround(fast_log2(pBD)*INTSCALE); | |
2173 const int LOG2pBM=iround(fast_log2(1-pBD)*INTSCALE); | |
2174 const float pJB=1.0/350; | |
2175 const int LOG2pJB=iround(fast_log2(pJB)*INTSCALE); | |
2176 const int LOG2pJJ=iround(fast_log2(1-pJB)*INTSCALE); | |
2177 const float pEJ=0.5; | |
2178 const int LOG2pEJ=iround(fast_log2(pEJ)*INTSCALE); | |
2179 const int LOG2pEC=iround(fast_log2(1-pEJ)*INTSCALE); | |
2180 char c; | |
2181 int i,a; | |
2182 | |
2183 if (trans_lin) {fprintf(stderr,"Error: Writing transition pseudocounts in linear representation not allowed. Please report this error to the HHsearch developers.\n"); exit(6);} | |
2184 | |
2185 FILE *outf=NULL; | |
2186 if (strcmp(outfile,"stdout")) | |
2187 { | |
2188 if (par.append) outf=fopen(outfile,"a"); else outf=fopen(outfile,"w"); | |
2189 if (!outf) OpenFileError(outfile); | |
2190 } | |
2191 else | |
2192 outf = stdout; | |
2193 if (v>=2) cout<<"Writing HMM to "<<outfile<<"\n"; | |
2194 | |
2195 fprintf(outf,"HMMER2.0 [hhmake %s]\n",VERSION_AND_DATE); | |
2196 fprintf(outf,"NAME %s\n",file); // base name of alignment file | |
2197 fprintf(outf,"DESC %s\n",longname); | |
2198 fprintf(outf,"LENG %i\n",L); | |
2199 fprintf(outf,"ALPH Amino\n"); // amino acid seuqences (not DNA) | |
2200 fprintf(outf,"RF yes\n"); // reference annotation flag | |
2201 fprintf(outf,"CS yes\n"); // consensus structure annotation flag | |
2202 fprintf(outf,"MAP yes\n"); // write MA column number after each line of aa probabilities | |
2203 | |
2204 fprintf(outf,"COM "); // print out command line | |
2205 for (i=0; i<=par.argc-1; ++i) fprintf(outf,"%s ",par.argv[i]); fprintf(outf,"\n"); | |
2206 | |
2207 fprintf(outf,"NSEQ %i\n",N_filtered); // print number of sequences after filtering | |
2208 | |
2209 // Date stamp | |
2210 time_t* tp=new(time_t); | |
2211 *tp=time(NULL); | |
2212 fprintf(outf,"DATE %s",ctime(tp)); | |
2213 delete tp; tp = NULL; /* really? FS */ | |
2214 | |
2215 // Print out secondary structure | |
2216 if (nss_dssp>=0) | |
2217 fprintf(outf,"SSDSS %s\n",seq[nss_dssp]); | |
2218 if (nsa_dssp>=0) | |
2219 fprintf(outf,"SADSS %s\n",seq[nsa_dssp]); | |
2220 if (nss_pred>=0) | |
2221 fprintf(outf,"SSPRD %s\n",seq[nss_pred]); | |
2222 if (nss_conf>=0) | |
2223 fprintf(outf,"SSCNF %s\n",seq[nss_conf]); | |
2224 | |
2225 | |
2226 // Special Plan7 transitions that control repeated detection of profile HMM within sequence | |
2227 fprintf(outf,"XT %6i %6i %6i %6i %6i %6i %6i %6i\n",LOG2pJB,LOG2pJJ,LOG2pEC,LOG2pEJ,LOG2pJB,LOG2pJJ,LOG2pJB,LOG2pJJ); | |
2228 fprintf(outf,"NULT -4 -8455\n"); | |
2229 | |
2230 | |
2231 // Null model background probabilities from substitution matrix | |
2232 fprintf(outf,"NULE "); | |
2233 for (a=0; a<20; ++a) | |
2234 { | |
2235 float lg2=fast_log2(pb[s2a[a]]*20.0); | |
2236 if (lg2<-99.999) fprintf(outf," *"); else fprintf(outf," %6i",iround(lg2*INTSCALE)); | |
2237 } | |
2238 fprintf(outf,"\n"); | |
2239 | |
2240 // Table header line with amino acids | |
2241 fprintf(outf,"HMM "); | |
2242 for (a=0; a<20; ++a) fprintf(outf," %1c ",i2aa(s2a[a])); | |
2243 fprintf(outf,"\n"); | |
2244 | |
2245 // Table header line with state transitions | |
2246 fprintf(outf," m->m m->i m->d i->m i->i d->m d->d b->m m->e\n"); | |
2247 | |
2248 // Transition probabilities from begin state | |
2249 fprintf(outf," %6i * %6i\n",LOG2pBM,LOG2pBD); | |
2250 | |
2251 // Start loop for printing HMM columns | |
2252 int h=1, hss=1; | |
2253 for (i=1; i<=L; ++i) | |
2254 { | |
2255 | |
2256 // Emission probabilities for match state | |
2257 fprintf(outf," %5i",i); | |
2258 for (a=0; a<20; ++a) fprintf(outf," %6i",imax(-9999,iround(fast_log2(p[i][s2a[a]]/pb[s2a[a]])*INTSCALE))); | |
2259 fprintf(outf," %5i",l[i]); | |
2260 fprintf(outf,"\n"); | |
2261 | |
2262 // Emission probabilities (relative to null model) for insert state | |
2263 while(islower(seq[nfirst][h]) && seq[nfirst][h]) h++; | |
2264 if (i==L) | |
2265 fprintf(outf," %1c * * * * * * * * * * * * * * * * * * * *\n",seq[nfirst][h++]); | |
2266 else | |
2267 fprintf(outf," %1c 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",seq[nfirst][h++]); | |
2268 | |
2269 // Transition probabilities | |
2270 if (nss_dssp>=0) | |
2271 { | |
2272 while(islower(seq[nss_dssp][hss]) && seq[nss_dssp][hss]) hss++; | |
2273 c=seq[nss_dssp][hss++]; | |
2274 } | |
2275 else c=' '; | |
2276 fprintf(outf," %1c",c); | |
2277 if (i==1) | |
2278 { | |
2279 for (a=0; a<=D2D; ++a) fprintf(outf," %6i",imax(-9999,iround(tr[i][a]*INTSCALE))); | |
2280 fprintf(outf," %6i *\n",LOG2pBM); | |
2281 } | |
2282 else if (i==L) | |
2283 { | |
2284 for (a=0; a<=D2D; ++a) fprintf(outf," *"); | |
2285 fprintf(outf," * 0\n"); | |
2286 } | |
2287 else | |
2288 { | |
2289 for (a=0; a<=D2D; ++a) fprintf(outf," %6i",imax(-9999,iround(tr[i][a]*INTSCALE))); | |
2290 fprintf(outf," * *\n"); | |
2291 } | |
2292 } | |
2293 | |
2294 fprintf(outf,"//\n"); | |
2295 fclose(outf); | |
2296 } | |
2297 | |
2298 | |
2299 ///////////////////////////////////////////////////////////////////////////////////// | |
2300 /** | |
2301 * @brief Transform log to lin transition probs | |
2302 */ | |
2303 void | |
2304 HMM::Log2LinTransitionProbs(float beta) | |
2305 { | |
2306 if (trans_lin==1) return; | |
2307 trans_lin=1; | |
2308 for (int i=0; i<=L; ++i) | |
2309 { | |
2310 for (int a=0; a<NTRANS; ++a) | |
2311 tr[i][a] = fpow2(beta*tr[i][a]); | |
2312 /* FIXME valgrind says: "Conditional jump or move depends on | |
2313 * uninitialised value(s)" when using hmm iteration | |
2314 */ | |
2315 } | |
2316 } | |
2317 | |
2318 | |
2319 /** | |
2320 * @brief Set query columns in His-tags etc to Null model distribution | |
2321 */ | |
2322 void | |
2323 HMM::NeutralizeTags() | |
2324 { | |
2325 char* qseq = seq[nfirst]; | |
2326 char* pt; | |
2327 int a,i; | |
2328 | |
2329 if (NULL == qseq){ | |
2330 return; | |
2331 } | |
2332 | |
2333 // Neutralize His tag | |
2334 if ( (pt=strstr(qseq,"HHHHH")) ) | |
2335 { | |
2336 int i0 = pt-qseq+1; | |
2337 if (v>=2) printf("Neutralized His-tag at position %i\n",i0); | |
2338 for (i=imax(i0-5,1); i<i0; ++i) // neutralize leading 5 columns | |
2339 for (a=0; a<NAA; ++a) p[i][a]=pb[a]; | |
2340 for (; (*pt)!='H'; ++i,++pt) // neutralize His columns | |
2341 for (a=0; a<NAA; ++a) p[i][a]=pb[a]; | |
2342 i0=i; | |
2343 for (; i<imin(i0+5,L+1); ++i) // neutralize trailing 5 columns | |
2344 for (a=0; a<NAA; ++a) p[i][a]=pb[a]; | |
2345 if (v>=3) printf("start:%i end:%i\n",imax(i0-5,1),i-1); | |
2346 } | |
2347 | |
2348 // Neutralize C-myc tag | |
2349 if ( (pt=strstr(qseq,"EQKLISEEDL")) ) | |
2350 { | |
2351 if (v>=2) printf("Neutralized C-myc-tag at position %i\n",int(pt-qseq)+1); | |
2352 for (i=pt-qseq+1; i<=pt-qseq+10; ++i) | |
2353 for (a=0; a<NAA; ++a) p[i][a]=pb[a]; | |
2354 } | |
2355 // Neutralize FLAG tag | |
2356 if ( (pt=strstr(qseq,"DYKDDDDK")) ) | |
2357 { | |
2358 if (v>=2) printf("Neutralized FLAG-tag at position %i\n",int(pt-qseq)+1); | |
2359 for (i=pt-qseq+1; i<=pt-qseq+8; ++i) | |
2360 for (a=0; a<NAA; ++a) p[i][a]=pb[a]; | |
2361 } | |
2362 } | |
2363 | |
2364 | |
2365 | |
2366 ///////////////////////////////////////////////////////////////////////////////////// | |
2367 /** | |
2368 * @brief Calculate effective number of sequences using profiles INCLUDING pseudocounts | |
2369 */ | |
2370 float | |
2371 HMM::CalcNeff() | |
2372 { | |
2373 float Neff=0; | |
2374 for (int i=1; i<=L; ++i) | |
2375 for (int a=0; a<20; ++a) | |
2376 if (p[i][a]>1E-10) Neff-=p[i][a]*fast_log2(p[i][a]); | |
2377 return fpow2(Neff/L); | |
2378 } | |
2379 | |
2380 | |
2381 ///////////////////////////////////////////////////////////////////////////////////// | |
2382 /** | |
2383 * @brief Calculate consensus of HMM (needed to merge HMMs later) | |
2384 */ | |
2385 void | |
2386 HMM::CalculateConsensus() | |
2387 { | |
2388 int i; // position in query | |
2389 int a; // amino acid | |
2390 if (!Xcons) Xcons = new char[/*MAXRES*/par.maxResLen+2]; | |
2391 for (i=1; i<=L; ++i) | |
2392 { | |
2393 float max=f[i][0]-pb[0]; | |
2394 for (a=1; a<20; ++a) | |
2395 if (f[i][a]-pb[a]>max) Xcons[i]=a; | |
2396 } | |
2397 Xcons[0]=Xcons[L+1]=ENDGAP; | |
2398 } | |
2399 | |
2400 // ///////////////////////////////////////////////////////////////////////////////////// | |
2401 // // Store linear transition probabilities | |
2402 // ///////////////////////////////////////////////////////////////////////////////////// | |
2403 // void HMM::StoreLinearTransitionProbs() | |
2404 // { | |
2405 // int i; // position in query | |
2406 // for (i=0; i<=L+1; ++i) if (!tr_lin[i]) tr_lin[i] = new(float[NTRANS]); | |
2407 // for (i=0; i<=L+1; ++i) | |
2408 // { | |
2409 // tr_lin[i][M2M] = fpow2(tr[i][M2M]); | |
2410 // tr_lin[i][M2I] = fpow2(tr[i][M2I]); | |
2411 // tr_lin[i][M2D] = fpow2(tr[i][M2D]); | |
2412 // tr_lin[i][D2M] = fpow2(tr[i][M2D]); | |
2413 // tr_lin[i][D2D] = fpow2(tr[i][D2D]); | |
2414 // tr_lin[i][I2M] = fpow2(tr[i][I2M]); | |
2415 // tr_lin[i][I2I] = fpow2(tr[i][I2I]); | |
2416 // } | |
2417 // } | |
2418 | |
2419 | |
2420 // #define Weff(Neff) (1.0+par.neffa*(Neff-1.0)+(par.neffb-4.0*par.neffa)/16.0*(Neff-1.0)*(Neff-1.0)) | |
2421 | |
2422 // ///////////////////////////////////////////////////////////////////////////////////// | |
2423 // // Initialize f[i][a] with query HMM | |
2424 // ///////////////////////////////////////////////////////////////////////////////////// | |
2425 // void HMM::MergeQueryHMM(HMM& q, float wk[]) | |
2426 // { | |
2427 // int i; // position in query | |
2428 // int a; // amino acid | |
2429 // float Weff_M, Weff_D, Weff_I; | |
2430 // for (i=1; i<=L; i++) | |
2431 // { | |
2432 // Weff_M = Weff(q.Neff_M[i]-1.0); | |
2433 // Weff_D = Weff(q.Neff_D[i]-1.0); | |
2434 // Weff_I = Weff(q.Neff_I[i]-1.0); | |
2435 // for (a=0; a<20; a++) f[i][a] = q.f[i][a]*wk[i]*Weff_M; | |
2436 // tr_lin[i][M2M] = q.tr_lin[i][M2M]*wk[i]*Weff_M; | |
2437 // tr_lin[i][M2I] = q.tr_lin[i][M2I]*wk[i]*Weff_M; | |
2438 // tr_lin[i][M2D] = q.tr_lin[i][M2D]*wk[i]*Weff_M; | |
2439 // tr_lin[i][D2M] = q.tr_lin[i][D2M]*wk[i]*Weff_D; | |
2440 // tr_lin[i][D2D] = q.tr_lin[i][D2D]*wk[i]*Weff_D; | |
2441 // tr_lin[i][I2M] = q.tr_lin[i][I2M]*wk[i]*Weff_I; | |
2442 // tr_lin[i][I2I] = q.tr_lin[i][I2I]*wk[i]*Weff_I; | |
2443 // } | |
2444 // } | |
2445 | |
2446 | |
2447 | |
2448 // ///////////////////////////////////////////////////////////////////////////////////// | |
2449 // // Normalize probabilities in total merged super-HMM | |
2450 // ///////////////////////////////////////////////////////////////////////////////////// | |
2451 // void HMM::NormalizeHMMandTransitionsLin2Log() | |
2452 // { | |
2453 // int i; // position in query | |
2454 // int a; // amino acid | |
2455 // for (i=0; i<=L+1; i++) | |
2456 // { | |
2457 // float sum=0.0; | |
2458 // for (a=0; a<20; a++) sum += f[i][a]; | |
2459 // for (a=0; a<20; a++) f[i][a]/=sum; | |
2460 // sum = tr_lin[i][M2M] + tr_lin[i][M2I] + tr_lin[i][M2D]; | |
2461 // tr_lin[i][M2M] /= sum; | |
2462 // tr_lin[i][M2I] /= sum; | |
2463 // tr_lin[i][M2D] /= sum; | |
2464 // tr[i][M2M] = fast_log2(tr_lin[i][M2M]); | |
2465 // tr[i][M2I] = fast_log2(tr_lin[i][M2I]); | |
2466 // tr[i][M2D] = fast_log2(tr_lin[i][M2D]); | |
2467 // sum = tr_lin[i][D2M] + tr_lin[i][D2D]; | |
2468 // tr_lin[i][D2M] /= sum; | |
2469 // tr_lin[i][D2D] /= sum; | |
2470 // tr[i][D2M] = fast_log2(tr_lin[i][D2M]); | |
2471 // tr[i][D2D] = fast_log2(tr_lin[i][D2D]); | |
2472 // sum = tr_lin[i][I2M] + tr_lin[i][I2I]; | |
2473 // tr_lin[i][I2M] /= sum; | |
2474 // tr_lin[i][I2I] /= sum; | |
2475 // tr[i][I2M] = fast_log2(tr_lin[i][I2M]); | |
2476 // tr[i][I2I] = fast_log2(tr_lin[i][I2I]); | |
2477 // } | |
2478 // } | |
2479 | |
2480 | |
2481 // UNCOMMENT TO ACTIVATE COMPOSITIONALLY BIASED PSEUDOCOUNTS BY RESCALING THE RATE MATRIX | |
2482 | |
2483 // ///////////////////////////////////////////////////////////////////////////////////// | |
2484 // //// Function to minimize | |
2485 // ///////////////////////////////////////////////////////////////////////////////////// | |
2486 // double RescaleMatrixFunc(double x[]) | |
2487 // { | |
2488 // double sum=0.0; | |
2489 // for (int a=0; a<20; ++a) | |
2490 // { | |
2491 // double za=0.0; | |
2492 // for (int b=0; b<20; ++b) za+=P[a][b]*x[b]; | |
2493 // sum += (x[a]*za-qav[a])*(x[a]*za-qav[a]); | |
2494 // } | |
2495 // return sum; | |
2496 // } | |
2497 | |
2498 // ///////////////////////////////////////////////////////////////////////////////////// | |
2499 // //// Gradient of function to minimize | |
2500 // ///////////////////////////////////////////////////////////////////////////////////// | |
2501 // void RescaleMatrixFuncGrad(double grad[], double x[]) | |
2502 // { | |
2503 // double z[20] = {0.0}; | |
2504 // double w[20]; | |
2505 // double tmp; | |
2506 // for (int a=0; a<20; ++a) | |
2507 // for (int b=0; b<20; ++b) z[a] += P[a][b]*x[b]; | |
2508 | |
2509 // for (int a=0; a<20; ++a) w[a] = x[a]*z[a]-qav[a]; | |
2510 // for (int a=0; a<20; ++a) | |
2511 // { | |
2512 // tmp = w[a]*z[a]; | |
2513 // for (int b=0; b<20; ++b) tmp += P[a][b]*x[b]*w[b]; | |
2514 // grad[a] = 2.0*tmp; | |
2515 // } | |
2516 // return; | |
2517 // } | |
2518 | |
2519 | |
2520 // ///////////////////////////////////////////////////////////////////////////////////// | |
2521 // //// Rescale a substitution matrix to biased aa frequencies in global vector qav[a] | |
2522 // ///////////////////////////////////////////////////////////////////////////////////// | |
2523 // void HMM::RescaleMatrix() | |
2524 // { | |
2525 // int a,b; | |
2526 // int code; | |
2527 // double x[21]; // scaling factor | |
2528 // double val_min; | |
2529 // const int len=20; | |
2530 // const int max_iterations=50; | |
2531 | |
2532 // if (v>=2) printf("Adjusting rate matrix to query amino acid composition ...\n"); | |
2533 | |
2534 // // Put amino acid frequencies into global array (needed to call WNLIB's conjugate gradient method) | |
2535 // for (a=0; a<20; ++a) qav[a] = pav[a]; | |
2536 | |
2537 // // Initialize scaling factors x[a] | |
2538 // for (a=0; a<20; ++a) x[a]=pow(qav[a]/pb[a],0.73); // Initialize | |
2539 | |
2540 // // Call conjugate gradient minimization method from WNLIB | |
2541 // wn_conj_gradient_method(&code,&val_min,x,len,&RescaleMatrixFunc,&RescaleMatrixFuncGrad,max_iterations); | |
2542 | |
2543 | |
2544 // // Calculate z[a] = sum_b Pab*xb | |
2545 // float sum_err=0.0f; | |
2546 // float sum = 0.0f; | |
2547 // for (a=0; a<20; ++a) | |
2548 // { | |
2549 // float za=0.0f; // za = sum_b Pab*xb | |
2550 // for (b=0; b<20; ++b) za+=P[a][b]*x[b]; | |
2551 // sum_err += (x[a]*za/qav[a]-1)*(x[a]*za/qav[a]-1); | |
2552 // sum += x[a]*za; | |
2553 // } | |
2554 // if (sum_err>1e-3 & v>=1) fprintf(stderr,"WARNING: adjusting rate matrix by CG resulted in residual error of %5.3f.\n",sum_err); | |
2555 | |
2556 // // Rescale rate matrix | |
2557 // for (a=0; a<20; ++a) | |
2558 // for (b=0; b<20; ++b) | |
2559 // { | |
2560 // P[a][b] *= x[a]*x[b]/sum; | |
2561 // R[a][b] = P[a][b]/qav[b]; | |
2562 // } | |
2563 | |
2564 // // How well approximated? | |
2565 // if (v>=3) | |
2566 // { | |
2567 // // Calculate z[a] = sum_b Pab*xb | |
2568 // float z[21]; | |
2569 // for (a=0; a<20; ++a) | |
2570 // for (z[a]=0.0, b=0; b<20; ++b) z[a]+=P[a][b]; | |
2571 // printf("Adjust A R N D C Q E G H I L K M F P S T W Y V\nErr? "); | |
2572 // for (a=0; a<20; ++a) printf("%4.0f ",1000*z[a]/qav[a]); | |
2573 // cout<<endl<<"xa "; | |
2574 // for (a=0; a<20; ++a) fprintf(stdout,"%4.2f ",x[a]); | |
2575 // cout<<endl; | |
2576 // } | |
2577 | |
2578 // // Evaluate sequence identity underlying substitution matrix | |
2579 // if (v>=3) | |
2580 // { | |
2581 // float id=0.0f; | |
2582 // float entropy=0.0f; | |
2583 // float entropy_qav=0.0f; | |
2584 // float mut_info=0.0f; | |
2585 // for (a=0; a<20; ++a) id += P[a][a]; | |
2586 // for (a=0; a<20; ++a) entropy_qav-=qav[a]*fast_log2(qav[a]); | |
2587 // for (a=0; a<20; ++a) | |
2588 // for (b=0; b<20; ++b) | |
2589 // { | |
2590 // entropy-=P[a][b]*fast_log2(R[a][b]); | |
2591 // mut_info += P[a][b]*fast_log2(P[a][b]/qav[a]/qav[b]); | |
2592 // } | |
2593 | |
2594 // fprintf(stdout,"Rescaling rate matrix: sequence identity = %2.0f%%; entropy per column = %4.2f bits (out of %4.2f); mutual information = %4.2f bits\n",100*id,entropy,entropy_qav,mut_info); | |
2595 // } | |
2596 // return; | |
2597 // } | |
2598 | |
2599 | |
2600 /* @* HMM::ClobberGlobal (eg, q,t) | |
2601 */ | |
2602 void | |
2603 HMM::ClobberGlobal(void){ | |
2604 | |
2605 for (int i = 0; i < n_display; i++){ | |
2606 if (sname[i]){ | |
2607 delete[] sname[i]; sname[i] = NULL; | |
2608 } | |
2609 if (seq[i]){ | |
2610 delete[] seq[i]; seq[i] = NULL; | |
2611 } | |
2612 } | |
2613 Neff_M[0] = Neff_I[0] = Neff_D[0] = 0.0; | |
2614 longname[0] = '\0'; file[0] = '\0'; | |
2615 ss_dssp[0] = sa_dssp[0] = ss_pred[0] = ss_conf[0] = '\0'; | |
2616 Xcons = NULL; | |
2617 l[0] = 0; | |
2618 L = 0; | |
2619 Neff_HMM = 0; | |
2620 n_display = N_in = N_filtered = 0; | |
2621 nss_dssp = nsa_dssp = nss_pred = nss_conf = nfirst = ncons = -1; | |
2622 lamda = 0.0; mu = 0.0; | |
2623 name[0] = longname[0] = fam[0] = '\0'; | |
2624 | |
2625 for (int i = 0; i < NAA; i++){ | |
2626 pav[i] = 0; | |
2627 } | |
2628 | |
2629 /* @= */ | |
2630 return; | |
2631 | |
2632 } /* this is the end of ClobberGlobal() */ | |
2633 | |
2634 | |
2635 /* | |
2636 * EOF hhhmm-C.h | |
2637 */ |