1
|
1 /* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
|
|
2
|
|
3 /*********************************************************************
|
|
4 * Clustal Omega - Multiple sequence alignment
|
|
5 *
|
|
6 * Copyright (C) 2010 University College Dublin
|
|
7 *
|
|
8 * Clustal-Omega is free software; you can redistribute it and/or
|
|
9 * modify it under the terms of the GNU General Public License as
|
|
10 * published by the Free Software Foundation; either version 2 of the
|
|
11 * License, or (at your option) any later version.
|
|
12 *
|
|
13 * This file is part of Clustal-Omega.
|
|
14 *
|
|
15 ********************************************************************/
|
|
16
|
|
17 /*
|
|
18 * RCS $Id: hhhmm-C.h 224 2011-03-23 12:13:33Z fabian $
|
|
19 */
|
|
20
|
|
21
|
|
22 // hhhmm.C
|
|
23
|
|
24 #ifndef MAIN
|
|
25 #define MAIN
|
|
26 #include <iostream> // cin, cout, cerr
|
|
27 #include <fstream> // ofstream, ifstream
|
|
28 #include <stdio.h> // printf
|
|
29 using std::cout;
|
|
30 using std::cerr;
|
|
31 using std::endl;
|
|
32 using std::ios;
|
|
33 using std::ifstream;
|
|
34 using std::ofstream;
|
|
35 #include <stdlib.h> // exit
|
|
36 #include <string> // strcmp, strstr
|
|
37 #include <math.h> // sqrt, pow
|
|
38 #include <limits.h> // INT_MIN
|
|
39 #include <float.h> // FLT_MIN
|
|
40 #include <time.h> // clock
|
|
41 #include <ctype.h> // islower, isdigit etc
|
|
42 #include "util-C.h" // imax, fmax, iround, iceil, ifloor, strint, strscn, strcut, substr, uprstr, uprchr, Basename etc.
|
|
43 #include "list.h" // list data structure
|
|
44 #include "hash.h" // hash data structure
|
|
45 #include "hhdecl-C.h"
|
|
46 #include "hhutil-C.h" // imax, fmax, iround, iceil, ifloor, strint, strscn, strcut, substr, uprstr, uprchr, Basename etc.
|
|
47 #endif
|
|
48
|
|
49 // #ifndef WNLIB
|
|
50 // #define WNLIB
|
|
51 // #include "wnconj.h" // Will Naylor's wnlib for optimization in C
|
|
52 // #endif
|
|
53
|
|
54 //////////////////////////////////////////////////////////////////////////////
|
|
55 //// Class HMM
|
|
56 //////////////////////////////////////////////////////////////////////////////
|
|
57
|
|
58 //////////////////////////////////////////////////////////////////////////////
|
|
59 // Object constructor
|
|
60 //////////////////////////////////////////////////////////////////////////////
|
|
61 HMM::HMM(int maxseqdis, int maxres)
|
|
62 {
|
|
63 sname = new char*[maxseqdis]; // names of stored sequences
|
|
64 for (int i = 0; i < maxseqdis; i++){ sname[i] = NULL;}
|
|
65 seq = new char*[maxseqdis]; // residues of stored sequences (first at pos 1!)
|
|
66 for (int i = 0; i < maxseqdis; i++){ seq[i] = NULL;}
|
|
67 Neff_M = new float[maxres]; // Neff_M[i] = diversity of subalignment of seqs that have residue in col i
|
|
68 Neff_I = new float[maxres]; // Neff_I[i] = diversity of subalignment of seqs that have insert in col i
|
|
69 Neff_D = new float[maxres]; // Neff_D[i] = diversity of subalignment of seqs that have delete in col i
|
|
70 longname = new char[DESCLEN]; // Full name of first sequence of original alignment (NAME field)
|
|
71 ss_dssp = new char[maxres]; // secondary structure determined by dssp 0:- 1:H 2:E 3:C 4:S 5:T 6:G 7:B
|
|
72 sa_dssp = new char[maxres]; // solvent accessibility state determined by dssp 0:- 1:A (absolutely buried) 2:B 3:C 4:D 5:E (exposed)
|
|
73 ss_pred = new char[maxres]; // predicted secondary structure 0:- 1:H 2:E 3:C
|
|
74 ss_conf = new char[maxres]; // confidence value of prediction 0:- 1:0 ... 10:9
|
|
75 Xcons = NULL; // create only when needed: consensus sequence in internal representation (A=0 R=1 N=2 D=3 ...)
|
|
76 l = new int[maxres]; // l[i] = pos. of j'th match state in aligment
|
|
77 /* FS introduced sentinel, NULL terminates loop in destructor, FS, r221->222 */
|
|
78 f = new float*[maxres+1]; f[maxres] = NULL; // f[i][a] = prob of finding amino acid a in column i WITHOUT pseudocounts
|
|
79 g = new float*[maxres+1]; g[maxres] = NULL; // f[i][a] = prob of finding amino acid a in column i WITH pseudocounts
|
|
80 p = new float*[maxres+1]; p[maxres] = NULL; // p[i][a] = prob of finding amino acid a in column i WITH OPTIMUM pseudocounts
|
|
81 tr = new float*[maxres+1]; tr[maxres] = NULL; // log2 of transition probabilities M2M M2I M2D I2M I2I D2M D2D M2M_GAPOPEN GAPOPEN GAPEXTD
|
|
82 // tr_lin = new float*[maxres]; // linear transition probabilities M2M M2I M2D I2M I2I D2M D2D M2M_GAPOPEN GAPOPEN GAPEXTD
|
|
83 for (int i=0; i<maxres; i++) {f[i]=new(float[NAA+3]);}
|
|
84 for (int i=0; i<maxres; i++) {g[i]=new(float[NAA]);}
|
|
85 for (int i=0; i<maxres; i++) {p[i]=new(float[NAA]);}
|
|
86 for (int i=0; i<maxres; i++) {tr[i]=new(float[NTRANS]);}
|
|
87 // for (int i=0; i<maxres; i++) tr_lin[i]=new(float[NTRANS]);
|
|
88 L=0;
|
|
89 Neff_HMM=0;
|
|
90 n_display=N_in=N_filtered=0;
|
|
91 nss_dssp=nsa_dssp=nss_pred=nss_conf=nfirst=ncons=-1;
|
|
92 // lamda_hash.New(37,0.0); // Set size and NULL element for hash
|
|
93 // mu_hash.New(37,0.0); // Set size and NULL element for hash
|
|
94 lamda=0.0; mu=0.0;
|
|
95 name[0]=longname[0]=fam[0]='\0';
|
|
96 trans_lin=0; // transition probs in log space
|
|
97 }
|
|
98
|
|
99
|
|
100 //////////////////////////////////////////////////////////////////////////////
|
|
101 // Object destructor
|
|
102 //////////////////////////////////////////////////////////////////////////////
|
|
103 HMM::~HMM()
|
|
104 {
|
|
105 //Delete name and seq matrices
|
|
106 if (NULL != sname){
|
|
107 for (int k=0; (k < n_display) && (NULL != sname[k]); k++){
|
|
108 delete[] sname[k]; sname[k] = NULL;
|
|
109 }
|
|
110 delete[] sname; sname = NULL;
|
|
111 }
|
|
112 if (NULL != seq){
|
|
113 for (int k=0; (k < n_display) && (NULL != seq[k]); k++){
|
|
114 delete[] seq[k]; seq[k] = NULL;
|
|
115 }
|
|
116 delete[] seq; seq = NULL;
|
|
117 }
|
|
118 delete[] Neff_M; Neff_M = NULL;
|
|
119 delete[] Neff_D; Neff_D = NULL;
|
|
120 delete[] Neff_I; Neff_I = NULL;
|
|
121 delete[] longname; longname = NULL;
|
|
122 delete[] ss_dssp; ss_dssp = NULL;
|
|
123 delete[] sa_dssp; sa_dssp = NULL;
|
|
124 delete[] ss_pred; ss_pred = NULL;
|
|
125 delete[] ss_conf; ss_conf = NULL;
|
|
126 delete[] Xcons; Xcons = NULL;
|
|
127 delete[] l; l = NULL;
|
|
128 for (int i=0; i</*MAXRES*/par.maxResLen; i++){
|
|
129 if (f[i]){
|
|
130 delete[] f[i]; f[i] = NULL;
|
|
131 }
|
|
132 else break;
|
|
133 }
|
|
134 for (int i=0; i</*MAXRES*/par.maxResLen; i++){
|
|
135 if (g[i]){
|
|
136 delete[] g[i]; g[i] = NULL;
|
|
137 }
|
|
138 else break;
|
|
139 }
|
|
140 for (int i=0; i</*MAXRES*/par.maxResLen; i++){
|
|
141 if (p[i]){
|
|
142 delete[] p[i]; p[i] = NULL;
|
|
143 }
|
|
144 else break;
|
|
145 }
|
|
146 for (int i=0; i</*MAXRES*/par.maxResLen; i++){
|
|
147 if (tr[i]){
|
|
148 delete[] tr[i]; tr[i] = NULL;
|
|
149 }
|
|
150 else break;
|
|
151 }
|
|
152 // for (int i=0; i</*MAXRES*/par.maxResLen; i++) if (tr_lin[i]) delete[] tr_lin[i]; else break;
|
|
153 delete[] f; f = NULL;
|
|
154 delete[] g; g = NULL;
|
|
155 delete[] p; p = NULL;
|
|
156 delete[] tr; tr = NULL;
|
|
157 // delete[] tr_lin;
|
|
158 }
|
|
159
|
|
160 //////////////////////////////////////////////////////////////////////////////
|
|
161 // Deep-copy constructor
|
|
162 //////////////////////////////////////////////////////////////////////////////
|
|
163 HMM& HMM::operator=(HMM& q)
|
|
164 {
|
|
165 L=q.L;
|
|
166 for (int i=0; i<=L+1; ++i)
|
|
167 {
|
|
168 for (int a=0; a<NAA; ++a)
|
|
169 {
|
|
170 f[i][a]=q.f[i][a];
|
|
171 g[i][a]=q.g[i][a];
|
|
172 p[i][a]=q.p[i][a];
|
|
173 }
|
|
174 for (int a=0; a<NTRANS; ++a)
|
|
175 tr[i][a]=q.tr[i][a];
|
|
176 ss_dssp[i]=q.ss_dssp[i];
|
|
177 sa_dssp[i]=q.sa_dssp[i];
|
|
178 ss_pred[i]=q.ss_pred[i];
|
|
179 ss_conf[i]=q.ss_conf[i];
|
|
180 l[i]=q.l[i];
|
|
181 }
|
|
182 if (q.Xcons)
|
|
183 for (int i=0; i<=L+1; ++i)
|
|
184 Xcons[i] =q.Xcons[i];
|
|
185
|
|
186 n_display=q.n_display;
|
|
187 for (int k=0; k<n_display; k++) {
|
|
188 sname[k]=new(char[strlen(q.sname[k])+1]);
|
|
189 if (!sname[k]) MemoryError("array of names for sequences to display");
|
|
190 strcpy(sname[k],q.sname[k]);
|
|
191 }
|
|
192 for (int k=0; k<n_display; k++) {
|
|
193 seq[k]=new(char[strlen(q.seq[k])+1]);
|
|
194 if (!seq[k]) MemoryError("array of names for sequences to display");
|
|
195 strcpy(seq[k],q.seq[k]);
|
|
196 }
|
|
197 ncons=q.ncons;
|
|
198 nfirst=q.nfirst;
|
|
199 nss_dssp=q.nss_dssp;
|
|
200 nsa_dssp=q.nsa_dssp;
|
|
201 nss_pred=q.nss_pred;
|
|
202 nss_conf=q.nss_conf;
|
|
203
|
|
204 for (int i=0; i<=L+1; ++i) Neff_M[i]=q.Neff_M[i];
|
|
205 for (int i=0; i<=L+1; ++i) Neff_I[i]=q.Neff_I[i];
|
|
206 for (int i=0; i<=L+1; ++i) Neff_D[i]=q.Neff_D[i];
|
|
207 Neff_HMM=q.Neff_HMM;
|
|
208
|
|
209 strcpy(longname,q.longname);
|
|
210 strcpy(name,q.name);
|
|
211 strcpy(fam,q.fam);
|
|
212 strcpy(sfam,q.sfam);
|
|
213 strcpy(fold,q.fold);
|
|
214 strcpy(cl,q.cl);
|
|
215 strcpy(file,q.file);
|
|
216
|
|
217 lamda=q.lamda;
|
|
218 mu=q.mu;
|
|
219
|
|
220 for (int a=0; a<NAA; ++a) pav[a]=q.pav[a];
|
|
221 N_in=q.N_in;
|
|
222 N_filtered=q.N_filtered;
|
|
223 trans_lin=q.trans_lin;
|
|
224 return (HMM&) (*this);
|
|
225 }
|
|
226
|
|
227
|
|
228 ///////////////////////////////////////////////////////////////////////////////
|
|
229 /**
|
|
230 * @brief Read an HMM from an HHsearch .hhm file; return 0 at end of file
|
|
231 */
|
|
232 int
|
|
233 HMM::Read(FILE* dbf, char* path)
|
|
234 {
|
|
235 char line[LINELEN]=""; // input line
|
|
236 char str3[8]="",str4[8]=""; // first 3 and 4 letters of input line
|
|
237 char* ptr; // pointer for string manipulation
|
|
238 int i=0; // index for match state (first=1)
|
|
239 int a; // amino acid index
|
|
240 static int warn=0;
|
|
241
|
|
242 trans_lin=0;
|
|
243 L=0;
|
|
244 Neff_HMM=0;
|
|
245 n_display=N_in=N_filtered=0;
|
|
246 nss_dssp=nsa_dssp=nss_pred=nss_conf=nfirst=ncons=-1;
|
|
247 lamda=mu=0.0;
|
|
248 trans_lin=0; // transition probs in log space
|
|
249 name[0]=longname[0]=fam[0]='\0';
|
|
250 //If at the end of while-loop L is still 0 then we have reached end of db file
|
|
251
|
|
252 //Do not delete name and seq vectors because their adresses are transferred to hitlist as part of a hit!!
|
|
253
|
|
254 while (fgetline(line,LINELEN-1,dbf) && !(line[0]=='/' && line[1]=='/'))
|
|
255 {
|
|
256
|
|
257 if (strscn(line)==NULL) continue; // skip lines that contain only white space
|
|
258 substr(str3,line,0,2); // copy the first three characters into str3
|
|
259 substr(str4,line,0,3); // copy the first four characters into str4
|
|
260
|
|
261 if (!strncmp("HH",line,2)) continue;
|
|
262
|
|
263 if (!strcmp("NAME",str4))
|
|
264 {
|
|
265 ptr=strscn(line+4); //advance to first non-white-space character
|
|
266 if (ptr)
|
|
267 {
|
|
268 strncpy(longname,ptr,DESCLEN-1); //copy full name to longname
|
|
269 longname[DESCLEN-1]='\0';
|
|
270 strncpy(name,ptr,NAMELEN-1); //copy longname to name...
|
|
271 strcut(name); //...cut after first word...
|
|
272 }
|
|
273 else
|
|
274 {
|
|
275 strcpy(longname,"undefined");
|
|
276 strcpy(name,"undefined");
|
|
277 }
|
|
278 if (v>=4) cout<<"Reading in HMM "<<name<<":\n";
|
|
279 }
|
|
280
|
|
281 else if (!strcmp("FAM",str3))
|
|
282 {
|
|
283 ptr=strscn(line+3); //advance to first non-white-space character
|
|
284 if (ptr) strncpy(fam,ptr,IDLEN-1); else strcpy(fam,""); //copy family name to basename
|
|
285 ScopID(cl,fold,sfam,fam); //get scop classification from basename (e.g. a.1.2.3.4)
|
|
286 }
|
|
287
|
|
288 else if (!strcmp("FILE",str4))
|
|
289 {
|
|
290 if (path) strncpy(file,path,NAMELEN-1); else *file='\0'; // copy path to file variable
|
|
291 ptr=strscn(line+4); //advance to first non-white-space character
|
|
292 if (ptr)
|
|
293 strncat(file,ptr,NAMELEN-1-strlen(file)); // append file name read from file to path
|
|
294 else strcat(file,"*");
|
|
295 }
|
|
296
|
|
297 else if (!strcmp("LENG",str4))
|
|
298 {
|
|
299 ptr=line+4;
|
|
300 L=strint(ptr); //read next integer (number of match states)
|
|
301 }
|
|
302 else if (!strcmp("FILT",str4) || !strcmp("NSEQ",str4))
|
|
303 {
|
|
304 ptr=line+4;
|
|
305 N_filtered=strint(ptr); //read next integer: number of sequences after filtering
|
|
306 N_in=strint(ptr); //read next integer: number of sequences in alignment
|
|
307 }
|
|
308
|
|
309 else if (!strcmp("NEFF",str4) || !strcmp("NAA",str3)) sscanf(line+6,"%f",&Neff_HMM);
|
|
310
|
|
311 else if (!strcmp("EVD",str3))
|
|
312 {
|
|
313 // char key[IDLEN];
|
|
314 sscanf(line+6,"%f %f",&lamda,&mu);
|
|
315 // sscanf(line+22,"%s",key);
|
|
316 // lamda_hash.Add(key,lamda);
|
|
317 // mu_hash.Add(key,mu);
|
|
318 }
|
|
319
|
|
320 else if (!strcmp("DESC",str4)) continue;
|
|
321 else if (!strcmp("COM",str3)) continue;
|
|
322 else if (!strcmp("DATE",str4)) continue;
|
|
323
|
|
324 /////////////////////////////////////////////////////////////////////////////////////
|
|
325 // Read template sequences that should get displayed in output alignments
|
|
326 else if (!strcmp("SEQ",str3))
|
|
327 {
|
|
328 //char cur_seq[MAXCOL]=""; //Sequence currently read in
|
|
329 char *cur_seq = new(char[par.maxColCnt]); //Sequence currently read in
|
|
330 int k; // sequence index; start with -1; after reading name of n'th sequence-> k=n
|
|
331 int h; // index for character in input line
|
|
332 int l=1; // index of character in sequence seq[k]
|
|
333 int i=1; // index of match states in ss_dssp[i] and ss_pred[i] sequence
|
|
334 int n_seq=0; // number of sequences to be displayed EXCLUDING ss sequences
|
|
335 cur_seq[0]='-'; // overwrite '\0' character at beginning to be able to do strcpy(*,cur_seq)
|
|
336 k=-1;
|
|
337 while (fgetline(line,LINELEN-1,dbf) && line[0]!='#')
|
|
338 {
|
|
339 if (v>=4) cout<<"Read from file:"<<line<<"\n"; //DEBUG
|
|
340 if (line[0]=='>') //line contains sequence name
|
|
341 {
|
|
342 if (k>=MAXSEQDIS-1) //maximum number of allowable sequences exceeded
|
|
343 {while (fgetline(line,LINELEN-1,dbf) && line[0]!='#'); break;}
|
|
344 k++;
|
|
345 if (!strncmp(line,">ss_dssp",8)) nss_dssp=k;
|
|
346 else if (!strncmp(line,">sa_dssp",8)) nsa_dssp=k;
|
|
347 else if (!strncmp(line,">ss_pred",8)) nss_pred=k;
|
|
348 else if (!strncmp(line,">ss_conf",8)) nss_conf=k;
|
|
349 else if (!strncmp(line,">Cons-",6) || !strncmp(line,">Consensus",10)) ncons=k;
|
|
350 else
|
|
351 {
|
|
352 if (nfirst==-1) nfirst=k;
|
|
353 if (n_seq>=par.nseqdis)
|
|
354 {while (fgetline(line,LINELEN-1,dbf) && line[0]!='#'); k--; break;}
|
|
355 n_seq++;
|
|
356 }
|
|
357
|
|
358 //If this is not the first sequence then store residues of previous sequence
|
|
359 if (k>0) {
|
|
360 seq[k-1]=new(char[strlen(cur_seq)+1]);
|
|
361 if (!seq[k-1]) MemoryError("array of sequences to display");
|
|
362 strcpy(seq[k-1],cur_seq);
|
|
363 }
|
|
364
|
|
365 // store sequence name
|
|
366 strcut(line+1); //find next white-space character and overwrite it with end-of-string character
|
|
367 sname[k] = new (char[strlen(line+1)+1]); //+1 for terminating '\0'
|
|
368 if (!sname[k]) MemoryError("array of names for sequences to display");
|
|
369 strcpy(sname[k],line+1); //store sequence name in **name
|
|
370 l=1; i=1;
|
|
371 }
|
|
372 else //line contains sequence residues
|
|
373 {
|
|
374 if (k==-1)
|
|
375 {
|
|
376 cerr<<endl<<"WARNING: Ignoring following line while reading HMM"<<name<<":\n\'"<<line<<"\'\n";
|
|
377 continue;
|
|
378 }
|
|
379
|
|
380 h=0; //counts characters in current line
|
|
381
|
|
382 // Check whether all characters are correct; store into cur_seq
|
|
383 if (k==nss_dssp) // lines with dssp secondary structure states (. - H E C S T G B)
|
|
384 {
|
|
385 while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1)
|
|
386 {
|
|
387 if (ss2i(line[h])>=0 && line[h]!='.')
|
|
388 {
|
|
389 char c=ss2ss(line[h]);
|
|
390 cur_seq[l]=c;
|
|
391 if (c!='.' && !(c>='a' && c<='z')) ss_dssp[i++]=ss2i(c);
|
|
392 l++;
|
|
393 }
|
|
394 else if (v && ss2i(line[h])==-2)
|
|
395 cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<h<<" in line '"<<line<<"' of HMM "<<name<<"\n";
|
|
396 h++;
|
|
397 }
|
|
398 }
|
|
399 if (k==nsa_dssp) // lines with dssp secondary solvent accessibility (- A B C D E)
|
|
400 {
|
|
401 while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1)
|
|
402 {
|
|
403 if (sa2i(line[h])>=0)
|
|
404 {
|
|
405 char c=line[h];
|
|
406 cur_seq[l]=c;
|
|
407 if (c!='.' && !(c>='a' && c<='z')) sa_dssp[i++]=sa2i(c);
|
|
408 l++;
|
|
409 }
|
|
410 else if (v && sa2i(line[h])==-2)
|
|
411 cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<h<<" in line '"<<line<<"' of HMM "<<name<<"\n";
|
|
412 h++;
|
|
413 }
|
|
414 }
|
|
415 else if (k==nss_pred) // lines with predicted secondary structure (. - H E C)
|
|
416 {
|
|
417 while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1)
|
|
418 {
|
|
419 if (ss2i(line[h])>=0 && ss2i(line[h])<=3 && line[h]!='.')
|
|
420 {
|
|
421 char c=ss2ss(line[h]);
|
|
422 cur_seq[l]=c;
|
|
423 if (c!='.' && !(c>='a' && c<='z')) ss_pred[i++]=ss2i(c);
|
|
424 l++;
|
|
425 }
|
|
426 else if (v && ss2i(line[h])==-2)
|
|
427 cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<h<<" in line '"<<line<<"' of HMM "<<name<<"\n";
|
|
428 h++;
|
|
429 }
|
|
430 }
|
|
431 else if (k==nss_conf) // lines with confidence values should contain only 0-9, '-', or '.'
|
|
432 {
|
|
433 while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1)
|
|
434 {
|
|
435 if (line[h]=='-' || (line[h]>='0' && line[h]<='9'))
|
|
436 {
|
|
437 cur_seq[l]=line[h];
|
|
438 ss_conf[l]=cf2i(line[h]);
|
|
439 l++;
|
|
440 }
|
|
441 else if (v && cf2i(line[h])==-2)
|
|
442 cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<h<<" in line '"<<line<<"' of HMM "<<name<<"\n";
|
|
443 h++;
|
|
444 }
|
|
445 }
|
|
446 else // normal line containing residues
|
|
447 {
|
|
448 while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1)
|
|
449 {
|
|
450 if (aa2i(line[h])>=0 && line[h]!='.') // ignore '.' and white-space characters ' ', \t and \n (aa2i()==-1)
|
|
451 {cur_seq[l]=line[h]; l++;}
|
|
452 else if (aa2i(line[h])==-2 && v)
|
|
453 cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<h<<" in line '"<<line<<"' of HMM "<<name<<"\n";
|
|
454 h++;
|
|
455 }
|
|
456 }
|
|
457 cur_seq[l]='\0'; //Ensure that cur_seq ends with a '\0' character
|
|
458
|
|
459 } //end else
|
|
460 } //while(getline)
|
|
461 //If this is not the first sequence some residues have already been read in
|
|
462 if (k>=0) {
|
|
463 seq[k]=new(char[strlen(cur_seq)+1]);
|
|
464 if (!seq[k]) MemoryError("array of sequences to display");
|
|
465 strcpy(seq[k],cur_seq);
|
|
466 }
|
|
467 n_display=k+1;
|
|
468
|
|
469 // DEBUG
|
|
470 if (v>=4)
|
|
471 {
|
|
472 printf("nss_dssp=%i nsa_dssp=%i nss_pred=%i nss_conf=%i nfirst=%i\n",nss_dssp,nsa_dssp,nss_pred,nss_conf,nfirst);
|
|
473 for (k=0; k<n_display; k++)
|
|
474 {
|
|
475 int j;
|
|
476 cout<<">"<<sname[k]<<"(k="<<k<<")\n";
|
|
477 if (k==nss_dssp) {for (j=1; j<=L; j++) cout<<char(i2ss(ss_dssp[j]));}
|
|
478 else if (k==nsa_dssp) {for (j=1; j<=L; j++) cout<<char(i2sa(sa_dssp[j]));}
|
|
479 else if (k==nss_pred) {for (j=1; j<=L; j++) cout<<char(i2ss(ss_pred[j]));}
|
|
480 else if (k==nss_conf) {for (j=1; j<=L; j++) cout<<int(ss_conf[j]-1);}
|
|
481 else {for (j=1; j<=L; j++) cout<<seq[k][j];}
|
|
482 cout<<"\n";
|
|
483 }
|
|
484 }
|
|
485
|
|
486 } //end if("SEQ")
|
|
487
|
|
488 /////////////////////////////////////////////////////////////////////////////////////
|
|
489 // Read average amino acid frequencies for HMM
|
|
490 else if (!strcmp("FREQ",str4))
|
|
491 {
|
|
492 fprintf(stderr,"Error: hhm file has obsolete format.\n");
|
|
493 fprintf(stderr,"Please use hhmake version > 1.1 to generate hhm files.\n");
|
|
494 exit(1);
|
|
495 }
|
|
496
|
|
497 else if (!strcmp("AVER",str4)) {} // AVER line scrapped
|
|
498 else if (!strcmp("NULL",str4))
|
|
499 {
|
|
500 ptr=line+4;
|
|
501 for (a=0; a<20 && ptr; ++a)
|
|
502 //s2[a]: transform amino acids Sorted by alphabet -> internal numbers for amino acids
|
|
503 pb[s2a[a]] = (float) fpow2(float(-strinta(ptr))/HMMSCALE);
|
|
504 if (!ptr) return Warning(dbf,line,name);
|
|
505 if (v>=4)
|
|
506 {
|
|
507 printf("\nNULL ");
|
|
508 for (a=0; a<20; ++a) printf("%5.1f ",100.*pb[s2a[a]]);
|
|
509 printf("\n");
|
|
510 }
|
|
511 }
|
|
512
|
|
513 /////////////////////////////////////////////////////////////////////////////////////
|
|
514 // Read transition probabilities from start state
|
|
515 else if (!strcmp("HMM",str3))
|
|
516 {
|
|
517 fgetline(line,LINELEN-1,dbf); // Skip line with amino acid labels
|
|
518 fgetline(line,LINELEN-1,dbf); // Skip line with transition labels
|
|
519 ptr=line;
|
|
520 for (a=0; a<=D2D && ptr; ++a)
|
|
521 tr[0][a] = float(-strinta(ptr))/HMMSCALE; //store transition probabilites as log2 values
|
|
522 // strinta returns next integer in string and puts ptr to first char
|
|
523 // after the integer. Returns -99999 if '*' is found.
|
|
524 // ptr is set to 0 if no integer is found after ptr.
|
|
525 Neff_M[0] = float(strinta(ptr))/HMMSCALE; // Read eff. number of sequences with M->? transition
|
|
526 Neff_I[0] = float(strinta(ptr))/HMMSCALE; // Read eff. number of sequences with I->? transition
|
|
527 Neff_D[0] = float(strinta(ptr))/HMMSCALE; // Read eff. number of sequences with D->? transition
|
|
528 if (!ptr) return Warning(dbf,line,name);
|
|
529
|
|
530 /////////////////////////////////////////////////////////////////////////////////////
|
|
531 // Read columns of HMM
|
|
532 int next_i=0; // index of next column
|
|
533 while (fgetline(line,LINELEN-2,dbf) && !(line[0]=='/' && line[1]=='/') && line[0]!='#')
|
|
534 {
|
|
535 if (strscn(line)==NULL) continue; // skip lines that contain only white space
|
|
536
|
|
537 // Read in AA probabilities
|
|
538 ptr=line+1;
|
|
539 int prev_i = next_i;
|
|
540 next_i = strint(ptr); ++i;
|
|
541 if (v && next_i!=prev_i+1)
|
|
542 if (++warn<=5)
|
|
543 {
|
|
544 cerr<<endl<<"WARNING: in HMM "<<name<<" state "<<prev_i<<" is followed by state "<<next_i<<"\n";
|
|
545 if (warn==5) cerr<<endl<<"WARNING: further warnings while reading HMMs will be suppressed.\n";
|
|
546 }
|
|
547 if (i>L)
|
|
548 {
|
|
549 cerr<<endl<<"WARNING: in HMM "<<name<<" there are more columns than the stated length "<<L<<". Skipping HMM\n";
|
|
550 return 2;
|
|
551 }
|
|
552 if (i>=/*MAXRES*/par.maxResLen-2)
|
|
553 {
|
|
554 fgetline(line,LINELEN-1,dbf); // Skip line
|
|
555 continue;
|
|
556 }
|
|
557
|
|
558 for (a=0; a<20 && ptr; ++a)
|
|
559 // f[i][s2a[a]] = (float)pow(2.,float(-strinta(ptr))/HMMSCALE);
|
|
560 f[i][s2a[a]] = fpow2(float(-strinta(ptr))/HMMSCALE); // speed-up ~5 s for 10000 SCOP domains
|
|
561
|
|
562 //s2a[a]: transform amino acids Sorted by alphabet -> internal numbers for amino acids
|
|
563 l[i]=strint(ptr);
|
|
564 if (!ptr) return Warning(dbf,line,name);
|
|
565 if (v>=4)
|
|
566 {
|
|
567 printf("%s",line);
|
|
568 printf("%6i ",i);
|
|
569 for (a=0; a<20; ++a) printf("%5.1f ",100*f[i][s2a[a]]);
|
|
570 printf("%5i",l[i]);
|
|
571 printf("\n");
|
|
572 }
|
|
573
|
|
574 // Read transition probabilities
|
|
575 fgetline(line,LINELEN-1,dbf); // Skip line with amino acid labels
|
|
576 if (line[0]!=' ' && line[0]!='\t') return Warning(dbf,line,name);
|
|
577 ptr=line;
|
|
578 for (a=0; a<=D2D && ptr; ++a)
|
|
579 tr[i][a] = float(-strinta(ptr))/HMMSCALE; //store transition prob's as log2-values
|
|
580 Neff_M[i] = float(strinta(ptr))/HMMSCALE; // Read eff. number of sequences with M->? transition
|
|
581 Neff_I[i] = float(strinta(ptr))/HMMSCALE; // Read eff. number of sequences with I->? transition
|
|
582 Neff_D[i] = float(strinta(ptr))/HMMSCALE; // Read eff. number of sequences with D->? transition
|
|
583 if (!ptr) return Warning(dbf,line,name);
|
|
584 if (v>=4)
|
|
585 {
|
|
586 printf(" ");
|
|
587 for (a=0; a<=D2D; ++a) printf("%5.1f ",100*fpow2(tr[i][a]));
|
|
588 printf("%5.1f %5.1f %5.1f \n",Neff_M[i],Neff_I[i],Neff_D[i]);
|
|
589 }
|
|
590 }
|
|
591 if (line[0]=='/' && line[1]=='/') break;
|
|
592 }
|
|
593 else if (v) cerr<<endl<<"WARNING: Ignoring line\n\'"<<line<<"\'\nin HMM "<<name<<"\n";
|
|
594
|
|
595 } //while(getline)
|
|
596
|
|
597 if (L==0) return 0; //End of db file -> stop reading in
|
|
598
|
|
599 // Set coefficients of EVD (= 0.0 if not calibrated for these parameters)
|
|
600 // lamda = lamda_hash.Show(par.Key());
|
|
601 // mu = mu_hash.Show(par.Key());
|
|
602 if (lamda && v>=3) printf("HMM %s is already calibrated: lamda=%-5.3f, mu=%-5.2f\n",name,lamda,mu);
|
|
603
|
|
604 if (v && i!=L) cerr<<endl<<"Warning: in HMM "<<name<<" there are only "<<i<<" columns while the stated length is "<<L<<"\n";
|
|
605 if (v && i>/*MAXRES*/par.maxResLen-2) {i=/*MAXRES*/par.maxResLen-2; cerr<<endl<<"WARNING: maximum number "<</*MAXRES*/par.maxResLen-2<<" of residues exceeded while reading HMM "<<name<<"\n";}
|
|
606 if (v && !i) cerr<<endl<<"WARNING: HMM "<<name<<" contains no match states. Check the alignment that gave rise to this HMM.\n";
|
|
607 if (v>=2) cout<<"Read in HMM "<<name<<" with "<<L<<" match states and effective number of sequences = "<<Neff_HMM<<"\n";
|
|
608 L = i;
|
|
609
|
|
610 // Set emission probabilities of zero'th (begin) state and L+1st (end) state to background probabilities
|
|
611 for (a=0; a<20; ++a) f[0][a]=f[L+1][a]=pb[a];
|
|
612 Neff_M[L+1]=1.0f;
|
|
613 Neff_I[L+1]=Neff_D[L+1]=0.0f;
|
|
614
|
|
615 return 1; //return status: ok
|
|
616
|
|
617 } /* this is the end of HMM::Read() */
|
|
618
|
|
619
|
|
620 /////////////////////////////////////////////////////////////////////////////////////
|
|
621 /**
|
|
622 * @brief Read an HMM from a HMMer .hmm file; return 0 at end of file
|
|
623 */
|
|
624 int
|
|
625 HMM::ReadHMMer(FILE* dbf, char* filestr)
|
|
626 {
|
|
627 char line[LINELEN]=""; // input line
|
|
628 char desc[DESCLEN]=""; // description of family
|
|
629 char str4[5]=""; // first 4 letters of input line
|
|
630 char* ptr; // pointer for string manipulation
|
|
631 int i=0; // index for match state (first=1)
|
|
632 int a; // amino acid index
|
|
633 char dssp=0; // 1 if a consensus SS has been found in the transition prob lines
|
|
634 char annot=0; // 1 if at least one annotation character in insert lines is ne '-' or ' '
|
|
635 int k=0; // index for seq[k]
|
|
636 static char ignore_hmmer_cal = 0;
|
|
637 char* annotchr; // consensus amino acids in ASCII format, or, in HMMER format, the reference annotation character in insert line
|
|
638 //annotchr = new char[MAXRES]; // consensus amino acids in ASCII format, or, in HMMER format, the reference annotation character in insert line
|
|
639 annotchr = new char[par.maxResLen]; // consensus amino acids in ASCII format, or, in HMMER format, the reference annotation character in insert line
|
|
640 static int warn=0;
|
|
641 int iAlpha = 20; /* size of alphabet, default is protein = 20 */
|
|
642 double dAlphaInv = 1.00 / (double)(iAlpha); /* weight of AA */
|
|
643
|
|
644 trans_lin=0;
|
|
645 L=0;
|
|
646 Neff_HMM=0;
|
|
647 n_display=N_in=N_filtered=0;
|
|
648 nss_dssp=nsa_dssp=nss_pred=nss_conf=nfirst=ncons=-1;
|
|
649 lamda=mu=0.0;
|
|
650 trans_lin=0; // transition probs in log space
|
|
651 name[0]=longname[0]=desc[0]=fam[0]='\0';
|
|
652 //If at the end of while-loop L is still 0 then we have reached end of db file
|
|
653
|
|
654 // Do not delete name and seq vectors because their adresses are transferred to hitlist as part of a hit!!
|
|
655
|
|
656 while (fgetline(line,LINELEN-1,dbf) && !(line[0]=='/' && line[1]=='/'))
|
|
657 {
|
|
658
|
|
659 if (strscn(line)==NULL) continue; // skip lines that contain only white space
|
|
660 if (!strncmp("HMMER",line,5)) continue;
|
|
661
|
|
662 substr(str4,line,0,3); // copy the first four characters into str4
|
|
663
|
|
664 if (!strcmp("NAME",str4) && name[0]=='\0')
|
|
665 {
|
|
666 ptr=strscn(line+4); // advance to first non-white-space character
|
|
667 strncpy(name,ptr,NAMELEN-1); // copy full name to name
|
|
668 strcut(name); // ...cut after first word...
|
|
669 if (v>=4) cout<<"Reading in HMM "<<name<<":\n";
|
|
670 }
|
|
671
|
|
672 else if (!strcmp("ACC ",str4))
|
|
673 {
|
|
674 ptr=strscn(line+4); // advance to first non-white-space character
|
|
675 strncpy(longname,ptr,DESCLEN-1); // copy Accession id to longname...
|
|
676 }
|
|
677
|
|
678 else if (!strcmp("DESC",str4))
|
|
679 {
|
|
680 ptr=strscn(line+4); // advance to first non-white-space character
|
|
681 if (ptr)
|
|
682 {
|
|
683 strncpy(desc,ptr,DESCLEN-1); // copy description to name...
|
|
684 desc[DESCLEN-1]='\0';
|
|
685 strcut(ptr); // ...cut after first word...
|
|
686 }
|
|
687 if (!ptr || ptr[1]!='.' || strchr(ptr+3,'.')==NULL) strcpy(fam,""); else strcpy(fam,ptr); // could not find two '.' in name?
|
|
688 }
|
|
689
|
|
690 else if (!strcmp("LENG",str4))
|
|
691 {
|
|
692 ptr=line+4;
|
|
693 L=strint(ptr); //read next integer (number of match states)
|
|
694 }
|
|
695
|
|
696 else if (!strcmp("ALPH",str4)) {
|
|
697
|
|
698 ptr=strscn(line+4);
|
|
699
|
|
700 if (0 == strcmp(ptr, "Amino")){
|
|
701 iAlpha = 20;
|
|
702 }
|
|
703 else if (0 == strcmp(ptr, "Nucleic")){
|
|
704 iAlpha = 4;
|
|
705 printf("%s:%s:%d: WARNING: HMM reading does not work for DNA/RNA\n",
|
|
706 __FUNCTION__, __FILE__, __LINE__);
|
|
707 }
|
|
708 else {
|
|
709 return Warning(dbf,line,name);
|
|
710 }
|
|
711 dAlphaInv = 1.00 / (double)(iAlpha);
|
|
712 //continue;
|
|
713 }
|
|
714 else if (!strcmp("RF ",str4)) continue;
|
|
715 else if (!strcmp("CS ",str4)) continue;
|
|
716 else if (!strcmp("MAP ",str4)) continue;
|
|
717 else if (!strcmp("COM ",str4)) continue;
|
|
718 else if (!strcmp("NSEQ",str4))
|
|
719 {
|
|
720 ptr=line+4;
|
|
721 N_in=N_filtered=strint(ptr); //read next integer: number of sequences after filtering
|
|
722 }
|
|
723
|
|
724 else if (!strcmp("DATE",str4)) continue;
|
|
725 else if (!strncmp("CKSUM ",line,5)) continue;
|
|
726 else if (!strcmp("GA ",str4)) continue;
|
|
727 else if (!strcmp("TC ",str4)) continue;
|
|
728 else if (!strcmp("NC ",str4)) continue;
|
|
729
|
|
730 else if (!strncmp("SADSS",line,5))
|
|
731 {
|
|
732 if (nsa_dssp<0)
|
|
733 {
|
|
734 nsa_dssp=k++;
|
|
735 seq[nsa_dssp] = new(char[/*MAXRES*/par.maxResLen+2]);
|
|
736 sname[nsa_dssp] = new(char[15]);
|
|
737 strcpy(seq[nsa_dssp]," ");
|
|
738 strcpy(sname[nsa_dssp],"sa_dssp");
|
|
739
|
|
740 }
|
|
741 ptr=strscn(line+5);
|
|
742 if (ptr)
|
|
743 {
|
|
744 strcut(ptr);
|
|
745 if (strlen(seq[nsa_dssp])+strlen(ptr)>=(unsigned)(/*MAXRES*/par.maxResLen))
|
|
746 printf("\nWARNING: HMM %s has SADSS records with more than %i residues.\n",name,/*MAXRES*/par.maxResLen);
|
|
747 else strcat(seq[nsa_dssp],ptr);
|
|
748 }
|
|
749 }
|
|
750
|
|
751 else if (!strncmp("SSPRD",line,5))
|
|
752 {
|
|
753 if (nss_pred<0)
|
|
754 {
|
|
755 nss_pred=k++;
|
|
756 seq[nss_pred] = new(char[/*MAXRES*/par.maxResLen+2]);
|
|
757 sname[nss_pred] = new(char[15]);
|
|
758 strcpy(seq[nss_pred]," ");
|
|
759 strcpy(sname[nss_pred],"ss_pred");
|
|
760
|
|
761 }
|
|
762 ptr=strscn(line+5);
|
|
763 if (ptr)
|
|
764 {
|
|
765 strcut(ptr);
|
|
766 if (strlen(seq[nss_pred])+strlen(ptr)>=(unsigned)(/*MAXRES*/par.maxResLen))
|
|
767 printf("\nWARNING: HMM %s has SSPRD records with more than %i residues.\n",name,/*MAXRES*/par.maxResLen);
|
|
768 else strcat(seq[nss_pred],ptr);
|
|
769 }
|
|
770 }
|
|
771
|
|
772 else if (!strncmp("SSCON",line,5))
|
|
773 {
|
|
774 if (nss_conf<0)
|
|
775 {
|
|
776 nss_conf=k++;
|
|
777 seq[nss_conf] = new(char[/*MAXRES*/par.maxResLen+2]);
|
|
778 sname[nss_conf] = new(char[15]);
|
|
779 strcpy(seq[nss_conf]," ");
|
|
780 strcpy(sname[nss_conf],"ss_conf");
|
|
781 }
|
|
782 ptr=strscn(line+5);
|
|
783 if (ptr)
|
|
784 {
|
|
785 strcut(ptr);
|
|
786 if (strlen(seq[nss_conf])+strlen(ptr)>=(unsigned)(/*MAXRES*/par.maxResLen))
|
|
787 printf("\nWARNING: HMM %s has SSPRD records with more than %i residues.\n",name,/*MAXRES*/par.maxResLen);
|
|
788 else strcat(seq[nss_conf],ptr);
|
|
789 }
|
|
790 }
|
|
791
|
|
792 else if (!strncmp("SSCIT",line,5)) continue;
|
|
793 else if (!strcmp("XT ",str4)) continue;
|
|
794 else if (!strcmp("NULT",str4)) continue;
|
|
795
|
|
796 else if (!strcmp("NULE",str4))
|
|
797 {
|
|
798 ptr=line+4;
|
|
799 for (a=0; (a < iAlpha) && ptr; ++a){
|
|
800 /* FIXME: FS introduced alphabet size (was '20')
|
|
801 and dAlphaInv (was '0.05' = 1/20) */
|
|
802 //s2a[a]: transform amino acids Sorted by alphabet -> internal numbers for amino acids
|
|
803 pb[s2a[a]] = (float) dAlphaInv * fpow2(float(strinta(ptr,-99999))/HMMSCALE); /* dAlphaInv */
|
|
804 }
|
|
805 for (a = iAlpha; a < 20; a++){
|
|
806 pb[s2a[a]] = 0.0;
|
|
807 }
|
|
808 if (!ptr) return Warning(dbf,line,name);
|
|
809 if (v>=4)
|
|
810 {
|
|
811 printf("\nNULL ");
|
|
812 for (a=0; a<iAlpha; ++a) { /* FIXME: FS introduced iAlpha, was '20' */
|
|
813 printf("%5.1f ",100.*pb[s2a[a]]);
|
|
814 }
|
|
815 printf("\n");
|
|
816 }
|
|
817 }
|
|
818
|
|
819 else if (!strcmp("EVD ",str4))
|
|
820 {
|
|
821 char* ptr=line+4;
|
|
822 ptr = strscn(ptr);
|
|
823 sscanf(ptr,"%f",&lamda);
|
|
824 ptr = strscn(ptr);
|
|
825 sscanf(ptr,"%f",&mu);
|
|
826 if (lamda<0)
|
|
827 {
|
|
828 if (v>=2 && ignore_hmmer_cal==0)
|
|
829 cerr<<endl<<"Warning: some HMMs have been calibrated with HMMER's 'hmmcalibrate'. These calibrations will be ignored\n";
|
|
830 ignore_hmmer_cal=1;
|
|
831 mu = lamda = 0.0;
|
|
832 }
|
|
833 }
|
|
834
|
|
835 /////////////////////////////////////////////////////////////////////////////////////
|
|
836 // Read transition probabilities from start state
|
|
837 else if (!strncmp("HMM",line,3))
|
|
838 {
|
|
839 fgetline(line,LINELEN-1,dbf); // Skip line with amino acid labels
|
|
840 fgetline(line,LINELEN-1,dbf); // Skip line with transition labels
|
|
841 ptr=line;
|
|
842 for (a=0; a<=M2D && ptr; ++a)
|
|
843 tr[0][a] = float(strinta(ptr,-99999))/HMMSCALE; //store transition probabilites as log2 values
|
|
844 // strinta returns next integer in string and puts ptr to first char
|
|
845 // after the integer. Returns -99999 if '*' is found.
|
|
846 // ptr is set to 0 if no integer is found after ptr.
|
|
847 tr[0][I2M] = tr[0][D2M] = 0.0;
|
|
848 tr[0][I2I] = tr[0][D2D] = -99999.0;
|
|
849 if (!ptr) return Warning(dbf,line,name);
|
|
850 if (v>=4)
|
|
851 {
|
|
852 printf(" ");
|
|
853 for (a=0; a<=D2D && ptr; ++a) printf("%5.1f ",100*fpow2(tr[i][a]));
|
|
854 printf("\n");
|
|
855 }
|
|
856
|
|
857 // Prepare to store DSSP states (if there are none, delete afterwards)
|
|
858 nss_dssp=k++;
|
|
859 seq[nss_dssp] = new(char[/*MAXRES*/par.maxResLen+2]);
|
|
860 sname[nss_dssp] = new(char[15]);
|
|
861 strcpy(sname[nss_dssp],"ss_dssp");
|
|
862
|
|
863 /////////////////////////////////////////////////////////////////////////////////////
|
|
864 // Read columns of HMM
|
|
865 int next_i=0; // index of next column
|
|
866 while (fgetline(line,LINELEN-1,dbf) && !(line[0]=='/' && line[1]=='/') && line[0]!='#')
|
|
867 {
|
|
868 if (strscn(line)==NULL) continue; // skip lines that contain only white space
|
|
869
|
|
870 // Read in AA probabilities
|
|
871 ptr=line;
|
|
872 int prev_i = next_i;
|
|
873 next_i = strint(ptr); ++i;
|
|
874 if (v && next_i!=prev_i+1)
|
|
875 if (++warn<5)
|
|
876 {
|
|
877 cerr<<endl<<"WARNING: in HMM "<<name<<" state "<<prev_i<<" is followed by state "<<next_i<<"\n";
|
|
878 if (warn==5) cerr<<endl<<"WARNING: further warnings while reading HMMs will be suppressed.\n";
|
|
879 }
|
|
880 if (i>L)
|
|
881 {
|
|
882 cerr<<endl<<"Error: in HMM "<<name<<" there are more columns than the stated length "<<L<<"\n";
|
|
883 return 2;
|
|
884 }
|
|
885 if (i>L && v)
|
|
886 cerr<<endl<<"WARNING: in HMM "<<name<<" there are more columns than the stated length "<<L<<"\n";
|
|
887 if (i>=/*MAXRES*/par.maxResLen-2)
|
|
888 {
|
|
889 fgetline(line,LINELEN-1,dbf); // Skip two lines
|
|
890 fgetline(line,LINELEN-1,dbf);
|
|
891 continue;
|
|
892 }
|
|
893
|
|
894 for (a=0; (a<iAlpha) && ptr; ++a){ /* FIXME: FS introduced iAlpha, was '20' */
|
|
895 f[i][s2a[a]] = (float) pb[s2a[a]]*fpow2(float(strinta(ptr,-99999))/HMMSCALE);
|
|
896 //s2a[a]: transform amino acids Sorted by alphabet -> internal numbers for amino acids
|
|
897 }
|
|
898 for (a = iAlpha; a < 20; a++){
|
|
899 f[i][s2a[a]] = 0.0;
|
|
900 }
|
|
901 if (!ptr) return Warning(dbf,line,name);
|
|
902 if (v>=4)
|
|
903 {
|
|
904 printf("%6i ",i);
|
|
905 for (a=0; a<iAlpha; ++a) { /* FIXME: FS introduced iAlpha, was '20' */
|
|
906 printf("%5.1f ",100*f[i][s2a[a]]);
|
|
907 }
|
|
908 printf("\n");
|
|
909 }
|
|
910
|
|
911 // Read insert emission line
|
|
912 fgetline(line,LINELEN-1,dbf);
|
|
913 ptr = strscn(line);
|
|
914 if (!ptr) return Warning(dbf,line,name);
|
|
915 annotchr[i]=uprchr(*ptr);
|
|
916 if (*ptr!='-' && *ptr!=' ') annot=1;
|
|
917
|
|
918 // Read annotation character and seven transition probabilities
|
|
919 fgetline(line,LINELEN-1,dbf);
|
|
920 ptr = strscn(line);
|
|
921 switch (*ptr)
|
|
922 {
|
|
923 case 'H':
|
|
924 ss_dssp[i]=1;
|
|
925 seq[nss_dssp][i]=*ptr;
|
|
926 dssp=1;
|
|
927 break;
|
|
928 case 'E':
|
|
929 ss_dssp[i]=2;
|
|
930 seq[nss_dssp][i]=*ptr;
|
|
931 dssp=1;
|
|
932 break;
|
|
933 case 'C':
|
|
934 ss_dssp[i]=3;
|
|
935 seq[nss_dssp][i]=*ptr;
|
|
936 dssp=1;
|
|
937 break;
|
|
938 case 'S':
|
|
939 ss_dssp[i]=4;
|
|
940 seq[nss_dssp][i]=*ptr;
|
|
941 dssp=1;
|
|
942 break;
|
|
943 case 'T':
|
|
944 ss_dssp[i]=5;
|
|
945 seq[nss_dssp][i]=*ptr;
|
|
946 dssp=1;
|
|
947 break;
|
|
948 case 'G':
|
|
949 ss_dssp[i]=6;
|
|
950 seq[nss_dssp][i]=*ptr;
|
|
951 dssp=1;
|
|
952 break;
|
|
953 case 'B':
|
|
954 ss_dssp[i]=7;
|
|
955 seq[nss_dssp][i]=*ptr;
|
|
956 dssp=1;
|
|
957 break;
|
|
958 case 'I':
|
|
959 dssp=1;
|
|
960 case '~':
|
|
961 ss_dssp[i]=3;
|
|
962 seq[nss_dssp][i]=*ptr;
|
|
963 break;
|
|
964 case '-':
|
|
965 default:
|
|
966 ss_dssp[i]=0;
|
|
967 seq[nss_dssp][i]=*ptr;
|
|
968 break;
|
|
969
|
|
970 }
|
|
971
|
|
972 ptr+=2;
|
|
973 for (a=0; a<=D2D && ptr; ++a)
|
|
974 tr[i][a] = float(strinta(ptr,-99999))/HMMSCALE; //store transition prob's as log2-values
|
|
975 if (!ptr) return Warning(dbf,line,name);
|
|
976 if (v>=4)
|
|
977 {
|
|
978 printf(" ");
|
|
979 for (a=0; a<=D2D; ++a) printf("%5.1f ",100*fpow2(tr[i][a]));
|
|
980 printf("\n");
|
|
981 }
|
|
982 }
|
|
983
|
|
984 if (line[0]=='/' && line[1]=='/') break;
|
|
985
|
|
986 } /* strncmp("HMM") */
|
|
987
|
|
988 } //while(getline)
|
|
989
|
|
990 if (L==0) return 0; //End of db file -> stop reading in
|
|
991
|
|
992 // Set coefficients of EVD (= 0.0 if not calibrated for these parameters)
|
|
993 // lamda = lamda_hash.Show(par.Key());
|
|
994 // mu = mu_hash.Show(par.Key());
|
|
995 if (lamda && v>=2) printf("HMM %s is already calibrated: lamda=%-5.3f, mu=%-5.2f\n",name,lamda,mu);
|
|
996
|
|
997 if (v && i!=L) cerr<<endl<<"Warning: in HMM "<<name<<" there are only "<<i<<" columns while the stated length is "<<L<<"\n";
|
|
998 if (v && i>=/*MAXRES*/par.maxResLen-2) {i=/*MAXRES*/par.maxResLen-2; cerr<<endl<<"WARNING: maximum number "<</*MAXRES*/par.maxResLen-2<<" of residues exceeded while reading HMM "<<name<<"\n";}
|
|
999 if (v && !i) cerr<<endl<<"WARNING: HMM "<<name<<" contains no match states. Check the alignment that gave rise to this HMM.\n";
|
|
1000 L = i;
|
|
1001
|
|
1002 if (strlen(longname)>0) strcat(longname," ");
|
|
1003 strncat(longname,name,DESCLEN-strlen(longname)-1); // longname = ACC NAME DESC
|
|
1004 if (strlen(name)>0) strcat(longname," ");
|
|
1005 strncat(longname,desc,DESCLEN-strlen(longname)-1);
|
|
1006 longname[DESCLEN-1]='\0';
|
|
1007 ScopID(cl,fold,sfam,fam);// get scop classification from basename (e.g. a.1.2.3.4)
|
|
1008 RemoveExtension(file,filestr); // copy name of dbfile without extension into 'file'
|
|
1009
|
|
1010 // Secondary structure
|
|
1011 if (!dssp)
|
|
1012 {
|
|
1013 // remove dssp sequence
|
|
1014 // memory that had been allocated in case ss_dssp was given needs to be freed
|
|
1015 delete[] seq[nss_dssp]; seq[nss_dssp] = NULL;
|
|
1016 // memory that had been allocated in case ss_dssp was given needs to be freed
|
|
1017 delete[] sname[nss_dssp]; sname[nss_dssp] = NULL;
|
|
1018 nss_dssp=-1;
|
|
1019 k--;
|
|
1020 }
|
|
1021 if (nss_pred>=0)
|
|
1022 {
|
|
1023 for (i=1; i<=L; ++i) ss_pred[i] = ss2i(seq[nss_pred][i]);
|
|
1024 if (nss_conf>=0)
|
|
1025 for (i=1; i<=L; ++i) ss_conf[i] = cf2i(seq[nss_conf][i]);
|
|
1026 else
|
|
1027 for (i=1; i<=L; ++i) ss_conf[i] = 5;
|
|
1028 }
|
|
1029
|
|
1030 // Copy query (first sequence) and consensus residues?
|
|
1031 if (par.showcons)
|
|
1032 {
|
|
1033 sname[k]=new(char[10]);
|
|
1034 strcpy(sname[k],"Consensus");
|
|
1035 sname[k+1]=new(char[strlen(longname)+1]);
|
|
1036 strcpy(sname[k+1],longname);
|
|
1037 seq[k]=new(char[L+2]);
|
|
1038 seq[k][0]=' ';
|
|
1039 seq[k][L+1]='\0';
|
|
1040 seq[k+1]=new(char[L+2]);
|
|
1041 seq[k+1][0]=' ';
|
|
1042 seq[k+1][L+1]='\0';
|
|
1043 for (i=1; i<=L; ++i)
|
|
1044 {
|
|
1045 float pmax=0.0;
|
|
1046 int amax=0;
|
|
1047 for (a=0; a<NAA; ++a)
|
|
1048 if (f[i][a]>pmax) {amax=a; pmax=f[i][a];}
|
|
1049 if (pmax>0.6) seq[k][i]=i2aa(amax);
|
|
1050 else if (pmax>0.4) seq[k][i]=lwrchr(i2aa(amax));
|
|
1051 else seq[k][i]='x';
|
|
1052 seq[k+1][i]=i2aa(amax);
|
|
1053 }
|
|
1054 ncons=k++; // nfirst is set later!
|
|
1055 }
|
|
1056 else
|
|
1057 {
|
|
1058 sname[k]=new(char[strlen(longname)+1]);
|
|
1059 /* FIXME valgrind says bytes get lost here during hmm iteration --
|
|
1060 fixed in HMM::ClobberGlobal(), I (FS) think */
|
|
1061 strcpy(sname[k],longname);
|
|
1062 seq[k]=new(char[L+2]);
|
|
1063 seq[k][0]=' ';
|
|
1064 seq[k][L+1]='\0';
|
|
1065 }
|
|
1066
|
|
1067 if (annot) // read in some annotation characters?
|
|
1068 {
|
|
1069 annotchr[0]=' ';
|
|
1070 annotchr[L+1]='\0';
|
|
1071 strcpy(seq[k],annotchr); // overwrite the consensus sequence with the annotation characters
|
|
1072 }
|
|
1073 else if (!par.showcons) // we have not yet calculated the consensus, but we need it now as query (first sequence)
|
|
1074 {
|
|
1075 /* FIXME: FS set ncons=k
|
|
1076 don't understand why it is not set but seem to need it */
|
|
1077 ncons = k;
|
|
1078 for (i=1; i<=L; ++i)
|
|
1079 {
|
|
1080 float pmax=0.0;
|
|
1081 int amax=0;
|
|
1082 for (a=0; a<NAA; ++a)
|
|
1083 if (f[i][a]>pmax) {amax=a; pmax=f[i][a];}
|
|
1084 seq[k][i]=i2aa(amax);
|
|
1085 }
|
|
1086 }
|
|
1087 // printf("%i query name=%s seq=%s\n",n,sname[n],seq[n]);
|
|
1088 nfirst=k++;
|
|
1089
|
|
1090 n_display=k;
|
|
1091
|
|
1092 // Calculate overall Neff_HMM
|
|
1093 Neff_HMM=0;
|
|
1094 for (i=1; i<=L; ++i)
|
|
1095 {
|
|
1096 float S=0.0;
|
|
1097 for (a=0; a<iAlpha; ++a) { /* FIXME: FS introduced iAlpha, was '20' */
|
|
1098 if (f[i][a]>1E-10) S-=f[i][a]*fast_log2(f[i][a]);
|
|
1099 }
|
|
1100 Neff_HMM+=(float) fpow2(S);
|
|
1101 }
|
|
1102 Neff_HMM/=L;
|
|
1103 for (i=0; i<=L; ++i) Neff_M[i] = Neff_I[i] = Neff_D[i] = 10.0; // to add only little additional pseudocounts!
|
|
1104 if (v>=2)
|
|
1105 cout<<"Read in HMM "<<name<<" with "<<L<<" match states and effective number of sequences = "<<Neff_HMM<<"\n";
|
|
1106
|
|
1107 // Set emission probabilities of zero'th (begin) state and L+1st (end) state to background probabilities
|
|
1108 for (a=0; a<iAlpha; ++a) { /* FIXME: FS introduced iAlpha, was '20' */
|
|
1109 f[0][a]=f[L+1][a]=pb[a];
|
|
1110 }
|
|
1111 delete[] annotchr; annotchr = NULL;
|
|
1112
|
|
1113 return 1; //return status: ok
|
|
1114
|
|
1115 } /* this is the end of HMM::ReadHMMer() */
|
|
1116
|
|
1117
|
|
1118
|
|
1119 /////////////////////////////////////////////////////////////////////////////////////
|
|
1120 /**
|
|
1121 * @brief Read an HMM from a HMMER3 .hmm file; return 0 at end of file
|
|
1122 */
|
|
1123 int
|
|
1124 HMM::ReadHMMer3(FILE* dbf, char* filestr)
|
|
1125 {
|
|
1126 char line[LINELEN]=""; // input line
|
|
1127 char desc[DESCLEN]=""; // description of family
|
|
1128 char str4[5]=""; // first 4 letters of input line
|
|
1129 char* ptr; // pointer for string manipulation
|
|
1130 int i=0; // index for match state (first=1)
|
|
1131 int a; // amino acid index
|
|
1132 char dssp=0; // 1 if a consensus SS has been found in the transition prob lines
|
|
1133 char annot=0; // 1 if at least one annotation character in insert lines is ne '-' or ' '
|
|
1134 int k=0; // index for seq[k]
|
|
1135 char* annotchr; // consensus amino acids in ASCII format, or, in HMMER format, the reference annotation character in insert line
|
|
1136 //annotchr = new char[MAXRES]; // consensus amino acids in ASCII format, or, in HMMER format, the reference annotation character in insert line
|
|
1137 annotchr = new char[par.maxResLen]; // consensus amino acids in ASCII format, or, in HMMER format, the reference annotation character in insert line
|
|
1138 static int warn=0;
|
|
1139 int iAlpha = 20; /* size of alphabet, default is protein = 20 */
|
|
1140 double dAlphaInv = 1.00 / (double)(iAlpha); /* weight of AA */
|
|
1141
|
|
1142 trans_lin=0;
|
|
1143 L=0;
|
|
1144 Neff_HMM=0;
|
|
1145 n_seqs=n_display=N_in=N_filtered=0;
|
|
1146 nss_dssp=nsa_dssp=nss_pred=nss_conf=nfirst=ncons=-1;
|
|
1147 lamda=mu=0.0;
|
|
1148 trans_lin=0; // transition probs in log space
|
|
1149 name[0]=longname[0]=desc[0]=fam[0]='\0';
|
|
1150 //If at the end of while-loop L is still 0 then we have reached end of db file
|
|
1151
|
|
1152 // Do not delete name and seq vectors because their adresses are transferred to hitlist as part of a hit!!
|
|
1153
|
|
1154
|
|
1155 while (fgetline(line,LINELEN-1,dbf) && !(line[0]=='/' && line[1]=='/'))
|
|
1156 {
|
|
1157
|
|
1158 if (strscn(line)==NULL) continue; // skip lines that contain only white space
|
|
1159 if (!strncmp("HMMER",line,5)) continue;
|
|
1160
|
|
1161 substr(str4,line,0,3); // copy the first four characters into str4
|
|
1162
|
|
1163 if (!strcmp("NAME",str4) && name[0]=='\0')
|
|
1164 {
|
|
1165 ptr=strscn(line+4); // advance to first non-white-space character
|
|
1166 strncpy(name,ptr,NAMELEN-1); // copy full name to name
|
|
1167 strcut(name); // ...cut after first word...
|
|
1168 if (v>=4) cout<<"Reading in HMM "<<name<<":\n";
|
|
1169 }
|
|
1170
|
|
1171 else if (!strcmp("ACC ",str4))
|
|
1172 {
|
|
1173 ptr=strscn(line+4); // advance to first non-white-space character
|
|
1174 strncpy(longname,ptr,DESCLEN-1); // copy Accession id to longname...
|
|
1175 }
|
|
1176
|
|
1177 else if (!strcmp("DESC",str4))
|
|
1178 {
|
|
1179 ptr=strscn(line+4); // advance to first non-white-space character
|
|
1180 if (ptr)
|
|
1181 {
|
|
1182 strncpy(desc,ptr,DESCLEN-1); // copy description to name...
|
|
1183 desc[DESCLEN-1]='\0';
|
|
1184 strcut(ptr); // ...cut after first word...
|
|
1185 }
|
|
1186 if (!ptr || ptr[1]!='.' || strchr(ptr+3,'.')==NULL) strcpy(fam,""); else strcpy(fam,ptr); // could not find two '.' in name?
|
|
1187 }
|
|
1188
|
|
1189 else if (!strcmp("LENG",str4))
|
|
1190 {
|
|
1191 ptr=line+4;
|
|
1192 L=strint(ptr); //read next integer (number of match states)
|
|
1193 }
|
|
1194
|
|
1195 else if (!strcmp("ALPH",str4)) {
|
|
1196
|
|
1197 ptr=strscn(line+4);
|
|
1198
|
|
1199 if (0 == strcmp(ptr, "amino")){
|
|
1200 iAlpha = 20;
|
|
1201 }
|
|
1202 else if (0 == strcmp(ptr, "Nucleic")){
|
|
1203 iAlpha = 4;
|
|
1204 printf("%s:%s:%d: WARNING: HMM reading does not work for DNA/RNA\n",
|
|
1205 __FUNCTION__, __FILE__, __LINE__);
|
|
1206 }
|
|
1207 else {
|
|
1208 return Warning(dbf,line,name);
|
|
1209 }
|
|
1210 dAlphaInv = 1.00 / (double)(iAlpha);
|
|
1211 //continue;
|
|
1212 }
|
|
1213 else if (!strcmp("RF ",str4)) continue;
|
|
1214 else if (!strcmp("CS ",str4)) continue;
|
|
1215 else if (!strcmp("MAP ",str4)) continue;
|
|
1216 else if (!strcmp("COM ",str4)) continue;
|
|
1217 else if (!strcmp("NSEQ",str4))
|
|
1218 {
|
|
1219 ptr=line+4;
|
|
1220 N_in=N_filtered=strint(ptr); //read next integer: number of sequences after filtering
|
|
1221 }
|
|
1222
|
|
1223 else if (!strcmp("DATE",str4)) continue;
|
|
1224 else if (!strncmp("CKSUM ",line,5)) continue;
|
|
1225 else if (!strcmp("GA ",str4)) continue;
|
|
1226 else if (!strcmp("TC ",str4)) continue;
|
|
1227 else if (!strcmp("NC ",str4)) continue;
|
|
1228
|
|
1229 //////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
1230 // Still needed???
|
|
1231
|
|
1232 else if (!strncmp("SADSS",line,5))
|
|
1233 {
|
|
1234 if (nsa_dssp<0)
|
|
1235 {
|
|
1236 nsa_dssp=k++;
|
|
1237 seq[nsa_dssp] = new(char[/*MAXRES*/par.maxResLen+2]);
|
|
1238 sname[nsa_dssp] = new(char[15]);
|
|
1239 strcpy(seq[nsa_dssp]," ");
|
|
1240 strcpy(sname[nsa_dssp],"sa_dssp");
|
|
1241
|
|
1242 }
|
|
1243 ptr=strscn(line+5);
|
|
1244 if (ptr)
|
|
1245 {
|
|
1246 strcut(ptr);
|
|
1247 if (strlen(seq[nsa_dssp])+strlen(ptr)>=(unsigned)(/*MAXRES*/par.maxResLen))
|
|
1248 printf("\nWARNING: HMM %s has SADSS records with more than %i residues.\n",name,/*MAXRES*/par.maxResLen);
|
|
1249 else strcat(seq[nsa_dssp],ptr);
|
|
1250 }
|
|
1251 }
|
|
1252
|
|
1253 else if (!strncmp("SSPRD",line,5))
|
|
1254 {
|
|
1255 if (nss_pred<0)
|
|
1256 {
|
|
1257 nss_pred=k++;
|
|
1258 seq[nss_pred] = new(char[/*MAXRES*/par.maxResLen+2]);
|
|
1259 sname[nss_pred] = new(char[15]);
|
|
1260 strcpy(seq[nss_pred]," ");
|
|
1261 strcpy(sname[nss_pred],"ss_pred");
|
|
1262
|
|
1263 }
|
|
1264 ptr=strscn(line+5);
|
|
1265 if (ptr)
|
|
1266 {
|
|
1267 strcut(ptr);
|
|
1268 if (strlen(seq[nss_pred])+strlen(ptr)>=(unsigned)(/*MAXRES*/par.maxResLen))
|
|
1269 printf("\nWARNING: HMM %s has SSPRD records with more than %i residues.\n",name,/*MAXRES*/par.maxResLen);
|
|
1270 else strcat(seq[nss_pred],ptr);
|
|
1271 }
|
|
1272 }
|
|
1273
|
|
1274 else if (!strncmp("SSCON",line,5))
|
|
1275 {
|
|
1276 if (nss_conf<0)
|
|
1277 {
|
|
1278 nss_conf=k++;
|
|
1279 seq[nss_conf] = new(char[/*MAXRES*/par.maxResLen+2]);
|
|
1280 sname[nss_conf] = new(char[15]);
|
|
1281 strcpy(seq[nss_conf]," ");
|
|
1282 strcpy(sname[nss_conf],"ss_conf");
|
|
1283 }
|
|
1284 ptr=strscn(line+5);
|
|
1285 if (ptr)
|
|
1286 {
|
|
1287 strcut(ptr);
|
|
1288 if (strlen(seq[nss_conf])+strlen(ptr)>=(unsigned)(/*MAXRES*/par.maxResLen))
|
|
1289 printf("\nWARNING: HMM %s has SSPRD records with more than %i residues.\n",name,/*MAXRES*/par.maxResLen);
|
|
1290 else strcat(seq[nss_conf],ptr);
|
|
1291 }
|
|
1292 }
|
|
1293
|
|
1294 else if (!strncmp("SSCIT",line,5)) continue;
|
|
1295 else if (!strcmp("XT ",str4)) continue;
|
|
1296 //////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
1297 else if (!strncmp("STATS LOCAL",line,11)) continue;
|
|
1298
|
|
1299 else if (!strcmp("EFFN",str4))
|
|
1300 {
|
|
1301 ptr=line+4;
|
|
1302 float effn = strflt(ptr);
|
|
1303 // Calculate Neff_HMM by using f(x) = ax^0.1 + bx^0.5 + cx + d (fitted with scop25 dataset)
|
|
1304 Neff_HMM = -1.403534 * pow(effn, 0.1) + 4.428118 * pow(effn, 0.5) - 0.2885410 * effn - 1.108568;
|
|
1305 }
|
|
1306
|
|
1307 /////////////////////////////////////////////////////////////////////////////////////
|
|
1308 // Read transition probabilities from start state
|
|
1309 else if (!strncmp("HMM",line,3))
|
|
1310 {
|
|
1311 fgetline(line,LINELEN-1,dbf); // Skip line with amino acid labels
|
|
1312 fgetline(line,LINELEN-1,dbf); // Skip line with transition labels
|
|
1313 ptr=strscn(line);
|
|
1314
|
|
1315 if (!strncmp("COMPO",ptr,5))
|
|
1316 {
|
|
1317 ptr=ptr+5;
|
|
1318 for (a=0; a<20 && ptr; ++a)
|
|
1319 //s2a[a]: transform amino acids Sorted by alphabet -> internal numbers for amino acids
|
|
1320 pb[s2a[a]] = (float) exp(-1.0*strflta(ptr,99999));
|
|
1321 if (!ptr) return Warning(dbf,line,name);
|
|
1322 if (v>=4)
|
|
1323 {
|
|
1324 printf("\nNULL ");
|
|
1325 for (a=0; a<20; ++a) printf("%6.3g ",100.*pb[s2a[a]]);
|
|
1326 printf("\n");
|
|
1327 }
|
|
1328 fgetline(line,LINELEN-1,dbf); // Read next line
|
|
1329 }
|
|
1330
|
|
1331 fgetline(line,LINELEN-1,dbf); // Skip line with 0-states insert probabilities
|
|
1332
|
|
1333 ptr = strscn(line);
|
|
1334 for (a=0; a<=D2D && ptr; ++a)
|
|
1335 tr[0][a] = log2((float) exp(-1.0*strflta(ptr,99999))); //store transition probabilites as log2 values
|
|
1336 // strinta returns next integer in string and puts ptr to first char
|
|
1337 // after the integer. Returns -99999 if '*' is found.
|
|
1338 // ptr is set to 0 if no integer is found after ptr.
|
|
1339 if (!ptr) return Warning(dbf,line,name);
|
|
1340 if (v>=4)
|
|
1341 {
|
|
1342 printf(" ");
|
|
1343 for (a=0; a<=D2D && ptr; ++a) printf("%6.3g ",100*fpow2(tr[i][a]));
|
|
1344 printf("\n");
|
|
1345 }
|
|
1346
|
|
1347 // Prepare to store DSSP states (if there are none, delete afterwards)
|
|
1348 nss_dssp=k++;
|
|
1349 seq[nss_dssp] = new(char[/*MAXRES*/par.maxResLen+2]);
|
|
1350 sname[nss_dssp] = new(char[15]);
|
|
1351 strcpy(sname[nss_dssp],"ss_dssp");
|
|
1352
|
|
1353 /////////////////////////////////////////////////////////////////////////////////////
|
|
1354 // Read columns of HMM
|
|
1355 int next_i=0; // index of next column
|
|
1356 while (fgetline(line,LINELEN-1,dbf) && !(line[0]=='/' && line[1]=='/') && line[0]!='#')
|
|
1357 {
|
|
1358 if (strscn(line)==NULL) continue; // skip lines that contain only white space
|
|
1359
|
|
1360 // Read in AA probabilities
|
|
1361 ptr=line;
|
|
1362 int prev_i = next_i;
|
|
1363 next_i = strint(ptr); ++i;
|
|
1364 if (v && next_i!=prev_i+1)
|
|
1365 if (++warn<5)
|
|
1366 {
|
|
1367 cerr<<endl<<"WARNING: in HMM "<<name<<" state "<<prev_i<<" is followed by state "<<next_i<<"\n";
|
|
1368 if (warn==5) cerr<<endl<<"WARNING: further warnings while reading HMMs will be suppressed.\n";
|
|
1369 }
|
|
1370 if (i>L)
|
|
1371 {
|
|
1372 cerr<<endl<<"Error: in HMM "<<name<<" there are more columns than the stated length "<<L<<"\n";
|
|
1373 return 2;
|
|
1374 }
|
|
1375 if (i>L && v)
|
|
1376 cerr<<endl<<"WARNING: in HMM "<<name<<" there are more columns than the stated length "<<L<<"\n";
|
|
1377 if (i>=/*MAXRES*/par.maxResLen-2)
|
|
1378 {
|
|
1379 fgetline(line,LINELEN-1,dbf); // Skip two lines
|
|
1380 fgetline(line,LINELEN-1,dbf);
|
|
1381 continue;
|
|
1382 }
|
|
1383
|
|
1384 for (a=0; a<iAlpha && ptr; ++a){ /* FIXME: FS introduced iAlpha, was '20' */
|
|
1385 f[i][s2a[a]] = (float) exp(-1.0*strflta(ptr,99999));
|
|
1386 //s2a[a]: transform amino acids Sorted by alphabet -> internal numbers for amino acids
|
|
1387 }
|
|
1388 for (a = iAlpha; a < 20; a++){
|
|
1389 f[i][s2a[a]] = 0.0;
|
|
1390 }
|
|
1391 if (!ptr) return Warning(dbf,line,name);
|
|
1392 if (v>=4)
|
|
1393 {
|
|
1394 printf("%6i ",i);
|
|
1395 for (a=0; a<iAlpha; ++a) printf("%6.3g ",100*f[i][s2a[a]]);
|
|
1396 printf("\n");
|
|
1397 }
|
|
1398
|
|
1399 // Ignore MAP annotation
|
|
1400 ptr = strscn(line); //find next word
|
|
1401 ptr = strscn_ws(line); // ignore word
|
|
1402
|
|
1403 // Read RF and CS annotation
|
|
1404 ptr = strscn(line);
|
|
1405 if (!ptr) return Warning(dbf,line,name);
|
|
1406 annotchr[i]=uprchr(*ptr);
|
|
1407 if (*ptr!='-' && *ptr!=' ') annot=1;
|
|
1408
|
|
1409 ptr = strscn(line);
|
|
1410 switch (*ptr)
|
|
1411 {
|
|
1412 case 'H':
|
|
1413 ss_dssp[i]=1;
|
|
1414 seq[nss_dssp][i]=*ptr;
|
|
1415 dssp=1;
|
|
1416 break;
|
|
1417 case 'E':
|
|
1418 ss_dssp[i]=2;
|
|
1419 seq[nss_dssp][i]=*ptr;
|
|
1420 dssp=1;
|
|
1421 break;
|
|
1422 case 'C':
|
|
1423 ss_dssp[i]=3;
|
|
1424 seq[nss_dssp][i]=*ptr;
|
|
1425 dssp=1;
|
|
1426 break;
|
|
1427 case 'S':
|
|
1428 ss_dssp[i]=4;
|
|
1429 seq[nss_dssp][i]=*ptr;
|
|
1430 dssp=1;
|
|
1431 break;
|
|
1432 case 'T':
|
|
1433 ss_dssp[i]=5;
|
|
1434 seq[nss_dssp][i]=*ptr;
|
|
1435 dssp=1;
|
|
1436 break;
|
|
1437 case 'G':
|
|
1438 ss_dssp[i]=6;
|
|
1439 seq[nss_dssp][i]=*ptr;
|
|
1440 dssp=1;
|
|
1441 break;
|
|
1442 case 'B':
|
|
1443 ss_dssp[i]=7;
|
|
1444 seq[nss_dssp][i]=*ptr;
|
|
1445 dssp=1;
|
|
1446 break;
|
|
1447 case 'I':
|
|
1448 dssp=1;
|
|
1449 case '~':
|
|
1450 ss_dssp[i]=3;
|
|
1451 seq[nss_dssp][i]=*ptr;
|
|
1452 break;
|
|
1453 case '-': // no SS available from any template
|
|
1454 case '.': // no clear consensus SS structure
|
|
1455 case 'X': // no clear consensus SS structure
|
|
1456 ss_dssp[i]=0;
|
|
1457 seq[nss_dssp][i]='-';
|
|
1458 break;
|
|
1459 default:
|
|
1460 ss_dssp[i]=0;
|
|
1461 seq[nss_dssp][i]=*ptr;
|
|
1462 break;
|
|
1463 }
|
|
1464
|
|
1465 // Read insert emission line
|
|
1466 fgetline(line,LINELEN-1,dbf);
|
|
1467
|
|
1468 // Read seven transition probabilities
|
|
1469 fgetline(line,LINELEN-1,dbf);
|
|
1470
|
|
1471 ptr+=2;
|
|
1472 for (a=0; a<=D2D && ptr; ++a)
|
|
1473 tr[i][a] = log2((float) exp(-1.0*strflta(ptr,99999))); //store transition prob's as log2-values
|
|
1474 if (!ptr) return Warning(dbf,line,name);
|
|
1475 if (v>=4)
|
|
1476 {
|
|
1477 printf(" ");
|
|
1478 for (a=0; a<=D2D; ++a) printf("%6.3g ",100*fpow2(tr[i][a]));
|
|
1479 printf("\n");
|
|
1480 }
|
|
1481 }
|
|
1482
|
|
1483 if (line[0]=='/' && line[1]=='/') break;
|
|
1484
|
|
1485 } /* strncmp("HMM") */
|
|
1486
|
|
1487 } //while(getline)
|
|
1488
|
|
1489 if (L==0) return 0; //End of db file -> stop reading in
|
|
1490
|
|
1491 if (v && i!=L) cerr<<endl<<"Warning: in HMM "<<name<<" there are only "<<i<<" columns while the stated length is "<<L<<"\n";
|
|
1492 if (v && i>=/*MAXRES*/par.maxResLen-2) {i=/*MAXRES*/par.maxResLen-2; cerr<<endl<<"WARNING: maximum number "<</*MAXRES*/par.maxResLen-2<<" of residues exceeded while reading HMM "<<name<<"\n";}
|
|
1493 if (v && !i) cerr<<endl<<"WARNING: HMM "<<name<<" contains no match states. Check the alignment that gave rise to this HMM.\n";
|
|
1494 L = i;
|
|
1495
|
|
1496 if (strlen(longname)>0) strcat(longname," ");
|
|
1497 strncat(longname,name,DESCLEN-strlen(longname)-1); // longname = ACC NAME DESC
|
|
1498 if (strlen(name)>0) strcat(longname," ");
|
|
1499 strncat(longname,desc,DESCLEN-strlen(longname)-1);
|
|
1500 longname[DESCLEN-1]='\0';
|
|
1501 ScopID(cl,fold,sfam,fam);// get scop classification from basename (e.g. a.1.2.3.4)
|
|
1502 RemoveExtension(file,filestr); // copy name of dbfile without extension into 'file'
|
|
1503
|
|
1504 // Secondary structure
|
|
1505 if (!dssp)
|
|
1506 {
|
|
1507 // remove dssp sequence
|
|
1508 delete[] seq[nss_dssp]; // memory that had been allocated in case ss_dssp was given needs to be freed
|
|
1509 delete[] sname[nss_dssp]; // memory that had been allocated in case ss_dssp was given needs to be freed
|
|
1510 nss_dssp=-1;
|
|
1511 k--;
|
|
1512 }
|
|
1513 else { seq[nss_dssp][0]='-'; seq[nss_dssp][L+1]='\0'; }
|
|
1514
|
|
1515 if (nss_pred>=0)
|
|
1516 {
|
|
1517 for (i=1; i<=L; ++i) ss_pred[i] = ss2i(seq[nss_pred][i]);
|
|
1518 if (nss_conf>=0)
|
|
1519 for (i=1; i<=L; ++i) ss_conf[i] = cf2i(seq[nss_conf][i]);
|
|
1520 else
|
|
1521 for (i=1; i<=L; ++i) ss_conf[i] = 5;
|
|
1522 }
|
|
1523
|
|
1524 // Copy query (first sequence) and consensus residues?
|
|
1525 if (par.showcons)
|
|
1526 {
|
|
1527 sname[k]=new(char[10]);
|
|
1528 strcpy(sname[k],"Consensus");
|
|
1529 sname[k+1]=new(char[strlen(longname)+1]);
|
|
1530 strcpy(sname[k+1],longname);
|
|
1531 seq[k]=new(char[L+2]);
|
|
1532 seq[k][0]=' ';
|
|
1533 seq[k][L+1]='\0';
|
|
1534 seq[k+1]=new(char[L+2]);
|
|
1535 seq[k+1][0]=' ';
|
|
1536 seq[k+1][L+1]='\0';
|
|
1537 for (i=1; i<=L; ++i)
|
|
1538 {
|
|
1539 float pmax=0.0;
|
|
1540 int amax=0;
|
|
1541 for (a=0; a<NAA; ++a)
|
|
1542 if (f[i][a]>pmax) {amax=a; pmax=f[i][a];}
|
|
1543 if (pmax>0.6) seq[k][i]=i2aa(amax);
|
|
1544 else if (pmax>0.4) seq[k][i]=lwrchr(i2aa(amax));
|
|
1545 else seq[k][i]='x';
|
|
1546 seq[k+1][i]=i2aa(amax);
|
|
1547 }
|
|
1548 ncons=k++; // nfirst is set later!
|
|
1549 }
|
|
1550 else
|
|
1551 {
|
|
1552 sname[k]=new(char[strlen(longname)+1]);
|
|
1553 strcpy(sname[k],longname);
|
|
1554 seq[k]=new(char[L+2]);
|
|
1555 seq[k][0]=' ';
|
|
1556 seq[k][L+1]='\0';
|
|
1557 }
|
|
1558
|
|
1559 if (annot) // read in some annotation characters?
|
|
1560 {
|
|
1561 annotchr[0]=' ';
|
|
1562 annotchr[L+1]='\0';
|
|
1563 strcpy(seq[k],annotchr); // overwrite the consensus sequence with the annotation characters
|
|
1564 }
|
|
1565 else if (!par.showcons) // we have not yet calculated the consensus, but we need it now as query (first sequence)
|
|
1566 {
|
|
1567 for (i=1; i<=L; ++i)
|
|
1568 {
|
|
1569 float pmax=0.0;
|
|
1570 int amax=0;
|
|
1571 for (a=0; a<NAA; ++a)
|
|
1572 if (f[i][a]>pmax) {amax=a; pmax=f[i][a];}
|
|
1573 seq[k][i]=i2aa(amax);
|
|
1574 }
|
|
1575 }
|
|
1576 // printf("%i query name=%s seq=%s\n",n,sname[n],seq[n]);
|
|
1577 nfirst=k++;
|
|
1578
|
|
1579 n_display=k;
|
|
1580 n_seqs=k;
|
|
1581
|
|
1582 // If no effektive number of sequences is given, calculate Neff_HMM by given profile
|
|
1583 if (Neff_HMM == 0) {
|
|
1584 for (i=1; i<=L; ++i)
|
|
1585 {
|
|
1586 float S=0.0;
|
|
1587 for (a=0; a<20; ++a)
|
|
1588 if (f[i][a]>1E-10) S-=f[i][a]*fast_log2(f[i][a]);
|
|
1589 Neff_HMM+=(float) fpow2(S);
|
|
1590 }
|
|
1591 Neff_HMM/=L;
|
|
1592 }
|
|
1593
|
|
1594 for (i=0; i<=L; ++i) Neff_M[i] = Neff_I[i] = Neff_D[i] = 10.0; // to add only little additional pseudocounts!
|
|
1595 Neff_M[L+1]=1.0f;
|
|
1596 Neff_I[L+1]=Neff_D[L+1]=0.0f;
|
|
1597
|
|
1598 if (v>=2)
|
|
1599 cout<<"Read in HMM "<<name<<" with "<<L<<" match states and effective number of sequences = "<<Neff_HMM<<"\n";
|
|
1600
|
|
1601 ///////////////////////////////////////////////////////////////////
|
|
1602
|
|
1603 // Set emission probabilities of zero'th (begin) state and L+1st (end) state to background probabilities
|
|
1604 for (a=0; a<20; ++a) f[0][a]=f[L+1][a]=pb[a];
|
|
1605 delete[] annotchr;
|
|
1606
|
|
1607 has_pseudocounts=true;
|
|
1608
|
|
1609 return 1; //return status: ok
|
|
1610
|
|
1611
|
|
1612 } /* this is the end of HMM::ReadHMMer3() */
|
|
1613
|
|
1614
|
|
1615 //////////////////////////////////////////////////////////////////////////////
|
|
1616 /**
|
|
1617 * @brief Add transition pseudocounts to HMM (and calculate lin-space transition probs)
|
|
1618 */
|
|
1619 void
|
|
1620 HMM::AddTransitionPseudocounts(float gapd, float gape, float gapf, float gapg, float gaph, float gapi, float gapb)
|
|
1621 {
|
|
1622 int i; //position in alignment
|
|
1623 float sum;
|
|
1624 float pM2M, pM2I, pM2D, pI2I, pI2M, pD2D, pD2M;
|
|
1625 float p0,p1,p2;
|
|
1626 if (par.gapb<=0) return;
|
|
1627 if (trans_lin==1) {fprintf(stderr,"Error: Adding transition pseudocounts to linear representation of %s not allowed. Please report this error to the HHsearch developers.\n",name); exit(6);}
|
|
1628 if (trans_lin==2) {fprintf(stderr,"Error: Adding transition pseudocounts twice is %s not allowed. Please report this error to the HHsearch developers.\n",name); exit(6);}
|
|
1629 trans_lin=2;
|
|
1630
|
|
1631 // Calculate pseudocount transition probabilities
|
|
1632 pM2D=pM2I=gapd*0.0286; //a-priori probability for inserts and deletions
|
|
1633 pM2M=1-pM2D-pM2I;
|
|
1634 // gape=0 -> pI2I=0 gape=1 -> pI2I=0.75 gape=inf -> pI2I=1.
|
|
1635 pI2I=1.0*gape/(gape-1+1.0/0.75);
|
|
1636 pI2M=1-pI2I;
|
|
1637 // gape=0 -> pD2D=0 gape=1 -> pD2D=0.75 gape=inf -> pD2D=1.
|
|
1638 pD2D=1.0*gape/(gape-1+1.0/0.75);
|
|
1639 pD2M=1-pD2D;
|
|
1640
|
|
1641 for (i=0; i<=L; ++i) //for all columns in HMM
|
|
1642 {
|
|
1643 // Transitions from M state
|
|
1644 p0 = (Neff_M[i]-1)*fpow2(tr[i][M2M]) + gapb*pM2M;
|
|
1645 p1 = (Neff_M[i]-1)*fpow2(tr[i][M2D]) + gapb*pM2D;
|
|
1646 p2 = (Neff_M[i]-1)*fpow2(tr[i][M2I]) + gapb*pM2I;
|
|
1647 if (i==0) p1=p2=0; //from M(0) no transition to D(1) and I(0) possible
|
|
1648 if (i==L) p1=p2=0; //from M(L) no transition to D(L+1) and I(L+1) possible
|
|
1649 sum = p0+p1+p2+FLT_MIN;
|
|
1650
|
|
1651 // p0 = p0/sum ;
|
|
1652 // p1 = pow(p1/sum,gapf);
|
|
1653 // p2 = pow(p2/sum,gapg);
|
|
1654 // sum = p0+p1+p2+FLT_MIN;
|
|
1655 // tr[i][M2M] = fast_log2(p0/sum);
|
|
1656 // tr[i][M2D] = fast_log2(p1/sum);
|
|
1657 // tr[i][M2I] = fast_log2(p2/sum);
|
|
1658
|
|
1659 tr[i][M2M] = fast_log2(p0/sum);
|
|
1660 tr[i][M2D] = fast_log2(p1/sum)*gapf;
|
|
1661 tr[i][M2I] = fast_log2(p2/sum)*gapg;
|
|
1662
|
|
1663 // Transitions from I state
|
|
1664 p0 = Neff_I[i]*fpow2(tr[i][I2M]) + gapb*pI2M;
|
|
1665 p1 = Neff_I[i]*fpow2(tr[i][I2I]) + gapb*pI2I;
|
|
1666 sum = p0+p1+FLT_MIN;
|
|
1667
|
|
1668 // p0 = pow(p0/sum,gapg);
|
|
1669 // p1 = pow(p1/sum,gapi);
|
|
1670 // sum = p0+p1+FLT_MIN;
|
|
1671 // tr[i][I2M] = fast_log2(p0/sum);
|
|
1672 // tr[i][I2I] = fast_log2(p1/sum);
|
|
1673
|
|
1674 tr[i][I2M] = fast_log2(p0/sum);
|
|
1675 tr[i][I2I] = fast_log2(p1/sum)*gapi;
|
|
1676
|
|
1677 // Transitions from D state
|
|
1678 p0 = Neff_D[i]*fpow2(tr[i][D2M]) + gapb*pD2M;
|
|
1679 p1 = Neff_D[i]*fpow2(tr[i][D2D]) + gapb*pD2D;
|
|
1680 if (i==L) p1=0; //from D(L) no transition to D(L+1) possible
|
|
1681 sum = p0+p1+FLT_MIN;
|
|
1682
|
|
1683 // p0 = pow(p0/sum,gapf);
|
|
1684 // p1 = pow(p1/sum,gaph);
|
|
1685 // sum = p0+p1+FLT_MIN;
|
|
1686 // tr[i][D2M] = fast_log2(p0/sum);
|
|
1687 // tr[i][D2D] = fast_log2(p1/sum);
|
|
1688
|
|
1689 tr[i][D2M] = fast_log2(p0/sum);
|
|
1690 tr[i][D2D] = fast_log2(p1/sum)*gaph;
|
|
1691
|
|
1692 // SS-dependent gap penalties
|
|
1693 tr[i][M2M_GAPOPEN]=tr[i][M2M];
|
|
1694 tr[i][GAPOPEN]=0.0;
|
|
1695 tr[i][GAPEXTD]=0.0;
|
|
1696 }
|
|
1697
|
|
1698 if (v>=4)
|
|
1699 {
|
|
1700 printf("\nPseudocount transition probabilities:\n");
|
|
1701 printf("pM2M=%4.1f%%, pM2I=%4.1f%%, pM2D=%4.1f%%, ",100*pM2M,100*pM2I,100*pM2D);
|
|
1702 printf("pI2M=%4.1f%%, pI2I=%4.1f%%, ",100*pI2M,100*pI2I);
|
|
1703 printf("pD2M=%4.1f%%, pD2D=%4.1f%% ",100*pD2M,100*pD2D);
|
|
1704 printf("tau = %4.1f%%\n\n",100.*gapb/(Neff_HMM-1+gapb));
|
|
1705 printf("Listing transition probabilities WITH pseudocounts:\n");
|
|
1706 printf(" i dssp pred sacc M->M M->I M->D I->M I->I D->M D->D\n");
|
|
1707
|
|
1708 for (i=1; i<=L; ++i) //for all columns in HMM
|
|
1709 {
|
|
1710 printf("%4i %1c %1c %1c %6.3f %6.3f %6.3f ",i,i2ss(ss_dssp[i]),i2ss(ss_pred[i]),i2sa(sa_dssp[i]),fpow2(tr[i][M2M]),fpow2(tr[i][M2I]),fpow2(tr[i][M2D]));
|
|
1711 printf("%6.3f %6.3f ",fpow2(tr[i][I2M]),fpow2(tr[i][I2I]));
|
|
1712 printf("%6.3f %6.3f ",fpow2(tr[i][D2M]),fpow2(tr[i][D2D]));
|
|
1713 printf("%1i %2i %1i\n",ss_pred[i],ss_conf[i],ss_dssp[i]);
|
|
1714 }
|
|
1715 printf("\n");
|
|
1716 printf("nss_dssp=%i nss_pred=%i\n",nss_dssp,nss_pred);
|
|
1717 }
|
|
1718 return;
|
|
1719 }
|
|
1720
|
|
1721
|
|
1722 //////////////////////////////////////////////////////////////////////////////
|
|
1723 /**
|
|
1724 * @brief Use secondary structure-dependent gap penalties
|
|
1725 * on top of the HMM transition penalties
|
|
1726 */
|
|
1727 void
|
|
1728 HMM::UseSecStrucDependentGapPenalties()
|
|
1729 {
|
|
1730 int i; // column in HMM
|
|
1731 int ii;
|
|
1732 //unsigned char iis[MAXRES]; // inside-integer array
|
|
1733 unsigned char iis[par.maxResLen]; // inside-integer array
|
|
1734 float d; // Additional penalty for opening gap whithin SS element
|
|
1735 float e; // Additional penalty for extending gap whithin SS element
|
|
1736
|
|
1737 // Determine inside-integers:
|
|
1738 // CCSTCCCHHHHHHHHHHHCCCCCEEEEECCSBGGGCCCCEECC
|
|
1739 // 0000000123444432100000012210000000000001000
|
|
1740 ii=0;
|
|
1741 for (i=0; i<=L; ++i) // forward run
|
|
1742 {
|
|
1743 if (ss_dssp[i]==1 || ss_dssp[i]==2) {ii+=(ii<par.ssgapi);} else ii=0;
|
|
1744 iis[i]=ii;
|
|
1745 } for (i=0; i<=L; ++i)
|
|
1746 ii=0;
|
|
1747 iis[0]=iis[L]=0;
|
|
1748 for (i=L; i>=0; i--) // backward run
|
|
1749 {
|
|
1750 if (ss_dssp[i]==1 || ss_dssp[i]==2) {ii+=(ii<par.ssgapi);} else ii=0;
|
|
1751 iis[i-1]=imin(ii,iis[i-1]);
|
|
1752 }
|
|
1753
|
|
1754 // Add SS-dependent gap penalties to HMM transition penalties
|
|
1755 for (i=0; i<=L; ++i) //for all columns in HMM
|
|
1756 {
|
|
1757 d=-iis[i]*par.ssgapd;
|
|
1758 e=-iis[i]*par.ssgape;
|
|
1759 tr[i][GAPOPEN]=d;
|
|
1760 tr[i][GAPEXTD]=e;
|
|
1761 tr[i][M2M_GAPOPEN]+=d;
|
|
1762 tr[i][M2I]+=d;
|
|
1763 tr[i][I2M]+=d;
|
|
1764 tr[i][I2I]+=e;
|
|
1765 tr[i][M2D]+=d;
|
|
1766 tr[i][D2M]+=d;
|
|
1767 tr[i][D2D]+=e;
|
|
1768 }
|
|
1769
|
|
1770 if (v>=3)
|
|
1771 {
|
|
1772 printf("Col SS II\n");
|
|
1773 for (i=0; i<=L; ++i) printf("%3i %c %2i\n",i,i2ss(ss_dssp[i]),iis[i]);
|
|
1774 }
|
|
1775 return;
|
|
1776 }
|
|
1777
|
|
1778 /////////////////////////////////////////////////////////////////////////////////////
|
|
1779 /**
|
|
1780 * @brief Generate an amino acid frequency matrix g[][] with full pseudocount admixture (tau=1)
|
|
1781 */
|
|
1782 void
|
|
1783 HMM::PreparePseudocounts()
|
|
1784 {
|
|
1785 for (int i=0; i<=L+1; ++i)
|
|
1786 for (int a=0; a<20; ++a)
|
|
1787 g[i][a] = // produces fast code
|
|
1788 R[a][0]*f[i][0] +R[a][1]*f[i][1] +R[a][2]*f[i][2] +R[a][3]*f[i][3] +R[a][4]*f[i][4]
|
|
1789 +R[a][5]*f[i][5] +R[a][6]*f[i][6] +R[a][7]*f[i][7] +R[a][8]*f[i][8] +R[a][9]*f[i][9]
|
|
1790 +R[a][10]*f[i][10]+R[a][11]*f[i][11]+R[a][12]*f[i][12]+R[a][13]*f[i][13]+R[a][14]*f[i][14]
|
|
1791 +R[a][15]*f[i][15]+R[a][16]*f[i][16]+R[a][17]*f[i][17]+R[a][18]*f[i][18]+R[a][19]*f[i][19];
|
|
1792 }
|
|
1793
|
|
1794 /////////////////////////////////////////////////////////////////////////////////////
|
|
1795 /**
|
|
1796 * @brief Add amino acid pseudocounts to HMM and calculate average protein aa probabilities pav[a]
|
|
1797 * Pseudocounts: t.p[i][a] = (1-tau)*f[i][a] + tau*g[i][a]
|
|
1798 */
|
|
1799 void
|
|
1800 HMM::AddAminoAcidPseudocounts(char pcm, float pca, float pcb, float pcc)
|
|
1801 {
|
|
1802 int i; //position in HMM
|
|
1803 int a; //amino acid (0..19)
|
|
1804 float sum;
|
|
1805 float tau; //tau = pseudocount admixture
|
|
1806
|
|
1807 for (a=0; a<20; ++a) pav[a]=pb[a]*100.0f/Neff_HMM; // initialize vector of average aa freqs with pseudocounts
|
|
1808
|
|
1809 // Calculate amino acid frequencies p[i][a] = (1-tau(i))*f[i][a] + tau(i)*g[i][a]
|
|
1810 switch (pcm)
|
|
1811 {
|
|
1812 case 0: //no pseudocounts whatsoever: tau=0
|
|
1813 for (i=1; i<=L; ++i)
|
|
1814 for (a=0; a<20; ++a)
|
|
1815 pav[a] += ( p[i][a]=f[i][a] );
|
|
1816 break;
|
|
1817 case 1: //constant pseudocounts (for optimization): tau = pca
|
|
1818 tau = pca;
|
|
1819 for (i=1; i<=L; ++i)
|
|
1820 for (a=0; a<20; ++a)
|
|
1821 pav[a] += ( p[i][a] = (1.-tau)*f[i][a] + tau * g[i][a] );
|
|
1822 break;
|
|
1823 case 2: //divergence-dependent pseudocounts
|
|
1824 case 4: //divergence-dependent pseudocounts and rate matrix rescaling
|
|
1825 if (par.pcc==1.0f)
|
|
1826 for (i=1; i<=L; ++i)
|
|
1827 {
|
|
1828 tau = fmin(1.0, pca/(1. + Neff_M[i]/pcb ) );
|
|
1829 for (a=0; a<20; ++a)
|
|
1830 pav[a] += ( p[i][a] = (1.-tau)*f[i][a] + tau * g[i][a] );
|
|
1831 }
|
|
1832 else
|
|
1833 for (i=1; i<=L; ++i)
|
|
1834 {
|
|
1835 tau = fmin(1.0, pca/(1. + pow((Neff_M[i])/pcb,pcc)));
|
|
1836 for (a=0; a<20; ++a)
|
|
1837 pav[a] += ( p[i][a] = (1.-tau)*f[i][a] + tau * g[i][a] );
|
|
1838 }
|
|
1839 break;
|
|
1840 case 3: // constant-divergence pseudocounts
|
|
1841 for (i=1; i<=L; ++i)
|
|
1842 {
|
|
1843 float x = Neff_M[i]/pcb;
|
|
1844 pca = 0.793 + 0.048*(pcb-10.0);
|
|
1845 tau = fmax(0.0, pca*(1-x + pcc*x*(1-x)) );
|
|
1846 for (a=0; a<20; ++a)
|
|
1847 pav[a] += ( p[i][a] = (1.-tau)*f[i][a] + tau * g[i][a] );
|
|
1848 }
|
|
1849 if (v>=2) { printf("Divergence before / after addition of amino acid pseudocounts: %5.2f / %5.2f\n",Neff_HMM, CalcNeff()); }
|
|
1850 break;
|
|
1851 } //end switch (pcm)
|
|
1852
|
|
1853
|
|
1854 // Normalize vector of average aa frequencies pav[a]
|
|
1855 NormalizeTo1(pav,NAA);
|
|
1856
|
|
1857 for (a=0; a<20; ++a)
|
|
1858 p[0][a] = p[L+1][a] = pav[a];
|
|
1859
|
|
1860 // DEBUGGING output
|
|
1861 if (v>=3)
|
|
1862 {
|
|
1863 switch (pcm)
|
|
1864 {
|
|
1865 case 0:
|
|
1866 cout<<"No pseudocounts added (-pcm 0)\n";
|
|
1867 return;
|
|
1868 case 1:
|
|
1869 cout<<"Adding constant AA pseudocount admixture of "<<pca<<" to HMM "<<name<<"\n";
|
|
1870 break;
|
|
1871 case 2:
|
|
1872 cout<<"Adding divergence-dependent AA pseudocounts (-pcm 2) with admixture of "
|
|
1873 <<pca/(1.+pow((Neff_HMM-1.)/pcb,pcc))<<" to HMM "<<name<<"\n";
|
|
1874 break;
|
|
1875 } //end switch (pcm)
|
|
1876 cout<<"\nAverage amino acid frequencies WITH pseudocounts in HMM: \nProf: ";
|
|
1877 for (a=0; a<20; ++a) printf("%4.1f ",100*pav[a]);
|
|
1878 cout<<"\n";
|
|
1879 if (v>=4)
|
|
1880 {
|
|
1881 cout<<"\nAmino acid frequencies WITHOUT pseudocounts:\n A R N D C Q E G H I L K M F P S T W Y V\n";
|
|
1882 for (i=1; i<=L; ++i)
|
|
1883 {
|
|
1884 printf("%3i: ",i);
|
|
1885 sum=0;
|
|
1886 for (a=0; a<20; ++a)
|
|
1887 {
|
|
1888 sum+=f[i][a];
|
|
1889 printf("%4.1f ",100*f[i][a]);
|
|
1890 }
|
|
1891 printf(" sum=%5.3f\n",sum);
|
|
1892 }
|
|
1893 cout<<"\nAmino acid frequencies WITH pseudocounts:\n A R N D C Q E G H I L K M F P S T W Y V\n";
|
|
1894 for (i=1; i<=L; ++i)
|
|
1895 {
|
|
1896 printf("%3i: ",i);
|
|
1897 sum=0;
|
|
1898 for (a=0; a<20; ++a)
|
|
1899 {
|
|
1900 sum+=p[i][a];
|
|
1901 printf("%4.1f ",100*p[i][a]);
|
|
1902 }
|
|
1903 printf(" sum=%5.3f\n",sum);
|
|
1904 }
|
|
1905 }
|
|
1906 }
|
|
1907 return;
|
|
1908 }
|
|
1909
|
|
1910
|
|
1911 /////////////////////////////////////////////////////////////////////////////////////
|
|
1912 /**
|
|
1913 * @brief Factor Null model into HMM t
|
|
1914 */
|
|
1915 void
|
|
1916 HMM::IncludeNullModelInHMM(HMM& q, HMM& t)
|
|
1917 {
|
|
1918
|
|
1919 int i,j; //query and template match state indices
|
|
1920 int a; //amino acid index
|
|
1921
|
|
1922 switch (par.columnscore)
|
|
1923 {
|
|
1924 default:
|
|
1925 case 0: // Null model with background prob. from database
|
|
1926 for (a=0; a<20; ++a) pnul[a]=pb[a];
|
|
1927 break;
|
|
1928
|
|
1929 case 1: // Null model with background prob. equal average from query and template
|
|
1930 for (a=0; a<20; ++a) pnul[a]=0.5*(q.pav[a]+t.pav[a]);
|
|
1931 break;
|
|
1932
|
|
1933 case 2: // Null model with background prob. from template protein
|
|
1934 for (a=0; a<20; ++a) pnul[a]=t.pav[a];
|
|
1935 break;
|
|
1936
|
|
1937 case 3: // Null model with background prob. from query protein
|
|
1938 for (a=0; a<20; ++a) pnul[a]=q.pav[a];
|
|
1939 break;
|
|
1940
|
|
1941 case 4: // Null model with background prob. equal average from query and template
|
|
1942 for (a=0; a<20; ++a) pnul[a]=sqrt(q.pav[a]*t.pav[a]);
|
|
1943 break;
|
|
1944
|
|
1945 case 10: // Separated column scoring for Stochastic Backtracing (STILL USED??)
|
|
1946 for (i=0; i<=q.L+1; ++i)
|
|
1947 {
|
|
1948 float sum = 0.0;
|
|
1949 for (a=0; a<20; ++a) sum += pb[a]*q.p[i][a];
|
|
1950 sum = 1.0/sqrt(sum);
|
|
1951 for (a=0; a<20; ++a) q.p[i][a]*=sum;
|
|
1952 }
|
|
1953 for (j=0; j<=t.L+1; j++)
|
|
1954 {
|
|
1955 float sum = 0.0;
|
|
1956 for (a=0; a<20; ++a) sum += pb[a]*t.p[j][a];
|
|
1957 sum = 1.0/sqrt(sum);
|
|
1958 for (a=0; a<20; ++a) t.p[j][a]*=sum;
|
|
1959 }
|
|
1960 break;
|
|
1961
|
|
1962 case 11: // log co-emission probability (no null model)
|
|
1963 for (a=0; a<20; ++a) pnul[a]=0.05;
|
|
1964 break;
|
|
1965
|
|
1966 }
|
|
1967
|
|
1968 // !!!!! ATTENTION!!!!!!! after this t.p is not the same as after adding pseudocounts !!!
|
|
1969 //Introduce amino acid weights into template (for all but SOP scores)
|
|
1970 if (par.columnscore!=10)
|
|
1971 for (a=0; a<20; ++a)
|
|
1972 for (j=0; j<=t.L+1; j++)
|
|
1973 t.p[j][a]/=pnul[a];
|
|
1974
|
|
1975 if (v>=5)
|
|
1976 {
|
|
1977 cout<<"\nAverage amino acid frequencies\n";
|
|
1978 cout<<" A R N D C Q E G H I L K M F P S T W Y V\n";
|
|
1979 cout<<"Q: ";
|
|
1980 for (a=0; a<20; ++a) printf("%4.1f ",100*q.pav[a]);
|
|
1981 cout<<"\nT: ";
|
|
1982 for (a=0; a<20; ++a) printf("%4.1f ",100*t.pav[a]);
|
|
1983 cout<<"\nNull: ";
|
|
1984 for (a=0; a<20; ++a) printf("%4.1f ",100*pnul[a]);
|
|
1985 cout<<"\npb: ";
|
|
1986 for (a=0; a<20; ++a) printf("%4.1f ",100*pb[a]);
|
|
1987 }
|
|
1988
|
|
1989
|
|
1990 return;
|
|
1991 }
|
|
1992
|
|
1993
|
|
1994 /////////////////////////////////////////////////////////////////////////////////////
|
|
1995 /**
|
|
1996 * @brief Write HMM to output file
|
|
1997 */
|
|
1998 void
|
|
1999 HMM::WriteToFile(char* outfile)
|
|
2000 {
|
|
2001 const int SEQLEN=100; // number of residues per line for sequences to be displayed
|
|
2002 int i,a;
|
|
2003
|
|
2004 if (trans_lin) {fprintf(stderr,"Error: Writing transition pseudocounts in linear representation not allowed. Please report this error to the HHsearch developers.\n"); exit(6);}
|
|
2005
|
|
2006 FILE *outf=NULL;
|
|
2007 if (strcmp(outfile,"stdout"))
|
|
2008 {
|
|
2009 if (par.append) outf=fopen(outfile,"a"); else outf=fopen(outfile,"w");
|
|
2010 if (!outf) OpenFileError(outfile);
|
|
2011 }
|
|
2012 else
|
|
2013 outf = stdout;
|
|
2014 if (v>=2) cout<<"Writing HMM to "<<outfile<<"\n";
|
|
2015
|
|
2016 // fprintf(outf,"HHsearch HHM format 1.5\n");
|
|
2017 fprintf(outf,"HHsearch 1.5\n"); // format specification
|
|
2018 fprintf(outf,"NAME %s\n",longname); // name of first sequence
|
|
2019 fprintf(outf,"FAM %s\n",fam); // family name
|
|
2020 char file_nopath[NAMELEN];
|
|
2021 RemovePath(file_nopath,file);
|
|
2022 fprintf(outf,"FILE %s\n",file_nopath); // base name of alignment file
|
|
2023
|
|
2024 // Print command line
|
|
2025 fprintf(outf,"COM ");
|
|
2026 for (int i=0; i<par.argc; i++)
|
|
2027 if (strlen(par.argv[i])<=100)
|
|
2028 fprintf(outf,"%s ",par.argv[i]);
|
|
2029 else
|
|
2030 fprintf(outf,"<%i characters> ",(int)strlen(par.argv[i]));
|
|
2031 fprintf(outf,"\n");
|
|
2032
|
|
2033 // print out date stamp
|
|
2034 time_t* tp=new(time_t);
|
|
2035 *tp=time(NULL);
|
|
2036 fprintf(outf,"DATE %s",ctime(tp));
|
|
2037 delete tp; tp = NULL; /* really? FS */
|
|
2038
|
|
2039 // Print out some statistics of alignment
|
|
2040 fprintf(outf,"LENG %i match states, %i columns in multiple alignment\n",L,l[L]);
|
|
2041 fprintf(outf,"FILT %i out of %i sequences passed filter (-id %i -cov %i -qid %i -qsc %.2f -diff %i)\n",N_filtered,N_in,par.max_seqid,par.coverage,par.qid,par.qsc,par.Ndiff);
|
|
2042 fprintf(outf,"NEFF %-4.1f\n",Neff_HMM);
|
|
2043
|
|
2044 // Print selected sequences from alignment (including secondary structure and confidence values, if known)
|
|
2045 fprintf(outf,"SEQ\n");
|
|
2046 for (int n=0; n<n_display; n++)
|
|
2047 {
|
|
2048 fprintf(outf,">%s\n",sname[n]);
|
|
2049 //first sequence character starts at 1; 0 not used.
|
|
2050 for(unsigned int j=0; j<strlen(seq[n]+1); j+=SEQLEN) fprintf(outf,"%-.*s\n",SEQLEN,seq[n]+1+j);
|
|
2051 }
|
|
2052 fprintf(outf,"#\n");
|
|
2053
|
|
2054 // print null model background probabilities from substitution matrix
|
|
2055 fprintf(outf,"NULL ");
|
|
2056 for (a=0; a<20; ++a) fout(outf,-iround(fast_log2(pb[s2a[a]])*HMMSCALE ));
|
|
2057 fprintf(outf,"\n");
|
|
2058
|
|
2059 // print table header line with amino acids
|
|
2060 fprintf(outf,"HMM ");
|
|
2061 for (a=0; a<20; ++a) fprintf(outf,"%1c\t",i2aa(s2a[a]));
|
|
2062 fprintf(outf,"\n");
|
|
2063
|
|
2064 // print table header line with state transitions
|
|
2065 fprintf(outf," M->M\tM->I\tM->D\tI->M\tI->I\tD->M\tD->D\tNeff\tNeff_I\tNeff_D\n");
|
|
2066
|
|
2067 // print out transition probabilities from begin state (virtual match state)
|
|
2068 fprintf(outf," ");
|
|
2069 for (a=0; a<=D2D; ++a) fout(outf,-iround(tr[0][a]*HMMSCALE));
|
|
2070 fout(outf,iround(Neff_M[0]*HMMSCALE));
|
|
2071 fout(outf,iround(Neff_I[0]*HMMSCALE));
|
|
2072 fout(outf,iround(Neff_D[0]*HMMSCALE));
|
|
2073 fprintf(outf,"\n");
|
|
2074
|
|
2075 // Start loop for printing HMM columns
|
|
2076 int h=1;
|
|
2077 for (i=1; i<=L; ++i)
|
|
2078 {
|
|
2079
|
|
2080 while(islower(seq[nfirst][h]) && seq[nfirst][h]) h++;
|
|
2081 fprintf(outf,"%1c %-4i ",seq[nfirst][h++],i);
|
|
2082
|
|
2083 // Print emission probabilities for match state
|
|
2084 for (a=0; a<20; ++a) fout(outf,-iround(fast_log2(p[i][s2a[a]])*HMMSCALE ));
|
|
2085 fprintf(outf,"%-i",l[i]);
|
|
2086 fprintf(outf,"\n");
|
|
2087
|
|
2088 // Print transition probabilities
|
|
2089 fprintf(outf," ");
|
|
2090 for (a=0; a<=D2D; ++a) fout(outf,-iround(tr[i][a]*HMMSCALE));
|
|
2091 fout(outf,iround(Neff_M[i]*HMMSCALE));
|
|
2092 fout(outf,iround(Neff_I[i]*HMMSCALE));
|
|
2093 fout(outf,iround(Neff_D[i]*HMMSCALE));
|
|
2094 fprintf(outf,"\n\n");
|
|
2095 } // end for(i)-loop for printing HMM columns
|
|
2096
|
|
2097 fprintf(outf,"//\n");
|
|
2098 fclose(outf);
|
|
2099 }
|
|
2100
|
|
2101 /////////////////////////////////////////////////////////////////////////////////////
|
|
2102 /**
|
|
2103 * @brief Write HMM to output file
|
|
2104 */
|
|
2105 void
|
|
2106 HMM::InsertCalibration(char* infile)
|
|
2107 {
|
|
2108 char* line = new(char[LINELEN]); // input line
|
|
2109 char** lines = new(char*[3*L+100000]);
|
|
2110 int nline=0;
|
|
2111 int l;
|
|
2112 char done=0; // inserted new 'EVD mu sigma' line?
|
|
2113
|
|
2114 // Read from infile all lines and insert the EVD line with lamda and mu coefficients
|
|
2115 ifstream inf;
|
|
2116 inf.open(infile, ios::in);
|
|
2117 if (!inf) OpenFileError(infile);
|
|
2118 if (v>=2) cout<<"Recording calibration coefficients in "<<infile<<"\n";
|
|
2119
|
|
2120 while (inf.getline(line,LINELEN) && !(line[0]=='/' && line[1]=='/') && nline<2*/*MAXRES*/par.maxResLen)
|
|
2121 {
|
|
2122
|
|
2123 // Found an EVD lamda mu line? -> remove
|
|
2124 while (!done && !strncmp(line,"EVD ",3) && !(line[0]=='/' && line[1]=='/') && nline<2*/*MAXRES*/par.maxResLen)
|
|
2125 inf.getline(line,LINELEN);
|
|
2126 if ((line[0]=='/' && line[1]=='/') || nline>=2*/*MAXRES*/par.maxResLen)
|
|
2127 {fprintf(stderr,"Error: wrong format in %s. Expecting hhm format\n",infile); exit(1);}
|
|
2128
|
|
2129 // Found the SEQ line? -> insert calibration before this line
|
|
2130 if (!done && (!strncmp("SEQ",line,3) || !strncmp("HMM",line,3)) && (isspace(line[3]) || line[3]=='\0'))
|
|
2131 {
|
|
2132 done=1;
|
|
2133 lines[nline]=new(char[128]);
|
|
2134 if (!lines[nline]) MemoryError("space to read in HHM file for calibration");
|
|
2135 sprintf(lines[nline],"EVD %-7.4f %-7.4f",lamda,mu);
|
|
2136 nline++;
|
|
2137 }
|
|
2138 lines[nline]=new(char[strlen(line)+1]);
|
|
2139 if (!lines[nline]) MemoryError("space to read in HHM file for calibration");
|
|
2140 strcpy (lines[nline],line);
|
|
2141 nline++;
|
|
2142 }
|
|
2143 inf.close();
|
|
2144
|
|
2145 // Write to infile all lines
|
|
2146 FILE* infout=fopen(infile,"w");
|
|
2147 if (!infout) {
|
|
2148 cerr<<endl<<"WARNING in "<<program_name<<": no calibration coefficients written to "<<infile<<":\n";
|
|
2149 cerr<<"Could not open file for writing.\n";
|
|
2150 return;
|
|
2151 }
|
|
2152 for (l=0; l<nline; l++) {
|
|
2153 fprintf(infout,"%s\n",lines[l]);
|
|
2154 delete[] lines[l]; lines[l] = NULL;
|
|
2155 }
|
|
2156 fprintf(infout,"//\n");
|
|
2157 fclose(infout);
|
|
2158 delete[] line; line = NULL;
|
|
2159 delete[] lines; lines = NULL;
|
|
2160 return;
|
|
2161 }
|
|
2162
|
|
2163 /////////////////////////////////////////////////////////////////////////////////////
|
|
2164 /**
|
|
2165 * @brief Write HMM to output file in HMMER format
|
|
2166 */
|
|
2167 void
|
|
2168 HMM::WriteToFileHMMER(char* outfile)
|
|
2169 {
|
|
2170 const int INTSCALE=1000; //scaling factor in HMMER files
|
|
2171 const float pBD=0.50;
|
|
2172 const int LOG2pBD=iround(fast_log2(pBD)*INTSCALE);
|
|
2173 const int LOG2pBM=iround(fast_log2(1-pBD)*INTSCALE);
|
|
2174 const float pJB=1.0/350;
|
|
2175 const int LOG2pJB=iround(fast_log2(pJB)*INTSCALE);
|
|
2176 const int LOG2pJJ=iround(fast_log2(1-pJB)*INTSCALE);
|
|
2177 const float pEJ=0.5;
|
|
2178 const int LOG2pEJ=iround(fast_log2(pEJ)*INTSCALE);
|
|
2179 const int LOG2pEC=iround(fast_log2(1-pEJ)*INTSCALE);
|
|
2180 char c;
|
|
2181 int i,a;
|
|
2182
|
|
2183 if (trans_lin) {fprintf(stderr,"Error: Writing transition pseudocounts in linear representation not allowed. Please report this error to the HHsearch developers.\n"); exit(6);}
|
|
2184
|
|
2185 FILE *outf=NULL;
|
|
2186 if (strcmp(outfile,"stdout"))
|
|
2187 {
|
|
2188 if (par.append) outf=fopen(outfile,"a"); else outf=fopen(outfile,"w");
|
|
2189 if (!outf) OpenFileError(outfile);
|
|
2190 }
|
|
2191 else
|
|
2192 outf = stdout;
|
|
2193 if (v>=2) cout<<"Writing HMM to "<<outfile<<"\n";
|
|
2194
|
|
2195 fprintf(outf,"HMMER2.0 [hhmake %s]\n",VERSION_AND_DATE);
|
|
2196 fprintf(outf,"NAME %s\n",file); // base name of alignment file
|
|
2197 fprintf(outf,"DESC %s\n",longname);
|
|
2198 fprintf(outf,"LENG %i\n",L);
|
|
2199 fprintf(outf,"ALPH Amino\n"); // amino acid seuqences (not DNA)
|
|
2200 fprintf(outf,"RF yes\n"); // reference annotation flag
|
|
2201 fprintf(outf,"CS yes\n"); // consensus structure annotation flag
|
|
2202 fprintf(outf,"MAP yes\n"); // write MA column number after each line of aa probabilities
|
|
2203
|
|
2204 fprintf(outf,"COM "); // print out command line
|
|
2205 for (i=0; i<=par.argc-1; ++i) fprintf(outf,"%s ",par.argv[i]); fprintf(outf,"\n");
|
|
2206
|
|
2207 fprintf(outf,"NSEQ %i\n",N_filtered); // print number of sequences after filtering
|
|
2208
|
|
2209 // Date stamp
|
|
2210 time_t* tp=new(time_t);
|
|
2211 *tp=time(NULL);
|
|
2212 fprintf(outf,"DATE %s",ctime(tp));
|
|
2213 delete tp; tp = NULL; /* really? FS */
|
|
2214
|
|
2215 // Print out secondary structure
|
|
2216 if (nss_dssp>=0)
|
|
2217 fprintf(outf,"SSDSS %s\n",seq[nss_dssp]);
|
|
2218 if (nsa_dssp>=0)
|
|
2219 fprintf(outf,"SADSS %s\n",seq[nsa_dssp]);
|
|
2220 if (nss_pred>=0)
|
|
2221 fprintf(outf,"SSPRD %s\n",seq[nss_pred]);
|
|
2222 if (nss_conf>=0)
|
|
2223 fprintf(outf,"SSCNF %s\n",seq[nss_conf]);
|
|
2224
|
|
2225
|
|
2226 // Special Plan7 transitions that control repeated detection of profile HMM within sequence
|
|
2227 fprintf(outf,"XT %6i %6i %6i %6i %6i %6i %6i %6i\n",LOG2pJB,LOG2pJJ,LOG2pEC,LOG2pEJ,LOG2pJB,LOG2pJJ,LOG2pJB,LOG2pJJ);
|
|
2228 fprintf(outf,"NULT -4 -8455\n");
|
|
2229
|
|
2230
|
|
2231 // Null model background probabilities from substitution matrix
|
|
2232 fprintf(outf,"NULE ");
|
|
2233 for (a=0; a<20; ++a)
|
|
2234 {
|
|
2235 float lg2=fast_log2(pb[s2a[a]]*20.0);
|
|
2236 if (lg2<-99.999) fprintf(outf," *"); else fprintf(outf," %6i",iround(lg2*INTSCALE));
|
|
2237 }
|
|
2238 fprintf(outf,"\n");
|
|
2239
|
|
2240 // Table header line with amino acids
|
|
2241 fprintf(outf,"HMM ");
|
|
2242 for (a=0; a<20; ++a) fprintf(outf," %1c ",i2aa(s2a[a]));
|
|
2243 fprintf(outf,"\n");
|
|
2244
|
|
2245 // Table header line with state transitions
|
|
2246 fprintf(outf," m->m m->i m->d i->m i->i d->m d->d b->m m->e\n");
|
|
2247
|
|
2248 // Transition probabilities from begin state
|
|
2249 fprintf(outf," %6i * %6i\n",LOG2pBM,LOG2pBD);
|
|
2250
|
|
2251 // Start loop for printing HMM columns
|
|
2252 int h=1, hss=1;
|
|
2253 for (i=1; i<=L; ++i)
|
|
2254 {
|
|
2255
|
|
2256 // Emission probabilities for match state
|
|
2257 fprintf(outf," %5i",i);
|
|
2258 for (a=0; a<20; ++a) fprintf(outf," %6i",imax(-9999,iround(fast_log2(p[i][s2a[a]]/pb[s2a[a]])*INTSCALE)));
|
|
2259 fprintf(outf," %5i",l[i]);
|
|
2260 fprintf(outf,"\n");
|
|
2261
|
|
2262 // Emission probabilities (relative to null model) for insert state
|
|
2263 while(islower(seq[nfirst][h]) && seq[nfirst][h]) h++;
|
|
2264 if (i==L)
|
|
2265 fprintf(outf," %1c * * * * * * * * * * * * * * * * * * * *\n",seq[nfirst][h++]);
|
|
2266 else
|
|
2267 fprintf(outf," %1c 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",seq[nfirst][h++]);
|
|
2268
|
|
2269 // Transition probabilities
|
|
2270 if (nss_dssp>=0)
|
|
2271 {
|
|
2272 while(islower(seq[nss_dssp][hss]) && seq[nss_dssp][hss]) hss++;
|
|
2273 c=seq[nss_dssp][hss++];
|
|
2274 }
|
|
2275 else c=' ';
|
|
2276 fprintf(outf," %1c",c);
|
|
2277 if (i==1)
|
|
2278 {
|
|
2279 for (a=0; a<=D2D; ++a) fprintf(outf," %6i",imax(-9999,iround(tr[i][a]*INTSCALE)));
|
|
2280 fprintf(outf," %6i *\n",LOG2pBM);
|
|
2281 }
|
|
2282 else if (i==L)
|
|
2283 {
|
|
2284 for (a=0; a<=D2D; ++a) fprintf(outf," *");
|
|
2285 fprintf(outf," * 0\n");
|
|
2286 }
|
|
2287 else
|
|
2288 {
|
|
2289 for (a=0; a<=D2D; ++a) fprintf(outf," %6i",imax(-9999,iround(tr[i][a]*INTSCALE)));
|
|
2290 fprintf(outf," * *\n");
|
|
2291 }
|
|
2292 }
|
|
2293
|
|
2294 fprintf(outf,"//\n");
|
|
2295 fclose(outf);
|
|
2296 }
|
|
2297
|
|
2298
|
|
2299 /////////////////////////////////////////////////////////////////////////////////////
|
|
2300 /**
|
|
2301 * @brief Transform log to lin transition probs
|
|
2302 */
|
|
2303 void
|
|
2304 HMM::Log2LinTransitionProbs(float beta)
|
|
2305 {
|
|
2306 if (trans_lin==1) return;
|
|
2307 trans_lin=1;
|
|
2308 for (int i=0; i<=L; ++i)
|
|
2309 {
|
|
2310 for (int a=0; a<NTRANS; ++a)
|
|
2311 tr[i][a] = fpow2(beta*tr[i][a]);
|
|
2312 /* FIXME valgrind says: "Conditional jump or move depends on
|
|
2313 * uninitialised value(s)" when using hmm iteration
|
|
2314 */
|
|
2315 }
|
|
2316 }
|
|
2317
|
|
2318
|
|
2319 /**
|
|
2320 * @brief Set query columns in His-tags etc to Null model distribution
|
|
2321 */
|
|
2322 void
|
|
2323 HMM::NeutralizeTags()
|
|
2324 {
|
|
2325 char* qseq = seq[nfirst];
|
|
2326 char* pt;
|
|
2327 int a,i;
|
|
2328
|
|
2329 if (NULL == qseq){
|
|
2330 return;
|
|
2331 }
|
|
2332
|
|
2333 // Neutralize His tag
|
|
2334 if ( (pt=strstr(qseq,"HHHHH")) )
|
|
2335 {
|
|
2336 int i0 = pt-qseq+1;
|
|
2337 if (v>=2) printf("Neutralized His-tag at position %i\n",i0);
|
|
2338 for (i=imax(i0-5,1); i<i0; ++i) // neutralize leading 5 columns
|
|
2339 for (a=0; a<NAA; ++a) p[i][a]=pb[a];
|
|
2340 for (; (*pt)!='H'; ++i,++pt) // neutralize His columns
|
|
2341 for (a=0; a<NAA; ++a) p[i][a]=pb[a];
|
|
2342 i0=i;
|
|
2343 for (; i<imin(i0+5,L+1); ++i) // neutralize trailing 5 columns
|
|
2344 for (a=0; a<NAA; ++a) p[i][a]=pb[a];
|
|
2345 if (v>=3) printf("start:%i end:%i\n",imax(i0-5,1),i-1);
|
|
2346 }
|
|
2347
|
|
2348 // Neutralize C-myc tag
|
|
2349 if ( (pt=strstr(qseq,"EQKLISEEDL")) )
|
|
2350 {
|
|
2351 if (v>=2) printf("Neutralized C-myc-tag at position %i\n",int(pt-qseq)+1);
|
|
2352 for (i=pt-qseq+1; i<=pt-qseq+10; ++i)
|
|
2353 for (a=0; a<NAA; ++a) p[i][a]=pb[a];
|
|
2354 }
|
|
2355 // Neutralize FLAG tag
|
|
2356 if ( (pt=strstr(qseq,"DYKDDDDK")) )
|
|
2357 {
|
|
2358 if (v>=2) printf("Neutralized FLAG-tag at position %i\n",int(pt-qseq)+1);
|
|
2359 for (i=pt-qseq+1; i<=pt-qseq+8; ++i)
|
|
2360 for (a=0; a<NAA; ++a) p[i][a]=pb[a];
|
|
2361 }
|
|
2362 }
|
|
2363
|
|
2364
|
|
2365
|
|
2366 /////////////////////////////////////////////////////////////////////////////////////
|
|
2367 /**
|
|
2368 * @brief Calculate effective number of sequences using profiles INCLUDING pseudocounts
|
|
2369 */
|
|
2370 float
|
|
2371 HMM::CalcNeff()
|
|
2372 {
|
|
2373 float Neff=0;
|
|
2374 for (int i=1; i<=L; ++i)
|
|
2375 for (int a=0; a<20; ++a)
|
|
2376 if (p[i][a]>1E-10) Neff-=p[i][a]*fast_log2(p[i][a]);
|
|
2377 return fpow2(Neff/L);
|
|
2378 }
|
|
2379
|
|
2380
|
|
2381 /////////////////////////////////////////////////////////////////////////////////////
|
|
2382 /**
|
|
2383 * @brief Calculate consensus of HMM (needed to merge HMMs later)
|
|
2384 */
|
|
2385 void
|
|
2386 HMM::CalculateConsensus()
|
|
2387 {
|
|
2388 int i; // position in query
|
|
2389 int a; // amino acid
|
|
2390 if (!Xcons) Xcons = new char[/*MAXRES*/par.maxResLen+2];
|
|
2391 for (i=1; i<=L; ++i)
|
|
2392 {
|
|
2393 float max=f[i][0]-pb[0];
|
|
2394 for (a=1; a<20; ++a)
|
|
2395 if (f[i][a]-pb[a]>max) Xcons[i]=a;
|
|
2396 }
|
|
2397 Xcons[0]=Xcons[L+1]=ENDGAP;
|
|
2398 }
|
|
2399
|
|
2400 // /////////////////////////////////////////////////////////////////////////////////////
|
|
2401 // // Store linear transition probabilities
|
|
2402 // /////////////////////////////////////////////////////////////////////////////////////
|
|
2403 // void HMM::StoreLinearTransitionProbs()
|
|
2404 // {
|
|
2405 // int i; // position in query
|
|
2406 // for (i=0; i<=L+1; ++i) if (!tr_lin[i]) tr_lin[i] = new(float[NTRANS]);
|
|
2407 // for (i=0; i<=L+1; ++i)
|
|
2408 // {
|
|
2409 // tr_lin[i][M2M] = fpow2(tr[i][M2M]);
|
|
2410 // tr_lin[i][M2I] = fpow2(tr[i][M2I]);
|
|
2411 // tr_lin[i][M2D] = fpow2(tr[i][M2D]);
|
|
2412 // tr_lin[i][D2M] = fpow2(tr[i][M2D]);
|
|
2413 // tr_lin[i][D2D] = fpow2(tr[i][D2D]);
|
|
2414 // tr_lin[i][I2M] = fpow2(tr[i][I2M]);
|
|
2415 // tr_lin[i][I2I] = fpow2(tr[i][I2I]);
|
|
2416 // }
|
|
2417 // }
|
|
2418
|
|
2419
|
|
2420 // #define Weff(Neff) (1.0+par.neffa*(Neff-1.0)+(par.neffb-4.0*par.neffa)/16.0*(Neff-1.0)*(Neff-1.0))
|
|
2421
|
|
2422 // /////////////////////////////////////////////////////////////////////////////////////
|
|
2423 // // Initialize f[i][a] with query HMM
|
|
2424 // /////////////////////////////////////////////////////////////////////////////////////
|
|
2425 // void HMM::MergeQueryHMM(HMM& q, float wk[])
|
|
2426 // {
|
|
2427 // int i; // position in query
|
|
2428 // int a; // amino acid
|
|
2429 // float Weff_M, Weff_D, Weff_I;
|
|
2430 // for (i=1; i<=L; i++)
|
|
2431 // {
|
|
2432 // Weff_M = Weff(q.Neff_M[i]-1.0);
|
|
2433 // Weff_D = Weff(q.Neff_D[i]-1.0);
|
|
2434 // Weff_I = Weff(q.Neff_I[i]-1.0);
|
|
2435 // for (a=0; a<20; a++) f[i][a] = q.f[i][a]*wk[i]*Weff_M;
|
|
2436 // tr_lin[i][M2M] = q.tr_lin[i][M2M]*wk[i]*Weff_M;
|
|
2437 // tr_lin[i][M2I] = q.tr_lin[i][M2I]*wk[i]*Weff_M;
|
|
2438 // tr_lin[i][M2D] = q.tr_lin[i][M2D]*wk[i]*Weff_M;
|
|
2439 // tr_lin[i][D2M] = q.tr_lin[i][D2M]*wk[i]*Weff_D;
|
|
2440 // tr_lin[i][D2D] = q.tr_lin[i][D2D]*wk[i]*Weff_D;
|
|
2441 // tr_lin[i][I2M] = q.tr_lin[i][I2M]*wk[i]*Weff_I;
|
|
2442 // tr_lin[i][I2I] = q.tr_lin[i][I2I]*wk[i]*Weff_I;
|
|
2443 // }
|
|
2444 // }
|
|
2445
|
|
2446
|
|
2447
|
|
2448 // /////////////////////////////////////////////////////////////////////////////////////
|
|
2449 // // Normalize probabilities in total merged super-HMM
|
|
2450 // /////////////////////////////////////////////////////////////////////////////////////
|
|
2451 // void HMM::NormalizeHMMandTransitionsLin2Log()
|
|
2452 // {
|
|
2453 // int i; // position in query
|
|
2454 // int a; // amino acid
|
|
2455 // for (i=0; i<=L+1; i++)
|
|
2456 // {
|
|
2457 // float sum=0.0;
|
|
2458 // for (a=0; a<20; a++) sum += f[i][a];
|
|
2459 // for (a=0; a<20; a++) f[i][a]/=sum;
|
|
2460 // sum = tr_lin[i][M2M] + tr_lin[i][M2I] + tr_lin[i][M2D];
|
|
2461 // tr_lin[i][M2M] /= sum;
|
|
2462 // tr_lin[i][M2I] /= sum;
|
|
2463 // tr_lin[i][M2D] /= sum;
|
|
2464 // tr[i][M2M] = fast_log2(tr_lin[i][M2M]);
|
|
2465 // tr[i][M2I] = fast_log2(tr_lin[i][M2I]);
|
|
2466 // tr[i][M2D] = fast_log2(tr_lin[i][M2D]);
|
|
2467 // sum = tr_lin[i][D2M] + tr_lin[i][D2D];
|
|
2468 // tr_lin[i][D2M] /= sum;
|
|
2469 // tr_lin[i][D2D] /= sum;
|
|
2470 // tr[i][D2M] = fast_log2(tr_lin[i][D2M]);
|
|
2471 // tr[i][D2D] = fast_log2(tr_lin[i][D2D]);
|
|
2472 // sum = tr_lin[i][I2M] + tr_lin[i][I2I];
|
|
2473 // tr_lin[i][I2M] /= sum;
|
|
2474 // tr_lin[i][I2I] /= sum;
|
|
2475 // tr[i][I2M] = fast_log2(tr_lin[i][I2M]);
|
|
2476 // tr[i][I2I] = fast_log2(tr_lin[i][I2I]);
|
|
2477 // }
|
|
2478 // }
|
|
2479
|
|
2480
|
|
2481 // UNCOMMENT TO ACTIVATE COMPOSITIONALLY BIASED PSEUDOCOUNTS BY RESCALING THE RATE MATRIX
|
|
2482
|
|
2483 // /////////////////////////////////////////////////////////////////////////////////////
|
|
2484 // //// Function to minimize
|
|
2485 // /////////////////////////////////////////////////////////////////////////////////////
|
|
2486 // double RescaleMatrixFunc(double x[])
|
|
2487 // {
|
|
2488 // double sum=0.0;
|
|
2489 // for (int a=0; a<20; ++a)
|
|
2490 // {
|
|
2491 // double za=0.0;
|
|
2492 // for (int b=0; b<20; ++b) za+=P[a][b]*x[b];
|
|
2493 // sum += (x[a]*za-qav[a])*(x[a]*za-qav[a]);
|
|
2494 // }
|
|
2495 // return sum;
|
|
2496 // }
|
|
2497
|
|
2498 // /////////////////////////////////////////////////////////////////////////////////////
|
|
2499 // //// Gradient of function to minimize
|
|
2500 // /////////////////////////////////////////////////////////////////////////////////////
|
|
2501 // void RescaleMatrixFuncGrad(double grad[], double x[])
|
|
2502 // {
|
|
2503 // double z[20] = {0.0};
|
|
2504 // double w[20];
|
|
2505 // double tmp;
|
|
2506 // for (int a=0; a<20; ++a)
|
|
2507 // for (int b=0; b<20; ++b) z[a] += P[a][b]*x[b];
|
|
2508
|
|
2509 // for (int a=0; a<20; ++a) w[a] = x[a]*z[a]-qav[a];
|
|
2510 // for (int a=0; a<20; ++a)
|
|
2511 // {
|
|
2512 // tmp = w[a]*z[a];
|
|
2513 // for (int b=0; b<20; ++b) tmp += P[a][b]*x[b]*w[b];
|
|
2514 // grad[a] = 2.0*tmp;
|
|
2515 // }
|
|
2516 // return;
|
|
2517 // }
|
|
2518
|
|
2519
|
|
2520 // /////////////////////////////////////////////////////////////////////////////////////
|
|
2521 // //// Rescale a substitution matrix to biased aa frequencies in global vector qav[a]
|
|
2522 // /////////////////////////////////////////////////////////////////////////////////////
|
|
2523 // void HMM::RescaleMatrix()
|
|
2524 // {
|
|
2525 // int a,b;
|
|
2526 // int code;
|
|
2527 // double x[21]; // scaling factor
|
|
2528 // double val_min;
|
|
2529 // const int len=20;
|
|
2530 // const int max_iterations=50;
|
|
2531
|
|
2532 // if (v>=2) printf("Adjusting rate matrix to query amino acid composition ...\n");
|
|
2533
|
|
2534 // // Put amino acid frequencies into global array (needed to call WNLIB's conjugate gradient method)
|
|
2535 // for (a=0; a<20; ++a) qav[a] = pav[a];
|
|
2536
|
|
2537 // // Initialize scaling factors x[a]
|
|
2538 // for (a=0; a<20; ++a) x[a]=pow(qav[a]/pb[a],0.73); // Initialize
|
|
2539
|
|
2540 // // Call conjugate gradient minimization method from WNLIB
|
|
2541 // wn_conj_gradient_method(&code,&val_min,x,len,&RescaleMatrixFunc,&RescaleMatrixFuncGrad,max_iterations);
|
|
2542
|
|
2543
|
|
2544 // // Calculate z[a] = sum_b Pab*xb
|
|
2545 // float sum_err=0.0f;
|
|
2546 // float sum = 0.0f;
|
|
2547 // for (a=0; a<20; ++a)
|
|
2548 // {
|
|
2549 // float za=0.0f; // za = sum_b Pab*xb
|
|
2550 // for (b=0; b<20; ++b) za+=P[a][b]*x[b];
|
|
2551 // sum_err += (x[a]*za/qav[a]-1)*(x[a]*za/qav[a]-1);
|
|
2552 // sum += x[a]*za;
|
|
2553 // }
|
|
2554 // if (sum_err>1e-3 & v>=1) fprintf(stderr,"WARNING: adjusting rate matrix by CG resulted in residual error of %5.3f.\n",sum_err);
|
|
2555
|
|
2556 // // Rescale rate matrix
|
|
2557 // for (a=0; a<20; ++a)
|
|
2558 // for (b=0; b<20; ++b)
|
|
2559 // {
|
|
2560 // P[a][b] *= x[a]*x[b]/sum;
|
|
2561 // R[a][b] = P[a][b]/qav[b];
|
|
2562 // }
|
|
2563
|
|
2564 // // How well approximated?
|
|
2565 // if (v>=3)
|
|
2566 // {
|
|
2567 // // Calculate z[a] = sum_b Pab*xb
|
|
2568 // float z[21];
|
|
2569 // for (a=0; a<20; ++a)
|
|
2570 // for (z[a]=0.0, b=0; b<20; ++b) z[a]+=P[a][b];
|
|
2571 // printf("Adjust A R N D C Q E G H I L K M F P S T W Y V\nErr? ");
|
|
2572 // for (a=0; a<20; ++a) printf("%4.0f ",1000*z[a]/qav[a]);
|
|
2573 // cout<<endl<<"xa ";
|
|
2574 // for (a=0; a<20; ++a) fprintf(stdout,"%4.2f ",x[a]);
|
|
2575 // cout<<endl;
|
|
2576 // }
|
|
2577
|
|
2578 // // Evaluate sequence identity underlying substitution matrix
|
|
2579 // if (v>=3)
|
|
2580 // {
|
|
2581 // float id=0.0f;
|
|
2582 // float entropy=0.0f;
|
|
2583 // float entropy_qav=0.0f;
|
|
2584 // float mut_info=0.0f;
|
|
2585 // for (a=0; a<20; ++a) id += P[a][a];
|
|
2586 // for (a=0; a<20; ++a) entropy_qav-=qav[a]*fast_log2(qav[a]);
|
|
2587 // for (a=0; a<20; ++a)
|
|
2588 // for (b=0; b<20; ++b)
|
|
2589 // {
|
|
2590 // entropy-=P[a][b]*fast_log2(R[a][b]);
|
|
2591 // mut_info += P[a][b]*fast_log2(P[a][b]/qav[a]/qav[b]);
|
|
2592 // }
|
|
2593
|
|
2594 // fprintf(stdout,"Rescaling rate matrix: sequence identity = %2.0f%%; entropy per column = %4.2f bits (out of %4.2f); mutual information = %4.2f bits\n",100*id,entropy,entropy_qav,mut_info);
|
|
2595 // }
|
|
2596 // return;
|
|
2597 // }
|
|
2598
|
|
2599
|
|
2600 /* @* HMM::ClobberGlobal (eg, q,t)
|
|
2601 */
|
|
2602 void
|
|
2603 HMM::ClobberGlobal(void){
|
|
2604
|
|
2605 for (int i = 0; i < n_display; i++){
|
|
2606 if (sname[i]){
|
|
2607 delete[] sname[i]; sname[i] = NULL;
|
|
2608 }
|
|
2609 if (seq[i]){
|
|
2610 delete[] seq[i]; seq[i] = NULL;
|
|
2611 }
|
|
2612 }
|
|
2613 Neff_M[0] = Neff_I[0] = Neff_D[0] = 0.0;
|
|
2614 longname[0] = '\0'; file[0] = '\0';
|
|
2615 ss_dssp[0] = sa_dssp[0] = ss_pred[0] = ss_conf[0] = '\0';
|
|
2616 Xcons = NULL;
|
|
2617 l[0] = 0;
|
|
2618 L = 0;
|
|
2619 Neff_HMM = 0;
|
|
2620 n_display = N_in = N_filtered = 0;
|
|
2621 nss_dssp = nsa_dssp = nss_pred = nss_conf = nfirst = ncons = -1;
|
|
2622 lamda = 0.0; mu = 0.0;
|
|
2623 name[0] = longname[0] = fam[0] = '\0';
|
|
2624
|
|
2625 for (int i = 0; i < NAA; i++){
|
|
2626 pav[i] = 0;
|
|
2627 }
|
|
2628
|
|
2629 /* @= */
|
|
2630 return;
|
|
2631
|
|
2632 } /* this is the end of ClobberGlobal() */
|
|
2633
|
|
2634
|
|
2635 /*
|
|
2636 * EOF hhhmm-C.h
|
|
2637 */
|