Mercurial > repos > clustalomega > clustalomega
comparison clustalomega/clustal-omega-0.2.0/src/hhalign/hhfunc-C.h @ 0:ff1768533a07
Migrated tool version 0.2 from old tool shed archive to new tool shed repository
author | clustalomega |
---|---|
date | Tue, 07 Jun 2011 17:04:25 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:ff1768533a07 |
---|---|
1 /* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */ | |
2 | |
3 /********************************************************************* | |
4 * Clustal Omega - Multiple sequence alignment | |
5 * | |
6 * Copyright (C) 2010 University College Dublin | |
7 * | |
8 * Clustal-Omega is free software; you can redistribute it and/or | |
9 * modify it under the terms of the GNU General Public License as | |
10 * published by the Free Software Foundation; either version 2 of the | |
11 * License, or (at your option) any later version. | |
12 * | |
13 * This file is part of Clustal-Omega. | |
14 * | |
15 ********************************************************************/ | |
16 | |
17 /* | |
18 * RCS $Id: hhfunc-C.h 199 2011-02-21 18:24:49Z fabian $ | |
19 */ | |
20 | |
21 /* | |
22 * Changelog: Michael Remmert made changes to hhalign stand-alone code | |
23 * FS implemented some of the changes on 2010-11-11 -> MR1 | |
24 * | |
25 * did not incorporate SS prediction PSIpred (yet); functions are: | |
26 * CalculateSS(3), | |
27 */ | |
28 | |
29 | |
30 // hhfunc.C | |
31 | |
32 | |
33 /** | |
34 * AddBackgroundFrequencies() | |
35 * | |
36 * @brief add background frequencies (derived from HMM) to | |
37 * sequence/profile | |
38 * | |
39 * @param[in,out] ppfFreq, | |
40 * [in] residue frequencies of sequence/profile, | |
41 * [out] overlayed with HMM background frequencies | |
42 * @param[in,out] ppfPseudoF, | |
43 * [in] residue frequencies+pseudocounts of sequence/profile, | |
44 * [out] overlayed with HMM background frequencies+pseudocounts | |
45 * @param[in] iSeqLen, | |
46 * length of sequence/profile (not aligned to HMM) | |
47 * @pram[in] prHMM, | |
48 * background HMM | |
49 * @param[in] ppcSeq, | |
50 * sequences/profile to be 'softened' | |
51 * @param[in] pcPrealigned, | |
52 * sequence aligned to HMM, this is not quite a consensus, | |
53 * it is identical to 1st sequence but over-writes gap information, | |
54 * if other sequences in profile have (non-gap) residues | |
55 * @param[in] iPreCnt | |
56 * number of sequences pre-aligned (pcPrealigned is 'consensus' of these sequences) | |
57 * @param[in] pcRepresent | |
58 * sequence representative of HMM, aligned to pcSeq0 | |
59 */ | |
60 void | |
61 AddBackgroundFrequencies(float **ppfFreq, float **ppfPseudoF, float **ppfPseudoTr, | |
62 int iSeqLen, hmm_light *prHMM, | |
63 char **ppcSeq, char *pcPrealigned, int iPreCnt, char *pcRepresent) | |
64 { | |
65 | |
66 char *pcS = pcPrealigned; /* current residue in pre-aligned sequence */ | |
67 char *pcH = pcRepresent; /* current residue in pre-aligned HMM */ | |
68 int iS = 0; /* position in un-aligned sequence (corresponds to pcS) */ | |
69 int iH = 0; /* position in un-aligned HMM (corresponds to pcH) */ | |
70 int iA; /* residue iterator */ | |
71 int iT; /* transition state iterator */ | |
72 float fFWeight = 0.50 / sqrt((float)(iPreCnt)); /* weight of backgroud frequencies */ /* FIXME: tune value, 0.50 default */ | |
73 //float fFWeight = 0.75; | |
74 float fFThgiew = UNITY - fFWeight; /* weight of 'true' frequencies */ | |
75 float fGWeight = 0.50 / sqrt((float)(iPreCnt)); /* weight of backgroud transitions */ /* FIXME: tune value, 0.50 default */ | |
76 //float fGWeight = 0.50 /*/ (float)(iPreCnt)*/; /* weight of backgroud transitions */ /* FIXME: tune value, 0.50 default */ | |
77 float fGThgiew = UNITY - fGWeight; /* weight of 'true' transitions */ | |
78 float fAux; | |
79 /* zf1SeqTrans[] are the (phenomenological) transition probabilities (+ pseudo-counts) | |
80 for a single sequence. It is almost certain (0.99) to stay in a match state, and very | |
81 unlikely (0.05) to go from match to insertion/deletion */ | |
82 float zf1SeqTrans[] = {0.98, 0.01, 0.01, 0.25, 0.75, 0.25, 0.75}; | |
83 float zf1SeqInit[] = {0.98, 0.01, 0.01, 0.99, 0.01, 0.99, 0.01}; | |
84 float zf1SeqDel[] = {0.24, 0.01, 0.75, 0.01, 0.01, 0.01, 0.01}; | |
85 float zf1SeqRevrt[] = {0.98, 0.01, 0.01, 0.01, 0.01, 0.99, 0.01}; | |
86 | |
87 if ( (NULL == pcPrealigned) || (NULL == pcRepresent) ){ | |
88 /*printf("%s/%s:%d: WARNING HMM=NULL -- didn't think I would get here (carry on, no danger)\n", | |
89 __FUNCTION__, __FILE__, __LINE__);*/ | |
90 return; | |
91 } | |
92 | |
93 if (NULL == prHMM->p){ | |
94 printf("%s:%s:%d: WARNING -- Background Pseudocounts point to NULL\n" | |
95 "\tthis is not intended - don't add background but CONTINUE\n", | |
96 __FUNCTION__, __FILE__, __LINE__); | |
97 return; | |
98 | |
99 } | |
100 | |
101 /* FIXME: should be 0 (FS thinks) but -1 gives better results */ | |
102 iH = iS = 0/*-1*//*+1*/; | |
103 while ( ('\0' != *pcS) && ('\0' != *pcH) ){ | |
104 | |
105 if ( ('-' != *pcH) && ('-' != *pcS) ){ | |
106 iH++; | |
107 iS++; | |
108 | |
109 #if 0 | |
110 /* match state | |
111 * - HMM had a gap in previous position (now closed) | |
112 * FIXME: this does not really work */ | |
113 if ((iH > 0) && ('-' == *(pcH-1))){ | |
114 | |
115 for (iT = 0; iT < STATE_TRANSITIONS; iT++){ | |
116 ppfPseudoTr[iS-1][iT] = log2f(zf1SeqRevrt[iT]); | |
117 } | |
118 } | |
119 #endif | |
120 | |
121 #if 1 | |
122 /* do frequencies -- this is not really useful; | |
123 frequencies are derived from HMM | |
124 and contain already pseudocounts (PCs), | |
125 adding frequencies and then PCs will add PCs _twice_ | |
126 results are better than not to add them, | |
127 but not as good as PCs */ | |
128 for (iA = 0; iA < AMINOACIDS; iA++){ | |
129 fAux = fFThgiew * ppfFreq[iS][iA] + fFWeight * prHMM->f[iH][iA]; | |
130 ppfFreq[iS][iA] = fAux; | |
131 } | |
132 #endif | |
133 /* do pseudo-counts */ | |
134 for (iA = 0; iA < AMINOACIDS; iA++){ | |
135 fAux = | |
136 fFThgiew * ppfPseudoF[iS][iA] + fFWeight * prHMM->p[iH][iA]; | |
137 ppfPseudoF[iS][iA] = fAux; | |
138 } | |
139 #if 0 /* do state transitions */ | |
140 for (iT = 0; iT < STATE_TRANSITIONS; iT++){ | |
141 #if 1 | |
142 /* this is a very crude method, | |
143 which averages the logarithms of the transitions, | |
144 effectively performing a geometric mean - | |
145 this presumably violates normalisation. | |
146 however, results are surprisingly good */ | |
147 fAux = | |
148 fGThgiew * ppfPseudoTr[iS][iT] + | |
149 fGWeight * prHMM->tr[iH][iT]; | |
150 ppfPseudoTr[iS][iT] = fAux; | |
151 #else /* crude averaging */ | |
152 /* There are 2 things to consider: | |
153 (1) one should really blend the probabilities of | |
154 the transitions, however, by default we have | |
155 the logarithms thereof. | |
156 So must exponentiate, blend, and then take log again. | |
157 This is expensive, and does not seem to lead to better | |
158 results than blending the logarithms | |
159 (and violating normalisation) | |
160 (2) transition probabilities for a single sequence | |
161 are really easy, there are no insert/delete transitions. | |
162 However, there is a begin state that is different | |
163 from the main body. | |
164 But again, this does not seem to make a blind bit | |
165 of difference | |
166 */ | |
167 if (iS > 0){ | |
168 fAux = | |
169 fGThgiew * zf1SeqTrans[iT] + | |
170 fGWeight * prHMM->linTr[iH][iT]; | |
171 ppfPseudoTr[iS][iT] = log2f(fAux); | |
172 } | |
173 else { | |
174 fAux = | |
175 fGThgiew * zf1SeqInit[iT] + | |
176 fGWeight * prHMM->linTr[iH][iT]; | |
177 ppfPseudoTr[iS][iT] = log2f(fAux); | |
178 } | |
179 #endif /* mixing of linear/log */ | |
180 } /* 0 <= iT < STATE_TRANSITIONS */ | |
181 #endif /* did state transitions */ | |
182 } /* was Match -- neither HMM nor sequence have gap */ | |
183 | |
184 else if ('-' == *pcH){ | |
185 /* sequence opened up gap in HMM */ | |
186 #if 0 | |
187 if ((iH > 0) && ('-' != *(pcH-1)) && (iS > 0)){ | |
188 /* this is the first gap in HMM | |
189 * FIXME: this does not really work */ | |
190 for (iT = 0; iT < STATE_TRANSITIONS; iT++){ | |
191 ppfPseudoTr[iS-1][iT] = log2f(zf1SeqDel[iT]); | |
192 } | |
193 } | |
194 else { | |
195 /* do nothing, keep single sequence values exactly as they are*/ | |
196 } | |
197 #endif | |
198 iS++; | |
199 } | |
200 else if ('-' == *pcS){ | |
201 /* here the single sequence has a gap, | |
202 and the HMM (as a whole) does not. There may be individual gaps | |
203 in the HMM at this stage. By ignoring this we say that the HMM | |
204 dominates the overall behaviour - as in the M2M state as well | |
205 */ | |
206 iH++; | |
207 } | |
208 | |
209 pcH++; | |
210 pcS++; | |
211 | |
212 } /* !EO[seq/hmm] */ | |
213 | |
214 return; | |
215 | |
216 } /* this is the end of AddBackgroundFrequencies() */ | |
217 | |
218 | |
219 | |
220 /** | |
221 * ReadAndPrepare() | |
222 * | |
223 * @brief Read HMM input file or transfer alignment | |
224 * and add pseudocounts etc. | |
225 * | |
226 * @param[in] iRnPtype | |
227 * type of read/prepare | |
228 * enum {INTERN_ALN_2_HMM = 0, READ_ALN_2_HMM, READ_HMM_2_HMM}; | |
229 * @param[in] ppcProf | |
230 * alignment | |
231 * @param[in] iCnt | |
232 * number of seqs in alignment | |
233 * @param[in,out] prHMM, | |
234 * [in] if sequences read/prepared, [out] if HMM from file | |
235 * @param[in] pcPrealigned, | |
236 * (single) sequence aligned to background HMM | |
237 * @param[in] pcRepresent, | |
238 * sequence representing HMM aligned to individual sequence | |
239 * param[in] pdExWeights | |
240 * (external) sequence weights, derived from tree | |
241 * @param[in] infile | |
242 * name of file with HMM info (formerly also alignment) | |
243 * @param[out] q | |
244 * HMM structure with transition probabilities, residue frequencies etc | |
245 * @param[???] qali | |
246 * FIXME: what is qali? | |
247 * | |
248 * @return FAILURE on error, OK otherwise | |
249 */ | |
250 int | |
251 ReadAndPrepare(int iRnPtype, | |
252 char **ppcProf, int iCnt, hmm_light *prHMM, | |
253 char *pcPrealigned, char *pcRepresent, double *pdExWeights, | |
254 char* infile, HMM& q, Alignment* qali=NULL) { | |
255 | |
256 //#ifndef CLUSTALO_NOFILE | |
257 char path[NAMELEN]; | |
258 | |
259 /* NOTE: there are different scenarios: | |
260 | |
261 (i) ("" != infile) - read HMM from file, | |
262 transfer frequency/transition/pseudo-count (f/t/p) info into prHMM | |
263 | |
264 (ii) ('\0' != ppcProf[0]) - transfer sequence/alignment into q/qali, | |
265 don't save f/t/p into prHMM, | |
266 on the contrary, if prior f/t/p available then add it to q/qali, | |
267 this is only done if (1==iCnt) | |
268 | |
269 (iii) ('\0' == ppcProf[0]) - re-cycle old HMM information | |
270 recreate a HMM from previous data | |
271 */ | |
272 | |
273 /********************************/ | |
274 /*** (o) Recycle internal HMM ***/ | |
275 /********************************/ | |
276 if ( (INTERN_ALN_2_HMM == iRnPtype) && (iCnt <= 0) ){ | |
277 | |
278 /* NOTE: here we are writing into a HMM structure/class; | |
279 memory has been allocated for this in hhalign.cpp; | |
280 however, as iCnt<=0, there may not be memory for | |
281 prHMM->n_display sequences/names. | |
282 But then, there doesn't have to be. | |
283 At this stage we are just copying one HMM into another HMM, | |
284 sequences are irrelevant. The only sequence of (marginal) | |
285 interest is the consensus sequence */ | |
286 /* FIXME: check that prHMM is valid -- how? */ | |
287 | |
288 const int ciCons = 0; | |
289 const int ciNoof = ciCons+1; | |
290 const int ciInvalid = -1; | |
291 q.n_display = ciNoof; /* only consensus */ | |
292 q.sname = NULL; | |
293 q.ncons = ciCons; | |
294 q.nfirst = ciCons;//prHMM->nfirst; | |
295 q.nss_dssp = ciInvalid;//prHMM->nss_dssp; | |
296 q.nsa_dssp = ciInvalid;//prHMM->nsa_dssp; | |
297 q.nss_pred = ciInvalid;//prHMM->nss_pred; | |
298 q.nss_conf = ciInvalid;//prHMM->nss_conf; | |
299 q.L = prHMM->L; | |
300 q.N_in = prHMM->N_in; | |
301 q.N_filtered = prHMM->N_filtered; | |
302 /* NOTE: I (FS) think that only ever 1 sequence will be transferred here, | |
303 that is, the consensus sequence. However, we might want to allow | |
304 (in the future) to transfer more sequences, | |
305 hence the awkward for() loop */ | |
306 #if 0 | |
307 for (int i = prHMM->ncons+0; i < prHMM->ncons+q.n_display; i++){ | |
308 /* NOTE: In the original hhalign code the first position | |
309 is kept open ('\0'). This makes it difficult to use | |
310 string functions like strlen/strdup etc. | |
311 Insert a temporary gap (.) to facilitate string operations */ | |
312 char cHead = prHMM->seq[i][0]; | |
313 prHMM->seq[i][0] = '.'; | |
314 q.seq[i] = strdup(prHMM->seq[i]); | |
315 prHMM->seq[i][0] = q.seq[i][0] = cHead; | |
316 } | |
317 #else | |
318 { | |
319 char cHead = prHMM->seq[prHMM->ncons][0]; | |
320 prHMM->seq[prHMM->ncons][0] = '.'; | |
321 q.seq[q.ncons] = strdup(prHMM->seq[prHMM->ncons]); | |
322 prHMM->seq[prHMM->ncons][0] = q.seq[q.ncons][0] = cHead; | |
323 } | |
324 #endif | |
325 const int NEFF_SCORE = 10; /* FIXME: Magic Number */ | |
326 for (int i = 0; i < prHMM->L+1; i++){ | |
327 q.Neff_M[i] = prHMM->Neff_M[i]; | |
328 q.Neff_I[i] = prHMM->Neff_I[i]; | |
329 q.Neff_D[i] = prHMM->Neff_D[i]; | |
330 } | |
331 q.Neff_HMM = prHMM->Neff_HMM; | |
332 /* skip longname,name,file,fam,sfam,fold,cl */ | |
333 q.lamda = prHMM->lamda; | |
334 q.mu = prHMM->mu; | |
335 | |
336 HMMshadow rShadow = {0}; /* friend of HMM to access private members */ | |
337 rShadow.copyShadowToHMM(q, *prHMM); | |
338 | |
339 /* skip trans_lin,ss_dssp,sa_dssp,ss_pred,ss_conf,Xcons */ | |
340 /* pav already done in copyShadowToHMM */ | |
341 /* skip pnul */ | |
342 | |
343 return OK; | |
344 | |
345 | |
346 } /* INTERN_ALN_2_HMM && iCnt<=0 */ | |
347 | |
348 /******************************/ | |
349 /*** (i) Read HMM from file ***/ | |
350 /******************************/ | |
351 char line[LINELEN] = {0}; // input line | |
352 FILE *inf = NULL; | |
353 //if ( (0 != strcmp(infile,"")) /*&& (iCnt > 0)*/ ) | |
354 if ( (READ_HMM_2_HMM == iRnPtype) || (READ_ALN_2_HMM == iRnPtype)) { | |
355 | |
356 if (0 == strcmp(infile,"")){ | |
357 printf("%s:%s:%d:\n" | |
358 "\texpected to re %s from file but no file specified\n" | |
359 "" | |
360 , __FUNCTION__, __FILE__, __LINE__ | |
361 , (READ_HMM_2_HMM==iRnPtype?"HMM":"alignment")); | |
362 return FAILURE; | |
363 } | |
364 inf = fopen(infile, "r"); | |
365 if (!inf) OpenFileError(infile); | |
366 Pathname(path,infile); | |
367 | |
368 //} | |
369 //else { | |
370 //inf = stdin; | |
371 //if (v>=2) printf("Reading HMM / multiple alignment from standard input ...\n(To get a help list instead, quit and type %s -h.)\n",program_name); | |
372 //*path='\0'; | |
373 //} | |
374 | |
375 fgetline(line,LINELEN-1,inf); | |
376 } | |
377 | |
378 //if ( (0 != strcmp(infile,"")) && (iCnt > 0) ) | |
379 if ( (READ_HMM_2_HMM == iRnPtype) ){ | |
380 | |
381 if (0 == strcmp(infile,"")){ | |
382 printf("%s:%s:%d: expected to read HMM from file but no file-name\n", | |
383 __FUNCTION__, __FILE__, __LINE__); | |
384 } | |
385 | |
386 // Is infile a HMMER3 file? /* MR1 */ | |
387 if (!strncmp(line,"HMMER3",6)) | |
388 { | |
389 if (v>=2) { | |
390 cout<<"Query file is in HMMER3 format\n"; | |
391 cout<<"WARNING! Use of HMMER3 format as input results in dramatically loss of sensitivity!\n"; | |
392 } | |
393 | |
394 // Read 'query HMMER file | |
395 rewind(inf); | |
396 q.ReadHMMer3(inf,path); | |
397 | |
398 // Don't add transition pseudocounts to query!! | |
399 | |
400 // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a] | |
401 q.PreparePseudocounts(); | |
402 | |
403 // DON'T ADD amino acid pseudocounts to query: pcm=0! q.p[i][a] = f[i][a] | |
404 q.AddAminoAcidPseudocounts(0, par.pca, par.pcb, par.pcc); | |
405 | |
406 /* further down there is a q.Log2LinTransitionProbs | |
407 but only if (iCnt>0), however, we still need it it here (i think), | |
408 there is no danger of doing this twice, as trans_lin is checked | |
409 FIXME (FS, 2011-01-12) */ | |
410 /* further down there is a q.Log2LinTransitionProbs | |
411 but only if (iCnt>0), however, we still need it it here (i think), | |
412 there is no danger of doing this twice, as trans_lin is checked | |
413 FIXME (FS, 2011-01-12) */ | |
414 //if (par.forward >= 1) | |
415 { | |
416 q.Log2LinTransitionProbs(1.0); | |
417 } | |
418 | |
419 } | |
420 // ... or Is infile an old HMMER file? | |
421 else if (!strncmp(line,"HMMER",5)) { | |
422 if (v>=2) { | |
423 cout<<"Query file is in HMMER format\n"; | |
424 cout<<"WARNING! Use of HMMER format as input results in dramatically loss of sensitivity!\n"; | |
425 } | |
426 | |
427 // Read 'query HMMER file | |
428 q.ReadHMMer(inf,path); | |
429 if (v>=2 && q.Neff_HMM>11.0) | |
430 fprintf(stderr,"WARNING: HMM %s looks too diverse (Neff=%.1f>11). Better check the underlying alignment... \n",q.name,q.Neff_HMM); | |
431 | |
432 // Don't add transition pseudocounts to query!! | |
433 | |
434 // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a] | |
435 q.PreparePseudocounts(); | |
436 | |
437 // DON'T ADD amino acid pseudocounts to query: pcm=0! q.p[i][a] = f[i][a] | |
438 q.AddAminoAcidPseudocounts(0, par.pca, par.pcb, par.pcc); | |
439 | |
440 /* further down there is a q.Log2LinTransitionProbs | |
441 but only if (iCnt>0), however, we still need it it here (i think), | |
442 there is no danger of doing this twice, as trans_lin is checked | |
443 FIXME (FS, 2011-01-12) */ | |
444 //if (par.forward >= 1) | |
445 { | |
446 q.Log2LinTransitionProbs(1.0); | |
447 } | |
448 | |
449 } /* it was a HMMer file */ | |
450 | |
451 // ... or is it an hhm file? | |
452 else if (!strncmp(line,"NAME",4) || !strncmp(line,"HH",2)) { | |
453 | |
454 if (v>=2) cout<<"Query file is in HHM format\n"; | |
455 | |
456 // Rewind to beginning of line and read query hhm file | |
457 rewind(inf); | |
458 q.Read(inf,path); | |
459 if (v>=2 && q.Neff_HMM>11.0) | |
460 fprintf(stderr,"WARNING: HMM %s looks too diverse (Neff=%.1f>11). Better check the underlying alignment... \n",q.name,q.Neff_HMM); | |
461 | |
462 // Add transition pseudocounts to query -> q.p[i][a] | |
463 q.AddTransitionPseudocounts(); | |
464 | |
465 // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a] | |
466 q.PreparePseudocounts(); | |
467 | |
468 // Add amino acid pseudocounts to query: q.p[i][a] = (1-tau)*f[i][a] + tau*g[i][a] | |
469 q.AddAminoAcidPseudocounts(); | |
470 | |
471 } /* it was a HHM file */ | |
472 else { | |
473 fprintf(stderr, "%s:%s:%d: Unknown HMM format in infile\n" | |
474 "infile=%s, #seq=%d\n" | |
475 , __FUNCTION__, __FILE__, __LINE__ | |
476 , infile, iCnt); | |
477 return FAILURE; | |
478 } | |
479 | |
480 /*fclose(inf);*/ | |
481 | |
482 /*** transfer class info to struct */ | |
483 prHMM->n_display = q.n_display; | |
484 /* ignore sname*/ | |
485 prHMM->seq = (char **)calloc((q.n_display+1), sizeof(char *)); | |
486 /* FIXME valgrind says bytes get lost in the above calloc during | |
487 * hmm-iteration | |
488 */ | |
489 for (int i = 0; i < q.n_display; i++){ | |
490 /* NOTE: In the original hhalign code the first position | |
491 is kept open ('\0'). This makes it difficult to use | |
492 string functions like strlen/strdup etc. | |
493 Insert a temporary gap (.) to facilitate string operations */ | |
494 char cHead = q.seq[i][0]; | |
495 q.seq[i][0] = '.'; | |
496 prHMM->seq[i] = strdup(q.seq[i]); | |
497 q.seq[i][0] = prHMM->seq[i][0] = cHead; | |
498 } | |
499 prHMM->ncons = q.ncons; | |
500 prHMM->nfirst = q.nfirst; | |
501 prHMM->nss_dssp = q.nss_dssp; | |
502 prHMM->nsa_dssp = q.nsa_dssp; | |
503 prHMM->nss_pred = q.nss_pred; | |
504 prHMM->nss_conf = q.nss_conf; | |
505 prHMM->L = q.L; | |
506 prHMM->N_in = q.N_in; | |
507 prHMM->N_filtered = q.N_filtered; | |
508 const int NEFF_SCORE = 10; /* FIXME: Magic Number */ | |
509 prHMM->Neff_M = (float *)calloc(prHMM->L+1, sizeof(float)); | |
510 prHMM->Neff_I = (float *)calloc(prHMM->L+1, sizeof(float)); | |
511 prHMM->Neff_D = (float *)calloc(prHMM->L+1, sizeof(float)); | |
512 /*for (int i = 0; i < prHMM->L+1; i++){ | |
513 prHMM->Neff_M[i] = prHMM->Neff_I[i] = prHMM->Neff_D[i] = NEFF_SCORE; | |
514 }*/ | |
515 prHMM->Neff_HMM = q.Neff_HMM; | |
516 /* skip longname,name,file,fam,sfam,fold,cl */ | |
517 prHMM->lamda = q.lamda; | |
518 prHMM->mu = q.mu; | |
519 | |
520 HMMshadow rShadow = {0}; /* friend of HMM to access private members */ | |
521 rShadow.copyHMMtoShadow(q); | |
522 | |
523 prHMM->f = (float **)calloc(prHMM->L+1, sizeof(float *)); | |
524 prHMM->g = (float **)calloc(prHMM->L+1, sizeof(float *)); | |
525 prHMM->p = (float **)calloc(prHMM->L+1, sizeof(float *)); | |
526 prHMM->tr = (float **)calloc(prHMM->L+1, sizeof(float *)); | |
527 prHMM->linTr = (float **)calloc(prHMM->L+1, sizeof(float *)); | |
528 for (int i = 0; i < prHMM->L+1; i++){ | |
529 prHMM->f[i] = (float *)calloc(AMINOACIDS, sizeof(float)); | |
530 prHMM->g[i] = (float *)calloc(AMINOACIDS, sizeof(float)); | |
531 prHMM->p[i] = (float *)calloc(AMINOACIDS, sizeof(float)); | |
532 for (int j = 0; j < AMINOACIDS; j++){ | |
533 prHMM->f[i][j] = (float)rShadow.f[i][j]; | |
534 prHMM->g[i][j] = (float)rShadow.g[i][j]; | |
535 prHMM->p[i][j] = (float)rShadow.p[i][j]; | |
536 } | |
537 prHMM->tr[i] = (float *)calloc(STATE_TRANSITIONS, sizeof(float)); | |
538 prHMM->linTr[i] = (float *)calloc(STATE_TRANSITIONS, sizeof(float)); | |
539 for (int j = 0; j< STATE_TRANSITIONS; j++){ | |
540 prHMM->tr[i][j] = (float)rShadow.tr[i][j]; | |
541 prHMM->linTr[i][j] = fpow2(rShadow.tr[i][j]); | |
542 } | |
543 } /*0 <= i < prHMM->L+1 */ | |
544 /* skip trans_lin,ss_dssp,sa_dssp,ss_pred,ss_conf,Xcons */ | |
545 for (int j = 0; j < AMINOACIDS; j++){ | |
546 prHMM->pav[j] = (float)rShadow.pav[j]; | |
547 } | |
548 /* skip pnul */ | |
549 | |
550 } /* have read HHM from file */ | |
551 /*else if ( ((NULL != ppcProf) && (iCnt > 0) && ('\0' != ppcProf[0][0])) || | |
552 ( (0 != strcmp(infile,"") && (0 == iCnt) )) )*/ | |
553 else if ( (INTERN_ALN_2_HMM == iRnPtype) || (READ_ALN_2_HMM == iRnPtype) ) { | |
554 | |
555 if ( (INTERN_ALN_2_HMM == iRnPtype) && | |
556 ( (NULL == ppcProf) || (iCnt <= 0) || ('\0' == ppcProf[0][0]) ) ){ | |
557 printf("%s:%s:%d: was expecting internal alignment but\n" | |
558 "\tppcProf=%p, #seq=%d, ppcProf[0][0]=%c\n" | |
559 , __FUNCTION__, __FILE__, __LINE__ | |
560 , ppcProf, iCnt, ppcProf[0][0]); | |
561 return FAILURE; | |
562 } | |
563 else if ( (READ_ALN_2_HMM == iRnPtype) && | |
564 (0 == strcmp(infile,"")) ){ | |
565 printf("%s:%s:%d: was expecting to read alignment from file but no filename\n" | |
566 , __FUNCTION__, __FILE__, __LINE__); | |
567 return FAILURE; | |
568 } | |
569 | |
570 /*******************************/ | |
571 /*** (ii) it is an alignment ***/ | |
572 /*******************************/ | |
573 /* transfer alignment information from clustal character array | |
574 into pali/q/t classes */ | |
575 /* | |
576 * NOTE that emissions in HMMer file format contain pseudo-counts. | |
577 * HHM file format does not contain emission pseudo-counts. | |
578 * the structure that stores background HMM information does contain pseudo-counts | |
579 */ | |
580 Alignment* pali; | |
581 if (qali==NULL){ | |
582 pali=new Alignment(iCnt); /* FIXME: pass in iCnt to get rid of MAXSEQ */ | |
583 } | |
584 else{ | |
585 pali=qali; | |
586 } | |
587 | |
588 if (par.calibrate) { | |
589 printf("\nError in %s: only HHM files can be calibrated.\n",program_name); | |
590 printf("Build an HHM file from your alignment with 'hhmake -i %s' and rerun hhsearch with the hhm file\n\n",infile); | |
591 exit(1); | |
592 } | |
593 | |
594 if (v>=2 && strcmp(infile,"stdin")) cout<<infile<<" is in A2M, A3M or FASTA format\n"; | |
595 | |
596 /* Read alignment from infile into matrix X[k][l] as ASCII | |
597 (and supply first line as extra argument) */ | |
598 //if (iCnt > 0) | |
599 if (INTERN_ALN_2_HMM == iRnPtype){ | |
600 pali->Transfer(ppcProf, iCnt); | |
601 } | |
602 //else if (0 == iCnt) | |
603 else if (READ_ALN_2_HMM == iRnPtype){ | |
604 pali->Read(inf, infile, line); | |
605 } | |
606 else { | |
607 printf("%s:%s:%d: FATAL problem\n" | |
608 "infile = (%s), #seq = %d\n" | |
609 , __FUNCTION__, __FILE__, __LINE__ | |
610 , infile, iCnt); | |
611 return FAILURE; | |
612 } | |
613 | |
614 /* Convert ASCII to int (0-20), throw out all insert states, | |
615 record their number in I[k][i] | |
616 and store marked sequences in name[k] and seq[k] */ | |
617 pali->Compress(infile); | |
618 | |
619 /* Sort out the nseqdis most dissimilar sequences for display | |
620 in the output alignments */ | |
621 pali->FilterForDisplay(par.max_seqid, par.coverage, par.qid, | |
622 par.qsc,par.nseqdis); | |
623 | |
624 // Filter alignment for min score per column with core query profile, defined by coverage_core and qsc_core | |
625 //if (par.coresc>-10) pali->HomologyFilter(par.coverage_core, par.qsc_core, par.coresc); | |
626 | |
627 /* Remove sequences with seq. identity larger than seqid percent | |
628 (remove the shorter of two) */ | |
629 pali->N_filtered = pali->Filter(par.max_seqid, par.coverage, | |
630 par.qid, par.qsc, par.Ndiff); | |
631 | |
632 /* Calculate pos-specific weights, | |
633 AA frequencies and transitions -> f[i][a], tr[i][a] */ | |
634 pali->FrequenciesAndTransitions(q); | |
635 if (v>=2 && q.Neff_HMM>11.0) | |
636 fprintf(stderr,"WARNING: alignment %s looks too diverse (Neff=%.1f>11). Better check it with an alignment viewer... \n",q.name,q.Neff_HMM); | |
637 | |
638 /*printf("%d %d %f %d (N,Nf,Neff,L) %s:%s:%d\n" | |
639 , q.N_in, q.N_filtered, q.Neff_HMM, q.L, __FUNCTION__, __FILE__, __LINE__);*/ | |
640 | |
641 // Add transition pseudocounts to query -> p[i][a] | |
642 q.AddTransitionPseudocounts(); | |
643 | |
644 /* Generate an amino acid frequency matrix from f[i][a] | |
645 with full pseudocount admixture (tau=1) -> g[i][a] */ | |
646 q.PreparePseudocounts(); | |
647 | |
648 /* Add amino acid pseudocounts to query: | |
649 p[i][a] = (1-tau)*f[i][a] + tau*g[i][a] */ | |
650 q.AddAminoAcidPseudocounts(); | |
651 | |
652 /* ****** add aligned background pseudocounts ***** */ | |
653 HMMshadow rShadowQ = {0}; | |
654 rShadowQ.copyHMMtoShadow(q); | |
655 | |
656 AddBackgroundFrequencies(rShadowQ.f, rShadowQ.p, rShadowQ.tr, | |
657 q.L, prHMM, | |
658 q.seq, pcPrealigned, iCnt, pcRepresent); | |
659 | |
660 | |
661 if (qali==NULL){ | |
662 delete(pali); pali = NULL; | |
663 } | |
664 | |
665 } /* else if (NULL != ppcProf) // not hmmer/hhalign but alignment */ | |
666 | |
667 //else if ((prHMM->L > 0) && ('\0' == ppcProf[0][0])) | |
668 else if (INTERN_HMM_2_HMM == iRnPtype){ | |
669 | |
670 /******************************************/ | |
671 /*** (iii) re-cycle old HMM information ***/ | |
672 /******************************************/ | |
673 | |
674 if (prHMM->L <= 0){ | |
675 printf("%s:%s:%d: was expecting to copy HMM structure but L=%d\n" | |
676 , __FUNCTION__, __FILE__, __LINE__, prHMM->L); | |
677 } | |
678 | |
679 printf("%s:%s:%d: RE-CYCLING HMM\n", __FUNCTION__, __FILE__, __LINE__); | |
680 | |
681 #if 0 | |
682 q.n_display = prHMM->n_display; | |
683 /* ignore sname*/ | |
684 for (int i = 0; i < q.n_display; i++){ | |
685 /* NOTE: In the original hhalign code the first position | |
686 is kept open ('\0'). This makes it difficult to use | |
687 string functions like strlen/strdup etc. | |
688 Insert a temporary gap (.) to facilitate string operations */ | |
689 char cHead = prHMM->seq[i][0]; | |
690 prHMM->seq[i][0] = '.'; | |
691 q.seq[i] = strdup(prHMM->seq[i]); | |
692 q.seq[i][0] = prHMM->seq[i][0] = cHead; | |
693 } | |
694 q.nfirst = prHMM->nfirst; | |
695 #else | |
696 q.n_display = 1; | |
697 q.nfirst = 0; | |
698 char cHead = prHMM->seq[prHMM->nfirst][0]; | |
699 prHMM->seq[prHMM->nfirst][0] = '.'; | |
700 q.seq[0] = strdup(prHMM->seq[prHMM->nfirst]); | |
701 q.seq[q.n_display-1][0] = prHMM->seq[prHMM->nfirst][0] = cHead; | |
702 #endif | |
703 q.ncons = prHMM->ncons; | |
704 q.nss_dssp = prHMM->nss_dssp; | |
705 q.nsa_dssp = prHMM->nsa_dssp; | |
706 q.nss_pred = prHMM->nss_pred; | |
707 q.nss_conf = prHMM->nss_conf; | |
708 q.L = prHMM->L; | |
709 q.N_in = prHMM->N_in; | |
710 q.N_filtered = prHMM->N_filtered; | |
711 #define NEFF_SCORE 10 /* FIXME: Magic Number */ | |
712 /*for (int i; i < prHMM->L+1; i++){ | |
713 q.Neff_M[i] = q.Neff_I[i] = q.Neff_D[i] = NEFF_SCORE; | |
714 }*/ | |
715 q.Neff_HMM = prHMM->Neff_HMM; | |
716 /* skip longname,name,file,fam,sfam,fold,cl */ | |
717 q.lamda = prHMM->lamda; | |
718 q.mu = prHMM->mu; | |
719 | |
720 HMMshadow rShadow = {0}; /* friend of HMM to access private members */ | |
721 rShadow.copyShadowToHMM(q, *prHMM); | |
722 | |
723 } | |
724 | |
725 if (iCnt > 0){ | |
726 if (par.forward>=1) q.Log2LinTransitionProbs(1.0); | |
727 } | |
728 | |
729 if (NULL != inf){ | |
730 fclose(inf); | |
731 } | |
732 | |
733 return OK; | |
734 | |
735 } /*** end: ReadAndPrepare() ***/ | |
736 | |
737 /** | |
738 * FreeHMMstruct() | |
739 * | |
740 * @brief FIXME | |
741 * | |
742 * @param[in,out] | |
743 */ | |
744 extern "C" void | |
745 FreeHMMstruct(hmm_light *prHMM) | |
746 { | |
747 int i; | |
748 | |
749 if (NULL != prHMM->f){ | |
750 for (i = 0; i < prHMM->L+1; i++){ | |
751 if (NULL != prHMM->f[i]){ | |
752 free(prHMM->f[i]); prHMM->f[i] = NULL; | |
753 } | |
754 } /* 0 <= i < prHMM->L+1 */ | |
755 free(prHMM->f); prHMM->f = NULL; | |
756 } | |
757 if (NULL != prHMM->g){ | |
758 for (i = 0; i < prHMM->L+1; i++){ | |
759 if (NULL != prHMM->g[i]){ | |
760 free(prHMM->g[i]); prHMM->g[i] = NULL; | |
761 } | |
762 } /* 0 <= i < prHMM->L+1 */ | |
763 free(prHMM->g); prHMM->g = NULL; | |
764 } | |
765 if (NULL != prHMM->p){ | |
766 for (i = 0; i < prHMM->L+1; i++){ | |
767 if (NULL != prHMM->p[i]){ | |
768 free(prHMM->p[i]); prHMM->p[i] = NULL; | |
769 } | |
770 } /* 0 <= i < prHMM->L+1 */ | |
771 free(prHMM->p); prHMM->p = NULL; | |
772 } | |
773 if (NULL != prHMM->tr){ | |
774 for (i = 0; i < prHMM->L+1; i++){ | |
775 if (NULL != prHMM->tr[i]){ | |
776 free(prHMM->tr[i]); prHMM->tr[i] = NULL; | |
777 } | |
778 } /* 0 <= i < prHMM->L+1 */ | |
779 free(prHMM->tr); prHMM->tr = NULL; | |
780 } | |
781 if (NULL != prHMM->linTr){ | |
782 for (i = 0; i < prHMM->L+1; i++){ | |
783 if (NULL != prHMM->linTr[i]){ | |
784 free(prHMM->linTr[i]); prHMM->linTr[i] = NULL; | |
785 } | |
786 } /* 0 <= i < prHMM->L+1 */ | |
787 free(prHMM->linTr); prHMM->linTr = NULL; | |
788 } | |
789 | |
790 if (NULL != prHMM->Neff_M){ | |
791 free(prHMM->Neff_M); prHMM->Neff_M = NULL; | |
792 } | |
793 if (NULL != prHMM->Neff_I){ | |
794 free(prHMM->Neff_I); prHMM->Neff_I = NULL; | |
795 } | |
796 if (NULL != prHMM->Neff_D){ | |
797 free(prHMM->Neff_D); prHMM->Neff_D = NULL; | |
798 } | |
799 | |
800 if (NULL != prHMM->seq){ | |
801 for (i = 0; i < prHMM->n_display; i++){ | |
802 if (NULL != prHMM->seq[i]){ | |
803 free(prHMM->seq[i]); prHMM->seq[i] = NULL; | |
804 } | |
805 } | |
806 free(prHMM->seq); prHMM->seq = NULL; | |
807 } | |
808 | |
809 memset(prHMM, 0, sizeof(hmm_light)); | |
810 | |
811 } /*** end: FreeHMMstruct() ***/ | |
812 | |
813 | |
814 /** | |
815 * @brief comparisin function for ascending qsort() of doubles | |
816 */ | |
817 int | |
818 CompDblAsc(const void *pv1, const void *pv2){ | |
819 | |
820 double d1 = *(double *)pv1; | |
821 double d2 = *(double *)pv2; | |
822 | |
823 if (d1 > d2) { return +1; } | |
824 else if (d1 < d2) { return -1; } | |
825 else { return 0; } | |
826 | |
827 } /*** end: CompDblAsc() ***/ | |
828 | |
829 | |
830 /** | |
831 * AlnToHMM2() | |
832 * | |
833 * @brief convert alignment into HMM (hhmake) | |
834 * | |
835 * @param[out] prHMM | |
836 * structure with pseudocounts etc | |
837 * @param[in] pcHMM_input | |
838 * name of file with HMM | |
839 * | |
840 */ | |
841 extern "C" int | |
842 AlnToHMM2(hmm_light *rHMM_p, | |
843 char **ppcSeq, int iN){ | |
844 | |
845 if (0 == par.M){ | |
846 SetDefaults(); | |
847 SetSubstitutionMatrix(); | |
848 par.cons = 1; | |
849 par.M = 2; | |
850 par.forward=2; | |
851 par.Mgaps=100; | |
852 const int ciGoodMeasureSeq = 10; | |
853 int iAuxLen = strlen(ppcSeq[0]); | |
854 par.maxResLen = iAuxLen; | |
855 par.maxResLen += ciGoodMeasureSeq; | |
856 par.maxColCnt = iAuxLen + ciGoodMeasureSeq; | |
857 par.max_seqid=DEFAULT_FILTER; | |
858 par.loc=0; par.mact=0; | |
859 /* some minor parameters, affecting alignment (i think) */ | |
860 par.p = 0.0; /* minimum threshold for inclusion in hit list */ | |
861 par.Z = 100; /* minimum threshold for inclusion in hit list and alignment listing */ | |
862 par.z = 1; /* min number of lines in hit list */ | |
863 par.B = 100; /* max number of alignments */ | |
864 par.b = 1; /* min number of alignments */ | |
865 par.E = 1e6; /* maximum threshold for inclusion in hit list and alignment listing */ | |
866 par.altali=1;par.hitrank=0;par.showcons=1; par.showdssp=1;par.showpred=1;par.nseqdis=iN;par.cons=1; | |
867 } | |
868 | |
869 const int ciGoodMeasure = 10; | |
870 int iLen = strlen(ppcSeq[0]) + ciGoodMeasure; | |
871 if (iLen > par.maxResLen){ | |
872 par.maxResLen = par.maxColCnt = iLen; | |
873 } | |
874 HMM rTemp(iN+2, iLen); /* temporary template */ | |
875 Alignment rTempAli(iN+2, iLen); /* temporary alignment */ | |
876 int iParCons = par.cons; | |
877 | |
878 /*printf(">>>>>>>>>>> %s:%s:%d: there are %d sequences\n", __FUNCTION__, __FILE__, __LINE__, iN);*/ | |
879 | |
880 par.cons = 1; | |
881 if (OK != ReadAndPrepare(INTERN_ALN_2_HMM, | |
882 ppcSeq, iN, rHMM_p, | |
883 NULL/*prealigned*/, NULL/*representative*/, NULL/*weights*/, //YES/*transfer*/, | |
884 (char*)("")/*in-file*/, rTemp, &rTempAli)) { | |
885 return FAILURE; | |
886 } | |
887 par.cons = iParCons; | |
888 | |
889 /******/ | |
890 /*** transfer class info to struct */ | |
891 rHMM_p->n_display = rTemp.n_display; | |
892 rHMM_p->sname = NULL; | |
893 rHMM_p->seq = (char **)calloc((rTemp.n_display+1), sizeof(char *)); | |
894 | |
895 for (int i = 0; i < rTemp.n_display; i++){ | |
896 /* NOTE: In the original hhalign code the first position | |
897 is kept open ('\0'). This makes it difficult to use | |
898 string functions like strlen/strdup etc. | |
899 Insert a temporary gap (.) to facilitate string operations */ | |
900 char cHead = rTemp.seq[i][0]; | |
901 rTemp.seq[i][0] = '.'; | |
902 rHMM_p->seq[i] = strdup(rTemp.seq[i]); | |
903 rTemp.seq[i][0] = rHMM_p->seq[i][0] = cHead; | |
904 } | |
905 rHMM_p->ncons = rTemp.ncons; | |
906 rHMM_p->nfirst = rTemp.nfirst; | |
907 if (-1 == rHMM_p->ncons){ | |
908 /* ncons needed later for alignment of | |
909 representative sequence and copy of profile. | |
910 ncons not always set (-1 default), | |
911 this will cause segmentation fault. | |
912 nfirst (probably) right index - | |
913 no problem if not */ | |
914 rHMM_p->ncons = rTemp.nfirst; | |
915 } | |
916 rHMM_p->nss_dssp = rTemp.nss_dssp; | |
917 rHMM_p->nsa_dssp = rTemp.nsa_dssp; | |
918 rHMM_p->nss_pred = rTemp.nss_pred; | |
919 rHMM_p->nss_conf = rTemp.nss_conf; | |
920 rHMM_p->L = rTemp.L; | |
921 rHMM_p->N_in = rTemp.N_in; | |
922 rHMM_p->N_filtered = rTemp.N_filtered; | |
923 #define NEFF_SCORE 10 /* FIXME: Magic Number */ //// get rid of that, FS, 2010-12-22 | |
924 rHMM_p->Neff_M = (float *)calloc(rHMM_p->L+1, sizeof(float)); | |
925 rHMM_p->Neff_I = (float *)calloc(rHMM_p->L+1, sizeof(float)); | |
926 rHMM_p->Neff_D = (float *)calloc(rHMM_p->L+1, sizeof(float)); | |
927 for (int i = 0; i < rHMM_p->L+1; i++){ | |
928 rHMM_p->Neff_M[i] = rHMM_p->Neff_I[i] = rHMM_p->Neff_D[i] = NEFF_SCORE; //// get rid of that, FS, 2010-12-22 | |
929 } | |
930 rHMM_p->Neff_HMM = rTemp.Neff_HMM; | |
931 /* skip longname,name,file,fam,sfam,fold,cl */ | |
932 rHMM_p->lamda = rTemp.lamda; | |
933 rHMM_p->mu = rTemp.mu; | |
934 | |
935 HMMshadow rShadow = {0}; /* friend of HMM to access private members */ | |
936 rShadow.copyHMMtoShadow(rTemp); | |
937 | |
938 rHMM_p->Neff_M = (float *)calloc(rHMM_p->L+1, sizeof(float)); | |
939 rHMM_p->Neff_I = (float *)calloc(rHMM_p->L+1, sizeof(float)); | |
940 rHMM_p->Neff_D = (float *)calloc(rHMM_p->L+1, sizeof(float)); | |
941 rHMM_p->f = (float **)calloc(rHMM_p->L+1, sizeof(float *)); | |
942 rHMM_p->g = (float **)calloc(rHMM_p->L+1, sizeof(float *)); | |
943 rHMM_p->p = (float **)calloc(rHMM_p->L+1, sizeof(float *)); | |
944 rHMM_p->tr = (float **)calloc(rHMM_p->L+1, sizeof(float *)); | |
945 rHMM_p->linTr = (float **)calloc(rHMM_p->L+1, sizeof(float *)); | |
946 | |
947 for (int i = 0; i < rHMM_p->L+1; i++){ | |
948 rHMM_p->Neff_M[i] = (float)rShadow.Neff_M[i]; | |
949 rHMM_p->Neff_I[i] = (float)rShadow.Neff_I[i]; | |
950 rHMM_p->Neff_D[i] = (float)rShadow.Neff_D[i]; | |
951 rHMM_p->f[i] = (float *)calloc(AMINOACIDS, sizeof(float)); | |
952 rHMM_p->g[i] = (float *)calloc(AMINOACIDS, sizeof(float)); | |
953 rHMM_p->p[i] = (float *)calloc(AMINOACIDS, sizeof(float)); | |
954 for (int j = 0; j < AMINOACIDS; j++){ | |
955 rHMM_p->f[i][j] = (float)rShadow.f[i][j]; | |
956 rHMM_p->g[i][j] = (float)rShadow.g[i][j]; | |
957 rHMM_p->p[i][j] = (float)rShadow.p[i][j]; | |
958 } | |
959 rHMM_p->tr[i] = (float *)calloc(STATE_TRANSITIONS, sizeof(float)); | |
960 rHMM_p->linTr[i] = (float *)calloc(STATE_TRANSITIONS, sizeof(float)); | |
961 for (int j = 0; j< STATE_TRANSITIONS; j++){ | |
962 rHMM_p->tr[i][j] = (float)rShadow.tr[i][j]; | |
963 rHMM_p->linTr[i][j] = fpow2(rShadow.tr[i][j]); | |
964 } | |
965 } /*0 <= i < prHMM->L+1 */ | |
966 /* skip trans_lin,ss_dssp,sa_dssp,ss_pred,ss_conf,Xcons */ | |
967 for (int j = 0; j < AMINOACIDS; j++){ | |
968 rHMM_p->pav[j] = (float)rShadow.pav[j]; | |
969 } | |
970 /* skip pnul */ | |
971 | |
972 | |
973 /******/ | |
974 | |
975 | |
976 rTemp.ClobberGlobal(); | |
977 rTempAli.ClobberGlobal(); | |
978 | |
979 return OK; | |
980 | |
981 } /*** end of AlnToHMM2() ***/ | |
982 | |
983 | |
984 /** | |
985 * HHMake_Wrapper() | |
986 * | |
987 * @brief turn alignment (from file) into HHM (HMM) on file | |
988 * | |
989 * @param[out] hmm_out | |
990 * name of file with HMM info corresponding to tmp_aln | |
991 * @param[in] tmp_aln | |
992 * name of file with alignment, to be turned into HMM (HHM) | |
993 * | |
994 */ | |
995 extern "C" int | |
996 HHMake_Wrapper(char *tmp_aln, char *hmm_out) | |
997 { | |
998 | |
999 HMM rTemp; /* temporary template */ | |
1000 Alignment rTempAli; /* temporary alignment */ | |
1001 hmm_light *rHMM_p = NULL; | |
1002 | |
1003 /** Note: | |
1004 this is a wrapper for a stand-alone program hhmake. | |
1005 hhmake uses a different set of parameters than hhalign. | |
1006 However, all parameters are GLOBAL. | |
1007 So at this point we register the hhalign settings, | |
1008 reset them to hhmake settings and set them back | |
1009 at the end of the function | |
1010 */ | |
1011 | |
1012 /* save settings from hhalign */ | |
1013 int iParShowcons=par.showcons; | |
1014 int iParAppend=par.append; | |
1015 int iParNseqdis=par.nseqdis; | |
1016 int iParMark=par.mark; | |
1017 int iParMaxSeqid=par.max_seqid; | |
1018 int iParQid=par.qid; | |
1019 double dParQsc=par.qsc; | |
1020 int iParCoverage=par.coverage; | |
1021 int iParNdiff=par.Ndiff; | |
1022 int iParCoverageCore=par.coverage_core; | |
1023 double dParQscCore=par.qsc_core; | |
1024 double dParCoresc=par.coresc; | |
1025 int iParM=par.M; | |
1026 int iParMgaps=par.Mgaps; | |
1027 int iParMatrix=par.matrix; | |
1028 int iParPcm=par.pcm; | |
1029 int iParPcw=par.pcw; | |
1030 double dParGapb=par.gapb; | |
1031 int iParWg=par.wg; | |
1032 | |
1033 /* these are settings suitable for hhmake */ | |
1034 par.showcons=1; // write consensus sequence into hhm file | |
1035 par.append=0; // overwrite output file | |
1036 par.nseqdis=10; // maximum number of query or template sequences to be recoreded in HMM and diplayed in output alignments | |
1037 par.mark=0; // 1: only marked sequences (or first) get displayed; 0: most divergent ones get displayed | |
1038 par.max_seqid=90; // default for maximum sequence identity threshold | |
1039 par.qid=0; // default for maximum sequence identity threshold | |
1040 par.qsc=-20.0f; // default for minimum score per column with query | |
1041 par.coverage=0; // default for minimum coverage threshold | |
1042 par.Ndiff=100; // pick Ndiff most different sequences from alignment | |
1043 par.coverage_core=30; // Minimum coverage for sequences in core alignment | |
1044 par.qsc_core=0.3f; // Minimum score per column with core alignment (HMM) | |
1045 par.coresc=-20.0f; // Minimum score per column with core alignment (HMM) | |
1046 par.M=1; // match state assignment is by A2M/A3M | |
1047 par.Mgaps=50; // above this percentage of gaps, columns are assigned to insert states | |
1048 par.matrix=0; // Subst.matrix 0: Gonnet, 1: HSDM, 2: BLOSUM50 3: BLOSUM62 | |
1049 par.pcm=0; // no amino acid and transition pseudocounts added | |
1050 par.pcw=0; // wc>0 weighs columns according to their intra-clomun similarity | |
1051 par.gapb=0.0; // default values for transition pseudocounts; 0.0: add no transition pseudocounts! | |
1052 par.wg=0; // 0: use local sequence weights 1: use local ones | |
1053 | |
1054 | |
1055 if (OK != ReadAndPrepare(READ_ALN_2_HMM, | |
1056 NULL, 0, rHMM_p, NULL, NULL, NULL, | |
1057 tmp_aln, rTemp, &rTempAli)) { | |
1058 return FAILURE; | |
1059 } | |
1060 | |
1061 rTemp.WriteToFile(hmm_out); | |
1062 | |
1063 | |
1064 | |
1065 /* restore settings from hhalign */ | |
1066 par.showcons=iParShowcons; | |
1067 par.append=iParAppend; | |
1068 par.nseqdis=iParNseqdis; | |
1069 par.mark=iParMark; | |
1070 par.max_seqid=iParMaxSeqid; | |
1071 par.qid=iParQid; | |
1072 par.qsc=dParQsc; | |
1073 par.coverage=iParCoverage; | |
1074 par.Ndiff=iParNdiff; | |
1075 par.coverage_core=iParCoverageCore; | |
1076 par.qsc_core=dParQscCore; | |
1077 par.coresc=dParCoresc; | |
1078 par.M=iParM; | |
1079 par.Mgaps=iParMgaps; | |
1080 par.matrix=iParMatrix; | |
1081 par.pcm=iParPcm; | |
1082 par.pcw=iParPcw; | |
1083 par.gapb=dParGapb; | |
1084 par.wg=iParWg; | |
1085 | |
1086 | |
1087 /* prepare exit */ | |
1088 rTemp.ClobberGlobal(); | |
1089 rTempAli.ClobberGlobal(); | |
1090 | |
1091 return OK; | |
1092 } | |
1093 | |
1094 /** | |
1095 * readHMMWrapper() | |
1096 * | |
1097 * @brief read HMM from file, copy (relevant) info into struct | |
1098 * | |
1099 * @param[out] prHMM | |
1100 * structure with pseudocounts etc, probably uninitialised on entry | |
1101 * @param[in] pcHMM_input | |
1102 * name of file with HMM | |
1103 * | |
1104 */ | |
1105 extern "C" int | |
1106 readHMMWrapper(hmm_light *rHMM_p, | |
1107 char *pcHMM_input) | |
1108 { | |
1109 par.maxResLen = 15002; | |
1110 HMM rTemp(1000,par.maxResLen); /* temporary template */ | |
1111 Alignment rTempAli; /* temporary alignment */ | |
1112 /* AW changed init from {0} to 0 because it failed to compile with | |
1113 * gcc 4.3.3 with the following error: | |
1114 * error: braces around initializer for non-aggregate type | |
1115 */ | |
1116 /* FS taken out initialiser alltogether */ | |
1117 | |
1118 /* 0th arg of RnP is the type of RnP, ie, | |
1119 enum {INTERN_ALN_2_HMM = 0, READ_ALN_2_HMM, READ_HMM_2_HMM};*/ | |
1120 /* 1st arg of ReadAndPrepare() is ppcSeqs, 2nd is #seq */ | |
1121 /* FIXME: at this stage the 3rd arg, rHMM_p, only acts as a dummy. | |
1122 this is rather silly, as it is the actual struct that will | |
1123 carry the HMM info at the end. | |
1124 If we write it already in ReadAndPrepare() then we could | |
1125 dispense with all this friend-class nonsense */ | |
1126 if (OK != ReadAndPrepare(READ_HMM_2_HMM, | |
1127 NULL, 0, rHMM_p, NULL, NULL, NULL, | |
1128 pcHMM_input, rTemp, &rTempAli)) { | |
1129 return FAILURE; | |
1130 } | |
1131 | |
1132 /* an important piece of information I want to get out of here | |
1133 is the consenssus sequence. there are however certain | |
1134 Pfam HMMs that don't trigger consensus calculation. | |
1135 at the moment I (FS) don't understand why this is | |
1136 (or rather why this is not). The proper place to do this | |
1137 should be inside ReadAndPrepare():ReadHMMer(), but | |
1138 I have not quite penetrated the logic there. | |
1139 Therefore I try to catch this condition at this point (here) | |
1140 and rectify it. | |
1141 */ | |
1142 if (-1 == rHMM_p->ncons){ | |
1143 int i, iA; | |
1144 rHMM_p->ncons = rHMM_p->nfirst; | |
1145 | |
1146 for (i = 0; i < rHMM_p->L; i++){ | |
1147 double dPmax = 0.00; | |
1148 int iAmax = -1; | |
1149 for (iA = 0; iA < AMINOACIDS; iA++){ | |
1150 if (rHMM_p->f[i][iA] > dPmax){ | |
1151 iAmax = iA; | |
1152 dPmax = rHMM_p->f[i][iA]; | |
1153 } | |
1154 } /* (0 <= iA < AMINOACIDS) */ | |
1155 rHMM_p->seq[rHMM_p->ncons][i] = i2aa(iAmax); | |
1156 } /* (0 <= i < rHMM_p->L) */ | |
1157 | |
1158 } /* ncons not set */ | |
1159 | |
1160 | |
1161 rTemp.ClobberGlobal(); | |
1162 rTempAli.ClobberGlobal(); | |
1163 | |
1164 return OK; | |
1165 | |
1166 } /*** end: readHMMWrapper() ***/ | |
1167 | |
1168 | |
1169 | |
1170 | |
1171 | |
1172 | |
1173 ///////////////////////////////////////////////////////////////////////////// | |
1174 /** | |
1175 * @brief Do precalculations for q and t to prepare comparison | |
1176 */ | |
1177 void | |
1178 PrepareTemplate(HMM& q, HMM& t, int format) | |
1179 { | |
1180 if (format==0) // HHM format | |
1181 { | |
1182 // Add transition pseudocounts to template | |
1183 t.AddTransitionPseudocounts(); | |
1184 | |
1185 // Generate an amino acid frequency matrix from t.f[i][a] with max pseudocounts (tau=1) -> t.g[i][a] | |
1186 t.PreparePseudocounts(); | |
1187 | |
1188 // Add amino acid pseudocounts to template: t.p[i][a] = (1-tau)*f[i][a] + tau*g[i][a] | |
1189 t.AddAminoAcidPseudocounts(); | |
1190 } | |
1191 else // HHMER format | |
1192 { | |
1193 // Don't add transition pseudocounts to template | |
1194 // t.AddTransitionPseudocounts(par.gapd, par.gape, par.gapf, par.gapg, par.gaph, par.gapi, 0.0); | |
1195 | |
1196 // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a] | |
1197 t.PreparePseudocounts(); | |
1198 | |
1199 // DON'T ADD amino acid pseudocounts to temlate: pcm=0! t.p[i][a] = t.f[i][a] | |
1200 t.AddAminoAcidPseudocounts(0, par.pca, par.pcb, par.pcc); | |
1201 } | |
1202 | |
1203 // Modify transition probabilities to include SS-dependent penalties | |
1204 if (par.ssgap) t.UseSecStrucDependentGapPenalties(); | |
1205 | |
1206 if (par.forward>=1) t.Log2LinTransitionProbs(1.0); | |
1207 | |
1208 // Factor Null model into HMM t | |
1209 // ATTENTION! t.p[i][a] is divided by pnul[a] (for reasons of efficiency) => do not reuse t.p | |
1210 t.IncludeNullModelInHMM(q,t); // Can go BEFORE the loop if not dependent on template | |
1211 | |
1212 return; | |
1213 } | |
1214 | |
1215 | |
1216 /*** end of hhfunc-C.h ***/ |