Mercurial > repos > clustalomega > clustalomega
comparison clustalomega/clustal-omega-1.0.2/src/clustal/ktuple_pair.c @ 1:bc707542e5de
Uploaded
author | clustalomega |
---|---|
date | Thu, 21 Jul 2011 13:35:08 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:ff1768533a07 | 1:bc707542e5de |
---|---|
1 /* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */ | |
2 | |
3 /********************************************************************* | |
4 * Clustal Omega - Multiple sequence alignment | |
5 * | |
6 * Copyright (C) 2010 University College Dublin | |
7 * | |
8 * Clustal-Omega is free software; you can redistribute it and/or | |
9 * modify it under the terms of the GNU General Public License as | |
10 * published by the Free Software Foundation; either version 2 of the | |
11 * License, or (at your option) any later version. | |
12 * | |
13 * This file is part of Clustal-Omega. | |
14 * | |
15 ********************************************************************/ | |
16 | |
17 /* | |
18 * RCS $Id: ktuple_pair.c 230 2011-04-09 15:37:50Z andreas $ | |
19 * | |
20 * | |
21 * K-Tuple code for pairwise alignment (Wilbur and Lipman, 1983; PMID | |
22 * 6572363). Most code taken from showpair.c (Clustal 1.83) | |
23 * DD: some functions now have lots of parameters as static variables | |
24 * were removed to make code OpenMP-friendly | |
25 * | |
26 */ | |
27 | |
28 #ifdef HAVE_CONFIG_H | |
29 #include "config.h" | |
30 #endif | |
31 | |
32 | |
33 #include <stdio.h> | |
34 #include <string.h> | |
35 #include <ctype.h> | |
36 #include <stdlib.h> | |
37 #include <math.h> | |
38 #include <assert.h> | |
39 | |
40 #ifdef HAVE_OPENMP | |
41 #include <omp.h> | |
42 #endif | |
43 | |
44 #include "squid/squid.h" | |
45 #include "util.h" | |
46 #include "symmatrix.h" | |
47 #include "ktuple_pair.h" | |
48 #include "log.h" | |
49 #include "progress.h" | |
50 | |
51 #define END_MARK -3 /* see interface.c in 1.83 */ | |
52 #define NUMRES 32 /* max size of comparison matrix */ | |
53 | |
54 /* see notes below */ | |
55 #undef SORT_LAST_ELEMENT_AS_WELL | |
56 | |
57 /* gap_pos1 = NUMRES-2; /@ code for gaps inserted by clustalw @/ */ | |
58 static const int GAP_POS2 = NUMRES-1; /* code for gaps already in alignment */ | |
59 static bool DNAFLAG = FALSE; | |
60 | |
61 static const char *AMINO_ACID_CODES = "ABCDEFGHIKLMNPQRSTUVWXYZ-"; | |
62 static const char *NUCLEIC_ACID_CODES = "ACGTUN-"; | |
63 /* As far as I understand the gap symbol should not be necessary here, | |
64 * because we use isgap for testing later anyway. But changing this, | |
65 * will affect max_res_code and max_nuc as well. So I leave it for now | |
66 * as it is. AW | |
67 */ | |
68 | |
69 static bool percent = TRUE; | |
70 | |
71 static void make_ptrs(int *tptr, int *pl, const int naseq, const int l, const int ktup, const int max_res_code, char **seq_array); | |
72 static void put_frag(const int fs, const int v1, const int v2, const int flen, const int curr_frag, int *next, int *maxsf, int **accum); | |
73 static bool frag_rel_pos(int a1, int b1, int a2, int b2, int ktup); | |
74 static void des_quick_sort(int *array1, int *array2, const int array_size); | |
75 static void pair_align(int seq_no, int l1, int l2, int max_res_code, ktuple_param_t *aln_param, | |
76 char **seq_array, int *maxsf, int **accum, int max_aln_length, | |
77 int *zza, int *zzb, int *zzc, int *zzd); | |
78 static void encode(char *seq, char *naseq, int l, const char *res_codes); | |
79 static int res_index(const char *lookup, char c); | |
80 | |
81 | |
82 typedef struct { | |
83 int i1; | |
84 int i2; | |
85 } two_ints_t; | |
86 | |
87 | |
88 | |
89 /* default ktuple pairwise alignment parameters | |
90 * | |
91 */ | |
92 /* protein | |
93 */ | |
94 /* designated initializer */ | |
95 const ktuple_param_t default_protein_param = { | |
96 .ktup = 1, | |
97 .wind_gap = 3, | |
98 .signif = 5, | |
99 .window = 5, | |
100 }; | |
101 /* dna | |
102 */ | |
103 /* designated initializer */ | |
104 const ktuple_param_t default_dna_param = { | |
105 .ktup = 2, | |
106 .wind_gap = 5, | |
107 .signif = 4, | |
108 .window = 4, | |
109 }; | |
110 | |
111 | |
112 /** | |
113 * note: naseq should be unit-offset | |
114 */ | |
115 static void | |
116 encode(char *seq, char *naseq, int l, const char *res_codes) | |
117 { | |
118 /* code seq as ints .. use GAP_POS2 for gap */ | |
119 register int i; | |
120 bool seq_contains_unknown_char = FALSE; | |
121 /*LOG_DEBUG("seq=%s naseq=%p l=%d", &(seq[1]), naseq, l); */ | |
122 | |
123 | |
124 for (i=1; i<=l; i++) { | |
125 char res = toupper(seq[i]); | |
126 if (isgap(res)) { | |
127 naseq[i] = GAP_POS2; /* gap in input */ | |
128 } else { | |
129 naseq[i] = res_index(res_codes, res); | |
130 } | |
131 | |
132 /*LOG_DEBUG("Character '%c' at pos %d", res, i);*/ | |
133 if (-1 == naseq[i]) { | |
134 seq_contains_unknown_char = TRUE; | |
135 /*LOG_DEBUG("Unknown character '%c' at pos %d", res, i);*/ | |
136 } | |
137 /*LOG_DEBUG("na_seq[%d]=%d", i, naseq[i]);*/ | |
138 } | |
139 | |
140 if (TRUE == seq_contains_unknown_char) | |
141 Log(&rLog, LOG_WARN, "Unknown character in seq '%s'", &(seq[1])); | |
142 | |
143 naseq[i] = END_MARK; | |
144 | |
145 return; | |
146 } | |
147 /* end of encode */ | |
148 | |
149 | |
150 /** | |
151 * | |
152 */ | |
153 static int | |
154 res_index(const char *t, char c) | |
155 { | |
156 register int i; | |
157 for (i=0; t[i] && t[i] != c; i++) | |
158 ; | |
159 if (t[i]) { | |
160 return (i); | |
161 } else { | |
162 return -1; | |
163 } | |
164 } | |
165 /* end of res_index */ | |
166 | |
167 | |
168 /** | |
169 * | |
170 */ | |
171 static void | |
172 make_ptrs(int *tptr, int *pl, const int naseq, const int l, const int ktup, const int max_res_code, char **seq_array) | |
173 { | |
174 /* FIXME make 10 a constant and give it a nice name */ | |
175 static int a[10]; | |
176 int i, j, code, flag; | |
177 char residue; | |
178 const int limit = (int) pow((double)(max_res_code+1),(double)ktup); | |
179 | |
180 for (i=1;i<=ktup;i++) | |
181 a[i] = (int) pow((double)(max_res_code+1),(double)(i-1)); | |
182 | |
183 for (i=1; i<=limit; ++i) | |
184 pl[i]=0; | |
185 for (i=1; i<=l; ++i) | |
186 tptr[i]=0; | |
187 | |
188 for (i=1; i<=(l-ktup+1); ++i) { | |
189 code=0; | |
190 flag=FALSE; | |
191 for (j=1; j<=ktup; ++j) { | |
192 /* Log(&rLog, LOG_FORCED_DEBUG, "naseq=%d i=%d j=%d seq_array[naseq]=%p", | |
193 * naseq, i, j, seq_array[naseq]); | |
194 */ | |
195 residue = seq_array[naseq][i+j-1]; | |
196 /* Log(&rLog, LOG_FORCED_DEBUG, "residue = %d", residue); */ | |
197 if ((residue<0) || (residue > max_res_code)){ | |
198 flag=TRUE; | |
199 break; | |
200 } | |
201 code += ((residue) * a[j]); | |
202 } | |
203 if (flag) | |
204 continue; | |
205 ++code; | |
206 if (0 != pl[code]) | |
207 tptr[i] =pl[code]; | |
208 pl[code] = i; | |
209 } | |
210 | |
211 return; | |
212 } | |
213 /* end of make_ptrs */ | |
214 | |
215 | |
216 /** | |
217 * | |
218 * FIXME Why hardcoding of 5? | |
219 */ | |
220 static void | |
221 put_frag(const int fs, const int v1, const int v2, const int flen, const int curr_frag, int *next, int *maxsf, int **accum) | |
222 { | |
223 int end; | |
224 accum[0][curr_frag]=fs; | |
225 accum[1][curr_frag]=v1; | |
226 accum[2][curr_frag]=v2; | |
227 accum[3][curr_frag]=flen; | |
228 | |
229 if (!*maxsf) { | |
230 *maxsf=1; | |
231 accum[4][curr_frag]=0; | |
232 return; | |
233 } | |
234 | |
235 if (fs >= accum[0][*maxsf]) { | |
236 accum[4][curr_frag]=*maxsf; | |
237 *maxsf=curr_frag; | |
238 return; | |
239 } else { | |
240 *next=*maxsf; | |
241 while (TRUE) { | |
242 end=*next; | |
243 *next=accum[4][*next]; | |
244 if (fs>=accum[0][*next]) | |
245 break; | |
246 } | |
247 accum[4][curr_frag]=*next; | |
248 accum[4][end]=curr_frag; | |
249 } | |
250 | |
251 return; | |
252 } | |
253 /* end of put_frag */ | |
254 | |
255 | |
256 /** | |
257 * | |
258 */ | |
259 static bool | |
260 frag_rel_pos(int a1, int b1, int a2, int b2, int ktup) | |
261 { | |
262 if (a1-b1 == a2-b2) { | |
263 if (a2<a1) { | |
264 return TRUE; | |
265 } | |
266 } else { | |
267 if (a2+ktup-1<a1 && b2+ktup-1<b1) { | |
268 return TRUE; | |
269 } | |
270 } | |
271 return FALSE; | |
272 } | |
273 /* end of frag_rel_pos */ | |
274 | |
275 | |
276 | |
277 | |
278 /** | |
279 * | |
280 * @note: This is together with des_quick_sort most time consuming | |
281 * routine according to gprof on r110. Tried to replace it with qsort | |
282 * and/or QSortAndTrackIndex(), which is always slower! So we keep the | |
283 * original. | |
284 * | |
285 * Original doc: Quicksort routine, adapted from chapter 4, page 115 | |
286 * of software tools by Kernighan and Plauger, (1986). Sort the | |
287 * elements of array1 and sort the elements of array2 accordingly | |
288 * | |
289 * There might be a bug here. The original function apparently never | |
290 * touches the last element and keeps it as is. Tried to fix this (see | |
291 * SORT_LAST_ELEMENT_AS_WELL) which gives slightly worse performance | |
292 * (-0.5% on BB). My fix might not be working or it's not a bug at | |
293 * all... | |
294 * | |
295 * | |
296 * | |
297 */ | |
298 static void | |
299 des_quick_sort(int *array1, int *array2, const int array_size) | |
300 { | |
301 int temp1, temp2; | |
302 int p, pivlin; | |
303 int i, j; | |
304 int lst[50], ust[50]; /* the maximum no. of elements must be*/ | |
305 /* < log(base2) of 50 */ | |
306 | |
307 #if 0 | |
308 for (i=1; i<=array_size; i++) { | |
309 Log(&rLog, LOG_FORCED_DEBUG, "b4 sort array1[%d]=%d array2[%d]=%d", i, array1[i], i, array2[i]); | |
310 } | |
311 #endif | |
312 lst[1] = 1; | |
313 | |
314 #ifdef SORT_LAST_ELEMENT_AS_WELL | |
315 ust[1] = array_size; | |
316 #else | |
317 /* original */ | |
318 ust[1] = array_size-1; | |
319 #endif | |
320 p = 1; | |
321 | |
322 | |
323 while (p > 0) { | |
324 if (lst[p] >= ust[p]) { | |
325 p--; | |
326 } else { | |
327 i = lst[p] - 1; | |
328 j = ust[p]; | |
329 pivlin = array1[j]; | |
330 while (i < j) { | |
331 for (i=i+1; array1[i] < pivlin; i++) | |
332 ; | |
333 for (j=j-1; j > i; j--) | |
334 if (array1[j] <= pivlin) break; | |
335 if (i < j) { | |
336 temp1 = array1[i]; | |
337 array1[i] = array1[j]; | |
338 array1[j] = temp1; | |
339 | |
340 temp2 = array2[i]; | |
341 array2[i] = array2[j]; | |
342 array2[j] = temp2; | |
343 } | |
344 } | |
345 | |
346 j = ust[p]; | |
347 | |
348 temp1 = array1[i]; | |
349 array1[i] = array1[j]; | |
350 array1[j] = temp1; | |
351 | |
352 temp2 = array2[i]; | |
353 array2[i] = array2[j]; | |
354 array2[j] = temp2; | |
355 | |
356 if (i-lst[p] < ust[p] - i) { | |
357 lst[p+1] = lst[p]; | |
358 ust[p+1] = i - 1; | |
359 lst[p] = i + 1; | |
360 | |
361 } else { | |
362 lst[p+1] = i + 1; | |
363 ust[p+1] = ust[p]; | |
364 ust[p] = i - 1; | |
365 } | |
366 p = p + 1; | |
367 } | |
368 } | |
369 | |
370 #if 0 | |
371 for (i=1; i<=array_size; i++) { | |
372 Log(&rLog, LOG_FORCED_DEBUG, "after sort array1[%d]=%d array2[%d]=%d", i, array1[i], i, array2[i]); | |
373 } | |
374 #endif | |
375 | |
376 return; | |
377 } | |
378 /* end of des_quick_sort */ | |
379 | |
380 | |
381 | |
382 /** | |
383 * | |
384 * FIXME together with des_quick_sort most time consuming routine | |
385 * according to gprof on r110 | |
386 * | |
387 */ | |
388 static void | |
389 pair_align(int seq_no, int l1, int l2, int max_res_code, ktuple_param_t *aln_param, | |
390 char **seq_array, int *maxsf, int **accum, int max_aln_length, | |
391 int *zza, int *zzb, int *zzc, int *zzd) | |
392 { | |
393 int next; /* forrmerly static */ | |
394 int pot[8],i, j, l, m, flag, limit, pos, vn1, vn2, flen, osptr, fs; | |
395 int tv1, tv2, encrypt, subt1, subt2, rmndr; | |
396 char residue; | |
397 int *diag_index; | |
398 int *displ; | |
399 char *slopes; | |
400 int curr_frag; | |
401 const int tl1 = (l1+l2)-1; | |
402 | |
403 assert(NULL!=aln_param); | |
404 | |
405 /* | |
406 Log(&rLog, LOG_FORCED_DEBUG, "DNAFLAG=%d seq_no=%d l1=%d l2=%d window=%d ktup=%d signif=%d wind_gap=%d", | |
407 DNAFLAG, seq_no, l1, l2, window, ktup, signif, | |
408 wind_gap); | |
409 */ | |
410 | |
411 slopes = (char *) CKCALLOC(tl1+1, sizeof(char)); | |
412 displ = (int *) CKCALLOC(tl1+1, sizeof(int)); | |
413 diag_index = (int *) CKMALLOC((tl1+1) * sizeof(int)); | |
414 | |
415 for (i=1; i<=tl1; ++i) { | |
416 /* unnecessary, because we calloced: slopes[i] = displ[i] = 0; */ | |
417 diag_index[i] = i; | |
418 } | |
419 | |
420 for (i=1;i<=aln_param->ktup;i++) | |
421 pot[i] = (int) pow((double)(max_res_code+1),(double)(i-1)); | |
422 limit = (int) pow((double)(max_res_code+1),(double)aln_param->ktup); | |
423 | |
424 | |
425 | |
426 /* increment diagonal score for each k_tuple match */ | |
427 | |
428 for (i=1; i<=limit; ++i) { | |
429 vn1=zzc[i]; | |
430 while (TRUE) { | |
431 if (!vn1) break; | |
432 vn2 = zzd[i]; | |
433 while (0 != vn2) { | |
434 osptr = vn1-vn2+l2; | |
435 ++displ[osptr]; | |
436 vn2 = zzb[vn2]; | |
437 } | |
438 vn1=zza[vn1]; | |
439 } | |
440 } | |
441 | |
442 | |
443 /* choose the top SIGNIF diagonals | |
444 */ | |
445 | |
446 #ifdef QSORT_REPLACEMENT | |
447 /* This was an attempt to replace des_quick_sort with qsort(), | |
448 * which turns out to be much slower, so don't use this | |
449 */ | |
450 | |
451 /* FIXME: if we use this branch, we don't need to init diag_index | |
452 * before, because that is done in QSortAndTrackIndex() | |
453 * automatically. | |
454 */ | |
455 #if 0 | |
456 for (i=1; i<=tl1; i++) { | |
457 Log(&rLog, LOG_FORCED_DEBUG, "b4 sort disp[%d]=%d diag_index[%d]=%d", i, diag_index[i], i, displ[i]); | |
458 } | |
459 #endif | |
460 | |
461 QSortAndTrackIndex(&(diag_index[1]), &(displ[1]), tl1, 'a', TRUE); | |
462 | |
463 #if 0 | |
464 for (i=1; i<=tl1; i++) { | |
465 Log(&rLog, LOG_FORCED_DEBUG, "after sort disp[%d]=%d diag_index[%d]=%d", i, diag_index[i], i, displ[i]); | |
466 } | |
467 #endif | |
468 | |
469 #else | |
470 | |
471 des_quick_sort(displ, diag_index, tl1); | |
472 | |
473 #endif | |
474 | |
475 j = tl1 - aln_param->signif + 1; | |
476 | |
477 if (j < 1) { | |
478 j = 1; | |
479 } | |
480 | |
481 /* flag all diagonals within WINDOW of a top diagonal */ | |
482 | |
483 for (i=tl1; i>=j; i--) { | |
484 if (displ[i] > 0) { | |
485 pos = diag_index[i]; | |
486 l = (1 > pos - aln_param->window) ? | |
487 1 : pos - aln_param->window; | |
488 m = (tl1 < pos + aln_param->window) ? | |
489 tl1 : pos + aln_param->window; | |
490 for (; l <= m; l++) | |
491 slopes[l] = 1; | |
492 } | |
493 } | |
494 | |
495 for (i=1; i<=tl1; i++) { | |
496 displ[i] = 0; | |
497 } | |
498 | |
499 curr_frag=*maxsf=0; | |
500 | |
501 for (i=1; i<=(l1-aln_param->ktup+1); ++i) { | |
502 encrypt=flag=0; | |
503 for (j=1; j<=aln_param->ktup; ++j) { | |
504 residue = seq_array[seq_no][i+j-1]; | |
505 if ((residue<0) || (residue>max_res_code)) { | |
506 flag=TRUE; | |
507 break; | |
508 } | |
509 encrypt += ((residue)*pot[j]); | |
510 } | |
511 if (flag) { | |
512 continue; | |
513 } | |
514 ++encrypt; | |
515 | |
516 vn2=zzd[encrypt]; | |
517 | |
518 flag=FALSE; | |
519 while (TRUE) { | |
520 if (!vn2) { | |
521 flag=TRUE; | |
522 break; | |
523 } | |
524 osptr=i-vn2+l2; | |
525 if (1 != slopes[osptr]) { | |
526 vn2=zzb[vn2]; | |
527 continue; | |
528 } | |
529 flen=0; | |
530 fs=aln_param->ktup; | |
531 next=*maxsf; | |
532 | |
533 /* | |
534 * A-loop | |
535 */ | |
536 | |
537 while (TRUE) { | |
538 if (!next) { | |
539 ++curr_frag; | |
540 if (curr_frag >= 2*max_aln_length) { | |
541 Log(&rLog, LOG_VERBOSE, "(Partial alignment)"); | |
542 goto free_and_exit; /* Yesss! Always wanted to | |
543 * use a goto (AW) */ | |
544 } | |
545 displ[osptr]=curr_frag; | |
546 put_frag(fs, i, vn2, flen, curr_frag, &next, maxsf, accum); | |
547 | |
548 } else { | |
549 tv1=accum[1][next]; | |
550 tv2=accum[2][next]; | |
551 | |
552 if (frag_rel_pos(i, vn2, tv1, tv2, aln_param->ktup)) { | |
553 if (i-vn2 == accum[1][next]-accum[2][next]) { | |
554 if (i > accum[1][next]+(aln_param->ktup-1)) { | |
555 fs = accum[0][next]+aln_param->ktup; | |
556 } else { | |
557 rmndr = i-accum[1][next]; | |
558 fs = accum[0][next]+rmndr; | |
559 } | |
560 flen=next; | |
561 next=0; | |
562 continue; | |
563 | |
564 } else { | |
565 if (0 == displ[osptr]) { | |
566 subt1=aln_param->ktup; | |
567 } else { | |
568 if (i > accum[1][displ[osptr]]+(aln_param->ktup-1)) { | |
569 subt1=accum[0][displ[osptr]]+aln_param->ktup; | |
570 } else { | |
571 rmndr=i-accum[1][displ[osptr]]; | |
572 subt1=accum[0][displ[osptr]]+rmndr; | |
573 } | |
574 } | |
575 subt2=accum[0][next] - aln_param->wind_gap + aln_param->ktup; | |
576 if (subt2>subt1) { | |
577 flen=next; | |
578 fs=subt2; | |
579 } else { | |
580 flen=displ[osptr]; | |
581 fs=subt1; | |
582 } | |
583 next=0; | |
584 continue; | |
585 } | |
586 } else { | |
587 next=accum[4][next]; | |
588 continue; | |
589 } | |
590 } | |
591 break; | |
592 } | |
593 /* | |
594 * End of Aloop | |
595 */ | |
596 | |
597 vn2=zzb[vn2]; | |
598 } | |
599 } | |
600 | |
601 free_and_exit: | |
602 CKFREE(displ); | |
603 CKFREE(slopes); | |
604 CKFREE(diag_index); | |
605 | |
606 return; | |
607 } | |
608 /* end of pair_align */ | |
609 | |
610 | |
611 | |
612 /** | |
613 * | |
614 * Will compute ktuple scores and store in tmat | |
615 * Following values will be set: tmat[i][j], where | |
616 * istart <= i <iend | |
617 * and | |
618 * jstart <= j < jend | |
619 * i.e. zero-offset | |
620 * tmat data members have to be preallocated | |
621 * | |
622 * if ktuple_param_t *aln_param == NULL defaults will be used | |
623 */ | |
624 void | |
625 KTuplePairDist(symmatrix_t *tmat, mseq_t *mseq, | |
626 int istart, int iend, | |
627 int jstart, int jend, | |
628 ktuple_param_t *param_override, | |
629 progress_t *prProgress, | |
630 unsigned long int *ulStepNo, unsigned long int ulTotalStepNo) | |
631 { | |
632 /* this first group of variables were previously static | |
633 and hence un-parallelisable */ | |
634 char **seq_array; | |
635 int maxsf; | |
636 int **accum; | |
637 int max_aln_length; | |
638 /* divide score with length of smallest sequence */ | |
639 int *zza, *zzb, *zzc, *zzd; | |
640 int private_step_no = 0; | |
641 | |
642 int i, j, dsr; | |
643 double calc_score; | |
644 int max_res_code = -1; | |
645 | |
646 int max_seq_len; | |
647 int *seqlen_array; | |
648 /* progress_t *prProgress; */ | |
649 /* int uStepNo, uTotalStepNo; */ | |
650 ktuple_param_t aln_param = default_protein_param; | |
651 bool bPrintCR = (rLog.iLogLevelEnabled<=LOG_VERBOSE) ? FALSE : TRUE; | |
652 | |
653 | |
654 if(prProgress == NULL) { | |
655 NewProgress(&prProgress, LogGetFP(&rLog, LOG_INFO), | |
656 "Ktuple-distance calculation progress", bPrintCR); | |
657 } | |
658 | |
659 /* conversion to old style data types follows | |
660 * | |
661 */ | |
662 | |
663 seqlen_array = (int*) CKMALLOC((mseq->nseqs+1) * sizeof(int)); | |
664 for (i=0; i<mseq->nseqs; i++) { | |
665 seqlen_array[i+1] = mseq->sqinfo[i].len; | |
666 } | |
667 | |
668 /* setup alignment parameters | |
669 */ | |
670 if (SEQTYPE_PROTEIN == mseq->seqtype) { | |
671 DNAFLAG = FALSE; | |
672 max_res_code = strlen(AMINO_ACID_CODES)-2; | |
673 aln_param = default_protein_param; | |
674 | |
675 } else if (SEQTYPE_RNA == mseq->seqtype || SEQTYPE_DNA == mseq->seqtype) { | |
676 DNAFLAG = TRUE; | |
677 max_res_code = strlen(NUCLEIC_ACID_CODES)-2; | |
678 aln_param = default_dna_param; | |
679 | |
680 } else { | |
681 Log(&rLog, LOG_FATAL, "Internal error in %s: Unknown sequence type.", __FUNCTION__); | |
682 } | |
683 | |
684 if (NULL!=param_override) { | |
685 aln_param.ktup = param_override->ktup; | |
686 aln_param.wind_gap = param_override->wind_gap; | |
687 aln_param.signif = param_override->signif; | |
688 aln_param.window = param_override->window; | |
689 } | |
690 | |
691 /*LOG_DEBUG("DNAFLAG = %d max_res_code = %d", DNAFLAG, max_res_code);*/ | |
692 | |
693 /* convert mseq to clustal's old-style int encoded sequences (unit-offset) | |
694 */ | |
695 max_aln_length = 0; | |
696 max_seq_len = 0; | |
697 seq_array = (char **) CKMALLOC((mseq->nseqs+1) * sizeof(char *)); | |
698 seq_array[0] = NULL; | |
699 /* FIXME check that non of the seqs is smaller than ktup (?). | |
700 * Otherwise segfault occurs | |
701 */ | |
702 for (i=0; i<mseq->nseqs; i++) { | |
703 seq_array[i+1] = (char *) CKMALLOC((seqlen_array[i+1]+2) * sizeof (char));; | |
704 } | |
705 for (i=0; i<mseq->nseqs; i++) { | |
706 /*LOG_DEBUG("calling encode with seq_array[%d+1] len=%d and seq=%s", | |
707 i, seqlen_array[i+1], mseq->seq[i]);*/ | |
708 if (TRUE == DNAFLAG) { | |
709 encode(&(mseq->seq[i][-1]), seq_array[i+1], | |
710 seqlen_array[i+1], NUCLEIC_ACID_CODES); | |
711 } else { | |
712 encode(&(mseq->seq[i][-1]), seq_array[i+1], | |
713 seqlen_array[i+1], AMINO_ACID_CODES); | |
714 } | |
715 | |
716 if (seqlen_array[i+1]>max_seq_len) { | |
717 max_seq_len = seqlen_array[i+1]; | |
718 } | |
719 } | |
720 max_aln_length = max_seq_len * 2; | |
721 /* see sequence.c in old source */ | |
722 | |
723 /* FIXME: short sequences can cause seg-fault | |
724 * because max_aln_length can get shorter | |
725 * than (max_res_code+1)^k | |
726 * FS, r222->r223 */ | |
727 max_aln_length = max_aln_length > pow((max_res_code+1), aln_param.ktup)+1 ? | |
728 max_aln_length : pow((max_res_code+1), aln_param.ktup)+1; | |
729 | |
730 /* | |
731 * | |
732 * conversion to old style clustal done (in no time) */ | |
733 | |
734 | |
735 accum = (int **) CKCALLOC(5, sizeof (int *)); | |
736 for (i=0;i<5;i++) { | |
737 accum[i] = (int *) CKCALLOC((2*max_aln_length+1), sizeof(int)); | |
738 } | |
739 zza = (int *) CKCALLOC( (max_aln_length+1), sizeof(int)); | |
740 zzb = (int *) CKCALLOC( (max_aln_length+1), sizeof(int)); | |
741 zzc = (int *) CKCALLOC( (max_aln_length+1), sizeof(int)); | |
742 zzd = (int *) CKCALLOC( (max_aln_length+1), sizeof(int)); | |
743 | |
744 /* estimation of total number of steps (if istart and jstart are | |
745 * both 0) (now handled in the calling routine) | |
746 */ | |
747 /* uTotalStepNo = iend*jend - iend*iend/2 + iend/2; | |
748 uStepNo = 0; */ | |
749 /*LOG_DEBUG("istart=%d iend=%d jstart=%d jend=%d", istart, iend, jstart, jend);*/ | |
750 | |
751 for (i=istart+1; i<=iend; ++i) { | |
752 /* by definition a sequence compared to itself should give | |
753 a score of 0. AW */ | |
754 SymMatrixSetValue(tmat, i-1, i-1, 0.0); | |
755 make_ptrs(zza, zzc, i, seqlen_array[i], aln_param.ktup, max_res_code, seq_array); | |
756 | |
757 #ifdef HAVE_OPENMP | |
758 #pragma omp critical(ktuple) | |
759 #endif | |
760 { | |
761 ProgressLog(prProgress, *ulStepNo, ulTotalStepNo, FALSE); | |
762 } | |
763 | |
764 for (j=MAX(i+1, jstart+1); j<=jend; ++j) { | |
765 (*ulStepNo)++; | |
766 private_step_no++; | |
767 /*LOG_DEBUG("comparing pair %d:%d", i, j);*/ | |
768 | |
769 make_ptrs(zzb, zzd, j, seqlen_array[j], aln_param.ktup, max_res_code, seq_array); | |
770 pair_align(i, seqlen_array[i], seqlen_array[j], max_res_code, &aln_param, | |
771 seq_array, &maxsf, accum, max_aln_length, zza, zzb, zzc, zzd); | |
772 | |
773 if (!maxsf) { | |
774 calc_score=0.0; | |
775 } else { | |
776 calc_score=(double)accum[0][maxsf]; | |
777 if (percent) { | |
778 dsr=(seqlen_array[i]<seqlen_array[j]) ? | |
779 seqlen_array[i] : seqlen_array[j]; | |
780 calc_score = (calc_score/(double)dsr) * 100.0; | |
781 } | |
782 } | |
783 | |
784 /* printf("%d %d %d\n", i-1, j-1, (100.0 - calc_score)/100.0); */ | |
785 SymMatrixSetValue(tmat, i-1, j-1, (100.0 - calc_score)/100.0); | |
786 | |
787 /* the function allows you not to compute the full matrix. | |
788 * here we explicitely make the resulting matrix a | |
789 * rectangle, i.e. we always set full rows. in other | |
790 * words, if we don't complete the full matrix then we | |
791 * don't have a full symmetry. so only use the defined | |
792 * symmetric part. AW | |
793 */ | |
794 /*LOG_DEBUG("setting %d : %d = %f", j, i, tmat[i][j]);*/ | |
795 /* not needed anymore since we use symmatrix_t | |
796 if (j<=iend) { | |
797 tmat[j][i] = tmat[i][j]; | |
798 } | |
799 */ | |
800 #ifdef HAVE_OPENMP | |
801 #pragma omp critical(ktuple) | |
802 #endif | |
803 { | |
804 Log(&rLog, LOG_DEBUG, "K-tuple distance for sequence pair %d:%d = %lg", | |
805 i, j, SymMatrixGetValue(tmat, i-1, j-1)); | |
806 } | |
807 } | |
808 } | |
809 /* | |
810 Log(&rLog, LOG_FORCED_DEBUG, "uTotalStepNo=%d for istart=%d iend=%d jstart=%d jend=%d", uStepNo, istart, iend, jstart, jend); | |
811 Log(&rLog, LOG_FORCED_DEBUG, "Fabian = %d", iend*jend - iend*iend/2 + iend/2); | |
812 */ | |
813 | |
814 /* printf("\n\n%d\t%d\t%d\t%d\n\n", omp_get_thread_num(), uStepNo, istart, iend); */ | |
815 | |
816 for (i=0;i<5;i++) { | |
817 CKFREE(accum[i]); | |
818 } | |
819 CKFREE(accum); | |
820 | |
821 #ifdef HAVE_OPENMP | |
822 #pragma omp critical(ktuple) | |
823 #if 0 | |
824 { | |
825 printf("steps: %d\n", private_step_no); | |
826 } | |
827 #endif | |
828 #endif | |
829 | |
830 CKFREE(zza); | |
831 CKFREE(zzb); | |
832 CKFREE(zzc); | |
833 CKFREE(zzd); | |
834 | |
835 free(seqlen_array); | |
836 | |
837 for (i=1; i<=mseq->nseqs; i++) { | |
838 CKFREE(seq_array[i]); | |
839 } | |
840 CKFREE(seq_array); | |
841 } | |
842 /* end of KTuplePairDist */ |