1
|
1 /* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
|
|
2
|
|
3 /*********************************************************************
|
|
4 * Clustal Omega - Multiple sequence alignment
|
|
5 *
|
|
6 * Copyright (C) 2010 University College Dublin
|
|
7 *
|
|
8 * Clustal-Omega is free software; you can redistribute it and/or
|
|
9 * modify it under the terms of the GNU General Public License as
|
|
10 * published by the Free Software Foundation; either version 2 of the
|
|
11 * License, or (at your option) any later version.
|
|
12 *
|
|
13 * This file is part of Clustal-Omega.
|
|
14 *
|
|
15 ********************************************************************/
|
|
16
|
|
17 /*
|
|
18 * RCS $Id: ktuple_pair.c 230 2011-04-09 15:37:50Z andreas $
|
|
19 *
|
|
20 *
|
|
21 * K-Tuple code for pairwise alignment (Wilbur and Lipman, 1983; PMID
|
|
22 * 6572363). Most code taken from showpair.c (Clustal 1.83)
|
|
23 * DD: some functions now have lots of parameters as static variables
|
|
24 * were removed to make code OpenMP-friendly
|
|
25 *
|
|
26 */
|
|
27
|
|
28 #ifdef HAVE_CONFIG_H
|
|
29 #include "config.h"
|
|
30 #endif
|
|
31
|
|
32
|
|
33 #include <stdio.h>
|
|
34 #include <string.h>
|
|
35 #include <ctype.h>
|
|
36 #include <stdlib.h>
|
|
37 #include <math.h>
|
|
38 #include <assert.h>
|
|
39
|
|
40 #ifdef HAVE_OPENMP
|
|
41 #include <omp.h>
|
|
42 #endif
|
|
43
|
|
44 #include "squid/squid.h"
|
|
45 #include "util.h"
|
|
46 #include "symmatrix.h"
|
|
47 #include "ktuple_pair.h"
|
|
48 #include "log.h"
|
|
49 #include "progress.h"
|
|
50
|
|
51 #define END_MARK -3 /* see interface.c in 1.83 */
|
|
52 #define NUMRES 32 /* max size of comparison matrix */
|
|
53
|
|
54 /* see notes below */
|
|
55 #undef SORT_LAST_ELEMENT_AS_WELL
|
|
56
|
|
57 /* gap_pos1 = NUMRES-2; /@ code for gaps inserted by clustalw @/ */
|
|
58 static const int GAP_POS2 = NUMRES-1; /* code for gaps already in alignment */
|
|
59 static bool DNAFLAG = FALSE;
|
|
60
|
|
61 static const char *AMINO_ACID_CODES = "ABCDEFGHIKLMNPQRSTUVWXYZ-";
|
|
62 static const char *NUCLEIC_ACID_CODES = "ACGTUN-";
|
|
63 /* As far as I understand the gap symbol should not be necessary here,
|
|
64 * because we use isgap for testing later anyway. But changing this,
|
|
65 * will affect max_res_code and max_nuc as well. So I leave it for now
|
|
66 * as it is. AW
|
|
67 */
|
|
68
|
|
69 static bool percent = TRUE;
|
|
70
|
|
71 static void make_ptrs(int *tptr, int *pl, const int naseq, const int l, const int ktup, const int max_res_code, char **seq_array);
|
|
72 static void put_frag(const int fs, const int v1, const int v2, const int flen, const int curr_frag, int *next, int *maxsf, int **accum);
|
|
73 static bool frag_rel_pos(int a1, int b1, int a2, int b2, int ktup);
|
|
74 static void des_quick_sort(int *array1, int *array2, const int array_size);
|
|
75 static void pair_align(int seq_no, int l1, int l2, int max_res_code, ktuple_param_t *aln_param,
|
|
76 char **seq_array, int *maxsf, int **accum, int max_aln_length,
|
|
77 int *zza, int *zzb, int *zzc, int *zzd);
|
|
78 static void encode(char *seq, char *naseq, int l, const char *res_codes);
|
|
79 static int res_index(const char *lookup, char c);
|
|
80
|
|
81
|
|
82 typedef struct {
|
|
83 int i1;
|
|
84 int i2;
|
|
85 } two_ints_t;
|
|
86
|
|
87
|
|
88
|
|
89 /* default ktuple pairwise alignment parameters
|
|
90 *
|
|
91 */
|
|
92 /* protein
|
|
93 */
|
|
94 /* designated initializer */
|
|
95 const ktuple_param_t default_protein_param = {
|
|
96 .ktup = 1,
|
|
97 .wind_gap = 3,
|
|
98 .signif = 5,
|
|
99 .window = 5,
|
|
100 };
|
|
101 /* dna
|
|
102 */
|
|
103 /* designated initializer */
|
|
104 const ktuple_param_t default_dna_param = {
|
|
105 .ktup = 2,
|
|
106 .wind_gap = 5,
|
|
107 .signif = 4,
|
|
108 .window = 4,
|
|
109 };
|
|
110
|
|
111
|
|
112 /**
|
|
113 * note: naseq should be unit-offset
|
|
114 */
|
|
115 static void
|
|
116 encode(char *seq, char *naseq, int l, const char *res_codes)
|
|
117 {
|
|
118 /* code seq as ints .. use GAP_POS2 for gap */
|
|
119 register int i;
|
|
120 bool seq_contains_unknown_char = FALSE;
|
|
121 /*LOG_DEBUG("seq=%s naseq=%p l=%d", &(seq[1]), naseq, l); */
|
|
122
|
|
123
|
|
124 for (i=1; i<=l; i++) {
|
|
125 char res = toupper(seq[i]);
|
|
126 if (isgap(res)) {
|
|
127 naseq[i] = GAP_POS2; /* gap in input */
|
|
128 } else {
|
|
129 naseq[i] = res_index(res_codes, res);
|
|
130 }
|
|
131
|
|
132 /*LOG_DEBUG("Character '%c' at pos %d", res, i);*/
|
|
133 if (-1 == naseq[i]) {
|
|
134 seq_contains_unknown_char = TRUE;
|
|
135 /*LOG_DEBUG("Unknown character '%c' at pos %d", res, i);*/
|
|
136 }
|
|
137 /*LOG_DEBUG("na_seq[%d]=%d", i, naseq[i]);*/
|
|
138 }
|
|
139
|
|
140 if (TRUE == seq_contains_unknown_char)
|
|
141 Log(&rLog, LOG_WARN, "Unknown character in seq '%s'", &(seq[1]));
|
|
142
|
|
143 naseq[i] = END_MARK;
|
|
144
|
|
145 return;
|
|
146 }
|
|
147 /* end of encode */
|
|
148
|
|
149
|
|
150 /**
|
|
151 *
|
|
152 */
|
|
153 static int
|
|
154 res_index(const char *t, char c)
|
|
155 {
|
|
156 register int i;
|
|
157 for (i=0; t[i] && t[i] != c; i++)
|
|
158 ;
|
|
159 if (t[i]) {
|
|
160 return (i);
|
|
161 } else {
|
|
162 return -1;
|
|
163 }
|
|
164 }
|
|
165 /* end of res_index */
|
|
166
|
|
167
|
|
168 /**
|
|
169 *
|
|
170 */
|
|
171 static void
|
|
172 make_ptrs(int *tptr, int *pl, const int naseq, const int l, const int ktup, const int max_res_code, char **seq_array)
|
|
173 {
|
|
174 /* FIXME make 10 a constant and give it a nice name */
|
|
175 static int a[10];
|
|
176 int i, j, code, flag;
|
|
177 char residue;
|
|
178 const int limit = (int) pow((double)(max_res_code+1),(double)ktup);
|
|
179
|
|
180 for (i=1;i<=ktup;i++)
|
|
181 a[i] = (int) pow((double)(max_res_code+1),(double)(i-1));
|
|
182
|
|
183 for (i=1; i<=limit; ++i)
|
|
184 pl[i]=0;
|
|
185 for (i=1; i<=l; ++i)
|
|
186 tptr[i]=0;
|
|
187
|
|
188 for (i=1; i<=(l-ktup+1); ++i) {
|
|
189 code=0;
|
|
190 flag=FALSE;
|
|
191 for (j=1; j<=ktup; ++j) {
|
|
192 /* Log(&rLog, LOG_FORCED_DEBUG, "naseq=%d i=%d j=%d seq_array[naseq]=%p",
|
|
193 * naseq, i, j, seq_array[naseq]);
|
|
194 */
|
|
195 residue = seq_array[naseq][i+j-1];
|
|
196 /* Log(&rLog, LOG_FORCED_DEBUG, "residue = %d", residue); */
|
|
197 if ((residue<0) || (residue > max_res_code)){
|
|
198 flag=TRUE;
|
|
199 break;
|
|
200 }
|
|
201 code += ((residue) * a[j]);
|
|
202 }
|
|
203 if (flag)
|
|
204 continue;
|
|
205 ++code;
|
|
206 if (0 != pl[code])
|
|
207 tptr[i] =pl[code];
|
|
208 pl[code] = i;
|
|
209 }
|
|
210
|
|
211 return;
|
|
212 }
|
|
213 /* end of make_ptrs */
|
|
214
|
|
215
|
|
216 /**
|
|
217 *
|
|
218 * FIXME Why hardcoding of 5?
|
|
219 */
|
|
220 static void
|
|
221 put_frag(const int fs, const int v1, const int v2, const int flen, const int curr_frag, int *next, int *maxsf, int **accum)
|
|
222 {
|
|
223 int end;
|
|
224 accum[0][curr_frag]=fs;
|
|
225 accum[1][curr_frag]=v1;
|
|
226 accum[2][curr_frag]=v2;
|
|
227 accum[3][curr_frag]=flen;
|
|
228
|
|
229 if (!*maxsf) {
|
|
230 *maxsf=1;
|
|
231 accum[4][curr_frag]=0;
|
|
232 return;
|
|
233 }
|
|
234
|
|
235 if (fs >= accum[0][*maxsf]) {
|
|
236 accum[4][curr_frag]=*maxsf;
|
|
237 *maxsf=curr_frag;
|
|
238 return;
|
|
239 } else {
|
|
240 *next=*maxsf;
|
|
241 while (TRUE) {
|
|
242 end=*next;
|
|
243 *next=accum[4][*next];
|
|
244 if (fs>=accum[0][*next])
|
|
245 break;
|
|
246 }
|
|
247 accum[4][curr_frag]=*next;
|
|
248 accum[4][end]=curr_frag;
|
|
249 }
|
|
250
|
|
251 return;
|
|
252 }
|
|
253 /* end of put_frag */
|
|
254
|
|
255
|
|
256 /**
|
|
257 *
|
|
258 */
|
|
259 static bool
|
|
260 frag_rel_pos(int a1, int b1, int a2, int b2, int ktup)
|
|
261 {
|
|
262 if (a1-b1 == a2-b2) {
|
|
263 if (a2<a1) {
|
|
264 return TRUE;
|
|
265 }
|
|
266 } else {
|
|
267 if (a2+ktup-1<a1 && b2+ktup-1<b1) {
|
|
268 return TRUE;
|
|
269 }
|
|
270 }
|
|
271 return FALSE;
|
|
272 }
|
|
273 /* end of frag_rel_pos */
|
|
274
|
|
275
|
|
276
|
|
277
|
|
278 /**
|
|
279 *
|
|
280 * @note: This is together with des_quick_sort most time consuming
|
|
281 * routine according to gprof on r110. Tried to replace it with qsort
|
|
282 * and/or QSortAndTrackIndex(), which is always slower! So we keep the
|
|
283 * original.
|
|
284 *
|
|
285 * Original doc: Quicksort routine, adapted from chapter 4, page 115
|
|
286 * of software tools by Kernighan and Plauger, (1986). Sort the
|
|
287 * elements of array1 and sort the elements of array2 accordingly
|
|
288 *
|
|
289 * There might be a bug here. The original function apparently never
|
|
290 * touches the last element and keeps it as is. Tried to fix this (see
|
|
291 * SORT_LAST_ELEMENT_AS_WELL) which gives slightly worse performance
|
|
292 * (-0.5% on BB). My fix might not be working or it's not a bug at
|
|
293 * all...
|
|
294 *
|
|
295 *
|
|
296 *
|
|
297 */
|
|
298 static void
|
|
299 des_quick_sort(int *array1, int *array2, const int array_size)
|
|
300 {
|
|
301 int temp1, temp2;
|
|
302 int p, pivlin;
|
|
303 int i, j;
|
|
304 int lst[50], ust[50]; /* the maximum no. of elements must be*/
|
|
305 /* < log(base2) of 50 */
|
|
306
|
|
307 #if 0
|
|
308 for (i=1; i<=array_size; i++) {
|
|
309 Log(&rLog, LOG_FORCED_DEBUG, "b4 sort array1[%d]=%d array2[%d]=%d", i, array1[i], i, array2[i]);
|
|
310 }
|
|
311 #endif
|
|
312 lst[1] = 1;
|
|
313
|
|
314 #ifdef SORT_LAST_ELEMENT_AS_WELL
|
|
315 ust[1] = array_size;
|
|
316 #else
|
|
317 /* original */
|
|
318 ust[1] = array_size-1;
|
|
319 #endif
|
|
320 p = 1;
|
|
321
|
|
322
|
|
323 while (p > 0) {
|
|
324 if (lst[p] >= ust[p]) {
|
|
325 p--;
|
|
326 } else {
|
|
327 i = lst[p] - 1;
|
|
328 j = ust[p];
|
|
329 pivlin = array1[j];
|
|
330 while (i < j) {
|
|
331 for (i=i+1; array1[i] < pivlin; i++)
|
|
332 ;
|
|
333 for (j=j-1; j > i; j--)
|
|
334 if (array1[j] <= pivlin) break;
|
|
335 if (i < j) {
|
|
336 temp1 = array1[i];
|
|
337 array1[i] = array1[j];
|
|
338 array1[j] = temp1;
|
|
339
|
|
340 temp2 = array2[i];
|
|
341 array2[i] = array2[j];
|
|
342 array2[j] = temp2;
|
|
343 }
|
|
344 }
|
|
345
|
|
346 j = ust[p];
|
|
347
|
|
348 temp1 = array1[i];
|
|
349 array1[i] = array1[j];
|
|
350 array1[j] = temp1;
|
|
351
|
|
352 temp2 = array2[i];
|
|
353 array2[i] = array2[j];
|
|
354 array2[j] = temp2;
|
|
355
|
|
356 if (i-lst[p] < ust[p] - i) {
|
|
357 lst[p+1] = lst[p];
|
|
358 ust[p+1] = i - 1;
|
|
359 lst[p] = i + 1;
|
|
360
|
|
361 } else {
|
|
362 lst[p+1] = i + 1;
|
|
363 ust[p+1] = ust[p];
|
|
364 ust[p] = i - 1;
|
|
365 }
|
|
366 p = p + 1;
|
|
367 }
|
|
368 }
|
|
369
|
|
370 #if 0
|
|
371 for (i=1; i<=array_size; i++) {
|
|
372 Log(&rLog, LOG_FORCED_DEBUG, "after sort array1[%d]=%d array2[%d]=%d", i, array1[i], i, array2[i]);
|
|
373 }
|
|
374 #endif
|
|
375
|
|
376 return;
|
|
377 }
|
|
378 /* end of des_quick_sort */
|
|
379
|
|
380
|
|
381
|
|
382 /**
|
|
383 *
|
|
384 * FIXME together with des_quick_sort most time consuming routine
|
|
385 * according to gprof on r110
|
|
386 *
|
|
387 */
|
|
388 static void
|
|
389 pair_align(int seq_no, int l1, int l2, int max_res_code, ktuple_param_t *aln_param,
|
|
390 char **seq_array, int *maxsf, int **accum, int max_aln_length,
|
|
391 int *zza, int *zzb, int *zzc, int *zzd)
|
|
392 {
|
|
393 int next; /* forrmerly static */
|
|
394 int pot[8],i, j, l, m, flag, limit, pos, vn1, vn2, flen, osptr, fs;
|
|
395 int tv1, tv2, encrypt, subt1, subt2, rmndr;
|
|
396 char residue;
|
|
397 int *diag_index;
|
|
398 int *displ;
|
|
399 char *slopes;
|
|
400 int curr_frag;
|
|
401 const int tl1 = (l1+l2)-1;
|
|
402
|
|
403 assert(NULL!=aln_param);
|
|
404
|
|
405 /*
|
|
406 Log(&rLog, LOG_FORCED_DEBUG, "DNAFLAG=%d seq_no=%d l1=%d l2=%d window=%d ktup=%d signif=%d wind_gap=%d",
|
|
407 DNAFLAG, seq_no, l1, l2, window, ktup, signif,
|
|
408 wind_gap);
|
|
409 */
|
|
410
|
|
411 slopes = (char *) CKCALLOC(tl1+1, sizeof(char));
|
|
412 displ = (int *) CKCALLOC(tl1+1, sizeof(int));
|
|
413 diag_index = (int *) CKMALLOC((tl1+1) * sizeof(int));
|
|
414
|
|
415 for (i=1; i<=tl1; ++i) {
|
|
416 /* unnecessary, because we calloced: slopes[i] = displ[i] = 0; */
|
|
417 diag_index[i] = i;
|
|
418 }
|
|
419
|
|
420 for (i=1;i<=aln_param->ktup;i++)
|
|
421 pot[i] = (int) pow((double)(max_res_code+1),(double)(i-1));
|
|
422 limit = (int) pow((double)(max_res_code+1),(double)aln_param->ktup);
|
|
423
|
|
424
|
|
425
|
|
426 /* increment diagonal score for each k_tuple match */
|
|
427
|
|
428 for (i=1; i<=limit; ++i) {
|
|
429 vn1=zzc[i];
|
|
430 while (TRUE) {
|
|
431 if (!vn1) break;
|
|
432 vn2 = zzd[i];
|
|
433 while (0 != vn2) {
|
|
434 osptr = vn1-vn2+l2;
|
|
435 ++displ[osptr];
|
|
436 vn2 = zzb[vn2];
|
|
437 }
|
|
438 vn1=zza[vn1];
|
|
439 }
|
|
440 }
|
|
441
|
|
442
|
|
443 /* choose the top SIGNIF diagonals
|
|
444 */
|
|
445
|
|
446 #ifdef QSORT_REPLACEMENT
|
|
447 /* This was an attempt to replace des_quick_sort with qsort(),
|
|
448 * which turns out to be much slower, so don't use this
|
|
449 */
|
|
450
|
|
451 /* FIXME: if we use this branch, we don't need to init diag_index
|
|
452 * before, because that is done in QSortAndTrackIndex()
|
|
453 * automatically.
|
|
454 */
|
|
455 #if 0
|
|
456 for (i=1; i<=tl1; i++) {
|
|
457 Log(&rLog, LOG_FORCED_DEBUG, "b4 sort disp[%d]=%d diag_index[%d]=%d", i, diag_index[i], i, displ[i]);
|
|
458 }
|
|
459 #endif
|
|
460
|
|
461 QSortAndTrackIndex(&(diag_index[1]), &(displ[1]), tl1, 'a', TRUE);
|
|
462
|
|
463 #if 0
|
|
464 for (i=1; i<=tl1; i++) {
|
|
465 Log(&rLog, LOG_FORCED_DEBUG, "after sort disp[%d]=%d diag_index[%d]=%d", i, diag_index[i], i, displ[i]);
|
|
466 }
|
|
467 #endif
|
|
468
|
|
469 #else
|
|
470
|
|
471 des_quick_sort(displ, diag_index, tl1);
|
|
472
|
|
473 #endif
|
|
474
|
|
475 j = tl1 - aln_param->signif + 1;
|
|
476
|
|
477 if (j < 1) {
|
|
478 j = 1;
|
|
479 }
|
|
480
|
|
481 /* flag all diagonals within WINDOW of a top diagonal */
|
|
482
|
|
483 for (i=tl1; i>=j; i--) {
|
|
484 if (displ[i] > 0) {
|
|
485 pos = diag_index[i];
|
|
486 l = (1 > pos - aln_param->window) ?
|
|
487 1 : pos - aln_param->window;
|
|
488 m = (tl1 < pos + aln_param->window) ?
|
|
489 tl1 : pos + aln_param->window;
|
|
490 for (; l <= m; l++)
|
|
491 slopes[l] = 1;
|
|
492 }
|
|
493 }
|
|
494
|
|
495 for (i=1; i<=tl1; i++) {
|
|
496 displ[i] = 0;
|
|
497 }
|
|
498
|
|
499 curr_frag=*maxsf=0;
|
|
500
|
|
501 for (i=1; i<=(l1-aln_param->ktup+1); ++i) {
|
|
502 encrypt=flag=0;
|
|
503 for (j=1; j<=aln_param->ktup; ++j) {
|
|
504 residue = seq_array[seq_no][i+j-1];
|
|
505 if ((residue<0) || (residue>max_res_code)) {
|
|
506 flag=TRUE;
|
|
507 break;
|
|
508 }
|
|
509 encrypt += ((residue)*pot[j]);
|
|
510 }
|
|
511 if (flag) {
|
|
512 continue;
|
|
513 }
|
|
514 ++encrypt;
|
|
515
|
|
516 vn2=zzd[encrypt];
|
|
517
|
|
518 flag=FALSE;
|
|
519 while (TRUE) {
|
|
520 if (!vn2) {
|
|
521 flag=TRUE;
|
|
522 break;
|
|
523 }
|
|
524 osptr=i-vn2+l2;
|
|
525 if (1 != slopes[osptr]) {
|
|
526 vn2=zzb[vn2];
|
|
527 continue;
|
|
528 }
|
|
529 flen=0;
|
|
530 fs=aln_param->ktup;
|
|
531 next=*maxsf;
|
|
532
|
|
533 /*
|
|
534 * A-loop
|
|
535 */
|
|
536
|
|
537 while (TRUE) {
|
|
538 if (!next) {
|
|
539 ++curr_frag;
|
|
540 if (curr_frag >= 2*max_aln_length) {
|
|
541 Log(&rLog, LOG_VERBOSE, "(Partial alignment)");
|
|
542 goto free_and_exit; /* Yesss! Always wanted to
|
|
543 * use a goto (AW) */
|
|
544 }
|
|
545 displ[osptr]=curr_frag;
|
|
546 put_frag(fs, i, vn2, flen, curr_frag, &next, maxsf, accum);
|
|
547
|
|
548 } else {
|
|
549 tv1=accum[1][next];
|
|
550 tv2=accum[2][next];
|
|
551
|
|
552 if (frag_rel_pos(i, vn2, tv1, tv2, aln_param->ktup)) {
|
|
553 if (i-vn2 == accum[1][next]-accum[2][next]) {
|
|
554 if (i > accum[1][next]+(aln_param->ktup-1)) {
|
|
555 fs = accum[0][next]+aln_param->ktup;
|
|
556 } else {
|
|
557 rmndr = i-accum[1][next];
|
|
558 fs = accum[0][next]+rmndr;
|
|
559 }
|
|
560 flen=next;
|
|
561 next=0;
|
|
562 continue;
|
|
563
|
|
564 } else {
|
|
565 if (0 == displ[osptr]) {
|
|
566 subt1=aln_param->ktup;
|
|
567 } else {
|
|
568 if (i > accum[1][displ[osptr]]+(aln_param->ktup-1)) {
|
|
569 subt1=accum[0][displ[osptr]]+aln_param->ktup;
|
|
570 } else {
|
|
571 rmndr=i-accum[1][displ[osptr]];
|
|
572 subt1=accum[0][displ[osptr]]+rmndr;
|
|
573 }
|
|
574 }
|
|
575 subt2=accum[0][next] - aln_param->wind_gap + aln_param->ktup;
|
|
576 if (subt2>subt1) {
|
|
577 flen=next;
|
|
578 fs=subt2;
|
|
579 } else {
|
|
580 flen=displ[osptr];
|
|
581 fs=subt1;
|
|
582 }
|
|
583 next=0;
|
|
584 continue;
|
|
585 }
|
|
586 } else {
|
|
587 next=accum[4][next];
|
|
588 continue;
|
|
589 }
|
|
590 }
|
|
591 break;
|
|
592 }
|
|
593 /*
|
|
594 * End of Aloop
|
|
595 */
|
|
596
|
|
597 vn2=zzb[vn2];
|
|
598 }
|
|
599 }
|
|
600
|
|
601 free_and_exit:
|
|
602 CKFREE(displ);
|
|
603 CKFREE(slopes);
|
|
604 CKFREE(diag_index);
|
|
605
|
|
606 return;
|
|
607 }
|
|
608 /* end of pair_align */
|
|
609
|
|
610
|
|
611
|
|
612 /**
|
|
613 *
|
|
614 * Will compute ktuple scores and store in tmat
|
|
615 * Following values will be set: tmat[i][j], where
|
|
616 * istart <= i <iend
|
|
617 * and
|
|
618 * jstart <= j < jend
|
|
619 * i.e. zero-offset
|
|
620 * tmat data members have to be preallocated
|
|
621 *
|
|
622 * if ktuple_param_t *aln_param == NULL defaults will be used
|
|
623 */
|
|
624 void
|
|
625 KTuplePairDist(symmatrix_t *tmat, mseq_t *mseq,
|
|
626 int istart, int iend,
|
|
627 int jstart, int jend,
|
|
628 ktuple_param_t *param_override,
|
|
629 progress_t *prProgress,
|
|
630 unsigned long int *ulStepNo, unsigned long int ulTotalStepNo)
|
|
631 {
|
|
632 /* this first group of variables were previously static
|
|
633 and hence un-parallelisable */
|
|
634 char **seq_array;
|
|
635 int maxsf;
|
|
636 int **accum;
|
|
637 int max_aln_length;
|
|
638 /* divide score with length of smallest sequence */
|
|
639 int *zza, *zzb, *zzc, *zzd;
|
|
640 int private_step_no = 0;
|
|
641
|
|
642 int i, j, dsr;
|
|
643 double calc_score;
|
|
644 int max_res_code = -1;
|
|
645
|
|
646 int max_seq_len;
|
|
647 int *seqlen_array;
|
|
648 /* progress_t *prProgress; */
|
|
649 /* int uStepNo, uTotalStepNo; */
|
|
650 ktuple_param_t aln_param = default_protein_param;
|
|
651 bool bPrintCR = (rLog.iLogLevelEnabled<=LOG_VERBOSE) ? FALSE : TRUE;
|
|
652
|
|
653
|
|
654 if(prProgress == NULL) {
|
|
655 NewProgress(&prProgress, LogGetFP(&rLog, LOG_INFO),
|
|
656 "Ktuple-distance calculation progress", bPrintCR);
|
|
657 }
|
|
658
|
|
659 /* conversion to old style data types follows
|
|
660 *
|
|
661 */
|
|
662
|
|
663 seqlen_array = (int*) CKMALLOC((mseq->nseqs+1) * sizeof(int));
|
|
664 for (i=0; i<mseq->nseqs; i++) {
|
|
665 seqlen_array[i+1] = mseq->sqinfo[i].len;
|
|
666 }
|
|
667
|
|
668 /* setup alignment parameters
|
|
669 */
|
|
670 if (SEQTYPE_PROTEIN == mseq->seqtype) {
|
|
671 DNAFLAG = FALSE;
|
|
672 max_res_code = strlen(AMINO_ACID_CODES)-2;
|
|
673 aln_param = default_protein_param;
|
|
674
|
|
675 } else if (SEQTYPE_RNA == mseq->seqtype || SEQTYPE_DNA == mseq->seqtype) {
|
|
676 DNAFLAG = TRUE;
|
|
677 max_res_code = strlen(NUCLEIC_ACID_CODES)-2;
|
|
678 aln_param = default_dna_param;
|
|
679
|
|
680 } else {
|
|
681 Log(&rLog, LOG_FATAL, "Internal error in %s: Unknown sequence type.", __FUNCTION__);
|
|
682 }
|
|
683
|
|
684 if (NULL!=param_override) {
|
|
685 aln_param.ktup = param_override->ktup;
|
|
686 aln_param.wind_gap = param_override->wind_gap;
|
|
687 aln_param.signif = param_override->signif;
|
|
688 aln_param.window = param_override->window;
|
|
689 }
|
|
690
|
|
691 /*LOG_DEBUG("DNAFLAG = %d max_res_code = %d", DNAFLAG, max_res_code);*/
|
|
692
|
|
693 /* convert mseq to clustal's old-style int encoded sequences (unit-offset)
|
|
694 */
|
|
695 max_aln_length = 0;
|
|
696 max_seq_len = 0;
|
|
697 seq_array = (char **) CKMALLOC((mseq->nseqs+1) * sizeof(char *));
|
|
698 seq_array[0] = NULL;
|
|
699 /* FIXME check that non of the seqs is smaller than ktup (?).
|
|
700 * Otherwise segfault occurs
|
|
701 */
|
|
702 for (i=0; i<mseq->nseqs; i++) {
|
|
703 seq_array[i+1] = (char *) CKMALLOC((seqlen_array[i+1]+2) * sizeof (char));;
|
|
704 }
|
|
705 for (i=0; i<mseq->nseqs; i++) {
|
|
706 /*LOG_DEBUG("calling encode with seq_array[%d+1] len=%d and seq=%s",
|
|
707 i, seqlen_array[i+1], mseq->seq[i]);*/
|
|
708 if (TRUE == DNAFLAG) {
|
|
709 encode(&(mseq->seq[i][-1]), seq_array[i+1],
|
|
710 seqlen_array[i+1], NUCLEIC_ACID_CODES);
|
|
711 } else {
|
|
712 encode(&(mseq->seq[i][-1]), seq_array[i+1],
|
|
713 seqlen_array[i+1], AMINO_ACID_CODES);
|
|
714 }
|
|
715
|
|
716 if (seqlen_array[i+1]>max_seq_len) {
|
|
717 max_seq_len = seqlen_array[i+1];
|
|
718 }
|
|
719 }
|
|
720 max_aln_length = max_seq_len * 2;
|
|
721 /* see sequence.c in old source */
|
|
722
|
|
723 /* FIXME: short sequences can cause seg-fault
|
|
724 * because max_aln_length can get shorter
|
|
725 * than (max_res_code+1)^k
|
|
726 * FS, r222->r223 */
|
|
727 max_aln_length = max_aln_length > pow((max_res_code+1), aln_param.ktup)+1 ?
|
|
728 max_aln_length : pow((max_res_code+1), aln_param.ktup)+1;
|
|
729
|
|
730 /*
|
|
731 *
|
|
732 * conversion to old style clustal done (in no time) */
|
|
733
|
|
734
|
|
735 accum = (int **) CKCALLOC(5, sizeof (int *));
|
|
736 for (i=0;i<5;i++) {
|
|
737 accum[i] = (int *) CKCALLOC((2*max_aln_length+1), sizeof(int));
|
|
738 }
|
|
739 zza = (int *) CKCALLOC( (max_aln_length+1), sizeof(int));
|
|
740 zzb = (int *) CKCALLOC( (max_aln_length+1), sizeof(int));
|
|
741 zzc = (int *) CKCALLOC( (max_aln_length+1), sizeof(int));
|
|
742 zzd = (int *) CKCALLOC( (max_aln_length+1), sizeof(int));
|
|
743
|
|
744 /* estimation of total number of steps (if istart and jstart are
|
|
745 * both 0) (now handled in the calling routine)
|
|
746 */
|
|
747 /* uTotalStepNo = iend*jend - iend*iend/2 + iend/2;
|
|
748 uStepNo = 0; */
|
|
749 /*LOG_DEBUG("istart=%d iend=%d jstart=%d jend=%d", istart, iend, jstart, jend);*/
|
|
750
|
|
751 for (i=istart+1; i<=iend; ++i) {
|
|
752 /* by definition a sequence compared to itself should give
|
|
753 a score of 0. AW */
|
|
754 SymMatrixSetValue(tmat, i-1, i-1, 0.0);
|
|
755 make_ptrs(zza, zzc, i, seqlen_array[i], aln_param.ktup, max_res_code, seq_array);
|
|
756
|
|
757 #ifdef HAVE_OPENMP
|
|
758 #pragma omp critical(ktuple)
|
|
759 #endif
|
|
760 {
|
|
761 ProgressLog(prProgress, *ulStepNo, ulTotalStepNo, FALSE);
|
|
762 }
|
|
763
|
|
764 for (j=MAX(i+1, jstart+1); j<=jend; ++j) {
|
|
765 (*ulStepNo)++;
|
|
766 private_step_no++;
|
|
767 /*LOG_DEBUG("comparing pair %d:%d", i, j);*/
|
|
768
|
|
769 make_ptrs(zzb, zzd, j, seqlen_array[j], aln_param.ktup, max_res_code, seq_array);
|
|
770 pair_align(i, seqlen_array[i], seqlen_array[j], max_res_code, &aln_param,
|
|
771 seq_array, &maxsf, accum, max_aln_length, zza, zzb, zzc, zzd);
|
|
772
|
|
773 if (!maxsf) {
|
|
774 calc_score=0.0;
|
|
775 } else {
|
|
776 calc_score=(double)accum[0][maxsf];
|
|
777 if (percent) {
|
|
778 dsr=(seqlen_array[i]<seqlen_array[j]) ?
|
|
779 seqlen_array[i] : seqlen_array[j];
|
|
780 calc_score = (calc_score/(double)dsr) * 100.0;
|
|
781 }
|
|
782 }
|
|
783
|
|
784 /* printf("%d %d %d\n", i-1, j-1, (100.0 - calc_score)/100.0); */
|
|
785 SymMatrixSetValue(tmat, i-1, j-1, (100.0 - calc_score)/100.0);
|
|
786
|
|
787 /* the function allows you not to compute the full matrix.
|
|
788 * here we explicitely make the resulting matrix a
|
|
789 * rectangle, i.e. we always set full rows. in other
|
|
790 * words, if we don't complete the full matrix then we
|
|
791 * don't have a full symmetry. so only use the defined
|
|
792 * symmetric part. AW
|
|
793 */
|
|
794 /*LOG_DEBUG("setting %d : %d = %f", j, i, tmat[i][j]);*/
|
|
795 /* not needed anymore since we use symmatrix_t
|
|
796 if (j<=iend) {
|
|
797 tmat[j][i] = tmat[i][j];
|
|
798 }
|
|
799 */
|
|
800 #ifdef HAVE_OPENMP
|
|
801 #pragma omp critical(ktuple)
|
|
802 #endif
|
|
803 {
|
|
804 Log(&rLog, LOG_DEBUG, "K-tuple distance for sequence pair %d:%d = %lg",
|
|
805 i, j, SymMatrixGetValue(tmat, i-1, j-1));
|
|
806 }
|
|
807 }
|
|
808 }
|
|
809 /*
|
|
810 Log(&rLog, LOG_FORCED_DEBUG, "uTotalStepNo=%d for istart=%d iend=%d jstart=%d jend=%d", uStepNo, istart, iend, jstart, jend);
|
|
811 Log(&rLog, LOG_FORCED_DEBUG, "Fabian = %d", iend*jend - iend*iend/2 + iend/2);
|
|
812 */
|
|
813
|
|
814 /* printf("\n\n%d\t%d\t%d\t%d\n\n", omp_get_thread_num(), uStepNo, istart, iend); */
|
|
815
|
|
816 for (i=0;i<5;i++) {
|
|
817 CKFREE(accum[i]);
|
|
818 }
|
|
819 CKFREE(accum);
|
|
820
|
|
821 #ifdef HAVE_OPENMP
|
|
822 #pragma omp critical(ktuple)
|
|
823 #if 0
|
|
824 {
|
|
825 printf("steps: %d\n", private_step_no);
|
|
826 }
|
|
827 #endif
|
|
828 #endif
|
|
829
|
|
830 CKFREE(zza);
|
|
831 CKFREE(zzb);
|
|
832 CKFREE(zzc);
|
|
833 CKFREE(zzd);
|
|
834
|
|
835 free(seqlen_array);
|
|
836
|
|
837 for (i=1; i<=mseq->nseqs; i++) {
|
|
838 CKFREE(seq_array[i]);
|
|
839 }
|
|
840 CKFREE(seq_array);
|
|
841 }
|
|
842 /* end of KTuplePairDist */
|