comparison clustalomega/clustal-omega-0.2.0/src/squid/shuffle.c @ 0:ff1768533a07

Migrated tool version 0.2 from old tool shed archive to new tool shed repository
author clustalomega
date Tue, 07 Jun 2011 17:04:25 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:ff1768533a07
1 /*****************************************************************
2 * SQUID - a library of functions for biological sequence analysis
3 * Copyright (C) 1992-2002 Washington University School of Medicine
4 *
5 * This source code is freely distributed under the terms of the
6 * GNU General Public License. See the files COPYRIGHT and LICENSE
7 * for details.
8 *****************************************************************/
9
10 /* shuffle.c
11 *
12 * Routines for randomizing sequences.
13 *
14 * All routines are alphabet-independent (DNA, protein, RNA, whatever);
15 * they assume that input strings are purely alphabetical [a-zA-Z], and
16 * will return strings in all upper case [A-Z].
17 *
18 * All return 1 on success, and 0 on failure; 0 status invariably
19 * means the input string was not alphabetical.
20 *
21 * StrShuffle() - shuffled string, preserve mono-symbol composition.
22 * StrDPShuffle() - shuffled string, preserve mono- and di-symbol composition.
23 *
24 * StrMarkov0() - random string, same zeroth order Markov properties.
25 * StrMarkov1() - random string, same first order Markov properties.
26 *
27 * StrReverse() - simple reversal of string
28 * StrRegionalShuffle() - mono-symbol shuffled string in regional windows
29 *
30 * There are also similar routines for shuffling alignments:
31 *
32 * AlignmentShuffle() - alignment version of StrShuffle().
33 * AlignmentBootstrap() - sample with replacement; a bootstrap dataset.
34 * QRNAShuffle() - shuffle a pairwise alignment, preserving all gap positions.
35 *
36 * CVS $Id: shuffle.c,v 1.6 2002/10/09 14:26:09 eddy Exp)
37 */
38
39 #include <string.h>
40 #include <ctype.h>
41
42 #include "squid.h"
43 #include "sre_random.h"
44
45 /* Function: StrShuffle()
46 *
47 * Purpose: Returns a shuffled version of s2, in s1.
48 * (s1 and s2 can be identical, to shuffle in place.)
49 *
50 * Args: s1 - allocated space for shuffled string.
51 * s2 - string to shuffle.
52 *
53 * Return: 1 on success.
54 */
55 int
56 StrShuffle(char *s1, char *s2)
57 {
58 int len;
59 int pos;
60 char c;
61
62 if (s1 != s2) strcpy(s1, s2);
63 for (len = strlen(s1); len > 1; len--)
64 {
65 pos = CHOOSE(len);
66 c = s1[pos];
67 s1[pos] = s1[len-1];
68 s1[len-1] = c;
69 }
70 return 1;
71 }
72
73 /* Function: StrDPShuffle()
74 * Date: SRE, Fri Oct 29 09:15:17 1999 [St. Louis]
75 *
76 * Purpose: Returns a shuffled version of s2, in s1.
77 * (s1 and s2 may be identical; i.e. a string
78 * may be shuffled in place.) The shuffle is a
79 * "doublet-preserving" (DP) shuffle. Both
80 * mono- and di-symbol composition are preserved.
81 *
82 * Done by searching for a random Eulerian
83 * walk on a directed multigraph.
84 * Reference: S.F. Altschul and B.W. Erickson, Mol. Biol.
85 * Evol. 2:526-538, 1985. Quoted bits in my comments
86 * are from Altschul's outline of the algorithm.
87 *
88 * Args: s1 - RETURN: the string after it's been shuffled
89 * (space for s1 allocated by caller)
90 * s2 - the string to be shuffled
91 *
92 * Returns: 0 if string can't be shuffled (it's not all [a-zA-z]
93 * alphabetic.
94 * 1 on success.
95 */
96 int
97 StrDPShuffle(char *s1, char *s2)
98 {
99 int len;
100 int pos; /* a position in s1 or s2 */
101 int x,y; /* indices of two characters */
102 char **E; /* edge lists: E[0] is the edge list from vertex A */
103 int *nE; /* lengths of edge lists */
104 int *iE; /* positions in edge lists */
105 int n; /* tmp: remaining length of an edge list to be shuffled */
106 char sf; /* last character in s2 */
107 char Z[26]; /* connectivity in last edge graph Z */
108 int keep_connecting; /* flag used in Z connectivity algorithm */
109 int is_eulerian; /* flag used for when we've got a good Z */
110
111 /* First, verify that the string is entirely alphabetic.
112 */
113 len = strlen(s2);
114 for (pos = 0; pos < len; pos++)
115 if (! isalpha(s2[pos])) return 0;
116
117 /* "(1) Construct the doublet graph G and edge ordering E
118 * corresponding to S."
119 *
120 * Note that these also imply the graph G; and note,
121 * for any list x with nE[x] = 0, vertex x is not part
122 * of G.
123 */
124 E = MallocOrDie(sizeof(char *) * 26);
125 nE = MallocOrDie(sizeof(int) * 26);
126 for (x = 0; x < 26; x++)
127 {
128 E[x] = MallocOrDie(sizeof(char) * (len-1));
129 nE[x] = 0;
130 }
131
132 x = toupper(s2[0]) - 'A';
133 for (pos = 1; pos < len; pos++)
134 {
135 y = toupper(s2[pos]) - 'A';
136 E[x][nE[x]] = y;
137 nE[x]++;
138 x = y;
139 }
140
141 /* Now we have to find a random Eulerian edge ordering.
142 */
143 sf = toupper(s2[len-1]) - 'A';
144 is_eulerian = 0;
145 while (! is_eulerian)
146 {
147 /* "(2) For each vertex s in G except s_f, randomly select
148 * one edge from the s edge list of E(S) to be the
149 * last edge of the s list in a new edge ordering."
150 *
151 * select random edges and move them to the end of each
152 * edge list.
153 */
154 for (x = 0; x < 26; x++)
155 {
156 if (nE[x] == 0 || x == sf) continue;
157
158 pos = CHOOSE(nE[x]);
159 y = E[x][pos];
160 E[x][pos] = E[x][nE[x]-1];
161 E[x][nE[x]-1] = y;
162 }
163
164 /* "(3) From this last set of edges, construct the last-edge
165 * graph Z and determine whether or not all of its
166 * vertices are connected to s_f."
167 *
168 * a probably stupid algorithm for looking at the
169 * connectivity in Z: iteratively sweep through the
170 * edges in Z, and build up an array (confusing called Z[x])
171 * whose elements are 1 if x is connected to sf, else 0.
172 */
173 for (x = 0; x < 26; x++) Z[x] = 0;
174 Z[(int) sf] = keep_connecting = 1;
175
176 while (keep_connecting) {
177 keep_connecting = 0;
178 for (x = 0; x < 26; x++)
179 {
180 y = E[x][nE[x]-1]; /* xy is an edge in Z */
181 if (Z[x] == 0 && Z[y] == 1) /* x is connected to sf in Z */
182 {
183 Z[x] = 1;
184 keep_connecting = 1;
185 }
186 }
187 }
188
189 /* if any vertex in Z is tagged with a 0, it's
190 * not connected to sf, and we won't have a Eulerian
191 * walk.
192 */
193 is_eulerian = 1;
194 for (x = 0; x < 26; x++)
195 {
196 if (nE[x] == 0 || x == sf) continue;
197 if (Z[x] == 0) {
198 is_eulerian = 0;
199 break;
200 }
201 }
202
203 /* "(4) If any vertex is not connected in Z to s_f, the
204 * new edge ordering will not be Eulerian, so return to
205 * (2). If all vertices are connected in Z to s_f,
206 * the new edge ordering will be Eulerian, so
207 * continue to (5)."
208 *
209 * e.g. note infinite loop while is_eulerian is FALSE.
210 */
211 }
212
213 /* "(5) For each vertex s in G, randomly permute the remaining
214 * edges of the s edge list of E(S) to generate the s
215 * edge list of the new edge ordering E(S')."
216 *
217 * Essentially a StrShuffle() on the remaining nE[x]-1 elements
218 * of each edge list; unfortunately our edge lists are arrays,
219 * not strings, so we can't just call out to StrShuffle().
220 */
221 for (x = 0; x < 26; x++)
222 for (n = nE[x] - 1; n > 1; n--)
223 {
224 pos = CHOOSE(n);
225 y = E[x][pos];
226 E[x][pos] = E[x][n-1];
227 E[x][n-1] = y;
228 }
229
230 /* "(6) Construct sequence S', a random DP permutation of
231 * S, from E(S') as follows. Start at the s_1 edge list.
232 * At each s_i edge list, add s_i to S', delete the
233 * first edge s_i,s_j of the edge list, and move to
234 * the s_j edge list. Continue this process until
235 * all edge lists are exhausted."
236 */
237 iE = MallocOrDie(sizeof(int) * 26);
238 for (x = 0; x < 26; x++) iE[x] = 0;
239
240 pos = 0;
241 x = toupper(s2[0]) - 'A';
242 while (1)
243 {
244 s1[pos++] = 'A' + x; /* add s_i to S' */
245
246 y = E[x][iE[x]];
247 iE[x]++; /* "delete" s_i,s_j from edge list */
248
249 x = y; /* move to s_j edge list. */
250
251 if (iE[x] == nE[x])
252 break; /* the edge list is exhausted. */
253 }
254 s1[pos++] = 'A' + sf;
255 s1[pos] = '\0';
256
257 /* Reality checks.
258 */
259 if (x != sf) Die("hey, you didn't end on s_f.");
260 if (pos != len) Die("hey, pos (%d) != len (%d).", pos, len);
261
262 /* Free and return.
263 */
264 Free2DArray((void **) E, 26);
265 free(nE);
266 free(iE);
267 return 1;
268 }
269
270
271 /* Function: StrMarkov0()
272 * Date: SRE, Fri Oct 29 11:08:31 1999 [St. Louis]
273 *
274 * Purpose: Returns a random string s1 with the same
275 * length and zero-th order Markov properties
276 * as s2.
277 *
278 * s1 and s2 may be identical, to randomize s2
279 * in place.
280 *
281 * Args: s1 - allocated space for random string
282 * s2 - string to base s1's properties on.
283 *
284 * Returns: 1 on success; 0 if s2 doesn't look alphabetical.
285 */
286 int
287 StrMarkov0(char *s1, char *s2)
288 {
289 int len;
290 int pos;
291 float p[26]; /* symbol probabilities */
292
293 /* First, verify that the string is entirely alphabetic.
294 */
295 len = strlen(s2);
296 for (pos = 0; pos < len; pos++)
297 if (! isalpha(s2[pos])) return 0;
298
299 /* Collect zeroth order counts and convert to frequencies.
300 */
301 FSet(p, 26, 0.);
302 for (pos = 0; pos < len; pos++)
303 p[(int)(toupper(s2[pos]) - 'A')] += 1.0;
304 FNorm(p, 26);
305
306 /* Generate a random string using those p's.
307 */
308 for (pos = 0; pos < len; pos++)
309 s1[pos] = FChoose(p, 26) + 'A';
310 s1[pos] = '\0';
311
312 return 1;
313 }
314
315
316 /* Function: StrMarkov1()
317 * Date: SRE, Fri Oct 29 11:22:20 1999 [St. Louis]
318 *
319 * Purpose: Returns a random string s1 with the same
320 * length and first order Markov properties
321 * as s2.
322 *
323 * s1 and s2 may be identical, to randomize s2
324 * in place.
325 *
326 * Args: s1 - allocated space for random string
327 * s2 - string to base s1's properties on.
328 *
329 * Returns: 1 on success; 0 if s2 doesn't look alphabetical.
330 */
331 int
332 StrMarkov1(char *s1, char *s2)
333 {
334 int len;
335 int pos;
336 int x,y;
337 int i; /* initial symbol */
338 float p[26][26]; /* symbol probabilities */
339
340 /* First, verify that the string is entirely alphabetic.
341 */
342 len = strlen(s2);
343 for (pos = 0; pos < len; pos++)
344 if (! isalpha(s2[pos])) return 0;
345
346 /* Collect first order counts and convert to frequencies.
347 */
348 for (x = 0; x < 26; x++) FSet(p[x], 26, 0.);
349
350 i = x = toupper(s2[0]) - 'A';
351 for (pos = 1; pos < len; pos++)
352 {
353 y = toupper(s2[pos]) - 'A';
354 p[x][y] += 1.0;
355 x = y;
356 }
357 for (x = 0; x < 26; x++)
358 FNorm(p[x], 26);
359
360 /* Generate a random string using those p's.
361 */
362 x = i;
363 s1[0] = x + 'A';
364 for (pos = 1; pos < len; pos++)
365 {
366 y = FChoose(p[x], 26);
367 s1[pos] = y + 'A';
368 x = y;
369 }
370 s1[pos] = '\0';
371
372 return 1;
373 }
374
375
376
377 /* Function: StrReverse()
378 * Date: SRE, Thu Nov 20 10:54:52 1997 [St. Louis]
379 *
380 * Purpose: Returns a reversed version of s2, in s1.
381 * (s1 and s2 can be identical, to reverse in place)
382 *
383 * Args: s1 - allocated space for reversed string.
384 * s2 - string to reverse.
385 *
386 * Return: 1.
387 */
388 int
389 StrReverse(char *s1, char *s2)
390 {
391 int len;
392 int pos;
393 char c;
394
395 len = strlen(s2);
396 for (pos = 0; pos < len/2; pos++)
397 { /* swap ends */
398 c = s2[len-pos-1];
399 s1[len-pos-1] = s2[pos];
400 s1[pos] = c;
401 }
402 if (len%2) { s1[pos] = s2[pos]; } /* copy middle residue in odd-len s2 */
403 s1[len] = '\0';
404 return 1;
405 }
406
407 /* Function: StrRegionalShuffle()
408 * Date: SRE, Thu Nov 20 11:02:34 1997 [St. Louis]
409 *
410 * Purpose: Returns a regionally shuffled version of s2, in s1.
411 * (s1 and s2 can be identical to regionally
412 * shuffle in place.) See [Pearson88].
413 *
414 * Args: s1 - allocated space for regionally shuffled string.
415 * s2 - string to regionally shuffle
416 * w - window size (typically 10 or 20)
417 *
418 * Return: 1.
419 */
420 int
421 StrRegionalShuffle(char *s1, char *s2, int w)
422 {
423 int len;
424 char c;
425 int pos;
426 int i, j;
427
428 if (s1 != s2) strcpy(s1, s2);
429 len = strlen(s1);
430
431 for (i = 0; i < len; i += w)
432 for (j = MIN(len-1, i+w-1); j > i; j--)
433 {
434 pos = i + CHOOSE(j-i);
435 c = s1[pos];
436 s1[pos] = s1[j];
437 s1[j] = c;
438 }
439 return 1;
440 }
441
442
443 /* Function: AlignmentShuffle()
444 * Date: SRE, Sun Apr 22 18:37:15 2001 [St. Louis]
445 *
446 * Purpose: Returns a shuffled version of ali2, in ali1.
447 * (ali1 and ali2 can be identical, to shuffle
448 * in place.) The alignment columns are shuffled,
449 * preserving % identity within the columns.
450 *
451 * Args: ali1 - allocated space for shuffled alignment
452 * [0..nseq-1][0..alen-1]
453 * ali2 - alignment to be shuffled
454 * nseq - number of sequences in the alignment
455 * alen - length of alignment, in columns.
456 *
457 * Returns: int
458 */
459 int
460 AlignmentShuffle(char **ali1, char **ali2, int nseq, int alen)
461 {
462 int i;
463 int pos;
464 char c;
465
466 if (ali1 != ali2)
467 {
468 for (i = 0; i < nseq; i++) strcpy(ali1[i], ali2[i]);
469 }
470
471 for (i = 0; i < nseq; i++)
472 ali1[i][alen] = '\0';
473
474 for (; alen > 1; alen--)
475 {
476 pos = CHOOSE(alen);
477 for (i = 0; i < nseq; i++)
478 {
479 c = ali1[i][pos];
480 ali1[i][pos] = ali1[i][alen-1];
481 ali1[i][alen-1] = c;
482 }
483 }
484
485 return 1;
486 }
487
488 /* Function: AlignmentBootstrap()
489 * Date: SRE, Sun Apr 22 18:49:14 2001 [St. Louis]
490 *
491 * Purpose: Returns a bootstrapped alignment sample in ali1,
492 * constructed from ali2 by sampling columns with
493 * replacement.
494 *
495 * Unlike the other shuffling routines, ali1 and
496 * ali2 cannot be the same. ali2 is left unchanged.
497 * ali1 must be a properly allocated space for an
498 * alignment the same size as ali2.
499 *
500 * Args: ali1 - allocated space for bootstrapped alignment
501 * [0..nseq-1][0..alen-1]
502 * ali2 - alignment to be bootstrapped
503 * nseq - number of sequences in the alignment
504 * alen - length of alignment, in columns.
505 *
506 * Returns: 1 on success.
507 */
508 int
509 AlignmentBootstrap(char **ali1, char **ali2, int nseq, int alen)
510 {
511 int pos;
512 int col;
513 int i;
514
515 for (pos = 0; pos < alen; pos++)
516 {
517 col = CHOOSE(alen);
518 for (i = 0; i < nseq; i++)
519 ali1[i][pos] = ali2[i][col];
520 }
521 for (i = 0; i < nseq; i++)
522 ali1[i][alen] = '\0';
523
524 return 1;
525 }
526
527
528 /* Function: QRNAShuffle()
529 * Date: SRE, Mon Dec 10 10:14:12 2001 [St. Louis]
530 *
531 * Purpose: Shuffle a pairwise alignment x,y while preserving the
532 * position of gaps; return the shuffled alignment in xs,
533 * ys.
534 *
535 * Works by doing three separate
536 * shuffles, of (1) columns with residues in both
537 * x and y, (2) columns with residue in x and gap in y,
538 * and (3) columns with gap in x and residue in y.
539 *
540 * xs,x and ys,y may be identical: that is, to shuffle
541 * an alignment "in place", destroying the original
542 * alignment, just call:
543 * QRNAShuffle(x,y,x,y);
544 *
545 * Args: xs, ys: allocated space for shuffled pairwise ali of x,y [L+1]
546 * x, y: pairwise alignment to be shuffled [0..L-1]
547 *
548 * Returns: 1 on success, 0 on failure.
549 * The shuffled alignment is returned in xs, ys.
550 */
551 int
552 QRNAShuffle(char *xs, char *ys, char *x, char *y)
553 {
554 int L;
555 int *xycol, *xcol, *ycol;
556 int nxy, nx, ny;
557 int i;
558 int pos, c;
559 char xsym, ysym;
560
561 if (xs != x) strcpy(xs, x);
562 if (ys != y) strcpy(ys, y);
563
564 /* First, construct three arrays containing lists of the column positions
565 * of the three types of columns. (If a column contains gaps in both x and y,
566 * we've already simply copied it to the shuffled sequence.)
567 */
568 L = strlen(x);
569 xycol = MallocOrDie(sizeof(int) * L);
570 xcol = MallocOrDie(sizeof(int) * L);
571 ycol = MallocOrDie(sizeof(int) * L);
572 nxy = nx = ny = 0;
573
574 for (i = 0; i < L; i++)
575 {
576 if (isgap(x[i]) && isgap(y[i])) { continue; }
577 else if (! isgap(x[i]) && ! isgap(y[i])) { xycol[nxy] = i; nxy++; }
578 else if (isgap(x[i])) { ycol[ny] = i; ny++; }
579 else if (isgap(y[i])) { xcol[nx] = i; nx++; }
580 }
581
582 /* Second, shuffle the sequences indirectly, via shuffling these arrays.
583 * Yow, careful with those indices, and with order of the statements...
584 */
585 for (; nxy > 1; nxy--) {
586 pos = CHOOSE(nxy);
587 xsym = xs[xycol[pos]]; ysym = ys[xycol[pos]]; c = xycol[pos];
588 xs[xycol[pos]] = xs[xycol[nxy-1]]; ys[xycol[pos]] = ys[xycol[nxy-1]]; xycol[pos] = xycol[nxy-1];
589 xs[xycol[nxy-1]] = xsym; ys[xycol[nxy-1]] = ysym; xycol[pos] = xycol[nxy-1];
590 }
591 for (; nx > 1; nx--) {
592 pos = CHOOSE(nx);
593 xsym = xs[xcol[pos]]; ysym = ys[xcol[pos]]; c = xcol[pos];
594 xs[xcol[pos]] = xs[xcol[nx-1]]; ys[xcol[pos]] = ys[xcol[nx-1]]; xcol[pos] = xcol[nx-1];
595 xs[xcol[nx-1]] = xsym; ys[xcol[nx-1]] = ysym; xcol[nx-1] = c;
596 }
597 for (; ny > 1; ny--) {
598 pos = CHOOSE(ny);
599 xsym = xs[ycol[pos]]; ysym = ys[ycol[pos]]; c = ycol[pos];
600 xs[ycol[pos]] = xs[ycol[ny-1]]; ys[ycol[pos]] = ys[ycol[ny-1]]; ycol[pos] = ycol[ny-1];
601 xs[ycol[ny-1]] = xsym; ys[ycol[ny-1]] = ysym; ycol[ny-1] = c;
602 }
603
604 free(xycol); free(xcol); free(ycol);
605 return 1;
606 }
607
608
609 #ifdef TESTDRIVER
610 /*
611 * cc -g -o testdriver -DTESTDRIVER -L. shuffle.c -lsquid -lm
612 */
613 int
614 main(int argc, char **argv)
615 {
616 char s1[100];
617 char s2[100];
618
619 sre_srandom(42);
620 strcpy(s2, "GGGGGGGGGGCCCCCCCCCC");
621 /* strcpy(s2, "AGACATAAAGTTCCGTACTGCCGGGAT");
622 */
623 StrDPShuffle(s1, s2);
624 printf("DPshuffle: %s\n", s1);
625 StrMarkov0(s1,s2);
626 printf("Markov 0 : %s\n", s1);
627 StrMarkov1(s1,s2);
628 printf("Markov 1 : %s\n", s1);
629
630 strcpy(s1, "ACGTACGT--------ACGTACGT----ACGTACGT");
631 strcpy(s2, "ACGTACGTACGTACGT------------ACGTACGT");
632 QRNAShuffle(s1,s2,s1,s2);
633 printf("QRNA : %s\n", s1);
634 printf(" : %s\n", s2);
635
636 return 0;
637 }
638 #endif