Mercurial > repos > clustalomega > clustalomega
diff clustalomega/clustal-omega-0.2.0/src/squid/shuffle.c @ 0:ff1768533a07
Migrated tool version 0.2 from old tool shed archive to new tool shed repository
author | clustalomega |
---|---|
date | Tue, 07 Jun 2011 17:04:25 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/clustalomega/clustal-omega-0.2.0/src/squid/shuffle.c Tue Jun 07 17:04:25 2011 -0400 @@ -0,0 +1,638 @@ +/***************************************************************** + * SQUID - a library of functions for biological sequence analysis + * Copyright (C) 1992-2002 Washington University School of Medicine + * + * This source code is freely distributed under the terms of the + * GNU General Public License. See the files COPYRIGHT and LICENSE + * for details. + *****************************************************************/ + +/* shuffle.c + * + * Routines for randomizing sequences. + * + * All routines are alphabet-independent (DNA, protein, RNA, whatever); + * they assume that input strings are purely alphabetical [a-zA-Z], and + * will return strings in all upper case [A-Z]. + * + * All return 1 on success, and 0 on failure; 0 status invariably + * means the input string was not alphabetical. + * + * StrShuffle() - shuffled string, preserve mono-symbol composition. + * StrDPShuffle() - shuffled string, preserve mono- and di-symbol composition. + * + * StrMarkov0() - random string, same zeroth order Markov properties. + * StrMarkov1() - random string, same first order Markov properties. + * + * StrReverse() - simple reversal of string + * StrRegionalShuffle() - mono-symbol shuffled string in regional windows + * + * There are also similar routines for shuffling alignments: + * + * AlignmentShuffle() - alignment version of StrShuffle(). + * AlignmentBootstrap() - sample with replacement; a bootstrap dataset. + * QRNAShuffle() - shuffle a pairwise alignment, preserving all gap positions. + * + * CVS $Id: shuffle.c,v 1.6 2002/10/09 14:26:09 eddy Exp) + */ + +#include <string.h> +#include <ctype.h> + +#include "squid.h" +#include "sre_random.h" + +/* Function: StrShuffle() + * + * Purpose: Returns a shuffled version of s2, in s1. + * (s1 and s2 can be identical, to shuffle in place.) + * + * Args: s1 - allocated space for shuffled string. + * s2 - string to shuffle. + * + * Return: 1 on success. + */ +int +StrShuffle(char *s1, char *s2) +{ + int len; + int pos; + char c; + + if (s1 != s2) strcpy(s1, s2); + for (len = strlen(s1); len > 1; len--) + { + pos = CHOOSE(len); + c = s1[pos]; + s1[pos] = s1[len-1]; + s1[len-1] = c; + } + return 1; +} + +/* Function: StrDPShuffle() + * Date: SRE, Fri Oct 29 09:15:17 1999 [St. Louis] + * + * Purpose: Returns a shuffled version of s2, in s1. + * (s1 and s2 may be identical; i.e. a string + * may be shuffled in place.) The shuffle is a + * "doublet-preserving" (DP) shuffle. Both + * mono- and di-symbol composition are preserved. + * + * Done by searching for a random Eulerian + * walk on a directed multigraph. + * Reference: S.F. Altschul and B.W. Erickson, Mol. Biol. + * Evol. 2:526-538, 1985. Quoted bits in my comments + * are from Altschul's outline of the algorithm. + * + * Args: s1 - RETURN: the string after it's been shuffled + * (space for s1 allocated by caller) + * s2 - the string to be shuffled + * + * Returns: 0 if string can't be shuffled (it's not all [a-zA-z] + * alphabetic. + * 1 on success. + */ +int +StrDPShuffle(char *s1, char *s2) +{ + int len; + int pos; /* a position in s1 or s2 */ + int x,y; /* indices of two characters */ + char **E; /* edge lists: E[0] is the edge list from vertex A */ + int *nE; /* lengths of edge lists */ + int *iE; /* positions in edge lists */ + int n; /* tmp: remaining length of an edge list to be shuffled */ + char sf; /* last character in s2 */ + char Z[26]; /* connectivity in last edge graph Z */ + int keep_connecting; /* flag used in Z connectivity algorithm */ + int is_eulerian; /* flag used for when we've got a good Z */ + + /* First, verify that the string is entirely alphabetic. + */ + len = strlen(s2); + for (pos = 0; pos < len; pos++) + if (! isalpha(s2[pos])) return 0; + + /* "(1) Construct the doublet graph G and edge ordering E + * corresponding to S." + * + * Note that these also imply the graph G; and note, + * for any list x with nE[x] = 0, vertex x is not part + * of G. + */ + E = MallocOrDie(sizeof(char *) * 26); + nE = MallocOrDie(sizeof(int) * 26); + for (x = 0; x < 26; x++) + { + E[x] = MallocOrDie(sizeof(char) * (len-1)); + nE[x] = 0; + } + + x = toupper(s2[0]) - 'A'; + for (pos = 1; pos < len; pos++) + { + y = toupper(s2[pos]) - 'A'; + E[x][nE[x]] = y; + nE[x]++; + x = y; + } + + /* Now we have to find a random Eulerian edge ordering. + */ + sf = toupper(s2[len-1]) - 'A'; + is_eulerian = 0; + while (! is_eulerian) + { + /* "(2) For each vertex s in G except s_f, randomly select + * one edge from the s edge list of E(S) to be the + * last edge of the s list in a new edge ordering." + * + * select random edges and move them to the end of each + * edge list. + */ + for (x = 0; x < 26; x++) + { + if (nE[x] == 0 || x == sf) continue; + + pos = CHOOSE(nE[x]); + y = E[x][pos]; + E[x][pos] = E[x][nE[x]-1]; + E[x][nE[x]-1] = y; + } + + /* "(3) From this last set of edges, construct the last-edge + * graph Z and determine whether or not all of its + * vertices are connected to s_f." + * + * a probably stupid algorithm for looking at the + * connectivity in Z: iteratively sweep through the + * edges in Z, and build up an array (confusing called Z[x]) + * whose elements are 1 if x is connected to sf, else 0. + */ + for (x = 0; x < 26; x++) Z[x] = 0; + Z[(int) sf] = keep_connecting = 1; + + while (keep_connecting) { + keep_connecting = 0; + for (x = 0; x < 26; x++) + { + y = E[x][nE[x]-1]; /* xy is an edge in Z */ + if (Z[x] == 0 && Z[y] == 1) /* x is connected to sf in Z */ + { + Z[x] = 1; + keep_connecting = 1; + } + } + } + + /* if any vertex in Z is tagged with a 0, it's + * not connected to sf, and we won't have a Eulerian + * walk. + */ + is_eulerian = 1; + for (x = 0; x < 26; x++) + { + if (nE[x] == 0 || x == sf) continue; + if (Z[x] == 0) { + is_eulerian = 0; + break; + } + } + + /* "(4) If any vertex is not connected in Z to s_f, the + * new edge ordering will not be Eulerian, so return to + * (2). If all vertices are connected in Z to s_f, + * the new edge ordering will be Eulerian, so + * continue to (5)." + * + * e.g. note infinite loop while is_eulerian is FALSE. + */ + } + + /* "(5) For each vertex s in G, randomly permute the remaining + * edges of the s edge list of E(S) to generate the s + * edge list of the new edge ordering E(S')." + * + * Essentially a StrShuffle() on the remaining nE[x]-1 elements + * of each edge list; unfortunately our edge lists are arrays, + * not strings, so we can't just call out to StrShuffle(). + */ + for (x = 0; x < 26; x++) + for (n = nE[x] - 1; n > 1; n--) + { + pos = CHOOSE(n); + y = E[x][pos]; + E[x][pos] = E[x][n-1]; + E[x][n-1] = y; + } + + /* "(6) Construct sequence S', a random DP permutation of + * S, from E(S') as follows. Start at the s_1 edge list. + * At each s_i edge list, add s_i to S', delete the + * first edge s_i,s_j of the edge list, and move to + * the s_j edge list. Continue this process until + * all edge lists are exhausted." + */ + iE = MallocOrDie(sizeof(int) * 26); + for (x = 0; x < 26; x++) iE[x] = 0; + + pos = 0; + x = toupper(s2[0]) - 'A'; + while (1) + { + s1[pos++] = 'A' + x; /* add s_i to S' */ + + y = E[x][iE[x]]; + iE[x]++; /* "delete" s_i,s_j from edge list */ + + x = y; /* move to s_j edge list. */ + + if (iE[x] == nE[x]) + break; /* the edge list is exhausted. */ + } + s1[pos++] = 'A' + sf; + s1[pos] = '\0'; + + /* Reality checks. + */ + if (x != sf) Die("hey, you didn't end on s_f."); + if (pos != len) Die("hey, pos (%d) != len (%d).", pos, len); + + /* Free and return. + */ + Free2DArray((void **) E, 26); + free(nE); + free(iE); + return 1; +} + + +/* Function: StrMarkov0() + * Date: SRE, Fri Oct 29 11:08:31 1999 [St. Louis] + * + * Purpose: Returns a random string s1 with the same + * length and zero-th order Markov properties + * as s2. + * + * s1 and s2 may be identical, to randomize s2 + * in place. + * + * Args: s1 - allocated space for random string + * s2 - string to base s1's properties on. + * + * Returns: 1 on success; 0 if s2 doesn't look alphabetical. + */ +int +StrMarkov0(char *s1, char *s2) +{ + int len; + int pos; + float p[26]; /* symbol probabilities */ + + /* First, verify that the string is entirely alphabetic. + */ + len = strlen(s2); + for (pos = 0; pos < len; pos++) + if (! isalpha(s2[pos])) return 0; + + /* Collect zeroth order counts and convert to frequencies. + */ + FSet(p, 26, 0.); + for (pos = 0; pos < len; pos++) + p[(int)(toupper(s2[pos]) - 'A')] += 1.0; + FNorm(p, 26); + + /* Generate a random string using those p's. + */ + for (pos = 0; pos < len; pos++) + s1[pos] = FChoose(p, 26) + 'A'; + s1[pos] = '\0'; + + return 1; +} + + +/* Function: StrMarkov1() + * Date: SRE, Fri Oct 29 11:22:20 1999 [St. Louis] + * + * Purpose: Returns a random string s1 with the same + * length and first order Markov properties + * as s2. + * + * s1 and s2 may be identical, to randomize s2 + * in place. + * + * Args: s1 - allocated space for random string + * s2 - string to base s1's properties on. + * + * Returns: 1 on success; 0 if s2 doesn't look alphabetical. + */ +int +StrMarkov1(char *s1, char *s2) +{ + int len; + int pos; + int x,y; + int i; /* initial symbol */ + float p[26][26]; /* symbol probabilities */ + + /* First, verify that the string is entirely alphabetic. + */ + len = strlen(s2); + for (pos = 0; pos < len; pos++) + if (! isalpha(s2[pos])) return 0; + + /* Collect first order counts and convert to frequencies. + */ + for (x = 0; x < 26; x++) FSet(p[x], 26, 0.); + + i = x = toupper(s2[0]) - 'A'; + for (pos = 1; pos < len; pos++) + { + y = toupper(s2[pos]) - 'A'; + p[x][y] += 1.0; + x = y; + } + for (x = 0; x < 26; x++) + FNorm(p[x], 26); + + /* Generate a random string using those p's. + */ + x = i; + s1[0] = x + 'A'; + for (pos = 1; pos < len; pos++) + { + y = FChoose(p[x], 26); + s1[pos] = y + 'A'; + x = y; + } + s1[pos] = '\0'; + + return 1; +} + + + +/* Function: StrReverse() + * Date: SRE, Thu Nov 20 10:54:52 1997 [St. Louis] + * + * Purpose: Returns a reversed version of s2, in s1. + * (s1 and s2 can be identical, to reverse in place) + * + * Args: s1 - allocated space for reversed string. + * s2 - string to reverse. + * + * Return: 1. + */ +int +StrReverse(char *s1, char *s2) +{ + int len; + int pos; + char c; + + len = strlen(s2); + for (pos = 0; pos < len/2; pos++) + { /* swap ends */ + c = s2[len-pos-1]; + s1[len-pos-1] = s2[pos]; + s1[pos] = c; + } + if (len%2) { s1[pos] = s2[pos]; } /* copy middle residue in odd-len s2 */ + s1[len] = '\0'; + return 1; +} + +/* Function: StrRegionalShuffle() + * Date: SRE, Thu Nov 20 11:02:34 1997 [St. Louis] + * + * Purpose: Returns a regionally shuffled version of s2, in s1. + * (s1 and s2 can be identical to regionally + * shuffle in place.) See [Pearson88]. + * + * Args: s1 - allocated space for regionally shuffled string. + * s2 - string to regionally shuffle + * w - window size (typically 10 or 20) + * + * Return: 1. + */ +int +StrRegionalShuffle(char *s1, char *s2, int w) +{ + int len; + char c; + int pos; + int i, j; + + if (s1 != s2) strcpy(s1, s2); + len = strlen(s1); + + for (i = 0; i < len; i += w) + for (j = MIN(len-1, i+w-1); j > i; j--) + { + pos = i + CHOOSE(j-i); + c = s1[pos]; + s1[pos] = s1[j]; + s1[j] = c; + } + return 1; +} + + +/* Function: AlignmentShuffle() + * Date: SRE, Sun Apr 22 18:37:15 2001 [St. Louis] + * + * Purpose: Returns a shuffled version of ali2, in ali1. + * (ali1 and ali2 can be identical, to shuffle + * in place.) The alignment columns are shuffled, + * preserving % identity within the columns. + * + * Args: ali1 - allocated space for shuffled alignment + * [0..nseq-1][0..alen-1] + * ali2 - alignment to be shuffled + * nseq - number of sequences in the alignment + * alen - length of alignment, in columns. + * + * Returns: int + */ +int +AlignmentShuffle(char **ali1, char **ali2, int nseq, int alen) +{ + int i; + int pos; + char c; + + if (ali1 != ali2) + { + for (i = 0; i < nseq; i++) strcpy(ali1[i], ali2[i]); + } + + for (i = 0; i < nseq; i++) + ali1[i][alen] = '\0'; + + for (; alen > 1; alen--) + { + pos = CHOOSE(alen); + for (i = 0; i < nseq; i++) + { + c = ali1[i][pos]; + ali1[i][pos] = ali1[i][alen-1]; + ali1[i][alen-1] = c; + } + } + + return 1; +} + +/* Function: AlignmentBootstrap() + * Date: SRE, Sun Apr 22 18:49:14 2001 [St. Louis] + * + * Purpose: Returns a bootstrapped alignment sample in ali1, + * constructed from ali2 by sampling columns with + * replacement. + * + * Unlike the other shuffling routines, ali1 and + * ali2 cannot be the same. ali2 is left unchanged. + * ali1 must be a properly allocated space for an + * alignment the same size as ali2. + * + * Args: ali1 - allocated space for bootstrapped alignment + * [0..nseq-1][0..alen-1] + * ali2 - alignment to be bootstrapped + * nseq - number of sequences in the alignment + * alen - length of alignment, in columns. + * + * Returns: 1 on success. + */ +int +AlignmentBootstrap(char **ali1, char **ali2, int nseq, int alen) +{ + int pos; + int col; + int i; + + for (pos = 0; pos < alen; pos++) + { + col = CHOOSE(alen); + for (i = 0; i < nseq; i++) + ali1[i][pos] = ali2[i][col]; + } + for (i = 0; i < nseq; i++) + ali1[i][alen] = '\0'; + + return 1; +} + + +/* Function: QRNAShuffle() + * Date: SRE, Mon Dec 10 10:14:12 2001 [St. Louis] + * + * Purpose: Shuffle a pairwise alignment x,y while preserving the + * position of gaps; return the shuffled alignment in xs, + * ys. + * + * Works by doing three separate + * shuffles, of (1) columns with residues in both + * x and y, (2) columns with residue in x and gap in y, + * and (3) columns with gap in x and residue in y. + * + * xs,x and ys,y may be identical: that is, to shuffle + * an alignment "in place", destroying the original + * alignment, just call: + * QRNAShuffle(x,y,x,y); + * + * Args: xs, ys: allocated space for shuffled pairwise ali of x,y [L+1] + * x, y: pairwise alignment to be shuffled [0..L-1] + * + * Returns: 1 on success, 0 on failure. + * The shuffled alignment is returned in xs, ys. + */ +int +QRNAShuffle(char *xs, char *ys, char *x, char *y) +{ + int L; + int *xycol, *xcol, *ycol; + int nxy, nx, ny; + int i; + int pos, c; + char xsym, ysym; + + if (xs != x) strcpy(xs, x); + if (ys != y) strcpy(ys, y); + + /* First, construct three arrays containing lists of the column positions + * of the three types of columns. (If a column contains gaps in both x and y, + * we've already simply copied it to the shuffled sequence.) + */ + L = strlen(x); + xycol = MallocOrDie(sizeof(int) * L); + xcol = MallocOrDie(sizeof(int) * L); + ycol = MallocOrDie(sizeof(int) * L); + nxy = nx = ny = 0; + + for (i = 0; i < L; i++) + { + if (isgap(x[i]) && isgap(y[i])) { continue; } + else if (! isgap(x[i]) && ! isgap(y[i])) { xycol[nxy] = i; nxy++; } + else if (isgap(x[i])) { ycol[ny] = i; ny++; } + else if (isgap(y[i])) { xcol[nx] = i; nx++; } + } + + /* Second, shuffle the sequences indirectly, via shuffling these arrays. + * Yow, careful with those indices, and with order of the statements... + */ + for (; nxy > 1; nxy--) { + pos = CHOOSE(nxy); + xsym = xs[xycol[pos]]; ysym = ys[xycol[pos]]; c = xycol[pos]; + xs[xycol[pos]] = xs[xycol[nxy-1]]; ys[xycol[pos]] = ys[xycol[nxy-1]]; xycol[pos] = xycol[nxy-1]; + xs[xycol[nxy-1]] = xsym; ys[xycol[nxy-1]] = ysym; xycol[pos] = xycol[nxy-1]; + } + for (; nx > 1; nx--) { + pos = CHOOSE(nx); + xsym = xs[xcol[pos]]; ysym = ys[xcol[pos]]; c = xcol[pos]; + xs[xcol[pos]] = xs[xcol[nx-1]]; ys[xcol[pos]] = ys[xcol[nx-1]]; xcol[pos] = xcol[nx-1]; + xs[xcol[nx-1]] = xsym; ys[xcol[nx-1]] = ysym; xcol[nx-1] = c; + } + for (; ny > 1; ny--) { + pos = CHOOSE(ny); + xsym = xs[ycol[pos]]; ysym = ys[ycol[pos]]; c = ycol[pos]; + xs[ycol[pos]] = xs[ycol[ny-1]]; ys[ycol[pos]] = ys[ycol[ny-1]]; ycol[pos] = ycol[ny-1]; + xs[ycol[ny-1]] = xsym; ys[ycol[ny-1]] = ysym; ycol[ny-1] = c; + } + + free(xycol); free(xcol); free(ycol); + return 1; +} + + +#ifdef TESTDRIVER +/* + * cc -g -o testdriver -DTESTDRIVER -L. shuffle.c -lsquid -lm + */ +int +main(int argc, char **argv) +{ + char s1[100]; + char s2[100]; + + sre_srandom(42); + strcpy(s2, "GGGGGGGGGGCCCCCCCCCC"); + /* strcpy(s2, "AGACATAAAGTTCCGTACTGCCGGGAT"); + */ + StrDPShuffle(s1, s2); + printf("DPshuffle: %s\n", s1); + StrMarkov0(s1,s2); + printf("Markov 0 : %s\n", s1); + StrMarkov1(s1,s2); + printf("Markov 1 : %s\n", s1); + + strcpy(s1, "ACGTACGT--------ACGTACGT----ACGTACGT"); + strcpy(s2, "ACGTACGTACGTACGT------------ACGTACGT"); + QRNAShuffle(s1,s2,s1,s2); + printf("QRNA : %s\n", s1); + printf(" : %s\n", s2); + + return 0; +} +#endif