Mercurial > repos > miller-lab > genome_diversity

--- a/datatypes_conf.xml	Mon Apr 09 12:03:06 2012 -0400
+++ b/datatypes_conf.xml	Mon Apr 09 12:40:57 2012 -0400
@@ -9,4 +9,5 @@
     <datatype extension="wsf" type="galaxy.datatypes.wsf:SnpFile" display_in_upload="true"/>
     <datatype extension="wpf" type="galaxy.datatypes.wsf:SapFile" display_in_upload="true"/>
   </registration>
+   <sniffers/>
 </datatypes>
--- a/genome_diversity/src/dpmix.c	Mon Apr 09 12:03:06 2012 -0400
+++ b/genome_diversity/src/dpmix.c	Mon Apr 09 12:40:57 2012 -0400
@@ -12,15 +12,15 @@
 *    argv[6] = switch penalty (>= 0)
 *    argv[7] = file giving heterochromatic intervals ('-' means that no file is
 *	       given)
-*    argv[8] = file name to write additional output
-*    argv[9], argv[10], ...,  have the form "13:1:paul", "13:2:fred" or
+*    argv[8] = file name for additional output
+*    argv[9], argv[10], ...,  have the form "13:1:Peter", "13:2:Paul" or
 *	      "13:0:Mary", meaning that the 13th and 14th columns (base 1)
 *	      give the allele counts for an individual that is in ancestral
 *	      population 1, ancestral population 2, or is a potentially admixed
 *	      individual, resp.

 What it does on Galaxy
-The user specifies two "ancestral" populations (i.e., sources for chromosomes) and a set of potentially admixed individuals, and chooses between the sequence coverage or the estimated genotypes to measure the similarity of genomic intervals in admixed individuals to the two classes of ancestral chromosomes. The user also picks a "switch penalty", typically between 10 and 100. For each potentially admixed individual, the program divides the genome into three "genotypes": (0) homozygous for the second ancestral population (i.e., both chromosomes from that population), (1) heterozygous, or (2) homozygous for the second ancestral population. Parts of a chromosome that are labeled as "heterochromatic" via argv[7] are given the "non-genotype, 3. Smaller values of the switch penalty (corresponding to more ancient admixture events) generally lead to the reconstruction of more frequent changes between genotypes.
+The user specifies two "ancestral" populations (i.e., sources for chromosomes) and a set of potentially admixed individuals, and chooses between the sequence coverage or the estimated genotypes to measure the similarity of genomic intervals in admixed individuals to the two classes of ancestral chromosomes. The user also picks a "switch penalty", typically between 10 and 100. For each potentially admixed individual, the program divides the genome into three "genotypes": (0) homozygous for the second ancestral population (i.e., both chromosomes from that population), (1) heterozygous, or (2) homozygous for the second ancestral population. Parts of a reference chromosome that are labeled as "heterochromatic" are given the non-genotype, 3. Smaller values of the switch penalty (corresponding to more ancient admixture events) generally lead to the reconstruction of more frequent changes between genotypes.
 */

 #include "lib.h"
@@ -29,20 +29,23 @@
 // maximum length of a line from the table
 #define MOST 5000

+// we create a linked list of "events" on a chromosome -- mostly SNPs, but
+// also ends of hetorochomatic intervals
 struct snp {
 	double F1, F2;	// reference allele frequencies in the two populations
-	int pos, het_end, *g; // position and an array of admixed genotypes
+	int pos, *g,	// position and an array of admixed genotypes
+	  type;		// 0 = SNP, 1 = start of het. interval, 2 = end
 	struct snp *prev;	// we keep the list in order of decreasing pos
 } *last;

+// array of potentially admixed individuals
 struct admixed {
 	char *name;
 	int gcol, ge20, gt02;
-	long long x[4];
+	long long x[4];		// number of reference bp in each state
 } A[MOST];

-// information about the specified individuals, namely column and population
-// for the ancestral populations and genotype columns for the admixed.
+// information about "ancestral" individuals, namely column and population
 struct ances {
 	int col, pop;
 	char *name;
@@ -55,8 +58,7 @@
 } H[MOST];

 // global variables
-double *S[4];	// S[s][i] = best score ending in state s at SNP i
-int *B[4],	// backpointer to state at the previous SNP
+int *B[4],	// backpointer to state at the previous SNP (or event)
     *P;		// chromosome position
 int nH, nI, nG, genotypes, nsnp, debug, chr_col, logs;
 char this_chr[100];
@@ -94,7 +96,6 @@
 			p = f1*(1.0-f2)  + (1.0-f1)*f2;
 	}

-//fprintf(stderr, "%f %f %d %d => %f\n", f1, f2, g, s, p);
 	if (p < 0.0)
 		fatalf("%f %f %d %d => %f", f1, f2, g, s, p);
 	if (!logs)
@@ -122,46 +123,78 @@
 	return s;
 }

-// process one potentially admixed individual
+/* Process the a-th potentially admixed individual.
+*  We think of a graph with nodes (event, state) for each event (SNP or
+*  end-point of a heterochromatic interval on the current chromosome) and state
+*  = 0, 1, 2, 3 (corresponding to genotypes 0, 1, and 2, plus 3 =
+*  heterochromatin); for events other than the last one, there are edges from
+*  each (event, state) to (event+1, k) for 0 <= k <= 3. An edge (event, j) to
+*  (event+1, k) has penalty 0 if j = k and penalty switch_penalty otherwise.
+*  The bonus at SNP node (event, state) for 0 <= state <= 2 is the probability
+*  of generating the genotype observed in the a-th potentially admixed
+*  individual given the allele frequences in the two ancestral populations and
+*  the assumed admixture state in this region of the chromosome. The score of a
+*  path is the sum of the node bonuses minus the sum of the edge penalties.
+*
+*  Working backwards through the events, we compute the maximum path score,
+*  from[state], from (event,state) back to the closest admixed interval.
+*  To force paths to reach state 3 at an event signalling the start of a
+*  heterochromatic interval (type = 1), but to avoid state 3 at other events,
+*  we assign huge but arbitrary negative scores (see "avoid", below).
+*  At (event,state), B[event][state] is the backpointer to the state at
+*  event+1 on an optimal path. Finally, we follow backpointers to partition
+*  the chromosome into admixture states.
+*/
 void one_admix(int a) {
 	int i, j, m, state, prev_pos, b;
-	double from[4];
+	double from[4], f[4], ff[4], avoid = -1000000.0;
 	struct snp *p;

+	// from[i] = highest score of a path from the current event
+	// (usually a SNP) to the next (to the right) heterochromatic interval
+	// or the end of the chromosome. The score of the path is the sum of
+	// SNP scores minus (switch_penalty times number of state switches).
+	// We assume that the last two event on the chromosome are the start
+	// and end of a heterochromatic interval (possibly of length 0)/
+	for (i = 0; i < 4; ++i)
+		from[i] = 0;
 	for (i = nsnp-1, p = last; i >= 0 && p != NULL; --i, p = p->prev) {
 		for (state = 0; state < 4; ++state) {
-			// end (start, rather) in this state
-			for (j = 0; j < 4; ++j) {
-				// preceding state is j
-				from[j] = S[j][i+1];
+			// find highest path-score from this event onward
+			for (m = j = 0; j < 4; ++j) {
+				f[j] = from[j];
 				if (j != state)
-					from[j] -= switch_penalty;
+					f[j] -= switch_penalty;
 				//if (abs(j-state) == 2)
 					//from[j] -= switch_penalty;
-			}
-			for (m = 0, j = 1; j < 4; ++j)
-				if (from[j] > from[m])
+				if (f[j] > f[m])
 					m = j;
-			S[state][i] = from[m];
+			}
 			B[state][i] = m;
-			if (state < 3)
-				S[state][i] +=
+			ff[state] = f[m];
+			if (state < 3 && p->type == 0)
+				ff[state] +=
 				    score(p->F1, p->F2, p->g[a], state);
 		}
-		if (p->het_end == 1) {
-			S[3][i] = 0;
-			S[0][i] = S[1][i] = S[2][i] = -1000000;
-		} else
-			S[3][i] = -1000000;
+		if (p->type == 1) {
+			// start of heterochomatic interval. Force paths
+			// reaching this point to go through state 3
+			from[3] = 0;
+			from[0] = from[1] = from[2] = avoid;
+		} else {
+			for (j = 0; j < 3; ++j)
+				from[j] = ff[j];
+			from[3] = avoid;
+		}
 		if (debug)
-			fprintf(stderr, "%d: %f(%d) %f(%d) %f(%d)\n",
-			  i, S[0][i], B[0][i], S[1][i], B[1][i], S[2][i],
-			  B[2][i]);
+			fprintf(stderr, "%d: %f(%d) %f(%d) %f(%d) %f(%d)\n",
+			  i, from[0], B[0][i], from[1], B[1][i], from[2],
+			  B[2][i], from[3], B[3][i]);
 	}

 	// find the best initial state
 	for (state = 0, j = 1; j < 4; ++j)
-		if (S[j][0] > S[state][0])
+		if (from[j] > from[state])
 			state = j;

 	// trace back to find the switch points
@@ -176,21 +209,17 @@
 			state = b;
 		}
 	}
-/*
-	printf("%s\t%d\t%d\t%d\t%s\n",
-	  this_chr, prev_pos, P[nsnp-1], state, A[a].name);
-	A[a].x[state] += (P[nsnp-1]-prev_pos);
-*/
 }

-// add a heterochromatic interval to the SNP list
-void add_het(int b, int e) {
+// Add a heterochromatic interval to the SNP list, where type = 1 signifies
+// the start of the interval, 2 signifies the end.
+void add_het(int b, int type) {
 	struct snp *new = ckalloc(sizeof(struct snp));
 	int i;

 	new->F1 = new->F2 = 0.0;
 	new->pos = b;
-	new->het_end = e;
+	new->type = type;
 	new->g = ckalloc(nG*sizeof(int));
 	for (i = 0; i < nG; ++i)
 		new->g[i] = 0;
@@ -198,10 +227,13 @@
 	last = new;
 }

-// Process one chromosome. Read the SNPs on the chromosome (the first one is
-// already in the buf). For each SNP, determine the frequencies of the
-// reference allele in the two ancestral populations, and put them in the
-// linked list. Then call the dynamic-programming routine.
+/* Process one chromosome. Read the SNPs on the chromosome (the first one is
+*  already in the buf). Boil each SNP down to the contents of a SNP entry
+*  (pos, F1, F2, g[]) and put it in the linked list. Also, intersperse the
+*  "events" corresponding to the start and end of a heterochromatic interval.
+*  Then call the dynamic-programming routine for each potentially admixed
+*  individual.
+*/
 void one_chr() {
 	char *s, *z = "\t\n";
 	int X[MOST], n, i, g, A1, B1, A2, B2, a, do_read, p, pos, het;
@@ -211,6 +243,7 @@
 	strcpy(this_chr, get_chr_name());
 	nsnp = 0;
 	last = NULL;
+	// advance to this chromosome in the list of heterochromatic intervals
 	for (het = 0; het < nH && !same_string(this_chr, H[het].chr); ++het)
 		;
 	// loop over the SNPs on the current chromosome
@@ -225,7 +258,8 @@
 		  ++i, s = strtok(NULL, z))
 			X[i] = atoi(s);

-		// insert heterochomatin intervals coming before the SNP
+		// insert events (pseudo-SNPs) for heterochomatin intervals
+		// coming before the SNP
 		pos = X[chr_col+1];
 		while (het < nH && same_string(this_chr, H[het].chr) &&
 		   H[het].b < pos) {
@@ -236,7 +270,7 @@
 		}

 		// should we discard this SNP?
-		if ((pos = X[chr_col+1]) == -1)	// SNP not mapped to the reference
+		if (pos == -1)	// SNP not mapped to the reference
 			continue;
 		for (i = 0; i < nG && X[A[i].gcol] >= 0; ++i)
 			;
@@ -244,7 +278,7 @@
 			continue;

 		// add SNP to a "backward pointing" linked list, recording the
-		// "major" allele frequencies in the two reference populations
+		// major allele frequencies in the two reference populations
 		// and genotypes in the potential admixed individuals
 		for (i = A1 = B1 = A2 = B2 = 0; i < nI; ++i) {
 			n = C[i].col;
@@ -279,7 +313,7 @@
 		new->pos = X[chr_col+1];
 		new->F1 = F1 = (double)A1/(double)(A1+B1);
 		new->F2 = F2 = (double)A2/(double)(A2+B2);
-		new->het_end = 0;
+		new->type = 0;
 		new->g = ckalloc(nG*sizeof(int));
 		for (i = 0; i < nG; ++i) {
 			g = new->g[i] = X[A[i].gcol];
@@ -307,7 +341,7 @@
 /*
 printf("nsnp = %d\n", nsnp);
 for (i = nsnp-1, new = last; i >= 0 && new != NULL; --i, new = new->prev) {
-printf("%d %d ", new->pos, new->het_end);
+printf("%d %d ", new->pos, new->type);
 printf("%g %g ", new->F1, new->F2);
 for (a = 0; a < nG; ++a)
 printf("%d", new->g[a]);
@@ -318,14 +352,12 @@
 */

 	// allocate arrays for the DP analysis
-	P = ckalloc(nsnp*sizeof(int));
+	P = ckalloc(nsnp*sizeof(int));	// position of each event
 	for (i = nsnp-1, new = last; i >= 0 && new != NULL;
 	     --i, new = new->prev)
 		P[i] = new->pos;

-	for (i = 0; i < 4; ++i) {
-		S[i] = ckalloc((nsnp+1)*sizeof(double));
-		S[i][nsnp] = 0.0;
+	for (i = 0; i < 4; ++i) {	// space for back-pointers
 		B[i] = ckalloc((nsnp+1)*sizeof(int));
 		B[i][nsnp] = 0;
 	}
@@ -342,20 +374,18 @@
 		free(new);
 	}
 	free(P);
-	for (i = 0; i < 4; ++i) {
-		free(S[i]);
+	for (i = 0; i < 4; ++i)
 		free(B[i]);
-	}
 }

 int main(int argc, char **argv) {
 	int n, i, j, k, saw[3];
-	long long het_len;
+	long long het_len, ref_len;
 	float N;
 	char nam[100], *chr;

-	if (argc < 8)
-		fatal("args: table chr-col chr data-source switch heterochrom outfile n:1:name1 m:2:name2 ...");
+	if (argc < 9)
+		fatal("args: table chr-col chr data-source logs switch heterochrom outfile n:1:name1 m:2:name2 ...");
 	if (same_string(argv[argc-1], "debug")) {
 		debug = 1;
 		--argc;
@@ -365,19 +395,18 @@
 	chr_col = atoi(argv[2]);
 	chr = argv[3];
 	genotypes = atoi(argv[4]);
+
 	logs = atoi(argv[5]);
 	if (logs)
 		fatal("logarithms of probabilities -- under development");
+	//if (logs) switch_penalty = log(switch_penalty);
+
 	switch_penalty = atof(argv[6]);
-/*
-	if (logs)
-		switch_penalty = log(switch_penalty);
-*/
 	if (switch_penalty < 0.0)
 		fatal("negative switch penalty");
 	out = ckopen(argv[8], "w");

-	het_len = 0;
+	het_len = ref_len = 0;
 	if (!same_string(argv[7], "-")) {
 		fp = ckopen(argv[7], "r");
 		while (fgets(buf, MOST, fp)) {
@@ -388,19 +417,18 @@
 			H[nH].chr = copy_string(nam);
 			H[nH].b = i;
 			H[nH].e = j;
+			// assumes last event per chrom. is a het. interval
+			if (nH > 0 && !same_string(nam, H[nH-1].chr))
+				ref_len += j;
 			het_len += (j - i);
 			++nH;
 		}
 		fclose(fp);
 	}
-/*
-for (i = 0; i < nH; ++i)
-printf("%s %d %d\n", H[i].chr, H[i].b, H[i].e);
-exit(0);
-*/
+	ref_len += H[nH-1].e;

+	// populations must be disjoint
 	saw[1] = saw[2] = 0;
-	// populations must be disjoint
 	for (i = 9; i < argc; ++i) {
 		if (sscanf(argv[i], "%d:%d:%s", &j, &k, nam) != 3)
 			fatalf("not like 13:2:fred : %s", argv[i]);
@@ -413,12 +441,12 @@
 			;
 		if (n < nI)
 			fatal("populations are not disjoint");
-		if (k == 0) {	// admixed individuals
+		if (k == 0) {	// admixed individual
 			if (nG >= MOST)
 				fatal("Too many admixed individuals");
 			A[nG].name = copy_string(nam);
 			A[nG++].gcol = j+2;
-		} else {	// ancestral populations
+		} else {	// in an ancestral population
 			if (nI >= MOST)
 				fatal("Too many ancestral individuals");
 			C[nI].col = j;
@@ -426,7 +454,6 @@
 			C[nI++].name = copy_string(nam);
 		}
 	}
-
 	if (saw[0] == 0)
 		fatal("no admixed individual is specified");
 	if (saw[1] == 0)
@@ -466,6 +493,12 @@
 		  A[i].name, A[i].gt02);
 	}
 	// write fractions in each state to the output text file
+
+	if (ref_len)
+		fprintf(out,
+		  "%lld of %lld reference bp (%1.1f%%) are heterochromatin\n\n",
+		  het_len, ref_len, 100.0*(float)het_len/(float)ref_len);
+
 	for (i = 0; i < nG; ++i) {
 		N = (float)(A[i].x[0] + A[i].x[1] + A[i].x[2])/100.0;
 		fprintf(out, "%s: 0 = %1.1f%%, 1 = %1.1f%%, 2 = %1.1f%%\n",