view genome_diversity/src/mt_pi.c @ 24:248b06e86022

Added gd_genotype datatype. Modified tools to support new datatype.
author Richard Burhans <burhans@bx.psu.edu>
date Tue, 28 May 2013 16:24:19 -0400
parents
children
line wrap: on
line source

/* mt_pi -- compute the diversity measure pi for mitochondrial genomes
*  [SHOULD I OPTIONALLY INCLUDE INDELS?]
*
*  argv[1] -- SNP table for the mitogenome
*
*  argv[2] -- file of intervals with lines like:

P.ingens-mt     175     194     6       9-M-352

*  giving genome name, start postion (base-0), end position (half open),
*  coverage and sample name.
*
*  argv[3] -- the minimum coverage. Intervals of lower coverage are ignored.
*
*  argv[4], argv[5], ... column:name pairs like "9:sam".
*  
*  Also, if the last argument is "debug", then much output sent to stderr, if it
*  is "debug2", then the coverage and difference-count for each mitogenome-pair
*  is sent to stderr.
*/

#include "lib.h"
#include "mito_lib.h"

int debug2;

// for a pair of samples, determine how much of the reference is in the
// adequately covered segments for both, and count the number of SNPs in those
// shared regions where they differ
// PUTATIVE HETEROPLASMIES ARE IGNORED
float pair(int i, int j, int nS) {
	int covered, B, E, diffs, k;
	struct interv *p = M[i].intervals, *q = M[j].intervals;
	char x, y;

	// k scans the SNPs
	covered = diffs = k = 0;
	while (p && q) {
		if (debug)
			fprintf(stderr, "trying %d-%d and %d-%d\n",
			  p->b, p->e, q->b, q->e);
		// take the intersection of the two well-covered intervals
		B = MAX(p->b, q->b);
		E = MIN(p->e, q->e);
		if (B < E) {
			if (debug)
				fprintf(stderr, "  covered %d\n", E-B);
			covered += (E - B);
			for ( ; k < nS && S[k].pos < E; ++k) {
				if (S[k].pos >= B) {
					x = S[k].g[i];
					y = S[k].g[j];
					if (debug)
						fprintf(stderr,
						  "   SNP %c %c at %d\n",
						  x, y, S[k].pos);
/*
					if (x == '-' || y == '-')
						fatalf("SNP at %d missing geno",
						  S[k].pos);
*/
/*
					if (x == '1' || y == '1')
						continue;
*/
					if (x != y) {
						++diffs;
						if (debug)
							fprintf(stderr,
							  "\tdiff at %d\n",
							  S[k].pos);
					}
				}
			}
		}
		// go to next adequately covered interval(s)
		if (p->e < q->e)
			p = p->next;
		else if (p->e > q->e)
			q = q->next;
		else {
			p = p->next;
			q = q->next;
		}
	}
	if (debug2)
		fprintf(stderr, "cov(%s,%s) = %d, diffs = %d\n",
		  M[i].name, M[j].name, covered, diffs);
/*
	if (covered == 0)
		fatalf("coverage threshold is too high for %s and %s",
		  M/[i].name, M[j].name);
*/
	if (covered == 0)
		return -1.0;
	return (float)diffs/(float)covered;
}

int main(int argc, char **argv) {
	struct interv *t;
	int i, j, nS, good_pairs, bad_pairs;
	char *a, *s;
	float tot, pr;

	if (argc > 4 && same_string(argv[argc-1], "debug")) {
		--argc;
		debug = debug2 = 1;
	} else if (argc > 4 && same_string(argv[argc-1], "debug2")) {
		--argc;
		debug2 = 1;
	}

	if (argc < 5)
		fatal("args: snps intervals min_cov 9:sam 13:judy ... ");
	// store sample names and start positions (argv[4], argv[5], ...)
	for (nM = 0, i = 4; i < argc; ++nM, ++i) {
		if (nM >= MAX_SAMPLE)
			fatalf("Too many mitogenomes");
		if ((s = strchr(a = argv[i], ':')) == NULL)
			fatalf("colon: %s", a);
		M[nM].col = atoi(a);
		M[nM].name = copy_string(s+1);
	}
	min_cov = atoi(argv[3]);
	get_intervals(argv[2]);

	if (debug) {
		for (i = 0; i < nM; ++i) {
			fprintf(stderr, ">%s\n", M[i].name);
			for (t = M[i].intervals; t; t = t->next)
				fprintf(stderr, "%d\t%d\n", t->b, t->e);
		}
		putc('\n', stderr);
	}

	// record the SNPs
	nS = get_variants(argv[1], S, 0);

	if (debug) {
		for (i = 0; i < nS; ++i)
			fprintf(stderr, "%d %s\n", S[i].pos, S[i].g);
		putc('\n', stderr);
	}

	// record the total rate of diversity, over all pairs of individuals
	// having overlapping well-covered intervals
	good_pairs = bad_pairs = 0;
	for (i = 0, tot = 0.0; i < nM; ++i) {
		for (j = i+1; j < nM; ++j) {
			pr = pair(i, j, nS);
			if (pr >= 0.0) {
				++good_pairs;
				tot += pr;
			} else
				++bad_pairs;
		}
	}
	printf("pi = %5.5f\n", tot/(float)good_pairs);
	if (bad_pairs > 0)
		printf("%d of %d pairs had no sequenced regions in common\n",
		  bad_pairs, bad_pairs + good_pairs);

	return 0;
}