view genome_diversity/src/get_pi.c @ 24:248b06e86022

Added gd_genotype datatype. Modified tools to support new datatype.
author Richard Burhans <burhans@bx.psu.edu>
date Tue, 28 May 2013 16:24:19 -0400
parents 95a05c1ef5d5
children
line wrap: on
line source

/* get_pi -- compute piNon, piSyn, thetaNon and thetaSyn
*
*  argv[1] -- SAPs file
*  argv[2] -- SNPs file
*  argv[3] -- covered intervals file
*  argv[4], argv[5], ... -- starting columns in the SNP file for the chosen
*    individuals
*
*  We assume that lines of argv[1], argv[2] and argv[3] start with a scaffold
*  name and a scaffold position, and that they are sorted on those two fields.
*  The 4th entry in an interval line gives the reference chromosome. We ignore
*  unnumbered chromosome, e.g., "chrX".
*
*  Output:
*    the number of nonsyn SNPs, total number of nonsynon sites,  piNon,
*    the number of synon SNPs, total number of synon sites, piSyn, plus
*    total length of covered intervals, thetaNon, thetaSyn
*
* What it does on Galaxy
The tool computes values that estimate some basic parameters
*/  

#include "lib.h"

// END_FILE should be larger than any real scaffold number
#define END_FILE 1000000000 // scaffold "number" signifying end-of-file
#define BUF_SIZE 50000	// maximum length of a SNP-file row

int col[10000], nC; // columns containing the chosen genotypes

// scaffold numbers and positions of the current SAP, SNP, and interval
int nbr_SAP, nbr_SNP, nbr_interv, pos_SAP, pos_SNP, beg, end, columns, debug;

// current SNP row, the variant amino acids of the SAP, interval's reference chr
char snp[BUF_SIZE], A[100], B[100], chr[100];

// number of nonsynon and snynon sites in the current interval
float all_non, all_syn;

// return the number of chromosome pairs that differ at a SNP
int diff_pair() {
	int i, j, X[1000];
	char *p, *z = "\t\n";

	// set X[i] = atoi(i-th word of SNP-file row), base 1
	for (i = 1, p = strtok(snp, z); p != NULL;
	  ++i, p = strtok(NULL, z))
		X[i] = atoi(p);
	// add genotypes to count the reference allele
	for (j = i = 0; i < nC; ++i)
		j += X[col[i]];
	// of the 2*nC chromosomes, j have the reference nucleotide
	if (debug)
		printf("get_pi: j = %d, return %d\n", j, j*(2*nC-j));
	return j*(2*nC-j);
}

// translate e.g. "scaffold234" to the integer 234
int name2nbr(char *s) {
	if (same_string(s, "chrX"))
		return 1000;
	if (same_string(s, "chrY"))
		return 1001;
	while (isalpha(*s))
		++s;
	return atoi(s);
}

// does one scaffold-position pair strictly precede another
int before(int nbra, int posa, int nbrb, int posb) {
	return (nbra < nbrb || (nbra == nbrb && posa < posb));
}

// get a SAP; set A and B; set nbr_SAP = END_FILE for end-of-file
void get_SAP(FILE *fp) {
	char buf[500], scaf_name[100];
	int old_nbr = nbr_SAP, old_pos = pos_SAP;

	if (nbr_SAP >= END_FILE)
		return;
	if (!fgets(buf, 500, fp)) {
		nbr_SAP = END_FILE;
		return;
	}
	while (buf[0] == '#')
		if (!fgets(buf, 500, fp)) {
			nbr_SAP = END_FILE;
			return;
		}
	if (columns == 8) {
		if (sscanf(buf, "%s %d %*s %*s %*s %*s %s %*s %s",
		    scaf_name, &pos_SAP, A, B) != 4)
			fatalf("bad SAP : %s", buf);
	} else if (columns == 5) {
		if (sscanf(buf, "%s %d %*s %*s %s %*s %s",
		    scaf_name, &pos_SAP, A, B) != 4)
			fatalf("bad SAP : %s", buf);
	} else
		fatalf("get_SAP: columns = %d", columns);
	nbr_SAP = name2nbr(scaf_name);
	if (before(nbr_SAP, pos_SAP, old_nbr, old_pos))
		fatalf("SAP at scaf%d %d is out of order", nbr_SAP, pos_SAP);
	if (debug)
		printf("SAP: scaf%d %d\n", nbr_SAP, pos_SAP);
}

// get a SNP
void get_SNP(FILE *fp) {
	char scaf_name[100];
	int old_nbr = nbr_SNP, old_pos = pos_SNP;

	if (nbr_SNP >= END_FILE)
		return;
	if (!fgets(snp, BUF_SIZE, fp)) {
		nbr_SNP = END_FILE+1;
		return;
	}
	while (snp[0] == '#')
		if (!fgets(snp, 500, fp)) {
			nbr_SNP = END_FILE+1;
			return;
		}
	if (sscanf(snp, "%s %d", scaf_name, &pos_SNP) != 2)
		fatalf("bad SNP : %s", snp);
	nbr_SNP = name2nbr(scaf_name);
	if (before(nbr_SNP, pos_SNP, old_nbr, old_pos)) {
		fprintf(stderr, "seq%d %d before seq%d %d\n",
		  nbr_SNP, pos_SNP, old_nbr, old_pos);
		fatalf("SNP at sequence %d %d is out of order", nbr_SNP, pos_SNP);
	}
	if (debug)
		printf("SNP: scaf%d %d\n", nbr_SNP, pos_SNP);
}

// expand fractions .333 and .666 to full double-precision accuracy
double grow(float x) {
	int chop = x;
	float remain;
	double d, third = (double)1/(double)3;

	d = (double)chop;
	remain = x - (float)chop;
	if (0.1 < remain)
		d += third;
	if (0.5 < remain)
		d += third;
	return d;
}

// read an interval; update tot_non and tot_syn
int get_interval(FILE *fp) {
	char buf[500], scaf_name[100], tmp[500], *t, *z = " \t\n";
	int old_nbr = nbr_interv, old_end = end;

	if (!fgets(buf, 500, fp))
		return 0;
	while (buf[0] == '#')
		if (!fgets(buf, 500, fp))
			return 0;
	if (columns == 0) {
		strcpy(tmp, buf);
		for (columns = 0, t = strtok(tmp, z); t != NULL;
		     ++columns, t = strtok(NULL, z))
			;
	} 
	if (columns != 5 && columns != 8)
		fatalf("interval table has %d columns", columns);
	if (columns == 8 && sscanf(buf, "%s %d %d %s %*s %*s %f %f",
	    scaf_name, &beg, &end, chr, &all_non, &all_syn) != 6)
		fatalf("bad interval : %s", buf);
	if (columns == 5) {
		if (sscanf(buf, "%s %d %d %f %f",
		    scaf_name, &beg, &end, &all_non, &all_syn) != 5)
			fatalf("bad interval : %s", buf);
		strcpy(chr, scaf_name);
	}
	nbr_interv = name2nbr(scaf_name);
	if (before(nbr_interv, beg, old_nbr, old_end))
		fatalf("interval at %s %d is out of order", scaf_name, beg);
	if (debug)
		printf("int: scaf%d %d-%d\n", nbr_interv, beg, end);
		
	return 1;
}

int main(int argc, char **argv) {
	FILE *fp1, *fp2, *fp3;
	int i, nint, nsap, no_sap, no_snp, no_chr, nsyn, nnon, d, tot_len;
	float non, syn, x;
	double tot_non = 0.0, tot_syn = 0.0,	// total sites in the intervals
	  factor;

	// process command-line arguments
	if (same_string(argv[argc-1], "debug")) {
		debug = 1;
		--argc;
	}
	if (argc < 5)
		fatal("args: SAPs SNPs intervals individual1 ... [debug]");
	fp1 = ckopen(argv[1], "r");
	fp2 = ckopen(argv[2], "r");
	fp3 = ckopen(argv[3], "r");
	for (i = 4; i < argc; ++i)
		col[i-4] = atoi(argv[i]) + 2;
	nC = argc - 4;

	// loop over the intervals
	tot_len = no_sap = nsap = no_snp = no_chr = nsyn = nnon = 0;
	non = syn = 0.0;
	for (nint = 0; get_interval(fp3); ++nint) {
		if (strncmp(chr, "chr", 3) == 0 && !isdigit(chr[3])) {
			++no_chr;
			continue;
		}
		tot_len += (end - beg);
		// expand e.g. .333 to .3333333..
		tot_non += grow(all_non);
		tot_syn += grow(all_syn);

		// skip SAPs coming before this interval
		while (before(nbr_SAP, pos_SAP, nbr_interv, beg))
			get_SAP(fp1);
		// loop over SAPs in this inteval
		while (before(nbr_SAP, pos_SAP, nbr_interv, end)) {
			++nsap;

			// look for this SAP in the SNP file
			while (before(nbr_SNP, pos_SNP, nbr_SAP, pos_SAP)) {
				if (nbr_SNP == nbr_interv && pos_SNP >= beg)
					++no_sap;
				get_SNP(fp2);
			}

			// is this the SAP we looked for?
			if (nbr_SAP == nbr_SNP && pos_SAP == pos_SNP) {
				d = diff_pair();
				if (A[0] == B[0]) {
					++nsyn;
					syn += (float)d;
				} else {
					++nnon;
					non += (float)d;
				}
				get_SNP(fp2);
			} else
				++no_snp;
			get_SAP(fp1);
		}
		// process SNPs in the interval but not in the SAP file
		while (before(nbr_SNP, pos_SNP, nbr_interv, end)) {
			if (nbr_SNP == nbr_interv && pos_SNP >= beg)
				++no_sap;
			get_SNP(fp2);
		}
	}

	// there are x = (2*nC choose 2) pairs of chromosomes
	x = (float)(nC*(2*nC-1));
	non /= x;
	syn /= x;
	printf("%d intervals\n", nint);
	if (no_chr)
		printf("ignored %d interval%s on unnumbered chromosomes, like chrX\n",
		  no_chr, no_chr == 1 ? "" : "s");
	printf("%d SNPs, %d nonsyn, %d synon\n", nsap, nnon, nsyn);
	if (no_sap)
		printf("%d SNPs in an interval are not in the SAP table\n",
		  no_sap);
	if (no_snp)
		printf("%d SNPs in an interval are not in the SNP table\n",
		  no_snp);
	printf("nonsynon: %4.3f/%4.3f = %6.5f%%\n",
	  non, tot_non, 100.0*non/tot_non);
	printf("synon: %4.3f/%4.3f = %6.5f%%\n",
	  syn, tot_syn, 100.0*syn/tot_syn);
	for (factor = 0.0, i = 1; i < 2*nC; ++i)
		factor += (1.0/(double)i);
	factor *= (double)tot_len/100.0;
	printf("%d covered bp, thetaNon = %6.5f%%, thetaSyn = %6.5f%%\n",
	 tot_len, (double)nnon/factor, (double)nsyn/factor);
	return 0;
}