view genome_diversity/src/dist_mat.c @ 24:248b06e86022

Added gd_genotype datatype. Modified tools to support new datatype.
author Richard Burhans <burhans@bx.psu.edu>
date Tue, 28 May 2013 16:24:19 -0400
parents 2c498d40ecde
children
line wrap: on
line source

/* dist_mat -- create a distance matrix in PHYLIP format for pairs of
*  specified individuals, including by default the reference sequence
*
*  argv[1] -- a Galaxy SNP table
*  argv[2] -- min coverage
*  argv[3] -- min quality
*  argv[4] -- name of reference species (or "none")
*  argv[5] -- 0 = distance from coverage; 1 = distance from genotype
*  argv[6] -- name of file for the numbers of informative SNPs
*  argv[7] -- name of file to write the Mega-format distance matrix
*  argv[k] for k > 7 have the form "13:fred", meaning that the 13th and 14th
*    columns (base 0) give the allele counts for the individual or group named
*    "fred".

What it does on Galaxy
This tool uses the selected SNP table to determine a "genetic distance" between each pair of selected individuals; the table of pairwise distances can be used by the Neighbor-Joining methods to construct a tree that depicts how the individuals are related. For a given pair of individuals, we find all SNP positions where both individuals have at least a minimum number of sequence "reads"; the individuals' distance at that SNP is defined as the absolute value of difference in the frequency of the first allele (equivalently: the second allele). For instance, if the first individuals has 5 reads of each allele and the second individual has respectivley 3 and 6 reads, then the frequencies are 1/2 and 1/3, giving a distance 1/6 at that SNP (provided that the minimum read total is at most 9). The output includes a report of the numbers of SNPs passing that thresold for each pair of individuals.

*/

#include "lib.h"

// bounds line length for a line of the Galaxy table

#define MOST 50000
#define MIN_SNPS 3

struct argument {
	int column;
	char *name;
} A[MOST];
int nA;	// number of individuals or groups + 1 (for the reference species)

#define MOST_INDIVIDUALS 1000
#define SIZ 1+MOST_INDIVIDUALS // includes the reference

double tot_diff[SIZ][SIZ];
int ndiff[SIZ][SIZ], X[MOST];

int main(int argc, char **argv) {
	FILE *fp, *gp, *mega;
	char *p, *z = "\t\n", buf[MOST], name[100], B[100], C[100], D[100],
	  *nucs = "ACGT";
	int i, j, m, n, min_coverage, too_few, ref_allele = -1, has_ref,
	  min_quality, genotype;
	double fi, fj, dist;

	if (argc < 8)
		fatal("args: Galaxy-table min-cov min-qual min-snp ref-name genotype dist-out mega-out 13:fred 16:mary ...");
	min_coverage = atoi(argv[2]);
	min_quality = atoi(argv[3]);
	genotype = atoi(argv[5]);
	if (!genotype && min_coverage <= 0 && min_quality <= 0)
		fatal("coverage and/or quality of SNPs should be constrained");

	if (same_string(argv[4], "none"))
		has_ref = 0;
	else {
		has_ref = 1;
		A[0].name = copy_string(argv[4]);
	}
	gp = ckopen(argv[6], "w");
	mega = ckopen(argv[7], "w");
	fprintf(mega, "#mega\n!Title: Galaxy;\n");
	  
	for (nA = has_ref, i = 8; i < argc; ++i, ++nA) {
		if (nA >= SIZ)
			fatal("Too many individuals");
		if (sscanf(argv[i], "%d:%s", &(A[nA].column), name) != 2)
			fatalf("bad arg: %s", argv[i]);
		A[nA].name = copy_string(name);
	}
	fprintf(mega,
	  "!Format DataType=Distance DataFormat=LowerLeft NTaxa=%d;\n\n",
	  nA);
	for (i = 0; i < nA; ++i)
		fprintf(mega, "[%d] #%s\n", i+1, A[i].name);
	fprintf(mega, "\n\n\n[");
	for (i = 1; i <= nA; ++i)
		fprintf(mega, "%4d", i);
	fprintf(mega, " ]\n");
	fp = ckopen(argv[1], "r");
	while (fgets(buf, MOST, fp)) {
		if (buf[0] == '#')
			continue;
		if (has_ref) {
			// get the reference allele
			if (sscanf(buf, "%*s %*s %s %s %*s %*s %*s %s", B, C, D)
			    != 3)
				fatalf("3 fields: %s", buf);
			if (strchr(nucs, B[0]) == NULL ||
			    strchr(nucs, C[0]) == NULL)
				fatalf("not nucs : %s %s", B, C);
			if (D[0] == B[0])
				ref_allele = 1;
			else if (D[0] == C[0])
				ref_allele = 2;
			else if (strchr(nucs, D[0]) != NULL)
				ref_allele = 3;
			else {
				if (D[0] != '-' && D[0] != 'N')
					fatalf("what is this: %s", D);
				ref_allele = -1;
			}
		}
			
		// X[i] = atoi(i-th word base-1)
		for (i = 1, p = strtok(buf, z); p != NULL;
		     ++i, p = strtok(NULL, z))
			X[i] = atoi(p);
		for (i = has_ref; i < nA; ++i) {
			m = A[i].column;
			if (X[m] + X[m+1] < min_coverage ||
			    X[m+3] < min_quality)
				continue;

			// frequency of the second allele
			if (genotype) {
				if (X[m+2] == -1)
					continue;	// no genotype
				fi = (double)X[m+2];
			} else
				fi = (double)X[m+1] / (double)(X[m]+X[m+1]);
			if (has_ref && ref_allele > 0) {
				ndiff[0][i]++;
				// reference allele might be different from both
				if (ref_allele == 1)
					tot_diff[0][i] += fi;
				else if (ref_allele == 2)
					tot_diff[0][i] += (1.0 - fi);
				else
					tot_diff[0][i] += 1.0;
			}
			for (j = i+1; j < nA; ++j) {
				n = A[j].column;
				if (X[n] + X[n+1] < min_coverage ||
				   X[n+3] < min_quality)
					continue;
				if (genotype && X[n+2] == -1)
					continue;
				ndiff[i][j]++;
				if (genotype)
					fj = (double)X[n+2];
				else
					fj = (double)X[n+1] /
					     (double)(X[n] + X[n+1]);
				fj -= fi;
				// add abs. value of difference in frequencies
				tot_diff[i][j] += (fj >= 0.0 ? fj : -fj);
			}

		}
	}
	for (i = too_few = 0; i < nA; ++i)
		for (j = i+1; j < nA; ++j)
			if (ndiff[i][j] < MIN_SNPS) {
				too_few = 1;
				fprintf(stderr,
				  "%s and %s have only %d informative SNPs\n",
				  A[i].name, A[j].name, ndiff[i][j]);
			}
	if (too_few)
		fatal("remove individuals or relax constraints");
		
	// print distances
	printf("%d\n", nA);
	for (i = 0; i < nA; ++i) {
		printf("%9s", A[i].name);
		fprintf(mega, "[%d] ", i+1);  
		for (j = 0; j < i; ++j) {
			dist = tot_diff[j][i]/(double)ndiff[j][i];
			printf(" %6.4f", dist);
			fprintf(mega, " %6.4f", dist);
		}
		fprintf(mega, "  \n");
		printf(" 0.0000");
		for (j = i+1; j < nA; ++j)
			printf(" %6.4f",
			  tot_diff[i][j]/(double)ndiff[i][j]);
		putchar('\n');
	}
	fprintf(mega, "\n\n\n\n\n");
	fclose(mega);

	// print numbers of SNPs
	for (i = 0; i < nA; ++i) {
		fprintf(gp, "%9s", A[i].name);
		for (j = 0; j < i; ++j)
			fprintf(gp, " %8d", ndiff[j][i]);
		fprintf(gp, "        0");
		for (j = i+1; j < nA; ++j)
			fprintf(gp," %8d", ndiff[i][j]);
		putc('\n', gp);
	}

	return 0;
}