view genome_diversity/src/mk_Ji.c @ 24:248b06e86022

Added gd_genotype datatype. Modified tools to support new datatype.
author Richard Burhans <burhans@bx.psu.edu>
date Tue, 28 May 2013 16:24:19 -0400
parents
children
line wrap: on
line source

/* mk_Ji -- prepare data for drawing a picture of mitogenome data
*
*  argv[1] -- SNP table for the mitogenome
*
*  argv[2] -- indel table for the mitogenome
*
*  argv[3] -- coverage table: file of intervals with lines like:

P.ingens-mt     175     194     6       9-M-352

*  giving genome name, start postion (base-0), end position (half open),
*  coverage and sample name.
*
*  argv[4] -- annotation file like

P.ingens-mt     0       70      tRNA    +       tRNA-Phe
P.ingens-mt     70      1030    rRNA    +       12S
P.ingens-mt     1030    1100    tRNA    +       tRNA-Val
P.ingens-mt     1101    2680    CDS     +       rRNA
P.ingens-mt     1101    2680    rRNA    +       16S
P.ingens-mt     2680    2755    tRNA    +       tRNA-Leu
P.ingens-mt     2758    3713    CDS     +       ND1
...
P.ingens-mt     15484   16910   D-loop  +       D-loop

*  argv[5] -- the minimum coverage. Intervals of lower coverage are ignored.
*
*  argv[6] -- either the string "default" or the name of an individual
*
*  argv[7], argv[8], ... column:name pairs like "9:sam".
*  
*  Also, if the last argument is "debug", then much output sent to stderr.
*/

#include "lib.h"
#include "mito_lib.h"

int ref_len;

// read gene annotation, change "CDS" to "gene", print for the drawing tool,
// print lines showing the genome name and length (last annotated position).
void get_genes(char *filename) {
	FILE *fp = ckopen(filename, "r");
	char buf[500], ref[100], type[100], name[100], *t;
	int b, e;

	while (fgets(buf, 500, fp)) {
		if (sscanf(buf, "%s %d %d %s %*s %s",
		    ref, &b, &e, type, name) != 5)
			fatalf("gag: %s", buf);
		t = (same_string(type, "CDS") ? "gene" : type);
		// print the Genome Annotation line
		printf("@GA=%d:%d:%s:%s\n", b, e, name, t);
	}
	printf("@GL=%d\n", ref_len = e);
	printf("@GN=%s\n", ref);
}

// print items that are adequately covered
void visible(int i, struct snp *S, int nS, char *s) {
	struct interv *t;
	int j;

	for (j = 0, t = M[i].intervals; j < nS; ++j) {
		while (t && t->e <= S[j].pos)
			t = t->next;
		if (t && t->b <= S[j].pos && S[j].g[i] == '0')
			printf(" %s%d", s, S[j].pos);
	}
}

int main(int argc, char **argv) {
	struct interv *t;
	int i, nS, nI, last_e, refcol;
	char *a, *s;

	if (argc > 7 && same_string(argv[argc-1], "debug")) {
		--argc;
		debug = 1;
	}

	if (argc < 7)
		fatal("args: snps indels intervals genes min_cov ref 9:sam 13:judy ... ");

	// store sample names and start positions (argv[6], argv[7], ...)
	for (nM = 0, i = 7; i < argc; ++nM, ++i) {
		if (nM >= MAX_SAMPLE)
			fatalf("Too many mitogenomes");
		if ((s = strchr(a = argv[i], ':')) == NULL)
			fatalf("colon: %s", a);
		M[nM].col = atoi(a);
		M[nM].name = copy_string(s+1);
	}
	if (same_string(argv[6], "default"))
		refcol = 0;
	else {
		for (i = 0; i < nM && !same_string(argv[6], M[i].name); ++i)
			;
		if (i == nM)
			fatalf("improper reference name '%s'", argv[6]);
		refcol = M[i].col;
		// fprintf(stderr, "refcol = %d\n", refcol);
	}

	// read annotation and annotate in the file for drawing
	get_genes(argv[4]);

	// record color information
	printf("@CL=rRNA:#EF8A62\n@CL=tRNA:#31A354\n@CL=gene:#B2182B\n");
	printf("@CL=missing:#67A9CF:L\n@CL=indel:#2166AC\n@CL=special:#000000\n");

	min_cov = atoi(argv[5]);

	// store the coverage data
	get_intervals(argv[3]);

	if (debug) {
		for (i = 0; i < nM; ++i) {
			fprintf(stderr, ">%s\n", M[i].name);
			for (t = M[i].intervals; t; t = t->next)
				fprintf(stderr, "%d\t%d\n", t->b, t->e);
		}
		putc('\n', stderr);
	}

	// record the variants
	nS = get_variants(argv[1], S, refcol);
	nI = get_variants(argv[2], I, refcol);

	// report the information for each sample
	for (i = 0; i < nM; ++i) {
		printf("%s", M[i].name);
		visible(i, S, nS, "");
		visible(i, I, nI, "indel=");
		last_e = 0;
		for (t = M[i].intervals; t; t = t->next) {
			if (last_e < t->b)
				printf(" missing=%d:%d", last_e, t->b);
			last_e = t->e;
		}
		if (last_e < ref_len)
			printf(" missing=%d:%d", last_e, ref_len);
		putchar('\n');
	}

	return 0;
}