Mercurial > repos > miller-lab > genome_diversity
view genome_diversity/src/admix_prep.c @ 24:248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
author | Richard Burhans <burhans@bx.psu.edu> |
---|---|
date | Tue, 28 May 2013 16:24:19 -0400 |
parents | 2c498d40ecde |
children |
line wrap: on
line source
/* admix_prep -- prepare the ".ped" and ".map" files (PLINK format) for input to * the "admixture" program. * * argv[1] -- a Galaxy SNP table * argv[2] -- required number of reads for each individual to use a SNP * argv[3] -- required genotype quality for each individual to use a SNP * argv[4] -- minimum spacing between SNPs on the same scaffold * argv[k] for k > 4 have the form "13:fred", meaning that the 13th and 14th * columns (base 0) give the allele counts for the individual or group named * "fred". What it does on Galaxy The tool converts a SNP table into two tables, called "admix.map" and "admix.ped", needed for estimating the population structure. The user can read or download those files, or simply pass this tool's output on to other programs. The user imposes conditions on which SNPs to consider, such as the minimum coverage and/or quality value for every individual, or the distance to the closest SNP in the same contig (as named in the first column of the SNP table). A useful piece of information produced by the tool is the number of SNPs meeting those conditions, which can be found by clicking on the "eye" after the program runs. */ #include "lib.h" // bounds line length for a line of the Galaxy table #define MOST 50000 struct individual { int column; char *name; } I[MOST/8]; // each individual has 4 columns and 4 tab characters int nI; // number of individuals int X[MOST]; // integer values in a row of the SNP table // bounds the number of SNPs that can be kept #define MAX_KEEP 10000000 char *S[MAX_KEEP]; // S[i] is a row of 2*nI alleles int nK; int main(int argc, char **argv) { FILE *fp, *ped, *map; char *p, *z = " \t\n", buf[MOST], trash[MOST], name[100], *s, scaf[100], prev_scaf[100]; int i, j, m, min_coverage, min_quality, min_space, nsnp, genotype, pos, prev_pos; if (argc < 5) fatal("args: Galaxy-table min-cov min-qual min-space 13:fred 16:mary ..."); min_coverage = atoi(argv[2]); min_quality = atoi(argv[3]); min_space = atoi(argv[4]); for (i = 5; i < argc; ++i, ++nI) { if (nI >= MOST/8) fatal("Too many individuals"); if (sscanf(argv[i], "%d:%s", &(I[nI].column), name) != 2) fatalf("bad arg: %s", argv[i]); I[nI].name = copy_string(name); } map = ckopen("admix.map", "w"); fp = ckopen(argv[1], "r"); prev_scaf[0] = '\0'; prev_pos = 0; for (nsnp = 0; fgets(buf, MOST, fp); ) { if (buf[0] == '#') continue; ++nsnp; if (sscanf(buf, "%s %d", scaf, &pos) != 2) fatalf("choke: %s", buf); if (same_string(scaf, prev_scaf)) { if (pos < prev_pos + min_space) continue; } else { strcpy(prev_scaf, scaf); prev_pos = -min_space; } // X[i] = atoi(i-th word base-1) strcpy(trash, buf); for (i = 1, p = strtok(trash, z); p != NULL; ++i, p = strtok(NULL, z)) X[i] = atoi(p); for (i = 0; i < nI; ++i) { m = I[i].column; if (X[m] + X[m+1] < min_coverage || X[m+3] < min_quality) break; } if (i < nI) continue; prev_pos = pos; if (nK >= MAX_KEEP) fatal("Too many SNPs"); fprintf(map, "1 snp%d 0 %d\n", nsnp, nsnp+1); s = S[nK++] = ckalloc(2*nI*sizeof(char)); for (i = j = 0; i < nI; ++i, j += 2) { genotype = X[I[i].column+2]; if (genotype == 2) s[j] = s[j+1] = '1'; else if (genotype == 0) s[j] = s[j+1] = '2'; else if (genotype == 1) { s[j] = '1'; s[j+1] = '2'; } else // undefined genotype s[j] = s[j+1] = '0'; } } fclose(map); ped = ckopen("admix.ped", "w"); for (i = 0; i < nI; ++i) { fprintf(ped, "%s 1 0 0 1 1", I[i].name); for (j = 0; j < nK; ++j) fprintf(ped, " %c %c", S[j][2*i], S[j][2*i+1]); putc('\n', ped); } printf("Using %d of %d SNPs\n", nK, nsnp); fclose(ped); return 0; }