# HG changeset patch # User Richard Burhans # Date 1370276969 14400 # Node ID 91e835060ad2f1dfa932f201264a5b2bf918fa5d # Parent cba0d7a63b82616d7e7639f69dc9c04b86c46d15 Updates to Admixture, Aggregate Individuals, and Restore Attributes to support gd_genotype diff -r cba0d7a63b82 -r 91e835060ad2 aggregate_gd_indivs.py --- a/aggregate_gd_indivs.py Wed May 29 13:49:19 2013 -0400 +++ b/aggregate_gd_indivs.py Mon Jun 03 12:29:29 2013 -0400 @@ -6,14 +6,12 @@ ################################################################################ -if len(sys.argv) < 9: +if len(sys.argv) < 6: print >> sys.stderr, "Usage" sys.exit(1) -#input, p1_input, output, lo, hi, lo_ind, lo_ind_qual = sys.argv[1:8] -#individual_metadata = sys.argv[8:] -input, p1_input, output, = sys.argv[1:4] -individual_metadata = sys.argv[4:] +input, p1_input, output, input_type = sys.argv[1:5] +individual_metadata = sys.argv[5:] p_total = Population() p_total.from_tag_list(individual_metadata) @@ -33,9 +31,19 @@ args.append(prog) args.append(input) +if input_type == 'gd_snp': + args.append('1') +elif input_type == 'gd_genotype': + args.append('0') +else: + print >> sys.stderr, "unknown input type:", input_type + sys.exit(1) + columns = p1.column_list() for column in sorted(columns): + if input_type == 'gd_genotype': + column = str(int(column) - 2) args.append(column) fh = open(output, 'w') diff -r cba0d7a63b82 -r 91e835060ad2 aggregate_gd_indivs.xml --- a/aggregate_gd_indivs.xml Wed May 29 13:49:19 2013 -0400 +++ b/aggregate_gd_indivs.xml Mon Jun 03 12:29:29 2013 -0400 @@ -1,8 +1,13 @@ - + : Append summary columns for a population aggregate_gd_indivs.py "$input" "$p1_input" "$output" + #if $input_type.choice == '0' + "gd_snp" + #else if $input_type.choice == '1' + "gd_genotype" + #end if #for $individual, $individual_col in zip($input.dataset.metadata.individual_names, $input.dataset.metadata.individual_columns) #set $arg = '%s:%s' % ($individual_col, $individual) "$arg" @@ -10,12 +15,26 @@ - + + + + + + + + + + + + + + + - + @@ -30,10 +49,11 @@ **Dataset formats** -The input datasets are in gd_snp_ and gd_indivs_ formats. -The output dataset is in gd_snp_ format. (`Dataset missing?`_) +The input datasets are in gd_snp_, gd_genotype_, and gd_indivs_ formats. +The output dataset is in gd_snp_ or gd_genotype_ format. (`Dataset missing?`_) .. _gd_snp: ./static/formatHelp.html#gd_snp +.. _gd_genotype: ./static/formatHelp.html#gd_genotype .. _gd_indivs: ./static/formatHelp.html#gd_indivs .. _Dataset missing?: ./static/formatHelp.html @@ -41,17 +61,19 @@ **What it does** -The user specifies that some of the individuals in a gd_snp dataset form a -"population", by supplying a list that has been previously created using the -Specify Individuals tool. The program appends a -new "entity" (set of four columns) to the gd_snp table, analogous to the columns -for an individual but containing summary data for the population as a group. -These four columns give the total counts for the two alleles, the "genotype" for -the population, and the maximum quality value, taken over all individuals in the -population. If all defined genotypes in the population are 2 (agree with the -reference), then the population's genotype is 2, and similarly for 0; otherwise -the genotype is 1 (unless all individuals have undefined genotype, in which case -it is -1). +The user specifies that some of the individuals in a gd_snp or gd_genotype +dataset form a "population", by supplying a list that has been previously +created using the Specify Individuals tool. The program appends a new +"entity" (set of four columns for a gd_snp table, or one column for a +gd_genotype table), analogous to the column(s) for an individual but +containing summary data for the population as a group. For a gd_snp +table, these four columns give the total counts for the two alleles, +the "genotype" for the population, and the maximum quality value, taken +over all individuals in the population. If all defined genotypes in +the population are 2 (agree with the reference), then the population's +genotype is 2, and similarly for 0; otherwise the genotype is 1 (unless +all individuals have undefined genotype, in which case it is -1). +For a gd_genotype file, only the aggregate genotype is appended. ----- diff -r cba0d7a63b82 -r 91e835060ad2 dpmix.xml --- a/dpmix.xml Wed May 29 13:49:19 2013 -0400 +++ b/dpmix.xml Mon Jun 03 12:29:29 2013 -0400 @@ -42,7 +42,7 @@ - + diff -r cba0d7a63b82 -r 91e835060ad2 genome_diversity/src/aggregate.c --- a/genome_diversity/src/aggregate.c Wed May 29 13:49:19 2013 -0400 +++ b/genome_diversity/src/aggregate.c Mon Jun 03 12:29:29 2013 -0400 @@ -2,10 +2,11 @@ * a specified population to a Galaxy SNP table * * argv[1] = file containing a Galaxy table -* argv[2] ... are the starting columns (base-1) for the chosen individuals +* argv[2] = 0 for a gd_genotype file, 1 for a gd_snp file +* argv[3] ... are the starting columns (base-1) for the chosen individuals What it does on Galaxy -The user specifies that some of the individuals in a gd_snp dataset form a "population", by supplying a list that has been previously created using the Specify Individuals tool. The program appends a new "entity" (set of four columns) to the gd_snp table, analogous to the columns for an individual but containing summary data for the population as a group. These four columns give the total counts for the two alleles, the "genotype" for the population, and the maximum quality value, taken over all individuals in the population. If all defined genotypes in the population are 2 (agree with the reference), then the population's genotype is 2, and similarly for 0; otherwise the genotype is 1 (unless all individuals have undefined genotype, in which case it is -1). +The user specifies that some of the individuals in a gd_snp or gd_genotype dataset form a "population", by supplying a list that has been previously created using the Specify Individuals tool. The program appends a new "entity" (set of four columns for a gd_snp table, or one column for a gd_genotype table), analogous to the column(s) for an individual but containing summary data for the population as a group. For a gd_snp table, these four columns give the total counts for the two alleles, the "genotype" for the population, and the maximum quality value, taken over all individuals in the population. If all defined genotypes in the population are 2 (agree with the reference), then the population's genotype is 2, and similarly for 0; otherwise the genotype is 1 (unless all individuals have undefined genotype, in which case it is -1). For a gd_genotype file, only the aggregate genotype is appended. */ #include "lib.h" @@ -20,12 +21,13 @@ int main(int argc, char **argv) { FILE *fp; char *p, *z = "\t\n", buf[MOST], trash[MOST]; - int X[MOST], m, i, A, B, G, Q, g; + int X[MOST], m, i, A, B, G, Q, g, gd_snp; if (argc < 3) - fatalf("args: SNP-table individual1 ..."); + fatalf("args: SNP-table typedef individual1 ..."); - for (i = 2, nI = 0; i < argc; ++i, ++nI) + gd_snp = atoi(argv[2]); + for (i = 3, nI = 0; i < argc; ++i, ++nI) col[nI] = atoi(argv[i]); fp = ckopen(argv[1], "r"); @@ -39,8 +41,11 @@ X[i] = atoi(p); for (i = A = B = Q = 0, G = -1; i < nI; ++i) { m = col[i]; - A += X[m]; - B += X[m+1]; + if (gd_snp) { + A += X[m]; + B += X[m+1]; + Q = MAX(Q, X[m+3]); + } g = X[m+2]; if (g != -1) { if (G == -1) // first time @@ -48,14 +53,16 @@ else if (G != g) G = 1; } - Q = MAX(Q, X[m+3]); } if (i < nI) // check bounds on the population's individuals continue; // add columns if ((p = strchr(buf, '\n')) != NULL) *p = '\0'; - printf("%s\t%d\t%d\t%d\t%d\n", buf, A, B, G, Q); + if (gd_snp) + printf("%s\t%d\t%d\t%d\t%d\n", buf, A, B, G, Q); + else + printf("%s\t%d\n", buf, G); } return 0; diff -r cba0d7a63b82 -r 91e835060ad2 genome_diversity/src/dpmix.c --- a/genome_diversity/src/dpmix.c Wed May 29 13:49:19 2013 -0400 +++ b/genome_diversity/src/dpmix.c Mon Jun 03 12:29:29 2013 -0400 @@ -159,6 +159,13 @@ for (i = 0; i < 4; ++i) from[i] = 0; for (i = nsnp-1, p = last; i >= 0 && p != NULL; --i, p = p->prev) { + if (p->type == 0 && p->g[a] == -1) { // no genotype + // no state change at this SNP + for (state = 0; state < 4; ++state) + B[state][i] = state; + continue; + } + for (state = 0; state < 4; ++state) { // find highest path-score from this event onward for (m = j = 0; j < 4; ++j) { @@ -272,10 +279,12 @@ // should we discard this SNP? if (pos == -1) // SNP not mapped to the reference continue; +/* for (i = 0; i < nG && X[A[i].gcol] >= 0; ++i) ; if (i < nG) // genotype of admixed individual not called continue; +*/ // add SNP to a "backward pointing" linked list, recording the // major allele frequencies in the two reference populations diff -r cba0d7a63b82 -r 91e835060ad2 restore_attributes.xml --- a/restore_attributes.xml Wed May 29 13:49:19 2013 -0400 +++ b/restore_attributes.xml Mon Jun 03 12:29:29 2013 -0400 @@ -1,26 +1,40 @@ - - : Fill in missing properties for a gd_snp dataset + + : Fill in missing properties for a gd_snp or gd_genotype dataset cp.py "$dst" "$output" - - + + + + + + + + + + + + + + + - + **Dataset formats** -All of the input and output datasets are in gd_snp_ format. (`Dataset missing?`_) +All of the input and output datasets are in gd_snp_ or gd_genotype_ format. (`Dataset missing?`_) .. _gd_snp: ./static/formatHelp.html#gd_snp +.. _gd_genotype: ./static/formatHelp.html#gd_genotype .. _Dataset missing?: ./static/formatHelp.html -----