changeset 26:91e835060ad2

Updates to Admixture, Aggregate Individuals, and Restore Attributes to support gd_genotype
author Richard Burhans <burhans@bx.psu.edu>
date Mon, 03 Jun 2013 12:29:29 -0400
parents cba0d7a63b82
children 8997f2ca8c7a
files aggregate_gd_indivs.py aggregate_gd_indivs.xml dpmix.xml genome_diversity/src/aggregate.c genome_diversity/src/dpmix.c restore_attributes.xml
diffstat 6 files changed, 97 insertions(+), 37 deletions(-) [+]
line wrap: on
line diff
--- a/aggregate_gd_indivs.py	Wed May 29 13:49:19 2013 -0400
+++ b/aggregate_gd_indivs.py	Mon Jun 03 12:29:29 2013 -0400
@@ -6,14 +6,12 @@
 
 ################################################################################
 
-if len(sys.argv) < 9:
+if len(sys.argv) < 6:
     print >> sys.stderr, "Usage"
     sys.exit(1)
 
-#input, p1_input, output, lo, hi, lo_ind, lo_ind_qual = sys.argv[1:8]
-#individual_metadata = sys.argv[8:]
-input, p1_input, output,  = sys.argv[1:4]
-individual_metadata = sys.argv[4:]
+input, p1_input, output, input_type  = sys.argv[1:5]
+individual_metadata = sys.argv[5:]
 
 p_total = Population()
 p_total.from_tag_list(individual_metadata)
@@ -33,9 +31,19 @@
 args.append(prog)
 args.append(input)
 
+if input_type == 'gd_snp':
+    args.append('1')
+elif input_type == 'gd_genotype':
+    args.append('0')
+else:
+    print >> sys.stderr, "unknown input type:", input_type
+    sys.exit(1)
+
 columns = p1.column_list()
 
 for column in sorted(columns):
+    if input_type == 'gd_genotype':
+        column = str(int(column) - 2)
     args.append(column)
 
 fh = open(output, 'w')
--- a/aggregate_gd_indivs.xml	Wed May 29 13:49:19 2013 -0400
+++ b/aggregate_gd_indivs.xml	Mon Jun 03 12:29:29 2013 -0400
@@ -1,8 +1,13 @@
-<tool id="gd_sum_gd_snp" name="Aggregate Individuals" version="1.0.0">
+<tool id="gd_sum_gd_snp" name="Aggregate Individuals" version="1.1.0">
   <description>: Append summary columns for a population</description>
 
   <command interpreter="python">
     aggregate_gd_indivs.py "$input" "$p1_input" "$output"
+    #if $input_type.choice == '0'
+      "gd_snp"
+    #else if $input_type.choice == '1'
+      "gd_genotype"
+    #end if
     #for $individual, $individual_col in zip($input.dataset.metadata.individual_names, $input.dataset.metadata.individual_columns)
         #set $arg = '%s:%s' % ($individual_col, $individual)
         "$arg"
@@ -10,12 +15,26 @@
   </command>
 
   <inputs>
-    <param name="input" type="data" format="gd_snp" label="SNP dataset" />
+
+  <conditional name="input_type">
+    <param name="choice" type="select" format="integer" label="Input format">
+      <option value="0" selected="true">gd_snp</option>
+      <option value="1">gd_genotype</option>
+    </param>
+
+    <when value="0">
+      <param name="input" type="data" format="gd_snp" label="SNP dataset" />
+    </when>
+    <when value="1">
+      <param name="input" type="data" format="gd_genotype" label="Genotype dataset" />
+    </when>
+  </conditional>
+
     <param name="p1_input" type="data" format="gd_indivs" label="Population individuals" />
   </inputs>
 
   <outputs>
-    <data name="output" format="gd_snp" metadata_source="input" />
+    <data name="output" format="input" format_source="input" metadata_source="input" />
   </outputs>
 
   <tests>
@@ -30,10 +49,11 @@
 
 **Dataset formats**
 
-The input datasets are in gd_snp_ and gd_indivs_ formats.
-The output dataset is in gd_snp_ format.  (`Dataset missing?`_)
+The input datasets are in gd_snp_, gd_genotype_, and gd_indivs_ formats.
+The output dataset is in gd_snp_ or gd_genotype_ format.  (`Dataset missing?`_)
 
 .. _gd_snp: ./static/formatHelp.html#gd_snp
+.. _gd_genotype: ./static/formatHelp.html#gd_genotype
 .. _gd_indivs: ./static/formatHelp.html#gd_indivs
 .. _Dataset missing?: ./static/formatHelp.html
 
@@ -41,17 +61,19 @@
 
 **What it does**
 
-The user specifies that some of the individuals in a gd_snp dataset form a
-"population", by supplying a list that has been previously created using the
-Specify Individuals tool.  The program appends a
-new "entity" (set of four columns) to the gd_snp table, analogous to the columns
-for an individual but containing summary data for the population as a group.
-These four columns give the total counts for the two alleles, the "genotype" for
-the population, and the maximum quality value, taken over all individuals in the
-population.  If all defined genotypes in the population are 2 (agree with the
-reference), then the population's genotype is 2, and similarly for 0; otherwise
-the genotype is 1 (unless all individuals have undefined genotype, in which case
-it is -1).
+The user specifies that some of the individuals in a gd_snp or gd_genotype
+dataset form a "population", by supplying a list that has been previously
+created using the Specify Individuals tool.  The program appends a new
+"entity" (set of four columns for a gd_snp table, or one column for a
+gd_genotype table), analogous to the column(s) for an individual but
+containing summary data for the population as a group.  For a gd_snp
+table, these four columns give the total counts for the two alleles,
+the "genotype" for the population, and the maximum quality value, taken
+over all individuals in the population.  If all defined genotypes in
+the population are 2 (agree with the reference), then the population's
+genotype is 2, and similarly for 0; otherwise the genotype is 1 (unless
+all individuals have undefined genotype, in which case it is -1).
+For a gd_genotype file, only the aggregate genotype is appended.
 
 -----
 
--- a/dpmix.xml	Wed May 29 13:49:19 2013 -0400
+++ b/dpmix.xml	Mon Jun 03 12:29:29 2013 -0400
@@ -42,7 +42,7 @@
     <param name="ap2_input" type="data" format="gd_indivs" label="Ancestral population 2 individuals" />
     <param name="p_input" type="data" format="gd_indivs" label="Potentially admixed individuals" />
 
-    <param name="switch_penalty" type="float" min="0" value="10" label="Genotype switch penalty" help="Note: typically between 10 and 100."/>
+    <param name="switch_penalty" type="float" min="0" value="10" label="Genotype switch penalty" help="Note: Depends on the density of SNPs.  For instance, with 50,000 SNPs in a vertebrate genome, 1.0 might be appropriate, with millions of SNPs, a value between 10 and 100 might be reasonable."/>
   </inputs>
 
   <outputs>
--- a/genome_diversity/src/aggregate.c	Wed May 29 13:49:19 2013 -0400
+++ b/genome_diversity/src/aggregate.c	Mon Jun 03 12:29:29 2013 -0400
@@ -2,10 +2,11 @@
 *  a specified population to a Galaxy SNP table
 *
 *  argv[1] = file containing a Galaxy table
-*  argv[2] ... are the starting columns (base-1) for the chosen individuals
+*  argv[2] = 0 for a gd_genotype file, 1 for a gd_snp file
+*  argv[3] ... are the starting columns (base-1) for the chosen individuals
 
 What it does on Galaxy
-The user specifies that some of the individuals in a gd_snp dataset form a "population", by supplying a list that has been previously created using the Specify Individuals tool. The program appends a new "entity" (set of four columns) to the gd_snp table, analogous to the columns for an individual but containing summary data for the population as a group. These four columns give the total counts for the two alleles, the "genotype" for the population, and the maximum quality value, taken over all individuals in the population. If all defined genotypes in the population are 2 (agree with the reference), then the population's genotype is 2, and similarly for 0; otherwise the genotype is 1 (unless all individuals have undefined genotype, in which case it is -1).
+The user specifies that some of the individuals in a gd_snp or gd_genotype dataset form a "population", by supplying a list that has been previously created using the Specify Individuals tool. The program appends a new "entity" (set of four columns for a gd_snp table, or one column for a gd_genotype table), analogous to the column(s) for an individual but containing summary data for the population as a group. For a gd_snp table, these four columns give the total counts for the two alleles, the "genotype" for the population, and the maximum quality value, taken over all individuals in the population. If all defined genotypes in the population are 2 (agree with the reference), then the population's genotype is 2, and similarly for 0; otherwise the genotype is 1 (unless all individuals have undefined genotype, in which case it is -1). For a gd_genotype file, only the aggregate genotype is appended.
 */
 
 #include "lib.h"
@@ -20,12 +21,13 @@
 int main(int argc, char **argv) {
 	FILE *fp;
 	char *p, *z = "\t\n", buf[MOST], trash[MOST];
-	int X[MOST], m, i, A, B, G, Q, g;
+	int X[MOST], m, i, A, B, G, Q, g, gd_snp;
 
 	if (argc < 3)
-		fatalf("args: SNP-table individual1 ...");
+		fatalf("args: SNP-table typedef individual1 ...");
 
-	for (i = 2, nI = 0; i < argc; ++i, ++nI)
+	gd_snp = atoi(argv[2]);
+	for (i = 3, nI = 0; i < argc; ++i, ++nI)
 		col[nI] = atoi(argv[i]);
 
 	fp = ckopen(argv[1], "r");
@@ -39,8 +41,11 @@
 			X[i] = atoi(p);
 		for (i = A = B = Q = 0, G = -1; i < nI; ++i) {
 			m = col[i];
-			A += X[m];
-			B += X[m+1];
+			if (gd_snp) {
+				A += X[m];
+				B += X[m+1];
+				Q = MAX(Q, X[m+3]);
+			}
 			g = X[m+2];
 			if (g != -1) {
 				if (G == -1)	// first time
@@ -48,14 +53,16 @@
 				else if (G != g)
 					G = 1;
 			}
-			Q = MAX(Q, X[m+3]);
 		}
 		if (i < nI)	// check bounds on the population's individuals
 			continue;
 		// add columns
 		if ((p = strchr(buf, '\n')) != NULL)
 			*p = '\0';
-		printf("%s\t%d\t%d\t%d\t%d\n", buf, A, B, G, Q);
+		if (gd_snp)
+			printf("%s\t%d\t%d\t%d\t%d\n", buf, A, B, G, Q);
+		else
+			printf("%s\t%d\n", buf, G);
 	}
 
 	return 0;
--- a/genome_diversity/src/dpmix.c	Wed May 29 13:49:19 2013 -0400
+++ b/genome_diversity/src/dpmix.c	Mon Jun 03 12:29:29 2013 -0400
@@ -159,6 +159,13 @@
 	for (i = 0; i < 4; ++i)
 		from[i] = 0;
 	for (i = nsnp-1, p = last; i >= 0 && p != NULL; --i, p = p->prev) {
+		if (p->type == 0 && p->g[a] == -1) { // no genotype
+			// no state change at this SNP
+			for (state = 0; state < 4; ++state)
+				B[state][i] = state;
+			continue;
+		}
+		
 		for (state = 0; state < 4; ++state) {
 			// find highest path-score from this event onward
 			for (m = j = 0; j < 4; ++j) {
@@ -272,10 +279,12 @@
 		// should we discard this SNP?
 		if (pos == -1)	// SNP not mapped to the reference
 			continue;
+/*
 		for (i = 0; i < nG && X[A[i].gcol] >= 0; ++i)
 			;
 		if (i < nG)	// genotype of admixed individual not called
 			continue;
+*/
 
 		// add SNP to a "backward pointing" linked list, recording the
 		// major allele frequencies in the two reference populations
--- a/restore_attributes.xml	Wed May 29 13:49:19 2013 -0400
+++ b/restore_attributes.xml	Mon Jun 03 12:29:29 2013 -0400
@@ -1,26 +1,40 @@
-<tool id="gd_restore_attributes" name="Restore Attributes" version="1.0.0">
-  <description>: Fill in missing properties for a gd_snp dataset</description>
+<tool id="gd_restore_attributes" name="Restore Attributes" version="1.1.0">
+  <description>: Fill in missing properties for a gd_snp or gd_genotype dataset</description>
 
   <command interpreter="python">
     cp.py "$dst" "$output"
   </command>
 
   <inputs>
-    <param name="src" type="data" format="gd_snp" label="SNP dataset to copy attributes from" />
-    <param name="dst" type="data" format="gd_snp" label="SNP dataset to receive attributes" />
+    <conditional name="input_type">
+      <param name="choice" type="select" format="integer" label="Input format">
+        <option value="0" selected="true">gd_snp</option>
+        <option value="1">gd_genotype</option>
+      </param>
+
+      <when value="0">
+        <param name="input" type="data" format="gd_snp" label="SNP dataset to copy attributes from" />
+        <param name="dst" type="data" format="gd_snp" label="SNP dataset to receive attributes" />
+      </when>
+      <when value="1">
+        <param name="input" type="data" format="gd_genotype" label="Genotype dataset to copy attributes from" />
+        <param name="dst" type="data" format="gd_genotype" label="Genotype dataset to receive attributes" />
+      </when>
+    </conditional>
   </inputs>
 
   <outputs>
-    <data name="output" format="gd_snp" metadata_source="src" />
+    <data name="output" format="input" format_source="input" metadata_source="input" />
   </outputs>
 
   <help>
 
 **Dataset formats**
 
-All of the input and output datasets are in gd_snp_ format.  (`Dataset missing?`_)
+All of the input and output datasets are in gd_snp_ or gd_genotype_ format.  (`Dataset missing?`_)
 
 .. _gd_snp: ./static/formatHelp.html#gd_snp
+.. _gd_genotype: ./static/formatHelp.html#gd_genotype
 .. _Dataset missing?: ./static/formatHelp.html
 
 -----