Mercurial > repos > miller-lab > genome_diversity
annotate filter_gd_snp.xml @ 31:a631c2f6d913
Update to Miller Lab devshed revision 3c4110ffacc3
author | Richard Burhans <burhans@bx.psu.edu> |
---|---|
date | Fri, 20 Sep 2013 13:25:27 -0400 |
parents | 8997f2ca8c7a |
children |
rev | line source |
---|---|
27
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
1 <tool id="gd_filter_gd_snp" name="Filter SNPs" version="1.2.0"> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
2 <description>: Discard some SNPs based on coverage, quality or spacing</description> |
13 | 3 |
4 <command interpreter="python"> | |
27
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
5 #import json |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
6 #import base64 |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
7 #import zlib |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
8 #set $ind_names = $input.dataset.metadata.individual_names |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
9 #set $ind_colms = $input.dataset.metadata.individual_columns |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
10 #set $ind_dict = dict(zip($ind_names, $ind_colms)) |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
11 #set $ind_json = json.dumps($ind_dict, separators=(',',':')) |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
12 #set $ind_comp = zlib.compress($ind_json, 9) |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
13 #set $ind_arg = base64.b64encode($ind_comp) |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
14 filter_gd_snp.py '$input' '$output' |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
15 #if str($input.dataset.metadata.dbkey) == '?' |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
16 '0' |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
17 #else |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
18 '$input.dataset.metadata.ref' |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
19 #end if |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
20 '$min_spacing' '$lo_genotypes' '$input_type.p1_input' |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
21 #if $input_type.choice == '0' |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
22 'gd_snp' '$input_type.lo_coverage' '$input_type.hi_coverage' '$input_type.low_ind_cov' '$input_type.lo_quality' |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
23 #else if $input_type.choice == '1' |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
24 'gd_genotype' '0' '0' '0' '0' |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
25 #end if |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
26 '$ind_arg' |
13 | 27 </command> |
28 | |
29 <inputs> | |
27
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
30 <conditional name="input_type"> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
31 <param name="choice" type="select" format="integer" label="Input format"> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
32 <option value="0" selected="true">gd_snp</option> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
33 <option value="1">gd_genotype</option> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
34 </param> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
35 <when value="0"> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
36 <param name="input" type="data" format="gd_snp" label="SNP dataset" /> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
37 <param name="p1_input" type="data" format="gd_indivs" label="Population individuals" /> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
38 <param name="lo_coverage" type="text" value="0" label="Lower bound on total coverage"> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
39 <sanitizer> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
40 <valid initial="string.digits"> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
41 <!-- allow the percent (%) character so the lower bound can be entered as a percentile, e.g. "5%" --> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
42 <add value="%" /> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
43 </valid> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
44 </sanitizer> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
45 </param> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
46 <param name="hi_coverage" type="text" value="1000" label="Upper bound on total coverage"> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
47 <sanitizer> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
48 <valid initial="string.digits"> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
49 <!-- allow the percent (%) character so the upper bound can be entered as a percentile, e.g. "80%" --> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
50 <add value="%" /> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
51 </valid> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
52 </sanitizer> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
53 </param> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
54 <param name="low_ind_cov" type="integer" min="0" value="0" label="Lower bound on individual coverage" /> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
55 <param name="lo_quality" type="integer" min="0" value="0" label="Lower bound on individual quality values" /> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
56 </when> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
57 <when value="1"> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
58 <param name="input" type="data" format="gd_genotype" label="Genotype dataset" /> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
59 <param name="p1_input" type="data" format="gd_indivs" label="Population individuals" /> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
60 </when> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
61 </conditional> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
62 <param name="min_spacing" type="integer" min="0" value="0" label="Minimum spacing between SNPs" /> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
63 <param name="lo_genotypes" type="integer" min="0" value="0" label="Lower bound on the number of defined genotypes" /> |
13 | 64 </inputs> |
65 | |
66 <outputs> | |
27
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
67 <data name="output" format="input" format_source="input" metadata_source="input" /> |
13 | 68 </outputs> |
69 | |
31
a631c2f6d913
Update to Miller Lab devshed revision 3c4110ffacc3
Richard Burhans <burhans@bx.psu.edu>
parents:
27
diff
changeset
|
70 <requirements> |
a631c2f6d913
Update to Miller Lab devshed revision 3c4110ffacc3
Richard Burhans <burhans@bx.psu.edu>
parents:
27
diff
changeset
|
71 <requirement type="package" version="0.1">gd_c_tools</requirement> |
a631c2f6d913
Update to Miller Lab devshed revision 3c4110ffacc3
Richard Burhans <burhans@bx.psu.edu>
parents:
27
diff
changeset
|
72 </requirements> |
a631c2f6d913
Update to Miller Lab devshed revision 3c4110ffacc3
Richard Burhans <burhans@bx.psu.edu>
parents:
27
diff
changeset
|
73 |
13 | 74 <tests> |
75 <test> | |
76 <param name="input" value="test_in/sample.gd_snp" ftype="gd_snp" /> | |
77 <param name="p1_input" value="test_in/a.gd_indivs" ftype="gd_indivs" /> | |
78 <param name="lo_coverage" value="0" /> | |
79 <param name="hi_coverage" value="1000" /> | |
80 <param name="low_ind_cov" value="3" /> | |
81 <param name="lo_quality" value="30" /> | |
82 <output name="output" file="test_out/modify_snp_table/modify.gd_snp" /> | |
83 </test> | |
84 </tests> | |
85 | |
86 <help> | |
87 | |
88 **Dataset formats** | |
89 | |
27
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
90 The input datasets are in gd_snp_, gd_genotype_, and gd_indivs_ formats. |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
91 The output dataset is in gd_snp_ or gd_genotype_ format. (`Dataset missing?`_) |
13 | 92 |
93 .. _gd_snp: ./static/formatHelp.html#gd_snp | |
27
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
94 .. _gd_genotype: ./static/formatHelp.html#gd_genotype |
13 | 95 .. _gd_indivs: ./static/formatHelp.html#gd_indivs |
96 .. _Dataset missing?: ./static/formatHelp.html | |
97 | |
98 ----- | |
99 | |
100 **What it does** | |
101 | |
27
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
102 For a gd_snp dataset, the user specifies that some of the individuals |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
103 form a "population", by supplying a list that has been previously created |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
104 using the Specify Individuals tool. SNPs are then discarded if their |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
105 total coverage for the population is too low or too high, or if their |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
106 coverage or quality score for any individual in the population is too low. |
13 | 107 |
22
95a05c1ef5d5
update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
18
diff
changeset
|
108 The upper and lower bounds on total population coverage can be specified |
27
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
109 either as read counts or as percentiles (e.g. "5%", with no decimal |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
110 places). For percentile bounds the SNPs are ranked by read count, so |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
111 for example, a lower bound of "10%" means that the least-covered 10% |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
112 of the SNPs will be discarded, while an upper bound of, say, "80%" will |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
113 discard all SNPs above the 80% mark, i.e. the top 20%. The threshold |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
114 for the lower bound on individual coverage can only be specified as a |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
115 plain read count. |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
116 |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
117 For either a gd_snp or gd_genotype dataset, the user can specify a |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
118 minimum number of defined genotypes (i.e., not -1) and/or a minimum |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
119 spacing relative to the reference sequence. An error is reported if the |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
120 user requests a minimum spacing but no reference sequence is available. |
22
95a05c1ef5d5
update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
18
diff
changeset
|
121 |
13 | 122 ----- |
123 | |
124 **Example** | |
125 | |
126 - input gd_snp:: | |
127 | |
128 Contig161_chr1_4641264_4641879 115 C T 73.5 chr1 4641382 C 6 0 2 45 8 0 2 51 15 0 2 72 5 0 2 42 6 0 2 45 10 0 2 57 Y 54 0.323 0 | |
129 Contig48_chr1_10150253_10151311 11 A G 94.3 chr1 10150264 A 1 0 2 30 1 0 2 30 1 0 2 30 3 0 2 36 1 0 2 30 1 0 2 30 Y 22 +99. 0 | |
130 Contig20_chr1_21313469_21313570 66 C T 54.0 chr1 21313534 C 4 0 2 39 4 0 2 39 5 0 2 42 4 0 2 39 4 0 2 39 5 0 2 42 N 1 +99. 0 | |
131 etc. | |
132 | |
133 - input individuals:: | |
134 | |
135 9 PB1 | |
136 13 PB2 | |
137 17 PB3 | |
138 | |
139 - output when the lower bound on individual coverage is "3":: | |
140 | |
141 Contig161_chr1_4641264_4641879 115 C T 73.5 chr1 4641382 C 6 0 2 45 8 0 2 51 15 0 2 72 5 0 2 42 6 0 2 45 10 0 2 57 Y 54 0.323 0 | |
142 Contig20_chr1_21313469_21313570 66 C T 54.0 chr1 21313534 C 4 0 2 39 4 0 2 39 5 0 2 42 4 0 2 39 4 0 2 39 5 0 2 42 N 1 +99. 0 | |
143 etc. | |
144 | |
145 </help> | |
146 </tool> |