annotate tools/rgenetics/rgClean.xml @ 1:cdcb0ce84a1b

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:45:15 -0500
parents 9071e359b9a3
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1 <tool id="rgClean1" name="Clean genotypes:">
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
2 <description>filter markers, subjects</description>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
3
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
4 <command interpreter="python">
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
5 rgClean.py '$input_file.extra_files_path' '$input_file.metadata.base_name' '$title' '$mind'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
6 '$geno' '$hwe' '$maf' '$mef' '$mei' '$out_file1' '$out_file1.files_path'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
7 '$relfilter' '$afffilter' '$sexfilter' '$fixaff'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
8 </command>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
9
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
10 <inputs>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
11 <param name="input_file" type="data" label="RGenetics genotype library file in compressed Plink format"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
12 size="120" format="pbed" />
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
13 <param name="title" type="text" size="80" label="Descriptive title for cleaned genotype file" value="Cleaned_data"/>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
14 <param name="geno" type="text" label="Maximum Missing Fraction: Markers" value="0.05" />
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
15 <param name="mind" type="text" value="0.1" label="Maximum Missing Fraction: Subjects"/>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
16 <param name="mef" type="text" label="Maximum Mendel Error Rate: Family" value="0.05"/>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
17 <param name="mei" type="text" label="Maximum Mendel Error Rate: Marker" value="0.05"/>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
18 <param name="hwe" type="text" value="0" label="Smallest HWE p value (set to 0 for all)" />
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
19 <param name="maf" type="text" value="0.01"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
20 label="Smallest Minor Allele Frequency (set to 0 for all)"/>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
21 <param name='relfilter' label = "Filter on pedigree relatedness" type="select"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
22 optional="false" size="132"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
23 help="Optionally remove related subjects if pedigree identifies founders and their offspring">
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
24 <option value="all" selected='true'>No filter on relatedness</option>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
25 <option value="fo" >Keep Founders only (pedigree m/f ID = "0")</option>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
26 <option value="oo" >Keep Offspring only (one randomly chosen if >1 sibs in family)</option>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
27 </param>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
28 <param name='afffilter' label = "Filter on affection status" type="select"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
29 optional="false" size="132"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
30 help="Optionally remove affected or non affected subjects">
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
31 <option value="allaff" selected='true'>No filter on affection status</option>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
32 <option value="affonly" >Keep Controls only (affection='1')</option>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
33 <option value="unaffonly" >Keep Cases only (affection='2')</option>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
34 </param>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
35 <param name='sexfilter' label = "Filter on gender" type="select"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
36 optional="false" size="132"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
37 help="Optionally remove all male or all female subjects">
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
38 <option value="allsex" selected='true'>No filter on gender status</option>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
39 <option value="msex" >Keep Males only (pedigree gender='1')</option>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
40 <option value="fsex" >Keep Females only (pedigree gender='2')</option>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
41 </param>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
42 <param name="fixaff" type="text" value="0"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
43 label = "Change ALL subjects affection status to (0=no change,1=unaff,2=aff)"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
44 help="Use this option to switch the affection status to a new value for all output subjects" />
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
45 </inputs>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
46
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
47 <outputs>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
48 <data format="pbed" name="out_file1" metadata_source="input_file" label="${title}_rgClean.pbed" />
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
49 </outputs>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
50
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
51 <tests>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
52 <test>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
53 <param name='input_file' value='tinywga' ftype='pbed' >
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
54 <metadata name='base_name' value='tinywga' />
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
55 <composite_data value='tinywga.bim' />
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
56 <composite_data value='tinywga.bed' />
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
57 <composite_data value='tinywga.fam' />
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
58 <edit_attributes type='name' value='tinywga' />
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
59 </param>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
60 <param name='title' value='rgCleantest1' />
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
61 <param name="geno" value="1" />
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
62 <param name="mind" value="1" />
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
63 <param name="mef" value="0" />
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
64 <param name="mei" value="0" />
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
65 <param name="hwe" value="0" />
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
66 <param name="maf" value="0" />
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
67 <param name="relfilter" value="all" />
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
68 <param name="afffilter" value="allaff" />
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
69 <param name="sexfilter" value="allsex" />
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
70 <param name="fixaff" value="0" />
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
71 <output name='out_file1' file='rgtestouts/rgClean/rgCleantest1.pbed' compare="diff" lines_diff="25" >
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
72 <extra_files type="file" name='rgCleantest1.bim' value="rgtestouts/rgClean/rgCleantest1.bim" compare="diff" />
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
73 <extra_files type="file" name='rgCleantest1.fam' value="rgtestouts/rgClean/rgCleantest1.fam" compare="diff" />
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
74 <extra_files type="file" name='rgCleantest1.bed' value="rgtestouts/rgClean/rgCleantest1.bed" compare="diff" />
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
75 </output>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
76 </test>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
77 </tests>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
78 <help>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
79
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
80 .. class:: infomark
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
81
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
82 **Syntax**
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
83
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
84 - **Genotype data** is the input genotype file chosen from your current history
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
85 - **Descriptive title** is the name to use for the filtered output file
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
86 - **Missfrac threshold: subjects** is the threshold for missingness by subject. Subjects with more than this fraction missing will be excluded from the import
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
87 - **Missfrac threshold: markers** is the threshold for missingness by marker. Markers with more than this fraction missing will be excluded from the import
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
88 - **MaxMendel Markers** is the Mendel error rate threshold by marker. Markers with more than this fraction of Mendelian errors in transmission will be excluded (for family data only)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
89 - **MaxMendel Families** is the Mendel error rate threshold by family. Families with more than this fraction of Mendelian errors in transmission will be excluded (for family data only)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
90 - **HWE** is the threshold for HWE test p values below which the marker will not be imported. Set this to 0 and all markers will be imported regardless of HWE p value
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
91 - **MAF** is the threshold for minor allele frequency - SNPs with lower MAF will be excluded
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
92 - **Filters** for founders/offspring or affected/unaffected or males/females are optionally available if needed
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
93 - **Change Affection** is only needed if you want to change the affection status for creating new analysis datasets
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
94
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
95 -----
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
96
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
97 **Attribution**
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
98
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
99 This tool relies on the work of many people. It uses Plink http://pngu.mgh.harvard.edu/~purcell/plink/,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
100 and the R http://cran.r-project.org/ and
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
101 Bioconductor http://www.bioconductor.org/ projects.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
102
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
103
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
104 In particular, http://pngu.mgh.harvard.edu/~purcell/plink/
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
105 has excellent documentation describing the parameters you can set here.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
106
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
107 This implementation is a Galaxy tool wrapper around these third party applications.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
108 It was originally designed and written for family based data from the CAMP Illumina run of 2007 by
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
109 ross lazarus (ross.lazarus@gmail.com) and incorporated into the rgenetics toolkit.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
110
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
111 Rgenetics merely exposes those Plink parameters, wrapping Plink so you can use it in Galaxy.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
112
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
113 -----
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
114
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
115 **Summary**
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
116
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
117 Reliable statistical inference depends on reliable data. Poor quality samples and markers
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
118 may add more noise than signal, decreasing statistical power. Removing the worst of them
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
119 can be done by setting thresholds for some of the commonly used technical quality measures
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
120 for genotype data. Of course discordant replicate calls are also very informative but are not
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
121 in scope here.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
122
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
123 Marker cleaning: Filters are available to remove markers below a specific minor allele
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
124 frequency, beyond a Hardy-Weinberg threshold,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
125 or above a threshold for missingness. If family data are available, thresholds for Mendelian
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
126 error can be set.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
127
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
128 Subject cleaning: Filters are available to remove subjects with many missing calls. Subjects and markers for family data can be filtered by proportions
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
129 of Mendelian errors in observed transmission. Use the QC reporting tool to
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
130 generate a comprehensive series of reports for quality control.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
131
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
132 Note that ancestry and cryptic relatedness should also be checked using the relevant tools.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
133
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
134 -----
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
135
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
136 .. class:: infomark
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
137
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
138 **Tip**
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
139
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
140 You can check that you got what you asked for by running the QC tool to ensure that the distributions
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
141 are truncated the way you expect. Note that you do not expect that the thresholds will be exactly
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
142 what you set - some bad assays and subjects fail multiple QC measures, so you sometimes have
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
143 more samples or markers than you exactly set for each threshold. Finally, the ordering of
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
144 operations matters and Plink is somewhat restrictive about what it will do on each pass
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
145 of the data. At least it's fixed.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
146
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
147 -----
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
148
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
149 This Galaxy tool was written by Ross Lazarus for the Rgenetics project.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
150 It uses Plink for most calculations - for full Plink attribution, source code and documentation,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
151 please see http://pngu.mgh.harvard.edu/~purcell/plink/ - plus some custom Python code
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
152
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
153 </help>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
154 </tool>