3
|
1 <tool id="sniplay_vcftoolsfilter" name="VCFtools Filter" version="1.1.2">
|
2
|
2
|
|
<!-- [REQUIRED] Tool description displayed after the tool name -->
<!-- Short summary of what the tool does; same wording as the Description section of the help. -->
<description>Filter VCF file</description>
|
|
5
|
|
<!-- [OPTIONAL] External programs and packages that must be available for the tool to run -->
<requirements>
    <!-- Perl interpreter, declared as a plain binary -->
    <requirement type="binary">perl</requirement>
    <!-- VCFtools, pinned to release 0.1.12b -->
    <requirement version="0.1.12b" type="package">vcftools</requirement>
</requirements>
|
|
11
|
3
|
12
|
|
<!-- [STRONGLY RECOMMENDED] Rules used to decide from the exit code whether the job failed -->
<stdio>
    <!-- [HELP] Without any exit code rule, the tool stops as soon as something is written to STDERR -->
    <!-- Every exit code from 1 upwards is reported as a fatal error -->
    <exit_code level="fatal" range="1:" />
</stdio>
|
|
18
|
2
|
<!-- [REQUIRED] The command to execute -->
<!--
    Runs vcfToolsFilter.sh with positional arguments, in this order:
    input VCF, main output, log file, output format, minimum MAF, maximum MAF,
    allowed missing proportion, minimum and maximum allele count, polymorphism
    type, lower bound, upper bound, sample list, chromosome list, plink map output.
    The wrapper is a .sh file, so it is started with bash instead of perl.
    NOTE(review): confirm the script does not depend on being launched through perl.
    Dataset paths and free-text values are single-quoted so the shell passes each
    of them as exactly one argument.
-->
<command interpreter="bash">
    vcfToolsFilter.sh '$filein' '$fileout' '$filelog' $export $frequency $max_freq $allow_missing $nb_alleles_min $nb_alleles_max $type_p $bound_start $bound_end
    ## An empty sample list is passed as the literal placeholder None
    #if str( $samples ) == "":
        'None'
    #else
        '$samples'
    #end if
    ## An empty chromosome list is passed as the literal placeholder None
    #if str( $chromosomes ) == "":
        'None'
    #else
        '$chromosomes'
    #end if
    ## The map output only exists for the plink format; otherwise an empty argument keeps the position filled
    #if str( $export ) == "plink":
        '$fileout_map'
    #else
        ''
    #end if
</command>
|
|
38
|
|
<!-- [REQUIRED] Input files and tool parameters -->
<inputs>
    <!-- VCF dataset to filter -->
    <param name="filein" type="data" format="vcf" optional="false" label="VCF input" />
    <!-- Basename used to build the labels of the output datasets -->
    <param name="fileout_label" type="text" value="filtered" optional="false" label="Output file basename"/>
    <!-- Optional comma separated lists; the regex accepts word characters separated by single commas -->
    <param name="samples" type="text" optional="true" label="Samples" help="Samples to be analyzed. Comma separated list">
        <validator type="regex" message="Please enter a comma separated list.">^\w+(,\w+)*$</validator>
    </param>
    <param name="chromosomes" type="text" optional="true" label="Chromosomes" help="Chromosomes to be analyzed. Comma separated list">
        <validator type="regex" message="Please enter a comma separated list.">^\w+(,\w+)*$</validator>
    </param>
    <!-- Output format; it also drives the datatype and label of the main output and the presence of the map output -->
    <param name="export" type="select" label="Output format" >
        <option value="VCF" selected="true">VCF</option>
        <option value="freq">freq</option>
        <option value="plink">plink</option>
    </param>
    <!-- Allele frequency thresholds; both are frequencies and are therefore restricted to the 0-1 range -->
    <param name="frequency" type="float" value="0.001" min="0" max="1" label="Minimum MAF." help="Minimum frequency." />
    <param name="max_freq" type="float" value="0.5" min="0" max="1" label="Maximum MAF." help="Maximum frequency." />
    <param name="allow_missing" type="float" value="1" min="0" max="1" label="Missing data proportion" help="Allowed missing data proportion per site. Must be comprised between 0 and 1." />
    <!-- Accepted allele count range, each bound limited to 2-4 -->
    <param name="nb_alleles_min" type="integer" value="2" label="Minimum number of alleles" help="Minimum accepted number of alleles." min="2" max="4" />
    <param name="nb_alleles_max" type="integer" value="4" label="Maximum number of alleles" help="Maximum accepted number of alleles." min="2" max="4" />
    <!-- Kind of polymorphisms to keep -->
    <param name="type_p" type="select" label="Polymorphisms" help="Type of polymorphisms to keep." >
        <option value="ALL" selected="true">All</option>
        <option value="SNP">SNP</option>
        <option value="INDEL">Indel</option>
    </param>
    <!-- Range of sites to be processed -->
    <param name="bound_start" type="integer" value="1" label="Lower bound" help="Lower bound for a range of sites to be processed." />
    <param name="bound_end" type="integer" value="100000000" label="Upper bound" help="Upper bound for a range of sites to be processed." />
</inputs>
|
|
67
|
|
<!-- [REQUIRED] Output files -->
<outputs>
    <!--
        Main result. The label is the chosen basename followed by an extension that
        depends on the output format: ped for plink, frq for freq, vcf for VCF.
        The extension comes from one single expression, so that no blank characters
        are inserted between the dot and the extension or after it.
        The datatype defaults to vcf and is switched to tabular (freq) or txt (plink).
    -->
    <data name="fileout" format="vcf" label="${fileout_label}.${'ped' if str($export) == 'plink' else 'frq' if str($export) == 'freq' else 'vcf' if str($export) == 'VCF' else ''}" >
        <change_format>
            <when input="export" value="freq" format="tabular" />
            <when input="export" value="plink" format="txt" />
        </change_format>
    </data>
    <!-- Map file, only created when the plink output format is selected -->
    <data name="fileout_map" format="tabular" label="${fileout_label}.map">
        <filter>(export == 'plink')</filter>
    </data>
    <!-- Log file, always created -->
    <data name="filelog" format="txt" label="${fileout_label}.log" />
</outputs>
|
|
81
|
|
82
|
|
<!-- [OPTIONAL] Functional tests, to be launched manually by the Galaxy admin -->
<tests>
    <!-- [HELP] Every file referenced below must be present in the ~/test-data directory -->

    <!-- Case 1: VCF output restricted to chr1, every parameter given explicitly -->
    <test>
        <param name="filein" value="sample.vcf" />
        <param name="chromosomes" value="chr1" />
        <param name="export" value="VCF" />
        <param name="frequency" value="0.001" />
        <param name="max_freq" value="0.5" />
        <param name="allow_missing" value="1" />
        <param name="nb_alleles_min" value="2" />
        <param name="nb_alleles_max" value="4" />
        <param name="type_p" value="ALL" />
        <param name="bound_start" value="1" />
        <param name="bound_end" value="100000000" />
        <output name="fileout" file="filter-result.vcf" />
    </test>

    <!-- Case 2: plink output for SNPs only; both the ped and the map datasets are compared -->
    <test>
        <param name="filein" value="sample.vcf" />
        <param name="export" value="plink" />
        <param name="type_p" value="SNP" />
        <output name="fileout" file="filter-result.ped" />
        <output name="fileout_map" file="filter-result.map" />
    </test>

    <!-- Case 3: freq output for all polymorphism types -->
    <test>
        <param name="filein" value="sample.vcf" />
        <param name="export" value="freq" />
        <param name="type_p" value="ALL" />
        <output name="fileout" file="filter-result.frq" />
    </test>
</tests>
|
|
114
|
|
115 <!-- [OPTIONAL] Help displayed in Galaxy -->
|
|
116 <help>
|
|
117
|
|
118 .. class:: infomark
|
|
119
|
|
120 **Authors** Adam Auton, Petr Danecek and Anthony Marcketta (C++ Module) : VCFtools_
|
|
121
|
|
122 .. _VCFtools: http://vcftools.sourceforge.net
|
|
123
|
|
124 | **Please cite** "The Variant Call Format and VCFtools", Petr Danecek, Adam Auton, Goncalo Abecasis, Cornelis A. Albers, Eric Banks, Mark A. DePristo, Robert Handsaker, Gerton Lunter, Gabor Marth, Stephen T. Sherry, Gilean McVean, Richard Durbin and 1000 Genomes Project Analysis Group, **Bioinformatics**, 2011
|
|
125
|
|
126 .. class:: infomark
|
|
127
|
|
128 **Galaxy integration** Andres Gwendoline, Institut Français de Bioinformatique.
|
|
129
|
|
130 .. class:: infomark
|
|
131
|
|
132 **Support** For any questions about Galaxy integration, please send an e-mail to support.abims@sb-roscoff.fr
|
|
133
|
|
134 ---------------------------------------------------
|
|
135
|
|
136
|
|
137
|
|
138 ================
|
|
139 VCF tools filter
|
|
140 ================
|
|
141
|
|
142 -----------
|
|
143 Description
|
|
144 -----------
|
|
145
|
|
146 | Filter VCF file
|
|
| For further information on VCFtools, please visit the VCFtools website_.
|
|
148
|
|
149 .. _website: http://vcftools.sourceforge.net
|
|
150
|
|
151 -----------------
|
|
152 Workflow position
|
|
153 -----------------
|
|
154
|
|
155 **Upstream tools**
|
|
156
|
|
157 =========== ========================== =======
|
|
158 Name output file(s) format
|
|
159 =========== ========================== =======
|
|
160 =========== ========================== =======
|
|
161
|
|
162
|
|
163 **Downstream tools**
|
|
164
|
|
165 =========== ========================== =======
|
|
166 Name output file(s) format
|
|
167 =========== ========================== =======
|
|
168 =========== ========================== =======
|
|
169
|
|
170
|
|
171 ----------
|
|
172 Input file
|
|
173 ----------
|
|
174
|
|
175 VCF file
|
|
176 VCF file with all SNPs
|
|
177
|
|
178 ----------
|
|
179 Parameters
|
|
180 ----------
|
|
181
|
|
182 Output file basename
|
|
183 Prefix for the output VCF file
|
|
184
|
|
185 Samples
|
|
186 Samples to be analyzed. Comma separated list
|
|
187
|
|
188 Chromosomes
|
|
189 Chromosomes to be analyzed. Comma separated list
|
|
190
|
|
191 Output format
|
|
192 VCF/freq/plink
|
|
193
|
|
194 Minimum MAF
|
|
195 Minimum frequency
|
|
196
|
|
197 Maximum MAF
|
|
198 Maximum frequency
|
|
199
|
|
200 Missing data proportion
|
|
201 Allowed missing data proportion per site. Must be comprised between 0 and 1.
|
|
202
|
|
203 Number of alleles
|
|
204 Accepted number of alleles min and max.
|
|
205
|
|
206 Polymorphisms
|
|
207 Type of polymorphisms to keep (ALL/SNP/INDEL).
|
|
208 Bounds
|
|
209 Lower bound and upper bound for a range of sites to be processed.
|
|
210
|
|
211 ------------
|
|
212 Output files
|
|
213 ------------
|
|
214
|
|
215 VCF file
|
|
216 VCF file filtered
|
|
217
|
|
218 Log file
|
|
219
|
|
220 ---------------------------------------------------
|
|
221
|
|
222 ---------------
|
|
223 Working example
|
|
224 ---------------
|
|
225
|
|
226 Input files
|
|
227 ===========
|
|
228
|
|
229 VCF file
|
|
230 ---------
|
|
231
|
|
232 ::
|
|
233
|
|
234 #fileformat=VCFv4.1
|
|
235 #FILTER=<ID=LowQual,Description="Low quality">
|
|
236 #FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
|
|
237 [...]
|
|
238 CHROM POS ID REF ALT QUAL FILTER INFO FORMAT CATB1
|
|
239 chr1 2209 . G T 213.84 . AC=2;AF=1.00;AN=2;DP=7;Dels=0.00;FS=0.000;HaplotypeScore=0.0000;MLEAC=2;MLEAF=1.00;MQ=41.50;MQ0=0;QD=30.55;EFF=DOWNSTREAM(MODIFIER||||Cc01g00020|mRNA||GSCOCT00012438001|),UPSTREAM(MODIFIER||||Cc01g00010|mRNA||GSCOCT00012439001|) GT:AD:DP:GQ:PL 1/1:0,7:7:18:242,18,0
|
|
240
|
|
241
|
|
242 Parameters
|
|
243 ==========
|
|
244
|
|
245 Output name -> filtered_chr1
|
|
246
|
|
247 Chromosomes -> chr1
|
|
248
|
|
249 Output format -> VCF
|
|
250
|
|
251 Minimum MAF -> 0.001
|
|
252
|
|
253 Maximum MAF -> 0.5
|
|
254
|
|
255 Missing data proportion -> 1
|
|
256
|
|
257 Number of alleles min -> 2
|
|
258
|
|
259 Number of alleles max -> 4
|
|
260
|
|
261 Polymorphisms -> All
|
|
262
|
|
263 Lower bound -> 1
|
|
264
|
|
265 Upper bound -> 100000000
|
|
266
|
|
267
|
|
268 Output files
|
|
269 ============
|
|
270
|
|
filtered_chr1.vcf
|
|
272 ----------------------------
|
|
273
|
|
274 ::
|
|
275
|
|
276 #fileformat=VCFv4.1
|
|
277 #FILTER=<ID=LowQual,Description="Low quality">
|
|
278 #FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
|
|
279 [...]
|
|
280 CHROM POS ID REF ALT QUAL FILTER INFO FORMAT CATB1
|
|
281 chr1 5059 . C G 146.84 . AC=2;AF=1.00;AN=2;DP=8;Dels=0.00;FS=0.000;HaplotypeScore=0.0000;MLEAC=2;MLEAF=1.00;MQ=24.14;MQ0=1;QD=18.35;EFF=INTRON(MODIFIER||||Cc01g00020|mRNA||GSCOCT00012438001|),UPSTREAM(MODIFIER||||Cc01g00010|mRNA||GSCOCT00012439001|) GT:AD:DP:GQ:PL 1/1:0,8:8:18:175,18,0
|
|
282
|
|
283
|
|
284 </help>
|
|
<citations>
    <!-- [HELP] As DOI or BibTex entry -->
    <!-- VCFtools reference paper. In the abstract, the Availability and Contact parts are kept as separate sentences so that the URL is not fused with the following word. -->
    <citation type="bibtex">
        @article{Danecek01082011,
        author = {Danecek, Petr and Auton, Adam and Abecasis, Goncalo and Albers, Cornelis A. and Banks, Eric and DePristo, Mark A. and Handsaker, Robert E. and Lunter, Gerton and Marth, Gabor T. and Sherry, Stephen T. and McVean, Gilean and Durbin, Richard and 1000 Genomes Project Analysis Group},
        title = {The variant call format and VCFtools},
        volume = {27},
        number = {15},
        pages = {2156-2158},
        year = {2011},
        doi = {10.1093/bioinformatics/btr330},
        abstract ={Summary: The variant call format (VCF) is a generic format for storing DNA polymorphism data such as SNPs, insertions, deletions and structural variants, together with rich annotations. VCF is usually stored in a compressed manner and can be indexed for fast data retrieval of variants from a range of positions on the reference genome. The format was developed for the 1000 Genomes Project, and has also been adopted by other projects such as UK10K, dbSNP and the NHLBI Exome Project. VCFtools is a software suite that implements various utilities for processing VCF files, including validation, merging, comparing and also provides a general Perl API. Availability: http://vcftools.sourceforge.net Contact: rd@sanger.ac.uk},
        URL = {http://bioinformatics.oxfordjournals.org/content/27/15/2156.abstract},
        eprint = {http://bioinformatics.oxfordjournals.org/content/27/15/2156.full.pdf+html},
        journal = {Bioinformatics}
        }
    </citation>

</citations>
|
|
304
|
|
305 </tool>
|