comparison varscan_mpileup2indel_from_bam.xml @ 0:10e2ea79ec55 draft

planemo upload for repository https://bitbucket.org/EMCbioinf/galaxy-tool-shed-tools commit 0bc9864516071632199ddf9a4ff403893060c99f
author yhoogstrate
date Thu, 05 Nov 2015 10:00:19 -0500
parents
children 2c56a59a112f
comparison
equal deleted inserted replaced
-1:000000000000 0:10e2ea79ec55
1 <?xml version="1.0" encoding="UTF-8"?>
2 <tool id="varscan_mpileup2indel_from_bam" name="VarScan2 Call INDELs from BAM" version="2.3.6.a">
3 <description>VarScan2 INDEL detection; directly reading *.bam file(s) &amp; using parallel mpileup generation, to avoid unnecessary I/O overhead and increase performance.</description>
4
5 <requirements>
6 <requirement type="package" version="0.1.19a">samtools_parallel_mpileup_0_1_19a</requirement>
7 <requirement type="package" version="0.1.19">samtools</requirement>
8 <requirement type="package" version="2.3.6">varscan</requirement>
9 </requirements>
10 <!-- Tool version: the first line printed by the VarScan jar, with stderr merged into stdout -->
11 <version_command>java -jar $JAVA_JAR_PATH/VarScan.v2.3.6.jar 2>&amp;1 | head -n 1</version_command>
12 <!-- Command template. If the reference is resolved from the dbkey metadata and the alignments do not share exactly one dbkey, only an error message is written to stderr. Otherwise alignments lacking a .bai file are indexed and the samtools mpileup stream is piped straight into VarScan mpileup2indel. -->
13 <command>
14 #if $reference_genome_source.source_select == "attribute" and len({ alignment.metadata.dbkey:True for alignment in $alignments }.keys()) != 1
15 echo "Invalid number of dbkeys are found: ${ len({ alignment.metadata.dbkey:True for alignment in $alignments }.keys()) }, while only one should be used. Make sure that the alignments are done on the same reference genome and that 'tool-data/all_fasta.loc' is configured properly!" >&amp;2
16 #else
17 #import os.path
18 #for $alignment in $alignments
19 <!-- @todo use the existence of $alignment.metadata.bam_index or $alignment.metadata['bam_index'] -->
20 #if not os.path.isfile(str($alignment)+".bai")
21 echo "- Indexing alignment file: $alignment.name " ;
22 samtools index $alignment 2>&amp;1 ;
23 #else
24 echo "- Skiping indexing: $alignment.name " ;
25 #end if
26 #end for
27 <!-- Pileup producer: the parallel mpileup build with a thread count, or classical samtools mpileup -->
28 #if $mpileup_parallelization.mpileup_parallelization_select == "true"
29 samtools-parallel-mpileup mpileup
30 -t $mpileup_parallelization.samtools_threads
31 #else
32 samtools mpileup
33 #end if
34 -f
35 #if $reference_genome_source.source_select == "indexed_filtered"
36 "$reference_genome_source.reference_genome"
37 #else if $reference_genome_source.source_select == "indexed_all"
38 "$reference_genome_source.reference_genome"
39 #else if $reference_genome_source.source_select == "history"
40 "$reference_genome_source.reference_genome"
41 #else
42 <!--
43 This is a workaround to obtain the "genome.fa" file that
44 corresponds to the dbkey of the alignments.
45 Because this file is "calculated" during run-time, it can
46 be used in a workflow.
47 -->
48 "${ filter( lambda x: str( x[0] ) == str( { alignment.metadata.dbkey:True for alignment in $alignments }.keys()[0] ), $__app__.tool_data_tables[ 'all_fasta' ].get_fields() )[0][-1] }"
49 #end if
50 <!-- Optional restriction of the pileup to one region or to a positions/BED file -->
51 #if $extended_parameters_regions.samtools_regions == "region"
52 -r $extended_parameters_regions.samtools_r
53 #elif $extended_parameters_regions.samtools_regions == "regions_file_pos" or $extended_parameters_regions.samtools_regions == "regions_file_bed"
54 -l $extended_parameters_regions.samtools_l
55 #end if
56 <!-- Extended samtools mpileup options; the boolean parameters expand to their flag or to an empty string -->
57 #if $extended_parameters.parameters == "extended"
58 $extended_parameters.samtools_6
59 $extended_parameters.samtools_A
60 $extended_parameters.samtools_B
61 -C $extended_parameters.samtools_C
62 -d $extended_parameters.samtools_d
63 $extended_parameters.samtools_E
64 -M $extended_parameters.samtools_M
65 $extended_parameters.samtools_R
66 -q $extended_parameters.samtools_q
67 -Q $extended_parameters.samtools_Q
68
69 -e $extended_parameters.samtools_e
70 -F $extended_parameters.samtools_F
71 -h $extended_parameters.samtools_h
72 $extended_parameters.samtools_I
73 -L $extended_parameters.samtools_L
74 -m $extended_parameters.samtools_m
75 -o $extended_parameters.samtools_o
76 $extended_parameters.samtools_p
77 -P $extended_parameters.samtools_P
78 #end if
79 <!-- Input alignments; stderr of the mpileup stage is kept in stderr_1.txt -->
80 #for $alignment in $alignments
81 ${alignment}
82 #end for
83 2>stderr_1.txt
84 <!-- Optional sort of the pileup stream, only available together with the parallel mpileup -->
85 #if $mpileup_parallelization.mpileup_parallelization_select == "true"
86 #if $mpileup_parallelization.sort_mpileup
87 | sort -k1,1V -k2,2g
88 #end if
89 #end if
90 <!-- Consumer: VarScan mpileup2indel reads the pileup from the pipe; its stderr is kept in stderr_2.txt -->
91 | java
92 -Xmx64G
93 -jar \$JAVA_JAR_PATH/VarScan.v2.3.6.jar
94 mpileup2indel
95
96 #if $extended_parameters.parameters == "extended"
97 --min-coverage $extended_parameters.varscan_min_coverage
98 --min-reads2 $extended_parameters.varscan_min_reads2
99 --min-avg-qual $extended_parameters.varscan_min_avg_qual
100 --min-var-freq $extended_parameters.varscan_min_var_freq
101 --min-freq-for-hom $extended_parameters.varscan_min_freq_for_hom
102 --p-value $extended_parameters.varscan_p_value
103 $extended_parameters.varscan_strand_filter
104 $extended_parameters.varscan_variants
105 #end if
106
107 #if $varscan_output == "vcf" or $varscan_output.value == "vcf"
108 --output-vcf 1
109 #end if
110
111 2>stderr_2.txt
112 > $snv_output ;
113
114 <!-- Replay the captured stderr of both stages on stdout, separated by banner lines -->
115 echo "---------------[ mpileup generation ]---------------" ;
116 cat stderr_1.txt ;
117 echo "" ;
118 echo "---------------[ VarScan INDEL detect ]-------------" ;
119 cat stderr_2.txt ;
120 echo "" ;
121 echo "----------------------------------------------------" ;
122 #end if
123 </command>
124
125 <inputs>
126 <param format="bam,sam" multiple="true" name="alignments" type="data" label="Alignment file(s)" help="Mapped reads in BAM or SAM format."/> <!-- One or more alignment datasets; the command loops over all of them for indexing and for the mpileup call -->
127
128 <!-- Select how the reference FASTA for the alignment file(s) is located: a built-in index from the all_fasta data table (filtered on the dbkey of the alignments, or the entire list), a FASTA dataset from the history, or a lookup based on the dbkey metadata of the alignments (this last branch has no parameters of its own) -->
129 <conditional name="reference_genome_source">
130 <param name="source_select" type="select" label="Fasta Source">
131 <option value="indexed_filtered">Use a built-in index (which fits your reference)</option>
132 <option value="history">Use reference from the history</option>
133 <option value="indexed_all">Use a built-in index (entire list) - avoid this option if possible; only useful if you design a workflow</option>
134 <option value="attribute">Use a built-in index based on the 'metadata.dbkey' attribute; ideal in workflows</option>
135 </param>
136 <when value="history">
137 <param name="reference_genome" format="fasta" type="data" label="Reference Genome used during alignment (FASTA)" help="Reference genome (genome.fa) that corresponds to the *.bam file." />
138 </when>
139 <when value="indexed_filtered">
140 <param name="reference_genome" type="select" label="Reference Genome used during alignment (FASTA)" >
141 <options from_data_table="all_fasta">
142 <column name="name" index="2"/>
143 <column name="dbkey" index="1"/>
144 <column name="value" index="3"/><!-- Value is the path of the fasta file -->
145 <filter type="data_meta" ref="alignments" multiple="false" key="dbkey" column="1" />
146 <validator type="no_options" message="No indexes are available for the selected input dataset" />
147 </options>
148 </param>
149 </when>
150 <when value="indexed_all">
151 <param name="reference_genome" type="select" label="Reference Genome used during alignment (FASTA)" >
152 <options from_data_table="all_fasta">
153 <column name="name" index="2"/>
154 <column name="dbkey" index="1"/>
155 <column name="value" index="3"/><!-- Value is the path of the fasta file -->
156 <validator type="no_options" message="No indexes are available for the selected input dataset" />
157 </options>
158 </param>
159 </when>
160 <when value="attribute" />
161 </conditional>
162 <!-- Optional restriction of the pileup: 'region' supplies the value for the samtools -r option, both file choices supply the dataset for the samtools -l option (tabular positions or BED), 'entire_genome' adds nothing -->
163 <conditional name="extended_parameters_regions">
164 <param name="samtools_regions" type="select" label="Region specific parameters" help="Let samtools target specific genomic locations.">
165 <option value="entire_genome">Entire genome</option>
166 <option value="region">Specific region</option>
167 <option value="regions_file_pos">Specific positions (file); list of positions</option>
168 <option value="regions_file_bed">Specific regions (file); list of regions in BED</option>
169 </param>
170 <when value="entire_genome" />
171 <when value="region">
172 <param type="text" name="samtools_r" label="Samtools: region in which pileup is generated" help="e.g. chrX or chr:pos or chr:start-end" />
173 </when>
174 <when value="regions_file_pos">
175 <param type="data" name="samtools_l" format="tabular" label="Samtools: list of positions (chr pos)" />
176 </when>
177 <when value="regions_file_bed">
178 <param type="data" name="samtools_l" format="bed" label="Samtools: specific regions (BED)" />
179 </when>
180 </conditional>
181 <!-- Choice between classical samtools mpileup and the parallel mpileup build; the thread count and the optional sort of the pileup stream only exist in the 'true' branch -->
182 <conditional name="mpileup_parallelization">
183 <param name="mpileup_parallelization_select" type="select" label="Use parallelization for the mpileup generation (experimental)" help="Especially if larger numbers of bam/sam files are processed, or the file infrastructure is optimized for IO-parallelization, this feature might improve performance.">
184 <option value="false" >False - uses classical samtools</option>
185 <option value="true">True - uses (experimental) samtools mpileup-parallel</option>
186 </param>
187 <when value="false" />
188 <when value="true">
189 <param type="integer" name="samtools_threads" value="2" min="1" label="Samtools: mpileup threads" />
190 <param type="boolean" name="sort_mpileup" truevalue="true" falsevalue="false" label="Sort mpileup file (SLOW)" help="Because parallelization may disrupt the output order, sorting can be convenient, e.g. for testing. Note that this option is only useful in a limited number of situations and consumes considerable resources. Only use it if it's really necessary." />
191 </when>
192 </conditional>
193 <!-- Advanced samtools mpileup and VarScan settings. The truevalue/falsevalue strings of the boolean parameters are inserted verbatim into the command line, so they must spell each option exactly as the tool expects (VarScan names its strand filter option strand-filter, with a hyphen). -->
194 <conditional name="extended_parameters">
195 <param name="parameters" type="select" label="Advanced parameters" help="For more advanced VarScan and samtools settings.">
196 <option value="default">Default settings</option>
197 <option value="extended">Extended settings</option>
198 </param>
199 <when value="default" />
200 <when value="extended">
201 <param type="boolean" name="samtools_6" falsevalue="" truevalue=" -6" label="Samtools: assume the quality is in the Illumina-1.3+ encoding" />
202 <param type="boolean" name="samtools_A" falsevalue="" truevalue=" -A" label="Samtools: count anomalous read pairs" />
203 <param type="boolean" name="samtools_B" falsevalue="" truevalue=" -B" label="Samtools: disable BAQ computation" />
204 <param type="integer" name="samtools_C" value="0" label="Samtools: parameter for adjusting mapQ; 0 to disable [0]" />
205 <param type="integer" name="samtools_d" value="250" label="Samtools: max per-BAM depth to avoid excessive memory usage [250]" />
206 <param type="boolean" name="samtools_E" falsevalue="" truevalue=" -E" label="Samtools: recalculate extended BAQ on the fly thus ignoring existing BQs" />
207 <param type="integer" name="samtools_M" value="60" label="cap mapping quality at INT [60]" />
208 <param type="boolean" name="samtools_R" falsevalue="" truevalue=" -R" label="Samtools: ignore RG tags" />
209 <param type="integer" name="samtools_q" value="0" label="Samtools: skip alignments with mapQ smaller than INT [0]" />
210 <param type="integer" name="samtools_Q" value="13" label="Samtools: skip bases with baseQ/BAQ smaller than INT [13]" />
211
212 <param type="integer" name="samtools_e" value="20" label="Samtools: Phred-scaled gap extension seq error probability [20]" />
213 <param type="float" name="samtools_F" value="0.002" label="Samtools: minimum fraction of gapped reads for candidates [0.002]" help="Alias: -F" />
214 <param type="integer" name="samtools_h" value="100" label="Samtools: coefficient for homopolymer errors [100]" />
215 <param type="boolean" name="samtools_I" falsevalue="" truevalue=" -I" label="Samtools: do not perform indel calling" />
216 <param type="integer" name="samtools_L" value="250" label="Samtools: max per-sample depth for INDEL calling [250]" />
217 <param type="integer" name="samtools_m" value="1" label="Samtools: minimum gapped reads for indel candidates [1]" help="Alias: -m" />
218 <param type="integer" name="samtools_o" value="40" label="Samtools: Phred-scaled gap open sequencing error probability [40]" />
219 <param type="boolean" name="samtools_p" falsevalue="" truevalue=" -p" label="Samtools: apply -m and -F per-sample to increase sensitivity" />
220 <param type="text" name="samtools_P" value="all" label="Samtools: comma separated list of platforms for indels [all]" />
221 <!-- VarScan mpileup2indel options -->
222 <param type="integer" name="varscan_min_coverage" value="8" label="VarScan: Minimum read depth at a position to make a call [8]" />
223 <param type="integer" name="varscan_min_reads2" value="2" label="VarScan: Minimum supporting reads at a position to call variants [2]" />
224 <param type="integer" name="varscan_min_avg_qual" value="15" label="VarScan: Minimum base quality at a position to count a read [15]" />
225 <param type="float" name="varscan_min_var_freq" value="0.01" label="VarScan: Minimum variant allele frequency threshold [0.01]" />
226 <param type="float" name="varscan_min_freq_for_hom" value="0.75" label="VarScan: Minimum frequency to call homozygote [0.75]" />
227 <param type="float" name="varscan_p_value" value="0.99" label="VarScan: Default p-value threshold for calling variants [99e-02]" />
228 <param type="boolean" name="varscan_strand_filter" falsevalue=" --strand-filter 0" truevalue=" --strand-filter 1" checked="true" label="VarScan: Ignore variants with >90% support on one strand [1]" />
229 <param type="boolean" name="varscan_variants" falsevalue=" --variants 0" truevalue=" --variants 1" checked="false" label="VarScan: Report only variant (SNP/indel) positions [0]" />
230 </when>
231 </conditional>
232 <!-- Output format of the VarScan call: 'vcf' makes the command add the VarScan output-vcf option and switches the datatype of snv_output to vcf; 'tabular' adds nothing and keeps the tabular datatype -->
233 <param name="varscan_output" type="select" label="Output format">
234 <option value="vcf">VCF</option>
235 <option value="tabular">tabular</option>
236 </param>
237 </inputs>
238 <!-- Single result dataset written by VarScan: tabular unless varscan_output is 'vcf'; its label lists the history id and name of every input alignment -->
239 <outputs>
240 <data format="tabular" name="snv_output" label="${tool.name} on ${', '.join([ str(a.hid)+': '+a.name for a in $alignments ])}">
241 <change_format>
242 <when input="varscan_output" value="vcf" format="vcf" />
243 </change_format>
244 </data>
245 </outputs>
246 <!-- Both tests call INDELs on example.bam against example.fa from the history with default settings and expect example.vcf; they differ only in the mpileup implementation. The output format is set through the real 'varscan_output' select parameter, and sort_mpileup is only set where its conditional branch exists. -->
247 <tests>
248 <test><!-- Use classical samtools -->
249 <param name="alignments" value="example.bam" ftype="bam" />
250
251 <param name="source_select" value="history" />
252 <param name="reference_genome" value="example.fa" ftype="fasta" />
253
254 <param name="samtools_regions" value="entire_genome" />
255
256 <param name="mpileup_parallelization_select" value="false" />
257
258
259 <param name="parameters" value="default" />
260 <param name="varscan_output" value="vcf" />
261
262
263 <output name="snv_output" file="example.vcf" />
264 </test>
265 <test><!-- Use parallelized samtools -->
266 <param name="alignments" value="example.bam" ftype="bam" />
267
268 <param name="source_select" value="history" />
269 <param name="reference_genome" value="example.fa" ftype="fasta" />
270
271 <param name="samtools_regions" value="entire_genome" />
272
273 <param name="mpileup_parallelization_select" value="true" />
274 <param name="samtools_threads" value="2" />
275 <param name="sort_mpileup" value="true" />
276
277 <param name="parameters" value="default" />
278 <param name="varscan_output" value="vcf" />
279
280
281 <output name="snv_output" file="example.vcf" />
282 </test>
283 </tests>
284
285 <help>
286 **VarScan 2.3.6**
287
288 VarScan is a platform-independent mutation caller for targeted, exome, and whole-genome resequencing data generated on Illumina, SOLiD, Life/PGM, Roche/454, and similar instruments. The newest version, VarScan 2, is written in Java, so it runs on most operating systems.
289 http://dx.doi.org/10.1101/gr.129684.111
290 http://www.ncbi.nlm.nih.gov/pubmed/19542151
291
292 *VarScan* requires mpileup formatted input files, which are generally derived from BAM files. Since mpileup files can become humongous, the interim step of storing it is bypassed. Thus, in this wrapper one or multiple BAM/SAM files go in, get processed into a mpileup file and get directly linked to VarScan.
293 The samtools package is not able to parallelize the mpileup generation, which makes it a very slow process.
294 Other people were aware of this and have written a version that can do parallelization:
295 https://github.com/mydatascience/parallel-mpileup
296
297 Consequently, when a BAM file is processed by this wrapper with parallelization enabled, it is processed by *parallel-mpileup* before it is sent to VarScan.
298
299 .. _VarScan: http://varscan.sourceforge.net/
300
301 **Input formats**
302
303 VarScan2 accepts sequencing alignments in either SAM or BAM format (http://samtools.sourceforge.net/). The alignment files have to be linked to a reference genome by galaxy. This is indicated under every history item with e.g.: *"database: hg19"* for a link to hg19, or *"database: ?"* if the link is missing.
304
305 **Installation**
306
307 Make sure your reference genomes are properly annotated in "tool-data/all_fasta.loc", and linked to the names of the reference used for alignment.
308
309 **License**
310
311 * VarScan2.3.6: Non-Profit Open Software License 3.0 (Non-Profit OSL 3.0)
312 * parallel-mpileup: MIT License (https://github.com/mydatascience/parallel-mpileup/blob/master/samtools-0.1.19/COPYING)
313
314
315 Contact
316 -------
317
318 The tool wrapper has been written by Youri Hoogstrate from the Erasmus
319 Medical Center (Rotterdam, Netherlands) on behalf of the Translational
320 Research IT (TraIT) project:
321
322 http://www.ctmm.nl/en/programmas/infrastructuren/traitprojecttranslationeleresearch
323
324 More tools by the Translational Research IT (TraIT) project can be found
325 in the following toolsheds:
326
327 http://toolshed.g2.bx.psu.edu/
328
329 http://testtoolshed.g2.bx.psu.edu/
330 </help>
331 <citations>
332 <citation type="doi">10.1101/gr.129684.111</citation>
333 </citations>
334 </tool>