comparison strelka_germline.xml @ 0:1fbe84e8a740 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/strelka commit 2e445e7c519b2b77498cb74c03ca6ed12b22423a"
author iuc
date Wed, 27 Jan 2021 14:48:23 +0000
parents
children 19481653a22f
comparison
equal deleted inserted replaced
-1:000000000000 0:1fbe84e8a740
1 <?xml version="1.0"?>
2 <tool id="strelka_germline" name="Strelka Germline" version="@TOOL_VERSION@+@GALAXY_VERSION@">
3 <description>@DESCRIPTION@ for germline variation in small cohorts</description>
4 <macros>
5 <import>macros.xml</import>
6 </macros>
7 <expand macro="requirements"/>
<!--
  Command template (Cheetah directives and shell inside CDATA). Steps, in the order written below:
  1. Sanity check: if more than one sample file is selected while EVS is enabled and EVS feature
     reporting is requested, print an error to stderr and exit 1; everything else is skipped.
  2. Symlink every input as input_sample_INDEX.EXT, symlink its bam_index (.bai) or cram_index
     (.crai) metadata file next to it, and collect one bam argument per input into bam_spec.
  3. Stage the optional ploidy file: a plain vcf is bgzip-compressed and tabix-indexed, anything
     else is treated as already compressed and only symlinked together with its tabix index.
  4. Stage the optional noCompress BED (only when per-sample gVCFs are enabled): bgzip + tabix.
  5. Emit the INIT, CREATE and RUN macro tokens; their expansions are defined outside this file
     (presumably in macros.xml; confirm there how they chain with the surrounding commands).
  6. Move the main variants file to results/results/variants/variants_out, decompressing it first
     when oo.vcf_type is "decompressed".
-->
8 <command detect_errors="exit_code"><![CDATA[
9 ## sanity check
10 #if len($bam) > 1 and str($expert_settings.evs.selector) == "enableEVS" and $expert_settings.evs.reportEVSFeatures
11 echo "Reporting of EVS features can only be used with a single input sample" 1>&2; exit 1
12 #else
13 ## initialize
14 #set $bam_inputs = []
15 #for $i, $s in enumerate($bam):
16 #set $target_file = 'input_sample_%d.%s' % ($i, $s.ext)
17 ln -s '$s' $target_file &&
18 #if $s.is_of_type('bam')
19 ln -s '$s.metadata.bam_index' ${target_file}.bai &&
20 #elif $s.is_of_type('cram')
21 ln -s '$s.metadata.cram_index' ${target_file}.crai &&
22 #end if
23 #silent $bam_inputs.extend(['--bam', $target_file])
24 #end for
25 #set $bam_spec = ' '.join($bam_inputs)
26 ## Strelka requires both the --ploidy vcf and the --noCompress bed
27 ## to be bgzipped and tabixed.
28 ## Same for the --callRegions bed, but that's handled inside
29 ## the shared INIT code.
30 #if $pl.ploidy
31 #if $pl.ploidy.ext == 'vcf'
32 ln -s '$pl.ploidy' input_ploidy.vcf &&
33 bgzip -c input_ploidy.vcf > input_ploidy.vcf.gz &&
34 tabix -p vcf input_ploidy.vcf.gz &&
35 #else
36 ## File is bgzipped and tabixed already
37 ## -> just symlink data and index
38 ln -s '$pl.ploidy' input_ploidy.vcf.gz &&
39 ln -s '$pl.ploidy.metadata.tabix_index' input_ploidy.vcf.gz.tbi &&
40 #end if
41 #end if
42 #if $oo.gvcf.emit_gvcfs == 'yes' and $oo.gvcf.noCompress
43 ln -s '$oo.gvcf.noCompress' input_nocompress.bed &&
44 bgzip -c input_nocompress.bed > input_nocompress.bed.gz &&
45 tabix -p bed input_nocompress.bed.gz &&
46 #end if
47 @INIT@
48
49 ## create workflow
50 configureStrelkaGermlineWorkflow.py
51 $bam_spec
52 #if str($pl.callContinuousVf)
53 --callContinuousVf '$pl.callContinuousVf'
54 #end if
55 #if $pl.ploidy
56 --ploidy input_ploidy.vcf.gz
57 #end if
58 #if $oo.gvcf.emit_gvcfs == 'yes' and $oo.gvcf.noCompress
59 --noCompress input_nocompress.bed.gz
60 #end if
61 $expert_settings.s_e_e
62 @CREATE@
63
64 ## run workflow
65 @RUN@
66
67 ## decompress results
68 #if $oo.vcf_type == "decompressed"
69 ## we decompress just the main variants file
70 ## per-sample gvcf files are always emitted as a collection of
71 ## compressed files.
72 && bgzip -d results/results/variants/variants.vcf.gz
73 && mv results/results/variants/variants.vcf results/results/variants/variants_out
74 #else
75 && mv results/results/variants/variants.vcf.gz results/results/variants/variants_out
76 #end if
77 #end if
78 ]]></command>
79
<!-- Config file generated for each job: a [StrelkaGermline] section whose minMapq value comes from
     the "strelka" input section, followed by whatever the CONFIG macro token expands to (defined
     outside this file). Keep the content unindented; see the note inside the block. -->
80 <configfiles>
81 <configfile name="config_file">
82 ## parser cannot handle indents
83 [StrelkaGermline]
84 minMapq = $strelka.minMapq
85 @CONFIG@
86 </configfile>
87 </configfiles>
88
<!-- User-facing parameters. Declared directly here: the BAM/CRAM sample files ("bam", multiple),
     the s_e_e select for indel sequence error estimation (nested in the calling_model_expert macro;
     the command template reads it as expert_settings.s_e_e, so that macro presumably supplies the
     expert_settings section; confirm in macros.xml), and the sections "pl" (ploidy file and
     continuous-frequency chromosome), "oo" (output options incl. the gvcf conditional) and
     "strelka" (minMapq). Everything else comes from macros defined outside this file. -->
89 <inputs>
90 <param argument="--bam" type="data" format="bam,cram" multiple="true" label="Select sample file(s)" help=""/>
91 <expand macro="input_required" ref="bam"/>
92 <expand macro="calling_model">
93 <option value="--rna">RNA sequencing data (--rna)</option>
94 </expand>
95 <expand macro="calling_model_expert">
96 <param name="s_e_e" type="select" label="Configure sequence error estimation for indels"
97 help="By default, indel error rates are estimated from a subsample of the input sequencing data for each sample. This step can be disabled so that the tool reverts to precomputed indel error rates reflecting an intermediate point between different sequencing assays. Alternatively, all data (from sufficiently large chromosomes) can be used to obtain the estimate, but this may greatly increase runtime.">
98 <option value="--disableSequenceErrorEstimation">Use default sequence error estimate (--disableSequenceErrorEstimation)</option>
99 <option value="" selected="true">Estimate sequence error rate from a subsample of the input data (default)</option>
100 <option value="--useAllDataForSequenceErrorEstimation">Estimate sequence error rate from the full input data (--useAllDataForSequenceErrorEstimation)</option>
101 </param>
102 </expand>
103 <expand macro="regions_select" />
104 <section name="pl" title="Ploidy configuration" expanded="false">
105 <param argument="--ploidy" type="data" format="vcf,vcf_bgzip" optional="true" label="Select ploidy file" help="Provide ploidy file in VCF. The VCF should include one sample column per input sample labeled with the same sample names found in the input BAM/CRAM RG header sections. Ploidy should be provided in records using the FORMAT/CN field, which are interpreted to span the range [POS+1, INFO/END]. Any CN value besides 1 (haploid regions) or 0 (regions expected to be absent) will be treated as 2."/>
106 <param argument="--callContinuousVf" type="text" value="" label="Call variants without ploidy assumption on this chromosome" help="For the specified chromosome the sequencing data will be treated as a pooled sample: variants on it will be called with continuous frequencies and scored using a simple Poisson noise model. May be applied to the mitochondrial genome, for example." />
107 </section>
108 <section name="oo" title="Output options" expanded="false">
109 <expand macro="input_output"/>
110 <conditional name="gvcf">
111 <param name="emit_gvcfs" type="select" label="Generate per-sample gVCFs?">
112 <option value="no">No</option>
113 <option value="yes">Yes</option>
114 </param>
115 <when value="no" />
116 <when value="yes">
117 <param argument="--noCompress" type="data" format="bed" optional="true"
118 label="In the gVCFs, do NOT block-compress sites that fall in these regions"
119 help="If you have regions of special interest, for which you do not want site-information to be binned into blocks in the gVCF output, but would want to have calling statistics reported separately, you can specify those regions through a BED dataset here." />
120 </when>
121 </conditional>
122 </section>
123 <section name="strelka" title="Strelka run configuration" expanded="false">
124 <param argument="minMapq" name="minMapq" type="integer" value="20" label="Set minMapq" help="Don't use reads with MAPQ less than this value for variant calling."/>
125 <expand macro="input_strelka"/>
126 </section>
127 </inputs>
<!-- out_variants: the file the command template moves to results/results/variants/variants_out;
     format vcf unless oo.vcf_type is "compressed", in which case it is switched to vcf_bgzip.
     out_genome: list collection of the genome.NAME.vcf.gz files found in results/results/variants/
     (NAME becomes the element designation); only created when gVCF output is set to "yes". -->
128 <outputs>
129 <data name="out_variants" format="vcf" from_work_dir="results/results/variants/variants_out" label="${tool.name} on ${on_string}, Variants, vcf">
130 <change_format>
131 <when input="oo.vcf_type" value="compressed" format="vcf_bgzip" />
132 </change_format>
133 </data>
134 <collection name="out_genome" type="list" label="${tool.name} on ${on_string}: Genome, vcf">
135 <discover_datasets pattern="genome\.(?P&lt;designation&gt;.+)\.vcf\.gz&#36;" format="vcf_bgzip" directory="results/results/variants/" />
136 <filter>oo['gvcf']['emit_gvcfs'] == 'yes'</filter>
137 </collection>
138 </outputs>
139 <tests>
140 <!-- #1; three bam inputs, decompressed variants output, per-sample gVCFs enabled -->
141 <test expect_num_outputs="2">
142 <param name="bam" value="sample1.bam,sample2.bam,sample3.bam"/>
143 <conditional name="ref_cond">
144 <param name="ref_sel" value="history"/>
145 <param name="ref" value="hg98.fa" ftype="fasta"/>
146 </conditional>
147 <section name="expert_settings">
148 <param name="s_e_e" value="--disableSequenceErrorEstimation" />
149 </section>
150 <section name="oo">
151 <param name="vcf_type" value="decompressed"/>
152 <conditional name="gvcf">
153 <param name="emit_gvcfs" value="yes" />
154 </conditional>
155 </section>
156 <output name="out_variants" ftype="vcf">
157 <assert_contents>
158 <has_n_lines n="62"/>
159 <has_line_matching expression="#CHROM&#009;POS&#009;.+"/>
160 <has_line_matching expression="demo20&#009;3664&#009;.+"/>
161 </assert_contents>
162 </output>
163 <output_collection name="out_genome" type="list" count="3">
164 <element name="S1" ftype="vcf_bgzip" file="genome_test1.vcf" decompress="true" compare="diff" lines_diff="8" />
165 </output_collection>
166 </test>
167 <!-- #2; input cram, compressed -->
168 <test expect_num_outputs="1">
169 <param name="bam" value="sample1.cram,sample2.cram"/>
170 <conditional name="ref_cond">
171 <param name="ref_sel" value="history"/>
172 <param name="ref" value="hg98.fa" ftype="fasta"/>
173 </conditional>
174 <section name="expert_settings">
175 <param name="s_e_e" value="--disableSequenceErrorEstimation" />
176 </section>
177 <section name="oo">
178 <param name="vcf_type" value="compressed"/>
179 </section>
180 <output name="out_variants" ftype="vcf_bgzip" file="variants_test2.vcf" decompress="true" compare="diff" lines_diff="8" />
181 </test>
182 <!-- #3; input cram, non-default settings (rna option, callContinuousVf, minMapq, maxIndelSize), per-sample gVCFs enabled -->
183 <test expect_num_outputs="2">
184 <param name="bam" value="sample1.cram,sample2.cram"/>
185 <conditional name="ref_cond">
186 <param name="ref_sel" value="history"/>
187 <param name="ref" value="hg98.fa" ftype="fasta"/>
188 </conditional>
189 <param name="optimization" value="--rna" />
190 <section name="oo">
191 <param name="vcf_type" value="decompressed"/>
192 <conditional name="gvcf">
193 <param name="emit_gvcfs" value="yes" />
194 </conditional>
195 </section>
196 <section name="pl">
197 <param name="callContinuousVf" value="Chr1"/>
198 </section>
199 <section name="strelka">
200 <param name="minMapq" value="21"/>
201 <param name="maxIndelSize" value="51"/>
202 </section>
203 <output name="out_variants" ftype="vcf">
204 <assert_contents>
205 <has_n_lines n="81"/>
206 <has_line_matching expression="#CHROM&#009;POS&#009;.+"/>
207 <has_line_matching expression="demo20&#009;3664&#009;.+"/>
208 </assert_contents>
209 </output>
210 <output_collection name="out_genome" type="list" count="2">
211 <element name="S1" ftype="vcf_bgzip">
212 <assert_contents>
213 <has_n_lines n="219"/>
214 <has_line_matching expression="demo20&#009;4101&#009;.+"/>
215 </assert_contents>
216 </element>
217 </output_collection>
218 </test>
219 </tests>
220 <help><![CDATA[
221 .. class:: infomark
222
223 **What it does**
224
225 @HELP_STRELKA@
226
227 The germline caller employs an efficient tiered haplotype model to improve accuracy and provide read-backed phasing, adaptively selecting between assembly and a faster alignment-based haplotyping approach at each variant locus. The germline caller also analyzes input sequencing data using a mixture-model indel error estimation method to improve robustness to indel noise.
228
229 **Input**
230
231 @HELP_INPUT@
232
233 **Output**
234
235 *Variants*
236
237 This describes all potential variant loci across all samples. Note this file includes non-variant loci if they have a non-trivial level of variant evidence or contain one or more alleles for which genotyping has been forced. Please see the multi-sample variants VCF section of the Strelka user guide for additional details on interpreting this file.
238
239 *Genome*
240
241 This is the genome VCF output for sample N, which includes both variant records and compressed non-variant blocks. The sample index, N, is 1-indexed and corresponds to the input order of alignment files on the configuration command-line.
242
243 .. class:: infomark
244
245 **References**
246
247 @HELP_REFERENCES@
248 ]]></help>
249 <expand macro="citations"/>
250 </tool>