comparison lumpy_sv.xml @ 0:477a07f387e0 draft

"planemo upload for repository https://github.com/arq5x/lumpy-sv commit cce17262b21b0964c31eb983bac5e89ae92b8ee9"
author iuc
date Thu, 12 Nov 2020 16:48:15 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:477a07f387e0
1 <tool id="lumpy_sv" name="LUMPY" version="@WRAPPER_VERSION@">
2 <description>is a probabilistic framework for structural variant discovery</description>
3 <macros>
4 <import>macros.xml</import>
5 <xml name="pe_options"> <!-- Paired-end (-pe) settings; expanded for every read group-specific repeat entry and for the default settings section. -->
6 <param name="read_length" type="integer" value="101" label="Length of sequenced reads" help="" />
7 <param name="min_non_overlap" type="integer" value="101" label="Number of base pair positions that must be unique to each end of a read pair" help="Some library preps are created with large reads and small library sizes such that reads overlap; in all other cases, overlapping reads tend to be a sign of an error. We typically set this to read length (pairs cannot overlap)." />
8 <param name="discordant_z" type="integer" value="5" label="Number of standard deviations away from the mean to be considered as a normal library size" help="" />
9 <param name="back_distance" type="integer" value="10" label="Distance into the read to add to the breakpoint interval" help="" />
10 <param name="min_mapping_threshold" type="integer" value="20" label="Minimum mapping quality (reported from the aligner) that a read must have to be considered" help="A quality of 1 will filter all reads with two or more equally good mappings." />
11 <param name="weight" type="integer" value="1" label="Weight of each piece of evidence from this sample" help="" />
12 </xml>
13 <xml name="sr_options"> <!-- Split-read (-sr) settings; expanded for every sample-specific repeat entry and for the default settings section. -->
14 <param name="sr_back_distance" type="integer" value="10" label="Distance around the +/- of the split to include in the breakpoint interval" help="A distance of 20 will create a breakpoint interval of size 40 centered at the split." />
15 <param name="sr_min_mapping_threshold" type="integer" value="20" label="Minimum mapping quality (reported from the aligner) that a read must have to be considered" help="A quality of 1 will filter all reads with two or more equally good mappings." />
16 <param name="sr_weight" type="integer" value="1" label="Weight of each piece of evidence from this sample" help="" />
17 </xml>
18 </macros>
19 <requirements>
20 <requirement type="package" version="@TOOL_VERSION@">lumpy-sv</requirement>
21 </requirements>
22 <command detect_errors="exit_code"><![CDATA[
23 python '$configure_job' > lumpy_job.sh && ## run the configfile's Python code; its stdout becomes the job shell script
24 chmod u+x lumpy_job.sh &&
25 cat lumpy_job.sh && ## copy the generated script to the job's stdout
26 ./lumpy_job.sh > '$result' ## run the generated script, capturing its stdout in the result dataset
27 ]]></command>
28 <configfiles>
29 <configfile name="configure_job"><![CDATA[
30 ## The Python script that gets put together here, will emit a shell script
31 ## with the necessary commands for a traditional (non-express) LUMPY workflow.
32 ## After running the python code, the resulting shell script is all that's
33 ## needed to run LUMPY with all user-specified settings.
34 import os
35 import pysam
36
37 preproc_cmds = []
38 lumpy_cmd_parts = ['lumpy -mw ${general.mw} -tt ${general.tt} ${general.e}']
39
40 ## symlink all input BAMs, and their index files where available, under short local names (f<n>.bam, d<n>.bam, s<n>.bam)
41 full_bams = []
42 disc_bams = []
43 split_bams = []
44 #for $n,$bam_file in enumerate($input_bams):
45 ## main input files are collated BAMs without index, so no .bai gets linked for them
46 os.symlink('$bam_file', 'f${n}.bam')
47 full_bams.append('f${n}.bam')
48 #end for
49 #if $discordant_alns:
50 #for $n, $disc_file in enumerate($discordant_alns):
51 os.symlink('$disc_file', 'd${n}.bam')
52 os.symlink('$disc_file.metadata.bam_index', 'd${n}.bam.bai')
53 disc_bams.append('d${n}.bam')
54 #end for
55 #end if
56 #if $split_alns:
57 #for $n, $split_file in enumerate($split_alns):
58 os.symlink('$split_file', 's${n}.bam')
59 os.symlink('$split_file.metadata.bam_index', 's${n}.bam.bai')
60 split_bams.append('s${n}.bam')
61 #end for
62 #end if
63 if not disc_bams and not split_bams:
64 raise Exception('Either discordant or split alignments are required as input')
65
66 ## make pe and sr params available in the Python code: the settings are collected in Cheetah dicts while the template is rendered (keyed by read group ID for pe, by sample name for sr, with the None key holding the defaults) and then written into the script as dict literals
67 #set $pe_rg_specs = {}
68 #for $per_rg_pe in $pe.rg_specific:
69 #silent $pe_rg_specs[str($per_rg_pe.rg_id)] = {
70 'read_len': int($per_rg_pe.read_length),
71 'min_non_overlap': int($per_rg_pe.min_non_overlap),
72 'discordant_z': int($per_rg_pe.discordant_z),
73 'back_distance': int($per_rg_pe.back_distance),
74 'min_mapping_threshold': int($per_rg_pe.min_mapping_threshold),
75 'weight': int($per_rg_pe.weight)
76 }
77 #end for
78 #silent $pe_rg_specs[None] = {
79 'read_len': int($pe.default.read_length),
80 'min_non_overlap': int($pe.default.min_non_overlap),
81 'discordant_z': int($pe.default.discordant_z),
82 'back_distance': int($pe.default.back_distance),
83 'min_mapping_threshold': int($pe.default.min_mapping_threshold),
84 'weight': int($pe.default.weight)
85 }
86 pe_rg_specs = $pe_rg_specs
87
88 #set $sr_sm_specs = {}
89 #for $per_sm_sr in $sr.sm_specific:
90 #silent $sr_sm_specs[str($per_sm_sr.rg_sm)] = {
91 'back_distance': int($per_sm_sr.sr_back_distance),
92 'min_mapping_threshold': int($per_sm_sr.sr_min_mapping_threshold),
93 'weight': int($per_sm_sr.sr_weight)
94 }
95 #end for
96
97 #silent $sr_sm_specs[None] = {
98 'back_distance': int($sr.default.sr_back_distance),
99 'min_mapping_threshold': int($sr.default.sr_min_mapping_threshold),
100 'weight': int($sr.default.sr_weight)
101 }
102 sr_sm_specs = $sr_sm_specs
103
104 ## Discover the read groups and samples defined in the headers of the main input files.
105 ## Their RG header records form the catalogue of read groups and samples to work with;
106 ## the discordant and split-reads inputs are matched against this catalogue further down.
107 ## For each combination of RG ID and SM, the main input that declares it is
108 ## recorded (if several inputs declare the same combination, the last one wins).
109 known_rg_records = {}
110 for bam_file in full_bams:
111 ibam = pysam.AlignmentFile(bam_file, 'rb')
112 for rg_record in ibam.header['RG']:
113 known_rg_records[(rg_record['ID'], rg_record['SM'])] = bam_file
114 ibam.close()
115 print('# Available read groups and samples in input BAMs:')
116 print('# RG\tSample')
117 for r, s in known_rg_records:
118 print('# {0}\t{1}'.format(r, s))
119
120 ## LUMPY analyzes *discordant pairs* per read group, so one -pe option is emitted per read group.
121 ## We only use read groups that we've also seen in any of the main inputs.
122 ## For these read groups, we retrieve the main input BAM that has the read
123 ## group and use the file to generate paired-end stats with the help of LUMPY's
124 ## pairend_distro script.
125 ## The paired-end stats (emitted as numbered .histo files, but also to stdout)
126 ## are then used as part of the actual LUMPY command, i.e. in the value of the
127 ## -pe option. The counter n numbers the .histo files and shell variables across all discordant inputs.
128 n = 0
129 for discordant_file in disc_bams:
130 dbam = pysam.AlignmentFile(discordant_file, 'rb')
131 sample_info = {}
132 for rg_record in dbam.header['RG']:
133 if (rg_record['ID'], rg_record['SM']) in known_rg_records:
134 if rg_record['SM'] not in sample_info:
135 sample_info[rg_record['SM']] = []
136 sample_info[rg_record['SM']].append({'ID': rg_record['ID'], 'bam_for_stats': known_rg_records[(rg_record['ID'], rg_record['SM'])]})
137 dbam.close()
138 for sample in sample_info:
139 for rg_info in sample_info[sample]:
140 rg_specs = pe_rg_specs.get(rg_info['ID'], pe_rg_specs[None])
141 preproc_cmds.append(
142 ## calculate pe stats, store the main output in distinct
143 ## -histo files per read group and capture stdout in
144 ## shell variables, one per read-group.
145 "pe_var_{0}=\$(samtools view -r '{1}' '{2}' | tail -n+${general.stat_tail_n} | pairend_distro.py -r {3} -X 4 -N 10000 -o rg{0}.histo | tr '\t' ',') &&"
146 .format(n, rg_info['ID'], rg_info['bam_for_stats'], rg_specs['read_len'])
147 )
148 ## reuse the .histo files and the captured stdout in the value
149 ## of LUMPY's -pe option
150 lumpy_cmd_parts.append(
151 '-pe id:{0},'
152 'read_group:{1},'
153 'bam_file:{2},'
154 'histo_file:rg{3}.histo,'
155 '\$pe_var_{3},'
156 'read_length:{read_len},'
157 'min_non_overlap:{min_non_overlap},discordant_z:{discordant_z},back_distance:{back_distance},weight:{weight},min_mapping_threshold:{min_mapping_threshold}'
158 .format(sample, rg_info['ID'], discordant_file, n, **rg_specs)
159 )
160 n += 1
161
162 ## *Split-reads* get analyzed on a per-sample level.
163 ## Again, we only use samples that we've seen in the main inputs before.
164 ## The names of these samples and the correponding split-reads bam files are
165 ## used directly in the -sr option value of the LUMPY main command.
166 for split_file in split_bams:
167 sbam = pysam.AlignmentFile(split_file, 'rb')
168 for rg_record in sbam.header['RG']:
169 if (rg_record['ID'], rg_record['SM']) in known_rg_records:
170 sm_specs = sr_sm_specs.get(rg_record['SM'], sr_sm_specs[None])
171 lumpy_cmd_parts.append(
172 '-sr id:{0},'
173 'bam_file:{1},'
174 'back_distance:{back_distance},weight:{weight},min_mapping_threshold:{min_mapping_threshold}'
175 .format(rg_record['SM'], split_file, **sm_specs)
176 )
177 sbam.close()
178 print('\n'.join(preproc_cmds))
179 print(' '.join(lumpy_cmd_parts))
180 ]]></configfile>
181 </configfiles>
182 <inputs>
183 <param name="input_bams" type="data" format="qname_sorted.bam" multiple="true" label="BAM input dataset" help="The read groups and samples defined in these datasets determine what gets analyzed; paired-end statistics are computed from them." />
184 <param name="discordant_alns" type="data" format="bam" multiple="true" optional="true" label="Discordant reads input" help="Only read groups that are also defined in the BAM input dataset(s) are used. At least one of the discordant reads and split alignments inputs must be provided." />
185 <param name="split_alns" type="data" format="bam" multiple="true" optional="true" label="Split alignments input" help="Only samples that are also defined in the BAM input dataset(s) are used. At least one of the discordant reads and split alignments inputs must be provided." />
186 <!--<param name="input_bedpe_file" type="data" format="bed" multiple="false" label="BEDPE file" help="Position sorted bedpe file containing the breakpoint intervals for this sample." />-->
187 <!-- General options: passed to the lumpy main command, except stat_tail_n, which controls the paired-end statistics preprocessing -->
188 <section name="general" title="General options" expanded="false">
189 <!--<param name="g" type="data" label="Genome file (-g)" help="Defines chromosome order" />-->
190 <param name="e" type="boolean" truevalue="-e" falsevalue=""
191 label="Show evidence for each call (-e)" help="" />
192 <!--<param name="w" type="integer" value="1000000" label="File read windows size (-w)" help="Default 1000000" />-->
193 <param name="mw" type="integer" value="4" label="Minimum weight across all samples for a call (-mw)" help="" />
194 <!--<param name="msw" type="integer" value="1" label="Minimum per-sample weight for a call (-msw)" help="" />-->
195 <param name="tt" type="integer" value="0" label="Trim threshold (-tt)" help="" />
196 <!--<param name="x" type="boolean" checked="true" label="Exclude bed file (-x)" help="" />-->
197 <!--<param name="t" type="text" value="" label="Temp file prefix, must be to a writeable directory (-t)" help="" />-->
198 <!--<param name="P" type="boolean" checked="false" label="Output probability curve for each variant (-P)" help="" />-->
199 <!--<param name="b" type="boolean" checked="false" label="output as BEDPE instead of VCF (-b)" help="" />-->
200 <param name="stat_tail_n" type="integer" min="1" value="100000" label="First alignment record to use for paired-end statistics" help="When the paired-end statistics of a read group are computed, the alignment records of that read group before this (1-based) position are skipped (the value is passed to tail -n+N). A value of 1 uses all records." />
201 </section>
202 <!-- PE options: optional per-read-group overrides, plus the default settings applied to every read group without an entry of its own -->
203 <section name="pe" title="Paired-end options (-pe)" expanded="false">
204 <repeat name="rg_specific" title="Read group-specific settings" default="0" min="0"
205 help="Define paired-end options to be applied to one specific read group.">
206 <param name="rg_id" type="text"
207 label="Read group to apply settings to"
208 help="All settings below will only be applied to reads belonging to the specified read group. The value provided here must correspond to one of the read group IDs defined in the main input and the discordant pairs datasets." />
209 <expand macro="pe_options" />
210 </repeat>
211 <section name="default" title="Default settings for unconfigured read groups" expanded="true">
212 <expand macro="pe_options" />
213 </section>
214 </section>
215 <!-- SR options: optional per-sample overrides, plus the default settings applied to every sample without an entry of its own -->
216 <section name="sr" title="Split-read options (-sr)" expanded="false">
217 <repeat name="sm_specific" title="Sample-specific settings" default="0" min="0"
218 help="Define split-reads options to be applied to one specific sample.">
219 <param name="rg_sm" type="text"
220 label="Sample to apply settings to"
221 help="All settings below will only be applied to reads of the specified sample. The value provided here must correspond to one of the read group SM values defined in the main input and the split-reads datasets." />
222 <expand macro="sr_options" />
223 </repeat>
224 <section name="default" title="Default settings for unconfigured samples" expanded="true">
225 <expand macro="sr_options" />
226 </section>
227 </section>
228 <!-- BEDPE options -->
229 <!--<section name="bedpe" title="BEDPE options (-bedpe)" expanded="false">-->
230 <!--<param name="bedpe_back_distance" type="integer" value="10" label="Distance into the read to add to the breakpoint interval" help="" />-->
231 <!--<param name="bedpe_weight" type="integer" value="1" label="Weight of each piece of evidence from this sample" help="" />-->
232 <!--</section>-->
233 </inputs>
234 <outputs>
235 <data name="result" format="vcf" label="${tool.name} on ${on_string}" />
236 </outputs>
237 <tests>
238 <test> <!-- discordant pairs only (no split alignments input), non-default default PE weight, paired-end statistics computed from the first record on -->
239 <param name="input_bams" ftype="qname_sorted.bam" value="blasted.bam" />
240 <param name="discordant_alns" ftype="bam" value="discordants.bam" />
241 <section name="general">
242 <param name="e" value="false" />
243 <param name="stat_tail_n" value="1" />
244 </section>
245 <section name="pe">
246 <section name="default">
247 <param name="weight" value="2" />
248 </section>
249 </section>
250 <output name="result" ftype="vcf" file="sample.vcf" />
251 </test>
252 </tests>
253 <help><![CDATA[
254 LUMPY
255 =============
256
257 A probabilistic framework for structural variant discovery.
258
259 For more information see the LUMPY documentation_.
260
261 .. _documentation: https://github.com/arq5x/lumpy-sv
262
263 ]]></help>
264 <expand macro="citations" />
265 </tool>