changeset 0:477a07f387e0 draft

"planemo upload for repository https://github.com/arq5x/lumpy-sv commit cce17262b21b0964c31eb983bac5e89ae92b8ee9"
author iuc
date Thu, 12 Nov 2020 16:48:15 +0000
parents
children 88b35d91a662
files lumpy_sv.xml macros.xml test-data/blasted.bam test-data/discordants.bam test-data/sample.bam test-data/sample.vcf test-data/splitters.bam
diffstat 7 files changed, 310 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lumpy_sv.xml	Thu Nov 12 16:48:15 2020 +0000
@@ -0,0 +1,265 @@
+<tool id="lumpy_sv" name="LUMPY" version="@WRAPPER_VERSION@">
+    <description>is a probabilistic framework for structural variant discovery</description>
+    <macros>
+        <import>macros.xml</import>
+        <xml name="pe_options">
+            <param name="read_length" type="integer" value="101" label="Length of sequenced reads" help="" />
+            <param name="min_non_overlap" type="integer" value="101" label="Number of base pair positions that must be unique to each end of a read pair" help="Some library preps are created with large reads and small library sizes such that read overlap, in all over cases overlapping reads tends to be a sign of an error. We typically set this to read length (pairs cannot overlap)." />
+            <param name="discordant_z" type="integer" value="5" label="Number of standard deviations away from the mean to be considered as a normal library size" help="" />
+            <param name="back_distance" type="integer" value="10" label="Distance into the read to add to the breakpoint interval" help="" />
+            <param name="min_mapping_threshold" type="integer" value="20" label="Minimum mapping quality (reported from the aligner) that a read must have to be considered" help="A quality of 1 will filter all reads with two or more equally good mappings." />
+            <param name="weight" type="integer" value="1" label="Weight of each piece of evidence from this sample" help="" />
+        </xml>
+        <xml name="sr_options">
+            <param name="sr_back_distance" type="integer" value="10" label="Distance around the +/- of the split to include in the breakpoint interval" help="A distance of 20 will created a breakpoint interval of size 40 centered at the split." />
+            <param name="sr_min_mapping_threshold" type="integer" value="20" label="Minimum mapping quality (reported from the aligner) that a read must have to be considered" help="A quality of 1 will filter all reads with two or more equally good mappings." />
+            <param name="sr_weight" type="integer" value="1" label="Weight of each piece of evidence from this sample" help="" />
+        </xml>
+    </macros>
+    <requirements>
+        <requirement type="package" version="@TOOL_VERSION@">lumpy-sv</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+python '$configure_job' > lumpy_job.sh &&
+chmod u+x lumpy_job.sh &&
+cat lumpy_job.sh &&
+./lumpy_job.sh > '$result'
+    ]]></command>
+    <configfiles>
+        <configfile name="configure_job"><![CDATA[
+## The Python script that gets put together here, will emit a shell script
+## with the necessary commands for a traditional (non-express) LUMPY workflow.
+## After running the python code, the resulting shell script is all that's
+## needed to run LUMPY with all user-specified settings.
+import os
+import pysam
+
+preproc_cmds = []
+lumpy_cmd_parts = ['lumpy -mw ${general.mw} -tt ${general.tt} ${general.e}']
+
+## symlink in all input bams, and there index files if available
+full_bams = []
+disc_bams = []
+split_bams = []
+#for $n,$bam_file in enumerate($input_bams):
+## main input files are collated BAMs without index
+os.symlink('$bam_file', 'f${n}.bam')
+full_bams.append('f${n}.bam')
+#end for
+#if $discordant_alns:
+    #for $n, $disc_file in enumerate($discordant_alns):
+os.symlink('$disc_file', 'd${n}.bam')
+os.symlink('$disc_file.metadata.bam_index', 'd${n}.bam.bai')
+disc_bams.append('d${n}.bam')
+    #end for
+#end if
+#if $split_alns:
+    #for $n, $split_file in enumerate($split_alns):
+os.symlink('$split_file', 's${n}.bam')
+os.symlink('$split_file.metadata.bam_index', 's${n}.bam.bai')
+split_bams.append('s${n}.bam')
+    #end for
+#end if
+if not disc_bams and not split_bams:
+    raise Exception('Either discordant or split alignments are required as input')
+
+## make pe and sr params available in the Python code
+#set $pe_rg_specs = {}
+#for $per_rg_pe in $pe.rg_specific:
+    #silent $pe_rg_specs[str($per_rg_pe.rg_id)] = {
+        'read_len': int($per_rg_pe.read_length),
+        'min_non_overlap': int($per_rg_pe.min_non_overlap),
+        'discordant_z': int($per_rg_pe.discordant_z),
+        'back_distance': int($per_rg_pe.back_distance),
+        'min_mapping_threshold': int($per_rg_pe.min_mapping_threshold),
+        'weight': int($per_rg_pe.weight)
+     }
+#end for
+#silent $pe_rg_specs[None] = {
+    'read_len': int($pe.default.read_length),
+    'min_non_overlap': int($pe.default.min_non_overlap),
+    'discordant_z': int($pe.default.discordant_z),
+    'back_distance': int($pe.default.back_distance),
+    'min_mapping_threshold': int($pe.default.min_mapping_threshold),
+    'weight': int($pe.default.weight)
+}
+pe_rg_specs = $pe_rg_specs
+
+#set $sr_sm_specs = {}
+#for $per_sm_sr in $sr.sm_specific:
+    #silent $sr_sm_specs[str($per_sm_sr.rg_sm)] = {
+        'back_distance': int($per_sm_sr.sr_back_distance),
+        'min_mapping_threshold': int($per_sm_sr.sr_min_mapping_threshold),
+        'weight': int($per_sm_sr.sr_weight)
+     }
+#end for
+
+#silent $sr_sm_specs[None] = {
+    'back_distance': int($sr.default.sr_back_distance),
+    'min_mapping_threshold': int($sr.default.sr_min_mapping_threshold),
+    'weight': int($sr.default.sr_weight)
+}
+sr_sm_specs = $sr_sm_specs
+
+## discover read groups and samples defined in any of the input files
+## the main input files are used to build a catalogue of read groups and
+## samples to work with.
+## For each combination of RG ID and SM, the input that has those reads is
+## recorded
+known_rg_records = {}
+for bam_file in full_bams:
+    ibam = pysam.AlignmentFile(bam_file, 'rb')
+    for rg_record in ibam.header['RG']:
+        known_rg_records[(rg_record['ID'], rg_record['SM'])] = bam_file
+    ibam.close()
+print('# Available read groups and samples in input BAMs:')
+print('# RG\tSample')
+for r, s in known_rg_records:
+    print('# {0}\t{1}'.format(r, s))
+
+## Lumpy can analyze *discordant pairs* per read group.
+## We only use read groups that we've also seen in any of the main inputs.
+## For these read groups, we retrieve the main input BAM that has the read
+## group and use the file to generate paired-end stats with the help of LUMPY's
+## pairend_distro script.
+## The paired-end stats (emitted as numbered .histo files, but also to stdout)
+## are then used as part of the actual LUMPY command, i.e. in the value of the
+## -pe option.
+n = 0
+for discordant_file in disc_bams:
+    dbam = pysam.AlignmentFile(discordant_file, 'rb')
+    sample_info = {}
+    for rg_record in dbam.header['RG']:
+        if (rg_record['ID'], rg_record['SM']) in known_rg_records:
+            if rg_record['SM'] not in sample_info:
+                sample_info[rg_record['SM']] = []
+            sample_info[rg_record['SM']].append({'ID': rg_record['ID'], 'bam_for_stats': known_rg_records[(rg_record['ID'], rg_record['SM'])]})
+    dbam.close()
+    for sample in sample_info:
+        for rg_info in sample_info[sample]:
+            rg_specs = pe_rg_specs.get(rg_info['ID'], pe_rg_specs[None])
+            preproc_cmds.append(
+                ## calculate pe stats, store the main output in distinct
+                ## -histo files per read group and capture stdout in
+                ## shell variables, one per read-group.
+                "pe_var_{0}=\$(samtools view -r '{1}' '{2}' | tail -n+${general.stat_tail_n} | pairend_distro.py -r {3} -X 4 -N 10000 -o rg{0}.histo | tr '\t' ',') &&"
+                .format(n, rg_info['ID'], rg_info['bam_for_stats'], rg_specs['read_len'])
+            )
+            ## reuse the .histo files and the captured stdout in the value
+            ## of LUMPY's -pe option
+            lumpy_cmd_parts.append(
+                '-pe id:{0},'
+                'read_group:{1},'
+                'bam_file:{2},'
+                'histo_file:rg{3}.histo,'
+                '\$pe_var_{3},'
+                'read_length:{read_len},'
+                'min_non_overlap:{min_non_overlap},discordant_z:{discordant_z},back_distance:{back_distance},weight:{weight},min_mapping_threshold:{min_mapping_threshold}'
+                .format(sample, rg_info['ID'], discordant_file, n, **rg_specs)
+            )
+            n += 1
+
+## *Split-reads* get analyzed on a per-sample level.
+## Again, we only use samples that we've seen in the main inputs before.
+## The names of these samples and the correponding split-reads bam files are
+## used directly in the -sr option value of the LUMPY main command.
+for split_file in split_bams:
+    sbam = pysam.AlignmentFile(split_file, 'rb')
+    for rg_record in sbam.header['RG']:
+        if (rg_record['ID'], rg_record['SM']) in known_rg_records:
+            sm_specs = sr_sm_specs.get(rg_record['SM'], sr_sm_specs[None])
+            lumpy_cmd_parts.append(
+                '-sr id:{0},'
+                'bam_file:{1},'
+                'back_distance:{back_distance},weight:{weight},min_mapping_threshold:{min_mapping_threshold}'
+                .format(rg_record['SM'], split_file, **sm_specs)
+            )
+    sbam.close()
+print('\n'.join(preproc_cmds))
+print(' '.join(lumpy_cmd_parts))
+        ]]></configfile>
+    </configfiles>
+    <inputs>
+        <param name="input_bams" type="data" format="qname_sorted.bam" multiple="true" label="BAM input dataset" />
+        <param name="discordant_alns" type="data" format="bam" multiple="true" optional="true" label="Discordant reads input" />
+        <param name="split_alns" type="data" format="bam" multiple="true" optional="true" label="Split alignments input" />
+        <!--<param name="input_bedpe_file" type="data" format="bed" multiple="false" label="BEDPE file" help="Position sorted bedpe file containing the breakpoint intervals for this sample." />-->
+        <!-- General options -->
+        <section name="general" title="General options" expanded="false">
+            <!--<param name="g" type="data" label="Genome file (-g)" help="Defines chromosome order" />-->
+            <param name="e" type="boolean" truevalue="-e" falsevalue=""
+            label="Show evidence for each call (-e)" help="" />
+            <!--<param name="w" type="integer" value="1000000" label="File read windows size (-w)" help="Default 1000000" />-->
+            <param name="mw" type="integer" value="4" label="Minimum weight across all samples for a call (-mw)" help="" />
+            <!--<param name="msw" type="integer" value="1" label="Minimum per-sample weight for a call (-msw)" help="" />-->
+            <param name="tt" type="integer" value="0" label="Trim threshold (-tt)" help="" />
+            <!--<param name="x" type="boolean" checked="true" label="Exclude bed file (-x)" help="" />-->
+            <!--<param name="t" type="text" value="" label="Temp file prefix, must be to a writeable directory (-t)" help="" />-->
+            <!--<param name="P" type="boolean" checked="false" label="Output probability curve for each variant (-P)" help="" />-->
+            <!--<param name="b" type="boolean" checked="false" label="output as BEDPE instead of VCF (-b)" help="" />-->
+            <param name="stat_tail_n" type="integer" min="1" value="100000" label="stat_tail_n" help="" />
+        </section>
+        <!-- PE options -->
+        <section name="pe" title="Paired-end options (-pe)" expanded="false">
+            <repeat name="rg_specific" title="Read group-specific settings" default="0" min="0"
+            help="Define paired-end options to be applied to one specific read group.">
+                <param name="rg_id" type="text"
+                label="Read group to apply settings to"
+                help="All settings below will only be applied to reads belonging to the specified read group. The value provided here must correspond to one of the read group IDs defined in the main input and the discordant pairs datasets." />
+                <expand macro="pe_options" />
+            </repeat>
+            <section name="default" title="Default settings for unconfigured read groups" expanded="true">
+                <expand macro="pe_options" />
+            </section>
+        </section>
+        <!-- SR options -->
+        <section name="sr" title="Split-read options (-sr)" expanded="false">
+            <repeat name="sm_specific" title="Sample-specific settings" default="0" min="0"
+            help="Define split-reads options to be applied to one specific sample.">
+                <param name="rg_sm" type="text"
+                label="Sample to apply settings to"
+                help="All settings below will only be applied to reads of the specified sample. The value provided here must correspond to one of the read group SM values defined in the main input and the split-reads datasets." />
+                <expand macro="sr_options" />
+            </repeat>
+            <section name="default" title="Default settings for unconfigured samples" expanded="true">
+                <expand macro="sr_options" />
+            </section>
+        </section>
+        <!-- BEDPE options -->
+        <!--<section name="bedpe" title="BEDPE options (-bedpe)" expanded="false">-->
+            <!--<param name="bedpe_back_distance" type="integer" value="10" label="Distance into the read to add to the breakpoint interval" help="" />-->
+            <!--<param name="bedpe_weight" type="integer" value="1" label="Weight of each piece of evidence from this sample" help="" />-->
+        <!--</section>-->
+    </inputs>
+    <outputs>
+        <data name="result" format="vcf" label="${tool.name} on ${on_string}" />
+    </outputs>
+    <tests>
+        <test>
+            <param name="input_bams" ftype="qname_sorted.bam" value="blasted.bam" />
+            <param name="discordant_alns" ftype="bam" value="discordants.bam" />
+            <section name="general">
+                <param name="e" value="false" />
+                <param name="stat_tail_n" value="1" />
+            </section>
+            <section name="pe">
+                <section name="default">
+                    <param name="weight" value="2" />
+                </section>
+            </section>
+            <output name="result" ftype="vcf" file="sample.vcf" />
+        </test>
+    </tests>
+    <help><![CDATA[
+LUMPY
+=============
+
+A probabilistic framework for structural variant discovery.
+
+For more information see the LUMPY documentation_.
+
+.. _documentation: https://github.com/arq5x/lumpy-sv
+
+    ]]></help>
+    <expand macro="citations" />
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Thu Nov 12 16:48:15 2020 +0000
@@ -0,0 +1,10 @@
+<macros>
+    <token name="@WRAPPER_VERSION@">@TOOL_VERSION@+galaxy0</token>
+    <token name="@TOOL_VERSION@">0.3.1</token>
+    <xml name="citations">
+        <citations>
+            <citation type="doi">10.1186/gb-2014-15-6-r84</citation>
+            <yield />
+        </citations>
+    </xml>
+</macros>
Binary file test-data/blasted.bam has changed
Binary file test-data/discordants.bam has changed
Binary file test-data/sample.bam has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sample.vcf	Thu Nov 12 16:48:15 2020 +0000
@@ -0,0 +1,35 @@
+##fileformat=VCFv4.2
+##source=LUMPY
+##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">
+##INFO=<ID=SVLEN,Number=.,Type=Integer,Description="Difference in length between REF and ALT alleles">
+##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">
+##INFO=<ID=STRANDS,Number=.,Type=String,Description="Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)">
+##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description="Imprecise structural variation">
+##INFO=<ID=CIPOS,Number=2,Type=Integer,Description="Confidence interval around POS for imprecise variants">
+##INFO=<ID=CIEND,Number=2,Type=Integer,Description="Confidence interval around END for imprecise variants">
+##INFO=<ID=CIPOS95,Number=2,Type=Integer,Description="Confidence interval (95%) around POS for imprecise variants">
+##INFO=<ID=CIEND95,Number=2,Type=Integer,Description="Confidence interval (95%) around END for imprecise variants">
+##INFO=<ID=MATEID,Number=.,Type=String,Description="ID of mate breakends">
+##INFO=<ID=EVENT,Number=1,Type=String,Description="ID of event associated to breakend">
+##INFO=<ID=SECONDARY,Number=0,Type=Flag,Description="Secondary breakend in a multi-line variants">
+##INFO=<ID=SU,Number=.,Type=Integer,Description="Number of pieces of evidence supporting the variant across all samples">
+##INFO=<ID=PE,Number=.,Type=Integer,Description="Number of paired-end reads supporting the variant across all samples">
+##INFO=<ID=SR,Number=.,Type=Integer,Description="Number of split reads supporting the variant across all samples">
+##INFO=<ID=BD,Number=.,Type=Integer,Description="Amount of BED evidence supporting the variant across all samples">
+##INFO=<ID=EV,Number=.,Type=String,Description="Type of LUMPY evidence contributing to the variant call">
+##INFO=<ID=PRPOS,Number=.,Type=String,Description="LUMPY probability curve of the POS breakend">
+##INFO=<ID=PREND,Number=.,Type=String,Description="LUMPY probability curve of the END breakend">
+##ALT=<ID=DEL,Description="Deletion">
+##ALT=<ID=DUP,Description="Duplication">
+##ALT=<ID=INV,Description="Inversion">
+##ALT=<ID=DUP:TANDEM,Description="Tandem duplication">
+##ALT=<ID=INS,Description="Insertion of novel sequence">
+##ALT=<ID=CNV,Description="Copy number variable region">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=SU,Number=1,Type=Integer,Description="Number of pieces of evidence supporting the variant">
+##FORMAT=<ID=PE,Number=1,Type=Integer,Description="Number of paired-end reads supporting the variant">
+##FORMAT=<ID=SR,Number=1,Type=Integer,Description="Number of split reads supporting the variant">
+##FORMAT=<ID=BD,Number=1,Type=Integer,Description="Amount of BED evidence supporting the variant">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	sample1
+chr8	245202	1	N	<DEL>	.	.	SVTYPE=DEL;STRANDS=+-:3;SVLEN=-229;END=245431;CIPOS=-10,64;CIEND=-51,9;CIPOS95=-1,33;CIEND95=-22,3;IMPRECISE;SU=3;PE=3	GT:SU:PE	./.:3:3
+chr8	246846	2	N	<DEL>	.	.	SVTYPE=DEL;STRANDS=+-:2;SVLEN=-176;END=247022;CIPOS=-10,186;CIEND=-159,21;CIPOS95=0,137;CIEND95=-95,13;IMPRECISE;SU=2;PE=2	GT:SU:PE	./.:2:2
Binary file test-data/splitters.bam has changed