Mercurial > repos > iuc > lumpy_sv

<tool id="lumpy_sv" name="LUMPY" version="@WRAPPER_VERSION@">
    <description>is a probabilistic framework for structural variant discovery</description>
    <macros>
        <import>macros.xml</import>
        <xml name="pe_options">
            <param name="read_length" type="integer" value="101" label="Length of sequenced reads" help="" />
            <param name="min_non_overlap" type="integer" value="101" label="Number of base pair positions that must be unique to each end of a read pair" help="Some library preps are created with large reads and small library sizes such that read overlap, in all over cases overlapping reads tends to be a sign of an error. We typically set this to read length (pairs cannot overlap)." />
            <param name="discordant_z" type="integer" value="5" label="Number of standard deviations away from the mean to be considered as a normal library size" help="" />
            <param name="back_distance" type="integer" value="10" label="Distance into the read to add to the breakpoint interval" help="" />
            <param name="min_mapping_threshold" type="integer" value="20" label="Minimum mapping quality (reported from the aligner) that a read must have to be considered" help="A quality of 1 will filter all reads with two or more equally good mappings." />
            <param name="weight" type="integer" value="1" label="Weight of each piece of evidence from this sample" help="" />
        </xml>
        <xml name="sr_options">
            <param name="sr_back_distance" type="integer" value="10" label="Distance around the +/- of the split to include in the breakpoint interval" help="A distance of 20 will created a breakpoint interval of size 40 centered at the split." />
            <param name="sr_min_mapping_threshold" type="integer" value="20" label="Minimum mapping quality (reported from the aligner) that a read must have to be considered" help="A quality of 1 will filter all reads with two or more equally good mappings." />
            <param name="sr_weight" type="integer" value="1" label="Weight of each piece of evidence from this sample" help="" />
        </xml>
    </macros>
    <requirements>
        <requirement type="package" version="@TOOL_VERSION@">lumpy-sv</requirement>
    </requirements>
    <command detect_errors="exit_code"><![CDATA[
python '$configure_job' > lumpy_job.sh &&
chmod u+x lumpy_job.sh &&
cat lumpy_job.sh &&
./lumpy_job.sh > '$result'
    ]]></command>
    <configfiles>
        <configfile name="configure_job"><![CDATA[
## The Python script that gets put together here, will emit a shell script
## with the necessary commands for a traditional (non-express) LUMPY workflow.
## After running the python code, the resulting shell script is all that's
## needed to run LUMPY with all user-specified settings.
import os
import pysam

preproc_cmds = []
lumpy_cmd_parts = ['lumpy -mw ${general.mw} -tt ${general.tt} ${general.e}']

## symlink in all input bams, and there index files if available
full_bams = []
disc_bams = []
split_bams = []
#for $n,$bam_file in enumerate($input_bams):
## main input files are collated BAMs without index
os.symlink('$bam_file', 'f${n}.bam')
full_bams.append('f${n}.bam')
#end for
#if $discordant_alns:
    #for $n, $disc_file in enumerate($discordant_alns):
os.symlink('$disc_file', 'd${n}.bam')
os.symlink('$disc_file.metadata.bam_index', 'd${n}.bam.bai')
disc_bams.append('d${n}.bam')
    #end for
#end if
#if $split_alns:
    #for $n, $split_file in enumerate($split_alns):
os.symlink('$split_file', 's${n}.bam')
os.symlink('$split_file.metadata.bam_index', 's${n}.bam.bai')
split_bams.append('s${n}.bam')
    #end for
#end if
if not disc_bams and not split_bams:
    raise Exception('Either discordant or split alignments are required as input')

## make pe and sr params available in the Python code
#set $pe_rg_specs = {}
#for $per_rg_pe in $pe.rg_specific:
    #silent $pe_rg_specs[str($per_rg_pe.rg_id)] = {
        'read_len': int($per_rg_pe.read_length),
        'min_non_overlap': int($per_rg_pe.min_non_overlap),
        'discordant_z': int($per_rg_pe.discordant_z),
        'back_distance': int($per_rg_pe.back_distance),
        'min_mapping_threshold': int($per_rg_pe.min_mapping_threshold),
        'weight': int($per_rg_pe.weight)
     }
#end for
#silent $pe_rg_specs[None] = {
    'read_len': int($pe.default.read_length),
    'min_non_overlap': int($pe.default.min_non_overlap),
    'discordant_z': int($pe.default.discordant_z),
    'back_distance': int($pe.default.back_distance),
    'min_mapping_threshold': int($pe.default.min_mapping_threshold),
    'weight': int($pe.default.weight)
}
pe_rg_specs = $pe_rg_specs

#set $sr_sm_specs = {}
#for $per_sm_sr in $sr.sm_specific:
    #silent $sr_sm_specs[str($per_sm_sr.rg_sm)] = {
        'back_distance': int($per_sm_sr.sr_back_distance),
        'min_mapping_threshold': int($per_sm_sr.sr_min_mapping_threshold),
        'weight': int($per_sm_sr.sr_weight)
     }
#end for

#silent $sr_sm_specs[None] = {
    'back_distance': int($sr.default.sr_back_distance),
    'min_mapping_threshold': int($sr.default.sr_min_mapping_threshold),
    'weight': int($sr.default.sr_weight)
}
sr_sm_specs = $sr_sm_specs

## discover read groups and samples defined in any of the input files
## the main input files are used to build a catalogue of read groups and
## samples to work with.
## For each combination of RG ID and SM, the input that has those reads is
## recorded
known_rg_records = {}
for bam_file in full_bams:
    ibam = pysam.AlignmentFile(bam_file, 'rb')
    for rg_record in ibam.header['RG']:
        known_rg_records[(rg_record['ID'], rg_record['SM'])] = bam_file
    ibam.close()
print('# Available read groups and samples in input BAMs:')
print('# RG\tSample')
for r, s in known_rg_records:
    print('# {0}\t{1}'.format(r, s))

## Lumpy can analyze *discordant pairs* per read group.
## We only use read groups that we've also seen in any of the main inputs.
## For these read groups, we retrieve the main input BAM that has the read
## group and use the file to generate paired-end stats with the help of LUMPY's
## pairend_distro script.
## The paired-end stats (emitted as numbered .histo files, but also to stdout)
## are then used as part of the actual LUMPY command, i.e. in the value of the
## -pe option.
n = 0
for discordant_file in disc_bams:
    dbam = pysam.AlignmentFile(discordant_file, 'rb')
    sample_info = {}
    for rg_record in dbam.header['RG']:
        if (rg_record['ID'], rg_record['SM']) in known_rg_records:
            if rg_record['SM'] not in sample_info:
                sample_info[rg_record['SM']] = []
            sample_info[rg_record['SM']].append({'ID': rg_record['ID'], 'bam_for_stats': known_rg_records[(rg_record['ID'], rg_record['SM'])]})
    dbam.close()
    for sample in sample_info:
        for rg_info in sample_info[sample]:
            rg_specs = pe_rg_specs.get(rg_info['ID'], pe_rg_specs[None])
            preproc_cmds.append(
                ## calculate pe stats, store the main output in distinct
                ## -histo files per read group and capture stdout in
                ## shell variables, one per read-group.
                "pe_var_{0}=\$(samtools view -r '{1}' '{2}' | tail -n+${general.stat_tail_n} | pairend_distro.py -r {3} -X 4 -N 10000 -o rg{0}.histo | tr '\t' ',') &&"
                .format(n, rg_info['ID'], rg_info['bam_for_stats'], rg_specs['read_len'])
            )
            ## reuse the .histo files and the captured stdout in the value
            ## of LUMPY's -pe option
            lumpy_cmd_parts.append(
                '-pe id:{0},'
                'read_group:{1},'
                'bam_file:{2},'
                'histo_file:rg{3}.histo,'
                '\$pe_var_{3},'
                'read_length:{read_len},'
                'min_non_overlap:{min_non_overlap},discordant_z:{discordant_z},back_distance:{back_distance},weight:{weight},min_mapping_threshold:{min_mapping_threshold}'
                .format(sample, rg_info['ID'], discordant_file, n, **rg_specs)
            )
            n += 1

## *Split-reads* get analyzed on a per-sample level.
## Again, we only use samples that we've seen in the main inputs before.
## The names of these samples and the correponding split-reads bam files are
## used directly in the -sr option value of the LUMPY main command.
for split_file in split_bams:
    sbam = pysam.AlignmentFile(split_file, 'rb')
    for rg_record in sbam.header['RG']:
        if (rg_record['ID'], rg_record['SM']) in known_rg_records:
            sm_specs = sr_sm_specs.get(rg_record['SM'], sr_sm_specs[None])
            lumpy_cmd_parts.append(
                '-sr id:{0},'
                'bam_file:{1},'
                'back_distance:{back_distance},weight:{weight},min_mapping_threshold:{min_mapping_threshold}'
                .format(rg_record['SM'], split_file, **sm_specs)
            )
    sbam.close()
print('\n'.join(preproc_cmds))
print(' '.join(lumpy_cmd_parts))
        ]]></configfile>
    </configfiles>
    <inputs>
        <param name="input_bams" type="data" format="qname_sorted.bam" multiple="true" label="BAM input dataset" />
        <param name="discordant_alns" type="data" format="bam" multiple="true" optional="true" label="Discordant reads input" />
        <param name="split_alns" type="data" format="bam" multiple="true" optional="true" label="Split alignments input" />
        <!--<param name="input_bedpe_file" type="data" format="bed" multiple="false" label="BEDPE file" help="Position sorted bedpe file containing the breakpoint intervals for this sample." />-->
        <!-- General options -->
        <section name="general" title="General options" expanded="false">
            <!--<param name="g" type="data" label="Genome file (-g)" help="Defines chromosome order" />-->
            <param name="e" type="boolean" truevalue="-e" falsevalue=""
            label="Show evidence for each call (-e)" help="" />
            <!--<param name="w" type="integer" value="1000000" label="File read windows size (-w)" help="Default 1000000" />-->
            <param name="mw" type="integer" value="4" label="Minimum weight across all samples for a call (-mw)" help="" />
            <!--<param name="msw" type="integer" value="1" label="Minimum per-sample weight for a call (-msw)" help="" />-->
            <param name="tt" type="integer" value="0" label="Trim threshold (-tt)" help="" />
            <!--<param name="x" type="boolean" checked="true" label="Exclude bed file (-x)" help="" />-->
            <!--<param name="t" type="text" value="" label="Temp file prefix, must be to a writeable directory (-t)" help="" />-->
            <!--<param name="P" type="boolean" checked="false" label="Output probability curve for each variant (-P)" help="" />-->
            <!--<param name="b" type="boolean" checked="false" label="output as BEDPE instead of VCF (-b)" help="" />-->
            <param name="stat_tail_n" type="integer" min="1" value="100000" label="stat_tail_n" help="" />
        </section>
        <!-- PE options -->
        <section name="pe" title="Paired-end options (-pe)" expanded="false">
            <repeat name="rg_specific" title="Read group-specific settings" default="0" min="0"
            help="Define paired-end options to be applied to one specific read group.">
                <param name="rg_id" type="text"
                label="Read group to apply settings to"
                help="All settings below will only be applied to reads belonging to the specified read group. The value provided here must correspond to one of the read group IDs defined in the main input and the discordant pairs datasets." />
                <expand macro="pe_options" />
            </repeat>
            <section name="default" title="Default settings for unconfigured read groups" expanded="true">
                <expand macro="pe_options" />
            </section>
        </section>
        <!-- SR options -->
        <section name="sr" title="Split-read options (-sr)" expanded="false">
            <repeat name="sm_specific" title="Sample-specific settings" default="0" min="0"
            help="Define split-reads options to be applied to one specific sample.">
                <param name="rg_sm" type="text"
                label="Sample to apply settings to"
                help="All settings below will only be applied to reads of the specified sample. The value provided here must correspond to one of the read group SM values defined in the main input and the split-reads datasets." />
                <expand macro="sr_options" />
            </repeat>
            <section name="default" title="Default settings for unconfigured samples" expanded="true">
                <expand macro="sr_options" />
            </section>
        </section>
        <!-- BEDPE options -->
        <!--<section name="bedpe" title="BEDPE options (-bedpe)" expanded="false">-->
            <!--<param name="bedpe_back_distance" type="integer" value="10" label="Distance into the read to add to the breakpoint interval" help="" />-->
            <!--<param name="bedpe_weight" type="integer" value="1" label="Weight of each piece of evidence from this sample" help="" />-->
        <!--</section>-->
    </inputs>
    <outputs>
        <data name="result" format="vcf" label="${tool.name} on ${on_string}" />
    </outputs>
    <tests>
        <test>
            <param name="input_bams" ftype="qname_sorted.bam" value="blasted.bam" />
            <param name="discordant_alns" ftype="bam" value="discordants.bam" />
            <section name="general">
                <param name="e" value="false" />
                <param name="stat_tail_n" value="1" />
            </section>
            <section name="pe">
                <section name="default">
                    <param name="weight" value="2" />
                </section>
            </section>
            <output name="result" ftype="vcf" file="sample.vcf" />
        </test>
    </tests>
    <help><![CDATA[
LUMPY
=============

A probabilistic framework for structural variant discovery.

For more information see the LUMPY documentation_.

.. _documentation: https://github.com/arq5x/lumpy-sv

    ]]></help>
    <expand macro="citations" />
</tool>