view artbio_bam_cleaning.xml @ 12:b0f65f88411f draft default tip

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/artbio_bam_cleaning commit 6a7ed08b76a45cfcda4e030163d9f246163934ed
author artbio
date Tue, 10 Oct 2023 11:14:10 +0000
parents 6815060fb056
children
line wrap: on
line source

<tool id="artbio_bam_cleaning" name="ARTbio bam cleaning" version="1.10+galaxy0">
    <!-- One-line summary of what the cleaning is based on. -->
    <description>on flags and PCR Duplicates and MD recalibration</description>
    <macros>
        <!-- macro.xml is not part of this file. NOTE(review): it presumably supplies the
             @pipefail@ and @set_fasta_index@ tokens and the reference_source_conditional
             macro that this tool refers to; confirm against macro.xml. -->
        <import>macro.xml</import>
    </macros>
    <requirements>
        <!-- samtools and sambamba are pinned to an exact "version=build" string;
             freebayes is pinned by version only. -->
        <!-- NOTE(review): freebayes presumably provides the bamleftalign executable
             called in the command section; confirm. -->
        <requirement type="package" version="1.10=h2e538c0_3">samtools</requirement>
        <requirement type="package" version="0.8.1=hadffe2f_1">sambamba</requirement>
        <requirement type="package" version="1.3.7">freebayes</requirement>
    </requirements>
    <!-- Any exit code of 1 or above is reported as a fatal error. -->
    <stdio>
        <exit_code range="1:" level="fatal" description="Error occurred" />
    </stdio>
    <!--
        Cleaning pipeline, run as one shell pipe without intermediate files:
          1. link the input BAM and its index under fixed names in the working directory
          2. sambamba view: keep alignments with a mapping quality of at least 1 that are
             mapped, have a mapped mate and are not flagged as duplicates
          3. bamleftalign against reference.fa
          4. samtools calmd -C 50 against reference.fa
          5. depending on specify_outputs, write the calmd stream to $calmd and/or
             filter it further (mapping quality at most 254, proper pairs only)
             into $fullfilter
        NOTE(review): reference.fa is presumably prepared by the @set_fasta_index@
        token, which is defined outside this file; confirm.
        Dataset paths are single-quoted so that unusual characters in a path cannot
        be split or expanded by the shell.
    -->
    <command detect_errors="exit_code"><![CDATA[
    @pipefail@
    @set_fasta_index@
    #set input_base = 'input'
    ln -f -s '$input_bam.metadata.bam_index' '${input_base}.bam.bai' &&
    ln -f -s '$input_bam' '${input_base}.bam' &&
    sambamba view -h -t \${GALAXY_SLOTS:-2} --filter="mapping_quality >= 1 and not(unmapped) and not(mate_is_unmapped) and not(duplicate)" -f "bam" '${input_base}.bam'
    | bamleftalign --fasta-reference reference.fa -c --max-iterations "5" -
    | samtools calmd  -C 50 -b -@ \${GALAXY_SLOTS:-2} - reference.fa
    #if $specify_outputs == 'just_calMD':
        > '$calmd'
    #else:
        #if $specify_outputs == 'both':
        | tee '$calmd'
        #end if
        | sambamba view -h -t \${GALAXY_SLOTS:-2} --filter='mapping_quality <= 254 and proper_pair' -f 'bam' /dev/stdin > '$fullfilter'
    #end if
    ]]></command>
    <inputs>
        <expand macro="reference_source_conditional" />
        <!-- Only BAM is accepted: the command links the dataset's bam_index metadata. -->
        <param name="input_bam" type="data" format="bam" label="BAM file to process"/>
        <!-- Selects which of the two cleaned BAM outputs are kept. -->
        <param name="specify_outputs" type="select" label="specify cleaned output(s)"
               display="radio"
               help="The tool first generates MD-recalibrated alignments, then discards
                     aberrant Mapping Quality alignments generated by calMD recalibration.
                     One, the other, or both types of outputs can be retained by the tool">
            <option value="just_calMD">Alignments are only MD-recalibrated (for split or discordant read aware variant callers)</option>
            <option value="calMDandMQ" selected="true">Alignments are MD-recalibrated AND mapping quality &gt; 254 are discarded AND only proper_pair flag is retained (for snv and small indel callers)</option>
            <option value="both">Both types of outputs are retained</option>
        </param>
    </inputs>
    <outputs>
        <!-- MD-recalibrated alignments; kept for the 'just_calMD' and 'both' choices. -->
        <data format="bam" name="calmd" label="CalMD filter (for lumpy-smoove)">
            <filter>specify_outputs in ('just_calMD', 'both')</filter>
        </data>
        <!-- Fully filtered alignments; kept for the 'calMDandMQ' and 'both' choices. -->
        <data format="bam" name="fullfilter" label="Full filtering (for somatic-varscan)">
            <filter>specify_outputs in ('calMDandMQ', 'both')</filter>
        </data>
    </outputs>
    <tests>
        <!-- Default choice (calMDandMQ): only the fully filtered BAM is produced. -->
        <test expect_num_outputs="1">
            <param name="reference_source_selector" value="history" />
            <param name="ref_file" value="chr22.fa" />
            <param name="input_bam" value="chr22_sample.bam" ftype="bam" />
            <param name="specify_outputs" value="calMDandMQ" />
            <output name="fullfilter" file="full.bam" ftype="bam" lines_diff="4" />
        </test>
        <!-- 'both': the calmd BAM and the fully filtered BAM are produced together. -->
        <test expect_num_outputs="2">
            <param name="reference_source_selector" value="history" />
            <param name="ref_file" value="chr22.fa" />
            <param name="input_bam" value="chr22_sample.bam" ftype="bam" />
            <param name="specify_outputs" value="both" />
            <output name="calmd" file="calmd.bam" ftype="bam" lines_diff="4" />
            <output name="fullfilter" file="full.bam" ftype="bam" lines_diff="4" />
        </test>
        <!-- 'just_calMD': only the calmd BAM is produced. -->
        <test expect_num_outputs="1">
            <param name="reference_source_selector" value="history" />
            <param name="ref_file" value="chr22.fa" />
            <param name="input_bam" value="chr22_sample.bam" ftype="bam" />
            <param name="specify_outputs" value="just_calMD" />
            <output name="calmd" file="calmd.bam" ftype="bam" lines_diff="4" />
        </test>
    </tests>
    <help>
ARTbio bam cleaning overview
============================

.. class:: infomark

This tool wraps several cleaning steps to produce BAM files suitable for subsequent
analyses with lumpy-smoove (or other large structural variation callers) or with
somatic-varscan (or other SNV and small indel callers).


Workflow 
=============

.. class:: infomark

The tool is using the following command line for filtering:

::

    sambamba view -h -t 8 --filter='mapping_quality >= 1 and not(unmapped) and not(mate_is_unmapped) and not(duplicate)' -f 'bam' $input_base".bam"
    &#124; bamleftalign --fasta-reference reference.fa -c --max-iterations "5" -
    &#124; samtools calmd  -C 50 -b -@ 4 - reference.fa &gt; $input_base".filt1.dedup.bamleft.calmd.bam" ;
    sambamba view -h -t 8 --filter='mapping_quality &lt;&#61; 254 and proper_pair' -f 'bam' -o $input_base".filt1.dedup.bamleft.calmd.filt2.bam" $input_base".filt1.dedup.bamleft.calmd.bam"
    
.. class:: warningmark

From version **1.7+galaxy0**, this tool assumes that the input BAM already has its
optical/PCR duplicate alignments marked appropriately in their flag value. If this is not the
case, it may be necessary to first run a tool that performs this job, for instance samtools markdup
or sambamba markdup.

Purpose
--------

This "workflow" tool was generated in order to limit the number of ``python metadata/set.py`` jobs
which occur at each step of standard galaxy workflows. Indeed, these jobs are poorly optimized and may last considerable
amounts of time when datasets are large, at each step, lowering the overall performance of the workflow.

    </help>
    <!-- Reference to cite for this tool, identified by its DOI. -->
    <citations>
        <citation type="doi">10.1371/journal.pone.0168397</citation>
    </citations>
</tool>