view vsnp_add_zero_coverage.xml @ 1:eaf4c304fd22 draft

Uploaded
author greg
date Tue, 21 Apr 2020 09:51:00 -0400
parents 3cb0bf7e1b2d
children 01312f8a6ca9
line wrap: on
line source

<tool id="vsnp_add_zero_coverage" name="vSNP: add zero coverage" version="1.0.0">
    <description></description>
    <!-- Package dependencies declared for this tool, each pinned to an exact version. -->
    <requirements>
        <requirement type="package" version="1.76">biopython</requirement>
        <requirement type="package" version="1.16.5">numpy</requirement>
        <requirement type="package" version="0.25.3">pandas</requirement>
        <requirement type="package" version="0.15.4">pysam</requirement>
    </requirements>
    <command detect_errors="exit_code"><![CDATA[
## Stage the selected BAM and VCF inputs as symlinks inside fixed working-directory
## folders, then run vsnp_add_zero_coverage.py. The folder names are not passed to the
## script on the command line, so it presumably locates them by these fixed names
## (verify against the script).
#import os
#import re
#set input_type = $input_type_cond.input_type
#set input_bam_dir = 'input_bam_dir'
#set input_vcf_dir = 'input_vcf_dir'
#set output_vcf_dir = 'output_vcf_dir'
#set output_metrics_dir = 'output_metrics_dir'
mkdir -p $input_bam_dir &&
mkdir -p $input_vcf_dir &&
mkdir -p $output_vcf_dir &&
mkdir -p $output_metrics_dir &&
#if str($input_type) == "single":
    ## Single files are linked under the base name of their dataset path.
    #set bam_input = $input_type_cond.bam_input
    #set file_name = $bam_input.file_name
    #set file_name_base = $os.path.basename($file_name)
    ln -s '$file_name' '$input_bam_dir/$file_name_base' &&
    #set vcf_input = $input_type_cond.vcf_input
    #set file_name = $vcf_input.file_name
    #set file_name_base = $os.path.basename($file_name)
    ln -s '$file_name' '$input_vcf_dir/$file_name_base' &&
#else:
    ## Collection elements are linked under their element identifier. The substitution
    ## below keeps whitespace, word characters and hyphens, so the link arguments are
    ## quoted; otherwise an identifier containing a space would split into several words.
    #for $i in $input_type_cond.bam_input_collection:
        #set filename = $i.file_name
        #set identifier = re.sub('[^\s\w\-]', '_', str($i.element_identifier))
        ln -s '$filename' '$input_bam_dir/$identifier' &&
    #end for
    #for $i in $input_type_cond.vcf_input_collection:
        #set filename = $i.file_name
        #set identifier = re.sub('[^\s\w\-]', '_', str($i.element_identifier))
        ln -s '$filename' '$input_vcf_dir/$identifier' &&
    #end for
#end if
python '$__tool_directory__/vsnp_add_zero_coverage.py'
--processes $processes
## A cached reference is resolved through the path field of the selected data table
## entry; a history reference is passed as the dataset itself.
#if str($reference_cond.reference_source) == "cached":
    --reference '$reference_cond.reference.fields.path'
#else:
    --reference '$reference_cond.reference'
#end if
## Explicit output paths are only given in single-file mode; in collection mode the
## outputs are discovered from the output folders created above.
#if str($input_type) == "single":
    --output_metrics '$output_metrics'
    --output_vcf '$output_vcf'
#end if
]]></command>
    <inputs>
        <!-- Selects whether single BAM/VCF datasets or list collections of them are
             analyzed; the command template and the output filters branch on this value.
             Every dataset input carries an unspecified_build validator. -->
        <conditional name="input_type_cond">
            <param name="input_type" type="select" label="Choose the category of the files to be analyzed">
                <option value="single" selected="true">Single files</option>
                <option value="collection">Collections of files</option>
            </param>
            <when value="single">
                <param name="bam_input" type="data" format="bam" label="BAM file">
                    <validator type="unspecified_build"/>
                </param>
                <param name="vcf_input" type="data" format="vcf" label="VCF file">
                    <validator type="unspecified_build"/>
                </param>
            </when>
            <when value="collection">
                <param name="bam_input_collection" type="data_collection" format="bam" collection_type="list" label="Collection of BAM files">
                    <validator type="unspecified_build"/>
                </param>
                <param name="vcf_input_collection" type="data_collection" format="vcf" collection_type="list" label="Collection of VCF files">
                    <validator type="unspecified_build"/>
                </param>
            </when>
        </conditional>
        <!-- Selects where the reference genome comes from: an entry of the fasta_indexes
             data table ("cached") or a fasta dataset from the history ("history"). -->
        <conditional name="reference_cond">
            <param name="reference_source" type="select" label="Choose the source for the reference genome">
                <option value="cached" selected="true">locally cached</option>
                <option value="history">from history</option>
            </param>
            <when value="cached">
                <param name="reference" type="select" label="Using reference genome">
                    <options from_data_table="fasta_indexes"/>
                    <!-- No <filter> tag here! -->
                    <validator type="no_options" message="A built-in reference genome is not available for the build associated with the selected BAM file"/>
                </param>
            </when>
            <when value="history">
                <param name="reference" type="data" format="fasta" label="Using reference genome">
                    <validator type="no_options" message="The current history does not include a fasta dataset"/>
                </param>
            </when>
        </conditional>
        <!-- Handed to the script as its processes option; limited to the range 1 to 20. -->
        <param name="processes" type="integer" min="1" max="20" value="8" label="Number of processes for job splitting"/>
    </inputs>
    <outputs>
        <!-- Each result exists in two forms selected by the input_type filter: a single
             dataset in "single" mode, or a list collection in "collection" mode whose
             elements are discovered from the output_vcf_dir / output_metrics_dir folders
             that the command template creates. -->
        <data name="output_vcf" format="vcf"  label="${tool.name} (filtered VCF) on ${on_string}">
            <filter>input_type_cond['input_type'] == 'single'</filter>
        </data>
        <collection name="output_vcf_collection" type="list" label="${tool.name} (filtered VCFs) on ${on_string}">
            <discover_datasets pattern="__name__" directory="output_vcf_dir" format="vcf" />
            <filter>input_type_cond['input_type'] == 'collection'</filter>
        </collection>
        <data name="output_metrics" format="tabular"  label="${tool.name} (metrics) on ${on_string}">
            <filter>input_type_cond['input_type'] == 'single'</filter>
        </data>
        <collection name="output_metrics_collection" type="list" label="${tool.name} (metrics) on ${on_string}">
            <discover_datasets pattern="__name__" directory="output_metrics_dir" format="tabular" />
            <filter>input_type_cond['input_type'] == 'collection'</filter>
        </collection>
    </outputs>
    <tests>
        <!-- Collection mode: two BAM/VCF pairs with the reference taken from the history.
             Every discovered element is compared against the same expected file. -->
        <test>
            <param name="input_type" value="collection"/>
            <param name="bam_input_collection">
                <collection type="list">
                    <element name="bam_input.bam" value="bam_input.bam" dbkey="89"/>
                    <element name="bam_input2.bam" value="bam_input2.bam" dbkey="89"/>
                </collection>
            </param>
            <param name="vcf_input_collection">
                <collection type="list">
                    <element name="vcf_input.vcf" value="vcf_input.vcf" dbkey="89"/>
                    <element name="vcf_input2.vcf" value="vcf_input2.vcf" dbkey="89"/>
                </collection>
            </param>
            <param name="reference_source" value="history"/>
            <param name="reference" value="NC_002945v4.fasta" ftype="fasta"/>
            <output_collection name="output_vcf_collection" type="list">
                <element name="vcf_input.vcf" file="output_vcf.vcf" ftype="vcf" compare="contains"/>
                <element name="vcf_input2.vcf" file="output_vcf.vcf" ftype="vcf" compare="contains"/>
            </output_collection>
            <output_collection name="output_metrics_collection" type="list">
                <element name="vcf_input.tabular" file="output_metrics.tabular" ftype="tabular" compare="contains"/>
                <element name="vcf_input2.tabular" file="output_metrics.tabular" ftype="tabular" compare="contains"/>
            </output_collection>
        </test>
        <!-- Single-file mode (stated explicitly, although it is also the default).
             Both expected results are declared as output elements so that each one is
             compared; the VCF was previously declared as a param and never checked. -->
        <test>
            <param name="input_type" value="single"/>
            <param name="bam_input" value="bam_input.bam" ftype="bam" dbkey="89"/>
            <param name="vcf_input" value="vcf_input.vcf" ftype="vcf" dbkey="89"/>
            <param name="reference_source" value="history"/>
            <param name="reference" value="NC_002945v4.fasta" ftype="fasta"/>
            <output name="output_vcf" file="output_vcf.vcf" ftype="vcf" compare="contains"/>
            <output name="output_metrics" file="output_metrics.tabular" ftype="tabular" compare="contains"/>
        </test>
    </tests>
    <help>
**What it does**

Accepts a single BAM file and its associated VCF file (or a collection of BAM files and an associated collection of VCF files)
and produces, for each BAM/VCF pair, a VCF file in which positions with no coverage are represented as "N".  Each output VCF
is restricted to SNPs and to those regions along the reference that have no coverage.

A metrics file is produced for each combination which provides the number of good SNPs, the average coverage and the genome
coverage percentage.

**Required Options**

 * **Choose the category of the files to be analyzed** - select "Single files" or "Collections of files", then select the appropriate history items (single BAM and VCF files or collections of BAM and VCF files) based on the selected option.
 * **Choose the source for the reference genome** - select "locally cached" if the reference associated with the BAM and VCF files is available within the Galaxy environment or "from history" to select the reference from the current history.
 * **Number of processes for job splitting** - Select the number of processes for splitting the job to shorten execution time.
    </help>
    <!-- BibTeX entry pointing at the vSNP GitHub repository. The journal and year fields
         hold the placeholder value None and the title states that the manuscript is in
         preparation. -->
    <citations>
        <citation type="bibtex">
            @misc{None,
            journal = {None},
            author = {1. Stuber T},
            title = {Manuscript in preparation},
            year = {None},
            url = {https://github.com/USDA-VS/vSNP},}
        </citation>
    </citations>
</tool>