view epic2/epic2_wrapper.xml @ 2:601ad3ea888b draft

author mpaya
date Wed, 08 May 2019 08:11:56 -0400
line wrap: on
line source

<tool id="epic2" name="epic2" version="@VERSION@.0">
    <description>peak calling of broad ChIP-Seq marks</description>
   <expand macro="requirements" />
        <!-- Any exit code from 1 to 125, or from 130 upwards, is reported as a fatal error. -->
        <exit_code range="1:125" level="fatal" description="Unknown error occurred" />
        <exit_code range="130:" level="fatal" description="Unknown error occurred" />
        <!-- A "not found" / "command not found" message for epic2 on stderr is reported as a
             fatal installation problem with a message aimed at the Galaxy administrators. -->
        <regex match="epic2: (command ){0,1}not found" source="stderr" level="fatal" description="The epic2 python package is not properly installed, contact Galaxy administrators" />

        ## Stage the treatment (ChIP) dataset(s) in the working directory.
        ## Each dataset is symlinked under a name whose extension (bed, bam or sam)
        ## matches its datatype; BAM files also get their index linked as <name>.bai.
        ## Every staged name is collected in $t_file_list, which is later joined to
        ## build the value of the -t option.
        #set $t_file_list = list()
        #if str($treatment.t_multi_select) == "No":
            ## Single treatment dataset
            #set $f = $treatment.input_treatment_file
            #if $f.is_of_type('bed')
                #set $t_file = 'ChIP_file.bed'
                ln -s '$f' $t_file &&
            #elif $f.is_of_type('bam')
                #set $t_file = 'ChIP_file.bam'
                ln -s '$f' $t_file &&
                ln -s '$f.metadata.bam_index' ${t_file}.bai &&
            #elif $f.is_of_type('sam')
                #set $t_file = 'ChIP_file.sam'
                ln -s '$f' $t_file &&
            #end if
            #silent $t_file_list.append($t_file)
        #else
            ## Pooled treatment datasets: one numbered link per dataset
            #set $inputs = $treatment.input_treatment_file
            #for $i, $f in enumerate($inputs)
                #if $f.is_of_type('bed')
                    #set $t_file = ''.join(['ChIP_file_',str($i),'.bed'])
                    ln -s '$f' $t_file &&
                #elif $f.is_of_type('bam')
                    #set $t_file = ''.join(['ChIP_file_',str($i),'.bam'])
                    ln -s '$f' $t_file &&
                    ln -s '$f.metadata.bam_index' ${t_file}.bai &&
                #elif $f.is_of_type('sam')
                    #set $t_file = ''.join(['ChIP_file_',str($i),'.sam'])
                    ln -s '$f' $t_file &&
                #end if
                #silent $t_file_list.append($t_file)
            #end for
        #end if

        ## Stage the control dataset(s), only when a control was supplied.
        ## Mirrors the treatment staging: links are named by datatype (bed, bam, sam),
        ## BAM indexes are linked as <name>.bai, and every staged name is collected
        ## in $c_file_list, which is later joined to build the value of the -c option.
        #if str($control.c_select) == "Yes":
            #set $c_file_list = list()
            #if str($control.c_multiple.c_multi_select) == "No":
                ## Single control dataset
                #set $f = $control.c_multiple.input_control_file
                #if $f.is_of_type('bed')
                    #set $c_file = 'control_file.bed'
                    ln -s '$f' $c_file &&
                #elif $f.is_of_type('bam')
                    #set $c_file = 'control_file.bam'
                    ln -s '$f' $c_file &&
                    ln -s '$f.metadata.bam_index' ${c_file}.bai &&
                #elif $f.is_of_type('sam')
                    #set $c_file = 'control_file.sam'
                    ln -s '$f' $c_file &&
                #end if
                #silent $c_file_list.append($c_file)
            #else
                ## Pooled control datasets: one numbered link per dataset
                #set $inputs = $control.c_multiple.input_control_file
                #for $i, $f in enumerate($inputs)
                    #if $f.is_of_type('bed')
                        #set $c_file = ''.join(['control_file',str($i),'.bed'])
                        ln -s '$f' $c_file &&
                    #elif $f.is_of_type('bam')
                        #set $c_file = ''.join(['control_file',str($i),'.bam'])
                        ln -s '$f' $c_file &&
                        ln -s '$f.metadata.bam_index' ${c_file}.bai &&
                    #elif $f.is_of_type('sam')
                        #set $c_file = ''.join(['control_file',str($i),'.sam'])
                        ln -s '$f' $c_file &&
                    #end if
                    #silent $c_file_list.append($c_file)
                #end for
            #end if
        #end if


            ## Invoke epic2; everything below is appended to this command line.
            epic2

            ## Treatment File(s)
            -t ${ ' '.join( $t_file_list ) }

            ## Control File(s); without a control the optional e-value applies instead
            #if str($control.c_select) == "Yes":
                -c ${ ' '.join( $c_file_list ) }
            #elif str($control.evalue) not in ("", "None"):
                --e-value $control.evalue
            #end if

            ## Predefined or Custom Genome
            #if str($genome.g_select) == "Yes":
                --genome ${genome.builtin_genome}
            #else
                #if str($genome.chromsizes.chr_select) == "No":
                    #if $genome.chromsizes.cs_file.is_of_type('fasta'):
                        ## Derive "name<TAB>length" lines from the FASTA on the fly
                        --chromsizes <(awk '/^>/ {if (seqlen) print seqlen;printf substr($1,2) "\t";seqlen=0;next}
                            {seqlen+=length($0)}END{print seqlen}' '${genome.chromsizes.cs_file}')
                    #else
                        ## Already a two-column chromosome sizes table
                        --chromsizes '${genome.chromsizes.cs_file}'
                    #end if
                #else
                    #if $genome.chromsizes.builtin_fasta.fields.path
                        --chromsizes <(awk '/^>/ {if (seqlen) print seqlen;printf substr($1,2) "\t";seqlen=0;next}
                            {seqlen+=length($0)}END{print seqlen}' '${genome.chromsizes.builtin_fasta.fields.path}')
                    #end if
                #end if
            #end if

            ## Optional numeric values are tested through their string form so that
            ## an explicitly entered 0 is not mistaken for "not set".
            #if str($genome.egf) not in ("", "None"):
                --effective-genome-fraction ${genome.egf}
            #end if

            #if str($fdr) not in ("", "None"):
                -fdr $fdr
            #end if

        ## BAM OPTIONS
        ## Optional integers are tested through their string form: a plain
        ## "#if $param" is false for an explicit 0, which would silently drop
        ## settings such as --filter-flag 0 or --mapq 0.
        #if str($bam_options.required_flag) not in ("", "None"):
            --required-flag $bam_options.required_flag
        #end if

        #if str($bam_options.filter_flag) not in ("", "None"):
            --filter-flag $bam_options.filter_flag
        #end if

        #if str($bam_options.mapq) not in ("", "None"):
            --mapq $bam_options.mapq
        #end if

        ## Boolean parameters expand to their truevalue (the flag) or to an empty string.
        $bam_options.autodetect_chroms

        ## Free-text value: quoted so it always reaches epic2 as a single argument.
        #if $bam_options.discard_chroms:
            --discard-chromosomes-pattern '$bam_options.discard_chroms'
        #end if

        ## ADVANCED OPTIONS
        $advanced_options.keep_dupes

        #if str($advanced_options.bin_size) not in ("", "None"):
            --bin-size $advanced_options.bin_size
        #end if

        #if str($advanced_options.gaps_allowed) not in ("", "None"):
            --gaps-allowed $advanced_options.gaps_allowed
        #end if

        #if str($advanced_options.fragment_size) not in ("", "None"):
            --fragment-size $advanced_options.fragment_size
        #end if

        $advanced_options.original_algorithm

        ## Island table goes to the "peaks" dataset; stderr is passed through awk 'NF',
        ## which drops empty lines before forwarding it to stderr.
        > '${peaks}'
        2> >(awk 'NF' >&2)

        ## Optional BED output, chained with && so it runs as a separate command after
        ## epic2 succeeds. The header line is skipped, islands are named island_<n>,
        ## and columns 8 and 9 hold -log10 of input columns 4 and 9, fixed at 50 when
        ## column 4 is 0.
        #if $to_bed:
            && awk 'NR>1{if ($4==0) {pv=50;qv=50}else{pv=-log($4)/log(10);qv=-log($9)/log(10)};
            print $1,$2,$3,"island_"NR-1,int($5),$6,$10,pv,qv}' OFS="\t" '${peaks}' > '${bed_peaks}'
        #end if

        <!-- Treatment (ChIP) input: a single dataset, or several datasets to be pooled. -->
        <conditional name="treatment">
            <param name="t_multi_select" type="select" label="Are you pooling Treatment Files?" help="" >
                <option value="No" selected="True">No</option>
                <option value="Yes">Yes</option>
            </param>
            <when value="No" >
                <param name="input_treatment_file" argument="-t" type="data" format="bam,sam,bed" label="ChIP-Seq Treatment File" help="(-t)" />
            </when>
            <when value="Yes">
                <param name="input_treatment_file" argument="-t" type="data" format="bam,sam,bed" multiple="true"
                       label="ChIP-Seq Treatment File" help="(-t)" />
            </when>
        </conditional>

        <!-- Control input: optional; either one dataset or several pooled datasets.
             Without a control, an e-value can be provided instead. -->
        <conditional name="control">
            <param name="c_select" type="select" label="Do you have a Control File?" >
                <option value="Yes">Yes</option>
                <option value="No" selected="True">No</option>
            </param>
            <when value="Yes">
                <conditional name="c_multiple">
                    <param name="c_multi_select" type="select" label="Are you pooling Control Files?" help="" >
                        <option value="No" selected="True">No</option>
                        <option value="Yes">Yes</option>
                    </param>
                    <when value="No" >
                        <param name="input_control_file" argument="-c" type="data" format="bam,sam,bed" label="ChIP-Seq Control File" help="(-c)" />
                    </when>
                    <when value="Yes">
                        <param name="input_control_file" argument="-c" type="data" format="bam,sam,bed" multiple="true"
                               label="ChIP-Seq Control File" help="(-c)" />
                    </when>
                </conditional>
            </when>
            <when value="No">
                <param name="evalue" argument="-e" type="integer" optional="True" label="e-value"
                       help="The E-value controls the genome-wide error rate of identified islands under the random
                             background assumption. Should be used when not using a control library. Default 1000." />
            </when>
        </conditional>

        <!-- Genome definition: a genome selected through the effectiveGenomeSize macro, or
             chromosome sizes taken from a history dataset or from an indexed fasta file. -->
        <conditional name="genome">
            <param name="g_select" type="select" label="Is your genome indexed?" >
                <option value="Yes" selected="True">Yes</option>
                <option value="No">No</option>
            </param>
            <when value="Yes">
                <expand macro="effectiveGenomeSize" />
                <param name="egf" argument="-egf" type="float" min="0" max="1" optional="True" label="Effective genome fraction"
                       help="Use a different effective genome fraction than the one included in epic2, which depends on genome and readlength. (-egf)" />
            </when>
            <when value="No">
                <conditional name="chromsizes">
                    <param name="chr_select" type="select" label="Do you want to use an indexed fasta file?"
                           help="Chromosome sizes will be calculated from the provided fasta file." >
                        <option value="No">No</option>
                        <option value="Yes" selected="True">Yes</option>
                    </param>
                    <when value="No" >
                        <param name="cs_file" argument="--chromsizes" type="data" format="fasta,txt,tabular,tsv" label="Chromosome sizes"
                               help="Provide a fasta file for automated calculation, or a tab-separated file with
                                     two columns: chromosome names and sizes. (--chromsizes)" />
                    </when>
                    <when value="Yes">
                        <param name="builtin_fasta" argument="--chromsizes" type="select" optional="True" label="Genome for fasta file" help="(--chromsizes)" >
                            <options from_data_table="fasta_indexes">
                                <filter type="sort_by" column="2" />
                                <validator type="no_options" message="No indexes are available" />
                            </options>
                        </param>
                    </when>
                </conditional>
                <param name="egf" argument="-egf" type="float" min="0" max="1" optional="True" label="Effective genome fraction"
                       help="The effective genome fraction is the proportion of the genome that is mappable, excluding Ns. (-egf)" />
            </when>
        </conditional>

            <!-- Optional FDR cutoff between 0 and 1; read as $fdr by the command template. -->
            <param name="fdr" argument="-fdr" type="float" min="0" max="1" optional="True" label="False discovery rate cutoff" 
                   help="Remove all islands with an FDR above cutoff. Default 0.05 (-fdr)" />

            <!-- Unchecked by default; when checked, the command template also writes the bed_peaks output. -->
            <param name="to_bed" type="boolean" checked="false" label="Print output in bed format?"/>

        <!-- Read filtering and chromosome selection options; every parameter is optional. -->
        <section name="bam_options" title="BAM Options">
            <param name="required_flag" argument="--required-flag" type="integer" optional="True" label="Required flag"
                   help="Keep reads with these bits set in flag. Same as `samtools view -f`. Default 0. (--required-flag)" />
            <param name="filter_flag" argument="--filter-flag" type="integer" optional="True" label="Filter flag"
                   help="Discard reads with these bits set in flag. Same as `samtools view -F`. Default 1540 (hex: 0x604). (--filter-flag)" />
            <param name="mapq" argument="--mapq" type="integer" optional="True" label="Mapping quality"
                   help="Discard reads with mapping quality lower than this. Default 5. (--mapq)" />
            <param name="autodetect_chroms" type="boolean" checked="false" truevalue="--autodetect-chroms" falsevalue="" label="Autodetect chromosomes?"
                   help="Autodetect chromosomes from bam file. Use with --discard-chromosomes-pattern to avoid non-canonical chromosomes. (--autodetect-chroms)" />
            <param name="discard_chroms" argument="--discard-chromosomes-pattern" type="text" optional="True" label="Discard chromosomes pattern"
                   help="Discard reads from chromosomes matching
                        this pattern. Default '_'. Note that if you are not
                        interested in the results from non-canonical
                        chromosomes, you should ensure they are removed with
                        this flag, otherwise they will make the statistical
                        analysis too stringent. (--discard-chromosomes-pattern)"/>
        </section>
        <!-- Algorithm tuning options; every parameter is optional. -->
        <section name="advanced_options" title="Advanced Options">
            <param name="keep_dupes" type="boolean" checked="false" truevalue="--keep-duplicates" falsevalue="" label="Keep duplicates?"
                   help="Keep reads mapping to the same position on the same strand within a library. (--keep-duplicates)" />
            <param name="bin_size" argument="--bin-size" type="integer" optional="True" label="Bin size"
                   help="Size of the windows to scan the genome. BIN-SIZE is the smallest possible island. Default 200. (--bin-size)" />
            <param name="gaps_allowed" argument="--gaps-allowed" type="integer" optional="True" label="Gaps allowed"
                   help="This number is multiplied by the window size to determine the number of gaps (ineligible windows) allowed
                         between two eligible windows. Default 3. (--gaps-allowed)"/>
            <param name="fragment_size" argument="--fragment-size" type="integer" optional="True" label="Fragment size"
                   help="(Single end reads only) Size of the sequenced fragment. Each read is extended half the fragment size from the 5' end.
                         Default 150 (i.e. extend by 75). (--fragment-size)" />
            <param name="original_algorithm" type="boolean" checked="false" truevalue="--original-algorithm" falsevalue=""
                   label="Compute p-values with SICER original algorithm?"
                   help="Use the original SICER algorithm, without the epic2 fix. This will use all reads in your files to compute
                        the p-values, including those falling outside the genome boundaries. (--original-algorithm)" />
        </section>

        <!-- Island table written by epic2. -->
        <data format="tabular" name="peaks" label="${tool.name} on ${on_string}"/>
        <!-- BED conversion of the island table; only created when "to_bed" is checked. -->
        <data format="bed" name="bed_peaks" label="${tool.name} on ${on_string}: BED">
            <filter>to_bed</filter>
        </data>


            <!-- Case 1: BAM treatment and control, default genome settings. -->
            <param name="input_treatment_file" value="test.bam" ftype="bam" />
            <param name="c_select" value="Yes" />
            <param name="input_control_file" value="control.bam" ftype="bam"/>
            <output name="peaks" file="epic2_results.txt"/>
            <!-- Case 2: BED treatment and control, default genome settings. -->
            <param name="input_treatment_file" value="test.bed.gz" ftype="bed" />
            <param name="c_select" value="Yes" />
            <param name="input_control_file" value="control.bed.gz" ftype="bed"/>
            <output name="peaks" file="epic2_results1.txt"/>
            <!-- Case 3: chromosome sizes from a table, custom genome fraction, original SICER algorithm. -->
            <param name="input_treatment_file" value="test_ChIP.bam" ftype="bam" />
            <param name="c_select" value="Yes" />
            <param name="input_control_file" value="test_Input.bam" ftype="bam"/>
            <param name="g_select" value="No" />
            <param name="chr_select" value="No" />
            <param name="cs_file" value="test_chromsizes.txt" />
            <param name="egf" value="0.99" />
            <param name="original_algorithm" value="true" />
            <output name="peaks" file="epic2_results2.txt"/>
            <!-- Case 4: chromosome sizes from a fasta file, BED output, custom mapq, bin size and gaps. -->
            <param name="input_treatment_file" value="test_ChIP.bam" ftype="bam" />
            <param name="c_select" value="Yes" />
            <param name="input_control_file" value="test_Input.bam" ftype="bam"/>
            <param name="g_select" value="No" />
            <param name="chr_select" value="No" />
            <param name="cs_file" value="test_fasta.fasta" />
            <param name="egf" value="0.99" />
            <param name="to_bed" value="true" />
            <param name="mapq" value="10" />
            <param name="bin_size" value="100" />
            <param name="gaps_allowed" value="0" />
            <output name="peaks" file="epic2_results3.txt"/>
            <output name="bed_peaks" file="epic2_results3.bed"/>

ChIP-Seq broad peak/domain finder based on SICER.

**Accepted input formats**

Input file extension must be bed, bedpe, bam or sam.

**Broad peaks format**

The original epic2 output does not follow any standard format, so an optional broad peaks BED output is offered. This format follows the ENCODE standard, BED 6 + 3, and contains the following columns:

        * **1.** Chrom 
        * **2.** Start 
        * **3.** End 
        * **4.** Name 
        * **5.** Score 
        * **6.** Strand 
        * **7.** log2FoldChange 
        * **8.** -log10PValue 
        * **9.** -log10FDR

.. class:: warningmark

        In columns 8 and 9, the value is set to its maximum of 50 when the p-value is 0.0.

Tool adapted to Galaxy by Miriam Payá Milans. Original documentation on
    <expand macro="citations" />