view epic2 0.41/epic2_wrapper.xml @ 9:ea03253198b4 draft default tip

Uploaded
author mpaya
date Mon, 03 Feb 2020 11:46:00 -0500
parents 5f3952470864
children
line wrap: on
line source

<tool id="epic2" name="epic2" version="@VERSION@">
    <description>peak calling of broad ChIP-Seq marks</description>
    <macros>
        <import>macros.xml</import>
    </macros>
   <expand macro="requirements" />
    
    <stdio>
        <!-- Exit codes 1-125 and 130 and above are reported as fatal errors;
             codes 126-129 are not matched by either range. -->
        <exit_code range="1:125" level="fatal" description="Unknown error occurred" />
        <exit_code range="130:" level="fatal" description="Unknown error occurred" />
        <!-- A "epic2: not found" / "epic2: command not found" message on stderr
             is reported as a fatal installation problem. -->
        <regex match="epic2: (command ){0,1}not found" source="stderr" level="fatal" description="The epic2 python package is not properly installed, contact Galaxy administrators" />
    </stdio>
    
    <command><![CDATA[

        ## ---------------------------------------------------------------------
        ## Treatment files
        ## Every dataset is symlinked into the working directory under a name
        ## that carries a bed / bam / sam extension (the extension must be one
        ## of those). BAM inputs also get their index linked next to them as
        ## "<name>.bai". The link names are collected in t_file_list.
        ## ---------------------------------------------------------------------
        #set $t_file_list = list()
        #if str($treatment.t_multi_select) == "No"
            #set $f = $treatment.input_treatment_file
            #if $f.is_of_type('bed')
                #set $t_file = 'ChIP_file.bed'
                ln -s '$f' '$t_file' &&
            #elif $f.is_of_type('bam')
                #set $t_file = 'ChIP_file.bam'
                ln -s '$f' '$t_file' &&
                ln -s '$f.metadata.bam_index' '${t_file}.bai' &&
            #elif $f.is_of_type('sam')
                #set $t_file = 'ChIP_file.sam'
                ln -s '$f' '$t_file' &&
            #end if
            ## silent: append() returns None, nothing must reach the command line
            #silent $t_file_list.append($t_file)
        #else
            #for $i, $f in enumerate($treatment.input_treatment_file)
                #if $f.is_of_type('bed')
                    #set $t_file = ''.join(['ChIP_file_', str($i), '.bed'])
                    ln -s '$f' '$t_file' &&
                #elif $f.is_of_type('bam')
                    #set $t_file = ''.join(['ChIP_file_', str($i), '.bam'])
                    ln -s '$f' '$t_file' &&
                    ln -s '$f.metadata.bam_index' '${t_file}.bai' &&
                #elif $f.is_of_type('sam')
                    #set $t_file = ''.join(['ChIP_file_', str($i), '.sam'])
                    ln -s '$f' '$t_file' &&
                #end if
                #silent $t_file_list.append($t_file)
            #end for
        #end if

        ## ---------------------------------------------------------------------
        ## Control files (only when the user declares a control); same linking
        ## scheme as for the treatment files, names collected in c_file_list.
        ## ---------------------------------------------------------------------
        #if str($control.c_select) == "Yes"
            #set $c_file_list = list()
            #if str($control.c_multiple.c_multi_select) == "No"
                #set $f = $control.c_multiple.input_control_file
                #if $f.is_of_type('bed')
                    #set $c_file = 'control_file.bed'
                    ln -s '$f' '$c_file' &&
                #elif $f.is_of_type('bam')
                    #set $c_file = 'control_file.bam'
                    ln -s '$f' '$c_file' &&
                    ln -s '$f.metadata.bam_index' '${c_file}.bai' &&
                #elif $f.is_of_type('sam')
                    #set $c_file = 'control_file.sam'
                    ln -s '$f' '$c_file' &&
                #end if
                #silent $c_file_list.append($c_file)
            #else
                #for $i, $f in enumerate($control.c_multiple.input_control_file)
                    #if $f.is_of_type('bed')
                        #set $c_file = ''.join(['control_file', str($i), '.bed'])
                        ln -s '$f' '$c_file' &&
                    #elif $f.is_of_type('bam')
                        #set $c_file = ''.join(['control_file', str($i), '.bam'])
                        ln -s '$f' '$c_file' &&
                        ln -s '$f.metadata.bam_index' '${c_file}.bai' &&
                    #elif $f.is_of_type('sam')
                        #set $c_file = ''.join(['control_file', str($i), '.sam'])
                        ln -s '$f' '$c_file' &&
                    #end if
                    #silent $c_file_list.append($c_file)
                #end for
            #end if
        #end if

        epic2

            ## Treatment File(s)
            -t ${ ' '.join( $t_file_list ) }

            ## Control File(s), or the e-value when running without a control
            #if str($control.c_select) == "Yes"
                -c ${ ' '.join( $c_file_list ) }
            #else
                #if str($control.evalue) != ''
                    --e-value $control.evalue
                #end if
            #end if

            ## Predefined or Custom Genome
            #if str($genome.g_select) == "Yes"
                --genome '${genome.builtin_genome}'
            #else
                #if str($genome.chromsizes.chr_select) == "No"
                    #if $genome.chromsizes.cs_file.is_of_type('fasta')
                        ## derive "name<TAB>length" lines from the fasta on the fly
                        --chromsizes <(awk '/^>/ {if (seqlen) print seqlen;printf substr($1,2) "\t";seqlen=0;next}
                            {seqlen+=length($0)}END{print seqlen}' '${genome.chromsizes.cs_file}')
                    #else
                        --chromsizes '${genome.chromsizes.cs_file}'
                    #end if
                #else
                    #if $genome.chromsizes.builtin_fasta.fields.path
                        --chromsizes <(awk '/^>/ {if (seqlen) print seqlen;printf substr($1,2) "\t";seqlen=0;next}
                            {seqlen+=length($0)}END{print seqlen}' '${genome.chromsizes.builtin_fasta.fields.path}')
                    #end if
                #end if
            #end if

            ## Optional numeric parameters are compared against the empty string
            ## instead of being truth-tested, so that an explicit 0 entered by
            ## the user is still forwarded to epic2.
            #if str($genome.egf) != ''
                --effective-genome-fraction $genome.egf
            #end if

            #if str($fdr) != ''
                -fdr $fdr
            #end if

        ## BAM OPTIONS
        #if str($bam_options.required_flag) != ''
            --required-flag $bam_options.required_flag
        #end if

        #if str($bam_options.filter_flag) != ''
            --filter-flag $bam_options.filter_flag
        #end if

        #if str($bam_options.mapq) != ''
            --mapq $bam_options.mapq
        #end if

        #if $bam_options.autodetect_chroms
            --autodetect-chroms
        #end if

        #if $bam_options.discard_chroms
            --discard-chromosomes-pattern '$bam_options.discard_chroms'
        #end if

        ## ADVANCED OPTIONS
        #if $advanced_options.keep_dupes
            --keep-duplicates
        #end if

        #if str($advanced_options.bin_size) != ''
            --bin-size $advanced_options.bin_size
        #end if

        #if str($advanced_options.gaps_allowed) != ''
            --gaps-allowed $advanced_options.gaps_allowed
        #end if

        #if str($advanced_options.fragment_size) != ''
            --fragment-size $advanced_options.fragment_size
        #end if

        #if $advanced_options.original_algorithm
            --original-algorithm
        #end if

        #if $advanced_options.original_stats
            --original-statistics
        #end if

        ## stdout is the peak table; blank lines are dropped from stderr
        > '${peaks}'
        2> >(awk 'NF' >&2)

        ## Optional conversion of the peak table to BED 6+3. The header row is
        ## skipped, column 4 and 9 are -log10 transformed (capped at 500 when
        ## column 4 is 0) and islands are numbered in file order.
        ## NOTE(review): the column positions presumably match the table epic2
        ## writes when a control is given - confirm the layout for runs
        ## without a control.
        #if $to_bed
            &&
            awk 'NR>1{if ($4==0) {pv=500;qv=500}else{pv=-log($4)/log(10);qv=-log($9)/log(10)};
            print $1,$2,$3,"island_"NR-1,int($5),$6,$10,pv,qv}' OFS="\t" '${peaks}' > '${bed_peaks}'
        #end if

    ]]></command>
    
    <inputs>
        <!-- Treatment: a single dataset, or several datasets pooled into one run. -->
        <conditional name="treatment">
            <param name="t_multi_select" type="select" label="Are you pooling Treatment Files?">
                <option value="No" selected="True">No</option>
                <option value="Yes">Yes</option>
            </param>
            <when value="No">
                <param name="input_treatment_file" argument="-t" type="data"
                       format="bam,sam,bed" label="ChIP-Seq Treatment File" help="(-t)" />
            </when>
            <when value="Yes">
                <param name="input_treatment_file" argument="-t" type="data"
                       format="bam,sam,bed" multiple="true"
                       label="ChIP-Seq Treatment File" help="(-t)" />
            </when>
        </conditional>

        <!-- Control: optional; without a control the e-value can be set instead. -->
        <conditional name="control">
            <param name="c_select" type="select" label="Do you have a Control File?">
                <option value="Yes">Yes</option>
                <option value="No" selected="True">No</option>
            </param>
            <when value="Yes">
                <conditional name="c_multiple">
                    <param name="c_multi_select" type="select"
                           label="Are you pooling Control Files?">
                        <option value="No" selected="True">No</option>
                        <option value="Yes">Yes</option>
                    </param>
                    <when value="No">
                        <param name="input_control_file" argument="-c" type="data"
                               format="bam,sam,bed" label="ChIP-Seq Control File"
                               help="(-c)" />
                    </when>
                    <when value="Yes">
                        <param name="input_control_file" argument="-c" type="data"
                               format="bam,sam,bed" multiple="true"
                               label="ChIP-Seq Control File" help="(-c)" />
                    </when>
                </conditional>
            </when>
            <when value="No">
                <param name="evalue" argument="-e" type="integer" optional="True"
                       label="e-value" help="The E-value controls the genome-wide error
                             rate of identified islands under the random
                             background assumption. Should be used when not using
                             a control library. Default 1000. (-e)" />
            </when>
        </conditional>

        <!-- Genome: a genome shipped with epic2, or chromosome sizes supplied by
             the user (sizes file, fasta dataset, or a server-side fasta index). -->
        <conditional name="genome">
            <param name="g_select" type="select" label="Is your genome indexed?">
                <option value="Yes" selected="True">Yes</option>
                <option value="No">No</option>
            </param>
            <when value="Yes">
                <expand macro="effectiveGenomeSize" />
                <param name="egf" argument="-egf" type="float" min="0" max="1"
                       optional="True" label="Effective genome fraction"
                       help="Use a different effective genome fraction than the
                             one included in epic2, which depends on genome and
                             readlength. (-egf)" />
            </when>
            <when value="No">
                <conditional name="chromsizes">
                    <param name="chr_select" type="select" label="Use an indexed fasta file?"
                           help="Chromosome sizes will be calculated from the provided fasta file.">
                        <option value="No">No</option>
                        <option value="Yes" selected="True">Yes</option>
                    </param>
                    <when value="No">
                        <param name="cs_file" argument="--chromsizes" type="data"
                               format="fasta,txt,tabular,tsv" label="Chromosome sizes"
                               help="Provide a fasta file for automated calculation,
                                     or a tab-separated file with two columns:
                                     chromosome names and sizes. (--chromsizes)" />
                    </when>
                    <when value="Yes">
                        <param name="builtin_fasta" argument="--chromsizes" type="select"
                               optional="True" label="Genome for fasta file"
                               help="(--chromsizes)">
                            <options from_data_table="fasta_indexes">
                                <filter type="sort_by" column="2" />
                                <validator type="no_options" message="No indexes are available" />
                            </options>
                        </param>
                    </when>
                </conditional>
                <param name="egf" argument="-egf" type="float" min="0" max="1"
                       optional="True" label="Effective genome fraction"
                       help="The effective genome fraction is the proportion
                             of the genome that is mappable, excluding Ns. (-egf)" />
            </when>
        </conditional>

        <param name="fdr" argument="-fdr" type="float" min="0" max="1"
               optional="True" label="False discovery rate cutoff"
               help="Remove all islands with an FDR above cutoff. Default 0.05 (-fdr)" />

        <!-- When checked, a second BED output is produced (see the outputs filter). -->
        <param name="to_bed" type="boolean" checked="false" label="Convert output to BED format?"/>

        <section name="bam_options" title="BAM Options">
            <param name="required_flag" argument="--required-flag" type="integer"
                   optional="True" label="Required flag"
                   help="Keep reads with these bits set in flag. Same as `samtools
                         view -f`. Default 0. (--required-flag)" />
            <param name="filter_flag" argument="--filter-flag" type="integer"
                   optional="True" label="Filter flag"
                   help="Discard reads with these bits set in flag. Same as `samtools
                         view -F`. Default 1540 (hex: 0x604). (--filter-flag)" />
            <param name="mapq" argument="--mapq" type="integer"
                   optional="True" label="Mapping quality"
                   help="Discard reads with mapping quality lower than this. Default 5. (--mapq)" />
            <param name="autodetect_chroms" type="boolean" checked="false"
                   truevalue="--autodetect-chroms" falsevalue="" label="Autodetect chromosomes?"
                   help="Autodetect chromosomes from bam file. Use with the
                         --discard-chromosomes-pattern flag to avoid non-canonical
                         chromosomes. (--autodetect-chroms)" />
            <param name="discard_chroms" argument="--discard-chromosomes-pattern"
                   type="text" optional="True" label="Discard chromosomes pattern"
                   help="Discard reads from chromosomes matching
                        this pattern. Default '_'. Note that if you are not
                        interested in the results from non-canonical
                        chromosomes, you should ensure they are removed with
                        this flag, otherwise they will make the statistical
                        analysis too stringent. (--discard-chromosomes-pattern)"/>
        </section>

        <section name="advanced_options" title="Advanced Options">
            <param name="keep_dupes" type="boolean" checked="false"
                   truevalue="--keep-duplicates" falsevalue="" label="Keep duplicates?"
                   help="Keep reads mapping to the same position on the same
                         strand within a library. (--keep-duplicates)" />
            <param name="bin_size" argument="--bin-size" type="integer"
                   optional="True" label="Bin size"
                   help="Size of the windows to scan the genome. BIN-SIZE is the
                         smallest possible island. Default 200. (--bin-size)" />
            <param name="gaps_allowed" argument="--gaps-allowed" type="integer"
                   optional="True" label="Gaps allowed"
                   help="This number is multiplied by the window size to determine
                         the number of gaps (ineligible windows) allowed
                         between two eligible windows. Default 3. (--gaps-allowed)"/>
            <param name="fragment_size" argument="--fragment-size" type="integer"
                   optional="True" label="Fragment size"
                   help="(Single end reads only) Size of the sequenced fragment.
                         Each read is extended half the fragment size from the 5' end.
                         Default 150 (i.e. extend by 75). (--fragment-size)" />
            <param name="original_algorithm" type="boolean" checked="false"
                   truevalue="--original-algorithm" falsevalue=""
                   label="Compute p-values with SICER original algorithm?"
                   help="Use the original SICER algorithm, without the epic2 fix.
                        This will use all reads in your files to compute
                        the p-values, including those falling outside the
                        genome boundaries. (--original-algorithm)" />
            <!-- Label differs from original_algorithm so the two checkboxes can
                 be told apart in the form. -->
            <param name="original_stats" type="boolean" checked="false"
                   truevalue="--original-statistics" falsevalue=""
                   label="Compute statistics the original SICER way?"
                   help="Use the original SICER way of computing the
                        statistics. Like SICER itself, this method raises an
                        error on large datasets. Only included for debugging
                        purposes. (--original-statistics)" />
        </section>
    </inputs>

    <outputs>
        <!-- Peak table written from the stdout of epic2; always produced. -->
        <data format="tabular" name="peaks" label="${tool.name} on ${on_string}"/>
        <!-- BED conversion of the peak table; only created when to_bed is checked. -->
        <data format='bed' name='bed_peaks' label="${tool.name} on ${on_string}: BED">
            <filter>to_bed</filter>
        </data>

    </outputs>

    <tests>
        <!-- Test 1: BAM treatment and BAM control, all other parameters left at their defaults. -->
        <test>
            <param name="input_treatment_file" value="test.bam" ftype="bam" />
            <param name="c_select" value="Yes" />
            <param name="input_control_file" value="control.bam" ftype="bam"/>
            <output name="peaks" file="epic2_results.txt"/>
        </test>
        <!-- Test 2: same setup with BED inputs (files uploaded with ftype bed). -->
        <test>
            <param name="input_treatment_file" value="test.bed.gz" ftype="bed" />
            <param name="c_select" value="Yes" />
            <param name="input_control_file" value="control.bed.gz" ftype="bed"/>
            <output name="peaks" file="epic2_results1.txt"/>
        </test>
        <!-- Test 3: custom genome given as a chromosome sizes text file, explicit
             effective genome fraction, original SICER algorithm enabled. -->
        <test>
            <param name="input_treatment_file" value="test_ChIP.bam" ftype="bam" />
            <param name="c_select" value="Yes" />
            <param name="input_control_file" value="test_Input.bam" ftype="bam"/>
            <param name="g_select" value="No" />
            <param name="chr_select" value="No" />
            <param name="cs_file" value="test_chromsizes.txt" />
            <param name="egf" value="0.99" />
            <param name="original_algorithm" value="Yes" />
            <output name="peaks" file="epic2_results2.txt"/>
        </test>
        <!-- Test 4: chromosome sizes derived from a fasta dataset, BED conversion
             enabled, plus non-default mapq / bin size / gaps allowed.
             NOTE(review): gaps_allowed is 0 here; confirm that the expected
             files were generated with "gaps-allowed 0" actually present on the
             epic2 command line. -->
        <test>
            <param name="input_treatment_file" value="test_ChIP.bam" ftype="bam" />
            <param name="c_select" value="Yes" />
            <param name="input_control_file" value="test_Input.bam" ftype="bam"/>
            <param name="g_select" value="No" />
            <param name="chr_select" value="No" />
            <param name="cs_file" value="test_fasta.fasta" />
            <param name="egf" value="0.99" />
            <param name="to_bed" value="Yes" />
            <param name="mapq" value="10" />
            <param name="bin_size" value="100" />
            <param name="gaps_allowed" value="0" />
            <output name="peaks" file="epic2_results3.txt"/>
            <output name="bed_peaks" file="epic2_results3.bed"/>
        </test>
    </tests>

    <help>
epic2 is an ultraperformant reimplementation of SICER, a Chip-Seq broad peak/diffuse domain finder. 
epic2 is focused on speed, low memory overhead and ease-of-use.
Software documentation may be found on https://github.com/biocore-ntnu/epic2.

**Accepted input**

epic2 is designed to be used with ChIP-Seq data from treatment samples, though control samples are encouraged to increase specificity. Single or multiple files are allowed as input, with same or different file formats (file extension must be bed, bedpe, bam or sam). To use multiple files as input (either as treatment or control samples), group them in a collection and select to pool files, otherwise Galaxy will run them in batch mode.

epic2 works with only a treatment file specified as input. In this case, epic2 will run with default parameters, using the pre-indexed human genome hg19 and a FDR cutoff of 0.05. Several genomes are indexed and included on the installation of epic2, although custom genomes may be used. If your genome is not already indexed, two options are provided. One of them is to select a fasta file from which to calculate chromosome sizes; the second option is to provide a tab-separated list with one chromosome name and length per row. On custom genomes, the effective genome fraction may be introduced (for more information see `deepTools <https://deeptools.readthedocs.io/en/latest/content/feature/effectiveGenomeSize.html>`_).

**Broad peaks format**

The output of epic2 contains called peaks in a table that does not follow any standard format. This table may be converted to BED format, assigning a unique name to each peak (islands). This format follows the standard from ENCODE, BED 6+3, and contains the following columns:
        
        * **1.** Chrom 
        * **2.** Start 
        * **3.** End 
        * **4.** Name 
        * **5.** Score 
        * **6.** Strand 
        * **7.** log2FoldChange 
        * **8.** -log10PValue 
        * **9.** -log10FDR
        
.. class:: warningmark

        On columns 8 and 9, the max value is set to 500 when Pvalue == 0.0.

Tool adapted to Galaxy by Miriam Payá.
        
        
    </help>
    <expand macro="citations" />
</tool>