diff epic2 0.41/epic2_wrapper.xml @ 8:5f3952470864 draft

Uploaded 0.0.41
author mpaya
date Mon, 03 Feb 2020 11:43:12 -0500
parents
children ea03253198b4
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/epic2 0.41/epic2_wrapper.xml	Mon Feb 03 11:43:12 2020 -0500
@@ -0,0 +1,425 @@
+<tool id="epic2" name="epic2" version="@VERSION@.0">
+    <description>peak calling of broad ChIP-Seq marks</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+   <expand macro="requirements" />
+    
+    <stdio>
+        <exit_code range="1:125" level="fatal" description="Unknown error occurred" />
+        <exit_code range="130:" level="fatal" description="Unknown error occurred" />
+        <regex match="epic2: (command ){0,1}not found" source="stderr" level="fatal" description="The epic2 python package is not properly installed, contact Galaxy administrators" />
+    </stdio>
+    
+    <command><![CDATA[
+
+        ##set up treatment files, extension must be bed, bedpe, bam, sam
+        #set $t_file_list = list()
+        #if str($treatment.t_multi_select) == "No":
+            #if $treatment.input_treatment_file.is_of_type('bed')
+                #set $t_file = 'ChIP_file.bed'
+                ln -s '$treatment.input_treatment_file' $t_file &&
+            #elif $treatment.input_treatment_file.is_of_type('bam')
+                #set $t_file = 'ChIP_file.bam'
+                ln -s '$treatment.input_treatment_file' $t_file &&
+                ln -s '$treatment.input_treatment_file.metadata.bam_index' ${t_file}.bai &&
+            #elif $treatment.input_treatment_file.is_of_type('sam')
+                #set $t_file = 'ChIP_file.sam'
+                ln -s '$treatment.input_treatment_file' $t_file &&
+            #end if
+            $t_file_list.append($t_file)
+        #else
+            #set $inputs = $treatment.input_treatment_file
+            #for $i, $f in enumerate($inputs)
+                #if $f.is_of_type('bed')
+                    #set $t_file = ''.join(['ChIP_file_',str($i),'.bed'])
+                    ln -s '$f' $t_file &&
+                #elif $f.is_of_type('bam')
+                    #set $t_file = ''.join(['ChIP_file_',str($i),'.bam'])
+                    ln -s '$f' $t_file &&
+                    ln -s '$f.metadata.bam_index' ${t_file}.bai &&
+                #elif $f.is_of_type('sam')
+                    #set $t_file = ''.join(['ChIP_file_',str($i),'.sam'])
+                    ln -s '$f' $t_file &&
+                #end if
+                $t_file_list.append($t_file)
+            #end for
+        #end if
+
+        ##set up control files, extension must be bed, bedpe, bam, sam
+        #if str($control.c_select) == "Yes":
+            #set $c_file_list = list()
+            #if str($control.c_multiple.c_multi_select) == "No":
+                #set $f = $control.c_multiple.input_control_file
+                #if $f.is_of_type('bed')
+                    #set $c_file = 'control_file.bed'
+                    ln -s '$f' $c_file &&
+                #elif $f.is_of_type('bam')
+                    #set $c_file = 'control_file.bam'
+                    ln -s '$f' $c_file &&
+                    ln -s '$f.metadata.bam_index' ${c_file}.bai &&
+                #elif $f.is_of_type('sam')
+                    #set $c_file = 'control_file.sam'
+                    ln -s '$f' $c_file &&
+                #end if
+                $c_file_list.append($c_file)
+            #else
+                #set $inputs = $control.c_multiple.input_control_file
+                #for $i, $f in enumerate($inputs)
+                    #if $f.is_of_type('bed')
+                        #set $c_file = ''.join(['control_file',str($i),'.bed'])
+                        ln -s '$f' $c_file &&
+                    #elif $f.is_of_type('bam')
+                        #set $c_file = ''.join(['control_file',str($i),'.bam'])
+                        ln -s '$f' $c_file &&
+                        ln -s '$f.metadata.bam_index' ${c_file}.bai &&
+                    #elif $f.is_of_type('sam')
+                        #set $c_file = ''.join(['control_file',str($i),'.sam'])
+                        ln -s '$f' $c_file &&
+                    #end if
+                    $c_file_list.append($c_file)
+                #end for
+            #end if
+        #end if
+
+        epic2
+
+            ## Treatment File(s)
+            -t ${ ' '.join( $t_file_list ) }
+
+            ## Control File(s)
+            #if str($control.c_select) == "Yes":
+                -c ${ ' '.join( $c_file_list ) }
+            #end if
+
+            ## Predefined or Custom Genome
+            #if str($genome.g_select) == "Yes":
+                --genome ${genome.builtin_genome}
+            #else
+                #if str($genome.chromsizes.chr_select) == "No":
+                    #if $genome.chromsizes.cs_file.is_of_type('fasta'):
+                        --chromsizes <(awk '/^>/ {if (seqlen) print seqlen;printf substr($1,2) "\t";seqlen=0;next}
+                            {seqlen+=length($0)}END{print seqlen}' '${genome.chromsizes.cs_file}')
+                    #else
+                        --chromsizes ${genome.chromsizes.cs_file}
+                    #end if
+                #else
+                    #if $genome.chromsizes.builtin_fasta.fields.path
+                        --chromsizes <(awk '/^>/ {if (seqlen) print seqlen;printf substr($1,2) "\t";seqlen=0;next}
+                            {seqlen+=length($0)}END{print seqlen}' '${genome.chromsizes.builtin_fasta.fields.path}')
+                    #end if
+                #end if
+            #end if
+            #if $genome.egf:
+                --effective-genome-fraction ${genome.egf}
+            #end if
+
+            #if $fdr:
+                -fdr $fdr
+            #end if
+
+        ## BAM OPTIONS
+        #if $bam_options.required_flag:
+            --required-flag $bam_options.required_flag
+        #end if
+
+        #if $bam_options.filter_flag:
+            --filter-flag $bam_options.filter_flag
+        #end if
+
+        #if $bam_options.mapq:
+            --mapq $bam_options.mapq
+        #end if
+
+        #if $bam_options.autodetect_chroms:
+            --autodetect-chroms
+        #end if
+
+        #if $bam_options.discard_chroms:
+            --discard-chromosomes-pattern $bam_options.discard_chroms
+        #end if
+
+        ## ADVANCED OPTIONS
+        #if $advanced_options.keep_dupes:
+            --keep-duplicates
+        #end if
+
+        #if $advanced_options.bin_size:
+            --bin-size $advanced_options.bin_size
+        #end if
+
+        #if $advanced_options.gaps_allowed:
+            --gaps-allowed $advanced_options.gaps_allowed
+        #end if
+
+        #if $advanced_options.fragment_size:
+            --fragment-size $advanced_options.fragment_size
+        #end if
+
+        #if $advanced_options.original_algorithm:
+            --original-algorithm
+        #end if
+
+        #if $advanced_options.original_stats:
+            --original-statistics
+        #end if
+
+        > ${peaks} 
+        2> >(awk 'NF' >&2)
+
+        #if $to_bed:
+            && 
+            awk 'NR>1{if ($4==0) {pv=500;qv=500}else{pv=-log($4)/log(10);qv=-log($9)/log(10)};
+            print $1,$2,$3,"island_"NR-1,int($5),$6,$10,pv,qv}' OFS="\t" ${peaks} > ${bed_peaks}
+        #end if
+
+    ]]></command>
+    
+    <inputs>
+        <conditional name="treatment">
+            <param name="t_multi_select" type="select" label="Are you pooling Treatment Files?" help="" >
+                <option value="No" selected="True">No</option>
+                <option value="Yes">Yes</option>
+            </param>
+            <when value="No" >
+                <param name="input_treatment_file" argument="-t" type="data" 
+                       format="bam,sam,bed" label="ChIP-Seq Treatment File" help="(-t)" />
+            </when>
+            <when value="Yes">
+                <param name="input_treatment_file" argument="-t" type="data" 
+                       format="bam,sam,bed" multiple="true" 
+                       label="ChIP-Seq Treatment File" help="(-t)" />
+            </when>
+        </conditional>
+
+        <conditional name="control">
+            <param name="c_select" type="select" label="Do you have a Control File?" >
+                <option value="Yes">Yes</option>
+                <option value="No" selected="True">No</option>
+            </param>
+            <when value="Yes">
+                <conditional name="c_multiple">
+                    <param name="c_multi_select" type="select" 
+                           label="Are you pooling Control Files?" help="" >
+                        <option value="No" selected="True">No</option>
+                        <option value="Yes">Yes</option>
+                    </param>
+                    <when value="No" >
+                        <param name="input_control_file" argument="-c" type="data" 
+                               format="bam,sam,bed" label="ChIP-Seq Control File" 
+                               help="(-c)" />
+                    </when>
+                    <when value="Yes">
+                        <param name="input_control_file" argument="-c" type="data" 
+                               format="bam,sam,bed" multiple="true" 
+                               label="ChIP-Seq Control File" help="(-c)" />
+                    </when>
+                </conditional>
+            </when>
+            <when value="No">
+                <param name="evalue" argument="-e" type="integer" optional="True" 
+                       label="e-value" help="The E-value controls the genome-wide error 
+                             rate of identified islands under the random 
+                             background assumption. Should be used when not using 
+                             a control library. Default 1000." />
+            </when>
+        </conditional>
+
+        <conditional name="genome">
+            <param name="g_select" type="select" label="Is your genome indexed?" >
+                <option value="Yes" selected="True">Yes</option>
+                <option value="No">No</option>
+            </param>
+            <when value="Yes">
+                <expand macro="effectiveGenomeSize" />
+                <param name="egf" argument="-egf" type="float" min="0" max="1" 
+                       optional="True" label="Effective genome fraction" 
+                       help="Use a different effective genome fraction than the 
+                             one included in epic2, which depends on genome and
+                             readlength. (-egf)" />
+            </when>
+            <when value="No">
+                <conditional name="chromsizes">
+                    <param name="chr_select" type="select" label="Use an indexed fasta file?" 
+                           help="Chromosome sizes will be calculated from the provided fasta file." >
+                        <option value="No">No</option>
+                        <option value="Yes" selected="True">Yes</option>
+                    </param>
+                    <when value="No" >
+                        <param name="cs_file" argument="--chromsizes" type="data" 
+                               format="fasta,txt,tabular,tsv" label="Chromosome sizes" 
+                               help="Provide a fasta file for automated calculation, 
+                                     or a tab-separated file with two columns: 
+                                     chromosome names and sizes. (--chromsizes)" />
+                    </when>
+                    <when value="Yes">
+                        <param name="builtin_fasta" argument="--chromsizes" type="select" 
+                               optional="True" label="Genome for fasta file" 
+                               help="(--chromsizes)" >
+                            <options from_data_table="fasta_indexes">
+                                <filter type="sort_by" column="2" />
+                                <validator type="no_options" message="No indexes are available" />
+                            </options>
+                        </param>
+                    </when>
+                </conditional>
+                <param name="egf" argument="-egf" type="float" min="0" max="1" 
+                       optional="True" label="Effective genome fraction" 
+                       help="The effective genome fraction is the proportion 
+                             of the genome that is mappable, excluding Ns. (-egf)" />
+            </when>
+        </conditional>
+
+            <param name="fdr" argument="-fdr" type="float" min="0" max="1" 
+                   optional="True" label="False discovery rate cutoff" 
+                   help="Remove all islands with an FDR above cutoff. Default 0.05 (-fdr)" />
+
+            <param name="to_bed" type="boolean" checked="false" label="Convert output to BED format?"/>
+
+        <section name="bam_options" title="BAM Options">
+            <param name="required_flag" argument="--required-flag" type="integer" 
+                   optional="True" label="Required flag" 
+                   help="Keep reads with these bits set in flag. Same as `samtools 
+                         view -f`. Default 0. (--required-flag)" />
+            <param name="filter_flag" argument="--filter-flag" type="integer" 
+                   optional="True" label="Filter flag" 
+                   help="Discard reads with these bits set in flag. Same as `samtools 
+                         view -F`. Default 1540 (hex: 0x604). (--filter-flag)" />
+            <param name="mapq" argument="--mapq" type="integer" 
+                   optional="True" label="Mapping quality" 
+                   help="Discard reads with mapping quality lower than this. Default 5. (--mapq)" />
+            <param name="autodetect_chroms" type="boolean" checked="false" 
+                   truevalue="--autodetect-chroms" falsevalue="" label="Autodetect chromosomes?" 
+                   help="Autodetect chromosomes from bam file. Use with 
+                         --discard-chromosomes flag to avoid non-canonical 
+                         chromosomes. (--autodetect-chroms)" />
+            <param name="discard_chroms" argument="--discard-chromosomes-pattern" 
+                   type="text" optional="True" label="Discard chromosomes pattern" 
+                   help="Discard reads from chromosomes matching
+                        this pattern. Default '_'. Note that if you are not
+                        interested in the results from non-canonical
+                        chromosomes, you should ensure they are removed with
+                        this flag, otherwise they will make the statistical
+                        analysis too stringent. (--discard-chromosomes-pattern)"/>
+        </section>
+        
+        <section name="advanced_options" title="Advanced Options">
+            <param name="keep_dupes" type="boolean" checked="false" 
+                   truevalue="--keep-duplicates" falsevalue="" label="Keep duplicates?" 
+                   help="Keep reads mapping to the same position on the same 
+                         strand within a library. (--keep-duplicates)" />
+            <param name="bin_size" argument="--bin-size" type="integer" 
+                   optional="True" label="Bin size" 
+                   help="Size of the windows to scan the genome. BIN-SIZE is the 
+                         smallest possible island. Default 200. (--bin-size)" />
+            <param name="gaps_allowed" argument="--gaps-allowed" type="integer" 
+                   optional="True" label="Gaps allowed" 
+                   help="This number is multiplied by the window size to determine 
+                         the number of gaps (ineligible windows) allowed 
+                         between two eligible windows. Default 3. (--gaps-allowed)"/>
+            <param name="fragment_size" argument="--fragment-size" type="integer" 
+                   optional="True" label="Fragment size" 
+                   help="(Single end reads only) Size of the sequenced fragment. 
+                         Each read is extended half the fragment size from the 5' end. 
+                         Default 150 (i.e. extend by 75). (--fragment-size)" />
+            <param name="original_algorithm" type="boolean" checked="false" 
+                   truevalue="--original-algorithm" falsevalue="" 
+                   label="Compute p-values with SICER original algorithm?" 
+                   help="Use the original SICER algorithm, without the epic2 fix. 
+                        This will use all reads in your files to compute
+                        the p-values, including those falling outside the
+                        genome boundaries. (--original-algorithm)" />
+            <param name="original_stats" type="boolean" checked="false" 
+                   truevalue="--original-statistics" falsevalue="" 
+                   label="Compute p-values with SICER original algorithm?" 
+                   help="Use the original SICER way of computing the
+                        statistics. Like SICER itself, this method raises an
+                        error on large datasets. Only included for debugging-
+                        purposes. (--original-statistics)" />
+        </section>
+   </inputs>
+
+    <outputs>
+        <data format="tabular" name="peaks" label="${tool.name} on ${on_string}"/>
+        <data format='bed' name='bed_peaks' label="${tool.name} on ${on_string}: BED">
+            <filter>to_bed</filter>
+        </data>
+
+    </outputs>
+
+    <tests>
+        <test>
+            <param name="input_treatment_file" value="test.bam" ftype="bam" />
+            <param name="c_select" value="Yes" />
+            <param name="input_control_file" value="control.bam" ftype="bam"/>
+            <output name="peaks" file="epic2_results.txt"/>
+        </test>
+        <test>
+            <param name="input_treatment_file" value="test.bed.gz" ftype="bed" />
+            <param name="c_select" value="Yes" />
+            <param name="input_control_file" value="control.bed.gz" ftype="bed"/>
+            <output name="peaks" file="epic2_results1.txt"/>
+        </test>
+        <test>
+            <param name="input_treatment_file" value="test_ChIP.bam" ftype="bam" />
+            <param name="c_select" value="Yes" />
+            <param name="input_control_file" value="test_Input.bam" ftype="bam"/>
+            <param name="g_select" value="No" />
+            <param name="chr_select" value="No" />
+            <param name="cs_file" value="test_chromsizes.txt" />
+            <param name="egf" value="0.99" />
+            <param name="original_algorithm" value="Yes" />
+            <output name="peaks" file="epic2_results2.txt"/>
+        </test>
+        <test>
+            <param name="input_treatment_file" value="test_ChIP.bam" ftype="bam" />
+            <param name="c_select" value="Yes" />
+            <param name="input_control_file" value="test_Input.bam" ftype="bam"/>
+            <param name="g_select" value="No" />
+            <param name="chr_select" value="No" />
+            <param name="cs_file" value="test_fasta.fasta" />
+            <param name="egf" value="0.99" />
+            <param name="to_bed" value="Yes" />
+            <param name="mapq" value="10" />
+            <param name="bin_size" value="100" />
+            <param name="gaps_allowed" value="0" />
+            <output name="peaks" file="epic2_results3.txt"/>
+            <output name="bed_peaks" file="epic2_results3.bed"/>
+        </test>
+    </tests>
+
+    <help>
+epic2 is an ultraperformant reimplementation of SICER, a Chip-Seq broad peak/diffuse domain finder. 
+epic2 is focused on speed, low memory overhead and ease-of-use.
+ Software documentation may be found on https://github.com/biocore-ntnu/epic2.
+
+**Accepted input**
+
+epic2 is designed to be used with ChIP-Seq data from treatment samples, though control samples are encouraged to increase specificity. Single or multiple files are allowed as input, with same or different file formats (file extension must be bed, bedpe, bam or sam). To use multiple files as input (either as treatment or control samples), group them in a collection and select to pool files, otherwise Galaxy will run them in batch mode.
+
+epic2 works with only a treatment file specified as input. In this case, epic2 will run with default parameters, using the pre-indexed human genome hg19 and a FDR cutoff of 0.05. Several genomes are indexed and included on the installation of epic2, although custom genomes may be used. If your genome is not already indexed, two options are provided. One of them is to select a fasta file from which calculate chromosome sizes; the second option is to provide a tab-separated list with one chromosome name and length per row. On custom genomes, the effective genome fraction may be introduced (for more information see [deepTools](https://deeptools.readthedocs.io/en/latest/content/feature/effectiveGenomeSize.html)).
+
+**Broad peaks format**
+
+The output of epic2 contains called peaks in a table that does not follow any standard format. This table may be converted to BED format, assigning a unique name to each peak (islands). This format follows the standard from ENCODE, BED 6+3, and contains the following columns:
+        
+        * **1.** Chrom 
+        * **2.** Start 
+        * **3.** End 
+        * **4.** Name 
+        * **5.** Score 
+        * **6.** Strand 
+        * **7.** log2FoldChange 
+        * **8.** -log10PValue 
+        * **9.** -log10FDR
+        
+.. class:: warningmark
+
+        On columns 8 and 9, the max value is set to 500 when Pvalue == 0.0.
+
+Tool adapted to Galaxy by Miriam PayĆ”.
+        
+        
+    </help>
+    <expand macro="citations" />
+</tool>