9
|
1 <tool id="epic2" name="epic2" version="@VERSION@">
|
8
|
2 <description>peak calling of broad ChIP-Seq marks</description>
|
|
3 <macros>
|
|
4 <import>macros.xml</import>
|
|
5 </macros>
|
|
6 <expand macro="requirements" />
|
|
7
|
|
<stdio>
    <!-- Exit statuses 1 to 125, and 130 upwards, are reported as fatal job errors. -->
    <exit_code level="fatal" range="1:125" description="Unknown error occurred" />
    <exit_code level="fatal" range="130:" description="Unknown error occurred" />
    <!-- A "not found" message for epic2 on stderr is reported as a broken installation. -->
    <regex source="stderr" level="fatal"
           match="epic2: (command ){0,1}not found"
           description="The epic2 python package is not properly installed, contact Galaxy administrators" />
</stdio>
|
|
13
|
|
14 <command><![CDATA[
|
|
15
|
|
## Stage the treatment dataset(s) in the working directory under names whose
## extension (bed, bam or sam) matches the datatype. The staged names are
## collected in $t_file_list, which the epic2 call passes to -t.
#set $t_file_list = list()
#if str($treatment.t_multi_select) == "No"
    ## Wrap the single dataset so both modes share the staging loop below.
    #set $t_inputs = [$treatment.input_treatment_file]
#else
    #set $t_inputs = $treatment.input_treatment_file
#end if
#for $i, $f in enumerate($t_inputs)
    #if $f.is_of_type('bed')
        #set $t_ext = 'bed'
    #elif $f.is_of_type('bam')
        #set $t_ext = 'bam'
    #elif $f.is_of_type('sam')
        #set $t_ext = 'sam'
    #end if
    ## Pooled files get a numeric suffix; a single file keeps the plain name.
    #if str($treatment.t_multi_select) == "No"
        #set $t_file = 'ChIP_file.' + $t_ext
    #else
        #set $t_file = 'ChIP_file_' + str($i) + '.' + $t_ext
    #end if
    ln -s '$f' '$t_file' &&
    #if $t_ext == 'bam'
        ## BAM inputs also get their index linked next to them as <name>.bai.
        ln -s '$f.metadata.bam_index' '${t_file}.bai' &&
    #end if
    ## silent: append() returns None, which must not be rendered into the command.
    #silent $t_file_list.append($t_file)
#end for
|
|
48
|
|
## Stage the control dataset(s), when a control is supplied, under names whose
## extension (bed, bam or sam) matches the datatype. The staged names are
## collected in $c_file_list, which the epic2 call passes to -c.
#if str($control.c_select) == "Yes"
    #set $c_file_list = list()
    #if str($control.c_multiple.c_multi_select) == "No"
        ## Wrap the single dataset so both modes share the staging loop below.
        #set $c_inputs = [$control.c_multiple.input_control_file]
    #else
        #set $c_inputs = $control.c_multiple.input_control_file
    #end if
    #for $i, $f in enumerate($c_inputs)
        #if $f.is_of_type('bed')
            #set $c_ext = 'bed'
        #elif $f.is_of_type('bam')
            #set $c_ext = 'bam'
        #elif $f.is_of_type('sam')
            #set $c_ext = 'sam'
        #end if
        ## Pooled files get a numeric suffix; a single file keeps the plain name.
        #if str($control.c_multiple.c_multi_select) == "No"
            #set $c_file = 'control_file.' + $c_ext
        #else
            #set $c_file = 'control_file' + str($i) + '.' + $c_ext
        #end if
        ln -s '$f' '$c_file' &&
        #if $c_ext == 'bam'
            ## BAM inputs also get their index linked next to them as <name>.bai.
            ln -s '$f.metadata.bam_index' '${c_file}.bai' &&
        #end if
        ## silent: append() returns None, which must not be rendered into the command.
        #silent $c_file_list.append($c_file)
    #end for
#end if
|
|
84
|
|
epic2

## Treatment file(s) staged earlier in this template
-t ${ ' '.join( $t_file_list ) }

## Control file(s); when no control is used the optional e-value applies instead
#if str($control.c_select) == "Yes"
    -c ${ ' '.join( $c_file_list ) }
#elif str($control.evalue) != ''
    --e-value $control.evalue
#end if

## Predefined genome, or chromosome sizes from a user file / installed fasta
#if str($genome.g_select) == "Yes"
    --genome ${genome.builtin_genome}
#else
    #if str($genome.chromsizes.chr_select) == "No"
        #if $genome.chromsizes.cs_file.is_of_type('fasta')
            ## awk prints "name<TAB>length" for every fasta record
            --chromsizes <(awk '/^>/ {if (seqlen) print seqlen;printf substr($1,2) "\t";seqlen=0;next}
            {seqlen+=length($0)}END{print seqlen}' '${genome.chromsizes.cs_file}')
        #else
            --chromsizes '${genome.chromsizes.cs_file}'
        #end if
    #else
        #if $genome.chromsizes.builtin_fasta.fields.path
            --chromsizes <(awk '/^>/ {if (seqlen) print seqlen;printf substr($1,2) "\t";seqlen=0;next}
            {seqlen+=length($0)}END{print seqlen}' '${genome.chromsizes.builtin_fasta.fields.path}')
        #end if
    #end if
#end if
#if $genome.egf
    --effective-genome-fraction ${genome.egf}
#end if

## Optional numeric inputs for which 0 is a meaningful value are compared
## against the empty string, so an explicit 0 is passed on instead of being
## dropped by a truthiness test.
## NOTE(review): expected test outputs recorded while zero values were being
## dropped (e.g. the test that sets gaps_allowed to 0) may need regenerating.
#if str($fdr) != ''
    -fdr $fdr
#end if

## BAM OPTIONS
#if str($bam_options.required_flag) != ''
    --required-flag $bam_options.required_flag
#end if

#if str($bam_options.filter_flag) != ''
    --filter-flag $bam_options.filter_flag
#end if

#if str($bam_options.mapq) != ''
    --mapq $bam_options.mapq
#end if

#if $bam_options.autodetect_chroms
    --autodetect-chroms
#end if

#if $bam_options.discard_chroms
    ## Quoted: this is free text and may contain shell glob characters.
    --discard-chromosomes-pattern '$bam_options.discard_chroms'
#end if

## ADVANCED OPTIONS
#if $advanced_options.keep_dupes
    --keep-duplicates
#end if

#if $advanced_options.bin_size
    --bin-size $advanced_options.bin_size
#end if

#if str($advanced_options.gaps_allowed) != ''
    --gaps-allowed $advanced_options.gaps_allowed
#end if

#if $advanced_options.fragment_size
    --fragment-size $advanced_options.fragment_size
#end if

#if $advanced_options.original_algorithm
    --original-algorithm
#end if

#if $advanced_options.original_stats
    --original-statistics
#end if

## Peaks table goes to the main output; blank lines are stripped from stderr.
> '${peaks}'
2> >(awk 'NF' >&2)

#if $to_bed
    ## Convert the table to BED 6+3; the -log10 columns are set to 500 when the p-value is 0.
    &&
    awk 'NR>1{if ($4==0) {pv=500;qv=500}else{pv=-log($4)/log(10);qv=-log($9)/log(10)};
    print $1,$2,$3,"island_"NR-1,int($5),$6,$10,pv,qv}' OFS="\t" '${peaks}' > '${bed_peaks}'
#end if
|
|
175
|
|
176 ]]></command>
|
|
177
|
|
178 <inputs>
|
|
<conditional name="treatment">
    <!-- Switches between one treatment dataset and several pooled datasets. -->
    <param name="t_multi_select" type="select" help="" label="Are you pooling Treatment Files?">
        <option value="No" selected="True">No</option>
        <option value="Yes">Yes</option>
    </param>
    <when value="No">
        <param name="input_treatment_file" type="data" argument="-t" format="bam,sam,bed"
               label="ChIP-Seq Treatment File" help="(-t)" />
    </when>
    <when value="Yes">
        <param name="input_treatment_file" type="data" argument="-t" format="bam,sam,bed"
               multiple="true" label="ChIP-Seq Treatment File" help="(-t)" />
    </when>
</conditional>
|
|
194
|
|
<conditional name="control">
    <!-- With a control: one or several pooled datasets. Without a control: an optional e-value. -->
    <param name="c_select" type="select" label="Do you have a Control File?">
        <option value="Yes">Yes</option>
        <option value="No" selected="True">No</option>
    </param>
    <when value="Yes">
        <conditional name="c_multiple">
            <param name="c_multi_select" type="select" help=""
                   label="Are you pooling Control Files?">
                <option value="No" selected="True">No</option>
                <option value="Yes">Yes</option>
            </param>
            <when value="No">
                <param name="input_control_file" type="data" argument="-c" format="bam,sam,bed"
                       label="ChIP-Seq Control File" help="(-c)" />
            </when>
            <when value="Yes">
                <param name="input_control_file" type="data" argument="-c" format="bam,sam,bed"
                       multiple="true" label="ChIP-Seq Control File" help="(-c)" />
            </when>
        </conditional>
    </when>
    <when value="No">
        <param name="evalue" type="integer" argument="-e" optional="True" label="e-value"
               help="The E-value controls the genome-wide error
                     rate of identified islands under the random
                     background assumption. Should be used when not using
                     a control library. Default 1000." />
    </when>
</conditional>
|
|
227
|
|
<conditional name="genome">
    <!-- Either a genome offered by the effectiveGenomeSize macro, or chromosome sizes
         taken from a user dataset or from an installed fasta index. -->
    <param name="g_select" type="select" label="Is your genome indexed?">
        <option value="Yes" selected="True">Yes</option>
        <option value="No">No</option>
    </param>
    <when value="Yes">
        <expand macro="effectiveGenomeSize" />
        <param name="egf" type="float" argument="-egf" min="0" max="1" optional="True"
               label="Effective genome fraction"
               help="Use a different effective genome fraction than the
                     one included in epic2, which depends on genome and
                     readlength. (-egf)" />
    </when>
    <when value="No">
        <conditional name="chromsizes">
            <param name="chr_select" type="select" label="Use an indexed fasta file?"
                   help="Chromosome sizes will be calculated from the provided fasta file.">
                <option value="No">No</option>
                <option value="Yes" selected="True">Yes</option>
            </param>
            <when value="No">
                <param name="cs_file" type="data" argument="--chromsizes"
                       format="fasta,txt,tabular,tsv" label="Chromosome sizes"
                       help="Provide a fasta file for automated calculation,
                             or a tab-separated file with two columns:
                             chromosome names and sizes. (--chromsizes)" />
            </when>
            <when value="Yes">
                <param name="builtin_fasta" type="select" argument="--chromsizes" optional="True"
                       label="Genome for fasta file" help="(--chromsizes)">
                    <options from_data_table="fasta_indexes">
                        <filter type="sort_by" column="2" />
                        <validator type="no_options" message="No indexes are available" />
                    </options>
                </param>
            </when>
        </conditional>
        <param name="egf" type="float" argument="-egf" min="0" max="1" optional="True"
               label="Effective genome fraction"
               help="The effective genome fraction is the proportion
                     of the genome that is mappable, excluding Ns. (-egf)" />
    </when>
</conditional>
|
|
272
|
|
<param name="fdr" type="float" argument="-fdr" min="0" max="1" optional="True"
       label="False discovery rate cutoff"
       help="Remove all islands with an FDR above cutoff. Default 0.05 (-fdr)" />

<!-- When checked, the command adds a BED conversion of the peaks table. -->
<param name="to_bed" type="boolean" checked="false" label="Convert output to BED format?" />
|
|
278
|
|
<section name="bam_options" title="BAM Options">
    <!-- Read-level filters and chromosome selection; every field is optional. -->
    <param name="required_flag" argument="--required-flag" type="integer"
           optional="True" label="Required flag"
           help="Keep reads with these bits set in flag. Same as `samtools
                 view -f`. Default 0. (--required-flag)" />
    <param name="filter_flag" argument="--filter-flag" type="integer"
           optional="True" label="Filter flag"
           help="Discard reads with these bits set in flag. Same as `samtools
                 view -F`. Default 1540 (hex: 0x604). (--filter-flag)" />
    <param name="mapq" argument="--mapq" type="integer"
           optional="True" label="Mapping quality"
           help="Discard reads with mapping quality lower than this. Default 5. (--mapq)" />
    <!-- The help below names the pattern option as it is declared on discard_chroms. -->
    <param name="autodetect_chroms" type="boolean" checked="false"
           truevalue="--autodetect-chroms" falsevalue="" label="Autodetect chromosomes?"
           help="Autodetect chromosomes from bam file. Use with the
                 --discard-chromosomes-pattern flag to avoid non-canonical
                 chromosomes. (--autodetect-chroms)" />
    <param name="discard_chroms" argument="--discard-chromosomes-pattern"
           type="text" optional="True" label="Discard chromosomes pattern"
           help="Discard reads from chromosomes matching
                 this pattern. Default '_'. Note that if you are not
                 interested in the results from non-canonical
                 chromosomes, you should ensure they are removed with
                 this flag, otherwise they will make the statistical
                 analysis too stringent. (--discard-chromosomes-pattern)" />
</section>
|
|
305
|
|
<section name="advanced_options" title="Advanced Options">
    <!-- Tuning of island detection plus two SICER compatibility switches. -->
    <param name="keep_dupes" type="boolean" checked="false"
           truevalue="--keep-duplicates" falsevalue="" label="Keep duplicates?"
           help="Keep reads mapping to the same position on the same
                 strand within a library. (--keep-duplicates)" />
    <param name="bin_size" argument="--bin-size" type="integer"
           optional="True" label="Bin size"
           help="Size of the windows to scan the genome. BIN-SIZE is the
                 smallest possible island. Default 200. (--bin-size)" />
    <param name="gaps_allowed" argument="--gaps-allowed" type="integer"
           optional="True" label="Gaps allowed"
           help="This number is multiplied by the window size to determine
                 the number of gaps (ineligible windows) allowed
                 between two eligible windows. Default 3. (--gaps-allowed)" />
    <param name="fragment_size" argument="--fragment-size" type="integer"
           optional="True" label="Fragment size"
           help="(Single end reads only) Size of the sequenced fragment.
                 Each read is extended half the fragment size from the 5' end.
                 Default 150 (i.e. extend by 75). (--fragment-size)" />
    <param name="original_algorithm" type="boolean" checked="false"
           truevalue="--original-algorithm" falsevalue=""
           label="Compute p-values with SICER original algorithm?"
           help="Use the original SICER algorithm, without the epic2 fix.
                 This will use all reads in your files to compute
                 the p-values, including those falling outside the
                 genome boundaries. (--original-algorithm)" />
    <!-- Distinct label: this switch previously repeated the label of original_algorithm,
         which made the two checkboxes indistinguishable in the form. -->
    <param name="original_stats" type="boolean" checked="false"
           truevalue="--original-statistics" falsevalue=""
           label="Compute statistics with SICER original method?"
           help="Use the original SICER way of computing the
                 statistics. Like SICER itself, this method raises an
                 error on large datasets. Only included for debugging-
                 purposes. (--original-statistics)" />
</section>
|
|
340 </inputs>
|
|
341
|
|
<outputs>
    <!-- The epic2 peaks table is always produced. -->
    <data name="peaks" format="tabular" label="${tool.name} on ${on_string}" />
    <!-- The BED conversion exists only when the to_bed input is checked. -->
    <data name="bed_peaks" format="bed" label="${tool.name} on ${on_string}: BED">
        <filter>to_bed</filter>
    </data>
</outputs>
|
|
349
|
|
<tests>
    <!-- BAM treatment and control with the default genome selection. -->
    <test>
        <param name="input_treatment_file" ftype="bam" value="test.bam" />
        <param name="c_select" value="Yes" />
        <param name="input_control_file" ftype="bam" value="control.bam" />
        <output name="peaks" file="epic2_results.txt" />
    </test>
    <!-- BED treatment and control with the default genome selection. -->
    <test>
        <param name="input_treatment_file" ftype="bed" value="test.bed.gz" />
        <param name="c_select" value="Yes" />
        <param name="input_control_file" ftype="bed" value="control.bed.gz" />
        <output name="peaks" file="epic2_results1.txt" />
    </test>
    <!-- Chromosome sizes from a text file, an effective genome fraction and the original algorithm. -->
    <test>
        <param name="input_treatment_file" ftype="bam" value="test_ChIP.bam" />
        <param name="c_select" value="Yes" />
        <param name="input_control_file" ftype="bam" value="test_Input.bam" />
        <param name="g_select" value="No" />
        <param name="chr_select" value="No" />
        <param name="cs_file" value="test_chromsizes.txt" />
        <param name="egf" value="0.99" />
        <param name="original_algorithm" value="Yes" />
        <output name="peaks" file="epic2_results2.txt" />
    </test>
    <!-- Chromosome sizes from a fasta file, BED conversion, and mapq / bin size / gaps settings. -->
    <test>
        <param name="input_treatment_file" ftype="bam" value="test_ChIP.bam" />
        <param name="c_select" value="Yes" />
        <param name="input_control_file" ftype="bam" value="test_Input.bam" />
        <param name="g_select" value="No" />
        <param name="chr_select" value="No" />
        <param name="cs_file" value="test_fasta.fasta" />
        <param name="egf" value="0.99" />
        <param name="to_bed" value="Yes" />
        <param name="mapq" value="10" />
        <param name="bin_size" value="100" />
        <param name="gaps_allowed" value="0" />
        <output name="peaks" file="epic2_results3.txt" />
        <output name="bed_peaks" file="epic2_results3.bed" />
    </test>
</tests>
|
|
390
|
|
391 <help>
|
|
epic2 is an ultraperformant reimplementation of SICER, a ChIP-Seq broad peak/diffuse domain finder.
|
|
393 epic2 is focused on speed, low memory overhead and ease-of-use.
|
|
394 Software documentation may be found on https://github.com/biocore-ntnu/epic2.
|
|
395
|
|
396 **Accepted input**
|
|
397
|
|
398 epic2 is designed to be used with ChIP-Seq data from treatment samples, though control samples are encouraged to increase specificity. Single or multiple files are allowed as input, with same or different file formats (file extension must be bed, bedpe, bam or sam). To use multiple files as input (either as treatment or control samples), group them in a collection and select to pool files, otherwise Galaxy will run them in batch mode.
|
|
399
|
|
epic2 works with only a treatment file specified as input. In this case, epic2 will run with default parameters, using the pre-indexed human genome hg19 and an FDR cutoff of 0.05. Several genomes are indexed and included with the installation of epic2, although custom genomes may be used. If your genome is not already indexed, two options are provided: select a fasta file from which to calculate chromosome sizes, or provide a tab-separated list with one chromosome name and length per row. For custom genomes, the effective genome fraction may be provided (for more information see `deepTools &lt;https://deeptools.readthedocs.io/en/latest/content/feature/effectiveGenomeSize.html&gt;`_).
|
|
401
|
|
402 **Broad peaks format**
|
|
403
|
|
404 The output of epic2 contains called peaks in a table that does not follow any standard format. This table may be converted to BED format, assigning a unique name to each peak (islands). This format follows the standard from ENCODE, BED 6+3, and contains the following columns:
|
|
405
|
|
406 * **1.** Chrom
|
|
407 * **2.** Start
|
|
408 * **3.** End
|
|
409 * **4.** Name
|
|
410 * **5.** Score
|
|
411 * **6.** Strand
|
|
412 * **7.** log2FoldChange
|
|
413 * **8.** -log10PValue
|
|
414 * **9.** -log10FDR
|
|
415
|
|
416 .. class:: warningmark
|
|
417
|
|
418 On columns 8 and 9, the max value is set to 500 when Pvalue == 0.0.
|
|
419
|
|
Tool adapted to Galaxy by Miriam Payá.
|
|
421
|
|
422
|
|
423 </help>
|
|
424 <expand macro="citations" />
|
|
425 </tool>
|