comparison chipenrich.xml @ 1:3eaa000a7bf1 draft

Uploaded
author mora-lab
date Thu, 20 May 2021 08:41:59 +0000
parents
children ddefda892a8d
comparison
equal deleted inserted replaced
0:63ec097240bf 1:3eaa000a7bf1
1 <tool id="Chipenrich" name="ChIP-Enrich" version="0.1.0" python_template_version="3.5">
<!-- One-line tool description, followed by the packages this wrapper declares as requirements:
     r-getopt 1.20.3, and the chipenrich / chipenrich.data Bioconductor packages, both pinned to 2.14.0. -->
2 <description>Gene set enrichment for ChIP-Seq peak data</description>
3 <requirements>
4 <requirement type="package" version="1.20.3">r-getopt</requirement>
5 <requirement type="package" version="2.14.0">bioconductor-chipenrich</requirement>
6 <requirement type="package" version="2.14.0">bioconductor-chipenrich.data</requirement>
7 </requirements>
<!-- Command template: runs chipenrich.R from the tool directory with Rscript and passes every input
     parameter plus the three output paths as single-quoted named options.
     NOTE(review): the option name input_num_peak_threshould is misspelled, but it presumably has to match
     the option declared in chipenrich.R (not visible here), so rename it only together with the script. -->
8 <command detect_errors="exit_code"><![CDATA[
9 Rscript '$__tool_directory__/chipenrich.R'
10 --input_peaks '$peaks'
11 --input_genome '$geneset_option.genome'
12 --input_geneset '$geneset_option.genesets'
13 --input_locusdef '$peaks_option.locusdefs'
14 --input_method '$method'
15 --input_minSize '$adv.minSize'
16 --input_maxSize '$adv.maxSize'
17 --input_randomization '$adv.randomization'
18 --input_num_peak_threshould '$threshold'
19 --output_peaks '$output_peaks'
20 --output_enrich_result '$enrich_result'
21 --output_peaks_per_gene '$peaks_per_gene'
22
23 ]]></command>
24 <inputs>
<!-- Input dataset with the peaks to test; declared as CSV. -->
25 <param name="peaks" type="data" format="csv" label="Peaks" help="A CSV file whose three first columns correspond to 'chr', 'start' and 'end'. " />
26 <conditional name="geneset_option">
<!-- Genome build selector; hg19 is the default. The selected value drives which gene-set list
     is offered by the matching <when> block of the enclosing conditional. Option values are unchanged;
     only the zebrafish display label was corrected (it previously read "D.Zebrafish"). -->
27 <param name="genome" type="select" label="Genome" help="" >
28 <option value="hg19" selected="true">Human(hg19)</option>
29 <option value="hg38">Human(hg38)</option>
30 <option value="mm10">Mouse(mm10)</option>
31 <option value="mm9">Mouse(mm9)</option>
32 <option value="rn4">Rat(rn4)</option>
33 <option value="rn5">Rat(rn5)</option>
34 <option value="rn6">Rat(rn6)</option>
35 <option value="dm3">D.melanogaster(dm3)</option>
36 <option value="dm6">D.melanogaster(dm6)</option>
37 <option value="danRer10">Zebrafish(danRer10)</option>
38 </param>
<!-- Gene sets selectable when the genome is hg19 (multi-select checkboxes).
     The three GO ontologies are selected by default. -->
39 <when value="hg19">
40 <param name="genesets" type="select" label="GeneSets" help="Select gene sets to test." multiple="true" display="checkboxes">
41 <option value="GOBP" selected="true">GO Biological Process</option>
42 <option value="GOCC" selected="true">GO Cellular Component</option>
43 <option value="GOMF" selected="true">GO Molecular Function</option>
44 <option value="biocarta_pathway">Biocarta Pathways</option>
45 <option value="kegg_pathway">KEGG Pathways</option>
46 <option value="panther_pathway">PANTHER Pathways</option>
47 <option value="pfam">PFAM</option>
48 <option value="reactome">Reactome</option>
49 <option value="mesh">MeSH</option>
50 <option value="hallmark">Hallmark gene sets</option>
51 <option value="immunologic">Immunologic signature gene sets</option>
52 <option value="oncogenic">Oncogenic signature gene sets</option>
53 <option value="ctd">Comparative Toxicogenomics Database (CTD)</option>
54 <option value="drug_bank">DrugBank</option>
55 <option value="microrna">MicroRNA</option>
56 <option value="transcription_factors">Transcription Factors</option>
57 <option value="protein_interaction_biogrid">Protein Interactions (BioGRID)</option>
58 <option value="metabolite">Metabolites</option>
59 <option value="cytoband">Cytobands</option>
60 </param>
61 </when>
<!-- Gene sets selectable when the genome is hg38 (multi-select checkboxes); same list as hg19.
     The three GO ontologies are selected by default. Help text typo ("Selcet") corrected. -->
62 <when value="hg38">
63 <param name="genesets" type="select" label="GeneSets" help="Select gene sets to test." multiple="true" display="checkboxes">
64 <option value="GOBP" selected="true">GO Biological Process</option>
65 <option value="GOCC" selected="true">GO Cellular Component</option>
66 <option value="GOMF" selected="true">GO Molecular Function</option>
67 <option value="biocarta_pathway">Biocarta Pathways</option>
68 <option value="kegg_pathway">KEGG Pathways</option>
69 <option value="panther_pathway">PANTHER Pathways</option>
70 <option value="pfam">PFAM</option>
71 <option value="reactome">Reactome</option>
72 <option value="mesh">MeSH</option>
73 <option value="hallmark">Hallmark gene sets</option>
74 <option value="immunologic">Immunologic signature gene sets</option>
75 <option value="oncogenic">Oncogenic signature gene sets</option>
76 <option value="ctd">Comparative Toxicogenomics Database (CTD)</option>
77 <option value="drug_bank">DrugBank</option>
78 <option value="microrna">MicroRNA</option>
79 <option value="transcription_factors">Transcription Factors</option>
80 <option value="protein_interaction_biogrid">Protein Interactions (BioGRID)</option>
81 <option value="metabolite">Metabolites</option>
82 <option value="cytoband">Cytobands</option>
83 </param>
84 </when>
<!-- Gene sets selectable when the genome is mm10. Compared with the human lists, this one has no
     hallmark, immunologic, oncogenic or cytoband entries. GO ontologies are selected by default. -->
85 <when value="mm10">
86 <param name="genesets" type="select" label="GeneSets" help="Select gene sets to test." multiple="true" display="checkboxes">
87 <option value="GOBP" selected="true">GO Biological Process</option>
88 <option value="GOCC" selected="true">GO Cellular Component</option>
89 <option value="GOMF" selected="true">GO Molecular Function</option>
90 <option value="biocarta_pathway">Biocarta Pathways</option>
91 <option value="kegg_pathway">KEGG Pathways</option>
92 <option value="panther_pathway">PANTHER Pathways</option>
93 <option value="pfam">PFAM</option>
94 <option value="reactome">Reactome</option>
95 <option value="mesh">MeSH</option>
96 <option value="ctd">Comparative Toxicogenomics Database (CTD)</option>
97 <option value="drug_bank">DrugBank</option>
98 <option value="microrna">MicroRNA</option>
99 <option value="transcription_factors">Transcription Factors</option>
100 <option value="protein_interaction_biogrid">Protein Interactions (BioGRID)</option>
101 <option value="metabolite">Metabolites</option>
102 </param>
103 </when>
<!-- Gene sets selectable when the genome is mm9; same list as mm10.
     GO ontologies are selected by default. -->
104 <when value="mm9">
105 <param name="genesets" type="select" label="GeneSets" help="Select gene sets to test." multiple="true" display="checkboxes">
106 <option value="GOBP" selected="true">GO Biological Process</option>
107 <option value="GOCC" selected="true">GO Cellular Component</option>
108 <option value="GOMF" selected="true">GO Molecular Function</option>
109 <option value="biocarta_pathway">Biocarta Pathways</option>
110 <option value="kegg_pathway">KEGG Pathways</option>
111 <option value="panther_pathway">PANTHER Pathways</option>
112 <option value="pfam">PFAM</option>
113 <option value="reactome">Reactome</option>
114 <option value="mesh">MeSH</option>
115 <option value="ctd">Comparative Toxicogenomics Database (CTD)</option>
116 <option value="drug_bank">DrugBank</option>
117 <option value="microrna">MicroRNA</option>
118 <option value="transcription_factors">Transcription Factors</option>
119 <option value="protein_interaction_biogrid">Protein Interactions (BioGRID)</option>
120 <option value="metabolite">Metabolites</option>
121 </param>
122 </when>
<!-- Gene sets selectable when the genome is rn4. The rat lists carry no BioGRID protein-interaction
     entry (nor the human-only sets). GO ontologies are selected by default. -->
123 <when value="rn4">
124 <param name="genesets" type="select" label="GeneSets" help="Select gene sets to test." multiple="true" display="checkboxes">
125 <option value="GOBP" selected="true">GO Biological Process</option>
126 <option value="GOCC" selected="true">GO Cellular Component</option>
127 <option value="GOMF" selected="true">GO Molecular Function</option>
128 <option value="biocarta_pathway">Biocarta Pathways</option>
129 <option value="kegg_pathway">KEGG Pathways</option>
130 <option value="panther_pathway">PANTHER Pathways</option>
131 <option value="pfam">PFAM</option>
132 <option value="reactome">Reactome</option>
133 <option value="mesh">MeSH</option>
134 <option value="drug_bank">DrugBank</option>
135 <option value="microrna">MicroRNA</option>
136 <option value="transcription_factors">Transcription Factors</option>
137 <option value="ctd">Comparative Toxicogenomics Database (CTD)</option>
138 <option value="metabolite">Metabolites</option>
139 </param>
140 </when>
<!-- Gene sets selectable when the genome is rn5; same list as rn4.
     GO ontologies are selected by default. -->
141 <when value="rn5">
142 <param name="genesets" type="select" label="GeneSets" help="Select gene sets to test." multiple="true" display="checkboxes">
143 <option value="GOBP" selected="true">GO Biological Process</option>
144 <option value="GOCC" selected="true">GO Cellular Component</option>
145 <option value="GOMF" selected="true">GO Molecular Function</option>
146 <option value="biocarta_pathway">Biocarta Pathways</option>
147 <option value="kegg_pathway">KEGG Pathways</option>
148 <option value="panther_pathway">PANTHER Pathways</option>
149 <option value="pfam">PFAM</option>
150 <option value="reactome">Reactome</option>
151 <option value="mesh">MeSH</option>
152 <option value="drug_bank">DrugBank</option>
153 <option value="microrna">MicroRNA</option>
154 <option value="transcription_factors">Transcription Factors</option>
155 <option value="ctd">Comparative Toxicogenomics Database (CTD)</option>
156 <option value="metabolite">Metabolites</option>
157 </param>
158 </when>
<!-- Gene sets selectable when the genome is rn6; same list as rn4 and rn5.
     GO ontologies are selected by default. -->
159 <when value="rn6">
160 <param name="genesets" type="select" label="GeneSets" help="Select gene sets to test." multiple="true" display="checkboxes">
161 <option value="GOBP" selected="true">GO Biological Process</option>
162 <option value="GOCC" selected="true">GO Cellular Component</option>
163 <option value="GOMF" selected="true">GO Molecular Function</option>
164 <option value="biocarta_pathway">Biocarta Pathways</option>
165 <option value="kegg_pathway">KEGG Pathways</option>
166 <option value="panther_pathway">PANTHER Pathways</option>
167 <option value="pfam">PFAM</option>
168 <option value="reactome">Reactome</option>
169 <option value="mesh">MeSH</option>
170 <option value="drug_bank">DrugBank</option>
171 <option value="microrna">MicroRNA</option>
172 <option value="transcription_factors">Transcription Factors</option>
173 <option value="ctd">Comparative Toxicogenomics Database (CTD)</option>
174 <option value="metabolite">Metabolites</option>
175 </param>
176 </when>
<!-- Gene sets selectable when the genome is dm3: the three GO ontologies (selected by default)
     plus Reactome. -->
177 <when value="dm3">
178 <param name="genesets" type="select" label="GeneSets" help="Select gene sets to test." multiple="true" display="checkboxes">
179 <option value="GOBP" selected="true">GO Biological Process</option>
180 <option value="GOCC" selected="true">GO Cellular Component</option>
181 <option value="GOMF" selected="true">GO Molecular Function</option>
182 <option value="reactome">Reactome</option>
183 </param>
184 </when>
<!-- Gene sets selectable when the genome is dm6: the three GO ontologies (selected by default)
     plus Reactome. -->
185 <when value="dm6">
186 <param name="genesets" type="select" label="GeneSets" help="Select gene sets to test." multiple="true" display="checkboxes">
187 <option value="GOBP" selected="true">GO Biological Process</option>
188 <option value="GOCC" selected="true">GO Cellular Component</option>
189 <option value="GOMF" selected="true">GO Molecular Function</option>
190 <option value="reactome">Reactome</option>
191 </param>
192 </when>
<!-- Gene sets selectable when the genome is danRer10: the three GO ontologies (selected by default)
     plus Reactome. -->
193 <when value="danRer10">
194 <param name="genesets" type="select" label="GeneSets" help="Select gene sets to test." multiple="true" display="checkboxes">
195 <option value="GOBP" selected="true">GO Biological Process</option>
196 <option value="GOCC" selected="true">GO Cellular Component</option>
197 <option value="GOMF" selected="true">GO Molecular Function</option>
198 <option value="reactome">Reactome</option>
199 </param>
200 </when>
201 </conditional>
202
<!-- Enrichment method selector, rendered as radio buttons; chipenrich is the default.
     The four methods are described in the help section of this tool. -->
203 <param name="method" type="select" label="Method" display="radio" help="See details in the help section." >
204 <option value="chipenrich" selected="true">Chip-Enrich</option>
205 <option value="polyenrich" >Poly-Enrich</option>
206 <option value="hybridenrich">Hybrid-Enrich</option>
207 <option value="broadenrich">Broad-Enrich</option>
208 </param>
209
210 <conditional name="peaks_option">
<!-- Picks the family of locus definitions; each value has a matching <when> block in the enclosing
     conditional that supplies the concrete "locusdefs" choices. No option is explicitly marked selected. -->
211 <param name="peaks_type" type="select" label="Which Peaks to use" display="radio" help="" >
212 <option value="promoter">Promoter regulation choices</option>
213 <option value="genedistal">Gene distal regulation choices</option>
214 <option value="regulation">Regulation from across the whole genome</option>
215 <option value="other">Other</option>
216 </param>
<!-- Promoter-proximal locus definitions: peaks within 1kb, 5kb or 10kb of a transcription start site.
     The chosen value is what the command template passes as the locus definition. -->
217 <when value="promoter">
218 <param name="locusdefs" type="select" label="Promoter regulation choices" display="radio" help="" >
219 <option value="1kb"> &lt; 1kb (only use peaks within 1kb of a transcription start site)</option>
220 <option value="5kb"> &lt; 5kb (only use peaks within 5kb of a transcription start site)</option>
221 <option value="10kb"> &lt; 10kb (only use peaks within 10kb of a transcription start site)</option>
222 </param>
223 </when>
<!-- Gene-distal locus definitions: peaks farther than 1kb, 5kb or 10kb from a transcription start site,
     optionally restricted to upstream peaks. Option values use chipenrich's locus-definition names
     ("<N>kb_outside" and "<N>kb_outside_upstream"); they were previously misspelled "outstie".
     NOTE(review): this assumes chipenrich.R forwards the value to chipenrich unchanged; confirm against the script. -->
224 <when value="genedistal">
225 <param name="locusdefs" type="select" label="Gene distal regulation choices" display="radio" help="" >
226 <option value="1kb_outside"> &gt; 1kb (only use peaks greater than 1kb of a transcription start site)</option>
227 <option value="5kb_outside"> &gt; 5kb (only use peaks greater than 5kb of a transcription start site)</option>
228 <option value="10kb_outside"> &gt; 10kb (only use peaks greater than 10kb of a transcription start site)</option>
229 <option value="1kb_outside_upstream"> &gt; 1kb upstream (only use peaks greater than 1kb upstream of a transcription start site)</option>
230 <option value="5kb_outside_upstream"> &gt; 5kb upstream (only use peaks greater than 5kb upstream of a transcription start site)</option>
231 <option value="10kb_outside_upstream"> &gt; 10kb upstream (only use peaks greater than 10kb upstream of a transcription start site)</option>
232 </param>
233 </when>
<!-- Genome-wide locus definitions: every peak is used and assigned either to the nearest gene
     or to the gene with the nearest TSS. -->
234 <when value="regulation">
235 <param name="locusdefs" type="select" label="Regulation from across the whole genome" display="radio" help="" >
236 <option value="nearest_gene">Nearest Gene (use all peaks; assign peaks to the nearest gene defined by transcription start and end sites)</option>
237 <option value="nearest_tss">Nearest TSS (use all peaks; assign peaks to the gene with the closest TSS)</option>
238 </param>
239 </when>
<!-- Remaining locus definitions: restrict the analysis to peaks inside annotated exons or introns.
     Label typo ("itron") corrected; option values are unchanged. -->
240 <when value="other">
241 <param name="locusdefs" type="select" label="Other" display="radio" help="" >
242 <option value="exon">Exon (only use peaks that fall within an annotated exon)</option>
243 <option value="intron">Intron (only use peaks that fall within an annotated intron)</option>
244 </param>
245 </when>
246 </conditional>
247
<!-- Minimum number of peaks per gene (integer, default 1, lower bound 1); the command template
     passes it as the peak-threshold option. -->
248 <param name="threshold" type="integer" value="1" min="1" label="Peak Threshold Number" help="Number of peaks a gene must have assigned to it before getting coded as 1 (having a peak) in the test. Typically, this should be set to 1." />
249
<!-- Advanced options: gene set size bounds (defaults 15 and 2000, each with a lower bound of 1) and an
     optional randomization of the locus definitions. The literal value "NULL" is the
     "No randomizations" choice and is listed first. -->
250 <section name="adv" title="Advanced options">
251 <param name="minSize" type="integer" value="15" min="1" label="Minimum gene set size" help="" />
252 <param name="maxSize" type="integer" value="2000" min="1" label="Maximum gene set size" help="" />
253 <param name="randomization" type="select" label="Randomization" help="See details in the help section.">
254 <option value="NULL">No randomizations</option>
255 <option value="complete">complete</option>
256 <option value="bylength">by length</option>
257 <option value="bylocation">by location</option>
258 </param>
259 </section>
260
261 </inputs>
262
<!-- Three CSV outputs whose names match the output_* options of the command template:
     peak-to-gene assignments, enrichment results, and peak counts per gene (see the help section). -->
263 <outputs>
264 <data name="output_peaks" format="csv" label="peaks_result" />
265 <data name="enrich_result" format="csv" label="enrich_result" />
266 <data name="peaks_per_gene" format="csv" label="peaks_per_gene" />
267 </outputs>
268
<!-- Single test case: peaks.csv on hg19 with the three GO gene sets, the chipenrich method, the 1kb
     promoter locus definition, threshold 1 and the default advanced settings. Each of the three outputs
     is compared against a stored CSV file. -->
269 <tests>
270 <test>
271 <param name="peaks" value="peaks.csv" ftype="csv" />
272 <conditional name="geneset_option">
273 <param name="genome" value="hg19" />
274 <param name="genesets" value="GOBP,GOCC,GOMF" />
275 </conditional>
276 <param name="method" value="chipenrich" />
277 <conditional name="peaks_option">
278 <param name="peaks_type" value="promoter" />
279 <param name="locusdefs" value="1kb" />
280 </conditional>
281 <param name="threshold" value="1" />
282 <section name="adv">
283 <param name="minSize" value="15" />
284 <param name="maxSize" value="2000" />
285 <param name="randomization" value="NULL" />
286 </section>
287 <output name="output_peaks" file="output_peaks.csv" ftype="csv" />
288 <output name="enrich_result" file="enrich_result.csv" ftype="csv" />
289 <output name="peaks_per_gene" file="peaks_per_gene.csv" ftype="csv" />
290 </test>
291 </tests>
292
293 <help><![CDATA[
294
295 .. class:: infomark
296
297 **What it does**
298
299 Chip-Enrich includes four methods to test ChIP-seq peak data for enrichment of biological pathways, Gene Ontology terms, and
300 other types of gene sets. Using a CSV file whose first three columns correspond to 'chr', 'start' and 'end',
301 Chip-Enrich assigns peaks to genes based on a chosen "locus definition". The "locus" of a gene is the region from
302 which the gene is predicted to be regulated.
303
304 -------
305
306 =========
307 **Input**
308 =========
309
310 **Peaks**
311
312 The CSV file contains a table whose first three columns correspond to 'chr', 'start' and 'end'. For example:
313
314 ====== ========== ===========
315 chr start end
316 ====== ========== ===========
317 chr1 156186314 156186469
318 chr1 10490456 10490550
319 chr1 46713352 46713436
320 chr1 226496843 226496924
321 chr1 200589825 200589928
322 chr1 47779789 47779907
323 ====== ========== ===========
324
325 **Method**
326
327 The following guidelines are intended to help select an enrichment method:
328
329 * Chip-Enrich: is designed for use with 1,000s or 10,000s of narrow peaks which results in fewer gene loci containing a peak overall. For example, ChIP-seq experiments for transcription factors.
330
331 * Poly-Enrich: is also designed for narrow peaks, for experiments with 100,000s of peaks, or in cases where the number of binding sites per gene affects its regulation. If unsure whether to use chipenrich or polyenrich, then we recommend hybridenrich.
332
333 * Hybrid-Enrich: is a combination of chipenrich and polyenrich, to be used when one is unsure which is the optimal method.
334
335 * Broad-Enrich: is designed for use with broad peaks that may intersect multiple gene loci, and cumulatively cover greater than 5% of the genome. For example, ChIP-seq experiments for histone modifications.
336
337 **Randomizations**
338
339 Randomization of locus definitions allows for the assessment of Type I Error under the null hypothesis. The randomization codes are:
340
341 * No randomizations: the default.
342 * complete: Shuffle the `gene_id` and `symbol` columns of the `locusdef` together, without regard for the chromosome location, or locus length. The null hypothesis is that there is no true gene set enrichment.
343 * bylength: Shuffle the `gene_id` and `symbol` columns of the `locusdef` together within bins of 100 genes sorted by locus length. The null hypothesis is that there is no true gene set enrichment, but with preserved locus length relationship.
344 * bylocation: Shuffle the `gene_id` and `symbol` columns of the `locusdef` together within bins of 50 genes sorted by genomic location. The null hypothesis is that there is no true gene set enrichment, but with preserved genomic location.
345
346 The return value with a selected randomization is the same list as without. To assess the Type I error, the alpha level for the particular data set can be calculated by dividing the total number of gene sets with p-value < alpha by the total number of tests. Users may want to perform multiple randomizations for a set of peaks and take the median of the alpha values.
347
348 ==========
349 **Output**
350 ==========
351
352 **Peaks**
353
354 A CSV file containing peak assignments to genes. Peaks which do not overlap a gene locus are not included. Each peak that was assigned to a gene is listed, along with the peak midpoint or peak interval coordinates (depending on which was used), the gene to which the peak was assigned, the locus start and end position of the gene, and the distance from the peak to the TSS.
355
356 **Peaks per gene**
357
358 A CSV file containing the count of peaks per gene.
359
360 **Enrichment results**
361
362 A CSV file containing the results from performing the gene set enrichment test on each gene set that was considered.
363
364 ]]></help>
<!-- Publications to cite for this tool, each given as a DOI. -->
365 <citations>
366 <citation type="doi">10.1093/nar/gku463</citation>
367 <citation type="doi">10.1093/bioinformatics/btu444</citation>
368 </citations>
369 </tool>