4
|
1 <tool name="Chip-Cluster: Cluster ChIP-seq peaks and create a heatmap" id="chip-cluster">
|
|
2 <description>
|
|
3 Merge multiple ChIP-seq experiments, aligning their peaks to MTLs (Multi
|
|
4 Transcription Factor Loci(us)) and optionally incorporate expression data.
|
|
5 </description>
|
|
<!-- Runs the generated "shscript" configfile with bash. The deprecated
     interpreter attribute is dropped: Galaxy only prepended its value to the
     command line, so interpreter="command" merely invoked the shell builtin
     `command` in front of /bin/bash. -->
<command>/bin/bash $shscript</command>
|
|
<inputs>
  <!-- ChIP-seq peak inputs: file format and how peaks are grouped into MTLs. -->
  <param name="chipInputFormat" type="select" display="radio" label="ChIP Input Format">
    <option name="macs" value="MACS">MACS</option>
    <option name="bed" value="BED">BED</option>
  </param>
  <param name="mtlType" type="select" display="radio" label="Cluster by: ">
    <option name="summit" value="summit">Summit</option>
    <option name="interval" value="interval">Interval</option>
  </param>

  <!-- Numeric settings are declared as integers (previously free text) so that
       Galaxy validates them before they are substituted into the shell script. -->
  <param name="summitDistance" type="integer" min="0" value="100" label="Summit Distance (BP) - Summit only"/>
  <param name="numberBins" type="integer" min="1" value="30" label="Number of Bins"/>

  <!-- One entry per ChIP experiment; the script joins files and names with '::'. -->
  <repeat name="chip_tracks" title="MACS/BED Files">
    <param name="file" type="data" format="tabular" label="Dataset"/>
    <param name="name" type="text" label="Dataset Name"/>
  </repeat>

  <!-- Expression (RNA) options; the script only reads the RNA tracks when map_rna is "yes". -->
  <param name="map_rna" type="boolean" truevalue="yes" falsevalue="no" label="Incorporate RNA?"/>
  <param name="includeTargetless" checked="true" type="boolean" truevalue="yes" falsevalue="no" label="Include Targetless MTLs?"/>
  <param name="reference_file" type="data" format="tabular" label="Reference Genome File"/>

  <param name="normalize_rna" type="boolean" truevalue="yes" falsevalue="no" label="Normalize Expression?"/>
  <param name="use_mean" type="boolean" truevalue="yes" falsevalue="no" label="Use mean expression across exp. to normalize?"/>
  <!-- NOTE(review): rnaInputFormat is not referenced anywhere in the shscript configfile; confirm whether it should be passed to the R scripts. -->
  <param name="rnaInputFormat" type="select" display="radio" label="RNA Input Format">
    <option name="cufflinks" value="cufflinks">Cufflinks</option>
    <option name="bed" value="bed">BED</option>
  </param>
  <param name="numClusters" type="integer" min="1" value="8" label="Number of Clusters (kmeans)"/>
  <param name="trgtDistance" type="integer" min="0" value="5000" label="Transcript threshold distance"/>

  <!-- One entry per expression experiment, each with its own normalization dataset. -->
  <repeat name="rna_tracks" title="Cufflinks/BED Files">
    <param name="file" type="data" format="tabular" label="Dataset"/>
    <param name="name" type="text" label="Dataset Name"/>
    <param name="norm" type="data" label="Normalization Dataset"/>
  </repeat>
</inputs>
|
|
<outputs>
  <!-- cluster_assignments, mtls and heatmap_image are moved into place by the
       final mv commands of the shscript configfile; log is appended to by the
       redirections throughout that script. -->
  <data name="cluster_assignments" format="xls" label="Cluster Assignments"/>
  <data name="mtls" format="xls" label="MTLS File"/>
  <data name="log" format="txt" label="Log file"/>
  <data name="heatmap_image" format="bmp" label="Heatmap Image"/>
  <!-- Disabled alternatives that would select the image format through an
       imageFormat parameter (no such input is currently defined):
  <data format="png" name="heatmap_image" label="Heatmap Image" >
    <filter>imageFormat=="png"</filter>
  </data>
  <data format="pdf" name="heatmap_image" label="Heatmap Image" >
    <filter>imageFormat=="pdf"</filter>
  </data>
  -->
</outputs>
|
|
<configfiles>
<configfile name="shscript">
<!-- This is the script that runs (Cheetah/bash code) -->
#!/bin/bash

#import os
#set $path = $os.path.abspath($__app__.config.tool_path)

## Shell metacharacters are produced with chr() so that neither the XML parser
## nor Cheetah interprets them. The ad+gt+gt sequence used on almost every line
## renders as the bash redirection that appends stdout and stderr to the log.
#set $dollar = chr(36)
#set $gt = chr(62)
#set $lt = chr(60)
#set $ad = chr(38)
#set $bs = chr(92)

echo $map_rna ${ad}${gt}${gt} $log
echo "This is the Bash log file: " ${ad}${gt}${gt} $log
###############################################################################
## Convert the gtf file to a file that Aviv's script can handle
#if str($map_rna)=='yes'
echo "Converting gtf file" ${ad}${gt}${gt} $log
Rscript $path/visualization/gtfToMapFriendlyAnnotation.R $reference_file ${ad}${gt}${gt} $log
echo "done converting gtf file" ${ad}${gt}${gt} $log
#end if
###############################################################################
## Get ChIP data in correctly formatted strings and annotate if necessary.
## Labels and paths are accumulated into '::'-separated shell variables.
## Assignments are double-quoted because dataset names are free text and may
## contain spaces or glob characters.
#set $sep = '::'
#for $i, $chip in enumerate( $chip_tracks )
#if $i==0
echo "Chip Files:" ${ad}${gt}${gt} $log
echo "The first file label is: ${chip.name}" ${ad}${gt}${gt} $log
echo "The first file path is: ${chip.file}" ${ad}${gt}${gt} $log
chip_labels="${chip.name}"
chip_paths="${chip.file}"
#else
echo "The next file label is: ${chip.name}" ${ad}${gt}${gt} $log
echo "The next file path is: ${chip.file}" ${ad}${gt}${gt} $log
chip_labels="${dollar}chip_labels${sep}${chip.name}"
chip_paths="${dollar}chip_paths${sep}${chip.file}"
#end if
#end for

echo "chip paths are - ${dollar}chip_paths" ${ad}${gt}${gt} $log
echo "chip labels are - ${dollar}chip_labels" ${ad}${gt}${gt} $log

###############################################################################
## Cluster peaks

Rscript $path/visualization/cluster_peaks.R \
--input_files "${dollar}chip_paths" \
--input_type $chipInputFormat \
--path_output ./ \
--expt_names "${dollar}chip_labels" \
--dist_summits $summitDistance \
--mtl_type $mtlType ${ad}${gt}${gt} $log

###############################################################################
## Annotate mtls.xls if necessary
#if str($map_rna)=="yes"
echo "annotating mtls.xls..." ${ad}${gt}${gt} $log
Rscript $path/visualization/annotate_mtls.R mtls.xls gene_annotation.txt $trgtDistance ${ad}${gt}${gt} $log
#end if
###############################################################################
## If rna is specified, then get RNA data in correctly formatted strings:
#if str($map_rna)=='yes'
#set $sep = '::'
#for $i, $rna in enumerate( $rna_tracks )
#if $i==0
echo "The first file label is: ${rna.name}" ${ad}${gt}${gt} $log
echo "The first file path is: ${rna.file}" ${ad}${gt}${gt} $log
rna_labels="${rna.name}"
rna_paths="${rna.file}"
rna_norm_paths="${rna.norm}"
#else
echo "The next file label is: ${rna.name}" ${ad}${gt}${gt} $log
echo "The next file path is: ${rna.file}" ${ad}${gt}${gt} $log
rna_labels="${dollar}rna_labels${sep}${rna.name}"
rna_paths="${dollar}rna_paths${sep}${rna.file}"
rna_norm_paths="${dollar}rna_norm_paths${sep}${rna.norm}"
#end if
#end for
echo "rna paths are - ${dollar}rna_paths" ${ad}${gt}${gt} $log
echo "rna labels are - ${dollar}rna_labels" ${ad}${gt}${gt} $log
echo "rna norm files are - ${dollar}rna_norm_paths" ${ad}${gt}${gt} $log
#end if
###############################################################################

## Normalization overrides: "no" disables it, "mean" replaces the per-track list.
#if str($normalize_rna)=='no'
echo "Normalization by file is set to no" ${ad}${gt}${gt} $log
rna_norm_paths=no
#end if

#if str($use_mean)=='yes'
echo "Normalization of expression will be done by mean" ${ad}${gt}${gt} $log
rna_norm_paths=mean
#end if

#if str($map_rna)=='no'
mtls_file=mtls.xls
rna_paths=none
rna_labels=none
#else
mtls_file=annotated_mtls.xls
#end if

## Guard against variables that no branch above assigned (for example RNA
## mapping off with normalization on, or RNA mapping on with no RNA tracks).
## The fallbacks reuse the placeholders this script already assigns elsewhere.
## NOTE(review): confirm heatmap.R accepts these placeholders in those cases.
rna_paths="${dollar}{rna_paths:-none}"
rna_labels="${dollar}{rna_labels:-none}"
rna_norm_paths="${dollar}{rna_norm_paths:-no}"

## Log the heatmap command; the option list is kept in step with the real call below.
echo "
Rscript $path/visualization/heatmap.R --mtls_file ./${dollar}mtls_file \
--cluster_file ./cluster \
--chip_experiment_order ${dollar}chip_labels \
--heatmap_file ./heatmap \
--heatmap_type bmp \
--n_clusters $numClusters \
--filter_percentage 100 \
--number_bins $numberBins \
--include_targetless $includeTargetless \
--expression_file ${dollar}rna_paths \
--expression_name ${dollar}rna_labels \
--normalization_file ${dollar}rna_norm_paths \
${ad}${gt}${gt} $log" ${ad}${gt}${gt} $log

Rscript $path/visualization/heatmap.R --mtls_file ./${dollar}mtls_file \
--cluster_file ./cluster \
--chip_experiment_order "${dollar}chip_labels" \
--heatmap_file ./heatmap \
--heatmap_type bmp \
--n_clusters $numClusters \
--filter_percentage 100 \
--number_bins $numberBins \
--include_targetless $includeTargetless \
--expression_file "${dollar}rna_paths" \
--expression_name "${dollar}rna_labels" \
--normalization_file "${dollar}rna_norm_paths" \
${ad}${gt}${gt} $log

## Record the working directory contents in the log.
ls ${ad}${gt}${gt} $log

##################################################################
## Move the generated files onto the Galaxy output datasets.
#if str($map_rna)=='yes'
mv ./annotated_mtls.xls $mtls
#else
mv ./mtls.xls $mtls
#end if
mv ./heatmap.* $heatmap_image
mv ./cluster.tsv $cluster_assignments

</configfile>
</configfiles>
|
|
206 <!--<tests>-->
|
|
207 <!-- <test maxseconds="3600" name="GCA_1">-->
|
|
208 <!-- <param name="bfile" value="bedfile.bed" />-->
|
|
209 <!-- <param name="span" value="3000" />-->
|
|
210 <!-- <param name="genome" value="hg18" />-->
|
|
211 <!-- <output name="output" file="gca_1/gca_1.xls" />-->
|
|
212 <!-- <output name="output" file="gca_1/gca_1.log" lines_diff = "200" />-->
|
|
213 <!-- </test>-->
|
|
214 <!-- <test maxseconds="3600" name="GCA_2">-->
|
|
215 <!-- <param name="bfile" value="bedfile.bed" />-->
|
|
216 <!-- <param name="span" value="100" />-->
|
|
217 <!-- <param name="genome" value="hg18" />-->
|
|
218 <!-- <output name="output" file="gca_2/gca_2.xls" />-->
|
|
219 <!-- <output name="output" file="gca_2/gca_2.log" lines_diff = "200" />-->
|
|
220 <!-- </test>-->
|
|
221 <!-- <test maxseconds="3600" name="GCA_3">-->
|
|
222 <!-- <param name="bfile" value="bedfile.bed" />-->
|
|
223 <!-- <param name="span" value="500" />-->
|
|
224 <!-- <param name="genome" value="hg18" />-->
|
|
225 <!-- <output name="output" file="gca_3/gca_3.xls" />-->
|
|
226 <!-- <output name="output" file="gca_3/gca_3.log" lines_diff = "200" />-->
|
|
227 <!-- </test>-->
|
|
228 <!-- <test maxseconds="3600" name="GCA_4">-->
|
|
229 <!-- <param name="bfile" value="bedfile.bed" />-->
|
|
230 <!-- <param name="span" value="1000" />-->
|
|
231 <!-- <param name="genome" value="hg18" />-->
|
|
232 <!-- <output name="output" file="gca_4/gca_4.xls" />-->
|
|
233 <!-- <output name="output" file="gca_4/gca_4.log" lines_diff = "200" />-->
|
|
234 <!-- </test>-->
|
|
235 <!-- <test maxseconds="3600" name="GCA_5">-->
|
|
236 <!-- <param name="bfile" value="bedfile.bed" />-->
|
|
237 <!-- <param name="span" value="10000" />-->
|
|
238 <!-- <param name="genome" value="hg18" />-->
|
|
239 <!-- <output name="output" file="gca_5/gca_5.xls" />-->
|
|
240 <!-- <output name="output" file="gca_5/gca_5.log" lines_diff = "200" />-->
|
|
241 <!-- </test>-->
|
|
242 <!--</tests>-->
|
|
243 <help>
|
|
244 This tool will merge peaks from multiple ChIP-seq experiments, creating MTLs for
|
|
245 each overlapping region. It will then cluster each MTL based on the score of
|
|
246 each peak within each MTL (using K-means clustering, with k set by user). A
|
|
247 heatmap is then generated from the resulting cluster along with the MTLs
|
|
248 generated. This module is written in R and will be made available on GitHub
|
|
249 and bioconductor. This work was done by Kieran Mace and Aviv Madar.
|
|
250
|
|
251 **NEED IMPROVEMENT**
|
|
252
|
|
253 -----
|
|
254
|
|
255 **Parameters**
|
|
256
|
|
257 - **Input files** contains either macs or BED files to be merged. This list of files must be two or larger.
|
|
258 - **Experiment names** contains the name given to each track.
|
|
259 - **Summit distance** is the cutoff distance (in BP) between summits to be included in an MTL. This option is only used with the summit MTL type below.
|
|
260 - **Input Format** Either BED or MACS file format; all files must be of one type. Defaults to MACS
|
|
261 - **MTL Type** Either interval or summit (defaults to summit).
|
|
262 - **Number clusters** the value of k for kmeans clustering.
|
|
263 - **Filter top MTLS** The top percentage of MTLs to keep for image and cluster (based on the union of mean, non-zero mean, and variance of the scores).
|
|
264 -----
|
|
265
|
|
266 **Output**
|
|
267
|
|
268 - **XLS file** is the tab-delimited file containing the MTL data.
|
|
269 - **BMP file** is the heatmap image generated after clustering the MTL data.
|
|
270
|
|
271 -----
|
|
272
|
|
273 **script parameter list of Chip-Cluster**
|
|
274
|
|
275 Options:
|
|
276 DESCRIPTION:
|
|
277 cluster_peaks.R takes MACS/.bed tab delimited files as input and produces one tab delimited file (named mtls.xls) where
|
|
278 each row corresponds to a Multi TF Loci (MTL) in which peaks from different experiments (input MACS/.bed files)
|
|
279 fall within a certain distance between summits from each other.
|
|
280
|
|
281 INPUT:
|
|
282 1.path_input=path to MACS/bed files '::' delim [path_input=f1::f2::f3::...::fk]
|
|
283 2.path_output=path to save generated MTL cluster file (where to save mtls.xls)
|
|
284 3.expt_names=user specified names for MACS files '::' delim [expt_names=n1::n2::n3::...::nk]
|
|
285 4.dist.summits=maximum distance between summits belonging to the same MTL (defaults to 100)
|
|
286 5.input_type=the type of input file used (MACS or .bed; defaults to MACS)
|
|
287 6.mtl_type=interval or summit (defaults to summit)
|
|
288
|
|
289 EXAMPLE RUN:
|
|
290 cluster_peaks.R
|
|
291 --input_macs_files input/SL2870_SL2871_peaks.xls::input/SL2872_SL2876_peaks.xls::input/SL3032_SL2871_peaks.xls::input/SL3037_SL3036_peaks.xls::input/SL3315_SL3319_peaks.xls
|
|
292 --input_type MACS
|
|
293 --path_output results/
|
|
294 --expt_names RORC_Th17::IRF4_Th17::MAF_Th17::BATF_Th17::STAT3_Th17
|
|
295 --dist_summits 100
|
|
296 --mtl_type summit
|
|
297
|
|
298 DESCRIPTION:
|
|
299 heatmap.R takes a ...
|
|
300
|
|
301 INPUT:
|
|
302 1.--mtls_file path to mtls file.
|
|
303
|
|
304 2.--cluster_file the destination path for the cluster file.
|
|
305
|
|
306 3.--heatmap_file the destination path for heatmap image (no extension).
|
|
307
|
|
308 4.--heatmap_type choice of image type, currently support png and pdf.
|
|
309
|
|
310 5.--n_clusters number of clusters in the heatmap
|
|
311
|
|
312 6.--filter_percentage percentage of mtls that will be analysed. for eg. if
|
|
313 we make filter_percentage 30, we will take the union of the top mtls in
|
|
314 mean, non-zero mean and variance.
|
|
315
|
|
316
|
|
317 EXAMPLE RUN:
|
|
318 Rscript heatmap.R
|
|
319 --mtls_file mtls.xls
|
|
320 --cluster_file output/cluster
|
|
321 --heatmap_file output/heatmap
|
|
322 --heatmap_type png
|
|
323 --n_clusters 13
|
|
324 --filter_percentage 60
|
|
325
|
|
326 Please cite us if you used this script:
|
|
327 The transcription factor network regulating Th17 lineage specification and function.
|
|
328 Maria Ciofani, Aviv Madar, Carolina Galan, Kieran Mace, Agarwal, Kim Newberry, Richard M. Myers,
|
|
329 Richard Bonneau and Dan R. Littman et al. (in preparation)
|
|
330
|
|
331 </help>
|
|
332
|
|
333 </tool>
|