comparison mtls_analyze/mtls_analyze.xml @ 4:b465306d00ba draft default tip

Uploaded
author kmace
date Mon, 23 Jul 2012 13:00:15 -0400
parents
children
comparison
equal deleted inserted replaced
3:a0306edbf2f8 4:b465306d00ba
1 <tool name="Chip-Cluster: Cluster ChIP-seq peaks and create a heatmap" id="chip-cluster">
2 <description>
3 Merge multiple ChIP-seq experiments, aligning their peaks to MTLs (Multi
4 Transcription Factor Loci) and optionally incorporate expression data
5 </description>
6 <command interpreter="command">/bin/bash $shscript </command>
7 <inputs>
8 <param name="chipInputFormat" type="select" display="radio" label="ChIP Input Format">
9 <option name="macs" value="MACS">MACS</option>
10 <option name="bed" value="BED">BED</option>
11 </param>
12 <param name="mtlType" type="select" display="radio" label="Cluster by: ">
13 <option name="summit" value="summit">Summit</option>
14 <option name="interval" value="interval">Interval</option>
15 </param>
16 <param name="summitDistance" type="text" label="Summit Distance (BP) - Summit only" value="100">
17 </param>
18 <param name="numberBins" type="text" label="Number of Bins" value="30">
19 </param>
20 <repeat name="chip_tracks" title="MACS/BED Files">
21 <param name="file" type="data" format="tabular" label="Dataset"/>
22 <param name="name" type="text" label="Dataset Name"/>
23 </repeat>
24 <param name="map_rna" type="boolean" truevalue="yes" falsevalue="no" label="Incorperate RNA?"/>
25 <param name="includeTargetless" checked="true" type="boolean" truevalue="yes" falsevalue="no" label="Include Targetless MTLs?"/>
26 <param name="reference_file" type="data" format="tabular" label="Reference Genome File"/>
27
28 <param name="normalize_rna" type="boolean" truevalue="yes" falsevalue="no" label="Normalize Expression?"/>
29 <param name="use_mean" type="boolean" truevalue="yes" falsevalue="no" label="Use mean expression across exp. to normalize?"/>
30 <param name="rnaInputFormat" type="select" display="radio" label="RNA Input Format">
31 <option name="cufflinks" value="cufflinks">Cufflinks</option>
32 <option name="bed" value="bed">BED</option>
33 </param>
34 <param name="numClusters" type="text" label="Number of Clusters (kmeans)" value="8">
35 </param>
36 <param name="trgtDistance" type="text" label="Transcript threshold distance" value="5000">
37 </param>
38 <repeat name="rna_tracks" title="Cufflinks/BED Files">
39 <param name="file" type="data" format="tabular" label="Dataset"/>
40 <param name="name" type="text" label="Dataset Name"/>
41 <param name="norm" type="data" label="Normalization Dataset"/>
42 </repeat>
43 </inputs>
44 <outputs>
45 <data format="xls" name="cluster_assignments" label="Cluster Assignments"/>
46 <data format="xls" name="mtls" label="MTLS File"/>
47 <data format="txt" name="log" label="Log file" />
48 <data format="bmp" name="heatmap_image" label="Heatmap Image" />
49 <!-- <data format="png" name="heatmap_image" label="Heatmap Image" >-->
50 <!-- <filter>imageFormat=="png"</filter>-->
51 <!-- </data>-->
52 <!-- <data format="pdf" name="heatmap_image" label="Heatmap Image" >-->
53 <!-- <filter>imageFormat=="pdf"</filter>-->
54 <!-- </data>-->
55
56 </outputs>
57 <configfiles>
58 <configfile name="shscript">
59 <!-- This is the script that runs (Cheetah/bash code) -->
60 #!/bin/bash
61
62 #import os
63 #set $path = $os.path.abspath($__app__.config.tool_path)
64
65
66 ## Set symbols so that they are not incorrectly interpreted:
67 #set $dollar = chr(36)
68 #set $gt = chr(62)
69 #set $lt = chr(60)
70 #set $ad = chr(38)
71 #set $bs = chr(92)
72
73 echo $map_rna ${ad}${gt}${gt} $log
74 echo "This is the Bash log file: " ${ad}${gt}${gt} $log
75 ###############################################################################
76 ## Convert the GTF file to a format that Aviv's script can handle
77 #if str($map_rna)=='yes'
78 echo "Converting gtf file" ${ad}${gt}${gt} $log
79 Rscript $path/visualization/gtfToMapFriendlyAnnotation.R $reference_file ${ad}${gt}${gt} $log
80 echo "done converting gtf file" ${ad}${gt}${gt} $log
81 #end if
82 ###############################################################################
83 ## Get ChIP data in correctly formatted strings and annotate if necessary.
84 #set $sep = '::'
85 #for $i, $chip in enumerate( $chip_tracks )
86 #if $i==0
87 echo "Chip Files:" ${ad}${gt}${gt} $log
88 echo "The first file label is: ${chip.name}" ${ad}${gt}${gt} $log
89 echo "The first file path is: ${chip.file}" ${ad}${gt}${gt} $log
90 chip_labels=${chip.name}
91 chip_paths=${chip.file}
92 #else
93 echo "The next file label is: ${chip.name}" ${ad}${gt}${gt} $log
94 echo "The next file path is: ${chip.file}" ${ad}${gt}${gt} $log
95 chip_labels=${dollar}chip_labels${sep}${chip.name}
96 chip_paths=${dollar}chip_paths${sep}${chip.file}
97 #end if
98 #end for
99
100 echo chip paths are - ${dollar}chip_paths ${ad}${gt}${gt} $log
101 echo chip labels are - ${dollar}chip_labels ${ad}${gt}${gt} $log
102
103 ###############################################################################
104 ## Cluster peaks
105
106 Rscript $path/visualization/cluster_peaks.R \
107 --input_files ${dollar}chip_paths \
108 --input_type $chipInputFormat \
109 --path_output ./ \
110 --expt_names ${dollar}chip_labels \
111 --dist_summits $summitDistance \
112 --mtl_type $mtlType ${ad}${gt}${gt} $log
113
114 ###############################################################################
115 ## Annotate mtls.xls if necessary
116 #if str($map_rna)=="yes"
117 echo "annotating mtls.xls..." ${ad}${gt}${gt} $log
118 Rscript $path/visualization/annotate_mtls.R mtls.xls gene_annotation.txt $trgtDistance ${ad}${gt}${gt} $log
119 #end if
120 ###############################################################################
121 ## If RNA is specified, then get RNA data in correctly formatted strings:
122 #if str($map_rna)=='yes'
123 #set $sep = '::'
124 #for $i, $rna in enumerate( $rna_tracks )
125 #if $i==0
126 echo "The first file label is: ${rna.name}" ${ad}${gt}${gt} $log
127 echo "The first file path is: ${rna.file}" ${ad}${gt}${gt} $log
128 rna_labels=${rna.name}
129 rna_paths=${rna.file}
130 rna_norm_paths=${rna.norm}
131 #else
132 echo "The next file label is: ${rna.name}" ${ad}${gt}${gt} $log
133 echo "The next file path is: ${rna.file}" ${ad}${gt}${gt} $log
134 rna_labels=${dollar}rna_labels${sep}${rna.name}
135 rna_paths=${dollar}rna_paths${sep}${rna.file}
136 rna_norm_paths=${dollar}rna_norm_paths${sep}${rna.norm}
137 #end if
138 #end for
139 echo rna paths are - ${dollar}rna_paths ${ad}${gt}${gt} $log
140 echo rna labels are - ${dollar}rna_labels ${ad}${gt}${gt} $log
141 echo rna norm files are - ${dollar}rna_norm_paths ${ad}${gt}${gt} $log
142 #end if
143 ###############################################################################
144
145 #if str($normalize_rna)=='no'
146 echo "Normalization by file is set to no" ${ad}${gt}${gt} $log
147 rna_norm_paths=no
148 #end if
149
150 #if str($use_mean)=='yes'
151 echo "Normalization of expression will be done by mean" ${ad}${gt}${gt} $log
152 rna_norm_paths=mean
153 #end if
154 ## Choose the MTL table handed to heatmap.R. The last line falls back to "no" (the same value used when normalization is disabled) if rna_norm_paths was never assigned, e.g. normalization ticked without RNA mapping, so --normalization_file is never passed an empty value.
155 #if str($map_rna)=='no'
156 mtls_file=mtls.xls
157 rna_paths=none
158 rna_labels=none
159 #else
160 mtls_file=annotated_mtls.xls
161 #end if
162 rna_norm_paths=${dollar}{rna_norm_paths:-no}
163 echo "
164 Rscript $path/visualization/heatmap.R --mtls_file ./${dollar}mtls_file \
165 --cluster_file ./cluster \
166 --chip_experiment_order ${dollar}chip_labels \
167 --heatmap_file ./heatmap \
168 --heatmap_type bmp \
169 --n_clusters $numClusters \
170 --filter_percentage 100 --number_bins $numberBins --include_targetless $includeTargetless \
171 --expression_file ${dollar}rna_paths \
172 --expression_name ${dollar}rna_labels \
173 --normalization_file ${dollar}rna_norm_paths \
174 ${ad}${gt}${gt} $log" ${ad}${gt}${gt} $log
175 ## The echo above records the heatmap.R command line in the log; it must list the same options as the real call that follows.
176 Rscript $path/visualization/heatmap.R --mtls_file ./${dollar}mtls_file \
177 --cluster_file ./cluster \
178 --chip_experiment_order ${dollar}chip_labels \
179 --heatmap_file ./heatmap \
180 --heatmap_type bmp \
181 --n_clusters $numClusters \
182 --filter_percentage 100 \
183 --number_bins $numberBins \
184 --include_targetless $includeTargetless \
185 --expression_file ${dollar}rna_paths \
186 --expression_name ${dollar}rna_labels \
187 --normalization_file ${dollar}rna_norm_paths \
188 ${ad}${gt}${gt} $log
189
190 ls ${ad}${gt}${gt} $log
191
192
193
194
195 ##################################################################
196 #if str($map_rna)=='yes'
197 mv ./annotated_mtls.xls $mtls
198 #else
199 mv ./mtls.xls $mtls
200 #end if
201 mv ./heatmap.* $heatmap_image
202 mv ./cluster.tsv $cluster_assignments
203
204 </configfile>
205 </configfiles>
206 <!--<tests>-->
207 <!-- <test maxseconds="3600" name="GCA_1">-->
208 <!-- <param name="bfile" value="bedfile.bed" />-->
209 <!-- <param name="span" value="3000" />-->
210 <!-- <param name="genome" value="hg18" />-->
211 <!-- <output name="output" file="gca_1/gca_1.xls" />-->
212 <!-- <output name="output" file="gca_1/gca_1.log" lines_diff = "200" />-->
213 <!-- </test>-->
214 <!-- <test maxseconds="3600" name="GCA_2">-->
215 <!-- <param name="bfile" value="bedfile.bed" />-->
216 <!-- <param name="span" value="100" />-->
217 <!-- <param name="genome" value="hg18" />-->
218 <!-- <output name="output" file="gca_2/gca_2.xls" />-->
219 <!-- <output name="output" file="gca_2/gca_2.log" lines_diff = "200" />-->
220 <!-- </test>-->
221 <!-- <test maxseconds="3600" name="GCA_3">-->
222 <!-- <param name="bfile" value="bedfile.bed" />-->
223 <!-- <param name="span" value="500" />-->
224 <!-- <param name="genome" value="hg18" />-->
225 <!-- <output name="output" file="gca_3/gca_3.xls" />-->
226 <!-- <output name="output" file="gca_3/gca_3.log" lines_diff = "200" />-->
227 <!-- </test>-->
228 <!-- <test maxseconds="3600" name="GCA_4">-->
229 <!-- <param name="bfile" value="bedfile.bed" />-->
230 <!-- <param name="span" value="1000" />-->
231 <!-- <param name="genome" value="hg18" />-->
232 <!-- <output name="output" file="gca_4/gca_4.xls" />-->
233 <!-- <output name="output" file="gca_4/gca_4.log" lines_diff = "200" />-->
234 <!-- </test>-->
235 <!-- <test maxseconds="3600" name="GCA_5">-->
236 <!-- <param name="bfile" value="bedfile.bed" />-->
237 <!-- <param name="span" value="10000" />-->
238 <!-- <param name="genome" value="hg18" />-->
239 <!-- <output name="output" file="gca_5/gca_5.xls" />-->
240 <!-- <output name="output" file="gca_5/gca_5.log" lines_diff = "200" />-->
241 <!-- </test>-->
242 <!--</tests>-->
243 <help>
244 This tool will merge peaks from multiple ChIP-seq experiments, creating MTLs for
245 each overlapping region. It will then cluster each MTL based on the score of
246 each peak within each MTL (using K-means clustering, with k set by user). A
247 heatmap is then generated from the resulting cluster along with the MTLs
248 generated. This module is written in R and will be made available on GitHub
249 and Bioconductor. This work was done by Kieran Mace and Aviv Madar.
250
251 **NEED IMPROVEMENT**
252
253 -----
254
255 **Parameters**
256
257 - **Input files** contains either MACS or BED files to be merged. At least two files must be supplied.
258 - **Experiment names** contains the name given to each track.
259 - **Summit distance** is the cutoff distance (in BP) between summits for peaks to be included in the same MTL. This option is only used with the summit MTL type below.
260 - **Input Format** Either BED or MACS file format; all files must be of one type. Defaults to MACS.
261 - **MTL Type** Either interval or summit (defaults to summit).
262 - **Number clusters** the value of k for kmeans clustering.
263 - **Filter top MTLS** The top percentage of MTLs to keep for image and cluster (based on the union of mean, non-zero mean, and variance of the scores).
264 -----
265
266 **Output**
267
268 - **XLS file** is the tab-delimited file containing the MTL data.
269 - **BMP file** is the heatmap image generated after clustering the MTL data.
270
271 -----
272
273 **script parameter list of Chip-Cluster**
274
275 Options:
276 DESCRIPTION:
277 cluster_peaks.R takes MACS/.bed tab-delimited files as input and produces one tab-delimited file (named mtls.xls) where
278 each row corresponds to a Multi TF Locus (MTL) in which peaks from different experiments (input MACS/.bed files)
279 fall within a certain summit-to-summit distance of each other.
280
281 INPUT:
282 1.path_input=path to MACS/bed files '::' delim [path_input=f1::f2::f3::...::fk]
283 2.path_output=path to save generated MTL cluster file (where to save mtls.xls)
284 3.expt_names=user specified names for MACS files '::' delim [expt_names=n1::n2::n3::...::nk]
285 4.dist.summits=maximum distance between summits belonging to the same MTL (defaults to 100)
286 5.input_type=the type of input file used (MACS or .bed; defaults to MACS)
287 6.mtl_type=interval or summit (defaults to summit)
288
289 EXAMPLE RUN:
290 cluster_peaks.R
291 --input_files input/SL2870_SL2871_peaks.xls::input/SL2872_SL2876_peaks.xls::input/SL3032_SL2871_peaks.xls::input/SL3037_SL3036_peaks.xls::input/SL3315_SL3319_peaks.xls
292 --input_type MACS
293 --path_output results/
294 --expt_names RORC_Th17::IRF4_Th17::MAF_Th17::BATF_Th17::STAT3_Th17
295 --dist_summits 100
296 --mtl_type summit
297
298 DESCRIPTION:
299 heatmap.R takes a ...
300
301 INPUT:
302 1.--mtls_file path to mtls file.
303
304 2.--cluster_file the destination path for the cluster file.
305
306 3.--heatmap_file the destination path for heatmap image (no extension).
307
308 4.--heatmap_type choice of image type, currently support png and pdf.
309
310 5.--n_clusters number of clusters in the heatmap
311
312 6.--filter_percentage percentage of mtls that will be analysed. for eg. if
313 we make filter_percentage 30, we will take the union of the top mtls in
314 mean, non-zero mean and variance.
315
316
317 EXAMPLE RUN:
318 Rscript heatmap.R
319 --mtls_file mtls.xls
320 --cluster_file output/cluster
321 --heatmap_file output/heatmap
322 --heatmap_type png
323 --n_clusters 13
324 --filter_percentage 60
325
326 Please cite us if you used this script:
327 The transcription factor network regulating Th17 lineage specification and function.
328 Maria Ciofani, Aviv Madar, Carolina Galan, Kieran Mace, Agarwal, Kim Newberry, Richard M. Myers,
329 Richard Bonneau and Dan R. Littman et al. (in preparation)
330
331 </help>
332
333 </tool>