Mercurial > repos > kmace > mtls_analysis
comparison mtls_analyze/mtls_analyze.xml @ 4:b465306d00ba draft default tip
Uploaded
author | kmace |
---|---|
date | Mon, 23 Jul 2012 13:00:15 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
3:a0306edbf2f8 | 4:b465306d00ba |
---|---|
1 <tool name="Chip-Cluster: Cluster ChIP-seq peaks and create a heatmap" id="chip-cluster"> | |
2 <description> | |
3 Merge multiple ChIP-seq experiments, aligning their peaks to MTLs (Multi | |
4 Transcription Factor Loci(us)) and optionally incorporate expression data | |
5 </description> | |
6 <command interpreter="command">/bin/bash $shscript </command> | |
7 <inputs> | |
8 <param name="chipInputFormat" type="select" display="radio" label="ChIP Input Format"> | |
9 <option name="macs" value="MACS">MACS</option> | |
10 <option name="bed" value="BED">BED</option> | |
11 </param> | |
12 <param name="mtlType" type="select" display="radio" label="Cluster by: "> | |
13 <option name="summit" value="summit">Summit</option> | |
14 <option name="interval" value="interval">Interval</option> | |
15 </param> | |
16 <param name="summitDistance" type="integer" label="Summit Distance (BP) - Summit only" value="100"> | |
17 </param> | |
18 <param name="numberBins" type="integer" label="Number of Bins" value="30"> | |
19 </param> | |
20 <repeat name="chip_tracks" title="MACS/BED Files"> | |
21 <param name="file" type="data" format="tabular" label="Dataset"/> | |
22 <param name="name" type="text" label="Dataset Name"/> | |
23 </repeat> | |
24 <param name="map_rna" type="boolean" truevalue="yes" falsevalue="no" label="Incorporate RNA?"/> | |
25 <param name="includeTargetless" checked="true" type="boolean" truevalue="yes" falsevalue="no" label="Include Targetless MTLs?"/> | |
26 <param name="reference_file" type="data" format="tabular" label="Reference Genome File"/> | |
27 <!-- Numeric settings (summitDistance, numberBins, numClusters, trgtDistance) are typed as integer so Galaxy rejects non-numeric values before they are substituted into the generated shell script. --> | |
28 <param name="normalize_rna" type="boolean" truevalue="yes" falsevalue="no" label="Normalize Expression?"/> | |
29 <param name="use_mean" type="boolean" truevalue="yes" falsevalue="no" label="Use mean expression across exp. to normalize?"/> | |
30 <param name="rnaInputFormat" type="select" display="radio" label="RNA Input Format"> | |
31 <option name="cufflinks" value="cufflinks">Cufflinks</option> | |
32 <option name="bed" value="bed">BED</option> | |
33 </param> | |
34 <param name="numClusters" type="integer" label="Number of Clusters (kmeans)" value="8"> | |
35 </param> | |
36 <param name="trgtDistance" type="integer" label="Transcript threshold distance" value="5000"> | |
37 </param> | |
38 <repeat name="rna_tracks" title="Cufflinks/BED Files"> | |
39 <param name="file" type="data" format="tabular" label="Dataset"/> | |
40 <param name="name" type="text" label="Dataset Name"/> | |
41 <param name="norm" type="data" label="Normalization Dataset"/> | |
42 </repeat> | |
43 </inputs> | |
44 <outputs> | |
45 <data format="xls" name="cluster_assignments" label="Cluster Assignments"/> | |
46 <data format="xls" name="mtls" label="MTLS File"/> | |
47 <data format="txt" name="log" label="Log file" /> | |
48 <data format="bmp" name="heatmap_image" label="Heatmap Image" /> | |
49 <!-- <data format="png" name="heatmap_image" label="Heatmap Image" >--> | |
50 <!-- <filter>imageFormat=="png"</filter>--> | |
51 <!-- </data>--> | |
52 <!-- <data format="pdf" name="heatmap_image" label="Heatmap Image" >--> | |
53 <!-- <filter>imageFormat=="pdf"</filter>--> | |
54 <!-- </data>--> | |
55 <!-- heatmap_image is fixed to BMP because the script always calls heatmap.R with "heatmap_type bmp"; the png/pdf variants above are disabled. --> | |
56 </outputs> | |
57 <configfiles> | |
58 <configfile name="shscript"> | |
59 <!-- Cheetah template that is rendered into the bash script run by the tool command. --> | |
60 #!/bin/bash | |
61 | |
62 #import os | |
63 #set $path = $os.path.abspath($__app__.config.tool_path) | |
64 ## path is the absolute form of the Galaxy tool_path setting; the R scripts are looked up under path/visualization. | |
65 | |
66 ## Shell metacharacters are built with chr() so that neither the XML parser nor Cheetah misinterprets them: | |
67 #set $dollar = chr(36) | |
68 #set $gt = chr(62) | |
69 #set $lt = chr(60) | |
70 #set $ad = chr(38) | |
71 #set $bs = chr(92) | |
72 ## Every command below appends stdout and stderr to the log dataset via ad+gt+gt (the bash append-all redirect). | |
73 echo $map_rna ${ad}${gt}${gt} $log | |
74 echo "This is the Bash log file: " ${ad}${gt}${gt} $log | |
75 ############################################################################### | |
76 ## Convert the GTF file into a file that Aviv's script can handle | |
77 #if str($map_rna)=='yes' | |
78 echo "Converting gtf file" ${ad}${gt}${gt} $log | |
79 Rscript $path/visualization/gtfToMapFriendlyAnnotation.R $reference_file ${ad}${gt}${gt} $log | |
80 echo "done converting gtf file" ${ad}${gt}${gt} $log | |
81 #end if | |
82 ############################################################################### | |
83 ## Collect the ChIP labels and paths into '::'-delimited shell variables (quoted, because labels are free text and may contain spaces). | |
84 #set $sep = '::' | |
85 #for $i, $chip in enumerate( $chip_tracks ) | |
86 #if $i==0 | |
87 echo "Chip Files:" ${ad}${gt}${gt} $log | |
88 echo "The first file label is: ${chip.name}" ${ad}${gt}${gt} $log | |
89 echo "The first file path is: ${chip.file}" ${ad}${gt}${gt} $log | |
90 chip_labels="${chip.name}" | |
91 chip_paths="${chip.file}" | |
92 #else | |
93 echo "The next file label is: ${chip.name}" ${ad}${gt}${gt} $log | |
94 echo "The next file path is: ${chip.file}" ${ad}${gt}${gt} $log | |
95 chip_labels="${dollar}chip_labels${sep}${chip.name}" | |
96 chip_paths="${dollar}chip_paths${sep}${chip.file}" | |
97 #end if | |
98 #end for | |
99 | |
100 echo "chip paths are - ${dollar}chip_paths" ${ad}${gt}${gt} $log | |
101 echo "chip labels are - ${dollar}chip_labels" ${ad}${gt}${gt} $log | |
102 | |
103 ############################################################################### | |
104 ## Cluster peaks | |
105 | |
106 Rscript $path/visualization/cluster_peaks.R \ | |
107 --input_files "${dollar}chip_paths" \ | |
108 --input_type $chipInputFormat \ | |
109 --path_output ./ \ | |
110 --expt_names "${dollar}chip_labels" \ | |
111 --dist_summits $summitDistance \ | |
112 --mtl_type $mtlType ${ad}${gt}${gt} $log | |
113 | |
114 ############################################################################### | |
115 ## Annotate mtls.xls if necessary | |
116 #if str($map_rna)=="yes" | |
117 echo "annotating mtls.xls..." ${ad}${gt}${gt} $log | |
118 Rscript $path/visualization/annotate_mtls.R mtls.xls gene_annotation.txt $trgtDistance ${ad}${gt}${gt} $log | |
119 #end if | |
120 ############################################################################### | |
121 ## If RNA is specified, collect the RNA labels, paths and normalization paths into '::'-delimited shell variables: | |
122 #if str($map_rna)=='yes' | |
123 #set $sep = '::' | |
124 #for $i, $rna in enumerate( $rna_tracks ) | |
125 #if $i==0 | |
126 echo "The first file label is: ${rna.name}" ${ad}${gt}${gt} $log | |
127 echo "The first file path is: ${rna.file}" ${ad}${gt}${gt} $log | |
128 rna_labels="${rna.name}" | |
129 rna_paths="${rna.file}" | |
130 rna_norm_paths="${rna.norm}" | |
131 #else | |
132 echo "The next file label is: ${rna.name}" ${ad}${gt}${gt} $log | |
133 echo "The next file path is: ${rna.file}" ${ad}${gt}${gt} $log | |
134 rna_labels="${dollar}rna_labels${sep}${rna.name}" | |
135 rna_paths="${dollar}rna_paths${sep}${rna.file}" | |
136 rna_norm_paths="${dollar}rna_norm_paths${sep}${rna.norm}" | |
137 #end if | |
138 #end for | |
139 echo "rna paths are - ${dollar}rna_paths" ${ad}${gt}${gt} $log | |
140 echo "rna labels are - ${dollar}rna_labels" ${ad}${gt}${gt} $log | |
141 echo "rna norm files are - ${dollar}rna_norm_paths" ${ad}${gt}${gt} $log | |
142 #end if | |
143 ############################################################################### | |
144 | |
145 #if str($normalize_rna)=='no' | |
146 echo "Normalization by file is set to no" ${ad}${gt}${gt} $log | |
147 rna_norm_paths=no | |
148 #end if | |
149 | |
150 #if str($use_mean)=='yes' | |
151 echo "Normalization of expression will be done by mean" ${ad}${gt}${gt} $log | |
152 rna_norm_paths=mean | |
153 #end if | |
154 ## Without RNA, pass placeholders and default the normalization value to "no" so that the normalization_file option never receives an empty value. | |
155 #if str($map_rna)=='no' | |
156 mtls_file=mtls.xls | |
157 rna_paths=none | |
158 rna_labels=none | |
159 rna_norm_paths=${dollar}{rna_norm_paths:-no} | |
160 #else | |
161 mtls_file=annotated_mtls.xls | |
162 #end if | |
163 echo " | |
164 Rscript $path/visualization/heatmap.R --mtls_file ./${dollar}mtls_file \ | |
165 --cluster_file ./cluster \ | |
166 --chip_experiment_order ${dollar}chip_labels \ | |
167 --heatmap_file ./heatmap \ | |
168 --heatmap_type bmp \ | |
169 --n_clusters $numClusters \ | |
170 --filter_percentage 100 \ | |
171 --number_bins $numberBins \ | |
172 --include_targetless $includeTargetless \ | |
173 --expression_file ${dollar}rna_paths \ | |
174 --expression_name ${dollar}rna_labels \ | |
175 --normalization_file ${dollar}rna_norm_paths \ | |
176 ${ad}${gt}${gt} $log" ${ad}${gt}${gt} $log | |
177 | |
178 Rscript $path/visualization/heatmap.R --mtls_file ./${dollar}mtls_file \ | |
179 --cluster_file ./cluster \ | |
180 --chip_experiment_order "${dollar}chip_labels" \ | |
181 --heatmap_file ./heatmap \ | |
182 --heatmap_type bmp \ | |
183 --n_clusters $numClusters \ | |
184 --filter_percentage 100 \ | |
185 --number_bins $numberBins \ | |
186 --include_targetless $includeTargetless \ | |
187 --expression_file "${dollar}rna_paths" \ | |
188 --expression_name "${dollar}rna_labels" \ | |
189 --normalization_file "${dollar}rna_norm_paths" \ | |
190 ${ad}${gt}${gt} $log | |
191 | |
192 ls ${ad}${gt}${gt} $log | |
193 | |
194 | |
195 ################################################################## | |
196 #if str($map_rna)=='yes' | |
197 mv ./annotated_mtls.xls $mtls | |
198 #else | |
199 mv ./mtls.xls $mtls | |
200 #end if | |
201 mv ./heatmap.* $heatmap_image | |
202 mv ./cluster.tsv $cluster_assignments | |
203 | |
204 </configfile> | |
205 </configfiles> | |
206 <!--<tests>--> | |
207 <!-- <test maxseconds="3600" name="GCA_1">--> | |
208 <!-- <param name="bfile" value="bedfile.bed" />--> | |
209 <!-- <param name="span" value="3000" />--> | |
210 <!-- <param name="genome" value="hg18" />--> | |
211 <!-- <output name="output" file="gca_1/gca_1.xls" />--> | |
212 <!-- <output name="output" file="gca_1/gca_1.log" lines_diff = "200" />--> | |
213 <!-- </test>--> | |
214 <!-- <test maxseconds="3600" name="GCA_2">--> | |
215 <!-- <param name="bfile" value="bedfile.bed" />--> | |
216 <!-- <param name="span" value="100" />--> | |
217 <!-- <param name="genome" value="hg18" />--> | |
218 <!-- <output name="output" file="gca_2/gca_2.xls" />--> | |
219 <!-- <output name="output" file="gca_2/gca_2.log" lines_diff = "200" />--> | |
220 <!-- </test>--> | |
221 <!-- <test maxseconds="3600" name="GCA_3">--> | |
222 <!-- <param name="bfile" value="bedfile.bed" />--> | |
223 <!-- <param name="span" value="500" />--> | |
224 <!-- <param name="genome" value="hg18" />--> | |
225 <!-- <output name="output" file="gca_3/gca_3.xls" />--> | |
226 <!-- <output name="output" file="gca_3/gca_3.log" lines_diff = "200" />--> | |
227 <!-- </test>--> | |
228 <!-- <test maxseconds="3600" name="GCA_4">--> | |
229 <!-- <param name="bfile" value="bedfile.bed" />--> | |
230 <!-- <param name="span" value="1000" />--> | |
231 <!-- <param name="genome" value="hg18" />--> | |
232 <!-- <output name="output" file="gca_4/gca_4.xls" />--> | |
233 <!-- <output name="output" file="gca_4/gca_4.log" lines_diff = "200" />--> | |
234 <!-- </test>--> | |
235 <!-- <test maxseconds="3600" name="GCA_5">--> | |
236 <!-- <param name="bfile" value="bedfile.bed" />--> | |
237 <!-- <param name="span" value="10000" />--> | |
238 <!-- <param name="genome" value="hg18" />--> | |
239 <!-- <output name="output" file="gca_5/gca_5.xls" />--> | |
240 <!-- <output name="output" file="gca_5/gca_5.log" lines_diff = "200" />--> | |
241 <!-- </test>--> | |
242 <!--</tests>--> | |
243 <help> | |
244 This tool will merge peaks from multiple ChIP-seq experiments, creating MTLs for | |
245 each overlapping region. It will then cluster each MTL based on the score of | |
246 each peak within each MTL (using K-means clustering, with k set by user). A | |
247 heatmap is then generated from the resulting cluster along with the MTLs | |
248 generated. This module is written in R and will be made available on GitHub | |
249 and bioconductor. This work was done by Kieran Mace and Aviv Madar. | |
250 | |
251 **NEED IMPROVEMENT** | |
252 | |
253 ----- | |
254 | |
255 **Parameters** | |
256 | |
257 - **Input files** contains either MACS or BED files to be merged. At least two files are required. | |
258 - **Experiment names** contains the name given to each track. | |
259 - **Summit distance** is the cutoff distance (in BP) to be included in an MTL. This option is not used with the interval option below | |
260 - **Input Format** Either BED or MACS file format; all files must be of one type. Defaults to MACS | |
261 - **MTL Type** Either interval or summit (defaults to summit). | |
262 - **Number clusters** the value of k for kmeans clustering. | |
263 - **Filter top MTLS** The top percentage of MTLs to keep for image and cluster (based on the union of mean, non-zero mean, and variance of the scores). | |
264 ----- | |
265 | |
266 **Output** | |
267 | |
268 - **XLS file** is the tab-delimited file containing the MTL data. | |
269 - **BMP file** is the heatmap image generated after clustering the MTL data. | |
270 | |
271 ----- | |
272 | |
273 **script parameter list of Chip-Cluster** | |
274 | |
275 Options: | |
276 DESCRIPTION: | |
277 cluster_peaks.R takes MACS/.bed tab delimited files as input and produces one tab delimited file (named mtls.xls) where | |
278 each row corresponds to a Multi TF Loci (MTL) in which peaks from different experiments (input MACS/.bed files) | |
279 fall within a certain distance between summits from each other. | |
280 | |
281 INPUT: | |
282 1.path_input=path to MACS/bed files '::' delim [path_input=f1::f2::f3::...::fk] | |
283 2.path_output=path to save generated MTL cluster file (where to save mtls.xls) | |
284 3.expt_names=user specified names for MACS files '::' delim [expt_names=n1::n2::n3::...::nk] | |
285 4.dist.summits=maximum distance between summits belonging to the same MTL (defaults to 100) | |
286 5.input_type=the type of input file used (MACS or .bed; defaults to MACS) | |
287 6.mtl_type=interval or summit (defaults to summit) | |
288 | |
289 EXAMPLE RUN: | |
290 cluster_peaks.R | |
291 --input_macs_files input/SL2870_SL2871_peaks.xls::input/SL2872_SL2876_peaks.xls::input/SL3032_SL2871_peaks.xls::input/SL3037_SL3036_peaks.xls::input/SL3315_SL3319_peaks.xls | |
292 --input_type MACS | |
293 --path_output results/ | |
294 --expt_names RORC_Th17::IRF4_Th17::MAF_Th17::BATF_Th17::STAT3_Th17 | |
295 --dist_summits 100 | |
296 --mtl_type summit | |
297 | |
298 DESCRIPTION: | |
299 heatmap.R takes a ... | |
300 | |
301 INPUT: | |
302 1.--mtls_file path to mtls file. | |
303 | |
304 2.--cluster_file the destination path for the cluster file. | |
305 | |
306 3.--heatmap_file the destination path for heatmap image (no extension). | |
307 | |
308 4.--heatmap_type choice of image type, currently support png and pdf. | |
309 | |
310 5.--n_clusters number of clusters in the heatmap | |
311 | |
312 6.--filter_percentage percentage of mtls that will be analysed, e.g. if | |
313 we make filter_percentage 30, we will take the union of the top mtls in | |
314 mean, non-zero mean and variance. | |
315 | |
316 | |
317 EXAMPLE RUN: | |
318 Rscript heatmap.R | |
319 --mtls_file mtls.xls | |
320 --cluster_file output/cluster | |
321 --heatmap_file output/heatmap | |
322 --heatmap_type png | |
323 --n_clusters 13 | |
324 --filter_percentage 60 | |
325 | |
326 Please cite us if you used this script: | |
327 The transcription factor network regulating Th17 lineage specification and function. | |
328 Maria Ciofani, Aviv Madar, Carolina Galan, Kieran Mace, Agarwal, Kim Newberry, Richard M. Myers, | |
329 Richard Bonneau and Dan R. Littman et al. (in preparation) | |
330 | |
331 </help> | |
332 | |
333 </tool> |