diff mtls_analyze/mtls_analyze.xml @ 4:b465306d00ba draft default tip

Uploaded
author kmace
date Mon, 23 Jul 2012 13:00:15 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mtls_analyze/mtls_analyze.xml	Mon Jul 23 13:00:15 2012 -0400
@@ -0,0 +1,333 @@
+<tool name="Chip-Cluster: Cluster ChIP-seq peaks and create a heatmap" id="chip-cluster">
+  <description>
+    Merge multiple ChIP-seq experiments, aligning their peaks to MTLs (Multi
+    Transcription Factor Loci(us)) and optionally incorporate expression
+  </description>
+  <command interpreter="command">/bin/bash $shscript </command>
+  <inputs>
+    <param name="chipInputFormat" type="select" display="radio" label="ChIP Input Format">
+      <option name="macs" value="MACS">MACS</option>
+      <option name="bed" value="BED">BED</option>
+    </param>
+    <param name="mtlType" type="select" display="radio" label="Cluster by: ">
+      <option name="summit" value="summit">Summit</option>
+      <option name="interval" value="interval">Interval</option>
+    </param>
+    <param name="summitDistance" type="integer" label="Summit Distance (BP) - Summit only" value="100">
+    </param>
+    <param name="numberBins" type="integer" label="Number of Bins" value="30">
+    </param>
+    <repeat name="chip_tracks" title="MACS/BED Files">
+      <param name="file" type="data" format="tabular" label="Dataset"/>
+      <param name="name" type="text" label="Dataset Name"/>
+    </repeat>
+    <param name="map_rna" type="boolean" truevalue="yes" falsevalue="no" label="Incorporate RNA?"/>
+    <param name="includeTargetless" checked="true" type="boolean" truevalue="yes" falsevalue="no" label="Include Targetless MTLs?"/>
+    <param name="reference_file" type="data" format="tabular" label="Reference Genome File"/>
+    <!-- Expression (RNA) options: the shscript configfile reads reference_file and rna_tracks only when map_rna is "yes". Numeric fields are typed integer because they are substituted unquoted into that script. -->
+    <param name="normalize_rna" type="boolean" truevalue="yes" falsevalue="no" label="Normalize Expression?"/>
+    <param name="use_mean" type="boolean" truevalue="yes" falsevalue="no" label="Use mean expression across exp. to normalize?"/>
+    <param name="rnaInputFormat" type="select" display="radio" label="RNA Input Format">
+      <option name="cufflinks" value="cufflinks">Cufflinks</option>
+      <option name="bed" value="bed">BED</option>
+    </param> <!-- NOTE(review): rnaInputFormat is never referenced by the shscript configfile; confirm whether it should be passed on. -->
+    <param name="numClusters" type="integer" label="Number of Clusters (kmeans)" value="8">
+    </param>
+    <param name="trgtDistance" type="integer" label="Transcript threshold distance" value="5000">
+    </param>
+    <repeat name="rna_tracks" title="Cufflinks/BED Files">
+      <param name="file" type="data" format="tabular" label="Dataset"/>
+      <param name="name" type="text" label="Dataset Name"/>
+      <param name="norm" type="data" label="Normalization Dataset"/>
+    </repeat>
+  </inputs>
+  <outputs>
+    <data format="xls" name="cluster_assignments" label="Cluster Assignments"/>
+    <data format="xls" name="mtls" label="MTLS File"/>
+    <data format="txt" name="log" label="Log file" />
+    <data format="bmp" name="heatmap_image" label="Heatmap Image" />
+<!--    <data format="png" name="heatmap_image" label="Heatmap Image" >-->
+<!--      <filter>imageFormat=="png"</filter>-->
+<!--    </data>-->
+<!--    <data format="pdf" name="heatmap_image" label="Heatmap Image" >-->
+<!--      <filter>imageFormat=="pdf"</filter>-->
+<!--    </data>-->
+    <!-- The shscript configfile moves cluster.tsv, mtls.xls (or annotated_mtls.xls) and heatmap.* onto these datasets; its echo, Rscript and ls output is appended to log. -->
+  </outputs>
+  <configfiles>
+    <configfile name="shscript">
+<!-- This is the script that runs (Cheetah/bash code)-->
+#!/bin/bash
+
+#import os
+#set $path = $os.path.abspath($__app__.config.tool_path)
+
+## Set symbols so that they are not incorrectly interpreted:
+#set $dollar = chr(36)
+#set $gt = chr(62)
+#set $lt = chr(60)
+#set $ad = chr(38)
+#set $bs = chr(92)
+
+echo $map_rna ${ad}${gt}${gt} $log
+echo "This is the Bash log file: " ${ad}${gt}${gt} $log
+###############################################################################
+## Convert the gtf file to a file that aviv's script can handle
+#if str($map_rna)=='yes'
+  echo "Converting gtf file" ${ad}${gt}${gt} $log
+  Rscript $path/visualization/gtfToMapFriendlyAnnotation.R $reference_file ${ad}${gt}${gt} $log
+  echo "done converting gtf file" ${ad}${gt}${gt} $log
+#end if
+###############################################################################
+## Get ChIP data in correctly formatted '::'-joined strings (quoted: labels are user-typed text).
+#set $sep = '::'
+#for $i, $chip in enumerate( $chip_tracks )
+  #if $i==0
+    echo "Chip Files:" ${ad}${gt}${gt} $log
+    echo "The first file label is:  ${chip.name}" ${ad}${gt}${gt} $log
+    echo "The first file path is:  ${chip.file}" ${ad}${gt}${gt} $log
+    chip_labels="${chip.name}"
+    chip_paths="${chip.file}"
+  #else
+    echo "The next file label is:  ${chip.name}" ${ad}${gt}${gt} $log
+    echo "The next file path is:  ${chip.file}" ${ad}${gt}${gt} $log
+    chip_labels="${dollar}chip_labels${sep}${chip.name}"
+    chip_paths="${dollar}chip_paths${sep}${chip.file}"
+  #end if
+#end for
+
+echo chip paths are - "${dollar}chip_paths" ${ad}${gt}${gt} $log
+echo chip labels are - "${dollar}chip_labels" ${ad}${gt}${gt} $log
+
+###############################################################################
+## Cluster peaks
+
+Rscript $path/visualization/cluster_peaks.R \
+--input_files "${dollar}chip_paths" \
+--input_type $chipInputFormat \
+--path_output ./ \
+--expt_names "${dollar}chip_labels" \
+--dist_summits $summitDistance \
+--mtl_type $mtlType ${ad}${gt}${gt} $log
+
+###############################################################################
+## Annotate mtls.xls if necessary
+#if str($map_rna)=="yes"
+  echo "annotating mtls.xls..." ${ad}${gt}${gt} $log
+  Rscript $path/visualization/annotate_mtls.R mtls.xls gene_annotation.txt $trgtDistance ${ad}${gt}${gt} $log
+#end if
+###############################################################################
+## If rna is specified, then get RNA data in correctly formatted (quoted, '::'-joined) strings:
+#if str($map_rna)=='yes'
+  #set $sep = '::'
+  #for $i, $rna in enumerate( $rna_tracks )
+    #if $i==0
+      echo "The first file label is:  ${rna.name}" ${ad}${gt}${gt} $log
+      echo "The first file path is:  ${rna.file}" ${ad}${gt}${gt} $log
+      rna_labels="${rna.name}"
+      rna_paths="${rna.file}"
+      rna_norm_paths="${rna.norm}"
+    #else
+      echo "The next file label is:  ${rna.name}" ${ad}${gt}${gt} $log
+      echo "The next file path is:  ${rna.file}" ${ad}${gt}${gt} $log
+      rna_labels="${dollar}rna_labels${sep}${rna.name}"
+      rna_paths="${dollar}rna_paths${sep}${rna.file}"
+      rna_norm_paths="${dollar}rna_norm_paths${sep}${rna.norm}"
+    #end if
+  #end for
+  echo rna paths are - "${dollar}rna_paths" ${ad}${gt}${gt} $log
+  echo rna labels are - "${dollar}rna_labels" ${ad}${gt}${gt} $log
+  echo rna norm files are - "${dollar}rna_norm_paths" ${ad}${gt}${gt} $log
+#end if
+###############################################################################
+## Choose the normalization argument: "no", "mean", or the '::'-joined file list built above.
+#if str($normalize_rna)=='no'
+  echo "Normalization by file is set to no" ${ad}${gt}${gt} $log
+  rna_norm_paths=no
+#end if
+
+#if str($use_mean)=='yes'
+  echo "Normalization of expression will be done by mean" ${ad}${gt}${gt} $log
+  rna_norm_paths=mean
+#end if
+
+#if str($map_rna)=='no'
+  mtls_file=mtls.xls
+  rna_paths=none
+  rna_labels=none
+#else
+  mtls_file=annotated_mtls.xls
+#end if
+## If nothing above set rna_norm_paths, fall back to "no" so --normalization_file is never passed empty.
+rna_norm_paths=${dollar}{rna_norm_paths:-no}
+
+## Log the heatmap.R command line (same options as the real call below), then run it.
+echo "
+Rscript $path/visualization/heatmap.R --mtls_file ./${dollar}mtls_file \
+--cluster_file ./cluster \
+--chip_experiment_order ${dollar}chip_labels \
+--heatmap_file ./heatmap \
+--heatmap_type bmp \
+--n_clusters $numClusters \
+--filter_percentage 100 \
+--number_bins $numberBins \
+--include_targetless $includeTargetless \
+--expression_file ${dollar}rna_paths \
+--expression_name ${dollar}rna_labels \
+--normalization_file ${dollar}rna_norm_paths \
+${ad}${gt}${gt} $log" ${ad}${gt}${gt} $log
+
+Rscript $path/visualization/heatmap.R --mtls_file ./${dollar}mtls_file \
+--cluster_file ./cluster \
+--chip_experiment_order "${dollar}chip_labels" \
+--heatmap_file ./heatmap \
+--heatmap_type bmp \
+--n_clusters $numClusters \
+--filter_percentage 100 \
+--number_bins $numberBins \
+--include_targetless $includeTargetless \
+--expression_file "${dollar}rna_paths" \
+--expression_name "${dollar}rna_labels" \
+--normalization_file "${dollar}rna_norm_paths" \
+${ad}${gt}${gt} $log
+
+ls ${ad}${gt}${gt} $log
+##################################################################
+## Move the generated files onto the Galaxy output datasets.
+#if str($map_rna)=='yes'
+    mv ./annotated_mtls.xls $mtls
+#else
+    mv ./mtls.xls $mtls
+#end if
+mv ./heatmap.* $heatmap_image
+mv ./cluster.tsv $cluster_assignments
+    </configfile>
+  </configfiles>
+<!--<tests>-->
+<!--  <test maxseconds="3600" name="GCA_1">-->
+<!--    <param name="bfile" value="bedfile.bed" />-->
+<!--    <param name="span" value="3000" />-->
+<!--    <param name="genome" value="hg18" />-->
+<!--    <output name="output" file="gca_1/gca_1.xls" />-->
+<!--    <output name="output" file="gca_1/gca_1.log" lines_diff = "200" />-->
+<!--  </test>-->
+<!--  <test maxseconds="3600" name="GCA_2">-->
+<!--    <param name="bfile" value="bedfile.bed" />-->
+<!--    <param name="span" value="100" />-->
+<!--    <param name="genome" value="hg18" />-->
+<!--    <output name="output" file="gca_2/gca_2.xls" />-->
+<!--    <output name="output" file="gca_2/gca_2.log" lines_diff = "200" />-->
+<!--  </test>-->
+<!--  <test maxseconds="3600" name="GCA_3">-->
+<!--    <param name="bfile" value="bedfile.bed" />-->
+<!--    <param name="span" value="500" />-->
+<!--    <param name="genome" value="hg18" />-->
+<!--    <output name="output" file="gca_3/gca_3.xls" />-->
+<!--    <output name="output" file="gca_3/gca_3.log" lines_diff = "200" />-->
+<!--  </test>-->
+<!--  <test maxseconds="3600" name="GCA_4">-->
+<!--    <param name="bfile" value="bedfile.bed" />-->
+<!--    <param name="span" value="1000" />-->
+<!--    <param name="genome" value="hg18" />-->
+<!--    <output name="output" file="gca_4/gca_4.xls" />-->
+<!--    <output name="output" file="gca_4/gca_4.log" lines_diff = "200" />-->
+<!--  </test>-->
+<!--  <test maxseconds="3600" name="GCA_5">-->
+<!--    <param name="bfile" value="bedfile.bed" />-->
+<!--    <param name="span" value="10000" />-->
+<!--    <param name="genome" value="hg18" />-->
+<!--    <output name="output" file="gca_5/gca_5.xls" />-->
+<!--    <output name="output" file="gca_5/gca_5.log" lines_diff = "200" />-->
+<!--  </test>-->
+<!--</tests>-->
+  <help>
+This tool will merge peaks from multiple ChIP-seq experiments, creating MTLs for
+each overlapping region. It will then cluster each MTL based on the score of
+each peak within each MTL (using K-means clustering, with k set by user). A
+heatmap is then generated from the resulting cluster along with the MTLs
+generated. This module is written in R and will be made available on GitHub
+and Bioconductor. This work was done by Kieran Mace and Aviv Madar.
+
+**NEED IMPROVEMENT**
+
+-----
+
+**Parameters**
+
+- **Input files** contains either macs or BED files to be merged. This list of files must be two or larger.
+- **Experiment names** contains the name given to each track.
+- **Summit distance** is the cutoff distance (in BP) between summits to be included in an MTL. This option is only used with the summit MTL type below.
+- **Input Format** Either BED or MACS file format; all files must be of one type. Defaults to MACS.
+- **MTL Type** Either interval or summit (defaults to summit).
+- **Number clusters** the value of k for kmeans clustering.
+- **Filter top MTLS** The top percentage of MTLs to keep for image and cluster (based on the union of mean, non-zero mean, and variance of the scores).
+-----
+
+**Output**
+
+- **XLS file** is the tab-delimited file containing the MTL data.
+- **BMP file** is the heatmap image generated after clustering the MTL data.
+
+-----
+
+**script parameter list of Chip-Cluster**
+
+Options:
+DESCRIPTION:
+	cluster_peaks.R takes MACS/.bed tab delimited files as input and produces one tab delimited file (named mtls.xls) where
+	each row corresponds to a Multi TF Loci (MTL) in which peaks from different experiments (input MACS/.bed files)
+	fall within a certain distance between summits from each other.
+
+INPUT:
+	1.path_input=path to MACS/bed files '::' delim [path_input=f1::f2::f3::...::fk]
+	2.path_output=path to save generated MTL cluster file (where to save mtls.xls)
+	3.expt_names=user specified names for MACS files '::' delim [expt_names=n1::n2::n3::...::nk]
+	4.dist.summits=maximum distance between summits belonging to the same MTL (defaults to 100)
+	5.input_type=the type of input file used (MACS or .bed; defaults to MACS)
+	6.mtl_type=interval or summit (defaults to summit)
+
+EXAMPLE RUN:
+	cluster_peaks.R
+	--input_macs_files input/SL2870_SL2871_peaks.xls::input/SL2872_SL2876_peaks.xls::input/SL3032_SL2871_peaks.xls::input/SL3037_SL3036_peaks.xls::input/SL3315_SL3319_peaks.xls
+	--input_type MACS
+	--path_output results/
+	--expt_names RORC_Th17::IRF4_Th17::MAF_Th17::BATF_Th17::STAT3_Th17
+	--dist_summits 100
+	--mtl_type summit
+
+	DESCRIPTION:
+	heatmap.R takes a ...
+
+INPUT:
+	1.--mtls_file path to mtls file.
+
+	2.--cluster_file the destination path for the cluster file.
+
+	3.--heatmap_file the destination path for heatmap image (no extension).
+
+	4.--heatmap_type choice of image type, currently support png and pdf.
+
+	5.--n_clusters number of clusters in the heatmap
+
+	6.--filter_percentage percentage of mtls that will be analysed. for eg. if
+	we make filter_percentage 30, we will take the union of the top mtls in
+	mean, non-zero mean and variance.
+
+
+EXAMPLE RUN:
+	Rscript heatmap.R
+				--mtls_file mtls.xls
+				--cluster_file output/cluster
+				--heatmap_file output/heatmap
+				--heatmap_type png
+				--n_clusters 13
+				--filter_percentage 60
+
+Please cite us if you used this script:
+	The transcription factor network regulating Th17 lineage specification and function.
+	Maria Ciofani, Aviv Madar, Carolina Galan, Kieran Mace, Agarwal, Kim Newberry, Richard M. Myers,
+	Richard Bonneau and Dan R. Littman et al. (in preparation)
+
+  </help>
+
+</tool>