view mtls_analyze/mtls_analyze.xml @ 4:b465306d00ba draft default tip

Uploaded
author kmace
date Mon, 23 Jul 2012 13:00:15 -0400
parents
children
line wrap: on
line source

<tool name="Chip-Cluster: Cluster ChIP-seq peaks and create a heatmap" id="chip-cluster">
  <description>
    Merge multiple ChIP-seq experiments, aligning their peaks to MTLs (Multi
    Transcription Factor Loci) and optionally incorporating expression data
  </description>
  <!-- Run the Cheetah-rendered "shscript" configfile with bash. The former
       interpreter="command" attribute named no real interpreter (it only worked
       because "command" is a bash builtin), so bash is now invoked directly. -->
  <command>/bin/bash $shscript</command>
  <inputs>
    <!-- ChIP-seq peak inputs: the file format of the peak files and how peaks
         are grouped into MTLs. Both values are passed to cluster_peaks.R by the
         shscript configfile. -->
    <param name="chipInputFormat" type="select" display="radio" label="ChIP Input Format">
      <option name="macs" value="MACS">MACS</option>
      <option name="bed" value="BED">BED</option>
    </param>
    <param name="mtlType" type="select" display="radio" label="Cluster by: ">
      <option name="summit" value="summit">Summit</option>
      <option name="interval" value="interval">Interval</option>
    </param>
    <!-- Numeric settings are declared as integers (they were free text) so that
         Galaxy validates them before they are substituted into the generated
         shell script. Defaults are unchanged. -->
    <param name="summitDistance" type="integer" min="0" value="100" label="Summit Distance (BP) - Summit only"/>
    <param name="numberBins" type="integer" min="1" value="30" label="Number of Bins"/>
    <!-- One entry per ChIP experiment: the peak file and the label used for it. -->
    <repeat name="chip_tracks" title="MACS/BED Files">
      <param name="file" type="data" format="tabular" label="Dataset"/>
      <param name="name" type="text" label="Dataset Name"/>
    </repeat>
    <!-- Expression (RNA) options. The script only annotates MTLs and collects
         the RNA tracks when map_rna is "yes". -->
    <param name="map_rna" type="boolean" truevalue="yes" falsevalue="no" label="Incorporate RNA?"/>
    <param name="includeTargetless" checked="true" type="boolean" truevalue="yes" falsevalue="no" label="Include Targetless MTLs?"/>
    <param name="reference_file" type="data" format="tabular" label="Reference Genome File"/>

    <param name="normalize_rna" type="boolean" truevalue="yes" falsevalue="no" label="Normalize Expression?"/>
    <param name="use_mean" type="boolean" truevalue="yes" falsevalue="no" label="Use mean expression across exp. to normalize?"/>
    <param name="rnaInputFormat" type="select" display="radio" label="RNA Input Format">
      <option name="cufflinks" value="cufflinks">Cufflinks</option>
      <option name="bed" value="bed">BED</option>
    </param>
    <param name="numClusters" type="integer" min="1" value="8" label="Number of Clusters (kmeans)"/>
    <param name="trgtDistance" type="integer" min="0" value="5000" label="Transcript threshold distance"/>
    <!-- One entry per expression experiment: data file, label, and the dataset
         used when normalizing by file. -->
    <repeat name="rna_tracks" title="Cufflinks/BED Files">
      <param name="file" type="data" format="tabular" label="Dataset"/>
      <param name="name" type="text" label="Dataset Name"/>
      <param name="norm" type="data" label="Normalization Dataset"/>
    </repeat>
  </inputs>
  <outputs>
    <!-- Datasets produced by the job. cluster_assignments, mtls and
         heatmap_image are filled by the mv commands at the end of the shscript
         configfile; log is appended to throughout the script. -->
    <data name="cluster_assignments" format="xls" label="Cluster Assignments"/>
    <data name="mtls" format="xls" label="MTLS File"/>
    <data name="log" format="txt" label="Log file"/>
    <data name="heatmap_image" format="bmp" label="Heatmap Image"/>
    <!-- Disabled alternatives: format-specific heatmap outputs selected by an
         imageFormat filter. No imageFormat parameter is defined in the inputs
         of this file. -->
<!--    <data format="png" name="heatmap_image" label="Heatmap Image" >-->
<!--      <filter>imageFormat=="png"</filter>-->
<!--    </data>-->
<!--    <data format="pdf" name="heatmap_image" label="Heatmap Image" >-->
<!--      <filter>imageFormat=="pdf"</filter>-->
<!--    </data>-->

  </outputs>
  <configfiles>
    <configfile name="shscript">
<!-- This is the script that runs (Cheetah/bash code)-->
#!/bin/bash

#import os
#set $path = $os.path.abspath($__app__.config.tool_path)


## Shell metacharacters are produced with chr() so that they are neither
## invalid inside this XML file nor misread by Cheetah:
#set $dollar = chr(36)
#set $gt = chr(62)
#set $lt = chr(60)
#set $ad = chr(38)
#set $bs = chr(92)

echo $map_rna ${ad}${gt}${gt} $log
echo "This is the Bash log file: " ${ad}${gt}${gt} $log
###############################################################################
## Convert the GTF file to a file that Aviv's script can handle
#if str($map_rna)=='yes'
  echo "Converting gtf file" ${ad}${gt}${gt} $log
  Rscript $path/visualization/gtfToMapFriendlyAnnotation.R $reference_file ${ad}${gt}${gt} $log
  echo "done converting gtf file" ${ad}${gt}${gt} $log
#end if
###############################################################################
## Build the '::'-separated ChIP label and path lists.
## Labels are user-entered text and may contain spaces, so every assignment and
## every later use of these shell variables is double-quoted.
#set $sep = '::'
#for $i, $chip in enumerate( $chip_tracks )
  #if $i==0
    echo "Chip Files:" ${ad}${gt}${gt} $log
    echo "The first file label is:  ${chip.name}" ${ad}${gt}${gt} $log
    echo "The first file path is:  ${chip.file}" ${ad}${gt}${gt} $log
    chip_labels="${chip.name}"
    chip_paths="${chip.file}"
  #else
    echo "The next file label is:  ${chip.name}" ${ad}${gt}${gt} $log
    echo "The next file path is:  ${chip.file}" ${ad}${gt}${gt} $log
    chip_labels="${dollar}chip_labels${sep}${chip.name}"
    chip_paths="${dollar}chip_paths${sep}${chip.file}"
  #end if
#end for

echo "chip paths are - ${dollar}chip_paths" ${ad}${gt}${gt} $log
echo "chip labels are - ${dollar}chip_labels" ${ad}${gt}${gt} $log

###############################################################################
## Cluster peaks

Rscript $path/visualization/cluster_peaks.R \
--input_files "${dollar}chip_paths" \
--input_type $chipInputFormat \
--path_output ./ \
--expt_names "${dollar}chip_labels" \
--dist_summits $summitDistance \
--mtl_type $mtlType ${ad}${gt}${gt} $log

###############################################################################
## Annotate mtls.xls if necessary
#if str($map_rna)=="yes"
  echo "annotating mtls.xls..." ${ad}${gt}${gt} $log
  Rscript $path/visualization/annotate_mtls.R mtls.xls gene_annotation.txt $trgtDistance ${ad}${gt}${gt} $log
#end if
###############################################################################
## If RNA is specified, build the '::'-separated RNA label, path and
## normalization-file lists (quoted for the same reason as the ChIP lists):
#if str($map_rna)=='yes'
  #set $sep = '::'
  #for $i, $rna in enumerate( $rna_tracks )
    #if $i==0
      echo "The first file label is:  ${rna.name}" ${ad}${gt}${gt} $log
      echo "The first file path is:  ${rna.file}" ${ad}${gt}${gt} $log
      rna_labels="${rna.name}"
      rna_paths="${rna.file}"
      rna_norm_paths="${rna.norm}"
    #else
      echo "The next file label is:  ${rna.name}" ${ad}${gt}${gt} $log
      echo "The next file path is:  ${rna.file}" ${ad}${gt}${gt} $log
      rna_labels="${dollar}rna_labels${sep}${rna.name}"
      rna_paths="${dollar}rna_paths${sep}${rna.file}"
      rna_norm_paths="${dollar}rna_norm_paths${sep}${rna.norm}"
    #end if
  #end for
  echo "rna paths are - ${dollar}rna_paths" ${ad}${gt}${gt} $log
  echo "rna labels are - ${dollar}rna_labels" ${ad}${gt}${gt} $log
  echo "rna norm files are - ${dollar}rna_norm_paths" ${ad}${gt}${gt} $log
#end if
###############################################################################
## Normalization overrides: "no" disables normalization by file, "mean"
## replaces any per-track normalization files.

#if str($normalize_rna)=='no'
  echo "Normalization by file is set to no" ${ad}${gt}${gt} $log
  rna_norm_paths=no
#end if

#if str($use_mean)=='yes'
  echo "Normalization of expression will be done by mean" ${ad}${gt}${gt} $log
  rna_norm_paths=mean
#end if

#if str($map_rna)=='no'
  mtls_file=mtls.xls
  rna_paths=none
  rna_labels=none
  ## Without RNA the track loop above never runs, so rna_norm_paths would be
  ## left unset when neither override applied; give it the "no" value so the
  ## normalization_file option always receives an argument.
  #if str($normalize_rna)!='no' and str($use_mean)!='yes'
  rna_norm_paths=no
  #end if
#else
  mtls_file=annotated_mtls.xls
#end if

## Log the heatmap command line; keep this option list in step with the real
## call that follows.
echo "
Rscript $path/visualization/heatmap.R --mtls_file ./${dollar}mtls_file \
--cluster_file ./cluster \
--chip_experiment_order ${dollar}chip_labels \
--heatmap_file ./heatmap \
--heatmap_type bmp \
--n_clusters $numClusters \
--filter_percentage 100 \
--number_bins $numberBins \
--include_targetless $includeTargetless \
--expression_file ${dollar}rna_paths \
--expression_name ${dollar}rna_labels \
--normalization_file ${dollar}rna_norm_paths \
${ad}${gt}${gt} $log" ${ad}${gt}${gt} $log  

Rscript $path/visualization/heatmap.R --mtls_file "./${dollar}mtls_file" \
--cluster_file ./cluster \
--chip_experiment_order "${dollar}chip_labels" \
--heatmap_file ./heatmap \
--heatmap_type bmp \
--n_clusters $numClusters \
--filter_percentage 100 \
--number_bins $numberBins \
--include_targetless $includeTargetless \
--expression_file "${dollar}rna_paths" \
--expression_name "${dollar}rna_labels" \
--normalization_file "${dollar}rna_norm_paths" \
${ad}${gt}${gt} $log

ls ${ad}${gt}${gt} $log




##################################################################
## Hand the results to Galaxy. mtls_file already names the right MTL table
## (annotated_mtls.xls with RNA, mtls.xls without).
mv "./${dollar}mtls_file" $mtls
mv ./heatmap.* $heatmap_image
mv ./cluster.tsv $cluster_assignments

    </configfile>
  </configfiles>
<!--<tests>-->
<!--  <test maxseconds="3600" name="GCA_1">-->
<!--    <param name="bfile" value="bedfile.bed" />-->
<!--    <param name="span" value="3000" />-->
<!--    <param name="genome" value="hg18" />-->
<!--    <output name="output" file="gca_1/gca_1.xls" />-->
<!--    <output name="output" file="gca_1/gca_1.log" lines_diff = "200" />-->
<!--  </test>-->
<!--  <test maxseconds="3600" name="GCA_2">-->
<!--    <param name="bfile" value="bedfile.bed" />-->
<!--    <param name="span" value="100" />-->
<!--    <param name="genome" value="hg18" />-->
<!--    <output name="output" file="gca_2/gca_2.xls" />-->
<!--    <output name="output" file="gca_2/gca_2.log" lines_diff = "200" />-->
<!--  </test>-->
<!--  <test maxseconds="3600" name="GCA_3">-->
<!--    <param name="bfile" value="bedfile.bed" />-->
<!--    <param name="span" value="500" />-->
<!--    <param name="genome" value="hg18" />-->
<!--    <output name="output" file="gca_3/gca_3.xls" />-->
<!--    <output name="output" file="gca_3/gca_3.log" lines_diff = "200" />-->
<!--  </test>-->
<!--  <test maxseconds="3600" name="GCA_4">-->
<!--    <param name="bfile" value="bedfile.bed" />-->
<!--    <param name="span" value="1000" />-->
<!--    <param name="genome" value="hg18" />-->
<!--    <output name="output" file="gca_4/gca_4.xls" />-->
<!--    <output name="output" file="gca_4/gca_4.log" lines_diff = "200" />-->
<!--  </test>-->
<!--  <test maxseconds="3600" name="GCA_5">-->
<!--    <param name="bfile" value="bedfile.bed" />-->
<!--    <param name="span" value="10000" />-->
<!--    <param name="genome" value="hg18" />-->
<!--    <output name="output" file="gca_5/gca_5.xls" />-->
<!--    <output name="output" file="gca_5/gca_5.log" lines_diff = "200" />-->
<!--  </test>-->
<!--</tests>-->
  <help>
This tool will merge peaks from multiple ChIP-seq experiments, creating MTLs for
each overlapping region. It will then cluster each MTL based on the score of
each peak within each MTL (using K-means clustering, with k set by the user). A
heatmap is then generated from the resulting clusters along with the MTLs
generated. This module is written in R and will be made available on GitHub
and Bioconductor. This work was done by Kieran Mace and Aviv Madar.

**NEED IMPROVEMENT**

-----

**Parameters**

- **Input files** contains either MACS or BED files to be merged. At least two files must be provided.
- **Experiment names** contains the name given to each track.
- **Summit distance** is the cutoff distance (in BP) between summits for peaks to be included in the same MTL. This option is only used with the summit MTL type below.
- **Input Format** Either BED or MACS file format; all files must be of one type. Defaults to MACS.
- **MTL Type** Either interval or summit (defaults to summit).
- **Number clusters** the value of k for kmeans clustering.
- **Filter top MTLS** The top percentage of MTLs to keep for image and cluster (based on the union of mean, non-zero mean, and variance of the scores).

-----

**Output**

- **XLS file** is the tab-delimited file containing the MTL data.
- **BMP file** is the heatmap image generated after clustering the MTL data.

-----

**script parameter list of Chip-Cluster**

Options:
DESCRIPTION:
	cluster_peaks.R takes MACS/.bed tab-delimited files as input and produces one tab-delimited file (named mtls.xls) where
	each row corresponds to a Multi TF Locus (MTL) in which peaks from different experiments (input MACS/.bed files)
	fall within a certain distance between summits from each other.

INPUT:
	1.input_files=paths to MACS/bed files '::' delim [input_files=f1::f2::f3::...::fk]
	2.path_output=path to save generated MTL cluster file (where to save mtls.xls)
	3.expt_names=user specified names for MACS files '::' delim [expt_names=n1::n2::n3::...::nk]
	4.dist_summits=maximum distance between summits belonging to the same MTL (defaults to 100)
	5.input_type=the type of input file used (MACS or .bed; defaults to MACS)
	6.mtl_type=interval or summit (defaults to summit)

EXAMPLE RUN:
	cluster_peaks.R
	--input_files input/SL2870_SL2871_peaks.xls::input/SL2872_SL2876_peaks.xls::input/SL3032_SL2871_peaks.xls::input/SL3037_SL3036_peaks.xls::input/SL3315_SL3319_peaks.xls
	--input_type MACS
	--path_output results/
	--expt_names RORC_Th17::IRF4_Th17::MAF_Th17::BATF_Th17::STAT3_Th17
	--dist_summits 100
	--mtl_type summit

DESCRIPTION:
	heatmap.R takes an MTL file, clusters the MTLs (k-means) and draws a heatmap of the result.

INPUT:
	1.--mtls_file path to mtls file.

	2.--cluster_file the destination path for the cluster file.

	3.--heatmap_file the destination path for heatmap image (no extension).

	4.--heatmap_type choice of image type; png and pdf are listed as supported (this Galaxy tool always passes bmp).

	5.--n_clusters number of clusters in the heatmap

	6.--filter_percentage percentage of MTLs that will be analysed, e.g. if
	filter_percentage is 30, we take the union of the top 30% of MTLs by
	mean, non-zero mean and variance.


EXAMPLE RUN:
	Rscript heatmap.R
				--mtls_file mtls.xls
				--cluster_file output/cluster
				--heatmap_file output/heatmap
				--heatmap_type png
				--n_clusters 13
				--filter_percentage 60

Please cite us if you use this script:
	The transcription factor network regulating Th17 lineage specification and function.
	Maria Ciofani, Aviv Madar, Carolina Galan, Kieran Mace, Agarwal, Kim Newberry, Richard M. Myers,
	Richard Bonneau and Dan R. Littman et al. (in preparation)

  </help>

</tool>