Mercurial > repos > kmace > mtls_analysis
diff mtls_analyze/mtls_analyze.xml @ 4:b465306d00ba draft default tip
Uploaded
author | kmace |
---|---|
date | Mon, 23 Jul 2012 13:00:15 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mtls_analyze/mtls_analyze.xml Mon Jul 23 13:00:15 2012 -0400 @@ -0,0 +1,339 @@ +<tool name="Chip-Cluster: Cluster ChIP-seq peaks and create a heatmap" id="chip-cluster"> + <description> + Merge multiple ChIP-seq experiments, aligning their peaks to MTLs (Multi + Transcription Factor Loci(us)) and optionally incorporate expression + </description> + <command interpreter="command">/bin/bash $shscript </command> + <inputs> + <param name="chipInputFormat" type="select" display="radio" label="ChIP Input Format"> + <option name="macs" value="MACS">MACS</option> + <option name="bed" value="BED">BED</option> + </param> + <param name="mtlType" type="select" display="radio" label="Cluster by: "> + <option name="summit" value="summit">Summit</option> + <option name="interval" value="interval">Interval</option> + </param> + <param name="summitDistance" type="text" label="Summit Distance (BP) - Summit only" value="100"> + </param> + <param name="numberBins" type="text" label="Number of Bins" value="30"> + </param> + <repeat name="chip_tracks" title="MACS/BED Files"> + <param name="file" type="data" format="tabular" label="Dataset"/> + <param name="name" type="text" label="Dataset Name"/> + </repeat> + <param name="map_rna" type="boolean" truevalue="yes" falsevalue="no" label="Incorporate RNA?"/> + <param name="includeTargetless" checked="true" type="boolean" truevalue="yes" falsevalue="no" label="Include Targetless MTLs?"/> + <param name="reference_file" type="data" format="tabular" label="Reference Genome File"/> + + <param name="normalize_rna" type="boolean" truevalue="yes" falsevalue="no" label="Normalize Expression?"/> + <param name="use_mean" type="boolean" truevalue="yes" falsevalue="no" label="Use mean expression across exp. 
to normalize?"/> + <param name="rnaInputFormat" type="select" display="radio" label="RNA Input Format"> + <option name="cufflinks" value="cufflinks">Cufflinks</option> + <option name="bed" value="bed">BED</option> + </param> + <param name="numClusters" type="text" label="Number of Clusters (kmeans)" value="8"> + </param> + <param name="trgtDistance" type="text" label="Transcript threshold distance" value="5000"> + </param> + <repeat name="rna_tracks" title="Cufflinks/BED Files"> + <param name="file" type="data" format="tabular" label="Dataset"/> + <param name="name" type="text" label="Dataset Name"/> + <param name="norm" type="data" label="Normalization Dataset"/> + </repeat> + </inputs> + <outputs> + <data format="xls" name="cluster_assignments" label="Cluster Assignments"/> + <data format="xls" name="mtls" label="MTLS File"/> + <data format="txt" name="log" label="Log file" /> + <data format="bmp" name="heatmap_image" label="Heatmap Image" /> +<!-- <data format="png" name="heatmap_image" label="Heatmap Image" >--> +<!-- <filter>imageFormat=="png"</filter>--> +<!-- </data>--> +<!-- <data format="pdf" name="heatmap_image" label="Heatmap Image" >--> +<!-- <filter>imageFormat=="pdf"</filter>--> +<!-- </data>--> + + </outputs> + <configfiles> + <configfile name="shscript"> +<!-- This is the script that runs (Cheetah/bash code)--> +#!/bin/bash + +#import os +#set $path = $os.path.abspath($__app__.config.tool_path) + + +## Set symbols (dollar, gt, lt, ampersand, backslash) by character code so that they are not incorrectly interpreted: +#set $dollar = chr(36) +#set $gt = chr(62) +#set $lt = chr(60) +#set $ad = chr(38) +#set $bs = chr(92) + +echo $map_rna ${ad}${gt}${gt} $log +echo "This is the Bash log file: " ${ad}${gt}${gt} $log +############################################################################### +## Convert the gtf file to a file that Aviv's script can handle +#if str($map_rna)=='yes' + echo "Converting gtf file" ${ad}${gt}${gt} $log + Rscript $path/visualization/gtfToMapFriendlyAnnotation.R $reference_file 
${ad}${gt}${gt} $log + echo "done converting gtf file" ${ad}${gt}${gt} $log +#end if +############################################################################### +## Get ChIP data in correctly formatted strings ('::'-joined labels and paths). +#set $sep = '::' +#for $i, $chip in enumerate( $chip_tracks ) + #if $i==0 + echo "Chip Files:" ${ad}${gt}${gt} $log + echo "The first file label is: ${chip.name}" ${ad}${gt}${gt} $log + echo "The first file path is: ${chip.file}" ${ad}${gt}${gt} $log + chip_labels=${chip.name} + chip_paths=${chip.file} + #else + echo "The next file label is: ${chip.name}" ${ad}${gt}${gt} $log + echo "The next file path is: ${chip.file}" ${ad}${gt}${gt} $log + chip_labels=${dollar}chip_labels${sep}${chip.name} + chip_paths=${dollar}chip_paths${sep}${chip.file} + #end if +#end for + +echo chip paths are - ${dollar}chip_paths ${ad}${gt}${gt} $log +echo chip labels are - ${dollar}chip_labels ${ad}${gt}${gt} $log + +############################################################################### +## Cluster peaks + +Rscript $path/visualization/cluster_peaks.R \ +--input_files ${dollar}chip_paths \ +--input_type $chipInputFormat \ +--path_output ./ \ +--expt_names ${dollar}chip_labels \ +--dist_summits $summitDistance \ +--mtl_type $mtlType ${ad}${gt}${gt} $log + +############################################################################### +## Annotate mtls.xls if necessary +#if str($map_rna)=="yes" + echo "annotating mtls.xls..." 
${ad}${gt}${gt} $log + Rscript $path/visualization/annotate_mtls.R mtls.xls gene_annotation.txt $trgtDistance ${ad}${gt}${gt} $log +#end if +############################################################################### +## If rna is specified, then get RNA data in correctly formatted strings: +#if str($map_rna)=='yes' + #set $sep = '::' + #for $i, $rna in enumerate( $rna_tracks ) + #if $i==0 + echo "The first file label is: ${rna.name}" ${ad}${gt}${gt} $log + echo "The first file path is: ${rna.file}" ${ad}${gt}${gt} $log + rna_labels=${rna.name} + rna_paths=${rna.file} + rna_norm_paths=${rna.norm} + #else + echo "The next file label is: ${rna.name}" ${ad}${gt}${gt} $log + echo "The next file path is: ${rna.file}" ${ad}${gt}${gt} $log + rna_labels=${dollar}rna_labels${sep}${rna.name} + rna_paths=${dollar}rna_paths${sep}${rna.file} + rna_norm_paths=${dollar}rna_norm_paths${sep}${rna.norm} + #end if + #end for + echo rna paths are - ${dollar}rna_paths ${ad}${gt}${gt} $log + echo rna labels are - ${dollar}rna_labels ${ad}${gt}${gt} $log + echo rna norm files are - ${dollar}rna_norm_paths ${ad}${gt}${gt} $log +#end if +############################################################################### + +#if str($normalize_rna)=='no' + echo "Normalization by file is set to no" ${ad}${gt}${gt} $log + rna_norm_paths=no +#end if + +#if str($use_mean)=='yes' + echo "Normalization of expression will be done by mean" ${ad}${gt}${gt} $log + rna_norm_paths=mean +#end if + +#if str($map_rna)=='no' + mtls_file=mtls.xls + rna_paths=none + rna_labels=none + ## No RNA tracks were read, so rna_norm_paths is unset unless "no" or "mean" was chosen above; + ## default it to "no" so that --normalization_file is never passed without an argument. + rna_norm_paths=${dollar}{rna_norm_paths:-no} +#else + mtls_file=annotated_mtls.xls +#end if + +## Log the heatmap.R command before running it; keep it in sync with the invocation below. +echo " +Rscript $path/visualization/heatmap.R --mtls_file ./${dollar}mtls_file \ +--cluster_file ./cluster \ +--chip_experiment_order ${dollar}chip_labels \ +--heatmap_file ./heatmap \ +--heatmap_type bmp \ +--n_clusters $numClusters \ +--filter_percentage 100 \ +--number_bins $numberBins \ +--include_targetless $includeTargetless \ +--expression_file ${dollar}rna_paths \ +--expression_name ${dollar}rna_labels \ +--normalization_file 
${dollar}rna_norm_paths \ +${ad}${gt}${gt} $log" ${ad}${gt}${gt} $log + +Rscript $path/visualization/heatmap.R --mtls_file ./${dollar}mtls_file \ +--cluster_file ./cluster \ +--chip_experiment_order ${dollar}chip_labels \ +--heatmap_file ./heatmap \ +--heatmap_type bmp \ +--n_clusters $numClusters \ +--filter_percentage 100 \ +--number_bins $numberBins \ +--include_targetless $includeTargetless \ +--expression_file ${dollar}rna_paths \ +--expression_name ${dollar}rna_labels \ +--normalization_file ${dollar}rna_norm_paths \ +${ad}${gt}${gt} $log + +ls ${ad}${gt}${gt} $log + + + + +################################################################## +#if str($map_rna)=='yes' + mv ./annotated_mtls.xls $mtls +#else + mv ./mtls.xls $mtls +#end if +mv ./heatmap.* $heatmap_image +mv ./cluster.tsv $cluster_assignments + + </configfile> + </configfiles> +<!--<tests>--> +<!-- <test maxseconds="3600" name="GCA_1">--> +<!-- <param name="bfile" value="bedfile.bed" />--> +<!-- <param name="span" value="3000" />--> +<!-- <param name="genome" value="hg18" />--> +<!-- <output name="output" file="gca_1/gca_1.xls" />--> +<!-- <output name="output" file="gca_1/gca_1.log" lines_diff = "200" />--> +<!-- </test>--> +<!-- <test maxseconds="3600" name="GCA_2">--> +<!-- <param name="bfile" value="bedfile.bed" />--> +<!-- <param name="span" value="100" />--> +<!-- <param name="genome" value="hg18" />--> +<!-- <output name="output" file="gca_2/gca_2.xls" />--> +<!-- <output name="output" file="gca_2/gca_2.log" lines_diff = "200" />--> +<!-- </test>--> +<!-- <test maxseconds="3600" name="GCA_3">--> +<!-- <param name="bfile" value="bedfile.bed" />--> +<!-- <param name="span" value="500" />--> +<!-- <param name="genome" value="hg18" />--> +<!-- <output name="output" file="gca_3/gca_3.xls" />--> +<!-- <output name="output" file="gca_3/gca_3.log" lines_diff = "200" />--> +<!-- </test>--> +<!-- <test maxseconds="3600" name="GCA_4">--> +<!-- <param name="bfile" value="bedfile.bed" />--> +<!-- <param 
name="span" value="1000" />--> +<!-- <param name="genome" value="hg18" />--> +<!-- <output name="output" file="gca_4/gca_4.xls" />--> +<!-- <output name="output" file="gca_4/gca_4.log" lines_diff = "200" />--> +<!-- </test>--> +<!-- <test maxseconds="3600" name="GCA_5">--> +<!-- <param name="bfile" value="bedfile.bed" />--> +<!-- <param name="span" value="10000" />--> +<!-- <param name="genome" value="hg18" />--> +<!-- <output name="output" file="gca_5/gca_5.xls" />--> +<!-- <output name="output" file="gca_5/gca_5.log" lines_diff = "200" />--> +<!-- </test>--> +<!--</tests>--> + <help> +This tool will merge peaks from multiple ChIP-seq experiments, creating MTLs for +each overlapping region. It will then cluster each MTL based on the score of +each peak within each MTL (using K-means clustering, with k set by user). A +heatmap is then generated from the resulting cluster along with the MTLs +generated. This module is written in R and will be made available on GitHub +and Bioconductor. This work was done by Kieran Mace and Aviv Madar. + +**NEED IMPROVEMENT** + +----- + +**Parameters** + +- **Input files** contains either MACS or BED files to be merged. At least two files are required. +- **Experiment names** contains the name given to each track. +- **Summit distance** is the cutoff distance (in BP) between summits to be included in an MTL. This option is only used with the summit MTL type; it is not used with the interval type. +- **Input Format** Either BED or MACS file format; all files must be of one type. Defaults to MACS. +- **MTL Type** Either interval or summit (defaults to summit). +- **Number clusters** the value of k for kmeans clustering. +- **Filter top MTLS** The top percentage of MTLs to keep for image and cluster (based on the union of mean, non-zero mean, and variance of the scores). +----- + +**Output** + +- **XLS file** is the tab-delimited file containing the MTL data. +- **BMP file** is the heatmap image generated after clustering the MTL data. 
+ +----- + +**script parameter list of Chip-Cluster** + +Options: +DESCRIPTION: + cluster_peaks.R takes MACS/.bed tab delimited files as input and produces one tab delimited file (named mtls.xls) where + each row corresponds to a Multi TF Loci (MTL) in which peaks from different experiments (input MACS/.bed files) + fall within a certain distance between summits from each other. + +INPUT: + 1.path_input=path to MACS/bed files '::' delim [path_input=f1::f2::f3::...::fk] + 2.path_output=path to save generated MTL cluster file (where to save mtls.xls) + 3.expt_names=user specified names for MACS files '::' delim [expt_names=n1::n2::n3::...::nk] + 4.dist.summits=maximum distance between summits belonging to the same MTL (defaults to 100) + 5.input_type=the type of input file used (MACS or .bed; defaults to MACS) + 6.mtl_type=interval or summit (defaults to summit) + +EXAMPLE RUN: + cluster_peaks.R + --input_files input/SL2870_SL2871_peaks.xls::input/SL2872_SL2876_peaks.xls::input/SL3032_SL2871_peaks.xls::input/SL3037_SL3036_peaks.xls::input/SL3315_SL3319_peaks.xls + --input_type MACS + --path_output results/ + --expt_names RORC_Th17::IRF4_Th17::MAF_Th17::BATF_Th17::STAT3_Th17 + --dist_summits 100 + --mtl_type summit + + DESCRIPTION: + heatmap.R takes a ... + +INPUT: + 1.--mtls_file path to mtls file. + + 2.--cluster_file the destination path for the cluster file. + + 3.--heatmap_file the destination path for heatmap image (no extension). + + 4.--heatmap_type choice of image type, currently support png and pdf. + + 5.--n_clusters number of clusters in the heatmap + + 6.--filter_percentage percentage of mtls that will be analysed, e.g. if + we make filter_percentage 30, we will take the union of the top mtls in + mean, non-zero mean and variance. 
+ + +EXAMPLE RUN: + Rscript heatmap.R + --mtls_file mtls.xls + --cluster_file output/cluster + --heatmap_file output/heatmap + --heatmap_type png + --n_clusters 13 + --filter_percentage 60 + +Please cite us if you used this script: + The transcription factor network regulating Th17 lineage specification and function. + Maria Ciofani, Aviv Madar, Carolina Galan, Kieran Mace, Agarwal, Kim Newberry, Richard M. Myers, + Richard Bonneau and Dan R. Littman et al. (in preparation) + + </help> + +</tool>