changeset 10:d7725c5596ab draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/diffbind commit f970dcbe9d0e4c3714b1db74c404ea34223cf8ed
author iuc
date Tue, 20 Mar 2018 04:51:25 -0400
parents 6171163112de
children 4c7ab9995f9e
files diffbind.R diffbind.xml test-data/DiffBind_analysis.RData test-data/out_plots.pdf
diffstat 4 files changed, 98 insertions(+), 84 deletions(-) [+]
line wrap: on
line diff
--- a/diffbind.R	Sun Jan 28 05:10:25 2018 -0500
+++ b/diffbind.R	Tue Mar 20 04:51:25 2018 -0400
@@ -21,7 +21,8 @@
     'infile' , 'i', 1, "character",
     'format', 'f', 1, "character",
     'th', 't', 1, "double",
-    'bmatrix', 'b', 0, "logical"
+    'bmatrix', 'b', 0, "logical",
+    "rdaOpt", "r", 0, "logical"
 ), byrow=TRUE, ncol=4);
 
 opt = getopt(spec);
@@ -43,6 +44,7 @@
 sample_analyze = dba.analyze(sample_contrast)
 diff_bind = dba.report(sample_analyze)
 orvals = dba.plotHeatmap(sample_analyze, contrast=1, correlations=FALSE)
+dev.off()
 
 resSorted <- diff_bind[order(diff_bind$FDR),]
 write.table(as.data.frame(resSorted), file = opt$outfile, sep="\t", quote = FALSE, append=TRUE, row.names = FALSE, col.names = FALSE)
@@ -53,5 +55,10 @@
     write.table(as.data.frame(bmat), file="bmatrix.tab", sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE)
 }
 
-dev.off()
+## Output RData file
+
+if (!is.null(opt$rdaOpt)) {
+    save.image(file = "DiffBind_analysis.RData")
+}
+
 sessionInfo()
--- a/diffbind.xml	Sun Jan 28 05:10:25 2018 -0500
+++ b/diffbind.xml	Tue Mar 20 04:51:25 2018 -0400
@@ -1,10 +1,8 @@
-<tool id="diffbind" name="DiffBind" version="2.6.5.0">
+<tool id="diffbind" name="DiffBind" version="2.6.6.0">
     <description> differential binding analysis of ChIP-Seq peak data</description>
     <requirements>
-        <requirement type="package" version="2.6.5">bioconductor-diffbind</requirement>
+        <requirement type="package" version="2.6.6">bioconductor-diffbind</requirement>
         <requirement type="package" version="1.20.0">r-getopt</requirement>
-        <!--added rmysql requirement to remove: "Warning: namespace ‘RMySQL’ is not available"-->
-        <requirement type="package" version="0.10.11">r-rmysql</requirement>
     </requirements>
     <stdio>
         <regex match="Execution halted"
@@ -21,7 +19,7 @@
            description="An undefined error occured, please check your intput carefully and contact your administrator." />
     </stdio>
     <version_command><![CDATA[
-echo $(R --version | grep version | grep -v GNU)", DiffBind version" $(R --vanilla --slave -e "library(DiffBind); cat(sessionInfo()\$otherPkgs\$DiffBind\$Version)" 2> /dev/null | grep -v -i "WARNING: ")," getopt version" $(R --vanilla --slave -e "library(getopt); cat(sessionInfo()\$otherPkgs\$getopt\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", rmysql version" $(R --vanilla --slave -e "library(rmysql); cat(sessionInfo()\$otherPkgs\$rmysql\$Version)" 2> /dev/null | grep -v -i "WARNING: ")
+echo $(R --version | grep version | grep -v GNU)", DiffBind version" $(R --vanilla --slave -e "library(DiffBind); cat(sessionInfo()\$otherPkgs\$DiffBind\$Version)" 2> /dev/null | grep -v -i "WARNING: ")
     ]]></version_command>
     <command><![CDATA[
         ## seems that diffbind also needs file extensions to work properly
@@ -39,13 +37,17 @@
         Rscript '$__tool_directory__/diffbind.R'
             -i $infile
             -o '$outfile'
+            -t $th
+            -f $out.format
             -p '$plots'
-            -f $format
-            -t $th
 
-            #if $binding_affinity_matrix:
+            #if $out.binding_matrix:
                 -b
             #end if
+
+            #if $out.rdata:
+                -r
+            #end if
 ]]>
     </command>
     <configfiles>
@@ -66,7 +68,7 @@
 #end for]]></configfile>
     </configfiles>
     <inputs>
-        <repeat name="samples" title="Samples" min="2">
+        <repeat name="samples" title="Samples" min="4">
             <param name="sample_id" type="text" value="Sample ID" label="Specify a sample id" help="e.g. BT474.1-" />
             <param name="tissue" type="text" value="Tissue" label="Specify the tissue" help="e.g. BT474" />
             <param name="factor" type="text" value="Factor Name" label="Specify a factor name" help="e.g. ER" />
@@ -79,32 +81,41 @@
         <param name="th" type="float" value="1" min="0" max="1"
                 label="FDR Threshold"
                 help="Significance threshold; all sites with FDR less than or equal to this value will be included in the report. A value of 1 will include all binding sites in the report. Default: 1"/>
-        <param name="pdf" type="boolean" truevalue="" falsevalue="" checked="true"
-            label="Visualising the analysis results"
-            help="output an additional PDF file" />
-        <param name="format" type="select" label="Output Format">
-            <option value="bed">BED</option>
-            <option value="gff">GFF</option>
-            <option value="wig">WIG</option>
-        </param>
-        <param name="binding_affinity_matrix" type="boolean" truevalue="True" falsevalue="" checked="False" label="Output binding affinity matrix?" help="Output a table of the binding scores" />
+        
+        <!-- Output Options -->
+        <section name="out" expanded="false" title="Output Options">
+            <param name="format" type="select" label="Output Format">
+                <option value="bed">BED</option>
+                <option value="gff">GFF</option>
+                <option value="wig">WIG</option>
+            </param>
+            <param name="pdf" type="boolean" truevalue="True" falsevalue="" checked="False" label="Visualising the analysis results" help="output an additional PDF file" />
+            <param name="binding_matrix" type="boolean" truevalue="True" falsevalue="" checked="False" label="Output binding affinity matrix?" help="Output a table of the binding scores" />
+            <param name="rdata" type="boolean" truevalue="True" falsevalue="" checked="False" label="Output RData file?" help="Output all the data used by R to construct the plots and tables, can be loaded into R. Default: No">
+            </param>
+        </section>
     </inputs>
+
     <outputs>
-        <data name="outfile" format="bed" label="Differential binding sites on ${on_string}">
+        <data name="outfile" format="bed" label="${tool.name} on ${on_string}: Differentially bound sites">
             <change_format>
                 <when input="format" value="wig" format="wig" />
                 <when input="format" value="gff" format="gff" />
             </change_format>
         </data>
-        <data name="plots" format="pdf" label="Differential binding sites on ${on_string}">
-            <filter>pdf == True</filter>
+        <data name="plots" format="pdf" label="${tool.name} on ${on_string}: Plots">
+            <filter>out['pdf']</filter>
         </data>
-        <data name="binding_matrix" format="tabular" from_work_dir="bmatrix.tab" label="Differential binding sites on ${on_string}">
-            <filter>binding_affinity_matrix == True</filter>
+        <data name="binding_matrix" format="tabular" from_work_dir="bmatrix.tab" label="${tool.name} on ${on_string}: Binding matrix">
+            <filter>out['binding_matrix']</filter>
+        </data>
+        <data name="rdata" format="rdata" from_work_dir="DiffBind_analysis.RData" label="${tool.name} on ${on_string}: RData file">
+            <filter>out['rdata']</filter>
         </data>
     </outputs>
+
     <tests>
-        <test>
+        <test expect_num_outputs="4">
             <repeat name="samples">
                 <param name="sample_id" value="BT4741" />
                 <param name="tissue" value="BT474" />
@@ -142,9 +153,12 @@
                 <param name="peaks" ftype="bed" value="MCF7_ER_2.bed.gz" />
             </repeat>
             <param name="pdf" value="True" />
-            <param name="binding_affinity_matrix" value="True" />
+            <param name="binding_matrix" value="True" />
+            <param name="rdata" value="True" />
             <output name="outfile" value="out_diffbind.bed" />
+            <output name="plots" value="out_plots.pdf" compare="sim_size" />
             <output name="binding_matrix" value="out_binding.matrix" />
+            <output name="rdata" value="DiffBind_analysis.RData" compare="sim_size"/>
         </test>
     </tests>
     <help><![CDATA[
@@ -166,7 +180,7 @@
 in peak sets, and identifying statistically significantly differentially bound sites based on
 evidence of binding affinity (measured by differences in read densities). To this end it uses
 statistical routines developed in an RNA-Seq context (primarily the Bioconductor packages
-edgeR and DESeq2 ). Additionally, the package builds on Rgraphics routines to provide a
+edgeR and DESeq2). Additionally, the package builds on Rgraphics routines to provide a
 set of standardized plots to aid in binding analysis.
 
 The `DiffBind User Guide`_ includes a brief overview of the processing flow, followed by four sections of
@@ -182,6 +196,8 @@
 .. _`Bioconductor package`: https://bioconductor.org/packages/release/bioc/html/DiffBind.html
 .. _`DiffBind User Guide`: https://bioconductor.org/packages/release/bioc/vignettes/DiffBind/inst/doc/DiffBind.pdf
 
+-----
+
 **Inputs**
 
 DiffBind works primarily with peaksets, which are sets of genomic intervals representing
@@ -194,7 +210,7 @@
 
 **Sample Information**
 
-You have to specify your sample information in the tool form above.
+You have to specify your sample information in the tool form above, where Condition contains the groups you want to compare.
 
 Example:
 
@@ -214,26 +230,6 @@
     ZR752         ZR75       ER         Responsive    2            
     ============= ========== ========== ============= =============
 
-Or provide a sample sheet tabular file such as below.
-
-Example:
-
-    ======== ======  ====== ========== ========== ========= ====================  ========= ===================== ================= ==========
-    SampleID Tissue  Factor Condition  Treatment  Replicate bamReads              ControlID bamControl            Peaks             PeakCaller
-    ======== ======  ====== ========== ========== ========= ====================  ========= ===================== ================= ==========
-    BT4741   BT474   ER     Resistant  Full-Media  1        Chr18_BT474_ER_1.bam  BT474c    Chr18_BT474_input.bam BT474_ER_1.bed.gz bed
-    BT4742   BT474   ER     Resistant  Full-Media  2        Chr18_BT474_ER_2.bam  BT474c    Chr18_BT474_input.bam BT474_ER_2.bed.gz bed
-    MCF71    MCF7    ER     Responsive Full-Media  1        Chr18_MCF7_ER_1.bam   MCF7c     Chr18_MCF7_input.bam  MCF7_ER_1.bed.gz  bed
-    MCF72    MCF7    ER     Responsive Full-Media  2        Chr18_MCF7_ER_2.bam   MCF7c     Chr18_MCF7_input.bam  MCF7_ER_2.bed.gz  bed
-    MCF73    MCF7    ER     Responsive Full-Media  3        Chr18_MCF7_ER_3.bam   MCF7c     Chr18_MCF7_input.bam  MCF7_ER_3.bed.gz  bed
-    T47D1    T47D    ER     Responsive Full-Media  1        Chr18_T47D_ER_1.bam   T47Dc     Chr18_T47D_input.bam  T47D_ER_1.bed.gz  bed
-    T47D2    T47D    ER     Responsive Full-Media  2        Chr18_T47D_ER_2.bam   T47Dc     Chr18_T47D_input.bam  T47D_ER_2.bed.gz  bed
-    MCF7r1   MCF7    ER     Resistant  Full-Media  1        Chr18_TAMR_ER_1.bam   TAMRc     Chr18_TAMR_input.bam  TAMR_ER_1.bed.gz  bed
-    MCF7r2   MCF7    ER     Resistant  Full-Media  2        Chr18_TAMR_ER_2.bam   TAMRc     Chr18_TAMR_input.bam  TAMR_ER_2.bed.gz  bed
-    ZR751    ZR75    ER     Responsive Full-Media  1        Chr18_ZR75_ER_1.bam   ZR75c     Chr18_ZR75_input.bam  ZR75_ER_1.bed.gz  bed
-    ZR752    ZR75    ER     Responsive Full-Media  2        Chr18_ZR75_ER_2.bam   ZR75c     Chr18_ZR75_input.bam  ZR75_ER_2.bed.gz  bed
-    ======== ======  ====== ========== ========== ========= ====================  ========= ===================== ================= ==========
-
 
 **Peak files**
 
@@ -259,37 +255,49 @@
 * BAM file which contains the mapped sequencing reads can be associated with each peakset
 * Control BAM files represent control datasets and are optional, but have to be specified for all when used.
 
+-----
 
 **Outputs**
 
+This tool outputs
+
+    * differentially bound sites in BED, WIG or GFF format
+
+Optionally, under **Output Options** you can choose to output
+
+    * a correlation heatmap plot
+    * a binding affinity matrix
+    * an RData file
+
+**Differentially Bound Sites**
+
 As output format you can choose BED, GFF, WIG.
 
-Example:
-
-======== ====== =======+
-seqnames ranges strand             Conc Conc_Resistant
+Example - BED format:
 
-2452     chr18 [64490686, 64491186] * | 6.36 1.39
-1291     chr18 [34597713, 34598213] * | 5.33 0.22
-976      chr18 [26860997, 26861497] * | 7.3 3.13
-2338     chr18 [60892900, 60893400] * | 7.13 1.84
-2077     chr18 [55569087, 55569587] * | 5.52 1.89
+    =====  ======  ======  ===== ====  ====    ====    ====    =====   ========    ========
+    1      2       3       4     5     6       7       8       9       10          **11**
+    =====  ======  ======  ===== ====  ====    ====    ====    =====   ========    ========
+    chr18  394600  396513  1914    *   7.15    7.89    5.55    2.35    7.06e-24    9.84e-21
+    chr18  111567  112005  439     *   5.71    3.63    6.53    -2.89   1.27e-08    8.88e-06
+    chr18  346464  347342  879     *   5       3.24    5.77    -2.52   6.51e-06    0.00303
+    chr18  399014  400382  1369    *   7.62    8.05    7       1.04    1.04e-05    0.00364
+    chr18  371110  372102  993     *   4.63    5.36    3.07    2.3     8.1e-05     0.0226
+    =====  ======  ======  ===== ====  ====    ====    ====    =====   ========    ========
 
-Conc_Responsive Fold p-value FDR
-<numeric> <numeric> <numeric> <numeric>
-2452 7 -5.61 3.57e-10 1.02e-06
-1291 5.97 -5.75 1.1e-09 1.57e-06
-976 7.92 -4.79 1.1e-08 1.05e-05
-2338 7.77 -5.93 1.68e-08 1.17e-05
-2077 6.13 -4.23 2.36e-08 1.17e-05
+    Columns contain the following data:
 
-The value columns show the
-Conc mean read concentration over all the samples (the default calculation uses log2 normalized ChIP read counts with control read counts subtracted) 
-Conc_Resistant mean concentration over the first (Resistant) group 
-Conc_Responsive mean concentration over second (Responsive) group 
-Fold column shows the difference in mean concentrations between the two groups (Conc_Resistant - Conc_Responsive), with a positive value indicating increased binding affinity in the Resistant group and a negative value indicating increased binding affinity in the Responsive group.
-p-value confidence measure for identifying these sites as differentially bound 
-FDR a multiple testing corrected FDR p-value
+* **1st**: Chromosome name
+* **2nd**: Start position of site
+* **3rd**: End position of site
+* **4th**: Length of site
+* **5th**: Strand
+* **6th**: Mean read concentration over all the samples (the default calculation uses log2 normalized ChIP read counts with control read counts subtracted)
+* **7th**: Mean concentration over the first (e.g. Resistant) group
+* **8th**: Mean concentration over the second (e.g. Responsive) group
+* **9th**: Fold shows the difference in mean concentrations between the two groups (e.g. Resistant - Responsive), with a positive value indicating increased binding affinity in the first group and a negative value indicating increased binding affinity in the second group.
+* **10th**: P-value confidence measure for identifying these sites as differentially bound
+* **11th**: FDR, a p-value corrected for multiple testing
 
 
 **Binding Affinity Matrix**
@@ -315,7 +323,7 @@
     ZR752  ZR75   ER     Responsive Full-Media 2         counts 2845      0.22
     ====== ====== ====== ========== ========== ========= ====== ========= ====
 
-
+-----
 
 **More Information**
 
@@ -328,21 +336,18 @@
  #. Plotting and reporting
 
 
- * **Reading in peaksets**: 
+**Reading in peaksets**:
 
 The first step is to read in a set of peaksets and associated
-metadata. Peaksets are derived either from ChIP-Seq peak callers, such as MACS
-([1]), or using some other criterion (e.g. genomic windows, or all the promoter regions
-in a genome). The easiest way to read in peaksets is using a comma-separated value
-(csv) sample sheet with one line for each peakset. (Spreadsheets in Excel® format, with
-a .xls or .xlsx suffix, are also accepted.) A single experiment can have more than
+metadata. Peaksets are derived either from ChIP-Seq peak callers, such as **MACS2**, or using some other criterion (e.g. genomic windows, or all the promoter regions
+in a genome).  A single experiment can have more than
 one associated peakset; e.g. if multiple peak callers are used for comparison purposes
 each sample would have more than one line in the sample sheet. Once the peaksets
 are read in, a merging function finds all overlapping peaks and derives a single set of
 unique genomic intervals covering all the supplied peaks (a consensus peakset for the
 experiment).
 
- * **Occupancy analysis**: 
+**Occupancy analysis**:
 
 Peaksets, especially those generated by peak callers, provide
 an insight into the potential occupancy of the protein being ChIPed for at specific
@@ -356,7 +361,7 @@
 a consensus peakset, representing an overall set of candidate binding sites to be used
 in further analysis.
 
- * **Counting reads**: 
+**Counting reads**:
 
 Once a consensus peakset has been derived, DiffBind can use the
 supplied sequence read files to count how many reads overlap each interval for each
@@ -368,7 +373,7 @@
 data. The binding affinity matrix is used for QC plotting as well as for subsequent
 differential analysis.
 
- * **Differential binding affinity analysis**: 
+**Differential binding affinity analysis**:
 
 The core functionality of DiffBind is the
 differential binding affinity analysis, which enables binding sites to be identified that
@@ -378,7 +383,7 @@
 This will assign a p-value and FDR to each candidate binding site indicating confidence
 that they are differentially bound.
 
- * **Plotting and reporting**: 
+**Plotting and reporting**:
 
 Once one or more contrasts have been run, DiffBind provides
 a number of functions for reporting and plotting the results. MA plots give an
@@ -387,7 +392,9 @@
 of reads within differentially bound sites corresponding to whether they gain or
 lose affinity between the two sample groups. A reporting mechanism enables differentially
 bound sites to be extracted for further processing, such as annotation, motif, and
-pathway analyses.
+pathway analyses. *Note that currently only the correlation plot is implemented in this Galaxy tool.*
+
+-----
 
 **References**
 
Binary file test-data/DiffBind_analysis.RData has changed
Binary file test-data/out_plots.pdf has changed