diff limma_voom.xml @ 0:bdebdea5f6a7 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/limma_voom commit 2f34a215c35f08c3666f314a87d235437baa1d21
author iuc
date Mon, 12 Jun 2017 07:41:02 -0400
parents
children 76d01fe0ec36
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/limma_voom.xml	Mon Jun 12 07:41:02 2017 -0400
@@ -0,0 +1,388 @@
+<tool id="limma_voom" name="limma-voom" version="1.1.1">
+    <description>
+        Differential expression with optional sample weights
+    </description>
+  
+    <requirements>
+        <requirement type="package" version="3.16.5">bioconductor-edger</requirement>
+        <requirement type="package" version="3.30.13">bioconductor-limma</requirement>
+        <requirement type="package" version="1.4.29">r-statmod</requirement>
+        <requirement type="package" version="0.4.1">r-scales</requirement>
+    </requirements>
+
+    <version_command>
+    <![CDATA[
+        echo $(R --version | grep version | grep -v GNU)", limma version" $(R --vanilla --slave -e "library(limma); cat(sessionInfo()\$otherPkgs\$limma\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", edgeR version" $(R --vanilla --slave -e "library(edgeR); cat(sessionInfo()\$otherPkgs\$edgeR\$Version)" 2> /dev/null | grep -v -i "WARNING: ")
+    ]]>
+    </version_command>
+  
+    <command detect_errors="exit_code">
+    <![CDATA[
+        Rscript '$__tool_directory__/limma_voom.R'
+            '$counts'
+            
+            #if $anno.annoOpt=='yes':
+              '$geneanno'
+            #else:
+              None
+            #end if
+            
+            '$outReport'
+            '$outReport.files_path'
+            $rdaOption
+            $normalisationOption
+            $weightOption
+            '$contrast'
+
+            #if $filterCPM.filterLowCPM=='yes':
+              '$filterCPM.cpmReq'
+              '$filterCPM.sampleReq'
+            #else:
+              0
+              0
+            #end if
+            
+            #if $testOpt.wantOpt=='yes':
+              '$testOpt.pAdjust'
+              '$testOpt.pVal'
+              '$testOpt.lfc'
+            #else:
+              "BH"
+              0.05
+              0
+            #end if
+            
+            '$factName::$factLevel'
+
+            &&
+            mkdir ./output_dir
+
+            &&
+            mv '$outReport.files_path'/*.tsv output_dir/
+            
+    ]]>             
+    </command>
+   
+    <inputs>
+        <param name="counts" type="data" format="tabular" label="Counts Data"/>
+        
+        <conditional name="anno">
+            <param name="annoOpt" type="select"
+                    label="Use Gene Annotations?" 
+                    help="If an annotation file is provided, annotations will be added to the table of differential expression results to provide descriptions for each gene.">
+                <option value="no">No</option>
+                <option value="yes">Yes</option>
+            </param>
+            <when value="yes">
+                <param name="geneanno" type="data" format="tabular" label="Gene Annotations"/>
+            </when>
+            <when value="no" />
+        </conditional>
+
+      <!--*Code commented until solution for multiple factors is found*
+      <repeat name="factors" title="Factors" min="1" max="5" default="1">
+        <param name="factName" type="text" label="Factor Name (No spaces)"
+               help="Eg. Genotype"/>
+          <param name="factLevel" type="text" size="100"
+                 label="Factor Levels (No spaces)"
+                 help="Eg. WT,WT,Mut,Mut,WT"/>
+      </repeat>
+      -->
+      
+        <param name="factName" type="text" label="Factor Name" help="Eg. Genotype."/>
+        <param name="factLevel" type="text" label="Factor Values"
+                help="Eg. WT,WT,WT,Mut,Mut,Mut
+                NOTE: Please ensure that the same levels are typed identically with cases matching."/>     
+        <param name="contrast" type="text" label="Contrasts of interest" help="Eg. Mut-WT,KD-Control"/>
+      
+        <conditional name="filterCPM">
+            <param name="filterLowCPM" type="select" label="Filter Low CPM?"
+                help="Treat genes with very low expression as unexpressed and filter out to speed up computation.">
+                <option value="yes" selected="True">Yes</option>
+                <option value="no">No</option>
+            </param>
+            <when value="yes">
+                <param name="cpmReq" type="float" value="0.5" min="0" label="Minimum CPM"/>
+                       
+                <param name="sampleReq" type="integer" value="1" min="0" label="Minimum Samples"
+                    help="Filter out all the genes that do not meet the minimum CPM in at least this many samples."/>
+            </when>          
+            <when value="no"/>         
+        </conditional>
+
+        <param name="weightOption" type="boolean" truevalue="yes" falsevalue="no" checked="false" label="Apply sample weights?"
+            help="Apply weights if outliers are present."> 
+        </param>
+      
+        <param name="normalisationOption" type="select" label="Normalisation Method">
+            <option value="TMM">TMM</option>
+            <option value="RLE">RLE</option>
+            <option value="upperquartile">Upperquartile</option>
+            <option value="none">None (Don't normalise)</option>
+        </param>
+
+        <param name="rdaOption" type="boolean" truevalue="yes" falsevalue="no" checked="false" 
+            label="Output RData?"
+            help="Output all the data used by R to construct the plots and tables, can be loaded into R. A link to the RData file will be provided in the HTML report.">      
+        </param>
+                    
+        <conditional name="testOpt">
+            <param name="wantOpt" type="select" label="Use Advanced Testing Options?"
+                help="Enable choices for p-value adjustment method, p-value threshold and log2-fold-change threshold.">
+                <option value="no" selected="True">No</option>
+                <option value="yes">Yes</option>
+            </param>         
+            <when value="yes">
+                <param name="pAdjust" type="select" label="P-Value Adjustment Method.">
+                    <option value="BH">Benjamini and Hochberg (1995)</option>
+                    <option value="BY">Benjamini and Yekutieli (2001)</option>
+                    <option value="holm">Holm (1979)</option>
+                    <option value="none">None</option>
+                </param>             
+                <param name="pVal" type="float" value="0.05" min="0" max="1"
+                    label="Adjusted Threshold"
+                    help="Genes below this threshold are considered significant and highlighted in the MA plot. If either BH(1995) or BY(2001) were selected then this value is a false-discovery-rate control. If Holm(1979) was selected then this is an adjusted p-value for family-wise error rate."/>
+                <param name="lfc" type="float" value="0" min="0"
+                    label="Minimum log2-fold-change Required"
+                    help="Genes above this threshold and below the p-value threshold are considered significant and highlighted in the MA plot."/>
+            </when> 
+            <when value="no"/>       
+        </conditional>
+
+    </inputs>
+  
+    <outputs>
+        <data format="html" name="outReport" label="${tool.name} on ${on_string}: Report" />
+        <collection name="voom_results" type="list" label="${tool.name} on ${on_string}: DE genes">
+            <discover_datasets pattern="(?P&lt;name&gt;.+)\.tsv$" format="tabular" directory="output_dir" visible="false" />
+        </collection>
+    </outputs>
+
+    <tests>
+        <test>
+            <param name="counts" value="matrix.txt" />
+            <param name="factName" value="Genotype" />
+            <param name="factLevel" value="WT,WT,WT,Mut,Mut,Mut" />
+            <param name="contrast" value="Mut-WT,WT-Mut" />
+            <param name="normalisationOption" value="TMM" />
+            <output_collection name="voom_results" count="2">
+                <element name="limma-voom_Mut-WT" ftype="tabular" file="limma-voom_Mut-WT.tsv" />
+                <element name="limma-voom_WT-Mut" ftype="tabular" file="limma-voom_WT-Mut.tsv" />
+            </output_collection>    
+            <output name="outReport" >
+                <assert_contents>
+                    <has_text text="Limma-voom Analysis Output" />
+                    <not_has_text text="RData" />
+                </assert_contents>
+            </output>          
+        </test>
+        <test>
+            <param name="annoOpt" value="yes" />
+            <param name="geneanno" value="anno.txt" />
+            <param name="counts" value="matrix.txt" />
+            <param name="factName" value="Genotype" />
+            <param name="factLevel" value="WT,WT,WT,Mut,Mut,Mut" />
+            <param name="contrast" value="Mut-WT" />
+            <param name="normalisationOption" value="TMM" />
+            <output_collection name="voom_results" >
+                <element name="limma-voom_Mut-WT" ftype="tabular" file="limma-voom_Mut-WTanno.tsv" />
+            </output_collection>  
+        </test>
+        <test>
+            <param name="rdaOption" value="yes" />
+            <param name="counts" value="matrix.txt" />            
+            <param name="factName" value="Genotype" />
+            <param name="factLevel" value="WT,WT,WT,Mut,Mut,Mut" />
+            <param name="contrast" value="Mut-WT" />
+            <param name="normalisationOption" value="TMM" />
+            <output name="outReport" >
+                <assert_contents>
+                    <has_text text="RData" />
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+  
+    <help>
+<![CDATA[
+.. class:: infomark
+
+**What it does**
+
+Given a matrix of counts (e.g. from featureCounts) and optional information about the genes, this tool
+produces plots and tables useful in the analysis of differential gene 
+expression.
+
+-----
+
+**Inputs**
+
+**Counts Data:**
+A matrix of counts, with rows corresponding to genes
+and columns corresponding to counts for the samples.
+Values must be tab separated, with the first row containing the sample/column
+labels and the first column containing the row/gene labels.
+
+Example:
+
+    ========== ======= ======= ======= ======== ======== ========
+    **GeneID** **WT1** **WT2** **WT3** **Mut1** **Mut2** **Mut3**
+    ---------- ------- ------- ------- -------- -------- --------
+    11287      1699    1528    1601    1463     1441     1495                
+    11298      1905    1744    1834    1345     1291     1346                
+    11302      6       8       7       5        6        5                   
+    11303      2099    1974    2100    1574     1519     1654                
+    11304      356     312     337     361      397      346                 
+    11305      2528    2438    2493    1762     1942     2027                
+    ========== ======= ======= ======= ======== ======== ========
+
+**Gene Annotations:**
+Optional input for gene annotations, this can contain more
+information about the genes than just an ID number. The annotations will
+be avaiable in the differential expression results table.
+
+Example:
+
+    ==========  ==========  ===================================================
+    **GeneID**  **Symbol**  **GeneName**
+    ----------  ----------  ---------------------------------------------------
+    1287        Pzp         pregnancy zone protein
+    1298        Aanat       arylalkylamine N-acetyltransferase
+    1302        Aatk        apoptosis-associated tyrosine kinase
+    1303        Abca1       ATP-binding cassette, sub-family A (ABC1), member 1
+    1304        Abca4       ATP-binding cassette, sub-family A (ABC1), member 4
+    1305        Abca2       ATP-binding cassette, sub-family A (ABC1), member 2
+    ==========  ==========  ===================================================
+
+**Factor Name:**
+The name of the factor being investigated. This tool currently assumes
+that only one factor is of interest.
+
+**Factor Levels:**
+The levels of the factor of interest, this must be entered in the same
+order as the samples to which the levels correspond as listed in the
+columns of the counts matrix. 
+
+The values should be seperated by commas, and spaces must not be used.
+
+**Contrasts of Interest:**
+The contrasts you wish to make between levels. 
+
+A common contrast would be a simple difference between two levels: "Mut-WT" 
+represents the difference between the mutant and wild type genotypes.
+
+The values should be seperated by commas and spaces must not be used.
+
+**Filter Low CPM:**
+Option to ignore the genes that do not show significant levels of
+expression, this filtering is dependent on two criteria:
+
+    * **Minimum CPM:** This is the counts per million that a gene must have in at
+      least some specified number of samples.
+
+    * **Minumum Samples:** This is the number of samples in which the CPM
+      requirement must be met in order for that gene to be acknowledged.
+
+Only genes that exhibit a CPM greater than the required amount in at least the
+number of samples specified will be used for analysis. Care should be taken to
+ensure that the sample requirement is appropriate. In the case of an experiment
+with two experimental groups each with two members, if there is a change from
+insignificant cpm to significant cpm but the sample requirement is set to 3,
+then this will cause that gene to fail the criteria. When in doubt simply do not
+filter.
+
+
+**Normalisation Method:**
+Option for using different methods to rescale the raw library
+size. For more information, see calcNormFactor section in the edgeR_ user's
+manual.
+
+**Apply Sample Weights:**
+Option to downweight outlier samples such that their information is still
+used in the statistical analysis but their impact is reduced. Use this
+whenever significant outliers are present. The MDS plotting tool in this package
+is useful for identifying outliers. For more information on this option see Liu et al. (2015).
+
+**Use Advanced Testing Options?:**
+By default error rate for multiple testing is controlled using Benjamini and
+Hochberg's false discovery rate control at a threshold value of 0.05. However
+there are options to change this to custom values.
+
+    * **P-Value Adjustment Method:**
+      Change the multiple testing control method, the options are BH(1995) and 
+      BY(2001) which are both false discovery rate controls. There is also
+      Holm(1979) which is a method for family-wise error rate control.
+    
+    * **Adjusted Threshold:**
+      Set the threshold for the resulting value of the multiple testing control
+      method. Only observations whose statistic falls below this value is
+      considered significant, thus highlighted in the MA plot.
+      
+    * **Minimum log2-fold-change Required:**
+      In addition to meeting the requirement for the adjusted statistic for
+      multiple testing, the observation must have an absolute log2-fold-change
+      greater than this threshold to be considered significant, thus highlighted 
+      in the MA plot.
+
+-----
+
+**Citations:**
+
+.. class:: infomark
+
+limma
+
+Please cite the paper below for the limma software itself.  Please also try
+to cite the appropriate methodology articles that describe the statistical
+methods implemented in limma, depending on which limma functions you are
+using.  The methodology articles are listed in Section 2.1 of the limma 
+User's Guide.
+
+    * Smyth GK (2005). Limma: linear models for microarray data. In: 
+      'Bioinformatics and Computational Biology Solutions using R and 
+      Bioconductor'. R. Gentleman, V. Carey, S. Dudoit, R. Irizarry, 
+      W. Huber (eds), Springer, New York, pages 397-420.
+        
+    * Law CW, Chen Y, Shi W, and Smyth GK (2014). Voom:
+      precision weights unlock linear model analysis tools for
+      RNA-seq read counts. Genome Biology 15, R29.
+
+    * Liu R, Holik AZ, Su S, Jansz N, Chen K, Leong HS, Blewitt ME, Asselin-Labat ML, Smyth GK, Ritchie ME (2015). Why weight? Modelling sample and observational level variability improves power in RNA-seq analyses. Nucleic Acids Research, 43(15), e97.
+
+    * Ritchie, M. E., Diyagama, D., Neilson, J., van Laar, R., Dobrovic, 
+      A., Holloway, A., and Smyth, G. K. (2006). Empirical array quality weights
+      for microarray data. BMC Bioinformatics 7, Article 261.
+
+.. class:: infomark
+
+edgeR
+
+Please cite the first paper for the software itself and the other papers for
+the various original statistical methods implemented in edgeR.  See 
+Section 1.2 in the User's Guide for more detail.
+
+    * Robinson MD, McCarthy DJ and Smyth GK (2010). edgeR: a Bioconductor 
+      package for differential expression analysis of digital gene expression 
+      data. Bioinformatics 26, 139-140
+      
+    * Robinson MD and Smyth GK (2007). Moderated statistical tests for assessing 
+      differences in tag abundance. Bioinformatics 23, 2881-2887
+      
+    * Robinson MD and Smyth GK (2008). Small-sample estimation of negative 
+      binomial dispersion, with applications to SAGE data.
+      Biostatistics, 9, 321-332
+      
+    * McCarthy DJ, Chen Y and Smyth GK (2012). Differential expression analysis 
+      of multifactor RNA-Seq experiments with respect to biological variation. 
+      Nucleic Acids Research 40, 4288-4297
+      
+Please report problems or suggestions to: su.s@wehi.edu.au
+
+.. _edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html
+.. _limma: http://www.bioconductor.org/packages/release/bioc/html/limma.html
+]]>
+    </help>
+    <citations>
+        <citation type="doi">10.1093/nar/gkv412</citation>
+    </citations>
+</tool>