diff hairpinTool.xml @ 13:7aaa9bc23e3c

Added support for paired end reads - Changed terminology to generalise to sgRNA CRISPR experiments. - Added option to include second factor for statistical power - Added option to filter out samples with low counts - Added support for paired end reads - Added option to highlight only positive or negative fold change in smear plot - Fixed bug that caused tool to stop if more than enough sample annotations were supplied
author shian_su <registertonysu@gmail.com>
date Tue, 14 Oct 2014 17:05:07 +1100
parents c0a76e30d61b
children 44130e484a97
line wrap: on
line diff
--- a/hairpinTool.xml	Wed Oct 01 16:00:43 2014 +1000
+++ b/hairpinTool.xml	Tue Oct 14 17:05:07 2014 +1100
@@ -1,12 +1,13 @@
-<tool id="shRNAseq" name="shRNAseq Tool" version="1.0.13">
+<tool id="shRNAseq" name="shRNAseq Tool" version="1.2.0">
   <description>
-    Analyse hairpin differential representation using edgeR
+    Analyse differential representation for shRNAseq and sgRNA based procedures
+    using edgeR package from Bioconductor.
   </description>
     
   <requirements>
-    <requirement type="R-module" version="3.6.2">edgeR</requirement>
-    <requirement type="R-module" version="3.20.7">limma</requirement>
-    <requirement type="package" version="3.0.3">R_3_0_3</requirement>
+    <requirement type="R-module" version="3.7.17">edgeR</requirement>
+    <requirement type="R-module" version="3.21.16">limma</requirement>
+    <requirement type="package" version="3.1.1">R_3_0_3</requirement>
   </requirements>
   
   <stdio>
@@ -14,43 +15,90 @@
   </stdio>
   
   <command interpreter="Rscript">
-  hairpinTool.R $inputOpt.inputType
+  ampliconTool.R $inputOpt.inputType
                 #if $inputOpt.inputType=="fastq":
+
                   #for $i, $fas in enumerate($inputOpt.fastq):
                     fastq::$fas.file
                   #end for
     
                   $inputOpt.hairpin
                   $inputOpt.samples
+
+                  #if $inputOpt.positions.posOption=="yes":
+                    $inputOpt.positions.barstart
+                    $inputOpt.positions.barend
+                    0
+                    0
+                    $inputOpt.positions.hpstart
+                    $inputOpt.positions.hpend
+                  #else:
+                    1
+                    5
+                    0
+                    0
+                    37
+                    57
+                  #end if
+                #elif $inputOpt.inputType=="pairedFastq":
+
+                  #for $i, $fas in enumerate($inputOpt.fastq):
+                    fastq::$fas.file
+                  #end for
+
+                  #for $i, $fas in enumerate($inputOpt.fastq):
+                    fastqRev::$fas.fileRev
+                  #end for
+    
+                  $inputOpt.hairpin
+                  $inputOpt.samples
                     
                   #if $inputOpt.positions.posOption=="yes":
                     $inputOpt.positions.barstart
                     $inputOpt.positions.barend
+                    $inputOpt.positions.barstartRev
+                    $inputOpt.positions.barendRev
                     $inputOpt.positions.hpstart
                     $inputOpt.positions.hpend
                   #else:
                     1
                     5
+                    0
+                    0
                     37
                     57
                   #end if
-                #else:
+
+                #elif $inputOpt.inputType=="counts":
                   $inputOpt.counts
                   $inputOpt.hairpin
                   $inputOpt.samples
-                  0 0 0
+                  0
+                  0
+                  0
+                  0
+                  0
                 #end if
-          
+                
+                #if $inputOpt.secondaryFactor.secFactorOpt=="yes":
+                  $inputOpt.secondaryFactor.secFactName
+                #else:
+                  "none"
+                #end if
+
                 #if $filterCPM.filtOption=="yes":
                   $filterCPM.cpmReq
                   $filterCPM.sampleReq
+                  $filterCPM.readReq
                 #else:
                   -Inf
                   -Inf
+                  -Inf
                 #end if
           
                 $fdr
                 $lfc
+                $direction
                 $workMode.mode
                 $outFile
                 $outFile.files_path
@@ -61,6 +109,7 @@
                 #elif $workMode.mode=="glm":
                   "$workMode.contrast"
                   $workMode.roast.roastOption
+
                   #if $workMode.roast.roastOption=="yes":
                     $workMode.roast.hairpinReq
                     $workMode.roast.select.selOption
@@ -70,20 +119,22 @@
                     0
                     0
                   #end if
+
                 #end if
   </command>
   
   <inputs>
     <conditional name="inputOpt">
+
       <param name="inputType" type="select" label="Input File Type">
         <option value="fastq">FastQ File</option>
+        <option value="pairedFastq">Paired FastQ File</option>
         <option value="counts">Table of Counts</option>
       </param>
-    
+
       <when value="fastq">
         <param name="hairpin" type="data" format="tabular" 
-               label="Hairpin Annotation"/>
-          
+               label="Target Annotation"/>
         
         <param name="samples" type="data" format="tabular" 
                label="Sample Annotation"/>
@@ -91,47 +142,171 @@
         <repeat name="fastq" title="FastQ Files">       
           <param name="file" type="data" format="fastq"/>
         </repeat>
+        
+        <conditional name="secondaryFactor">
           
+          <param name="secFactorOpt" type="select"
+                 label="Include Secondary Factor">
+
+            <option value="no" selected="True">No</option>
+
+            <option value="yes">Yes</option>
+
+          </param>
+
+          <when value="yes">
+
+            <param name="secFactName" type="text" label="Secondary Factor Name"
+                   size="80"/>
+
+          </when>
+
+          <when value="no">
+          </when>
+        </conditional>
+        
         <conditional name="positions">
           <param name="posOption" type="select" 
-                 label="Specify Barcode and Hairpin Locations?"
-                 help="Default Positions: Barcode: 1 to 5, Hairpin: 37 to 57.">
+                 label="Specify Sample Index and Target Sequence Locations?"
+                 help="Default Positions: Index: 1 to 5, Target: 37 to 57.">
             <option value="no" selected="True">No</option>
             <option value="yes">Yes</option>
           </param>
           
           <when value="yes">
             <param name="barstart" type="integer" value="1"
-                   label="Barcode Starting Position"/>
+                   label="Index Starting Position"/>
             <param name="barend" type="integer" value="5"
-                   label="Barcode Ending Position"/>
+                   label="Index Ending Position"/>
             
             <param name="hpstart" type="integer" value="37"
-                   label="Hairpin Starting Position"/>
+                   label="Target Starting Position"/>
                
             <param name="hpend" type="integer" value="57"
-                   label="Hairpin Ending Position"/>
+                   label="Target Ending Position"/>
           </when>
           
           <when value="no"/>
         </conditional>
       </when>
-      
+
+      <when value="pairedFastq">
+        <param name="hairpin" type="data" format="tabular" 
+               label="Target Sequence Annotation"/>
+        
+        <param name="samples" type="data" format="tabular" 
+               label="Sample Annotation"/>
+               
+        <repeat name="fastq" title="FastQ Files">       
+          <param name="file" type="data" format="fastq"/>
+          <param name="fileRev" type="data" format="fastq"/>
+        </repeat>
+          
+        <conditional name="secondaryFactor">
+
+          <param name="secFactorOpt" type="select"
+                 label="Include Secondary Factor">
+
+            <option value="no" selected="True">No</option>
+
+            <option value="yes">Yes</option>
+
+          </param>
+
+          <when value="yes">
+
+            <param name="secFactName" type="text" label="Secondary Factor Name"
+                   size="80"/>
+
+          </when>
+
+          <when value="no">
+          </when>
+        </conditional>
+
+        <conditional name="positions">
+
+          <param name="posOption" type="select" 
+                 label="Specify Sample Index and Target Sequence Locations?"
+                 help="Default Positions: Index: 1 to 5, Input required for 
+                       reverse end, Target: 37 to 57.">
+
+            <option value="no" selected="True">No</option>
+
+            <option value="yes">Yes</option>
+
+          </param>
+          
+          <when value="yes">
+            <param name="barstart" type="integer" value="1"
+                   label="Index Starting Position"/>
+
+            <param name="barend" type="integer" value="5"
+                   label="Index Ending Position"/>
+
+            <param name="barstartRev" type="integer" value="0"
+                   label="Reverse Index Starting Position"/>
+                   
+            <param name="barendRev" type="integer" value="0"
+                   label="Reverse Index Ending Position"/>
+           
+            <param name="hpstart" type="integer" value="37"
+                   label="Target Starting Position"/>
+               
+            <param name="hpend" type="integer" value="57"
+                   label="Target Ending Position"/>
+          </when>
+
+          <when value="no">
+          </when>
+
+        </conditional>
+
+      </when>
+
       <when value="counts">
+
         <param name="counts" type="data" format="tabular" label="Counts Table"/>
+
         <param name="hairpin" type="data" format="tabular" 
-               label="Hairpin Annotation"/>
+               label="Target Sequence Annotation"/>
+
         <param name="samples" type="data" format="tabular"
                label="Sample Annotation"/> 
+
+        <conditional name="secondaryFactor">
+
+          <param name="secFactorOpt" type="select"
+                 label="Include Secondary Factor">
+
+            <option value="no" selected="True">No</option>
+
+            <option value="yes">Yes</option>
+
+          </param>
+
+          <when value="yes">
+
+            <param name="secFactName" type="text" label="Secondary Factor Name"
+                   size="80"/>
+
+          </when>
+
+          <when value="no">
+          </when>
+
+        </conditional>
+
       </when>
+
     </conditional>
     
     <conditional name="filterCPM">
       <param name="filtOption" type="select" label="Filter Low CPM?"
-       help="Ignore hairpins with very low representation when performing 
-             analysis.">
+       help="Ignore target sequences with very low representation when 
+             performing analysis.">
         <option value="yes">Yes</option>
-       	<option value="no">No</option>
+        <option value="no">No</option>
       </param>
       
         <when value="yes">
@@ -142,6 +317,12 @@
                  label="Minimum Samples" 
                  help="Filter out all the genes that do not meet the minimum 
                        CPM in at least this many samples."/>
+
+          <param name="readReq" type="integer" value="1000" min="0"
+                 label="Minimum Reads" 
+                 help="Filter out all samples that do not have the minimum 
+                       number of reads."/>
+
         </when>
         
         <when value="no"/>
@@ -175,17 +356,18 @@
         <conditional name="roast">
           <param name="roastOption" type="select" 
                  label="Perform Gene Level Analysis?"
-                 help="Analyse LogFC tendencies for hairpins belonging
-                       to the same gene.">
+                 help="Analyse LogFC tendencies for target sequences belonging
+                       to the same gene. NOTE: this is a slow procedure that
+                       scales badly with the number of genes analysed.">
             <option value="no">No</option>
             <option value="yes">Yes</option>
           </param>
           
           <when value="yes">
             <param name="hairpinReq" type="integer" value="2" min="2"
-                   label="Minimum Hairpins"
-                   help="Only genes with at least this many hairpins will
-                         be analysed."/>
+                   label="Minimum Targets Found"
+                   help="Only genes with at least this many target sequences
+                         found will be analysed."/>
                          
             <conditional name="select">
               <param name="selOption" type="select"
@@ -223,25 +405,33 @@
            label="FDR Threshold"
            help="All observations below this threshold will be highlighted
                  in the smear plot."/>
+
     <param name="lfc" type="float" value="0" min="0" 
            label="Absolute LogFC Threshold"
            help="In additional to meeting the FDR requirement, the absolute 
                  value of the log-fold-change of the observation must be above
                  this threshold to be highlighted."/>
+
+    <param name="direction" type="select" label="Highlight Option"
+        help="Only highlight positive or negative fold changes in smear plot?">
+        <option value="all">Default</option>
+        <option value="up">Positive Only</option>
+        <option value="down">Negative Only</option>
+    </param>
   </inputs>
 
   <outputs>
-    <data format="html" name="outFile" label="shRNAseq Analysis"/>
+    <data format="html" name="outFile" label="TagSeq Analysis"/>
   </outputs>
   <help>
 .. class:: infomark
 
 **What it does**
 
-Given tables containing information about the hairpins and their associated
-barcodes, information about the samples and fastq file containing the hairpin
-reads. This tool will generate plots and tables for the analysis of differential
-representation.
+Given tables containing information about the hairpins/sgRNA and their 
+associated sample indices, information about the samples and a fastq file 
+containing the sequencing reads, this tool will generate plots and tables for 
+the analysis of differential representation.
 
 .. class:: infomark
 
@@ -257,14 +447,15 @@
 **Input File Type:**
 
 This tool is able to either generate counts from a raw FastQ file given the
-information regarding the samples and hairpins. Alternatively if a table of
-counts has already been generated it can also be used.
+information regarding the samples and hairpins/sgRNA. Alternatively if a table 
+of counts has already been generated it can also be used.
 
 **Counts Table (Counts Input):**
 
-A tab delimited text table of information regarding the counts of hairpins.
-Should have a column 'ID' to denote the hairpins that counts correspond to. Each
-additional column should have titles corresponding to the label for the sample.
+A tab delimited text table of information regarding the counts of 
+hairpins/sgRNA. Should have a column 'ID' to denote the hairpins/sgRNA that 
+counts correspond to. Each additional column should have titles corresponding to 
+the label for the sample.
 
 Example::
 
@@ -281,62 +472,80 @@
   Hairpin7 49501 49076 47611
   ...
   
-**Hairpin Annotation:**
+**Target Sequence Annotation:**
 
-A tab delimited text table of information regarding the hairpins. Should have
-columns 'ID', 'Sequences' and 'Gene' to uniquely identify the hairpin, align it
-with the reads to produce counts and identify which gene the hairpin acts on.
+A tab delimited text table of information regarding the targeted 
+hairpins/sgRNA sequence. Should have columns 'ID', 'Sequences' and 'Gene' to 
+uniquely identify the target, align it with the reads to produce counts and 
+identify which gene the target acts on.
 
 NOTE: the column names are case sensitive and should be input exactly as they
 are shown here.
 
 Example::
 
-  ID	Sequences	Gene
-  Control1	TCTCGCTTGGGCGAGAGTAAG	2
-  Control2	CCGCCTGAAGTCTCTGATTAA	2
-  Control3	AGGAATTATAATGCTTATCTA	2
-  Hairpin1	AAGGCAGAGACTGACCACCTA	4
-  Hairpin2	GAGCGACCTGGTGTTACTCTA	4
-  Hairpin3	ATGGTGTAAATAGAGCTGTTA	4
-  Hairpin4	CAGCTCATCTTCTGTGAAGAA	4
-  Hairpin5	CAGCTCTGTGGGTCAGAAGAA	4
-  Hairpin6	CCAGGCACAGATCTCAAGATA	4
-  Hairpin7	ATGACAAGAAAGACATCTCAA	7
+  ID  Sequences Gene
+  Control1  TCTCGCTTGGGCGAGAGTAAG 2
+  Control2  CCGCCTGAAGTCTCTGATTAA 2
+  Control3  AGGAATTATAATGCTTATCTA 2
+  Hairpin1  AAGGCAGAGACTGACCACCTA 4
+  Hairpin2  GAGCGACCTGGTGTTACTCTA 4
+  Hairpin3  ATGGTGTAAATAGAGCTGTTA 4
+  Hairpin4  CAGCTCATCTTCTGTGAAGAA 4
+  Hairpin5  CAGCTCTGTGGGTCAGAAGAA 4
+  Hairpin6  CCAGGCACAGATCTCAAGATA 4
+  Hairpin7  ATGACAAGAAAGACATCTCAA 7
   ...
   
 **Sample Annotation (FastQ Input):**
 
 A tab delimited text table of information regarding the samples. Should have
 columns 'ID', 'Sequences' and 'group' to uniquely identify each sample, identify
-the sample in the reads by its barcode sequence and correctly group replicates
-for analysis. Additional columns may inserted for annotation purposes and will
-not interfere with analysis as long as the necessary columns are present.
+the sample in the reads by its sample index sequence and correctly group 
+replicates for analysis. Additional columns may be inserted for annotation purposes 
+and will not interfere with analysis as long as the necessary columns are 
+present.
 
-NOTE: the column names are case sensitive and should be input exactly as they
-are shown here.
+NOTE: With the exception of other_group, column names are case sensitive and
+should be input exactly as they are shown here. The other_group column can be
+named by the user and specified in the "Include Secondary Factor" option of the
+tool.
 
 Example::
 
-  ID	Sequences	group	Replicate
-  3	GAAAG	Day 2	1
-  6	GAACC	Day 10	1
-  9	GAAGA	Day 5 GFP neg	1
-  16	GAATT	Day 5 GFP pos	1
-  18	GACAC	Day 2	2
-  21	GACCA	Day 10	2
-  28	GACGT	Day 5 GFP neg	2
-  31	GACTG	Day 5 GFP pos	2
-  33	GAGAA	Day 2	3
-  40	GAGCT	Day 10	3
+  ID  Sequences group other_group Replicate
+  3 GAAAG Day 2 male 1
+  6 GAACC Day 10  female  1
+  9 GAAGA Day 5 GFP neg male 1
+  16  GAATT Day 5 GFP pos male 1
+  18  GACAC Day 2 female 2
+  21  GACCA Day 10  male  2
+  28  GACGT Day 5 GFP neg male 2
+  31  GACTG Day 5 GFP pos female 2
+  33  GAGAA Day 2 male 3
+  40  GAGCT Day 10  female  3
   ...
   
-**Specify Barcode and Hairpin Locations (FastQ Input):**
+**Include Secondary Factor**
+If there are two factors involved in the experiment (e.g. Age and Gender) then
+the secondary factor should be included to improve the statistical analysis.
+The secondary factor should be specified as a column in the sample annotation
+file and the corresponding column name should be input exactly as it is into 
+the provided field in the tool.
+
+NOTE: Currently the secondary factor is used only to improve statistical
+analysis; comparisons can only be made in the primary factor specified as 
+"group" in the sample annotation.
+
+**Specify Sample Index and Target Sequence Locations (FastQ Input):**
 
 It is assumed that in the sequencing reads that the first 5 bases are the
-barcodes and that bases 37-57 are the hairpins. If this is not the case then the
-values of the positions can be changed, however it still requires the barcodes
-and hairpins to be in a consistent location an in a continuous sequence.
+sample index sequence and that bases 37-57 are the hairpins/sgRNA. If this is 
+not the case then the values of the positions can be changed, however it still
+requires the sample indices and hairpins/sgRNA to be in a consistent location and
+in a continuous sequence.
+
+NOTE: position values start at 1 for the first base.
 
 **Filter Low CPM?:**
 
@@ -346,21 +555,21 @@
 
 **Analysis Type:**
 
- * **Classic Exact Test:** This allows two experimental groups to be compared and
-   p-values for differential representation derivec for each hairpin. Simple and
-   fast for straightforward comparisons. In this option you will have the option of
-   "*Compare* x *To* y" which implicitly subtracts the data from y from that of x
-   to produce the comparison.
+ * **Classic Exact Test:** This allows two experimental groups to be compared 
+   and p-values for differential representation derived for each target 
+   sequence. Simple and fast for straightforward comparisons. In this option you
+   will have the option of "*Compare* x *To* y" which implicitly subtracts the 
+   data from y from that of x to produce the comparison.
 
- * **Generalised Linear Model:** This allow for complex contrasts to be specified
-   and also gene level analysis to be performed. If this option is chosen then
-   contrasts must be explicitly stated in equations and multiple contrasts can be
-   made. In addition there will be the option to analyse hairpins on a per-gene
-   basis to see if hairpins belonging to a particular gene have any overall
-   tendencies for the direction of their log-fold-change.
+ * **Generalised Linear Model:** This allows for complex contrasts to be specified 
+   and also gene level analysis to be performed. If this option is chosen then 
+   contrasts must be explicitly stated in equations and multiple contrasts can 
+   be made. In addition there will be the option to analyse hairpins/sgRNA on a 
+   per-gene basis to see if hairpins/sgRNA belonging to a particular gene have 
+   any overall tendencies for the direction of their log-fold-change.
 
 **FDR Threshold:**
-The smear plot in the output will have hairpins highlighted to signify
+The smear plot in the output will have hairpins/sgRNA highlighted to signify
 significant differential representation. The significance is determined by
 contorlling the false discovery rate, only those with a FDR lower than the
 threshold will be highlighted in the plot.
@@ -379,10 +588,10 @@
 using.  The methodology articles are listed in Section 2.1 of the limma 
 User's Guide.
 
-	* Smyth, GK (2005). Limma: linear models for microarray data. In: 
-	  'Bioinformatics and Computational Biology Solutions using R and 
-	  Bioconductor'. R. Gentleman, V. Carey, S. Dudoit, R. Irizarry, 
-	  W. Huber (eds), Springer, New York, pages 397-420.
+  * Smyth, GK (2005). Limma: linear models for microarray data. In: 
+    'Bioinformatics and Computational Biology Solutions using R and 
+    Bioconductor'. R. Gentleman, V. Carey, S. Dudoit, R. Irizarry, 
+    W. Huber (eds), Springer, New York, pages 397-420.
 
 .. class:: infomark
 
@@ -392,25 +601,24 @@
 the various original statistical methods implemented in edgeR.  See 
 Section 1.2 in the User's Guide for more detail.
 
-	* Robinson MD, McCarthy DJ and Smyth GK (2010). edgeR: a Bioconductor 
-	  package for differential expression analysis of digital gene expression 
-	  data. Bioinformatics 26, 139-140
-	  
-	* Robinson MD and Smyth GK (2007). Moderated statistical tests for assessing 
-	  differences in tag abundance. Bioinformatics 23, 2881-2887
-	  
-	* Robinson MD and Smyth GK (2008). Small-sample estimation of negative 
-	  binomial dispersion, with applications to SAGE data.
-	  Biostatistics, 9, 321-332
-	  
-	* McCarthy DJ, Chen Y and Smyth GK (2012). Differential expression analysis 
-	  of multifactor RNA-Seq experiments with respect to biological variation. 
-	  Nucleic Acids Research 40, 4288-4297
-	  
+  * Robinson MD, McCarthy DJ and Smyth GK (2010). edgeR: a Bioconductor 
+    package for differential expression analysis of digital gene expression 
+    data. Bioinformatics 26, 139-140
+    
+  * Robinson MD and Smyth GK (2007). Moderated statistical tests for assessing 
+    differences in tag abundance. Bioinformatics 23, 2881-2887
+    
+  * Robinson MD and Smyth GK (2008). Small-sample estimation of negative 
+    binomial dispersion, with applications to SAGE data.
+    Biostatistics, 9, 321-332
+    
+  * McCarthy DJ, Chen Y and Smyth GK (2012). Differential expression analysis 
+    of multifactor RNA-Seq experiments with respect to biological variation. 
+    Nucleic Acids Research 40, 4288-4297
+    
 Report problems to: su.s@wehi.edu.au
 
 .. _edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html
 .. _limma: http://www.bioconductor.org/packages/release/bioc/html/limma.html
   </help>
 </tool>
-