diff cuffdiff_wrapper.xml @ 2:fdf01b3c1841

Update to new cuffdiff wrapper, add cuffdb_info.txt to cummerbund html output
author Jim Johnson <jj@umn.edu>
date Fri, 08 Nov 2013 14:54:01 -0600
parents da7241f92ecf
children f109453ecfa2
line wrap: on
line diff
--- a/cuffdiff_wrapper.xml	Mon Feb 04 21:23:20 2013 -0600
+++ b/cuffdiff_wrapper.xml	Fri Nov 08 14:54:01 2013 -0600
@@ -1,78 +1,30 @@
-<tool id="cuffdiff_cummerbund" name="Cuffdiff" version="0.0.6">
-    <!-- Wrapper supports Cuffdiff versions v1.3.0-v2.0 -->
+<tool id="cuffdiff_cummerbund" name="Cuffdiff for cummeRbund" version="0.0.7">
+    <!-- Wrapper supports Cuffdiff versions 2.1.0-2.1.1 -->
     <description>find significant changes in transcript expression, splicing, and promoter use</description>
     <requirements>
-        <requirement type="package">cufflinks</requirement>
+        <requirement type="package" version="2.1.1">cufflinks</requirement>
     </requirements>
-    <command interpreter="python">
-        #set sel_outputs = $output_sel.__str__.split(',')
-        cuffdiff_wrapper.py
+    <version_command>cuffdiff 2>&amp;1 | head -n 1</version_command>
+    <command>
+        cuffdiff
+            --no-update-check $time_series
             --FDR=$fdr
             --num-threads="4"
             --min-alignment-count=$min_alignment_count
-
-            #if 'cuffdata' in $sel_outputs or not $output_sel:
-                --cuffdatadir=$cuffdata.extra_files_path
-            #end if
-            #if 'cummeRbund_db' in $sel_outputs:
-                --cummeRbund_db=$cummeRbund_db
-            #end if
+            --library-norm-method=$library_norm_method
+            --dispersion-method=$dispersion_method
 
-            #if 'isoforms_fpkm_tracking' in $sel_outputs:
-                --isoforms_fpkm_tracking_output=$isoforms_fpkm_tracking
-            #end if
-            #if 'genes_fpkm_tracking' in $sel_outputs:
-                --genes_fpkm_tracking_output=$genes_fpkm_tracking
-            #end if
-            #if 'cds_fpkm_tracking' in $sel_outputs:
-                --cds_fpkm_tracking_output=$cds_fpkm_tracking
-            #end if
-            #if 'tss_groups_fpkm_tracking' in $sel_outputs:
-                --tss_groups_fpkm_tracking_output=$tss_groups_fpkm_tracking
-            #end if
-            #if 'isoforms_exp_diff' in $sel_outputs:
-                --isoforms_exp_output=$isoforms_exp_diff
-            #end if
-            #if 'genes_exp_diff' in $sel_outputs:
-                --genes_exp_output=$genes_exp_diff
-            #end if
-            #if 'tss_groups_exp_diff' in $sel_outputs:
-                --tss_groups_exp_output=$tss_groups_exp_diff
-            #end if
-            #if 'cds_exp_fpkm_tracking' in $sel_outputs:
-                --cds_exp_fpkm_tracking_output=$cds_exp_fpkm_tracking
-            #end if
-            #if 'splicing_diff' in $sel_outputs:
-                --splicing_diff_output=$splicing_diff
-            #end if
-            #if 'cds_diff' in $sel_outputs:
-                --cds_diff_output=$cds_diff
-            #end if
-            #if 'promoters_diff' in $sel_outputs:
-                --promoters_diff_output=$promoters_diff
-            #end if
-            #if 'cds_read_group_tracking' in $sel_outputs:
-                --cds_read_group_tracking=$cds_read_group_tracking
-            #end if
-            #if 'tss_groups_read_group_tracking' in $sel_outputs:
-                --tss_groups_read_group_tracking=$tss_groups_read_group_tracking
-            #end if
-            #if 'genes_read_group_tracking' in $sel_outputs:
-                --genes_read_group_tracking=$genes_read_group_tracking
-            #end if
-            #if 'isoforms_read_group_tracking' in $sel_outputs:
-                --isoforms_read_group_tracking=$isoforms_read_group_tracking
-            #end if
-            
             ## Set advanced data parameters?
             #if $additional.sAdditional == "Yes":
-                -m $additional.frag_mean_len
-                -s $additional.frag_len_std_dev
-            #end if
-
-            ## Normalization?
-            #if str($do_normalization) == "Yes":
-            -N
+                #if $additional.frag_mean_len:
+                    -m $additional.frag_mean_len
+                #end if
+                #if $additional.frag_len_std_dev:
+                    -s $additional.frag_len_std_dev
+                #end if
+                #if $additional.max_bundle_frags:
+                    --max-bundle-frags="$additional.max_bundle_frags"
+                #end if
             #end if
 
             ## Multi-read correct?
@@ -82,67 +34,77 @@
 
             ## Bias correction?
             #if $bias_correction.do_bias_correction == "Yes":
-	        -b
+               -b
                 #if $bias_correction.seq_source.index_source == "history":
-                    --ref_file=$bias_correction.seq_source.ref_file
+                    ## Custom genome from history.
+                    $bias_correction.seq_source.ref_file
                 #else:
-                    --ref_file="None"
+                    ## Built-in genome.
+                    ${__get_data_table_entry__('sam_fa_indexes', 'value', $gtf_input.dbkey, 'path')}
                 #end if
-                --dbkey=${gtf_input.metadata.dbkey}
-                --index_dir=${GALAXY_DATA_INDEX_DIR}
-            #end if
-                
-            ## Inputs.
-            --inputA=$gtf_input
-            #if $group_analysis.do_groups == "No":
-                --input1=$aligned_reads1
-                --input2=$aligned_reads2
-            #else:
-                ## Replicates.
-                --labels
-                #for $group in $group_analysis.groups
-                    ${group.group}
-                #end for
-                --files
-                #for $group in $group_analysis.groups
-                    #for $file in $group.files:
-                        ${file.file}
-                    #end for
-                    ,
-                #end for
             #end if
 
+            #set labels = '\'' + '\',\''.join( [ str( $condition.name ) for $condition in $conditions ] ) + '\''
+            --labels $labels
+
+            ## Inputs.
+            $gtf_input
+            #for $condition in $conditions:
+                #set samples = ','.join( [ str( $sample.sample ) for $sample in $condition.samples ] )
+                $samples
+            #end for
+ 
+            ## If build cummerbund db
+            #if $build_cummerbund_db:
+                &amp;&amp; echo 'library(cummeRbund)' > cuffData.r 
+                #if $bias_correction.do_bias_correction == "Yes":
+                    #if $bias_correction.seq_source.index_source == "history":
+                        ## Custom genome from history.
+                        &amp;&amp; echo 'cuff&lt;-readCufflinks( dbFile = "cuffdata.db", gtfFile = "$gtf_input", genome = "$bias_correction.seq_source.ref_file", rebuild = T)' >> cuffData.r
+                    #else:
+                        ## Built-in genome.
+                        ${__get_data_table_entry__('sam_fa_indexes', 'value', $gtf_input.dbkey, 'path')}
+                        &amp;&amp; echo 'cuff&lt;-readCufflinks( dbFile = "cuffdata.db", gtfFile = "$gtf_input", genome = "${__get_data_table_entry__('sam_fa_indexes', 'value', $gtf_input.dbkey, 'path')}", rebuild = T)' >> cuffData.r
+                    #end if
+                #else 
+                    &amp;&amp; echo 'cuff&lt;-readCufflinks( dbFile = "cuffdata.db", rebuild = T)' >> cuffData.r
+                #end if
+                &amp;&amp; Rscript --vanilla cuffData.r
+                &amp;&amp; cp cuffdata.db $cummerbund_db
+            #end if
     </command>
     <inputs>
         <param format="gtf,gff3" name="gtf_input" type="data" label="Transcripts" help="A transcript GFF3 or GTF file produced by cufflinks, cuffcompare, or other source."/>
-        <conditional name="group_analysis"> 
-            <param name="do_groups" type="select" label="Perform replicate analysis" help="Perform cuffdiff with replicates in each group.">
-                <option value="No">No</option>
-                <option value="Yes">Yes</option>
-            </param>
-            <when value="Yes">
-                <repeat name="groups" title="Group">
-                    <param name="group" title="Group name" type="text" label="Group name (no spaces or commas)"/>
-                    <repeat name="files" title="Replicate">
-                        <param name="file" label="Add file" type="data" format="sam,bam"/>
-                    </repeat>
-                </repeat>
-            </when>
-            <when value="No">
-                <param format="sam,bam" name="aligned_reads1" type="data" label="SAM or BAM file of aligned RNA-Seq reads" help=""/>
-                <param format="sam,bam" name="aligned_reads2" type="data" label="SAM or BAM file of aligned RNA-Seq reads" help=""/>
-            </when>
-        </conditional>
+
+        <repeat name="conditions" title="Condition" min="2">
+            <param name="name" title="Condition name" type="text" label="Name"/>
+            <repeat name="samples" title="Replicate" min="1">
+                <param name="sample" label="Add replicate" type="data" format="sam,bam"/>
+            </repeat>
+        </repeat>
+        <param name="time_series" type="boolean" checked="false" truevalue="--time-series" falsevalue="" optional="true" label="treat samples as a time-series">
+            <help>
+            Instructs Cuffdiff to analyze the provided samples as a time series, rather than testing for differences between all pairs of samples. 
+            Samples should be provided in increasing time order at the command line (e.g first time point SAM, second timepoint SAM, etc.)
+            </help>
+        </param>
+
+        <param name="library_norm_method" type="select" label="Library normalization method">
+            <option value="geometric" selected="True">geometric</option>
+            <option value="classic-fpkm">classic-fpkm</option>
+            <option value="quartile">quartile</option>
+        </param>
+
+        <param name="dispersion_method" type="select" label="Dispersion estimation method" help="If using only one sample per condition, you must use 'blind.'">
+            <option value="pooled" selected="True">pooled</option>
+            <option value="per-condition">per-condition</option>
+            <option value="blind">blind</option>
+        </param>
 
         <param name="fdr" type="float" value="0.05" label="False Discovery Rate" help="The allowed false discovery rate."/>
 
         <param name="min_alignment_count" type="integer" value="10" label="Min Alignment Count" help="The minimum number of alignments in a locus for needed to conduct significance testing on changes in that locus observed between samples."/>
 
-        <param name="do_normalization" type="select" label="Perform quartile normalization" help="Removes top 25% of genes from FPKM denominator to improve accuracy of differential expression calls for low abundance transcripts.">
-            <option value="No">No</option>
-            <option value="Yes">Yes</option>
-        </param>
-
         <param name="multiread_correct" type="select" label="Use multi-read correct" help="Tells Cufflinks to do an initial estimation procedure to more accurately weight reads mapping to multiple locations in the genome.">
             <option value="No" selected="true">No</option>
             <option value="Yes">Yes</option>
@@ -168,142 +130,95 @@
             <when value="No"></when>
         </conditional>
 
+        <param name="include_read_group_files" type="select" label="Include Read Group Datasets" help="Read group datasets provide information on replicates.">
+            <option value="No" selected="true">No</option>
+            <option value="Yes">Yes</option>
+        </param>
+        <param name="build_cummerbund_db" type="boolean" truevalue="yes" falsevalue="no" checked="false" label="Build cummeRbund database"/>
         <conditional name="additional">
-            <param name="sAdditional" type="select" label="Set Additional Parameters? (not recommended)">
+            <param name="sAdditional" type="select" label="Set Additional Parameters? (not recommended for paired-end reads)">
                 <option value="No">No</option>
                 <option value="Yes">Yes</option>
             </param>
             <when value="No"></when>
             <when value="Yes">
-                <param name="frag_mean_len" type="integer" value="200" label="Average Fragment Length"/>
-                <param name="frag_len_std_dev" type="integer" value="80" label="Fragment Length Standard Deviation"/>
+                <param name="frag_mean_len" type="integer" value="" optional="true" label="Average Fragment Length Default: 200">
+                    <help>
+                    Note: Cufflinks now learns the fragment length mean for each SAM file, 
+                    so using this option is no longer recommended with paired-end reads.
+                    </help>
+                </param>
+                <param name="frag_len_std_dev" type="integer" value="" optional="true" label="Fragment Length Standard Deviation Default: 80">
+                    <help>
+                    Note: Cufflinks now learns the fragment length standard deviation for each SAM file, 
+                    so using this option is no longer recommended with paired-end reads.
+                    </help>
+                </param>
+                <param name="max_bundle_frags" type="integer" value="" optional="true" label="--max-bundle-frags">
+                    <help>
+                    Sets the maximum number of fragments a locus may have before being skipped. Skipped loci are listed in skipped.gtf. Default: 1000000
+                    </help>
+                    <validator type="in_range" message="Value greater than 0" min="1"/>
+                </param>
             </when>
         </conditional>
+    </inputs>
 
-        <param name="output_sel" type="select" multiple="true" display="checkboxes" force_select="true" label="Select outputs for history datasets">
-            <option value="cuffdata">cuffdata - html page with links to cuffdiff outputs</option>
-            <option value="cummeRbund_db">cummeRbund database</option>
-            <option value="run_info">run.info</option>
-            <option value="read_groups_info">read_groups.info</option>
-            <option value="splicing_diff">splicing.diff</option>
-            <option value="promoters_diff">promoters.diff</option>
-            <option value="genes_exp_diff">genes_exp.diff</option>
-            <option value="genes_fpkm_tracking">genes.fpkm_tracking</option>
-            <option value="genes_count_tracking">genes.count_tracking</option>
-            <option value="genes_read_group_tracking">genes.read_group_tracking</option>
-            <option value="isoforms_exp_diff">isoforms.exp_diff</option>
-            <option value="isoforms_fpkm_tracking">isoforms.fpkm_tracking</option>
-            <option value="isoforms_count_tracking">isoforms.count_tracking</option>
-            <option value="isoforms_read_group_tracking">isoforms.read_group_tracking</option>
-            <option value="cds_diff">cds.diff</option>
-            <option value="cds_exp_diff">cds_exp.diff</option>
-            <option value="cds_fpkm_tracking">cds.fpkm_tracking</option>
-            <option value="cds_count_tracking">cds.count_tracking</option>
-            <option value="cds_read_group_tracking">cds.read_group_tracking</option>
-            <option value="tss_groups_exp_diff">tss_groups_exp.diff</option>
-            <option value="tss_groups_fpkm_tracking">tss_groups.fpkm_tracking</option>
-            <option value="tss_groups_count_tracking">tss_groups.count_tracking</option>
-            <option value="tss_groups_read_group_tracking">tss_groups.read_group_tracking</option>
-        </param>
-        
-    </inputs>
+    <stdio>
+        <regex match="Error" source="both" level="fatal" description="Error"/>
+        <regex match=".*" source="both" level="log" description="tool progress"/>
+    </stdio>
 
     <outputs>
-        <data format="text" name="run_info" label="${tool.name} on ${on_string}: run.info">
-            <filter>output_sel and 'run_info' in output_sel</filter>
-        </data>
-        <data format="tabular" name="read_groups_info" label="${tool.name} on ${on_string}: read_groups.info">
-            <filter>output_sel and 'read_groups_info' in output_sel</filter>
-        </data>
-        <data format="tabular" name="splicing_diff" label="${tool.name} on ${on_string}: splicing differential expression testing">
-            <filter>output_sel and 'splicing_diff' in output_sel</filter>
-        </data>
-        <data format="tabular" name="promoters_diff" label="${tool.name} on ${on_string}: promoters differential expression testing">
-            <filter>output_sel and 'promoters_diff' in output_sel</filter>
-        </data>
-        <data format="tabular" name="cds_diff" label="${tool.name} on ${on_string}: CDS overloading diffential expression testing">
-            <filter>output_sel and 'cds_diff' in output_sel</filter>
-        </data>
-        <data format="tabular" name="cds_exp_diff" label="${tool.name} on ${on_string}: CDS differential expression testing">
-            <filter>output_sel and 'cds_exp_diff' in output_sel</filter>
+        <!-- Optional datasets: cummeRbund database and read group tracking files. -->
+        <data format="cuffdatadb" name="cummerbund_db" label="${tool.name} on ${on_string}: cummeRbund sqlite Database" >
+            <filter>build_cummerbund_db</filter>
         </data>
-        <data format="tabular" name="cds_fpkm_tracking" label="${tool.name} on ${on_string}: CDS FPKM tracking">
-            <filter>output_sel and 'cds_fpkm_tracking' in output_sel</filter>
-        </data>
-        <data format="tabular" name="cds_count_tracking" label="${tool.name} on ${on_string}: CDS counts">
-            <filter>output_sel and 'cds_count_tracking' in output_sel</filter>
-        </data>
-        <data format="tabular" name="cds_read_group_tracking" label="${tool.name} on ${on_string}: CDS Read Group tracking">
-            <filter>output_sel and 'cds_read_group_tracking' in output_sel</filter>
+        <data format="tabular" name="isoforms_read_group" label="${tool.name} on ${on_string}: isoforms read group tracking" from_work_dir="isoforms.read_group_tracking" >
+            <filter>(include_read_group_files == 'Yes')</filter>
         </data>
-        <data format="tabular" name="tss_groups_exp_diff" label="${tool.name} on ${on_string}: TSS groups differential expression testing">
-            <filter>output_sel and 'tss_groups_exp_diff' in output_sel</filter>
+        <data format="tabular" name="genes_read_group" label="${tool.name} on ${on_string}: genes read group tracking" from_work_dir="genes.read_group_tracking" >
+            <filter>(include_read_group_files == 'Yes')</filter>
         </data>
-        <data format="tabular" name="tss_groups_fpkm_tracking" label="${tool.name} on ${on_string}: TSS groups FPKM tracking">
-            <filter>output_sel and 'tss_groups_fpkm_tracking' in output_sel</filter>
-        </data>
-        <data format="tabular" name="tss_groups_count_tracking" label="${tool.name} on ${on_string}: TSS groups counts">
-            <filter>output_sel and 'tss_groups_count_tracking' in output_sel</filter>
+        <data format="tabular" name="cds_read_group" label="${tool.name} on ${on_string}: CDs read group tracking" from_work_dir="cds.read_group_tracking" >
+            <filter>(include_read_group_files == 'Yes')</filter>
         </data>
-        <data format="tabular" name="tss_groups_read_group_tracking" label="${tool.name} on ${on_string}: TSS groups Read Group tracking">
-            <filter>output_sel and 'tss_groups_read_group_tracking' in output_sel</filter>
-        </data>
-        <data format="tabular" name="isoforms_exp_diff" label="${tool.name} on ${on_string}: transcript differential expression testing">
-            <filter>output_sel and 'isoforms_exp_diff' in output_sel</filter>
-        </data>
-        <data format="tabular" name="isoforms_fpkm_tracking" label="${tool.name} on ${on_string}: transcript FPKM tracking">
-            <filter>output_sel and 'isoforms_fpkm_tracking' in output_sel</filter>
-        </data>
-        <data format="tabular" name="isoforms_count_tracking" label="${tool.name} on ${on_string}: transcript counts">
-            <filter>output_sel and 'isoforms_count_tracking' in output_sel</filter>
-        </data>
-        <data format="tabular" name="isoforms_read_group_tracking" label="${tool.name} on ${on_string}: transcript Read Group tracking">
-            <filter>output_sel and 'isoforms_read_group_tracking' in output_sel</filter>
-        </data>
-        <data format="tabular" name="genes_exp_diff" label="${tool.name} on ${on_string}: gene differential expression testing">
-            <filter>output_sel and 'genes_exp_diff' in output_sel</filter>
+        <data format="tabular" name="tss_groups_read_group" label="${tool.name} on ${on_string}: TSS groups read group tracking" from_work_dir="tss_groups.read_group_tracking" >
+            <filter>(include_read_group_files == 'Yes')</filter>
         </data>
-        <data format="tabular" name="genes_fpkm_tracking" label="${tool.name} on ${on_string}: gene FPKM tracking">
-            <filter>output_sel and 'genes_fpkm_tracking' in output_sel</filter>
-        </data>
-        <data format="tabular" name="genes_count_tracking" label="${tool.name} on ${on_string}: gene counts">
-            <filter>output_sel and 'genes_count_tracking' in output_sel</filter>
-        </data>
-        <data format="tabular" name="genes_read_group_tracking" label="${tool.name} on ${on_string}: gene Read Group tracking">
-            <filter>output_sel and 'genes_read_group_tracking' in output_sel</filter>
-        </data>
-        <data format="cuffdata" name="cuffdata" label="${tool.name} on ${on_string}: cuffdata" >
-            <filter>not output_sel or output_sel and 'cuffdata' in output_sel</filter>
-        </data>
-        <data format="cuffdatadb" name="cummeRbund_db" label="${tool.name} on ${on_string}: cummeRbund sqlite Database" >
-            <filter>output_sel and 'cummeRbund_db' in output_sel</filter>
-        </data>
+
+        <!-- Standard datasets. -->
+        <data format="tabular" name="splicing_diff" label="${tool.name} on ${on_string}: splicing differential expression testing" from_work_dir="splicing.diff" />
+        <data format="tabular" name="promoters_diff" label="${tool.name} on ${on_string}: promoters differential expression testing" from_work_dir="promoters.diff" />
+        <data format="tabular" name="cds_diff" label="${tool.name} on ${on_string}: CDS overloading diffential expression testing" from_work_dir="cds.diff" />
+        <data format="tabular" name="cds_exp_fpkm_tracking" label="${tool.name} on ${on_string}: CDS FPKM differential expression testing" from_work_dir="cds_exp.diff" />
+        <data format="tabular" name="cds_fpkm_tracking" label="${tool.name} on ${on_string}: CDS FPKM tracking" from_work_dir="cds.fpkm_tracking" />
+        <data format="tabular" name="tss_groups_exp" label="${tool.name} on ${on_string}: TSS groups differential expression testing" from_work_dir="tss_group_exp.diff" />
+        <data format="tabular" name="tss_groups_fpkm_tracking" label="${tool.name} on ${on_string}: TSS groups FPKM tracking" from_work_dir="tss_groups.fpkm_tracking" />
+        <data format="tabular" name="genes_exp" label="${tool.name} on ${on_string}: gene differential expression testing" from_work_dir="gene_exp.diff" />
+        <data format="tabular" name="genes_fpkm_tracking" label="${tool.name} on ${on_string}: gene FPKM tracking" from_work_dir="genes.fpkm_tracking" />
+        <data format="tabular" name="isoforms_exp" label="${tool.name} on ${on_string}: transcript differential expression testing" from_work_dir="isoform_exp.diff" />
+        <data format="tabular" name="isoforms_fpkm_tracking" label="${tool.name} on ${on_string}: transcript FPKM tracking" from_work_dir="isoforms.fpkm_tracking" />
     </outputs>
-    <stdio>
-        <exit_code range="1:"  level="fatal"   description="Cufflinks Err" />
-    </stdio>
-
 
     <tests>
         <test>
                 <!--
                     cuffdiff cuffcompare_out5.gtf cuffdiff_in1.sam cuffdiff_in2.sam 
                 -->
+                <!-- 
+                    NOTE: as of version 0.0.6 of the wrapper, tests cannot be run because multiple inputs to a repeat
+                    element are not supported.
                 <param name="gtf_input" value="cuffcompare_out5.gtf" ftype="gtf" />
                 <param name="do_groups" value="No" />
                 <param name="aligned_reads1" value="cuffdiff_in1.sam" ftype="sam" />
                 <param name="aligned_reads2" value="cuffdiff_in2.sam" ftype="sam" />
-                <!-- Defaults. -->
                 <param name="fdr" value="0.05" />
                 <param name="min_alignment_count" value="0" />
                 <param name="do_bias_correction" value="No" />
                 <param name="do_normalization" value="No" />
                 <param name="multiread_correct" value="No"/>
                 <param name="sAdditional" value="No"/>
-                <!-- 
-                    Line diffs are needed because cuffdiff does not produce deterministic output.
-                    TODO: can we find datasets that lead to deterministic behavior?
-                -->
                 <output name="splicing_diff" file="cuffdiff_out9.txt"/>
                 <output name="promoters_diff" file="cuffdiff_out10.txt"/>
                 <output name="cds_diff" file="cuffdiff_out11.txt"/>
@@ -315,6 +230,7 @@
                 <output name="genes_fpkm_tracking" file="cuffdiff_out6.txt" lines_diff="200"/>
                 <output name="isoforms_exp" file="cuffdiff_out1.txt" lines_diff="200"/>
                 <output name="isoforms_fpkm_tracking" file="cuffdiff_out5.txt" lines_diff="200"/>
+                -->
         </test>
     </tests>