diff rnachipintegrator_wrapper.xml @ 1:5f69a2c1b9c9 draft

Uploaded version 1.0.0.0.
author pjbriggs
date Wed, 24 Feb 2016 09:39:14 -0500
parents d9c1f2133124
children dc498b03ca9a
line wrap: on
line diff
--- a/rnachipintegrator_wrapper.xml	Tue Jun 30 06:44:06 2015 -0400
+++ b/rnachipintegrator_wrapper.xml	Wed Feb 24 09:39:14 2016 -0500
@@ -1,161 +1,150 @@
 <?xml version="1.0" encoding="utf-8"?>
-<tool id="rnachipintegrator_wrapper" name="RnaChipIntegrator" version="@VERSION@-0">
-  <description>Integrated analysis of gene expression data and ChIP data</description>
+<tool id="rnachipintegrator_wrapper" name="RnaChipIntegrator" version="@VERSION@.0">
+  <description>Integrated analysis of 'gene' and 'peak' data</description>
   <macros>
     <import>rnachipintegrator_macros.xml</import>
   </macros>
   <expand macro="requirements" />
   <expand macro="version_command" />
-  <command interpreter="bash">rnachipintegrator_wrapper.sh
-  #if str( $analysis_options.peak_type ) == "summits"
-    #if str( $analysis_options.window ) != ""
-    --window=$analysis_options.window
-    #end if
-    #if str( $analysis_options.cutoff ) != ""
-    --cutoff=$analysis_options.cutoff
-    #end if
+  <command interpreter="bash"><![CDATA[
+  rnachipintegrator_wrapper.sh
+  #if $peaks_in.metadata.chromCol
+    --peak_cols=${peaks_in.metadata.chromCol},${peaks_in.metadata.startCol},${peaks_in.metadata.endCol}
   #end if
-  #if str( $analysis_options.peak_type ) == "regions"
-    #if str( $analysis_options.edge_cutoff ) != ""
-    --edge-cutoff=$analysis_options.edge_cutoff
-    #end if
-    #if str( $analysis_options.number ) != ""
-    --number=$analysis_options.number
-    #end if
-    #if (str( $analysis_options.promoter_start ) != "" and str( $analysis_options.promoter_end ))
-    --promoter_region=$analysis_options.promoter_start,$analysis_options.promoter_end
-    #end if
-    #if $analysis_options.pad_output
-    --pad
-    #end if
+  #if str( $cutoff ) != ""
+    --cutoff=$cutoff
+  #else
+    --cutoff=0
   #end if
-  $rnaseq $chipseq
-  --output_xls $xls_output
-  #if $results_as_zip
-  --zip_file $zip_file
-  #else
-    #if str( $analysis_options.peak_type ) == "summits"
-    --summit_outputs $peaks_to_transcripts_out $tss_to_summits_out
-    #end if
-    #if str( $analysis_options.peak_type ) == "regions"
-    --peak_outputs $transcripts_to_edges_out
-                   $transcripts_to_edges_summary
-                   $tss_to_edges_out
-                   $tss_to_edges_summary
-    #end if
+  #if str( $number ) != ""
+    --number=$number
   #end if
-  </command>
+  --promoter_region=$promoter_start,$promoter_end
+  --edge=$edge
+  $diff_expressed_only
+  --xlsx_file "$xlsx_out"
+  --output_files "$peaks_per_feature_out" "$features_per_peak_out"
+  #if $output.compact_format
+    --compact
+  #else
+    #if $output.summary
+      --summary_files "$peaks_per_feature_summary" "$features_per_peak_summary"
+    #end if
+    ${output.pad_output}
+  #end if
+  "$features_in" "$peaks_in"
+  ]]></command>
   <inputs>
-    <param format="tabular" name="rnaseq" type="data" label="Gene expression data file" />
-    <param format="tabular" name="chipseq" type="data" label="ChIP peaks data file" />
-  <conditional name="analysis_options">
-    <!-- user must specify if ChIP peaks are summits or regions -->
-    <param name="peak_type" type="select" label="ChIP peaks are"
-	   help="Options and outputs depend on whether ChIP data are summits or regions">
-      <option value="summits">summits</option>
-      <option value="regions">regions</option>
-    </param>
-    <when value="summits">
-      <param name="window" type="integer" value="20000" optional="true"
-	     label="Maximum distance a peak can be from each transcript
-		    TSS before being omitted from analysis" />
-      <param name="cutoff" type="integer" value="130000" optional="true"
-	     label="Maximum distance a transcript TSS can be from each
-		    peak before being omitted from the analysis" />
-    </when>
-    <when value="regions">
-      <param name="edge_cutoff" type="integer" value="10000" optional="true"
-	     label="Maximum distance a transcript edge can be from the
-		    peak edge before being omitted from the analysis"
-	     help="Set to zero to indicate that no cut off should be applied" />
-      <param name="number" type="integer" value="4" optional="true"
-	     label="Maximum number of transcripts per peak to report from
-		    from the analysis" />
-      <param name="promoter_start" type="integer" value="-10000" optional="true"
-	     label="Start of promoter region with respect to gene TSS" />
-      <param name="promoter_end" type="integer" value="2500" optional="true"
-	     label="End of promoter region with respect to gene TSS" />
-      <param name="pad_output" type="boolean" checked="false" truevalue="yes"
-	     label="Output same number of lines for each peak"
-	     help="Add blank lines in output for peaks with fewer than maximum number
-		   of hits (--pad)" />
-    </when>
-  </conditional>
-    <param name="results_as_zip" type="boolean" checked="false" truevalue="yes"
-           label="Put output tab-delimited files into a single zip archive" />
+    <param format="tabular" name="features_in" type="data"
+	   label="Genes/genomic features" />
+    <param format="tabular" name="peaks_in" type="data"
+	   label="Peaks/regions" />
+    <expand macro="analysis_options" />
+    <param name="diff_expressed_only" type="boolean"
+	   truevalue="--only-DE" falsevalue="" checked="false"
+	   label="Only consider genes which are flagged as differentially
+		  expressed"
+           help="NB input feature data must include differential expression
+		 flags (--only-DE)" />
+    <expand macro="output_options" />
   </inputs>
   <outputs>
-    <!-- Always produce XLS output -->
-    <data format="xls" name="xls_output"
-	  label="All RnaChipIntegrator analyses for ${rnaseq.name} vs ${chipseq.name} (Excel spreadsheet)" />
-    <!-- Outputs only produced for summit data -->
-    <data format="tabular" name="peaks_to_transcripts_out"
-	  label="Nearest summits to transcripts for ${rnaseq.name} vs ${chipseq.name}" >
-      <filter>analysis_options['peak_type'] == "summits"</filter>
-      <filter>results_as_zip is False</filter>
-    </data>
-    <data format="tabular" name="tss_to_summits_out"
-	  label="Nearest TSS to summits for ${rnaseq.name} vs ${chipseq.name}" >
-      <filter>analysis_options['peak_type'] == "summits"</filter>
-      <filter>results_as_zip is False</filter>
+    <!-- Always produce XLSX output -->
+    <data format="xlsx" name="xlsx_out"
+	  label="All RnaChipIntegrator analyses: ${features_in.name} vs ${peaks_in.name} (Excel spreadsheet)" />
+    <data format="tabular" name="peaks_per_feature_out"
+	  label="Nearest peaks to each gene: ${features_in.name} vs ${peaks_in.name}" />
+    <data format="tabular" name="features_per_peak_out"
+	  label="Nearest genes to each peak: ${features_in.name} vs ${peaks_in.name}" />
+    <data format="tabular" name="peaks_per_feature_summary"
+	  label="Nearest peaks to each gene (summary): ${features_in.name} vs ${peaks_in.name}" >
+      <filter>output['compact_format'] is False</filter>
+      <filter>output['summary'] is True</filter>
     </data>
-    <!-- Outputs only produced for peak data -->
-    <data format="tabular" name="transcripts_to_edges_out"
-	  label="Nearest transcripts to peak edges for ${rnaseq.name} vs ${chipseq.name}" >
-      <filter>analysis_options['peak_type'] == "regions"</filter>
-      <filter>results_as_zip is False</filter>
-    </data>
-    <data format="tabular" name="transcripts_to_edges_summary"
-	  label="Nearest transcripts to peak edges (summary) for ${rnaseq.name} vs ${chipseq.name}" >
-      <filter>analysis_options['peak_type'] == "regions"</filter>
-      <filter>results_as_zip is False</filter>
-    </data>
-    <data format="tabular" name="tss_to_edges_out"
-	  label="Nearest TSS to peak edges for ${rnaseq.name} vs ${chipseq.name}" >
-      <filter>analysis_options['peak_type'] == "regions"</filter>
-      <filter>results_as_zip is False</filter>
-    </data>
-    <data format="tabular" name="tss_to_edges_summary"
-	  label="Nearest TSS to peak edges (summary) for ${rnaseq.name} vs ${chipseq.name}" >
-      <filter>analysis_options['peak_type'] == "regions"</filter>
-      <filter>results_as_zip is False</filter>
-    </data>
-    <data format="zip" name="zip_file"
-	  label="All tab-delimited files for ${rnaseq.name} vs ${chipseq.name} (zip file)" >
-      <filter>results_as_zip is True</filter>
+    <data format="tabular" name="features_per_peak_summary"
+	  label="Nearest gene to each peak (summary): ${features_in.name} vs ${peaks_in.name}" >
+      <filter>output['compact_format'] is False</filter>
+      <filter>output['summary'] is True</filter>
     </data>
   </outputs>
   <tests>
+    <!--
+	RnaChipIntegrator +name=test +cutoff=130000 +promoter_region=-10000,2500 +xlsx +compact features.txt summits.txt
+    -->
     <test>
-      <param name="rnaseq" value="ExpressionData.txt" ftype="tabular" />
-      <param name="chipseq" value="ChIP_summits.txt" ftype="tabular" />
-      <param name="peak_type" value="summits" />
-      <param name="window" value="20000" />
+      <param name="features_in" value="features.txt" ftype="tabular" />
+      <param name="peaks_in" value="summits.txt" ftype="tabular" />
       <param name="cutoff" value="130000" />
-      <!-- 
-      **NB** outputs have to be specified in order that they appear in the
-      tool (which is the order they will be written to the history) - the
-      test framework seems to use the order and ignores the "name" attribute
-      -->
-      <output name="xls_output" file="summits.xls" compare="sim_size" />
-      <output name="peaks_to_transcripts_out" file="peaks_to_transcripts.out" ftype="tabular" />
-      <output name="tss_to_summits_out" file="tss_to_summits.out" ftype="tabular" />
+      <param name="promoter_start" value="-10000" />
+      <param name="promoter_end" value="2500" />
+      <output name="xlsx_out" file="summits.xlsx" compare="sim_size" />
+      <output name="peaks_per_feature_out" ftype="tabular"
+	      file="summits_per_feature.out" />
+      <output name="features_per_peak_out" ftype="tabular"
+	      file="features_per_summit.out" />
+    </test>
+    <!--
+	RnaChipIntegrator +name=test +cutoff=130000 +promoter_region=-10000,2500 +xlsx +compact features.txt peaks.txt
+    -->
+    <test>
+      <param name="features_in" value="features.txt" ftype="tabular" />
+      <param name="peaks_in" value="peaks.txt" ftype="tabular" />
+      <param name="cutoff" value="130000" />
+      <param name="promoter_start" value="-10000" />
+      <param name="promoter_end" value="2500" />
+      <output name="xlsx_out" file="peaks1.xlsx" compare="sim_size" />
+      <output name="peaks_per_feature_out" ftype="tabular"
+	      file="peaks_per_feature1.out" />
+      <output name="features_per_peak_out" ftype="tabular"
+	      file="features_per_peak1.out" />
     </test>
+    <!--
+	RnaChipIntegrator +name=test +cutoff=130000 +xlsx features.txt peaks.txt
+    -->
     <test>
-      <param name="rnaseq" value="ExpressionData.txt" ftype="tabular" />
-      <param name="chipseq" value="ChIP_peaks.txt" ftype="tabular" />
-      <param name="peak_type" value="regions" />
-      <param name="edge_cutoff" value="130000" />
-      <!-- 
-      **NB** outputs have to be specified in order that they appear in the
-      tool (which is the order they will be written to the history) - the
-      test framework seems to use the order and ignores the "name" attribute
-      -->
-      <output name="xls_output" file="peaks.xls" compare="sim_size" />
-      <output name="transcripts_to_edges_out" file="transcripts_to_edges.out" ftype="tabular" />
-      <output name="transcripts_to_edges_summary" file="transcripts_to_edges.summary" ftype="tabular" />
-      <output name="tss_to_edges_out" file="tss_to_edges.out" ftype="tabular" />
-      <output name="tss_to_edges_summary" file="tss_to_edges.summary" ftype="tabular" />
+      <param name="features_in" value="features.txt" ftype="tabular" />
+      <param name="peaks_in" value="peaks.txt" ftype="tabular" />
+      <param name="cutoff" value="130000" />
+      <param name="compact_format" value="false" />
+      <output name="xlsx_out" file="peaks2.xlsx" compare="sim_size" />
+      <output name="peaks_per_feature_out" ftype="tabular"
+	      file="peaks_per_feature2.out" />
+      <output name="features_per_peak_out" ftype="tabular"
+	      file="features_per_peak2.out" />
+    </test>
+    <!--
+	RnaChipIntegrator +name=test +cutoff=130000 +only-DE +xlsx +compact features.txt peaks.txt
+    -->
+    <test>
+      <param name="features_in" value="features.txt" ftype="tabular" />
+      <param name="peaks_in" value="peaks.txt" ftype="tabular" />
+      <param name="cutoff" value="130000" />
+      <param name="diff_expressed_only" value="true" />
+      <output name="xlsx_out" file="peaks3.xlsx" compare="sim_size" />
+      <output name="peaks_per_feature_out" ftype="tabular"
+	      file="peaks_per_feature3.out" />
+      <output name="features_per_peak_out" ftype="tabular"
+	      file="features_per_peak3.out" />
+    </test>
+    <!--
+	RnaChipIntegrator +name=test +cutoff=130000 +promoter_region=-10000,2500 +xlsx +summary features.txt peaks.txt
+    -->
+    <test>
+      <param name="features_in" value="features.txt" ftype="tabular" />
+      <param name="peaks_in" value="peaks.txt" ftype="tabular" />
+      <param name="cutoff" value="130000" />
+      <param name="compact_format" value="false" />
+      <param name="summary" value="true" />
+      <param name="pad_output" value="true" />
+      <output name="xlsx_out" file="peaks4.xlsx" compare="sim_size" />
+      <output name="peaks_per_feature_out" ftype="tabular"
+	      file="peaks_per_feature4.out" />
+      <output name="features_per_peak_out" ftype="tabular"
+	      file="features_per_peak4.out" />
+      <output name="peaks_per_feature_summary" ftype="tabular"
+	      file="peaks_per_feature4.summary" />
+      <output name="features_per_peak_summary" ftype="tabular"
+	      file="features_per_peak4.summary" />
     </test>
   </tests>
   <help>
@@ -164,33 +153,17 @@
 
 **What it does**
 
-Run RnaChipIntegrator to perform integrated analyses of gene expression
-and ChIP data, identifying the nearest ChIP peaks to each transcript
-and vice versa.
-
-For ChIP peaks defined as regions the following analyses are performed:
-
- * **TranscriptsToPeakEdges**: reports the nearest transcripts with the smallest
-   distance from either their TSS or TES to the nearest peak edge.
-
- * **TSSToPeakEdges**: reports the nearest transcripts with the smallest distance
-   from their TSS to the nearest peak edge.
-
-For ChIP peaks defined as summits:
+Performs integrated analyses of genes (or other genomic feature data)
+against a set of peaks (e.g. ChIP data), identifying the nearest peaks to
+each feature and vice versa.
 
- * **TSSToSummits**: reports the nearest transcripts with the smallest distance
-   from the TSS to the nearest peak summit.
-
- * **PeaksToTranscripts**: reports the nearest peak summits with the smallest
-   distance to either the TSS or TES of each transcript.
+The program was originally written specifically for ChIP-Seq and RNA-Seq
+data but works equally well for ChIP-chip and microarray expression data,
+and can also be used to integrate any set of genomic features (e.g.
+canonical genes, CpG islands) with expression data.
 
-The program was originally written specifically for ChIP-Seq and RNA-Seq data
-but works equally well for ChIP-chip and microarray expression data, and can
-also be used to integrate any set of genomic features (e.g. canonical genes,
-CpG islands) with expression data.
-
-RnaChipIntgerator can be obtained from
-http://fls-bioinformatics-core.github.com/RnaChipIntegrator/
+RnaChipIntegrator can be obtained from
+https://pypi.python.org/pypi/RnaChipIntegrator/
 
 -------------
 
@@ -198,53 +171,81 @@
 
 **Input**
 
-The expression data must be in a tab-delimited file with the following columns
-of data for each genomic feature (one feature per line):
+The gene data must be in a tabular file with the following columns
+of data for each gene or genomic feature (one gene per line):
 
 ====== ========== ======================================================================
 Column Name       Description
 ====== ========== ======================================================================
-     1 ID         Name used to identify the feature in the output
+     1 ID         Name used to identify the gene in the output
      2 chr        Chromosome name
-     3 start      Start position of the feature
-     4 end        End position of the feature
+     3 start      Start position of the gene
+     4 end        End position of the gene
      5 strand     Must be either '+' or '-'
-     6 diff_expr  Optional: indicates feature is differentially expressed (1) or not (0)
+     6 diff_expr  Optional: indicates gene is differentially expressed (1) or not (0)
 ====== ========== ======================================================================
 
-The ChIP-seq data must be in a tab-delimited file with 3 columns of data for each
-ChIP peak (one per line):
+The peak data must be in a tabular file with at least 3 columns of data
+for each peak (one peak per line):
 
-====== ========== ======================================================================
+====== ========== =================================
 Column Name       Description
-====== ========== ======================================================================
-     1 chr        Chromosome name (must match one of those in expression data file)
+====== ========== =================================
+     1 chr        Chromosome name
      2 start      Start position of the peak 
-     3 end        End position of the peak (start + 1 for summit data)
-====== ========== ======================================================================
+     3 end        End position of the peak
+====== ========== =================================
 
-The ChIP peak data can be either the summit (in which case 'end' - 'start' = 1) or the
-entire extent of the binding region (with 'start' and 'end' indicating the limits).
+If peak data is in ``bed`` format then the tool will automatically
+assign the correct columns, otherwise the first three columns of data
+will be used.
 
 -------------
 
 .. class:: infomark
 
-**Output**
+**Outputs**
+
+The key outputs from the tool are two lists comprising the nearest
+peaks for each gene, and the nearest gene for each peak (one dataset
+for each list).
+
+There are two formats for reporting: "compact" and "full":
 
-The outputs from this tool vary depending on the type of data that is input, however
-generally there is one tab-delimited results file for each analysis described above
-in the **What it does** section (some analyses output a second file with just the
-"best" hits).
+ * **Compact output** reports all the hits for each peak or gene on
+   a single line of output;
+ * **Full output** reports each peak/gene pair on a separate line
+   (i.e. a multi-line output format).
+
+In "full" output mode, additional options are available:
+
+ * The output files can be "padded" with extra (empty) lines to ensure
+   that there are always the same number of lines for each peak or
+   gene, if fewer than the requested number of hits are found.
+ * "Summary" datasets can also be requested, which include just the
+   nearest peak reported for each gene (and vice versa).
 
-A history item will be generated for each output file, unless the option to put them
-into a single zip archive is selected; this archive file will have to be downloaded
-and unzipped on your local machine. It is recommended that you refer to the
-RnaChipIntegrator documentation for information on the contents of each output file:
-https://github.com/fls-bioinformatics-core/RnaChipIntegrator/blob/master/doc/MANUAL.markdown
+In either mode these data will also be output in a single MS Excel file,
+which contains one sheet per result set.
+
+.. class:: warningmark
+
+Using "compact" output with the number of hits limited to more than 4
+peak/gene pairs (or with no limit at all) can result in a large number
+of columns in the output files, which in some versions of Galaxy will
+not be properly displayed. However the data files themselves should be
+okay.
 
-In addition an Excel spreadsheet (with one page for each analysis performed) is always
-produced.
+-------------
+
+.. class:: infomark
+
+**More information**
+
+It is recommended that you refer to the ``RnaChipIntegrator``
+documentation for information on the contents of each output file:
+
+* http://rnachipintegrator.readthedocs.org/en/latest/
 
 -------------
 
@@ -259,4 +260,5 @@
 
 Please kindly acknowledge the Bioinformatics Core Facility if you use this tool.
   </help>
+  <expand macro="citations" />
 </tool>