diff rnachipintegrator_canonical_genes.xml @ 1:5f69a2c1b9c9 draft

Uploaded version 1.0.0.0.
author pjbriggs
date Wed, 24 Feb 2016 09:39:14 -0500
parents d9c1f2133124
children b695071de766
line wrap: on
line diff
--- a/rnachipintegrator_canonical_genes.xml	Tue Jun 30 06:44:06 2015 -0400
+++ b/rnachipintegrator_canonical_genes.xml	Wed Feb 24 09:39:14 2016 -0500
@@ -1,153 +1,111 @@
-<tool id="rnachipintegrator_canonical_genes" name="Analyse canonical genes against ChIP data" version="@VERSION@-0">
+<tool id="rnachipintegrator_canonical_genes" name="Analyse canonical genes against 'peak' data" version="@VERSION@.0">
   <description>using RnaChipIntegrator</description>
   <macros>
     <import>rnachipintegrator_macros.xml</import>
   </macros>
   <expand macro="requirements" />
   <expand macro="version_command" />
-  <command interpreter="bash">rnachipintegrator_wrapper.sh
-  #if str( $analysis_options.peak_type ) == "summits"
-    #if str( $analysis_options.window ) != ""
-    --window=$analysis_options.window
-    #end if
-    #if str( $analysis_options.cutoff ) != ""
-    --cutoff=$analysis_options.cutoff
-    #end if
+  <command interpreter="bash"><![CDATA[
+  rnachipintegrator_wrapper.sh
+  #if $peaks_in.metadata.chromCol
+    --peak_cols=${peaks_in.metadata.chromCol},${peaks_in.metadata.startCol},${peaks_in.metadata.endCol}
+  #end if
+  #if str( $cutoff ) != ""
+    --cutoff=$cutoff
+  #else
+    --cutoff=0
   #end if
-  #if str( $analysis_options.peak_type ) == "regions"
-    #if str( $analysis_options.edge_cutoff ) != ""
-    --edge-cutoff=$analysis_options.edge_cutoff
-    #end if
-    #if str( $analysis_options.number ) != ""
-    --number=$analysis_options.number
-    #end if
-    #if (str( $analysis_options.promoter_start ) != "" and str( $analysis_options.promoter_end ))
-    --promoter_region=$analysis_options.promoter_start,$analysis_options.promoter_end
+  #if str( $number ) != ""
+    --number=$number
+  #end if
+  --promoter_region=$promoter_start,$promoter_end
+  --edge=$edge
+  --xlsx_file "$xlsx_out"
+  --output_files "$peaks_per_feature_out" "$features_per_peak_out"
+  #if $output.compact_format
+    --compact
+  #else
+    #if $output.summary
+      --summary_files "$peaks_per_feature_summary" "$features_per_peak_summary"
     #end if
-    #if $analysis_options.pad_output
-    --pad
-    #end if
+    ${output.pad_output}
   #end if
-  ${canonical_genes.fields.path} $chipseq
-  --output_xls $xls_output
-  #if $results_as_zip
-  --zip_file $zip_file
-  #else
-    #if str( $analysis_options.peak_type ) == "summits"
-    --summit_outputs $peaks_to_transcripts_out $tss_to_summits_out
-    #end if
-    #if str( $analysis_options.peak_type ) == "regions"
-    --peak_outputs $transcripts_to_edges_out
-                   $transcripts_to_edges_summary
-                   $tss_to_edges_out
-                   $tss_to_edges_summary
-    #end if
-  #end if
-  </command>
+  "${canonical_genes.fields.path}" "$peaks_in"
+  ]]></command>
   <inputs>
-    <param format="tabular" name="chipseq" type="data" label="ChIP peaks data file" />
-    <param name="canonical_genes" type="select" label="Canonical genes to analyse ChIP peaks against">
+    <param format="tabular" name="peaks_in" type="data" label="Peaks" />
+    <param name="canonical_genes" type="select" label="Canonical genes to analyse peaks against">
       <options from_data_table="rnachipintegrator_canonical_genes">
       </options>
     </param>
-  <conditional name="analysis_options">
-    <!-- user must specify if ChIP peaks are summits or regions -->
-    <param name="peak_type" type="select" label="ChIP peaks are"
-	   help="Options and outputs depend on whether ChIP data are summits or regions">
-      <option value="summits">summits</option>
-      <option value="regions">regions</option>
-    </param>
-    <when value="summits">
-      <param name="window" type="integer" value="20000" optional="true"
-	     label="Maximum distance a peak can be from each transcript
-		    TSS before being omitted from analysis" />
-      <param name="cutoff" type="integer" value="130000" optional="true"
-	     label="Maximum distance a transcript TSS can be from each
-		    peak before being omitted from the analysis" />
-    </when>
-    <when value="regions">
-      <param name="edge_cutoff" type="integer" value="10000" optional="true"
-	     label="Maximum distance a transcript edge can be from the
-		    peak edge before being omitted from the analysis"
-	     help="Set to zero to indicate that no cut off should be applied" />
-      <param name="number" type="integer" value="4" optional="true"
-	     label="Maximum number of transcripts per peak to report from
-		    from the analysis" />
-      <param name="promoter_start" type="integer" value="-10000" optional="true"
-	     label="Start of promoter region with respect to gene TSS" />
-      <param name="promoter_end" type="integer" value="2500" optional="true"
-	     label="End of promoter region with respect to gene TSS" />
-      <param name="pad_output" type="boolean" checked="false" truevalue="yes"
-	     label="Output same number of lines for each peak (--pad)"
-	     help="Add blank lines in output for peaks with fewer than maximum number
-		   of hits" />
-    </when>
-  </conditional>
-    <param name="results_as_zip" type="boolean" checked="false" truevalue="yes"
-           label="Put output tab-delimited files into a single zip archive" />
+    <expand macro="analysis_options" />
+    <expand macro="output_options" />
   </inputs>
   <outputs>
     <!-- Always produce XLS output -->
-    <data format="xls" name="xls_output"
-	  label="All RnaChipIntegrator analyses for ${canonical_genes.fields.name} vs ${chipseq.name} (Excel spreadsheet)" />
-    <!-- Outputs only produced for summit data -->
-    <data format="tabular" name="peaks_to_transcripts_out"
-	  label="Nearest summits to transcripts for ${canonical_genes.fields.name} vs ${chipseq.name}" >
-      <filter>analysis_options['peak_type'] == "summits"</filter>
-      <filter>results_as_zip is False</filter>
-    </data>
-    <data format="tabular" name="tss_to_summits_out"
-	  label="Nearest summits to TSS for ${canonical_genes.fields.name} vs ${chipseq.name}" >
-      <filter>analysis_options['peak_type'] == "summits"</filter>
-      <filter>results_as_zip is False</filter>
-    </data>
-    <!-- Outputs only produced for peak data -->
-    <data format="tabular" name="transcripts_to_edges_out"
-	  label="Nearest transcripts to peak edges for ${canonical_genes.fields.name} vs ${chipseq.name}" >
-      <filter>analysis_options['peak_type'] == "regions"</filter>
-      <filter>results_as_zip is False</filter>
+    <data format="xlsx" name="xlsx_out"
+	  label="All RnaChipIntegrator analyses: ${canonical_genes.fields.name} vs ${peaks_in.name} (Excel spreadsheet)" />
+    <data format="tabular" name="peaks_per_feature_out"
+	  label="Nearest peaks to each gene: ${canonical_genes.fields.name} vs ${peaks_in.name}" />
+    <data format="tabular" name="features_per_peak_out"
+	  label="Nearest genes to each peak: ${canonical_genes.fields.name} vs ${peaks_in.name}" />
+    <data format="tabular" name="peaks_per_feature_summary"
+	  label="Nearest peaks to each gene (summary): ${canonical_genes.fields.name} vs ${peaks_in.name}" >
+      <filter>output['compact_format'] is False</filter>
+      <filter>output['summary'] is True</filter>
     </data>
-    <data format="tabular" name="transcripts_to_edges_summary"
-	  label="Nearest transcripts to peak edges (summary) for ${canonical_genes.fields.name} vs ${chipseq.name}" >
-      <filter>analysis_options['peak_type'] == "regions"</filter>
-      <filter>results_as_zip is False</filter>
-    </data>
-    <data format="tabular" name="tss_to_edges_out"
-	  label="Nearest TSS to peak edges for ${canonical_genes.fields.name} vs ${chipseq.name}" >
-      <filter>analysis_options['peak_type'] == "regions"</filter>
-      <filter>results_as_zip is False</filter>
-    </data>
-    <data format="tabular" name="tss_to_edges_summary"
-	  label="Nearest TSS to peak edges (summary) for ${canonical_genes.fields.name} vs ${chipseq.name}" >
-      <filter>analysis_options['peak_type'] == "regions"</filter>
-      <filter>results_as_zip is False</filter>
-    </data>
-    <data format="zip" name="zip_file"
-	  label="All tab-delimited files for ${canonical_genes.fields.name} vs ${chipseq.name} (zip file)" >
-      <filter>results_as_zip is True</filter>
+    <data format="tabular" name="features_per_peak_summary"
+	  label="Nearest genes to each peak (summary): ${canonical_genes.fields.name} vs ${peaks_in.name}" >
+      <filter>output['compact_format'] is False</filter>
+      <filter>output['summary'] is True</filter>
     </data>
   </outputs>
   <tests>
+    <!--
+	RnaChipIntegrator +name=mm9 +cutoff=50000 +xlsx +summary mm9_canonical_genes.tsv mm9_summits.txt
+    -->
     <test>
-      <param name="chipseq" value="mm9_summits.txt" />
+      <param name="peaks_in" value="mm9_summits.txt" ftype="tabular" />
       <param name="canonical_genes" value="mm9_test" />
-      <param name="peak_type" value="summits" />
-      <param name="window" value="50000" />
-      <param name="cutoff" value="130000" />
-      <output name="xls_output" file="mm9_summits.xls" compare="sim_size" />
-      <output name="peaks_to_transcripts_out" file="mm9_summits_to_transcripts.out" ftype="tabular" />
-      <output name="tss_to_summits_out" file="mm9_tss_to_summits.out" ftype="tabular" />
+      <param name="cutoff" value="50000" />
+      <output name="xlsx_out" file="mm9_summits.xlsx" compare="sim_size" />
+      <output name="peaks_per_feature_out" ftype="tabular"
+	      file="mm9_summits_per_feature.out" />
+      <output name="features_per_peak_out" ftype="tabular"
+	      file="mm9_features_per_summit.out" />
     </test>
+    <!--
+	RnaChipIntegrator +name=mm9 +cutoff=50000 +xlsx +compact mm9_canonical_genes.tsv mm9_peaks.txt
+    -->
     <test>
-      <param name="chipseq" value="mm9_peaks.txt" />
+      <param name="peaks_in" value="mm9_peaks.txt" ftype="tabular" />
       <param name="canonical_genes" value="mm9_test" />
-      <param name="peak_type" value="regions" />
-      <param name="edge_cutoff" value="50000" />
-      <output name="xls_output" file="mm9_peaks.xls" compare="sim_size" />
-      <output name="transcripts_to_edges_out" file="mm9_transcripts_to_edges.out" ftype="tabular" />
-      <output name="transcripts_to_edges_summary" file="mm9_transcripts_to_edges.summary" ftype="tabular" />
-      <output name="tss_to_edges_out" file="mm9_tss_to_edges.out" ftype="tabular" />
-      <output name="tss_to_edges_summary" file="mm9_tss_to_edges.summary" ftype="tabular" />
+      <param name="cutoff" value="50000" />
+      <output name="xlsx_out" file="mm9_peaks1.xlsx" compare="sim_size" />
+      <output name="peaks_per_feature_out" ftype="tabular"
+	      file="mm9_peaks_per_feature1.out" />
+      <output name="features_per_peak_out" ftype="tabular"
+	      file="mm9_features_per_peak1.out" />
+    </test>
+    <!--
+	RnaChipIntegrator +name=mm9 +cutoff=50000 +xlsx +summary +pad mm9_canonical_genes.tsv mm9_peaks.txt
+    -->
+    <test>
+      <param name="peaks_in" value="mm9_peaks.txt" ftype="tabular" />
+      <param name="canonical_genes" value="mm9_test" />
+      <param name="cutoff" value="50000" />
+      <param name="compact_format" value="false" />
+      <param name="summary" value="true" />
+      <param name="pad_output" value="true" />
+      <output name="xlsx_out" file="mm9_peaks3.xlsx" compare="sim_size" />
+      <output name="peaks_per_feature_out" ftype="tabular"
+	      file="mm9_peaks_per_feature3.out" />
+      <output name="features_per_peak_out" ftype="tabular"
+	      file="mm9_features_per_peak3.out" />
+      <output name="peaks_per_feature_summary" ftype="tabular"
+	      file="mm9_peaks_per_feature3.summary" />
+      <output name="features_per_peak_summary" ftype="tabular"
+	      file="mm9_features_per_peak3.summary" />
     </test>
   </tests>
   <help>
@@ -156,27 +114,11 @@
 
 **What it does**
 
-Run RnaChipIntegrator to analyse ChIP data against a set of list of "canonical
-genes" for a specific organism/genome build, identifying the nearest ChIP peaks
-to each cannonical gene (vice versa).
-
-For ChIP peaks defined as regions the following analyses are performed:
-
- * **TranscriptsToPeakEdges**: reports the nearest transcripts with the smallest
-   distance from either their TSS or TES to the nearest peak edge.
+Performs integrated analyses of a set of peaks (e.g. ChIP data) against a
+list of "canonical genes" for a specific organism and genome build,
+identifying the nearest peaks to each canonical gene (and vice versa).
 
- * **TSSToPeakEdges**: reports the nearest transcripts with the smallest distance
-   from their TSS to the nearest peak edge.
-
-For ChIP peaks defined as summits:
-
- * **TSSToSummits**: reports the nearest transcripts with the smallest distance
-   from the TSS to the nearest peak summit.
-
- * **PeaksToTranscripts**: reports the nearest peak summits with the smallest
-   distance to either the TSS or TES of each transcript.
-
-RnaChipIntgerator can be obtained from
+RnaChipIntegrator can be obtained from
 http://fls-bioinformatics-core.github.com/RnaChipIntegrator/
 
 -------------
@@ -185,39 +127,63 @@
 
 **Input**
 
-The ChIP-seq data must be in a tab-delimited file with 3 columns of data for each
-ChIP peak (one per line):
+The peak data must be in a tabular file with at least 3 columns of data
+for each peak (one peak per line):
 
-====== ========== ======================================================================
+====== ========== =================================
 Column Name       Description
-====== ========== ======================================================================
-     1 chr        Chromosome name (must match one of those in expression data file)
-     2 start      Start position of the peak 
-     3 end        End position of the peak (start + 1 for summit data)
-====== ========== ======================================================================
-
-The ChIP peak data can be either the summit (in which case 'end' - 'start' = 1) or the
-entire extent of the binding region (with 'start' and 'end' indicating the limits).
+====== ========== =================================
+     1 chr        Chromosome name
+     2 start      Start position of the peak
+     3 end        End position of the peak
+====== ========== =================================
 
 -------------
 
 .. class:: infomark
 
-**Output**
+**Outputs**
+
+The key outputs from the tool are two lists comprising the nearest
+peaks for each gene, and the nearest genes for each peak (one
+dataset for each list).
+
+There are two formats for reporting: "compact" and "full":
 
-The outputs from this tool vary depending on the type of ChIP data that is input (i.e
-summits or peaks), however generally there is one tab-delimited results file for each
-analysis described above in the **What it does** section (some analyses output a second
-file with just the "best" hits).
+ * **Compact output** reports all the hits for each peak or gene on
+   a single line of output;
+ * **Full output** reports each peak/gene pair on a separate line
+   (i.e. a multi-line output format).
+
+In "full" output mode, additional options are available:
+
+ * The output files can be "padded" with extra (empty) lines to ensure
+   that there are always the same number of lines for each peak or
+   gene, if fewer than the requested number of hits are found.
+ * "Summary" datasets can also be requested, which include just the
+   nearest peak reported for each gene (and vice versa).
 
-A history item will be generated for each output file, unless the option to put them
-into a single zip archive is selected; this archive file will have to be downloaded
-and unzipped on your local machine. It is recommended that you refer to the
-RnaChipIntegrator documentation for information on the contents of each output file:
-https://github.com/fls-bioinformatics-core/RnaChipIntegrator/blob/master/doc/MANUAL.markdown
+In either mode these data will also be output in a single MS Excel file,
+which contains one sheet per result set.
+
+.. class:: warningmark
+
+Using "compact" output with the number of hits limited to more than 4
+peak/gene pairs (or with no limit at all) can result in a large number
+of columns in the output files, which in some versions of Galaxy will
+not be properly displayed. However the data files themselves should be
+okay.
 
-In addition an Excel spreadsheet (with one page for each analysis performed) is always
-produced.
+-------------
+
+.. class:: infomark
+
+**More information**
+
+It is recommended that you refer to the ``RnaChipIntegrator``
+documentation for information on the contents of each output file:
+
+* http://rnachipintegrator.readthedocs.org/en/latest/
 
 -------------
 
@@ -228,8 +194,9 @@
 This Galaxy tool has been developed within the Bioinformatics Core Facility at the
 University of Manchester. It runs the RnaChipIntegrator package which has also been
 developed by this group, and is documented at
-http://fls-bioinformatics-core.github.com/RnaChipIntegrator/
+https://pypi.python.org/pypi/RnaChipIntegrator/
 
 Please kindly acknowledge the Bioinformatics Core Facility if you use this tool.
   </help>
+  <expand macro="citations" />
 </tool>