rnachipintegrator: rnachipintegrator

comparison rnachipintegrator_wrapper.xml @ 1:5f69a2c1b9c9 draft

Uploaded version 1.0.0.0.

author	pjbriggs
date	Wed, 24 Feb 2016 09:39:14 -0500
parents	d9c1f2133124
children	dc498b03ca9a

comparison

equal deleted inserted replaced

-:d9c1f2133124
+:5f69a2c1b9c9
 <?xml version="1.0" encoding="utf-8"?>
-<tool id="rnachipintegrator_wrapper" name="RnaChipIntegrator" version="@VERSION@-0">
+<tool id="rnachipintegrator_wrapper" name="RnaChipIntegrator" version="@VERSION@.0">
-<description>Integrated analysis of gene expression data and ChIP data</description>
+<description>Integrated analysis of 'gene' and 'peak' data</description>
 <macros>
 <import>rnachipintegrator_macros.xml</import>
 </macros>
 <expand macro="requirements" />
 <expand macro="version_command" />
-<command interpreter="bash">rnachipintegrator_wrapper.sh
+<command interpreter="bash"><![CDATA[
-#if str( $analysis_options.peak_type ) == "summits"
+rnachipintegrator_wrapper.sh
-#if str( $analysis_options.window ) != ""
+#if $peaks_in.metadata.chromCol
---window=$analysis_options.window
+--peak_cols=${peaks_in.metadata.chromCol},${peaks_in.metadata.startCol},${peaks_in.metadata.endCol}
+#end if
+#if str( $cutoff ) != ""
+--cutoff=$cutoff
+#else
+--cutoff=0
+#end if
+#if str( $number ) != ""
+--number=$number
+#end if
+--promoter_region=$promoter_start,$promoter_end
+--edge=$edge
+$diff_expressed_only
+--xlsx_file "$xlsx_out"
+--output_files "$peaks_per_feature_out" "$features_per_peak_out"
+#if $output.compact_format
+--compact
+#else
+#if $output.summary
+--summary_files "$peaks_per_feature_summary" "$features_per_peak_summary"
 #end if
-#if str( $analysis_options.cutoff ) != ""
+${output.pad_output}
---cutoff=$analysis_options.cutoff
+#end if
-#end if
+"$features_in" "$peaks_in"
-#end if
+]]></command>
-#if str( $analysis_options.peak_type ) == "regions"
-#if str( $analysis_options.edge_cutoff ) != ""
---edge-cutoff=$analysis_options.edge_cutoff
-#end if
-#if str( $analysis_options.number ) != ""
---number=$analysis_options.number
-#end if
-#if (str( $analysis_options.promoter_start ) != "" and str( $analysis_options.promoter_end ))
---promoter_region=$analysis_options.promoter_start,$analysis_options.promoter_end
-#end if
-#if $analysis_options.pad_output
---pad
-#end if
-#end if
-$rnaseq $chipseq
---output_xls $xls_output
-#if $results_as_zip
---zip_file $zip_file
-#else
-#if str( $analysis_options.peak_type ) == "summits"
---summit_outputs $peaks_to_transcripts_out $tss_to_summits_out
-#end if
-#if str( $analysis_options.peak_type ) == "regions"
---peak_outputs $transcripts_to_edges_out
-$transcripts_to_edges_summary
-$tss_to_edges_out
-$tss_to_edges_summary
-#end if
-#end if
-</command>
 <inputs>
-<param format="tabular" name="rnaseq" type="data" label="Gene expression data file" />
+<param format="tabular" name="features_in" type="data"
-<param format="tabular" name="chipseq" type="data" label="ChIP peaks data file" />
+	   label="Genes/genomic features" />
-<conditional name="analysis_options">
+<param format="tabular" name="peaks_in" type="data"
-<!-- user must specify if ChIP peaks are summits or regions -->
+	   label="Peaks/regions" />
-<param name="peak_type" type="select" label="ChIP peaks are"
+<expand macro="analysis_options" />
-	   help="Options and outputs depend on whether ChIP data are summits or regions">
+<param name="diff_expressed_only" type="boolean"
-<option value="summits">summits</option>
+	   truevalue="--only-DE" falsevalue="" checked="false"
-<option value="regions">regions</option>
+	   label="Only consider genes which are flagged as differentially
-</param>
+		  expressed"
-<when value="summits">
+help="NB input feature data must include differential expression
-<param name="window" type="integer" value="20000" optional="true"
+		 flags (--only-DE)" />
-	     label="Maximum distance a peak can be from each transcript
+<expand macro="output_options" />
-		    TSS before being omitted from analysis" />
-<param name="cutoff" type="integer" value="130000" optional="true"
-	     label="Maximum distance a transcript TSS can be from each
-		    peak before being omitted from the analysis" />
-</when>
-<when value="regions">
-<param name="edge_cutoff" type="integer" value="10000" optional="true"
-	     label="Maximum distance a transcript edge can be from the
-		    peak edge before being omitted from the analysis"
-	     help="Set to zero to indicate that no cut off should be applied" />
-<param name="number" type="integer" value="4" optional="true"
-	     label="Maximum number of transcripts per peak to report from
-		    from the analysis" />
-<param name="promoter_start" type="integer" value="-10000" optional="true"
-	     label="Start of promoter region with respect to gene TSS" />
-<param name="promoter_end" type="integer" value="2500" optional="true"
-	     label="End of promoter region with respect to gene TSS" />
-<param name="pad_output" type="boolean" checked="false" truevalue="yes"
-	     label="Output same number of lines for each peak"
-	     help="Add blank lines in output for peaks with fewer than maximum number
-		   of hits (--pad)" />
-</when>
-</conditional>
-<param name="results_as_zip" type="boolean" checked="false" truevalue="yes"
-label="Put output tab-delimited files into a single zip archive" />
 </inputs>
 <outputs>
-<!-- Always produce XLS output -->
+<!-- Always produce XLSX output -->
-<data format="xls" name="xls_output"
+<data format="xlsx" name="xlsx_out"
-	  label="All RnaChipIntegrator analyses for ${rnaseq.name} vs ${chipseq.name} (Excel spreadsheet)" />
+	  label="All RnaChipIntegrator analyses: ${features_in.name} vs ${peaks_in.name} (Excel spreadsheet)" />
-<!-- Outputs only produced for summit data -->
+<data format="tabular" name="peaks_per_feature_out"
-<data format="tabular" name="peaks_to_transcripts_out"
+	  label="Nearest peaks to each gene: ${features_in.name} vs ${peaks_in.name}" />
-	  label="Nearest summits to transcripts for ${rnaseq.name} vs ${chipseq.name}" >
+<data format="tabular" name="features_per_peak_out"
-<filter>analysis_options['peak_type'] == "summits"</filter>
+	  label="Nearest genes to each peak: ${features_in.name} vs ${peaks_in.name}" />
-<filter>results_as_zip is False</filter>
+<data format="tabular" name="peaks_per_feature_summary"
+	  label="Nearest peaks to each gene (summary): ${features_in.name} vs ${peaks_in.name}" >
+<filter>output['compact_format'] is False</filter>
+<filter>output['summary'] is True</filter>
 </data>
-<data format="tabular" name="tss_to_summits_out"
+<data format="tabular" name="features_per_peak_summary"
-	  label="Nearest TSS to summits for ${rnaseq.name} vs ${chipseq.name}" >
+	  label="Nearest gene to each peak (summary): ${features_in.name} vs ${peaks_in.name}" >
-<filter>analysis_options['peak_type'] == "summits"</filter>
+<filter>output['compact_format'] is False</filter>
-<filter>results_as_zip is False</filter>
+<filter>output['summary'] is True</filter>
-</data>
-<!-- Outputs only produced for peak data -->
-<data format="tabular" name="transcripts_to_edges_out"
-	  label="Nearest transcripts to peak edges for ${rnaseq.name} vs ${chipseq.name}" >
-<filter>analysis_options['peak_type'] == "regions"</filter>
-<filter>results_as_zip is False</filter>
-</data>
-<data format="tabular" name="transcripts_to_edges_summary"
-	  label="Nearest transcripts to peak edges (summary) for ${rnaseq.name} vs ${chipseq.name}" >
-<filter>analysis_options['peak_type'] == "regions"</filter>
-<filter>results_as_zip is False</filter>
-</data>
-<data format="tabular" name="tss_to_edges_out"
-	  label="Nearest TSS to peak edges for ${rnaseq.name} vs ${chipseq.name}" >
-<filter>analysis_options['peak_type'] == "regions"</filter>
-<filter>results_as_zip is False</filter>
-</data>
-<data format="tabular" name="tss_to_edges_summary"
-	  label="Nearest TSS to peak edges (summary) for ${rnaseq.name} vs ${chipseq.name}" >
-<filter>analysis_options['peak_type'] == "regions"</filter>
-<filter>results_as_zip is False</filter>
-</data>
-<data format="zip" name="zip_file"
-	  label="All tab-delimited files for ${rnaseq.name} vs ${chipseq.name} (zip file)" >
-<filter>results_as_zip is True</filter>
 </data>
 </outputs>
 <tests>
-<test>
+<!--
-<param name="rnaseq" value="ExpressionData.txt" ftype="tabular" />
+	RnaChipIntegrator +name=test +cutoff=130000 +promoter_region=-10000,2500 +xlsx +compact features.txt summits.txt
-<param name="chipseq" value="ChIP_summits.txt" ftype="tabular" />
+-->
-<param name="peak_type" value="summits" />
+<test>
-<param name="window" value="20000" />
+<param name="features_in" value="features.txt" ftype="tabular" />
-<param name="cutoff" value="130000" />
+<param name="peaks_in" value="summits.txt" ftype="tabular" />
-<!--
+<param name="cutoff" value="130000" />
-**NB** outputs have to be specified in order that they appear in the
+<param name="promoter_start" value="-10000" />
-tool (which is the order they will be written to the history) - the
+<param name="promoter_end" value="2500" />
-test framework seems to use the order and ignores the "name" attribute
+<output name="xlsx_out" file="summits.xlsx" compare="sim_size" />
--->
+<output name="peaks_per_feature_out" ftype="tabular"
-<output name="xls_output" file="summits.xls" compare="sim_size" />
+	      file="summits_per_feature.out" />
-<output name="peaks_to_transcripts_out" file="peaks_to_transcripts.out" ftype="tabular" />
+<output name="features_per_peak_out" ftype="tabular"
-<output name="tss_to_summits_out" file="tss_to_summits.out" ftype="tabular" />
+	      file="features_per_summit.out" />
 </test>
-<test>
+<!--
-<param name="rnaseq" value="ExpressionData.txt" ftype="tabular" />
+	RnaChipIntegrator +name=test +cutoff=130000 +promoter_region=-10000,2500 +xlsx +compact features.txt peaks.txt
-<param name="chipseq" value="ChIP_peaks.txt" ftype="tabular" />
+-->
-<param name="peak_type" value="regions" />
+<test>
-<param name="edge_cutoff" value="130000" />
+<param name="features_in" value="features.txt" ftype="tabular" />
-<!--
+<param name="peaks_in" value="peaks.txt" ftype="tabular" />
-**NB** outputs have to be specified in order that they appear in the
+<param name="cutoff" value="130000" />
-tool (which is the order they will be written to the history) - the
+<param name="promoter_start" value="-10000" />
-test framework seems to use the order and ignores the "name" attribute
+<param name="promoter_end" value="2500" />
--->
+<output name="xlsx_out" file="peaks1.xlsx" compare="sim_size" />
-<output name="xls_output" file="peaks.xls" compare="sim_size" />
+<output name="peaks_per_feature_out" ftype="tabular"
-<output name="transcripts_to_edges_out" file="transcripts_to_edges.out" ftype="tabular" />
+	      file="peaks_per_feature1.out" />
-<output name="transcripts_to_edges_summary" file="transcripts_to_edges.summary" ftype="tabular" />
+<output name="features_per_peak_out" ftype="tabular"
-<output name="tss_to_edges_out" file="tss_to_edges.out" ftype="tabular" />
+	      file="features_per_peak1.out" />
-<output name="tss_to_edges_summary" file="tss_to_edges.summary" ftype="tabular" />
+</test>
+<!--
+	RnaChipIntegrator +name=test +cutoff=130000 +xlsx features.txt peaks.txt
+-->
+<test>
+<param name="features_in" value="features.txt" ftype="tabular" />
+<param name="peaks_in" value="peaks.txt" ftype="tabular" />
+<param name="cutoff" value="130000" />
+<param name="compact_format" value="false" />
+<output name="xlsx_out" file="peaks2.xlsx" compare="sim_size" />
+<output name="peaks_per_feature_out" ftype="tabular"
+	      file="peaks_per_feature2.out" />
+<output name="features_per_peak_out" ftype="tabular"
+	      file="features_per_peak2.out" />
+</test>
+<!--
+	RnaChipIntegrator +name=test +cutoff=130000 +only-DE +xlsx +compact features.txt peaks.txt
+-->
+<test>
+<param name="features_in" value="features.txt" ftype="tabular" />
+<param name="peaks_in" value="peaks.txt" ftype="tabular" />
+<param name="cutoff" value="130000" />
+<param name="diff_expressed_only" value="true" />
+<output name="xlsx_out" file="peaks3.xlsx" compare="sim_size" />
+<output name="peaks_per_feature_out" ftype="tabular"
+	      file="peaks_per_feature3.out" />
+<output name="features_per_peak_out" ftype="tabular"
+	      file="features_per_peak3.out" />
+</test>
+<!--
+	RnaChipIntegrator +name=test +cutoff=130000 +promoter_region=-10000,2500 +xlsx +summary features.txt peaks.txt
+-->
+<test>
+<param name="features_in" value="features.txt" ftype="tabular" />
+<param name="peaks_in" value="peaks.txt" ftype="tabular" />
+<param name="cutoff" value="130000" />
+<param name="compact_format" value="false" />
+<param name="summary" value="true" />
+<param name="pad_output" value="true" />
+<output name="xlsx_out" file="peaks4.xlsx" compare="sim_size" />
+<output name="peaks_per_feature_out" ftype="tabular"
+	      file="peaks_per_feature4.out" />
+<output name="features_per_peak_out" ftype="tabular"
+	      file="features_per_peak4.out" />
+<output name="peaks_per_feature_summary" ftype="tabular"
+	      file="peaks_per_feature4.summary" />
+<output name="features_per_peak_summary" ftype="tabular"
+	      file="features_per_peak4.summary" />
 </test>
 </tests>
 <help>
 .. class:: infomark
 **What it does**
-Run RnaChipIntegrator to perform integrated analyses of gene expression
+Performs integrated analyses of genes (or other genomic feature data)
-and ChIP data, identifying the nearest ChIP peaks to each transcript
+against a set of peaks (e.g. ChIP data), identifying the nearest peaks to
-and vice versa.
+each feature and vice versa.
-For ChIP peaks defined as regions the following analyses are performed:
+The program was originally written specifically for ChIP-Seq and RNA-Seq
+data but works equally well for ChIP-chip and microarray expression data,
-* **TranscriptsToPeakEdges**: reports the nearest transcripts with the smallest
+and can also be used to integrate any set of genomic features (e.g.
-distance from either their TSS or TES to the nearest peak edge.
+canonical genes, CpG islands) with expression data.
-* **TSSToPeakEdges**: reports the nearest transcripts with the smallest distance
+RnaChipIntegrator can be obtained from
-from their TSS to the nearest peak edge.
+https://pypi.python.org/pypi/RnaChipIntegrator/
-For ChIP peaks defined as summits:
-* **TSSToSummits**: reports the nearest transcripts with the smallest distance
-from the TSS to the nearest peak summit.
-* **PeaksToTranscripts**: reports the nearest peak summits with the smallest
-distance to either the TSS or TES of each transcript.
-The program was originally written specifically for ChIP-Seq and RNA-Seq data
-but works equally well for ChIP-chip and microarray expression data, and can
-also be used to integrate any set of genomic features (e.g. canonical genes,
-CpG islands) with expression data.
-RnaChipIntgerator can be obtained from
-http://fls-bioinformatics-core.github.com/RnaChipIntegrator/
 -------------
 .. class:: infomark
 **Input**
-The expression data must be in a tab-delimited file with the following columns
+The gene data must be in a tabular file with the following columns
-of data for each genomic feature (one feature per line):
+of data for each gene or genomic feature (one gene per line):
 ====== ========== ======================================================================
 Column Name       Description
 ====== ========== ======================================================================
-1 ID         Name used to identify the feature in the output
+1 ID         Name used to identify the gene in the output
 2 chr        Chromosome name
-3 start      Start position of the feature
+3 start      Start position of the gene
-4 end        End position of the feature
+4 end        End position of the gene
 5 strand     Must be either '+' or '-'
-6 diff_expr  Optional: indicates feature is differentially expressed (1) or not (0)
+6 diff_expr  Optional: indicates gene is differentially expressed (1) or not (0)
 ====== ========== ======================================================================
-The ChIP-seq data must be in a tab-delimited file with 3 columns of data for each
+The peak data must be in a tabular file with at least 3 columns of data
-ChIP peak (one per line):
+for each peak (one peak per line):
-====== ========== ======================================================================
+====== ========== =================================
 Column Name       Description
-====== ========== ======================================================================
+====== ========== =================================
-1 chr        Chromosome name (must match one of those in expression data file)
+1 chr        Chromosome name
 2 start      Start position of the peak
-3 end        End position of the peak (start + 1 for summit data)
+3 end        End position of the peak
-====== ========== ======================================================================
+====== ========== =================================
-The ChIP peak data can be either the summit (in which case 'end' - 'start' = 1) or the
+If peak data is in ``bed`` format then the tool will automatically
-entire extent of the binding region (with 'start' and 'end' indicating the limits).
+assign the correct columns, otherwise the first three columns of data
+will be used.
--------------
+-------------
-.. class:: infomark
+.. class:: infomark
-**Output**
+**Outputs**
-The outputs from this tool vary depending on the type of data that is input, however
-generally there is one tab-delimited results file for each analysis described above
+The key outputs from the tool are two lists comprising the nearest
-in the **What it does** section (some analyses output a second file with just the
+peaks for each gene, and the nearest gene for each peak (one dataset
-"best" hits).
+for each list).
-A history item will be generated for each output file, unless the option to put them
+There are two formats for reporting: "compact" and "full":
-into a single zip archive is selected; this archive file will have to be downloaded
-and unzipped on your local machine. It is recommended that you refer to the
+* **Compact output** reports all the hits for each peak or gene on
-RnaChipIntegrator documentation for information on the contents of each output file:
+a single line of output;
-https://github.com/fls-bioinformatics-core/RnaChipIntegrator/blob/master/doc/MANUAL.markdown
+* **Full output** reports each peak/gene pair on a separate line
+(i.e. a multi-line output format).
-In addition an Excel spreadsheet (with one page for each analysis performed) is always
-produced.
+In "full" output mode, additional options are available:
+* The output files can be "padded" with extra (empty) lines to ensure
+that there are always the same number of lines for each peak or
+gene, if fewer than the requested number of hits are found.
+* "Summary" datasets can also be requested, which include just the
+nearest peak reported for each gene (and vice versa).
+In either mode these data will also be output in a single MS Excel file,
+which contains one sheet per result set.
+.. class:: warning
+Using "compact" output with the number of hits limited to more than 4
+peak/gene pairs (or with no limit at all) can result in a large number
+of columns in the output files, which in some versions of Galaxy will
+not be properly displayed. However the data files themselves should be
+okay.
+-------------
+.. class:: infomark
+**More information**
+It is recommended that you refer to the ``RnaChipIntegrator``
+documentation for information on the contents of each output file:
+* http://rnachipintegrator.readthedocs.org/en/latest/
 -------------
 .. class:: infomark
 developed by this group, and is documented at
 http://fls-bioinformatics-core.github.com/RnaChipIntegrator/
 Please kindly acknowledge the Bioinformatics Core Facility if you use this tool.
 </help>
+<expand macro="citations" />
 </tool>

Mercurial > repos > pjbriggs > rnachipintegrator

comparison rnachipintegrator_wrapper.xml @ 1:5f69a2c1b9c9 draft