comparison rnachipintegrator_wrapper.xml @ 1:5f69a2c1b9c9 draft

Uploaded version 1.0.0.0.
author pjbriggs
date Wed, 24 Feb 2016 09:39:14 -0500
parents d9c1f2133124
children dc498b03ca9a
comparison
equal deleted inserted replaced
0:d9c1f2133124 1:5f69a2c1b9c9
1 <?xml version="1.0" encoding="utf-8"?> 1 <?xml version="1.0" encoding="utf-8"?>
2 <tool id="rnachipintegrator_wrapper" name="RnaChipIntegrator" version="@VERSION@-0"> 2 <tool id="rnachipintegrator_wrapper" name="RnaChipIntegrator" version="@VERSION@.0">
3 <description>Integrated analysis of gene expression data and ChIP data</description> 3 <description>Integrated analysis of 'gene' and 'peak' data</description>
4 <macros> 4 <macros>
5 <import>rnachipintegrator_macros.xml</import> 5 <import>rnachipintegrator_macros.xml</import>
6 </macros> 6 </macros>
7 <expand macro="requirements" /> 7 <expand macro="requirements" />
8 <expand macro="version_command" /> 8 <expand macro="version_command" />
9 <command interpreter="bash">rnachipintegrator_wrapper.sh 9 <command interpreter="bash"><![CDATA[
10 #if str( $analysis_options.peak_type ) == "summits" 10 rnachipintegrator_wrapper.sh
11 #if str( $analysis_options.window ) != "" 11 #if $peaks_in.metadata.chromCol
12 --window=$analysis_options.window 12 --peak_cols=${peaks_in.metadata.chromCol},${peaks_in.metadata.startCol},${peaks_in.metadata.endCol}
13 #end if
14 #if str( $cutoff ) != ""
15 --cutoff=$cutoff
16 #else
17 --cutoff=0
18 #end if
19 #if str( $number ) != ""
20 --number=$number
21 #end if
22 --promoter_region=$promoter_start,$promoter_end
23 --edge=$edge
24 $diff_expressed_only
25 --xlsx_file "$xlsx_out"
26 --output_files "$peaks_per_feature_out" "$features_per_peak_out"
27 #if $output.compact_format
28 --compact
29 #else
30 #if $output.summary
31 --summary_files "$peaks_per_feature_summary" "$features_per_peak_summary"
13 #end if 32 #end if
14 #if str( $analysis_options.cutoff ) != "" 33 ${output.pad_output}
15 --cutoff=$analysis_options.cutoff 34 #end if
16 #end if 35 "$features_in" "$peaks_in"
17 #end if 36 ]]></command>
18 #if str( $analysis_options.peak_type ) == "regions"
19 #if str( $analysis_options.edge_cutoff ) != ""
20 --edge-cutoff=$analysis_options.edge_cutoff
21 #end if
22 #if str( $analysis_options.number ) != ""
23 --number=$analysis_options.number
24 #end if
25 #if (str( $analysis_options.promoter_start ) != "" and str( $analysis_options.promoter_end ))
26 --promoter_region=$analysis_options.promoter_start,$analysis_options.promoter_end
27 #end if
28 #if $analysis_options.pad_output
29 --pad
30 #end if
31 #end if
32 $rnaseq $chipseq
33 --output_xls $xls_output
34 #if $results_as_zip
35 --zip_file $zip_file
36 #else
37 #if str( $analysis_options.peak_type ) == "summits"
38 --summit_outputs $peaks_to_transcripts_out $tss_to_summits_out
39 #end if
40 #if str( $analysis_options.peak_type ) == "regions"
41 --peak_outputs $transcripts_to_edges_out
42 $transcripts_to_edges_summary
43 $tss_to_edges_out
44 $tss_to_edges_summary
45 #end if
46 #end if
47 </command>
48 <inputs> 37 <inputs>
49 <param format="tabular" name="rnaseq" type="data" label="Gene expression data file" /> 38 <param format="tabular" name="features_in" type="data"
50 <param format="tabular" name="chipseq" type="data" label="ChIP peaks data file" /> 39 label="Genes/genomic features" />
51 <conditional name="analysis_options"> 40 <param format="tabular" name="peaks_in" type="data"
52 <!-- user must specify if ChIP peaks are summits or regions --> 41 label="Peaks/regions" />
53 <param name="peak_type" type="select" label="ChIP peaks are" 42 <expand macro="analysis_options" />
54 help="Options and outputs depend on whether ChIP data are summits or regions"> 43 <param name="diff_expressed_only" type="boolean"
55 <option value="summits">summits</option> 44 truevalue="--only-DE" falsevalue="" checked="false"
56 <option value="regions">regions</option> 45 label="Only consider genes which are flagged as differentially
57 </param> 46 expressed"
58 <when value="summits"> 47 help="NB input feature data must include differential expression
59 <param name="window" type="integer" value="20000" optional="true" 48 flags (--only-DE)" />
60 label="Maximum distance a peak can be from each transcript 49 <expand macro="output_options" />
61 TSS before being omitted from analysis" />
62 <param name="cutoff" type="integer" value="130000" optional="true"
63 label="Maximum distance a transcript TSS can be from each
64 peak before being omitted from the analysis" />
65 </when>
66 <when value="regions">
67 <param name="edge_cutoff" type="integer" value="10000" optional="true"
68 label="Maximum distance a transcript edge can be from the
69 peak edge before being omitted from the analysis"
70 help="Set to zero to indicate that no cut off should be applied" />
71 <param name="number" type="integer" value="4" optional="true"
72 label="Maximum number of transcripts per peak to report from
73 from the analysis" />
74 <param name="promoter_start" type="integer" value="-10000" optional="true"
75 label="Start of promoter region with respect to gene TSS" />
76 <param name="promoter_end" type="integer" value="2500" optional="true"
77 label="End of promoter region with respect to gene TSS" />
78 <param name="pad_output" type="boolean" checked="false" truevalue="yes"
79 label="Output same number of lines for each peak"
80 help="Add blank lines in output for peaks with fewer than maximum number
81 of hits (--pad)" />
82 </when>
83 </conditional>
84 <param name="results_as_zip" type="boolean" checked="false" truevalue="yes"
85 label="Put output tab-delimited files into a single zip archive" />
86 </inputs> 50 </inputs>
87 <outputs> 51 <outputs>
88 <!-- Always produce XLS output --> 52 <!-- Always produce XLSX output -->
89 <data format="xls" name="xls_output" 53 <data format="xlsx" name="xlsx_out"
90 label="All RnaChipIntegrator analyses for ${rnaseq.name} vs ${chipseq.name} (Excel spreadsheet)" /> 54 label="All RnaChipIntegrator analyses: ${features_in.name} vs ${peaks_in.name} (Excel spreadsheet)" />
91 <!-- Outputs only produced for summit data --> 55 <data format="tabular" name="peaks_per_feature_out"
92 <data format="tabular" name="peaks_to_transcripts_out" 56 label="Nearest peaks to each gene: ${features_in.name} vs ${peaks_in.name}" />
93 label="Nearest summits to transcripts for ${rnaseq.name} vs ${chipseq.name}" > 57 <data format="tabular" name="features_per_peak_out"
94 <filter>analysis_options['peak_type'] == "summits"</filter> 58 label="Nearest genes to each peak: ${features_in.name} vs ${peaks_in.name}" />
95 <filter>results_as_zip is False</filter> 59 <data format="tabular" name="peaks_per_feature_summary"
60 label="Nearest peaks to each gene (summary): ${features_in.name} vs ${peaks_in.name}" >
61 <filter>output['compact_format'] is False</filter>
62 <filter>output['summary'] is True</filter>
96 </data> 63 </data>
97 <data format="tabular" name="tss_to_summits_out" 64 <data format="tabular" name="features_per_peak_summary"
98 label="Nearest TSS to summits for ${rnaseq.name} vs ${chipseq.name}" > 65 label="Nearest gene to each peak (summary): ${features_in.name} vs ${peaks_in.name}" >
99 <filter>analysis_options['peak_type'] == "summits"</filter> 66 <filter>output['compact_format'] is False</filter>
100 <filter>results_as_zip is False</filter> 67 <filter>output['summary'] is True</filter>
101 </data>
102 <!-- Outputs only produced for peak data -->
103 <data format="tabular" name="transcripts_to_edges_out"
104 label="Nearest transcripts to peak edges for ${rnaseq.name} vs ${chipseq.name}" >
105 <filter>analysis_options['peak_type'] == "regions"</filter>
106 <filter>results_as_zip is False</filter>
107 </data>
108 <data format="tabular" name="transcripts_to_edges_summary"
109 label="Nearest transcripts to peak edges (summary) for ${rnaseq.name} vs ${chipseq.name}" >
110 <filter>analysis_options['peak_type'] == "regions"</filter>
111 <filter>results_as_zip is False</filter>
112 </data>
113 <data format="tabular" name="tss_to_edges_out"
114 label="Nearest TSS to peak edges for ${rnaseq.name} vs ${chipseq.name}" >
115 <filter>analysis_options['peak_type'] == "regions"</filter>
116 <filter>results_as_zip is False</filter>
117 </data>
118 <data format="tabular" name="tss_to_edges_summary"
119 label="Nearest TSS to peak edges (summary) for ${rnaseq.name} vs ${chipseq.name}" >
120 <filter>analysis_options['peak_type'] == "regions"</filter>
121 <filter>results_as_zip is False</filter>
122 </data>
123 <data format="zip" name="zip_file"
124 label="All tab-delimited files for ${rnaseq.name} vs ${chipseq.name} (zip file)" >
125 <filter>results_as_zip is True</filter>
126 </data> 68 </data>
127 </outputs> 69 </outputs>
128 <tests> 70 <tests>
129 <test> 71 <!--
130 <param name="rnaseq" value="ExpressionData.txt" ftype="tabular" /> 72 RnaChipIntegrator +name=test +cutoff=130000 +promoter_region=-10000,2500 +xlsx +compact features.txt summits.txt
131 <param name="chipseq" value="ChIP_summits.txt" ftype="tabular" /> 73 -->
132 <param name="peak_type" value="summits" /> 74 <test>
133 <param name="window" value="20000" /> 75 <param name="features_in" value="features.txt" ftype="tabular" />
134 <param name="cutoff" value="130000" /> 76 <param name="peaks_in" value="summits.txt" ftype="tabular" />
135 <!-- 77 <param name="cutoff" value="130000" />
136 **NB** outputs have to be specified in order that they appear in the 78 <param name="promoter_start" value="-10000" />
137 tool (which is the order they will be written to the history) - the 79 <param name="promoter_end" value="2500" />
138 test framework seems to use the order and ignores the "name" attribute 80 <output name="xlsx_out" file="summits.xlsx" compare="sim_size" />
139 --> 81 <output name="peaks_per_feature_out" ftype="tabular"
140 <output name="xls_output" file="summits.xls" compare="sim_size" /> 82 file="summits_per_feature.out" />
141 <output name="peaks_to_transcripts_out" file="peaks_to_transcripts.out" ftype="tabular" /> 83 <output name="features_per_peak_out" ftype="tabular"
142 <output name="tss_to_summits_out" file="tss_to_summits.out" ftype="tabular" /> 84 file="features_per_summit.out" />
143 </test> 85 </test>
144 <test> 86 <!--
145 <param name="rnaseq" value="ExpressionData.txt" ftype="tabular" /> 87 RnaChipIntegrator +name=test +cutoff=130000 +promoter_region=-10000,2500 +xlsx +compact features.txt peaks.txt
146 <param name="chipseq" value="ChIP_peaks.txt" ftype="tabular" /> 88 -->
147 <param name="peak_type" value="regions" /> 89 <test>
148 <param name="edge_cutoff" value="130000" /> 90 <param name="features_in" value="features.txt" ftype="tabular" />
149 <!-- 91 <param name="peaks_in" value="peaks.txt" ftype="tabular" />
150 **NB** outputs have to be specified in order that they appear in the 92 <param name="cutoff" value="130000" />
151 tool (which is the order they will be written to the history) - the 93 <param name="promoter_start" value="-10000" />
152 test framework seems to use the order and ignores the "name" attribute 94 <param name="promoter_end" value="2500" />
153 --> 95 <output name="xlsx_out" file="peaks1.xlsx" compare="sim_size" />
154 <output name="xls_output" file="peaks.xls" compare="sim_size" /> 96 <output name="peaks_per_feature_out" ftype="tabular"
155 <output name="transcripts_to_edges_out" file="transcripts_to_edges.out" ftype="tabular" /> 97 file="peaks_per_feature1.out" />
156 <output name="transcripts_to_edges_summary" file="transcripts_to_edges.summary" ftype="tabular" /> 98 <output name="features_per_peak_out" ftype="tabular"
157 <output name="tss_to_edges_out" file="tss_to_edges.out" ftype="tabular" /> 99 file="features_per_peak1.out" />
158 <output name="tss_to_edges_summary" file="tss_to_edges.summary" ftype="tabular" /> 100 </test>
101 <!--
102 RnaChipIntegrator +name=test +cutoff=130000 +xlsx features.txt peaks.txt
103 -->
104 <test>
105 <param name="features_in" value="features.txt" ftype="tabular" />
106 <param name="peaks_in" value="peaks.txt" ftype="tabular" />
107 <param name="cutoff" value="130000" />
108 <param name="compact_format" value="false" />
109 <output name="xlsx_out" file="peaks2.xlsx" compare="sim_size" />
110 <output name="peaks_per_feature_out" ftype="tabular"
111 file="peaks_per_feature2.out" />
112 <output name="features_per_peak_out" ftype="tabular"
113 file="features_per_peak2.out" />
114 </test>
115 <!--
116 RnaChipIntegrator +name=test +cutoff=130000 +only-DE +xlsx +compact features.txt peaks.txt
117 -->
118 <test>
119 <param name="features_in" value="features.txt" ftype="tabular" />
120 <param name="peaks_in" value="peaks.txt" ftype="tabular" />
121 <param name="cutoff" value="130000" />
122 <param name="diff_expressed_only" value="true" />
123 <output name="xlsx_out" file="peaks3.xlsx" compare="sim_size" />
124 <output name="peaks_per_feature_out" ftype="tabular"
125 file="peaks_per_feature3.out" />
126 <output name="features_per_peak_out" ftype="tabular"
127 file="features_per_peak3.out" />
128 </test>
129 <!--
130 RnaChipIntegrator +name=test +cutoff=130000 +promoter_region=-10000,2500 +xlsx +summary features.txt peaks.txt
131 -->
132 <test>
133 <param name="features_in" value="features.txt" ftype="tabular" />
134 <param name="peaks_in" value="peaks.txt" ftype="tabular" />
135 <param name="cutoff" value="130000" />
136 <param name="compact_format" value="false" />
137 <param name="summary" value="true" />
138 <param name="pad_output" value="true" />
139 <output name="xlsx_out" file="peaks4.xlsx" compare="sim_size" />
140 <output name="peaks_per_feature_out" ftype="tabular"
141 file="peaks_per_feature4.out" />
142 <output name="features_per_peak_out" ftype="tabular"
143 file="features_per_peak4.out" />
144 <output name="peaks_per_feature_summary" ftype="tabular"
145 file="peaks_per_feature4.summary" />
146 <output name="features_per_peak_summary" ftype="tabular"
147 file="features_per_peak4.summary" />
159 </test> 148 </test>
160 </tests> 149 </tests>
161 <help> 150 <help>
162 151
163 .. class:: infomark 152 .. class:: infomark
164 153
165 **What it does** 154 **What it does**
166 155
167 Run RnaChipIntegrator to perform integrated analyses of gene expression 156 Performs integrated analyses of genes (or other genomic feature data)
168 and ChIP data, identifying the nearest ChIP peaks to each transcript 157 against a set of peaks (e.g. ChIP data), identifying the nearest peaks to
169 and vice versa. 158 each feature and vice versa.
170 159
171 For ChIP peaks defined as regions the following analyses are performed: 160 The program was originally written specifically for ChIP-Seq and RNA-Seq
172 161 data but works equally well for ChIP-chip and microarray expression data,
173 * **TranscriptsToPeakEdges**: reports the nearest transcripts with the smallest 162 and can also be used to integrate any set of genomic features (e.g.
174 distance from either their TSS or TES to the nearest peak edge. 163 canonical genes, CpG islands) with expression data.
175 164
176 * **TSSToPeakEdges**: reports the nearest transcripts with the smallest distance 165 RnaChipIntegrator can be obtained from
177 from their TSS to the nearest peak edge. 166 https://pypi.python.org/pypi/RnaChipIntegrator/
178
179 For ChIP peaks defined as summits:
180
181 * **TSSToSummits**: reports the nearest transcripts with the smallest distance
182 from the TSS to the nearest peak summit.
183
184 * **PeaksToTranscripts**: reports the nearest peak summits with the smallest
185 distance to either the TSS or TES of each transcript.
186
187 The program was originally written specifically for ChIP-Seq and RNA-Seq data
188 but works equally well for ChIP-chip and microarray expression data, and can
189 also be used to integrate any set of genomic features (e.g. canonical genes,
190 CpG islands) with expression data.
191
192 RnaChipIntgerator can be obtained from
193 http://fls-bioinformatics-core.github.com/RnaChipIntegrator/
194 167
195 ------------- 168 -------------
196 169
197 .. class:: infomark 170 .. class:: infomark
198 171
199 **Input** 172 **Input**
200 173
201 The expression data must be in a tab-delimited file with the following columns 174 The gene data must be in a tabular file with the following columns
202 of data for each genomic feature (one feature per line): 175 of data for each gene or genomic feature (one gene per line):
203 176
204 ====== ========== ====================================================================== 177 ====== ========== ======================================================================
205 Column Name Description 178 Column Name Description
206 ====== ========== ====================================================================== 179 ====== ========== ======================================================================
207 1 ID Name used to identify the feature in the output 180 1 ID Name used to identify the gene in the output
208 2 chr Chromosome name 181 2 chr Chromosome name
209 3 start Start position of the feature 182 3 start Start position of the gene
210 4 end End position of the feature 183 4 end End position of the gene
211 5 strand Must be either '+' or '-' 184 5 strand Must be either '+' or '-'
212 6 diff_expr Optional: indicates feature is differentially expressed (1) or not (0) 185 6 diff_expr Optional: indicates gene is differentially expressed (1) or not (0)
213 ====== ========== ====================================================================== 186 ====== ========== ======================================================================
214 187
215 The ChIP-seq data must be in a tab-delimited file with 3 columns of data for each 188 The peak data must be in a tabular file with at least 3 columns of data
216 ChIP peak (one per line): 189 for each peak (one peak per line):
217 190
218 ====== ========== ====================================================================== 191 ====== ========== =================================
219 Column Name Description 192 Column Name Description
220 ====== ========== ====================================================================== 193 ====== ========== =================================
221 1 chr Chromosome name (must match one of those in expression data file) 194 1 chr Chromosome name
222 2 start Start position of the peak 195 2 start Start position of the peak
223 3 end End position of the peak (start + 1 for summit data) 196 3 end End position of the peak
224 ====== ========== ====================================================================== 197 ====== ========== =================================
225 198
226 The ChIP peak data can be either the summit (in which case 'end' - 'start' = 1) or the 199 If peak data is in ``bed`` format then the tool will automatically
227 entire extent of the binding region (with 'start' and 'end' indicating the limits). 200 assign the correct columns, otherwise the first three columns of data
228 201 will be used.
229 ------------- 202
230 203 -------------
231 .. class:: infomark 204
232 205 .. class:: infomark
233 **Output** 206
234 207 **Outputs**
235 The outputs from this tool vary depending on the type of data that is input, however 208
236 generally there is one tab-delimited results file for each analysis described above 209 The key outputs from the tool are two lists comprising the nearest
237 in the **What it does** section (some analyses output a second file with just the 210 peaks for each gene, and the nearest gene for each peak (one dataset
238 "best" hits). 211 for each list).
239 212
240 A history item will be generated for each output file, unless the option to put them 213 There are two formats for reporting: "compact" and "full":
241 into a single zip archive is selected; this archive file will have to be downloaded 214
242 and unzipped on your local machine. It is recommended that you refer to the 215 * **Compact output** reports all the hits for each peak or gene on
243 RnaChipIntegrator documentation for information on the contents of each output file: 216 a single line of output;
244 https://github.com/fls-bioinformatics-core/RnaChipIntegrator/blob/master/doc/MANUAL.markdown 217 * **Full output** reports each peak/gene pair on a separate line
245 218 (i.e. a multi-line output format).
246 In addition an Excel spreadsheet (with one page for each analysis performed) is always 219
247 produced. 220 In "full" output mode, additional options are available:
221
222 * The output files can be "padded" with extra (empty) lines to ensure
223 that there are always the same number of lines for each peak or
224 gene, if fewer than the requested number of hits are found.
225 * "Summary" datasets can also be requested, which include just the
226 nearest peak reported for each gene (and vice versa).
227
228 In either mode these data will also be output in a single MS Excel file,
229 which contains one sheet per result set.
230
231 .. class:: warning
232
233 Using "compact" output with the number of hits limited to more than 4
234 peak/gene pairs (or with no limit at all) can result in a large number
235 of columns in the output files, which in some versions of Galaxy will
236 not be properly displayed. However the data files themselves should be
237 okay.
238
239 -------------
240
241 .. class:: infomark
242
243 **More information**
244
245 It is recommended that you refer to the ``RnaChipIntegrator``
246 documentation for information on the contents of each output file:
247
248 * http://rnachipintegrator.readthedocs.org/en/latest/
248 249
249 ------------- 250 -------------
250 251
251 .. class:: infomark 252 .. class:: infomark
252 253
257 developed by this group, and is documented at 258 developed by this group, and is documented at
258 http://fls-bioinformatics-core.github.com/RnaChipIntegrator/ 259 http://fls-bioinformatics-core.github.com/RnaChipIntegrator/
259 260
260 Please kindly acknowledge the Bioinformatics Core Facility if you use this tool. 261 Please kindly acknowledge the Bioinformatics Core Facility if you use this tool.
261 </help> 262 </help>
263 <expand macro="citations" />
262 </tool> 264 </tool>