comparison qualimap_counts.xml @ 0:e020be4f281b draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/qualimap commit b4d43001cc0caa14d760c347fa1c416929f769b2"
author iuc
date Thu, 10 Oct 2019 17:40:46 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e020be4f281b
1 <tool id="qualimap_counts" name="QualiMap Counts QC" version="@VERSION@">
2 <macros>
3 <import>qualimap_macros.xml</import>
4 <xml name="gene_info">
5 <conditional name="gene_info">
6 <param name="source" type="select"
7 label="Additional information about genes">
8 <option value="">None</option>
9 <option value="builtin">Built-in gene information for supported species</option>
10 <option value="custom">Custom gene information</option>
11 </param>
12 <when value="" />
13 <when value="builtin">
14 <param argument="-s" name="species" type="select"
15 label="Select species">
16 <option value="HUMAN">Human</option>
17 <option value="MOUSE">Mouse</option>
18 </param>
19 </when>
20 <when value="custom">
21 <param argument="-i" name="info" type="data" format="tsv"
22 label="Qualimap compatible gene annotation data" />
23 </when>
24 </conditional>
25 </xml>
26 </macros>
27 <expand macro="requirements" />
28 <expand macro="version_command" />
29 <command detect_errors="exit_code"><![CDATA[
30 @SET_JAVA_OPTS@ &&
31
32 ## Inconveniently, metadata.column_names is a long string of
33 ## comma-separated, sanitized column name representations.
34 ## The following turns this string into a list of the actual names.
35 #set $sample_names = [
36 s.strip().replace(' ', '_')
37 for s in $input.metadata.column_names
38 .replace('__ob__','')
39 .replace('u__sq__','')
40 .replace('__sq__','')
41 .replace('__cb__','')
42 .split(',')
43 ][1:]
44 #if str($analysis.groups).strip():
45 #set $groups = [
46 v.strip().replace(' ', '_')
47 for v in str($analysis.groups).split(',')
48 ]
49 #else:
50 #set $groups = ['.'] * (int($input.metadata.columns) - 1)
51 #end if
52 #for $col, $sample, $group in zip(range(2, int($input.metadata.columns) + 1), $sample_names, $groups):
53 printf '%s\t%s\tcounts_data\t%d\n' '${sample}' '${group}' ${col} >> data_spec.txt &&
54 #end for
55
56 ln -s '$input' counts_data &&
57
58 qualimap counts
59 --data data_spec.txt
60 #if str($analysis.mode) == 'comparison':
61 --compare
62 #end if
63 #if str($analysis.mode) != 'multi_sample':
64 #if str($analysis.gene_info.source) == 'builtin':
65 --species ${analysis.gene_info.species}
66 #elif str($analysis.gene_info.source) == 'custom':
67 --info '${analysis.gene_info.info}'
68 #end if
69 #end if
70 --threshold $threshold
71 -outdir results -outformat html &&
72
73 #if str($analysis.mode) == 'comparison':
74 #set $report_name = 'ComparisonReport'
75 #elif str($analysis.mode) == 'multi_sample':
76 #set $report_name = 'GlobalReport'
77 #else:
78 #set $sample = $sample_names[int($analysis.sample_number) - 1]
79 ## Qualimap replaces '-' in sample names with '_' for
80 ## determining file names
81 #set $report_name = str($sample).replace('-', '_') + 'Report'
82 #end if
83 #set $summary_report = None
84 @MASSAGE_OUTPUT@
85 ]]></command>
86
87 <inputs>
88 <param name="input" type="data" format="tsv" label="Input counts data" />
89 <conditional name="analysis">
90 <param name="mode" type="select" label="Type of analysis to perform">
91 <option value="multi_sample">Report overview stats for all samples</option>
92 <option value="single_sample">Report feature count stats of a single sample</option>
93 <option value="comparison">Compare two groups of samples</option>
94 </param>
95 <when value="multi_sample">
96 <param name="groups" type="hidden" value="" />
97 </when>
98 <when value="single_sample">
99 <param name="groups" type="hidden" value="" />
100 <param name="sample_number" type="integer" value="1" min="1"
101 label="Which sample in the input would you like to analyze?" />
102 <expand macro="gene_info" />
103 </when>
104 <when value="comparison">
105 <param name="groups" type="text"
106 label="Assign each sample to one of two groups"
107 help="The 'Comparison' mode currently works for exactly two groups of samples. Enter comma-separated group names - one for each sample. The group names will be assigned to the samples in the order of the sample columns of the counts data.">
108 <validator type="expression" message="Please specify a comma-separated list of group names. No group name in the list may be blank.">all(v.strip() for v in value.split(','))</validator>
109 <validator type="expression" message="Please specify a comma-separated list of group names. Only two different group names are allowed.">len(set(v.strip() for v in value.split(','))) == 2</validator>
110 </param>
111 <expand macro="gene_info" />
112 </when>
113 </conditional>
114 <param argument="-k" name="threshold" type="integer" value="5" min="1"
115 label="Counts threshold" />
116 </inputs>
117 <outputs>
118 <data name="output_html" format="html" />
119 </outputs>
120 <tests>
121 <test>
122 <param name="input" value="mouse_counts_ensemble_1000_6.tsv" ftype="tsv" />
123 <conditional name="analysis">
124 <param name="mode" value="single_sample" />
125 <param name="sample_number" value="3" />
126 </conditional>
127 <assert_command>
128 <has_text text="results/_GlcN03Report.html" />
129 </assert_command>
130 <output name="output_html">
131 <assert_contents>
132 <has_text text="Qualimap Report: Counts QC" />
133 <not_has_text text="#Counts Distribution" />
134 <not_has_text text="#Features With Low Counts" />
135 <not_has_text text="#Bio Detection" />
136 <not_has_text text="#Counts Per Biotype" />
137 <not_has_text text="#Length Bias" />
138 <not_has_text text="#GC Bias" />
139 <not_has_text text="#Counts Density" />
140 <not_has_text text="#Scatterplot Matrix" />
141 <has_text text="#Saturation" />
142 </assert_contents>
143 </output>
144 </test>
145 <test>
146 <param name="input" value="mouse_counts_ensemble_1000_6.tsv" ftype="tsv" />
147 <conditional name="analysis">
148 <param name="mode" value="single_sample" />
149 <param name="sample_number" value="3" />
150 <conditional name="gene_info">
151 <param name="source" value="builtin" />
152 <param name="species" value="MOUSE" />
153 </conditional>
154 </conditional>
155 <assert_command>
156 <has_text text="results/_GlcN03Report.html" />
157 </assert_command>
158 <output name="output_html">
159 <assert_contents>
160 <has_text text="Qualimap Report: Counts QC" />
161 <not_has_text text="#Counts Distribution" />
162 <not_has_text text="#Features With Low Counts" />
163 <has_text text="#Bio Detection" />
164 <has_text text="#Counts Per Biotype" />
165 <has_text text="#Length Bias" />
166 <has_text text="#GC Bias" />
167 <not_has_text text="#Counts Density" />
168 <not_has_text text="#Scatterplot Matrix" />
169 <has_text text="#Saturation" />
170 </assert_contents>
171 </output>
172 </test>
173 <test>
174 <param name="input" value="mouse_counts_ensemble_1000_6.tsv" ftype="tsv" />
175 <conditional name="analysis">
176 <param name="mode" value="comparison" />
177 <param name="groups" value="minus,minus,minus,plus,plus,plus" />
178 </conditional>
179 <output name="output_html">
180 <assert_contents>
181 <has_text text="Qualimap Report: Counts QC" />
182 <has_text text="#Counts Distribution" />
183 <has_text text="#Features With Low Counts" />
184 <not_has_text text="#Bio Detection" />
185 <not_has_text text="#Counts Per Biotype" />
186 <not_has_text text="#Length Bias" />
187 <not_has_text text="#GC Bias" />
188 <not_has_text text="#Counts Density" />
189 <not_has_text text="#Scatterplot Matrix" />
190 <not_has_text text="#Saturation" />
191 </assert_contents>
192 </output>
193 </test>
194 <test>
195 <param name="input" value="mouse_counts_ensemble_1000_6.tsv" ftype="tsv" />
196 <conditional name="analysis">
197 <param name="mode" value="comparison" />
198 <param name="groups" value="minus,minus,minus,plus,plus,plus" />
199 <conditional name="gene_info">
200 <param name="source" value="builtin" />
201 <param name="species" value="MOUSE" />
202 </conditional>
203 </conditional>
204 <output name="output_html">
205 <assert_contents>
206 <has_text text="Qualimap Report: Counts QC" />
207 <has_text text="#Counts Distribution" />
208 <has_text text="#Features With Low Counts" />
209 <has_text text="#Bio Detection" />
210 <not_has_text text="#Counts Per Biotype" />
211 <has_text text="#Length Bias" />
212 <has_text text="#GC Bias" />
213 <not_has_text text="#Counts Density" />
214 <not_has_text text="#Scatterplot Matrix" />
215 <not_has_text text="#Saturation" />
216 </assert_contents>
217 </output>
218 </test>
219 <test>
220 <param name="input" value="mouse_counts_ensemble_1000_6.tsv" ftype="tsv" />
221 <conditional name="analysis">
222 <param name="mode" value="multi_sample" />
223 </conditional>
224 <output name="output_html">
225 <assert_contents>
226 <has_text text="Qualimap Report: Counts QC" />
227 <has_text text="_GlcN01 ." />
228 <has_text text="_GlcN02 ." />
229 <has_text text="_GlcN03 ." />
230 <has_text text="+GlcN01 ." />
231 <has_text text="+GlcN02 ." />
232 <has_text text="+GlcN03 ." />
233 <has_text text="#input" />
234 <has_text text="#Counts Density" />
235 <has_text text="#Scatterplot Matrix" />
236 <has_text text="#Saturation" />
237 <has_text text="#Counts Distribution" />
238 <has_text text="#Features With Low Counts" />
239 <not_has_text text="#Bio Detection" />
240 <not_has_text text="#Counts Per Biotype" />
241 <not_has_text text="#Length Bias" />
242 <not_has_text text="#GC Bias" />
243 </assert_contents>
244 </output>
245 </test>
246 </tests>
247 <help><![CDATA[
248 **What it does**
249
250 In RNA-seq experiments, the reads are usually first mapped to a reference genome. It is assumed that if the number of reads mapping to a certain biological feature of interest (gene, transcript, exon, ...) is sufficient, it can be used as an estimation of the abundance of that feature in the sample and interpreted as the quantification of the expression level of the corresponding region.
251
252 These count data can be utilized for example to assess differential expression between two or more experimental conditions. Before assessing differential expression analysis, researchers should be aware of some potential limitations of RNA-seq data, as for example:
253
254 - Has saturation been reached, or could more features be detected by
255 increasing the sequencing depth?
256
257 - Which type of features are being detected in the experiment?
258
259 - How good is the quantification of expression in the sample?
260
261 All of these questions can be answered by interpreting the plots generated by
262 **Qualimap Counts QC**.
263
264
265 Input
266 =====
267
268 The tool accepts tabular input of type `tsv`. It expects gene identifiers in
269 the first and the counts for different samples in the following column(s).
270 The first line of the input needs to be a header line starting with `#`,
271 immediately followed by the name of the gene identifier column, then the sample
272 names separated by tabs.
273 So, for example::
274
275 #GeneID Sample1 Sample2
276
277 would be a valid header line.
278
279 .. class:: infomark
280
281 The *Counts* output of featureCounts represents nearly valid input for the
282 tool, but you will have to **replace** the header line to add a leading `#`
283 and to provide more telling sample names.
284
285 You can **join** the outputs of several featureCounts runs to obtain
286 multi-sample counts data.
287
288
289 Analysis/Report types
290 ---------------------
291
292 *Report overview stats for all samples* - Generates overview plots of the
293 counts data across all samples.
294
295 *Report feature count stats of a single sample* - Generates plots with detailed
296 information about a single sample.
297
298 *Compare two groups of samples* - Lets you compare groups of samples representing different conditions.
299 This version of Qualimap requires all samples to belong to one of two groups.
300
301
302 Parameters
303 ----------
304
305 *Additional information about genes* (optional)
306
307 Qualimap requires gene annotation data to generate plots (see *Output* section
308 below) of
309
310 - counts across classes of features
311 - feature length and GC content bias in the counts data
312
313 , which are available for the single-sample and group comparison reports.
314
315 You can provide the annotation data in the form of:
316
317 - Built-in gene information for supported species
318
319 For convenience, Qualimap provides the Ensembl annotations for certain species (currently Human and Mouse). In order to use these annotations, Ensembl Gene IDs should be used as the feature IDs on the count files (e.g. ENSG00000251282).
320
321 - Custom gene information
322
323 A tabular dataset holding annotations of the features in the counts dataset is
324 required. It must be in a four-column tab-delimited (`tsv`) format, with the
325 feature names or IDs in the first column, the group (*e.g.* the biotype from
326 Ensembl database) in the second column, feature length in the third and feature
327 GC-content in the last column (see this
328 `example <http://kokonech.github.io/qualimap/samples/human.ens68.txt>`__).
329
330 **Make sure to use the same feature IDs in the annotation and in the counts dataset!**
331
332 To generate a Qualimap-compatible info file based on an arbitrary GTF annotation and a genome FASTA file, the developers of Qualimap offer a `Python script for the command line <https://bitbucket.org/kokonech/qualimap/src/master/util/createQualimapInfoFile.py?at=master>`__.
333
334 *Counts threshold*
335
336 In order to remove the influence of spurious reads, a feature is considered as
337 detected if its corresponding number of counts is greater than this threshold.
338
339 By default, the threshold value is set to 5 counts, meaning that features having
340 5 or fewer counts will not be taken into account.
341
342
343 Output
344 ======
345
346 Many of the plots that this tool can produce are created using the NOISeq package. The `NOISeq vignette <http://www.bioconductor.org/packages/release/bioc/vignettes/NOISeq/inst/doc/NOISeq.pdf>`__ contains a lot of useful information about the plots and how to interpret them. Here we provide a short explanation:
347
348
349 Plots of overview stats for all samples
350 ---------------------------------------
351
352 *Counts Density*
353
354 This plot shows density of counts computed from the histogram of log-transformed counts. In order to avoid infinite values in case of zero counts the transformation `log2(expr + 0.5)` is applied, where `expr` is the number of read counts for a given feature. Only log-transformed counts having value greater than 1 are plotted.
355
356 *Scatterplot Matrix*
357
358 The panel shows a scatterplot along with smoothed line (lower panel) and Pearson correlation coefficients (upper panel) for each pair of samples. Plots are generated using log-transformed counts.
359
360 *Saturation*
361
362 This plot provides information about the level of saturation in the samples, so it helps the user to decide if more sequencing is needed and more features could be detected when increasing the number of reads.
363
364 Sequencing depth of each sample (on the x-axis) is plotted against the number of detected features (on the y-axis). Here, “detected features” refers to features with more than k counts, where k is the *Counts threshold* selected by the user.
365
366 The highlighted value is the real sequencing depth of the sample(s). The
367 expected results at other sequencing depths are simulated based on random
368 sampling of the original data.
369
370 *Counts Distribution*
371
372 This box plot shows the overall counts distribution of each sample.
373
374 *Features With Low Counts*
375
376 This plot shows the proportion of features with low counts in each sample. Such features are usually less reliable and could be filtered out. In this plot, the bars show the percentage of features within each sample having more than 0 counts per million (CPM), or more than 1, 2, 5 and 10 CPM.
377
378
379 Plots of single-sample count statistics
380 ---------------------------------------
381
382 .. class:: infomark
383
384 Note that most single-sample plots require built-in or custom *additional
385 information about genes* to be generated. The *Saturation* plot is the only
386 exception.
387
388 *Saturation*
389
390 Similar to the same plot in the overview of all samples.
391 The single-sample plot, however, has an additional y-axis (on the right)
392 showing the number of features expected to be newly detected when increasing
393 the sequencing depth by one million reads from each indicated sequencing depth
394 value.
395
396 *Bio Detection*
397
398 This barplot allows the user to know which **kind** of features are being
399 detected in the chosen sample. The x-axis shows all feature categories listed
400 in the annotations file. The gray bars are the percentage of features of each group within the reference genome (or transcriptome, etc.). The striped color bars are the percentages of features of each group detected in the sample with regard to the genome. The solid color bars are the percentages that each group represents in the total detected features in the sample.
401
402 *Counts Per Biotype*
403
404 This boxplot shows the distribution of counts of features from each detected
405 feature class.
406
407 *Length Bias*
408
409 The plot describes the relationship between the length of the features and
410 their expression values.
411
412 Feature lengths are divided into bins, and mean expression of features falling into a particular length interval is computed and plotted. A cubic spline regression model is fitted to explain the relation between length and expression. Coefficient of determination (`R^2`) and p-value are shown together with the regression curve.
413
414 *GC Bias*
415
416 The plot describes the relationship between the GC-content of the features and the expression values. The data for the plot is generated similar to the
417 *Length Bias* plot. The GC content is divided into bins and the mean expression
418 of features falling into any given GC interval is computed. The relation between GC-content and expression is investigated using a cubic spline regression model.
419
420
421 Plots for comparing two groups of samples
422 -----------------------------------------
423
424 This mode can generate side-by-side plots of
425
426 - *Counts Distribution*,
427 - *Features With Low Counts*,
428 - *Bio Detection*,
429 - *Length Bias* and
430 - *GC Bias*
431
432 for two groups of samples.
433
434 .. class:: infomark
435
436 Note that the *Bio Detection*, *Length Bias* and *GC Bias* plots can only be
437 generated when built-in or custom *additional information about genes* is
438 available.
439
440 ]]></help>
441 <expand macro="citations"/>
442 </tool>