Mercurial > repos > iuc > qualimap_counts
comparison qualimap_counts.xml @ 0:e020be4f281b draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/qualimap commit b4d43001cc0caa14d760c347fa1c416929f769b2"
author | iuc |
---|---|
date | Thu, 10 Oct 2019 17:40:46 -0400 |
parents | |
children | 72927fc9e9ed |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e020be4f281b |
---|---|
1 <tool id="qualimap_counts" name="QualiMap Counts QC" version="@VERSION@"> | |
2 <macros> | |
3 <import>qualimap_macros.xml</import> | |
4 <xml name="gene_info"> | |
5 <conditional name="gene_info"> | |
6 <param name="source" type="select" | |
7 label="Additional information about genes"> | |
8 <option value="">None</option> | |
9 <option value="builtin">Built-in gene information for supported species</option> | |
10 <option value="custom">Custom gene information</option> | |
11 </param> | |
12 <when value="" /> | |
13 <when value="builtin"> | |
14 <param argument="-s" name="species" type="select" | |
15 label="Select species"> | |
16 <option value="HUMAN">Human</option> | |
17 <option value="MOUSE">Mouse</option> | |
18 </param> | |
19 </when> | |
20 <when value="custom"> | |
21 <param argument="-i" name="info" type="data" format="tsv" | |
22 label="Qualimap compatible gene annotation data" /> | |
23 </when> | |
24 </conditional> | |
25 </xml> | |
26 </macros> | |
27 <expand macro="requirements" /> | |
28 <expand macro="version_command" /> | |
29 <command detect_errors="exit_code"><![CDATA[ | |
30 @SET_JAVA_OPTS@ && | |
31 | |
32 ## Inconveniently, metadata.column_names is a long string of | |
33 ## comma-separated, sanitized column name representations. | |
34 ## The following turns this string into a list of the actual names. | |
35 #set $sample_names = [ | |
36 s.strip().replace(' ', '_') | |
37 for s in $input.metadata.column_names | |
38 .replace('__ob__','') | |
39 .replace('u__sq__','') | |
40 .replace('__sq__','') | |
41 .replace('__cb__','') | |
42 .split(',') | |
43 ][1:] | |
44 #if str($analysis.groups).strip(): | |
45 #set $groups = [ | |
46 v.strip().replace(' ', '_') | |
47 for v in str($analysis.groups).split(',') | |
48 ] | |
49 #else: | |
50 #set $groups = ['.'] * (int($input.metadata.columns) - 1) | |
51 #end if | |
52 #for $col, $sample, $group in zip(range(2, int($input.metadata.columns) + 1), $sample_names, $groups): | |
53 printf '%s\t%s\tcounts_data\t%d\n' '${sample}' '${group}' ${col} >> data_spec.txt && | |
54 #end for | |
55 | |
56 ln -s '$input' counts_data && | |
57 | |
58 qualimap counts | |
59 --data data_spec.txt | |
60 #if str($analysis.mode) == 'comparison': | |
61 --compare | |
62 #end if | |
63 #if str($analysis.mode) != 'multi_sample': | |
64 #if str($analysis.gene_info.source) == 'builtin': | |
65 --species ${analysis.gene_info.species} | |
66 #elif str($analysis.gene_info.source) == 'custom': | |
67 --info '${analysis.gene_info.info}' ## info is nested inside the gene_info conditional; quoted like the input path | |
68 #end if | |
69 #end if | |
70 --threshold $threshold | |
71 -outdir results -outformat html && | |
72 | |
73 #if str($analysis.mode) == 'comparison': | |
74 #set $report_name = 'ComparisonReport' | |
75 #elif str($analysis.mode) == 'multi_sample': | |
76 #set $report_name = 'GlobalReport' | |
77 #else: | |
78 #set $sample = $sample_names[int($analysis.sample_number) - 1] | |
79 ## Qualimap replaces '-' in sample names with '_' for | |
80 ## determining file names | |
81 #set $report_name = str($sample).replace('-', '_') + 'Report' | |
82 #end if | |
83 #set $summary_report = None | |
84 @MASSAGE_OUTPUT@ | |
85 ]]></command> | |
86 | |
87 <inputs> | |
88 <param name="input" type="data" format="tsv" label="Input counts data" /> | |
89 <conditional name="analysis"> | |
90 <param name="mode" type="select" label="Type of analysis to perform"> | |
91 <option value="multi_sample">Report overview stats for all samples</option> | |
92 <option value="single_sample">Report feature count stats of a single sample</option> | |
93 <option value="comparison">Compare two groups of samples</option> | |
94 </param> | |
95 <when value="multi_sample"> | |
96 <param name="groups" type="hidden" value="" /> | |
97 </when> | |
98 <when value="single_sample"> | |
99 <param name="groups" type="hidden" value="" /> | |
100 <param name="sample_number" type="integer" value="1" min="1" | |
101 label="Which sample in the input would you like to analyze?" /> | |
102 <expand macro="gene_info" /> | |
103 </when> | |
104 <when value="comparison"> | |
105 <param name="groups" type="text" | |
106 label="Assign each sample to one of two groups" | |
107 help="The 'Comparison' mode currently works for exactly two groups of samples. Enter comma-separated group names - one for each sample. The group names will be assigned to the samples in the order of the sample columns of the counts data."> | |
108 <validator type="expression" message="Please specify a comma-separated list of group names. No group name in the list may be blank.">all(v.strip() for v in value.split(','))</validator> | |
109 <validator type="expression" message="Please specify a comma-separated list of group names. Only two different group names are allowed.">len(set(v.strip() for v in value.split(','))) == 2</validator> | |
110 </param> | |
111 <expand macro="gene_info" /> | |
112 </when> | |
113 </conditional> | |
114 <param argument="-k" name="threshold" type="integer" value="5" min="1" | |
115 label="Counts threshold" /> | |
116 </inputs> | |
117 <outputs> | |
118 <data name="output_html" format="html" /> | |
119 </outputs> | |
120 <tests> | |
121 <test> | |
122 <param name="input" value="mouse_counts_ensemble_1000_6.tsv" ftype="tsv" /> | |
123 <conditional name="analysis"> | |
124 <param name="mode" value="single_sample" /> | |
125 <param name="sample_number" value="3" /> | |
126 </conditional> | |
127 <assert_command> | |
128 <has_text text="results/_GlcN03Report.html" /> | |
129 </assert_command> | |
130 <output name="output_html"> | |
131 <assert_contents> | |
132 <has_text text="Qualimap Report: Counts QC" /> | |
133 <not_has_text text="#Counts Distribution" /> | |
134 <not_has_text text="#Features With Low Counts" /> | |
135 <not_has_text text="#Bio Detection" /> | |
136 <not_has_text text="#Counts Per Biotype" /> | |
137 <not_has_text text="#Length Bias" /> | |
138 <not_has_text text="#GC Bias" /> | |
139 <not_has_text text="#Counts Density" /> | |
140 <not_has_text text="#Scatterplot Matrix" /> | |
141 <has_text text="#Saturation" /> | |
142 </assert_contents> | |
143 </output> | |
144 </test> | |
145 <test> | |
146 <param name="input" value="mouse_counts_ensemble_1000_6.tsv" ftype="tsv" /> | |
147 <conditional name="analysis"> | |
148 <param name="mode" value="single_sample" /> | |
149 <param name="sample_number" value="3" /> | |
150 <conditional name="gene_info"> | |
151 <param name="source" value="builtin" /> | |
152 <param name="species" value="MOUSE" /> | |
153 </conditional> | |
154 </conditional> | |
155 <assert_command> | |
156 <has_text text="results/_GlcN03Report.html" /> | |
157 </assert_command> | |
158 <output name="output_html"> | |
159 <assert_contents> | |
160 <has_text text="Qualimap Report: Counts QC" /> | |
161 <not_has_text text="#Counts Distribution" /> | |
162 <not_has_text text="#Features With Low Counts" /> | |
163 <has_text text="#Bio Detection" /> | |
164 <has_text text="#Counts Per Biotype" /> | |
165 <has_text text="#Length Bias" /> | |
166 <has_text text="#GC Bias" /> | |
167 <not_has_text text="#Counts Density" /> | |
168 <not_has_text text="#Scatterplot Matrix" /> | |
169 <has_text text="#Saturation" /> | |
170 </assert_contents> | |
171 </output> | |
172 </test> | |
173 <test> | |
174 <param name="input" value="mouse_counts_ensemble_1000_6.tsv" ftype="tsv" /> | |
175 <conditional name="analysis"> | |
176 <param name="mode" value="comparison" /> | |
177 <param name="groups" value="minus,minus,minus,plus,plus,plus" /> | |
178 </conditional> | |
179 <output name="output_html"> | |
180 <assert_contents> | |
181 <has_text text="Qualimap Report: Counts QC" /> | |
182 <has_text text="#Counts Distribution" /> | |
183 <has_text text="#Features With Low Counts" /> | |
184 <not_has_text text="#Bio Detection" /> | |
185 <not_has_text text="#Counts Per Biotype" /> | |
186 <not_has_text text="#Length Bias" /> | |
187 <not_has_text text="#GC Bias" /> | |
188 <not_has_text text="#Counts Density" /> | |
189 <not_has_text text="#Scatterplot Matrix" /> | |
190 <not_has_text text="#Saturation" /> | |
191 </assert_contents> | |
192 </output> | |
193 </test> | |
194 <test> | |
195 <param name="input" value="mouse_counts_ensemble_1000_6.tsv" ftype="tsv" /> | |
196 <conditional name="analysis"> | |
197 <param name="mode" value="comparison" /> | |
198 <param name="groups" value="minus,minus,minus,plus,plus,plus" /> | |
199 <conditional name="gene_info"> | |
200 <param name="source" value="builtin" /> | |
201 <param name="species" value="MOUSE" /> | |
202 </conditional> | |
203 </conditional> | |
204 <output name="output_html"> | |
205 <assert_contents> | |
206 <has_text text="Qualimap Report: Counts QC" /> | |
207 <has_text text="#Counts Distribution" /> | |
208 <has_text text="#Features With Low Counts" /> | |
209 <has_text text="#Bio Detection" /> | |
210 <not_has_text text="#Counts Per Biotype" /> | |
211 <has_text text="#Length Bias" /> | |
212 <has_text text="#GC Bias" /> | |
213 <not_has_text text="#Counts Density" /> | |
214 <not_has_text text="#Scatterplot Matrix" /> | |
215 <not_has_text text="#Saturation" /> | |
216 </assert_contents> | |
217 </output> | |
218 </test> | |
219 <test> | |
220 <param name="input" value="mouse_counts_ensemble_1000_6.tsv" ftype="tsv" /> | |
221 <conditional name="analysis"> | |
222 <param name="mode" value="multi_sample" /> | |
223 </conditional> | |
224 <output name="output_html"> | |
225 <assert_contents> | |
226 <has_text text="Qualimap Report: Counts QC" /> | |
227 <has_text text="_GlcN01 ." /> | |
228 <has_text text="_GlcN02 ." /> | |
229 <has_text text="_GlcN03 ." /> | |
230 <has_text text="+GlcN01 ." /> | |
231 <has_text text="+GlcN02 ." /> | |
232 <has_text text="+GlcN03 ." /> | |
233 <has_text text="#input" /> | |
234 <has_text text="#Counts Density" /> | |
235 <has_text text="#Scatterplot Matrix" /> | |
236 <has_text text="#Saturation" /> | |
237 <has_text text="#Counts Distribution" /> | |
238 <has_text text="#Features With Low Counts" /> | |
239 <not_has_text text="#Bio Detection" /> | |
240 <not_has_text text="#Counts Per Biotype" /> | |
241 <not_has_text text="#Length Bias" /> | |
242 <not_has_text text="#GC Bias" /> | |
243 </assert_contents> | |
244 </output> | |
245 </test> | |
246 </tests> | |
247 <help><![CDATA[ | |
248 **What it does** | |
249 | |
250 In RNA-seq experiments, the reads are usually first mapped to a reference genome. It is assumed that if the number of reads mapping to a certain biological feature of interest (gene, transcript, exon, ...) is sufficient, it can be used as an estimation of the abundance of that feature in the sample and interpreted as the quantification of the expression level of the corresponding region. | |
251 | |
252 These count data can be utilized, for example, to assess differential expression between two or more experimental conditions. Before performing a differential expression analysis, researchers should be aware of some potential limitations of RNA-seq data and ask, for example: | |
253 | |
254 - Has saturation been reached, or could more features be detected by | |
255 increasing the sequencing depth? | |
256 | |
257 - Which type of features are being detected in the experiment? | |
258 | |
259 - How good is the quantification of expression in the sample? | |
260 | |
261 All of these questions can be answered by interpreting the plots generated by | |
262 **Qualimap Counts QC**. | |
263 | |
264 | |
265 Input | |
266 ===== | |
267 | |
268 The tool accepts tabular input of type `tsv`. It expects gene identifiers in | |
269 the first column and the counts for the different samples in the following column(s). | |
270 The first line of the input needs to be a header line starting with `#`, | |
271 immediately followed by the name of the gene identifier column, then the sample | |
272 names separated by tabs. | |
273 So, for example:: | |
274 | |
275 #GeneID Sample1 Sample2 | |
276 | |
277 would be a valid header line. | |
278 | |
279 .. class:: infomark | |
280 | |
281 The *Counts* output of featureCounts represents nearly valid input for the | |
282 tool, but you will have to **replace** the header line to add a leading `#` | |
283 and to provide more telling sample names. | |
284 | |
285 You can **join** the outputs of several featureCounts runs to obtain | |
286 multi-sample counts data. | |
287 | |
288 | |
289 Analysis/Report types | |
290 --------------------- | |
291 | |
292 *Report overview stats for all samples* - Generates overview plots of the | |
293 counts data across all samples. | |
294 | |
295 *Report feature count stats of a single sample* - Generates plots with detailed | |
296 information about a single sample. | |
297 | |
298 *Compare two groups of samples* - Lets you compare groups of samples representing different conditions. | |
299 This version of Qualimap requires all samples to belong to one of two groups. | |
300 | |
301 | |
302 Parameters | |
303 ---------- | |
304 | |
305 *Additional information about genes* (optional) | |
306 | |
307 Qualimap requires gene annotation data to generate plots (see *Output* section | |
308 below) of | |
309 | |
310 - counts across classes of features | |
311 - feature length and GC content bias in the counts data | |
312 | |
313 , which are available for the single-sample and group comparison reports. | |
314 | |
315 You can provide the annotation data in the form of: | |
316 | |
317 - Built-in gene information for supported species | |
318 | |
319 For convenience, Qualimap provides the Ensembl annotations for certain species (currently Human and Mouse). In order to use these annotations, Ensembl Gene IDs should be used as the feature IDs on the count files (e.g. ENSG00000251282). | |
320 | |
321 - Custom gene information | |
322 | |
323 A tabular dataset holding annotations of the features in the counts dataset is | |
324 required. It must be in a four-column tab-delimited (`tsv`) format, with the | |
325 feature names or IDs in the first column, the group (*e.g.* the biotype from | |
326 Ensembl database) in the second column, feature length in the third and feature | |
327 GC-content in the last column (see this | |
328 `example <http://kokonech.github.io/qualimap/samples/human.ens68.txt>`__). | |
329 | |
330 **Make sure to use the same feature IDs in the annotation and in the counts dataset!** | |
331 | |
332 To generate a Qualimap-compatible info file based on an arbitrary GTF annotation and a genome FASTA file, the developers of Qualimap offer a `Python script for the command line <https://bitbucket.org/kokonech/qualimap/src/master/util/createQualimapInfoFile.py?at=master>`__. | |
333 | |
334 *Counts threshold* | |
335 | |
336 In order to remove the influence of spurious reads, a feature is considered as | |
337 detected if its corresponding number of counts is greater than this threshold. | |
338 | |
339 By default, the threshold value is set to 5 counts, meaning that features having | |
340 less than 5 counts will not be taken into account. | |
341 | |
342 | |
343 Output | |
344 ====== | |
345 | |
346 Many of the plots that this tool can produce are created using the NOISeq package. The `NOISeq vignette <http://www.bioconductor.org/packages/release/bioc/vignettes/NOISeq/inst/doc/NOISeq.pdf>`__ contains a lot of useful information about the plots and how to interpret them. Here we provide a short explanation: | |
347 | |
348 | |
349 Plots of overview stats for all samples | |
350 --------------------------------------- | |
351 | |
352 *Counts Density* | |
353 | |
354 This plot shows density of counts computed from the histogram of log-transformed counts. In order to avoid infinite values in case of zero counts the transformation `log2(expr + 0.5)` is applied, where `expr` is the number of read counts for a given feature. Only log-transformed counts having value greater than 1 are plotted. | |
355 | |
356 *Scatterplot Matrix* | |
357 | |
358 The panel shows a scatterplot along with a smoothed line (lower panel) and Pearson correlation coefficients (upper panel) for each pair of samples. Plots are generated using log-transformed counts. | |
359 | |
360 *Saturation* | |
361 | |
362 This plot provides information about the level of saturation in the samples, so it helps the user to decide if more sequencing is needed and more features could be detected when increasing the number of reads. | |
363 | |
364 Sequencing depth of each sample (on the x-axis) is plotted against the number of detected features (on the y-axis). Here, “detected features” refers to features with more than k counts, where k is the *Counts threshold* selected by the user. | |
365 | |
366 The highlighted value is the real sequencing depth of the sample(s). The | |
367 expected results at other sequencing depths are simulated based on random | |
368 sampling of the original data. | |
369 | |
370 *Counts Distribution* | |
371 | |
372 This box plot shows the overall counts distribution of each sample. | |
373 | |
374 *Features With Low Counts* | |
375 | |
376 This plot shows the proportion of features with low counts in each sample. Such features are usually less reliable and could be filtered out. In this plot, the bars show the percentage of features within each sample having more than 0 counts per million (CPM), or more than 1, 2, 5 and 10 CPM. | |
377 | |
378 | |
379 Plots of single-sample count statistics | |
380 --------------------------------------- | |
381 | |
382 .. class:: infomark | |
383 | |
384 Note that most single-sample plots require built-in or custom *additional | |
385 information about genes* to be generated. The *Saturation* plot is the only | |
386 exception. | |
387 | |
388 *Saturation* | |
389 | |
390 Similar to the same plot in the overview of all samples. | |
391 The single-sample plot, however, has an additional y-axis (on the right) | |
392 showing the number of features expected to be newly detected when increasing | |
393 the sequencing depth by one million reads from each indicated sequencing depth | |
394 value. | |
395 | |
396 *Bio Detection* | |
397 | |
398 This barplot allows the user to know which **kind** of features are being | |
399 detected in the chosen sample. The x-axis shows all feature categories listed | |
400 in the annotations file. The gray bars are the percentage of features of each group within the reference genome (or transcriptome, etc.). The striped color bars are the percentages of features of each group detected in the sample with regard to the genome. The solid color bars are the percentages that each group represents in the total detected features in the sample. | |
401 | |
402 *Counts Per Biotype* | |
403 | |
404 This boxplot shows the distribution of counts of features from each detected | |
405 feature class. | |
406 | |
407 *Length Bias* | |
408 | |
409 The plot describes the relationship between the length of the features and | |
410 their expression values. | |
411 | |
412 Feature lengths are divided into bins, and mean expression of features falling into a particular length interval is computed and plotted. A cubic spline regression model is fitted to explain the relation between length and expression. Coefficient of determination (`R^2`) and p-value are shown together with the regression curve. | |
413 | |
414 *GC Bias* | |
415 | |
416 The plot describes the relationship between the GC-content of the features and the expression values. The data for the plot is generated similarly to the | |
417 *Length Bias* plot. The GC content is divided into bins and the mean expression | |
418 of features falling into any given GC interval is computed. The relation between GC-content and expression is investigated using a cubic spline regression model. | |
419 | |
420 | |
421 Plots for comparing two groups of samples | |
422 ----------------------------------------- | |
423 | |
424 This mode can generate side-by-side plots of | |
425 | |
426 - *Counts Distribution*, | |
427 - *Features With Low Counts*, | |
428 - *Bio Detection*, | |
429 - *Length Bias* and | |
430 - *GC Bias* | |
431 | |
432 for two groups of samples. | |
433 | |
434 .. class:: infomark | |
435 | |
436 Note that the *Bio Detection*, *Length Bias* and *GC Bias* plots can only be | |
437 generated when built-in or custom *additional information about genes* is | |
438 available. | |
439 | |
440 ]]></help> | |
441 <expand macro="citations"/> | |
442 </tool> |