comparison qualimap_bamqc.xml @ 0:ac607906f10a draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/qualimap commit b4d43001cc0caa14d760c347fa1c416929f769b2"
author iuc
date Thu, 10 Oct 2019 17:42:04 -0400
parents
children 4a89c6f84425
comparison
equal deleted inserted replaced
-1:000000000000 0:ac607906f10a
1 <tool id="qualimap_bamqc" name="QualiMap BamQC" version="@VERSION@">
2 <macros>
3 <import>qualimap_macros.xml</import>
4 </macros>
5 <expand macro="requirements" />
6 <expand macro="version_command" />
7 <command detect_errors="exit_code"><![CDATA[
## Command template: runs "qualimap bamqc" on the selected BAM dataset,
## moves the optional per-base coverage file to its Galaxy output and then
## hands over to the @MASSAGE_OUTPUT@ token.
8 #import os
9 @SET_JAVA_OPTS@ &&
10
11 ## Set some default file names and paths
12 ## where we expect tool output to end up.
13 ## Note that most of these need to be overwritten if the user is
14 ## interested in regions *outside* those defined in a custom regions
15 ## file.
## NOTE(review): $report_name and $summary_report are not referenced in this
## template itself; presumably they are consumed by @MASSAGE_OUTPUT@ -
## confirm against qualimap_macros.xml.
16 #set $out_dir = 'results'
17 #set $report_name = 'qualimapReport'
18 #set $summary_report = 'genome_results.txt'
19 #set $coverage_file = os.path.join($out_dir, 'coverage.txt')
20 ## This is the only file path that qualimap does not calculate
21 ## from $out_dir.
22 #set $per_base_coverage_target = $coverage_file
23
24 qualimap bamqc
25 -bam '$input1' -outdir results -outformat html
26 --collect-overlap-pairs
27 #if str($stats_regions.region_select) == 'custom_regions':
## Quote the regions dataset path, like the -bam path above, so that the
## shell always receives it as a single word.
28 -gff '${stats_regions.regions}'
29 #if $stats_regions.outside_stats:
30 #set $report_name = 'qualimapReportOutsideRegions'
31 #set $summary_report = 'outside_results.txt'
32 #set $coverage_file = os.path.join(
33 $out_dir, 'outside_coverage.txt'
34 )
35 #if $per_base_coverage:
## With inverted regions the explicitly named coverage target is discarded;
## the file moved to the Galaxy output further down is
## outside_coverage.txt inside $out_dir instead.
36 #set $per_base_coverage_target = '/dev/null'
37 #end if
38 ${stats_regions.outside_stats}
39 #end if
40 #end if
41 #if $per_base_coverage:
42 $per_base_coverage $per_base_coverage_target
43 #end if
44 -nw ${plot_specific.n_bins}
45 ${plot_specific.paint_chromosome_limits}
46 #if $plot_specific.genome_gc_distr:
47 --genome-gc-distr ${plot_specific.genome_gc_distr}
48 #end if
49 -hm ${plot_specific.homopolymer_size}
50
## Duplicate skipping: no box checked leaves both flags off; both boxes
## checked ('0,1') are translated to mode 2; a single checked box passes its
## own value (0 or 1) as the mode.
51 #if $duplicate_skipping:
52 --skip-duplicated
53 #if str($duplicate_skipping) == '0,1':
54 --skip-dup-mode 2
55 #else:
56 --skip-dup-mode ${duplicate_skipping}
57 #end if
58 #end if
59 -nt \${GALAXY_SLOTS:-1} &&
60
61 #if $per_base_coverage:
62 mv $coverage_file '$output_per_base_coverage' &&
63 #end if
64 @MASSAGE_OUTPUT@
65 ]]></command>
66 <inputs>
<!-- Tool form: the BAM input, an optional restriction to (or exclusion of)
     custom regions, the per-base coverage switch, duplicate handling and a
     collapsed section of plot-specific settings. -->
67 <param argument="-bam" name="input1" type="data" format="bam"
68 label="Mapped reads input dataset" />
69 <conditional name="stats_regions">
70 <param name="region_select" type="select" label="Reference genome regions to calculate mapping statistics for">
71 <option value="all">All (whole genome)</option>
72 <option value="custom_regions">Select regions</option>
73 </param>
74 <when value="all" />
75 <when value="custom_regions">
76 <param argument="-gff" name="regions" type="data" format="gff,gtf,bed"
77 label="Dataset specifying regions" />
78 <param argument="-os" name="outside_stats" type="boolean" truevalue="--outside-stats" falsevalue="" checked="false"
79 label="Invert regions"
80 help="If selected, report read statistics *outside* the regions in the regions file." />
81 </when>
82 </conditional>
83 <param argument="-oc" name="per_base_coverage" type="boolean" truevalue="--output-genome-coverage" falsevalue="" checked="false"
84 label="Generate per-base coverage output"
85 help="Produce additional tabular output listing the coverage at every site (omitting only zero-coverage positions) in the selected regions of the genome. Caution: Will generate a huge dataset for anything but small input genomes or restricted regions!" />
<!-- Optional multi-select: the command template turns the checked values
     into the skip-duplicated flag plus a skip-dup-mode of 0, 1 or 2. -->
86 <param argument="--skip-dup-mode" name="duplicate_skipping" type="select" display="checkboxes" multiple="true" optional="true"
87 label="Skip duplicate reads">
88 <option value="0" selected="true">Reads flagged as duplicates in input</option>
89 <option value="1">Duplicates detected by Qualimap</option>
90 </param>
91 <section name="plot_specific" title="Settings affecting specific plots" expanded="false">
<!-- min="1": a bin count below one cannot describe a partition of the
     reference, so reject it in the form instead of failing at runtime. -->
92 <param argument="-nw" name="n_bins" type="integer" value="400" min="1"
93 label="Number of bins to use in across-reference plots"
94 help="Affected plots: Coverage, Mapping Quality and Insert Size across reference, Mapped reads GC-content distribution; the value determines the resolution of the affected plots. Note: The lower the value, the higher the memory usage of the tool!" />
95 <param argument="-c" name="paint_chromosome_limits" type="boolean" truevalue="--paint-chromosome-limits" falsevalue="" checked="true"
96 label="Draw chromosome limits"
97 help="Affected plots: Coverage, Mapping Quality and Insert Size across reference; in across-reference plots, indicate chromosome boundaries with dotted lines and labels" />
<!-- Optional: when nothing is selected the command template omits the
     genome GC distribution argument altogether. -->
98 <param argument="-gd" name="genome_gc_distr" type="select" optional="true"
99 label="Plot expected GC-content distribution of the following reference genome"
100 help="Affected plot: Mapped reads GC-content distribution; include a precalculated GC-content distribution for the selected (Qualimap-supported) reference genome in the plot">
101 <option value="hg19">Human genome (hg19)</option>
102 <option value="mm9">Mouse genome (mm9)</option>
103 <option value="mm10">Mouse genome (mm10)</option>
104 </param>
105 <param argument="-hm" name="homopolymer_size" type="integer" value="3" min="2"
106 label="Homopolymer size"
107 help="Affected plot: Homopolymer indels; sets the minimal number of consecutive bases that define a homopolymer" />
108 </section>
109 </inputs>
110 <outputs>
<!-- Declares the HTML report, the optional per-base coverage table and a
     list collection of raw data tables picked up from the results/
     directory of the job working directory. -->
111 <data name="output_html" format="html"
112 label="${tool.name} report on ${on_string}" />
<!-- Only created when the per_base_coverage input is enabled; the command
     template moves the coverage file onto this dataset. -->
113 <data name="output_per_base_coverage" format="tsv"
114 label="${tool.name} per-base coverage on ${on_string}">
115 <filter>per_base_coverage</filter>
116 </data>
117 <collection name="raw_data" type="list"
118 label="Raw data for ${tool.name} on ${on_string}">
<!-- NOTE(review): collected from results/summary_report.txt, a name the
     visible command template never writes; presumably the MASSAGE_OUTPUT
     token renames the summary file - confirm in qualimap_macros.xml. -->
119 <data name="genome_results" format="txt" from_work_dir="results/summary_report.txt" />
120 <data name="coverage_across_reference" format="tsv" from_work_dir="results/coverage_across_reference.txt" />
121 <data name="coverage_histogram" format="tsv" from_work_dir="results/coverage_histogram.txt" />
122 <data name="genome_fraction_coverage" format="tsv" from_work_dir="results/genome_fraction_coverage.txt" />
123 <data name="duplication_rate_histogram" format="tsv" from_work_dir="results/duplication_rate_histogram.txt" />
124 <data name="mapped_reads_clipping_profile" format="tsv" from_work_dir="results/mapped_reads_clipping_profile.txt" />
125 <data name="mapped_reads_gc-content_distribution" format="tsv" from_work_dir="results/mapped_reads_gc-content_distribution.txt" />
126 <data name="mapped_reads_nucleotide_content" format="tsv" from_work_dir="results/mapped_reads_nucleotide_content.txt" />
127 <data name="mapping_quality_across_reference" format="tsv" from_work_dir="results/mapping_quality_across_reference.txt" />
128 <data name="mapping_quality_histogram" format="tsv" from_work_dir="results/mapping_quality_histogram.txt" />
129 </collection>
130 </outputs>
131 <tests>
<!-- Every test checks the report title text in the HTML output and compares
     the genome_results collection element against a fixture, tolerating two
     differing lines. Tests that enable per_base_coverage expect one more
     output (13 instead of 12) and also compare the coverage table. -->
<!-- Test 1: defaults only (whole genome, no per-base coverage). -->
132 <test expect_num_outputs="12">
133 <param name="input1" value="test_mapped_reads.bam"/>
134 <output name="output_html" ftype="html">
135 <assert_contents>
136 <has_text text="Qualimap report: BAM QC" />
137 </assert_contents>
138 </output>
139 <output_collection name="raw_data" type="list">
140 <element name="genome_results" file="genome_results_default.txt" ftype="txt" compare="diff" lines_diff="2" />
141 </output_collection>
142 </test>
<!-- Test 2: whole genome with per-base coverage output enabled. -->
143 <test expect_num_outputs="13">
144 <param name="input1" value="test_mapped_reads.bam" />
145 <param name="per_base_coverage" value="true" />
146 <output name="output_html" ftype="html">
147 <assert_contents>
148 <has_text text="Qualimap report: BAM QC" />
149 </assert_contents>
150 </output>
151 <output name="output_per_base_coverage" file="per_base_coverage_default.txt" ftype="tsv" />
152 <output_collection name="raw_data" type="list">
153 <element name="genome_results" file="genome_results_default.txt" ftype="txt" compare="diff" lines_diff="2" />
154 </output_collection>
155 </test>
<!-- Test 3: statistics restricted to the regions in features.gtf. -->
156 <test expect_num_outputs="12">
157 <param name="input1" value="test_mapped_reads.bam"/>
158 <conditional name="stats_regions">
159 <param name="region_select" value="custom_regions" />
160 <param name="regions" value="features.gtf" />
161 </conditional>
162 <output name="output_html" ftype="html">
163 <assert_contents>
164 <has_text text="Qualimap report: BAM QC" />
165 </assert_contents>
166 </output>
167 <output_collection name="raw_data" type="list">
168 <element name="genome_results" file="genome_results_inside_features.txt" ftype="txt" compare="diff" lines_diff="2" />
169 </output_collection>
170 </test>
<!-- Test 4: regions from features.gtf plus per-base coverage output. -->
171 <test expect_num_outputs="13">
172 <param name="input1" value="test_mapped_reads.bam" />
173 <conditional name="stats_regions">
174 <param name="region_select" value="custom_regions" />
175 <param name="regions" value="features.gtf" />
176 </conditional>
177 <param name="per_base_coverage" value="true" />
178 <output name="output_html" ftype="html">
179 <assert_contents>
180 <has_text text="Qualimap report: BAM QC" />
181 </assert_contents>
182 </output>
183 <output name="output_per_base_coverage" file="per_base_coverage_inside_features.txt" ftype="tsv" />
184 <output_collection name="raw_data" type="list">
185 <element name="genome_results" file="genome_results_inside_features.txt" ftype="txt" compare="diff" lines_diff="2" />
186 </output_collection>
187 </test>
<!-- Test 5: inverted regions (outside_stats enabled) plus per-base coverage;
     exercises the branch of the command template that switches the report,
     summary and coverage file names. -->
188 <test expect_num_outputs="13">
189 <param name="input1" value="test_mapped_reads.bam" />
190 <conditional name="stats_regions">
191 <param name="region_select" value="custom_regions" />
192 <param name="regions" value="features.gtf" />
193 <param name="outside_stats" value="true" />
194 </conditional>
195 <param name="per_base_coverage" value="true" />
196 <output name="output_html" ftype="html">
197 <assert_contents>
198 <has_text text="Qualimap report: BAM QC" />
199 </assert_contents>
200 </output>
201 <output name="output_per_base_coverage" file="per_base_coverage_outside_features.txt" ftype="tsv" />
202 <output_collection name="raw_data" type="list">
203 <element name="genome_results" file="genome_results_outside_features.txt" ftype="txt" compare="diff" lines_diff="2" />
204 </output_collection>
205 </test>
206 </tests>
207 <help><![CDATA[
208 **What it does**
209
210 **Qualimap BAM QC** lets you evaluate the quality of aligned reads data in BAM
211 format. The tool summarizes basic statistics of the alignment (number of reads,
212 coverage, GC-content, etc.) and produces a number of useful graphs for their
213 interpretation.
214
215 The analysis can be performed with any kind of sequencing data, such as
216 whole-genome sequencing, exome sequencing, RNA-seq or ChIP-seq data.
217
218 In addition, it is possible to provide an annotation file so the results are
219 computed for the reads mapping inside (and optionally outside) of the
220 corresponding genomic regions, which can be especially useful for evaluating
221 target-enrichment sequencing studies.
222
223 Input
224 =====
225
226 *Mapped reads input dataset*
227
228 The dataset holding the mapped reads to carry out the analysis with.
229
230 *Dataset specifying regions*
231
232 If you decide to calculate mapping statistics for selected regions of the
233 reference genome (instead of for the whole genome), you need to specify the
234 regions through this additional dataset in gtf, gff or bed format.
235
236 .. class:: infomark
237
238 A typical problem when working with regions (and genome annotation data, in general) is potential inconsistency between the chromosome names used in the mapped reads input versus those used to define the regions. In the case of the human genome, for example, UCSC data has chromosomes starting with a 'chr' prefix, which is lacking from Ensembl data. This simple form of the problem is handled by Qualimap: if chromosome names in the regions input have a 'chr' prefix, Qualimap will add that prefix to the mapped reads chromosome names as needed. For more complex cases you will have to adjust your inputs manually.
239
240
241 Parameters
242 ----------
243
244 *Reference genome regions to calculate mapping statistics for*
245
246 Choose whether you would like to have mapping statistics reported across
247
248 - the entire reference genome
249 (as specified in the header of the mapped reads input)
250
251 - specific regions of the reference
252
253 In the second case, you need to select a *Dataset specifying regions* (see
254 above). Using the *Invert regions* switch you can then indicate whether you
255 want to select or exclude the regions in this dataset.
256
257 *Generate per-base coverage output*
258
259 *Skip duplicate reads*
260
261 The tool lets you skip alignments of duplicate reads from the analysis.
262 Depending on whether you select none, either one, or both of the available
263 options, you can decide to:
264
265 - not correct for duplicate reads at all (*e.g.* because you have removed them
266 at an earlier step with some dedicated tool)
267 - identify and flag duplicate reads with a dedicated tool (like ``Picard
268 MarkDuplicates`` or ``samtools markdup``), then have Qualimap ignore the
269 duplicate-flagged reads (recommended, most flexible option since other tools
270 can be told to ignore the same reads)
271 - have Qualimap identify potential duplicates by itself and ignore them
272 - combine external and Qualimap-internal duplicate detection for extra
273 stringency
274
275 Independent of your selection, the HTML report will always list (in the
276 `Globals` section of the `Summary`) the number of duplicated reads estimated by
277 Qualimap. If you choose to skip duplicates, you will also be informed about the
278 number of skipped reads in that same section and, if you instruct Qualimap to
279 look for the duplicate flag on reads, the number of reads flagged as duplicates
280 will also be reported here.
281
282 **Section: Settings affecting specific plots**
283
284 Parameters in this section only affect some (or even only one) of the plots
285 contained in the HTML report (and the corresponding part of the *Raw Data*
286 output collection).
287
288 For most of these options, the parameter help above should be descriptive
289 enough. Just a few more words on two of them:
290
291 *Number of bins to use in across-reference plots*
292
293 This value is used for computing the various graphs that plot information
294 across the reference. Basically, the reference genome gets split into the given
295 number of bins, and reads falling in the same bin are aggregated in the
296 statistics of that bin.
297
298 Thus, the higher the number of bins, the higher the resolution of the plots,
299 but more bins also require longer time for their statistics to be computed.
300 Fewer bins, on the other hand, mean more reads will have to be aggregated per
301 bin and this comes with higher memory requirements. Hence, if the tool fails
302 with an ``Out Of Memory`` error, you may want to rerun it with a higher bin
303 number.
304
305 *Plot expected GC-content distribution of the following reference genome*
306
307 The choice of reference genomes with pre-calculated GC distributions is built
308 into Qualimap.
309
310 Future releases of Qualimap may include more choices, but the current version
311 is limited to those offered here.
312
313
314 Outputs
315 =======
316
317 HTML Report
318 -----------
319
320 **Summary Section**
321
322 *Globals*
323
324 This section contains information about the total number of reads, number of mapped reads, paired-end mapping performance, read length distribution,
325 number of clipped reads and duplication rate (estimated from the start positions of read alignments).
326
327 *ACGT Content*
328
329 Nucleotide content and GC percentage in the mapped reads.
330
331 *Coverage*
332
333 Mean and standard deviation of the coverage depth.
334
335 *Mapping quality*
336
337 Mean mapping quality of the mapped reads.
338
339 *Insert size*
340
341 Mean, standard deviation and percentiles of the insert size distribution if applicable. The features are computed based on the TLEN field of the SAM file.
342
343 *Mismatches and indels*
344
345 This section reports the general alignment error rate (computed as the ratio of total collected edit distance to the number of mapped bases), the total number of mismatches and the total number of indels (computed from the CIGAR values). Additionally, the fraction of homopolymer indels among total indels is provided. Note that the error rate and mismatch metrics are based on optional fields of a SAM record (NM for edit distance, MD for mismatches). These features are not reported if the fields are missing in the SAM file.
346
347 *Chromosome stats*
348
349 Number of mapped bases, mean and standard deviation of the coverage depth for each chromosome as defined by the header of the SAM file.
350
351 For region-based analysis the information is given inside of regions, including some additional information like, for example, number of correct strand reads.
352
353
354 **Plots**
355
356 *Coverage Across Reference*
357
358 This plot consists of two figures.
359 The upper figure provides the coverage distribution (red line) and coverage
360 deviation across the reference sequence.
361 The lower figure shows GC content across reference (black line) together with
362 its average value (red dotted line).
363
364 *Coverage Histogram*
365
366 Histogram of the number of genomic locations having a given coverage rate.
367 The bins of the x-axis are conveniently scaled by aggregating some coverage
368 values in order to produce a representative histogram also in presence of the
369 usual NGS peaks of coverage.
370
371 *Coverage Histogram (0-50X)*
372
373 Similar to the previous plot, but in this graph genome locations with a
374 coverage greater than 50X are grouped into the last bin.
375 By doing so a higher resolution of the most common values for the coverage rate
376 is obtained.
377
378 *Genome Fraction Coverage*
379
380 Provides a visual way of knowing how much reference has been sequenced to at
381 least a given coverage rate.
382 This graph should be interpreted as in this example:
383 If one aims for a coverage rate of at least 25X (x-axis), how much of the
384 reference (y-axis) will be considered?
385
386 *Duplication Rate Histogram*
387
388 This plot shows the distribution of duplicated reads.
389 Due to several factors (*e.g.* amount of starting material, sample preparation,
390 *etc.*) it is possible that the same fragments are sequenced several times.
391 For some experiments where enrichment is used (*e.g.* ChIP-seq ) this is
392 expected to some degree.
393 For most experiments, however, a high duplication level of the reads indicates
394 some unwanted bias.
395
396 *Mapped Reads Nucleotide Content*
397
398 This plot shows the nucleotide content per position of the mapped reads.
399
400 *Mapped Reads GC Content Distribution*
401
402 This graph shows the distribution of GC-content per mapped read.
403 If compared with a precomputed genome distribution, this plot lets you check
404 if there is a shift in the GC content.
405
406 *Mapped Reads Clipping Profile*
407
408 Represents the percentage of clipped bases across the reads.
409 Technically, the clipping is detected via SAM format CIGAR codes ‘H’
410 (hard clipping) and ‘S’ (soft clipping).
411 In addition, the total number of clipped reads can be found in the report
412 `Summary` section.
413
414 This plot is not shown if no clipped reads are found.
415
416 *Homopolymer Indels*
417
418 This bar plot shows the number of indels that are located within A, C, G and T
419 homopolymers, respectively, as well as the number of indels that are not within
420 any homopolymer. Large numbers of homopolymer indels may indicate a problem in
421 the sequencing process.
422 Technically, Qualimap identifies indels from the CIGAR code of the aligned
423 reads. Indel statistics can also be found in a dedicated section of the report
424 `Summary`.
425
426 This graph is not shown if the sample doesn’t contain any indels.
427
428 *Mapping Quality Across Reference*
429
430 This plot provides the mapping quality distribution across the reference.
431 To construct the plot, the mean mapping quality is computed for each bin.
432
433 *Mapping Quality Histogram*
434
435 Histogram of the number of genomic locations having a given mapping quality.
436 To construct the histogram the mean mapping quality is computed at each genome
437 position with non-zero coverage and collected.
438 According to the SAM/BAM format specifications, the range for the mapping
439 quality score is [0-255].
440
441 *Insert Size Across Reference*
442
443 This plot provides the insert size distribution across the reference.
444 Technically, the insert size of each pair of aligned reads is collected from
445 the SAM alignment field `TLEN`. Only positive values are taken into account.
446 To construct the plot, the mean insert size is computed for each bin.
447
448 *Insert Size Histogram*
449
450 Histogram of insert size distribution.
451
452
453 Raw Data
454 --------
455
456 This is a *Collection* of 10 individual datasets.
457
458 The *genome_results* dataset provides a plain-text summary of key statistics,
459 most of which can also be found in the *Summary* section of the *HTML Report*.
460
461 The remaining 9 datasets hold the tabular raw data underlying the plots of the corresponding names in the *HTML Report*.
462
463
464 Per-base coverage
465 -----------------
466
467 Optional. This is a tabular dataset listing the coverage of every base in the
468 reference genome unless that coverage is zero. Since its content is
469 uncompressed text, this dataset can easily become huge, and it is recommended
470 that you generate this dataset only for very small genomes or very limited
471 regions of larger genomes.
472 ]]> </help>
473 <expand macro="citations"/>
474 </tool>