comparison umi-tools_dedup.xml @ 0:a6477bafd522 draft

planemo upload commit eea727c3bdfe36d9d16036d5ab79fb8b27c4e82e
author iuc
date Wed, 10 Jan 2018 19:09:42 -0500
parents
children 1692b1acebfd
comparison
equal deleted inserted replaced
-1:000000000000 0:a6477bafd522
1 <tool id="umi_tools_dedup" name="UMI-tools deduplicate" version="@VERSION@.0">
2 <description>Deduplicate reads based on their UMI and mapping coordinates</description>
3 <macros>
4 <import>macros.xml</import> <!-- NOTE(review): presumably defines the @VERSION@ token and the "requirements" and "citations" macros used in this file; confirm in macros.xml -->
5 </macros>
6 <expand macro="requirements">
7 <requirement version="1.6" type="package">samtools</requirement> <!-- needed for the samtools sort call at the end of the command -->
8 </expand>
9 <command detect_errors="exit_code"><![CDATA[
10 ## SAM input is passed as-is (with --in-sam below); BAM input is symlinked together with its index as input.bam / input.bam.bai
11 #if $input.is_of_type("sam"):
12 #set $input_file = $input
13 #else:
14 ln -sf '${input}' 'input.bam' &&
15 ln -sf '$input.metadata.bam_index' 'input.bam.bai' &&
16 #set $input_file = 'input.bam'
17 #end if
18 umi_tools dedup --random-seed 0 --extract-umi-method $extract_umi_method
19 ## --umi-tag only applies to tag extraction and --umi-separator only to read_id extraction; an empty separator is not passed so umi_tools keeps its own default
20 #if str($extract_umi_method) == 'tag':
21 --umi-tag '$umi_tag'
22 #elif str($umi_separator) != '':
23 --umi-separator '$umi_separator'
24 #end if
25 --method $method --edit-distance-threshold $edit_distance_threshold
26 $paired $spliced_is_unique --soft-clip-threshold $soft_clip_threshold $read_length $whole_contig --subset $subset $per_contig $per_gene
27 #if $gene_transcript_map:
28 --gene-transcript-map '$gene_transcript_map'
29 #end if
30 #if len(str($gene_tag)) > 0:
31 --gene-tag '$gene_tag'
32 #end if
33 #if $input.is_of_type("sam"):
34 --in-sam
35 #end if
36 -I '$input_file' -S deduped.bam &&
37 samtools sort deduped.bam -@ \${GALAXY_SLOTS:-1} -o '$output' -O BAM
38 ]]></command>
39 <inputs>
40 <param name="input" type="data" format="sam,bam" label="Reads to deduplicate in SAM or BAM format" />
41 <param name="extract_umi_method" argument="--extract-umi-method" type="select" label="How are the UMIs encoded in the read?">
42 <option value="read_id" selected="True">Read ID</option>
43 <option value="tag">Tag</option>
44 </param>
45 <param name="umi_separator" argument="--umi-separator" type="text" label="Separator between read id and UMI." help="Only used when the UMI is extracted from the read ID. Leave empty to use the umi_tools default separator." />
46 <param name="umi_tag" argument="--umi-tag" type="text" label="Tag which contains UMI." help="Only used when the UMI is extracted from a tag." />
47 <param argument="--method" type="select" label="Method used to identify PCR duplicates within reads." help="All methods start by identifying the reads with the same mapping position">
48 <option value="unique">Reads group share the exact same UMI</option>
49 <option value="percentile">Reads group share the exact same UMI. UMIs with counts less than 1% of the median counts for UMIs at the same position are ignored</option>
50 <option value="cluster">Identify clusters based on hamming distance</option>
51 <option value="adjacency">Identify clusters based on hamming distance and resolve networks by using the node counts</option>
52 <option value="directional">Identify clusters based on distance and counts, restrict network expansion by threshold</option>
53 </param>
54 <param name="edit_distance_threshold" argument="--edit-distance-threshold" type="integer" value="1" label="Edit distance threshold" help="For the adjacency and cluster methods the threshold for the edit distance to connect two UMIs in the network can be increased. The default value of 1 works best unless the UMI is very long (&gt;14bp)" />
55 <param argument="--paired" type="boolean" truevalue="--paired" falsevalue="" label="BAM is paired end" help="This will also force the use of the template length to determine reads with the same mapping coordinates." />
56 <param name="spliced_is_unique" argument="--spliced-is-unique" type="boolean" truevalue="--spliced-is-unique" falsevalue="" label="Spliced reads are unique" help="Causes two reads that start in the same position on the same strand and having the same UMI to be considered unique if one is spliced and the other is not. (Uses the 'N' cigar operation to test for splicing)" />
57 <param name="soft_clip_threshold" argument="--soft-clip-threshold" type="integer" value="4" label="Soft clip threshold" help="Mappers that soft clip, will sometimes do so rather than mapping a spliced read if there is only a small overhang over the exon junction. By setting this option, you can treat reads with at least this many bases soft-clipped at the 3' end as spliced." />
58 <param name="read_length" argument="--read-length" type="boolean" truevalue="--read-length" falsevalue="" label="Use the read length as a criterion when deduping" />
59 <param name="whole_contig" argument="--whole-contig" type="boolean" truevalue="--whole-contig" falsevalue="" label="Consider all alignments to a single contig together" help="This is useful if you have aligned to a transcriptome multi-fasta" />
60 <param argument="--subset" type="float" min="0.0" max="1.0" value="1.0" label="Only consider a random selection of the reads" />
61 <param argument="--chrom" type="boolean" truevalue="--chrom" falsevalue="" label="Only consider a single chromosome" /> <!-- NOTE(review): this parameter is never referenced in the command template, so ticking it has no effect; the umi_tools chrom option presumably expects a contig name rather than a bare flag, confirm before wiring it up -->
62 <param name="per_contig" argument="--per-contig" type="boolean" truevalue="--per-contig" falsevalue="" label="Deduplicate per contig" help="Field 3 in BAM; RNAME. All reads with the same contig will be considered to have the same alignment position. This is useful if your library prep generates PCR duplicates with non identical alignment positions such as CEL-Seq. In this case, you would align to a reference transcriptome with one transcript per gene" />
63 <param name="per_gene" argument="--per-gene" type="boolean" truevalue="--per-gene" falsevalue="" label="Deduplicate per gene" help="As above except with this option you can align to a reference transcriptome with more than one transcript per gene. You need to also provide a map of genes to transcripts. This will also add a metacontig ('MC') tag to the output BAM file." />
64 <param name="gene_transcript_map" argument="--gene-transcript-map" type="data" format="tabular" optional="True" label="Tabular file mapping genes to transcripts" />
65 <param name="gene_tag" argument="--gene-tag" type="text" optional="True" label="Deduplicate by this gene tag" help="As --per-gene except here the gene information is encoded in the bam read tag specified so you do not need to supply the mapping file." />
66 </inputs>
67 <outputs>
68 <data name="output" format="bam" /> <!-- written by the final samtools sort step of the command -->
69 </outputs>
70 <tests>
71 <test> <!-- SAM input, UMI taken from the read ID, "unique" method -->
72 <param name="input" ftype="sam" value="group_in1.sam" />
73 <param name="extract_umi_method" value="read_id" />
74 <param name="method" value="unique" />
75 <output name="output" ftype="bam" file="dedup_out1.bam" sort="True" />
76 </test>
77 <test> <!-- BAM input with the paired option enabled, "unique" method -->
78 <param name="input" ftype="bam" value="group_in2.bam" />
79 <param name="extract_umi_method" value="read_id" />
80 <param name="paired" value="True" />
81 <param name="method" value="unique" />
82 <output name="output" ftype="bam" file="dedup_out2.bam" sort="True" />
83 </test>
84 <test> <!-- BAM input, UMI taken from the read ID, "unique" method -->
85 <param name="input" ftype="bam" value="group_in3.bam" />
86 <param name="extract_umi_method" value="read_id" />
87 <param name="method" value="unique" />
88 <output name="output" ftype="bam" file="dedup_out3.bam" sort="True" />
89 </test>
90 <test> <!-- BAM input, UMI taken from the BX tag, "unique" method -->
91 <param name="input" ftype="bam" value="group_in4.bam" />
92 <param name="extract_umi_method" value="tag" />
93 <param name="umi_tag" value="BX" />
94 <param name="method" value="unique" />
95 <output name="output" ftype="bam" file="dedup_out4.bam" sort="True" />
96 </test>
97 <test> <!-- BAM input, read ID extraction, "cluster" method -->
98 <param name="input" ftype="bam" value="group_in5.bam" />
99 <param name="extract_umi_method" value="read_id" />
100 <param name="umi_tag" value="BX" />
101 <param name="method" value="cluster" />
102 <output name="output" ftype="bam" file="dedup_out5.bam" sort="True" />
103 </test>
104 <test> <!-- BAM input, read ID extraction, "directional" method -->
105 <param name="input" ftype="bam" value="group_in6.bam" />
106 <param name="extract_umi_method" value="read_id" />
107 <param name="umi_tag" value="BX" />
108 <param name="method" value="directional" />
109 <output name="output" ftype="bam" file="dedup_out6.bam" sort="True" />
110 </test>
111 </tests>
112 <help><![CDATA[
113 umi_tools dedup - Deduplicate reads based on their UMI
114 ======================================================
115
116 Purpose
117 -------
118
119 The purpose of this command is to deduplicate BAM files based on the first
120 mapping co-ordinate and the UMI attached to the read. It is assumed that the
121 FASTQ files were processed with extract_umi.py before mapping and thus the UMI
122 is the last word of the read name. e.g:
123
124 @HISEQ:87:00000000_AATT
125
126 where AATT is the UMI sequence.
127
128 If you have used an alternative method which does not separate the
129 read id and UMI with a "_", such as bcl2fastq which uses ":", you can
130 specify the separator with the option "--umi-separator=<sep>",
131 replacing <sep> with e.g ":".
132
133 Alternatively, if your UMIs are encoded in a tag, you can specify this
134 by setting the option --extract-umi-method=tag and set the tag name
135 with the --umi-tag option. For example, if your UMIs are encoded in
136 the 'UM' tag, provide the following options:
137 "--extract-umi-method=tag --umi-tag=UM"
138
139 The start position of a read is considered to be the start of its alignment
140 minus any soft clipped bases. A read aligned at position 500 with
141 cigar 2S98M will be assumed to start at position 498.
142
143
144 Methods
145 -------
146
147 dedup can be run with multiple methods to identify groups of reads with
148 the same (or similar) UMI(s). All methods start by identifying the
149 reads with the same mapping position.
150
151 The simplest method, "unique", groups reads with the exact same
152 UMI. The network-based methods, "cluster", "adjacency" and
153 "directional", build networks where nodes are UMIs and edges connect
154 UMIs with an edit distance <= threshold (usually 1). The groups of
155 reads are then defined from the network in a method-specific manner.
156
157 "unique"
158 Reads group share the exact same UMI
159
160 "percentile"
161 Reads group share the exact same UMI. UMIs with counts < 1% of the
162 median counts for UMIs at the same position are ignored.
163
164 "cluster"
165 Identify clusters of connected UMIs (based on hamming distance
166 threshold). Each network is a read group
167
168 "adjacency"
169 Cluster UMIs as above. For each cluster, select the node(UMI)
170 with the highest counts. Visit all nodes one edge away. If all
171 nodes have been visited, stop. Otherwise, repeat with remaining
172 nodes until all nodes have been visited. Each step
173 defines a read group.
174
175 "directional" (default)
176 Identify clusters of connected UMIs (based on hamming distance
177 threshold) and umi A counts >= (2* umi B counts) - 1. Each
178 network is a read group.
179
180 Options
181 -------
182
183 --extract-umi-method (choice)
184 How are the UMIs encoded in the read?
185
186 Options are:
187
188 - "read_id" (default)
189 UMIs contained at the end of the read separated as
190 specified with --umi-separator option
191
192 - "tag"
193 UMIs contained in a tag, see --umi-tag option
194
195 --umi-separator (string)
196 Separator between read id and UMI. See --extract-umi-method above
197
198 --umi-tag (string)
199 Tag which contains UMI. See --extract-umi-method above
200
201 --edit-distance-threshold (int)
202 For the adjacency and cluster methods the threshold for the
203 edit distance to connect two UMIs in the network can be
204 increased. The default value of 1 works best unless the UMI is
205 very long (>14bp)
206
207 --paired
208 BAM is paired end - output both read pairs. This will also
209 force the use of the template length to determine reads with
210 the same mapping coordinates.
211
212 --spliced-is-unique
213 Causes two reads that start in the same position on the same
214 strand and having the same UMI to be considered unique if one is
215 spliced and the other is not. (Uses the 'N' cigar operation to test
216 for splicing)
217
218 --soft-clip-threshold (int)
219 Mappers that soft clip, will sometimes do so rather than mapping a
220 spliced read if there is only a small overhang over the exon
221 junction. By setting this option, you can treat reads with at least
222 this many bases soft-clipped at the 3' end as spliced.
223
224 --multimapping-detection-method (string, choice)
225 If the sam/bam contains tags to identify multimapping reads, you can
226 specify which tag to use when selecting the best read at a given locus.
227 Supported tags are "NH", "X0" and "XT". If not specified, the read
228 with the highest mapping quality will be selected
229
230 --read-length
231 Use the read length as a criterion when deduping, e.g. for sRNA-Seq
232
233 --whole-contig
234 Consider all alignments to a single contig together. This is useful if
235 you have aligned to a transcriptome multi-fasta
236
237 --subset (float, [0-1])
238 Only consider a fraction of the reads, chosen at random. This is useful
239 for doing saturation analyses.
240
241 --chrom
242 Only consider a single chromosome. This is useful for debugging purposes
243
244 --per-contig (string)
245 Deduplicate per contig (field 3 in BAM; RNAME).
246 All reads with the same contig will be
247 considered to have the same alignment position. This is useful
248 if your library prep generates PCR duplicates with non identical
249 alignment positions such as CEL-Seq. In this case, you would
250 align to a reference transcriptome with one transcript per gene
251
252 --per-gene (string)
253 Deduplicate per gene. As above except with this option you can
254 align to a reference transcriptome with more than one transcript
255 per gene. You need to also provide --gene-transcript-map option.
256 This will also add a metacontig ('MC') tag to the reads if used
257 in conjunction with --output-bam
258
259 --gene-transcript-map (string)
260 File mapping genes to transcripts (tab separated), e.g:
261
262 gene1 transcript1
263 gene1 transcript2
264 gene2 transcript3
265
266 --gene-tag (string)
267 Deduplicate per gene. As per --per-gene except here the gene
268 information is encoded in the bam read tag specified so you do
269 not need to supply --gene-transcript-map
270
271 --output-bam (string, filename)
272 Output a tagged bam file to stdout or -S <filename>
273
274 -i, --in-sam/-o, --out-sam
275 By default, inputs are assumed to be in BAM format and outputs are written
276 in BAM format. Use these options to specify the use of SAM format for
277 inputs or outputs.
278
279 -I (string, filename) input file name
280 The input file must be sorted and indexed.
281
282 -S (string, filename) output file name
283
284 -L (string, filename) log file name
285
286 Usage
287 -----
288 umi_tools dedup -I infile.bam -S deduped.bam
289
290 ]]></help>
291 <expand macro="citations" /> <!-- NOTE(review): this macro is not defined in this file; presumably provided by the imported macros.xml -->
292 </tool>