comparison umi-tools_dedup.xml @ 12:4098ab380097 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/umi_tools commit bf6a3aa532e8f9d122da4c1e39f3e256ae587b79"
author iuc
date Mon, 13 Sep 2021 14:51:31 +0000
parents 7fa28eb10fed
children
comparison
equal deleted inserted replaced
11:7fa28eb10fed 12:4098ab380097
1 <tool id="umi_tools_dedup" name="UMI-tools deduplicate" version="@VERSION@+galaxy1"> 1 <tool id="umi_tools_dedup" name="UMI-tools deduplicate" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
2 <description>Extract UMI from fastq files</description> 2 <description>Extract UMI from fastq files</description>
3 <expand macro="bio_tools"/>
3 <macros> 4 <macros>
4 <import>macros.xml</import> 5 <import>macros.xml</import>
5 </macros> 6 </macros>
6 <expand macro="requirements"> 7 <expand macro="requirements">
7 <requirement type="package" version="1.9">samtools</requirement> 8 <requirement type="package" version="1.12">samtools</requirement>
8 </expand> 9 </expand>
9 <command detect_errors="exit_code"><![CDATA[ 10 <command detect_errors="exit_code"><![CDATA[
10 #if $input.is_of_type("sam"): 11 @LINK_SAM_BAM_INPUT@
11 #set $input_file = $input 12
12 #else: 13 echo $input.ext &&
13 ln -sf '${input}' 'input.bam' &&
14 ln -sf '$input.metadata.bam_index' 'input.bam.bai' &&
15 #set $input_file = 'input.bam'
16 #end if
17
18 umi_tools dedup 14 umi_tools dedup
19 '$output_stats_bool' 15 #if $output_stats_bool
20 --random-seed 0 16 --output-stats=stats_outputs
21 --extract-umi-method $extract_umi_method
22 #if str($extract_umi_method) != 'read_id':
23 --umi-separator '$umi_separator' --umi-tag '$umi_tag'
24 #end if 17 #end if
25 --method $method --edit-distance-threshold $edit_distance_threshold 18 @GROUPDEDUP_OPTIONS@
26 $paired $spliced_is_unique --soft-clip-threshold $soft_clip_threshold 19 @BARCODE_OPTIONS@
27 $read_length $whole_contig --subset $subset $per_contig $per_gene 20 @UMI_GROUPING_OPTIONS@
28 #if $gene_transcript_map: 21 @SAMBAM_OPTIONS@
29 --gene-transcript-map '$gene_transcript_map' 22 @FULLSC_OPTIONS@
30 #end if 23 @ADVANCED_OPTIONS@
31 #if len(str($gene_tag)) > 0: 24 -I '$input_file' -S deduped.bam
32 --gene-tag '$gene_tag' 25 ## TODO using samtools sort is a workaround, for the following error that appears when Galaxy
33 #end if 26 ## compares the generated file with the one in test-data
34 #if $input.is_of_type("sam"): 27 ## `Converting history BAM to SAM failed: 'samtools returned with error 1: stdout=None, stderr=[main_samview] fail to read the header from "/tmp/tmpd8o61jykdedup_out6.bam".\n'. Will compare BAM files`
35 --in-sam 28 ## problem seems to be the BAM file generated with pysam
36 #end if 29 ## may be dropped in the future
37 -I '$input_file' -S deduped.bam && 30 --no-sort-output
38 samtools sort deduped.bam -@ \${GALAXY_SLOTS:-1} -T "\${TMPDIR:-.}" -o '$output' -O BAM 31 @LOG@
32 && samtools sort --no-PG deduped.bam -@ \${GALAXY_SLOTS:-1} -T "\${TMPDIR:-.}" -o '$output' -O BAM
33
39 ]]></command> 34 ]]></command>
40 <inputs> 35 <inputs>
41 <param name="input" type="data" format="sam,bam" label="Reads to deduplicate in SAM or BAM format" /> 36 <param name="input" type="data" format="sam,bam" label="Reads to deduplicate in SAM or BAM format" />
42 <param name="extract_umi_method" argument="--extract-umi-method" type="select"> 37 <param name="output_stats_bool" type="boolean" checked="false" label="Output UMI related statistics files?"/>
43 <option value="read_id" selected="True">Read ID</option> 38 <expand macro="groupdedup_options_macro"/>
44 <option value="tag">Tag</option> 39 <expand macro="barcode_options_macro"/>
45 </param> 40 <expand macro="umi_grouping_options_macro"/>
46 <param name="umi_separator" argument="--umi-separator" type="text" label="Separator between read id and UMI." help="Ignored unless extracting by tag" /> 41 <expand macro="sambam_options_macro"/>
47 <param name="umi_tag" argument="--umi-tag" type="text" label="Tag which contains UMI." /> 42 <expand macro="fullsc_options_macro"/>
48 <param argument="--method" type="select" label="Method used to identify PCR duplicates within reads." help="All methods start by identifying the reads with the same mapping position"> 43 <expand macro="advanced_options_macro"/>
49 <option value="unique">Reads group share the exact same UMI</option> 44 <expand macro="log_input_macro"/>
50 <option value="percentile">Reads group share the exact same UMI. UMIs with counts less than 1% of the median counts for UMIs at the same position are ignored</option>
51 <option value="cluster">Identify clusters based on hamming distance</option>
52 <option value="adjacency">Identify clusters based on hamming distance and resolve networks by using the node counts</option>
53 <option value="directional">Identify clusters based on distance and counts, restrict network expansion by threshold</option>
54 </param>
55 <param name="edit_distance_threshold" argument="--edit-distance-threshold" type="integer" value="1" label="Edit distance threshold" help="For the adjacency and cluster methods the threshold for the edit distance to connect two UMIs in the network can be increased. The default value of 1 works best unless the UMI is very long (&gt;14bp)" />
56 <param argument="--paired" type="boolean" truevalue="--paired" falsevalue="" label="BAM is paired end" help="This will also force the use of the template length to determine reads with the same mapping coordinates." />
57 <param name="spliced_is_unique" argument="--spliced-is-unique" type="boolean" truevalue="--spliced-is-unique" falsevalue="" label="Spliced reads are unique" help="Causes two reads that start in the same position on the same strand and having the same UMI to be considered unique if one is spliced and the other is not. (Uses the 'N' cigar operation to test for splicing)" />
58 <param name="soft_clip_threshold" argument="--soft-clip-threshold" type="integer" value="4" label="Soft clip threshold" help="Mappers that soft clip, will sometimes do so rather than mapping a spliced read if there is only a small overhang over the exon junction. By setting this option, you can treat reads with at least this many bases soft-clipped at the 3' end as spliced." />
59 <param name="read_length" argument="--read-length" type="boolean" truevalue="--read-length" falsevalue="" label="Use the read length as as a criterion when deduping" />
60 <param name="whole_contig" argument="--whole-contig" type="boolean" truevalue="--whole-contig" falsevalue="" label="Consider all alignments to a single contig together" help="This is useful if you have aligned to a transcriptome multi-fasta" />
61 <param argument="--subset" type="float" min="0.0" max="1.0" value="1.0" label="Only consider a random selection of the reads" />
62 <param argument="--chrom" type="boolean" truevalue="--chrom" falsevalue="" label="Only consider a single chromosome" />
63 <param name="per_contig" argument="--per-contig" type="boolean" truevalue="--per-contig" falsevalue="" label="Deduplicate per contig" help="Field 3 in BAM; RNAME. All reads with the same contig will be considered to have the same alignment position. This is useful if your library prep generates PCR duplicates with non identical alignment positions such as CEL-Seq. In this case, you would align to a reference transcriptome with one transcript per gene" />
64 <param name="per_gene" argument="--per-gene" type="boolean" truevalue="--per-gene" falsevalue="" label="Deduplicate per gene" help="As above except with this option you can align to a reference transcriptome with more than one transcript per gene. You need to also provide a map of genes to transcripts. This will also add a metacontig ('MC') tag to the output BAM file." />
65 <param name="gene_transcript_map" argument="--gene-transcript-map" type="data" format="tabular" optional="True" label="Tabular file mapping genes to transripts" />
66 <param name="gene_tag" argument="--gene-tag" type="text" optional="True" label="Deduplicate by this gene tag" help="As --per-gene except here the gene information is encoded in the bam read tag specified so you do not need to supply the mapping file." />
67 <param name="output_stats_bool" type="boolean" truevalue="--output-stats=stats_outputs" falsevalue="" checked="false" label="Output UMI related statistics files?"/>
68 </inputs> 45 </inputs>
69 <outputs> 46 <outputs>
70 <data format="bam" name="output" /> 47 <data format="bam" name="output" />
71 <collection name="output_stats" type="list" label="UMI_tools dedup stats"> 48 <collection name="output_stats" type="list" label="${tool.name} on ${on_string} stats">
72 <filter>output_stats_bool</filter> 49 <filter>output_stats_bool</filter>
73 <data name="edit_distance" format="tabular" from_work_dir="stats_outputs_edit_distance.tsv"/> 50 <data name="edit_distance" format="tabular" from_work_dir="stats_outputs_edit_distance.tsv"/>
74 <data name="per_umi" format="tabular" from_work_dir="stats_outputs_per_umi.tsv"/> 51 <data name="per_umi" format="tabular" from_work_dir="stats_outputs_per_umi.tsv"/>
75 <data name="per_umi_per_position" format="tabular" from_work_dir="stats_outputs_per_umi_per_position.tsv"/> 52 <data name="per_umi_per_position" format="tabular" from_work_dir="stats_outputs_per_umi_per_position.tsv"/>
76 </collection> 53 </collection>
54 <expand macro="log_output_macro"/>
77 </outputs> 55 </outputs>
78 <tests> 56 <tests>
79 <test expect_num_outputs="1"> 57 <test expect_num_outputs="1">
80 <param name="input" value="group_in1.sam" ftype="sam" /> 58 <param name="input" value="group_in1.sam" ftype="sam" />
81 <param name="extract_umi_method" value="read_id" /> 59 <section name="advanced">
82 <param name="method" value="unique" /> 60 <param name="random_seed" value="0" />
83 <output name="output" file="dedup_out1.bam" ftype="bam" sort="True"/> 61 </section>
84 </test> 62 <conditional name="bc">
85 <test expect_num_outputs="1"> 63 <param name="extract_umi_method" value="read_id" />
86 <param name="input" value="group_in2.bam" ftype="bam" /> 64 </conditional>
87 <param name="extract_umi_method" value="read_id" /> 65 <section name="umi">
88 <param name="paired" value="True" /> 66 <param name="method" value="unique" />
89 <param name="method" value="unique" /> 67 </section>
90 <output name="output" file="dedup_out2.bam" ftype="bam" sort="True" /> 68 <output name="output" file="dedup_out1.bam" ftype="bam" lines_diff="2"/><!--lines_diff won't be needed in later versions since umitools use \-\-no-PG internally -->
69 </test>
70 <test expect_num_outputs="1">
71 <param name="input" value="group_in2.sam" ftype="sam" />
72 <section name="advanced">
73 <param name="random_seed" value="0" />
74 </section>
75 <conditional name="bc">
76 <param name="extract_umi_method" value="read_id" />
77 </conditional>
78 <section name="sambam">
79 <param name="paired" value="true" />
80 </section>
81 <section name="umi">
82 <param name="method" value="unique" />
83 </section>
84 <output name="output" file="dedup_out2.bam" ftype="bam" lines_diff="2" />
91 </test> 85 </test>
92 <test expect_num_outputs="1"> 86 <test expect_num_outputs="1">
93 <param name="input" value="group_in3.bam" ftype="bam" /> 87 <param name="input" value="group_in3.bam" ftype="bam" />
94 <param name="extract_umi_method" value="read_id" /> 88 <section name="advanced">
95 <param name="method" value="unique" /> 89 <param name="random_seed" value="0" />
96 <output name="output" file="dedup_out3.bam" ftype="bam" sort="True" /> 90 </section>
91 <conditional name="bc">
92 <param name="extract_umi_method" value="read_id" />
93 </conditional>
94 <section name="umi">
95 <param name="method" value="unique" />
96 </section>
97 <output name="output" file="dedup_out3.bam" ftype="bam" lines_diff="2" />
97 </test> 98 </test>
98 <test expect_num_outputs="1"> 99 <test expect_num_outputs="1">
99 <param name="input" value="group_in4.bam" ftype="bam" /> 100 <param name="input" value="group_in4.bam" ftype="bam" />
100 <param name="extract_umi_method" value="tag" /> 101 <section name="advanced">
101 <param name="umi_tag" value="BX" /> 102 <param name="random_seed" value="0" />
102 <param name="method" value="unique" /> 103 </section>
103 <output name="output" file="dedup_out4.bam" ftype="bam" sort="True" /> 104 <conditional name="bc">
105 <param name="extract_umi_method" value="tag" />
106 <param name="umi_tag" value="BX" />
107 </conditional>
108 <section name="umi">
109 <param name="method" value="unique" />
110 </section>
111 <output name="output" file="dedup_out4.bam" ftype="bam" lines_diff="2"/>
104 </test> 112 </test>
105 <test expect_num_outputs="1"> 113 <test expect_num_outputs="1">
106 <param name="input" value="group_in5.bam" ftype="bam" /> 114 <param name="input" value="group_in5.bam" ftype="bam" />
107 <param name="extract_umi_method" value="read_id" /> 115 <section name="advanced">
108 <param name="umi_tag" value="BX" /> 116 <param name="random_seed" value="0" />
109 <param name="method" value="cluster" /> 117 </section>
110 <output name="output" file="dedup_out5.bam" ftype="bam" sort="True" /> 118 <conditional name="bc">
119 <param name="extract_umi_method" value="read_id" />
120 <param name="umi_tag" value="BX" />
121 </conditional>
122 <section name="umi">
123 <param name="method" value="cluster" />
124 </section>
125 <output name="output" file="dedup_out5.bam" ftype="bam" lines_diff="2"/>
111 </test> 126 </test>
112 <test expect_num_outputs="1"> 127 <test expect_num_outputs="1">
113 <param name="input" value="group_in6.bam" ftype="bam" /> 128 <param name="input" value="group_in6.bam" ftype="bam" />
114 <param name="extract_umi_method" value="read_id" /> 129 <section name="advanced">
115 <param name="umi_tag" value="BX" /> 130 <param name="random_seed" value="0" />
116 <param name="method" value="directional" /> 131 </section>
117 <output name="output" file="dedup_out6.bam" ftype="bam" sort="True" /> 132 <conditional name="bc">
133 <param name="extract_umi_method" value="read_id" />
134 <param name="umi_tag" value="BX" />
135 </conditional>
136 <section name="umi">
137 <param name="method" value="directional" />
138 </section>
139 <output name="output" file="dedup_out6.bam" ftype="bam" lines_diff="2"/>
118 </test> 140 </test>
119 <test expect_num_outputs="5"> 141 <test expect_num_outputs="5">
120 <param name="input" value="group_in6.bam" ftype="bam" /> 142 <param name="input" value="group_in6.bam" ftype="bam" />
121 <param name="extract_umi_method" value="read_id" /> 143 <section name="advanced">
122 <param name="umi_tag" value="BX" /> 144 <param name="random_seed" value="0" />
123 <param name="method" value="directional" /> 145 </section>
146 <conditional name="bc">
147 <param name="extract_umi_method" value="read_id" />
148 <param name="umi_tag" value="BX" />
149 </conditional>
150 <section name="umi">
151 <param name="method" value="directional" />
152 </section>
124 <param name="output_stats_bool" value="true"/> 153 <param name="output_stats_bool" value="true"/>
125 <output name="output" file="dedup_out6.bam" ftype="bam" sort="True" /> 154 <output name="output" file="dedup_out6.bam" ftype="bam" lines_diff="2"/>
126 <output_collection name="output_stats"> 155 <output_collection name="output_stats">
127 <element name="edit_distance" file="stats_outputs_edit_distance.tsv" /> 156 <element name="edit_distance" file="stats_outputs_edit_distance.tsv" />
128 <element name="per_umi" file="stats_outputs_per_umi.tsv" /> 157 <element name="per_umi" file="stats_outputs_per_umi.tsv" />
129 <element name="per_umi_per_position" file="stats_outputs_per_umi_per_position.tsv" /> 158 <element name="per_umi_per_position" file="stats_outputs_per_umi_per_position.tsv" />
130 </output_collection> 159 </output_collection>
131 </test> 160 </test>
132 </tests> 161 </tests>
133 <help><![CDATA[ 162 <help><![CDATA[
134 umi_tools dedup - Deduplicate reads based on their UMI 163 umi_tools dedup - Deduplicate reads based on their UMI and mapping coordinates
135 ====================================================== 164 ==============================================================================
136 165
137 Purpose 166 Purpose
138 ------- 167 -------
139 168
140 The purpose of this command is to deduplicate BAM files based on the first 169 The purpose of this command is to deduplicate BAM files based on the first
141 mapping co-ordinate and the UMI attached to the read. It is assumed that the 170 mapping co-ordinate and the UMI attached to the read.
142 FASTQ files were processed with extract_umi.py before mapping and thus the UMI 171
143 is the last word of the read name. e.g: 172 @BARCODE_HELP@
144 173
145 @HISEQ:87:00000000_AATT 174 @UMI_GROUPING_HELP@
146 175
147 where AATT is the UMI sequence. 176 Selecting the representative read
148 177 ---------------------------------
149 If you have used an alternative method which does not separate the 178 For every group of duplicate reads, a single representative read is
150 read id and UMI with a "_", such as bcl2fastq which uses ":", you can 179 retained. The following criteria are applied to select the read that
151 specify the separator with the option "--umi-separator=<sep>", 180 will be retained from a group of duplicated reads:
152 replacing <sep> with e.g ":". 181
153 182 1. The read with the lowest number of mapping coordinates (see
154 Alternatively, if your UMIs are encoded in a tag, you can specify this 183 ``--multimapping-detection-method`` option)
155 by setting the option --extract-umi-method=tag and set the tag name 184
156 with the --umi-tag option. For example, if your UMIs are encoded in 185 2. The read with the highest mapping quality. Note that this is not
157 the 'UM' tag, provide the following options: 186 the read sequencing quality and that if two reads have the same
158 "--extract-umi-method=tag --umi-tag=UM" 187 mapping quality then one will be picked at random regardless of the
159 188 read quality.
160 The start position of a read is considered to be the start of its alignment 189
161 minus any soft clipped bases. A read aligned at position 500 with 190 Otherwise a read is chosen at random.
162 cigar 2S98M will be assumed to start at position 498. 191
163 192 Optional statistics output
164 193 --------------------------
165 Methods 194
166 ------- 195 One can use the edit distance between UMIs at the same position as a
167 196 quality control for the deduplication process by comparing with
168 dedup can be run with multiple methods to identify groups of reads with 197 a null expectation of random sampling. For the random sampling, the
169 the same (or similar) UMI(s). All methods start by identifying the 198 observed frequency of UMIs is used to more reasonably model the null
170 reads with the same mapping position. 199 expectation.
171 200
172 The simplest method, "unique", groups reads with the exact same 201 Use the option ``Output UMI related statistics files?`` to generate stats outfiles:
173 UMI. The network-based methods, "cluster", "adjacency" and 202
174 "directional", build networks where nodes are UMIs and edges connect 203 edit_distance
175 UMIs with an edit distance <= threshold (usually 1). The groups of 204 Reports the (binned) average edit distance between the UMIs at each
176 reads are then defined from the network in a method-specific manner. 205 position. Positions with a single UMI are reported separately. The
177 206 edit distances are reported pre- and post-deduplication alongside
178 "unique" 207 the null expectation from random sampling of UMIs from the UMIs
179 Reads group share the exact same UMI 208 observed across all positions. Note that separate null
180 209 distributions are reported since the null depends on the observed
181 "percentile" 210 frequency of each UMI which is different pre- and
182 Reads group share the exact same UMI. UMIs with counts < 1% of the 211 post-deduplication. The post-duplication values should be closer to
183 median counts for UMIs at the same position are ignored. 212 their respective null than the pre-deduplication vs null comparison
184 213
185 "cluster" 214 In addition, this option will trigger reporting of further summary
186 Identify clusters of connected UMIs (based on hamming distance 215 statistics for the UMIs which may be informative for selecting the
187 threshold). Each network is a read group 216 optimal deduplication method or debugging.
188 217
189 "adjacency" 218 Each unique UMI sequence may be observed [0-many] times at multiple
190 Cluster UMIs as above. For each cluster, select the node(UMI) 219 positions in the BAM. The following files report the distribution for
191 with the highest counts. Visit all nodes one edge away. If all 220 the frequencies of each UMI.
192 nodes have been visited, stop. Otherwise, repeat with remaining 221
193 nodes until all nodes have been visited. Each step 222 per_umi_per_position
194 defines a read group. 223 The `_stats_per_umi_per_position.tsv` file simply tabulates the
195 224 counts for unique combinations of UMI and position. E.g if prior to
196 "directional" (default) 225 deduplication, we have two positions in the BAM (POSa, POSb), at
197 Identify clusters of connected UMIs (based on hamming distance 226 POSa we have observed 2*UMIa, 1*UMIb and at POSb: 1*UMIc, 3*UMId,
198 threshold) and umi A counts >= (2* umi B counts) - 1. Each 227 then the stats file is populated thus:
199 network is a read group. 228
200 229 ====== =============
201 Options 230 counts instances_pre
202 ------- 231 ------ -------------
203 232 1 2
204 --extract-umi-method (choice) 233 2 1
205 How are the UMIs encoded in the read? 234 3 1
206 235 ====== =============
207 Options are: 236
208 237 If post deduplication, UMIb is grouped with UMIa such that POSa:
209 - "read_id" (default) 238 3*UMIa, then the `instances_post` column is populated thus:
210 UMIs contained at the end of the read separated as 239
211 specified with --umi-separator option 240 ====== ============= ==============
212 241 counts instances_pre instances_post
213 - "tag" 242 ------ ------------- --------------
214 UMIs contained in a tag, see --umi-tag option 243 1 2 1
215 244 2 1 0
216 --umi-separator (string) 245 3 1 2
217 Separator between read id and UMI. See --extract-umi-method above 246 ====== ============= ==============
218 247
219 --umi-tag (string) 248 per_umi
220 Tag which contains UMI. See --extract-umi-method above 249 The `_stats_per_umi.tsv` table provides UMI-level summary
221 250 statistics. Keeping in mind that each unique UMI sequence can be
222 --edit-distance-threshold (int) 251 observed [0-many] times across multiple positions in the BAM,
223 For the adjacency and cluster methods the threshold for the 252
224 edit distance to connect two UMIs in the network can be 253 :times_observed: How many positions the UMI was observed at
225 increased. The default value of 1 works best unless the UMI is 254 :total_counts: The total number of times the UMI was observed across all positions
226 very long (>14bp) 255 :median_counts: The median for the distribution of how often the UMI was observed at each position (excluding zeros)
227 256
228 --paired 257 Hence, whenever times_observed=1, total_counts==median_counts.]]></help>
229 BAM is paired end - output both read pairs. This will also
230 force the use of the template length to determine reads with
231 the same mapping coordinates.
232
233 --spliced-is-unique
234 Causes two reads that start in the same position on the same
235 strand and having the same UMI to be considered unique if one is
236 spliced and the other is not. (Uses the 'N' cigar operation to test
237 for splicing)
238
239 --soft-clip-threshold (int)
240 Mappers that soft clip, will sometimes do so rather than mapping a
241 spliced read if there is only a small overhang over the exon
242 junction. By setting this option, you can treat reads with at least
243 this many bases soft-clipped at the 3' end as spliced.
244
245 --multimapping-detection-method (string, choice)
246 If the sam/bam contains tags to identify multimapping reads, you can
247 specify for use when selecting the best read at a given loci.
248 Supported tags are "NH", "X0" and "XT". If not specified, the read
249 with the highest mapping quality will be selected
250
251 --read-length
252 Use the read length as a criterion when deduping, e.g. for sRNA-Seq
253
254 --whole-contig
255 Consider all alignments to a single contig together. This is useful if
256 you have aligned to a transcriptome multi-fasta
257
258 --subset (float, [0-1])
259 Only consider a fraction of the reads, chosen at random. This is useful
260 for doing saturation analyses.
261
262 --chrom
263 Only consider a single chromosome. This is useful for debugging purposes
264
265 --per-contig (string)
266 Deduplicate per contig (field 3 in BAM; RNAME).
267 All reads with the same contig will be
268 considered to have the same alignment position. This is useful
269 if your library prep generates PCR duplicates with non identical
270 alignment positions such as CEL-Seq. In this case, you would
271 align to a reference transcriptome with one transcript per gene
272
273 --per-gene (string)
274 Deduplicate per gene. As above except with this option you can
275 align to a reference transcriptome with more than one transcript
276 per gene. You need to also provide --gene-transcript-map option.
277 This will also add a metacontig ('MC') tag to the reads if used
278 in conjunction with --output-bam
279
280 --gene-transcript-map (string)
281 File mapping genes to transcripts (tab separated), e.g:
282
283 gene1 transcript1
284 gene1 transcript2
285 gene2 transcript3
286
287 --gene-tag (string)
288 Deduplicate per gene. As per --per-gene except here the gene
289 information is encoded in the bam read tag specified so you do
290 not need to supply --gene-transcript-map
291
292 --output-bam (string, filename)
293 Output a tagged bam file to stdout or -S <filename>
294
295 -i, --in-sam/-o, --out-sam
296 By default, inputs are assumed to be in BAM format and output are output
297 in BAM format. Use these options to specify the use of SAM format for
298 inputs or outputs.
299
300 -I (string, filename) input file name
301 The input file must be sorted and indexed.
302
303 -S (string, filename) output file name
304
305 -L (string, filename) log file name
306
307 Usage
308 -----
309 umi_tools dedup -I infile.bam -S grouped.bam --
310
311 ]]></help>
312 <expand macro="citations" /> 258 <expand macro="citations" />
313 </tool> 259 </tool>