Mercurial > repos > iuc > umi_tools_dedup
comparison umi-tools_dedup.xml @ 12:4098ab380097 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/umi_tools commit bf6a3aa532e8f9d122da4c1e39f3e256ae587b79"
author | iuc |
---|---|
date | Mon, 13 Sep 2021 14:51:31 +0000 |
parents | 7fa28eb10fed |
children |
comparison
equal
deleted
inserted
replaced
11:7fa28eb10fed | 12:4098ab380097 |
---|---|
1 <tool id="umi_tools_dedup" name="UMI-tools deduplicate" version="@VERSION@+galaxy1"> | 1 <tool id="umi_tools_dedup" name="UMI-tools deduplicate" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@"> |
2 <description>Extract UMI from fastq files</description> | 2 <description>Extract UMI from fastq files</description> |
3 <expand macro="bio_tools"/> | |
3 <macros> | 4 <macros> |
4 <import>macros.xml</import> | 5 <import>macros.xml</import> |
5 </macros> | 6 </macros> |
6 <expand macro="requirements"> | 7 <expand macro="requirements"> |
7 <requirement type="package" version="1.9">samtools</requirement> | 8 <requirement type="package" version="1.12">samtools</requirement> |
8 </expand> | 9 </expand> |
9 <command detect_errors="exit_code"><![CDATA[ | 10 <command detect_errors="exit_code"><![CDATA[ |
10 #if $input.is_of_type("sam"): | 11 @LINK_SAM_BAM_INPUT@ |
11 #set $input_file = $input | 12 |
12 #else: | 13 echo $input.ext && |
13 ln -sf '${input}' 'input.bam' && | |
14 ln -sf '$input.metadata.bam_index' 'input.bam.bai' && | |
15 #set $input_file = 'input.bam' | |
16 #end if | |
17 | |
18 umi_tools dedup | 14 umi_tools dedup |
19 '$output_stats_bool' | 15 #if $output_stats_bool |
20 --random-seed 0 | 16 --output-stats=stats_outputs |
21 --extract-umi-method $extract_umi_method | |
22 #if str($extract_umi_method) != 'read_id': | |
23 --umi-separator '$umi_separator' --umi-tag '$umi_tag' | |
24 #end if | 17 #end if |
25 --method $method --edit-distance-threshold $edit_distance_threshold | 18 @GROUPDEDUP_OPTIONS@ |
26 $paired $spliced_is_unique --soft-clip-threshold $soft_clip_threshold | 19 @BARCODE_OPTIONS@ |
27 $read_length $whole_contig --subset $subset $per_contig $per_gene | 20 @UMI_GROUPING_OPTIONS@ |
28 #if $gene_transcript_map: | 21 @SAMBAM_OPTIONS@ |
29 --gene-transcript-map '$gene_transcript_map' | 22 @FULLSC_OPTIONS@ |
30 #end if | 23 @ADVANCED_OPTIONS@ |
31 #if len(str($gene_tag)) > 0: | 24 -I '$input_file' -S deduped.bam |
32 --gene-tag '$gene_tag' | 25 ## TODO using samtools sort is a workaround, for the following error that appears when Galaxy |
33 #end if | 26 ## compares the generated file with the one in test-data |
34 #if $input.is_of_type("sam"): | 27 ## `Converting history BAM to SAM failed: 'samtools returned with error 1: stdout=None, stderr=[main_samview] fail to read the header from "/tmp/tmpd8o61jykdedup_out6.bam".\n'. Will compare BAM files` |
35 --in-sam | 28 ## problem seems to be the BAM file generated with pysam |
36 #end if | 29 ## may be dropped in the future |
37 -I '$input_file' -S deduped.bam && | 30 --no-sort-output |
38 samtools sort deduped.bam -@ \${GALAXY_SLOTS:-1} -T "\${TMPDIR:-.}" -o '$output' -O BAM | 31 @LOG@ |
32 && samtools sort --no-PG deduped.bam -@ \${GALAXY_SLOTS:-1} -T "\${TMPDIR:-.}" -o '$output' -O BAM | |
33 | |
39 ]]></command> | 34 ]]></command> |
40 <inputs> | 35 <inputs> |
41 <param name="input" type="data" format="sam,bam" label="Reads to deduplicate in SAM or BAM format" /> | 36 <param name="input" type="data" format="sam,bam" label="Reads to deduplicate in SAM or BAM format" /> |
42 <param name="extract_umi_method" argument="--extract-umi-method" type="select"> | 37 <param name="output_stats_bool" type="boolean" checked="false" label="Output UMI related statistics files?"/> |
43 <option value="read_id" selected="True">Read ID</option> | 38 <expand macro="groupdedup_options_macro"/> |
44 <option value="tag">Tag</option> | 39 <expand macro="barcode_options_macro"/> |
45 </param> | 40 <expand macro="umi_grouping_options_macro"/> |
46 <param name="umi_separator" argument="--umi-separator" type="text" label="Separator between read id and UMI." help="Ignored unless extracting by tag" /> | 41 <expand macro="sambam_options_macro"/> |
47 <param name="umi_tag" argument="--umi-tag" type="text" label="Tag which contains UMI." /> | 42 <expand macro="fullsc_options_macro"/> |
48 <param argument="--method" type="select" label="Method used to identify PCR duplicates within reads." help="All methods start by identifying the reads with the same mapping position"> | 43 <expand macro="advanced_options_macro"/> |
49 <option value="unique">Reads group share the exact same UMI</option> | 44 <expand macro="log_input_macro"/> |
50 <option value="percentile">Reads group share the exact same UMI. UMIs with counts less than 1% of the median counts for UMIs at the same position are ignored</option> | |
51 <option value="cluster">Identify clusters based on hamming distance</option> | |
52 <option value="adjacency">Identify clusters based on hamming distance and resolve networks by using the node counts</option> | |
53 <option value="directional">Identify clusters based on distance and counts, restrict network expansion by threshold</option> | |
54 </param> | |
55 <param name="edit_distance_threshold" argument="--edit-distance-threshold" type="integer" value="1" label="Edit distance threshold" help="For the adjacency and cluster methods the threshold for the edit distance to connect two UMIs in the network can be increased. The default value of 1 works best unless the UMI is very long (>14bp)" /> | |
56 <param argument="--paired" type="boolean" truevalue="--paired" falsevalue="" label="BAM is paired end" help="This will also force the use of the template length to determine reads with the same mapping coordinates." /> | |
57 <param name="spliced_is_unique" argument="--spliced-is-unique" type="boolean" truevalue="--spliced-is-unique" falsevalue="" label="Spliced reads are unique" help="Causes two reads that start in the same position on the same strand and having the same UMI to be considered unique if one is spliced and the other is not. (Uses the 'N' cigar operation to test for splicing)" /> | |
58 <param name="soft_clip_threshold" argument="--soft-clip-threshold" type="integer" value="4" label="Soft clip threshold" help="Mappers that soft clip, will sometimes do so rather than mapping a spliced read if there is only a small overhang over the exon junction. By setting this option, you can treat reads with at least this many bases soft-clipped at the 3' end as spliced." /> | |
59 <param name="read_length" argument="--read-length" type="boolean" truevalue="--read-length" falsevalue="" label="Use the read length as as a criterion when deduping" /> | |
60 <param name="whole_contig" argument="--whole-contig" type="boolean" truevalue="--whole-contig" falsevalue="" label="Consider all alignments to a single contig together" help="This is useful if you have aligned to a transcriptome multi-fasta" /> | |
61 <param argument="--subset" type="float" min="0.0" max="1.0" value="1.0" label="Only consider a random selection of the reads" /> | |
62 <param argument="--chrom" type="boolean" truevalue="--chrom" falsevalue="" label="Only consider a single chromosome" /> | |
63 <param name="per_contig" argument="--per-contig" type="boolean" truevalue="--per-contig" falsevalue="" label="Deduplicate per contig" help="Field 3 in BAM; RNAME. All reads with the same contig will be considered to have the same alignment position. This is useful if your library prep generates PCR duplicates with non identical alignment positions such as CEL-Seq. In this case, you would align to a reference transcriptome with one transcript per gene" /> | |
64 <param name="per_gene" argument="--per-gene" type="boolean" truevalue="--per-gene" falsevalue="" label="Deduplicate per gene" help="As above except with this option you can align to a reference transcriptome with more than one transcript per gene. You need to also provide a map of genes to transcripts. This will also add a metacontig ('MC') tag to the output BAM file." /> | |
65 <param name="gene_transcript_map" argument="--gene-transcript-map" type="data" format="tabular" optional="True" label="Tabular file mapping genes to transripts" /> | |
66 <param name="gene_tag" argument="--gene-tag" type="text" optional="True" label="Deduplicate by this gene tag" help="As --per-gene except here the gene information is encoded in the bam read tag specified so you do not need to supply the mapping file." /> | |
67 <param name="output_stats_bool" type="boolean" truevalue="--output-stats=stats_outputs" falsevalue="" checked="false" label="Output UMI related statistics files?"/> | |
68 </inputs> | 45 </inputs> |
69 <outputs> | 46 <outputs> |
70 <data format="bam" name="output" /> | 47 <data format="bam" name="output" /> |
71 <collection name="output_stats" type="list" label="UMI_tools dedup stats"> | 48 <collection name="output_stats" type="list" label="${tool.name} on ${on_string} stats"> |
72 <filter>output_stats_bool</filter> | 49 <filter>output_stats_bool</filter> |
73 <data name="edit_distance" format="tabular" from_work_dir="stats_outputs_edit_distance.tsv"/> | 50 <data name="edit_distance" format="tabular" from_work_dir="stats_outputs_edit_distance.tsv"/> |
74 <data name="per_umi" format="tabular" from_work_dir="stats_outputs_per_umi.tsv"/> | 51 <data name="per_umi" format="tabular" from_work_dir="stats_outputs_per_umi.tsv"/> |
75 <data name="per_umi_per_position" format="tabular" from_work_dir="stats_outputs_per_umi_per_position.tsv"/> | 52 <data name="per_umi_per_position" format="tabular" from_work_dir="stats_outputs_per_umi_per_position.tsv"/> |
76 </collection> | 53 </collection> |
54 <expand macro="log_output_macro"/> | |
77 </outputs> | 55 </outputs> |
78 <tests> | 56 <tests> |
79 <test expect_num_outputs="1"> | 57 <test expect_num_outputs="1"> |
80 <param name="input" value="group_in1.sam" ftype="sam" /> | 58 <param name="input" value="group_in1.sam" ftype="sam" /> |
81 <param name="extract_umi_method" value="read_id" /> | 59 <section name="advanced"> |
82 <param name="method" value="unique" /> | 60 <param name="random_seed" value="0" /> |
83 <output name="output" file="dedup_out1.bam" ftype="bam" sort="True"/> | 61 </section> |
84 </test> | 62 <conditional name="bc"> |
85 <test expect_num_outputs="1"> | 63 <param name="extract_umi_method" value="read_id" /> |
86 <param name="input" value="group_in2.bam" ftype="bam" /> | 64 </conditional> |
87 <param name="extract_umi_method" value="read_id" /> | 65 <section name="umi"> |
88 <param name="paired" value="True" /> | 66 <param name="method" value="unique" /> |
89 <param name="method" value="unique" /> | 67 </section> |
90 <output name="output" file="dedup_out2.bam" ftype="bam" sort="True" /> | 68 <output name="output" file="dedup_out1.bam" ftype="bam" lines_diff="2"/><!--lines_diff won't be needed in later versions since umitools use \-\-no-PG internally --> |
69 </test> | |
70 <test expect_num_outputs="1"> | |
71 <param name="input" value="group_in2.sam" ftype="sam" /> | |
72 <section name="advanced"> | |
73 <param name="random_seed" value="0" /> | |
74 </section> | |
75 <conditional name="bc"> | |
76 <param name="extract_umi_method" value="read_id" /> | |
77 </conditional> | |
78 <section name="sambam"> | |
79 <param name="paired" value="true" /> | |
80 </section> | |
81 <section name="umi"> | |
82 <param name="method" value="unique" /> | |
83 </section> | |
84 <output name="output" file="dedup_out2.bam" ftype="bam" lines_diff="2" /> | |
91 </test> | 85 </test> |
92 <test expect_num_outputs="1"> | 86 <test expect_num_outputs="1"> |
93 <param name="input" value="group_in3.bam" ftype="bam" /> | 87 <param name="input" value="group_in3.bam" ftype="bam" /> |
94 <param name="extract_umi_method" value="read_id" /> | 88 <section name="advanced"> |
95 <param name="method" value="unique" /> | 89 <param name="random_seed" value="0" /> |
96 <output name="output" file="dedup_out3.bam" ftype="bam" sort="True" /> | 90 </section> |
91 <conditional name="bc"> | |
92 <param name="extract_umi_method" value="read_id" /> | |
93 </conditional> | |
94 <section name="umi"> | |
95 <param name="method" value="unique" /> | |
96 </section> | |
97 <output name="output" file="dedup_out3.bam" ftype="bam" lines_diff="2" /> | |
97 </test> | 98 </test> |
98 <test expect_num_outputs="1"> | 99 <test expect_num_outputs="1"> |
99 <param name="input" value="group_in4.bam" ftype="bam" /> | 100 <param name="input" value="group_in4.bam" ftype="bam" /> |
100 <param name="extract_umi_method" value="tag" /> | 101 <section name="advanced"> |
101 <param name="umi_tag" value="BX" /> | 102 <param name="random_seed" value="0" /> |
102 <param name="method" value="unique" /> | 103 </section> |
103 <output name="output" file="dedup_out4.bam" ftype="bam" sort="True" /> | 104 <conditional name="bc"> |
105 <param name="extract_umi_method" value="tag" /> | |
106 <param name="umi_tag" value="BX" /> | |
107 </conditional> | |
108 <section name="umi"> | |
109 <param name="method" value="unique" /> | |
110 </section> | |
111 <output name="output" file="dedup_out4.bam" ftype="bam" lines_diff="2"/> | |
104 </test> | 112 </test> |
105 <test expect_num_outputs="1"> | 113 <test expect_num_outputs="1"> |
106 <param name="input" value="group_in5.bam" ftype="bam" /> | 114 <param name="input" value="group_in5.bam" ftype="bam" /> |
107 <param name="extract_umi_method" value="read_id" /> | 115 <section name="advanced"> |
108 <param name="umi_tag" value="BX" /> | 116 <param name="random_seed" value="0" /> |
109 <param name="method" value="cluster" /> | 117 </section> |
110 <output name="output" file="dedup_out5.bam" ftype="bam" sort="True" /> | 118 <conditional name="bc"> |
119 <param name="extract_umi_method" value="read_id" /> | |
120 <param name="umi_tag" value="BX" /> | |
121 </conditional> | |
122 <section name="umi"> | |
123 <param name="method" value="cluster" /> | |
124 </section> | |
125 <output name="output" file="dedup_out5.bam" ftype="bam" lines_diff="2"/> | |
111 </test> | 126 </test> |
112 <test expect_num_outputs="1"> | 127 <test expect_num_outputs="1"> |
113 <param name="input" value="group_in6.bam" ftype="bam" /> | 128 <param name="input" value="group_in6.bam" ftype="bam" /> |
114 <param name="extract_umi_method" value="read_id" /> | 129 <section name="advanced"> |
115 <param name="umi_tag" value="BX" /> | 130 <param name="random_seed" value="0" /> |
116 <param name="method" value="directional" /> | 131 </section> |
117 <output name="output" file="dedup_out6.bam" ftype="bam" sort="True" /> | 132 <conditional name="bc"> |
133 <param name="extract_umi_method" value="read_id" /> | |
134 <param name="umi_tag" value="BX" /> | |
135 </conditional> | |
136 <section name="umi"> | |
137 <param name="method" value="directional" /> | |
138 </section> | |
139 <output name="output" file="dedup_out6.bam" ftype="bam" lines_diff="2"/> | |
118 </test> | 140 </test> |
119 <test expect_num_outputs="5"> | 141 <test expect_num_outputs="5"> |
120 <param name="input" value="group_in6.bam" ftype="bam" /> | 142 <param name="input" value="group_in6.bam" ftype="bam" /> |
121 <param name="extract_umi_method" value="read_id" /> | 143 <section name="advanced"> |
122 <param name="umi_tag" value="BX" /> | 144 <param name="random_seed" value="0" /> |
123 <param name="method" value="directional" /> | 145 </section> |
146 <conditional name="bc"> | |
147 <param name="extract_umi_method" value="read_id" /> | |
148 <param name="umi_tag" value="BX" /> | |
149 </conditional> | |
150 <section name="umi"> | |
151 <param name="method" value="directional" /> | |
152 </section> | |
124 <param name="output_stats_bool" value="true"/> | 153 <param name="output_stats_bool" value="true"/> |
125 <output name="output" file="dedup_out6.bam" ftype="bam" sort="True" /> | 154 <output name="output" file="dedup_out6.bam" ftype="bam" lines_diff="2"/> |
126 <output_collection name="output_stats"> | 155 <output_collection name="output_stats"> |
127 <element name="edit_distance" file="stats_outputs_edit_distance.tsv" /> | 156 <element name="edit_distance" file="stats_outputs_edit_distance.tsv" /> |
128 <element name="per_umi" file="stats_outputs_per_umi.tsv" /> | 157 <element name="per_umi" file="stats_outputs_per_umi.tsv" /> |
129 <element name="per_umi_per_position" file="stats_outputs_per_umi_per_position.tsv" /> | 158 <element name="per_umi_per_position" file="stats_outputs_per_umi_per_position.tsv" /> |
130 </output_collection> | 159 </output_collection> |
131 </test> | 160 </test> |
132 </tests> | 161 </tests> |
133 <help><![CDATA[ | 162 <help><![CDATA[ |
134 umi_tools dedup - Deduplicate reads based on their UMI | 163 umi_tools dedup - Deduplicate reads based on their UMI and mapping coordinates |
135 ====================================================== | 164 ============================================================================== |
136 | 165 |
137 Purpose | 166 Purpose |
138 ------- | 167 ------- |
139 | 168 |
140 The purpose of this command is to deduplicate BAM files based on the first | 169 The purpose of this command is to deduplicate BAM files based on the first |
141 mapping co-ordinate and the UMI attached to the read. It is assumed that the | 170 mapping co-ordinate and the UMI attached to the read. |
142 FASTQ files were processed with extract_umi.py before mapping and thus the UMI | 171 |
143 is the last word of the read name. e.g: | 172 @BARCODE_HELP@ |
144 | 173 |
145 @HISEQ:87:00000000_AATT | 174 @UMI_GROUPING_HELP@ |
146 | 175 |
147 where AATT is the UMI sequence. | 176 Selecting the representative read |
148 | 177 --------------------------------- |
149 If you have used an alternative method which does not separate the | 178 For every group of duplicate reads, a single representative read is |
150 read id and UMI with a "_", such as bcl2fastq which uses ":", you can | 179 retained. The following criteria are applied to select the read that |
151 specify the separator with the option "--umi-separator=<sep>", | 180 will be retained from a group of duplicated reads: |
152 replacing <sep> with e.g ":". | 181 |
153 | 182 1. The read with the lowest number of mapping coordinates (see |
154 Alternatively, if your UMIs are encoded in a tag, you can specify this | 183 ``--multimapping-detection-method`` option) |
155 by setting the option --extract-umi-method=tag and set the tag name | 184 |
156 with the --umi-tag option. For example, if your UMIs are encoded in | 185 2. The read with the highest mapping quality. Note that this is not |
157 the 'UM' tag, provide the following options: | 186 the read sequencing quality and that if two reads have the same |
158 "--extract-umi-method=tag --umi-tag=UM" | 187 mapping quality then one will be picked at random regardless of the |
159 | 188 read quality. |
160 The start position of a read is considered to be the start of its alignment | 189 |
161 minus any soft clipped bases. A read aligned at position 500 with | 190 Otherwise a read is chosen at random. |
162 cigar 2S98M will be assumed to start at position 498. | 191 |
163 | 192 Optional statistics output |
164 | 193 -------------------------- |
165 Methods | 194 |
166 ------- | 195 One can use the edit distance between UMIs at the same position as a |
167 | 196 quality control for the deduplication process by comparing with |
168 dedup can be run with multiple methods to identify groups of reads with | 197 a null expectation of random sampling. For the random sampling, the |
169 the same (or similar) UMI(s). All methods start by identifying the | 198 observed frequency of UMIs is used to more reasonably model the null |
170 reads with the same mapping position. | 199 expectation. |
171 | 200 |
172 The simplest method, "unique", groups reads with the exact same | 201 Use the option ``Output UMI related statistics files?`` to generate stats outfiles: |
173 UMI. The network-based methods, "cluster", "adjacency" and | 202 |
174 "directional", build networks where nodes are UMIs and edges connect | 203 edit_distance |
175 UMIs with an edit distance <= threshold (usually 1). The groups of | 204 Reports the (binned) average edit distance between the UMIs at each |
176 reads are then defined from the network in a method-specific manner. | 205 position. Positions with a single UMI are reported separately. The |
177 | 206 edit distances are reported pre- and post-deduplication alongside |
178 "unique" | 207 the null expectation from random sampling of UMIs from the UMIs |
179 Reads group share the exact same UMI | 208 observed across all positions. Note that separate null |
180 | 209 distributions are reported since the null depends on the observed |
181 "percentile" | 210 frequency of each UMI which is different pre- and |
182 Reads group share the exact same UMI. UMIs with counts < 1% of the | 211 post-deduplication. The post-deduplication values should be closer to |
183 median counts for UMIs at the same position are ignored. | 212 their respective null than the pre-deduplication vs null comparison |
184 | 213 |
185 "cluster" | 214 In addition, this option will trigger reporting of further summary |
186 Identify clusters of connected UMIs (based on hamming distance | 215 statistics for the UMIs which may be informative for selecting the |
187 threshold). Each network is a read group | 216 optimal deduplication method or debugging. |
188 | 217 |
189 "adjacency" | 218 Each unique UMI sequence may be observed [0-many] times at multiple |
190 Cluster UMIs as above. For each cluster, select the node(UMI) | 219 positions in the BAM. The following files report the distribution for |
191 with the highest counts. Visit all nodes one edge away. If all | 220 the frequencies of each UMI. |
192 nodes have been visited, stop. Otherwise, repeat with remaining | 221 |
193 nodes until all nodes have been visited. Each step | 222 per_umi_per_position |
194 defines a read group. | 223 The `_stats_per_umi_per_position.tsv` file simply tabulates the |
195 | 224 counts for unique combinations of UMI and position. E.g if prior to |
196 "directional" (default) | 225 deduplication, we have two positions in the BAM (POSa, POSb), at |
197 Identify clusters of connected UMIs (based on hamming distance | 226 POSa we have observed 2*UMIa, 1*UMIb and at POSb: 1*UMIc, 3*UMId, |
198 threshold) and umi A counts >= (2* umi B counts) - 1. Each | 227 then the stats file is populated thus: |
199 network is a read group. | 228 |
200 | 229 ====== ============= |
201 Options | 230 counts instances_pre |
202 ------- | 231 ------ ------------- |
203 | 232 1 2 |
204 --extract-umi-method (choice) | 233 2 1 |
205 How are the UMIs encoded in the read? | 234 3 1 |
206 | 235 ====== ============= |
207 Options are: | 236 |
208 | 237 If post deduplication, UMIb is grouped with UMIa such that POSa: |
209 - "read_id" (default) | 238 3*UMIa, then the `instances_post` column is populated thus: |
210 UMIs contained at the end of the read separated as | 239 |
211 specified with --umi-separator option | 240 ====== ============= ============== |
212 | 241 counts instances_pre instances_post |
213 - "tag" | 242 ------ ------------- -------------- |
214 UMIs contained in a tag, see --umi-tag option | 243 1 2 1 |
215 | 244 2 1 0 |
216 --umi-separator (string) | 245 3 1 2 |
217 Separator between read id and UMI. See --extract-umi-method above | 246 ====== ============= ============== |
218 | 247 |
219 --umi-tag (string) | 248 per_umi_per |
220 Tag which contains UMI. See --extract-umi-method above | 249 The `_stats_per_umi_per.tsv` table provides UMI-level summary |
221 | 250 statistics. Keeping in mind that each unique UMI sequence can be |
222 --edit-distance-threshold (int) | 251 observed at [0-many] times across multiple positions in the BAM, |
223 For the adjacency and cluster methods the threshold for the | 252 |
224 edit distance to connect two UMIs in the network can be | 253 :times_observed: How many positions the UMI was observed at |
225 increased. The default value of 1 works best unless the UMI is | 254 :total_counts: The total number of times the UMI was observed across all positions |
226 very long (>14bp) | 255 :median_counts: The median for the distribution of how often the UMI was observed at each position (excluding zeros) |
227 | 256 |
228 --paired | 257 Hence, whenever times_observed=1, total_counts==median_counts.]]></help> |
229 BAM is paired end - output both read pairs. This will also | |
230 force the use of the template length to determine reads with | |
231 the same mapping coordinates. | |
232 | |
233 --spliced-is-unique | |
234 Causes two reads that start in the same position on the same | |
235 strand and having the same UMI to be considered unique if one is | |
236 spliced and the other is not. (Uses the 'N' cigar operation to test | |
237 for splicing) | |
238 | |
239 --soft-clip-threshold (int) | |
240 Mappers that soft clip, will sometimes do so rather than mapping a | |
241 spliced read if there is only a small overhang over the exon | |
242 junction. By setting this option, you can treat reads with at least | |
243 this many bases soft-clipped at the 3' end as spliced. | |
244 | |
245 --multimapping-detection-method (string, choice) | |
246 If the sam/bam contains tags to identify multimapping reads, you can | |
247 specify for use when selecting the best read at a given loci. | |
248 Supported tags are "NH", "X0" and "XT". If not specified, the read | |
249 with the highest mapping quality will be selected | |
250 | |
251 --read-length | |
252 Use the read length as a criterion when deduping, e.g. for sRNA-Seq | |
253 | |
254 --whole-contig | |
255 Consider all alignments to a single contig together. This is useful if | |
256 you have aligned to a transcriptome multi-fasta | |
257 | |
258 --subset (float, [0-1]) | |
259 Only consider a fraction of the reads, chosen at random. This is useful | |
260 for doing saturation analyses. | |
261 | |
262 --chrom | |
263 Only consider a single chromosome. This is useful for debugging purposes | |
264 | |
265 --per-contig (string) | |
266 Deduplicate per contig (field 3 in BAM; RNAME). | |
267 All reads with the same contig will be | |
268 considered to have the same alignment position. This is useful | |
269 if your library prep generates PCR duplicates with non identical | |
270 alignment positions such as CEL-Seq. In this case, you would | |
271 align to a reference transcriptome with one transcript per gene | |
272 | |
273 --per-gene (string) | |
274 Deduplicate per gene. As above except with this option you can | |
275 align to a reference transcriptome with more than one transcript | |
276 per gene. You need to also provide --gene-transcript-map option. | |
277 This will also add a metacontig ('MC') tag to the reads if used | |
278 in conjunction with --output-bam | |
279 | |
280 --gene-transcript-map (string) | |
281 File mapping genes to transcripts (tab separated), e.g: | |
282 | |
283 gene1 transcript1 | |
284 gene1 transcript2 | |
285 gene2 transcript3 | |
286 | |
287 --gene-tag (string) | |
288 Deduplicate per gene. As per --per-gene except here the gene | |
289 information is encoded in the bam read tag specified so you do | |
290 not need to supply --gene-transcript-map | |
291 | |
292 --output-bam (string, filename) | |
293 Output a tagged bam file to stdout or -S <filename> | |
294 | |
295 -i, --in-sam/-o, --out-sam | |
296 By default, inputs are assumed to be in BAM format and output are output | |
297 in BAM format. Use these options to specify the use of SAM format for | |
298 inputs or outputs. | |
299 | |
300 -I (string, filename) input file name | |
301 The input file must be sorted and indexed. | |
302 | |
303 -S (string, filename) output file name | |
304 | |
305 -L (string, filename) log file name | |
306 | |
307 Usage | |
308 ----- | |
309 umi_tools dedup -I infile.bam -S grouped.bam -- | |
310 | |
311 ]]></help> | |
312 <expand macro="citations" /> | 258 <expand macro="citations" /> |
313 </tool> | 259 </tool> |