comparison umi-tools_counts.xml @ 8:e654095ab143 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/umi_tools commit bf6a3aa532e8f9d122da4c1e39f3e256ae587b79"
author iuc
date Mon, 13 Sep 2021 14:49:44 +0000
parents 276b4111b253
children 71ad4a56c40c
comparison
equal deleted inserted replaced
7:8250ea3a1501 8:e654095ab143
1 <tool id="umi_tools_count" name="UMI-tools count" version="@VERSION@.1"> 1 <tool id="umi_tools_count" name="UMI-tools count" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
2 <description>performs quantification of UMIs from BAM files</description> 2 <description>performs quantification of UMIs from BAM files</description>
3 <expand macro="bio_tools"/>
3 <macros> 4 <macros>
4 <import>macros.xml</import> 5 <import>macros.xml</import>
5 <xml name="sanitize_tag" >
6 <sanitizer invalid_char="">
7 <valid initial="string.letters,string.digits" />
8 </sanitizer>
9 </xml>
10 </macros> 6 </macros>
11 <expand macro="requirements"> 7 <expand macro="requirements">
8 <!-- TODO see comment in LINK_SAM_BAM_INPUT -->
9 <requirement type="package" version="1.12">samtools</requirement>
12 <requirement type="package" version="4.7">sed</requirement> 10 <requirement type="package" version="4.7">sed</requirement>
13 </expand> 11 </expand>
14 <command detect_errors="exit_code"><![CDATA[ 12 <command detect_errors="exit_code"><![CDATA[
15 #import re 13 #import re
16 14 @LINK_SAM_BAM_INPUT@
17 ln -s '${input_bam}' 'input.bam' && 15
18 ln -s '${input_bam.metadata.bam_index}' 'input.bam.bai' && 16 umi_tools count
19 17 '$wide_format_cell_counts'
20 umi_tools count 18 @BARCODE_OPTIONS@
21 -I input.bam 19 @UMI_GROUPING_OPTIONS@
22 '$paired' 20 @SC_OPTIONS@
23 --extract-umi-method='$barcodes.extract_umi_method.value' 21 @SAMBAM_OPTIONS@
24 #if str($barcodes.extract_umi_method) == 'read_id': 22 @ADVANCED_OPTIONS@
25 --umi-separator='$barcodes.umi_separator.value' 23 -I '$input_file' -S '$out_counts'
26 #else if str($barcodes.extract_umi_method) == 'tag': 24 @LOG@
27 --umi-tag='$barcodes.umi_tag.value' 25 #if str($cond_extra.prepender) != "none":
28 --cell-tag='$barcodes.cell_tag.value' 26 #if str($cond_extra.prepender) == "string":
29 #end if 27 #set $replacer = str($cond_extra.custom_label)
30 --method='$method.value' 28 #else
31 --edit-distance-threshold='$edit_distance_threshold' 29 #set $replacer = re.sub('[^\w\_]+', '_', str($input.element_identifier.rsplit('.',1)[0]))
32 --mapping-quality='$advanced.mapping_quality' 30 #end if
33 --per-gene 31 && sed -i -r '1s|\b([ACGT]+)\b|'"$replacer"'_\1|g' '$out_counts'
34 '$wide_format_cell_counts' 32 #end if
35 '$advanced.per_contig'
36 '$advanced.per_cell'
37
38 #if str($advanced.gene_tag) != "":
39 --gene-tag='$advanced.gene_tag.value'
40 #end if
41 #if str($advanced.skip_tags_regex) != "":
42 --skip-tags-regex='$advanced.skip_tags_regex.value'
43 #end if
44 #if '$advanced.random_seed' != 0:
45 --random-seed='$advanced.random_seed'
46 #end if
47 -S '$out_counts'
48
49
50 #if str($cond_extra.prepender) != "none":
51 #set $replacer = re.sub('[^\w\_]+', '_', str($input_bam.element_identifier.rsplit('.',1)[0]))
52 #if str($cond_extra.prepender) == "string":
53 #set $replacer = str($cond_extra.custom_label)
54 #end if
55
56 && sed -i -r '1s|\b([ACGT]+)\b|'"$replacer"'_\1|g' '$out_counts'
57 #end if
58
59 ]]></command> 33 ]]></command>
60 <inputs> 34 <inputs>
61 <param name="input_bam" type="data" format="bam" label="Sorted BAM file" help="Please use the samtools sort tool to ensure a correct BAM input" /> 35 <param name="input" type="data" format="sam,bam" label="Reads to deduplicate in SAM or BAM format" help="Please use the samtools sort tool to ensure a correct BAM input" />
62 <param argument="--paired" type="boolean" truevalue="--paired" falsevalue="" checked="false" label="Bam is paired-end" help="both read pairs will be output. This will also force the use of the template length to determine reads with the same mapping coordinates." />
63 <conditional name="barcodes" >
64 <param argument="--extract-umi-method" name="extract_umi_method" type="select" label="Umi Extract Method" help="How are the barcodes encoded in the read?" >
65 <option value="read_id" selected="true">Barcodes are contained at the end of the read separated by a delimiter</option>
66 <option value="tag" >Barcodes are contained in tags</option>
67 <option value="umis" >Barcodes were extracted using umis</option>
68 </param>
69 <when value="read_id" >
70 <param argument="--umi-separator" name="umi_separator" type="text" label="Delimiter between read id and the UMI" value="_" >
71 <sanitizer invalid_char="" >
72 <valid initial="string.punctuation" />
73 </sanitizer>
74 </param>
75 </when>
76 <when value="tag" >
77 <param argument="--umi-tag" name="umi_tag" type="text" label="Tag which contains the UMI" >
78 <expand macro="sanitize_tag" />
79 </param>
80 <param argument="--cell-tag" name="cell_tag" type="text" label="Tag which contains the cell barcode" >
81 <expand macro="sanitize_tag" />
82 </param>
83 </when>
84 <when value="umis"></when>
85 </conditional>
86 <param argument="--method" type="select" label="Method to identify group of reads" help="UMIs with the same (or similar) codes can be grouped together. The simplest methods 'unique' and 'percentile' group identical
87 UMIs, however 'cluster', 'adjacency', and 'directional' can group similar umis with edit distances less than some threshold. Unique: Reads group share the exact same UMI. Percentile: Reads group share the same UMI, and UMIs with
88 counts &lt; 1% of the median counts for UMIs at the same position are ignored. Cluster: Identify clusters of connected UMIs (based on hamming distance threshold). Adjacency: Same as cluster, but considers only directly adjacent UMIs in the cluster. Directional: Identify cluster of connected UMIs based on hamming distance and umi." >
89 <option value="unique" >Unique</option>
90 <option value="percentile">Percentile</option>
91 <option value="cluster">Cluster</option>
92 <option value="adjacency">Adjacency</option>
93 <option value="directional" selected="true" >Directional</option>
94 </param>
95 <param argument="--edit-distance-threshold" name="edit_distance_threshold" type="integer" label="Edit distance threshold" min="0" value="1" />
96 <param argument="--wide-format-cell-counts" name="wide_format_cell_counts" type="boolean" truevalue="--wide-format-cell-counts" falsevalue="" checked="true" label="Output a matrix of genes and cells, instead of a flat file" /> 36 <param argument="--wide-format-cell-counts" name="wide_format_cell_counts" type="boolean" truevalue="--wide-format-cell-counts" falsevalue="" checked="true" label="Output a matrix of genes and cells, instead of a flat file" />
97 <section name="advanced" title="Extra parameters" > 37 <expand macro="barcode_options_macro"/>
98 <param argument="--mapping-quality" name="mapping_quality" type="integer" min="0" value="0" label="Minimum mapping quality" /> 38 <expand macro="umi_grouping_options_macro"/>
99 <!-- Currently hard-coded parameter. Leave here if useful to future wrapper --> 39 <expand macro="sambam_options_macro"/>
100 <!-- <param argument="-\-per-gene" name="per_gene" type="text" label="Group reads together if they have the same gene" help="Reads will be grouped together if they have the same gene. This is useful if your library 40 <expand macro="sc_options_macro"/>
101 prep generates PCR duplicates with non-identical alignment positions such as CEL-Seq. Note this option is hardcoded to be on with the count command. I.e counting is always performed per-gene. Must be combined with either 41 <expand macro="advanced_options_macro"/>
102 -\-gene-tag or -\-per-contig option" /> -->
103 <param argument="--gene-tag" name="gene_tag" type="text" label="Deduplicate per gene." value="XT" help="The gene information is encoded in the bam read tag." >
104 <expand macro="sanitize_tag" />
105 </param>
106 <param argument="--skip-tags-regex" name="skip_tags_regex" type="text" label="Skip any reads where the gene matches this tag" value="" >
107 <expand macro="barcode_sanitizer" />
108 </param>
109 <param argument="--per-contig" name="per_contig" type="boolean" truevalue="--per-contig" falsevalue="" checked="false" label="Deduplicate per contig (field 3 in BAM; RNAME)" help="All reads with the same contig will be considered to have the same alignment position. This is useful if you have aligned to a reference transcriptome with one transcript per gene." />
110 <param argument="--per-cell" name="per_cell" type="boolean" truevalue="--per-cell" falsevalue="" checked="true" label="Group reads only if they have the same cell barcode." />
111 <param argument="--random-seed" name="random_seed" type="integer" min="0" value="0" label="Random Seed" />
112 </section>
113 <conditional name="cond_extra" > 42 <conditional name="cond_extra" >
114 <param name="prepender" type="select" label="Prepend a label to all column headers" help="This preserves uniqueness when merging with other files with the same headers. Note: filename must not contain a '.' character" > 43 <param name="prepender" type="select" label="Prepend a label to all column headers" help="This preserves uniqueness when merging with other files with the same headers. Note: filename must not contain a '.' character" >
115 <option value="none" selected="true" >No modifications</option> 44 <option value="none" selected="true" >No modifications</option>
116 <option value="string">Custom Label</option> 45 <option value="string">Custom Label</option>
117 <option value="dataset name">Dataset Name</option> 46 <option value="dataset name">Dataset Name</option>
128 </valid> 57 </valid>
129 </sanitizer> 58 </sanitizer>
130 </param> 59 </param>
131 </when> 60 </when>
132 </conditional> 61 </conditional>
62 <expand macro="log_input_macro"/>
133 </inputs> 63 </inputs>
134 <outputs> 64 <outputs>
135 <data name="out_counts" format="tabular" /> 65 <data name="out_counts" format="tabular" />
66 <expand macro="log_output_macro"/>
136 </outputs> 67 </outputs>
137 <tests> 68 <tests>
138 <test><!--count_single_gene_tag:--> 69 <test><!--count_single_gene_tag:-->
139 <param name="input_bam" value="chr19_gene_tags.bam" /> 70 <param name="input" value="chr19_gene_tags.bam" />
140 <param name="random_seed" value="123456789" /> 71 <section name="advanced">
141 <param name="method" value="directional" /> 72 <param name="random_seed" value="123456789" />
142 <param name="gene_tag" value="XF" /> 73 </section>
143 <param name="skip_tags_regex" value="^[__|Unassigned]" /> 74 <section name="sc">
144 <param name="extract_umi_method" value="umis" /> 75 <param name="gene_tag" value="XF" />
76 <param name="skip_tags_regex" value="^[__|Unassigned]" />
77 <param name="per_cell" value="false" />
78 </section>
79 <conditional name="bc">
80 <param name="extract_umi_method" value="umis" />
81 </conditional>
82 <section name="umi">
83 <param name="method" value="directional" />
84 </section>
145 <param name="wide_format_cell_counts" value="false" /> 85 <param name="wide_format_cell_counts" value="false" />
146 <param name="per_cell" value="false" />
147 <output name="out_counts" value="count_single_gene_tag.tsv" /> 86 <output name="out_counts" value="count_single_gene_tag.tsv" />
148 </test> 87 </test>
88 <test><!--count_single_gene_tag .. with sam input-->
89 <param name="input" value="chr19_gene_tags.sam" />
90 <section name="advanced">
91 <param name="random_seed" value="123456789" />
92 </section>
93 <section name="sc">
94 <param name="gene_tag" value="XF" />
95 <param name="skip_tags_regex" value="^[__|Unassigned]" />
96 <param name="per_cell" value="false" />
97 </section>
98 <conditional name="bc">
99 <param name="extract_umi_method" value="umis" />
100 </conditional>
101 <section name="umi">
102 <param name="method" value="directional" />
103 </section>
104 <param name="wide_format_cell_counts" value="false" />
105 <output name="out_counts" value="count_single_gene_tag.tsv" />
106 </test>
149 <test><!--count_single_cells_gene_tag:--> 107 <test><!--count_single_cells_gene_tag:-->
150 <param name="input_bam" value="chr19_gene_tags.bam" /> 108 <param name="input" value="chr19_gene_tags.bam" />
151 <param name="random_seed" value="123456789" /> 109 <section name="advanced">
152 <param name="method" value="directional" /> 110 <param name="random_seed" value="123456789" />
153 <param name="gene_tag" value="XF" /> 111 </section>
154 <param name="skip_tags_regex" value="^[__|Unassigned]" /> 112 <section name="sc">
155 <param name="per_cell" value="true" /> 113 <param name="gene_tag" value="XF" />
156 <param name="extract_umi_method" value="umis" /> 114 <param name="skip_tags_regex" value="^[__|Unassigned]" />
115 <param name="per_cell" value="true" />
116 </section>
117 <conditional name="bc">
118 <param name="extract_umi_method" value="umis" />
119 </conditional>
120 <section name="umi">
121 <param name="method" value="directional" />
122 </section>
157 <param name="wide_format_cell_counts" value="false" /> 123 <param name="wide_format_cell_counts" value="false" />
158 <output name="out_counts" value="count_single_cells_gene_tag.tsv" /> 124 <output name="out_counts" value="count_single_cells_gene_tag.tsv" />
159 </test> 125 </test>
160 <test><!--count_single_cells_wide_gene_tag:--> 126 <test><!--count_single_cells_wide_gene_tag:-->
161 <param name="input_bam" value="chr19_gene_tags.bam" /> 127 <param name="input" value="chr19_gene_tags.bam" />
162 <param name="random_seed" value="123456789" /> 128 <section name="advanced">
163 <param name="method" value="directional" /> 129 <param name="random_seed" value="123456789" />
164 <param name="gene_tag" value="XF" /> 130 </section>
165 <param name="skip_tags_regex" value="^[__|Unassigned]" /> 131 <section name="sc">
166 <param name="per_cell" value="true" /> 132 <param name="gene_tag" value="XF" />
167 <param name="extract_umi_method" value="umis" /> 133 <param name="skip_tags_regex" value="^[__|Unassigned]" />
134 <param name="per_cell" value="true" />
135 </section>
136 <conditional name="bc">
137 <param name="extract_umi_method" value="umis" />
138 </conditional>
139 <section name="umi">
140 <param name="method" value="directional" />
141 </section>
168 <param name="wide_format_cell_counts" value="true" /> 142 <param name="wide_format_cell_counts" value="true" />
169 <output name="out_counts" value="count_single_cells_gene_tag_wide.tsv" /> 143 <output name="out_counts" value="count_single_cells_gene_tag_wide.tsv" />
170 </test> 144 </test>
171 <test><!-- count ENSDARG00000019692, with defaults --> 145 <test><!-- count ENSDARG00000019692, with defaults -->
172 <param name="input_bam" value="fc.ENSDARG00000019692.bam" /> 146 <param name="input" value="fc.ENSDARG00000019692.bam" />
173 <param name="method" value="unique" /> 147 <section name="advanced">
148 <param name="random_seed" value="0" />
149 </section>
150 <section name="sc">
151 <param name="gene_tag" value="XT" />
152 <param name="per_cell" value="true" />
153 </section>
154 <section name="umi">
155 <param name="method" value="unique" />
156 </section>
174 <output name="out_counts" value="fc.ENSDARG00000019692.counts" /> 157 <output name="out_counts" value="fc.ENSDARG00000019692.counts" />
175 </test> 158 </test>
176 <test><!-- count ENSDARG00000019692, relabel string --> 159 <test><!-- count ENSDARG00000019692, relabel string -->
177 <param name="input_bam" value="fc.ENSDARG00000019692.bam" /> 160 <param name="input" value="fc.ENSDARG00000019692.bam" />
178 <param name="method" value="unique" /> 161 <section name="advanced">
162 <param name="random_seed" value="0" />
163 </section>
164 <section name="sc">
165 <param name="gene_tag" value="XT" />
166 <param name="per_cell" value="true" />
167 </section>
168 <section name="umi">
169 <param name="method" value="unique" />
170 </section>
179 <conditional name="cond_extra" > 171 <conditional name="cond_extra" >
180 <param name="prepender" value="string" /> 172 <param name="prepender" value="string" />
181 <param name="custom_label" value="test" /> 173 <param name="custom_label" value="test" />
182 </conditional> 174 </conditional>
183 <output name="out_counts" value="fc.ENSDARG00000019692.counts.test" /> 175 <output name="out_counts" value="fc.ENSDARG00000019692.counts.test" />
184 </test> 176 </test>
185 <test><!-- count ENSDARG00000019692, relabel filename --> 177 <test><!-- count ENSDARG00000019692, relabel filename -->
186 <param name="input_bam" value="fc.ENSDARG00000019692.bam" /> 178 <param name="input" value="fc.ENSDARG00000019692.bam" />
187 <param name="method" value="unique" /> 179 <section name="advanced">
180 <param name="random_seed" value="0" />
181 </section>
182 <section name="sc">
183 <param name="gene_tag" value="XT" />
184 <param name="per_cell" value="true" />
185 </section>
186 <section name="umi">
187 <param name="method" value="unique" />
188 </section>
188 <conditional name="cond_extra" > 189 <conditional name="cond_extra" >
189 <param name="prepender" value="dataset name" /> 190 <param name="prepender" value="dataset name" />
190 </conditional> 191 </conditional>
191 <output name="out_counts" value="fc.ENSDARG00000019692.counts.name" /> 192 <output name="out_counts" value="fc.ENSDARG00000019692.counts.name" />
192 </test> 193 </test>
193 </tests> 194 </tests>
194 <help><![CDATA[ 195 <help><![CDATA[
195 196
196 UMI Tools count - Count reads per gene from BAM using UMIs 197 count - Count reads per gene from BAM using UMIs and mapping coordinates
197 ---------------------------------------------------------- 198 ========================================================================
198 199
199 Purpose 200 This tool is only designed to work with library preparation
200 ------- 201 methods where the fragmentation occurs after amplification, as per
201 202 most single cell RNA-Seq methods (e.g. 10x, inDrop, Drop-seq, SCRB-seq
202 The purpose of this command is to count the number of reads per gene based 203 and CEL-seq2). Since the precise mapping co-ordinate is no longer
203 on the mapping co-ordinate and the UMI attached to the read. 204 informative for such library preparations, it is simplified to the
204 205 gene. This is a reasonable approach providing the number of available
205 206 UMIs is sufficiently high and the sequencing depth is sufficiently low
206 It is assumed that the FASTQ files were processed with extract_umi.py 207 that the probability of two reads from the same gene having the same
207 before mapping and thus the UMI is the last word of the read name. e.g: 208 UMIs is acceptably low.
208 209
209 @HISEQ:87:00000000_AATT 210 If you want to count reads per gene for library preparations which
210 211 fragment prior to amplification (e.g. bulk RNA-Seq), please use
211 where AATT is the UMI sequence. 212 ``umi_tools dedup`` to remove the duplicate reads as this will use the
212 213 full information from the mapping co-ordinate. Then use a read
213 If you have used an alternative method which does not separate the 214 counting tool such as FeatureCounts or HTSeq to count the reads per
214 read id and UMI with a "_", such as bcl2fastq which uses ":", you can 215 gene.
215 specify the separator, or if your UMIs are encoded in a tag you can also specify this. 216
217 In the rare case of bulk RNA-Seq using a library preparation method
218 with fragmentation after amplification, one can still use ``count`` but
219 note that it has not been tested on bulk RNA-Seq.
220
221 This tool deviates from group and dedup in that the ``--per-gene`` option
222 is hardcoded on.
223
224 @BARCODE_HELP@
225
226 @UMI_GROUPING_HELP@
216 227
217 ]]></help> 228 ]]></help>
218 <expand macro="citations" /> 229 <expand macro="citations" />
219 </tool> 230 </tool>