Mercurial > repos > iuc > umi_tools_whitelist
comparison umi-tools_whitelist.xml @ 14:345bdf4546fd draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/umi_tools commit bf6a3aa532e8f9d122da4c1e39f3e256ae587b79"
author | iuc |
---|---|
date | Mon, 13 Sep 2021 14:50:19 +0000 |
parents | 262026eb36a5 |
children | 953b4821b183 |
comparison
equal
deleted
inserted
replaced
13:49bfe13676a1 | 14:345bdf4546fd |
---|---|
1 <tool id="umi_tools_whitelist" name="UMI-tools whitelist" version="@VERSION@.0"> | 1 <tool id="umi_tools_whitelist" name="UMI-tools whitelist" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@"> |
2 <description>Extract cell barcodes from FASTQ files</description> | 2 <description>Extract cell barcodes from FASTQ files</description> |
3 <expand macro="bio_tools"/> | |
3 <macros> | 4 <macros> |
4 <import>macros.xml</import> | 5 <import>macros.xml</import> |
5 </macros> | 6 </macros> |
6 <expand macro="requirements" /> | 7 <expand macro="requirements" /> |
7 <command detect_errors="exit_code"><![CDATA[ | 8 <command detect_errors="exit_code"><![CDATA[ |
9 #import json | |
8 @COMMAND_LINK@ | 10 @COMMAND_LINK@ |
9 | |
10 umi_tools whitelist | 11 umi_tools whitelist |
11 --bc-pattern='$bc_pattern' | 12 |
12 --extract-method='$extract_method' | 13 @FASTQ_BARCODE_EXTRACTION_OPTIONS@ |
14 | |
13 --subset-reads='$subset_reads' | 15 --subset-reads='$subset_reads' |
14 #if $input_type.type == 'single': | 16 $allow_threshold_error |
17 #if $ed_above_threshold | |
18 --ed-above-threshold=$ed_above_threshold | |
19 #end if | |
20 --knee-method='$knee_method' | |
21 #if $input_type_cond.input_type == 'single': | |
15 #if $gz: | 22 #if $gz: |
16 --stdin=input_single.gz | 23 --stdin=input_single.gz |
17 #else | 24 #else |
18 --stdin=input_single.txt | 25 --stdin=input_single.txt |
19 #end if | 26 #end if |
23 --read2-in=input_read2.gz | 30 --read2-in=input_read2.gz |
24 #else: | 31 #else: |
25 --stdin=input_read1.txt | 32 --stdin=input_read1.txt |
26 --read2-in=input_read2.txt | 33 --read2-in=input_read2.txt |
27 #end if | 34 #end if |
28 #if $input_type.barcode.barcode_select == "both_reads": | |
29 --bc-pattern2='$input_type.barcode.bc_pattern2' | |
30 #end if | |
31 #end if | 35 #end if |
32 #if $celloptions.use_cell_opts == "advanced": | 36 #if $celloptions.use_cell_opts == "advanced": |
33 #if $celloptions.set_cell_number != "0": | 37 #if str($celloptions.set_cell_number) != '': |
34 --set-cell-number=$celloptions.set_cell_number | 38 --set-cell-number=$celloptions.set_cell_number |
35 #end if | 39 #end if |
36 #if $celloptions.expect_cells != "0": | 40 #if str($celloptions.expect_cells) != '': |
37 --expect-cells=$celloptions.expect_cells | 41 --expect-cells=$celloptions.expect_cells |
38 #end if | 42 #end if |
39 --error-correct-threshold=$celloptions.error_correct_thresh | 43 --error-correct-threshold=$celloptions.error_correct_threshold |
40 #end if | 44 #end if |
41 --method=$method | 45 --method=$method |
42 --plot-prefix=OUT | 46 --plot-prefix=OUT |
43 | 47 |
44 $prime3 | 48 @LOG@ |
45 | |
46 #if $log: | |
47 --log='$out_log' | |
48 #end if | |
49 | |
50 --log2stderr | |
51 | |
52 > '$out_whitelist' && | 49 > '$out_whitelist' && |
53 | 50 |
54 mkdir '${ out_html_report.files_path }' && | 51 mkdir '${ out_html_report.files_path }' && |
55 cp OUT_*.png '${ out_html_report.files_path }' && | 52 cp OUT_*.png '${ out_html_report.files_path }' && |
56 | 53 |
64 && | 61 && |
65 mv OUT_cell_thresholds.tsv '$out_thresh' | 62 mv OUT_cell_thresholds.tsv '$out_thresh' |
66 ]]></command> | 63 ]]></command> |
67 <inputs> | 64 <inputs> |
68 <expand macro="input_types" /> | 65 <expand macro="input_types" /> |
69 <param name="bc_pattern" argument="--bc-pattern" type="text" label="Barcode pattern for first read" | 66 <expand macro="fastq_barcode_extraction_options_macro"/> |
70 help="Use this option to specify the format of the UMI/barcode. Use Ns to | 67 |
71 represent the random positions and Xs to indicate the bc positions. | 68 <param argument="--method" type="select" label="Count reads or UMIs" |
72 Bases with Ns will be extracted and added to the read name. Remaining | 69 help="Many published protocols rank CBs by the number of reads the CBs appear in. However you could also use the number of unique UMIs a CB is associated with. Note that this is still an approximation to the number of transcripts captured because the same UMI could be associated with two different transcripts and be counted as independent" > |
73 bases, marked with an X will be reattached to the read."> | |
74 <expand macro="barcode_sanitizer" /> | |
75 </param> | |
76 <param name="extract_method" argument="--extract-method" type="select" label="Barcode Extraction Method" | |
77 help="If bracketed expressions are used in the above barcode pattern, then set this to 'regex'. Otherwise leave as 'string'" > | |
78 <option value="string" selected="true" /> | |
79 <option value="regex" /> | |
80 </param> | |
81 <param name="method" argument="--method" type="select" label="Count reads or UMIs" | |
82 help="Many published protocols rank CBs by the number of reads the CBs appear in. However you could also use the number of unique UMIs a CB is associated with. Note that this is still an approximation to the number of transcripts captured because the same UMI could be associated with two different transcripts and be counted as independent." > | |
83 <option value="reads" selected="true" /> | 70 <option value="reads" selected="true" /> |
84 <option value="umis" /> | 71 <option value="umis" /> |
85 </param> | 72 </param> |
86 <param argument="--3prime" name="prime3" type="boolean" label="Is barcode on 3' end of the read?" | 73 <!-- TODO Cannot use expect-cells with 'distance' knee method.--> |
87 truevalue="--3prime" falsevalue="" | 74 <param argument="--knee-method" type="select" label="Method for detection of knee"> |
88 help="By default the barcode is assumed to be on the 5' end of the read, but | 75 <option value="distance" selected="true" /> |
89 use this option to specify that it is on the 3' end instead." /> | 76 <option value="density"/> |
90 <param name="subset_reads" argument="--subset-reads" type="integer" min="0" value="0" label="Use the first N reads to automatically identify the true cell barcodes." /> | 77 </param> |
78 <param argument="--subset-reads" type="integer" min="0" value="0" label="Use the first N reads to automatically identify the true cell barcodes" /> | |
79 <param argument="--allow-threshold-error" type="boolean" truevalue="--allow-threshold-error" falsevalue="" label="Don't select a threshold" help="Will still output the plots if requested"/> | |
80 <param argument="--ed-above-threshold" type="select" optional="true" label="Detect and correct CBs above the threshold" help="which may be sequence errors from another CB"> | |
81 <option value="correct">correct</option> | |
82 <option value="discard">discard</option> | |
83 </param> | |
91 <conditional name="celloptions" > | 84 <conditional name="celloptions" > |
92 <param name="use_cell_opts" type="select" label="Cell parameters" > | 85 <param name="use_cell_opts" type="select" label="Cell parameters" > |
93 <option value="defaults" selected="True">Use Defaults</option> | 86 <option value="defaults" selected="True">Use Defaults</option> |
94 <option value="advanced">Advanced Options</option> | 87 <option value="advanced">Advanced Options</option> |
95 </param> | 88 </param> |
96 <when value="defaults"/> | 89 <when value="defaults"/> |
97 <when value="advanced"> | 90 <when value="advanced"> |
98 <param name="set_cell_number" type="integer" min="0" value="0" label="Specify the number of cell barcodes to accept" /> | 91 <param argument="--set-cell-number" type="integer" min="0" optional="true" label="Specify the number of cell barcodes to accept" /> |
99 <param name="expect_cells" type="integer" min="0" value="0" label="Prior expectation on the upper limit on the number of cells sequenced" /> | 92 <param argument="--expect-cells" type="integer" min="0" optional="true" label="Prior expectation on the upper limit on the number of cells sequenced" /> |
100 <param name="error_correct_thresh" type="integer" min="0" value="0" label="Hamming distance for correction of barcodes to whitelist barcodes. Set to zero to generate no error correcting metrics." /> | 93 <param argument="--error-correct-threshold" type="integer" min="0" value="1" label="Hamming distance for correction of barcodes to whitelist barcodes. Set to zero to generate no error correcting metrics" /> |
101 </when> | 94 </when> |
102 </conditional> | 95 </conditional> |
103 <param argument="--log" type="boolean" label="Output log?" truevalue="--log" falsevalue="" | 96 <expand macro="log_input_macro"/> |
104 help="Choose if you want to generate a text file containing logging information." /> | |
105 </inputs> | 97 </inputs> |
106 <outputs> | 98 <outputs> |
107 <data name="out_whitelist" format="tabular" label="${tool.name} on ${on_string}: Whitelist"/> | 99 <data name="out_whitelist" format="tabular" label="${tool.name} on ${on_string}: Whitelist"/> |
108 <data name="out_log" format="txt" label="${tool.name} on ${on_string}: logfile" > | 100 |
109 <filter>log</filter> | 101 <data name="filtered_out" format_source="input_read1" label="${tool.name} on ${on_string}: reads not matching regex pattern"> |
102 <filter>extract_method_cond['extract_method'] == 'regex' and extract_method_cond['filtered_out_bool'] and input_type_cond['input_type'] in ['single', 'paired']</filter> | |
110 </data> | 103 </data> |
104 <data name="filtered_out_paired" format_source="input_read2" label="${tool.name} on ${on_string}: reads not matching regex pattern"> | |
105 <filter>extract_method_cond['extract_method'] == 'regex' and extract_method_cond['filtered_out_bool'] and input_type_cond['input_type'] == 'paired'</filter> | |
106 </data> | |
107 <collection name="filtered_out_paired_collection" type="paired" label="${tool.name} on ${on_string}: reads not matching regex pattern"> | |
108 <data name="forward" format_source="input_readpair" /> | |
109 <data name="reverse" format_source="input_readpair" /> | |
110 <filter>extract_method_cond['extract_method'] == 'regex' and extract_method_cond['filtered_out_bool'] and input_type_cond['input_type'] == 'paired_collection'</filter> | |
111 </collection> | |
112 <expand macro="log_output_macro"/> | |
111 <data name="out_html_report" format="html" label="${tool.name} on ${on_string}: Webpage" /> | 113 <data name="out_html_report" format="html" label="${tool.name} on ${on_string}: Webpage" /> |
112 <data name="out_thresh" format="tabular" label="${tool.name} on ${on_string}: TSV Cell Thresholds" /> | 114 <data name="out_thresh" format="tabular" label="${tool.name} on ${on_string}: TSV Cell Thresholds" /> |
113 </outputs> | 115 </outputs> |
114 <tests> | 116 <tests> |
115 <test expect_num_outputs="3"> | 117 <test expect_num_outputs="3"> |
116 <conditional name="input_type" > | 118 <conditional name="input_type_cond" > |
117 <param name="type" value="single" /> | 119 <param name="input_type" value="single" /> |
118 <param name="input_single" value="t_R2.fastq.gz" ftype="fastq.gz" /> | 120 <param name="input_read1" value="t_R2.fastq.gz" ftype="fastqsanger.gz" /> |
119 </conditional> | 121 <param name="bc_pattern" value="CCCCCCCCNNNNNNNN" /> |
120 <param name="bc_pattern" value="CCCCCCCCNNNNNNNN" /> | 122 </conditional> |
123 <conditional name="extract_method_cond"> | |
124 <param name="prime3" value="true" /> | |
125 </conditional> | |
121 <param name="method" value="reads" /> | 126 <param name="method" value="reads" /> |
122 <param name="prime3" value="true" /> | 127 <param name="knee_method" value="density"/> |
123 <output name="out_whitelist" file="out_wl_single.txt" lines_diff="40" /> | 128 <output name="out_whitelist" file="out_wl_single.txt" lines_diff="40" /> |
124 <output name="out_thresh" file="out_wl_single.tresh.tab" /> | 129 <output name="out_thresh" file="out_wl_single.tresh.tab" /> |
125 <output name="out_html_report" file="out_wl_single.html" /> | 130 <output name="out_html_report" file="out_wl_single.html" /> |
126 </test> | 131 </test> |
127 <test expect_num_outputs="4"> | 132 <test expect_num_outputs="4"> |
128 <conditional name="input_type" > | 133 <conditional name="input_type_cond" > |
129 <param name="type" value="paired" /> | 134 <param name="input_type" value="paired" /> |
130 <param name="input_read1" value="t_R1.fastq.gz" ftype="fastq.gz" /> | 135 <param name="input_read1" value="t_R1.fastq.gz" ftype="fastqsanger.gz" /> |
131 <param name="input_read2" value="t_R2.fastq.gz" ftype="fastq.gz" /> | 136 <param name="input_read2" value="t_R2.fastq.gz" ftype="fastqsanger.gz" /> |
132 </conditional> | 137 <param name="bc_pattern" value="CCCNNNNNNNNXXXXX" /> |
133 <param name="barcode_select" value="first_read_only" /> | 138 <!-- <param name="bc_pattern2" value="CCCCCCCCNNNNNNNN" /> --> |
134 <param name="bc_pattern" value="CCCNNNNNNNNXXXXX" /> | 139 </conditional> |
135 <param name="bc_pattern2" value="CCCCCCCCNNNNNNNN" /> | |
136 <param name="method" value="reads" /> | 140 <param name="method" value="reads" /> |
137 <param name="prime3" value="false" /> | 141 <conditional name="extract_method_cond"> |
142 <param name="prime3" value="false" /> | |
143 </conditional> | |
138 <param name="use_cell_opts" value="advanced" /> | 144 <param name="use_cell_opts" value="advanced" /> |
145 <param name="knee_method" value="density"/> | |
139 <param name="expect_cells" value="5" /> | 146 <param name="expect_cells" value="5" /> |
140 <param name="error_correct_thresh" value="3" /> | 147 <param name="error_correct_threshold" value="3" /> |
141 <param name="log" value="true" /> | 148 <param name="log" value="true" /> |
142 <output name="out_whitelist" file="out_wl_paired.txt" /> | 149 <output name="out_whitelist" file="out_wl_paired.txt" /> |
143 <output name="out_log" file="out_wl_paired.log" lines_diff="50" /> | 150 <output name="out_log" file="out_wl_paired.log" lines_diff="50" /> |
144 <output name="out_html_report" file="out_wl_paired.html" /> | 151 <output name="out_html_report" file="out_wl_paired.html" /> |
145 <output name="out_thresh" file="out_wl_paired.tresh.tab" /> | 152 <output name="out_thresh" file="out_wl_paired.tresh.tab" /> |
146 </test> | 153 </test> |
147 <test expect_num_outputs="4"> <!-- As previous, identical outputs but paired collection input --> | 154 <test expect_num_outputs="4"> <!-- As previous, identical outputs but paired collection input --> |
148 <conditional name="input_type" > | 155 <conditional name="input_type_cond" > |
149 <param name="type" value="paired_collection" /> | 156 <param name="input_type" value="paired_collection" /> |
150 <param name="input_readpair" > | 157 <param name="input_readpair" > |
151 <collection type="paired"> | 158 <collection type="paired"> |
152 <element name="forward" ftype="fastq.gz" value="t_R1.fastq.gz" /> | 159 <element name="forward" ftype="fastqsanger.gz" value="t_R1.fastq.gz" /> |
153 <element name="reverse" ftype="fastq.gz" value="t_R2.fastq.gz" /> | 160 <element name="reverse" ftype="fastqsanger.gz" value="t_R2.fastq.gz" /> |
154 </collection> | 161 </collection> |
155 </param> | 162 </param> |
156 </conditional> | 163 <param name="bc_pattern" value="CCCNNNNNNNNXXXXX" /> |
157 <param name="barcode_select" value="first_read_only" /> | 164 <!-- <param name="bc_pattern2" value="CCCCCCCCNNNNNNNN" /> --> |
158 <param name="bc_pattern" value="CCCNNNNNNNNXXXXX" /> | 165 </conditional> |
159 <param name="bc_pattern2" value="CCCCCCCCNNNNNNNN" /> | |
160 <param name="method" value="reads" /> | 166 <param name="method" value="reads" /> |
161 <param name="prime3" value="false" /> | 167 <conditional name="extract_method_cond"> |
168 <param name="prime3" value="false" /> | |
169 </conditional> | |
162 <param name="use_cell_opts" value="advanced" /> | 170 <param name="use_cell_opts" value="advanced" /> |
171 <param name="knee_method" value="density"/> | |
163 <param name="expect_cells" value="5" /> | 172 <param name="expect_cells" value="5" /> |
164 <param name="error_correct_thresh" value="3" /> | 173 <param name="error_correct_threshold" value="3" /> |
165 <param name="log" value="true" /> | 174 <param name="log" value="true" /> |
175 <param name="filtered_out_bool" value="true"/> | |
166 <output name="out_whitelist" file="out_wl_paired.txt" /> | 176 <output name="out_whitelist" file="out_wl_paired.txt" /> |
167 <output name="out_log" file="out_wl_paired.log" lines_diff="50" /> | 177 <output name="out_log" file="out_wl_paired.log" lines_diff="50" /> |
168 <output name="out_html_report" file="out_wl_paired.html" /> | 178 <output name="out_html_report" file="out_wl_paired.html" /> |
169 <output name="out_thresh" file="out_wl_paired.tresh.tab" /> | 179 <output name="out_thresh" file="out_wl_paired.tresh.tab" /> |
170 </test> | 180 </test> |
171 <!-- Error report on not accepting regex and lt and gt symbols --> | 181 <!-- Error report on not accepting regex and lt and gt symbols --> |
172 <test expect_num_outputs="3"> | 182 <test expect_num_outputs="4"> |
173 <conditional name="input_type" > | 183 <conditional name="input_type_cond" > |
174 <param name="type" value="single" /> | 184 <param name="input_type" value="single" /> |
175 <param name="input_single" value="testYYY.40k.fastq.gz" ftype="fastq.gz" /> | 185 <param name="input_read1" value="testYYY.40k.fastq.gz" ftype="fastqsanger.gz" /> |
176 </conditional> | 186 </conditional> |
177 <param name="bc_pattern" value="(?P<cell_1>.{8,10})(?P<discard_1>ACTGGCCTGCGA){s<=3}(?P<cell_2>.{9})(?P<discard_2>GGTAGCGGTGACA){s<=3}(?P<cell_3>.{9})(?P<umi_1>.{8})T{3}.*" /> | 187 <param name="bc_pattern" value="(?P<cell_1>.{8,10})(?P<discard_1>ACTGGCCTGCGA){s<=3}(?P<cell_2>.{9})(?P<discard_2>GGTAGCGGTGACA){s<=3}(?P<cell_3>.{9})(?P<umi_1>.{8})T{3}.*" /> |
178 <param name="extract_method" value="regex" /> | 188 <param name="extract_method" value="regex" /> |
179 <param name="method" value="umis" /> | 189 <param name="method" value="umis" /> |
190 <param name="knee_method" value="density"/> | |
180 <param name="prime3" value="true" /> | 191 <param name="prime3" value="true" /> |
192 <param name="filtered_out_bool" value="true"/> | |
181 <output name="out_whitelist" file="out_wl_user.single.txt" /> | 193 <output name="out_whitelist" file="out_wl_user.single.txt" /> |
182 <output name="out_thresh" file="out_wl_user.single.tresh.tab" /> | 194 <output name="out_thresh" file="out_wl_user.single.tresh.tab" /> |
183 <output name="out_html_report" file="out_wl_user.single.html" /> | 195 <output name="out_html_report" file="out_wl_user.single.html" /> |
196 <output name="filtered_out"> | |
197 <assert_contents> | |
198 <has_text text="@A00250:74:HFMFVDSXX:2:1101:1027:1016 1:N:0:GTAGAGGA"/> | |
199 </assert_contents> | |
200 </output> | |
184 </test> | 201 </test> |
185 </tests> | 202 </tests> |
186 <help><![CDATA[ | 203 <help><![CDATA[ |
187 | 204 |
188 | 205 |
193 ------- | 210 ------- |
194 | 211 |
195 Extract cell barcodes and identify the most likely true barcodes using | 212 Extract cell barcodes and identify the most likely true barcodes using |
196 the 'knee' method. | 213 the 'knee' method. |
197 | 214 |
198 Options | 215 @FASTQ_BARCODE_EXTRACTION_HELP@ |
199 ------- | 216 |
200 | |
201 --bc-pattern | |
202 This should be used where the barcodes are always in the same | |
203 place in the read. | |
204 | |
205 - N = UMI position (required) | |
206 - C = cell barcode position (optional) | |
207 - X = sample position (optional) | |
208 | |
209 Bases with Ns and Cs will be extracted and added to the read | |
210 name. The corresponding sequence qualities will be removed from | |
211 the read. Bases with an X will be reattached to the read. | |
212 | |
213 E.g. If the pattern is NNNNCC, | |
214 Then the read: | |
215 @HISEQ:87:00000000 read1 | |
216 AAGGTTGCTGATTGGATGGGCTAG | |
217 DA1AEBFGGCG01DFH00B1FF0B | |
218 + | |
219 will become: | |
220 @HISEQ:87:00000000_TT_AAGG read1 | |
221 GCTGATTGGATGGGCTAG | |
222 1AFGGCG01DFH00B1FF0B | |
223 + | |
224 | |
225 where 'TT' is the cell barcode and 'AAGG' is the UMI. | |
226 | |
227 | |
228 --set-cell-number | |
229 Use this option to explicitly set the number of cell barcodes | |
230 which should be accepted. Note that the exact number of cell | |
231 barcodes in the outputted whitelist may be slightly less than | |
232 this if there are multiple cells observed with the same | |
233 frequency at the threshold between accepted and rejected cell | |
234 barcodes. | |
235 | |
236 --expect-cells=[EXPECTED_CELLS] | |
237 An upper limit estimate for the number of inputted cells. The knee | |
238 method will now select the first threshold (in ascending order) | |
239 which results in the number of cell barcodes accepted being <= | |
240 EXPECTED_CELLS and > EXPECTED_CELLS * 0.1. | |
241 | |
242 | |
243 --bc-pattern2 | |
244 Use this option to specify the format of the UMI/barcode for | |
245 the second read pair if required. If --bc-pattern2 is not | |
246 supplied, this defaults to the same pattern as --bc-pattern | |
247 | |
248 --3prime | |
249 By default the barcode is assumed to be on the 5' end of the read, but | |
250 use this option to specify that it is on the 3' end instead | |
251 | |
252 Usage: | |
253 ------ | |
254 | |
255 For single ended reads: | |
256 umi_tools whitelist --bc-pattern=[PATTERN] -L extract.log | |
257 [OPTIONS] | |
258 | |
259 reads from stdin and outputs to stdout. | |
260 | |
261 For paired end reads where the cell barcode is split across the read pairs: | |
262 umi_tools whitelist --bc-pattern=[PATTERN] | |
263 --bc-pattern2=[PATTERN] --read2-in=[FASTQIN] -L extract.log | |
264 [OPTIONS] | |
265 | |
266 reads end one from stdin and end two from FASTQIN and outputs to stdout | |
267 | 217 |
268 | 218 |
269 Output: | 219 Output: |
270 ------- | 220 ------- |
271 | 221 |