comparison umi-tools_whitelist.xml @ 14:345bdf4546fd draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/umi_tools commit bf6a3aa532e8f9d122da4c1e39f3e256ae587b79"
author iuc
date Mon, 13 Sep 2021 14:50:19 +0000
parents 262026eb36a5
children 953b4821b183
comparison
equal deleted inserted replaced
13:49bfe13676a1 14:345bdf4546fd
1 <tool id="umi_tools_whitelist" name="UMI-tools whitelist" version="@VERSION@.0"> 1 <tool id="umi_tools_whitelist" name="UMI-tools whitelist" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
2 <description>Extract cell barcodes from FASTQ files</description> 2 <description>Extract cell barcodes from FASTQ files</description>
3 <expand macro="bio_tools"/>
3 <macros> 4 <macros>
4 <import>macros.xml</import> 5 <import>macros.xml</import>
5 </macros> 6 </macros>
6 <expand macro="requirements" /> 7 <expand macro="requirements" />
7 <command detect_errors="exit_code"><![CDATA[ 8 <command detect_errors="exit_code"><![CDATA[
9 #import json
8 @COMMAND_LINK@ 10 @COMMAND_LINK@
9
10 umi_tools whitelist 11 umi_tools whitelist
11 --bc-pattern='$bc_pattern' 12
12 --extract-method='$extract_method' 13 @FASTQ_BARCODE_EXTRACTION_OPTIONS@
14
13 --subset-reads='$subset_reads' 15 --subset-reads='$subset_reads'
14 #if $input_type.type == 'single': 16 $allow_threshold_error
17 #if $ed_above_threshold
18 --ed-above-threshold=$ed_above_threshold
19 #end if
20 --knee-method='$knee_method'
21 #if $input_type_cond.input_type == 'single':
15 #if $gz: 22 #if $gz:
16 --stdin=input_single.gz 23 --stdin=input_single.gz
17 #else 24 #else
18 --stdin=input_single.txt 25 --stdin=input_single.txt
19 #end if 26 #end if
23 --read2-in=input_read2.gz 30 --read2-in=input_read2.gz
24 #else: 31 #else:
25 --stdin=input_read1.txt 32 --stdin=input_read1.txt
26 --read2-in=input_read2.txt 33 --read2-in=input_read2.txt
27 #end if 34 #end if
28 #if $input_type.barcode.barcode_select == "both_reads":
29 --bc-pattern2='$input_type.barcode.bc_pattern2'
30 #end if
31 #end if 35 #end if
32 #if $celloptions.use_cell_opts == "advanced": 36 #if $celloptions.use_cell_opts == "advanced":
33 #if $celloptions.set_cell_number != "0": 37 #if str($celloptions.set_cell_number) != '':
34 --set-cell-number=$celloptions.set_cell_number 38 --set-cell-number=$celloptions.set_cell_number
35 #end if 39 #end if
36 #if $celloptions.expect_cells != "0": 40 #if str($celloptions.expect_cells) != '':
37 --expect-cells=$celloptions.expect_cells 41 --expect-cells=$celloptions.expect_cells
38 #end if 42 #end if
39 --error-correct-threshold=$celloptions.error_correct_thresh 43 --error-correct-threshold=$celloptions.error_correct_threshold
40 #end if 44 #end if
41 --method=$method 45 --method=$method
42 --plot-prefix=OUT 46 --plot-prefix=OUT
43 47
44 $prime3 48 @LOG@
45
46 #if $log:
47 --log='$out_log'
48 #end if
49
50 --log2stderr
51
52 > '$out_whitelist' && 49 > '$out_whitelist' &&
53 50
54 mkdir '${ out_html_report.files_path }' && 51 mkdir '${ out_html_report.files_path }' &&
55 cp OUT_*.png '${ out_html_report.files_path }' && 52 cp OUT_*.png '${ out_html_report.files_path }' &&
56 53
64 && 61 &&
65 mv OUT_cell_thresholds.tsv '$out_thresh' 62 mv OUT_cell_thresholds.tsv '$out_thresh'
66 ]]></command> 63 ]]></command>
67 <inputs> 64 <inputs>
68 <expand macro="input_types" /> 65 <expand macro="input_types" />
69 <param name="bc_pattern" argument="--bc-pattern" type="text" label="Barcode pattern for first read" 66 <expand macro="fastq_barcode_extraction_options_macro"/>
70 help="Use this option to specify the format of the UMI/barcode. Use Ns to 67
71 represent the random positions and Xs to indicate the bc positions. 68 <param argument="--method" type="select" label="Count reads or UMIs"
72 Bases with Ns will be extracted and added to the read name. Remaining 69 help="Many published protocols rank CBs by the number of reads the CBs appear in. However you could also use the number of unique UMIs a CB is associated with. Note that this is still an approximation to the number of transcripts captured because the same UMI could be associated with two different transcripts and be counted as independent" >
73 bases, marked with an X will be reattached to the read.">
74 <expand macro="barcode_sanitizer" />
75 </param>
76 <param name="extract_method" argument="--extract-method" type="select" label="Barcode Extraction Method"
77 help="If bracketed expressions are used in the above barcode pattern, then set this to 'regex'. Otherwise leave as 'string'" >
78 <option value="string" selected="true" />
79 <option value="regex" />
80 </param>
81 <param name="method" argument="--method" type="select" label="Count reads or UMIs"
82 help="Many published protocols rank CBs by the number of reads the CBs appear in. However you could also use the number of unique UMIs a CB is associated with. Note that this is still an approximation to the number of transcripts captured because the same UMI could be associated with two different transcripts and be counted as independent." >
83 <option value="reads" selected="true" /> 70 <option value="reads" selected="true" />
84 <option value="umis" /> 71 <option value="umis" />
85 </param> 72 </param>
86 <param argument="--3prime" name="prime3" type="boolean" label="Is barcode on 3' end of the read?" 73 <!-- TODO Cannot use expect-cells with 'distance' knee method.-->
87 truevalue="--3prime" falsevalue="" 74 <param argument="--knee-method" type="select" label="Method for detection of knee">
88 help="By default the barcode is assumed to be on the 5' end of the read, but 75 <option value="distance" selected="true" />
89 use this option to specify that it is on the 3' end instead." /> 76 <option value="density"/>
90 <param name="subset_reads" argument="--subset-reads" type="integer" min="0" value="0" label="Use the first N reads to automatically identify the true cell barcodes." /> 77 </param>
78 <param argument="--subset-reads" type="integer" min="0" value="0" label="Use the first N reads to automatically identify the true cell barcodes" />
79 <param argument="--allow-threshold-error" type="boolean" truevalue="--allow-threshold-error" falsevalue="" label="Don't select a threshold" help="Will still output the plots if requested"/>
80 <param argument="--ed-above-threshold" type="select" optional="true" label="Detect and correct CBs above the threshold" help="which may be sequence errors from another CB">
81 <option value="correct">correct</option>
82 <option value="discard">discard</option>
83 </param>
91 <conditional name="celloptions" > 84 <conditional name="celloptions" >
92 <param name="use_cell_opts" type="select" label="Cell parameters" > 85 <param name="use_cell_opts" type="select" label="Cell parameters" >
93 <option value="defaults" selected="True">Use Defaults</option> 86 <option value="defaults" selected="True">Use Defaults</option>
94 <option value="advanced">Advanced Options</option> 87 <option value="advanced">Advanced Options</option>
95 </param> 88 </param>
96 <when value="defaults"/> 89 <when value="defaults"/>
97 <when value="advanced"> 90 <when value="advanced">
98 <param name="set_cell_number" type="integer" min="0" value="0" label="Specify the number of cell barcodes to accept" /> 91 <param argument="--set-cell-number" type="integer" min="0" optional="true" label="Specify the number of cell barcodes to accept" />
99 <param name="expect_cells" type="integer" min="0" value="0" label="Prior expectation on the upper limit on the number of cells sequenced" /> 92 <param argument="--expect-cells" type="integer" min="0" optional="true" label="Prior expectation on the upper limit on the number of cells sequenced" />
100 <param name="error_correct_thresh" type="integer" min="0" value="0" label="Hamming distance for correction of barcodes to whitelist barcodes. Set to zero to generate no error correcting metrics." /> 93 <param argument="--error-correct-threshold" type="integer" min="0" value="1" label="Hamming distance for correction of barcodes to whitelist barcodes. Set to zero to generate no error correcting metrics" />
101 </when> 94 </when>
102 </conditional> 95 </conditional>
103 <param argument="--log" type="boolean" label="Output log?" truevalue="--log" falsevalue="" 96 <expand macro="log_input_macro"/>
104 help="Choose if you want to generate a text file containing logging information." />
105 </inputs> 97 </inputs>
106 <outputs> 98 <outputs>
107 <data name="out_whitelist" format="tabular" label="${tool.name} on ${on_string}: Whitelist"/> 99 <data name="out_whitelist" format="tabular" label="${tool.name} on ${on_string}: Whitelist"/>
108 <data name="out_log" format="txt" label="${tool.name} on ${on_string}: logfile" > 100
109 <filter>log</filter> 101 <data name="filtered_out" format_source="input_read1" label="${tool.name} on ${on_string}: reads not matching regex pattern">
102 <filter>extract_method_cond['extract_method'] == 'regex' and extract_method_cond['filtered_out_bool'] and input_type_cond['input_type'] in ['single', 'paired']</filter>
110 </data> 103 </data>
104 <data name="filtered_out_paired" format_source="input_read2" label="${tool.name} on ${on_string}: reads not matching regex pattern">
105 <filter>extract_method_cond['extract_method'] == 'regex' and extract_method_cond['filtered_out_bool'] and input_type_cond['input_type'] == 'paired'</filter>
106 </data>
107 <collection name="filtered_out_paired_collection" type="paired" label="${tool.name} on ${on_string}: reads not matching regex pattern">
108 <data name="forward" format_source="input_readpair" />
109 <data name="reverse" format_source="input_readpair" />
110 <filter>extract_method_cond['extract_method'] == 'regex' and extract_method_cond['filtered_out_bool'] and input_type_cond['input_type'] == 'paired_collection'</filter>
111 </collection>
112 <expand macro="log_output_macro"/>
111 <data name="out_html_report" format="html" label="${tool.name} on ${on_string}: Webpage" /> 113 <data name="out_html_report" format="html" label="${tool.name} on ${on_string}: Webpage" />
112 <data name="out_thresh" format="tabular" label="${tool.name} on ${on_string}: TSV Cell Thresholds" /> 114 <data name="out_thresh" format="tabular" label="${tool.name} on ${on_string}: TSV Cell Thresholds" />
113 </outputs> 115 </outputs>
114 <tests> 116 <tests>
115 <test expect_num_outputs="3"> 117 <test expect_num_outputs="3">
116 <conditional name="input_type" > 118 <conditional name="input_type_cond" >
117 <param name="type" value="single" /> 119 <param name="input_type" value="single" />
118 <param name="input_single" value="t_R2.fastq.gz" ftype="fastq.gz" /> 120 <param name="input_read1" value="t_R2.fastq.gz" ftype="fastqsanger.gz" />
119 </conditional> 121 <param name="bc_pattern" value="CCCCCCCCNNNNNNNN" />
120 <param name="bc_pattern" value="CCCCCCCCNNNNNNNN" /> 122 </conditional>
123 <conditional name="extract_method_cond">
124 <param name="prime3" value="true" />
125 </conditional>
121 <param name="method" value="reads" /> 126 <param name="method" value="reads" />
122 <param name="prime3" value="true" /> 127 <param name="knee_method" value="density"/>
123 <output name="out_whitelist" file="out_wl_single.txt" lines_diff="40" /> 128 <output name="out_whitelist" file="out_wl_single.txt" lines_diff="40" />
124 <output name="out_thresh" file="out_wl_single.tresh.tab" /> 129 <output name="out_thresh" file="out_wl_single.tresh.tab" />
125 <output name="out_html_report" file="out_wl_single.html" /> 130 <output name="out_html_report" file="out_wl_single.html" />
126 </test> 131 </test>
127 <test expect_num_outputs="4"> 132 <test expect_num_outputs="4">
128 <conditional name="input_type" > 133 <conditional name="input_type_cond" >
129 <param name="type" value="paired" /> 134 <param name="input_type" value="paired" />
130 <param name="input_read1" value="t_R1.fastq.gz" ftype="fastq.gz" /> 135 <param name="input_read1" value="t_R1.fastq.gz" ftype="fastqsanger.gz" />
131 <param name="input_read2" value="t_R2.fastq.gz" ftype="fastq.gz" /> 136 <param name="input_read2" value="t_R2.fastq.gz" ftype="fastqsanger.gz" />
132 </conditional> 137 <param name="bc_pattern" value="CCCNNNNNNNNXXXXX" />
133 <param name="barcode_select" value="first_read_only" /> 138 <!-- <param name="bc_pattern2" value="CCCCCCCCNNNNNNNN" /> -->
134 <param name="bc_pattern" value="CCCNNNNNNNNXXXXX" /> 139 </conditional>
135 <param name="bc_pattern2" value="CCCCCCCCNNNNNNNN" />
136 <param name="method" value="reads" /> 140 <param name="method" value="reads" />
137 <param name="prime3" value="false" /> 141 <conditional name="extract_method_cond">
142 <param name="prime3" value="false" />
143 </conditional>
138 <param name="use_cell_opts" value="advanced" /> 144 <param name="use_cell_opts" value="advanced" />
145 <param name="knee_method" value="density"/>
139 <param name="expect_cells" value="5" /> 146 <param name="expect_cells" value="5" />
140 <param name="error_correct_thresh" value="3" /> 147 <param name="error_correct_threshold" value="3" />
141 <param name="log" value="true" /> 148 <param name="log" value="true" />
142 <output name="out_whitelist" file="out_wl_paired.txt" /> 149 <output name="out_whitelist" file="out_wl_paired.txt" />
143 <output name="out_log" file="out_wl_paired.log" lines_diff="50" /> 150 <output name="out_log" file="out_wl_paired.log" lines_diff="50" />
144 <output name="out_html_report" file="out_wl_paired.html" /> 151 <output name="out_html_report" file="out_wl_paired.html" />
145 <output name="out_thresh" file="out_wl_paired.tresh.tab" /> 152 <output name="out_thresh" file="out_wl_paired.tresh.tab" />
146 </test> 153 </test>
147 <test expect_num_outputs="4"> <!-- As previous, identical outputs but paired collection input --> 154 <test expect_num_outputs="4"> <!-- As previous, identical outputs but paired collection input -->
148 <conditional name="input_type" > 155 <conditional name="input_type_cond" >
149 <param name="type" value="paired_collection" /> 156 <param name="input_type" value="paired_collection" />
150 <param name="input_readpair" > 157 <param name="input_readpair" >
151 <collection type="paired"> 158 <collection type="paired">
152 <element name="forward" ftype="fastq.gz" value="t_R1.fastq.gz" /> 159 <element name="forward" ftype="fastqsanger.gz" value="t_R1.fastq.gz" />
153 <element name="reverse" ftype="fastq.gz" value="t_R2.fastq.gz" /> 160 <element name="reverse" ftype="fastqsanger.gz" value="t_R2.fastq.gz" />
154 </collection> 161 </collection>
155 </param> 162 </param>
156 </conditional> 163 <param name="bc_pattern" value="CCCNNNNNNNNXXXXX" />
157 <param name="barcode_select" value="first_read_only" /> 164 <!-- <param name="bc_pattern2" value="CCCCCCCCNNNNNNNN" /> -->
158 <param name="bc_pattern" value="CCCNNNNNNNNXXXXX" /> 165 </conditional>
159 <param name="bc_pattern2" value="CCCCCCCCNNNNNNNN" />
160 <param name="method" value="reads" /> 166 <param name="method" value="reads" />
161 <param name="prime3" value="false" /> 167 <conditional name="extract_method_cond">
168 <param name="prime3" value="false" />
169 </conditional>
162 <param name="use_cell_opts" value="advanced" /> 170 <param name="use_cell_opts" value="advanced" />
171 <param name="knee_method" value="density"/>
163 <param name="expect_cells" value="5" /> 172 <param name="expect_cells" value="5" />
164 <param name="error_correct_thresh" value="3" /> 173 <param name="error_correct_threshold" value="3" />
165 <param name="log" value="true" /> 174 <param name="log" value="true" />
175 <param name="filtered_out_bool" value="true"/>
166 <output name="out_whitelist" file="out_wl_paired.txt" /> 176 <output name="out_whitelist" file="out_wl_paired.txt" />
167 <output name="out_log" file="out_wl_paired.log" lines_diff="50" /> 177 <output name="out_log" file="out_wl_paired.log" lines_diff="50" />
168 <output name="out_html_report" file="out_wl_paired.html" /> 178 <output name="out_html_report" file="out_wl_paired.html" />
169 <output name="out_thresh" file="out_wl_paired.tresh.tab" /> 179 <output name="out_thresh" file="out_wl_paired.tresh.tab" />
170 </test> 180 </test>
171 <!-- Error report on not accepting regex and lt and gt symbols --> 181 <!-- Error report on not accepting regex and lt and gt symbols -->
172 <test expect_num_outputs="3"> 182 <test expect_num_outputs="4">
173 <conditional name="input_type" > 183 <conditional name="input_type_cond" >
174 <param name="type" value="single" /> 184 <param name="input_type" value="single" />
175 <param name="input_single" value="testYYY.40k.fastq.gz" ftype="fastq.gz" /> 185 <param name="input_read1" value="testYYY.40k.fastq.gz" ftype="fastqsanger.gz" />
176 </conditional> 186 </conditional>
177 <param name="bc_pattern" value="(?P&#60;cell_1&#62;.{8,10})(?P&#60;discard_1&#62;ACTGGCCTGCGA){s&#60;=3}(?P&#60;cell_2&#62;.{9})(?P&#60;discard_2&#62;GGTAGCGGTGACA){s&#60;=3}(?P&#60;cell_3&#62;.{9})(?P&#60;umi_1&#62;.{8})T{3}.*" /> 187 <param name="bc_pattern" value="(?P&#60;cell_1&#62;.{8,10})(?P&#60;discard_1&#62;ACTGGCCTGCGA){s&#60;=3}(?P&#60;cell_2&#62;.{9})(?P&#60;discard_2&#62;GGTAGCGGTGACA){s&#60;=3}(?P&#60;cell_3&#62;.{9})(?P&#60;umi_1&#62;.{8})T{3}.*" />
178 <param name="extract_method" value="regex" /> 188 <param name="extract_method" value="regex" />
179 <param name="method" value="umis" /> 189 <param name="method" value="umis" />
190 <param name="knee_method" value="density"/>
180 <param name="prime3" value="true" /> 191 <param name="prime3" value="true" />
192 <param name="filtered_out_bool" value="true"/>
181 <output name="out_whitelist" file="out_wl_user.single.txt" /> 193 <output name="out_whitelist" file="out_wl_user.single.txt" />
182 <output name="out_thresh" file="out_wl_user.single.tresh.tab" /> 194 <output name="out_thresh" file="out_wl_user.single.tresh.tab" />
183 <output name="out_html_report" file="out_wl_user.single.html" /> 195 <output name="out_html_report" file="out_wl_user.single.html" />
196 <output name="filtered_out">
197 <assert_contents>
198 <has_text text="@A00250:74:HFMFVDSXX:2:1101:1027:1016 1:N:0:GTAGAGGA"/>
199 </assert_contents>
200 </output>
184 </test> 201 </test>
185 </tests> 202 </tests>
186 <help><![CDATA[ 203 <help><![CDATA[
187 204
188 205
193 ------- 210 -------
194 211
195 Extract cell barcodes and identify the most likely true barcodes using 212 Extract cell barcodes and identify the most likely true barcodes using
196 the 'knee' method. 213 the 'knee' method.
197 214
198 Options 215 @FASTQ_BARCODE_EXTRACTION_HELP@
199 ------- 216
200
201 --bc-pattern
202 This should be used where the barcodes are always in the same
203 place in the read.
204
205 - N = UMI position (required)
206 - C = cell barcode position (optional)
207 - X = sample position (optional)
208
209 Bases with Ns and Cs will be extracted and added to the read
210 name. The corresponding sequence qualities will be removed from
211 the read. Bases with an X will be reattached to the read.
212
213 E.g. If the pattern is NNNNCC,
214 Then the read:
215 @HISEQ:87:00000000 read1
216 AAGGTTGCTGATTGGATGGGCTAG
217 DA1AEBFGGCG01DFH00B1FF0B
218 +
219 will become:
220 @HISEQ:87:00000000_TT_AAGG read1
221 GCTGATTGGATGGGCTAG
222 1AFGGCG01DFH00B1FF0B
223 +
224
225 where 'TT' is the cell barcode and 'AAGG' is the UMI.
226
227
228 --set-cell-number
229 Use this option to explicitly set the number of cell barcodes
230 which should be accepted. Note that the exact number of cell
231 barcodes in the outputted whitelist may be slightly less than
232 this if there are multiple cells observed with the same
233 frequency at the threshold between accepted and rejected cell
234 barcodes.
235
236 --expect-cells=[EXPECTED_CELLS]
237 An upper limit estimate for the number of inputted cells. The knee
238 method will now select the first threshold (order ascendingly)
239 which results in the number of cell barcodes accepted being <=
240 EXPECTED_CELLS and > EXPECTED_CELLS * 0.1.
241
242
243 --bc-pattern2
244 Use this option to specify the format of the UMI/barcode for
245 the second read pair if required. If --bc-pattern2 is not
246 supplied, this defaults to the same pattern as --bc-pattern
247
248 --3prime
249 By default the barcode is assumed to be on the 5' end of the read, but
250 use this option to specify that it is on the 3' end instead
251
252 Usage:
253 ------
254
255 For single ended reads:
256 umi_tools whitelist --bc-pattern=[PATTERN] -L extract.log
257 [OPTIONS]
258
259 reads from stdin and outputs to stdout.
260
261 For paired end reads where the cell barcode is split across the read pairs:
262 umi_tools whitelist --bc-pattern=[PATTERN]
263 --bc-pattern2=[PATTERN] --read2-in=[FASTQIN] -L extract.log
264 [OPTIONS]
265
266 reads end one from stdin and end two from FASTQIN and outputs to stdout
267 217
268 218
269 Output: 219 Output:
270 ------- 220 -------
271 221