comparison umi-tools_whitelist.xml @ 0:b911124762a8 draft

planemo upload commit 8da5246c32d60a49e6b6b9027c9adc0a31d4bc5a
author iuc
date Sun, 25 Feb 2018 13:07:58 -0500
parents
children dac4e7dc837d
comparison
equal deleted inserted replaced
-1:000000000000 0:b911124762a8
1 <tool id="umi_tools_whitelist" name="UMI-tools whitelist" version="@VERSION@.0">
2 <description>Extract cell barcodes from FASTQ files</description>
3 <macros>
4 <import>macros.xml</import>
5 </macros>
6 <expand macro="requirements" />
<!-- Cheetah shell template: links gzipped inputs to *.gz names, runs
     umi_tools whitelist, then assembles the HTML report directory and
     moves the thresholds table to its output dataset. -->
7 <command detect_errors="exit_code"><![CDATA[
## $gz records whether the inputs were symlinked to *.gz names below
## (presumably so umi_tools recognises compression from the extension - confirm).
8 #set $gz = False
9 #if $input_type.type == 'single':
10 #if $input_type.input_single.is_of_type("fastq.gz", "fastqsanger.gz"):
11 ln -s '$input_type.input_single' input_single.gz &&
12 #set $gz = True
13 #end if
14 #else
## NOTE(review): only read 1's datatype is inspected; read 2 is assumed to
## share the same compression.
15 #if $input_type.input_read1.is_of_type("fastq.gz", "fastqsanger.gz"):
16 ln -s '$input_type.input_read1' input_read1.gz &&
17 ln -s '$input_type.input_read2' input_read2.gz &&
18 #set $gz = True
19 #end if
20 #end if
21 umi_tools whitelist
22 --bc-pattern='$bc_pattern'
23 --subset-reads='$subset_reads'
24 #if $input_type.type == 'single':
25 #if $gz:
26 --stdin=input_single.gz
27 #else
28 --stdin='$input_type.input_single'
29 #end if
30 #else:
31 #if $gz:
32 --stdin=input_read1.gz
33 --read2-in=input_read2.gz
34 #else:
35 --stdin='$input_type.input_read1'
36 --read2-in='$input_type.input_read2'
37 #end if
## barcode_select only takes the values "first_read_only" and "both_reads";
## bc_pattern2 exists solely in the "both_reads" branch, so that is the value
## to test for (the former comparison against "1" could never match).
38 #if $input_type.barcode.barcode_select == "both_reads":
39 --bc-pattern2='$input_type.barcode.bc_pattern2'
40 #end if
41 #end if
42 #if $celloptions.use_cell_opts == "advanced":
## A value of 0 is treated as "not set" for both cell-count options.
43 #if $celloptions.set_cell_number != "0":
44 --set-cell-number=$celloptions.set_cell_number
45 #end if
46 #if $celloptions.expect_cells != "0":
47 --expect-cells=$celloptions.expect_cells
48 #end if
49 --error-correct-threshold=$celloptions.error_correct_thresh
50 #end if
51 --method=$method
52 --plot-prefix=OUT
53
54 $prime3
55
56 #if $log:
57 --log='$out_log'
58 #end if
59
60 > '$out_whitelist' &&
61
## -p keeps the job from failing if the report directory already exists.
62 mkdir -p '${ out_html_report.files_path }' &&
63 cp OUT_*.png '${ out_html_report.files_path }' &&
64
65 echo "<html>
66 <head></head><body>
67 <h1>Cell and Count Metrics</h1>
68 <img src=\"OUT_cell_barcode_count_density.png\" ><br />
69 <img src=\"OUT_cell_barcode_knee.png\" ><br />
70 <img src=\"OUT_cell_barcode_counts.png\" ><br />
71 </body></html>" > '$out_html_report'
72 &&
73 mv OUT_cell_thresholds.tsv '$out_thresh'
74 ]]></command>
<!-- Tool form. Parameter names and option values are referenced by the
     command template and the tests, so only labels and help text differ
     from the previous revision. -->
75 <inputs>
<!-- Library layout: one FASTQ for single-end, two for paired-end. -->
76 <conditional name="input_type">
77 <param name="type" type="select" label="Library type">
78 <option value="single">Single-end</option>
79 <option value="paired">Paired-end</option>
80 </param>
81 <when value="single">
82 <param name="input_single" type="data" format="fastq,fastq.gz" label="Reads in FASTQ format" />
83 </when>
84 <when value="paired">
<!-- Distinct labels so the two mates can be told apart in the form. -->
85 <param name="input_read1" type="data" format="fastq,fastq.gz" label="Read 1 in FASTQ format" />
86 <param name="input_read2" type="data" format="fastq,fastq.gz" label="Read 2 in FASTQ format" />
87 <conditional name="barcode">
88 <param name="barcode_select" argument="--split-barcode" type="select" label="Barcode on both reads?">
89 <option value="first_read_only">Barcode on first read only</option>
90 <option value="both_reads">Barcode on both reads</option>
91 </param>
92 <when value="first_read_only"/>
93 <when value="both_reads">
94 <param name="bc_pattern2" argument="--bc-pattern2" type="text" value="" label="Barcode pattern for second read"
95 help="Use this option to specify the format of the UMI/barcode for
96 the second read pair if required.">
97 </param>
98 </when>
99 </conditional>
100 </when>
101 </conditional>
102 <param name="bc_pattern" argument="--bc-pattern" type="text" label="Barcode pattern for first read"
103 help="Use this option to specify the format of the UMI/barcode. Use Ns to
104 represent the random positions and Xs to indicate the bc positions.
105 Bases with Ns will be extracted and added to the read name. Remaining
106 bases, marked with an X will be reattached to the read.">
107 </param>
108 <param name="method" argument="--method" type="select" label="Count reads or UMIs"
109 help="Many published protocols rank CBs by the number of reads the CBs appear in. However you could also use the number of unique UMIs a CB is associated with. Note that this is still an approximation to the number of transcripts captured because the same UMI could be associated with two different transcripts and be counted as independent." >
110 <option value="reads" selected="true" />
111 <option value="umis" />
112 </param>
113
114 <param argument="--3prime" name="prime3" type="boolean" label="Is barcode on 3' end of the read?"
115 truevalue="--3prime" falsevalue=""
116 help="By default the barcode is assumed to be on the 5' end of the read, but
117 use this option to specify that it is on the 3' end instead." />
118 <param name="subset_reads" argument="--subset-reads" type="integer" min="0" value="0" label="Use the first N reads to automatically identify the true cell barcodes." />
<!-- Optional cell-number settings; the command template skips the two
     cell-count values when they are left at 0. -->
119 <conditional name="celloptions" >
120 <param name="use_cell_opts" type="select" label="Cell parameters" >
121 <option value="defaults" selected="true">Use Defaults</option>
122 <option value="advanced">Advanced Options</option>
123 </param>
124 <when value="defaults"/>
125 <when value="advanced">
126 <param name="set_cell_number" type="integer" min="0" value="0" label="Specify the number of cell barcodes to accept" />
127 <param name="expect_cells" type="integer" min="0" value="0" label="Prior expectation on the upper limit on the number of cells sequenced" />
128 <param name="error_correct_thresh" type="integer" min="0" value="0" label="Hamming distance for correction of barcodes to whitelist barcodes. Set to zero to generate no error correcting metrics." />
129 </when>
130 </conditional>
<!-- No explicit name: the command template and the output filter refer to
     this parameter as "log". -->
131 <param argument="--log" type="boolean" label="Output log?" truevalue="--log" falsevalue=""
132 help="Choose if you want to generate a text file containing logging information." />
133
134 </inputs>
<!-- Datasets produced by the command template. -->
135 <outputs>
<!-- Receives the standard output of umi_tools whitelist. -->
136 <data name="out_whitelist" format="tabular" label="${tool.name} on ${on_string}: Whitelist"/>
<!-- Only created when the "log" boolean input is enabled. -->
137 <data name="out_log" format="txt" label="${tool.name} on ${on_string}: logfile" >
138 <filter>log</filter>
139 </data>
<!-- HTML page written by the command; the OUT_*.png plots it links to are
     copied into this dataset's files_path directory. -->
140 <data name="out_html_report" format="html" label="${tool.name} on ${on_string}: Webpage" />
<!-- OUT_cell_thresholds.tsv moved into place at the end of the command. -->
141 <data name="out_thresh" format="tabular" label="${tool.name} on ${on_string}: TSV Cell Thresholds" />
142 </outputs>
<!-- Functional tests: one single-end run with default cell options and one
     paired-end run with advanced options and logging. -->
143 <tests>
<!-- Single-end, barcode on the 3' end; the log is not requested, so three
     outputs are expected. -->
<!-- NOTE(review): the fixture is a .gz file declared as ftype "fastq";
     presumably it is decompressed on upload, so this exercises the
     uncompressed branch of the command. Confirm. -->
144 <test expect_num_outputs="3">
145 <param name="type" value="single" />
146 <param name="input_single" value="t_R2.fastq.gz" ftype="fastq" />
147 <param name="bc_pattern" value="CCCCCCCCNNNNNNNN" />
148 <param name="method" value="reads" />
149 <param name="prime3" value="true" />
150 <output name="out_whitelist" file="out_wl_single.txt" lines_diff="40" />
151 <output name="out_thresh" file="out_wl_single.tresh.tab" />
152 <output name="out_html_report" file="out_wl_single.html" />
153 </test>
<!-- Paired-end with a barcode pattern on both reads, advanced cell options
     and logging enabled, so all four outputs are expected. -->
154 <test expect_num_outputs="4">
155 <param name="type" value="paired" />
156 <param name="input_read1" value="t_R1.fastq.gz" ftype="fastq" />
157 <param name="input_read2" value="t_R2.fastq.gz" ftype="fastq" />
158 <param name="barcode_select" value="both_reads" />
159 <param name="bc_pattern" value="CCCNNNNNNNNXXXXX" />
160 <param name="bc_pattern2" value="CCCCCCCCNNNNNNNN" />
161 <param name="method" value="reads" />
162 <param name="prime3" value="false" />
163 <param name="use_cell_opts" value="advanced" />
164 <param name="expect_cells" value="5" />
165 <param name="error_correct_thresh" value="3" />
166 <param name="log" value="true" />
167 <output name="out_whitelist" file="out_wl_paired.txt" />
168 <output name="out_log" file="out_wl_paired.log" lines_diff="40" />
169 <output name="out_html_report" file="out_wl_paired.html" />
170 <output name="out_thresh" file="out_wl_paired.tresh.tab" />
171 </test>
172 </tests>
173 <help><![CDATA[
174
175
176 UMI-tools whitelist - Extract barcodes from fastq
177 ==================================================
178
179 Purpose
180 -------
181
182 Extract cell barcodes and identify the most likely true barcodes using
183 the 'knee' method.
184
185 Options
186 -------
187
188 --bc-pattern
189 This should be used where the barcodes are always in the same
190 place in the read.
191
192 - N = UMI position (required)
193 - C = cell barcode position (optional)
194 - X = sample position (optional)
195
196 Bases with Ns and Cs will be extracted and added to the read
197 name. The corresponding sequence qualities will be removed from
198 the read. Bases with an X will be reattached to the read.
199
200 E.g. If the pattern is NNNNCC,
201 Then the read:
202 @HISEQ:87:00000000 read1
203 AAGGTTGCTGATTGGATGGGCTAG
204 +
205 DA1AEBFGGCG01DFH00B1FF0B
206 will become:
207 @HISEQ:87:00000000_TT_AAGG read1
208 GCTGATTGGATGGGCTAG
209 +
210 FGGCG01DFH00B1FF0B
211
212 where 'TT' is the cell barcode and 'AAGG' is the UMI.
213
214
215 --set-cell-number
216 Use this option to explicitly set the number of cell barcodes
217 which should be accepted. Note that the exact number of cell
218 barcodes in the outputted whitelist may be slightly less than
219 this if there are multiple cells observed with the same
220 frequency at the threshold between accepted and rejected cell
221 barcodes.
222
223 --expect-cells=[EXPECTED_CELLS]
224 An upper limit estimate for the number of inputted cells. The knee
225 method will now select the first threshold (in ascending order)
226 which results in the number of cell barcodes accepted being <=
227 EXPECTED_CELLS and > EXPECTED_CELLS * 0.1.
228
229
230 --bc-pattern2
231 Use this option to specify the format of the UMI/barcode for
232 the second read pair if required. If --bc-pattern2 is not
233 supplied, this defaults to the same pattern as --bc-pattern
234
235 --3prime
236 By default the barcode is assumed to be on the 5' end of the read, but
237 use this option to specify that it is on the 3' end instead
238
239 Usage:
240 ------
241
242 For single ended reads:
243 umi_tools whitelist --bc-pattern=[PATTERN] -L extract.log
244 [OPTIONS]
245
246 reads from stdin and outputs to stdout.
247
248 For paired end reads where the cell barcode is split across the read pairs:
249 umi_tools whitelist --bc-pattern=[PATTERN]
250 --bc-pattern2=[PATTERN] --read2-in=[FASTQIN] -L extract.log
251 [OPTIONS]
252
253 reads end one from stdin and end two from FASTQIN and outputs to stdout
254
255
256 Output:
257 -------
258
259 The whitelist is outputted as 4 tab-separated columns:
260
261 1. whitelisted cell barcode
262 2. Other cell barcode(s) (comma-separated) to correct to the
263 whitelisted barcode
264 3. Count for whitelisted cell barcodes
265 4. Count(s) for the other cell barcode(s) (comma-separated)
266
267 example output:
268
269 AAAAAA AGAAAA 146 1
270 AAAATC 22
271 AAACAT 21
272 AAACTA AAACTN,GAACTA 27 1,1
273 AAATAC 72
274 AAATCA GAATCA 37 3
275 AAATGT AAAGGT,CAATGT 41 1,1
276 AAATTG CAATTG 36 1
277 AACAAT 18
278 AACATA 24
279
280 If --error-correct-threshold is set to 0, columns 2 and 4 will be empty.
281
282
283 ]]></help>
284 <expand macro="citations" />
285 </tool>