Mercurial > repos > iuc > umi_tools_whitelist
comparison umi-tools_whitelist.xml @ 0:b911124762a8 draft
planemo upload commit 8da5246c32d60a49e6b6b9027c9adc0a31d4bc5a
author | iuc |
---|---|
date | Sun, 25 Feb 2018 13:07:58 -0500 |
parents | |
children | dac4e7dc837d |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:b911124762a8 |
---|---|
1 <tool id="umi_tools_whitelist" name="UMI-tools whitelist" version="@VERSION@.0"> | |
2 <description>Extract cell barcodes from FASTQ files</description> | |
3 <macros> | |
4 <import>macros.xml</import> | |
5 </macros> | |
6 <expand macro="requirements" /> | |
7 <command detect_errors="exit_code"><![CDATA[ | |
## Gzipped inputs are symlinked to names ending in .gz and passed by that name; $gz records that this happened. | |
## For paired input only read 1 is type-checked; read 2 is linked as .gz whenever read 1 is gzipped. | |
8 #set $gz = False | |
9 #if $input_type.type == 'single': | |
10 #if $input_type.input_single.is_of_type("fastq.gz", "fastqsanger.gz"): | |
11 ln -s '$input_type.input_single' input_single.gz && | |
12 #set $gz = True | |
13 #end if | |
14 #else | |
15 #if $input_type.input_read1.is_of_type("fastq.gz", "fastqsanger.gz"): | |
16 ln -s '$input_type.input_read1' input_read1.gz && | |
17 ln -s '$input_type.input_read2' input_read2.gz && | |
18 #set $gz = True | |
19 #end if | |
20 #end if | |
21 umi_tools whitelist | |
22 --bc-pattern='$bc_pattern' | |
23 --subset-reads='$subset_reads' | |
24 #if $input_type.type == 'single': | |
25 #if $gz: | |
26 --stdin=input_single.gz | |
27 #else | |
28 --stdin='$input_type.input_single' | |
29 #end if | |
30 #else: | |
31 #if $gz: | |
32 --stdin=input_read1.gz | |
33 --read2-in=input_read2.gz | |
34 #else: | |
35 --stdin='$input_type.input_read1' | |
36 --read2-in='$input_type.input_read2' | |
37 #end if | |
## barcode_select takes the values "first_read_only" / "both_reads"; bc_pattern2 only exists in the "both_reads" branch. | |
38 #if $input_type.barcode.barcode_select == "both_reads": | |
39 --bc-pattern2='$input_type.barcode.bc_pattern2' | |
40 #end if | |
41 #end if | |
## A value of 0 for set_cell_number / expect_cells means "not set", so the option is omitted. | |
42 #if $celloptions.use_cell_opts == "advanced": | |
43 #if $celloptions.set_cell_number != "0": | |
44 --set-cell-number=$celloptions.set_cell_number | |
45 #end if | |
46 #if $celloptions.expect_cells != "0": | |
47 --expect-cells=$celloptions.expect_cells | |
48 #end if | |
49 --error-correct-threshold=$celloptions.error_correct_thresh | |
50 #end if | |
51 --method=$method | |
52 --plot-prefix=OUT | |
53 | |
54 $prime3 | |
55 | |
56 #if $log: | |
57 --log='$out_log' | |
58 #end if | |
59 | |
60 > '$out_whitelist' && | |
61 | |
## Build the HTML report: copy the OUT_*.png plots next to it and write a page that embeds them. | |
62 mkdir '${ out_html_report.files_path }' && | |
63 cp OUT_*.png '${ out_html_report.files_path }' && | |
64 | |
65 echo "<html> | |
66 <head></head><body> | |
67 <h1>Cell and Count Metrics</h1> | |
68 <img src=\"OUT_cell_barcode_count_density.png\" ><br /> | |
69 <img src=\"OUT_cell_barcode_knee.png\" ><br /> | |
70 <img src=\"OUT_cell_barcode_counts.png\" ><br /> | |
71 </body></html>" > '$out_html_report' | |
72 && | |
73 mv OUT_cell_thresholds.tsv '$out_thresh' | |
74 ]]></command> | |
75 <inputs> | |
<!-- Library layout: single-end takes one FASTQ dataset; paired-end takes two plus an optional second barcode pattern. --> | |
76 <conditional name="input_type"> | |
77 <param name="type" type="select" label="Library type"> | |
78 <option value="single">Single-end</option> | |
79 <option value="paired">Paired-end</option> | |
80 </param> | |
81 <when value="single"> | |
82 <param name="input_single" type="data" format="fastq,fastq.gz" label="Reads in FASTQ format" /> | |
83 </when> | |
84 <when value="paired"> | |
85 <param name="input_read1" type="data" format="fastq,fastq.gz" label="Read 1 in FASTQ format" /> | |
86 <param name="input_read2" type="data" format="fastq,fastq.gz" label="Read 2 in FASTQ format" /> | |
<!-- bc_pattern2 is only defined when barcode_select is "both_reads". --> | |
87 <conditional name="barcode"> | |
88 <param name="barcode_select" argument="--split-barcode" type="select" label="Barcode on both reads?"> | |
89 <option value="first_read_only">Barcode on first read only</option> | |
90 <option value="both_reads">Barcode on both reads</option> | |
91 </param> | |
92 <when value="first_read_only"/> | |
93 <when value="both_reads"> | |
94 <param name="bc_pattern2" argument="--bc-pattern2" type="text" value="" label="Barcode pattern for second read" | |
95 help="Use this option to specify the format of the UMI/barcode for | |
96 the second read pair if required."> | |
97 </param> | |
98 </when> | |
99 </conditional> | |
100 </when> | |
101 </conditional> | |
102 <param name="bc_pattern" argument="--bc-pattern" type="text" label="Barcode pattern for first read" | |
103 help="Use this option to specify the format of the UMI/barcode. Use Ns to | |
104 represent the UMI positions, Cs the cell barcode positions and Xs the sample positions. | |
105 Bases with Ns and Cs will be extracted and added to the read name. Remaining | |
106 bases, marked with an X will be reattached to the read."> | |
107 </param> | |
108 <param name="method" argument="--method" type="select" label="Count reads or UMIs" | |
109 help="Many published protocols rank CBs by the number of reads the CBs appear in. However you could also use the number of unique UMIs a CB is associated with. Note that this is still an approximation to the number of transcripts captured because the same UMI could be associated with two different transcripts and be counted as independent." > | |
110 <option value="reads" selected="true" /> | |
111 <option value="umis" /> | |
112 </param> | |
113 | |
114 <param argument="--3prime" name="prime3" type="boolean" label="Is barcode on 3' end of the read?" | |
115 truevalue="--3prime" falsevalue="" | |
116 help="By default the barcode is assumed to be on the 5' end of the read, but | |
117 use this option to specify that it is on the 3' end instead." /> | |
118 <param name="subset_reads" argument="--subset-reads" type="integer" min="0" value="0" label="Use the first N reads to automatically identify the true cell barcodes." /> | |
<!-- Advanced cell options: the command omits set_cell_number / expect_cells when they are left at 0. --> | |
119 <conditional name="celloptions" > | |
120 <param name="use_cell_opts" type="select" label="Cell parameters" > | |
121 <option value="defaults" selected="True">Use Defaults</option> | |
122 <option value="advanced">Advanced Options</option> | |
123 </param> | |
124 <when value="defaults"/> | |
125 <when value="advanced"> | |
126 <param name="set_cell_number" type="integer" min="0" value="0" label="Specify the number of cell barcodes to accept" /> | |
127 <param name="expect_cells" type="integer" min="0" value="0" label="Prior expectation on the upper limit on the number of cells sequenced" /> | |
128 <param name="error_correct_thresh" type="integer" min="0" value="0" label="Hamming distance for correction of barcodes to whitelist barcodes. Set to zero to generate no error correcting metrics." /> | |
129 </when> | |
130 </conditional> | |
<!-- No name attribute here; the command and the out_log filter refer to this parameter as "log". --> | |
131 <param argument="--log" type="boolean" label="Output log?" truevalue="--log" falsevalue="" | |
132 help="Choose if you want to generate a text file containing logging information." /> | |
133 | |
134 </inputs> | |
135 <outputs> | |
<!-- Whitelist table: receives the stdout of umi_tools whitelist. --> | |
136 <data name="out_whitelist" format="tabular" label="${tool.name} on ${on_string}: Whitelist"/> | |
<!-- Log file: filtered on the "log" boolean input, which also gates the log option in the command. --> | |
137 <data name="out_log" format="txt" label="${tool.name} on ${on_string}: logfile" > | |
138 <filter>log</filter> | |
139 </data> | |
<!-- HTML page written by the command; the OUT_*.png plots are copied into its files_path. --> | |
140 <data name="out_html_report" format="html" label="${tool.name} on ${on_string}: Webpage" /> | |
<!-- Cell thresholds table: the command moves OUT_cell_thresholds.tsv here. --> | |
141 <data name="out_thresh" format="tabular" label="${tool.name} on ${on_string}: TSV Cell Thresholds" /> | |
142 </outputs> | |
143 <tests> | |
<!-- Test 1: single-end input with the barcode on the 3' end; log is not enabled, so three outputs are expected. --> | |
<!-- NOTE(review): the .gz fixtures are declared with ftype="fastq"; confirm this exercises the intended (gz vs. plain) branch of the command. --> | |
144 <test expect_num_outputs="3"> | |
145 <param name="type" value="single" /> | |
146 <param name="input_single" value="t_R2.fastq.gz" ftype="fastq" /> | |
147 <param name="bc_pattern" value="CCCCCCCCNNNNNNNN" /> | |
148 <param name="method" value="reads" /> | |
149 <param name="prime3" value="true" /> | |
150 <output name="out_whitelist" file="out_wl_single.txt" lines_diff="40" /> | |
151 <output name="out_thresh" file="out_wl_single.tresh.tab" /> | |
152 <output name="out_html_report" file="out_wl_single.html" /> | |
153 </test> | |
<!-- Test 2: paired-end input, barcode on both reads, advanced cell options and log enabled; four outputs are expected. --> | |
<!-- NOTE(review): this test supplies bc_pattern2; confirm the expected files were generated with the second-read pattern applied. --> | |
154 <test expect_num_outputs="4"> | |
155 <param name="type" value="paired" /> | |
156 <param name="input_read1" value="t_R1.fastq.gz" ftype="fastq" /> | |
157 <param name="input_read2" value="t_R2.fastq.gz" ftype="fastq" /> | |
158 <param name="barcode_select" value="both_reads" /> | |
159 <param name="bc_pattern" value="CCCNNNNNNNNXXXXX" /> | |
160 <param name="bc_pattern2" value="CCCCCCCCNNNNNNNN" /> | |
161 <param name="method" value="reads" /> | |
162 <param name="prime3" value="false" /> | |
163 <param name="use_cell_opts" value="advanced" /> | |
164 <param name="expect_cells" value="5" /> | |
165 <param name="error_correct_thresh" value="3" /> | |
166 <param name="log" value="true" /> | |
167 <output name="out_whitelist" file="out_wl_paired.txt" /> | |
168 <output name="out_log" file="out_wl_paired.log" lines_diff="40" /> | |
169 <output name="out_html_report" file="out_wl_paired.html" /> | |
170 <output name="out_thresh" file="out_wl_paired.tresh.tab" /> | |
171 </test> | |
172 </tests> | |
173 <help><![CDATA[ | |
174 | |
175 | |
176 UMI-tools whitelist - Extract barcodes from fastq | |
177 ================================================== | |
178 | |
179 Purpose | |
180 ------- | |
181 | |
182 Extract cell barcodes and identify the most likely true barcodes using | |
183 the 'knee' method. | |
184 | |
185 Options | |
186 ------- | |
187 | |
188 --bc-pattern | |
189 This should be used where the barcodes are always in the same | |
190 place in the read. | |
191 | |
192 - N = UMI position (required) | |
193 - C = cell barcode position (optional) | |
194 - X = sample position (optional) | |
195 | |
196 Bases with Ns and Cs will be extracted and added to the read | |
197 name. The corresponding sequence qualities will be removed from | |
198 the read. Bases with an X will be reattached to the read. | |
199 | |
200 E.g. If the pattern is NNNNCC, | |
201 Then the read: | |
202 @HISEQ:87:00000000 read1 | |
203 AAGGTTGCTGATTGGATGGGCTAG | |
204 DA1AEBFGGCG01DFH00B1FF0B | |
205 + | |
206 will become: | |
207 @HISEQ:87:00000000_TT_AAGG read1 | |
208 GCTGATTGGATGGGCTAG | |
209 FGGCG01DFH00B1FF0B | |
210 + | |
211 | |
212 where 'TT' is the cell barcode and 'AAGG' is the UMI. | |
213 | |
214 | |
215 --set-cell-number | |
216 Use this option to explicitly set the number of cell barcodes | |
217 which should be accepted. Note that the exact number of cell | |
218 barcodes in the outputted whitelist may be slightly less than | |
219 this if there are multiple cells observed with the same | |
220 frequency at the threshold between accepted and rejected cell | |
221 barcodes. | |
222 | |
223 --expect-cells=[EXPECTED_CELLS] | |
224 An upper limit estimate for the number of inputted cells. The knee | |
225 method will now select the first threshold (in ascending order) | |
226 which results in the number of cell barcodes accepted being <= | |
227 EXPECTED_CELLS and > EXPECTED_CELLS * 0.1. | |
228 | |
229 | |
230 --bc-pattern2 | |
231 Use this option to specify the format of the UMI/barcode for | |
232 the second read pair if required. If --bc-pattern2 is not | |
233 supplied, this defaults to the same pattern as --bc-pattern | |
234 | |
235 --3prime | |
236 By default the barcode is assumed to be on the 5' end of the read, but | |
237 use this option to specify that it is on the 3' end instead | |
238 | |
239 Usage: | |
240 ------ | |
241 | |
242 For single ended reads: | |
243 umi_tools whitelist --bc-pattern=[PATTERN] -L extract.log | |
244 [OPTIONS] | |
245 | |
246 reads from stdin and outputs to stdout. | |
247 | |
248 For paired end reads where the cell barcode is split across the read pairs: | |
249 umi_tools whitelist --bc-pattern=[PATTERN] | |
250 --bc-pattern2=[PATTERN] --read2-in=[FASTQIN] -L extract.log | |
251 [OPTIONS] | |
252 | |
253 reads end one from stdin and end two from FASTQIN and outputs to stdout | |
254 | |
255 | |
256 Output: | |
257 ------- | |
258 | |
259 The whitelist is outputted as 4 tab-separated columns: | |
260 | |
261 1. whitelisted cell barcode | |
262 2. Other cell barcode(s) (comma-separated) to correct to the | |
263 whitelisted barcode | |
264 3. Count for whitelisted cell barcodes | |
265 4. Count(s) for the other cell barcode(s) (comma-separated) | |
266 | |
267 example output: | |
268 | |
269 AAAAAA AGAAAA 146 1 | |
270 AAAATC 22 | |
271 AAACAT 21 | |
272 AAACTA AAACTN,GAACTA 27 1,1 | |
273 AAATAC 72 | |
274 AAATCA GAATCA 37 3 | |
275 AAATGT AAAGGT,CAATGT 41 1,1 | |
276 AAATTG CAATTG 36 1 | |
277 AACAAT 18 | |
278 AACATA 24 | |
279 | |
280 If --error-correct-threshold is set to 0, columns 2 and 4 will be empty. | |
281 | |
282 | |
283 ]]></help> | |
284 <expand macro="citations" /> | |
285 </tool> |