comparison barcode_splitter.xml @ 0:ff12d2c1f5d6 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/barcode_splitter commit 8f4f88267b8adfa035539230eab3d8eea6896e10
author iuc
date Wed, 29 May 2019 10:24:27 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:ff12d2c1f5d6
1 <tool id="barcode_splitter" name="Barcode Splitter" version="0.18.4.0">
2 <description>Split sequence files using multiple sets of barcodes</description>
3 <requirements>
4 <requirement type="package" version="0.18.4">barcode_splitter</requirement>
5 </requirements>
6 <version_command>barcode_splitter --version</version_command>
7 <command detect_errors="exit_code">
8 <![CDATA[
9 rc=0; mkdir split &&
10 barcode_splitter
11 ## rc is initialised above and set to the failing status by barcode_splitter (or mkdir) and by any later mv; the last line of the script turns it into the job exit code
12 --bcfile '$bcfile' --mismatches '$mismatches' --galaxy $barcodes_at_end --prefix 'split/'
13 ## Template state shared by all run types: effective split_all flag, input extension, matched-pair marker used in output names, and the index-file count
14 #set $auto_split_all = $split_all
15 #set $format = ""
16 #set $paired_match = "0"
17 #set $num_index_files = 0
18
19 #if str($runinterface.run_type) == "flexible":
20
21 ## Determine the format based on the first input file's extension
22 ## #for $indf in $runinterface.seqfiles
23 ## #set $format = $indf.input.extension
24 ## #break
25 ## #end for
26 #set $format = $runinterface.seqfiles[0].input.extension
27
28 ## Add the sequence files, count the number of forward, reverse, and other non-index read names, and auto-set split_all
29 #set $numforwards = 0
30 #set $numreverses = 0
31 #for $sf in $runinterface.seqfiles
32 #if str($sf.nameinterface.readtype) == "forward" or str($sf.nameinterface.readtype) == "forwardindex":
33 #set $numforwards += 1
34 #elif str($sf.nameinterface.readtype) == "reverse" or str($sf.nameinterface.readtype) == "reverseindex":
35 #set $numreverses += 1
36 #end if
37 #if str($sf.nameinterface.readtype) == "singleindex" or str($sf.nameinterface.readtype) == "forwardindex" or str($sf.nameinterface.readtype) == "reverseindex":
38 #set $auto_split_all = "--split_all"
39 #end if
40 '${sf.input}'
41 #end for
42
43 ## Determine whether valid paired-end data exists
44 #if $numforwards == 1 and $numreverses == 1:
45 #set $paired_match = "1"
46 #end if
47
48 ##Add the relative file argument positions of the index files in the list included above
49 --idxread
50 #set $n = 0
51 #for $sf in $runinterface.seqfiles
52 #set $n += 1
53 #if str($sf.nameinterface.readtype) == 'index' or str($sf.nameinterface.readtype) == "singleindex" or str($sf.nameinterface.readtype) == "forwardindex" or str($sf.nameinterface.readtype) == "reverseindex":
54 #set $num_index_files += 1
55 '${n}'
56 #end if
57 #end for
58
59 #elif str($runinterface.run_type) == "single":
60
61 #set $format = $runinterface.snglinput.extension
62
63 ## Add the sequence files
64 '${runinterface.snglinput}'
65 #for $sf in $runinterface.idxfiles
66 '${sf.idxinput}'
67 #end for
68
69 ##Add the relative file argument positions of the index files
70 --idxread
71 #set $n = 1
72 #for $sf in $runinterface.idxfiles
73 #set $num_index_files += 1
74 #set $n += 1
75 '${n}'
76 #end for
77
78 #elif str($runinterface.run_type) == "paired":
79
80 #set $format = $runinterface.fwdinput.extension
81 #set $paired_match = "1"
82
83 ## Add the sequence files
84 '${runinterface.fwdinput}' '${runinterface.revinput}'
85 #for $sf in $runinterface.idxfiles
86 '${sf.idxinput}'
87 #end for
88
89 ##Add the relative file argument positions of the index files
90 --idxread
91 #set $n = 2
92 #for $sf in $runinterface.idxfiles
93 #set $num_index_files += 1
94 #set $n += 1
95 '${n}'
96 #end for
97
98 #end if
99
100 --format '${format}' --suffix '.${format}' $auto_split_all
101
102 > '$summary' || rc=\$?;
103
104 ## Determine validity of repeat params: without any index file there is nothing to split on, so report it on stderr and force a failing exit status
105 #if $num_index_files == 0:
106 echo "ERROR: At least 1 read file must have a 'Read Type' selected as 'Index', 'Single-End with Index', 'Forward with index', or 'Reverse with Index'." >&2; rc=1;
107 #end if
108
109 ## Give the output split file names the name specified by the user (to be turned into collection identifiers)
110 ## rc was initialised before barcode_splitter ran; it is deliberately not reset here so that an earlier failure is not lost
111 #set $n = 0
112
113 #if str($runinterface.run_type) == "flexible":
114
115 #for $repeat in $runinterface.seqfiles
116
117 #set $n += 1
118
119 #set $readname = "read-" + str($n)
120 #set $readtype = "index"
121 #if str($repeat.nameinterface.readtype) == "forward" or str($repeat.nameinterface.readtype) == "forwardindex":
122 #set $readname = "forward"
123 #set $readtype = "forward"
124 #elif str($repeat.nameinterface.readtype) == "reverse" or str($repeat.nameinterface.readtype) == "reverseindex":
125 #set $readname = "reverse"
126 #set $readtype = "reverse"
127 #else:
128 #if (str($repeat.nameinterface.readtype) == "single" or str($repeat.nameinterface.readtype) == "singleindex" or str($repeat.nameinterface.readtype) == "index") and str($repeat.nameinterface.readname) != "":
129 #set $readname = $repeat.nameinterface.readname
130 #end if
131 #if str($repeat.nameinterface.readtype) == "single" or str($repeat.nameinterface.readtype) == "singleindex":
132 #set $readtype = "single"
133 #end if
134 #end if
135 ## NOTE(review): every forward* file gets the fixed read name "forward" (likewise "reverse"), so two forward files yield the same target name per sample and the later mv overwrites the earlier one - confirm whether several forward/reverse files are meant to be supported
136 for f in split/*-read-'${n}'.'${format}';
137 do
138 if [ -e "\$f" ]; then
139 ## Space-delimited file name structure: "SAMPLE READNAME READTYPE MATCHEDPAIR.EXT"
140 mv "\$f" "\${f/-read-${n}/ ${readname} ${readtype} ${paired_match}}" || rc=\$?;
141 fi;
142 done;
143 #end for
144
145 #elif str($runinterface.run_type) == "single":
146
147 #set $n = 1
148
149 ## Rename the single-end split read files
150 #set $readname = "read-" + str($n)
151 #set $readtype = 'single'
152 for f in split/*-read-'${n}'.'${format}';
153 do
154 mv "\$f" "\${f/-read-${n}/ ${readname} ${readtype} ${paired_match}}" || rc=\$?;
155 done;
156
157 ## Rename the index read files
158 #set $readtype = "index"
159 #for $repeat in $runinterface.idxfiles
160 #set $n += 1
161
162 #set $readname = "read-" + str($n)
163 #if str($repeat.idxreadname) != "":
164 #set $readname = $repeat.idxreadname
165 #end if
166
167 for f in split/*-read-'${n}'.'${format}';
168 do
169 if [ -e "\$f" ]; then
170 ## Space-delimited file name structure: "SAMPLE READNAME READTYPE MATCHEDPAIR.EXT"
171 mv "\$f" "\${f/-read-${n}/ ${readname} ${readtype} ${paired_match}}" || rc=\$?;
172 fi;
173 done;
174 #end for
175
176 #elif str($runinterface.run_type) == "paired":
177
178 #set $n = 1
179
180 ## Rename the forward split read files
181 #set $readname = "forward"
182 #set $readtype = "forward"
183 for f in split/*-read-'${n}'.'${format}';
184 do
185 mv "\$f" "\${f/-read-${n}/ ${readname} ${readtype} ${paired_match}}" || rc=\$?;
186 done;
187
188 #set $n += 1
189
190 ## Rename the reverse split read files
191 #set $readname = "reverse"
192 #set $readtype = "reverse"
193 for f in split/*-read-'${n}'.'${format}';
194 do
195 mv "\$f" "\${f/-read-${n}/ ${readname} ${readtype} ${paired_match}}" || rc=\$?;
196 done;
197
198 ## Rename the index read files
199 #set $readtype = "index"
200 #for $repeat in $runinterface.idxfiles
201 #set $n += 1
202
203 #set $readname = "read-" + str($n)
204 #if str($repeat.idxreadname) != "":
205 #set $readname = $repeat.idxreadname
206 #end if
207
208 for f in split/*-read-'${n}'.'${format}';
209 do
210 if [ -e "\$f" ]; then
211 ## Space-delimited file name structure: "SAMPLE READNAME READTYPE MATCHEDPAIR.EXT"
212 mv "\$f" "\${f/-read-${n}/ ${readname} ${readtype} ${paired_match}}" || rc=\$?;
213 fi;
214 done;
215 #end for
216
217 #end if
218
219 ## Exit non-zero if rc is not 0
220 [ "\$rc" -eq 0 ];
221 ]]>
222 </command>
223
224 <inputs>
225 <param name="bcfile" type="data" format="tabular" label="Barcode File" help="Tab-delimited text file where the first column is a sample ID and subsequent columns are barcodes. Note, files with indexes/barcode sequences must be supplied in the same order as the barcode columns in this file and their 'Read Type' must have 'Index'." />
226 <!-- run_type selects one of three input layouts; the command template branches on the same three values -->
227 <conditional name="runinterface">
228 <param label="Run Type" name="run_type" type="select">
229 <option value="single" selected="true">Single-End</option>
230 <option value="paired">Paired-End</option>
231 <option value="flexible">Flexible</option>
232 </param>
233 <when value="single">
234 <!-- Simplified interface for common use-case. Can still use "Flexible" for alternative single-end options. -->
235 <param name="snglinput" type="data" format="fastq,fastqsanger,fastqsolexa,fastqillumina" label="Read File" help="Typically, 'Read 1'." />
236 <repeat name="idxfiles" title="Index File(s)" min="1" default="1">
237 <param name="idxinput" type="data" format="fastq,fastqsanger,fastqsolexa,fastqillumina" label="Index File" help="Typically, 'Read 2 Index'." />
238 <param name="idxreadname" type="text" label="Index Name" help="E.g. index1, index2, etc.. Letters and numbers only. No spaces. Default is 'read-x' where 'x' is a number corresponding to the order in which the file was supplied (including the read file above).">
239 <sanitizer>
240 <valid initial="string.ascii_letters,string.digits" />
241 </sanitizer>
242 </param>
243 </repeat>
244 </when>
245 <when value="paired">
246 <!-- No embedded index allowed in the forward/reverse reads for simplicity. Can still use "Flexible" for that. -->
247 <param name="fwdinput" type="data" format="fastq,fastqsanger,fastqsolexa,fastqillumina" label="Forward Read File" help="Typically, 'Read 1'." />
248 <param name="revinput" type="data" format="fastq,fastqsanger,fastqsolexa,fastqillumina" label="Reverse Read File" help="Typically, 'Read 3'." />
249 <!-- min="1": this layout has no embedded-index read types, so a run without index files can only end in the "At least 1 read file" error -->
250 <repeat name="idxfiles" title="Index Files" min="1" default="1">
251 <param name="idxinput" type="data" format="fastq,fastqsanger,fastqsolexa,fastqillumina" label="Index File" help="Typically, 'Read 2 Index' or 'Read 4 Index'." />
252 <param name="idxreadname" type="text" label="Index Name" help="E.g. index1, index2, etc.. Letters and numbers only. No spaces. Default is 'read-x' where 'x' is a number corresponding to the order in which the file was supplied (including the forward and reverse files).">
253 <sanitizer>
254 <valid initial="string.ascii_letters,string.digits" />
255 </sanitizer>
256 </param>
257 </repeat>
258 </when>
259 <when value="flexible">
260 <repeat name="seqfiles" title="Read Files" min="1" default="1">
261 <param name="input" type="data" format="fastq,fastqsanger,fastqsolexa,fastqillumina" label="Read File" help="Forward or single-end read files are typically named 'Read 1'. Index files are typically named 'Read 2 Index' or 'Read 4 Index'. Reverse read files are typically named 'Read 3'." />
262 <conditional name="nameinterface">
263 <param name="readtype" type="select" label="Read Type" help="E.g. If one file is 'Forward' and one is 'Reverse', split reads from those files will appear in an output paired collection. If your file has reads with barcodes embedded in them, select their 'with index' variant, e.g. 'Forward with Index'.">
264 <option value="single" selected="true">Single-End</option>
265 <option value="forward">Forward</option>
266 <option value="reverse">Reverse</option>
267 <option value="index">Index</option>
268 <option value="singleindex">Single-End with Index</option>
269 <option value="forwardindex">Forward with Index</option>
270 <option value="reverseindex">Reverse with Index</option>
271 </param>
272 <when value="single">
273 <param name="readname" type="text" label="Read Name" help="E.g. index1, index2, etc.. Ignored for paired-collections. Letters and numbers only. No spaces. Default is 'read-x' where 'x' is a number corresponding to the order in which the file was supplied.">
274 <sanitizer>
275 <valid initial="string.ascii_letters,string.digits" />
276 </sanitizer>
277 </param>
278 </when>
279 <when value="singleindex">
280 <param name="readname" type="text" label="Read Name" help="E.g. index1, index2, etc.. Ignored for paired-collections. Letters and numbers only. No spaces. Default is 'read-x' where 'x' is a number corresponding to the order in which the file was supplied.">
281 <sanitizer>
282 <valid initial="string.ascii_letters,string.digits" />
283 </sanitizer>
284 </param>
285 </when>
286 <when value="index">
287 <param name="readname" type="text" label="Read Name" help="E.g. index1, index2, etc.. Ignored for paired-collections. Letters and numbers only. No spaces. Default is 'read-x' where 'x' is a number corresponding to the order in which the file was supplied.">
288 <sanitizer>
289 <valid initial="string.ascii_letters,string.digits" />
290 </sanitizer>
291 </param>
292 </when>
293 </conditional>
294 </repeat>
295 </when>
296 </conditional>
297 <!-- The options below sit outside the run_type conditional and therefore apply to every run type -->
298 <param name="mismatches" type="integer" value="1" max="2" min="0" label="Number of allowed mismatches" help="An integer between 0 and 2 (inclusive). Warning: Make sure all your barcodes differ by at least double this value plus 1, otherwise sequences that match both barcodes equally well will be discarded as 'multimatched' reads." />
299 <param argument="--barcodes_at_end" type="boolean" truevalue="--barcodes_at_end" falsevalue="" checked="false" label="Barcodes are at the end of all index sequences" help="Default is the beginning of all sequences" />
300 <param argument="--split_all" type="boolean" truevalue="--split_all" falsevalue="" checked="false" label="Split index files too" help="This creates an output collection for split files whose selected 'Read Type' is 'Index'." />
301 </inputs>
302
303 <outputs>
304 <data format="tabular" name="summary" label="${tool.name} on ${on_string}: Summary" />
305 <!-- Collection for single-end split reads, regardless of run_type; picks up renamed files whose matched-pair field is 0 and uses the sample name alone as the element identifier -->
306 <collection name="split_output_single" type="list" format_source="input" label="${tool.name} on ${on_string}: single-end">
307 <!-- run_type is single, OR run_type is flexible and exactly one seqfile has a readtype other than "index" -->
308 <filter>str(runinterface['run_type']) == "single" or (str(runinterface['run_type']) == "flexible" and len(list(filter(lambda x: str(x['nameinterface']['readtype']) != "index", runinterface['seqfiles']))) == 1)</filter>
309 <discover_datasets pattern="(?P&lt;identifier_0&gt;\S+)\ \S+\ (single|forward|reverse) 0\.(?P&lt;ext&gt;.*)" directory="split" visible="false" />
310 </collection>
311 <!-- Collection for paired-end split reads, regardless of run_type; picks up forward/reverse files whose matched-pair field is 1 -->
312 <collection name="split_output_paired" type="list:paired" format_source="input" label="${tool.name} on ${on_string}: paired-end">
313 <!-- run_type is paired, OR run_type is flexible with exactly one forward* and exactly one reverse* readtype -->
314 <filter>str(runinterface['run_type']) == "paired" or (str(runinterface['run_type']) == "flexible" and len(list(filter(lambda x: str(x['nameinterface']['readtype']).startswith('forward'), runinterface['seqfiles']))) == 1 and len(list(filter(lambda x: str(x['nameinterface']['readtype']).startswith('reverse'), runinterface['seqfiles']))) == 1)</filter>
315 <discover_datasets pattern="(?P&lt;identifier_0&gt;\S+)\ \S+\ (?P&lt;identifier_1&gt;forward|reverse) 1\.(?P&lt;ext&gt;.*)" directory="split" visible="false" />
316 </collection>
317 <!-- Collection for single-end split reads that sit alongside a paired-end collection; the element identifier is "SAMPLE READNAME" -->
318 <collection name="split_output_paired_other" type="list" format_source="input" label="${tool.name} on ${on_string}: other-end">
319 <!-- run_type is flexible, at least one readtype starts with "single", and there is exactly one forward* and exactly one reverse* readtype -->
320 <filter>str(runinterface['run_type']) == "flexible" and len(list(filter(lambda x: str(x['nameinterface']['readtype']).startswith('single'), runinterface['seqfiles']))) > 0 and len(list(filter(lambda x: str(x['nameinterface']['readtype']).startswith('forward'), runinterface['seqfiles']))) == 1 and len(list(filter(lambda x: str(x['nameinterface']['readtype']).startswith('reverse'), runinterface['seqfiles']))) == 1</filter>
321 <discover_datasets pattern="(?P&lt;identifier_0&gt;\S+\ \S+)\ single 1\.(?P&lt;ext&gt;.*)" directory="split" visible="false" />
322 </collection>
323 <!-- Collection for multi-end split reads; the element identifier is "SAMPLE READNAME" -->
324 <collection name="split_output_multi" type="list" format_source="input" label="${tool.name} on ${on_string}: multi-end">
325 <!-- run_type is flexible, more than one seqfile has a readtype other than "index", and there is not exactly one forward* plus exactly one reverse* readtype -->
326 <filter>str(runinterface['run_type']) == "flexible" and (len(runinterface['seqfiles']) - len(list(filter(lambda x: str(x['nameinterface']['readtype']) == "index", runinterface['seqfiles'])))) > 1 and (len(list(filter(lambda x: str(x['nameinterface']['readtype']).startswith('forward'), runinterface['seqfiles']))) != 1 or len(list(filter(lambda x: str(x['nameinterface']['readtype']).startswith('reverse'), runinterface['seqfiles']))) != 1)</filter>
327 <discover_datasets pattern="(?P&lt;identifier_0&gt;\S+ \S+) (single (0|1)|(forward|reverse) 0)\.(?P&lt;ext&gt;.*)" directory="split" visible="false" />
328 </collection>
329 <!-- Collection for split_all index-only reads; the element identifier is everything before " index 0" or " index 1" in the file name -->
330 <collection name="index_only" type="list" format_source="input" label="${tool.name} on ${on_string}: indexes">
331 <!-- split_all is checked and at least one index-only file was supplied: a non-empty idxfiles repeat for single/paired runs, or a seqfile with readtype "index" for flexible runs -->
332 <filter>split_all and ((str(runinterface['run_type']) != "flexible" and len(runinterface['idxfiles']) > 0) or (str(runinterface['run_type']) == "flexible" and len(list(filter(lambda x: str(x['nameinterface']['readtype']) == "index", runinterface['seqfiles']))) > 0))</filter>
333 <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?) index (0|1)\.(?P&lt;ext&gt;.*)" directory="split" visible="false" />
334 </collection>
335 </outputs>
336
337 <tests>
338 <test>
339 <!-- single end with 1 index -->
340 <param name="run_type" value="single" />
341 <param name="bcfile" value="barcode_splitter_barcodes.txt" />
342 <param name="mismatches" value="2" />
343 <param name="barcodes_at_end" value="" />
344 <param name="split_all" value="False" />
345 <param name="snglinput" value="barcode_splitter1.fastq" ftype="fastq" />
346
347 <repeat name="idxfiles">
348 <param name="idxinput" value="barcode_splitter_index.fastq" ftype="fastq" />
349 <param name="idxreadname" value="read" />
350 </repeat>
351
352 <output name="summary" file="test_1_summary.out" />
353 <output_collection name="split_output_single" type="list">
354 <element name="BC1" ftype="fastq" file="test_1_BC1-read-1.out" />
355 <element name="BC2" ftype="fastq" file="test_1_BC2-read-1.out" />
356 <element name="BC3" ftype="fastq" file="test_1_BC3-read-1.out" />
357 <element name="BC4" ftype="fastq" file="test_1_BC4-read-1.out" />
358 <element name="unmatched" ftype="fastq" file="test_1_unmatched-read-1.out" />
359 <element name="multimatched" ftype="fastq" file="test_1_multimatched-read-1.out" />
360 </output_collection>
361 </test>
362 <test>
363 <!-- single end with 2 indexes and split all -->
364 <param name="run_type" value="single" />
365 <param name="bcfile" value="barcode_splitter_barcodes_dual.txt" />
366 <param name="mismatches" value="2" />
367 <param name="barcodes_at_end" value="" />
368 <param name="split_all" value="True" />
369 <param name="snglinput" value="barcode_splitter1.fastq" ftype="fastq" />
370
371 <repeat name="idxfiles">
372 <param name="idxinput" value="barcode_splitter_index.fastq" ftype="fastq" />
373 <param name="idxreadname" value="index1" />
374 </repeat>
375 <repeat name="idxfiles">
376 <param name="idxinput" value="barcode_splitter_index_2.fastq" ftype="fastq" />
377 <param name="idxreadname" value="index2" />
378 </repeat>
379
380 <output name="summary" file="test_2_summary.out" />
381 <output_collection name="split_output_single" type="list">
382 <element name="BC1" ftype="fastq" file="test_2_BC1-read-1.fastq" />
383 <element name="BC2" ftype="fastq" file="test_2_BC2-read-1.fastq" />
384 <element name="BC3" ftype="fastq" file="test_2_BC3-read-1.fastq" />
385 <element name="BC4" ftype="fastq" file="test_2_BC4-read-1.fastq" />
386 <element name="unmatched" ftype="fastq" file="test_2_unmatched-read-1.fastq" />
387 <element name="multimatched" ftype="fastq" file="test_2_multimatched-read-1.fastq" />
388 </output_collection>
389 <output_collection name="index_only" type="list">
390 <element name="BC1 index1" ftype="fastq" file="test_2_BC1_index1.fastq" />
391 <element name="BC2 index1" ftype="fastq" file="test_2_BC2_index1.fastq" />
392 <element name="BC3 index1" ftype="fastq" file="test_2_BC3_index1.fastq" />
393 <element name="BC4 index1" ftype="fastq" file="test_2_BC4_index1.fastq" />
394 <element name="unmatched index1" ftype="fastq" file="test_2_unmatched_index1.fastq" />
395 <element name="multimatched index1" ftype="fastq" file="test_2_multimatched_index1.fastq" />
396 <element name="BC1 index2" ftype="fastq" file="test_2_BC1_index2.fastq" />
397 <element name="BC2 index2" ftype="fastq" file="test_2_BC2_index2.fastq" />
398 <element name="BC3 index2" ftype="fastq" file="test_2_BC3_index2.fastq" />
399 <element name="BC4 index2" ftype="fastq" file="test_2_BC4_index2.fastq" />
400 <element name="unmatched index2" ftype="fastq" file="test_2_unmatched_index2.fastq" />
401 <element name="multimatched index2" ftype="fastq" file="test_2_multimatched_index2.fastq" />
402 </output_collection>
403 </test>
404 <test>
405 <!-- paired end with one index -->
406 <param name="run_type" value="paired" />
407 <param name="bcfile" value="barcode_splitter_barcodes.txt" />
408 <param name="mismatches" value="2" />
409 <param name="barcodes_at_end" value="" />
410 <param name="split_all" value="False" />
411 <param name="fwdinput" value="barcode_splitter1.fastq" ftype="fastq" />
412 <param name="revinput" value="barcode_splitter_index_2.fastq" ftype="fastq" />
413
414 <repeat name="idxfiles">
415 <param name="idxinput" value="barcode_splitter_index.fastq" ftype="fastq" />
416 <param name="idxreadname" value="index" />
417 </repeat>
418
419 <output name="summary" file="test_1_summary.out" />
420 <output_collection name="split_output_paired" type="list:paired">
421 <element name="BC1">
422 <element name="forward" ftype="fastq" file="test_3_BC1-forward.fastq" />
423 <element name="reverse" ftype="fastq" file="test_3_BC1-reverse.fastq" />
424 </element>
425 <element name="BC2">
426 <element name="forward" ftype="fastq" file="test_3_BC2-forward.fastq" />
427 <element name="reverse" ftype="fastq" file="test_3_BC2-reverse.fastq" />
428 </element>
429 <element name="BC3">
430 <element name="forward" ftype="fastq" file="test_3_BC3-forward.fastq" />
431 <element name="reverse" ftype="fastq" file="test_3_BC3-reverse.fastq" />
432 </element>
433 <element name="BC4">
434 <element name="forward" ftype="fastq" file="test_3_BC4-forward.fastq" />
435 <element name="reverse" ftype="fastq" file="test_3_BC4-reverse.fastq" />
436 </element>
437 <element name="unmatched">
438 <element name="forward" ftype="fastq" file="test_3_unmatched-forward.fastq" />
439 <element name="reverse" ftype="fastq" file="test_3_unmatched-reverse.fastq" />
440 </element>
441 <element name="multimatched">
442 <element name="forward" ftype="fastq" file="test_3_multimatched-forward.fastq" />
443 <element name="reverse" ftype="fastq" file="test_3_multimatched-reverse.fastq" />
444 </element>
445 </output_collection>
446 </test>
447 <test>
448 <!-- paired end 2 indexes and split_all -->
449 <param name="run_type" value="paired" />
450 <param name="bcfile" value="barcode_splitter_barcodes_dual.txt" />
451 <param name="mismatches" value="2" />
452 <param name="barcodes_at_end" value="" />
453 <param name="split_all" value="True" />
454 <param name="fwdinput" value="barcode_splitter1.fastq" ftype="fastq" />
455 <param name="revinput" value="barcode_splitter_index_2.fastq" ftype="fastq" />
456
457 <repeat name="idxfiles">
458 <param name="idxinput" value="barcode_splitter_index.fastq" ftype="fastq" />
459 <param name="idxreadname" value="index1" />
460 </repeat>
461 <repeat name="idxfiles">
462 <param name="idxinput" value="barcode_splitter_index_2.fastq" ftype="fastq" />
463 <param name="idxreadname" value="index2" />
464 </repeat>
465
466 <output name="summary" file="test_2_summary.out" />
467 <output_collection name="split_output_paired" type="list:paired">
468 <element name="BC1">
469 <element name="forward" ftype="fastq" file="test_3_BC1-forward.fastq" />
470 <element name="reverse" ftype="fastq" file="test_3_BC1-reverse.fastq" />
471 </element>
472 <element name="BC2">
473 <element name="forward" ftype="fastq" file="test_3_BC2-forward.fastq" />
474 <element name="reverse" ftype="fastq" file="test_3_BC2-reverse.fastq" />
475 </element>
476 <element name="BC3">
477 <element name="forward" ftype="fastq" file="test_3_BC3-forward.fastq" />
478 <element name="reverse" ftype="fastq" file="test_3_BC3-reverse.fastq" />
479 </element>
480 <element name="BC4">
481 <element name="forward" ftype="fastq" file="test_3_BC4-forward.fastq" />
482 <element name="reverse" ftype="fastq" file="test_3_BC4-reverse.fastq" />
483 </element>
484 <element name="unmatched">
485 <element name="forward" ftype="fastq" file="test_3_unmatched-forward.fastq" />
486 <element name="reverse" ftype="fastq" file="test_3_unmatched-reverse.fastq" />
487 </element>
488 <element name="multimatched">
489 <element name="forward" ftype="fastq" file="test_3_multimatched-forward.fastq" />
490 <element name="reverse" ftype="fastq" file="test_3_multimatched-reverse.fastq" />
491 </element>
492 </output_collection>
493 <output_collection name="index_only" type="list">
494 <element name="BC1 index1" ftype="fastq" file="test_2_BC1_index1.fastq" />
495 <element name="BC2 index1" ftype="fastq" file="test_2_BC2_index1.fastq" />
496 <element name="BC3 index1" ftype="fastq" file="test_2_BC3_index1.fastq" />
497 <element name="BC4 index1" ftype="fastq" file="test_2_BC4_index1.fastq" />
498 <element name="unmatched index1" ftype="fastq" file="test_2_unmatched_index1.fastq" />
499 <element name="multimatched index1" ftype="fastq" file="test_2_multimatched_index1.fastq" />
500 <element name="BC1 index2" ftype="fastq" file="test_2_BC1_index2.fastq" />
501 <element name="BC2 index2" ftype="fastq" file="test_2_BC2_index2.fastq" />
502 <element name="BC3 index2" ftype="fastq" file="test_2_BC3_index2.fastq" />
503 <element name="BC4 index2" ftype="fastq" file="test_2_BC4_index2.fastq" />
504 <element name="unmatched index2" ftype="fastq" file="test_2_unmatched_index2.fastq" />
505 <element name="multimatched index2" ftype="fastq" file="test_2_multimatched_index2.fastq" />
506 </output_collection>
507 </test>
508 <test>
509 <!-- flexible with single end containing index -->
510 <param name="bcfile" value="barcode_splitter_barcodes.txt" />
511 <param name="mismatches" value="2" />
512 <param name="barcodes_at_end" value="" />
513 <param name="split_all" value="False" />
514
515 <conditional name="runinterface">
516 <param name="run_type" value="flexible" />
517 <repeat name="seqfiles">
518 <param name="input" ftype="fastq" value="barcode_splitter1.fastq" />
519 <conditional name="nameinterface">
520 <param name="readtype" value="singleindex" />
521 <param name="readname" value="read" />
522 </conditional>
523 </repeat>
524 </conditional>
525
526 <output name="summary" file="test_5_summary.out" />
527 <output_collection name="split_output_single" type="list">
528 <element name="BC1" ftype="fastq" file="test_1_BC1-read-1.out" />
529 <element name="BC2" ftype="fastq" file="test_1_BC2-read-1.out" />
530 <element name="BC3" ftype="fastq" file="test_1_BC3-read-1.out" />
531 <element name="BC4" ftype="fastq" file="test_5_BC4-read-1.out" />
532 <element name="unmatched" ftype="fastq" file="test_5_unmatched-read-1.out" />
533 <element name="multimatched" ftype="fastq" file="test_1_multimatched-read-1.out" />
534 </output_collection>
535 </test>
536 <test>
537 <!-- flexible with first single end no index, second containing index, 1 separate index, and split all -->
538 <param name="bcfile" value="barcode_splitter_barcodes_dual.txt" />
539 <param name="mismatches" value="2" />
540 <param name="barcodes_at_end" value="" />
541 <param name="split_all" value="True" />
542
543 <conditional name="runinterface">
544 <param name="run_type" value="flexible" />
545 <repeat name="seqfiles">
546 <param name="input" ftype="fastq" value="barcode_splitter1.fastq" />
547 <conditional name="nameinterface">
548 <param name="readtype" value="single" />
549 <param name="readname" value="read1" />
550 </conditional>
551 </repeat>
552 <repeat name="seqfiles">
553 <param name="input" ftype="fastq" value="barcode_splitter_index.fastq" />
554 <conditional name="nameinterface">
555 <param name="readtype" value="singleindex" />
556 <param name="readname" value="read2" />
557 </conditional>
558 </repeat>
559 <repeat name="seqfiles">
560 <param name="input" ftype="fastq" value="barcode_splitter_index_2.fastq" />
561 <conditional name="nameinterface">
562 <param name="readtype" value="index" />
563 <param name="readname" value="index2" />
564 </conditional>
565 </repeat>
566 </conditional>
567
568 <output name="summary" file="test_6_summary.out" />
569 <output_collection name="split_output_multi" type="list">
570 <element name="BC1 read1" ftype="fastq" file="test_6_BC1_read1.fastq" />
571 <element name="BC2 read1" ftype="fastq" file="test_6_BC2_read1.fastq" />
572 <element name="BC3 read1" ftype="fastq" file="test_6_BC3_read1.fastq" />
573 <element name="BC4 read1" ftype="fastq" file="test_6_BC4_read1.fastq" />
574 <element name="unmatched read1" ftype="fastq" file="test_6_unmatched_read1.fastq" />
575 <element name="multimatched read1" ftype="fastq" file="test_6_multimatched_read1.fastq" />
576 <element name="BC1 read2" ftype="fastq" file="test_6_BC1_read2.fastq" />
577 <element name="BC2 read2" ftype="fastq" file="test_6_BC2_read2.fastq" />
578 <element name="BC3 read2" ftype="fastq" file="test_6_BC3_read2.fastq" />
579 <element name="BC4 read2" ftype="fastq" file="test_6_BC4_read2.fastq" />
580 <element name="unmatched read2" ftype="fastq" file="test_6_unmatched_read2.fastq" />
581 <element name="multimatched read2" ftype="fastq" file="test_6_multimatched_read2.fastq" /> <!-- FAILS -->
582 </output_collection>
583 <output_collection name="index_only" type="list">
584 <element name="BC1 index2" ftype="fastq" file="test_6_BC1_index2.fastq" />
585 <element name="BC2 index2" ftype="fastq" file="test_6_BC2_index2.fastq" />
586 <element name="BC3 index2" ftype="fastq" file="test_6_BC3_index2.fastq" />
587 <element name="BC4 index2" ftype="fastq" file="test_6_BC4_index2.fastq" />
588 <element name="unmatched index2" ftype="fastq" file="test_6_unmatched_index2.fastq" />
589 <element name="multimatched index2" ftype="fastq" file="test_6_multimatched_index2.fastq" />
590 </output_collection>
591 </test>
592 <test>
593 <!-- flexible with paired end with one index and no split all -->
594 <param name="bcfile" value="barcode_splitter_barcodes.txt" />
595 <param name="mismatches" value="2" />
596 <param name="barcodes_at_end" value="" />
597 <param name="split_all" value="False" />
598
599 <conditional name="runinterface">
600 <param name="run_type" value="flexible" />
601 <repeat name="seqfiles">
602 <param name="input" ftype="fastq" value="barcode_splitter1.fastq" />
603 <conditional name="nameinterface">
604 <param name="readtype" value="forward" />
605 <param name="readname" value="" />
606 </conditional>
607 </repeat>
608 <repeat name="seqfiles">
609 <param name="input" ftype="fastq" value="barcode_splitter_index.fastq" />
610 <conditional name="nameinterface">
611 <param name="readtype" value="index" />
612 <param name="readname" value="index" />
613 </conditional>
614 </repeat>
615 <repeat name="seqfiles">
616 <param name="input" ftype="fastq" value="barcode_splitter_index_2.fastq" />
617 <conditional name="nameinterface">
618 <param name="readtype" value="reverse" />
619 <param name="readname" value="" />
620 </conditional>
621 </repeat>
622 </conditional>
623
624 <output name="summary" file="test_1_summary.out" />
625 <output_collection name="split_output_paired" type="list:paired">
626 <element name="BC1">
627 <element name="forward" ftype="fastq" file="test_3_BC1-forward.fastq" />
628 <element name="reverse" ftype="fastq" file="test_3_BC1-reverse.fastq" />
629 </element>
630 <element name="BC2">
631 <element name="forward" ftype="fastq" file="test_3_BC2-forward.fastq" />
632 <element name="reverse" ftype="fastq" file="test_3_BC2-reverse.fastq" />
633 </element>
634 <element name="BC3">
635 <element name="forward" ftype="fastq" file="test_3_BC3-forward.fastq" />
636 <element name="reverse" ftype="fastq" file="test_3_BC3-reverse.fastq" />
637 </element>
638 <element name="BC4">
639 <element name="forward" ftype="fastq" file="test_3_BC4-forward.fastq" />
640 <element name="reverse" ftype="fastq" file="test_3_BC4-reverse.fastq" />
641 </element>
642 <element name="unmatched">
643 <element name="forward" ftype="fastq" file="test_3_unmatched-forward.fastq" />
644 <element name="reverse" ftype="fastq" file="test_3_unmatched-reverse.fastq" />
645 </element>
646 <element name="multimatched">
647 <element name="forward" ftype="fastq" file="test_3_multimatched-forward.fastq" />
648 <element name="reverse" ftype="fastq" file="test_3_multimatched-reverse.fastq" />
649 </element>
650 </output_collection>
651 </test>
652 <test>
653 <!-- flexible with paired end with reverseindex, single end, separate index, and no split all -->
654 <param name="bcfile" value="barcode_splitter_barcodes_dual.txt" />
655 <param name="mismatches" value="2" />
656 <param name="barcodes_at_end" value="" />
657 <param name="split_all" value="False" />
658
659 <conditional name="runinterface">
660 <param name="run_type" value="flexible" />
661 <repeat name="seqfiles">
662 <param name="input" ftype="fastq" value="barcode_splitter1.fastq" />
663 <conditional name="nameinterface">
664 <param name="readtype" value="forward" />
665 <param name="readname" value="" />
666 </conditional>
667 </repeat>
668 <repeat name="seqfiles">
669 <param name="input" ftype="fastq" value="barcode_splitter1.fastq" />
670 <conditional name="nameinterface">
671 <param name="readtype" value="single" />
672 <param name="readname" value="read" />
673 </conditional>
674 </repeat>
675 <repeat name="seqfiles">
676 <param name="input" ftype="fastq" value="barcode_splitter_index_2.fastq" />
677 <conditional name="nameinterface">
678 <param name="readtype" value="index" />
679 <param name="readname" value="index" />
680 </conditional>
681 </repeat>
682 <repeat name="seqfiles">
683 <param name="input" ftype="fastq" value="barcode_splitter_index_2.fastq" />
684 <conditional name="nameinterface">
685 <param name="readtype" value="reverseindex" />
686 <param name="readname" value="" />
687 </conditional>
688 </repeat>
689 </conditional>
690
691 <output name="summary" file="test_2_summary.out" />
692 <output_collection name="split_output_paired" type="list:paired">
693 <element name="BC1">
694 <element name="forward" ftype="fastq" file="test_3_BC1-forward.fastq" />
695 <element name="reverse" ftype="fastq" file="test_3_BC1-reverse.fastq" />
696 </element>
697 <element name="BC2">
698 <element name="forward" ftype="fastq" file="test_3_BC2-forward.fastq" />
699 <element name="reverse" ftype="fastq" file="test_3_BC2-reverse.fastq" />
700 </element>
701 <element name="BC3">
702 <element name="forward" ftype="fastq" file="test_3_BC3-forward.fastq" />
703 <element name="reverse" ftype="fastq" file="test_3_BC3-reverse.fastq" />
704 </element>
705 <element name="BC4">
706 <element name="forward" ftype="fastq" file="test_3_BC4-forward.fastq" />
707 <element name="reverse" ftype="fastq" file="test_3_BC4-reverse.fastq" />
708 </element>
709 <element name="unmatched">
710 <element name="forward" ftype="fastq" file="test_3_unmatched-forward.fastq" />
711 <element name="reverse" ftype="fastq" file="test_3_unmatched-reverse.fastq" />
712 </element>
713 <element name="multimatched">
714 <element name="forward" ftype="fastq" file="test_3_multimatched-forward.fastq" />
715 <element name="reverse" ftype="fastq" file="test_3_multimatched-reverse.fastq" />
716 </element>
717 </output_collection>
718 <output_collection name="split_output_paired_other" type="list">
719 <element name="BC1 read" ftype="fastq" file="test_1_BC1-read-1.out" />
720 <element name="BC2 read" ftype="fastq" file="test_1_BC2-read-1.out" />
721 <element name="BC3 read" ftype="fastq" file="test_1_BC3-read-1.out" />
722 <element name="BC4 read" ftype="fastq" file="test_1_BC4-read-1.out" />
723 <element name="unmatched read" ftype="fastq" file="test_1_unmatched-read-1.out" />
724 <element name="multimatched read" ftype="fastq" file="test_1_multimatched-read-1.out" />
725 </output_collection>
726 </test>
727 </tests>
728
729 <help>
730 <![CDATA[
731 **What it does**
732
733 This tool splits a FASTQ file into several files, using barcodes as the split criteria. Barcodes in one file can be used to split multiple sorted files. Multiple sets of barcodes, each located in a different file, can be used.
734
735 --------
736
737 **How it works**
738
739 Given a number of allowed mismatches, all possible mismatching barcode combinations are pre-computed and stored in a hash lookup table. Each barcode column in the barcode file (--bcfile) adds another level to the hash table data structure. For each read group (e.g. forward, reverse, index1, and index2), the index sequence(s) are used to look up the sample they belong to. No pattern matching takes place - it's a simple hash table lookup where the keys being looked up are taken from the sequences in the index files. Barcode collisions are detected during the construction of the hash table, before any sequences are processed, and result in warnings and/or errors; reads that match collided barcodes end up in a "multimatched" file. (A barcode collision occurs when 2 barcodes can match each other once each is allowed up to the permitted number of mismatches.)
740
741 The length of the barcode sequences in the barcodes file must be less than or equal to the length of the sequences in the corresponding index files and all barcodes in 1 column must be the same length (though the lengths of the barcodes between columns may differ).
742
743 There can only be 1 number of mismatches and it is applied per barcode. E.g. If the number of mismatches is set to 1, and there are 2 barcode columns, then two barcodes on the same row may each have 1 mismatch. There is no way (currently) to set a different number of mismatches for different barcode columns.
744
745 If there are 2 barcode columns, the output summary table can have multiple rows where a single sample could not be identified. Ignoring multimatched and error states for the moment, the following 4 rows are possible, but only those with counts greater than 0 will be included in the summary table:
746
747 unmatched unmatched unmatched 1
748 unmatched matched unmatched 2
749 unmatched unmatched matched 3
750 unmatched matched matched 4
751
752 The first column is the ID, which is 'unmatched' in all cases (except the error row). Here's what each row means in the above example:
753
754 1. For 1 read group, neither of the index sequences matched any barcodes in either barcode column.
755 2. For 2 read groups, a barcode in the first barcode column matched but none from the second were matched.
756 3. For 3 read groups, no barcodes in the first column matched but a barcode in the second barcode column did match.
757 4. For 4 read groups, a barcode from each column matched, but they were not in the same row.
758
759 If you encounter large counts in case 4, then barcodes are likely not paired correctly in the barcodes file.
760
761 Two other states can also be reported: multimatched & error. A read group with 'multimatch' in one or more columns is one where, with the allowed number of mismatches, the affected index read can match multiple barcodes in the corresponding column. A multimatch will only be reported if the number of mismatches in the 2 matched barcodes is the same. If they are different, barcode_splitter will assign the read group to the better match. If you have any multimatch barcodes or barcode collision warnings, then the barcode design should be improved. The number of differences between any pair of barcodes in a single column should be greater than double the number of allowed mismatches, or else you may end up with numerous multimatch scenarios. A match in another barcode column will not resolve a multimatch in a different column.
762
763 **Barcode file format**
764
765 Barcode files are simple text files.
766 Each line should contain an identifier (descriptive name for the barcode), and at least 1 barcode, separated by TAB characters. Multiple columns of barcodes are supported (each corresponding to a separate barcoded read file), though there's usually just 1. For example, when multiple sets of barcodes are used, the first set can denote the user and the second set can denote each user's sample barcodes.
767 Example::
768
769 #This line is a comment (starts with a 'number' sign)
770 BC1 GATCT TTGCAT
771 BC2 ATCGT GCGCAT
772 BC3 GTGAT AGGTCA
773 BC4 TGTCT CTTTGG
774
775 For each barcode, a new FASTQ file will be created (with the barcodes' identifier as part of the file name).
776 Sequences matching the barcodes in a row will be stored in the appropriate file.
777
778 The first sequence file submitted must contain sequences with the barcodes in the first column of the barcode file. The second sequence file must contain sequences with the barcodes in the second column, and so on. The Number of Index Files supplied must match the number of actual columns in the barcode file and the order in which they are supplied must match the order of the barcode columns as well.
779
780 As many as 2 additional FASTQ output files will be created for each read/index file: the 'unmatched' file and the 'multimatched' file, where sequences not matching any barcode or matching more than 1 barcode (when mismatches are taken into account) will be stored.
781
782 The output of this tool is a summary table displaying the split counts for each barcode identifier and the percentage of the total reads those represent.
783 In addition, each FASTQ file produced will be loaded into the Galaxy history as part of a collection list.
784 ]]>
785 </help>
786 <citations>
787 <citation type="bibtex">
788 @misc{paired_sequence_utils,
789 title = {{Barcode}-{Splitter}},
790 url = {https://bitbucket.org/princeton_genomics/barcode_splitter/},
791 author = "Parsons, Lance and Leach, Robert"
792 }
793 </citation>
794 </citations>
795 </tool>