0
|
1 <tool id="cshl_princeton_fastx_barcode_splitter" version="0.4" name="Barcode Splitter">
|
|
2 <description></description>
|
|
<command interpreter="bash" detect_errors="aggressive"><![CDATA[
    ## Invoke the wrapper script in "split" mode.
    barcode_splitter_galaxy_wrapper.sh split

    ## Emit the datatype extension of the first read file only; the loop
    ## stops after its first iteration.
    #for $sf in $seqfiles
        ${sf.input.extension}
        #break
    #end for

    ## Dataset paths are single-quoted so that unusual characters in a path
    ## cannot be split or interpreted by the shell.
    --bcfile '$bcfile' --mismatches $mismatches --galaxy $zip $barcodes_at_end

    ## Every read file, in the order supplied by the user.
    #for $sf in $seqfiles
        '${sf.input}'
    #end for

    ## Pass the 1-based indices 1..num_barcode_columns to --idxread.
    --idxread
    #set $bound = $num_barcode_columns.value + 1
    #for $n in range( 1, $bound )
        ${n}
    #end for

    ## Standard output of the script becomes the summary dataset.
    > '$summary'
]]>
</command>
|
|
21
|
|
<inputs>
    <!-- Barcode table: sample ID in the first column, barcodes in the following columns. -->
    <param format="txt" name="bcfile" type="data" label="Barcode File" help="Tab-delimited text file where the first column is a sample ID and subsequent columns are barcodes." />

    <!-- The command emits one index number per barcode column after the idxread option,
         so at least one column is required; min="1" rejects zero and negative values. -->
    <param name="num_barcode_columns" type="integer" size="2" value="1" min="1" label="Number of barcode columns" help="The number of columns in the barcode file containing barcode sequences. Note that you must submit at least this many read files." />

    <!-- One or more read files; barcoded files come first, in barcode-column order. -->
    <repeat name="seqfiles" title="Read Files" min="1" default="2">
        <param format="fasta,fastq,fastqsanger,fastqsolexa,fastqillumina" name="input" type="data" label="Library to split" help="Barcoded reads files must be first. If there are multiple barcode columns in the barcode file, the files must be supplied in the same order as the barcode columns (from left to right)." />
    </repeat>

    <!-- A mismatch count cannot be negative; min="0" enforces that in the form. -->
    <param name="mismatches" type="integer" size="3" value="0" min="0" label="Number of allowed mismatches" />

    <!-- Boolean flags expand to the given command-line switch when checked, and to nothing otherwise. -->
    <param name="barcodes_at_end" type="boolean" truevalue="--barcodes_at_end" falsevalue="" checked="false"
           label="Barcodes are at the end of all sequences" help="Default is the beginning of all sequences" />

    <param name="zip" type="boolean" truevalue="--gzip" falsevalue="" checked="false"
           label="Compress/zip the output" help="This generates reads files with a .gz extension. Default is based on the file extension of the first input file." />
</inputs>
|
|
40
|
|
<outputs>
    <!-- Summary table; the command redirects its standard output into this dataset. -->
    <data name="summary"
          format="tabular"
          label="${tool.name} on ${on_string}: Summary" />

    <!-- List collection populated from the files found in the job's "split" directory. -->
    <collection name="split_output"
                type="list"
                format_source="input"
                label="${tool.name} on ${on_string}">
        <discover_datasets directory="split"
                           pattern="__designation_and_ext__"
                           label="${designation}"
                           visible="false" />
    </collection>
</outputs>
|
|
47
|
|
<tests>
    <test>
        <!-- Split a FASTQ file -->
        <param name="bcfile" value="barcode_splitter1.txt" />
        <param name="num_barcode_columns" value="1" />
        <repeat name="seqfiles">
            <param name="input" value="barcode_splitter1.fastq" ftype="fastqsolexa" />
        </repeat>
        <param name="barcodes_at_end" value="" />
        <param name="mismatches" value="2" />
        <output name="summary" file="barcode_splitter1.out" />
        <!-- The name must match the collection declared in <outputs> (split_output). -->
        <collection name="split_output" type="list">
            <discovered_dataset designation="BC1" ftype="fastqsolexa" file="barcode_splitter1_BC1.out" />
            <discovered_dataset designation="BC2" ftype="fastqsolexa" file="barcode_splitter1_BC2.out" />
            <discovered_dataset designation="BC3" ftype="fastqsolexa" file="barcode_splitter1_BC3.out" />
            <discovered_dataset designation="BC4" ftype="fastqsolexa" file="barcode_splitter1_BC4.out" />
            <discovered_dataset designation="unmatched" ftype="fastqsolexa" file="barcode_splitter1_unmatched.out" />
        </collection>
    </test>

    <test>
        <!-- Split a FASTQ file, using separate index read -->
        <param name="bcfile" value="barcode_splitter1.txt" />
        <param name="num_barcode_columns" value="1" />
        <repeat name="seqfiles">
            <param name="input" value="barcode_splitter_index.fastq" ftype="fastqsolexa" />
        </repeat>
        <repeat name="seqfiles">
            <param name="input" value="barcode_splitter1.fastq" ftype="fastqsolexa" />
        </repeat>
        <param name="barcodes_at_end" value="" />
        <param name="mismatches" value="2" />
        <!-- The name must match the data output declared in <outputs> (summary). -->
        <output name="summary" file="barcode_splitter1.out" />
        <collection name="split_output" type="list">
            <discovered_dataset designation="BC1" ftype="fastqsolexa" file="barcode_splitter1_BC1.out" />
            <discovered_dataset designation="BC2" ftype="fastqsolexa" file="barcode_splitter1_BC2.out" />
            <discovered_dataset designation="BC3" ftype="fastqsolexa" file="barcode_splitter1_BC3.out" />
            <discovered_dataset designation="BC4" ftype="fastqsolexa" file="barcode_splitter1_BC4.out" />
            <discovered_dataset designation="unmatched" ftype="fastqsolexa" file="barcode_splitter1_unmatched.out" />
        </collection>
    </test>
</tests>
|
|
90
|
|
91 <help><![CDATA[
|
|
92 **What it does**
|
|
93
|
|
94 This tool splits a FASTQ file into several files, using barcodes as the split criteria. Barcodes in one file can be used to split multiple sorted files. Multiple sets of barcodes, each located in a different file, can be used.
|
|
95
|
|
96 --------
|
|
97
|
|
98 **Barcode file Format**
|
|
99
|
|
100 Barcode files are simple text files.
|
|
Each line should contain an identifier (a descriptive name for the barcode) and at least 1 barcode, separated by TAB characters. Multiple columns of barcodes are supported (each corresponding to a separate barcoded read file), though there's usually just 1. For example, with two sets of barcodes, the first set could denote the user and the second set could denote each user's samples.
|
|
102 Example::
|
|
103
|
|
104 #This line is a comment (starts with a 'number' sign)
|
|
105 BC1 GATCT TTGCAT
|
|
106 BC2 ATCGT GCGCAT
|
|
107 BC3 GTGAT AGGTCA
|
|
108 BC4 TGTCT CTTTGG
|
|
109
|
|
For each barcode, a new FASTQ file will be created (with the barcode's identifier as part of the file name).
|
|
111 Sequences matching the barcodes in a row will be stored in the appropriate file.
|
|
112
|
|
113 The first sequence file submitted must contain sequences with the barcodes in the first column of the barcode file. The second sequence file must contain sequences with the barcodes in the second column, and so on. The 'Number of barcode columns' specified must match the number of actual columns in the barcode file.
|
|
114
|
|
115 One (possibly two) additional FASTQ files will be created: the 'unmatched' file (and the 'multimatched' file), where sequences not matching any barcode (or matching more than 1 barcode when mismatches are taken into account) will be stored.
|
|
116
|
|
117 The output of this tool is a summary table displaying the split counts for each barcode identifier and the percentage of the total reads those represent.
|
|
In addition, each FASTQ file produced will be loaded into the Galaxy history as part of a collection list.
|
|
119 ]]>
|
|
120 </help>
|
|
121
|
|
<!-- Barcode-Splitter is part of the paired_sequence_utils package, by L.Parsons (lparsons@princeton.edu) and R.Leach (rleach@princeton.edu) -->
<citations>
    <!-- Single BibTeX entry pointing at the upstream repository. -->
    <citation type="bibtex">
        @misc{paired_sequence_utils,
            title = {{Barcode}-{Splitter}},
            url = {https://bitbucket.org/hepcat72/paired_sequence_utils},
            author = "Parsons, Lance and Leach, Robert"
        }
    </citation>
</citations>
|
|
132
|
|
133 </tool>
|