0
|
1 <tool id="je_demultiplex" name="Je-Demultiplex" version="1.0">
|
|
2 <description>demultiplexes fastq files</description>
|
|
3 <macros>
|
|
4 <import>macros.xml</import>
|
|
5 </macros>
|
|
<!-- Exit-code handling: any exit code matching the range "1:" (1 and above) is reported as a fatal "Tool exception". -->
6 <stdio>
|
|
7 <exit_code range="1:" level="fatal" description="Tool exception" />
|
|
8 </stdio>
|
|
9 <version_command>echo '1.0'</version_command>
|
|
<!--
  Command template for 'je demultiplex'. The @...@ tokens are placeholders defined in the
  imported macros.xml (not visible in this file); they contribute the input, barcode,
  common and output arguments. The paired-end token is emitted only when $library.type
  is not "single". C=$CLIP_BARCODE passes the clip-barcode input value to the tool.
  NOTE(review): interpreter="bash" is a legacy attribute; confirm whether 'je' is meant
  to be resolved from the tool directory or from PATH before changing it.
-->
10 <command interpreter="bash">
|
|
11 <![CDATA[
|
|
12 je demultiplex
|
|
13
|
|
14 ## Fastq inputs
|
|
15 @single_or_paired_cmd@
|
|
16 #if str( $library.type ) != "single":
|
|
17 @demultiplex_paired_end_cmd_options@
|
|
18 #end if
|
|
19
|
|
20 @barcode_option_cmd@
|
|
21 @barcode_len_cmd@
|
|
22 C=$CLIP_BARCODE
|
|
23
|
|
24 @demultiplexer_common_options_cmd@
|
|
25 @common_options_cmd@
|
|
26
|
|
27 @demultiplexer_common_output_options_cmd@
|
|
28 @demultiplexer_common_outputs_cmd@
|
|
29
|
|
30 ]]>
|
|
31 </command>
|
|
<!-- Config files: the single entry is expanded from the 'barcode_config_file' macro (defined in macros.xml, not visible here). -->
32 <configfiles>
|
|
33 <expand macro="barcode_config_file"></expand>
|
|
34 </configfiles>
|
|
<!--
  Tool parameters. Every parameter is expanded from a macro imported from macros.xml
  (definitions not visible here). The paired-end options macro is nested inside the
  'single_or_paired_general' expansion; the remaining macros are expanded at top level.
-->
35 <inputs>
|
|
36 <!-- single/paired - similar to macro 'single_or_paired_general' -->
|
|
37 <expand macro="single_or_paired_general">
|
|
38 <expand macro="demultiplex_paired_end_options"/>
|
|
39 </expand>
|
|
40
|
|
41 <expand macro="barcode_option"/>
|
|
42 <expand macro="barcode_len_option"/>
|
|
43 <expand macro="clip_barcode"/>
|
|
44
|
|
45 <expand macro="demultiplexer_common_options"/>
|
|
46
|
|
47 <expand macro="common_options"/>
|
|
48
|
|
49 <expand macro="demultiplexer_common_output_options"/>
|
|
50
|
|
51 </inputs>
|
|
<!-- Outputs: all output declarations come from the 'demultiplexer_common_outputs' macro (defined in macros.xml, not visible here). -->
52 <outputs>
|
|
53 <expand macro="demultiplexer_common_outputs"/>
|
|
54 </outputs>
|
|
55
|
|
<!--
  Functional tests. Test 1 is a single-end run driven by a barcode file. Test 2 is a
  paired-end run with BPOS=BOTH, BM=BOTH and BRED=false, using inline barcode text in
  which each sample has a read_1:read_2 barcode pair. Both tests compare the metrics
  file and the listed discovered datasets against reference files.
  NOTE(review): lines_diff="4" presumably tolerates up to four differing lines in the
  metrics file; confirm against the Galaxy test schema.
-->
56 <tests>
|
|
57 <test>
|
|
58 <!-- simple test on single end data -->
|
|
59 <param name="type" value="single"/>
|
|
60 <param name="input_1" value="file_1_sequence.txt" ftype="fastqsanger"/>
|
|
61 <param name="BARCODE_FILE" value="barcodes_SE.txt" ftype="tabular"/>
|
|
62 <output name="METRICS_FILE_NAME" file="summary_SE.txt" ftype="tabular" lines_diff="4">
|
|
63 <discovered_dataset designation="unassigned_1" file="unassigned_1_SE.txt" />
|
|
64 </output>
|
|
65 </test>
|
|
66 <test>
|
|
67 <!-- more complex test on paired end data with different barcode for fwd/rev -->
|
|
68 <param name="type" value="paired"/>
|
|
69 <param name="input_1" value="file_1_sequence.txt" ftype="fastqsanger"/>
|
|
70 <param name="input_2" value="file_2_sequence.txt" ftype="fastqsanger"/>
|
|
71
|
|
72 <param name="BPOS" value="BOTH"/>
|
|
73 <param name="BM" value="BOTH"/>
|
|
74 <param name="BRED" value="false"/>
|
|
75
|
|
76 <param name="barcode_list_type_con" value="text"/>
|
|
77 <param name="barcode_text"
|
|
78 value="sample1 CACTGT:GTATAG sample2 ATTCCG:TCCGTC sample3 GCTACC:TGGTCA sample4 CGAAAC:CACTGT"/>
|
|
79 <output name="METRICS_FILE_NAME" file="summary_PE.txt" ftype="tabular" lines_diff="4">
|
|
80 <discovered_dataset designation="unassigned_1" file="unassigned_1_PE.txt" />
|
|
81 <discovered_dataset designation="unassigned_2" file="unassigned_2_PE.txt" />
|
|
82 <discovered_dataset designation="sample4_CGAAACCACTGT_2" file="sample4_CGAAACCACTGT_2.txt"/>
|
|
83 <discovered_dataset designation="sample4_CGAAACCACTGT_1" file="sample4_CGAAACCACTGT_1.txt"/>
|
|
84 <discovered_dataset designation="sample3_GCTACCTGGTCA_2" file="sample3_GCTACCTGGTCA_2.txt"/>
|
|
85 <discovered_dataset designation="sample3_GCTACCTGGTCA_1" file="sample3_GCTACCTGGTCA_1.txt"/>
|
|
86 <discovered_dataset designation="sample2_ATTCCGTCCGTC_2" file="sample2_ATTCCGTCCGTC_2.txt"/>
|
|
87 <discovered_dataset designation="sample2_ATTCCGTCCGTC_1" file="sample2_ATTCCGTCCGTC_1.txt"/>
|
|
88 <discovered_dataset designation="sample1_CACTGTGTATAG_2" file="sample1_CACTGTGTATAG_2.txt"/>
|
|
89 <discovered_dataset designation="sample1_CACTGTGTATAG_1" file="sample1_CACTGTGTATAG_1.txt"/>
|
|
90 </output>
|
|
91 </test>
|
|
92 </tests>
|
|
93
|
|
94
|
|
95 <help>
|
|
96 <![CDATA[
|
|
97 **What it does**
|
|
98
|
|
99 Je demultiplex: A fastq file demultiplexer with optional handling of Unique Molecular Identifiers for further use
|
|
100 in 'markdupes' module.
|
|
101 Input files are fastq files, and can be in gzip compressed format.
|
|
102
|
|
103 Author: Charles Girardot (charles.girardot@embl.de).
|
|
104
|
|
105 Wrapper by: Jelle Scholtalbers (jelle.scholtalbers@embl.de).
|
|
106
|
|
107 ------
|
|
108
|
|
109 **Know what you are doing**
|
|
110
|
|
111 .. class:: warningmark
|
|
112
|
|
113 You will want to read the `documentation`__.
|
|
114
|
|
115 .. __: http://gbcs.embl.de/portal/Je
|
|
116
|
|
117 ------
|
|
118
|
|
119 **Parameter list**
|
|
120
|
|
121 This is an exhaustive list of options::
|
|
122
|
|
123 FASTQ_FILE1=File
|
|
124 F1=File
|
|
125
|
|
126 Input fastq file (optionally gzipped) for single end data, or first read in paired end
|
|
127 data.
|
|
128
|
|
129 Required.
|
|
130
|
|
131 FASTQ_FILE2=File
|
|
132 F2=File
|
|
133
|
|
134 Input fastq file (optionally gzipped) for the second read of paired end data.
|
|
135
|
|
136 Default value: null.
|
|
137
|
|
138 BARCODE_FILE=File
|
|
139 BF=File
|
|
140
|
|
141 Barcode file describing sequence list and sample names. Tab-delimited file with 2
|
|
142 columns, with the sample in col1 and the corresponding barcode in col2.
|
|
143 Simple barcode file format : 2 tab-delimited columns
|
|
144 If multiple barcodes map to the same sample, either line can be duplicated e.g.
|
|
145 sample1 ATAT
|
|
146 sample1 GAGG
|
|
147 sample2 CCAA
|
|
148 sample2 TGTG
|
|
149 Or barcodes can be combined using the OR operator '|' i.e. the file above can be
|
|
150 re-written like
|
|
151 sample1 ATAT|GAGG
|
|
152 sample2 CCAA|TGTG
|
|
153 Finally, for the special situation of paired-end data in which barcodes differ at both
|
|
154 ends (ie BPOS=BOTH BRED=false BM=BOTH , see BRED option description), barcodes for read_1
|
|
155 and read_2 can be distinguished using a ':' separator i.e.
|
|
156 sample1 ATAT:GAGG
|
|
157 sample2 CCAA:TGTG
|
|
158 This above syntax means that sample 1 is encoded with ATAT barcode at read_1 AND GAGG
|
|
159 barcode at read_2. Note that you can still combine barcodes using | e.g.
|
|
160 sample1 ATAT|GAGG:CCAA|TGTG
|
|
161 would mean that sample 1 is mapped by the combination of barcode: ATAT OR GAGG at read_1
|
|
162 AND CCAA OR TGTG at read_2.
|
|
163 Extended barcode file format : 3 (single-end) or 4 (paired-end) tab-delimited columns
|
|
164 same as the simple barcode file format but the extra columns contains the file name(s)
|
|
165 to use to name output files. A unique extra column is expected for single-end while 2
|
|
166 extra columns are expected for paired-end. In case, lines are duplicated (multiple
|
|
167 barcodes mapping the same sample), the same file name should be indicated in the third
|
|
168 (and fourth) column(s).
|
|
169 sample1 ATAT spl1_1.txt.gz spl1_2.txt.gz
|
|
170 sample1 GAGG spl1_1.txt.gz spl1_2.txt.gz
|
|
171 sample2 CCAA spl2_1.txt.gz spl2_2.txt.gz
|
|
172 Or
|
|
173 sample1 ATAT|GAGG:CCAA|TGTG spl1_1.txt.gz spl1_2.txt.gz
|
|
174 Ns in barcode sequence are allowed and are used to flag positions that should be ignored
|
|
175 in sample matching
|
|
176 i.e. they will be clipped off the read sequence (like in iCLIP protocol).
|
|
177
|
|
178 Required.
|
|
179
|
|
180 BARCODE_READ_POS=BarcodePosition
|
|
181 BPOS=BarcodePosition
|
|
182
|
|
183 For paired-end data, where to expect the barcode(s) :
|
|
184 READ_1 (beginning of read from FASTQ_FILE1),
|
|
185 READ_2 (beginning of read from FASTQ_FILE2),
|
|
186 BOTH (beginning of both reads).
|
|
187 Automatically set to READ_1 in single end mode.
|
|
188
|
|
189 Default value: BOTH. This option can be set to 'null' to clear the default value.
|
|
190 Possible values: {READ_1, READ_2, BOTH, NONE}
|
|
191
|
|
192 BCLEN=String
|
|
193 LEN=String
|
|
194
|
|
195 Length of the barcode sequences, optional. Taken from barcode file when not given.
|
|
196 In situations where BARCODE_READ_POS == BOTH AND REDUNDANT_BARCODES=false, two distinct
|
|
197 length can be provided using the syntax LEN=X:Z where X and Z are 2 integers representing
|
|
198 the barcode length for read_1 and read_2 respectively.
|
|
199
|
|
200 Default value: null.
|
|
201
|
|
202 BARCODE_FOR_SAMPLE_MATCHING=BarcodePosition
|
|
203 BM=BarcodePosition
|
|
204
|
|
205 Indicates which barcode(s) should be used for sample lookup
|
|
206 Automatically set to READ_1 in single end mode.
|
|
207 For paired-end data and when BARCODE_READ_POS == BOTH, which barcode should be used to
|
|
208 resolve sample:
|
|
209 use BM=READ_1 (beginning of read from FASTQ_FILE1) if only this read should be used
|
|
210 for sample matching:
|
|
211 use BM=READ_2 (beginning of read from FASTQ_FILE2) if only this read should be used
|
|
212 for sample matching:
|
|
213 use BM=BOTH (beginning of both reads) if both should be used.
|
|
214
|
|
215 When BM=BOTH, the behaviour is different based on the value of REDUNDANT_BARCODES :
|
|
216 If REDUNDANT_BARCODES=true, the two barcodes are considered to map to the same sample
|
|
217 and 'Je demultiplex' uses the two barcodes according to the STRICT value.
|
|
218 If REDUNDANT_BARCODES=false, the barcode file should map a couple of barcode to each
|
|
219 sample (e.g. sample1 => AGAGTG:TTGATA) and 'Je demultiplex' needs both barcodes to find
|
|
220 the relevant sample. Note that this is the only situation in which all barcode matching
|
|
221 options (MM, MMD, Q) accept different values for both barcodes in the form X:Z where X
|
|
222 and Z are 2 integers.
|
|
223
|
|
224 Default value: BOTH. This option can be set to 'null' to clear the default value.
|
|
225 Possible values: {READ_1, READ_2, BOTH, NONE}
|
|
226
|
|
227
|
|
228 REDUNDANT_BARCODES=Boolean
|
|
229 BRED=Boolean
|
|
230
|
|
231 This option only applies for paired-end data with BARCODE_READ_POS set to 'BOTH'
|
|
232 Indicates if both read's barcodes encode redundant information or if barcodes are
|
|
233 supposed to be identical at both ends (or to resolve to the same sample when a pool of
|
|
234 barcodes is used per sample).
|
|
235 When REDUNDANT_BARCODES=false, the 2 barcodes potentially encode
|
|
236 different information. For example, only one of the barcodes encodes the sample identity
|
|
237 while
|
|
238 the second barcode might be a random barcode (UMI) to tell apart PCR artefacts from real
|
|
239 duplicates.
|
|
240 Another example is when both barcodes should be used in a combined fashion to resolve the
|
|
241 sample.
|
|
242 In the first example, you should use BPOS=BOTH BRED=false BM=READ_1.
|
|
243 In the second example, you should have BPOS=BOTH BRED=false BM=BOTH.
|
|
244 Note that with BPOS=BOTH BRED=true BM=BOTH, the behavior would be different as
|
|
245 'demultiplex' would then check the STRICT option to perform sample resolution.
|
|
246 Importantly, when BARCODE_READ_POS (BPOS) == BOTH AND REDUNDANT_BARCODES=false, BCLEN,
|
|
247 barcode matching options (MM, MMD, Q) and read trimming/clipping options (XT, ZT) accept
|
|
248 different values for both barcodes in the form X:Z where X and Z are 2 integers.
|
|
249
|
|
250 Default value: true. This option can be set to 'null' to clear the default value.
|
|
251 Possible values: {true, false}
|
|
252
|
|
253 STRICT=Boolean
|
|
254 S=Boolean
|
|
255
|
|
256 For paired-end data and when two distinct barcodes/indices are used to encode samples,
|
|
257 this option tells if both barcodes should resolve to the same sample.
|
|
258 When true and if only one of the two reads has a barcode match, the read pair is
|
|
259 'unassigned'.
|
|
260 When false and if only one of the two reads has a barcode match, the read pair is
|
|
261 assigned to the
|
|
262 corresponding sample
|
|
263 When reads resolve to different samples, the read pair is always 'unassigned'.
|
|
264
|
|
265 Default value: false. This option can be set to 'null' to clear the default value.
|
|
266 Possible values: {true, false}
|
|
267
|
|
268 MAX_MISMATCHES=String
|
|
269 MM=String
|
|
270
|
|
271 Maximum mismatches for a barcode to be considered a match. In situations where both
|
|
272 barcodes are used for sample matching i.e. BPOS=BOTH BM=BOTH (or 2 INDEX_FILE given), two
|
|
273 distinct
|
|
274 values can be given here using the syntax MM=X:Z where X and Z are 2 integers to use for
|
|
275 read_1 and read_2 respectively.
|
|
276 MM=null is like MM=0
|
|
277
|
|
278 Default value: 1. This option can be set to 'null' to clear the default value.
|
|
279
|
|
280 MIN_MISMATCH_DELTA=String
|
|
281 MMD=String
|
|
282
|
|
283 Minimum difference between the number of mismatches against the best and the second best
|
|
284 barcode. When MMD is not respected, the read remains unassigned.
|
|
285 When two distinct barcodes are used for sample matching (dual encoding), two distinct
|
|
286 values can be given using the syntax MMD=X:Z where X and Z are 2 integers to use for
|
|
287 first (e.g. from read_1 or index_1) and second barcode (e.g. from read_2 or index_2) respectively.
|
|
288 MMD=null is like MMD=0
|
|
289
|
|
290 Default value: 1. This option can be set to 'null' to clear the default value.
|
|
291
|
|
292 MIN_BASE_QUALITY=String
|
|
293 Q=String
|
|
294
|
|
295 Minimum base quality during barcode matching: bases whose quality is less than this
|
|
296 cutoff are always considered as a mismatch. When two distinct barcodes are used for sample
|
|
297 matching (dual encoding), two distinct values can be given using the syntax Q=X:Z where X
|
|
298 and Z are 2 integers to use for first (e.g. from read_1 or index_1) and second barcode
|
|
299 (e.g. from read_2 or index_2) respectively.
|
|
300 Q=null is like Q=0.
|
|
301
|
|
302 Default value: 10. This option can be set to 'null' to clear the default value.
|
|
303
|
|
304 XTRIMLEN=String
|
|
305 XT=String
|
|
306
|
|
307 Optional extra number of bases to be trimmed right after the barcode (only used if
|
|
308 CLIP_BARCODE=true).
|
|
309 When running paired-end, two distinct values can be given using the syntax XT=X:Z where X
|
|
310 and Z are 2 integers to use for read_1 and read_2 respectively. Note that even when
|
|
311 BPOS=READ_1 or BPOS=READ_2, an X:Z syntax can be given to trim the read w/o barcode as to
|
|
312 end up with reads of the same length (note that this can also be operated using ZT). If a
|
|
313 unique value is given, e.g. XT=1, while running paired-end the following rule applies:
|
|
314 (1) BPOS=READ_1 or BPOS=READ_2, no trim is applied at the read w/o barcode
|
|
315 (2) BPOS=BOTH, the value is used for both reads.
|
|
316
|
|
317 Note that XT=null is like XT=0.
|
|
318 Default value: 0. This option can be set to 'null' to clear the default value.
|
|
319
|
|
320 ZTRIMLEN=String
|
|
321 ZT=String
|
|
322
|
|
323 Optional extra number of bases to be trimmed from the read end i.e. 3' end.
|
|
324 When running paired-end, two distinct values can be given here using the syntax ZT=X:Z
|
|
325 where X and Z are 2 integers to use for read_1 and read_2 respectively. Note that even
|
|
326 when BPOS=READ_1 or BPOS=READ_2, an X:Z syntax can be given to trim the read w/o barcode
|
|
327 as to end up with reads of the same length (note that this can also be operated using
|
|
328 XT). Note that if a single value is passed, the value always applies to both reads in
|
|
329 paired-end mode without further consideration.
|
|
330 ZT=null is like ZT=0.
|
|
331
|
|
332 Default value: 0. This option can be set to 'null' to clear the default value.
|
|
333
|
|
334 CLIP_BARCODE=Boolean
|
|
335 C=Boolean
|
|
336
|
|
337 Clip barcode sequence from read sequence, as well as XTRIMLEN (and ZTRIMLEN) bases if
|
|
338 applicable, before writing to output file.
|
|
339 If false, reads are written without modification to output file.
|
|
340 Applies to both barcodes when BPOS=BOTH.
|
|
341
|
|
342 Default value: true. This option can be set to 'null' to clear the default value.
|
|
343 Possible values: {true, false}
|
|
344
|
|
345 ADD_BARCODE_TO_HEADER=Boolean
|
|
346 ADD=Boolean
|
|
347
|
|
348 Add barcode at the end of the read header. Applies to both barcodes when BPOS=BOTH.
|
|
349 If true, the string ':barcode' is added at the end of the read header with a ':' added
|
|
350 only if current read header does not end with ':'.
|
|
351 If both reads of the pair have a barcode (i.e. BARCODE_READ_POS == BOTH), then the second
|
|
352 read also has its own matched barcode written. Else, the read without a barcode receives
|
|
353 the barcode from the barcoded read.
|
|
354 For example:
|
|
355 @D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965 2:N:0:
|
|
356 becomes:
|
|
357 @D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965 2:N:0:BARCODE
|
|
358
|
|
359 When barcodes contain random positions, i.e. 'N' (for example like in the iCLIP
|
|
360 protocol) or are UMIs, the added sequence is the sequence clipped from the read and NOT
|
|
361 the matched barcode.
|
|
362
|
|
363 Default value: true. This option can be set to 'null' to clear the default value.
|
|
364 Possible values: {true, false}
|
|
365
|
|
366
|
|
367 ENSURE_IDENTICAL_HEADER_NAMES=Boolean
|
|
368 SAME_HEADERS=Boolean
|
|
369
|
|
370 Makes sure that headers of both reads of a pair are identical, using the following read
|
|
371 header pattern (for both reads of a pair):
|
|
372 @D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965 SAMPLEBARCODE_READ1:SAMPLEBARCODE_READ2(:CLIPPED_SEQ_FROMREAD1:CLIPPED_SEQ_FROMREAD2)
|
|
373 This option only makes sense in
|
|
374 paired end mode and ADD=true. Some (if not all) mappers will indeed complain when the
|
|
375 read headers are not identical. When molecular barcodes are present in reads (either as
|
|
376 additional barcodes or as degenerate barcodes ie with 'N') and the RCHAR is used, you
|
|
377 will end up with (problematic) read headers like this:
|
|
378 HISEQ:44:C6KC0ANXX:5:1101:1491:1994:1:N:0:TAGAACAC:TGGAGTAG
|
|
379 HISEQ:44:C6KC0ANXX:5:1101:1491:1994:3:N:0:TAGAACAC:CGTTGTAT
|
|
380 SAME_HEADERS=true will instead generate the following identical header for both reads:
|
|
381 HISEQ:44:C6KC0ANXX:5:1101:1491:1994:TAGAACAC:TGGAGTAG:CGTTGTAT
|
|
382 Note that we also clipped the useless '1:N:0' and '3:N:0' as they will also result in
|
|
383 generating different headers.
|
|
384 Important: this option will force RCHAR=: UNLESS you specify RCHAR=null ; in which
|
|
385 case a space will be preserved ie:
|
|
386 HISEQ:44:C6KC0ANXX:5:1101:1491:1994 TAGAACAC:TGGAGTAG:CGTTGTAT
|
|
387
|
|
388 Default value: true. This option can be set to 'null' to clear the default value.
|
|
389 Possible values: {true, false}
|
|
390
|
|
391
|
|
392 READ_NAME_REPLACE_CHAR=String
|
|
393 RCHAR=String
|
|
394
|
|
395 Replace spaces in read name/header using provided character. This is particularly handy
|
|
396 when you need to retain ADDed barcode in read name/header during mapping (everything
|
|
397 after space in read name is usually clipped in BAM files). For example, with RCHAR=':':
|
|
398 @D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965 2:N:0:
|
|
399 becomes
|
|
400 @D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965:2:N:0:BARCODE
|
|
401 Default value: null.
|
|
402
|
|
403 QUALITY_FORMAT=FastqQualityFormat
|
|
404 V=FastqQualityFormat
|
|
405
|
|
406 A value describing how the quality values are encoded in the fastq. Either 'Solexa' for
|
|
407 pre-pipeline 1.3 style scores (solexa scaling + 66), 'Illumina' for pipeline 1.3 and
|
|
408 above (phred scaling + 64) or 'Standard' for phred scaled scores with a character shift
|
|
409 of 33. If this value is not specified (or 'null' is given), the quality format will be
|
|
410 detected.
|
|
411
|
|
412 Default value: Standard. This option can be set to 'null' to clear the default value.
|
|
413 Possible values: {Solexa, Illumina, Standard}
|
|
414
|
|
415 KEEP_UNASSIGNED_READ=Boolean
|
|
416 UN=Boolean
|
|
417
|
|
418 Should un-assigned reads be saved in files or simply ignored. File names are
|
|
419 automatically created or can be given using UF1 & UF2 options.
|
|
420
|
|
421 Default value: true. This option can be set to 'null' to clear the default value.
|
|
422 Possible values: {true, false}
|
|
423
|
|
424 BARCODE_DIAG_FILE=String
|
|
425 DIAG=String
|
|
426
|
|
427 Name for a barcode match reporting file (not generated by default). Either a name (in
|
|
428 which case the file will be created in the output dir) or full path. This file will
|
|
429 contain a line per read pair with the barcode best matching the read subsequence or
|
|
430 'null' when no match is found according to matching parameters ; and the final selected
|
|
431 sample. This file is useful for debugging or further processing in case both ends are
|
|
432 barcoded.
|
|
433 N.B: this file will have a size of about one of the fastq input files.
|
|
434
|
|
435 Default value: null.
|
|
436 ]]>
|
|
437 </help>
|
|
438
|
|
439 </tool>
|