comparison je-demultiplex.xml @ 0:424f44e2124e draft

Initial upload
author gbcs-embl-heidelberg
date Wed, 25 Nov 2015 12:37:28 -0500
parents
children 8930b411a9d7
comparison
equal deleted inserted replaced
-1:000000000000 0:424f44e2124e
1 <tool id="je_demultiplex" name="Je-Demultiplex" version="1.0">
<!-- One-line description of the tool. -->
2 <description>demultiplexes fastq files</description>
<!-- Shared macro definitions are imported from macros.xml; that file is not part of this listing. -->
3 <macros>
4 <import>macros.xml</import>
5 </macros>
<!-- Error detection: exit codes in the range "1:" are reported at level "fatal" with the description "Tool exception". -->
6 <stdio>
7 <exit_code range="1:" level="fatal" description="Tool exception" />
8 </stdio>
<!-- The version is reported by echoing a fixed string; it mirrors the version="1.0" attribute on the tool element and must be kept in sync by hand. -->
9 <version_command>echo '1.0'</version_command>
<!--
  Command template for 'je demultiplex'.
  The argument list is assembled from macro tokens (the placeholders delimited by
  at-signs); their definitions are not visible here and presumably live in the
  imported macros.xml (TODO confirm).
  The paired-end option token is only emitted when the library type is not "single".
  C= receives the value of the CLIP_BARCODE parameter (C is the short name of
  CLIP_BARCODE in the parameter list of the help section).
-->
10 <command interpreter="bash">
11 <![CDATA[
12 je demultiplex
13
14 ## Fastq inputs
15 @single_or_paired_cmd@
16 #if str( $library.type ) != "single":
17 @demultiplex_paired_end_cmd_options@
18 #end if
19
20 @barcode_option_cmd@
21 @barcode_len_cmd@
22 C=$CLIP_BARCODE
23
24 @demultiplexer_common_options_cmd@
25 @common_options_cmd@
26
27 @demultiplexer_common_output_options_cmd@
28 @demultiplexer_common_outputs_cmd@
29
30 ]]>
31 </command>
<!-- The config file definition(s) are supplied entirely by the 'barcode_config_file' macro; nothing is declared inline. -->
32 <configfiles>
33 <expand macro="barcode_config_file"></expand>
34 </configfiles>
<!-- All input parameters are produced by macro expansion; the macro definitions are not visible in this file. -->
35 <inputs>
36 <!-- single/paired library inputs: expands macro 'single_or_paired_general' with the 'demultiplex_paired_end_options' macro nested inside it -->
37 <expand macro="single_or_paired_general">
38 <expand macro="demultiplex_paired_end_options"/>
39 </expand>
40
<!-- barcode-related parameters (barcode option, barcode length, barcode clipping), judging by the macro names -->
41 <expand macro="barcode_option"/>
42 <expand macro="barcode_len_option"/>
43 <expand macro="clip_barcode"/>
44
<!-- options shared by the demultiplexer tools, judging by the macro name - TODO confirm in macros.xml -->
45 <expand macro="demultiplexer_common_options"/>
46
47 <expand macro="common_options"/>
48
49 <expand macro="demultiplexer_common_output_options"/>
50
51 </inputs>
<!-- Output datasets are declared by the 'demultiplexer_common_outputs' macro; the tests in this file refer to an output named METRICS_FILE_NAME and to discovered datasets. -->
52 <outputs>
53 <expand macro="demultiplexer_common_outputs"/>
54 </outputs>
55
56 <tests>
57 <test>
58 <!-- simple test on single end data: barcodes are read from a tabular file; the metrics output is compared with summary_SE.txt (lines_diff set to 4) and the discovered 'unassigned_1' dataset with unassigned_1_SE.txt -->
59 <param name="type" value="single"/>
60 <param name="input_1" value="file_1_sequence.txt" ftype="fastqsanger"/>
61 <param name="BARCODE_FILE" value="barcodes_SE.txt" ftype="tabular"/>
62 <output name="METRICS_FILE_NAME" file="summary_SE.txt" ftype="tabular" lines_diff="4">
63 <discovered_dataset designation="unassigned_1" file="unassigned_1_SE.txt" />
64 </output>
65 </test>
66 <test>
67 <!-- more complex test on paired end data with different barcode for fwd/rev: BPOS=BOTH, BM=BOTH and BRED=false; barcodes are passed as text, one sample per line (lines separated by newline character references), each with a read_1:read_2 barcode pair -->
68 <param name="type" value="paired"/>
69 <param name="input_1" value="file_1_sequence.txt" ftype="fastqsanger"/>
70 <param name="input_2" value="file_2_sequence.txt" ftype="fastqsanger"/>
71
72 <param name="BPOS" value="BOTH"/>
73 <param name="BM" value="BOTH"/>
74 <param name="BRED" value="false"/>
75
76 <param name="barcode_list_type_con" value="text"/>
77 <param name="barcode_text"
78 value="sample1 CACTGT:GTATAG&#10;sample2 ATTCCG:TCCGTC&#10;sample3 GCTACC:TGGTCA&#10;sample4 CGAAAC:CACTGT"/>
<!-- expected discovered datasets: unassigned reads for both mates, plus one dataset per sample and mate; each designation is the sample name, the two concatenated barcodes, and the read number -->
79 <output name="METRICS_FILE_NAME" file="summary_PE.txt" ftype="tabular" lines_diff="4">
80 <discovered_dataset designation="unassigned_1" file="unassigned_1_PE.txt" />
81 <discovered_dataset designation="unassigned_2" file="unassigned_2_PE.txt" />
82 <discovered_dataset designation="sample4_CGAAACCACTGT_2" file="sample4_CGAAACCACTGT_2.txt"/>
83 <discovered_dataset designation="sample4_CGAAACCACTGT_1" file="sample4_CGAAACCACTGT_1.txt"/>
84 <discovered_dataset designation="sample3_GCTACCTGGTCA_2" file="sample3_GCTACCTGGTCA_2.txt"/>
85 <discovered_dataset designation="sample3_GCTACCTGGTCA_1" file="sample3_GCTACCTGGTCA_1.txt"/>
86 <discovered_dataset designation="sample2_ATTCCGTCCGTC_2" file="sample2_ATTCCGTCCGTC_2.txt"/>
87 <discovered_dataset designation="sample2_ATTCCGTCCGTC_1" file="sample2_ATTCCGTCCGTC_1.txt"/>
88 <discovered_dataset designation="sample1_CACTGTGTATAG_2" file="sample1_CACTGTGTATAG_2.txt"/>
89 <discovered_dataset designation="sample1_CACTGTGTATAG_1" file="sample1_CACTGTGTATAG_1.txt"/>
90 </output>
91 </test>
92 </tests>
93
94
95 <help>
96 <![CDATA[
97 **What it does**
98
99 Je demultiplex: A fastq file demultiplexer with optional handling of Unique Molecular Identifiers for further use
100 in 'markdupes' module.
101 Input files are fastq files, and can be in gzip compressed format.
102
103 Author: Charles Girardot (charles.girardot@embl.de).
104
105 Wrapper by: Jelle Scholtalbers (jelle.scholtalbers@embl.de).
106
107 ------
108
109 **Know what you are doing**
110
111 .. class:: warningmark
112
113 You will want to read the `documentation`__.
114
115 .. __: http://gbcs.embl.de/portal/Je
116
117 ------
118
119 **Parameter list**
120
121 This is an exhaustive list of options::
122
123 FASTQ_FILE1=File
124 F1=File
125
126 Input fastq file (optionally gzipped) for single end data, or first read in paired end
127 data.
128
129 Required.
130
131 FASTQ_FILE2=File
132 F2=File
133
134 Input fastq file (optionally gzipped) for the second read of paired end data.
135
136 Default value: null.
137
138 BARCODE_FILE=File
139 BF=File
140
141 Barcode file describing sequence list and sample names. Tab-delimited file with 2
142 columns, with the sample in col1 and the corresponding barcode in col2.
143 Simple barcode file format : 2 tab-delimited columns
144 If multiple barcodes map to the same sample, either lines can be duplicated e.g.
145 sample1 ATAT
146 sample1 GAGG
147 sample2 CCAA
148 sample2 TGTG
149 Or barcodes can be combined using the OR operator '|' i.e. the file above can be
150 re-written like
151 sample1 ATAT|GAGG
152 sample2 CCAA|TGTG
153 Finally, for the special situation of paired-end data in which barcodes differ at both
154 ends (ie BPOS=BOTH BRED=false BM=BOTH , see BRED option description), barcodes for read_1
155 and read_2 can be distinguished using a ':' separator i.e.
156 sample1 ATAT:GAGG
157 sample2 CCAA:TGTG
158 This above syntax means that sample 1 is encoded with ATAT barcode at read_1 AND GAGG
159 barcode at read_2. Note that you can still combine barcodes using | e.g.
160 sample1 ATAT|GAGG:CCAA|TGTG
161 would mean that sample 1 is mapped by the combination of barcode: ATAT OR GAGG at read_1
162 AND CCAA OR TGTG at read_2.
163 Extended barcode file format : 3 (single-end) or 4 (paired-end) tab-delimited columns
164 same as the simple barcode file format but the extra columns contain the file name(s)
165 to use to name output files. A unique extra column is expected for single-end while 2
166 extra columns are expected for paired-end. In case lines are duplicated (multiple
167 barcodes mapping to the same sample), the same file name should be indicated in the third
168 (and fourth) column(s).
169 sample1 ATAT spl1_1.txt.gz spl1_2.txt.gz
170 sample1 GAGG spl1_1.txt.gz spl1_2.txt.gz
171 sample2 CCAA spl2_1.txt.gz spl2_2.txt.gz
172 Or
173 sample1 ATAT|GAGG:CCAA|TGTG spl1_1.txt.gz spl1_2.txt.gz
174 Ns in barcode sequence are allowed and are used to flag positions that should be ignored
175 in sample matching
176 i.e. they will be clipped off the read sequence (like in iCLIP protocol).
177
178 Required.
179
180 BARCODE_READ_POS=BarcodePosition
181 BPOS=BarcodePosition
182
183 For paired-end data, where to expect the barcode(s) :
184 READ_1 (beginning of read from FASTQ_FILE1),
185 READ_2 (beginning of read from FASTQ_FILE2),
186 BOTH (beginning of both reads).
187 Automatically set to READ_1 in single end mode.
188
189 Default value: BOTH. This option can be set to 'null' to clear the default value.
190 Possible values: {READ_1, READ_2, BOTH, NONE}
191
192 BCLEN=String
193 LEN=String
194
195 Length of the barcode sequences, optional. Taken from barcode file when not given.
196 In situations where BARCODE_READ_POS == BOTH AND REDUNDANT_BARCODES=false, two distinct
197 length can be provided using the syntax LEN=X:Z where X and Z are 2 integers representing
198 the barcode length for read_1 and read_2 respectively.
199
200 Default value: null.
201
202 BARCODE_FOR_SAMPLE_MATCHING=BarcodePosition
203 BM=BarcodePosition
204
205 Indicates which barcode(s) should be used for sample lookup
206 Automatically set to READ_1 in single end mode.
207 For paired-end data and when BARCODE_READ_POS == BOTH, which barcode should be used to
208 resolve sample:
209 use BM=READ_1 (beginning of read from FASTQ_FILE1) if only this read should be used
210 for sample matching;
211 use BM=READ_2 (beginning of read from FASTQ_FILE2) if only this read should be used
212 for sample matching;
213 use BM=BOTH (beginning of both reads) if both should be used.
214
215 When BM=BOTH, the behaviour is different based on the value of REDUNDANT_BARCODES :
216 If REDUNDANT_BARCODES=true, the two barcodes are considered to map to the same sample
217 and 'Je demultiplex' uses the two barcodes according to the STRICT value.
218 If REDUNDANT_BARCODES=false, the barcode file should map a pair of barcodes to each
219 sample (e.g. sample1 => AGAGTG:TTGATA) and 'Je demultiplex' needs both barcodes to find
220 the relevant sample. Note that this is the only situation in which all barcode matching
221 options (MM, MMD, Q) accept different values for both barcodes in the form X:Z where X
222 and Z are 2 integers.
223
224 Default value: BOTH. This option can be set to 'null' to clear the default value.
225 Possible values: {READ_1, READ_2, BOTH, NONE}
226
227
228 REDUNDANT_BARCODES=Boolean
229 BRED=Boolean
230
231 This option only applies for paired-end data with BARCODE_READ_POS set to 'BOTH'
232 Indicates if both reads' barcodes encode redundant information, i.e. if barcodes are
233 supposed to be identical at both ends (or to resolve to the same sample when a pool of
234 barcodes is used per sample).
235 When REDUNDANT_BARCODES=false, the 2 barcodes potentially encode
236 different information. For example, only one of the barcodes encodes the sample identity
237 while
238 the second barcode might be a random barcode (UMI) to tell apart PCR artefacts from real
239 duplicates.
240 Another example is when both barcodes should be used in a combined fashion to resolve the
241 sample.
242 In the first example, you should use BPOS=BOTH BRED=false BM=READ_1.
243 In the second example, you should have BPOS=BOTH BRED=false BM=BOTH.
244 Note that with BPOS=BOTH BRED=true BM=BOTH, the behavior would be different as
245 'demultiplex' would then check the STRICT option to perform sample resolution.
246 Importantly, when BARCODE_READ_POS (BPOS) == BOTH AND REDUNDANT_BARCODES=false, BCLEN,
247 barcode matching options (MM, MMD, Q) and read trimming/clipping options (XT, ZT) accept
248 different values for both barcodes in the form X:Z where X and Z are 2 integers.
249
250 Default value: true. This option can be set to 'null' to clear the default value.
251 Possible values: {true, false}
252
253 STRICT=Boolean
254 S=Boolean
255
256 For paired-end data and when two distinct barcodes/indices are used to encode samples,
257 this option tells if both barcodes should resolve to the same sample.
258 When true and if only one of the two reads has a barcode match, the read pair is
259 'unassigned'.
260 When false and if only one of the two reads has a barcode match, the read pair is
261 assigned to the
262 corresponding sample.
263 When reads resolve to different samples, the read pair is always 'unassigned'.
264
265 Default value: false. This option can be set to 'null' to clear the default value.
266 Possible values: {true, false}
267
268 MAX_MISMATCHES=String
269 MM=String
270
271 Maximum mismatches for a barcode to be considered a match. In situations where both
272 barcodes are used for sample matching i.e. BPOS=BOTH BM=BOTH (or 2 INDEX_FILE given), two
273 distinct
274 values can be given here using the syntax MM=X:Z where X and Z are 2 integers to use for
275 read_1 and read_2 respectively.
276 MM=null is like MM=0
277
278 Default value: 1. This option can be set to 'null' to clear the default value.
279
280 MIN_MISMATCH_DELTA=String
281 MMD=String
282
283 Minimum difference between the number of mismatches against the best and the second best
284 barcode. When MMD is not respected, the read remains unassigned.
285 When two distinct barcodes are used for sample matching (dual encoding), two distinct
286 values can be given using the syntax MMD=X:Z where X and Z are 2 integers to use for
287 first (e.g. from read_1 or index_1) and second barcode (e.g. from read_2 or index_2) respectively.
288 MMD=null is like MMD=0
289
290 Default value: 1. This option can be set to 'null' to clear the default value.
291
292 MIN_BASE_QUALITY=String
293 Q=String
294
295 Minimum base quality during barcode matching: bases whose quality is less than this
296 cutoff are always considered as a mismatch. When two distinct barcodes are used for sample
297 matching (dual encoding), two distinct values can be given using the syntax Q=X:Z where X
298 and Z are 2 integers to use for first (e.g. from read_1 or index_1) and second barcode
299 (e.g. from read_2 or index_2) respectively.
300 Q=null is like Q=0.
301
302 Default value: 10. This option can be set to 'null' to clear the default value.
303
304 XTRIMLEN=String
305 XT=String
306
307 Optional extra number of bases to be trimmed right after the barcode (only used if
308 CLIP_BARCODE=true).
309 When running paired-end, two distinct values can be given using the syntax XT=X:Z where X
310 and Z are 2 integers to use for read_1 and read_2 respectively. Note that even when
311 BPOS=READ_1 or BPOS=READ_2, an X:Z syntax can be given to trim the read w/o barcode as to
312 end up with reads of the same length (note that this can also be operated using ZT). If a
313 unique value is given, e.g. XT=1, while running paired-end the following rule applies:
314 (1) BPOS=READ_1 or BPOS=READ_2, no trim is applied at the read w/o barcode
315 (2) BPOS=BOTH, the value is used for both reads.
316
317 Note that XT=null is like XT=0.
318 Default value: 0. This option can be set to 'null' to clear the default value.
319
320 ZTRIMLEN=String
321 ZT=String
322
323 Optional extra number of bases to be trimmed from the read end i.e. 3' end.
324 When running paired-end, two distinct values can be given here using the syntax ZT=X:Z
325 where X and Z are 2 integers to use for read_1 and read_2 respectively. Note that even
326 when BPOS=READ_1 or BPOS=READ_2, an X:Z syntax can be given to trim the read w/o barcode
327 as to end up with reads of the same length (note that this can also be operated using
328 XT). Note that if a single value is passed, the value always applies to both reads in
329 paired-end mode without further consideration.
330 ZT=null is like ZT=0.
331
332 Default value: 0. This option can be set to 'null' to clear the default value.
333
334 CLIP_BARCODE=Boolean
335 C=Boolean
336
337 Clip barcode sequence from read sequence, as well as XTRIMLEN (and ZTRIMLEN) bases if
338 applicable, before writing to output file.
339 If false, reads are written without modification to output file.
340 Applies to both barcodes when BPOS=BOTH.
341
342 Default value: true. This option can be set to 'null' to clear the default value.
343 Possible values: {true, false}
344
345 ADD_BARCODE_TO_HEADER=Boolean
346 ADD=Boolean
347
348 Add barcode at the end of the read header. Applies to both barcodes when BPOS=BOTH.
349 If true, the string ':barcode' is added at the end of the read header with a ':' added
350 only if current read header does not end with ':'.
351 If both reads of the pair have a barcode (i.e. BARCODE_READ_POS == BOTH), then the second
352 read also has its own matched barcode written. Else, the read without a barcode receives
353 the barcode from the barcoded read.
354 For example:
355 @D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965 2:N:0:
356 becomes:
357 @D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965 2:N:0:BARCODE
358
359 When barcodes contain random positions, i.e. 'N', (for example like in the iCLIP
360 protocol) or are UMIs, the added sequence is the sequence clipped from the read and NOT
361 the matched barcode.
362
363 Default value: true. This option can be set to 'null' to clear the default value.
364 Possible values: {true, false}
365
366
367 ENSURE_IDENTICAL_HEADER_NAMES=Boolean
368 SAME_HEADERS=Boolean
369
370 Makes sure that headers of both reads of a pair are identical, using the following read
371 header pattern (for both reads of a pair):
372 @D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965 SAMPLEBARCODE_READ1:SAMPLEBARCODE_READ2(:CLIPPED_SEQ_FROMREAD1:CLIPPED_SEQ_FROMREAD2)
373 This option only makes sense in
374 paired end mode and ADD=true. Some (if not all) mappers will indeed complain when the
375 read headers are not identical. When molecular barcodes are present in reads (either as
376 additional barcodes or as degenerate barcodes ie with 'N') and the RCHAR is used, you
377 will end with (problematic) read headers like this:
378 HISEQ:44:C6KC0ANXX:5:1101:1491:1994:1:N:0:TAGAACAC:TGGAGTAG
379 HISEQ:44:C6KC0ANXX:5:1101:1491:1994:3:N:0:TAGAACAC:CGTTGTAT
380 SAME_HEADERS=true will instead generate the following identical header for both reads:
381 HISEQ:44:C6KC0ANXX:5:1101:1491:1994:TAGAACAC:TGGAGTAG:CGTTGTAT
382 Note that we also clipped the useless '1:N:0' and '3:N:0' as they would also result in
383 generating different headers.
384 Important: this option will force RCHAR=: UNLESS you specify RCHAR=null ; in which
385 case a space will be preserved ie:
386 HISEQ:44:C6KC0ANXX:5:1101:1491:1994 TAGAACAC:TGGAGTAG:CGTTGTAT
387
388 Default value: true. This option can be set to 'null' to clear the default value.
389 Possible values: {true, false}
390
391
392 READ_NAME_REPLACE_CHAR=String
393 RCHAR=String
394
395 Replace spaces in read name/header using provided character. This is particularly handy
396 when you need to retain ADDed barcode in read name/header during mapping (everything
397 after space in read name is usually clipped in BAM files). For example, with RCHAR=':':
398 @D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965 2:N:0:
399 becomes
400 @D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965:2:N:0:BARCODE
401 Default value: null.
402
403 QUALITY_FORMAT=FastqQualityFormat
404 V=FastqQualityFormat
405
406 A value describing how the quality values are encoded in the fastq. Either 'Solexa' for
407 pre-pipeline 1.3 style scores (solexa scaling + 66), 'Illumina' for pipeline 1.3 and
408 above (phred scaling + 64) or 'Standard' for phred scaled scores with a character shift
409 of 33. If this value is not specified (or 'null' is given), the quality format will be
410 detected.
411
412 Default value: Standard. This option can be set to 'null' to clear the default value.
413 Possible values: {Solexa, Illumina, Standard}
414
415 KEEP_UNASSIGNED_READ=Boolean
416 UN=Boolean
417
418 Should un-assigned reads be saved in files or simply ignored. File names are
419 automatically created or can be given using UF1 & UF2 options.
420
421 Default value: true. This option can be set to 'null' to clear the default value.
422 Possible values: {true, false}
423
424 BARCODE_DIAG_FILE=String
425 DIAG=String
426
427 Name for a barcode match reporting file (not generated by default). Either a name (in
428 which case the file will be created in the output dir) or full path. This file will
429 contain a line per read pair with the barcode best matching the read subsequence or
430 'null' when no match is found according to matching parameters ; and the final selected
431 sample. This file is useful for debugging or further processing in case both ends are
432 barcoded.
433 N.B: this file will have a size of about one of the fastq input files.
434
435 Default value: null.
436 ]]>
437 </help>
438
439 </tool>