comparison je-markdupes.xml @ 0:d39a96961423 draft

Initial upload
author gbcs-embl-heidelberg
date Wed, 25 Nov 2015 12:36:12 -0500
parents
children 4ccf1406832d
comparison
equal deleted inserted replaced
-1:000000000000 0:d39a96961423
1 <tool id="je_markdupes" name="Je-MarkDuplicates" version="1.0">
<!-- One-line summary shown next to the tool name. -->
<description>to filter BAM files for read duplicates taking UMIs into account</description>

<!-- Shared tokens and macros used further down (e.g. @barcode_option_cmd@, barcode_config_file). -->
<macros>
    <import>macros.xml</import>
</macros>

<!-- Any non-zero exit code is reported as a fatal tool error. -->
<stdio>
    <exit_code level="fatal" range="1:" description="Tool exception"/>
</stdio>

<!-- The reported version is a fixed string, not queried from the executable. -->
<version_command>echo '1.0'</version_command>
<!--
    Cheetah template that assembles the `je markdupes` command line.
    Options are emitted as KEY=value pairs; repeated inputs (COMMENT, SLOTS, TSLOTS)
    are emitted once per repeat instance.
-->
<command interpreter="bash">
<![CDATA[
    je markdupes

    ## picard MarkDuplicates defaults
    INPUT="${inputFile}"
    OUTPUT="${outFile}"

    METRICS_FILE="${metrics_file}"

    REMOVE_DUPLICATES="${remove_duplicates}"
    ASSUME_SORTED="${assume_sorted}"

    #for $element in $adv_options.comments:
        COMMENT="${element.comment}"
    #end for

    DUPLICATE_SCORING_STRATEGY="${adv_options.duplicate_scoring_strategy}"

    ## Shell-quote the regex: it contains characters such as ( ) * that bash would interpret.
    #import pipes
    READ_NAME_REGEX=${ pipes.quote( str( $adv_options.read_name_regex ) ) or "''" }
    OPTICAL_DUPLICATE_PIXEL_DISTANCE="${adv_options.optical_duplicate_pixel_distance}"

    VALIDATION_STRINGENCY="${adv_options.validation_stringency}"
    QUIET=true
    VERBOSITY=ERROR

    ## Je Markdupes Specific
    MM=${MM}
    ## MAX_N is optional: when left empty it is not passed at all.
    #if str($MAX_N) != "":
        MAX_N=${MAX_N}
    #end if

    ## Forward the header split character chosen by the user. The SPLIT input is
    ## declared in <inputs> but was previously never passed to the command line.
    ## NOTE(review): assumes @barcode_option_cmd@ (defined in macros.xml, not visible
    ## here) does not already emit SPLIT - confirm before merging.
    #if str($SPLIT) != "":
        SPLIT="${SPLIT}"
    #end if

    @barcode_option_cmd@

    ## One SLOTS=... per non-empty repeat instance.
    #for $option in $repeat_slots
        #if str($option.SLOTS) != "":
            SLOTS=${option.SLOTS}
        #end if
    #end for

    ## Header trimming options are only emitted when trimming was requested.
    #if str($trim_conditional.T) == "true":
        T=${trim_conditional.T}
        #for $option in $trim_conditional.repeat_tslots
            #if str($option.TSLOTS) != "":
                TSLOTS=${option.TSLOTS}
            #end if
        #end for
    #end if
]]>
</command>
<!-- Config file definitions come from the barcode_config_file macro imported from macros.xml. -->
<configfiles>
    <expand macro="barcode_config_file"/>
</configfiles>
63
<inputs>
    <!-- Input dataset and the basic duplicate-marking switches. -->
    <param format="bam,sam" name="inputFile" type="data" label="Select SAM/BAM dataset"
           help="If empty, upload or import a SAM/BAM dataset"/>
    <!-- Boolean values are spelled out explicitly, matching assume_sorted below. -->
    <param name="remove_duplicates" type="boolean" label="If true do not write duplicates to the output file
           instead of writing them with appropriate flags set" checked="false"
           truevalue="true" falsevalue="false" help="REMOVE_DUPLICATES; default=False"/>
    <param name="assume_sorted" type="boolean" label="Assume the input file is already sorted" checked="true"
           truevalue="true" falsevalue="false" help="ASSUME_SORTED; default=True"/>

    <!-- Optional predefined UMI list: from a history dataset, pasted as text, or none. -->
    <conditional name="barcodes">
        <param name="barcode_list_type_con" type="select" label="Do you have a predefined list of UMIs">
            <option value="file" selected="true">A one column txt file from the history</option>
            <option value="text">Paste the UMI list in a text field</option>
            <option value="no_barcodes">No predefined list</option>
        </param>

        <when value="file">
            <param name="BARCODE_FILE" type="data" format="tabular,txt" label="UMI file"
                   help="BARCODE_FILE. Pre-defined list of Unique Molecular Identifiers that can be expected.
                   Format: one column text file, one UMI per line. All UMIs MUST have the same length."/>
        </when>

        <when value="text">
            <param name="barcode_text" type="text" area="True" size="10x30"
                   value="barcode\n" label="Barcode file"
                   help="BARCODE_FILE. Pre-defined list of Unique Molecular Identifiers that can be expected.
                   Format: one column text file, one UMI per line. All UMIs MUST have the same length.">
                <!-- Accept every printable character and disable the default character mapping. -->
                <sanitizer>
                    <valid initial="string.printable"></valid>
                    <mapping initial="none"/>
                </sanitizer>
            </param>
        </when>
        <when value="no_barcodes"/>
    </conditional>

    <!-- Each repeat instance is emitted as one SLOTS=... argument on the command line. -->
    <repeat name="repeat_slots" min="1" title="Unique Molecular Identifier location">
        <param name="SLOTS" type="text" value="-1" label="Where to find the UMIs in the read name"
               help="SLOTS. The last position is considered by default (-1). See help below."/>
    </repeat>
    <param name="MM" type="integer" value="1" min="0"
           label="Number of maximum mismatches to consider two Unique Molecular Identifiers (UMIs) similar"
           help="MISMATCHES"/>
    <!-- Text rather than integer so that the field can be left empty (option then omitted). -->
    <param name="MAX_N" type="text" value="" label="Maximum number of Ns a UMI can contain"
           help="MAX_NUMBER_OF_N. Above this value, reads are placed in an 'undefined' group.
           Default value is the MISMATCHES number."/>
    <param name="SPLIT" type="text" value=":" label="Character to split up the header" help="SPLIT"/>

    <!-- TSLOTS entries are only offered when header trimming is switched on. -->
    <conditional name="trim_conditional">
        <param name="T" type="select"
               label="Should barcode information be removed from read names in the output BAM" help="TRIM_HEADERS">
            <option value="true">Yes</option>
            <option value="false" selected="true">No</option>
        </param>
        <when value="true">
            <repeat name="repeat_tslots" min="1" title="Unique Molecular Identifier location for trimming">
                <param name="TSLOTS" type="text" value="-1"
                       label="Where to find the UMIs in the read name that should be removed from the header"
                       help="TSLOTS. Value for SLOTS is considered by default. See help below"/>
            </repeat>
        </when>
        <when value="false"/>
    </conditional>

    <section name="adv_options" title="Advanced Options" expanded="False">
        <repeat name="comments" title="Comment" min="0" help="You can provide multiple comments">
            <param name="comment" type="text" label="Add this comment to BAM dataset"/>
        </repeat>

        <param name="duplicate_scoring_strategy" type="select" label="The scoring strategy for choosing the
               non-duplicate among candidates" help="DUPLICATE_SCORING_STRATEGY; default=SUM_OF_BASE_QUALITIES">
            <option value="SUM_OF_BASE_QUALITIES">SUM_OF_BASE_QUALITIES</option>
            <option value="TOTAL_MAPPED_REFERENCE_LENGTH">TOTAL_MAPPED_REFERENCE_LENGTH</option>
        </param>

        <param name="read_name_regex" type="text" value="[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*."
               label="Regular expression that can be used to parse read names in the incoming SAM/BAM dataset"
               help="READ_NAME_REGEX; Read names are parsed to extract three variables: tile/region, x coordinate and
               y coordinate. These values are used to estimate the rate of optical duplication in order to give a more
               accurate estimated library size. See help below for more info;
               default=[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*.">
            <!-- Regex metacharacters must reach the command unchanged, so allow all printable characters. -->
            <sanitizer>
                <valid initial="string.printable">
                </valid>
            </sanitizer>
        </param>
        <param name="optical_duplicate_pixel_distance" type="integer" value="100" min="0" max="500"
               label="The maximum offset between two duplicate clusters in order to consider them optical duplicates"
               help="OPTICAL_DUPLICATE_PIXEL_DISTANCE; default=100"/>

        <param name="validation_stringency" type="select" label="Select validation stringency"
               help="Setting stringency to SILENT can improve performance when processing a BAM file in which
               variable-length data (read, qualities, tags) do not otherwise need to be decoded.">
            <option value="LENIENT" selected="True">Lenient</option>
            <option value="SILENT">Silent</option>
            <option value="STRICT">Strict</option>
        </param>
    </section>
</inputs>
<outputs>
    <!-- BAM written by the command's OUTPUT= argument. -->
    <data name="outFile" format="bam"
          label="${tool.name} on ${on_string}: Je-MarkDuplicates BAM output"/>
    <!-- Text report written by the command's METRICS_FILE= argument. -->
    <data name="metrics_file" format="txt"
          label="${tool.name} on ${on_string}: Je-MarkDuplicate metrics"/>
</outputs>
162
<tests>
    <test>
        <!-- picard markduplicates default test -->

        <!-- Input dataset and predefined UMI list taken from the test data. -->
        <param name="inputFile" value="markdupes_DNase_sorted.bam" ftype="bam"/>
        <param name="barcode_list_type_con" value="file"/>
        <param name="BARCODE_FILE" value="markdupes_umis.txt" ftype="txt"/>

        <!-- Two UMI slots: last and second-to-last header fields. -->
        <param name="repeat_slots_0|SLOTS" value="-1"/>
        <param name="repeat_slots_1|SLOTS" value="-2"/>
        <param name="MM" value="2"/>
        <param name="MAX_N" value="1"/>

        <!-- NOTE(review): unlike the SLOTS params above, this repeat param is not addressed by index - confirm it is applied. -->
        <param name="comment" value="test-run"/>
        <param name="assume_sorted" value="True"/>
        <param name="remove_duplicates" value="True"/>
        <param name="read_name_regex" value="[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*."/>
        <param name="optical_duplicate_pixel_distance" value="100"/>
        <param name="duplicate_scoring_strategy" value="SUM_OF_BASE_QUALITIES"/>
        <param name="validation_stringency" value="LENIENT"/>

        <!-- A few differing lines are tolerated in each output. -->
        <output name="outFile" file="markdupes_DNase_sorted_marked.bam" ftype="bam" lines_diff="2"/>
        <output name="metrics_file" file="markdupes_metrics.txt" ftype="txt" lines_diff="4"/>
    </test>
</tests>
184
185
186 <help>
187 <![CDATA[
188 **What it does**
189
190 Je MarkDupes: Examines aligned records in the supplied SAM or BAM file to locate duplicate molecules taking into account
191 molecular barcodes (Unique Molecular Identifiers or UMIs) found in read header.
192 All records are then either written to the output file with the duplicate records flagged or trashed.
193
194 Input file is a SAM or BAM file.
195
196 Author: Charles Girardot (charles.girardot@embl.de).
197
198 Wrapper by: Jelle Scholtalbers (jelle.scholtalbers@embl.de).
199
200 ------
201
202 **Know what you are doing**
203
204 .. class:: warningmark
205
206 You will want to read the `documentation`__.
207
208 .. __: http://gbcs.embl.de/portal/Je
209
210 ------
211
212 **Parameter list**
213
214 This is an exhaustive list of options::
215
216 INPUT=String
217 I=String
218
219 One or more input SAM or BAM files to analyze. Must be coordinate sorted.
220
221 Default value: null. This option may be specified 0 or more times.
222
223 OUTPUT=File
224 O=File
225
226 The output file to write marked records to
227
228 Required.
229
230 MISMATCHES=Integer
231 MM=Integer
232
233 Number of MisMatches (inclusive) to still consider two Unique Molecular Identifiers
234 (UMIs) the same i.e. this option buffers for sequencing errors.
235 Indeed, in case of a sequencing error, 2 duplicate reads would not be considered
236 duplicates anymore.
237 Note that N are not considered mismatches during comparison ie ATTNGG and NTTANG are seen
238 as the same barcode and these two reads would be flagged duplicates.
239 This option takes a single value even when several barcodes are present (see SLOTS).
240 Note that when declaring several barcodes (see SLOTS) AND providing a predefined set
241 of barcodes (see BC option), the MM value is applicable in each lookup. When a predefined
242 set of barcodes is NOT given, the different barcodes (SLOTS) are concatenated first and
243 the MM value is therefore considered *overall* as the concatenated code is seen as a
244 unique code.
245 MM=null is like MM=0
246 Use the minimum Hamming distance of the original barcode set (if applicable).
247
248 Required.
249
250 MAX_NUMBER_OF_N=Integer
251 MAX_N=Integer
252
253 Maximum number of Ns a molecular code can contain (inclusive). Above this value, reads
254 are placed in an UNDEF group.
255 More precisely, these 'too degenerate' codes will not :
256 * be compared to the list of predefined codes [predefined code list situation ie BC
257 option given] nor
258 * be considered as a potential independent code [no predefined code list situation ie
259 BC option not given]
260 Default value is the MISMATCHES number.
261 Note that when declaring several barcodes (see SLOTS) AND providing a predefined set
262 of barcodes (see BC option), the MAX_N value is applicable to each barcode. When a
263 predefined set
264 of barcodes is NOT given, the different barcodes (SLOTS) are concatenated first and the
265 MAX_N value
266 is therefore considered *overall*.
267
268 Default value: null.
269
270
271 SLOTS=Integer
272 SLOTS=Integer
273
274 Where to find the UMIs (and only the UMIs) in the read name once read name has been
275 tokenized using the SPLIT character (e.g. ':').
276 By default, the UMI is considered to be found at the end of the read header i.e. after
277 the last ':'. Use this option to indicate other or additional UMI positions (e.g.
278 multiple UMIs present in read header).
279 IMPORTANT: counting starts at 1 and negative numbers can be used to start counting from
280 the end.
281 For example, consider the following read name that lists 3 different barcodes in the end:
282 HISEQ:44:C6KC0ANXX:8:2112:20670:79594:CGATGTTT:GATCCTAG:AAGGTACG
283 to indicate that the three barcodes are molecular codes, use
284 SLOTS=-1 SLOTS=-2 SLOTS=-3
285 if only the 2 last ones should be considered (the third one being a sample encoding
286 barcode), use
287 SLOTS=-1 SLOTS=-2
288
289 Default value: null. This option may be specified 0 or more times.
290
291 BARCODE_FILE=File
292 BC=File
293
294 Pre-defined list of UMIs that can be expected. Format: one column text file, one barcode
295 per line. All UMIs MUST have the same length.
296
297 Default value: null.
298
299 TRIM_HEADERS=Boolean
300 T=Boolean
301
302 Should barcode information be removed from read names in the output BAM?
303
304 Default value: false. This option can be set to 'null' to clear the default value.
305 Possible values: {true, false}
306
307 TSLOTS=Integer
308 TSLOTS=Integer
309
310 Where to find *all* barcode(s) (i.e. sample encoding and UMIs) in the read name once it has
311 been tokenized using the SPLIT character (e.g. ':').
312 This option is only considered when TRIM_HEADERS=true. When TSLOTS is omitted while
313 TRIM_HEADERS=true, the values of SLOTS apply.
314 IMPORTANT : counting starts at 1 and negative numbers can be used to start counting from
315 the end.
316 See SLOTS help for examples.
317
318 Default value: null. This option may be specified 0 or more times.
319
320 SPLIT_CHAR=String
321 SPLIT=String
322
323 Character to use to split up the read header line, default is ':'.
324
325 Default value: ':'. This option can be set to 'null' to clear the default value.
326
327 INPUT=String
328 I=String
329
330 One or more input SAM or BAM files to analyze. Must be coordinate sorted.
331
332 Default value: null. This option may be specified 0 or more times.
333
334 OUTPUT=File
335 O=File
336
337 The output file to write marked records to. Required.
338
339 METRICS_FILE=File
340 M=File
341
342 File to write duplication metrics to. Required.
343
344 COMMENT=String
345 CO=String
346
347 Comment(s) to include in the output file's header.
348
349 Default value: null. This option may be specified 0 or more times.
350
351 REMOVE_DUPLICATES=Boolean
352
353 If true do not write duplicates to the output file instead of writing them with
354 appropriate flags set.
355
356 Default value: false. This option can be set to 'null' to clear
357 the default value.
358 Possible values: {true, false}
359
360 ASSUME_SORTED=Boolean
361 AS=Boolean
362
363 If true, assume that the input file is coordinate sorted even if the header says
364 otherwise.
365
366 Default value: false. This option can be set to 'null' to clear the default
367 value.
368 Possible values: {true, false}
369
370 DUPLICATE_SCORING_STRATEGY=ScoringStrategy
371 DS=ScoringStrategy
372
373 The scoring strategy for choosing the non-duplicate among candidates.
374
375 Default value: SUM_OF_BASE_QUALITIES. This option can be set to 'null' to clear the default value.
376 Possible values: {SUM_OF_BASE_QUALITIES, TOTAL_MAPPED_REFERENCE_LENGTH}
377
378 READ_NAME_REGEX=String
379
380 Regular expression that can be used to parse read names in the incoming SAM file. Read
381 names are parsed to extract three variables: tile/region, x coordinate and y coordinate.
382 These values are used to estimate the rate of optical duplication in order to give a more
383 accurate estimated library size. Set this option to null to disable optical duplicate
384 detection. The regular expression should contain three capture groups for the three
385 variables, in order. It must match the entire read name. Note that if the default regex
386 is specified, a regex match is not actually done, but instead the read name is split on
387 colon character. For 5 element names, the 3rd, 4th and 5th elements are assumed to be
388 tile, x and y values. For 7 element names (CASAVA 1.8), the 5th, 6th, and 7th elements
389 are assumed to be tile, x and y values.
390
391 Default value:
392 [a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*. This option can be set to 'null' to
393 clear the default value.
394
395 OPTICAL_DUPLICATE_PIXEL_DISTANCE=Integer
396
397 The maximum offset between two duplicate clusters in order to consider them optical
398 duplicates. This should usually be set to some fairly small number (e.g. 5-10 pixels)
399 unless using later versions of the Illumina pipeline that multiply pixel values by 10, in
400 which case 50-100 is more normal.
401
402 Default value: 100. This option can be set to 'null'
403 to clear the default value.
404
405 ]]>
406 </help>
407
408 </tool>