<?xml version="1.0" encoding="UTF-8"?>
<!--
    Galaxy tool wrapper for "je markdupes": marks or removes read duplicates in
    SAM/BAM datasets while taking Unique Molecular Identifiers (UMIs) found in
    the read names into account.

    Shared pieces (the @barcode_option_cmd@ token and the barcode_config_file
    macro) are pulled in from macros.xml, which is not part of this file.
-->
<tool id="je_markdupes" name="Je-MarkDuplicates" version="1.0">
    <description>to filter BAM files for read duplicates taking UMIs into account</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <stdio>
        <!-- Any non-zero exit code is reported as a fatal tool error. -->
        <exit_code range="1:" level="fatal" description="Tool exception" />
    </stdio>
    <version_command>echo '1.0'</version_command>
    <!--
        Cheetah template that assembles the command line. Options use the
        KEY=VALUE form; repeated inputs (comments, SLOTS, TSLOTS) emit the
        same key once per entry.
    -->
    <command interpreter="bash">
<![CDATA[
        je markdupes

        ## picard MarkDuplicates defaults
        INPUT="${inputFile}"
        OUTPUT="${outFile}"

        METRICS_FILE="${metrics_file}"

        REMOVE_DUPLICATES="${remove_duplicates}"
        ASSUME_SORTED="${assume_sorted}"

        ## One COMMENT option per entry of the "comments" repeat.
        #for $element in $adv_options.comments:
            COMMENT="${element.comment}"
        #end for

        DUPLICATE_SCORING_STRATEGY="${adv_options.duplicate_scoring_strategy}"

        ## The regex is shell-quoted because it contains parentheses and brackets;
        ## the `or` branch supplies an empty quoted string if the expression is falsy.
        #import pipes
        READ_NAME_REGEX=${ pipes.quote( str( $adv_options.read_name_regex ) ) or "''" }
        OPTICAL_DUPLICATE_PIXEL_DISTANCE="${adv_options.optical_duplicate_pixel_distance}"

        VALIDATION_STRINGENCY="${adv_options.validation_stringency}"
        QUIET=true
        VERBOSITY=ERROR

        ## Je Markdupes Specific
        MM=${MM}
        ## MAX_N is only passed when the user filled it in.
        #if str($MAX_N) != "":
            MAX_N=${MAX_N}
        #end if
        ## Forward the read-name split character chosen in the form; it was
        ## previously collected but never passed to the command. Skipped when
        ## the field is left empty.
        ## NOTE(review): assumes @barcode_option_cmd@ (macros.xml) does not
        ## already emit SPLIT; confirm against macros.xml.
        #if str($SPLIT) != "":
            SPLIT="${SPLIT}"
        #end if
        @barcode_option_cmd@

        ## One SLOTS option per non-empty entry of the "repeat_slots" repeat.
        #for $i, $option in enumerate( $repeat_slots )
            #if str($option.SLOTS) != "":
                SLOTS=${option.SLOTS}
            #end if
        #end for

        ## Header trimming and its TSLOTS are only emitted when trimming is enabled.
        #if str($trim_conditional.T) == "true":
            T=${trim_conditional.T}
            #for $i, $option in enumerate( $trim_conditional.repeat_tslots )
                #if str($option.TSLOTS) != "":
                    TSLOTS=${option.TSLOTS}
                #end if
            #end for
        #end if
]]>
    </command>
    <configfiles>
        <expand macro="barcode_config_file"></expand>
    </configfiles>

    <inputs>
        <param format="bam,sam" name="inputFile" type="data" label="Select SAM/BAM dataset"
               help="If empty, upload or import a SAM/BAM dataset"/>
        <!-- Boolean values are spelled out so both flags below read the same way. -->
        <param name="remove_duplicates" type="boolean" checked="false" truevalue="true" falsevalue="false"
               label="If true do not write duplicates to the output file instead of writing them with appropriate flags set"
               help="REMOVE_DUPLICATES; default=False"/>
        <param name="assume_sorted" type="boolean" checked="true" truevalue="true" falsevalue="false"
               label="Assume the input file is already sorted"
               help="ASSUME_SORTED; default=True"/>

        <!-- Source of the optional predefined UMI list: history file, pasted text, or none. -->
        <conditional name="barcodes">
            <param name="barcode_list_type_con" type="select" label="Do you have a predefined list of UMIs">
                <option value="file" selected="true">A one column txt file from the history</option>
                <option value="text">Paste the UMI list in a text field</option>
                <option value="no_barcodes">No predefined list</option>
            </param>

            <when value="file">
                <param name="BARCODE_FILE" type="data" format="tabular,txt" label="UMI file"
                       help="BARCODE_FILE. Pre-defined list of Unique Molecular Identifiers that can be expected. Format: one column text file, one UMI per line. All UMIs MUST have the same length."/>
            </when>

            <when value="text">
                <param name="barcode_text" type="text" area="True" size="10x30"
                       value="barcode\n" label="Barcode file"
                       help="BARCODE_FILE. Pre-defined list of Unique Molecular Identifiers that can be expected. Format: one column text file, one UMI per line. All UMIs MUST have the same length.">
                    <sanitizer>
                        <valid initial="string.printable"></valid>
                        <mapping initial="none"/>
                    </sanitizer>
                </param>
            </when>
            <when value="no_barcodes"/>
        </conditional>

        <repeat name="repeat_slots" min="1" title="Unique Molecular Identifier location">
            <param name="SLOTS" type="text" value="-1" label="Where to find the UMIs in the read name"
                   help="SLOTS. The last position is considered by default (-1). See help below."/>
        </repeat>
        <param name="MM" type="integer" value="1" min="0"
               label="Number of maximum mismatches to consider two Unique Molecular Identifiers (UMIs) similar"
               help="MISMATCHES"/>
        <param name="MAX_N" type="text" value="" label="Maximum number of Ns a UMI can contain"
               help="MAX_NUMBER_OF_N. Above this value, reads are placed in an 'undefined' group. Default value is the MISMATCHES number."/>
        <param name="SPLIT" type="text" value=":" label="Character to split up the header" help="SPLIT"/>

        <!-- TSLOTS entries are only offered when header trimming is switched on. -->
        <conditional name="trim_conditional">
            <param name="T" type="select"
                   label="Should barcode information be removed from read names in the output BAM" help="TRIM_HEADERS">
                <option value="true">Yes</option>
                <option value="false" selected="true">No</option>
            </param>
            <when value="true">
                <repeat name="repeat_tslots" min="1" title="Unique Molecular Identifier location for trimming">
                    <param name="TSLOTS" type="text" value="-1"
                           label="Where to find the UMIs in the read name that should be removed from the header"
                           help="TSLOTS. Value for SLOTS is considered by default. See help below"/>
                </repeat>
            </when>
            <when value="false"/>
        </conditional>

        <section name="adv_options" title="Advanced Options" expanded="False">
            <repeat name="comments" title="Comment" min="0" help="You can provide multiple comments">
                <param name="comment" type="text" label="Add this comment to BAM dataset"/>
            </repeat>

            <param name="duplicate_scoring_strategy" type="select"
                   label="The scoring strategy for choosing the non-duplicate among candidates"
                   help="DUPLICATE_SCORING_STRATEGY; default=SUM_OF_BASE_QUALITIES">
                <option value="SUM_OF_BASE_QUALITIES">SUM_OF_BASE_QUALITIES</option>
                <option value="TOTAL_MAPPED_REFERENCE_LENGTH">TOTAL_MAPPED_REFERENCE_LENGTH</option>
            </param>

            <param name="read_name_regex" type="text" value="[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*."
                   label="Regular expression that can be used to parse read names in the incoming SAM/BAM dataset"
                   help="READ_NAME_REGEX; Read names are parsed to extract three variables: tile/region, x coordinate and y coordinate. These values are used to estimate the rate of optical duplication in order to give a more accurate estimated library size. See help below for more info; default=[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*.">
                <sanitizer>
                    <valid initial="string.printable">
                    </valid>
                </sanitizer>
            </param>
            <param name="optical_duplicate_pixel_distance" type="integer" value="100" min="0" max="500"
                   label="The maximum offset between two duplicate clusters in order to consider them optical duplicates"
                   help="OPTICAL_DUPLICATE_PIXEL_DISTANCE; default=100"/>

            <param name="validation_stringency" type="select" label="Select validation stringency"
                   help="Setting stringency to SILENT can improve performance when processing a BAM file in which variable-length data (read, qualities, tags) do not otherwise need to be decoded.">
                <option value="LENIENT" selected="True">Lenient</option>
                <option value="SILENT">Silent</option>
                <option value="STRICT">Strict</option>
            </param>
        </section>
    </inputs>
    <outputs>
        <data format="bam" name="outFile" label="${tool.name} on ${on_string}: Je-MarkDuplicates BAM output"/>
        <data format="txt" name="metrics_file" label="${tool.name} on ${on_string}: Je-MarkDuplicate metrics"/>
    </outputs>

    <tests>
        <test>
            <!-- picard markduplicates default test -->
            <param name="inputFile" value="markdupes_DNase_sorted.bam" ftype="bam"/>
            <param name="barcode_list_type_con" value="file"/>
            <param name="BARCODE_FILE" value="markdupes_umis.txt" ftype="txt"/>
            <param name="repeat_slots_0|SLOTS" value="-1"/>
            <param name="repeat_slots_1|SLOTS" value="-2"/>
            <param name="MM" value="2"/>
            <param name="MAX_N" value="1"/>
            <param name="comment" value="test-run"/>
            <param name="assume_sorted" value="True"/>
            <param name="remove_duplicates" value="True"/>
            <param name="read_name_regex" value="[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*."/>
            <param name="optical_duplicate_pixel_distance" value="100"/>
            <param name="duplicate_scoring_strategy" value="SUM_OF_BASE_QUALITIES"/>
            <param name="validation_stringency" value="LENIENT"/>
            <output name="outFile" file="markdupes_DNase_sorted_marked.bam" ftype="bam" lines_diff="2"/>
            <output name="metrics_file" file="markdupes_metrics.txt" ftype="txt" lines_diff="4"/>
        </test>
    </tests>

    <help>
<![CDATA[
**What it does**

Je MarkDupes: Examines aligned records in the supplied SAM or BAM file to locate duplicate molecules taking into account
molecular barcodes (Unique Molecular Identifiers or UMIs) found in read header.
All records are then either written to the output file with the duplicate records flagged or trashed.

Input file is a SAM or BAM file.

Author: Charles Girardot (charles.girardot@embl.de).

Wrapper by: Jelle Scholtalbers (jelle.scholtalbers@embl.de).

------

**Know what you are doing**

.. class:: warningmark

You will want to read the `documentation`__.

.. __: http://gbcs.embl.de/portal/Je

------

**Parameter list**

This is an exhaustive list of options::

    INPUT=String
    I=String

        One or more input SAM or BAM files to analyze. Must be coordinate sorted.

        Default value: null. This option may be specified 0 or more times.

    OUTPUT=File
    O=File

        The output file to write marked records to.

        Required.

    MISMATCHES=Integer
    MM=Integer

        Number of MisMatches (inclusive) to still consider two Unique Molecular Identifiers
        (UMIs) the same i.e. this option buffers for sequencing errors.
        Indeed, in case of a sequencing error, 2 duplicate reads would not be considered
        duplicates anymore.
        Note that N are not considered mismatches during comparison ie ATTNGG and NTTANG are seen
        as the same barcode and these two reads would be flagged duplicates.
        This option takes a single value even when several barcodes are present (see SLOTS).
        Note that when declaring several barcodes (see SLOTS) AND providing a predefined set
        of barcodes (see BC option), the MM value is applicable in each lookup. When a predefined
        set of barcodes is NOT given, the different barcodes (SLOTS) are concatenated first and
        the MM value is therefore considered *overall* as the concatenated code is seen as a
        unique code.
        MM=null is like MM=0
        Use the minimum Hamming distance of the original barcode set (if applicable).

        Required.

    MAX_NUMBER_OF_N=Integer
    MAX_N=Integer

        Maximum number of Ns a molecular code can contain (inclusive). Above this value, reads
        are placed in a UNDEF group.
        More precisely, these 'too degenerate' codes will not :
          * be compared to the list of predefined codes [predefined code list situation ie BC
            option given] nor
          * be considered as a potential independent code [no predefined code list situation ie
            BC option not given]
        Default value is the MISMATCHES number.
        Note that when declaring several barcodes (see SLOTS) AND providing a predefined set
        of barcodes (see BC option), the MAX_N value is applicable to each barcode. When a
        predefined set of barcodes is NOT given, the different barcodes (SLOTS) are concatenated
        first and the MAX_N value is therefore considered *overall*.

        Default value: null.

    SLOTS=Integer
    SLOTS=Integer

        Where to find the UMIs (and only the UMIs) in the read name once read name has been
        tokenized using the SPLIT character (e.g. ':').
        By default, the UMI is considered to be found at the end of the read header i.e. after
        the last ':'. Use this option to indicate other or additional UMI positions (e.g.
        multiple UMIs present in read header).
        IMPORTANT: counting starts at 1 and negative numbers can be used to start counting from
        the end.
        For example, consider the following read name that lists 3 different barcodes in the end:
            HISEQ:44:C6KC0ANXX:8:2112:20670:79594:CGATGTTT:GATCCTAG:AAGGTACG
        to indicate that the three barcodes are molecular codes, use
            SLOTS=-1 SLOTS=-2 SLOTS=-3
        if only the 2 last ones should be considered (the third one being a sample encoding
        barcode), use
            SLOTS=-1 SLOTS=-2

        Default value: null. This option may be specified 0 or more times.

    BARCODE_FILE=File
    BC=File

        Pre-defined list of UMIs that can be expected. Format: one column text file, one barcode
        per line. All UMIs MUST have the same length.

        Default value: null.

    TRIM_HEADERS=Boolean
    T=Boolean

        Should barcode information be removed from read names in the output BAM?

        Default value: false. This option can be set to 'null' to clear the default value.
        Possible values: {true, false}

    TSLOTS=Integer
    TSLOTS=Integer

        Where to find *all* barcode(s) (i.e. sample encoding and UMIs) in the read name once has
        been tokenized using the SPLIT character (e.g. ':').
        This option is only considered when TRIM_HEADERS=true. When TSLOTS is omitted while
        TRIM_HEADERS=true, the values of SLOTS apply.
        IMPORTANT : counting starts at 1 and negative numbers can be used to start counting from
        the end.
        See SLOTS help for examples.

        Default value: null. This option may be specified 0 or more times.

    SPLIT_CHAR=String
    SPLIT=String

        Character to use to split up the read header line, default is ':'.

        Default value: ':'. This option can be set to 'null' to clear the default value.

    METRICS_FILE=File
    M=File

        File to write duplication metrics to.

        Required.

    COMMENT=String
    CO=String

        Comment(s) to include in the output file's header.

        Default value: null. This option may be specified 0 or more times.

    REMOVE_DUPLICATES=Boolean

        If true do not write duplicates to the output file instead of writing them with
        appropriate flags set.

        Default value: false. This option can be set to 'null' to clear
        the default value.
        Possible values: {true, false}

    ASSUME_SORTED=Boolean
    AS=Boolean

        If true, assume that the input file is coordinate sorted even if the header says
        otherwise.

        Default value: false. This option can be set to 'null' to clear the default
        value.
        Possible values: {true, false}

    DUPLICATE_SCORING_STRATEGY=ScoringStrategy
    DS=ScoringStrategy

        The scoring strategy for choosing the non-duplicate among candidates.

        Default value: SUM_OF_BASE_QUALITIES. This option can be set to 'null' to clear the default value.
        Possible values: {SUM_OF_BASE_QUALITIES, TOTAL_MAPPED_REFERENCE_LENGTH}

    READ_NAME_REGEX=String

        Regular expression that can be used to parse read names in the incoming SAM file. Read
        names are parsed to extract three variables: tile/region, x coordinate and y coordinate.
        These values are used to estimate the rate of optical duplication in order to give a more
        accurate estimated library size. Set this option to null to disable optical duplicate
        detection. The regular expression should contain three capture groups for the three
        variables, in order. It must match the entire read name. Note that if the default regex
        is specified, a regex match is not actually done, but instead the read name is split on
        colon character. For 5 element names, the 3rd, 4th and 5th elements are assumed to be
        tile, x and y values. For 7 element names (CASAVA 1.8), the 5th, 6th, and 7th elements
        are assumed to be tile, x and y values.

        Default value:
        [a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*. This option can be set to 'null' to
        clear the default value.

    OPTICAL_DUPLICATE_PIXEL_DISTANCE=Integer

        The maximum offset between two duplicate clusters in order to consider them optical
        duplicates. This should usually be set to some fairly small number (e.g. 5-10 pixels)
        unless using later versions of the Illumina pipeline that multiply pixel values by 10, in
        which case 50-100 is more normal.

        Default value: 100. This option can be set to 'null'
        to clear the default value.

]]>
    </help>

</tool>