comparison picard_AddOrReplaceReadGroups.xml @ 5:3d4f1fa26f0e draft

Uploaded
author devteam
date Tue, 16 Dec 2014 19:03:21 -0500
parents 9227b8c3093b
children 3a3234d7a2e8
comparison
equal deleted inserted replaced
4:ab1f60c26526 5:3d4f1fa26f0e
1 <tool name="Add or Replace Groups" id="picard_ARRG" version="1.56.0"> 1 <tool name="AddOrReplaceReadGroups" id="picard_AddOrReplaceReadGroups" version="1.126.0">
2 <requirements><requirement type="package" version="1.56.0">picard</requirement></requirements> 2 <description>add or replaces read group information</description>
3 <command interpreter="python"> 3 <requirements>
4 picard_wrapper.py 4 <requirement type="package" version="1.126.0">picard</requirement>
5 --input="${inputFile}" 5 </requirements>
6 --rg-lb="${rglb}" 6
7 --rg-pl="${rgpl}" 7 <macros>
8 --rg-pu="${rgpu}" 8 <import>picard_macros.xml</import>
9 --rg-sm="${rgsm}" 9 </macros>
10 --rg-id="${rgid}" 10
11 --rg-opts="${readGroupOpts.rgOpts}" 11 <command>
12 #if $readGroupOpts.rgOpts == "full" 12 @java_options@
13 --rg-cn="${readGroupOpts.rgcn}" 13 java -jar \$JAVA_JAR_PATH/picard.jar
14 --rg-ds="${readGroupOpts.rgds}" 14 AddOrReplaceReadGroups
15 INPUT="${inputFile}"
16 RGLB="${rglb}"
17 RGPL="${rgpl}"
18 RGPU="${rgpu}"
19 RGSM="${rgsm}"
20 RGID="${rgid}"
21
22 #if str( $rgcn):
23 RGCN="${rgcn}"
15 #end if 24 #end if
16 --output-format="${outputFormat}" 25
17 --output="${outFile}" 26 #if str( $rgds):
18 -j "\$JAVA_JAR_PATH/AddOrReplaceReadGroups.jar" 27 RGDS="${rgds}"
19 --tmpdir "${__new_file_path__}" 28 #end if
29
30 #if str( $rgpi):
31 RGPI="${rgpi}"
32 #end if
33
34 #if str( $rgdt):
35 RGDT="${rgdt}"
36 #end if
37
38 VALIDATION_STRINGENCY="${validation_stringency}"
39 QUIET=true
40 VERBOSITY=ERROR
41 OUTPUT="${outFile}"
42
20 </command> 43 </command>
44
21 <inputs> 45 <inputs>
22 <param format="bam,sam" name="inputFile" type="data" label="SAM/BAM dataset to add or replace read groups in" 46 <param format="bam,sam" name="inputFile" type="data" label="Select SAM/BAM dataset or dataset collection" help="If empty, upload or import a SAM/BAM dataset" />
23 help="If empty, upload or import a SAM/BAM dataset." /> 47 <param name="rgid" value="A" type="text" label="Read Group ID" help="RGID; Required" />
24 <param name="rgid" value="1" type="text" label="Read group ID (ID tag)" help="The most important read group tag. Galaxy will use a value of '1' if nothing provided." /> 48 <param name="rgsm" value="sample-a" type="text" label="Read Group Sample name" help="RGSM; Required" />
25 <param name="rgsm" value="" type="text" label="Read group sample name (SM tag)" /> 49 <param name="rglb" value="tumor-a" type="text" label="Read Group library" help="RGLB; Required" />
26 <param name="rglb" value="" type="text" label="Read group library (LB tag)" /> 50 <param name="rgpl" value="Illumina" type="text" label="Read Group Platform" help="RGPL; Required; e.g., Illumina, 454, IonTorrent, etc" />
27 <param name="rgpl" value="" type="text" label="Read group platform (PL tag)" help="illumina, solid, 454, pacbio, helicos" /> 51 <param name="rgpu" value="run-1" type="text" label="Read Group Platform Unit" help="RGPU; Required; e.g., run, barcode, etc" />
28 <param name="rgpu" value="" type="text" label="Read group platform unit" help="like run barcode, etc." /> 52 <!-- optional params -->
29 <conditional name="readGroupOpts"> 53 <param name="rgcn" value="" optional="True" type="text" label="Read Group sequencing center name" help="RGCN; Optional" />
30 <param name="rgOpts" type="select" label="Specify additional (optional) arguments" help="Allows you to set RGCN and RGDS."> 54 <param name="rgds" value="" optional="True" type="text" label="Read Group description" help="RGDS; Optional" />
31 <option value="preSet">Use pre-set defaults</option> 55 <param name="rgpi" value="" optional="True" type="integer" label="Read Group predicted insert size" help="RGPI; Optional" />
32 <option value="full">Set optional arguments</option> 56 <param name="rgdt" value="" optional="True" type="text" label="Read Group run date" help="RGDT; Optional; Format=YYYY-MM-DD (eg 1997-07-16)"/>
33 </param> 57
34 <when value="preSet" /> 58 <expand macro="VS" />
35 <when value="full"> 59
36 <param name="rgcn" value="" type="text" label="Read group sequencing center name" help="Leave set to &lt;null&gt; for default (none)" />
37 <param name="rgds" value="" type="text" label="Read group description" help="Leave set to &lt;null&gt; for default (none)" />
38 </when>
39 </conditional>
40 <param name="outputFormat" type="boolean" checked="True" truevalue="bam" falsevalue="sam" label="Output bam instead of sam" help="Uncheck for sam output" />
41 </inputs> 60 </inputs>
61
42 <outputs> 62 <outputs>
43 <data name="outFile" format="bam" label="${tool.name} on ${on_string}: ${outputFormat} with read groups replaced"> 63 <data name="outFile" format="bam" label="${tool.name} on ${on_string}: BAM with replaced/modified readgroups"/>
44 <change_format>
45 <when input="outputFormat" value="sam" format="sam" />
46 </change_format>
47 </data>
48 </outputs> 64 </outputs>
65
66 <stdio>
67 <exit_code range="1:" level="fatal"/>
68 </stdio>
69
49 <tests> 70 <tests>
50 <test> 71 <test>
51 <!-- Command for replacing read groups in bam: 72 <param name="inputFile" value="picard_ARRG.bam" />
52 java -jar AddOrReplaceReadGroups.jar VALIDATION_STRINGENCY=LENIENT I=test-data/picard_ARRG_input1.bam O=picard_ARRG_output1.sam RGID=one RGLB=lib RGPL=illumina RGPU=peaewe RGSM=sam1 73 <param name="rglb" value="tumor-a" />
53 --> 74 <param name="rgpl" value="Illumina" />
54 <param name="inputFile" value="picard_ARRG_input1.bam" /> 75 <param name="rgpu" value="run-1" />
55 <param name="rglb" value="lib" /> 76 <param name="rgsm" value="sample-a" />
56 <param name="rgpl" value="illumina" /> 77 <param name="rgid" value="id-1" />
57 <param name="rgpu" value="peaewe" /> 78 <output name="outFile" file="picard_ARRG_test1.bam" ftype="bam" />
58 <param name="rgsm" value="sam1" />
59 <param name="rgid" value="one" />
60 <param name="rgOpts" value="preSet" />
61 <param name="outputFormat" value="False" />
62 <output name="outFile" file="picard_ARRG_output1.sam" ftype="sam" />
63 </test>
64 <test>
65 <!-- Command for replacing read groups in sam:
66 java -jar AddOrReplaceReadGroups.jar VALIDATION_STRINGENCY=LENIENT I=test-data/picard_ARRG_input1.sam O=picard_ARRG_output2.sam RGLB=LIB RGPL=IL RGPU=PLAT RGSM=smp RGID=M5 RGCN=FamousCenter RGDS="description with spaces"
67 picard_ARRG_input1.bam can be created from picard_ARRG_input1.sam
68 -->
69 <param name="inputFile" value="picard_ARRG_input1.sam" />
70 <param name="rglb" value="LIB" />
71 <param name="rgpl" value="IL" />
72 <param name="rgpu" value="PLAT" />
73 <param name="rgsm" value="smp" />
74 <param name="rgid" value="M5" />
75 <param name="rgOpts" value="full" />
76 <param name="rgcn" value="FamousCenter" />
77 <param name="rgds" value="description with spaces" />
78 <param name="outputFormat" value="False" />
79 <output name="outFile" file="picard_ARRG_output2.sam" ftype="sam" />
80 </test>
81 <test>
82 <!-- Command for adding read groups in sam:
83 java -jar AddOrReplaceReadGroups.jar VALIDATION_STRINGENCY=LENIENT I=test-data/picard_ARRG_input2.sam O=picard_ARRG_output3.bam RGID=M6 RGLB=LIB RGPL=IL RGPU=PLAT RGSM=smp1
84 -->
85 <param name="inputFile" value="picard_ARRG_input2.sam" />
86 <param name="rglb" value="LIB" />
87 <param name="rgpl" value="IL" />
88 <param name="rgpu" value="PLAT" />
89 <param name="rgsm" value="smp1" />
90 <param name="rgid" value="M6" />
91 <param name="rgOpts" value="preSet" />
92 <param name="outputFormat" value="True" />
93 <output name="outFile" file="picard_ARRG_output3.bam" ftype="bam" />
94 </test> 79 </test>
95 </tests> 80 </tests>
96 <help> 81 <help>
97 82
98 .. class:: infomark 83 .. class:: infomark
99 84
100 **Purpose** 85 **Purpose**
101 86
102 Add or Replace Read Groups in an input BAM or SAM file. 87 Add or Replace Read Groups in an input BAM or SAM file.
103 88
104 **Read Groups are Important!** 89 @dataset_collections@
105 90
106 Many downstream analysis tools (such as GATK, for example) require BAM datasets to contain read groups. Even if you are not going to use GATK, setting read groups correctly from the start will simplify your life greatly. Below we provide an explanation of read groups fields taken from GATK FAQ webpage: 91 @RG@
107 92
108 .. csv-table:: 93 @description@
109 :header-rows: 1
110
111 Tag,Importance,Definition,Meaning
112 "ID","Required","Read group identifier. Each @RG line must have a unique ID. The value of ID is used in the RG tags of alignment records. Must be unique among all read groups in header section. Read group IDs may be modified when merging SAM files in order to handle collisions.","Ideally, this should be a globally unique identifier across all sequencing data in the world, such as the Illumina flowcell + lane name and number. Will be referenced by each read with the RG:Z field, allowing tools to determine the read group information associated with each read, including the sample from which the read came. Also, a read group is effectively treated as a separate run of the NGS instrument in tools like base quality score recalibration (a GATK component) -- all reads within a read group are assumed to come from the same instrument run and to therefore share the same error model."
113 "SM","Sample. Use pool name where a pool is being sequenced.","Required. As important as ID.","The name of the sample sequenced in this read group. GATK tools treat all read groups with the same SM value as containing sequencing data for the same sample. Therefore it's critical that the SM field be correctly specified, especially when using multi-sample tools like the Unified Genotyper (a GATK component)."
114 "PL","Platform/technology used to produce the read. Valid values: ILLUMINA, SOLID, LS454, HELICOS and PACBIO.","Important. Not currently used in the GATK, but was in the past, and may return. The only way to know the sequencing technology used to generate the sequencing data","It's a good idea to use this field."
115 "LB","DNA preparation library identifier","Essential for MarkDuplicates","MarkDuplicates uses the LB field to determine which read groups might contain molecular duplicates, in case the same DNA library was sequenced on multiple lanes."
116 94
117 **Example of Read Group usage** 95 INPUT=File
96 I=File Input file (bam or sam). Required.
118 97
119 Suppose we have a trio of samples: MOM, DAD, and KID. Each has two DNA libraries prepared, one with 400 bp inserts and another with 200 bp inserts. Each of these libraries is run on two lanes of an illumina hiseq, requiring 3 x 2 x 2 = 12 lanes of data. When the data come off the sequencer, we would create 12 BAM files, with the following @RG fields in the header:: 98 OUTPUT=File
99 O=File Output file (bam or sam). Required.
120 100
121 Dad's data: 101 SORT_ORDER=SortOrder
122 @RG ID:FLOWCELL1.LANE1 PL:illumina LB:LIB-DAD-1 SM:DAD PI:200 102 SO=SortOrder Optional sort order to output in. If not supplied OUTPUT is in the same order as INPUT.
123 @RG ID:FLOWCELL1.LANE2 PL:illumina LB:LIB-DAD-1 SM:DAD PI:200 103 Default value: null. Possible values: {unsorted, queryname, coordinate}
124 @RG ID:FLOWCELL1.LANE3 PL:illumina LB:LIB-DAD-2 SM:DAD PI:400 104
125 @RG ID:FLOWCELL1.LANE4 PL:illumina LB:LIB-DAD-2 SM:DAD PI:400 105 RGID=String
106 ID=String Read Group ID Default value: 1. This option can be set to 'null' to clear the default
107 value.
108
109 RGLB=String
110 LB=String Read Group Library Required.
126 111
127 Mom's data: 112 RGPL=String
128 @RG ID:FLOWCELL1.LANE5 PL:illumina LB:LIB-MOM-1 SM:MOM PI:200 113 PL=String Read Group platform (e.g. illumina, solid) Required.
129 @RG ID:FLOWCELL1.LANE6 PL:illumina LB:LIB-MOM-1 SM:MOM PI:200
130 @RG ID:FLOWCELL1.LANE7 PL:illumina LB:LIB-MOM-2 SM:MOM PI:400
131 @RG ID:FLOWCELL1.LANE8 PL:illumina LB:LIB-MOM-2 SM:MOM PI:400
132
133 Kid's data:
134 @RG ID:FLOWCELL2.LANE1 PL:illumina LB:LIB-KID-1 SM:KID PI:200
135 @RG ID:FLOWCELL2.LANE2 PL:illumina LB:LIB-KID-1 SM:KID PI:200
136 @RG ID:FLOWCELL2.LANE3 PL:illumina LB:LIB-KID-2 SM:KID PI:400
137 @RG ID:FLOWCELL2.LANE4 PL:illumina LB:LIB-KID-2 SM:KID PI:400
138 114
139 Note the hierarchical relationship between read groups (unique for each lane) to libraries (sequenced on two lanes) and samples (across four lanes, two lanes for each library). 115 RGPU=String
116 PU=String Read Group platform unit (eg. run barcode) Required.
140 117
141 **Picard documentation** 118 RGSM=String
119 SM=String Read Group sample name Required.
142 120
143 This is a Galaxy wrapper for AddOrReplaceReadGroups, a part of the external package Picard-tools_. 121 RGCN=String
122 CN=String Read Group sequencing center name Default value: null.
144 123
145 .. _Picard-tools: http://www.google.com/search?q=picard+samtools 124 RGDS=String
125 DS=String Read Group description Default value: null.
146 126
147 ------ 127 RGDT=Iso8601Date
128 DT=Iso8601Date Read Group run date Default value: null.
148 129
149 .. class:: infomark 130 RGPI=Integer
131 PI=Integer Read Group predicted insert size Default value: null.
150 132
151 **Inputs, outputs, and parameters** 133 @more_info@
152
153 Either a sam file or a bam file must be supplied. If a bam file is used, it must
154 be coordinate-sorted. Galaxy currently coordinate-sorts all bam files.
155
156 The output file is either bam (the default) or sam, according to user selection,
157 and contains the same information as the input file except for the appropriate
158 additional (or modified) read group tags. Bam is recommended since it is smaller.
159
160 From the Picard documentation.
161
162 AddOrReplaceReadGroups REQUIRED parameters::
163
164 Option (Type) Description
165
166 RGLB=String Read Group Library
167 RGPL=String Read Group platform (e.g. illumina, solid)
168 RGPU=String Read Group platform unit (eg. run barcode)
169 RGSM=String Read Group sample name
170 RGID=String Read Group ID; Default value: null (empty)
171
172 AddOrReplaceReadGroups OPTIONAL parameters::
173
174 Option (Type) Description
175
176 RGCN=String Read Group sequencing center name; Default value: null (empty)
177 RGDS=String Read Group description Default value: null (empty)
178
179 One parameter that Picard's AddOrReplaceReadGroups offers that is automatically
180 set by Galaxy is the SORT_ORDER, which is set to coordinate.
181
182 .. class:: warningmark
183
184 **Warning on SAM/BAM quality**
185
186 Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT**
187 flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears
188 to be the only way to deal with SAM/BAM that cannot be parsed.
189
190
191
192 </help> 134 </help>
193 </tool> 135 </tool>
194 136
195 137
196 138
201 143
202 144
203 145
204 146
205 147
148