Mercurial > repos > devteam > pileup_parser
comparison pileup_parser.xml @ 2:85bedbea8a12 draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tools/pileup_parser commit ab627176cd4f6efe6d1fe4b85baa679aaa651eb1
author | devteam |
---|---|
date | Wed, 05 Oct 2016 06:30:36 -0400 |
parents | 1670f0565000 |
children |
comparison
equal
deleted
inserted
replaced
1:1670f0565000 | 2:85bedbea8a12 |
---|---|
1 <tool id="pileup_parser" name="Filter pileup" version="1.0.2"> | 1 <tool id="pileup_parser" name="Filter pileup" version="1.0.2"> |
2 <description>on coverage and SNPs</description> | 2 <description>on coverage and SNPs</description> |
3 <requirements> | |
4 <requirement type="package" version="5.22.0">perl</requirement> | |
5 </requirements> | |
3 <command interpreter="perl"> | 6 <command interpreter="perl"> |
4 #if $pileup_type.type_select == "six" #pileup_parser.pl $input "3" "5" "6" "4" $qv_cutoff $cvrg_cutoff $snps_only $interval "2" $out_file1 $diff $qc_base | 7 #if $pileup_type.type_select == "six" #pileup_parser.pl $input "3" "5" "6" "4" $qv_cutoff $cvrg_cutoff $snps_only $interval "2" $out_file1 $diff $qc_base |
5 #elif $pileup_type.type_select == "ten" #pileup_parser.pl $input "3" "9" "10" "8" $qv_cutoff $cvrg_cutoff $snps_only $interval "2" $out_file1 $diff $qc_base | 8 #elif $pileup_type.type_select == "ten" #pileup_parser.pl $input "3" "9" "10" "8" $qv_cutoff $cvrg_cutoff $snps_only $interval "2" $out_file1 $diff $qc_base |
6 #elif $pileup_type.type_select == "manual" #pileup_parser.pl $input $pileup_type.ref_base_column $pileup_type.read_bases_column $pileup_type.read_qv_column $pileup_type.cvrg_column $qv_cutoff $cvrg_cutoff $snps_only $interval $pileup_type.coord_column $out_file1 $diff $qc_base | 9 #elif $pileup_type.type_select == "manual" #pileup_parser.pl $input $pileup_type.ref_base_column $pileup_type.read_bases_column $pileup_type.read_qv_column $pileup_type.cvrg_column $qv_cutoff $cvrg_cutoff $snps_only $interval $pileup_type.coord_column $out_file1 $diff $qc_base |
7 #end if# | 10 #end if# |
42 </param> | 45 </param> |
43 <param name="qc_base" label="Print quality and base string?" type="select" help="See &quot;Example 4&quot; below for explanation"> | 46 <param name="qc_base" label="Print quality and base string?" type="select" help="See &quot;Example 4&quot; below for explanation"> |
44 <option value="No">No</option> | 47 <option value="No">No</option> |
45 <option value="Yes" selected="true">Yes</option> | 48 <option value="Yes" selected="true">Yes</option> |
46 </param> | 49 </param> |
47 | |
48 </inputs> | 50 </inputs> |
49 <outputs> | 51 <outputs> |
50 <data format="tabular" name="out_file1"> | 52 <data format="tabular" name="out_file1"> |
51 <change_format> | 53 <change_format> |
52 <when input="interval" value="Yes" format="interval" /> | 54 <when input="interval" value="Yes" format="interval" /> |
91 <param name="input" value="pileup_parser.10col.pileup"/> | 93 <param name="input" value="pileup_parser.10col.pileup"/> |
92 <output name="out_file1" file="pileup_parser.10col.20-3-yes-yes.pileup.out"/> | 94 <output name="out_file1" file="pileup_parser.10col.20-3-yes-yes.pileup.out"/> |
93 <param name="type_select" value="ten"/> | 95 <param name="type_select" value="ten"/> |
94 <param name="qv_cutoff" value="20" /> | 96 <param name="qv_cutoff" value="20" /> |
95 <param name="cvrg_cutoff" value="3" /> | 97 <param name="cvrg_cutoff" value="3" /> |
96 <param name="snps_only" value="Yes"/> | 98 <param name="snps_only" value="Yes"/> |
97 <param name="interval" value="Yes" /> | 99 <param name="interval" value="Yes" /> |
98 <param name="diff" value="No" /> | 100 <param name="diff" value="No" /> |
99 <param name="qc_base" value="Yes" /> | 101 <param name="qc_base" value="Yes" /> |
100 </test> | 102 </test> |
101 <test> | 103 <test> |
109 <param name="coord_column" value="2"/> | 111 <param name="coord_column" value="2"/> |
110 <param name="qv_cutoff" value="20" /> | 112 <param name="qv_cutoff" value="20" /> |
111 <param name="cvrg_cutoff" value="3" /> | 113 <param name="cvrg_cutoff" value="3" /> |
112 <param name="snps_only" value="Yes"/> | 114 <param name="snps_only" value="Yes"/> |
113 <param name="interval" value="Yes" /> | 115 <param name="interval" value="Yes" /> |
114 <param name="diff" value="No" /> | 116 <param name="diff" value="No" /> |
115 <param name="qc_base" value="Yes" /> | 117 <param name="qc_base" value="Yes" /> |
116 </test> | 118 </test> |
117 <test> | 119 <test> |
118 <param name="input" value="pileup_parser.10col.pileup"/> | 120 <param name="input" value="pileup_parser.10col.pileup"/> |
119 <output name="out_file1" file="pileup_parser.10col.20-3-yes-yes-yes-yes.pileup.out"/> | 121 <output name="out_file1" file="pileup_parser.10col.20-3-yes-yes-yes-yes.pileup.out"/> |
120 <param name="type_select" value="manual"/> | 122 <param name="type_select" value="manual"/> |
121 <param name="ref_base_column" value="3"/> | 123 <param name="ref_base_column" value="3"/> |
122 <param name="read_bases_column" value="9"/> | 124 <param name="read_bases_column" value="9"/> |
125 <param name="coord_column" value="2"/> | 127 <param name="coord_column" value="2"/> |
126 <param name="qv_cutoff" value="20" /> | 128 <param name="qv_cutoff" value="20" /> |
127 <param name="cvrg_cutoff" value="3" /> | 129 <param name="cvrg_cutoff" value="3" /> |
128 <param name="snps_only" value="Yes"/> | 130 <param name="snps_only" value="Yes"/> |
129 <param name="interval" value="Yes" /> | 131 <param name="interval" value="Yes" /> |
130 <param name="diff" value="Yes" /> | 132 <param name="diff" value="Yes" /> |
131 <param name="qc_base" value="Yes" /> | 133 <param name="qc_base" value="Yes" /> |
132 </test> | 134 </test> |
133 <test> | 135 <test> |
134 <param name="input" value="pileup_parser.10col.pileup"/> | 136 <param name="input" value="pileup_parser.10col.pileup"/> |
135 <output name="out_file1" file="pileup_parser.10col.20-3-yes-yes-yes-no.pileup.out"/> | 137 <output name="out_file1" file="pileup_parser.10col.20-3-yes-yes-yes-no.pileup.out"/> |
141 <param name="coord_column" value="2"/> | 143 <param name="coord_column" value="2"/> |
142 <param name="qv_cutoff" value="20" /> | 144 <param name="qv_cutoff" value="20" /> |
143 <param name="cvrg_cutoff" value="3" /> | 145 <param name="cvrg_cutoff" value="3" /> |
144 <param name="snps_only" value="Yes"/> | 146 <param name="snps_only" value="Yes"/> |
145 <param name="interval" value="Yes" /> | 147 <param name="interval" value="Yes" /> |
146 <param name="diff" value="Yes" /> | 148 <param name="diff" value="Yes" /> |
147 <param name="qc_base" value="No" /> | 149 <param name="qc_base" value="No" /> |
148 </test> | 150 </test> |
149 | 151 </tests> |
150 | |
151 </tests> | |
152 <help> | 152 <help> |
153 | 153 |
154 **What it does** | 154 **What it does** |
155 | 155 |
156 Allows one to find sequence variants and/or sites covered by a specified number of reads with bases above a set quality threshold. The tool works on six and ten column pileup formats produced with *samtools pileup* command. However, it also allows you to specify columns in the input file manually. The tool assumes the following: | 156 Allows one to find sequence variants and/or sites covered by a specified number of reads with bases above a set quality threshold. The tool works on six and ten column pileup formats produced with *samtools pileup* command. However, it also allows you to specify columns in the input file manually. The tool assumes the following: |
167 .. _SAMTools: http://samtools.sourceforge.net/pileup.shtml | 167 .. _SAMTools: http://samtools.sourceforge.net/pileup.shtml |
168 | 168 |
169 **Six column pileup**:: | 169 **Six column pileup**:: |
170 | 170 |
171 1 2 3 4 5 6 | 171 1 2 3 4 5 6 |
172 --------------------------------- | 172 --------------------------------- |
173 chrM 412 A 2 ., II | 173 chrM 412 A 2 ., II |
174 chrM 413 G 4 ..t, IIIH | 174 chrM 413 G 4 ..t, IIIH |
175 chrM 414 C 4 ..Ta III2 | 175 chrM 414 C 4 ..Ta III2 |
176 chrM 415 C 4 TTTt III7 | 176 chrM 415 C 4 TTTt III7 |
177 | 177 |
178 where:: | 178 where:: |
179 | 179 |
180 Column Definition | 180 Column Definition |
181 ------- ---------------------------- | 181 ------- ---------------------------- |
182 1 Chromosome | 182 1 Chromosome |
183 2 Position (1-based) | 183 2 Position (1-based) |
184 3 Reference base at that position | 184 3 Reference base at that position |
185 4 Coverage (# reads aligning over that position) | 185 4 Coverage (# reads aligning over that position) |
186 5 Bases within reads | 186 5 Bases within reads |
187 6 Quality values (phred33 scale, see Galaxy wiki for more) | 187 6 Quality values (phred33 scale, see Galaxy wiki for more) |
188 | 188 |
189 **Ten column pileup** | 189 **Ten column pileup** |
190 | 190 |
191 The `ten-column`__ pileup incorporates additional consensus information generated with the *-c* option of the *samtools pileup* command:: | 191 The `ten-column`__ pileup incorporates additional consensus information generated with the *-c* option of the *samtools pileup* command:: |
192 | 192 |
193 | 193 |
226 | 226 |
227 - Number of **A** variants | 227 - Number of **A** variants |
228 - Number of **C** variants | 228 - Number of **C** variants |
229 - Number of **G** variants | 229 - Number of **G** variants |
230 - Number of **T** variants | 230 - Number of **T** variants |
231 - Number of read bases covering this position, where quality is equal to or higher than the value set by **Do not consider read bases with quality lower than** option. | 231 - Number of read bases covering this position, where quality is equal to or higher than the value set by **Do not consider read bases with quality lower than** option. |
232 | 232 |
233 Optionally, if **Print total number of differences?** is set to **Yes**, the tool will append the sixth column with the total number of deviants (see below). | 233 Optionally, if **Print total number of differences?** is set to **Yes**, the tool will append the sixth column with the total number of deviants (see below). |
234 | 234 |
235 2. If **Convert coordinates to intervals?** is set to **Yes**, the tool replaces the position column (typically the second column) with a pair of tab-delimited start/end values. | 235 2. If **Convert coordinates to intervals?** is set to **Yes**, the tool replaces the position column (typically the second column) with a pair of tab-delimited start/end values. |
236 | 236 |
244 you will get:: | 244 you will get:: |
245 | 245 |
246 chrM 413 G 4 ..t, IIIH 0 0 2 1 3 | 246 chrM 413 G 4 ..t, IIIH 0 0 2 1 3 |
247 chrM 414 C 4 ..Ta III2 1 1 0 1 3 | 247 chrM 414 C 4 ..Ta III2 1 1 0 1 3 |
248 chrM 415 C 4 TTTt III7 0 0 0 4 4 | 248 chrM 415 C 4 TTTt III7 0 0 0 4 4 |
249 | 249 |
250 where:: | 250 where:: |
251 | 251 |
252 Column Definition | 252 Column Definition |
253 ------- ---------------------------- | 253 ------- ---------------------------- |
254 1 Chromosome | 254 1 Chromosome |
262 9 Number of G variants | 262 9 Number of G variants |
263 10 Number of T variants | 263 10 Number of T variants |
264 11 Quality adjusted coverage: | 264 11 Quality adjusted coverage: |
265 12 Number of read bases (i.e., # of reads) with quality above the set threshold | 265 12 Number of read bases (i.e., # of reads) with quality above the set threshold |
266 13 Total number of deviants (if Print total number of differences? is set to yes) | 266 13 Total number of deviants (if Print total number of differences? is set to yes) |
267 | 267 |
268 if **Print total number of differences?** is set to **Yes**, you will get:: | 268 if **Print total number of differences?** is set to **Yes**, you will get:: |
269 | 269 |
270 chrM 413 G 4 ..t, IIIH 0 0 2 1 3 1 | 270 chrM 413 G 4 ..t, IIIH 0 0 2 1 3 1 |
271 chrM 414 C 4 ..Ta III2 1 2 0 1 3 2 | 271 chrM 414 C 4 ..Ta III2 1 2 0 1 3 2 |
272 chrM 415 C 4 TTTt III7 0 0 0 4 4 0 | 272 chrM 415 C 4 TTTt III7 0 0 0 4 4 0 |
273 | 273 |
274 Note the additional column 13, that contains the number of deviant reads (e.g., there are two deviants, T and a, for position 414). | 274 Note the additional column 13, that contains the number of deviant reads (e.g., there are two deviants, T and a, for position 414). |
275 | 275 |
276 | |
277 Finally, if **Convert coordinates to intervals?** is set to **Yes**, you will get one additional column with the end coordinate:: | 276 Finally, if **Convert coordinates to intervals?** is set to **Yes**, you will get one additional column with the end coordinate:: |
278 | 277 |
279 chrM 412 413 G 4 ..t, III2 0 0 2 1 3 | 278 chrM 412 413 G 4 ..t, III2 0 0 2 1 3 |
280 chrM 414 415 C 4 ..Ta III2 1 2 0 1 3 | 279 chrM 414 415 C 4 ..Ta III2 1 2 0 1 3 |
281 chrM 414 415 C 4 TTTt III7 0 0 0 4 4 | 280 chrM 414 415 C 4 TTTt III7 0 0 0 4 4 |
282 | 281 |
283 where:: | 282 where:: |
284 | 283 |
285 Column Definition | 284 Column Definition |
286 ------- ---------------------------- | 285 ------- ---------------------------- |
287 1 Chromosome | 286 1 Chromosome |
297 11 Number of T variants | 296 11 Number of T variants |
298 12 Quality adjusted coverage | 297 12 Quality adjusted coverage |
299 13 Total number of deviants (if Print total number of differences? is set to yes) | 298 13 Total number of deviants (if Print total number of differences? is set to yes) |
300 | 299 |
301 | 300 |
302 Note that in this case the coordinates of SNPs were converted to intervals, where the start coordinate is 0-based and the end coordinate is 1-based using the UCSC Table Browser convention. | 301 Note that in this case the coordinates of SNPs were converted to intervals, where the start coordinate is 0-based and the end coordinate is 1-based using the UCSC Table Browser convention. |
303 | 302 |
304 Although three positions have variants in the original file (413, 414, and 415), only 413 and 415 are reported because the quality values associated with these two SNPs are above the threshold of 20. In the case of 414 the **a** allele has a quality value of 17 ( ord("2")-33 ), and is therefore not reported. Note that five columns have been added to each of the reported lines:: | 303 Although three positions have variants in the original file (413, 414, and 415), only 413 and 415 are reported because the quality values associated with these two SNPs are above the threshold of 20. In the case of 414 the **a** allele has a quality value of 17 ( ord("2")-33 ), and is therefore not reported. Note that five columns have been added to each of the reported lines:: |
305 | 304 |
306 chrM 413 G 4 ..t, IIIH 0 0 2 1 3 | 305 chrM 413 G 4 ..t, IIIH 0 0 2 1 3 |
307 | 306 |
308 Here, there is one variant, **t**. Because the fourth column represents **T** counts, it is incremented by 1. The last column shows that at this position, three reads have bases above the quality threshold of 20. | 307 Here, there is one variant, **t**. Because the fourth column represents **T** counts, it is incremented by 1. The last column shows that at this position, three reads have bases above the quality threshold of 20. |
309 | 308 |
310 ----- | 309 ----- |
311 | 310 |
312 **Example 1**: Just variants | 311 **Example 1**: Just variants |
315 | 314 |
316 chrM 412 A 2 ., II | 315 chrM 412 A 2 ., II |
317 chrM 413 G 4 ..t, III2 | 316 chrM 413 G 4 ..t, III2 |
318 chrM 414 C 4 ..Ta III2 | 317 chrM 414 C 4 ..Ta III2 |
319 chrM 415 C 4 TTTt III7 | 318 chrM 415 C 4 TTTt III7 |
320 | 319 |
321 To call all variants (with no restriction by coverage) with quality above phred value of 20, we will need to set the parameters as follows: | 320 To call all variants (with no restriction by coverage) with quality above phred value of 20, we will need to set the parameters as follows: |
322 | 321 |
323 .. image:: pileup_parser_help1.png | 322 .. image:: pileup_parser_help1.png |
324 | 323 |
325 Running the tool with these parameters will return:: | 324 Running the tool with these parameters will return:: |
326 | 325 |
327 chrM 413 G 4 ..t, IIIH 0 0 0 1 3 | 326 chrM 413 G 4 ..t, IIIH 0 0 0 1 3 |
328 chrM 414 C 4 ..Ta III2 0 2 0 1 3 | 327 chrM 414 C 4 ..Ta III2 0 2 0 1 3 |
329 chrM 415 C 4 TTTt III7 0 0 0 4 4 | 328 chrM 415 C 4 TTTt III7 0 0 0 4 4 |
330 | 329 |
331 **Note** that position 414 is not reported because the *a* variant has associated quality value of 17 (because ord('2')-33 = 17) and is below the phred threshold of 20 set by the **Count variants with quality above this value** parameter. | 330 **Note** that position 414 is not reported because the *a* variant has associated quality value of 17 (because ord('2')-33 = 17) and is below the phred threshold of 20 set by the **Count variants with quality above this value** parameter. |
332 | 331 |
333 ----- | 332 ----- |
334 | 333 |
335 **Example 2**: Report everything | 334 **Example 2**: Report everything |
336 | 335 |
337 In addition to calling variants, it is often useful to know the quality adjusted coverage. Running the tool with these parameters: | 336 In addition to calling variants, it is often useful to know the quality adjusted coverage. Running the tool with these parameters: |
338 | 337 |
339 .. image:: pileup_parser_help2.png | 338 .. image:: pileup_parser_help2.png |
340 | 339 |
341 will report everything from the original file:: | 340 will report everything from the original file:: |
342 | 341 |
343 chrM 412 A 2 ., II 2 0 0 0 2 | 342 chrM 412 A 2 ., II 2 0 0 0 2 |
344 chrM 413 G 4 ..t, III2 0 0 2 1 3 | 343 chrM 413 G 4 ..t, III2 0 0 2 1 3 |
345 chrM 414 C 4 ..Ta III2 0 2 0 1 3 | 344 chrM 414 C 4 ..Ta III2 0 2 0 1 3 |
346 chrM 415 C 4 TTTt III7 0 0 0 4 4 | 345 chrM 415 C 4 TTTt III7 0 0 0 4 4 |
347 | 346 |
348 Here, you can see that although the total coverage at position 414 is 4 (column 4), the quality adjusted coverage is 3 (last column). This is because only three out of four reads have bases with quality above the set threshold of 20 (the actual qualities are III2 or, after conversion, 40, 40, 40, 17). | 347 Here, you can see that although the total coverage at position 414 is 4 (column 4), the quality adjusted coverage is 3 (last column). This is because only three out of four reads have bases with quality above the set threshold of 20 (the actual qualities are III2 or, after conversion, 40, 40, 40, 17). |
349 | 348 |
350 One can use the last column of this dataset to filter out (using Galaxy's **Filter** tool) positions where quality adjusted coverage (last column) is below a set threshold. | 349 One can use the last column of this dataset to filter out (using Galaxy's **Filter** tool) positions where quality adjusted coverage (last column) is below a set threshold. |
351 | 350 |
352 ------ | 351 ------ |
361 | 360 |
362 chrM 412 A 2 ., II 2 0 0 0 2 0 | 361 chrM 412 A 2 ., II 2 0 0 0 2 0 |
363 chrM 413 G 4 ..t, III2 0 0 2 1 3 1 | 362 chrM 413 G 4 ..t, III2 0 0 2 1 3 1 |
364 chrM 414 C 4 ..Ta III2 0 2 0 1 3 1 | 363 chrM 414 C 4 ..Ta III2 0 2 0 1 3 1 |
365 chrM 415 C 4 TTTt III7 0 0 0 4 4 0 | 364 chrM 415 C 4 TTTt III7 0 0 0 4 4 0 |
366 | 365 |
367 | |
368 ----- | 366 ----- |
369 | 367 |
370 **Example 4**: Report everything, print total number of differences, and ignore qualities and read bases | 368 **Example 4**: Report everything, print total number of differences, and ignore qualities and read bases |
371 | 369 |
372 Setting **Print quality and base string?** to **No** as shown here: | 370 Setting **Print quality and base string?** to **No** as shown here: |
377 | 375 |
378 chrM 412 A 2 2 0 0 0 2 0 | 376 chrM 412 A 2 2 0 0 0 2 0 |
379 chrM 413 G 4 0 0 2 1 3 1 | 377 chrM 413 G 4 0 0 2 1 3 1 |
380 chrM 414 C 4 0 2 0 1 3 1 | 378 chrM 414 C 4 0 2 0 1 3 1 |
381 chrM 415 C 4 0 0 0 4 4 0 | 379 chrM 415 C 4 0 0 0 4 4 0 |
382 | |
383 | |
384 | |
385 | |
386 </help> | 380 </help> |
381 <citations> | |
382 </citations> | |
387 </tool> | 383 </tool> |