comparison snpSift_extractFields.xml @ 1:98708b88af9f draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tool_collections/snpsift/snpsift commit 21b46ae2c90ba7e569b2b3a9eaf938f8dedb2c31
author iuc
date Tue, 07 Jun 2016 10:04:09 -0400
parents
children bf8c1526871b
comparison
equal deleted inserted replaced
0:9e8280e19338 1:98708b88af9f
1 <tool id="snpSift_extractFields" name="SnpSift Extract Fields" version="@WRAPPER_VERSION@.0">
2 <options sanitize="False" />
3 <description>from a VCF file into a tabular file</description>
4 <macros>
5 <import>snpSift_macros.xml</import>
6 </macros>
7 <expand macro="requirements" />
8 <expand macro="stdio" />
9 <expand macro="version_command" />
<!-- Builds the shell pipeline for the job: cat the input VCF, optionally pipe it through
     vcfEffOnePerLine.pl (only when one_effect_per_line is set), then into SnpSift extractFields.
     The -s and -e flags are emitted only when separator / empty_text are non-empty.
     The lone "-" is the input-file argument (presumably stdin, since the data arrives on the pipe).
     Each whitespace-separated token of $extract is wrapped in double quotes and passed as its own
     argument, and the result is redirected into $output. -->
10 <command><![CDATA[
11 cat "$input"
12 #if $one_effect_per_line:
13 | "\$SNPEFF_JAR_PATH/scripts/vcfEffOnePerLine.pl"
14 #end if
15 | java -Xmx6G -jar "\$SNPEFF_JAR_PATH/SnpSift.jar" extractFields
16 #if $separator:
17 -s '$separator'
18 #end if
19 #if $empty_text:
20 -e '$empty_text'
21 #end if
22 -
23 #echo ' '.join(['"%s"' % x for x in $extract.split()])
24 > "$output"
25 ]]>
26 </command>
27 <inputs>
<!-- User-facing parameters: the VCF dataset to read, the whitespace-separated list of fields to
     extract, a switch for one-effect-per-line output, and two optional text values (the separator
     used for multi-valued fields and the placeholder written for empty fields). The optional text
     values default to empty, in which case the command template omits the corresponding flag. -->
28 <param format="vcf" name="input" type="data" label="Variant input file in VCF format"/>
29 <param name="extract" type="text" label="Extract" help="Need help? See below a few examples." />
30 <param name="one_effect_per_line" type="boolean" truevalue="yes" falsevalue="no" checked="false" label="One effect per line" help="When variants have more than one effect, lists one effect per line, while all other parameters in the line are repeated across multiple lines" />
31 <param name="separator" type="text" value="" optional="true" label="multiple field separator" help="Separate multiple fields in one column with this character, e.g. a comma, rather than a column for each of the multiple values">
32 </param>
33 <param name="empty_text" type="text" value="" optional="true" label="empty field text" help="Represent empty fields with this value, rather than leaving them blank" >
34 </param>
35 </inputs>
36 <outputs>
<!-- Single tabular dataset; the command template redirects the pipeline's stdout into it. -->
37 <data name="output" format="tabular" />
38 </outputs>
39 <tests>
<!-- Case 1: no separator given. The output must mention INTRAGENIC but must NOT contain the
     comma-joined effect list, i.e. multiple effects are not merged into one comma-separated value. -->
40 <test>
41 <param name="input" ftype="vcf" value="test_rmInfo.vcf"/>
42 <param name="extract" value="CHROM POS REF ALT EFF[*].EFFECT"/>
43 <output name="output">
44 <assert_contents>
45 <has_text text="INTRAGENIC" />
46 <not_has_text text="DOWNSTREAM,INTRAGENIC,INTRON,UTR_3_PRIME" />
47 </assert_contents>
48 </output>
49 </test>
50
<!-- Case 2: same input and fields, but separator="," is supplied. The comma-joined effect list
     is now expected to appear in the output. -->
51 <test>
52 <param name="input" ftype="vcf" value="test_rmInfo.vcf"/>
53 <param name="extract" value="CHROM POS REF ALT EFF[*].EFFECT"/>
54 <param name="separator" value=","/>
55 <output name="output">
56 <assert_contents>
57 <has_text text="DOWNSTREAM,INTRAGENIC,INTRON,UTR_3_PRIME" />
58 </assert_contents>
59 </output>
60 </test>
61
62 </tests>
63 <help><![CDATA[
64
65 **SnpSift Extract Fields**
66
67 Extract fields from a VCF file to a TXT, tab separated format, that you can easily load in R, XLS, etc.
68
69 http://snpeff.sourceforge.net/SnpSift.html#Extract
70
71 You can also use sub-fields and genotype fields / sub-fields such as:
72
73 ::
74
75 Standard VCF fields:
76 CHROM
77 POS
78 ID
79 REF
80 ALT
81 FILTER
82 INFO fields:
83 AF
84 AC
85 DP
86 MQ
87 etc. (any info field available)
88 SnpEff 'ANN' fields:
89 "ANN[*].ALLELE" (alias GENOTYPE)
90 "ANN[*].EFFECT" (alias ANNOTATION): Effect in Sequence ontology terms (e.g. 'missense_variant', 'synonymous_variant', 'stop_gained', etc.)
91 "ANN[*].IMPACT" { HIGH, MODERATE, LOW, MODIFIER }
92 "ANN[*].GENE" Gene name (e.g. 'PSD3')
93 "ANN[*].GENEID" Gene ID
94 "ANN[*].FEATURE"
95 "ANN[*].FEATUREID" (alias TRID: Transcript ID)
96 "ANN[*].BIOTYPE" Biotype, as described by the annotations (e.g. 'protein_coding')
97 "ANN[*].RANK" Exon or Intron rank (i.e. exon number in a transcript)
98 "ANN[*].HGVS_C" (alias HGVS_DNA, CODON): Variant in HGVS (DNA) notation
99 "ANN[*].HGVS_P" (alias HGVS, HGVS_PROT, AA): Variant in HGVS (protein) notation
100 "ANN[*].CDNA_POS" (alias POS_CDNA)
101 "ANN[*].CDNA_LEN" (alias LEN_CDNA)
102 "ANN[*].CDS_POS" (alias POS_CDS)
103 "ANN[*].CDS_LEN" (alias LEN_CDS)
104 "ANN[*].AA_POS" (alias POS_AA)
105 "ANN[*].AA_LEN" (alias LEN_AA)
106 "ANN[*].DISTANCE"
107 "ANN[*].ERRORS" (alias WARNING, INFOS)
108 SnpEff 'EFF' fields (this is for older SnpEff/SnpSift versions; newer versions use the 'ANN' field):
109 "EFF[*].EFFECT"
110 "EFF[*].IMPACT"
111 "EFF[*].FUNCLASS"
112 "EFF[*].CODON"
113 "EFF[*].AA"
114 "EFF[*].AA_LEN"
115 "EFF[*].GENE"
116 "EFF[*].BIOTYPE"
117 "EFF[*].CODING"
118 "EFF[*].TRID"
119 "EFF[*].RANK"
120 SnpEff 'LOF' fields:
121 "LOF[*].GENE"
122 "LOF[*].GENEID"
123 "LOF[*].NUMTR"
124 "LOF[*].PERC"
125 SnpEff 'NMD' fields:
126 "NMD[*].GENE"
127 "NMD[*].GENEID"
128 "NMD[*].NUMTR"
129 "NMD[*].PERC"
130
131
132 Some examples:
133
134 - *Extracting chromosome, position, ID and allele frequency from a VCF file:*
135
136 **CHROM POS ID AF**
137
138 The result will look something like:
139
140 ::
141
142 #CHROM POS ID AF
143 1 69134 0.086
144 1 69496 rs150690004 0.001
145
146
147 - *Extracting genotype fields:*
148
149 **CHROM POS ID THETA GEN[0].GL[1] GEN[1].GL GEN[3].GL[*] GEN[*].GT**
150
151 This means to extract:
152
153 - CHROM POS ID: regular fields (as in the previous example)
154 - THETA : This one is from INFO
155 - GEN[0].GL[1] : Second likelihood from first genotype
156 - GEN[1].GL : The whole GL field (all entries without separating them)
157 - GEN[3].GL[*] : All likelihoods from genotype 3 (this time they will be tab separated, as opposed to the previous one).
158 - GEN[*].GT : Genotype subfields (GT) from ALL samples (tab separated).
159
160 The result will look something like:
161
162 ::
163
164 #CHROM POS ID THETA GEN[0].GL[1] GEN[1].GL GEN[3].GL[*] GEN[*].GT
165 1 10583 rs58108140 0.0046 -0.47 -0.24,-0.44,-1.16 -0.48 -0.48 -0.48 0|0 0|0 0|0 0|1 0|0 0|1 0|0 0|0 0|1
166 1 10611 rs189107123 0.0077 -0.48 -0.24,-0.44,-1.16 -0.48 -0.48 -0.48 0|0 0|1 0|0 0|0 0|0 0|0 0|0 0|0 0|0
167 1 13302 rs180734498 0.0048 -0.58 -2.45,-0.00,-5.00 -0.48 -0.48 -0.48 0|0 0|1 0|0 0|0 0|0 1|0 0|0 0|1 0|0
168
169 - *Extracting fields with multiple values:*
170 (notice that there are multiple effect columns per line because there are multiple effects per variant)
171
172 **CHROM POS REF ALT ANN[*].EFFECT**
173
174 The result will look something like:
175
176 ::
177
178 #CHROM POS REF ALT ANN[*].EFFECT
179 22 17071756 T C 3_prime_UTR_variant downstream_gene_variant
180 22 17072035 C T missense_variant downstream_gene_variant
181 22 17072258 C A missense_variant downstream_gene_variant
182
183 - *Extracting fields with multiple values using a comma as a multiple field separator:*
184
185 **CHROM POS REF ALT ANN[*].EFFECT ANN[*].HGVS_P**
186
187 The result will look something like:
188
189 ::
190
191 #CHROM POS REF ALT ANN[*].EFFECT ANN[*].HGVS_P
192 22 17071756 T C 3_prime_UTR_variant,downstream_gene_variant .,.
193 22 17072035 C T missense_variant,downstream_gene_variant p.Gly469Glu,.
194 22 17072258 C A missense_variant,downstream_gene_variant p.Gly395Cys,.
195
196
197 - *Extracting fields with multiple values, one effect per line:*
198
199 **CHROM POS REF ALT ANN[*].EFFECT**
200
201 The result will look something like:
202
203 ::
204
205 #CHROM POS REF ALT ANN[*].EFFECT
206 22 17071756 T C 3_prime_UTR_variant
207 22 17071756 T C downstream_gene_variant
208 22 17072035 C T missense_variant
209 22 17072035 C T downstream_gene_variant
210 22 17072258 C A missense_variant
211 22 17072258 C A downstream_gene_variant
212
213
214 @EXTERNAL_DOCUMENTATION@
215 http://snpeff.sourceforge.net/SnpSift.html#Extract
216
217 ]]>
218 </help>
219 <expand macro="citations" />
220 </tool>