comparison gemini_annotate.xml @ 6:0c8f7322f8fc draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/gemini commit 29750462f6e725177dc5617c14ba28bcaeed9794
author iuc
date Fri, 18 Jan 2019 19:34:41 -0500
parents 08f57a8502e4
children 567837ca5f33
comparison
equal deleted inserted replaced
5:08f57a8502e4 6:0c8f7322f8fc
1 <tool id="gemini_@BINARY@" name="GEMINI @BINARY@" version="@VERSION@"> 1 <tool id="gemini_@BINARY@" name="GEMINI @BINARY@" version="@VERSION@+galaxy1">
2 <description>adding your own custom annotations</description> 2 <description>the variants in an existing GEMINI database with additional information</description>
3 <macros> 3 <macros>
4 <import>gemini_macros.xml</import> 4 <import>gemini_macros.xml</import>
5 <token name="@BINARY@">annotate</token> 5 <token name="@BINARY@">annotate</token>
6 <xml name="add_as">
7 <param argument="-c" name="column_name" type="text" value=""
8 label="Database column name to use for recording annotations"
9 help="A column with the name provided here will be added to the variants table of the GEMINI database to store the annotations">
10 <validator type="empty_field" />
11 <validator type="expression" message="Only alphanumeric characters and the underscore can be used in column names">value.replace('_', '').isalnum()</validator>
12 </param>
13 </xml>
6 </macros> 14 </macros>
7 <expand macro="requirements" /> 15 <expand macro="requirements" />
8 <expand macro="stdio" /> 16 <expand macro="stdio" />
9 <expand macro="version_command" /> 17 <expand macro="version_command" />
10 <command> 18 <command>
16 tabix -p '$annotate_source.ext' '$tabixed_file' && 24 tabix -p '$annotate_source.ext' '$tabixed_file' &&
17 cp '$infile' '$outfile' && 25 cp '$infile' '$outfile' &&
18 26
19 gemini @BINARY@ 27 gemini @BINARY@
20 -f '$tabixed_file' 28 -f '$tabixed_file'
21 -c '$column_name'
22 -a $a.a_selector 29 -a $a.a_selector
23 #if $a.a_selector == 'extract': 30 #if $a.a_selector == 'extract':
24 -t $a.column_type 31 #set $types = []
25 -e '$a.column_extracts' 32 #set $elements = []
26 -o $a.operation 33 #set $column_names = []
34 #set $operations = []
35 #for $action in $a.actions:
36 $types.append(str($action.column_type))
37 $elements.append(str($action.element_to_extract).strip())
38 $column_names.append(str($action.column_name).strip())
39 $operations.append(str($action.operation))
40 #end for
41
42 -t #echo ",".join($types)
43 -e '#echo ",".join($elements)#'
44 -o #echo ",".join($operations)
45 -c '#echo ",".join($column_names)#'
46 #else:
47 -c '${a.column_name}'
27 #end if 48 #end if
28 $region_only 49 #if str($annotate_source.ext) == "vcf":
50 $region_only
51 #end if
29 '$outfile' 52 '$outfile'
30 ]]> 53 ]]>
31
32 </command> 54 </command>
33 <inputs> 55 <inputs>
34 <expand macro="infile" /> 56 <expand macro="infile" />
35 <param name="annotate_source" type="data" format="vcf,bed" label="File containing the annotations in BED/VCF format" help="(-f)"/> 57 <param argument="-f" name="annotate_source" type="data" format="vcf,bed"
36 58 label="Dataset to use as the annotation source"
37 <param name="column_name" type="text" value="" 59 help="The tool can use the information from a BED or VCF dataset to annotate the database variants."/>
38 label="The name of the column to be added to the variant table" 60 <param argument="--region-only" name="region_only" type="boolean" checked="true" truevalue="" falsevalue="--region-only"
39 help="If the input file is a VCF, then this is the name of the info field to pull. (-c)"> 61 label="Strict variant-identity matching of database and annotation records (VCF format only)"
40 <sanitizer invalid_char=" "> 62 help="The default is to consider VCF-formatted annotations only if a variant in the GEMINI database and a record in the annotation source describe the exact same nucleotide change at the same position in the genome. You can disable this option to make use of any annotation that overlaps with the position of a database variant. This setting is ignored for annotation sources in BED format, for which matching is always based on overlapping positions only." />
41 <valid initial="string.letters,string.digits">
42 <add value="_" />
43 </valid>
44 </sanitizer>
45 </param>
46 <conditional name="a"> 63 <conditional name="a">
47 <param name="a_selector" type="select" label="How should the annotation file be used?" help="(-a)"> 64 <param argument="-a" name="a_selector" type="select"
48 <option value="boolean">Did a variant overlap a region or not? (boolean)</option> 65 label="Type of information to add to the database variants"
49 <option value="count">How many regions did a variant overlap? (count)</option> 66 help="">
50 <option value="extract" selected="True">Extract specific values from a BED/VCF file. (extract)</option> 67 <option value="boolean">Binary indicator (1=found, 0=not found) of whether the variant had any match in the annotation source (boolean)</option>
68 <option value="count">Count of the number of matches found in the annotation source for the database variant (count)</option>
69 <option value="extract" selected="True">Specific values extracted from matching records in the annotation source (extract)</option>
51 </param> 70 </param>
52 <when value="extract"> 71 <when value="extract">
53 72 <repeat name="actions" title="Annotation extraction recipe" default="1" min="1">
54 <param name="column_extracts" label="Column to extract information from for list annotations. For BED files, this is the column number. For VCF files, this is the name of the INFO field." 73 <param argument="-e" name="element_to_extract" type="text" value=""
55 type="text" force_select="true" help="(-e)"/> 74 label="Elements to extract from the annotation source"
56 75 help="For an annotation source in BED format, specify the number of the column from which the annotations should be read. For a VCF source, name an INFO field element.">
57 76 <validator type="expression" message="This field cannot be empty">value.strip()</validator>
58 <param name="column_type" type="select" label="What data type(s) should be used to represent the new values in the database?" 77 </param>
59 help="(-t)"> 78 <expand macro="add_as" />
60 <option value="float">Decimal precision number (float)</option> 79 <param argument="-t" name="column_type" type="select" display="radio"
61 <option value="integer">Integer number (integer)</option> 80 label="What type of data are you trying to extract?"
62 <option value="text">Text columns such as “valid”, “yes” (text)</option> 81 help="Your selection will determine the data type used to store the new annotations in the database.">
63 </param> 82 <option value="float">Numbers with decimal precision</option>
64 83 <option value="integer">Integer numbers</option>
65 <param name="operation" type="select" label="Operation to apply to the extract column values ..." 84 <option value="text">Text (text)</option>
66 help="in the event that a variant overlaps multiple annotations in your annotation file. (-o)"> 85 </param>
67 <option value="mean">Compute the average of the (numeric) values</option> 86 <param argument="-o" name="operation" type="select"
68 <option value="sum">Compute the sum of the (numeric) values</option> 87 label="If multiple annotations are found for the same variant, store ..."
69 <option value="median">Compute the median of the (numeric) values</option> 88 help="Note: If indicated (in parentheses) an option is only applicable to annotations of a specific type.">
70 <option value="min">Compute the minimum of the (numeric) values</option> 89 <option value="first">the first annotation found</option>
71 <option value="max">Compute the maximum of the (numeric) values</option> 90 <option value="last">the last annotation found</option>
72 <option value="mode">Compute the maximum of the (numeric) values</option> 91 <option value="list">a comma-separated list of the (text) values</option>
73 <option value="first">Use the value from the first record in the annotation file</option> 92 <option value="uniq_list">a comma-separated list of non-redundant (text) values</option>
74 <option value="last">Use the value from the last record in the annotation file</option> 93 <option value="min">the smallest of the (numeric) values</option>
75 <option value="list">Create a comma-separated list of the observed (text) values</option> 94 <option value="max">the largest of the (numeric) values</option>
76 <option value="uniq_list">Create a comma-separated list of non-redundant observed (text) values</option> 95 <option value="mode">the most frequent of the (numeric) values</option>
77 </param> 96 <option value="mean">the mean of the (numeric) values</option>
78 97 <option value="median">the median of the (numeric) values</option>
98 <option value="sum">the sum of the (numeric) values</option>
99 </param>
100 </repeat>
79 </when> 101 </when>
80 <when value="boolean"/> 102 <when value="boolean">
81 <when value="count"/> 103 <expand macro="add_as" />
104 </when>
105 <when value="count">
106 <expand macro="add_as" />
107 </when>
82 </conditional> 108 </conditional>
83 <param name="region_only" argument="--region-only" type="boolean" checked="false"
84 truevalue="--region-only" falsevalue=""
85 label="If set, only region coordinates will be considered when annotating variants."
86 help="The default is to annotate using region coordinates as well as REF and ALT
87 variant values. This option is only valid if annotation is a VCF file"/>
88 </inputs> 109 </inputs>
89 <outputs> 110 <outputs>
90 <data name="outfile" format="gemini.sqlite" /> 111 <data name="outfile" format="gemini.sqlite" />
91 </outputs> 112 </outputs>
92 <tests> 113 <tests>
93 <test> 114 <test>
94 <param name="infile" value="gemini_load_result1.db" ftype="gemini.sqlite" /> 115 <param name="infile" value="gemini_load_result1.db" ftype="gemini.sqlite" />
95 <param name="annotate_source" value="anno.bed" /> 116 <param name="annotate_source" value="anno.bed" />
96 <param name="a_selector" value="count" /> 117 <param name="region_only" value="false" />
97 <param name="column_name" value="anno5" /> 118 <conditional name="a">
119 <param name="a_selector" value="count" />
120 <param name="column_name" value="anno5" />
121 </conditional>
122 <assert_command>
123 <not_has_text text="--region-only" />
124 </assert_command>
98 <output name="outfile" file="gemini_annotate_result.db" ftype="gemini.sqlite" compare="sim_size" delta="1000"/> 125 <output name="outfile" file="gemini_annotate_result.db" ftype="gemini.sqlite" compare="sim_size" delta="1000"/>
126 </test>
127 <test>
128 <param name="infile" value="gemini_load_result1.db" ftype="gemini.sqlite" />
129 <param name="annotate_source" value="anno.bed" />
130 <conditional name="a">
131 <param name="a_selector" value="extract" />
132 <repeat name="actions">
133 <param name="element_to_extract" value="1" />
134 <param name="column_name" value="anno5" />
135 <param name="column_type" value="text" />
136 <param name="operation" value="first" />
137 </repeat>
138 </conditional>
139 <assert_command>
140 <has_text_matching expression="-a +extract" />
141 <has_text_matching expression="-e +1" />
142 <has_text_matching expression="-c +anno5" />
143 <has_text_matching expression="-t +text" />
144 <has_text_matching expression="-o +first" />
145 </assert_command>
146 </test>
147 <test>
148 <param name="infile" value="gemini_load_result1.db" ftype="gemini.sqlite" />
149 <param name="annotate_source" value="anno.bed" />
150 <conditional name="a">
151 <param name="a_selector" value="extract" />
152 <repeat name="actions">
153 <param name="element_to_extract" value="1" />
154 <param name="column_name" value="anno5" />
155 <param name="column_type" value="text" />
156 <param name="operation" value="first" />
157 </repeat>
158 <repeat name="actions">
159 <param name="element_to_extract" value="2" />
160 <param name="column_name" value="anno6" />
161 <param name="column_type" value="integer" />
162 <param name="operation" value="first" />
163 </repeat>
164 </conditional>
165 <assert_command>
166 <has_text_matching expression="-a +extract" />
167 <has_text_matching expression="-e +1,2" />
168 <has_text_matching expression="-c +anno5,anno6" />
169 <has_text_matching expression="-t +text,integer" />
170 <has_text_matching expression="-o +first,first" />
171 </assert_command>
99 </test> 172 </test>
100 </tests> 173 </tests>
101 <help><![CDATA[ 174 <help><![CDATA[
102 **What it does** 175 **What it does**
103 176
104 It is inevitable that researchers will want to enhance the GEMINI framework with their own, custom annotations. GEMINI provides a sub-command called annotate for exactly this purpose. 177 Given an existing GEMINI database and an annotation source in BED or VCF
105 178 format, the annotate tool will, for each variant in the variants table of the
106 **Details** 179 database, screen for overlapping regions defined in the annotation source and
107 180 update one or more new columns of the variant record in the database based on
108 It is inevitable that researchers will want to enhance the GEMINI framework with their own, custom annotations. GEMINI provides a sub-command called annotate for exactly this purpose. As long as you provide a tabix‘ed annotation file in BED or VCF format, the annotate tool will, for each variant in the variants table, screen for overlaps in your annotation file and update a one or more new column in the variants table that you may specify on the command line. This is best illustrated by a following **example**. 181 the result and the annotation found.
109
110 **Input files**
111
112 Let’s assume you have already created a GEMINI database of a **VCF file** using the *load module*.
113
114 Now, let’s imagine you have an annotated file in **BED format** (important.bed) that describes regions of the genome that are particularly relevant to your lab’s research. You would like to annotate in the GEMINI database which variants overlap these crucial regions. We want to store this knowledge in a new column in the variants table called important_variant that tracks whether a given variant overlapped (1) or did not overlap (0) intervals in your annotation file.
115
116 *To do this, you must first TABIX your BED file*
117
118 **-a boolean - Did a variant overlap a region or not?**
119
120 Now, you can use this *TABIX*’ed file to annotate which variants overlap your important regions. In the example below, the results will be stored in a new column called “important”. The **-a boolean** option says that you just want to track whether (1) or not (0) the variant overlapped one or more of your regions.
121
122 Since a new column has been created in the database, we can now directly query the new column. In the example results below, the first and third variants overlapped a crucial region while the second did not::
123
124 chr22 100 101 1 1
125 chr22 200 201 2 0
126 chr22 300 500 3 1
127
128 **-a count - How many regions did a variant overlap?**
129
130 Instead of a simple yes or no, we can use the **-a count** option to count how many important regions a variant overlapped. It turns out that the 3rd variant actually overlapped two important regions::
131
132 chr22 100 101 1 1
133 chr22 200 201 2 0
134 chr22 300 500 3 2
135
136 **-a extract - Extract specific values from a BED file**
137
138 Lastly, we may also extract values from specific fields in a BED file (or from the INFO field in a VCF) and populate one or more new columns in the database based on overlaps with the annotation file and the values of the fields therein. To do this, we use the **-a extract** option.
139
140 This is best described with an example. To set this up, let’s imagine that we have a VCF file from a different experiment and we want to annotate the variants in our GEMINI database with the allele frequency and depth tags from the INFO fields for the same variants in this other VCF file.
141
142 Now that we have a proper *TABIX*’ed VCF file, we can use the **-a extract** option to populate new columns in the GEMINI database. In order to do so, we must specify:
143
144 1) its type (e.g., text, integer, float) (**-t**)
145 2) the field in the INFO column of the VCF file that we should use to extract data with which to populate the new column (**-e**)
146 3) what operation should be used to summarize the data in the event of multiple overlaps in the annotation file (**-o**)
147 4) (optionally) the name of the column we want to add (**-c**), if this is not specified, it will use the value from **-e**.
148
149 For example, let’s imagine we want to create a new column called “other_allele_freq” (**-c**) using the AF field in our VCF file to populate it.
150
151 This creates a new column in my.db called other_allele_freq and this new column will be a FLOAT (**-t float**). In the event of multiple records in the VCF file overlapping a variant in the database, the average (**-o mean**) of the allele frequency values from the VCF file will be used.
152
153 At this point, one can query the database based on the values of the new other_allele_freq column (using **GEMINI query**).
154
155 **-t TYPE - Specifying the column type(s) when using -a extract**
156
157 The annotate tool will create three different types of columns via the **-t** option:
158
159 1) Floating point columns for annotations with decimal precision as above (-t float)
160 2) Integer columns for integral annotations (-t integer)
161 3) Text columns for string columns such as “valid”, “yes”, etc. (-t text)
162
163 *The -t option is only valid when using the -a extract option.*
164
165 **-o OPERATION - Specifying the summary operations when using -a extract**
166
167 In the event of multiple overlaps between a variant and records in the annotation file, the annotate tool can summarize the values observed with multiple options:
168
169 - -o mean Compute the average of the values. They must be numeric.
170 - -o median Compute the median of the values. They must be numeric.
171 - -o min Compute the minimum of the values. They must be numeric.
172 - -o max Compute the maximum of the values. They must be numeric.
173 - -o mode Compute the most frequent of the values. They must be numeric.
174 - -o first Use the value from the first record in the annotation file.
175 - -o last Use the value from the last record in the annotation file.
176 - -o list Create a comma-separated list of the observed values.
177 - -o uniq_list Create a comma-separated list of the distinct observed values.
178 - -o sum Compute the sum of the values. They must be numeric.
179
180 The -o option is only valid when using the -a extract option.
181
182 **Annotating with VCF**
183
184 Most of the examples to this point have pulled a column from a tabix-indexed BED file. It is likewise possible to pull from the INFO field of a tabix-indexed VCF. The syntax is identical, but the **-e** option will specify the names of fields in the INFO column to pull. By default, those names will also be used as the names of the new columns, but different column names can still be specified with the **-c** option.
185
186 To put a DP column in the db, set:
187
188 -o list, -e DP, -t integer
189
190 ... and name it 'depth', set:
191
192 -o list, -e DP, -c depth, -t integer
193
194
195 Missing values are allowed since we expect that in some cases an annotation VCF will not have all INFO fields specified for all variants.
196
197 *We recommend decomposing and normalizing variants before annotating. For a detailed explanation of how to do this, see "Step 1. split, left-align, and trim variants" on the GEMINI* preprocessing_ *website.*
198
199 **Extracting and populating multiple columns at once**
200
201 One can also extract and populate multiple columns at once by providing comma-separated lists (no spaces) of column names (**-c**), types (**-t**), numbers (**-e**), and summary operations (**-o**). For example, recall that in the VCF example above, we created a *TABIX*’ed BED file containing the allele frequency and depth values from the INFO field as the 4th and 5th columns in the BED, respectively.
202
203 Instead of running the annotate tool twice (once for each column), we can run the tool once and load both columns in the same run. For example with settings:
204
205 - -a extract
206 - -c other_allele_freq,other_depth
207 - -t float,integer
208 - -e 4,5
209 - -o mean,max
210
211 We can then use each of the new columns to filter variants with a *GEMINI query*:
212
213 .. _preprocessing: https://gemini.readthedocs.org/en/latest/content/preprocessing.html#preprocess
214
215 ]]></help> 182 ]]></help>
216 <expand macro="citations"/> 183 <expand macro="citations"/>
217 </tool> 184 </tool>