comparison snpEff_create_db.xml @ 15:479c4f2f4826 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tool_collections/snpeff commit 999eca8a05f17ae567f99b8ca3394f2105491173
author iuc
date Mon, 09 Jul 2018 13:22:58 -0400
parents f0ee2b470481
children 65ae79bddc69
comparison
equal deleted inserted replaced
14:85ca751407c3 15:479c4f2f4826
1 <tool id="snpEff_build_gb" name="SnpEff build:" version="@wrapper_version@.galaxy2"> 1 <tool id="snpEff_build_gb" name="SnpEff build:" version="@wrapper_version@.galaxy3">
2 <description> database from Genbank record</description> 2 <description> database from Genbank or GFF record</description>
3 <macros> 3 <macros>
4 <import>snpEff_macros.xml</import> 4 <import>snpEff_macros.xml</import>
5 </macros> 5 </macros>
6 <requirements> 6 <requirements>
7 <expand macro="requirement" /> 7 <expand macro="requirement" />
12 </requirements> 12 </requirements>
13 <expand macro="stdio" /> 13 <expand macro="stdio" />
14 <expand macro="version_command" /> 14 <expand macro="version_command" />
15 <command><![CDATA[ 15 <command><![CDATA[
16 16
17 #if str( $fasta.fasta_selector ) == "yes": 17 #if str( $input_type.input_type_selector ) == "gb":
18 python3 '$__tool_directory__/gbk2fa.py' '${input_gbk}' '${output_fasta}' 18 #if str( $input_type.fasta ) == "yes":
19 #if $fasta.remove_version: 19 python3 '$__tool_directory__/gbk2fa.py' '${input_type.input_gbk}' '${output_fasta}'
20 '${fasta.remove_version}' 20 #if $input_type.remove_version:
21 '${input_type.remove_version}'
22 #end if
23 &&
21 #end if 24 #end if
22 &&
23 #end if 25 #end if
24 26
25 mkdir -p '${snpeff_output.files_path}'/'${genome_version}' && 27 mkdir -p '${snpeff_output.files_path}'/'${genome_version}' &&
26 28
27 ln -s '${input_gbk}' '${snpeff_output.files_path}'/'${genome_version}'/genes.gbk && 29 #if str( $input_type.input_type_selector ) == "gb":
30 #if $input_type.input_gbk.is_of_type("genbank"):
31 ln -s '${input_type.input_gbk}' '${snpeff_output.files_path}'/'${genome_version}'/genes.gbk &&
32 #elif $input_type.input_gbk.is_of_type("genbank.gz"):
33 ln -s '${input_type.input_gbk}' '${snpeff_output.files_path}'/'${genome_version}'/genes.gbk.gz &&
34 #end if
35 #elif str( $input_type.input_type_selector ) == "gff":
36 #if $input_type.input_fasta.is_of_type("fasta"):
37 ln -s '${input_type.input_fasta}' '${snpeff_output.files_path}'/'${genome_version}'/sequences.fa &&
38 #elif $input_type.input_fasta.is_of_type("fasta.gz"):
39 ln -s '${input_type.input_fasta}' '${snpeff_output.files_path}'/'${genome_version}'/sequences.fa.gz &&
40 #end if
41 ln -s '${input_type.input_gff}' '${snpeff_output.files_path}'/'${genome_version}'/genes.gff &&
42 #end if
28 43
29 snpEff @java_options@ build -v 44 snpEff @java_options@ build -v
30 -configOption '${genome_version}'.genome='${genome_version}' 45 -configOption '${genome_version}'.genome='${genome_version}'
31 -configOption '${genome_version}'.codonTable='${codon_table}' 46 -configOption '${genome_version}'.codonTable='${codon_table}'
32 -genbank -dataDir '${snpeff_output.files_path}' '${genome_version}' && 47 #if str( $input_type.input_type_selector ) == "gb":
48 -genbank
49 #elif str( $input_type.input_type_selector ) == "gff":
50 -gff3
51 #end if
52 -dataDir '${snpeff_output.files_path}' '${genome_version}' &&
33 echo "${genome_version}.genome : ${genome_version}" >> '${snpeff_output.files_path}'/snpEff.config && 53 echo "${genome_version}.genome : ${genome_version}" >> '${snpeff_output.files_path}'/snpEff.config &&
34 echo "${genome_version}.codonTable : ${codon_table}" >> '${snpeff_output.files_path}'/snpEff.config 54 echo "${genome_version}.codonTable : ${codon_table}" >> '${snpeff_output.files_path}'/snpEff.config
35 55
36 ]]></command> 56 ]]></command>
37 <inputs> 57 <inputs>
38 <param name="genome_version" type="text" value="" label="Name for the database" help="for E. coli K12 you may want to use 'EcK12' etc."> 58 <param name="genome_version" type="text" value="" label="Name for the database" help="for E. coli K12 you may want to use 'EcK12' etc.">
39 <validator type="regex" message="A genome version name is required">\S+</validator> 59 <validator type="regex" message="A genome version name is required">\S+</validator>
40 </param> 60 </param>
41 <param name="input_gbk" type="data" format="genbank,genbank.gz" label="Genbank dataset to build database from" help="This Genbank file will be used to generate snpEff database"/> 61 <conditional name="input_type">
62 <param name="input_type_selector" type="select" display="radio" label="Input annotations are in" help="Specify format for annotations you are using to create SnpEff database">
63 <option value="gb" selected="true">GenBank</option>
64 <option value="gff">GFF</option>
65 </param>
66 <when value="gb">
67 <param name="input_gbk" type="data" format="genbank,genbank.gz" label="Genbank dataset to build database from" help="This Genbank file will be used to generate snpEff database"/>
68 <param name="fasta" type="select" display="radio" label="Parse Genbank into Fasta" help="This will generate an additional dataset containing all sequences from Genbank file in FASTA format">
69 <option value="yes" selected="true">Yes</option>
70 <option value="no">No</option>
71 </param>
72 <param type="boolean" name="remove_version" truevalue="--remove_version" falsevalue="" checked="true" label="Remove sequence version label?" help="Genbank sequences have version numbers such as B000564.2. This option removes them leaving only B000564" argument="--remove_version"/>
73 </when>
74 <when value="gff">
75 <param name="input_gff" type="data" format="gff3" label="GFF dataset to build database from" help="This GFF file will be used to generate snpEff database"/>
76 <param name="input_fasta" type="data" format="fasta,fasta.gz" label="Genome in FASTA format" help="This dataset is required for generating SnpEff database. See help section below."/>
77 </when>
78 </conditional>
42 <param name="codon_table" type="select" label="Select genetic code for this sequence" help="If this sequence uses non-standard genetic code, select one from these options"> 79 <param name="codon_table" type="select" label="Select genetic code for this sequence" help="If this sequence uses non-standard genetic code, select one from these options">
43 <option selected="true" value="Standard">Standard</option> 80 <option selected="true" value="Standard">Standard</option>
44 <option value="Vertebrate_Mitochondrial">Vertebrate_Mitochondrial</option> 81 <option value="Vertebrate_Mitochondrial">Vertebrate_Mitochondrial</option>
45 <option value="Yeast_Mitochondrial">Yeast_Mitochondrial</option> 82 <option value="Yeast_Mitochondrial">Yeast_Mitochondrial</option>
46 <option value="Mold_Mitochondrial">Mold_Mitochondrial</option> 83 <option value="Mold_Mitochondrial">Mold_Mitochondrial</option>
64 <option value="Chlorophycean_Mitochondrial">Chlorophycean_Mitochondrial</option> 101 <option value="Chlorophycean_Mitochondrial">Chlorophycean_Mitochondrial</option>
65 <option value="Trematode_Mitochondrial">Trematode_Mitochondrial</option> 102 <option value="Trematode_Mitochondrial">Trematode_Mitochondrial</option>
66 <option value="Scenedesmus_obliquus_Mitochondrial">Scenedesmus_obliquus_Mitochondrial</option> 103 <option value="Scenedesmus_obliquus_Mitochondrial">Scenedesmus_obliquus_Mitochondrial</option>
67 <option value="Thraustochytrium_Mitochondrial">Thraustochytrium_Mitochondrial</option> 104 <option value="Thraustochytrium_Mitochondrial">Thraustochytrium_Mitochondrial</option>
68 </param> 105 </param>
69 <conditional name="fasta">
70 <param name="fasta_selector" type="select" display="radio" label="Parse Genbank into Fasta" help="This will generate an additional dataset containing all sequences from Genbank file in FASTA format">
71 <option value="yes" selected="true">Yes</option>
72 <option value="no">No</option>
73 </param>
74 <when value="yes">
75 <param type="boolean" name="remove_version" truevalue="--remove_version" falsevalue="" checked="true" label="Remove sequence version label?" help="Genbank sequences have vesion numbers such as B000564.2. This option removes them leaving only B000564" argument="--remove_version"/>
76 </when>
77 <when value="no"/>
78 </conditional>
79 </inputs> 106 </inputs>
80 <outputs> 107 <outputs>
81 <data name="snpeff_output" format="snpeffdb" label="@snpeff_version@ database for ${genome_version}"/> 108 <data name="snpeff_output" format="snpeffdb" label="@snpeff_version@ database for ${genome_version}"/>
82 <data name="output_fasta" format="fasta" label="Fasta sequences for ${genome_version}"> 109 <data name="output_fasta" format="fasta" label="Fasta sequences for ${genome_version}">
83 <filter>fasta['fasta_selector'] == 'yes'</filter> 110 <filter>input_type['input_type_selector'] == 'gb'</filter>
111 <filter>input_type['fasta'] == 'yes'</filter>
84 </data> 112 </data>
85 </outputs> 113 </outputs>
86 <tests> 114 <tests>
87 <test> 115 <test>
88 <param name="genome_version" value="pBR322"/> 116 <param name="genome_version" value="pBR322"/>
117 <param name="input_type_selector" value="gb"/>
89 <param name="input_gbk" value="pBR322.gbk" /> 118 <param name="input_gbk" value="pBR322.gbk" />
90 <output name="snpeff_output"> 119 <output name="snpeff_output">
91 <assert_contents> 120 <assert_contents>
92 <has_text text="pBR322" /> 121 <has_text text="pBR322" />
93 </assert_contents> 122 </assert_contents>
94 </output> 123 </output>
95 <output name="output_fasta" value="pBR322.fna"/> 124 <output name="output_fasta" value="pBR322_test1.fna"/>
125 </test>
126 <test>
127 <param name="genome_version" value="pBR322"/>
128 <param name="input_type_selector" value="gb"/>
129 <param name="input_gbk" value="pBR322.gbk.gz" />
130 <output name="snpeff_output">
131 <assert_contents>
132 <has_text text="pBR322" />
133 </assert_contents>
134 </output>
135 <output name="output_fasta" value="pBR322_test1.fna"/>
136 </test>
137 <test>
138 <param name="genome_version" value="pBR322"/>
139 <param name="input_type_selector" value="gff"/>
140 <param name="input_fasta" value="pBR322_test2.fna" />
141 <param name="input_gff" value="pBR322.gff3" />
142 <output name="snpeff_output">
143 <assert_contents>
144 <has_text text="pBR322" />
145 </assert_contents>
146 </output>
147 </test>
148 <test>
149 <param name="genome_version" value="pBR322"/>
150 <param name="input_type_selector" value="gff"/>
151 <param name="input_fasta" value="pBR322_test2.fna.gz" />
152 <param name="input_gff" value="pBR322.gff3" />
153 <output name="snpeff_output">
154 <assert_contents>
155 <has_text text="pBR322" />
156 </assert_contents>
157 </output>
96 </test> 158 </test>
97 </tests> 159 </tests>
98 <help><![CDATA[ 160 <help><![CDATA[
99 **What it does** 161 **What it does**
100 162
101 This tool uses `"snpEff build -genbank"` command to create a snpEff database from a Genbank dataset. If **Parse Genbank into Fasta** is selected (the default behavior) the tool will also convert Genbank dataset into a single FASTA dataset. 163 This tool uses `"snpEff build -genbank"` or `"snpEff build -gff3"` commands to create a snpEff database.
102 164
165 ------
166
167 .. class:: infomark
168
169 **Working with Genbank files**
103 170
104 Using Genbank data for creating databases has several advantages: 171 Using Genbank data for creating databases has several advantages:
105 172
106 #. Genbank files contains annotations (such as locations of genes) together with sequences. This was one ensures that these two are in sync with each other 173 #. Genbank files contain annotations (such as locations of genes) together with sequences. This ensures that these two are in sync with each other.
107 #. When you are analyzing small genomes it is much more convenient to create a database on the fly and use it. 174 #. When you are analyzing small genomes (or not so small) it is much more convenient to create a database on the fly and use it.
175
176 .. class:: warningmark
177
178 SnpEff errors out on highly fragmented genomes containing multiple scaffolds. This is because a single gene may be split between multiple scaffolds, causing SnpEff to crash. If this happens, use the GFF route described below.
108 179
109 ------- 180 -------
110 181
111 .. class:: infomark 182 **Genbank usage scenario**
112
113 **The usage scenario**
114 183
115 Suppose you have a series of Illumina reads from an experiment involving *E. coli* K-12 MG1655. You want to map these reads to the reference genome of K-12 MG1655, call variants, and annotate them using snpEff. This tool enables you to follow the following analysis steps: 184 Suppose you have a series of Illumina reads from an experiment involving *E. coli* K-12 MG1655. You want to map these reads to the reference genome of K-12 MG1655, call variants, and annotate them using snpEff. This tool enables you to follow the following analysis steps:
116 185
117 #. Download genome from `NCBI <https://www.ncbi.nlm.nih.gov>`_ into Galaxy. 186 #. Go to `NCBI <http://www.ncbi.nlm.nih.gov>`_ page for K-12 MG1655 genome (note that all NCBI genomes have similar list of files associated with them).
118 #. Use this tool to generate a snpEff database and FASTA sequences from the file you downloaded at step 1. 187 #. Copy URL for file with extension `gbff.gz`
188 #. Paste the URL into upload tool and set datatype to `genbank.gz`.
189 #. Use this tool to generate a snpEff database and FASTA sequences from the dataset you've uploaded during the previous step.
119 #. Use your Illumina reads to map against FASTA dataset generated in the previous step using BWA-MEM. 190 #. Use your Illumina reads to map against FASTA dataset generated in the previous step using BWA-MEM.
120 #. Call variants using **Freebayes**. 191 #. Call variants using **Freebayes**.
121 #. Annotate vcf output of Freebayes with **SnpEff eff** using database generated at step 2 (using *Custom* option for **Genome source** parameter). 192 #. Annotate vcf output of Freebayes with **SnpEff eff** using database generated at step 4 (using *Custom* option for **Genome source** parameter).
122 193
123 In this scenario Genbank dataset is used twice. First, it is used to produce FASTA sequences that are used by BWA to map against. Second, it is used to create snpEff database. This guarantees that you will not have any issues related to reference sequence naming. 194 In this scenario Genbank dataset is used twice. First, it is used to produce FASTA sequences that are used by BWA to map against. Second, it is used to create snpEff database. This guarantees that you will not have any issues related to reference sequence naming.
195
196 -------
197
198 .. class:: infomark
199
200 **Working with GFF files**
201
202 Alternatively you can create a SnpEff database from `GFF3 <https://en.wikipedia.org/wiki/General_feature_format>`_ files downloaded from NCBI or any other source. Using GFF dataset for building SnpEff database requires two inputs:
203
204 #. The GFF file itself
205 #. A genome in FASTA format
206
207 The GFF file contains coordinates of various features, but does not contain underlying sequences. This is why a FASTA file needs to be provided as well.
208
209 ------
210
211 **GFF usage scenario**
212
213 The following example also uses *E. coli* K-12 MG1655:
214
215 #. Go to `NCBI <http://www.ncbi.nlm.nih.gov>`_ page for K-12 MG1655 genome.
216 #. Copy URLs for files with `gff.gz` and `fna.gz` extensions. The first file contains annotations in GFF3 format. The second file contains entire genome as a FASTA record.
217 #. Paste URLs into upload tool and set datatypes to `gff3` and `fasta.gz` for annotations and genome, respectively.
218 #. Use this tool to generate a snpEff database from the GFF dataset.
219 #. Map your reads against the FASTA dataset and continue as described in the above example.
220
124 221
125 @snpeff_in_galaxy_info@ 222 @snpeff_in_galaxy_info@
126 @external_documentation@ 223 @external_documentation@
127 ]]> 224 ]]>
128 </help> 225 </help>