comparison datamanager_create_reference.xml @ 11:b22f8634ff84 draft

planemo upload for repository https://github.com/jj-umn/galaxytools/tree/master/defuse commit 23b94b5747c6956360cd2eca0a07a669929ea141-dirty
author jjohnson
date Sun, 17 Jan 2016 14:11:06 -0500
parents
children 3a4876d01c7e
comparison
equal deleted inserted replaced
10:f65857c1b92e 11:b22f8634ff84
1 <tool id="data_manager_defuse_reference" name="DeFuse Reference DataManager" version="1.6.1" tool_type="manage_data">
2 <description>create a defuse reference from Ensembl and UCSC sources</description>
3 <requirements> <!-- Packages this tool declares as dependencies. The defuse_script configfile of this tool locates samtools, bowtie, bowtie-build, blat, faToTwoBit, gmap and gmap_setup with `which`. NOTE(review): it also looks up R and Rscript, for which no requirement is declared here; confirm they are supplied by the defuse package or the host. -->
4 <requirement type="package" version="0.6.1">defuse</requirement>
5 <requirement type="package" version="0.1.18">samtools</requirement>
6 <requirement type="package" version="1.0.0">bowtie</requirement>
7 <requirement type="package" version="2013-05-09">gmap</requirement>
8 <requirement type="package" version="latest">kent</requirement>
9 </requirements>
10 <command interpreter="python"> datamanager_create_reference.py
11 --dbkey $genome.ensembl_genome_version
12 --description "$genome.ensembl_prefix $genome.ensembl_genome_version ($genome.ucsc_genome_version)"
13 --defuse_config $defuse_config
14 --defuse_script $defuse_script
15 $out_file
16 </command> <!-- Runs datamanager_create_reference.py with the Ensembl genome version as dbkey, a description built from the Ensembl prefix, Ensembl genome version and UCSC genome version, the paths of the two generated configfiles (defuse_config and defuse_script), and the output dataset path as the final positional argument. -->
17 <inputs>
18 <conditional name="genome">
19 <param name="choice" type="select" label="Select a Genome Build"> <!-- Each option value matches the value of one when branch of the enclosing genome conditional; the visible label lists organism, Ensembl genome version and UCSC genome version. -->
20 <option value="GRCh38">Homo_sapiens GRCh38 hg38</option>
21 <option value="GRCh37">Homo_sapiens GRCh37 hg19</option>
22 <option value="NCBI36">Homo_sapiens NCBI36 hg18</option>
23 <option value="GRCm38">Mus_musculus GRCm38 mm10</option>
24 <option value="NCBIM37">Mus_musculus NCBIM37 mm9</option>
25 <option value="Rnor_5.0">Rattus_norvegicus Rnor_5.0 rn5</option>
26 <option value="user_specified">User specified</option>
27 </param>
28 <when value="GRCh38"> <!-- Preset for Homo_sapiens GRCh38 (UCSC hg38) at Ensembl release 80. The hidden params use the same names as the user_specified branch, so the command and configfile templates read them as $genome.NAME. -->
29 <param name="ensembl_organism" type="hidden" value="homo_sapiens"/>
30 <param name="ensembl_prefix" type="hidden" value="Homo_sapiens"/>
31 <param name="ensembl_genome_version" type="hidden" value="GRCh38"/>
32 <param name="ensembl_version" type="hidden" value="80"/>
33 <param name="ncbi_organism" type="hidden" value="Homo_sapiens"/>
34 <param name="ncbi_prefix" type="hidden" value="Hs"/>
35 <param name="ucsc_genome_version" type="hidden" value="hg38"/>
36 <param name="chromosomes" type="hidden" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y,MT"/>
37 <param name="mt_chromosome" type="hidden" value="MT"/>
38 <param name="gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding"/>
39 <param name="ig_gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene"/>
40 <param name="rrna_gene_sources" type="hidden" value="Mt_rRNA,rRNA,rRNA_pseudogene"/>
41 </when>
42 <when value="GRCh37"> <!-- Preset for Homo_sapiens GRCh37 (UCSC hg19) at Ensembl release 71. The hidden params use the same names as the user_specified branch, so the command and configfile templates read them as $genome.NAME. -->
43 <param name="ensembl_organism" type="hidden" value="homo_sapiens"/>
44 <param name="ensembl_prefix" type="hidden" value="Homo_sapiens"/>
45 <param name="ensembl_genome_version" type="hidden" value="GRCh37"/>
46 <param name="ensembl_version" type="hidden" value="71"/>
47 <param name="ncbi_organism" type="hidden" value="Homo_sapiens"/>
48 <param name="ncbi_prefix" type="hidden" value="Hs"/>
49 <param name="ucsc_genome_version" type="hidden" value="hg19"/>
50 <param name="chromosomes" type="hidden" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y,MT"/>
51 <param name="mt_chromosome" type="hidden" value="MT"/>
52 <param name="gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding"/>
53 <param name="ig_gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene"/>
54 <param name="rrna_gene_sources" type="hidden" value="Mt_rRNA,rRNA,rRNA_pseudogene"/>
55 </when>
56 <when value="NCBI36"> <!-- Preset for Homo_sapiens NCBI36 (UCSC hg18) at Ensembl release 54. The hidden params use the same names as the user_specified branch, so the command and configfile templates read them as $genome.NAME. -->
57 <param name="ensembl_organism" type="hidden" value="homo_sapiens"/>
58 <param name="ensembl_prefix" type="hidden" value="Homo_sapiens"/>
59 <param name="ensembl_genome_version" type="hidden" value="NCBI36"/>
60 <param name="ensembl_version" type="hidden" value="54"/>
61 <param name="ncbi_organism" type="hidden" value="Homo_sapiens"/>
62 <param name="ncbi_prefix" type="hidden" value="Hs"/>
63 <param name="ucsc_genome_version" type="hidden" value="hg18"/>
64 <param name="chromosomes" type="hidden" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y,MT"/>
65 <param name="mt_chromosome" type="hidden" value="MT"/>
66 <param name="gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding"/>
67 <param name="ig_gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene"/>
68 <param name="rrna_gene_sources" type="hidden" value="Mt_rRNA,rRNA,rRNA_pseudogene"/>
69 </when>
70 <when value="GRCm38"> <!-- Preset for Mus_musculus GRCm38 (UCSC mm10) at Ensembl release 71; the chromosome list runs 1 to 19 plus X, Y and MT. The hidden params use the same names as the user_specified branch, so the command and configfile templates read them as $genome.NAME. -->
71 <param name="ensembl_organism" type="hidden" value="mus_musculus"/>
72 <param name="ensembl_prefix" type="hidden" value="Mus_musculus"/>
73 <param name="ensembl_genome_version" type="hidden" value="GRCm38"/>
74 <param name="ensembl_version" type="hidden" value="71"/>
75 <param name="ncbi_organism" type="hidden" value="Mus_musculus"/>
76 <param name="ncbi_prefix" type="hidden" value="Mm"/>
77 <param name="ucsc_genome_version" type="hidden" value="mm10"/>
78 <param name="chromosomes" type="hidden" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,X,Y,MT"/>
79 <param name="mt_chromosome" type="hidden" value="MT"/>
80 <param name="gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding"/>
81 <param name="ig_gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene"/>
82 <param name="rrna_gene_sources" type="hidden" value="Mt_rRNA,rRNA,rRNA_pseudogene"/>
83 </when>
84 <when value="NCBIM37"> <!-- Preset for Mus_musculus NCBIM37 (UCSC mm9) at Ensembl release 67; the chromosome list runs 1 to 19 plus X, Y and MT. The hidden params use the same names as the user_specified branch, so the command and configfile templates read them as $genome.NAME. -->
85 <param name="ensembl_organism" type="hidden" value="mus_musculus"/>
86 <param name="ensembl_prefix" type="hidden" value="Mus_musculus"/>
87 <param name="ensembl_genome_version" type="hidden" value="NCBIM37"/>
88 <param name="ensembl_version" type="hidden" value="67"/>
89 <param name="ncbi_organism" type="hidden" value="Mus_musculus"/>
90 <param name="ncbi_prefix" type="hidden" value="Mm"/>
91 <param name="ucsc_genome_version" type="hidden" value="mm9"/>
92 <param name="chromosomes" type="hidden" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,X,Y,MT"/>
93 <param name="mt_chromosome" type="hidden" value="MT"/>
94 <param name="gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding"/>
95 <param name="ig_gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene"/>
96 <param name="rrna_gene_sources" type="hidden" value="Mt_rRNA,rRNA,rRNA_pseudogene"/>
97 </when>
98 <when value="Rnor_5.0"> <!-- Preset for Rattus_norvegicus Rnor_5.0 (UCSC rn5) at Ensembl release 71; the chromosome list runs 1 to 19 plus X and MT, with no Y entry. The hidden params use the same names as the user_specified branch, so the command and configfile templates read them as $genome.NAME. -->
99 <param name="ensembl_organism" type="hidden" value="rattus_norvegicus"/>
100 <param name="ensembl_prefix" type="hidden" value="Rattus_norvegicus"/>
101 <param name="ensembl_genome_version" type="hidden" value="Rnor_5.0"/>
102 <param name="ensembl_version" type="hidden" value="71"/>
103 <param name="ncbi_organism" type="hidden" value="Rattus_norvegicus"/>
104 <param name="ncbi_prefix" type="hidden" value="Rn"/>
105 <param name="ucsc_genome_version" type="hidden" value="rn5"/>
106 <param name="chromosomes" type="hidden" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,X,MT"/>
107 <param name="mt_chromosome" type="hidden" value="MT"/>
108 <param name="gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding"/>
109 <param name="ig_gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene"/>
110 <param name="rrna_gene_sources" type="hidden" value="Mt_rRNA,rRNA,rRNA_pseudogene"/>
111 </when>
112 <when value="user_specified"> <!-- Free-form branch: exposes as editable fields the same parameter names that the preset branches supply as hidden values. The mitochondrial chromosome name and the three gene source lists are pre-filled with the values the presets use; the other fields start empty. -->
113 <param name="ensembl_organism" type="text" value="" label="Ensembl Organism Name" help="Examples: homo_sapiens, mus_musculus, rattus_norvegicus"/>
114 <param name="ensembl_prefix" type="text" value="" label="Ensembl Organism prefix" help="Examples: Homo_sapiens, Mus_musculus, Rattus_norvegicus"/>
115 <param name="ensembl_genome_version" type="text" value="" label="Ensembl Genome Version" help="Examples: GRCh37, GRCm38, Rnor_5.0"/>
116 <param name="ensembl_version" type="integer" value="" label="Ensembl Release Version" help="Example: 71"/>
117 <param name="ncbi_organism" type="text" value="" label="NCBI Organism Name" help="Examples: Homo_sapiens, Mus_musculus, Rattus_norvegicus"/>
118 <param name="ncbi_prefix" type="text" value="" label="NCBI Organism Unigene prefix" help="Examples: Hs, Mm, Rn"/>
119 <param name="ucsc_genome_version" type="text" value="" label="UCSC Genome Version" help="Examples: hg19, mm10, rn5"/>
120 <param name="chromosomes" type="text" value="" label="Chromosomes for Ensembl genome build" >
121 <help> Examples:
122 Homo_sapiens: 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y,MT
123 Mus_musculus: 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,X,Y,MT
124 Rattus_norvegicus: 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,X,MT
125 ( ftp://ftp.ensembl.org/pub/release-71/fasta/homo_sapiens/dna/ )
126 </help>
127 </param>
128 <param name="mt_chromosome" type="text" value="MT" label="Ensembl Mitochondrial Chromosome name" />
129 <param name="gene_sources" type="text" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding" label="Gene sources" />
130 <param name="ig_gene_sources" type="text" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene" label="IG Gene sources" />
131 <param name="rrna_gene_sources" type="text" value="Mt_rRNA,rRNA,rRNA_pseudogene" label="Ribosomal Gene sources" />
132 </when>
133 </conditional>
134 </inputs>
135 <outputs> <!-- One dataset in data_manager_json format, labelled with the tool name and the Ensembl genome version; the command passes its path as $out_file. -->
136 <data name="out_file" format="data_manager_json" label="${tool.name} : ${genome.ensembl_genome_version}"/>
137 </outputs>
138 <stdio> <!-- Failure detection: any exit code of 1 or higher is fatal, and so is the text "Error:" appearing in either stdout or stderr (source="both"). -->
139 <exit_code range="1:" level="fatal" description="Error running Create DeFuse Reference" />
140 <regex match="Error:"
141 source="both"
142 level="fatal"
143 description="Error running Create DeFuse Reference" />
144
145 </stdio>
146 <configfiles>
147 <configfile name="defuse_config">
148 #
149 # Configuration file for defuse
150 #
151 # Variables that designate the PATH to an application, e.g. __SAMTOOLS_BIN__
152 # will be set by the runtime script using the ENV PATH
153 #
154
155 # Directory where the defuse code was unpacked
156 source_directory = __DEFUSE_PATH__
157
158 # Organism IDs
159 ensembl_organism = $genome.ensembl_organism
160 ensembl_prefix = $genome.ensembl_prefix
161 ensembl_version = $genome.ensembl_version
162 ensembl_genome_version = $genome.ensembl_genome_version
163 ucsc_genome_version = $genome.ucsc_genome_version
164 ncbi_organism = $genome.ncbi_organism
165 ncbi_prefix = $genome.ncbi_prefix
166
167 # Directory where you want your dataset
168 dataset_directory = __DATASET_DIRECTORY__
169
170 #raw
171 # Input genome and gene models
172 gene_models = $(dataset_directory)/$(ensembl_prefix).$(ensembl_genome_version).$(ensembl_version).gtf
173 genome_fasta = $(dataset_directory)/$(ensembl_prefix).$(ensembl_genome_version).$(ensembl_version).dna.chromosomes.fa
174
175 # Repeat table from ucsc genome browser
176 repeats_filename = $(dataset_directory)/repeats.txt
177
178 # EST info downloaded from ucsc genome browser
179 est_fasta = $(dataset_directory)/est.fa
180 est_alignments = $(dataset_directory)/intronEst.txt
181
182 # Unigene clusters downloaded from ncbi
183 unigene_fasta = $(dataset_directory)/$(ncbi_prefix).seq.uniq
184 #end raw
185
186 # Paths to external tools
187 samtools_bin = __SAMTOOLS_BIN__
188 bowtie_bin = __BOWTIE_BIN__
189 bowtie_build_bin = __BOWTIE_BUILD_BIN__
190 blat_bin = __BLAT_BIN__
191 fatotwobit_bin = __FATOTWOBIT_BIN__
192 gmap_bin = __GMAP_BIN__
193 gmap_setup_bin = __GMAP_SETUP_BIN__
194 r_bin = __R_BIN__
195 rscript_bin = __RSCRIPT_BIN__
196
197 #raw
198 # Directory where you want your dataset
199 gmap_index_directory = $(dataset_directory)/gmap
200 #end raw
201
202 #raw
203 # Dataset files
204 dataset_prefix = $(dataset_directory)/defuse
205 chromosome_prefix = $(dataset_prefix).dna.chromosomes
206 exons_fasta = $(dataset_prefix).exons.fa
207 cds_fasta = $(dataset_prefix).cds.fa
208 cdna_regions = $(dataset_prefix).cdna.regions
209 cdna_fasta = $(dataset_prefix).cdna.fa
210 reference_fasta = $(dataset_prefix).reference.fa
211 rrna_fasta = $(dataset_prefix).rrna.fa
212 ig_gene_list = $(dataset_prefix).ig.gene.list
213 repeats_regions = $(dataset_directory)/repeats.regions
214 est_split_fasta1 = $(dataset_directory)/est.1.fa
215 est_split_fasta2 = $(dataset_directory)/est.2.fa
216 est_split_fasta3 = $(dataset_directory)/est.3.fa
217 est_split_fasta4 = $(dataset_directory)/est.4.fa
218 est_split_fasta5 = $(dataset_directory)/est.5.fa
219 est_split_fasta6 = $(dataset_directory)/est.6.fa
220 est_split_fasta7 = $(dataset_directory)/est.7.fa
221 est_split_fasta8 = $(dataset_directory)/est.8.fa
222 est_split_fasta9 = $(dataset_directory)/est.9.fa
223
224 # Fasta files with bowtie indices for prefiltering reads for concordantly mapping pairs
225 prefilter1 = $(unigene_fasta)
226
227 # deFuse scripts and tools
228 scripts_directory = $(source_directory)/scripts
229 tools_directory = $(source_directory)/tools
230 data_directory = $(source_directory)/data
231 #end raw
232
233 # Parameters for building the dataset
234 chromosomes = $genome.chromosomes
235 mt_chromosome = $genome.mt_chromosome
236 gene_sources = $genome.gene_sources
237 ig_gene_sources = $genome.ig_gene_sources
238 rrna_gene_sources = $genome.rrna_gene_sources
239 gene_biotypes = $genome.gene_sources
240 ig_gene_biotypes = $genome.ig_gene_sources
241 rrna_gene_biotypes = $genome.rrna_gene_sources
242
243 #raw
244 # Remove temp files
245 remove_job_files = yes
246 remove_job_temp_files = yes
247 #end raw
248 </configfile>
249 <configfile name="defuse_script">#slurp
250 #!/bin/bash
251 ## Cheetah-only definitions: amp is chr(38), the ampersand, and gt is chr(62), the greater-than sign; the shell lines below use them instead of writing those characters literally
252 #set $amp = chr(38)
253 #set $gt = chr(62)
254 ## Substitute pathnames into the config file: each line replaces one __NAME__ placeholder if grep still finds it. The first script argument is used as the dataset directory, DEFUSE_PATH is read from the environment, and tool paths are found with which. NOTE(review): __GMAP_INDEX_DIR__ does not occur in the defuse_config template of this tool, so that line looks like a no-op here
255 if `grep __DATASET_DIRECTORY__ $defuse_config ${gt} /dev/null`;then sed -i'.tmp' "s#__DATASET_DIRECTORY__#\$1#" $defuse_config; fi
256 if `grep __DEFUSE_PATH__ $defuse_config ${gt} /dev/null`;then sed -i'.tmp' "s#__DEFUSE_PATH__#\${DEFUSE_PATH}#" $defuse_config; fi
257 if `grep __SAMTOOLS_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} SAMTOOLS_BIN=`which samtools`;then sed -i'.tmp' "s#__SAMTOOLS_BIN__#\${SAMTOOLS_BIN}#" $defuse_config; fi
258 if `grep __BOWTIE_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} BOWTIE_BIN=`which bowtie`;then sed -i'.tmp' "s#__BOWTIE_BIN__#\${BOWTIE_BIN}#" $defuse_config; fi
259 if `grep __BOWTIE_BUILD_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} BOWTIE_BUILD_BIN=`which bowtie-build`;then sed -i'.tmp' "s#__BOWTIE_BUILD_BIN__#\${BOWTIE_BUILD_BIN}#" $defuse_config; fi
260 if `grep __BLAT_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} BLAT_BIN=`which blat`;then sed -i'.tmp' "s#__BLAT_BIN__#\${BLAT_BIN}#" $defuse_config; fi
261 if `grep __FATOTWOBIT_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} FATOTWOBIT_BIN=`which faToTwoBit`;then sed -i'.tmp' "s#__FATOTWOBIT_BIN__#\${FATOTWOBIT_BIN}#" $defuse_config; fi
262 if `grep __GMAP_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} GMAP_BIN=`which gmap`;then sed -i'.tmp' "s#__GMAP_BIN__#\${GMAP_BIN}#" $defuse_config; fi
263 if `grep __GMAP_SETUP_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} GMAP_SETUP_BIN=`which gmap_setup`;then sed -i'.tmp' "s#__GMAP_SETUP_BIN__#\${GMAP_SETUP_BIN}#" $defuse_config; fi
264 if `grep __GMAP_INDEX_DIR__ $defuse_config ${gt} /dev/null` ${amp}${amp} GMAP_INDEX_DIR=`pwd`/gmap;then sed -i'.tmp' "s#__GMAP_INDEX_DIR__#\${GMAP_INDEX_DIR}#" $defuse_config; fi
265 if `grep __R_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} R_BIN=`which R`;then sed -i'.tmp' "s#__R_BIN__#\${R_BIN}#" $defuse_config; fi
266 if `grep __RSCRIPT_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} RSCRIPT_BIN=`which Rscript`;then sed -i'.tmp' "s#__RSCRIPT_BIN__#\${RSCRIPT_BIN}#" $defuse_config; fi
267 ## Copy the substituted config into the dataset directory (the first script argument) as defuse_config.txt
268 cp $defuse_config \$1/defuse_config.txt
269 ## Run create_reference_dataset.pl from the scripts directory under DEFUSE_PATH, passing the substituted config with -c
270 perl \${DEFUSE_PATH}/scripts/create_reference_dataset.pl -c $defuse_config
271 </configfile>
272 </configfiles>
273
274 <tests>
275 </tests>
276 <help>
277 **DeFuse**
278
279 DeFuse_ is a software package for gene fusion discovery using RNA-Seq data. The software uses clusters of discordant paired end alignments to inform a split read alignment analysis for finding fusion boundaries. The software also employs a number of heuristic filters in an attempt to reduce the number of false positives and produces a fully annotated output for each predicted fusion. See the DeFuse_Version_0.6_ manual for details.
280
281 DeFuse uses a Reference Dataset to search for gene fusions. The Reference Dataset is generated from the following sources in DeFuse_Version_0.6_:
282 - genome_fasta from Ensembl
283 - gene_models from Ensembl
284 - repeats_filename from UCSC RepeatMasker rmsk.txt
285 - est_fasta from UCSC
286 - est_alignments from UCSC intronEst.txt
287 - unigene_fasta from NCBI
288
289 The create_defuse_reference Galaxy tool downloads the reference genome and other source files, and builds any derivative files including bowtie indices, gmap indices, and 2bit files. Expect this step to take at least 12 hours.
290
291
292 It will generate the reference data for the deFuse Galaxy tool.
293
294 Journal reference: http://www.ploscompbiol.org/article/info%3Adoi%2F10.1371%2Fjournal.pcbi.1001138
295
296 .. _DeFuse: http://sourceforge.net/apps/mediawiki/defuse/index.php?title=Main_Page
297
298 .. _DeFuse_Version_0.6: http://sourceforge.net/apps/mediawiki/defuse/index.php?title=DeFuse_Version_0.6.1
299
300 ------
301
302 **Outputs**
303
304 The galaxy history will contain: the config.txt file that provides DeFuse with the reference data paths.
305
306 </help>
307 </tool>