comparison create_reference_dataset.xml @ 11:b22f8634ff84 draft

planemo upload for repository https://github.com/jj-umn/galaxytools/tree/master/defuse commit 23b94b5747c6956360cd2eca0a07a669929ea141-dirty
author jjohnson
date Sun, 17 Jan 2016 14:11:06 -0500
parents
children b67c24d902aa
comparison
equal deleted inserted replaced
10:f65857c1b92e 11:b22f8634ff84
1 <tool id="create_defuse_reference" name="Create DeFuse Reference" version="@DEFUSE_VERSION@.1">
2 <description>create a defuse reference from Ensembl and UCSC sources</description>
3 <macros>
4 <import>macros.xml</import>
5 </macros>
6 <requirements>
7 <expand macro="defuse_requirement" />
8 <expand macro="mapping_requirements" />
9 </requirements>
10 <command>/bin/bash $defuse_script</command> <!-- run the generated defuse_script configfile with bash; the deprecated interpreter attribute is not needed because the shell is named explicitly -->
11 <inputs>
12 <conditional name="genome">
13 <param name="choice" type="select" label="Select a Genome Build"> <!-- the selected option value picks the when-branch of the enclosing conditional -->
14 <option value="GRCh38">Homo_sapiens GRCh38 hg38</option>
15 <option value="GRCh37">Homo_sapiens GRCh37 hg19</option>
16 <option value="NCBI36">Homo_sapiens NCBI36 hg18</option>
17 <option value="GRCm38">Mus_musculus GRCm38 mm10</option>
18 <option value="NCBIM37">Mus_musculus NCBIM37 mm9</option>
19 <option value="Rnor_5.0">Rattus_norvegicus Rnor_5.0 rn5</option>
20 <option value="user_specified">User specified</option>
21 </param>
22 <when value="GRCh38"> <!-- preset: human GRCh38 / UCSC hg38, Ensembl release 80; all values are fixed hidden params -->
23 <param name="ensembl_organism" type="hidden" value="homo_sapiens"/>
24 <param name="ensembl_prefix" type="hidden" value="Homo_sapiens"/>
25 <param name="ensembl_genome_version" type="hidden" value="GRCh38"/>
26 <param name="ensembl_version" type="hidden" value="80"/>
27 <param name="ncbi_organism" type="hidden" value="Homo_sapiens"/>
28 <param name="ncbi_prefix" type="hidden" value="Hs"/>
29 <param name="ucsc_genome_version" type="hidden" value="hg38"/>
30 <param name="chromosomes" type="hidden" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y,MT"/>
31 <param name="mt_chromosome" type="hidden" value="MT"/>
32 <param name="gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding"/>
33 <param name="ig_gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene"/>
34 <param name="rrna_gene_sources" type="hidden" value="Mt_rRNA,rRNA,rRNA_pseudogene"/>
35 </when>
36 <when value="GRCh37"> <!-- preset: human GRCh37 / UCSC hg19, Ensembl release 71; all values are fixed hidden params -->
37 <param name="ensembl_organism" type="hidden" value="homo_sapiens"/>
38 <param name="ensembl_prefix" type="hidden" value="Homo_sapiens"/>
39 <param name="ensembl_genome_version" type="hidden" value="GRCh37"/>
40 <param name="ensembl_version" type="hidden" value="71"/>
41 <param name="ncbi_organism" type="hidden" value="Homo_sapiens"/>
42 <param name="ncbi_prefix" type="hidden" value="Hs"/>
43 <param name="ucsc_genome_version" type="hidden" value="hg19"/>
44 <param name="chromosomes" type="hidden" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y,MT"/>
45 <param name="mt_chromosome" type="hidden" value="MT"/>
46 <param name="gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding"/>
47 <param name="ig_gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene"/>
48 <param name="rrna_gene_sources" type="hidden" value="Mt_rRNA,rRNA,rRNA_pseudogene"/>
49 </when>
50 <when value="NCBI36"> <!-- preset: human NCBI36 / UCSC hg18, Ensembl release 54; all values are fixed hidden params -->
51 <param name="ensembl_organism" type="hidden" value="homo_sapiens"/>
52 <param name="ensembl_prefix" type="hidden" value="Homo_sapiens"/>
53 <param name="ensembl_genome_version" type="hidden" value="NCBI36"/>
54 <param name="ensembl_version" type="hidden" value="54"/>
55 <param name="ncbi_organism" type="hidden" value="Homo_sapiens"/>
56 <param name="ncbi_prefix" type="hidden" value="Hs"/>
57 <param name="ucsc_genome_version" type="hidden" value="hg18"/>
58 <param name="chromosomes" type="hidden" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y,MT"/>
59 <param name="mt_chromosome" type="hidden" value="MT"/>
60 <param name="gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding"/>
61 <param name="ig_gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene"/>
62 <param name="rrna_gene_sources" type="hidden" value="Mt_rRNA,rRNA,rRNA_pseudogene"/>
63 </when>
64 <when value="GRCm38"> <!-- preset: mouse GRCm38 / UCSC mm10, Ensembl release 71; all values are fixed hidden params -->
65 <param name="ensembl_organism" type="hidden" value="mus_musculus"/>
66 <param name="ensembl_prefix" type="hidden" value="Mus_musculus"/>
67 <param name="ensembl_genome_version" type="hidden" value="GRCm38"/>
68 <param name="ensembl_version" type="hidden" value="71"/>
69 <param name="ncbi_organism" type="hidden" value="Mus_musculus"/>
70 <param name="ncbi_prefix" type="hidden" value="Mm"/>
71 <param name="ucsc_genome_version" type="hidden" value="mm10"/>
72 <param name="chromosomes" type="hidden" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,X,Y,MT"/>
73 <param name="mt_chromosome" type="hidden" value="MT"/>
74 <param name="gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding"/>
75 <param name="ig_gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene"/>
76 <param name="rrna_gene_sources" type="hidden" value="Mt_rRNA,rRNA,rRNA_pseudogene"/>
77 </when>
78 <when value="NCBIM37"> <!-- preset: mouse NCBIM37 / UCSC mm9, Ensembl release 67; all values are fixed hidden params -->
79 <param name="ensembl_organism" type="hidden" value="mus_musculus"/>
80 <param name="ensembl_prefix" type="hidden" value="Mus_musculus"/>
81 <param name="ensembl_genome_version" type="hidden" value="NCBIM37"/>
82 <param name="ensembl_version" type="hidden" value="67"/>
83 <param name="ncbi_organism" type="hidden" value="Mus_musculus"/>
84 <param name="ncbi_prefix" type="hidden" value="Mm"/>
85 <param name="ucsc_genome_version" type="hidden" value="mm9"/>
86 <param name="chromosomes" type="hidden" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,X,Y,MT"/>
87 <param name="mt_chromosome" type="hidden" value="MT"/>
88 <param name="gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding"/>
89 <param name="ig_gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene"/>
90 <param name="rrna_gene_sources" type="hidden" value="Mt_rRNA,rRNA,rRNA_pseudogene"/>
91 </when>
92 <when value="Rnor_5.0"> <!-- preset: rat Rnor_5.0 / UCSC rn5, Ensembl release 71; chromosome list has no Y entry -->
93 <param name="ensembl_organism" type="hidden" value="rattus_norvegicus"/>
94 <param name="ensembl_prefix" type="hidden" value="Rattus_norvegicus"/>
95 <param name="ensembl_genome_version" type="hidden" value="Rnor_5.0"/>
96 <param name="ensembl_version" type="hidden" value="71"/>
97 <param name="ncbi_organism" type="hidden" value="Rattus_norvegicus"/>
98 <param name="ncbi_prefix" type="hidden" value="Rn"/>
99 <param name="ucsc_genome_version" type="hidden" value="rn5"/>
100 <param name="chromosomes" type="hidden" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,X,MT"/>
101 <param name="mt_chromosome" type="hidden" value="MT"/>
102 <param name="gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding"/>
103 <param name="ig_gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene"/>
104 <param name="rrna_gene_sources" type="hidden" value="Mt_rRNA,rRNA,rRNA_pseudogene"/>
105 </when>
106 <when value="user_specified"> <!-- free-form branch: same parameter names as the presets, but entered by the user -->
107 <param name="ensembl_organism" type="text" value="" label="Ensembl Organism Name">
108 <help>
109 Examples: homo_sapiens, mus_musculus, rattus_norvegicus
110 ftp://ftp.ensembl.org/pub/release-$ensembl_version/fasta/$ensembl_organism/dna/$ensembl_prefix.$ensembl_genome_version.$ensembl_version.dna.chromosome.$chromosome.fa.gz
111 </help>
112 </param>
113 <param name="ensembl_prefix" type="text" value="" label="Ensembl Organism prefix" help="Examples: Homo_sapiens, Mus_musculus, Rattus_norvegicus"/>
114 <param name="ensembl_genome_version" type="text" value="" label="Ensembl Genome Version" help="Examples: GRCh37, GRCm38, Rnor_5.0"/>
115 <param name="ensembl_version" type="integer" value="" label="Ensembl Release Version" help="Example: 71"/>
116 <param name="ncbi_organism" type="text" value="" label="NCBI Organism Name" help="Examples: Homo_sapiens, Mus_musculus, Rattus_norvegicus"/>
117 <param name="ncbi_prefix" type="text" value="" label="NCBI Organism Unigene prefix" help="Examples: Hs, Mm, Rn"/>
118 <param name="ucsc_genome_version" type="text" value="" label="UCSC Genome Version" help="Examples: hg19, mm10, rn5"/>
119 <param name="chromosomes" type="text" value="" label="Chromosomes for Ensembl genome build" >
120 <help> Examples:
121 Homo_sapiens: 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y,MT
122 Mus_musculus: 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,X,Y,MT
123 Rattus_norvegicus: 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,X,MT
124 ( ftp://ftp.ensembl.org/pub/release-71/fasta/homo_sapiens/dna/ )
125 </help>
126 </param>
127 <param name="mt_chromosome" type="text" value="MT" label="Ensembl Mitochondrial Chromosome name" />
128 <param name="gene_sources" type="text" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding" label="Gene sources" />
129 <param name="ig_gene_sources" type="text" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene" label="IG Gene sources" />
130 <param name="rrna_gene_sources" type="text" value="Mt_rRNA,rRNA,rRNA_pseudogene" label="Ribosomal Gene sources" />
131 </when>
132 </conditional>
133 </inputs>
134 <outputs> <!-- single output: the generated config; its label embeds the tool name and the chosen Ensembl genome version -->
135 <data format="defuse.conf" name="config_txt" label="${tool.name} on ${genome.ensembl_genome_version} : config.txt"/>
136 </outputs>
137 <stdio> <!-- job is marked fatal on any exit code of 1 or higher, or when "Error:" appears on stdout or stderr -->
138 <exit_code range="1:" level="fatal" description="Error running Create DeFuse Reference" />
139 <regex match="Error:"
140 source="both"
141 level="fatal"
142 description="Error running Create DeFuse Reference" />
143
144 </stdio>
145 <configfiles>
146 <configfile name="defuse_config">
147 #
148 # Configuration file for defuse
149 #
150 # Variables that designate the PATH to an application, e.g. __SAMTOOLS_BIN__
151 # will be set by the runtime script using the ENV PATH
152 #
153
154 # Directory where the defuse code was unpacked
155 source_directory = __DEFUSE_PATH__
156
157 # Organism IDs
158 ensembl_organism = $genome.ensembl_organism
159 ensembl_prefix = $genome.ensembl_prefix
160 ensembl_version = $genome.ensembl_version
161 ensembl_genome_version = $genome.ensembl_genome_version
162 ucsc_genome_version = $genome.ucsc_genome_version
163 ncbi_organism = $genome.ncbi_organism
164 ncbi_prefix = $genome.ncbi_prefix
165
166 # Directory where you want your dataset
167 dataset_directory = $config_txt.dataset.extra_files_path
168
169 #raw
170 # Input genome and gene models
171 gene_models = $(dataset_directory)/$(ensembl_prefix).$(ensembl_genome_version).$(ensembl_version).gtf
172 genome_fasta = $(dataset_directory)/$(ensembl_prefix).$(ensembl_genome_version).$(ensembl_version).dna.chromosomes.fa
173
174 # Repeat table from ucsc genome browser
175 repeats_filename = $(dataset_directory)/repeats.txt
176
177 # EST info downloaded from ucsc genome browser
178 est_fasta = $(dataset_directory)/est.fa
179 est_alignments = $(dataset_directory)/intronEst.txt
180
181 # Unigene clusters downloaded from ncbi
182 unigene_fasta = $(dataset_directory)/$(ncbi_prefix).seq.uniq
183 #end raw
184
185 # Paths to external tools
186 samtools_bin = __SAMTOOLS_BIN__
187 bowtie_bin = __BOWTIE_BIN__
188 bowtie_build_bin = __BOWTIE_BUILD_BIN__
189 blat_bin = __BLAT_BIN__
190 fatotwobit_bin = __FATOTWOBIT_BIN__
191 gmap_bin = __GMAP_BIN__
192 gmap_setup_bin = __GMAP_SETUP_BIN__
193 r_bin = __R_BIN__
194 rscript_bin = __RSCRIPT_BIN__
195
196 #raw
197 # Directory where the gmap index is stored
198 gmap_index_directory = $(dataset_directory)/gmap
199 #end raw
200
201 #raw
202 # Dataset files
203 dataset_prefix = $(dataset_directory)/defuse
204 chromosome_prefix = $(dataset_prefix).dna.chromosomes
205 exons_fasta = $(dataset_prefix).exons.fa
206 cds_fasta = $(dataset_prefix).cds.fa
207 cdna_regions = $(dataset_prefix).cdna.regions
208 cdna_fasta = $(dataset_prefix).cdna.fa
209 reference_fasta = $(dataset_prefix).reference.fa
210 rrna_fasta = $(dataset_prefix).rrna.fa
211 ig_gene_list = $(dataset_prefix).ig.gene.list
212 repeats_regions = $(dataset_directory)/repeats.regions
213 est_split_fasta1 = $(dataset_directory)/est.1.fa
214 est_split_fasta2 = $(dataset_directory)/est.2.fa
215 est_split_fasta3 = $(dataset_directory)/est.3.fa
216 est_split_fasta4 = $(dataset_directory)/est.4.fa
217 est_split_fasta5 = $(dataset_directory)/est.5.fa
218 est_split_fasta6 = $(dataset_directory)/est.6.fa
219 est_split_fasta7 = $(dataset_directory)/est.7.fa
220 est_split_fasta8 = $(dataset_directory)/est.8.fa
221 est_split_fasta9 = $(dataset_directory)/est.9.fa
222
223 # Fasta files with bowtie indices for prefiltering reads for concordantly mapping pairs
224 prefilter1 = $(unigene_fasta)
225
226 # deFuse scripts and tools
227 scripts_directory = $(source_directory)/scripts
228 tools_directory = $(source_directory)/tools
229 data_directory = $(source_directory)/data
230 #end raw
231
232 # Parameters for building the dataset
233 chromosomes = $genome.chromosomes
234 mt_chromosome = $genome.mt_chromosome
235 gene_sources = $genome.gene_sources
236 ig_gene_sources = $genome.ig_gene_sources
237 rrna_gene_sources = $genome.rrna_gene_sources
238 gene_biotypes = $genome.gene_sources
239 ig_gene_biotypes = $genome.ig_gene_sources
240 rrna_gene_biotypes = $genome.rrna_gene_sources
241
242 #raw
243 # Remove temp files
244 remove_job_files = yes
245 remove_job_temp_files = yes
246 #end raw
247 </configfile>
248 <configfile name="defuse_script">
249 #!/bin/bash
250 ## define XML-unsafe shell characters (ampersand, greater-than) as cheetah variables
251 #set $amp = chr(38)
252 #set $gt = chr(62)
253 ## substitute pathnames into config file: each placeholder found by grep is replaced with the path resolved from the job environment
254 if grep __DEFUSE_PATH__ $defuse_config ${gt} /dev/null;then sed -i'.tmp' "s#__DEFUSE_PATH__#\${DEFUSE_PATH}#" $defuse_config; fi
255 if grep __SAMTOOLS_BIN__ $defuse_config ${gt} /dev/null ${amp}${amp} SAMTOOLS_BIN=`which samtools`;then sed -i'.tmp' "s#__SAMTOOLS_BIN__#\${SAMTOOLS_BIN}#" $defuse_config; fi
256 if grep __BOWTIE_BIN__ $defuse_config ${gt} /dev/null ${amp}${amp} BOWTIE_BIN=`which bowtie`;then sed -i'.tmp' "s#__BOWTIE_BIN__#\${BOWTIE_BIN}#" $defuse_config; fi
257 if grep __BOWTIE_BUILD_BIN__ $defuse_config ${gt} /dev/null ${amp}${amp} BOWTIE_BUILD_BIN=`which bowtie-build`;then sed -i'.tmp' "s#__BOWTIE_BUILD_BIN__#\${BOWTIE_BUILD_BIN}#" $defuse_config; fi
258 if grep __BLAT_BIN__ $defuse_config ${gt} /dev/null ${amp}${amp} BLAT_BIN=`which blat`;then sed -i'.tmp' "s#__BLAT_BIN__#\${BLAT_BIN}#" $defuse_config; fi
259 if grep __FATOTWOBIT_BIN__ $defuse_config ${gt} /dev/null ${amp}${amp} FATOTWOBIT_BIN=`which faToTwoBit`;then sed -i'.tmp' "s#__FATOTWOBIT_BIN__#\${FATOTWOBIT_BIN}#" $defuse_config; fi
260 if grep __GMAP_BIN__ $defuse_config ${gt} /dev/null ${amp}${amp} GMAP_BIN=`which gmap`;then sed -i'.tmp' "s#__GMAP_BIN__#\${GMAP_BIN}#" $defuse_config; fi
261 if grep __GMAP_SETUP_BIN__ $defuse_config ${gt} /dev/null ${amp}${amp} GMAP_SETUP_BIN=`which gmap_setup`;then sed -i'.tmp' "s#__GMAP_SETUP_BIN__#\${GMAP_SETUP_BIN}#" $defuse_config; fi
262 if grep __GMAP_INDEX_DIR__ $defuse_config ${gt} /dev/null ${amp}${amp} GMAP_INDEX_DIR=`pwd`/gmap;then sed -i'.tmp' "s#__GMAP_INDEX_DIR__#\${GMAP_INDEX_DIR}#" $defuse_config; fi
263 if grep __R_BIN__ $defuse_config ${gt} /dev/null ${amp}${amp} R_BIN=`which R`;then sed -i'.tmp' "s#__R_BIN__#\${R_BIN}#" $defuse_config; fi
264 if grep __RSCRIPT_BIN__ $defuse_config ${gt} /dev/null ${amp}${amp} RSCRIPT_BIN=`which Rscript`;then sed -i'.tmp' "s#__RSCRIPT_BIN__#\${RSCRIPT_BIN}#" $defuse_config; fi
265 ## copy the substituted config to the output dataset
266 cp $defuse_config $config_txt
267 ## create the dataset directory that the config names as dataset_directory
268 mkdir -p $config_txt.dataset.extra_files_path
269 ## build the reference with create_reference_dataset.pl; as the last command, its exit status is the script's exit status
270 perl \${DEFUSE_PATH}/scripts/create_reference_dataset.pl -c $defuse_config
271 </configfile>
272 </configfiles>
273
274 <tests>
275 </tests>
276 <help>
277 **DeFuse**
278
279 DeFuse_ is a software package for gene fusion discovery using RNA-Seq data. The software uses clusters of discordant paired end alignments to inform a split read alignment analysis for finding fusion boundaries. The software also employs a number of heuristic filters in an attempt to reduce the number of false positives and produces a fully annotated output for each predicted fusion. See the DeFuse_Version_0.6_ manual for details.
280
281 DeFuse uses a Reference Dataset to search for gene fusions. The Reference Dataset is generated from the following sources in DeFuse_Version_0.6_:
282 - genome_fasta from Ensembl
283 - gene_models from Ensembl
284 - repeats_filename from UCSC RepeatMasker rmsk.txt
285 - est_fasta from UCSC
286 - est_alignments from UCSC intronEst.txt
287 - unigene_fasta from NCBI
288
289 The create_defuse_reference Galaxy tool downloads the reference genome and other source files, and builds any derivative files including bowtie indices, gmap indices, and 2bit files. Expect this step to take at least 12 hours.
290
291
292 It will generate a config.txt file that can be input into the deFuse Galaxy tool.
293
294 Journal reference: http://www.ploscompbiol.org/article/info%3Adoi%2F10.1371%2Fjournal.pcbi.1001138
295
296 .. _DeFuse: http://sourceforge.net/apps/mediawiki/defuse/index.php?title=Main_Page
297
298 .. _DeFuse_Version_0.6: http://sourceforge.net/apps/mediawiki/defuse/index.php?title=DeFuse_Version_0.6.1
299
300 ------
301
302 **Outputs**
303
304 The Galaxy history will contain: the config.txt file that provides DeFuse with the reference data paths.
305
306 </help>
307 <expand macro="citations"/>
308 </tool>