comparison TEdenovo.xml @ 0:baea09e6722b draft default tip

1st Uploaded
author vmarcon
date Mon, 06 Feb 2017 13:31:53 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:baea09e6722b
1 <tool id="tedenovo" name="REPET Lite - TEdenovo" version="2.2.0">
2
3 <!-- [REQUIRED] One-line summary displayed after the tool name -->
4 <description> Compute a library of transposable element</description>
5
6 <!-- [OPTIONAL] External dependencies the tool needs to run: a python interpreter and the repet package, version 2.5 -->
7 <requirements>
8 <requirement type="binary">python</requirement>
9 <requirement type="package" version="2.5">repet</requirement>
10 </requirements>
11
12 <!-- [STRONGLY RECOMMENDED] Exit code rules -->
13 <stdio>
14 <!-- Any non-zero exit code is fatal: range "1:" covers the positive codes, range ":-1" the negative ones -->
15 <exit_code range="1:" level="fatal"/>
16 <exit_code range=":-1" level="fatal"/>
17
18 </stdio>
19
20 <!-- [OPTIONAL] Command executed to obtain the tool's version string -->
21 <version_command>
22 TEdenovo.py --version
23 </version_command>
24
25 <!-- [REQUIRED] The command to execute. Positional arguments given to TEdenovo.sh, in order: input fasta, TE library output, classif switch (yes/no), log output, config output, stats output, then the classification output (an empty string when classif is not "yes"). Every value is single-quoted so the shell always receives it as one argument. -->
26 <command interpreter="bash">
27 TEdenovo.sh '$fasta' '$outputfasta' '$classif' '$outputlog' '$outputconfig' '$outputstats'
28 #if str( $classif ) == "yes":
29 '$outputclassif'
30 #else
31 ''
32 #end if
33 </command>
34
35 <!-- [REQUIRED] Input files and tool parameters: the fasta dataset to process, a yes/no switch (default "no") requesting the classification output, and a free-text name for the outputs -->
36 <inputs>
37 <param name="fasta" type="data" format="fasta" optional="false" label="Fasta alignment input" />
38 <param name="classif" type="select" label="Get classification informations" help="To add the informations at annotation file on next step." >
39 <option value="no" selected="true">No</option>
40 <option value="yes">Yes</option>
41 </param>
42 <param name="label" type="text" label="Output name" />
43 </inputs>
44
45 <!-- [REQUIRED] Output files. Each label is "TEdenovo-" followed by the user-supplied label (or the input dataset name when the label is empty) and a suffix naming the output. The classification output is only created when classif is "yes". The data element takes no type attribute, so none is set. -->
46 <outputs>
47 <data name="outputlog" format="txt" label="TEdenovo-#if str($label)=='' then $fasta.name else $label #.log" />
48 <data name="outputfasta" format="fasta" label="TEdenovo-#if str($label)=='' then $fasta.name else $label #.TElib.fa" />
49 <data name="outputstats" format="txt" label="TEdenovo-#if str($label)=='' then $fasta.name else $label #.classif_stats.txt" />
50 <data name="outputclassif" format="tabular" label="TEdenovo-#if str($label)=='' then $fasta.name else $label #.classif" >
51 <filter>(classif == 'yes')</filter>
52 </data>
53 <data name="outputconfig" format="txt" label="TEdenovo-#if str($label)=='' then $fasta.name else $label #.cfg" />
54 </outputs>
55
56
57 <!-- [OPTIONAL] Tests to be run manually by the Galaxy admin -->
58 <tests>
59 <!-- [HELP] Test files have to be in the ~/test-data directory -->
60 <test>
61 <param name="fasta" value="DmelChr4Chr3.fa" />
62 <output name="outputfasta"> <!-- TE library: expects a fasta header line and a sequence line of 60 bases -->
63 <assert_contents>
64 <has_line_matching expression="^>\w+" />
65 <has_line_matching expression="[ACTG]{60}" />
66 </assert_contents>
67 </output>
68 <output name="outputlog"> <!-- run log: expects the step 7 success message, an END timestamp and the fasta writing message -->
69 <assert_contents>
70 <has_line_matching expression="^step 7 finished successfully" />
71 <has_line_matching expression="^END time: \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}" />
72 <has_line_matching expression="^Writing fasta file" />
73 </assert_contents>
74 </output>
75 <output name="outputconfig"> <!-- configuration used for the run; the dot of the version number is escaped so it only matches a literal dot -->
76 <assert_contents>
77 <has_line_matching expression="^project_name: \d{8}" />
78 <has_line_matching expression="^repet_version: 2\.5" />
79 <has_line_matching expression="^tmpDir:" />
80 <has_line_matching expression="^clean: yes" />
81 </assert_contents>
82 </output>
83 <output name="outputstats"> <!-- classification statistics; parentheses and the decimal point are escaped so the TOTAL line, e.g. "TOTAL: 10 (100.00%)", can match, and the separator lines accept any number of dashes -->
84 <assert_contents>
85 <has_line_matching expression="-+Summary-+" />
86 <has_line_matching expression="TOTAL: \d+ \(\d+\.\d+%\)" />
87 <has_line_matching expression="-+NOTES-+" />
88 </assert_contents>
89 </output>
90 </test>
91 <test>
92 <param name="fasta" value="DmelChr4Chr3.fa" />
93 <param name="classif" value="yes" />
94 <output name="outputfasta"> <!-- TE library: expects a fasta header line and a sequence line of 60 bases -->
95 <assert_contents>
96 <has_line_matching expression="^>\w+" />
97 <has_line_matching expression="[ACTG]{60}" />
98 </assert_contents>
99 </output>
100 <output name="outputlog"> <!-- run log: expects the step 7 success message, an END timestamp and the fasta writing message -->
101 <assert_contents>
102 <has_line_matching expression="^step 7 finished successfully" />
103 <has_line_matching expression="^END time: \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}" />
104 <has_line_matching expression="^Writing fasta file" />
105 </assert_contents>
106 </output>
107 <output name="outputconfig"> <!-- configuration used for the run; the dot of the version number is escaped so it only matches a literal dot -->
108 <assert_contents>
109 <has_line_matching expression="^project_name: \d{8}" />
110 <has_line_matching expression="^repet_version: 2\.5" />
111 <has_line_matching expression="^tmpDir:" />
112 <has_line_matching expression="^clean: yes" />
113 </assert_contents>
114 </output>
115 <output name="outputstats"> <!-- classification statistics; parentheses and the decimal point are escaped so the TOTAL line, e.g. "TOTAL: 10 (100.00%)", can match, and the separator lines accept any number of dashes -->
116 <assert_contents>
117 <has_line_matching expression="-+Summary-+" />
118 <has_line_matching expression="TOTAL: \d+ \(\d+\.\d+%\)" />
119 <has_line_matching expression="-+NOTES-+" />
120 </assert_contents>
121 </output>
122 <output name="outputclassif"> <!-- classification table, only produced because this test sets classif to "yes"; expected to have 8 columns -->
123 <assert_contents>
124 <has_n_columns n="8" />
125 </assert_contents>
126 </output>
127 </test>
128 </tests>
129
130 <!-- [OPTIONAL] Help displayed in Galaxy -->
131 <help>
132 <![CDATA[
133 .. class:: infomark
134
135
136 **Authors**
137 Gwendoline Andres
138 Valentin Marcon
139 Veronique Jamilloux
140 Olivier Inizan
141
142 ---------------------------------------------------
143
144 .. class:: infomark
145
146 **Please cite** If you use this tool, please cite the publications listed in the citations of this tool.
147
148 ---------------------------------------------------
149
150 ==============
151 TEdenovo Lite
152 ==============
153
154 -----------
155 Description
156 -----------
157 REPET is for detection and annotation of transposable elements (TE). The light version available on Galaxy is specialised in transposable element masking.
158 TEdenovo is the first step to constitute a consensus library of TE.
159 For a detailed description of each parameter used, please consult the Galaxy page in "Shared Data > Published Pages"
160
161 -----------------
162 Workflow position
163 -----------------
164
165 **Downstream tools**
166
167 =========== ========================== =======
168 Name output file(s) format
169 =========== ========================== =======
170 TEannot GFF with TE masked gff
171 =========== ========================== =======
172
173
174 ----------
175 Input file
176 ----------
177
178 Fasta file
179 Genome file in fasta format
180
181
182 ------------
183 Output files
184 ------------
185
186 Output_name.fa
187 TE library
188
189 Output_name.log
190 Log file showing the progress of each step
191
192 Output_name.cfg
193 File to show which params have been used
194
195 Output_name.classif_stats.txt
196 File with statistics you can visualize
197
198 Output_name.classif
199 If asked, the classification file to give to next step (TEannot)
200
201
202 ---------------
203 Working example
204 ---------------
205
206 Input files
207 ===========
208
209 Fasta file
210 -----------
211
212 ::
213
214 >dmel_chr4
215 GAATTCGCGTCCGCTTACCCATGTGCCTGTGGATGCCGAACAGGAGGCGCCGTTGACGGC
216 GAATGACTTACTCAAGGGAGTAGCCAATCTGTCGGATACGCCCGGATTGGAGCTGCCCAT
217 GGAGGGTTCTACAAGAAAGCGGTGGAGGATTGCTCGCATACTGCGAGACCGTTTCTGAAG
218 GAGATGGCTCATGGAGTACCTGCCTACGCTTGTGCGCCGCGAGAAGTGGTGAAGAAGAAC
219 GGAGCCCATACACCAGGGTGATATGGTCTTCGTCTGCGATCCCGCCTTGCCCCGGCGAGA
220 GTGGTGCAAGGGCATCATGGAGGAAGTCTCCAGCAGAGCAGATGGAGCAACGGCCTATAG
221 AGGACACTGATGCTACCCGTCTCTAAGCTTGCAGTTTTGGATTTAAGTGAATCGGTTATT
222 CACGGGGTCGGGGATGTCGCGGATCGAACGGTGCAATCGATAGGCGTAATCAGTATTTCC
223 AGATAGTGATAAGATTTGGTGGATAAATGTGTGCGGGCACACTAATGGCCGCCATCGTAA
224 GCCGCGAAAAGCTTAGCGTGCATTGTCGATCGAGAGTTTGGAGGGCAAACTGCGGTAAGA
225 TAAGATTAAATAATTTGTACTGAATAATCTTAAAGAATCCTGATGGAAAGCGCCATGCAG
226 TCACATATAATATGTGCAGAGCTCTCCTC
227
228
229 Output files
230 ============
231
232 output fasta : TE library
233 -------------------------
234
235 ::
236
237 >DTX-incomp_20150313101806-B-G1-Map3
238 ATACAGCTGCGGTTAAAATAATAGCACTACTGCAGGTGGAAAGTTGATTTCCTAAAAAAA
239 ATTATTAAATGTTTATATTTTTTTAAGTCAGATTGCATGAATAATAAGTACCATATGTTG
240 GCTCTCTGAGCAAGAAATTTTTAG
241 >RLX-incomp_20150313101806-B-R12-Map3_reversed
242 ATGATAAGTAGGCAAACTATAAAAATGTTCTATTTATGGGCTGCAATAAACATGTCACCG
243 GACAGCATAAGTGGCAACTACAG
244
245
246 output config : .cfg
247 --------------------
248
249 ::
250
251 [repet_env]
252 repet_version: 2.4
253 repet_host: ******
254 repet_user: ******
255
256
257 output stats : .classif_stats.txt
258 ---------------------------------
259
260 ::
261
262 LTR incomp: 1 (10.00%)
263 LTR total (RLX): 1 (10.00%)
264
265 ClassI + one order: 1 (10.00%)
266 ClassI total (RXX): 1 (10.00%)
267
268 -------------------------Summary--------------------------------
269
270 RXX: 1 (10.00%)
271 DXX: 9 (90.00%)
272 TOTAL: 10 (100.00%)
273
274 -----------------------------NOTES------------------------------
275
276
277 output classif : Classification file
278 ------------------------------------
279
280 ::
281
282 DTX-incomp_dataset_370.dat-B-G1-Map3 542 + ok II TIR incomplete CI=37; coding=(TE_BLRtx: TC1_DM:ClassII:TIR:Tc1-Mariner: 32.59%; TE_BLRx: Mariner-1_DAn_1p:ClassII:TIR:Tc1-Mariner: 18.43%); struct=(TElength: <700bps); other=(Other_profiles: PF13936.1_HTH_38_NA_OTHER_27.0: 77.27%(77.27%); SSRCoverage=0.03)
283 DTX-comp_dataset_370.dat-B-G8-Map20 1244 + ok II TIR complete CI=50; coding=(TE_BLRtx: PROTOP:ClassII:TIR:P: 12.03%, PROTOP_A:ClassII:TIR:P: 49.14%); struct=(TElength: >1000bps; TermRepeats: termTIR: 50); other=(SSRCoverage=0.25)
284
285 ]]>
286 </help>
287
288 <citations> <!-- BibTeX entries for the two publications to cite; titles and abstracts are plain text because publisher XML tags (italic, p) are not BibTeX markup -->
289 <citation type="bibtex"><![CDATA[@article{10.1371/journal.pone.0016526,
290 author = {Flutre, Timothée AND Duprat, Elodie AND Feuillet, Catherine AND Quesneville, Hadi},
291 journal = {PLoS ONE},
292 publisher = {Public Library of Science},
293 title = {Considering Transposable Element Diversification in De Novo Annotation Approaches},
294 year = {2011},
295 month = {01},
296 volume = {6},
297 url = {http://dx.doi.org/10.1371%2Fjournal.pone.0016526},
298 pages = {e16526},
299 abstract = {
300 Transposable elements (TEs) are mobile, repetitive DNA sequences that are almost ubiquitous in prokaryotic and eukaryotic genomes. They have a large impact on genome structure, function and evolution. With the recent development of high-throughput sequencing methods, many genome sequences have become available, making possible comparative studies of TE dynamics at an unprecedented scale. Several methods have been proposed for the de novo identification of TEs in sequenced genomes. Most begin with the detection of genomic repeats, but the subsequent steps for defining TE families differ. High-quality TE annotations are available for the Drosophila melanogaster and Arabidopsis thaliana genome sequences, providing a solid basis for the benchmarking of such methods. We compared the performance of specific algorithms for the clustering of interspersed repeats and found that only a particular combination of algorithms detected TE families with good recovery of the reference sequences. We then applied a new procedure for reconciling the different clustering results and classifying TE sequences. The whole approach was implemented in a pipeline using the REPET package. Finally, we show that our combined approach highlights the dynamics of well defined TE families by making it possible to identify structural variations among their copies. This approach makes it possible to annotate TE families and to study their diversification in a single analysis, improving our understanding of TE dynamics at the whole-genome scale and for diverse species.
301 },
302 number = {1},
303 doi = {10.1371/journal.pone.0016526}
304 }]]></citation>
305 <citation type="bibtex"><![CDATA[@article{10.1371/journal.pone.0094101,
306 author = {Maumus, Florian AND Quesneville, Hadi},
307 journal = {PLoS ONE},
308 publisher = {Public Library of Science},
309 title = {Deep Investigation of Arabidopsis thaliana Junk DNA Reveals a Continuum between Repetitive Elements and Genomic Dark Matter},
310 year = {2014},
311 month = {04},
312 volume = {9},
313 url = {http://dx.doi.org/10.1371%2Fjournal.pone.0094101},
314 pages = {e94101},
315 abstract = {Eukaryotic genomes contain highly variable amounts of DNA with no apparent function. This so-called junk DNA is composed of two components: repeated and repeat-derived sequences (together referred to as the repeatome), and non-annotated sequences also known as genomic dark matter. Because of their high duplication rates as compared to other genomic features, transposable elements are predominant contributors to the repeatome and the products of their decay is thought to be a major source of genomic dark matter. Determining the origin and composition of junk DNA is thus important to help understanding genome evolution as well as host biology. In this study, we have used a combination of tools enabling to show that the repeatome from the small and reducing A. thaliana genome is significantly larger than previously thought. Furthermore, we present the concepts and results from a series of innovative approaches suggesting that a significant amount of the A. thaliana dark matter is of repetitive origin. As a tentative standard for the community, we propose a deep compendium annotation of the A. thaliana repeatome that may help addressing farther genome evolution as well as transcriptional and epigenetic regulation in this model plant.},
316 number = {4},
317 doi = {10.1371/journal.pone.0094101}
318 }]]></citation>
319 </citations>
320
321
322 </tool>