|
0
|
1 <tool id="teannot" name="REPET Lite - TEannot" version="1.5.0">
|
|
|
2
|
|
|
<!-- [REQUIRED] One-line summary of the tool, displayed right after the tool name -->
<description> Genome annotation for masking transposable elements</description>
|
|
|
5
|
|
|
<!-- [OPTIONAL] Third-party programs and packages the tool needs in order to run:
     a python binary and the repet package, version 2.5. -->
<requirements>
    <requirement type="binary">python</requirement>
    <requirement type="package" version="2.5">repet</requirement>
</requirements>
|
|
|
11
|
|
|
<!-- [STRONGLY RECOMMENDED] Exit code rules: any exit status other than zero is fatal -->
<stdio>
    <!-- exit codes 1 and above -->
    <exit_code range="1:" level="fatal"/>
    <!-- exit codes -1 and below -->
    <exit_code range=":-1" level="fatal"/>
</stdio>
|
|
|
19
|
|
|
20
|
|
|
<!-- [OPTIONAL] Command to be executed to get the tool's version string.
     No command is configured here: the element only holds a commented-out placeholder. -->
<version_command>
    <!-- tool_binary -v -->
</version_command>
|
|
|
27
|
|
|
<!-- [REQUIRED] The command to execute.
     TEannot.sh receives nine positional arguments, in this order:
       1. fasta                     (input FASTA dataset)
       2. library                   (FASTA TE library dataset)
       3. outputfile                (GFF3 output)
       4. outputmaskedfile          (masked FASTA output)
       5. outputlog                 (log output)
       6. outputconfig              (config output)
       7. outputstatsfile when withStats is "yes"; otherwise the withStats value itself
       8. classif                   (optional classification dataset)
       9. outputmasked_SSRmaskfile  (SSR-masked FASTA output)
     Every substituted value is single-quoted so that a value containing whitespace or
     shell metacharacters is still passed to the script as exactly one argument.
     The template is wrapped in CDATA so the quoting needs no XML escaping. -->
<command interpreter="bash"><![CDATA[
    TEannot.sh '$fasta' '$library' '$outputfile' '$outputmaskedfile' '$outputlog' '$outputconfig'
    #if str( $withStats ) == "yes":
        '$outputstatsfile'
    #else:
        '$withStats'
    #end if
    '$classif'
    '$outputmasked_SSRmaskfile'
]]></command>
|
|
|
39
|
|
|
<!-- [REQUIRED] Input datasets and user-settable parameters -->
<inputs>
    <!-- Mandatory FASTA datasets -->
    <param name="fasta"
           type="data"
           format="fasta"
           optional="false"
           label="Fasta alignment input"/>
    <param name="library"
           type="data"
           format="fasta"
           optional="false"
           label="Fasta TE library [from TEdenovo]"/>

    <!-- Optional tabular classification dataset -->
    <param name="classif"
           type="data"
           format="tabular"
           optional="true"
           label="Classification file"
           help="To add classification information in the output file."/>

    <!-- Free text used in the output dataset labels; when left empty the labels use the fasta dataset name -->
    <param name="label"
           type="text"
           label="Output name"/>

    <!-- Controls whether the statistics output is produced ("yes" is preselected) -->
    <param name="withStats"
           type="select"
           label="Get statistical file">
        <option value="yes" selected="true">Yes</option>
        <option value="no">No</option>
    </param>
</inputs>
|
|
|
51
|
|
|
<!-- [REQUIRED] Output files.
     Each label is "TEannot-" followed by the "Output name" parameter (or the name of the
     fasta input when that parameter is empty) and a per-output suffix.
     The former type="data" attribute was dropped from every <data> element: it appears not
     to be a <data> attribute in the Galaxy tool schema (confirm against the schema reference). -->
<outputs>
    <data name="outputlog" format="txt" label="TEannot-#if str($label)=='' then $fasta.name else $label #.log" />
    <data name="outputfile" format="gff3" label="TEannot-#if str($label)=='' then $fasta.name else $label #.gff3" />
    <data name="outputmaskedfile" format="fasta" label="TEannot-#if str($label)=='' then $fasta.name else $label #_masked.fa" />
    <data name="outputmasked_SSRmaskfile" format="fasta" label="TEannot-#if str($label)=='' then $fasta.name else $label #_SSRmask.fa" />
    <!-- Statistics output: only created when the withStats parameter is "yes" -->
    <data name="outputstatsfile" format="txt" label="TEannot-#if str($label)=='' then $fasta.name else $label #.stats">
        <filter>(withStats == 'yes')</filter>
    </data>
    <data name="outputconfig" format="txt" label="TEannot-#if str($label)=='' then $fasta.name else $label #.cfg" />
</outputs>
|
|
|
63
|
|
|
64
|
|
|
<!-- [OPTIONAL] Tests to be run manually by the Galaxy admin -->
<tests>
    <!-- [HELP] Test files have to be in the ~/test-data directory -->

    <!-- Case 1: fasta + library only; withStats is not set, so its preselected "yes" applies
         and the statistics output is checked as well. -->
    <test>
        <param name="fasta" value="alignment.fa"/>
        <param name="library" value="libTE.fa"/>
        <output name="outputfile">
            <assert_contents>
                <has_line_matching expression="^##gff-version 3"/>
                <!-- NOTE(review): the file is also expected to contain the "##gff-version 3"
                     header line; confirm has_n_columns counts a data line rather than that header. -->
                <has_n_columns n="9"/>
            </assert_contents>
        </output>
        <output name="outputmaskedfile">
            <assert_contents>
                <has_line_matching expression="^>\w+"/>
                <has_line_matching expression="[ACTGX]{60}"/>
            </assert_contents>
        </output>
        <output name="outputstatsfile">
            <assert_contents>
                <has_line_matching expression="^nb of sequences:"/>
                <has_line_matching expression="^mean of median length percentage of all families:"/>
            </assert_contents>
        </output>
        <output name="outputlog">
            <assert_contents>
                <has_line_matching expression="^step 7 finished successfully"/>
                <has_line_matching expression="^END time: \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}"/>
                <has_line_matching expression="^Writing fasta file"/>
            </assert_contents>
        </output>
        <output name="outputconfig">
            <assert_contents>
                <has_line_matching expression="^project_name: \d{8}"/>
                <has_line_matching expression="^repet_version: 2.5"/>
                <has_line_matching expression="^tmpDir:"/>
                <has_line_matching expression="^clean: yes"/>
            </assert_contents>
        </output>
    </test>

    <!-- Case 2: adds a classification file and sets withStats to "no";
         no statistics output is asserted here. -->
    <test>
        <param name="fasta" value="alignment.fa"/>
        <param name="library" value="libTE.fa"/>
        <param name="classif" value="libTE.classif"/>
        <param name="withStats" value="no"/>
        <output name="outputfile">
            <assert_contents>
                <has_line_matching expression="^##gff-version 3"/>
                <has_n_columns n="9"/>
            </assert_contents>
        </output>
        <output name="outputmaskedfile">
            <assert_contents>
                <has_line_matching expression="^>\w+"/>
                <has_line_matching expression="[ACTGX]{60}"/>
            </assert_contents>
        </output>
        <output name="outputlog">
            <assert_contents>
                <has_line_matching expression="^step 7 finished successfully"/>
                <has_line_matching expression="^END time: \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}"/>
                <has_line_matching expression="^Writing fasta file"/>
            </assert_contents>
        </output>
        <output name="outputconfig">
            <assert_contents>
                <has_line_matching expression="^project_name: \d{8}"/>
                <has_line_matching expression="^repet_version: 2.5"/>
                <has_line_matching expression="^tmpDir:"/>
                <has_line_matching expression="^clean: yes"/>
            </assert_contents>
        </output>
    </test>
</tests>
|
|
|
139
|
|
|
140 <!-- [OPTIONAL] Help displayed in Galaxy -->
|
|
|
141 <help>
|
|
|
142 <![CDATA[
|
|
|
143 .. class:: infomark
|
|
|
144
|
|
|
145 **Authors**
|
|
|
146 Gwendoline Andres
|
|
|
147 Valentin Marcon
|
|
|
148 Veronique Jamilloux
|
|
|
149 Olivier Inizan
|
|
|
150
|
|
|
151 ---------------------------------------------------
|
|
|
152
|
|
|
153 .. class:: infomark
|
|
|
154
|
|
|
155 **Please cite** If you use this tool, please cite the REPET publications listed in the citations of this tool.
|
|
|
156
|
|
|
157 ---------------------------------------------------
|
|
|
158
|
|
|
159 ==============
|
|
|
160 TEannot Lite
|
|
|
161 ==============
|
|
|
162
|
|
|
163 -----------
|
|
|
164 Description
|
|
|
165 -----------
|
|
|
166
|
|
|
167 REPET is for detection and annotation of transposable elements (TE). The light version available on Galaxy is specialised in transposable element masking.
|
|
|
168 TEannot is the second and last step to mask TE on the genome.
|
|
|
169 For a detailed description of each parameter used, please consult the Galaxy page in "Shared Data > Published Pages"
|
|
|
170
|
|
|
171 -----------------
|
|
|
172 Workflow position
|
|
|
173 -----------------
|
|
|
174
|
|
|
175 **Upstream tools**
|
|
|
176
|
|
|
177 =========== ========================== =======
|
|
|
178 Name output file(s) format
|
|
|
179 =========== ========================== =======
|
|
|
180 TEdenovo Fasta file with TE library fasta
|
|
|
181 =========== ========================== =======
|
|
|
182
|
|
|
183
|
|
|
184 ----------
|
|
|
185 Input file
|
|
|
186 ----------
|
|
|
187
|
|
|
188 Fasta file
|
|
|
189 Genome file in FASTA format
|
|
|
190
|
|
|
191 Library file
|
|
|
192 Fasta file with a library of transposable elements from TEdenovo.
|
|
|
193
|
|
|
194 ----------
|
|
|
195 Parameters
|
|
|
196 ----------
|
|
|
197
|
|
|
198 Masked file
|
|
|
199 To get an additional output file: Masked fasta file
|
|
|
200
|
|
|
201
|
|
|
202 ------------
|
|
|
203 Output files
|
|
|
204 ------------
|
|
|
205
|
|
|
206 Output_gff3
|
|
|
207 GFF3 file with transposable elements
|
|
|
208 Output_masked_fasta
|
|
|
209 Input fasta file masked with TE infos
|
|
|
210 Output_config
|
|
|
211 File to show which params have been used
|
|
|
212 Output_stats
|
|
|
213 File with statistics on TE library
|
|
|
214
|
|
|
215 ------------
|
|
|
216 Dependencies
|
|
|
217 ------------
|
|
|
218
|
|
|
219
|
|
|
220 ---------------------------------------------------
|
|
|
221
|
|
|
222 ---------------
|
|
|
223 Working example
|
|
|
224 ---------------
|
|
|
225
|
|
|
226 Input files
|
|
|
227 ===========
|
|
|
228
|
|
|
229 Fasta file
|
|
|
230 ----------
|
|
|
231
|
|
|
232 ::
|
|
|
233
|
|
|
234 >dmel_chr4
|
|
|
235 GAGAACCGTCCTGTAAGTACTCTTGCTTTAAATACGAAAGTAATACTAATCCATGACGCTTAAGTCGAAGAGAGAATAAGTCAATATTTAATTGGACTCATCGCTTATGTTCATCATGAATCTATAGTTAACTTGATGTTGTGCTCCATGTACGATATAAAAAGTTAGATA
|
|
|
236
|
|
|
237
|
|
|
238 Fasta Library
|
|
|
239 -------------
|
|
|
240
|
|
|
241 ::
|
|
|
242
|
|
|
243 >DTX-incomp_20150325110123-B-G1-Map3
|
|
|
244 ATACAGCTGCGGTTAAAATAATAGCACTACTGCAGGTGGAAAGTTGATTTCCTAAAAAAA
|
|
|
245 ATTATTAAATGTTTATATTTTTTTAAGTCAGATTGCATGAATAATAAGTACCATATGTTG
|
|
|
246 GCTCTCTGAGCAAGAAATTTTTAGTCTCT
|
|
|
247 >DTX-incomp_20150325110123-B-P1.0-Map3
|
|
|
248 CTTGTGTCCGCACTTCGTGCCTCAAGATATGAACAAAGCAAAGACACTAGAATAATTCTA
|
|
|
249 GTGTATTACTTTGATATTACTTTTGCAATAAACAGTTATCATATTTTTA
|
|
|
250
|
|
|
251
|
|
|
252 Output files
|
|
|
253 ============
|
|
|
254
|
|
|
255 GFF3 output :
|
|
|
256 -------------
|
|
|
257
|
|
|
258 ::
|
|
|
259
|
|
|
260 ##gff-version 3
|
|
|
261 dmel_chr4 test_REPET_TEs match 971161 971469 0.0 - . ID=ms1_dmel_chr4_DTX-incomp_DmelChr4-B-G1-Map3;Target=DTX-incomp_DmelChr4-B-G1-Map3 45 542
|
|
|
262 dmel_chr4 test_REPET_TEs match_part 971161 971271 0.0 - . ID=mp1-1_dmel_chr4_DTX-incomp_DmelChr4-B-G1-Map3;Parent=ms1_dmel_chr4_DTX-incomp_DmelChr4-B-G1-Map3;Target=DTX-incomp_DmelChr4-B-G1-Map3 435 542;Identity=94.4
|
|
|
263
|
|
|
264 Masked fasta output :
|
|
|
265 ---------------------
|
|
|
266
|
|
|
267 ::
|
|
|
268
|
|
|
269 >dmel_chr4
|
|
|
270 GAGAACCGTCCTGTAAGTACTCTTGCTTTAAATACGXXXXXXXXXXXXXXXXXXXXACGCTTAAGTCGAAGAGAGAATAAGTCAATATTTAATTGGACTCATCGCTTATGTTCATCATGAATCTATAGTTAACTTGATGTTGTGCTCCATGTACGATATAAAAAGTTAGATA
|
|
|
271
|
|
|
272 Config file :
|
|
|
273 -------------
|
|
|
274
|
|
|
275 ::
|
|
|
276
|
|
|
277 [repet_env]
|
|
|
278 repet_version: 2.4
|
|
|
279 repet_host: ******
|
|
|
280 repet_user: ******
|
|
|
281
|
|
|
282 Statistics file :
|
|
|
283 -----------------
|
|
|
284
|
|
|
285 ::
|
|
|
286
|
|
|
287 nb of sequences: 8
|
|
|
288 nb of matched sequences: 8
|
|
|
289 cumulative coverage: 133656 bp
|
|
|
290
|
|
|
291 ]]>
|
|
|
292 </help>
|
|
|
293
|
|
|
294 <citations>
|
|
|
295 <citation type="bibtex"><![CDATA[@article{10.1371/journal.pone.0016526,
|
|
|
296 author = {Flutre, Timothée AND Duprat, Elodie AND Feuillet, Catherine AND Quesneville, Hadi},
|
|
|
297 journal = {PLoS ONE},
|
|
|
298 publisher = {Public Library of Science},
|
|
|
299 title = {Considering Transposable Element Diversification in <italic>De Novo</italic> Annotation Approaches},
|
|
|
300 year = {2011},
|
|
|
301 month = {01},
|
|
|
302 volume = {6},
|
|
|
303 url = {http://dx.doi.org/10.1371%2Fjournal.pone.0016526},
|
|
|
304 pages = {e16526},
|
|
|
305 abstract = {
|
|
|
306 <p>Transposable elements (TEs) are mobile, repetitive DNA sequences that are almost ubiquitous in prokaryotic and eukaryotic genomes. They have a large impact on genome structure, function and evolution. With the recent development of high-throughput sequencing methods, many genome sequences have become available, making possible comparative studies of TE dynamics at an unprecedented scale. Several methods have been proposed for the <italic>de novo</italic> identification of TEs in sequenced genomes. Most begin with the detection of genomic repeats, but the subsequent steps for defining TE families differ. High-quality TE annotations are available for the <italic>Drosophila melanogaster</italic> and <italic>Arabidopsis thaliana</italic> genome sequences, providing a solid basis for the benchmarking of such methods. We compared the performance of specific algorithms for the clustering of interspersed repeats and found that only a particular combination of algorithms detected TE families with good recovery of the reference sequences. We then applied a new procedure for reconciling the different clustering results and classifying TE sequences. The whole approach was implemented in a pipeline using the REPET package. Finally, we show that our combined approach highlights the dynamics of well defined TE families by making it possible to identify structural variations among their copies. This approach makes it possible to annotate TE families and to study their diversification in a single analysis, improving our understanding of TE dynamics at the whole-genome scale and for diverse species.</p>
|
|
|
307 },
|
|
|
308 number = {1},
|
|
|
309 doi = {10.1371/journal.pone.0016526}
|
|
|
310 }]]></citation>
|
|
|
311 <citation type="bibtex"><![CDATA[@article{10.1371/journal.pone.0094101,
|
|
|
312 author = {Maumus, Florian AND Quesneville, Hadi},
|
|
|
313 journal = {PLoS ONE},
|
|
|
314 publisher = {Public Library of Science},
|
|
|
315 title = {Deep Investigation of <italic>Arabidopsis thaliana</italic> Junk DNA Reveals a Continuum between Repetitive Elements and Genomic Dark Matter},
|
|
|
316 year = {2014},
|
|
|
317 month = {04},
|
|
|
318 volume = {9},
|
|
|
319 url = {http://dx.doi.org/10.1371%2Fjournal.pone.0094101},
|
|
|
320 pages = {e94101},
|
|
|
321 abstract = {<p>Eukaryotic genomes contain highly variable amounts of DNA with no apparent function. This so-called junk DNA is composed of two components: repeated and repeat-derived sequences (together referred to as the repeatome), and non-annotated sequences also known as genomic dark matter. Because of their high duplication rates as compared to other genomic features, transposable elements are predominant contributors to the repeatome and the products of their decay is thought to be a major source of genomic dark matter. Determining the origin and composition of junk DNA is thus important to help understanding genome evolution as well as host biology. In this study, we have used a combination of tools enabling to show that the repeatome from the small and reducing <italic>A. thaliana</italic> genome is significantly larger than previously thought. Furthermore, we present the concepts and results from a series of innovative approaches suggesting that a significant amount of the <italic>A. thaliana</italic> dark matter is of repetitive origin. As a tentative standard for the community, we propose a deep compendium annotation of the <italic>A. thaliana</italic> repeatome that may help addressing farther genome evolution as well as transcriptional and epigenetic regulation in this model plant.</p>},
|
|
|
322 number = {4},
|
|
|
323 doi = {10.1371/journal.pone.0094101}
|
|
|
324 }]]></citation>
|
|
|
325 </citations>
|
|
|
326
|
|
|
327 </tool>
|