comparison tools/RepeatMasker.xml @ 0:13df908a02b0

Initial commit
author bjoern-gruening
date Wed, 11 Jan 2012 04:50:59 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:13df908a02b0
1 <tool id="repeatmasker_wrapper" name="RepeatMasker" version="0.1">
2 <description>Masks different kind of repeats</description>
3 <command>
4 ## This command is a Cheetah template, which allows some Python-based syntax.
5 ## Lines starting with two hash characters are comments. Galaxy turns newlines into spaces.
6
7 ## Create a temporary directory; RepeatMasker is given it via -dir further down.
8 #import tempfile, os
9 #set $dirname = os.path.abspath(tempfile.mkdtemp())
10 #set $input_filename = os.path.split(str($query))[-1]
11 #set $output_basename = os.path.join($dirname, $input_filename)
12 ## output_basename is the temporary directory joined with the basename of the input dataset path.
13 ## The copy steps later in this template append suffixes such as .tbl, .out and .masked to it.
14 RepeatMasker
15 -parallel 8
16 ## The three parameters below are booleans that render as their flag when checked, or as an empty string.
17 $nolow
18 $noint
19 $norna
20 ## Every species option value already contains the -species flag followed by the name.
21 #if str($species)!="all":
22 $species
23 #end if
24 ## NOTE(review): no option with the value all is visible in the species select, so the test above looks always true - confirm.
25
26 -dir $dirname
27
28 #if $adv_opts.adv_opts_selector=="advanced":
29 ## A gc value of 0, which is the default of that parameter, suppresses the -gc option.
30 #if str($adv_opts.gc)!="0":
31 -gc $adv_opts.gc
32 #end if
33 ## gccalc is a boolean that renders as -gccalc when checked, or as an empty string.
34 $adv_opts.gccalc
35 ## output_files is a multi-select; its string form is split on commas to get the list of requested extra outputs.
36 #set $output_files_list = str($adv_opts.output_files).split(',')
37 #if "gff" in $output_files_list:
38 -gff
39 #end if
40 #if "html" in $output_files_list:
41 -html
42 #end if
43 ## The remaining advanced booleans each render as their flag when checked, or as an empty string.
44 $adv_opts.slow_search
45 $adv_opts.quick_search
46 $adv_opts.rush_search
47 $adv_opts.only_alus
48 $adv_opts.is_only
49
50 #else:
51 ## Basic mode: the only extra option is -gff.
52 -gff
53
54 ## End of the advanced/basic switch.
55 #end if
56 ## The input dataset path is the last argument.
57 $query
58
59 ## Standard output and standard error of RepeatMasker are both discarded.
60 > /dev/null 2> /dev/null;
61 ## Copy the requested result files from the temporary directory to the Galaxy output datasets.
62 #if $adv_opts.adv_opts_selector=="advanced":
63 ## Advanced mode: copy only the extra outputs that were selected.
64 #if "summary" in $output_files_list:
65 ## Summary file: output_basename plus the .tbl suffix.
66 #set $summary_file = $output_basename + '.tbl'
67 cp $summary_file $output_summary;
68 #end if
69
70 #if "gff" in $output_files_list:
71 ## GFF file: output_basename plus the .out.gff suffix (the -gff option is only passed when gff is selected).
72 #set $gff_file = $output_basename + '.out.gff'
73 cp $gff_file $output_gff;
74 #end if
75
76 #if "html" in $output_files_list:
77 ## HTML file: output_basename plus the .out.html suffix (the -html option is only passed when html is selected).
78 #set $html_file = $output_basename + '.out.html'
79 cp $html_file $output_html;
80 #end if
81
82 #else:
83 ## Basic mode: the summary and GFF files are always copied.
84 ## Summary file: output_basename plus the .tbl suffix.
85 #set $summary_file = $output_basename + '.tbl'
86 cp $summary_file $output_summary;
87
88 ## GFF file: basic mode always passes -gff, so this copy is unconditional.
89 #set $gff_file = $output_basename + '.out.gff'
90 cp $gff_file $output_gff;
91
92
93 ## End of the advanced/basic switch.
94 #end if
95 #if $adv_opts.adv_opts_selector=="advanced" and "mask" in $output_files_list:
96 ## Write out the masked sequence file only when the Mask FastA output was selected in advanced mode; that is the only case in which the output_mask dataset passes its filter. The mode test comes first because output_files_list is only set in advanced mode.
97 #set $mask_sequence_file = $output_basename + '.masked'
98 cp $mask_sequence_file $output_mask;
99 #end if
100 ## Write out the standard annotation table; this step runs in both basic and advanced mode.
101 ## The default '.out' file from RepeatMasker has a 3-line header and uses spaces rather than tabs.
102 ## tail -n +4 starts output at line 4 (dropping the header) and tr -s turns each run of spaces into a single tab.
103 #set $standard_file = $output_basename + '.out'
104 tail -n +4 $standard_file | tr -s ' ' '\t' > $output_std;
105
106 ## Delete the temporary directory and everything in it. The -r option comes before the operand because only GNU rm accepts options placed after operands.
107 rm -r $dirname;
108
109
110 </command>
111 <inputs>
112 <param name="query" type="data" format="fasta" label="Nucleotide query sequence(s)"/>
113 <!-- The FASTA dataset above is passed to RepeatMasker as its last argument. The boolean parameters below expand to their truevalue flag when checked and to an empty string otherwise. -->
114 <param name="nolow" type="boolean" label="No low complexity DNA" truevalue="-nolow" falsevalue="" checked="false" help="Does not mask low_complexity DNA or simple repeats."/>
115 <param name="noint" type="boolean" label="No interspersed repeats" truevalue="-noint" falsevalue="" checked="false" help="Only masks low complex/simple repeats (no interspersed repeats)."/>
116
117 <param name="norna" type="boolean" label="No small RNA genes" truevalue="-norna" falsevalue="" checked="false" help="Does not mask small RNA (pseudo) genes."/>
118
119 <!--
120 Species or clade of the input sequence. The name must be a valid NCBI Taxonomy Database species name that is
121 contained in the RepeatMasker repeat database; the list below is not complete. Each option value carries the
122 whole command-line fragment (flag plus name, multi-word names single-quoted), which the command inserts verbatim.
123 -->
124 <param name="species" type="select" label="Species" help="The list is not complete, if you need other species contact your administrator.">
125 <option value="-species anopheles">anopheles</option>
126 <option value="-species arabidopsis">arabidopsis</option>
127 <option value="-species artiodactyl">artiodactyl</option>
128 <option value="-species aspergillus">aspergillus</option>
129 <option value="-species carnivore">carnivore</option>
130 <option value="-species cat">cat</option>
131 <option value="-species chicken">chicken</option>
132 <option value="-species 'ciona intestinalis'">ciona intestinalis</option>
133 <option value="-species 'ciona savignyi'">ciona savignyi</option>
134 <option value="-species cow">cow</option>
135 <option value="-species danio">danio</option>
136 <option value="-species diatoaea">diatoaea</option>
137 <option value="-species dog">dog</option>
138 <option value="-species drosophila">drosophila</option>
139 <option value="-species elegans">elegans</option>
140 <option value="-species fugu">fugu</option>
141 <option value="-species fungi" selected="true">fungi</option>
142 <option value="-species human">human</option>
143 <option value="-species maize">maize</option>
144 <option value="-species mammal">mammal</option>
145 <option value="-species mouse">mouse</option>
146 <option value="-species pig">pig</option>
147 <option value="-species rat">rat</option>
148 <option value="-species rice">rice</option>
149 <option value="-species rodentia">rodentia</option>
150 <option value="-species wheat">wheat</option>
151 </param>
152
153 <conditional name="adv_opts">
154 <param name="adv_opts_selector" type="select" label="Advanced Options">
155 <option value="basic" selected="True">Hide Advanced Options</option>
156 <option value="advanced">Show Advanced Options</option>
157 </param>
158 <when value="basic" />
159 <when value="advanced">
160 <!-- Every boolean below expands to its truevalue flag when checked and to an empty string otherwise; the command template inserts the result verbatim. -->
161
162 <param name="is_only" type="boolean" label="Mask only E coli insertion elements" truevalue="-is_only" falsevalue="" checked="false" help="Only clips E coli insertion elements out of fasta and .qual files."/>
163
164 <!-- The three search-speed switches are independent checkboxes; nothing in this file prevents ticking more than one of them. -->
165 <param name="slow_search" type="boolean" label="Slow search" truevalue="-s" falsevalue="" checked="false" help="0-5% more sensitive, 2-3 times slower than default."/>
166 <param name="quick_search" type="boolean" label="Quick search" truevalue="-q" falsevalue="" checked="false" help="5-10% less sensitive, 2-5 times faster than default."/>
167 <param name="rush_search" type="boolean" label="Rush search" truevalue="-qq" falsevalue="" checked="false" help="about 10% less sensitive, 4->10 times faster than default."/>
168
169 <param name="only_alus" type="boolean" label="Only Alus" truevalue="-alu" falsevalue="" checked="false" help="Only masks Alus (and 7SLRNA, SVA and LTR5)(only for primate DNA)."/>
170
171 <param name="gccalc" type="boolean" label="Use GC-dependent matrices, automatically" truevalue="-gccalc" falsevalue="" checked="true" help="RepeatMasker calculates the GC content even for batch files/small seqs"/>
172
173 <param name="output_files" type="select" multiple="true" label="Additional output files">
174 <option selected="true" value="summary">Summary file</option>
175 <option value="gff">GFF file</option>
176 <option value="html">HTML file</option>
177 <option value="mask">Mask FastA file</option>
178 </param>
179
180 <!-- gc: the command template only passes -gc when this value is not 0, so 0 acts as the "not set" sentinel; the help text now says so. -->
181 <param name="gc" type="integer" value="0" label="Use GC-dependent matrices" help="Use matrices calculated for 'number' percentage background GC level. Leave at 0 (the default) to not set a fixed GC level.">
182 <validator type="in_range" min="0" max="100" />
183 <!-- One validator carries both bounds (previously two half-open validators), so a rejected value is checked against the full 0 to 100 range at once. -->
184 </param>
185
186 </when>
187 </conditional>
188
189 </inputs>
190 <outputs> <!-- Each filter below is a Python expression over the input parameters; the dataset is only created when it evaluates to true. -->
191 <data name="output_std" format="tabular" label="${tool.name} on ${on_string}: Standard" /> <!-- No filter: always created. -->
192 <data name="output_mask" format="fasta" label="${tool.name} on ${on_string}: Mask sequence"> <!-- Advanced mode with the mask extra selected. -->
193 <filter>
194 (adv_opts['adv_opts_selector'] == 'advanced' and 'mask' in adv_opts['output_files'])
195 </filter>
196 </data>
197 <data name="output_summary" format="txt" label="${tool.name} on ${on_string}: Summary"> <!-- Basic mode, or advanced mode with the summary extra selected. -->
198 <filter>(
199 (adv_opts['adv_opts_selector'] == 'advanced' and 'summary' in adv_opts['output_files'])
200 or
201 (adv_opts['adv_opts_selector'] == 'basic')
202 )
203 </filter>
204 </data>
205 <data name="output_html" format="html" label="${tool.name} on ${on_string}: HTML"> <!-- Advanced mode with the html extra selected. -->
206 <filter>(adv_opts['adv_opts_selector'] == 'advanced' and 'html' in adv_opts['output_files'])</filter>
207 </data>
208 <data name="output_gff" format="gff" label="${tool.name} on ${on_string}: GFF"> <!-- Also created in basic mode: the command always passes -gff and copies the GFF file there, so the filter mirrors output_summary. -->
209 <filter>
210 ((adv_opts['adv_opts_selector'] == 'advanced' and 'gff' in adv_opts['output_files']) or (adv_opts['adv_opts_selector'] == 'basic'))
211 </filter>
212 </data>
213 </outputs>
214 <requirements>
215 <requirement type="binary">RepeatMasker</requirement>
216 </requirements>
217 <help>
218
219 .. class:: warningmark
220
221
222 -----
223
224 **What it does**
225
226 RepeatMasker is a program that screens DNA sequences for *interspersed repeats*
227 and *low complexity* DNA sequences. The output of the program is a detailed
228 annotation of the repeats that are present in the query sequence as well as a
229 modified version of the query sequence in which all the annotated repeats have
230 been masked (default: replaced by Ns).
231
232 -----
233
234 **How to read the results**
235
236
237
238 The annotation file contains the cross_match output lines. It lists all best matches
239 (above a set minimum score) between the query sequence and any of the sequences in
240 the repeat database or with low complexity DNA. The term "best matches" reflects
241 that a match is not shown if its domain is over 80% contained within the domain
242 of a higher scoring match, where the "domain" of a match is the region in
243 the query sequence that is defined by the alignment start and stop. These domains
244 have been masked in the returned masked sequence file. In the output, matches are
245 ordered by query name, and for each query by position of the start of the alignment.
246
247 Example:
248
249 ======== ========= ========= ========= ========== =========== ========= ========= ============ =============== =================== ================ ============== ======= ==
250 SW score perc div. perc del. perc ins. query seq. q-pos begin q-pos end (left) w complement matching repeat repeat class/family repeat-pos begin repeat-pos end (left) ID
251 ======== ========= ========= ========= ========== =========== ========= ========= ============ =============== =================== ================ ============== ======= ==
252 1306 15.6 6.2 0.0 HSU08988 6563 6781 \(22462) C MER7A DNA/MER2_type 336 103 \(0) 1
253 12204 10.0 2.4 1.8 HSU08988 6782 7714 \(21529) C TIGGER1 DNA/MER2_type 2418 1493 \(0) 2
254 279 3.0 0.0 0.0 HSU08988 7719 7751 \(21492) + (TTTTA)n Simple_repeat 1 33 \(0) 3
255 1765 13.4 6.5 1.8 HSU08988 7752 8022 \(21221) C AluSx SINE/Alu 289 1 \(23) 4
256 12204 10.0 2.4 1.8 HSU08988 8023 8694 \(20549) C TIGGER1 DNA/MER2_type 1493 827 \(925) 5
257 1984 11.1 0.3 0.7 HSU08988 8695 9000 \(20243) C AluSg SINE/Alu 305 1 \(5) 6
258 12204 10.0 2.4 1.8 HSU08988 9001 9695 \(19548) C TIGGER1 DNA/MER2_type 827 2 \(1591) 7
259 711 21.2 1.4 0.0 HSU08988 9696 9816 \(19427) C MER7A DNA/MER2_type 122 2 \(224) 8
260 ======== ========= ========= ========= ========== =========== ========= ========= ============ =============== =================== ================ ============== ======= ==
261
262 This is a sequence in which a Tigger1 DNA transposon has integrated into a MER7 DNA transposon copy.
263 Subsequently two Alus integrated in the Tigger1 sequence. The simple repeat is derived from the
264 poly A of the Alu element. The first line is interpreted like this:
265
266 :Table description:
267
268 1. **1306** = Smith-Waterman score of the match, usually complexity adjusted
269 The SW scores are not always directly comparable. Sometimes
270 the complexity adjustment has been turned off, and a variety of
271 scoring-matrices are used.
272
273 #. **15.6** = % substitutions in matching region compared to the consensus
274 #. **6.2** = % of bases opposite a gap in the query sequence (deleted bp)
275 #. **0.0** = % of bases opposite a gap in the repeat consensus (inserted bp)
276 #. **HSU08988** = name of query sequence
277 #. **6563** = starting position of match in query sequence
278 #. **6781** = ending position of match in query sequence
279 #. **(22462)** = no. of bases in query sequence past the ending position of match
280 #. **C** = match is with the Complement of the consensus sequence in the database
281 #. **MER7A** = name of the matching interspersed repeat
282 #. **DNA/MER2_type** = the class of the repeat, in this case a DNA transposon fossil of the MER2 group (see below for list and references)
283 #. **336** = starting position of match in database sequence (using top-strand numbering)
284 #. **103** = ending position of match in database sequence
285 #. **(0)** = no. of bases in (complement of) the repeat consensus sequence prior to beginning of the match (so 0 means that the match extended all the way to the end of the repeat consensus sequence)
286 #. **1** = Identifier
287
288 An asterisk (\*) in the final column (no example shown) indicates that there is
289 a higher-scoring match whose domain partly (&lt;80%) includes the domain of this match.
290
291 Note that the SW score and divergence numbers for the three Tigger1 lines are identical.
292 This is because the information is derived from a single alignment (the Alus were deleted
293 from the query before the alignment with the Tigger element was performed).
294 The program makes educated guesses about many fragments if they are derived from
295 the same element (e.g. it knows that the MER7A fragments represent one insert).
296 In a next version I can identify each element with a unique ID, if interest exists
297 (this could help to represent repeats cleaner in graphic displays).
298
299
300 -------
301
302 **References**
303
304 Smit, AFA, Hubley, R and Green, P. RepeatMasker Open-3.0.
305
306 http://www.repeatmasker.org/
307
308 </help>
309 </tool>