Mercurial > repos > bjoern-gruening > repeat_masker
comparison tools/RepeatMasker.xml @ 0:13df908a02b0
Initial commit
author | bjoern-gruening |
---|---|
date | Wed, 11 Jan 2012 04:50:59 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:13df908a02b0 |
---|---|
1 <tool id="repeatmasker_wrapper" name="RepeatMasker" version="0.1"> | |
2 <description>Masks different kind of repeats</description> | |
3 <command> | |
4 ## The command is a Cheetah template which allows some Python based syntax. | |
5 ## Lines starting hash hash are comments. Galaxy will turn newlines into spaces | |
6 | |
7 ## create temp directory | |
8 #import tempfile, os | |
9 #set $dirname = os.path.abspath(tempfile.mkdtemp()) | |
10 #set $input_filename = os.path.split(str($query))[-1] | |
11 #set $output_basename = os.path.join($dirname, $input_filename) | |
12 | |
13 ## Use the cores Galaxy allots to the job (GALAXY_SLOTS); fall back to 8 when it is unset | |
14 RepeatMasker | |
15 -parallel \${GALAXY_SLOTS:-8} | |
16 | |
17 $nolow | |
18 $noint | |
19 $norna | |
20 | |
21 #if str($species)!="all": | |
22 $species | |
23 #end if | |
24 | |
25 | |
26 -dir $dirname | |
27 | |
28 #if $adv_opts.adv_opts_selector=="advanced": | |
29 | |
30 #if str($adv_opts.gc)!="0": | |
31 -gc $adv_opts.gc | |
32 #end if | |
33 | |
34 $adv_opts.gccalc | |
35 | |
36 #set $output_files_list = str($adv_opts.output_files).split(',') | |
37 #if "gff" in $output_files_list: | |
38 -gff | |
39 #end if | |
40 #if "html" in $output_files_list: | |
41 -html | |
42 #end if | |
43 | |
44 $adv_opts.slow_search | |
45 $adv_opts.quick_search | |
46 $adv_opts.rush_search | |
47 $adv_opts.only_alus | |
48 $adv_opts.is_only | |
49 | |
50 #else: | |
51 ## Set defaults | |
52 -gff | |
53 | |
54 ## End of advanced options: | |
55 #end if | |
56 | |
57 $query | |
58 | |
59 ## Discard both stdout and stderr of RepeatMasker | |
60 > /dev/null 2> /dev/null; | |
61 ## Copy the output files to galaxy | |
62 #if $adv_opts.adv_opts_selector=="advanced": | |
63 | |
64 #if "summary" in $output_files_list: | |
65 ## Write out the summary file (default) | |
66 #set $summary_file = $output_basename + '.tbl' | |
67 cp $summary_file $output_summary; | |
68 #end if | |
69 | |
70 #if "gff" in $output_files_list: | |
71 ## Write out the gff file (default) | |
72 #set $gff_file = $output_basename + '.out.gff' | |
73 cp $gff_file $output_gff; | |
74 #end if | |
75 | |
76 #if "html" in $output_files_list: | |
77 ## Write out the html file | |
78 #set $html_file = $output_basename + '.out.html' | |
79 cp $html_file $output_html; | |
80 #end if | |
81 | |
82 #else: | |
83 | |
84 ## Write out the summary file (default) | |
85 #set $summary_file = $output_basename + '.tbl' | |
86 cp $summary_file $output_summary; | |
87 | |
88 ## Write out the gff file (default) | |
89 #set $gff_file = $output_basename + '.out.gff' | |
90 cp $gff_file $output_gff; | |
91 | |
92 | |
93 ## End of advanced options: | |
94 #end if | |
95 ## Write out mask sequence file, only when it was requested in the advanced options | |
96 #if $adv_opts.adv_opts_selector=="advanced" and "mask" in $output_files_list: | |
97 #set $mask_sequence_file = $output_basename + '.masked' | |
98 cp $mask_sequence_file $output_mask; | |
99 #end if | |
100 ## Write out standard file (default) | |
101 ## The default '.out' file from RepeatMasker has a 3-line header and spaces rather | |
102 ## than tabs. Remove the header and replace the whitespaces with tab | |
103 #set $standard_file = $output_basename + '.out' | |
104 tail -n +4 $standard_file | tr -s ' ' '\t' > $output_std; | |
105 | |
106 ## Delete all temporary files (options placed before the operand for portability) | |
107 rm -r $dirname; | |
108 | |
109 | |
110 </command> | |
111 <inputs> | |
112 <param name="query" type="data" format="fasta" label="Nucleotide query sequence(s)"/> | |
113 | |
114 <param name="nolow" type="boolean" label="No low complexity DNA" truevalue="-nolow" falsevalue="" checked="false" help="Does not mask low_complexity DNA or simple repeats."/> | |
115 <param name="noint" type="boolean" label="No interspersed repeats" truevalue="-noint" falsevalue="" checked="false" help="Only masks low complex/simple repeats (no interspersed repeats)."/> | |
116 | |
117 <param name="norna" type="boolean" label="No small RNA genes" truevalue="-norna" falsevalue="" checked="false" help="Does not mask small RNA (pseudo) genes."/> | |
118 | |
119 <!-- | |
120 Specify the species or clade of the input sequence. The species name | |
121 must be a valid NCBI Taxonomy Database species name and be contained | |
122 in the RepeatMasker repeat database. The following collection is not complete. | |
123 --> | |
124 <param name="species" type="select" label="Species" help="The list is not complete, if you need other species contact your administrator."> | |
125 <option value="-species anopheles">anopheles</option> | |
126 <option value="-species arabidopsis">arabidopsis</option> | |
127 <option value="-species artiodactyl">artiodactyl</option> | |
128 <option value="-species aspergillus">aspergillus</option> | |
129 <option value="-species carnivore">carnivore</option> | |
130 <option value="-species cat">cat</option> | |
131 <option value="-species chicken">chicken</option> | |
132 <option value="-species 'ciona intestinalis'">ciona intestinalis</option> | |
133 <option value="-species 'ciona savignyi'">ciona savignyi</option> | |
134 <option value="-species cow">cow</option> | |
135 <option value="-species danio">danio</option> | |
136 <option value="-species diatoaea">diatoaea</option> | |
137 <option value="-species dog">dog</option> | |
138 <option value="-species drosophila">drosophila</option> | |
139 <option value="-species elegans">elegans</option> | |
140 <option value="-species fugu">fugu</option> | |
141 <option value="-species fungi" selected="true">fungi</option> | |
142 <option value="-species human">human</option> | |
143 <option value="-species maize">maize</option> | |
144 <option value="-species mammal">mammal</option> | |
145 <option value="-species mouse">mouse</option> | |
146 <option value="-species pig">pig</option> | |
147 <option value="-species rat">rat</option> | |
148 <option value="-species rice">rice</option> | |
149 <option value="-species rodentia">rodentia</option> | |
150 <option value="-species wheat">wheat</option> | |
151 </param> | |
152 | |
153 <conditional name="adv_opts"> | |
154 <param name="adv_opts_selector" type="select" label="Advanced Options"> | |
155 <option value="basic" selected="True">Hide Advanced Options</option> | |
156 <option value="advanced">Show Advanced Options</option> | |
157 </param> | |
158 <when value="basic" /> | |
159 <when value="advanced"> | |
160 | |
161 | |
162 <param name="is_only" type="boolean" label="Mask only E coli insertion elements" truevalue="-is_only" falsevalue="" checked="false" help="Only clips E coli insertion elements out of fasta and .qual files."/> | |
163 | |
164 | |
165 <param name="slow_search" type="boolean" label="Slow search" truevalue="-s" falsevalue="" checked="false" help="0-5% more sensitive, 2-3 times slower than default."/> | |
166 <param name="quick_search" type="boolean" label="Quick search" truevalue="-q" falsevalue="" checked="false" help="5-10% less sensitive, 2-5 times faster than default."/> | |
167 <param name="rush_search" type="boolean" label="Rush search" truevalue="-qq" falsevalue="" checked="false" help="about 10% less sensitive, 4->10 times faster than default."/> | |
168 | |
169 <param name="only_alus" type="boolean" label="Only Alus" truevalue="-alu" falsevalue="" checked="false" help="Only masks Alus (and 7SLRNA, SVA and LTR5)(only for primate DNA)."/> | |
170 | |
171 <param name="gccalc" type="boolean" label="Use GC depended matrices, automaticly" truevalue="-gccalc" falsevalue="" checked="true" help="RepeatMasker calculates the GC content even for batch files/small seqs"/> | |
172 | |
173 <param name="output_files" type="select" multiple="true" label="Additional output files"> | |
174 <option selected="true" value="summary">Summary file</option> | |
175 <option value="gff">GFF file</option> | |
176 <option value="html">HTML file</option> | |
177 <option value="mask">Mask FastA file</option> | |
178 </param> | |
179 | |
180 | |
181 <param name="gc" type="integer" value="0" label="Use GC depended matrices" help="Use matrices calculated for 'number' percentage background GC level"> | |
182 <validator type="in_range" min="0" /> | |
183 <validator type="in_range" max="100" /> | |
184 </param> | |
185 | |
186 </when> | |
187 </conditional> | |
188 | |
189 </inputs> | |
190 <outputs> | |
191 <data name="output_std" format="tabular" label="${tool.name} on ${on_string}: Standard" /> | |
192 <data name="output_mask" format="fasta" label="${tool.name} on ${on_string}: Mask sequence"> | |
193 <filter> | |
194 (adv_opts['adv_opts_selector'] == 'advanced' and 'mask' in adv_opts['output_files']) | |
195 </filter> | |
196 </data> | |
197 <data name="output_summary" format="txt" label="${tool.name} on ${on_string}: Summary"> | |
198 <filter>( | |
199 (adv_opts['adv_opts_selector'] == 'advanced' and 'summary' in adv_opts['output_files']) | |
200 or | |
201 (adv_opts['adv_opts_selector'] == 'basic') | |
202 ) | |
203 </filter> | |
204 </data> | |
205 <data name="output_html" format="html" label="${tool.name} on ${on_string}: HTML"> | |
206 <filter>(adv_opts['adv_opts_selector'] == 'advanced' and 'html' in adv_opts['output_files'])</filter> | |
207 </data> | |
208 <data name="output_gff" format="gff" label="${tool.name} on ${on_string}: GFF"> | |
209 <!-- The command always passes -gff and copies the GFF file in basic mode, so keep this output there too (same rule as the summary) --> | |
210 <filter>((adv_opts['adv_opts_selector'] == 'advanced' and 'gff' in adv_opts['output_files']) or (adv_opts['adv_opts_selector'] == 'basic'))</filter> | |
211 | |
212 </data> | |
213 </outputs> | |
214 <requirements> | |
215 <requirement type="binary">RepeatMasker</requirement> | |
216 </requirements> | |
217 <help> | |
218 | |
219 .. class:: warningmark | |
220 | |
221 | |
222 ----- | |
223 | |
224 **What it does** | |
225 | |
226 RepeatMasker is a program that screens DNA sequences for *interspersed repeats* | |
227 and *low complexity* DNA sequences. The output of the program is a detailed | |
228 annotation of the repeats that are present in the query sequence as well as a | |
229 modified version of the query sequence in which all the annotated repeats have | |
230 been masked (default: replaced by Ns). | |
231 | |
232 ----- | |
233 | |
234 **How to read the results** | |
235 | |
236 | |
237 | |
238 The annotation file contains the cross_match output lines. It lists all best matches | |
239 (above a set minimum score) between the query sequence and any of the sequences in | |
240 the repeat database or with low complexity DNA. The term "best matches" reflects | |
241 that a match is not shown if its domain is over 80% contained within the domain | |
242 of a higher scoring match, where the "domain" of a match is the region in | |
243 the query sequence that is defined by the alignment start and stop. These domains | |
244 have been masked in the returned masked sequence file. In the output, matches are | |
245 ordered by query name, and for each query by position of the start of the alignment. | |
246 | |
247 Example: | |
248 | |
249 ======== ========= ========= ========= ========== =========== ========= ========= ============ =============== =================== ================ ============== ======= == | |
250 SW score perc div. perc del. perc ins. query seq. q-pos begin q-pos end (left) w complement matching repeat repeat class/family repeat-pos begin repeat-pos end (left) ID | |
251 ======== ========= ========= ========= ========== =========== ========= ========= ============ =============== =================== ================ ============== ======= == | |
252 1306 15.6 6.2 0.0 HSU08988 6563 6781 \(22462) C MER7A DNA/MER2_type 336 103 \(0) 1 | |
253 12204 10.0 2.4 1.8 HSU08988 6782 7714 \(21529) C TIGGER1 DNA/MER2_type 2418 1493 \(0) 2 | |
254 279 3.0 0.0 0.0 HSU08988 7719 7751 \(21492) + (TTTTA)n Simple_repeat 1 33 \(0) 3 | |
255 1765 13.4 6.5 1.8 HSU08988 7752 8022 \(21221) C AluSx SINE/Alu 289 1 \(23) 4 | |
256 12204 10.0 2.4 1.8 HSU08988 8023 8694 \(20549) C TIGGER1 DNA/MER2_type 1493 827 \(925) 5 | |
257 1984 11.1 0.3 0.7 HSU08988 8695 9000 \(20243) C AluSg SINE/Alu 305 1 \(5) 6 | |
258 12204 10.0 2.4 1.8 HSU08988 9001 9695 \(19548) C TIGGER1 DNA/MER2_type 827 2 \(1591) 7 | |
259 711 21.2 1.4 0.0 HSU08988 9696 9816 \(19427) C MER7A DNA/MER2_type 122 2 \(224) 8 | |
260 ======== ========= ========= ========= ========== =========== ========= ========= ============ =============== =================== ================ ============== ======= == | |
261 | |
262 This is a sequence in which a Tigger1 DNA transposon has integrated into a MER7 DNA transposon copy. | |
263 Subsequently two Alus integrated in the Tigger1 sequence. The simple repeat is derived from the | |
264 poly A of the Alu element. The first line is interpreted like this: | |
265 | |
266 :Table description: | |
267 | |
268 1. **1306** = Smith-Waterman score of the match, usually complexity adjusted | |
269 The SW scores are not always directly comparable. Sometimes | |
270 the complexity adjustment has been turned off, and a variety of | |
271 scoring-matrices are used. | |
272 | |
273 #. **15.6** = % substitutions in matching region compared to the consensus | |
274 #. **6.2** = % of bases opposite a gap in the query sequence (deleted bp) | |
275 #. **0.0** = % of bases opposite a gap in the repeat consensus (inserted bp) | |
276 #. **HSU08988** = name of query sequence | |
277 #. **6563** = starting position of match in query sequence | |
278 #. **6781** = ending position of match in query sequence | |
279 #. **(22462)** = no. of bases in query sequence past the ending position of match | |
280 #. **C** = match is with the Complement of the consensus sequence in the database | |
281 #. **MER7A** = name of the matching interspersed repeat | |
282 #. **DNA/MER2_type** = the class of the repeat, in this case a DNA transposon fossil of the MER2 group (see below for list and references) | |
283 #. **336** = starting position of match in database sequence (using top-strand numbering) | |
284 #. **103** = ending position of match in database sequence | |
285 #. **(0)** = no. of bases in (complement of) the repeat consensus sequence prior to beginning of the match (so 0 means that the match extended all the way to the end of the repeat consensus sequence) | |
286 #. **1** = Identifier | |
287 | |
288 An asterisk (\*) in the final column (no example shown) indicates that there is | |
289 a higher-scoring match whose domain partly (<80%) includes the domain of this match. | |
290 | |
291 Note that the SW score and divergence numbers for the three Tigger1 lines are identical. | |
292 This is because the information is derived from a single alignment (the Alus were deleted | |
293 from the query before the alignment with the Tigger element was performed). | |
294 The program makes educated guesses about many fragments if they are derived from | |
295 the same element (e.g. it knows that the MER7A fragments represent one insert). | |
296 In a next version I can identify each element with a unique ID, if interest exists | |
297 (this could help to represent repeats cleaner in graphic displays). | |
298 | |
299 | |
300 ------- | |
301 | |
302 **References** | |
303 | |
304 Smit, AFA, Hubley, R and Green, P. RepeatMasker Open-3.0. | |
305 | |
306 http://www.repeatmasker.org/ | |
307 | |
308 </help> | |
309 </tool> |