comparison glimmer_w_icm.xml @ 0:841357e0acbf draft

Uploaded
author bgruening
date Sat, 06 Jul 2013 10:09:30 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:841357e0acbf
1 <tool id="glimmer_knowlegde-based" name="Glimmer3" version="0.2">
2 <description>Predict ORFs in prokaryotic genomes (knowledge-based)</description>
3 <requirements>
4 <requirement type="package" version="3.02b">glimmer</requirement>
5 <requirement type="package" version="1.61">biopython</requirement>
6 <requirement type="set_environment">GLIMMER_SCRIPT_PATH</requirement>
7 </requirements>
<!--
  Cheetah command template for the glimmer3 run.
  1. Creates a named temporary file (delete=False), closes it, and keeps only its path in $temp.
  2. Runs glimmer3 with the user-selected options, passing $seq_input, $icm_input and $temp as the
     trailing arguments; stderr is redirected to stdout.
  3. Runs glimmer2seq.py from GLIMMER_SCRIPT_PATH on $temp.predict and $seq_input, writing $genes_output.
  4. Moves $temp.predict and $temp.detail to the report outputs when requested, otherwise removes them,
     and finally removes $temp itself.
  NOTE(review): both report booleans declare an empty truevalue and an empty falsevalue; confirm that
  the "#if $report" and "#if $detailed_report" tests reflect the checkbox state as intended.
-->
8 <command>
9 #import tempfile, os
10 #set $temp = tempfile.NamedTemporaryFile( delete=False )
11 #silent $temp.close()
12 #set $temp = $temp.name
13
14 glimmer3
15 --max_olap $max_olap
16 --gene_len $gene_len
17 --threshold $threshold
18 #if float( str($gc_percent) ) > 0.0:
19 --gc_percent $gc_percent
20 #end if
21
22 #if $stop_codon_opts.stop_codon_opts_selector == "gb":
23 --trans_table "${stop_codon_opts.genbank_gencode}"
24 #else:
25 --stop_codons "${stop_codon_opts.stop_codons}"
26 #end if
27
28 --start_codons $start_codons
29
30 $linear
31 $no_indep
32 $extend
33 $seq_input
34 $icm_input
35 $temp 2>&#38;1;
36
37 ## convert prediction to FASTA sequences
38 \$GLIMMER_SCRIPT_PATH/glimmer2seq.py $temp".predict" $seq_input $genes_output;
39
40 #if $report:
41 mv $temp".predict" $report_output;
42 #else:
43 rm $temp".predict";
44 #end if
45
46 #if $detailed_report:
47 mv $temp".detail" $detailed_output;
48 #else:
49 rm $temp".detail";
50 #end if
51
52 rm $temp
53 </command>
<!--
  User-facing parameters. Each param name is referenced as $name in the command template.
  The stop_codon_opts conditional selects between a Genbank translation table and a
  free-form comma-separated list of stop codons.
-->
54 <inputs>
55 <param name="seq_input" type="data" format="fasta" label="Genome Sequence" />
56 <param name="icm_input" type="data" format="data" label="Interpolated context model (ICM)" />
57
58 <param name="max_olap" type="integer" value="50" label="Set maximum overlap length" help="Overlaps this short or shorter are ignored." />
59 <param name="gene_len" type="integer" value="90" label="Set the minimum gene length to n nucleotides" help="This does not include the bases in the stop codon."/>
60 <param name="threshold" type="integer" value="30" label="Set threshold score for calling as gene" help="If the in-frame score >= N, then the region is given a number and considered a potential gene." />
61 <param name="gc_percent" type="float" value="0.0" label="Set the GC percentage of the independent model, i.e., the model of intergenic sequence" help="If 0.0 specified, the GC percentage will be counted from the input file." />
62
63 <param name="linear" type="boolean" truevalue="--linear" falsevalue="" checked="true" label="Assume linear rather than circular genome, i.e., no wraparound" />
64 <param name="no_indep" type="boolean" truevalue="--no_indep" falsevalue="" checked="false" label="Don’t use the independent probability score column at all" help="Using this option will produce more short gene predictions." />
65 <param name="extend" type="boolean" truevalue="--extend" falsevalue="" checked="false" label="Also score orfs that extend off the end of the sequence(s)" />
66 <param name="start_codons" type="text" value="atg,gtg,ttg" label="Specify start codons as a comma-separated list" />
67
68 <conditional name="stop_codon_opts">
69 <param name="stop_codon_opts_selector" type="select" label="Specify stop codons as">
70 <option value="gb" selected="True">Genbank translation table entry</option>
71 <option value="free_form">Comma-separated list</option>
72 </param>
73 <when value="gb">
74 <param name="genbank_gencode" type="select" label="Use Genbank translation table to specify stop codons">
75 <option value="1" selected="True">1. Standard</option>
76 <option value="2">2. Vertebrate Mitochondrial</option>
77 <option value="3">3. Yeast Mitochondrial</option>
78 <option value="4">4. Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code</option>
79 <option value="5">5. Invertebrate Mitochondrial</option>
80 <option value="6">6. Ciliate, Dasycladacean and Hexamita Nuclear Code</option>
81 <option value="9">9. Echinoderm Mitochondrial</option>
82 <option value="10">10. Euplotid Nuclear</option>
83 <option value="11">11. Bacteria and Archaea</option>
84 <option value="12">12. Alternative Yeast Nuclear</option>
85 <option value="13">13. Ascidian Mitochondrial</option>
86 <option value="14">14. Flatworm Mitochondrial</option>
87 <option value="15">15. Blepharisma Macronuclear</option>
88 <option value="16">16. Chlorophycean Mitochondrial</option>
89 <option value="21">21. Trematode Mitochondrial</option>
90 <option value="22">22. Scenedesmus obliquus mitochondrial</option>
91 <option value="23">23. Thraustochytrium Mitochondrial</option>
92 <option value="24">24. Pterobranchia mitochondrial</option>
93 </param>
94 </when>
95 <when value="free_form">
96 <param name="stop_codons" type="text" value="tag,tga,taa" label="Specify stop codons as a comma-separated list" />
97 </when>
98 </conditional>
99
100 <param name="report" type="boolean" truevalue="" falsevalue="" checked="false" label="Report the classic glimmer table output"/>
101 <param name="detailed_report" type="boolean" truevalue="" falsevalue="" checked="false" label="Output a detailed gene prediction report as separate file"/>
102 </inputs>
<!--
  Output datasets. genes_output has no filter; report_output and detailed_output each carry a
  filter on the matching boolean input (report, detailed_report).
-->
103 <outputs>
104 <data name="genes_output" format="fasta" label="Glimmer3 on ${on_string} (Gene Prediction FASTA)" />
105 <data name="report_output" format="txt" label="Glimmer3 on ${on_string} (Gene Prediction table)">
106 <filter>report == True</filter>
107 </data>
108 <data name="detailed_output" format="txt" label="Glimmer3 on ${on_string} (detailed report)">
109 <filter>detailed_report == True</filter>
110 </data>
111 </outputs>
<!--
  Single functional test: runs the tool on the Streptomyces Tu6071 genome with the matching ICM,
  Genbank translation table 11 and both optional reports left empty, then compares genes_output
  with the stored FASTA file.
-->
112 <tests>
113 <test>
114 <param name="seq_input" value='streptomyces_Tu6071_genomic.fasta' />
115 <param name="icm_input" value='streptomyces_Tu6071_plasmid_genes.icm' />
116 <param name="max_olap" value="50" />
117 <param name="gene_len" value="90" />
118 <param name="threshold" value="30" />
119 <param name="gc_percent" value="0.0" />
120 <param name="linear" value="--linear" />
121 <param name="no_indep" value="" />
122 <param name="extend" value="" />
123 <param name="start_codons" value="atg,gtg,ttg" />
124 <param name="genbank_gencode" value="11" />
125 <param name="detailed_report" value="" />
126 <param name="report" value="" />
127 <output name="genes_output" file='glimmer_w_icm_trans-table-11_genomic.fasta' ftype="fasta" />
128 </test>
129 </tests>
130 <help>
131
132
133 **What it does**
134
135 This is the main program that makes gene predictions based on an interpolated context model (ICM).
136
137 The ICM can be generated with extracted CDS from related organisms (ICM builder). If you can't generate an ICM model you can use the non-knowledge-based Glimmer with a de novo prediction.
138
139 -----
140
141 **Example**
142
143 *Input*::
144
145 - interpolated context model (ICM): Use the 'Glimmer ICM builder' tool to create one
146 - Genome Sequence in FASTA format
147
148 >CELF22B7 C.aenorhabditis elegans (Bristol N2) cosmid F22B7
149 GATCCTTGTAGATTTTGAATTTGAAGTTTTTTCTCATTCCAAAACTCTGT
150 GATCTGAAATAAAATGTCTCAAAAAAATAGAAGAAAACATTGCTTTATAT
151 TTATCAGTTATGGTTTTCAAAATTTTCTGACATACCGTTTTGCTTCTTTT
152 TTTCTCATCTTCTTCAAATATCAATTGTGATAATCTGACTCCTAACAATC
153 GAATTTCTTTTCCTTTTTCTTTTTCCAACAACTCCAGTGAGAACTTTTGA
154 ATATCTTCAAGTGACTTCACCACATCAGAAGGTGTCAACGATCTTGTGAG
155 AACATCGAATGAAGATAATTTTAATTTTAGAGTTACAGTTTTTCCTCCGA
156 CAATTCCTGATTTACGAACATCTTCTTCAAGCATTCTACAGATTTCTTGA
157 TGCTCTTCTAGGAGGATGTTGAAATCCGAAGTTGGAGAAAAAGTTCTCTC
158 AACTGAAATGCTTTTTCTTCGTGGATCCGATTCAGATGGACGACCTGGCA
159 GTCCGAGAGCCGTTCGAAGGAAAGATTCTTGTGAGAGAGGCGTGAAACAC
160 AAAGGGTATAGGTTCTTCTTCAGATTCATATCACCAACAGTTTGAATATC
161 CATTGCTTTCAGTTGAGCTTCGCATACACGACCAATTCCTCCAACCTAAA
162 AAATTATCTAGGTAAAACTAGAAGGTTATGCTTTAATAGTCTCACCTTAC
163 GAATCGGTAAATCCTTCAAAAACTCCATAATCGCGTTTTTATCATTTTCT
164 .....
165
166 *Output*::
167
168 - FASTA file with predicted proteins
169 - Glimmer prediction file (optional)
170
171 >CELF22B7 C.aenorhabditis elegans (Bristol N2) cosmid F22B7.
172 orf00001 40137 52 +2 8.68
173 orf00004 603 34 -1 2.91
174 orf00006 1289 1095 -3 3.16
175 orf00007 1555 1391 -2 2.33
176 orf00008 1809 1576 -1 1.02
177 orf00010 1953 2066 +3 3.09
178 orf00011 2182 2304 +1 0.89
179 orf00013 2390 2521 +2 0.60
180 orf00018 2570 3073 +2 2.54
181 orf00020 3196 3747 +1 2.91
182 orf00022 3758 4000 +2 0.83
183 orf00023 4399 4157 -2 1.31
184 orf00025 4463 4759 +2 2.92
185 orf00026 4878 5111 +3 0.78
186 orf00027 5468 5166 -3 1.64
187 orf00029 5590 5832 +1 0.29
188 orf00032 6023 6226 +2 6.02
189 orf00033 6217 6336 +1 3.09
190 ........
191
192 - Glimmer detailed report (optional)
193
194 >CELF22B7 C.aenorhabditis elegans (Bristol N2) cosmid F22B7.
195 Sequence length = 40222
196
197 ----- Start ----- --- Length ---- ------------- Scores -------------
198 ID Frame of Orf of Gene Stop of Orf of Gene Raw InFrm F1 F2 F3 R1 R2 R3 NC
199 0001 +2 40137 40137 52 135 135 9.26 96 - 96 - - 3 - 0
200 0002 +1 58 64 180 120 114 5.01 69 69 - - 30 - - 0
201 +3 300 309 422 120 111 -0.68 20 - - 20 38 - - 41
202 +3 423 432 545 120 111 1.29 21 - 51 21 13 - 8 5
203 0003 +2 401 416 595 192 177 2.51 93 - 93 - 5 - - 1
204 0004 -1 645 552 34 609 516 2.33 99 - - - 99 - - 0
205 +1 562 592 762 198 168 -2.54 1 1 - - - - - 98
206 +1 763 772 915 150 141 -1.34 1 1 - - - - 86 11
207 +3 837 846 1007 168 159 1.35 28 - 50 28 - - 17 3
208 0005 -3 1073 977 654 417 321 0.52 84 - - - - - 84 15
209 0006 -3 1373 1319 1095 276 222 3.80 99 - - - - - 99 0
210 0007 -2 1585 1555 1391 192 162 2.70 98 - - - - 98 - 1
211 0008 -1 1812 1809 1576 234 231 1.26 94 - - - 94 - - 5
212 0009 +2 1721 1730 1945 222 213 0.68 80 - 80 - - - - 19
213 .....
214
215 -------
216
217 **References**
218
219 A.L. Delcher, K.A. Bratke, E.C. Powers, and S.L. Salzberg. Identifying bacterial genes and endosymbiont DNA with Glimmer. Bioinformatics (Advance online version) (2007).
220
221
222 </help>
223
224 </tool>