comparison test-data/funannotate_db/trained_species/fly/augustus/fly_parameters.cfg @ 0:40b87aef5241 draft

"planemo upload commit 9613152729099079c7465c3d5d42005ef22ca91e"
author iuc
date Thu, 26 Aug 2021 06:55:33 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:40b87aef5241
1 #
2 # parameters for all Drosophila versions
3 #
4 # date : 11.8.2009
5 #
6
7 #
8 # Properties for augustus
9 #------------------------------------
10 /augustus/verbosity 3 # 0-3; 0: print only the necessary output
11 maxDNAPieceSize 200000 # maximum segment that is predicted in one piece
12 stopCodonExcludedFromCDS false # make this 'true' if the CDS excludes the stop codon (training and prediction)
13
14 # gff output options:
15 protein on # output predicted protein sequence
16 codingseq off # output the coding sequence
17 cds on # output 'cds' as feature for exons
18 start on # output start codons (translation start)
19 stop on # output stop codons (translation stop)
20 introns on # output introns
21 tss on # output transcription start site
22 tts on # output transcription termination site
23 print_utr off # output 5'UTR and 3'UTR lines in addition to exon lines
24
25 checkExAcc off # internal parameter for extrinsic accuracy
26
27 # alternative transcripts and posterior probabilities
28 sample 100 # the number of sampling iterations
29 alternatives-from-sampling false # output alternative transcripts
30 minexonintronprob 0.08 # minimal posterior probability of all (coding) exons
31 minmeanexonintronprob 0.4 # minimal geometric mean of the posterior probs of introns and exons
32 maxtracks -1 # maximum number of reported transcripts per gene (-1: no limit)
33 keep_viterbi true # set to true if all Viterbi transcripts should be reported
34 uniqueCDS true # don't report transcripts that differ only in the UTR
35 UTR on # predict untranslated regions
36
37 #
38 #
39 # The rest of the file contains mainly meta parameters used for training.
40 #
41
42 # global constants
43 # ----------------------------
44
45 /Constant/trans_init_window 25
46 /Constant/ass_upwindow_size 32
47 /Constant/ass_start 1
48 /Constant/ass_end 4
49 /Constant/dss_start 3
50 /Constant/dss_end 4
51 /Constant/init_coding_len 9
52 /Constant/intterm_coding_len 0
53 /Constant/tss_upwindow_size 45
54 /Constant/decomp_num_at 1
55 /Constant/decomp_num_gc 1
56 /Constant/gc_range_min 0.32 # This range has an effect only when decomp_num_steps>1.
57 /Constant/gc_range_max 0.50 # gc_range_min and gc_range_max give the minimal and maximal GC content as a fraction (e.g. 0.32 = 32% c or g)
58 /Constant/decomp_num_steps 1 # I recommend keeping this to 1 for most species.
59 /Constant/min_coding_len 201 # no gene with a coding sequence shorter than this is predicted
60 /Constant/probNinCoding 0.23 # divide this by .25 to get the penalty factor for making one masked letter part of the coding sequence
61 /Constant/amberprob 0.34 # Prob(stop codon = tag), if 0 tag is assumed to code for amino acid
62 /Constant/ochreprob 0.41 # Prob(stop codon = taa), if 0 taa is assumed to code for amino acid
63 /Constant/opalprob 0.25 # Prob(stop codon = tga), if 0 tga is assumed to code for amino acid
64 /Constant/subopt_transcript_threshold 0.7
65 /Constant/almost_identical_maxdiff 10
66
67 # type of weighting, one of 1 = equalWeights, 2 = gcContentClasses, 3 = multiNormalKernel
68 /BaseCount/weighingType 3
69 # file with the weight matrix (only for multiNormalKernel type weighting)
70 /BaseCount/weightMatrixFile fly_weightmatrix.txt # change this to your species if at all necessary
71
72 # Properties for IGenicModel
73 # ----------------------------
74 /IGenicModel/verbosity 0
75 /IGenicModel/infile fly_igenic_probs.pbl # change this and all the other *_probs.pbl filenames below to your species
76 /IGenicModel/outfile fly_igenic_probs.pbl
77 /IGenicModel/patpseudocount 5.0
78 /IGenicModel/k 4 # order of the Markov chain for content model, keep equal to /ExonModel/k
79
80 # Properties for ExonModel
81 # ----------------------------
82 /ExonModel/verbosity 3
83 /ExonModel/infile fly_exon_probs.pbl
84 /ExonModel/outfile fly_exon_probs.pbl
85 /ExonModel/patpseudocount 5.0
86 /ExonModel/minPatSum 350
87 /ExonModel/k 4 # order of the Markov chain for content model
88 /ExonModel/etorder 2
89 /ExonModel/etpseudocount 3
90 /ExonModel/exonlengthD 3000 # beyond this the distribution is geometric
91 /ExonModel/maxexonlength 15000
92 /ExonModel/slope_of_bandwidth 0.3
93 /ExonModel/minwindowcount 8
94 /ExonModel/tis_motif_memory 3
95 /ExonModel/tis_motif_radius 2
96
97 # Properties for IntronModel
98 # ----------------------------
99 /IntronModel/verbosity 0
100 /IntronModel/infile fly_intron_probs.pbl
101 /IntronModel/outfile fly_intron_probs.pbl
102 /IntronModel/patpseudocount 5.0
103 /IntronModel/k 4 # order of the Markov chain for content model, keep equal to /ExonModel/k
104 /IntronModel/slope_of_bandwidth 0.4
105 /IntronModel/minwindowcount 3
106 /IntronModel/asspseudocount 0.01
107 /IntronModel/dsspseudocount 0.01015
108 /IntronModel/dssneighborfactor 0.001
109 #/IntronModel/splicefile fly_splicefile.txt # this optional file contains additional windows around splice sites for training, uncomment if you have one
110 /IntronModel/sf_with_motif false # if true the splice file is also used to train the branch point region
111 /IntronModel/d 929 # constraint: this must be larger than 4 + /Constant/dss_end + /Constant/ass_upwindow_size + /Constant/ass_start
112 /IntronModel/ass_motif_memory 1
113 /IntronModel/ass_motif_radius 4
114
115 # Properties for UtrModel
116 # ----------------------------
117 /UtrModel/verbosity 3
118 /UtrModel/infile fly_utr_probs.pbl
119 /UtrModel/outfile fly_utr_probs.pbl
120 /UtrModel/k 4
121 /UtrModel/utr5patternweight 0.3 #0.7625
122 /UtrModel/utr3patternweight 0.3 #0.5
123 /UtrModel/patpseudocount 1
124 /UtrModel/tssup_k 1
125 /UtrModel/tssup_patpseudocount 1
126 /UtrModel/slope_of_bandwidth 0.25
127 /UtrModel/minwindowcount 1
128 /UtrModel/exonlengthD 800
129 /UtrModel/maxexonlength 1200
130 /UtrModel/max3singlelength 2000 # excludes roughly 1%
131 /UtrModel/max3termlength 1200 # excludes ~ 0.3%
132 /UtrModel/tss_start 8
133 /UtrModel/tss_end 5
134 /UtrModel/tata_start 2
135 /UtrModel/tata_end 10
136 /UtrModel/tata_pseudocount 2
137 /UtrModel/d_tss_tata_min 26 # minimal distance between start of tata box (if existent) and tss
138 /UtrModel/d_tss_tata_max 37 # maximal distance between start of tata box (if existent) and tss
139 /UtrModel/polyasig_consensus aataaa # polyadenylation signal training not fully automated yet
140 /UtrModel/d_polyasig_cleavage 14 # the transcription end is predicted this many bases after the polyadenylation signal
141 /UtrModel/d_polya_cleavage_min 9
142 /UtrModel/d_polya_cleavage_max 35
143 /UtrModel/prob_polya 0.95
144 /UtrModel/tts_motif_memory 1