Mercurial > repos > cafletezbrant > kmersvm
comparison kmersvm/train.xml @ 0:7fe1103032f7 draft
Uploaded
author | cafletezbrant |
---|---|
date | Mon, 20 Aug 2012 18:07:22 -0400 |
parents | |
children | fd740d515502 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:7fe1103032f7 |
---|---|
1 <tool id="kmersvm_train" name="Train SVM"> | |
2 <description>on regulatory DNA sequences</description> | |
<!--
  Cheetah command template that launches scripts/kmersvm_train.py.

  Always passed: the fixed switches -q -p -s (their meaning is defined by the
  script, which is not visible here), then -v (the N select), -C (SVMC) and
  -e (EPS).
  -w is added only when the "custom" positive set weight is selected.
  Kernel selection: "sk" emits -t 1 with a single -k length; any other value
  emits -t 2 with -k (minimum length) and -K (maximum length).
  The two FASTA datasets are positional: positives first, negatives second.
  They are single-quoted so each path reaches the script as one argument even
  if it contains spaces or shell metacharacters.
-->
<command interpreter="python">scripts/kmersvm_train.py -q -p -s -v $N -C $SVMC -e $EPS
  #if $weight_type.weight_type_select == "custom"
    -w $weight_type.weight
  #end if
  #if $kernel.kernel_select == "sk"
    -t 1 -k $kernel.kmerlen_sk
  #else
    -t 2 -k $kernel.kmerlen_wsk -K $kernel.kmerlen_wsk2
  #end if
  '$inputA' '$inputB'
</command>
<!--
  Form parameters. Names here are referenced by the command template
  (inputA, inputB, kernel.*, N, weight_type.*, SVMC, EPS), so they must not be
  renamed without updating the template as well.
-->
<inputs>
  <!-- Training sequences: one FASTA dataset per class. -->
  <param format="fasta" name="inputA" type="data" label="Positives"/>
  <param format="fasta" name="inputB" type="data" label="Negatives"/>

  <!-- The kernel choice decides which k-mer length fields are shown. -->
  <conditional name="kernel">
    <param name="kernel_select" type="select" label="Kernel Type">
      <option value="sk">Spectrum Kernel</option>
      <option value="wsk">Weighted Spectrum Kernel</option>
    </param>
    <!-- Spectrum kernel: a single k-mer length. -->
    <when value="sk">
      <param name="kmerlen_sk" type="integer" value="6" label="K-mer Length">
        <validator type="in_range" message="K-mer length must be in range 5-10" min="5" max="10" />
      </param>
    </when>
    <!-- Weighted spectrum kernel: a minimum and a maximum k-mer length. -->
    <when value="wsk">
      <param name="kmerlen_wsk" type="integer" value="6" label="Minimum K-mer Length">
        <validator type="in_range" message="K-mer length must be in range 5-10" min="5" max="10" />
      </param>
      <param name="kmerlen_wsk2" type="integer" value="8" label="Maximum K-mer Length">
        <validator type="in_range" message="K-mer length must be in range 5-10" min="5" max="10" />
      </param>
    </when>
  </conditional>

  <!-- Number of cross-validation folds; 5 is preselected. -->
  <param name="N" type="select" label="N-Fold Cross Validation">
    <option value="3">3</option>
    <option value="5" selected="true">5</option>
    <option value="10">10</option>
  </param>

  <!-- Positive set weight: a value is only requested for "custom". -->
  <conditional name="weight_type">
    <param name="weight_type_select" type="select" label="Positive Set Weight">
      <option value="automatic">Automatic</option>
      <option value="custom">Custom</option>
    </param>
    <!-- No extra fields for automatic weighting; declared explicitly so that
         every select option has a matching <when>. -->
    <when value="automatic" />
    <when value="custom">
      <param name="weight" type="float" value="1" label="Input The Value of Positive Set Weight" />
    </when>
  </conditional>

  <!-- SVM settings passed to the script as -C and -e respectively. -->
  <param name="SVMC" type="integer" value="1" label="Regularization Param C" />
  <param name="EPS" type="float" value="0.00001" label="Precision Param E" />
</inputs>
<!--
  Result datasets. Each one is collected from a fixed file name in the job
  working directory (from_work_dir) and exposed as a tabular dataset.
-->
<outputs>
  <data name="SVM_weights"
        format="tabular"
        from_work_dir="kmersvm_output_weights.out"
        label="${tool.name} on ${on_string} : Weights" />
  <data name="CV_predictions"
        format="tabular"
        from_work_dir="kmersvm_output_cvpred.out"
        label="${tool.name} on ${on_string} : Predictions" />
</outputs>
<tests>
  <!--
    Spectrum kernel run with automatic positive set weighting.
    Each <output> name must equal the name of a <data> element declared in
    <outputs> (SVM_weights, CV_predictions); otherwise the expectation is not
    bound to any dataset produced by the tool.
  -->
  <test>
    <param name="kernel_select" value="sk"/>
    <param name="inputA" value="test_positive.fa" />
    <param name="inputB" value="test_negative.fa" />
    <param name="weight_type_select" value="automatic" />
    <output name="SVM_weights" file="test_weights.out" compare="re_match" lines_diff="20"/>
    <output name="CV_predictions" file="train_predictions.out" compare="re_match"/>
  </test>
</tests>
<!--
  Help text in reStructuredText. The example listings follow a paragraph
  ending in "::" and must stay indented, otherwise they are not rendered as
  literal blocks.
-->
<help>

**Note**

.. class:: warningmark

All values of K-mer lengths must be between 5 and 10 bp.

----

**What it does**

Takes as input 2 FASTA files, 1 of positive sequences and 1 of negative sequences. Produces 2 outputs:

A) Weights: list of sequences of length K ranked by score and posterior probability for that score.

B) Predictions: results of N-fold cross validation

----

**Parameters**

Kernel: 2 choices:

A) Spectrum Kernel: Analyzes a sequence using strings of length K.

B) Weighted Spectrum Kernel: Analyzes a sequence using strings with lengths ranging from the minimum to the maximum K-mer length.

N-Fold Cross Validation: Number of partitions of training data used for cross validation.

Weight: Increases importance of positive data (increase if positive sets are very trustworthy or for training with very large negative sequence sets).

Regularization Parameter: Penalty for misclassification. Trade-off is overfitting (high parameter) versus high error rate (low parameter).

Precision Parameter: Insensitivity zone. Affects precision of SVM by altering number of support vectors used.

----

**Example**

Weights file::

  #parameters:
  #kernel=1
  #kmerlen=6
  #bias=-1.20239998751
  #A=-1.50821617139
  #B=-0.110516009177
  #NOTE: k-mers with large negative weights are also important. They can be found at the bottom of the list.
  #k-mer revcomp SVM-weight
  AGGTCA TGACCT 9.32110889151
  AAGGTC GACCTT 8.22598019901
  ACCTTG CAAGGT 5.78739494153
  AGGTCG CGACCT 5.40759311635

Predictions file::

  mm8_chr1_10212203_10212303_+ 3.31832111466 1 1
  mm8_chr1_103584748_103584848_+ -0.253869299667 1 3
  mm8_chr1_105299130_105299230_+ -1.03463560077 1 3
  mm8_chr1_106367772_106367872_+ 5.36528447025 1 3

</help>
131 </tool> |