0
|
1 <tool id="kmersvm_train" name="Train SVM">
|
|
2 <description>on regulatory DNA sequences</description>
|
|
3 <command interpreter="python">scripts/kmersvm_train.py -q -p -s -v $N -C $SVMC -e $EPS
|
|
4 #if $weight_type.weight_type_select == "custom"
|
|
5 -w $weight_type.weight
|
|
6 #end if
|
|
7 #if $kernel.kernel_select == "sk"
|
|
8 -t 1 -k $kernel.kmerlen_sk
|
|
9 #else
|
|
10 -t 2 -k $kernel.kmerlen_wsk -K $kernel.kmerlen_wsk2
|
|
11 #end if
|
|
12 $inputA $inputB
|
|
13 </command>
|
|
14 <inputs>
|
|
15 <param format="fasta" name="inputA" type="data" label="Positives"/>
|
|
16 <param format="fasta" name="inputB" type="data" label="Negatives"/>
|
|
17 <conditional name="kernel">
|
|
18 <param name="kernel_select" type="select" label="Kernel Type">
|
|
19 <option value="sk">Spectrum Kernel</option>
|
|
20 <option value="wsk">Weighted Spectrum Kernel</option>
|
|
21 </param>
|
|
22 <when value="sk">
|
|
23 <param name="kmerlen_sk" type="integer" value="6" label="K-mer Length">
|
|
24 <validator type="in_range" message="K-mer length must be in range 5-10" min="5" max="10" />
|
|
25 </param>
|
|
26 </when>
|
|
27 <when value="wsk">
|
|
28 <param name="kmerlen_wsk" type="integer" value="6" label="Minimum K-mer Length">
|
|
29 <validator type="in_range" message="K-mer length must be in range 5-10" min="5" max="10" />
|
|
30 </param>
|
|
31 <param name="kmerlen_wsk2" type="integer" value="8" label="Maximum K-mer Length">
|
|
32 <validator type="in_range" message="K-mer length must be in range 5-10" min="5" max="10" />
|
|
33 </param>
|
|
34 </when>
|
|
35 </conditional>
|
|
36 <param name="N" type="select" label="N-Fold Cross Validation">
|
|
37 <option value="3">3</option>
|
|
38 <option value="5" selected="true">5</option>
|
|
39 <option value="10">10</option>
|
|
40 </param>
|
|
41 <conditional name="weight_type"> <!-- Positive-set weight: only the "custom" case adds -w to the command line -->
|
|
42 <param name="weight_type_select" type="select" label="Positive Set Weight">
|
|
43 <option value="automatic">Automatic</option>
|
|
44 <option value="custom">Custom</option>
|
|
45 </param> <when value="automatic" /> <!-- explicit empty case: no extra inputs, -w is not passed -->
|
|
46 <when value="custom">
|
|
47 <param name="weight" type="float" value="1" label="Input The Value of Positive Set Weight" />
|
|
48 </when>
|
|
49 </conditional>
|
|
50 <param name="SVMC" type="integer" value="1" label="Regularization Param C" />
|
|
51 <param name="EPS" type="float" value="0.00001" label="Precision Param E" />
|
|
52 </inputs>
|
|
53 <outputs>
|
|
54 <data format="tabular" name="SVM_weights" from_work_dir="kmersvm_output_weights.out" label="${tool.name} on ${on_string} : Weights" />
|
|
55 <data format="tabular" name="CV_predictions" from_work_dir="kmersvm_output_cvpred.out" label="${tool.name} on ${on_string} : Predictions" />
|
|
56 </outputs>
|
|
57 <tests>
|
|
58 <!-- Spectrum kernel (SK) run with automatic positive-set weight; each output name below must match a data name declared in the outputs section -->
|
|
59 <test>
|
|
60 <param name="kernel_select" value="sk"/>
|
|
61 <param name="inputA" value="test_positive.fa" />
|
|
62 <param name="inputB" value="test_negative.fa" />
|
|
63 <param name="weight_type_select" value="automatic" />
|
|
64 <output name="SVM_weights" file="test_weights.out" compare="re_match" lines_diff="20"/>
|
|
65 <output name="CV_predictions" file="train_predictions.out" compare="re_match"/>
|
|
66 </test>
|
|
67 </tests>
|
|
68 <help>
|
|
69
|
|
70 **Note**
|
|
71
|
|
72 .. class:: warningmark
|
|
73
|
|
74 All values of K-mer lengths must be between 5 and 10 bp.
|
|
75
|
|
76 ----
|
|
77
|
|
78 **What it does**
|
|
79
|
|
80 Takes as input 2 FASTA files, 1 of positive sequences and 1 of negative sequences. Produces 2 outputs:
|
|
81
|
|
82 A) Weights: list of sequences of length K ranked by score and posterior probability for that score.
|
|
83
|
|
84 B) Predictions: results of N-fold cross validation
|
|
85
|
|
86 ----
|
|
87
|
|
88 **Parameters**
|
|
89
|
|
90 Kernel: 2 choices:
|
|
91
|
|
92 A) Spectrum Kernel: Analyzes a sequence using strings of length K.
|
|
93
|
|
94 B) Weighted Spectrum Kernel: Analyzes a sequence using strings over a range of lengths, from the minimum K-mer length (K1) to the maximum K-mer length (Kn).
|
|
95
|
|
96 N-Fold Cross Validation: Number of partitions of training data used for cross validation.
|
|
97
|
|
98 Weight: Increases importance of positive data (increase if positive sets are very trustworthy or for training with very large negative sequence sets).
|
|
99
|
|
100 Regularization Parameter: Penalty for misclassification. The trade-off is overfitting (high value) versus a high error rate (low value).
|
|
101
|
|
102 Precision Parameter: Insensitivity zone. Affects precision of SVM by altering number of support vectors used.
|
|
103
|
|
104 ----
|
|
105
|
|
106 **Example**
|
|
107
|
|
108 Weights file::
|
|
109
|
|
110 #parameters:
|
|
111 #kernel=1
|
|
112 #kmerlen=6
|
|
113 #bias=-1.20239998751
|
|
114 #A=-1.50821617139
|
|
115 #B=-0.110516009177
|
|
116 #NOTE: k-mers with large negative weights are also important. They can be found at the bottom of the list.
|
|
117 #k-mer revcomp SVM-weight
|
|
118 AGGTCA TGACCT 9.32110889151
|
|
119 AAGGTC GACCTT 8.22598019901
|
|
120 ACCTTG CAAGGT 5.78739494153
|
|
121 AGGTCG CGACCT 5.40759311635
|
|
122
|
|
123 Predictions file::
|
|
124
|
|
125 mm8_chr1_10212203_10212303_+ 3.31832111466 1 1
|
|
126 mm8_chr1_103584748_103584848_+ -0.253869299667 1 3
|
|
127 mm8_chr1_105299130_105299230_+ -1.03463560077 1 3
|
|
128 mm8_chr1_106367772_106367872_+ 5.36528447025 1 3
|
|
129
|
|
130 </help>
|
|
131 </tool>
|