annotate kmersvm/train.xml @ 0:7fe1103032f7 draft

Uploaded
author cafletezbrant
date Mon, 20 Aug 2012 18:07:22 -0400
parents
children fd740d515502
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
1 <tool id="kmersvm_train" name="Train SVM">
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
2 <description>on regulatory DNA sequences</description>
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
3 <command interpreter="python">scripts/kmersvm_train.py -q -p -s -v $N -C $SVMC -e $EPS
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
4 #if $weight_type.weight_type_select == "custom"
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
5 -w $weight_type.weight
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
6 #end if
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
7 #if $kernel.kernel_select == "sk"
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
8 -t 1 -k $kernel.kmerlen_sk
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
9 #else
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
10 -t 2 -k $kernel.kmerlen_wsk -K $kernel.kmerlen_wsk2
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
11 #end if
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
12 $inputA $inputB
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
13 </command>
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
14 <inputs>
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
15 <param format="fasta" name="inputA" type="data" label="Positives"/>
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
16 <param format="fasta" name="inputB" type="data" label="Negatives"/>
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
17 <conditional name="kernel">
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
18 <param name="kernel_select" type="select" label="Kernel Type">
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
19 <option value="sk">Spectrum Kernel</option>
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
20 <option value="wsk">Weighted Spectrum Kernel</option>
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
21 </param>
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
22 <when value="sk">
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
23 <param name="kmerlen_sk" type="integer" value="6" label="K-mer Length">
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
24 <validator type="in_range" message="K-mer length must be in range 5-10" min="5" max="10" />
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
25 </param>
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
26 </when>
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
27 <when value="wsk">
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
28 <param name="kmerlen_wsk" type="integer" value="6" label="Minimum K-mer Length">
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
29 <validator type="in_range" message="K-mer length must be in range 5-10" min="5" max="10" />
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
30 </param>
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
31 <param name="kmerlen_wsk2" type="integer" value="8" label="Maximum K-mer Length">
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
32 <validator type="in_range" message="K-mer length must be in range 5-10" min="5" max="10" />
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
33 </param>
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
34 </when>
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
35 </conditional>
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
36 <param name="N" type="select" label="N-Fold Cross Validation">
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
37 <option value="3">3</option>
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
38 <option value="5" selected="true">5</option>
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
39 <option value="10">10</option>
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
40 </param>
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
41 <conditional name="weight_type">
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
42 <param name="weight_type_select" type="select" label="Positive Set Weight">
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
43 <option value="automatic">Automatic</option>
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
44 <option value="custom">Custom</option>
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
45 </param>
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
46 <when value="custom">
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
47 <param name="weight" type="float" value="1" label="Input The Value of Positive Set Weight" />
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
48 </when>
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
49 </conditional>
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
50 <param name="SVMC" type="integer" value="1" label="Regularization Param C" />
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
51 <param name="EPS" type="float" value="0.00001" label="Precision Param E" />
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
52 </inputs>
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
53 <outputs>
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
54 <data format="tabular" name="SVM_weights" from_work_dir="kmersvm_output_weights.out" label="${tool.name} on ${on_string} : Weights" />
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
55 <data format="tabular" name="CV_predictions" from_work_dir="kmersvm_output_cvpred.out" label="${tool.name} on ${on_string} : Predictions" />
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
56 </outputs>
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
57 <tests>
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
58 <!-- SK: spectrum kernel test. Each output name below must match a data name declared in the outputs section (SVM_weights, CV_predictions). -->
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
59 <test>
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
60 <param name="kernel_select" value="sk"/>
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
61 <param name="inputA" value="test_positive.fa" />
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
62 <param name="inputB" value="test_negative.fa" />
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
63 <param name="weight_type_select" value="automatic" />
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
64 <output name="SVM_weights" file="test_weights.out" compare="re_match" lines_diff="20"/>
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
65 <output name="CV_predictions" file="train_predictions.out" compare="re_match"/>
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
66 </test>
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
67 </tests>
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
68 <help>
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
69
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
70 **Note**
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
71
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
72 .. class:: warningmark
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
73
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
74 All values of K-mer lengths must be between 5 and 10 bp.
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
75
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
76 ----
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
77
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
78 **What it does**
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
79
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
80 Takes as input 2 FASTA files, 1 of positive sequences and 1 of negative sequences. Produces 2 outputs:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
81
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
82 A) Weights: list of sequences of length K ranked by score and posterior probability for that score.
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
83
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
84 B) Predictions: results of N-fold cross validation
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
85
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
86 ----
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
87
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
88 **Parameters**
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
89
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
90 Kernel: 2 choices:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
91
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
92 A) Spectrum Kernel: Analyzes a sequence using strings of length K.
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
93
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
94 B) Weighted Spectrum Kernel: Analyzes a sequence using strings over a range of lengths, from the minimum to the maximum K-mer length.
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
95
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
96 N-Fold Cross Validation: Number of partitions of training data used for cross validation.
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
97
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
98 Weight: Increases importance of positive data (increase if positive sets are very trustworthy or for training with very large negative sequence sets).
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
99
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
100 Regularization Parameter: Penalty for misclassification. Trade-off is overfitting (high parameter) versus high error rate (low parameter).
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
101
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
102 Precision Parameter: Insensitivity zone. Affects precision of SVM by altering number of support vectors used.
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
103
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
104 ----
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
105
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
106 **Example**
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
107
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
108 Weights file::
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
109
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
110 #parameters:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
111 #kernel=1
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
112 #kmerlen=6
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
113 #bias=-1.20239998751
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
114 #A=-1.50821617139
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
115 #B=-0.110516009177
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
116 #NOTE: k-mers with large negative weights are also important. They can be found at the bottom of the list.
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
117 #k-mer revcomp SVM-weight
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
118 AGGTCA TGACCT 9.32110889151
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
119 AAGGTC GACCTT 8.22598019901
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
120 ACCTTG CAAGGT 5.78739494153
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
121 AGGTCG CGACCT 5.40759311635
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
122
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
123 Predictions file::
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
124
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
125 mm8_chr1_10212203_10212303_+ 3.31832111466 1 1
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
126 mm8_chr1_103584748_103584848_+ -0.253869299667 1 3
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
127 mm8_chr1_105299130_105299230_+ -1.03463560077 1 3
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
128 mm8_chr1_106367772_106367872_+ 5.36528447025 1 3
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
129
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
130 </help>
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
131 </tool>