comparison kmersvm/train.xml @ 0:7fe1103032f7 draft

Uploaded
author cafletezbrant
date Mon, 20 Aug 2012 18:07:22 -0400
parents
children fd740d515502
comparison
equal deleted inserted replaced
-1:000000000000 0:7fe1103032f7
1 <tool id="kmersvm_train" name="Train SVM">
2 <description>on regulatory DNA sequences</description> <!-- The command that follows is a Cheetah template: fixed flags -q -p -s, then -v, -C and -e filled from the N, SVMC and EPS params; -w is emitted only when a custom weight is chosen; the kernel choice selects -t 1 with a single k-mer length (-k) or -t 2 with minimum (-k) and maximum (-K) lengths; the positive and negative FASTA datasets come last. -->
3 <command interpreter="python">scripts/kmersvm_train.py -q -p -s -v $N -C $SVMC -e $EPS
4 #if $weight_type.weight_type_select == "custom"
5 -w $weight_type.weight
6 #end if
7 #if $kernel.kernel_select == "sk"
8 -t 1 -k $kernel.kmerlen_sk
9 #else
10 -t 2 -k $kernel.kmerlen_wsk -K $kernel.kmerlen_wsk2
11 #end if
12 $inputA $inputB
13 </command>
14 <inputs>
15 <param format="fasta" name="inputA" type="data" label="Positives"/> <!-- positive FASTA set; first positional argument of the command -->
16 <param format="fasta" name="inputB" type="data" label="Negatives"/> <!-- negative FASTA set; second positional argument of the command -->
17 <conditional name="kernel"> <!-- the selected kernel decides which k-mer length inputs are shown and which -t value the command passes -->
18 <param name="kernel_select" type="select" label="Kernel Type">
19 <option value="sk">Spectrum Kernel</option>
20 <option value="wsk">Weighted Spectrum Kernel</option>
21 </param>
22 <when value="sk"> <!-- command passes -t 1 and -k kmerlen_sk -->
23 <param name="kmerlen_sk" type="integer" value="6" label="K-mer Length">
24 <validator type="in_range" message="K-mer length must be in range 5-10" min="5" max="10" />
25 </param>
26 </when>
27 <when value="wsk"> <!-- command passes -t 2, -k kmerlen_wsk and -K kmerlen_wsk2; NOTE(review): each length is range-checked on its own, nothing visible here checks that the minimum is not greater than the maximum -->
28 <param name="kmerlen_wsk" type="integer" value="6" label="Minimum K-mer Length">
29 <validator type="in_range" message="K-mer length must be in range 5-10" min="5" max="10" />
30 </param>
31 <param name="kmerlen_wsk2" type="integer" value="8" label="Maximum K-mer Length">
32 <validator type="in_range" message="K-mer length must be in range 5-10" min="5" max="10" />
33 </param>
34 </when>
35 </conditional>
36 <param name="N" type="select" label="N-Fold Cross Validation"> <!-- the chosen value is passed to the script as -v; 5 is preselected -->
37 <option value="3">3</option>
38 <option value="5" selected="true">5</option>
39 <option value="10">10</option>
40 </param>
41 <conditional name="weight_type"> <!-- "custom" exposes a numeric weight that the command passes as -w; "automatic" adds no -w flag to the command -->
42 <param name="weight_type_select" type="select" label="Positive Set Weight">
43 <option value="automatic">Automatic</option>
44 <option value="custom">Custom</option>
45 </param>
<when value="automatic" /> <!-- empty case, so that every select option has a matching when block -->
46 <when value="custom">
47 <param name="weight" type="float" value="1" label="Input The Value of Positive Set Weight" />
48 </when>
49 </conditional>
50 <param name="SVMC" type="integer" value="1" label="Regularization Param C" /> <!-- passed to the script as -C; NOTE(review): declared as integer, so fractional C values cannot be entered; confirm whether kmersvm_train.py accepts a float here -->
51 <param name="EPS" type="float" value="0.00001" label="Precision Param E" /> <!-- passed to the script as -e -->
52 </inputs>
53 <outputs> <!-- both datasets are collected through from_work_dir from fixed file names; presumably kmersvm_train.py writes those files into the job working directory (confirm against the script) -->
54 <data format="tabular" name="SVM_weights" from_work_dir="kmersvm_output_weights.out" label="${tool.name} on ${on_string} : Weights" />
55 <data format="tabular" name="CV_predictions" from_work_dir="kmersvm_output_cvpred.out" label="${tool.name} on ${on_string} : Predictions" />
56 </outputs>
57 <tests> <!-- each output name below must match a data name declared under outputs (SVM_weights, CV_predictions) -->
58 <!--SK-->
59 <test> <!-- spectrum-kernel run with automatic weighting; parameters not listed keep their declared defaults -->
60 <param name="kernel_select" value="sk"/>
61 <param name="inputA" value="test_positive.fa" />
62 <param name="inputB" value="test_negative.fa" />
63 <param name="weight_type_select" value="automatic" />
64 <output name="SVM_weights" file="test_weights.out" compare="re_match" lines_diff="20"/>
65 <output name="CV_predictions" file="train_predictions.out" compare="re_match"/>
66 </test>
67 </tests>
68 <help>
69
70 **Note**
71
72 .. class:: warningmark
73
74 All k-mer lengths must be between 5 and 10 bp, inclusive.
75
76 ----
77
78 **What it does**
79
80 Takes as input 2 FASTA files, 1 of positive sequences and 1 of negative sequences. Produces 2 outputs:
81
82 A) Weights: list of sequences of length K ranked by score and posterior probability for that score.
83
84 B) Predictions: results of N-fold cross validation
85
86 ----
87
88 **Parameters**
89
90 Kernel: 2 choices:
91
92 A) Spectrum Kernel: Analyzes a sequence using strings of length K.
93
94 B) Weighted Spectrum Kernel: Analyzes a sequence using strings with a range of lengths, from the minimum (K1) to the maximum (Kn) k-mer length.
95
96 N-Fold Cross Validation: Number of partitions of training data used for cross validation.
97
98 Weight: Increases importance of positive data (increase if positive sets are very trustworthy or for training with very large negative sequence sets).
99
100 Regularization Parameter (C): Penalty for misclassification. The trade-off is overfitting (high value) versus a high error rate (low value).
101
102 Precision Parameter (E): Insensitivity zone. Affects the precision of the SVM by altering the number of support vectors used.
103
104 ----
105
106 **Example**
107
108 Weights file::
109
110 #parameters:
111 #kernel=1
112 #kmerlen=6
113 #bias=-1.20239998751
114 #A=-1.50821617139
115 #B=-0.110516009177
116 #NOTE: k-mers with large negative weights are also important. They can be found at the bottom of the list.
117 #k-mer revcomp SVM-weight
118 AGGTCA TGACCT 9.32110889151
119 AAGGTC GACCTT 8.22598019901
120 ACCTTG CAAGGT 5.78739494153
121 AGGTCG CGACCT 5.40759311635
122
123 Predictions file::
124
125 mm8_chr1_10212203_10212303_+ 3.31832111466 1 1
126 mm8_chr1_103584748_103584848_+ -0.253869299667 1 3
127 mm8_chr1_105299130_105299230_+ -1.03463560077 1 3
128 mm8_chr1_106367772_106367872_+ 5.36528447025 1 3
129
130 </help>
131 </tool>