Mercurial > repos > xuebing > sharplabtool
comparison tools/stats/lda_analy.xml @ 0:9071e359b9a3
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:37:19 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:9071e359b9a3 |
---|---|
1 <tool id="lda_analy1" name="Perform LDA" version="1.0.1"> | |
2 <description>Linear Discriminant Analysis</description> | |
3 <command interpreter="sh">r_wrapper.sh $script_file</command> | |
4 <inputs> | |
5 <param format="tabular" name="input" type="data" label="Source file"/> | |
6 <param name="cond" size="30" type="integer" value="3" label="Number of principal components" help="See TIP below"> | |
7 <validator type="empty_field" message="Enter a valid number of principal components, see syntax below for examples"/> | |
8 </param> | |
9 | |
10 </inputs> | |
11 <outputs> | |
12 <data format="txt" name="output" /> | |
13 </outputs> | |
14 | |
15 <tests> | |
16 <test> | |
17 <param name="input" value="matrix_generator_for_pc_and_lda_output.tabular"/> | |
18 <output name="output" file="lda_analy_output.txt"/> | |
19 <param name="cond" value="2"/> | |
20 | |
21 </test> | |
22 </tests> | |
23 | |
24 <configfiles> | |
25 <configfile name="script_file"> | |
26 | |
27 rm(list = objects() ) | |
28 | |
29 ############# FORMAT X DATA ######################### | |
30 format<-function(data) { | |
31 ind=NULL | |
32 for(i in 1 : ncol(data)){ | |
33 if (is.na(data[nrow(data),i])) { | |
34 ind<-c(ind,i) | |
35 } | |
36 } | |
37 #print(is.null(ind)) | |
38 if (!is.null(ind)) { | |
39 data<-data[,-c(ind)] | |
40 } | |
41 | |
42 data | |
43 } | |
44 | |
45 ########GET RESPONSES ############################### | |
46 get_resp<- function(data) { | |
47 resp1<-as.vector(data[,ncol(data)]) | |
48 resp=numeric(length(resp1)) | |
49 for (i in 1:length(resp1)) { | |
50 if (resp1[i]=="Y ") { | |
51 resp[i] = 0 | |
52 } | |
53 if (resp1[i]=="X ") { | |
54 resp[i] = 1 | |
55 } | |
56 } | |
57 return(resp) | |
58 } | |
59 | |
60 ######## CHARS TO NUMBERS ########################### | |
61 f_to_numbers<- function(F) { | |
62 ind<-NULL | |
63 G<-matrix(0,nrow(F), ncol(F)) | |
64 for (i in 1:nrow(F)) { | |
65 for (j in 1:ncol(F)) { | |
66 G[i,j]<-as.integer(F[i,j]) | |
67 } | |
68 } | |
69 return(G) | |
70 } | |
71 | |
72 ###################NORMALIZING######################### | |
73 norm <- function(M, a=NULL, b=NULL) { | |
74 C<-NULL | |
75 ind<-NULL | |
76 | |
77 for (i in 1: ncol(M)) { | |
78 if (sd(M[,i])!=0) { | |
79 M[,i]<-(M[,i]-mean(M[,i]))/sd(M[,i]) | |
80 } | |
81 # else {print(mean(M[,i]))} | |
82 } | |
83 return(M) | |
84 } | |
85 | |
86 ##### LDA DIRECTIONS ################################# | |
87 lda_dec <- function(data, k){ | |
88 priors=numeric(k) | |
89 grandmean<-numeric(ncol(data)-1) | |
90 means=matrix(0,k,ncol(data)-1) | |
91 B = matrix(0, ncol(data)-1, ncol(data)-1) | |
92 N=nrow(data) | |
93 for (i in 1:k){ | |
94 priors[i]=sum(data[,1]==i)/N | |
95 grp=subset(data,data\$group==i) | |
96 means[i,]=mean(grp[,2:ncol(data)]) | |
97 #print(means[i,]) | |
98 #print(priors[i]) | |
99 #print(priors[i]*means[i,]) | |
100 grandmean = priors[i]*means[i,] + grandmean | |
101 } | |
102 | |
103 for (i in 1:k) { | |
104 B= B + priors[i]*((means[i,]-grandmean)%*%t(means[i,]-grandmean)) | |
105 } | |
106 | |
107 W = var(data[,2:ncol(data)]) | |
108 svdW = svd(W) | |
109 inv_sqrtW =solve(svdW\$v %*% diag(sqrt(svdW\$d)) %*% t(svdW\$v)) | |
110 B_star= t(inv_sqrtW)%*%B%*%inv_sqrtW | |
111 B_star_decomp = svd(B_star) | |
112 directions = inv_sqrtW%*%B_star_decomp\$v | |
113 return( list(directions, B_star_decomp\$d) ) | |
114 } | |
115 | |
116 ################ NAIVE BAYES FOR 1D SIR OR LDA ############## | |
117 naive_bayes_classifier <- function(resp, tr_data, test_data, k=2, tau) { | |
118 tr_data=data.frame(resp=resp, dir=tr_data) | |
119 means=numeric(k) | |
120 #print(k) | |
121 cl=numeric(k) | |
122 predclass=numeric(length(test_data)) | |
123 for (i in 1:k) { | |
124 grp = subset(tr_data, resp==i) | |
125 means[i] = mean(grp\$dir) | |
126 #print(i, means[i]) | |
127 } | |
128 cutoff = tau*means[1]+(1-tau)*means[2] | |
129 #print(tau) | |
130 #print(means) | |
131 #print(cutoff) | |
132 if (cutoff>means[1]) { | |
133 cl[1]=1 | |
134 cl[2]=2 | |
135 } | |
136 else { | |
137 cl[1]=2 | |
138 cl[2]=1 | |
139 } | |
140 | |
141 for (i in 1:length(test_data)) { | |
142 | |
143 if (test_data[i] <= cutoff) { | |
144 predclass[i] = cl[1] | |
145 } | |
146 else { | |
147 predclass[i] = cl[2] | |
148 } | |
149 } | |
150 #print(means) | |
151 #print(mean(means)) | |
152 #X11() | |
153 #plot(test_data,pch=predclass, col=resp) | |
154 predclass | |
155 } | |
156 | |
157 ################# EXTENDED ERROR RATES ################# | |
158 ext_error_rate <- function(predclass, actualclass,msg=c("you forgot the message"), pr=1) { | |
159 er=sum(predclass != actualclass)/length(predclass) | |
160 | |
161 matr<-data.frame(predclass=predclass,actualclass=actualclass) | |
162 escapes = subset(matr, actualclass==1) | |
163 subjects = subset(matr, actualclass==2) | |
164 er_esc=sum(escapes\$predclass != escapes\$actualclass)/length(escapes\$predclass) | |
165 er_subj=sum(subjects\$predclass != subjects\$actualclass)/length(subjects\$predclass) | |
166 | |
167 if (pr==1) { | |
168 # print(paste(c(msg, 'overall : ', (1-er)*100, "%."),collapse=" ")) | |
169 # print(paste(c(msg, 'within escapes : ', (1-er_esc)*100, "%."),collapse=" ")) | |
170 # print(paste(c(msg, 'within subjects: ', (1-er_subj)*100, "%."),collapse=" ")) | |
171 } | |
172 return(c((1-er)*100, (1-er_esc)*100, (1-er_subj)*100)) | |
173 } | |
174 | |
175 ## Main Function ## | |
176 | |
177 files<-matrix("${input}", 1,1, byrow=T) | |
178 | |
179 d<-"${cond}" # Number of PC | |
180 | |
181 tau<-seq(0,1, by=0.005) | |
182 #tau<-seq(0,1, by=0.1) | |
183 for_curve=matrix(-10, 3,length(tau)) | |
184 | |
185 ############################################################## | |
186 | |
187 test_data_whole_X <-read.delim(files[1,1], row.names=1) | |
188 | |
189 #### FORMAT TRAINING DATA #################################### | |
190 # get only necessary columns | |
191 | |
192 test_data_whole_X<-format(test_data_whole_X) | |
193 oligo_labels<-test_data_whole_X[1:(nrow(test_data_whole_X)-1),ncol(test_data_whole_X)] | |
194 test_data_whole_X<-test_data_whole_X[,1:(ncol(test_data_whole_X)-1)] | |
195 | |
196 X_names<-colnames(test_data_whole_X)[1:ncol(test_data_whole_X)] | |
197 test_data_whole_X<-t(test_data_whole_X) | |
198 resp<-get_resp(test_data_whole_X) | |
199 ldaqda_resp = resp + 1 | |
200 a<-sum(resp) # Number of Subject | |
201 b<-length(resp) - a # Number of Escape | |
202 ## FREQUENCIES ################################################# | |
203 F<-test_data_whole_X[,1:(ncol(test_data_whole_X)-1)] | |
204 F<-f_to_numbers(F) | |
205 FN<-norm(F, a, b) | |
206 ss<-svd(FN) | |
207 eigvar<-NULL | |
208 eig<-ss\$d^2 | |
209 | |
210 for ( i in 1:length(ss\$d)) { | |
211 eigvar[i]<-sum(eig[1:i])/sum(eig) | |
212 } | |
213 | |
214 #print(paste(c("Variance explained : ", eigvar[d]*100, "%"), collapse="")) | |
215 | |
216 Z<-F%*%ss\$v | |
217 | |
218 ldaqda_data <- data.frame(group=ldaqda_resp,Z[,1:d]) | |
219 lda_dir<-lda_dec(ldaqda_data,2) | |
220 train_lda_pred <-Z[,1:d]%*%lda_dir[[1]] | |
221 | |
222 ############# NAIVE BAYES CROSS-VALIDATION ############# | |
223 ### LDA ##### | |
224 | |
225 y<-ldaqda_resp | |
226 X<-F | |
227 cv<-matrix(c(rep('NA',nrow(test_data_whole_X))), nrow(test_data_whole_X), length(tau)) | |
228 for (i in 1:nrow(test_data_whole_X)) { | |
229 # print(i) | |
230 resp<-y[-i] | |
231 p<-matrix(X[-i,], dim(X)[1]-1, dim(X)[2]) | |
232 testdata<-matrix(X[i,],1,dim(X)[2]) | |
233 p1<-norm(p) | |
234 sss<-svd(p1) | |
235 pred<-(p%*%sss\$v)[,1:d] | |
236 test<- (testdata%*%sss\$v)[,1:d] | |
237 lda <- lda_dec(data.frame(group=resp,pred),2) | |
238 pred <- pred[,1:d]%*%lda[[1]][,1] | |
239 test <- test%*%lda[[1]][,1] | |
240 test<-matrix(test, 1, length(test)) | |
241 for (t in 1:length(tau)) { | |
242 cv[i, t] <- naive_bayes_classifier (resp, pred, test,k=2, tau[t]) | |
243 } | |
244 } | |
245 | |
246 for (t in 1:length(tau)) { | |
247 tr_err<-ext_error_rate(cv[,t], ldaqda_resp , c("CV"), 1) | |
248 for_curve[1:3,t]<-tr_err | |
249 } | |
250 | |
251 dput(for_curve, file="${output}") | |
252 | |
253 | |
254 </configfile> | |
255 </configfiles> | |
256 | |
257 <help> | |
258 | |
259 .. class:: infomark | |
260 | |
261 **TIP:** If you want to perform Principal Component Analysis (PCA) on the give numeric input data (which corresponds to the "Source file First in "Generate A Matrix" tool), please use *Multivariate Analysis/Principal Component Analysis* | |
262 | |
263 ----- | |
264 | |
265 .. class:: infomark | |
266 | |
267 **What it does** | |
268 | |
269 This tool consists of the module to perform the Linear Discriminant Analysis as described in Carrel et al., 2006 (PMID: 17009873) | |
270 | |
271 *Carrel L, Park C, Tyekucheva S, Dunn J, Chiaromonte F, et al. (2006) Genomic Environment Predicts Expression Patterns on the Human Inactive X Chromosome. PLoS Genet 2(9): e151. doi:10.1371/journal.pgen.0020151* | |
272 | |
273 ----- | |
274 | |
275 .. class:: warningmark | |
276 | |
277 **Note** | |
278 | |
279 - Output from "Generate A Matrix" tool is used as input file for this tool | |
280 - Output of this tool contains LDA classification success rates for different values of the turning parameter tau (from 0 to 1 with 0.005 interval). This output file will be used to establish the ROC plot, and you can obtain more detail information from this plot. | |
281 | |
282 | |
283 </help> | |
284 | |
285 </tool> |