comparison tools/stats/plot_from_lda.xml @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:9071e359b9a3
1 <tool id="plot_for_lda_output1" name="Draw ROC plot" version="1.0.1">
2 <description>on "Perform LDA" output</description>
3 <command interpreter="sh">r_wrapper.sh $script_file</command>
4
5 <inputs>
6 <param format="txt" name="input" type="data" label="Source file"> </param>
7 <param name="my_title" size="30" type="text" value="My Figure" label="Title of your plot" help="See syntax below"> </param>
8 <param name="X_axis" size="30" type="text" value="Text for X axis" label="Legend of X axis in your plot" help="See syntax below"> </param>
9 <param name="Y_axis" size="30" type="text" value="Text for Y axis" label="Legend of Y axis in your plot" help="See syntax below"> </param>
10 </inputs>
11 <outputs>
12 <data format="pdf" name="pdf_output" />
13 </outputs>
14
15 <tests>
16 <test>
17 <param name="input" value="lda_analy_output.txt"/>
18 <param name="my_title" value="Test Plot1"/>
19 <param name="X_axis" value="Test Plot2"/>
20 <param name="Y_axis" value="Test Plot3"/>
21 <output name="pdf_output" file="plot_for_lda_output.pdf"/>
22 </test>
23 </tests>
24
25 <configfiles>
26 <configfile name="script_file">
27
28 rm(list = objects() )
29
30 ############# FORMAT X DATA #########################
31 format&lt;-function(data) {
32 ind=NULL
33 for(i in 1 : ncol(data)){
34 if (is.na(data[nrow(data),i])) {
35 ind&lt;-c(ind,i)
36 }
37 }
38 #print(is.null(ind))
39 if (!is.null(ind)) {
40 data&lt;-data[,-c(ind)]
41 }
42
43 data
44 }
45
46 ########GET RESPONSES ###############################
47 get_resp&lt;- function(data) {
48 resp1&lt;-as.vector(data[,ncol(data)])
49 resp=numeric(length(resp1))
50 for (i in 1:length(resp1)) {
51 if (resp1[i]=="Control ") {
52 resp[i] = 0
53 }
54 if (resp1[i]=="XLMR ") {
55 resp[i] = 1
56 }
57 }
58 return(resp)
59 }
60
61 ######## CHARS TO NUMBERS ###########################
62 f_to_numbers&lt;- function(F) {
63 ind&lt;-NULL
64 G&lt;-matrix(0,nrow(F), ncol(F))
65 for (i in 1:nrow(F)) {
66 for (j in 1:ncol(F)) {
67 G[i,j]&lt;-as.integer(F[i,j])
68 }
69 }
70 return(G)
71 }
72
73 ###################NORMALIZING#########################
74 norm &lt;- function(M, a=NULL, b=NULL) {
75 C&lt;-NULL
76 ind&lt;-NULL
77
78 for (i in 1: ncol(M)) {
79 if (sd(M[,i])!=0) {
80 M[,i]&lt;-(M[,i]-mean(M[,i]))/sd(M[,i])
81 }
82 # else {print(mean(M[,i]))}
83 }
84 return(M)
85 }
86
87 ##### LDA DIRECTIONS #################################
88 lda_dec &lt;- function(data, k){
89 priors=numeric(k)
90 grandmean&lt;-numeric(ncol(data)-1)
91 means=matrix(0,k,ncol(data)-1)
92 B = matrix(0, ncol(data)-1, ncol(data)-1)
93 N=nrow(data)
94 for (i in 1:k){
95 priors[i]=sum(data[,1]==i)/N
96 grp=subset(data,data\$group==i)
97 means[i,]=mean(grp[,2:ncol(data)])
98 #print(means[i,])
99 #print(priors[i])
100 #print(priors[i]*means[i,])
101 grandmean = priors[i]*means[i,] + grandmean
102 }
103
104 for (i in 1:k) {
105 B= B + priors[i]*((means[i,]-grandmean)%*%t(means[i,]-grandmean))
106 }
107
108 W = var(data[,2:ncol(data)])
109 svdW = svd(W)
110 inv_sqrtW =solve(svdW\$v %*% diag(sqrt(svdW\$d)) %*% t(svdW\$v))
111 B_star= t(inv_sqrtW)%*%B%*%inv_sqrtW
112 B_star_decomp = svd(B_star)
113 directions = inv_sqrtW%*%B_star_decomp\$v
114 return( list(directions, B_star_decomp\$d) )
115 }
116
117 ################ NAIVE BAYES FOR 1D SIR OR LDA ##############
118 naive_bayes_classifier &lt;- function(resp, tr_data, test_data, k=2, tau) {
119 tr_data=data.frame(resp=resp, dir=tr_data)
120 means=numeric(k)
121 #print(k)
122 cl=numeric(k)
123 predclass=numeric(length(test_data))
124 for (i in 1:k) {
125 grp = subset(tr_data, resp==i)
126 means[i] = mean(grp\$dir)
127 #print(i, means[i])
128 }
129 cutoff = tau*means[1]+(1-tau)*means[2]
130 #print(tau)
131 #print(means)
132 #print(cutoff)
133 if (cutoff&gt;means[1]) {
134 cl[1]=1
135 cl[2]=2
136 }
137 else {
138 cl[1]=2
139 cl[2]=1
140 }
141
142 for (i in 1:length(test_data)) {
143
144 if (test_data[i] &lt;= cutoff) {
145 predclass[i] = cl[1]
146 }
147 else {
148 predclass[i] = cl[2]
149 }
150 }
151 #print(means)
152 #print(mean(means))
153 #X11()
154 #plot(test_data,pch=predclass, col=resp)
155 predclass
156 }
157
158 ################# EXTENDED ERROR RATES #################
159 ext_error_rate &lt;- function(predclass, actualclass,msg=c("you forgot the message"), pr=1) {
160 er=sum(predclass != actualclass)/length(predclass)
161
162 matr&lt;-data.frame(predclass=predclass,actualclass=actualclass)
163 escapes = subset(matr, actualclass==1)
164 subjects = subset(matr, actualclass==2)
165 er_esc=sum(escapes\$predclass != escapes\$actualclass)/length(escapes\$predclass)
166 er_subj=sum(subjects\$predclass != subjects\$actualclass)/length(subjects\$predclass)
167
168 if (pr==1) {
169 # print(paste(c(msg, 'overall : ', (1-er)*100, "%."),collapse=" "))
170 # print(paste(c(msg, 'within escapes : ', (1-er_esc)*100, "%."),collapse=" "))
171 # print(paste(c(msg, 'within subjects: ', (1-er_subj)*100, "%."),collapse=" "))
172 }
173 return(c((1-er)*100, (1-er_esc)*100, (1-er_subj)*100))
174 }
175
176 ## Main Function ##
177
178 files_alias&lt;-c("${my_title}")
179 tau=seq(0,1,by=0.005)
180 nfiles=1
181 f = c("${input}")
182
183 rez_ext&lt;-list()
184 for (i in 1:nfiles) {
185 rez_ext[[i]]&lt;-dget(paste(f[i], sep="",collapse=""))
186 }
187
188 tau&lt;-tau[1:(length(tau)-1)]
189 for (i in 1:nfiles) {
190 rez_ext[[i]]&lt;-rez_ext[[i]][,1:(length(tau)-1)]
191 }
192
193 ######## OPTIMAIL TAU ###########################
194
195 #rez_ext
196
197 rate&lt;-c("Optimal tau","Tr total", "Tr Y", "Tr X")
198
199 m_tr&lt;-numeric(nfiles)
200 m_xp22&lt;-numeric(nfiles)
201 m_x&lt;-numeric(nfiles)
202
203 for (i in 1:nfiles) {
204 r&lt;-rez_ext[[i]]
205 #tr
206 # rate&lt;-rbind(rate, c(files_alias[i]," "," "," ") )
207 mm&lt;-which((r[3,])==max(r[3,]))
208
209 m_tr[i]&lt;-mm[1]
210 rate&lt;-rbind(rate,c(tau[m_tr[i]],r[,m_tr[i]]))
211 }
212 print(rate)
213
214 pdf(file= paste("${pdf_output}"))
215
216 plot(rez_ext[[i]][2,]~rez_ext[[i]][3,], xlim=c(0,100), ylim=c(0,100), xlab="${X_axis} [1-FP(False Positive)]", ylab="${Y_axis} [1-FP(False Positive)]", type="l", lty=1, col="blue", xaxt='n', yaxt='n')
217 for (i in 1:nfiles) {
218 lines(rez_ext[[i]][2,]~rez_ext[[i]][3,], xlab="${X_axis} [1-FP(False Positive)]", ylab="${Y_axis} [1-FP(False Positive)]", type="l", lty=1, col=i)
219 # pt=c(r,)
220 points(x=rez_ext[[i]][3,m_tr[i]],y=rez_ext[[i]][2,m_tr[i]], pch=16, col=i)
221 }
222
223
224 title(main="${my_title}", adj=0, cex.main=1.1)
225 axis(2, at=c(0,20,40,60,80,100), labels=c('0','20','40','60','80','100%'))
226 axis(1, at=c(0,20,40,60,80,100), labels=c('0','20','40','60','80','100%'))
227
228 #leg=c("10 kb","50 kb","100 kb")
229 #legend("bottomleft",legend=leg , col=c(1,2,3), lty=c(1,1,1))
230
231 #dev.off()
232
233 </configfile>
234 </configfiles>
235
236
237 <help>
238 .. class:: infomark
239
240 **What it does**
241
242 This tool generates a Receiver Operating Characteristic (ROC) plot that shows LDA classification success rates for different values of the tuning parameter tau as Figure 3 in Carrel et al., 2006 (PMID: 17009873).
243
244 *Carrel L, Park C, Tyekucheva S, Dunn J, Chiaromonte F, et al. (2006) Genomic Environment Predicts Expression Patterns on the Human Inactive X Chromosome. PLoS Genet 2(9): e151. doi:10.1371/journal.pgen.0020151*
245
246 -----
247
248 .. class:: warningmark
249
250 **Note**
251
252 - Output from "Perform LDA" tool is used as input file for this tool.
253
254 </help>
255
256
257
258 </tool>