comparison Intchecks/Script_intensity_check.R @ 0:c2c2e1be904a draft

Uploaded
author melpetera
date Thu, 11 Oct 2018 05:33:19 -0400
parents
children 4973a2104cfd
comparison
equal deleted inserted replaced
-1:000000000000 0:c2c2e1be904a
1 ####################################################################
2 # SCRIPT INTENSITY CHECK
3 # V1: Fold and NA
4 #
5 # Input: Data Matrix, VariableMetadata, SampleMetadata
6 # Output: VariableMetadata, Graphics (barplots and boxplots)
7 #
8 # Dependencies: RcheckLibrary.R
9 #
10 ####################################################################
11
12
13 # Parameters (for dev)
14 if(FALSE){
15
16 rm(list = ls())
17 setwd("Y:\\Developpement\\Intensity check\\Pour tests")
18
19 DM.name <- "DM_NA.tabular"
20 SM.name <- "SM_NA.tabular"
21 VM.name <- "vM_NA.tabular"
22 type <- "One_class"
23 class.col <- "2"
24 class1 <- "Blanks"
25 VM.output <- "new_VM.txt"
26 graphs.output <- "Barplots_and_Boxplots.pdf"
27 }
28
29 intens_check <- function(DM.name, SM.name, VM.name, type, class.col, class1, VM.output, graphs.output){
30 # This function allows to check the intensities considering classes with a fold calculation, the number and
31 # the proportion of NA
32 #
33 # Two options:
34 # - one class (selected by the user) against all the remaining samples ("One_class")
35 # - tests on each class ("Each_class")
36 #
37 # Parameters:
38 # DM.name, SM.name, VM.name: dataMatrix, sampleMetadata, variableMetadata files access
39 # type: "One_class" or "Each_class"
40 # class.col: number of the sampleMetadata's column with classes
41 # class1: name of the class if type="One_class"
42 # VM.output: output file's access (VM with the new columns)
43 # graphs.output: pdf with barplots for the proportion of NA and boxplots with the folds values
44
45
46
47
48 # Input ---------------------------------------------------------
49
50 DM <- read.table(DM.name, header=TRUE, sep="\t", check.names=FALSE)
51 SM <- read.table(SM.name, header=TRUE, sep="\t", check.names=FALSE)
52 VM <- read.table(VM.name, header=TRUE, sep="\t", check.names=FALSE)
53
54
55
56 # Table match check with Rchecklibrary
57 table.check <- match3(DM, SM, VM)
58 check.err(table.check)
59
60
61 rownames(DM) <- DM[,1]
62 var_names <- DM[,1]
63 DM <- DM[,-1]
64 DM <- data.frame(t(DM))
65
66 class.col <- colnames(SM)[as.numeric(class.col)]
67
68 # check class.col, class1 and the number of classes---------------
69
70 if(!(class.col %in% colnames(SM))){
71 stop("\n- - - - - - - - -\n", "The column ",class.col, " is not a part of the specify sample Metadata","\n- - - - - - - - -\n")
72 }
73
74 c_class <- SM[,class.col]
75 c_class <- as.factor(c_class)
76 nb_class <- nlevels(c_class)
77 classnames <- levels(c_class)
78
79
80 if((nb_class > (nrow(SM))/3)){
81 class.err <- c("\n There are too many classes, think about reducing the number of classes and excluding those
82 with few samples \n")
83 cat(class.err)
84 }
85
86 if(type == "One_class"){
87 if(!(class1 %in% classnames)){
88 list.class1 <- c("\n Classes:",classnames,"\n")
89 cat(list.class1)
90 err.class1 <- c("The class ",class1, " does not appear in the column ", class.col)
91 stop("\n- - - - - - - - -\n", err.class1,"\n- - - - - - - - -\n")
92 }
93 }
94
95 #If type is "one_class", change others classes in "other"
96 if(type == "One_class"){
97 for(i in 1:length(c_class)){
98 if(c_class[i]!=class1){
99 c_class <- as.character(c_class)
100 c_class[i] <- "Other"
101 c_class <- as.factor(c_class)
102 nb_class <- nlevels(c_class)
103 classnames <- levels(c_class)
104
105 }
106 }
107 }
108
109 DM <- cbind(DM,c_class)
110
111 # fold -------------------------------------------------------
112 n <- 1
113 fold <- data.frame()
114 for(j in 1:(nb_class-1)){
115 for(k in (j+1):nb_class) {
116 for (i in 1:(length(DM)-1)){
117 fold[i,n] <- mean(DM[which(DM$c_class==classnames[k]),i], na.rm=TRUE)/
118 mean(DM[which(DM$c_class==classnames[j]),i], na.rm=TRUE)}
119 names(fold)[n] <- paste("fold",classnames[k],"VS", classnames[j], sep="_")
120 n <- n + 1}
121 }
122
123 # NA ---------------------------------------------------------
124 calcul_NA <- data.frame()
125 pct_NA <- data.frame()
126 for (i in 1:(length(DM)-1)){
127 for (j in 1:nb_class){
128 n <- 0
129 new_DM <- DM[which(DM$c_class==classnames[j]),i]
130 for(k in 1:length(new_DM)){
131 if (is.na(new_DM[k])){
132 n <- n + 1}
133 calcul_NA[i,j] <- n
134 pct_NA[i,j] <- (calcul_NA[i,j]/length(new_DM))*100}
135 }
136 }
137 names(calcul_NA) <- paste("Nb_NA",classnames, sep="_")
138 names(pct_NA) <- paste("Pct_NA", classnames, sep="_")
139
140 # Alert message if there is no NA
141
142 sumNA <- colSums(calcul_NA)
143 sum_total <- sum(sumNA)
144
145 alerte <- NULL
146 if(sum_total==0){
147 alerte <- c(alerte, "Data Matrix contains no NA.\n")
148 }
149
150 if(length(alerte) != 0){
151 cat(alerte,"\n")
152 }
153
154 table_NA <- cbind(calcul_NA, pct_NA)
155
156 # check columns names ----------------------------------------
157
158 # Fold
159 VM.names <- colnames(VM)
160 fold.names <- colnames(fold)
161
162 for (i in 1:length(VM.names)){
163 for (j in 1:length(fold.names)){
164 if (VM.names[i]==fold.names[j]){
165 fold.names[j] <- paste(fold.names[j],"2", sep="_")
166 }
167 }
168 }
169 colnames(fold) <- fold.names
170
171 # NA
172 NA.names <- colnames(table_NA)
173
174 for (i in 1:length(VM.names)){
175 for (j in 1:length(NA.names)){
176 if (VM.names[i]==NA.names[j]){
177 NA.names[j] <- paste(NA.names[j],"2", sep="_")
178 }
179 }
180 }
181 colnames(table_NA) <- NA.names
182
183 #for NA barplots ---------------------------------------------
184
185 data_bp <- data.frame()
186
187 for (j in 1:ncol(pct_NA)){
188 Nb_NA_0_20 <- 0
189 Nb_NA_20_40 <- 0
190 Nb_NA_40_60 <- 0
191 Nb_NA_60_80 <- 0
192 Nb_NA_80_100 <- 0
193 for (i in 1:nrow(pct_NA)){
194
195 if ((0<=pct_NA[i,j])&(pct_NA[i,j]<20)){
196 Nb_NA_0_20=Nb_NA_0_20+1
197 }
198
199 if ((20<=pct_NA[i,j])&(pct_NA[i,j]<40)){
200 Nb_NA_20_40=Nb_NA_20_40+1}
201
202 if ((40<=pct_NA[i,j])&(pct_NA[i,j]<60)){
203 Nb_NA_40_60=Nb_NA_40_60+1}
204
205 if ((60<=pct_NA[i,j])&(pct_NA[i,j]<80)){
206 Nb_NA_60_80=Nb_NA_60_80+1}
207
208 if ((80<=pct_NA[i,j])&(pct_NA[i,j]<=100)){
209 Nb_NA_80_100=Nb_NA_80_100+1}
210 }
211 data_bp[1,j] <- Nb_NA_0_20
212 data_bp[2,j] <- Nb_NA_20_40
213 data_bp[3,j] <- Nb_NA_40_60
214 data_bp[4,j] <- Nb_NA_60_80
215 data_bp[5,j] <- Nb_NA_80_100
216 }
217 rownames(data_bp) <- c("0-20%", "20-40%", "40-60%", "60-80%", "80-100%")
218 colnames(data_bp) <- classnames
219 data_bp <- as.matrix(data_bp)
220
221
222 # Output--------------------------------------------------------
223
224 VM <- cbind(VM,fold,table_NA)
225
226 write.table(VM, VM.output,sep="\t", quote=FALSE, row.names=FALSE)
227
228 #graphics pdf
229
230 pdf(graphs.output)
231
232 #Boxplots for NA
233
234 par(mar=c(5.1, 4.1, 4.1, 8.1), xpd=TRUE)
235
236 bp=barplot(data_bp, col=rainbow(nrow(data_bp)), main="Proportion of NA", xlab="Classes", ylab="Variables")
237 legend("topright", fill=rainbow(nrow(data_bp)),rownames(data_bp), inset=c(-0.3,0))
238
239
240 stock=0
241 for (i in 1:nrow(data_bp)){
242 text(bp, stock+data_bp[i,]/2, data_bp[i,], col="white")
243 stock <- stock+data_bp[i,]
244 }
245
246
247 #Boxplots for fold test
248 for (j in 1:ncol(fold)){
249 title=paste(fold.names[j])
250 boxplot(fold[j], main=title)
251 }
252
253 dev.off()
254
255 }
256
257
258 # Function call---------------
259
260 #setwd("Y:\\Developpement\\Intensity check\\Pour tests")
261 #intens_check("DM_NA.tabular", "SM_NA.tabular", "VM_NA.tabular", "One_class", "class", "Blanks", "VM_oneclass_NA.txt",
262 #"Barplots_and_Boxplots")
263
264