comparison quantp.r @ 0:75faf9a89f5b draft

planemo upload commit a0e968c7bd2b6f7b963baeecb08f3a39e50f52d6
author galaxyp
date Fri, 14 Sep 2018 12:22:31 -0400
parents
children bcc7a4c4cc29
comparison
equal deleted inserted replaced
-1:000000000000 0:75faf9a89f5b
1 #***************************************************************************************************************************************
2 # Functions: Start
3 #***************************************************************************************************************************************
4
5 #===============================================================================
6 # PCA
7 #===============================================================================
8 multisample_PCA = function(df, sampleinfo_df, outfile)
9 {
10 tempdf = df[,-1];
11 tempcol = colnames(tempdf);
12 tempgrp = sampleinfo_df[tempcol,2];
13 tempdf = t(tempdf) %>% as.data.frame();
14 tempdf[is.na(tempdf)] = 0;
15 tempdf$Group = tempgrp;
16 png(outfile, width = 6, height = 6, units = 'in', res=300);
17 # bitmap(outfile, "png16m");
18 g = autoplot(prcomp(select(tempdf, -Group)), data = tempdf, colour = 'Group', size=3);
19 plot(g);
20 dev.off();
21 }
22
23 #===============================================================================
24 # Regression and Cook's distance
25 #===============================================================================
26 singlesample_regression = function(PE_TE_data,htmloutfile, append=TRUE)
27 {
28 rownames(PE_TE_data) = PE_TE_data$PE_ID;
29 regmodel = lm(PE_abundance~TE_abundance, data=PE_TE_data);
30 regmodel_summary = summary(regmodel);
31
32 cat("<font><h3>Linear Regression model fit between Proteome and Transcriptome data</h3></font>\n",
33 "<p>Assuming a linear relationship between Proteome and Transcriptome data, we here fit a linear regression model.</p>\n",
34 '<table border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#7a0019"><th><font color=#ffcc33>Parameter</font></th><th><font color=#ffcc33>Value</font></th></tr>\n',
35 file = htmloutfile, append = TRUE);
36
37 cat("<tr><td>Formula</td><td>","PE_abundance~TE_abundance","</td></tr>\n",
38 "<tr><td colspan='2' align='center'> <b>Coefficients</b></td>","</tr>\n",
39 "<tr><td>",names(regmodel$coefficients[1]),"</td><td>",regmodel$coefficients[1]," (Pvalue:", regmodel_summary$coefficients[1,4],")","</td></tr>\n",
40 "<tr><td>",names(regmodel$coefficients[2]),"</td><td>",regmodel$coefficients[2]," (Pvalue:", regmodel_summary$coefficients[2,4],")","</td></tr>\n",
41 "<tr><td colspan='2' align='center'> <b>Model parameters</b></td>","</tr>\n",
42 "<tr><td>Residual standard error</td><td>",regmodel_summary$sigma," (",regmodel_summary$df[2]," degree of freedom)</td></tr>\n",
43 "<tr><td>F-statistic</td><td>",regmodel_summary$fstatistic[1]," ( on ",regmodel_summary$fstatistic[2]," and ",regmodel_summary$fstatistic[3]," degree of freedom)</td></tr>\n",
44 "<tr><td>R-squared</td><td>",regmodel_summary$r.squared,"</td></tr>\n",
45 "<tr><td>Adjusted R-squared</td><td>",regmodel_summary$adj.r.squared,"</td></tr>\n",
46 file = htmloutfile, append = TRUE);
47
48 cat("</table>\n", file = htmloutfile, append = TRUE);
49
50 cat(
51 "<font color='#ff0000'><h3>Regression and diagnostics plots</h3></font>\n",
52 file = htmloutfile, append = TRUE);
53
54 outplot = paste(outdir,"/PE_TE_lm_1.png",sep="",collapse="");
55 png(outplot, width = 10, height = 10, units = 'in',res=300);
56 # bitmap(outplot, "png16m");
57 par(mfrow=c(1,1));
58 plot(regmodel, 1, cex.lab=1.5);
59 dev.off();
60
61 outplot = paste(outdir,"/PE_TE_lm_2.png",sep="",collapse="");
62 png(outplot,width = 10, height = 10, units = 'in', res=300);
63 # bitmap(outplot, "png16m");
64 par(mfrow=c(1,1));
65 plot(regmodel, 2, cex.lab=1.5);
66 dev.off();
67
68 outplot = paste(outdir,"/PE_TE_lm_5.png",sep="",collapse="");
69 png(outplot, width = 10, height = 10, units = 'in',res=300);
70 # bitmap(outplot, "png16m");
71 par(mfrow=c(1,1));
72 plot(regmodel, 5, cex.lab=1.5);
73 dev.off();
74
75 cat('<table border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; ">', file = htmloutfile, append = TRUE);
76
77 cat(
78 '<tr bgcolor="#7a0019"><th>', "<font color='#ffcc33'><h4>1) <u>Residuals vs Fitted plot</h4></font></u></th>\n",
79 '<th><font color=#ffcc33><h4>2) <u>Normal Q-Q plot of residuals</h4></font></u></th></tr>\n',
80 file = htmloutfile, append = TRUE);
81
82 cat(
83 '<tr><td align=center><img src="PE_TE_lm_1.png" width=600 height=600></td><td align=center><img src="PE_TE_lm_2.png" width=600 height=600></td></tr>\n',
84 file = htmloutfile, append = TRUE);
85
86 cat(
87 '<tr><td align=center>This plot checks for linear relationship assumptions.<br>If a horizontal line is observed without any distinct patterns, it indicates a linear relationship.</td>\n',
88 '<td align=center>This plot checks whether residuals are normally distributed or not.<br>It is good if the residuals points follow the straight dashed line i.e., do not deviate much from dashed line.</td></tr></table>\n',
89 file = htmloutfile, append = TRUE);
90
91
92 #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
93 # Residuals data
94 #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
95 res_all = regmodel$residuals;
96 res_mean = mean(res_all);
97 res_sd = sd(res_all);
98 res_diff = (res_all-res_mean);
99 res_zscore = res_diff/res_sd;
100 # res_outliers = res_all[which((res_zscore > 2)|(res_zscore < -2))]
101
102
103 tempind = which((res_zscore > 2)|(res_zscore < -2));
104 res_PE_TE_data_no_outlier = PE_TE_data[-tempind,];
105 res_PE_TE_data_no_outlier$residuals = res_all[-tempind];
106 res_PE_TE_data_outlier = PE_TE_data[tempind,];
107 res_PE_TE_data_outlier$residuals = res_all[tempind];
108
109 # Save the complete table for download (influential_observations)
110 temp_outlier_data = data.frame(res_PE_TE_data_outlier$PE_ID, res_PE_TE_data_outlier$TE_abundance, res_PE_TE_data_outlier$PE_abundance, res_PE_TE_data_outlier$residuals)
111 colnames(temp_outlier_data) = c("Gene", "Transcript abundance", "Protein abundance", "Residual value")
112 outdatafile = paste(outdir,"/PE_TE_outliers_residuals.txt", sep="", collapse="");
113 write.table(temp_outlier_data, file=outdatafile, row.names=F, sep="\t", quote=F);
114
115
116 # Save the complete table for download (non influential_observations)
117 temp_all_data = data.frame(PE_TE_data$PE_ID, PE_TE_data$TE_abundance, PE_TE_data$PE_abundance, res_all)
118 colnames(temp_all_data) = c("Gene", "Transcript abundance", "Protein abundance", "Residual value")
119 outdatafile = paste(outdir,"/PE_TE_abundance_residuals.txt", sep="", collapse="");
120 write.table(temp_all_data, file=outdatafile, row.names=F, sep="\t", quote=F);
121
122
123 cat('<br><h2 id="inf_obs"><font color=#ff0000>Outliers based on the residuals from regression analysis</font></h2>\n',
124 file = htmloutfile, append = TRUE);
125 cat('<table border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; ">\n',
126 '<tr bgcolor="#7a0019"><th colspan=2><font color=#ffcc33>Residuals from Regression</font></th></tr>\n',
127 '<tr bgcolor="#7a0019"><th><font color=#ffcc33>Parameter</font></th><th><font color=#ffcc33>Value</font></th></tr>\n',
128 file = htmloutfile, append = TRUE);
129
130 cat("<tr><td>Mean Residual value</td><td>",res_mean,"</td></tr>\n",
131 "<tr><td>Standard deviation (Residuals)</td><td>",res_sd,"</td></tr>\n",
132 '<tr><td>Total outliers (Residual value > 2 standard deviation from the mean)</td><td>',length(tempind),' <font size=4>(<b><a href=PE_TE_outliers_residuals.txt target="_blank">Download these ',length(tempind),' data points with high residual values here</a></b>)</font></td>\n',
133 '<tr><td colspan=2 align=center><font size=4>(<b><a href=PE_TE_abundance_residuals.txt target="_blank">Download the complete residuals data here</a></b>)</font></td></td>\n',
134 "</table><br><br>\n",
135 file = htmloutfile, append = TRUE);
136
137 #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
138
139
140 cat('<br><br><table border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; ">', file = htmloutfile, append = TRUE);
141
142 cat(
143 '<tr bgcolor="#7a0019"><th><font color=#ffcc33><h4>3) <u>Residuals vs Leverage plot</h4></font></u></th></tr>\n',
144 file = htmloutfile, append = TRUE);
145
146 cat(
147 '<tr><td align=center><img src="PE_TE_lm_5.png" width=600 height=600></td></tr>\n',
148 file = htmloutfile, append = TRUE);
149
150 cat(
151 '<tr><td align=center>This plot is useful to identify any influential cases, that is outliers or extreme values.<br>They might influence the regression results upon inclusion or exclusion from the analysis.</td></tr></table><br>\n',
152 file = htmloutfile, append = TRUE);
153
154
155
156 #^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
157 # Cook's Distance
158 #^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
159 cat('<hr/><h2 id="inf_obs"><font color=#ff0000>INFLUENTIAL OBSERVATIONS</font></h2>\n',
160 file = htmloutfile, append = TRUE);
161 cat(
162 '<p><b>Cook\'s distance</b> computes the influence of each data point/observation on the predicted outcome. i.e. this measures how much the observation is influencing the fitted values.<br>In general use, those observations that have a <b>Cook\'s distance > than ', cookdist_upper_cutoff,' times the mean</b> may be classified as <b>influential.</b></p>\n',
163 file = htmloutfile, append = TRUE);
164
165 cooksd <- cooks.distance(regmodel);
166
167 outplot = paste(outdir,"/PE_TE_lm_cooksd.png",sep="",collapse="");
168 png(outplot, width = 10, height = 10, units = 'in', res=300);
169 # bitmap(outplot, "png16m");
170 par(mfrow=c(1,1));
171 plot(cooksd, main="Influential Obs. by Cook\'s distance", ylab="Cook\'s distance", xlab="Observations", type="n") # plot cooks distance
172 sel_outlier=which(cooksd>=as.numeric(cookdist_upper_cutoff)*mean(cooksd, na.rm=T))
173 sel_nonoutlier=which(cooksd<as.numeric(cookdist_upper_cutoff)*mean(cooksd, na.rm=T))
174 points(sel_outlier, cooksd[sel_outlier],pch="*", cex=2, cex.lab=1.5, col="red")
175 points(sel_nonoutlier, cooksd[sel_nonoutlier],pch="*", cex=2, cex.lab=1.5, col="black")
176 abline(h = as.numeric(cookdist_upper_cutoff)*mean(cooksd, na.rm=T), col="red") # add cutoff line
177 #text(x=1:length(cooksd)+1, y=cooksd, labels=ifelse(cooksd>as.numeric(cookdist_upper_cutoff)*mean(cooksd, na.rm=T),names(cooksd),""), col="red", pos=2) # add labels
178 dev.off();
179
180 cat(
181 '<img src="PE_TE_lm_cooksd.png" width=800 height=800>',
182 '<br>In the above plot, observations above red line (',cookdist_upper_cutoff,' * mean Cook\'s distance) are influential. Genes that are outliers could be important. These observations influences the correlation values and regression coefficients<br><br>',
183 file = htmloutfile, append = TRUE);
184
185 tempind = which(cooksd>as.numeric(cookdist_upper_cutoff)*mean(cooksd, na.rm=T));
186 PE_TE_data_no_outlier = PE_TE_data[-tempind,];
187 PE_TE_data_no_outlier$cooksd = cooksd[-tempind];
188 PE_TE_data_outlier = PE_TE_data[tempind,];
189 PE_TE_data_outlier$cooksd = cooksd[tempind];
190 a = sort(PE_TE_data_outlier$cooksd, decreasing=T, index.return=T);
191 PE_TE_data_outlier_sorted = PE_TE_data_outlier[a$ix,];
192
193 cat(
194 '<table border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#7a0019"><th><font color=#ffcc33>Parameter</font></th><th><font color=#ffcc33>Value</font></th></tr>\n',
195 file = htmloutfile, append = TRUE);
196
197 # Save the complete table for download (influential_observations)
198 temp_outlier_data = data.frame(PE_TE_data_outlier$PE_ID, PE_TE_data_outlier$TE_abundance, PE_TE_data_outlier$PE_abundance, PE_TE_data_outlier$cooksd)
199 colnames(temp_outlier_data) = c("Gene", "Transcript abundance", "Protein abundance", "Cook's distance")
200 outdatafile = paste(outdir,"/PE_TE_influential_observation.txt", sep="", collapse="");
201 write.table(temp_outlier_data, file=outdatafile, row.names=F, sep="\t", quote=F);
202
203
204 # Save the complete table for download (non influential_observations)
205 temp_no_outlier_data = data.frame(PE_TE_data_no_outlier$PE_ID, PE_TE_data_no_outlier$TE_abundance, PE_TE_data_no_outlier$PE_abundance, PE_TE_data_no_outlier$cooksd)
206 colnames(temp_no_outlier_data) = c("Gene", "Transcript abundance", "Protein abundance", "Cook's distance")
207 outdatafile = paste(outdir,"/PE_TE_non_influential_observation.txt", sep="", collapse="");
208 write.table(temp_no_outlier_data, file=outdatafile, row.names=F, sep="\t", quote=F);
209
210
211 cat("<tr><td>Mean Cook\'s distance</td><td>",mean(cooksd, na.rm=T),"</td></tr>\n",
212 "<tr><td>Total influential observations (Cook\'s distance > ",cookdist_upper_cutoff," * mean Cook\'s distance)</td><td>",length(tempind),"</td>\n",
213
214 "<tr><td>Observations with Cook\'s distance < ",cookdist_upper_cutoff," * mean Cook\'s distance</td><td>",length(which(cooksd<as.numeric(cookdist_upper_cutoff)*mean(cooksd, na.rm=T))),"</td>\n",
215 "</table><br><br>\n",
216 file = htmloutfile, append = TRUE);
217
218
219 #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
220 # Scatter plot after removal of influential points
221 #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
222 outplot = paste(outdir,"/AbundancePlot_scatter_without_outliers.png",sep="",collapse="");
223 min_lim = min(c(PE_TE_data$PE_abundance,PE_TE_data$TE_abundance));
224 max_lim = max(c(PE_TE_data$PE_abundance,PE_TE_data$TE_abundance));
225 png(outplot, width = 10, height = 10, units = 'in', res=300);
226 # bitmap(outplot,"png16m");
227 g = ggplot(PE_TE_data_no_outlier, aes(x=TE_abundance, y=PE_abundance))+geom_point() + geom_smooth() + xlab("Transcript abundance log fold-change") + ylab("Protein abundance log fold-change") + xlim(min_lim,max_lim) + ylim(min_lim,max_lim);
228 suppressMessages(plot(g));
229 dev.off();
230
231 cat('<table border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#7a0019"><th><font color=#ffcc33>Scatterplot: Before removal</font></th><th><font color=#ffcc33>Scatterplot: After removal</font></th></tr>\n', file = htmloutfile, append = TRUE);
232 # Before
233 cat("<tr><td align=center><!--<font color='#ff0000'><h3>Scatter plot between Proteome and Transcriptome Abundance</h3></font>\n-->", '<img src="TE_PE_scatter.png" width=600 height=600></td>\n', file = htmloutfile, append = TRUE);
234
235 # After
236 cat("<td align=center>\n",
237 '<img src="AbundancePlot_scatter_without_outliers.png" width=600 height=600></td></tr>\n',
238 file = htmloutfile, append = TRUE);
239 #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
240
241
242 cor_result_pearson = cor.test(PE_TE_data_no_outlier[,"TE_abundance"], PE_TE_data_no_outlier[,"PE_abundance"], method = "pearson");
243 cor_result_spearman = cor.test(PE_TE_data_no_outlier[,"TE_abundance"], PE_TE_data_no_outlier[,"PE_abundance"], method = "spearman");
244 cor_result_kendall = cor.test(PE_TE_data_no_outlier[,"TE_abundance"], PE_TE_data_no_outlier[,"PE_abundance"], method = "kendall");
245
246 cat('<tr><td>\n', file = htmloutfile, append=TRUE);
247 singlesample_cor(PE_TE_data, htmloutfile, append=TRUE);
248 cat('</td>\n', file = htmloutfile, append=TRUE);
249
250
251 cat('<td><table border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#7a0019"><th><font color=#ffcc33>Parameter</font></th><th><font color=#ffcc33>Method 1</font></th><th><font color=#ffcc33>Method 2</font></th><th><font color=#ffcc33>Method 3</font></th></tr>\n',
252 file = htmloutfile, append = TRUE);
253
254 cat(
255 "<tr><td>Correlation method</td><td>",cor_result_pearson$method,"</td><td>",cor_result_spearman$method,"</td><td>",cor_result_kendall$method,"</td></tr>\n",
256 "<tr><td>Correlation coefficient</td><td>",cor_result_pearson$estimate,"</td><td>",cor_result_spearman$estimate,"</td><td>",cor_result_kendall$estimate,"</td></tr>\n",
257 file = htmloutfile, append = TRUE)
258 cat("</table></td></tr></table>\n", file = htmloutfile, append = TRUE)
259
260
261
262 if(dim(PE_TE_data_outlier)[1]<10)
263 {
264 tab_n_row = dim(PE_TE_data_outlier)[1];
265 }else{
266 tab_n_row = 10;
267 }
268
269 cat("<br><br><font size=5><b><a href='PE_TE_influential_observation.txt' target='_blank'>Download the complete list of influential observations</a></b></font>&nbsp;&nbsp;&nbsp;&nbsp;",
270 "<font size=5><b><a href='PE_TE_non_influential_observation.txt' target='_blank'>Download the complete list (After removing influential points)</a></b></font><br>\n",
271 '<br><font color="brown"><h4>Top ',as.character(tab_n_row),' Influential observations (Cook\'s distance > ',cookdist_upper_cutoff,' * mean Cook\'s distance)</h4></font>\n',
272 file = htmloutfile, append = TRUE);
273
274 cat('<table border=1 cellspacing=0 cellpadding=5> <tr bgcolor="#7a0019">\n', sep = "",file = htmloutfile, append = TRUE);
275 cat("<th><font color=#ffcc33>Gene</font></th><th><font color=#ffcc33>Protein Log Fold-Change</font></th><th><font color=#ffcc33>Transcript Log Fold-Change</font></th><th><font color=#ffcc33>Cook's Distance</font></th></tr>\n",
276 file = htmloutfile, append = TRUE);
277
278
279 for(i in 1:tab_n_row)
280 {
281 cat(
282 '<tr>','<td>',as.character(PE_TE_data_outlier_sorted[i,1]),'</td>\n',
283 '<td>',format(PE_TE_data_outlier_sorted[i,2], scientific=F),'</td>\n',
284 '<td>',PE_TE_data_outlier_sorted[i,4],'</td>\n',
285 '<td>',format(PE_TE_data_outlier_sorted[i,5], scientific=F),'</td></tr>\n',
286 file = htmloutfile, append = TRUE);
287 }
288 cat('</table><br><br>\n',file = htmloutfile, append = TRUE);
289
290
291 }
292
293
294
295 #===============================================================================
296 # Heatmap
297 #===============================================================================
298 singlesample_heatmap=function(PE_TE_data, htmloutfile, hm_nclust){
299 cat('<br><table border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#7a0019"><th><font color=#ffcc33>Heatmap of PE and TE abundance values (Hierarchical clustering)</font></th><th><font color=#ffcc33>Number of clusters to extract: ',hm_nclust,'</font></th></tr>\n',
300 file = htmloutfile, append = TRUE);
301
302 hc=hclust(dist(as.matrix(PE_TE_data[,c("PE_abundance","TE_abundance")])))
303 hm_cluster = cutree(hc,k=hm_nclust);
304
305 outplot = paste(outdir,"/PE_TE_heatmap.png",sep="",collapse="");
306 png(outplot, width = 10, height = 10, units = 'in', res=300);
307 # bitmap(outplot, "png16m");
308 par(mfrow=c(1,1));
309 hmap = heatmap.2(as.matrix(PE_TE_data[,c("PE_abundance","TE_abundance")]), trace="none", cexCol=1, col=greenred(100),Colv=F, labCol=c("Proteins","Transcripts"), scale="col", hclustfun = hclust, distfun = dist);
310 dev.off();
311
312 cat('<tr><td align=center colspan="2"><img src="PE_TE_heatmap.png" width=800 height=800></td></tr>\n',
313 file = htmloutfile, append = TRUE);
314
315
316 temp_PE_TE_data = data.frame(PE_TE_data$PE_ID, PE_TE_data$TE_abundance, PE_TE_data$PE_abundance, hm_cluster);
317 colnames(temp_PE_TE_data) = c("Gene", "Transcript abundance", "Protein abundance", "Cluster (Hierarchical clustering)")
318 tempoutfile = paste(outdir,"/PE_TE_hc_clusterpoints.txt",sep="",collapse="");
319 write.table(temp_PE_TE_data, file=tempoutfile, row.names=F, quote=F, sep="\t", eol="\n")
320
321
322 cat('<tr><td colspan="2" align=center><font size=5><a href="PE_TE_hc_clusterpoints.txt" target="_blank"><b>Download the hierarchical cluster list</b></a></font></td></tr></table>\n',
323 file = htmloutfile, append = TRUE);
324 }
325
326
327 #===============================================================================
328 # K-means clustering
329 #===============================================================================
330 singlesample_kmeans=function(PE_TE_data, htmloutfile, nclust){
331 PE_TE_data_kdata = PE_TE_data;
332 k1 = kmeans(PE_TE_data_kdata[,c("PE_abundance","TE_abundance")], nclust);
333 outplot = paste(outdir,"/PE_TE_kmeans.png",sep="",collapse="");
334 png(outplot, width = 10, height = 10, units = 'in', res=300);
335 # bitmap(outplot, "png16m");
336 par(mfrow=c(1,1));
337 scatter.smooth(PE_TE_data_kdata[,"TE_abundance"], PE_TE_data_kdata[,"PE_abundance"], xlab="Transcript Abundance", ylab="Protein Abundance", cex.lab=1.5);
338 legend(1, 95, legend=c("Cluster 1", "Line 2"), col="red", lty=1:1, cex=0.8)
339 legend(1, 95, legend="Cluster 2", col="green", lty=1:1, cex=0.8)
340
341
342 ind=which(k1$cluster==1);
343 points(PE_TE_data_kdata[ind,"TE_abundance"], PE_TE_data_kdata[ind,"PE_abundance"], col="red", pch=16);
344 ind=which(k1$cluster==2);
345 points(PE_TE_data_kdata[ind,"TE_abundance"], PE_TE_data_kdata[ind,"PE_abundance"], col="green", pch=16);
346 ind=which(k1$cluster==3);
347 points(PE_TE_data_kdata[ind,"TE_abundance"], PE_TE_data_kdata[ind,"PE_abundance"], col="blue", pch=16);
348 ind=which(k1$cluster==4);
349 points(PE_TE_data_kdata[ind,"TE_abundance"], PE_TE_data_kdata[ind,"PE_abundance"], col="cyan", pch=16);
350 ind=which(k1$cluster==5);
351 points(PE_TE_data_kdata[ind,"TE_abundance"], PE_TE_data_kdata[ind,"PE_abundance"], col="black", pch=16);
352 ind=which(k1$cluster==6);
353 points(PE_TE_data_kdata[ind,"TE_abundance"], PE_TE_data_kdata[ind,"PE_abundance"], col="brown", pch=16);
354 ind=which(k1$cluster==7);
355 points(PE_TE_data_kdata[ind,"TE_abundance"], PE_TE_data_kdata[ind,"PE_abundance"], col="gold", pch=16);
356 ind=which(k1$cluster==8);
357 points(PE_TE_data_kdata[ind,"TE_abundance"], PE_TE_data_kdata[ind,"PE_abundance"], col="thistle", pch=16);
358 ind=which(k1$cluster==9);
359 points(PE_TE_data_kdata[ind,"TE_abundance"], PE_TE_data_kdata[ind,"PE_abundance"], col="yellow", pch=16);
360 ind=which(k1$cluster==10);
361 points(PE_TE_data_kdata[ind,"TE_abundance"], PE_TE_data_kdata[ind,"PE_abundance"], col="orange", pch=16);
362 dev.off();
363
364 cat('<br><br><table border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#7a0019"><th><font color=#ffcc33>K-mean clustering</font></th><th><font color=#ffcc33>Number of clusters: ',nclust,'</font></th></tr>\n',
365 file = htmloutfile, append = TRUE);
366
367 tempind = order(k1$cluster);
368 tempoutfile = paste(outdir,"/PE_TE_kmeans_clusterpoints.txt",sep="",collapse="");
369 write.table(data.frame(PE_TE_data_kdata[tempind, ], Cluster=k1$cluster[tempind]), file=tempoutfile, row.names=F, quote=F, sep="\t", eol="\n")
370
371
372 cat('<tr><td colspan="2" align=center><img src="PE_TE_kmeans.png" width=800 height=800></td></tr>\n',
373 file = htmloutfile, append = TRUE);
374 cat('<tr><td colspan="2" align=center><font size=5><a href="PE_TE_kmeans_clusterpoints.txt" target="_blank"><b>Download the cluster list</b></a></font></td></tr></table><br><hr/>\n',
375 file = htmloutfile, append = TRUE);
376
377 }
378
379 #===============================================================================
380 # scatter plot
381 #===============================================================================
382 singlesample_scatter = function(PE_TE_data, outfile)
383 {
384 min_lim = min(c(PE_TE_data$PE_abundance,PE_TE_data$TE_abundance));
385 max_lim = max(c(PE_TE_data$PE_abundance,PE_TE_data$TE_abundance));
386 png(outfile, width = 10, height = 10, units = 'in', res=300);
387 # bitmap(outfile, "png16m");
388 g = ggplot(PE_TE_data, aes(x=TE_abundance, y=PE_abundance))+geom_point() + geom_smooth() + xlab("Transcript abundance log fold-change") + ylab("Protein abundance log fold-change") + xlim(min_lim,max_lim) + ylim(min_lim,max_lim);
389 suppressMessages(plot(g));
390 # plot(g);
391 dev.off();
392 }
393
394 #===============================================================================
395 # Correlation table
396 #===============================================================================
397 singlesample_cor = function(PE_TE_data, htmloutfile, append=TRUE)
398 {
399 cor_result_pearson = cor.test(PE_TE_data$TE_abundance, PE_TE_data$PE_abundance, method = "pearson");
400 cor_result_spearman = cor.test(PE_TE_data$TE_abundance, PE_TE_data$PE_abundance, method = "spearman");
401 cor_result_kendall = cor.test(PE_TE_data$TE_abundance, PE_TE_data$PE_abundance, method = "kendall");
402
403 cat(
404 '<table border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#7a0019"><th><font color=#ffcc33>Parameter</font></th><th><font color=#ffcc33>Method 1</font></th><th><font color=#ffcc33>Method 2</font></th><th><font color=#ffcc33>Method 3</font></th></tr>\n',
405 file = htmloutfile, append = TRUE);
406
407 cat(
408 "<tr><td>Correlation method</td><td>",cor_result_pearson$method,"</td><td>",cor_result_spearman$method,"</td><td>",cor_result_kendall$method,"</td></tr>\n",
409 "<tr><td>Correlation coefficient</td><td>",cor_result_pearson$estimate,"</td><td>",cor_result_spearman$estimate,"</td><td>",cor_result_kendall$estimate,"</td></tr>\n",
410 file = htmloutfile, append = TRUE)
411 cat("</table>\n", file = htmloutfile, append = TRUE);
412
413 }
414
415 #===============================================================================
416 # Boxplot
417 #===============================================================================
418 multisample_boxplot = function(df, sampleinfo_df, outfile, fill_leg, user_xlab, user_ylab)
419 {
420 tempdf = df[,-1, drop=FALSE];
421 tempdf = t(tempdf) %>% as.data.frame();
422 tempdf[is.na(tempdf)] = 0;
423 tempdf$Sample = rownames(tempdf);
424 tempdf1 = melt(tempdf, id.vars = "Sample");
425 tempdf1$Group = sampleinfo_df[tempdf1$Sample,2];
426 png(outplot, width = 6, height = 6, units = 'in', res=300);
427 # bitmap(outplot, "png16m");
428 if(fill_leg=="Yes")
429 {
430 g = ggplot(tempdf1, aes(x=Sample, y=value, fill=Group)) + geom_boxplot() + labs(x=user_xlab) + labs(y=user_ylab)
431 }else{
432 if(fill_leg=="No")
433 {
434 tempdf1$Group = c("case", "control")
435 g = ggplot(tempdf1, aes(x=Sample, y=value, fill=Group)) + geom_boxplot() + labs(x=user_xlab) + labs(y=user_ylab)
436 }
437 }
438 plot(g);
439 dev.off();
440 }
441
442
443 #===============================================================================
444 # Mean or Median of Replicates
445 #===============================================================================
446
447 mergeReplicates = function(TE_df,PE_df, sampleinfo_df, method)
448 {
449 grps = unique(sampleinfo_df[,2]);
450
451 TE_df_merged <<- sapply(grps, function(x){
452 tempsample = sampleinfo_df[which(sampleinfo_df$Group==x),1]
453 if(length(tempsample)!=1){
454 apply(TE_df[,tempsample],1,method);
455 }else{
456 return(TE_df[,tempsample]);
457 }
458 });
459 TE_df_merged <<- data.frame(as.character(TE_df[,1]), TE_df_merged);
460 colnames(TE_df_merged) = c(colnames(TE_df)[1], grps);
461
462 PE_df_merged <<- sapply(grps, function(x){
463 tempsample = sampleinfo_df[which(sampleinfo_df$Group==x),1]
464 if(length(tempsample)!=1){
465 apply(PE_df[,tempsample],1,method);
466 }else{
467 return(PE_df[,tempsample]);
468 }
469 });
470
471 PE_df_merged <<- data.frame(as.character(PE_df[,1]), PE_df_merged);
472 colnames(PE_df_merged) = c(colnames(PE_df)[1], grps);
473
474 #sampleinfo_df_merged = data.frame(Sample = grps, Group = grps, stringsAsFactors = F);
475 sampleinfo_df_merged = data.frame(Sample = grps, Group = "Group", stringsAsFactors = F);
476
477 return(list(TE_df_merged = TE_df_merged, PE_df_merged = PE_df_merged, sampleinfo_df_merged = sampleinfo_df_merged));
478 }
479
480 #===============================================================================
481 # (T-Test or Wilcoxon ranksum test) and Volcano Plot
482 #===============================================================================
483
484 perform_Test_Volcano = function(TE_df_data,PE_df_data,TE_df_logfold, PE_df_logfold,sampleinfo_df, method, correction_method,volc_with)
485 {
486
487 PE_colnames = colnames(PE_df_data);
488 control_sample = sampleinfo_df[which(sampleinfo_df$Group=="control"),1];
489 control_ind <<- sapply(control_sample, function(x){temp_ind = which(PE_colnames==x); as.numeric(temp_ind)});
490 condition_sample = sampleinfo_df[which(sampleinfo_df$Group=="case"),1];
491 condition_ind <<- sapply(condition_sample, function(x){temp_ind = which(PE_colnames==x); as.numeric(temp_ind)});
492
493 if(method=="mean"){
494 #PE_pval = apply(PE_df_data[2:length(colnames(PE_df_data))],1,function(x) t.test(x[condition_ind-1], x[control_ind-1])$p.value);
495 PE_pval = apply(PE_df_data[2:length(colnames(PE_df_data))],1,function(x) {obj<-try(t.test(x[condition_ind-1], x[control_ind-1]),silent=TRUE); if(is(obj, "try-error")){return(NA)}else{return(obj$p.value)}})
496 }else{
497 if(method=="median"){
498 PE_pval = apply(PE_df_data[2:length(colnames(PE_df_data))],1,function(x) {obj<-try(wilcox.test(x[condition_ind-1], x[control_ind-1]),silent=TRUE); if(is(obj, "try-error")){return(NA)}else{return(obj$p.value)}})
499 # PE_pval = apply(PE_df_data[2:length(colnames(PE_df_data))],1,function(x) wilcox.test(x[condition_ind-1], x[control_ind-1])$p.value);
500 }
501 }
502 PE_adj_pval = p.adjust(PE_pval, method = correction_method, n = length(PE_pval))
503
504
505 TE_colnames = colnames(TE_df_data);
506 control_sample = sampleinfo_df[which(sampleinfo_df$Group=="control"),1];
507 control_ind <<- sapply(control_sample, function(x){temp_ind = which(TE_colnames==x); as.numeric(temp_ind)});
508 condition_sample = sampleinfo_df[which(sampleinfo_df$Group=="case"),1];
509 condition_ind <<- sapply(condition_sample, function(x){temp_ind = which(TE_colnames==x); as.numeric(temp_ind)});
510
511 if(method=="mean"){
512 # TE_pval = apply(TE_df_data[2:length(colnames(TE_df_data))],1,function(x) t.test(x[condition_ind-1], x[control_ind-1])$p.value);
513 TE_pval = apply(TE_df_data[2:length(colnames(TE_df_data))],1,function(x) {obj<-try(t.test(x[condition_ind-1], x[control_ind-1]),silent=TRUE); if(is(obj, "try-error")){return(NA)}else{return(obj$p.value)}})
514 }else{
515 if(method=="median"){
516 TE_pval = apply(TE_df_data[2:length(colnames(TE_df_data))],1,function(x) {obj<-try(wilcox.test(x[condition_ind-1], x[control_ind-1]),silent=TRUE); if(is(obj, "try-error")){return(NA)}else{return(obj$p.value)}})
517 # TE_pval = apply(TE_df_data[2:length(colnames(TE_df_data))],1,function(x) wilcox.test(x[condition_ind-1], x[control_ind-1])$p.value);
518 }
519 }
520 TE_adj_pval = p.adjust(TE_pval, method = correction_method, n = length(TE_pval))
521
522
523 PE_TE_logfold_pval = data.frame(TE_df_logfold$Gene, TE_df_logfold$LogFold, TE_pval, TE_adj_pval, PE_df_logfold$LogFold, PE_pval, PE_adj_pval);
524 colnames(PE_TE_logfold_pval) = c("Gene", "Transcript log fold-change", "p-value (transcript)", "adj p-value (transcript)", "Protein log fold-change", "p-value (protein)", "adj p-value (protein)");
525 outdatafile = paste(outdir,"/PE_TE_logfold_pval.txt", sep="", collapse="");
526 write.table(PE_TE_logfold_pval, file=outdatafile, row.names=F, sep="\t", quote=F);
527 cat("<br><br><font size=5><b><a href='PE_TE_logfold_pval.txt' target='_blank'>Download the complete fold change data here</a></b></font><br>\n",
528 file = htmloutfile, append = TRUE);
529
530 if(length(condition_ind)!=1)
531 {
532 # Volcano Plot
533
534 if(volc_with=="adj_pval")
535 {
536 PE_pval = PE_adj_pval
537 TE_pval = TE_adj_pval
538 volc_ylab = "-log10 Adjusted p-value";
539 }else{
540 if(volc_with=="pval")
541 {
542 volc_ylab = "-log10 p-value";
543 }
544 }
545 outplot = paste(outdir,"/PE_volcano.png",sep="",collapse="");
546 png(outplot, width = 10, height = 10, units = 'in', res=300);
547 # bitmap(outplot, "png16m");
548 par(mfrow=c(1,1));
549
550 plot(PE_df_logfold$LogFold, -log10(PE_pval),
551 xlab="log2 fold change", ylab=volc_ylab,
552 type="n")
553 sel <- which((PE_df_logfold$LogFold<=log(2,base=2))&(PE_df_logfold$LogFold>=log(0.5, base=2))) # or whatever you want to use
554 points(PE_df_logfold[sel,"LogFold"], -log10(PE_pval[sel]),col="black")
555 #sel <- which((PE_df_logfold$LogFold>log(2,base=2))&(PE_df_logfold$LogFold<log(0.5,base=2))) # or whatever you want to use
556 sel <- which((PE_df_logfold$LogFold>log(2,base=2))|(PE_df_logfold$LogFold<log(0.5, base=2)))
557 sel1 <- which(PE_pval<=0.05)
558 sel=intersect(sel,sel1)
559 points(PE_df_logfold[sel,"LogFold"], -log10(PE_pval[sel]),col="red")
560 sel <- which((PE_df_logfold$LogFold>log(2,base=2))|(PE_df_logfold$LogFold<log(0.5, base=2)))
561 sel1 <- which(PE_pval>0.05)
562 sel=intersect(sel,sel1)
563 points(PE_df_logfold[sel,"LogFold"], -log10(PE_pval[sel]),col="blue")
564 abline(h = -log(0.05,base=10), col="red", lty=2)
565 abline(v = log(2,base=2), col="red", lty=2)
566 abline(v = log(0.5,base=2), col="red", lty=2)
567 dev.off();
568
569
570
571 outplot = paste(outdir,"/TE_volcano.png",sep="",collapse="");
572 png(outplot, width = 10, height = 10, units = 'in', res=300);
573 # bitmap(outplot, "png16m");
574 par(mfrow=c(1,1));
575
576 plot(TE_df_logfold$LogFold, -log10(TE_pval),
577 xlab="log2 fold change", ylab=volc_ylab,
578 type="n")
579 sel <- which((TE_df_logfold$LogFold<=log(2,base=2))&(TE_df_logfold$LogFold>=log(0.5, base=2))) # or whatever you want to use
580 points(TE_df_logfold[sel,"LogFold"], -log10(TE_pval[sel]),col="black")
581 #sel <- which((TE_df_logfold$LogFold>log(2,base=2))&(TE_df_logfold$LogFold<log(0.5,base=2))) # or whatever you want to use
582 sel <- which((TE_df_logfold$LogFold>log(2,base=2))|(TE_df_logfold$LogFold<log(0.5, base=2)))
583 sel1 <- which(TE_pval<=0.05)
584 sel=intersect(sel,sel1)
585 points(TE_df_logfold[sel,"LogFold"], -log10(TE_pval[sel]),col="red")
586 sel <- which((TE_df_logfold$LogFold>log(2,base=2))|(TE_df_logfold$LogFold<log(0.5, base=2)))
587 sel1 <- which(TE_pval>0.05)
588 sel=intersect(sel,sel1)
589 points(TE_df_logfold[sel,"LogFold"], -log10(TE_pval[sel]),col="blue")
590 abline(h = -log(0.05,base=10), col="red", lty=2)
591 abline(v = log(2,base=2), col="red", lty=2)
592 abline(v = log(0.5,base=2), col="red", lty=2)
593 dev.off();
594
595
596
597 cat('<br><table border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#7a0019"><th><font color=#ffcc33>Transcript Fold-Change</font></th><th><font color=#ffcc33>Protein Fold-Change</font></th></tr>\n', file = htmloutfile, append = TRUE);
598 cat("<tr><td align=center>", '<img src="TE_volcano.png" width=600 height=600></td>\n', file = htmloutfile, append = TRUE);
599 cat("<td align=center>",
600 '<img src="PE_volcano.png" width=600 height=600></td></tr></table><br>\n',
601 file = htmloutfile, append = TRUE);
602 }else{
603 cat('<br><br><b><font color=red>!!! No replicates found. Cannot perform test to check significance of differential expression. Thus, no Volcano plot generated !!!</font></b><br><br>',
604 file = htmloutfile, append = TRUE);
605 }
606
607 }
608
609
610 #***************************************************************************************************************************************
611 # Functions: End
612 #***************************************************************************************************************************************
613
614
615
616
617 #===============================================================================
618 # Arguments
619 #===============================================================================
620 noargs = 12;
621 args = commandArgs(trailingOnly = TRUE);
622 if(length(args) != noargs)
623 {
624 stop(paste("Please check usage. Number of arguments is not equal to ",noargs,sep="",collapse=""));
625 }
626
627 mode = args[1]; # "multiple" or "logfold"
628 method = args[2]; # "mean" or "median"
629 sampleinfo_file = args[3];
630 proteome_file = args[4];
631 transcriptome_file = args[5];
632 correction_method = args[6];
633 cookdist_upper_cutoff = args[7];
634 numCluster = args[8];
635 hm_nclust = args[9];
636 volc_with = args[10];
637
638 htmloutfile = args[11]; # html output file
639 outdir = args[12]; # html supporting files
640
641
642 #===============================================================================
643 # Check for file existance
644 #===============================================================================
645 if(! file.exists(proteome_file))
646 {
647 stop(paste("Proteome Data file does not exists. Path given: ",proteome_file,sep="",collapse=""));
648 }
649 if(! file.exists(transcriptome_file))
650 {
651 stop(paste("Transcriptome Data file does not exists. Path given: ",transcriptome_file,sep="",collapse=""));
652 }
653
654 #===============================================================================
655 # Load library
656 #===============================================================================
657 options(warn=-1);
658
659 suppressPackageStartupMessages(library(dplyr));
660 suppressPackageStartupMessages(library(data.table));
661 suppressPackageStartupMessages(library(gplots));
662 suppressPackageStartupMessages(library(ggplot2));
663 suppressPackageStartupMessages(library(ggfortify));
664
665 #===============================================================================
666 # Select mode and parse experiment design file
667 #===============================================================================
668 if(mode=="multiple")
669 {
670 expDesign = fread(sampleinfo_file, header = FALSE, stringsAsFactors = FALSE, sep="\t") %>% data.frame();
671 expDesign_cc = expDesign[1:2,];
672
673 sampleinfo_df = expDesign[3:nrow(expDesign),];
674 rownames(sampleinfo_df)=1:nrow(sampleinfo_df);
675 colnames(sampleinfo_df) = c("Sample","Group");
676
677 condition_cols = sampleinfo_df[which(sampleinfo_df[,2]==expDesign_cc[which(expDesign_cc[,1]=="case"),2]),1];
678 condition_g_name = "case";
679 control_cols = sampleinfo_df[which(sampleinfo_df[,2]==expDesign_cc[which(expDesign_cc[,1]=="control"),2]),1];
680 control_g_name = "control";
681 sampleinfo_df[which(sampleinfo_df[,2]==expDesign_cc[which(expDesign_cc[,1]=="case"),2]),2] = "case";
682 sampleinfo_df[which(sampleinfo_df[,2]==expDesign_cc[which(expDesign_cc[,1]=="control"),2]),2] = "control";
683 sampleinfo_df_orig = sampleinfo_df;
684 }
685
686 if(mode=="logfold")
687 {
688 sampleinfo_df = data.frame("Sample"= c("LogFold"), "Group"=c("Fold_Change"))
689 }
690
691 #===============================================================================
692 # Parse Transcriptome data
693 #===============================================================================
694 TE_df_orig = fread(transcriptome_file, sep="\t", stringsAsFactor=F, header=T) %>% data.frame();
695 if(mode=="multiple")
696 {
697 TE_df = TE_df_orig[,c(colnames(TE_df_orig)[1],condition_cols,control_cols)];
698 }
699 if(mode=="logfold")
700 {
701 TE_df = TE_df_orig;
702 colnames(TE_df) = c("Genes", "LogFold");
703 }
704 #===============================================================================
705 # Parse Proteome data
706 #===============================================================================
707 PE_df_orig = fread(proteome_file, sep="\t", stringsAsFactor=F, header=T) %>% data.frame();
708 if(mode=="multiple")
709 {
710 PE_df = PE_df_orig[,c(colnames(PE_df_orig)[1],condition_cols,control_cols)];
711 }
712 if(mode=="logfold")
713 {
714 PE_df = PE_df_orig;
715 colnames(PE_df) = c("Genes", "LogFold");
716 }
717
718 #=============================================================================================================
719 # Create directory structures and then set the working directory to output directory
720 #=============================================================================================================
721 if(! file.exists(outdir))
722 {
723 dir.create(outdir);
724 }
725 #===============================================================================
726 # Write initial data summary in html outfile
727 #===============================================================================
728 cat("<html><head></head><body>\n", file = htmloutfile);
729
730 cat("<h1><u>QuanTP: Association between abundance ratios of transcript and protein</u></h1><hr/>\n",
731 "<font><h3>Input data summary</h3></font>\n",
732 "<ul>\n",
733 "<li>Abbreviations used: PE (Proteome data) and TE (Transcriptome data)","</li><br>\n",
734 "<li>Input Proteome data dimension (Row Column): ", dim(PE_df)[1]," x ", dim(PE_df)[2],"</li>\n",
735 "<li>Input Transcriptome data dimension (Row Column): ", dim(TE_df)[1]," x ", dim(TE_df)[2],"</li></ul><hr/>\n",
736 file = htmloutfile, append = TRUE);
737
738 cat("<h3 id=table_of_content>Table of Contents:</h3>\n",
739 "<ul>\n",
740 "<li><a href=#sample_dist>Sample distribution</a></li>\n",
741 "<li><a href=#corr_data>Correlation</a></li>\n",
742 "<li><a href=#regression_data>Regression analysis</a></li>\n",
743 "<li><a href=#inf_obs>Influential observations</a></li>\n",
744 "<li><a href=#cluster_data>Cluster analysis</a></li></ul><hr/>\n",
745 file = htmloutfile, append = TRUE);
746 #===============================================================================
747 # Find common samples
748 #===============================================================================
749 common_samples = intersect(sampleinfo_df[,1], colnames(TE_df)[-1]) %>% intersect(., colnames(PE_df)[-1]);
750
751 if(length(common_samples)==0)
752 {
753 stop("No common samples found ");
754 cat("<b>Please check your experiment design file. Sample names (column names) in the Transcriptome and the Proteome data do not match. </b>\n",file = htmloutfile, append = TRUE);
755 }
756
757 #===============================================================================
758 # Create subsets based on common samples
759 #===============================================================================
760 TE_df = select(TE_df, 1, common_samples);
761 PE_df = select(PE_df, 1, common_samples);
762 sampleinfo_df = filter(sampleinfo_df, Sample %in% common_samples);
763 rownames(sampleinfo_df) = sampleinfo_df[,1];
764
765 #===============================================================================
766 # Check for number of rows similarity
767 #===============================================================================
768 if(nrow(TE_df) != nrow(PE_df))
769 {
770 stop("Number of rows in Transcriptome and Proteome data are not same i.e. they are not paired");
771 cat("<b>The correlation analysis expects paired TE and PE data i.e. (i)th gene/transcript of TE file should correspond to (i)th protein of PE file. In the current input provided there is mismatch in terms of number of rows of TE and PE file. Please make sure you provide paired data.</b>\n",file = htmloutfile, append = TRUE);
772 }
773
774 #===============================================================================
775 # Number of groups
776 #===============================================================================
777 ngrps = unique(sampleinfo_df[,2]) %>% length();
778 grps = unique(sampleinfo_df[,2]);
779 names(grps) = grps;
780
781 #===============================================================================
782 # Change column1 name
783 #===============================================================================
784 colnames(TE_df)[1] = "Gene";
785 colnames(PE_df)[1] = "Protein";
786
787 #===============================================================================
788 # Treat missing values
789 #===============================================================================
790 TE_nacount = sum(is.na(TE_df));
791 PE_nacount = sum(is.na(PE_df));
792
793 TE_df[is.na(TE_df)] = 0;
794 PE_df[is.na(PE_df)] = 0;
795
796
797 #===============================================================================
798 # Decide based on analysis mode
799 #===============================================================================
800 if(mode=="logfold")
801 {
802 cat('<h2 id="sample_dist"><font color=#ff0000>SAMPLE DISTRIBUTION</font></h2>\n',
803 file = htmloutfile, append = TRUE);
804
805 # TE Boxplot
806 outplot = paste(outdir,"/Box_TE.png",sep="",collape="");
807 cat('<table border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; ">\n',
808 '<tr bgcolor="#7a0019"><th><font color=#ffcc33>Boxplot: Transcriptome data</font></th><th><font color=#ffcc33>Boxplot: Proteome data</font></th></tr>\n',
809 "<tr><td align=center>", '<img src="Box_TE.png" width=500 height=500></td>\n', file = htmloutfile, append = TRUE);
810 multisample_boxplot(TE_df, sampleinfo_df, outplot, "Yes", "Samples", "Transcript Abundance data");
811
812 # PE Boxplot
813 outplot = paste(outdir,"/Box_PE.png",sep="",collape="");
814 cat("<td align=center>", '<img src="Box_PE.png" width=500 height=500></td></tr></table>\n', file = htmloutfile, append = TRUE);
815 multisample_boxplot(PE_df, sampleinfo_df, outplot, "Yes", "Samples", "Protein Abundance data");
816
817 cat('<hr/><h2 id="corr_data"><font color=#ff0000>CORRELATION</font></h2>\n',
818 file = htmloutfile, append = TRUE);
819
820 # TE PE scatter
821 outplot = paste(outdir,"/TE_PE_scatter.png",sep="",collape="");
822 cat('<table border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#7a0019"><th><font color=#ffcc33>Scatter plot between Proteome and Transcriptome Abundance</font></th></tr>\n', file = htmloutfile, append = TRUE);
823 cat("<tr><td align=center>", '<img src="TE_PE_scatter.png" width=800 height=800></td></tr>\n', file = htmloutfile, append = TRUE);
824 PE_TE_data = data.frame(PE_df, TE_df);
825 colnames(PE_TE_data) = c("PE_ID","PE_abundance","TE_ID","TE_abundance");
826 singlesample_scatter(PE_TE_data, outplot);
827
828 # TE PE Cor
829 cat("<tr><td align=center>", file = htmloutfile, append = TRUE);
830 singlesample_cor(PE_TE_data, htmloutfile, append=TRUE);
831 cat('<font color="red">*Note that <u>correlation</u> is <u>sensitive to outliers</u> in the data. So it is important to analyze outliers/influential observations in the data.<br> Below we use <u>Cook\'s distance based approach</u> to identify such influential observations.</font>\n',
832 file = htmloutfile, append = TRUE);
833 cat('</td></table>',
834 file = htmloutfile, append = TRUE);
835
836 cat('<hr/><h2 id="regression_data"><font color=#ff0000>REGRESSION ANALYSIS</font></h2>\n',
837 file = htmloutfile, append = TRUE);
838
839 # TE PE Regression
840 singlesample_regression(PE_TE_data,htmloutfile, append=TRUE);
841
842 cat('<hr/><h2 id="cluster_data"><font color=#ff0000>CLUSTER ANALYSIS</font></h2>\n',
843 file = htmloutfile, append = TRUE);
844
845 # TE PE Heatmap
846 singlesample_heatmap(PE_TE_data, htmloutfile, hm_nclust);
847
848
849 # TE PE Clustering (kmeans)
850 singlesample_kmeans(PE_TE_data, htmloutfile, nclust=as.numeric(numCluster))
851
852 }else{
853 if(mode=="multiple")
854 {
855 cat('<h2 id="sample_dist"><font color=#ff0000>SAMPLE DISTRIBUTION</font></h2>\n',
856 file = htmloutfile, append = TRUE);
857
858 # TE Boxplot
859 outplot = paste(outdir,"/Box_TE_all_rep.png",sep="",collape="");
860 cat('<table border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; ">\n',
861 '<tr bgcolor="#7a0019"><th><font color=#ffcc33>Boxplot: Transcriptome data</font></th><th><font color=#ffcc33>Boxplot: Proteome data</font></th></tr>\n',
862 "<tr><td align=center>", '<img src="Box_TE_all_rep.png" width=500 height=500></td>\n', file = htmloutfile, append = TRUE);
863 temp_df_te_data = data.frame(TE_df[,1], log(TE_df[,2:length(TE_df)]));
864 colnames(temp_df_te_data) = colnames(TE_df);
865 multisample_boxplot(temp_df_te_data, sampleinfo_df, outplot, "Yes", "Samples", "Transcript Abundance (log)");
866
867 # PE Boxplot
868 outplot = paste(outdir,"/Box_PE_all_rep.png",sep="",collape="");
869 cat("<td align=center>", '<img src="Box_PE_all_rep.png" width=500 height=500></td></tr></table>\n', file = htmloutfile, append = TRUE);
870 temp_df_pe_data = data.frame(PE_df[,1], log(PE_df[,2:length(PE_df)]));
871 colnames(temp_df_pe_data) = colnames(PE_df);
872 multisample_boxplot(temp_df_pe_data, sampleinfo_df, outplot, "Yes", "Samples", "Protein Abundance (log)");
873
874 # Calc TE PCA
875 outplot = paste(outdir,"/PCA_TE_all_rep.png",sep="",collape="");
876 multisample_PCA(TE_df, sampleinfo_df, outplot);
877
878 # Calc PE PCA
879 outplot = paste(outdir,"/PCA_PE_all_rep.png",sep="",collape="");
880 multisample_PCA(PE_df, sampleinfo_df, outplot);
881
882
883 # Replicate mode
884 templist = mergeReplicates(TE_df,PE_df, sampleinfo_df, method);
885 TE_df = templist$TE_df_merged;
886 PE_df = templist$PE_df_merged;
887 sampleinfo_df = templist$sampleinfo_df_merged;
888 rownames(sampleinfo_df) = sampleinfo_df[,1];
889
890 # TE Boxplot
891 outplot = paste(outdir,"/Box_TE_rep.png",sep="",collape="");
892 cat('<br><font color="#ff0000"><h3>Sample wise distribution (Box plot) after using ',method,' on replicates </h3></font><table border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#7a0019"><th><font color=#ffcc33>Boxplot: Transcriptome data</font></th><th><font color=#ffcc33>Boxplot: Proteome data</font></th></tr>\n',
893 "<tr><td align=center>", '<img src="Box_TE_rep.png" width=500 height=500></td>\n', file = htmloutfile, append = TRUE);
894 temp_df_te_data = data.frame(TE_df[,1], log(TE_df[,2:length(TE_df)]));
895 colnames(temp_df_te_data) = colnames(TE_df);
896 multisample_boxplot(temp_df_te_data, sampleinfo_df, outplot, "No", "Sample Groups", "Mean Transcript Abundance (log)");
897
898 # PE Boxplot
899 outplot = paste(outdir,"/Box_PE_rep.png",sep="",collape="");
900 cat("<td align=center>", '<img src="Box_PE_rep.png" width=500 height=500></td></tr></table>\n', file = htmloutfile, append = TRUE);
901 temp_df_pe_data = data.frame(PE_df[,1], log(PE_df[,2:length(PE_df)]));
902 colnames(temp_df_pe_data) = colnames(PE_df);
903 multisample_boxplot(temp_df_pe_data, sampleinfo_df, outplot, "No", "Sample Groups", "Mean Protein Abundance (log)");
904
905 #===============================================================================
906 # Calculating log fold change and running the "single" code part
907 #===============================================================================
908
909 TE_df = data.frame("Genes"=TE_df[,1], "LogFold"=apply(TE_df[,c(which(colnames(TE_df)==condition_g_name),which(colnames(TE_df)==control_g_name))],1,function(x) log(x[1]/x[2],base=2)));
910 PE_df = data.frame("Genes"=PE_df[,1], "LogFold"=apply(PE_df[,c(which(colnames(PE_df)==condition_g_name),which(colnames(PE_df)==control_g_name))],1,function(x) log(x[1]/x[2],base=2)));
911
912 #===============================================================================
913 # Treat missing values
914 #===============================================================================
915
916 TE_df[is.infinite(TE_df[,2]),2] = NA;
917 PE_df[is.infinite(PE_df[,2]),2] = NA;
918 TE_df[is.na(TE_df)] = 0;
919 PE_df[is.na(PE_df)] = 0;
920
921 sampleinfo_df = data.frame("Sample"= c("LogFold"), "Group"=c("Fold_Change"))
922 #===============================================================================
923 # Find common samples
924 #===============================================================================
925
926 common_samples = intersect(sampleinfo_df[,1], colnames(TE_df)[-1]) %>% intersect(., colnames(PE_df)[-1]);
927 TE_df = select(TE_df, 1, common_samples);
928 PE_df = select(PE_df, 1, common_samples);
929 sampleinfo_df = filter(sampleinfo_df, Sample %in% common_samples);
930 rownames(sampleinfo_df) = sampleinfo_df[,1];
931
932 # TE Boxplot
933 outplot = paste(outdir,"/Box_TE.png",sep="",collape="");
934 cat('<br><font color="#ff0000"><h3>Distribution (Box plot) of log fold change </h3></font>', file = htmloutfile, append = TRUE);
935 cat('<table border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#7a0019"><th><font color=#ffcc33>Boxplot: Transcriptome data</font></th><th><font color=#ffcc33>Boxplot: Proteome data</font></th></tr>\n',
936 "<tr><td align=center>", '<img src="Box_TE.png" width=500 height=500></td>\n', file = htmloutfile, append = TRUE);
937 multisample_boxplot(TE_df, sampleinfo_df, outplot, "Yes", "Sample (log2(case/control))", "Transcript Abundance fold-change (log2)");
938
939 # PE Boxplot
940 outplot = paste(outdir,"/Box_PE.png",sep="",collape="");
941 cat("<td align=center>", '<img src="Box_PE.png" width=500 height=500></td></tr></table>\n', file = htmloutfile, append = TRUE);
942 multisample_boxplot(PE_df, sampleinfo_df, outplot, "Yes", "Sample (log2(case/control))", "Protein Abundance fold-change(log2)");
943
944
945 # Log Fold Data
946 perform_Test_Volcano(TE_df_orig,PE_df_orig,TE_df, PE_df,sampleinfo_df_orig,method,correction_method,volc_with)
947
948
949
950 # Print PCA
951
952 cat('<br><br><table border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#7a0019"><th><font color=#ffcc33>PCA plot: Transcriptome data</font></th><th><font color=#ffcc33>PCA plot: Proteome data</font></th></tr>\n',
953 "<tr><td align=center>", '<img src="PCA_TE_all_rep.png" width=500 height=500></td>\n',
954 "<td align=center>", '<img src="PCA_PE_all_rep.png" width=500 height=500></td></tr></table>\n',
955 file = htmloutfile, append = TRUE);
956
957
958
959 cat('<hr/><h2 id="corr_data"><font color=#ff0000>CORRELATION</font></h2>\n',
960 file = htmloutfile, append = TRUE);
961
962 # TE PE scatter
963 outplot = paste(outdir,"/TE_PE_scatter.png",sep="",collape="");
964 cat('<br><table border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#7a0019"><th><font color=#ffcc33>Scatter plot between Proteome and Transcriptome Abundance</font></th></tr>\n', file = htmloutfile, append = TRUE);
965 cat("<tr><td align=center>", '<img src="TE_PE_scatter.png" width=800 height=800></td>\n', file = htmloutfile, append = TRUE);
966 PE_TE_data = data.frame(PE_df, TE_df);
967 colnames(PE_TE_data) = c("PE_ID","PE_abundance","TE_ID","TE_abundance");
968 singlesample_scatter(PE_TE_data, outplot);
969
970 # TE PE Cor
971 cat("<tr><td align=center>\n", file = htmloutfile, append = TRUE);
972 singlesample_cor(PE_TE_data, htmloutfile, append=TRUE);
973 cat('<font color="red">*Note that <u>correlation</u> is <u>sensitive to outliers</u> in the data. So it is important to analyze outliers/influential observations in the data.<br> Below we use <u>Cook\'s distance based approach</u> to identify such influential observations.</font>\n',
974 file = htmloutfile, append = TRUE);
975 cat('</td></table>',
976 file = htmloutfile, append = TRUE);
977
978 cat('<hr/><h2 id="regression_data"><font color=#ff0000>REGRESSION ANALYSIS</font></h2>\n',
979 file = htmloutfile, append = TRUE);
980
981 # TE PE Regression
982 singlesample_regression(PE_TE_data,htmloutfile, append=TRUE);
983
984 cat('<hr/><h2 id="cluster_data"><font color=#ff0000>CLUSTER ANALYSIS</font></h2>\n',
985 file = htmloutfile, append = TRUE);
986
987 #TE PE Heatmap
988 singlesample_heatmap(PE_TE_data, htmloutfile, hm_nclust);
989
990 #TE PE Clustering (kmeans)
991 singlesample_kmeans(PE_TE_data, htmloutfile, nclust=as.numeric(numCluster))
992
993 }
994 }
995 cat("<h3>Go To:</h3>\n",
996 "<ul>\n",
997 "<li><a href=#sample_dist>Sample distribution</a></li>\n",
998 "<li><a href=#corr_data>Correlation</a></li>\n",
999 "<li><a href=#regression_data>Regression analysis</a></li>\n",
1000 "<li><a href=#inf_obs>Influential observations</a></li>\n",
1001 "<li><a href=#cluster_data>Cluster analysis</a></li></ul>\n",
1002 "<br><a href=#>TOP</a>",
1003 file = htmloutfile, append = TRUE);
1004 cat("</body></html>\n", file = htmloutfile, append = TRUE);