Mercurial > repos > galaxyp > quantp
comparison quantp.r @ 0:75faf9a89f5b draft
planemo upload commit a0e968c7bd2b6f7b963baeecb08f3a39e50f52d6
author | galaxyp |
---|---|
date | Fri, 14 Sep 2018 12:22:31 -0400 |
parents | |
children | bcc7a4c4cc29 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:75faf9a89f5b |
---|---|
1 #*************************************************************************************************************************************** | |
2 # Functions: Start | |
3 #*************************************************************************************************************************************** | |
4 | |
5 #=============================================================================== | |
6 # PCA | |
7 #=============================================================================== | |
8 multisample_PCA = function(df, sampleinfo_df, outfile) | |
9 { | |
10 tempdf = df[,-1]; | |
11 tempcol = colnames(tempdf); | |
12 tempgrp = sampleinfo_df[tempcol,2]; | |
13 tempdf = t(tempdf) %>% as.data.frame(); | |
14 tempdf[is.na(tempdf)] = 0; | |
15 tempdf$Group = tempgrp; | |
16 png(outfile, width = 6, height = 6, units = 'in', res=300); | |
17 # bitmap(outfile, "png16m"); | |
18 g = autoplot(prcomp(select(tempdf, -Group)), data = tempdf, colour = 'Group', size=3); | |
19 plot(g); | |
20 dev.off(); | |
21 } | |
22 | |
23 #=============================================================================== | |
24 # Regression and Cook's distance | |
25 #=============================================================================== | |
26 singlesample_regression = function(PE_TE_data,htmloutfile, append=TRUE) | |
27 { | |
28 rownames(PE_TE_data) = PE_TE_data$PE_ID; | |
29 regmodel = lm(PE_abundance~TE_abundance, data=PE_TE_data); | |
30 regmodel_summary = summary(regmodel); | |
31 | |
32 cat("<font><h3>Linear Regression model fit between Proteome and Transcriptome data</h3></font>\n", | |
33 "<p>Assuming a linear relationship between Proteome and Transcriptome data, we here fit a linear regression model.</p>\n", | |
34 '<table border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#7a0019"><th><font color=#ffcc33>Parameter</font></th><th><font color=#ffcc33>Value</font></th></tr>\n', | |
35 file = htmloutfile, append = TRUE); | |
36 | |
37 cat("<tr><td>Formula</td><td>","PE_abundance~TE_abundance","</td></tr>\n", | |
38 "<tr><td colspan='2' align='center'> <b>Coefficients</b></td>","</tr>\n", | |
39 "<tr><td>",names(regmodel$coefficients[1]),"</td><td>",regmodel$coefficients[1]," (Pvalue:", regmodel_summary$coefficients[1,4],")","</td></tr>\n", | |
40 "<tr><td>",names(regmodel$coefficients[2]),"</td><td>",regmodel$coefficients[2]," (Pvalue:", regmodel_summary$coefficients[2,4],")","</td></tr>\n", | |
41 "<tr><td colspan='2' align='center'> <b>Model parameters</b></td>","</tr>\n", | |
42 "<tr><td>Residual standard error</td><td>",regmodel_summary$sigma," (",regmodel_summary$df[2]," degree of freedom)</td></tr>\n", | |
43 "<tr><td>F-statistic</td><td>",regmodel_summary$fstatistic[1]," ( on ",regmodel_summary$fstatistic[2]," and ",regmodel_summary$fstatistic[3]," degree of freedom)</td></tr>\n", | |
44 "<tr><td>R-squared</td><td>",regmodel_summary$r.squared,"</td></tr>\n", | |
45 "<tr><td>Adjusted R-squared</td><td>",regmodel_summary$adj.r.squared,"</td></tr>\n", | |
46 file = htmloutfile, append = TRUE); | |
47 | |
48 cat("</table>\n", file = htmloutfile, append = TRUE); | |
49 | |
50 cat( | |
51 "<font color='#ff0000'><h3>Regression and diagnostics plots</h3></font>\n", | |
52 file = htmloutfile, append = TRUE); | |
53 | |
54 outplot = paste(outdir,"/PE_TE_lm_1.png",sep="",collapse=""); | |
55 png(outplot, width = 10, height = 10, units = 'in',res=300); | |
56 # bitmap(outplot, "png16m"); | |
57 par(mfrow=c(1,1)); | |
58 plot(regmodel, 1, cex.lab=1.5); | |
59 dev.off(); | |
60 | |
61 outplot = paste(outdir,"/PE_TE_lm_2.png",sep="",collapse=""); | |
62 png(outplot,width = 10, height = 10, units = 'in', res=300); | |
63 # bitmap(outplot, "png16m"); | |
64 par(mfrow=c(1,1)); | |
65 plot(regmodel, 2, cex.lab=1.5); | |
66 dev.off(); | |
67 | |
68 outplot = paste(outdir,"/PE_TE_lm_5.png",sep="",collapse=""); | |
69 png(outplot, width = 10, height = 10, units = 'in',res=300); | |
70 # bitmap(outplot, "png16m"); | |
71 par(mfrow=c(1,1)); | |
72 plot(regmodel, 5, cex.lab=1.5); | |
73 dev.off(); | |
74 | |
75 cat('<table border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; ">', file = htmloutfile, append = TRUE); | |
76 | |
77 cat( | |
78 '<tr bgcolor="#7a0019"><th>', "<font color='#ffcc33'><h4>1) <u>Residuals vs Fitted plot</h4></font></u></th>\n", | |
79 '<th><font color=#ffcc33><h4>2) <u>Normal Q-Q plot of residuals</h4></font></u></th></tr>\n', | |
80 file = htmloutfile, append = TRUE); | |
81 | |
82 cat( | |
83 '<tr><td align=center><img src="PE_TE_lm_1.png" width=600 height=600></td><td align=center><img src="PE_TE_lm_2.png" width=600 height=600></td></tr>\n', | |
84 file = htmloutfile, append = TRUE); | |
85 | |
86 cat( | |
87 '<tr><td align=center>This plot checks for linear relationship assumptions.<br>If a horizontal line is observed without any distinct patterns, it indicates a linear relationship.</td>\n', | |
88 '<td align=center>This plot checks whether residuals are normally distributed or not.<br>It is good if the residuals points follow the straight dashed line i.e., do not deviate much from dashed line.</td></tr></table>\n', | |
89 file = htmloutfile, append = TRUE); | |
90 | |
91 | |
92 #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
93 # Residuals data | |
94 #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
95 res_all = regmodel$residuals; | |
96 res_mean = mean(res_all); | |
97 res_sd = sd(res_all); | |
98 res_diff = (res_all-res_mean); | |
99 res_zscore = res_diff/res_sd; | |
100 # res_outliers = res_all[which((res_zscore > 2)|(res_zscore < -2))] | |
101 | |
102 | |
103 tempind = which((res_zscore > 2)|(res_zscore < -2)); | |
104 res_PE_TE_data_no_outlier = PE_TE_data[-tempind,]; | |
105 res_PE_TE_data_no_outlier$residuals = res_all[-tempind]; | |
106 res_PE_TE_data_outlier = PE_TE_data[tempind,]; | |
107 res_PE_TE_data_outlier$residuals = res_all[tempind]; | |
108 | |
109 # Save the complete table for download (influential_observations) | |
110 temp_outlier_data = data.frame(res_PE_TE_data_outlier$PE_ID, res_PE_TE_data_outlier$TE_abundance, res_PE_TE_data_outlier$PE_abundance, res_PE_TE_data_outlier$residuals) | |
111 colnames(temp_outlier_data) = c("Gene", "Transcript abundance", "Protein abundance", "Residual value") | |
112 outdatafile = paste(outdir,"/PE_TE_outliers_residuals.txt", sep="", collapse=""); | |
113 write.table(temp_outlier_data, file=outdatafile, row.names=F, sep="\t", quote=F); | |
114 | |
115 | |
116 # Save the complete table for download (non influential_observations) | |
117 temp_all_data = data.frame(PE_TE_data$PE_ID, PE_TE_data$TE_abundance, PE_TE_data$PE_abundance, res_all) | |
118 colnames(temp_all_data) = c("Gene", "Transcript abundance", "Protein abundance", "Residual value") | |
119 outdatafile = paste(outdir,"/PE_TE_abundance_residuals.txt", sep="", collapse=""); | |
120 write.table(temp_all_data, file=outdatafile, row.names=F, sep="\t", quote=F); | |
121 | |
122 | |
123 cat('<br><h2 id="inf_obs"><font color=#ff0000>Outliers based on the residuals from regression analysis</font></h2>\n', | |
124 file = htmloutfile, append = TRUE); | |
125 cat('<table border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; ">\n', | |
126 '<tr bgcolor="#7a0019"><th colspan=2><font color=#ffcc33>Residuals from Regression</font></th></tr>\n', | |
127 '<tr bgcolor="#7a0019"><th><font color=#ffcc33>Parameter</font></th><th><font color=#ffcc33>Value</font></th></tr>\n', | |
128 file = htmloutfile, append = TRUE); | |
129 | |
130 cat("<tr><td>Mean Residual value</td><td>",res_mean,"</td></tr>\n", | |
131 "<tr><td>Standard deviation (Residuals)</td><td>",res_sd,"</td></tr>\n", | |
132 '<tr><td>Total outliers (Residual value > 2 standard deviation from the mean)</td><td>',length(tempind),' <font size=4>(<b><a href=PE_TE_outliers_residuals.txt target="_blank">Download these ',length(tempind),' data points with high residual values here</a></b>)</font></td>\n', | |
133 '<tr><td colspan=2 align=center><font size=4>(<b><a href=PE_TE_abundance_residuals.txt target="_blank">Download the complete residuals data here</a></b>)</font></td></td>\n', | |
134 "</table><br><br>\n", | |
135 file = htmloutfile, append = TRUE); | |
136 | |
137 #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
138 | |
139 | |
140 cat('<br><br><table border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; ">', file = htmloutfile, append = TRUE); | |
141 | |
142 cat( | |
143 '<tr bgcolor="#7a0019"><th><font color=#ffcc33><h4>3) <u>Residuals vs Leverage plot</h4></font></u></th></tr>\n', | |
144 file = htmloutfile, append = TRUE); | |
145 | |
146 cat( | |
147 '<tr><td align=center><img src="PE_TE_lm_5.png" width=600 height=600></td></tr>\n', | |
148 file = htmloutfile, append = TRUE); | |
149 | |
150 cat( | |
151 '<tr><td align=center>This plot is useful to identify any influential cases, that is outliers or extreme values.<br>They might influence the regression results upon inclusion or exclusion from the analysis.</td></tr></table><br>\n', | |
152 file = htmloutfile, append = TRUE); | |
153 | |
154 | |
155 | |
156 #^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
157 # Cook's Distance | |
158 #^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
159 cat('<hr/><h2 id="inf_obs"><font color=#ff0000>INFLUENTIAL OBSERVATIONS</font></h2>\n', | |
160 file = htmloutfile, append = TRUE); | |
161 cat( | |
162 '<p><b>Cook\'s distance</b> computes the influence of each data point/observation on the predicted outcome. i.e. this measures how much the observation is influencing the fitted values.<br>In general use, those observations that have a <b>Cook\'s distance > than ', cookdist_upper_cutoff,' times the mean</b> may be classified as <b>influential.</b></p>\n', | |
163 file = htmloutfile, append = TRUE); | |
164 | |
165 cooksd <- cooks.distance(regmodel); | |
166 | |
167 outplot = paste(outdir,"/PE_TE_lm_cooksd.png",sep="",collapse=""); | |
168 png(outplot, width = 10, height = 10, units = 'in', res=300); | |
169 # bitmap(outplot, "png16m"); | |
170 par(mfrow=c(1,1)); | |
171 plot(cooksd, main="Influential Obs. by Cook\'s distance", ylab="Cook\'s distance", xlab="Observations", type="n") # plot cooks distance | |
172 sel_outlier=which(cooksd>=as.numeric(cookdist_upper_cutoff)*mean(cooksd, na.rm=T)) | |
173 sel_nonoutlier=which(cooksd<as.numeric(cookdist_upper_cutoff)*mean(cooksd, na.rm=T)) | |
174 points(sel_outlier, cooksd[sel_outlier],pch="*", cex=2, cex.lab=1.5, col="red") | |
175 points(sel_nonoutlier, cooksd[sel_nonoutlier],pch="*", cex=2, cex.lab=1.5, col="black") | |
176 abline(h = as.numeric(cookdist_upper_cutoff)*mean(cooksd, na.rm=T), col="red") # add cutoff line | |
177 #text(x=1:length(cooksd)+1, y=cooksd, labels=ifelse(cooksd>as.numeric(cookdist_upper_cutoff)*mean(cooksd, na.rm=T),names(cooksd),""), col="red", pos=2) # add labels | |
178 dev.off(); | |
179 | |
180 cat( | |
181 '<img src="PE_TE_lm_cooksd.png" width=800 height=800>', | |
182 '<br>In the above plot, observations above red line (',cookdist_upper_cutoff,' * mean Cook\'s distance) are influential. Genes that are outliers could be important. These observations influences the correlation values and regression coefficients<br><br>', | |
183 file = htmloutfile, append = TRUE); | |
184 | |
185 tempind = which(cooksd>as.numeric(cookdist_upper_cutoff)*mean(cooksd, na.rm=T)); | |
186 PE_TE_data_no_outlier = PE_TE_data[-tempind,]; | |
187 PE_TE_data_no_outlier$cooksd = cooksd[-tempind]; | |
188 PE_TE_data_outlier = PE_TE_data[tempind,]; | |
189 PE_TE_data_outlier$cooksd = cooksd[tempind]; | |
190 a = sort(PE_TE_data_outlier$cooksd, decreasing=T, index.return=T); | |
191 PE_TE_data_outlier_sorted = PE_TE_data_outlier[a$ix,]; | |
192 | |
193 cat( | |
194 '<table border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#7a0019"><th><font color=#ffcc33>Parameter</font></th><th><font color=#ffcc33>Value</font></th></tr>\n', | |
195 file = htmloutfile, append = TRUE); | |
196 | |
197 # Save the complete table for download (influential_observations) | |
198 temp_outlier_data = data.frame(PE_TE_data_outlier$PE_ID, PE_TE_data_outlier$TE_abundance, PE_TE_data_outlier$PE_abundance, PE_TE_data_outlier$cooksd) | |
199 colnames(temp_outlier_data) = c("Gene", "Transcript abundance", "Protein abundance", "Cook's distance") | |
200 outdatafile = paste(outdir,"/PE_TE_influential_observation.txt", sep="", collapse=""); | |
201 write.table(temp_outlier_data, file=outdatafile, row.names=F, sep="\t", quote=F); | |
202 | |
203 | |
204 # Save the complete table for download (non influential_observations) | |
205 temp_no_outlier_data = data.frame(PE_TE_data_no_outlier$PE_ID, PE_TE_data_no_outlier$TE_abundance, PE_TE_data_no_outlier$PE_abundance, PE_TE_data_no_outlier$cooksd) | |
206 colnames(temp_no_outlier_data) = c("Gene", "Transcript abundance", "Protein abundance", "Cook's distance") | |
207 outdatafile = paste(outdir,"/PE_TE_non_influential_observation.txt", sep="", collapse=""); | |
208 write.table(temp_no_outlier_data, file=outdatafile, row.names=F, sep="\t", quote=F); | |
209 | |
210 | |
211 cat("<tr><td>Mean Cook\'s distance</td><td>",mean(cooksd, na.rm=T),"</td></tr>\n", | |
212 "<tr><td>Total influential observations (Cook\'s distance > ",cookdist_upper_cutoff," * mean Cook\'s distance)</td><td>",length(tempind),"</td>\n", | |
213 | |
214 "<tr><td>Observations with Cook\'s distance < ",cookdist_upper_cutoff," * mean Cook\'s distance</td><td>",length(which(cooksd<as.numeric(cookdist_upper_cutoff)*mean(cooksd, na.rm=T))),"</td>\n", | |
215 "</table><br><br>\n", | |
216 file = htmloutfile, append = TRUE); | |
217 | |
218 | |
219 #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
220 # Scatter plot after removal of influential points | |
221 #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
222 outplot = paste(outdir,"/AbundancePlot_scatter_without_outliers.png",sep="",collapse=""); | |
223 min_lim = min(c(PE_TE_data$PE_abundance,PE_TE_data$TE_abundance)); | |
224 max_lim = max(c(PE_TE_data$PE_abundance,PE_TE_data$TE_abundance)); | |
225 png(outplot, width = 10, height = 10, units = 'in', res=300); | |
226 # bitmap(outplot,"png16m"); | |
227 g = ggplot(PE_TE_data_no_outlier, aes(x=TE_abundance, y=PE_abundance))+geom_point() + geom_smooth() + xlab("Transcript abundance log fold-change") + ylab("Protein abundance log fold-change") + xlim(min_lim,max_lim) + ylim(min_lim,max_lim); | |
228 suppressMessages(plot(g)); | |
229 dev.off(); | |
230 | |
231 cat('<table border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#7a0019"><th><font color=#ffcc33>Scatterplot: Before removal</font></th><th><font color=#ffcc33>Scatterplot: After removal</font></th></tr>\n', file = htmloutfile, append = TRUE); | |
232 # Before | |
233 cat("<tr><td align=center><!--<font color='#ff0000'><h3>Scatter plot between Proteome and Transcriptome Abundance</h3></font>\n-->", '<img src="TE_PE_scatter.png" width=600 height=600></td>\n', file = htmloutfile, append = TRUE); | |
234 | |
235 # After | |
236 cat("<td align=center>\n", | |
237 '<img src="AbundancePlot_scatter_without_outliers.png" width=600 height=600></td></tr>\n', | |
238 file = htmloutfile, append = TRUE); | |
239 #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
240 | |
241 | |
242 cor_result_pearson = cor.test(PE_TE_data_no_outlier[,"TE_abundance"], PE_TE_data_no_outlier[,"PE_abundance"], method = "pearson"); | |
243 cor_result_spearman = cor.test(PE_TE_data_no_outlier[,"TE_abundance"], PE_TE_data_no_outlier[,"PE_abundance"], method = "spearman"); | |
244 cor_result_kendall = cor.test(PE_TE_data_no_outlier[,"TE_abundance"], PE_TE_data_no_outlier[,"PE_abundance"], method = "kendall"); | |
245 | |
246 cat('<tr><td>\n', file = htmloutfile, append=TRUE); | |
247 singlesample_cor(PE_TE_data, htmloutfile, append=TRUE); | |
248 cat('</td>\n', file = htmloutfile, append=TRUE); | |
249 | |
250 | |
251 cat('<td><table border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#7a0019"><th><font color=#ffcc33>Parameter</font></th><th><font color=#ffcc33>Method 1</font></th><th><font color=#ffcc33>Method 2</font></th><th><font color=#ffcc33>Method 3</font></th></tr>\n', | |
252 file = htmloutfile, append = TRUE); | |
253 | |
254 cat( | |
255 "<tr><td>Correlation method</td><td>",cor_result_pearson$method,"</td><td>",cor_result_spearman$method,"</td><td>",cor_result_kendall$method,"</td></tr>\n", | |
256 "<tr><td>Correlation coefficient</td><td>",cor_result_pearson$estimate,"</td><td>",cor_result_spearman$estimate,"</td><td>",cor_result_kendall$estimate,"</td></tr>\n", | |
257 file = htmloutfile, append = TRUE) | |
258 cat("</table></td></tr></table>\n", file = htmloutfile, append = TRUE) | |
259 | |
260 | |
261 | |
262 if(dim(PE_TE_data_outlier)[1]<10) | |
263 { | |
264 tab_n_row = dim(PE_TE_data_outlier)[1]; | |
265 }else{ | |
266 tab_n_row = 10; | |
267 } | |
268 | |
269 cat("<br><br><font size=5><b><a href='PE_TE_influential_observation.txt' target='_blank'>Download the complete list of influential observations</a></b></font> ", | |
270 "<font size=5><b><a href='PE_TE_non_influential_observation.txt' target='_blank'>Download the complete list (After removing influential points)</a></b></font><br>\n", | |
271 '<br><font color="brown"><h4>Top ',as.character(tab_n_row),' Influential observations (Cook\'s distance > ',cookdist_upper_cutoff,' * mean Cook\'s distance)</h4></font>\n', | |
272 file = htmloutfile, append = TRUE); | |
273 | |
274 cat('<table border=1 cellspacing=0 cellpadding=5> <tr bgcolor="#7a0019">\n', sep = "",file = htmloutfile, append = TRUE); | |
275 cat("<th><font color=#ffcc33>Gene</font></th><th><font color=#ffcc33>Protein Log Fold-Change</font></th><th><font color=#ffcc33>Transcript Log Fold-Change</font></th><th><font color=#ffcc33>Cook's Distance</font></th></tr>\n", | |
276 file = htmloutfile, append = TRUE); | |
277 | |
278 | |
279 for(i in 1:tab_n_row) | |
280 { | |
281 cat( | |
282 '<tr>','<td>',as.character(PE_TE_data_outlier_sorted[i,1]),'</td>\n', | |
283 '<td>',format(PE_TE_data_outlier_sorted[i,2], scientific=F),'</td>\n', | |
284 '<td>',PE_TE_data_outlier_sorted[i,4],'</td>\n', | |
285 '<td>',format(PE_TE_data_outlier_sorted[i,5], scientific=F),'</td></tr>\n', | |
286 file = htmloutfile, append = TRUE); | |
287 } | |
288 cat('</table><br><br>\n',file = htmloutfile, append = TRUE); | |
289 | |
290 | |
291 } | |
292 | |
293 | |
294 | |
295 #=============================================================================== | |
296 # Heatmap | |
297 #=============================================================================== | |
298 singlesample_heatmap=function(PE_TE_data, htmloutfile, hm_nclust){ | |
299 cat('<br><table border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#7a0019"><th><font color=#ffcc33>Heatmap of PE and TE abundance values (Hierarchical clustering)</font></th><th><font color=#ffcc33>Number of clusters to extract: ',hm_nclust,'</font></th></tr>\n', | |
300 file = htmloutfile, append = TRUE); | |
301 | |
302 hc=hclust(dist(as.matrix(PE_TE_data[,c("PE_abundance","TE_abundance")]))) | |
303 hm_cluster = cutree(hc,k=hm_nclust); | |
304 | |
305 outplot = paste(outdir,"/PE_TE_heatmap.png",sep="",collapse=""); | |
306 png(outplot, width = 10, height = 10, units = 'in', res=300); | |
307 # bitmap(outplot, "png16m"); | |
308 par(mfrow=c(1,1)); | |
309 hmap = heatmap.2(as.matrix(PE_TE_data[,c("PE_abundance","TE_abundance")]), trace="none", cexCol=1, col=greenred(100),Colv=F, labCol=c("Proteins","Transcripts"), scale="col", hclustfun = hclust, distfun = dist); | |
310 dev.off(); | |
311 | |
312 cat('<tr><td align=center colspan="2"><img src="PE_TE_heatmap.png" width=800 height=800></td></tr>\n', | |
313 file = htmloutfile, append = TRUE); | |
314 | |
315 | |
316 temp_PE_TE_data = data.frame(PE_TE_data$PE_ID, PE_TE_data$TE_abundance, PE_TE_data$PE_abundance, hm_cluster); | |
317 colnames(temp_PE_TE_data) = c("Gene", "Transcript abundance", "Protein abundance", "Cluster (Hierarchical clustering)") | |
318 tempoutfile = paste(outdir,"/PE_TE_hc_clusterpoints.txt",sep="",collapse=""); | |
319 write.table(temp_PE_TE_data, file=tempoutfile, row.names=F, quote=F, sep="\t", eol="\n") | |
320 | |
321 | |
322 cat('<tr><td colspan="2" align=center><font size=5><a href="PE_TE_hc_clusterpoints.txt" target="_blank"><b>Download the hierarchical cluster list</b></a></font></td></tr></table>\n', | |
323 file = htmloutfile, append = TRUE); | |
324 } | |
325 | |
326 | |
327 #=============================================================================== | |
328 # K-means clustering | |
329 #=============================================================================== | |
330 singlesample_kmeans=function(PE_TE_data, htmloutfile, nclust){ | |
331 PE_TE_data_kdata = PE_TE_data; | |
332 k1 = kmeans(PE_TE_data_kdata[,c("PE_abundance","TE_abundance")], nclust); | |
333 outplot = paste(outdir,"/PE_TE_kmeans.png",sep="",collapse=""); | |
334 png(outplot, width = 10, height = 10, units = 'in', res=300); | |
335 # bitmap(outplot, "png16m"); | |
336 par(mfrow=c(1,1)); | |
337 scatter.smooth(PE_TE_data_kdata[,"TE_abundance"], PE_TE_data_kdata[,"PE_abundance"], xlab="Transcript Abundance", ylab="Protein Abundance", cex.lab=1.5); | |
338 legend(1, 95, legend=c("Cluster 1", "Line 2"), col="red", lty=1:1, cex=0.8) | |
339 legend(1, 95, legend="Cluster 2", col="green", lty=1:1, cex=0.8) | |
340 | |
341 | |
342 ind=which(k1$cluster==1); | |
343 points(PE_TE_data_kdata[ind,"TE_abundance"], PE_TE_data_kdata[ind,"PE_abundance"], col="red", pch=16); | |
344 ind=which(k1$cluster==2); | |
345 points(PE_TE_data_kdata[ind,"TE_abundance"], PE_TE_data_kdata[ind,"PE_abundance"], col="green", pch=16); | |
346 ind=which(k1$cluster==3); | |
347 points(PE_TE_data_kdata[ind,"TE_abundance"], PE_TE_data_kdata[ind,"PE_abundance"], col="blue", pch=16); | |
348 ind=which(k1$cluster==4); | |
349 points(PE_TE_data_kdata[ind,"TE_abundance"], PE_TE_data_kdata[ind,"PE_abundance"], col="cyan", pch=16); | |
350 ind=which(k1$cluster==5); | |
351 points(PE_TE_data_kdata[ind,"TE_abundance"], PE_TE_data_kdata[ind,"PE_abundance"], col="black", pch=16); | |
352 ind=which(k1$cluster==6); | |
353 points(PE_TE_data_kdata[ind,"TE_abundance"], PE_TE_data_kdata[ind,"PE_abundance"], col="brown", pch=16); | |
354 ind=which(k1$cluster==7); | |
355 points(PE_TE_data_kdata[ind,"TE_abundance"], PE_TE_data_kdata[ind,"PE_abundance"], col="gold", pch=16); | |
356 ind=which(k1$cluster==8); | |
357 points(PE_TE_data_kdata[ind,"TE_abundance"], PE_TE_data_kdata[ind,"PE_abundance"], col="thistle", pch=16); | |
358 ind=which(k1$cluster==9); | |
359 points(PE_TE_data_kdata[ind,"TE_abundance"], PE_TE_data_kdata[ind,"PE_abundance"], col="yellow", pch=16); | |
360 ind=which(k1$cluster==10); | |
361 points(PE_TE_data_kdata[ind,"TE_abundance"], PE_TE_data_kdata[ind,"PE_abundance"], col="orange", pch=16); | |
362 dev.off(); | |
363 | |
364 cat('<br><br><table border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#7a0019"><th><font color=#ffcc33>K-mean clustering</font></th><th><font color=#ffcc33>Number of clusters: ',nclust,'</font></th></tr>\n', | |
365 file = htmloutfile, append = TRUE); | |
366 | |
367 tempind = order(k1$cluster); | |
368 tempoutfile = paste(outdir,"/PE_TE_kmeans_clusterpoints.txt",sep="",collapse=""); | |
369 write.table(data.frame(PE_TE_data_kdata[tempind, ], Cluster=k1$cluster[tempind]), file=tempoutfile, row.names=F, quote=F, sep="\t", eol="\n") | |
370 | |
371 | |
372 cat('<tr><td colspan="2" align=center><img src="PE_TE_kmeans.png" width=800 height=800></td></tr>\n', | |
373 file = htmloutfile, append = TRUE); | |
374 cat('<tr><td colspan="2" align=center><font size=5><a href="PE_TE_kmeans_clusterpoints.txt" target="_blank"><b>Download the cluster list</b></a></font></td></tr></table><br><hr/>\n', | |
375 file = htmloutfile, append = TRUE); | |
376 | |
377 } | |
378 | |
379 #=============================================================================== | |
380 # scatter plot | |
381 #=============================================================================== | |
382 singlesample_scatter = function(PE_TE_data, outfile) | |
383 { | |
384 min_lim = min(c(PE_TE_data$PE_abundance,PE_TE_data$TE_abundance)); | |
385 max_lim = max(c(PE_TE_data$PE_abundance,PE_TE_data$TE_abundance)); | |
386 png(outfile, width = 10, height = 10, units = 'in', res=300); | |
387 # bitmap(outfile, "png16m"); | |
388 g = ggplot(PE_TE_data, aes(x=TE_abundance, y=PE_abundance))+geom_point() + geom_smooth() + xlab("Transcript abundance log fold-change") + ylab("Protein abundance log fold-change") + xlim(min_lim,max_lim) + ylim(min_lim,max_lim); | |
389 suppressMessages(plot(g)); | |
390 # plot(g); | |
391 dev.off(); | |
392 } | |
393 | |
394 #=============================================================================== | |
395 # Correlation table | |
396 #=============================================================================== | |
397 singlesample_cor = function(PE_TE_data, htmloutfile, append=TRUE) | |
398 { | |
399 cor_result_pearson = cor.test(PE_TE_data$TE_abundance, PE_TE_data$PE_abundance, method = "pearson"); | |
400 cor_result_spearman = cor.test(PE_TE_data$TE_abundance, PE_TE_data$PE_abundance, method = "spearman"); | |
401 cor_result_kendall = cor.test(PE_TE_data$TE_abundance, PE_TE_data$PE_abundance, method = "kendall"); | |
402 | |
403 cat( | |
404 '<table border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#7a0019"><th><font color=#ffcc33>Parameter</font></th><th><font color=#ffcc33>Method 1</font></th><th><font color=#ffcc33>Method 2</font></th><th><font color=#ffcc33>Method 3</font></th></tr>\n', | |
405 file = htmloutfile, append = TRUE); | |
406 | |
407 cat( | |
408 "<tr><td>Correlation method</td><td>",cor_result_pearson$method,"</td><td>",cor_result_spearman$method,"</td><td>",cor_result_kendall$method,"</td></tr>\n", | |
409 "<tr><td>Correlation coefficient</td><td>",cor_result_pearson$estimate,"</td><td>",cor_result_spearman$estimate,"</td><td>",cor_result_kendall$estimate,"</td></tr>\n", | |
410 file = htmloutfile, append = TRUE) | |
411 cat("</table>\n", file = htmloutfile, append = TRUE); | |
412 | |
413 } | |
414 | |
415 #=============================================================================== | |
416 # Boxplot | |
417 #=============================================================================== | |
418 multisample_boxplot = function(df, sampleinfo_df, outfile, fill_leg, user_xlab, user_ylab) | |
419 { | |
420 tempdf = df[,-1, drop=FALSE]; | |
421 tempdf = t(tempdf) %>% as.data.frame(); | |
422 tempdf[is.na(tempdf)] = 0; | |
423 tempdf$Sample = rownames(tempdf); | |
424 tempdf1 = melt(tempdf, id.vars = "Sample"); | |
425 tempdf1$Group = sampleinfo_df[tempdf1$Sample,2]; | |
426 png(outplot, width = 6, height = 6, units = 'in', res=300); | |
427 # bitmap(outplot, "png16m"); | |
428 if(fill_leg=="Yes") | |
429 { | |
430 g = ggplot(tempdf1, aes(x=Sample, y=value, fill=Group)) + geom_boxplot() + labs(x=user_xlab) + labs(y=user_ylab) | |
431 }else{ | |
432 if(fill_leg=="No") | |
433 { | |
434 tempdf1$Group = c("case", "control") | |
435 g = ggplot(tempdf1, aes(x=Sample, y=value, fill=Group)) + geom_boxplot() + labs(x=user_xlab) + labs(y=user_ylab) | |
436 } | |
437 } | |
438 plot(g); | |
439 dev.off(); | |
440 } | |
441 | |
442 | |
443 #=============================================================================== | |
444 # Mean or Median of Replicates | |
445 #=============================================================================== | |
446 | |
447 mergeReplicates = function(TE_df,PE_df, sampleinfo_df, method) | |
448 { | |
449 grps = unique(sampleinfo_df[,2]); | |
450 | |
451 TE_df_merged <<- sapply(grps, function(x){ | |
452 tempsample = sampleinfo_df[which(sampleinfo_df$Group==x),1] | |
453 if(length(tempsample)!=1){ | |
454 apply(TE_df[,tempsample],1,method); | |
455 }else{ | |
456 return(TE_df[,tempsample]); | |
457 } | |
458 }); | |
459 TE_df_merged <<- data.frame(as.character(TE_df[,1]), TE_df_merged); | |
460 colnames(TE_df_merged) = c(colnames(TE_df)[1], grps); | |
461 | |
462 PE_df_merged <<- sapply(grps, function(x){ | |
463 tempsample = sampleinfo_df[which(sampleinfo_df$Group==x),1] | |
464 if(length(tempsample)!=1){ | |
465 apply(PE_df[,tempsample],1,method); | |
466 }else{ | |
467 return(PE_df[,tempsample]); | |
468 } | |
469 }); | |
470 | |
471 PE_df_merged <<- data.frame(as.character(PE_df[,1]), PE_df_merged); | |
472 colnames(PE_df_merged) = c(colnames(PE_df)[1], grps); | |
473 | |
474 #sampleinfo_df_merged = data.frame(Sample = grps, Group = grps, stringsAsFactors = F); | |
475 sampleinfo_df_merged = data.frame(Sample = grps, Group = "Group", stringsAsFactors = F); | |
476 | |
477 return(list(TE_df_merged = TE_df_merged, PE_df_merged = PE_df_merged, sampleinfo_df_merged = sampleinfo_df_merged)); | |
478 } | |
479 | |
480 #=============================================================================== | |
481 # (T-Test or Wilcoxon ranksum test) and Volcano Plot | |
482 #=============================================================================== | |
483 | |
484 perform_Test_Volcano = function(TE_df_data,PE_df_data,TE_df_logfold, PE_df_logfold,sampleinfo_df, method, correction_method,volc_with) | |
485 { | |
486 | |
487 PE_colnames = colnames(PE_df_data); | |
488 control_sample = sampleinfo_df[which(sampleinfo_df$Group=="control"),1]; | |
489 control_ind <<- sapply(control_sample, function(x){temp_ind = which(PE_colnames==x); as.numeric(temp_ind)}); | |
490 condition_sample = sampleinfo_df[which(sampleinfo_df$Group=="case"),1]; | |
491 condition_ind <<- sapply(condition_sample, function(x){temp_ind = which(PE_colnames==x); as.numeric(temp_ind)}); | |
492 | |
493 if(method=="mean"){ | |
494 #PE_pval = apply(PE_df_data[2:length(colnames(PE_df_data))],1,function(x) t.test(x[condition_ind-1], x[control_ind-1])$p.value); | |
495 PE_pval = apply(PE_df_data[2:length(colnames(PE_df_data))],1,function(x) {obj<-try(t.test(x[condition_ind-1], x[control_ind-1]),silent=TRUE); if(is(obj, "try-error")){return(NA)}else{return(obj$p.value)}}) | |
496 }else{ | |
497 if(method=="median"){ | |
498 PE_pval = apply(PE_df_data[2:length(colnames(PE_df_data))],1,function(x) {obj<-try(wilcox.test(x[condition_ind-1], x[control_ind-1]),silent=TRUE); if(is(obj, "try-error")){return(NA)}else{return(obj$p.value)}}) | |
499 # PE_pval = apply(PE_df_data[2:length(colnames(PE_df_data))],1,function(x) wilcox.test(x[condition_ind-1], x[control_ind-1])$p.value); | |
500 } | |
501 } | |
502 PE_adj_pval = p.adjust(PE_pval, method = correction_method, n = length(PE_pval)) | |
503 | |
504 | |
505 TE_colnames = colnames(TE_df_data); | |
506 control_sample = sampleinfo_df[which(sampleinfo_df$Group=="control"),1]; | |
507 control_ind <<- sapply(control_sample, function(x){temp_ind = which(TE_colnames==x); as.numeric(temp_ind)}); | |
508 condition_sample = sampleinfo_df[which(sampleinfo_df$Group=="case"),1]; | |
509 condition_ind <<- sapply(condition_sample, function(x){temp_ind = which(TE_colnames==x); as.numeric(temp_ind)}); | |
510 | |
511 if(method=="mean"){ | |
512 # TE_pval = apply(TE_df_data[2:length(colnames(TE_df_data))],1,function(x) t.test(x[condition_ind-1], x[control_ind-1])$p.value); | |
513 TE_pval = apply(TE_df_data[2:length(colnames(TE_df_data))],1,function(x) {obj<-try(t.test(x[condition_ind-1], x[control_ind-1]),silent=TRUE); if(is(obj, "try-error")){return(NA)}else{return(obj$p.value)}}) | |
514 }else{ | |
515 if(method=="median"){ | |
516 TE_pval = apply(TE_df_data[2:length(colnames(TE_df_data))],1,function(x) {obj<-try(wilcox.test(x[condition_ind-1], x[control_ind-1]),silent=TRUE); if(is(obj, "try-error")){return(NA)}else{return(obj$p.value)}}) | |
517 # TE_pval = apply(TE_df_data[2:length(colnames(TE_df_data))],1,function(x) wilcox.test(x[condition_ind-1], x[control_ind-1])$p.value); | |
518 } | |
519 } | |
520 TE_adj_pval = p.adjust(TE_pval, method = correction_method, n = length(TE_pval)) | |
521 | |
522 | |
523 PE_TE_logfold_pval = data.frame(TE_df_logfold$Gene, TE_df_logfold$LogFold, TE_pval, TE_adj_pval, PE_df_logfold$LogFold, PE_pval, PE_adj_pval); | |
524 colnames(PE_TE_logfold_pval) = c("Gene", "Transcript log fold-change", "p-value (transcript)", "adj p-value (transcript)", "Protein log fold-change", "p-value (protein)", "adj p-value (protein)"); | |
525 outdatafile = paste(outdir,"/PE_TE_logfold_pval.txt", sep="", collapse=""); | |
526 write.table(PE_TE_logfold_pval, file=outdatafile, row.names=F, sep="\t", quote=F); | |
527 cat("<br><br><font size=5><b><a href='PE_TE_logfold_pval.txt' target='_blank'>Download the complete fold change data here</a></b></font><br>\n", | |
528 file = htmloutfile, append = TRUE); | |
529 | |
530 if(length(condition_ind)!=1) | |
531 { | |
532 # Volcano Plot | |
533 | |
534 if(volc_with=="adj_pval") | |
535 { | |
536 PE_pval = PE_adj_pval | |
537 TE_pval = TE_adj_pval | |
538 volc_ylab = "-log10 Adjusted p-value"; | |
539 }else{ | |
540 if(volc_with=="pval") | |
541 { | |
542 volc_ylab = "-log10 p-value"; | |
543 } | |
544 } | |
545 outplot = paste(outdir,"/PE_volcano.png",sep="",collapse=""); | |
546 png(outplot, width = 10, height = 10, units = 'in', res=300); | |
547 # bitmap(outplot, "png16m"); | |
548 par(mfrow=c(1,1)); | |
549 | |
550 plot(PE_df_logfold$LogFold, -log10(PE_pval), | |
551 xlab="log2 fold change", ylab=volc_ylab, | |
552 type="n") | |
553 sel <- which((PE_df_logfold$LogFold<=log(2,base=2))&(PE_df_logfold$LogFold>=log(0.5, base=2))) # or whatever you want to use | |
554 points(PE_df_logfold[sel,"LogFold"], -log10(PE_pval[sel]),col="black") | |
555 #sel <- which((PE_df_logfold$LogFold>log(2,base=2))&(PE_df_logfold$LogFold<log(0.5,base=2))) # or whatever you want to use | |
556 sel <- which((PE_df_logfold$LogFold>log(2,base=2))|(PE_df_logfold$LogFold<log(0.5, base=2))) | |
557 sel1 <- which(PE_pval<=0.05) | |
558 sel=intersect(sel,sel1) | |
559 points(PE_df_logfold[sel,"LogFold"], -log10(PE_pval[sel]),col="red") | |
560 sel <- which((PE_df_logfold$LogFold>log(2,base=2))|(PE_df_logfold$LogFold<log(0.5, base=2))) | |
561 sel1 <- which(PE_pval>0.05) | |
562 sel=intersect(sel,sel1) | |
563 points(PE_df_logfold[sel,"LogFold"], -log10(PE_pval[sel]),col="blue") | |
564 abline(h = -log(0.05,base=10), col="red", lty=2) | |
565 abline(v = log(2,base=2), col="red", lty=2) | |
566 abline(v = log(0.5,base=2), col="red", lty=2) | |
567 dev.off(); | |
568 | |
569 | |
570 | |
571 outplot = paste(outdir,"/TE_volcano.png",sep="",collapse=""); | |
572 png(outplot, width = 10, height = 10, units = 'in', res=300); | |
573 # bitmap(outplot, "png16m"); | |
574 par(mfrow=c(1,1)); | |
575 | |
576 plot(TE_df_logfold$LogFold, -log10(TE_pval), | |
577 xlab="log2 fold change", ylab=volc_ylab, | |
578 type="n") | |
579 sel <- which((TE_df_logfold$LogFold<=log(2,base=2))&(TE_df_logfold$LogFold>=log(0.5, base=2))) # or whatever you want to use | |
580 points(TE_df_logfold[sel,"LogFold"], -log10(TE_pval[sel]),col="black") | |
581 #sel <- which((TE_df_logfold$LogFold>log(2,base=2))&(TE_df_logfold$LogFold<log(0.5,base=2))) # or whatever you want to use | |
582 sel <- which((TE_df_logfold$LogFold>log(2,base=2))|(TE_df_logfold$LogFold<log(0.5, base=2))) | |
583 sel1 <- which(TE_pval<=0.05) | |
584 sel=intersect(sel,sel1) | |
585 points(TE_df_logfold[sel,"LogFold"], -log10(TE_pval[sel]),col="red") | |
586 sel <- which((TE_df_logfold$LogFold>log(2,base=2))|(TE_df_logfold$LogFold<log(0.5, base=2))) | |
587 sel1 <- which(TE_pval>0.05) | |
588 sel=intersect(sel,sel1) | |
589 points(TE_df_logfold[sel,"LogFold"], -log10(TE_pval[sel]),col="blue") | |
590 abline(h = -log(0.05,base=10), col="red", lty=2) | |
591 abline(v = log(2,base=2), col="red", lty=2) | |
592 abline(v = log(0.5,base=2), col="red", lty=2) | |
593 dev.off(); | |
594 | |
595 | |
596 | |
597 cat('<br><table border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#7a0019"><th><font color=#ffcc33>Transcript Fold-Change</font></th><th><font color=#ffcc33>Protein Fold-Change</font></th></tr>\n', file = htmloutfile, append = TRUE); | |
598 cat("<tr><td align=center>", '<img src="TE_volcano.png" width=600 height=600></td>\n', file = htmloutfile, append = TRUE); | |
599 cat("<td align=center>", | |
600 '<img src="PE_volcano.png" width=600 height=600></td></tr></table><br>\n', | |
601 file = htmloutfile, append = TRUE); | |
602 }else{ | |
603 cat('<br><br><b><font color=red>!!! No replicates found. Cannot perform test to check significance of differential expression. Thus, no Volcano plot generated !!!</font></b><br><br>', | |
604 file = htmloutfile, append = TRUE); | |
605 } | |
606 | |
607 } | |
608 | |
609 | |
610 #*************************************************************************************************************************************** | |
611 # Functions: End | |
612 #*************************************************************************************************************************************** | |
613 | |
614 | |
615 | |
616 | |
617 #=============================================================================== | |
618 # Arguments | |
619 #=============================================================================== | |
620 noargs = 12; | |
621 args = commandArgs(trailingOnly = TRUE); | |
622 if(length(args) != noargs) | |
623 { | |
624 stop(paste("Please check usage. Number of arguments is not equal to ",noargs,sep="",collapse="")); | |
625 } | |
626 | |
627 mode = args[1]; # "multiple" or "logfold" | |
628 method = args[2]; # "mean" or "median" | |
629 sampleinfo_file = args[3]; | |
630 proteome_file = args[4]; | |
631 transcriptome_file = args[5]; | |
632 correction_method = args[6]; | |
633 cookdist_upper_cutoff = args[7]; | |
634 numCluster = args[8]; | |
635 hm_nclust = args[9]; | |
636 volc_with = args[10]; | |
637 | |
638 htmloutfile = args[11]; # html output file | |
639 outdir = args[12]; # html supporting files | |
640 | |
641 | |
642 #=============================================================================== | |
643 # Check for file existance | |
644 #=============================================================================== | |
645 if(! file.exists(proteome_file)) | |
646 { | |
647 stop(paste("Proteome Data file does not exists. Path given: ",proteome_file,sep="",collapse="")); | |
648 } | |
649 if(! file.exists(transcriptome_file)) | |
650 { | |
651 stop(paste("Transcriptome Data file does not exists. Path given: ",transcriptome_file,sep="",collapse="")); | |
652 } | |
653 | |
654 #=============================================================================== | |
655 # Load library | |
656 #=============================================================================== | |
657 options(warn=-1); | |
658 | |
659 suppressPackageStartupMessages(library(dplyr)); | |
660 suppressPackageStartupMessages(library(data.table)); | |
661 suppressPackageStartupMessages(library(gplots)); | |
662 suppressPackageStartupMessages(library(ggplot2)); | |
663 suppressPackageStartupMessages(library(ggfortify)); | |
664 | |
665 #=============================================================================== | |
666 # Select mode and parse experiment design file | |
667 #=============================================================================== | |
668 if(mode=="multiple") | |
669 { | |
670 expDesign = fread(sampleinfo_file, header = FALSE, stringsAsFactors = FALSE, sep="\t") %>% data.frame(); | |
671 expDesign_cc = expDesign[1:2,]; | |
672 | |
673 sampleinfo_df = expDesign[3:nrow(expDesign),]; | |
674 rownames(sampleinfo_df)=1:nrow(sampleinfo_df); | |
675 colnames(sampleinfo_df) = c("Sample","Group"); | |
676 | |
677 condition_cols = sampleinfo_df[which(sampleinfo_df[,2]==expDesign_cc[which(expDesign_cc[,1]=="case"),2]),1]; | |
678 condition_g_name = "case"; | |
679 control_cols = sampleinfo_df[which(sampleinfo_df[,2]==expDesign_cc[which(expDesign_cc[,1]=="control"),2]),1]; | |
680 control_g_name = "control"; | |
681 sampleinfo_df[which(sampleinfo_df[,2]==expDesign_cc[which(expDesign_cc[,1]=="case"),2]),2] = "case"; | |
682 sampleinfo_df[which(sampleinfo_df[,2]==expDesign_cc[which(expDesign_cc[,1]=="control"),2]),2] = "control"; | |
683 sampleinfo_df_orig = sampleinfo_df; | |
684 } | |
685 | |
686 if(mode=="logfold") | |
687 { | |
688 sampleinfo_df = data.frame("Sample"= c("LogFold"), "Group"=c("Fold_Change")) | |
689 } | |
690 | |
691 #=============================================================================== | |
692 # Parse Transcriptome data | |
693 #=============================================================================== | |
694 TE_df_orig = fread(transcriptome_file, sep="\t", stringsAsFactor=F, header=T) %>% data.frame(); | |
695 if(mode=="multiple") | |
696 { | |
697 TE_df = TE_df_orig[,c(colnames(TE_df_orig)[1],condition_cols,control_cols)]; | |
698 } | |
699 if(mode=="logfold") | |
700 { | |
701 TE_df = TE_df_orig; | |
702 colnames(TE_df) = c("Genes", "LogFold"); | |
703 } | |
704 #=============================================================================== | |
705 # Parse Proteome data | |
706 #=============================================================================== | |
707 PE_df_orig = fread(proteome_file, sep="\t", stringsAsFactor=F, header=T) %>% data.frame(); | |
708 if(mode=="multiple") | |
709 { | |
710 PE_df = PE_df_orig[,c(colnames(PE_df_orig)[1],condition_cols,control_cols)]; | |
711 } | |
712 if(mode=="logfold") | |
713 { | |
714 PE_df = PE_df_orig; | |
715 colnames(PE_df) = c("Genes", "LogFold"); | |
716 } | |
717 | |
718 #============================================================================================================= | |
719 # Create directory structures and then set the working directory to output directory | |
720 #============================================================================================================= | |
721 if(! file.exists(outdir)) | |
722 { | |
723 dir.create(outdir); | |
724 } | |
725 #=============================================================================== | |
726 # Write initial data summary in html outfile | |
727 #=============================================================================== | |
728 cat("<html><head></head><body>\n", file = htmloutfile); | |
729 | |
730 cat("<h1><u>QuanTP: Association between abundance ratios of transcript and protein</u></h1><hr/>\n", | |
731 "<font><h3>Input data summary</h3></font>\n", | |
732 "<ul>\n", | |
733 "<li>Abbreviations used: PE (Proteome data) and TE (Transcriptome data)","</li><br>\n", | |
734 "<li>Input Proteome data dimension (Row Column): ", dim(PE_df)[1]," x ", dim(PE_df)[2],"</li>\n", | |
735 "<li>Input Transcriptome data dimension (Row Column): ", dim(TE_df)[1]," x ", dim(TE_df)[2],"</li></ul><hr/>\n", | |
736 file = htmloutfile, append = TRUE); | |
737 | |
738 cat("<h3 id=table_of_content>Table of Contents:</h3>\n", | |
739 "<ul>\n", | |
740 "<li><a href=#sample_dist>Sample distribution</a></li>\n", | |
741 "<li><a href=#corr_data>Correlation</a></li>\n", | |
742 "<li><a href=#regression_data>Regression analysis</a></li>\n", | |
743 "<li><a href=#inf_obs>Influential observations</a></li>\n", | |
744 "<li><a href=#cluster_data>Cluster analysis</a></li></ul><hr/>\n", | |
745 file = htmloutfile, append = TRUE); | |
746 #=============================================================================== | |
747 # Find common samples | |
748 #=============================================================================== | |
749 common_samples = intersect(sampleinfo_df[,1], colnames(TE_df)[-1]) %>% intersect(., colnames(PE_df)[-1]); | |
750 | |
751 if(length(common_samples)==0) | |
752 { | |
753 stop("No common samples found "); | |
754 cat("<b>Please check your experiment design file. Sample names (column names) in the Transcriptome and the Proteome data do not match. </b>\n",file = htmloutfile, append = TRUE); | |
755 } | |
756 | |
757 #=============================================================================== | |
758 # Create subsets based on common samples | |
759 #=============================================================================== | |
760 TE_df = select(TE_df, 1, common_samples); | |
761 PE_df = select(PE_df, 1, common_samples); | |
762 sampleinfo_df = filter(sampleinfo_df, Sample %in% common_samples); | |
763 rownames(sampleinfo_df) = sampleinfo_df[,1]; | |
764 | |
765 #=============================================================================== | |
766 # Check for number of rows similarity | |
767 #=============================================================================== | |
768 if(nrow(TE_df) != nrow(PE_df)) | |
769 { | |
770 stop("Number of rows in Transcriptome and Proteome data are not same i.e. they are not paired"); | |
771 cat("<b>The correlation analysis expects paired TE and PE data i.e. (i)th gene/transcript of TE file should correspond to (i)th protein of PE file. In the current input provided there is mismatch in terms of number of rows of TE and PE file. Please make sure you provide paired data.</b>\n",file = htmloutfile, append = TRUE); | |
772 } | |
773 | |
774 #=============================================================================== | |
775 # Number of groups | |
776 #=============================================================================== | |
777 ngrps = unique(sampleinfo_df[,2]) %>% length(); | |
778 grps = unique(sampleinfo_df[,2]); | |
779 names(grps) = grps; | |
780 | |
781 #=============================================================================== | |
782 # Change column1 name | |
783 #=============================================================================== | |
784 colnames(TE_df)[1] = "Gene"; | |
785 colnames(PE_df)[1] = "Protein"; | |
786 | |
787 #=============================================================================== | |
788 # Treat missing values | |
789 #=============================================================================== | |
790 TE_nacount = sum(is.na(TE_df)); | |
791 PE_nacount = sum(is.na(PE_df)); | |
792 | |
793 TE_df[is.na(TE_df)] = 0; | |
794 PE_df[is.na(PE_df)] = 0; | |
795 | |
796 | |
797 #=============================================================================== | |
798 # Decide based on analysis mode | |
799 #=============================================================================== | |
800 if(mode=="logfold") | |
801 { | |
802 cat('<h2 id="sample_dist"><font color=#ff0000>SAMPLE DISTRIBUTION</font></h2>\n', | |
803 file = htmloutfile, append = TRUE); | |
804 | |
805 # TE Boxplot | |
806 outplot = paste(outdir,"/Box_TE.png",sep="",collape=""); | |
807 cat('<table border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; ">\n', | |
808 '<tr bgcolor="#7a0019"><th><font color=#ffcc33>Boxplot: Transcriptome data</font></th><th><font color=#ffcc33>Boxplot: Proteome data</font></th></tr>\n', | |
809 "<tr><td align=center>", '<img src="Box_TE.png" width=500 height=500></td>\n', file = htmloutfile, append = TRUE); | |
810 multisample_boxplot(TE_df, sampleinfo_df, outplot, "Yes", "Samples", "Transcript Abundance data"); | |
811 | |
812 # PE Boxplot | |
813 outplot = paste(outdir,"/Box_PE.png",sep="",collape=""); | |
814 cat("<td align=center>", '<img src="Box_PE.png" width=500 height=500></td></tr></table>\n', file = htmloutfile, append = TRUE); | |
815 multisample_boxplot(PE_df, sampleinfo_df, outplot, "Yes", "Samples", "Protein Abundance data"); | |
816 | |
817 cat('<hr/><h2 id="corr_data"><font color=#ff0000>CORRELATION</font></h2>\n', | |
818 file = htmloutfile, append = TRUE); | |
819 | |
820 # TE PE scatter | |
821 outplot = paste(outdir,"/TE_PE_scatter.png",sep="",collape=""); | |
822 cat('<table border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#7a0019"><th><font color=#ffcc33>Scatter plot between Proteome and Transcriptome Abundance</font></th></tr>\n', file = htmloutfile, append = TRUE); | |
823 cat("<tr><td align=center>", '<img src="TE_PE_scatter.png" width=800 height=800></td></tr>\n', file = htmloutfile, append = TRUE); | |
824 PE_TE_data = data.frame(PE_df, TE_df); | |
825 colnames(PE_TE_data) = c("PE_ID","PE_abundance","TE_ID","TE_abundance"); | |
826 singlesample_scatter(PE_TE_data, outplot); | |
827 | |
828 # TE PE Cor | |
829 cat("<tr><td align=center>", file = htmloutfile, append = TRUE); | |
830 singlesample_cor(PE_TE_data, htmloutfile, append=TRUE); | |
831 cat('<font color="red">*Note that <u>correlation</u> is <u>sensitive to outliers</u> in the data. So it is important to analyze outliers/influential observations in the data.<br> Below we use <u>Cook\'s distance based approach</u> to identify such influential observations.</font>\n', | |
832 file = htmloutfile, append = TRUE); | |
833 cat('</td></table>', | |
834 file = htmloutfile, append = TRUE); | |
835 | |
836 cat('<hr/><h2 id="regression_data"><font color=#ff0000>REGRESSION ANALYSIS</font></h2>\n', | |
837 file = htmloutfile, append = TRUE); | |
838 | |
839 # TE PE Regression | |
840 singlesample_regression(PE_TE_data,htmloutfile, append=TRUE); | |
841 | |
842 cat('<hr/><h2 id="cluster_data"><font color=#ff0000>CLUSTER ANALYSIS</font></h2>\n', | |
843 file = htmloutfile, append = TRUE); | |
844 | |
845 # TE PE Heatmap | |
846 singlesample_heatmap(PE_TE_data, htmloutfile, hm_nclust); | |
847 | |
848 | |
849 # TE PE Clustering (kmeans) | |
850 singlesample_kmeans(PE_TE_data, htmloutfile, nclust=as.numeric(numCluster)) | |
851 | |
852 }else{ | |
853 if(mode=="multiple") | |
854 { | |
855 cat('<h2 id="sample_dist"><font color=#ff0000>SAMPLE DISTRIBUTION</font></h2>\n', | |
856 file = htmloutfile, append = TRUE); | |
857 | |
858 # TE Boxplot | |
859 outplot = paste(outdir,"/Box_TE_all_rep.png",sep="",collape=""); | |
860 cat('<table border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; ">\n', | |
861 '<tr bgcolor="#7a0019"><th><font color=#ffcc33>Boxplot: Transcriptome data</font></th><th><font color=#ffcc33>Boxplot: Proteome data</font></th></tr>\n', | |
862 "<tr><td align=center>", '<img src="Box_TE_all_rep.png" width=500 height=500></td>\n', file = htmloutfile, append = TRUE); | |
863 temp_df_te_data = data.frame(TE_df[,1], log(TE_df[,2:length(TE_df)])); | |
864 colnames(temp_df_te_data) = colnames(TE_df); | |
865 multisample_boxplot(temp_df_te_data, sampleinfo_df, outplot, "Yes", "Samples", "Transcript Abundance (log)"); | |
866 | |
867 # PE Boxplot | |
868 outplot = paste(outdir,"/Box_PE_all_rep.png",sep="",collape=""); | |
869 cat("<td align=center>", '<img src="Box_PE_all_rep.png" width=500 height=500></td></tr></table>\n', file = htmloutfile, append = TRUE); | |
870 temp_df_pe_data = data.frame(PE_df[,1], log(PE_df[,2:length(PE_df)])); | |
871 colnames(temp_df_pe_data) = colnames(PE_df); | |
872 multisample_boxplot(temp_df_pe_data, sampleinfo_df, outplot, "Yes", "Samples", "Protein Abundance (log)"); | |
873 | |
874 # Calc TE PCA | |
875 outplot = paste(outdir,"/PCA_TE_all_rep.png",sep="",collape=""); | |
876 multisample_PCA(TE_df, sampleinfo_df, outplot); | |
877 | |
878 # Calc PE PCA | |
879 outplot = paste(outdir,"/PCA_PE_all_rep.png",sep="",collape=""); | |
880 multisample_PCA(PE_df, sampleinfo_df, outplot); | |
881 | |
882 | |
883 # Replicate mode | |
884 templist = mergeReplicates(TE_df,PE_df, sampleinfo_df, method); | |
885 TE_df = templist$TE_df_merged; | |
886 PE_df = templist$PE_df_merged; | |
887 sampleinfo_df = templist$sampleinfo_df_merged; | |
888 rownames(sampleinfo_df) = sampleinfo_df[,1]; | |
889 | |
890 # TE Boxplot | |
891 outplot = paste(outdir,"/Box_TE_rep.png",sep="",collape=""); | |
892 cat('<br><font color="#ff0000"><h3>Sample wise distribution (Box plot) after using ',method,' on replicates </h3></font><table border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#7a0019"><th><font color=#ffcc33>Boxplot: Transcriptome data</font></th><th><font color=#ffcc33>Boxplot: Proteome data</font></th></tr>\n', | |
893 "<tr><td align=center>", '<img src="Box_TE_rep.png" width=500 height=500></td>\n', file = htmloutfile, append = TRUE); | |
894 temp_df_te_data = data.frame(TE_df[,1], log(TE_df[,2:length(TE_df)])); | |
895 colnames(temp_df_te_data) = colnames(TE_df); | |
896 multisample_boxplot(temp_df_te_data, sampleinfo_df, outplot, "No", "Sample Groups", "Mean Transcript Abundance (log)"); | |
897 | |
898 # PE Boxplot | |
899 outplot = paste(outdir,"/Box_PE_rep.png",sep="",collape=""); | |
900 cat("<td align=center>", '<img src="Box_PE_rep.png" width=500 height=500></td></tr></table>\n', file = htmloutfile, append = TRUE); | |
901 temp_df_pe_data = data.frame(PE_df[,1], log(PE_df[,2:length(PE_df)])); | |
902 colnames(temp_df_pe_data) = colnames(PE_df); | |
903 multisample_boxplot(temp_df_pe_data, sampleinfo_df, outplot, "No", "Sample Groups", "Mean Protein Abundance (log)"); | |
904 | |
905 #=============================================================================== | |
906 # Calculating log fold change and running the "single" code part | |
907 #=============================================================================== | |
908 | |
909 TE_df = data.frame("Genes"=TE_df[,1], "LogFold"=apply(TE_df[,c(which(colnames(TE_df)==condition_g_name),which(colnames(TE_df)==control_g_name))],1,function(x) log(x[1]/x[2],base=2))); | |
910 PE_df = data.frame("Genes"=PE_df[,1], "LogFold"=apply(PE_df[,c(which(colnames(PE_df)==condition_g_name),which(colnames(PE_df)==control_g_name))],1,function(x) log(x[1]/x[2],base=2))); | |
911 | |
912 #=============================================================================== | |
913 # Treat missing values | |
914 #=============================================================================== | |
915 | |
916 TE_df[is.infinite(TE_df[,2]),2] = NA; | |
917 PE_df[is.infinite(PE_df[,2]),2] = NA; | |
918 TE_df[is.na(TE_df)] = 0; | |
919 PE_df[is.na(PE_df)] = 0; | |
920 | |
921 sampleinfo_df = data.frame("Sample"= c("LogFold"), "Group"=c("Fold_Change")) | |
922 #=============================================================================== | |
923 # Find common samples | |
924 #=============================================================================== | |
925 | |
926 common_samples = intersect(sampleinfo_df[,1], colnames(TE_df)[-1]) %>% intersect(., colnames(PE_df)[-1]); | |
927 TE_df = select(TE_df, 1, common_samples); | |
928 PE_df = select(PE_df, 1, common_samples); | |
929 sampleinfo_df = filter(sampleinfo_df, Sample %in% common_samples); | |
930 rownames(sampleinfo_df) = sampleinfo_df[,1]; | |
931 | |
932 # TE Boxplot | |
933 outplot = paste(outdir,"/Box_TE.png",sep="",collape=""); | |
934 cat('<br><font color="#ff0000"><h3>Distribution (Box plot) of log fold change </h3></font>', file = htmloutfile, append = TRUE); | |
935 cat('<table border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#7a0019"><th><font color=#ffcc33>Boxplot: Transcriptome data</font></th><th><font color=#ffcc33>Boxplot: Proteome data</font></th></tr>\n', | |
936 "<tr><td align=center>", '<img src="Box_TE.png" width=500 height=500></td>\n', file = htmloutfile, append = TRUE); | |
937 multisample_boxplot(TE_df, sampleinfo_df, outplot, "Yes", "Sample (log2(case/control))", "Transcript Abundance fold-change (log2)"); | |
938 | |
939 # PE Boxplot | |
940 outplot = paste(outdir,"/Box_PE.png",sep="",collape=""); | |
941 cat("<td align=center>", '<img src="Box_PE.png" width=500 height=500></td></tr></table>\n', file = htmloutfile, append = TRUE); | |
942 multisample_boxplot(PE_df, sampleinfo_df, outplot, "Yes", "Sample (log2(case/control))", "Protein Abundance fold-change(log2)"); | |
943 | |
944 | |
945 # Log Fold Data | |
946 perform_Test_Volcano(TE_df_orig,PE_df_orig,TE_df, PE_df,sampleinfo_df_orig,method,correction_method,volc_with) | |
947 | |
948 | |
949 | |
950 # Print PCA | |
951 | |
952 cat('<br><br><table border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#7a0019"><th><font color=#ffcc33>PCA plot: Transcriptome data</font></th><th><font color=#ffcc33>PCA plot: Proteome data</font></th></tr>\n', | |
953 "<tr><td align=center>", '<img src="PCA_TE_all_rep.png" width=500 height=500></td>\n', | |
954 "<td align=center>", '<img src="PCA_PE_all_rep.png" width=500 height=500></td></tr></table>\n', | |
955 file = htmloutfile, append = TRUE); | |
956 | |
957 | |
958 | |
959 cat('<hr/><h2 id="corr_data"><font color=#ff0000>CORRELATION</font></h2>\n', | |
960 file = htmloutfile, append = TRUE); | |
961 | |
962 # TE PE scatter | |
963 outplot = paste(outdir,"/TE_PE_scatter.png",sep="",collape=""); | |
964 cat('<br><table border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#7a0019"><th><font color=#ffcc33>Scatter plot between Proteome and Transcriptome Abundance</font></th></tr>\n', file = htmloutfile, append = TRUE); | |
965 cat("<tr><td align=center>", '<img src="TE_PE_scatter.png" width=800 height=800></td>\n', file = htmloutfile, append = TRUE); | |
966 PE_TE_data = data.frame(PE_df, TE_df); | |
967 colnames(PE_TE_data) = c("PE_ID","PE_abundance","TE_ID","TE_abundance"); | |
968 singlesample_scatter(PE_TE_data, outplot); | |
969 | |
970 # TE PE Cor | |
971 cat("<tr><td align=center>\n", file = htmloutfile, append = TRUE); | |
972 singlesample_cor(PE_TE_data, htmloutfile, append=TRUE); | |
973 cat('<font color="red">*Note that <u>correlation</u> is <u>sensitive to outliers</u> in the data. So it is important to analyze outliers/influential observations in the data.<br> Below we use <u>Cook\'s distance based approach</u> to identify such influential observations.</font>\n', | |
974 file = htmloutfile, append = TRUE); | |
975 cat('</td></table>', | |
976 file = htmloutfile, append = TRUE); | |
977 | |
978 cat('<hr/><h2 id="regression_data"><font color=#ff0000>REGRESSION ANALYSIS</font></h2>\n', | |
979 file = htmloutfile, append = TRUE); | |
980 | |
981 # TE PE Regression | |
982 singlesample_regression(PE_TE_data,htmloutfile, append=TRUE); | |
983 | |
984 cat('<hr/><h2 id="cluster_data"><font color=#ff0000>CLUSTER ANALYSIS</font></h2>\n', | |
985 file = htmloutfile, append = TRUE); | |
986 | |
987 #TE PE Heatmap | |
988 singlesample_heatmap(PE_TE_data, htmloutfile, hm_nclust); | |
989 | |
990 #TE PE Clustering (kmeans) | |
991 singlesample_kmeans(PE_TE_data, htmloutfile, nclust=as.numeric(numCluster)) | |
992 | |
993 } | |
994 } | |
995 cat("<h3>Go To:</h3>\n", | |
996 "<ul>\n", | |
997 "<li><a href=#sample_dist>Sample distribution</a></li>\n", | |
998 "<li><a href=#corr_data>Correlation</a></li>\n", | |
999 "<li><a href=#regression_data>Regression analysis</a></li>\n", | |
1000 "<li><a href=#inf_obs>Influential observations</a></li>\n", | |
1001 "<li><a href=#cluster_data>Cluster analysis</a></li></ul>\n", | |
1002 "<br><a href=#>TOP</a>", | |
1003 file = htmloutfile, append = TRUE); | |
1004 cat("</body></html>\n", file = htmloutfile, append = TRUE); |