# HG changeset patch # User rsajulga # Date 1545336766 18000 # Node ID 0072d7fe861a0217bcec2571c7f44d8a5a61d811 planemo upload diff -r 000000000000 -r 0072d7fe861a quantp.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/quantp.r Thu Dec 20 15:12:46 2018 -0500 @@ -0,0 +1,1232 @@ +#*************************************************************************************************************************************** +# Functions: Start +#*************************************************************************************************************************************** + +#=============================================================================== +# PCA +#=============================================================================== +multisample_PCA = function(df, sampleinfo_df, outfile) +{ + tempdf = df[,-1]; + tempcol = colnames(tempdf); + tempgrp = sampleinfo_df[tempcol,2]; + tempdf = t(tempdf) %>% as.data.frame(); + tempdf[is.na(tempdf)] = 0; + tempdf$Group = tempgrp; + png(outfile, width = 6, height = 6, units = 'in', res=300); + # bitmap(outfile, "png16m"); + g = autoplot(prcomp(select(tempdf, -Group)), data = tempdf, colour = 'Group', size=3); + saveWidget(ggplotly(g), file.path(gsub("\\.png", "\\.html", outplot))) + plot(g); + dev.off(); +} + +#=============================================================================== +# Regression and Cook's distance +#=============================================================================== +singlesample_regression = function(PE_TE_data,htmloutfile, append=TRUE) +{ + rownames(PE_TE_data) = PE_TE_data$PE_ID; + regmodel = lm(PE_abundance~TE_abundance, data=PE_TE_data); + regmodel_summary = summary(regmodel); + + cat("

Linear Regression model fit between Proteome and Transcriptome data

\n", + "

Assuming a linear relationship between Proteome and Transcriptome data, we here fit a linear regression model.

\n", + '\n', + file = htmloutfile, append = TRUE); + + cat("\n", + "","\n", + "\n", + "\n", + "","\n", + "\n", + "\n", + "\n", + "\n", + file = htmloutfile, append = TRUE); + + cat("

Parameter	Value
Formula	","PE_abundance~TE_abundance","
Coefficients
",names(regmodel$coefficients[1]),"	",regmodel$coefficients[1]," (Pvalue:", regmodel_summary$coefficients[1,4],")","
",names(regmodel$coefficients[2]),"	",regmodel$coefficients[2]," (Pvalue:", regmodel_summary$coefficients[2,4],")","
Model parameters
Residual standard error	",regmodel_summary$sigma," (",regmodel_summary$df[2]," degree of freedom)
F-statistic	",regmodel_summary$fstatistic[1]," ( on ",regmodel_summary$fstatistic[2]," and ",regmodel_summary$fstatistic[3]," degree of freedom)
R-squared	",regmodel_summary$r.squared,"
Adjusted R-squared	",regmodel_summary$adj.r.squared,"

\n", file = htmloutfile, append = TRUE); + + cat( + "

Regression and diagnostics plots

\n", + file = htmloutfile, append = TRUE); + + outplot = paste(outdir,"/PE_TE_lm_1.png",sep="",collapse=""); + png(outplot, width = 10, height = 10, units = 'in',res=300); + # bitmap(outplot, "png16m"); + par(mfrow=c(1,1)); + plot(regmodel, 1, cex.lab=1.5); + dev.off(); + + g <- autoplot(regmodel, label = FALSE)[[1]] + + geom_point(aes(text=sprintf("Residual: %.2f
Fitted value: %.2f
Gene: %s", .fitted, .resid, PE_TE_data$PE_ID)), + shape = 1, size = .1, stroke = .2) + + theme_light() + saveWidget(ggplotly(g, tooltip= c("text")), file.path(gsub("\\.png", "\\.html", outplot))) + + outplot = paste(outdir,"/PE_TE_lm_2.png",sep="",collapse=""); + png(outplot,width = 10, height = 10, units = 'in', res=300); + # bitmap(outplot, "png16m"); + par(mfrow=c(1,1)); + g <- plot(regmodel, 2, cex.lab=1.5); + ggplotly(g) + dev.off(); + + g <- autoplot(regmodel, label = FALSE)[[2]] + + geom_point(aes(text=sprintf("Standarized residual: %.2f
Theoretical quantile: %.2f
Gene: %s", .qqx, .qqy, PE_TE_data$PE_ID)), + shape = 1, size = .1) + + theme_light() + saveWidget(ggplotly(g, tooltip = "text"), file.path(gsub("\\.png", "\\.html", outplot))) + + + outplot = paste(outdir,"/PE_TE_lm_5.png",sep="",collapse=""); + png(outplot, width = 10, height = 10, units = 'in',res=300); + # bitmap(outplot, "png16m"); + par(mfrow=c(1,1)); + plot(regmodel, 5, cex.lab=1.5); + dev.off(); + + cd_cont_pos <- function(leverage, level, model) {sqrt(level*length(coef(model))*(1-leverage)/leverage)} + cd_cont_neg <- function(leverage, level, model) {-cd_cont_pos(leverage, level, model)} + + g <- autoplot(regmodel, label = FALSE)[[4]] + + aes(label = PE_TE_data$PE_ID) + + geom_point(aes(text=sprintf("Leverage: %.2f
Standardized residual: %.2f
Gene: %s", .hat, .stdresid, PE_TE_data$PE_ID))) + + theme_light() + saveWidget(ggplotly(g, tooltip = "text"), file.path(gsub("\\.png", "\\.html", outplot))) + + cat('', file = htmloutfile, append = TRUE); + + cat( + '\n", + '\n', + file = htmloutfile, append = TRUE); + + cat( + '\n', file = htmloutfile, append = TRUE); + + cat( + '\n', + '

', " 1) Residuals vs Fitted plot	2) Normal Q-Q plot of residuals
', + gsub("width:500px;height:500px", "width:600px;height:600px", extractWidgetCode(paste(outdir,"/PE_TE_lm_1.png",sep="",collapse=""))$widget_div), + '	', + gsub("width:500px;height:500px", "width:600px;height:600px", extractWidgetCode(paste(outdir,"/PE_TE_lm_2.png",sep="",collapse=""))$widget_div), + '
This plot checks for linear relationship assumptions. If a horizontal line is observed without any distinct patterns, it indicates a linear relationship.	This plot checks whether residuals are normally distributed or not. It is good if the residuals points follow the straight dashed line i.e., do not deviate much from dashed line.

\n', + file = htmloutfile, append = TRUE); + + + #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + # Residuals data + #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + res_all = regmodel$residuals; + res_mean = mean(res_all); + res_sd = sd(res_all); + res_diff = (res_all-res_mean); + res_zscore = res_diff/res_sd; + # res_outliers = res_all[which((res_zscore > 2)|(res_zscore < -2))] + + + tempind = which((res_zscore > 2)|(res_zscore < -2)); + res_PE_TE_data_no_outlier = PE_TE_data[-tempind,]; + res_PE_TE_data_no_outlier$residuals = res_all[-tempind]; + res_PE_TE_data_outlier = PE_TE_data[tempind,]; + res_PE_TE_data_outlier$residuals = res_all[tempind]; + + # Save the complete table for download (influential_observations) + temp_outlier_data = data.frame(res_PE_TE_data_outlier$PE_ID, res_PE_TE_data_outlier$TE_abundance, res_PE_TE_data_outlier$PE_abundance, res_PE_TE_data_outlier$residuals) + colnames(temp_outlier_data) = c("Gene", "Transcript abundance", "Protein abundance", "Residual value") + outdatafile = paste(outdir,"/PE_TE_outliers_residuals.txt", sep="", collapse=""); + write.table(temp_outlier_data, file=outdatafile, row.names=F, sep="\t", quote=F); + + + # Save the complete table for download (non influential_observations) + temp_all_data = data.frame(PE_TE_data$PE_ID, PE_TE_data$TE_abundance, PE_TE_data$PE_abundance, res_all) + colnames(temp_all_data) = c("Gene", "Transcript abundance", "Protein abundance", "Residual value") + outdatafile = paste(outdir,"/PE_TE_abundance_residuals.txt", sep="", collapse=""); + write.table(temp_all_data, file=outdatafile, row.names=F, sep="\t", quote=F); + + + cat('

Outliers based on the residuals from regression analysis

\n', + file = htmloutfile, append = TRUE); + cat('\n', + '\n', + '\n', + file = htmloutfile, append = TRUE); + + cat("\n", + "\n", + '\n', + '\n

Residuals from Regression
Parameter	Value
Mean Residual value	",res_mean,"
Standard deviation (Residuals)	",res_sd,"
Total outliers (Residual value > 2 standard deviation from the mean)	',length(tempind),' (Download these ',length(tempind),' data points with high residual values here)
', + '(Download the complete residuals data here)', + "

\n", + file = htmloutfile, append = TRUE); + + #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + + + cat('

', file = htmloutfile, append = TRUE); + + cat( + '\n', + file = htmloutfile, append = TRUE); + + cat( + '\n', + file = htmloutfile, append = TRUE); + + cat( + '

3) Residuals vs Leverage plot
', + gsub("width:500px;height:500px", "width:600px;height:600px", extractWidgetCode(paste(outdir,"/PE_TE_lm_5.png",sep="",collapse=""))$widget_div) + , '
This plot is useful to identify any influential cases, that is outliers or extreme values. They might influence the regression results upon inclusion or exclusion from the analysis.

\n', + file = htmloutfile, append = TRUE); + + + + #^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + # Cook's Distance + #^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + cat('

INFLUENTIAL OBSERVATIONS

\n', + file = htmloutfile, append = TRUE); + cat( + '

Cook\'s distance computes the influence of each data point/observation on the predicted outcome. i.e. this measures how much the observation is influencing the fitted values.
In general use, those observations that have a Cook\'s distance > than ', cookdist_upper_cutoff,' times the mean may be classified as influential.

\n', + file = htmloutfile, append = TRUE); + + cooksd <- cooks.distance(regmodel); + + outplot = paste(outdir,"/PE_TE_lm_cooksd.png",sep="",collapse=""); + png(outplot, width = 10, height = 10, units = 'in', res=300); + # bitmap(outplot, "png16m"); + par(mfrow=c(1,1)); + plot(cooksd, main="Influential Obs. by Cook\'s distance", ylab="Cook\'s distance", xlab="Observations", type="n") # plot cooks distance + sel_outlier=which(cooksd>=as.numeric(cookdist_upper_cutoff)*mean(cooksd, na.rm=T)) + sel_nonoutlier=which(cooksdas.numeric(cookdist_upper_cutoff)*mean(cooksd, na.rm=T),names(cooksd),""), col="red", pos=2) # add labels + dev.off(); + + cooksd_df <- data.frame(cooksd) + cooksd_df$genes <- row.names(cooksd_df) + cooksd_df$index <- 1:nrow(cooksd_df) + cooksd_df$colors <- "black" + cutoff <- as.numeric(cookdist_upper_cutoff)*mean(cooksd, na.rm=T) + cooksd_df[cooksd_df$cooksd > cutoff,]$colors <- "red" + + g <- ggplot(cooksd_df, aes(x = index, y = cooksd, label = row.names(cooksd_df), color=as.factor(colors), + text=sprintf("Gene: %s
Cook's Distance: %.3f", row.names(cooksd_df), cooksd))) + + ggtitle("Influential Obs. by Cook's distance") + xlab("Observations") + ylab("Cook's Distance") + + #xlim(0, 3000) + ylim(0, .15) + + scale_shape_discrete(solid=F) + + geom_point(size = 2, shape = 8) + + geom_hline(yintercept = cutoff, + linetype = "dashed", color = "red") + + scale_color_manual(values = c("black" = "black", "red" = "red")) + + theme_light() + theme(legend.position="none") + saveWidget(ggplotly(g, tooltip= "text"), file.path(gsub("\\.png", "\\.html", outplot))) + + cat( + '

', + gsub("width:500px;height:500px", "width:800px;height:800px", extractWidgetCode(outplot)$widget_div), + '
In the above plot, observations above red line (',cookdist_upper_cutoff,' * mean Cook\'s distance) are influential. Genes that are outliers could be important. These observations influences the correlation values and regression coefficients

', + file = htmloutfile, append = TRUE); + + tempind = which(cooksd>as.numeric(cookdist_upper_cutoff)*mean(cooksd, na.rm=T)); + PE_TE_data_no_outlier = PE_TE_data[-tempind,]; + PE_TE_data_no_outlier$cooksd = cooksd[-tempind]; + PE_TE_data_outlier = PE_TE_data[tempind,]; + PE_TE_data_outlier$cooksd = cooksd[tempind]; + a = sort(PE_TE_data_outlier$cooksd, decreasing=T, index.return=T); + PE_TE_data_outlier_sorted = PE_TE_data_outlier[a$ix,]; + + cat( + '\n', + file = htmloutfile, append = TRUE); + + # Save the complete table for download (influential_observations) + temp_outlier_data = data.frame(PE_TE_data_outlier$PE_ID, PE_TE_data_outlier$TE_abundance, PE_TE_data_outlier$PE_abundance, PE_TE_data_outlier$cooksd) + colnames(temp_outlier_data) = c("Gene", "Transcript abundance", "Protein abundance", "Cook's distance") + outdatafile = paste(outdir,"/PE_TE_influential_observation.txt", sep="", collapse=""); + write.table(temp_outlier_data, file=outdatafile, row.names=F, sep="\t", quote=F); + + + # Save the complete table for download (non influential_observations) + temp_no_outlier_data = data.frame(PE_TE_data_no_outlier$PE_ID, PE_TE_data_no_outlier$TE_abundance, PE_TE_data_no_outlier$PE_abundance, PE_TE_data_no_outlier$cooksd) + colnames(temp_no_outlier_data) = c("Gene", "Transcript abundance", "Protein abundance", "Cook's distance") + outdatafile = paste(outdir,"/PE_TE_non_influential_observation.txt", sep="", collapse=""); + write.table(temp_no_outlier_data, file=outdatafile, row.names=F, sep="\t", quote=F); + + + cat("\n", + "\n", + + "

Parameter	Value
Mean Cook\'s distance	",mean(cooksd, na.rm=T),"
Total influential observations (Cook\'s distance > ",cookdist_upper_cutoff," * mean Cook\'s distance)	",length(tempind),"
Observations with Cook\'s distance < ",cookdist_upper_cutoff," * mean Cook\'s distance	",length(which(cooksd\n", + "

\n", + file = htmloutfile, append = TRUE); + + + #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + # Scatter plot after removal of influential points + #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + outplot = paste(outdir,"/AbundancePlot_scatter_without_outliers.png",sep="",collapse=""); + min_lim = min(c(PE_TE_data$PE_abundance,PE_TE_data$TE_abundance)); + max_lim = max(c(PE_TE_data$PE_abundance,PE_TE_data$TE_abundance)); + png(outplot, width = 10, height = 10, units = 'in', res=300); + # bitmap(outplot,"png16m"); + g = ggplot(PE_TE_data_no_outlier, aes(x=TE_abundance, y=PE_abundance, label=PE_ID)) + geom_smooth() + + xlab("Transcript abundance log fold-change") + ylab("Protein abundance log fold-change") + + xlim(min_lim,max_lim) + ylim(min_lim,max_lim) + + geom_point(aes(text=sprintf("Gene: %s
Transcript Abundance (log fold-change): %.3f
Protein Abundance (log fold-change): %.3f", + PE_ID, TE_abundance, PE_abundance))) + suppressMessages(plot(g)); + suppressMessages(saveWidget(ggplotly(g, tooltip="text"), file.path(gsub("\\.png", "\\.html", outplot)))); + dev.off(); + + + cat('\n', file = htmloutfile, append = TRUE); + # Before + cat("\n', + file = htmloutfile, append = TRUE); + + # After + cat("\n', + file = htmloutfile, append = TRUE); + #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + + + cor_result_pearson = cor.test(PE_TE_data_no_outlier[,"TE_abundance"], PE_TE_data_no_outlier[,"PE_abundance"], method = "pearson"); + cor_result_spearman = cor.test(PE_TE_data_no_outlier[,"TE_abundance"], PE_TE_data_no_outlier[,"PE_abundance"], method = "spearman"); + cor_result_kendall = cor.test(PE_TE_data_no_outlier[,"TE_abundance"], PE_TE_data_no_outlier[,"PE_abundance"], method = "kendall"); + + cat('\n', file = htmloutfile, append=TRUE); + + + cat('

Scatterplot: Before removal

Scatterplot: After removal

", + '

', + gsub('id="html', 'id="secondhtml"', + gsub("width:500px;height:500px", "width:600px;height:600px", extractWidgetCode(paste(outdir,"/TE_PE_scatter.png",sep="",collapse=""))$widget_div)), + '

\n", + '

', + gsub("width:500px;height:500px", "width:600px;height:600px", extractWidgetCode(outplot)$widget_div), + + '

\n', file = htmloutfile, append=TRUE); + singlesample_cor(PE_TE_data, htmloutfile, append=TRUE); + cat('

\n', + file = htmloutfile, append = TRUE); + + cat( + "\n", + "\n", + file = htmloutfile, append = TRUE) + cat("

Parameter	Method 1	Method 2	Method 3
Correlation method	",cor_result_pearson$method,"	",cor_result_spearman$method,"	",cor_result_kendall$method,"
Correlation coefficient	",cor_result_pearson$estimate,"	",cor_result_spearman$estimate,"	",cor_result_kendall$estimate,"

\n", file = htmloutfile, append = TRUE) + + + + if(dim(PE_TE_data_outlier)[1]<10) + { + tab_n_row = dim(PE_TE_data_outlier)[1]; + }else{ + tab_n_row = 10; + } + + cat("

Download the complete list of influential observations ", + "Download the complete list (After removing influential points)
\n", + '

Top ',as.character(tab_n_row),' Influential observations (Cook\'s distance > ',cookdist_upper_cutoff,' * mean Cook\'s distance)

\n', + file = htmloutfile, append = TRUE); + + cat('\n', sep = "",file = htmloutfile, append = TRUE); + cat("\n", + file = htmloutfile, append = TRUE); + + + for(i in 1:tab_n_row) + { + cat( + '','\n', + '\n', + '\n', + '\n', + file = htmloutfile, append = TRUE); + } + cat('

Gene	Protein Log Fold-Change	Transcript Log Fold-Change	Cook's Distance
',as.character(PE_TE_data_outlier_sorted[i,1]),'	',format(PE_TE_data_outlier_sorted[i,2], scientific=F),'	',PE_TE_data_outlier_sorted[i,4],'	',format(PE_TE_data_outlier_sorted[i,5], scientific=F),'

\n',file = htmloutfile, append = TRUE); + + +} + + + +#=============================================================================== +# Heatmap +#=============================================================================== +singlesample_heatmap=function(PE_TE_data, htmloutfile, hm_nclust){ + cat('
\n', + file = htmloutfile, append = TRUE); + + hc=hclust(dist(as.matrix(PE_TE_data[,c("PE_abundance","TE_abundance")]))) + hm_cluster = cutree(hc,k=hm_nclust); + + outplot = paste(outdir,"/PE_TE_heatmap.png",sep="",collapse=""); + png(outplot, width = 10, height = 10, units = 'in', res=300); + # bitmap(outplot, "png16m"); + par(mfrow=c(1,1)); + hmap = heatmap.2(as.matrix(PE_TE_data[,c("PE_abundance","TE_abundance")]), + trace="none", cexCol=1, col=greenred(100),Colv=F, + labCol=c("Proteins","Transcripts"), scale="col", + hclustfun = hclust, distfun = dist); + + dev.off(); + + + p <- d3heatmap(as.matrix(PE_TE_data[,c("PE_abundance","TE_abundance")]), scale = "col", + dendrogram = "row", colors = greenred(100), + hclustfun = hclust, distfun = dist, + show_grid = FALSE) + saveWidget(p, file.path(gsub("\\.png", "\\.html", outplot))) + + cat('\n', + file = htmloutfile, append = TRUE); + + + temp_PE_TE_data = data.frame(PE_TE_data$PE_ID, PE_TE_data$TE_abundance, PE_TE_data$PE_abundance, hm_cluster); + colnames(temp_PE_TE_data) = c("Gene", "Transcript abundance", "Protein abundance", "Cluster (Hierarchical clustering)") + tempoutfile = paste(outdir,"/PE_TE_hc_clusterpoints.txt",sep="",collapse=""); + write.table(temp_PE_TE_data, file=tempoutfile, row.names=F, quote=F, sep="\t", eol="\n") + + + cat('

Heatmap of PE and TE abundance values (Hierarchical clustering)	Number of clusters to extract: ',hm_nclust,'
', + '', + gsub("width:960px;height:500px", "width:800px;height:800px", extractWidgetCode(outplot)$widget_div), + '
Download the hierarchical cluster list

\n', + file = htmloutfile, append = TRUE); +} + + +#=============================================================================== +# K-means clustering +#=============================================================================== +singlesample_kmeans=function(PE_TE_data, htmloutfile, nclust){ + PE_TE_data_kdata = PE_TE_data; + k1 = kmeans(PE_TE_data_kdata[,c("PE_abundance","TE_abundance")], nclust); + outplot = paste(outdir,"/PE_TE_kmeans.png",sep="",collapse=""); + png(outplot, width = 10, height = 10, units = 'in', res=300); + # bitmap(outplot, "png16m"); + par(mfrow=c(1,1)); + scatter.smooth(PE_TE_data_kdata[,"TE_abundance"], PE_TE_data_kdata[,"PE_abundance"], xlab="Transcript Abundance", ylab="Protein Abundance", cex.lab=1.5); + legend(1, 95, legend=c("Cluster 1", "Line 2"), col="red", lty=1:1, cex=0.8) + legend(1, 95, legend="Cluster 2", col="green", lty=1:1, cex=0.8) + + ind=which(k1$cluster==1); + points(PE_TE_data_kdata[ind,"TE_abundance"], PE_TE_data_kdata[ind,"PE_abundance"], col="red", pch=16); + ind=which(k1$cluster==2); + points(PE_TE_data_kdata[ind,"TE_abundance"], PE_TE_data_kdata[ind,"PE_abundance"], col="green", pch=16); + ind=which(k1$cluster==3); + points(PE_TE_data_kdata[ind,"TE_abundance"], PE_TE_data_kdata[ind,"PE_abundance"], col="blue", pch=16); + ind=which(k1$cluster==4); + points(PE_TE_data_kdata[ind,"TE_abundance"], PE_TE_data_kdata[ind,"PE_abundance"], col="cyan", pch=16); + ind=which(k1$cluster==5); + points(PE_TE_data_kdata[ind,"TE_abundance"], PE_TE_data_kdata[ind,"PE_abundance"], col="black", pch=16); + ind=which(k1$cluster==6); + points(PE_TE_data_kdata[ind,"TE_abundance"], PE_TE_data_kdata[ind,"PE_abundance"], col="brown", pch=16); + ind=which(k1$cluster==7); + points(PE_TE_data_kdata[ind,"TE_abundance"], PE_TE_data_kdata[ind,"PE_abundance"], col="gold", pch=16); + ind=which(k1$cluster==8); + points(PE_TE_data_kdata[ind,"TE_abundance"], PE_TE_data_kdata[ind,"PE_abundance"], col="thistle", pch=16); + ind=which(k1$cluster==9); + points(PE_TE_data_kdata[ind,"TE_abundance"], PE_TE_data_kdata[ind,"PE_abundance"], col="yellow", pch=16); + ind=which(k1$cluster==10); + points(PE_TE_data_kdata[ind,"TE_abundance"], PE_TE_data_kdata[ind,"PE_abundance"], col="orange", pch=16); + dev.off(); + + # Interactive plot for k-means clustering + g <- ggplot(PE_TE_data, aes(x = TE_abundance, y = PE_abundance, label = row.names(PE_TE_data), + text=sprintf("Gene: %s
Transcript Abundance: %.3f
Protein Abundance: %.3f", + PE_ID, TE_abundance, PE_abundance), + color=as.factor(k1$cluster))) + + xlab("Transcript Abundance") + ylab("Protein Abundance") + + scale_shape_discrete(solid=F) + geom_smooth(method = "loess", span = 2/3) + + geom_point(size = 1, shape = 8) + + theme_light() + theme(legend.position="none") + saveWidget(ggplotly(g, tooltip=c("text")), file.path(gsub("\\.png", "\\.html", outplot))) + + cat('

\n', + file = htmloutfile, append = TRUE); + + tempind = order(k1$cluster); + tempoutfile = paste(outdir,"/PE_TE_kmeans_clusterpoints.txt",sep="",collapse=""); + write.table(data.frame(PE_TE_data_kdata[tempind, ], Cluster=k1$cluster[tempind]), file=tempoutfile, row.names=F, quote=F, sep="\t", eol="\n") + + #paste(outdir,"/PE_TE_heatmap.png",sep="",collapse=""); + cat('\n', + file = htmloutfile, append = TRUE); + cat('

K-mean clustering	Number of clusters: ',nclust,'
', + gsub("width:500px;height:500px", "width:800px;height:800px", extractWidgetCode(outplot)$widget_div), '
Download the cluster list

\n', + file = htmloutfile, append = TRUE); + +} + +#=============================================================================== +# scatter plot +#=============================================================================== +singlesample_scatter = function(PE_TE_data, outfile) +{ + min_lim = min(c(PE_TE_data$PE_abundance,PE_TE_data$TE_abundance)); + max_lim = max(c(PE_TE_data$PE_abundance,PE_TE_data$TE_abundance)); + png(outfile, width = 10, height = 10, units = 'in', res=300); + # bitmap(outfile, "png16m"); + g = ggplot(PE_TE_data, aes(x=TE_abundance, y=PE_abundance, label=PE_ID)) + geom_smooth() + + xlab("Transcript abundance log fold-change") + ylab("Protein abundance log fold-change") + + xlim(min_lim,max_lim) + ylim(min_lim,max_lim) + + geom_point(aes(text=sprintf("Gene: %s
Transcript Abundance (log fold-change): %.3f
Protein Abundance (log fold-change): %.3f", + PE_ID, TE_abundance, PE_abundance)), + size = .5) + suppressMessages(plot(g)); + suppressMessages(saveWidget(ggplotly(g, tooltip = "text"), file.path(gsub("\\.png", "\\.html", outfile)))) + dev.off(); +} + +#=============================================================================== +# Correlation table +#=============================================================================== +singlesample_cor = function(PE_TE_data, htmloutfile, append=TRUE) +{ + cor_result_pearson = cor.test(PE_TE_data$TE_abundance, PE_TE_data$PE_abundance, method = "pearson"); + cor_result_spearman = cor.test(PE_TE_data$TE_abundance, PE_TE_data$PE_abundance, method = "spearman"); + cor_result_kendall = cor.test(PE_TE_data$TE_abundance, PE_TE_data$PE_abundance, method = "kendall"); + + cat( + '\n', + file = htmloutfile, append = TRUE); + + cat( + "\n", + "\n", + file = htmloutfile, append = TRUE) + cat("

Parameter	Method 1	Method 2	Method 3
Correlation method	",cor_result_pearson$method,"	",cor_result_spearman$method,"	",cor_result_kendall$method,"
Correlation coefficient	",cor_result_pearson$estimate,"	",cor_result_spearman$estimate,"	",cor_result_kendall$estimate,"

\n", file = htmloutfile, append = TRUE); + +} + +#=============================================================================== +# Boxplot +#=============================================================================== +multisample_boxplot = function(df, sampleinfo_df, outfile, fill_leg, user_xlab, user_ylab) +{ + tempdf = df[,-1, drop=FALSE]; + tempdf = t(tempdf) %>% as.data.frame(); + tempdf[is.na(tempdf)] = 0; + tempdf$Sample = rownames(tempdf); + tempdf1 = melt(tempdf, id.vars = "Sample"); + + if("Gene" %in% colnames(df)){ + tempdf1$Name = df$Gene; + } else if ("Protein" %in% colnames(df)){ + tempdf1$Name = df$Protein; + } else if ("Genes" %in% colnames(df)){ + tempdf1$Name = df$Genes; + } + + tempdf1$Group = sampleinfo_df[tempdf1$Sample,2]; + png(outplot, width = 6, height = 6, units = 'in', res=300); + # bitmap(outplot, "png16m"); + if(fill_leg=="No"){ + tempdf1$Group = c("case", "control") + } + + g = ggplot(tempdf1, aes(x=Sample, y=value, fill=Group)) + + geom_boxplot()+ + labs(x=user_xlab) + labs(y=user_ylab) + saveWidget(ggplotly(g), file.path(gsub("\\.png", "\\.html", outfile))) + plot(g); + dev.off(); +} + +## A wrapper to saveWidget which compensates for arguable BUG in +## saveWidget which requires `file` to be in current working +## directory. +saveWidget <- function (widget,file,...) { + wd<-getwd() + on.exit(setwd(wd)) + outDir<-dirname(file) + file<-basename(file) + setwd(outDir); + htmlwidgets::saveWidget(widget,file=file,selfcontained = FALSE) +} + +#=============================================================================== +# Mean or Median of Replicates +#=============================================================================== + +mergeReplicates = function(TE_df,PE_df, sampleinfo_df, method) +{ + grps = unique(sampleinfo_df[,2]); + + TE_df_merged <<- sapply(grps, function(x){ + tempsample = sampleinfo_df[which(sampleinfo_df$Group==x),1] + if(length(tempsample)!=1){ + apply(TE_df[,tempsample],1,method); + }else{ + return(TE_df[,tempsample]); + } + }); + TE_df_merged <<- data.frame(as.character(TE_df[,1]), TE_df_merged); + colnames(TE_df_merged) = c(colnames(TE_df)[1], grps); + + PE_df_merged <<- sapply(grps, function(x){ + tempsample = sampleinfo_df[which(sampleinfo_df$Group==x),1] + if(length(tempsample)!=1){ + apply(PE_df[,tempsample],1,method); + }else{ + return(PE_df[,tempsample]); + } + }); + + PE_df_merged <<- data.frame(as.character(PE_df[,1]), PE_df_merged); + colnames(PE_df_merged) = c(colnames(PE_df)[1], grps); + + #sampleinfo_df_merged = data.frame(Sample = grps, Group = grps, stringsAsFactors = F); + sampleinfo_df_merged = data.frame(Sample = grps, Group = "Group", stringsAsFactors = F); + + return(list(TE_df_merged = TE_df_merged, PE_df_merged = PE_df_merged, sampleinfo_df_merged = sampleinfo_df_merged)); +} + +#=============================================================================== +# (T-Test or Wilcoxon ranksum test) and Volcano Plot +#=============================================================================== + +perform_Test_Volcano = function(TE_df_data,PE_df_data,TE_df_logfold, PE_df_logfold,sampleinfo_df, method, correction_method,volc_with) +{ + + PE_colnames = colnames(PE_df_data); + control_sample = sampleinfo_df[which(sampleinfo_df$Group=="control"),1]; + control_ind <<- sapply(control_sample, function(x){temp_ind = which(PE_colnames==x); as.numeric(temp_ind)}); + condition_sample = sampleinfo_df[which(sampleinfo_df$Group=="case"),1]; + condition_ind <<- sapply(condition_sample, function(x){temp_ind = which(PE_colnames==x); as.numeric(temp_ind)}); + + if(method=="mean"){ + #PE_pval = apply(PE_df_data[2:length(colnames(PE_df_data))],1,function(x) t.test(x[condition_ind-1], x[control_ind-1])$p.value); + PE_pval = apply(PE_df_data[2:length(colnames(PE_df_data))],1,function(x) {obj<-try(t.test(x[condition_ind-1], x[control_ind-1]),silent=TRUE); if(is(obj, "try-error")){return(NA)}else{return(obj$p.value)}}) + }else{ + if(method=="median"){ + PE_pval = apply(PE_df_data[2:length(colnames(PE_df_data))],1,function(x) {obj<-try(wilcox.test(x[condition_ind-1], x[control_ind-1]),silent=TRUE); if(is(obj, "try-error")){return(NA)}else{return(obj$p.value)}}) + # PE_pval = apply(PE_df_data[2:length(colnames(PE_df_data))],1,function(x) wilcox.test(x[condition_ind-1], x[control_ind-1])$p.value); + } + } + PE_adj_pval = p.adjust(PE_pval, method = correction_method, n = length(PE_pval)) + + + TE_colnames = colnames(TE_df_data); + control_sample = sampleinfo_df[which(sampleinfo_df$Group=="control"),1]; + control_ind <<- sapply(control_sample, function(x){temp_ind = which(TE_colnames==x); as.numeric(temp_ind)}); + condition_sample = sampleinfo_df[which(sampleinfo_df$Group=="case"),1]; + condition_ind <<- sapply(condition_sample, function(x){temp_ind = which(TE_colnames==x); as.numeric(temp_ind)}); + + if(method=="mean"){ + # TE_pval = apply(TE_df_data[2:length(colnames(TE_df_data))],1,function(x) t.test(x[condition_ind-1], x[control_ind-1])$p.value); + TE_pval = apply(TE_df_data[2:length(colnames(TE_df_data))],1,function(x) {obj<-try(t.test(x[condition_ind-1], x[control_ind-1]),silent=TRUE); if(is(obj, "try-error")){return(NA)}else{return(obj$p.value)}}) + }else{ + if(method=="median"){ + TE_pval = apply(TE_df_data[2:length(colnames(TE_df_data))],1,function(x) {obj<-try(wilcox.test(x[condition_ind-1], x[control_ind-1]),silent=TRUE); if(is(obj, "try-error")){return(NA)}else{return(obj$p.value)}}) + # TE_pval = apply(TE_df_data[2:length(colnames(TE_df_data))],1,function(x) wilcox.test(x[condition_ind-1], x[control_ind-1])$p.value); + } + } + TE_adj_pval = p.adjust(TE_pval, method = correction_method, n = length(TE_pval)) + + + PE_TE_logfold_pval = data.frame(TE_df_logfold$Gene, TE_df_logfold$LogFold, TE_pval, TE_adj_pval, PE_df_logfold$LogFold, PE_pval, PE_adj_pval); + colnames(PE_TE_logfold_pval) = c("Gene", "Transcript log fold-change", "p-value (transcript)", "adj p-value (transcript)", "Protein log fold-change", "p-value (protein)", "adj p-value (protein)"); + outdatafile = paste(outdir,"/PE_TE_logfold_pval.txt", sep="", collapse=""); + write.table(PE_TE_logfold_pval, file=outdatafile, row.names=F, sep="\t", quote=F); + cat("

Download the complete fold change data here
\n", + file = htmloutfile, append = TRUE); + + if(length(condition_ind)!=1) + { + # Volcano Plot + + if(volc_with=="adj_pval") + { + PE_pval = PE_adj_pval + TE_pval = TE_adj_pval + volc_ylab = "-log10 Adjusted p-value"; + }else{ + if(volc_with=="pval") + { + volc_ylab = "-log10 p-value"; + } + } + outplot_PE = paste(outdir,"/PE_volcano.png",sep="",collapse=""); + png(outplot_PE, width = 10, height = 10, units = 'in', res=300); + # bitmap(outplot, "png16m"); + par(mfrow=c(1,1)); + + plot(PE_df_logfold$LogFold, -log10(PE_pval), + xlab="log2 fold change", ylab=volc_ylab, + type="n") + sel <- which((PE_df_logfold$LogFold<=log(2,base=2))&(PE_df_logfold$LogFold>=log(0.5, base=2))) # or whatever you want to use + points(PE_df_logfold[sel,"LogFold"], -log10(PE_pval[sel]),col="black") + PE_df_logfold$color <- "black" + #sel <- which((PE_df_logfold$LogFold>log(2,base=2))&(PE_df_logfold$LogFoldlog(2,base=2))|(PE_df_logfold$LogFoldlog(2,base=2))|(PE_df_logfold$LogFold0.05) + sel=intersect(sel,sel1) + points(PE_df_logfold[sel,"LogFold"], -log10(PE_pval[sel]),col="blue") + PE_df_logfold[sel,]$color <- "blue" + abline(h = -log(0.05,base=10), col="red", lty=2) + abline(v = log(2,base=2), col="red", lty=2) + abline(v = log(0.5,base=2), col="red", lty=2) + dev.off(); + + g <- ggplot(PE_df_logfold, aes(x = LogFold, -log10(PE_pval), color = as.factor(color), + text=sprintf("Gene: %s
Log2 Fold-Change: %.3f
-log10 p-value: %.3f
p-value: %.3f", + Genes, LogFold, -log10(PE_pval), PE_pval))) + + xlab("log2 fold change") + ylab("-log10 p-value") + + geom_point(shape=1, size = 1.5, stroke = .2) + + scale_color_manual(values = c("black" = "black", "red" = "red", "blue" = "blue")) + + geom_hline(yintercept = -log(0.05,base=10), linetype="dashed", color="red") + + geom_vline(xintercept = log(2,base=2), linetype="dashed", color="red") + + geom_vline(xintercept = log(0.5,base=2), linetype="dashed", color="red") + + theme_light() + theme(legend.position="none") + saveWidget(ggplotly(g, tooltip="text"), file.path(gsub("\\.png", "\\.html", outplot_PE))) + + outplot_TE = paste(outdir,"/TE_volcano.png",sep="",collapse=""); + png(outplot_TE, width = 10, height = 10, units = 'in', res=300); + # bitmap(outplot, "png16m"); + par(mfrow=c(1,1)); + + plot(TE_df_logfold$LogFold, -log10(TE_pval), + xlab="log2 fold change", ylab=volc_ylab, + type="n") + + sel <- which((TE_df_logfold$LogFold<=log(2,base=2))&(TE_df_logfold$LogFold>=log(0.5, base=2))) # or whatever you want to use + points(TE_df_logfold[sel,"LogFold"], -log10(TE_pval[sel]),col="black") + TE_df_logfold$color <- "black" + #sel <- which((TE_df_logfold$LogFold>log(2,base=2))&(TE_df_logfold$LogFoldlog(2,base=2))|(TE_df_logfold$LogFoldlog(2,base=2))|(TE_df_logfold$LogFold0.05) + sel=intersect(sel,sel1) + points(TE_df_logfold[sel,"LogFold"], -log10(TE_pval[sel]),col="blue") + TE_df_logfold[sel,]$color <- "blue" + abline(h = -log(0.05,base=10), col="red", lty=2) + abline(v = log(2,base=2), col="red", lty=2) + abline(v = log(0.5,base=2), col="red", lty=2) + dev.off(); + + g <- ggplot(TE_df_logfold, aes(x = LogFold, -log10(TE_pval), color = as.factor(color), + text=sprintf("Gene: %s
Log2 Fold-Change: %.3f
-log10 p-value: %.3f
p-value: %.3f", + Genes, LogFold, -log10(TE_pval), TE_pval))) + + xlab("log2 fold change") + ylab("-log10 p-value") + + geom_point(shape=1, size = 1.5, stroke = .2) + + scale_color_manual(values = c("black" = "black", "red" = "red", "blue" = "blue")) + + geom_hline(yintercept = -log(0.05,base=10), linetype="dashed", color="red") + + geom_vline(xintercept = log(2,base=2), linetype="dashed", color="red") + + geom_vline(xintercept = log(0.5,base=2), linetype="dashed", color="red") + + theme_light() + theme(legend.position="none") + saveWidget(ggplotly(g, tooltip="text"), file.path(gsub("\\.png", "\\.html", outplot_TE))) + + + cat('
\n', file = htmloutfile, append = TRUE); + cat("\n', file = htmloutfile, append = TRUE); + cat("

Transcript Fold-Change	Protein Fold-Change
", + '', + extractWidgetCode(outplot_TE)$widget_div, + '	", + '', + extractWidgetCode(outplot_PE)$widget_div, + '

\n', + file = htmloutfile, append = TRUE); + + + }else{ + cat('

!!! No replicates found. Cannot perform test to check significance of differential expression. Thus, no Volcano plot generated !!!

', + file = htmloutfile, append = TRUE); + } + +} + + +#*************************************************************************************************************************************** +# Functions: End +#*************************************************************************************************************************************** + + +#=============================================================================== +# Arguments +#=============================================================================== +noargs = 12; +args = commandArgs(trailingOnly = TRUE); +if(length(args) != noargs) +{ + stop(paste("Please check usage. Number of arguments is not equal to ",noargs,sep="",collapse="")); +} + +mode = args[1]; # "multiple" or "logfold" +method = args[2]; # "mean" or "median" +sampleinfo_file = args[3]; +proteome_file = args[4]; +transcriptome_file = args[5]; +correction_method = args[6]; +cookdist_upper_cutoff = args[7]; +numCluster = args[8]; +hm_nclust = args[9]; +volc_with = args[10]; + +htmloutfile = args[11]; # html output file +outdir = args[12]; # html supporting files + +#=============================================================================== +# Check for file existance +#=============================================================================== +if(! file.exists(proteome_file)) +{ + stop(paste("Proteome Data file does not exists. Path given: ",proteome_file,sep="",collapse="")); +} +if(! file.exists(transcriptome_file)) +{ + stop(paste("Transcriptome Data file does not exists. Path given: ",transcriptome_file,sep="",collapse="")); +} + +#=============================================================================== +# Load library +#=============================================================================== +options(warn=-1); + +suppressPackageStartupMessages(library(dplyr)); +suppressPackageStartupMessages(library(data.table)); +suppressPackageStartupMessages(library(gplots)); +suppressPackageStartupMessages(library(ggplot2)); +suppressPackageStartupMessages(library(ggfortify)); +suppressPackageStartupMessages(library(plotly)); +suppressPackageStartupMessages(library(d3heatmap)); + +#=============================================================================== +# Select mode and parse experiment design file +#=============================================================================== +if(mode=="multiple") +{ + expDesign = fread(sampleinfo_file, header = FALSE, stringsAsFactors = FALSE, sep="\t") %>% data.frame(); + expDesign_cc = expDesign[1:2,]; + + sampleinfo_df = expDesign[3:nrow(expDesign),]; + rownames(sampleinfo_df)=1:nrow(sampleinfo_df); + colnames(sampleinfo_df) = c("Sample","Group"); + + condition_cols = sampleinfo_df[which(sampleinfo_df[,2]==expDesign_cc[which(expDesign_cc[,1]=="case"),2]),1]; + condition_g_name = "case"; + control_cols = sampleinfo_df[which(sampleinfo_df[,2]==expDesign_cc[which(expDesign_cc[,1]=="control"),2]),1]; + control_g_name = "control"; + sampleinfo_df[which(sampleinfo_df[,2]==expDesign_cc[which(expDesign_cc[,1]=="case"),2]),2] = "case"; + sampleinfo_df[which(sampleinfo_df[,2]==expDesign_cc[which(expDesign_cc[,1]=="control"),2]),2] = "control"; + sampleinfo_df_orig = sampleinfo_df; +} + +if(mode=="logfold") +{ + sampleinfo_df = data.frame("Sample"= c("LogFold"), "Group"=c("Fold_Change")) +} + +#=============================================================================== +# Parse Transcriptome data +#=============================================================================== +TE_df_orig = fread(transcriptome_file, sep="\t", stringsAsFactor=F, header=T) %>% data.frame(); +if(mode=="multiple") +{ + TE_df = TE_df_orig[,c(colnames(TE_df_orig)[1],condition_cols,control_cols)]; +} +if(mode=="logfold") +{ + TE_df = TE_df_orig; + colnames(TE_df) = c("Genes", "LogFold"); +} +#=============================================================================== +# Parse Proteome data +#=============================================================================== +PE_df_orig = fread(proteome_file, sep="\t", stringsAsFactor=F, header=T) %>% data.frame(); +if(mode=="multiple") +{ + PE_df = PE_df_orig[,c(colnames(PE_df_orig)[1],condition_cols,control_cols)]; +} +if(mode=="logfold") +{ + PE_df = PE_df_orig; + colnames(PE_df) = c("Genes", "LogFold"); +} + +#============================================================================================================= +# Create directory structures and then set the working directory to output directory +#============================================================================================================= +if(! file.exists(outdir)) +{ + dir.create(outdir); +} +#=============================================================================== +# Write initial data summary in html outfile +#=============================================================================== +cat("\n", file = htmloutfile); + +cat("

QuanTP: Association between abundance ratios of transcript and protein

\n", + "

Input data summary

\n", + "

Abbreviations used: PE (Proteome data) and TE (Transcriptome data)","

Input Proteome data dimension (Row Column): ", dim(PE_df)[1]," x ", dim(PE_df)[2],"
Input Transcriptome data dimension (Row Column): ", dim(TE_df)[1]," x ", dim(TE_df)[2],"

\n", + file = htmloutfile, append = TRUE); + +cat("

\n", + file = htmloutfile, append = TRUE); +#=============================================================================== +# Find common samples +#=============================================================================== +common_samples = intersect(sampleinfo_df[,1], colnames(TE_df)[-1]) %>% intersect(., colnames(PE_df)[-1]); + +if(length(common_samples)==0) +{ + stop("No common samples found "); + cat("Please check your experiment design file. Sample names (column names) in the Transcriptome and the Proteome data do not match. \n",file = htmloutfile, append = TRUE); +} + +#=============================================================================== +# Create subsets based on common samples +#=============================================================================== +TE_df = select(TE_df, 1, common_samples); +PE_df = select(PE_df, 1, common_samples); +sampleinfo_df = filter(sampleinfo_df, Sample %in% common_samples); +rownames(sampleinfo_df) = sampleinfo_df[,1]; + +#=============================================================================== +# Check for number of rows similarity +#=============================================================================== +if(nrow(TE_df) != nrow(PE_df)) +{ + stop("Number of rows in Transcriptome and Proteome data are not same i.e. they are not paired"); + cat("The correlation analysis expects paired TE and PE data i.e. (i)th gene/transcript of TE file should correspond to (i)th protein of PE file. In the current input provided there is mismatch in terms of number of rows of TE and PE file. Please make sure you provide paired data.\n",file = htmloutfile, append = TRUE); +} + +#=============================================================================== +# Number of groups +#=============================================================================== +ngrps = unique(sampleinfo_df[,2]) %>% length(); +grps = unique(sampleinfo_df[,2]); +names(grps) = grps; + +#=============================================================================== +# Change column1 name +#=============================================================================== +colnames(TE_df)[1] = "Gene"; +colnames(PE_df)[1] = "Protein"; + +#=============================================================================== +# Treat missing values +#=============================================================================== +TE_nacount = sum(is.na(TE_df)); +PE_nacount = sum(is.na(PE_df)); + +TE_df[is.na(TE_df)] = 0; +PE_df[is.na(PE_df)] = 0; + +#=============================================================================== +# Obtain JS/HTML lines for interactive visualization +#=============================================================================== +extractWidgetCode = function(outplot){ + lines <- readLines(gsub("\\.png", "\\.html", outplot)) + return(list( + 'prescripts' = c('', + gsub('script', 'script', + lines[grep('',lines) + 3 + :grep('' ,lines) - 5]), + ''), + 'widget_div' = paste('', + gsub('width:100%;height:400px', + 'width:500px;height:500px', + lines[grep(lines, pattern='html-widget')]), + '', sep=''), + 'postscripts' = paste('', + gsub('script', 'script', + lines[grep(lines, pattern='

Linear Regression model fit between Proteome and Transcriptome data

Regression and diagnostics plots

1) Residuals vs Fitted plot

2) Normal Q-Q plot of residuals

Outliers based on the residuals from regression analysis

3) Residuals vs Leverage plot

INFLUENTIAL OBSERVATIONS

Top ',as.character(tab_n_row),' Influential observations (Cook\'s distance > ',cookdist_upper_cutoff,' * mean Cook\'s distance)

QuanTP: Association between abundance ratios of transcript and protein

Input data summary

Table of Contents: