# HG changeset patch # User pravs # Date 1529541836 14400 # Node ID 6bf0203ee17eb576977efcf0f3ab3f57709174fe # Parent b014014e685abffff4f639c90786a6c968c116f7 planemo upload diff -r b014014e685a -r 6bf0203ee17e protein_rna_correlation.r --- a/protein_rna_correlation.r Sun Jun 17 05:06:18 2018 -0400 +++ b/protein_rna_correlation.r Wed Jun 20 20:43:56 2018 -0400 @@ -80,8 +80,8 @@ # ............................SCRIPT STARTS FROM HERE ............................. #================================================================================== # Warning Off -oldw <- getOption("warn") -options(warn = -1) +# oldw <- getOption("warn") +# options(warn = -1) #============================================================================================================= # Functions #============================================================================================================= @@ -411,10 +411,10 @@ cat( "

Download mapped unmapped data

", - "", file = htmloutfile, append = TRUE); #========================================================================================== @@ -470,8 +470,8 @@ # Write excluded transcriptomics and proteomics data link to html file cat( - "", + "", file = htmloutfile, append = TRUE); # Keep the unexcluded entries only @@ -535,7 +535,7 @@ # Distribution of proteome and transcriptome abundance (Box and Density plot) #============================================================================================================= cat("Generating Box and Density plot\n",file=logfile, append=T); - outplot = paste("./",outdir,"/AbundancePlot.png",sep="",collapse=""); + outplot = paste(outdir,"/AbundancePlot.png",sep="",collapse=""); png(outplot); par(mfrow=c(2,2)); boxplot(proteome_df[,2], ylab="Abundance", main="Proteome abundance", cex.lab=1.5); @@ -553,7 +553,7 @@ # Scatter plot #============================================================================================================= cat("Generating scatter plot\n",file=logfile, append=T); - outplot = paste("./",outdir,"/AbundancePlot_scatter.png",sep="",collapse=""); + outplot = paste(outdir,"/AbundancePlot_scatter.png",sep="",collapse=""); png(outplot); par(mfrow=c(1,1)); scatter.smooth(transcriptome_df[,2], proteome_df[,2], xlab="Transcript Abundance", ylab="Protein Abundance", cex.lab=1.5); @@ -620,7 +620,7 @@ #============================================================================================================= # Plotting various regression diagnostics plots #============================================================================================================= - outplot1 = paste("./",outdir,"/PE_GE_modelfit.pdf",sep="",collapse=""); + outplot1 = paste(outdir,"/PE_GE_modelfit.pdf",sep="",collapse=""); pdf(outplot1); devnum = which(unlist(sapply(2:length(.Devices), function(x){attributes(.Devices[[x]])$filepath==outplot1})))+1 print(.Devices) @@ -635,31 +635,31 @@ "

Plotting various regression diagnostics plots

\n", file = htmloutfile, append = TRUE); - outplot = paste("./",outdir,"/PE_GE_lm_1.png",sep="",collapse=""); + outplot = paste(outdir,"/PE_GE_lm_1.png",sep="",collapse=""); png(outplot); par(mfrow=c(1,1)); plot(regmodel, 1, cex.lab=1.5); dev.off(); - outplot = paste("./",outdir,"/PE_GE_lm_2.png",sep="",collapse=""); + outplot = paste(outdir,"/PE_GE_lm_2.png",sep="",collapse=""); png(outplot); par(mfrow=c(1,1)); plot(regmodel, 2, cex.lab=1.5); dev.off(); - outplot = paste("./",outdir,"/PE_GE_lm_3.png",sep="",collapse=""); + outplot = paste(outdir,"/PE_GE_lm_3.png",sep="",collapse=""); png(outplot); par(mfrow=c(1,1)); plot(regmodel, 3, cex.lab=1.5); dev.off(); - outplot = paste("./",outdir,"/PE_GE_lm_5.png",sep="",collapse=""); + outplot = paste(outdir,"/PE_GE_lm_5.png",sep="",collapse=""); png(outplot); par(mfrow=c(1,1)); plot(regmodel, 5, cex.lab=1.5); dev.off(); - outplot = paste("./",outdir,"/PE_GE_lm.pdf",sep="",collapse=""); + outplot = paste(outdir,"/PE_GE_lm.pdf",sep="",collapse=""); pdf(outplot); plot(regmodel); dev.off(); @@ -704,7 +704,7 @@ cooksd <- cooks.distance(regmodel); cat("Generating cooksd plot\n",file=logfile, append=T); - outplot = paste("./",outdir,"/PE_GE_lm_cooksd.png",sep="",collapse=""); + outplot = paste(outdir,"/PE_GE_lm_cooksd.png",sep="",collapse=""); png(outplot); par(mfrow=c(1,1)); plot(cooksd, pch="*", cex=2, cex.lab=1.5,main="Influential Obs. by Cooks distance", ylab="Cook\'s distance", xlab="Observations") # plot cooks distance @@ -737,7 +737,7 @@ cat("Writing influential observations\n",file=logfile, append=T); - outdatafile = paste("./",outdir,"/PE_GE_influential_observation.tsv", sep="", collapse=""); + outdatafile = paste(outdir,"/PE_GE_influential_observation.tsv", sep="", collapse=""); cat('Download entire list',file = htmloutfile, append = TRUE); write.table(PE_GE_data_outlier, file=outdatafile, row.names=F, sep="\t", quote=F); @@ -764,7 +764,7 @@ #============================================================================================================= # Scatter plot #============================================================================================================= - outplot = paste("./",outdir,"/AbundancePlot_scatter_without_outliers.png",sep="",collapse=""); + outplot = paste(outdir,"/AbundancePlot_scatter_without_outliers.png",sep="",collapse=""); png(outplot); par(mfrow=c(1,1)); scatter.smooth(PE_GE_data_no_outlier[,"GE_abundance"], PE_GE_data_no_outlier[,"PE_abundance"], xlab="Transcript Abundance", ylab="Protein Abundance", cex.lab=1.5); @@ -806,10 +806,12 @@ file = htmloutfile, append = TRUE); cat("Generating heatmap plot\n",file=logfile, append=T); - outplot = paste("./",outdir,"/PE_GE_heatmap.png",sep="",collapse=""); + outplot = paste(outdir,"/PE_GE_heatmap.png",sep="",collapse=""); png(outplot); par(mfrow=c(1,1)); - heatmap.2(as.matrix(PE_GE_data[,c("PE_abundance","GE_abundance")]), trace="none", cexCol=1, col=greenred(100),Colv=F, labCol=c("PE","GE"), scale="col"); + #heatmap.2(as.matrix(PE_GE_data[,c("PE_abundance","GE_abundance")]), trace="none", cexCol=1, col=greenred(100),Colv=F, labCol=c("PE","GE"), scale="col"); + my_palette <- colorRampPalette(c("green", "white", "red"))(299); + heatmap.2(as.matrix(PE_GE_data[,c("PE_abundance","GE_abundance")]), trace="none", cexCol=1, col=my_palette ,Colv=F, labCol=c("PE","GE"), scale="col", dendrogram = "row"); dev.off(); cat( @@ -825,7 +827,7 @@ k1 = kmeans(PE_GE_data_kdata[,c("PE_abundance","GE_abundance")], 5); cat("Generating kmeans plot\n",file=logfile, append=T); - outplot = paste("./",outdir,"/PE_GE_kmeans.png",sep="",collapse=""); + outplot = paste(outdir,"/PE_GE_kmeans.png",sep="",collapse=""); png(outplot); par(mfrow=c(1,1)); scatter.smooth(PE_GE_data_kdata[,"GE_abundance"], PE_GE_data_kdata[,"PE_abundance"], xlab="Transcript Abundance", ylab="Protein Abundance", cex.lab=1.5); @@ -864,7 +866,7 @@ # Linear regression with removal of outliers regmodel_no_outlier = lm(PE_abundance~GE_abundance, data=PE_GE_data_no_outlier); regmodel_no_outlier_predictedy = predict(regmodel_no_outlier, PE_GE_data_no_outlier); - outplot = paste("./",outdir,"/PE_GE_lm_without_outliers.pdf",sep="",collapse=""); + outplot = paste(outdir,"/PE_GE_lm_without_outliers.pdf",sep="",collapse=""); plot(PE_GE_data_no_outlier[,"GE_abundance"], PE_GE_data_no_outlier[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Linear regression with removal of outliers"); points(PE_GE_data_no_outlier[,"GE_abundance"], regmodel_no_outlier_predictedy, col="red"); @@ -876,7 +878,7 @@ # Resistant regression (lqs / least trimmed squares method) regmodel_lqs = lqs(PE_abundance~GE_abundance, data=PE_GE_data); regmodel_lqs_predictedy = predict(regmodel_lqs, PE_GE_data); - outplot = paste("./",outdir,"/PE_GE_lqs.pdf",sep="",collapse=""); + outplot = paste(outdir,"/PE_GE_lqs.pdf",sep="",collapse=""); pdf(outplot); plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Resistant regression (lqs / least trimmed squares method)"); points(PE_GE_data[,"GE_abundance"], regmodel_lqs_predictedy, col="red"); @@ -890,7 +892,7 @@ # Robust regression (rlm / Huber M-estimator method) regmodel_rlm = rlm(PE_abundance~GE_abundance, data=PE_GE_data); regmodel_rlm_predictedy = predict(regmodel_rlm, PE_GE_data); - outplot = paste("./",outdir,"/PE_GE_rlm.pdf",sep="",collapse=""); + outplot = paste(outdir,"/PE_GE_rlm.pdf",sep="",collapse=""); plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Robust regression (rlm / Huber M-estimator method)"); points(PE_GE_data[,"GE_abundance"], regmodel_rlm_predictedy, col="red"); pdf(outplot); @@ -915,7 +917,7 @@ regmodel_poly5_predictedy = predict(regmodel_poly5, PE_GE_data); regmodel_poly6_predictedy = predict(regmodel_poly6, PE_GE_data); - outplot = paste("./",outdir,"/PE_GE_poly2.pdf",sep="",collapse=""); + outplot = paste(outdir,"/PE_GE_poly2.pdf",sep="",collapse=""); dev.set(devnum); plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Polynomial regression with degree 2"); points(PE_GE_data[,"GE_abundance"], regmodel_poly2_predictedy, col="red"); @@ -923,7 +925,7 @@ plot(regmodel_poly2); dev.off(); - outplot = paste("./",outdir,"/PE_GE_poly3.pdf",sep="",collapse=""); + outplot = paste(outdir,"/PE_GE_poly3.pdf",sep="",collapse=""); dev.set(devnum); plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Polynomial regression with degree 3"); points(PE_GE_data[,"GE_abundance"], regmodel_poly3_predictedy, col="red"); @@ -931,7 +933,7 @@ plot(regmodel_poly3); dev.off(); - outplot = paste("./",outdir,"/PE_GE_poly4.pdf",sep="",collapse=""); + outplot = paste(outdir,"/PE_GE_poly4.pdf",sep="",collapse=""); dev.set(devnum); plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Polynomial regression with degree 4"); points(PE_GE_data[,"GE_abundance"], regmodel_poly4_predictedy, col="red"); @@ -939,7 +941,7 @@ plot(regmodel_poly4); dev.off(); - outplot = paste("./",outdir,"/PE_GE_poly5.pdf",sep="",collapse=""); + outplot = paste(outdir,"/PE_GE_poly5.pdf",sep="",collapse=""); dev.set(devnum); plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Polynomial regression with degree 5"); points(PE_GE_data[,"GE_abundance"], regmodel_poly5_predictedy, col="red"); @@ -947,7 +949,7 @@ plot(regmodel_poly5); dev.off(); - outplot = paste("./",outdir,"/PE_GE_poly6.pdf",sep="",collapse=""); + outplot = paste(outdir,"/PE_GE_poly6.pdf",sep="",collapse=""); dev.set(devnum); plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Polynomial regression with degree 6"); points(PE_GE_data[,"GE_abundance"], regmodel_poly6_predictedy, col="red"); @@ -960,7 +962,7 @@ regmodel_gam_predictedy = predict(regmodel_gam, PE_GE_data); regmodel_gam_metrics = regr.eval(PE_GE_data$PE_abundance, regmodel_gam$fitted.values) - outplot = paste("./",outdir,"/PE_GE_gam.pdf",sep="",collapse=""); + outplot = paste(outdir,"/PE_GE_gam.pdf",sep="",collapse=""); dev.set(devnum); plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Generalized additive models"); points(PE_GE_data[,"GE_abundance"], regmodel_gam_predictedy, col="red"); @@ -1019,7 +1021,7 @@ # Warning On - options(warn = oldw) + # options(warn = oldw) \ No newline at end of file diff -r b014014e685a -r 6bf0203ee17e protein_rna_correlation.xml --- a/protein_rna_correlation.xml Sun Jun 17 05:06:18 2018 -0400 +++ b/protein_rna_correlation.xml Wed Jun 20 20:43:56 2018 -0400 @@ -1,28 +1,20 @@ - - Correlation between protein and rna expression (Single Sample) + + Correlation between protein and transcript expression - r-base - bioconductor-rgalaxy - bioconductor-biocinstaller - rmarkdown - MASS - mgcv - DMwR - data.table - gplots - lattice - grid - nlme + R + r-protrnacorr + x11 + - + - + @@ -30,41 +22,41 @@ - + - + - - - - + + + + - - - - + + + + - + - + - + @@ -78,14 +70,23 @@ - - - + + + - + + ]]> + + +@misc{protein_rna_correlation, + author={Panigrahi, Priyabrata}, + year={2018}, + title={ProteinRNACorrelation} +} + + diff -r b014014e685a -r 6bf0203ee17e test_data/PE_abundance_GE_abundance_pearson.html --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test_data/PE_abundance_GE_abundance_pearson.html Wed Jun 20 20:43:56 2018 -0400 @@ -0,0 +1,56 @@ + +

Association between proteomics and transcriptomics data

+

Input data summary

Download mapped unmapped data

Filtering

Checking for NA or Inf or -Inf in either Transcriptome or Proteome data, if found, remove those entry

Filtered data summary

Excluding entires with abundance values: NA/Inf/-Inf

Proteome data summary

+ + + + + + +
ParameterValue
Min. :-2.98277
1st Qu.:-0.40393
Median :-0.07986
Mean : 0.00000
3rd Qu.: 0.26061
Max. :15.13211
+

Transcriptome data summary

+ + + + + + +
ParameterValue
Min. :-8.33003
1st Qu.:-0.06755
Median : 0.09635
Mean : 0.00000
3rd Qu.: 0.18103
Max. : 8.50430
+

Distribution of Proteome and Transcripome abundance (Box plot and Density plot)

+

Scatter plot between Proteome and Transcriptome Abundance

+

Correlation with all data

+
ParameterMethod 1Method 2Method 3
Correlation method used Pearson's product-moment correlation Spearman's rank correlation rho Kendall's rank correlation tau
Correlation -0.003584536 0.01866248 0.01280742
Pvalue 0.8457255 0.3110035 0.314683
+*Note that correlation is sensitive to outliers in the data. So it is important to analyze outliers/influential observations in the data.
Below we use cook's distance based approach to identify such influential observations.

Linear Regression model fit between Proteome and Transcriptome data

+

Assuming a linear relationship between Proteome and Transcriptome data, we here fit a linear regression model.

+ + + + + + + + + +
ParameterValue
Formula PE_abundance~GE_abundance
Coefficients
(Intercept) 1.727289e-16 (Pvalue: 1 )
GE_abundance -0.003584536 (Pvalue: 0.8457255 )
Model parameters
Residual standard error 1.000163 ( 2947 degree of freedom)
F-statistic 0.0378662 ( on 1 and 2947 degree of freedom)
R-squared 1.28489e-05
Adjusted R-squared -0.0003264749
+

Plotting various regression diagnostics plots

+

Residuals vs Fitted plot

+

This plot checks for linear relationship assumptions. If a horizontal line is observed without any distinct patterns, it indicates a linear relationship

Normal Q-Q plot of residuals

+

This plot checks whether residuals are normally distributed or not. It is good if the residuals points follow the straight dashed line i.e., do not deviate much from dashed line.

Scale-Location (or Spread-Location) plot

+

This plot checks for homogeneity of residual variance (homoscedasticity). A horizontal line observed with equally spread residual points is a good indication of homoscedasticity.

Residuals vs Leverage plot

+

This plot is useful to identify any influential cases, that is outliers or extreme values that might influence the regression results upon inclusion or exclusion from the analysis.

Identify influential observations

+

Cook’s distance computes the influence of each data point/observation on the predicted outcome. i.e. this measures how much the observation is influencing the fitted values.
In general use, those observations that have a cook’s distance > than 4 times the mean may be classified as influential.


In the above plot, observations above red line (4*mean cook's distance) are influential, marked in *. Genes that are outliers could be important. These observations influences the correlation values and regression coefficients

+ + +
ParameterValue
Mean cook's distance 0.0002988385
Total influential observations (cook's distance > 4 * mean cook's distance) 90
Total influential observations (cook's distance > 3 * mean cook's distance) 116

Top 10 influential observations (cook's distance > 4 * mean cook's distance)

Download entire list
PE_IDPE_abundanceGE_IDGE_abundancecooksd
ENSMUSP00000107109.2 -0.4719799 ENSMUST00000001126 -5.301664 0.001213545
ENSMUSP00000151536.1 3.113811 ENSMUST00000001256 -0.6348804 0.00230483
ENSMUSP00000150261.1 2.914045 ENSMUST00000001583 0.4988006 0.001801232
ENSMUSP00000111204.1 2.850989 ENSMUST00000002073 0.09635024 0.001391751
ENSMUSP00000089336.4 1.219945 ENSMUST00000002391 -2.47573 0.001781417
ENSMUSP00000030805.7 -0.8313093 ENSMUST00000003469 3.660597 0.001650483
ENSMUSP00000011492.8 -0.3735374 ENSMUST00000004326 -7.366491 0.001556623
ENSMUSP00000029658.7 9.120211 ENSMUST00000004473 0.09635024 0.01423993
ENSMUSP00000099904.4 -1.913743 ENSMUST00000004673 0.9756628 0.001209039
ENSMUSP00000081956.8 3.674308 ENSMUST00000005607 1.306612 0.006223403

Scatter plot between Proteome and Transcriptome Abundance, after removal of outliers/influential observations

+

Correlation with removal of outliers / influential observations

+

We removed the influential observations and reestimated the correlation values.

ParameterMethod 1Method 2Method 3
Correlation method used Pearson's product-moment correlation Spearman's rank correlation rho Kendall's rank correlation tau
Correlation 0.01485058 0.0246989 0.01689519
Pvalue 0.4273403 0.1867467 0.1918906
+

Heatmap of PE and GE abundance values

+

Kmean clustering

+Number of Clusters: 5
Download cluster list

Other regression model fitting

+ +

Comparison of model fits

ModelMAEMSERMSEMAPEDiagnostics Plot
Linear regression with all data 0.5463329 0.9996481 0.999824 0.9996321 Link
Linear regression with removal of outliers 0.5404805 1.006281 1.003136 1.455637 Link
Resistant regression (lqs / least trimmed squares method) 0.5407598 1.007932 1.003958 1.537172 Link
Robust regression (rlm / Huber M-estimator method) 0.5404879 1.005054 1.002524 1.411806 Link
Polynomial regression with degree 2 0.546322 0.9996472 0.9998236 0.9993865 Link
Polynomial regression with degree 3 0.5469588 0.9976384 0.9988185 1.043158 Link
Polynomial regression with degree 4 0.5467885 0.9975077 0.9987531 1.041541 Link
Polynomial regression with degree 5 0.5467813 0.9975076 0.998753 1.041209 Link
Polynomial regression with degree 6 0.5465911 0.996652 0.9983246 1.056632 Link
Generalized additive models 0.5463695 0.9976796 0.9988391 1.032766 Link
\ No newline at end of file diff -r b014014e685a -r 6bf0203ee17e tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Wed Jun 20 20:43:56 2018 -0400 @@ -0,0 +1,44 @@ + + + + + + + + + + + + + + https://cran.r-project.org/src/contrib/MASS_7.3-50.tar.gz + https://cran.r-project.org/src/contrib/mgcv_1.8-24.tar.gz + https://cran.r-project.org/src/contrib/zoo_1.8-2.tar.gz + https://cran.r-project.org/src/contrib/xts_0.10-2.tar.gz + https://cran.r-project.org/src/contrib/curl_3.2.tar.gz + https://cran.r-project.org/src/contrib/TTR_0.23-3.tar.gz + https://cran.r-project.org/src/contrib/quantmod_0.4-13.tar.gz + https://cran.r-project.org/src/contrib/abind_1.4-5.tar.gz + https://cran.r-project.org/src/contrib/gtools_3.5.0.tar.gz + https://cran.r-project.org/src/contrib/gdata_2.18.0.tar.gz + https://cran.r-project.org/src/contrib/bitops_1.0-6.tar.gz + https://cran.r-project.org/src/contrib/caTools_1.17.1.tar.gz + https://cran.r-project.org/src/contrib/gplots_3.0.1.tar.gz + https://cran.r-project.org/src/contrib/ROCR_1.0-7.tar.gz + https://cran.r-project.org/src/contrib/lattice_0.20-35.tar.gz + https://cran.r-project.org/src/contrib/DMwR_0.4.1.tar.gz + https://cran.r-project.org/src/contrib/data.table_1.11.4.tar.gz + + https://cran.r-project.org/src/contrib/Archive/nlme/nlme_3.1-120.tar.gz + + + + + + Contains a tool dependency definition that downloads and installs protein rna correlation tool package for R library. + + + + + +