Mercurial > repos > pravs > protein_rna_correlation

#==================================================================================
# About the script
#==================================================================================
	# Version: V1
	# This script works for single sample only
	# It takes GE (Gene Expression) and PE (Protein expression) data of one sample and perform correlation, regression analysis between PE and GE data
	# Input data can be of tsv format
	# Script also need a parameter or option file

#==================================================================================
# Dependencies
#==================================================================================
	# Following R package has to be installed.
		# data.table
		# gplots
		# MASS
		# DMwR
		# mgcv
	# It can be installed by following R command in R session. e.g. install.packages("data.table")

#==================================================================================
# How to Run
#==================================================================================
	# Rscript PE_GE_association_singleSample_V1.r <PE_file> <GE_file> <Option_file containing tool parameters>  <Ensembl map file containing directory path> <outdir>

#==================================================================================
# Arguments
#==================================================================================
	# Arg1. <PE file>: PE data (tsv format)
	# Arg2. <GE file>: GE data (tsv format)
	# Arg3. <Option file>: tsv format, key\tvalue
	#	    Options are
	# 		PE_idcolno: Column number of PE file containing protein IDs
	# 		GE_idcolno: Column number of GE file containing transcript IDs
	# 		PE_expcolno: Column number of PE file containing protein expression values
	# 		GE_expcolno: Column number of GE file containing transcript expression values
	#		PE_idtype: protein id type. It can be either Uniprot or Ensembl or HGNC_symbol
	# 		GE_idtype: transcript id type. At present it is only one type i.e. Ensembl or HGNC_symbol
	# 		Organism: Organism
	#		writeMapUnmap: Whether to write mapped and unmapped data in input data format. It takes value as 1 or 0. If 1, mapped and unmapped data is written. Default is 1.
	#		doscale: Whether perform scaling to input data or not. If yet, abundance values are normalized by standard normalization. Default 1
	# Arg4. <Ensembl map file containg directory>: Path to Ensembl map file containg directory e.g. /home/user/Ensembl/mapfiles
	# Arg5. <Outdir>: output directory (e.g. /home/user/out1)

#==================================================================================
# Sample option file
#==================================================================================
	#PE_idcolno	7
	#GE_idcolno	1
	#PE_expcolno	2
	#GE_expcolno	3
	#PE_idtype	Ensembl
	#GE_idtype	Ensembl
	#Organism	mmusculus
	#writeMapUnmap	1
	#doscale	1

#==================================================================================
# Output
#==================================================================================
	# The script outputs image and data folder along with Correlation_result.html and Result.log file
	# Result.log: Log file
	# Correlation_result.html; main result file in html format

	# data folder contains following output files

	# PE_abundance.tsv: 2 column tsv file containing mapped id and protein expression values
	# GE_abundance.tsv: 2 column tsv file containing mapped id and transcript expression values

	# If writeMapUnmap is 1 i.e. to write mapped and unmapped data, 4 additional file will be written
		# PE_unmapped.tsv: Output format is same as input, PE unmapped data is written
		# GE_unmapped.tsv: Output format is same as input, GE unmapped data is written
		# PE_mapped.tsv: Output format is same as input, PE mapped data is written
		# GE_mapped.tsv: Output format is same as input, GE mapped data is written

	# PE_GE_influential_observation.tsv: Influential observations based on cook's distance
	# PE_GE_kmeans_clusterpoints.txt: Observations clustered based on kmeans clustering. File contains cluster assignment of each observations

#==================================================================================
# ............................SCRIPT STARTS FROM HERE .............................
#==================================================================================
# Warning Off
oldw <- getOption("warn")
options(warn = -1)
#=============================================================================================================
# Functions
#=============================================================================================================
	usage <- function()
	{
		cat("\n\n###########\nERROR: Rscript PE_GE_association_singleSample_V1.r <PE file> <GE file> <Option file containing tool parameters>  <Ensembl map file containing directory path> <outdir>\n###########\n\n");
	}

#=============================================================================================================
# Global variables
#=============================================================================================================
	noargs = 13;

#=============================================================================================================
# Parse command line arguments in args object
#=============================================================================================================
	args = commandArgs(trailingOnly = TRUE);


#=============================================================================================================
# Check for No of arguments
#=============================================================================================================
	if(length(args) != noargs)
	{
		usage();
		stop(paste("Please check usage. Number of arguments is not equal to ",noargs,sep="",collapse=""));
	}

#======================================================================
# Load libraries
#======================================================================
	library(data.table);
	library(lattice);
	library(grid);
	library(nlme);
	library(gplots);
	library(MASS);
	library(DMwR);
	library(mgcv);


#=============================================================================================================
# Set variables
#=============================================================================================================
	#PE_file = args[1]; # Protein abundance data file
	#GE_file = args[2]; # Gene expression data file
	#option_file = args[3]; # Option file containing various parameters
	#biomartdir = args[4]; # Biomart map file containing directory path in local system
	#outdir = args[5]; # output directory

	PE_file = args[1]; # Protein abundance data file
	GE_file = args[2]; # Gene expression data file
	#option_file = args[3]; # Option file containing various parameters
	#biomartdir = args[4]; # Biomart map file containing directory path in local system
	outdir = args[13]; # output directory

	#imagesubdirprefix = "image";
	#datasubdirprefix = "data";
	#htmloutfile = "Correlation_result.html";
	htmloutfile = args[12]

	#logfile = "Result.log";
	PE_outfile_mapped = "PE_mapped.tsv";
	GE_outfile_mapped = "GE_mapped.tsv";
	PE_outfile_unmapped = "PE_unmapped.tsv";
	GE_outfile_unmapped = "GE_unmapped.tsv";
	PE_expfile = "PE_abundance.tsv";
	GE_expfile = "GE_abundance.tsv";
	PE_outfile_excluded_naInf = "PE_excluded_NA_Inf.tsv";
	GE_outfile_excluded_naInf = "GE_excluded_NA_Inf.tsv";

#=============================================================================================================
# Check input files existance
#=============================================================================================================
	if(! file.exists(PE_file))
	{
		usage();
		stop(paste("Input PE_file file does not exists. Path given: ",PE_file,sep="",collapse=""));
	}
	if(! file.exists(GE_file))
	{
		usage();
		stop(paste("Input GE_file does not exists. Path given: ",GE_file,sep="",collapse=""));
	}
	#if(! file.exists(option_file))
	#{
	#	usage();
	#	stop(paste("Input option_file does not exists. Path given: ",option_file,sep="",collapse=""));
	#}

#=============================================================================================================
# Read param_file and set parameter/option data frame
#=============================================================================================================
	#optiondf = read.table(option_file, header = FALSE, stringsAsFactors = FALSE)
	#rownames(optiondf) = optiondf[,1];

#=============================================================================================================
# Define option variables
#=============================================================================================================
	#PE_idcolno = as.numeric(optiondf["PE_idcolno",2]);
	#GE_idcolno = as.numeric(optiondf["GE_idcolno",2]);
	#PE_expcolno = as.numeric(optiondf["PE_expcolno",2]);
	#GE_expcolno = as.numeric(optiondf["GE_expcolno",2]);
	#PE_idtype = optiondf["PE_idtype",2];
	#GE_idtype = optiondf["GE_idtype",2];
	#Organism = optiondf["Organism",2];
	#writeMapUnmap = as.logical(as.numeric(optiondf["writeMapUnmap",2]));
	#doscale = as.logical(as.numeric(optiondf["doscale",2]));

	PE_idcolno = as.numeric(args[3])
	GE_idcolno = as.numeric(args[4])
	PE_expcolno = as.numeric(args[5])
	GE_expcolno = as.numeric(args[6])
	PE_idtype = args[7]
	GE_idtype = args[8]
	#Organism = args[9]
	writeMapUnmap = as.logical(as.numeric(args[10]));
	doscale = as.logical(as.numeric(args[11]));


#1 PE_file = "test_data/PE_mouse_singlesample.txt"
#2 GE_file = "test_data/GE_mouse_singlesample.txt"
#3 PE_idcolno = 7
#4 GE_idcolno = 1
#5 PE_expcolno = 13
#6 GE_expcolno = 10
#7 PE_idtype = "Ensembl_with_version"
#8 GE_idtype = "Ensembl_with_version"
#10 writeMapUnmap = 1
#11 doscale = 1
#9 biomart_mapfile = "test_data/mmusculus_gene_ensembl__GRCm38.p6.map"
#12 htmloutfile = "html_out.html"
#13 outdir = "output_elements"

#=============================================================================================================
# Set column name of biomart map file (idtype) based on whether Ensembl or Uniprot
#=============================================================================================================
	if(PE_idtype == "Ensembl")
	{
		PE_idtype = "ensembl_peptide_id";
	}else
	{
		if(PE_idtype == "Ensembl_with_version")
		{
		PE_idtype = "ensembl_peptide_id_version";
		}else{
			if(PE_idtype == "HGNC_symbol")
			{
			PE_idtype = "hgnc_symbol";
			}
		}
	}

	if(GE_idtype == "Ensembl")
	{
		GE_idtype = "ensembl_transcript_id";
	}else
	{
		if(GE_idtype == "Ensembl_with_version")
		{
		GE_idtype = "ensembl_transcript_id_version";
		}else{
			if(GE_idtype == "HGNC_symbol")
			{
			GE_idtype = "hgnc_symbol";
			}
		}
	}
#=============================================================================================================
# Identify biomart mapping file
#=============================================================================================================
	#biomartdir = gsub(biomartdir, pattern="/$", replacement="")
	#biomart_mapfilename = list.files(path = biomartdir, pattern = Organism);
	#biomart_mapfile = paste(biomartdir,"/",biomart_mapfilename,sep="",collapse="");
	#print(biomart_mapfile);
	biomart_mapfile = args[9];
#=============================================================================================================
# Parse PE, GE, biomart file file
#=============================================================================================================
	PE_df = as.data.frame(fread(input=PE_file, header=T, sep="\t"));
	GE_df = as.data.frame(fread(input=GE_file, header=T, sep="\t"));
	biomart_df = as.data.frame(fread(input=biomart_mapfile, header=T, sep="\t"));

#=============================================================================================================
# Create directory structures and then set the working directory to output directory
#=============================================================================================================
	if(! file.exists(outdir))
	{
		dir.create(outdir);
	}

	#tempdir = paste(outdir,"/",imagesubdirprefix,sep="",collapse="");
	#if(! file.exists(tempdir))
	#{
	#	dir.create(tempdir);
	#}

	#tempdir = paste(outdir,"/",datasubdirprefix,sep="",collapse="");
	#if(! file.exists(tempdir))
	#{
	#	dir.create(tempdir);
	#}

	#setwd(outdir);
	logfile = paste(outdir,"/", "Result.log",sep="",collapse="");

#=============================================================================================================
# Write initial data summary in html outfile
#=============================================================================================================
	cat("<html><body>\n", file = htmloutfile);
	cat("<h1>Association between proteomics and transcriptomics data</h1>\n",
	"<font color='blue'><h3>Input data summary</h3></font>",
	"<ul>",
	"<li>Abbrebiations used: PE (Proteomics) and GE (Transcriptomics)","</li>",
	"<li>Input PE data dimension (Row Column): ", dim(PE_df),"</li>",
	"<li>Input GE data dimension (Row Column): ", dim(GE_df),"</li>",
	#"<li>Organism selected: ", Organism,"</li>",
	"<li>Protein ID fetched from column: ", PE_idcolno,"</li>",
	"<li>Transcript ID fetched from column: ", GE_idcolno,"</li>",
	"<li>Protein ID type: ", PE_idtype,"</li>",
	"<li>Transcript ID type: ", GE_idtype,"</li>",
	"<li>Protein expression data fetched from column: ", PE_expcolno,"</li>",
	"<li>Transcript expression data fetched from column: ", GE_expcolno,"</li>",
	file = htmloutfile, append = TRUE);

#=============================================================================================================
# Write initial data summary in logfile
#=============================================================================================================
	#cat("Current work dir:", outdir,"\n");
	cat("Processing started\n---------------------\n", file=logfile);
	cat("Abbrebiations used: PE (Proteomics) and GE (Transcriptomics)\n", file=logfile, append=T);
	cat("Input PE data dimension (Row Column): ", dim(PE_df),"\n", file=logfile, append=T)
	cat("Input GE data dimension (Row Column): ", dim(GE_df),"\n", file=logfile, append=T)
	#cat("Organism selected: ", Organism,"\n", file=logfile, append=T)
	#cat("Biomart map file used: ", biomart_mapfilename,"\n", file=logfile, append=T)
	cat("Ensembl Biomart mapping data dimension (Row Column): ", dim(biomart_df),"\n", file=logfile, append=T)
	cat("\n\nProtein ID to Transcript ID mapping started\n----------------\n", file=logfile, append=T)
	cat("Protein ID fetched from column:", PE_idcolno,"\n", file=logfile, append=T)
	cat("Transcript ID fetched from column:", GE_idcolno, "\n",file=logfile, append=T)
	cat("Protein ID type:", PE_idtype, "\n",file=logfile, append=T)
	cat("Transcript ID type:", GE_idtype,"\n", file=logfile, append=T);
	cat("Protein expression data fetched from column:", PE_expcolno,"\n", file=logfile, append=T)
	cat("Transcript expression data fetched from column:", GE_expcolno, "\n",file=logfile, append=T)

#=============================================================================================================
# Mapping starts here
# Pseudocode
# Loop over each row of PE file, fetch protein_id
# Search the biomartmap file and obtain corresponding transcript_id
# Take the mapped transcript_id and search in GE file, which row it correspond
# Store the PE rowno and GE rowno in rowpair object
#=============================================================================================================
	rowpair = data.frame(PE_rowno = 0, GE_rowno = 0);
	cat("Total rows:", nrow(PE_df),"\n");
	cat("\n\nTotal protein ids to be mapped: ", nrow(PE_df),"\n", file=logfile, append=T);
	messagelog = "\n\nBelow are protein IDs, for which no match is observed in Ensembl Biomart Map file.\n\n";

	# GE_id column
	GE_ids = GE_df[,GE_idcolno];
	GE_ids = gsub(x=GE_ids, pattern=".\\d+$", replacement="");  # Remove version

	# Loop over every row of PE data (PE_df)
	for(i in 1:nrow(PE_df))
	{

		if(i%%100 ==0)
		{
		cat("Total rows processed ", i,"\n", file=logfile, append=T);
		print(i);
		}

		queryid = PE_df[i,PE_idcolno];
		#queryid = gsub(x=queryid, pattern=".\\d+$", replacement="");  # Remove version
		#print(queryid);

		PE_df_matchrowno = i; # Row number in PE_df which matches queryid

		if(PE_idtype == "Uniprot")
		{
			biomart_matchrowno = which(biomart_df[,8] == queryid | biomart_df[,9] == queryid);  # Row number of biomart_df which matches queryid
		}else{
		biomart_matchrowno = which(biomart_df[,PE_idtype] == queryid);  # Row number of biomart_df which matches queryid
		}

		# If match found, map protein id to GE id and find corresponding match row number of GE_df
		if(length(biomart_matchrowno)>0)
		{
			GE_df_matchrowno = which(GE_ids %in% biomart_df[biomart_matchrowno[1],GE_idtype]);
			rowpair = rbind( rowpair, c(PE_df_matchrowno, GE_df_matchrowno));
			if(length(GE_df_matchrowno) > 1)
			{
				cat("\nFor protein ID ", i," multiple transcript mapping found\n", file=logfile, append=T);

				cat(
				"<br><font color=",'red',">For protein ID", i," multiple transcript mapping found</font><br>",file = htmloutfile, append = TRUE);
			}
		}else{
			messagelog = paste(messagelog, queryid, "\n",sep="", collapse="");
		}
	}
	rowpair = rowpair[-1,];

#=============================================================================================================
# Write mapping summary, mapped and unmapped data
#=============================================================================================================
	cat(
	"<li>Total Protein ID mapped: ", length(intersect(1:nrow(PE_df), rowpair[,1])),"</li>",
	"<li>Total Protein ID unmapped: ", length(setdiff(1:nrow(PE_df), rowpair[,1])),"</li>",
	"<li>Total Transcript ID mapped: ", length(intersect(1:nrow(GE_df), rowpair[,2])),"</li>",
	"<li>Total Transcript ID unmapped: ", length(setdiff(1:nrow(GE_df), rowpair[,2])),"</li></ul>",
	file = htmloutfile, append = TRUE);

	cat("\n\nMapping Statistics\n---------------------\n", file=logfile, append=T);
	cat("Total Protein ID mapped:", length(intersect(1:nrow(PE_df), rowpair[,1])), "\n", file=logfile, append=T)
	cat("Total Protein ID unmapped:", length(setdiff(1:nrow(PE_df), rowpair[,1])), "\n", file=logfile, append=T)
	cat("Total Transcript ID mapped:", length(intersect(1:nrow(GE_df), rowpair[,2])), "\n", file=logfile, append=T)
	cat("Total Transcript ID unmapped:", length(setdiff(1:nrow(GE_df), rowpair[,2])), "\n", file=logfile, append=T)

	cat(messagelog,"\n",file=logfile, append=T);

	if(writeMapUnmap)
	{
		write.table(PE_df[rowpair[,1], ], file=paste(outdir,"/",PE_outfile_mapped,sep="",collapse=""), row.names=F, quote=F, sep="\t", eol="\n")
		write.table(GE_df[rowpair[,2], ], file=   paste(outdir,"/",GE_outfile_mapped,sep="",collapse=""), row.names=F, quote=F, sep="\t", eol="\n")
		write.table(PE_df[-rowpair[,1], ], file=   paste(outdir,"/",PE_outfile_unmapped,sep="",collapse=""), row.names=F, quote=F, sep="\t", eol="\n")
		write.table(GE_df[-rowpair[,2], ], file=paste(outdir,"/",GE_outfile_unmapped,sep="",collapse=""), row.names=F, quote=F, sep="\t", eol="\n");

			cat(
		"<font color='blue'><h3>Download mapped unmapped data</h3></font>",
		"<ul><li>Protein mapped data: ", '<a href="',paste(outdir,"/",PE_outfile_mapped,sep="",collapse=""),'" target="_blank"> Link</a>',"</li>",
		"<li>Protein unmapped data: ", '<a href="',paste(outdir,"/",PE_outfile_unmapped,sep="",collapse=""),'" target="_blank"> Link</a>',"</li>",
		"<li>Transcript mapped data: ", '<a href="',paste(outdir,"/",GE_outfile_mapped,sep="",collapse=""),'" target="_blank"> Link</a>',"</li>",
		"<li>Transcript unmapped data: ", '<a href="',paste(outdir,"/",GE_outfile_unmapped,sep="",collapse=""),'" target="_blank"> Link</a>',"</li>",
		file = htmloutfile, append = TRUE);
	}

	write.table(PE_df[rowpair[,1], c(PE_idcolno,PE_expcolno)], file=paste(outdir,"/",PE_expfile,sep="",collapse=""), row.names=F, quote=F, sep="\t", eol="\n")
	write.table(GE_df[rowpair[,2], c(GE_idcolno,GE_expcolno)], file=paste(outdir,"/",GE_expfile,sep="",collapse=""), row.names=F, quote=F, sep="\t", eol="\n")

		cat(
	"<li>Protein abundance data: ", '<a href="',paste(outdir,"/",PE_expfile,sep="",collapse=""),'" target="_blank"> Link</a>',"</li>",
	"<li>Transcript abundance data: ", '<a href="',paste(outdir,"/",GE_expfile,sep="",collapse=""),'" target="_blank"> Link</a>',"</li></ul>",
	file = htmloutfile, append = TRUE);

#==========================================================================================
# Analysis (correlation and regression) starts here
#==========================================================================================
	cat("Analysis started\n---------------------------\n",file=logfile, append=T);
	proteome_df = PE_df[rowpair[,1], c(PE_idcolno,PE_expcolno)];
	transcriptome_df = GE_df[rowpair[,2], c(GE_idcolno,GE_expcolno)];
	nPE = nrow(proteome_df);
	nGE = nrow(transcriptome_df)

	cat("Total Protein ID: ",nPE,"\n",file=logfile, append=T);
	cat("Total Transcript ID: ",nGE,"\n",file=logfile, append=T);

#==========================================================================================
# Data summary
#==========================================================================================
	cat(
	"<ul>",
	"<li>Number of entries in Transcriptome data used for correlation: ",nPE,"</li>",
	"<li>Number of entries in Proteome data used for correlation: ",nGE,"</li></ul>",
	file = htmloutfile, append = TRUE);

#=============================================================================================================
# Remove entries with NA or Inf or -Inf in Transcriptome and Proteome data which will create problem in correlation analysis
#=============================================================================================================
	totna = sum(is.na(transcriptome_df[,2]) | is.na(proteome_df[,2]));
	totinf = sum(is.infinite(transcriptome_df[,2]) | is.infinite(proteome_df[,2]));

	cat("<font color='blue'><h3>Filtering</h3></font>","Checking for NA or Inf or -Inf in either Transcriptome or Proteome data, if found, remove those entry<br>","<ul>","<li>Number of NA found: ",totna,"</li>","<li>Number of Inf or -Inf found: ",totinf,"</li></ul>",file = htmloutfile, append = TRUE);

	cat("Total NA observed in either Transcriptome or Proteome data: ",totna,"\n",file=logfile, append=T);
	cat("Total Inf or -Inf observed in either Transcriptome or Proteome data: ",totinf,"\n",file=logfile, append=T);

	if(totna > 0 | totinf > 0)
	{
		excludeIndices_PE = which(is.na(proteome_df[,2]) | is.infinite(proteome_df[,2]));
		excludeIndices_GE = which(is.na(transcriptome_df[,2]) | is.infinite(transcriptome_df[,2]));
		excludeIndices = which(is.na(transcriptome_df[,2]) | is.infinite(transcriptome_df[,2]) | is.na(proteome_df[,2]) | is.infinite(proteome_df[,2]));

		# Write excluded transcriptomics and proteomics data to file
		write.table(proteome_df[excludeIndices_PE,], file=paste(outdir,"/",PE_outfile_excluded_naInf,sep="",collapse=""), row.names=F, quote=F, sep="\t", eol="\n")
		write.table(transcriptome_df[excludeIndices_GE,], file=paste(outdir,"/",GE_outfile_excluded_naInf,sep="",collapse=""), row.names=F, quote=F, sep="\t", eol="\n")

		# Write excluded transcriptomics and proteomics data link to html file
		cat(
		"<ul><li>Protein excluded data with NA or Inf or -Inf: ", '<a href="',paste(outdir,"/",PE_outfile_excluded_naInf,sep="",collapse=""),'" target="_blank"> Link</a>',"</li>",
		"<li>Transcript excluded data with NA or Inf or -Inf: ", '<a href="',paste(outdir,"/",GE_outfile_excluded_naInf,sep="",collapse=""),'" target="_blank"> Link</a>',"</li></ul>",
		file = htmloutfile, append = TRUE);

		# Keep the unexcluded entries only
		transcriptome_df = transcriptome_df[-excludeIndices,];
		proteome_df = proteome_df[-excludeIndices,];
		nPE = nrow(proteome_df);
		nGE = nrow(transcriptome_df)

		cat("<font color='blue'><h3>Filtered data summary</h3></font>",
		"Excluding entires with abundance values: NA/Inf/-Inf<br>",
		"<ul>",
		"<li>Number of entries in Transcriptome data remained: ",nrow(transcriptome_df),"</li>",
		"<li>Number of entries in Proteome data remained: ",nrow(proteome_df),"</li></ul>", file = htmloutfile, append = TRUE);

		cat("Excluding entires with abundance values: NA/Inf/-Inf","\n",file=logfile, append=T);

		cat("Total Protein ID after filtering: ",nPE,"\n",file=logfile, append=T);
		cat("Total Transcript ID after filtering: ",nGE,"\n",file=logfile, append=T);

	}

#==========================================================================================
# Scaling of data
#==========================================================================================
	if(doscale)
	{
		proteome_df[,2] = scale(proteome_df[,2], center = TRUE, scale = TRUE);
		transcriptome_df[,2] = scale(transcriptome_df[,2], center = TRUE, scale = TRUE);
	}

#=============================================================================================================
# Proteome and Transcriptome data summary
#=============================================================================================================
	cat("Calculating summary of PE and GE data\n",file=logfile, append=T);
	s1 = summary(proteome_df[,2]);
	s2 = summary(transcriptome_df[,2])

	cat(
	"<font color='blue'><h3>Proteome data summary</h3></font>\n",
	'<table class="embedded-table" border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#c3f0d6"><th>Parameter</th><th>Value</th></tr>',
	file = htmloutfile, append = TRUE);

	for(i in 1:length(s1))
	{
	cat("<tr><td>",names(s1[i]),"</td><td>", s1[i],"</td></tr>\n", file = htmloutfile, append = TRUE)
	}
	cat("</table>\n", file = htmloutfile, append = TRUE)

	cat(
	"<font color='blue'><h3>Transcriptome data summary</h3></font>\n",
	'<table class="embedded-table" border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#c3f0d6"><th>Parameter</th><th>Value</th></tr>',
	file = htmloutfile, append = TRUE);

	for(i in 1:length(s2))
	{
	cat("<tr><td>",names(s2[i]),"</td><td>", s2[i],"</td></tr>\n", file = htmloutfile, append = TRUE)
	}
	cat("</table>\n", file = htmloutfile, append = TRUE)

#=============================================================================================================
# Distribution of proteome and transcriptome abundance (Box and Density plot)
#=============================================================================================================
	cat("Generating Box and Density plot\n",file=logfile, append=T);
	outplot = paste("./",outdir,"/AbundancePlot.png",sep="",collapse="");
	png(outplot);
	par(mfrow=c(2,2));
	boxplot(proteome_df[,2], ylab="Abundance", main="Proteome abundance", cex.lab=1.5);
	plot(density(proteome_df[,2]), xlab="Protein Abundance", ylab="Density", main="Proteome abundance", cex.lab=1.5);
	boxplot(transcriptome_df[,2], ylab="Abundance", main="Transcriptome abundance", cex.lab=1.5);
	plot(density(transcriptome_df[,2]), xlab="Transcript Abundance", ylab="Density", main="Transcriptome abundance", cex.lab=1.5);
	dev.off();

	cat(
	"<font color='blue'><h3>Distribution of Proteome and Transcripome abundance (Box plot and Density plot)</h3></font>\n",
	'<img src="AbundancePlot.png">',
	file = htmloutfile, append = TRUE);

#=============================================================================================================
# Scatter plot
#=============================================================================================================
	cat("Generating scatter plot\n",file=logfile, append=T);
	outplot = paste("./",outdir,"/AbundancePlot_scatter.png",sep="",collapse="");
	png(outplot);
	par(mfrow=c(1,1));
	scatter.smooth(transcriptome_df[,2], proteome_df[,2], xlab="Transcript Abundance", ylab="Protein Abundance", cex.lab=1.5);

	cat(
	"<font color='blue'><h3>Scatter plot between Proteome and Transcriptome Abundance</h3></font>\n",
	'<img src="AbundancePlot_scatter.png">',
	file = htmloutfile, append = TRUE);

#=============================================================================================================
# Correlation testing
#=============================================================================================================
	cat("Estimating correlation\n",file=logfile, append=T);
	cor_result_pearson = cor.test(transcriptome_df[,2], proteome_df[,2], method = "pearson");
	cor_result_spearman = cor.test(transcriptome_df[,2], proteome_df[,2], method = "spearman");
	cor_result_kendall = cor.test(transcriptome_df[,2], proteome_df[,2], method = "kendall");

	cat(
	"<font color='blue'><h3>Correlation with all data</h3></font>\n",
	'<table class="embedded-table" border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#c3f0d6"><th>Parameter</th><th>Method 1</th><th>Method 2</th><th>Method 3</th></tr>',
	file = htmloutfile, append = TRUE);

	cat(
	"<tr><td>Correlation method used</td><td>",cor_result_pearson$method,"</td><td>",cor_result_spearman$method,"</td><td>",cor_result_kendall$method,"</td></tr>",
	"<tr><td>Correlation</td><td>",cor_result_pearson$estimate,"</td><td>",cor_result_spearman$estimate,"</td><td>",cor_result_kendall$estimate,"</td></tr>",
	"<tr><td>Pvalue</td><td>",cor_result_pearson$p.value,"</td><td>",cor_result_spearman$p.value,"</td><td>",cor_result_kendall$p.value,"</td></tr>",
	file = htmloutfile, append = TRUE)
	cat("</table>\n", file = htmloutfile, append = TRUE)

	cat(
	'<font color="red">*Note that <u>correlation</u> is <u>sensitive to outliers</u> in the data. So it is important to analyze outliers/influential observations in the data.<br> Below we use <u>cook\'s distance based approach</u> to identify such influential observations.</font>',
	file = htmloutfile, append = TRUE);

#=============================================================================================================
# Linear Regression
#=============================================================================================================
	cat("Fitting linear regression model\n",file=logfile, append=T);
	PE_GE_data = proteome_df;
	PE_GE_data = cbind(PE_GE_data, transcriptome_df);
	colnames(PE_GE_data) = c("PE_ID","PE_abundance","GE_ID","GE_abundance");

	regmodel = lm(PE_abundance~GE_abundance, data=PE_GE_data);
	regmodel_summary = summary(regmodel);

	cat(
	"<font color='blue'><h3>Linear Regression model fit between Proteome and Transcriptome data</h3></font>\n",
	"<p>Assuming a linear relationship between Proteome and Transcriptome data, we here fit a linear regression model.</p>\n",
	'<table class="embedded-table" border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#c3f0d6"><th>Parameter</th><th>Value</th></tr>',
	file = htmloutfile, append = TRUE);

	cat(
	"<tr><td>Formula</td><td>","PE_abundance~GE_abundance","</td></tr>\n",
	"<tr><td colspan='2' align='center'> <b>Coefficients</b></td>","</tr>\n",
	"<tr><td>",names(regmodel$coefficients[1]),"</td><td>",regmodel$coefficients[1]," (Pvalue:", regmodel_summary$coefficients[1,4],")","</td></tr>\n",
	"<tr><td>",names(regmodel$coefficients[2]),"</td><td>",regmodel$coefficients[2]," (Pvalue:", regmodel_summary$coefficients[2,4],")","</td></tr>\n",
	"<tr><td colspan='2' align='center'> <b>Model parameters</b></td>","</tr>\n",
	"<tr><td>Residual standard error</td><td>",regmodel_summary$sigma," (",regmodel_summary$df[2]," degree of freedom)</td></tr>\n",
	"<tr><td>F-statistic</td><td>",regmodel_summary$fstatistic[1]," ( on ",regmodel_summary$fstatistic[2]," and  ",regmodel_summary$fstatistic[3]," degree of freedom)</td></tr>\n",
	"<tr><td>R-squared</td><td>",regmodel_summary$r.squared,"</td></tr>\n",
	"<tr><td>Adjusted R-squared</td><td>",regmodel_summary$adj.r.squared,"</td></tr>\n",
	file = htmloutfile, append = TRUE)
	cat("</table>\n", file = htmloutfile, append = TRUE)

#=============================================================================================================
# Plotting various regression diagnostics plots
#=============================================================================================================
	outplot1 = paste("./",outdir,"/PE_GE_modelfit.pdf",sep="",collapse="");
	pdf(outplot1);
	devnum = which(unlist(sapply(2:length(.Devices), function(x){attributes(.Devices[[x]])$filepath==outplot1})))+1
	print(.Devices)
	print(c(devnum,"+++"));

	regmodel_predictedy = predict(regmodel, PE_GE_data);
	plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Linear regression with all data");
	points(PE_GE_data[,"GE_abundance"], regmodel_predictedy, col="red");

	cat("Generating regression diagnostics plot\n",file=logfile, append=T);
	cat(
	"<font color='blue'><h3>Plotting various regression diagnostics plots</h3></font>\n",
	file = htmloutfile, append = TRUE);

	outplot = paste("./",outdir,"/PE_GE_lm_1.png",sep="",collapse="");
	png(outplot);
	par(mfrow=c(1,1));
	plot(regmodel, 1, cex.lab=1.5);
	dev.off();

	outplot = paste("./",outdir,"/PE_GE_lm_2.png",sep="",collapse="");
	png(outplot);
	par(mfrow=c(1,1));
	plot(regmodel, 2, cex.lab=1.5);
	dev.off();

	outplot = paste("./",outdir,"/PE_GE_lm_3.png",sep="",collapse="");
	png(outplot);
	par(mfrow=c(1,1));
	plot(regmodel, 3, cex.lab=1.5);
	dev.off();

	outplot = paste("./",outdir,"/PE_GE_lm_5.png",sep="",collapse="");
	png(outplot);
	par(mfrow=c(1,1));
	plot(regmodel, 5, cex.lab=1.5);
	dev.off();

	outplot = paste("./",outdir,"/PE_GE_lm.pdf",sep="",collapse="");
	pdf(outplot);
	plot(regmodel);
	dev.off();
	regmodel_metrics = regr.eval(PE_GE_data$PE_abundance, regmodel$fitted.values)


	cat(
	"<u><font color='brown'><h4>Residuals vs Fitted plot</h4></font></u>\n",
	'<img src="PE_GE_lm_1.png">',
	'<br><br>This plot checks for linear relationship assumptions. If a horizontal line is observed without any distinct patterns, it indicates a linear relationship<br>',
	file = htmloutfile, append = TRUE);

	cat(
	"<u><font color='brown'><h4>Normal Q-Q plot of residuals</h4></font></u>\n",
	'<img src="PE_GE_lm_2.png">',
	'<br><br>This plot checks whether residuals are normally distributed or not. It is good if the residuals points follow the straight dashed line i.e., do not deviate much from dashed line.<br>',
	file = htmloutfile, append = TRUE);

	cat(
	"<u><font color='brown'><h4>Scale-Location (or Spread-Location) plot</h4></font></u>\n",
	'<img src="PE_GE_lm_3.png">',
	'<br><br>This plot checks for homogeneity of residual variance (homoscedasticity). A horizontal line observed with equally spread residual points is a good indication of homoscedasticity.<br>',
	file = htmloutfile, append = TRUE);

	cat(
	"<u><font color='brown'><h4>Residuals vs Leverage plot</h4></font></u>\n",
	'<img src="PE_GE_lm_3.png">',
	'<br><br>This plot is useful to identify any influential cases, that is outliers or extreme values that might influence the regression results upon inclusion or exclusion from the analysis.<br>',
	file = htmloutfile, append = TRUE);

#=============================================================================================================
# Identification of influential observations
#=============================================================================================================
	cat("Identifying influential observations\n",file=logfile, append=T);
	cat(
	"<font color='blue'><h3>Identify influential observations</h3></font>\n",
	file = htmloutfile, append = TRUE);
	cat(
	'<p><b>Cook’s distance</b> computes the influence of each data point/observation on the predicted outcome. i.e. this measures how much the observation is influencing the fitted values.<br>In general use, those observations that have a <b>cook’s distance > than 4 times the mean</b> may be classified as <b>influential.</b></p>',
	file = htmloutfile, append = TRUE);

	cooksd <- cooks.distance(regmodel);

	cat("Generating cooksd plot\n",file=logfile, append=T);
	outplot = paste("./",outdir,"/PE_GE_lm_cooksd.png",sep="",collapse="");
	png(outplot);
	par(mfrow=c(1,1));
	plot(cooksd, pch="*", cex=2, cex.lab=1.5,main="Influential Obs. by Cooks distance", ylab="Cook\'s distance", xlab="Observations")  # plot cooks distance
	abline(h = 4*mean(cooksd, na.rm=T), col="red")  # add cutoff line
	#text(x=1:length(cooksd)+1, y=cooksd, labels=ifelse(cooksd>4*mean(cooksd, na.rm=T),names(cooksd),""), col="red", pos=2)  # add labels
	dev.off();

	cat(
	'<img src="PE_GE_lm_cooksd.png">',
	'<br>In the above plot, observations above red line (4*mean cook\'s distance) are influential, marked in <b>*</b>. Genes that are outliers could be important. These observations influences the correlation values and regression coefficients<br><br>',
	file = htmloutfile, append = TRUE);

	tempind = which(cooksd>4*mean(cooksd, na.rm=T));
	PE_GE_data_no_outlier = PE_GE_data[-tempind,];
	PE_GE_data_no_outlier$cooksd = cooksd[-tempind]
	PE_GE_data_outlier = PE_GE_data[tempind,];
	PE_GE_data_outlier$cooksd = cooksd[tempind]

	cat(
	'<table class="embedded-table" border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#c3f0d6"><th>Parameter</th><th>Value</th></tr>',
	file = htmloutfile, append = TRUE);

	cat(
	"<tr><td>Mean cook\'s distance</td><td>",mean(cooksd, na.rm=T),"</td></tr>\n",
	"<tr><td>Total influential observations (cook\'s distance > 4 * mean cook\'s distance)</td><td>",length(tempind),"</td></tr>\n",
	"<tr><td>Total influential observations (cook\'s distance > 3 * mean cook\'s distance)</td><td>",length(which(cooksd>3*mean(cooksd, na.rm=T))),"</td></tr>\n",
	"</table>",
	'<font color="brown"><h4>Top 10 influential observations (cook\'s distance > 4 * mean cook\'s distance)</h4></font>',
	file = htmloutfile, append = TRUE);

	cat("Writing influential observations\n",file=logfile, append=T);

	outdatafile = paste("./",outdir,"/PE_GE_influential_observation.tsv", sep="", collapse="");
	cat('<a href="',outdatafile, '" target="_blank">Download entire list</a>',file = htmloutfile, append = TRUE);
	write.table(PE_GE_data_outlier, file=outdatafile, row.names=F, sep="\t", quote=F);

	cat(
	'<table class="embedded-table" border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#c3f0d6"><th>PE_ID</th><th>PE_abundance</th><th>GE_ID</th><th>GE_abundance</th><th>cooksd</th></tr>',
	file = htmloutfile, append = TRUE);

	for(i in 1:10)
	{
		cat(
		'<tr>','<td>',PE_GE_data_outlier[i,1],'</td>',
		'<td>',PE_GE_data_outlier[i,2],'</td>',
		'<td>',PE_GE_data_outlier[i,3],'</td>',
		'<td>',PE_GE_data_outlier[i,4],'</td>',
		'<td>',PE_GE_data_outlier[i,5],'</td></tr>',
		file = htmloutfile, append = TRUE);
	}
	cat('</table>',file = htmloutfile, append = TRUE);

#=============================================================================================================
# Correlation with removal of outliers
#=============================================================================================================

	#=============================================================================================================
	# Scatter plot
	#=============================================================================================================
	outplot = paste("./",outdir,"/AbundancePlot_scatter_without_outliers.png",sep="",collapse="");
	png(outplot);
	par(mfrow=c(1,1));
	scatter.smooth(PE_GE_data_no_outlier[,"GE_abundance"], PE_GE_data_no_outlier[,"PE_abundance"], xlab="Transcript Abundance", ylab="Protein Abundance", cex.lab=1.5);

	cat(
	"<font color='blue'><h3>Scatter plot between Proteome and Transcriptome Abundance, after removal of outliers/influential observations</h3></font>\n",
	'<img src="AbundancePlot_scatter_without_outliers.png">',
	file = htmloutfile, append = TRUE);

	#=============================================================================================================
	# Correlation
	#=============================================================================================================
	cat("Estimating orrelation with removal of outliers \n",file=logfile, append=T);
	cat(
	"<font color='blue'><h3>Correlation with removal of outliers / influential observations</h3></font>\n",
	'<p>We removed the influential observations and reestimated the correlation values.</p>',
	file = htmloutfile, append = TRUE);

	cor_result_pearson = cor.test(PE_GE_data_no_outlier[,"GE_abundance"], PE_GE_data_no_outlier[,"PE_abundance"], method = "pearson");
	cor_result_spearman = cor.test(PE_GE_data_no_outlier[,"GE_abundance"], PE_GE_data_no_outlier[,"PE_abundance"], method = "spearman");
	cor_result_kendall = cor.test(PE_GE_data_no_outlier[,"GE_abundance"], PE_GE_data_no_outlier[,"PE_abundance"], method = "kendall");

	cat(
	'<table class="embedded-table" border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#c3f0d6"><th>Parameter</th><th>Method 1</th><th>Method 2</th><th>Method 3</th></tr>',
	file = htmloutfile, append = TRUE);

	cat(
	"<tr><td>Correlation method used</td><td>",cor_result_pearson$method,"</td><td>",cor_result_spearman$method,"</td><td>",cor_result_kendall$method,"</td></tr>",
	"<tr><td>Correlation</td><td>",cor_result_pearson$estimate,"</td><td>",cor_result_spearman$estimate,"</td><td>",cor_result_kendall$estimate,"</td></tr>",
	"<tr><td>Pvalue</td><td>",cor_result_pearson$p.value,"</td><td>",cor_result_spearman$p.value,"</td><td>",cor_result_kendall$p.value,"</td></tr>",
	file = htmloutfile, append = TRUE)
	cat("</table>\n", file = htmloutfile, append = TRUE)

#=============================================================================================================
# Heatmap
#=============================================================================================================
	cat(
	"<font color='blue'><h3>Heatmap of PE and GE abundance values</h3></font>\n",
	file = htmloutfile, append = TRUE);

	cat("Generating heatmap plot\n",file=logfile, append=T);
	outplot = paste("./",outdir,"/PE_GE_heatmap.png",sep="",collapse="");
	png(outplot);
	par(mfrow=c(1,1));
	heatmap.2(as.matrix(PE_GE_data[,c("PE_abundance","GE_abundance")]), trace="none", cexCol=1, col=greenred(100),Colv=F, labCol=c("PE","GE"), scale="col");
	dev.off();

	cat(
	'<img src="PE_GE_heatmap.png">',
	file = htmloutfile, append = TRUE);

#=============================================================================================================
# kmeans clustering
#=============================================================================================================

	PE_GE_data_kdata = PE_GE_data;


	k1 = kmeans(PE_GE_data_kdata[,c("PE_abundance","GE_abundance")], 5);
	cat("Generating kmeans plot\n",file=logfile, append=T);
	outplot = paste("./",outdir,"/PE_GE_kmeans.png",sep="",collapse="");
	png(outplot);
	par(mfrow=c(1,1));
	scatter.smooth(PE_GE_data_kdata[,"GE_abundance"], PE_GE_data_kdata[,"PE_abundance"], xlab="Transcript Abundance", ylab="Protein Abundance", cex.lab=1.5);

	ind=which(k1$cluster==1);
	points(PE_GE_data_kdata[ind,"GE_abundance"], PE_GE_data_kdata[ind,"PE_abundance"], col="red", pch=16);
	ind=which(k1$cluster==2);
	points(PE_GE_data_kdata[ind,"GE_abundance"], PE_GE_data_kdata[ind,"PE_abundance"], col="green", pch=16);
	ind=which(k1$cluster==3);
	points(PE_GE_data_kdata[ind,"GE_abundance"], PE_GE_data_kdata[ind,"PE_abundance"], col="blue", pch=16);
	ind=which(k1$cluster==4);
	points(PE_GE_data_kdata[ind,"GE_abundance"], PE_GE_data_kdata[ind,"PE_abundance"], col="cyan", pch=16);
	ind=which(k1$cluster==5);
	points(PE_GE_data_kdata[ind,"GE_abundance"], PE_GE_data_kdata[ind,"PE_abundance"], col="black", pch=16);
	dev.off();

	cat(
	"<font color='blue'><h3>Kmean clustering</h3></font>\nNumber of Clusters: 5<br>",
	file = htmloutfile, append = TRUE);

	tempind = order(k1$cluster);
	tempoutfile = paste(outdir,"/","PE_GE_kmeans_clusterpoints.txt",sep="",collapse="");

	write.table(data.frame(PE_GE_data_kdata[tempind, ], Cluster=k1$cluster[tempind]), file=tempoutfile, row.names=F, quote=F, sep="\t", eol="\n")

	cat('<a href="',tempoutfile, '" target="_blank">Download cluster list</a><br>',file = htmloutfile, append = TRUE);

	cat(
	'<img src="PE_GE_kmeans.png">',
	file = htmloutfile, append = TRUE);

#=============================================================================================================
# Other Regression fit
#=============================================================================================================
	dev.set(devnum);
	# Linear regression with removal of outliers
	regmodel_no_outlier = lm(PE_abundance~GE_abundance, data=PE_GE_data_no_outlier);
	regmodel_no_outlier_predictedy = predict(regmodel_no_outlier, PE_GE_data_no_outlier);
	outplot = paste("./",outdir,"/PE_GE_lm_without_outliers.pdf",sep="",collapse="");
	plot(PE_GE_data_no_outlier[,"GE_abundance"], PE_GE_data_no_outlier[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Linear regression with removal of outliers");
	points(PE_GE_data_no_outlier[,"GE_abundance"], regmodel_no_outlier_predictedy, col="red");

	pdf(outplot);
	plot(regmodel_no_outlier);
	dev.off();
	regmodel_no_outlier_metrics = regr.eval(PE_GE_data$PE_abundance, regmodel_no_outlier$fitted.values)

	# Resistant regression (lqs / least trimmed squares method)
	regmodel_lqs = lqs(PE_abundance~GE_abundance, data=PE_GE_data);
	regmodel_lqs_predictedy = predict(regmodel_lqs, PE_GE_data);
	outplot = paste("./",outdir,"/PE_GE_lqs.pdf",sep="",collapse="");
	pdf(outplot);
	plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Resistant regression (lqs / least trimmed squares method)");
	points(PE_GE_data[,"GE_abundance"], regmodel_lqs_predictedy, col="red");
	#plot(regmodel_lqs);
	dev.off();
	dev.set(devnum);
	plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Resistant regression (lqs / least trimmed squares method)");
	points(PE_GE_data[,"GE_abundance"], regmodel_lqs_predictedy, col="red");
	regmodel_lqs_metrics = regr.eval(PE_GE_data$PE_abundance, regmodel_lqs$fitted.values)

	# Robust regression (rlm /  Huber M-estimator method)
	regmodel_rlm = rlm(PE_abundance~GE_abundance, data=PE_GE_data);
	regmodel_rlm_predictedy = predict(regmodel_rlm, PE_GE_data);
	outplot = paste("./",outdir,"/PE_GE_rlm.pdf",sep="",collapse="");
	plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Robust regression (rlm /  Huber M-estimator method)");
	points(PE_GE_data[,"GE_abundance"], regmodel_rlm_predictedy, col="red");
	pdf(outplot);
	plot(regmodel_rlm);
	dev.off();
	regmodel_rlm_metrics = regr.eval(PE_GE_data$PE_abundance, regmodel_rlm$fitted.values)

	# polynomical reg
	regmodel_poly2 = lm(PE_abundance ~ poly(GE_abundance, 2, raw = TRUE), data = PE_GE_data)
	regmodel_poly3 = lm(PE_abundance ~ poly(GE_abundance, 3, raw = TRUE), data = PE_GE_data)
	regmodel_poly4 = lm(PE_abundance ~ poly(GE_abundance, 4, raw = TRUE), data = PE_GE_data)
	regmodel_poly5 = lm(PE_abundance ~ poly(GE_abundance, 5, raw = TRUE), data = PE_GE_data)
	regmodel_poly6 = lm(PE_abundance ~ poly(GE_abundance, 6, raw = TRUE), data = PE_GE_data)
	regmodel_poly2_metrics = regr.eval(PE_GE_data$PE_abundance, regmodel_poly2$fitted.values)
	regmodel_poly3_metrics = regr.eval(PE_GE_data$PE_abundance, regmodel_poly3$fitted.values)
	regmodel_poly4_metrics = regr.eval(PE_GE_data$PE_abundance, regmodel_poly4$fitted.values)
	regmodel_poly5_metrics = regr.eval(PE_GE_data$PE_abundance, regmodel_poly5$fitted.values)
	regmodel_poly6_metrics = regr.eval(PE_GE_data$PE_abundance, regmodel_poly6$fitted.values)
	regmodel_poly2_predictedy = predict(regmodel_poly2, PE_GE_data);
	regmodel_poly3_predictedy = predict(regmodel_poly3, PE_GE_data);
	regmodel_poly4_predictedy = predict(regmodel_poly4, PE_GE_data);
	regmodel_poly5_predictedy = predict(regmodel_poly5, PE_GE_data);
	regmodel_poly6_predictedy = predict(regmodel_poly6, PE_GE_data);

	outplot = paste("./",outdir,"/PE_GE_poly2.pdf",sep="",collapse="");
	dev.set(devnum);
	plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Polynomial regression with degree 2");
	points(PE_GE_data[,"GE_abundance"], regmodel_poly2_predictedy, col="red");
	pdf(outplot);
	plot(regmodel_poly2);
	dev.off();

	outplot = paste("./",outdir,"/PE_GE_poly3.pdf",sep="",collapse="");
	dev.set(devnum);
	plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Polynomial regression with degree 3");
	points(PE_GE_data[,"GE_abundance"], regmodel_poly3_predictedy, col="red");
	pdf(outplot);
	plot(regmodel_poly3);
	dev.off();

	outplot = paste("./",outdir,"/PE_GE_poly4.pdf",sep="",collapse="");
	dev.set(devnum);
	plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Polynomial regression with degree 4");
	points(PE_GE_data[,"GE_abundance"], regmodel_poly4_predictedy, col="red");
	pdf(outplot);
	plot(regmodel_poly4);
	dev.off();

	outplot = paste("./",outdir,"/PE_GE_poly5.pdf",sep="",collapse="");
	dev.set(devnum);
	plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Polynomial regression with degree 5");
	points(PE_GE_data[,"GE_abundance"], regmodel_poly5_predictedy, col="red");
	pdf(outplot);
	plot(regmodel_poly5);
	dev.off();

	outplot = paste("./",outdir,"/PE_GE_poly6.pdf",sep="",collapse="");
	dev.set(devnum);
	plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Polynomial regression with degree 6");
	points(PE_GE_data[,"GE_abundance"], regmodel_poly6_predictedy, col="red");
	pdf(outplot);
	plot(regmodel_poly6);
	dev.off();

	# GAM Generalized additive models
	regmodel_gam <- gam(PE_abundance ~ s(GE_abundance), data = PE_GE_data)
	regmodel_gam_predictedy = predict(regmodel_gam, PE_GE_data);

	regmodel_gam_metrics = regr.eval(PE_GE_data$PE_abundance, regmodel_gam$fitted.values)
	outplot = paste("./",outdir,"/PE_GE_gam.pdf",sep="",collapse="");
	dev.set(devnum);
	plot(PE_GE_data[,"GE_abundance"], PE_GE_data[,"PE_abundance"], xlab="GE_abundance", ylab="PE_abundance",main="Generalized additive models");
	points(PE_GE_data[,"GE_abundance"], regmodel_gam_predictedy, col="red");
	pdf(outplot);
	plot(regmodel_gam,pages=1,residuals=TRUE);  ## show partial residuals
	plot(regmodel_gam,pages=1,seWithMean=TRUE) ## `with intercept' CIs
	dev.off();
	dev.off(devnum);

	cat(
	"<font color='blue'><h3>Other regression model fitting</h3></font>\n",
	file = htmloutfile, append = TRUE);

	cat(
	"<ul>
	<li>MAE:mean absolute error</li>
	<li>MSE: mean squared error</li>
	<li>RMSE:root mean squared error ( sqrt(MSE) )</li>
	<li>MAPE:mean absolute percentage error</li>
	</ul>
	",
	file = htmloutfile, append = TRUE);

	cat(
	'<h4><a href="PE_GE_modelfit.pdf" target="_blank">Comparison of model fits</a></h4>',
	file = htmloutfile, append = TRUE);

	cat(
	'<table class="embedded-table" border=1 cellspacing=0 cellpadding=5 style="table-layout:auto; "> <tr bgcolor="#c3f0d6"><th>Model</th><th>MAE</th><th>MSE</th><th>RMSE</th><th>MAPE</th><th>Diagnostics Plot</th></tr>',
	file = htmloutfile, append = TRUE);

	cat(
	"<tr><td>Linear regression with all data</td><td>",regmodel_metrics[1],"</td><td>",regmodel_metrics[2],"</td><td>",regmodel_metrics[3],"</td><td>",regmodel_metrics[4],"</td><td>",'<a href="PE_GE_lm.pdf" target="_blank">Link</a>',"</td></tr>",

	"<tr><td>Linear regression with removal of outliers</td><td>",regmodel_no_outlier_metrics[1],"</td><td>",regmodel_no_outlier_metrics[2],"</td><td>",regmodel_no_outlier_metrics[3],"</td><td>",regmodel_no_outlier_metrics[4],"</td><td>",'<a href="PE_GE_lm_without_outliers.pdf" target="_blank">Link</a>',"</td></tr>",

	"<tr><td>Resistant regression (lqs / least trimmed squares method)</td><td>",regmodel_lqs_metrics[1],"</td><td>",regmodel_lqs_metrics[2],"</td><td>",regmodel_lqs_metrics[3],"</td><td>",regmodel_lqs_metrics[4],"</td><td>", '<a href="PE_GE_lqs.pdf" target="_blank">Link</a>',"</td></tr>",

	"<tr><td>Robust regression (rlm /  Huber M-estimator method)</td><td>",regmodel_rlm_metrics[1],"</td><td>",regmodel_rlm_metrics[2],"</td><td>",regmodel_rlm_metrics[3],"</td><td>",regmodel_rlm_metrics[4],"</td><td>",'<a href="PE_GE_rlm.pdf" target="_blank">Link</a>',"</td></tr>",


	"<tr><td>Polynomial regression with degree 2</td><td>",regmodel_poly2_metrics[1],"</td><td>",regmodel_poly2_metrics[2],"</td><td>",regmodel_poly2_metrics[3],"</td><td>",regmodel_poly2_metrics[4],"</td><td>",'<a href="PE_GE_poly2.pdf" target="_blank">Link</a>',"</td></tr>",

	"<tr><td>Polynomial regression with degree 3</td><td>",regmodel_poly3_metrics[1],"</td><td>",regmodel_poly3_metrics[2],"</td><td>",regmodel_poly3_metrics[3],"</td><td>",regmodel_poly3_metrics[4],"</td><td>",'<a href="PE_GE_poly3.pdf" target="_blank">Link</a>',"</td></tr>",

	"<tr><td>Polynomial regression with degree 4</td><td>",regmodel_poly4_metrics[1],"</td><td>",regmodel_poly4_metrics[2],"</td><td>",regmodel_poly4_metrics[3],"</td><td>",regmodel_poly4_metrics[4],"</td><td>",'<a href="PE_GE_poly4.pdf" target="_blank">Link</a>',"</td></tr>",

	"<tr><td>Polynomial regression with degree 5</td><td>",regmodel_poly5_metrics[1],"</td><td>",regmodel_poly5_metrics[2],"</td><td>",regmodel_poly5_metrics[3],"</td><td>",regmodel_poly5_metrics[4],"</td><td>",'<a href="PE_GE_poly5.pdf" target="_blank">Link</a>',"</td></tr>",

	"<tr><td>Polynomial regression with degree 6</td><td>",regmodel_poly6_metrics[1],"</td><td>",regmodel_poly6_metrics[2],"</td><td>",regmodel_poly6_metrics[3],"</td><td>",regmodel_poly6_metrics[4],"</td><td>",'<a href="PE_GE_poly6.pdf" target="_blank">Link</a>',"</td></tr>",

	"<tr><td>Generalized additive models</td><td>",regmodel_gam_metrics[1],"</td><td>",regmodel_gam_metrics[2],"</td><td>",regmodel_gam_metrics[3],"</td><td>",regmodel_gam_metrics[4],"</td><td>",'<a href="PE_GE_gam.pdf" target="_blank">Link</a>',"</td></tr>",

	"</table>",
	file = htmloutfile, append = TRUE);


	# Warning On
	options(warn = oldw)
author	pravs
date	Sun, 17 Jun 2018 04:44:29 -0400
parents	fc89f8c3b777
children	6bf0203ee17e