diff Rscripts/ridb-regression.R @ 0:9d5f4f5f764b

Initial commit to toolshed
author pieter.lukasse@wur.nl
date Thu, 16 Jan 2014 13:10:00 +0100
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Rscripts/ridb-regression.R	Thu Jan 16 13:10:00 2014 +0100
@@ -0,0 +1,208 @@
+##
+#
+# Performs regression analysis using either 3rd degree polynomial- or linear-method
+#
+##
+
+# Commandline arguments
+args  <- commandArgs(TRUE)
+if (length(args) < 7)
+	stop(cat("Missing arguments, usage:\n\tRscript ridb-regression.R RI-database ",
+             "ouput_file logfile min_residuals range_mod pvalue rsquared method ",
+			 "plot(yes/no) plot_archive"))
+
+ridb <- args[1]
+out_file <- args[2]
+logfile <- args[3]
+min_residuals <- as.integer(args[4])
+range_mod <- as.integer(args[5])
+pvalue <- as.double(args[6])
+rsquared <- as.double(args[7])
+method <- args[8]
+plot <- tolower(args[9])
+if (plot == 'true')
+	plot_archive = args[10]
+
+# Do not show warnings etc.
+sink(file='/dev/null')
+
+progress <- c()
+logger <- function(logdata) {
+	## Logs progress, adds a timestamp for each event
+	#cat(paste(Sys.time(), "\t", logdata, "\n", sep="")) ## DEBUG
+	progress <<- c(progress, paste(Sys.time(), "\t", logdata, sep=""))
+}
+
+logger("Reading Retention Index Database..")
+
+# Read Retention Index Database
+ridb <- read.csv(ridb, header=TRUE, sep="\t")
+logger(paste("\t", nrow(ridb), "records read.."))
+# Get a unique list 
+gc_columns <- unique(as.vector(as.matrix(ridb['Column.name'])[,1]))
+cas_numbers <- unique(as.vector(as.matrix(ridb['CAS'])[,1]))
+
+add_poly_fit <- function(fit, gc1_index, gc2_index, range) {
+	pval = anova.lm(fit)$Pr
+	r.squared = summary(fit)$r.squared
+
+	data = rep(NA, 11)
+	# Append results to matrix
+	data[1] = gc_columns[gc1_index] # Column 1
+	data[2] = gc_columns[gc2_index] # Column 2
+	data[3] = coefficients(fit)[1]  # The 4 coefficients
+	data[4] = coefficients(fit)[2]
+	data[5] = coefficients(fit)[3]
+	data[6] = coefficients(fit)[4]
+	data[7] = range[1]              # Left limit
+	data[8] = range[2]              # Right limit
+	data[9] = length(fit$residuals) # Number of datapoints analysed
+	data[10] = pval[1]              # p-value for resulting fitting
+	data[11] = r.squared            # R-squared
+	return(data)
+}
+
+
+add_linear_fit <- function(fit, gc1_index, gc2_index, range) {
+	pval = anova.lm(fit)$Pr
+	r.squared = summary(fit)$r.squared
+	
+	data = rep(NA, 7)
+	# Append results to matrix
+	data[1] = gc_columns[gc1_index] # Column 1
+	data[2] = gc_columns[gc2_index] # Column 2
+	data[3] = coefficients(fit)[1]  # The 4 coefficients
+	data[4] = coefficients(fit)[2]
+	data[7] = length(fit$residuals) # Number of datapoints analysed
+	data[8] = pval[1]               # p-value for resulting fitting
+	data[9] = r.squared             # R-squared
+	return(data)
+}
+
+
+add_fit <- function(fit, gc1_index, gc2_index, range, method) {
+	if (method == 'poly')
+		return(add_poly_fit(fit, gc1_index, gc2_index, range))
+	else
+		return(add_linear_fit(fit, gc1_index, gc2_index, range))
+}
+
+
+plot_fit <- function(ri1, ri2, gc1_index, gc2_index, coeff, range, method) {
+    if (method == 'poly')
+        pol <- function(x) coeff[4]*x^3 + coeff[3]*x^2 + coeff[2]*x + coeff[1]
+    else
+        pol <- function(x) coeff[2]*x + coeff[1]
+    pdf(paste('regression_model_',
+              make.names(gc_columns[gc1_index]), '_vs_', 
+              make.names(gc_columns[gc2_index]), '.pdf', sep=''))
+    curve(pol, 250:3750, col="red", lwd=2.5, main='Regression Model', xlab=gc_columns[gc1_index],
+          ylab=gc_columns[gc2_index], xlim=c(250, 3750), ylim=c(250, 3750))
+    points(ri1, ri2, lwd=0.4)
+    # Add vertical lines showing left- and right limits when using poly method
+    if (method == 'poly')
+        abline(v=range, col="grey", lwd=1.5)
+    dev.off()
+}
+
+# Initialize output dataframe
+if (method == 'poly') {
+    m <- data.frame(matrix(ncol = 11, nrow = 10))
+} else {
+    m <- data.frame(matrix(ncol = 9, nrow = 10))
+}
+
+
+get_fit <- function(gc1, gc2, method) {
+	if (method == 'poly')
+		return(lm(gc1 ~ poly(gc2, 3, raw=TRUE)))
+	else
+		return(lm(gc1 ~ gc2))
+}
+
+# Permutate
+k <- 1
+logger(paste("Permutating (with ", length(gc_columns), " GC-columns)..", sep=""))
+
+for (i in 1:(length(gc_columns)-1)) {
+	logger(paste("\tCalculating model for ", gc_columns[i], "..", sep=""))
+	breaks <- 0
+	for (j in (i+1):length(gc_columns)) {
+		col1 = ridb[which(ridb['Column.name'][,1] == gc_columns[i]),]
+		col2 = ridb[which(ridb['Column.name'][,1] == gc_columns[j]),]
+		
+		# Find CAS numbers for which both columns have data (intersect)
+		cas_intersect = intersect(col1[['CAS']], col2[['CAS']])
+		
+		# Skip if number of shared CAS entries is < cutoff
+		if (length(cas_intersect) < min_residuals) {
+			breaks = breaks + 1
+			next
+		}
+		# Gather Retention Indices
+		col1_data = col1[['RI']][match(cas_intersect, col1[['CAS']])]
+		col2_data = col2[['RI']][match(cas_intersect, col2[['CAS']])]
+		
+		# Calculate the range within which regression is possible (and move if 'range_mod' != 0)
+		range = c(min(c(min(col1_data), min(col2_data))), max(c(max(col1_data), max(col2_data))))
+		if (range_mod != 0) {
+			# Calculate percentage and add/subtract from range
+			perc = diff(range) / 100
+			perc_cutoff = range_mod * perc
+			range = as.integer(range + c(perc_cutoff, -perc_cutoff))
+		}
+		
+		# Calculate model for column1 vs column2 and plot if requested
+		fit = get_fit(col1_data, col2_data, method)
+		m[k,] = add_fit(fit, i, j, range, method)
+		
+		if (plot == 'true')
+			plot_fit(col1_data, col2_data, i, j, coefficients(fit), range, method)
+		
+		# Calculate model for column2 vs column1 and plot if requested
+		fit = get_fit(col2_data, col1_data, method)
+		m[k + 1,] = add_fit(fit, j, i, range, method)
+		
+		if (plot == 'true')
+			plot_fit(col2_data, col1_data, j, i, coefficients(fit), range, method)
+		
+		k = k + 2
+	}
+	logger(paste("\t\t", breaks, " comparisons have been skipped due to nr. of datapoints < cutoff", sep=""))
+}
+
+# Filter on pvalue and R-squared
+logger("Filtering on pvalue and R-squared..")
+if (method == 'poly') {
+    pval_index <- which(m[,10] < pvalue)
+    rsquared_index <- which(m[,11] > rsquared)
+} else {
+    pval_index <- which(m[,8] < pvalue)
+    rsquared_index <- which(m[,9] > rsquared)
+}
+logger(paste(nrow(m) - length(pval_index), " models discarded due to pvalue > ", pvalue, sep=""))
+
+logger(paste(nrow(m) - length(rsquared_index), " models discarded due to R-squared < ", rsquared, sep=""))
+
+# Remaining rows
+index = unique(c(pval_index, rsquared_index))
+
+# Reduce dataset
+m = m[index,]
+sink()
+
+# Place plots in the history as a ZIP file
+if (plot == 'true') {
+    logger("Creating archive with model graphics..")
+    system(paste("zip -9 -r models.zip *.pdf > /dev/null", sep=""))
+    system(paste("cp models.zip ", plot_archive, sep=""))
+}
+
+# Save dataframe as tab separated file
+logger("All done, saving data..")
+header = c("Column1", "Column2", "Coefficient1", "Coefficient2", "Coefficient3", "Coefficient4", 
+           "LeftLimit", "RightLimit", "Residuals", "pvalue", "Rsquared")
+if (method != 'poly')
+	header = header[c(1:4, 7:11)]
+write(progress, logfile)
+write.table(m, file=out_file, sep="\t", quote=FALSE, col.names=header, row.names=FALSE)