Mercurial > repos > pieterlukasse > prims_metabolomics
comparison Rscripts/ridb-regression.R @ 0:9d5f4f5f764b
Initial commit to toolshed
| author | pieter.lukasse@wur.nl |
|---|---|
| date | Thu, 16 Jan 2014 13:10:00 +0100 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:9d5f4f5f764b |
|---|---|
| 1 ## | |
| 2 # | |
| 3 # Performs regression analysis using either 3rd degree polynomial- or linear-method | |
| 4 # | |
| 5 ## | |
| 6 | |
| 7 # Commandline arguments | |
| 8 args <- commandArgs(TRUE) | |
| 9 if (length(args) < 7) | |
| 10 stop(cat("Missing arguments, usage:\n\tRscript ridb-regression.R RI-database ", | |
| 11 "ouput_file logfile min_residuals range_mod pvalue rsquared method ", | |
| 12 "plot(yes/no) plot_archive")) | |
| 13 | |
| 14 ridb <- args[1] | |
| 15 out_file <- args[2] | |
| 16 logfile <- args[3] | |
| 17 min_residuals <- as.integer(args[4]) | |
| 18 range_mod <- as.integer(args[5]) | |
| 19 pvalue <- as.double(args[6]) | |
| 20 rsquared <- as.double(args[7]) | |
| 21 method <- args[8] | |
| 22 plot <- tolower(args[9]) | |
| 23 if (plot == 'true') | |
| 24 plot_archive = args[10] | |
| 25 | |
| 26 # Do not show warnings etc. | |
| 27 sink(file='/dev/null') | |
| 28 | |
| 29 progress <- c() | |
| 30 logger <- function(logdata) { | |
| 31 ## Logs progress, adds a timestamp for each event | |
| 32 #cat(paste(Sys.time(), "\t", logdata, "\n", sep="")) ## DEBUG | |
| 33 progress <<- c(progress, paste(Sys.time(), "\t", logdata, sep="")) | |
| 34 } | |
| 35 | |
| 36 logger("Reading Retention Index Database..") | |
| 37 | |
| 38 # Read Retention Index Database | |
| 39 ridb <- read.csv(ridb, header=TRUE, sep="\t") | |
| 40 logger(paste("\t", nrow(ridb), "records read..")) | |
| 41 # Get a unique list | |
| 42 gc_columns <- unique(as.vector(as.matrix(ridb['Column.name'])[,1])) | |
| 43 cas_numbers <- unique(as.vector(as.matrix(ridb['CAS'])[,1])) | |
| 44 | |
| 45 add_poly_fit <- function(fit, gc1_index, gc2_index, range) { | |
| 46 pval = anova.lm(fit)$Pr | |
| 47 r.squared = summary(fit)$r.squared | |
| 48 | |
| 49 data = rep(NA, 11) | |
| 50 # Append results to matrix | |
| 51 data[1] = gc_columns[gc1_index] # Column 1 | |
| 52 data[2] = gc_columns[gc2_index] # Column 2 | |
| 53 data[3] = coefficients(fit)[1] # The 4 coefficients | |
| 54 data[4] = coefficients(fit)[2] | |
| 55 data[5] = coefficients(fit)[3] | |
| 56 data[6] = coefficients(fit)[4] | |
| 57 data[7] = range[1] # Left limit | |
| 58 data[8] = range[2] # Right limit | |
| 59 data[9] = length(fit$residuals) # Number of datapoints analysed | |
| 60 data[10] = pval[1] # p-value for resulting fitting | |
| 61 data[11] = r.squared # R-squared | |
| 62 return(data) | |
| 63 } | |
| 64 | |
| 65 | |
| 66 add_linear_fit <- function(fit, gc1_index, gc2_index, range) { | |
| 67 pval = anova.lm(fit)$Pr | |
| 68 r.squared = summary(fit)$r.squared | |
| 69 | |
| 70 data = rep(NA, 7) | |
| 71 # Append results to matrix | |
| 72 data[1] = gc_columns[gc1_index] # Column 1 | |
| 73 data[2] = gc_columns[gc2_index] # Column 2 | |
| 74 data[3] = coefficients(fit)[1] # The 4 coefficients | |
| 75 data[4] = coefficients(fit)[2] | |
| 76 data[7] = length(fit$residuals) # Number of datapoints analysed | |
| 77 data[8] = pval[1] # p-value for resulting fitting | |
| 78 data[9] = r.squared # R-squared | |
| 79 return(data) | |
| 80 } | |
| 81 | |
| 82 | |
| 83 add_fit <- function(fit, gc1_index, gc2_index, range, method) { | |
| 84 if (method == 'poly') | |
| 85 return(add_poly_fit(fit, gc1_index, gc2_index, range)) | |
| 86 else | |
| 87 return(add_linear_fit(fit, gc1_index, gc2_index, range)) | |
| 88 } | |
| 89 | |
| 90 | |
| 91 plot_fit <- function(ri1, ri2, gc1_index, gc2_index, coeff, range, method) { | |
| 92 if (method == 'poly') | |
| 93 pol <- function(x) coeff[4]*x^3 + coeff[3]*x^2 + coeff[2]*x + coeff[1] | |
| 94 else | |
| 95 pol <- function(x) coeff[2]*x + coeff[1] | |
| 96 pdf(paste('regression_model_', | |
| 97 make.names(gc_columns[gc1_index]), '_vs_', | |
| 98 make.names(gc_columns[gc2_index]), '.pdf', sep='')) | |
| 99 curve(pol, 250:3750, col="red", lwd=2.5, main='Regression Model', xlab=gc_columns[gc1_index], | |
| 100 ylab=gc_columns[gc2_index], xlim=c(250, 3750), ylim=c(250, 3750)) | |
| 101 points(ri1, ri2, lwd=0.4) | |
| 102 # Add vertical lines showing left- and right limits when using poly method | |
| 103 if (method == 'poly') | |
| 104 abline(v=range, col="grey", lwd=1.5) | |
| 105 dev.off() | |
| 106 } | |
| 107 | |
| 108 # Initialize output dataframe | |
| 109 if (method == 'poly') { | |
| 110 m <- data.frame(matrix(ncol = 11, nrow = 10)) | |
| 111 } else { | |
| 112 m <- data.frame(matrix(ncol = 9, nrow = 10)) | |
| 113 } | |
| 114 | |
| 115 | |
| 116 get_fit <- function(gc1, gc2, method) { | |
| 117 if (method == 'poly') | |
| 118 return(lm(gc1 ~ poly(gc2, 3, raw=TRUE))) | |
| 119 else | |
| 120 return(lm(gc1 ~ gc2)) | |
| 121 } | |
| 122 | |
| 123 # Permutate | |
| 124 k <- 1 | |
| 125 logger(paste("Permutating (with ", length(gc_columns), " GC-columns)..", sep="")) | |
| 126 | |
| 127 for (i in 1:(length(gc_columns)-1)) { | |
| 128 logger(paste("\tCalculating model for ", gc_columns[i], "..", sep="")) | |
| 129 breaks <- 0 | |
| 130 for (j in (i+1):length(gc_columns)) { | |
| 131 col1 = ridb[which(ridb['Column.name'][,1] == gc_columns[i]),] | |
| 132 col2 = ridb[which(ridb['Column.name'][,1] == gc_columns[j]),] | |
| 133 | |
| 134 # Find CAS numbers for which both columns have data (intersect) | |
| 135 cas_intersect = intersect(col1[['CAS']], col2[['CAS']]) | |
| 136 | |
| 137 # Skip if number of shared CAS entries is < cutoff | |
| 138 if (length(cas_intersect) < min_residuals) { | |
| 139 breaks = breaks + 1 | |
| 140 next | |
| 141 } | |
| 142 # Gather Retention Indices | |
| 143 col1_data = col1[['RI']][match(cas_intersect, col1[['CAS']])] | |
| 144 col2_data = col2[['RI']][match(cas_intersect, col2[['CAS']])] | |
| 145 | |
| 146 # Calculate the range within which regression is possible (and move if 'range_mod' != 0) | |
| 147 range = c(min(c(min(col1_data), min(col2_data))), max(c(max(col1_data), max(col2_data)))) | |
| 148 if (range_mod != 0) { | |
| 149 # Calculate percentage and add/subtract from range | |
| 150 perc = diff(range) / 100 | |
| 151 perc_cutoff = range_mod * perc | |
| 152 range = as.integer(range + c(perc_cutoff, -perc_cutoff)) | |
| 153 } | |
| 154 | |
| 155 # Calculate model for column1 vs column2 and plot if requested | |
| 156 fit = get_fit(col1_data, col2_data, method) | |
| 157 m[k,] = add_fit(fit, i, j, range, method) | |
| 158 | |
| 159 if (plot == 'true') | |
| 160 plot_fit(col1_data, col2_data, i, j, coefficients(fit), range, method) | |
| 161 | |
| 162 # Calculate model for column2 vs column1 and plot if requested | |
| 163 fit = get_fit(col2_data, col1_data, method) | |
| 164 m[k + 1,] = add_fit(fit, j, i, range, method) | |
| 165 | |
| 166 if (plot == 'true') | |
| 167 plot_fit(col2_data, col1_data, j, i, coefficients(fit), range, method) | |
| 168 | |
| 169 k = k + 2 | |
| 170 } | |
| 171 logger(paste("\t\t", breaks, " comparisons have been skipped due to nr. of datapoints < cutoff", sep="")) | |
| 172 } | |
| 173 | |
| 174 # Filter on pvalue and R-squared | |
| 175 logger("Filtering on pvalue and R-squared..") | |
| 176 if (method == 'poly') { | |
| 177 pval_index <- which(m[,10] < pvalue) | |
| 178 rsquared_index <- which(m[,11] > rsquared) | |
| 179 } else { | |
| 180 pval_index <- which(m[,8] < pvalue) | |
| 181 rsquared_index <- which(m[,9] > rsquared) | |
| 182 } | |
| 183 logger(paste(nrow(m) - length(pval_index), " models discarded due to pvalue > ", pvalue, sep="")) | |
| 184 | |
| 185 logger(paste(nrow(m) - length(rsquared_index), " models discarded due to R-squared < ", rsquared, sep="")) | |
| 186 | |
| 187 # Remaining rows | |
| 188 index = unique(c(pval_index, rsquared_index)) | |
| 189 | |
| 190 # Reduce dataset | |
| 191 m = m[index,] | |
| 192 sink() | |
| 193 | |
| 194 # Place plots in the history as a ZIP file | |
| 195 if (plot == 'true') { | |
| 196 logger("Creating archive with model graphics..") | |
| 197 system(paste("zip -9 -r models.zip *.pdf > /dev/null", sep="")) | |
| 198 system(paste("cp models.zip ", plot_archive, sep="")) | |
| 199 } | |
| 200 | |
| 201 # Save dataframe as tab separated file | |
| 202 logger("All done, saving data..") | |
| 203 header = c("Column1", "Column2", "Coefficient1", "Coefficient2", "Coefficient3", "Coefficient4", | |
| 204 "LeftLimit", "RightLimit", "Residuals", "pvalue", "Rsquared") | |
| 205 if (method != 'poly') | |
| 206 header = header[c(1:4, 7:11)] | |
| 207 write(progress, logfile) | |
| 208 write.table(m, file=out_file, sep="\t", quote=FALSE, col.names=header, row.names=FALSE) |
