Mercurial > repos > fcaramia > contra
diff Contra/scripts/cn_analysis.v3.R @ 0:7564f3b1e675
Uploaded
author | fcaramia |
---|---|
date | Thu, 13 Sep 2012 02:31:43 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Contra/scripts/cn_analysis.v3.R Thu Sep 13 02:31:43 2012 -0400 @@ -0,0 +1,278 @@ +# ----------------------------------------------------------------------# +# Copyright (c) 2011, Richard Lupat & Jason Li. +# +# > Source License < +# This file is part of CONTRA. +# +# CONTRA is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# CONTRA is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with CONTRA. If not, see <http://www.gnu.org/licenses/>. +# +# +#-----------------------------------------------------------------------# +# Last Updated : 31 Oct 2011 17:00PM + + +# Parameters Parsing (from Command Line) +options <- commandArgs(trailingOnly = T) +bins = as.integer(options[1]) +rd.cutoff = as.integer(options[2]) +min.bases = as.integer(options[3]) +outf = options[4] +sample.name = options[5] +plotoption = options[6] +actual.bin = as.numeric(options[7]) +min_normal_rd_for_call = as.numeric(options[8]) +min_tumour_rd_for_call = as.numeric(options[9]) +min_avg_cov_for_call = as.numeric(options[10]) + +if (sample.name == "No-SampleName") + sample.name = "" + +if (sample.name != "") + sample.name = paste(sample.name, ".", sep="") + +# Setup output name +out.f = paste(outf, "/table/", sample.name, "CNATable.", rd.cutoff,"rd.", min.bases,"bases.", bins,"bins.txt", sep="") +pdf.out.f = paste(outf, "/plot/", sample.name, "densityplot.", bins, "bins.pdf", sep="") + +# Open and read input files +# cnAverageFile = paste("bin", bins, ".txt", sep="") +cnAverageFile = paste(outf,"/buf/bin",bins,".txt",sep="") +boundariesFile = paste(outf,"/buf/bin",bins,".boundaries.txt",sep="") +print (cnAverageFile) +cn.average = read.delim(cnAverageFile, as.is=F, header=F) +cn.boundary= read.delim(boundariesFile,as.is=F, header=F) + +# Apply thresholds and data grouping +cn.average.aboveTs = cn.average[cn.average$V3>min.bases,] +cn.average.list = as.matrix(cn.average.aboveTs$V4) + +# Get the mean and sd for each bins +cn.average.mean = c() +cn.average.sd = c() +cn.average.log= c() + +# Density Plots for each bins +if (plotoption == "True"){ + pdf(pdf.out.f) +} +for (j in 1:actual.bin){ + cn.average.nth = as.matrix(cn.average.aboveTs[cn.average.aboveTs$V15==j,]$V4) + cn.coverage.nth = as.matrix(cn.average.aboveTs[cn.average.aboveTs$V15==j,]$V11) + boundary.end = cn.boundary[cn.boundary$V1==j,]$V2 + boundary.start = cn.boundary[cn.boundary$V1==(j-1),]$V2 + boundary.mid = (boundary.end+boundary.start)/2 + if (plotoption == "True") { + plot_title = paste("density: bin", bins, sep="") + #plot(density(cn.average.nth),xlim=c(-5,5), title=plot_title) + plot(density(cn.average.nth),xlim=c(-5,5)) + } + cn.average.mean = c(cn.average.mean, mean(cn.average.nth)) +# cn.average.sd = c(cn.average.sd, sd(cn.average.nth)) + cn.average.sd = c(cn.average.sd, apply(cn.average.nth,2,sd)) + #cn.average.log = c(cn.average.log, boundary.mid) + cn.average.log = c(cn.average.log, log(mean(cn.coverage.nth),2)) +} +if (plotoption == "True"){ + dev.off() +} + +# for point outside of boundaries +if (bins > 1) { + boundary.first = cn.boundary[cn.boundary$V1==0,]$V2 + boundary.last = cn.boundary[cn.boundary$V1==bins,]$V2 + + b.mean.y2 = cn.average.mean[2] + b.mean.y1 = cn.average.mean[1] + b.sd.y2 = cn.average.sd[2] + b.sd.y1 = cn.average.sd[1] + b.x2 = cn.average.log[2] + b.x1 = cn.average.log[1] + + boundary.f.mean = (((b.mean.y2- b.mean.y1)/(b.x2-b.x1))*(boundary.first-b.x1))+b.mean.y1 + boundary.f.sd = (((b.sd.y2- b.sd.y1)/(b.x2-b.x1))*(boundary.first-b.x1))+b.sd.y1 + + if (boundary.f.sd < 0){ + boundary.f.sd = 0 + } + + b.mean.y2 = cn.average.mean[bins] + b.mean.y1 = cn.average.mean[bins-1] + b.sd.y2 = cn.average.sd[bins] + b.sd.y1 = cn.average.sd[bins-1] + b.x2 = cn.average.log[bins] + b.x1 = cn.average.log[bins-1] + + boundary.l.mean = (((b.mean.y2- b.mean.y1)/(b.x2-b.x1))*(boundary.last-b.x1))+b.mean.y1 + boundary.l.sd = (((b.sd.y2- b.sd.y1)/(b.x2-b.x1))*(boundary.last-b.x1))+b.sd.y1 + + #cn.average.log = c(boundary.first, cn.average.log, boundary.last) + #cn.linear.mean = c(boundary.f.mean, cn.average.mean, boundary.l.mean) + #cn.linear.sd = c(boundary.f.sd, cn.average.sd, boundary.l.sd) + + cn.average.log = c(boundary.first, cn.average.log) + cn.linear.mean = c(boundary.f.mean, cn.average.mean) + cn.linear.sd = c(boundary.f.sd, cn.average.sd) + +} + +# Linear Interpolation +if (bins > 1 ){ + #print(cn.average.log) + #print(cn.linear.mean) + #print(cn.linear.sd) + mean.fn <- approxfun(cn.average.log, cn.linear.mean, rule=2) + sd.fn <- approxfun(cn.average.log, cn.linear.sd, rule=2) +} + + +# Put the data's details into matrices +ids = as.matrix(cn.average.aboveTs$V1) +exons = as.matrix(cn.average.aboveTs$V6) +exons.pos = as.matrix(cn.average.aboveTs$V5) +gs = as.matrix(cn.average.aboveTs$V2) +number.bases = as.matrix(cn.average.aboveTs$V3) +mean = as.matrix(cn.average.aboveTs$V4) +sd = as.matrix(cn.average.aboveTs$V7) +tumour.rd = as.matrix(cn.average.aboveTs$V8) +tumour.rd.ori = as.matrix(cn.average.aboveTs$V10) +normal.rd = as.matrix(cn.average.aboveTs$V9) +normal.rd.ori = as.matrix(cn.average.aboveTs$V11) +median = as.matrix(cn.average.aboveTs$V12) +MinLogRatio = as.matrix(cn.average.aboveTs$V13) +MaxLogRatio = as.matrix(cn.average.aboveTs$V14) +Bin = as.matrix(cn.average.aboveTs$V15) +Chr = as.matrix(cn.average.aboveTs$V16) +OriStCoordinate = as.matrix(cn.average.aboveTs$V17) +OriEndCoordinate= as.matrix(cn.average.aboveTs$V18) + +# Linear Fit +logratios.mean = mean +logcov.mean = log2((normal.rd + tumour.rd)/2) +fit.mean = lm(logratios.mean ~ logcov.mean) +fit.x = fit.mean$coefficient[1] +fit.y = fit.mean$coefficient[2] + +adjusted.lr = rep(NA, length(logratios.mean)) +for (j in 1:length(logratios.mean)){ + fitted.mean = fit.x + fit.y * logcov.mean[j] + adjusted.lr[j] = logratios.mean[j] - fitted.mean +} + +fit.mean2 = lm(adjusted.lr ~ logcov.mean) +fit.mean.a = fit.mean2$coefficient[1] +fit.mean.b = fit.mean2$coefficient[2] + +fit.mean.fn <- function(x, fit.a, fit.b){ + result = fit.a + fit.b * x + return (result) +} + +# Adjust SD based on the new adjusted log ratios +logratios.sd = c() +logcov.bins.mean= c() +for (j in 1:actual.bin){ + lr.bins.mean = as.matrix(adjusted.lr[cn.average.aboveTs$V15==j]) +# logratios.sd = c(logratios.sd, sd(lr.bins.mean)) + logratios.sd = c(logratios.sd, apply(lr.bins.mean,2,sd)) + + cn.coverage.tumour.nth = as.matrix(cn.average.aboveTs[cn.average.aboveTs$V15==j,]$V8) + cn.coverage.normal.nth = as.matrix(cn.average.aboveTs[cn.average.aboveTs$V15==j,]$V9) + cn.coverage.nth = (cn.coverage.tumour.nth + cn.coverage.normal.nth) /2 + logcov.bins.mean= c(logcov.bins.mean, log2(mean(cn.coverage.nth))) + +} + +logratios.sd.ori = logratios.sd +if (length(logratios.sd) > 2) { + logratios.sd = logratios.sd[-length(logratios.sd)] +} + +logcov.bins.mean.ori = logcov.bins.mean +if (length(logcov.bins.mean) > 2){ + logcov.bins.mean= logcov.bins.mean[-length(logcov.bins.mean)] +} + +fit.sd = lm(log2(logratios.sd) ~ logcov.bins.mean) +fit.sd.a = fit.sd$coefficient[1] +fit.sd.b = fit.sd$coefficient[2] + +fit.sd.fn <- function(x, fit.a, fit.b){ + result = 2 ^ (fit.mean.fn(x, fit.a, fit.b)) + return (result) +} + +# Get the P Values, called the gain/loss +# with average and sd from each bins +pVal.list = c() +gain.loss = c() + +for (i in 1:nrow(cn.average.list)){ + #print (i) + #logratio = cn.average.list[i] + #logcov = log(normal.rd.ori[i],2) + logratio = adjusted.lr[i] + logcov = logcov.mean[i] + exon.bin = Bin[i] + + if (length(logratios.sd) > 1){ + #pVal <- pnorm(logratio, fit.mean.fn(logcov, fit.mean.a, fit.mean.b), fit.sd.fn(logcov, fit.sd.a, fit.sd.b)) + pVal <- pnorm(logratio, fit.mean.fn(logcov, fit.mean.a, fit.mean.b), sd.fn(logcov)) + } else { + pVal <- pnorm(logratio, 0, logratios.sd[exon.bin]) + } + + if (pVal > 0.5){ + pVal = 1-pVal + gain.loss = c(gain.loss, "gain") + } else { + gain.loss = c(gain.loss, "loss") + } + pVal.list = c(pVal.list, pVal*2) +} + +# Get the adjusted P Values +adjusted.pVal.list = p.adjust(pVal.list, method="BH") + +# Write the output into a tab-delimited text files +outdf=data.frame(Targeted.Region.ID=ids,Exon.Number=exons,Gene.Sym=gs,Chr, OriStCoordinate, OriEndCoordinate, Mean.of.LogRatio=cn.average.list, Adjusted.Mean.of.LogRatio=adjusted.lr, SD.of.LogRatio=sd, Median.of.LogRatio=median, number.bases, P.Value=pVal.list ,Adjusted.P.Value=adjusted.pVal.list , gain.loss, tumour.rd, normal.rd, tumour.rd.ori, normal.rd.ori, MinLogRatio, MaxLogRatio, BinNumber = Bin) + +#min_normal_rd_for_call=5 +#min_tumour_rd_for_call=0 +#min_avg_cov_for_call=20 +outdf$tumour.rd.ori = outdf$tumour.rd.ori-0.5 +outdf$normal.rd.ori = outdf$normal.rd.ori-0.5 +wh.to.excl = outdf$normal.rd.ori < min_normal_rd_for_call +wh.to.excl = wh.to.excl | outdf$tumour.rd.ori < min_tumour_rd_for_call +wh.to.excl = wh.to.excl | (outdf$tumour.rd.ori+outdf$normal.rd.ori)/2 < min_avg_cov_for_call +outdf$P.Value[wh.to.excl]=NA +outdf$Adjusted.P.Value[wh.to.excl]=NA + + +write.table(outdf,out.f,sep="\t",quote=F,row.names=F,col.names=T) + +#Plotting SD +#a.sd.fn = rep(fit.sd.a, length(logratios.sd.ori)) +#b.sd.fn = rep(fit.sd.b, length(logratios.sd.ori)) +#sd.after.fit = fit.sd.fn(logcov.bins.mean.ori, fit.sd.a, fit.sd.b) +#sd.out.f = paste(outf, "/plot/", sample.name, "sd.data_fit.", bins, "bins.txt", sep="") +#sd.outdf = data.frame(SD.Before.Fit = logratios.sd.ori, Log.Coverage = logcov.bins.mean.ori, SD.After.Fit = sd.after.fit, a.for.fitting=a.sd.fn, b.for.fitting=b.sd.fn) +#write.table(sd.outdf, sd.out.f,sep="\t", quote=F, row.names=F, col.names=T) + + +#End of the script +print ("End of cn_analysis.R") +print (i) + + +