Mercurial > repos > fcaramia > contra
comparison Contra/scripts/cn_analysis.v3.R @ 0:7564f3b1e675
Uploaded
| author | fcaramia |
|---|---|
| date | Thu, 13 Sep 2012 02:31:43 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:7564f3b1e675 |
|---|---|
| 1 # ----------------------------------------------------------------------# | |
| 2 # Copyright (c) 2011, Richard Lupat & Jason Li. | |
| 3 # | |
| 4 # > Source License < | |
| 5 # This file is part of CONTRA. | |
| 6 # | |
| 7 # CONTRA is free software: you can redistribute it and/or modify | |
| 8 # it under the terms of the GNU General Public License as published by | |
| 9 # the Free Software Foundation, either version 3 of the License, or | |
| 10 # (at your option) any later version. | |
| 11 # | |
| 12 # CONTRA is distributed in the hope that it will be useful, | |
| 13 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 15 # GNU General Public License for more details. | |
| 16 # | |
| 17 # You should have received a copy of the GNU General Public License | |
| 18 # along with CONTRA. If not, see <http://www.gnu.org/licenses/>. | |
| 19 # | |
| 20 # | |
| 21 #-----------------------------------------------------------------------# | |
| 22 # Last Updated : 31 Oct 2011 17:00PM | |
| 23 | |
| 24 | |
| 25 # Parameters Parsing (from Command Line) | |
| 26 options <- commandArgs(trailingOnly = T) | |
| 27 bins = as.integer(options[1]) | |
| 28 rd.cutoff = as.integer(options[2]) | |
| 29 min.bases = as.integer(options[3]) | |
| 30 outf = options[4] | |
| 31 sample.name = options[5] | |
| 32 plotoption = options[6] | |
| 33 actual.bin = as.numeric(options[7]) | |
| 34 min_normal_rd_for_call = as.numeric(options[8]) | |
| 35 min_tumour_rd_for_call = as.numeric(options[9]) | |
| 36 min_avg_cov_for_call = as.numeric(options[10]) | |
| 37 | |
| 38 if (sample.name == "No-SampleName") | |
| 39 sample.name = "" | |
| 40 | |
| 41 if (sample.name != "") | |
| 42 sample.name = paste(sample.name, ".", sep="") | |
| 43 | |
| 44 # Setup output name | |
| 45 out.f = paste(outf, "/table/", sample.name, "CNATable.", rd.cutoff,"rd.", min.bases,"bases.", bins,"bins.txt", sep="") | |
| 46 pdf.out.f = paste(outf, "/plot/", sample.name, "densityplot.", bins, "bins.pdf", sep="") | |
| 47 | |
| 48 # Open and read input files | |
| 49 # cnAverageFile = paste("bin", bins, ".txt", sep="") | |
| 50 cnAverageFile = paste(outf,"/buf/bin",bins,".txt",sep="") | |
| 51 boundariesFile = paste(outf,"/buf/bin",bins,".boundaries.txt",sep="") | |
| 52 print (cnAverageFile) | |
| 53 cn.average = read.delim(cnAverageFile, as.is=F, header=F) | |
| 54 cn.boundary= read.delim(boundariesFile,as.is=F, header=F) | |
| 55 | |
| 56 # Apply thresholds and data grouping | |
| 57 cn.average.aboveTs = cn.average[cn.average$V3>min.bases,] | |
| 58 cn.average.list = as.matrix(cn.average.aboveTs$V4) | |
| 59 | |
| 60 # Get the mean and sd for each bins | |
| 61 cn.average.mean = c() | |
| 62 cn.average.sd = c() | |
| 63 cn.average.log= c() | |
| 64 | |
| 65 # Density Plots for each bins | |
| 66 if (plotoption == "True"){ | |
| 67 pdf(pdf.out.f) | |
| 68 } | |
| 69 for (j in 1:actual.bin){ | |
| 70 cn.average.nth = as.matrix(cn.average.aboveTs[cn.average.aboveTs$V15==j,]$V4) | |
| 71 cn.coverage.nth = as.matrix(cn.average.aboveTs[cn.average.aboveTs$V15==j,]$V11) | |
| 72 boundary.end = cn.boundary[cn.boundary$V1==j,]$V2 | |
| 73 boundary.start = cn.boundary[cn.boundary$V1==(j-1),]$V2 | |
| 74 boundary.mid = (boundary.end+boundary.start)/2 | |
| 75 if (plotoption == "True") { | |
| 76 plot_title = paste("density: bin", bins, sep="") | |
| 77 #plot(density(cn.average.nth),xlim=c(-5,5), title=plot_title) | |
| 78 plot(density(cn.average.nth),xlim=c(-5,5)) | |
| 79 } | |
| 80 cn.average.mean = c(cn.average.mean, mean(cn.average.nth)) | |
| 81 # cn.average.sd = c(cn.average.sd, sd(cn.average.nth)) | |
| 82 cn.average.sd = c(cn.average.sd, apply(cn.average.nth,2,sd)) | |
| 83 #cn.average.log = c(cn.average.log, boundary.mid) | |
| 84 cn.average.log = c(cn.average.log, log(mean(cn.coverage.nth),2)) | |
| 85 } | |
| 86 if (plotoption == "True"){ | |
| 87 dev.off() | |
| 88 } | |
| 89 | |
| 90 # for point outside of boundaries | |
| 91 if (bins > 1) { | |
| 92 boundary.first = cn.boundary[cn.boundary$V1==0,]$V2 | |
| 93 boundary.last = cn.boundary[cn.boundary$V1==bins,]$V2 | |
| 94 | |
| 95 b.mean.y2 = cn.average.mean[2] | |
| 96 b.mean.y1 = cn.average.mean[1] | |
| 97 b.sd.y2 = cn.average.sd[2] | |
| 98 b.sd.y1 = cn.average.sd[1] | |
| 99 b.x2 = cn.average.log[2] | |
| 100 b.x1 = cn.average.log[1] | |
| 101 | |
| 102 boundary.f.mean = (((b.mean.y2- b.mean.y1)/(b.x2-b.x1))*(boundary.first-b.x1))+b.mean.y1 | |
| 103 boundary.f.sd = (((b.sd.y2- b.sd.y1)/(b.x2-b.x1))*(boundary.first-b.x1))+b.sd.y1 | |
| 104 | |
| 105 if (boundary.f.sd < 0){ | |
| 106 boundary.f.sd = 0 | |
| 107 } | |
| 108 | |
| 109 b.mean.y2 = cn.average.mean[bins] | |
| 110 b.mean.y1 = cn.average.mean[bins-1] | |
| 111 b.sd.y2 = cn.average.sd[bins] | |
| 112 b.sd.y1 = cn.average.sd[bins-1] | |
| 113 b.x2 = cn.average.log[bins] | |
| 114 b.x1 = cn.average.log[bins-1] | |
| 115 | |
| 116 boundary.l.mean = (((b.mean.y2- b.mean.y1)/(b.x2-b.x1))*(boundary.last-b.x1))+b.mean.y1 | |
| 117 boundary.l.sd = (((b.sd.y2- b.sd.y1)/(b.x2-b.x1))*(boundary.last-b.x1))+b.sd.y1 | |
| 118 | |
| 119 #cn.average.log = c(boundary.first, cn.average.log, boundary.last) | |
| 120 #cn.linear.mean = c(boundary.f.mean, cn.average.mean, boundary.l.mean) | |
| 121 #cn.linear.sd = c(boundary.f.sd, cn.average.sd, boundary.l.sd) | |
| 122 | |
| 123 cn.average.log = c(boundary.first, cn.average.log) | |
| 124 cn.linear.mean = c(boundary.f.mean, cn.average.mean) | |
| 125 cn.linear.sd = c(boundary.f.sd, cn.average.sd) | |
| 126 | |
| 127 } | |
| 128 | |
| 129 # Linear Interpolation | |
| 130 if (bins > 1 ){ | |
| 131 #print(cn.average.log) | |
| 132 #print(cn.linear.mean) | |
| 133 #print(cn.linear.sd) | |
| 134 mean.fn <- approxfun(cn.average.log, cn.linear.mean, rule=2) | |
| 135 sd.fn <- approxfun(cn.average.log, cn.linear.sd, rule=2) | |
| 136 } | |
| 137 | |
| 138 | |
| 139 # Put the data's details into matrices | |
| 140 ids = as.matrix(cn.average.aboveTs$V1) | |
| 141 exons = as.matrix(cn.average.aboveTs$V6) | |
| 142 exons.pos = as.matrix(cn.average.aboveTs$V5) | |
| 143 gs = as.matrix(cn.average.aboveTs$V2) | |
| 144 number.bases = as.matrix(cn.average.aboveTs$V3) | |
| 145 mean = as.matrix(cn.average.aboveTs$V4) | |
| 146 sd = as.matrix(cn.average.aboveTs$V7) | |
| 147 tumour.rd = as.matrix(cn.average.aboveTs$V8) | |
| 148 tumour.rd.ori = as.matrix(cn.average.aboveTs$V10) | |
| 149 normal.rd = as.matrix(cn.average.aboveTs$V9) | |
| 150 normal.rd.ori = as.matrix(cn.average.aboveTs$V11) | |
| 151 median = as.matrix(cn.average.aboveTs$V12) | |
| 152 MinLogRatio = as.matrix(cn.average.aboveTs$V13) | |
| 153 MaxLogRatio = as.matrix(cn.average.aboveTs$V14) | |
| 154 Bin = as.matrix(cn.average.aboveTs$V15) | |
| 155 Chr = as.matrix(cn.average.aboveTs$V16) | |
| 156 OriStCoordinate = as.matrix(cn.average.aboveTs$V17) | |
| 157 OriEndCoordinate= as.matrix(cn.average.aboveTs$V18) | |
| 158 | |
| 159 # Linear Fit | |
| 160 logratios.mean = mean | |
| 161 logcov.mean = log2((normal.rd + tumour.rd)/2) | |
| 162 fit.mean = lm(logratios.mean ~ logcov.mean) | |
| 163 fit.x = fit.mean$coefficient[1] | |
| 164 fit.y = fit.mean$coefficient[2] | |
| 165 | |
| 166 adjusted.lr = rep(NA, length(logratios.mean)) | |
| 167 for (j in 1:length(logratios.mean)){ | |
| 168 fitted.mean = fit.x + fit.y * logcov.mean[j] | |
| 169 adjusted.lr[j] = logratios.mean[j] - fitted.mean | |
| 170 } | |
| 171 | |
| 172 fit.mean2 = lm(adjusted.lr ~ logcov.mean) | |
| 173 fit.mean.a = fit.mean2$coefficient[1] | |
| 174 fit.mean.b = fit.mean2$coefficient[2] | |
| 175 | |
| 176 fit.mean.fn <- function(x, fit.a, fit.b){ | |
| 177 result = fit.a + fit.b * x | |
| 178 return (result) | |
| 179 } | |
| 180 | |
| 181 # Adjust SD based on the new adjusted log ratios | |
| 182 logratios.sd = c() | |
| 183 logcov.bins.mean= c() | |
| 184 for (j in 1:actual.bin){ | |
| 185 lr.bins.mean = as.matrix(adjusted.lr[cn.average.aboveTs$V15==j]) | |
| 186 # logratios.sd = c(logratios.sd, sd(lr.bins.mean)) | |
| 187 logratios.sd = c(logratios.sd, apply(lr.bins.mean,2,sd)) | |
| 188 | |
| 189 cn.coverage.tumour.nth = as.matrix(cn.average.aboveTs[cn.average.aboveTs$V15==j,]$V8) | |
| 190 cn.coverage.normal.nth = as.matrix(cn.average.aboveTs[cn.average.aboveTs$V15==j,]$V9) | |
| 191 cn.coverage.nth = (cn.coverage.tumour.nth + cn.coverage.normal.nth) /2 | |
| 192 logcov.bins.mean= c(logcov.bins.mean, log2(mean(cn.coverage.nth))) | |
| 193 | |
| 194 } | |
| 195 | |
| 196 logratios.sd.ori = logratios.sd | |
| 197 if (length(logratios.sd) > 2) { | |
| 198 logratios.sd = logratios.sd[-length(logratios.sd)] | |
| 199 } | |
| 200 | |
| 201 logcov.bins.mean.ori = logcov.bins.mean | |
| 202 if (length(logcov.bins.mean) > 2){ | |
| 203 logcov.bins.mean= logcov.bins.mean[-length(logcov.bins.mean)] | |
| 204 } | |
| 205 | |
| 206 fit.sd = lm(log2(logratios.sd) ~ logcov.bins.mean) | |
| 207 fit.sd.a = fit.sd$coefficient[1] | |
| 208 fit.sd.b = fit.sd$coefficient[2] | |
| 209 | |
| 210 fit.sd.fn <- function(x, fit.a, fit.b){ | |
| 211 result = 2 ^ (fit.mean.fn(x, fit.a, fit.b)) | |
| 212 return (result) | |
| 213 } | |
| 214 | |
| 215 # Get the P Values, called the gain/loss | |
| 216 # with average and sd from each bins | |
| 217 pVal.list = c() | |
| 218 gain.loss = c() | |
| 219 | |
| 220 for (i in 1:nrow(cn.average.list)){ | |
| 221 #print (i) | |
| 222 #logratio = cn.average.list[i] | |
| 223 #logcov = log(normal.rd.ori[i],2) | |
| 224 logratio = adjusted.lr[i] | |
| 225 logcov = logcov.mean[i] | |
| 226 exon.bin = Bin[i] | |
| 227 | |
| 228 if (length(logratios.sd) > 1){ | |
| 229 #pVal <- pnorm(logratio, fit.mean.fn(logcov, fit.mean.a, fit.mean.b), fit.sd.fn(logcov, fit.sd.a, fit.sd.b)) | |
| 230 pVal <- pnorm(logratio, fit.mean.fn(logcov, fit.mean.a, fit.mean.b), sd.fn(logcov)) | |
| 231 } else { | |
| 232 pVal <- pnorm(logratio, 0, logratios.sd[exon.bin]) | |
| 233 } | |
| 234 | |
| 235 if (pVal > 0.5){ | |
| 236 pVal = 1-pVal | |
| 237 gain.loss = c(gain.loss, "gain") | |
| 238 } else { | |
| 239 gain.loss = c(gain.loss, "loss") | |
| 240 } | |
| 241 pVal.list = c(pVal.list, pVal*2) | |
| 242 } | |
| 243 | |
| 244 # Get the adjusted P Values | |
| 245 adjusted.pVal.list = p.adjust(pVal.list, method="BH") | |
| 246 | |
| 247 # Write the output into a tab-delimited text files | |
| 248 outdf=data.frame(Targeted.Region.ID=ids,Exon.Number=exons,Gene.Sym=gs,Chr, OriStCoordinate, OriEndCoordinate, Mean.of.LogRatio=cn.average.list, Adjusted.Mean.of.LogRatio=adjusted.lr, SD.of.LogRatio=sd, Median.of.LogRatio=median, number.bases, P.Value=pVal.list ,Adjusted.P.Value=adjusted.pVal.list , gain.loss, tumour.rd, normal.rd, tumour.rd.ori, normal.rd.ori, MinLogRatio, MaxLogRatio, BinNumber = Bin) | |
| 249 | |
| 250 #min_normal_rd_for_call=5 | |
| 251 #min_tumour_rd_for_call=0 | |
| 252 #min_avg_cov_for_call=20 | |
| 253 outdf$tumour.rd.ori = outdf$tumour.rd.ori-0.5 | |
| 254 outdf$normal.rd.ori = outdf$normal.rd.ori-0.5 | |
| 255 wh.to.excl = outdf$normal.rd.ori < min_normal_rd_for_call | |
| 256 wh.to.excl = wh.to.excl | outdf$tumour.rd.ori < min_tumour_rd_for_call | |
| 257 wh.to.excl = wh.to.excl | (outdf$tumour.rd.ori+outdf$normal.rd.ori)/2 < min_avg_cov_for_call | |
| 258 outdf$P.Value[wh.to.excl]=NA | |
| 259 outdf$Adjusted.P.Value[wh.to.excl]=NA | |
| 260 | |
| 261 | |
| 262 write.table(outdf,out.f,sep="\t",quote=F,row.names=F,col.names=T) | |
| 263 | |
| 264 #Plotting SD | |
| 265 #a.sd.fn = rep(fit.sd.a, length(logratios.sd.ori)) | |
| 266 #b.sd.fn = rep(fit.sd.b, length(logratios.sd.ori)) | |
| 267 #sd.after.fit = fit.sd.fn(logcov.bins.mean.ori, fit.sd.a, fit.sd.b) | |
| 268 #sd.out.f = paste(outf, "/plot/", sample.name, "sd.data_fit.", bins, "bins.txt", sep="") | |
| 269 #sd.outdf = data.frame(SD.Before.Fit = logratios.sd.ori, Log.Coverage = logcov.bins.mean.ori, SD.After.Fit = sd.after.fit, a.for.fitting=a.sd.fn, b.for.fitting=b.sd.fn) | |
| 270 #write.table(sd.outdf, sd.out.f,sep="\t", quote=F, row.names=F, col.names=T) | |
| 271 | |
| 272 | |
| 273 #End of the script | |
| 274 print ("End of cn_analysis.R") | |
| 275 print (i) | |
| 276 | |
| 277 | |
| 278 |
