Mercurial > repos > davidvanzessen > argalaxy_tools
diff report_clonality/RScript.r @ 24:d5d203d38c8a draft
Uploaded
author | davidvanzessen |
---|---|
date | Wed, 01 Feb 2017 09:48:38 -0500 |
parents | 9185c3dfc679 |
children | 94765af0db1f |
line wrap: on
line diff
--- a/report_clonality/RScript.r Fri Jan 27 04:29:43 2017 -0500 +++ b/report_clonality/RScript.r Wed Feb 01 09:48:38 2017 -0500 @@ -49,6 +49,7 @@ inputdata = read.table(infile, sep="\t", header=TRUE, fill=T, comment.char="", stringsAsFactors=F) + print(paste("nrows: ", nrow(inputdata))) setwd(outdir) @@ -121,9 +122,6 @@ clonalityFrame = clonalityFrame[!duplicated(clonalityFrame$clonality_clonaltype), ] } -print("SAMPLE TABLE:") -print(table(PRODF$Sample)) - prod.unique.sample.count = data.frame(data.table(PRODF)[, list(Productive_unique=.N), by=c("Sample")]) prod.unique.rep.count = data.frame(data.table(PRODF)[, list(Productive_unique=.N), by=c("Sample", "Replicate")]) @@ -162,6 +160,9 @@ #write.table(PRODF, "allUnique.csv", sep=",",quote=F,row.names=F,col.names=T) write.table(UNPROD, "allUnproductive.csv", sep=",",quote=F,row.names=F,col.names=T) +print("SAMPLE TABLE:") +print(table(PRODF$Sample)) + #write the samples to a file sampleFile <- file("samples.txt") un = unique(inputdata$Sample) @@ -383,7 +384,7 @@ png("CDR3LengthPlot.png",width = 1280, height = 720) CDR3LengthPlot dev.off() -write.table(x=CDR3Length, file="CDR3LengthPlot.csv", sep=",",quote=F,row.names=F,col.names=T) +write.table(x=CDR3Length, file="CDR3LengthPlot.txt", sep="\t",quote=F,row.names=F,col.names=T) # ---------------------- Plot the heatmaps ---------------------- @@ -412,7 +413,7 @@ png(paste("HeatmapVD_", unique(dat[3])[1,1] , ".png", sep=""), width=150+(15*length(Dchain$v.name)), height=100+(15*length(Vchain$v.name))) print(img) dev.off() - write.table(x=acast(dat, Top.V.Gene~Top.D.Gene, value.var="Length"), file=paste("HeatmapVD_", unique(dat[3])[1,1], ".csv", sep=""), sep=",",quote=F,row.names=T,col.names=NA) + write.table(x=acast(dat, Top.V.Gene~Top.D.Gene, value.var="Length"), file=paste("HeatmapVD_", unique(dat[3])[1,1], ".txt", sep=""), sep="\t",quote=F,row.names=T,col.names=NA) } VandDCount = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Top.V.Gene", "Top.D.Gene", "Sample")]) @@ -462,7 +463,7 @@ png(paste("HeatmapVJ_", unique(dat[3])[1,1] , ".png", sep=""), width=150+(15*length(Jchain$v.name)), height=100+(15*length(Vchain$v.name))) print(img) dev.off() - write.table(x=acast(dat, Top.V.Gene~Top.J.Gene, value.var="Length"), file=paste("HeatmapVJ_", unique(dat[3])[1,1], ".csv", sep=""), sep=",",quote=F,row.names=T,col.names=NA) + write.table(x=acast(dat, Top.V.Gene~Top.J.Gene, value.var="Length"), file=paste("HeatmapVJ_", unique(dat[3])[1,1], ".txt", sep=""), sep="\t",quote=F,row.names=T,col.names=NA) } VandJCount = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Top.V.Gene", "Top.J.Gene", "Sample")]) @@ -511,7 +512,7 @@ png(paste("HeatmapDJ_", unique(dat[3])[1,1] , ".png", sep=""), width=150+(15*length(Jchain$v.name)), height=100+(15*length(Dchain$v.name))) print(img) dev.off() - write.table(x=acast(dat, Top.D.Gene~Top.J.Gene, value.var="Length"), file=paste("HeatmapDJ_", unique(dat[3])[1,1], ".csv", sep=""), sep=",",quote=F,row.names=T,col.names=NA) + write.table(x=acast(dat, Top.D.Gene~Top.J.Gene, value.var="Length"), file=paste("HeatmapDJ_", unique(dat[3])[1,1], ".txt", sep=""), sep="\t",quote=F,row.names=T,col.names=NA) } @@ -701,17 +702,7 @@ if(all(imgtcolumns %in% colnames(inputdata))) { print("found IMGT columns, running junction analysis") - - if(locus %in% c("IGK","IGL", "TRA", "TRG")){ - print("VJ recombination, no filtering on absent D") - } else { - print("VDJ recombination, using N column for junction analysis") - fltr = nchar(PRODF$Top.D.Gene) < 4 - print(paste("Removing", sum(fltr), "sequences without a identified D")) - PRODF = PRODF[!fltr,] - } - - + #ensure certain columns are in the data (files generated with older versions of IMGT Loader) col.checks = c("N.REGION.nt.nb", "N1.REGION.nt.nb", "N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb") for(col.check in col.checks){ @@ -730,9 +721,12 @@ } } + PRODF.with.D = PRODF[nchar(PRODF$Top.D.Gene, keepNA=F) > 2,] + PRODF.no.D = PRODF[nchar(PRODF$Top.D.Gene, keepNA=F) < 4,] + num_median = function(x, na.rm=T) { as.numeric(median(x, na.rm=na.rm)) } - newData = data.frame(data.table(PRODF)[,list(unique=.N, + newData = data.frame(data.table(PRODF.with.D)[,list(unique=.N, VH.DEL=mean(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T), P1=mean(.SD$P3V.nt.nb, na.rm=T), N1=mean(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb"), with=F], na.rm=T)), @@ -749,9 +743,9 @@ Median.CDR3.l=as.double(median(.SD$CDR3.Length))), by=c("Sample")]) newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1) - write.table(newData, "junctionAnalysisProd_mean.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F) + write.table(newData, "junctionAnalysisProd_mean_wD.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=F) - newData = data.frame(data.table(PRODF)[,list(unique=.N, + newData = data.frame(data.table(PRODF.with.D)[,list(unique=.N, VH.DEL=num_median(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T), P1=num_median(.SD$P3V.nt.nb, na.rm=T), N1=num_median(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb"), with=F], na.rm=T)), @@ -768,9 +762,9 @@ Median.CDR3.l=as.double(median(.SD$CDR3.Length))), by=c("Sample")]) newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1) - write.table(newData, "junctionAnalysisProd_median.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F) + write.table(newData, "junctionAnalysisProd_median_wD.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=F) - newData = data.frame(data.table(UNPROD)[,list(unique=.N, + newData = data.frame(data.table(PRODF.with.D)[,list(unique=.N, VH.DEL=mean(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T), P1=mean(.SD$P3V.nt.nb, na.rm=T), N1=mean(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb"), with=F], na.rm=T)), @@ -787,9 +781,9 @@ Median.CDR3.l=as.double(median(.SD$CDR3.Length))), by=c("Sample")]) newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1) - write.table(newData, "junctionAnalysisUnProd_mean.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F) + write.table(newData, "junctionAnalysisUnProd_mean_wD.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=F) - newData = data.frame(data.table(UNPROD)[,list(unique=.N, + newData = data.frame(data.table(PRODF.with.D)[,list(unique=.N, VH.DEL=num_median(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T), P1=num_median(.SD$P3V.nt.nb, na.rm=T), N1=num_median(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb"), with=F], na.rm=T)), @@ -805,9 +799,67 @@ Total.P=num_median(rowSums(.SD[,c("P3V.nt.nb", "P5D.nt.nb", "P3D.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)), Median.CDR3.l=as.double(median(.SD$CDR3.Length))), by=c("Sample")]) - + newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1) + write.table(newData, "junctionAnalysisUnProd_median_wD.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=F) + + #---------------- again for no-D + + newData = data.frame(data.table(PRODF.no.D)[,list(unique=.N, + VH.DEL=mean(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T), + P1=mean(.SD$P3V.nt.nb, na.rm=T), + N1=mean(rowSums(.SD[,c("N.REGION.nt.nb"), with=F], na.rm=T)), + P2=mean(.SD$P5J.nt.nb, na.rm=T), + DEL.JH=mean(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T), + Total.Del=mean(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)), + Total.N=mean(rowSums(.SD[,c("N.REGION.nt.nb"), with=F], na.rm=T)), + Total.P=mean(rowSums(.SD[,c("P3V.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)), + Median.CDR3.l=as.double(median(.SD$CDR3.Length))), + by=c("Sample")]) + print(newData) newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1) - write.table(newData, "junctionAnalysisUnProd_median.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F) + write.table(newData, "junctionAnalysisProd_mean_nD.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=F) + + newData = data.frame(data.table(PRODF.no.D)[,list(unique=.N, + VH.DEL=num_median(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T), + P1=num_median(.SD$P3V.nt.nb, na.rm=T), + N1=num_median(rowSums(.SD[,c("N.REGION.nt.nb"), with=F], na.rm=T)), + P2=num_median(.SD$P5J.nt.nb, na.rm=T), + DEL.JH=num_median(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T), + Total.Del=num_median(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)), + Total.N=num_median(rowSums(.SD[,c("N.REGION.nt.nb"), with=F], na.rm=T)), + Total.P=num_median(rowSums(.SD[,c("P3V.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)), + Median.CDR3.l=as.double(median(.SD$CDR3.Length))), + by=c("Sample")]) + newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1) + write.table(newData, "junctionAnalysisProd_median_nD.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=F) + + newData = data.frame(data.table(PRODF.no.D)[,list(unique=.N, + VH.DEL=mean(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T), + P1=mean(.SD$P3V.nt.nb, na.rm=T), + N1=mean(rowSums(.SD[,c("N.REGION.nt.nb"), with=F], na.rm=T)), + P2=mean(.SD$P5J.nt.nb, na.rm=T), + DEL.JH=mean(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T), + Total.Del=mean(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)), + Total.N=mean(rowSums(.SD[,c("N.REGION.nt.nb"), with=F], na.rm=T)), + Total.P=mean(rowSums(.SD[,c("P3V.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)), + Median.CDR3.l=as.double(median(.SD$CDR3.Length))), + by=c("Sample")]) + newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1) + write.table(newData, "junctionAnalysisUnProd_mean_nD.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=F) + + newData = data.frame(data.table(PRODF.no.D)[,list(unique=.N, + VH.DEL=num_median(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T), + P1=num_median(.SD$P3V.nt.nb, na.rm=T), + N1=num_median(rowSums(.SD[,c("N.REGION.nt.nb"), with=F], na.rm=T)), + P2=num_median(.SD$P5J.nt.nb, na.rm=T), + DEL.JH=num_median(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T), + Total.Del=num_median(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)), + Total.N=num_median(rowSums(.SD[,c("N.REGION.nt.nb"), with=F], na.rm=T)), + Total.P=num_median(rowSums(.SD[,c("P3V.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)), + Median.CDR3.l=as.double(median(.SD$CDR3.Length))), + by=c("Sample")]) + newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1) + write.table(newData, "junctionAnalysisUnProd_median_nD.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=F) } PRODF = bak @@ -822,12 +874,18 @@ D.REGION.reading.frame[chck,"D.REGION.reading.frame"] = "No D" } -D.REGION.reading.frame = data.frame(data.table(D.REGION.reading.frame)[, list(Freq=.N), by=c("Sample", "D.REGION.reading.frame")]) +D.REGION.reading.frame.1 = data.frame(data.table(D.REGION.reading.frame)[, list(Freq=.N), by=c("Sample", "D.REGION.reading.frame")]) + +D.REGION.reading.frame.2 = data.frame(data.table(D.REGION.reading.frame)[, list(sample.sum=sum(as.numeric(.SD$D.REGION.reading.frame), na.rm=T)), by=c("Sample")]) -write.table(D.REGION.reading.frame, "DReadingFrame.csv" , sep="\t",quote=F,row.names=F,col.names=T) +D.REGION.reading.frame = merge(D.REGION.reading.frame.1, D.REGION.reading.frame.2, by="Sample") + +D.REGION.reading.frame$percentage = round(D.REGION.reading.frame$Freq / D.REGION.reading.frame$sample.sum * 100, 1) + +write.table(D.REGION.reading.frame, "DReadingFrame.txt" , sep="\t",quote=F,row.names=F,col.names=T) D.REGION.reading.frame = ggplot(D.REGION.reading.frame) -D.REGION.reading.frame = D.REGION.reading.frame + geom_bar(aes( x = D.REGION.reading.frame, y = Freq, fill=Sample), stat='identity', position='dodge' ) + ggtitle("D reading frame") + xlab("Frequency") + ylab("Frame") +D.REGION.reading.frame = D.REGION.reading.frame + geom_bar(aes( x = D.REGION.reading.frame, y = percentage, fill=Sample), stat='identity', position='dodge' ) + ggtitle("D reading frame") + xlab("Frequency") + ylab("Frame") D.REGION.reading.frame = D.REGION.reading.frame + scale_fill_manual(values=sample.colors) D.REGION.reading.frame = D.REGION.reading.frame + theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank()) @@ -878,6 +936,6 @@ # ---------------------- AA median CDR3 length ---------------------- -median.aa.l = data.frame(data.table(PRODF)[, list(median=as.double(median(.SD$CDR3.Length))), by=c("Sample")]) -write.table(median.aa.l, "AAMedianBySample.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F) +median.aa.l = data.frame(data.table(PRODF)[, list(median=as.double(median(as.numeric(.SD$CDR3.Length, na.rm=T), na.rm=T))), by=c("Sample")]) +write.table(median.aa.l, "AAMedianBySample.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=F)