Mercurial > repos > davidvanzessen > argalaxy_tools
diff report_clonality/RScript.r @ 26:28fbbdfd7a87 draft
Uploaded
author | davidvanzessen |
---|---|
date | Mon, 13 Feb 2017 09:08:46 -0500 |
parents | 94765af0db1f |
children | 1f83e14f173b |
line wrap: on
line diff
--- a/report_clonality/RScript.r Thu Feb 09 07:20:09 2017 -0500 +++ b/report_clonality/RScript.r Mon Feb 13 09:08:46 2017 -0500 @@ -614,86 +614,86 @@ colnames(coincidence.table) = c("Coincidence Type", "Raw Coincidence Freq") write.table(coincidence.table, file=paste("lymphclon_coincidences_", sample_id, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T) } - } else if(clonality_method == "old") { - clonalFreq = data.frame(data.table(clonalityFrame)[, list(Type=.N), by=c("Sample", "clonaltype")]) - - #write files for every coincidence group of >1 - samples = unique(clonalFreq$Sample) - for(sample in samples){ - clonalFreqSample = clonalFreq[clonalFreq$Sample == sample,] - if(max(clonalFreqSample$Type) > 1){ - for(i in 2:max(clonalFreqSample$Type)){ - clonalFreqSampleType = clonalFreqSample[clonalFreqSample$Type == i,] - clonalityFrame.sub = clonalityFrame[clonalityFrame$clonaltype %in% clonalFreqSampleType$clonaltype,] - clonalityFrame.sub = clonalityFrame.sub[order(clonalityFrame.sub$clonaltype),] - write.table(clonalityFrame.sub, file=paste("coincidences_", sample, "_", i, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T) - } - } - } - - clonalFreqCount = data.frame(data.table(clonalFreq)[, list(Count=.N), by=c("Sample", "Type")]) - clonalFreqCount$realCount = clonalFreqCount$Type * clonalFreqCount$Count - clonalSum = data.frame(data.table(clonalFreqCount)[, list(Reads=sum(realCount)), by=c("Sample")]) - clonalFreqCount = merge(clonalFreqCount, clonalSum, by.x="Sample", by.y="Sample") - - ct = c('Type\tWeight\n2\t1\n3\t3\n4\t6\n5\t10\n6\t15') - tcct = textConnection(ct) - CT = read.table(tcct, sep="\t", header=TRUE) - close(tcct) - clonalFreqCount = merge(clonalFreqCount, CT, by.x="Type", by.y="Type", all.x=T) - clonalFreqCount$WeightedCount = clonalFreqCount$Count * clonalFreqCount$Weight - - ReplicateReads = data.frame(data.table(clonalityFrame)[, list(Type=.N), by=c("Sample", "Replicate", "clonaltype")]) - ReplicateReads = data.frame(data.table(ReplicateReads)[, list(Reads=.N), by=c("Sample", "Replicate")]) - clonalFreqCount$Reads = as.numeric(clonalFreqCount$Reads) - ReplicateReads$Reads = as.numeric(ReplicateReads$Reads) - ReplicateReads$squared = as.numeric(ReplicateReads$Reads * ReplicateReads$Reads) - - ReplicatePrint <- function(dat){ - write.table(dat[-1], paste("ReplicateReads_", unique(dat[1])[1,1] , ".csv", sep=""), sep=",",quote=F,na="-",row.names=F,col.names=F) - } - - ReplicateSplit = split(ReplicateReads, f=ReplicateReads[,"Sample"]) - lapply(ReplicateSplit, FUN=ReplicatePrint) - - ReplicateReads = data.frame(data.table(ReplicateReads)[, list(ReadsSum=sum(as.numeric(Reads)), ReadsSquaredSum=sum(as.numeric(squared))), by=c("Sample")]) - clonalFreqCount = merge(clonalFreqCount, ReplicateReads, by.x="Sample", by.y="Sample", all.x=T) - - ReplicateSumPrint <- function(dat){ - write.table(dat[-1], paste("ReplicateSumReads_", unique(dat[1])[1,1] , ".csv", sep=""), sep=",",quote=F,na="-",row.names=F,col.names=F) - } - - ReplicateSumSplit = split(ReplicateReads, f=ReplicateReads[,"Sample"]) - lapply(ReplicateSumSplit, FUN=ReplicateSumPrint) - - clonalFreqCountSum = data.frame(data.table(clonalFreqCount)[, list(Numerator=sum(WeightedCount, na.rm=T)), by=c("Sample")]) - clonalFreqCount = merge(clonalFreqCount, clonalFreqCountSum, by.x="Sample", by.y="Sample", all.x=T) - clonalFreqCount$ReadsSum = as.numeric(clonalFreqCount$ReadsSum) #prevent integer overflow - clonalFreqCount$Denominator = (((clonalFreqCount$ReadsSum * clonalFreqCount$ReadsSum) - clonalFreqCount$ReadsSquaredSum) / 2) - clonalFreqCount$Result = (clonalFreqCount$Numerator + 1) / (clonalFreqCount$Denominator + 1) - - ClonalityScorePrint <- function(dat){ - write.table(dat$Result, paste("ClonalityScore_", unique(dat[1])[1,1] , ".csv", sep=""), sep=",",quote=F,na="-",row.names=F,col.names=F) - } - - clonalityScore = clonalFreqCount[c("Sample", "Result")] - clonalityScore = unique(clonalityScore) - - clonalityScoreSplit = split(clonalityScore, f=clonalityScore[,"Sample"]) - lapply(clonalityScoreSplit, FUN=ClonalityScorePrint) - - clonalityOverview = clonalFreqCount[c("Sample", "Type", "Count", "Weight", "WeightedCount")] - - - - ClonalityOverviewPrint <- function(dat){ - dat = dat[order(dat[,2]),] - write.table(dat[-1], paste("ClonalityOverView_", unique(dat[1])[1,1] , ".csv", sep=""), sep=",",quote=F,na="-",row.names=F,col.names=F) - } - - clonalityOverviewSplit = split(clonalityOverview, f=clonalityOverview$Sample) - lapply(clonalityOverviewSplit, FUN=ClonalityOverviewPrint) + } + clonalFreq = data.frame(data.table(clonalityFrame)[, list(Type=.N), by=c("Sample", "clonaltype")]) + + #write files for every coincidence group of >1 + samples = unique(clonalFreq$Sample) + for(sample in samples){ + clonalFreqSample = clonalFreq[clonalFreq$Sample == sample,] + if(max(clonalFreqSample$Type) > 1){ + for(i in 2:max(clonalFreqSample$Type)){ + clonalFreqSampleType = clonalFreqSample[clonalFreqSample$Type == i,] + clonalityFrame.sub = clonalityFrame[clonalityFrame$clonaltype %in% clonalFreqSampleType$clonaltype,] + clonalityFrame.sub = clonalityFrame.sub[order(clonalityFrame.sub$clonaltype),] + write.table(clonalityFrame.sub, file=paste("coincidences_", sample, "_", i, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T) + } + } + } + + clonalFreqCount = data.frame(data.table(clonalFreq)[, list(Count=.N), by=c("Sample", "Type")]) + clonalFreqCount$realCount = clonalFreqCount$Type * clonalFreqCount$Count + clonalSum = data.frame(data.table(clonalFreqCount)[, list(Reads=sum(realCount)), by=c("Sample")]) + clonalFreqCount = merge(clonalFreqCount, clonalSum, by.x="Sample", by.y="Sample") + + ct = c('Type\tWeight\n2\t1\n3\t3\n4\t6\n5\t10\n6\t15') + tcct = textConnection(ct) + CT = read.table(tcct, sep="\t", header=TRUE) + close(tcct) + clonalFreqCount = merge(clonalFreqCount, CT, by.x="Type", by.y="Type", all.x=T) + clonalFreqCount$WeightedCount = clonalFreqCount$Count * clonalFreqCount$Weight + + ReplicateReads = data.frame(data.table(clonalityFrame)[, list(Type=.N), by=c("Sample", "Replicate", "clonaltype")]) + ReplicateReads = data.frame(data.table(ReplicateReads)[, list(Reads=.N), by=c("Sample", "Replicate")]) + clonalFreqCount$Reads = as.numeric(clonalFreqCount$Reads) + ReplicateReads$Reads = as.numeric(ReplicateReads$Reads) + ReplicateReads$squared = as.numeric(ReplicateReads$Reads * ReplicateReads$Reads) + + ReplicatePrint <- function(dat){ + write.table(dat[-1], paste("ReplicateReads_", unique(dat[1])[1,1] , ".txt", sep=""), sep="\t",quote=F,na="-",row.names=F,col.names=F) } + + ReplicateSplit = split(ReplicateReads, f=ReplicateReads[,"Sample"]) + lapply(ReplicateSplit, FUN=ReplicatePrint) + + ReplicateReads = data.frame(data.table(ReplicateReads)[, list(ReadsSum=sum(as.numeric(Reads)), ReadsSquaredSum=sum(as.numeric(squared))), by=c("Sample")]) + clonalFreqCount = merge(clonalFreqCount, ReplicateReads, by.x="Sample", by.y="Sample", all.x=T) + + ReplicateSumPrint <- function(dat){ + write.table(dat[-1], paste("ReplicateSumReads_", unique(dat[1])[1,1] , ".txt", sep=""), sep="\t",quote=F,na="-",row.names=F,col.names=F) + } + + ReplicateSumSplit = split(ReplicateReads, f=ReplicateReads[,"Sample"]) + lapply(ReplicateSumSplit, FUN=ReplicateSumPrint) + + clonalFreqCountSum = data.frame(data.table(clonalFreqCount)[, list(Numerator=sum(WeightedCount, na.rm=T)), by=c("Sample")]) + clonalFreqCount = merge(clonalFreqCount, clonalFreqCountSum, by.x="Sample", by.y="Sample", all.x=T) + clonalFreqCount$ReadsSum = as.numeric(clonalFreqCount$ReadsSum) #prevent integer overflow + clonalFreqCount$Denominator = (((clonalFreqCount$ReadsSum * clonalFreqCount$ReadsSum) - clonalFreqCount$ReadsSquaredSum) / 2) + clonalFreqCount$Result = (clonalFreqCount$Numerator + 1) / (clonalFreqCount$Denominator + 1) + + ClonalityScorePrint <- function(dat){ + write.table(dat$Result, paste("ClonalityScore_", unique(dat[1])[1,1] , ".txt", sep=""), sep="\t",quote=F,na="-",row.names=F,col.names=F) + } + + clonalityScore = clonalFreqCount[c("Sample", "Result")] + clonalityScore = unique(clonalityScore) + + clonalityScoreSplit = split(clonalityScore, f=clonalityScore[,"Sample"]) + lapply(clonalityScoreSplit, FUN=ClonalityScorePrint) + + clonalityOverview = clonalFreqCount[c("Sample", "Type", "Count", "Weight", "WeightedCount")] + + + + ClonalityOverviewPrint <- function(dat){ + dat = dat[order(dat[,2]),] + write.table(dat[-1], paste("ClonalityOverView_", unique(dat[1])[1,1] , ".txt", sep=""), sep="\t",quote=F,na="-",row.names=F,col.names=F) + } + + clonalityOverviewSplit = split(clonalityOverview, f=clonalityOverview$Sample) + lapply(clonalityOverviewSplit, FUN=ClonalityOverviewPrint) + } bak = PRODF @@ -724,16 +724,14 @@ PRODF.with.D = PRODF[nchar(PRODF$Top.D.Gene, keepNA=F) > 2,] PRODF.no.D = PRODF[nchar(PRODF$Top.D.Gene, keepNA=F) < 4,] + write.table(PRODF.no.D, "productive_no_D.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=T) UNPROD.with.D = UNPROD[nchar(UNPROD$Top.D.Gene, keepNA=F) > 2,] UNPROD.no.D = UNPROD[nchar(UNPROD$Top.D.Gene, keepNA=F) < 4,] + write.table(UNPROD.no.D, "unproductive_no_D.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=T) num_median = function(x, na.rm=T) { as.numeric(median(x, na.rm=na.rm)) } - print("---- table prod.with.d cdr3.length ----") - print(table(PRODF.with.D$CDR3.Length, useNA="ifany")) - print(median(PRODF.with.D$CDR3.Length, na.rm=T)) - newData = data.frame(data.table(PRODF.with.D)[,list(unique=.N, VH.DEL=mean(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T), P1=mean(.SD$P3V.nt.nb, na.rm=T), @@ -772,10 +770,6 @@ newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1) write.table(newData, "junctionAnalysisProd_median_wD.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=F) - print("---- table unprod.with.d cdr3.length ----") - print(table(UNPROD.with.D$CDR3.Length, useNA="ifany")) - print(median(UNPROD.with.D$CDR3.Length, na.rm=T)) - newData = data.frame(data.table(UNPROD.with.D)[,list(unique=.N, VH.DEL=mean(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T), P1=mean(.SD$P3V.nt.nb, na.rm=T), @@ -819,11 +813,11 @@ newData = data.frame(data.table(PRODF.no.D)[,list(unique=.N, VH.DEL=mean(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T), P1=mean(.SD$P3V.nt.nb, na.rm=T), - N1=mean(rowSums(.SD[,c("N.REGION.nt.nb"), with=F], na.rm=T)), + N1=mean(.SD$N.REGION.nt.nb, na.rm=T), P2=mean(.SD$P5J.nt.nb, na.rm=T), DEL.JH=mean(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T), Total.Del=mean(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)), - Total.N=mean(rowSums(.SD[,c("N.REGION.nt.nb"), with=F], na.rm=T)), + Total.N=mean(.SD$N.REGION.nt.nb, na.rm=T), Total.P=mean(rowSums(.SD[,c("P3V.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)), Median.CDR3.l=as.double(as.numeric(median(.SD$CDR3.Length, na.rm=T)))), by=c("Sample")]) @@ -833,39 +827,43 @@ newData = data.frame(data.table(PRODF.no.D)[,list(unique=.N, VH.DEL=num_median(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T), P1=num_median(.SD$P3V.nt.nb, na.rm=T), - N1=num_median(rowSums(.SD[,c("N.REGION.nt.nb"), with=F], na.rm=T)), + N1=median(.SD$N.REGION.nt.nb, na.rm=T), P2=num_median(.SD$P5J.nt.nb, na.rm=T), DEL.JH=num_median(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T), Total.Del=num_median(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)), - Total.N=num_median(rowSums(.SD[,c("N.REGION.nt.nb"), with=F], na.rm=T)), + Total.N=median(.SD$N.REGION.nt.nb, na.rm=T), Total.P=num_median(rowSums(.SD[,c("P3V.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)), Median.CDR3.l=as.double(as.numeric(median(.SD$CDR3.Length, na.rm=T)))), by=c("Sample")]) newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1) write.table(newData, "junctionAnalysisProd_median_nD.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=F) + print(paste("mean N:", mean(UNPROD.no.D$N.REGION.nt.nb, na.rm=T))) + newData = data.frame(data.table(UNPROD.no.D)[,list(unique=.N, VH.DEL=mean(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T), P1=mean(.SD$P3V.nt.nb, na.rm=T), - N1=mean(rowSums(.SD[,c("N.REGION.nt.nb"), with=F], na.rm=T)), + N1=mean(.SD$N.REGION.nt.nb, na.rm=T), P2=mean(.SD$P5J.nt.nb, na.rm=T), DEL.JH=mean(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T), Total.Del=mean(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)), - Total.N=mean(rowSums(.SD[,c("N.REGION.nt.nb"), with=F], na.rm=T)), + Total.N=mean(.SD$N.REGION.nt.nb, na.rm=T), Total.P=mean(rowSums(.SD[,c("P3V.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)), Median.CDR3.l=as.double(as.numeric(median(.SD$CDR3.Length, na.rm=T)))), by=c("Sample")]) newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1) write.table(newData, "junctionAnalysisUnProd_mean_nD.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=F) + print(paste("median N:", num_median(UNPROD.no.D$N.REGION.nt.nb, na.rm=T))) + newData = data.frame(data.table(UNPROD.no.D)[,list(unique=.N, VH.DEL=num_median(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T), P1=num_median(.SD$P3V.nt.nb, na.rm=T), - N1=num_median(rowSums(.SD[,c("N.REGION.nt.nb"), with=F], na.rm=T)), + N1=median(.SD$N.REGION.nt.nb, na.rm=T), P2=num_median(.SD$P5J.nt.nb, na.rm=T), DEL.JH=num_median(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T), Total.Del=num_median(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)), - Total.N=num_median(rowSums(.SD[,c("N.REGION.nt.nb"), with=F], na.rm=T)), + Total.N=median(.SD$N.REGION.nt.nb, na.rm=T), Total.P=num_median(rowSums(.SD[,c("P3V.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)), Median.CDR3.l=as.double(as.numeric(median(.SD$CDR3.Length, na.rm=T)))), by=c("Sample")]) @@ -960,20 +958,13 @@ clonaltype.in.replicates$clonaltype = do.call(paste, c(clonaltype.in.replicates[clonaltype], sep = ":")) clonaltype.in.replicates = clonaltype.in.replicates[,c("clonaltype","Replicate", "ID", "Sequence", "Sample")] -print(head(clonaltype.in.replicates)) - clonaltype.counts = data.frame(table(clonaltype.in.replicates$clonaltype)) names(clonaltype.counts) = c("clonaltype", "coincidence") -print(head(clonaltype.counts)) - clonaltype.counts = clonaltype.counts[clonaltype.counts$coincidence > 1,] -head(clonaltype.counts) - clonaltype.in.replicates = clonaltype.in.replicates[clonaltype.in.replicates$clonaltype %in% clonaltype.counts$clonaltype,] clonaltype.in.replicates = merge(clonaltype.in.replicates, clonaltype.counts, by="clonaltype") -print(head(clonaltype.in.replicates)) clonaltype.in.replicates = clonaltype.in.replicates[order(clonaltype.in.replicates$clonaltype),c("coincidence","clonaltype", "Sample", "Replicate", "ID", "Sequence")] write.table(clonaltype.in.replicates, "clonaltypes_replicates.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=T)