diff report_clonality/RScript.r @ 26:28fbbdfd7a87 draft

Uploaded
author davidvanzessen
date Mon, 13 Feb 2017 09:08:46 -0500
parents 94765af0db1f
children 1f83e14f173b
line wrap: on
line diff
--- a/report_clonality/RScript.r	Thu Feb 09 07:20:09 2017 -0500
+++ b/report_clonality/RScript.r	Mon Feb 13 09:08:46 2017 -0500
@@ -614,86 +614,86 @@
       colnames(coincidence.table) = c("Coincidence Type",  "Raw Coincidence Freq")
       write.table(coincidence.table, file=paste("lymphclon_coincidences_", sample_id, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T)
     }
-  } else if(clonality_method == "old") {
-    clonalFreq = data.frame(data.table(clonalityFrame)[, list(Type=.N), by=c("Sample", "clonaltype")])
-    
-    #write files for every coincidence group of >1
-    samples = unique(clonalFreq$Sample)
-    for(sample in samples){
-		clonalFreqSample = clonalFreq[clonalFreq$Sample == sample,]
-		if(max(clonalFreqSample$Type) > 1){
-			for(i in 2:max(clonalFreqSample$Type)){
-				clonalFreqSampleType = clonalFreqSample[clonalFreqSample$Type == i,]
-				clonalityFrame.sub = clonalityFrame[clonalityFrame$clonaltype %in% clonalFreqSampleType$clonaltype,]
-				clonalityFrame.sub = clonalityFrame.sub[order(clonalityFrame.sub$clonaltype),]
-				write.table(clonalityFrame.sub, file=paste("coincidences_", sample, "_", i, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T)
-			}
-		}
-	}
-	
-    clonalFreqCount = data.frame(data.table(clonalFreq)[, list(Count=.N), by=c("Sample", "Type")])
-    clonalFreqCount$realCount = clonalFreqCount$Type * clonalFreqCount$Count
-    clonalSum = data.frame(data.table(clonalFreqCount)[, list(Reads=sum(realCount)), by=c("Sample")])
-    clonalFreqCount = merge(clonalFreqCount, clonalSum, by.x="Sample", by.y="Sample")
-    
-    ct = c('Type\tWeight\n2\t1\n3\t3\n4\t6\n5\t10\n6\t15')
-    tcct = textConnection(ct)
-    CT  = read.table(tcct, sep="\t", header=TRUE)
-    close(tcct)
-    clonalFreqCount = merge(clonalFreqCount, CT, by.x="Type", by.y="Type", all.x=T)
-    clonalFreqCount$WeightedCount = clonalFreqCount$Count * clonalFreqCount$Weight
-    
-    ReplicateReads = data.frame(data.table(clonalityFrame)[, list(Type=.N), by=c("Sample", "Replicate", "clonaltype")])
-    ReplicateReads = data.frame(data.table(ReplicateReads)[, list(Reads=.N), by=c("Sample", "Replicate")])
-    clonalFreqCount$Reads = as.numeric(clonalFreqCount$Reads)
-    ReplicateReads$Reads = as.numeric(ReplicateReads$Reads)
-    ReplicateReads$squared = as.numeric(ReplicateReads$Reads * ReplicateReads$Reads)
-    
-    ReplicatePrint <- function(dat){
-      write.table(dat[-1], paste("ReplicateReads_", unique(dat[1])[1,1] , ".csv", sep=""), sep=",",quote=F,na="-",row.names=F,col.names=F)
-    }
-    
-    ReplicateSplit = split(ReplicateReads, f=ReplicateReads[,"Sample"])
-    lapply(ReplicateSplit, FUN=ReplicatePrint)
-    
-    ReplicateReads = data.frame(data.table(ReplicateReads)[, list(ReadsSum=sum(as.numeric(Reads)), ReadsSquaredSum=sum(as.numeric(squared))), by=c("Sample")])
-    clonalFreqCount = merge(clonalFreqCount, ReplicateReads, by.x="Sample", by.y="Sample", all.x=T)
-    
-    ReplicateSumPrint <- function(dat){
-      write.table(dat[-1], paste("ReplicateSumReads_", unique(dat[1])[1,1] , ".csv", sep=""), sep=",",quote=F,na="-",row.names=F,col.names=F)
-    }
-    
-    ReplicateSumSplit = split(ReplicateReads, f=ReplicateReads[,"Sample"])
-    lapply(ReplicateSumSplit, FUN=ReplicateSumPrint)
-    
-    clonalFreqCountSum = data.frame(data.table(clonalFreqCount)[, list(Numerator=sum(WeightedCount, na.rm=T)), by=c("Sample")])
-    clonalFreqCount = merge(clonalFreqCount, clonalFreqCountSum, by.x="Sample", by.y="Sample", all.x=T)
-    clonalFreqCount$ReadsSum = as.numeric(clonalFreqCount$ReadsSum) #prevent integer overflow
-    clonalFreqCount$Denominator = (((clonalFreqCount$ReadsSum * clonalFreqCount$ReadsSum) - clonalFreqCount$ReadsSquaredSum) / 2)
-    clonalFreqCount$Result = (clonalFreqCount$Numerator + 1) / (clonalFreqCount$Denominator + 1)
-    
-    ClonalityScorePrint <- function(dat){
-      write.table(dat$Result, paste("ClonalityScore_", unique(dat[1])[1,1] , ".csv", sep=""), sep=",",quote=F,na="-",row.names=F,col.names=F)
-    }
-    
-    clonalityScore = clonalFreqCount[c("Sample", "Result")]
-    clonalityScore = unique(clonalityScore)
-    
-    clonalityScoreSplit = split(clonalityScore, f=clonalityScore[,"Sample"])
-    lapply(clonalityScoreSplit, FUN=ClonalityScorePrint)
-    
-    clonalityOverview = clonalFreqCount[c("Sample", "Type", "Count", "Weight", "WeightedCount")]
-    
-    
-    
-    ClonalityOverviewPrint <- function(dat){
-	  dat = dat[order(dat[,2]),]
-      write.table(dat[-1], paste("ClonalityOverView_", unique(dat[1])[1,1] , ".csv", sep=""), sep=",",quote=F,na="-",row.names=F,col.names=F)
-    }
-    
-    clonalityOverviewSplit = split(clonalityOverview, f=clonalityOverview$Sample)
-    lapply(clonalityOverviewSplit, FUN=ClonalityOverviewPrint)
+  }
+  clonalFreq = data.frame(data.table(clonalityFrame)[, list(Type=.N), by=c("Sample", "clonaltype")])
+
+  #write files for every coincidence group of >1
+  samples = unique(clonalFreq$Sample)
+  for(sample in samples){
+      clonalFreqSample = clonalFreq[clonalFreq$Sample == sample,]
+      if(max(clonalFreqSample$Type) > 1){
+          for(i in 2:max(clonalFreqSample$Type)){
+              clonalFreqSampleType = clonalFreqSample[clonalFreqSample$Type == i,]
+              clonalityFrame.sub = clonalityFrame[clonalityFrame$clonaltype %in% clonalFreqSampleType$clonaltype,]
+              clonalityFrame.sub = clonalityFrame.sub[order(clonalityFrame.sub$clonaltype),]
+              write.table(clonalityFrame.sub, file=paste("coincidences_", sample, "_", i, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T)
+          }
+      }
+  }
+
+  clonalFreqCount = data.frame(data.table(clonalFreq)[, list(Count=.N), by=c("Sample", "Type")])
+  clonalFreqCount$realCount = clonalFreqCount$Type * clonalFreqCount$Count
+  clonalSum = data.frame(data.table(clonalFreqCount)[, list(Reads=sum(realCount)), by=c("Sample")])
+  clonalFreqCount = merge(clonalFreqCount, clonalSum, by.x="Sample", by.y="Sample")
+
+  ct = c('Type\tWeight\n2\t1\n3\t3\n4\t6\n5\t10\n6\t15')
+  tcct = textConnection(ct)
+  CT  = read.table(tcct, sep="\t", header=TRUE)
+  close(tcct)
+  clonalFreqCount = merge(clonalFreqCount, CT, by.x="Type", by.y="Type", all.x=T)
+  clonalFreqCount$WeightedCount = clonalFreqCount$Count * clonalFreqCount$Weight
+
+  ReplicateReads = data.frame(data.table(clonalityFrame)[, list(Type=.N), by=c("Sample", "Replicate", "clonaltype")])
+  ReplicateReads = data.frame(data.table(ReplicateReads)[, list(Reads=.N), by=c("Sample", "Replicate")])
+  clonalFreqCount$Reads = as.numeric(clonalFreqCount$Reads)
+  ReplicateReads$Reads = as.numeric(ReplicateReads$Reads)
+  ReplicateReads$squared = as.numeric(ReplicateReads$Reads * ReplicateReads$Reads)
+
+  ReplicatePrint <- function(dat){
+      write.table(dat[-1], paste("ReplicateReads_", unique(dat[1])[1,1] , ".txt", sep=""), sep="\t",quote=F,na="-",row.names=F,col.names=F)
   }
+
+  ReplicateSplit = split(ReplicateReads, f=ReplicateReads[,"Sample"])
+  lapply(ReplicateSplit, FUN=ReplicatePrint)
+
+  ReplicateReads = data.frame(data.table(ReplicateReads)[, list(ReadsSum=sum(as.numeric(Reads)), ReadsSquaredSum=sum(as.numeric(squared))), by=c("Sample")])
+  clonalFreqCount = merge(clonalFreqCount, ReplicateReads, by.x="Sample", by.y="Sample", all.x=T)
+
+  ReplicateSumPrint <- function(dat){
+      write.table(dat[-1], paste("ReplicateSumReads_", unique(dat[1])[1,1] , ".txt", sep=""), sep="\t",quote=F,na="-",row.names=F,col.names=F)
+  }
+
+  ReplicateSumSplit = split(ReplicateReads, f=ReplicateReads[,"Sample"])
+  lapply(ReplicateSumSplit, FUN=ReplicateSumPrint)
+
+  clonalFreqCountSum = data.frame(data.table(clonalFreqCount)[, list(Numerator=sum(WeightedCount, na.rm=T)), by=c("Sample")])
+  clonalFreqCount = merge(clonalFreqCount, clonalFreqCountSum, by.x="Sample", by.y="Sample", all.x=T)
+  clonalFreqCount$ReadsSum = as.numeric(clonalFreqCount$ReadsSum) #prevent integer overflow
+  clonalFreqCount$Denominator = (((clonalFreqCount$ReadsSum * clonalFreqCount$ReadsSum) - clonalFreqCount$ReadsSquaredSum) / 2)
+  clonalFreqCount$Result = (clonalFreqCount$Numerator + 1) / (clonalFreqCount$Denominator + 1)
+
+  ClonalityScorePrint <- function(dat){
+      write.table(dat$Result, paste("ClonalityScore_", unique(dat[1])[1,1] , ".txt", sep=""), sep="\t",quote=F,na="-",row.names=F,col.names=F)
+  }
+
+  clonalityScore = clonalFreqCount[c("Sample", "Result")]
+  clonalityScore = unique(clonalityScore)
+
+  clonalityScoreSplit = split(clonalityScore, f=clonalityScore[,"Sample"])
+  lapply(clonalityScoreSplit, FUN=ClonalityScorePrint)
+
+  clonalityOverview = clonalFreqCount[c("Sample", "Type", "Count", "Weight", "WeightedCount")]
+
+
+
+  ClonalityOverviewPrint <- function(dat){
+      dat = dat[order(dat[,2]),]
+      write.table(dat[-1], paste("ClonalityOverView_", unique(dat[1])[1,1] , ".txt", sep=""), sep="\t",quote=F,na="-",row.names=F,col.names=F)
+  }
+
+  clonalityOverviewSplit = split(clonalityOverview, f=clonalityOverview$Sample)
+  lapply(clonalityOverviewSplit, FUN=ClonalityOverviewPrint)
+  
 }
 
 bak = PRODF
@@ -724,16 +724,14 @@
   
   PRODF.with.D = PRODF[nchar(PRODF$Top.D.Gene, keepNA=F) > 2,]
   PRODF.no.D = PRODF[nchar(PRODF$Top.D.Gene, keepNA=F) < 4,]
+  write.table(PRODF.no.D, "productive_no_D.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=T)
   
   UNPROD.with.D = UNPROD[nchar(UNPROD$Top.D.Gene, keepNA=F) > 2,]
   UNPROD.no.D = UNPROD[nchar(UNPROD$Top.D.Gene, keepNA=F) < 4,]
+  write.table(UNPROD.no.D, "unproductive_no_D.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=T)
   
   num_median = function(x, na.rm=T) { as.numeric(median(x, na.rm=na.rm)) }
 
-  print("---- table prod.with.d cdr3.length ----")
-  print(table(PRODF.with.D$CDR3.Length, useNA="ifany"))
-  print(median(PRODF.with.D$CDR3.Length, na.rm=T))
-
   newData = data.frame(data.table(PRODF.with.D)[,list(unique=.N, 
                                                VH.DEL=mean(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T),
                                                P1=mean(.SD$P3V.nt.nb, na.rm=T),
@@ -772,10 +770,6 @@
   newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1)
   write.table(newData, "junctionAnalysisProd_median_wD.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=F)
   
-  print("---- table unprod.with.d cdr3.length ----")
-  print(table(UNPROD.with.D$CDR3.Length, useNA="ifany"))
-  print(median(UNPROD.with.D$CDR3.Length, na.rm=T))
-  
   newData = data.frame(data.table(UNPROD.with.D)[,list(unique=.N, 
                                                 VH.DEL=mean(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T),
                                                 P1=mean(.SD$P3V.nt.nb, na.rm=T),
@@ -819,11 +813,11 @@
   newData = data.frame(data.table(PRODF.no.D)[,list(unique=.N, 
                                                VH.DEL=mean(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T),
                                                P1=mean(.SD$P3V.nt.nb, na.rm=T),
-                                               N1=mean(rowSums(.SD[,c("N.REGION.nt.nb"), with=F], na.rm=T)),
+                                               N1=mean(.SD$N.REGION.nt.nb, na.rm=T),
                                                P2=mean(.SD$P5J.nt.nb, na.rm=T),
                                                DEL.JH=mean(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T),
                                                Total.Del=mean(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)),
-                                               Total.N=mean(rowSums(.SD[,c("N.REGION.nt.nb"), with=F], na.rm=T)),
+                                               Total.N=mean(.SD$N.REGION.nt.nb, na.rm=T),
                                                Total.P=mean(rowSums(.SD[,c("P3V.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)),
                                                Median.CDR3.l=as.double(as.numeric(median(.SD$CDR3.Length, na.rm=T)))),
                                          by=c("Sample")])
@@ -833,39 +827,43 @@
   newData = data.frame(data.table(PRODF.no.D)[,list(unique=.N, 
                                                VH.DEL=num_median(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T),
                                                P1=num_median(.SD$P3V.nt.nb, na.rm=T),
-                                               N1=num_median(rowSums(.SD[,c("N.REGION.nt.nb"), with=F], na.rm=T)),
+                                               N1=median(.SD$N.REGION.nt.nb, na.rm=T),
                                                P2=num_median(.SD$P5J.nt.nb, na.rm=T),
                                                DEL.JH=num_median(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T),
 											   Total.Del=num_median(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)),
-											   Total.N=num_median(rowSums(.SD[,c("N.REGION.nt.nb"), with=F], na.rm=T)),
+											   Total.N=median(.SD$N.REGION.nt.nb, na.rm=T),
 											   Total.P=num_median(rowSums(.SD[,c("P3V.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)),
 											   Median.CDR3.l=as.double(as.numeric(median(.SD$CDR3.Length, na.rm=T)))),
                                          by=c("Sample")])
   newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1)
   write.table(newData, "junctionAnalysisProd_median_nD.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=F)
   
+  print(paste("mean N:", mean(UNPROD.no.D$N.REGION.nt.nb, na.rm=T)))
+  
   newData = data.frame(data.table(UNPROD.no.D)[,list(unique=.N, 
                                                 VH.DEL=mean(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T),
                                                 P1=mean(.SD$P3V.nt.nb, na.rm=T),
-                                                N1=mean(rowSums(.SD[,c("N.REGION.nt.nb"), with=F], na.rm=T)),
+                                                N1=mean(.SD$N.REGION.nt.nb, na.rm=T),
                                                 P2=mean(.SD$P5J.nt.nb, na.rm=T),
                                                 DEL.JH=mean(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T),
                                                 Total.Del=mean(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)),
-                                                Total.N=mean(rowSums(.SD[,c("N.REGION.nt.nb"), with=F], na.rm=T)),
+                                                Total.N=mean(.SD$N.REGION.nt.nb, na.rm=T),
                                                 Total.P=mean(rowSums(.SD[,c("P3V.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)),
                                                 Median.CDR3.l=as.double(as.numeric(median(.SD$CDR3.Length, na.rm=T)))),
                                           by=c("Sample")])
   newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1)
   write.table(newData, "junctionAnalysisUnProd_mean_nD.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=F)
   
+  print(paste("median N:", num_median(UNPROD.no.D$N.REGION.nt.nb, na.rm=T)))
+  
     newData = data.frame(data.table(UNPROD.no.D)[,list(unique=.N, 
                                                 VH.DEL=num_median(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T),
                                                 P1=num_median(.SD$P3V.nt.nb, na.rm=T),
-                                                N1=num_median(rowSums(.SD[,c("N.REGION.nt.nb"), with=F], na.rm=T)),
+                                                N1=median(.SD$N.REGION.nt.nb, na.rm=T),
                                                 P2=num_median(.SD$P5J.nt.nb, na.rm=T),
                                                 DEL.JH=num_median(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T),
                                                 Total.Del=num_median(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)),
-                                                Total.N=num_median(rowSums(.SD[,c("N.REGION.nt.nb"), with=F], na.rm=T)),
+                                                Total.N=median(.SD$N.REGION.nt.nb, na.rm=T),
                                                 Total.P=num_median(rowSums(.SD[,c("P3V.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)),
                                                 Median.CDR3.l=as.double(as.numeric(median(.SD$CDR3.Length, na.rm=T)))),
 															by=c("Sample")])
@@ -960,20 +958,13 @@
 clonaltype.in.replicates$clonaltype = do.call(paste, c(clonaltype.in.replicates[clonaltype], sep = ":"))
 clonaltype.in.replicates = clonaltype.in.replicates[,c("clonaltype","Replicate", "ID", "Sequence", "Sample")]
 
-print(head(clonaltype.in.replicates))
-
 clonaltype.counts = data.frame(table(clonaltype.in.replicates$clonaltype))
 names(clonaltype.counts) = c("clonaltype", "coincidence")
 
-print(head(clonaltype.counts))
-
 clonaltype.counts = clonaltype.counts[clonaltype.counts$coincidence > 1,]
 
-head(clonaltype.counts)
-
 clonaltype.in.replicates = clonaltype.in.replicates[clonaltype.in.replicates$clonaltype %in% clonaltype.counts$clonaltype,]
 clonaltype.in.replicates = merge(clonaltype.in.replicates, clonaltype.counts, by="clonaltype")
-print(head(clonaltype.in.replicates))
 clonaltype.in.replicates = clonaltype.in.replicates[order(clonaltype.in.replicates$clonaltype),c("coincidence","clonaltype", "Sample", "Replicate", "ID", "Sequence")]
 
 write.table(clonaltype.in.replicates, "clonaltypes_replicates.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=T)