| 5 | 1 # ---------------------- load/install packages ---------------------- | 
|  | 2 | 
|  | 3 if (!("gridExtra" %in% rownames(installed.packages()))) { | 
|  | 4   install.packages("gridExtra", repos="http://cran.xl-mirror.nl/") | 
|  | 5 } | 
|  | 6 library(gridExtra) | 
|  | 7 if (!("ggplot2" %in% rownames(installed.packages()))) { | 
|  | 8   install.packages("ggplot2", repos="http://cran.xl-mirror.nl/") | 
|  | 9 } | 
|  | 10 library(ggplot2) | 
|  | 11 if (!("plyr" %in% rownames(installed.packages()))) { | 
|  | 12   install.packages("plyr", repos="http://cran.xl-mirror.nl/") | 
|  | 13 } | 
|  | 14 library(plyr) | 
|  | 15 | 
|  | 16 if (!("data.table" %in% rownames(installed.packages()))) { | 
|  | 17   install.packages("data.table", repos="http://cran.xl-mirror.nl/") | 
|  | 18 } | 
|  | 19 library(data.table) | 
|  | 20 | 
|  | 21 if (!("reshape2" %in% rownames(installed.packages()))) { | 
|  | 22   install.packages("reshape2", repos="http://cran.xl-mirror.nl/") | 
|  | 23 } | 
|  | 24 library(reshape2) | 
|  | 25 | 
|  | 26 if (!("lymphclon" %in% rownames(installed.packages()))) { | 
|  | 27   install.packages("lymphclon", repos="http://cran.xl-mirror.nl/") | 
|  | 28 } | 
|  | 29 library(lymphclon) | 
|  | 30 | 
|  | 31 # ---------------------- parameters ---------------------- | 
|  | 32 | 
|  | 33 args <- commandArgs(trailingOnly = TRUE) | 
|  | 34 | 
|  | 35 infile = args[1] #path to input file | 
|  | 36 outfile = args[2] #path to output file | 
|  | 37 outdir = args[3] #path to output folder (html/images/data) | 
|  | 38 clonaltype = args[4] #clonaltype definition, or 'none' for no unique filtering | 
|  | 39 ct = unlist(strsplit(clonaltype, ",")) | 
|  | 40 species = args[5] #human or mouse | 
|  | 41 locus = args[6] # IGH, IGK, IGL, TRB, TRA, TRG or TRD | 
|  | 42 filterproductive = ifelse(args[7] == "yes", T, F) #should unproductive sequences be filtered out? (yes/no) | 
|  | 43 clonality_method = args[8] | 
|  | 44 | 
|  | 45 | 
|  | 46 # ---------------------- Data preperation ---------------------- | 
|  | 47 | 
|  | 48 print("Report Clonality - Data preperation") | 
|  | 49 | 
| 13 | 50 inputdata = read.table(infile, sep="\t", header=TRUE, fill=T, comment.char="", stringsAsFactors=F) | 
| 5 | 51 | 
| 24 | 52 | 
| 5 | 53 print(paste("nrows: ", nrow(inputdata))) | 
|  | 54 | 
|  | 55 setwd(outdir) | 
|  | 56 | 
|  | 57 # remove weird rows | 
|  | 58 inputdata = inputdata[inputdata$Sample != "",] | 
|  | 59 | 
|  | 60 print(paste("nrows: ", nrow(inputdata))) | 
|  | 61 | 
|  | 62 #remove the allele from the V,D and J genes | 
|  | 63 inputdata$Top.V.Gene = gsub("[*]([0-9]+)", "", inputdata$Top.V.Gene) | 
|  | 64 inputdata$Top.D.Gene = gsub("[*]([0-9]+)", "", inputdata$Top.D.Gene) | 
|  | 65 inputdata$Top.J.Gene = gsub("[*]([0-9]+)", "", inputdata$Top.J.Gene) | 
|  | 66 | 
|  | 67 print(paste("nrows: ", nrow(inputdata))) | 
|  | 68 | 
|  | 69 #filter uniques | 
|  | 70 inputdata.removed = inputdata[NULL,] | 
|  | 71 | 
|  | 72 print(paste("nrows: ", nrow(inputdata))) | 
|  | 73 | 
|  | 74 inputdata$clonaltype = 1:nrow(inputdata) | 
|  | 75 | 
|  | 76 #keep track of the count of sequences in samples or samples/replicates for the front page overview | 
|  | 77 input.sample.count = data.frame(data.table(inputdata)[, list(All=.N), by=c("Sample")]) | 
|  | 78 input.rep.count = data.frame(data.table(inputdata)[, list(All=.N), by=c("Sample", "Replicate")]) | 
|  | 79 | 
|  | 80 PRODF = inputdata | 
|  | 81 UNPROD = inputdata | 
|  | 82 if(filterproductive){ | 
|  | 83   if("Functionality" %in% colnames(inputdata)) { # "Functionality" is an IMGT column | 
|  | 84     #PRODF = inputdata[inputdata$Functionality == "productive" | inputdata$Functionality == "productive (see comment)", ] | 
|  | 85     PRODF = inputdata[inputdata$Functionality %in% c("productive (see comment)","productive"),] | 
|  | 86 | 
|  | 87     PRODF.count = data.frame(data.table(PRODF)[, list(count=.N), by=c("Sample")]) | 
|  | 88 | 
|  | 89     UNPROD = inputdata[inputdata$Functionality %in% c("unproductive (see comment)","unproductive"), ] | 
|  | 90   } else { | 
|  | 91     PRODF = inputdata[inputdata$VDJ.Frame != "In-frame with stop codon" & inputdata$VDJ.Frame != "Out-of-frame" & inputdata$CDR3.Found.How != "NOT_FOUND" , ] | 
|  | 92     UNPROD = inputdata[!(inputdata$VDJ.Frame != "In-frame with stop codon" & inputdata$VDJ.Frame != "Out-of-frame" & inputdata$CDR3.Found.How != "NOT_FOUND" ), ] | 
|  | 93   } | 
|  | 94 } | 
|  | 95 | 
| 13 | 96 for(i in 1:nrow(UNPROD)){ | 
|  | 97     if(!is.numeric(UNPROD[i,"CDR3.Length"])){ | 
|  | 98         UNPROD[i,"CDR3.Length"] = 0 | 
|  | 99     } | 
|  | 100 } | 
|  | 101 | 
| 5 | 102 prod.sample.count = data.frame(data.table(PRODF)[, list(Productive=.N), by=c("Sample")]) | 
|  | 103 prod.rep.count = data.frame(data.table(PRODF)[, list(Productive=.N), by=c("Sample", "Replicate")]) | 
|  | 104 | 
|  | 105 unprod.sample.count = data.frame(data.table(UNPROD)[, list(Unproductive=.N), by=c("Sample")]) | 
|  | 106 unprod.rep.count = data.frame(data.table(UNPROD)[, list(Unproductive=.N), by=c("Sample", "Replicate")]) | 
|  | 107 | 
|  | 108 clonalityFrame = PRODF | 
|  | 109 | 
|  | 110 #remove duplicates based on the clonaltype | 
|  | 111 if(clonaltype != "none"){ | 
|  | 112   clonaltype = paste(clonaltype, ",Sample", sep="") #add sample column to clonaltype, unique within samples | 
|  | 113   PRODF$clonaltype = do.call(paste, c(PRODF[unlist(strsplit(clonaltype, ","))], sep = ":")) | 
|  | 114   PRODF = PRODF[!duplicated(PRODF$clonaltype), ] | 
|  | 115 | 
|  | 116   UNPROD$clonaltype = do.call(paste, c(UNPROD[unlist(strsplit(clonaltype, ","))], sep = ":")) | 
|  | 117   UNPROD = UNPROD[!duplicated(UNPROD$clonaltype), ] | 
|  | 118 | 
|  | 119   #again for clonalityFrame but with sample+replicate | 
|  | 120   clonalityFrame$clonaltype = do.call(paste, c(clonalityFrame[unlist(strsplit(clonaltype, ","))], sep = ":")) | 
|  | 121   clonalityFrame$clonality_clonaltype = do.call(paste, c(clonalityFrame[unlist(strsplit(paste(clonaltype, ",Replicate", sep=""), ","))], sep = ":")) | 
|  | 122   clonalityFrame = clonalityFrame[!duplicated(clonalityFrame$clonality_clonaltype), ] | 
|  | 123 } | 
|  | 124 | 
|  | 125 prod.unique.sample.count = data.frame(data.table(PRODF)[, list(Productive_unique=.N), by=c("Sample")]) | 
|  | 126 prod.unique.rep.count = data.frame(data.table(PRODF)[, list(Productive_unique=.N), by=c("Sample", "Replicate")]) | 
|  | 127 | 
|  | 128 unprod.unique.sample.count = data.frame(data.table(UNPROD)[, list(Unproductive_unique=.N), by=c("Sample")]) | 
|  | 129 unprod.unique.rep.count = data.frame(data.table(UNPROD)[, list(Unproductive_unique=.N), by=c("Sample", "Replicate")]) | 
|  | 130 | 
|  | 131 PRODF$freq = 1 | 
|  | 132 | 
|  | 133 if(any(grepl(pattern="_", x=PRODF$ID))){ #the frequency can be stored in the ID with the pattern ".*_freq_.*" | 
|  | 134   PRODF$freq = gsub("^[0-9]+_", "", PRODF$ID) | 
|  | 135   PRODF$freq = gsub("_.*", "", PRODF$freq) | 
|  | 136   PRODF$freq = as.numeric(PRODF$freq) | 
|  | 137   if(any(is.na(PRODF$freq))){ #if there was an "_" in the ID, but not the frequency, go back to frequency of 1 for every sequence | 
|  | 138     PRODF$freq = 1 | 
|  | 139   } | 
|  | 140 } | 
|  | 141 | 
| 8 | 142 #make a names list with sample -> color | 
|  | 143 naive.colors = c('blue4', 'darkred', 'olivedrab3', 'red', 'gray74', 'darkviolet', 'lightblue1', 'gold', 'chartreuse2', 'pink', 'Paleturquoise3', 'Chocolate1', 'Yellow', 'Deeppink3', 'Mediumorchid1', 'Darkgreen', 'Blue', 'Gray36', 'Hotpink', 'Yellow4') | 
|  | 144 unique.samples = unique(PRODF$Sample) | 
|  | 145 | 
|  | 146 if(length(unique.samples) <= length(naive.colors)){ | 
|  | 147 	sample.colors = naive.colors[1:length(unique.samples)] | 
|  | 148 } else { | 
|  | 149 	sample.colors = rainbow(length(unique.samples)) | 
|  | 150 } | 
|  | 151 | 
|  | 152 names(sample.colors) = unique.samples | 
|  | 153 | 
|  | 154 print("Sample.colors") | 
|  | 155 print(sample.colors) | 
| 5 | 156 | 
|  | 157 | 
|  | 158 #write the complete dataset that is left over, will be the input if 'none' for clonaltype and 'no' for filterproductive | 
|  | 159 write.table(PRODF, "allUnique.txt", sep="\t",quote=F,row.names=F,col.names=T) | 
| 18 | 160 #write.table(PRODF, "allUnique.csv", sep=",",quote=F,row.names=F,col.names=T) | 
| 25 | 161 write.table(UNPROD, "allUnproductive.txt", sep="\t",quote=F,row.names=F,col.names=T) | 
| 5 | 162 | 
| 24 | 163 print("SAMPLE TABLE:") | 
|  | 164 print(table(PRODF$Sample)) | 
|  | 165 | 
| 5 | 166 #write the samples to a file | 
|  | 167 sampleFile <- file("samples.txt") | 
|  | 168 un = unique(inputdata$Sample) | 
|  | 169 un = paste(un, sep="\n") | 
|  | 170 writeLines(un, sampleFile) | 
|  | 171 close(sampleFile) | 
|  | 172 | 
|  | 173 # ---------------------- Counting the productive/unproductive and unique sequences ---------------------- | 
|  | 174 | 
|  | 175 print("Report Clonality - counting productive/unproductive/unique") | 
|  | 176 | 
|  | 177 #create the table on the overview page with the productive/unique counts per sample/replicate | 
|  | 178 #first for sample | 
|  | 179 sample.count = merge(input.sample.count, prod.sample.count, by="Sample", all.x=T) | 
|  | 180 sample.count$perc_prod = round(sample.count$Productive / sample.count$All * 100) | 
|  | 181 sample.count = merge(sample.count, prod.unique.sample.count, by="Sample", all.x=T) | 
|  | 182 sample.count$perc_prod_un = round(sample.count$Productive_unique / sample.count$All * 100) | 
|  | 183 | 
|  | 184 sample.count = merge(sample.count , unprod.sample.count, by="Sample", all.x=T) | 
|  | 185 sample.count$perc_unprod = round(sample.count$Unproductive / sample.count$All * 100) | 
|  | 186 sample.count = merge(sample.count, unprod.unique.sample.count, by="Sample", all.x=T) | 
|  | 187 sample.count$perc_unprod_un = round(sample.count$Unproductive_unique / sample.count$All * 100) | 
|  | 188 | 
|  | 189 #then sample/replicate | 
|  | 190 rep.count = merge(input.rep.count, prod.rep.count, by=c("Sample", "Replicate"), all.x=T) | 
|  | 191 rep.count$perc_prod = round(rep.count$Productive / rep.count$All * 100) | 
|  | 192 rep.count = merge(rep.count, prod.unique.rep.count, by=c("Sample", "Replicate"), all.x=T) | 
|  | 193 rep.count$perc_prod_un = round(rep.count$Productive_unique / rep.count$All * 100) | 
|  | 194 | 
|  | 195 rep.count = merge(rep.count, unprod.rep.count, by=c("Sample", "Replicate"), all.x=T) | 
|  | 196 rep.count$perc_unprod = round(rep.count$Unproductive / rep.count$All * 100) | 
|  | 197 rep.count = merge(rep.count, unprod.unique.rep.count, by=c("Sample", "Replicate"), all.x=T) | 
|  | 198 rep.count$perc_unprod_un = round(rep.count$Unproductive_unique / rep.count$All * 100) | 
|  | 199 | 
|  | 200 rep.count$Sample = paste(rep.count$Sample, rep.count$Replicate, sep="_") | 
|  | 201 rep.count = rep.count[,names(rep.count) != "Replicate"] | 
|  | 202 | 
|  | 203 count = rbind(sample.count, rep.count) | 
|  | 204 | 
|  | 205 | 
|  | 206 | 
|  | 207 write.table(x=count, file="productive_counting.txt", sep=",",quote=F,row.names=F,col.names=F) | 
|  | 208 | 
|  | 209 # ---------------------- V+J+CDR3 sequence count ---------------------- | 
|  | 210 | 
|  | 211 VJCDR3.count = data.frame(table(clonalityFrame$Top.V.Gene, clonalityFrame$Top.J.Gene, clonalityFrame$CDR3.Seq.DNA)) | 
|  | 212 names(VJCDR3.count) = c("Top.V.Gene", "Top.J.Gene", "CDR3.Seq.DNA", "Count") | 
|  | 213 | 
|  | 214 VJCDR3.count = VJCDR3.count[VJCDR3.count$Count > 0,] | 
|  | 215 VJCDR3.count = VJCDR3.count[order(-VJCDR3.count$Count),] | 
|  | 216 | 
|  | 217 write.table(x=VJCDR3.count, file="VJCDR3_count.txt", sep="\t",quote=F,row.names=F,col.names=T) | 
|  | 218 | 
|  | 219 # ---------------------- Frequency calculation for V, D and J ---------------------- | 
|  | 220 | 
|  | 221 print("Report Clonality - frequency calculation V, D and J") | 
|  | 222 | 
|  | 223 PRODFV = data.frame(data.table(PRODF)[, list(Length=sum(freq)), by=c("Sample", "Top.V.Gene")]) | 
|  | 224 Total = ddply(PRODFV, .(Sample), function(x) data.frame(Total = sum(x$Length))) | 
|  | 225 PRODFV = merge(PRODFV, Total, by.x='Sample', by.y='Sample', all.x=TRUE) | 
|  | 226 PRODFV = ddply(PRODFV, c("Sample", "Top.V.Gene"), summarise, relFreq= (Length*100 / Total)) | 
|  | 227 | 
|  | 228 PRODFD = data.frame(data.table(PRODF)[, list(Length=sum(freq)), by=c("Sample", "Top.D.Gene")]) | 
|  | 229 Total = ddply(PRODFD, .(Sample), function(x) data.frame(Total = sum(x$Length))) | 
|  | 230 PRODFD = merge(PRODFD, Total, by.x='Sample', by.y='Sample', all.x=TRUE) | 
|  | 231 PRODFD = ddply(PRODFD, c("Sample", "Top.D.Gene"), summarise, relFreq= (Length*100 / Total)) | 
|  | 232 | 
|  | 233 PRODFJ = data.frame(data.table(PRODF)[, list(Length=sum(freq)), by=c("Sample", "Top.J.Gene")]) | 
|  | 234 Total = ddply(PRODFJ, .(Sample), function(x) data.frame(Total = sum(x$Length))) | 
|  | 235 PRODFJ = merge(PRODFJ, Total, by.x='Sample', by.y='Sample', all.x=TRUE) | 
|  | 236 PRODFJ = ddply(PRODFJ, c("Sample", "Top.J.Gene"), summarise, relFreq= (Length*100 / Total)) | 
|  | 237 | 
|  | 238 # ---------------------- Setting up the gene names for the different species/loci ---------------------- | 
|  | 239 | 
|  | 240 print("Report Clonality - getting genes for species/loci") | 
|  | 241 | 
|  | 242 Vchain = "" | 
|  | 243 Dchain = "" | 
|  | 244 Jchain = "" | 
|  | 245 | 
|  | 246 if(species == "custom"){ | 
|  | 247 	print("Custom genes: ") | 
|  | 248 	splt = unlist(strsplit(locus, ";")) | 
|  | 249 	print(paste("V:", splt[1])) | 
|  | 250 	print(paste("D:", splt[2])) | 
|  | 251 	print(paste("J:", splt[3])) | 
|  | 252 | 
|  | 253 	Vchain = unlist(strsplit(splt[1], ",")) | 
|  | 254 	Vchain = data.frame(v.name = Vchain, chr.orderV = 1:length(Vchain)) | 
|  | 255 | 
|  | 256 	Dchain = unlist(strsplit(splt[2], ",")) | 
|  | 257 	if(length(Dchain) > 0){ | 
|  | 258 		Dchain = data.frame(v.name = Dchain, chr.orderD = 1:length(Dchain)) | 
|  | 259 	} else { | 
|  | 260 		Dchain = data.frame(v.name = character(0), chr.orderD = numeric(0)) | 
|  | 261 	} | 
|  | 262 | 
|  | 263 	Jchain = unlist(strsplit(splt[3], ",")) | 
|  | 264 	Jchain = data.frame(v.name = Jchain, chr.orderJ = 1:length(Jchain)) | 
|  | 265 | 
|  | 266 } else { | 
|  | 267 	genes = read.table("genes.txt", sep="\t", header=TRUE, fill=T, comment.char="") | 
|  | 268 | 
|  | 269 	Vchain = genes[grepl(species, genes$Species) & genes$locus == locus & genes$region == "V",c("IMGT.GENE.DB", "chr.order")] | 
|  | 270 	colnames(Vchain) = c("v.name", "chr.orderV") | 
|  | 271 	Dchain = genes[grepl(species, genes$Species) & genes$locus == locus & genes$region == "D",c("IMGT.GENE.DB", "chr.order")] | 
|  | 272 	colnames(Dchain) = c("v.name", "chr.orderD") | 
|  | 273 	Jchain = genes[grepl(species, genes$Species) & genes$locus == locus & genes$region == "J",c("IMGT.GENE.DB", "chr.order")] | 
|  | 274 	colnames(Jchain) = c("v.name", "chr.orderJ") | 
|  | 275 } | 
|  | 276 useD = TRUE | 
|  | 277 if(nrow(Dchain) == 0){ | 
|  | 278   useD = FALSE | 
|  | 279   cat("No D Genes in this species/locus") | 
|  | 280 } | 
|  | 281 print(paste(nrow(Vchain), "genes in V")) | 
|  | 282 print(paste(nrow(Dchain), "genes in D")) | 
|  | 283 print(paste(nrow(Jchain), "genes in J")) | 
|  | 284 | 
|  | 285 # ---------------------- merge with the frequency count ---------------------- | 
|  | 286 | 
|  | 287 PRODFV = merge(PRODFV, Vchain, by.x='Top.V.Gene', by.y='v.name', all.x=TRUE) | 
|  | 288 | 
|  | 289 PRODFD = merge(PRODFD, Dchain, by.x='Top.D.Gene', by.y='v.name', all.x=TRUE) | 
|  | 290 | 
|  | 291 PRODFJ = merge(PRODFJ, Jchain, by.x='Top.J.Gene', by.y='v.name', all.x=TRUE) | 
|  | 292 | 
|  | 293 # ---------------------- Create the V, D and J frequency plots and write the data.frame for every plot to a file ---------------------- | 
|  | 294 | 
|  | 295 print("Report Clonality - V, D and J frequency plots") | 
|  | 296 | 
|  | 297 pV = ggplot(PRODFV) | 
|  | 298 pV = pV + geom_bar( aes( x=factor(reorder(Top.V.Gene, chr.orderV)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) | 
| 8 | 299 pV = pV + xlab("Summary of V gene") + ylab("Frequency") + ggtitle("Relative frequency of V gene usage") + scale_fill_manual(values=sample.colors) | 
|  | 300 pV = pV + theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank()) | 
| 18 | 301 write.table(x=PRODFV, file="VFrequency.txt", sep="\t",quote=F,row.names=F,col.names=T) | 
| 5 | 302 | 
|  | 303 png("VPlot.png",width = 1280, height = 720) | 
|  | 304 pV | 
| 32 | 305 dev.off() | 
|  | 306 | 
|  | 307 ggsave("VPlot.pdf", pV, width=13, height=7) | 
| 5 | 308 | 
|  | 309 if(useD){ | 
|  | 310   pD = ggplot(PRODFD) | 
|  | 311   pD = pD + geom_bar( aes( x=factor(reorder(Top.D.Gene, chr.orderD)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) | 
| 8 | 312   pD = pD + xlab("Summary of D gene") + ylab("Frequency") + ggtitle("Relative frequency of D gene usage") + scale_fill_manual(values=sample.colors) | 
|  | 313   pD = pD + theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank()) | 
| 18 | 314   write.table(x=PRODFD, file="DFrequency.txt", sep="\t",quote=F,row.names=F,col.names=T) | 
| 5 | 315 | 
|  | 316   png("DPlot.png",width = 800, height = 600) | 
|  | 317   print(pD) | 
| 32 | 318   dev.off() | 
|  | 319 | 
|  | 320   ggsave("DPlot.pdf", pD, width=10, height=7) | 
| 5 | 321 } | 
|  | 322 | 
|  | 323 pJ = ggplot(PRODFJ) | 
|  | 324 pJ = pJ + geom_bar( aes( x=factor(reorder(Top.J.Gene, chr.orderJ)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) | 
| 8 | 325 pJ = pJ + xlab("Summary of J gene") + ylab("Frequency") + ggtitle("Relative frequency of J gene usage") + scale_fill_manual(values=sample.colors) | 
|  | 326 pJ = pJ + theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank()) | 
| 18 | 327 write.table(x=PRODFJ, file="JFrequency.txt", sep="\t",quote=F,row.names=F,col.names=T) | 
| 5 | 328 | 
|  | 329 png("JPlot.png",width = 800, height = 600) | 
|  | 330 pJ | 
| 32 | 331 dev.off() | 
|  | 332 | 
|  | 333 ggsave("JPlot.pdf", pJ) | 
| 5 | 334 | 
|  | 335 # ---------------------- Now the frequency plots of the V, D and J families ---------------------- | 
|  | 336 | 
|  | 337 print("Report Clonality - V, D and J family plots") | 
|  | 338 | 
|  | 339 VGenes = PRODF[,c("Sample", "Top.V.Gene")] | 
|  | 340 VGenes$Top.V.Gene = gsub("-.*", "", VGenes$Top.V.Gene) | 
|  | 341 VGenes = data.frame(data.table(VGenes)[, list(Count=.N), by=c("Sample", "Top.V.Gene")]) | 
|  | 342 TotalPerSample = data.frame(data.table(VGenes)[, list(total=sum(.SD$Count)), by=Sample]) | 
|  | 343 VGenes = merge(VGenes, TotalPerSample, by="Sample") | 
|  | 344 VGenes$Frequency = VGenes$Count * 100 / VGenes$total | 
|  | 345 VPlot = ggplot(VGenes) | 
|  | 346 VPlot = VPlot + geom_bar(aes( x = Top.V.Gene, y = Frequency, fill = Sample), stat='identity', position='dodge' ) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + | 
|  | 347   ggtitle("Distribution of V gene families") + | 
| 8 | 348   ylab("Percentage of sequences") + | 
|  | 349   scale_fill_manual(values=sample.colors) + | 
|  | 350   theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank()) | 
| 5 | 351 png("VFPlot.png") | 
|  | 352 VPlot | 
| 32 | 353 dev.off() | 
|  | 354 ggsave("VFPlot.pdf", VPlot) | 
|  | 355 | 
| 18 | 356 write.table(x=VGenes, file="VFFrequency.txt", sep="\t",quote=F,row.names=F,col.names=T) | 
| 5 | 357 | 
|  | 358 if(useD){ | 
|  | 359   DGenes = PRODF[,c("Sample", "Top.D.Gene")] | 
|  | 360   DGenes$Top.D.Gene = gsub("-.*", "", DGenes$Top.D.Gene) | 
|  | 361   DGenes = data.frame(data.table(DGenes)[, list(Count=.N), by=c("Sample", "Top.D.Gene")]) | 
|  | 362   TotalPerSample = data.frame(data.table(DGenes)[, list(total=sum(.SD$Count)), by=Sample]) | 
|  | 363   DGenes = merge(DGenes, TotalPerSample, by="Sample") | 
|  | 364   DGenes$Frequency = DGenes$Count * 100 / DGenes$total | 
|  | 365   DPlot = ggplot(DGenes) | 
|  | 366   DPlot = DPlot + geom_bar(aes( x = Top.D.Gene, y = Frequency, fill = Sample), stat='identity', position='dodge' ) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + | 
|  | 367     ggtitle("Distribution of D gene families") + | 
| 8 | 368     ylab("Percentage of sequences") + | 
|  | 369     scale_fill_manual(values=sample.colors) + | 
|  | 370     theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank()) | 
| 5 | 371   png("DFPlot.png") | 
|  | 372   print(DPlot) | 
| 32 | 373   dev.off() | 
|  | 374 | 
|  | 375   ggsave("DFPlot.pdf", DPlot) | 
| 18 | 376   write.table(x=DGenes, file="DFFrequency.txt", sep="\t",quote=F,row.names=F,col.names=T) | 
| 5 | 377 } | 
|  | 378 | 
|  | 379 # ---------------------- Plotting the cdr3 length ---------------------- | 
|  | 380 | 
|  | 381 print("Report Clonality - CDR3 length plot") | 
|  | 382 | 
| 9 | 383 CDR3Length = data.frame(data.table(PRODF)[, list(Count=.N), by=c("Sample", "CDR3.Length")]) | 
| 5 | 384 TotalPerSample = data.frame(data.table(CDR3Length)[, list(total=sum(.SD$Count)), by=Sample]) | 
|  | 385 CDR3Length = merge(CDR3Length, TotalPerSample, by="Sample") | 
|  | 386 CDR3Length$Frequency = CDR3Length$Count * 100 / CDR3Length$total | 
|  | 387 CDR3LengthPlot = ggplot(CDR3Length) | 
| 15 | 388 CDR3LengthPlot = CDR3LengthPlot + geom_bar(aes( x = factor(reorder(CDR3.Length, as.numeric(CDR3.Length))), y = Frequency, fill = Sample), stat='identity', position='dodge' ) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + | 
| 5 | 389   ggtitle("Length distribution of CDR3") + | 
|  | 390   xlab("CDR3 Length") + | 
| 8 | 391   ylab("Percentage of sequences") + | 
|  | 392   scale_fill_manual(values=sample.colors) + | 
|  | 393   theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank()) | 
| 5 | 394 png("CDR3LengthPlot.png",width = 1280, height = 720) | 
|  | 395 CDR3LengthPlot | 
|  | 396 dev.off() | 
| 32 | 397 | 
|  | 398 ggsave("CDR3LengthPlot.pdf", CDR3LengthPlot, width=12, height=7) | 
|  | 399 | 
| 24 | 400 write.table(x=CDR3Length, file="CDR3LengthPlot.txt", sep="\t",quote=F,row.names=F,col.names=T) | 
| 5 | 401 | 
|  | 402 # ---------------------- Plot the heatmaps ---------------------- | 
|  | 403 | 
|  | 404 #get the reverse order for the V and D genes | 
|  | 405 revVchain = Vchain | 
|  | 406 revDchain = Dchain | 
|  | 407 revVchain$chr.orderV = rev(revVchain$chr.orderV) | 
|  | 408 revDchain$chr.orderD = rev(revDchain$chr.orderD) | 
|  | 409 | 
|  | 410 if(useD){ | 
|  | 411   print("Report Clonality - Heatmaps VD") | 
|  | 412   plotVD <- function(dat){ | 
|  | 413     if(length(dat[,1]) == 0){ | 
|  | 414       return() | 
|  | 415     } | 
|  | 416 | 
|  | 417     img = ggplot() + | 
|  | 418       geom_tile(data=dat, aes(x=factor(reorder(Top.D.Gene, chr.orderD)), y=factor(reorder(Top.V.Gene, chr.orderV)), fill=relLength)) + | 
|  | 419       theme(axis.text.x = element_text(angle = 90, hjust = 1)) + | 
|  | 420       scale_fill_gradient(low="gold", high="blue", na.value="white") + | 
|  | 421       ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat$Length, na.rm=T) ,")", sep="")) + | 
|  | 422       xlab("D genes") + | 
| 9 | 423       ylab("V Genes") + | 
| 14 | 424       theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), panel.grid.major = element_line(colour = "gainsboro")) | 
| 5 | 425 | 
|  | 426     png(paste("HeatmapVD_", unique(dat[3])[1,1] , ".png", sep=""), width=150+(15*length(Dchain$v.name)), height=100+(15*length(Vchain$v.name))) | 
|  | 427     print(img) | 
|  | 428     dev.off() | 
| 32 | 429 | 
|  | 430     ggsave(paste("HeatmapVD_", unique(dat[3])[1,1] , ".pdf", sep=""), img, height=13, width=8) | 
|  | 431 | 
| 24 | 432     write.table(x=acast(dat, Top.V.Gene~Top.D.Gene, value.var="Length"), file=paste("HeatmapVD_", unique(dat[3])[1,1], ".txt", sep=""), sep="\t",quote=F,row.names=T,col.names=NA) | 
| 5 | 433   } | 
|  | 434 | 
|  | 435   VandDCount = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Top.V.Gene", "Top.D.Gene", "Sample")]) | 
|  | 436 | 
|  | 437   VandDCount$l = log(VandDCount$Length) | 
|  | 438   maxVD = data.frame(data.table(VandDCount)[, list(max=max(l)), by=c("Sample")]) | 
|  | 439   VandDCount = merge(VandDCount, maxVD, by.x="Sample", by.y="Sample", all.x=T) | 
|  | 440   VandDCount$relLength = VandDCount$l / VandDCount$max | 
| 6 | 441   check = is.nan(VandDCount$relLength) | 
|  | 442   if(any(check)){ | 
|  | 443 	VandDCount[check,"relLength"] = 0 | 
|  | 444   } | 
| 5 | 445 | 
|  | 446   cartegianProductVD = expand.grid(Top.V.Gene = Vchain$v.name, Top.D.Gene = Dchain$v.name) | 
|  | 447 | 
|  | 448   completeVD = merge(VandDCount, cartegianProductVD, by.x=c("Top.V.Gene", "Top.D.Gene"), by.y=c("Top.V.Gene", "Top.D.Gene"), all=TRUE) | 
|  | 449 | 
|  | 450   completeVD = merge(completeVD, revVchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE) | 
|  | 451 | 
|  | 452   completeVD = merge(completeVD, Dchain, by.x="Top.D.Gene", by.y="v.name", all.x=TRUE) | 
|  | 453 | 
|  | 454   fltr = is.nan(completeVD$relLength) | 
|  | 455   if(all(fltr)){ | 
|  | 456 	  completeVD[fltr,"relLength"] = 0 | 
|  | 457   } | 
|  | 458 | 
|  | 459   VDList = split(completeVD, f=completeVD[,"Sample"]) | 
|  | 460   lapply(VDList, FUN=plotVD) | 
|  | 461 } | 
|  | 462 | 
|  | 463 print("Report Clonality - Heatmaps VJ") | 
|  | 464 | 
|  | 465 plotVJ <- function(dat){ | 
|  | 466   if(length(dat[,1]) == 0){ | 
|  | 467     return() | 
|  | 468   } | 
|  | 469   cat(paste(unique(dat[3])[1,1])) | 
|  | 470   img = ggplot() + | 
|  | 471     geom_tile(data=dat, aes(x=factor(reorder(Top.J.Gene, chr.orderJ)), y=factor(reorder(Top.V.Gene, chr.orderV)), fill=relLength)) + | 
|  | 472     theme(axis.text.x = element_text(angle = 90, hjust = 1)) + | 
|  | 473     scale_fill_gradient(low="gold", high="blue", na.value="white") + | 
|  | 474     ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat$Length, na.rm=T) ,")", sep="")) + | 
|  | 475     xlab("J genes") + | 
| 9 | 476     ylab("V Genes") + | 
| 14 | 477     theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), panel.grid.major = element_line(colour = "gainsboro")) | 
| 5 | 478 | 
|  | 479   png(paste("HeatmapVJ_", unique(dat[3])[1,1] , ".png", sep=""), width=150+(15*length(Jchain$v.name)), height=100+(15*length(Vchain$v.name))) | 
|  | 480   print(img) | 
|  | 481   dev.off() | 
| 32 | 482 | 
|  | 483   ggsave(paste("HeatmapVJ_", unique(dat[3])[1,1] , ".pdf", sep=""), img, height=11, width=4) | 
|  | 484 | 
| 24 | 485   write.table(x=acast(dat, Top.V.Gene~Top.J.Gene, value.var="Length"), file=paste("HeatmapVJ_", unique(dat[3])[1,1], ".txt", sep=""), sep="\t",quote=F,row.names=T,col.names=NA) | 
| 5 | 486 } | 
|  | 487 | 
|  | 488 VandJCount = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Top.V.Gene", "Top.J.Gene", "Sample")]) | 
|  | 489 | 
|  | 490 VandJCount$l = log(VandJCount$Length) | 
|  | 491 maxVJ = data.frame(data.table(VandJCount)[, list(max=max(l)), by=c("Sample")]) | 
|  | 492 VandJCount = merge(VandJCount, maxVJ, by.x="Sample", by.y="Sample", all.x=T) | 
|  | 493 VandJCount$relLength = VandJCount$l / VandJCount$max | 
|  | 494 | 
| 6 | 495 check = is.nan(VandJCount$relLength) | 
|  | 496 if(any(check)){ | 
|  | 497 	VandJCount[check,"relLength"] = 0 | 
|  | 498 } | 
|  | 499 | 
| 5 | 500 cartegianProductVJ = expand.grid(Top.V.Gene = Vchain$v.name, Top.J.Gene = Jchain$v.name) | 
|  | 501 | 
|  | 502 completeVJ = merge(VandJCount, cartegianProductVJ, all.y=TRUE) | 
|  | 503 completeVJ = merge(completeVJ, revVchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE) | 
|  | 504 completeVJ = merge(completeVJ, Jchain, by.x="Top.J.Gene", by.y="v.name", all.x=TRUE) | 
|  | 505 | 
|  | 506 fltr = is.nan(completeVJ$relLength) | 
|  | 507 if(any(fltr)){ | 
|  | 508 	completeVJ[fltr,"relLength"] = 1 | 
|  | 509 } | 
|  | 510 | 
|  | 511 VJList = split(completeVJ, f=completeVJ[,"Sample"]) | 
|  | 512 lapply(VJList, FUN=plotVJ) | 
|  | 513 | 
|  | 514 | 
|  | 515 | 
|  | 516 if(useD){ | 
|  | 517   print("Report Clonality - Heatmaps DJ") | 
|  | 518   plotDJ <- function(dat){ | 
|  | 519     if(length(dat[,1]) == 0){ | 
|  | 520       return() | 
|  | 521     } | 
|  | 522     img = ggplot() + | 
|  | 523       geom_tile(data=dat, aes(x=factor(reorder(Top.J.Gene, chr.orderJ)), y=factor(reorder(Top.D.Gene, chr.orderD)), fill=relLength)) + | 
|  | 524       theme(axis.text.x = element_text(angle = 90, hjust = 1)) + | 
|  | 525       scale_fill_gradient(low="gold", high="blue", na.value="white") + | 
|  | 526       ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat$Length, na.rm=T) ,")", sep="")) + | 
|  | 527       xlab("J genes") + | 
| 9 | 528       ylab("D Genes") + | 
| 14 | 529       theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), panel.grid.major = element_line(colour = "gainsboro")) | 
| 5 | 530 | 
|  | 531     png(paste("HeatmapDJ_", unique(dat[3])[1,1] , ".png", sep=""), width=150+(15*length(Jchain$v.name)), height=100+(15*length(Dchain$v.name))) | 
|  | 532     print(img) | 
|  | 533     dev.off() | 
| 32 | 534 | 
|  | 535     ggsave(paste("HeatmapDJ_", unique(dat[3])[1,1] , ".pdf", sep=""), img, width=4, height=7) | 
|  | 536 | 
| 24 | 537     write.table(x=acast(dat, Top.D.Gene~Top.J.Gene, value.var="Length"), file=paste("HeatmapDJ_", unique(dat[3])[1,1], ".txt", sep=""), sep="\t",quote=F,row.names=T,col.names=NA) | 
| 5 | 538   } | 
|  | 539 | 
|  | 540 | 
|  | 541   DandJCount = data.frame(data.table(PRODF)[, list(Length=.N), by=c("Top.D.Gene", "Top.J.Gene", "Sample")]) | 
|  | 542 | 
|  | 543   DandJCount$l = log(DandJCount$Length) | 
|  | 544   maxDJ = data.frame(data.table(DandJCount)[, list(max=max(l)), by=c("Sample")]) | 
|  | 545   DandJCount = merge(DandJCount, maxDJ, by.x="Sample", by.y="Sample", all.x=T) | 
|  | 546   DandJCount$relLength = DandJCount$l / DandJCount$max | 
|  | 547 | 
| 6 | 548   check = is.nan(DandJCount$relLength) | 
|  | 549   if(any(check)){ | 
|  | 550     DandJCount[check,"relLength"] = 0 | 
|  | 551   } | 
|  | 552 | 
| 5 | 553   cartegianProductDJ = expand.grid(Top.D.Gene = Dchain$v.name, Top.J.Gene = Jchain$v.name) | 
|  | 554 | 
|  | 555   completeDJ = merge(DandJCount, cartegianProductDJ, all.y=TRUE) | 
|  | 556   completeDJ = merge(completeDJ, revDchain, by.x="Top.D.Gene", by.y="v.name", all.x=TRUE) | 
|  | 557   completeDJ = merge(completeDJ, Jchain, by.x="Top.J.Gene", by.y="v.name", all.x=TRUE) | 
|  | 558 | 
|  | 559   fltr = is.nan(completeDJ$relLength) | 
|  | 560   if(any(fltr)){ | 
|  | 561 	  completeDJ[fltr, "relLength"] = 1 | 
|  | 562   } | 
|  | 563 | 
|  | 564   DJList = split(completeDJ, f=completeDJ[,"Sample"]) | 
|  | 565   lapply(DJList, FUN=plotDJ) | 
|  | 566 } | 
|  | 567 | 
|  | 568 | 
|  | 569 # ---------------------- output tables for the circos plots ---------------------- | 
|  | 570 | 
|  | 571 print("Report Clonality - Circos data") | 
|  | 572 | 
|  | 573 for(smpl in unique(PRODF$Sample)){ | 
|  | 574 	PRODF.sample = PRODF[PRODF$Sample == smpl,] | 
|  | 575 | 
|  | 576 	fltr = PRODF.sample$Top.V.Gene == "" | 
|  | 577 	if(any(fltr, na.rm=T)){ | 
|  | 578 	  PRODF.sample[fltr, "Top.V.Gene"] = "NA" | 
|  | 579 	} | 
|  | 580 | 
|  | 581 	fltr = PRODF.sample$Top.D.Gene == "" | 
|  | 582 	if(any(fltr, na.rm=T)){ | 
|  | 583 	  PRODF.sample[fltr, "Top.D.Gene"] = "NA" | 
|  | 584 	} | 
|  | 585 | 
|  | 586 	fltr = PRODF.sample$Top.J.Gene == "" | 
|  | 587 	if(any(fltr, na.rm=T)){ | 
|  | 588 	  PRODF.sample[fltr, "Top.J.Gene"] = "NA" | 
|  | 589 	} | 
|  | 590 | 
|  | 591 	v.d = table(PRODF.sample$Top.V.Gene, PRODF.sample$Top.D.Gene) | 
|  | 592 	v.j = table(PRODF.sample$Top.V.Gene, PRODF.sample$Top.J.Gene) | 
|  | 593 	d.j = table(PRODF.sample$Top.D.Gene, PRODF.sample$Top.J.Gene) | 
|  | 594 | 
|  | 595 	write.table(v.d, file=paste(smpl, "_VD_circos.txt", sep=""), sep="\t", quote=F, row.names=T, col.names=NA) | 
|  | 596 	write.table(v.j, file=paste(smpl, "_VJ_circos.txt", sep=""), sep="\t", quote=F, row.names=T, col.names=NA) | 
|  | 597 	write.table(d.j, file=paste(smpl, "_DJ_circos.txt", sep=""), sep="\t", quote=F, row.names=T, col.names=NA) | 
|  | 598 } | 
|  | 599 | 
|  | 600 # ---------------------- calculating the clonality score ---------------------- | 
|  | 601 | 
|  | 602 if("Replicate" %in% colnames(inputdata)) #can only calculate clonality score when replicate information is available | 
|  | 603 { | 
|  | 604   print("Report Clonality - Clonality") | 
| 18 | 605   write.table(clonalityFrame, "clonalityComplete.txt", sep="\t",quote=F,row.names=F,col.names=T) | 
| 5 | 606   if(clonality_method == "boyd"){ | 
|  | 607     samples = split(clonalityFrame, clonalityFrame$Sample, drop=T) | 
|  | 608 | 
|  | 609     for (sample in samples){ | 
|  | 610       res = data.frame(paste=character(0)) | 
|  | 611       sample_id = unique(sample$Sample)[[1]] | 
|  | 612       for(replicate in unique(sample$Replicate)){ | 
|  | 613         tmp = sample[sample$Replicate == replicate,] | 
|  | 614         clone_table = data.frame(table(tmp$clonaltype)) | 
|  | 615         clone_col_name = paste("V", replicate, sep="") | 
|  | 616         colnames(clone_table) = c("paste", clone_col_name) | 
|  | 617         res = merge(res, clone_table, by="paste", all=T) | 
|  | 618       } | 
|  | 619 | 
| 17 | 620       res[is.na(res)] = 0 | 
|  | 621 | 
| 20 | 622       write.table(res, file=paste("raw_clonality_", sample_id, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=F) | 
|  | 623       write.table(as.matrix(res[,2:ncol(res)]), file=paste("raw_clonality2_", sample_id, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=F) | 
|  | 624 | 
|  | 625       res = read.table(paste("raw_clonality_", sample_id, ".txt", sep=""), header=F, sep="\t", quote="", stringsAsFactors=F, fill=T, comment.char="") | 
| 17 | 626 | 
| 5 | 627       infer.result = infer.clonality(as.matrix(res[,2:ncol(res)])) | 
|  | 628 | 
| 13 | 629       #print(infer.result) | 
| 5 | 630 | 
| 20 | 631       write.table(data.table(infer.result[[12]]), file=paste("lymphclon_clonality_", sample_id, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=F) | 
| 5 | 632 | 
|  | 633       res$type = rowSums(res[,2:ncol(res)]) | 
|  | 634 | 
|  | 635       coincidence.table = data.frame(table(res$type)) | 
|  | 636       colnames(coincidence.table) = c("Coincidence Type",  "Raw Coincidence Freq") | 
| 20 | 637       write.table(coincidence.table, file=paste("lymphclon_coincidences_", sample_id, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T) | 
| 5 | 638     } | 
| 26 | 639   } | 
|  | 640   clonalFreq = data.frame(data.table(clonalityFrame)[, list(Type=.N), by=c("Sample", "clonaltype")]) | 
|  | 641 | 
|  | 642   #write files for every coincidence group of >1 | 
|  | 643   samples = unique(clonalFreq$Sample) | 
|  | 644   for(sample in samples){ | 
|  | 645       clonalFreqSample = clonalFreq[clonalFreq$Sample == sample,] | 
|  | 646       if(max(clonalFreqSample$Type) > 1){ | 
|  | 647           for(i in 2:max(clonalFreqSample$Type)){ | 
|  | 648               clonalFreqSampleType = clonalFreqSample[clonalFreqSample$Type == i,] | 
|  | 649               clonalityFrame.sub = clonalityFrame[clonalityFrame$clonaltype %in% clonalFreqSampleType$clonaltype,] | 
|  | 650               clonalityFrame.sub = clonalityFrame.sub[order(clonalityFrame.sub$clonaltype),] | 
|  | 651               write.table(clonalityFrame.sub, file=paste("coincidences_", sample, "_", i, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T) | 
|  | 652           } | 
|  | 653       } | 
|  | 654   } | 
|  | 655 | 
|  | 656   clonalFreqCount = data.frame(data.table(clonalFreq)[, list(Count=.N), by=c("Sample", "Type")]) | 
|  | 657   clonalFreqCount$realCount = clonalFreqCount$Type * clonalFreqCount$Count | 
|  | 658   clonalSum = data.frame(data.table(clonalFreqCount)[, list(Reads=sum(realCount)), by=c("Sample")]) | 
|  | 659   clonalFreqCount = merge(clonalFreqCount, clonalSum, by.x="Sample", by.y="Sample") | 
|  | 660 | 
|  | 661   ct = c('Type\tWeight\n2\t1\n3\t3\n4\t6\n5\t10\n6\t15') | 
|  | 662   tcct = textConnection(ct) | 
|  | 663   CT  = read.table(tcct, sep="\t", header=TRUE) | 
|  | 664   close(tcct) | 
|  | 665   clonalFreqCount = merge(clonalFreqCount, CT, by.x="Type", by.y="Type", all.x=T) | 
|  | 666   clonalFreqCount$WeightedCount = clonalFreqCount$Count * clonalFreqCount$Weight | 
|  | 667 | 
|  | 668   ReplicateReads = data.frame(data.table(clonalityFrame)[, list(Type=.N), by=c("Sample", "Replicate", "clonaltype")]) | 
|  | 669   ReplicateReads = data.frame(data.table(ReplicateReads)[, list(Reads=.N), by=c("Sample", "Replicate")]) | 
|  | 670   clonalFreqCount$Reads = as.numeric(clonalFreqCount$Reads) | 
|  | 671   ReplicateReads$Reads = as.numeric(ReplicateReads$Reads) | 
|  | 672   ReplicateReads$squared = as.numeric(ReplicateReads$Reads * ReplicateReads$Reads) | 
|  | 673 | 
|  | 674   ReplicatePrint <- function(dat){ | 
|  | 675       write.table(dat[-1], paste("ReplicateReads_", unique(dat[1])[1,1] , ".txt", sep=""), sep="\t",quote=F,na="-",row.names=F,col.names=F) | 
| 5 | 676   } | 
| 26 | 677 | 
|  | 678   ReplicateSplit = split(ReplicateReads, f=ReplicateReads[,"Sample"]) | 
|  | 679   lapply(ReplicateSplit, FUN=ReplicatePrint) | 
|  | 680 | 
|  | 681   ReplicateReads = data.frame(data.table(ReplicateReads)[, list(ReadsSum=sum(as.numeric(Reads)), ReadsSquaredSum=sum(as.numeric(squared))), by=c("Sample")]) | 
|  | 682   clonalFreqCount = merge(clonalFreqCount, ReplicateReads, by.x="Sample", by.y="Sample", all.x=T) | 
|  | 683 | 
|  | 684   ReplicateSumPrint <- function(dat){ | 
|  | 685       write.table(dat[-1], paste("ReplicateSumReads_", unique(dat[1])[1,1] , ".txt", sep=""), sep="\t",quote=F,na="-",row.names=F,col.names=F) | 
|  | 686   } | 
|  | 687 | 
|  | 688   ReplicateSumSplit = split(ReplicateReads, f=ReplicateReads[,"Sample"]) | 
|  | 689   lapply(ReplicateSumSplit, FUN=ReplicateSumPrint) | 
|  | 690 | 
|  | 691   clonalFreqCountSum = data.frame(data.table(clonalFreqCount)[, list(Numerator=sum(WeightedCount, na.rm=T)), by=c("Sample")]) | 
|  | 692   clonalFreqCount = merge(clonalFreqCount, clonalFreqCountSum, by.x="Sample", by.y="Sample", all.x=T) | 
|  | 693   clonalFreqCount$ReadsSum = as.numeric(clonalFreqCount$ReadsSum) #prevent integer overflow | 
|  | 694   clonalFreqCount$Denominator = (((clonalFreqCount$ReadsSum * clonalFreqCount$ReadsSum) - clonalFreqCount$ReadsSquaredSum) / 2) | 
|  | 695   clonalFreqCount$Result = (clonalFreqCount$Numerator + 1) / (clonalFreqCount$Denominator + 1) | 
|  | 696 | 
|  | 697   ClonalityScorePrint <- function(dat){ | 
|  | 698       write.table(dat$Result, paste("ClonalityScore_", unique(dat[1])[1,1] , ".txt", sep=""), sep="\t",quote=F,na="-",row.names=F,col.names=F) | 
|  | 699   } | 
|  | 700 | 
|  | 701   clonalityScore = clonalFreqCount[c("Sample", "Result")] | 
|  | 702   clonalityScore = unique(clonalityScore) | 
|  | 703 | 
|  | 704   clonalityScoreSplit = split(clonalityScore, f=clonalityScore[,"Sample"]) | 
|  | 705   lapply(clonalityScoreSplit, FUN=ClonalityScorePrint) | 
|  | 706 | 
|  | 707   clonalityOverview = clonalFreqCount[c("Sample", "Type", "Count", "Weight", "WeightedCount")] | 
|  | 708 | 
|  | 709 | 
|  | 710 | 
|  | 711   ClonalityOverviewPrint <- function(dat){ | 
|  | 712       dat = dat[order(dat[,2]),] | 
|  | 713       write.table(dat[-1], paste("ClonalityOverView_", unique(dat[1])[1,1] , ".txt", sep=""), sep="\t",quote=F,na="-",row.names=F,col.names=F) | 
|  | 714   } | 
|  | 715 | 
|  | 716   clonalityOverviewSplit = split(clonalityOverview, f=clonalityOverview$Sample) | 
|  | 717   lapply(clonalityOverviewSplit, FUN=ClonalityOverviewPrint) | 
|  | 718 | 
| 5 | 719 } | 
|  | 720 | 
|  | 721 bak = PRODF | 
| 25 | 722 bakun = UNPROD | 
| 5 | 723 | 
|  | 724 imgtcolumns = c("X3V.REGION.trimmed.nt.nb","P3V.nt.nb", "N1.REGION.nt.nb", "P5D.nt.nb", "X5D.REGION.trimmed.nt.nb", "X3D.REGION.trimmed.nt.nb", "P3D.nt.nb", "N2.REGION.nt.nb", "P5J.nt.nb", "X5J.REGION.trimmed.nt.nb", "X3V.REGION.trimmed.nt.nb", "X5D.REGION.trimmed.nt.nb", "X3D.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb", "N1.REGION.nt.nb", "N2.REGION.nt.nb", "P3V.nt.nb", "P5D.nt.nb", "P3D.nt.nb", "P5J.nt.nb") | 
|  | 725 if(all(imgtcolumns %in% colnames(inputdata))) | 
|  | 726 { | 
|  | 727   print("found IMGT columns, running junction analysis") | 
| 24 | 728 | 
| 5 | 729   #ensure certain columns are in the data (files generated with older versions of IMGT Loader) | 
|  | 730   col.checks = c("N.REGION.nt.nb", "N1.REGION.nt.nb", "N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb") | 
|  | 731   for(col.check in col.checks){ | 
|  | 732 	  if(!(col.check %in% names(PRODF))){ | 
|  | 733 		  print(paste(col.check, "not found adding new column")) | 
|  | 734 		  if(nrow(PRODF) > 0){ #because R is anoying... | 
|  | 735 			PRODF[,col.check] = 0 | 
|  | 736 		  } else { | 
|  | 737 			PRODF = cbind(PRODF, data.frame(N3.REGION.nt.nb=numeric(0), N4.REGION.nt.nb=numeric(0))) | 
|  | 738 		  } | 
|  | 739 		  if(nrow(UNPROD) > 0){ | 
|  | 740 			UNPROD[,col.check] = 0 | 
|  | 741 		  } else { | 
|  | 742 			UNPROD = cbind(UNPROD, data.frame(N3.REGION.nt.nb=numeric(0), N4.REGION.nt.nb=numeric(0))) | 
|  | 743 		  } | 
|  | 744 	  } | 
|  | 745   } | 
|  | 746 | 
| 24 | 747   PRODF.with.D = PRODF[nchar(PRODF$Top.D.Gene, keepNA=F) > 2,] | 
|  | 748   PRODF.no.D = PRODF[nchar(PRODF$Top.D.Gene, keepNA=F) < 4,] | 
| 26 | 749   write.table(PRODF.no.D, "productive_no_D.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=T) | 
| 24 | 750 | 
| 25 | 751   UNPROD.with.D = UNPROD[nchar(UNPROD$Top.D.Gene, keepNA=F) > 2,] | 
|  | 752   UNPROD.no.D = UNPROD[nchar(UNPROD$Top.D.Gene, keepNA=F) < 4,] | 
| 26 | 753   write.table(UNPROD.no.D, "unproductive_no_D.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=T) | 
| 25 | 754 | 
| 5 | 755   num_median = function(x, na.rm=T) { as.numeric(median(x, na.rm=na.rm)) } | 
| 25 | 756 | 
| 24 | 757   newData = data.frame(data.table(PRODF.with.D)[,list(unique=.N, | 
| 5 | 758                                                VH.DEL=mean(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T), | 
|  | 759                                                P1=mean(.SD$P3V.nt.nb, na.rm=T), | 
|  | 760                                                N1=mean(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb"), with=F], na.rm=T)), | 
|  | 761                                                P2=mean(.SD$P5D.nt.nb, na.rm=T), | 
|  | 762                                                DEL.DH=mean(.SD$X5D.REGION.trimmed.nt.nb, na.rm=T), | 
|  | 763                                                DH.DEL=mean(.SD$X3D.REGION.trimmed.nt.nb, na.rm=T), | 
|  | 764                                                P3=mean(.SD$P3D.nt.nb, na.rm=T), | 
|  | 765                                                N2=mean(rowSums(.SD[,c("N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb"), with=F], na.rm=T)), | 
|  | 766                                                P4=mean(.SD$P5J.nt.nb, na.rm=T), | 
|  | 767                                                DEL.JH=mean(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T), | 
|  | 768                                                Total.Del=mean(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5D.REGION.trimmed.nt.nb", "X3D.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)), | 
|  | 769                                                Total.N=mean(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb", "N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb"), with=F], na.rm=T)), | 
|  | 770                                                Total.P=mean(rowSums(.SD[,c("P3V.nt.nb", "P5D.nt.nb", "P3D.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)), | 
| 38 | 771                                                Median.CDR3.l=as.double(median(as.numeric(.SD$CDR3.Length), na.rm=T))), | 
| 5 | 772                                          by=c("Sample")]) | 
|  | 773   newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1) | 
| 24 | 774   write.table(newData, "junctionAnalysisProd_mean_wD.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=F) | 
| 5 | 775 | 
| 24 | 776   newData = data.frame(data.table(PRODF.with.D)[,list(unique=.N, | 
| 5 | 777                                                VH.DEL=num_median(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T), | 
|  | 778                                                P1=num_median(.SD$P3V.nt.nb, na.rm=T), | 
|  | 779                                                N1=num_median(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb"), with=F], na.rm=T)), | 
|  | 780                                                P2=num_median(.SD$P5D.nt.nb, na.rm=T), | 
|  | 781                                                DEL.DH=num_median(.SD$X5D.REGION.trimmed.nt.nb, na.rm=T), | 
|  | 782                                                DH.DEL=num_median(.SD$X3D.REGION.trimmed.nt.nb, na.rm=T), | 
|  | 783                                                P3=num_median(.SD$P3D.nt.nb, na.rm=T), | 
|  | 784                                                N2=num_median(rowSums(.SD[,c("N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb"), with=F], na.rm=T)), | 
|  | 785                                                P4=num_median(.SD$P5J.nt.nb, na.rm=T), | 
|  | 786                                                DEL.JH=num_median(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T), | 
|  | 787 											   Total.Del=num_median(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5D.REGION.trimmed.nt.nb", "X3D.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)), | 
|  | 788 											   Total.N=num_median(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb", "N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb"), with=F], na.rm=T)), | 
|  | 789 											   Total.P=num_median(rowSums(.SD[,c("P3V.nt.nb", "P5D.nt.nb", "P3D.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)), | 
| 38 | 790 											   Median.CDR3.l=as.double(median(as.numeric(.SD$CDR3.Length), na.rm=T))), | 
| 5 | 791                                          by=c("Sample")]) | 
|  | 792   newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1) | 
| 24 | 793   write.table(newData, "junctionAnalysisProd_median_wD.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=F) | 
| 5 | 794 | 
| 25 | 795   newData = data.frame(data.table(UNPROD.with.D)[,list(unique=.N, | 
| 5 | 796                                                 VH.DEL=mean(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T), | 
|  | 797                                                 P1=mean(.SD$P3V.nt.nb, na.rm=T), | 
|  | 798                                                 N1=mean(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb"), with=F], na.rm=T)), | 
|  | 799                                                 P2=mean(.SD$P5D.nt.nb, na.rm=T), | 
|  | 800                                                 DEL.DH=mean(.SD$X5D.REGION.trimmed.nt.nb, na.rm=T), | 
|  | 801                                                 DH.DEL=mean(.SD$X3D.REGION.trimmed.nt.nb, na.rm=T), | 
|  | 802                                                 P3=mean(.SD$P3D.nt.nb, na.rm=T), | 
|  | 803                                                 N2=mean(rowSums(.SD[,c("N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb"), with=F], na.rm=T)), | 
|  | 804                                                 P4=mean(.SD$P5J.nt.nb, na.rm=T), | 
|  | 805                                                 DEL.JH=mean(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T), | 
|  | 806                                                 Total.Del=mean(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5D.REGION.trimmed.nt.nb", "X3D.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)), | 
|  | 807                                                 Total.N=mean(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb", "N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb"), with=F], na.rm=T)), | 
|  | 808                                                 Total.P=mean(rowSums(.SD[,c("P3V.nt.nb", "P5D.nt.nb", "P3D.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)), | 
| 38 | 809                                                 Median.CDR3.l=as.double(median(as.numeric(.SD$CDR3.Length), na.rm=T))), | 
| 5 | 810                                           by=c("Sample")]) | 
|  | 811   newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1) | 
| 24 | 812   write.table(newData, "junctionAnalysisUnProd_mean_wD.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=F) | 
| 5 | 813 | 
| 25 | 814     newData = data.frame(data.table(UNPROD.with.D)[,list(unique=.N, | 
| 5 | 815                                                 VH.DEL=num_median(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T), | 
|  | 816                                                 P1=num_median(.SD$P3V.nt.nb, na.rm=T), | 
|  | 817                                                 N1=num_median(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb"), with=F], na.rm=T)), | 
|  | 818                                                 P2=num_median(.SD$P5D.nt.nb, na.rm=T), | 
|  | 819                                                 DEL.DH=num_median(.SD$X5D.REGION.trimmed.nt.nb, na.rm=T), | 
|  | 820                                                 DH.DEL=num_median(.SD$X3D.REGION.trimmed.nt.nb, na.rm=T), | 
|  | 821                                                 P3=num_median(.SD$P3D.nt.nb, na.rm=T), | 
|  | 822                                                 N2=num_median(rowSums(.SD[,c("N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb"), with=F], na.rm=T)), | 
|  | 823                                                 P4=num_median(.SD$P5J.nt.nb, na.rm=T), | 
|  | 824                                                 DEL.JH=num_median(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T), | 
|  | 825                                                 Total.Del=num_median(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5D.REGION.trimmed.nt.nb", "X3D.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)), | 
|  | 826                                                 Total.N=num_median(rowSums(.SD[,c("N.REGION.nt.nb", "N1.REGION.nt.nb", "N2.REGION.nt.nb", "N3.REGION.nt.nb", "N4.REGION.nt.nb"), with=F], na.rm=T)), | 
|  | 827                                                 Total.P=num_median(rowSums(.SD[,c("P3V.nt.nb", "P5D.nt.nb", "P3D.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)), | 
| 38 | 828                                                 Median.CDR3.l=as.double(median(as.numeric(.SD$CDR3.Length), na.rm=T))), | 
| 5 | 829 															by=c("Sample")]) | 
| 24 | 830   newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1) | 
|  | 831   write.table(newData, "junctionAnalysisUnProd_median_wD.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=F) | 
|  | 832 | 
|  | 833   #---------------- again for no-D | 
|  | 834 | 
|  | 835   newData = data.frame(data.table(PRODF.no.D)[,list(unique=.N, | 
|  | 836                                                VH.DEL=mean(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T), | 
|  | 837                                                P1=mean(.SD$P3V.nt.nb, na.rm=T), | 
| 26 | 838                                                N1=mean(.SD$N.REGION.nt.nb, na.rm=T), | 
| 24 | 839                                                P2=mean(.SD$P5J.nt.nb, na.rm=T), | 
|  | 840                                                DEL.JH=mean(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T), | 
|  | 841                                                Total.Del=mean(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)), | 
| 26 | 842                                                Total.N=mean(.SD$N.REGION.nt.nb, na.rm=T), | 
| 24 | 843                                                Total.P=mean(rowSums(.SD[,c("P3V.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)), | 
| 38 | 844                                                Median.CDR3.l=as.double(median(as.numeric(.SD$CDR3.Length), na.rm=T))), | 
| 24 | 845                                          by=c("Sample")]) | 
| 5 | 846   newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1) | 
| 24 | 847   write.table(newData, "junctionAnalysisProd_mean_nD.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=F) | 
|  | 848 | 
|  | 849   newData = data.frame(data.table(PRODF.no.D)[,list(unique=.N, | 
|  | 850                                                VH.DEL=num_median(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T), | 
|  | 851                                                P1=num_median(.SD$P3V.nt.nb, na.rm=T), | 
| 30 | 852                                                N1=num_median(.SD$N.REGION.nt.nb, na.rm=T), | 
| 24 | 853                                                P2=num_median(.SD$P5J.nt.nb, na.rm=T), | 
|  | 854                                                DEL.JH=num_median(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T), | 
|  | 855 											   Total.Del=num_median(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)), | 
| 30 | 856 											   Total.N=num_median(.SD$N.REGION.nt.nb, na.rm=T), | 
| 24 | 857 											   Total.P=num_median(rowSums(.SD[,c("P3V.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)), | 
| 38 | 858 											   Median.CDR3.l=as.double(median(as.numeric(.SD$CDR3.Length), na.rm=T))), | 
| 24 | 859                                          by=c("Sample")]) | 
|  | 860   newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1) | 
|  | 861   write.table(newData, "junctionAnalysisProd_median_nD.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=F) | 
|  | 862 | 
| 25 | 863   newData = data.frame(data.table(UNPROD.no.D)[,list(unique=.N, | 
| 24 | 864                                                 VH.DEL=mean(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T), | 
|  | 865                                                 P1=mean(.SD$P3V.nt.nb, na.rm=T), | 
| 26 | 866                                                 N1=mean(.SD$N.REGION.nt.nb, na.rm=T), | 
| 24 | 867                                                 P2=mean(.SD$P5J.nt.nb, na.rm=T), | 
|  | 868                                                 DEL.JH=mean(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T), | 
|  | 869                                                 Total.Del=mean(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)), | 
| 26 | 870                                                 Total.N=mean(.SD$N.REGION.nt.nb, na.rm=T), | 
| 24 | 871                                                 Total.P=mean(rowSums(.SD[,c("P3V.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)), | 
| 38 | 872                                                 Median.CDR3.l=as.double(median(as.numeric(.SD$CDR3.Length), na.rm=T))), | 
| 24 | 873                                           by=c("Sample")]) | 
|  | 874   newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1) | 
|  | 875   write.table(newData, "junctionAnalysisUnProd_mean_nD.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=F) | 
|  | 876 | 
| 26 | 877 | 
| 25 | 878     newData = data.frame(data.table(UNPROD.no.D)[,list(unique=.N, | 
| 24 | 879                                                 VH.DEL=num_median(.SD$X3V.REGION.trimmed.nt.nb, na.rm=T), | 
|  | 880                                                 P1=num_median(.SD$P3V.nt.nb, na.rm=T), | 
| 30 | 881                                                 N1=num_median(.SD$N.REGION.nt.nb, na.rm=T), | 
| 24 | 882                                                 P2=num_median(.SD$P5J.nt.nb, na.rm=T), | 
|  | 883                                                 DEL.JH=num_median(.SD$X5J.REGION.trimmed.nt.nb, na.rm=T), | 
|  | 884                                                 Total.Del=num_median(rowSums(.SD[,c("X3V.REGION.trimmed.nt.nb", "X5J.REGION.trimmed.nt.nb"), with=F], na.rm=T)), | 
| 30 | 885                                                 Total.N=num_median(.SD$N.REGION.nt.nb, na.rm=T), | 
| 24 | 886                                                 Total.P=num_median(rowSums(.SD[,c("P3V.nt.nb", "P5J.nt.nb"), with=F], na.rm=T)), | 
| 38 | 887                                                 Median.CDR3.l=as.double(median(as.numeric(.SD$CDR3.Length), na.rm=T))), | 
| 24 | 888 															by=c("Sample")]) | 
|  | 889   newData[,sapply(newData, is.numeric)] = round(newData[,sapply(newData, is.numeric)],1) | 
|  | 890   write.table(newData, "junctionAnalysisUnProd_median_nD.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=F) | 
| 5 | 891 } | 
|  | 892 | 
|  | 893 PRODF = bak | 
| 25 | 894 UNPROD = bakun | 
| 5 | 895 | 
|  | 896 | 
|  | 897 # ---------------------- D reading frame ---------------------- | 
|  | 898 | 
| 8 | 899 D.REGION.reading.frame = PRODF[,c("Sample", "D.REGION.reading.frame")] | 
| 5 | 900 | 
| 8 | 901 chck = is.na(D.REGION.reading.frame$D.REGION.reading.frame) | 
|  | 902 if(any(chck)){ | 
|  | 903 	D.REGION.reading.frame[chck,"D.REGION.reading.frame"] = "No D" | 
|  | 904 } | 
| 5 | 905 | 
| 24 | 906 D.REGION.reading.frame.1 = data.frame(data.table(D.REGION.reading.frame)[, list(Freq=.N), by=c("Sample", "D.REGION.reading.frame")]) | 
|  | 907 | 
|  | 908 D.REGION.reading.frame.2 = data.frame(data.table(D.REGION.reading.frame)[, list(sample.sum=sum(as.numeric(.SD$D.REGION.reading.frame), na.rm=T)), by=c("Sample")]) | 
| 5 | 909 | 
| 24 | 910 D.REGION.reading.frame = merge(D.REGION.reading.frame.1, D.REGION.reading.frame.2, by="Sample") | 
|  | 911 | 
|  | 912 D.REGION.reading.frame$percentage = round(D.REGION.reading.frame$Freq / D.REGION.reading.frame$sample.sum * 100, 1) | 
|  | 913 | 
|  | 914 write.table(D.REGION.reading.frame, "DReadingFrame.txt" , sep="\t",quote=F,row.names=F,col.names=T) | 
| 5 | 915 | 
|  | 916 D.REGION.reading.frame = ggplot(D.REGION.reading.frame) | 
| 29 | 917 D.REGION.reading.frame = D.REGION.reading.frame + geom_bar(aes( x = D.REGION.reading.frame, y = percentage, fill=Sample), stat='identity', position='dodge' ) + ggtitle("D reading frame") + xlab("Frame") + ylab("Frequency") | 
| 8 | 918 D.REGION.reading.frame = D.REGION.reading.frame + scale_fill_manual(values=sample.colors) | 
|  | 919 D.REGION.reading.frame = D.REGION.reading.frame + theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank()) | 
| 5 | 920 | 
|  | 921 png("DReadingFrame.png") | 
|  | 922 D.REGION.reading.frame | 
|  | 923 dev.off() | 
|  | 924 | 
| 32 | 925 ggsave("DReadingFrame.pdf", D.REGION.reading.frame) | 
| 5 | 926 | 
|  | 927 # ---------------------- AA composition in CDR3 ---------------------- | 
|  | 928 | 
|  | 929 AACDR3 = PRODF[,c("Sample", "CDR3.Seq")] | 
|  | 930 | 
|  | 931 TotalPerSample = data.frame(data.table(AACDR3)[, list(total=sum(nchar(as.character(.SD$CDR3.Seq)))), by=Sample]) | 
|  | 932 | 
|  | 933 AAfreq = list() | 
|  | 934 | 
|  | 935 for(i in 1:nrow(TotalPerSample)){ | 
|  | 936 	sample = TotalPerSample$Sample[i] | 
|  | 937   AAfreq[[i]] = data.frame(table(unlist(strsplit(as.character(AACDR3[AACDR3$Sample == sample,c("CDR3.Seq")]), "")))) | 
|  | 938   AAfreq[[i]]$Sample = sample | 
|  | 939 } | 
|  | 940 | 
|  | 941 AAfreq = ldply(AAfreq, data.frame) | 
|  | 942 AAfreq = merge(AAfreq, TotalPerSample, by="Sample", all.x = T) | 
|  | 943 AAfreq$freq_perc = as.numeric(AAfreq$Freq / AAfreq$total * 100) | 
|  | 944 | 
|  | 945 | 
|  | 946 AAorder = read.table(sep="\t", header=TRUE, text="order.aa\tAA\n1\tR\n2\tK\n3\tN\n4\tD\n5\tQ\n6\tE\n7\tH\n8\tP\n9\tY\n10\tW\n11\tS\n12\tT\n13\tG\n14\tA\n15\tM\n16\tC\n17\tF\n18\tL\n19\tV\n20\tI") | 
|  | 947 AAfreq = merge(AAfreq, AAorder, by.x='Var1', by.y='AA', all.x=TRUE) | 
|  | 948 | 
|  | 949 AAfreq = AAfreq[!is.na(AAfreq$order.aa),] | 
|  | 950 | 
|  | 951 AAfreqplot = ggplot(AAfreq) | 
|  | 952 AAfreqplot = AAfreqplot + geom_bar(aes( x=factor(reorder(Var1, order.aa)), y = freq_perc, fill = Sample), stat='identity', position='dodge' ) | 
|  | 953 AAfreqplot = AAfreqplot + annotate("rect", xmin = 0.5, xmax = 2.5, ymin = 0, ymax = Inf, fill = "red", alpha = 0.2) | 
|  | 954 AAfreqplot = AAfreqplot + annotate("rect", xmin = 3.5, xmax = 4.5, ymin = 0, ymax = Inf, fill = "blue", alpha = 0.2) | 
|  | 955 AAfreqplot = AAfreqplot + annotate("rect", xmin = 5.5, xmax = 6.5, ymin = 0, ymax = Inf, fill = "blue", alpha = 0.2) | 
|  | 956 AAfreqplot = AAfreqplot + annotate("rect", xmin = 6.5, xmax = 7.5, ymin = 0, ymax = Inf, fill = "red", alpha = 0.2) | 
| 8 | 957 AAfreqplot = AAfreqplot + ggtitle("Amino Acid Composition in the CDR3") + xlab("Amino Acid, from Hydrophilic (left) to Hydrophobic (right)") + ylab("Percentage") + scale_fill_manual(values=sample.colors) | 
| 32 | 958 AAfreqplot = AAfreqplot + theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank()) | 
| 5 | 959 | 
|  | 960 png("AAComposition.png",width = 1280, height = 720) | 
|  | 961 AAfreqplot | 
|  | 962 dev.off() | 
| 32 | 963 | 
|  | 964 ggsave("AAComposition.pdf", AAfreqplot, width=12, height=7) | 
|  | 965 | 
| 18 | 966 write.table(AAfreq, "AAComposition.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=T) | 
| 5 | 967 | 
| 8 | 968 # ---------------------- AA median CDR3 length ---------------------- | 
| 5 | 969 | 
| 24 | 970 median.aa.l = data.frame(data.table(PRODF)[, list(median=as.double(median(as.numeric(.SD$CDR3.Length, na.rm=T), na.rm=T))), by=c("Sample")]) | 
|  | 971 write.table(median.aa.l, "AAMedianBySample.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=F) | 
| 8 | 972 | 
| 25 | 973 | 
|  | 974 #generate the "Sequences that are present in more than one replicate" dataset | 
|  | 975 clonaltype.in.replicates = inputdata | 
| 38 | 976 clonaltype.in.replicates = clonaltype.in.replicates[clonaltype.in.replicates$Functionality %in% c("productive (see comment)","productive"),] | 
| 37 | 977 clonaltype.in.replicates = na.omit(clonaltype.in.replicates) | 
| 25 | 978 clonaltype = unlist(strsplit(clonaltype, ",")) | 
| 37 | 979 | 
|  | 980 clonaltype.in.replicates$clonaltype = do.call(paste, c(clonaltype.in.replicates[c(clonaltype, "Replicate")], sep = ":")) | 
|  | 981 | 
|  | 982 clonaltype.in.replicates = clonaltype.in.replicates[!duplicated(clonaltype.in.replicates$clonaltype),] | 
|  | 983 | 
| 25 | 984 clonaltype = clonaltype[-which(clonaltype == "Sample")] | 
|  | 985 | 
|  | 986 clonaltype.in.replicates$clonaltype = do.call(paste, c(clonaltype.in.replicates[clonaltype], sep = ":")) | 
|  | 987 clonaltype.in.replicates = clonaltype.in.replicates[,c("clonaltype","Replicate", "ID", "Sequence", "Sample")] | 
|  | 988 | 
|  | 989 clonaltype.counts = data.frame(table(clonaltype.in.replicates$clonaltype)) | 
|  | 990 names(clonaltype.counts) = c("clonaltype", "coincidence") | 
|  | 991 | 
|  | 992 clonaltype.counts = clonaltype.counts[clonaltype.counts$coincidence > 1,] | 
|  | 993 | 
|  | 994 clonaltype.in.replicates = clonaltype.in.replicates[clonaltype.in.replicates$clonaltype %in% clonaltype.counts$clonaltype,] | 
|  | 995 clonaltype.in.replicates = merge(clonaltype.in.replicates, clonaltype.counts, by="clonaltype") | 
| 37 | 996 clonaltype.in.replicates = clonaltype.in.replicates[order(-clonaltype.in.replicates$coincidence, clonaltype.in.replicates$clonaltype, clonaltype.in.replicates$Replicate),c("coincidence","clonaltype", "Sample", "Replicate", "ID", "Sequence")] | 
|  | 997 | 
| 25 | 998 | 
|  | 999 write.table(clonaltype.in.replicates, "clonaltypes_replicates.txt" , sep="\t",quote=F,na="-",row.names=F,col.names=T) | 
|  | 1000 | 
|  | 1001 | 
|  | 1002 | 
|  | 1003 | 
|  | 1004 | 
|  | 1005 | 
|  | 1006 | 
|  | 1007 | 
|  | 1008 | 
|  | 1009 | 
|  | 1010 | 
|  | 1011 | 
|  | 1012 | 
|  | 1013 | 
|  | 1014 | 
|  | 1015 | 
|  | 1016 | 
|  | 1017 | 
|  | 1018 | 
|  | 1019 | 
|  | 1020 | 
|  | 1021 | 
|  | 1022 |