comparison RScript.r @ 0:ed6885c85660 draft

Uploaded
author davidvanzessen
date Wed, 31 Aug 2016 05:31:47 -0400
parents
children 75853bceec00
comparison
equal deleted inserted replaced
-1:000000000000 0:ed6885c85660
1 args <- commandArgs(trailingOnly = TRUE)
2 options(scipen=999)
3
4 inFile = args[1]
5 outDir = args[2]
6 logfile = args[3]
7 min_freq = as.numeric(args[4])
8 min_cells = as.numeric(args[5])
9 mergeOn = args[6]
10
11 cat("<html><table><tr><td>Starting analysis</td></tr>", file=logfile, append=F)
12
13 library(ggplot2)
14 library(reshape2)
15 library(data.table)
16 library(grid)
17 library(parallel)
18 #require(xtable)
19 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T)
20 dat = read.table(inFile, header=T, sep="\t", dec=".", fill=T, stringsAsFactors=F)
21 dat = dat[,c("Patient", "Receptor", "Sample", "Cell_Count", "Clone_Molecule_Count_From_Spikes", "Log10_Frequency", "Total_Read_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence", "Clone_Sequence")]
22 dat$dsPerM = 0
23 dat = dat[!is.na(dat$Patient),]
24 dat$Related_to_leukemia_clone = F
25
26 setwd(outDir)
27 cat("<tr><td>Selecting first V/J Genes</td></tr>", file=logfile, append=T)
28 dat$V_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$V_Segment_Major_Gene), "; "), "[[", 1)))
29 dat$J_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$J_Segment_Major_Gene), "; "), "[[", 1)))
30
31 cat("<tr><td>Calculating Frequency</td></tr>", file=logfile, append=T)
32
33 dat$Frequency = ((10^dat$Log10_Frequency)*100)
34
35 dat = dat[dat$Frequency >= min_freq,]
36
37 triplets = dat[grepl("VanDongen_cALL_14696", dat$Patient) | grepl("(16278)|(26402)|(26759)", dat$Sample),]
38
39 cat("<tr><td>Normalizing to lowest cell count within locus</td></tr>", file=logfile, append=T)
40
41 dat$locus_V = substring(dat$V_Segment_Major_Gene, 0, 4)
42 dat$locus_J = substring(dat$J_Segment_Major_Gene, 0, 4)
43 min_cell_count = data.frame(data.table(dat)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("Patient", "locus_V", "locus_J")])
44
45 dat$min_cell_paste = paste(dat$Patient, dat$locus_V, dat$locus_J)
46 min_cell_count$min_cell_paste = paste(min_cell_count$Patient, min_cell_count$locus_V, min_cell_count$locus_J)
47
48 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
49 print(paste("rows:", nrow(dat)))
50 dat = merge(dat, min_cell_count, by="min_cell_paste")
51 print(paste("rows:", nrow(dat)))
52 dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * dat$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2
53
54 dat = dat[dat$normalized_read_count >= min_cells,]
55
56 dat$paste = paste(dat$Sample, dat$Clone_Sequence)
57
58 patients = split(dat, dat$Patient, drop=T)
59 intervalReads = rev(c(0,10,25,50,100,250,500,750,1000,10000))
60 intervalFreq = rev(c(0,0.01,0.05,0.1,0.5,1,5))
61 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV")
62 J_Segments = c(".*", ".*", ".*", "IGKJ", "KDE", ".*", ".*", ".*", ".*", ".*")
63 Titles = c("Total", "IGH-Vh-Jh", "IGH-Dh-Jh", "Vk-Jk", "Vk-Kde" , "Intron-Kde", "TCRG", "TCRD-Vd-Dd", "TCRD-Dd-Dd", "TCRB-Vb-Jb")
64 Titles = factor(Titles, levels=Titles)
65 TitlesOrder = data.frame("Title"=Titles, "TitlesOrder"=1:length(Titles))
66
67 single_patients = data.frame("Patient" = character(0),"Sample" = character(0), "on" = character(0), "Clone_Sequence" = character(0), "Frequency" = numeric(0), "normalized_read_count" = numeric(0), "V_Segment_Major_Gene" = character(0), "J_Segment_Major_Gene" = character(0), "Rearrangement" = character(0))
68
69 patient.merge.list = list() #cache the 'both' table, 2x speedup for more memory...
70 patient.merge.list.second = list()
71 scatter_locus_data_list = list()
72 cat(paste("<table border='0' style='font-family:courier;'>", sep=""), file="multiple_matches.html", append=T)
73 cat(paste("<table border='0' style='font-family:courier;'>", sep=""), file="single_matches.html", append=T)
74 patientCountOnColumn <- function(x, product, interval, on, appendtxt=F){
75 if (!is.data.frame(x) & is.list(x)){
76 x = x[[1]]
77 }
78 #x$Sample = factor(x$Sample, levels=unique(x$Sample))
79 x = data.frame(x,stringsAsFactors=F)
80 onShort = "reads"
81 if(on == "Frequency"){
82 onShort = "freq"
83 }
84 onx = paste(on, ".x", sep="")
85 ony = paste(on, ".y", sep="")
86 splt = split(x, x$Sample, drop=T)
87 type="pair"
88 if(length(splt) == 1){
89 print(paste(paste(x[1,which(colnames(x) == "Patient")]), "has one sample"))
90 splt[[2]] = data.frame("Patient" = character(0), "Receptor" = character(0), "Sample" = character(0), "Cell_Count" = numeric(0), "Clone_Molecule_Count_From_Spikes" = numeric(0), "Log10_Frequency" = numeric(0), "Total_Read_Count" = numeric(0), "dsMol_per_1e6_cells" = numeric(0), "J_Segment_Major_Gene" = character(0), "V_Segment_Major_Gene" = character(0), "Clone_Sequence" = character(0), "CDR3_Sense_Sequence" = character(0), "Related_to_leukemia_clone" = logical(0), "Frequency"= numeric(0), "normalized_read_count" = numeric(0), "paste" = character(0))
91 type="single"
92 }
93 patient1 = splt[[1]]
94 patient2 = splt[[2]]
95
96 threshholdIndex = which(colnames(product) == "interval")
97 V_SegmentIndex = which(colnames(product) == "V_Segments")
98 J_SegmentIndex = which(colnames(product) == "J_Segments")
99 titleIndex = which(colnames(product) == "Titles")
100 sampleIndex = which(colnames(x) == "Sample")
101 patientIndex = which(colnames(x) == "Patient")
102 oneSample = paste(patient1[1,sampleIndex], sep="")
103 twoSample = paste(patient2[1,sampleIndex], sep="")
104 patient = paste(x[1,patientIndex])
105
106 switched = F
107 if(length(grep(".*_Right$", twoSample)) == 1 || length(grep(".*_Dx_BM$", twoSample)) == 1 || length(grep(".*_Dx$", twoSample)) == 1 ){
108 tmp = twoSample
109 twoSample = oneSample
110 oneSample = tmp
111 tmp = patient1
112 patient1 = patient2
113 patient2 = tmp
114 switched = T
115 }
116 if(appendtxt){
117 cat(paste(patient, oneSample, twoSample, type, sep="\t"), file="patients.txt", append=T, sep="", fill=3)
118 }
119 cat(paste("<tr><td>", patient, "</td>", sep=""), file=logfile, append=T)
120
121 if(mergeOn == "Clone_Sequence"){
122 patient1$merge = paste(patient1$Clone_Sequence)
123 patient2$merge = paste(patient2$Clone_Sequence)
124 } else {
125 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
126 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
127 }
128
129 scatterplot_data_columns = c("Patient", "Sample", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge")
130 #scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns])
131 scatterplot_data = patient1[NULL,scatterplot_data_columns]
132 #scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),]
133 #scatterplot_data$type = factor(x=oneSample, levels=c(oneSample, twoSample, "In Both"))
134 scatterplot.data.type.factor = c(oneSample, twoSample, paste(c(oneSample, twoSample), "In Both"))
135 #scatterplot_data$type = factor(x=NULL, levels=scatterplot.data.type.factor)
136 scatterplot_data$type = character(0)
137 scatterplot_data$link = numeric(0)
138 scatterplot_data$on = character(0)
139
140 #patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge") #merge alles 'fuzzy'
141 patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge")[NULL,] #blegh
142
143 cs.exact.matches = patient1[patient1$Clone_Sequence %in% patient2$Clone_Sequence,]$Clone_Sequence
144
145 start.time = proc.time()
146 merge.list = c()
147
148 if(patient %in% names(patient.merge.list)){
149 patientMerge = patient.merge.list[[patient]]
150 merge.list[["second"]] = patient.merge.list.second[[patient]]
151 scatterplot_data = scatter_locus_data_list[[patient]]
152 cat(paste("<td>", nrow(patient1), " in ", oneSample, " and ", nrow(patient2), " in ", twoSample, ", ", nrow(patientMerge), " in both (fetched from cache)</td></tr>", sep=""), file=logfile, append=T)
153
154 print(names(patient.merge.list))
155 } else {
156 #fuzzy matching here...
157 #merge.list = patientMerge$merge
158
159 #patient1.fuzzy = patient1[!(patient1$merge %in% merge.list),]
160 #patient2.fuzzy = patient2[!(patient2$merge %in% merge.list),]
161
162 patient1.fuzzy = patient1
163 patient2.fuzzy = patient2
164
165 #patient1.fuzzy$merge = paste(patient1.fuzzy$V_Segment_Major_Gene, patient1.fuzzy$J_Segment_Major_Gene, patient1.fuzzy$CDR3_Sense_Sequence)
166 #patient2.fuzzy$merge = paste(patient2.fuzzy$V_Segment_Major_Gene, patient2.fuzzy$J_Segment_Major_Gene, patient2.fuzzy$CDR3_Sense_Sequence)
167
168 #patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J, patient1.fuzzy$CDR3_Sense_Sequence)
169 #patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J, patient2.fuzzy$CDR3_Sense_Sequence)
170
171 patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J)
172 patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J)
173
174 #merge.freq.table = data.frame(table(c(patient1.fuzzy[!duplicated(patient1.fuzzy$merge),"merge"], patient2.fuzzy[!duplicated(patient2.fuzzy$merge),"merge"]))) #also remove?
175 #merge.freq.table.gt.1 = merge.freq.table[merge.freq.table$Freq > 1,]
176
177 #patient1.fuzzy = patient1.fuzzy[patient1.fuzzy$merge %in% merge.freq.table.gt.1$Var1,]
178 #patient2.fuzzy = patient2.fuzzy[patient2.fuzzy$merge %in% merge.freq.table.gt.1$Var1,]
179
180 patient.fuzzy = rbind(patient1.fuzzy, patient2.fuzzy)
181 patient.fuzzy = patient.fuzzy[order(nchar(patient.fuzzy$Clone_Sequence)),]
182
183 merge.list = list()
184
185 merge.list[["second"]] = vector()
186
187 link.count = 1
188
189 while(nrow(patient.fuzzy) > 1){
190 first.merge = patient.fuzzy[1,"merge"]
191 first.clone.sequence = patient.fuzzy[1,"Clone_Sequence"]
192 first.sample = patient.fuzzy[1,"Sample"]
193 merge.filter = first.merge == patient.fuzzy$merge
194
195 #length.filter = nchar(patient.fuzzy$Clone_Sequence) - nchar(first.clone.sequence) <= 9
196
197 first.sample.filter = first.sample == patient.fuzzy$Sample
198 second.sample.filter = first.sample != patient.fuzzy$Sample
199
200 #first match same sample, sum to a single row, same for other sample
201 #then merge rows like 'normal'
202
203 sequence.filter = grepl(paste("^", first.clone.sequence, sep=""), patient.fuzzy$Clone_Sequence)
204
205
206
207 #match.filter = merge.filter & grepl(first.clone.sequence, patient.fuzzy$Clone_Sequence) & length.filter & sample.filter
208 first.match.filter = merge.filter & sequence.filter & first.sample.filter
209 second.match.filter = merge.filter & sequence.filter & second.sample.filter
210
211 first.rows = patient.fuzzy[first.match.filter,]
212 second.rows = patient.fuzzy[second.match.filter,]
213
214 first.rows.v = table(first.rows$V_Segment_Major_Gene)
215 first.rows.v = names(first.rows.v[which.max(first.rows.v)])
216 first.rows.j = table(first.rows$J_Segment_Major_Gene)
217 first.rows.j = names(first.rows.j[which.max(first.rows.j)])
218
219 first.sum = data.frame(merge = first.clone.sequence,
220 Patient = patient,
221 Receptor = first.rows[1,"Receptor"],
222 Sample = first.rows[1,"Sample"],
223 Cell_Count = first.rows[1,"Cell_Count"],
224 Clone_Molecule_Count_From_Spikes = sum(first.rows$Clone_Molecule_Count_From_Spikes),
225 Log10_Frequency = log10(sum(first.rows$Frequency)),
226 Total_Read_Count = sum(first.rows$Total_Read_Count),
227 dsPerM = sum(first.rows$dsPerM),
228 J_Segment_Major_Gene = first.rows.j,
229 V_Segment_Major_Gene = first.rows.v,
230 Clone_Sequence = first.clone.sequence,
231 CDR3_Sense_Sequence = first.rows[1,"CDR3_Sense_Sequence"],
232 Related_to_leukemia_clone = F,
233 Frequency = sum(first.rows$Frequency),
234 locus_V = first.rows[1,"locus_V"],
235 locus_J = first.rows[1,"locus_J"],
236 min_cell_count = first.rows[1,"min_cell_count"],
237 normalized_read_count = sum(first.rows$normalized_read_count),
238 paste = first.rows[1,"paste"],
239 min_cell_paste = first.rows[1,"min_cell_paste"])
240
241 if(nrow(second.rows) > 0){
242 second.rows.v = table(second.rows$V_Segment_Major_Gene)
243 second.rows.v = names(second.rows.v[which.max(second.rows.v)])
244 second.rows.j = table(second.rows$J_Segment_Major_Gene)
245 second.rows.j = names(second.rows.j[which.max(second.rows.j)])
246
247 second.sum = data.frame(merge = first.clone.sequence,
248 Patient = patient,
249 Receptor = second.rows[1,"Receptor"],
250 Sample = second.rows[1,"Sample"],
251 Cell_Count = second.rows[1,"Cell_Count"],
252 Clone_Molecule_Count_From_Spikes = sum(second.rows$Clone_Molecule_Count_From_Spikes),
253 Log10_Frequency = log10(sum(second.rows$Frequency)),
254 Total_Read_Count = sum(second.rows$Total_Read_Count),
255 dsPerM = sum(second.rows$dsPerM),
256 J_Segment_Major_Gene = second.rows.j,
257 V_Segment_Major_Gene = second.rows.v,
258 Clone_Sequence = first.clone.sequence,
259 CDR3_Sense_Sequence = second.rows[1,"CDR3_Sense_Sequence"],
260 Related_to_leukemia_clone = F,
261 Frequency = sum(second.rows$Frequency),
262 locus_V = second.rows[1,"locus_V"],
263 locus_J = second.rows[1,"locus_J"],
264 min_cell_count = second.rows[1,"min_cell_count"],
265 normalized_read_count = sum(second.rows$normalized_read_count),
266 paste = second.rows[1,"paste"],
267 min_cell_paste = second.rows[1,"min_cell_paste"])
268
269 patientMerge = rbind(patientMerge, merge(first.sum, second.sum, by="merge"))
270 patient.fuzzy = patient.fuzzy[!(first.match.filter | second.match.filter),]
271
272 hidden.clone.sequences = c(first.rows[-1,"Clone_Sequence"], second.rows[second.rows$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
273 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
274
275 tmp.rows = rbind(first.rows, second.rows)
276 tmp.rows = tmp.rows[order(nchar(tmp.rows$Clone_Sequence)),]
277
278
279 #add to the scatterplot data
280 scatterplot.row = first.sum[,scatterplot_data_columns]
281 scatterplot.row$type = paste(first.sum[,"Sample"], "In Both")
282 scatterplot.row$link = link.count
283 scatterplot.row$on = onShort
284
285 scatterplot_data = rbind(scatterplot_data, scatterplot.row)
286
287 scatterplot.row = second.sum[,scatterplot_data_columns]
288 scatterplot.row$type = paste(second.sum[,"Sample"], "In Both")
289 scatterplot.row$link = link.count
290 scatterplot.row$on = onShort
291
292 scatterplot_data = rbind(scatterplot_data, scatterplot.row)
293
294 #write some information about the match to a log file
295 if (nrow(first.rows) > 1 | nrow(second.rows) > 1) {
296 cat(paste("<tr><td>", patient, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="multiple_matches.html", append=T)
297 } else {
298 second.clone.sequence = second.rows[1,"Clone_Sequence"]
299 if(nchar(first.clone.sequence) != nchar(second.clone.sequence)){
300 cat(paste("<tr bgcolor='#DDD'><td>", patient, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="single_matches.html", append=T)
301 } else {
302 #cat(paste("<tr><td>", patient, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="single_matches.html", append=T)
303 }
304 }
305
306 } else if(nrow(first.rows) > 1) {
307 if(patient1[1,"Sample"] == first.sample){
308 patient1 = patient1[!(patient1$Clone_Sequence %in% first.rows$Clone_Sequence),]
309 patient1 = rbind(patient1, first.sum)
310 } else {
311 patient2 = patient2[!(patient2$Clone_Sequence %in% first.rows$Clone_Sequence),]
312 patient2 = rbind(patient2, first.sum)
313 }
314
315 hidden.clone.sequences = c(first.rows[-1,"Clone_Sequence"])
316 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
317
318 patient.fuzzy = patient.fuzzy[-first.match.filter,]
319
320 #add to the scatterplot data
321 scatterplot.row = first.sum[,scatterplot_data_columns]
322 scatterplot.row$type = first.sum[,"Sample"]
323 scatterplot.row$link = link.count
324 scatterplot.row$on = onShort
325
326 scatterplot_data = rbind(scatterplot_data, scatterplot.row)
327
328 cat(paste("<tr bgcolor='#DDF'><td>", patient, " row ", 1:nrow(first.rows), "</td><td>", first.rows$Sample, ":</td><td>", first.rows$Clone_Sequence, "</td><td>", first.rows$normalized_read_count, "</td></tr>", sep=""), file="single_matches.html", append=T)
329 } else {
330 patient.fuzzy = patient.fuzzy[-1,]
331
332 #add to the scatterplot data
333 scatterplot.row = first.sum[,scatterplot_data_columns]
334 scatterplot.row$type = first.sum[,"Sample"]
335 scatterplot.row$link = link.count
336 scatterplot.row$on = onShort
337
338 scatterplot_data = rbind(scatterplot_data, scatterplot.row)
339 }
340 link.count = link.count + 1
341 }
342 patient.merge.list[[patient]] <<- patientMerge
343 patient.merge.list.second[[patient]] <<- merge.list[["second"]]
344
345 sample.order = data.frame(type = c(oneSample, twoSample, paste(c(oneSample, twoSample), "In Both")),type.order = 1:4)
346 scatterplot_data = merge(scatterplot_data, sample.order, by="type")
347
348 scatter_locus_data_list[[patient]] <<- scatterplot_data
349 cat(paste("<td>", nrow(patient1), " in ", oneSample, " and ", nrow(patient2), " in ", twoSample, ", ", nrow(patientMerge), " in both (finding both took ", (proc.time() - start.time)[[3]], "s)</td></tr>", sep=""), file=logfile, append=T)
350 }
351
352 patient1 = patient1[!(patient1$Clone_Sequence %in% patient.merge.list.second[[patient]]),]
353 patient2 = patient2[!(patient2$Clone_Sequence %in% patient.merge.list.second[[patient]]),]
354
355
356 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony])
357 #patientMerge$thresholdValue = pmin(patientMerge[,onx], patientMerge[,ony])
358 res1 = vector()
359 res2 = vector()
360 resBoth = vector()
361 read1Count = vector()
362 read2Count = vector()
363 locussum1 = vector()
364 locussum2 = vector()
365
366 #for(iter in 1){
367 for(iter in 1:length(product[,1])){
368 threshhold = product[iter,threshholdIndex]
369 V_Segment = paste(".*", as.character(product[iter,V_SegmentIndex]), ".*", sep="")
370 J_Segment = paste(".*", as.character(product[iter,J_SegmentIndex]), ".*", sep="")
371 #both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,onx] > threshhold & patientMerge[,ony] > threshhold) #both higher than threshold
372 both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold) #highest of both is higher than threshold
373 one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$merge %in% patientMerge[both,]$merge))
374 two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$merge %in% patientMerge[both,]$merge))
375 read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count))
376 read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count))
377 res1 = append(res1, sum(one))
378 res2 = append(res2, sum(two))
379 resBoth = append(resBoth, sum(both))
380 locussum1 = append(locussum1, sum(patient1[(grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene)),]$normalized_read_count))
381 locussum2 = append(locussum2, sum(patient2[(grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene)),]$normalized_read_count))
382 #threshhold = 0
383 if(threshhold != 0){
384 if(sum(one) > 0){
385 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
386 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone Sequence", "Related_to_leukemia_clone")
387 filenameOne = paste(oneSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
388 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
389 }
390 if(sum(two) > 0){
391 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
392 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone Sequence", "Related_to_leukemia_clone")
393 filenameTwo = paste(twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
394 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
395 }
396 } else {
397 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
398 if(nrow(scatterplot_locus_data) > 0){
399 scatterplot_locus_data$Rearrangement = product[iter, titleIndex]
400 }
401
402
403
404 p = NULL
405 print(paste("nrow scatterplot_locus_data", nrow(scatterplot_locus_data)))
406 if(nrow(scatterplot_locus_data) != 0){
407 if(on == "normalized_read_count"){
408 write.table(scatterplot_locus_data, file=paste(oneSample, twoSample, product[iter, titleIndex], "scatterplot_locus_data.txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
409 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
410 p = ggplot(scatterplot_locus_data, aes(factor(reorder(type, type.order)), normalized_read_count, group=link)) + geom_line() + scale_y_log10(breaks=scales,labels=scales, limits=c(1,1e6)) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE)
411 } else {
412 p = ggplot(scatterplot_locus_data, aes(factor(reorder(type, type.order)), Frequency, group=link)) + geom_line() + scale_y_log10(limits=c(0.0001,100), breaks=c(0.0001, 0.001, 0.01, 0.1, 1, 10, 100), labels=c("0.0001", "0.001", "0.01", "0.1", "1", "10", "100")) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE)
413 }
414 p = p + geom_point(aes(colour=type), position="dodge")
415 p = p + xlab("In one or both samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex]))
416 } else {
417 p = ggplot(NULL, aes(x=c("In one", "In Both"),y=0)) + geom_blank(NULL) + xlab("In one or both of the samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex]))
418 }
419 png(paste(patient1[1,patientIndex], "_", patient1[1,sampleIndex], "_", patient2[1,sampleIndex], "_", onShort, "_", product[iter, titleIndex],"_scatter.png", sep=""))
420 print(p)
421 dev.off()
422 }
423 if(sum(both) > 0){
424 dfBoth = patientMerge[both,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
425 colnames(dfBoth) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample))
426 filenameBoth = paste(oneSample, "_", twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
427 write.table(dfBoth, file=paste(filenameBoth, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
428 }
429 }
430 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "Both"=resBoth, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "Sum"=res1 + res2 + resBoth, "percentage" = round((resBoth/(res1 + res2 + resBoth)) * 100, digits=2), "Locus_sum1"=locussum1, "Locus_sum2"=locussum2)
431 if(sum(is.na(patientResult$percentage)) > 0){
432 patientResult[is.na(patientResult$percentage),]$percentage = 0
433 }
434 colnames(patientResult)[6] = oneSample
435 colnames(patientResult)[8] = twoSample
436 colnamesBak = colnames(patientResult)
437 colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", paste("Number of sequences ", patient, "_Both", sep=""), paste("Number of sequences", oneSample, sep=""), paste("Normalized Read Count", oneSample), paste("Number of sequences", twoSample, sep=""), paste("Normalized Read Count", twoSample), paste("Sum number of sequences", patient), paste("Percentage of sequences ", patient, "_Both", sep=""), paste("Locus Sum", oneSample), paste("Locus Sum", twoSample))
438 write.table(patientResult, file=paste(patient, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
439 colnames(patientResult) = colnamesBak
440
441 patientResult$Locus = factor(patientResult$Locus, Titles)
442 patientResult$cut_off_value = factor(patientResult$cut_off_value, paste(">", interval, sep=""))
443
444 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "Both")])
445 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=Both), stat='identity', position="dodge", fill="#79c36a")
446 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
447 plt = plt + geom_text(aes(ymax=max(Both), x=cut_off_value,y=Both,label=Both), angle=90, hjust=0)
448 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in both")
449 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
450 png(paste(patient, "_", onShort, ".png", sep=""), width=1920, height=1080)
451 print(plt)
452 dev.off()
453 #(t,r,b,l)
454 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "percentage")])
455 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=percentage), stat='identity', position="dodge", fill="#79c36a")
456 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
457 plt = plt + geom_text(aes(ymax=max(percentage), x=cut_off_value,y=percentage,label=percentage), angle=90, hjust=0)
458 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("% clones in both left and right")
459 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
460 png(paste(patient, "_percent_", onShort, ".png", sep=""), width=1920, height=1080)
461 print(plt)
462 dev.off()
463
464 patientResult = melt(patientResult[,c('Locus','cut_off_value', oneSample, twoSample)] ,id.vars=1:2)
465 patientResult$relativeValue = patientResult$value * 10
466 patientResult[patientResult$relativeValue == 0,]$relativeValue = 1
467 plt = ggplot(patientResult)
468 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=relativeValue, fill=variable), stat='identity', position="dodge")
469 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
470 plt = plt + scale_y_continuous(trans="log", breaks=10^c(0:10), labels=c(0, 10^c(0:9)))
471 plt = plt + geom_text(data=patientResult[patientResult$variable == oneSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=-0.2)
472 plt = plt + geom_text(data=patientResult[patientResult$variable == twoSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=0.8)
473 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle(paste("Number of clones in only ", oneSample, " and only ", twoSample, sep=""))
474 png(paste(patient, "_", onShort, "_both.png", sep=""), width=1920, height=1080)
475 print(plt)
476 dev.off()
477 }
478
479 cat("<tr><td>Starting Frequency analysis</td></tr>", file=logfile, append=T)
480
481 interval = intervalFreq
482 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
483 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
484 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T)
485
486 cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T)
487
488 interval = intervalReads
489 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
490 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
491 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="normalized_read_count")
492
493 if(nrow(single_patients) > 0){
494 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
495 p = ggplot(single_patients, aes(Rearrangement, normalized_read_count)) + scale_y_log10(breaks=scales,labels=as.character(scales)) + expand_limits(y=c(0,1000000))
496 p = p + geom_point(aes(colour=type), position="jitter")
497 p = p + xlab("In one or both samples") + ylab("Reads")
498 p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the reads of the patients with a single sample")
499 png("singles_reads_scatterplot.png", width=640 * length(unique(single_patients$Patient)) + 100, height=1080)
500 print(p)
501 dev.off()
502
503 #p = ggplot(single_patients, aes(Rearrangement, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
504 p = ggplot(single_patients, aes(Rearrangement, Frequency)) + scale_y_log10(limits=c(0.0001,100), breaks=c(0.0001, 0.001, 0.01, 0.1, 1, 10, 100), labels=c("0.0001", "0.001", "0.01", "0.1", "1", "10", "100")) + expand_limits(y=c(0,100))
505 p = p + geom_point(aes(colour=type), position="jitter")
506 p = p + xlab("In one or both samples") + ylab("Frequency")
507 p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the frequency of the patients with a single sample")
508 png("singles_freq_scatterplot.png", width=640 * length(unique(single_patients$Patient)) + 100, height=1080)
509 print(p)
510 dev.off()
511 } else {
512 empty <- data.frame()
513 p = ggplot(empty) + geom_point() + xlim(0, 10) + ylim(0, 100) + xlab("In one or both samples") + ylab("Frequency") + ggtitle("Scatterplot of the frequency of the patients with a single sample")
514
515 png("singles_reads_scatterplot.png", width=400, height=300)
516 print(p)
517 dev.off()
518
519 png("singles_freq_scatterplot.png", width=400, height=300)
520 print(p)
521 dev.off()
522 }
523
524 patient.merge.list = list() #cache the 'both' table, 2x speedup for more memory...
525 patient.merge.list.second = list()
526
527 tripletAnalysis <- function(patient1, label1, patient2, label2, patient3, label3, product, interval, on, appendTriplets= FALSE){
528 onShort = "reads"
529 if(on == "Frequency"){
530 onShort = "freq"
531 }
532 onx = paste(on, ".x", sep="")
533 ony = paste(on, ".y", sep="")
534 onz = paste(on, ".z", sep="")
535 type="triplet"
536
537 threshholdIndex = which(colnames(product) == "interval")
538 V_SegmentIndex = which(colnames(product) == "V_Segments")
539 J_SegmentIndex = which(colnames(product) == "J_Segments")
540 titleIndex = which(colnames(product) == "Titles")
541 sampleIndex = which(colnames(patient1) == "Sample")
542 patientIndex = which(colnames(patient1) == "Patient")
543 oneSample = paste(patient1[1,sampleIndex], sep="")
544 twoSample = paste(patient2[1,sampleIndex], sep="")
545 threeSample = paste(patient3[1,sampleIndex], sep="")
546
547 if(mergeOn == "Clone_Sequence"){
548 patient1$merge = paste(patient1$Clone_Sequence)
549 patient2$merge = paste(patient2$Clone_Sequence)
550 patient3$merge = paste(patient3$Clone_Sequence)
551
552 } else {
553 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
554 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
555 patient3$merge = paste(patient3$V_Segment_Major_Gene, patient3$J_Segment_Major_Gene, patient3$CDR3_Sense_Sequence)
556 }
557
558 #patientMerge = merge(patient1, patient2, by="merge")[NULL,]
559 patient1.fuzzy = patient1
560 patient2.fuzzy = patient2
561 patient3.fuzzy = patient3
562
563 cat(paste("<tr><td>", label1, "</td>", sep=""), file=logfile, append=T)
564
565 patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J)
566 patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J)
567 patient3.fuzzy$merge = paste(patient3.fuzzy$locus_V, patient3.fuzzy$locus_J)
568
569 patient.fuzzy = rbind(patient1.fuzzy ,patient2.fuzzy, patient3.fuzzy)
570 patient.fuzzy = patient.fuzzy[order(nchar(patient.fuzzy$Clone_Sequence)),]
571
572 other.sample.list = list()
573 other.sample.list[[oneSample]] = c(twoSample, threeSample)
574 other.sample.list[[twoSample]] = c(oneSample, threeSample)
575 other.sample.list[[threeSample]] = c(oneSample, twoSample)
576
577 patientMerge = merge(patient1, patient2, by="merge")
578 patientMerge = merge(patientMerge, patient3, by="merge")
579 colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge)))] = paste(colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge), perl=T))], ".z", sep="")
580 #patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
581 patientMerge = patientMerge[NULL,]
582
583 duo.merge.list = list()
584
585 patientMerge12 = merge(patient1, patient2, by="merge")
586 #patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony])
587 patientMerge12 = patientMerge12[NULL,]
588 duo.merge.list[[paste(oneSample, twoSample)]] = patientMerge12
589 duo.merge.list[[paste(twoSample, oneSample)]] = patientMerge12
590
591 patientMerge13 = merge(patient1, patient3, by="merge")
592 #patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony])
593 patientMerge13 = patientMerge13[NULL,]
594 duo.merge.list[[paste(oneSample, threeSample)]] = patientMerge13
595 duo.merge.list[[paste(threeSample, oneSample)]] = patientMerge13
596
597 patientMerge23 = merge(patient2, patient3, by="merge")
598 #patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony])
599 patientMerge23 = patientMerge23[NULL,]
600 duo.merge.list[[paste(twoSample, threeSample)]] = patientMerge23
601 duo.merge.list[[paste(threeSample, twoSample)]] = patientMerge23
602
603 merge.list = list()
604 merge.list[["second"]] = vector()
605
606 start.time = proc.time()
607 if(paste(label1, "123") %in% names(patient.merge.list)){
608 patientMerge = patient.merge.list[[paste(label1, "123")]]
609 patientMerge12 = patient.merge.list[[paste(label1, "12")]]
610 patientMerge13 = patient.merge.list[[paste(label1, "13")]]
611 patientMerge23 = patient.merge.list[[paste(label1, "23")]]
612
613 merge.list[["second"]] = patient.merge.list.second[[label1]]
614
615 cat(paste("<td>", nrow(patient1), " in ", label1, " and ", nrow(patient2), " in ", label2, nrow(patient3), " in ", label3, ", ", nrow(patientMerge), " in both (fetched from cache)</td></tr>", sep=""), file=logfile, append=T)
616 } else {
617 while(nrow(patient.fuzzy) > 0){
618 first.merge = patient.fuzzy[1,"merge"]
619 first.clone.sequence = patient.fuzzy[1,"Clone_Sequence"]
620 first.sample = patient.fuzzy[1,"Sample"]
621
622 merge.filter = first.merge == patient.fuzzy$merge
623
624 second.sample = other.sample.list[[first.sample]][1]
625 third.sample = other.sample.list[[first.sample]][2]
626
627 sample.filter.1 = first.sample == patient.fuzzy$Sample
628 sample.filter.2 = second.sample == patient.fuzzy$Sample
629 sample.filter.3 = third.sample == patient.fuzzy$Sample
630
631 sequence.filter = grepl(paste("^", first.clone.sequence, sep=""), patient.fuzzy$Clone_Sequence)
632
633 match.filter.1 = sample.filter.1 & sequence.filter & merge.filter
634 match.filter.2 = sample.filter.2 & sequence.filter & merge.filter
635 match.filter.3 = sample.filter.3 & sequence.filter & merge.filter
636
637 matches.in.1 = any(match.filter.1)
638 matches.in.2 = any(match.filter.2)
639 matches.in.3 = any(match.filter.3)
640
641
642
643 rows.1 = patient.fuzzy[match.filter.1,]
644
645 sum.1 = data.frame(merge = first.clone.sequence,
646 Patient = label1,
647 Receptor = rows.1[1,"Receptor"],
648 Sample = rows.1[1,"Sample"],
649 Cell_Count = rows.1[1,"Cell_Count"],
650 Clone_Molecule_Count_From_Spikes = sum(rows.1$Clone_Molecule_Count_From_Spikes),
651 Log10_Frequency = log10(sum(rows.1$Frequency)),
652 Total_Read_Count = sum(rows.1$Total_Read_Count),
653 dsPerM = sum(rows.1$dsPerM),
654 J_Segment_Major_Gene = rows.1[1,"J_Segment_Major_Gene"],
655 V_Segment_Major_Gene = rows.1[1,"V_Segment_Major_Gene"],
656 Clone_Sequence = first.clone.sequence,
657 CDR3_Sense_Sequence = rows.1[1,"CDR3_Sense_Sequence"],
658 Related_to_leukemia_clone = F,
659 Frequency = sum(rows.1$Frequency),
660 locus_V = rows.1[1,"locus_V"],
661 locus_J = rows.1[1,"locus_J"],
662 uniqueID = rows.1[1,"uniqueID"],
663 normalized_read_count = sum(rows.1$normalized_read_count))
664 sum.2 = sum.1[NULL,]
665 rows.2 = patient.fuzzy[match.filter.2,]
666 if(matches.in.2){
667 sum.2 = data.frame(merge = first.clone.sequence,
668 Patient = label1,
669 Receptor = rows.2[1,"Receptor"],
670 Sample = rows.2[1,"Sample"],
671 Cell_Count = rows.2[1,"Cell_Count"],
672 Clone_Molecule_Count_From_Spikes = sum(rows.2$Clone_Molecule_Count_From_Spikes),
673 Log10_Frequency = log10(sum(rows.2$Frequency)),
674 Total_Read_Count = sum(rows.2$Total_Read_Count),
675 dsPerM = sum(rows.2$dsPerM),
676 J_Segment_Major_Gene = rows.2[1,"J_Segment_Major_Gene"],
677 V_Segment_Major_Gene = rows.2[1,"V_Segment_Major_Gene"],
678 Clone_Sequence = first.clone.sequence,
679 CDR3_Sense_Sequence = rows.2[1,"CDR3_Sense_Sequence"],
680 Related_to_leukemia_clone = F,
681 Frequency = sum(rows.2$Frequency),
682 locus_V = rows.2[1,"locus_V"],
683 locus_J = rows.2[1,"locus_J"],
684 uniqueID = rows.2[1,"uniqueID"],
685 normalized_read_count = sum(rows.2$normalized_read_count))
686 }
687
688 sum.3 = sum.1[NULL,]
689 rows.3 = patient.fuzzy[match.filter.3,]
690 if(matches.in.3){
691 sum.3 = data.frame(merge = first.clone.sequence,
692 Patient = label1,
693 Receptor = rows.3[1,"Receptor"],
694 Sample = rows.3[1,"Sample"],
695 Cell_Count = rows.3[1,"Cell_Count"],
696 Clone_Molecule_Count_From_Spikes = sum(rows.3$Clone_Molecule_Count_From_Spikes),
697 Log10_Frequency = log10(sum(rows.3$Frequency)),
698 Total_Read_Count = sum(rows.3$Total_Read_Count),
699 dsPerM = sum(rows.3$dsPerM),
700 J_Segment_Major_Gene = rows.3[1,"J_Segment_Major_Gene"],
701 V_Segment_Major_Gene = rows.3[1,"V_Segment_Major_Gene"],
702 Clone_Sequence = first.clone.sequence,
703 CDR3_Sense_Sequence = rows.3[1,"CDR3_Sense_Sequence"],
704 Related_to_leukemia_clone = F,
705 Frequency = sum(rows.3$Frequency),
706 locus_V = rows.3[1,"locus_V"],
707 locus_J = rows.3[1,"locus_J"],
708 uniqueID = rows.3[1,"uniqueID"],
709 normalized_read_count = sum(rows.3$normalized_read_count))
710 }
711
712 if(matches.in.2 & matches.in.3){
713 merge.123 = merge(sum.1, sum.2, by="merge")
714 merge.123 = merge(merge.123, sum.3, by="merge")
715 colnames(merge.123)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(merge.123)))] = paste(colnames(merge.123)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(merge.123), perl=T))], ".z", sep="")
716 #merge.123$thresholdValue = pmax(merge.123[,onx], merge.123[,ony], merge.123[,onz])
717
718 patientMerge = rbind(patientMerge, merge.123)
719 patient.fuzzy = patient.fuzzy[!(match.filter.1 | match.filter.2 | match.filter.3),]
720
721 hidden.clone.sequences = c(rows.1[-1,"Clone_Sequence"], rows.2[rows.2$Clone_Sequence != first.clone.sequence,"Clone_Sequence"], rows.3[rows.3$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
722 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
723
724 } else if (matches.in.2) {
725 #other.sample1 = other.sample.list[[first.sample]][1]
726 #other.sample2 = other.sample.list[[first.sample]][2]
727
728 second.sample = sum.2[,"Sample"]
729
730 current.merge.list = duo.merge.list[[paste(first.sample, second.sample)]]
731
732 merge.12 = merge(sum.1, sum.2, by="merge")
733
734 current.merge.list = rbind(current.merge.list, merge.12)
735 duo.merge.list[[paste(first.sample, second.sample)]] = current.merge.list
736
737 patient.fuzzy = patient.fuzzy[!(match.filter.1 | match.filter.2),]
738
739 hidden.clone.sequences = c(rows.1[-1,"Clone_Sequence"], rows.2[rows.2$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
740 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
741
742 } else if (matches.in.3) {
743
744 #other.sample1 = other.sample.list[[first.sample]][1]
745 #other.sample2 = other.sample.list[[first.sample]][2]
746
747 second.sample = sum.3[,"Sample"]
748
749 current.merge.list = duo.merge.list[[paste(first.sample, second.sample)]]
750
751 merge.13 = merge(sum.1, sum.3, by="merge")
752
753 current.merge.list = rbind(current.merge.list, merge.13)
754 duo.merge.list[[paste(first.sample, second.sample)]] = current.merge.list
755
756 patient.fuzzy = patient.fuzzy[!(match.filter.1 | match.filter.3),]
757
758 hidden.clone.sequences = c(rows.1[-1,"Clone_Sequence"], rows.3[rows.3$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
759 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
760
761 } else if(nrow(rows.1) > 1){
762 patient1 = patient1[!(patient1$Clone_Sequence %in% rows.1$Clone_Sequence),]
763 print(names(patient1)[names(patient1) %in% sum.1])
764 print(names(patient1)[!(names(patient1) %in% sum.1)])
765 print(names(patient1))
766 print(names(sum.1))
767 print(summary(sum.1))
768 print(summary(patient1))
769 print(dim(sum.1))
770 print(dim(patient1))
771 print(head(sum.1[,names(patient1)]))
772 patient1 = rbind(patient1, sum.1[,names(patient1)])
773 patient.fuzzy = patient.fuzzy[-match.filter.1,]
774 } else {
775 patient.fuzzy = patient.fuzzy[-1,]
776 }
777
778 tmp.rows = rbind(rows.1, rows.2, rows.3)
779 tmp.rows = tmp.rows[order(nchar(tmp.rows$Clone_Sequence)),]
780
781 if (sum(match.filter.1) > 1 | sum(match.filter.2) > 1 | sum(match.filter.1) > 1) {
782 cat(paste("<tr><td>", label1, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="multiple_matches.html", append=T)
783 } else {
784 }
785
786 }
787 patient.merge.list[[paste(label1, "123")]] = patientMerge
788
789 patientMerge12 = duo.merge.list[[paste(oneSample, twoSample)]]
790 patientMerge13 = duo.merge.list[[paste(oneSample, threeSample)]]
791 patientMerge23 = duo.merge.list[[paste(twoSample, threeSample)]]
792
793 patient.merge.list[[paste(label1, "12")]] = patientMerge12
794 patient.merge.list[[paste(label1, "13")]] = patientMerge13
795 patient.merge.list[[paste(label1, "23")]] = patientMerge23
796
797 patient.merge.list.second[[label1]] = merge.list[["second"]]
798 }
799 cat(paste("<td>", nrow(patient1), " in ", label1, " and ", nrow(patient2), " in ", label2, nrow(patient3), " in ", label3, ", ", nrow(patientMerge), " in both (finding both took ", (proc.time() - start.time)[[3]], "s)</td></tr>", sep=""), file=logfile, append=T)
800 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
801 patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony])
802 patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony])
803 patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony])
804
805 #patientMerge$thresholdValue = pmin(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
806 #patientMerge12$thresholdValue = pmin(patientMerge12[,onx], patientMerge12[,ony])
807 #patientMerge13$thresholdValue = pmin(patientMerge13[,onx], patientMerge13[,ony])
808 #patientMerge23$thresholdValue = pmin(patientMerge23[,onx], patientMerge23[,ony])
809
810 patient1 = patient1[!(patient1$Clone_Sequence %in% merge.list[["second"]]),]
811 patient2 = patient2[!(patient2$Clone_Sequence %in% merge.list[["second"]]),]
812 patient3 = patient3[!(patient3$Clone_Sequence %in% merge.list[["second"]]),]
813
814 if(F){
815 patientMerge = merge(patient1, patient2, by="merge")
816 patientMerge = merge(patientMerge, patient3, by="merge")
817 colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge)))] = paste(colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge), perl=T))], ".z", sep="")
818 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
819 patientMerge12 = merge(patient1, patient2, by="merge")
820 patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony])
821 patientMerge13 = merge(patient1, patient3, by="merge")
822 patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony])
823 patientMerge23 = merge(patient2, patient3, by="merge")
824 patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony])
825 }
826
827 scatterplot_data_columns = c("Clone_Sequence", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge")
828 scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns], patient3[,scatterplot_data_columns])
829 scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),]
830 scatterplot_data$type = factor(x="In one", levels=c("In one", "In two", "In three", "In multiple"))
831
832 res1 = vector()
833 res2 = vector()
834 res3 = vector()
835 res12 = vector()
836 res13 = vector()
837 res23 = vector()
838 resAll = vector()
839 read1Count = vector()
840 read2Count = vector()
841 read3Count = vector()
842
843 if(appendTriplets){
844 cat(paste(label1, label2, label3, sep="\t"), file="triplets.txt", append=T, sep="", fill=3)
845 }
846 for(iter in 1:length(product[,1])){
847 threshhold = product[iter,threshholdIndex]
848 V_Segment = paste(".*", as.character(product[iter,V_SegmentIndex]), ".*", sep="")
849 J_Segment = paste(".*", as.character(product[iter,J_SegmentIndex]), ".*", sep="")
850 #all = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,onx] > threshhold & patientMerge[,ony] > threshhold & patientMerge[,onz] > threshhold)
851 all = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold)
852
853 one_two = (grepl(V_Segment, patientMerge12$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge12$J_Segment_Major_Gene.x) & patientMerge12$thresholdValue > threshhold & !(patientMerge12$merge %in% patientMerge[all,]$merge))
854 one_three = (grepl(V_Segment, patientMerge13$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge13$J_Segment_Major_Gene.x) & patientMerge13$thresholdValue > threshhold & !(patientMerge13$merge %in% patientMerge[all,]$merge))
855 two_three = (grepl(V_Segment, patientMerge23$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge23$J_Segment_Major_Gene.x) & patientMerge23$thresholdValue > threshhold & !(patientMerge23$merge %in% patientMerge[all,]$merge))
856
857 one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$merge %in% patientMerge[all,]$merge) & !(patient1$merge %in% patientMerge12[one_two,]$merge) & !(patient1$merge %in% patientMerge13[one_three,]$merge))
858 two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$merge %in% patientMerge[all,]$merge) & !(patient2$merge %in% patientMerge12[one_two,]$merge) & !(patient2$merge %in% patientMerge23[two_three,]$merge))
859 three = (grepl(V_Segment, patient3$V_Segment_Major_Gene) & grepl(J_Segment, patient3$J_Segment_Major_Gene) & patient3[,on] > threshhold & !(patient3$merge %in% patientMerge[all,]$merge) & !(patient3$merge %in% patientMerge13[one_three,]$merge) & !(patient3$merge %in% patientMerge23[two_three,]$merge))
860
861 read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.x))
862 read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.y))
863 read3Count = append(read3Count, sum(patient3[three,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.z))
864 res1 = append(res1, sum(one))
865 res2 = append(res2, sum(two))
866 res3 = append(res3, sum(three))
867 resAll = append(resAll, sum(all))
868 res12 = append(res12, sum(one_two))
869 res13 = append(res13, sum(one_three))
870 res23 = append(res23, sum(two_three))
871 #threshhold = 0
872 if(threshhold != 0){
873 if(sum(one) > 0){
874 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
875 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
876 filenameOne = paste(label1, "_", product[iter, titleIndex], "_", threshhold, sep="")
877 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
878 }
879 if(sum(two) > 0){
880 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
881 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
882 filenameTwo = paste(label2, "_", product[iter, titleIndex], "_", threshhold, sep="")
883 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
884 }
885 if(sum(three) > 0){
886 dfThree = patient3[three,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
887 colnames(dfThree) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
888 filenameThree = paste(label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
889 write.table(dfThree, file=paste(filenameThree, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
890 }
891 if(sum(one_two) > 0){
892 dfOne_two = patientMerge12[one_two,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
893 colnames(dfOne_two) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample))
894 filenameOne_two = paste(label1, "_", label2, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
895 write.table(dfOne_two, file=paste(filenameOne_two, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
896 }
897 if(sum(one_three) > 0){
898 dfOne_three = patientMerge13[one_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
899 colnames(dfOne_three) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
900 filenameOne_three = paste(label1, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
901 write.table(dfOne_three, file=paste(filenameOne_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
902 }
903 if(sum(two_three) > 0){
904 dfTwo_three = patientMerge23[two_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
905 colnames(dfTwo_three) = c(paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
906 filenameTwo_three = paste(label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
907 write.table(dfTwo_three, file=paste(filenameTwo_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
908 }
909 } else { #scatterplot data
910 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
911 scatterplot_locus_data = scatterplot_locus_data[!(scatterplot_locus_data$merge %in% merge.list[["second"]]),]
912 in_two = (scatterplot_locus_data$merge %in% patientMerge12[one_two,]$merge) | (scatterplot_locus_data$merge %in% patientMerge13[one_three,]$merge) | (scatterplot_locus_data$merge %in% patientMerge23[two_three,]$merge)
913 if(sum(in_two) > 0){
914 scatterplot_locus_data[in_two,]$type = "In two"
915 }
916 in_three = (scatterplot_locus_data$merge %in% patientMerge[all,]$merge)
917 if(sum(in_three)> 0){
918 scatterplot_locus_data[in_three,]$type = "In three"
919 }
920 not_in_one = scatterplot_locus_data$type != "In one"
921 if(sum(not_in_one) > 0){
922 #scatterplot_locus_data[not_in_one,]$type = "In multiple"
923 }
924 p = NULL
925 if(nrow(scatterplot_locus_data) != 0){
926 if(on == "normalized_read_count"){
927 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
928 p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales, limits=c(1, 1e6))
929 } else {
930 p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_log10(limits=c(0.0001,100), breaks=c(0.0001, 0.001, 0.01, 0.1, 1, 10, 100), labels=c("0.0001", "0.001", "0.01", "0.1", "1", "10", "100")) + expand_limits(y=c(0,100))
931 #p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
932 }
933 p = p + geom_point(aes(colour=type), position="jitter")
934 p = p + xlab("In one or in multiple samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex]))
935 } else {
936 p = ggplot(NULL, aes(x=c("In one", "In multiple"),y=0)) + geom_blank(NULL) + xlab("In two or in three of the samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex]))
937 }
938 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_", product[iter, titleIndex],"_scatter.png", sep=""))
939 print(p)
940 dev.off()
941 }
942 if(sum(all) > 0){
943 dfAll = patientMerge[all,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y", "V_Segment_Major_Gene.z", "J_Segment_Major_Gene.z", "normalized_read_count.z", "Frequency.z", "Related_to_leukemia_clone.z")]
944 colnames(dfAll) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample), paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
945 filenameAll = paste(label1, "_", label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
946 write.table(dfAll, file=paste(filenameAll, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
947 }
948 }
949 #patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "tmp3"=res3, "read_count3"=round(read3Count))
950 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "tmp2"=res2, "tmp3"=res3, "tmp12"=res12, "tmp13"=res13, "tmp23"=res23)
951 colnames(patientResult)[6] = oneSample
952 colnames(patientResult)[7] = twoSample
953 colnames(patientResult)[8] = threeSample
954 colnames(patientResult)[9] = paste(oneSample, twoSample, sep="_")
955 colnames(patientResult)[10] = paste(oneSample, twoSample, sep="_")
956 colnames(patientResult)[11] = paste(oneSample, twoSample, sep="_")
957
958 colnamesBak = colnames(patientResult)
959 colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", "Number of sequences All", paste("Number of sequences", oneSample), paste("Number of sequences", twoSample), paste("Number of sequences", threeSample), paste("Number of sequences", oneSample, twoSample), paste("Number of sequences", oneSample, threeSample), paste("Number of sequences", twoSample, threeSample))
960 write.table(patientResult, file=paste(label1, "_", label2, "_", label3, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
961 colnames(patientResult) = colnamesBak
962
963 patientResult$Locus = factor(patientResult$Locus, Titles)
964 patientResult$cut_off_value = factor(patientResult$cut_off_value, paste(">", interval, sep=""))
965
966 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "All")])
967 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=All), stat='identity', position="dodge", fill="#79c36a")
968 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
969 plt = plt + geom_text(aes(ymax=max(All), x=cut_off_value,y=All,label=All), angle=90, hjust=0)
970 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in All")
971 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
972 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_total_all.png", sep=""), width=1920, height=1080)
973 print(plt)
974 dev.off()
975
976 fontSize = 4
977
978 bak = patientResult
979 patientResult = melt(patientResult[,c('Locus','cut_off_value', oneSample, twoSample, threeSample)] ,id.vars=1:2)
980 patientResult$relativeValue = patientResult$value * 10
981 patientResult[patientResult$relativeValue == 0,]$relativeValue = 1
982 plt = ggplot(patientResult)
983 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=relativeValue, fill=variable), stat='identity', position="dodge")
984 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
985 plt = plt + scale_y_continuous(trans="log", breaks=10^c(0:10), labels=c(0, 10^c(0:9)))
986 plt = plt + geom_text(data=patientResult[patientResult$variable == oneSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=-0.7, size=fontSize)
987 plt = plt + geom_text(data=patientResult[patientResult$variable == twoSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=0.4, size=fontSize)
988 plt = plt + geom_text(data=patientResult[patientResult$variable == threeSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=1.5, size=fontSize)
989 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in only one sample")
990 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_indiv_all.png", sep=""), width=1920, height=1080)
991 print(plt)
992 dev.off()
993 }
994
995 if(nrow(triplets) != 0){
996
997 cat("<tr><td>Starting triplet analysis</td></tr>", file=logfile, append=T)
998
999 triplets$uniqueID = "ID"
1000
1001 triplets[grepl("16278_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
1002 triplets[grepl("26402_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
1003 triplets[grepl("26759_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
1004
1005 triplets[grepl("16278_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
1006 triplets[grepl("26402_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
1007 triplets[grepl("26759_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
1008
1009 triplets[grepl("14696", triplets$Patient),]$uniqueID = "14696"
1010
1011 cat("<tr><td>Normalizing to lowest cell count within locus</td></tr>", file=logfile, append=T)
1012
1013 triplets$locus_V = substring(triplets$V_Segment_Major_Gene, 0, 4)
1014 triplets$locus_J = substring(triplets$J_Segment_Major_Gene, 0, 4)
1015 min_cell_count = data.frame(data.table(triplets)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("uniqueID", "locus_V", "locus_J")])
1016
1017 triplets$min_cell_paste = paste(triplets$uniqueID, triplets$locus_V, triplets$locus_J)
1018 min_cell_count$min_cell_paste = paste(min_cell_count$uniqueID, min_cell_count$locus_V, min_cell_count$locus_J)
1019
1020 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
1021
1022 triplets = merge(triplets, min_cell_count, by="min_cell_paste")
1023
1024 triplets$normalized_read_count = round(triplets$Clone_Molecule_Count_From_Spikes / triplets$Cell_Count * triplets$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2
1025
1026 triplets = triplets[triplets$normalized_read_count >= min_cells,]
1027
1028 column_drops = c("min_cell_count", "min_cell_paste")
1029
1030 triplets = triplets[,!(colnames(triplets) %in% column_drops)]
1031
1032 cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T)
1033
1034 interval = intervalReads
1035 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
1036 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
1037
1038 one = triplets[triplets$Sample == "14696_reg_BM",]
1039 two = triplets[triplets$Sample == "24536_reg_BM",]
1040 three = triplets[triplets$Sample == "24062_reg_BM",]
1041 tripletAnalysis(one, "14696_1_Trio", two, "14696_2_Trio", three, "14696_3_Trio", product=product, interval=interval, on="normalized_read_count", T)
1042
1043 one = triplets[triplets$Sample == "16278_Left",]
1044 two = triplets[triplets$Sample == "26402_Left",]
1045 three = triplets[triplets$Sample == "26759_Left",]
1046 tripletAnalysis(one, "16278_Left_Trio", two, "26402_Left_Trio", three, "26759_Left_Trio", product=product, interval=interval, on="normalized_read_count", T)
1047
1048 one = triplets[triplets$Sample == "16278_Right",]
1049 two = triplets[triplets$Sample == "26402_Right",]
1050 three = triplets[triplets$Sample == "26759_Right",]
1051 tripletAnalysis(one, "16278_Right_Trio", two, "26402_Right_Trio", three, "26759_Right_Trio", product=product, interval=interval, on="normalized_read_count", T)
1052
1053 cat("<tr><td>Starting Frequency analysis</td></tr>", file=logfile, append=T)
1054
1055 interval = intervalFreq
1056 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
1057 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
1058
1059 one = triplets[triplets$Sample == "14696_reg_BM",]
1060 two = triplets[triplets$Sample == "24536_reg_BM",]
1061 three = triplets[triplets$Sample == "24062_reg_BM",]
1062 tripletAnalysis(one, "14696_1_Trio", two, "14696_2_Trio", three, "14696_3_Trio", product=product, interval=interval, on="Frequency", F)
1063
1064 one = triplets[triplets$Sample == "16278_Left",]
1065 two = triplets[triplets$Sample == "26402_Left",]
1066 three = triplets[triplets$Sample == "26759_Left",]
1067 tripletAnalysis(one, "16278_Left_Trio", two, "26402_Left_Trio", three, "26759_Left_Trio", product=product, interval=interval, on="Frequency", F)
1068
1069 one = triplets[triplets$Sample == "16278_Right",]
1070 two = triplets[triplets$Sample == "26402_Right",]
1071 three = triplets[triplets$Sample == "26759_Right",]
1072 tripletAnalysis(one, "16278_Right_Trio", two, "26402_Right_Trio", three, "26759_Right_Trio", product=product, interval=interval, on="Frequency", F)
1073 } else {
1074 cat("", file="triplets.txt")
1075 }
1076 cat("</table></html>", file=logfile, append=T)