0
|
1 args <- commandArgs(trailingOnly = TRUE)
|
|
2 options(scipen=999)
|
|
3
|
|
4 inFile = args[1]
|
|
5 outDir = args[2]
|
|
6 logfile = args[3]
|
|
7 min_freq = as.numeric(args[4])
|
|
8 min_cells = as.numeric(args[5])
|
|
9 mergeOn = args[6]
|
|
10
|
|
11 cat("<html><table><tr><td>Starting analysis</td></tr>", file=logfile, append=F)
|
|
12
|
|
13 library(ggplot2)
|
|
14 library(reshape2)
|
|
15 library(data.table)
|
|
16 library(grid)
|
|
17 library(parallel)
|
|
18 #require(xtable)
|
|
19 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T)
|
|
20 dat = read.table(inFile, header=T, sep="\t", dec=".", fill=T, stringsAsFactors=F)
|
2
|
21
|
|
22 needed_cols = c("Patient", "Receptor", "Sample", "Cell_Count", "Clone_Molecule_Count_From_Spikes", "Log10_Frequency", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence", "Clone_Sequence")
|
|
23 if(!all(needed_cols %in% names(dat))){
|
|
24 cat("Missing column(s):<br />", file=logfile, append=F)
|
|
25 missing_cols = needed_cols[!(needed_cols %in% names(dat))]
|
|
26 for(col in missing_cols){
|
|
27 cat(paste(col, "<br />"), file=logfile, append=T)
|
|
28 }
|
|
29 stop("Not all columns are present")
|
|
30 }
|
|
31
|
|
32 if(!("Total_Read_Count" %in% names(dat))){
|
|
33 dat$Total_Read_Count = 0
|
|
34 }
|
|
35
|
|
36 dat = dat[,c("Patient", "Receptor", "Sample", "Cell_Count", "Clone_Molecule_Count_From_Spikes", "Log10_Frequency", "Total_Read_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence", "Clone_Sequence")]
|
|
37
|
0
|
38 dat$dsPerM = 0
|
|
39 dat = dat[!is.na(dat$Patient),]
|
|
40 dat$Related_to_leukemia_clone = F
|
|
41
|
|
42 setwd(outDir)
|
|
43 cat("<tr><td>Selecting first V/J Genes</td></tr>", file=logfile, append=T)
|
|
44 dat$V_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$V_Segment_Major_Gene), "; "), "[[", 1)))
|
|
45 dat$J_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$J_Segment_Major_Gene), "; "), "[[", 1)))
|
|
46
|
|
47 cat("<tr><td>Calculating Frequency</td></tr>", file=logfile, append=T)
|
|
48
|
|
49 dat$Frequency = ((10^dat$Log10_Frequency)*100)
|
|
50
|
|
51 dat = dat[dat$Frequency >= min_freq,]
|
|
52
|
1
|
53 patient.sample.counts = data.frame(data.table(dat)[, list(count=.N), by=c("Patient", "Sample")])
|
|
54 patient.sample.counts = data.frame(data.table(patient.sample.counts)[, list(count=.N), by=c("Patient")])
|
|
55
|
|
56 print("Found the following patients with number of samples:")
|
|
57 print(patient.sample.counts)
|
|
58
|
|
59 patient.sample.counts.pairs = patient.sample.counts[patient.sample.counts$count %in% 1:2,]
|
|
60 patient.sample.counts.triplets = patient.sample.counts[patient.sample.counts$count == 3,]
|
|
61
|
|
62
|
|
63
|
|
64 triplets = dat[dat$Patient %in% patient.sample.counts.triplets$Patient,]
|
|
65 dat = dat[dat$Patient %in% patient.sample.counts.pairs$Patient,]
|
0
|
66
|
|
67 cat("<tr><td>Normalizing to lowest cell count within locus</td></tr>", file=logfile, append=T)
|
|
68
|
|
69 dat$locus_V = substring(dat$V_Segment_Major_Gene, 0, 4)
|
|
70 dat$locus_J = substring(dat$J_Segment_Major_Gene, 0, 4)
|
|
71 min_cell_count = data.frame(data.table(dat)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("Patient", "locus_V", "locus_J")])
|
|
72
|
|
73 dat$min_cell_paste = paste(dat$Patient, dat$locus_V, dat$locus_J)
|
|
74 min_cell_count$min_cell_paste = paste(min_cell_count$Patient, min_cell_count$locus_V, min_cell_count$locus_J)
|
|
75
|
|
76 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
|
|
77 print(paste("rows:", nrow(dat)))
|
|
78 dat = merge(dat, min_cell_count, by="min_cell_paste")
|
|
79 print(paste("rows:", nrow(dat)))
|
|
80 dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * dat$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2
|
|
81
|
|
82 dat = dat[dat$normalized_read_count >= min_cells,]
|
|
83
|
|
84 dat$paste = paste(dat$Sample, dat$Clone_Sequence)
|
|
85
|
|
86 patients = split(dat, dat$Patient, drop=T)
|
|
87 intervalReads = rev(c(0,10,25,50,100,250,500,750,1000,10000))
|
|
88 intervalFreq = rev(c(0,0.01,0.05,0.1,0.5,1,5))
|
|
89 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV")
|
|
90 J_Segments = c(".*", ".*", ".*", "IGKJ", "KDE", ".*", ".*", ".*", ".*", ".*")
|
|
91 Titles = c("Total", "IGH-Vh-Jh", "IGH-Dh-Jh", "Vk-Jk", "Vk-Kde" , "Intron-Kde", "TCRG", "TCRD-Vd-Dd", "TCRD-Dd-Dd", "TCRB-Vb-Jb")
|
|
92 Titles = factor(Titles, levels=Titles)
|
|
93 TitlesOrder = data.frame("Title"=Titles, "TitlesOrder"=1:length(Titles))
|
|
94
|
2
|
95 single_patients = dat[NULL,]
|
0
|
96
|
|
97 patient.merge.list = list() #cache the 'both' table, 2x speedup for more memory...
|
|
98 patient.merge.list.second = list()
|
|
99 scatter_locus_data_list = list()
|
|
100 cat(paste("<table border='0' style='font-family:courier;'>", sep=""), file="multiple_matches.html", append=T)
|
|
101 cat(paste("<table border='0' style='font-family:courier;'>", sep=""), file="single_matches.html", append=T)
|
|
102 patientCountOnColumn <- function(x, product, interval, on, appendtxt=F){
|
|
103 if (!is.data.frame(x) & is.list(x)){
|
|
104 x = x[[1]]
|
|
105 }
|
|
106 #x$Sample = factor(x$Sample, levels=unique(x$Sample))
|
|
107 x = data.frame(x,stringsAsFactors=F)
|
|
108 onShort = "reads"
|
|
109 if(on == "Frequency"){
|
|
110 onShort = "freq"
|
|
111 }
|
|
112 onx = paste(on, ".x", sep="")
|
|
113 ony = paste(on, ".y", sep="")
|
|
114 splt = split(x, x$Sample, drop=T)
|
|
115 type="pair"
|
|
116 if(length(splt) == 1){
|
|
117 print(paste(paste(x[1,which(colnames(x) == "Patient")]), "has one sample"))
|
2
|
118 splt[[2]] = splt[[1]][NULL,]
|
0
|
119 type="single"
|
|
120 }
|
|
121 patient1 = splt[[1]]
|
|
122 patient2 = splt[[2]]
|
|
123
|
2
|
124 oneSample = patient1[1,"Sample"]
|
|
125 twoSample = patient2[1,"Sample"]
|
|
126 patient = x[1,"Patient"]
|
|
127
|
0
|
128 switched = F
|
|
129 if(length(grep(".*_Right$", twoSample)) == 1 || length(grep(".*_Dx_BM$", twoSample)) == 1 || length(grep(".*_Dx$", twoSample)) == 1 ){
|
|
130 tmp = twoSample
|
|
131 twoSample = oneSample
|
|
132 oneSample = tmp
|
|
133 tmp = patient1
|
|
134 patient1 = patient2
|
|
135 patient2 = tmp
|
|
136 switched = T
|
|
137 }
|
|
138 if(appendtxt){
|
|
139 cat(paste(patient, oneSample, twoSample, type, sep="\t"), file="patients.txt", append=T, sep="", fill=3)
|
|
140 }
|
|
141 cat(paste("<tr><td>", patient, "</td>", sep=""), file=logfile, append=T)
|
|
142
|
|
143 if(mergeOn == "Clone_Sequence"){
|
|
144 patient1$merge = paste(patient1$Clone_Sequence)
|
|
145 patient2$merge = paste(patient2$Clone_Sequence)
|
|
146 } else {
|
|
147 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
|
|
148 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
|
|
149 }
|
|
150
|
|
151 scatterplot_data_columns = c("Patient", "Sample", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge")
|
|
152 scatterplot_data = patient1[NULL,scatterplot_data_columns]
|
|
153 scatterplot.data.type.factor = c(oneSample, twoSample, paste(c(oneSample, twoSample), "In Both"))
|
|
154 scatterplot_data$type = character(0)
|
|
155 scatterplot_data$link = numeric(0)
|
|
156 scatterplot_data$on = character(0)
|
|
157
|
|
158 patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge")[NULL,] #blegh
|
2
|
159
|
0
|
160 cs.exact.matches = patient1[patient1$Clone_Sequence %in% patient2$Clone_Sequence,]$Clone_Sequence
|
2
|
161
|
0
|
162 start.time = proc.time()
|
|
163 merge.list = c()
|
2
|
164
|
0
|
165 if(patient %in% names(patient.merge.list)){
|
|
166 patientMerge = patient.merge.list[[patient]]
|
|
167 merge.list[["second"]] = patient.merge.list.second[[patient]]
|
|
168 scatterplot_data = scatter_locus_data_list[[patient]]
|
|
169 cat(paste("<td>", nrow(patient1), " in ", oneSample, " and ", nrow(patient2), " in ", twoSample, ", ", nrow(patientMerge), " in both (fetched from cache)</td></tr>", sep=""), file=logfile, append=T)
|
2
|
170
|
0
|
171 print(names(patient.merge.list))
|
|
172 } else {
|
|
173 #fuzzy matching here...
|
2
|
174
|
0
|
175 patient1.fuzzy = patient1
|
|
176 patient2.fuzzy = patient2
|
2
|
177
|
0
|
178 patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J)
|
|
179 patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J)
|
2
|
180
|
0
|
181 patient.fuzzy = rbind(patient1.fuzzy, patient2.fuzzy)
|
|
182 patient.fuzzy = patient.fuzzy[order(nchar(patient.fuzzy$Clone_Sequence)),]
|
2
|
183
|
0
|
184 merge.list = list()
|
2
|
185
|
0
|
186 merge.list[["second"]] = vector()
|
2
|
187
|
|
188 link.count = 1
|
|
189
|
0
|
190 while(nrow(patient.fuzzy) > 1){
|
|
191 first.merge = patient.fuzzy[1,"merge"]
|
|
192 first.clone.sequence = patient.fuzzy[1,"Clone_Sequence"]
|
|
193 first.sample = patient.fuzzy[1,"Sample"]
|
|
194 merge.filter = first.merge == patient.fuzzy$merge
|
2
|
195
|
0
|
196 #length.filter = nchar(patient.fuzzy$Clone_Sequence) - nchar(first.clone.sequence) <= 9
|
2
|
197
|
0
|
198 first.sample.filter = first.sample == patient.fuzzy$Sample
|
|
199 second.sample.filter = first.sample != patient.fuzzy$Sample
|
2
|
200
|
0
|
201 #first match same sample, sum to a single row, same for other sample
|
|
202 #then merge rows like 'normal'
|
2
|
203
|
0
|
204 sequence.filter = grepl(paste("^", first.clone.sequence, sep=""), patient.fuzzy$Clone_Sequence)
|
2
|
205
|
|
206
|
|
207
|
0
|
208 #match.filter = merge.filter & grepl(first.clone.sequence, patient.fuzzy$Clone_Sequence) & length.filter & sample.filter
|
|
209 first.match.filter = merge.filter & sequence.filter & first.sample.filter
|
|
210 second.match.filter = merge.filter & sequence.filter & second.sample.filter
|
2
|
211
|
0
|
212 first.rows = patient.fuzzy[first.match.filter,]
|
|
213 second.rows = patient.fuzzy[second.match.filter,]
|
2
|
214
|
0
|
215 first.rows.v = table(first.rows$V_Segment_Major_Gene)
|
|
216 first.rows.v = names(first.rows.v[which.max(first.rows.v)])
|
|
217 first.rows.j = table(first.rows$J_Segment_Major_Gene)
|
|
218 first.rows.j = names(first.rows.j[which.max(first.rows.j)])
|
2
|
219
|
0
|
220 first.sum = data.frame(merge = first.clone.sequence,
|
|
221 Patient = patient,
|
|
222 Receptor = first.rows[1,"Receptor"],
|
|
223 Sample = first.rows[1,"Sample"],
|
|
224 Cell_Count = first.rows[1,"Cell_Count"],
|
|
225 Clone_Molecule_Count_From_Spikes = sum(first.rows$Clone_Molecule_Count_From_Spikes),
|
|
226 Log10_Frequency = log10(sum(first.rows$Frequency)),
|
|
227 Total_Read_Count = sum(first.rows$Total_Read_Count),
|
|
228 dsPerM = sum(first.rows$dsPerM),
|
|
229 J_Segment_Major_Gene = first.rows.j,
|
|
230 V_Segment_Major_Gene = first.rows.v,
|
|
231 Clone_Sequence = first.clone.sequence,
|
|
232 CDR3_Sense_Sequence = first.rows[1,"CDR3_Sense_Sequence"],
|
|
233 Related_to_leukemia_clone = F,
|
|
234 Frequency = sum(first.rows$Frequency),
|
|
235 locus_V = first.rows[1,"locus_V"],
|
|
236 locus_J = first.rows[1,"locus_J"],
|
|
237 min_cell_count = first.rows[1,"min_cell_count"],
|
|
238 normalized_read_count = sum(first.rows$normalized_read_count),
|
|
239 paste = first.rows[1,"paste"],
|
|
240 min_cell_paste = first.rows[1,"min_cell_paste"])
|
2
|
241
|
0
|
242 if(nrow(second.rows) > 0){
|
|
243 second.rows.v = table(second.rows$V_Segment_Major_Gene)
|
|
244 second.rows.v = names(second.rows.v[which.max(second.rows.v)])
|
|
245 second.rows.j = table(second.rows$J_Segment_Major_Gene)
|
|
246 second.rows.j = names(second.rows.j[which.max(second.rows.j)])
|
2
|
247
|
0
|
248 second.sum = data.frame(merge = first.clone.sequence,
|
2
|
249 Patient = patient,
|
|
250 Receptor = second.rows[1,"Receptor"],
|
|
251 Sample = second.rows[1,"Sample"],
|
|
252 Cell_Count = second.rows[1,"Cell_Count"],
|
|
253 Clone_Molecule_Count_From_Spikes = sum(second.rows$Clone_Molecule_Count_From_Spikes),
|
|
254 Log10_Frequency = log10(sum(second.rows$Frequency)),
|
|
255 Total_Read_Count = sum(second.rows$Total_Read_Count),
|
|
256 dsPerM = sum(second.rows$dsPerM),
|
|
257 J_Segment_Major_Gene = second.rows.j,
|
|
258 V_Segment_Major_Gene = second.rows.v,
|
|
259 Clone_Sequence = first.clone.sequence,
|
|
260 CDR3_Sense_Sequence = second.rows[1,"CDR3_Sense_Sequence"],
|
|
261 Related_to_leukemia_clone = F,
|
|
262 Frequency = sum(second.rows$Frequency),
|
|
263 locus_V = second.rows[1,"locus_V"],
|
|
264 locus_J = second.rows[1,"locus_J"],
|
|
265 min_cell_count = second.rows[1,"min_cell_count"],
|
|
266 normalized_read_count = sum(second.rows$normalized_read_count),
|
|
267 paste = second.rows[1,"paste"],
|
|
268 min_cell_paste = second.rows[1,"min_cell_paste"])
|
|
269
|
|
270 #print(names(patientMerge))
|
|
271 #print(merge(first.sum, second.sum, by="merge"))
|
0
|
272 patientMerge = rbind(patientMerge, merge(first.sum, second.sum, by="merge"))
|
2
|
273 #print("test2")
|
0
|
274 patient.fuzzy = patient.fuzzy[!(first.match.filter | second.match.filter),]
|
2
|
275
|
0
|
276 hidden.clone.sequences = c(first.rows[-1,"Clone_Sequence"], second.rows[second.rows$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
|
|
277 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
2
|
278
|
0
|
279 tmp.rows = rbind(first.rows, second.rows)
|
2
|
280 #print("test3")
|
0
|
281 tmp.rows = tmp.rows[order(nchar(tmp.rows$Clone_Sequence)),]
|
|
282
|
|
283
|
|
284 #add to the scatterplot data
|
|
285 scatterplot.row = first.sum[,scatterplot_data_columns]
|
2
|
286 scatterplot.row$type = paste(first.sum[,"Sample"], "In Both")
|
|
287 scatterplot.row$link = link.count
|
|
288 scatterplot.row$on = onShort
|
|
289
|
|
290 scatterplot_data = rbind(scatterplot_data, scatterplot.row)
|
0
|
291
|
|
292 scatterplot.row = second.sum[,scatterplot_data_columns]
|
2
|
293 scatterplot.row$type = paste(second.sum[,"Sample"], "In Both")
|
|
294 scatterplot.row$link = link.count
|
|
295 scatterplot.row$on = onShort
|
|
296
|
|
297 scatterplot_data = rbind(scatterplot_data, scatterplot.row)
|
|
298
|
|
299 #write some information about the match to a log file
|
0
|
300 if (nrow(first.rows) > 1 | nrow(second.rows) > 1) {
|
|
301 cat(paste("<tr><td>", patient, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="multiple_matches.html", append=T)
|
|
302 } else {
|
|
303 second.clone.sequence = second.rows[1,"Clone_Sequence"]
|
|
304 if(nchar(first.clone.sequence) != nchar(second.clone.sequence)){
|
|
305 cat(paste("<tr bgcolor='#DDD'><td>", patient, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="single_matches.html", append=T)
|
|
306 } else {
|
|
307 #cat(paste("<tr><td>", patient, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="single_matches.html", append=T)
|
|
308 }
|
|
309 }
|
2
|
310
|
0
|
311 } else if(nrow(first.rows) > 1) {
|
|
312 if(patient1[1,"Sample"] == first.sample){
|
|
313 patient1 = patient1[!(patient1$Clone_Sequence %in% first.rows$Clone_Sequence),]
|
|
314 patient1 = rbind(patient1, first.sum)
|
|
315 } else {
|
|
316 patient2 = patient2[!(patient2$Clone_Sequence %in% first.rows$Clone_Sequence),]
|
|
317 patient2 = rbind(patient2, first.sum)
|
|
318 }
|
2
|
319
|
0
|
320 hidden.clone.sequences = c(first.rows[-1,"Clone_Sequence"])
|
|
321 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
2
|
322
|
0
|
323 patient.fuzzy = patient.fuzzy[-first.match.filter,]
|
|
324
|
|
325 #add to the scatterplot data
|
|
326 scatterplot.row = first.sum[,scatterplot_data_columns]
|
2
|
327 scatterplot.row$type = first.sum[,"Sample"]
|
|
328 scatterplot.row$link = link.count
|
|
329 scatterplot.row$on = onShort
|
|
330
|
|
331 scatterplot_data = rbind(scatterplot_data, scatterplot.row)
|
|
332
|
0
|
333 cat(paste("<tr bgcolor='#DDF'><td>", patient, " row ", 1:nrow(first.rows), "</td><td>", first.rows$Sample, ":</td><td>", first.rows$Clone_Sequence, "</td><td>", first.rows$normalized_read_count, "</td></tr>", sep=""), file="single_matches.html", append=T)
|
|
334 } else {
|
|
335 patient.fuzzy = patient.fuzzy[-1,]
|
|
336
|
|
337 #add to the scatterplot data
|
|
338 scatterplot.row = first.sum[,scatterplot_data_columns]
|
2
|
339 scatterplot.row$type = first.sum[,"Sample"]
|
|
340 scatterplot.row$link = link.count
|
|
341 scatterplot.row$on = onShort
|
|
342
|
|
343 scatterplot_data = rbind(scatterplot_data, scatterplot.row)
|
0
|
344 }
|
|
345 link.count = link.count + 1
|
|
346 }
|
|
347 patient.merge.list[[patient]] <<- patientMerge
|
|
348 patient.merge.list.second[[patient]] <<- merge.list[["second"]]
|
|
349
|
|
350 sample.order = data.frame(type = c(oneSample, twoSample, paste(c(oneSample, twoSample), "In Both")),type.order = 1:4)
|
|
351 scatterplot_data = merge(scatterplot_data, sample.order, by="type")
|
|
352
|
|
353 scatter_locus_data_list[[patient]] <<- scatterplot_data
|
|
354 cat(paste("<td>", nrow(patient1), " in ", oneSample, " and ", nrow(patient2), " in ", twoSample, ", ", nrow(patientMerge), " in both (finding both took ", (proc.time() - start.time)[[3]], "s)</td></tr>", sep=""), file=logfile, append=T)
|
|
355 }
|
2
|
356
|
0
|
357 patient1 = patient1[!(patient1$Clone_Sequence %in% patient.merge.list.second[[patient]]),]
|
|
358 patient2 = patient2[!(patient2$Clone_Sequence %in% patient.merge.list.second[[patient]]),]
|
2
|
359
|
0
|
360
|
|
361 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony])
|
|
362 #patientMerge$thresholdValue = pmin(patientMerge[,onx], patientMerge[,ony])
|
|
363 res1 = vector()
|
|
364 res2 = vector()
|
|
365 resBoth = vector()
|
|
366 read1Count = vector()
|
|
367 read2Count = vector()
|
|
368 locussum1 = vector()
|
|
369 locussum2 = vector()
|
|
370
|
|
371 #for(iter in 1){
|
|
372 for(iter in 1:length(product[,1])){
|
2
|
373 threshhold = product[iter,"interval"]
|
|
374 V_Segment = paste(".*", as.character(product[iter,"V_Segments"]), ".*", sep="")
|
|
375 J_Segment = paste(".*", as.character(product[iter,"J_Segments"]), ".*", sep="")
|
0
|
376 #both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,onx] > threshhold & patientMerge[,ony] > threshhold) #both higher than threshold
|
|
377 both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold) #highest of both is higher than threshold
|
|
378 one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$merge %in% patientMerge[both,]$merge))
|
|
379 two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$merge %in% patientMerge[both,]$merge))
|
|
380 read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count))
|
|
381 read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count))
|
|
382 res1 = append(res1, sum(one))
|
|
383 res2 = append(res2, sum(two))
|
|
384 resBoth = append(resBoth, sum(both))
|
|
385 locussum1 = append(locussum1, sum(patient1[(grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene)),]$normalized_read_count))
|
|
386 locussum2 = append(locussum2, sum(patient2[(grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene)),]$normalized_read_count))
|
|
387 #threshhold = 0
|
2
|
388 if(threshhold != 0 | T){
|
0
|
389 if(sum(one) > 0){
|
|
390 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
391 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone Sequence", "Related_to_leukemia_clone")
|
2
|
392 filenameOne = paste(oneSample, "_", product[iter, "Titles"], "_", threshhold, sep="")
|
0
|
393 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
394 }
|
|
395 if(sum(two) > 0){
|
|
396 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
397 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone Sequence", "Related_to_leukemia_clone")
|
2
|
398 filenameTwo = paste(twoSample, "_", product[iter, "Titles"], "_", threshhold, sep="")
|
0
|
399 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
400 }
|
|
401 } else {
|
|
402 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
|
|
403 if(nrow(scatterplot_locus_data) > 0){
|
2
|
404 scatterplot_locus_data$Rearrangement = product[iter, "Titles"]
|
0
|
405 }
|
|
406
|
|
407
|
2
|
408
|
0
|
409 p = NULL
|
2
|
410 #print(paste("nrow scatterplot_locus_data", nrow(scatterplot_locus_data)))
|
0
|
411 if(nrow(scatterplot_locus_data) != 0){
|
|
412 if(on == "normalized_read_count"){
|
2
|
413 write.table(scatterplot_locus_data, file=paste(oneSample, twoSample, product[iter, "Titles"], "scatterplot_locus_data.txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
0
|
414 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
|
415 p = ggplot(scatterplot_locus_data, aes(factor(reorder(type, type.order)), normalized_read_count, group=link)) + geom_line() + scale_y_log10(breaks=scales,labels=scales, limits=c(1,1e6)) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE)
|
|
416 } else {
|
|
417 p = ggplot(scatterplot_locus_data, aes(factor(reorder(type, type.order)), Frequency, group=link)) + geom_line() + scale_y_log10(limits=c(0.0001,100), breaks=c(0.0001, 0.001, 0.01, 0.1, 1, 10, 100), labels=c("0.0001", "0.001", "0.01", "0.1", "1", "10", "100")) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE)
|
|
418 }
|
|
419 p = p + geom_point(aes(colour=type), position="dodge")
|
2
|
420 p = p + xlab("In one or both samples") + ylab(onShort) + ggtitle(paste(patient1[1,"Patient"], patient1[1,"Sample"], patient2[1,"Sample"], onShort, product[iter, "Titles"]))
|
0
|
421 } else {
|
2
|
422 p = ggplot(NULL, aes(x=c("In one", "In Both"),y=0)) + geom_blank(NULL) + xlab("In one or both of the samples") + ylab(onShort) + ggtitle(paste(patient1[1,"Patient"], patient1[1,"Sample"], patient2[1,"Sample"], onShort, product[iter, titleIndex]))
|
0
|
423 }
|
2
|
424 png(paste(patient1[1,"Patient"], "_", patient1[1,"Sample"], "_", patient2[1,"Sample"], "_", onShort, "_", product[iter, "Titles"],"_scatter.png", sep=""))
|
0
|
425 print(p)
|
|
426 dev.off()
|
|
427 }
|
|
428 if(sum(both) > 0){
|
|
429 dfBoth = patientMerge[both,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
430 colnames(dfBoth) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample))
|
2
|
431 filenameBoth = paste(oneSample, "_", twoSample, "_", product[iter, "Titles"], "_", threshhold, sep="")
|
0
|
432 write.table(dfBoth, file=paste(filenameBoth, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
433 }
|
|
434 }
|
|
435 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "Both"=resBoth, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "Sum"=res1 + res2 + resBoth, "percentage" = round((resBoth/(res1 + res2 + resBoth)) * 100, digits=2), "Locus_sum1"=locussum1, "Locus_sum2"=locussum2)
|
|
436 if(sum(is.na(patientResult$percentage)) > 0){
|
|
437 patientResult[is.na(patientResult$percentage),]$percentage = 0
|
|
438 }
|
|
439 colnames(patientResult)[6] = oneSample
|
|
440 colnames(patientResult)[8] = twoSample
|
|
441 colnamesBak = colnames(patientResult)
|
|
442 colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", paste("Number of sequences ", patient, "_Both", sep=""), paste("Number of sequences", oneSample, sep=""), paste("Normalized Read Count", oneSample), paste("Number of sequences", twoSample, sep=""), paste("Normalized Read Count", twoSample), paste("Sum number of sequences", patient), paste("Percentage of sequences ", patient, "_Both", sep=""), paste("Locus Sum", oneSample), paste("Locus Sum", twoSample))
|
|
443 write.table(patientResult, file=paste(patient, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
444 colnames(patientResult) = colnamesBak
|
|
445
|
|
446 patientResult$Locus = factor(patientResult$Locus, Titles)
|
|
447 patientResult$cut_off_value = factor(patientResult$cut_off_value, paste(">", interval, sep=""))
|
|
448
|
|
449 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "Both")])
|
|
450 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=Both), stat='identity', position="dodge", fill="#79c36a")
|
|
451 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
452 plt = plt + geom_text(aes(ymax=max(Both), x=cut_off_value,y=Both,label=Both), angle=90, hjust=0)
|
|
453 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in both")
|
|
454 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
455 png(paste(patient, "_", onShort, ".png", sep=""), width=1920, height=1080)
|
|
456 print(plt)
|
|
457 dev.off()
|
|
458 #(t,r,b,l)
|
|
459 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "percentage")])
|
|
460 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=percentage), stat='identity', position="dodge", fill="#79c36a")
|
|
461 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
462 plt = plt + geom_text(aes(ymax=max(percentage), x=cut_off_value,y=percentage,label=percentage), angle=90, hjust=0)
|
|
463 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("% clones in both left and right")
|
|
464 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
465 png(paste(patient, "_percent_", onShort, ".png", sep=""), width=1920, height=1080)
|
|
466 print(plt)
|
|
467 dev.off()
|
|
468
|
|
469 patientResult = melt(patientResult[,c('Locus','cut_off_value', oneSample, twoSample)] ,id.vars=1:2)
|
|
470 patientResult$relativeValue = patientResult$value * 10
|
|
471 patientResult[patientResult$relativeValue == 0,]$relativeValue = 1
|
|
472 plt = ggplot(patientResult)
|
|
473 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=relativeValue, fill=variable), stat='identity', position="dodge")
|
|
474 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
475 plt = plt + scale_y_continuous(trans="log", breaks=10^c(0:10), labels=c(0, 10^c(0:9)))
|
|
476 plt = plt + geom_text(data=patientResult[patientResult$variable == oneSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=-0.2)
|
|
477 plt = plt + geom_text(data=patientResult[patientResult$variable == twoSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=0.8)
|
|
478 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle(paste("Number of clones in only ", oneSample, " and only ", twoSample, sep=""))
|
|
479 png(paste(patient, "_", onShort, "_both.png", sep=""), width=1920, height=1080)
|
|
480 print(plt)
|
|
481 dev.off()
|
|
482 }
|
2
|
483
|
1
|
484 if(length(patients) > 0){
|
|
485 cat("<tr><td>Starting Frequency analysis</td></tr>", file=logfile, append=T)
|
0
|
486
|
1
|
487 interval = intervalFreq
|
|
488 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
489 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
2
|
490 for (current_patient in patients){
|
|
491 print(paste("Started working", unique(current_patient$Patient), "Frequency analysis"))
|
|
492 patientCountOnColumn(current_patient, product=product, interval=interval, on="Frequency", appendtxt=T)
|
|
493 }
|
0
|
494
|
1
|
495 cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T)
|
0
|
496
|
1
|
497 interval = intervalReads
|
|
498 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
499 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
2
|
500 for (current_patient in patients){
|
|
501 print(paste("Started working on ", unique(current_patient$Patient), "Read Count analysis"))
|
|
502 patientCountOnColumn(current_patient, product=product, interval=interval, on="normalized_read_count")
|
|
503 }
|
1
|
504 }
|
0
|
505 if(nrow(single_patients) > 0){
|
|
506 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
|
507 p = ggplot(single_patients, aes(Rearrangement, normalized_read_count)) + scale_y_log10(breaks=scales,labels=as.character(scales)) + expand_limits(y=c(0,1000000))
|
|
508 p = p + geom_point(aes(colour=type), position="jitter")
|
|
509 p = p + xlab("In one or both samples") + ylab("Reads")
|
|
510 p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the reads of the patients with a single sample")
|
|
511 png("singles_reads_scatterplot.png", width=640 * length(unique(single_patients$Patient)) + 100, height=1080)
|
|
512 print(p)
|
|
513 dev.off()
|
|
514
|
|
515 #p = ggplot(single_patients, aes(Rearrangement, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
|
|
516 p = ggplot(single_patients, aes(Rearrangement, Frequency)) + scale_y_log10(limits=c(0.0001,100), breaks=c(0.0001, 0.001, 0.01, 0.1, 1, 10, 100), labels=c("0.0001", "0.001", "0.01", "0.1", "1", "10", "100")) + expand_limits(y=c(0,100))
|
|
517 p = p + geom_point(aes(colour=type), position="jitter")
|
|
518 p = p + xlab("In one or both samples") + ylab("Frequency")
|
|
519 p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the frequency of the patients with a single sample")
|
|
520 png("singles_freq_scatterplot.png", width=640 * length(unique(single_patients$Patient)) + 100, height=1080)
|
|
521 print(p)
|
|
522 dev.off()
|
|
523 } else {
|
|
524 empty <- data.frame()
|
|
525 p = ggplot(empty) + geom_point() + xlim(0, 10) + ylim(0, 100) + xlab("In one or both samples") + ylab("Frequency") + ggtitle("Scatterplot of the frequency of the patients with a single sample")
|
|
526
|
|
527 png("singles_reads_scatterplot.png", width=400, height=300)
|
|
528 print(p)
|
|
529 dev.off()
|
|
530
|
|
531 png("singles_freq_scatterplot.png", width=400, height=300)
|
|
532 print(p)
|
|
533 dev.off()
|
|
534 }
|
|
535
|
|
536 patient.merge.list = list() #cache the 'both' table, 2x speedup for more memory...
|
|
537 patient.merge.list.second = list()
|
|
538
|
|
539 tripletAnalysis <- function(patient1, label1, patient2, label2, patient3, label3, product, interval, on, appendTriplets= FALSE){
|
1
|
540 onShort = "reads"
|
|
541 if(on == "Frequency"){
|
|
542 onShort = "freq"
|
|
543 }
|
|
544 onx = paste(on, ".x", sep="")
|
|
545 ony = paste(on, ".y", sep="")
|
|
546 onz = paste(on, ".z", sep="")
|
|
547 type="triplet"
|
0
|
548
|
1
|
549 threshholdIndex = which(colnames(product) == "interval")
|
|
550 V_SegmentIndex = which(colnames(product) == "V_Segments")
|
|
551 J_SegmentIndex = which(colnames(product) == "J_Segments")
|
|
552 titleIndex = which(colnames(product) == "Titles")
|
|
553 sampleIndex = which(colnames(patient1) == "Sample")
|
|
554 patientIndex = which(colnames(patient1) == "Patient")
|
|
555 oneSample = paste(patient1[1,sampleIndex], sep="")
|
|
556 twoSample = paste(patient2[1,sampleIndex], sep="")
|
|
557 threeSample = paste(patient3[1,sampleIndex], sep="")
|
|
558
|
|
559 if(mergeOn == "Clone_Sequence"){
|
|
560 patient1$merge = paste(patient1$Clone_Sequence)
|
0
|
561 patient2$merge = paste(patient2$Clone_Sequence)
|
|
562 patient3$merge = paste(patient3$Clone_Sequence)
|
|
563
|
1
|
564 } else {
|
0
|
565 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
|
|
566 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
|
|
567 patient3$merge = paste(patient3$V_Segment_Major_Gene, patient3$J_Segment_Major_Gene, patient3$CDR3_Sense_Sequence)
|
1
|
568 }
|
0
|
569
|
1
|
570 #patientMerge = merge(patient1, patient2, by="merge")[NULL,]
|
|
571 patient1.fuzzy = patient1
|
|
572 patient2.fuzzy = patient2
|
|
573 patient3.fuzzy = patient3
|
0
|
574
|
1
|
575 cat(paste("<tr><td>", label1, "</td>", sep=""), file=logfile, append=T)
|
0
|
576
|
1
|
577 patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J)
|
|
578 patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J)
|
|
579 patient3.fuzzy$merge = paste(patient3.fuzzy$locus_V, patient3.fuzzy$locus_J)
|
0
|
580
|
1
|
581 patient.fuzzy = rbind(patient1.fuzzy ,patient2.fuzzy, patient3.fuzzy)
|
|
582 patient.fuzzy = patient.fuzzy[order(nchar(patient.fuzzy$Clone_Sequence)),]
|
0
|
583
|
1
|
584 other.sample.list = list()
|
|
585 other.sample.list[[oneSample]] = c(twoSample, threeSample)
|
|
586 other.sample.list[[twoSample]] = c(oneSample, threeSample)
|
|
587 other.sample.list[[threeSample]] = c(oneSample, twoSample)
|
0
|
588
|
1
|
589 patientMerge = merge(patient1, patient2, by="merge")
|
|
590 patientMerge = merge(patientMerge, patient3, by="merge")
|
|
591 colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge)))] = paste(colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge), perl=T))], ".z", sep="")
|
|
592 #patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
|
593 patientMerge = patientMerge[NULL,]
|
0
|
594
|
1
|
595 duo.merge.list = list()
|
0
|
596
|
1
|
597 patientMerge12 = merge(patient1, patient2, by="merge")
|
|
598 #patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony])
|
|
599 patientMerge12 = patientMerge12[NULL,]
|
|
600 duo.merge.list[[paste(oneSample, twoSample)]] = patientMerge12
|
|
601 duo.merge.list[[paste(twoSample, oneSample)]] = patientMerge12
|
0
|
602
|
1
|
603 patientMerge13 = merge(patient1, patient3, by="merge")
|
|
604 #patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony])
|
|
605 patientMerge13 = patientMerge13[NULL,]
|
|
606 duo.merge.list[[paste(oneSample, threeSample)]] = patientMerge13
|
|
607 duo.merge.list[[paste(threeSample, oneSample)]] = patientMerge13
|
0
|
608
|
1
|
609 patientMerge23 = merge(patient2, patient3, by="merge")
|
|
610 #patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony])
|
|
611 patientMerge23 = patientMerge23[NULL,]
|
|
612 duo.merge.list[[paste(twoSample, threeSample)]] = patientMerge23
|
|
613 duo.merge.list[[paste(threeSample, twoSample)]] = patientMerge23
|
0
|
614
|
1
|
615 merge.list = list()
|
|
616 merge.list[["second"]] = vector()
|
|
617
|
|
618 #print(paste(nrow(patient1), nrow(patient2), nrow(patient3), label1, label2, label3))
|
|
619
|
|
620 start.time = proc.time()
|
|
621 if(paste(label1, "123") %in% names(patient.merge.list)){
|
|
622 patientMerge = patient.merge.list[[paste(label1, "123")]]
|
|
623 patientMerge12 = patient.merge.list[[paste(label1, "12")]]
|
|
624 patientMerge13 = patient.merge.list[[paste(label1, "13")]]
|
|
625 patientMerge23 = patient.merge.list[[paste(label1, "23")]]
|
0
|
626
|
1
|
627 #merge.list[["second"]] = patient.merge.list.second[[label1]]
|
0
|
628
|
1
|
629 cat(paste("<td>", nrow(patient1), " in ", label1, " and ", nrow(patient2), " in ", label2, nrow(patient3), " in ", label3, ", ", nrow(patientMerge), " in both (fetched from cache)</td></tr>", sep=""), file=logfile, append=T)
|
|
630 } else {
|
|
631 while(nrow(patient.fuzzy) > 0){
|
|
632 first.merge = patient.fuzzy[1,"merge"]
|
|
633 first.clone.sequence = patient.fuzzy[1,"Clone_Sequence"]
|
|
634 first.sample = paste(patient.fuzzy[1,"Sample"], sep="")
|
|
635
|
|
636 merge.filter = first.merge == patient.fuzzy$merge
|
|
637
|
|
638 second.sample = other.sample.list[[first.sample]][1]
|
|
639 third.sample = other.sample.list[[first.sample]][2]
|
0
|
640
|
1
|
641 sample.filter.1 = first.sample == patient.fuzzy$Sample
|
|
642 sample.filter.2 = second.sample == patient.fuzzy$Sample
|
|
643 sample.filter.3 = third.sample == patient.fuzzy$Sample
|
0
|
644
|
1
|
645 sequence.filter = grepl(paste("^", first.clone.sequence, sep=""), patient.fuzzy$Clone_Sequence)
|
0
|
646
|
1
|
647 match.filter.1 = sample.filter.1 & sequence.filter & merge.filter
|
|
648 match.filter.2 = sample.filter.2 & sequence.filter & merge.filter
|
|
649 match.filter.3 = sample.filter.3 & sequence.filter & merge.filter
|
0
|
650
|
1
|
651 matches.in.1 = any(match.filter.1)
|
|
652 matches.in.2 = any(match.filter.2)
|
|
653 matches.in.3 = any(match.filter.3)
|
0
|
654
|
1
|
655 rows.1 = patient.fuzzy[match.filter.1,]
|
0
|
656
|
1
|
657 sum.1 = data.frame(merge = first.clone.sequence,
|
|
658 Patient = label1,
|
|
659 Receptor = rows.1[1,"Receptor"],
|
|
660 Sample = rows.1[1,"Sample"],
|
|
661 Cell_Count = rows.1[1,"Cell_Count"],
|
|
662 Clone_Molecule_Count_From_Spikes = sum(rows.1$Clone_Molecule_Count_From_Spikes),
|
|
663 Log10_Frequency = log10(sum(rows.1$Frequency)),
|
|
664 Total_Read_Count = sum(rows.1$Total_Read_Count),
|
|
665 dsPerM = sum(rows.1$dsPerM),
|
|
666 J_Segment_Major_Gene = rows.1[1,"J_Segment_Major_Gene"],
|
|
667 V_Segment_Major_Gene = rows.1[1,"V_Segment_Major_Gene"],
|
|
668 Clone_Sequence = first.clone.sequence,
|
|
669 CDR3_Sense_Sequence = rows.1[1,"CDR3_Sense_Sequence"],
|
|
670 Related_to_leukemia_clone = F,
|
|
671 Frequency = sum(rows.1$Frequency),
|
|
672 locus_V = rows.1[1,"locus_V"],
|
|
673 locus_J = rows.1[1,"locus_J"],
|
|
674 uniqueID = rows.1[1,"uniqueID"],
|
|
675 normalized_read_count = sum(rows.1$normalized_read_count))
|
|
676 sum.2 = sum.1[NULL,]
|
|
677 rows.2 = patient.fuzzy[match.filter.2,]
|
|
678 if(matches.in.2){
|
|
679 sum.2 = data.frame(merge = first.clone.sequence,
|
|
680 Patient = label1,
|
|
681 Receptor = rows.2[1,"Receptor"],
|
|
682 Sample = rows.2[1,"Sample"],
|
|
683 Cell_Count = rows.2[1,"Cell_Count"],
|
|
684 Clone_Molecule_Count_From_Spikes = sum(rows.2$Clone_Molecule_Count_From_Spikes),
|
|
685 Log10_Frequency = log10(sum(rows.2$Frequency)),
|
|
686 Total_Read_Count = sum(rows.2$Total_Read_Count),
|
|
687 dsPerM = sum(rows.2$dsPerM),
|
|
688 J_Segment_Major_Gene = rows.2[1,"J_Segment_Major_Gene"],
|
|
689 V_Segment_Major_Gene = rows.2[1,"V_Segment_Major_Gene"],
|
|
690 Clone_Sequence = first.clone.sequence,
|
|
691 CDR3_Sense_Sequence = rows.2[1,"CDR3_Sense_Sequence"],
|
|
692 Related_to_leukemia_clone = F,
|
|
693 Frequency = sum(rows.2$Frequency),
|
|
694 locus_V = rows.2[1,"locus_V"],
|
|
695 locus_J = rows.2[1,"locus_J"],
|
|
696 uniqueID = rows.2[1,"uniqueID"],
|
|
697 normalized_read_count = sum(rows.2$normalized_read_count))
|
|
698 }
|
0
|
699
|
1
|
700 sum.3 = sum.1[NULL,]
|
|
701 rows.3 = patient.fuzzy[match.filter.3,]
|
|
702 if(matches.in.3){
|
|
703 sum.3 = data.frame(merge = first.clone.sequence,
|
|
704 Patient = label1,
|
|
705 Receptor = rows.3[1,"Receptor"],
|
|
706 Sample = rows.3[1,"Sample"],
|
|
707 Cell_Count = rows.3[1,"Cell_Count"],
|
|
708 Clone_Molecule_Count_From_Spikes = sum(rows.3$Clone_Molecule_Count_From_Spikes),
|
|
709 Log10_Frequency = log10(sum(rows.3$Frequency)),
|
|
710 Total_Read_Count = sum(rows.3$Total_Read_Count),
|
|
711 dsPerM = sum(rows.3$dsPerM),
|
|
712 J_Segment_Major_Gene = rows.3[1,"J_Segment_Major_Gene"],
|
|
713 V_Segment_Major_Gene = rows.3[1,"V_Segment_Major_Gene"],
|
|
714 Clone_Sequence = first.clone.sequence,
|
|
715 CDR3_Sense_Sequence = rows.3[1,"CDR3_Sense_Sequence"],
|
|
716 Related_to_leukemia_clone = F,
|
|
717 Frequency = sum(rows.3$Frequency),
|
|
718 locus_V = rows.3[1,"locus_V"],
|
|
719 locus_J = rows.3[1,"locus_J"],
|
|
720 uniqueID = rows.3[1,"uniqueID"],
|
|
721 normalized_read_count = sum(rows.3$normalized_read_count))
|
|
722 }
|
0
|
723
|
1
|
724 if(matches.in.2 & matches.in.3){
|
|
725 merge.123 = merge(sum.1, sum.2, by="merge")
|
|
726 merge.123 = merge(merge.123, sum.3, by="merge")
|
|
727 colnames(merge.123)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(merge.123)))] = paste(colnames(merge.123)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(merge.123), perl=T))], ".z", sep="")
|
|
728 #merge.123$thresholdValue = pmax(merge.123[,onx], merge.123[,ony], merge.123[,onz])
|
0
|
729
|
1
|
730 patientMerge = rbind(patientMerge, merge.123)
|
|
731 patient.fuzzy = patient.fuzzy[!(match.filter.1 | match.filter.2 | match.filter.3),]
|
0
|
732
|
1
|
733 hidden.clone.sequences = c(rows.1[-1,"Clone_Sequence"], rows.2[rows.2$Clone_Sequence != first.clone.sequence,"Clone_Sequence"], rows.3[rows.3$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
|
|
734 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
0
|
735
|
1
|
736 } else if (matches.in.2) {
|
|
737 #other.sample1 = other.sample.list[[first.sample]][1]
|
|
738 #other.sample2 = other.sample.list[[first.sample]][2]
|
0
|
739
|
1
|
740 second.sample = sum.2[,"Sample"]
|
0
|
741
|
1
|
742 current.merge.list = duo.merge.list[[paste(first.sample, second.sample)]]
|
0
|
743
|
1
|
744 merge.12 = merge(sum.1, sum.2, by="merge")
|
0
|
745
|
1
|
746 current.merge.list = rbind(current.merge.list, merge.12)
|
|
747 duo.merge.list[[paste(first.sample, second.sample)]] = current.merge.list
|
0
|
748
|
1
|
749 patient.fuzzy = patient.fuzzy[!(match.filter.1 | match.filter.2),]
|
0
|
750
|
1
|
751 hidden.clone.sequences = c(rows.1[-1,"Clone_Sequence"], rows.2[rows.2$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
|
|
752 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
0
|
753
|
1
|
754 } else if (matches.in.3) {
|
0
|
755
|
1
|
756 #other.sample1 = other.sample.list[[first.sample]][1]
|
|
757 #other.sample2 = other.sample.list[[first.sample]][2]
|
0
|
758
|
1
|
759 second.sample = sum.3[,"Sample"]
|
0
|
760
|
1
|
761 current.merge.list = duo.merge.list[[paste(first.sample, second.sample)]]
|
0
|
762
|
1
|
763 merge.13 = merge(sum.1, sum.3, by="merge")
|
0
|
764
|
1
|
765 current.merge.list = rbind(current.merge.list, merge.13)
|
|
766 duo.merge.list[[paste(first.sample, second.sample)]] = current.merge.list
|
0
|
767
|
1
|
768 patient.fuzzy = patient.fuzzy[!(match.filter.1 | match.filter.3),]
|
0
|
769
|
1
|
770 hidden.clone.sequences = c(rows.1[-1,"Clone_Sequence"], rows.3[rows.3$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
|
|
771 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
0
|
772
|
1
|
773 } else if(nrow(rows.1) > 1){
|
|
774 patient1 = patient1[!(patient1$Clone_Sequence %in% rows.1$Clone_Sequence),]
|
|
775 print(names(patient1)[names(patient1) %in% sum.1])
|
|
776 print(names(patient1)[!(names(patient1) %in% sum.1)])
|
|
777 print(names(patient1))
|
|
778 print(names(sum.1))
|
|
779 print(summary(sum.1))
|
|
780 print(summary(patient1))
|
|
781 print(dim(sum.1))
|
|
782 print(dim(patient1))
|
|
783 print(head(sum.1[,names(patient1)]))
|
|
784 patient1 = rbind(patient1, sum.1[,names(patient1)])
|
|
785 patient.fuzzy = patient.fuzzy[-match.filter.1,]
|
|
786 } else {
|
|
787 patient.fuzzy = patient.fuzzy[-1,]
|
|
788 }
|
|
789
|
|
790 tmp.rows = rbind(rows.1, rows.2, rows.3)
|
|
791 tmp.rows = tmp.rows[order(nchar(tmp.rows$Clone_Sequence)),]
|
0
|
792
|
1
|
793 if (sum(match.filter.1) > 1 | sum(match.filter.2) > 1 | sum(match.filter.1) > 1) {
|
|
794 cat(paste("<tr><td>", label1, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="multiple_matches.html", append=T)
|
|
795 } else {
|
|
796 }
|
|
797
|
|
798 }
|
|
799 patient.merge.list[[paste(label1, "123")]] = patientMerge
|
|
800
|
|
801 patientMerge12 = duo.merge.list[[paste(oneSample, twoSample)]]
|
|
802 patientMerge13 = duo.merge.list[[paste(oneSample, threeSample)]]
|
|
803 patientMerge23 = duo.merge.list[[paste(twoSample, threeSample)]]
|
0
|
804
|
1
|
805 patient.merge.list[[paste(label1, "12")]] = patientMerge12
|
|
806 patient.merge.list[[paste(label1, "13")]] = patientMerge13
|
|
807 patient.merge.list[[paste(label1, "23")]] = patientMerge23
|
|
808
|
|
809 #patient.merge.list.second[[label1]] = merge.list[["second"]]
|
|
810 }
|
|
811 cat(paste("<td>", nrow(patient1), " in ", label1, " and ", nrow(patient2), " in ", label2, nrow(patient3), " in ", label3, ", ", nrow(patientMerge), " in both (finding both took ", (proc.time() - start.time)[[3]], "s)</td></tr>", sep=""), file=logfile, append=T)
|
|
812 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
|
813 patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony])
|
|
814 patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony])
|
|
815 patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony])
|
|
816
|
|
817 #patientMerge$thresholdValue = pmin(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
|
818 #patientMerge12$thresholdValue = pmin(patientMerge12[,onx], patientMerge12[,ony])
|
|
819 #patientMerge13$thresholdValue = pmin(patientMerge13[,onx], patientMerge13[,ony])
|
|
820 #patientMerge23$thresholdValue = pmin(patientMerge23[,onx], patientMerge23[,ony])
|
0
|
821
|
1
|
822 patient1 = patient1[!(patient1$Clone_Sequence %in% merge.list[["second"]]),]
|
|
823 patient2 = patient2[!(patient2$Clone_Sequence %in% merge.list[["second"]]),]
|
|
824 patient3 = patient3[!(patient3$Clone_Sequence %in% merge.list[["second"]]),]
|
0
|
825
|
1
|
826 if(F){
|
|
827 patientMerge = merge(patient1, patient2, by="merge")
|
|
828 patientMerge = merge(patientMerge, patient3, by="merge")
|
|
829 colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge)))] = paste(colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge), perl=T))], ".z", sep="")
|
|
830 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
|
831 patientMerge12 = merge(patient1, patient2, by="merge")
|
|
832 patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony])
|
|
833 patientMerge13 = merge(patient1, patient3, by="merge")
|
|
834 patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony])
|
|
835 patientMerge23 = merge(patient2, patient3, by="merge")
|
|
836 patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony])
|
|
837 }
|
0
|
838
|
1
|
839 scatterplot_data_columns = c("Clone_Sequence", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge")
|
|
840 scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns], patient3[,scatterplot_data_columns])
|
|
841 scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),]
|
|
842
|
|
843 scatterplot_data$type = factor(x="In one", levels=c("In one", "In two", "In three", "In multiple"))
|
0
|
844
|
1
|
845 res1 = vector()
|
|
846 res2 = vector()
|
|
847 res3 = vector()
|
|
848 res12 = vector()
|
|
849 res13 = vector()
|
|
850 res23 = vector()
|
|
851 resAll = vector()
|
|
852 read1Count = vector()
|
|
853 read2Count = vector()
|
|
854 read3Count = vector()
|
0
|
855
|
1
|
856 if(appendTriplets){
|
|
857 cat(paste(label1, label2, label3, sep="\t"), file="triplets.txt", append=T, sep="", fill=3)
|
|
858 }
|
|
859 for(iter in 1:length(product[,1])){
|
|
860 threshhold = product[iter,threshholdIndex]
|
|
861 V_Segment = paste(".*", as.character(product[iter,V_SegmentIndex]), ".*", sep="")
|
|
862 J_Segment = paste(".*", as.character(product[iter,J_SegmentIndex]), ".*", sep="")
|
|
863 #all = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,onx] > threshhold & patientMerge[,ony] > threshhold & patientMerge[,onz] > threshhold)
|
|
864 all = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold)
|
0
|
865
|
1
|
866 one_two = (grepl(V_Segment, patientMerge12$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge12$J_Segment_Major_Gene.x) & patientMerge12$thresholdValue > threshhold & !(patientMerge12$merge %in% patientMerge[all,]$merge))
|
|
867 one_three = (grepl(V_Segment, patientMerge13$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge13$J_Segment_Major_Gene.x) & patientMerge13$thresholdValue > threshhold & !(patientMerge13$merge %in% patientMerge[all,]$merge))
|
|
868 two_three = (grepl(V_Segment, patientMerge23$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge23$J_Segment_Major_Gene.x) & patientMerge23$thresholdValue > threshhold & !(patientMerge23$merge %in% patientMerge[all,]$merge))
|
|
869
|
|
870 one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$merge %in% patientMerge[all,]$merge) & !(patient1$merge %in% patientMerge12[one_two,]$merge) & !(patient1$merge %in% patientMerge13[one_three,]$merge))
|
|
871 two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$merge %in% patientMerge[all,]$merge) & !(patient2$merge %in% patientMerge12[one_two,]$merge) & !(patient2$merge %in% patientMerge23[two_three,]$merge))
|
|
872 three = (grepl(V_Segment, patient3$V_Segment_Major_Gene) & grepl(J_Segment, patient3$J_Segment_Major_Gene) & patient3[,on] > threshhold & !(patient3$merge %in% patientMerge[all,]$merge) & !(patient3$merge %in% patientMerge13[one_three,]$merge) & !(patient3$merge %in% patientMerge23[two_three,]$merge))
|
0
|
873
|
1
|
874 read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.x))
|
|
875 read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.y))
|
|
876 read3Count = append(read3Count, sum(patient3[three,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.z))
|
|
877 res1 = append(res1, sum(one))
|
|
878 res2 = append(res2, sum(two))
|
|
879 res3 = append(res3, sum(three))
|
|
880 resAll = append(resAll, sum(all))
|
|
881 res12 = append(res12, sum(one_two))
|
|
882 res13 = append(res13, sum(one_three))
|
|
883 res23 = append(res23, sum(two_three))
|
|
884 #threshhold = 0
|
|
885 if(threshhold != 0){
|
|
886 if(sum(one) > 0){
|
|
887 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
888 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
|
889 filenameOne = paste(label1, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
890 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
891 }
|
|
892 if(sum(two) > 0){
|
|
893 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
894 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
|
895 filenameTwo = paste(label2, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
896 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
897 }
|
|
898 if(sum(three) > 0){
|
|
899 dfThree = patient3[three,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
900 colnames(dfThree) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
|
901 filenameThree = paste(label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
902 write.table(dfThree, file=paste(filenameThree, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
903 }
|
|
904 if(sum(one_two) > 0){
|
|
905 dfOne_two = patientMerge12[one_two,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
906 colnames(dfOne_two) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample))
|
|
907 filenameOne_two = paste(label1, "_", label2, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
908 write.table(dfOne_two, file=paste(filenameOne_two, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
909 }
|
|
910 if(sum(one_three) > 0){
|
|
911 dfOne_three = patientMerge13[one_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
912 colnames(dfOne_three) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
|
913 filenameOne_three = paste(label1, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
914 write.table(dfOne_three, file=paste(filenameOne_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
915 }
|
|
916 if(sum(two_three) > 0){
|
|
917 dfTwo_three = patientMerge23[two_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
918 colnames(dfTwo_three) = c(paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
|
919 filenameTwo_three = paste(label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
920 write.table(dfTwo_three, file=paste(filenameTwo_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
921 }
|
|
922 } else { #scatterplot data
|
|
923 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
|
|
924 scatterplot_locus_data = scatterplot_locus_data[!(scatterplot_locus_data$merge %in% merge.list[["second"]]),]
|
|
925 in_two = (scatterplot_locus_data$merge %in% patientMerge12[one_two,]$merge) | (scatterplot_locus_data$merge %in% patientMerge13[one_three,]$merge) | (scatterplot_locus_data$merge %in% patientMerge23[two_three,]$merge)
|
|
926 if(sum(in_two) > 0){
|
|
927 scatterplot_locus_data[in_two,]$type = "In two"
|
|
928 }
|
|
929 in_three = (scatterplot_locus_data$merge %in% patientMerge[all,]$merge)
|
|
930 if(sum(in_three)> 0){
|
|
931 scatterplot_locus_data[in_three,]$type = "In three"
|
|
932 }
|
|
933 not_in_one = scatterplot_locus_data$type != "In one"
|
|
934 if(sum(not_in_one) > 0){
|
|
935 #scatterplot_locus_data[not_in_one,]$type = "In multiple"
|
|
936 }
|
|
937 p = NULL
|
|
938 if(nrow(scatterplot_locus_data) != 0){
|
2
|
939 filename.scatter = paste(label1, "_", label2, "_", label3, "_", product[iter, titleIndex], "_scatter_", threshhold, sep="")
|
|
940 write.table(scatterplot_locus_data, file=paste(filename.scatter, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
941 if(on == "normalized_read_count"){
|
|
942 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
|
943 p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales, limits=c(1, 1e6))
|
|
944 } else {
|
|
945 p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_log10(limits=c(0.0001,100), breaks=c(0.0001, 0.001, 0.01, 0.1, 1, 10, 100), labels=c("0.0001", "0.001", "0.01", "0.1", "1", "10", "100")) + expand_limits(y=c(0,100))
|
|
946 #p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
|
|
947 }
|
|
948 p = p + geom_point(aes(colour=type), position="jitter")
|
|
949 p = p + xlab("In one or in multiple samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex]))
|
1
|
950 } else {
|
2
|
951 p = ggplot(NULL, aes(x=c("In one", "In multiple"),y=0)) + geom_blank(NULL) + xlab("In two or in three of the samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex]))
|
1
|
952 }
|
|
953 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_", product[iter, titleIndex],"_scatter.png", sep=""))
|
|
954 print(p)
|
|
955 dev.off()
|
|
956 }
|
|
957 if(sum(all) > 0){
|
|
958 dfAll = patientMerge[all,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y", "V_Segment_Major_Gene.z", "J_Segment_Major_Gene.z", "normalized_read_count.z", "Frequency.z", "Related_to_leukemia_clone.z")]
|
|
959 colnames(dfAll) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample), paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
|
960 filenameAll = paste(label1, "_", label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
961 write.table(dfAll, file=paste(filenameAll, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
962 }
|
|
963 }
|
|
964 #patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "tmp3"=res3, "read_count3"=round(read3Count))
|
|
965 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "tmp2"=res2, "tmp3"=res3, "tmp12"=res12, "tmp13"=res13, "tmp23"=res23)
|
|
966 colnames(patientResult)[6] = oneSample
|
|
967 colnames(patientResult)[7] = twoSample
|
|
968 colnames(patientResult)[8] = threeSample
|
|
969 colnames(patientResult)[9] = paste(oneSample, twoSample, sep="_")
|
|
970 colnames(patientResult)[10] = paste(oneSample, twoSample, sep="_")
|
|
971 colnames(patientResult)[11] = paste(oneSample, twoSample, sep="_")
|
|
972
|
|
973 colnamesBak = colnames(patientResult)
|
|
974 colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", "Number of sequences All", paste("Number of sequences", oneSample), paste("Number of sequences", twoSample), paste("Number of sequences", threeSample), paste("Number of sequences", oneSample, twoSample), paste("Number of sequences", oneSample, threeSample), paste("Number of sequences", twoSample, threeSample))
|
|
975 write.table(patientResult, file=paste(label1, "_", label2, "_", label3, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
976 colnames(patientResult) = colnamesBak
|
|
977
|
|
978 patientResult$Locus = factor(patientResult$Locus, Titles)
|
|
979 patientResult$cut_off_value = factor(patientResult$cut_off_value, paste(">", interval, sep=""))
|
|
980
|
|
981 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "All")])
|
|
982 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=All), stat='identity', position="dodge", fill="#79c36a")
|
|
983 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
984 plt = plt + geom_text(aes(ymax=max(All), x=cut_off_value,y=All,label=All), angle=90, hjust=0)
|
|
985 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in All")
|
|
986 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
987 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_total_all.png", sep=""), width=1920, height=1080)
|
|
988 print(plt)
|
|
989 dev.off()
|
|
990
|
|
991 fontSize = 4
|
|
992
|
|
993 bak = patientResult
|
|
994 patientResult = melt(patientResult[,c('Locus','cut_off_value', oneSample, twoSample, threeSample)] ,id.vars=1:2)
|
|
995 patientResult$relativeValue = patientResult$value * 10
|
|
996 patientResult[patientResult$relativeValue == 0,]$relativeValue = 1
|
|
997 plt = ggplot(patientResult)
|
|
998 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=relativeValue, fill=variable), stat='identity', position="dodge")
|
|
999 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
1000 plt = plt + scale_y_continuous(trans="log", breaks=10^c(0:10), labels=c(0, 10^c(0:9)))
|
|
1001 plt = plt + geom_text(data=patientResult[patientResult$variable == oneSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=-0.7, size=fontSize)
|
|
1002 plt = plt + geom_text(data=patientResult[patientResult$variable == twoSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=0.4, size=fontSize)
|
|
1003 plt = plt + geom_text(data=patientResult[patientResult$variable == threeSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=1.5, size=fontSize)
|
|
1004 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in only one sample")
|
|
1005 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_indiv_all.png", sep=""), width=1920, height=1080)
|
|
1006 print(plt)
|
|
1007 dev.off()
|
0
|
1008 }
|
|
1009
|
|
1010 if(nrow(triplets) != 0){
|
|
1011
|
1
|
1012 cat("<tr><td>Starting triplet analysis</td></tr>", file=logfile, append=T)
|
|
1013
|
|
1014 triplets$uniqueID = paste(triplets$Patient, triplets$Sample, sep="_")
|
|
1015
|
|
1016 cat("<tr><td>Normalizing to lowest cell count within locus</td></tr>", file=logfile, append=T)
|
0
|
1017
|
1
|
1018 triplets$locus_V = substring(triplets$V_Segment_Major_Gene, 0, 4)
|
|
1019 triplets$locus_J = substring(triplets$J_Segment_Major_Gene, 0, 4)
|
|
1020 min_cell_count = data.frame(data.table(triplets)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("uniqueID", "locus_V", "locus_J")])
|
0
|
1021
|
1
|
1022 triplets$min_cell_paste = paste(triplets$uniqueID, triplets$locus_V, triplets$locus_J)
|
|
1023 min_cell_count$min_cell_paste = paste(min_cell_count$uniqueID, min_cell_count$locus_V, min_cell_count$locus_J)
|
0
|
1024
|
1
|
1025 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
|
|
1026
|
|
1027 triplets = merge(triplets, min_cell_count, by="min_cell_paste")
|
|
1028
|
|
1029 triplets$normalized_read_count = round(triplets$Clone_Molecule_Count_From_Spikes / triplets$Cell_Count * triplets$min_cell_count / 2, digits=2)
|
0
|
1030
|
1
|
1031 triplets = triplets[triplets$normalized_read_count >= min_cells,]
|
|
1032
|
|
1033 column_drops = c("min_cell_count", "min_cell_paste")
|
|
1034
|
|
1035 triplets = triplets[,!(colnames(triplets) %in% column_drops)]
|
|
1036
|
|
1037 cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T)
|
0
|
1038
|
1
|
1039 interval = intervalReads
|
|
1040 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
1041 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
0
|
1042
|
1
|
1043 triplets = split(triplets, triplets$Patient, drop=T)
|
|
1044 print(nrow(triplets))
|
|
1045 for(triplet in triplets){
|
|
1046 samples = unique(triplet$Sample)
|
|
1047 one = triplet[triplet$Sample == samples[1],]
|
|
1048 two = triplet[triplet$Sample == samples[2],]
|
|
1049 three = triplet[triplet$Sample == samples[3],]
|
|
1050
|
|
1051 print(paste(nrow(triplet), nrow(one), nrow(two), nrow(three)))
|
|
1052 tripletAnalysis(one, one[1,"uniqueID"], two, two[1,"uniqueID"], three, three[1,"uniqueID"], product=product, interval=interval, on="normalized_read_count", T)
|
|
1053 }
|
|
1054
|
|
1055 cat("<tr><td>Starting Frequency analysis</td></tr>", file=logfile, append=T)
|
|
1056
|
|
1057 interval = intervalFreq
|
|
1058 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
1059 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
|
1060
|
|
1061 for(triplet in triplets){
|
|
1062 samples = unique(triplet$Sample)
|
|
1063 one = triplet[triplet$Sample == samples[1],]
|
|
1064 two = triplet[triplet$Sample == samples[2],]
|
|
1065 three = triplet[triplet$Sample == samples[3],]
|
|
1066 tripletAnalysis(one, one[1,"uniqueID"], two, two[1,"uniqueID"], three, three[1,"uniqueID"], product=product, interval=interval, on="Frequency", F)
|
|
1067 }
|
0
|
1068 } else {
|
|
1069 cat("", file="triplets.txt")
|
|
1070 }
|
|
1071 cat("</table></html>", file=logfile, append=T)
|