0
|
1 args <- commandArgs(trailingOnly = TRUE)
|
|
2 options(scipen=999)
|
|
3
|
|
4 inFile = args[1]
|
|
5 outDir = args[2]
|
|
6 logfile = args[3]
|
|
7 min_freq = as.numeric(args[4])
|
|
8 min_cells = as.numeric(args[5])
|
|
9 mergeOn = args[6]
|
|
10
|
|
11 cat("<html><table><tr><td>Starting analysis</td></tr>", file=logfile, append=F)
|
|
12
|
|
13 library(ggplot2)
|
|
14 library(reshape2)
|
|
15 library(data.table)
|
|
16 library(grid)
|
|
17 library(parallel)
|
|
18 #require(xtable)
|
|
19 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T)
|
|
20 dat = read.table(inFile, header=T, sep="\t", dec=".", fill=T, stringsAsFactors=F)
|
2
|
21
|
|
22 needed_cols = c("Patient", "Receptor", "Sample", "Cell_Count", "Clone_Molecule_Count_From_Spikes", "Log10_Frequency", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence", "Clone_Sequence")
|
|
23 if(!all(needed_cols %in% names(dat))){
|
|
24 cat("Missing column(s):<br />", file=logfile, append=F)
|
|
25 missing_cols = needed_cols[!(needed_cols %in% names(dat))]
|
|
26 for(col in missing_cols){
|
|
27 cat(paste(col, "<br />"), file=logfile, append=T)
|
|
28 }
|
|
29 stop("Not all columns are present")
|
|
30 }
|
|
31
|
|
32 if(!("Total_Read_Count" %in% names(dat))){
|
|
33 dat$Total_Read_Count = 0
|
|
34 }
|
|
35
|
|
36 dat = dat[,c("Patient", "Receptor", "Sample", "Cell_Count", "Clone_Molecule_Count_From_Spikes", "Log10_Frequency", "Total_Read_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence", "Clone_Sequence")]
|
7
|
37 dat = dat[!(nchar(dat$Clone_Sequence) < 2),]
|
2
|
38
|
0
|
39 dat$dsPerM = 0
|
|
40 dat = dat[!is.na(dat$Patient),]
|
|
41 dat$Related_to_leukemia_clone = F
|
|
42
|
|
43 setwd(outDir)
|
|
44 cat("<tr><td>Selecting first V/J Genes</td></tr>", file=logfile, append=T)
|
|
45 dat$V_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$V_Segment_Major_Gene), "; "), "[[", 1)))
|
|
46 dat$J_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$J_Segment_Major_Gene), "; "), "[[", 1)))
|
|
47
|
|
48 cat("<tr><td>Calculating Frequency</td></tr>", file=logfile, append=T)
|
|
49
|
|
50 dat$Frequency = ((10^dat$Log10_Frequency)*100)
|
5
|
51 dat = dat[dat$Frequency <= 100,] #just in case?
|
0
|
52
|
|
53 dat = dat[dat$Frequency >= min_freq,]
|
|
54
|
1
|
55 patient.sample.counts = data.frame(data.table(dat)[, list(count=.N), by=c("Patient", "Sample")])
|
|
56 patient.sample.counts = data.frame(data.table(patient.sample.counts)[, list(count=.N), by=c("Patient")])
|
|
57
|
|
58 print("Found the following patients with number of samples:")
|
|
59 print(patient.sample.counts)
|
|
60
|
|
61 patient.sample.counts.pairs = patient.sample.counts[patient.sample.counts$count %in% 1:2,]
|
|
62 patient.sample.counts.triplets = patient.sample.counts[patient.sample.counts$count == 3,]
|
|
63
|
|
64
|
|
65
|
|
66 triplets = dat[dat$Patient %in% patient.sample.counts.triplets$Patient,]
|
|
67 dat = dat[dat$Patient %in% patient.sample.counts.pairs$Patient,]
|
0
|
68
|
|
69 cat("<tr><td>Normalizing to lowest cell count within locus</td></tr>", file=logfile, append=T)
|
|
70
|
|
71 dat$locus_V = substring(dat$V_Segment_Major_Gene, 0, 4)
|
|
72 dat$locus_J = substring(dat$J_Segment_Major_Gene, 0, 4)
|
|
73 min_cell_count = data.frame(data.table(dat)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("Patient", "locus_V", "locus_J")])
|
|
74
|
|
75 dat$min_cell_paste = paste(dat$Patient, dat$locus_V, dat$locus_J)
|
|
76 min_cell_count$min_cell_paste = paste(min_cell_count$Patient, min_cell_count$locus_V, min_cell_count$locus_J)
|
|
77
|
|
78 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
|
|
79 print(paste("rows:", nrow(dat)))
|
|
80 dat = merge(dat, min_cell_count, by="min_cell_paste")
|
|
81 print(paste("rows:", nrow(dat)))
|
|
82 dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * dat$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2
|
|
83
|
|
84 dat = dat[dat$normalized_read_count >= min_cells,]
|
|
85
|
|
86 dat$paste = paste(dat$Sample, dat$Clone_Sequence)
|
|
87
|
|
88 patients = split(dat, dat$Patient, drop=T)
|
|
89 intervalReads = rev(c(0,10,25,50,100,250,500,750,1000,10000))
|
|
90 intervalFreq = rev(c(0,0.01,0.05,0.1,0.5,1,5))
|
|
91 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV")
|
|
92 J_Segments = c(".*", ".*", ".*", "IGKJ", "KDE", ".*", ".*", ".*", ".*", ".*")
|
|
93 Titles = c("Total", "IGH-Vh-Jh", "IGH-Dh-Jh", "Vk-Jk", "Vk-Kde" , "Intron-Kde", "TCRG", "TCRD-Vd-Dd", "TCRD-Dd-Dd", "TCRB-Vb-Jb")
|
|
94 Titles = factor(Titles, levels=Titles)
|
|
95 TitlesOrder = data.frame("Title"=Titles, "TitlesOrder"=1:length(Titles))
|
|
96
|
2
|
97 single_patients = dat[NULL,]
|
0
|
98
|
|
99 patient.merge.list = list() #cache the 'both' table, 2x speedup for more memory...
|
|
100 patient.merge.list.second = list()
|
|
101 scatter_locus_data_list = list()
|
|
102 cat(paste("<table border='0' style='font-family:courier;'>", sep=""), file="multiple_matches.html", append=T)
|
|
103 cat(paste("<table border='0' style='font-family:courier;'>", sep=""), file="single_matches.html", append=T)
|
|
104 patientCountOnColumn <- function(x, product, interval, on, appendtxt=F){
|
|
105 if (!is.data.frame(x) & is.list(x)){
|
|
106 x = x[[1]]
|
|
107 }
|
|
108 #x$Sample = factor(x$Sample, levels=unique(x$Sample))
|
|
109 x = data.frame(x,stringsAsFactors=F)
|
|
110 onShort = "reads"
|
|
111 if(on == "Frequency"){
|
|
112 onShort = "freq"
|
|
113 }
|
|
114 onx = paste(on, ".x", sep="")
|
|
115 ony = paste(on, ".y", sep="")
|
|
116 splt = split(x, x$Sample, drop=T)
|
|
117 type="pair"
|
|
118 if(length(splt) == 1){
|
|
119 print(paste(paste(x[1,which(colnames(x) == "Patient")]), "has one sample"))
|
2
|
120 splt[[2]] = splt[[1]][NULL,]
|
0
|
121 type="single"
|
|
122 }
|
|
123 patient1 = splt[[1]]
|
|
124 patient2 = splt[[2]]
|
|
125
|
2
|
126 oneSample = patient1[1,"Sample"]
|
|
127 twoSample = patient2[1,"Sample"]
|
|
128 patient = x[1,"Patient"]
|
|
129
|
0
|
130 switched = F
|
|
131 if(length(grep(".*_Right$", twoSample)) == 1 || length(grep(".*_Dx_BM$", twoSample)) == 1 || length(grep(".*_Dx$", twoSample)) == 1 ){
|
|
132 tmp = twoSample
|
|
133 twoSample = oneSample
|
|
134 oneSample = tmp
|
|
135 tmp = patient1
|
|
136 patient1 = patient2
|
|
137 patient2 = tmp
|
|
138 switched = T
|
|
139 }
|
|
140 if(appendtxt){
|
|
141 cat(paste(patient, oneSample, twoSample, type, sep="\t"), file="patients.txt", append=T, sep="", fill=3)
|
|
142 }
|
|
143 cat(paste("<tr><td>", patient, "</td>", sep=""), file=logfile, append=T)
|
|
144
|
|
145 if(mergeOn == "Clone_Sequence"){
|
|
146 patient1$merge = paste(patient1$Clone_Sequence)
|
|
147 patient2$merge = paste(patient2$Clone_Sequence)
|
|
148 } else {
|
|
149 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
|
|
150 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
|
|
151 }
|
|
152
|
|
153 scatterplot_data_columns = c("Patient", "Sample", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge")
|
|
154 scatterplot_data = patient1[NULL,scatterplot_data_columns]
|
|
155 scatterplot.data.type.factor = c(oneSample, twoSample, paste(c(oneSample, twoSample), "In Both"))
|
|
156 scatterplot_data$type = character(0)
|
|
157 scatterplot_data$link = numeric(0)
|
|
158 scatterplot_data$on = character(0)
|
|
159
|
|
160 patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge")[NULL,] #blegh
|
2
|
161
|
0
|
162 cs.exact.matches = patient1[patient1$Clone_Sequence %in% patient2$Clone_Sequence,]$Clone_Sequence
|
2
|
163
|
0
|
164 start.time = proc.time()
|
|
165 merge.list = c()
|
2
|
166
|
0
|
167 if(patient %in% names(patient.merge.list)){
|
|
168 patientMerge = patient.merge.list[[patient]]
|
|
169 merge.list[["second"]] = patient.merge.list.second[[patient]]
|
|
170 scatterplot_data = scatter_locus_data_list[[patient]]
|
|
171 cat(paste("<td>", nrow(patient1), " in ", oneSample, " and ", nrow(patient2), " in ", twoSample, ", ", nrow(patientMerge), " in both (fetched from cache)</td></tr>", sep=""), file=logfile, append=T)
|
2
|
172
|
5
|
173 #print(names(patient.merge.list))
|
0
|
174 } else {
|
|
175 #fuzzy matching here...
|
2
|
176
|
0
|
177 patient1.fuzzy = patient1
|
|
178 patient2.fuzzy = patient2
|
2
|
179
|
0
|
180 patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J)
|
|
181 patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J)
|
2
|
182
|
0
|
183 patient.fuzzy = rbind(patient1.fuzzy, patient2.fuzzy)
|
|
184 patient.fuzzy = patient.fuzzy[order(nchar(patient.fuzzy$Clone_Sequence)),]
|
2
|
185
|
0
|
186 merge.list = list()
|
2
|
187
|
0
|
188 merge.list[["second"]] = vector()
|
2
|
189
|
|
190 link.count = 1
|
|
191
|
0
|
192 while(nrow(patient.fuzzy) > 1){
|
|
193 first.merge = patient.fuzzy[1,"merge"]
|
|
194 first.clone.sequence = patient.fuzzy[1,"Clone_Sequence"]
|
|
195 first.sample = patient.fuzzy[1,"Sample"]
|
|
196 merge.filter = first.merge == patient.fuzzy$merge
|
2
|
197
|
0
|
198 #length.filter = nchar(patient.fuzzy$Clone_Sequence) - nchar(first.clone.sequence) <= 9
|
2
|
199
|
0
|
200 first.sample.filter = first.sample == patient.fuzzy$Sample
|
|
201 second.sample.filter = first.sample != patient.fuzzy$Sample
|
2
|
202
|
0
|
203 #first match same sample, sum to a single row, same for other sample
|
|
204 #then merge rows like 'normal'
|
2
|
205
|
0
|
206 sequence.filter = grepl(paste("^", first.clone.sequence, sep=""), patient.fuzzy$Clone_Sequence)
|
2
|
207
|
|
208
|
|
209
|
0
|
210 #match.filter = merge.filter & grepl(first.clone.sequence, patient.fuzzy$Clone_Sequence) & length.filter & sample.filter
|
|
211 first.match.filter = merge.filter & sequence.filter & first.sample.filter
|
|
212 second.match.filter = merge.filter & sequence.filter & second.sample.filter
|
2
|
213
|
0
|
214 first.rows = patient.fuzzy[first.match.filter,]
|
|
215 second.rows = patient.fuzzy[second.match.filter,]
|
2
|
216
|
0
|
217 first.rows.v = table(first.rows$V_Segment_Major_Gene)
|
|
218 first.rows.v = names(first.rows.v[which.max(first.rows.v)])
|
|
219 first.rows.j = table(first.rows$J_Segment_Major_Gene)
|
|
220 first.rows.j = names(first.rows.j[which.max(first.rows.j)])
|
2
|
221
|
0
|
222 first.sum = data.frame(merge = first.clone.sequence,
|
|
223 Patient = patient,
|
|
224 Receptor = first.rows[1,"Receptor"],
|
|
225 Sample = first.rows[1,"Sample"],
|
|
226 Cell_Count = first.rows[1,"Cell_Count"],
|
|
227 Clone_Molecule_Count_From_Spikes = sum(first.rows$Clone_Molecule_Count_From_Spikes),
|
|
228 Log10_Frequency = log10(sum(first.rows$Frequency)),
|
|
229 Total_Read_Count = sum(first.rows$Total_Read_Count),
|
|
230 dsPerM = sum(first.rows$dsPerM),
|
|
231 J_Segment_Major_Gene = first.rows.j,
|
|
232 V_Segment_Major_Gene = first.rows.v,
|
|
233 Clone_Sequence = first.clone.sequence,
|
|
234 CDR3_Sense_Sequence = first.rows[1,"CDR3_Sense_Sequence"],
|
|
235 Related_to_leukemia_clone = F,
|
|
236 Frequency = sum(first.rows$Frequency),
|
|
237 locus_V = first.rows[1,"locus_V"],
|
|
238 locus_J = first.rows[1,"locus_J"],
|
|
239 min_cell_count = first.rows[1,"min_cell_count"],
|
|
240 normalized_read_count = sum(first.rows$normalized_read_count),
|
|
241 paste = first.rows[1,"paste"],
|
|
242 min_cell_paste = first.rows[1,"min_cell_paste"])
|
2
|
243
|
0
|
244 if(nrow(second.rows) > 0){
|
|
245 second.rows.v = table(second.rows$V_Segment_Major_Gene)
|
|
246 second.rows.v = names(second.rows.v[which.max(second.rows.v)])
|
|
247 second.rows.j = table(second.rows$J_Segment_Major_Gene)
|
|
248 second.rows.j = names(second.rows.j[which.max(second.rows.j)])
|
2
|
249
|
0
|
250 second.sum = data.frame(merge = first.clone.sequence,
|
2
|
251 Patient = patient,
|
|
252 Receptor = second.rows[1,"Receptor"],
|
|
253 Sample = second.rows[1,"Sample"],
|
|
254 Cell_Count = second.rows[1,"Cell_Count"],
|
|
255 Clone_Molecule_Count_From_Spikes = sum(second.rows$Clone_Molecule_Count_From_Spikes),
|
|
256 Log10_Frequency = log10(sum(second.rows$Frequency)),
|
|
257 Total_Read_Count = sum(second.rows$Total_Read_Count),
|
|
258 dsPerM = sum(second.rows$dsPerM),
|
|
259 J_Segment_Major_Gene = second.rows.j,
|
|
260 V_Segment_Major_Gene = second.rows.v,
|
|
261 Clone_Sequence = first.clone.sequence,
|
|
262 CDR3_Sense_Sequence = second.rows[1,"CDR3_Sense_Sequence"],
|
|
263 Related_to_leukemia_clone = F,
|
|
264 Frequency = sum(second.rows$Frequency),
|
|
265 locus_V = second.rows[1,"locus_V"],
|
|
266 locus_J = second.rows[1,"locus_J"],
|
|
267 min_cell_count = second.rows[1,"min_cell_count"],
|
|
268 normalized_read_count = sum(second.rows$normalized_read_count),
|
|
269 paste = second.rows[1,"paste"],
|
|
270 min_cell_paste = second.rows[1,"min_cell_paste"])
|
|
271
|
|
272 #print(names(patientMerge))
|
|
273 #print(merge(first.sum, second.sum, by="merge"))
|
0
|
274 patientMerge = rbind(patientMerge, merge(first.sum, second.sum, by="merge"))
|
2
|
275 #print("test2")
|
0
|
276 patient.fuzzy = patient.fuzzy[!(first.match.filter | second.match.filter),]
|
2
|
277
|
0
|
278 hidden.clone.sequences = c(first.rows[-1,"Clone_Sequence"], second.rows[second.rows$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
|
|
279 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
2
|
280
|
0
|
281 tmp.rows = rbind(first.rows, second.rows)
|
2
|
282 #print("test3")
|
0
|
283 tmp.rows = tmp.rows[order(nchar(tmp.rows$Clone_Sequence)),]
|
|
284
|
|
285
|
|
286 #add to the scatterplot data
|
|
287 scatterplot.row = first.sum[,scatterplot_data_columns]
|
2
|
288 scatterplot.row$type = paste(first.sum[,"Sample"], "In Both")
|
|
289 scatterplot.row$link = link.count
|
|
290 scatterplot.row$on = onShort
|
|
291
|
|
292 scatterplot_data = rbind(scatterplot_data, scatterplot.row)
|
0
|
293
|
|
294 scatterplot.row = second.sum[,scatterplot_data_columns]
|
2
|
295 scatterplot.row$type = paste(second.sum[,"Sample"], "In Both")
|
|
296 scatterplot.row$link = link.count
|
|
297 scatterplot.row$on = onShort
|
|
298
|
|
299 scatterplot_data = rbind(scatterplot_data, scatterplot.row)
|
|
300
|
|
301 #write some information about the match to a log file
|
0
|
302 if (nrow(first.rows) > 1 | nrow(second.rows) > 1) {
|
|
303 cat(paste("<tr><td>", patient, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="multiple_matches.html", append=T)
|
|
304 } else {
|
|
305 second.clone.sequence = second.rows[1,"Clone_Sequence"]
|
|
306 if(nchar(first.clone.sequence) != nchar(second.clone.sequence)){
|
|
307 cat(paste("<tr bgcolor='#DDD'><td>", patient, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="single_matches.html", append=T)
|
|
308 } else {
|
|
309 #cat(paste("<tr><td>", patient, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="single_matches.html", append=T)
|
|
310 }
|
|
311 }
|
2
|
312
|
0
|
313 } else if(nrow(first.rows) > 1) {
|
|
314 if(patient1[1,"Sample"] == first.sample){
|
|
315 patient1 = patient1[!(patient1$Clone_Sequence %in% first.rows$Clone_Sequence),]
|
|
316 patient1 = rbind(patient1, first.sum)
|
|
317 } else {
|
|
318 patient2 = patient2[!(patient2$Clone_Sequence %in% first.rows$Clone_Sequence),]
|
|
319 patient2 = rbind(patient2, first.sum)
|
|
320 }
|
2
|
321
|
0
|
322 hidden.clone.sequences = c(first.rows[-1,"Clone_Sequence"])
|
|
323 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
2
|
324
|
0
|
325 patient.fuzzy = patient.fuzzy[-first.match.filter,]
|
|
326
|
|
327 #add to the scatterplot data
|
|
328 scatterplot.row = first.sum[,scatterplot_data_columns]
|
2
|
329 scatterplot.row$type = first.sum[,"Sample"]
|
|
330 scatterplot.row$link = link.count
|
|
331 scatterplot.row$on = onShort
|
|
332
|
|
333 scatterplot_data = rbind(scatterplot_data, scatterplot.row)
|
|
334
|
0
|
335 cat(paste("<tr bgcolor='#DDF'><td>", patient, " row ", 1:nrow(first.rows), "</td><td>", first.rows$Sample, ":</td><td>", first.rows$Clone_Sequence, "</td><td>", first.rows$normalized_read_count, "</td></tr>", sep=""), file="single_matches.html", append=T)
|
|
336 } else {
|
|
337 patient.fuzzy = patient.fuzzy[-1,]
|
|
338
|
|
339 #add to the scatterplot data
|
|
340 scatterplot.row = first.sum[,scatterplot_data_columns]
|
2
|
341 scatterplot.row$type = first.sum[,"Sample"]
|
|
342 scatterplot.row$link = link.count
|
|
343 scatterplot.row$on = onShort
|
|
344
|
|
345 scatterplot_data = rbind(scatterplot_data, scatterplot.row)
|
0
|
346 }
|
|
347 link.count = link.count + 1
|
|
348 }
|
|
349 patient.merge.list[[patient]] <<- patientMerge
|
|
350 patient.merge.list.second[[patient]] <<- merge.list[["second"]]
|
|
351
|
|
352 sample.order = data.frame(type = c(oneSample, twoSample, paste(c(oneSample, twoSample), "In Both")),type.order = 1:4)
|
|
353 scatterplot_data = merge(scatterplot_data, sample.order, by="type")
|
|
354
|
|
355 scatter_locus_data_list[[patient]] <<- scatterplot_data
|
|
356 cat(paste("<td>", nrow(patient1), " in ", oneSample, " and ", nrow(patient2), " in ", twoSample, ", ", nrow(patientMerge), " in both (finding both took ", (proc.time() - start.time)[[3]], "s)</td></tr>", sep=""), file=logfile, append=T)
|
|
357 }
|
2
|
358
|
0
|
359 patient1 = patient1[!(patient1$Clone_Sequence %in% patient.merge.list.second[[patient]]),]
|
|
360 patient2 = patient2[!(patient2$Clone_Sequence %in% patient.merge.list.second[[patient]]),]
|
2
|
361
|
0
|
362
|
|
363 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony])
|
|
364 #patientMerge$thresholdValue = pmin(patientMerge[,onx], patientMerge[,ony])
|
|
365 res1 = vector()
|
|
366 res2 = vector()
|
|
367 resBoth = vector()
|
|
368 read1Count = vector()
|
|
369 read2Count = vector()
|
|
370 locussum1 = vector()
|
|
371 locussum2 = vector()
|
|
372
|
|
373 #for(iter in 1){
|
|
374 for(iter in 1:length(product[,1])){
|
2
|
375 threshhold = product[iter,"interval"]
|
|
376 V_Segment = paste(".*", as.character(product[iter,"V_Segments"]), ".*", sep="")
|
|
377 J_Segment = paste(".*", as.character(product[iter,"J_Segments"]), ".*", sep="")
|
0
|
378 #both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,onx] > threshhold & patientMerge[,ony] > threshhold) #both higher than threshold
|
|
379 both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold) #highest of both is higher than threshold
|
|
380 one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$merge %in% patientMerge[both,]$merge))
|
|
381 two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$merge %in% patientMerge[both,]$merge))
|
|
382 read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count))
|
|
383 read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count))
|
|
384 res1 = append(res1, sum(one))
|
|
385 res2 = append(res2, sum(two))
|
|
386 resBoth = append(resBoth, sum(both))
|
|
387 locussum1 = append(locussum1, sum(patient1[(grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene)),]$normalized_read_count))
|
|
388 locussum2 = append(locussum2, sum(patient2[(grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene)),]$normalized_read_count))
|
|
389 #threshhold = 0
|
2
|
390 if(threshhold != 0 | T){
|
0
|
391 if(sum(one) > 0){
|
|
392 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
393 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone Sequence", "Related_to_leukemia_clone")
|
2
|
394 filenameOne = paste(oneSample, "_", product[iter, "Titles"], "_", threshhold, sep="")
|
0
|
395 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
396 }
|
|
397 if(sum(two) > 0){
|
|
398 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
399 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone Sequence", "Related_to_leukemia_clone")
|
2
|
400 filenameTwo = paste(twoSample, "_", product[iter, "Titles"], "_", threshhold, sep="")
|
0
|
401 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
402 }
|
6
|
403 }
|
|
404 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
|
|
405 if(nrow(scatterplot_locus_data) > 0){
|
|
406 scatterplot_locus_data$Rearrangement = product[iter, "Titles"]
|
|
407 }
|
|
408 p = NULL
|
|
409 #print(paste("nrow scatterplot_locus_data", nrow(scatterplot_locus_data)))
|
|
410 if(nrow(scatterplot_locus_data) != 0){
|
|
411 if(on == "normalized_read_count"){
|
|
412 write.table(scatterplot_locus_data, file=paste(oneSample, twoSample, product[iter, "Titles"], "scatterplot_locus_data.txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
413 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
|
414 p = ggplot(scatterplot_locus_data, aes(factor(reorder(type, type.order)), normalized_read_count, group=link)) + geom_line() + scale_y_log10(breaks=scales,labels=scales, limits=c(1,1e6)) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE)
|
|
415 } else {
|
|
416 p = ggplot(scatterplot_locus_data, aes(factor(reorder(type, type.order)), Frequency, group=link)) + geom_line() + scale_y_log10(limits=c(0.0001,100), breaks=c(0.0001, 0.001, 0.01, 0.1, 1, 10, 100), labels=c("0.0001", "0.001", "0.01", "0.1", "1", "10", "100")) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE)
|
0
|
417 }
|
6
|
418 p = p + geom_point(aes(colour=type), position="dodge")
|
|
419 p = p + xlab("In one or both samples") + ylab(onShort) + ggtitle(paste(patient1[1,"Patient"], patient1[1,"Sample"], patient2[1,"Sample"], onShort, product[iter, "Titles"]))
|
|
420 } else {
|
|
421 p = ggplot(NULL, aes(x=c("In one", "In Both"),y=0)) + geom_blank(NULL) + xlab("In one or both of the samples") + ylab(onShort) + ggtitle(paste(patient1[1,"Patient"], patient1[1,"Sample"], patient2[1,"Sample"], onShort, product[iter, "Titles"]))
|
0
|
422 }
|
8
|
423 file_name = paste(patient1[1,"Patient"], "_", patient1[1,"Sample"], "_", patient2[1,"Sample"], "_", onShort, "_", product[iter, "Titles"],"_scatter.png", sep="")
|
|
424 print(paste("Writing figure:", file_name))
|
|
425 png(file_name)
|
6
|
426 print(p)
|
|
427 dev.off()
|
0
|
428 if(sum(both) > 0){
|
|
429 dfBoth = patientMerge[both,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
430 colnames(dfBoth) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample))
|
2
|
431 filenameBoth = paste(oneSample, "_", twoSample, "_", product[iter, "Titles"], "_", threshhold, sep="")
|
0
|
432 write.table(dfBoth, file=paste(filenameBoth, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
433 }
|
|
434 }
|
|
435 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "Both"=resBoth, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "Sum"=res1 + res2 + resBoth, "percentage" = round((resBoth/(res1 + res2 + resBoth)) * 100, digits=2), "Locus_sum1"=locussum1, "Locus_sum2"=locussum2)
|
|
436 if(sum(is.na(patientResult$percentage)) > 0){
|
|
437 patientResult[is.na(patientResult$percentage),]$percentage = 0
|
|
438 }
|
|
439 colnames(patientResult)[6] = oneSample
|
|
440 colnames(patientResult)[8] = twoSample
|
|
441 colnamesBak = colnames(patientResult)
|
|
442 colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", paste("Number of sequences ", patient, "_Both", sep=""), paste("Number of sequences", oneSample, sep=""), paste("Normalized Read Count", oneSample), paste("Number of sequences", twoSample, sep=""), paste("Normalized Read Count", twoSample), paste("Sum number of sequences", patient), paste("Percentage of sequences ", patient, "_Both", sep=""), paste("Locus Sum", oneSample), paste("Locus Sum", twoSample))
|
|
443 write.table(patientResult, file=paste(patient, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
444 colnames(patientResult) = colnamesBak
|
|
445
|
|
446 patientResult$Locus = factor(patientResult$Locus, Titles)
|
|
447 patientResult$cut_off_value = factor(patientResult$cut_off_value, paste(">", interval, sep=""))
|
|
448
|
|
449 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "Both")])
|
|
450 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=Both), stat='identity', position="dodge", fill="#79c36a")
|
|
451 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
452 plt = plt + geom_text(aes(ymax=max(Both), x=cut_off_value,y=Both,label=Both), angle=90, hjust=0)
|
|
453 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in both")
|
|
454 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
8
|
455 file_name = paste(patient, "_", onShort, ".png", sep="")
|
|
456 print(paste("Writing figure:", file_name))
|
|
457 png(file_name, width=1920, height=1080)
|
0
|
458 print(plt)
|
|
459 dev.off()
|
|
460 #(t,r,b,l)
|
|
461 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "percentage")])
|
|
462 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=percentage), stat='identity', position="dodge", fill="#79c36a")
|
|
463 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
464 plt = plt + geom_text(aes(ymax=max(percentage), x=cut_off_value,y=percentage,label=percentage), angle=90, hjust=0)
|
|
465 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("% clones in both left and right")
|
|
466 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
8
|
467
|
|
468 file_name = paste(patient, "_percent_", onShort, ".png", sep="")
|
|
469 print(paste("Writing figure:", file_name))
|
|
470 png(file_name, width=1920, height=1080)
|
0
|
471 print(plt)
|
|
472 dev.off()
|
|
473
|
|
474 patientResult = melt(patientResult[,c('Locus','cut_off_value', oneSample, twoSample)] ,id.vars=1:2)
|
|
475 patientResult$relativeValue = patientResult$value * 10
|
|
476 patientResult[patientResult$relativeValue == 0,]$relativeValue = 1
|
|
477 plt = ggplot(patientResult)
|
|
478 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=relativeValue, fill=variable), stat='identity', position="dodge")
|
|
479 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
480 plt = plt + scale_y_continuous(trans="log", breaks=10^c(0:10), labels=c(0, 10^c(0:9)))
|
|
481 plt = plt + geom_text(data=patientResult[patientResult$variable == oneSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=-0.2)
|
|
482 plt = plt + geom_text(data=patientResult[patientResult$variable == twoSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=0.8)
|
|
483 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle(paste("Number of clones in only ", oneSample, " and only ", twoSample, sep=""))
|
8
|
484 file_name = paste(patient, "_", onShort, "_both.png", sep="")
|
|
485 print(paste("Writing figure:", file_name))
|
|
486 png(file_name, width=1920, height=1080)
|
0
|
487 print(plt)
|
|
488 dev.off()
|
|
489 }
|
2
|
490
|
1
|
491 if(length(patients) > 0){
|
|
492 cat("<tr><td>Starting Frequency analysis</td></tr>", file=logfile, append=T)
|
0
|
493
|
1
|
494 interval = intervalFreq
|
|
495 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
496 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
2
|
497 for (current_patient in patients){
|
|
498 print(paste("Started working", unique(current_patient$Patient), "Frequency analysis"))
|
|
499 patientCountOnColumn(current_patient, product=product, interval=interval, on="Frequency", appendtxt=T)
|
|
500 }
|
0
|
501
|
1
|
502 cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T)
|
0
|
503
|
1
|
504 interval = intervalReads
|
|
505 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
506 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
2
|
507 for (current_patient in patients){
|
|
508 print(paste("Started working on ", unique(current_patient$Patient), "Read Count analysis"))
|
|
509 patientCountOnColumn(current_patient, product=product, interval=interval, on="normalized_read_count")
|
|
510 }
|
1
|
511 }
|
0
|
512 if(nrow(single_patients) > 0){
|
|
513 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
|
514 p = ggplot(single_patients, aes(Rearrangement, normalized_read_count)) + scale_y_log10(breaks=scales,labels=as.character(scales)) + expand_limits(y=c(0,1000000))
|
|
515 p = p + geom_point(aes(colour=type), position="jitter")
|
|
516 p = p + xlab("In one or both samples") + ylab("Reads")
|
|
517 p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the reads of the patients with a single sample")
|
8
|
518 file_name = "singles_reads_scatterplot.png"
|
|
519 print(paste("Writing figure:", file_name))
|
|
520 png(file_name, width=640 * length(unique(single_patients$Patient)) + 100, height=1080)
|
0
|
521 print(p)
|
|
522 dev.off()
|
|
523
|
|
524 #p = ggplot(single_patients, aes(Rearrangement, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
|
|
525 p = ggplot(single_patients, aes(Rearrangement, Frequency)) + scale_y_log10(limits=c(0.0001,100), breaks=c(0.0001, 0.001, 0.01, 0.1, 1, 10, 100), labels=c("0.0001", "0.001", "0.01", "0.1", "1", "10", "100")) + expand_limits(y=c(0,100))
|
|
526 p = p + geom_point(aes(colour=type), position="jitter")
|
|
527 p = p + xlab("In one or both samples") + ylab("Frequency")
|
|
528 p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the frequency of the patients with a single sample")
|
8
|
529 file_name = "singles_freq_scatterplot.png"
|
|
530 print(paste("Writing figure:", file_name))
|
|
531 png(file_name, width=640 * length(unique(single_patients$Patient)) + 100, height=1080)
|
0
|
532 print(p)
|
|
533 dev.off()
|
|
534 } else {
|
|
535 empty <- data.frame()
|
|
536 p = ggplot(empty) + geom_point() + xlim(0, 10) + ylim(0, 100) + xlab("In one or both samples") + ylab("Frequency") + ggtitle("Scatterplot of the frequency of the patients with a single sample")
|
|
537
|
|
538 png("singles_reads_scatterplot.png", width=400, height=300)
|
|
539 print(p)
|
|
540 dev.off()
|
|
541
|
|
542 png("singles_freq_scatterplot.png", width=400, height=300)
|
|
543 print(p)
|
|
544 dev.off()
|
|
545 }
|
|
546
|
|
547 patient.merge.list = list() #cache the 'both' table, 2x speedup for more memory...
|
|
548 patient.merge.list.second = list()
|
|
549
|
|
550 tripletAnalysis <- function(patient1, label1, patient2, label2, patient3, label3, product, interval, on, appendTriplets= FALSE){
|
1
|
551 onShort = "reads"
|
|
552 if(on == "Frequency"){
|
|
553 onShort = "freq"
|
|
554 }
|
|
555 onx = paste(on, ".x", sep="")
|
|
556 ony = paste(on, ".y", sep="")
|
|
557 onz = paste(on, ".z", sep="")
|
|
558 type="triplet"
|
0
|
559
|
1
|
560 threshholdIndex = which(colnames(product) == "interval")
|
|
561 V_SegmentIndex = which(colnames(product) == "V_Segments")
|
|
562 J_SegmentIndex = which(colnames(product) == "J_Segments")
|
|
563 titleIndex = which(colnames(product) == "Titles")
|
|
564 sampleIndex = which(colnames(patient1) == "Sample")
|
|
565 patientIndex = which(colnames(patient1) == "Patient")
|
|
566 oneSample = paste(patient1[1,sampleIndex], sep="")
|
|
567 twoSample = paste(patient2[1,sampleIndex], sep="")
|
|
568 threeSample = paste(patient3[1,sampleIndex], sep="")
|
|
569
|
|
570 if(mergeOn == "Clone_Sequence"){
|
|
571 patient1$merge = paste(patient1$Clone_Sequence)
|
0
|
572 patient2$merge = paste(patient2$Clone_Sequence)
|
|
573 patient3$merge = paste(patient3$Clone_Sequence)
|
|
574
|
1
|
575 } else {
|
0
|
576 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
|
|
577 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
|
|
578 patient3$merge = paste(patient3$V_Segment_Major_Gene, patient3$J_Segment_Major_Gene, patient3$CDR3_Sense_Sequence)
|
1
|
579 }
|
0
|
580
|
1
|
581 #patientMerge = merge(patient1, patient2, by="merge")[NULL,]
|
|
582 patient1.fuzzy = patient1
|
|
583 patient2.fuzzy = patient2
|
|
584 patient3.fuzzy = patient3
|
0
|
585
|
1
|
586 cat(paste("<tr><td>", label1, "</td>", sep=""), file=logfile, append=T)
|
0
|
587
|
1
|
588 patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J)
|
|
589 patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J)
|
|
590 patient3.fuzzy$merge = paste(patient3.fuzzy$locus_V, patient3.fuzzy$locus_J)
|
0
|
591
|
1
|
592 patient.fuzzy = rbind(patient1.fuzzy ,patient2.fuzzy, patient3.fuzzy)
|
|
593 patient.fuzzy = patient.fuzzy[order(nchar(patient.fuzzy$Clone_Sequence)),]
|
0
|
594
|
1
|
595 other.sample.list = list()
|
|
596 other.sample.list[[oneSample]] = c(twoSample, threeSample)
|
|
597 other.sample.list[[twoSample]] = c(oneSample, threeSample)
|
|
598 other.sample.list[[threeSample]] = c(oneSample, twoSample)
|
0
|
599
|
1
|
600 patientMerge = merge(patient1, patient2, by="merge")
|
|
601 patientMerge = merge(patientMerge, patient3, by="merge")
|
|
602 colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge)))] = paste(colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge), perl=T))], ".z", sep="")
|
|
603 #patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
|
604 patientMerge = patientMerge[NULL,]
|
0
|
605
|
1
|
606 duo.merge.list = list()
|
0
|
607
|
1
|
608 patientMerge12 = merge(patient1, patient2, by="merge")
|
|
609 #patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony])
|
|
610 patientMerge12 = patientMerge12[NULL,]
|
|
611 duo.merge.list[[paste(oneSample, twoSample)]] = patientMerge12
|
|
612 duo.merge.list[[paste(twoSample, oneSample)]] = patientMerge12
|
0
|
613
|
1
|
614 patientMerge13 = merge(patient1, patient3, by="merge")
|
|
615 #patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony])
|
|
616 patientMerge13 = patientMerge13[NULL,]
|
|
617 duo.merge.list[[paste(oneSample, threeSample)]] = patientMerge13
|
|
618 duo.merge.list[[paste(threeSample, oneSample)]] = patientMerge13
|
0
|
619
|
1
|
620 patientMerge23 = merge(patient2, patient3, by="merge")
|
|
621 #patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony])
|
|
622 patientMerge23 = patientMerge23[NULL,]
|
|
623 duo.merge.list[[paste(twoSample, threeSample)]] = patientMerge23
|
|
624 duo.merge.list[[paste(threeSample, twoSample)]] = patientMerge23
|
0
|
625
|
1
|
626 merge.list = list()
|
|
627 merge.list[["second"]] = vector()
|
|
628
|
|
629 #print(paste(nrow(patient1), nrow(patient2), nrow(patient3), label1, label2, label3))
|
|
630
|
|
631 start.time = proc.time()
|
|
632 if(paste(label1, "123") %in% names(patient.merge.list)){
|
|
633 patientMerge = patient.merge.list[[paste(label1, "123")]]
|
|
634 patientMerge12 = patient.merge.list[[paste(label1, "12")]]
|
|
635 patientMerge13 = patient.merge.list[[paste(label1, "13")]]
|
|
636 patientMerge23 = patient.merge.list[[paste(label1, "23")]]
|
0
|
637
|
1
|
638 #merge.list[["second"]] = patient.merge.list.second[[label1]]
|
0
|
639
|
1
|
640 cat(paste("<td>", nrow(patient1), " in ", label1, " and ", nrow(patient2), " in ", label2, nrow(patient3), " in ", label3, ", ", nrow(patientMerge), " in both (fetched from cache)</td></tr>", sep=""), file=logfile, append=T)
|
|
641 } else {
|
|
642 while(nrow(patient.fuzzy) > 0){
|
|
643 first.merge = patient.fuzzy[1,"merge"]
|
|
644 first.clone.sequence = patient.fuzzy[1,"Clone_Sequence"]
|
|
645 first.sample = paste(patient.fuzzy[1,"Sample"], sep="")
|
|
646
|
|
647 merge.filter = first.merge == patient.fuzzy$merge
|
|
648
|
|
649 second.sample = other.sample.list[[first.sample]][1]
|
|
650 third.sample = other.sample.list[[first.sample]][2]
|
0
|
651
|
1
|
652 sample.filter.1 = first.sample == patient.fuzzy$Sample
|
|
653 sample.filter.2 = second.sample == patient.fuzzy$Sample
|
|
654 sample.filter.3 = third.sample == patient.fuzzy$Sample
|
0
|
655
|
1
|
656 sequence.filter = grepl(paste("^", first.clone.sequence, sep=""), patient.fuzzy$Clone_Sequence)
|
0
|
657
|
1
|
658 match.filter.1 = sample.filter.1 & sequence.filter & merge.filter
|
|
659 match.filter.2 = sample.filter.2 & sequence.filter & merge.filter
|
|
660 match.filter.3 = sample.filter.3 & sequence.filter & merge.filter
|
0
|
661
|
1
|
662 matches.in.1 = any(match.filter.1)
|
|
663 matches.in.2 = any(match.filter.2)
|
|
664 matches.in.3 = any(match.filter.3)
|
0
|
665
|
1
|
666 rows.1 = patient.fuzzy[match.filter.1,]
|
0
|
667
|
1
|
668 sum.1 = data.frame(merge = first.clone.sequence,
|
|
669 Patient = label1,
|
|
670 Receptor = rows.1[1,"Receptor"],
|
|
671 Sample = rows.1[1,"Sample"],
|
|
672 Cell_Count = rows.1[1,"Cell_Count"],
|
|
673 Clone_Molecule_Count_From_Spikes = sum(rows.1$Clone_Molecule_Count_From_Spikes),
|
|
674 Log10_Frequency = log10(sum(rows.1$Frequency)),
|
|
675 Total_Read_Count = sum(rows.1$Total_Read_Count),
|
|
676 dsPerM = sum(rows.1$dsPerM),
|
|
677 J_Segment_Major_Gene = rows.1[1,"J_Segment_Major_Gene"],
|
|
678 V_Segment_Major_Gene = rows.1[1,"V_Segment_Major_Gene"],
|
|
679 Clone_Sequence = first.clone.sequence,
|
|
680 CDR3_Sense_Sequence = rows.1[1,"CDR3_Sense_Sequence"],
|
|
681 Related_to_leukemia_clone = F,
|
|
682 Frequency = sum(rows.1$Frequency),
|
|
683 locus_V = rows.1[1,"locus_V"],
|
|
684 locus_J = rows.1[1,"locus_J"],
|
|
685 uniqueID = rows.1[1,"uniqueID"],
|
|
686 normalized_read_count = sum(rows.1$normalized_read_count))
|
|
687 sum.2 = sum.1[NULL,]
|
|
688 rows.2 = patient.fuzzy[match.filter.2,]
|
|
689 if(matches.in.2){
|
|
690 sum.2 = data.frame(merge = first.clone.sequence,
|
|
691 Patient = label1,
|
|
692 Receptor = rows.2[1,"Receptor"],
|
|
693 Sample = rows.2[1,"Sample"],
|
|
694 Cell_Count = rows.2[1,"Cell_Count"],
|
|
695 Clone_Molecule_Count_From_Spikes = sum(rows.2$Clone_Molecule_Count_From_Spikes),
|
|
696 Log10_Frequency = log10(sum(rows.2$Frequency)),
|
|
697 Total_Read_Count = sum(rows.2$Total_Read_Count),
|
|
698 dsPerM = sum(rows.2$dsPerM),
|
|
699 J_Segment_Major_Gene = rows.2[1,"J_Segment_Major_Gene"],
|
|
700 V_Segment_Major_Gene = rows.2[1,"V_Segment_Major_Gene"],
|
|
701 Clone_Sequence = first.clone.sequence,
|
|
702 CDR3_Sense_Sequence = rows.2[1,"CDR3_Sense_Sequence"],
|
|
703 Related_to_leukemia_clone = F,
|
|
704 Frequency = sum(rows.2$Frequency),
|
|
705 locus_V = rows.2[1,"locus_V"],
|
|
706 locus_J = rows.2[1,"locus_J"],
|
|
707 uniqueID = rows.2[1,"uniqueID"],
|
|
708 normalized_read_count = sum(rows.2$normalized_read_count))
|
|
709 }
|
0
|
710
|
1
|
711 sum.3 = sum.1[NULL,]
|
|
712 rows.3 = patient.fuzzy[match.filter.3,]
|
|
713 if(matches.in.3){
|
|
714 sum.3 = data.frame(merge = first.clone.sequence,
|
|
715 Patient = label1,
|
|
716 Receptor = rows.3[1,"Receptor"],
|
|
717 Sample = rows.3[1,"Sample"],
|
|
718 Cell_Count = rows.3[1,"Cell_Count"],
|
|
719 Clone_Molecule_Count_From_Spikes = sum(rows.3$Clone_Molecule_Count_From_Spikes),
|
|
720 Log10_Frequency = log10(sum(rows.3$Frequency)),
|
|
721 Total_Read_Count = sum(rows.3$Total_Read_Count),
|
|
722 dsPerM = sum(rows.3$dsPerM),
|
|
723 J_Segment_Major_Gene = rows.3[1,"J_Segment_Major_Gene"],
|
|
724 V_Segment_Major_Gene = rows.3[1,"V_Segment_Major_Gene"],
|
|
725 Clone_Sequence = first.clone.sequence,
|
|
726 CDR3_Sense_Sequence = rows.3[1,"CDR3_Sense_Sequence"],
|
|
727 Related_to_leukemia_clone = F,
|
|
728 Frequency = sum(rows.3$Frequency),
|
|
729 locus_V = rows.3[1,"locus_V"],
|
|
730 locus_J = rows.3[1,"locus_J"],
|
|
731 uniqueID = rows.3[1,"uniqueID"],
|
|
732 normalized_read_count = sum(rows.3$normalized_read_count))
|
|
733 }
|
0
|
734
|
1
|
735 if(matches.in.2 & matches.in.3){
|
|
736 merge.123 = merge(sum.1, sum.2, by="merge")
|
|
737 merge.123 = merge(merge.123, sum.3, by="merge")
|
|
738 colnames(merge.123)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(merge.123)))] = paste(colnames(merge.123)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(merge.123), perl=T))], ".z", sep="")
|
|
739 #merge.123$thresholdValue = pmax(merge.123[,onx], merge.123[,ony], merge.123[,onz])
|
0
|
740
|
1
|
741 patientMerge = rbind(patientMerge, merge.123)
|
|
742 patient.fuzzy = patient.fuzzy[!(match.filter.1 | match.filter.2 | match.filter.3),]
|
0
|
743
|
1
|
744 hidden.clone.sequences = c(rows.1[-1,"Clone_Sequence"], rows.2[rows.2$Clone_Sequence != first.clone.sequence,"Clone_Sequence"], rows.3[rows.3$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
|
|
745 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
0
|
746
|
1
|
747 } else if (matches.in.2) {
|
|
748 #other.sample1 = other.sample.list[[first.sample]][1]
|
|
749 #other.sample2 = other.sample.list[[first.sample]][2]
|
0
|
750
|
1
|
751 second.sample = sum.2[,"Sample"]
|
0
|
752
|
1
|
753 current.merge.list = duo.merge.list[[paste(first.sample, second.sample)]]
|
0
|
754
|
1
|
755 merge.12 = merge(sum.1, sum.2, by="merge")
|
0
|
756
|
1
|
757 current.merge.list = rbind(current.merge.list, merge.12)
|
|
758 duo.merge.list[[paste(first.sample, second.sample)]] = current.merge.list
|
0
|
759
|
1
|
760 patient.fuzzy = patient.fuzzy[!(match.filter.1 | match.filter.2),]
|
0
|
761
|
1
|
762 hidden.clone.sequences = c(rows.1[-1,"Clone_Sequence"], rows.2[rows.2$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
|
|
763 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
0
|
764
|
1
|
765 } else if (matches.in.3) {
|
0
|
766
|
1
|
767 #other.sample1 = other.sample.list[[first.sample]][1]
|
|
768 #other.sample2 = other.sample.list[[first.sample]][2]
|
0
|
769
|
1
|
770 second.sample = sum.3[,"Sample"]
|
0
|
771
|
1
|
772 current.merge.list = duo.merge.list[[paste(first.sample, second.sample)]]
|
0
|
773
|
1
|
774 merge.13 = merge(sum.1, sum.3, by="merge")
|
0
|
775
|
1
|
776 current.merge.list = rbind(current.merge.list, merge.13)
|
|
777 duo.merge.list[[paste(first.sample, second.sample)]] = current.merge.list
|
0
|
778
|
1
|
779 patient.fuzzy = patient.fuzzy[!(match.filter.1 | match.filter.3),]
|
0
|
780
|
1
|
781 hidden.clone.sequences = c(rows.1[-1,"Clone_Sequence"], rows.3[rows.3$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
|
|
782 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
0
|
783
|
1
|
784 } else if(nrow(rows.1) > 1){
|
|
785 patient1 = patient1[!(patient1$Clone_Sequence %in% rows.1$Clone_Sequence),]
|
5
|
786 #print(names(patient1)[names(patient1) %in% sum.1])
|
|
787 #print(names(patient1)[!(names(patient1) %in% sum.1)])
|
|
788 #print(names(patient1))
|
|
789 #print(names(sum.1))
|
|
790 #print(summary(sum.1))
|
|
791 #print(summary(patient1))
|
|
792 #print(dim(sum.1))
|
|
793 #print(dim(patient1))
|
|
794 #print(head(sum.1[,names(patient1)]))
|
1
|
795 patient1 = rbind(patient1, sum.1[,names(patient1)])
|
|
796 patient.fuzzy = patient.fuzzy[-match.filter.1,]
|
|
797 } else {
|
|
798 patient.fuzzy = patient.fuzzy[-1,]
|
|
799 }
|
|
800
|
|
801 tmp.rows = rbind(rows.1, rows.2, rows.3)
|
|
802 tmp.rows = tmp.rows[order(nchar(tmp.rows$Clone_Sequence)),]
|
0
|
803
|
1
|
804 if (sum(match.filter.1) > 1 | sum(match.filter.2) > 1 | sum(match.filter.1) > 1) {
|
|
805 cat(paste("<tr><td>", label1, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="multiple_matches.html", append=T)
|
|
806 } else {
|
|
807 }
|
|
808
|
|
809 }
|
|
810 patient.merge.list[[paste(label1, "123")]] = patientMerge
|
|
811
|
|
812 patientMerge12 = duo.merge.list[[paste(oneSample, twoSample)]]
|
|
813 patientMerge13 = duo.merge.list[[paste(oneSample, threeSample)]]
|
|
814 patientMerge23 = duo.merge.list[[paste(twoSample, threeSample)]]
|
0
|
815
|
1
|
816 patient.merge.list[[paste(label1, "12")]] = patientMerge12
|
|
817 patient.merge.list[[paste(label1, "13")]] = patientMerge13
|
|
818 patient.merge.list[[paste(label1, "23")]] = patientMerge23
|
|
819
|
|
820 #patient.merge.list.second[[label1]] = merge.list[["second"]]
|
|
821 }
|
|
822 cat(paste("<td>", nrow(patient1), " in ", label1, " and ", nrow(patient2), " in ", label2, nrow(patient3), " in ", label3, ", ", nrow(patientMerge), " in both (finding both took ", (proc.time() - start.time)[[3]], "s)</td></tr>", sep=""), file=logfile, append=T)
|
|
823 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
|
824 patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony])
|
|
825 patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony])
|
|
826 patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony])
|
|
827
|
|
828 #patientMerge$thresholdValue = pmin(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
|
829 #patientMerge12$thresholdValue = pmin(patientMerge12[,onx], patientMerge12[,ony])
|
|
830 #patientMerge13$thresholdValue = pmin(patientMerge13[,onx], patientMerge13[,ony])
|
|
831 #patientMerge23$thresholdValue = pmin(patientMerge23[,onx], patientMerge23[,ony])
|
0
|
832
|
1
|
833 patient1 = patient1[!(patient1$Clone_Sequence %in% merge.list[["second"]]),]
|
|
834 patient2 = patient2[!(patient2$Clone_Sequence %in% merge.list[["second"]]),]
|
|
835 patient3 = patient3[!(patient3$Clone_Sequence %in% merge.list[["second"]]),]
|
0
|
836
|
1
|
837 if(F){
|
|
838 patientMerge = merge(patient1, patient2, by="merge")
|
|
839 patientMerge = merge(patientMerge, patient3, by="merge")
|
|
840 colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge)))] = paste(colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge), perl=T))], ".z", sep="")
|
|
841 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
|
842 patientMerge12 = merge(patient1, patient2, by="merge")
|
|
843 patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony])
|
|
844 patientMerge13 = merge(patient1, patient3, by="merge")
|
|
845 patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony])
|
|
846 patientMerge23 = merge(patient2, patient3, by="merge")
|
|
847 patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony])
|
|
848 }
|
0
|
849
|
1
|
850 scatterplot_data_columns = c("Clone_Sequence", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge")
|
|
851 scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns], patient3[,scatterplot_data_columns])
|
|
852 scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),]
|
|
853
|
|
854 scatterplot_data$type = factor(x="In one", levels=c("In one", "In two", "In three", "In multiple"))
|
0
|
855
|
1
|
856 res1 = vector()
|
|
857 res2 = vector()
|
|
858 res3 = vector()
|
|
859 res12 = vector()
|
|
860 res13 = vector()
|
|
861 res23 = vector()
|
|
862 resAll = vector()
|
|
863 read1Count = vector()
|
|
864 read2Count = vector()
|
|
865 read3Count = vector()
|
0
|
866
|
1
|
867 if(appendTriplets){
|
|
868 cat(paste(label1, label2, label3, sep="\t"), file="triplets.txt", append=T, sep="", fill=3)
|
|
869 }
|
|
870 for(iter in 1:length(product[,1])){
|
|
871 threshhold = product[iter,threshholdIndex]
|
|
872 V_Segment = paste(".*", as.character(product[iter,V_SegmentIndex]), ".*", sep="")
|
|
873 J_Segment = paste(".*", as.character(product[iter,J_SegmentIndex]), ".*", sep="")
|
|
874 #all = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,onx] > threshhold & patientMerge[,ony] > threshhold & patientMerge[,onz] > threshhold)
|
|
875 all = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold)
|
0
|
876
|
1
|
877 one_two = (grepl(V_Segment, patientMerge12$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge12$J_Segment_Major_Gene.x) & patientMerge12$thresholdValue > threshhold & !(patientMerge12$merge %in% patientMerge[all,]$merge))
|
|
878 one_three = (grepl(V_Segment, patientMerge13$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge13$J_Segment_Major_Gene.x) & patientMerge13$thresholdValue > threshhold & !(patientMerge13$merge %in% patientMerge[all,]$merge))
|
|
879 two_three = (grepl(V_Segment, patientMerge23$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge23$J_Segment_Major_Gene.x) & patientMerge23$thresholdValue > threshhold & !(patientMerge23$merge %in% patientMerge[all,]$merge))
|
|
880
|
|
881 one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$merge %in% patientMerge[all,]$merge) & !(patient1$merge %in% patientMerge12[one_two,]$merge) & !(patient1$merge %in% patientMerge13[one_three,]$merge))
|
|
882 two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$merge %in% patientMerge[all,]$merge) & !(patient2$merge %in% patientMerge12[one_two,]$merge) & !(patient2$merge %in% patientMerge23[two_three,]$merge))
|
|
883 three = (grepl(V_Segment, patient3$V_Segment_Major_Gene) & grepl(J_Segment, patient3$J_Segment_Major_Gene) & patient3[,on] > threshhold & !(patient3$merge %in% patientMerge[all,]$merge) & !(patient3$merge %in% patientMerge13[one_three,]$merge) & !(patient3$merge %in% patientMerge23[two_three,]$merge))
|
0
|
884
|
1
|
885 read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.x))
|
|
886 read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.y))
|
|
887 read3Count = append(read3Count, sum(patient3[three,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.z))
|
|
888 res1 = append(res1, sum(one))
|
|
889 res2 = append(res2, sum(two))
|
|
890 res3 = append(res3, sum(three))
|
|
891 resAll = append(resAll, sum(all))
|
|
892 res12 = append(res12, sum(one_two))
|
|
893 res13 = append(res13, sum(one_three))
|
|
894 res23 = append(res23, sum(two_three))
|
|
895 #threshhold = 0
|
|
896 if(threshhold != 0){
|
|
897 if(sum(one) > 0){
|
|
898 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
899 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
|
900 filenameOne = paste(label1, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
901 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
902 }
|
|
903 if(sum(two) > 0){
|
|
904 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
905 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
|
906 filenameTwo = paste(label2, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
907 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
908 }
|
|
909 if(sum(three) > 0){
|
|
910 dfThree = patient3[three,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
911 colnames(dfThree) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
|
912 filenameThree = paste(label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
913 write.table(dfThree, file=paste(filenameThree, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
914 }
|
|
915 if(sum(one_two) > 0){
|
|
916 dfOne_two = patientMerge12[one_two,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
917 colnames(dfOne_two) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample))
|
|
918 filenameOne_two = paste(label1, "_", label2, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
919 write.table(dfOne_two, file=paste(filenameOne_two, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
920 }
|
|
921 if(sum(one_three) > 0){
|
|
922 dfOne_three = patientMerge13[one_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
923 colnames(dfOne_three) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
|
924 filenameOne_three = paste(label1, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
925 write.table(dfOne_three, file=paste(filenameOne_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
926 }
|
|
927 if(sum(two_three) > 0){
|
|
928 dfTwo_three = patientMerge23[two_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
929 colnames(dfTwo_three) = c(paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
|
930 filenameTwo_three = paste(label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
931 write.table(dfTwo_three, file=paste(filenameTwo_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
932 }
|
|
933 } else { #scatterplot data
|
|
934 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
|
|
935 scatterplot_locus_data = scatterplot_locus_data[!(scatterplot_locus_data$merge %in% merge.list[["second"]]),]
|
|
936 in_two = (scatterplot_locus_data$merge %in% patientMerge12[one_two,]$merge) | (scatterplot_locus_data$merge %in% patientMerge13[one_three,]$merge) | (scatterplot_locus_data$merge %in% patientMerge23[two_three,]$merge)
|
|
937 if(sum(in_two) > 0){
|
|
938 scatterplot_locus_data[in_two,]$type = "In two"
|
|
939 }
|
|
940 in_three = (scatterplot_locus_data$merge %in% patientMerge[all,]$merge)
|
|
941 if(sum(in_three)> 0){
|
|
942 scatterplot_locus_data[in_three,]$type = "In three"
|
|
943 }
|
|
944 not_in_one = scatterplot_locus_data$type != "In one"
|
|
945 if(sum(not_in_one) > 0){
|
|
946 #scatterplot_locus_data[not_in_one,]$type = "In multiple"
|
|
947 }
|
|
948 p = NULL
|
|
949 if(nrow(scatterplot_locus_data) != 0){
|
2
|
950 filename.scatter = paste(label1, "_", label2, "_", label3, "_", product[iter, titleIndex], "_scatter_", threshhold, sep="")
|
|
951 write.table(scatterplot_locus_data, file=paste(filename.scatter, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
952 if(on == "normalized_read_count"){
|
|
953 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
|
954 p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales, limits=c(1, 1e6))
|
|
955 } else {
|
|
956 p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_log10(limits=c(0.0001,100), breaks=c(0.0001, 0.001, 0.01, 0.1, 1, 10, 100), labels=c("0.0001", "0.001", "0.01", "0.1", "1", "10", "100")) + expand_limits(y=c(0,100))
|
|
957 #p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
|
|
958 }
|
|
959 p = p + geom_point(aes(colour=type), position="jitter")
|
|
960 p = p + xlab("In one or in multiple samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex]))
|
1
|
961 } else {
|
2
|
962 p = ggplot(NULL, aes(x=c("In one", "In multiple"),y=0)) + geom_blank(NULL) + xlab("In two or in three of the samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex]))
|
1
|
963 }
|
8
|
964 file_name = paste(label1, "_", label2, "_", label3, "_", onShort, "_", product[iter, titleIndex],"_scatter.png", sep="")
|
|
965 print(paste("Writing figure:", file_name))
|
|
966 png(file_name)
|
1
|
967 print(p)
|
|
968 dev.off()
|
|
969 }
|
|
970 if(sum(all) > 0){
|
|
971 dfAll = patientMerge[all,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y", "V_Segment_Major_Gene.z", "J_Segment_Major_Gene.z", "normalized_read_count.z", "Frequency.z", "Related_to_leukemia_clone.z")]
|
|
972 colnames(dfAll) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample), paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
|
973 filenameAll = paste(label1, "_", label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
974 write.table(dfAll, file=paste(filenameAll, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
975 }
|
|
976 }
|
|
977 #patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "tmp3"=res3, "read_count3"=round(read3Count))
|
|
978 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "tmp2"=res2, "tmp3"=res3, "tmp12"=res12, "tmp13"=res13, "tmp23"=res23)
|
|
979 colnames(patientResult)[6] = oneSample
|
|
980 colnames(patientResult)[7] = twoSample
|
|
981 colnames(patientResult)[8] = threeSample
|
|
982 colnames(patientResult)[9] = paste(oneSample, twoSample, sep="_")
|
|
983 colnames(patientResult)[10] = paste(oneSample, twoSample, sep="_")
|
|
984 colnames(patientResult)[11] = paste(oneSample, twoSample, sep="_")
|
|
985
|
|
986 colnamesBak = colnames(patientResult)
|
|
987 colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", "Number of sequences All", paste("Number of sequences", oneSample), paste("Number of sequences", twoSample), paste("Number of sequences", threeSample), paste("Number of sequences", oneSample, twoSample), paste("Number of sequences", oneSample, threeSample), paste("Number of sequences", twoSample, threeSample))
|
|
988 write.table(patientResult, file=paste(label1, "_", label2, "_", label3, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
989 colnames(patientResult) = colnamesBak
|
|
990
|
|
991 patientResult$Locus = factor(patientResult$Locus, Titles)
|
|
992 patientResult$cut_off_value = factor(patientResult$cut_off_value, paste(">", interval, sep=""))
|
|
993
|
|
994 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "All")])
|
|
995 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=All), stat='identity', position="dodge", fill="#79c36a")
|
|
996 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
997 plt = plt + geom_text(aes(ymax=max(All), x=cut_off_value,y=All,label=All), angle=90, hjust=0)
|
|
998 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in All")
|
|
999 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
8
|
1000 file_name = paste(label1, "_", label2, "_", label3, "_", onShort, "_total_all.png", sep="")
|
|
1001 print(paste("Writing figure:", file_name))
|
|
1002 png(file_name, width=1920, height=1080)
|
1
|
1003 print(plt)
|
|
1004 dev.off()
|
|
1005
|
|
1006 fontSize = 4
|
|
1007
|
|
1008 bak = patientResult
|
|
1009 patientResult = melt(patientResult[,c('Locus','cut_off_value', oneSample, twoSample, threeSample)] ,id.vars=1:2)
|
|
1010 patientResult$relativeValue = patientResult$value * 10
|
|
1011 patientResult[patientResult$relativeValue == 0,]$relativeValue = 1
|
|
1012 plt = ggplot(patientResult)
|
|
1013 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=relativeValue, fill=variable), stat='identity', position="dodge")
|
|
1014 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
1015 plt = plt + scale_y_continuous(trans="log", breaks=10^c(0:10), labels=c(0, 10^c(0:9)))
|
|
1016 plt = plt + geom_text(data=patientResult[patientResult$variable == oneSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=-0.7, size=fontSize)
|
|
1017 plt = plt + geom_text(data=patientResult[patientResult$variable == twoSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=0.4, size=fontSize)
|
|
1018 plt = plt + geom_text(data=patientResult[patientResult$variable == threeSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=1.5, size=fontSize)
|
|
1019 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in only one sample")
|
8
|
1020 file_name = paste(label1, "_", label2, "_", label3, "_", onShort, "_indiv_all.png", sep="")
|
|
1021 print(paste("Writing figure:", file_name))
|
|
1022 png(file_name, width=1920, height=1080)
|
1
|
1023 print(plt)
|
|
1024 dev.off()
|
0
|
1025 }
|
|
1026
|
|
1027 if(nrow(triplets) != 0){
|
|
1028
|
1
|
1029 cat("<tr><td>Starting triplet analysis</td></tr>", file=logfile, append=T)
|
|
1030
|
|
1031 triplets$uniqueID = paste(triplets$Patient, triplets$Sample, sep="_")
|
|
1032
|
|
1033 cat("<tr><td>Normalizing to lowest cell count within locus</td></tr>", file=logfile, append=T)
|
0
|
1034
|
1
|
1035 triplets$locus_V = substring(triplets$V_Segment_Major_Gene, 0, 4)
|
|
1036 triplets$locus_J = substring(triplets$J_Segment_Major_Gene, 0, 4)
|
|
1037 min_cell_count = data.frame(data.table(triplets)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("uniqueID", "locus_V", "locus_J")])
|
0
|
1038
|
1
|
1039 triplets$min_cell_paste = paste(triplets$uniqueID, triplets$locus_V, triplets$locus_J)
|
|
1040 min_cell_count$min_cell_paste = paste(min_cell_count$uniqueID, min_cell_count$locus_V, min_cell_count$locus_J)
|
0
|
1041
|
1
|
1042 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
|
|
1043
|
|
1044 triplets = merge(triplets, min_cell_count, by="min_cell_paste")
|
|
1045
|
|
1046 triplets$normalized_read_count = round(triplets$Clone_Molecule_Count_From_Spikes / triplets$Cell_Count * triplets$min_cell_count / 2, digits=2)
|
0
|
1047
|
1
|
1048 triplets = triplets[triplets$normalized_read_count >= min_cells,]
|
|
1049
|
|
1050 column_drops = c("min_cell_count", "min_cell_paste")
|
|
1051
|
|
1052 triplets = triplets[,!(colnames(triplets) %in% column_drops)]
|
|
1053
|
|
1054 cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T)
|
0
|
1055
|
1
|
1056 interval = intervalReads
|
|
1057 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
1058 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
0
|
1059
|
1
|
1060 triplets = split(triplets, triplets$Patient, drop=T)
|
5
|
1061 #print(nrow(triplets))
|
1
|
1062 for(triplet in triplets){
|
|
1063 samples = unique(triplet$Sample)
|
|
1064 one = triplet[triplet$Sample == samples[1],]
|
|
1065 two = triplet[triplet$Sample == samples[2],]
|
|
1066 three = triplet[triplet$Sample == samples[3],]
|
|
1067
|
|
1068 print(paste(nrow(triplet), nrow(one), nrow(two), nrow(three)))
|
|
1069 tripletAnalysis(one, one[1,"uniqueID"], two, two[1,"uniqueID"], three, three[1,"uniqueID"], product=product, interval=interval, on="normalized_read_count", T)
|
|
1070 }
|
|
1071
|
|
1072 cat("<tr><td>Starting Frequency analysis</td></tr>", file=logfile, append=T)
|
|
1073
|
|
1074 interval = intervalFreq
|
|
1075 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
1076 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
|
1077
|
|
1078 for(triplet in triplets){
|
|
1079 samples = unique(triplet$Sample)
|
|
1080 one = triplet[triplet$Sample == samples[1],]
|
|
1081 two = triplet[triplet$Sample == samples[2],]
|
|
1082 three = triplet[triplet$Sample == samples[3],]
|
|
1083 tripletAnalysis(one, one[1,"uniqueID"], two, two[1,"uniqueID"], three, three[1,"uniqueID"], product=product, interval=interval, on="Frequency", F)
|
|
1084 }
|
0
|
1085 } else {
|
|
1086 cat("", file="triplets.txt")
|
|
1087 }
|
|
1088 cat("</table></html>", file=logfile, append=T)
|