0
|
1 args <- commandArgs(trailingOnly = TRUE)
|
|
2 options(scipen=999)
|
|
3
|
|
4 inFile = args[1]
|
|
5 outDir = args[2]
|
|
6 logfile = args[3]
|
|
7 min_freq = as.numeric(args[4])
|
|
8 min_cells = as.numeric(args[5])
|
|
9 mergeOn = args[6]
|
|
10
|
|
11 cat("<html><table><tr><td>Starting analysis</td></tr>", file=logfile, append=F)
|
|
12
|
|
13 library(ggplot2)
|
|
14 library(reshape2)
|
|
15 library(data.table)
|
|
16 library(grid)
|
|
17 library(parallel)
|
|
18 #require(xtable)
|
|
19 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T)
|
|
20 dat = read.table(inFile, header=T, sep="\t", dec=".", fill=T, stringsAsFactors=F)
|
2
|
21
|
|
22 needed_cols = c("Patient", "Receptor", "Sample", "Cell_Count", "Clone_Molecule_Count_From_Spikes", "Log10_Frequency", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence", "Clone_Sequence")
|
|
23 if(!all(needed_cols %in% names(dat))){
|
|
24 cat("Missing column(s):<br />", file=logfile, append=F)
|
|
25 missing_cols = needed_cols[!(needed_cols %in% names(dat))]
|
|
26 for(col in missing_cols){
|
|
27 cat(paste(col, "<br />"), file=logfile, append=T)
|
|
28 }
|
|
29 stop("Not all columns are present")
|
|
30 }
|
|
31
|
|
32 if(!("Total_Read_Count" %in% names(dat))){
|
|
33 dat$Total_Read_Count = 0
|
|
34 }
|
|
35
|
|
36 dat = dat[,c("Patient", "Receptor", "Sample", "Cell_Count", "Clone_Molecule_Count_From_Spikes", "Log10_Frequency", "Total_Read_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence", "Clone_Sequence")]
|
7
|
37 dat = dat[!(nchar(dat$Clone_Sequence) < 2),]
|
2
|
38
|
0
|
39 dat$dsPerM = 0
|
|
40 dat = dat[!is.na(dat$Patient),]
|
|
41 dat$Related_to_leukemia_clone = F
|
|
42
|
|
43 setwd(outDir)
|
|
44 cat("<tr><td>Selecting first V/J Genes</td></tr>", file=logfile, append=T)
|
|
45 dat$V_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$V_Segment_Major_Gene), "; "), "[[", 1)))
|
|
46 dat$J_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$J_Segment_Major_Gene), "; "), "[[", 1)))
|
|
47
|
|
48 cat("<tr><td>Calculating Frequency</td></tr>", file=logfile, append=T)
|
|
49
|
|
50 dat$Frequency = ((10^dat$Log10_Frequency)*100)
|
5
|
51 dat = dat[dat$Frequency <= 100,] #just in case?
|
0
|
52
|
|
53 dat = dat[dat$Frequency >= min_freq,]
|
|
54
|
1
|
55 patient.sample.counts = data.frame(data.table(dat)[, list(count=.N), by=c("Patient", "Sample")])
|
|
56 patient.sample.counts = data.frame(data.table(patient.sample.counts)[, list(count=.N), by=c("Patient")])
|
|
57
|
|
58 print("Found the following patients with number of samples:")
|
|
59 print(patient.sample.counts)
|
|
60
|
|
61 patient.sample.counts.pairs = patient.sample.counts[patient.sample.counts$count %in% 1:2,]
|
|
62 patient.sample.counts.triplets = patient.sample.counts[patient.sample.counts$count == 3,]
|
|
63
|
|
64
|
|
65
|
|
66 triplets = dat[dat$Patient %in% patient.sample.counts.triplets$Patient,]
|
|
67 dat = dat[dat$Patient %in% patient.sample.counts.pairs$Patient,]
|
0
|
68
|
|
69 cat("<tr><td>Normalizing to lowest cell count within locus</td></tr>", file=logfile, append=T)
|
|
70
|
|
71 dat$locus_V = substring(dat$V_Segment_Major_Gene, 0, 4)
|
|
72 dat$locus_J = substring(dat$J_Segment_Major_Gene, 0, 4)
|
|
73 min_cell_count = data.frame(data.table(dat)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("Patient", "locus_V", "locus_J")])
|
|
74
|
|
75 dat$min_cell_paste = paste(dat$Patient, dat$locus_V, dat$locus_J)
|
|
76 min_cell_count$min_cell_paste = paste(min_cell_count$Patient, min_cell_count$locus_V, min_cell_count$locus_J)
|
|
77
|
|
78 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
|
|
79 print(paste("rows:", nrow(dat)))
|
|
80 dat = merge(dat, min_cell_count, by="min_cell_paste")
|
|
81 print(paste("rows:", nrow(dat)))
|
|
82 dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * dat$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2
|
|
83
|
|
84 dat = dat[dat$normalized_read_count >= min_cells,]
|
|
85
|
|
86 dat$paste = paste(dat$Sample, dat$Clone_Sequence)
|
|
87
|
|
88 patients = split(dat, dat$Patient, drop=T)
|
|
89 intervalReads = rev(c(0,10,25,50,100,250,500,750,1000,10000))
|
|
90 intervalFreq = rev(c(0,0.01,0.05,0.1,0.5,1,5))
|
|
91 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV")
|
|
92 J_Segments = c(".*", ".*", ".*", "IGKJ", "KDE", ".*", ".*", ".*", ".*", ".*")
|
|
93 Titles = c("Total", "IGH-Vh-Jh", "IGH-Dh-Jh", "Vk-Jk", "Vk-Kde" , "Intron-Kde", "TCRG", "TCRD-Vd-Dd", "TCRD-Dd-Dd", "TCRB-Vb-Jb")
|
|
94 Titles = factor(Titles, levels=Titles)
|
|
95 TitlesOrder = data.frame("Title"=Titles, "TitlesOrder"=1:length(Titles))
|
|
96
|
2
|
97 single_patients = dat[NULL,]
|
0
|
98
|
|
99 patient.merge.list = list() #cache the 'both' table, 2x speedup for more memory...
|
|
100 patient.merge.list.second = list()
|
|
101 scatter_locus_data_list = list()
|
|
102 cat(paste("<table border='0' style='font-family:courier;'>", sep=""), file="multiple_matches.html", append=T)
|
|
103 cat(paste("<table border='0' style='font-family:courier;'>", sep=""), file="single_matches.html", append=T)
|
|
104 patientCountOnColumn <- function(x, product, interval, on, appendtxt=F){
|
|
105 if (!is.data.frame(x) & is.list(x)){
|
|
106 x = x[[1]]
|
|
107 }
|
|
108 #x$Sample = factor(x$Sample, levels=unique(x$Sample))
|
|
109 x = data.frame(x,stringsAsFactors=F)
|
|
110 onShort = "reads"
|
|
111 if(on == "Frequency"){
|
|
112 onShort = "freq"
|
|
113 }
|
|
114 onx = paste(on, ".x", sep="")
|
|
115 ony = paste(on, ".y", sep="")
|
|
116 splt = split(x, x$Sample, drop=T)
|
|
117 type="pair"
|
|
118 if(length(splt) == 1){
|
|
119 print(paste(paste(x[1,which(colnames(x) == "Patient")]), "has one sample"))
|
2
|
120 splt[[2]] = splt[[1]][NULL,]
|
0
|
121 type="single"
|
|
122 }
|
|
123 patient1 = splt[[1]]
|
|
124 patient2 = splt[[2]]
|
|
125
|
2
|
126 oneSample = patient1[1,"Sample"]
|
|
127 twoSample = patient2[1,"Sample"]
|
|
128 patient = x[1,"Patient"]
|
|
129
|
0
|
130 switched = F
|
|
131 if(length(grep(".*_Right$", twoSample)) == 1 || length(grep(".*_Dx_BM$", twoSample)) == 1 || length(grep(".*_Dx$", twoSample)) == 1 ){
|
|
132 tmp = twoSample
|
|
133 twoSample = oneSample
|
|
134 oneSample = tmp
|
|
135 tmp = patient1
|
|
136 patient1 = patient2
|
|
137 patient2 = tmp
|
|
138 switched = T
|
|
139 }
|
|
140 if(appendtxt){
|
|
141 cat(paste(patient, oneSample, twoSample, type, sep="\t"), file="patients.txt", append=T, sep="", fill=3)
|
|
142 }
|
|
143 cat(paste("<tr><td>", patient, "</td>", sep=""), file=logfile, append=T)
|
|
144
|
|
145 if(mergeOn == "Clone_Sequence"){
|
|
146 patient1$merge = paste(patient1$Clone_Sequence)
|
|
147 patient2$merge = paste(patient2$Clone_Sequence)
|
|
148 } else {
|
|
149 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
|
|
150 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
|
|
151 }
|
|
152
|
|
153 scatterplot_data_columns = c("Patient", "Sample", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge")
|
|
154 scatterplot_data = patient1[NULL,scatterplot_data_columns]
|
|
155 scatterplot.data.type.factor = c(oneSample, twoSample, paste(c(oneSample, twoSample), "In Both"))
|
|
156 scatterplot_data$type = character(0)
|
|
157 scatterplot_data$link = numeric(0)
|
|
158 scatterplot_data$on = character(0)
|
|
159
|
|
160 patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge")[NULL,] #blegh
|
2
|
161
|
0
|
162 cs.exact.matches = patient1[patient1$Clone_Sequence %in% patient2$Clone_Sequence,]$Clone_Sequence
|
2
|
163
|
0
|
164 start.time = proc.time()
|
|
165 merge.list = c()
|
2
|
166
|
0
|
167 if(patient %in% names(patient.merge.list)){
|
|
168 patientMerge = patient.merge.list[[patient]]
|
|
169 merge.list[["second"]] = patient.merge.list.second[[patient]]
|
|
170 scatterplot_data = scatter_locus_data_list[[patient]]
|
|
171 cat(paste("<td>", nrow(patient1), " in ", oneSample, " and ", nrow(patient2), " in ", twoSample, ", ", nrow(patientMerge), " in both (fetched from cache)</td></tr>", sep=""), file=logfile, append=T)
|
2
|
172
|
5
|
173 #print(names(patient.merge.list))
|
0
|
174 } else {
|
|
175 #fuzzy matching here...
|
2
|
176
|
0
|
177 patient1.fuzzy = patient1
|
|
178 patient2.fuzzy = patient2
|
2
|
179
|
0
|
180 patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J)
|
|
181 patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J)
|
2
|
182
|
0
|
183 patient.fuzzy = rbind(patient1.fuzzy, patient2.fuzzy)
|
|
184 patient.fuzzy = patient.fuzzy[order(nchar(patient.fuzzy$Clone_Sequence)),]
|
2
|
185
|
0
|
186 merge.list = list()
|
2
|
187
|
0
|
188 merge.list[["second"]] = vector()
|
2
|
189
|
|
190 link.count = 1
|
|
191
|
0
|
192 while(nrow(patient.fuzzy) > 1){
|
|
193 first.merge = patient.fuzzy[1,"merge"]
|
|
194 first.clone.sequence = patient.fuzzy[1,"Clone_Sequence"]
|
|
195 first.sample = patient.fuzzy[1,"Sample"]
|
|
196 merge.filter = first.merge == patient.fuzzy$merge
|
2
|
197
|
0
|
198 #length.filter = nchar(patient.fuzzy$Clone_Sequence) - nchar(first.clone.sequence) <= 9
|
2
|
199
|
0
|
200 first.sample.filter = first.sample == patient.fuzzy$Sample
|
|
201 second.sample.filter = first.sample != patient.fuzzy$Sample
|
2
|
202
|
0
|
203 #first match same sample, sum to a single row, same for other sample
|
|
204 #then merge rows like 'normal'
|
2
|
205
|
0
|
206 sequence.filter = grepl(paste("^", first.clone.sequence, sep=""), patient.fuzzy$Clone_Sequence)
|
2
|
207
|
|
208
|
|
209
|
0
|
210 #match.filter = merge.filter & grepl(first.clone.sequence, patient.fuzzy$Clone_Sequence) & length.filter & sample.filter
|
|
211 first.match.filter = merge.filter & sequence.filter & first.sample.filter
|
|
212 second.match.filter = merge.filter & sequence.filter & second.sample.filter
|
2
|
213
|
0
|
214 first.rows = patient.fuzzy[first.match.filter,]
|
|
215 second.rows = patient.fuzzy[second.match.filter,]
|
2
|
216
|
0
|
217 first.rows.v = table(first.rows$V_Segment_Major_Gene)
|
|
218 first.rows.v = names(first.rows.v[which.max(first.rows.v)])
|
|
219 first.rows.j = table(first.rows$J_Segment_Major_Gene)
|
|
220 first.rows.j = names(first.rows.j[which.max(first.rows.j)])
|
2
|
221
|
0
|
222 first.sum = data.frame(merge = first.clone.sequence,
|
|
223 Patient = patient,
|
|
224 Receptor = first.rows[1,"Receptor"],
|
|
225 Sample = first.rows[1,"Sample"],
|
|
226 Cell_Count = first.rows[1,"Cell_Count"],
|
|
227 Clone_Molecule_Count_From_Spikes = sum(first.rows$Clone_Molecule_Count_From_Spikes),
|
|
228 Log10_Frequency = log10(sum(first.rows$Frequency)),
|
|
229 Total_Read_Count = sum(first.rows$Total_Read_Count),
|
|
230 dsPerM = sum(first.rows$dsPerM),
|
|
231 J_Segment_Major_Gene = first.rows.j,
|
|
232 V_Segment_Major_Gene = first.rows.v,
|
|
233 Clone_Sequence = first.clone.sequence,
|
|
234 CDR3_Sense_Sequence = first.rows[1,"CDR3_Sense_Sequence"],
|
|
235 Related_to_leukemia_clone = F,
|
|
236 Frequency = sum(first.rows$Frequency),
|
|
237 locus_V = first.rows[1,"locus_V"],
|
|
238 locus_J = first.rows[1,"locus_J"],
|
|
239 min_cell_count = first.rows[1,"min_cell_count"],
|
|
240 normalized_read_count = sum(first.rows$normalized_read_count),
|
|
241 paste = first.rows[1,"paste"],
|
|
242 min_cell_paste = first.rows[1,"min_cell_paste"])
|
2
|
243
|
0
|
244 if(nrow(second.rows) > 0){
|
|
245 second.rows.v = table(second.rows$V_Segment_Major_Gene)
|
|
246 second.rows.v = names(second.rows.v[which.max(second.rows.v)])
|
|
247 second.rows.j = table(second.rows$J_Segment_Major_Gene)
|
|
248 second.rows.j = names(second.rows.j[which.max(second.rows.j)])
|
2
|
249
|
0
|
250 second.sum = data.frame(merge = first.clone.sequence,
|
2
|
251 Patient = patient,
|
|
252 Receptor = second.rows[1,"Receptor"],
|
|
253 Sample = second.rows[1,"Sample"],
|
|
254 Cell_Count = second.rows[1,"Cell_Count"],
|
|
255 Clone_Molecule_Count_From_Spikes = sum(second.rows$Clone_Molecule_Count_From_Spikes),
|
|
256 Log10_Frequency = log10(sum(second.rows$Frequency)),
|
|
257 Total_Read_Count = sum(second.rows$Total_Read_Count),
|
|
258 dsPerM = sum(second.rows$dsPerM),
|
|
259 J_Segment_Major_Gene = second.rows.j,
|
|
260 V_Segment_Major_Gene = second.rows.v,
|
|
261 Clone_Sequence = first.clone.sequence,
|
|
262 CDR3_Sense_Sequence = second.rows[1,"CDR3_Sense_Sequence"],
|
|
263 Related_to_leukemia_clone = F,
|
|
264 Frequency = sum(second.rows$Frequency),
|
|
265 locus_V = second.rows[1,"locus_V"],
|
|
266 locus_J = second.rows[1,"locus_J"],
|
|
267 min_cell_count = second.rows[1,"min_cell_count"],
|
|
268 normalized_read_count = sum(second.rows$normalized_read_count),
|
|
269 paste = second.rows[1,"paste"],
|
|
270 min_cell_paste = second.rows[1,"min_cell_paste"])
|
|
271
|
|
272 #print(names(patientMerge))
|
|
273 #print(merge(first.sum, second.sum, by="merge"))
|
0
|
274 patientMerge = rbind(patientMerge, merge(first.sum, second.sum, by="merge"))
|
2
|
275 #print("test2")
|
0
|
276 patient.fuzzy = patient.fuzzy[!(first.match.filter | second.match.filter),]
|
2
|
277
|
0
|
278 hidden.clone.sequences = c(first.rows[-1,"Clone_Sequence"], second.rows[second.rows$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
|
|
279 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
2
|
280
|
0
|
281 tmp.rows = rbind(first.rows, second.rows)
|
2
|
282 #print("test3")
|
0
|
283 tmp.rows = tmp.rows[order(nchar(tmp.rows$Clone_Sequence)),]
|
|
284
|
|
285
|
|
286 #add to the scatterplot data
|
|
287 scatterplot.row = first.sum[,scatterplot_data_columns]
|
2
|
288 scatterplot.row$type = paste(first.sum[,"Sample"], "In Both")
|
|
289 scatterplot.row$link = link.count
|
|
290 scatterplot.row$on = onShort
|
|
291
|
|
292 scatterplot_data = rbind(scatterplot_data, scatterplot.row)
|
0
|
293
|
|
294 scatterplot.row = second.sum[,scatterplot_data_columns]
|
2
|
295 scatterplot.row$type = paste(second.sum[,"Sample"], "In Both")
|
|
296 scatterplot.row$link = link.count
|
|
297 scatterplot.row$on = onShort
|
|
298
|
|
299 scatterplot_data = rbind(scatterplot_data, scatterplot.row)
|
|
300
|
|
301 #write some information about the match to a log file
|
0
|
302 if (nrow(first.rows) > 1 | nrow(second.rows) > 1) {
|
|
303 cat(paste("<tr><td>", patient, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="multiple_matches.html", append=T)
|
|
304 } else {
|
|
305 second.clone.sequence = second.rows[1,"Clone_Sequence"]
|
|
306 if(nchar(first.clone.sequence) != nchar(second.clone.sequence)){
|
|
307 cat(paste("<tr bgcolor='#DDD'><td>", patient, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="single_matches.html", append=T)
|
|
308 } else {
|
|
309 #cat(paste("<tr><td>", patient, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="single_matches.html", append=T)
|
|
310 }
|
|
311 }
|
2
|
312
|
0
|
313 } else if(nrow(first.rows) > 1) {
|
|
314 if(patient1[1,"Sample"] == first.sample){
|
|
315 patient1 = patient1[!(patient1$Clone_Sequence %in% first.rows$Clone_Sequence),]
|
|
316 patient1 = rbind(patient1, first.sum)
|
|
317 } else {
|
|
318 patient2 = patient2[!(patient2$Clone_Sequence %in% first.rows$Clone_Sequence),]
|
|
319 patient2 = rbind(patient2, first.sum)
|
|
320 }
|
2
|
321
|
0
|
322 hidden.clone.sequences = c(first.rows[-1,"Clone_Sequence"])
|
|
323 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
2
|
324
|
0
|
325 patient.fuzzy = patient.fuzzy[-first.match.filter,]
|
|
326
|
|
327 #add to the scatterplot data
|
|
328 scatterplot.row = first.sum[,scatterplot_data_columns]
|
2
|
329 scatterplot.row$type = first.sum[,"Sample"]
|
|
330 scatterplot.row$link = link.count
|
|
331 scatterplot.row$on = onShort
|
|
332
|
|
333 scatterplot_data = rbind(scatterplot_data, scatterplot.row)
|
|
334
|
0
|
335 cat(paste("<tr bgcolor='#DDF'><td>", patient, " row ", 1:nrow(first.rows), "</td><td>", first.rows$Sample, ":</td><td>", first.rows$Clone_Sequence, "</td><td>", first.rows$normalized_read_count, "</td></tr>", sep=""), file="single_matches.html", append=T)
|
|
336 } else {
|
|
337 patient.fuzzy = patient.fuzzy[-1,]
|
|
338
|
|
339 #add to the scatterplot data
|
|
340 scatterplot.row = first.sum[,scatterplot_data_columns]
|
2
|
341 scatterplot.row$type = first.sum[,"Sample"]
|
|
342 scatterplot.row$link = link.count
|
|
343 scatterplot.row$on = onShort
|
|
344
|
|
345 scatterplot_data = rbind(scatterplot_data, scatterplot.row)
|
0
|
346 }
|
|
347 link.count = link.count + 1
|
|
348 }
|
|
349 patient.merge.list[[patient]] <<- patientMerge
|
|
350 patient.merge.list.second[[patient]] <<- merge.list[["second"]]
|
|
351
|
|
352 sample.order = data.frame(type = c(oneSample, twoSample, paste(c(oneSample, twoSample), "In Both")),type.order = 1:4)
|
|
353 scatterplot_data = merge(scatterplot_data, sample.order, by="type")
|
|
354
|
|
355 scatter_locus_data_list[[patient]] <<- scatterplot_data
|
|
356 cat(paste("<td>", nrow(patient1), " in ", oneSample, " and ", nrow(patient2), " in ", twoSample, ", ", nrow(patientMerge), " in both (finding both took ", (proc.time() - start.time)[[3]], "s)</td></tr>", sep=""), file=logfile, append=T)
|
|
357 }
|
2
|
358
|
0
|
359 patient1 = patient1[!(patient1$Clone_Sequence %in% patient.merge.list.second[[patient]]),]
|
|
360 patient2 = patient2[!(patient2$Clone_Sequence %in% patient.merge.list.second[[patient]]),]
|
2
|
361
|
0
|
362
|
|
363 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony])
|
|
364 #patientMerge$thresholdValue = pmin(patientMerge[,onx], patientMerge[,ony])
|
|
365 res1 = vector()
|
|
366 res2 = vector()
|
|
367 resBoth = vector()
|
|
368 read1Count = vector()
|
|
369 read2Count = vector()
|
|
370 locussum1 = vector()
|
|
371 locussum2 = vector()
|
|
372
|
|
373 #for(iter in 1){
|
|
374 for(iter in 1:length(product[,1])){
|
2
|
375 threshhold = product[iter,"interval"]
|
|
376 V_Segment = paste(".*", as.character(product[iter,"V_Segments"]), ".*", sep="")
|
|
377 J_Segment = paste(".*", as.character(product[iter,"J_Segments"]), ".*", sep="")
|
0
|
378 #both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,onx] > threshhold & patientMerge[,ony] > threshhold) #both higher than threshold
|
|
379 both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold) #highest of both is higher than threshold
|
|
380 one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$merge %in% patientMerge[both,]$merge))
|
|
381 two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$merge %in% patientMerge[both,]$merge))
|
|
382 read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count))
|
|
383 read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count))
|
|
384 res1 = append(res1, sum(one))
|
|
385 res2 = append(res2, sum(two))
|
|
386 resBoth = append(resBoth, sum(both))
|
|
387 locussum1 = append(locussum1, sum(patient1[(grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene)),]$normalized_read_count))
|
|
388 locussum2 = append(locussum2, sum(patient2[(grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene)),]$normalized_read_count))
|
|
389 #threshhold = 0
|
2
|
390 if(threshhold != 0 | T){
|
0
|
391 if(sum(one) > 0){
|
|
392 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
393 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone Sequence", "Related_to_leukemia_clone")
|
2
|
394 filenameOne = paste(oneSample, "_", product[iter, "Titles"], "_", threshhold, sep="")
|
0
|
395 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
396 }
|
|
397 if(sum(two) > 0){
|
|
398 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
399 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone Sequence", "Related_to_leukemia_clone")
|
2
|
400 filenameTwo = paste(twoSample, "_", product[iter, "Titles"], "_", threshhold, sep="")
|
0
|
401 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
402 }
|
6
|
403 }
|
|
404 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
|
|
405 if(nrow(scatterplot_locus_data) > 0){
|
|
406 scatterplot_locus_data$Rearrangement = product[iter, "Titles"]
|
|
407 }
|
|
408 p = NULL
|
|
409 #print(paste("nrow scatterplot_locus_data", nrow(scatterplot_locus_data)))
|
|
410 if(nrow(scatterplot_locus_data) != 0){
|
|
411 if(on == "normalized_read_count"){
|
|
412 write.table(scatterplot_locus_data, file=paste(oneSample, twoSample, product[iter, "Titles"], "scatterplot_locus_data.txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
413 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
|
414 p = ggplot(scatterplot_locus_data, aes(factor(reorder(type, type.order)), normalized_read_count, group=link)) + geom_line() + scale_y_log10(breaks=scales,labels=scales, limits=c(1,1e6)) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE)
|
|
415 } else {
|
|
416 p = ggplot(scatterplot_locus_data, aes(factor(reorder(type, type.order)), Frequency, group=link)) + geom_line() + scale_y_log10(limits=c(0.0001,100), breaks=c(0.0001, 0.001, 0.01, 0.1, 1, 10, 100), labels=c("0.0001", "0.001", "0.01", "0.1", "1", "10", "100")) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE)
|
0
|
417 }
|
6
|
418 p = p + geom_point(aes(colour=type), position="dodge")
|
|
419 p = p + xlab("In one or both samples") + ylab(onShort) + ggtitle(paste(patient1[1,"Patient"], patient1[1,"Sample"], patient2[1,"Sample"], onShort, product[iter, "Titles"]))
|
|
420 } else {
|
|
421 p = ggplot(NULL, aes(x=c("In one", "In Both"),y=0)) + geom_blank(NULL) + xlab("In one or both of the samples") + ylab(onShort) + ggtitle(paste(patient1[1,"Patient"], patient1[1,"Sample"], patient2[1,"Sample"], onShort, product[iter, "Titles"]))
|
0
|
422 }
|
6
|
423 png(paste(patient1[1,"Patient"], "_", patient1[1,"Sample"], "_", patient2[1,"Sample"], "_", onShort, "_", product[iter, "Titles"],"_scatter.png", sep=""))
|
|
424 print(p)
|
|
425 dev.off()
|
0
|
426 if(sum(both) > 0){
|
|
427 dfBoth = patientMerge[both,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
428 colnames(dfBoth) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample))
|
2
|
429 filenameBoth = paste(oneSample, "_", twoSample, "_", product[iter, "Titles"], "_", threshhold, sep="")
|
0
|
430 write.table(dfBoth, file=paste(filenameBoth, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
431 }
|
|
432 }
|
|
433 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "Both"=resBoth, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "Sum"=res1 + res2 + resBoth, "percentage" = round((resBoth/(res1 + res2 + resBoth)) * 100, digits=2), "Locus_sum1"=locussum1, "Locus_sum2"=locussum2)
|
|
434 if(sum(is.na(patientResult$percentage)) > 0){
|
|
435 patientResult[is.na(patientResult$percentage),]$percentage = 0
|
|
436 }
|
|
437 colnames(patientResult)[6] = oneSample
|
|
438 colnames(patientResult)[8] = twoSample
|
|
439 colnamesBak = colnames(patientResult)
|
|
440 colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", paste("Number of sequences ", patient, "_Both", sep=""), paste("Number of sequences", oneSample, sep=""), paste("Normalized Read Count", oneSample), paste("Number of sequences", twoSample, sep=""), paste("Normalized Read Count", twoSample), paste("Sum number of sequences", patient), paste("Percentage of sequences ", patient, "_Both", sep=""), paste("Locus Sum", oneSample), paste("Locus Sum", twoSample))
|
|
441 write.table(patientResult, file=paste(patient, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
442 colnames(patientResult) = colnamesBak
|
|
443
|
|
444 patientResult$Locus = factor(patientResult$Locus, Titles)
|
|
445 patientResult$cut_off_value = factor(patientResult$cut_off_value, paste(">", interval, sep=""))
|
|
446
|
|
447 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "Both")])
|
|
448 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=Both), stat='identity', position="dodge", fill="#79c36a")
|
|
449 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
450 plt = plt + geom_text(aes(ymax=max(Both), x=cut_off_value,y=Both,label=Both), angle=90, hjust=0)
|
|
451 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in both")
|
|
452 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
453 png(paste(patient, "_", onShort, ".png", sep=""), width=1920, height=1080)
|
|
454 print(plt)
|
|
455 dev.off()
|
|
456 #(t,r,b,l)
|
|
457 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "percentage")])
|
|
458 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=percentage), stat='identity', position="dodge", fill="#79c36a")
|
|
459 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
460 plt = plt + geom_text(aes(ymax=max(percentage), x=cut_off_value,y=percentage,label=percentage), angle=90, hjust=0)
|
|
461 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("% clones in both left and right")
|
|
462 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
463 png(paste(patient, "_percent_", onShort, ".png", sep=""), width=1920, height=1080)
|
|
464 print(plt)
|
|
465 dev.off()
|
|
466
|
|
467 patientResult = melt(patientResult[,c('Locus','cut_off_value', oneSample, twoSample)] ,id.vars=1:2)
|
|
468 patientResult$relativeValue = patientResult$value * 10
|
|
469 patientResult[patientResult$relativeValue == 0,]$relativeValue = 1
|
|
470 plt = ggplot(patientResult)
|
|
471 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=relativeValue, fill=variable), stat='identity', position="dodge")
|
|
472 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
473 plt = plt + scale_y_continuous(trans="log", breaks=10^c(0:10), labels=c(0, 10^c(0:9)))
|
|
474 plt = plt + geom_text(data=patientResult[patientResult$variable == oneSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=-0.2)
|
|
475 plt = plt + geom_text(data=patientResult[patientResult$variable == twoSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=0.8)
|
|
476 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle(paste("Number of clones in only ", oneSample, " and only ", twoSample, sep=""))
|
|
477 png(paste(patient, "_", onShort, "_both.png", sep=""), width=1920, height=1080)
|
|
478 print(plt)
|
|
479 dev.off()
|
|
480 }
|
2
|
481
|
1
|
482 if(length(patients) > 0){
|
|
483 cat("<tr><td>Starting Frequency analysis</td></tr>", file=logfile, append=T)
|
0
|
484
|
1
|
485 interval = intervalFreq
|
|
486 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
487 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
2
|
488 for (current_patient in patients){
|
|
489 print(paste("Started working", unique(current_patient$Patient), "Frequency analysis"))
|
|
490 patientCountOnColumn(current_patient, product=product, interval=interval, on="Frequency", appendtxt=T)
|
|
491 }
|
0
|
492
|
1
|
493 cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T)
|
0
|
494
|
1
|
495 interval = intervalReads
|
|
496 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
497 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
2
|
498 for (current_patient in patients){
|
|
499 print(paste("Started working on ", unique(current_patient$Patient), "Read Count analysis"))
|
|
500 patientCountOnColumn(current_patient, product=product, interval=interval, on="normalized_read_count")
|
|
501 }
|
1
|
502 }
|
0
|
503 if(nrow(single_patients) > 0){
|
|
504 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
|
505 p = ggplot(single_patients, aes(Rearrangement, normalized_read_count)) + scale_y_log10(breaks=scales,labels=as.character(scales)) + expand_limits(y=c(0,1000000))
|
|
506 p = p + geom_point(aes(colour=type), position="jitter")
|
|
507 p = p + xlab("In one or both samples") + ylab("Reads")
|
|
508 p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the reads of the patients with a single sample")
|
|
509 png("singles_reads_scatterplot.png", width=640 * length(unique(single_patients$Patient)) + 100, height=1080)
|
|
510 print(p)
|
|
511 dev.off()
|
|
512
|
|
513 #p = ggplot(single_patients, aes(Rearrangement, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
|
|
514 p = ggplot(single_patients, aes(Rearrangement, Frequency)) + scale_y_log10(limits=c(0.0001,100), breaks=c(0.0001, 0.001, 0.01, 0.1, 1, 10, 100), labels=c("0.0001", "0.001", "0.01", "0.1", "1", "10", "100")) + expand_limits(y=c(0,100))
|
|
515 p = p + geom_point(aes(colour=type), position="jitter")
|
|
516 p = p + xlab("In one or both samples") + ylab("Frequency")
|
|
517 p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the frequency of the patients with a single sample")
|
|
518 png("singles_freq_scatterplot.png", width=640 * length(unique(single_patients$Patient)) + 100, height=1080)
|
|
519 print(p)
|
|
520 dev.off()
|
|
521 } else {
|
|
522 empty <- data.frame()
|
|
523 p = ggplot(empty) + geom_point() + xlim(0, 10) + ylim(0, 100) + xlab("In one or both samples") + ylab("Frequency") + ggtitle("Scatterplot of the frequency of the patients with a single sample")
|
|
524
|
|
525 png("singles_reads_scatterplot.png", width=400, height=300)
|
|
526 print(p)
|
|
527 dev.off()
|
|
528
|
|
529 png("singles_freq_scatterplot.png", width=400, height=300)
|
|
530 print(p)
|
|
531 dev.off()
|
|
532 }
|
|
533
|
|
534 patient.merge.list = list() #cache the 'both' table, 2x speedup for more memory...
|
|
535 patient.merge.list.second = list()
|
|
536
|
|
537 tripletAnalysis <- function(patient1, label1, patient2, label2, patient3, label3, product, interval, on, appendTriplets= FALSE){
|
1
|
538 onShort = "reads"
|
|
539 if(on == "Frequency"){
|
|
540 onShort = "freq"
|
|
541 }
|
|
542 onx = paste(on, ".x", sep="")
|
|
543 ony = paste(on, ".y", sep="")
|
|
544 onz = paste(on, ".z", sep="")
|
|
545 type="triplet"
|
0
|
546
|
1
|
547 threshholdIndex = which(colnames(product) == "interval")
|
|
548 V_SegmentIndex = which(colnames(product) == "V_Segments")
|
|
549 J_SegmentIndex = which(colnames(product) == "J_Segments")
|
|
550 titleIndex = which(colnames(product) == "Titles")
|
|
551 sampleIndex = which(colnames(patient1) == "Sample")
|
|
552 patientIndex = which(colnames(patient1) == "Patient")
|
|
553 oneSample = paste(patient1[1,sampleIndex], sep="")
|
|
554 twoSample = paste(patient2[1,sampleIndex], sep="")
|
|
555 threeSample = paste(patient3[1,sampleIndex], sep="")
|
|
556
|
|
557 if(mergeOn == "Clone_Sequence"){
|
|
558 patient1$merge = paste(patient1$Clone_Sequence)
|
0
|
559 patient2$merge = paste(patient2$Clone_Sequence)
|
|
560 patient3$merge = paste(patient3$Clone_Sequence)
|
|
561
|
1
|
562 } else {
|
0
|
563 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
|
|
564 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
|
|
565 patient3$merge = paste(patient3$V_Segment_Major_Gene, patient3$J_Segment_Major_Gene, patient3$CDR3_Sense_Sequence)
|
1
|
566 }
|
0
|
567
|
1
|
568 #patientMerge = merge(patient1, patient2, by="merge")[NULL,]
|
|
569 patient1.fuzzy = patient1
|
|
570 patient2.fuzzy = patient2
|
|
571 patient3.fuzzy = patient3
|
0
|
572
|
1
|
573 cat(paste("<tr><td>", label1, "</td>", sep=""), file=logfile, append=T)
|
0
|
574
|
1
|
575 patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J)
|
|
576 patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J)
|
|
577 patient3.fuzzy$merge = paste(patient3.fuzzy$locus_V, patient3.fuzzy$locus_J)
|
0
|
578
|
1
|
579 patient.fuzzy = rbind(patient1.fuzzy ,patient2.fuzzy, patient3.fuzzy)
|
|
580 patient.fuzzy = patient.fuzzy[order(nchar(patient.fuzzy$Clone_Sequence)),]
|
0
|
581
|
1
|
582 other.sample.list = list()
|
|
583 other.sample.list[[oneSample]] = c(twoSample, threeSample)
|
|
584 other.sample.list[[twoSample]] = c(oneSample, threeSample)
|
|
585 other.sample.list[[threeSample]] = c(oneSample, twoSample)
|
0
|
586
|
1
|
587 patientMerge = merge(patient1, patient2, by="merge")
|
|
588 patientMerge = merge(patientMerge, patient3, by="merge")
|
|
589 colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge)))] = paste(colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge), perl=T))], ".z", sep="")
|
|
590 #patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
|
591 patientMerge = patientMerge[NULL,]
|
0
|
592
|
1
|
593 duo.merge.list = list()
|
0
|
594
|
1
|
595 patientMerge12 = merge(patient1, patient2, by="merge")
|
|
596 #patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony])
|
|
597 patientMerge12 = patientMerge12[NULL,]
|
|
598 duo.merge.list[[paste(oneSample, twoSample)]] = patientMerge12
|
|
599 duo.merge.list[[paste(twoSample, oneSample)]] = patientMerge12
|
0
|
600
|
1
|
601 patientMerge13 = merge(patient1, patient3, by="merge")
|
|
602 #patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony])
|
|
603 patientMerge13 = patientMerge13[NULL,]
|
|
604 duo.merge.list[[paste(oneSample, threeSample)]] = patientMerge13
|
|
605 duo.merge.list[[paste(threeSample, oneSample)]] = patientMerge13
|
0
|
606
|
1
|
607 patientMerge23 = merge(patient2, patient3, by="merge")
|
|
608 #patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony])
|
|
609 patientMerge23 = patientMerge23[NULL,]
|
|
610 duo.merge.list[[paste(twoSample, threeSample)]] = patientMerge23
|
|
611 duo.merge.list[[paste(threeSample, twoSample)]] = patientMerge23
|
0
|
612
|
1
|
613 merge.list = list()
|
|
614 merge.list[["second"]] = vector()
|
|
615
|
|
616 #print(paste(nrow(patient1), nrow(patient2), nrow(patient3), label1, label2, label3))
|
|
617
|
|
618 start.time = proc.time()
|
|
619 if(paste(label1, "123") %in% names(patient.merge.list)){
|
|
620 patientMerge = patient.merge.list[[paste(label1, "123")]]
|
|
621 patientMerge12 = patient.merge.list[[paste(label1, "12")]]
|
|
622 patientMerge13 = patient.merge.list[[paste(label1, "13")]]
|
|
623 patientMerge23 = patient.merge.list[[paste(label1, "23")]]
|
0
|
624
|
1
|
625 #merge.list[["second"]] = patient.merge.list.second[[label1]]
|
0
|
626
|
1
|
627 cat(paste("<td>", nrow(patient1), " in ", label1, " and ", nrow(patient2), " in ", label2, nrow(patient3), " in ", label3, ", ", nrow(patientMerge), " in both (fetched from cache)</td></tr>", sep=""), file=logfile, append=T)
|
|
628 } else {
|
|
629 while(nrow(patient.fuzzy) > 0){
|
|
630 first.merge = patient.fuzzy[1,"merge"]
|
|
631 first.clone.sequence = patient.fuzzy[1,"Clone_Sequence"]
|
|
632 first.sample = paste(patient.fuzzy[1,"Sample"], sep="")
|
|
633
|
|
634 merge.filter = first.merge == patient.fuzzy$merge
|
|
635
|
|
636 second.sample = other.sample.list[[first.sample]][1]
|
|
637 third.sample = other.sample.list[[first.sample]][2]
|
0
|
638
|
1
|
639 sample.filter.1 = first.sample == patient.fuzzy$Sample
|
|
640 sample.filter.2 = second.sample == patient.fuzzy$Sample
|
|
641 sample.filter.3 = third.sample == patient.fuzzy$Sample
|
0
|
642
|
1
|
643 sequence.filter = grepl(paste("^", first.clone.sequence, sep=""), patient.fuzzy$Clone_Sequence)
|
0
|
644
|
1
|
645 match.filter.1 = sample.filter.1 & sequence.filter & merge.filter
|
|
646 match.filter.2 = sample.filter.2 & sequence.filter & merge.filter
|
|
647 match.filter.3 = sample.filter.3 & sequence.filter & merge.filter
|
0
|
648
|
1
|
649 matches.in.1 = any(match.filter.1)
|
|
650 matches.in.2 = any(match.filter.2)
|
|
651 matches.in.3 = any(match.filter.3)
|
0
|
652
|
1
|
653 rows.1 = patient.fuzzy[match.filter.1,]
|
0
|
654
|
1
|
655 sum.1 = data.frame(merge = first.clone.sequence,
|
|
656 Patient = label1,
|
|
657 Receptor = rows.1[1,"Receptor"],
|
|
658 Sample = rows.1[1,"Sample"],
|
|
659 Cell_Count = rows.1[1,"Cell_Count"],
|
|
660 Clone_Molecule_Count_From_Spikes = sum(rows.1$Clone_Molecule_Count_From_Spikes),
|
|
661 Log10_Frequency = log10(sum(rows.1$Frequency)),
|
|
662 Total_Read_Count = sum(rows.1$Total_Read_Count),
|
|
663 dsPerM = sum(rows.1$dsPerM),
|
|
664 J_Segment_Major_Gene = rows.1[1,"J_Segment_Major_Gene"],
|
|
665 V_Segment_Major_Gene = rows.1[1,"V_Segment_Major_Gene"],
|
|
666 Clone_Sequence = first.clone.sequence,
|
|
667 CDR3_Sense_Sequence = rows.1[1,"CDR3_Sense_Sequence"],
|
|
668 Related_to_leukemia_clone = F,
|
|
669 Frequency = sum(rows.1$Frequency),
|
|
670 locus_V = rows.1[1,"locus_V"],
|
|
671 locus_J = rows.1[1,"locus_J"],
|
|
672 uniqueID = rows.1[1,"uniqueID"],
|
|
673 normalized_read_count = sum(rows.1$normalized_read_count))
|
|
674 sum.2 = sum.1[NULL,]
|
|
675 rows.2 = patient.fuzzy[match.filter.2,]
|
|
676 if(matches.in.2){
|
|
677 sum.2 = data.frame(merge = first.clone.sequence,
|
|
678 Patient = label1,
|
|
679 Receptor = rows.2[1,"Receptor"],
|
|
680 Sample = rows.2[1,"Sample"],
|
|
681 Cell_Count = rows.2[1,"Cell_Count"],
|
|
682 Clone_Molecule_Count_From_Spikes = sum(rows.2$Clone_Molecule_Count_From_Spikes),
|
|
683 Log10_Frequency = log10(sum(rows.2$Frequency)),
|
|
684 Total_Read_Count = sum(rows.2$Total_Read_Count),
|
|
685 dsPerM = sum(rows.2$dsPerM),
|
|
686 J_Segment_Major_Gene = rows.2[1,"J_Segment_Major_Gene"],
|
|
687 V_Segment_Major_Gene = rows.2[1,"V_Segment_Major_Gene"],
|
|
688 Clone_Sequence = first.clone.sequence,
|
|
689 CDR3_Sense_Sequence = rows.2[1,"CDR3_Sense_Sequence"],
|
|
690 Related_to_leukemia_clone = F,
|
|
691 Frequency = sum(rows.2$Frequency),
|
|
692 locus_V = rows.2[1,"locus_V"],
|
|
693 locus_J = rows.2[1,"locus_J"],
|
|
694 uniqueID = rows.2[1,"uniqueID"],
|
|
695 normalized_read_count = sum(rows.2$normalized_read_count))
|
|
696 }
|
0
|
697
|
1
|
698 sum.3 = sum.1[NULL,]
|
|
699 rows.3 = patient.fuzzy[match.filter.3,]
|
|
700 if(matches.in.3){
|
|
701 sum.3 = data.frame(merge = first.clone.sequence,
|
|
702 Patient = label1,
|
|
703 Receptor = rows.3[1,"Receptor"],
|
|
704 Sample = rows.3[1,"Sample"],
|
|
705 Cell_Count = rows.3[1,"Cell_Count"],
|
|
706 Clone_Molecule_Count_From_Spikes = sum(rows.3$Clone_Molecule_Count_From_Spikes),
|
|
707 Log10_Frequency = log10(sum(rows.3$Frequency)),
|
|
708 Total_Read_Count = sum(rows.3$Total_Read_Count),
|
|
709 dsPerM = sum(rows.3$dsPerM),
|
|
710 J_Segment_Major_Gene = rows.3[1,"J_Segment_Major_Gene"],
|
|
711 V_Segment_Major_Gene = rows.3[1,"V_Segment_Major_Gene"],
|
|
712 Clone_Sequence = first.clone.sequence,
|
|
713 CDR3_Sense_Sequence = rows.3[1,"CDR3_Sense_Sequence"],
|
|
714 Related_to_leukemia_clone = F,
|
|
715 Frequency = sum(rows.3$Frequency),
|
|
716 locus_V = rows.3[1,"locus_V"],
|
|
717 locus_J = rows.3[1,"locus_J"],
|
|
718 uniqueID = rows.3[1,"uniqueID"],
|
|
719 normalized_read_count = sum(rows.3$normalized_read_count))
|
|
720 }
|
0
|
721
|
1
|
722 if(matches.in.2 & matches.in.3){
|
|
723 merge.123 = merge(sum.1, sum.2, by="merge")
|
|
724 merge.123 = merge(merge.123, sum.3, by="merge")
|
|
725 colnames(merge.123)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(merge.123)))] = paste(colnames(merge.123)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(merge.123), perl=T))], ".z", sep="")
|
|
726 #merge.123$thresholdValue = pmax(merge.123[,onx], merge.123[,ony], merge.123[,onz])
|
0
|
727
|
1
|
728 patientMerge = rbind(patientMerge, merge.123)
|
|
729 patient.fuzzy = patient.fuzzy[!(match.filter.1 | match.filter.2 | match.filter.3),]
|
0
|
730
|
1
|
731 hidden.clone.sequences = c(rows.1[-1,"Clone_Sequence"], rows.2[rows.2$Clone_Sequence != first.clone.sequence,"Clone_Sequence"], rows.3[rows.3$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
|
|
732 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
0
|
733
|
1
|
734 } else if (matches.in.2) {
|
|
735 #other.sample1 = other.sample.list[[first.sample]][1]
|
|
736 #other.sample2 = other.sample.list[[first.sample]][2]
|
0
|
737
|
1
|
738 second.sample = sum.2[,"Sample"]
|
0
|
739
|
1
|
740 current.merge.list = duo.merge.list[[paste(first.sample, second.sample)]]
|
0
|
741
|
1
|
742 merge.12 = merge(sum.1, sum.2, by="merge")
|
0
|
743
|
1
|
744 current.merge.list = rbind(current.merge.list, merge.12)
|
|
745 duo.merge.list[[paste(first.sample, second.sample)]] = current.merge.list
|
0
|
746
|
1
|
747 patient.fuzzy = patient.fuzzy[!(match.filter.1 | match.filter.2),]
|
0
|
748
|
1
|
749 hidden.clone.sequences = c(rows.1[-1,"Clone_Sequence"], rows.2[rows.2$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
|
|
750 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
0
|
751
|
1
|
752 } else if (matches.in.3) {
|
0
|
753
|
1
|
754 #other.sample1 = other.sample.list[[first.sample]][1]
|
|
755 #other.sample2 = other.sample.list[[first.sample]][2]
|
0
|
756
|
1
|
757 second.sample = sum.3[,"Sample"]
|
0
|
758
|
1
|
759 current.merge.list = duo.merge.list[[paste(first.sample, second.sample)]]
|
0
|
760
|
1
|
761 merge.13 = merge(sum.1, sum.3, by="merge")
|
0
|
762
|
1
|
763 current.merge.list = rbind(current.merge.list, merge.13)
|
|
764 duo.merge.list[[paste(first.sample, second.sample)]] = current.merge.list
|
0
|
765
|
1
|
766 patient.fuzzy = patient.fuzzy[!(match.filter.1 | match.filter.3),]
|
0
|
767
|
1
|
768 hidden.clone.sequences = c(rows.1[-1,"Clone_Sequence"], rows.3[rows.3$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
|
|
769 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
0
|
770
|
1
|
771 } else if(nrow(rows.1) > 1){
|
|
772 patient1 = patient1[!(patient1$Clone_Sequence %in% rows.1$Clone_Sequence),]
|
5
|
773 #print(names(patient1)[names(patient1) %in% sum.1])
|
|
774 #print(names(patient1)[!(names(patient1) %in% sum.1)])
|
|
775 #print(names(patient1))
|
|
776 #print(names(sum.1))
|
|
777 #print(summary(sum.1))
|
|
778 #print(summary(patient1))
|
|
779 #print(dim(sum.1))
|
|
780 #print(dim(patient1))
|
|
781 #print(head(sum.1[,names(patient1)]))
|
1
|
782 patient1 = rbind(patient1, sum.1[,names(patient1)])
|
|
783 patient.fuzzy = patient.fuzzy[-match.filter.1,]
|
|
784 } else {
|
|
785 patient.fuzzy = patient.fuzzy[-1,]
|
|
786 }
|
|
787
|
|
788 tmp.rows = rbind(rows.1, rows.2, rows.3)
|
|
789 tmp.rows = tmp.rows[order(nchar(tmp.rows$Clone_Sequence)),]
|
0
|
790
|
1
|
791 if (sum(match.filter.1) > 1 | sum(match.filter.2) > 1 | sum(match.filter.1) > 1) {
|
|
792 cat(paste("<tr><td>", label1, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="multiple_matches.html", append=T)
|
|
793 } else {
|
|
794 }
|
|
795
|
|
796 }
|
|
797 patient.merge.list[[paste(label1, "123")]] = patientMerge
|
|
798
|
|
799 patientMerge12 = duo.merge.list[[paste(oneSample, twoSample)]]
|
|
800 patientMerge13 = duo.merge.list[[paste(oneSample, threeSample)]]
|
|
801 patientMerge23 = duo.merge.list[[paste(twoSample, threeSample)]]
|
0
|
802
|
1
|
803 patient.merge.list[[paste(label1, "12")]] = patientMerge12
|
|
804 patient.merge.list[[paste(label1, "13")]] = patientMerge13
|
|
805 patient.merge.list[[paste(label1, "23")]] = patientMerge23
|
|
806
|
|
807 #patient.merge.list.second[[label1]] = merge.list[["second"]]
|
|
808 }
|
|
809 cat(paste("<td>", nrow(patient1), " in ", label1, " and ", nrow(patient2), " in ", label2, nrow(patient3), " in ", label3, ", ", nrow(patientMerge), " in both (finding both took ", (proc.time() - start.time)[[3]], "s)</td></tr>", sep=""), file=logfile, append=T)
|
|
810 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
|
811 patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony])
|
|
812 patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony])
|
|
813 patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony])
|
|
814
|
|
815 #patientMerge$thresholdValue = pmin(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
|
816 #patientMerge12$thresholdValue = pmin(patientMerge12[,onx], patientMerge12[,ony])
|
|
817 #patientMerge13$thresholdValue = pmin(patientMerge13[,onx], patientMerge13[,ony])
|
|
818 #patientMerge23$thresholdValue = pmin(patientMerge23[,onx], patientMerge23[,ony])
|
0
|
819
|
1
|
820 patient1 = patient1[!(patient1$Clone_Sequence %in% merge.list[["second"]]),]
|
|
821 patient2 = patient2[!(patient2$Clone_Sequence %in% merge.list[["second"]]),]
|
|
822 patient3 = patient3[!(patient3$Clone_Sequence %in% merge.list[["second"]]),]
|
0
|
823
|
1
|
824 if(F){
|
|
825 patientMerge = merge(patient1, patient2, by="merge")
|
|
826 patientMerge = merge(patientMerge, patient3, by="merge")
|
|
827 colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge)))] = paste(colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge), perl=T))], ".z", sep="")
|
|
828 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
|
829 patientMerge12 = merge(patient1, patient2, by="merge")
|
|
830 patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony])
|
|
831 patientMerge13 = merge(patient1, patient3, by="merge")
|
|
832 patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony])
|
|
833 patientMerge23 = merge(patient2, patient3, by="merge")
|
|
834 patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony])
|
|
835 }
|
0
|
836
|
1
|
837 scatterplot_data_columns = c("Clone_Sequence", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge")
|
|
838 scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns], patient3[,scatterplot_data_columns])
|
|
839 scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),]
|
|
840
|
|
841 scatterplot_data$type = factor(x="In one", levels=c("In one", "In two", "In three", "In multiple"))
|
0
|
842
|
1
|
843 res1 = vector()
|
|
844 res2 = vector()
|
|
845 res3 = vector()
|
|
846 res12 = vector()
|
|
847 res13 = vector()
|
|
848 res23 = vector()
|
|
849 resAll = vector()
|
|
850 read1Count = vector()
|
|
851 read2Count = vector()
|
|
852 read3Count = vector()
|
0
|
853
|
1
|
854 if(appendTriplets){
|
|
855 cat(paste(label1, label2, label3, sep="\t"), file="triplets.txt", append=T, sep="", fill=3)
|
|
856 }
|
|
857 for(iter in 1:length(product[,1])){
|
|
858 threshhold = product[iter,threshholdIndex]
|
|
859 V_Segment = paste(".*", as.character(product[iter,V_SegmentIndex]), ".*", sep="")
|
|
860 J_Segment = paste(".*", as.character(product[iter,J_SegmentIndex]), ".*", sep="")
|
|
861 #all = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,onx] > threshhold & patientMerge[,ony] > threshhold & patientMerge[,onz] > threshhold)
|
|
862 all = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold)
|
0
|
863
|
1
|
864 one_two = (grepl(V_Segment, patientMerge12$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge12$J_Segment_Major_Gene.x) & patientMerge12$thresholdValue > threshhold & !(patientMerge12$merge %in% patientMerge[all,]$merge))
|
|
865 one_three = (grepl(V_Segment, patientMerge13$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge13$J_Segment_Major_Gene.x) & patientMerge13$thresholdValue > threshhold & !(patientMerge13$merge %in% patientMerge[all,]$merge))
|
|
866 two_three = (grepl(V_Segment, patientMerge23$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge23$J_Segment_Major_Gene.x) & patientMerge23$thresholdValue > threshhold & !(patientMerge23$merge %in% patientMerge[all,]$merge))
|
|
867
|
|
868 one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$merge %in% patientMerge[all,]$merge) & !(patient1$merge %in% patientMerge12[one_two,]$merge) & !(patient1$merge %in% patientMerge13[one_three,]$merge))
|
|
869 two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$merge %in% patientMerge[all,]$merge) & !(patient2$merge %in% patientMerge12[one_two,]$merge) & !(patient2$merge %in% patientMerge23[two_three,]$merge))
|
|
870 three = (grepl(V_Segment, patient3$V_Segment_Major_Gene) & grepl(J_Segment, patient3$J_Segment_Major_Gene) & patient3[,on] > threshhold & !(patient3$merge %in% patientMerge[all,]$merge) & !(patient3$merge %in% patientMerge13[one_three,]$merge) & !(patient3$merge %in% patientMerge23[two_three,]$merge))
|
0
|
871
|
1
|
872 read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.x))
|
|
873 read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.y))
|
|
874 read3Count = append(read3Count, sum(patient3[three,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.z))
|
|
875 res1 = append(res1, sum(one))
|
|
876 res2 = append(res2, sum(two))
|
|
877 res3 = append(res3, sum(three))
|
|
878 resAll = append(resAll, sum(all))
|
|
879 res12 = append(res12, sum(one_two))
|
|
880 res13 = append(res13, sum(one_three))
|
|
881 res23 = append(res23, sum(two_three))
|
|
882 #threshhold = 0
|
|
883 if(threshhold != 0){
|
|
884 if(sum(one) > 0){
|
|
885 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
886 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
|
887 filenameOne = paste(label1, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
888 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
889 }
|
|
890 if(sum(two) > 0){
|
|
891 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
892 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
|
893 filenameTwo = paste(label2, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
894 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
895 }
|
|
896 if(sum(three) > 0){
|
|
897 dfThree = patient3[three,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
898 colnames(dfThree) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
|
899 filenameThree = paste(label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
900 write.table(dfThree, file=paste(filenameThree, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
901 }
|
|
902 if(sum(one_two) > 0){
|
|
903 dfOne_two = patientMerge12[one_two,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
904 colnames(dfOne_two) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample))
|
|
905 filenameOne_two = paste(label1, "_", label2, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
906 write.table(dfOne_two, file=paste(filenameOne_two, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
907 }
|
|
908 if(sum(one_three) > 0){
|
|
909 dfOne_three = patientMerge13[one_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
910 colnames(dfOne_three) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
|
911 filenameOne_three = paste(label1, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
912 write.table(dfOne_three, file=paste(filenameOne_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
913 }
|
|
914 if(sum(two_three) > 0){
|
|
915 dfTwo_three = patientMerge23[two_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
916 colnames(dfTwo_three) = c(paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
|
917 filenameTwo_three = paste(label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
918 write.table(dfTwo_three, file=paste(filenameTwo_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
919 }
|
|
920 } else { #scatterplot data
|
|
921 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
|
|
922 scatterplot_locus_data = scatterplot_locus_data[!(scatterplot_locus_data$merge %in% merge.list[["second"]]),]
|
|
923 in_two = (scatterplot_locus_data$merge %in% patientMerge12[one_two,]$merge) | (scatterplot_locus_data$merge %in% patientMerge13[one_three,]$merge) | (scatterplot_locus_data$merge %in% patientMerge23[two_three,]$merge)
|
|
924 if(sum(in_two) > 0){
|
|
925 scatterplot_locus_data[in_two,]$type = "In two"
|
|
926 }
|
|
927 in_three = (scatterplot_locus_data$merge %in% patientMerge[all,]$merge)
|
|
928 if(sum(in_three)> 0){
|
|
929 scatterplot_locus_data[in_three,]$type = "In three"
|
|
930 }
|
|
931 not_in_one = scatterplot_locus_data$type != "In one"
|
|
932 if(sum(not_in_one) > 0){
|
|
933 #scatterplot_locus_data[not_in_one,]$type = "In multiple"
|
|
934 }
|
|
935 p = NULL
|
|
936 if(nrow(scatterplot_locus_data) != 0){
|
2
|
937 filename.scatter = paste(label1, "_", label2, "_", label3, "_", product[iter, titleIndex], "_scatter_", threshhold, sep="")
|
|
938 write.table(scatterplot_locus_data, file=paste(filename.scatter, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
939 if(on == "normalized_read_count"){
|
|
940 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
|
941 p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales, limits=c(1, 1e6))
|
|
942 } else {
|
|
943 p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_log10(limits=c(0.0001,100), breaks=c(0.0001, 0.001, 0.01, 0.1, 1, 10, 100), labels=c("0.0001", "0.001", "0.01", "0.1", "1", "10", "100")) + expand_limits(y=c(0,100))
|
|
944 #p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
|
|
945 }
|
|
946 p = p + geom_point(aes(colour=type), position="jitter")
|
|
947 p = p + xlab("In one or in multiple samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex]))
|
1
|
948 } else {
|
2
|
949 p = ggplot(NULL, aes(x=c("In one", "In multiple"),y=0)) + geom_blank(NULL) + xlab("In two or in three of the samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex]))
|
1
|
950 }
|
|
951 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_", product[iter, titleIndex],"_scatter.png", sep=""))
|
|
952 print(p)
|
|
953 dev.off()
|
|
954 }
|
|
955 if(sum(all) > 0){
|
|
956 dfAll = patientMerge[all,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y", "V_Segment_Major_Gene.z", "J_Segment_Major_Gene.z", "normalized_read_count.z", "Frequency.z", "Related_to_leukemia_clone.z")]
|
|
957 colnames(dfAll) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample), paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
|
958 filenameAll = paste(label1, "_", label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
959 write.table(dfAll, file=paste(filenameAll, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
960 }
|
|
961 }
|
|
962 #patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "tmp3"=res3, "read_count3"=round(read3Count))
|
|
963 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "tmp2"=res2, "tmp3"=res3, "tmp12"=res12, "tmp13"=res13, "tmp23"=res23)
|
|
964 colnames(patientResult)[6] = oneSample
|
|
965 colnames(patientResult)[7] = twoSample
|
|
966 colnames(patientResult)[8] = threeSample
|
|
967 colnames(patientResult)[9] = paste(oneSample, twoSample, sep="_")
|
|
968 colnames(patientResult)[10] = paste(oneSample, twoSample, sep="_")
|
|
969 colnames(patientResult)[11] = paste(oneSample, twoSample, sep="_")
|
|
970
|
|
971 colnamesBak = colnames(patientResult)
|
|
972 colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", "Number of sequences All", paste("Number of sequences", oneSample), paste("Number of sequences", twoSample), paste("Number of sequences", threeSample), paste("Number of sequences", oneSample, twoSample), paste("Number of sequences", oneSample, threeSample), paste("Number of sequences", twoSample, threeSample))
|
|
973 write.table(patientResult, file=paste(label1, "_", label2, "_", label3, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
974 colnames(patientResult) = colnamesBak
|
|
975
|
|
976 patientResult$Locus = factor(patientResult$Locus, Titles)
|
|
977 patientResult$cut_off_value = factor(patientResult$cut_off_value, paste(">", interval, sep=""))
|
|
978
|
|
979 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "All")])
|
|
980 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=All), stat='identity', position="dodge", fill="#79c36a")
|
|
981 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
982 plt = plt + geom_text(aes(ymax=max(All), x=cut_off_value,y=All,label=All), angle=90, hjust=0)
|
|
983 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in All")
|
|
984 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
985 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_total_all.png", sep=""), width=1920, height=1080)
|
|
986 print(plt)
|
|
987 dev.off()
|
|
988
|
|
989 fontSize = 4
|
|
990
|
|
991 bak = patientResult
|
|
992 patientResult = melt(patientResult[,c('Locus','cut_off_value', oneSample, twoSample, threeSample)] ,id.vars=1:2)
|
|
993 patientResult$relativeValue = patientResult$value * 10
|
|
994 patientResult[patientResult$relativeValue == 0,]$relativeValue = 1
|
|
995 plt = ggplot(patientResult)
|
|
996 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=relativeValue, fill=variable), stat='identity', position="dodge")
|
|
997 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
998 plt = plt + scale_y_continuous(trans="log", breaks=10^c(0:10), labels=c(0, 10^c(0:9)))
|
|
999 plt = plt + geom_text(data=patientResult[patientResult$variable == oneSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=-0.7, size=fontSize)
|
|
1000 plt = plt + geom_text(data=patientResult[patientResult$variable == twoSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=0.4, size=fontSize)
|
|
1001 plt = plt + geom_text(data=patientResult[patientResult$variable == threeSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=1.5, size=fontSize)
|
|
1002 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in only one sample")
|
|
1003 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_indiv_all.png", sep=""), width=1920, height=1080)
|
|
1004 print(plt)
|
|
1005 dev.off()
|
0
|
1006 }
|
|
1007
|
|
1008 if(nrow(triplets) != 0){
|
|
1009
|
1
|
1010 cat("<tr><td>Starting triplet analysis</td></tr>", file=logfile, append=T)
|
|
1011
|
|
1012 triplets$uniqueID = paste(triplets$Patient, triplets$Sample, sep="_")
|
|
1013
|
|
1014 cat("<tr><td>Normalizing to lowest cell count within locus</td></tr>", file=logfile, append=T)
|
0
|
1015
|
1
|
1016 triplets$locus_V = substring(triplets$V_Segment_Major_Gene, 0, 4)
|
|
1017 triplets$locus_J = substring(triplets$J_Segment_Major_Gene, 0, 4)
|
|
1018 min_cell_count = data.frame(data.table(triplets)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("uniqueID", "locus_V", "locus_J")])
|
0
|
1019
|
1
|
1020 triplets$min_cell_paste = paste(triplets$uniqueID, triplets$locus_V, triplets$locus_J)
|
|
1021 min_cell_count$min_cell_paste = paste(min_cell_count$uniqueID, min_cell_count$locus_V, min_cell_count$locus_J)
|
0
|
1022
|
1
|
1023 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
|
|
1024
|
|
1025 triplets = merge(triplets, min_cell_count, by="min_cell_paste")
|
|
1026
|
|
1027 triplets$normalized_read_count = round(triplets$Clone_Molecule_Count_From_Spikes / triplets$Cell_Count * triplets$min_cell_count / 2, digits=2)
|
0
|
1028
|
1
|
1029 triplets = triplets[triplets$normalized_read_count >= min_cells,]
|
|
1030
|
|
1031 column_drops = c("min_cell_count", "min_cell_paste")
|
|
1032
|
|
1033 triplets = triplets[,!(colnames(triplets) %in% column_drops)]
|
|
1034
|
|
1035 cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T)
|
0
|
1036
|
1
|
1037 interval = intervalReads
|
|
1038 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
1039 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
0
|
1040
|
1
|
1041 triplets = split(triplets, triplets$Patient, drop=T)
|
5
|
1042 #print(nrow(triplets))
|
1
|
1043 for(triplet in triplets){
|
|
1044 samples = unique(triplet$Sample)
|
|
1045 one = triplet[triplet$Sample == samples[1],]
|
|
1046 two = triplet[triplet$Sample == samples[2],]
|
|
1047 three = triplet[triplet$Sample == samples[3],]
|
|
1048
|
|
1049 print(paste(nrow(triplet), nrow(one), nrow(two), nrow(three)))
|
|
1050 tripletAnalysis(one, one[1,"uniqueID"], two, two[1,"uniqueID"], three, three[1,"uniqueID"], product=product, interval=interval, on="normalized_read_count", T)
|
|
1051 }
|
|
1052
|
|
1053 cat("<tr><td>Starting Frequency analysis</td></tr>", file=logfile, append=T)
|
|
1054
|
|
1055 interval = intervalFreq
|
|
1056 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
1057 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
|
1058
|
|
1059 for(triplet in triplets){
|
|
1060 samples = unique(triplet$Sample)
|
|
1061 one = triplet[triplet$Sample == samples[1],]
|
|
1062 two = triplet[triplet$Sample == samples[2],]
|
|
1063 three = triplet[triplet$Sample == samples[3],]
|
|
1064 tripletAnalysis(one, one[1,"uniqueID"], two, two[1,"uniqueID"], three, three[1,"uniqueID"], product=product, interval=interval, on="Frequency", F)
|
|
1065 }
|
0
|
1066 } else {
|
|
1067 cat("", file="triplets.txt")
|
|
1068 }
|
|
1069 cat("</table></html>", file=logfile, append=T)
|