0
|
1 args <- commandArgs(trailingOnly = TRUE)
|
|
2 options(scipen=999)
|
|
3
|
|
4 inFile = args[1]
|
|
5 outDir = args[2]
|
|
6 logfile = args[3]
|
|
7 min_freq = as.numeric(args[4])
|
|
8 min_cells = as.numeric(args[5])
|
|
9 mergeOn = args[6]
|
|
10
|
|
11 cat("<html><table><tr><td>Starting analysis</td></tr>", file=logfile, append=F)
|
|
12
|
|
13 library(ggplot2)
|
|
14 library(reshape2)
|
|
15 library(data.table)
|
|
16 library(grid)
|
|
17 library(parallel)
|
|
18 #require(xtable)
|
|
19 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T)
|
|
20 dat = read.table(inFile, header=T, sep="\t", dec=".", fill=T, stringsAsFactors=F)
|
2
|
21
|
|
22 needed_cols = c("Patient", "Receptor", "Sample", "Cell_Count", "Clone_Molecule_Count_From_Spikes", "Log10_Frequency", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence", "Clone_Sequence")
|
|
23 if(!all(needed_cols %in% names(dat))){
|
|
24 cat("Missing column(s):<br />", file=logfile, append=F)
|
|
25 missing_cols = needed_cols[!(needed_cols %in% names(dat))]
|
|
26 for(col in missing_cols){
|
|
27 cat(paste(col, "<br />"), file=logfile, append=T)
|
|
28 }
|
|
29 stop("Not all columns are present")
|
|
30 }
|
|
31
|
|
32 if(!("Total_Read_Count" %in% names(dat))){
|
|
33 dat$Total_Read_Count = 0
|
|
34 }
|
|
35
|
|
36 dat = dat[,c("Patient", "Receptor", "Sample", "Cell_Count", "Clone_Molecule_Count_From_Spikes", "Log10_Frequency", "Total_Read_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence", "Clone_Sequence")]
|
|
37
|
0
|
38 dat$dsPerM = 0
|
|
39 dat = dat[!is.na(dat$Patient),]
|
|
40 dat$Related_to_leukemia_clone = F
|
|
41
|
|
42 setwd(outDir)
|
|
43 cat("<tr><td>Selecting first V/J Genes</td></tr>", file=logfile, append=T)
|
|
44 dat$V_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$V_Segment_Major_Gene), "; "), "[[", 1)))
|
|
45 dat$J_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$J_Segment_Major_Gene), "; "), "[[", 1)))
|
|
46
|
|
47 cat("<tr><td>Calculating Frequency</td></tr>", file=logfile, append=T)
|
|
48
|
|
49 dat$Frequency = ((10^dat$Log10_Frequency)*100)
|
|
50
|
|
51 dat = dat[dat$Frequency >= min_freq,]
|
|
52
|
1
|
53 patient.sample.counts = data.frame(data.table(dat)[, list(count=.N), by=c("Patient", "Sample")])
|
|
54 patient.sample.counts = data.frame(data.table(patient.sample.counts)[, list(count=.N), by=c("Patient")])
|
|
55
|
|
56 print("Found the following patients with number of samples:")
|
|
57 print(patient.sample.counts)
|
|
58
|
|
59 patient.sample.counts.pairs = patient.sample.counts[patient.sample.counts$count %in% 1:2,]
|
|
60 patient.sample.counts.triplets = patient.sample.counts[patient.sample.counts$count == 3,]
|
|
61
|
|
62
|
|
63
|
|
64 triplets = dat[dat$Patient %in% patient.sample.counts.triplets$Patient,]
|
|
65 dat = dat[dat$Patient %in% patient.sample.counts.pairs$Patient,]
|
0
|
66
|
|
67 cat("<tr><td>Normalizing to lowest cell count within locus</td></tr>", file=logfile, append=T)
|
|
68
|
|
69 dat$locus_V = substring(dat$V_Segment_Major_Gene, 0, 4)
|
|
70 dat$locus_J = substring(dat$J_Segment_Major_Gene, 0, 4)
|
|
71 min_cell_count = data.frame(data.table(dat)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("Patient", "locus_V", "locus_J")])
|
|
72
|
|
73 dat$min_cell_paste = paste(dat$Patient, dat$locus_V, dat$locus_J)
|
|
74 min_cell_count$min_cell_paste = paste(min_cell_count$Patient, min_cell_count$locus_V, min_cell_count$locus_J)
|
|
75
|
|
76 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
|
|
77 print(paste("rows:", nrow(dat)))
|
|
78 dat = merge(dat, min_cell_count, by="min_cell_paste")
|
|
79 print(paste("rows:", nrow(dat)))
|
|
80 dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * dat$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2
|
|
81
|
|
82 dat = dat[dat$normalized_read_count >= min_cells,]
|
|
83
|
|
84 dat$paste = paste(dat$Sample, dat$Clone_Sequence)
|
|
85
|
|
86 patients = split(dat, dat$Patient, drop=T)
|
|
87 intervalReads = rev(c(0,10,25,50,100,250,500,750,1000,10000))
|
|
88 intervalFreq = rev(c(0,0.01,0.05,0.1,0.5,1,5))
|
|
89 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV")
|
|
90 J_Segments = c(".*", ".*", ".*", "IGKJ", "KDE", ".*", ".*", ".*", ".*", ".*")
|
|
91 Titles = c("Total", "IGH-Vh-Jh", "IGH-Dh-Jh", "Vk-Jk", "Vk-Kde" , "Intron-Kde", "TCRG", "TCRD-Vd-Dd", "TCRD-Dd-Dd", "TCRB-Vb-Jb")
|
|
92 Titles = factor(Titles, levels=Titles)
|
|
93 TitlesOrder = data.frame("Title"=Titles, "TitlesOrder"=1:length(Titles))
|
|
94
|
2
|
95 single_patients = dat[NULL,]
|
0
|
96
|
|
97 patient.merge.list = list() #cache the 'both' table, 2x speedup for more memory...
|
|
98 patient.merge.list.second = list()
|
|
99 scatter_locus_data_list = list()
|
|
100 cat(paste("<table border='0' style='font-family:courier;'>", sep=""), file="multiple_matches.html", append=T)
|
|
101 cat(paste("<table border='0' style='font-family:courier;'>", sep=""), file="single_matches.html", append=T)
|
|
102 patientCountOnColumn <- function(x, product, interval, on, appendtxt=F){
|
|
103 if (!is.data.frame(x) & is.list(x)){
|
|
104 x = x[[1]]
|
|
105 }
|
|
106 #x$Sample = factor(x$Sample, levels=unique(x$Sample))
|
|
107 x = data.frame(x,stringsAsFactors=F)
|
|
108 onShort = "reads"
|
|
109 if(on == "Frequency"){
|
|
110 onShort = "freq"
|
|
111 }
|
|
112 onx = paste(on, ".x", sep="")
|
|
113 ony = paste(on, ".y", sep="")
|
|
114 splt = split(x, x$Sample, drop=T)
|
2
|
115 print(splt)
|
0
|
116 type="pair"
|
|
117 if(length(splt) == 1){
|
|
118 print(paste(paste(x[1,which(colnames(x) == "Patient")]), "has one sample"))
|
2
|
119 splt[[2]] = splt[[1]][NULL,]
|
0
|
120 type="single"
|
|
121 }
|
|
122 patient1 = splt[[1]]
|
|
123 patient2 = splt[[2]]
|
|
124
|
2
|
125 oneSample = patient1[1,"Sample"]
|
|
126 twoSample = patient2[1,"Sample"]
|
|
127 patient = x[1,"Patient"]
|
|
128
|
0
|
129 switched = F
|
|
130 if(length(grep(".*_Right$", twoSample)) == 1 || length(grep(".*_Dx_BM$", twoSample)) == 1 || length(grep(".*_Dx$", twoSample)) == 1 ){
|
|
131 tmp = twoSample
|
|
132 twoSample = oneSample
|
|
133 oneSample = tmp
|
|
134 tmp = patient1
|
|
135 patient1 = patient2
|
|
136 patient2 = tmp
|
|
137 switched = T
|
|
138 }
|
|
139 if(appendtxt){
|
|
140 cat(paste(patient, oneSample, twoSample, type, sep="\t"), file="patients.txt", append=T, sep="", fill=3)
|
|
141 }
|
|
142 cat(paste("<tr><td>", patient, "</td>", sep=""), file=logfile, append=T)
|
|
143
|
|
144 if(mergeOn == "Clone_Sequence"){
|
|
145 patient1$merge = paste(patient1$Clone_Sequence)
|
|
146 patient2$merge = paste(patient2$Clone_Sequence)
|
|
147 } else {
|
|
148 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
|
|
149 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
|
|
150 }
|
|
151
|
|
152 scatterplot_data_columns = c("Patient", "Sample", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge")
|
|
153 scatterplot_data = patient1[NULL,scatterplot_data_columns]
|
|
154 scatterplot.data.type.factor = c(oneSample, twoSample, paste(c(oneSample, twoSample), "In Both"))
|
|
155 scatterplot_data$type = character(0)
|
|
156 scatterplot_data$link = numeric(0)
|
|
157 scatterplot_data$on = character(0)
|
|
158
|
|
159 patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge")[NULL,] #blegh
|
2
|
160
|
0
|
161 cs.exact.matches = patient1[patient1$Clone_Sequence %in% patient2$Clone_Sequence,]$Clone_Sequence
|
2
|
162
|
0
|
163 start.time = proc.time()
|
|
164 merge.list = c()
|
2
|
165
|
0
|
166 if(patient %in% names(patient.merge.list)){
|
|
167 patientMerge = patient.merge.list[[patient]]
|
|
168 merge.list[["second"]] = patient.merge.list.second[[patient]]
|
|
169 scatterplot_data = scatter_locus_data_list[[patient]]
|
|
170 cat(paste("<td>", nrow(patient1), " in ", oneSample, " and ", nrow(patient2), " in ", twoSample, ", ", nrow(patientMerge), " in both (fetched from cache)</td></tr>", sep=""), file=logfile, append=T)
|
2
|
171
|
0
|
172 print(names(patient.merge.list))
|
|
173 } else {
|
|
174 #fuzzy matching here...
|
2
|
175
|
0
|
176 patient1.fuzzy = patient1
|
|
177 patient2.fuzzy = patient2
|
2
|
178
|
0
|
179 patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J)
|
|
180 patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J)
|
2
|
181
|
0
|
182 patient.fuzzy = rbind(patient1.fuzzy, patient2.fuzzy)
|
|
183 patient.fuzzy = patient.fuzzy[order(nchar(patient.fuzzy$Clone_Sequence)),]
|
2
|
184
|
0
|
185 merge.list = list()
|
2
|
186
|
0
|
187 merge.list[["second"]] = vector()
|
2
|
188
|
|
189 link.count = 1
|
|
190
|
0
|
191 while(nrow(patient.fuzzy) > 1){
|
|
192 first.merge = patient.fuzzy[1,"merge"]
|
|
193 first.clone.sequence = patient.fuzzy[1,"Clone_Sequence"]
|
|
194 first.sample = patient.fuzzy[1,"Sample"]
|
|
195 merge.filter = first.merge == patient.fuzzy$merge
|
2
|
196
|
0
|
197 #length.filter = nchar(patient.fuzzy$Clone_Sequence) - nchar(first.clone.sequence) <= 9
|
2
|
198
|
0
|
199 first.sample.filter = first.sample == patient.fuzzy$Sample
|
|
200 second.sample.filter = first.sample != patient.fuzzy$Sample
|
2
|
201
|
0
|
202 #first match same sample, sum to a single row, same for other sample
|
|
203 #then merge rows like 'normal'
|
2
|
204
|
0
|
205 sequence.filter = grepl(paste("^", first.clone.sequence, sep=""), patient.fuzzy$Clone_Sequence)
|
2
|
206
|
|
207
|
|
208
|
0
|
209 #match.filter = merge.filter & grepl(first.clone.sequence, patient.fuzzy$Clone_Sequence) & length.filter & sample.filter
|
|
210 first.match.filter = merge.filter & sequence.filter & first.sample.filter
|
|
211 second.match.filter = merge.filter & sequence.filter & second.sample.filter
|
2
|
212
|
0
|
213 first.rows = patient.fuzzy[first.match.filter,]
|
|
214 second.rows = patient.fuzzy[second.match.filter,]
|
2
|
215
|
0
|
216 first.rows.v = table(first.rows$V_Segment_Major_Gene)
|
|
217 first.rows.v = names(first.rows.v[which.max(first.rows.v)])
|
|
218 first.rows.j = table(first.rows$J_Segment_Major_Gene)
|
|
219 first.rows.j = names(first.rows.j[which.max(first.rows.j)])
|
2
|
220
|
0
|
221 first.sum = data.frame(merge = first.clone.sequence,
|
|
222 Patient = patient,
|
|
223 Receptor = first.rows[1,"Receptor"],
|
|
224 Sample = first.rows[1,"Sample"],
|
|
225 Cell_Count = first.rows[1,"Cell_Count"],
|
|
226 Clone_Molecule_Count_From_Spikes = sum(first.rows$Clone_Molecule_Count_From_Spikes),
|
|
227 Log10_Frequency = log10(sum(first.rows$Frequency)),
|
|
228 Total_Read_Count = sum(first.rows$Total_Read_Count),
|
|
229 dsPerM = sum(first.rows$dsPerM),
|
|
230 J_Segment_Major_Gene = first.rows.j,
|
|
231 V_Segment_Major_Gene = first.rows.v,
|
|
232 Clone_Sequence = first.clone.sequence,
|
|
233 CDR3_Sense_Sequence = first.rows[1,"CDR3_Sense_Sequence"],
|
|
234 Related_to_leukemia_clone = F,
|
|
235 Frequency = sum(first.rows$Frequency),
|
|
236 locus_V = first.rows[1,"locus_V"],
|
|
237 locus_J = first.rows[1,"locus_J"],
|
|
238 min_cell_count = first.rows[1,"min_cell_count"],
|
|
239 normalized_read_count = sum(first.rows$normalized_read_count),
|
|
240 paste = first.rows[1,"paste"],
|
|
241 min_cell_paste = first.rows[1,"min_cell_paste"])
|
2
|
242
|
0
|
243 if(nrow(second.rows) > 0){
|
|
244 second.rows.v = table(second.rows$V_Segment_Major_Gene)
|
|
245 second.rows.v = names(second.rows.v[which.max(second.rows.v)])
|
|
246 second.rows.j = table(second.rows$J_Segment_Major_Gene)
|
|
247 second.rows.j = names(second.rows.j[which.max(second.rows.j)])
|
2
|
248
|
0
|
249 second.sum = data.frame(merge = first.clone.sequence,
|
2
|
250 Patient = patient,
|
|
251 Receptor = second.rows[1,"Receptor"],
|
|
252 Sample = second.rows[1,"Sample"],
|
|
253 Cell_Count = second.rows[1,"Cell_Count"],
|
|
254 Clone_Molecule_Count_From_Spikes = sum(second.rows$Clone_Molecule_Count_From_Spikes),
|
|
255 Log10_Frequency = log10(sum(second.rows$Frequency)),
|
|
256 Total_Read_Count = sum(second.rows$Total_Read_Count),
|
|
257 dsPerM = sum(second.rows$dsPerM),
|
|
258 J_Segment_Major_Gene = second.rows.j,
|
|
259 V_Segment_Major_Gene = second.rows.v,
|
|
260 Clone_Sequence = first.clone.sequence,
|
|
261 CDR3_Sense_Sequence = second.rows[1,"CDR3_Sense_Sequence"],
|
|
262 Related_to_leukemia_clone = F,
|
|
263 Frequency = sum(second.rows$Frequency),
|
|
264 locus_V = second.rows[1,"locus_V"],
|
|
265 locus_J = second.rows[1,"locus_J"],
|
|
266 min_cell_count = second.rows[1,"min_cell_count"],
|
|
267 normalized_read_count = sum(second.rows$normalized_read_count),
|
|
268 paste = second.rows[1,"paste"],
|
|
269 min_cell_paste = second.rows[1,"min_cell_paste"])
|
|
270
|
|
271 #print(names(patientMerge))
|
|
272 #print(merge(first.sum, second.sum, by="merge"))
|
0
|
273 patientMerge = rbind(patientMerge, merge(first.sum, second.sum, by="merge"))
|
2
|
274 #print("test2")
|
0
|
275 patient.fuzzy = patient.fuzzy[!(first.match.filter | second.match.filter),]
|
2
|
276
|
0
|
277 hidden.clone.sequences = c(first.rows[-1,"Clone_Sequence"], second.rows[second.rows$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
|
|
278 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
2
|
279
|
0
|
280 tmp.rows = rbind(first.rows, second.rows)
|
2
|
281 #print("test3")
|
0
|
282 tmp.rows = tmp.rows[order(nchar(tmp.rows$Clone_Sequence)),]
|
|
283
|
|
284
|
|
285 #add to the scatterplot data
|
|
286 scatterplot.row = first.sum[,scatterplot_data_columns]
|
2
|
287 scatterplot.row$type = paste(first.sum[,"Sample"], "In Both")
|
|
288 scatterplot.row$link = link.count
|
|
289 scatterplot.row$on = onShort
|
|
290
|
|
291 scatterplot_data = rbind(scatterplot_data, scatterplot.row)
|
0
|
292
|
|
293 scatterplot.row = second.sum[,scatterplot_data_columns]
|
2
|
294 scatterplot.row$type = paste(second.sum[,"Sample"], "In Both")
|
|
295 scatterplot.row$link = link.count
|
|
296 scatterplot.row$on = onShort
|
|
297
|
|
298 scatterplot_data = rbind(scatterplot_data, scatterplot.row)
|
|
299
|
|
300 #write some information about the match to a log file
|
0
|
301 if (nrow(first.rows) > 1 | nrow(second.rows) > 1) {
|
|
302 cat(paste("<tr><td>", patient, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="multiple_matches.html", append=T)
|
|
303 } else {
|
|
304 second.clone.sequence = second.rows[1,"Clone_Sequence"]
|
|
305 if(nchar(first.clone.sequence) != nchar(second.clone.sequence)){
|
|
306 cat(paste("<tr bgcolor='#DDD'><td>", patient, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="single_matches.html", append=T)
|
|
307 } else {
|
|
308 #cat(paste("<tr><td>", patient, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="single_matches.html", append=T)
|
|
309 }
|
|
310 }
|
2
|
311
|
0
|
312 } else if(nrow(first.rows) > 1) {
|
|
313 if(patient1[1,"Sample"] == first.sample){
|
|
314 patient1 = patient1[!(patient1$Clone_Sequence %in% first.rows$Clone_Sequence),]
|
|
315 patient1 = rbind(patient1, first.sum)
|
|
316 } else {
|
|
317 patient2 = patient2[!(patient2$Clone_Sequence %in% first.rows$Clone_Sequence),]
|
|
318 patient2 = rbind(patient2, first.sum)
|
|
319 }
|
2
|
320
|
0
|
321 hidden.clone.sequences = c(first.rows[-1,"Clone_Sequence"])
|
|
322 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
2
|
323
|
0
|
324 patient.fuzzy = patient.fuzzy[-first.match.filter,]
|
|
325
|
|
326 #add to the scatterplot data
|
|
327 scatterplot.row = first.sum[,scatterplot_data_columns]
|
2
|
328 scatterplot.row$type = first.sum[,"Sample"]
|
|
329 scatterplot.row$link = link.count
|
|
330 scatterplot.row$on = onShort
|
|
331
|
|
332 scatterplot_data = rbind(scatterplot_data, scatterplot.row)
|
|
333
|
0
|
334 cat(paste("<tr bgcolor='#DDF'><td>", patient, " row ", 1:nrow(first.rows), "</td><td>", first.rows$Sample, ":</td><td>", first.rows$Clone_Sequence, "</td><td>", first.rows$normalized_read_count, "</td></tr>", sep=""), file="single_matches.html", append=T)
|
|
335 } else {
|
|
336 patient.fuzzy = patient.fuzzy[-1,]
|
|
337
|
|
338 #add to the scatterplot data
|
|
339 scatterplot.row = first.sum[,scatterplot_data_columns]
|
2
|
340 scatterplot.row$type = first.sum[,"Sample"]
|
|
341 scatterplot.row$link = link.count
|
|
342 scatterplot.row$on = onShort
|
|
343
|
|
344 scatterplot_data = rbind(scatterplot_data, scatterplot.row)
|
0
|
345 }
|
|
346 link.count = link.count + 1
|
|
347 }
|
|
348 patient.merge.list[[patient]] <<- patientMerge
|
|
349 patient.merge.list.second[[patient]] <<- merge.list[["second"]]
|
|
350
|
|
351 sample.order = data.frame(type = c(oneSample, twoSample, paste(c(oneSample, twoSample), "In Both")),type.order = 1:4)
|
|
352 scatterplot_data = merge(scatterplot_data, sample.order, by="type")
|
|
353
|
|
354 scatter_locus_data_list[[patient]] <<- scatterplot_data
|
|
355 cat(paste("<td>", nrow(patient1), " in ", oneSample, " and ", nrow(patient2), " in ", twoSample, ", ", nrow(patientMerge), " in both (finding both took ", (proc.time() - start.time)[[3]], "s)</td></tr>", sep=""), file=logfile, append=T)
|
|
356 }
|
2
|
357
|
0
|
358 patient1 = patient1[!(patient1$Clone_Sequence %in% patient.merge.list.second[[patient]]),]
|
|
359 patient2 = patient2[!(patient2$Clone_Sequence %in% patient.merge.list.second[[patient]]),]
|
2
|
360
|
0
|
361
|
|
362 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony])
|
|
363 #patientMerge$thresholdValue = pmin(patientMerge[,onx], patientMerge[,ony])
|
|
364 res1 = vector()
|
|
365 res2 = vector()
|
|
366 resBoth = vector()
|
|
367 read1Count = vector()
|
|
368 read2Count = vector()
|
|
369 locussum1 = vector()
|
|
370 locussum2 = vector()
|
|
371
|
|
372 #for(iter in 1){
|
|
373 for(iter in 1:length(product[,1])){
|
2
|
374 threshhold = product[iter,"interval"]
|
|
375 V_Segment = paste(".*", as.character(product[iter,"V_Segments"]), ".*", sep="")
|
|
376 J_Segment = paste(".*", as.character(product[iter,"J_Segments"]), ".*", sep="")
|
0
|
377 #both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,onx] > threshhold & patientMerge[,ony] > threshhold) #both higher than threshold
|
|
378 both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold) #highest of both is higher than threshold
|
|
379 one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$merge %in% patientMerge[both,]$merge))
|
|
380 two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$merge %in% patientMerge[both,]$merge))
|
|
381 read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count))
|
|
382 read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count))
|
|
383 res1 = append(res1, sum(one))
|
|
384 res2 = append(res2, sum(two))
|
|
385 resBoth = append(resBoth, sum(both))
|
|
386 locussum1 = append(locussum1, sum(patient1[(grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene)),]$normalized_read_count))
|
|
387 locussum2 = append(locussum2, sum(patient2[(grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene)),]$normalized_read_count))
|
|
388 #threshhold = 0
|
2
|
389 if(threshhold != 0 | T){
|
0
|
390 if(sum(one) > 0){
|
|
391 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
392 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone Sequence", "Related_to_leukemia_clone")
|
2
|
393 filenameOne = paste(oneSample, "_", product[iter, "Titles"], "_", threshhold, sep="")
|
0
|
394 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
395 }
|
|
396 if(sum(two) > 0){
|
|
397 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
398 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone Sequence", "Related_to_leukemia_clone")
|
2
|
399 filenameTwo = paste(twoSample, "_", product[iter, "Titles"], "_", threshhold, sep="")
|
0
|
400 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
401 }
|
|
402 } else {
|
|
403 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
|
|
404 if(nrow(scatterplot_locus_data) > 0){
|
2
|
405 scatterplot_locus_data$Rearrangement = product[iter, "Titles"]
|
0
|
406 }
|
|
407
|
|
408
|
2
|
409
|
0
|
410 p = NULL
|
2
|
411 #print(paste("nrow scatterplot_locus_data", nrow(scatterplot_locus_data)))
|
0
|
412 if(nrow(scatterplot_locus_data) != 0){
|
|
413 if(on == "normalized_read_count"){
|
2
|
414 write.table(scatterplot_locus_data, file=paste(oneSample, twoSample, product[iter, "Titles"], "scatterplot_locus_data.txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
0
|
415 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
|
416 p = ggplot(scatterplot_locus_data, aes(factor(reorder(type, type.order)), normalized_read_count, group=link)) + geom_line() + scale_y_log10(breaks=scales,labels=scales, limits=c(1,1e6)) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE)
|
|
417 } else {
|
|
418 p = ggplot(scatterplot_locus_data, aes(factor(reorder(type, type.order)), Frequency, group=link)) + geom_line() + scale_y_log10(limits=c(0.0001,100), breaks=c(0.0001, 0.001, 0.01, 0.1, 1, 10, 100), labels=c("0.0001", "0.001", "0.01", "0.1", "1", "10", "100")) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE)
|
|
419 }
|
|
420 p = p + geom_point(aes(colour=type), position="dodge")
|
2
|
421 p = p + xlab("In one or both samples") + ylab(onShort) + ggtitle(paste(patient1[1,"Patient"], patient1[1,"Sample"], patient2[1,"Sample"], onShort, product[iter, "Titles"]))
|
0
|
422 } else {
|
2
|
423 p = ggplot(NULL, aes(x=c("In one", "In Both"),y=0)) + geom_blank(NULL) + xlab("In one or both of the samples") + ylab(onShort) + ggtitle(paste(patient1[1,"Patient"], patient1[1,"Sample"], patient2[1,"Sample"], onShort, product[iter, titleIndex]))
|
0
|
424 }
|
2
|
425 png(paste(patient1[1,"Patient"], "_", patient1[1,"Sample"], "_", patient2[1,"Sample"], "_", onShort, "_", product[iter, "Titles"],"_scatter.png", sep=""))
|
0
|
426 print(p)
|
|
427 dev.off()
|
|
428 }
|
|
429 if(sum(both) > 0){
|
|
430 dfBoth = patientMerge[both,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
431 colnames(dfBoth) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample))
|
2
|
432 filenameBoth = paste(oneSample, "_", twoSample, "_", product[iter, "Titles"], "_", threshhold, sep="")
|
0
|
433 write.table(dfBoth, file=paste(filenameBoth, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
434 }
|
|
435 }
|
|
436 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "Both"=resBoth, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "Sum"=res1 + res2 + resBoth, "percentage" = round((resBoth/(res1 + res2 + resBoth)) * 100, digits=2), "Locus_sum1"=locussum1, "Locus_sum2"=locussum2)
|
|
437 if(sum(is.na(patientResult$percentage)) > 0){
|
|
438 patientResult[is.na(patientResult$percentage),]$percentage = 0
|
|
439 }
|
|
440 colnames(patientResult)[6] = oneSample
|
|
441 colnames(patientResult)[8] = twoSample
|
|
442 colnamesBak = colnames(patientResult)
|
|
443 colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", paste("Number of sequences ", patient, "_Both", sep=""), paste("Number of sequences", oneSample, sep=""), paste("Normalized Read Count", oneSample), paste("Number of sequences", twoSample, sep=""), paste("Normalized Read Count", twoSample), paste("Sum number of sequences", patient), paste("Percentage of sequences ", patient, "_Both", sep=""), paste("Locus Sum", oneSample), paste("Locus Sum", twoSample))
|
|
444 write.table(patientResult, file=paste(patient, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
445 colnames(patientResult) = colnamesBak
|
|
446
|
|
447 patientResult$Locus = factor(patientResult$Locus, Titles)
|
|
448 patientResult$cut_off_value = factor(patientResult$cut_off_value, paste(">", interval, sep=""))
|
|
449
|
|
450 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "Both")])
|
|
451 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=Both), stat='identity', position="dodge", fill="#79c36a")
|
|
452 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
453 plt = plt + geom_text(aes(ymax=max(Both), x=cut_off_value,y=Both,label=Both), angle=90, hjust=0)
|
|
454 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in both")
|
|
455 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
456 png(paste(patient, "_", onShort, ".png", sep=""), width=1920, height=1080)
|
|
457 print(plt)
|
|
458 dev.off()
|
|
459 #(t,r,b,l)
|
|
460 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "percentage")])
|
|
461 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=percentage), stat='identity', position="dodge", fill="#79c36a")
|
|
462 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
463 plt = plt + geom_text(aes(ymax=max(percentage), x=cut_off_value,y=percentage,label=percentage), angle=90, hjust=0)
|
|
464 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("% clones in both left and right")
|
|
465 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
466 png(paste(patient, "_percent_", onShort, ".png", sep=""), width=1920, height=1080)
|
|
467 print(plt)
|
|
468 dev.off()
|
|
469
|
|
470 patientResult = melt(patientResult[,c('Locus','cut_off_value', oneSample, twoSample)] ,id.vars=1:2)
|
|
471 patientResult$relativeValue = patientResult$value * 10
|
|
472 patientResult[patientResult$relativeValue == 0,]$relativeValue = 1
|
|
473 plt = ggplot(patientResult)
|
|
474 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=relativeValue, fill=variable), stat='identity', position="dodge")
|
|
475 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
476 plt = plt + scale_y_continuous(trans="log", breaks=10^c(0:10), labels=c(0, 10^c(0:9)))
|
|
477 plt = plt + geom_text(data=patientResult[patientResult$variable == oneSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=-0.2)
|
|
478 plt = plt + geom_text(data=patientResult[patientResult$variable == twoSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=0.8)
|
|
479 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle(paste("Number of clones in only ", oneSample, " and only ", twoSample, sep=""))
|
|
480 png(paste(patient, "_", onShort, "_both.png", sep=""), width=1920, height=1080)
|
|
481 print(plt)
|
|
482 dev.off()
|
|
483 }
|
2
|
484
|
1
|
485 if(length(patients) > 0){
|
|
486 cat("<tr><td>Starting Frequency analysis</td></tr>", file=logfile, append=T)
|
0
|
487
|
1
|
488 interval = intervalFreq
|
|
489 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
490 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
2
|
491 for (current_patient in patients){
|
|
492 print(paste("Started working", unique(current_patient$Patient), "Frequency analysis"))
|
|
493 patientCountOnColumn(current_patient, product=product, interval=interval, on="Frequency", appendtxt=T)
|
|
494 }
|
0
|
495
|
1
|
496 cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T)
|
0
|
497
|
1
|
498 interval = intervalReads
|
|
499 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
500 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
2
|
501 for (current_patient in patients){
|
|
502 print(paste("Started working on ", unique(current_patient$Patient), "Read Count analysis"))
|
|
503 patientCountOnColumn(current_patient, product=product, interval=interval, on="normalized_read_count")
|
|
504 }
|
1
|
505 }
|
0
|
506 if(nrow(single_patients) > 0){
|
|
507 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
|
508 p = ggplot(single_patients, aes(Rearrangement, normalized_read_count)) + scale_y_log10(breaks=scales,labels=as.character(scales)) + expand_limits(y=c(0,1000000))
|
|
509 p = p + geom_point(aes(colour=type), position="jitter")
|
|
510 p = p + xlab("In one or both samples") + ylab("Reads")
|
|
511 p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the reads of the patients with a single sample")
|
|
512 png("singles_reads_scatterplot.png", width=640 * length(unique(single_patients$Patient)) + 100, height=1080)
|
|
513 print(p)
|
|
514 dev.off()
|
|
515
|
|
516 #p = ggplot(single_patients, aes(Rearrangement, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
|
|
517 p = ggplot(single_patients, aes(Rearrangement, Frequency)) + scale_y_log10(limits=c(0.0001,100), breaks=c(0.0001, 0.001, 0.01, 0.1, 1, 10, 100), labels=c("0.0001", "0.001", "0.01", "0.1", "1", "10", "100")) + expand_limits(y=c(0,100))
|
|
518 p = p + geom_point(aes(colour=type), position="jitter")
|
|
519 p = p + xlab("In one or both samples") + ylab("Frequency")
|
|
520 p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the frequency of the patients with a single sample")
|
|
521 png("singles_freq_scatterplot.png", width=640 * length(unique(single_patients$Patient)) + 100, height=1080)
|
|
522 print(p)
|
|
523 dev.off()
|
|
524 } else {
|
|
525 empty <- data.frame()
|
|
526 p = ggplot(empty) + geom_point() + xlim(0, 10) + ylim(0, 100) + xlab("In one or both samples") + ylab("Frequency") + ggtitle("Scatterplot of the frequency of the patients with a single sample")
|
|
527
|
|
528 png("singles_reads_scatterplot.png", width=400, height=300)
|
|
529 print(p)
|
|
530 dev.off()
|
|
531
|
|
532 png("singles_freq_scatterplot.png", width=400, height=300)
|
|
533 print(p)
|
|
534 dev.off()
|
|
535 }
|
|
536
|
|
537 patient.merge.list = list() #cache the 'both' table, 2x speedup for more memory...
|
|
538 patient.merge.list.second = list()
|
|
539
|
|
540 tripletAnalysis <- function(patient1, label1, patient2, label2, patient3, label3, product, interval, on, appendTriplets= FALSE){
|
1
|
541 onShort = "reads"
|
|
542 if(on == "Frequency"){
|
|
543 onShort = "freq"
|
|
544 }
|
|
545 onx = paste(on, ".x", sep="")
|
|
546 ony = paste(on, ".y", sep="")
|
|
547 onz = paste(on, ".z", sep="")
|
|
548 type="triplet"
|
0
|
549
|
1
|
550 threshholdIndex = which(colnames(product) == "interval")
|
|
551 V_SegmentIndex = which(colnames(product) == "V_Segments")
|
|
552 J_SegmentIndex = which(colnames(product) == "J_Segments")
|
|
553 titleIndex = which(colnames(product) == "Titles")
|
|
554 sampleIndex = which(colnames(patient1) == "Sample")
|
|
555 patientIndex = which(colnames(patient1) == "Patient")
|
|
556 oneSample = paste(patient1[1,sampleIndex], sep="")
|
|
557 twoSample = paste(patient2[1,sampleIndex], sep="")
|
|
558 threeSample = paste(patient3[1,sampleIndex], sep="")
|
|
559
|
|
560 if(mergeOn == "Clone_Sequence"){
|
|
561 patient1$merge = paste(patient1$Clone_Sequence)
|
0
|
562 patient2$merge = paste(patient2$Clone_Sequence)
|
|
563 patient3$merge = paste(patient3$Clone_Sequence)
|
|
564
|
1
|
565 } else {
|
0
|
566 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
|
|
567 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
|
|
568 patient3$merge = paste(patient3$V_Segment_Major_Gene, patient3$J_Segment_Major_Gene, patient3$CDR3_Sense_Sequence)
|
1
|
569 }
|
0
|
570
|
1
|
571 #patientMerge = merge(patient1, patient2, by="merge")[NULL,]
|
|
572 patient1.fuzzy = patient1
|
|
573 patient2.fuzzy = patient2
|
|
574 patient3.fuzzy = patient3
|
0
|
575
|
1
|
576 cat(paste("<tr><td>", label1, "</td>", sep=""), file=logfile, append=T)
|
0
|
577
|
1
|
578 patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J)
|
|
579 patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J)
|
|
580 patient3.fuzzy$merge = paste(patient3.fuzzy$locus_V, patient3.fuzzy$locus_J)
|
0
|
581
|
1
|
582 patient.fuzzy = rbind(patient1.fuzzy ,patient2.fuzzy, patient3.fuzzy)
|
|
583 patient.fuzzy = patient.fuzzy[order(nchar(patient.fuzzy$Clone_Sequence)),]
|
0
|
584
|
1
|
585 other.sample.list = list()
|
|
586 other.sample.list[[oneSample]] = c(twoSample, threeSample)
|
|
587 other.sample.list[[twoSample]] = c(oneSample, threeSample)
|
|
588 other.sample.list[[threeSample]] = c(oneSample, twoSample)
|
0
|
589
|
1
|
590 patientMerge = merge(patient1, patient2, by="merge")
|
|
591 patientMerge = merge(patientMerge, patient3, by="merge")
|
|
592 colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge)))] = paste(colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge), perl=T))], ".z", sep="")
|
|
593 #patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
|
594 patientMerge = patientMerge[NULL,]
|
0
|
595
|
1
|
596 duo.merge.list = list()
|
0
|
597
|
1
|
598 patientMerge12 = merge(patient1, patient2, by="merge")
|
|
599 #patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony])
|
|
600 patientMerge12 = patientMerge12[NULL,]
|
|
601 duo.merge.list[[paste(oneSample, twoSample)]] = patientMerge12
|
|
602 duo.merge.list[[paste(twoSample, oneSample)]] = patientMerge12
|
0
|
603
|
1
|
604 patientMerge13 = merge(patient1, patient3, by="merge")
|
|
605 #patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony])
|
|
606 patientMerge13 = patientMerge13[NULL,]
|
|
607 duo.merge.list[[paste(oneSample, threeSample)]] = patientMerge13
|
|
608 duo.merge.list[[paste(threeSample, oneSample)]] = patientMerge13
|
0
|
609
|
1
|
610 patientMerge23 = merge(patient2, patient3, by="merge")
|
|
611 #patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony])
|
|
612 patientMerge23 = patientMerge23[NULL,]
|
|
613 duo.merge.list[[paste(twoSample, threeSample)]] = patientMerge23
|
|
614 duo.merge.list[[paste(threeSample, twoSample)]] = patientMerge23
|
0
|
615
|
1
|
616 merge.list = list()
|
|
617 merge.list[["second"]] = vector()
|
|
618
|
|
619 #print(paste(nrow(patient1), nrow(patient2), nrow(patient3), label1, label2, label3))
|
|
620
|
|
621 start.time = proc.time()
|
|
622 if(paste(label1, "123") %in% names(patient.merge.list)){
|
|
623 patientMerge = patient.merge.list[[paste(label1, "123")]]
|
|
624 patientMerge12 = patient.merge.list[[paste(label1, "12")]]
|
|
625 patientMerge13 = patient.merge.list[[paste(label1, "13")]]
|
|
626 patientMerge23 = patient.merge.list[[paste(label1, "23")]]
|
0
|
627
|
1
|
628 #merge.list[["second"]] = patient.merge.list.second[[label1]]
|
0
|
629
|
1
|
630 cat(paste("<td>", nrow(patient1), " in ", label1, " and ", nrow(patient2), " in ", label2, nrow(patient3), " in ", label3, ", ", nrow(patientMerge), " in both (fetched from cache)</td></tr>", sep=""), file=logfile, append=T)
|
|
631 } else {
|
|
632 while(nrow(patient.fuzzy) > 0){
|
|
633 first.merge = patient.fuzzy[1,"merge"]
|
|
634 first.clone.sequence = patient.fuzzy[1,"Clone_Sequence"]
|
|
635 first.sample = paste(patient.fuzzy[1,"Sample"], sep="")
|
|
636
|
|
637 merge.filter = first.merge == patient.fuzzy$merge
|
|
638
|
|
639 second.sample = other.sample.list[[first.sample]][1]
|
|
640 third.sample = other.sample.list[[first.sample]][2]
|
0
|
641
|
1
|
642 sample.filter.1 = first.sample == patient.fuzzy$Sample
|
|
643 sample.filter.2 = second.sample == patient.fuzzy$Sample
|
|
644 sample.filter.3 = third.sample == patient.fuzzy$Sample
|
0
|
645
|
1
|
646 sequence.filter = grepl(paste("^", first.clone.sequence, sep=""), patient.fuzzy$Clone_Sequence)
|
0
|
647
|
1
|
648 match.filter.1 = sample.filter.1 & sequence.filter & merge.filter
|
|
649 match.filter.2 = sample.filter.2 & sequence.filter & merge.filter
|
|
650 match.filter.3 = sample.filter.3 & sequence.filter & merge.filter
|
0
|
651
|
1
|
652 matches.in.1 = any(match.filter.1)
|
|
653 matches.in.2 = any(match.filter.2)
|
|
654 matches.in.3 = any(match.filter.3)
|
0
|
655
|
1
|
656 rows.1 = patient.fuzzy[match.filter.1,]
|
0
|
657
|
1
|
658 sum.1 = data.frame(merge = first.clone.sequence,
|
|
659 Patient = label1,
|
|
660 Receptor = rows.1[1,"Receptor"],
|
|
661 Sample = rows.1[1,"Sample"],
|
|
662 Cell_Count = rows.1[1,"Cell_Count"],
|
|
663 Clone_Molecule_Count_From_Spikes = sum(rows.1$Clone_Molecule_Count_From_Spikes),
|
|
664 Log10_Frequency = log10(sum(rows.1$Frequency)),
|
|
665 Total_Read_Count = sum(rows.1$Total_Read_Count),
|
|
666 dsPerM = sum(rows.1$dsPerM),
|
|
667 J_Segment_Major_Gene = rows.1[1,"J_Segment_Major_Gene"],
|
|
668 V_Segment_Major_Gene = rows.1[1,"V_Segment_Major_Gene"],
|
|
669 Clone_Sequence = first.clone.sequence,
|
|
670 CDR3_Sense_Sequence = rows.1[1,"CDR3_Sense_Sequence"],
|
|
671 Related_to_leukemia_clone = F,
|
|
672 Frequency = sum(rows.1$Frequency),
|
|
673 locus_V = rows.1[1,"locus_V"],
|
|
674 locus_J = rows.1[1,"locus_J"],
|
|
675 uniqueID = rows.1[1,"uniqueID"],
|
|
676 normalized_read_count = sum(rows.1$normalized_read_count))
|
|
677 sum.2 = sum.1[NULL,]
|
|
678 rows.2 = patient.fuzzy[match.filter.2,]
|
|
679 if(matches.in.2){
|
|
680 sum.2 = data.frame(merge = first.clone.sequence,
|
|
681 Patient = label1,
|
|
682 Receptor = rows.2[1,"Receptor"],
|
|
683 Sample = rows.2[1,"Sample"],
|
|
684 Cell_Count = rows.2[1,"Cell_Count"],
|
|
685 Clone_Molecule_Count_From_Spikes = sum(rows.2$Clone_Molecule_Count_From_Spikes),
|
|
686 Log10_Frequency = log10(sum(rows.2$Frequency)),
|
|
687 Total_Read_Count = sum(rows.2$Total_Read_Count),
|
|
688 dsPerM = sum(rows.2$dsPerM),
|
|
689 J_Segment_Major_Gene = rows.2[1,"J_Segment_Major_Gene"],
|
|
690 V_Segment_Major_Gene = rows.2[1,"V_Segment_Major_Gene"],
|
|
691 Clone_Sequence = first.clone.sequence,
|
|
692 CDR3_Sense_Sequence = rows.2[1,"CDR3_Sense_Sequence"],
|
|
693 Related_to_leukemia_clone = F,
|
|
694 Frequency = sum(rows.2$Frequency),
|
|
695 locus_V = rows.2[1,"locus_V"],
|
|
696 locus_J = rows.2[1,"locus_J"],
|
|
697 uniqueID = rows.2[1,"uniqueID"],
|
|
698 normalized_read_count = sum(rows.2$normalized_read_count))
|
|
699 }
|
0
|
700
|
1
|
701 sum.3 = sum.1[NULL,]
|
|
702 rows.3 = patient.fuzzy[match.filter.3,]
|
|
703 if(matches.in.3){
|
|
704 sum.3 = data.frame(merge = first.clone.sequence,
|
|
705 Patient = label1,
|
|
706 Receptor = rows.3[1,"Receptor"],
|
|
707 Sample = rows.3[1,"Sample"],
|
|
708 Cell_Count = rows.3[1,"Cell_Count"],
|
|
709 Clone_Molecule_Count_From_Spikes = sum(rows.3$Clone_Molecule_Count_From_Spikes),
|
|
710 Log10_Frequency = log10(sum(rows.3$Frequency)),
|
|
711 Total_Read_Count = sum(rows.3$Total_Read_Count),
|
|
712 dsPerM = sum(rows.3$dsPerM),
|
|
713 J_Segment_Major_Gene = rows.3[1,"J_Segment_Major_Gene"],
|
|
714 V_Segment_Major_Gene = rows.3[1,"V_Segment_Major_Gene"],
|
|
715 Clone_Sequence = first.clone.sequence,
|
|
716 CDR3_Sense_Sequence = rows.3[1,"CDR3_Sense_Sequence"],
|
|
717 Related_to_leukemia_clone = F,
|
|
718 Frequency = sum(rows.3$Frequency),
|
|
719 locus_V = rows.3[1,"locus_V"],
|
|
720 locus_J = rows.3[1,"locus_J"],
|
|
721 uniqueID = rows.3[1,"uniqueID"],
|
|
722 normalized_read_count = sum(rows.3$normalized_read_count))
|
|
723 }
|
0
|
724
|
1
|
725 if(matches.in.2 & matches.in.3){
|
|
726 merge.123 = merge(sum.1, sum.2, by="merge")
|
|
727 merge.123 = merge(merge.123, sum.3, by="merge")
|
|
728 colnames(merge.123)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(merge.123)))] = paste(colnames(merge.123)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(merge.123), perl=T))], ".z", sep="")
|
|
729 #merge.123$thresholdValue = pmax(merge.123[,onx], merge.123[,ony], merge.123[,onz])
|
0
|
730
|
1
|
731 patientMerge = rbind(patientMerge, merge.123)
|
|
732 patient.fuzzy = patient.fuzzy[!(match.filter.1 | match.filter.2 | match.filter.3),]
|
0
|
733
|
1
|
734 hidden.clone.sequences = c(rows.1[-1,"Clone_Sequence"], rows.2[rows.2$Clone_Sequence != first.clone.sequence,"Clone_Sequence"], rows.3[rows.3$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
|
|
735 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
0
|
736
|
1
|
737 } else if (matches.in.2) {
|
|
738 #other.sample1 = other.sample.list[[first.sample]][1]
|
|
739 #other.sample2 = other.sample.list[[first.sample]][2]
|
0
|
740
|
1
|
741 second.sample = sum.2[,"Sample"]
|
0
|
742
|
1
|
743 current.merge.list = duo.merge.list[[paste(first.sample, second.sample)]]
|
0
|
744
|
1
|
745 merge.12 = merge(sum.1, sum.2, by="merge")
|
0
|
746
|
1
|
747 current.merge.list = rbind(current.merge.list, merge.12)
|
|
748 duo.merge.list[[paste(first.sample, second.sample)]] = current.merge.list
|
0
|
749
|
1
|
750 patient.fuzzy = patient.fuzzy[!(match.filter.1 | match.filter.2),]
|
0
|
751
|
1
|
752 hidden.clone.sequences = c(rows.1[-1,"Clone_Sequence"], rows.2[rows.2$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
|
|
753 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
0
|
754
|
1
|
755 } else if (matches.in.3) {
|
0
|
756
|
1
|
757 #other.sample1 = other.sample.list[[first.sample]][1]
|
|
758 #other.sample2 = other.sample.list[[first.sample]][2]
|
0
|
759
|
1
|
760 second.sample = sum.3[,"Sample"]
|
0
|
761
|
1
|
762 current.merge.list = duo.merge.list[[paste(first.sample, second.sample)]]
|
0
|
763
|
1
|
764 merge.13 = merge(sum.1, sum.3, by="merge")
|
0
|
765
|
1
|
766 current.merge.list = rbind(current.merge.list, merge.13)
|
|
767 duo.merge.list[[paste(first.sample, second.sample)]] = current.merge.list
|
0
|
768
|
1
|
769 patient.fuzzy = patient.fuzzy[!(match.filter.1 | match.filter.3),]
|
0
|
770
|
1
|
771 hidden.clone.sequences = c(rows.1[-1,"Clone_Sequence"], rows.3[rows.3$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
|
|
772 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
0
|
773
|
1
|
774 } else if(nrow(rows.1) > 1){
|
|
775 patient1 = patient1[!(patient1$Clone_Sequence %in% rows.1$Clone_Sequence),]
|
|
776 print(names(patient1)[names(patient1) %in% sum.1])
|
|
777 print(names(patient1)[!(names(patient1) %in% sum.1)])
|
|
778 print(names(patient1))
|
|
779 print(names(sum.1))
|
|
780 print(summary(sum.1))
|
|
781 print(summary(patient1))
|
|
782 print(dim(sum.1))
|
|
783 print(dim(patient1))
|
|
784 print(head(sum.1[,names(patient1)]))
|
|
785 patient1 = rbind(patient1, sum.1[,names(patient1)])
|
|
786 patient.fuzzy = patient.fuzzy[-match.filter.1,]
|
|
787 } else {
|
|
788 patient.fuzzy = patient.fuzzy[-1,]
|
|
789 }
|
|
790
|
|
791 tmp.rows = rbind(rows.1, rows.2, rows.3)
|
|
792 tmp.rows = tmp.rows[order(nchar(tmp.rows$Clone_Sequence)),]
|
0
|
793
|
1
|
794 if (sum(match.filter.1) > 1 | sum(match.filter.2) > 1 | sum(match.filter.1) > 1) {
|
|
795 cat(paste("<tr><td>", label1, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="multiple_matches.html", append=T)
|
|
796 } else {
|
|
797 }
|
|
798
|
|
799 }
|
|
800 patient.merge.list[[paste(label1, "123")]] = patientMerge
|
|
801
|
|
802 patientMerge12 = duo.merge.list[[paste(oneSample, twoSample)]]
|
|
803 patientMerge13 = duo.merge.list[[paste(oneSample, threeSample)]]
|
|
804 patientMerge23 = duo.merge.list[[paste(twoSample, threeSample)]]
|
0
|
805
|
1
|
806 patient.merge.list[[paste(label1, "12")]] = patientMerge12
|
|
807 patient.merge.list[[paste(label1, "13")]] = patientMerge13
|
|
808 patient.merge.list[[paste(label1, "23")]] = patientMerge23
|
|
809
|
|
810 #patient.merge.list.second[[label1]] = merge.list[["second"]]
|
|
811 }
|
|
812 cat(paste("<td>", nrow(patient1), " in ", label1, " and ", nrow(patient2), " in ", label2, nrow(patient3), " in ", label3, ", ", nrow(patientMerge), " in both (finding both took ", (proc.time() - start.time)[[3]], "s)</td></tr>", sep=""), file=logfile, append=T)
|
|
813 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
|
814 patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony])
|
|
815 patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony])
|
|
816 patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony])
|
|
817
|
|
818 #patientMerge$thresholdValue = pmin(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
|
819 #patientMerge12$thresholdValue = pmin(patientMerge12[,onx], patientMerge12[,ony])
|
|
820 #patientMerge13$thresholdValue = pmin(patientMerge13[,onx], patientMerge13[,ony])
|
|
821 #patientMerge23$thresholdValue = pmin(patientMerge23[,onx], patientMerge23[,ony])
|
0
|
822
|
1
|
823 patient1 = patient1[!(patient1$Clone_Sequence %in% merge.list[["second"]]),]
|
|
824 patient2 = patient2[!(patient2$Clone_Sequence %in% merge.list[["second"]]),]
|
|
825 patient3 = patient3[!(patient3$Clone_Sequence %in% merge.list[["second"]]),]
|
0
|
826
|
1
|
827 if(F){
|
|
828 patientMerge = merge(patient1, patient2, by="merge")
|
|
829 patientMerge = merge(patientMerge, patient3, by="merge")
|
|
830 colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge)))] = paste(colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge), perl=T))], ".z", sep="")
|
|
831 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
|
832 patientMerge12 = merge(patient1, patient2, by="merge")
|
|
833 patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony])
|
|
834 patientMerge13 = merge(patient1, patient3, by="merge")
|
|
835 patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony])
|
|
836 patientMerge23 = merge(patient2, patient3, by="merge")
|
|
837 patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony])
|
|
838 }
|
0
|
839
|
1
|
840 scatterplot_data_columns = c("Clone_Sequence", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge")
|
|
841 scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns], patient3[,scatterplot_data_columns])
|
|
842 scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),]
|
|
843
|
|
844 scatterplot_data$type = factor(x="In one", levels=c("In one", "In two", "In three", "In multiple"))
|
0
|
845
|
1
|
846 res1 = vector()
|
|
847 res2 = vector()
|
|
848 res3 = vector()
|
|
849 res12 = vector()
|
|
850 res13 = vector()
|
|
851 res23 = vector()
|
|
852 resAll = vector()
|
|
853 read1Count = vector()
|
|
854 read2Count = vector()
|
|
855 read3Count = vector()
|
0
|
856
|
1
|
857 if(appendTriplets){
|
|
858 cat(paste(label1, label2, label3, sep="\t"), file="triplets.txt", append=T, sep="", fill=3)
|
|
859 }
|
|
860 for(iter in 1:length(product[,1])){
|
|
861 threshhold = product[iter,threshholdIndex]
|
|
862 V_Segment = paste(".*", as.character(product[iter,V_SegmentIndex]), ".*", sep="")
|
|
863 J_Segment = paste(".*", as.character(product[iter,J_SegmentIndex]), ".*", sep="")
|
|
864 #all = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,onx] > threshhold & patientMerge[,ony] > threshhold & patientMerge[,onz] > threshhold)
|
|
865 all = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold)
|
0
|
866
|
1
|
867 one_two = (grepl(V_Segment, patientMerge12$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge12$J_Segment_Major_Gene.x) & patientMerge12$thresholdValue > threshhold & !(patientMerge12$merge %in% patientMerge[all,]$merge))
|
|
868 one_three = (grepl(V_Segment, patientMerge13$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge13$J_Segment_Major_Gene.x) & patientMerge13$thresholdValue > threshhold & !(patientMerge13$merge %in% patientMerge[all,]$merge))
|
|
869 two_three = (grepl(V_Segment, patientMerge23$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge23$J_Segment_Major_Gene.x) & patientMerge23$thresholdValue > threshhold & !(patientMerge23$merge %in% patientMerge[all,]$merge))
|
|
870
|
|
871 one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$merge %in% patientMerge[all,]$merge) & !(patient1$merge %in% patientMerge12[one_two,]$merge) & !(patient1$merge %in% patientMerge13[one_three,]$merge))
|
|
872 two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$merge %in% patientMerge[all,]$merge) & !(patient2$merge %in% patientMerge12[one_two,]$merge) & !(patient2$merge %in% patientMerge23[two_three,]$merge))
|
|
873 three = (grepl(V_Segment, patient3$V_Segment_Major_Gene) & grepl(J_Segment, patient3$J_Segment_Major_Gene) & patient3[,on] > threshhold & !(patient3$merge %in% patientMerge[all,]$merge) & !(patient3$merge %in% patientMerge13[one_three,]$merge) & !(patient3$merge %in% patientMerge23[two_three,]$merge))
|
0
|
874
|
1
|
875 read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.x))
|
|
876 read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.y))
|
|
877 read3Count = append(read3Count, sum(patient3[three,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.z))
|
|
878 res1 = append(res1, sum(one))
|
|
879 res2 = append(res2, sum(two))
|
|
880 res3 = append(res3, sum(three))
|
|
881 resAll = append(resAll, sum(all))
|
|
882 res12 = append(res12, sum(one_two))
|
|
883 res13 = append(res13, sum(one_three))
|
|
884 res23 = append(res23, sum(two_three))
|
|
885 #threshhold = 0
|
|
886 if(threshhold != 0){
|
|
887 if(sum(one) > 0){
|
|
888 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
889 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
|
890 filenameOne = paste(label1, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
891 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
892 }
|
|
893 if(sum(two) > 0){
|
|
894 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
895 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
|
896 filenameTwo = paste(label2, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
897 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
898 }
|
|
899 if(sum(three) > 0){
|
|
900 dfThree = patient3[three,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
901 colnames(dfThree) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
|
902 filenameThree = paste(label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
903 write.table(dfThree, file=paste(filenameThree, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
904 }
|
|
905 if(sum(one_two) > 0){
|
|
906 dfOne_two = patientMerge12[one_two,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
907 colnames(dfOne_two) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample))
|
|
908 filenameOne_two = paste(label1, "_", label2, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
909 write.table(dfOne_two, file=paste(filenameOne_two, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
910 }
|
|
911 if(sum(one_three) > 0){
|
|
912 dfOne_three = patientMerge13[one_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
913 colnames(dfOne_three) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
|
914 filenameOne_three = paste(label1, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
915 write.table(dfOne_three, file=paste(filenameOne_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
916 }
|
|
917 if(sum(two_three) > 0){
|
|
918 dfTwo_three = patientMerge23[two_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
919 colnames(dfTwo_three) = c(paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
|
920 filenameTwo_three = paste(label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
921 write.table(dfTwo_three, file=paste(filenameTwo_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
922 }
|
|
923 } else { #scatterplot data
|
|
924 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
|
|
925 scatterplot_locus_data = scatterplot_locus_data[!(scatterplot_locus_data$merge %in% merge.list[["second"]]),]
|
|
926 in_two = (scatterplot_locus_data$merge %in% patientMerge12[one_two,]$merge) | (scatterplot_locus_data$merge %in% patientMerge13[one_three,]$merge) | (scatterplot_locus_data$merge %in% patientMerge23[two_three,]$merge)
|
|
927 if(sum(in_two) > 0){
|
|
928 scatterplot_locus_data[in_two,]$type = "In two"
|
|
929 }
|
|
930 in_three = (scatterplot_locus_data$merge %in% patientMerge[all,]$merge)
|
|
931 if(sum(in_three)> 0){
|
|
932 scatterplot_locus_data[in_three,]$type = "In three"
|
|
933 }
|
|
934 not_in_one = scatterplot_locus_data$type != "In one"
|
|
935 if(sum(not_in_one) > 0){
|
|
936 #scatterplot_locus_data[not_in_one,]$type = "In multiple"
|
|
937 }
|
|
938 p = NULL
|
|
939 if(nrow(scatterplot_locus_data) != 0){
|
2
|
940 filename.scatter = paste(label1, "_", label2, "_", label3, "_", product[iter, titleIndex], "_scatter_", threshhold, sep="")
|
|
941 write.table(scatterplot_locus_data, file=paste(filename.scatter, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
942 if(on == "normalized_read_count"){
|
|
943 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
|
944 p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales, limits=c(1, 1e6))
|
|
945 } else {
|
|
946 p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_log10(limits=c(0.0001,100), breaks=c(0.0001, 0.001, 0.01, 0.1, 1, 10, 100), labels=c("0.0001", "0.001", "0.01", "0.1", "1", "10", "100")) + expand_limits(y=c(0,100))
|
|
947 #p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
|
|
948 }
|
|
949 p = p + geom_point(aes(colour=type), position="jitter")
|
|
950 p = p + xlab("In one or in multiple samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex]))
|
1
|
951 } else {
|
2
|
952 p = ggplot(NULL, aes(x=c("In one", "In multiple"),y=0)) + geom_blank(NULL) + xlab("In two or in three of the samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex]))
|
1
|
953 }
|
|
954 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_", product[iter, titleIndex],"_scatter.png", sep=""))
|
|
955 print(p)
|
|
956 dev.off()
|
|
957 }
|
|
958 if(sum(all) > 0){
|
|
959 dfAll = patientMerge[all,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y", "V_Segment_Major_Gene.z", "J_Segment_Major_Gene.z", "normalized_read_count.z", "Frequency.z", "Related_to_leukemia_clone.z")]
|
|
960 colnames(dfAll) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample), paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
|
961 filenameAll = paste(label1, "_", label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
962 write.table(dfAll, file=paste(filenameAll, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
963 }
|
|
964 }
|
|
965 #patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "tmp3"=res3, "read_count3"=round(read3Count))
|
|
966 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "tmp2"=res2, "tmp3"=res3, "tmp12"=res12, "tmp13"=res13, "tmp23"=res23)
|
|
967 colnames(patientResult)[6] = oneSample
|
|
968 colnames(patientResult)[7] = twoSample
|
|
969 colnames(patientResult)[8] = threeSample
|
|
970 colnames(patientResult)[9] = paste(oneSample, twoSample, sep="_")
|
|
971 colnames(patientResult)[10] = paste(oneSample, twoSample, sep="_")
|
|
972 colnames(patientResult)[11] = paste(oneSample, twoSample, sep="_")
|
|
973
|
|
974 colnamesBak = colnames(patientResult)
|
|
975 colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", "Number of sequences All", paste("Number of sequences", oneSample), paste("Number of sequences", twoSample), paste("Number of sequences", threeSample), paste("Number of sequences", oneSample, twoSample), paste("Number of sequences", oneSample, threeSample), paste("Number of sequences", twoSample, threeSample))
|
|
976 write.table(patientResult, file=paste(label1, "_", label2, "_", label3, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
977 colnames(patientResult) = colnamesBak
|
|
978
|
|
979 patientResult$Locus = factor(patientResult$Locus, Titles)
|
|
980 patientResult$cut_off_value = factor(patientResult$cut_off_value, paste(">", interval, sep=""))
|
|
981
|
|
982 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "All")])
|
|
983 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=All), stat='identity', position="dodge", fill="#79c36a")
|
|
984 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
985 plt = plt + geom_text(aes(ymax=max(All), x=cut_off_value,y=All,label=All), angle=90, hjust=0)
|
|
986 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in All")
|
|
987 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
988 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_total_all.png", sep=""), width=1920, height=1080)
|
|
989 print(plt)
|
|
990 dev.off()
|
|
991
|
|
992 fontSize = 4
|
|
993
|
|
994 bak = patientResult
|
|
995 patientResult = melt(patientResult[,c('Locus','cut_off_value', oneSample, twoSample, threeSample)] ,id.vars=1:2)
|
|
996 patientResult$relativeValue = patientResult$value * 10
|
|
997 patientResult[patientResult$relativeValue == 0,]$relativeValue = 1
|
|
998 plt = ggplot(patientResult)
|
|
999 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=relativeValue, fill=variable), stat='identity', position="dodge")
|
|
1000 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
1001 plt = plt + scale_y_continuous(trans="log", breaks=10^c(0:10), labels=c(0, 10^c(0:9)))
|
|
1002 plt = plt + geom_text(data=patientResult[patientResult$variable == oneSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=-0.7, size=fontSize)
|
|
1003 plt = plt + geom_text(data=patientResult[patientResult$variable == twoSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=0.4, size=fontSize)
|
|
1004 plt = plt + geom_text(data=patientResult[patientResult$variable == threeSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=1.5, size=fontSize)
|
|
1005 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in only one sample")
|
|
1006 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_indiv_all.png", sep=""), width=1920, height=1080)
|
|
1007 print(plt)
|
|
1008 dev.off()
|
0
|
1009 }
|
|
1010
|
|
1011 if(nrow(triplets) != 0){
|
|
1012
|
1
|
1013 cat("<tr><td>Starting triplet analysis</td></tr>", file=logfile, append=T)
|
|
1014
|
|
1015 triplets$uniqueID = paste(triplets$Patient, triplets$Sample, sep="_")
|
|
1016
|
|
1017 cat("<tr><td>Normalizing to lowest cell count within locus</td></tr>", file=logfile, append=T)
|
0
|
1018
|
1
|
1019 triplets$locus_V = substring(triplets$V_Segment_Major_Gene, 0, 4)
|
|
1020 triplets$locus_J = substring(triplets$J_Segment_Major_Gene, 0, 4)
|
|
1021 min_cell_count = data.frame(data.table(triplets)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("uniqueID", "locus_V", "locus_J")])
|
0
|
1022
|
1
|
1023 triplets$min_cell_paste = paste(triplets$uniqueID, triplets$locus_V, triplets$locus_J)
|
|
1024 min_cell_count$min_cell_paste = paste(min_cell_count$uniqueID, min_cell_count$locus_V, min_cell_count$locus_J)
|
0
|
1025
|
1
|
1026 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
|
|
1027
|
|
1028 triplets = merge(triplets, min_cell_count, by="min_cell_paste")
|
|
1029
|
|
1030 triplets$normalized_read_count = round(triplets$Clone_Molecule_Count_From_Spikes / triplets$Cell_Count * triplets$min_cell_count / 2, digits=2)
|
0
|
1031
|
1
|
1032 triplets = triplets[triplets$normalized_read_count >= min_cells,]
|
|
1033
|
|
1034 column_drops = c("min_cell_count", "min_cell_paste")
|
|
1035
|
|
1036 triplets = triplets[,!(colnames(triplets) %in% column_drops)]
|
|
1037
|
|
1038 cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T)
|
0
|
1039
|
1
|
1040 interval = intervalReads
|
|
1041 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
1042 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
0
|
1043
|
1
|
1044 triplets = split(triplets, triplets$Patient, drop=T)
|
|
1045 print(nrow(triplets))
|
|
1046 for(triplet in triplets){
|
|
1047 samples = unique(triplet$Sample)
|
|
1048 one = triplet[triplet$Sample == samples[1],]
|
|
1049 two = triplet[triplet$Sample == samples[2],]
|
|
1050 three = triplet[triplet$Sample == samples[3],]
|
|
1051
|
|
1052 print(paste(nrow(triplet), nrow(one), nrow(two), nrow(three)))
|
|
1053 tripletAnalysis(one, one[1,"uniqueID"], two, two[1,"uniqueID"], three, three[1,"uniqueID"], product=product, interval=interval, on="normalized_read_count", T)
|
|
1054 }
|
|
1055
|
|
1056 cat("<tr><td>Starting Frequency analysis</td></tr>", file=logfile, append=T)
|
|
1057
|
|
1058 interval = intervalFreq
|
|
1059 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
1060 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
|
1061
|
|
1062 for(triplet in triplets){
|
|
1063 samples = unique(triplet$Sample)
|
|
1064 one = triplet[triplet$Sample == samples[1],]
|
|
1065 two = triplet[triplet$Sample == samples[2],]
|
|
1066 three = triplet[triplet$Sample == samples[3],]
|
|
1067 tripletAnalysis(one, one[1,"uniqueID"], two, two[1,"uniqueID"], three, three[1,"uniqueID"], product=product, interval=interval, on="Frequency", F)
|
|
1068 }
|
0
|
1069 } else {
|
|
1070 cat("", file="triplets.txt")
|
|
1071 }
|
|
1072 cat("</table></html>", file=logfile, append=T)
|