0
|
1 args <- commandArgs(trailingOnly = TRUE)
|
|
2 options(scipen=999)
|
|
3
|
|
4 inFile = args[1]
|
|
5 outDir = args[2]
|
|
6 logfile = args[3]
|
|
7 min_freq = as.numeric(args[4])
|
|
8 min_cells = as.numeric(args[5])
|
|
9 mergeOn = args[6]
|
|
10
|
|
11 cat("<html><table><tr><td>Starting analysis</td></tr>", file=logfile, append=F)
|
|
12
|
|
13 library(ggplot2)
|
|
14 library(reshape2)
|
|
15 library(data.table)
|
|
16 library(grid)
|
|
17 library(parallel)
|
|
18 #require(xtable)
|
|
19 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T)
|
|
20 dat = read.table(inFile, header=T, sep="\t", dec=".", fill=T, stringsAsFactors=F)
|
2
|
21
|
|
22 needed_cols = c("Patient", "Receptor", "Sample", "Cell_Count", "Clone_Molecule_Count_From_Spikes", "Log10_Frequency", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence", "Clone_Sequence")
|
|
23 if(!all(needed_cols %in% names(dat))){
|
|
24 cat("Missing column(s):<br />", file=logfile, append=F)
|
|
25 missing_cols = needed_cols[!(needed_cols %in% names(dat))]
|
|
26 for(col in missing_cols){
|
|
27 cat(paste(col, "<br />"), file=logfile, append=T)
|
|
28 }
|
|
29 stop("Not all columns are present")
|
|
30 }
|
|
31
|
|
32 if(!("Total_Read_Count" %in% names(dat))){
|
|
33 dat$Total_Read_Count = 0
|
|
34 }
|
|
35
|
|
36 dat = dat[,c("Patient", "Receptor", "Sample", "Cell_Count", "Clone_Molecule_Count_From_Spikes", "Log10_Frequency", "Total_Read_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence", "Clone_Sequence")]
|
|
37
|
0
|
38 dat$dsPerM = 0
|
|
39 dat = dat[!is.na(dat$Patient),]
|
|
40 dat$Related_to_leukemia_clone = F
|
|
41
|
|
42 setwd(outDir)
|
|
43 cat("<tr><td>Selecting first V/J Genes</td></tr>", file=logfile, append=T)
|
|
44 dat$V_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$V_Segment_Major_Gene), "; "), "[[", 1)))
|
|
45 dat$J_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$J_Segment_Major_Gene), "; "), "[[", 1)))
|
|
46
|
|
47 cat("<tr><td>Calculating Frequency</td></tr>", file=logfile, append=T)
|
|
48
|
|
49 dat$Frequency = ((10^dat$Log10_Frequency)*100)
|
5
|
50 dat = dat[dat$Frequency <= 100,] #just in case?
|
0
|
51
|
|
52 dat = dat[dat$Frequency >= min_freq,]
|
|
53
|
1
|
54 patient.sample.counts = data.frame(data.table(dat)[, list(count=.N), by=c("Patient", "Sample")])
|
|
55 patient.sample.counts = data.frame(data.table(patient.sample.counts)[, list(count=.N), by=c("Patient")])
|
|
56
|
|
57 print("Found the following patients with number of samples:")
|
|
58 print(patient.sample.counts)
|
|
59
|
|
60 patient.sample.counts.pairs = patient.sample.counts[patient.sample.counts$count %in% 1:2,]
|
|
61 patient.sample.counts.triplets = patient.sample.counts[patient.sample.counts$count == 3,]
|
|
62
|
|
63
|
|
64
|
|
65 triplets = dat[dat$Patient %in% patient.sample.counts.triplets$Patient,]
|
|
66 dat = dat[dat$Patient %in% patient.sample.counts.pairs$Patient,]
|
0
|
67
|
|
68 cat("<tr><td>Normalizing to lowest cell count within locus</td></tr>", file=logfile, append=T)
|
|
69
|
|
70 dat$locus_V = substring(dat$V_Segment_Major_Gene, 0, 4)
|
|
71 dat$locus_J = substring(dat$J_Segment_Major_Gene, 0, 4)
|
|
72 min_cell_count = data.frame(data.table(dat)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("Patient", "locus_V", "locus_J")])
|
|
73
|
|
74 dat$min_cell_paste = paste(dat$Patient, dat$locus_V, dat$locus_J)
|
|
75 min_cell_count$min_cell_paste = paste(min_cell_count$Patient, min_cell_count$locus_V, min_cell_count$locus_J)
|
|
76
|
|
77 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
|
|
78 print(paste("rows:", nrow(dat)))
|
|
79 dat = merge(dat, min_cell_count, by="min_cell_paste")
|
|
80 print(paste("rows:", nrow(dat)))
|
|
81 dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * dat$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2
|
|
82
|
|
83 dat = dat[dat$normalized_read_count >= min_cells,]
|
|
84
|
|
85 dat$paste = paste(dat$Sample, dat$Clone_Sequence)
|
|
86
|
|
87 patients = split(dat, dat$Patient, drop=T)
|
|
88 intervalReads = rev(c(0,10,25,50,100,250,500,750,1000,10000))
|
|
89 intervalFreq = rev(c(0,0.01,0.05,0.1,0.5,1,5))
|
|
90 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV")
|
|
91 J_Segments = c(".*", ".*", ".*", "IGKJ", "KDE", ".*", ".*", ".*", ".*", ".*")
|
|
92 Titles = c("Total", "IGH-Vh-Jh", "IGH-Dh-Jh", "Vk-Jk", "Vk-Kde" , "Intron-Kde", "TCRG", "TCRD-Vd-Dd", "TCRD-Dd-Dd", "TCRB-Vb-Jb")
|
|
93 Titles = factor(Titles, levels=Titles)
|
|
94 TitlesOrder = data.frame("Title"=Titles, "TitlesOrder"=1:length(Titles))
|
|
95
|
2
|
96 single_patients = dat[NULL,]
|
0
|
97
|
|
98 patient.merge.list = list() #cache the 'both' table, 2x speedup for more memory...
|
|
99 patient.merge.list.second = list()
|
|
100 scatter_locus_data_list = list()
|
|
101 cat(paste("<table border='0' style='font-family:courier;'>", sep=""), file="multiple_matches.html", append=T)
|
|
102 cat(paste("<table border='0' style='font-family:courier;'>", sep=""), file="single_matches.html", append=T)
|
|
103 patientCountOnColumn <- function(x, product, interval, on, appendtxt=F){
|
|
104 if (!is.data.frame(x) & is.list(x)){
|
|
105 x = x[[1]]
|
|
106 }
|
|
107 #x$Sample = factor(x$Sample, levels=unique(x$Sample))
|
|
108 x = data.frame(x,stringsAsFactors=F)
|
|
109 onShort = "reads"
|
|
110 if(on == "Frequency"){
|
|
111 onShort = "freq"
|
|
112 }
|
|
113 onx = paste(on, ".x", sep="")
|
|
114 ony = paste(on, ".y", sep="")
|
|
115 splt = split(x, x$Sample, drop=T)
|
|
116 type="pair"
|
|
117 if(length(splt) == 1){
|
|
118 print(paste(paste(x[1,which(colnames(x) == "Patient")]), "has one sample"))
|
2
|
119 splt[[2]] = splt[[1]][NULL,]
|
0
|
120 type="single"
|
|
121 }
|
|
122 patient1 = splt[[1]]
|
|
123 patient2 = splt[[2]]
|
|
124
|
2
|
125 oneSample = patient1[1,"Sample"]
|
|
126 twoSample = patient2[1,"Sample"]
|
|
127 patient = x[1,"Patient"]
|
|
128
|
0
|
129 switched = F
|
|
130 if(length(grep(".*_Right$", twoSample)) == 1 || length(grep(".*_Dx_BM$", twoSample)) == 1 || length(grep(".*_Dx$", twoSample)) == 1 ){
|
|
131 tmp = twoSample
|
|
132 twoSample = oneSample
|
|
133 oneSample = tmp
|
|
134 tmp = patient1
|
|
135 patient1 = patient2
|
|
136 patient2 = tmp
|
|
137 switched = T
|
|
138 }
|
|
139 if(appendtxt){
|
|
140 cat(paste(patient, oneSample, twoSample, type, sep="\t"), file="patients.txt", append=T, sep="", fill=3)
|
|
141 }
|
|
142 cat(paste("<tr><td>", patient, "</td>", sep=""), file=logfile, append=T)
|
|
143
|
|
144 if(mergeOn == "Clone_Sequence"){
|
|
145 patient1$merge = paste(patient1$Clone_Sequence)
|
|
146 patient2$merge = paste(patient2$Clone_Sequence)
|
|
147 } else {
|
|
148 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
|
|
149 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
|
|
150 }
|
|
151
|
|
152 scatterplot_data_columns = c("Patient", "Sample", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge")
|
|
153 scatterplot_data = patient1[NULL,scatterplot_data_columns]
|
|
154 scatterplot.data.type.factor = c(oneSample, twoSample, paste(c(oneSample, twoSample), "In Both"))
|
|
155 scatterplot_data$type = character(0)
|
|
156 scatterplot_data$link = numeric(0)
|
|
157 scatterplot_data$on = character(0)
|
|
158
|
|
159 patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge")[NULL,] #blegh
|
2
|
160
|
0
|
161 cs.exact.matches = patient1[patient1$Clone_Sequence %in% patient2$Clone_Sequence,]$Clone_Sequence
|
2
|
162
|
0
|
163 start.time = proc.time()
|
|
164 merge.list = c()
|
2
|
165
|
0
|
166 if(patient %in% names(patient.merge.list)){
|
|
167 patientMerge = patient.merge.list[[patient]]
|
|
168 merge.list[["second"]] = patient.merge.list.second[[patient]]
|
|
169 scatterplot_data = scatter_locus_data_list[[patient]]
|
|
170 cat(paste("<td>", nrow(patient1), " in ", oneSample, " and ", nrow(patient2), " in ", twoSample, ", ", nrow(patientMerge), " in both (fetched from cache)</td></tr>", sep=""), file=logfile, append=T)
|
2
|
171
|
5
|
172 #print(names(patient.merge.list))
|
0
|
173 } else {
|
|
174 #fuzzy matching here...
|
2
|
175
|
0
|
176 patient1.fuzzy = patient1
|
|
177 patient2.fuzzy = patient2
|
2
|
178
|
0
|
179 patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J)
|
|
180 patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J)
|
2
|
181
|
0
|
182 patient.fuzzy = rbind(patient1.fuzzy, patient2.fuzzy)
|
|
183 patient.fuzzy = patient.fuzzy[order(nchar(patient.fuzzy$Clone_Sequence)),]
|
2
|
184
|
0
|
185 merge.list = list()
|
2
|
186
|
0
|
187 merge.list[["second"]] = vector()
|
2
|
188
|
|
189 link.count = 1
|
|
190
|
0
|
191 while(nrow(patient.fuzzy) > 1){
|
|
192 first.merge = patient.fuzzy[1,"merge"]
|
|
193 first.clone.sequence = patient.fuzzy[1,"Clone_Sequence"]
|
|
194 first.sample = patient.fuzzy[1,"Sample"]
|
|
195 merge.filter = first.merge == patient.fuzzy$merge
|
2
|
196
|
0
|
197 #length.filter = nchar(patient.fuzzy$Clone_Sequence) - nchar(first.clone.sequence) <= 9
|
2
|
198
|
0
|
199 first.sample.filter = first.sample == patient.fuzzy$Sample
|
|
200 second.sample.filter = first.sample != patient.fuzzy$Sample
|
2
|
201
|
0
|
202 #first match same sample, sum to a single row, same for other sample
|
|
203 #then merge rows like 'normal'
|
2
|
204
|
0
|
205 sequence.filter = grepl(paste("^", first.clone.sequence, sep=""), patient.fuzzy$Clone_Sequence)
|
2
|
206
|
|
207
|
|
208
|
0
|
209 #match.filter = merge.filter & grepl(first.clone.sequence, patient.fuzzy$Clone_Sequence) & length.filter & sample.filter
|
|
210 first.match.filter = merge.filter & sequence.filter & first.sample.filter
|
|
211 second.match.filter = merge.filter & sequence.filter & second.sample.filter
|
2
|
212
|
0
|
213 first.rows = patient.fuzzy[first.match.filter,]
|
|
214 second.rows = patient.fuzzy[second.match.filter,]
|
2
|
215
|
0
|
216 first.rows.v = table(first.rows$V_Segment_Major_Gene)
|
|
217 first.rows.v = names(first.rows.v[which.max(first.rows.v)])
|
|
218 first.rows.j = table(first.rows$J_Segment_Major_Gene)
|
|
219 first.rows.j = names(first.rows.j[which.max(first.rows.j)])
|
2
|
220
|
0
|
221 first.sum = data.frame(merge = first.clone.sequence,
|
|
222 Patient = patient,
|
|
223 Receptor = first.rows[1,"Receptor"],
|
|
224 Sample = first.rows[1,"Sample"],
|
|
225 Cell_Count = first.rows[1,"Cell_Count"],
|
|
226 Clone_Molecule_Count_From_Spikes = sum(first.rows$Clone_Molecule_Count_From_Spikes),
|
|
227 Log10_Frequency = log10(sum(first.rows$Frequency)),
|
|
228 Total_Read_Count = sum(first.rows$Total_Read_Count),
|
|
229 dsPerM = sum(first.rows$dsPerM),
|
|
230 J_Segment_Major_Gene = first.rows.j,
|
|
231 V_Segment_Major_Gene = first.rows.v,
|
|
232 Clone_Sequence = first.clone.sequence,
|
|
233 CDR3_Sense_Sequence = first.rows[1,"CDR3_Sense_Sequence"],
|
|
234 Related_to_leukemia_clone = F,
|
|
235 Frequency = sum(first.rows$Frequency),
|
|
236 locus_V = first.rows[1,"locus_V"],
|
|
237 locus_J = first.rows[1,"locus_J"],
|
|
238 min_cell_count = first.rows[1,"min_cell_count"],
|
|
239 normalized_read_count = sum(first.rows$normalized_read_count),
|
|
240 paste = first.rows[1,"paste"],
|
|
241 min_cell_paste = first.rows[1,"min_cell_paste"])
|
2
|
242
|
0
|
243 if(nrow(second.rows) > 0){
|
|
244 second.rows.v = table(second.rows$V_Segment_Major_Gene)
|
|
245 second.rows.v = names(second.rows.v[which.max(second.rows.v)])
|
|
246 second.rows.j = table(second.rows$J_Segment_Major_Gene)
|
|
247 second.rows.j = names(second.rows.j[which.max(second.rows.j)])
|
2
|
248
|
0
|
249 second.sum = data.frame(merge = first.clone.sequence,
|
2
|
250 Patient = patient,
|
|
251 Receptor = second.rows[1,"Receptor"],
|
|
252 Sample = second.rows[1,"Sample"],
|
|
253 Cell_Count = second.rows[1,"Cell_Count"],
|
|
254 Clone_Molecule_Count_From_Spikes = sum(second.rows$Clone_Molecule_Count_From_Spikes),
|
|
255 Log10_Frequency = log10(sum(second.rows$Frequency)),
|
|
256 Total_Read_Count = sum(second.rows$Total_Read_Count),
|
|
257 dsPerM = sum(second.rows$dsPerM),
|
|
258 J_Segment_Major_Gene = second.rows.j,
|
|
259 V_Segment_Major_Gene = second.rows.v,
|
|
260 Clone_Sequence = first.clone.sequence,
|
|
261 CDR3_Sense_Sequence = second.rows[1,"CDR3_Sense_Sequence"],
|
|
262 Related_to_leukemia_clone = F,
|
|
263 Frequency = sum(second.rows$Frequency),
|
|
264 locus_V = second.rows[1,"locus_V"],
|
|
265 locus_J = second.rows[1,"locus_J"],
|
|
266 min_cell_count = second.rows[1,"min_cell_count"],
|
|
267 normalized_read_count = sum(second.rows$normalized_read_count),
|
|
268 paste = second.rows[1,"paste"],
|
|
269 min_cell_paste = second.rows[1,"min_cell_paste"])
|
|
270
|
|
271 #print(names(patientMerge))
|
|
272 #print(merge(first.sum, second.sum, by="merge"))
|
0
|
273 patientMerge = rbind(patientMerge, merge(first.sum, second.sum, by="merge"))
|
2
|
274 #print("test2")
|
0
|
275 patient.fuzzy = patient.fuzzy[!(first.match.filter | second.match.filter),]
|
2
|
276
|
0
|
277 hidden.clone.sequences = c(first.rows[-1,"Clone_Sequence"], second.rows[second.rows$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
|
|
278 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
2
|
279
|
0
|
280 tmp.rows = rbind(first.rows, second.rows)
|
2
|
281 #print("test3")
|
0
|
282 tmp.rows = tmp.rows[order(nchar(tmp.rows$Clone_Sequence)),]
|
|
283
|
|
284
|
|
285 #add to the scatterplot data
|
|
286 scatterplot.row = first.sum[,scatterplot_data_columns]
|
2
|
287 scatterplot.row$type = paste(first.sum[,"Sample"], "In Both")
|
|
288 scatterplot.row$link = link.count
|
|
289 scatterplot.row$on = onShort
|
|
290
|
|
291 scatterplot_data = rbind(scatterplot_data, scatterplot.row)
|
0
|
292
|
|
293 scatterplot.row = second.sum[,scatterplot_data_columns]
|
2
|
294 scatterplot.row$type = paste(second.sum[,"Sample"], "In Both")
|
|
295 scatterplot.row$link = link.count
|
|
296 scatterplot.row$on = onShort
|
|
297
|
|
298 scatterplot_data = rbind(scatterplot_data, scatterplot.row)
|
|
299
|
|
300 #write some information about the match to a log file
|
0
|
301 if (nrow(first.rows) > 1 | nrow(second.rows) > 1) {
|
|
302 cat(paste("<tr><td>", patient, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="multiple_matches.html", append=T)
|
|
303 } else {
|
|
304 second.clone.sequence = second.rows[1,"Clone_Sequence"]
|
|
305 if(nchar(first.clone.sequence) != nchar(second.clone.sequence)){
|
|
306 cat(paste("<tr bgcolor='#DDD'><td>", patient, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="single_matches.html", append=T)
|
|
307 } else {
|
|
308 #cat(paste("<tr><td>", patient, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="single_matches.html", append=T)
|
|
309 }
|
|
310 }
|
2
|
311
|
0
|
312 } else if(nrow(first.rows) > 1) {
|
|
313 if(patient1[1,"Sample"] == first.sample){
|
|
314 patient1 = patient1[!(patient1$Clone_Sequence %in% first.rows$Clone_Sequence),]
|
|
315 patient1 = rbind(patient1, first.sum)
|
|
316 } else {
|
|
317 patient2 = patient2[!(patient2$Clone_Sequence %in% first.rows$Clone_Sequence),]
|
|
318 patient2 = rbind(patient2, first.sum)
|
|
319 }
|
2
|
320
|
0
|
321 hidden.clone.sequences = c(first.rows[-1,"Clone_Sequence"])
|
|
322 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
2
|
323
|
0
|
324 patient.fuzzy = patient.fuzzy[-first.match.filter,]
|
|
325
|
|
326 #add to the scatterplot data
|
|
327 scatterplot.row = first.sum[,scatterplot_data_columns]
|
2
|
328 scatterplot.row$type = first.sum[,"Sample"]
|
|
329 scatterplot.row$link = link.count
|
|
330 scatterplot.row$on = onShort
|
|
331
|
|
332 scatterplot_data = rbind(scatterplot_data, scatterplot.row)
|
|
333
|
0
|
334 cat(paste("<tr bgcolor='#DDF'><td>", patient, " row ", 1:nrow(first.rows), "</td><td>", first.rows$Sample, ":</td><td>", first.rows$Clone_Sequence, "</td><td>", first.rows$normalized_read_count, "</td></tr>", sep=""), file="single_matches.html", append=T)
|
|
335 } else {
|
|
336 patient.fuzzy = patient.fuzzy[-1,]
|
|
337
|
|
338 #add to the scatterplot data
|
|
339 scatterplot.row = first.sum[,scatterplot_data_columns]
|
2
|
340 scatterplot.row$type = first.sum[,"Sample"]
|
|
341 scatterplot.row$link = link.count
|
|
342 scatterplot.row$on = onShort
|
|
343
|
|
344 scatterplot_data = rbind(scatterplot_data, scatterplot.row)
|
0
|
345 }
|
|
346 link.count = link.count + 1
|
|
347 }
|
|
348 patient.merge.list[[patient]] <<- patientMerge
|
|
349 patient.merge.list.second[[patient]] <<- merge.list[["second"]]
|
|
350
|
|
351 sample.order = data.frame(type = c(oneSample, twoSample, paste(c(oneSample, twoSample), "In Both")),type.order = 1:4)
|
|
352 scatterplot_data = merge(scatterplot_data, sample.order, by="type")
|
|
353
|
|
354 scatter_locus_data_list[[patient]] <<- scatterplot_data
|
|
355 cat(paste("<td>", nrow(patient1), " in ", oneSample, " and ", nrow(patient2), " in ", twoSample, ", ", nrow(patientMerge), " in both (finding both took ", (proc.time() - start.time)[[3]], "s)</td></tr>", sep=""), file=logfile, append=T)
|
|
356 }
|
2
|
357
|
0
|
358 patient1 = patient1[!(patient1$Clone_Sequence %in% patient.merge.list.second[[patient]]),]
|
|
359 patient2 = patient2[!(patient2$Clone_Sequence %in% patient.merge.list.second[[patient]]),]
|
2
|
360
|
0
|
361
|
|
362 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony])
|
|
363 #patientMerge$thresholdValue = pmin(patientMerge[,onx], patientMerge[,ony])
|
|
364 res1 = vector()
|
|
365 res2 = vector()
|
|
366 resBoth = vector()
|
|
367 read1Count = vector()
|
|
368 read2Count = vector()
|
|
369 locussum1 = vector()
|
|
370 locussum2 = vector()
|
|
371
|
|
372 #for(iter in 1){
|
|
373 for(iter in 1:length(product[,1])){
|
2
|
374 threshhold = product[iter,"interval"]
|
|
375 V_Segment = paste(".*", as.character(product[iter,"V_Segments"]), ".*", sep="")
|
|
376 J_Segment = paste(".*", as.character(product[iter,"J_Segments"]), ".*", sep="")
|
0
|
377 #both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,onx] > threshhold & patientMerge[,ony] > threshhold) #both higher than threshold
|
|
378 both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold) #highest of both is higher than threshold
|
|
379 one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$merge %in% patientMerge[both,]$merge))
|
|
380 two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$merge %in% patientMerge[both,]$merge))
|
|
381 read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count))
|
|
382 read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count))
|
|
383 res1 = append(res1, sum(one))
|
|
384 res2 = append(res2, sum(two))
|
|
385 resBoth = append(resBoth, sum(both))
|
|
386 locussum1 = append(locussum1, sum(patient1[(grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene)),]$normalized_read_count))
|
|
387 locussum2 = append(locussum2, sum(patient2[(grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene)),]$normalized_read_count))
|
|
388 #threshhold = 0
|
2
|
389 if(threshhold != 0 | T){
|
0
|
390 if(sum(one) > 0){
|
|
391 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
392 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone Sequence", "Related_to_leukemia_clone")
|
2
|
393 filenameOne = paste(oneSample, "_", product[iter, "Titles"], "_", threshhold, sep="")
|
0
|
394 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
395 }
|
|
396 if(sum(two) > 0){
|
|
397 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
398 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone Sequence", "Related_to_leukemia_clone")
|
2
|
399 filenameTwo = paste(twoSample, "_", product[iter, "Titles"], "_", threshhold, sep="")
|
0
|
400 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
401 }
|
6
|
402 }
|
|
403 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
|
|
404 if(nrow(scatterplot_locus_data) > 0){
|
|
405 scatterplot_locus_data$Rearrangement = product[iter, "Titles"]
|
|
406 }
|
|
407 p = NULL
|
|
408 #print(paste("nrow scatterplot_locus_data", nrow(scatterplot_locus_data)))
|
|
409 if(nrow(scatterplot_locus_data) != 0){
|
|
410 if(on == "normalized_read_count"){
|
|
411 write.table(scatterplot_locus_data, file=paste(oneSample, twoSample, product[iter, "Titles"], "scatterplot_locus_data.txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
412 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
|
413 p = ggplot(scatterplot_locus_data, aes(factor(reorder(type, type.order)), normalized_read_count, group=link)) + geom_line() + scale_y_log10(breaks=scales,labels=scales, limits=c(1,1e6)) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE)
|
|
414 } else {
|
|
415 p = ggplot(scatterplot_locus_data, aes(factor(reorder(type, type.order)), Frequency, group=link)) + geom_line() + scale_y_log10(limits=c(0.0001,100), breaks=c(0.0001, 0.001, 0.01, 0.1, 1, 10, 100), labels=c("0.0001", "0.001", "0.01", "0.1", "1", "10", "100")) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE)
|
0
|
416 }
|
6
|
417 p = p + geom_point(aes(colour=type), position="dodge")
|
|
418 p = p + xlab("In one or both samples") + ylab(onShort) + ggtitle(paste(patient1[1,"Patient"], patient1[1,"Sample"], patient2[1,"Sample"], onShort, product[iter, "Titles"]))
|
|
419 } else {
|
|
420 p = ggplot(NULL, aes(x=c("In one", "In Both"),y=0)) + geom_blank(NULL) + xlab("In one or both of the samples") + ylab(onShort) + ggtitle(paste(patient1[1,"Patient"], patient1[1,"Sample"], patient2[1,"Sample"], onShort, product[iter, "Titles"]))
|
0
|
421 }
|
6
|
422 png(paste(patient1[1,"Patient"], "_", patient1[1,"Sample"], "_", patient2[1,"Sample"], "_", onShort, "_", product[iter, "Titles"],"_scatter.png", sep=""))
|
|
423 print(p)
|
|
424 dev.off()
|
0
|
425 if(sum(both) > 0){
|
|
426 dfBoth = patientMerge[both,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
427 colnames(dfBoth) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample))
|
2
|
428 filenameBoth = paste(oneSample, "_", twoSample, "_", product[iter, "Titles"], "_", threshhold, sep="")
|
0
|
429 write.table(dfBoth, file=paste(filenameBoth, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
430 }
|
|
431 }
|
|
432 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "Both"=resBoth, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "Sum"=res1 + res2 + resBoth, "percentage" = round((resBoth/(res1 + res2 + resBoth)) * 100, digits=2), "Locus_sum1"=locussum1, "Locus_sum2"=locussum2)
|
|
433 if(sum(is.na(patientResult$percentage)) > 0){
|
|
434 patientResult[is.na(patientResult$percentage),]$percentage = 0
|
|
435 }
|
|
436 colnames(patientResult)[6] = oneSample
|
|
437 colnames(patientResult)[8] = twoSample
|
|
438 colnamesBak = colnames(patientResult)
|
|
439 colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", paste("Number of sequences ", patient, "_Both", sep=""), paste("Number of sequences", oneSample, sep=""), paste("Normalized Read Count", oneSample), paste("Number of sequences", twoSample, sep=""), paste("Normalized Read Count", twoSample), paste("Sum number of sequences", patient), paste("Percentage of sequences ", patient, "_Both", sep=""), paste("Locus Sum", oneSample), paste("Locus Sum", twoSample))
|
|
440 write.table(patientResult, file=paste(patient, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
441 colnames(patientResult) = colnamesBak
|
|
442
|
|
443 patientResult$Locus = factor(patientResult$Locus, Titles)
|
|
444 patientResult$cut_off_value = factor(patientResult$cut_off_value, paste(">", interval, sep=""))
|
|
445
|
|
446 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "Both")])
|
|
447 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=Both), stat='identity', position="dodge", fill="#79c36a")
|
|
448 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
449 plt = plt + geom_text(aes(ymax=max(Both), x=cut_off_value,y=Both,label=Both), angle=90, hjust=0)
|
|
450 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in both")
|
|
451 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
452 png(paste(patient, "_", onShort, ".png", sep=""), width=1920, height=1080)
|
|
453 print(plt)
|
|
454 dev.off()
|
|
455 #(t,r,b,l)
|
|
456 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "percentage")])
|
|
457 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=percentage), stat='identity', position="dodge", fill="#79c36a")
|
|
458 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
459 plt = plt + geom_text(aes(ymax=max(percentage), x=cut_off_value,y=percentage,label=percentage), angle=90, hjust=0)
|
|
460 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("% clones in both left and right")
|
|
461 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
462 png(paste(patient, "_percent_", onShort, ".png", sep=""), width=1920, height=1080)
|
|
463 print(plt)
|
|
464 dev.off()
|
|
465
|
|
466 patientResult = melt(patientResult[,c('Locus','cut_off_value', oneSample, twoSample)] ,id.vars=1:2)
|
|
467 patientResult$relativeValue = patientResult$value * 10
|
|
468 patientResult[patientResult$relativeValue == 0,]$relativeValue = 1
|
|
469 plt = ggplot(patientResult)
|
|
470 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=relativeValue, fill=variable), stat='identity', position="dodge")
|
|
471 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
472 plt = plt + scale_y_continuous(trans="log", breaks=10^c(0:10), labels=c(0, 10^c(0:9)))
|
|
473 plt = plt + geom_text(data=patientResult[patientResult$variable == oneSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=-0.2)
|
|
474 plt = plt + geom_text(data=patientResult[patientResult$variable == twoSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=0.8)
|
|
475 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle(paste("Number of clones in only ", oneSample, " and only ", twoSample, sep=""))
|
|
476 png(paste(patient, "_", onShort, "_both.png", sep=""), width=1920, height=1080)
|
|
477 print(plt)
|
|
478 dev.off()
|
|
479 }
|
2
|
480
|
1
|
481 if(length(patients) > 0){
|
|
482 cat("<tr><td>Starting Frequency analysis</td></tr>", file=logfile, append=T)
|
0
|
483
|
1
|
484 interval = intervalFreq
|
|
485 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
486 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
2
|
487 for (current_patient in patients){
|
|
488 print(paste("Started working", unique(current_patient$Patient), "Frequency analysis"))
|
|
489 patientCountOnColumn(current_patient, product=product, interval=interval, on="Frequency", appendtxt=T)
|
|
490 }
|
0
|
491
|
1
|
492 cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T)
|
0
|
493
|
1
|
494 interval = intervalReads
|
|
495 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
496 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
2
|
497 for (current_patient in patients){
|
|
498 print(paste("Started working on ", unique(current_patient$Patient), "Read Count analysis"))
|
|
499 patientCountOnColumn(current_patient, product=product, interval=interval, on="normalized_read_count")
|
|
500 }
|
1
|
501 }
|
0
|
502 if(nrow(single_patients) > 0){
|
|
503 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
|
504 p = ggplot(single_patients, aes(Rearrangement, normalized_read_count)) + scale_y_log10(breaks=scales,labels=as.character(scales)) + expand_limits(y=c(0,1000000))
|
|
505 p = p + geom_point(aes(colour=type), position="jitter")
|
|
506 p = p + xlab("In one or both samples") + ylab("Reads")
|
|
507 p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the reads of the patients with a single sample")
|
|
508 png("singles_reads_scatterplot.png", width=640 * length(unique(single_patients$Patient)) + 100, height=1080)
|
|
509 print(p)
|
|
510 dev.off()
|
|
511
|
|
512 #p = ggplot(single_patients, aes(Rearrangement, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
|
|
513 p = ggplot(single_patients, aes(Rearrangement, Frequency)) + scale_y_log10(limits=c(0.0001,100), breaks=c(0.0001, 0.001, 0.01, 0.1, 1, 10, 100), labels=c("0.0001", "0.001", "0.01", "0.1", "1", "10", "100")) + expand_limits(y=c(0,100))
|
|
514 p = p + geom_point(aes(colour=type), position="jitter")
|
|
515 p = p + xlab("In one or both samples") + ylab("Frequency")
|
|
516 p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the frequency of the patients with a single sample")
|
|
517 png("singles_freq_scatterplot.png", width=640 * length(unique(single_patients$Patient)) + 100, height=1080)
|
|
518 print(p)
|
|
519 dev.off()
|
|
520 } else {
|
|
521 empty <- data.frame()
|
|
522 p = ggplot(empty) + geom_point() + xlim(0, 10) + ylim(0, 100) + xlab("In one or both samples") + ylab("Frequency") + ggtitle("Scatterplot of the frequency of the patients with a single sample")
|
|
523
|
|
524 png("singles_reads_scatterplot.png", width=400, height=300)
|
|
525 print(p)
|
|
526 dev.off()
|
|
527
|
|
528 png("singles_freq_scatterplot.png", width=400, height=300)
|
|
529 print(p)
|
|
530 dev.off()
|
|
531 }
|
|
532
|
|
533 patient.merge.list = list() #cache the 'both' table, 2x speedup for more memory...
|
|
534 patient.merge.list.second = list()
|
|
535
|
|
536 tripletAnalysis <- function(patient1, label1, patient2, label2, patient3, label3, product, interval, on, appendTriplets= FALSE){
|
1
|
537 onShort = "reads"
|
|
538 if(on == "Frequency"){
|
|
539 onShort = "freq"
|
|
540 }
|
|
541 onx = paste(on, ".x", sep="")
|
|
542 ony = paste(on, ".y", sep="")
|
|
543 onz = paste(on, ".z", sep="")
|
|
544 type="triplet"
|
0
|
545
|
1
|
546 threshholdIndex = which(colnames(product) == "interval")
|
|
547 V_SegmentIndex = which(colnames(product) == "V_Segments")
|
|
548 J_SegmentIndex = which(colnames(product) == "J_Segments")
|
|
549 titleIndex = which(colnames(product) == "Titles")
|
|
550 sampleIndex = which(colnames(patient1) == "Sample")
|
|
551 patientIndex = which(colnames(patient1) == "Patient")
|
|
552 oneSample = paste(patient1[1,sampleIndex], sep="")
|
|
553 twoSample = paste(patient2[1,sampleIndex], sep="")
|
|
554 threeSample = paste(patient3[1,sampleIndex], sep="")
|
|
555
|
|
556 if(mergeOn == "Clone_Sequence"){
|
|
557 patient1$merge = paste(patient1$Clone_Sequence)
|
0
|
558 patient2$merge = paste(patient2$Clone_Sequence)
|
|
559 patient3$merge = paste(patient3$Clone_Sequence)
|
|
560
|
1
|
561 } else {
|
0
|
562 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
|
|
563 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
|
|
564 patient3$merge = paste(patient3$V_Segment_Major_Gene, patient3$J_Segment_Major_Gene, patient3$CDR3_Sense_Sequence)
|
1
|
565 }
|
0
|
566
|
1
|
567 #patientMerge = merge(patient1, patient2, by="merge")[NULL,]
|
|
568 patient1.fuzzy = patient1
|
|
569 patient2.fuzzy = patient2
|
|
570 patient3.fuzzy = patient3
|
0
|
571
|
1
|
572 cat(paste("<tr><td>", label1, "</td>", sep=""), file=logfile, append=T)
|
0
|
573
|
1
|
574 patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J)
|
|
575 patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J)
|
|
576 patient3.fuzzy$merge = paste(patient3.fuzzy$locus_V, patient3.fuzzy$locus_J)
|
0
|
577
|
1
|
578 patient.fuzzy = rbind(patient1.fuzzy ,patient2.fuzzy, patient3.fuzzy)
|
|
579 patient.fuzzy = patient.fuzzy[order(nchar(patient.fuzzy$Clone_Sequence)),]
|
0
|
580
|
1
|
581 other.sample.list = list()
|
|
582 other.sample.list[[oneSample]] = c(twoSample, threeSample)
|
|
583 other.sample.list[[twoSample]] = c(oneSample, threeSample)
|
|
584 other.sample.list[[threeSample]] = c(oneSample, twoSample)
|
0
|
585
|
1
|
586 patientMerge = merge(patient1, patient2, by="merge")
|
|
587 patientMerge = merge(patientMerge, patient3, by="merge")
|
|
588 colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge)))] = paste(colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge), perl=T))], ".z", sep="")
|
|
589 #patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
|
590 patientMerge = patientMerge[NULL,]
|
0
|
591
|
1
|
592 duo.merge.list = list()
|
0
|
593
|
1
|
594 patientMerge12 = merge(patient1, patient2, by="merge")
|
|
595 #patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony])
|
|
596 patientMerge12 = patientMerge12[NULL,]
|
|
597 duo.merge.list[[paste(oneSample, twoSample)]] = patientMerge12
|
|
598 duo.merge.list[[paste(twoSample, oneSample)]] = patientMerge12
|
0
|
599
|
1
|
600 patientMerge13 = merge(patient1, patient3, by="merge")
|
|
601 #patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony])
|
|
602 patientMerge13 = patientMerge13[NULL,]
|
|
603 duo.merge.list[[paste(oneSample, threeSample)]] = patientMerge13
|
|
604 duo.merge.list[[paste(threeSample, oneSample)]] = patientMerge13
|
0
|
605
|
1
|
606 patientMerge23 = merge(patient2, patient3, by="merge")
|
|
607 #patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony])
|
|
608 patientMerge23 = patientMerge23[NULL,]
|
|
609 duo.merge.list[[paste(twoSample, threeSample)]] = patientMerge23
|
|
610 duo.merge.list[[paste(threeSample, twoSample)]] = patientMerge23
|
0
|
611
|
1
|
612 merge.list = list()
|
|
613 merge.list[["second"]] = vector()
|
|
614
|
|
615 #print(paste(nrow(patient1), nrow(patient2), nrow(patient3), label1, label2, label3))
|
|
616
|
|
617 start.time = proc.time()
|
|
618 if(paste(label1, "123") %in% names(patient.merge.list)){
|
|
619 patientMerge = patient.merge.list[[paste(label1, "123")]]
|
|
620 patientMerge12 = patient.merge.list[[paste(label1, "12")]]
|
|
621 patientMerge13 = patient.merge.list[[paste(label1, "13")]]
|
|
622 patientMerge23 = patient.merge.list[[paste(label1, "23")]]
|
0
|
623
|
1
|
624 #merge.list[["second"]] = patient.merge.list.second[[label1]]
|
0
|
625
|
1
|
626 cat(paste("<td>", nrow(patient1), " in ", label1, " and ", nrow(patient2), " in ", label2, nrow(patient3), " in ", label3, ", ", nrow(patientMerge), " in both (fetched from cache)</td></tr>", sep=""), file=logfile, append=T)
|
|
627 } else {
|
|
628 while(nrow(patient.fuzzy) > 0){
|
|
629 first.merge = patient.fuzzy[1,"merge"]
|
|
630 first.clone.sequence = patient.fuzzy[1,"Clone_Sequence"]
|
|
631 first.sample = paste(patient.fuzzy[1,"Sample"], sep="")
|
|
632
|
|
633 merge.filter = first.merge == patient.fuzzy$merge
|
|
634
|
|
635 second.sample = other.sample.list[[first.sample]][1]
|
|
636 third.sample = other.sample.list[[first.sample]][2]
|
0
|
637
|
1
|
638 sample.filter.1 = first.sample == patient.fuzzy$Sample
|
|
639 sample.filter.2 = second.sample == patient.fuzzy$Sample
|
|
640 sample.filter.3 = third.sample == patient.fuzzy$Sample
|
0
|
641
|
1
|
642 sequence.filter = grepl(paste("^", first.clone.sequence, sep=""), patient.fuzzy$Clone_Sequence)
|
0
|
643
|
1
|
644 match.filter.1 = sample.filter.1 & sequence.filter & merge.filter
|
|
645 match.filter.2 = sample.filter.2 & sequence.filter & merge.filter
|
|
646 match.filter.3 = sample.filter.3 & sequence.filter & merge.filter
|
0
|
647
|
1
|
648 matches.in.1 = any(match.filter.1)
|
|
649 matches.in.2 = any(match.filter.2)
|
|
650 matches.in.3 = any(match.filter.3)
|
0
|
651
|
1
|
652 rows.1 = patient.fuzzy[match.filter.1,]
|
0
|
653
|
1
|
654 sum.1 = data.frame(merge = first.clone.sequence,
|
|
655 Patient = label1,
|
|
656 Receptor = rows.1[1,"Receptor"],
|
|
657 Sample = rows.1[1,"Sample"],
|
|
658 Cell_Count = rows.1[1,"Cell_Count"],
|
|
659 Clone_Molecule_Count_From_Spikes = sum(rows.1$Clone_Molecule_Count_From_Spikes),
|
|
660 Log10_Frequency = log10(sum(rows.1$Frequency)),
|
|
661 Total_Read_Count = sum(rows.1$Total_Read_Count),
|
|
662 dsPerM = sum(rows.1$dsPerM),
|
|
663 J_Segment_Major_Gene = rows.1[1,"J_Segment_Major_Gene"],
|
|
664 V_Segment_Major_Gene = rows.1[1,"V_Segment_Major_Gene"],
|
|
665 Clone_Sequence = first.clone.sequence,
|
|
666 CDR3_Sense_Sequence = rows.1[1,"CDR3_Sense_Sequence"],
|
|
667 Related_to_leukemia_clone = F,
|
|
668 Frequency = sum(rows.1$Frequency),
|
|
669 locus_V = rows.1[1,"locus_V"],
|
|
670 locus_J = rows.1[1,"locus_J"],
|
|
671 uniqueID = rows.1[1,"uniqueID"],
|
|
672 normalized_read_count = sum(rows.1$normalized_read_count))
|
|
673 sum.2 = sum.1[NULL,]
|
|
674 rows.2 = patient.fuzzy[match.filter.2,]
|
|
675 if(matches.in.2){
|
|
676 sum.2 = data.frame(merge = first.clone.sequence,
|
|
677 Patient = label1,
|
|
678 Receptor = rows.2[1,"Receptor"],
|
|
679 Sample = rows.2[1,"Sample"],
|
|
680 Cell_Count = rows.2[1,"Cell_Count"],
|
|
681 Clone_Molecule_Count_From_Spikes = sum(rows.2$Clone_Molecule_Count_From_Spikes),
|
|
682 Log10_Frequency = log10(sum(rows.2$Frequency)),
|
|
683 Total_Read_Count = sum(rows.2$Total_Read_Count),
|
|
684 dsPerM = sum(rows.2$dsPerM),
|
|
685 J_Segment_Major_Gene = rows.2[1,"J_Segment_Major_Gene"],
|
|
686 V_Segment_Major_Gene = rows.2[1,"V_Segment_Major_Gene"],
|
|
687 Clone_Sequence = first.clone.sequence,
|
|
688 CDR3_Sense_Sequence = rows.2[1,"CDR3_Sense_Sequence"],
|
|
689 Related_to_leukemia_clone = F,
|
|
690 Frequency = sum(rows.2$Frequency),
|
|
691 locus_V = rows.2[1,"locus_V"],
|
|
692 locus_J = rows.2[1,"locus_J"],
|
|
693 uniqueID = rows.2[1,"uniqueID"],
|
|
694 normalized_read_count = sum(rows.2$normalized_read_count))
|
|
695 }
|
0
|
696
|
1
|
697 sum.3 = sum.1[NULL,]
|
|
698 rows.3 = patient.fuzzy[match.filter.3,]
|
|
699 if(matches.in.3){
|
|
700 sum.3 = data.frame(merge = first.clone.sequence,
|
|
701 Patient = label1,
|
|
702 Receptor = rows.3[1,"Receptor"],
|
|
703 Sample = rows.3[1,"Sample"],
|
|
704 Cell_Count = rows.3[1,"Cell_Count"],
|
|
705 Clone_Molecule_Count_From_Spikes = sum(rows.3$Clone_Molecule_Count_From_Spikes),
|
|
706 Log10_Frequency = log10(sum(rows.3$Frequency)),
|
|
707 Total_Read_Count = sum(rows.3$Total_Read_Count),
|
|
708 dsPerM = sum(rows.3$dsPerM),
|
|
709 J_Segment_Major_Gene = rows.3[1,"J_Segment_Major_Gene"],
|
|
710 V_Segment_Major_Gene = rows.3[1,"V_Segment_Major_Gene"],
|
|
711 Clone_Sequence = first.clone.sequence,
|
|
712 CDR3_Sense_Sequence = rows.3[1,"CDR3_Sense_Sequence"],
|
|
713 Related_to_leukemia_clone = F,
|
|
714 Frequency = sum(rows.3$Frequency),
|
|
715 locus_V = rows.3[1,"locus_V"],
|
|
716 locus_J = rows.3[1,"locus_J"],
|
|
717 uniqueID = rows.3[1,"uniqueID"],
|
|
718 normalized_read_count = sum(rows.3$normalized_read_count))
|
|
719 }
|
0
|
720
|
1
|
721 if(matches.in.2 & matches.in.3){
|
|
722 merge.123 = merge(sum.1, sum.2, by="merge")
|
|
723 merge.123 = merge(merge.123, sum.3, by="merge")
|
|
724 colnames(merge.123)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(merge.123)))] = paste(colnames(merge.123)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(merge.123), perl=T))], ".z", sep="")
|
|
725 #merge.123$thresholdValue = pmax(merge.123[,onx], merge.123[,ony], merge.123[,onz])
|
0
|
726
|
1
|
727 patientMerge = rbind(patientMerge, merge.123)
|
|
728 patient.fuzzy = patient.fuzzy[!(match.filter.1 | match.filter.2 | match.filter.3),]
|
0
|
729
|
1
|
730 hidden.clone.sequences = c(rows.1[-1,"Clone_Sequence"], rows.2[rows.2$Clone_Sequence != first.clone.sequence,"Clone_Sequence"], rows.3[rows.3$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
|
|
731 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
0
|
732
|
1
|
733 } else if (matches.in.2) {
|
|
734 #other.sample1 = other.sample.list[[first.sample]][1]
|
|
735 #other.sample2 = other.sample.list[[first.sample]][2]
|
0
|
736
|
1
|
737 second.sample = sum.2[,"Sample"]
|
0
|
738
|
1
|
739 current.merge.list = duo.merge.list[[paste(first.sample, second.sample)]]
|
0
|
740
|
1
|
741 merge.12 = merge(sum.1, sum.2, by="merge")
|
0
|
742
|
1
|
743 current.merge.list = rbind(current.merge.list, merge.12)
|
|
744 duo.merge.list[[paste(first.sample, second.sample)]] = current.merge.list
|
0
|
745
|
1
|
746 patient.fuzzy = patient.fuzzy[!(match.filter.1 | match.filter.2),]
|
0
|
747
|
1
|
748 hidden.clone.sequences = c(rows.1[-1,"Clone_Sequence"], rows.2[rows.2$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
|
|
749 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
0
|
750
|
1
|
751 } else if (matches.in.3) {
|
0
|
752
|
1
|
753 #other.sample1 = other.sample.list[[first.sample]][1]
|
|
754 #other.sample2 = other.sample.list[[first.sample]][2]
|
0
|
755
|
1
|
756 second.sample = sum.3[,"Sample"]
|
0
|
757
|
1
|
758 current.merge.list = duo.merge.list[[paste(first.sample, second.sample)]]
|
0
|
759
|
1
|
760 merge.13 = merge(sum.1, sum.3, by="merge")
|
0
|
761
|
1
|
762 current.merge.list = rbind(current.merge.list, merge.13)
|
|
763 duo.merge.list[[paste(first.sample, second.sample)]] = current.merge.list
|
0
|
764
|
1
|
765 patient.fuzzy = patient.fuzzy[!(match.filter.1 | match.filter.3),]
|
0
|
766
|
1
|
767 hidden.clone.sequences = c(rows.1[-1,"Clone_Sequence"], rows.3[rows.3$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
|
|
768 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
0
|
769
|
1
|
770 } else if(nrow(rows.1) > 1){
|
|
771 patient1 = patient1[!(patient1$Clone_Sequence %in% rows.1$Clone_Sequence),]
|
5
|
772 #print(names(patient1)[names(patient1) %in% sum.1])
|
|
773 #print(names(patient1)[!(names(patient1) %in% sum.1)])
|
|
774 #print(names(patient1))
|
|
775 #print(names(sum.1))
|
|
776 #print(summary(sum.1))
|
|
777 #print(summary(patient1))
|
|
778 #print(dim(sum.1))
|
|
779 #print(dim(patient1))
|
|
780 #print(head(sum.1[,names(patient1)]))
|
1
|
781 patient1 = rbind(patient1, sum.1[,names(patient1)])
|
|
782 patient.fuzzy = patient.fuzzy[-match.filter.1,]
|
|
783 } else {
|
|
784 patient.fuzzy = patient.fuzzy[-1,]
|
|
785 }
|
|
786
|
|
787 tmp.rows = rbind(rows.1, rows.2, rows.3)
|
|
788 tmp.rows = tmp.rows[order(nchar(tmp.rows$Clone_Sequence)),]
|
0
|
789
|
1
|
790 if (sum(match.filter.1) > 1 | sum(match.filter.2) > 1 | sum(match.filter.1) > 1) {
|
|
791 cat(paste("<tr><td>", label1, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="multiple_matches.html", append=T)
|
|
792 } else {
|
|
793 }
|
|
794
|
|
795 }
|
|
796 patient.merge.list[[paste(label1, "123")]] = patientMerge
|
|
797
|
|
798 patientMerge12 = duo.merge.list[[paste(oneSample, twoSample)]]
|
|
799 patientMerge13 = duo.merge.list[[paste(oneSample, threeSample)]]
|
|
800 patientMerge23 = duo.merge.list[[paste(twoSample, threeSample)]]
|
0
|
801
|
1
|
802 patient.merge.list[[paste(label1, "12")]] = patientMerge12
|
|
803 patient.merge.list[[paste(label1, "13")]] = patientMerge13
|
|
804 patient.merge.list[[paste(label1, "23")]] = patientMerge23
|
|
805
|
|
806 #patient.merge.list.second[[label1]] = merge.list[["second"]]
|
|
807 }
|
|
808 cat(paste("<td>", nrow(patient1), " in ", label1, " and ", nrow(patient2), " in ", label2, nrow(patient3), " in ", label3, ", ", nrow(patientMerge), " in both (finding both took ", (proc.time() - start.time)[[3]], "s)</td></tr>", sep=""), file=logfile, append=T)
|
|
809 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
|
810 patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony])
|
|
811 patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony])
|
|
812 patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony])
|
|
813
|
|
814 #patientMerge$thresholdValue = pmin(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
|
815 #patientMerge12$thresholdValue = pmin(patientMerge12[,onx], patientMerge12[,ony])
|
|
816 #patientMerge13$thresholdValue = pmin(patientMerge13[,onx], patientMerge13[,ony])
|
|
817 #patientMerge23$thresholdValue = pmin(patientMerge23[,onx], patientMerge23[,ony])
|
0
|
818
|
1
|
819 patient1 = patient1[!(patient1$Clone_Sequence %in% merge.list[["second"]]),]
|
|
820 patient2 = patient2[!(patient2$Clone_Sequence %in% merge.list[["second"]]),]
|
|
821 patient3 = patient3[!(patient3$Clone_Sequence %in% merge.list[["second"]]),]
|
0
|
822
|
1
|
823 if(F){
|
|
824 patientMerge = merge(patient1, patient2, by="merge")
|
|
825 patientMerge = merge(patientMerge, patient3, by="merge")
|
|
826 colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge)))] = paste(colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge), perl=T))], ".z", sep="")
|
|
827 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
|
828 patientMerge12 = merge(patient1, patient2, by="merge")
|
|
829 patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony])
|
|
830 patientMerge13 = merge(patient1, patient3, by="merge")
|
|
831 patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony])
|
|
832 patientMerge23 = merge(patient2, patient3, by="merge")
|
|
833 patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony])
|
|
834 }
|
0
|
835
|
1
|
836 scatterplot_data_columns = c("Clone_Sequence", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge")
|
|
837 scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns], patient3[,scatterplot_data_columns])
|
|
838 scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),]
|
|
839
|
|
840 scatterplot_data$type = factor(x="In one", levels=c("In one", "In two", "In three", "In multiple"))
|
0
|
841
|
1
|
842 res1 = vector()
|
|
843 res2 = vector()
|
|
844 res3 = vector()
|
|
845 res12 = vector()
|
|
846 res13 = vector()
|
|
847 res23 = vector()
|
|
848 resAll = vector()
|
|
849 read1Count = vector()
|
|
850 read2Count = vector()
|
|
851 read3Count = vector()
|
0
|
852
|
1
|
853 if(appendTriplets){
|
|
854 cat(paste(label1, label2, label3, sep="\t"), file="triplets.txt", append=T, sep="", fill=3)
|
|
855 }
|
|
856 for(iter in 1:length(product[,1])){
|
|
857 threshhold = product[iter,threshholdIndex]
|
|
858 V_Segment = paste(".*", as.character(product[iter,V_SegmentIndex]), ".*", sep="")
|
|
859 J_Segment = paste(".*", as.character(product[iter,J_SegmentIndex]), ".*", sep="")
|
|
860 #all = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,onx] > threshhold & patientMerge[,ony] > threshhold & patientMerge[,onz] > threshhold)
|
|
861 all = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold)
|
0
|
862
|
1
|
863 one_two = (grepl(V_Segment, patientMerge12$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge12$J_Segment_Major_Gene.x) & patientMerge12$thresholdValue > threshhold & !(patientMerge12$merge %in% patientMerge[all,]$merge))
|
|
864 one_three = (grepl(V_Segment, patientMerge13$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge13$J_Segment_Major_Gene.x) & patientMerge13$thresholdValue > threshhold & !(patientMerge13$merge %in% patientMerge[all,]$merge))
|
|
865 two_three = (grepl(V_Segment, patientMerge23$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge23$J_Segment_Major_Gene.x) & patientMerge23$thresholdValue > threshhold & !(patientMerge23$merge %in% patientMerge[all,]$merge))
|
|
866
|
|
867 one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$merge %in% patientMerge[all,]$merge) & !(patient1$merge %in% patientMerge12[one_two,]$merge) & !(patient1$merge %in% patientMerge13[one_three,]$merge))
|
|
868 two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$merge %in% patientMerge[all,]$merge) & !(patient2$merge %in% patientMerge12[one_two,]$merge) & !(patient2$merge %in% patientMerge23[two_three,]$merge))
|
|
869 three = (grepl(V_Segment, patient3$V_Segment_Major_Gene) & grepl(J_Segment, patient3$J_Segment_Major_Gene) & patient3[,on] > threshhold & !(patient3$merge %in% patientMerge[all,]$merge) & !(patient3$merge %in% patientMerge13[one_three,]$merge) & !(patient3$merge %in% patientMerge23[two_three,]$merge))
|
0
|
870
|
1
|
871 read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.x))
|
|
872 read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.y))
|
|
873 read3Count = append(read3Count, sum(patient3[three,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.z))
|
|
874 res1 = append(res1, sum(one))
|
|
875 res2 = append(res2, sum(two))
|
|
876 res3 = append(res3, sum(three))
|
|
877 resAll = append(resAll, sum(all))
|
|
878 res12 = append(res12, sum(one_two))
|
|
879 res13 = append(res13, sum(one_three))
|
|
880 res23 = append(res23, sum(two_three))
|
|
881 #threshhold = 0
|
|
882 if(threshhold != 0){
|
|
883 if(sum(one) > 0){
|
|
884 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
885 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
|
886 filenameOne = paste(label1, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
887 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
888 }
|
|
889 if(sum(two) > 0){
|
|
890 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
891 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
|
892 filenameTwo = paste(label2, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
893 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
894 }
|
|
895 if(sum(three) > 0){
|
|
896 dfThree = patient3[three,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
897 colnames(dfThree) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
|
898 filenameThree = paste(label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
899 write.table(dfThree, file=paste(filenameThree, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
900 }
|
|
901 if(sum(one_two) > 0){
|
|
902 dfOne_two = patientMerge12[one_two,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
903 colnames(dfOne_two) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample))
|
|
904 filenameOne_two = paste(label1, "_", label2, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
905 write.table(dfOne_two, file=paste(filenameOne_two, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
906 }
|
|
907 if(sum(one_three) > 0){
|
|
908 dfOne_three = patientMerge13[one_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
909 colnames(dfOne_three) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
|
910 filenameOne_three = paste(label1, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
911 write.table(dfOne_three, file=paste(filenameOne_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
912 }
|
|
913 if(sum(two_three) > 0){
|
|
914 dfTwo_three = patientMerge23[two_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
915 colnames(dfTwo_three) = c(paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
|
916 filenameTwo_three = paste(label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
917 write.table(dfTwo_three, file=paste(filenameTwo_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
918 }
|
|
919 } else { #scatterplot data
|
|
920 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
|
|
921 scatterplot_locus_data = scatterplot_locus_data[!(scatterplot_locus_data$merge %in% merge.list[["second"]]),]
|
|
922 in_two = (scatterplot_locus_data$merge %in% patientMerge12[one_two,]$merge) | (scatterplot_locus_data$merge %in% patientMerge13[one_three,]$merge) | (scatterplot_locus_data$merge %in% patientMerge23[two_three,]$merge)
|
|
923 if(sum(in_two) > 0){
|
|
924 scatterplot_locus_data[in_two,]$type = "In two"
|
|
925 }
|
|
926 in_three = (scatterplot_locus_data$merge %in% patientMerge[all,]$merge)
|
|
927 if(sum(in_three)> 0){
|
|
928 scatterplot_locus_data[in_three,]$type = "In three"
|
|
929 }
|
|
930 not_in_one = scatterplot_locus_data$type != "In one"
|
|
931 if(sum(not_in_one) > 0){
|
|
932 #scatterplot_locus_data[not_in_one,]$type = "In multiple"
|
|
933 }
|
|
934 p = NULL
|
|
935 if(nrow(scatterplot_locus_data) != 0){
|
2
|
936 filename.scatter = paste(label1, "_", label2, "_", label3, "_", product[iter, titleIndex], "_scatter_", threshhold, sep="")
|
|
937 write.table(scatterplot_locus_data, file=paste(filename.scatter, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
938 if(on == "normalized_read_count"){
|
|
939 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
|
940 p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales, limits=c(1, 1e6))
|
|
941 } else {
|
|
942 p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_log10(limits=c(0.0001,100), breaks=c(0.0001, 0.001, 0.01, 0.1, 1, 10, 100), labels=c("0.0001", "0.001", "0.01", "0.1", "1", "10", "100")) + expand_limits(y=c(0,100))
|
|
943 #p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
|
|
944 }
|
|
945 p = p + geom_point(aes(colour=type), position="jitter")
|
|
946 p = p + xlab("In one or in multiple samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex]))
|
1
|
947 } else {
|
2
|
948 p = ggplot(NULL, aes(x=c("In one", "In multiple"),y=0)) + geom_blank(NULL) + xlab("In two or in three of the samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex]))
|
1
|
949 }
|
|
950 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_", product[iter, titleIndex],"_scatter.png", sep=""))
|
|
951 print(p)
|
|
952 dev.off()
|
|
953 }
|
|
954 if(sum(all) > 0){
|
|
955 dfAll = patientMerge[all,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y", "V_Segment_Major_Gene.z", "J_Segment_Major_Gene.z", "normalized_read_count.z", "Frequency.z", "Related_to_leukemia_clone.z")]
|
|
956 colnames(dfAll) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample), paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
|
957 filenameAll = paste(label1, "_", label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
958 write.table(dfAll, file=paste(filenameAll, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
959 }
|
|
960 }
|
|
961 #patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "tmp3"=res3, "read_count3"=round(read3Count))
|
|
962 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "tmp2"=res2, "tmp3"=res3, "tmp12"=res12, "tmp13"=res13, "tmp23"=res23)
|
|
963 colnames(patientResult)[6] = oneSample
|
|
964 colnames(patientResult)[7] = twoSample
|
|
965 colnames(patientResult)[8] = threeSample
|
|
966 colnames(patientResult)[9] = paste(oneSample, twoSample, sep="_")
|
|
967 colnames(patientResult)[10] = paste(oneSample, twoSample, sep="_")
|
|
968 colnames(patientResult)[11] = paste(oneSample, twoSample, sep="_")
|
|
969
|
|
970 colnamesBak = colnames(patientResult)
|
|
971 colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", "Number of sequences All", paste("Number of sequences", oneSample), paste("Number of sequences", twoSample), paste("Number of sequences", threeSample), paste("Number of sequences", oneSample, twoSample), paste("Number of sequences", oneSample, threeSample), paste("Number of sequences", twoSample, threeSample))
|
|
972 write.table(patientResult, file=paste(label1, "_", label2, "_", label3, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
973 colnames(patientResult) = colnamesBak
|
|
974
|
|
975 patientResult$Locus = factor(patientResult$Locus, Titles)
|
|
976 patientResult$cut_off_value = factor(patientResult$cut_off_value, paste(">", interval, sep=""))
|
|
977
|
|
978 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "All")])
|
|
979 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=All), stat='identity', position="dodge", fill="#79c36a")
|
|
980 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
981 plt = plt + geom_text(aes(ymax=max(All), x=cut_off_value,y=All,label=All), angle=90, hjust=0)
|
|
982 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in All")
|
|
983 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
984 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_total_all.png", sep=""), width=1920, height=1080)
|
|
985 print(plt)
|
|
986 dev.off()
|
|
987
|
|
988 fontSize = 4
|
|
989
|
|
990 bak = patientResult
|
|
991 patientResult = melt(patientResult[,c('Locus','cut_off_value', oneSample, twoSample, threeSample)] ,id.vars=1:2)
|
|
992 patientResult$relativeValue = patientResult$value * 10
|
|
993 patientResult[patientResult$relativeValue == 0,]$relativeValue = 1
|
|
994 plt = ggplot(patientResult)
|
|
995 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=relativeValue, fill=variable), stat='identity', position="dodge")
|
|
996 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
997 plt = plt + scale_y_continuous(trans="log", breaks=10^c(0:10), labels=c(0, 10^c(0:9)))
|
|
998 plt = plt + geom_text(data=patientResult[patientResult$variable == oneSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=-0.7, size=fontSize)
|
|
999 plt = plt + geom_text(data=patientResult[patientResult$variable == twoSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=0.4, size=fontSize)
|
|
1000 plt = plt + geom_text(data=patientResult[patientResult$variable == threeSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=1.5, size=fontSize)
|
|
1001 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in only one sample")
|
|
1002 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_indiv_all.png", sep=""), width=1920, height=1080)
|
|
1003 print(plt)
|
|
1004 dev.off()
|
0
|
1005 }
|
|
1006
|
|
1007 if(nrow(triplets) != 0){
|
|
1008
|
1
|
1009 cat("<tr><td>Starting triplet analysis</td></tr>", file=logfile, append=T)
|
|
1010
|
|
1011 triplets$uniqueID = paste(triplets$Patient, triplets$Sample, sep="_")
|
|
1012
|
|
1013 cat("<tr><td>Normalizing to lowest cell count within locus</td></tr>", file=logfile, append=T)
|
0
|
1014
|
1
|
1015 triplets$locus_V = substring(triplets$V_Segment_Major_Gene, 0, 4)
|
|
1016 triplets$locus_J = substring(triplets$J_Segment_Major_Gene, 0, 4)
|
|
1017 min_cell_count = data.frame(data.table(triplets)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("uniqueID", "locus_V", "locus_J")])
|
0
|
1018
|
1
|
1019 triplets$min_cell_paste = paste(triplets$uniqueID, triplets$locus_V, triplets$locus_J)
|
|
1020 min_cell_count$min_cell_paste = paste(min_cell_count$uniqueID, min_cell_count$locus_V, min_cell_count$locus_J)
|
0
|
1021
|
1
|
1022 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
|
|
1023
|
|
1024 triplets = merge(triplets, min_cell_count, by="min_cell_paste")
|
|
1025
|
|
1026 triplets$normalized_read_count = round(triplets$Clone_Molecule_Count_From_Spikes / triplets$Cell_Count * triplets$min_cell_count / 2, digits=2)
|
0
|
1027
|
1
|
1028 triplets = triplets[triplets$normalized_read_count >= min_cells,]
|
|
1029
|
|
1030 column_drops = c("min_cell_count", "min_cell_paste")
|
|
1031
|
|
1032 triplets = triplets[,!(colnames(triplets) %in% column_drops)]
|
|
1033
|
|
1034 cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T)
|
0
|
1035
|
1
|
1036 interval = intervalReads
|
|
1037 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
1038 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
0
|
1039
|
1
|
1040 triplets = split(triplets, triplets$Patient, drop=T)
|
5
|
1041 #print(nrow(triplets))
|
1
|
1042 for(triplet in triplets){
|
|
1043 samples = unique(triplet$Sample)
|
|
1044 one = triplet[triplet$Sample == samples[1],]
|
|
1045 two = triplet[triplet$Sample == samples[2],]
|
|
1046 three = triplet[triplet$Sample == samples[3],]
|
|
1047
|
|
1048 print(paste(nrow(triplet), nrow(one), nrow(two), nrow(three)))
|
|
1049 tripletAnalysis(one, one[1,"uniqueID"], two, two[1,"uniqueID"], three, three[1,"uniqueID"], product=product, interval=interval, on="normalized_read_count", T)
|
|
1050 }
|
|
1051
|
|
1052 cat("<tr><td>Starting Frequency analysis</td></tr>", file=logfile, append=T)
|
|
1053
|
|
1054 interval = intervalFreq
|
|
1055 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
1056 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
|
1057
|
|
1058 for(triplet in triplets){
|
|
1059 samples = unique(triplet$Sample)
|
|
1060 one = triplet[triplet$Sample == samples[1],]
|
|
1061 two = triplet[triplet$Sample == samples[2],]
|
|
1062 three = triplet[triplet$Sample == samples[3],]
|
|
1063 tripletAnalysis(one, one[1,"uniqueID"], two, two[1,"uniqueID"], three, three[1,"uniqueID"], product=product, interval=interval, on="Frequency", F)
|
|
1064 }
|
0
|
1065 } else {
|
|
1066 cat("", file="triplets.txt")
|
|
1067 }
|
|
1068 cat("</table></html>", file=logfile, append=T)
|