0
|
1 args <- commandArgs(trailingOnly = TRUE)
|
|
2 options(scipen=999)
|
|
3
|
|
4 inFile = args[1]
|
|
5 outDir = args[2]
|
|
6 logfile = args[3]
|
|
7 min_freq = as.numeric(args[4])
|
|
8 min_cells = as.numeric(args[5])
|
|
9 mergeOn = args[6]
|
|
10
|
|
11 cat("<html><table><tr><td>Starting analysis</td></tr>", file=logfile, append=F)
|
|
12
|
|
13 library(ggplot2)
|
|
14 library(reshape2)
|
|
15 library(data.table)
|
|
16 library(grid)
|
|
17 library(parallel)
|
|
18 #require(xtable)
|
|
19 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T)
|
|
20 dat = read.table(inFile, header=T, sep="\t", dec=".", fill=T, stringsAsFactors=F)
|
|
21 dat = dat[,c("Patient", "Receptor", "Sample", "Cell_Count", "Clone_Molecule_Count_From_Spikes", "Log10_Frequency", "Total_Read_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence", "Clone_Sequence")]
|
|
22 dat$dsPerM = 0
|
|
23 dat = dat[!is.na(dat$Patient),]
|
|
24 dat$Related_to_leukemia_clone = F
|
|
25
|
|
26 setwd(outDir)
|
|
27 cat("<tr><td>Selecting first V/J Genes</td></tr>", file=logfile, append=T)
|
|
28 dat$V_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$V_Segment_Major_Gene), "; "), "[[", 1)))
|
|
29 dat$J_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$J_Segment_Major_Gene), "; "), "[[", 1)))
|
|
30
|
|
31 cat("<tr><td>Calculating Frequency</td></tr>", file=logfile, append=T)
|
|
32
|
|
33 dat$Frequency = ((10^dat$Log10_Frequency)*100)
|
|
34
|
|
35 dat = dat[dat$Frequency >= min_freq,]
|
|
36
|
1
|
37 patient.sample.counts = data.frame(data.table(dat)[, list(count=.N), by=c("Patient", "Sample")])
|
|
38 patient.sample.counts = data.frame(data.table(patient.sample.counts)[, list(count=.N), by=c("Patient")])
|
|
39
|
|
40 print("Found the following patients with number of samples:")
|
|
41 print(patient.sample.counts)
|
|
42
|
|
43 patient.sample.counts.pairs = patient.sample.counts[patient.sample.counts$count %in% 1:2,]
|
|
44 patient.sample.counts.triplets = patient.sample.counts[patient.sample.counts$count == 3,]
|
|
45
|
|
46
|
|
47
|
|
48 triplets = dat[dat$Patient %in% patient.sample.counts.triplets$Patient,]
|
|
49 dat = dat[dat$Patient %in% patient.sample.counts.pairs$Patient,]
|
0
|
50
|
|
51 cat("<tr><td>Normalizing to lowest cell count within locus</td></tr>", file=logfile, append=T)
|
|
52
|
|
53 dat$locus_V = substring(dat$V_Segment_Major_Gene, 0, 4)
|
|
54 dat$locus_J = substring(dat$J_Segment_Major_Gene, 0, 4)
|
|
55 min_cell_count = data.frame(data.table(dat)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("Patient", "locus_V", "locus_J")])
|
|
56
|
|
57 dat$min_cell_paste = paste(dat$Patient, dat$locus_V, dat$locus_J)
|
|
58 min_cell_count$min_cell_paste = paste(min_cell_count$Patient, min_cell_count$locus_V, min_cell_count$locus_J)
|
|
59
|
|
60 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
|
|
61 print(paste("rows:", nrow(dat)))
|
|
62 dat = merge(dat, min_cell_count, by="min_cell_paste")
|
|
63 print(paste("rows:", nrow(dat)))
|
|
64 dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * dat$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2
|
|
65
|
|
66 dat = dat[dat$normalized_read_count >= min_cells,]
|
|
67
|
|
68 dat$paste = paste(dat$Sample, dat$Clone_Sequence)
|
|
69
|
|
70 patients = split(dat, dat$Patient, drop=T)
|
|
71 intervalReads = rev(c(0,10,25,50,100,250,500,750,1000,10000))
|
|
72 intervalFreq = rev(c(0,0.01,0.05,0.1,0.5,1,5))
|
|
73 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV")
|
|
74 J_Segments = c(".*", ".*", ".*", "IGKJ", "KDE", ".*", ".*", ".*", ".*", ".*")
|
|
75 Titles = c("Total", "IGH-Vh-Jh", "IGH-Dh-Jh", "Vk-Jk", "Vk-Kde" , "Intron-Kde", "TCRG", "TCRD-Vd-Dd", "TCRD-Dd-Dd", "TCRB-Vb-Jb")
|
|
76 Titles = factor(Titles, levels=Titles)
|
|
77 TitlesOrder = data.frame("Title"=Titles, "TitlesOrder"=1:length(Titles))
|
|
78
|
|
79 single_patients = data.frame("Patient" = character(0),"Sample" = character(0), "on" = character(0), "Clone_Sequence" = character(0), "Frequency" = numeric(0), "normalized_read_count" = numeric(0), "V_Segment_Major_Gene" = character(0), "J_Segment_Major_Gene" = character(0), "Rearrangement" = character(0))
|
|
80
|
|
81 patient.merge.list = list() #cache the 'both' table, 2x speedup for more memory...
|
|
82 patient.merge.list.second = list()
|
|
83 scatter_locus_data_list = list()
|
|
84 cat(paste("<table border='0' style='font-family:courier;'>", sep=""), file="multiple_matches.html", append=T)
|
|
85 cat(paste("<table border='0' style='font-family:courier;'>", sep=""), file="single_matches.html", append=T)
|
|
86 patientCountOnColumn <- function(x, product, interval, on, appendtxt=F){
|
|
87 if (!is.data.frame(x) & is.list(x)){
|
|
88 x = x[[1]]
|
|
89 }
|
|
90 #x$Sample = factor(x$Sample, levels=unique(x$Sample))
|
|
91 x = data.frame(x,stringsAsFactors=F)
|
|
92 onShort = "reads"
|
|
93 if(on == "Frequency"){
|
|
94 onShort = "freq"
|
|
95 }
|
|
96 onx = paste(on, ".x", sep="")
|
|
97 ony = paste(on, ".y", sep="")
|
|
98 splt = split(x, x$Sample, drop=T)
|
|
99 type="pair"
|
|
100 if(length(splt) == 1){
|
|
101 print(paste(paste(x[1,which(colnames(x) == "Patient")]), "has one sample"))
|
|
102 splt[[2]] = data.frame("Patient" = character(0), "Receptor" = character(0), "Sample" = character(0), "Cell_Count" = numeric(0), "Clone_Molecule_Count_From_Spikes" = numeric(0), "Log10_Frequency" = numeric(0), "Total_Read_Count" = numeric(0), "dsMol_per_1e6_cells" = numeric(0), "J_Segment_Major_Gene" = character(0), "V_Segment_Major_Gene" = character(0), "Clone_Sequence" = character(0), "CDR3_Sense_Sequence" = character(0), "Related_to_leukemia_clone" = logical(0), "Frequency"= numeric(0), "normalized_read_count" = numeric(0), "paste" = character(0))
|
|
103 type="single"
|
|
104 }
|
|
105 patient1 = splt[[1]]
|
|
106 patient2 = splt[[2]]
|
|
107
|
|
108 threshholdIndex = which(colnames(product) == "interval")
|
|
109 V_SegmentIndex = which(colnames(product) == "V_Segments")
|
|
110 J_SegmentIndex = which(colnames(product) == "J_Segments")
|
|
111 titleIndex = which(colnames(product) == "Titles")
|
|
112 sampleIndex = which(colnames(x) == "Sample")
|
|
113 patientIndex = which(colnames(x) == "Patient")
|
|
114 oneSample = paste(patient1[1,sampleIndex], sep="")
|
|
115 twoSample = paste(patient2[1,sampleIndex], sep="")
|
|
116 patient = paste(x[1,patientIndex])
|
|
117
|
|
118 switched = F
|
|
119 if(length(grep(".*_Right$", twoSample)) == 1 || length(grep(".*_Dx_BM$", twoSample)) == 1 || length(grep(".*_Dx$", twoSample)) == 1 ){
|
|
120 tmp = twoSample
|
|
121 twoSample = oneSample
|
|
122 oneSample = tmp
|
|
123 tmp = patient1
|
|
124 patient1 = patient2
|
|
125 patient2 = tmp
|
|
126 switched = T
|
|
127 }
|
|
128 if(appendtxt){
|
|
129 cat(paste(patient, oneSample, twoSample, type, sep="\t"), file="patients.txt", append=T, sep="", fill=3)
|
|
130 }
|
|
131 cat(paste("<tr><td>", patient, "</td>", sep=""), file=logfile, append=T)
|
|
132
|
|
133 if(mergeOn == "Clone_Sequence"){
|
|
134 patient1$merge = paste(patient1$Clone_Sequence)
|
|
135 patient2$merge = paste(patient2$Clone_Sequence)
|
|
136 } else {
|
|
137 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
|
|
138 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
|
|
139 }
|
|
140
|
|
141 scatterplot_data_columns = c("Patient", "Sample", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge")
|
|
142 #scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns])
|
|
143 scatterplot_data = patient1[NULL,scatterplot_data_columns]
|
|
144 #scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),]
|
|
145 #scatterplot_data$type = factor(x=oneSample, levels=c(oneSample, twoSample, "In Both"))
|
|
146 scatterplot.data.type.factor = c(oneSample, twoSample, paste(c(oneSample, twoSample), "In Both"))
|
|
147 #scatterplot_data$type = factor(x=NULL, levels=scatterplot.data.type.factor)
|
|
148 scatterplot_data$type = character(0)
|
|
149 scatterplot_data$link = numeric(0)
|
|
150 scatterplot_data$on = character(0)
|
|
151
|
|
152 #patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge") #merge alles 'fuzzy'
|
|
153 patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge")[NULL,] #blegh
|
|
154
|
|
155 cs.exact.matches = patient1[patient1$Clone_Sequence %in% patient2$Clone_Sequence,]$Clone_Sequence
|
|
156
|
|
157 start.time = proc.time()
|
|
158 merge.list = c()
|
|
159
|
|
160 if(patient %in% names(patient.merge.list)){
|
|
161 patientMerge = patient.merge.list[[patient]]
|
|
162 merge.list[["second"]] = patient.merge.list.second[[patient]]
|
|
163 scatterplot_data = scatter_locus_data_list[[patient]]
|
|
164 cat(paste("<td>", nrow(patient1), " in ", oneSample, " and ", nrow(patient2), " in ", twoSample, ", ", nrow(patientMerge), " in both (fetched from cache)</td></tr>", sep=""), file=logfile, append=T)
|
|
165
|
|
166 print(names(patient.merge.list))
|
|
167 } else {
|
|
168 #fuzzy matching here...
|
|
169 #merge.list = patientMerge$merge
|
|
170
|
|
171 #patient1.fuzzy = patient1[!(patient1$merge %in% merge.list),]
|
|
172 #patient2.fuzzy = patient2[!(patient2$merge %in% merge.list),]
|
|
173
|
|
174 patient1.fuzzy = patient1
|
|
175 patient2.fuzzy = patient2
|
|
176
|
|
177 #patient1.fuzzy$merge = paste(patient1.fuzzy$V_Segment_Major_Gene, patient1.fuzzy$J_Segment_Major_Gene, patient1.fuzzy$CDR3_Sense_Sequence)
|
|
178 #patient2.fuzzy$merge = paste(patient2.fuzzy$V_Segment_Major_Gene, patient2.fuzzy$J_Segment_Major_Gene, patient2.fuzzy$CDR3_Sense_Sequence)
|
|
179
|
|
180 #patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J, patient1.fuzzy$CDR3_Sense_Sequence)
|
|
181 #patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J, patient2.fuzzy$CDR3_Sense_Sequence)
|
|
182
|
|
183 patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J)
|
|
184 patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J)
|
|
185
|
|
186 #merge.freq.table = data.frame(table(c(patient1.fuzzy[!duplicated(patient1.fuzzy$merge),"merge"], patient2.fuzzy[!duplicated(patient2.fuzzy$merge),"merge"]))) #also remove?
|
|
187 #merge.freq.table.gt.1 = merge.freq.table[merge.freq.table$Freq > 1,]
|
|
188
|
|
189 #patient1.fuzzy = patient1.fuzzy[patient1.fuzzy$merge %in% merge.freq.table.gt.1$Var1,]
|
|
190 #patient2.fuzzy = patient2.fuzzy[patient2.fuzzy$merge %in% merge.freq.table.gt.1$Var1,]
|
|
191
|
|
192 patient.fuzzy = rbind(patient1.fuzzy, patient2.fuzzy)
|
|
193 patient.fuzzy = patient.fuzzy[order(nchar(patient.fuzzy$Clone_Sequence)),]
|
|
194
|
|
195 merge.list = list()
|
|
196
|
|
197 merge.list[["second"]] = vector()
|
|
198
|
|
199 link.count = 1
|
|
200
|
|
201 while(nrow(patient.fuzzy) > 1){
|
|
202 first.merge = patient.fuzzy[1,"merge"]
|
|
203 first.clone.sequence = patient.fuzzy[1,"Clone_Sequence"]
|
|
204 first.sample = patient.fuzzy[1,"Sample"]
|
|
205 merge.filter = first.merge == patient.fuzzy$merge
|
|
206
|
|
207 #length.filter = nchar(patient.fuzzy$Clone_Sequence) - nchar(first.clone.sequence) <= 9
|
|
208
|
|
209 first.sample.filter = first.sample == patient.fuzzy$Sample
|
|
210 second.sample.filter = first.sample != patient.fuzzy$Sample
|
|
211
|
|
212 #first match same sample, sum to a single row, same for other sample
|
|
213 #then merge rows like 'normal'
|
|
214
|
|
215 sequence.filter = grepl(paste("^", first.clone.sequence, sep=""), patient.fuzzy$Clone_Sequence)
|
|
216
|
|
217
|
|
218
|
|
219 #match.filter = merge.filter & grepl(first.clone.sequence, patient.fuzzy$Clone_Sequence) & length.filter & sample.filter
|
|
220 first.match.filter = merge.filter & sequence.filter & first.sample.filter
|
|
221 second.match.filter = merge.filter & sequence.filter & second.sample.filter
|
|
222
|
|
223 first.rows = patient.fuzzy[first.match.filter,]
|
|
224 second.rows = patient.fuzzy[second.match.filter,]
|
|
225
|
|
226 first.rows.v = table(first.rows$V_Segment_Major_Gene)
|
|
227 first.rows.v = names(first.rows.v[which.max(first.rows.v)])
|
|
228 first.rows.j = table(first.rows$J_Segment_Major_Gene)
|
|
229 first.rows.j = names(first.rows.j[which.max(first.rows.j)])
|
|
230
|
|
231 first.sum = data.frame(merge = first.clone.sequence,
|
|
232 Patient = patient,
|
|
233 Receptor = first.rows[1,"Receptor"],
|
|
234 Sample = first.rows[1,"Sample"],
|
|
235 Cell_Count = first.rows[1,"Cell_Count"],
|
|
236 Clone_Molecule_Count_From_Spikes = sum(first.rows$Clone_Molecule_Count_From_Spikes),
|
|
237 Log10_Frequency = log10(sum(first.rows$Frequency)),
|
|
238 Total_Read_Count = sum(first.rows$Total_Read_Count),
|
|
239 dsPerM = sum(first.rows$dsPerM),
|
|
240 J_Segment_Major_Gene = first.rows.j,
|
|
241 V_Segment_Major_Gene = first.rows.v,
|
|
242 Clone_Sequence = first.clone.sequence,
|
|
243 CDR3_Sense_Sequence = first.rows[1,"CDR3_Sense_Sequence"],
|
|
244 Related_to_leukemia_clone = F,
|
|
245 Frequency = sum(first.rows$Frequency),
|
|
246 locus_V = first.rows[1,"locus_V"],
|
|
247 locus_J = first.rows[1,"locus_J"],
|
|
248 min_cell_count = first.rows[1,"min_cell_count"],
|
|
249 normalized_read_count = sum(first.rows$normalized_read_count),
|
|
250 paste = first.rows[1,"paste"],
|
|
251 min_cell_paste = first.rows[1,"min_cell_paste"])
|
|
252
|
|
253 if(nrow(second.rows) > 0){
|
|
254 second.rows.v = table(second.rows$V_Segment_Major_Gene)
|
|
255 second.rows.v = names(second.rows.v[which.max(second.rows.v)])
|
|
256 second.rows.j = table(second.rows$J_Segment_Major_Gene)
|
|
257 second.rows.j = names(second.rows.j[which.max(second.rows.j)])
|
|
258
|
|
259 second.sum = data.frame(merge = first.clone.sequence,
|
|
260 Patient = patient,
|
|
261 Receptor = second.rows[1,"Receptor"],
|
|
262 Sample = second.rows[1,"Sample"],
|
|
263 Cell_Count = second.rows[1,"Cell_Count"],
|
|
264 Clone_Molecule_Count_From_Spikes = sum(second.rows$Clone_Molecule_Count_From_Spikes),
|
|
265 Log10_Frequency = log10(sum(second.rows$Frequency)),
|
|
266 Total_Read_Count = sum(second.rows$Total_Read_Count),
|
|
267 dsPerM = sum(second.rows$dsPerM),
|
|
268 J_Segment_Major_Gene = second.rows.j,
|
|
269 V_Segment_Major_Gene = second.rows.v,
|
|
270 Clone_Sequence = first.clone.sequence,
|
|
271 CDR3_Sense_Sequence = second.rows[1,"CDR3_Sense_Sequence"],
|
|
272 Related_to_leukemia_clone = F,
|
|
273 Frequency = sum(second.rows$Frequency),
|
|
274 locus_V = second.rows[1,"locus_V"],
|
|
275 locus_J = second.rows[1,"locus_J"],
|
|
276 min_cell_count = second.rows[1,"min_cell_count"],
|
|
277 normalized_read_count = sum(second.rows$normalized_read_count),
|
|
278 paste = second.rows[1,"paste"],
|
|
279 min_cell_paste = second.rows[1,"min_cell_paste"])
|
|
280
|
|
281 patientMerge = rbind(patientMerge, merge(first.sum, second.sum, by="merge"))
|
|
282 patient.fuzzy = patient.fuzzy[!(first.match.filter | second.match.filter),]
|
|
283
|
|
284 hidden.clone.sequences = c(first.rows[-1,"Clone_Sequence"], second.rows[second.rows$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
|
|
285 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
|
286
|
|
287 tmp.rows = rbind(first.rows, second.rows)
|
|
288 tmp.rows = tmp.rows[order(nchar(tmp.rows$Clone_Sequence)),]
|
|
289
|
|
290
|
|
291 #add to the scatterplot data
|
|
292 scatterplot.row = first.sum[,scatterplot_data_columns]
|
|
293 scatterplot.row$type = paste(first.sum[,"Sample"], "In Both")
|
|
294 scatterplot.row$link = link.count
|
|
295 scatterplot.row$on = onShort
|
|
296
|
|
297 scatterplot_data = rbind(scatterplot_data, scatterplot.row)
|
|
298
|
|
299 scatterplot.row = second.sum[,scatterplot_data_columns]
|
|
300 scatterplot.row$type = paste(second.sum[,"Sample"], "In Both")
|
|
301 scatterplot.row$link = link.count
|
|
302 scatterplot.row$on = onShort
|
|
303
|
|
304 scatterplot_data = rbind(scatterplot_data, scatterplot.row)
|
|
305
|
|
306 #write some information about the match to a log file
|
|
307 if (nrow(first.rows) > 1 | nrow(second.rows) > 1) {
|
|
308 cat(paste("<tr><td>", patient, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="multiple_matches.html", append=T)
|
|
309 } else {
|
|
310 second.clone.sequence = second.rows[1,"Clone_Sequence"]
|
|
311 if(nchar(first.clone.sequence) != nchar(second.clone.sequence)){
|
|
312 cat(paste("<tr bgcolor='#DDD'><td>", patient, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="single_matches.html", append=T)
|
|
313 } else {
|
|
314 #cat(paste("<tr><td>", patient, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="single_matches.html", append=T)
|
|
315 }
|
|
316 }
|
|
317
|
|
318 } else if(nrow(first.rows) > 1) {
|
|
319 if(patient1[1,"Sample"] == first.sample){
|
|
320 patient1 = patient1[!(patient1$Clone_Sequence %in% first.rows$Clone_Sequence),]
|
|
321 patient1 = rbind(patient1, first.sum)
|
|
322 } else {
|
|
323 patient2 = patient2[!(patient2$Clone_Sequence %in% first.rows$Clone_Sequence),]
|
|
324 patient2 = rbind(patient2, first.sum)
|
|
325 }
|
|
326
|
|
327 hidden.clone.sequences = c(first.rows[-1,"Clone_Sequence"])
|
|
328 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
|
329
|
|
330 patient.fuzzy = patient.fuzzy[-first.match.filter,]
|
|
331
|
|
332 #add to the scatterplot data
|
|
333 scatterplot.row = first.sum[,scatterplot_data_columns]
|
|
334 scatterplot.row$type = first.sum[,"Sample"]
|
|
335 scatterplot.row$link = link.count
|
|
336 scatterplot.row$on = onShort
|
|
337
|
|
338 scatterplot_data = rbind(scatterplot_data, scatterplot.row)
|
|
339
|
|
340 cat(paste("<tr bgcolor='#DDF'><td>", patient, " row ", 1:nrow(first.rows), "</td><td>", first.rows$Sample, ":</td><td>", first.rows$Clone_Sequence, "</td><td>", first.rows$normalized_read_count, "</td></tr>", sep=""), file="single_matches.html", append=T)
|
|
341 } else {
|
|
342 patient.fuzzy = patient.fuzzy[-1,]
|
|
343
|
|
344 #add to the scatterplot data
|
|
345 scatterplot.row = first.sum[,scatterplot_data_columns]
|
|
346 scatterplot.row$type = first.sum[,"Sample"]
|
|
347 scatterplot.row$link = link.count
|
|
348 scatterplot.row$on = onShort
|
|
349
|
|
350 scatterplot_data = rbind(scatterplot_data, scatterplot.row)
|
|
351 }
|
|
352 link.count = link.count + 1
|
|
353 }
|
|
354 patient.merge.list[[patient]] <<- patientMerge
|
|
355 patient.merge.list.second[[patient]] <<- merge.list[["second"]]
|
|
356
|
|
357 sample.order = data.frame(type = c(oneSample, twoSample, paste(c(oneSample, twoSample), "In Both")),type.order = 1:4)
|
|
358 scatterplot_data = merge(scatterplot_data, sample.order, by="type")
|
|
359
|
|
360 scatter_locus_data_list[[patient]] <<- scatterplot_data
|
|
361 cat(paste("<td>", nrow(patient1), " in ", oneSample, " and ", nrow(patient2), " in ", twoSample, ", ", nrow(patientMerge), " in both (finding both took ", (proc.time() - start.time)[[3]], "s)</td></tr>", sep=""), file=logfile, append=T)
|
|
362 }
|
|
363
|
|
364 patient1 = patient1[!(patient1$Clone_Sequence %in% patient.merge.list.second[[patient]]),]
|
|
365 patient2 = patient2[!(patient2$Clone_Sequence %in% patient.merge.list.second[[patient]]),]
|
|
366
|
|
367
|
|
368 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony])
|
|
369 #patientMerge$thresholdValue = pmin(patientMerge[,onx], patientMerge[,ony])
|
|
370 res1 = vector()
|
|
371 res2 = vector()
|
|
372 resBoth = vector()
|
|
373 read1Count = vector()
|
|
374 read2Count = vector()
|
|
375 locussum1 = vector()
|
|
376 locussum2 = vector()
|
|
377
|
|
378 #for(iter in 1){
|
|
379 for(iter in 1:length(product[,1])){
|
|
380 threshhold = product[iter,threshholdIndex]
|
|
381 V_Segment = paste(".*", as.character(product[iter,V_SegmentIndex]), ".*", sep="")
|
|
382 J_Segment = paste(".*", as.character(product[iter,J_SegmentIndex]), ".*", sep="")
|
|
383 #both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,onx] > threshhold & patientMerge[,ony] > threshhold) #both higher than threshold
|
|
384 both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold) #highest of both is higher than threshold
|
|
385 one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$merge %in% patientMerge[both,]$merge))
|
|
386 two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$merge %in% patientMerge[both,]$merge))
|
|
387 read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count))
|
|
388 read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count))
|
|
389 res1 = append(res1, sum(one))
|
|
390 res2 = append(res2, sum(two))
|
|
391 resBoth = append(resBoth, sum(both))
|
|
392 locussum1 = append(locussum1, sum(patient1[(grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene)),]$normalized_read_count))
|
|
393 locussum2 = append(locussum2, sum(patient2[(grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene)),]$normalized_read_count))
|
|
394 #threshhold = 0
|
|
395 if(threshhold != 0){
|
|
396 if(sum(one) > 0){
|
|
397 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
398 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone Sequence", "Related_to_leukemia_clone")
|
|
399 filenameOne = paste(oneSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
400 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
401 }
|
|
402 if(sum(two) > 0){
|
|
403 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
404 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone Sequence", "Related_to_leukemia_clone")
|
|
405 filenameTwo = paste(twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
406 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
407 }
|
|
408 } else {
|
|
409 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
|
|
410 if(nrow(scatterplot_locus_data) > 0){
|
|
411 scatterplot_locus_data$Rearrangement = product[iter, titleIndex]
|
|
412 }
|
|
413
|
|
414
|
|
415
|
|
416 p = NULL
|
|
417 print(paste("nrow scatterplot_locus_data", nrow(scatterplot_locus_data)))
|
|
418 if(nrow(scatterplot_locus_data) != 0){
|
|
419 if(on == "normalized_read_count"){
|
|
420 write.table(scatterplot_locus_data, file=paste(oneSample, twoSample, product[iter, titleIndex], "scatterplot_locus_data.txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
421 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
|
422 p = ggplot(scatterplot_locus_data, aes(factor(reorder(type, type.order)), normalized_read_count, group=link)) + geom_line() + scale_y_log10(breaks=scales,labels=scales, limits=c(1,1e6)) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE)
|
|
423 } else {
|
|
424 p = ggplot(scatterplot_locus_data, aes(factor(reorder(type, type.order)), Frequency, group=link)) + geom_line() + scale_y_log10(limits=c(0.0001,100), breaks=c(0.0001, 0.001, 0.01, 0.1, 1, 10, 100), labels=c("0.0001", "0.001", "0.01", "0.1", "1", "10", "100")) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE)
|
|
425 }
|
|
426 p = p + geom_point(aes(colour=type), position="dodge")
|
|
427 p = p + xlab("In one or both samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex]))
|
|
428 } else {
|
|
429 p = ggplot(NULL, aes(x=c("In one", "In Both"),y=0)) + geom_blank(NULL) + xlab("In one or both of the samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex]))
|
|
430 }
|
|
431 png(paste(patient1[1,patientIndex], "_", patient1[1,sampleIndex], "_", patient2[1,sampleIndex], "_", onShort, "_", product[iter, titleIndex],"_scatter.png", sep=""))
|
|
432 print(p)
|
|
433 dev.off()
|
|
434 }
|
|
435 if(sum(both) > 0){
|
|
436 dfBoth = patientMerge[both,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
437 colnames(dfBoth) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample))
|
|
438 filenameBoth = paste(oneSample, "_", twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
439 write.table(dfBoth, file=paste(filenameBoth, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
440 }
|
|
441 }
|
|
442 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "Both"=resBoth, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "Sum"=res1 + res2 + resBoth, "percentage" = round((resBoth/(res1 + res2 + resBoth)) * 100, digits=2), "Locus_sum1"=locussum1, "Locus_sum2"=locussum2)
|
|
443 if(sum(is.na(patientResult$percentage)) > 0){
|
|
444 patientResult[is.na(patientResult$percentage),]$percentage = 0
|
|
445 }
|
|
446 colnames(patientResult)[6] = oneSample
|
|
447 colnames(patientResult)[8] = twoSample
|
|
448 colnamesBak = colnames(patientResult)
|
|
449 colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", paste("Number of sequences ", patient, "_Both", sep=""), paste("Number of sequences", oneSample, sep=""), paste("Normalized Read Count", oneSample), paste("Number of sequences", twoSample, sep=""), paste("Normalized Read Count", twoSample), paste("Sum number of sequences", patient), paste("Percentage of sequences ", patient, "_Both", sep=""), paste("Locus Sum", oneSample), paste("Locus Sum", twoSample))
|
|
450 write.table(patientResult, file=paste(patient, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
451 colnames(patientResult) = colnamesBak
|
|
452
|
|
453 patientResult$Locus = factor(patientResult$Locus, Titles)
|
|
454 patientResult$cut_off_value = factor(patientResult$cut_off_value, paste(">", interval, sep=""))
|
|
455
|
|
456 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "Both")])
|
|
457 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=Both), stat='identity', position="dodge", fill="#79c36a")
|
|
458 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
459 plt = plt + geom_text(aes(ymax=max(Both), x=cut_off_value,y=Both,label=Both), angle=90, hjust=0)
|
|
460 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in both")
|
|
461 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
462 png(paste(patient, "_", onShort, ".png", sep=""), width=1920, height=1080)
|
|
463 print(plt)
|
|
464 dev.off()
|
|
465 #(t,r,b,l)
|
|
466 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "percentage")])
|
|
467 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=percentage), stat='identity', position="dodge", fill="#79c36a")
|
|
468 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
469 plt = plt + geom_text(aes(ymax=max(percentage), x=cut_off_value,y=percentage,label=percentage), angle=90, hjust=0)
|
|
470 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("% clones in both left and right")
|
|
471 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
472 png(paste(patient, "_percent_", onShort, ".png", sep=""), width=1920, height=1080)
|
|
473 print(plt)
|
|
474 dev.off()
|
|
475
|
|
476 patientResult = melt(patientResult[,c('Locus','cut_off_value', oneSample, twoSample)] ,id.vars=1:2)
|
|
477 patientResult$relativeValue = patientResult$value * 10
|
|
478 patientResult[patientResult$relativeValue == 0,]$relativeValue = 1
|
|
479 plt = ggplot(patientResult)
|
|
480 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=relativeValue, fill=variable), stat='identity', position="dodge")
|
|
481 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
482 plt = plt + scale_y_continuous(trans="log", breaks=10^c(0:10), labels=c(0, 10^c(0:9)))
|
|
483 plt = plt + geom_text(data=patientResult[patientResult$variable == oneSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=-0.2)
|
|
484 plt = plt + geom_text(data=patientResult[patientResult$variable == twoSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=0.8)
|
|
485 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle(paste("Number of clones in only ", oneSample, " and only ", twoSample, sep=""))
|
|
486 png(paste(patient, "_", onShort, "_both.png", sep=""), width=1920, height=1080)
|
|
487 print(plt)
|
|
488 dev.off()
|
|
489 }
|
1
|
490 if(length(patients) > 0){
|
|
491 cat("<tr><td>Starting Frequency analysis</td></tr>", file=logfile, append=T)
|
0
|
492
|
1
|
493 interval = intervalFreq
|
|
494 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
495 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
|
496 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T)
|
0
|
497
|
1
|
498 cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T)
|
0
|
499
|
1
|
500 interval = intervalReads
|
|
501 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
502 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
|
503 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="normalized_read_count")
|
|
504 }
|
0
|
505 if(nrow(single_patients) > 0){
|
|
506 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
|
507 p = ggplot(single_patients, aes(Rearrangement, normalized_read_count)) + scale_y_log10(breaks=scales,labels=as.character(scales)) + expand_limits(y=c(0,1000000))
|
|
508 p = p + geom_point(aes(colour=type), position="jitter")
|
|
509 p = p + xlab("In one or both samples") + ylab("Reads")
|
|
510 p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the reads of the patients with a single sample")
|
|
511 png("singles_reads_scatterplot.png", width=640 * length(unique(single_patients$Patient)) + 100, height=1080)
|
|
512 print(p)
|
|
513 dev.off()
|
|
514
|
|
515 #p = ggplot(single_patients, aes(Rearrangement, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
|
|
516 p = ggplot(single_patients, aes(Rearrangement, Frequency)) + scale_y_log10(limits=c(0.0001,100), breaks=c(0.0001, 0.001, 0.01, 0.1, 1, 10, 100), labels=c("0.0001", "0.001", "0.01", "0.1", "1", "10", "100")) + expand_limits(y=c(0,100))
|
|
517 p = p + geom_point(aes(colour=type), position="jitter")
|
|
518 p = p + xlab("In one or both samples") + ylab("Frequency")
|
|
519 p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the frequency of the patients with a single sample")
|
|
520 png("singles_freq_scatterplot.png", width=640 * length(unique(single_patients$Patient)) + 100, height=1080)
|
|
521 print(p)
|
|
522 dev.off()
|
|
523 } else {
|
|
524 empty <- data.frame()
|
|
525 p = ggplot(empty) + geom_point() + xlim(0, 10) + ylim(0, 100) + xlab("In one or both samples") + ylab("Frequency") + ggtitle("Scatterplot of the frequency of the patients with a single sample")
|
|
526
|
|
527 png("singles_reads_scatterplot.png", width=400, height=300)
|
|
528 print(p)
|
|
529 dev.off()
|
|
530
|
|
531 png("singles_freq_scatterplot.png", width=400, height=300)
|
|
532 print(p)
|
|
533 dev.off()
|
|
534 }
|
|
535
|
|
536 patient.merge.list = list() #cache the 'both' table, 2x speedup for more memory...
|
|
537 patient.merge.list.second = list()
|
|
538
|
|
539 tripletAnalysis <- function(patient1, label1, patient2, label2, patient3, label3, product, interval, on, appendTriplets= FALSE){
|
1
|
540 onShort = "reads"
|
|
541 if(on == "Frequency"){
|
|
542 onShort = "freq"
|
|
543 }
|
|
544 onx = paste(on, ".x", sep="")
|
|
545 ony = paste(on, ".y", sep="")
|
|
546 onz = paste(on, ".z", sep="")
|
|
547 type="triplet"
|
0
|
548
|
1
|
549 threshholdIndex = which(colnames(product) == "interval")
|
|
550 V_SegmentIndex = which(colnames(product) == "V_Segments")
|
|
551 J_SegmentIndex = which(colnames(product) == "J_Segments")
|
|
552 titleIndex = which(colnames(product) == "Titles")
|
|
553 sampleIndex = which(colnames(patient1) == "Sample")
|
|
554 patientIndex = which(colnames(patient1) == "Patient")
|
|
555 oneSample = paste(patient1[1,sampleIndex], sep="")
|
|
556 twoSample = paste(patient2[1,sampleIndex], sep="")
|
|
557 threeSample = paste(patient3[1,sampleIndex], sep="")
|
|
558
|
|
559 if(mergeOn == "Clone_Sequence"){
|
|
560 patient1$merge = paste(patient1$Clone_Sequence)
|
0
|
561 patient2$merge = paste(patient2$Clone_Sequence)
|
|
562 patient3$merge = paste(patient3$Clone_Sequence)
|
|
563
|
1
|
564 } else {
|
0
|
565 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
|
|
566 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
|
|
567 patient3$merge = paste(patient3$V_Segment_Major_Gene, patient3$J_Segment_Major_Gene, patient3$CDR3_Sense_Sequence)
|
1
|
568 }
|
0
|
569
|
1
|
570 #patientMerge = merge(patient1, patient2, by="merge")[NULL,]
|
|
571 patient1.fuzzy = patient1
|
|
572 patient2.fuzzy = patient2
|
|
573 patient3.fuzzy = patient3
|
0
|
574
|
1
|
575 cat(paste("<tr><td>", label1, "</td>", sep=""), file=logfile, append=T)
|
0
|
576
|
1
|
577 patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J)
|
|
578 patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J)
|
|
579 patient3.fuzzy$merge = paste(patient3.fuzzy$locus_V, patient3.fuzzy$locus_J)
|
0
|
580
|
1
|
581 patient.fuzzy = rbind(patient1.fuzzy ,patient2.fuzzy, patient3.fuzzy)
|
|
582 patient.fuzzy = patient.fuzzy[order(nchar(patient.fuzzy$Clone_Sequence)),]
|
0
|
583
|
1
|
584 other.sample.list = list()
|
|
585 other.sample.list[[oneSample]] = c(twoSample, threeSample)
|
|
586 other.sample.list[[twoSample]] = c(oneSample, threeSample)
|
|
587 other.sample.list[[threeSample]] = c(oneSample, twoSample)
|
0
|
588
|
1
|
589 patientMerge = merge(patient1, patient2, by="merge")
|
|
590 patientMerge = merge(patientMerge, patient3, by="merge")
|
|
591 colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge)))] = paste(colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge), perl=T))], ".z", sep="")
|
|
592 #patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
|
593 patientMerge = patientMerge[NULL,]
|
0
|
594
|
1
|
595 duo.merge.list = list()
|
0
|
596
|
1
|
597 patientMerge12 = merge(patient1, patient2, by="merge")
|
|
598 #patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony])
|
|
599 patientMerge12 = patientMerge12[NULL,]
|
|
600 duo.merge.list[[paste(oneSample, twoSample)]] = patientMerge12
|
|
601 duo.merge.list[[paste(twoSample, oneSample)]] = patientMerge12
|
0
|
602
|
1
|
603 patientMerge13 = merge(patient1, patient3, by="merge")
|
|
604 #patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony])
|
|
605 patientMerge13 = patientMerge13[NULL,]
|
|
606 duo.merge.list[[paste(oneSample, threeSample)]] = patientMerge13
|
|
607 duo.merge.list[[paste(threeSample, oneSample)]] = patientMerge13
|
0
|
608
|
1
|
609 patientMerge23 = merge(patient2, patient3, by="merge")
|
|
610 #patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony])
|
|
611 patientMerge23 = patientMerge23[NULL,]
|
|
612 duo.merge.list[[paste(twoSample, threeSample)]] = patientMerge23
|
|
613 duo.merge.list[[paste(threeSample, twoSample)]] = patientMerge23
|
0
|
614
|
1
|
615 merge.list = list()
|
|
616 merge.list[["second"]] = vector()
|
|
617
|
|
618 #print(paste(nrow(patient1), nrow(patient2), nrow(patient3), label1, label2, label3))
|
|
619
|
|
620 start.time = proc.time()
|
|
621 if(paste(label1, "123") %in% names(patient.merge.list)){
|
|
622 patientMerge = patient.merge.list[[paste(label1, "123")]]
|
|
623 patientMerge12 = patient.merge.list[[paste(label1, "12")]]
|
|
624 patientMerge13 = patient.merge.list[[paste(label1, "13")]]
|
|
625 patientMerge23 = patient.merge.list[[paste(label1, "23")]]
|
0
|
626
|
1
|
627 #merge.list[["second"]] = patient.merge.list.second[[label1]]
|
0
|
628
|
1
|
629 cat(paste("<td>", nrow(patient1), " in ", label1, " and ", nrow(patient2), " in ", label2, nrow(patient3), " in ", label3, ", ", nrow(patientMerge), " in both (fetched from cache)</td></tr>", sep=""), file=logfile, append=T)
|
|
630 } else {
|
|
631 while(nrow(patient.fuzzy) > 0){
|
|
632 first.merge = patient.fuzzy[1,"merge"]
|
|
633 first.clone.sequence = patient.fuzzy[1,"Clone_Sequence"]
|
|
634 first.sample = paste(patient.fuzzy[1,"Sample"], sep="")
|
|
635
|
|
636 merge.filter = first.merge == patient.fuzzy$merge
|
|
637
|
|
638 second.sample = other.sample.list[[first.sample]][1]
|
|
639 third.sample = other.sample.list[[first.sample]][2]
|
0
|
640
|
1
|
641 sample.filter.1 = first.sample == patient.fuzzy$Sample
|
|
642 sample.filter.2 = second.sample == patient.fuzzy$Sample
|
|
643 sample.filter.3 = third.sample == patient.fuzzy$Sample
|
0
|
644
|
1
|
645 sequence.filter = grepl(paste("^", first.clone.sequence, sep=""), patient.fuzzy$Clone_Sequence)
|
0
|
646
|
1
|
647 match.filter.1 = sample.filter.1 & sequence.filter & merge.filter
|
|
648 match.filter.2 = sample.filter.2 & sequence.filter & merge.filter
|
|
649 match.filter.3 = sample.filter.3 & sequence.filter & merge.filter
|
0
|
650
|
1
|
651 matches.in.1 = any(match.filter.1)
|
|
652 matches.in.2 = any(match.filter.2)
|
|
653 matches.in.3 = any(match.filter.3)
|
0
|
654
|
1
|
655 rows.1 = patient.fuzzy[match.filter.1,]
|
0
|
656
|
1
|
657 sum.1 = data.frame(merge = first.clone.sequence,
|
|
658 Patient = label1,
|
|
659 Receptor = rows.1[1,"Receptor"],
|
|
660 Sample = rows.1[1,"Sample"],
|
|
661 Cell_Count = rows.1[1,"Cell_Count"],
|
|
662 Clone_Molecule_Count_From_Spikes = sum(rows.1$Clone_Molecule_Count_From_Spikes),
|
|
663 Log10_Frequency = log10(sum(rows.1$Frequency)),
|
|
664 Total_Read_Count = sum(rows.1$Total_Read_Count),
|
|
665 dsPerM = sum(rows.1$dsPerM),
|
|
666 J_Segment_Major_Gene = rows.1[1,"J_Segment_Major_Gene"],
|
|
667 V_Segment_Major_Gene = rows.1[1,"V_Segment_Major_Gene"],
|
|
668 Clone_Sequence = first.clone.sequence,
|
|
669 CDR3_Sense_Sequence = rows.1[1,"CDR3_Sense_Sequence"],
|
|
670 Related_to_leukemia_clone = F,
|
|
671 Frequency = sum(rows.1$Frequency),
|
|
672 locus_V = rows.1[1,"locus_V"],
|
|
673 locus_J = rows.1[1,"locus_J"],
|
|
674 uniqueID = rows.1[1,"uniqueID"],
|
|
675 normalized_read_count = sum(rows.1$normalized_read_count))
|
|
676 sum.2 = sum.1[NULL,]
|
|
677 rows.2 = patient.fuzzy[match.filter.2,]
|
|
678 if(matches.in.2){
|
|
679 sum.2 = data.frame(merge = first.clone.sequence,
|
|
680 Patient = label1,
|
|
681 Receptor = rows.2[1,"Receptor"],
|
|
682 Sample = rows.2[1,"Sample"],
|
|
683 Cell_Count = rows.2[1,"Cell_Count"],
|
|
684 Clone_Molecule_Count_From_Spikes = sum(rows.2$Clone_Molecule_Count_From_Spikes),
|
|
685 Log10_Frequency = log10(sum(rows.2$Frequency)),
|
|
686 Total_Read_Count = sum(rows.2$Total_Read_Count),
|
|
687 dsPerM = sum(rows.2$dsPerM),
|
|
688 J_Segment_Major_Gene = rows.2[1,"J_Segment_Major_Gene"],
|
|
689 V_Segment_Major_Gene = rows.2[1,"V_Segment_Major_Gene"],
|
|
690 Clone_Sequence = first.clone.sequence,
|
|
691 CDR3_Sense_Sequence = rows.2[1,"CDR3_Sense_Sequence"],
|
|
692 Related_to_leukemia_clone = F,
|
|
693 Frequency = sum(rows.2$Frequency),
|
|
694 locus_V = rows.2[1,"locus_V"],
|
|
695 locus_J = rows.2[1,"locus_J"],
|
|
696 uniqueID = rows.2[1,"uniqueID"],
|
|
697 normalized_read_count = sum(rows.2$normalized_read_count))
|
|
698 }
|
0
|
699
|
1
|
700 sum.3 = sum.1[NULL,]
|
|
701 rows.3 = patient.fuzzy[match.filter.3,]
|
|
702 if(matches.in.3){
|
|
703 sum.3 = data.frame(merge = first.clone.sequence,
|
|
704 Patient = label1,
|
|
705 Receptor = rows.3[1,"Receptor"],
|
|
706 Sample = rows.3[1,"Sample"],
|
|
707 Cell_Count = rows.3[1,"Cell_Count"],
|
|
708 Clone_Molecule_Count_From_Spikes = sum(rows.3$Clone_Molecule_Count_From_Spikes),
|
|
709 Log10_Frequency = log10(sum(rows.3$Frequency)),
|
|
710 Total_Read_Count = sum(rows.3$Total_Read_Count),
|
|
711 dsPerM = sum(rows.3$dsPerM),
|
|
712 J_Segment_Major_Gene = rows.3[1,"J_Segment_Major_Gene"],
|
|
713 V_Segment_Major_Gene = rows.3[1,"V_Segment_Major_Gene"],
|
|
714 Clone_Sequence = first.clone.sequence,
|
|
715 CDR3_Sense_Sequence = rows.3[1,"CDR3_Sense_Sequence"],
|
|
716 Related_to_leukemia_clone = F,
|
|
717 Frequency = sum(rows.3$Frequency),
|
|
718 locus_V = rows.3[1,"locus_V"],
|
|
719 locus_J = rows.3[1,"locus_J"],
|
|
720 uniqueID = rows.3[1,"uniqueID"],
|
|
721 normalized_read_count = sum(rows.3$normalized_read_count))
|
|
722 }
|
0
|
723
|
1
|
724 if(matches.in.2 & matches.in.3){
|
|
725 merge.123 = merge(sum.1, sum.2, by="merge")
|
|
726 merge.123 = merge(merge.123, sum.3, by="merge")
|
|
727 colnames(merge.123)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(merge.123)))] = paste(colnames(merge.123)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(merge.123), perl=T))], ".z", sep="")
|
|
728 #merge.123$thresholdValue = pmax(merge.123[,onx], merge.123[,ony], merge.123[,onz])
|
0
|
729
|
1
|
730 patientMerge = rbind(patientMerge, merge.123)
|
|
731 patient.fuzzy = patient.fuzzy[!(match.filter.1 | match.filter.2 | match.filter.3),]
|
0
|
732
|
1
|
733 hidden.clone.sequences = c(rows.1[-1,"Clone_Sequence"], rows.2[rows.2$Clone_Sequence != first.clone.sequence,"Clone_Sequence"], rows.3[rows.3$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
|
|
734 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
0
|
735
|
1
|
736 } else if (matches.in.2) {
|
|
737 #other.sample1 = other.sample.list[[first.sample]][1]
|
|
738 #other.sample2 = other.sample.list[[first.sample]][2]
|
0
|
739
|
1
|
740 second.sample = sum.2[,"Sample"]
|
0
|
741
|
1
|
742 current.merge.list = duo.merge.list[[paste(first.sample, second.sample)]]
|
0
|
743
|
1
|
744 merge.12 = merge(sum.1, sum.2, by="merge")
|
0
|
745
|
1
|
746 current.merge.list = rbind(current.merge.list, merge.12)
|
|
747 duo.merge.list[[paste(first.sample, second.sample)]] = current.merge.list
|
0
|
748
|
1
|
749 patient.fuzzy = patient.fuzzy[!(match.filter.1 | match.filter.2),]
|
0
|
750
|
1
|
751 hidden.clone.sequences = c(rows.1[-1,"Clone_Sequence"], rows.2[rows.2$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
|
|
752 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
0
|
753
|
1
|
754 } else if (matches.in.3) {
|
0
|
755
|
1
|
756 #other.sample1 = other.sample.list[[first.sample]][1]
|
|
757 #other.sample2 = other.sample.list[[first.sample]][2]
|
0
|
758
|
1
|
759 second.sample = sum.3[,"Sample"]
|
0
|
760
|
1
|
761 current.merge.list = duo.merge.list[[paste(first.sample, second.sample)]]
|
0
|
762
|
1
|
763 merge.13 = merge(sum.1, sum.3, by="merge")
|
0
|
764
|
1
|
765 current.merge.list = rbind(current.merge.list, merge.13)
|
|
766 duo.merge.list[[paste(first.sample, second.sample)]] = current.merge.list
|
0
|
767
|
1
|
768 patient.fuzzy = patient.fuzzy[!(match.filter.1 | match.filter.3),]
|
0
|
769
|
1
|
770 hidden.clone.sequences = c(rows.1[-1,"Clone_Sequence"], rows.3[rows.3$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
|
|
771 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
0
|
772
|
1
|
773 } else if(nrow(rows.1) > 1){
|
|
774 patient1 = patient1[!(patient1$Clone_Sequence %in% rows.1$Clone_Sequence),]
|
|
775 print(names(patient1)[names(patient1) %in% sum.1])
|
|
776 print(names(patient1)[!(names(patient1) %in% sum.1)])
|
|
777 print(names(patient1))
|
|
778 print(names(sum.1))
|
|
779 print(summary(sum.1))
|
|
780 print(summary(patient1))
|
|
781 print(dim(sum.1))
|
|
782 print(dim(patient1))
|
|
783 print(head(sum.1[,names(patient1)]))
|
|
784 patient1 = rbind(patient1, sum.1[,names(patient1)])
|
|
785 patient.fuzzy = patient.fuzzy[-match.filter.1,]
|
|
786 } else {
|
|
787 patient.fuzzy = patient.fuzzy[-1,]
|
|
788 }
|
|
789
|
|
790 tmp.rows = rbind(rows.1, rows.2, rows.3)
|
|
791 tmp.rows = tmp.rows[order(nchar(tmp.rows$Clone_Sequence)),]
|
0
|
792
|
1
|
793 if (sum(match.filter.1) > 1 | sum(match.filter.2) > 1 | sum(match.filter.1) > 1) {
|
|
794 cat(paste("<tr><td>", label1, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="multiple_matches.html", append=T)
|
|
795 } else {
|
|
796 }
|
|
797
|
|
798 }
|
|
799 patient.merge.list[[paste(label1, "123")]] = patientMerge
|
|
800
|
|
801 patientMerge12 = duo.merge.list[[paste(oneSample, twoSample)]]
|
|
802 patientMerge13 = duo.merge.list[[paste(oneSample, threeSample)]]
|
|
803 patientMerge23 = duo.merge.list[[paste(twoSample, threeSample)]]
|
0
|
804
|
1
|
805 patient.merge.list[[paste(label1, "12")]] = patientMerge12
|
|
806 patient.merge.list[[paste(label1, "13")]] = patientMerge13
|
|
807 patient.merge.list[[paste(label1, "23")]] = patientMerge23
|
|
808
|
|
809 #patient.merge.list.second[[label1]] = merge.list[["second"]]
|
|
810 }
|
|
811 cat(paste("<td>", nrow(patient1), " in ", label1, " and ", nrow(patient2), " in ", label2, nrow(patient3), " in ", label3, ", ", nrow(patientMerge), " in both (finding both took ", (proc.time() - start.time)[[3]], "s)</td></tr>", sep=""), file=logfile, append=T)
|
|
812 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
|
813 patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony])
|
|
814 patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony])
|
|
815 patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony])
|
|
816
|
|
817 #patientMerge$thresholdValue = pmin(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
|
818 #patientMerge12$thresholdValue = pmin(patientMerge12[,onx], patientMerge12[,ony])
|
|
819 #patientMerge13$thresholdValue = pmin(patientMerge13[,onx], patientMerge13[,ony])
|
|
820 #patientMerge23$thresholdValue = pmin(patientMerge23[,onx], patientMerge23[,ony])
|
0
|
821
|
1
|
822 patient1 = patient1[!(patient1$Clone_Sequence %in% merge.list[["second"]]),]
|
|
823 patient2 = patient2[!(patient2$Clone_Sequence %in% merge.list[["second"]]),]
|
|
824 patient3 = patient3[!(patient3$Clone_Sequence %in% merge.list[["second"]]),]
|
0
|
825
|
1
|
826 if(F){
|
|
827 patientMerge = merge(patient1, patient2, by="merge")
|
|
828 patientMerge = merge(patientMerge, patient3, by="merge")
|
|
829 colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge)))] = paste(colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge), perl=T))], ".z", sep="")
|
|
830 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
|
831 patientMerge12 = merge(patient1, patient2, by="merge")
|
|
832 patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony])
|
|
833 patientMerge13 = merge(patient1, patient3, by="merge")
|
|
834 patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony])
|
|
835 patientMerge23 = merge(patient2, patient3, by="merge")
|
|
836 patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony])
|
|
837 }
|
0
|
838
|
1
|
839 scatterplot_data_columns = c("Clone_Sequence", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge")
|
|
840 scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns], patient3[,scatterplot_data_columns])
|
|
841 scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),]
|
|
842
|
|
843 scatterplot_data$type = factor(x="In one", levels=c("In one", "In two", "In three", "In multiple"))
|
0
|
844
|
1
|
845 res1 = vector()
|
|
846 res2 = vector()
|
|
847 res3 = vector()
|
|
848 res12 = vector()
|
|
849 res13 = vector()
|
|
850 res23 = vector()
|
|
851 resAll = vector()
|
|
852 read1Count = vector()
|
|
853 read2Count = vector()
|
|
854 read3Count = vector()
|
0
|
855
|
1
|
856 if(appendTriplets){
|
|
857 cat(paste(label1, label2, label3, sep="\t"), file="triplets.txt", append=T, sep="", fill=3)
|
|
858 }
|
|
859 for(iter in 1:length(product[,1])){
|
|
860 threshhold = product[iter,threshholdIndex]
|
|
861 V_Segment = paste(".*", as.character(product[iter,V_SegmentIndex]), ".*", sep="")
|
|
862 J_Segment = paste(".*", as.character(product[iter,J_SegmentIndex]), ".*", sep="")
|
|
863 #all = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,onx] > threshhold & patientMerge[,ony] > threshhold & patientMerge[,onz] > threshhold)
|
|
864 all = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold)
|
0
|
865
|
1
|
866 one_two = (grepl(V_Segment, patientMerge12$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge12$J_Segment_Major_Gene.x) & patientMerge12$thresholdValue > threshhold & !(patientMerge12$merge %in% patientMerge[all,]$merge))
|
|
867 one_three = (grepl(V_Segment, patientMerge13$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge13$J_Segment_Major_Gene.x) & patientMerge13$thresholdValue > threshhold & !(patientMerge13$merge %in% patientMerge[all,]$merge))
|
|
868 two_three = (grepl(V_Segment, patientMerge23$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge23$J_Segment_Major_Gene.x) & patientMerge23$thresholdValue > threshhold & !(patientMerge23$merge %in% patientMerge[all,]$merge))
|
|
869
|
|
870 one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$merge %in% patientMerge[all,]$merge) & !(patient1$merge %in% patientMerge12[one_two,]$merge) & !(patient1$merge %in% patientMerge13[one_three,]$merge))
|
|
871 two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$merge %in% patientMerge[all,]$merge) & !(patient2$merge %in% patientMerge12[one_two,]$merge) & !(patient2$merge %in% patientMerge23[two_three,]$merge))
|
|
872 three = (grepl(V_Segment, patient3$V_Segment_Major_Gene) & grepl(J_Segment, patient3$J_Segment_Major_Gene) & patient3[,on] > threshhold & !(patient3$merge %in% patientMerge[all,]$merge) & !(patient3$merge %in% patientMerge13[one_three,]$merge) & !(patient3$merge %in% patientMerge23[two_three,]$merge))
|
0
|
873
|
1
|
874 read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.x))
|
|
875 read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.y))
|
|
876 read3Count = append(read3Count, sum(patient3[three,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.z))
|
|
877 res1 = append(res1, sum(one))
|
|
878 res2 = append(res2, sum(two))
|
|
879 res3 = append(res3, sum(three))
|
|
880 resAll = append(resAll, sum(all))
|
|
881 res12 = append(res12, sum(one_two))
|
|
882 res13 = append(res13, sum(one_three))
|
|
883 res23 = append(res23, sum(two_three))
|
|
884 #threshhold = 0
|
|
885 if(threshhold != 0){
|
|
886 if(sum(one) > 0){
|
|
887 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
888 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
|
889 filenameOne = paste(label1, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
890 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
891 }
|
|
892 if(sum(two) > 0){
|
|
893 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
894 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
|
895 filenameTwo = paste(label2, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
896 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
897 }
|
|
898 if(sum(three) > 0){
|
|
899 dfThree = patient3[three,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
900 colnames(dfThree) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
|
901 filenameThree = paste(label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
902 write.table(dfThree, file=paste(filenameThree, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
903 }
|
|
904 if(sum(one_two) > 0){
|
|
905 dfOne_two = patientMerge12[one_two,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
906 colnames(dfOne_two) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample))
|
|
907 filenameOne_two = paste(label1, "_", label2, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
908 write.table(dfOne_two, file=paste(filenameOne_two, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
909 }
|
|
910 if(sum(one_three) > 0){
|
|
911 dfOne_three = patientMerge13[one_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
912 colnames(dfOne_three) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
|
913 filenameOne_three = paste(label1, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
914 write.table(dfOne_three, file=paste(filenameOne_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
915 }
|
|
916 if(sum(two_three) > 0){
|
|
917 dfTwo_three = patientMerge23[two_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
918 colnames(dfTwo_three) = c(paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
|
919 filenameTwo_three = paste(label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
920 write.table(dfTwo_three, file=paste(filenameTwo_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
921 }
|
|
922 } else { #scatterplot data
|
|
923 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
|
|
924 scatterplot_locus_data = scatterplot_locus_data[!(scatterplot_locus_data$merge %in% merge.list[["second"]]),]
|
|
925 in_two = (scatterplot_locus_data$merge %in% patientMerge12[one_two,]$merge) | (scatterplot_locus_data$merge %in% patientMerge13[one_three,]$merge) | (scatterplot_locus_data$merge %in% patientMerge23[two_three,]$merge)
|
|
926 if(sum(in_two) > 0){
|
|
927 scatterplot_locus_data[in_two,]$type = "In two"
|
|
928 }
|
|
929 in_three = (scatterplot_locus_data$merge %in% patientMerge[all,]$merge)
|
|
930 if(sum(in_three)> 0){
|
|
931 scatterplot_locus_data[in_three,]$type = "In three"
|
|
932 }
|
|
933 not_in_one = scatterplot_locus_data$type != "In one"
|
|
934 if(sum(not_in_one) > 0){
|
|
935 #scatterplot_locus_data[not_in_one,]$type = "In multiple"
|
|
936 }
|
|
937 p = NULL
|
|
938 if(nrow(scatterplot_locus_data) != 0){
|
|
939 if(on == "normalized_read_count"){
|
|
940 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
|
941 p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales, limits=c(1, 1e6))
|
|
942 } else {
|
|
943 p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_log10(limits=c(0.0001,100), breaks=c(0.0001, 0.001, 0.01, 0.1, 1, 10, 100), labels=c("0.0001", "0.001", "0.01", "0.1", "1", "10", "100")) + expand_limits(y=c(0,100))
|
|
944 #p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
|
|
945 }
|
|
946 p = p + geom_point(aes(colour=type), position="jitter")
|
|
947 p = p + xlab("In one or in multiple samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex]))
|
|
948 } else {
|
|
949 p = ggplot(NULL, aes(x=c("In one", "In multiple"),y=0)) + geom_blank(NULL) + xlab("In two or in three of the samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex]))
|
|
950 }
|
|
951 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_", product[iter, titleIndex],"_scatter.png", sep=""))
|
|
952 print(p)
|
|
953 dev.off()
|
|
954 }
|
|
955 if(sum(all) > 0){
|
|
956 dfAll = patientMerge[all,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y", "V_Segment_Major_Gene.z", "J_Segment_Major_Gene.z", "normalized_read_count.z", "Frequency.z", "Related_to_leukemia_clone.z")]
|
|
957 colnames(dfAll) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample), paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
|
958 filenameAll = paste(label1, "_", label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
959 write.table(dfAll, file=paste(filenameAll, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
960 }
|
|
961 }
|
|
962 #patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "tmp3"=res3, "read_count3"=round(read3Count))
|
|
963 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "tmp2"=res2, "tmp3"=res3, "tmp12"=res12, "tmp13"=res13, "tmp23"=res23)
|
|
964 colnames(patientResult)[6] = oneSample
|
|
965 colnames(patientResult)[7] = twoSample
|
|
966 colnames(patientResult)[8] = threeSample
|
|
967 colnames(patientResult)[9] = paste(oneSample, twoSample, sep="_")
|
|
968 colnames(patientResult)[10] = paste(oneSample, twoSample, sep="_")
|
|
969 colnames(patientResult)[11] = paste(oneSample, twoSample, sep="_")
|
|
970
|
|
971 colnamesBak = colnames(patientResult)
|
|
972 colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", "Number of sequences All", paste("Number of sequences", oneSample), paste("Number of sequences", twoSample), paste("Number of sequences", threeSample), paste("Number of sequences", oneSample, twoSample), paste("Number of sequences", oneSample, threeSample), paste("Number of sequences", twoSample, threeSample))
|
|
973 write.table(patientResult, file=paste(label1, "_", label2, "_", label3, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
974 colnames(patientResult) = colnamesBak
|
|
975
|
|
976 patientResult$Locus = factor(patientResult$Locus, Titles)
|
|
977 patientResult$cut_off_value = factor(patientResult$cut_off_value, paste(">", interval, sep=""))
|
|
978
|
|
979 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "All")])
|
|
980 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=All), stat='identity', position="dodge", fill="#79c36a")
|
|
981 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
982 plt = plt + geom_text(aes(ymax=max(All), x=cut_off_value,y=All,label=All), angle=90, hjust=0)
|
|
983 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in All")
|
|
984 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
985 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_total_all.png", sep=""), width=1920, height=1080)
|
|
986 print(plt)
|
|
987 dev.off()
|
|
988
|
|
989 fontSize = 4
|
|
990
|
|
991 bak = patientResult
|
|
992 patientResult = melt(patientResult[,c('Locus','cut_off_value', oneSample, twoSample, threeSample)] ,id.vars=1:2)
|
|
993 patientResult$relativeValue = patientResult$value * 10
|
|
994 patientResult[patientResult$relativeValue == 0,]$relativeValue = 1
|
|
995 plt = ggplot(patientResult)
|
|
996 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=relativeValue, fill=variable), stat='identity', position="dodge")
|
|
997 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
998 plt = plt + scale_y_continuous(trans="log", breaks=10^c(0:10), labels=c(0, 10^c(0:9)))
|
|
999 plt = plt + geom_text(data=patientResult[patientResult$variable == oneSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=-0.7, size=fontSize)
|
|
1000 plt = plt + geom_text(data=patientResult[patientResult$variable == twoSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=0.4, size=fontSize)
|
|
1001 plt = plt + geom_text(data=patientResult[patientResult$variable == threeSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=1.5, size=fontSize)
|
|
1002 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in only one sample")
|
|
1003 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_indiv_all.png", sep=""), width=1920, height=1080)
|
|
1004 print(plt)
|
|
1005 dev.off()
|
0
|
1006 }
|
|
1007
|
|
1008 if(nrow(triplets) != 0){
|
|
1009
|
1
|
1010 cat("<tr><td>Starting triplet analysis</td></tr>", file=logfile, append=T)
|
|
1011
|
|
1012 triplets$uniqueID = paste(triplets$Patient, triplets$Sample, sep="_")
|
|
1013
|
|
1014 cat("<tr><td>Normalizing to lowest cell count within locus</td></tr>", file=logfile, append=T)
|
0
|
1015
|
1
|
1016 triplets$locus_V = substring(triplets$V_Segment_Major_Gene, 0, 4)
|
|
1017 triplets$locus_J = substring(triplets$J_Segment_Major_Gene, 0, 4)
|
|
1018 min_cell_count = data.frame(data.table(triplets)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("uniqueID", "locus_V", "locus_J")])
|
0
|
1019
|
1
|
1020 triplets$min_cell_paste = paste(triplets$uniqueID, triplets$locus_V, triplets$locus_J)
|
|
1021 min_cell_count$min_cell_paste = paste(min_cell_count$uniqueID, min_cell_count$locus_V, min_cell_count$locus_J)
|
0
|
1022
|
1
|
1023 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
|
|
1024
|
|
1025 triplets = merge(triplets, min_cell_count, by="min_cell_paste")
|
|
1026
|
|
1027 triplets$normalized_read_count = round(triplets$Clone_Molecule_Count_From_Spikes / triplets$Cell_Count * triplets$min_cell_count / 2, digits=2)
|
0
|
1028
|
1
|
1029 triplets = triplets[triplets$normalized_read_count >= min_cells,]
|
|
1030
|
|
1031 column_drops = c("min_cell_count", "min_cell_paste")
|
|
1032
|
|
1033 triplets = triplets[,!(colnames(triplets) %in% column_drops)]
|
|
1034
|
|
1035 cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T)
|
0
|
1036
|
1
|
1037 interval = intervalReads
|
|
1038 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
1039 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
0
|
1040
|
1
|
1041 triplets = split(triplets, triplets$Patient, drop=T)
|
|
1042 print(nrow(triplets))
|
|
1043 for(triplet in triplets){
|
|
1044 samples = unique(triplet$Sample)
|
|
1045 one = triplet[triplet$Sample == samples[1],]
|
|
1046 two = triplet[triplet$Sample == samples[2],]
|
|
1047 three = triplet[triplet$Sample == samples[3],]
|
|
1048
|
|
1049 print(paste(nrow(triplet), nrow(one), nrow(two), nrow(three)))
|
|
1050 tripletAnalysis(one, one[1,"uniqueID"], two, two[1,"uniqueID"], three, three[1,"uniqueID"], product=product, interval=interval, on="normalized_read_count", T)
|
|
1051 }
|
|
1052
|
|
1053 cat("<tr><td>Starting Frequency analysis</td></tr>", file=logfile, append=T)
|
|
1054
|
|
1055 interval = intervalFreq
|
|
1056 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
1057 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
|
1058
|
|
1059 for(triplet in triplets){
|
|
1060 samples = unique(triplet$Sample)
|
|
1061 one = triplet[triplet$Sample == samples[1],]
|
|
1062 two = triplet[triplet$Sample == samples[2],]
|
|
1063 three = triplet[triplet$Sample == samples[3],]
|
|
1064 tripletAnalysis(one, one[1,"uniqueID"], two, two[1,"uniqueID"], three, three[1,"uniqueID"], product=product, interval=interval, on="Frequency", F)
|
|
1065 }
|
0
|
1066 } else {
|
|
1067 cat("", file="triplets.txt")
|
|
1068 }
|
|
1069 cat("</table></html>", file=logfile, append=T)
|