Mercurial > repos > davidvanzessen > prisca
comparison RScript.r @ 0:ed6885c85660 draft
Uploaded
author | davidvanzessen |
---|---|
date | Wed, 31 Aug 2016 05:31:47 -0400 |
parents | |
children | 75853bceec00 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:ed6885c85660 |
---|---|
1 args <- commandArgs(trailingOnly = TRUE) | |
2 options(scipen=999) | |
3 | |
4 inFile = args[1] | |
5 outDir = args[2] | |
6 logfile = args[3] | |
7 min_freq = as.numeric(args[4]) | |
8 min_cells = as.numeric(args[5]) | |
9 mergeOn = args[6] | |
10 | |
11 cat("<html><table><tr><td>Starting analysis</td></tr>", file=logfile, append=F) | |
12 | |
13 library(ggplot2) | |
14 library(reshape2) | |
15 library(data.table) | |
16 library(grid) | |
17 library(parallel) | |
18 #require(xtable) | |
19 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T) | |
20 dat = read.table(inFile, header=T, sep="\t", dec=".", fill=T, stringsAsFactors=F) | |
21 dat = dat[,c("Patient", "Receptor", "Sample", "Cell_Count", "Clone_Molecule_Count_From_Spikes", "Log10_Frequency", "Total_Read_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence", "Clone_Sequence")] | |
22 dat$dsPerM = 0 | |
23 dat = dat[!is.na(dat$Patient),] | |
24 dat$Related_to_leukemia_clone = F | |
25 | |
26 setwd(outDir) | |
27 cat("<tr><td>Selecting first V/J Genes</td></tr>", file=logfile, append=T) | |
28 dat$V_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$V_Segment_Major_Gene), "; "), "[[", 1))) | |
29 dat$J_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$J_Segment_Major_Gene), "; "), "[[", 1))) | |
30 | |
31 cat("<tr><td>Calculating Frequency</td></tr>", file=logfile, append=T) | |
32 | |
33 dat$Frequency = ((10^dat$Log10_Frequency)*100) | |
34 | |
35 dat = dat[dat$Frequency >= min_freq,] | |
36 | |
37 triplets = dat[grepl("VanDongen_cALL_14696", dat$Patient) | grepl("(16278)|(26402)|(26759)", dat$Sample),] | |
38 | |
39 cat("<tr><td>Normalizing to lowest cell count within locus</td></tr>", file=logfile, append=T) | |
40 | |
41 dat$locus_V = substring(dat$V_Segment_Major_Gene, 0, 4) | |
42 dat$locus_J = substring(dat$J_Segment_Major_Gene, 0, 4) | |
43 min_cell_count = data.frame(data.table(dat)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("Patient", "locus_V", "locus_J")]) | |
44 | |
45 dat$min_cell_paste = paste(dat$Patient, dat$locus_V, dat$locus_J) | |
46 min_cell_count$min_cell_paste = paste(min_cell_count$Patient, min_cell_count$locus_V, min_cell_count$locus_J) | |
47 | |
48 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")] | |
49 print(paste("rows:", nrow(dat))) | |
50 dat = merge(dat, min_cell_count, by="min_cell_paste") | |
51 print(paste("rows:", nrow(dat))) | |
52 dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * dat$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2 | |
53 | |
54 dat = dat[dat$normalized_read_count >= min_cells,] | |
55 | |
56 dat$paste = paste(dat$Sample, dat$Clone_Sequence) | |
57 | |
58 patients = split(dat, dat$Patient, drop=T) | |
59 intervalReads = rev(c(0,10,25,50,100,250,500,750,1000,10000)) | |
60 intervalFreq = rev(c(0,0.01,0.05,0.1,0.5,1,5)) | |
61 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV") | |
62 J_Segments = c(".*", ".*", ".*", "IGKJ", "KDE", ".*", ".*", ".*", ".*", ".*") | |
63 Titles = c("Total", "IGH-Vh-Jh", "IGH-Dh-Jh", "Vk-Jk", "Vk-Kde" , "Intron-Kde", "TCRG", "TCRD-Vd-Dd", "TCRD-Dd-Dd", "TCRB-Vb-Jb") | |
64 Titles = factor(Titles, levels=Titles) | |
65 TitlesOrder = data.frame("Title"=Titles, "TitlesOrder"=1:length(Titles)) | |
66 | |
67 single_patients = data.frame("Patient" = character(0),"Sample" = character(0), "on" = character(0), "Clone_Sequence" = character(0), "Frequency" = numeric(0), "normalized_read_count" = numeric(0), "V_Segment_Major_Gene" = character(0), "J_Segment_Major_Gene" = character(0), "Rearrangement" = character(0)) | |
68 | |
69 patient.merge.list = list() #cache the 'both' table, 2x speedup for more memory... | |
70 patient.merge.list.second = list() | |
71 scatter_locus_data_list = list() | |
72 cat(paste("<table border='0' style='font-family:courier;'>", sep=""), file="multiple_matches.html", append=T) | |
73 cat(paste("<table border='0' style='font-family:courier;'>", sep=""), file="single_matches.html", append=T) | |
74 patientCountOnColumn <- function(x, product, interval, on, appendtxt=F){ | |
75 if (!is.data.frame(x) & is.list(x)){ | |
76 x = x[[1]] | |
77 } | |
78 #x$Sample = factor(x$Sample, levels=unique(x$Sample)) | |
79 x = data.frame(x,stringsAsFactors=F) | |
80 onShort = "reads" | |
81 if(on == "Frequency"){ | |
82 onShort = "freq" | |
83 } | |
84 onx = paste(on, ".x", sep="") | |
85 ony = paste(on, ".y", sep="") | |
86 splt = split(x, x$Sample, drop=T) | |
87 type="pair" | |
88 if(length(splt) == 1){ | |
89 print(paste(paste(x[1,which(colnames(x) == "Patient")]), "has one sample")) | |
90 splt[[2]] = data.frame("Patient" = character(0), "Receptor" = character(0), "Sample" = character(0), "Cell_Count" = numeric(0), "Clone_Molecule_Count_From_Spikes" = numeric(0), "Log10_Frequency" = numeric(0), "Total_Read_Count" = numeric(0), "dsMol_per_1e6_cells" = numeric(0), "J_Segment_Major_Gene" = character(0), "V_Segment_Major_Gene" = character(0), "Clone_Sequence" = character(0), "CDR3_Sense_Sequence" = character(0), "Related_to_leukemia_clone" = logical(0), "Frequency"= numeric(0), "normalized_read_count" = numeric(0), "paste" = character(0)) | |
91 type="single" | |
92 } | |
93 patient1 = splt[[1]] | |
94 patient2 = splt[[2]] | |
95 | |
96 threshholdIndex = which(colnames(product) == "interval") | |
97 V_SegmentIndex = which(colnames(product) == "V_Segments") | |
98 J_SegmentIndex = which(colnames(product) == "J_Segments") | |
99 titleIndex = which(colnames(product) == "Titles") | |
100 sampleIndex = which(colnames(x) == "Sample") | |
101 patientIndex = which(colnames(x) == "Patient") | |
102 oneSample = paste(patient1[1,sampleIndex], sep="") | |
103 twoSample = paste(patient2[1,sampleIndex], sep="") | |
104 patient = paste(x[1,patientIndex]) | |
105 | |
106 switched = F | |
107 if(length(grep(".*_Right$", twoSample)) == 1 || length(grep(".*_Dx_BM$", twoSample)) == 1 || length(grep(".*_Dx$", twoSample)) == 1 ){ | |
108 tmp = twoSample | |
109 twoSample = oneSample | |
110 oneSample = tmp | |
111 tmp = patient1 | |
112 patient1 = patient2 | |
113 patient2 = tmp | |
114 switched = T | |
115 } | |
116 if(appendtxt){ | |
117 cat(paste(patient, oneSample, twoSample, type, sep="\t"), file="patients.txt", append=T, sep="", fill=3) | |
118 } | |
119 cat(paste("<tr><td>", patient, "</td>", sep=""), file=logfile, append=T) | |
120 | |
121 if(mergeOn == "Clone_Sequence"){ | |
122 patient1$merge = paste(patient1$Clone_Sequence) | |
123 patient2$merge = paste(patient2$Clone_Sequence) | |
124 } else { | |
125 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence) | |
126 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence) | |
127 } | |
128 | |
129 scatterplot_data_columns = c("Patient", "Sample", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge") | |
130 #scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns]) | |
131 scatterplot_data = patient1[NULL,scatterplot_data_columns] | |
132 #scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),] | |
133 #scatterplot_data$type = factor(x=oneSample, levels=c(oneSample, twoSample, "In Both")) | |
134 scatterplot.data.type.factor = c(oneSample, twoSample, paste(c(oneSample, twoSample), "In Both")) | |
135 #scatterplot_data$type = factor(x=NULL, levels=scatterplot.data.type.factor) | |
136 scatterplot_data$type = character(0) | |
137 scatterplot_data$link = numeric(0) | |
138 scatterplot_data$on = character(0) | |
139 | |
140 #patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge") #merge alles 'fuzzy' | |
141 patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge")[NULL,] #blegh | |
142 | |
143 cs.exact.matches = patient1[patient1$Clone_Sequence %in% patient2$Clone_Sequence,]$Clone_Sequence | |
144 | |
145 start.time = proc.time() | |
146 merge.list = c() | |
147 | |
148 if(patient %in% names(patient.merge.list)){ | |
149 patientMerge = patient.merge.list[[patient]] | |
150 merge.list[["second"]] = patient.merge.list.second[[patient]] | |
151 scatterplot_data = scatter_locus_data_list[[patient]] | |
152 cat(paste("<td>", nrow(patient1), " in ", oneSample, " and ", nrow(patient2), " in ", twoSample, ", ", nrow(patientMerge), " in both (fetched from cache)</td></tr>", sep=""), file=logfile, append=T) | |
153 | |
154 print(names(patient.merge.list)) | |
155 } else { | |
156 #fuzzy matching here... | |
157 #merge.list = patientMerge$merge | |
158 | |
159 #patient1.fuzzy = patient1[!(patient1$merge %in% merge.list),] | |
160 #patient2.fuzzy = patient2[!(patient2$merge %in% merge.list),] | |
161 | |
162 patient1.fuzzy = patient1 | |
163 patient2.fuzzy = patient2 | |
164 | |
165 #patient1.fuzzy$merge = paste(patient1.fuzzy$V_Segment_Major_Gene, patient1.fuzzy$J_Segment_Major_Gene, patient1.fuzzy$CDR3_Sense_Sequence) | |
166 #patient2.fuzzy$merge = paste(patient2.fuzzy$V_Segment_Major_Gene, patient2.fuzzy$J_Segment_Major_Gene, patient2.fuzzy$CDR3_Sense_Sequence) | |
167 | |
168 #patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J, patient1.fuzzy$CDR3_Sense_Sequence) | |
169 #patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J, patient2.fuzzy$CDR3_Sense_Sequence) | |
170 | |
171 patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J) | |
172 patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J) | |
173 | |
174 #merge.freq.table = data.frame(table(c(patient1.fuzzy[!duplicated(patient1.fuzzy$merge),"merge"], patient2.fuzzy[!duplicated(patient2.fuzzy$merge),"merge"]))) #also remove? | |
175 #merge.freq.table.gt.1 = merge.freq.table[merge.freq.table$Freq > 1,] | |
176 | |
177 #patient1.fuzzy = patient1.fuzzy[patient1.fuzzy$merge %in% merge.freq.table.gt.1$Var1,] | |
178 #patient2.fuzzy = patient2.fuzzy[patient2.fuzzy$merge %in% merge.freq.table.gt.1$Var1,] | |
179 | |
180 patient.fuzzy = rbind(patient1.fuzzy, patient2.fuzzy) | |
181 patient.fuzzy = patient.fuzzy[order(nchar(patient.fuzzy$Clone_Sequence)),] | |
182 | |
183 merge.list = list() | |
184 | |
185 merge.list[["second"]] = vector() | |
186 | |
187 link.count = 1 | |
188 | |
189 while(nrow(patient.fuzzy) > 1){ | |
190 first.merge = patient.fuzzy[1,"merge"] | |
191 first.clone.sequence = patient.fuzzy[1,"Clone_Sequence"] | |
192 first.sample = patient.fuzzy[1,"Sample"] | |
193 merge.filter = first.merge == patient.fuzzy$merge | |
194 | |
195 #length.filter = nchar(patient.fuzzy$Clone_Sequence) - nchar(first.clone.sequence) <= 9 | |
196 | |
197 first.sample.filter = first.sample == patient.fuzzy$Sample | |
198 second.sample.filter = first.sample != patient.fuzzy$Sample | |
199 | |
200 #first match same sample, sum to a single row, same for other sample | |
201 #then merge rows like 'normal' | |
202 | |
203 sequence.filter = grepl(paste("^", first.clone.sequence, sep=""), patient.fuzzy$Clone_Sequence) | |
204 | |
205 | |
206 | |
207 #match.filter = merge.filter & grepl(first.clone.sequence, patient.fuzzy$Clone_Sequence) & length.filter & sample.filter | |
208 first.match.filter = merge.filter & sequence.filter & first.sample.filter | |
209 second.match.filter = merge.filter & sequence.filter & second.sample.filter | |
210 | |
211 first.rows = patient.fuzzy[first.match.filter,] | |
212 second.rows = patient.fuzzy[second.match.filter,] | |
213 | |
214 first.rows.v = table(first.rows$V_Segment_Major_Gene) | |
215 first.rows.v = names(first.rows.v[which.max(first.rows.v)]) | |
216 first.rows.j = table(first.rows$J_Segment_Major_Gene) | |
217 first.rows.j = names(first.rows.j[which.max(first.rows.j)]) | |
218 | |
219 first.sum = data.frame(merge = first.clone.sequence, | |
220 Patient = patient, | |
221 Receptor = first.rows[1,"Receptor"], | |
222 Sample = first.rows[1,"Sample"], | |
223 Cell_Count = first.rows[1,"Cell_Count"], | |
224 Clone_Molecule_Count_From_Spikes = sum(first.rows$Clone_Molecule_Count_From_Spikes), | |
225 Log10_Frequency = log10(sum(first.rows$Frequency)), | |
226 Total_Read_Count = sum(first.rows$Total_Read_Count), | |
227 dsPerM = sum(first.rows$dsPerM), | |
228 J_Segment_Major_Gene = first.rows.j, | |
229 V_Segment_Major_Gene = first.rows.v, | |
230 Clone_Sequence = first.clone.sequence, | |
231 CDR3_Sense_Sequence = first.rows[1,"CDR3_Sense_Sequence"], | |
232 Related_to_leukemia_clone = F, | |
233 Frequency = sum(first.rows$Frequency), | |
234 locus_V = first.rows[1,"locus_V"], | |
235 locus_J = first.rows[1,"locus_J"], | |
236 min_cell_count = first.rows[1,"min_cell_count"], | |
237 normalized_read_count = sum(first.rows$normalized_read_count), | |
238 paste = first.rows[1,"paste"], | |
239 min_cell_paste = first.rows[1,"min_cell_paste"]) | |
240 | |
241 if(nrow(second.rows) > 0){ | |
242 second.rows.v = table(second.rows$V_Segment_Major_Gene) | |
243 second.rows.v = names(second.rows.v[which.max(second.rows.v)]) | |
244 second.rows.j = table(second.rows$J_Segment_Major_Gene) | |
245 second.rows.j = names(second.rows.j[which.max(second.rows.j)]) | |
246 | |
247 second.sum = data.frame(merge = first.clone.sequence, | |
248 Patient = patient, | |
249 Receptor = second.rows[1,"Receptor"], | |
250 Sample = second.rows[1,"Sample"], | |
251 Cell_Count = second.rows[1,"Cell_Count"], | |
252 Clone_Molecule_Count_From_Spikes = sum(second.rows$Clone_Molecule_Count_From_Spikes), | |
253 Log10_Frequency = log10(sum(second.rows$Frequency)), | |
254 Total_Read_Count = sum(second.rows$Total_Read_Count), | |
255 dsPerM = sum(second.rows$dsPerM), | |
256 J_Segment_Major_Gene = second.rows.j, | |
257 V_Segment_Major_Gene = second.rows.v, | |
258 Clone_Sequence = first.clone.sequence, | |
259 CDR3_Sense_Sequence = second.rows[1,"CDR3_Sense_Sequence"], | |
260 Related_to_leukemia_clone = F, | |
261 Frequency = sum(second.rows$Frequency), | |
262 locus_V = second.rows[1,"locus_V"], | |
263 locus_J = second.rows[1,"locus_J"], | |
264 min_cell_count = second.rows[1,"min_cell_count"], | |
265 normalized_read_count = sum(second.rows$normalized_read_count), | |
266 paste = second.rows[1,"paste"], | |
267 min_cell_paste = second.rows[1,"min_cell_paste"]) | |
268 | |
269 patientMerge = rbind(patientMerge, merge(first.sum, second.sum, by="merge")) | |
270 patient.fuzzy = patient.fuzzy[!(first.match.filter | second.match.filter),] | |
271 | |
272 hidden.clone.sequences = c(first.rows[-1,"Clone_Sequence"], second.rows[second.rows$Clone_Sequence != first.clone.sequence,"Clone_Sequence"]) | |
273 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences) | |
274 | |
275 tmp.rows = rbind(first.rows, second.rows) | |
276 tmp.rows = tmp.rows[order(nchar(tmp.rows$Clone_Sequence)),] | |
277 | |
278 | |
279 #add to the scatterplot data | |
280 scatterplot.row = first.sum[,scatterplot_data_columns] | |
281 scatterplot.row$type = paste(first.sum[,"Sample"], "In Both") | |
282 scatterplot.row$link = link.count | |
283 scatterplot.row$on = onShort | |
284 | |
285 scatterplot_data = rbind(scatterplot_data, scatterplot.row) | |
286 | |
287 scatterplot.row = second.sum[,scatterplot_data_columns] | |
288 scatterplot.row$type = paste(second.sum[,"Sample"], "In Both") | |
289 scatterplot.row$link = link.count | |
290 scatterplot.row$on = onShort | |
291 | |
292 scatterplot_data = rbind(scatterplot_data, scatterplot.row) | |
293 | |
294 #write some information about the match to a log file | |
295 if (nrow(first.rows) > 1 | nrow(second.rows) > 1) { | |
296 cat(paste("<tr><td>", patient, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="multiple_matches.html", append=T) | |
297 } else { | |
298 second.clone.sequence = second.rows[1,"Clone_Sequence"] | |
299 if(nchar(first.clone.sequence) != nchar(second.clone.sequence)){ | |
300 cat(paste("<tr bgcolor='#DDD'><td>", patient, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="single_matches.html", append=T) | |
301 } else { | |
302 #cat(paste("<tr><td>", patient, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="single_matches.html", append=T) | |
303 } | |
304 } | |
305 | |
306 } else if(nrow(first.rows) > 1) { | |
307 if(patient1[1,"Sample"] == first.sample){ | |
308 patient1 = patient1[!(patient1$Clone_Sequence %in% first.rows$Clone_Sequence),] | |
309 patient1 = rbind(patient1, first.sum) | |
310 } else { | |
311 patient2 = patient2[!(patient2$Clone_Sequence %in% first.rows$Clone_Sequence),] | |
312 patient2 = rbind(patient2, first.sum) | |
313 } | |
314 | |
315 hidden.clone.sequences = c(first.rows[-1,"Clone_Sequence"]) | |
316 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences) | |
317 | |
318 patient.fuzzy = patient.fuzzy[-first.match.filter,] | |
319 | |
320 #add to the scatterplot data | |
321 scatterplot.row = first.sum[,scatterplot_data_columns] | |
322 scatterplot.row$type = first.sum[,"Sample"] | |
323 scatterplot.row$link = link.count | |
324 scatterplot.row$on = onShort | |
325 | |
326 scatterplot_data = rbind(scatterplot_data, scatterplot.row) | |
327 | |
328 cat(paste("<tr bgcolor='#DDF'><td>", patient, " row ", 1:nrow(first.rows), "</td><td>", first.rows$Sample, ":</td><td>", first.rows$Clone_Sequence, "</td><td>", first.rows$normalized_read_count, "</td></tr>", sep=""), file="single_matches.html", append=T) | |
329 } else { | |
330 patient.fuzzy = patient.fuzzy[-1,] | |
331 | |
332 #add to the scatterplot data | |
333 scatterplot.row = first.sum[,scatterplot_data_columns] | |
334 scatterplot.row$type = first.sum[,"Sample"] | |
335 scatterplot.row$link = link.count | |
336 scatterplot.row$on = onShort | |
337 | |
338 scatterplot_data = rbind(scatterplot_data, scatterplot.row) | |
339 } | |
340 link.count = link.count + 1 | |
341 } | |
342 patient.merge.list[[patient]] <<- patientMerge | |
343 patient.merge.list.second[[patient]] <<- merge.list[["second"]] | |
344 | |
345 sample.order = data.frame(type = c(oneSample, twoSample, paste(c(oneSample, twoSample), "In Both")),type.order = 1:4) | |
346 scatterplot_data = merge(scatterplot_data, sample.order, by="type") | |
347 | |
348 scatter_locus_data_list[[patient]] <<- scatterplot_data | |
349 cat(paste("<td>", nrow(patient1), " in ", oneSample, " and ", nrow(patient2), " in ", twoSample, ", ", nrow(patientMerge), " in both (finding both took ", (proc.time() - start.time)[[3]], "s)</td></tr>", sep=""), file=logfile, append=T) | |
350 } | |
351 | |
352 patient1 = patient1[!(patient1$Clone_Sequence %in% patient.merge.list.second[[patient]]),] | |
353 patient2 = patient2[!(patient2$Clone_Sequence %in% patient.merge.list.second[[patient]]),] | |
354 | |
355 | |
356 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony]) | |
357 #patientMerge$thresholdValue = pmin(patientMerge[,onx], patientMerge[,ony]) | |
358 res1 = vector() | |
359 res2 = vector() | |
360 resBoth = vector() | |
361 read1Count = vector() | |
362 read2Count = vector() | |
363 locussum1 = vector() | |
364 locussum2 = vector() | |
365 | |
366 #for(iter in 1){ | |
367 for(iter in 1:length(product[,1])){ | |
368 threshhold = product[iter,threshholdIndex] | |
369 V_Segment = paste(".*", as.character(product[iter,V_SegmentIndex]), ".*", sep="") | |
370 J_Segment = paste(".*", as.character(product[iter,J_SegmentIndex]), ".*", sep="") | |
371 #both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,onx] > threshhold & patientMerge[,ony] > threshhold) #both higher than threshold | |
372 both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold) #highest of both is higher than threshold | |
373 one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$merge %in% patientMerge[both,]$merge)) | |
374 two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$merge %in% patientMerge[both,]$merge)) | |
375 read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count)) | |
376 read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count)) | |
377 res1 = append(res1, sum(one)) | |
378 res2 = append(res2, sum(two)) | |
379 resBoth = append(resBoth, sum(both)) | |
380 locussum1 = append(locussum1, sum(patient1[(grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene)),]$normalized_read_count)) | |
381 locussum2 = append(locussum2, sum(patient2[(grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene)),]$normalized_read_count)) | |
382 #threshhold = 0 | |
383 if(threshhold != 0){ | |
384 if(sum(one) > 0){ | |
385 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")] | |
386 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone Sequence", "Related_to_leukemia_clone") | |
387 filenameOne = paste(oneSample, "_", product[iter, titleIndex], "_", threshhold, sep="") | |
388 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) | |
389 } | |
390 if(sum(two) > 0){ | |
391 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")] | |
392 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone Sequence", "Related_to_leukemia_clone") | |
393 filenameTwo = paste(twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="") | |
394 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) | |
395 } | |
396 } else { | |
397 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),] | |
398 if(nrow(scatterplot_locus_data) > 0){ | |
399 scatterplot_locus_data$Rearrangement = product[iter, titleIndex] | |
400 } | |
401 | |
402 | |
403 | |
404 p = NULL | |
405 print(paste("nrow scatterplot_locus_data", nrow(scatterplot_locus_data))) | |
406 if(nrow(scatterplot_locus_data) != 0){ | |
407 if(on == "normalized_read_count"){ | |
408 write.table(scatterplot_locus_data, file=paste(oneSample, twoSample, product[iter, titleIndex], "scatterplot_locus_data.txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) | |
409 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count)))) | |
410 p = ggplot(scatterplot_locus_data, aes(factor(reorder(type, type.order)), normalized_read_count, group=link)) + geom_line() + scale_y_log10(breaks=scales,labels=scales, limits=c(1,1e6)) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE) | |
411 } else { | |
412 p = ggplot(scatterplot_locus_data, aes(factor(reorder(type, type.order)), Frequency, group=link)) + geom_line() + scale_y_log10(limits=c(0.0001,100), breaks=c(0.0001, 0.001, 0.01, 0.1, 1, 10, 100), labels=c("0.0001", "0.001", "0.01", "0.1", "1", "10", "100")) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE) | |
413 } | |
414 p = p + geom_point(aes(colour=type), position="dodge") | |
415 p = p + xlab("In one or both samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex])) | |
416 } else { | |
417 p = ggplot(NULL, aes(x=c("In one", "In Both"),y=0)) + geom_blank(NULL) + xlab("In one or both of the samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex])) | |
418 } | |
419 png(paste(patient1[1,patientIndex], "_", patient1[1,sampleIndex], "_", patient2[1,sampleIndex], "_", onShort, "_", product[iter, titleIndex],"_scatter.png", sep="")) | |
420 print(p) | |
421 dev.off() | |
422 } | |
423 if(sum(both) > 0){ | |
424 dfBoth = patientMerge[both,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")] | |
425 colnames(dfBoth) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample)) | |
426 filenameBoth = paste(oneSample, "_", twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="") | |
427 write.table(dfBoth, file=paste(filenameBoth, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) | |
428 } | |
429 } | |
430 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "Both"=resBoth, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "Sum"=res1 + res2 + resBoth, "percentage" = round((resBoth/(res1 + res2 + resBoth)) * 100, digits=2), "Locus_sum1"=locussum1, "Locus_sum2"=locussum2) | |
431 if(sum(is.na(patientResult$percentage)) > 0){ | |
432 patientResult[is.na(patientResult$percentage),]$percentage = 0 | |
433 } | |
434 colnames(patientResult)[6] = oneSample | |
435 colnames(patientResult)[8] = twoSample | |
436 colnamesBak = colnames(patientResult) | |
437 colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", paste("Number of sequences ", patient, "_Both", sep=""), paste("Number of sequences", oneSample, sep=""), paste("Normalized Read Count", oneSample), paste("Number of sequences", twoSample, sep=""), paste("Normalized Read Count", twoSample), paste("Sum number of sequences", patient), paste("Percentage of sequences ", patient, "_Both", sep=""), paste("Locus Sum", oneSample), paste("Locus Sum", twoSample)) | |
438 write.table(patientResult, file=paste(patient, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) | |
439 colnames(patientResult) = colnamesBak | |
440 | |
441 patientResult$Locus = factor(patientResult$Locus, Titles) | |
442 patientResult$cut_off_value = factor(patientResult$cut_off_value, paste(">", interval, sep="")) | |
443 | |
444 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "Both")]) | |
445 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=Both), stat='identity', position="dodge", fill="#79c36a") | |
446 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) | |
447 plt = plt + geom_text(aes(ymax=max(Both), x=cut_off_value,y=Both,label=Both), angle=90, hjust=0) | |
448 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in both") | |
449 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines")) | |
450 png(paste(patient, "_", onShort, ".png", sep=""), width=1920, height=1080) | |
451 print(plt) | |
452 dev.off() | |
453 #(t,r,b,l) | |
454 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "percentage")]) | |
455 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=percentage), stat='identity', position="dodge", fill="#79c36a") | |
456 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) | |
457 plt = plt + geom_text(aes(ymax=max(percentage), x=cut_off_value,y=percentage,label=percentage), angle=90, hjust=0) | |
458 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("% clones in both left and right") | |
459 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines")) | |
460 png(paste(patient, "_percent_", onShort, ".png", sep=""), width=1920, height=1080) | |
461 print(plt) | |
462 dev.off() | |
463 | |
464 patientResult = melt(patientResult[,c('Locus','cut_off_value', oneSample, twoSample)] ,id.vars=1:2) | |
465 patientResult$relativeValue = patientResult$value * 10 | |
466 patientResult[patientResult$relativeValue == 0,]$relativeValue = 1 | |
467 plt = ggplot(patientResult) | |
468 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=relativeValue, fill=variable), stat='identity', position="dodge") | |
469 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) | |
470 plt = plt + scale_y_continuous(trans="log", breaks=10^c(0:10), labels=c(0, 10^c(0:9))) | |
471 plt = plt + geom_text(data=patientResult[patientResult$variable == oneSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=-0.2) | |
472 plt = plt + geom_text(data=patientResult[patientResult$variable == twoSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=0.8) | |
473 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle(paste("Number of clones in only ", oneSample, " and only ", twoSample, sep="")) | |
474 png(paste(patient, "_", onShort, "_both.png", sep=""), width=1920, height=1080) | |
475 print(plt) | |
476 dev.off() | |
477 } | |
478 | |
479 cat("<tr><td>Starting Frequency analysis</td></tr>", file=logfile, append=T) | |
480 | |
481 interval = intervalFreq | |
482 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval)) | |
483 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval))) | |
484 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T) | |
485 | |
486 cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T) | |
487 | |
488 interval = intervalReads | |
489 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval)) | |
490 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval))) | |
491 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="normalized_read_count") | |
492 | |
493 if(nrow(single_patients) > 0){ | |
494 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count)))) | |
495 p = ggplot(single_patients, aes(Rearrangement, normalized_read_count)) + scale_y_log10(breaks=scales,labels=as.character(scales)) + expand_limits(y=c(0,1000000)) | |
496 p = p + geom_point(aes(colour=type), position="jitter") | |
497 p = p + xlab("In one or both samples") + ylab("Reads") | |
498 p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the reads of the patients with a single sample") | |
499 png("singles_reads_scatterplot.png", width=640 * length(unique(single_patients$Patient)) + 100, height=1080) | |
500 print(p) | |
501 dev.off() | |
502 | |
503 #p = ggplot(single_patients, aes(Rearrangement, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100)) | |
504 p = ggplot(single_patients, aes(Rearrangement, Frequency)) + scale_y_log10(limits=c(0.0001,100), breaks=c(0.0001, 0.001, 0.01, 0.1, 1, 10, 100), labels=c("0.0001", "0.001", "0.01", "0.1", "1", "10", "100")) + expand_limits(y=c(0,100)) | |
505 p = p + geom_point(aes(colour=type), position="jitter") | |
506 p = p + xlab("In one or both samples") + ylab("Frequency") | |
507 p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the frequency of the patients with a single sample") | |
508 png("singles_freq_scatterplot.png", width=640 * length(unique(single_patients$Patient)) + 100, height=1080) | |
509 print(p) | |
510 dev.off() | |
511 } else { | |
512 empty <- data.frame() | |
513 p = ggplot(empty) + geom_point() + xlim(0, 10) + ylim(0, 100) + xlab("In one or both samples") + ylab("Frequency") + ggtitle("Scatterplot of the frequency of the patients with a single sample") | |
514 | |
515 png("singles_reads_scatterplot.png", width=400, height=300) | |
516 print(p) | |
517 dev.off() | |
518 | |
519 png("singles_freq_scatterplot.png", width=400, height=300) | |
520 print(p) | |
521 dev.off() | |
522 } | |
523 | |
524 patient.merge.list = list() #cache the 'both' table, 2x speedup for more memory... | |
525 patient.merge.list.second = list() | |
526 | |
527 tripletAnalysis <- function(patient1, label1, patient2, label2, patient3, label3, product, interval, on, appendTriplets= FALSE){ | |
528 onShort = "reads" | |
529 if(on == "Frequency"){ | |
530 onShort = "freq" | |
531 } | |
532 onx = paste(on, ".x", sep="") | |
533 ony = paste(on, ".y", sep="") | |
534 onz = paste(on, ".z", sep="") | |
535 type="triplet" | |
536 | |
537 threshholdIndex = which(colnames(product) == "interval") | |
538 V_SegmentIndex = which(colnames(product) == "V_Segments") | |
539 J_SegmentIndex = which(colnames(product) == "J_Segments") | |
540 titleIndex = which(colnames(product) == "Titles") | |
541 sampleIndex = which(colnames(patient1) == "Sample") | |
542 patientIndex = which(colnames(patient1) == "Patient") | |
543 oneSample = paste(patient1[1,sampleIndex], sep="") | |
544 twoSample = paste(patient2[1,sampleIndex], sep="") | |
545 threeSample = paste(patient3[1,sampleIndex], sep="") | |
546 | |
547 if(mergeOn == "Clone_Sequence"){ | |
548 patient1$merge = paste(patient1$Clone_Sequence) | |
549 patient2$merge = paste(patient2$Clone_Sequence) | |
550 patient3$merge = paste(patient3$Clone_Sequence) | |
551 | |
552 } else { | |
553 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence) | |
554 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence) | |
555 patient3$merge = paste(patient3$V_Segment_Major_Gene, patient3$J_Segment_Major_Gene, patient3$CDR3_Sense_Sequence) | |
556 } | |
557 | |
558 #patientMerge = merge(patient1, patient2, by="merge")[NULL,] | |
559 patient1.fuzzy = patient1 | |
560 patient2.fuzzy = patient2 | |
561 patient3.fuzzy = patient3 | |
562 | |
563 cat(paste("<tr><td>", label1, "</td>", sep=""), file=logfile, append=T) | |
564 | |
565 patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J) | |
566 patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J) | |
567 patient3.fuzzy$merge = paste(patient3.fuzzy$locus_V, patient3.fuzzy$locus_J) | |
568 | |
569 patient.fuzzy = rbind(patient1.fuzzy ,patient2.fuzzy, patient3.fuzzy) | |
570 patient.fuzzy = patient.fuzzy[order(nchar(patient.fuzzy$Clone_Sequence)),] | |
571 | |
572 other.sample.list = list() | |
573 other.sample.list[[oneSample]] = c(twoSample, threeSample) | |
574 other.sample.list[[twoSample]] = c(oneSample, threeSample) | |
575 other.sample.list[[threeSample]] = c(oneSample, twoSample) | |
576 | |
577 patientMerge = merge(patient1, patient2, by="merge") | |
578 patientMerge = merge(patientMerge, patient3, by="merge") | |
579 colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge)))] = paste(colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge), perl=T))], ".z", sep="") | |
580 #patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz]) | |
581 patientMerge = patientMerge[NULL,] | |
582 | |
583 duo.merge.list = list() | |
584 | |
585 patientMerge12 = merge(patient1, patient2, by="merge") | |
586 #patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony]) | |
587 patientMerge12 = patientMerge12[NULL,] | |
588 duo.merge.list[[paste(oneSample, twoSample)]] = patientMerge12 | |
589 duo.merge.list[[paste(twoSample, oneSample)]] = patientMerge12 | |
590 | |
591 patientMerge13 = merge(patient1, patient3, by="merge") | |
592 #patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony]) | |
593 patientMerge13 = patientMerge13[NULL,] | |
594 duo.merge.list[[paste(oneSample, threeSample)]] = patientMerge13 | |
595 duo.merge.list[[paste(threeSample, oneSample)]] = patientMerge13 | |
596 | |
597 patientMerge23 = merge(patient2, patient3, by="merge") | |
598 #patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony]) | |
599 patientMerge23 = patientMerge23[NULL,] | |
600 duo.merge.list[[paste(twoSample, threeSample)]] = patientMerge23 | |
601 duo.merge.list[[paste(threeSample, twoSample)]] = patientMerge23 | |
602 | |
603 merge.list = list() | |
604 merge.list[["second"]] = vector() | |
605 | |
606 start.time = proc.time() | |
607 if(paste(label1, "123") %in% names(patient.merge.list)){ | |
608 patientMerge = patient.merge.list[[paste(label1, "123")]] | |
609 patientMerge12 = patient.merge.list[[paste(label1, "12")]] | |
610 patientMerge13 = patient.merge.list[[paste(label1, "13")]] | |
611 patientMerge23 = patient.merge.list[[paste(label1, "23")]] | |
612 | |
613 merge.list[["second"]] = patient.merge.list.second[[label1]] | |
614 | |
615 cat(paste("<td>", nrow(patient1), " in ", label1, " and ", nrow(patient2), " in ", label2, nrow(patient3), " in ", label3, ", ", nrow(patientMerge), " in both (fetched from cache)</td></tr>", sep=""), file=logfile, append=T) | |
616 } else { | |
617 while(nrow(patient.fuzzy) > 0){ | |
618 first.merge = patient.fuzzy[1,"merge"] | |
619 first.clone.sequence = patient.fuzzy[1,"Clone_Sequence"] | |
620 first.sample = patient.fuzzy[1,"Sample"] | |
621 | |
622 merge.filter = first.merge == patient.fuzzy$merge | |
623 | |
624 second.sample = other.sample.list[[first.sample]][1] | |
625 third.sample = other.sample.list[[first.sample]][2] | |
626 | |
627 sample.filter.1 = first.sample == patient.fuzzy$Sample | |
628 sample.filter.2 = second.sample == patient.fuzzy$Sample | |
629 sample.filter.3 = third.sample == patient.fuzzy$Sample | |
630 | |
631 sequence.filter = grepl(paste("^", first.clone.sequence, sep=""), patient.fuzzy$Clone_Sequence) | |
632 | |
633 match.filter.1 = sample.filter.1 & sequence.filter & merge.filter | |
634 match.filter.2 = sample.filter.2 & sequence.filter & merge.filter | |
635 match.filter.3 = sample.filter.3 & sequence.filter & merge.filter | |
636 | |
637 matches.in.1 = any(match.filter.1) | |
638 matches.in.2 = any(match.filter.2) | |
639 matches.in.3 = any(match.filter.3) | |
640 | |
641 | |
642 | |
643 rows.1 = patient.fuzzy[match.filter.1,] | |
644 | |
645 sum.1 = data.frame(merge = first.clone.sequence, | |
646 Patient = label1, | |
647 Receptor = rows.1[1,"Receptor"], | |
648 Sample = rows.1[1,"Sample"], | |
649 Cell_Count = rows.1[1,"Cell_Count"], | |
650 Clone_Molecule_Count_From_Spikes = sum(rows.1$Clone_Molecule_Count_From_Spikes), | |
651 Log10_Frequency = log10(sum(rows.1$Frequency)), | |
652 Total_Read_Count = sum(rows.1$Total_Read_Count), | |
653 dsPerM = sum(rows.1$dsPerM), | |
654 J_Segment_Major_Gene = rows.1[1,"J_Segment_Major_Gene"], | |
655 V_Segment_Major_Gene = rows.1[1,"V_Segment_Major_Gene"], | |
656 Clone_Sequence = first.clone.sequence, | |
657 CDR3_Sense_Sequence = rows.1[1,"CDR3_Sense_Sequence"], | |
658 Related_to_leukemia_clone = F, | |
659 Frequency = sum(rows.1$Frequency), | |
660 locus_V = rows.1[1,"locus_V"], | |
661 locus_J = rows.1[1,"locus_J"], | |
662 uniqueID = rows.1[1,"uniqueID"], | |
663 normalized_read_count = sum(rows.1$normalized_read_count)) | |
664 sum.2 = sum.1[NULL,] | |
665 rows.2 = patient.fuzzy[match.filter.2,] | |
666 if(matches.in.2){ | |
667 sum.2 = data.frame(merge = first.clone.sequence, | |
668 Patient = label1, | |
669 Receptor = rows.2[1,"Receptor"], | |
670 Sample = rows.2[1,"Sample"], | |
671 Cell_Count = rows.2[1,"Cell_Count"], | |
672 Clone_Molecule_Count_From_Spikes = sum(rows.2$Clone_Molecule_Count_From_Spikes), | |
673 Log10_Frequency = log10(sum(rows.2$Frequency)), | |
674 Total_Read_Count = sum(rows.2$Total_Read_Count), | |
675 dsPerM = sum(rows.2$dsPerM), | |
676 J_Segment_Major_Gene = rows.2[1,"J_Segment_Major_Gene"], | |
677 V_Segment_Major_Gene = rows.2[1,"V_Segment_Major_Gene"], | |
678 Clone_Sequence = first.clone.sequence, | |
679 CDR3_Sense_Sequence = rows.2[1,"CDR3_Sense_Sequence"], | |
680 Related_to_leukemia_clone = F, | |
681 Frequency = sum(rows.2$Frequency), | |
682 locus_V = rows.2[1,"locus_V"], | |
683 locus_J = rows.2[1,"locus_J"], | |
684 uniqueID = rows.2[1,"uniqueID"], | |
685 normalized_read_count = sum(rows.2$normalized_read_count)) | |
686 } | |
687 | |
688 sum.3 = sum.1[NULL,] | |
689 rows.3 = patient.fuzzy[match.filter.3,] | |
690 if(matches.in.3){ | |
691 sum.3 = data.frame(merge = first.clone.sequence, | |
692 Patient = label1, | |
693 Receptor = rows.3[1,"Receptor"], | |
694 Sample = rows.3[1,"Sample"], | |
695 Cell_Count = rows.3[1,"Cell_Count"], | |
696 Clone_Molecule_Count_From_Spikes = sum(rows.3$Clone_Molecule_Count_From_Spikes), | |
697 Log10_Frequency = log10(sum(rows.3$Frequency)), | |
698 Total_Read_Count = sum(rows.3$Total_Read_Count), | |
699 dsPerM = sum(rows.3$dsPerM), | |
700 J_Segment_Major_Gene = rows.3[1,"J_Segment_Major_Gene"], | |
701 V_Segment_Major_Gene = rows.3[1,"V_Segment_Major_Gene"], | |
702 Clone_Sequence = first.clone.sequence, | |
703 CDR3_Sense_Sequence = rows.3[1,"CDR3_Sense_Sequence"], | |
704 Related_to_leukemia_clone = F, | |
705 Frequency = sum(rows.3$Frequency), | |
706 locus_V = rows.3[1,"locus_V"], | |
707 locus_J = rows.3[1,"locus_J"], | |
708 uniqueID = rows.3[1,"uniqueID"], | |
709 normalized_read_count = sum(rows.3$normalized_read_count)) | |
710 } | |
711 | |
712 if(matches.in.2 & matches.in.3){ | |
713 merge.123 = merge(sum.1, sum.2, by="merge") | |
714 merge.123 = merge(merge.123, sum.3, by="merge") | |
715 colnames(merge.123)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(merge.123)))] = paste(colnames(merge.123)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(merge.123), perl=T))], ".z", sep="") | |
716 #merge.123$thresholdValue = pmax(merge.123[,onx], merge.123[,ony], merge.123[,onz]) | |
717 | |
718 patientMerge = rbind(patientMerge, merge.123) | |
719 patient.fuzzy = patient.fuzzy[!(match.filter.1 | match.filter.2 | match.filter.3),] | |
720 | |
721 hidden.clone.sequences = c(rows.1[-1,"Clone_Sequence"], rows.2[rows.2$Clone_Sequence != first.clone.sequence,"Clone_Sequence"], rows.3[rows.3$Clone_Sequence != first.clone.sequence,"Clone_Sequence"]) | |
722 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences) | |
723 | |
724 } else if (matches.in.2) { | |
725 #other.sample1 = other.sample.list[[first.sample]][1] | |
726 #other.sample2 = other.sample.list[[first.sample]][2] | |
727 | |
728 second.sample = sum.2[,"Sample"] | |
729 | |
730 current.merge.list = duo.merge.list[[paste(first.sample, second.sample)]] | |
731 | |
732 merge.12 = merge(sum.1, sum.2, by="merge") | |
733 | |
734 current.merge.list = rbind(current.merge.list, merge.12) | |
735 duo.merge.list[[paste(first.sample, second.sample)]] = current.merge.list | |
736 | |
737 patient.fuzzy = patient.fuzzy[!(match.filter.1 | match.filter.2),] | |
738 | |
739 hidden.clone.sequences = c(rows.1[-1,"Clone_Sequence"], rows.2[rows.2$Clone_Sequence != first.clone.sequence,"Clone_Sequence"]) | |
740 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences) | |
741 | |
742 } else if (matches.in.3) { | |
743 | |
744 #other.sample1 = other.sample.list[[first.sample]][1] | |
745 #other.sample2 = other.sample.list[[first.sample]][2] | |
746 | |
747 second.sample = sum.3[,"Sample"] | |
748 | |
749 current.merge.list = duo.merge.list[[paste(first.sample, second.sample)]] | |
750 | |
751 merge.13 = merge(sum.1, sum.3, by="merge") | |
752 | |
753 current.merge.list = rbind(current.merge.list, merge.13) | |
754 duo.merge.list[[paste(first.sample, second.sample)]] = current.merge.list | |
755 | |
756 patient.fuzzy = patient.fuzzy[!(match.filter.1 | match.filter.3),] | |
757 | |
758 hidden.clone.sequences = c(rows.1[-1,"Clone_Sequence"], rows.3[rows.3$Clone_Sequence != first.clone.sequence,"Clone_Sequence"]) | |
759 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences) | |
760 | |
761 } else if(nrow(rows.1) > 1){ | |
762 patient1 = patient1[!(patient1$Clone_Sequence %in% rows.1$Clone_Sequence),] | |
763 print(names(patient1)[names(patient1) %in% sum.1]) | |
764 print(names(patient1)[!(names(patient1) %in% sum.1)]) | |
765 print(names(patient1)) | |
766 print(names(sum.1)) | |
767 print(summary(sum.1)) | |
768 print(summary(patient1)) | |
769 print(dim(sum.1)) | |
770 print(dim(patient1)) | |
771 print(head(sum.1[,names(patient1)])) | |
772 patient1 = rbind(patient1, sum.1[,names(patient1)]) | |
773 patient.fuzzy = patient.fuzzy[-match.filter.1,] | |
774 } else { | |
775 patient.fuzzy = patient.fuzzy[-1,] | |
776 } | |
777 | |
778 tmp.rows = rbind(rows.1, rows.2, rows.3) | |
779 tmp.rows = tmp.rows[order(nchar(tmp.rows$Clone_Sequence)),] | |
780 | |
781 if (sum(match.filter.1) > 1 | sum(match.filter.2) > 1 | sum(match.filter.1) > 1) { | |
782 cat(paste("<tr><td>", label1, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="multiple_matches.html", append=T) | |
783 } else { | |
784 } | |
785 | |
786 } | |
787 patient.merge.list[[paste(label1, "123")]] = patientMerge | |
788 | |
789 patientMerge12 = duo.merge.list[[paste(oneSample, twoSample)]] | |
790 patientMerge13 = duo.merge.list[[paste(oneSample, threeSample)]] | |
791 patientMerge23 = duo.merge.list[[paste(twoSample, threeSample)]] | |
792 | |
793 patient.merge.list[[paste(label1, "12")]] = patientMerge12 | |
794 patient.merge.list[[paste(label1, "13")]] = patientMerge13 | |
795 patient.merge.list[[paste(label1, "23")]] = patientMerge23 | |
796 | |
797 patient.merge.list.second[[label1]] = merge.list[["second"]] | |
798 } | |
799 cat(paste("<td>", nrow(patient1), " in ", label1, " and ", nrow(patient2), " in ", label2, nrow(patient3), " in ", label3, ", ", nrow(patientMerge), " in both (finding both took ", (proc.time() - start.time)[[3]], "s)</td></tr>", sep=""), file=logfile, append=T) | |
800 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz]) | |
801 patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony]) | |
802 patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony]) | |
803 patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony]) | |
804 | |
805 #patientMerge$thresholdValue = pmin(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz]) | |
806 #patientMerge12$thresholdValue = pmin(patientMerge12[,onx], patientMerge12[,ony]) | |
807 #patientMerge13$thresholdValue = pmin(patientMerge13[,onx], patientMerge13[,ony]) | |
808 #patientMerge23$thresholdValue = pmin(patientMerge23[,onx], patientMerge23[,ony]) | |
809 | |
810 patient1 = patient1[!(patient1$Clone_Sequence %in% merge.list[["second"]]),] | |
811 patient2 = patient2[!(patient2$Clone_Sequence %in% merge.list[["second"]]),] | |
812 patient3 = patient3[!(patient3$Clone_Sequence %in% merge.list[["second"]]),] | |
813 | |
814 if(F){ | |
815 patientMerge = merge(patient1, patient2, by="merge") | |
816 patientMerge = merge(patientMerge, patient3, by="merge") | |
817 colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge)))] = paste(colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge), perl=T))], ".z", sep="") | |
818 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz]) | |
819 patientMerge12 = merge(patient1, patient2, by="merge") | |
820 patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony]) | |
821 patientMerge13 = merge(patient1, patient3, by="merge") | |
822 patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony]) | |
823 patientMerge23 = merge(patient2, patient3, by="merge") | |
824 patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony]) | |
825 } | |
826 | |
827 scatterplot_data_columns = c("Clone_Sequence", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge") | |
828 scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns], patient3[,scatterplot_data_columns]) | |
829 scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),] | |
830 scatterplot_data$type = factor(x="In one", levels=c("In one", "In two", "In three", "In multiple")) | |
831 | |
832 res1 = vector() | |
833 res2 = vector() | |
834 res3 = vector() | |
835 res12 = vector() | |
836 res13 = vector() | |
837 res23 = vector() | |
838 resAll = vector() | |
839 read1Count = vector() | |
840 read2Count = vector() | |
841 read3Count = vector() | |
842 | |
843 if(appendTriplets){ | |
844 cat(paste(label1, label2, label3, sep="\t"), file="triplets.txt", append=T, sep="", fill=3) | |
845 } | |
846 for(iter in 1:length(product[,1])){ | |
847 threshhold = product[iter,threshholdIndex] | |
848 V_Segment = paste(".*", as.character(product[iter,V_SegmentIndex]), ".*", sep="") | |
849 J_Segment = paste(".*", as.character(product[iter,J_SegmentIndex]), ".*", sep="") | |
850 #all = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,onx] > threshhold & patientMerge[,ony] > threshhold & patientMerge[,onz] > threshhold) | |
851 all = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold) | |
852 | |
853 one_two = (grepl(V_Segment, patientMerge12$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge12$J_Segment_Major_Gene.x) & patientMerge12$thresholdValue > threshhold & !(patientMerge12$merge %in% patientMerge[all,]$merge)) | |
854 one_three = (grepl(V_Segment, patientMerge13$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge13$J_Segment_Major_Gene.x) & patientMerge13$thresholdValue > threshhold & !(patientMerge13$merge %in% patientMerge[all,]$merge)) | |
855 two_three = (grepl(V_Segment, patientMerge23$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge23$J_Segment_Major_Gene.x) & patientMerge23$thresholdValue > threshhold & !(patientMerge23$merge %in% patientMerge[all,]$merge)) | |
856 | |
857 one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$merge %in% patientMerge[all,]$merge) & !(patient1$merge %in% patientMerge12[one_two,]$merge) & !(patient1$merge %in% patientMerge13[one_three,]$merge)) | |
858 two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$merge %in% patientMerge[all,]$merge) & !(patient2$merge %in% patientMerge12[one_two,]$merge) & !(patient2$merge %in% patientMerge23[two_three,]$merge)) | |
859 three = (grepl(V_Segment, patient3$V_Segment_Major_Gene) & grepl(J_Segment, patient3$J_Segment_Major_Gene) & patient3[,on] > threshhold & !(patient3$merge %in% patientMerge[all,]$merge) & !(patient3$merge %in% patientMerge13[one_three,]$merge) & !(patient3$merge %in% patientMerge23[two_three,]$merge)) | |
860 | |
861 read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.x)) | |
862 read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.y)) | |
863 read3Count = append(read3Count, sum(patient3[three,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.z)) | |
864 res1 = append(res1, sum(one)) | |
865 res2 = append(res2, sum(two)) | |
866 res3 = append(res3, sum(three)) | |
867 resAll = append(resAll, sum(all)) | |
868 res12 = append(res12, sum(one_two)) | |
869 res13 = append(res13, sum(one_three)) | |
870 res23 = append(res23, sum(two_three)) | |
871 #threshhold = 0 | |
872 if(threshhold != 0){ | |
873 if(sum(one) > 0){ | |
874 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")] | |
875 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone") | |
876 filenameOne = paste(label1, "_", product[iter, titleIndex], "_", threshhold, sep="") | |
877 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) | |
878 } | |
879 if(sum(two) > 0){ | |
880 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")] | |
881 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone") | |
882 filenameTwo = paste(label2, "_", product[iter, titleIndex], "_", threshhold, sep="") | |
883 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) | |
884 } | |
885 if(sum(three) > 0){ | |
886 dfThree = patient3[three,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")] | |
887 colnames(dfThree) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone") | |
888 filenameThree = paste(label3, "_", product[iter, titleIndex], "_", threshhold, sep="") | |
889 write.table(dfThree, file=paste(filenameThree, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) | |
890 } | |
891 if(sum(one_two) > 0){ | |
892 dfOne_two = patientMerge12[one_two,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")] | |
893 colnames(dfOne_two) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample)) | |
894 filenameOne_two = paste(label1, "_", label2, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="") | |
895 write.table(dfOne_two, file=paste(filenameOne_two, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) | |
896 } | |
897 if(sum(one_three) > 0){ | |
898 dfOne_three = patientMerge13[one_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")] | |
899 colnames(dfOne_three) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample)) | |
900 filenameOne_three = paste(label1, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="") | |
901 write.table(dfOne_three, file=paste(filenameOne_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) | |
902 } | |
903 if(sum(two_three) > 0){ | |
904 dfTwo_three = patientMerge23[two_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")] | |
905 colnames(dfTwo_three) = c(paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample)) | |
906 filenameTwo_three = paste(label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="") | |
907 write.table(dfTwo_three, file=paste(filenameTwo_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) | |
908 } | |
909 } else { #scatterplot data | |
910 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),] | |
911 scatterplot_locus_data = scatterplot_locus_data[!(scatterplot_locus_data$merge %in% merge.list[["second"]]),] | |
912 in_two = (scatterplot_locus_data$merge %in% patientMerge12[one_two,]$merge) | (scatterplot_locus_data$merge %in% patientMerge13[one_three,]$merge) | (scatterplot_locus_data$merge %in% patientMerge23[two_three,]$merge) | |
913 if(sum(in_two) > 0){ | |
914 scatterplot_locus_data[in_two,]$type = "In two" | |
915 } | |
916 in_three = (scatterplot_locus_data$merge %in% patientMerge[all,]$merge) | |
917 if(sum(in_three)> 0){ | |
918 scatterplot_locus_data[in_three,]$type = "In three" | |
919 } | |
920 not_in_one = scatterplot_locus_data$type != "In one" | |
921 if(sum(not_in_one) > 0){ | |
922 #scatterplot_locus_data[not_in_one,]$type = "In multiple" | |
923 } | |
924 p = NULL | |
925 if(nrow(scatterplot_locus_data) != 0){ | |
926 if(on == "normalized_read_count"){ | |
927 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count)))) | |
928 p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales, limits=c(1, 1e6)) | |
929 } else { | |
930 p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_log10(limits=c(0.0001,100), breaks=c(0.0001, 0.001, 0.01, 0.1, 1, 10, 100), labels=c("0.0001", "0.001", "0.01", "0.1", "1", "10", "100")) + expand_limits(y=c(0,100)) | |
931 #p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100)) | |
932 } | |
933 p = p + geom_point(aes(colour=type), position="jitter") | |
934 p = p + xlab("In one or in multiple samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex])) | |
935 } else { | |
936 p = ggplot(NULL, aes(x=c("In one", "In multiple"),y=0)) + geom_blank(NULL) + xlab("In two or in three of the samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex])) | |
937 } | |
938 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_", product[iter, titleIndex],"_scatter.png", sep="")) | |
939 print(p) | |
940 dev.off() | |
941 } | |
942 if(sum(all) > 0){ | |
943 dfAll = patientMerge[all,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y", "V_Segment_Major_Gene.z", "J_Segment_Major_Gene.z", "normalized_read_count.z", "Frequency.z", "Related_to_leukemia_clone.z")] | |
944 colnames(dfAll) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample), paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample)) | |
945 filenameAll = paste(label1, "_", label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, sep="") | |
946 write.table(dfAll, file=paste(filenameAll, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) | |
947 } | |
948 } | |
949 #patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "tmp3"=res3, "read_count3"=round(read3Count)) | |
950 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "tmp2"=res2, "tmp3"=res3, "tmp12"=res12, "tmp13"=res13, "tmp23"=res23) | |
951 colnames(patientResult)[6] = oneSample | |
952 colnames(patientResult)[7] = twoSample | |
953 colnames(patientResult)[8] = threeSample | |
954 colnames(patientResult)[9] = paste(oneSample, twoSample, sep="_") | |
955 colnames(patientResult)[10] = paste(oneSample, twoSample, sep="_") | |
956 colnames(patientResult)[11] = paste(oneSample, twoSample, sep="_") | |
957 | |
958 colnamesBak = colnames(patientResult) | |
959 colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", "Number of sequences All", paste("Number of sequences", oneSample), paste("Number of sequences", twoSample), paste("Number of sequences", threeSample), paste("Number of sequences", oneSample, twoSample), paste("Number of sequences", oneSample, threeSample), paste("Number of sequences", twoSample, threeSample)) | |
960 write.table(patientResult, file=paste(label1, "_", label2, "_", label3, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T) | |
961 colnames(patientResult) = colnamesBak | |
962 | |
963 patientResult$Locus = factor(patientResult$Locus, Titles) | |
964 patientResult$cut_off_value = factor(patientResult$cut_off_value, paste(">", interval, sep="")) | |
965 | |
966 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "All")]) | |
967 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=All), stat='identity', position="dodge", fill="#79c36a") | |
968 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) | |
969 plt = plt + geom_text(aes(ymax=max(All), x=cut_off_value,y=All,label=All), angle=90, hjust=0) | |
970 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in All") | |
971 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines")) | |
972 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_total_all.png", sep=""), width=1920, height=1080) | |
973 print(plt) | |
974 dev.off() | |
975 | |
976 fontSize = 4 | |
977 | |
978 bak = patientResult | |
979 patientResult = melt(patientResult[,c('Locus','cut_off_value', oneSample, twoSample, threeSample)] ,id.vars=1:2) | |
980 patientResult$relativeValue = patientResult$value * 10 | |
981 patientResult[patientResult$relativeValue == 0,]$relativeValue = 1 | |
982 plt = ggplot(patientResult) | |
983 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=relativeValue, fill=variable), stat='identity', position="dodge") | |
984 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) | |
985 plt = plt + scale_y_continuous(trans="log", breaks=10^c(0:10), labels=c(0, 10^c(0:9))) | |
986 plt = plt + geom_text(data=patientResult[patientResult$variable == oneSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=-0.7, size=fontSize) | |
987 plt = plt + geom_text(data=patientResult[patientResult$variable == twoSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=0.4, size=fontSize) | |
988 plt = plt + geom_text(data=patientResult[patientResult$variable == threeSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=1.5, size=fontSize) | |
989 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in only one sample") | |
990 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_indiv_all.png", sep=""), width=1920, height=1080) | |
991 print(plt) | |
992 dev.off() | |
993 } | |
994 | |
995 if(nrow(triplets) != 0){ | |
996 | |
997 cat("<tr><td>Starting triplet analysis</td></tr>", file=logfile, append=T) | |
998 | |
999 triplets$uniqueID = "ID" | |
1000 | |
1001 triplets[grepl("16278_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left" | |
1002 triplets[grepl("26402_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left" | |
1003 triplets[grepl("26759_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left" | |
1004 | |
1005 triplets[grepl("16278_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right" | |
1006 triplets[grepl("26402_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right" | |
1007 triplets[grepl("26759_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right" | |
1008 | |
1009 triplets[grepl("14696", triplets$Patient),]$uniqueID = "14696" | |
1010 | |
1011 cat("<tr><td>Normalizing to lowest cell count within locus</td></tr>", file=logfile, append=T) | |
1012 | |
1013 triplets$locus_V = substring(triplets$V_Segment_Major_Gene, 0, 4) | |
1014 triplets$locus_J = substring(triplets$J_Segment_Major_Gene, 0, 4) | |
1015 min_cell_count = data.frame(data.table(triplets)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("uniqueID", "locus_V", "locus_J")]) | |
1016 | |
1017 triplets$min_cell_paste = paste(triplets$uniqueID, triplets$locus_V, triplets$locus_J) | |
1018 min_cell_count$min_cell_paste = paste(min_cell_count$uniqueID, min_cell_count$locus_V, min_cell_count$locus_J) | |
1019 | |
1020 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")] | |
1021 | |
1022 triplets = merge(triplets, min_cell_count, by="min_cell_paste") | |
1023 | |
1024 triplets$normalized_read_count = round(triplets$Clone_Molecule_Count_From_Spikes / triplets$Cell_Count * triplets$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2 | |
1025 | |
1026 triplets = triplets[triplets$normalized_read_count >= min_cells,] | |
1027 | |
1028 column_drops = c("min_cell_count", "min_cell_paste") | |
1029 | |
1030 triplets = triplets[,!(colnames(triplets) %in% column_drops)] | |
1031 | |
1032 cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T) | |
1033 | |
1034 interval = intervalReads | |
1035 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval)) | |
1036 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval))) | |
1037 | |
1038 one = triplets[triplets$Sample == "14696_reg_BM",] | |
1039 two = triplets[triplets$Sample == "24536_reg_BM",] | |
1040 three = triplets[triplets$Sample == "24062_reg_BM",] | |
1041 tripletAnalysis(one, "14696_1_Trio", two, "14696_2_Trio", three, "14696_3_Trio", product=product, interval=interval, on="normalized_read_count", T) | |
1042 | |
1043 one = triplets[triplets$Sample == "16278_Left",] | |
1044 two = triplets[triplets$Sample == "26402_Left",] | |
1045 three = triplets[triplets$Sample == "26759_Left",] | |
1046 tripletAnalysis(one, "16278_Left_Trio", two, "26402_Left_Trio", three, "26759_Left_Trio", product=product, interval=interval, on="normalized_read_count", T) | |
1047 | |
1048 one = triplets[triplets$Sample == "16278_Right",] | |
1049 two = triplets[triplets$Sample == "26402_Right",] | |
1050 three = triplets[triplets$Sample == "26759_Right",] | |
1051 tripletAnalysis(one, "16278_Right_Trio", two, "26402_Right_Trio", three, "26759_Right_Trio", product=product, interval=interval, on="normalized_read_count", T) | |
1052 | |
1053 cat("<tr><td>Starting Frequency analysis</td></tr>", file=logfile, append=T) | |
1054 | |
1055 interval = intervalFreq | |
1056 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval)) | |
1057 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval))) | |
1058 | |
1059 one = triplets[triplets$Sample == "14696_reg_BM",] | |
1060 two = triplets[triplets$Sample == "24536_reg_BM",] | |
1061 three = triplets[triplets$Sample == "24062_reg_BM",] | |
1062 tripletAnalysis(one, "14696_1_Trio", two, "14696_2_Trio", three, "14696_3_Trio", product=product, interval=interval, on="Frequency", F) | |
1063 | |
1064 one = triplets[triplets$Sample == "16278_Left",] | |
1065 two = triplets[triplets$Sample == "26402_Left",] | |
1066 three = triplets[triplets$Sample == "26759_Left",] | |
1067 tripletAnalysis(one, "16278_Left_Trio", two, "26402_Left_Trio", three, "26759_Left_Trio", product=product, interval=interval, on="Frequency", F) | |
1068 | |
1069 one = triplets[triplets$Sample == "16278_Right",] | |
1070 two = triplets[triplets$Sample == "26402_Right",] | |
1071 three = triplets[triplets$Sample == "26759_Right",] | |
1072 tripletAnalysis(one, "16278_Right_Trio", two, "26402_Right_Trio", three, "26759_Right_Trio", product=product, interval=interval, on="Frequency", F) | |
1073 } else { | |
1074 cat("", file="triplets.txt") | |
1075 } | |
1076 cat("</table></html>", file=logfile, append=T) |