comparison shm_csr.r @ 1:faae21ba5c63 draft

Uploaded
author davidvanzessen
date Tue, 25 Oct 2016 07:28:43 -0400
parents c33d93683a09
children e85fec274cde
comparison
equal deleted inserted replaced
0:c33d93683a09 1:faae21ba5c63
5 args <- commandArgs(trailingOnly = TRUE) 5 args <- commandArgs(trailingOnly = TRUE)
6 6
7 input = args[1] 7 input = args[1]
8 genes = unlist(strsplit(args[2], ",")) 8 genes = unlist(strsplit(args[2], ","))
9 outputdir = args[3] 9 outputdir = args[3]
10 include_fr1 = ifelse(args[4] == "yes", T, F) 10 empty.region.filter = args[4]
11 setwd(outputdir) 11 setwd(outputdir)
12 12
13 dat = read.table(input, header=T, sep="\t", fill=T, stringsAsFactors=F) 13 dat = read.table(input, header=T, sep="\t", fill=T, stringsAsFactors=F)
14 14
15 if(length(dat$Sequence.ID) == 0){ 15 if(length(dat$Sequence.ID) == 0){
21 row.names(transitionTable) = c("A", "C", "G", "T") 21 row.names(transitionTable) = c("A", "C", "G", "T")
22 transitionTable["A","A"] = NA 22 transitionTable["A","A"] = NA
23 transitionTable["C","C"] = NA 23 transitionTable["C","C"] = NA
24 transitionTable["G","G"] = NA 24 transitionTable["G","G"] = NA
25 transitionTable["T","T"] = NA 25 transitionTable["T","T"] = NA
26
26 write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA) 27 write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA)
27 cat("0", file="n.txt") 28 cat("0", file="n.txt")
28 stop("No data") 29 stop("No data")
29 } 30 }
30 31
31 cleanup_columns = c("FR1.IMGT.c.a", 32 cleanup_columns = c("FR1.IMGT.c.a",
32 "FR2.IMGT.g.t", 33 "FR2.IMGT.g.t",
33 "CDR1.IMGT.Nb.of.nucleotides", 34 "CDR1.IMGT.Nb.of.nucleotides",
34 "CDR2.IMGT.t.a", 35 "CDR2.IMGT.t.a",
35 "FR1.IMGT.c.g", 36 "FR1.IMGT.c.g",
36 "CDR1.IMGT.c.t", 37 "CDR1.IMGT.c.t",
37 "FR2.IMGT.a.c", 38 "FR2.IMGT.a.c",
38 "FR2.IMGT.Nb.of.mutations", 39 "FR2.IMGT.Nb.of.mutations",
39 "FR2.IMGT.g.c", 40 "FR2.IMGT.g.c",
40 "FR2.IMGT.a.g", 41 "FR2.IMGT.a.g",
41 "FR3.IMGT.t.a", 42 "FR3.IMGT.t.a",
42 "FR3.IMGT.t.c", 43 "FR3.IMGT.t.c",
43 "FR2.IMGT.g.a", 44 "FR2.IMGT.g.a",
44 "FR3.IMGT.c.g", 45 "FR3.IMGT.c.g",
45 "FR1.IMGT.Nb.of.mutations", 46 "FR1.IMGT.Nb.of.mutations",
46 "CDR1.IMGT.g.a", 47 "CDR1.IMGT.g.a",
47 "CDR1.IMGT.t.g", 48 "CDR1.IMGT.t.g",
48 "CDR1.IMGT.g.c", 49 "CDR1.IMGT.g.c",
49 "CDR2.IMGT.Nb.of.nucleotides", 50 "CDR2.IMGT.Nb.of.nucleotides",
50 "FR2.IMGT.a.t", 51 "FR2.IMGT.a.t",
51 "CDR1.IMGT.Nb.of.mutations", 52 "CDR1.IMGT.Nb.of.mutations",
52 "CDR3.IMGT.Nb.of.nucleotides", 53 "CDR3.IMGT.Nb.of.nucleotides",
53 "CDR1.IMGT.a.g", 54 "CDR1.IMGT.a.g",
54 "FR3.IMGT.a.c", 55 "FR3.IMGT.a.c",
55 "FR1.IMGT.g.a", 56 "FR1.IMGT.g.a",
56 "FR3.IMGT.a.g", 57 "FR3.IMGT.a.g",
57 "FR1.IMGT.a.t", 58 "FR1.IMGT.a.t",
58 "CDR2.IMGT.a.g", 59 "CDR2.IMGT.a.g",
59 "CDR2.IMGT.Nb.of.mutations", 60 "CDR2.IMGT.Nb.of.mutations",
60 "CDR2.IMGT.g.t", 61 "CDR2.IMGT.g.t",
61 "CDR2.IMGT.a.c", 62 "CDR2.IMGT.a.c",
62 "CDR1.IMGT.t.c", 63 "CDR1.IMGT.t.c",
63 "FR3.IMGT.g.c", 64 "FR3.IMGT.g.c",
64 "FR1.IMGT.g.t", 65 "FR1.IMGT.g.t",
65 "FR3.IMGT.g.t", 66 "FR3.IMGT.g.t",
66 "CDR1.IMGT.a.t", 67 "CDR1.IMGT.a.t",
67 "FR1.IMGT.a.g", 68 "FR1.IMGT.a.g",
68 "FR3.IMGT.a.t", 69 "FR3.IMGT.a.t",
69 "FR3.IMGT.Nb.of.nucleotides", 70 "FR3.IMGT.Nb.of.nucleotides",
70 "FR2.IMGT.t.c", 71 "FR2.IMGT.t.c",
71 "CDR2.IMGT.g.a", 72 "CDR2.IMGT.g.a",
72 "FR2.IMGT.t.a", 73 "FR2.IMGT.t.a",
73 "CDR1.IMGT.t.a", 74 "CDR1.IMGT.t.a",
74 "FR2.IMGT.t.g", 75 "FR2.IMGT.t.g",
75 "FR3.IMGT.t.g", 76 "FR3.IMGT.t.g",
76 "FR2.IMGT.Nb.of.nucleotides", 77 "FR2.IMGT.Nb.of.nucleotides",
77 "FR1.IMGT.t.a", 78 "FR1.IMGT.t.a",
78 "FR1.IMGT.t.g", 79 "FR1.IMGT.t.g",
79 "FR3.IMGT.c.t", 80 "FR3.IMGT.c.t",
80 "FR1.IMGT.t.c", 81 "FR1.IMGT.t.c",
81 "CDR2.IMGT.a.t", 82 "CDR2.IMGT.a.t",
82 "FR2.IMGT.c.t", 83 "FR2.IMGT.c.t",
83 "CDR1.IMGT.g.t", 84 "CDR1.IMGT.g.t",
84 "CDR2.IMGT.t.g", 85 "CDR2.IMGT.t.g",
85 "FR1.IMGT.Nb.of.nucleotides", 86 "FR1.IMGT.Nb.of.nucleotides",
86 "CDR1.IMGT.c.g", 87 "CDR1.IMGT.c.g",
87 "CDR2.IMGT.t.c", 88 "CDR2.IMGT.t.c",
88 "FR3.IMGT.g.a", 89 "FR3.IMGT.g.a",
89 "CDR1.IMGT.a.c", 90 "CDR1.IMGT.a.c",
90 "FR2.IMGT.c.a", 91 "FR2.IMGT.c.a",
91 "FR3.IMGT.Nb.of.mutations", 92 "FR3.IMGT.Nb.of.mutations",
92 "FR2.IMGT.c.g", 93 "FR2.IMGT.c.g",
93 "CDR2.IMGT.g.c", 94 "CDR2.IMGT.g.c",
94 "FR1.IMGT.g.c", 95 "FR1.IMGT.g.c",
95 "CDR2.IMGT.c.t", 96 "CDR2.IMGT.c.t",
96 "FR3.IMGT.c.a", 97 "FR3.IMGT.c.a",
97 "CDR1.IMGT.c.a", 98 "CDR1.IMGT.c.a",
98 "CDR2.IMGT.c.g", 99 "CDR2.IMGT.c.g",
99 "CDR2.IMGT.c.a", 100 "CDR2.IMGT.c.a",
100 "FR1.IMGT.c.t", 101 "FR1.IMGT.c.t",
101 "FR1.IMGT.Nb.of.silent.mutations", 102 "FR1.IMGT.Nb.of.silent.mutations",
102 "FR2.IMGT.Nb.of.silent.mutations", 103 "FR2.IMGT.Nb.of.silent.mutations",
103 "FR3.IMGT.Nb.of.silent.mutations", 104 "FR3.IMGT.Nb.of.silent.mutations",
104 "FR1.IMGT.Nb.of.nonsilent.mutations", 105 "FR1.IMGT.Nb.of.nonsilent.mutations",
105 "FR2.IMGT.Nb.of.nonsilent.mutations", 106 "FR2.IMGT.Nb.of.nonsilent.mutations",
106 "FR3.IMGT.Nb.of.nonsilent.mutations") 107 "FR3.IMGT.Nb.of.nonsilent.mutations")
107
108 108
109 print("Cleaning up columns") 109 print("Cleaning up columns")
110
110 for(col in cleanup_columns){ 111 for(col in cleanup_columns){
111 dat[,col] = gsub("\\(.*\\)", "", dat[,col]) 112 dat[,col] = gsub("\\(.*\\)", "", dat[,col])
112 #dat[dat[,col] == "",] = "0" 113 #dat[dat[,col] == "",] = "0"
113 dat[,col] = as.numeric(dat[,col]) 114 dat[,col] = as.numeric(dat[,col])
114 dat[is.na(dat[,col]),col] = 0 115 dat[is.na(dat[,col]),col] = 0
115 } 116 }
116 117
117 regions = c("FR1", "CDR1", "FR2", "CDR2", "FR3") 118 regions = c("FR1", "CDR1", "FR2", "CDR2", "FR3")
118 if(!include_fr1){ 119 if(empty.region.filter == "FR1") {
119 regions = c("CDR1", "FR2", "CDR2", "FR3") 120 regions = c("CDR1", "FR2", "CDR2", "FR3")
121 } else if (empty.region.filter == "CDR1") {
122 regions = c("FR2", "CDR2", "FR3", "CDR3")
123 } else if (empty.region.filter == "FR2") {
124 regions = c("CDR2", "FR3", "CDR3")
120 } 125 }
121 126
122 sum_by_row = function(x, columns) { sum(as.numeric(x[columns]), na.rm=T) } 127 sum_by_row = function(x, columns) { sum(as.numeric(x[columns]), na.rm=T) }
123 128
124 print("aggregating data into new columns") 129 print("aggregating data into new columns")
134 dat$transitionMutations = apply(dat, FUN=sum_by_row, 1, columns=transitionMutations_columns) 139 dat$transitionMutations = apply(dat, FUN=sum_by_row, 1, columns=transitionMutations_columns)
135 140
136 transversionMutations_columns = paste(rep(regions, each=8), c(".IMGT.a.c",".IMGT.c.a",".IMGT.a.t",".IMGT.t.a",".IMGT.g.c",".IMGT.c.g",".IMGT.g.t",".IMGT.t.g"), sep="") 141 transversionMutations_columns = paste(rep(regions, each=8), c(".IMGT.a.c",".IMGT.c.a",".IMGT.a.t",".IMGT.t.a",".IMGT.g.c",".IMGT.c.g",".IMGT.g.t",".IMGT.t.g"), sep="")
137 dat$transversionMutations = apply(dat, FUN=sum_by_row, 1, columns=transversionMutations_columns) 142 dat$transversionMutations = apply(dat, FUN=sum_by_row, 1, columns=transversionMutations_columns)
138 143
139
140 transitionMutationsAtGC_columns = paste(rep(regions, each=2), c(".IMGT.g.a",".IMGT.c.t"), sep="") 144 transitionMutationsAtGC_columns = paste(rep(regions, each=2), c(".IMGT.g.a",".IMGT.c.t"), sep="")
141 dat$transitionMutationsAtGC = apply(dat, FUN=sum_by_row, 1, columns=transitionMutationsAtGC_columns) 145 dat$transitionMutationsAtGC = apply(dat, FUN=sum_by_row, 1, columns=transitionMutationsAtGC_columns)
142
143 146
144 totalMutationsAtGC_columns = paste(rep(regions, each=6), c(".IMGT.c.g",".IMGT.c.t",".IMGT.c.a",".IMGT.g.c",".IMGT.g.a",".IMGT.g.t"), sep="") 147 totalMutationsAtGC_columns = paste(rep(regions, each=6), c(".IMGT.c.g",".IMGT.c.t",".IMGT.c.a",".IMGT.g.c",".IMGT.g.a",".IMGT.g.t"), sep="")
145 #totalMutationsAtGC_columns = paste(rep(regions, each=6), c(".IMGT.g.a",".IMGT.c.t",".IMGT.c.a",".IMGT.c.g",".IMGT.g.t"), sep="") 148 #totalMutationsAtGC_columns = paste(rep(regions, each=6), c(".IMGT.g.a",".IMGT.c.t",".IMGT.c.a",".IMGT.c.g",".IMGT.g.t"), sep="")
146 dat$totalMutationsAtGC = apply(dat, FUN=sum_by_row, 1, columns=totalMutationsAtGC_columns) 149 dat$totalMutationsAtGC = apply(dat, FUN=sum_by_row, 1, columns=totalMutationsAtGC_columns)
147 150
150 153
151 totalMutationsAtAT_columns = paste(rep(regions, each=6), c(".IMGT.a.g",".IMGT.a.c",".IMGT.a.t",".IMGT.t.g",".IMGT.t.c",".IMGT.t.a"), sep="") 154 totalMutationsAtAT_columns = paste(rep(regions, each=6), c(".IMGT.a.g",".IMGT.a.c",".IMGT.a.t",".IMGT.t.g",".IMGT.t.c",".IMGT.t.a"), sep="")
152 #totalMutationsAtAT_columns = paste(rep(regions, each=5), c(".IMGT.a.g",".IMGT.t.c",".IMGT.a.c",".IMGT.g.c",".IMGT.t.g"), sep="") 155 #totalMutationsAtAT_columns = paste(rep(regions, each=5), c(".IMGT.a.g",".IMGT.t.c",".IMGT.a.c",".IMGT.g.c",".IMGT.t.g"), sep="")
153 dat$totalMutationsAtAT = apply(dat, FUN=sum_by_row, 1, columns=totalMutationsAtAT_columns) 156 dat$totalMutationsAtAT = apply(dat, FUN=sum_by_row, 1, columns=totalMutationsAtAT_columns)
154 157
155
156 FRRegions = regions[grepl("FR", regions)] 158 FRRegions = regions[grepl("FR", regions)]
157 CDRRegions = regions[grepl("CDR", regions)] 159 CDRRegions = regions[grepl("CDR", regions)]
158 160
159 FR_silentMutations_columns = paste(FRRegions, ".IMGT.Nb.of.silent.mutations", sep="") 161 FR_silentMutations_columns = paste(FRRegions, ".IMGT.Nb.of.silent.mutations", sep="")
160 dat$silentMutationsFR = apply(dat, FUN=sum_by_row, 1, columns=FR_silentMutations_columns) 162 dat$silentMutationsFR = apply(dat, FUN=sum_by_row, 1, columns=FR_silentMutations_columns)
167 169
168 CDR_nonSilentMutations_columns = paste(CDRRegions, ".IMGT.Nb.of.nonsilent.mutations", sep="") 170 CDR_nonSilentMutations_columns = paste(CDRRegions, ".IMGT.Nb.of.nonsilent.mutations", sep="")
169 dat$nonSilentMutationsCDR = apply(dat, FUN=sum_by_row, 1, columns=CDR_nonSilentMutations_columns) 171 dat$nonSilentMutationsCDR = apply(dat, FUN=sum_by_row, 1, columns=CDR_nonSilentMutations_columns)
170 172
171 mutation.sum.columns = c("Sequence.ID", "VRegionMutations", "VRegionNucleotides", "transitionMutations", "transversionMutations", "transitionMutationsAtGC", "transitionMutationsAtAT", "silentMutationsFR", "nonSilentMutationsFR", "silentMutationsCDR", "nonSilentMutationsCDR") 173 mutation.sum.columns = c("Sequence.ID", "VRegionMutations", "VRegionNucleotides", "transitionMutations", "transversionMutations", "transitionMutationsAtGC", "transitionMutationsAtAT", "silentMutationsFR", "nonSilentMutationsFR", "silentMutationsCDR", "nonSilentMutationsCDR")
172
173 write.table(dat[,mutation.sum.columns], "mutation_by_id.txt", sep="\t",quote=F,row.names=F,col.names=T) 174 write.table(dat[,mutation.sum.columns], "mutation_by_id.txt", sep="\t",quote=F,row.names=F,col.names=T)
174 175
175 setwd(outputdir) 176 setwd(outputdir)
176 177
177 base.order = data.frame(base=c("A", "T", "C", "G"), order=1:4) 178 base.order = data.frame(base=c("A", "T", "C", "G"), order=1:4)
181 182
182 j = i - 1 183 j = i - 1
183 x = (j * 3) + 1 184 x = (j * 3) + 1
184 y = (j * 3) + 2 185 y = (j * 3) + 2
185 z = (j * 3) + 3 186 z = (j * 3) + 3
186 187
187 if(nrow(tmp) > 0){ 188 if(nrow(tmp) > 0){
188
189 if(fname == "sum"){ 189 if(fname == "sum"){
190 matrx[1,x] = round(f(tmp$VRegionMutations, na.rm=T), digits=1) 190 matrx[1,x] = round(f(tmp$VRegionMutations, na.rm=T), digits=1)
191 matrx[1,y] = round(f(tmp$VRegionNucleotides, na.rm=T), digits=1) 191 matrx[1,y] = round(f(tmp$VRegionNucleotides, na.rm=T), digits=1)
192 matrx[1,z] = round(f(matrx[1,x] / matrx[1,y]) * 100, digits=1) 192 matrx[1,z] = round(f(matrx[1,x] / matrx[1,y]) * 100, digits=1)
193 } else { 193 } else {
194 matrx[1,x] = round(f(tmp$VRegionMutations, na.rm=T), digits=1) 194 matrx[1,x] = round(f(tmp$VRegionMutations, na.rm=T), digits=1)
195 matrx[1,y] = round(f(tmp$VRegionNucleotides, na.rm=T), digits=1) 195 matrx[1,y] = round(f(tmp$VRegionNucleotides, na.rm=T), digits=1)
196 matrx[1,z] = round(f(tmp$VRegionMutations / tmp$VRegionNucleotides) * 100, digits=1) 196 matrx[1,z] = round(f(tmp$VRegionMutations / tmp$VRegionNucleotides) * 100, digits=1)
197 } 197 }
198 198
199 matrx[2,x] = round(f(tmp$transitionMutations, na.rm=T), digits=1) 199 matrx[2,x] = round(f(tmp$transitionMutations, na.rm=T), digits=1)
200 matrx[2,y] = round(f(tmp$VRegionMutations, na.rm=T), digits=1) 200 matrx[2,y] = round(f(tmp$VRegionMutations, na.rm=T), digits=1)
201 matrx[2,z] = round(matrx[2,x] / matrx[2,y] * 100, digits=1) 201 matrx[2,z] = round(matrx[2,x] / matrx[2,y] * 100, digits=1)
257 FR1 = paste("FR1.IMGT.", nt1, ".", nt2, sep="") 257 FR1 = paste("FR1.IMGT.", nt1, ".", nt2, sep="")
258 CDR1 = paste("CDR1.IMGT.", nt1, ".", nt2, sep="") 258 CDR1 = paste("CDR1.IMGT.", nt1, ".", nt2, sep="")
259 FR2 = paste("FR2.IMGT.", nt1, ".", nt2, sep="") 259 FR2 = paste("FR2.IMGT.", nt1, ".", nt2, sep="")
260 CDR2 = paste("CDR2.IMGT.", nt1, ".", nt2, sep="") 260 CDR2 = paste("CDR2.IMGT.", nt1, ".", nt2, sep="")
261 FR3 = paste("FR3.IMGT.", nt1, ".", nt2, sep="") 261 FR3 = paste("FR3.IMGT.", nt1, ".", nt2, sep="")
262 if(include_fr1){ 262 if (empty.region.filter == "leader"){
263 transitionTable[NT1,NT2] = sum(tmp[,c(FR1, CDR1, FR2, CDR2, FR3)]) 263 transitionTable[NT1,NT2] = sum(tmp[,c(FR1, CDR1, FR2, CDR2, FR3)])
264 } else { 264 } else if (empty.region.filter == "FR1") {
265 transitionTable[NT1,NT2] = sum(tmp[,c(CDR1, FR2, CDR2, FR3)]) 265 transitionTable[NT1,NT2] = sum(tmp[,c(CDR1, FR2, CDR2, FR3)])
266 } else if (empty.region.filter == "CDR1") {
267 transitionTable[NT1,NT2] = sum(tmp[,c(FR2, CDR2, FR3)])
268 } else if (empty.region.filter == "FR2") {
269 transitionTable[NT1,NT2] = sum(tmp[,c(CDR2, FR3)])
266 } 270 }
267 } 271 }
268 } 272 }
269 transition = transitionTable 273 transition = transitionTable
270 transition$id = names(transition) 274 transition$id = names(transition)
271 275
272 transition2 = melt(transition, id.vars="id") 276 transition2 = melt(transition, id.vars="id")
273 277
274 transition2 = merge(transition2, base.order, by.x="id", by.y="base") 278 transition2 = merge(transition2, base.order, by.x="id", by.y="base")
279
275 transition2 = merge(transition2, base.order, by.x="variable", by.y="base") 280 transition2 = merge(transition2, base.order, by.x="variable", by.y="base")
276 281
277 transition2[is.na(transition2$value),]$value = 0 282 transition2[is.na(transition2$value),]$value = 0
278 283
279 if(!all(transition2$value == 0)){ #having rows of data but a transition table filled with 0 is bad 284 if(any(transition2$value == 0)){ #having rows of data but a transition table filled with 0 is bad
280
281 print("Plotting stacked transition") 285 print("Plotting stacked transition")
282
283 png(filename=paste("transitions_stacked_", name, ".png", sep="")) 286 png(filename=paste("transitions_stacked_", name, ".png", sep=""))
284 p = ggplot(transition2, aes(factor(reorder(id, order.x)), y=value, fill=factor(reorder(variable, order.y)))) + geom_bar(position="fill", stat="identity", colour="black") #stacked bar 287 p = ggplot(transition2, aes(factor(reorder(id, order.x)), y=value, fill=factor(reorder(variable, order.y)))) + geom_bar(position="fill", stat="identity", colour="black") #stacked bar
285 p = p + xlab("From base") + ylab("To base") + ggtitle("Mutations frequency from base to base") + guides(fill=guide_legend(title=NULL)) 288 p = p + xlab("From base") + ylab("To base") + ggtitle("Mutations frequency from base to base") + guides(fill=guide_legend(title=NULL))
286 p = p + theme(panel.background = element_rect(fill = "white", colour="black")) + scale_fill_manual(values=c("A" = "blue4", "G" = "lightblue1", "C" = "olivedrab3", "T" = "olivedrab4")) 289 p = p + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=13, colour="black")) + scale_fill_manual(values=c("A" = "blue4", "G" = "lightblue1", "C" = "olivedrab3", "T" = "olivedrab4"))
287 #p = p + scale_colour_manual(values=c("A" = "black", "G" = "black", "C" = "black", "T" = "black")) 290 #p = p + scale_colour_manual(values=c("A" = "black", "G" = "black", "C" = "black", "T" = "black"))
288 print(p) 291 print(p)
289 dev.off() 292 dev.off()
290 293
291 print("Plotting heatmap transition") 294 print("Plotting heatmap transition")
292 295
293 png(filename=paste("transitions_heatmap_", name, ".png", sep="")) 296 png(filename=paste("transitions_heatmap_", name, ".png", sep=""))
294 p = ggplot(transition2, aes(factor(reorder(id, order.x)), factor(reorder(variable, order.y)))) + geom_tile(aes(fill = value)) + scale_fill_gradient(low="white", high="steelblue") #heatmap 297 p = ggplot(transition2, aes(factor(reorder(id, order.x)), factor(reorder(variable, order.y)))) + geom_tile(aes(fill = value)) + scale_fill_gradient(low="white", high="steelblue") #heatmap
295 p = p + xlab("From base") + ylab("To base") + ggtitle("Mutations frequency from base to base") + theme(panel.background = element_rect(fill = "white", colour="black")) 298 p = p + xlab("From base") + ylab("To base") + ggtitle("Mutations frequency from base to base") + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=13, colour="black"))
296 print(p) 299 print(p)
297 dev.off() 300 dev.off()
298 } else { 301 } else {
299 print("No data to plot") 302 print("No data to plot")
300 } 303 }
301 } 304 }
302 305
303 #print(paste("writing value file: ", name, "_", fname, "_value.txt" ,sep="")) 306 #print(paste("writing value file: ", name, "_", fname, "_value.txt" ,sep=""))
304
305 write.table(x=transitionTable, file=paste("transitions_", name ,"_", fname, ".txt", sep=""), sep=",",quote=F,row.names=T,col.names=NA) 307 write.table(x=transitionTable, file=paste("transitions_", name ,"_", fname, ".txt", sep=""), sep=",",quote=F,row.names=T,col.names=NA)
306 write.table(x=tmp[,c("Sequence.ID", "best_match", "chunk_hit_percentage", "nt_hit_percentage", "start_locations")], file=paste("matched_", name , "_", fname, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T) 308 write.table(x=tmp[,c("Sequence.ID", "best_match", "chunk_hit_percentage", "nt_hit_percentage", "start_locations")], file=paste("matched_", name , "_", fname, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T)
307
308 cat(matrx[1,x], file=paste(name, "_", fname, "_value.txt" ,sep="")) 309 cat(matrx[1,x], file=paste(name, "_", fname, "_value.txt" ,sep=""))
309 cat(nrow(tmp), file=paste(name, "_", fname, "_n.txt" ,sep="")) 310 cat(nrow(tmp), file=paste(name, "_", fname, "_n.txt" ,sep=""))
310
311 #print(paste(fname, name, nrow(tmp))) 311 #print(paste(fname, name, nrow(tmp)))
312
313 matrx 312 matrx
314 } 313 }
315
316 nts = c("a", "c", "g", "t") 314 nts = c("a", "c", "g", "t")
317 zeros=rep(0, 4) 315 zeros=rep(0, 4)
318
319 funcs = c(median, sum, mean) 316 funcs = c(median, sum, mean)
320 fnames = c("median", "sum", "mean") 317 fnames = c("median", "sum", "mean")
321 318
322 print("Creating result tables") 319 print("Creating result tables")
323 320
328 rows = 9 325 rows = 9
329 if(fname == "sum"){ 326 if(fname == "sum"){
330 rows = 11 327 rows = 11
331 } 328 }
332 matrx = matrix(data = 0, ncol=((length(genes) + 1) * 3),nrow=rows) 329 matrx = matrix(data = 0, ncol=((length(genes) + 1) * 3),nrow=rows)
333
334 for(i in 1:length(genes)){ 330 for(i in 1:length(genes)){
335 print(paste("Creating table for", fname, genes[i])) 331 print(paste("Creating table for", fname, genes[i]))
336 matrx = calculate_result(i, genes[i], dat, matrx, func, fname, genes[i]) 332 matrx = calculate_result(i, genes[i], dat, matrx, func, fname, genes[i])
337 } 333 }
338
339 matrx = calculate_result(i + 1, ".*", dat[!grepl("unmatched", dat$best_match),], matrx, func, fname, name="all") 334 matrx = calculate_result(i + 1, ".*", dat[!grepl("unmatched", dat$best_match),], matrx, func, fname, name="all")
340 335
341 result = data.frame(matrx) 336 result = data.frame(matrx)
342 if(fname == "sum"){ 337 if(fname == "sum"){
343 row.names(result) = c("Number of Mutations (%)", "Transitions (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)", "nt in FR", "nt in CDR") 338 row.names(result) = c("Number of Mutations (%)", "Transitions (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)", "nt in FR", "nt in CDR")
344 } else { 339 } else {
345 row.names(result) = c("Number of Mutations (%)", "Transitions (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)") 340 row.names(result) = c("Number of Mutations (%)", "Transitions (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)")
346 } 341 }
347
348 write.table(x=result, file=paste("mutations_", fname, ".txt", sep=""), sep=",",quote=F,row.names=T,col.names=F) 342 write.table(x=result, file=paste("mutations_", fname, ".txt", sep=""), sep=",",quote=F,row.names=T,col.names=F)
349 } 343 }
350 344
351 print("Adding median number of mutations to sum table") 345 print("Adding median number of mutations to sum table")
352
353 sum.table = read.table("mutations_sum.txt", sep=",", header=F) 346 sum.table = read.table("mutations_sum.txt", sep=",", header=F)
354 median.table = read.table("mutations_median.txt", sep=",", header=F) 347 median.table = read.table("mutations_median.txt", sep=",", header=F)
355 348
356 new.table = sum.table[1,] 349 new.table = sum.table[1,]
357 new.table[2,] = median.table[1,] 350 new.table[2,] = median.table[1,]
361 354
362 #sum.table = sum.table[c("Number of Mutations (%)", "Median of Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)", "nt in FR", "nt in CDR"),] 355 #sum.table = sum.table[c("Number of Mutations (%)", "Median of Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)", "nt in FR", "nt in CDR"),]
363 356
364 write.table(x=new.table, file="mutations_sum.txt", sep=",",quote=F,row.names=F,col.names=F) 357 write.table(x=new.table, file="mutations_sum.txt", sep=",",quote=F,row.names=F,col.names=F)
365 358
366
367 print("Plotting IGA piechart") 359 print("Plotting IGA piechart")
368 360
369 dat = dat[!grepl("^unmatched", dat$best_match),] 361 dat = dat[!grepl("^unmatched", dat$best_match),]
370 362
371 #blegh 363 #blegh
364
372 genesForPlot = dat[grepl("IGA", dat$best_match),]$best_match 365 genesForPlot = dat[grepl("IGA", dat$best_match),]$best_match
366
373 if(length(genesForPlot) > 0){ 367 if(length(genesForPlot) > 0){
374 genesForPlot = data.frame(table(genesForPlot)) 368 genesForPlot = data.frame(table(genesForPlot))
375 colnames(genesForPlot) = c("Gene","Freq") 369 colnames(genesForPlot) = c("Gene","Freq")
376 genesForPlot$label = paste(genesForPlot$Gene, "-", genesForPlot$Freq) 370 genesForPlot$label = paste(genesForPlot$Gene, "-", genesForPlot$Freq)
377 371
378 pc = ggplot(genesForPlot, aes(x = factor(1), y=Freq, fill=Gene)) 372 pc = ggplot(genesForPlot, aes(x = factor(1), y=Freq, fill=Gene))
379 pc = pc + geom_bar(width = 1, stat = "identity") + scale_fill_manual(labels=genesForPlot$label, values=c("IGA1" = "lightblue1", "IGA2" = "blue4")) 373 pc = pc + geom_bar(width = 1, stat = "identity") + scale_fill_manual(labels=genesForPlot$label, values=c("IGA1" = "lightblue1", "IGA2" = "blue4"))
380 pc = pc + coord_polar(theta="y") 374 pc = pc + coord_polar(theta="y") + scale_y_continuous(breaks=NULL)
381 pc = pc + theme(panel.background = element_rect(fill = "white", colour="black")) 375 pc = pc + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=13, colour="black"))
382 pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IGA subclasses", "( n =", sum(genesForPlot$Freq), ")")) 376 pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IGA subclasses", "( n =", sum(genesForPlot$Freq), ")"))
383 write.table(genesForPlot, "IGA.txt", sep="\t",quote=F,row.names=F,col.names=T) 377 write.table(genesForPlot, "IGA.txt", sep="\t",quote=F,row.names=F,col.names=T)
384 378
385 png(filename="IGA.png") 379 png(filename="IGA.png")
386 print(pc) 380 print(pc)
388 } 382 }
389 383
390 print("Plotting IGG piechart") 384 print("Plotting IGG piechart")
391 385
392 genesForPlot = dat[grepl("IGG", dat$best_match),]$best_match 386 genesForPlot = dat[grepl("IGG", dat$best_match),]$best_match
387
393 if(length(genesForPlot) > 0){ 388 if(length(genesForPlot) > 0){
394 genesForPlot = data.frame(table(genesForPlot)) 389 genesForPlot = data.frame(table(genesForPlot))
395 colnames(genesForPlot) = c("Gene","Freq") 390 colnames(genesForPlot) = c("Gene","Freq")
396 genesForPlot$label = paste(genesForPlot$Gene, "-", genesForPlot$Freq) 391 genesForPlot$label = paste(genesForPlot$Gene, "-", genesForPlot$Freq)
397 392
398 pc = ggplot(genesForPlot, aes(x = factor(1), y=Freq, fill=Gene)) 393 pc = ggplot(genesForPlot, aes(x = factor(1), y=Freq, fill=Gene))
399 pc = pc + geom_bar(width = 1, stat = "identity") + scale_fill_manual(labels=genesForPlot$label, values=c("IGG1" = "olivedrab3", "IGG2" = "red", "IGG3" = "gold", "IGG4" = "darkred")) 394 pc = pc + geom_bar(width = 1, stat = "identity") + scale_fill_manual(labels=genesForPlot$label, values=c("IGG1" = "olivedrab3", "IGG2" = "red", "IGG3" = "gold", "IGG4" = "darkred"))
400 pc = pc + coord_polar(theta="y") 395 pc = pc + coord_polar(theta="y") + scale_y_continuous(breaks=NULL)
401 pc = pc + theme(panel.background = element_rect(fill = "white", colour="black")) 396 pc = pc + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=13, colour="black"))
402 pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IGG subclasses", "( n =", sum(genesForPlot$Freq), ")")) 397 pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IGG subclasses", "( n =", sum(genesForPlot$Freq), ")"))
403 write.table(genesForPlot, "IGG.txt", sep="\t",quote=F,row.names=F,col.names=T) 398 write.table(genesForPlot, "IGG.txt", sep="\t",quote=F,row.names=F,col.names=T)
404 399
405 png(filename="IGG.png") 400 png(filename="IGG.png")
406 print(pc) 401 print(pc)
407 dev.off() 402 dev.off()
408 } 403 }
409 404
410
411 print("Plotting scatterplot") 405 print("Plotting scatterplot")
412 406
413 dat$percentage_mutations = round(dat$VRegionMutations / dat$VRegionNucleotides * 100, 2) 407 dat$percentage_mutations = round(dat$VRegionMutations / dat$VRegionNucleotides * 100, 2)
414 408 dat.clss = dat
415 p = ggplot(dat, aes(best_match, percentage_mutations)) 409
410 dat.clss$best_match = substr(dat.clss$best_match, 0, 3)
411
412 dat.clss = rbind(dat, dat.clss)
413
414 p = ggplot(dat.clss, aes(best_match, percentage_mutations))
416 p = p + geom_point(aes(colour=best_match), position="jitter") + geom_boxplot(aes(middle=mean(percentage_mutations)), alpha=0.1, outlier.shape = NA) 415 p = p + geom_point(aes(colour=best_match), position="jitter") + geom_boxplot(aes(middle=mean(percentage_mutations)), alpha=0.1, outlier.shape = NA)
417 p = p + xlab("Subclass") + ylab("Frequency") + ggtitle("Frequency scatter plot") + theme(panel.background = element_rect(fill = "white", colour="black")) 416 p = p + xlab("Subclass") + ylab("Frequency") + ggtitle("Frequency scatter plot") + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=13, colour="black"))
418 p = p + scale_fill_manual(values=c("IGA1" = "lightblue1", "IGA2" = "blue4", "IGG1" = "olivedrab3", "IGG2" = "red", "IGG3" = "gold", "IGG4" = "darkred", "IGM" = "black")) 417 p = p + scale_fill_manual(values=c("IGA" = "blue4", "IGA1" = "lightblue1", "IGA2" = "blue4", "IGG" = "olivedrab3", "IGG1" = "olivedrab3", "IGG2" = "red", "IGG3" = "gold", "IGG4" = "darkred", "IGM" = "darkviolet"))
419 p = p + scale_colour_manual(values=c("IGA1" = "lightblue1", "IGA2" = "blue4", "IGG1" = "olivedrab3", "IGG2" = "red", "IGG3" = "gold", "IGG4" = "darkred", "IGM" = "black")) 418 p = p + scale_colour_manual(values=c("IGA" = "blue4", "IGA1" = "lightblue1", "IGA2" = "blue4", "IGG" = "olivedrab3", "IGG1" = "olivedrab3", "IGG2" = "red", "IGG3" = "gold", "IGG4" = "darkred", "IGM" = "darkviolet"))
420 419
421 png(filename="scatter.png") 420 png(filename="scatter.png")
422 print(p) 421 print(p)
423 dev.off() 422 dev.off()
424 423
425 write.table(dat[,c("Sequence.ID", "best_match", "VRegionMutations", "VRegionNucleotides", "percentage_mutations")], "scatter.txt", sep="\t",quote=F,row.names=F,col.names=T) 424 write.table(dat[,c("Sequence.ID", "best_match", "VRegionMutations", "VRegionNucleotides", "percentage_mutations")], "scatter.txt", sep="\t",quote=F,row.names=F,col.names=T)
426 425
427 write.table(dat, input, sep="\t",quote=F,row.names=F,col.names=T) 426 write.table(dat, input, sep="\t",quote=F,row.names=F,col.names=T)
428 427
429
430 print("Plotting frequency ranges plot") 428 print("Plotting frequency ranges plot")
431 429
432 dat$best_match_class = substr(dat$best_match, 0, 3) 430 dat$best_match_class = substr(dat$best_match, 0, 3)
433 freq_labels = c("0", "0-2", "2-5", "5-10", "10-15", "15-20", "20") 431 freq_labels = c("0", "0-2", "2-5", "5-10", "10-15", "15-20", "20")
434 dat$frequency_bins = cut(dat$percentage_mutations, breaks=c(-Inf, 0, 2,5,10,15,20, Inf), labels=freq_labels) 432 dat$frequency_bins = cut(dat$percentage_mutations, breaks=c(-Inf, 0, 2,5,10,15,20, Inf), labels=freq_labels)
440 frequency_bins_data = merge(frequency_bins_data, frequency_bins_sum, by="best_match_class") 438 frequency_bins_data = merge(frequency_bins_data, frequency_bins_sum, by="best_match_class")
441 439
442 frequency_bins_data$frequency = round(frequency_bins_data$frequency_count / frequency_bins_data$class_sum * 100, 2) 440 frequency_bins_data$frequency = round(frequency_bins_data$frequency_count / frequency_bins_data$class_sum * 100, 2)
443 441
444 p = ggplot(frequency_bins_data, aes(frequency_bins, frequency)) 442 p = ggplot(frequency_bins_data, aes(frequency_bins, frequency))
445 p = p + geom_bar(aes(fill=best_match_class), stat="identity", position="dodge") + theme(panel.background = element_rect(fill = "white", colour="black")) 443 p = p + geom_bar(aes(fill=best_match_class), stat="identity", position="dodge") + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=13, colour="black"))
446 p = p + xlab("Frequency ranges") + ylab("Frequency") + ggtitle("Mutation Frequencies by class") + scale_fill_manual(values=c("IGA" = "blue4", "IGG" = "olivedrab3", "IGM" = "black")) 444 p = p + xlab("Frequency ranges") + ylab("Frequency") + ggtitle("Mutation Frequencies by class") + scale_fill_manual(values=c("IGA" = "blue4", "IGG" = "olivedrab3", "IGM" = "black"))
447 445
448 png(filename="frequency_ranges.png") 446 png(filename="frequency_ranges.png")
449 print(p) 447 print(p)
450 dev.off() 448 dev.off()
460 frequency_bins_data$frequency = round(frequency_bins_data$frequency_count / frequency_bins_data$class_sum * 100, 2) 458 frequency_bins_data$frequency = round(frequency_bins_data$frequency_count / frequency_bins_data$class_sum * 100, 2)
461 459
462 write.table(frequency_bins_data, "frequency_ranges_subclasses.txt", sep="\t",quote=F,row.names=F,col.names=T) 460 write.table(frequency_bins_data, "frequency_ranges_subclasses.txt", sep="\t",quote=F,row.names=F,col.names=T)
463 461
464 462
465 #frequency_bins_data_by_class 463
466 #frequency_ranges_subclasses.txt 464
467 465
468 466
469 467
470 468
471 469
472 470
473 471
474 472
475 473
476 474
477 475
478 476
479 477
480 478
481 479
482 480
483 481
484 482
485 483
486 484
487 485
488 486
489 487
490 488
491 489
492 490
493 491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516