Mercurial > repos > davidvanzessen > shm_csr
comparison shm_csr.r @ 1:faae21ba5c63 draft
Uploaded
author | davidvanzessen |
---|---|
date | Tue, 25 Oct 2016 07:28:43 -0400 |
parents | c33d93683a09 |
children | e85fec274cde |
comparison
equal
deleted
inserted
replaced
0:c33d93683a09 | 1:faae21ba5c63 |
---|---|
5 args <- commandArgs(trailingOnly = TRUE) | 5 args <- commandArgs(trailingOnly = TRUE) |
6 | 6 |
7 input = args[1] | 7 input = args[1] |
8 genes = unlist(strsplit(args[2], ",")) | 8 genes = unlist(strsplit(args[2], ",")) |
9 outputdir = args[3] | 9 outputdir = args[3] |
10 include_fr1 = ifelse(args[4] == "yes", T, F) | 10 empty.region.filter = args[4] |
11 setwd(outputdir) | 11 setwd(outputdir) |
12 | 12 |
13 dat = read.table(input, header=T, sep="\t", fill=T, stringsAsFactors=F) | 13 dat = read.table(input, header=T, sep="\t", fill=T, stringsAsFactors=F) |
14 | 14 |
15 if(length(dat$Sequence.ID) == 0){ | 15 if(length(dat$Sequence.ID) == 0){ |
21 row.names(transitionTable) = c("A", "C", "G", "T") | 21 row.names(transitionTable) = c("A", "C", "G", "T") |
22 transitionTable["A","A"] = NA | 22 transitionTable["A","A"] = NA |
23 transitionTable["C","C"] = NA | 23 transitionTable["C","C"] = NA |
24 transitionTable["G","G"] = NA | 24 transitionTable["G","G"] = NA |
25 transitionTable["T","T"] = NA | 25 transitionTable["T","T"] = NA |
26 | |
26 write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA) | 27 write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA) |
27 cat("0", file="n.txt") | 28 cat("0", file="n.txt") |
28 stop("No data") | 29 stop("No data") |
29 } | 30 } |
30 | 31 |
31 cleanup_columns = c("FR1.IMGT.c.a", | 32 cleanup_columns = c("FR1.IMGT.c.a", |
32 "FR2.IMGT.g.t", | 33 "FR2.IMGT.g.t", |
33 "CDR1.IMGT.Nb.of.nucleotides", | 34 "CDR1.IMGT.Nb.of.nucleotides", |
34 "CDR2.IMGT.t.a", | 35 "CDR2.IMGT.t.a", |
35 "FR1.IMGT.c.g", | 36 "FR1.IMGT.c.g", |
36 "CDR1.IMGT.c.t", | 37 "CDR1.IMGT.c.t", |
37 "FR2.IMGT.a.c", | 38 "FR2.IMGT.a.c", |
38 "FR2.IMGT.Nb.of.mutations", | 39 "FR2.IMGT.Nb.of.mutations", |
39 "FR2.IMGT.g.c", | 40 "FR2.IMGT.g.c", |
40 "FR2.IMGT.a.g", | 41 "FR2.IMGT.a.g", |
41 "FR3.IMGT.t.a", | 42 "FR3.IMGT.t.a", |
42 "FR3.IMGT.t.c", | 43 "FR3.IMGT.t.c", |
43 "FR2.IMGT.g.a", | 44 "FR2.IMGT.g.a", |
44 "FR3.IMGT.c.g", | 45 "FR3.IMGT.c.g", |
45 "FR1.IMGT.Nb.of.mutations", | 46 "FR1.IMGT.Nb.of.mutations", |
46 "CDR1.IMGT.g.a", | 47 "CDR1.IMGT.g.a", |
47 "CDR1.IMGT.t.g", | 48 "CDR1.IMGT.t.g", |
48 "CDR1.IMGT.g.c", | 49 "CDR1.IMGT.g.c", |
49 "CDR2.IMGT.Nb.of.nucleotides", | 50 "CDR2.IMGT.Nb.of.nucleotides", |
50 "FR2.IMGT.a.t", | 51 "FR2.IMGT.a.t", |
51 "CDR1.IMGT.Nb.of.mutations", | 52 "CDR1.IMGT.Nb.of.mutations", |
52 "CDR3.IMGT.Nb.of.nucleotides", | 53 "CDR3.IMGT.Nb.of.nucleotides", |
53 "CDR1.IMGT.a.g", | 54 "CDR1.IMGT.a.g", |
54 "FR3.IMGT.a.c", | 55 "FR3.IMGT.a.c", |
55 "FR1.IMGT.g.a", | 56 "FR1.IMGT.g.a", |
56 "FR3.IMGT.a.g", | 57 "FR3.IMGT.a.g", |
57 "FR1.IMGT.a.t", | 58 "FR1.IMGT.a.t", |
58 "CDR2.IMGT.a.g", | 59 "CDR2.IMGT.a.g", |
59 "CDR2.IMGT.Nb.of.mutations", | 60 "CDR2.IMGT.Nb.of.mutations", |
60 "CDR2.IMGT.g.t", | 61 "CDR2.IMGT.g.t", |
61 "CDR2.IMGT.a.c", | 62 "CDR2.IMGT.a.c", |
62 "CDR1.IMGT.t.c", | 63 "CDR1.IMGT.t.c", |
63 "FR3.IMGT.g.c", | 64 "FR3.IMGT.g.c", |
64 "FR1.IMGT.g.t", | 65 "FR1.IMGT.g.t", |
65 "FR3.IMGT.g.t", | 66 "FR3.IMGT.g.t", |
66 "CDR1.IMGT.a.t", | 67 "CDR1.IMGT.a.t", |
67 "FR1.IMGT.a.g", | 68 "FR1.IMGT.a.g", |
68 "FR3.IMGT.a.t", | 69 "FR3.IMGT.a.t", |
69 "FR3.IMGT.Nb.of.nucleotides", | 70 "FR3.IMGT.Nb.of.nucleotides", |
70 "FR2.IMGT.t.c", | 71 "FR2.IMGT.t.c", |
71 "CDR2.IMGT.g.a", | 72 "CDR2.IMGT.g.a", |
72 "FR2.IMGT.t.a", | 73 "FR2.IMGT.t.a", |
73 "CDR1.IMGT.t.a", | 74 "CDR1.IMGT.t.a", |
74 "FR2.IMGT.t.g", | 75 "FR2.IMGT.t.g", |
75 "FR3.IMGT.t.g", | 76 "FR3.IMGT.t.g", |
76 "FR2.IMGT.Nb.of.nucleotides", | 77 "FR2.IMGT.Nb.of.nucleotides", |
77 "FR1.IMGT.t.a", | 78 "FR1.IMGT.t.a", |
78 "FR1.IMGT.t.g", | 79 "FR1.IMGT.t.g", |
79 "FR3.IMGT.c.t", | 80 "FR3.IMGT.c.t", |
80 "FR1.IMGT.t.c", | 81 "FR1.IMGT.t.c", |
81 "CDR2.IMGT.a.t", | 82 "CDR2.IMGT.a.t", |
82 "FR2.IMGT.c.t", | 83 "FR2.IMGT.c.t", |
83 "CDR1.IMGT.g.t", | 84 "CDR1.IMGT.g.t", |
84 "CDR2.IMGT.t.g", | 85 "CDR2.IMGT.t.g", |
85 "FR1.IMGT.Nb.of.nucleotides", | 86 "FR1.IMGT.Nb.of.nucleotides", |
86 "CDR1.IMGT.c.g", | 87 "CDR1.IMGT.c.g", |
87 "CDR2.IMGT.t.c", | 88 "CDR2.IMGT.t.c", |
88 "FR3.IMGT.g.a", | 89 "FR3.IMGT.g.a", |
89 "CDR1.IMGT.a.c", | 90 "CDR1.IMGT.a.c", |
90 "FR2.IMGT.c.a", | 91 "FR2.IMGT.c.a", |
91 "FR3.IMGT.Nb.of.mutations", | 92 "FR3.IMGT.Nb.of.mutations", |
92 "FR2.IMGT.c.g", | 93 "FR2.IMGT.c.g", |
93 "CDR2.IMGT.g.c", | 94 "CDR2.IMGT.g.c", |
94 "FR1.IMGT.g.c", | 95 "FR1.IMGT.g.c", |
95 "CDR2.IMGT.c.t", | 96 "CDR2.IMGT.c.t", |
96 "FR3.IMGT.c.a", | 97 "FR3.IMGT.c.a", |
97 "CDR1.IMGT.c.a", | 98 "CDR1.IMGT.c.a", |
98 "CDR2.IMGT.c.g", | 99 "CDR2.IMGT.c.g", |
99 "CDR2.IMGT.c.a", | 100 "CDR2.IMGT.c.a", |
100 "FR1.IMGT.c.t", | 101 "FR1.IMGT.c.t", |
101 "FR1.IMGT.Nb.of.silent.mutations", | 102 "FR1.IMGT.Nb.of.silent.mutations", |
102 "FR2.IMGT.Nb.of.silent.mutations", | 103 "FR2.IMGT.Nb.of.silent.mutations", |
103 "FR3.IMGT.Nb.of.silent.mutations", | 104 "FR3.IMGT.Nb.of.silent.mutations", |
104 "FR1.IMGT.Nb.of.nonsilent.mutations", | 105 "FR1.IMGT.Nb.of.nonsilent.mutations", |
105 "FR2.IMGT.Nb.of.nonsilent.mutations", | 106 "FR2.IMGT.Nb.of.nonsilent.mutations", |
106 "FR3.IMGT.Nb.of.nonsilent.mutations") | 107 "FR3.IMGT.Nb.of.nonsilent.mutations") |
107 | |
108 | 108 |
109 print("Cleaning up columns") | 109 print("Cleaning up columns") |
110 | |
110 for(col in cleanup_columns){ | 111 for(col in cleanup_columns){ |
111 dat[,col] = gsub("\\(.*\\)", "", dat[,col]) | 112 dat[,col] = gsub("\\(.*\\)", "", dat[,col]) |
112 #dat[dat[,col] == "",] = "0" | 113 #dat[dat[,col] == "",] = "0" |
113 dat[,col] = as.numeric(dat[,col]) | 114 dat[,col] = as.numeric(dat[,col]) |
114 dat[is.na(dat[,col]),col] = 0 | 115 dat[is.na(dat[,col]),col] = 0 |
115 } | 116 } |
116 | 117 |
117 regions = c("FR1", "CDR1", "FR2", "CDR2", "FR3") | 118 regions = c("FR1", "CDR1", "FR2", "CDR2", "FR3") |
118 if(!include_fr1){ | 119 if(empty.region.filter == "FR1") { |
119 regions = c("CDR1", "FR2", "CDR2", "FR3") | 120 regions = c("CDR1", "FR2", "CDR2", "FR3") |
121 } else if (empty.region.filter == "CDR1") { | |
122 regions = c("FR2", "CDR2", "FR3", "CDR3") | |
123 } else if (empty.region.filter == "FR2") { | |
124 regions = c("CDR2", "FR3", "CDR3") | |
120 } | 125 } |
121 | 126 |
122 sum_by_row = function(x, columns) { sum(as.numeric(x[columns]), na.rm=T) } | 127 sum_by_row = function(x, columns) { sum(as.numeric(x[columns]), na.rm=T) } |
123 | 128 |
124 print("aggregating data into new columns") | 129 print("aggregating data into new columns") |
134 dat$transitionMutations = apply(dat, FUN=sum_by_row, 1, columns=transitionMutations_columns) | 139 dat$transitionMutations = apply(dat, FUN=sum_by_row, 1, columns=transitionMutations_columns) |
135 | 140 |
136 transversionMutations_columns = paste(rep(regions, each=8), c(".IMGT.a.c",".IMGT.c.a",".IMGT.a.t",".IMGT.t.a",".IMGT.g.c",".IMGT.c.g",".IMGT.g.t",".IMGT.t.g"), sep="") | 141 transversionMutations_columns = paste(rep(regions, each=8), c(".IMGT.a.c",".IMGT.c.a",".IMGT.a.t",".IMGT.t.a",".IMGT.g.c",".IMGT.c.g",".IMGT.g.t",".IMGT.t.g"), sep="") |
137 dat$transversionMutations = apply(dat, FUN=sum_by_row, 1, columns=transversionMutations_columns) | 142 dat$transversionMutations = apply(dat, FUN=sum_by_row, 1, columns=transversionMutations_columns) |
138 | 143 |
139 | |
140 transitionMutationsAtGC_columns = paste(rep(regions, each=2), c(".IMGT.g.a",".IMGT.c.t"), sep="") | 144 transitionMutationsAtGC_columns = paste(rep(regions, each=2), c(".IMGT.g.a",".IMGT.c.t"), sep="") |
141 dat$transitionMutationsAtGC = apply(dat, FUN=sum_by_row, 1, columns=transitionMutationsAtGC_columns) | 145 dat$transitionMutationsAtGC = apply(dat, FUN=sum_by_row, 1, columns=transitionMutationsAtGC_columns) |
142 | |
143 | 146 |
144 totalMutationsAtGC_columns = paste(rep(regions, each=6), c(".IMGT.c.g",".IMGT.c.t",".IMGT.c.a",".IMGT.g.c",".IMGT.g.a",".IMGT.g.t"), sep="") | 147 totalMutationsAtGC_columns = paste(rep(regions, each=6), c(".IMGT.c.g",".IMGT.c.t",".IMGT.c.a",".IMGT.g.c",".IMGT.g.a",".IMGT.g.t"), sep="") |
145 #totalMutationsAtGC_columns = paste(rep(regions, each=6), c(".IMGT.g.a",".IMGT.c.t",".IMGT.c.a",".IMGT.c.g",".IMGT.g.t"), sep="") | 148 #totalMutationsAtGC_columns = paste(rep(regions, each=6), c(".IMGT.g.a",".IMGT.c.t",".IMGT.c.a",".IMGT.c.g",".IMGT.g.t"), sep="") |
146 dat$totalMutationsAtGC = apply(dat, FUN=sum_by_row, 1, columns=totalMutationsAtGC_columns) | 149 dat$totalMutationsAtGC = apply(dat, FUN=sum_by_row, 1, columns=totalMutationsAtGC_columns) |
147 | 150 |
150 | 153 |
151 totalMutationsAtAT_columns = paste(rep(regions, each=6), c(".IMGT.a.g",".IMGT.a.c",".IMGT.a.t",".IMGT.t.g",".IMGT.t.c",".IMGT.t.a"), sep="") | 154 totalMutationsAtAT_columns = paste(rep(regions, each=6), c(".IMGT.a.g",".IMGT.a.c",".IMGT.a.t",".IMGT.t.g",".IMGT.t.c",".IMGT.t.a"), sep="") |
152 #totalMutationsAtAT_columns = paste(rep(regions, each=5), c(".IMGT.a.g",".IMGT.t.c",".IMGT.a.c",".IMGT.g.c",".IMGT.t.g"), sep="") | 155 #totalMutationsAtAT_columns = paste(rep(regions, each=5), c(".IMGT.a.g",".IMGT.t.c",".IMGT.a.c",".IMGT.g.c",".IMGT.t.g"), sep="") |
153 dat$totalMutationsAtAT = apply(dat, FUN=sum_by_row, 1, columns=totalMutationsAtAT_columns) | 156 dat$totalMutationsAtAT = apply(dat, FUN=sum_by_row, 1, columns=totalMutationsAtAT_columns) |
154 | 157 |
155 | |
156 FRRegions = regions[grepl("FR", regions)] | 158 FRRegions = regions[grepl("FR", regions)] |
157 CDRRegions = regions[grepl("CDR", regions)] | 159 CDRRegions = regions[grepl("CDR", regions)] |
158 | 160 |
159 FR_silentMutations_columns = paste(FRRegions, ".IMGT.Nb.of.silent.mutations", sep="") | 161 FR_silentMutations_columns = paste(FRRegions, ".IMGT.Nb.of.silent.mutations", sep="") |
160 dat$silentMutationsFR = apply(dat, FUN=sum_by_row, 1, columns=FR_silentMutations_columns) | 162 dat$silentMutationsFR = apply(dat, FUN=sum_by_row, 1, columns=FR_silentMutations_columns) |
167 | 169 |
168 CDR_nonSilentMutations_columns = paste(CDRRegions, ".IMGT.Nb.of.nonsilent.mutations", sep="") | 170 CDR_nonSilentMutations_columns = paste(CDRRegions, ".IMGT.Nb.of.nonsilent.mutations", sep="") |
169 dat$nonSilentMutationsCDR = apply(dat, FUN=sum_by_row, 1, columns=CDR_nonSilentMutations_columns) | 171 dat$nonSilentMutationsCDR = apply(dat, FUN=sum_by_row, 1, columns=CDR_nonSilentMutations_columns) |
170 | 172 |
171 mutation.sum.columns = c("Sequence.ID", "VRegionMutations", "VRegionNucleotides", "transitionMutations", "transversionMutations", "transitionMutationsAtGC", "transitionMutationsAtAT", "silentMutationsFR", "nonSilentMutationsFR", "silentMutationsCDR", "nonSilentMutationsCDR") | 173 mutation.sum.columns = c("Sequence.ID", "VRegionMutations", "VRegionNucleotides", "transitionMutations", "transversionMutations", "transitionMutationsAtGC", "transitionMutationsAtAT", "silentMutationsFR", "nonSilentMutationsFR", "silentMutationsCDR", "nonSilentMutationsCDR") |
172 | |
173 write.table(dat[,mutation.sum.columns], "mutation_by_id.txt", sep="\t",quote=F,row.names=F,col.names=T) | 174 write.table(dat[,mutation.sum.columns], "mutation_by_id.txt", sep="\t",quote=F,row.names=F,col.names=T) |
174 | 175 |
175 setwd(outputdir) | 176 setwd(outputdir) |
176 | 177 |
177 base.order = data.frame(base=c("A", "T", "C", "G"), order=1:4) | 178 base.order = data.frame(base=c("A", "T", "C", "G"), order=1:4) |
181 | 182 |
182 j = i - 1 | 183 j = i - 1 |
183 x = (j * 3) + 1 | 184 x = (j * 3) + 1 |
184 y = (j * 3) + 2 | 185 y = (j * 3) + 2 |
185 z = (j * 3) + 3 | 186 z = (j * 3) + 3 |
186 | 187 |
187 if(nrow(tmp) > 0){ | 188 if(nrow(tmp) > 0){ |
188 | |
189 if(fname == "sum"){ | 189 if(fname == "sum"){ |
190 matrx[1,x] = round(f(tmp$VRegionMutations, na.rm=T), digits=1) | 190 matrx[1,x] = round(f(tmp$VRegionMutations, na.rm=T), digits=1) |
191 matrx[1,y] = round(f(tmp$VRegionNucleotides, na.rm=T), digits=1) | 191 matrx[1,y] = round(f(tmp$VRegionNucleotides, na.rm=T), digits=1) |
192 matrx[1,z] = round(f(matrx[1,x] / matrx[1,y]) * 100, digits=1) | 192 matrx[1,z] = round(f(matrx[1,x] / matrx[1,y]) * 100, digits=1) |
193 } else { | 193 } else { |
194 matrx[1,x] = round(f(tmp$VRegionMutations, na.rm=T), digits=1) | 194 matrx[1,x] = round(f(tmp$VRegionMutations, na.rm=T), digits=1) |
195 matrx[1,y] = round(f(tmp$VRegionNucleotides, na.rm=T), digits=1) | 195 matrx[1,y] = round(f(tmp$VRegionNucleotides, na.rm=T), digits=1) |
196 matrx[1,z] = round(f(tmp$VRegionMutations / tmp$VRegionNucleotides) * 100, digits=1) | 196 matrx[1,z] = round(f(tmp$VRegionMutations / tmp$VRegionNucleotides) * 100, digits=1) |
197 } | 197 } |
198 | 198 |
199 matrx[2,x] = round(f(tmp$transitionMutations, na.rm=T), digits=1) | 199 matrx[2,x] = round(f(tmp$transitionMutations, na.rm=T), digits=1) |
200 matrx[2,y] = round(f(tmp$VRegionMutations, na.rm=T), digits=1) | 200 matrx[2,y] = round(f(tmp$VRegionMutations, na.rm=T), digits=1) |
201 matrx[2,z] = round(matrx[2,x] / matrx[2,y] * 100, digits=1) | 201 matrx[2,z] = round(matrx[2,x] / matrx[2,y] * 100, digits=1) |
257 FR1 = paste("FR1.IMGT.", nt1, ".", nt2, sep="") | 257 FR1 = paste("FR1.IMGT.", nt1, ".", nt2, sep="") |
258 CDR1 = paste("CDR1.IMGT.", nt1, ".", nt2, sep="") | 258 CDR1 = paste("CDR1.IMGT.", nt1, ".", nt2, sep="") |
259 FR2 = paste("FR2.IMGT.", nt1, ".", nt2, sep="") | 259 FR2 = paste("FR2.IMGT.", nt1, ".", nt2, sep="") |
260 CDR2 = paste("CDR2.IMGT.", nt1, ".", nt2, sep="") | 260 CDR2 = paste("CDR2.IMGT.", nt1, ".", nt2, sep="") |
261 FR3 = paste("FR3.IMGT.", nt1, ".", nt2, sep="") | 261 FR3 = paste("FR3.IMGT.", nt1, ".", nt2, sep="") |
262 if(include_fr1){ | 262 if (empty.region.filter == "leader"){ |
263 transitionTable[NT1,NT2] = sum(tmp[,c(FR1, CDR1, FR2, CDR2, FR3)]) | 263 transitionTable[NT1,NT2] = sum(tmp[,c(FR1, CDR1, FR2, CDR2, FR3)]) |
264 } else { | 264 } else if (empty.region.filter == "FR1") { |
265 transitionTable[NT1,NT2] = sum(tmp[,c(CDR1, FR2, CDR2, FR3)]) | 265 transitionTable[NT1,NT2] = sum(tmp[,c(CDR1, FR2, CDR2, FR3)]) |
266 } else if (empty.region.filter == "CDR1") { | |
267 transitionTable[NT1,NT2] = sum(tmp[,c(FR2, CDR2, FR3)]) | |
268 } else if (empty.region.filter == "FR2") { | |
269 transitionTable[NT1,NT2] = sum(tmp[,c(CDR2, FR3)]) | |
266 } | 270 } |
267 } | 271 } |
268 } | 272 } |
269 transition = transitionTable | 273 transition = transitionTable |
270 transition$id = names(transition) | 274 transition$id = names(transition) |
271 | 275 |
272 transition2 = melt(transition, id.vars="id") | 276 transition2 = melt(transition, id.vars="id") |
273 | 277 |
274 transition2 = merge(transition2, base.order, by.x="id", by.y="base") | 278 transition2 = merge(transition2, base.order, by.x="id", by.y="base") |
279 | |
275 transition2 = merge(transition2, base.order, by.x="variable", by.y="base") | 280 transition2 = merge(transition2, base.order, by.x="variable", by.y="base") |
276 | 281 |
277 transition2[is.na(transition2$value),]$value = 0 | 282 transition2[is.na(transition2$value),]$value = 0 |
278 | 283 |
279 if(!all(transition2$value == 0)){ #having rows of data but a transition table filled with 0 is bad | 284 if(any(transition2$value == 0)){ #having rows of data but a transition table filled with 0 is bad |
280 | |
281 print("Plotting stacked transition") | 285 print("Plotting stacked transition") |
282 | |
283 png(filename=paste("transitions_stacked_", name, ".png", sep="")) | 286 png(filename=paste("transitions_stacked_", name, ".png", sep="")) |
284 p = ggplot(transition2, aes(factor(reorder(id, order.x)), y=value, fill=factor(reorder(variable, order.y)))) + geom_bar(position="fill", stat="identity", colour="black") #stacked bar | 287 p = ggplot(transition2, aes(factor(reorder(id, order.x)), y=value, fill=factor(reorder(variable, order.y)))) + geom_bar(position="fill", stat="identity", colour="black") #stacked bar |
285 p = p + xlab("From base") + ylab("To base") + ggtitle("Mutations frequency from base to base") + guides(fill=guide_legend(title=NULL)) | 288 p = p + xlab("From base") + ylab("To base") + ggtitle("Mutations frequency from base to base") + guides(fill=guide_legend(title=NULL)) |
286 p = p + theme(panel.background = element_rect(fill = "white", colour="black")) + scale_fill_manual(values=c("A" = "blue4", "G" = "lightblue1", "C" = "olivedrab3", "T" = "olivedrab4")) | 289 p = p + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=13, colour="black")) + scale_fill_manual(values=c("A" = "blue4", "G" = "lightblue1", "C" = "olivedrab3", "T" = "olivedrab4")) |
287 #p = p + scale_colour_manual(values=c("A" = "black", "G" = "black", "C" = "black", "T" = "black")) | 290 #p = p + scale_colour_manual(values=c("A" = "black", "G" = "black", "C" = "black", "T" = "black")) |
288 print(p) | 291 print(p) |
289 dev.off() | 292 dev.off() |
290 | 293 |
291 print("Plotting heatmap transition") | 294 print("Plotting heatmap transition") |
292 | 295 |
293 png(filename=paste("transitions_heatmap_", name, ".png", sep="")) | 296 png(filename=paste("transitions_heatmap_", name, ".png", sep="")) |
294 p = ggplot(transition2, aes(factor(reorder(id, order.x)), factor(reorder(variable, order.y)))) + geom_tile(aes(fill = value)) + scale_fill_gradient(low="white", high="steelblue") #heatmap | 297 p = ggplot(transition2, aes(factor(reorder(id, order.x)), factor(reorder(variable, order.y)))) + geom_tile(aes(fill = value)) + scale_fill_gradient(low="white", high="steelblue") #heatmap |
295 p = p + xlab("From base") + ylab("To base") + ggtitle("Mutations frequency from base to base") + theme(panel.background = element_rect(fill = "white", colour="black")) | 298 p = p + xlab("From base") + ylab("To base") + ggtitle("Mutations frequency from base to base") + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=13, colour="black")) |
296 print(p) | 299 print(p) |
297 dev.off() | 300 dev.off() |
298 } else { | 301 } else { |
299 print("No data to plot") | 302 print("No data to plot") |
300 } | 303 } |
301 } | 304 } |
302 | 305 |
303 #print(paste("writing value file: ", name, "_", fname, "_value.txt" ,sep="")) | 306 #print(paste("writing value file: ", name, "_", fname, "_value.txt" ,sep="")) |
304 | |
305 write.table(x=transitionTable, file=paste("transitions_", name ,"_", fname, ".txt", sep=""), sep=",",quote=F,row.names=T,col.names=NA) | 307 write.table(x=transitionTable, file=paste("transitions_", name ,"_", fname, ".txt", sep=""), sep=",",quote=F,row.names=T,col.names=NA) |
306 write.table(x=tmp[,c("Sequence.ID", "best_match", "chunk_hit_percentage", "nt_hit_percentage", "start_locations")], file=paste("matched_", name , "_", fname, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T) | 308 write.table(x=tmp[,c("Sequence.ID", "best_match", "chunk_hit_percentage", "nt_hit_percentage", "start_locations")], file=paste("matched_", name , "_", fname, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T) |
307 | |
308 cat(matrx[1,x], file=paste(name, "_", fname, "_value.txt" ,sep="")) | 309 cat(matrx[1,x], file=paste(name, "_", fname, "_value.txt" ,sep="")) |
309 cat(nrow(tmp), file=paste(name, "_", fname, "_n.txt" ,sep="")) | 310 cat(nrow(tmp), file=paste(name, "_", fname, "_n.txt" ,sep="")) |
310 | |
311 #print(paste(fname, name, nrow(tmp))) | 311 #print(paste(fname, name, nrow(tmp))) |
312 | |
313 matrx | 312 matrx |
314 } | 313 } |
315 | |
316 nts = c("a", "c", "g", "t") | 314 nts = c("a", "c", "g", "t") |
317 zeros=rep(0, 4) | 315 zeros=rep(0, 4) |
318 | |
319 funcs = c(median, sum, mean) | 316 funcs = c(median, sum, mean) |
320 fnames = c("median", "sum", "mean") | 317 fnames = c("median", "sum", "mean") |
321 | 318 |
322 print("Creating result tables") | 319 print("Creating result tables") |
323 | 320 |
328 rows = 9 | 325 rows = 9 |
329 if(fname == "sum"){ | 326 if(fname == "sum"){ |
330 rows = 11 | 327 rows = 11 |
331 } | 328 } |
332 matrx = matrix(data = 0, ncol=((length(genes) + 1) * 3),nrow=rows) | 329 matrx = matrix(data = 0, ncol=((length(genes) + 1) * 3),nrow=rows) |
333 | |
334 for(i in 1:length(genes)){ | 330 for(i in 1:length(genes)){ |
335 print(paste("Creating table for", fname, genes[i])) | 331 print(paste("Creating table for", fname, genes[i])) |
336 matrx = calculate_result(i, genes[i], dat, matrx, func, fname, genes[i]) | 332 matrx = calculate_result(i, genes[i], dat, matrx, func, fname, genes[i]) |
337 } | 333 } |
338 | |
339 matrx = calculate_result(i + 1, ".*", dat[!grepl("unmatched", dat$best_match),], matrx, func, fname, name="all") | 334 matrx = calculate_result(i + 1, ".*", dat[!grepl("unmatched", dat$best_match),], matrx, func, fname, name="all") |
340 | 335 |
341 result = data.frame(matrx) | 336 result = data.frame(matrx) |
342 if(fname == "sum"){ | 337 if(fname == "sum"){ |
343 row.names(result) = c("Number of Mutations (%)", "Transitions (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)", "nt in FR", "nt in CDR") | 338 row.names(result) = c("Number of Mutations (%)", "Transitions (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)", "nt in FR", "nt in CDR") |
344 } else { | 339 } else { |
345 row.names(result) = c("Number of Mutations (%)", "Transitions (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)") | 340 row.names(result) = c("Number of Mutations (%)", "Transitions (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)") |
346 } | 341 } |
347 | |
348 write.table(x=result, file=paste("mutations_", fname, ".txt", sep=""), sep=",",quote=F,row.names=T,col.names=F) | 342 write.table(x=result, file=paste("mutations_", fname, ".txt", sep=""), sep=",",quote=F,row.names=T,col.names=F) |
349 } | 343 } |
350 | 344 |
351 print("Adding median number of mutations to sum table") | 345 print("Adding median number of mutations to sum table") |
352 | |
353 sum.table = read.table("mutations_sum.txt", sep=",", header=F) | 346 sum.table = read.table("mutations_sum.txt", sep=",", header=F) |
354 median.table = read.table("mutations_median.txt", sep=",", header=F) | 347 median.table = read.table("mutations_median.txt", sep=",", header=F) |
355 | 348 |
356 new.table = sum.table[1,] | 349 new.table = sum.table[1,] |
357 new.table[2,] = median.table[1,] | 350 new.table[2,] = median.table[1,] |
361 | 354 |
362 #sum.table = sum.table[c("Number of Mutations (%)", "Median of Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)", "nt in FR", "nt in CDR"),] | 355 #sum.table = sum.table[c("Number of Mutations (%)", "Median of Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)", "Transitions at A T (%)", "Targeting of A T (%)", "FR R/S (ratio)", "CDR R/S (ratio)", "nt in FR", "nt in CDR"),] |
363 | 356 |
364 write.table(x=new.table, file="mutations_sum.txt", sep=",",quote=F,row.names=F,col.names=F) | 357 write.table(x=new.table, file="mutations_sum.txt", sep=",",quote=F,row.names=F,col.names=F) |
365 | 358 |
366 | |
367 print("Plotting IGA piechart") | 359 print("Plotting IGA piechart") |
368 | 360 |
369 dat = dat[!grepl("^unmatched", dat$best_match),] | 361 dat = dat[!grepl("^unmatched", dat$best_match),] |
370 | 362 |
371 #blegh | 363 #blegh |
364 | |
372 genesForPlot = dat[grepl("IGA", dat$best_match),]$best_match | 365 genesForPlot = dat[grepl("IGA", dat$best_match),]$best_match |
366 | |
373 if(length(genesForPlot) > 0){ | 367 if(length(genesForPlot) > 0){ |
374 genesForPlot = data.frame(table(genesForPlot)) | 368 genesForPlot = data.frame(table(genesForPlot)) |
375 colnames(genesForPlot) = c("Gene","Freq") | 369 colnames(genesForPlot) = c("Gene","Freq") |
376 genesForPlot$label = paste(genesForPlot$Gene, "-", genesForPlot$Freq) | 370 genesForPlot$label = paste(genesForPlot$Gene, "-", genesForPlot$Freq) |
377 | 371 |
378 pc = ggplot(genesForPlot, aes(x = factor(1), y=Freq, fill=Gene)) | 372 pc = ggplot(genesForPlot, aes(x = factor(1), y=Freq, fill=Gene)) |
379 pc = pc + geom_bar(width = 1, stat = "identity") + scale_fill_manual(labels=genesForPlot$label, values=c("IGA1" = "lightblue1", "IGA2" = "blue4")) | 373 pc = pc + geom_bar(width = 1, stat = "identity") + scale_fill_manual(labels=genesForPlot$label, values=c("IGA1" = "lightblue1", "IGA2" = "blue4")) |
380 pc = pc + coord_polar(theta="y") | 374 pc = pc + coord_polar(theta="y") + scale_y_continuous(breaks=NULL) |
381 pc = pc + theme(panel.background = element_rect(fill = "white", colour="black")) | 375 pc = pc + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=13, colour="black")) |
382 pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IGA subclasses", "( n =", sum(genesForPlot$Freq), ")")) | 376 pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IGA subclasses", "( n =", sum(genesForPlot$Freq), ")")) |
383 write.table(genesForPlot, "IGA.txt", sep="\t",quote=F,row.names=F,col.names=T) | 377 write.table(genesForPlot, "IGA.txt", sep="\t",quote=F,row.names=F,col.names=T) |
384 | 378 |
385 png(filename="IGA.png") | 379 png(filename="IGA.png") |
386 print(pc) | 380 print(pc) |
388 } | 382 } |
389 | 383 |
390 print("Plotting IGG piechart") | 384 print("Plotting IGG piechart") |
391 | 385 |
392 genesForPlot = dat[grepl("IGG", dat$best_match),]$best_match | 386 genesForPlot = dat[grepl("IGG", dat$best_match),]$best_match |
387 | |
393 if(length(genesForPlot) > 0){ | 388 if(length(genesForPlot) > 0){ |
394 genesForPlot = data.frame(table(genesForPlot)) | 389 genesForPlot = data.frame(table(genesForPlot)) |
395 colnames(genesForPlot) = c("Gene","Freq") | 390 colnames(genesForPlot) = c("Gene","Freq") |
396 genesForPlot$label = paste(genesForPlot$Gene, "-", genesForPlot$Freq) | 391 genesForPlot$label = paste(genesForPlot$Gene, "-", genesForPlot$Freq) |
397 | 392 |
398 pc = ggplot(genesForPlot, aes(x = factor(1), y=Freq, fill=Gene)) | 393 pc = ggplot(genesForPlot, aes(x = factor(1), y=Freq, fill=Gene)) |
399 pc = pc + geom_bar(width = 1, stat = "identity") + scale_fill_manual(labels=genesForPlot$label, values=c("IGG1" = "olivedrab3", "IGG2" = "red", "IGG3" = "gold", "IGG4" = "darkred")) | 394 pc = pc + geom_bar(width = 1, stat = "identity") + scale_fill_manual(labels=genesForPlot$label, values=c("IGG1" = "olivedrab3", "IGG2" = "red", "IGG3" = "gold", "IGG4" = "darkred")) |
400 pc = pc + coord_polar(theta="y") | 395 pc = pc + coord_polar(theta="y") + scale_y_continuous(breaks=NULL) |
401 pc = pc + theme(panel.background = element_rect(fill = "white", colour="black")) | 396 pc = pc + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=13, colour="black")) |
402 pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IGG subclasses", "( n =", sum(genesForPlot$Freq), ")")) | 397 pc = pc + xlab(" ") + ylab(" ") + ggtitle(paste("IGG subclasses", "( n =", sum(genesForPlot$Freq), ")")) |
403 write.table(genesForPlot, "IGG.txt", sep="\t",quote=F,row.names=F,col.names=T) | 398 write.table(genesForPlot, "IGG.txt", sep="\t",quote=F,row.names=F,col.names=T) |
404 | 399 |
405 png(filename="IGG.png") | 400 png(filename="IGG.png") |
406 print(pc) | 401 print(pc) |
407 dev.off() | 402 dev.off() |
408 } | 403 } |
409 | 404 |
410 | |
411 print("Plotting scatterplot") | 405 print("Plotting scatterplot") |
412 | 406 |
413 dat$percentage_mutations = round(dat$VRegionMutations / dat$VRegionNucleotides * 100, 2) | 407 dat$percentage_mutations = round(dat$VRegionMutations / dat$VRegionNucleotides * 100, 2) |
414 | 408 dat.clss = dat |
415 p = ggplot(dat, aes(best_match, percentage_mutations)) | 409 |
410 dat.clss$best_match = substr(dat.clss$best_match, 0, 3) | |
411 | |
412 dat.clss = rbind(dat, dat.clss) | |
413 | |
414 p = ggplot(dat.clss, aes(best_match, percentage_mutations)) | |
416 p = p + geom_point(aes(colour=best_match), position="jitter") + geom_boxplot(aes(middle=mean(percentage_mutations)), alpha=0.1, outlier.shape = NA) | 415 p = p + geom_point(aes(colour=best_match), position="jitter") + geom_boxplot(aes(middle=mean(percentage_mutations)), alpha=0.1, outlier.shape = NA) |
417 p = p + xlab("Subclass") + ylab("Frequency") + ggtitle("Frequency scatter plot") + theme(panel.background = element_rect(fill = "white", colour="black")) | 416 p = p + xlab("Subclass") + ylab("Frequency") + ggtitle("Frequency scatter plot") + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=13, colour="black")) |
418 p = p + scale_fill_manual(values=c("IGA1" = "lightblue1", "IGA2" = "blue4", "IGG1" = "olivedrab3", "IGG2" = "red", "IGG3" = "gold", "IGG4" = "darkred", "IGM" = "black")) | 417 p = p + scale_fill_manual(values=c("IGA" = "blue4", "IGA1" = "lightblue1", "IGA2" = "blue4", "IGG" = "olivedrab3", "IGG1" = "olivedrab3", "IGG2" = "red", "IGG3" = "gold", "IGG4" = "darkred", "IGM" = "darkviolet")) |
419 p = p + scale_colour_manual(values=c("IGA1" = "lightblue1", "IGA2" = "blue4", "IGG1" = "olivedrab3", "IGG2" = "red", "IGG3" = "gold", "IGG4" = "darkred", "IGM" = "black")) | 418 p = p + scale_colour_manual(values=c("IGA" = "blue4", "IGA1" = "lightblue1", "IGA2" = "blue4", "IGG" = "olivedrab3", "IGG1" = "olivedrab3", "IGG2" = "red", "IGG3" = "gold", "IGG4" = "darkred", "IGM" = "darkviolet")) |
420 | 419 |
421 png(filename="scatter.png") | 420 png(filename="scatter.png") |
422 print(p) | 421 print(p) |
423 dev.off() | 422 dev.off() |
424 | 423 |
425 write.table(dat[,c("Sequence.ID", "best_match", "VRegionMutations", "VRegionNucleotides", "percentage_mutations")], "scatter.txt", sep="\t",quote=F,row.names=F,col.names=T) | 424 write.table(dat[,c("Sequence.ID", "best_match", "VRegionMutations", "VRegionNucleotides", "percentage_mutations")], "scatter.txt", sep="\t",quote=F,row.names=F,col.names=T) |
426 | 425 |
427 write.table(dat, input, sep="\t",quote=F,row.names=F,col.names=T) | 426 write.table(dat, input, sep="\t",quote=F,row.names=F,col.names=T) |
428 | 427 |
429 | |
430 print("Plotting frequency ranges plot") | 428 print("Plotting frequency ranges plot") |
431 | 429 |
432 dat$best_match_class = substr(dat$best_match, 0, 3) | 430 dat$best_match_class = substr(dat$best_match, 0, 3) |
433 freq_labels = c("0", "0-2", "2-5", "5-10", "10-15", "15-20", "20") | 431 freq_labels = c("0", "0-2", "2-5", "5-10", "10-15", "15-20", "20") |
434 dat$frequency_bins = cut(dat$percentage_mutations, breaks=c(-Inf, 0, 2,5,10,15,20, Inf), labels=freq_labels) | 432 dat$frequency_bins = cut(dat$percentage_mutations, breaks=c(-Inf, 0, 2,5,10,15,20, Inf), labels=freq_labels) |
440 frequency_bins_data = merge(frequency_bins_data, frequency_bins_sum, by="best_match_class") | 438 frequency_bins_data = merge(frequency_bins_data, frequency_bins_sum, by="best_match_class") |
441 | 439 |
442 frequency_bins_data$frequency = round(frequency_bins_data$frequency_count / frequency_bins_data$class_sum * 100, 2) | 440 frequency_bins_data$frequency = round(frequency_bins_data$frequency_count / frequency_bins_data$class_sum * 100, 2) |
443 | 441 |
444 p = ggplot(frequency_bins_data, aes(frequency_bins, frequency)) | 442 p = ggplot(frequency_bins_data, aes(frequency_bins, frequency)) |
445 p = p + geom_bar(aes(fill=best_match_class), stat="identity", position="dodge") + theme(panel.background = element_rect(fill = "white", colour="black")) | 443 p = p + geom_bar(aes(fill=best_match_class), stat="identity", position="dodge") + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=13, colour="black")) |
446 p = p + xlab("Frequency ranges") + ylab("Frequency") + ggtitle("Mutation Frequencies by class") + scale_fill_manual(values=c("IGA" = "blue4", "IGG" = "olivedrab3", "IGM" = "black")) | 444 p = p + xlab("Frequency ranges") + ylab("Frequency") + ggtitle("Mutation Frequencies by class") + scale_fill_manual(values=c("IGA" = "blue4", "IGG" = "olivedrab3", "IGM" = "black")) |
447 | 445 |
448 png(filename="frequency_ranges.png") | 446 png(filename="frequency_ranges.png") |
449 print(p) | 447 print(p) |
450 dev.off() | 448 dev.off() |
460 frequency_bins_data$frequency = round(frequency_bins_data$frequency_count / frequency_bins_data$class_sum * 100, 2) | 458 frequency_bins_data$frequency = round(frequency_bins_data$frequency_count / frequency_bins_data$class_sum * 100, 2) |
461 | 459 |
462 write.table(frequency_bins_data, "frequency_ranges_subclasses.txt", sep="\t",quote=F,row.names=F,col.names=T) | 460 write.table(frequency_bins_data, "frequency_ranges_subclasses.txt", sep="\t",quote=F,row.names=F,col.names=T) |
463 | 461 |
464 | 462 |
465 #frequency_bins_data_by_class | 463 |
466 #frequency_ranges_subclasses.txt | 464 |
467 | 465 |
468 | 466 |
469 | 467 |
470 | 468 |
471 | 469 |
472 | 470 |
473 | 471 |
474 | 472 |
475 | 473 |
476 | 474 |
477 | 475 |
478 | 476 |
479 | 477 |
480 | 478 |
481 | 479 |
482 | 480 |
483 | 481 |
484 | 482 |
485 | 483 |
486 | 484 |
487 | 485 |
488 | 486 |
489 | 487 |
490 | 488 |
491 | 489 |
492 | 490 |
493 | 491 |
492 | |
493 | |
494 | |
495 | |
496 | |
497 | |
498 | |
499 | |
500 | |
501 | |
502 | |
503 | |
504 | |
505 | |
506 | |
507 | |
508 | |
509 | |
510 | |
511 | |
512 | |
513 | |
514 | |
515 | |
516 |