Mercurial > repos > davidvanzessen > argalaxy_tools
comparison report_clonality/RScript.r @ 8:8cbc1a8d27ae draft
Uploaded
| author | davidvanzessen |
|---|---|
| date | Mon, 19 Dec 2016 08:50:31 -0500 |
| parents | d001d0c05dbe |
| children | efa1f5a17b6e |
comparison
equal
deleted
inserted
replaced
| 7:54f6756bacb1 | 8:8cbc1a8d27ae |
|---|---|
| 133 if(any(is.na(PRODF$freq))){ #if there was an "_" in the ID, but not the frequency, go back to frequency of 1 for every sequence | 133 if(any(is.na(PRODF$freq))){ #if there was an "_" in the ID, but not the frequency, go back to frequency of 1 for every sequence |
| 134 PRODF$freq = 1 | 134 PRODF$freq = 1 |
| 135 } | 135 } |
| 136 } | 136 } |
| 137 | 137 |
| 138 #make a names list with sample -> color | |
| 139 naive.colors = c('blue4', 'darkred', 'olivedrab3', 'red', 'gray74', 'darkviolet', 'lightblue1', 'gold', 'chartreuse2', 'pink', 'Paleturquoise3', 'Chocolate1', 'Yellow', 'Deeppink3', 'Mediumorchid1', 'Darkgreen', 'Blue', 'Gray36', 'Hotpink', 'Yellow4') | |
| 140 unique.samples = unique(PRODF$Sample) | |
| 141 | |
| 142 if(length(unique.samples) <= length(naive.colors)){ | |
| 143 sample.colors = naive.colors[1:length(unique.samples)] | |
| 144 } else { | |
| 145 sample.colors = rainbow(length(unique.samples)) | |
| 146 } | |
| 147 | |
| 148 names(sample.colors) = unique.samples | |
| 149 | |
| 150 print("Sample.colors") | |
| 151 print(sample.colors) | |
| 138 | 152 |
| 139 | 153 |
| 140 #write the complete dataset that is left over, will be the input if 'none' for clonaltype and 'no' for filterproductive | 154 #write the complete dataset that is left over, will be the input if 'none' for clonaltype and 'no' for filterproductive |
| 141 write.table(PRODF, "allUnique.txt", sep="\t",quote=F,row.names=F,col.names=T) | 155 write.table(PRODF, "allUnique.txt", sep="\t",quote=F,row.names=F,col.names=T) |
| 142 write.table(PRODF, "allUnique.csv", sep=",",quote=F,row.names=F,col.names=T) | 156 write.table(PRODF, "allUnique.csv", sep=",",quote=F,row.names=F,col.names=T) |
| 273 | 287 |
| 274 print("Report Clonality - V, D and J frequency plots") | 288 print("Report Clonality - V, D and J frequency plots") |
| 275 | 289 |
| 276 pV = ggplot(PRODFV) | 290 pV = ggplot(PRODFV) |
| 277 pV = pV + geom_bar( aes( x=factor(reorder(Top.V.Gene, chr.orderV)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) | 291 pV = pV + geom_bar( aes( x=factor(reorder(Top.V.Gene, chr.orderV)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) |
| 278 pV = pV + xlab("Summary of V gene") + ylab("Frequency") + ggtitle("Relative frequency of V gene usage") | 292 pV = pV + xlab("Summary of V gene") + ylab("Frequency") + ggtitle("Relative frequency of V gene usage") + scale_fill_manual(values=sample.colors) |
| 293 pV = pV + theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank()) | |
| 279 write.table(x=PRODFV, file="VFrequency.csv", sep=",",quote=F,row.names=F,col.names=T) | 294 write.table(x=PRODFV, file="VFrequency.csv", sep=",",quote=F,row.names=F,col.names=T) |
| 280 | 295 |
| 281 png("VPlot.png",width = 1280, height = 720) | 296 png("VPlot.png",width = 1280, height = 720) |
| 282 pV | 297 pV |
| 283 dev.off(); | 298 dev.off(); |
| 284 | 299 |
| 285 if(useD){ | 300 if(useD){ |
| 286 pD = ggplot(PRODFD) | 301 pD = ggplot(PRODFD) |
| 287 pD = pD + geom_bar( aes( x=factor(reorder(Top.D.Gene, chr.orderD)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) | 302 pD = pD + geom_bar( aes( x=factor(reorder(Top.D.Gene, chr.orderD)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) |
| 288 pD = pD + xlab("Summary of D gene") + ylab("Frequency") + ggtitle("Relative frequency of D gene usage") | 303 pD = pD + xlab("Summary of D gene") + ylab("Frequency") + ggtitle("Relative frequency of D gene usage") + scale_fill_manual(values=sample.colors) |
| 304 pD = pD + theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank()) | |
| 289 write.table(x=PRODFD, file="DFrequency.csv", sep=",",quote=F,row.names=F,col.names=T) | 305 write.table(x=PRODFD, file="DFrequency.csv", sep=",",quote=F,row.names=F,col.names=T) |
| 290 | 306 |
| 291 png("DPlot.png",width = 800, height = 600) | 307 png("DPlot.png",width = 800, height = 600) |
| 292 print(pD) | 308 print(pD) |
| 293 dev.off(); | 309 dev.off(); |
| 294 } | 310 } |
| 295 | 311 |
| 296 pJ = ggplot(PRODFJ) | 312 pJ = ggplot(PRODFJ) |
| 297 pJ = pJ + geom_bar( aes( x=factor(reorder(Top.J.Gene, chr.orderJ)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) | 313 pJ = pJ + geom_bar( aes( x=factor(reorder(Top.J.Gene, chr.orderJ)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) |
| 298 pJ = pJ + xlab("Summary of J gene") + ylab("Frequency") + ggtitle("Relative frequency of J gene usage") | 314 pJ = pJ + xlab("Summary of J gene") + ylab("Frequency") + ggtitle("Relative frequency of J gene usage") + scale_fill_manual(values=sample.colors) |
| 299 write.table(x=PRODFJ, file="JFrequency.csv", sep=",",quote=F,row.names=F,col.names=T) | 315 pJ = pJ + theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank()) |
| 300 | |
| 301 png("JPlot.png",width = 800, height = 600) | |
| 302 pJ | |
| 303 dev.off(); | |
| 304 | |
| 305 pJ = ggplot(PRODFJ) | |
| 306 pJ = pJ + geom_bar( aes( x=factor(reorder(Top.J.Gene, chr.orderJ)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) | |
| 307 pJ = pJ + xlab("Summary of J gene") + ylab("Frequency") + ggtitle("Relative frequency of J gene usage") | |
| 308 write.table(x=PRODFJ, file="JFrequency.csv", sep=",",quote=F,row.names=F,col.names=T) | 316 write.table(x=PRODFJ, file="JFrequency.csv", sep=",",quote=F,row.names=F,col.names=T) |
| 309 | 317 |
| 310 png("JPlot.png",width = 800, height = 600) | 318 png("JPlot.png",width = 800, height = 600) |
| 311 pJ | 319 pJ |
| 312 dev.off(); | 320 dev.off(); |
| 322 VGenes = merge(VGenes, TotalPerSample, by="Sample") | 330 VGenes = merge(VGenes, TotalPerSample, by="Sample") |
| 323 VGenes$Frequency = VGenes$Count * 100 / VGenes$total | 331 VGenes$Frequency = VGenes$Count * 100 / VGenes$total |
| 324 VPlot = ggplot(VGenes) | 332 VPlot = ggplot(VGenes) |
| 325 VPlot = VPlot + geom_bar(aes( x = Top.V.Gene, y = Frequency, fill = Sample), stat='identity', position='dodge' ) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + | 333 VPlot = VPlot + geom_bar(aes( x = Top.V.Gene, y = Frequency, fill = Sample), stat='identity', position='dodge' ) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + |
| 326 ggtitle("Distribution of V gene families") + | 334 ggtitle("Distribution of V gene families") + |
| 327 ylab("Percentage of sequences") | 335 ylab("Percentage of sequences") + |
| 336 scale_fill_manual(values=sample.colors) + | |
| 337 theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank()) | |
| 328 png("VFPlot.png") | 338 png("VFPlot.png") |
| 329 VPlot | 339 VPlot |
| 330 dev.off(); | 340 dev.off(); |
| 331 write.table(x=VGenes, file="VFFrequency.csv", sep=",",quote=F,row.names=F,col.names=T) | 341 write.table(x=VGenes, file="VFFrequency.csv", sep=",",quote=F,row.names=F,col.names=T) |
| 332 | 342 |
| 338 DGenes = merge(DGenes, TotalPerSample, by="Sample") | 348 DGenes = merge(DGenes, TotalPerSample, by="Sample") |
| 339 DGenes$Frequency = DGenes$Count * 100 / DGenes$total | 349 DGenes$Frequency = DGenes$Count * 100 / DGenes$total |
| 340 DPlot = ggplot(DGenes) | 350 DPlot = ggplot(DGenes) |
| 341 DPlot = DPlot + geom_bar(aes( x = Top.D.Gene, y = Frequency, fill = Sample), stat='identity', position='dodge' ) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + | 351 DPlot = DPlot + geom_bar(aes( x = Top.D.Gene, y = Frequency, fill = Sample), stat='identity', position='dodge' ) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + |
| 342 ggtitle("Distribution of D gene families") + | 352 ggtitle("Distribution of D gene families") + |
| 343 ylab("Percentage of sequences") | 353 ylab("Percentage of sequences") + |
| 354 scale_fill_manual(values=sample.colors) + | |
| 355 theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank()) | |
| 344 png("DFPlot.png") | 356 png("DFPlot.png") |
| 345 print(DPlot) | 357 print(DPlot) |
| 346 dev.off(); | 358 dev.off(); |
| 347 write.table(x=DGenes, file="DFFrequency.csv", sep=",",quote=F,row.names=F,col.names=T) | 359 write.table(x=DGenes, file="DFFrequency.csv", sep=",",quote=F,row.names=F,col.names=T) |
| 348 } | 360 } |
| 357 CDR3Length$Frequency = CDR3Length$Count * 100 / CDR3Length$total | 369 CDR3Length$Frequency = CDR3Length$Count * 100 / CDR3Length$total |
| 358 CDR3LengthPlot = ggplot(CDR3Length) | 370 CDR3LengthPlot = ggplot(CDR3Length) |
| 359 CDR3LengthPlot = CDR3LengthPlot + geom_bar(aes( x = CDR3.Length.DNA, y = Frequency, fill = Sample), stat='identity', position='dodge' ) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + | 371 CDR3LengthPlot = CDR3LengthPlot + geom_bar(aes( x = CDR3.Length.DNA, y = Frequency, fill = Sample), stat='identity', position='dodge' ) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + |
| 360 ggtitle("Length distribution of CDR3") + | 372 ggtitle("Length distribution of CDR3") + |
| 361 xlab("CDR3 Length") + | 373 xlab("CDR3 Length") + |
| 362 ylab("Percentage of sequences") | 374 ylab("Percentage of sequences") + |
| 375 scale_fill_manual(values=sample.colors) + | |
| 376 theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank()) | |
| 363 png("CDR3LengthPlot.png",width = 1280, height = 720) | 377 png("CDR3LengthPlot.png",width = 1280, height = 720) |
| 364 CDR3LengthPlot | 378 CDR3LengthPlot |
| 365 dev.off() | 379 dev.off() |
| 366 write.table(x=CDR3Length, file="CDR3LengthPlot.csv", sep=",",quote=F,row.names=F,col.names=T) | 380 write.table(x=CDR3Length, file="CDR3LengthPlot.csv", sep=",",quote=F,row.names=F,col.names=T) |
| 367 | 381 |
| 784 PRODF = bak | 798 PRODF = bak |
| 785 | 799 |
| 786 | 800 |
| 787 # ---------------------- D reading frame ---------------------- | 801 # ---------------------- D reading frame ---------------------- |
| 788 | 802 |
| 789 D.REGION.reading.frame = PRODF$D.REGION.reading.frame | 803 D.REGION.reading.frame = PRODF[,c("Sample", "D.REGION.reading.frame")] |
| 790 | 804 |
| 791 D.REGION.reading.frame[is.na(D.REGION.reading.frame)] = "No D" | 805 chck = is.na(D.REGION.reading.frame$D.REGION.reading.frame) |
| 792 | 806 if(any(chck)){ |
| 793 D.REGION.reading.frame = data.frame(table(D.REGION.reading.frame)) | 807 D.REGION.reading.frame[chck,"D.REGION.reading.frame"] = "No D" |
| 808 } | |
| 809 | |
| 810 D.REGION.reading.frame = data.frame(data.table(D.REGION.reading.frame)[, list(Freq=.N), by=c("Sample", "D.REGION.reading.frame")]) | |
| 794 | 811 |
| 795 write.table(D.REGION.reading.frame, "DReadingFrame.csv" , sep="\t",quote=F,row.names=F,col.names=T) | 812 write.table(D.REGION.reading.frame, "DReadingFrame.csv" , sep="\t",quote=F,row.names=F,col.names=T) |
| 796 | 813 |
| 797 D.REGION.reading.frame = ggplot(D.REGION.reading.frame) | 814 D.REGION.reading.frame = ggplot(D.REGION.reading.frame) |
| 798 D.REGION.reading.frame = D.REGION.reading.frame + geom_bar(aes( x = D.REGION.reading.frame, y = Freq), stat='identity', position='dodge' ) + ggtitle("D reading frame") + xlab("Frequency") + ylab("Frame") | 815 D.REGION.reading.frame = D.REGION.reading.frame + geom_bar(aes( x = D.REGION.reading.frame, y = Freq, fill=Sample), stat='identity', position='dodge' ) + ggtitle("D reading frame") + xlab("Frequency") + ylab("Frame") |
| 816 D.REGION.reading.frame = D.REGION.reading.frame + scale_fill_manual(values=sample.colors) | |
| 817 D.REGION.reading.frame = D.REGION.reading.frame + theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank()) | |
| 799 | 818 |
| 800 png("DReadingFrame.png") | 819 png("DReadingFrame.png") |
| 801 D.REGION.reading.frame | 820 D.REGION.reading.frame |
| 802 dev.off() | 821 dev.off() |
| 803 | 822 |
| 832 AAfreqplot = AAfreqplot + geom_bar(aes( x=factor(reorder(Var1, order.aa)), y = freq_perc, fill = Sample), stat='identity', position='dodge' ) | 851 AAfreqplot = AAfreqplot + geom_bar(aes( x=factor(reorder(Var1, order.aa)), y = freq_perc, fill = Sample), stat='identity', position='dodge' ) |
| 833 AAfreqplot = AAfreqplot + annotate("rect", xmin = 0.5, xmax = 2.5, ymin = 0, ymax = Inf, fill = "red", alpha = 0.2) | 852 AAfreqplot = AAfreqplot + annotate("rect", xmin = 0.5, xmax = 2.5, ymin = 0, ymax = Inf, fill = "red", alpha = 0.2) |
| 834 AAfreqplot = AAfreqplot + annotate("rect", xmin = 3.5, xmax = 4.5, ymin = 0, ymax = Inf, fill = "blue", alpha = 0.2) | 853 AAfreqplot = AAfreqplot + annotate("rect", xmin = 3.5, xmax = 4.5, ymin = 0, ymax = Inf, fill = "blue", alpha = 0.2) |
| 835 AAfreqplot = AAfreqplot + annotate("rect", xmin = 5.5, xmax = 6.5, ymin = 0, ymax = Inf, fill = "blue", alpha = 0.2) | 854 AAfreqplot = AAfreqplot + annotate("rect", xmin = 5.5, xmax = 6.5, ymin = 0, ymax = Inf, fill = "blue", alpha = 0.2) |
| 836 AAfreqplot = AAfreqplot + annotate("rect", xmin = 6.5, xmax = 7.5, ymin = 0, ymax = Inf, fill = "red", alpha = 0.2) | 855 AAfreqplot = AAfreqplot + annotate("rect", xmin = 6.5, xmax = 7.5, ymin = 0, ymax = Inf, fill = "red", alpha = 0.2) |
| 837 AAfreqplot = AAfreqplot + ggtitle("Amino Acid Composition in the CDR3") + xlab("Amino Acid, from Hydrophilic (left) to Hydrophobic (right)") + ylab("Percentage") | 856 AAfreqplot = AAfreqplot + ggtitle("Amino Acid Composition in the CDR3") + xlab("Amino Acid, from Hydrophilic (left) to Hydrophobic (right)") + ylab("Percentage") + scale_fill_manual(values=sample.colors) |
| 857 AAfreqplot = AAfreqplot + theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank()) | |
| 838 | 858 |
| 839 png("AAComposition.png",width = 1280, height = 720) | 859 png("AAComposition.png",width = 1280, height = 720) |
| 840 AAfreqplot | 860 AAfreqplot |
| 841 dev.off() | 861 dev.off() |
| 842 write.table(AAfreq, "AAComposition.csv" , sep=",",quote=F,na="-",row.names=F,col.names=T) | 862 write.table(AAfreq, "AAComposition.csv" , sep=",",quote=F,na="-",row.names=F,col.names=T) |
| 843 | 863 |
| 844 | 864 # ---------------------- AA median CDR3 length ---------------------- |
| 865 | |
| 866 median.aa.l = data.frame(data.table(PRODF)[, list(median=as.double(median(.SD$CDR3.Length.DNA))), by=c("Sample")]) | |
| 867 write.table(median.aa.l, "AAMedianBySample.csv" , sep=",",quote=F,na="-",row.names=F,col.names=F) | |
| 868 |
