Mercurial > repos > petr-novak > re_utils
comparison plot_comparative_clustering_summary.R @ 17:d14b68e9fd1d draft
Uploaded - new tools added
| author | petr-novak |
|---|---|
| date | Wed, 28 Apr 2021 08:37:20 +0000 |
| parents | |
| children | 5a05925340b0 |
comparison
equal
deleted
inserted
replaced
| 16:5376e1c9adec | 17:d14b68e9fd1d |
|---|---|
| 1 #!/usr/bin/env Rscript | |
| 2 library(optparse) | |
| 3 ## TODO - add scale to legend! | |
| 4 twenty_colors = c( | |
| 5 '#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', | |
| 6 '#911eb4', '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', | |
| 7 '#008080', '#e6beff', '#9a6324', '#fffac8', '#800000', | |
| 8 '#aaffc3', '#808000', '#ffd8b1', '#000075', "#000000" | |
| 9 ) | |
| 10 | |
| 11 get_color = function(classification, size){ | |
| 12 ## 20 of unique colors, first is black | |
| 13 unique_colors = twenty_colors[1:opt$number_of_colors] | |
| 14 Ncol = length(unique_colors) | |
| 15 ## rest wil be grey: | |
| 16 grey_color = "#a9a9a9" | |
| 17 ## unique repeats without All | |
| 18 include = !classification %in% "All" | |
| 19 unique_repeats = names(c(sort(by(size[include], INDICES = classification[include], FUN = sum), decreasing = TRUE))) | |
| 20 color_table = unique_colors[1:min(Ncol,length(unique_repeats))] | |
| 21 names(color_table) = unique_repeats[1:min(Ncol,length(unique_repeats))] | |
| 22 color = rep(grey_color, length(classification)) | |
| 23 names(color) = classification | |
| 24 for (ac in names(color_table)){ | |
| 25 color[names(color) %in% ac] = color_table[ac] | |
| 26 } | |
| 27 return(color) | |
| 28 } | |
| 29 | |
| 30 | |
| 31 make_legend = function(color){ | |
| 32 ## simplify description: | |
| 33 names(color) = gsub(".+/","",names(color)) | |
| 34 description = sapply(split(names(color), color), function(x) paste(unique(x), collapse=";")) | |
| 35 description = gsub(".+;.+", "Other", description) | |
| 36 description = gsub("All", "Other", description) | |
| 37 if ("Other" %in% description & length(description) > 1){ | |
| 38 description = c(description[! description %in% "Other"], description[description %in% "Other"]) | |
| 39 } | |
| 40 ord = order(factor(names(description), levels = twenty_colors)) | |
| 41 legend_info = list(name = gsub("All", "NA", description)[ord], color = names(description)[ord]) | |
| 42 } | |
| 43 | |
| 44 plot_rect_map = function(read_counts,cluster_annotation, output_file,GS, RL, Xcoef=1,Ycoef=1){ | |
| 45 ## read_counts : correspond to COMPARATIVE_ANALYSIS_COUNTS.csv | |
| 46 ## cluster annotation : CLUSTER_TABLE.csv | |
| 47 counts = read.table(read_counts,header=TRUE,as.is=TRUE) | |
| 48 input_read_counts = unlist(read.table(read_counts, nrows = 1, comment.char = "",sep="\t")[-(1:2)]) | |
| 49 | |
| 50 counts_file_valid = ncol(counts) == (length(input_read_counts) + 2) & all(colnames(input_read_counts)[1:2]==c("cluster", "supercluster")) | |
| 51 ## find which line is header | |
| 52 header_line = grep(".*Cluster.*Supercluster.*Size", readLines(cluster_annotation)) | |
| 53 annot = read.table(cluster_annotation, sep="\t",header=TRUE,as.is=TRUE, skip = header_line - 1) | |
| 54 ## validate | |
| 55 annot_file_valid = all(colnames(annot)==c("Cluster","Supercluster","Size","Size_adjusted","Automatic_annotation","TAREAN_annotation","Final_annotation")) | |
| 56 | |
| 57 | |
| 58 if (!annot_file_valid | !counts_file_valid){ | |
| 59 pdf(output_file) | |
| 60 plot.new() | |
| 61 text(0.5,0.5,"Input is not valid, check input files!") | |
| 62 dev.off() | |
| 63 stop("Input files are not valid!") | |
| 64 } | |
| 65 print(annot_file_valid) | |
| 66 print(counts_file_valid) | |
| 67 ## remove counts which are not in annotation - only clusters in annot will be plotted! | |
| 68 counts = counts[annot$Cluster,] | |
| 69 N = nrow(annot) | |
| 70 | |
| 71 counts_automatic = counts | |
| 72 annot_automatic = annot | |
| 73 input_read_counts_automatic = input_read_counts | |
| 74 # remove organelar and contamination if required make count correction | |
| 75 if (opt$nuclear_only){ | |
| 76 exclude=grep("contamination|organelle",annot$Automatic_annotation) | |
| 77 if (length(exclude)>0){ | |
| 78 counts_automatic = counts[-exclude, , drop=FALSE] | |
| 79 annot_automatic = annot[-exclude, ,drop=FALSE] | |
| 80 input_read_counts_automatic = input_read_counts - colSums(counts[exclude,-c(1:2) , drop=FALSE]) | |
| 81 } | |
| 82 } | |
| 83 color_auto = get_color(annot_automatic$Automatic_annotation, annot_automatic$Size) | |
| 84 | |
| 85 legend_info = make_legend(color_auto) | |
| 86 params = list(Automatic_annotation = list( | |
| 87 color = color_auto, | |
| 88 legend = legend_info, | |
| 89 counts = counts_automatic, | |
| 90 annot = annot_automatic, | |
| 91 input_read_counts = input_read_counts_automatic | |
| 92 ) | |
| 93 ) | |
| 94 | |
| 95 | |
| 96 if (!is.null(annot$Final_annotation)){ | |
| 97 | |
| 98 ## column with manual annotation exist - check if correct | |
| 99 if (any(annot$Final_annotation %in% "" | any(is.na(annot$Final_annotation)))){ | |
| 100 message("Final annotation is not complete, skipping") | |
| 101 }else{ | |
| 102 | |
| 103 counts_manual = counts | |
| 104 annot_manual = annot | |
| 105 input_read_counts_manual = input_read_counts | |
| 106 ## correction must be done idependetly in case manual and automatic classification differ in annotation | |
| 107 if (opt$nuclear_only){ | |
| 108 exclude=grep("contamination|organelle",annot$Final_annotation) | |
| 109 if (length(exclude)>0){ | |
| 110 counts_manual = counts[-exclude, , drop=FALSE] | |
| 111 annot_manual = annot[-exclude, ,drop=FALSE] | |
| 112 input_read_counts_manual = input_read_counts - colSums(counts[exclude,-c(1:2) , drop=FALSE]) | |
| 113 } | |
| 114 } | |
| 115 ## append | |
| 116 color_manual = get_color(annot_manual$Final_annotation, annot_manual$Size) | |
| 117 legend_info_manual = make_legend(color_manual) | |
| 118 | |
| 119 params$Final_annotation = list( | |
| 120 color = color_manual, | |
| 121 legend = legend_info_manual, | |
| 122 counts = counts_manual, | |
| 123 annot = annot_manual, | |
| 124 input_read_counts = input_read_counts_manual | |
| 125 | |
| 126 ) | |
| 127 } | |
| 128 } | |
| 129 | |
| 130 ## set size of pdf output | |
| 131 wdth = (3 + N*0.03 ) * Xcoef | |
| 132 hgt = (2.2 + ncol(counts)*0.20) * Ycoef | |
| 133 if (!any(is.na(GS))){ | |
| 134 hgt = hgt + ncol(counts)*0.20 * Ycoef | |
| 135 } | |
| 136 ptsize = round((wdth*hgt)^(1/4))*5 | |
| 137 | |
| 138 | |
| 139 pdf(output_file, width=wdth,height=hgt, pointsize = ptsize) # was 50 | |
| 140 ## originaly - printing of both automatic and final annotation | |
| 141 ## now - print only final_annotation if available | |
| 142 if (length(params) == 2){ | |
| 143 ## remove automatic | |
| 144 params$Automatic_annotation = NULL | |
| 145 } | |
| 146 ## | |
| 147 | |
| 148 for (j in seq_along(params)){ | |
| 149 Nclust = nrow(params[[j]]$annot) | |
| 150 ##prepare matrix for plotting | |
| 151 M = as.matrix(params[[j]]$counts[1:Nclust,-(1:2)]) | |
| 152 rownames(M) = paste0("CL",rownames(M)) | |
| 153 Mn1=(M)/apply(M,1,max) | |
| 154 Mn2=M/max(M) | |
| 155 ord1 = hclust(dist(Mn1),method = "ward.D")$order | |
| 156 ord2 = hclust(dist(t(Mn2)))$order | |
| 157 | |
| 158 ploting_area_width = 3 + log10(Nclust)*3 | |
| 159 ploting_area_sides = 1.5 | |
| 160 legend_width = 3 | |
| 161 title_height = 0.5 | |
| 162 if (any(is.na(GS))){ | |
| 163 layout(matrix(c(0,0,0,3,0,2,0,3,0,1,0,3),ncol=4,byrow = TRUE), | |
| 164 width=c(ploting_area_sides,ploting_area_width,ploting_area_sides, legend_width), | |
| 165 height=c(title_height, 3,ncol(M)*0.8)) | |
| 166 }else{ | |
| 167 ## extra row for legends | |
| 168 | |
| 169 | |
| 170 layout(matrix(c(0,0,0,3,0,2,0,3,0,1,0,3,0,0,0,4),ncol=4,byrow = TRUE), | |
| 171 width=c(ploting_area_sides,ploting_area_width,ploting_area_sides, legend_width), | |
| 172 height=c(title_height, 3,ncol(M)*0.8,ncol(M)*0.8 )) | |
| 173 } | |
| 174 | |
| 175 | |
| 176 par(xaxs='i', yaxs = 'i') | |
| 177 par(las=2,mar=c(4,0,0,0),cex.axis=0.5) | |
| 178 | |
| 179 if (any(is.na(GS))){ | |
| 180 rectMap(Mn2[ord1,ord2],scale.by='row',col=params[[j]]$color[ord1], grid=TRUE) | |
| 181 }else{ | |
| 182 # use genomic sizes | |
| 183 Mn3 = t(t(M) * (GS[colnames(M),] / params[[j]]$input_read_counts))[ord1,ord2] | |
| 184 ## rescale | |
| 185 MaxGS = max(Mn3) | |
| 186 Mn3 = Mn3/max(Mn3) | |
| 187 rectMap(Mn3,scale.by='none',col=params[[j]]$color[ord1], grid=TRUE) | |
| 188 } | |
| 189 par(las=2,mar=c(1,0,1,0), mgp = c(2,1,0)) | |
| 190 barplot(params[[j]]$annot$Size[ord1], col = 1) | |
| 191 mtext(side = 2, "Cluster size", las = 3, line = 2.5, cex = 0.5) | |
| 192 mtext(side=3, names(params)[j], las=0, line=1) | |
| 193 plot.new() | |
| 194 legend("topleft", col= params[[j]]$legend$color, legend=params[[j]]$legend$name, pch=15, cex=0.7, bty="n", pt.cex=1) | |
| 195 } | |
| 196 | |
| 197 if (!any(is.na(GS))){ | |
| 198 ## plot GS scale | |
| 199 par(xaxs='i', yaxs = 'i') | |
| 200 print(log(nrow(Mn3))) | |
| 201 par(las=2,mar=c(4,0,0,log(nrow(Mn3))),cex.axis=0.5) # same par as recplot above to keep the scale | |
| 202 Mn3scale = Mn3 | |
| 203 Mn3scale[,] = 0 | |
| 204 colnames(Mn3scale)=rep("", ncol(Mn3scale)) | |
| 205 rownames(Mn3scale)=rep("", nrow(Mn3scale)) | |
| 206 Mn3scale[,1] = seq(0,1, length.out = nrow(Mn3)) | |
| 207 rectMap(Mn3scale,scale.by='none',col="grey", grid=FALSE, boxlab="", draw_box=FALSE, center=FALSE) | |
| 208 slabels = pretty(c(0,MaxGS), n = 10) | |
| 209 sat = slabels/MaxGS * nrow(Mn3scale) | |
| 210 axis(side=1, at= sat, labels = slabels, line = 0) | |
| 211 mtext(side = 1, text = "Repeat abundance", las=1, line=2.5,cex=0.4) | |
| 212 mtext(side = 2, text = "Rectangle\n height", las=1, line=2,cex=0.4, at=1) | |
| 213 | |
| 214 axis(2, at=c(0.5, 1, 1.5), labels=c(0,0.5,1),line=0) | |
| 215 } | |
| 216 st = dev.off() | |
| 217 } | |
| 218 | |
| 219 rectMap=function(x,scale.by='row',col=1,xlab="",ylab="",grid=TRUE,axis_pos=c(1,4),boxlab = "Cluster Id", cexx=NULL,cexy=NULL, draw_box=TRUE, center=TRUE){ | |
| 220 if (scale.by=='row'){ | |
| 221 #x=(x)/rowSums(x) | |
| 222 x=(x)/apply(x,1,sum) | |
| 223 } | |
| 224 if (scale.by=='column'){ | |
| 225 x=t(t(x)/apply(x,2,max)) | |
| 226 } | |
| 227 nc=ncol(x) | |
| 228 nr=nrow(x) | |
| 229 coords=expand.grid(1:nr,1:nc) | |
| 230 plot(coords[,1],coords[,2],type='n',axes=F,xlim=range(coords[,1])+c(-.5,.5),ylim=range(coords[,2])+c(-.5,.5),xlab=xlab,ylab=ylab) | |
| 231 axis(axis_pos[1],at=1:nr,labels=rownames(x),lty=0,tick=FALSE,line=0,cex.axis=0.5/log10(nr)) | |
| 232 axis(axis_pos[2],at=1:nc,labels=colnames(x),lty=0,tick=FALSE,las=2,line=0 ,hadj=0, cex.axis=0.7) | |
| 233 axis(2,at=1:nc,labels=colnames(x),lty=0,tick=FALSE,las=2,line=0 ,hadj=1, cex.axis=0.7) | |
| 234 | |
| 235 mtext(side = 1, boxlab, las=1, line = 3, cex = 0.5) | |
| 236 line = 1.5 + log10(nr) | |
| 237 #mtext(side = 2, "Proportions of individual samples", las =0, line = line, cex = 0.5) | |
| 238 s=x/2 | |
| 239 w = c(x)/2 | |
| 240 if(center){ | |
| 241 rect(coords[,1]-0.5,coords[,2]-s,coords[,1]+0.5,coords[,2]+s,col=col,border=NA) | |
| 242 }else{ | |
| 243 rect(coords[,1]-0.5,coords[,2]-0.5,coords[,1]+0.5,coords[,2]+x-0.5,col=col,border=NA) | |
| 244 } | |
| 245 if (grid){ | |
| 246 abline(v=0:(nr)+.5,h=0:(nc)+.5,lty=2,col="#60606030",lwd=0.2) | |
| 247 } | |
| 248 if(draw_box){ | |
| 249 box(col="#60606030",lty=2, lwd=0.2) | |
| 250 } | |
| 251 } | |
| 252 | |
| 253 option_list <- list( | |
| 254 make_option(c("-c", "--cluster_table"), default=NA, type = "character", | |
| 255 help="file from RepeatExplorer2 clustering - CLUSTER_TABLE.csv"), | |
| 256 | |
| 257 make_option(c("-m", "--comparative_counts"),default = NA,type = "character", | |
| 258 help="file from RepeatExplorer2 output - COMPARATIVE_ANALYSIS_COUNTS.csv"), | |
| 259 | |
| 260 make_option(c("-o", "--output"), type="character", | |
| 261 default="comparative_analysis_summary.pdf", | |
| 262 help="File name for output figures (pdf document)"), | |
| 263 make_option(c("-N", "--number_of_colors"), type="integer", default=10, | |
| 264 help="Number of unique colors used from plotting (2-20, default is 10)"), | |
| 265 | |
| 266 make_option(c("-g", "--genome_size"),default = NA,type = "character", | |
| 267 help="file from genome sizes of species provided in tab delimited file in the format: | |
| 268 | |
| 269 species_code1 GenomeSize1 | |
| 270 species_code2 GenomeSize2 | |
| 271 species_code3 GenomeSize3 | |
| 272 species_code4 GenomeSize4 | |
| 273 | |
| 274 provide the same codes for species as in file COMPARATIVE_ANALYSIS_COUNTS.csv. The use of genome | |
| 275 sizes file imply the --nuclear_only option. If genome sizes are used, genomic abundance scale is added. | |
| 276 "), | |
| 277 make_option(c("-n", "--nuclear_only"), default = FALSE, type="logical", | |
| 278 action = "store_true", | |
| 279 help="remove all non-nuclear sequences (organelle and contamination). ") | |
| 280 ) | |
| 281 | |
| 282 | |
| 283 opt = parse_args(OptionParser(option_list = option_list)) | |
| 284 | |
| 285 if (any(is.na(c(opt$cluster_table, opt$comparative_counts)))){ | |
| 286 message("\nBoth files: CLUSTER_TABLE.csv and COMPARATIVE_ANALYSIS_COUNTS.csv must be provided\n") | |
| 287 q() | |
| 288 } | |
| 289 | |
| 290 if (!opt$number_of_colors %in% 1:20){ | |
| 291 message("number of color must be in range 1..20") | |
| 292 stop() | |
| 293 } | |
| 294 | |
| 295 if (!is.na(opt$genome_size)){ | |
| 296 GS = read.table(opt$genome_size, header=FALSE, as.is=TRUE, row.names = 1) | |
| 297 opt$nuclear_only=TRUE | |
| 298 }else{ | |
| 299 GS = NA | |
| 300 RL = NA | |
| 301 } | |
| 302 | |
| 303 plot_rect_map(opt$comparative_counts, opt$cluster_table, opt$output, GS, RL) | |
| 304 |
