Mercurial > repos > petr-novak > repeatrxplorer
comparison lib/utils.R @ 0:1d1b9e1b2e2f draft
Uploaded
| author | petr-novak | 
|---|---|
| date | Thu, 19 Dec 2019 10:24:45 -0500 | 
| parents | |
| children | 
   comparison
  equal
  deleted
  inserted
  replaced
| -1:000000000000 | 0:1d1b9e1b2e2f | 
|---|---|
| 1 #!/usr/bin/env Rscript | |
| 2 suppressPackageStartupMessages(library(DBI)) | |
| 3 suppressPackageStartupMessages(library(RSQLite)) | |
| 4 | |
| 5 CONNECTED = FALSE | |
| 6 if (FALSE) { | |
| 7 ## for testing | |
| 8 seqdb = "/mnt/raid/spolecny/petr/RE2/comparative_test/sequences.db" | |
| 9 hitsortdb = "/mnt/raid/spolecny/petr/RE2/comparative_test/hitsort.db" | |
| 10 class_file = "/mnt/raid/users/petr/workspace/repex_tarean/databases/classification_tree.rds" | |
| 11 ## connect to sqlite databases | |
| 12 SEQDB = dbConnect(RSQLite::SQLite(), seqdb) | |
| 13 HITSORTDB = dbConnect(RSQLite::SQLite(), hitsortdb) | |
| 14 CLS_TREE = readRDS(class_file) | |
| 15 } | |
| 16 | |
| 17 connect_to_databases = function(seqdb, hitsortdb,classification_hierarchy_file = NULL){ | |
| 18 if (!CONNECTED){ | |
| 19 SEQDB <<- dbConnect(RSQLite::SQLite(), seqdb) | |
| 20 HITSORTDB <<- dbConnect(RSQLite::SQLite(), hitsortdb) | |
| 21 if (!is.null(classification_hierarchy_file)){ | |
| 22 CLS_TREE <<- readRDS(classification_hierarchy_file) | |
| 23 } | |
| 24 CONNECTED <<- TRUE | |
| 25 } | |
| 26 } | |
| 27 | |
| 28 disconnect_database = function(){ | |
| 29 if (CONNECTED){ | |
| 30 dbDisconnect(SEQDB) | |
| 31 dbDisconnect(HITSORTDB) | |
| 32 CONNECTED <<- FALSE | |
| 33 } | |
| 34 } | |
| 35 | |
| 36 nested2named_list = function(x){ | |
| 37 y = as.list(unlist(x[[1]])) | |
| 38 names(y) = unlist(x[[2]]) | |
| 39 return(y) | |
| 40 } | |
| 41 | |
| 42 is_comparative = function(){ | |
| 43 prefix_codes = dbGetQuery(SEQDB,"SELECT * FROM prefix_codes") | |
| 44 if (nrow(prefix_codes) == 0){ | |
| 45 return(FALSE) | |
| 46 }else{ | |
| 47 return(TRUE) | |
| 48 } | |
| 49 } | |
| 50 | |
| 51 get_comparative_codes = function(){ | |
| 52 prefix_codes = dbGetQuery(SEQDB,"SELECT * FROM prefix_codes") | |
| 53 return(prefix_codes) | |
| 54 } | |
| 55 | |
| 56 add_preamble = function(html_file, preamble){ | |
| 57 html_content=readLines(html_file) | |
| 58 modified_html_content = gsub("<body>", | |
| 59 paste("<body>\n", preamble,"\n"), | |
| 60 html_content) | |
| 61 cat(modified_html_content, file = html_file, sep="\n") | |
| 62 } | |
| 63 | |
| 64 | |
| 65 df2html = function(df, header = NULL, sort_col = NULL, digits = 3, rounding_function=signif, decreasing = TRUE, scroling = FALSE, width = 300){ | |
| 66 if (!is.null(sort_col)){ | |
| 67 df = df[order(df[,sort_col], decreasing = decreasing),] | |
| 68 } | |
| 69 if (!is.null(digits)){ | |
| 70 for (i in seq_along(df)){ | |
| 71 if(is.numeric(df[,i])){ | |
| 72 df[,i] = rounding_function(df[,i], digits) | |
| 73 } | |
| 74 } | |
| 75 } | |
| 76 if (is.null(header)){ | |
| 77 h = "" | |
| 78 }else{ | |
| 79 h = paste0(" <th>",header,"</th>\n", collapse="") %>% | |
| 80 paste0(" <tr>\n", .," </tr>\n") | |
| 81 } | |
| 82 x = apply(df,1,function(x)paste0(" <td>",x,"</td>\n", collapse="")) %>% | |
| 83 paste0(" <tr>\n", .," </tr>\n", collapse = "") | |
| 84 if (scroling){ | |
| 85 cols = paste0('<col width="',rep(round(100/ncol(df)),ncol(df)),'%">\n',collapse ="") | |
| 86 height = min(200, 22 * nrow(df)) | |
| 87 out = paste0( | |
| 88 '<table cellspacing="0" cellpadding="0" border="0" width="',width,'">\n', | |
| 89 ' <tr>\n', | |
| 90 ' <td>\n', | |
| 91 ' <table cellspacing="0" cellpadding="1" border="1" width="', width,'" >\n', | |
| 92 cols, | |
| 93 h, | |
| 94 ' </table>\n', | |
| 95 ' </td>\n', | |
| 96 ' </tr>\n', | |
| 97 ' <tr>\n', | |
| 98 ' <td>\n', | |
| 99 ' <div style="width:',width,'px; height:',height,'px; overflow:auto;">\n', | |
| 100 ' <table cellspacing="0" cellpadding="1" border="1" width="',width,'" >\n', | |
| 101 cols, | |
| 102 x, | |
| 103 ' </table>\n', | |
| 104 ' </div>\n', | |
| 105 ' </td>\n', | |
| 106 ' </tr>\n', | |
| 107 '</table>\n' | |
| 108 ) | |
| 109 | |
| 110 }else{ | |
| 111 out = paste ("<table>\n", h,x, "</table>\n") | |
| 112 } | |
| 113 return(out) | |
| 114 } | |
| 115 | |
| 116 start_html = function(filename, header){ | |
| 117 cat(header, file = filename) | |
| 118 html_writer = function(content, fn=HTML, ...){ | |
| 119 fn(content, append = TRUE, file = filename, ...) | |
| 120 } | |
| 121 } | |
| 122 | |
| 123 preformatted = function(x){ | |
| 124 ## make preformatted html text | |
| 125 return( | |
| 126 paste( | |
| 127 "<pre>\n", | |
| 128 x, | |
| 129 "</pre>" | |
| 130 ,sep="") | |
| 131 ) | |
| 132 } | |
| 133 | |
| 134 | |
| 135 summary_histogram = function(fn, N_clustering, N_omit=0, size_adjusted=NULL, top_clusters_prop){ | |
| 136 ## assume connection do databases | |
| 137 communities = dbGetQuery( | |
| 138 HITSORTDB, | |
| 139 "SELECT DISTINCT cluster, size, supercluster, supercluster_size FROM communities ORDER BY supercluster" | |
| 140 ) | |
| 141 if (N_omit != 0){ | |
| 142 ## adjust communities and cluster sizes: | |
| 143 cluster_to_adjust = which( | |
| 144 communities$size[order(communities$cluster)][1:length(size_adjusted)] != size_adjusted | |
| 145 ) | |
| 146 ## keep original value: | |
| 147 communities$size_original = communities$size | |
| 148 superclusters_to_adjust = unique(communities$supercluster[communities$cluster %in% cluster_to_adjust]) | |
| 149 for (cl in cluster_to_adjust){ | |
| 150 communities[communities$cluster == cl,'size'] = size_adjusted[cl] | |
| 151 } | |
| 152 for (cl in superclusters_to_adjust){ | |
| 153 communities[communities$supercluster == cl,'supercluster_size'] = | |
| 154 sum(communities[communities$supercluster == cl,'size']) | |
| 155 } | |
| 156 }else{ | |
| 157 cluster_to_adjust=NULL | |
| 158 } | |
| 159 singlets = N_clustering - sum(communities$size) | |
| 160 | |
| 161 supercluster_size = sort(unique(communities[, c('supercluster', 'supercluster_size')])$supercluster_size, decreasing = TRUE) | |
| 162 | |
| 163 | |
| 164 clid2size = sort(communities$size, decreasing = TRUE) | |
| 165 | |
| 166 cluster_id = split(communities$cluster, communities$supercluster) | |
| 167 cluster_id_sort = lapply(cluster_id, function(x)x[order(clid2size[x], decreasing = FALSE)]) | |
| 168 | |
| 169 cluster_size_unsorted = split(communities$size, communities$supercluster) | |
| 170 cluster_size_sort = lapply(cluster_size_unsorted, function(x) (sort(x))) | |
| 171 ## reorder by size of superclusters | |
| 172 cluster_size_sort_sort = cluster_size_sort[order(sapply(cluster_size_sort, sum), decreasing = TRUE)] | |
| 173 cluster_id_sort_sort = cluster_id_sort[order(sapply(cluster_size_sort, sum), decreasing = TRUE)] | |
| 174 | |
| 175 | |
| 176 Nmax = max(sapply(cluster_size_sort_sort, length)) | |
| 177 M = cbind( | |
| 178 sapply(cluster_size_sort_sort, function(x)y = c(x, rep(0, 1 + Nmax - length(x)))), | |
| 179 c(1, rep(0, Nmax)) | |
| 180 ) | |
| 181 | |
| 182 Mid = cbind( | |
| 183 sapply(cluster_id_sort_sort, function(x)y = c(x, rep(0, 1 + Nmax - length(x)))), | |
| 184 c(1, rep(0, Nmax)) | |
| 185 ) | |
| 186 | |
| 187 recolor = matrix(ifelse(Mid %in% cluster_to_adjust,TRUE,FALSE), ncol=ncol(Mid)) | |
| 188 indices = which(recolor, arr.ind = TRUE) | |
| 189 | |
| 190 | |
| 191 png(fn, width = 1200, height = 700, pointsize = 20) | |
| 192 barplot(M, | |
| 193 width = c(supercluster_size, singlets), | |
| 194 space = 0, ylim = c(0, max(supercluster_size) * 1.2), | |
| 195 col = c("#CCCCCC"), names.arg = rep("", ncol(M))) | |
| 196 plot(0, | |
| 197 xlim = c(0, sum(c(supercluster_size, singlets))), | |
| 198 ylim = c(0, max(supercluster_size) * 1.2), | |
| 199 type = "n", yaxs = 'i', axes = FALSE, xlab = "", ylab = "") | |
| 200 | |
| 201 rect(0, 0, | |
| 202 sum(supercluster_size), | |
| 203 max(supercluster_size) * 1.2, | |
| 204 col = "#0000FF10") | |
| 205 | |
| 206 rect(sum(supercluster_size), 0, | |
| 207 sum(supercluster_size) + singlets, | |
| 208 max(supercluster_size) * 1.2, | |
| 209 col = "#FFAAFF10") | |
| 210 | |
| 211 barplot(M, | |
| 212 width = c(supercluster_size, singlets), | |
| 213 space = 0, ylim = c(0, max(supercluster_size) * 1.2), | |
| 214 col = "#AAAAAA", names.arg = rep("", ncol(M)), add = TRUE, | |
| 215 xlab = "Proportion of reads [%]", ylab = "Number of reads", | |
| 216 main = paste(N_clustering, "reads total")) | |
| 217 | |
| 218 | |
| 219 for (i in seq_along(indices[,1])){ | |
| 220 y1 = sum(M[1:indices[i,'row'],indices[i,'col']]) | |
| 221 x1 = sum(M[,1:indices[i,'col']]) | |
| 222 if(indices[i,'row'] == 1){ | |
| 223 y0=0 | |
| 224 }else{ | |
| 225 y0 = sum(M[1:(indices[i,'row']-1),indices[i,'col']]) | |
| 226 } | |
| 227 if (indices[i,'col']==1){ | |
| 228 x0=0 | |
| 229 }else{ | |
| 230 x0 = sum(M[,1:(indices[i,'col']-1)]) | |
| 231 } | |
| 232 rect(x0,y0,x1,y1, col="#88FF88") | |
| 233 } | |
| 234 abline(v=top_clusters_prop * sum(c(supercluster_size, singlets)), col="#00000088", lwd=3, lty=3) | |
| 235 | |
| 236 text(sum(supercluster_size) / 2, | |
| 237 max(supercluster_size) * 1.05, | |
| 238 labels = paste0(sum(supercluster_size), " reads in\n", | |
| 239 length(supercluster_size), " supeclusters (", nrow(communities), " clusters)") | |
| 240 ) | |
| 241 | |
| 242 | |
| 243 text(sum(supercluster_size) + singlets / 2, | |
| 244 max(supercluster_size) * 1.05, | |
| 245 labels = paste(singlets, "singlets")) | |
| 246 | |
| 247 axis(1,at=seq(0,N_clustering,length.out=11),label=seq(0,100,by=10)) | |
| 248 dev.off() | |
| 249 clustering_info = list( | |
| 250 Number_of_reads_in_clusters = sum(supercluster_size), | |
| 251 Number_of_clusters = nrow(communities), | |
| 252 Number_of_superclusters = length(supercluster_size), | |
| 253 Number_of_singlets = singlets | |
| 254 ) | |
| 255 return(clustering_info) | |
| 256 } | |
| 257 | |
| 258 | |
| 259 rectMap=function(x,scale.by='row',col=1,xlab="",ylab="",grid=TRUE,axis_pos=c(1,4),cexx=NULL,cexy=NULL){ | |
| 260 if (scale.by=='row'){ | |
| 261 #x=(x)/rowSums(x) | |
| 262 x=(x)/apply(x,1,max) | |
| 263 } | |
| 264 if (scale.by=='column'){ | |
| 265 x=t(t(x)/apply(x,2,max)) | |
| 266 } | |
| 267 nc=ncol(x) | |
| 268 nr=nrow(x) | |
| 269 coords=expand.grid(1:nr,1:nc) | |
| 270 plot(coords[,1],coords[,2],type='n',axes=F,xlim=range(coords[,1])+c(-.5,.5),ylim=range(coords[,2])+c(-.5,.5),xlab=xlab,ylab=ylab) | |
| 271 axis(axis_pos[1],at=1:nr,labels=rownames(x),lty=0,tick=FALSE,line=0,cex.axis=0.5/log10(nr)) | |
| 272 axis(axis_pos[2],at=1:nc,labels=colnames(x),lty=0,tick=FALSE,las=2,line=0 ,hadj=0, cex.axis=0.7) | |
| 273 axis(2,at=1:nc,labels=colnames(x),lty=0,tick=FALSE,las=2,line=0 ,hadj=1, cex.axis=0.7) | |
| 274 | |
| 275 mtext(side = 1, "Cluster id", las=1, line = 3, cex = 0.5) | |
| 276 line = 1.5 + log10(nr) | |
| 277 mtext(side = 2, "Proportions of individual samples", las =0, line = line, cex = 0.5) | |
| 278 s=c(x)/2 # to get it proportional | |
| 279 w = c(x)/2 | |
| 280 rect(coords[,1]-0.5,coords[,2]-s,coords[,1]+0.5,coords[,2]+s,col=col,border=NA) | |
| 281 if (grid){ | |
| 282 abline(v=0:(nr)+.5,h=0:(nc)+.5,lty=2,col="#60606030") | |
| 283 } | |
| 284 box(col="#60606030",lty=2) | |
| 285 } | |
| 286 | |
| 287 plot_rect_map = function(read_counts,cluster_annotation, output_file,Xcoef=1,Ycoef=1){ | |
| 288 counts = read.table(read_counts,header=TRUE,as.is=TRUE) | |
| 289 annot = read.table(cluster_annotation, sep="\t",header=FALSE,as.is=TRUE) | |
| 290 N = nrow(annot) | |
| 291 colnames(annot) = c("cluster", "Automatic.classification") | |
| 292 annot$number.of.reads = rowSums(counts[1 : nrow(annot) ,-1]) | |
| 293 unique_repeats = names(sort(table(c(annot$Automatic.classification,rep('nd',N))),decreasing = TRUE)) | |
| 294 | |
| 295 M = as.matrix(counts[1:N,-(1:2)]) | |
| 296 rownames(M) = paste0("CL",rownames(M)) | |
| 297 Mn1=(M)/apply(M,1,max) | |
| 298 Mn2=M/max(M) | |
| 299 Mn2=M/apply(M,1,sum) | |
| 300 | |
| 301 ord1 = hclust(dist(Mn1),method = "ward.D")$order | |
| 302 ord2 = hclust(dist(t(Mn2)))$order | |
| 303 wdth = (400 + N*10 ) * Xcoef | |
| 304 hgt = (600 + ncol(M)*50) * Ycoef | |
| 305 ptsize = round((wdth*hgt)^(1/4)) | |
| 306 png(output_file, width=wdth,height=hgt, pointsize = ptsize) # was 50 | |
| 307 ploting_area_width = 3 + log10(N)*3 | |
| 308 ploting_area_sides = 1 | |
| 309 layout(matrix(c(4,2,3,4,1,3),ncol=3,byrow = TRUE), | |
| 310 width=c(ploting_area_sides,ploting_area_width,ploting_area_sides), | |
| 311 height=c(3,ncol(M)*0.5)) | |
| 312 par(xaxs='i', yaxs = 'i') | |
| 313 par(las=2,mar=c(4,0,0,0),cex.axis=0.5) | |
| 314 rectMap(Mn2[ord1,ord2],scale.by='none',col=1, grid=TRUE) | |
| 315 par(las=2,mar=c(1,0,1,0), mgp = c(2,0.5,0)) | |
| 316 barplot(annot$number.of.reads[ord1], col = 1) | |
| 317 mtext(side = 2, "Cluster size", las = 3, line = 2, cex = 0.5) | |
| 318 par(mar=c(0,0,10,0)) | |
| 319 plot.new() | |
| 320 st = dev.off() | |
| 321 ## calculate coordinated if boxes to create hyperlink | |
| 322 X0 = wdth/(ploting_area_sides * 2 + ploting_area_width)* ploting_area_sides | |
| 323 X1 = wdth/(ploting_area_sides * 2 + ploting_area_width)*(ploting_area_sides + ploting_area_width) | |
| 324 L = round(seq(X0,X1, length.out = N + 1)[1:N]) | |
| 325 R = round(seq(X0,X1, length.out = N + 1)[2:(N + 1)]) | |
| 326 cn = rownames(Mn2[ord1,ord2]) | |
| 327 cluster_links = paste0( | |
| 328 "seqclust/clustering/clusters/dir_CL", | |
| 329 sprintf("%04d", as.integer(substring(cn,3 ))), | |
| 330 "/index.html") | |
| 331 coords = paste0(L, ",", 1, ",", R, ",", hgt) | |
| 332 clustermap = paste0( | |
| 333 '\n<map name="clustermap"> \n', | |
| 334 paste0( | |
| 335 '<area shape="rect"\n coords="',coords, '"\n', | |
| 336 ' href="', cluster_links, '"\n', | |
| 337 ' title="', cn, '"/>\n', | |
| 338 collapse = ""), | |
| 339 "</map>\n") | |
| 340 return(clustermap) | |
| 341 } | 
