comparison lib/tarean/methods.R @ 0:1d1b9e1b2e2f draft
Uploaded
| author | petr-novak |
|---|---|
| date | Thu, 19 Dec 2019 10:24:45 -0500 |
| parents | |
| children | |
#!/usr/bin/env Rscript

suppressPackageStartupMessages(library(igraph))
suppressPackageStartupMessages(library(parallel))
suppressPackageStartupMessages(library(Biostrings))
suppressPackageStartupMessages(library(scales))
suppressPackageStartupMessages(library(stringr))
suppressPackageStartupMessages(library(hwriter))
suppressPackageStartupMessages(library(R2HTML))
suppressPackageStartupMessages(library(plyr))
suppressPackageStartupMessages(library(dplyr))

max_ORF_length = function(s) {
    ## check all six reading frames, return the longest stop-free stretch (in aa)
    L = 0
    for (i in 1:3) {
        L = max(L, nchar(unlist(strsplit(as.character(translate(subseq(s, i))), "*",
            fixed = TRUE))))
        L = max(L, nchar(unlist(strsplit(as.character(translate(subseq(reverseComplement(s),
            i))), "*", fixed = TRUE))))
    }
    return(L)
}
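
## illustrative check (not executed), assuming Biostrings semantics: the
## toy sequence below is hypothetical; the result is the length (in amino
## acids) of the longest stop-free stretch over all six reading frames
if (FALSE) {
    s = DNAStringSet("ATGAAACCCGGGTAATTTATGCCC")
    max_ORF_length(s)
}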

kmers2graph = function(kmers, mode = "strong", prop = NULL) {
    kmerLength = nchar(kmers[1, 1])
    if (ncol(kmers) == 2) {
        kmers$size = kmers[, 2]/sum(kmers[, 2])
    }
    colnames(kmers) = c("name", "count", "size")
    if (!is.null(prop)) { # currently unused (prop is NULL), and probably wrong - it filters before sorting!!
        p = cumsum(kmers$size)
        kmers = kmers[p < prop, ]
    }
    N = dim(kmers)[1]
    kmers = kmers[order(kmers$size), ]
    ## build de Bruijn edges by joining (k-1)-mer suffixes to prefixes
    kms = data.frame(kmer = substring(kmers$name, 1, kmerLength - 1), ids = 1:nrow(kmers), stringsAsFactors = FALSE)
    kme = data.frame(kmer = substring(kmers$name, 2), ide = 1:nrow(kmers), stringsAsFactors = FALSE)

    ## df = merge(kms,kme, by = 'kmer',all=FALSE)[,c(2,3,1)]
    df = inner_join(kme, kms, by = 'kmer')[, c(2, 3)]

    ## names(kms) = seq_along(kms)
    ## kme = substring(kmers$name, 2)
    ## names(kme) = seq_along(kme)
    ## ## use new blast!
    ## database = tempfile()
    ## query = tempfile()
    ## output = tempfile()
    ## writeXStringSet(DNAStringSet(kms), filepath = database, format = "fasta")
    ## writeXStringSet(DNAStringSet(kme), filepath = query, format = "fasta")
    ## cmd = paste("makeblastdb -in", database, "-dbtype nucl")
    ## system(cmd, ignore.stdout = TRUE)
    ## cmd = paste("blastn -outfmt '6 qseqid sseqid pident' -strand plus -dust no -perc_identity 100 -query ",
    ##     query, "-db", database, "-word_size", kmerLength - 1, "-out", output)
    ## system(cmd)
    ## df = try({
    ##     read.table(output, as.is = TRUE)
    ## })
    ## if (class(df) == "try-error"){
    ##     print("creation of kmer graph failed")
    ##     print(query)
    ##     print(output)
    ##     print(database)
    ##     return(NULL)
    ## }
    ## unlink(query)
    ## unlink(paste(database, "*", sep = ""))
    ## unlist(output)
    gm_mean = function(x, na.rm = TRUE) {
        exp(sum(log(x[x > 0]), na.rm = na.rm)/length(x))
    }

    whg = apply(cbind(kmers[df[, 1], 2], V2 = kmers[df[, 2], 2]), 1, gm_mean)
    G = graph.data.frame(data.frame(V1 = kmers$name[df[, 1]], V2 = kmers$name[df[,
        2]], weight = whg), vertices = kmers[, 1:3])
    # separate to connected components:
    ccs = clusters(G, mode = mode)$membership
    sel_cls = which(tabulate(ccs) > 1)
    Gs = list()
    for (i in seq_along(sel_cls)) {
        Gs[[i]] = induced.subgraph(G, vids = which(ccs %in% sel_cls[i]))
    }
    ## reorder, largest component first
    Gs = Gs[order(sapply(Gs, vcount), decreasing = TRUE)]
    return(Gs)
}
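
## minimal sketch (not executed) of the expected input: a two-column table
## of k-mers and counts; the toy k-mers below form one circular de Bruijn
## component, the typical signature of a tandem repeat
if (FALSE) {
    km = data.frame(name = c("ACGT", "CGTG", "GTGA", "TGAC", "GACG"),
                    count = c(10, 8, 6, 4, 2), stringsAsFactors = FALSE)
    Gs = kmers2graph(km)  # list of connected components, largest first
}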


OGDFlayout = function(G, ncol = NULL, alg = "fmmm", OGDF = getOption("OGDF")) {
    ## is the ogdf binary available?
    if (is.null(OGDF)) {
        OGDF = Sys.getenv("OGDF")
        if ("" == OGDF) {
            options(warn = -1)
            OGDF = system("which runOGDFlayout", intern = TRUE)
            options(warn = 0)
            if (length(OGDF) == 0) {
                cat("path to runOGDFlayout not found\n")
                return(NULL)
            }

        }
    }
    if (is.null(ncol)) {
        if (is.null(E(G)$weight)) {
            el = data.frame(get.edgelist(G, names = TRUE), rep(1, ecount(G)))
        } else {
            el = data.frame(get.edgelist(G, names = TRUE), E(G)$weight)
        }
        ncol = paste(tempfile(pattern = as.character(Sys.getpid())), ".layout", sep = "")
        write.table(el, file = ncol, row.names = FALSE, col.names = FALSE, sep = "\t",
            quote = FALSE)
    } else {
        # copy ncol:
        ncol_tmp = paste(tempfile(pattern = as.character(Sys.getpid())), ".layout",
            sep = "")
        file.copy(ncol, ncol_tmp)
        ncol = ncol_tmp
    }
    algopt = c("fmmm", "sm", "fme")
    if (!all(alg %in% algopt)) {
        stop("alg must be one of: ", paste(algopt, collapse = ", "), "\n")
    }

    # output file:
    Louts = list()
    layout_file = tempfile(pattern = as.character(Sys.getpid()))
    for (i in alg) {
        cmd = paste(OGDF, "-alg", i, "-iff layout -off layout -out", layout_file,
            ncol)
        system(cmd, intern = TRUE)
        L = read.table(layout_file, skip = ecount(G))
        L = L[match(V(G)$name, L$V2), ]
        Lout = as.matrix(L[, -(1:2)])
        unlink(layout_file)
        Louts[[i]] = Lout

    }
    # clean up
    unlink(ncol)
    return(Louts)
}

xcolor_code = c(A = "#FF0000", C = "#00FF00", T = "#0000FF", G = "#FFFF00")

kmers2color = function(s, position = NULL) {
    if (is.null(position)) {
        position = round(nchar(s[1])/2, 0)
        ## position = 1
        position = nchar(s[1])
    }
    color_code = c(A = "#FF0000", C = "#00FF00", T = "#0000FF", G = "#FFFF00")
    color_base = substring(s, position, position)
    colors = color_code[color_base]
    names(colors) = color_base
    return(colors)
}
get_sequence = function(g, v, position = NULL) {
    s = V(g)$name
    if (is.null(position)) {
        position = round(nchar(s[1])/2, 0)
        ## position = 1
        position = nchar(s[1])
    }
    nt = paste(substring(s[v], position, position), collapse = "")
    return(nt)
}


get_mimimal_cc = function(km, thr = 20, min_coverage = 0.45, step = 2, start = NULL) {
    if (is.null(start)) {
        i = sum(cumsum(km$freq) < 0.5)
    } else {
        i = sum(cumsum(km$freq) < start)
    }
    continue = TRUE
    while (continue) {
        if (i > nrow(km)) {
            i = nrow(km)
            continue = FALSE
            step = 1
        }
        GG = kmers2graph(km[1:i, ])
        if (length(GG) > 0) {
            if (vcount(GG[[1]]) > thr) {
                if (sum(V(GG[[1]])$size) >= min_coverage) {
                    GG[[1]]$input_coverage = sum(km$freq[1:i])
                    GG[[1]]$L = OGDFlayout(GG[[1]])[[1]]
                    return(GG[[1]])
                }
            }
        }
        i = round(i * step)

    }
    if (length(GG) == 0 | is.null(GG)) {
        return(NULL)
    }

    GG[[1]]$input_coverage = sum(km$freq[1:i])
    GG[[1]]$L = OGDFlayout(GG[[1]])[[1]]
    return(GG[[1]])
}
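
## sketch of the intended call (not executed): km is a k-mer table with a
## 'freq' column sorted by decreasing frequency, as built in tarean();
## requires the runOGDFlayout binary for the layout step
if (FALSE) {
    G = get_mimimal_cc(km, thr = 20, min_coverage = 0.45, start = 0.5)
    if (!is.null(G)) sum(V(G)$size)  # fraction of k-mer coverage retained
}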



paths2string = function(paths) {
    pathstring = sapply(lapply(lapply(paths, as_ids), substring, 1, 1), paste, collapse = "")
    return(pathstring)
}


align_paths = function(paths, G) {
    shift = rep(NA, length(paths))
    thr = 0 # minimal length
    tr_paths = list()
    Seqs = list()
    centre_node = as.numeric(names(sort(table(unlist(paths)), decreasing = TRUE)))[[1]]

    for (i in seq_along(paths)) {
        if (centre_node %in% paths[[i]]) {
            S = which(paths[[i]] %in% centre_node)
            shift[i] = S
            if (S == 1) {
                tr_paths[[i]] = paths[[i]]
            } else {
                tr_paths[[i]] = c(paths[[i]][S:length(paths[[i]])], paths[[i]][1:(S -
                    1)])
            }
            Seqs[[i]] = get_sequence(G, tr_paths[[i]])
        } else {
            shift[i] = NA
        }
    }
    paths_n = lapply(paths, as.numeric)
    tr_paths_n = do.call(cbind, lapply(tr_paths, as.numeric))
    new_shift = shift
    for (i in which(is.na(shift))) {
        score = numeric(length(paths_n))
        for (S in seq_along(paths_n[[i]])) {
            if (S == 1) {
                path_tmp_n = paths_n[[i]]
            } else {
                path_tmp_n = c(paths_n[[i]][S:length(paths_n[[i]])], paths_n[[i]][1:(S -
                    1)])
            }
            score[S] = sum(tr_paths_n == path_tmp_n)
        }
        if (sum(score) != 0) {
            S = which.max(score)
            new_shift[i] = S
            if (S == 1) {
                tr_paths[[i]] = paths[[i]]
            } else {
                tr_paths[[i]] = c(paths[[i]][S:length(paths[[i]])], paths[[i]][1:(S -
                    1)])
            }
            Seqs[[i]] = get_sequence(G, tr_paths[[i]])
        }
    }
    shift = new_shift
    # try to shift based on the sequences themselves
    return(list(Seqs = Seqs[!is.na(shift)], tr_paths = tr_paths[!is.na(shift)], shift = shift[!is.na(shift)]))
}
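
## illustration (not executed): 'paths' is a list of equal-length circular
## k-mer walks as collected inside estimate_monomer(); align_paths rotates
## each walk so that a common vertex comes first, giving comparable strings
if (FALSE) {
    pa = align_paths(paths, G)  # 'paths' and 'G' as in estimate_monomer()
    pa$shift                    # rotation applied to each path (NA = dropped)
}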

make_consensus = function(paths_info, G) {
    include = !is.na(paths_info$shift)
    ## make alignments using mafft
    aln = mafft(unlist(paths_info$Seqs[include]))
    CM = calculate_consensus_matrix(aln = aln, tr_paths = paths_info$tr_paths[include],
        G = G)
    gaps = get_gaps_from_alignment(aln)
    CMnorm = CM/rowSums(CM)
    bases = colnames(CM)
    consensus = sapply(apply(CMnorm, 1, function(x) bases[which(x > 0.2)][order(x[x >
        0.2], decreasing = TRUE)]), paste, collapse = "")
    consensus2 = gsub("-", "", paste0(substring(consensus, 1, 1), collapse = ""),
        fixed = TRUE)
    number_of_SNP = sum(rowSums(CM > 0) > 1)
    SNP_positions = which(rowSums(CM > 0) > 1)
    number_of_position_with_indels = sum(colSums(do.call(rbind, strsplit(as.character(aln),
        "")) == "-") > 0)
    indel_positions = which(colSums(do.call(rbind, strsplit(as.character(aln), "")) ==
        "-") > 0)
    if (length(SNP_positions) > 0) {
        variable_sites = unlist(c(c(mapply(paste, strsplit((sapply(apply(CMnorm,
            1, function(x) bases[which(x > 0.2)]), paste, collapse = ""))[SNP_positions],
            ""), SNP_positions, sep = "_")), paste("-", indel_positions, sep = "_")))
    } else {
        variable_sites = NULL
    }
    ## note: c() is needed here - unique(x, y) would treat y as 'incomparables'
    variable_positions = unique(c(SNP_positions, indel_positions))
    return(list(aln = aln, CM = CM, CMnorm = CMnorm, consensus = consensus, consensus2 = consensus2,
        number_of_SNP = number_of_SNP, SNP_positions = SNP_positions, number_of_position_with_indels = number_of_position_with_indels,
        indel_positions = indel_positions, variable_positions = variable_positions,
        variable_sites = variable_sites, gaps = gaps))
}


estimate_monomer = function(G, weights = NULL, limit = NULL) {
    if (is.null(G)) {
        return(NULL)
    }
    ## estimate monomer from kmer based graph
    V(G)$id = 1:vcount(G)
    GS = induced_subgraph(G, vids = which(degree(G) == 2)) ## keep only vertices without branching
    cls = clusters(GS)$membership


    ids = mapply(FUN = function(x, y) x[which.max(y)], split(V(GS)$id, cls), split(V(GS)$size,
        cls)) ## from each branch use only the one vertex with the largest size!


    ids = ids[order(V(G)$size[ids], decreasing = TRUE)]
    ids_size = V(G)$size[ids]
    N50 = sum(cumsum(ids_size)/sum(ids_size) < 0.5)
    if (length(ids) > 10000) {
        ids = ids[1:N50]
    }
    ## use only large vertices in the search! how many?
    el = get.edgelist(G, names = FALSE)
    node_use = numeric(vcount(G))
    LL = numeric()
    ## W=numeric()
    i = 0
    paths = list()
    if (is.null(weights)) {
        weights = (max(E(G)$weight) - (E(G)$weight) + 1)
        weights = E(G)$weight^(-3)
    }
    included = rep(FALSE, vcount(G))
    W_total = sum(V(G)$size)

    coverage = numeric()
    t0 = c()
    i = 0
    j = 0
    for (n in ids) {
        j = j + 1
        t0[j] = Sys.time()
        if (included[n]) {
            next
        }
        m = which(el[, 1] %in% n)
        i = i + 1
        s = get.shortest.paths(G, el[m, 2], el[m, 1], weights = weights, output = "vpath")
        included[as.numeric(s$vpath[[1]])] = TRUE
        paths[[i]] = s$vpath[[1]]
        LL[i] = (length(s$vpath[[1]]))
    }

    ## evaluate if paths should be divided to variants - by length and path weight
    paths_clusters = split(paths, LL)
    paths_clusters_tr = mclapply(paths_clusters, FUN = align_paths, G = G, mc.cores = getOption("CPU"))

    ## paths_clusters_tr = lapply(paths_clusters, FUN = align_paths, G = G) consensus
    paths_consensus = mclapply(paths_clusters_tr, make_consensus, G = G, mc.cores = getOption("CPU"))

    ## evaluate weight for individual paths:
    for (v in seq_along(paths_consensus)) {
        p = paths_clusters_tr[[v]]$tr_paths
        ## clean
        p = p[!sapply(p, function(x) anyNA(x) | is.null(x))]
        L = sapply(p, length)
        p_groups = split(p, L)
        w_groups = sapply(p_groups, function(x) sum(V(G)$size[unique(c(sapply(x,
            as.numeric)))]))
        total_score = sum(V(G)$size[unique(c(unlist(sapply(p, as.numeric))))])
        LW = data.frame(`Length estimate` = unique(L), weight = w_groups, stringsAsFactors = FALSE)
        LW = LW[order(LW$weight, decreasing = TRUE), ]
        rownames(LW) = NULL
        paths_consensus[[v]]$total_score = total_score
        paths_consensus[[v]]$length_variant_score = LW

    }
    return(list(estimates = paths_consensus, paths = paths_clusters_tr))
}
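
## end-to-end sketch (not executed), assuming km is a k-mer frequency table:
## build the minimal connected component, then estimate monomer variants
if (FALSE) {
    G = get_mimimal_cc(km)
    m = estimate_monomer(G)
    m$estimates[[1]]$consensus2   # plain consensus string
    m$estimates[[1]]$total_score  # k-mer coverage score used for ranking
}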



detect_pbs = function(dimers_file, tRNA_database_path, reads_file, output) {
    ## reads_file contains oriented reads!
    min_offset = 10
    max_end_dist = 2
    min_aln_length = 30
    max_pbs_search = 30
    thr_length = 12
    end_position = 23
    insertion_proportion_threshold <- 0.15
    ## for testing:
    if (FALSE) {
        library(Biostrings)
        thr_length = 12
        end_position = 23
        dimers_file = "consensus_dimer.fasta"
        reads_file = "reads_oriented.fas"
        tRNA_database_path = "/mnt/raid/users/petr/workspace/repex_tarean/databases/tRNA_database2.fasta"

    }
    ## find reads which are half aligned to the reference dimer
    dimers_seq = readDNAStringSet(dimers_file)
    cmd = paste("makeblastdb -in", dimers_file, "-dbtype nucl")
    system(cmd, ignore.stdout = TRUE)
    ## use a separate name for the blast output - 'output' is the prefix of
    ## the result tables written below and must not be overwritten
    blast_out = paste0(reads_file, "blast_out.csv")
    columns = c("qseqid", "qstart", "qend", "qlen", "sseqid", "sstart", "send", "sstrand",
        "slen", "pident", "length")
    cmd = paste("blastn -outfmt '6", paste(columns, collapse = " "), "' -perc_identity 90 -query ",
        reads_file, "-db", dimers_file, "-word_size", 7, "-dust no -num_alignments 99999 -strand plus ",
        "-out", blast_out)
    system(cmd)
    blastdf = read.table(blast_out, as.is = TRUE, col.names = columns, comment.char = "")
    unlink(blast_out)
    blastdf = blastdf[blastdf$length >= min_aln_length, ]

    ## expand to the whole read to see unaligned read parts
    blastdf_expand = blastdf
    blastdf_expand$qstart = blastdf$qstart - (blastdf$qstart - 1)
    blastdf_expand$sstart = blastdf$sstart - (blastdf$qstart - 1)
    blastdf_expand$qend = blastdf$qend + (blastdf$qlen - blastdf$qend)
    blastdf_expand$send = blastdf$send + (blastdf$qlen - blastdf$qend)
    pS = blastdf$sstart
    pE = blastdf$send
    pSF = blastdf_expand$sstart
    pEF = blastdf_expand$send
    cond1 = pS - pSF >= min_offset & blastdf$qend >= (blastdf$qlen - max_end_dist) &
        blastdf$sstart >= max_pbs_search
    cond2 = pEF - pE >= min_offset & blastdf$qstart <= max_end_dist & blastdf$send <=
        blastdf$slen - max_pbs_search

    ## coverage of alignments: evaluate coverage at sites with breaks; a minimum
    ## proportion of interrupted alignments (insertion_proportion_threshold) at a
    ## given position is required for it to enter the additional search
    coverage_profiles = subject_coverage(blastdf)


    ## extract flanking sequences - approx. 50 nt, they should contain the tRNA sequence
    ## Left
    fin = tempfile()
    fout = tempfile()
    scoreL = scoreR = 0


    ## left side unique insertion sites
    if (any(cond1)) {
        insertion_sites = ddply(blastdf[cond1, ], .(sseqid, sstart), nrow)
        ## check coverage
        insertion_sites$coverage = 0
        for (i in 1:nrow(insertion_sites)) {
            insertion_sites$coverage[i] = coverage_profiles[[insertion_sites$sseqid[i]]][insertion_sites$sstart[[i]]]
        }
        insertion_OK_left <- insertion_sites[with(insertion_sites, V1/coverage >
            insertion_proportion_threshold), ]

        if (nrow(insertion_OK_left) > 0) {
            s = ifelse(insertion_OK_left[, "sstart"] - max_pbs_search < 1, 1, insertion_OK_left[,
                "sstart"] - max_pbs_search)
            Lpart = subseq(dimers_seq[match(insertion_OK_left$sseqid, names(dimers_seq))],
                s, insertion_OK_left$sstart)

            ## names are CONSENSUSID__count__position
            names(Lpart) = paste0(names(Lpart), "__", insertion_OK_left$V1, "__",
                insertion_OK_left$sstart)
            ## check presence of TG dinucleotide
            TG = vcountPattern("TG", subseq(dimers_seq[match(insertion_OK_left$sseqid,
                names(dimers_seq))], insertion_OK_left$sstart - 2, insertion_OK_left$sstart +
                2))
            if (any(TG > 0)) {
                writeXStringSet(Lpart[TG > 0], filepath = fin)
                cmd = paste("blastn -outfmt '6", paste(columns, collapse = " "),
                    "' -perc_identity 83 -query ", fin, "-db", tRNA_database_path,
                    "-word_size", 7, "-strand plus -dust no -max_target_seqs 10000 -evalue 100",
                    "-out", fout)

                system(cmd, ignore.stdout = TRUE)
                df = read.table(fout, as.is = TRUE, col.names = columns, comment.char = "")
                filter1 = df$length >= thr_length
                filter2 = ifelse(df$send > df$sstart, df$send, df$sstart) >= end_position
                df_pass = df[filter1 & filter2, , drop = FALSE]
                df_pass_L = df_pass[!duplicated(df_pass$qseqid), , drop = FALSE]
                scoreL = get_score(df_pass_L)
                write.table(df_pass_L, file = paste0(output, "_L.csv"), row.names = FALSE,
                    sep = "\t")
            }
        }
    }
    if (any(cond2)) {
        ## search Right
        insertion_sites = ddply(blastdf[cond2, ], .(sseqid, send, slen), nrow)
        ## check coverage
        insertion_sites$coverage = 0
        for (i in 1:nrow(insertion_sites)) {
            insertion_sites$coverage[i] = coverage_profiles[[insertion_sites$sseqid[i]]][insertion_sites$send[[i]]]
        }
        insertion_OK_right <- insertion_sites[with(insertion_sites, V1/coverage >
            insertion_proportion_threshold), ]

        if (nrow(insertion_OK_right) > 0) {
            s = ifelse(insertion_OK_right$send + max_pbs_search > insertion_OK_right$slen,
                insertion_OK_right$slen, insertion_OK_right$send + max_pbs_search)
            Rpart = subseq(dimers_seq[match(insertion_OK_right$sseqid, names(dimers_seq))],
                insertion_OK_right$send, s)
            names(Rpart) = paste0(names(Rpart), "__", insertion_OK_right$V1, "__",
                insertion_OK_right$send)

            ## check presence of CA dinucleotide
            CA = vcountPattern("CA", subseq(dimers_seq[match(insertion_OK_right$sseqid,
                names(dimers_seq))], insertion_OK_right$send - 2, insertion_OK_right$send +
                2))
            if (any(CA > 0)) {
                writeXStringSet(Rpart[CA > 0], filepath = fin)
                cmd = paste("blastn -outfmt '6", paste(columns, collapse = " "),
                    "' -perc_identity 83 -query ", fin, "-db", tRNA_database_path,
                    "-word_size", 7, "-strand minus -dust no -max_target_seqs 10000 -evalue 100",
                    "-out", fout)

                system(cmd, ignore.stdout = TRUE)
                df = read.table(fout, as.is = TRUE, col.names = columns, comment.char = "")
                filter1 = df$length >= thr_length
                filter2 = ifelse(df$send > df$sstart, df$send, df$sstart) >= end_position
                df_pass = df[filter1 & filter2, , drop = FALSE]
                df_pass_R = df_pass[!duplicated(df_pass$qseqid), , drop = FALSE]
                write.table(df_pass_R, file = paste0(output, "_R.csv"), row.names = FALSE,
                    sep = "\t")
                scoreR = get_score(df_pass_R)
            }
        }
    }
    unlink(fin)
    unlink(fout)
    return(max(scoreL, scoreR))
}


subject_coverage = function(blastdf) {
    ## calculate coverage for all blast subjects
    coverage_profiles <- by(blastdf, INDICES = blastdf$sseqid, FUN = function(x) {
        as.numeric(coverage(IRanges(start = x$sstart, end = x$send)))
    })
    return(coverage_profiles)

}

get_score = function(x) {
    ## keep the best tRNA
    if (nrow(x) == 0) {
        return(0)
    }
    xm = data.frame(do.call(rbind, strsplit(x$qseqid, "__")), stringsAsFactors = FALSE)
    xm$score = as.numeric(sapply(strsplit(xm[, 1], "_"), "[[", 4))
    xm$AA = gsub("-.+$", "", gsub("^.+__", "", x$sseqid))
    best_score = max(by(xm$score, INDICES = xm$AA, FUN = sum))
    return(best_score)
}
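
## worked example (not executed) of the qseqid convention parsed above, as
## assembled in detect_pbs() and tarean():
## "<kmer>_<variant>_sc_<score>_l_<length>__<count>__<position>"
## field 4 of the first "_"-separated part is the consensus score; the tRNA
## amino acid is taken from sseqid after the last "__", before the first "-"
if (FALSE) {
    x = data.frame(qseqid = "13_1_sc_0.82_l_170__5__44",
                   sseqid = "tRNA_db__Met-CAT", stringsAsFactors = FALSE)
    get_score(x)  # sums scores per amino acid and returns the maximum: 0.82
}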



dotter = function(seq1, seq2 = NULL, params = "") {
    if (is.null(seq2)) {
        seq2 = seq1
    }
    library(Biostrings)
    if (class(seq1) != "DNAStringSet") {
        seq1 = BStringSet(seq1)
    }
    if (class(seq2) != "DNAStringSet") {
        seq2 = BStringSet(seq2)
    }
    sf1 = tempfile("seq1")
    writeXStringSet(seq1, filepath = sf1)
    sf2 = tempfile("seq2")
    writeXStringSet(seq2, filepath = sf2)
    system(paste("dotter", params, sf1, sf2), wait = FALSE)
    Sys.sleep(2)
    unlink(c(sf1, sf2))
    return(NULL)
}


dotter2 = function(seq1, seq2 = NULL, params = NULL) {
    if (is.null(seq2)) {
        seq2 = seq1
    }
    if (is.null(params)) {
        params = " -windowsize 30 -threshold 45 "
    }
    library(Biostrings)
    if (class(seq1) != "DNAStringSet") {
        seq1 = DNAStringSet(seq1)
    }
    if (class(seq2) != "DNAStringSet") {
        seq2 = DNAStringSet(seq2)
    }
    L1 = nchar(seq1)
    L2 = nchar(seq2)

    tmpdat1 = tempfile()

    dir.create(tmpdat1)
    tmpdat2 = tempfile()
    dir.create(tmpdat2)
    oridir = getwd()


    seq1_merged = DNAStringSet(paste(seq1, collapse = ""))
    seq2_merged = DNAStringSet(paste(seq2, collapse = ""))
    seq2rc_merged = reverseComplement(DNAStringSet(paste(seq2, collapse = "")))


    sf1 = tempfile("seq1")
    writeXStringSet(seq1_merged, filepath = sf1)
    sf2 = tempfile("seq2")
    sf2rc = tempfile("seq2rc")
    writeXStringSet(seq2_merged, filepath = sf2)
    writeXStringSet(seq2rc_merged, filepath = sf2rc)

    cmd1 = paste("dotmatcher -graph data -asequence ", sf1, "-bsequence", sf2, params)
    cmd2 = paste("dotmatcher -graph data -asequence ", sf1, "-bsequence", sf2rc,
        params)
    setwd(tmpdat1)
    output1 = system(cmd1, intern = TRUE)
    setwd(tmpdat2)
    output2 = system(cmd2, intern = TRUE)
    setwd(oridir)



    fout1 = strsplit(tail(output1, n = 1), split = " ")[[1]][2]
    rawdat1 = readLines(paste(tmpdat1, "/", fout1, sep = ""))
    rawdat1 = rawdat1[1:min(grep("^Rectangle", rawdat1))]

    if (length(rawdat1[grep("^Line", rawdat1)]) == 0) {
        coord1 = NULL
    } else {
        coord1 = apply(sapply(strsplit(rawdat1[grep("^Line", rawdat1)], " "), "[",
            c(3, 5, 7, 9, 11)), 1, as.numeric)
        coord1 = matrix(coord1, ncol = 5)
    }

    fout2 = strsplit(tail(output2, n = 1), split = " ")[[1]][2]
    rawdat2 = readLines(paste(tmpdat2, "/", fout2, sep = ""))
    rawdat2 = rawdat2[1:min(grep("^Rectangle", rawdat2))]

    if (length(rawdat2[grep("^Line", rawdat2)]) == 0) {
        coord2 = NULL
    } else {
        coord2 = apply(sapply(strsplit(rawdat2[grep("^Line", rawdat2)], " "), "[",
            c(3, 5, 7, 9, 11)), 1, as.numeric)
        coord2 = matrix(coord2, ncol = 5)
    }
    unlink(sf1)
    unlink(sf2)
    unlink(sf2rc)
    unlink(tmpdat1, recursive = TRUE)
    unlink(tmpdat2, recursive = TRUE)

    N1 = sum(nchar(seq1))
    N2 = sum(nchar(seq2))
    op = par(xaxs = "i", yaxs = "i", mar = c(5, 2, 6, 10), las = 1)
    on.exit(par(op))
    plot(c(1, N1), c(1, N2), type = "n", xlab = "", ylab = "", axes = FALSE)
    if (!is.null(coord1)) {
        segments(x0 = coord1[, 1], y0 = coord1[, 2], x1 = coord1[, 3], y1 = coord1[,
            4])
    }
    if (!is.null(coord2)) {
        segments(x0 = coord2[, 1], y0 = N2 - coord2[, 2], x1 = coord2[, 3], y1 = N2 -
            coord2[, 4])
    }
    abline(v = c(0, cumsum(L1)), col = "green")
    abline(h = c(0, cumsum(L2)), col = "green")
    box()
    axis(1, at = c(1, cumsum(L1))[-(length(L1) + 1)], labels = names(seq1), hadj = 0,
        cex.axis = 0.7)
    axis(4, at = c(1, cumsum(L2))[-(length(L2) + 1)], labels = names(seq2), hadj = 0,
        cex.axis = 0.7)
    invisible(list(coord1, coord2))
}


CM2sequence = function(x) {
    bases = c(colnames(x), "N")
    x = cbind(x, 0)
    colnames(x) = bases
    seqs = paste(bases[apply(x, 1, which.max)], collapse = "")

    return(seqs)
}
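
## tiny example (not executed): majority base per consensus-matrix row
if (FALSE) {
    CM = rbind(c(A = 9, C = 1, G = 0, T = 0, `-` = 0),
               c(A = 0, C = 2, G = 7, T = 1, `-` = 0))
    CM2sequence(CM)  # "AG"
}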

## in the alignment, calculate significance from kmers
calculate_consensus_matrix = function(aln, tr_paths, G) {
    bases = c("A", "C", "G", "T", "-")
    positions = lapply(strsplit(as.character(aln), split = ""), function(x) which(!x %in%
        "-"))
    base_matrix = do.call(rbind, strsplit(as.character(aln), split = ""))
    weights = matrix(0, nrow = length(aln), ncol = nchar(aln))
    kmer_rel_proportions = (V(G)$size/table(factor(unlist(tr_paths), levels = 1:vcount(G))))
    for (i in seq_along(positions)) {
        weights[i, positions[[i]]] = kmer_rel_proportions[tr_paths[[i]]]
    }
    names(kmer_rel_proportions) = V(G)$name
    ## get weights for gaps by approximation
    fitgaps = function(y) {
        if (sum(y == 0) == 0) {
            return(y)
        } else {
            y0 = rep(y[y != 0], 3)
            x0 = which(rep(y, 3) != 0)
            fitted = approx(x0, y0, xout = seq_along(rep(y, 3)), rule = 1)
        }
        return(fitted$y[-seq_along(y)][seq_along(y)])
    }
    weights_with_gaps = t(apply(weights, 1, fitgaps))
    ## weights_with_gaps = get_gap_weights(weights, aln, G) ## that step took way
    ## too long and is not so important; TODO: handle gaps differently/more effectively

    ## get consensus matrix
    CM = sapply(1:nchar(aln[[1]]), function(i) {
        sapply(split(weights_with_gaps[, i], factor(base_matrix[, i], levels = bases)),
            sum)
    })
    return(t(CM)[, 1:5])
}
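
## sketch (not executed): rows of CM are alignment positions, columns the
## k-mer-weighted support for A, C, G, T and '-'; inputs come from mafft()
## and align_paths() as wired together in make_consensus()
if (FALSE) {
    CM = calculate_consensus_matrix(aln, tr_paths, G)
    head(CM/rowSums(CM))  # position probability matrix used downstream
}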


get_gaps_from_alignment = function(aln) {
    gaps_positions = unique(do.call(rbind, str_locate_all(as.character(aln), "-+")))
    return(gaps_positions)
}

plot_kmer_graph = function(G, L = NULL, vertex.size = NULL, ord = NULL, upto = NULL,
    highlight = NULL) {
    if (!is.null(G$L) & is.null(L)) {
        L = G$L
    }
    if (is.null(L)) {
        ## L=layout.kamada.kawai(G)
        L = OGDFlayout(G)[[1]]
    }
    clr = kmers2color(V(G)$name)
    if (!is.null(highlight)) {
        clr[highlight] = "#00000080"
    }
    if (!is.null(ord)) {
        clr[ord[1:upto]] = paste(clr[ord[1:upto]], "30", sep = "")
    }
    if (is.null(vertex.size)) {
        vertex.size = rescale(V(G)$size, to = c(0.5, 6))

    }
    plot(G, layout = L, vertex.label = "", vertex.size = vertex.size, edge.curved = FALSE,
        vertex.color = clr, vertex.frame.color = "#00000020", edge.width = 2, edge.arrow.mode = 1,
        edge.arrow.size = 0.2)

}

rglplot_kmer_graph = function(G, L = NULL, vertex.size = 4) {
    if (is.null(L)) {
        L = layout.kamada.kawai(G)
    }

    rglplot(G, layout = L, vertex.label = "", vertex.size = vertex.size, edge.curved = FALSE,
        vertex.color = kmers2color(V(G)$name), edge.width = 2, edge.arrow.mode = 1,
        edge.arrow.size = 0.5)
}




mafft = function(seqs, params = "--auto --thread 1 ") {
    if (length(seqs) < 2) {
        return(seqs)
    }
    infile = tempfile()
    if (class(seqs) == "character") {
        seqs = DNAStringSet(seqs)
    }
    writeXStringSet(seqs, filepath = infile)
    outfile = tempfile()
    cmd = paste("mafft --quiet --nuc ", params, infile, "2> /dev/null > ", outfile)
    system(cmd, intern = TRUE, ignore.stderr = FALSE)
    aln = readDNAStringSet(outfile)
    unlink(c(outfile, infile))
    return(aln)
}
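
## usage sketch (not executed); requires the mafft binary on the PATH
if (FALSE) {
    aln = mafft(DNAStringSet(c("ACGTACGTACGT", "ACGTTACGTACGT", "ACGAACGTACGT")))
    as.character(aln)
}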

mgblast = function(databaseSeq, querySeq) {
    params = " -p 85 -W18 -UT -X40 -KT -JF -F \"m D\" -v100000000 -b100000000 -D4 -C 30 -D 30 "
    # no dust filtering:
    paramsDF = " -p 85 -W18 -UT -X40 -KT -JF -F \"m D\" -v100000000 -b100000000 -D4 -F F"

    database = tempfile()
    query = tempfile()
    output = tempfile()
    if (class(databaseSeq) == "character") {
        database = databaseSeq
        do_not_delete_database = TRUE
    } else {
        writeXStringSet(databaseSeq, filepath = database, format = "fasta")
        do_not_delete_database = FALSE
    }
    if (class(querySeq) == "character") {
        query = querySeq
        do_not_delete_query = TRUE
    } else {
        writeXStringSet(querySeq, filepath = query, format = "fasta")
        do_not_delete_query = FALSE
    }
    ## create database:
    cmd = paste("formatdb -i", database, "-p F")
    system(cmd)
    cmd = paste("mgblast", "-d", database, "-i", query, "-o", output, params)
    system(cmd)
    if (file.info(output)$size == 0) {
        # no hits, try without the dust masker
        cmd = paste("mgblast", "-d", database, "-i", query, "-o", output, paramsDF)
        system(cmd)
    }
    blastOut = read.table(output, sep = "\t", header = FALSE, as.is = TRUE, comment.char = "")
    unlink(output)
    if (!do_not_delete_query) {
        unlink(query)
    }
    if (!do_not_delete_database) {
        unlink(paste(database, "*", sep = ""))
    }
    colnames(blastOut) = c("query", "q.length", "q.start", "q.end", "subject", "s.length",
        "s.start", "s.end", "pid", "weight", "e.value", "strand")
    blastOut
}


estimate_sample_size = function(NV, NE, maxv, maxe) {
    ## graph density
    d = (2 * NE)/(NV * (NV - 1))
    ## expected number of edges when sampling down to maxv vertices
    eEst = (maxv * (maxv - 1) * d)/2
    ## number of vertices giving maxe edges (solves n*(n-1)*d/2 = maxe)
    nEst = (d + sqrt(d^2 + 8 * d * maxe))/(2 * d)
    ## at least one of the two branches applies whenever sampling is needed
    if (eEst >= maxe) {
        N = round(nEst)
        E = round((N * (N - 1) * d)/2)

    }
    if (nEst >= maxv) {
        N = maxv
        E = round((N * (N - 1) * d)/2)

    }
    return(N)
}
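
## worked example: NV = 1e5 reads and NE = 1e9 edges give density
## d = 2*NE/(NV*(NV-1)) ~ 0.2; reaching maxe = 5e8 edges would need
## n = (d + sqrt(d^2 + 8*d*maxe))/(2*d) ~ 70711 vertices, which exceeds
## maxv = 4e4, so the vertex cap wins and 40000 is returned
if (FALSE) {
    estimate_sample_size(NV = 1e5, NE = 1e9, maxv = 4e4, maxe = 5e8)  # 40000
}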




mgblast2graph = function(blastfile, seqfile, graph_destination, directed_graph_destination,
    oriented_sequences, paired = TRUE, repex = FALSE, image_file = NULL, image_file_tmb = NULL,
    include_layout = TRUE, pair_completeness = NULL, satellite_model_path = NULL,
    maxv = 40000, maxe = 5e+08, seqfile_full = seqfile) {
    cat("loading blast results\n")
    if (repex) {
        cln = c("query", "subject", "weight", "q.length", "q.start", "q.end", "s.length", "s.start",
            "s.end", "sign")
        colcls = c("character", "character", "numeric", "numeric", "numeric", "numeric",
            "numeric", "numeric", "numeric", "NULL", "NULL", "character")

    } else {
        cln = c("query", "q.length", "q.start", "q.end", "subject", "s.length", "s.start",
            "s.end", "weight", "sign")
        colcls = c("character", "numeric", "numeric", "numeric", "character", "numeric",
            "numeric", "numeric", "NULL", "numeric", "NULL", "character")
    }
    if (class(blastfile) == "data.frame") {
        blastTable = blastfile
        colnames(blastTable)[12] = "sign"
    } else {
        blastTable = read.table(blastfile, sep = "\t", as.is = TRUE, header = FALSE,
            colClasses = colcls, comment.char = "")
        colnames(blastTable) = cln
    }
    ## check for duplicates!
    key = with(blastTable, ifelse(query > subject, paste(query, subject), paste(subject,
        query)))
    if (any(duplicated(key))) {
        blastTable = blastTable[!duplicated(key), ]
    }
    seqs = readDNAStringSet(seqfile)
    ## calculate pair completeness for reads before sampling
    if (is.null(pair_completeness)) {
        if (paired) {
            if (seqfile_full != seqfile) {
                seqs_full = readDNAStringSet(seqfile_full)
                pair_counts = tabulate(table(gsub(".$", "", names(seqs_full))))
                rm(seqs_full)
            } else {
                pair_counts = tabulate(table(gsub(".$", "", names(seqs))))

            }
            pair_completeness = 1 - pair_counts[1]/sum(pair_counts)
        } else {
            pair_completeness = 0
        }
    }
    NV = length(seqs) # vertices
    NE = nrow(blastTable) # edges
    if (maxv < NV | maxe < NE) {
        ## Sample if graph is large
        V_sample_size = estimate_sample_size(NV, NE, maxv, maxe)
        seqs = sample(seqs, V_sample_size)
        blastTable = blastTable[blastTable$query %in% names(seqs) & blastTable$subject %in%
            names(seqs), ]
    }

    blastTable$sign = ifelse(blastTable$sign == "+", 1, -1)
    vnames = unique(c(blastTable$query, blastTable$subject))
    vindex = seq_along(vnames)
    names(vindex) = vnames
    ## correct orientation
    cat("creating graph\n")
    G = graph.data.frame(blastTable[, c("query", "subject", "weight", "sign")], directed = FALSE,
        vertices = vnames)
    if (include_layout) {
        ## save temporarily modified blastTable if large
        tmpB = tempfile()
        save(blastTable, file = tmpB)
        rm(blastTable)
        ##
        cat("graph layout calculation ")
        if (ecount(G) > 2e+06) {
            cat("using fruchterman reingold\n")
            L = layout.fruchterman.reingold(G, dim = 3)
        } else {
            cat("using OGDF & fruchterman reingold\n")
            Ltmp = OGDFlayout(G, alg = c("fmmm"))
            L = cbind(Ltmp[[1]][, 1:2], layout.fruchterman.reingold(G, dim = 2))

        }

        GL = list(G = G, L = L)
        save(GL, file = graph_destination)
        if (!is.null(image_file)) {
            cat("exporting graph figure")
            png(image_file, width = 900, height = 900, pointsize = 20)
            plot(GL$G, layout = GL$L[, 1:2], vertex.size = 2, vertex.color = "#000000A0",
                edge.color = "#00000040", vertex.shape = "circle", edge.curved = FALSE,
                vertex.label = NA, vertex.frame.color = NA, edge.width = 1)
            dev.off()
            ## create thumbnails:
            system(paste("convert ", image_file, " -resize 100x100 ", image_file_tmb))

        }
        rm(GL)
        gc()
        load(tmpB)
        unlink(tmpB)
    }

    if (!is.connected(G)) {
        cat("Warning - graph is not connected\n")
        cc = clusters(G, "weak")
        mb = as.numeric(factor(membership(cc), levels = as.numeric(names(sort(table(membership(cc)),
            decreasing = TRUE)))))
        selvi = which(mb == 1) # largest component
        cat("using largest component: \nsize =", round(length(selvi)/vcount(G) *
            100, 1), "% ;", length(selvi), "reads", "\n")
        G = induced.subgraph(G, vids = selvi)
        blastTable = blastTable[blastTable$query %in% V(G)$name & blastTable$subject %in%
            V(G)$name, ]
    }

    Gmst = minimum.spanning.tree(G)
    # create alternative trees - in case only a suboptimal solution is found
    set.seed(123)
    Gmst_alt = list()
    Gmst_alt[[1]] = Gmst
    for (i in 2:6){
        E(G)$weight = runif(ecount(G), 0.1, 1)
        Gmst_alt[[i]] = minimum.spanning.tree(G)
    }

    rm(G)
    gc()
    ## six attempts to reorient reads
    flip_names_all = list()
    prop_of_notfit = numeric()
    thr_fit = 0.001
    for (ii in 1:6){
        Gmst = Gmst_alt[[ii]]


        blastTable_mst = as.data.frame(get.edgelist(Gmst, names = TRUE))
        colnames(blastTable_mst) = c("query", "subject")
        blastTable_mst$sign = E(Gmst)$sign

        d = dfs(Gmst, root = 1, father = TRUE, order.out = TRUE)
        esign = E(Gmst)$sign
        rc = rep(FALSE, vcount(Gmst))
        j = 0
        p = 0
        for (v in d$order[-1]) {
            j = j + 1
            f = as.numeric(d$father)[v]
            if (is.na(f)) {
                next
            }
            eid = get.edge.ids(Gmst, c(v, f))
            if (esign[eid] == -1) {
                ie = unique(c(which(blastTable_mst$query %in% V(Gmst)$name[v]), which(blastTable_mst$subject %in%
                    V(Gmst)$name[v]))) # incident edges
                esign[ie] = esign[ie] * -1
                rc[v] = TRUE
                p = p + 1
                if (p > 50) {
                    p = 0
                    cat("\r")
                    cat(j, " : ", sum(esign)/length(esign))
                }
            }
        }
        cat("\t")
        flip_names_all[[ii]] = flip_names = V(Gmst)$name[rc]

        if (!exists("blastTable")) {
            load(tmpB)
            unlink(tmpB)
        }
        ## add nofit later!!
        s.mean = with(blastTable, (s.start + s.end)/2)
        s.mean_corrected = ifelse(blastTable$subject %in% flip_names, blastTable$s.length -
            s.mean, s.mean)
        q.mean = with(blastTable, (q.start + q.end)/2)
        q.mean_corrected = ifelse(blastTable$query %in% flip_names, blastTable$q.length -
            q.mean, q.mean)
        V1 = ifelse(s.mean_corrected > q.mean_corrected, blastTable$subject, blastTable$query)
        V2 = ifelse(s.mean_corrected > q.mean_corrected, blastTable$query, blastTable$subject)
        sign_final = ifelse(V1 %in% flip_names, -1, 1) * ifelse(V2 %in% flip_names, -1,
            1) * blastTable$sign
        nofit = unique(c(V1[sign_final == -1], V2[sign_final == -1]))
        prop_of_notfit[ii] = length(nofit)/vcount(Gmst)
        ## check if correctly oriented
        cat("prop notfit", prop_of_notfit[[ii]], "\n")
        if (prop_of_notfit[[ii]] < thr_fit){
            ## OK
            break
        }
    }
    if (!prop_of_notfit[[ii]] < thr_fit){
        ## get the best solution
        ii_best = which.min(prop_of_notfit)
        if (ii != ii_best){ # if the last solution was not the best, take the best one
            flip_names = flip_names_all[[ii_best]]
            s.mean = with(blastTable, (s.start + s.end)/2)
            s.mean_corrected = ifelse(blastTable$subject %in% flip_names, blastTable$s.length -
                s.mean, s.mean)
            q.mean = with(blastTable, (q.start + q.end)/2)
            q.mean_corrected = ifelse(blastTable$query %in% flip_names, blastTable$q.length -
                q.mean, q.mean)
            V1 = ifelse(s.mean_corrected > q.mean_corrected, blastTable$subject, blastTable$query)
            V2 = ifelse(s.mean_corrected > q.mean_corrected, blastTable$query, blastTable$subject)
            sign_final = ifelse(V1 %in% flip_names, -1, 1) * ifelse(V2 %in% flip_names, -1,
                1) * blastTable$sign
            nofit = unique(c(V1[sign_final == -1], V2[sign_final == -1]))

        }
    }

    ## exclude all nofit

    df2 = data.frame(V1, V2, sign = blastTable$sign, sign_final = sign_final, stringsAsFactors = FALSE)
    rm(blastTable)
    gc()
    vertices = data.frame(name = vnames, reverse_complement = vnames %in% flip_names)
    G = graph.data.frame(df2, vertices = vertices)
    vcount_ori = vcount(G)
    G = induced.subgraph(G, vids = which(!V(G)$name %in% nofit))

    G$escore_mst = sum(esign)/length(esign)
    G$escore = sum(sign_final == 1)/length(sign_final)
    cc = clusters(G, "strong")
    mb = as.numeric(factor(membership(cc), levels = as.numeric(names(sort(table(membership(cc)),
        decreasing = TRUE)))))
    names(mb) = V(G)$name
    G$loop_index = max(cc$csize)/vcount(G)
    G$coverage = vcount(G)/vcount_ori
    ## check sign in all edges
    V(G)$membership = mb
    save(G, file = directed_graph_destination)
    ## remove nofit
    seqs = seqs[!names(seqs) %in% nofit]
    seqs[names(seqs) %in% flip_names] = reverseComplement(seqs[names(seqs) %in% flip_names])
    names(seqs) = paste(names(seqs), mb[names(seqs)])
    writeXStringSet(seqs, filepath = oriented_sequences)
    ## calculate satellite probability
    if (is.null(satellite_model_path)) {
        pSAT = isSAT = NULL
    } else {
        satellite_model = readRDS(satellite_model_path)
        pSAT = get_prob(G$loop_index, pair_completeness, model = satellite_model)
        isSAT = isSatellite(G$loop_index, pair_completeness, model = satellite_model)
    }

    ## get the largest cc
    output = list(escore = G$escore, coverage = G$coverage, escore_mts = G$escore_mst,
        loop_index = G$loop_index, pair_completeness = pair_completeness, graph_file = graph_destination,
        oriented_sequences = oriented_sequences, vcount = vcount(G), ecount = ecount(G),
        satellite_probability = pSAT, satellite = isSAT)
    ## clean up
    all_objects = ls()
    do_not_remove = "output"
    rm(list = all_objects[!(all_objects %in% do_not_remove)])
    gc(verbose = FALSE, reset = TRUE)
    return(list2dictionary(output))
}

list2dictionary = function(l){
    dict = "{"
    q = '"'
    for (i in 1:length(l)){
        if (class(l[[i]]) == "character" | is.null(l[[i]])){
            q2 = "'''"
        } else {
            q2 = ''
        }
        dict = paste0(
            dict,
            q, names(l)[i], q, ":", q2, l[[i]], q2, ", "
        )
    }
    dict = paste0(dict, "}")
    return(dict)
}
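
## example (not executed): serializes a named list into a python-style dict
## string; character values are wrapped in triple quotes
if (FALSE) {
    list2dictionary(list(a = 1, b = "x"))
    ## gives: {"a":1, "b":'''x''', }   (trailing comma is valid in python)
}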

wrap = Vectorize(function(s, width = 80) {
    i1 = seq(1, nchar(s), width)
    i2 = seq(width, by = width, length.out = length(i1))
    return(paste(substring(s, i1, i2), collapse = "\n"))
})
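
## example (not executed): hard-wraps long consensus strings so they render
## inside the <pre> blocks of the HTML report
if (FALSE) {
    cat(wrap(paste(rep("ACGT", 50), collapse = ""), width = 40))
}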


tarean = function(input_sequences, output_dir, min_kmer_length = 11, max_kmer_length = 27,
    CPU = 2, sample_size = 10000, reorient_reads = TRUE, tRNA_database_path = NULL,
    include_layout = TRUE, paired = TRUE, lock_file = NULL) {
    options(CPU = CPU)
    time0 = Sys.time()
    dir.create(output_dir)
    input_sequences_copy = paste(output_dir, "/", basename(input_sequences), sep = "")

    if (!file.copy(input_sequences, input_sequences_copy, overwrite = TRUE)) {
        cat(paste("cannot copy", input_sequences, " to", output_dir), "\n")
        stop()
    }

    lock_file = waitForRAM(lock_file = lock_file)
    pair_completeness = NULL
    ## sampling
    if (sample_size != 0) {
        s = readDNAStringSet(input_sequences_copy)
        N = length(s)
        ## pair completeness must be calculated before sampling!
        if (N > sample_size) {
            writeXStringSet(sample(s, sample_size), filepath = input_sequences_copy)
            if (paired) {
                pair_counts = tabulate(table(gsub(".$", "", names(s))))
                pair_completeness = 1 - pair_counts[1]/sum(pair_counts)
            }
        }
        rm(s)
    }

    if (reorient_reads) {
        input_sequences_oriented = paste0(input_sequences_copy, "oriented.fasta")
        graph_file = paste0(input_sequences_copy, "_graph.RData")
        GLfile = paste0(input_sequences_copy, "_graph.GL")
        cat("reorienting sequences\n")

        blastTable = mgblast(input_sequences_copy, input_sequences_copy)

        graph_info = mgblast2graph(blastTable, input_sequences_copy, GLfile, graph_file,
            input_sequences_oriented, include_layout = include_layout, paired = paired,
            pair_completeness = pair_completeness)

        ## interrupt if it does not look like a tandem repeat at all # soft threshold!
        if (is.null(graph_info$pair_completeness)) {
            if (graph_info$loop_index <= 0.4) {
                cat("CLUSTER DID NOT PASS THRESHOLD!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
                return(list2dictionary(list(graph_info = graph_info)))
            }
        } else {
            ## for paired reads:
            if (graph_info$loop_index < 0.7 | graph_info$pair_completeness < 0.4) {
                cat("CLUSTER DID NOT PASS THRESHOLD!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
                return(list2dictionary(list(graph_info = graph_info)))
            }
        }

    } else {
        input_sequences_oriented = input_sequences_copy
        graph_info = NULL
    }



    kmer_counts = list()
    kmer_lengths = seq(min_kmer_length, max_kmer_length, by = 4)
    for (i in kmer_lengths) {
        ## use the external python script - faster
        cmd = paste(script.dir, "/kmer_counting.py ", input_sequences_oriented, " ",
            i, sep = "")
        cat("calculating kmers of length ", i, "\n")
        f = system(cmd, intern = TRUE)
        x = read.table(f, as.is = TRUE, sep = "\t")
        kmer_counts[[as.character(i)]] = data.frame(x, freq = x$V2/sum(x$V2))
    }

    ## number of kmers:
    N50 = sapply(kmer_counts, function(x) {
        sum(cumsum(x$freq) < 0.5)
    })

    N70 = sapply(kmer_counts, function(x) {
        sum(cumsum(x$freq) < 0.7)
    })


    time1 = Sys.time()
    ggmin = mclapply(kmer_counts, get_mimimal_cc, start = 0.5, mc.cores = CPU)
    time2 = Sys.time()
    cat("kmers graphs created ")
    print(time2 - time1)
    names(ggmin) = names(kmer_counts)


    ## estimate monomer
    monomers = mclapply(ggmin, estimate_monomer, mc.cores = CPU)

    names(monomers) = names(kmer_counts)
    monomers = monomers[!sapply(monomers, is.null)]
    ## error handling:
    error = sapply(monomers, class) == "try-error"
    if (any(error)) {
        cat("\nError in monomers estimation: ")
        cat("calculation failed for monomers length ", names(monomers)[error], "\n")
        print(monomers[error])
        if (any(!error)) {
            monomers = monomers[!error]
        } else {
            stop("monomer estimation failed")
        }
    }

    ## summary - make function!!
    total_score = list()
    k = 0
    length_best = numeric()
    score_bn = numeric()
    consensus = character()

    for (i in names(monomers)) {
        for (v in seq_along(monomers[[i]]$estimates)) {
            k = k + 1
            total_score[[k]] = c(kmer = as.numeric(i), variant = v, total_score = monomers[[i]]$estimates[[v]]$total_score)
            score_bn[k] = min(rowSums(monomers[[i]]$estimates[[v]]$CM))
            length_best[k] = monomers[[i]]$estimates[[v]]$length_variant_score[1,
                1]
            consensus[[k]] = monomers[[i]]$estimates[[v]]$consensus2
        }
    }
    summary_table = as.data.frame(do.call(rbind, total_score))
    summary_table$monomer_length = length_best
    summary_table$consensus_length = nchar(consensus)
    summary_table$score_bn = score_bn
    summary_table$consensus = paste("<pre>", wrap(consensus, width = 80), "</pre>",
        sep = "")
    consensus_DS = DNAStringSet(consensus)
    names(consensus_DS) = with(summary_table, paste0(kmer, "_", variant, "_sc_",
        signif(total_score), "_l_", monomer_length))

    ## filter by score - keep

    ## reorder by score
    consensus_DS = consensus_DS[order(summary_table$total_score, decreasing = TRUE)]
    summary_table = summary_table[order(summary_table$total_score, decreasing = TRUE),
        ]
    rownames(summary_table) = NULL
    N = nrow(summary_table)
    ## concatenate the consensus (i.e. make a head-to-tail dimer) for pbs detection
    ## and other searches; make a 'pseudo contig' multimer for mapping - min length 200 bp
    consensus_DS_dimer = DNAStringSet(paste0(consensus_DS, consensus_DS))
    tarean_contigs = DNAStringSet(sapply(consensus_DS, function(x)
        ifelse(nchar(x) < 200,
               paste(rep(as.character(x), round(300/nchar(as.character(x)) + 1)), collapse = ''),
               as.character(x)))
    )

    names(consensus_DS_dimer) = names(consensus_DS)
    # save sequences:
    consensus_DS_dimer_file = paste0(output_dir, "/consensus_dimer.fasta")
    consensus_DS_file = paste0(output_dir, "/consensus.fasta")
    tarean_contig_file = paste0(output_dir, "/tarean_contigs.fasta")
    writeXStringSet(consensus_DS, consensus_DS_file)
    writeXStringSet(tarean_contigs, tarean_contig_file)
    writeXStringSet(consensus_DS_dimer, consensus_DS_dimer_file)
    if (is.null(tRNA_database_path)) {
        pbs_score = -1
    } else {
        pbs_blast_table = paste0(output_dir, "/pbs_detection")
        pbs_score = detect_pbs(dimers_file = consensus_DS_dimer_file, tRNA_database_path = tRNA_database_path,
            reads_file = input_sequences_oriented, output = pbs_blast_table)
    }
    ## search for open reading frames; get the length of the longest
    orf_l = max_ORF_length(consensus_DS_dimer)


    dir.create(paste0(output_dir, "/img"), recursive = TRUE)
    summary_table$monomer_length_graph = numeric(N)
    summary_table$graph_image = character(N)
    summary_table$monomer_length_logo = numeric(N)
    summary_table$logo_image = character(N)
    ## export graph and consensus estimate to the cluster directory; the type of
    ## output may change in future
    save(ggmin, file = paste0(output_dir, "/ggmin.RData"))
    save(monomers, file = paste0(output_dir, "/monomers.RData"))

    cat("generating HTML output\n")
    for (i in 1:N) {
        kmer = as.character(summary_table$kmer[i])
        variant = summary_table$variant[i]
        ## export graph
        fout_link = paste0("img/graph_", kmer, "mer_", variant, ".png")
        fout = paste0(output_dir, "/", fout_link)
        ## summary_table$monomer_length_graph[i] = summary_table$monomer_length[i]
        ## summary_table$monomer_length_logo[[i]] = nrow(monomers[[kmer]]$estimates[[variant]]$CM)
        summary_table$monomer_length[[i]] = length(monomers[[kmer]]$estimates[[variant]]$consensus)

        if (i <= 10) {
            png(fout, width = 800, height = 800)
            plot_kmer_graph(ggmin[[kmer]], highlight = unlist(monomers[[kmer]]$paths[[variant]]$tr_paths))
            dev.off()
            summary_table$graph_image[i] = hwriteImage(fout_link, link = fout_link,
                table = FALSE, width = 100, height = 100)
            ## export logo
            png_link = paste0("img/logo_", kmer, "mer_", variant, ".png")
            fout = paste0(output_dir, "/", png_link)
            png(fout, width = 1200, height = round(summary_table$monomer_length[i] *
                1) + 550)
            try(plot_multiline_logo(monomers[[kmer]]$estimates[[variant]]$CM, W = 100))
            dev.off()
            ## export corresponding position probability matrices
            ppm_file = paste0(output_dir, '/ppm_', kmer, "mer_", variant, ".csv")
            ppm_link = paste0('ppm_', kmer, "mer_", variant, ".csv")
            write.table(monomers[[kmer]]$estimates[[variant]]$CM,
                        file = ppm_file,
                        col.names = TRUE, quote = FALSE,
                        row.names = FALSE, sep = "\t")
            summary_table$logo_image[i] = hwriteImage(png_link, link = ppm_link,
                table = FALSE, width = 200, height = 100)
        }

    }

    ## html_report = HTMLReport()

    htmlfile = paste0(output_dir, "/report.html")
    cat(htmlheader, file = htmlfile)
    included_columns = c('kmer', 'variant', 'total_score', 'consensus_length', 'consensus', 'graph_image', 'logo_image')
    summary_table_clean = summary_table[, included_columns]
    colnames(summary_table_clean) = c('k-mer length',
                                      'Variant index',
                                      'k-mer coverage score',
                                      'Consensus length',
                                      'Consensus sequence',
                                      'k-mer based graph',
                                      'Sequence logo')
    HTML(summary_table_clean, file = htmlfile, sortableDF = TRUE)
    HTMLEndFile(file = htmlfile)
    time4 = Sys.time()
    print(time4 - time0)
    if (!is.null(lock_file)){
        print("------removing-lock--------")
        removelock(lock_file)
    }

    print(list(htmlfile = htmlfile, TR_score = summary_table$total_score[1],
        TR_monomer_length = as.numeric(summary_table$consensus_length[1]),
        TR_consensus = summary_table$consensus[1], pbs_score = pbs_score, graph_info = graph_info,
        orf_l = orf_l, tarean_contig_file = tarean_contig_file))
    return(list2dictionary(list(htmlfile = htmlfile, TR_score = summary_table$total_score[1],
        TR_monomer_length = as.numeric(summary_table$consensus_length[1]),
        TR_consensus = summary_table$consensus[1], pbs_score = pbs_score, graph_info = graph_info,
        orf_l = orf_l, tarean_contig_file = tarean_contig_file)))
}


## graph loop index stability
loop_index_instability = function(G) {
    N = 50
    s = seq(vcount(G), vcount(G)/10, length.out = N)
    p = seq(1, 0.1, length.out = N)
    li = numeric()
    for (i in seq_along(s)) {
        print(i)
        gs = induced_subgraph(G, sample(1:vcount(G), round(s[i])))  # sample size must be an integer
        li[i] = max(clusters(gs, "strong")$csize)/vcount(gs)
    }
    instability = lm(li ~ p)$coefficient[2]
    return(instability)
}

isSatellite = function(x, y, model) {
    p = get_prob(x, y, model)
    if (p > model$cutoff) {
        return("Putative Satellite")
    } else {
        return("")
    }
}

get_prob = function(x, y, model) {
    pm = model$prob_matrix
    N = ncol(pm)
    i = round(x * (N - 1)) + 1
    j = round(y * (N - 1)) + 1
    p = pm[i, j]
    return(p)
}
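
## sketch (not executed): the satellite model (loaded from an .rds file
## supplied by the caller) is a probability matrix indexed by loop index (x)
## and pair completeness (y), both expected in [0, 1]
if (FALSE) {
    model = readRDS(satellite_model_path)  # hypothetical path
    get_prob(0.9, 0.8, model)              # satellite probability estimate
    isSatellite(0.9, 0.8, model)           # label if above model$cutoff
}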


detectMemUsage = function() {
    con = textConnection(gsub(" +", " ", readLines("/proc/meminfo")))
    memInfo = read.table(con, fill = TRUE, row.names = 1)
    close(con)
    memUsage = 1 - (memInfo["MemFree", 1] + memInfo["Cached", 1])/memInfo["MemTotal",
        1]
    return(memUsage)
}


makelock <- function(lockfile, lockmsg, CreateDirectories = TRUE){
    lockdir = dirname(lockfile)
    if (!file.exists(lockdir)){
        if (CreateDirectories) dir.create(lockdir, recursive = TRUE)
        else stop("Lock Directory for lockfile ", lockfile, " does not exist")
    }
    if (missing(lockmsg)) lockmsg = paste(system('hostname', intern = TRUE), Sys.getenv("R_SESSION_TMPDIR"))
    if (file.exists(lockfile)) return(FALSE)
    # note: using paste makes the message writing atomic
    cat(paste(lockmsg, "\n", sep = ""), file = lockfile, append = TRUE, sep = "")
    firstline = readLines(lockfile, n = 1)
    if (firstline != lockmsg){
        # somebody else got there first
        return(FALSE)
    } else return(TRUE)
}


removelock <- function(lockfile){
    if (unlink(lockfile) != 0) {
        warning("Unable to remove ", lockfile)
        return(FALSE)
    }
    return(TRUE)
}


waitForRAM = function(p = 0.5, lock_file = NULL) {
    if (detectMemUsage() < p) {
        return(NULL)
    } else {
        ## check the lock file:
        cat("waiting for RAM \n")
        free_count = 0
        while (TRUE) {
            if (makelock(lock_file)){
                print("---------locking--------")
                return(lock_file)
            }
            if (detectMemUsage() < p) {
                cat("RAM freed \n")
                return(NULL)
            }
            Sys.sleep(5)
            if (evaluate_user_cpu_usage() == 'free'){
                free_count = free_count + 1
            } else {
                free_count = 0
            }
            if (detectMemUsage() < 0.8 & free_count > 100){
                cat("RAM not free but nothing else is running \n")
                return(NULL)
            }
        }
    }
}

lsmem = function() {
    g = globalenv()
    out_all = envs = list()
    envs = append(envs, g)
    total_size = numeric()
    while (environmentName(g) != "R_EmptyEnv") {
        g <- parent.env(g)
        envs = append(envs, g)
    }
    for (e in envs) {

        obj = ls(envir = e)
        if (length(obj) == 0) {
            break
        }
        obj.size = list()
        for (i in obj) {
            obj.size[[i]] = object.size(get(i, envir = e))
        }
        out = data.frame(object = obj, size = unlist(obj.size), stringsAsFactors = FALSE)
        out = out[order(out$size, decreasing = TRUE), ]
        out_all = append(out_all, out)
        total_size = append(total_size, sum(out$size))
    }
    return(list(objects = out_all, total_size = total_size))
}

evaluate_user_cpu_usage = function(){
    user = Sys.info()["user"]
    a = sum(as.numeric(system(paste("ps -e -o %cpu -u", user), intern = TRUE)[-1]))
    s = substring(system(paste("ps -e -o stat -u", user), intern = TRUE)[-1], 1, 1)
    if (a < 5 & sum(s %in% 'D') == 0 & sum(s %in% 'R') < 2){
        status = 'free'
    } else {
        status = 'full'
    }
    return(status)
}
