| 0 | 1 #!/user/bin/env Rscript | 
|  | 2 | 
|  | 3 suppressPackageStartupMessages(library(igraph)) | 
|  | 4 suppressPackageStartupMessages(library(parallel)) | 
|  | 5 suppressPackageStartupMessages(library(Biostrings)) | 
|  | 6 suppressPackageStartupMessages(library(scales)) | 
|  | 7 suppressPackageStartupMessages(library(stringr)) | 
|  | 8 suppressPackageStartupMessages(library(hwriter)) | 
|  | 9 suppressPackageStartupMessages(library(R2HTML)) | 
|  | 10 suppressPackageStartupMessages(library(plyr)) | 
|  | 11 suppressPackageStartupMessages(library(dplyr)) | 
|  | 12 | 
|  | 13 max_ORF_length = function(s) { | 
|  | 14   ## check all frames | 
|  | 15   L = 0 | 
|  | 16   for (i in 1:3) { | 
|  | 17     L = max(L, nchar(unlist(strsplit(as.character(translate(subseq(s, i))), "*", | 
|  | 18                                      fixed = TRUE)))) | 
|  | 19     L = max(L, nchar(unlist(strsplit(as.character(translate(subseq(reverseComplement(s), | 
|  | 20                                                                    i))), "*", fixed = TRUE)))) | 
|  | 21   } | 
|  | 22   return(L) | 
|  | 23 } | 
|  | 24 | 
|  | 25 kmers2graph = function(kmers, mode = "strong", prop = NULL) { | 
|  | 26   kmerLength = nchar(kmers[1, 1]) | 
|  | 27   if (ncol(kmers) == 2) { | 
|  | 28     kmers$size = kmers[, 2]/sum(kmers[, 2]) | 
|  | 29   } | 
|  | 30   colnames(kmers) = c("name", "count", "size") | 
|  | 31   if (!is.null(prop)) {  # tohle se nepouziva(prop je null), a je to asi spatne - filtuje se to pred tridenim!! | 
|  | 32     p = cumsum(kmers$size) | 
|  | 33     kmers = kmers[p < prop, ] | 
|  | 34   } | 
|  | 35   N = dim(kmers)[1] | 
|  | 36   kmers = kmers[order(kmers$size), ] | 
|  | 37   ## convert kmers to fasta file | 
|  | 38   kms = data.frame(kmer = substring(kmers$name, 1, kmerLength - 1), ids = 1:nrow(kmers),stringsAsFactors = FALSE) | 
|  | 39   kme = data.frame(kmer = substring(kmers$name, 2), ide = 1:nrow(kmers), stringsAsFactors = FALSE) | 
|  | 40 | 
|  | 41   ## df = merge(kms,kme, by = 'kmer',all=FALSE)[,c(2,3,1)] | 
|  | 42   df = inner_join(kme,kms, by = 'kmer')[,c(2,3)] | 
|  | 43 | 
|  | 44   ## names(kms) = seq_along(kms) | 
|  | 45   ## kme = substring(kmers$name, 2) | 
|  | 46   ## names(kme) = seq_along(kme) | 
|  | 47   ## ## use new blast! | 
|  | 48   ## database = tempfile() | 
|  | 49   ## query = tempfile() | 
|  | 50   ## output = tempfile() | 
|  | 51   ## writeXStringSet(DNAStringSet(kms), filepath = database, format = "fasta") | 
|  | 52   ## writeXStringSet(DNAStringSet(kme), filepath = query, format = "fasta") | 
|  | 53   ## cmd = paste("makeblastdb -in", database, "-dbtype nucl") | 
|  | 54   ## system(cmd, ignore.stdout = TRUE) | 
|  | 55   ## cmd = paste("blastn -outfmt '6 qseqid sseqid pident'  -strand plus -dust no -perc_identity 100 -query ", | 
|  | 56   ##     query, "-db", database, "-word_size", kmerLength - 1, "-out", output) | 
|  | 57   ## system(cmd) | 
|  | 58   ## df = try({ | 
|  | 59   ##   read.table(output, as.is = TRUE) | 
|  | 60   ## }) | 
|  | 61   ## if (class(df) == "try-error"){ | 
|  | 62   ##   print("creation of kmer graph failed") | 
|  | 63   ##   print(query) | 
|  | 64   ##   print(output) | 
|  | 65   ##   print(database) | 
|  | 66   ##   return(NULL) | 
|  | 67   ## } | 
|  | 68   ## unlink(query) | 
|  | 69   ## unlink(paste(database, "*", sep = "")) | 
|  | 70   ## unlist(output) | 
|  | 71   gm_mean = function(x, na.rm = TRUE) { | 
|  | 72     exp(sum(log(x[x > 0]), na.rm = na.rm)/length(x)) | 
|  | 73   } | 
|  | 74 | 
|  | 75   whg = apply(cbind(kmers[df[, 1], 2], V2 = kmers[df[, 2], 2]), 1, gm_mean) | 
|  | 76   G = graph.data.frame(data.frame(V1 = kmers$name[df[, 1]], V2 = kmers$name[df[, | 
|  | 77                                                                                2]], weight = whg), vertices = kmers[, 1:3]) | 
|  | 78                                         # separate to connected components: | 
|  | 79   ccs = clusters(G, mode = mode)$membership | 
|  | 80   sel_cls = which(tabulate(ccs) > 1) | 
|  | 81   Gs = list() | 
|  | 82   for (i in seq_along(sel_cls)) { | 
|  | 83     Gs[[i]] = induced.subgraph(G, vids = which(ccs %in% sel_cls[i])) | 
|  | 84   } | 
|  | 85   ## reorder!!! | 
|  | 86   Gs = Gs[order(sapply(Gs, vcount), decreasing = TRUE)] | 
|  | 87   return(Gs) | 
|  | 88 } | 
|  | 89 | 
|  | 90 | 
|  | 91 OGDFlayout = function(G, ncol = NULL, alg = "fmmm", OGDF = getOption("OGDF")) { | 
|  | 92   ## is ogdf binary available? | 
|  | 93   if (is.null(OGDF)) { | 
|  | 94     OGDF = Sys.getenv("OGDF") | 
|  | 95     if ("" == OGDF) { | 
|  | 96       options(warn = -1) | 
|  | 97       OGDF = system("which runOGDFlayout", intern = TRUE) | 
|  | 98       options(warn = 0) | 
|  | 99       if (length(OGDF) == 0) { | 
|  | 100         cat("path to runOGDFlayout not found\n") | 
|  | 101         return(NULL) | 
|  | 102       } | 
|  | 103 | 
|  | 104     } | 
|  | 105   } | 
|  | 106   if (is.null(ncol)) { | 
|  | 107     if (is.null(E(G)$weight)) { | 
|  | 108       el = data.frame(get.edgelist(G, names = TRUE), rep(1, ecount(G))) | 
|  | 109     } else { | 
|  | 110       el = data.frame(get.edgelist(G, names = TRUE), E(G)$weight) | 
|  | 111     } | 
|  | 112     ncol = paste(tempfile(pattern = as.character(Sys.getpid())), ".layout", sep = "") | 
|  | 113     write.table(el, file = ncol, row.names = FALSE, col.names = FALSE, sep = "\t", | 
|  | 114                 quote = FALSE) | 
|  | 115   } else { | 
|  | 116                                         # copy ncol: | 
|  | 117     ncol_tmp = paste(tempfile(pattern = as.character(Sys.getpid())), ".layout", | 
|  | 118                      sep = "") | 
|  | 119     file.copy(ncol, ncol_tmp) | 
|  | 120     ncol = ncol_tmp | 
|  | 121   } | 
|  | 122   algopt = c("fmmm", "sm", "fme") | 
|  | 123   if (!(alg %in% c("fmmm", "sm", "fme") && TRUE)) { | 
|  | 124     stop("alg must by :", algopt, "\n") | 
|  | 125 | 
|  | 126   } | 
|  | 127 | 
|  | 128                                         # output file: | 
|  | 129   Louts = list() | 
|  | 130   layout_file = tempfile(pattern = as.character(Sys.getpid())) | 
|  | 131   for (i in alg) { | 
|  | 132     cmd = paste(OGDF, "-alg", i, "-iff layout -off layout -out", layout_file, | 
|  | 133                 ncol) | 
|  | 134     system(cmd, intern = TRUE) | 
|  | 135     L = read.table(layout_file, skip = ecount(G)) | 
|  | 136     L = L[match(V(G)$name, L$V2), ] | 
|  | 137     Lout = as.matrix(L[, -(1:2)]) | 
|  | 138     unlink(layout_file) | 
|  | 139     Louts[[i]] = Lout | 
|  | 140 | 
|  | 141   } | 
|  | 142                                         # clean up | 
|  | 143   unlink(ncol) | 
|  | 144   return(Louts) | 
|  | 145 } | 
|  | 146 | 
|  | 147 xcolor_code = c(A = "#FF0000", C = "#00FF00", T = "#0000FF", G = "#FFFF00") | 
|  | 148 | 
|  | 149 kmers2color = function(s, position = NULL) { | 
|  | 150   if (is.null(position)) { | 
|  | 151     position = round(nchar(s[1])/2, 0) | 
|  | 152     ## position = 1 | 
|  | 153     position = nchar(s[1]) | 
|  | 154   } | 
|  | 155   color_code = c(A = "#FF0000", C = "#00FF00", T = "#0000FF", G = "#FFFF00") | 
|  | 156   color_base = substring(s, position, position) | 
|  | 157   colors = color_code[color_base] | 
|  | 158   names(colors) = color_base | 
|  | 159   return(colors) | 
|  | 160 } | 
|  | 161 get_sequence = function(g, v, position = NULL) { | 
|  | 162   s = V(g)$name | 
|  | 163   if (is.null(position)) { | 
|  | 164     position = round(nchar(s[1])/2, 0) | 
|  | 165     ## position = 1 | 
|  | 166     position = nchar(s[1]) | 
|  | 167   } | 
|  | 168   nt = paste(substring(s[v], position, position), collapse = "") | 
|  | 169   return(nt) | 
|  | 170 } | 
|  | 171 | 
|  | 172 | 
|  | 173 get_mimimal_cc = function(km, thr = 20, min_coverage = 0.45, step = 2, start = NULL) { | 
|  | 174   if (is.null(start)) { | 
|  | 175     i = sum(cumsum(km$freq) < 0.5) | 
|  | 176   } else { | 
|  | 177     i = sum(cumsum(km$freq) < start) | 
|  | 178   } | 
|  | 179   continue = TRUE | 
|  | 180   while (continue) { | 
|  | 181     if (i > nrow(km)) { | 
|  | 182       i = nrow(km) | 
|  | 183       continue = FALSE | 
|  | 184       step = 1 | 
|  | 185     } | 
|  | 186     GG = kmers2graph(km[1:i, ]) | 
|  | 187     if (length(GG) > 0) { | 
|  | 188       if (vcount(GG[[1]]) > thr) { | 
|  | 189         if (sum(V(GG[[1]])$size) >= min_coverage) { | 
|  | 190           GG[[1]]$input_coverage = sum(km$freq[1:i]) | 
|  | 191           GG[[1]]$L = OGDFlayout(GG[[1]])[[1]] | 
|  | 192           return(GG[[1]]) | 
|  | 193         } | 
|  | 194       } | 
|  | 195     } | 
|  | 196     i = round(i * step) | 
|  | 197 | 
|  | 198   } | 
|  | 199   if (length(GG) == 0 | is.null(GG)) { | 
|  | 200     return(NULL) | 
|  | 201   } | 
|  | 202 | 
|  | 203   GG[[1]]$input_coverage = sum(km$freq[1:i]) | 
|  | 204   GG[[1]]$L = OGDFlayout(GG[[1]])[[1]] | 
|  | 205   return(GG[[1]]) | 
|  | 206 } | 
|  | 207 | 
|  | 208 | 
|  | 209 | 
|  | 210 paths2string = function(paths) { | 
|  | 211   pathstring = sapply(lapply(lapply(paths, as_ids), substring, 1, 1), paste, collapse = "") | 
|  | 212   return(pathstring) | 
|  | 213 } | 
|  | 214 | 
|  | 215 | 
|  | 216 align_paths = function(paths, G) { | 
|  | 217   shift = rep(NA, length(paths)) | 
|  | 218   thr = 0  # minimal length | 
|  | 219   tr_paths = list() | 
|  | 220   Seqs = list() | 
|  | 221   centre_node = as.numeric(names(sort(table(unlist(paths)), decreasing = TRUE)))[[1]] | 
|  | 222 | 
|  | 223   for (i in seq_along(paths)) { | 
|  | 224     if (centre_node %in% paths[[i]]) { | 
|  | 225       S = which(paths[[i]] %in% centre_node) | 
|  | 226       shift[i] = S | 
|  | 227       if (S == 1) { | 
|  | 228         tr_paths[[i]] = paths[[i]] | 
|  | 229       } else { | 
|  | 230         tr_paths[[i]] = c(paths[[i]][S:length(paths[[i]])], paths[[i]][1:(S - | 
|  | 231                                                                           1)]) | 
|  | 232       } | 
|  | 233       Seqs[[i]] = get_sequence(G, tr_paths[[i]]) | 
|  | 234     } else { | 
|  | 235       shift[i] = NA | 
|  | 236     } | 
|  | 237   } | 
|  | 238   paths_n = lapply(paths, as.numeric) | 
|  | 239   tr_paths_n = do.call(cbind, lapply(tr_paths, as.numeric)) | 
|  | 240   new_shift = shift | 
|  | 241   for (i in which(is.na(shift))) { | 
|  | 242     score = numeric(length(paths_n)) | 
|  | 243     for (S in seq_along(paths_n[[i]])) { | 
|  | 244       if (S == 1) { | 
|  | 245         path_tmp_n = paths_n[[i]] | 
|  | 246       } else { | 
|  | 247         path_tmp_n = c(paths_n[[i]][S:length(paths_n[[i]])], paths_n[[i]][1:(S - | 
|  | 248                                                                              1)]) | 
|  | 249       } | 
|  | 250       score[S] = sum(tr_paths_n == path_tmp_n) | 
|  | 251     } | 
|  | 252     if (sum(score) != 0) { | 
|  | 253       S = which.max(score) | 
|  | 254       new_shift[i] = S | 
|  | 255       if (S == 1) { | 
|  | 256         tr_paths[[i]] = paths[[i]] | 
|  | 257       } else { | 
|  | 258         tr_paths[[i]] = c(paths[[i]][S:length(paths[[i]])], paths[[i]][1:(S - | 
|  | 259                                                                           1)]) | 
|  | 260       } | 
|  | 261       Seqs[[i]] = get_sequence(G, tr_paths[[i]]) | 
|  | 262     } | 
|  | 263   } | 
|  | 264   shift = new_shift | 
|  | 265                                         # try to shift based on the sequences itself | 
|  | 266   return(list(Seqs = Seqs[!is.na(shift)], tr_paths = tr_paths[!is.na(shift)], shift = shift[!is.na(shift)])) | 
|  | 267 } | 
|  | 268 | 
|  | 269 make_consensus = function(paths_info, G) { | 
|  | 270   include = !is.na(paths_info$shift) | 
|  | 271   ## make alignments using mafft | 
|  | 272   aln = mafft(unlist(paths_info$Seqs[include])) | 
|  | 273   CM = calculate_consensus_matrix(aln = aln, tr_paths = paths_info$tr_paths[include], | 
|  | 274                                   G = G) | 
|  | 275   gaps = get_gaps_from_alignment(aln) | 
|  | 276   CMnorm = CM/rowSums(CM) | 
|  | 277   bases = colnames(CM) | 
|  | 278   consensus = sapply(apply(CMnorm, 1, function(x) bases[which(x > 0.2)][order(x[x > | 
|  | 279                                                                                 0.2], decreasing = TRUE)]), paste, collapse = "") | 
|  | 280   consensus2 = gsub("-", "", paste0(substring(consensus, 1, 1), collapse = ""), | 
|  | 281                     fixed = TRUE) | 
|  | 282   number_of_SNP = sum(rowSums(CM > 0) > 1) | 
|  | 283   SNP_positions = which(rowSums(CM > 0) > 1) | 
|  | 284   number_of_position_with_indels = sum(colSums(do.call(rbind, strsplit(as.character(aln), | 
|  | 285                                                                        "")) == "-") > 0) | 
|  | 286   indel_positions = which(colSums(do.call(rbind, strsplit(as.character(aln), "")) == | 
|  | 287                                   "-") > 0) | 
|  | 288   if (length(SNP_positions) > 0) { | 
|  | 289     variable_sites = unlist(c(c(mapply(paste, strsplit((sapply(apply(CMnorm, | 
|  | 290                                                                      1, function(x) bases[which(x > 0.2)]), paste, collapse = ""))[SNP_positions], | 
|  | 291                                                        ""), SNP_positions, sep = "_")), paste("-", indel_positions, sep = "_"))) | 
|  | 292   } else { | 
|  | 293     variable_sites = NULL | 
|  | 294   } | 
|  | 295   variable_positions = unique(SNP_positions, indel_positions) | 
|  | 296   return(list(aln = aln, CM = CM, CMnorm = CMnorm, consensus = consensus, consensus2 = consensus2, | 
|  | 297               number_of_SNP = number_of_SNP, SNP_positions = SNP_positions, number_of_position_with_indels = number_of_position_with_indels, | 
|  | 298               indel_positions = indel_positions, variable_positions = variable_positions, | 
|  | 299               variable_sites = variable_sites, gaps = gaps)) | 
|  | 300 } | 
|  | 301 | 
|  | 302 | 
|  | 303 estimate_monomer = function(G, weights = NULL, limit = NULL) { | 
|  | 304   if (is.null(G)) { | 
|  | 305     return(NULL) | 
|  | 306   } | 
|  | 307   ## estimate monomer from kmer based graph | 
|  | 308   V(G)$id = 1:vcount(G) | 
|  | 309   GS = induced_subgraph(G, vids = which(degree(G) == 2))  ## keep only vertices without branching | 
|  | 310   cls = clusters(GS)$membership | 
|  | 311 | 
|  | 312 | 
|  | 313   ids = mapply(FUN = function(x, y) x[which.max(y)], split(V(GS)$id, cls), split(V(GS)$size, | 
|  | 314                                                                                  cls))  ## from each branch use only one vertex with larges size! | 
|  | 315 | 
|  | 316 | 
|  | 317   ids = ids[order(V(G)$size[ids], decreasing = TRUE)] | 
|  | 318   ids_size = V(G)$size[ids] | 
|  | 319   N50 = sum(cumsum(ids_size)/sum(ids_size) < 0.5) | 
|  | 320   if (length(ids) > 10000) { | 
|  | 321     ids = ids[1:N50] | 
|  | 322   } | 
|  | 323   ## use only large vertices in search!  how many? | 
|  | 324   el = get.edgelist(G, names = FALSE) | 
|  | 325   node_use = numeric(vcount(G)) | 
|  | 326   LL = numeric() | 
|  | 327   ## W=numeric() | 
|  | 328   i = 0 | 
|  | 329   paths = list() | 
|  | 330   if (is.null(weights)) { | 
|  | 331     weights = (max(E(G)$weight) - (E(G)$weight) + 1) | 
|  | 332     weights = E(G)$weight^(-3) | 
|  | 333   } | 
|  | 334   included = rep(FALSE, vcount(G)) | 
|  | 335   W_total = sum(V(G)$size) | 
|  | 336 | 
|  | 337   coverage = numeric() | 
|  | 338   t0 = c() | 
|  | 339   i = 0 | 
|  | 340   j = 0 | 
|  | 341   for (n in ids) { | 
|  | 342     j = j + 1 | 
|  | 343     t0[j] = Sys.time() | 
|  | 344     if (included[n]) { | 
|  | 345       next | 
|  | 346     } | 
|  | 347     m = which(el[, 1] %in% n) | 
|  | 348     i = i + 1 | 
|  | 349     s = get.shortest.paths(G, el[m, 2], el[m, 1], weights = weights, output = "vpath") | 
|  | 350     included[as.numeric(s$vpath[[1]])] = TRUE | 
|  | 351     paths[[i]] = s$vpath[[1]] | 
|  | 352     LL[i] = (length(s$vpath[[1]])) | 
|  | 353   } | 
|  | 354 | 
|  | 355   ## evaluate if paths should be divided to variants - by length and path weight | 
|  | 356   paths_clusters = split(paths, LL) | 
|  | 357   paths_clusters_tr = mclapply(paths_clusters, FUN = align_paths, G = G, mc.cores = getOption("CPU")) | 
|  | 358 | 
|  | 359   ## paths_clusters_tr = lapply(paths_clusters, FUN = align_paths, G = G) consensus | 
|  | 360   paths_consensus = mclapply(paths_clusters_tr, make_consensus, G = G, mc.cores = getOption("CPU")) | 
|  | 361 | 
|  | 362   ## evaluate weight for individual paths: | 
|  | 363   for (v in seq_along(paths_consensus)) { | 
|  | 364     p = paths_clusters_tr[[v]]$tr_paths | 
|  | 365     ## clean | 
|  | 366     p = p[!sapply(p, function(x) anyNA(x) | is.null(x))] | 
|  | 367     L = sapply(p, length) | 
|  | 368     p_groups = split(p, L) | 
|  | 369     w_groups = sapply(p_groups, function(x) sum(V(G)$size[unique(c(sapply(x, | 
|  | 370                                                                           as.numeric)))])) | 
|  | 371     total_score = sum(V(G)$size[unique(c(unlist(sapply(p, as.numeric))))]) | 
|  | 372     LW = data.frame(`Length estimate` = unique(L), weight = w_groups, stringsAsFactors = FALSE) | 
|  | 373     LW = LW[order(LW$weight, decreasing = TRUE), ] | 
|  | 374     rownames(LW) = NULL | 
|  | 375     paths_consensus[[v]]$total_score = total_score | 
|  | 376     paths_consensus[[v]]$length_variant_score = LW | 
|  | 377 | 
|  | 378   } | 
|  | 379   return(list(estimates = paths_consensus, paths = paths_clusters_tr)) | 
|  | 380 } | 
|  | 381 | 
|  | 382 | 
|  | 383 | 
|  | 384 detect_pbs = function(dimers_file, tRNA_database_path, reads_file, output) { | 
|  | 385   ## read_file contain oriented reads! | 
|  | 386   min_offset = 10 | 
|  | 387   max_end_dist = 2 | 
|  | 388   min_aln_length = 30 | 
|  | 389   max_pbs_search = 30 | 
|  | 390   thr_length = 12 | 
|  | 391   end_position = 23 | 
|  | 392   insertion_proportion_threshold <- 0.15 | 
|  | 393   ## read_file contain oriented reads!  for testing | 
|  | 394   if (FALSE) { | 
|  | 395     library(Biostrings) | 
|  | 396     thr_length = 12 | 
|  | 397     end_position = 23 | 
|  | 398     dimers_file = "consensus_dimer.fasta" | 
|  | 399     reads_file = "reads_oriented.fas" | 
|  | 400     tRNA_database_path = "/mnt/raid/users/petr/workspace/repex_tarean/databases/tRNA_database2.fasta" | 
|  | 401 | 
|  | 402   } | 
|  | 403   ## find read which are half aligned do reference dimer | 
|  | 404   dimers_seq = readDNAStringSet(dimers_file) | 
|  | 405   cmd = paste("makeblastdb -in", dimers_file, "-dbtype nucl") | 
|  | 406   system(cmd, ignore.stdout = TRUE) | 
|  | 407   output = paste0(reads_file, "blast_out.cvs") | 
|  | 408   columns = c("qseqid", "qstart", "qend", "qlen", "sseqid", "sstart", "send", "sstrand", | 
|  | 409               "slen", "pident", "length") | 
|  | 410   cmd = paste("blastn -outfmt '6", paste(columns, collapse = " "), "' -perc_identity 90 -query ", | 
|  | 411               reads_file, "-db", dimers_file, "-word_size", 7, "-dust no -num_alignments 99999 -strand plus ", | 
|  | 412               "-out", output) | 
|  | 413   system(cmd) | 
|  | 414   blastdf = read.table(output, as.is = TRUE, col.names = columns, comment.char = "") | 
|  | 415   unlink(output) | 
|  | 416   blastdf = blastdf[blastdf$length >= min_aln_length, ] | 
|  | 417 | 
|  | 418   ## expand two whole read to see unligned reads parts | 
|  | 419   blastdf_expand = blastdf | 
|  | 420   blastdf_expand$qstart = blastdf$qstart - (blastdf$qstart - 1) | 
|  | 421   blastdf_expand$sstart = blastdf$sstart - (blastdf$qstart - 1) | 
|  | 422   blastdf_expand$qend = blastdf$qend + (blastdf$qlen - blastdf$qend) | 
|  | 423   blastdf_expand$send = blastdf$send + (blastdf$qlen - blastdf$qend) | 
|  | 424   pS = blastdf$sstart | 
|  | 425   pE = blastdf$send | 
|  | 426   pSF = blastdf_expand$sstart | 
|  | 427   pEF = blastdf_expand$send | 
|  | 428   cond1 = pS - pSF >= min_offset & blastdf$qend >= (blastdf$qlen - max_end_dist) & | 
|  | 429     blastdf$sstart >= max_pbs_search | 
|  | 430   cond2 = pEF - pE >= min_offset & blastdf$qstart <= max_end_dist & blastdf$send <= | 
|  | 431     blastdf$slen - max_pbs_search | 
|  | 432 | 
|  | 433   ## coverage of alignments: evaluate coverage at site with breaks, it is neccessary | 
|  | 434   ## that there must be at least 20% of interupted alignments at given position to | 
|  | 435   ## be considered for additional search | 
|  | 436   coverage_profiles = subject_coverage(blastdf) | 
|  | 437 | 
|  | 438 | 
|  | 439   ## extract flanking sequences - cca 50nt, it should contain tRNA sequence search | 
|  | 440   ## Left | 
|  | 441   fin = tempfile() | 
|  | 442   fout = tempfile() | 
|  | 443   scoreL = scoreR = 0 | 
|  | 444 | 
|  | 445 | 
|  | 446   ## left side unique insertion sites | 
|  | 447   if (any(cond1)) { | 
|  | 448     insertion_sites = ddply(blastdf[cond1, ], .(sseqid, sstart), nrow) | 
|  | 449     ## check coverage | 
|  | 450     insertion_sites$coverage = 0 | 
|  | 451     for (i in 1:nrow(insertion_sites)) { | 
|  | 452       insertion_sites$coverage[i] = coverage_profiles[[insertion_sites$sseqid[i]]][insertion_sites$sstart[[i]]] | 
|  | 453     } | 
|  | 454     insertion_OK_left <- insertion_sites[with(insertion_sites, V1/coverage > | 
|  | 455                                                                insertion_proportion_threshold), ] | 
|  | 456 | 
|  | 457     if (nrow(insertion_OK_left) > 0) { | 
|  | 458       s = ifelse(insertion_OK_left[, "sstart"] - max_pbs_search < 1, 1, insertion_OK_left[, | 
|  | 459                                                                                           "sstart"] - max_pbs_search) | 
|  | 460       Lpart = subseq(dimers_seq[match(insertion_OK_left$sseqid, names(dimers_seq))], | 
|  | 461                      s, insertion_OK_left$sstart) | 
|  | 462 | 
|  | 463       ## names are CONSENSUSID__READID_position | 
|  | 464       names(Lpart) = paste0(names(Lpart), "__", insertion_OK_left$V1, "__", | 
|  | 465                             insertion_OK_left$sstart) | 
|  | 466       ## check presence TG dinucleotide | 
|  | 467       TG = vcountPattern("TG", subseq(dimers_seq[match(insertion_OK_left$sseqid, | 
|  | 468                                                        names(dimers_seq))], insertion_OK_left$sstart - 2, insertion_OK_left$sstart + | 
|  | 469                                                                                                           2)) | 
|  | 470       if (any(TG > 0)) { | 
|  | 471         writeXStringSet(Lpart[TG > 0], filepath = fin) | 
|  | 472         cmd = paste("blastn -outfmt '6", paste(columns, collapse = " "), | 
|  | 473                     "' -perc_identity 83 -query ", fin, "-db", tRNA_database_path, | 
|  | 474                     "-word_size", 7, "-strand plus -dust no -max_target_seqs 10000 -evalue 100", | 
|  | 475                     "-out", fout) | 
|  | 476 | 
|  | 477         system(cmd, ignore.stdout = TRUE) | 
|  | 478         df = read.table(fout, as.is = TRUE, col.names = columns, comment.char = "") | 
|  | 479         filter1 = df$length >= thr_length | 
|  | 480         filter2 = ifelse(df$send > df$sstart, df$send, df$sstart) >= end_position | 
|  | 481         df_pass = df[filter1 & filter2, , drop = FALSE] | 
|  | 482         df_pass_L = df_pass[!duplicated(df_pass$qseqid), , drop = FALSE] | 
|  | 483         scoreL = get_score(df_pass_L) | 
|  | 484         write.table(df_pass_L, file = paste0(output, "_L.csv"), row.names = FALSE, | 
|  | 485                     sep = "\t") | 
|  | 486       } | 
|  | 487     } | 
|  | 488   } | 
|  | 489   if (any(cond2)) { | 
|  | 490     ## search Right | 
|  | 491     insertion_sites = ddply(blastdf[cond2, ], .(sseqid, send, slen), nrow) | 
|  | 492     ## check coverage | 
|  | 493     insertion_sites$coverage = 0 | 
|  | 494     for (i in 1:nrow(insertion_sites)) { | 
|  | 495       insertion_sites$coverage[i] = coverage_profiles[[insertion_sites$sseqid[i]]][insertion_sites$send[[i]]] | 
|  | 496     } | 
|  | 497     insertion_OK_right <- insertion_sites[with(insertion_sites, V1/coverage > | 
|  | 498                                                                 insertion_proportion_threshold), ] | 
|  | 499 | 
|  | 500     if (nrow(insertion_OK_right) > 0) { | 
|  | 501       s = ifelse(insertion_OK_right$send + max_pbs_search > insertion_OK_right$slen, | 
|  | 502                  insertion_OK_right$slen, insertion_OK_right$send + max_pbs_search) | 
|  | 503       Rpart = subseq(dimers_seq[match(insertion_OK_right$sseqid, names(dimers_seq))], | 
|  | 504                      insertion_OK_right$send, s) | 
|  | 505       names(Rpart) = paste0(names(Rpart), "__", insertion_OK_right$V1, "__", | 
|  | 506                             insertion_OK_right$send) | 
|  | 507 | 
|  | 508       ## check presence CA dinucleotide | 
|  | 509       CA = vcountPattern("CA", subseq(dimers_seq[match(insertion_OK_right$sseqid, | 
|  | 510                                                        names(dimers_seq))], insertion_OK_right$send - 2, insertion_OK_right$send + | 
|  | 511                                                                                                          2, )) | 
|  | 512       if (any(CA > 0)) { | 
|  | 513         writeXStringSet(Rpart[CA > 0], filepath = fin) | 
|  | 514         cmd = paste("blastn -outfmt '6", paste(columns, collapse = " "), | 
|  | 515                     "' -perc_identity 83 -query ", fin, "-db", tRNA_database_path, | 
|  | 516                     "-word_size", 7, "-strand minus -dust no -max_target_seqs 10000 -evalue 100", | 
|  | 517                     "-out", fout) | 
|  | 518 | 
|  | 519         system(cmd, ignore.stdout = TRUE) | 
|  | 520         df = read.table(fout, as.is = TRUE, col.names = columns, comment.char = "") | 
|  | 521         filter1 = df$length >= thr_length | 
|  | 522         filter2 = ifelse(df$send > df$sstart, df$send, df$sstart) >= end_position | 
|  | 523         df_pass = df[filter1 & filter2, , drop = FALSE] | 
|  | 524         df_pass_R = df_pass[!duplicated(df_pass$qseqid), , drop = FALSE] | 
|  | 525         write.table(df_pass_R, file = paste0(output, "_R.csv"), row.names = FALSE, | 
|  | 526                     sep = "\t") | 
|  | 527         scoreR = get_score(df_pass_R) | 
|  | 528       } | 
|  | 529     } | 
|  | 530   } | 
|  | 531   unlink(fin) | 
|  | 532   unlink(fout) | 
|  | 533   return(max(scoreL, scoreR)) | 
|  | 534 } | 
|  | 535 | 
|  | 536 | 
|  | 537 subject_coverage = function(blastdf) { | 
|  | 538   ## calculate coverage for all blast subjects | 
|  | 539   coverage_profiles <- by(blastdf, INDICES = blastdf$sseqid, FUN = function(x) { | 
|  | 540     as.numeric(coverage(IRanges(start = x$sstart, end = x$send))) | 
|  | 541   }) | 
|  | 542   return(coverage_profiles) | 
|  | 543 | 
|  | 544 } | 
|  | 545 | 
|  | 546 get_score = function(x) { | 
|  | 547   ## keep best tRNA | 
|  | 548   if (nrow(x) == 0) { | 
|  | 549     return(0) | 
|  | 550   } | 
|  | 551   xm = data.frame(do.call(rbind, strsplit(x$qseqid, "__")), stringsAsFactors = FALSE) | 
|  | 552   xm$score = as.numeric(sapply(strsplit(xm[, 1], "_"), "[[", 4)) | 
|  | 553   xm$AA = gsub("-.+$", "", gsub("^.+__", "", x$sseqid)) | 
|  | 554   best_score = max(by(xm$score, INDICES = xm$AA, FUN = sum)) | 
|  | 555   return(best_score) | 
|  | 556 } | 
|  | 557 | 
|  | 558 | 
|  | 559 | 
|  | 560 dotter = function(seq1, seq2 = NULL, params = "") { | 
|  | 561   if (is.null(seq2)) { | 
|  | 562     seq2 = seq1 | 
|  | 563   } | 
|  | 564   library(Biostrings) | 
|  | 565   if (class(seq1) != "DNAStringSet") { | 
|  | 566     seq1 = BStringSet(seq1) | 
|  | 567   } | 
|  | 568   if (class(seq2) != "DNAStringSet") { | 
|  | 569     seq2 = BStringSet(seq2) | 
|  | 570   } | 
|  | 571   sf1 = tempfile("seq1") | 
|  | 572   writeXStringSet(seq1, file = sf1) | 
|  | 573   sf2 = tempfile("seq2") | 
|  | 574   writeXStringSet(seq2, file = sf2) | 
|  | 575   system(paste("dotter", params, sf1, sf2), wait = FALSE) | 
|  | 576   Sys.sleep(2) | 
|  | 577   unlink(c(sf1, sf2)) | 
|  | 578   return(NULL) | 
|  | 579 } | 
|  | 580 | 
|  | 581 | 
|  | 582 | 
|  | 583 dotter2 = function(seq1, seq2 = NULL, params = NULL) { | 
|  | 584   if (is.null(seq2)) { | 
|  | 585     seq2 = seq1 | 
|  | 586   } | 
|  | 587   if (is.null(params)) { | 
|  | 588     params = " -windowsize 30 -threshold 45 " | 
|  | 589   } | 
|  | 590   library(Biostrings) | 
|  | 591   if (class(seq1) != "DNAStringSet") { | 
|  | 592     seq1 = DNAStringSet(seq1) | 
|  | 593   } | 
|  | 594   if (class(seq2) != "DNAStringSet") { | 
|  | 595     seq2 = DNAStringSet(seq2) | 
|  | 596   } | 
|  | 597   L1 = nchar(seq1) | 
|  | 598   L2 = nchar(seq2) | 
|  | 599 | 
|  | 600   tmpdat1 = tempfile() | 
|  | 601 | 
|  | 602   dir.create(tmpdat1) | 
|  | 603   tmpdat2 = tempfile() | 
|  | 604   dir.create(tmpdat2) | 
|  | 605   oridir = getwd() | 
|  | 606 | 
|  | 607 | 
|  | 608   seq1_merged = DNAStringSet(paste(seq1, collapse = "")) | 
|  | 609   seq2_merged = DNAStringSet(paste(seq2, collapse = "")) | 
|  | 610   seq2rc_merged = reverseComplement(DNAStringSet(paste(seq2, collapse = ""))) | 
|  | 611 | 
|  | 612 | 
|  | 613   sf1 = tempfile("seq1") | 
|  | 614   writeXStringSet(seq1_merged, filepath = sf1) | 
|  | 615   sf2 = tempfile("seq2") | 
|  | 616   sf2rc = tempfile("seq2rc") | 
|  | 617   writeXStringSet(seq2_merged, filepath = sf2) | 
|  | 618   writeXStringSet(seq2rc_merged, filepath = sf2rc) | 
|  | 619 | 
|  | 620   cmd1 = paste("dotmatcher -graph data -asequence ", sf1, "-bsequence", sf2, params) | 
|  | 621   cmd2 = paste("dotmatcher -graph data -asequence ", sf1, "-bsequence", sf2rc, | 
|  | 622                params) | 
|  | 623   setwd(tmpdat1) | 
|  | 624   output1 = system(cmd1, intern = TRUE) | 
|  | 625   setwd(tmpdat2) | 
|  | 626   output2 = system(cmd2, intern = TRUE) | 
|  | 627   setwd(oridir) | 
|  | 628 | 
|  | 629 | 
|  | 630 | 
|  | 631   fout1 = strsplit(tail(output1, n = 1), split = " ")[[1]][2] | 
|  | 632   rawdat1 = readLines(paste(tmpdat1, "/", fout1, sep = "")) | 
|  | 633   rawdat1 = rawdat1[1:min(grep("^Rectangle", rawdat1))] | 
|  | 634 | 
|  | 635   if (length(rawdat1[grep("^Line", rawdat1)]) == 0) { | 
|  | 636     coord1 = NULL | 
|  | 637   } else { | 
|  | 638     coord1 = apply(sapply(strsplit(rawdat1[grep("^Line", rawdat1)], " "), "[", | 
|  | 639                           c(3, 5, 7, 9, 11)), 1, as.numeric) | 
|  | 640     coord1 = matrix(coord1, ncol = 5) | 
|  | 641   } | 
|  | 642 | 
|  | 643   fout2 = strsplit(tail(output2, n = 1), split = " ")[[1]][2] | 
|  | 644   rawdat2 = readLines(paste(tmpdat2, "/", fout2, sep = "")) | 
|  | 645   rawdat2 = rawdat2[1:min(grep("^Rectangle", rawdat2))] | 
|  | 646 | 
|  | 647   if (length(rawdat2[grep("^Line", rawdat2)]) == 0) { | 
|  | 648     coord2 = NULL | 
|  | 649   } else { | 
|  | 650     coord2 = apply(sapply(strsplit(rawdat2[grep("^Line", rawdat2)], " "), "[", | 
|  | 651                           c(3, 5, 7, 9, 11)), 1, as.numeric) | 
|  | 652     coord2 = matrix(coord2, ncol = 5) | 
|  | 653   } | 
|  | 654   unlink(sf1) | 
|  | 655   unlink(sf2) | 
|  | 656   unlink(sf2rc) | 
|  | 657   unlink(tmpdat1, recursive = TRUE) | 
|  | 658   unlink(tmpdat2, recursive = TRUE) | 
|  | 659 | 
|  | 660   N1 = sum(nchar(seq1)) | 
|  | 661   N2 = sum(nchar(seq2)) | 
|  | 662   op = par(xaxs = "i", yaxs = "i", mar = c(5, 2, 6, 10), las = 1) | 
|  | 663   on.exit(par(op)) | 
|  | 664   plot(c(1, N1), c(1, N2), type = "n", xlab = "", ylab = "", axes = FALSE) | 
|  | 665   if (!is.null(coord1)) { | 
|  | 666     segments(x0 = coord1[, 1], y0 = coord1[, 2], x1 = coord1[, 3], y1 = coord1[, | 
|  | 667                                                                                4]) | 
|  | 668   } | 
|  | 669   if (!is.null(coord2)) { | 
|  | 670     segments(x0 = coord2[, 1], y0 = N2 - coord2[, 2], x1 = coord2[, 3], y1 = N2 - | 
|  | 671                                                                           coord2[, 4]) | 
|  | 672   } | 
|  | 673   abline(v = c(0, cumsum(L1)), col = "green") | 
|  | 674   abline(h = c(0, cumsum(L2)), col = "green") | 
|  | 675   box() | 
|  | 676   axis(1, at = c(1, cumsum(L1))[-(length(L1) + 1)], labels = names(seq1), hadj = 0, | 
|  | 677        cex.axis = 0.7) | 
|  | 678   axis(4, at = c(1, cumsum(L2))[-(length(L2) + 1)], labels = names(seq2), hadj = 0, | 
|  | 679        cex.axis = 0.7) | 
|  | 680   invisible(list(coord1, coord2)) | 
|  | 681 } | 
|  | 682 | 
|  | 683 | 
|  | 684 CM2sequence = function(x) { | 
|  | 685   bases = c(colnames(x), "N") | 
|  | 686   x = cbind(x, 0) | 
|  | 687   colnames(x) = bases | 
|  | 688   seqs = paste(bases[apply(x, 1, which.max)], collapse = "") | 
|  | 689 | 
|  | 690   return(seqs) | 
|  | 691 } | 
|  | 692 | 
|  | 693 ## in alingment calculate significance from kmers | 
|  | 694 calculate_consensus_matrix = function(aln, tr_paths, G) { | 
|  | 695   bases = c("A", "C", "G", "T", "-") | 
|  | 696   positions = lapply(strsplit(as.character(aln), split = ""), function(x) which(!x %in% | 
|  | 697                                                                                 "-")) | 
|  | 698   base_matrix = do.call(rbind, strsplit(as.character(aln), split = "")) | 
|  | 699   weights = matrix(0, nrow = length(aln), ncol = nchar(aln)) | 
|  | 700   kmer_rel_proportions = (V(G)$size/table(factor(unlist(tr_paths), levels = 1:vcount(G)))) | 
|  | 701   for (i in seq_along(positions)) { | 
|  | 702     weights[i, positions[[i]]] = kmer_rel_proportions[tr_paths[[i]]] | 
|  | 703   } | 
|  | 704   names(kmer_rel_proportions) = V(G)$name | 
|  | 705   ## get weights for gaps by approximation | 
|  | 706   fitgaps = function(y) { | 
|  | 707     if (sum(y == 0) == 0) { | 
|  | 708       return(y) | 
|  | 709     } else { | 
|  | 710       y0 = rep(y[y != 0], 3) | 
|  | 711       x0 = which(rep(y, 3) != 0) | 
|  | 712       fitted = approx(x0, y0, xout = seq_along(rep(y, 3)), rule = 1) | 
|  | 713     } | 
|  | 714     return(fitted$y[-seq_along(y)][seq_along(y)]) | 
|  | 715   } | 
|  | 716   weights_with_gaps = t(apply(weights, 1, fitgaps)) | 
|  | 717   ## weights_with_gaps = get_gap_weights(weights, aln,G) ## this step take wey too | 
|  | 718   ## long time!!!-not so important TODO handle gaps differently - more effectively | 
|  | 719 | 
|  | 720   ## get consensus matrix | 
|  | 721   CM = sapply(1:nchar(aln[[1]]), function(i) { | 
|  | 722     sapply(split(weights_with_gaps[, i], factor(base_matrix[, i], levels = bases)), | 
|  | 723            sum) | 
|  | 724   }) | 
|  | 725   return(t(CM)[, 1:5]) | 
|  | 726 } | 
|  | 727 | 
|  | 728 | 
|  | 729 get_gaps_from_alignment = function(aln) { | 
|  | 730   as.character(aln) | 
|  | 731   gaps_positions = unique(do.call(rbind, str_locate_all(as.character(aln), "-+"))) | 
|  | 732   return(gaps_positions) | 
|  | 733 } | 
|  | 734 | 
|  | 735 plot_kmer_graph = function(G, L = NULL, vertex.size = NULL, ord = NULL, upto = NULL, | 
|  | 736                            highlight = NULL) { | 
|  | 737   if (!is.null(G$L) & is.null(L)) { | 
|  | 738     L = G$L | 
|  | 739   } | 
|  | 740   if (is.null(L)) { | 
|  | 741     ## L=layout.kamada.kawai(G) | 
|  | 742     L = OGDFlayout(G)[[1]] | 
|  | 743   } | 
|  | 744   clr = kmers2color(V(G)$name) | 
|  | 745   if (!is.null(highlight)) { | 
|  | 746     clr[highlight] = "#00000080" | 
|  | 747   } | 
|  | 748   if (!is.null(ord)) { | 
|  | 749     clr[ord[1:upto]] = paste(clr[ord[1:upto]], "30", sep = "") | 
|  | 750   } | 
|  | 751   if (is.null(vertex.size)) { | 
|  | 752     vertex.size = rescale(V(G)$size, to = c(0.5, 6)) | 
|  | 753 | 
|  | 754   } | 
|  | 755   plot(G, layout = L, vertex.label = "", vertex.size = vertex.size, edge.curved = FALSE, | 
|  | 756        vertex.color = clr, vertex.frame.color = "#00000020", edge.width = 2, edge.arrow.mode = 1, | 
|  | 757        edge.arrow.size = 0.2) | 
|  | 758 | 
|  | 759 } | 
|  | 760 | 
|  | 761 rglplot_kmer_graph = function(G, L = NULL, vertex.size = 4) { | 
|  | 762   if (is.null(L)) { | 
|  | 763     L = layout.kamada.kawai(G) | 
|  | 764   } | 
|  | 765 | 
|  | 766   rglplot(G, layout = L, vertex.label = "", vertex.size = vertex.size, edge.curved = FALSE, | 
|  | 767           vertex.color = kmers2color(V(G)$name), edge.width = 2, edge.arrow.mode = 1, | 
|  | 768           edge.arrow.size = 0.5) | 
|  | 769 } | 
|  | 770 | 
|  | 771 | 
|  | 772 | 
|  | 773 | 
|  | 774 mafft = function(seqs, params = "--auto --thread 1 ") { | 
|  | 775   if (length(seqs) < 2) { | 
|  | 776     return(seqs) | 
|  | 777   } | 
|  | 778   infile = tempfile() | 
|  | 779   if (class(seqs) == "character") { | 
|  | 780     seqs = DNAStringSet(seqs) | 
|  | 781   } | 
|  | 782   writeXStringSet(seqs, file = infile) | 
|  | 783   outfile = tempfile() | 
|  | 784   cmd = paste("mafft --quiet --nuc ", params, infile, "2> /dev/null > ", outfile) | 
|  | 785   system(cmd, intern = TRUE, ignore.stderr = FALSE) | 
|  | 786   aln = readDNAStringSet(outfile) | 
|  | 787   unlink(c(outfile, infile)) | 
|  | 788   return(aln) | 
|  | 789 } | 
|  | 790 | 
|  | 791 mgblast = function(databaseSeq, querySeq) { | 
|  | 792   params = " -p 85 -W18 -UT -X40 -KT -JF -F \"m D\" -v100000000 -b100000000 -D4 -C 30 -D 30 " | 
|  | 793                                         # no dust filtering: | 
|  | 794   paramsDF = " -p 85 -W18 -UT -X40 -KT -JF -F \"m D\" -v100000000 -b100000000 -D4 -F F" | 
|  | 795 | 
|  | 796   database = tempfile() | 
|  | 797   query = tempfile() | 
|  | 798   output = tempfile() | 
|  | 799   if (class(databaseSeq) == "character") { | 
|  | 800     database = databaseSeq | 
|  | 801     do_not_delete_database = TRUE | 
|  | 802   } else { | 
|  | 803     writeXStringSet(databaseSeq, filepath = database, format = "fasta") | 
|  | 804     do_not_delete_database = FALSE | 
|  | 805   } | 
|  | 806   if (class(querySeq) == "character") { | 
|  | 807     query = querySeq | 
|  | 808     do_not_delete_query = TRUE | 
|  | 809   } else { | 
|  | 810     writeXStringSet(querySeq, filepath = query, format = "fasta") | 
|  | 811     do_not_delete_query = FALSE | 
|  | 812   } | 
|  | 813   ## create database: | 
|  | 814   cmd = paste("formatdb -i", database, "-p F") | 
|  | 815   system(cmd) | 
|  | 816   cmd = paste("mgblast", "-d", database, "-i", query, "-o", output, params) | 
|  | 817   system(cmd) | 
|  | 818   if (file.info(output)$size == 0) { | 
|  | 819                                         # no hist, try wthou dust masker | 
|  | 820     cmd = paste("mgblast", "-d", database, "-i", query, "-o", output, paramsDF) | 
|  | 821     system(cmd) | 
|  | 822   } | 
|  | 823   blastOut = read.table(output, sep = "\t", header = FALSE, as.is = TRUE, comment.char = "") | 
|  | 824   unlink(output) | 
|  | 825   if (!do_not_delete_query) { | 
|  | 826     unlink(query) | 
|  | 827   } | 
|  | 828   if (!do_not_delete_database) { | 
|  | 829     unlink(paste(database, "*", sep = "")) | 
|  | 830   } | 
|  | 831   colnames(blastOut) = c("query", "q.length", "q.start", "q.end", "subject", "s.length", | 
|  | 832                          "s.start", "s.end", "pid", "weight", "e.value", "strand") | 
|  | 833   blastOut | 
|  | 834 } | 
|  | 835 | 
|  | 836 | 
|  | 837 estimate_sample_size = function(NV, NE, maxv, maxe) { | 
|  | 838   ## density | 
|  | 839   d = (2 * NE)/(NV * (NV - 1)) | 
|  | 840   eEst = (maxv * (maxv - 1) * d)/2 | 
|  | 841   nEst = (d + sqrt(d^2 + 8 * d * maxe))/(2 * d) | 
|  | 842   if (eEst >= maxe) { | 
|  | 843     N = round(nEst) | 
|  | 844     E = round((N * (N - 1) * d)/2) | 
|  | 845 | 
|  | 846   } | 
|  | 847   if (nEst >= maxv) { | 
|  | 848     N = maxv | 
|  | 849     E = round((N * (N - 1) * d)/2) | 
|  | 850 | 
|  | 851   } | 
|  | 852   return(N) | 
|  | 853 } | 
|  | 854 | 
|  | 855 | 
|  | 856 | 
|  | 857 | 
|  | 858 mgblast2graph = function(blastfile, seqfile, graph_destination, directed_graph_destination, | 
|  | 859                          oriented_sequences, paired = TRUE, repex = FALSE, image_file = NULL, image_file_tmb = NULL, | 
|  | 860                          include_layout = TRUE, pair_completeness = NULL, satellite_model_path = NULL, | 
|  | 861                          maxv = 40000, maxe = 5e+08, seqfile_full = seqfile) { | 
|  | 862   cat("loading blast results\n") | 
|  | 863   if (repex) { | 
|  | 864     cln = c("query", "subject", "weight", "q.length", "q.start", "q.end", "s.length", "s.start", | 
|  | 865             "s.end", "sign") | 
|  | 866     colcls = c("character", "character", "numeric", "numeric", "numeric", "numeric", | 
|  | 867                "numeric", "numeric", "numeric", "NULL", "NULL", "character") | 
|  | 868 | 
|  | 869   } else { | 
|  | 870     cln = c("query", "q.length", "q.start", "q.end", "subject", "s.length", "s.start", | 
|  | 871             "s.end","weight", "sign") | 
|  | 872     colcls = c("character", "numeric", "numeric", "numeric", "character", "numeric", | 
|  | 873                "numeric", "numeric", "NULL", "numeric", "NULL", "character") | 
|  | 874   } | 
|  | 875   if (class(blastfile) == "data.frame") { | 
|  | 876     blastTable = blastfile | 
|  | 877     colnames(blastTable)[12] = "sign" | 
|  | 878   } else { | 
|  | 879     blastTable = read.table(blastfile, sep = "\t", as.is = TRUE, header = FALSE, | 
|  | 880                             colClasses = colcls, comment.char = "") | 
|  | 881     colnames(blastTable) = cln | 
|  | 882   } | 
|  | 883   ## check for duplicates! | 
|  | 884   key = with(blastTable, ifelse(query > subject, paste(query, subject), paste(subject, | 
|  | 885                                                                               query))) | 
|  | 886   if (any(duplicated(key))) { | 
|  | 887     blastTable = blastTable[!duplicated(key), ] | 
|  | 888   } | 
|  | 889   seqs = readDNAStringSet(seqfile) | 
|  | 890   ## calculate pair completeness for reads before sampling | 
|  | 891   if (is.null(pair_completeness)) { | 
|  | 892     if (paired) { | 
|  | 893       if (seqfile_full != seqfile) { | 
|  | 894         seqs_full = readDNAStringSet(seqfile_full) | 
|  | 895         pair_counts = tabulate(table(gsub(".$", "", names(seqs_full)))) | 
|  | 896         rm(seqs_full) | 
|  | 897       } else { | 
|  | 898         pair_counts = tabulate(table(gsub(".$", "", names(seqs)))) | 
|  | 899 | 
|  | 900       } | 
|  | 901       pair_completeness = 1 - pair_counts[1]/sum(pair_counts) | 
|  | 902     }else{ | 
|  | 903       pair_completeness = 0 | 
|  | 904     } | 
|  | 905   } | 
|  | 906   NV = length(seqs)  # vertices | 
|  | 907   NE = nrow(blastTable)  # nodes | 
|  | 908   if (maxv < NV | maxe < NE) { | 
|  | 909     ## Sample if graph is large | 
|  | 910     V_sample_size = estimate_sample_size(NV, NE, maxv, maxe) | 
|  | 911     seqs = sample(seqs, V_sample_size) | 
|  | 912     blastTable = blastTable[blastTable$query %in% names(seqs) & blastTable$subject %in% | 
|  | 913                             names(seqs), ] | 
|  | 914   } | 
|  | 915 | 
|  | 916   blastTable$sign = ifelse(blastTable$sign == "+", 1, -1) | 
|  | 917   vnames = unique(c(blastTable$query, blastTable$subject)) | 
|  | 918   vindex = seq_along(vnames) | 
|  | 919   names(vindex) = vnames | 
|  | 920   ## correct orientation | 
|  | 921   cat("creating graph\n") | 
|  | 922   G = graph.data.frame(blastTable[, c("query", "subject", "weight", "sign")], directed = FALSE, | 
|  | 923                        vertices = vnames) | 
|  | 924   if (include_layout) { | 
|  | 925     ## save temporarily modified blastTable if large | 
|  | 926     tmpB = tempfile() | 
|  | 927     save(blastTable, file = tmpB) | 
|  | 928     rm(blastTable) | 
|  | 929     ## | 
|  | 930     cat("graph layout calculation ") | 
|  | 931     if (ecount(G) > 2e+06) { | 
|  | 932       cat("using fruchterman reingold\n") | 
|  | 933       L = layout.fruchterman.reingold(G, dim = 3) | 
|  | 934     } else { | 
|  | 935       cat("using OGDF & frucherman reingold\n") | 
|  | 936       Ltmp = OGDFlayout(G, alg = c("fmmm")) | 
|  | 937       L = cbind(Ltmp[[1]][, 1:2], layout.fruchterman.reingold(G, dim = 2)) | 
|  | 938 | 
|  | 939     } | 
|  | 940 | 
|  | 941     GL = list(G = G, L = L) | 
|  | 942     save(GL, file = graph_destination) | 
|  | 943     if (!is.null(image_file)) { | 
|  | 944       cat("exporting graph figure") | 
|  | 945       png(image_file, width = 900, height = 900, pointsize = 20) | 
|  | 946       plot(GL$G, layout = GL$L[, 1:2], vertex.size = 2, vertex.color = "#000000A0", | 
|  | 947            edge.color = "#00000040", vertex.shape = "circle", edge.curved = FALSE, | 
|  | 948            vertex.label = NA, vertex.frame.color = NA, edge.width = 1) | 
|  | 949       dev.off() | 
|  | 950       ## create thunmbs: | 
|  | 951       system(paste("convert ", image_file, " -resize 100x100 ", image_file_tmb)) | 
|  | 952 | 
|  | 953     } | 
|  | 954     rm(GL) | 
|  | 955     gc() | 
|  | 956     load(tmpB) | 
|  | 957     unlink(tmpB) | 
|  | 958   } | 
|  | 959 | 
|  | 960   if (!is.connected(G)) { | 
|  | 961     cat("Warning - graph is not connected\n") | 
|  | 962     cc = clusters(G, "weak") | 
|  | 963     mb = as.numeric(factor(membership(cc), levels = as.numeric(names(sort(table(membership(cc)), | 
|  | 964                                                                           decreasing = TRUE))))) | 
|  | 965     selvi = which(mb == 1)  # largest component | 
|  | 966     cat("using largest component: \nsize =", round(length(selvi)/vcount(G) * | 
|  | 967                                                    100, 1), "% ;", length(selvi), "reads", "\n") | 
|  | 968     G = induced.subgraph(G, vids = selvi) | 
|  | 969     blastTable = blastTable[blastTable$query %in% V(G)$name & blastTable$subject %in% | 
|  | 970                                                       V(G)$name, ] | 
|  | 971   } | 
|  | 972 | 
|  | 973   Gmst = minimum.spanning.tree(G) | 
|  | 974                                         # create alternative trees - for case that unly suboptima solution is found | 
|  | 975   set.seed(123) | 
|  | 976   Gmst_alt = list() | 
|  | 977   Gmst_alt[[1]] = Gmst | 
|  | 978   for (i in 2:6){ | 
|  | 979     E(G)$weight = runif(ecount(G), 0.1,1) | 
|  | 980     Gmst_alt[[i]] = minimum.spanning.tree(G) | 
|  | 981   } | 
|  | 982 | 
|  | 983   rm(G) | 
|  | 984   gc() | 
|  | 985   ## six attempts to reorient reads | 
|  | 986   flip_names_all = list() | 
|  | 987   prop_of_notfit=numeric() | 
|  | 988   thr_fit=0.001 | 
|  | 989   for (ii in 1:6){ | 
|  | 990     Gmst = Gmst_alt[[ii]] | 
|  | 991 | 
|  | 992 | 
|  | 993     blastTable_mst = as.data.frame(get.edgelist(Gmst, names = TRUE)) | 
|  | 994     colnames(blastTable_mst) = c("query", "subject") | 
|  | 995     blastTable_mst$sign = E(Gmst)$sign | 
|  | 996 | 
|  | 997     d = dfs(Gmst, root = 1, father = TRUE, order.out = TRUE) | 
|  | 998     esign = E(Gmst)$sign | 
|  | 999     rc = rep(FALSE, vcount(Gmst)) | 
|  | 1000     j = 0 | 
|  | 1001     p = 0 | 
|  | 1002     for (v in d$order[-1]) { | 
|  | 1003       j = j + 1 | 
|  | 1004       f = as.numeric(d$father)[v] | 
|  | 1005       if (is.na(f)) { | 
|  | 1006         next | 
|  | 1007       } | 
|  | 1008       eid = get.edge.ids(Gmst, c(v, f)) | 
|  | 1009       if (esign[eid] == -1) { | 
|  | 1010         ie = unique(c(which(blastTable_mst$query %in% V(Gmst)$name[v]), which(blastTable_mst$subject %in% | 
|  | 1011                                                                               V(Gmst)$name[v])))  # incident edges | 
|  | 1012         esign[ie] = esign[ie] * -1 | 
|  | 1013         rc[v] = TRUE | 
|  | 1014         p = p + 1 | 
|  | 1015         if (p > 50) { | 
|  | 1016           p = 0 | 
|  | 1017           cat("\r") | 
|  | 1018           cat(j, " : ", sum(esign)/length(esign)) | 
|  | 1019         } | 
|  | 1020       } | 
|  | 1021     } | 
|  | 1022     cat("\t") | 
|  | 1023     flip_names_all[[ii]] = flip_names = V(Gmst)$name[rc] | 
|  | 1024 | 
|  | 1025     if (!exists("blastTable")) { | 
|  | 1026       load(tmpB) | 
|  | 1027       unlink(tmpB) | 
|  | 1028     } | 
|  | 1029     ## add nofit later!! | 
|  | 1030     s.mean = with(blastTable, (s.start + s.end)/2) | 
|  | 1031     s.mean_corrected = ifelse(blastTable$subject %in% flip_names, blastTable$s.length - | 
|  | 1032                                                                   s.mean, s.mean) | 
|  | 1033     q.mean = with(blastTable, (q.start + q.end)/2) | 
|  | 1034     q.mean_corrected = ifelse(blastTable$query %in% flip_names, blastTable$q.length - | 
|  | 1035                                                                 q.mean, q.mean) | 
|  | 1036     V1 = ifelse(s.mean_corrected > q.mean_corrected, blastTable$subject, blastTable$query) | 
|  | 1037     V2 = ifelse(s.mean_corrected > q.mean_corrected, blastTable$query, blastTable$subject) | 
|  | 1038     sign_final = ifelse(V1 %in% flip_names, -1, 1) * ifelse(V2 %in% flip_names, -1, | 
|  | 1039                                                             1) * blastTable$sign | 
|  | 1040     nofit = unique(c(V1[sign_final == "-1"], V2[sign_final == "-1"])) | 
|  | 1041     prop_of_notfit[ii] = length(nofit)/vcount(Gmst) | 
|  | 1042     ## check if correctly oriented | 
|  | 1043     cat("prop notfit", prop_of_notfit[[ii]],"\n") | 
|  | 1044     if (prop_of_notfit[[ii]]<thr_fit){ | 
|  | 1045       ## OK | 
|  | 1046       break | 
|  | 1047     } | 
|  | 1048   } | 
|  | 1049   if (!prop_of_notfit[[ii]]<thr_fit){ | 
|  | 1050     ## get best solution | 
|  | 1051     ii_best = which.min(prop_of_notfit) | 
|  | 1052     if (ii != ii_best){   # if the last solution was not best, get the best | 
|  | 1053       flip_names = flip_names_all[[ii_best]] | 
|  | 1054       s.mean = with(blastTable, (s.start + s.end)/2) | 
|  | 1055       s.mean_corrected = ifelse(blastTable$subject %in% flip_names, blastTable$s.length - | 
|  | 1056                                                                     s.mean, s.mean) | 
|  | 1057       q.mean = with(blastTable, (q.start + q.end)/2) | 
|  | 1058       q.mean_corrected = ifelse(blastTable$query %in% flip_names, blastTable$q.length - | 
|  | 1059                                                                   q.mean, q.mean) | 
|  | 1060       V1 = ifelse(s.mean_corrected > q.mean_corrected, blastTable$subject, blastTable$query) | 
|  | 1061       V2 = ifelse(s.mean_corrected > q.mean_corrected, blastTable$query, blastTable$subject) | 
|  | 1062       sign_final = ifelse(V1 %in% flip_names, -1, 1) * ifelse(V2 %in% flip_names, -1, | 
|  | 1063                                                               1) * blastTable$sign | 
|  | 1064       nofit = unique(c(V1[sign_final == "-1"], V2[sign_final == "-1"])) | 
|  | 1065 | 
|  | 1066     } | 
|  | 1067   } | 
|  | 1068 | 
|  | 1069   ## exclude all nofit | 
|  | 1070 | 
|  | 1071   df2 = data.frame(V1, V2, sign = blastTable$sign, sign_final = sign_final, stringsAsFactors = FALSE) | 
|  | 1072   rm(blastTable) | 
|  | 1073   gc() | 
|  | 1074   vertices = data.frame(name = vnames, reverse_complement = vnames %in% flip_names) | 
|  | 1075   G = graph.data.frame(df2, vertices = vertices) | 
|  | 1076   vcount_ori = vcount(G) | 
|  | 1077   G = induced.subgraph(G, vids = which(!V(G)$name %in% nofit)) | 
|  | 1078 | 
|  | 1079   G$escore_mst = sum(esign)/length(esign) | 
|  | 1080   G$escore = sum(sign_final == 1)/length(sign_final) | 
|  | 1081   cc = clusters(G, "strong") | 
|  | 1082   mb = as.numeric(factor(membership(cc), levels = as.numeric(names(sort(table(membership(cc)), | 
|  | 1083                                                                         decreasing = TRUE))))) | 
|  | 1084   names(mb) = V(G)$name | 
|  | 1085   G$loop_index = max(cc$csize)/vcount(G) | 
|  | 1086   G$coverage = vcount(G)/vcount_ori | 
|  | 1087   ## check sign in all edges | 
|  | 1088   V(G)$membership = mb | 
|  | 1089   save(G, file = directed_graph_destination) | 
|  | 1090   ## remove nofit | 
|  | 1091   seqs = seqs[!names(seqs) %in% nofit] | 
|  | 1092   seqs[names(seqs) %in% flip_names] = reverseComplement(seqs[names(seqs) %in% flip_names]) | 
|  | 1093   names(seqs) = paste(names(seqs), mb[names(seqs)]) | 
|  | 1094   writeXStringSet(seqs, filepath = oriented_sequences) | 
|  | 1095   ## calculate satellite probability | 
|  | 1096   if (is.null(satellite_model_path)) { | 
|  | 1097     pSAT = isSAT = NULL | 
|  | 1098   } else { | 
|  | 1099     satellite_model = readRDS(satellite_model_path) | 
|  | 1100     pSAT = get_prob(G$loop_index, pair_completeness, model = satellite_model) | 
|  | 1101     isSAT = isSatellite(G$loop_index, pair_completeness, model = satellite_model) | 
|  | 1102   } | 
|  | 1103 | 
|  | 1104   ## get larges cc | 
|  | 1105   output = list(escore = G$escore, coverage = G$coverage, escore_mts = G$escore_mst, | 
|  | 1106                 loop_index = G$loop_index, pair_completeness = pair_completeness, graph_file = graph_destination, | 
|  | 1107                 oriented_sequences = oriented_sequences, vcount = vcount(G), ecount = ecount(G), | 
|  | 1108                 satellite_probability = pSAT, satellite = isSAT) | 
|  | 1109   ## clean up | 
|  | 1110   all_objects = ls() | 
|  | 1111   do_not_remove = "output" | 
|  | 1112   rm(list = all_objects[!(all_objects %in% do_not_remove)]) | 
|  | 1113   gc(verbose = FALSE, reset = TRUE) | 
|  | 1114   return(list2dictionary(output)) | 
|  | 1115 } | 
|  | 1116 | 
|  | 1117 list2dictionary = function(l){ | 
|  | 1118     dict = "{" | 
|  | 1119     q='"' | 
|  | 1120     for (i in 1:length(l)){ | 
|  | 1121         if (class(l[[i]])=="character" | is.null(l[[i]])){ | 
|  | 1122             q2 = "'''" | 
|  | 1123         }else{ | 
|  | 1124             q2 = '' | 
|  | 1125         } | 
|  | 1126         dict = paste0( | 
|  | 1127             dict, | 
|  | 1128             q,names(l)[i],q,":",q2, l[[i]], q2,", " | 
|  | 1129         ) | 
|  | 1130     } | 
|  | 1131     dict = paste0(dict, "}") | 
|  | 1132     return(dict) | 
|  | 1133 } | 
|  | 1134 | 
|  | 1135 wrap = Vectorize(function(s, width = 80) { | 
|  | 1136   i1 = seq(1, nchar(s), width) | 
|  | 1137   i2 = seq(width, by = width, length.out = length(i1)) | 
|  | 1138   return(paste(substring(s, i1, i2), collapse = "\n")) | 
|  | 1139 }) | 
|  | 1140 | 
|  | 1141 | 
|  | 1142 tarean = function(input_sequences, output_dir, min_kmer_length = 11, max_kmer_length = 27, | 
|  | 1143                   CPU = 2, sample_size = 10000, reorient_reads = TRUE, tRNA_database_path = NULL, | 
|  | 1144                   include_layout = TRUE, paired = TRUE, lock_file=NULL) { | 
|  | 1145   options(CPU = CPU) | 
|  | 1146   time0 = Sys.time() | 
|  | 1147   dir.create(output_dir) | 
|  | 1148   input_sequences_copy = paste(output_dir, "/", basename(input_sequences), sep = "") | 
|  | 1149 | 
|  | 1150   if (!file.copy(input_sequences, input_sequences_copy, overwrite = TRUE)) { | 
|  | 1151     cat(paste("cannot copy", input_sequences, " to", output_dir), "\n") | 
|  | 1152     stop() | 
|  | 1153   } | 
|  | 1154 | 
|  | 1155   lock_file = waitForRAM(lock_file = lock_file) | 
|  | 1156   pair_completeness = NULL | 
|  | 1157   ## sampling | 
|  | 1158   if (sample_size != 0) { | 
|  | 1159     s = readDNAStringSet(input_sequences_copy) | 
|  | 1160     N = length(s) | 
|  | 1161     ## pair completness must be calculated before sampling! | 
|  | 1162     if (N > sample_size) { | 
|  | 1163       writeXStringSet(sample(s, sample_size), filepath = input_sequences_copy) | 
|  | 1164       if (paired) { | 
|  | 1165         pair_counts = tabulate(table(gsub(".$", "", names(s)))) | 
|  | 1166         pair_completeness = 1 - pair_counts[1]/sum(pair_counts) | 
|  | 1167       } | 
|  | 1168     } | 
|  | 1169     rm(s) | 
|  | 1170   } | 
|  | 1171 | 
|  | 1172   if (reorient_reads) { | 
|  | 1173     input_sequences_oriented = paste0(input_sequences_copy, "oriented.fasta") | 
|  | 1174     graph_file = paste0(input_sequences_copy, "_graph.RData") | 
|  | 1175     GLfile = paste0(input_sequences_copy, "_graph.GL") | 
|  | 1176     cat("reorientig sequences\n") | 
|  | 1177 | 
|  | 1178     blastTable = mgblast(input_sequences_copy, input_sequences_copy) | 
|  | 1179 | 
|  | 1180     graph_info = mgblast2graph(blastTable, input_sequences_copy, GLfile, graph_file, | 
|  | 1181                                input_sequences_oriented, include_layout = include_layout, paired = paired, | 
|  | 1182                                pair_completeness = pair_completeness) | 
|  | 1183 | 
|  | 1184     ## interupt if it does not look like tandem repeat at all # soft threshold! | 
|  | 1185     if (is.null(graph_info$pair_completeness)) { | 
|  | 1186       if (graph_info$loop_index <= 0.4) { | 
|  | 1187         cat("CLUSTER DID NOT PASS THRESHOLD!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") | 
|  | 1188         return(list2dictionary(list(graph_info = graph_info))) | 
|  | 1189       } | 
|  | 1190     } else { | 
|  | 1191       ## for paired reads: | 
|  | 1192       if (graph_info$loop_index < 0.7 | graph_info$pair_completeness < 0.4) { | 
|  | 1193         cat("CLUSTER DID NOT PASS THRESHOLD!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") | 
|  | 1194         return(list2dictionary(list(graph_info = graph_info))) | 
|  | 1195       } | 
|  | 1196     } | 
|  | 1197 | 
|  | 1198   } else { | 
|  | 1199     input_sequences_oriented = input_sequences_copy | 
|  | 1200     graph_info = NULL | 
|  | 1201   } | 
|  | 1202 | 
|  | 1203 | 
|  | 1204 | 
|  | 1205   kmer_counts = list() | 
|  | 1206   kmer_lengths = seq(min_kmer_length, max_kmer_length, by = 4) | 
|  | 1207   for (i in kmer_lengths) { | 
|  | 1208     ## use pythonis.null(l[[i]])) function - faster | 
|  | 1209     cmd = paste(script.dir, "/kmer_counting.py ", input_sequences_oriented, " ", | 
|  | 1210                 i, sep = "") | 
|  | 1211     cat("calculation kmers of length ", i, "\n") | 
|  | 1212     f = system(cmd, intern = TRUE) | 
|  | 1213     x = read.table(f, as.is = TRUE, sep = "\t") | 
|  | 1214     kmer_counts[[as.character(i)]] = data.frame(x, freq = x$V2/sum(x$V2)) | 
|  | 1215   } | 
|  | 1216 | 
|  | 1217   ## number of kmers: | 
|  | 1218   N50 = sapply(kmer_counts, function(x) { | 
|  | 1219     sum(cumsum(x$freq) < 0.5) | 
|  | 1220   }) | 
|  | 1221 | 
|  | 1222   N70 = sapply(kmer_counts, function(x) { | 
|  | 1223     sum(cumsum(x$freq) < 0.7) | 
|  | 1224   }) | 
|  | 1225 | 
|  | 1226 | 
|  | 1227   time1 = Sys.time() | 
|  | 1228   ggmin = mclapply(kmer_counts, get_mimimal_cc, start = 0.5, mc.cores = CPU) | 
|  | 1229   time2 = Sys.time() | 
|  | 1230   cat("kmers graphs created ") | 
|  | 1231   print(time2 - time1) | 
|  | 1232   names(ggmin) = names(kmer_counts) | 
|  | 1233 | 
|  | 1234 | 
|  | 1235   ## estimate monomer | 
|  | 1236   monomers = mclapply(ggmin, estimate_monomer, mc.cores = CPU) | 
|  | 1237 | 
|  | 1238   names(monomers) = names(kmer_counts) | 
|  | 1239   monomers = monomers[!sapply(monomers, is.null)] | 
|  | 1240   ## error handling: | 
|  | 1241   error = sapply(monomers, class) == "try-error" | 
|  | 1242   if (any(error)) { | 
|  | 1243     cat("\nError in monomers estimation: ") | 
|  | 1244     cat("calculation failed for monomers length ", names(monomers)[error], "\n") | 
|  | 1245     print(monomers[error]) | 
|  | 1246     if (any(!error)) { | 
|  | 1247       monomers = monomers[!error] | 
|  | 1248     } else { | 
|  | 1249       stop("monomer estimation failed") | 
|  | 1250     } | 
|  | 1251   } | 
|  | 1252 | 
|  | 1253   ## summary - make function!! | 
|  | 1254   total_score = list() | 
|  | 1255   k = 0 | 
|  | 1256   length_best = numeric() | 
|  | 1257   score_bn = numeric() | 
|  | 1258   consensus = character() | 
|  | 1259 | 
|  | 1260   for (i in names(monomers)) { | 
|  | 1261     for (v in seq_along(monomers[[i]]$estimates)) { | 
|  | 1262       k = k + 1 | 
|  | 1263       total_score[[k]] = c(kmer = as.numeric(i), variant = v, total_score = monomers[[i]]$estimates[[v]]$total_score) | 
|  | 1264       score_bn[k] = min(rowSums(monomers[[i]]$estimates[[v]]$CM)) | 
|  | 1265       length_best[k] = monomers[[i]]$estimates[[v]]$length_variant_score[1, | 
|  | 1266                                                                          1] | 
|  | 1267       consensus[[k]] = monomers[[i]]$estimates[[v]]$consensus2 | 
|  | 1268     } | 
|  | 1269   } | 
|  | 1270   summary_table = as.data.frame(do.call(rbind, total_score)) | 
|  | 1271   summary_table$monomer_length = length_best | 
|  | 1272   summary_table$consensus_length = nchar(consensus) | 
|  | 1273   summary_table$score_bn = score_bn | 
|  | 1274   summary_table$consensus = paste("<pre>", wrap(consensus, width = 80), "<pre>", | 
|  | 1275                                   sep = "") | 
|  | 1276   consensus_DS = DNAStringSet(consensus) | 
|  | 1277   names(consensus_DS) = with(summary_table, paste0(kmer, "_", variant, "_sc_", | 
|  | 1278                                                    signif(total_score), "_l_", monomer_length)) | 
|  | 1279 | 
|  | 1280   ## filter by score - keep | 
|  | 1281 | 
|  | 1282   ## reorder by score | 
|  | 1283   consensus_DS = consensus_DS[order(summary_table$total_score, decreasing = TRUE)] | 
|  | 1284   summary_table = summary_table[order(summary_table$total_score, decreasing = TRUE), | 
|  | 1285                                 ] | 
|  | 1286   rownames(summary_table) = NULL | 
|  | 1287   N = nrow(summary_table) | 
|  | 1288   ## concatenate concensus(ie. make dimer head 2 tail) for pbs detection and other | 
|  | 1289   ## make something like 'pseudo contig' multimer for mapping - min length 200 bp | 
|  | 1290 | 
|  | 1291   ## searches | 
|  | 1292   consensus_DS_dimer = DNAStringSet(paste0(consensus_DS, consensus_DS)) | 
|  | 1293   tarean_contigs = DNAStringSet(sapply(consensus_DS,function(x) | 
|  | 1294     ifelse(nchar(x)<200, | 
|  | 1295            paste(rep(as.character(x),round(300/nchar(as.character(x))+1)),collapse=''), | 
|  | 1296            as.character(x))) | 
|  | 1297     ) | 
|  | 1298 | 
|  | 1299   names(consensus_DS_dimer) = names(consensus_DS) | 
|  | 1300                                         # save sequences: | 
|  | 1301   consensus_DS_dimer_file = paste0(output_dir, "/consensus_dimer.fasta") | 
|  | 1302   consensus_DS_file = paste0(output_dir, "/consensus.fasta") | 
|  | 1303   tarean_contig_file = paste0(output_dir, "/tarean_contigs.fasta") | 
|  | 1304   writeXStringSet(consensus_DS, consensus_DS_file) | 
|  | 1305   writeXStringSet(tarean_contigs, tarean_contig_file) | 
|  | 1306   writeXStringSet(consensus_DS_dimer, consensus_DS_dimer_file) | 
|  | 1307   if (is.null(tRNA_database_path)) { | 
|  | 1308     pbs_score = -1 | 
|  | 1309   } else { | 
|  | 1310     pbs_blast_table = paste0(output_dir, "/pbs_detection") | 
|  | 1311     pbs_score = detect_pbs(dimers_file = consensus_DS_dimer_file, tRNA_database_path = tRNA_database_path, | 
|  | 1312                            reads = input_sequences_oriented, output = pbs_blast_table) | 
|  | 1313   } | 
|  | 1314   ## search of open reading frames get the length of the longest | 
|  | 1315   orf_l = max_ORF_length(consensus_DS_dimer) | 
|  | 1316 | 
|  | 1317 | 
|  | 1318   dir.create(paste0(output_dir, "/img"), recursive = TRUE) | 
|  | 1319   summary_table$monomer_length_graph = numeric(N) | 
|  | 1320   summary_table$graph_image = character(N) | 
|  | 1321   summary_table$monomer_length_logo = numeric(N) | 
|  | 1322   summary_table$logo_image = character(N) | 
|  | 1323   ## export graph nd consensus estimate to cluster directory type of output may | 
|  | 1324   ## change in future | 
|  | 1325   save(ggmin, file = paste0(output_dir, "/ggmin.RData")) | 
|  | 1326   save(monomers, file = paste0(output_dir, "/monomers.RData")) | 
|  | 1327 | 
|  | 1328   cat("generating HTML output\n") | 
|  | 1329   for (i in 1:N) { | 
|  | 1330     kmer = as.character(summary_table$kmer[i]) | 
|  | 1331     variant = summary_table$variant[i] | 
|  | 1332     ## export graph | 
|  | 1333     fout_link = paste0("img/graph_", kmer, "mer_", variant, ".png") | 
|  | 1334     fout = paste0(output_dir, "/", fout_link) | 
|  | 1335     ## summary_table$monomer_length_graph[i] = summary_table$monomer_length[i] | 
|  | 1336     ## summary_table$monomer_length_logo[[i]] = nrow(monomers[[kmer]]$estimates[[variant]]$CM) | 
|  | 1337     summary_table$monomer_length[[i]] = length(monomers[[kmer]]$estimates[[variant]]$consensus) | 
|  | 1338 | 
|  | 1339     if (i <= 10) { | 
|  | 1340       png(fout, width = 800, height = 800) | 
|  | 1341       plot_kmer_graph(ggmin[[kmer]], highlight = unlist(monomers[[kmer]]$paths[[variant]]$tr_paths)) | 
|  | 1342       dev.off() | 
|  | 1343       summary_table$graph_image[i] = hwriteImage(fout_link, link = fout_link, | 
|  | 1344                                                  table = FALSE, width = 100, height = 100) | 
|  | 1345       ## export logo | 
|  | 1346       png_link = paste0("img/logo_", kmer, "mer_", variant, ".png") | 
|  | 1347       fout = paste0(output_dir, "/", png_link) | 
|  | 1348       png(fout, width = 1200, height = round(summary_table$monomer_length[i] * | 
|  | 1349                                              1) + 550) | 
|  | 1350       try(plot_multiline_logo(monomers[[kmer]]$estimates[[variant]]$CM, W = 100)) | 
|  | 1351       dev.off() | 
|  | 1352       ## export corresponding position probability matrices | 
|  | 1353       ppm_file = paste0(output_dir, '/ppm_', kmer, "mer_", variant, ".csv") | 
|  | 1354       ppm_link = paste0('ppm_', kmer, "mer_", variant, ".csv") | 
|  | 1355       write.table(monomers[[kmer]]$estimates[[variant]]$CM, | 
|  | 1356                   file = ppm_file, | 
|  | 1357                   col.names = TRUE, quote = FALSE, | 
|  | 1358                   row.names = FALSE, sep="\t") | 
|  | 1359       summary_table$logo_image[i] = hwriteImage(png_link, link = ppm_link, | 
|  | 1360                                                 table = FALSE, width = 200, height = 100) | 
|  | 1361     } | 
|  | 1362 | 
|  | 1363   } | 
|  | 1364 | 
|  | 1365   ## html_report = HTMLReport() | 
|  | 1366 | 
|  | 1367   htmlfile = paste0(output_dir, "/report.html") | 
|  | 1368   cat(htmlheader, file = htmlfile) | 
|  | 1369   included_columns = c('kmer', 'variant', 'total_score', 'consensus_length','consensus', 'graph_image', 'logo_image') | 
|  | 1370   summary_table_clean = summary_table[,included_columns] | 
|  | 1371   colnames(summary_table_clean) = c('k-mer length', | 
|  | 1372                                     'Variant index', | 
|  | 1373                                     'k-mer coverage score', | 
|  | 1374                                     'Consensus length', | 
|  | 1375                                     'Consensus sequence', | 
|  | 1376                                     'k-mer based graph', | 
|  | 1377                                     'Sequence logo') | 
|  | 1378   HTML(summary_table_clean, file = htmlfile, sortableDF = TRUE) | 
|  | 1379   HTMLEndFile(file = htmlfile) | 
|  | 1380   time4 = Sys.time() | 
|  | 1381   print(time4 - time0) | 
|  | 1382   if (!is.null(lock_file)){ | 
|  | 1383     print("------removing-lock--------") | 
|  | 1384     removelock(lock_file) | 
|  | 1385   } | 
|  | 1386 | 
|  | 1387   print(list(htmlfile = htmlfile, TR_score = summary_table$total_score[1], | 
|  | 1388              TR_monomer_length = as.numeric(summary_table$consensus_length[1]), | 
|  | 1389              TR_consensus = summary_table$consensus[1], pbs_score = pbs_score, graph_info = graph_info, | 
|  | 1390              orf_l = orf_l, tarean_contig_file = tarean_contig_file)) | 
|  | 1391   return(list2dictionary(list(htmlfile = htmlfile, TR_score = summary_table$total_score[1], | 
|  | 1392               TR_monomer_length = as.numeric(summary_table$consensus_length[1]), | 
|  | 1393               TR_consensus = summary_table$consensus[1], pbs_score = pbs_score, graph_info = graph_info, | 
|  | 1394               orf_l = orf_l, tarean_contig_file = tarean_contig_file))) | 
|  | 1395 } | 
|  | 1396 | 
|  | 1397 | 
|  | 1398 ## graph loop index stability | 
|  | 1399 loop_index_instability = function(G) { | 
|  | 1400   N = 50 | 
|  | 1401   s = seq(vcount(G), vcount(G)/10, length.out = N) | 
|  | 1402   p = seq(1, 0.1, length.out = N) | 
|  | 1403   li = numeric() | 
|  | 1404   for (i in seq_along(s)) { | 
|  | 1405     print(i) | 
|  | 1406     gs = induced_subgraph(G, sample(1:vcount(G), s[i])) | 
|  | 1407     li[i] = max(clusters(gs, "strong")$csize)/vcount(gs) | 
|  | 1408   } | 
|  | 1409   instability = lm(li ~ p)$coefficient[2] | 
|  | 1410   return(instability) | 
|  | 1411 } | 
|  | 1412 | 
|  | 1413 isSatellite = function(x, y, model) { | 
|  | 1414   p = get_prob(x, y, model) | 
|  | 1415   if (p > model$cutoff) { | 
|  | 1416     return("Putative Satellite") | 
|  | 1417   } else { | 
|  | 1418     return("") | 
|  | 1419   } | 
|  | 1420 } | 
|  | 1421 | 
|  | 1422 get_prob = function(x, y, model) { | 
|  | 1423   pm = model$prob_matrix | 
|  | 1424   N = ncol(pm) | 
|  | 1425   i = round(x * (N - 1)) + 1 | 
|  | 1426   j = round(y * (N - 1)) + 1 | 
|  | 1427   p = pm[i, j] | 
|  | 1428   return(p) | 
|  | 1429 } | 
|  | 1430 | 
|  | 1431 | 
|  | 1432 detectMemUsage = function() { | 
|  | 1433   con = textConnection(gsub(" +", " ", readLines("/proc/meminfo"))) | 
|  | 1434   memInfo = read.table(con, fill = TRUE, row.names = 1) | 
|  | 1435   close(con) | 
|  | 1436   memUsage = 1 - (memInfo["MemFree", 1] + memInfo["Cached", 1])/memInfo["MemTotal", | 
|  | 1437                                                                         1] | 
|  | 1438   return(memUsage) | 
|  | 1439 } | 
|  | 1440 | 
|  | 1441 | 
|  | 1442 makelock<-function(lockfile,lockmsg,CreateDirectories=TRUE){ | 
|  | 1443     lockdir=dirname(lockfile) | 
|  | 1444     if(!file.exists(lockdir)){ | 
|  | 1445         if(CreateDirectories) dir.create(lockdir,recursive=TRUE) | 
|  | 1446         else stop("Lock Directory for lockfile ",lockfile," does not exist") | 
|  | 1447     } | 
|  | 1448     if(missing(lockmsg)) lockmsg=paste(system('hostname',intern=TRUE),Sys.getenv("R_SESSION_TMPDIR")) | 
|  | 1449     if (file.exists(lockfile)) return (FALSE) | 
|  | 1450                                         # note the use of paste makes the message writing atomic | 
|  | 1451     cat(paste(lockmsg,"\n",sep=""),file=lockfile,append=TRUE,sep="") | 
|  | 1452     firstline=readLines(lockfile,n=1) | 
|  | 1453     if(firstline!=lockmsg){ | 
|  | 1454                                         # somebody else got there first | 
|  | 1455         return(FALSE) | 
|  | 1456     } else return(TRUE) | 
|  | 1457 } | 
|  | 1458 | 
|  | 1459 | 
|  | 1460 removelock<-function(lockfile){ | 
|  | 1461   if(unlink(lockfile)!=0) { | 
|  | 1462     warning("Unable to remove ",lockfile) | 
|  | 1463     return (FALSE) | 
|  | 1464   } | 
|  | 1465   return (TRUE) | 
|  | 1466 } | 
|  | 1467 | 
|  | 1468 | 
|  | 1469 waitForRAM = function(p = 0.5,lock_file=NULL) { | 
|  | 1470   if (detectMemUsage() < p) { | 
|  | 1471     return(NULL) | 
|  | 1472     ## check lock file: | 
|  | 1473   } else { | 
|  | 1474     cat("waiting for RAM \n") | 
|  | 1475     free_count = 0 | 
|  | 1476     while (TRUE) { | 
|  | 1477         if (makelock(lock_file)){ | 
|  | 1478             print("---------locking--------") | 
|  | 1479             return(lock_file) | 
|  | 1480         } | 
|  | 1481       if (detectMemUsage() < p) { | 
|  | 1482         cat("RAM freed \n") | 
|  | 1483         return(NULL) | 
|  | 1484       } | 
|  | 1485       Sys.sleep(5) | 
|  | 1486       if (evaluate_user_cpu_usage() == 'free'){ | 
|  | 1487         free_count = free_count + 1 | 
|  | 1488       }else{ | 
|  | 1489         free_count = 0 | 
|  | 1490       } | 
|  | 1491       if (detectMemUsage() < 0.8 & free_count > 100){ | 
|  | 1492         cat("RAM not free but nothing else is running \n") | 
|  | 1493         return(NULL) | 
|  | 1494       } | 
|  | 1495     } | 
|  | 1496   } | 
|  | 1497 } | 
|  | 1498 | 
|  | 1499 lsmem = function() { | 
|  | 1500   g = globalenv() | 
|  | 1501   out_all = envs = list() | 
|  | 1502   envs = append(envs, g) | 
|  | 1503   total_size = numeric() | 
|  | 1504   while (environmentName(g) != "R_EmptyEnv") { | 
|  | 1505     g <- parent.env(g) | 
|  | 1506     envs = append(envs, g) | 
|  | 1507   } | 
|  | 1508   for (e in envs) { | 
|  | 1509 | 
|  | 1510     obj = ls(envir = e) | 
|  | 1511     if (length(obj) == 0) { | 
|  | 1512       break | 
|  | 1513     } | 
|  | 1514     obj.size = list() | 
|  | 1515     for (i in obj) { | 
|  | 1516       obj.size[[i]] = object.size(get(i, envir = e)) | 
|  | 1517     } | 
|  | 1518     out = data.frame(object = obj, size = unlist(obj.size), stringsAsFactors = FALSE) | 
|  | 1519     out = out[order(out$size, decreasing = TRUE), ] | 
|  | 1520     out_all = append(out_all, out) | 
|  | 1521     total_size = append(total_size, sum(out$size)) | 
|  | 1522   } | 
|  | 1523   return(list(objects = out_all, total_size = total_size)) | 
|  | 1524 } | 
|  | 1525 | 
|  | 1526 evaluate_user_cpu_usage = function(){ | 
|  | 1527   user = Sys.info()["user"] | 
|  | 1528   a = sum(as.numeric (system(paste ("ps -e -o %cpu -u", user), intern = TRUE)[-1])) | 
|  | 1529   s = substring (system(paste ("ps -e -o stat -u", user), intern = TRUE)[-1],1,1) | 
|  | 1530   if (a<5 & sum(s %in% 'D')==0 & sum(s%in% 'R')<2){ | 
|  | 1531     status = 'free' | 
|  | 1532   }else{ | 
|  | 1533     status = 'full' | 
|  | 1534   } | 
|  | 1535   return(status) | 
|  | 1536 } |