comparison R/ltr_utils.R @ 3:6ae4a341d1f3 draft

"planemo upload commit 8bd6029a4de4a8f5031a5cc71303bb06217cc88a"
author petr-novak
date Tue, 03 May 2022 12:38:12 +0000
parents f131886ea194
children c33d6583e548
comparison
equal deleted inserted replaced
2:f131886ea194 3:6ae4a341d1f3
158 158
159 bl_list <- lapply(outf, read.table, stringsAsFactors = FALSE, col.names = unlist(strsplit(cols, " ")), sep="\t", comment.char = "") 159 bl_list <- lapply(outf, read.table, stringsAsFactors = FALSE, col.names = unlist(strsplit(cols, " ")), sep="\t", comment.char = "")
160 bl_table <- do.call(rbind, bl_list) 160 bl_table <- do.call(rbind, bl_list)
161 unlink(qf) 161 unlink(qf)
162 #unlink(outf) 162 #unlink(outf)
163 print(outf)
164 unlink(dbf) 163 unlink(dbf)
165 unlink(script) 164 unlink(script)
166 return(bl_table) 165 return(bl_table)
167 } 166 }
168 167
183 ) 182 )
184 } 183 }
185 184
186 185
187 analyze_TE <- function(seqs, ncpus = 10, word_size = 20){ 186 analyze_TE <- function(seqs, ncpus = 10, word_size = 20){
188 blt <- blast_all2all(seqs, ncpus = ncpus, word_size = word_size) 187 blt <- blast_all2all(seqs, ncpus = ncpus, word_size = word_size, perc_identity = 90)
189 te_conflict_info <- identify_conflicts(blt) 188 te_conflict_info <- identify_conflicts(blt)
190 blt_te_ok <- blast_table_subset(blt, te_conflict_info$ok) 189 blt_te_ok <- blast_table_subset(blt, te_conflict_info$ok)
191 te_ok_lineages <- split(blt_te_ok, 190 te_ok_lineages <- split(blt_te_ok,
192 gsub( 191 gsub(
193 ".+[|]", 192 ".+[|]",
282 281
283 blast_table_subset <- function(bl,id){ 282 blast_table_subset <- function(bl,id){
284 return(bl[bl$qaccver %in% id & bl$saccver %in% id,, drop = FALSE]) 283 return(bl[bl$qaccver %in% id & bl$saccver %in% id,, drop = FALSE])
285 } 284 }
286 285
287 get_representative_ranges <- function(bl, min_length = 60){ 286 get_representative_ranges <- function(bl, min_length = 200, min_identity = 98){
287 bl <- bl[bl$pident>=min_identity, , drop=FALSE]
288 bl <- bl[bl$pident>=min_identity & bl$length >= min_length, , drop=FALSE]
288 score <- sort(unlist(by(bl$bitscore, bl$qaccver, sum, simplify = FALSE)), 289 score <- sort(unlist(by(bl$bitscore, bl$qaccver, sum, simplify = FALSE)),
289 decreasing = TRUE) 290 decreasing = TRUE)
290 L <- bl$qlen[!duplicated(bl$qaccver)] 291 L <- bl$qlen[!duplicated(bl$qaccver)]
291 names(L) <- bl$qaccver[!duplicated(bl$qaccver)] 292 names(L) <- bl$qaccver[!duplicated(bl$qaccver)]
292 gr <- GRanges(seqnames = bl$qaccver, 293 gr <- GRanges(seqnames = bl$qaccver,
318 319
319 expected_diversity <- function(seqs, niter=100, km = 6){ 320 expected_diversity <- function(seqs, niter=100, km = 6){
320 L <- nchar(seqs) 321 L <- nchar(seqs)
321 R <- matrix(ncol = niter, nrow = length(seqs)) 322 R <- matrix(ncol = niter, nrow = length(seqs))
322 for (i in 1:niter){ 323 for (i in 1:niter){
323 print(i)
324 seqs_rnd <- DNAStringSet(sapply(L, function(n) paste(sample(c("A", "C", "T", "G"), n, replace=TRUE), collapse=""))) 324 seqs_rnd <- DNAStringSet(sapply(L, function(n) paste(sample(c("A", "C", "T", "G"), n, replace=TRUE), collapse="")))
325 R[,i] <- seq_diversity(seqs_rnd, km = km)$richness 325 R[,i] <- seq_diversity(seqs_rnd, km = km)$richness
326 } 326 }
327 R 327 R
328 328
621 total <- colSums(out) 621 total <- colSums(out)
622 out <- rbind(out, Total = total) 622 out <- rbind(out, Total = total)
623 return(out) 623 return(out)
624 } 624 }
625 625
626 getSeqNamed <- function(s, gr) { 626 getSeqNamed <- function(s, gr, name = NULL) {
627 spart <- getSeq(s, gr) 627 spart <- getSeq(s, gr)
628 id1 <- paste0(seqnames(gr), '_', start(gr), "_", end(gr)) 628 if (is.null(name)){
629 id1 <- paste0(seqnames(gr), '_', start(gr), "_", end(gr))
630 }else{
631 id1 <- mcols(gr)[,name]
632 }
629 id2 <- gr$Final_Classification 633 id2 <- gr$Final_Classification
630 names(spart) <- paste0(id1, "#", id2) 634 names(spart) <- paste0(id1, "#", id2)
631 spart 635 spart
632 } 636 }
633 637