# HG changeset patch # User petr-novak # Date 1649768132 0 # Node ID f131886ea1940849243c57410078e35e8629f120 # Parent c1498f679b50bc41d1b73c80ddc0c6740010afa5 "planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a" diff -r c1498f679b50 -r f131886ea194 R/ltr_utils.R --- a/R/ltr_utils.R Wed Mar 09 09:31:31 2022 +0000 +++ b/R/ltr_utils.R Tue Apr 12 12:55:32 2022 +0000 @@ -92,22 +92,23 @@ } } + + trim2TGAC <- function(bl) { for (i in 1:nrow(bl)) { - tg_L <- firstTG(bl$qseq[i]) - tg_R <- firstTG(bl$sseq[i]) - ca_L <- lastCA(bl$qseq[i]) - ca_R <- lastCA(bl$sseq[i]) - e_dist <- bl$length[i] - ca_R - no_match <- any(tg_L == 0, tg_R == 0, ca_L == 0, ca_R == 0) + cons <- consensusString(c(bl$qseq[i], bl$sseq[i]), ambiguityMap="?") + tg_P <- firstTG(cons) + ca_P <- lastCA(cons) + e_dist <- bl$length[i] - ca_P + max_dist = 50 # was 25 + no_match <- any(tg_P == 0, ca_P == 0) if (!no_match & - tg_L == tg_R & - ca_L == ca_R & - tg_L < 8 & - e_dist < 8) { + tg_P < max_dist & + e_dist < max_dist) { ## trim alignment - bl[i,] <- trim_blast_table(bl[i,], tg_L, e_dist - 1) + bl[i,] <- trim_blast_table(bl[i,], tg_P, e_dist - 1) } + } return(bl) } @@ -566,6 +567,30 @@ return(te) } + +get_te_rank <- function (gr){ + DL_id <- gr$ID[gr$type == "transposable_element" & + gr$TSD == "not_found" & + is.na(gr$trna_id)] + DLT_id <- gr$ID[gr$type == "transposable_element" & + gr$TSD != "not_found" & + is.na(gr$trna_id)] + DLTP_id <- gr$ID[gr$type == "transposable_element" & + gr$TSD != "not_found" & + !is.na(gr$trna_id)] + DLP_id <- gr$ID[gr$type == "transposable_element" & + gr$TSD == "not_found" & + !is.na(gr$trna_id)] + Rank <- character(length(gr)) + ID <- unlist(ifelse(is.na(gr$ID), gr$Parent, gr$ID)) + + Rank[ID %in% DL_id] <- "DL" + Rank[ID %in% DLT_id] <- "DLT" + Rank[ID %in% DLP_id] <- "DLP" + Rank[ID %in% DLTP_id] <- "DLTP" + return(Rank) +} + get_te_statistics <- function(gr, RT) { DOMAINS_LTR <- gr[gr$type == "transposable_element" & gr$TSD == "not_found" & diff -r c1498f679b50 -r f131886ea194 clean_dante_ltr.xml --- a/clean_dante_ltr.xml Wed Mar 09 09:31:31 2022 +0000 +++ b/clean_dante_ltr.xml Tue Apr 12 12:55:32 2022 +0000 @@ -1,4 +1,4 @@ - + r-optparse diff -r c1498f679b50 -r f131886ea194 dante_ltr_search.xml --- a/dante_ltr_search.xml Wed Mar 09 09:31:31 2022 +0000 +++ b/dante_ltr_search.xml Tue Apr 12 12:55:32 2022 +0000 @@ -1,4 +1,4 @@ - + blast r-optparse diff -r c1498f679b50 -r f131886ea194 extract_putative_ltr.R --- a/extract_putative_ltr.R Wed Mar 09 09:31:31 2022 +0000 +++ b/extract_putative_ltr.R Tue Apr 12 12:55:32 2022 +0000 @@ -72,6 +72,11 @@ s <- readDNAStringSet("/mnt/ceph/454_data/Vicia_faba_assembly/assembly/ver_210910 /fasta_parts/211010_Vfaba_chr5.fasta") + g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data/big_test_data//Cocoa_theobroma_DANTE_filtered.gff3") + s <- readDNAStringSet("/mnt/raid/users/petr/workspace/dante_ltr/test_data/big_test_data/Cocoa_theobroma_chr1.fasta.gz") + + source("R/ltr_utils.R") + g <- rtracklayer::import("./test_data/sample_DANTE.gff3") s <- readDNAStringSet("./test_data/sample_genome.fasta") outfile <- "/mnt/raid/users/petr/workspace/ltr_finder_test/te_with_domains_2.gff3" @@ -159,6 +164,7 @@ grR[x]), mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE ) + cat('done.\n') good_TE <- TE[!sapply(TE, is.null)] @@ -177,14 +183,13 @@ src <- as.character(gff3_out$source) src[is.na(src)] <- "dante_ltr" gff3_out$source <- src - -# TODO export all files to single directory -# TODO export individual groups DL, DLT, DLP DLPT gff3 +gff3_out$Rank <- get_te_rank(gff3_out) +# TODO add attributte specifying individual groups DL, DLT, DLP DLPT gff3 export(gff3_out, con = paste0(outfile, ".gff3"), format = 'gff3') -# summary statistics + all_tbl <- get_te_statistics(gff3_out, RT) -write.table(all_tbl, file = paste0(outfile, "_statistics.csv"), sep = "\t", quote = FALSE, row.names = FALSE) +write.table(all_tbl, file = paste0(outfile, "_statistics.csv"), sep = "\t", quote = FALSE, row.names = TRUE) # export fasta files: s_te <- get_te_sequences(gff3_out, s)