dante_ltr: clean_ltr.R comparison

comparison clean_ltr.R @ 7:c33d6583e548 draft

"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"

author	petr-novak
date	Fri, 24 Jun 2022 14:19:48 +0000
parents	b91ca438a1cb
children	ff01d4263391

comparison

equal deleted inserted replaced

-:b91ca438a1cb
+:c33d6583e548
 if (FALSE) {
 g <- rtracklayer::import("./test_data/big_test_data/dante_ltr_unfiltered_t.cacao.gff3")
 s <- readDNAStringSet("./test_data/big_test_data/T_cacao_chromosomes.fasta")
 g <- rtracklayer::import("./test_data/sample_ltr_annotation.gff3")
+g <- rtracklayer::import("./test_data/sample_DANTE_LTR_annotation.gff3")
 s <- readDNAStringSet("./test_data/sample_genome.fasta")
 g <- rtracklayer::import("./test_data/DANTE_LTR_Vfaba_chr5.gff3")
 s <- readDNAStringSet("./test_data/211010_Vfaba_chr5.fasta")
 names(s) <- gsub(" .+", "", names(s))
 g$ID <- ifelse(is.na(g$ID), NA, paste0(g$ID,"_", suffix))
 g$Parent <- ifelse(is.na(g$Parent), NA, paste0(g$Parent,"_", suffix))
 # get te sequence based on rank
+# best quality - split by lineage
+s_te <- get_te_sequences(g, s)  # split by 'element quality'
 # evaluate best TE -  DLTP grou
-s_te <- get_te_sequences(g, s)  # split by 'element quality'
-# best quality - split by lineage
+# comparison parameters
-word_size <- 15
+word_size <- 11
+task <- "blastn"
+perc_identity <- 80
 # best TE
-TE_DLTP_info <- analyze_TE(s_te$DLTP, word_size = word_size, ncpus = ncpus)
+TE_DLTP_info <- analyze_TE(s_te$DLTP,
+word_size = word_size,
+ncpus = ncpus,
+perc_identity = perc_identity,
+task = task)
 # TE rank 2:
-TE_DLT_plus_DLP_info <- analyze_TE(c(s_te$DLP, s_te$DLT), word_size = word_size, ncpus
+TE_DLT_plus_DLP_info <- analyze_TE(c(s_te$DLP, s_te$DLT),
-= ncpus)
+word_size = word_size,
-TE_DLT_plus_DLP_info_DLTP_verified <- compare_TE_datasets(c(s_te$DLT, s_te$DLP), ncpus
+ncpus = ncpus,
-= ncpus,TE_DLTP_info$seqs_representative, word_size = word_size
+perc_identity = perc_identity,
-)
+task = task)
+TE_D_plus_DL_info <- analyze_TE(c(s_te$DL, s_te$D),
+word_size = word_size,
+ncpus = ncpus,
+perc_identity = perc_identity,
+task = task)
+TE_DLT_plus_DLP_info_DLTP_verified <- compare_TE_datasets(
+c(s_te$DLT, s_te$DLP),
+ncpus = ncpus,
+TE_DLTP_info$seqs_representative,
+word_size = word_size,
+perc_identity = perc_identity,
+task = task)
 TE_DLT_plus_DLP_info_multiplicity <- verify_based_on_multiplicity(TE_DLT_plus_DLP_info)
-# create additional library from rank 2 verified by multiplicity
+TE_D_plus_DL_info_multiplicity <- verify_based_on_multiplicity(TE_D_plus_DL_info)
-id_for_additional_library <- setdiff(
+# create additional library from rank 2 verified by multiplicity and DLTP
+id_for_additional_library <- union(
 TE_DLT_plus_DLP_info_multiplicity$id_ok_mp_verified,
 TE_DLT_plus_DLP_info_DLTP_verified$id_ok_verified)
 if (length(id_for_additional_library) > 1) {
 seqs_for_additional_library <- c(s_te$DLP, s_te$DLT)[names(c(s_te$DLP, s_te$DLT)) %in%
 id_for_additional_library]
 seqs_additional_info <- analyze_TE(seqs_for_additional_library, word_size =
 word_size, ncpus = ncpus)
-seq_representative <- c(
+seqs_representative <- c(
 TE_DLTP_info$seqs_representative,
 seqs_additional_info$seqs_representative
 )
 }else {
 if (length(id_for_additional_library) == 1) {
-seq_representative <- c(
+seqs_representative <- c(
 TE_DLTP_info$seqs_representative,
 c(s_te$DLP, s_te$DLT)[names(c(s_te$DLP, s_te$DLT)) %in% id_for_additional_library]
 )
 }else {
-seq_representative <- TE_DLTP_info$seqs_representative
+seqs_representative <- TE_DLTP_info$seqs_representative
 }
 }
 # TE  rank 3 - verify agains good DLTP
 TE_DL_info_DLTP_verified <- compare_TE_datasets(
 s_te$DL,
-TE_DLTP_info$seqs_representative, min_coverage = 0.95,
+seqs_representative, min_coverage = 0.90,
-ncpus = ncpus, word_size = word_size
+ncpus = ncpus,
-)
+word_size = word_size,
+task = task,
+perc_identity = perc_identity)
-R <- seq_diversity(seq_representative)$richness
-SI <- seq_diversity(seq_representative)$shannon_index
+TE_D_info_DLTP_verified <- compare_TE_datasets(
+s_te$D,
+seqs_representative, min_coverage = 0.90,
+ncpus = ncpus,
+word_size = word_size,
+task = task,
+perc_identity = perc_identity)
+R <- seq_diversity(seqs_representative)$richness
+SI <- seq_diversity(seqs_representative)$shannon_index
 # final RM library:
-seq_representative_no_ssr <- seq_representative[R > 20]
+seqs_representative_no_ssr <- seqs_representative[R > 20]
 ID <- g$ID[g$type == "transposable_element"]
 names(ID) <- paste0(seqnames(g), "_",
 start(g), "_",
 end(g), "#",
 # create clean gff3
 id_of_good_te <- unique(c(
 TE_DLTP_info$te_conflict_info$ok,
 TE_DLT_plus_DLP_info_DLTP_verified$id_ok_verified,
 TE_DLT_plus_DLP_info_multiplicity$id_ok_mp_verified,
-TE_DL_info_DLTP_verified$id_ok_verified)
+TE_DL_info_DLTP_verified$id_ok_verified,
+TE_D_info_DLTP_verified$id_ok_verified,
+TE_D_plus_DL_info_multiplicity$id_ok_mp_verified),
 )
 c1 <- g$ID %in% ID[id_of_good_te]
 c2 <- sapply(g$Parent, function(x)ifelse(length(x) == 0, "", x)) %in% ID[id_of_good_te]
 gff_out <- g[c1 | c2]
 gff_te <- gff_out[gff_out$type %in% "transposable_element"]
+# remove partial elements
+gff_te_with_ltr <- gff_out[gff_out$type %in% "transposable_element" & gff_out$Rank != "D"]
 gff_5ltr <- gff_out[gff_out$LTR %in% "5LTR"]
 gff_3ltr <- gff_out[gff_out$LTR %in% "3LTR"]
 full_te <- getSeqNamed(s, gff_te)
 names(full_te) <- paste0(gff_te$ID,":",names(full_te))
 ltr5 <-  getSeqNamed(s, gff_5ltr)
 names(ltr5) <-  paste0(gff_5ltr$Parent,":",names(ltr5))
 ltr3 <-  getSeqNamed(s, gff_3ltr)
 names(ltr3) <- paste0(gff_3ltr$Parent,":",names(ltr3))
-inc <- gff_te$Rank != "DL"
+inc <- gff_te_with_ltr$Rank != "DL"
-writeXStringSet(seq_representative, paste0(opt$output, "_RM_lib_non_redundant.fasta"))
+writeXStringSet(seqs_representative, paste0(opt$output, "_RM_lib_non_redundant.fasta"))
 writeXStringSet(full_te, paste0(opt$output, "_RM_lib_full_TE.fasta"))
 writeXStringSet(ltr5, paste0(opt$output, "_RM_lib_5LTR.fasta"))
 writeXStringSet(ltr3, paste0(opt$output, "_RM_lib_3LTR.fasta"))
 export(gff_out, paste0(opt$output, "_clean.gff3"), format = "gff3")
-lv <- sort(unique(gff_te$Final_Classification))
+lv <- sort(unique(gff_te_with_ltr$Final_Classification))
-te_count <- table(factor(gff_te$Final_Classification, levels=lv))
+te_count <- table(factor(gff_te_with_ltr$Final_Classification, levels=lv))
 pdf(paste0(opt$output, "_summary.pdf"), width = 13, height=8, pointsize = 10)
 par(mfrow=c(1,2), mar=c(5,7,2,0), las=1)
-boxplot(width(gff_te) ~ factor(gff_te$Final_Classification, levels=lv),
+boxplot(width(gff_te_with_ltr) ~ factor(gff_te_with_ltr$Final_Classification, levels=lv),
 horizontal = TRUE, xlab="length[bp]", ylab="",
 names = paste0(gsub("^.+[|]", "", lv), " (", te_count, ")"),
 main = 'Full TE', at = seq_along(lv)*4
 )
-boxplot(width(gff_te[inc]) ~ factor(gff_te$Final_Classification[inc], levels=lv),
+boxplot(width(gff_te_with_ltr[inc]) ~ factor(gff_te_with_ltr$Final_Classification[inc], levels=lv),
 horizontal = TRUE, xlab="length[bp]", ylab="",
 names = rep("", length(lv)),
 main = 'Full TE', at = seq_along(lv)*4-1, add=TRUE, col=2
 )
 par(mar=c(5,0,2,7))

Mercurial > repos > petr-novak > dante_ltr

comparison clean_ltr.R @ 7:c33d6583e548 draft