diff clean_ltr.R @ 7:c33d6583e548 draft

"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
author petr-novak
date Fri, 24 Jun 2022 14:19:48 +0000
parents b91ca438a1cb
children ff01d4263391
line wrap: on
line diff
--- a/clean_ltr.R	Thu May 19 08:21:55 2022 +0000
+++ b/clean_ltr.R	Fri Jun 24 14:19:48 2022 +0000
@@ -80,6 +80,7 @@
   s <- readDNAStringSet("./test_data/big_test_data/T_cacao_chromosomes.fasta")
 
   g <- rtracklayer::import("./test_data/sample_ltr_annotation.gff3")
+  g <- rtracklayer::import("./test_data/sample_DANTE_LTR_annotation.gff3")
   s <- readDNAStringSet("./test_data/sample_genome.fasta")
 
   g <- rtracklayer::import("./test_data/DANTE_LTR_Vfaba_chr5.gff3")
@@ -100,23 +101,48 @@
 
 # get te sequence based on rank
 
-# evaluate best TE -  DLTP grou
+# best quality - split by lineage
 s_te <- get_te_sequences(g, s)  # split by 'element quality'
-# best quality - split by lineage
-word_size <- 15
+# evaluate best TE -  DLTP grou
+
+# comparison parameters
+word_size <- 11
+task <- "blastn"
+perc_identity <- 80
+
 # best TE
-TE_DLTP_info <- analyze_TE(s_te$DLTP, word_size = word_size, ncpus = ncpus)
+TE_DLTP_info <- analyze_TE(s_te$DLTP,
+                           word_size = word_size,
+                           ncpus = ncpus,
+                           perc_identity = perc_identity,
+                           task = task)
 
 # TE rank 2:
-TE_DLT_plus_DLP_info <- analyze_TE(c(s_te$DLP, s_te$DLT), word_size = word_size, ncpus
-  = ncpus)
-  TE_DLT_plus_DLP_info_DLTP_verified <- compare_TE_datasets(c(s_te$DLT, s_te$DLP), ncpus
-    = ncpus,TE_DLTP_info$seqs_representative, word_size = word_size
-  )
+TE_DLT_plus_DLP_info <- analyze_TE(c(s_te$DLP, s_te$DLT),
+                                   word_size = word_size,
+                                   ncpus = ncpus,
+                                   perc_identity = perc_identity,
+                                   task = task)
+
+TE_D_plus_DL_info <- analyze_TE(c(s_te$DL, s_te$D),
+                                word_size = word_size,
+                                ncpus = ncpus,
+                                perc_identity = perc_identity,
+                                task = task)
+
+TE_DLT_plus_DLP_info_DLTP_verified <- compare_TE_datasets(
+  c(s_te$DLT, s_te$DLP),
+  ncpus = ncpus,
+  TE_DLTP_info$seqs_representative,
+  word_size = word_size,
+  perc_identity = perc_identity,
+  task = task)
 
 TE_DLT_plus_DLP_info_multiplicity <- verify_based_on_multiplicity(TE_DLT_plus_DLP_info)
-# create additional library from rank 2 verified by multiplicity
-id_for_additional_library <- setdiff(
+TE_D_plus_DL_info_multiplicity <- verify_based_on_multiplicity(TE_D_plus_DL_info)
+
+# create additional library from rank 2 verified by multiplicity and DLTP
+id_for_additional_library <- union(
   TE_DLT_plus_DLP_info_multiplicity$id_ok_mp_verified,
   TE_DLT_plus_DLP_info_DLTP_verified$id_ok_verified)
 
@@ -125,34 +151,45 @@
                                                          id_for_additional_library]
   seqs_additional_info <- analyze_TE(seqs_for_additional_library, word_size =
     word_size, ncpus = ncpus)
-  seq_representative <- c(
+  seqs_representative <- c(
     TE_DLTP_info$seqs_representative,
     seqs_additional_info$seqs_representative
   )
 }else {
   if (length(id_for_additional_library) == 1) {
-    seq_representative <- c(
+    seqs_representative <- c(
       TE_DLTP_info$seqs_representative,
       c(s_te$DLP, s_te$DLT)[names(c(s_te$DLP, s_te$DLT)) %in% id_for_additional_library]
     )
   }else {
-    seq_representative <- TE_DLTP_info$seqs_representative
+    seqs_representative <- TE_DLTP_info$seqs_representative
   }
 }
 
 # TE  rank 3 - verify agains good DLTP
 TE_DL_info_DLTP_verified <- compare_TE_datasets(
   s_te$DL,
-  TE_DLTP_info$seqs_representative, min_coverage = 0.95,
-  ncpus = ncpus, word_size = word_size
-)
+  seqs_representative, min_coverage = 0.90,
+  ncpus = ncpus,
+  word_size = word_size,
+  task = task,
+  perc_identity = perc_identity)
+
+TE_D_info_DLTP_verified <- compare_TE_datasets(
+  s_te$D,
+  seqs_representative, min_coverage = 0.90,
+  ncpus = ncpus,
+  word_size = word_size,
+  task = task,
+  perc_identity = perc_identity)
 
 
-R <- seq_diversity(seq_representative)$richness
-SI <- seq_diversity(seq_representative)$shannon_index
+
+R <- seq_diversity(seqs_representative)$richness
+SI <- seq_diversity(seqs_representative)$shannon_index
 
 # final RM library:
-seq_representative_no_ssr <- seq_representative[R > 20]
+seqs_representative_no_ssr <- seqs_representative[R > 20]
 
 ID <- g$ID[g$type == "transposable_element"]
 names(ID) <- paste0(seqnames(g), "_",
@@ -166,7 +203,9 @@
   TE_DLTP_info$te_conflict_info$ok,
   TE_DLT_plus_DLP_info_DLTP_verified$id_ok_verified,
   TE_DLT_plus_DLP_info_multiplicity$id_ok_mp_verified,
-  TE_DL_info_DLTP_verified$id_ok_verified)
+  TE_DL_info_DLTP_verified$id_ok_verified,
+  TE_D_info_DLTP_verified$id_ok_verified,
+  TE_D_plus_DL_info_multiplicity$id_ok_mp_verified),
 )
 
 c1 <- g$ID %in% ID[id_of_good_te]
@@ -175,6 +214,10 @@
 gff_out <- g[c1 | c2]
 
 gff_te <- gff_out[gff_out$type %in% "transposable_element"]
+# remove partial elements
+gff_te_with_ltr <- gff_out[gff_out$type %in% "transposable_element" & gff_out$Rank != "D"]
+
+
 gff_5ltr <- gff_out[gff_out$LTR %in% "5LTR"]
 gff_3ltr <- gff_out[gff_out$LTR %in% "3LTR"]
 
@@ -184,26 +227,26 @@
 names(ltr5) <-  paste0(gff_5ltr$Parent,":",names(ltr5))
 ltr3 <-  getSeqNamed(s, gff_3ltr)
 names(ltr3) <- paste0(gff_3ltr$Parent,":",names(ltr3))
-inc <- gff_te$Rank != "DL"
+inc <- gff_te_with_ltr$Rank != "DL"
 
-writeXStringSet(seq_representative, paste0(opt$output, "_RM_lib_non_redundant.fasta"))
+writeXStringSet(seqs_representative, paste0(opt$output, "_RM_lib_non_redundant.fasta"))
 writeXStringSet(full_te, paste0(opt$output, "_RM_lib_full_TE.fasta"))
 writeXStringSet(ltr5, paste0(opt$output, "_RM_lib_5LTR.fasta"))
 writeXStringSet(ltr3, paste0(opt$output, "_RM_lib_3LTR.fasta"))
 
 export(gff_out, paste0(opt$output, "_clean.gff3"), format = "gff3")
 
-lv <- sort(unique(gff_te$Final_Classification))
-te_count <- table(factor(gff_te$Final_Classification, levels=lv))
+lv <- sort(unique(gff_te_with_ltr$Final_Classification))
+te_count <- table(factor(gff_te_with_ltr$Final_Classification, levels=lv))
 
 pdf(paste0(opt$output, "_summary.pdf"), width = 13, height=8, pointsize = 10)
 par(mfrow=c(1,2), mar=c(5,7,2,0), las=1)
-boxplot(width(gff_te) ~ factor(gff_te$Final_Classification, levels=lv),
+boxplot(width(gff_te_with_ltr) ~ factor(gff_te_with_ltr$Final_Classification, levels=lv),
         horizontal = TRUE, xlab="length[bp]", ylab="",
         names = paste0(gsub("^.+[|]", "", lv), " (", te_count, ")"),
         main = 'Full TE', at = seq_along(lv)*4
 )
-boxplot(width(gff_te[inc]) ~ factor(gff_te$Final_Classification[inc], levels=lv),
+boxplot(width(gff_te_with_ltr[inc]) ~ factor(gff_te_with_ltr$Final_Classification[inc], levels=lv),
         horizontal = TRUE, xlab="length[bp]", ylab="",
         names = rep("", length(lv)),
         main = 'Full TE', at = seq_along(lv)*4-1, add=TRUE, col=2