changeset 2:f131886ea194 draft

"planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
author petr-novak
date Tue, 12 Apr 2022 12:55:32 +0000
parents c1498f679b50
children 6ae4a341d1f3
files R/ltr_utils.R clean_dante_ltr.xml dante_ltr_search.xml extract_putative_ltr.R
diffstat 4 files changed, 48 insertions(+), 18 deletions(-) [+]
line wrap: on
line diff
--- a/R/ltr_utils.R	Wed Mar 09 09:31:31 2022 +0000
+++ b/R/ltr_utils.R	Tue Apr 12 12:55:32 2022 +0000
@@ -92,22 +92,23 @@
   }
 }
 
+
+
 trim2TGAC <- function(bl) {
   for (i in 1:nrow(bl)) {
-    tg_L <- firstTG(bl$qseq[i])
-    tg_R <- firstTG(bl$sseq[i])
-    ca_L <- lastCA(bl$qseq[i])
-    ca_R <- lastCA(bl$sseq[i])
-    e_dist <- bl$length[i] - ca_R
-    no_match <- any(tg_L == 0, tg_R == 0, ca_L == 0, ca_R == 0)
+    cons <- consensusString(c(bl$qseq[i], bl$sseq[i]), ambiguityMap="?")
+    tg_P <- firstTG(cons)
+    ca_P <- lastCA(cons)
+    e_dist <- bl$length[i] - ca_P
+    max_dist = 50 # was 25
+    no_match <- any(tg_P == 0, ca_P == 0)
     if (!no_match &
-      tg_L == tg_R &
-      ca_L == ca_R &
-      tg_L < 8 &
-      e_dist < 8) {
+      tg_P < max_dist &
+      e_dist < max_dist) {
       ## trim alignment
-      bl[i,] <- trim_blast_table(bl[i,], tg_L, e_dist - 1)
+      bl[i,] <- trim_blast_table(bl[i,], tg_P, e_dist - 1)
     }
+
   }
   return(bl)
 }
@@ -566,6 +567,30 @@
   return(te)
 }
 
+
+get_te_rank <- function (gr){
+  DL_id <- gr$ID[gr$type == "transposable_element" &
+                  gr$TSD == "not_found" &
+                  is.na(gr$trna_id)]
+  DLT_id <- gr$ID[gr$type == "transposable_element" &
+                          gr$TSD != "not_found" &
+                          is.na(gr$trna_id)]
+  DLTP_id <- gr$ID[gr$type == "transposable_element" &
+                              gr$TSD != "not_found" &
+                              !is.na(gr$trna_id)]
+  DLP_id <- gr$ID[gr$type == "transposable_element" &
+                          gr$TSD == "not_found" &
+                          !is.na(gr$trna_id)]
+  Rank <- character(length(gr))
+  ID <- unlist(ifelse(is.na(gr$ID), gr$Parent, gr$ID))
+
+  Rank[ID %in% DL_id] <- "DL"
+  Rank[ID %in% DLT_id] <- "DLT"
+  Rank[ID %in% DLP_id] <- "DLP"
+  Rank[ID %in% DLTP_id] <- "DLTP"
+  return(Rank)
+}
+
 get_te_statistics <- function(gr, RT) {
   DOMAINS_LTR <- gr[gr$type == "transposable_element" &
                       gr$TSD == "not_found" &
--- a/clean_dante_ltr.xml	Wed Mar 09 09:31:31 2022 +0000
+++ b/clean_dante_ltr.xml	Tue Apr 12 12:55:32 2022 +0000
@@ -1,4 +1,4 @@
-<tool id="clean_dante_ltr" name="DANTE_LTR transposamble elements filtering" version="0.1.0" python_template_version="3.5">
+<tool id="clean_dante_ltr" name="DANTE_LTR transposamble elements filtering" version="0.1.4" python_template_version="3.5">
     <requirements>
 
         <requirement type="package">r-optparse</requirement>
--- a/dante_ltr_search.xml	Wed Mar 09 09:31:31 2022 +0000
+++ b/dante_ltr_search.xml	Tue Apr 12 12:55:32 2022 +0000
@@ -1,4 +1,4 @@
-<tool id="dante_ltr_search" name="DANTE_LTR transposable element identification" version="0.1.0" python_template_version="3.5">
+<tool id="dante_ltr_search" name="DANTE_LTR transposable element identification" version="0.1.4" python_template_version="3.5">
     <requirements>
         <requirement type="package">blast</requirement>
         <requirement type="package">r-optparse</requirement>
--- a/extract_putative_ltr.R	Wed Mar 09 09:31:31 2022 +0000
+++ b/extract_putative_ltr.R	Tue Apr 12 12:55:32 2022 +0000
@@ -72,6 +72,11 @@
   s <- readDNAStringSet("/mnt/ceph/454_data/Vicia_faba_assembly/assembly/ver_210910
   /fasta_parts/211010_Vfaba_chr5.fasta")
 
+  g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data/big_test_data//Cocoa_theobroma_DANTE_filtered.gff3")
+  s <- readDNAStringSet("/mnt/raid/users/petr/workspace/dante_ltr/test_data/big_test_data/Cocoa_theobroma_chr1.fasta.gz")
+
+  source("R/ltr_utils.R")
+
   g <- rtracklayer::import("./test_data/sample_DANTE.gff3")
   s <- readDNAStringSet("./test_data/sample_genome.fasta")
   outfile <- "/mnt/raid/users/petr/workspace/ltr_finder_test/te_with_domains_2.gff3"
@@ -159,6 +164,7 @@
                                                 grR[x]),
                mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE
 )
+
 cat('done.\n')
 
 good_TE <- TE[!sapply(TE, is.null)]
@@ -177,14 +183,13 @@
 src <- as.character(gff3_out$source)
 src[is.na(src)] <- "dante_ltr"
 gff3_out$source <- src
-
-# TODO export all files to single directory
-# TODO export individual groups DL, DLT, DLP DLPT gff3
+gff3_out$Rank <- get_te_rank(gff3_out)
+# TODO add attributte specifying individual groups DL, DLT, DLP DLPT gff3
 
 export(gff3_out, con = paste0(outfile, ".gff3"), format = 'gff3')
-# summary statistics
+
 all_tbl <- get_te_statistics(gff3_out, RT)
-write.table(all_tbl, file = paste0(outfile, "_statistics.csv"), sep = "\t", quote = FALSE, row.names = FALSE)
+write.table(all_tbl, file = paste0(outfile, "_statistics.csv"), sep = "\t", quote = FALSE, row.names = TRUE)
 # export fasta files:
 s_te <- get_te_sequences(gff3_out, s)