# HG changeset patch
# User petr-novak
# Date 1652099851 0
# Node ID 93d35ae65e1b810756ae729e8ef47d0e62962ffc
# Parent  6ae4a341d1f3a6dcc8c084a0ebb71eee21e9ac11
"planemo upload commit 57a4f4a749b60b4e1d992dc3a879add7bb4bb56b"

diff -r 6ae4a341d1f3 -r 93d35ae65e1b clean_ltr.R
--- a/clean_ltr.R	Tue May 03 12:38:12 2022 +0000
+++ b/clean_ltr.R	Mon May 09 12:37:31 2022 +0000
@@ -76,6 +76,9 @@
 cat("done\n")
 # testing
 if (FALSE) {
+  g <- rtracklayer::import("./test_data/big_test_data/dante_ltr_unfiltered_t.cacao.gff3")
+  s <- readDNAStringSet("./test_data/big_test_data/T_cacao_chromosomes.fasta")
+
   g <- rtracklayer::import("./test_data/sample_ltr_annotation.gff3")
   s <- readDNAStringSet("./test_data/sample_genome.fasta")
 
@@ -88,6 +91,12 @@
   source("./R/ltr_utils.R")
 }
 
+## ID in g must be unique - this could be a problem if gff is concatenated from multiple files!
+## id ID is renamed - rename parent to!
+## add chromosom index to disctinguish same IDs
+suffix <- as.numeric(seqnames(g))
+g$ID <- ifelse(is.na(g$ID), NA, paste0(g$ID,"_", suffix))
+g$Parent <- ifelse(is.na(g$Parent), NA, paste0(g$Parent,"_", suffix))
 
 # get te sequence based on rank
 
@@ -101,11 +110,9 @@
 # TE rank 2:
 TE_DLT_plus_DLP_info <- analyze_TE(c(s_te$DLP, s_te$DLT), word_size = word_size, ncpus
   = ncpus)
-TE_DLT_plus_DLP_info_DLTP_verified <- compare_TE_datasets(c(s_te$DLT, s_te$DLP), ncpus
-  = ncpus,
-                                                          TE_DLTP_info$seqs_representative,
-                                                          word_size = word_size
-)
+  TE_DLT_plus_DLP_info_DLTP_verified <- compare_TE_datasets(c(s_te$DLT, s_te$DLP), ncpus
+    = ncpus,TE_DLTP_info$seqs_representative, word_size = word_size
+  )
 
 TE_DLT_plus_DLP_info_multiplicity <- verify_based_on_multiplicity(TE_DLT_plus_DLP_info)
 # create additional library from rank 2 verified by multiplicity
@@ -133,11 +140,11 @@
   }
 }
 
-# TE  rank 3
+# TE  rank 3 - verify agains good DLTP
 TE_DL_info_DLTP_verified <- compare_TE_datasets(
   s_te$DL,
-  TE_DLTP_info$seqs_representative, min_coverage = 0.98,
-  ncpus = ncpus
+  TE_DLTP_info$seqs_representative, min_coverage = 0.95,
+  ncpus = ncpus, word_size = word_size
 )