changeset 12:ff01d4263391 draft

"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
author petr-novak
date Thu, 21 Jul 2022 08:23:15 +0000
parents 54bd36973253
children 559940c04c44
files R/ltr_utils.R README.md clean_dante_ltr.xml clean_ltr.R dante_ltr_search.xml dante_ltr_workflow.png detect_putative_ltr.R detect_putative_ltr_wrapper.py extract_putative_ltr.R requirements.txt tests.sh
diffstat 11 files changed, 575 insertions(+), 288 deletions(-) [+]
line wrap: on
line diff
--- a/R/ltr_utils.R	Wed Jul 13 11:02:55 2022 +0000
+++ b/R/ltr_utils.R	Thu Jul 21 08:23:15 2022 +0000
@@ -1,7 +1,7 @@
 add_coordinates_of_closest_neighbor <- function(gff) {
   gff <- gff[order(seqnames(gff), as.vector(start(gff)))]
   # split to chromosomes:
-  gff_parts <- split(gff, seqnames(gff))
+  gff_parts <- split(gff, as.vector(seqnames(gff)))
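+  # splitting on the factor-valued seqnames directly would also create empty
+  # elements for unused seqlevels, hence the coercion with as.vector()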
   upstreams <- c(sapply(gff_parts, function(x) c(1, head(end(x), -1))))
   downstreams <- c(sapply(gff_parts, function(x) c(start(x)[-1], seqlengths(x)[runValue(seqnames(x[1]))])))
   gff_updated <- unlist(gff_parts)
--- a/README.md	Wed Jul 13 11:02:55 2022 +0000
+++ b/README.md	Thu Jul 21 08:23:15 2022 +0000
@@ -29,7 +29,7 @@
 ### Detection of complete LTR retrotransposons
 
 ```shell
-Usage: ./extract_putative_ltr.R COMMAND [OPTIONS]
+Usage: ./detect_putative_ltr.R COMMAND [OPTIONS]
 
 
 Options:
@@ -59,7 +59,7 @@
 
 ```shell
 mkdir -p tmp
-./extract_putative_ltr.R -g test_data/sample_DANTE.gff3 -s test_data/sample_genome.fasta -o tmp/ltr_annotation
+./detect_putative_ltr.R -g test_data/sample_DANTE.gff3 -s test_data/sample_genome.fasta -o tmp/ltr_annotation
 ```
 
####  Files in the output of `detect_putative_ltr.R`:
@@ -72,7 +72,35 @@
 - `prefix_DLT.fasta` - elements with **d**omains, **L**TR, **T**SD 
 - `prefix_statistics.csv` - number of elements in individual categories  
 
+For large genomes, use `detect_putative_ltr_wrapper.py`. This script splits the input fasta into smaller chunks and runs `detect_putative_ltr.R` on each chunk to limit memory usage; outputs are merged after all chunks are processed. A sketch of the chunking strategy follows the usage listing below.
 
+```shell
+usage: detect_putative_ltr_wrapper.py [-h] -g GFF3 -s REFERENCE_SEQUENCE -o
+                                      OUTPUT [-c CPU] [-M MAX_MISSING_DOMAINS]
+                                      [-L MIN_RELATIVE_LENGTH]
+                                      [-S MAX_CHUNK_SIZE]
+
+detect_putative_ltr_wrapper.py is a wrapper for detect_putative_ltr.R
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -g GFF3, --gff3 GFF3  gff3 file
+  -s REFERENCE_SEQUENCE, --reference_sequence REFERENCE_SEQUENCE
+                        reference sequence as fasta file
+  -o OUTPUT, --output OUTPUT
+                        output file path and prefix
+  -c CPU, --cpu CPU     number of CPUs
+  -M MAX_MISSING_DOMAINS, --max_missing_domains MAX_MISSING_DOMAINS
+  -L MIN_RELATIVE_LENGTH, --min_relative_length MIN_RELATIVE_LENGTH
+                        Minimum relative length of protein domain to be considered
+                        for retrotransposon detection
+  -S MAX_CHUNK_SIZE, --max_chunk_size MAX_CHUNK_SIZE
+                        If the size of the reference sequence is greater than
+                        this value, the reference is analyzed in chunks of
+                        approximately this size; sequences longer than this
+                        value are not split, default is 100000000
+```
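+
+A simplified sketch of the chunking strategy used by the wrapper (illustrative
+only; the sequence names and sizes below are made up):
+
+```python
+from itertools import cycle
+
+max_chunk_size = 20  # the tool's default is 100000000
+seq_sizes = {"chr1": 15, "chr2": 12, "chr3": 6, "chr4": 3}
+
+# longest sequences first, dealt round-robin into the chunk files
+order = sorted(seq_sizes, key=seq_sizes.get, reverse=True)
+n_chunks = min(sum(seq_sizes.values()) // max_chunk_size + 1, len(order))
+assignment = dict(zip(order, cycle(range(n_chunks))))
+print(assignment)  # {'chr1': 0, 'chr2': 1, 'chr3': 0, 'chr4': 1}
+```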
 
 ### Validation of LTR retrotransposons detected in the previous step:
 
--- a/clean_dante_ltr.xml	Wed Jul 13 11:02:55 2022 +0000
+++ b/clean_dante_ltr.xml	Thu Jul 21 08:23:15 2022 +0000
@@ -1,4 +1,4 @@
-<tool id="clean_dante_ltr" name="DANTE_LTR retrotransposons filtering" version="0.1.6.1" python_template_version="3.5">
+<tool id="clean_dante_ltr" name="DANTE_LTR retrotransposons filtering" version="0.1.7.0" python_template_version="3.5">
     <requirements>
 
         <requirement type="package">r-optparse</requirement>
@@ -34,7 +34,7 @@
                $dante_ltr.hid and reference $reference.hid"/>
         <data name="rm_lib" format="fasta" label="Non-redundant library of LTR retrotransposons (FASTA) based on annotation $dante_ltr.hid and reference $reference.hid"/>
 
-        <data name="te_full" format="fasta" label="Library of full length LTR retrotransposons (FASTA) based on annotation $dante_ltr.hid and reference $reference.hid"/>
+        <data name="te_full" format="fasta" label="Library of LTR retrotransposons (FASTA) based on annotation $dante_ltr.hid and reference $reference.hid"/>
 
         <data name="ltr5" format="fasta" label="Library of 5'LTR of retrotransposons (FASTA) based on annotation $dante_ltr.hid and reference $reference.hid"/>
 
--- a/clean_ltr.R	Wed Jul 13 11:02:55 2022 +0000
+++ b/clean_ltr.R	Thu Jul 21 08:23:15 2022 +0000
@@ -95,9 +95,12 @@
 ## ID in g must be unique - this could be a problem if gff is concatenated from multiple files!
 ## if the ID is renamed, the Parent attribute must be renamed too!
 ## add chromosome index to distinguish identical IDs
-suffix <- as.numeric(seqnames(g))
-g$ID <- ifelse(is.na(g$ID), NA, paste0(g$ID,"_", suffix))
-g$Parent <- ifelse(is.na(g$Parent), NA, paste0(g$Parent,"_", suffix))
+## do this only if IDs are not unique
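+## e.g. an ID "TE_00000001" occurring on chromosomes 1 and 3 becomes
+## "TE_00000001_1" and "TE_00000001_3" (the suffix is the seqnames factor index)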
+if (any(duplicated(na.omit(g$ID)))){
+  suffix <- as.numeric(seqnames(g))
+  g$ID <- ifelse(is.na(g$ID), NA, paste0(g$ID,"_", suffix))
+  g$Parent <- ifelse(is.na(g$Parent), NA, paste0(g$Parent,"_", suffix))
+}
 
 # get te sequence based on rank
 
--- a/dante_ltr_search.xml	Wed Jul 13 11:02:55 2022 +0000
+++ b/dante_ltr_search.xml	Thu Jul 21 08:23:15 2022 +0000
@@ -1,14 +1,16 @@
-<tool id="dante_ltr_search" name="DANTE_LTR retrotransposon identification" version="0.1.6.1" python_template_version="3.5">
+<tool id="dante_ltr_search" name="DANTE_LTR retrotransposon identification" version="0.1.7.0" python_template_version="3.5">
     <requirements>
         <requirement type="package">blast</requirement>
         <requirement type="package">r-optparse</requirement>
         <requirement type="package">bioconductor-bsgenome</requirement>
         <requirement type="package">bioconductor-biostrings</requirement>
         <requirement type="package">bioconductor-rtracklayer</requirement>
+        <requirement type="package" version="3.7.12">python</requirement>
 
     </requirements>
     <command detect_errors="exit_code"><![CDATA[
-        Rscript ${__tool_directory__}/extract_putative_ltr.R --gff3 '$dante' --reference_sequence '$reference' -M $max_missing --output output --cpu 32
+        python ${__tool_directory__}/detect_putative_ltr_wrapper.py --gff3 '$dante'
+          --reference_sequence '$reference' -M $max_missing --output output --cpu 32
         &&
         mv output.gff3 $te_ltr_gff
         &&
@@ -31,7 +33,7 @@
         5' and 3' Long Terminal Repeats, Target Site Duplication (TSD) and primer binding site (PBS).
 
 All identified elements contain a set of protein domains as defined in
-        REXdb_.Based on the results of detection of structural features,
+        REXdb_. Based on the results of detection of structural features,
 elements fall into five categories:
 
         - elements with domains, 5'LTR, 3'LTR, TSD and PBS - rank DLTP
@@ -42,5 +44,10 @@
 
        .. _REXdb: https://doi.org/10.1186/s13100-018-0144-1
 
+      The principles of LTR retrotransposon detection are described in the github_ repository of this tool.
+
+      .. _github: https://github.com/kavonrtep/dante_ltr
+
+
     ]]></help>
 </tool>
\ No newline at end of file
Binary file dante_ltr_workflow.png has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/detect_putative_ltr.R	Thu Jul 21 08:23:15 2022 +0000
@@ -0,0 +1,283 @@
+#!/usr/bin/env Rscript
+initial_options <- commandArgs(trailingOnly = FALSE)
+file_arg_name <- "--file="
+script_name <- normalizePath(sub(file_arg_name, "", initial_options[grep(file_arg_name, initial_options)]))
+script_dir <- dirname(script_name)
+library(optparse)
+
+parser <- OptionParser()
+option_list <- list(
+  make_option(c("-g", "--gff3"), action = "store", type = "character",
+              help = "gff3 with dante results", default = NULL),
+  make_option(c("-s", "--reference_sequence"), action = "store", type = "character",
+              help = "reference sequence as fasta", default = NULL),
+  make_option(c("-o", "--output"), action = "store", type = "character",
+              help = "output file path and prefix", default = NULL),
+  make_option(c("-c", "--cpu"), type = "integer", default = 5,
+              help = "Number of cpu to use [default %default]", metavar = "number"),
+  make_option(c("-M", "--max_missing_domains"), type = "integer", default = 0,
+              help = "Maximum number of missing domains is retrotransposon [default %default]",
+              metavar = "number"),
+  make_option(c("-L", "--min_relative_length"), type = "numeric", default = 0.6,
+              help = "Minimum relative length of protein domain to be considered for retrostransposon detection [default %default]",
+              metavar = "number")
+
+)
+description <- paste(strwrap(""))
+
+epilogue <- ""
+parser <- OptionParser(option_list = option_list, epilogue = epilogue, description = description,
+                       usage = "usage: %prog COMMAND [OPTIONS]")
+opt <- parse_args(parser, args = commandArgs(TRUE))
+
+
+# load packages
+suppressPackageStartupMessages({
+  library(rtracklayer)
+  library(Biostrings)
+  library(BSgenome)
+  library(parallel)
+})
+
+
+# CONFIGURATION
+OFFSET <- 15000
+# load configuration files and functions:
+lineage_file <- paste0(script_dir, "/databases/lineage_domain_order.csv")
+FDM_file <- paste0(script_dir, "/databases/feature_distances_model.RDS")
+trna_db <- paste0(script_dir, "/databases/tRNAscan-SE_ALL_spliced-no_plus-old-tRNAs_UC_unique-3ends.fasta")
+ltr_utils_r <- paste0(script_dir, "/R/ltr_utils.R")
+if (file.exists(lineage_file) & file.exists(trna_db)) {
+  lineage_info <- read.table(lineage_file, sep = "\t", header = TRUE, as.is = TRUE)
+  FDM <- readRDS(FDM_file)
+  source(ltr_utils_r)
+}else {
+  # this location is used when the tool is installed via conda
+  lineage_file <- paste0(script_dir, "/../share/dante_ltr/databases/lineage_domain_order.csv")
+  FDM_file <- paste0(script_dir, "/../share/dante_ltr/databases/feature_distances_model.RDS")
+  trna_db <- paste0(script_dir, "/../share/dante_ltr/databases/tRNAscan-SE_ALL_spliced-no_plus-old-tRNAs_UC_unique-3ends.fasta")
+  ltr_utils_r <- paste0(script_dir, "/../share/dante_ltr/R/ltr_utils.R")
+  if (file.exists(lineage_file) & file.exists((trna_db))) {
+    lineage_info <- read.table(lineage_file, sep = "\t", header = TRUE, as.is = TRUE)
+    source(ltr_utils_r)
+    FDM <- readRDS(FDM_file)
+  } else {
+    stop("configuration files not found")
+  }
+}
+
+
+# for testing
+if (FALSE) {
+  g <- rtracklayer::import("/mnt/raid/454_data/cuscuta/Ceuropea_assembly_v4/0_final_asm_hifiasm+longstitch/repeat_annotation/DANTE_on_CEUR_filtered_short_names.gff3")
+  s <- readDNAStringSet("/mnt/raid/454_data/cuscuta/Ceuropea_assembly_v4/0_final_asm_hifiasm+longstitch/asm.bp.p.ctg_scaffolds.short_names.fa")
+  lineage_info <- read.table("/mnt/raid/users/petr/workspace/ltr_finder_test/lineage_domain_order.csv", sep = "\t", header = TRUE, as.is = TRUE)
+
+  g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data/sample_DANTE_unfiltered.gff3")
+  g <- rtracklayer::import("/mnt/raid/users/petr/workspace/ltr_finder_test/test_data/DANTE_filtered_part.gff3")
+  s <- readDNAStringSet("/mnt/raid/users/petr/workspace/ltr_finder_test/test_data/Rbp_part.fa")
+
+  # oriza
+  g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data/big_test_data/DANTE_full_oryza.gff3")
+  s <- readDNAStringSet("/mnt/raid/users/petr/workspace/dante_ltr/test_data/big_test_data/o_sativa_msu7.0.fasta")
+
+  g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data
+  /DANTE_Vfaba_chr5.gff3")
+  s <- readDNAStringSet("/mnt/ceph/454_data/Vicia_faba_assembly/assembly/ver_210910
+  /fasta_parts/211010_Vfaba_chr5.fasta")
+
+  g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data/big_test_data//Cocoa_theobroma_DANTE_filtered.gff3")
+  s <- readDNAStringSet("/mnt/raid/users/petr/workspace/dante_ltr/test_data/big_test_data/Cocoa_theobroma_chr1.fasta.gz")
+  # test on bigger data:
+
+  g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data/tmp/DANTE_unfiltered/1.gff3")
+  s <- readDNAStringSet("/mnt/raid/users/petr/workspace/dante_ltr/test_data/tmp/fasta_parts/1.fasta")
+
+  source("R/ltr_utils.R")
+  ## feature distance model
+  FDM <- readRDS("./databases/feature_distances_model.RDS")
+  g <- rtracklayer::import("./test_data/sample_DANTE.gff3")
+  s <- readDNAStringSet("./test_data/sample_genome.fasta")
+  outfile <- "/mnt/raid/users/petr/workspace/ltr_finder_test/te_with_domains_2.gff3"
+  lineage_info <- read.table("databases/lineage_domain_order.csv", sep = "\t", header =
+    TRUE, as.is = TRUE)
+  trna_db <- "./databases/tRNAscan-SE_ALL_spliced-no_plus-old-tRNAs_UC_unique-3ends.fasta"
+  opt <- list(min_relative_length=0.6, cpu = 8)
+
+}
+
+# MAIN #############################################################
+
+# load data:
+
+cat("reading gff...")
+g <- rtracklayer::import(opt$gff3, format = 'gff3')  # DANTE gff3
+cat("done\n")
+cat("reading fasta...")
+s <- readDNAStringSet(opt$reference_sequence)  # genome assembly
+cat("done\n")
+outfile <- opt$output
+# clean sequence names:
+names(s) <- gsub(" .+", "", names(s))
+lineage_domain <- lineage_info$Domains.order
+lineage_domain_span <- lineage_info$domain_span
+lineage_ltr_mean_length <- lineage_info$ltr_length
+lineage_offset5prime <- lineage_info$offset5prime
+lineage_offset3prime <- lineage_info$offset3prime
+ln <- gsub("ss/I", "ss_I", gsub("_", "/", gsub("/", "|", lineage_info$Lineage)))
+names(lineage_offset3prime) <-  ln
+names(lineage_offset5prime) <-  ln
+names(lineage_domain) <- ln
+names(lineage_domain_span) <- ln
+names(lineage_ltr_mean_length) <- ln
+lineage_domains_sequence <- unlist(mapply(function(d,l) {
+  paste(strsplit(d, " ")[[1]], ":", l, sep = "")
+}, d = lineage_domain, l = names(lineage_domain)))
+
+# filter g gff3
+g <- dante_filtering(g, Relative_Length = opt$min_relative_length) # default
+
+seqlengths(g) <- seqlengths(s)[names(seqlengths(g))]
+g <- add_coordinates_of_closest_neighbor(g)
+
+# add info about domain order:
+g$domain_order <- as.numeric(factor(paste(g$Name, g$Final_Classification, sep = ":"), levels = lineage_domains_sequence))
+# set NA to 0 in g$domain_order (some domains are not from Class I elements)
+g$domain_order[is.na(g$domain_order)] <- 0
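+# e.g. a key "RT:Class_I|LTR|..." maps to the position of RT in that lineage's
+# expected domain order; keys with no match get NA (reset to 0 above)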
+
+# NOTE - some operations are faster on a GRangesList, others on a list of data.frames
+# this is the primary clustering
+cls <- get_domain_clusters(g)
+gcl <- split(as.data.frame(g), cls)
+# gcl_as_GRL <- split(g, cls)  # delete?
+
+cls_alt <- get_domain_clusters_alt(g, FDM)
+g$Cluster <- as.numeric(factor(cls_alt))
+
+gcl_alt <- split(as.data.frame(g), cls_alt)
+
+TE_partial <-  GRanges(seqnames =  sapply(gcl_alt, function(x) x$seqnames[1]),
+                       Name = sapply(gcl_alt, function(x) x$Final_Classification[1]),
+                       Final_Classification = sapply(gcl_alt, function(x) x$Final_Classification[1]),
+                       ID = sapply(gcl_alt, function(x) paste0("TE_partial_", sprintf("%08d", x$Cluster[1]))),
+                       strand = sapply(gcl_alt, function(x) x$strand[1]),
+                       Ndomains = sapply(gcl_alt, function(x) nrow(x)),
+                       type = "transposable_element",
+                       source = "dante_ltr",
+                       Rank="D",
+                       IRanges(start = sapply(gcl_alt, function(x) min(x$start)),
+                               end = sapply(gcl_alt, function(x) max(x$end)))
+)
+g$Ndomains_in_cluster <- count_occurences_for_each_element(g$Cluster)
+g$Parent <- paste0("TE_partial_", sprintf("%08d", g$Cluster))
+g$Rank <- "D"
+
+# keep only partial TE with more than one domain
+TE_partial_with_more_than_one_domain <- TE_partial[TE_partial$Ndomains > 1]
+g_with_more_than_one_domain <- g[as.vector(g$Ndomains_in_cluster > 1)]
+
+# first filtering - remove TEs with a low number of domains
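+# complete elements carry five protein domains (GAG, PROT, INT, RT, RH in a
+# lineage-specific order), hence the 5 - opt$max_missing_domains threshold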
+gcl_clean <- clean_domain_clusters(gcl, lineage_domain_span, min_domains = 5 - opt$max_missing_domains)
+
+# gcl annotation
+gcl_clean_lineage <- sapply(gcl_clean, function(x)  x$Final_Classification[1])
+gcl_clean_domains <- sapply(gcl_clean, function(x) ifelse(x$strand[1] == "-",
+                                                paste(rev(x$Name), collapse = " "),
+                                                paste(x$Name, collapse = " "))
+)
+
+# compare detected domains with domains in lineages from REXdb database
+dd <- mapply(domain_distance,
+             d_query = gcl_clean_domains,
+             d_reference = lineage_domain[gcl_clean_lineage])
+
+# get lineages which have the correct number and order of domains
+# gcl_clean2 <- gcl_clean[gcl_clean_domains == lineage_domain[gcl_clean_lineage]]
+gcl_clean2 <- gcl_clean[dd <= opt$max_missing_domains]
+
+gcl_clean_with_domains <- gcl_clean2[check_ranges(gcl_clean2, s)]
+gr <- get_ranges(gcl_clean_with_domains)
+
+
+cat('Number of analyzed regions:\n')
+cat('Total number of domain clusters             : ', length(gcl), '\n')
+cat('Number of clean clusters                    : ', length(gcl_clean), '\n')
+cat('Number of clusters with complete domain set : ', length(gcl_clean_with_domains), '\n')
+
+
+te_strand <- sapply(gcl_clean_with_domains, function(x)x$strand[1])
+te_lineage <- sapply(gcl_clean_with_domains, function(x)x$Final_Classification[1])
+
+max_left_offset <- ifelse(te_strand == "+", lineage_offset5prime[te_lineage], lineage_offset3prime[te_lineage])
+max_right_offset <- ifelse(te_strand == "-", lineage_offset5prime[te_lineage], lineage_offset3prime[te_lineage])
+
+grL <- get_ranges_left(gcl_clean_with_domains, max_left_offset)
+grR <- get_ranges_right(gcl_clean_with_domains, max_right_offset)
+
+s_left <- getSeq(s, grL)
+s_right <- getSeq(s, grR)
+
+expected_ltr_length <- lineage_ltr_mean_length[sapply(gcl_clean_with_domains, function (x)x$Final_Classification[1])]
+# for statistics
+RT <- g[g$Name == "RT" & substring(g$Final_Classification, 1, 11) == "Class_I|LTR"]
+# cleanup
+#rm(g)
+rm(gcl)
+rm(gcl_clean)
+rm(gcl_clean2)
+
+names(te_strand) <- paste(seqnames(gr), start(gr), end(gr), sep = "_")
+names(s_left) <- paste(seqnames(grL), start(grL), end(grL), sep = "_")
+names(s_right) <- paste(seqnames(grR), start(grR), end(grR), sep = "_")
+cat('Identification of LTRs...')
+TE <- mclapply(seq_along(gr), function(x)get_TE(s_left[x],
+                                                s_right[x],
+                                                gcl_clean_with_domains[[x]],
+                                                gr[x],
+                                                grL[x],
+                                                grR[x],
+                                                expected_ltr_length[x]),
+               mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE
+)
+
+cat('done.\n')
+
+good_TE <- TE[!sapply(TE, is.null)]
+cat('Number of putative TE with identified LTR   :', length(good_TE), '\n')
+
+# TODO - extend TE region to also cover the TSD
+ID <- paste0('TE_', sprintf("%08d", seq(good_TE)))
+gff3_list <- mcmapply(get_te_gff3, g = good_TE, ID = ID, mc.cores = opt$cpu)
+
+cat('Identification of PBS ...')
+gff3_list2 <- mclapply(gff3_list, FUN = add_pbs, s = s, trna_db = trna_db, mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE)
+cat('done\n')
+gff3_out <- do.call(c, gff3_list2)
+
+# define new source
+src <- as.character(gff3_out$source)
+src[is.na(src)] <- "dante_ltr"
+gff3_out$source <- src
+gff3_out$Rank <- get_te_rank(gff3_out)
+
+# add partial TEs, but first remove all overlapping ones
+TE_partial_parent_part <- TE_partial_with_more_than_one_domain[TE_partial_with_more_than_one_domain %outside% gff3_out]
+TE_partial_domain_part <-  g[g$Parent %in% TE_partial_parent_part$ID]
+
+gff3_out <- sort(c(gff3_out, TE_partial_domain_part, TE_partial_parent_part), by = ~ seqnames * start)
+# modify ID and Parent - add seqname; this ensures IDs remain unique when the analysis is done on chunk level
+gff3_out$ID[!is.na(gff3_out$ID)] <- paste0(gff3_out$ID[!is.na(gff3_out$ID)], "_", seqnames(gff3_out)[!is.na(gff3_out$ID)])
+gff3_out$Parent[!is.na(gff3_out$Parent)] <- paste0(gff3_out$Parent[!is.na(gff3_out$Parent)], "_", seqnames(gff3_out)[!is.na(gff3_out$Parent)])
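+# e.g. ID "TE_00000001" on sequence "chr1" becomes "TE_00000001_chr1"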
+
+export(gff3_out, con = paste0(outfile, ".gff3"), format = 'gff3')
+
+all_tbl <- get_te_statistics(gff3_out, RT)
+all_tbl <- cbind(Classification = rownames(all_tbl), all_tbl)
+write.table(all_tbl, file = paste0(outfile, "_statistics.csv"), sep = "\t", quote = FALSE, row.names = FALSE)
+# export fasta files:
+s_te <- get_te_sequences(gff3_out, s)
+for (i in seq_along(s_te)) {
+  outname <- paste0(outfile, "_", names(s_te)[i], ".fasta")
+  writeXStringSet(s_te[[i]], filepath = outname)
+}
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/detect_putative_ltr_wrapper.py	Thu Jul 21 08:23:15 2022 +0000
@@ -0,0 +1,224 @@
+#!/usr/bin/env python
+"""This wrapper is intended to be used on large genomes and large DANTE input to
+minimize memory usage, It splits input files to pieces and analyze it on by one by
+detect_putative_ltr.R
+If input does not exceed memory limit, it will run detect_putative_ltr.R directly
+"""
+
+import argparse
+import os
+import sys
+import tempfile
+from itertools import cycle
+import subprocess
+
+
+def get_arguments():
+    """
+    Get arguments from command line
+    :return:
+    args
+    """
+    parser = argparse.ArgumentParser(
+        description="""detect_putative_ltr_wrapper.py is a wrapper for 
+    detect_putative_ltr.R""", formatter_class=argparse.RawTextHelpFormatter
+        )
+    parser.add_argument(
+        '-g', '--gff3', default=None, required=True, help="gff3 file", type=str,
+        action='store'
+        )
+    parser.add_argument(
+        '-s', '--reference_sequence', default=None, required=True,
+        help="reference sequence as fasta file", type=str, action='store'
+        )
+    parser.add_argument(
+        '-o', '--output', default=None, required=True, help="output file path and prefix",
+        type=str, action='store'
+        )
+    parser.add_argument(
+        '-c', '--cpu', default=1, required=False, help="number of CPUs", type=int,
+        action='store'
+        )
+    parser.add_argument(
+        '-M', '--max_missing_domains', default=0, required=False, type=int
+        )
+    parser.add_argument(
+        '-L', '--min_relative_length', default=0.6, required=False, type=float,
+        help="Minimum relative length of protein domain to be "
+             "considered for retrostransposon detection"
+        )
+    parser.add_argument(
+        '-S', '--max_chunk_size', default=100000000, required=False, type=int,
+        help='If the size of the reference sequence is greater than this value, '
+             'the reference is analyzed in chunks of approximately this size; '
+             'sequences longer than this value are not split, '
+             'default is %(default)s'
+        )
+    args = parser.parse_args()
+    return args
+
+
+def read_fasta_sequence_size(fasta_file):
+    """Read size of sequence into dictionary"""
+    fasta_dict = {}
+    with open(fasta_file, 'r') as f:
+        for line in f:
+            if line[0] == '>':
+                header = line.strip().split(' ')[0][1:]  # remove part of name after space
+                fasta_dict[header] = 0
+            else:
+                fasta_dict[header] += len(line.strip())
+    return fasta_dict
+
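+# Example (illustrative): for a fasta containing ">chr1 desc\nACGT\nAC\n>chr2\nGG\n",
+# read_fasta_sequence_size returns {'chr1': 6, 'chr2': 2}; anything after the
+# first space in a header line is dropped.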
+
+def make_temp_files(number_of_files):
+    """
+    Make named temporary files; the files will not be deleted upon exit!
+    :param number_of_files:
+    :return:
+    filepaths
+    """
+    temp_files = []
+    for i in range(number_of_files):
+        temp_files.append(tempfile.NamedTemporaryFile(delete=False).name)
+    return temp_files
+
+
+def sum_up_stats_files(files):
+    """
+    Sum up statistics files
+    :return:
+    """
+    new_statistics = {}
+    for file in files:
+        with open(file, 'r') as fh:
+            for line in fh:
+                items = line.strip().split('\t')
+                if items[0] == 'Classification':
+                    header = items
+                    continue
+                else:
+                    counts = [int(item) for item in items[1:]]
+                    if items[0] in new_statistics:
+                        new_statistics[items[0]] = [sum(x) for x in
+                                                    zip(new_statistics[items[0]], counts)]
+                    else:
+                        new_statistics[items[0]] = counts
+    # convert to string, first line is header
+    statistics_str = []
+    for classification, counts in new_statistics.items():
+        statistics_str.append(classification + '\t' + '\t'.join([str(x) for x in counts]))
+    sorted_stat_with_header = ['\t'.join(header)] + sorted(statistics_str)
+    return sorted_stat_with_header
+
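+# Example (illustrative): if two chunk statistics files both contain the line
+# "Ty1_copia\t2\t1" under a shared "Classification" header line, the merged
+# output contains "Ty1_copia\t4\t2".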
+
+def main():
+    """
+    Main function
+    """
+    args = get_arguments()
+    # locate directory of current script
+    tool_path = os.path.dirname(os.path.realpath(__file__))
+    fasta_seq_size = read_fasta_sequence_size(args.reference_sequence)
+    total_size = sum(fasta_seq_size.values())
+    number_of_sequences = len(fasta_seq_size)
+    if total_size > args.max_chunk_size and number_of_sequences > 1:
+        # sort dictionary by values
+        seq_id_size_sorted = [i[0] for i in sorted(
+            fasta_seq_size.items(), key=lambda x: int(x[1]), reverse=True
+            )]
+        number_of_temp_files = int(total_size / args.max_chunk_size) + 1
+        if number_of_temp_files > number_of_sequences:
+            number_of_temp_files = number_of_sequences
+
+        temp_files_fasta = make_temp_files(number_of_temp_files)
+        file_handles = [open(temp_file, 'w') for temp_file in temp_files_fasta]
+        # make dictionary seq_id_sorted as keys and values as file handles
+        seq_id_file_handle_dict = dict(zip(seq_id_size_sorted, cycle(file_handles)))
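+        # dealing the longest sequences first round-robin keeps chunk sizes
+        # roughly balanced, e.g. sizes [9, 8, 2, 1] over two files end up as
+        # [9, 2] and [8, 1]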
+
+        # write sequences to temporary files
+        with open(args.reference_sequence, 'r') as f:
+            for line in f:
+                if line[0] == '>':
+                    header = line.strip().split(' ')[0][1:]
+                    print(header)
+                    seq_id_file_handle_dict[header].write(line)
+                else:
+                    seq_id_file_handle_dict[header].write(line)
+        # close file handles
+        for file_handle in file_handles:
+            file_handle.close()
+
+        # split gff3 file to temporary files -
+        # each temporary file will contain gff lines matching fasta
+        temp_files_gff = make_temp_files(number_of_temp_files)
+        file_handles = [open(temp_file, 'w') for temp_file in temp_files_gff]
+        # make dictionary seq_id_sorted as keys and values as file handles
+        seq_id_file_handle_dict = dict(zip(seq_id_size_sorted, cycle(file_handles)))
+        # write gff lines to chunks
+        with open(args.gff3, 'r') as f:
+            for line in f:
+                if line[0] == '#':
+                    continue
+                else:
+                    header = line.strip().split('\t')[0]
+                    seq_id_file_handle_dict[header].write(line)
+        # close file handles
+        for file_handle in file_handles:
+            file_handle.close()
+
+        # run retrotransposon detection on each temporary file
+        output_files = make_temp_files(number_of_temp_files)
+        for i in range(number_of_temp_files):
+            print('Running retrotransposon detection on file ' + str(i))
+            subprocess.check_call(
+                [f'{tool_path}/detect_putative_ltr.R', '-s', temp_files_fasta[i], '-g',
+                 temp_files_gff[i], '-o', output_files[i], '-c', str(args.cpu), '-M',
+                 str(args.max_missing_domains), '-L', str(args.min_relative_length)]
+                )
+
+        # remove all temporary input files
+        for temp_file in temp_files_fasta + temp_files_gff:
+            os.remove(temp_file)
+
+        # concatenate output files
+        output_file_suffixes = ['_D.fasta', '_DL.fasta', '_DLT.fasta', '_DLTP.fasta',
+                                '_DLP.fasta', '.gff3', '_statistics.csv']
+
+        for suffix in output_file_suffixes:
+            if suffix == '_statistics.csv':
+                # sum up line with same word in first column
+                stat_files = [output_file + suffix for output_file in output_files]
+                new_statistics = sum_up_stats_files(stat_files)
+                with open(args.output + suffix, 'w') as f:
+                    f.write("\n".join(new_statistics))
+                # remove parsed temporary statistics files
+                for file in stat_files:
+                    os.remove(file)
+            else:
+                with open(args.output + suffix, 'w') as f:
+                    for i in range(number_of_temp_files):
+                        # some file may not exist, so we need to check
+                        try:
+                            with open(output_files[i] + suffix, 'r') as g:
+                                for line in g:
+                                    f.write(line)
+                            # remove the parsed temporary output file
+                            os.remove(output_files[i] + suffix)
+                        except FileNotFoundError:
+                            pass
+    else:
+        # no need to split sequences into chunks
+        subprocess.check_call(
+            [f'{tool_path}/detect_putative_ltr.R', '-s', args.reference_sequence, '-g',
+             args.gff3, '-o', args.output, '-c', str(args.cpu), '-M',
+             str(args.max_missing_domains), '-L', str(args.min_relative_length)]
+            )
+
+
+if __name__ == '__main__':
+    # check version of python must be 3.6 or greater
+    if sys.version_info < (3, 6):
+        print('Python version must be 3.6 or greater')
+        sys.exit(1)
+    main()
--- a/extract_putative_ltr.R	Wed Jul 13 11:02:55 2022 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,275 +0,0 @@
-#!/usr/bin/env Rscript
-initial_options <- commandArgs(trailingOnly = FALSE)
-file_arg_name <- "--file="
-script_name <- normalizePath(sub(file_arg_name, "", initial_options[grep(file_arg_name, initial_options)]))
-script_dir <- dirname(script_name)
-library(optparse)
-
-parser <- OptionParser()
-option_list <- list(
-  make_option(c("-g", "--gff3"), action = "store", type = "character",
-              help = "gff3 with dante results", default = NULL),
-  make_option(c("-s", "--reference_sequence"), action = "store", type = "character",
-              help = "reference sequence as fasta", default = NULL),
-  make_option(c("-o", "--output"), action = "store", type = "character",
-              help = "output file path and prefix", default = NULL),
-  make_option(c("-c", "--cpu"), type = "integer", default = 5,
-              help = "Number of cpu to use [default %default]", metavar = "number"),
-  make_option(c("-M", "--max_missing_domains"), type = "integer", default = 0,
-              help = "Maximum number of missing domains is retrotransposon [default %default]",
-              metavar = "number"),
-  make_option(c("-L", "--min_relative_length"), type = "numeric", default = 0.6,
-              help = "Minimum relative length of protein domain to be considered for retrostransposon detection [default %default]",
-              metavar = "number")
-
-)
-description <- paste(strwrap(""))
-
-epilogue <- ""
-parser <- OptionParser(option_list = option_list, epilogue = epilogue, description = description,
-                       usage = "usage: %prog COMMAND [OPTIONS]")
-opt <- parse_args(parser, args = commandArgs(TRUE))
-
-
-# load packages
-suppressPackageStartupMessages({
-  library(rtracklayer)
-  library(Biostrings)
-  library(BSgenome)
-  library(parallel)
-})
-
-
-# CONFIGURATION
-OFFSET <- 15000
-# load configuration files and functions:
-lineage_file <- paste0(script_dir, "/databases/lineage_domain_order.csv")
-FDM_file <- paste0(script_dir, "/databases/feature_distances_model.RDS")
-trna_db <- paste0(script_dir, "/databases/tRNAscan-SE_ALL_spliced-no_plus-old-tRNAs_UC_unique-3ends.fasta")
-ltr_utils_r <- paste0(script_dir, "/R/ltr_utils.R")
-if (file.exists(lineage_file) & file.exists(trna_db)) {
-  lineage_info <- read.table(lineage_file, sep = "\t", header = TRUE, as.is = TRUE)
-  FDM <- readRDS(FDM_file)
-  source(ltr_utils_r)
-}else {
-  # this destination work is installed using conda
-  lineage_file <- paste0(script_dir, "/../share/dante_ltr/databases/lineage_domain_order.csv")
-  FDM_file <- paste0(script_dir, "/../share/dante_ltr/databases/feature_distances_model.RDS")
-  trna_db <- paste0(script_dir, "/../share/dante_ltr/databases/tRNAscan-SE_ALL_spliced-no_plus-old-tRNAs_UC_unique-3ends.fasta")
-  ltr_utils_r <- paste0(script_dir, "/../share/dante_ltr/R/ltr_utils.R")
-  if (file.exists(lineage_file) & file.exists((trna_db))) {
-    lineage_info <- read.table(lineage_file, sep = "\t", header = TRUE, as.is = TRUE)
-    source(ltr_utils_r)
-    FDM <- readRDS(FDM_file)
-  }else(
-    stop("configuration files not found")
-  )
-}
-
-
-# for testing
-if (FALSE) {
-  g <- rtracklayer::import("/mnt/raid/454_data/cuscuta/Ceuropea_assembly_v4/0_final_asm_hifiasm+longstitch/repeat_annotation/DANTE_on_CEUR_filtered_short_names.gff3")
-  s <- readDNAStringSet("/mnt/raid/454_data/cuscuta/Ceuropea_assembly_v4/0_final_asm_hifiasm+longstitch/asm.bp.p.ctg_scaffolds.short_names.fa")
-  lineage_info <- read.table("/mnt/raid/users/petr/workspace/ltr_finder_test/lineage_domain_order.csv", sep = "\t", header = TRUE, as.is = TRUE)
-
-  g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data/sample_DANTE_unfiltered.gff3")
-  g <- rtracklayer::import("/mnt/raid/users/petr/workspace/ltr_finder_test/test_data/DANTE_filtered_part.gff3")
-  s <- readDNAStringSet("/mnt/raid/users/petr/workspace/ltr_finder_test/test_data/Rbp_part.fa")
-
-  # oriza
-  g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data/big_test_data/DANTE_full_oryza.gff3")
-  s <- readDNAStringSet("/mnt/raid/users/petr/workspace/dante_ltr/test_data/big_test_data/o_sativa_msu7.0.fasta")
-
-  g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data
-  /DANTE_Vfaba_chr5.gff3")
-  s <- readDNAStringSet("/mnt/ceph/454_data/Vicia_faba_assembly/assembly/ver_210910
-  /fasta_parts/211010_Vfaba_chr5.fasta")
-
-  g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data/big_test_data//Cocoa_theobroma_DANTE_filtered.gff3")
-  s <- readDNAStringSet("/mnt/raid/users/petr/workspace/dante_ltr/test_data/big_test_data/Cocoa_theobroma_chr1.fasta.gz")
-
-  source("R/ltr_utils.R")
-  ## feature distance model
-  FDM <- readRDS("./databases/feature_distances_model.RDS")
-  g <- rtracklayer::import("./test_data/sample_DANTE.gff3")
-  s <- readDNAStringSet("./test_data/sample_genome.fasta")
-  outfile <- "/mnt/raid/users/petr/workspace/ltr_finder_test/te_with_domains_2.gff3"
-  lineage_info <- read.table("databases/lineage_domain_order.csv", sep = "\t", header =
-    TRUE, as.is = TRUE)
-  trna_db <- "./databases/tRNAscan-SE_ALL_spliced-no_plus-old-tRNAs_UC_unique-3ends.fasta"
-
-}
-
-# MAIN #############################################################
-
-# load data:
-
-cat("reading gff...")
-g <- rtracklayer::import(opt$gff3, format = 'gff3')  # DANTE gff3
-cat("done\n")
-cat("reading fasta...")
-s <- readDNAStringSet(opt$reference_sequence)  # genome assembly
-cat("done\n")
-outfile <- opt$output
-# clean sequence names:
-names(s) <- gsub(" .+", "", names(s))
-lineage_domain <- lineage_info$Domains.order
-lineage_domain_span <- lineage_info$domain_span
-lineage_ltr_mean_length <- lineage_info$ltr_length
-lineage_offset5prime <- lineage_info$offset5prime
-lineage_offset3prime <- lineage_info$offset3prime
-ln <- gsub("ss/I", "ss_I", gsub("_", "/", gsub("/", "|", lineage_info$Lineage)))
-names(lineage_offset3prime) <-  ln
-names(lineage_offset5prime) <-  ln
-names(lineage_domain) <- ln
-names(lineage_domain_span) <- ln
-names(lineage_ltr_mean_length) <- ln
-lineage_domains_sequence <- unlist(mapply(function(d,l) {
-  paste(strsplit(d, " ")[[1]], ":", l, sep = "")
-}, d = lineage_domain, l = names(lineage_domain)))
-
-# filter g gff3
-g <- dante_filtering(g, Relative_Length = opt$min_relative_length) # default
-
-seqlengths(g) <- seqlengths(s)[names(seqlengths(g))]
-g <- add_coordinates_of_closest_neighbor(g)
-
-# add info about domain order:
-g$domain_order <- as.numeric(factor(paste(g$Name, g$Final_Classification, sep = ":"), levels = lineage_domains_sequence))
-# set NA to 0 in  g$domain_order ( some domains are not fromm ClassI elements
-g$domain_order[is.na(g$domain_order)] <- 0
-
-# NOTE - some operation is faster of GrangesList but some on list of data.frames
-# this is primary clusteing
-cls <- get_domain_clusters(g)
-gcl <- split(as.data.frame(g), cls)
-# gcl_as_GRL <- split(g, cls)  # delete?
-
-cls_alt <- get_domain_clusters_alt(g, FDM)
-g$Cluster <- as.numeric(factor(cls_alt))
-
-gcl_alt <- split(as.data.frame(g), cls_alt)
-
-TE_partial <-  GRanges(seqnames =  sapply(gcl_alt, function(x) x$seqnames[1]),
-                       Name = sapply(gcl_alt, function(x) x$Final_Classification[1]),
-                       Final_Classification = sapply(gcl_alt, function(x) x$Final_Classification[1]),
-                       ID = sapply(gcl_alt, function(x) paste0("TE_partial_", x$Cluster[1])),
-                       strand = sapply(gcl_alt, function(x) x$strand[1]),
-                       Ndomains = sapply(gcl_alt, function(x) nrow(x)),
-                       type = "transposable_element",
-                       source = "dante_ltr",
-                       Rank="D",
-                       IRanges(start = sapply(gcl_alt, function(x) min(x$start)),
-                               end = sapply(gcl_alt, function(x) max(x$end)))
-)
-g$Ndomains_in_cluster <- count_occurences_for_each_element(g$Cluster)
-g$Parent <- paste0("TE_partial_", g$Cluster)
-g$Rank <- "D"
-
-# keep only partial TE with more than one domain
-TE_partial_with_more_than_one_domain <- TE_partial[TE_partial$Ndomains > 1]
-g_with_more_than_one_domain <- g[as.vector(g$Ndomains_in_cluster > 1)]
-
-# first filtering  - remove TEs with low number of domains
-gcl_clean <- clean_domain_clusters(gcl, lineage_domain_span, min_domains = 5 - opt$max_missing_domains)
-
-# glc annotation
-gcl_clean_lineage <- sapply(gcl_clean, function(x)  x$Final_Classification[1])
-gcl_clean_domains <- sapply(gcl_clean, function(x) ifelse(x$strand[1] == "-",
-                                                paste(rev(x$Name), collapse = " "),
-                                                paste(x$Name, collapse = " "))
-)
-
-# compare detected domains with domains in lineages from REXdb database
-dd <- mapply(domain_distance,
-             d_query = gcl_clean_domains,
-             d_reference = lineage_domain[gcl_clean_lineage])
-
-# get lineages which has correct number and order of domains
-# gcl_clean2 <- gcl_clean[gcl_clean_domains == lineage_domain[gcl_clean_lineage]]
-gcl_clean2 <- gcl_clean[dd <= opt$max_missing_domains]
-
-gcl_clean_with_domains <- gcl_clean2[check_ranges(gcl_clean2, s)]
-gr <- get_ranges(gcl_clean_with_domains)
-
-
-cat('Number of analyzed regions:\n')
-cat('Total number of domain clusters             : ', length(gcl), '\n')
-cat('Number of clean clusters                    : ', length(gcl_clean), '\n')
-cat('Number of clusters with complete domain set : ', length(gcl_clean_with_domains), '\n')
-
-
-te_strand <- sapply(gcl_clean_with_domains, function(x)x$strand[1])
-te_lineage <- sapply(gcl_clean_with_domains, function(x)x$Final_Classification[1])
-
-max_left_offset <- ifelse(te_strand == "+", lineage_offset5prime[te_lineage], lineage_offset3prime[te_lineage])
-max_right_offset <- ifelse(te_strand == "-", lineage_offset5prime[te_lineage], lineage_offset3prime[te_lineage])
-
-grL <- get_ranges_left(gcl_clean_with_domains, max_left_offset)
-grR <- get_ranges_right(gcl_clean_with_domains, max_right_offset)
-
-s_left <- getSeq(s, grL)
-s_right <- getSeq(s, grR)
-
-expected_ltr_length <- lineage_ltr_mean_length[sapply(gcl_clean_with_domains, function (x)x$Final_Classification[1])]
-# for statistics
-RT <- g[g$Name == "RT" & substring(g$Final_Classification, 1, 11) == "Class_I|LTR"]
-# cleanup
-#rm(g)
-rm(gcl)
-rm(gcl_clean)
-rm(gcl_clean2)
-
-names(te_strand) <- paste(seqnames(gr), start(gr), end(gr), sep = "_")
-names(s_left) <- paste(seqnames(grL), start(grL), end(grL), sep = "_")
-names(s_right) <- paste(seqnames(grR), start(grR), end(grR), sep = "_")
-cat('Identification of LTRs...')
-TE <- mclapply(seq_along(gr), function(x)get_TE(s_left[x],
-                                                s_right[x],
-                                                gcl_clean_with_domains[[x]],
-                                                gr[x],
-                                                grL[x],
-                                                grR[x],
-                                                expected_ltr_length[x]),
-               mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE
-)
-
-cat('done.\n')
-
-good_TE <- TE[!sapply(TE, is.null)]
-cat('Number of putative TE with identified LTR   :', length(good_TE), '\n')
-
-# TODO - extent TE region to cover also TSD
-ID <- paste0('TE_', sprintf("%08d", seq(good_TE)))
-gff3_list <- mcmapply(get_te_gff3, g = good_TE, ID = ID, mc.cores = opt$cpu)
-
-cat('Identification of PBS ...')
-gff3_list2 <- mclapply(gff3_list, FUN = add_pbs, s = s, trna_db = trna_db, mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE)
-cat('done\n')
-gff3_out <- do.call(c, gff3_list2)
-
-# define new source
-src <- as.character(gff3_out$source)
-src[is.na(src)] <- "dante_ltr"
-gff3_out$source <- src
-gff3_out$Rank <- get_te_rank(gff3_out)
-
-# add partial TEs but first remove all ovelaping
-TE_partial_parent_part <- TE_partial_with_more_than_one_domain[TE_partial_with_more_than_one_domain %outside% gff3_out]
-TE_partial_domain_part <-  g[g$Parent %in% TE_partial_parent_part$ID]
-
-gff3_out <- sort(c(gff3_out, TE_partial_domain_part, TE_partial_parent_part), by = ~ seqnames * start)
-
-export(gff3_out, con = paste0(outfile, ".gff3"), format = 'gff3')
-
-all_tbl <- get_te_statistics(gff3_out, RT)
-all_tbl <- cbind(Classification = rownames(all_tbl), all_tbl)
-write.table(all_tbl, file = paste0(outfile, "_statistics.csv"), sep = "\t", quote = FALSE, row.names = FALSE)
-# export fasta files:
-s_te <- get_te_sequences(gff3_out, s)
-for (i in seq_along(s_te)) {
-  outname <- paste0(outfile, "_", names(s_te)[i], ".fasta")
-  writeXStringSet(s_te[[i]], filepath = outname)
-}
-
--- a/requirements.txt	Wed Jul 13 11:02:55 2022 +0000
+++ b/requirements.txt	Thu Jul 21 08:23:15 2022 +0000
@@ -4,3 +4,4 @@
 bioconductor-rtracklayer=1.54
 r-optparse=1.7.1
 r-dplyr=1.0.7
+python>=3.6.0
--- a/tests.sh	Wed Jul 13 11:02:55 2022 +0000
+++ b/tests.sh	Thu Jul 21 08:23:15 2022 +0000
@@ -3,10 +3,17 @@
 set -e
 # first argument is cpu number
 NCPU_TO_USE=$1
+
+# test if NCPU_TO_USE is set:
+if [ -z "${NCPU_TO_USE}" ]; then
+    echo "NCPU_TO_USE is not set, using 10"
+    NCPU_TO_USE=10
+fi
+
 eval "$(conda shell.bash hook)"
 conda activate dante_ltr
 echo "Running tests 1, detection of LTRs"
-./extract_putative_ltr.R -s test_data/sample_genome.fasta \
+./detect_putative_ltr.R -s test_data/sample_genome.fasta \
 -g test_data/sample_DANTE.gff3 -o tmp/test_output1 -c $NCPU_TO_USE
 
 
@@ -17,7 +24,7 @@
 -o tmp/test_output2 -c $NCPU_TO_USE
 
 echo "Running tests 3, detection of LTRs, allow missing domains"
-./extract_putative_ltr.R -s test_data/sample_genome.fasta \
+./detect_putative_ltr.R -s test_data/sample_genome.fasta \
 -g test_data/sample_DANTE.gff3 -o tmp/test_output3 -c $NCPU_TO_USE -M 2
 
 
@@ -28,3 +35,12 @@
 -o tmp/test_output4 -c $NCPU_TO_USE
 
 
+echo "Running tests 5, detection of LTRs using python wrapper"
+./detect_putative_ltr_wrapper.py -s test_data/sample_genome.fasta \
+-g test_data/sample_DANTE.gff3 -o tmp/test_output5 -c $NCPU_TO_USE \
+-S 10000000
+cat tmp/test_output5_statistics.csv
+
+echo "Running tests 4, filtering gff"
+./clean_ltr.R -g tmp/test_output5.gff3 -s test_data/sample_genome.fasta \
+-o tmp/test_output6 -c $NCPU_TO_USE