changeset 12:ff01d4263391 draft
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
author | petr-novak
date | Thu, 21 Jul 2022 08:23:15 +0000
parents | 54bd36973253
children | 559940c04c44
files | R/ltr_utils.R README.md clean_dante_ltr.xml clean_ltr.R dante_ltr_search.xml dante_ltr_workflow.png detect_putative_ltr.R detect_putative_ltr_wrapper.py extract_putative_ltr.R requirements.txt tests.sh
diffstat | 11 files changed, 575 insertions(+), 288 deletions(-)
--- a/R/ltr_utils.R	Wed Jul 13 11:02:55 2022 +0000
+++ b/R/ltr_utils.R	Thu Jul 21 08:23:15 2022 +0000
@@ -1,7 +1,7 @@
 add_coordinates_of_closest_neighbor <- function(gff) {
   gff <- gff[order(seqnames(gff), as.vector(start(gff)))]
   # split to chromosomes:
-  gff_parts <- split(gff, seqnames(gff))
+  gff_parts <- split(gff, as.vector(seqnames(gff)))
   upstreams <- c(sapply(gff_parts, function(x) c(1, head(end(x), -1))))
   downstreams <- c(sapply(gff_parts, function(x) c(start(x)[-1], seqlengths(x)[runValue(seqnames(x[1]))])))
   gff_updated <- unlist(gff_parts)
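The one-line change above matters because R's `split()` on a factor (which `seqnames()` returns) creates a group for every factor level, including chromosomes with no features, whereas splitting on a plain character vector keeps only the groups actually present. A rough pandas analogy with hypothetical data (not part of the changeset; pandas' `observed` flag plays the role of factor levels):

```python
import pandas as pd

# Grouping by a categorical keeps empty groups for unused levels,
# like R's split() on a factor; observed=True behaves like splitting
# on as.vector(seqnames(gff)) and keeps only groups that occur.
df = pd.DataFrame({
    "chrom": pd.Categorical(["chr1", "chr1"], categories=["chr1", "chr2"]),
    "start": [100, 500],
})
print(df.groupby("chrom", observed=False).size().to_dict())  # {'chr1': 2, 'chr2': 0}
print(df.groupby("chrom", observed=True).size().to_dict())   # {'chr1': 2}
```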
--- a/README.md	Wed Jul 13 11:02:55 2022 +0000
+++ b/README.md	Thu Jul 21 08:23:15 2022 +0000
@@ -29,7 +29,7 @@
 ### Detection of complete LTR retrotransposons
 
 ```shell
-Usage: ./extract_putative_ltr.R COMMAND [OPTIONS]
+Usage: ./detect_putative_ltr.R COMMAND [OPTIONS]
 
 Options:
 
@@ -59,7 +59,7 @@
 
 ```shell
 mkdir -p tmp
-./extract_putative_ltr.R -g test_data/sample_DANTE.gff3 -s test_data/sample_genome.fasta -o tmp/ltr_annotation
+./detect_putative_ltr.R -g test_data/sample_DANTE.gff3 -s test_data/sample_genome.fasta -o tmp/ltr_annotation
 ```
 
 #### Files in the output of `detect_putative_ltr.R`:
@@ -72,7 +72,35 @@
 - `prefix_DLT.fasta` - elements with **d**omains, **L**TR, **T**SD
 - `prefix_statistics.csv` - number of elements in individual categories
 
+For large genomes, you can use `detect_putative_ltr_wrapper.py`. This script splits the input fasta into smaller chunks and runs `detect_putative_ltr.R` on each chunk to limit memory usage. Outputs are merged after all chunks are processed.
+
+```shell
+usage: detect_putative_ltr_wrapper.py [-h] -g GFF3 -s REFERENCE_SEQUENCE -o
+                                      OUTPUT [-c CPU] [-M MAX_MISSING_DOMAINS]
+                                      [-L MIN_RELATIVE_LENGTH]
+                                      [-S MAX_CHUNK_SIZE]
+
+detect_putative_ltr_wrapper.py is a wrapper for
+    detect_putative_ltr.R
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -g GFF3, --gff3 GFF3  gff3 file
+  -s REFERENCE_SEQUENCE, --reference_sequence REFERENCE_SEQUENCE
+                        reference sequence as fasta file
+  -o OUTPUT, --output OUTPUT
+                        output file path and prefix
+  -c CPU, --cpu CPU     number of CPUs
+  -M MAX_MISSING_DOMAINS, --max_missing_domains MAX_MISSING_DOMAINS
+  -L MIN_RELATIVE_LENGTH, --min_relative_length MIN_RELATIVE_LENGTH
+                        Minimum relative length of protein domain to be considered
+                        for retrotransposon detection
+  -S MAX_CHUNK_SIZE, --max_chunk_size MAX_CHUNK_SIZE
+                        If size of reference sequence is greater than this value,
+                        reference is analyzed in chunks of this size. This is
+                        just an approximate value - sequences which are longer
+                        are not split, default is 100000000
+```
 
 ### Validation of LTR retrotransposons detected in previous step:
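A minimal Python sketch of the chunking strategy the new wrapper uses (sequence names and sizes below are invented for illustration): sequences are sorted by size, largest first, and dealt round-robin into roughly `total_size / max_chunk_size` buckets, so chunks stay balanced in size and long sequences are never split.

```python
from itertools import cycle

def assign_chunks(seq_sizes: dict, max_chunk_size: int) -> dict:
    """Map each sequence id to the bucket (chunk file) it is written to."""
    total = sum(seq_sizes.values())
    # same arithmetic as the wrapper: int(total / max_chunk_size) + 1,
    # capped at the number of sequences
    n_buckets = min(total // max_chunk_size + 1, len(seq_sizes))
    by_size = sorted(seq_sizes, key=seq_sizes.get, reverse=True)
    return dict(zip(by_size, cycle(range(n_buckets))))

sizes = {"chr1": 90, "chr2": 60, "chr3": 40, "chr4": 10}  # hypothetical sizes
print(assign_chunks(sizes, max_chunk_size=100))
# {'chr1': 0, 'chr2': 1, 'chr3': 2, 'chr4': 0}
```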
--- a/clean_dante_ltr.xml	Wed Jul 13 11:02:55 2022 +0000
+++ b/clean_dante_ltr.xml	Thu Jul 21 08:23:15 2022 +0000
@@ -1,4 +1,4 @@
-<tool id="clean_dante_ltr" name="DANTE_LTR retrotransposons filtering" version="0.1.6.1" python_template_version="3.5">
+<tool id="clean_dante_ltr" name="DANTE_LTR retrotransposons filtering" version="0.1.7.0" python_template_version="3.5">
 
     <requirements>
         <requirement type="package">r-optparse</requirement>
@@ -34,7 +34,7 @@
             $dante_ltr.hid and reference $reference.hid"/>
         <data name="rm_lib" format="fasta" label="Non-redundant library of LTR retrotransposons (FASTA) based on annotation $dante_ltr.hid and reference $reference.hid"/>
-        <data name="te_full" format="fasta" label="Library of full length LTR retrotransposons (FASTA) based on annotation $dante_ltr.hid and reference $reference.hid"/>
+        <data name="te_full" format="fasta" label="Library of LTR retrotransposons (FASTA) based on annotation $dante_ltr.hid and reference $reference.hid"/>
         <data name="ltr5" format="fasta" label="Library of 5'LTR of retrotransposons (FASTA) based on annotation $dante_ltr.hid and reference $reference.hid"/>
--- a/clean_ltr.R	Wed Jul 13 11:02:55 2022 +0000
+++ b/clean_ltr.R	Thu Jul 21 08:23:15 2022 +0000
@@ -95,9 +95,12 @@
 ## ID in g must be unique - this could be a problem if gff is concatenated from multiple files!
 ## if ID is renamed - rename parent too!
 ## add chromosome index to distinguish same IDs
-suffix <- as.numeric(seqnames(g))
-g$ID <- ifelse(is.na(g$ID), NA, paste0(g$ID, "_", suffix))
-g$Parent <- ifelse(is.na(g$Parent), NA, paste0(g$Parent, "_", suffix))
+## do this only if IDs are not unique
+if (any(duplicated(na.omit(g$ID)))) {
+  suffix <- as.numeric(seqnames(g))
+  g$ID <- ifelse(is.na(g$ID), NA, paste0(g$ID, "_", suffix))
+  g$Parent <- ifelse(is.na(g$Parent), NA, paste0(g$Parent, "_", suffix))
+}
 
 # get te sequence based on rank
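The new guard means IDs are only rewritten when they actually collide (e.g. when the input GFF3 was concatenated from multiple files). A rough Python equivalent of the logic, with hypothetical IDs:

```python
def make_unique(ids, chrom_index):
    """Suffix IDs with a chromosome index only if duplicates exist."""
    present = [i for i in ids if i is not None]   # na.omit(g$ID)
    if len(present) != len(set(present)):         # any(duplicated(...))
        return [None if i is None else f"{i}_{c}"
                for i, c in zip(ids, chrom_index)]
    return ids

print(make_unique(["te1", "te1", None], [1, 2, 2]))  # ['te1_1', 'te1_2', None]
print(make_unique(["te1", "te2", None], [1, 1, 1]))  # unchanged
```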
--- a/dante_ltr_search.xml	Wed Jul 13 11:02:55 2022 +0000
+++ b/dante_ltr_search.xml	Thu Jul 21 08:23:15 2022 +0000
@@ -1,14 +1,16 @@
-<tool id="dante_ltr_search" name="DANTE_LTR retrotransposon identification" version="0.1.6.1" python_template_version="3.5">
+<tool id="dante_ltr_search" name="DANTE_LTR retrotransposon identification" version="0.1.7.0" python_template_version="3.5">
 
     <requirements>
        <requirement type="package">blast</requirement>
        <requirement type="package">r-optparse</requirement>
        <requirement type="package">bioconductor-bsgenome</requirement>
        <requirement type="package">bioconductor-biostrings</requirement>
        <requirement type="package">bioconductor-rtracklayer</requirement>
+       <requirement type="package" version="3.7.12">python</requirement>
    </requirements>
    <command detect_errors="exit_code"><![CDATA[
-       Rscript ${__tool_directory__}/extract_putative_ltr.R --gff3 '$dante' --reference_sequence '$reference' -M $max_missing --output output --cpu 32
+       python ${__tool_directory__}/detect_putative_ltr_wrapper.py --gff3 '$dante'
+       --reference_sequence '$reference' -M $max_missing --output output --cpu 32
        &&
        mv output.gff3 $te_ltr_gff
        &&
@@ -31,7 +33,7 @@
        5' and 3' Long Terminal Repeats, Target Site Duplication (TSD) and primer binding
        site (PBS). All identified elements contain a set of protein domains as defined in
-       REXdb_.Based on the results of detection of structural features,
+       REXdb_. Based on the results of detection of structural features,
        elements fall into five categories:
 
        - elements with domains, 5'LTR, 3'LTR, TSD and PBS - rank DLTP
@@ -42,5 +44,10 @@
 
        .. _REXdb: https://doi.org/10.1186/s13100-018-0144-1
 
+       Principles of the detection of LTR retrotransposons are described in the github_ repository of this tool.
+
+       .. _github: https://github.com/kavonrtep/dante_ltr
+
+
 ]]></help>
 </tool>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/detect_putative_ltr.R	Thu Jul 21 08:23:15 2022 +0000
@@ -0,0 +1,283 @@
+#!/usr/bin/env Rscript
+initial_options <- commandArgs(trailingOnly = FALSE)
+file_arg_name <- "--file="
+script_name <- normalizePath(sub(file_arg_name, "", initial_options[grep(file_arg_name, initial_options)]))
+script_dir <- dirname(script_name)
+library(optparse)
+
+parser <- OptionParser()
+option_list <- list(
+  make_option(c("-g", "--gff3"), action = "store", type = "character",
+              help = "gff3 with dante results", default = NULL),
+  make_option(c("-s", "--reference_sequence"), action = "store", type = "character",
+              help = "reference sequence as fasta", default = NULL),
+  make_option(c("-o", "--output"), action = "store", type = "character",
+              help = "output file path and prefix", default = NULL),
+  make_option(c("-c", "--cpu"), type = "integer", default = 5,
+              help = "Number of cpu to use [default %default]", metavar = "number"),
+  make_option(c("-M", "--max_missing_domains"), type = "integer", default = 0,
+              help = "Maximum number of missing domains in retrotransposon [default %default]",
+              metavar = "number"),
+  make_option(c("-L", "--min_relative_length"), type = "numeric", default = 0.6,
+              help = "Minimum relative length of protein domain to be considered for retrotransposon detection [default %default]",
+              metavar = "number")
+
+)
+description <- paste(strwrap(""))
+
+epilogue <- ""
+parser <- OptionParser(option_list = option_list, epilogue = epilogue, description = description,
+                       usage = "usage: %prog COMMAND [OPTIONS]")
+opt <- parse_args(parser, args = commandArgs(TRUE))
+
+
+# load packages
+suppressPackageStartupMessages({
+  library(rtracklayer)
+  library(Biostrings)
+  library(BSgenome)
+  library(parallel)
+})
+
+
+# CONFIGURATION
+OFFSET <- 15000
+# load configuration files and functions:
+lineage_file <- paste0(script_dir, "/databases/lineage_domain_order.csv")
+FDM_file <- paste0(script_dir, "/databases/feature_distances_model.RDS")
+trna_db <- paste0(script_dir, "/databases/tRNAscan-SE_ALL_spliced-no_plus-old-tRNAs_UC_unique-3ends.fasta")
+ltr_utils_r <- paste0(script_dir, "/R/ltr_utils.R")
+if (file.exists(lineage_file) & file.exists(trna_db)) {
+  lineage_info <- read.table(lineage_file, sep = "\t", header = TRUE, as.is = TRUE)
+  FDM <- readRDS(FDM_file)
+  source(ltr_utils_r)
+} else {
+  # this is the destination when the tool is installed using conda
+  lineage_file <- paste0(script_dir, "/../share/dante_ltr/databases/lineage_domain_order.csv")
+  FDM_file <- paste0(script_dir, "/../share/dante_ltr/databases/feature_distances_model.RDS")
+  trna_db <- paste0(script_dir, "/../share/dante_ltr/databases/tRNAscan-SE_ALL_spliced-no_plus-old-tRNAs_UC_unique-3ends.fasta")
+  ltr_utils_r <- paste0(script_dir, "/../share/dante_ltr/R/ltr_utils.R")
+  if (file.exists(lineage_file) & file.exists(trna_db)) {
+    lineage_info <- read.table(lineage_file, sep = "\t", header = TRUE, as.is = TRUE)
+    source(ltr_utils_r)
+    FDM <- readRDS(FDM_file)
+  } else {
+    stop("configuration files not found")
+  }
+}
+
+
+# for testing
+if (FALSE) {
+  g <- rtracklayer::import("/mnt/raid/454_data/cuscuta/Ceuropea_assembly_v4/0_final_asm_hifiasm+longstitch/repeat_annotation/DANTE_on_CEUR_filtered_short_names.gff3")
+  s <- readDNAStringSet("/mnt/raid/454_data/cuscuta/Ceuropea_assembly_v4/0_final_asm_hifiasm+longstitch/asm.bp.p.ctg_scaffolds.short_names.fa")
+  lineage_info <- read.table("/mnt/raid/users/petr/workspace/ltr_finder_test/lineage_domain_order.csv", sep = "\t", header = TRUE, as.is = TRUE)
+
+  g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data/sample_DANTE_unfiltered.gff3")
+  g <- rtracklayer::import("/mnt/raid/users/petr/workspace/ltr_finder_test/test_data/DANTE_filtered_part.gff3")
+  s <- readDNAStringSet("/mnt/raid/users/petr/workspace/ltr_finder_test/test_data/Rbp_part.fa")
+
+  # oryza
+  g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data/big_test_data/DANTE_full_oryza.gff3")
+  s <- readDNAStringSet("/mnt/raid/users/petr/workspace/dante_ltr/test_data/big_test_data/o_sativa_msu7.0.fasta")
+
+  g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data
+  /DANTE_Vfaba_chr5.gff3")
+  s <- readDNAStringSet("/mnt/ceph/454_data/Vicia_faba_assembly/assembly/ver_210910
+  /fasta_parts/211010_Vfaba_chr5.fasta")
+
+  g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data/big_test_data//Cocoa_theobroma_DANTE_filtered.gff3")
+  s <- readDNAStringSet("/mnt/raid/users/petr/workspace/dante_ltr/test_data/big_test_data/Cocoa_theobroma_chr1.fasta.gz")
+  # test on bigger data:
+
+  g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data/tmp/DANTE_unfiltered/1.gff3")
+  s <- readDNAStringSet("/mnt/raid/users/petr/workspace/dante_ltr/test_data/tmp/fasta_parts/1.fasta")
+
+  source("R/ltr_utils.R")
+  ## feature distance model
+  FDM <- readRDS("./databases/feature_distances_model.RDS")
+  g <- rtracklayer::import("./test_data/sample_DANTE.gff3")
+  s <- readDNAStringSet("./test_data/sample_genome.fasta")
+  outfile <- "/mnt/raid/users/petr/workspace/ltr_finder_test/te_with_domains_2.gff3"
+  lineage_info <- read.table("databases/lineage_domain_order.csv", sep = "\t", header =
+                               TRUE, as.is = TRUE)
+  trna_db <- "./databases/tRNAscan-SE_ALL_spliced-no_plus-old-tRNAs_UC_unique-3ends.fasta"
+  opt <- list(min_relative_length = 0.6, cpu = 8)
+
+}
+
+# MAIN #############################################################
+
+# load data:
+
+cat("reading gff...")
+g <- rtracklayer::import(opt$gff3, format = 'gff3')  # DANTE gff3
+cat("done\n")
+cat("reading fasta...")
+s <- readDNAStringSet(opt$reference_sequence)  # genome assembly
+cat("done\n")
+outfile <- opt$output
+# clean sequence names:
+names(s) <- gsub(" .+", "", names(s))
+lineage_domain <- lineage_info$Domains.order
+lineage_domain_span <- lineage_info$domain_span
+lineage_ltr_mean_length <- lineage_info$ltr_length
+lineage_offset5prime <- lineage_info$offset5prime
+lineage_offset3prime <- lineage_info$offset3prime
+ln <- gsub("ss/I", "ss_I", gsub("_", "/", gsub("/", "|", lineage_info$Lineage)))
+names(lineage_offset3prime) <- ln
+names(lineage_offset5prime) <- ln
+names(lineage_domain) <- ln
+names(lineage_domain_span) <- ln
+names(lineage_ltr_mean_length) <- ln
+lineage_domains_sequence <- unlist(mapply(function(d, l) {
+  paste(strsplit(d, " ")[[1]], ":", l, sep = "")
+}, d = lineage_domain, l = names(lineage_domain)))
+
+# filter g gff3
+g <- dante_filtering(g, Relative_Length = opt$min_relative_length)  # default
+
+seqlengths(g) <- seqlengths(s)[names(seqlengths(g))]
+g <- add_coordinates_of_closest_neighbor(g)
+
+# add info about domain order:
+g$domain_order <- as.numeric(factor(paste(g$Name, g$Final_Classification, sep = ":"), levels = lineage_domains_sequence))
+# set NA to 0 in g$domain_order (some domains are not from Class I elements)
+g$domain_order[is.na(g$domain_order)] <- 0
+
+# NOTE - some operations are faster on a GRangesList, others on a list of data.frames
+# this is the primary clustering
+cls <- get_domain_clusters(g)
+gcl <- split(as.data.frame(g), cls)
+# gcl_as_GRL <- split(g, cls)  # delete?
+
+cls_alt <- get_domain_clusters_alt(g, FDM)
+g$Cluster <- as.numeric(factor(cls_alt))
+
+gcl_alt <- split(as.data.frame(g), cls_alt)
+
+TE_partial <- GRanges(seqnames = sapply(gcl_alt, function(x) x$seqnames[1]),
+                      Name = sapply(gcl_alt, function(x) x$Final_Classification[1]),
+                      Final_Classification = sapply(gcl_alt, function(x) x$Final_Classification[1]),
+                      ID = sapply(gcl_alt, function(x) paste0("TE_partial_", sprintf("%08d", x$Cluster[1]))),
+                      strand = sapply(gcl_alt, function(x) x$strand[1]),
+                      Ndomains = sapply(gcl_alt, function(x) nrow(x)),
+                      type = "transposable_element",
+                      source = "dante_ltr",
+                      Rank = "D",
+                      IRanges(start = sapply(gcl_alt, function(x) min(x$start)),
+                              end = sapply(gcl_alt, function(x) max(x$end)))
+)
+g$Ndomains_in_cluster <- count_occurences_for_each_element(g$Cluster)
+g$Parent <- paste0("TE_partial_", sprintf("%08d", g$Cluster))
+g$Rank <- "D"
+
+# keep only partial TEs with more than one domain
+TE_partial_with_more_than_one_domain <- TE_partial[TE_partial$Ndomains > 1]
+g_with_more_than_one_domain <- g[as.vector(g$Ndomains_in_cluster > 1)]
+
+# first filtering - remove TEs with a low number of domains
+gcl_clean <- clean_domain_clusters(gcl, lineage_domain_span, min_domains = 5 - opt$max_missing_domains)
+
+# gcl annotation
+gcl_clean_lineage <- sapply(gcl_clean, function(x) x$Final_Classification[1])
+gcl_clean_domains <- sapply(gcl_clean, function(x) ifelse(x$strand[1] == "-",
+                                                          paste(rev(x$Name), collapse = " "),
+                                                          paste(x$Name, collapse = " "))
+)
+
+# compare detected domains with domains in lineages from the REXdb database
+dd <- mapply(domain_distance,
+             d_query = gcl_clean_domains,
+             d_reference = lineage_domain[gcl_clean_lineage])
+
+# get lineages which have the correct number and order of domains
+# gcl_clean2 <- gcl_clean[gcl_clean_domains == lineage_domain[gcl_clean_lineage]]
+gcl_clean2 <- gcl_clean[dd <= opt$max_missing_domains]
+
+gcl_clean_with_domains <- gcl_clean2[check_ranges(gcl_clean2, s)]
+gr <- get_ranges(gcl_clean_with_domains)
+
+
+cat('Number of analyzed regions:\n')
+cat('Total number of domain clusters : ', length(gcl), '\n')
+cat('Number of clean clusters : ', length(gcl_clean), '\n')
+cat('Number of clusters with complete domain set : ', length(gcl_clean_with_domains), '\n')
+
+
+te_strand <- sapply(gcl_clean_with_domains, function(x) x$strand[1])
+te_lineage <- sapply(gcl_clean_with_domains, function(x) x$Final_Classification[1])
+
+max_left_offset <- ifelse(te_strand == "+", lineage_offset5prime[te_lineage], lineage_offset3prime[te_lineage])
+max_right_offset <- ifelse(te_strand == "-", lineage_offset5prime[te_lineage], lineage_offset3prime[te_lineage])
+
+grL <- get_ranges_left(gcl_clean_with_domains, max_left_offset)
+grR <- get_ranges_right(gcl_clean_with_domains, max_right_offset)
+
+s_left <- getSeq(s, grL)
+s_right <- getSeq(s, grR)
+
+expected_ltr_length <- lineage_ltr_mean_length[sapply(gcl_clean_with_domains, function(x) x$Final_Classification[1])]
+# for statistics
+RT <- g[g$Name == "RT" & substring(g$Final_Classification, 1, 11) == "Class_I|LTR"]
+# cleanup
+# rm(g)
+rm(gcl)
+rm(gcl_clean)
+rm(gcl_clean2)
+
+names(te_strand) <- paste(seqnames(gr), start(gr), end(gr), sep = "_")
+names(s_left) <- paste(seqnames(grL), start(grL), end(grL), sep = "_")
+names(s_right) <- paste(seqnames(grR), start(grR), end(grR), sep = "_")
+cat('Identification of LTRs...')
+TE <- mclapply(seq_along(gr), function(x) get_TE(s_left[x],
+                                                 s_right[x],
+                                                 gcl_clean_with_domains[[x]],
+                                                 gr[x],
+                                                 grL[x],
+                                                 grR[x],
+                                                 expected_ltr_length[x]),
+               mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE
+)
+
+cat('done.\n')
+
+good_TE <- TE[!sapply(TE, is.null)]
+cat('Number of putative TEs with identified LTRs :', length(good_TE), '\n')
+
+# TODO - extend TE region to also cover TSD
+ID <- paste0('TE_', sprintf("%08d", seq(good_TE)))
+gff3_list <- mcmapply(get_te_gff3, g = good_TE, ID = ID, mc.cores = opt$cpu)
+
+cat('Identification of PBS ...')
+gff3_list2 <- mclapply(gff3_list, FUN = add_pbs, s = s, trna_db = trna_db, mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE)
+cat('done\n')
+gff3_out <- do.call(c, gff3_list2)
+
+# define new source
+src <- as.character(gff3_out$source)
+src[is.na(src)] <- "dante_ltr"
+gff3_out$source <- src
+gff3_out$Rank <- get_te_rank(gff3_out)
+
+# add partial TEs, but first remove all overlapping ones
+TE_partial_parent_part <- TE_partial_with_more_than_one_domain[TE_partial_with_more_than_one_domain %outside% gff3_out]
+TE_partial_domain_part <- g[g$Parent %in% TE_partial_parent_part$ID]
+
+gff3_out <- sort(c(gff3_out, TE_partial_domain_part, TE_partial_parent_part), by = ~seqnames * start)
+# modify ID and Parent - add seqname; this ensures IDs stay unique when the script is run on chunk level
+gff3_out$ID[!is.na(gff3_out$ID)] <- paste0(gff3_out$ID[!is.na(gff3_out$ID)], "_", seqnames(gff3_out)[!is.na(gff3_out$ID)])
+gff3_out$Parent[!is.na(gff3_out$Parent)] <- paste0(gff3_out$Parent[!is.na(gff3_out$Parent)], "_", seqnames(gff3_out)[!is.na(gff3_out$Parent)])
+
+export(gff3_out, con = paste0(outfile, ".gff3"), format = 'gff3')
+
+all_tbl <- get_te_statistics(gff3_out, RT)
+all_tbl <- cbind(Classification = rownames(all_tbl), all_tbl)
+write.table(all_tbl, file = paste0(outfile, "_statistics.csv"), sep = "\t", quote = FALSE, row.names = FALSE)
+# export fasta files:
+s_te <- get_te_sequences(gff3_out, s)
+for (i in seq_along(s_te)) {
+  outname <- paste0(outfile, "_", names(s_te)[i], ".fasta")
+  writeXStringSet(s_te[[i]], filepath = outname)
+}
+
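The renamed script also zero-pads cluster numbers (`sprintf("%08d", ...)`) and appends the sequence name to `ID`/`Parent`. A tiny Python illustration of why (values are made up): padding keeps IDs lexicographically sortable, and the seqname suffix keeps IDs unique when the wrapper later concatenates per-chunk GFF3 outputs.

```python
def te_id(cluster: int, seqname: str) -> str:
    # zero-padded cluster number plus sequence name, mirroring the R code's
    # paste0("TE_partial_", sprintf("%08d", cluster)) and the "_" + seqname suffix
    return f"TE_partial_{cluster:08d}_{seqname}"

print(te_id(7, "chr1"))  # TE_partial_00000007_chr1
print(te_id(7, "chr2"))  # same cluster number from another chunk, still unique
```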
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/detect_putative_ltr_wrapper.py	Thu Jul 21 08:23:15 2022 +0000
@@ -0,0 +1,224 @@
+#!/usr/bin/env python
+"""This wrapper is intended to be used on large genomes and large DANTE input to
+minimize memory usage. It splits the input files into pieces and analyzes them one
+by one with detect_putative_ltr.R.
+If the input does not exceed the memory limit, it runs detect_putative_ltr.R directly.
+"""
+
+import argparse
+import os
+import sys
+import tempfile
+from itertools import cycle
+import subprocess
+
+
+def get_arguments():
+    """
+    Get arguments from command line
+    :return:
+    args
+    """
+    parser = argparse.ArgumentParser(
+        description="""detect_putative_ltr_wrapper.py is a wrapper for
+        detect_putative_ltr.R""", formatter_class=argparse.RawTextHelpFormatter
+        )
+    parser.add_argument(
+        '-g', '--gff3', default=None, required=True, help="gff3 file", type=str,
+        action='store'
+        )
+    parser.add_argument(
+        '-s', '--reference_sequence', default=None, required=True,
+        help="reference sequence as fasta file", type=str, action='store'
+        )
+    parser.add_argument(
+        '-o', '--output', default=None, required=True, help="output file path and prefix",
+        type=str, action='store'
+        )
+    parser.add_argument(
+        '-c', '--cpu', default=1, required=False, help="number of CPUs", type=int,
+        action='store'
+        )
+    parser.add_argument(
+        '-M', '--max_missing_domains', default=0, required=False, type=int
+        )
+    parser.add_argument(
+        '-L', '--min_relative_length', default=0.6, required=False, type=float,
+        help="Minimum relative length of protein domain to be "
+             "considered for retrotransposon detection"
+        )
+    parser.add_argument(
+        '-S', '--max_chunk_size', default=100000000, required=False, type=int,
+        help='If size of reference sequence is greater than this value, reference is '
+             'analyzed in chunks of this size. This is just an approximate value - '
+             'sequences which are longer are not split, default is %(default)s'
+        )
+    args = parser.parse_args()
+    return args
+
+
+def read_fasta_sequence_size(fasta_file):
+    """Read size of each sequence into a dictionary"""
+    fasta_dict = {}
+    with open(fasta_file, 'r') as f:
+        for line in f:
+            if line[0] == '>':
+                header = line.strip().split(' ')[0][1:]  # remove part of name after space
+                fasta_dict[header] = 0
+            else:
+                fasta_dict[header] += len(line.strip())
+    return fasta_dict
+
+
+def make_temp_files(number_of_files):
+    """
+    Make named temporary files; files will not be deleted upon exit!
+    :param number_of_files:
+    :return:
+    filepaths
+    """
+    temp_files = []
+    for i in range(number_of_files):
+        temp_files.append(tempfile.NamedTemporaryFile(delete=False).name)
+    return temp_files
+
+
+def sum_up_stats_files(files):
+    """
+    Sum up statistics files
+    :return:
+    """
+    new_statistics = {}
+    for file in files:
+        with open(file, 'r') as fh:
+            for line in fh:
+                items = line.strip().split('\t')
+                if items[0] == 'Classification':
+                    header = items
+                    continue
+                else:
+                    counts = [int(item) for item in items[1:]]
+                    if items[0] in new_statistics:
+                        new_statistics[items[0]] = [sum(x) for x in
+                                                    zip(new_statistics[items[0]], counts)]
+                    else:
+                        new_statistics[items[0]] = counts
+    # convert to strings; first line is the header
+    statistics_str = []
+    for classification, counts in new_statistics.items():
+        statistics_str.append(classification + '\t' + '\t'.join([str(x) for x in counts]))
+    sorted_stat_with_header = ['\t'.join(header)] + sorted(statistics_str)
+    return sorted_stat_with_header
+
+
+def main():
+    """
+    Main function
+    """
+    args = get_arguments()
+    # locate directory of current script
+    tool_path = os.path.dirname(os.path.realpath(__file__))
+    fasta_seq_size = read_fasta_sequence_size(args.reference_sequence)
+    total_size = sum(fasta_seq_size.values())
+    number_of_sequences = len(fasta_seq_size)
+    if total_size > args.max_chunk_size and number_of_sequences > 1:
+        # sort sequence ids by sequence size, largest first
+        seq_id_size_sorted = [i[0] for i in sorted(
+            fasta_seq_size.items(), key=lambda x: int(x[1]), reverse=True
+            )]
+        number_of_temp_files = int(total_size / args.max_chunk_size) + 1
+        if number_of_temp_files > number_of_sequences:
+            number_of_temp_files = number_of_sequences
+
+        temp_files_fasta = make_temp_files(number_of_temp_files)
+        file_handles = [open(temp_file, 'w') for temp_file in temp_files_fasta]
+        # make dictionary with seq_id_size_sorted as keys and file handles as values
+        seq_id_file_handle_dict = dict(zip(seq_id_size_sorted, cycle(file_handles)))
+
+        # write sequences to temporary files
+        with open(args.reference_sequence, 'r') as f:
+            for line in f:
+                if line[0] == '>':
+                    header = line.strip().split(' ')[0][1:]
+                    print(header)
+                    seq_id_file_handle_dict[header].write(line)
+                else:
+                    seq_id_file_handle_dict[header].write(line)
+        # close file handles
+        for file_handle in file_handles:
+            file_handle.close()
+
+        # split gff3 file to temporary files -
+        # each temporary file will contain gff lines matching its fasta chunk
+        temp_files_gff = make_temp_files(number_of_temp_files)
+        file_handles = [open(temp_file, 'w') for temp_file in temp_files_gff]
+        # make dictionary with seq_id_size_sorted as keys and file handles as values
+        seq_id_file_handle_dict = dict(zip(seq_id_size_sorted, cycle(file_handles)))
+        # write gff lines to chunks
+        with open(args.gff3, 'r') as f:
+            for line in f:
+                if line[0] == '#':
+                    continue
+                else:
+                    header = line.strip().split('\t')[0]
+                    seq_id_file_handle_dict[header].write(line)
+        # close file handles
+        for file_handle in file_handles:
+            file_handle.close()
+
+        # run retrotransposon detection on each temporary file
+        output_files = make_temp_files(number_of_temp_files)
+        for i in range(number_of_temp_files):
+            print('Running retrotransposon detection on file ' + str(i))
+            subprocess.check_call(
+                [f'{tool_path}/detect_putative_ltr.R', '-s', temp_files_fasta[i], '-g',
+                 temp_files_gff[i], '-o', output_files[i], '-c', str(args.cpu), '-M',
+                 str(args.max_missing_domains), '-L', str(args.min_relative_length)]
+                )
+
+        # remove all temporary input files
+        for temp_file in temp_files_fasta + temp_files_gff:
+            os.remove(temp_file)
+
+        # concatenate output files
+        output_file_suffixes = ['_D.fasta', '_DL.fasta', '_DLT.fasta', '_DLTP.fasta',
+                                '_DLP.fasta', '.gff3', '_statistics.csv']
+
+        for suffix in output_file_suffixes:
+            if suffix == '_statistics.csv':
+                # sum up lines with the same word in the first column
+                stat_files = [output_file + suffix for output_file in output_files]
+                new_statistics = sum_up_stats_files(stat_files)
+                with open(args.output + suffix, 'w') as f:
+                    f.write("\n".join(new_statistics))
+                # remove parsed temporary statistics files
+                for file in stat_files:
+                    os.remove(file)
+            else:
+                with open(args.output + suffix, 'w') as f:
+                    for i in range(number_of_temp_files):
+                        # some files may not exist, so we need to check
+                        try:
+                            with open(output_files[i] + suffix, 'r') as g:
+                                for line in g:
+                                    f.write(line)
+                            # remove parsed temporary output file
+                            os.remove(output_files[i] + suffix)
+                        except FileNotFoundError:
+                            pass
+    else:
+        # no need to split sequences into chunks
+        subprocess.check_call(
+            [f'{tool_path}/detect_putative_ltr.R', '-s', args.reference_sequence, '-g',
+             args.gff3, '-o', args.output, '-c', str(args.cpu), '-M',
+             str(args.max_missing_domains), '-L', str(args.min_relative_length)]
+            )
+
+
+if __name__ == '__main__':
+    # check version of python - must be 3.6 or greater
+    if sys.version_info < (3, 6):
+        print('Python version must be 3.6 or greater')
+        sys.exit(1)
+    main()
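The `sum_up_stats_files()` step above merges the per-chunk `*_statistics.csv` tables by summing the count columns row-wise per classification. A condensed sketch of that merge with invented numbers (the real tables are tab-separated with a `Classification` header column):

```python
chunk1 = {"Ty1/copia": [5, 3], "Ty3/gypsy": [2, 1]}  # hypothetical counts
chunk2 = {"Ty1/copia": [1, 0]}

merged = {}
for chunk in (chunk1, chunk2):
    for cls, counts in chunk.items():
        prev = merged.get(cls, [0] * len(counts))
        # element-wise sum of the count columns for the same classification
        merged[cls] = [a + b for a, b in zip(prev, counts)]

print(merged)  # {'Ty1/copia': [6, 3], 'Ty3/gypsy': [2, 1]}
```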
--- a/extract_putative_ltr.R	Wed Jul 13 11:02:55 2022 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,275 +0,0 @@
-#!/usr/bin/env Rscript
-initial_options <- commandArgs(trailingOnly = FALSE)
-file_arg_name <- "--file="
-script_name <- normalizePath(sub(file_arg_name, "", initial_options[grep(file_arg_name, initial_options)]))
-script_dir <- dirname(script_name)
-library(optparse)
-
-parser <- OptionParser()
-option_list <- list(
-  make_option(c("-g", "--gff3"), action = "store", type = "character",
-              help = "gff3 with dante results", default = NULL),
-  make_option(c("-s", "--reference_sequence"), action = "store", type = "character",
-              help = "reference sequence as fasta", default = NULL),
-  make_option(c("-o", "--output"), action = "store", type = "character",
-              help = "output file path and prefix", default = NULL),
-  make_option(c("-c", "--cpu"), type = "integer", default = 5,
-              help = "Number of cpu to use [default %default]", metavar = "number"),
-  make_option(c("-M", "--max_missing_domains"), type = "integer", default = 0,
-              help = "Maximum number of missing domains is retrotransposon [default %default]",
-              metavar = "number"),
-  make_option(c("-L", "--min_relative_length"), type = "numeric", default = 0.6,
-              help = "Minimum relative length of protein domain to be considered for retrostransposon detection [default %default]",
-              metavar = "number")
-
-)
-description <- paste(strwrap(""))
-
-epilogue <- ""
-parser <- OptionParser(option_list = option_list, epilogue = epilogue, description = description,
-                       usage = "usage: %prog COMMAND [OPTIONS]")
-opt <- parse_args(parser, args = commandArgs(TRUE))
-
-
-# load packages
-suppressPackageStartupMessages({
-  library(rtracklayer)
-  library(Biostrings)
-  library(BSgenome)
-  library(parallel)
-})
-
-
-# CONFIGURATION
-OFFSET <- 15000
-# load configuration files and functions:
-lineage_file <- paste0(script_dir, "/databases/lineage_domain_order.csv")
-FDM_file <- paste0(script_dir, "/databases/feature_distances_model.RDS")
-trna_db <- paste0(script_dir, "/databases/tRNAscan-SE_ALL_spliced-no_plus-old-tRNAs_UC_unique-3ends.fasta")
-ltr_utils_r <- paste0(script_dir, "/R/ltr_utils.R")
-if (file.exists(lineage_file) & file.exists(trna_db)) {
-  lineage_info <- read.table(lineage_file, sep = "\t", header = TRUE, as.is = TRUE)
-  FDM <- readRDS(FDM_file)
-  source(ltr_utils_r)
-}else {
-  # this destination work is installed using conda
-  lineage_file <- paste0(script_dir, "/../share/dante_ltr/databases/lineage_domain_order.csv")
-  FDM_file <- paste0(script_dir, "/../share/dante_ltr/databases/feature_distances_model.RDS")
-  trna_db <- paste0(script_dir, "/../share/dante_ltr/databases/tRNAscan-SE_ALL_spliced-no_plus-old-tRNAs_UC_unique-3ends.fasta")
-  ltr_utils_r <- paste0(script_dir, "/../share/dante_ltr/R/ltr_utils.R")
-  if (file.exists(lineage_file) & file.exists((trna_db))) {
-    lineage_info <- read.table(lineage_file, sep = "\t", header = TRUE, as.is = TRUE)
-    source(ltr_utils_r)
-    FDM <- readRDS(FDM_file)
-  }else(
-    stop("configuration files not found")
-  )
-}
-
-
-# for testing
-if (FALSE) {
-  g <- rtracklayer::import("/mnt/raid/454_data/cuscuta/Ceuropea_assembly_v4/0_final_asm_hifiasm+longstitch/repeat_annotation/DANTE_on_CEUR_filtered_short_names.gff3")
-  s <- readDNAStringSet("/mnt/raid/454_data/cuscuta/Ceuropea_assembly_v4/0_final_asm_hifiasm+longstitch/asm.bp.p.ctg_scaffolds.short_names.fa")
-  lineage_info <- read.table("/mnt/raid/users/petr/workspace/ltr_finder_test/lineage_domain_order.csv", sep = "\t", header = TRUE, as.is = TRUE)
-
-  g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data/sample_DANTE_unfiltered.gff3")
-  g <- rtracklayer::import("/mnt/raid/users/petr/workspace/ltr_finder_test/test_data/DANTE_filtered_part.gff3")
-  s <- readDNAStringSet("/mnt/raid/users/petr/workspace/ltr_finder_test/test_data/Rbp_part.fa")
-
-  # oriza
-  g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data/big_test_data/DANTE_full_oryza.gff3")
-  s <- readDNAStringSet("/mnt/raid/users/petr/workspace/dante_ltr/test_data/big_test_data/o_sativa_msu7.0.fasta")
-
-  g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data
-  /DANTE_Vfaba_chr5.gff3")
-  s <- readDNAStringSet("/mnt/ceph/454_data/Vicia_faba_assembly/assembly/ver_210910
-  /fasta_parts/211010_Vfaba_chr5.fasta")
-
-  g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data/big_test_data//Cocoa_theobroma_DANTE_filtered.gff3")
-  s <- readDNAStringSet("/mnt/raid/users/petr/workspace/dante_ltr/test_data/big_test_data/Cocoa_theobroma_chr1.fasta.gz")
-
-  source("R/ltr_utils.R")
-  ## feature distance model
-  FDM <- readRDS("./databases/feature_distances_model.RDS")
-  g <- rtracklayer::import("./test_data/sample_DANTE.gff3")
-  s <- readDNAStringSet("./test_data/sample_genome.fasta")
-  outfile <- "/mnt/raid/users/petr/workspace/ltr_finder_test/te_with_domains_2.gff3"
-  lineage_info <- read.table("databases/lineage_domain_order.csv", sep = "\t", header =
-                               TRUE, as.is = TRUE)
-  trna_db <- "./databases/tRNAscan-SE_ALL_spliced-no_plus-old-tRNAs_UC_unique-3ends.fasta"
-
-}
-
-# MAIN #############################################################
-
-# load data:
-
-cat("reading gff...")
-g <- rtracklayer::import(opt$gff3, format = 'gff3') # DANTE gff3
-cat("done\n")
-cat("reading fasta...")
-s <- readDNAStringSet(opt$reference_sequence) # genome assembly
-cat("done\n")
-outfile <- opt$output
-# clean sequence names:
-names(s) <- gsub(" .+", "", names(s))
-lineage_domain <- lineage_info$Domains.order
-lineage_domain_span <- lineage_info$domain_span
-lineage_ltr_mean_length <- lineage_info$ltr_length
-lineage_offset5prime <- lineage_info$offset5prime
-lineage_offset3prime <- lineage_info$offset3prime
-ln <- gsub("ss/I", "ss_I", gsub("_", "/", gsub("/", "|", lineage_info$Lineage)))
-names(lineage_offset3prime) <- ln
-names(lineage_offset5prime) <- ln
-names(lineage_domain) <- ln
-names(lineage_domain_span) <- ln
-names(lineage_ltr_mean_length) <- ln
-lineage_domains_sequence <- unlist(mapply(function(d,l) {
-  paste(strsplit(d, " ")[[1]], ":", l, sep = "")
-}, d = lineage_domain, l = names(lineage_domain)))
-
-# filter g gff3
-g <- dante_filtering(g, Relative_Length = opt$min_relative_length) # default
-
-seqlengths(g) <- seqlengths(s)[names(seqlengths(g))]
-g <- add_coordinates_of_closest_neighbor(g)
-
-# add info about domain order:
-g$domain_order <- as.numeric(factor(paste(g$Name, g$Final_Classification, sep = ":"), levels = lineage_domains_sequence))
-# set NA to 0 in g$domain_order ( some domains are not fromm ClassI elements
-g$domain_order[is.na(g$domain_order)] <- 0
-
-# NOTE - some operation is faster of GrangesList but some on list of data.frames
-# this is primary clusteing
-cls <- get_domain_clusters(g)
-gcl <- split(as.data.frame(g), cls)
-# gcl_as_GRL <- split(g, cls) # delete?
-
-cls_alt <- get_domain_clusters_alt(g, FDM)
-g$Cluster <- as.numeric(factor(cls_alt))
-
-gcl_alt <- split(as.data.frame(g), cls_alt)
-
-TE_partial <- GRanges(seqnames = sapply(gcl_alt, function(x) x$seqnames[1]),
-                      Name = sapply(gcl_alt, function(x) x$Final_Classification[1]),
-                      Final_Classification = sapply(gcl_alt, function(x) x$Final_Classification[1]),
-                      ID = sapply(gcl_alt, function(x) paste0("TE_partial_", x$Cluster[1])),
-                      strand = sapply(gcl_alt, function(x) x$strand[1]),
-                      Ndomains = sapply(gcl_alt, function(x) nrow(x)),
-                      type = "transposable_element",
-                      source = "dante_ltr",
-                      Rank="D",
-                      IRanges(start = sapply(gcl_alt, function(x) min(x$start)),
-                              end = sapply(gcl_alt, function(x) max(x$end)))
-)
-g$Ndomains_in_cluster <- count_occurences_for_each_element(g$Cluster)
-g$Parent <- paste0("TE_partial_", g$Cluster)
-g$Rank <- "D"
-
-# keep only partial TE with more than one domain
-TE_partial_with_more_than_one_domain <- TE_partial[TE_partial$Ndomains > 1]
-g_with_more_than_one_domain <- g[as.vector(g$Ndomains_in_cluster > 1)]
-
-# first filtering - remove TEs with low number of domains
-gcl_clean <- clean_domain_clusters(gcl, lineage_domain_span, min_domains = 5 - opt$max_missing_domains)
-
-# glc annotation
-gcl_clean_lineage <- sapply(gcl_clean, function(x) x$Final_Classification[1])
-gcl_clean_domains <- sapply(gcl_clean, function(x) ifelse(x$strand[1] == "-",
-                                                          paste(rev(x$Name), collapse = " "),
-                                                          paste(x$Name, collapse = " "))
-)
-
-# compare detected domains with domains in lineages from REXdb database
-dd <- mapply(domain_distance,
-             d_query = gcl_clean_domains,
-             d_reference = lineage_domain[gcl_clean_lineage])
-
-# get lineages which has correct number and order of domains
-# gcl_clean2 <- gcl_clean[gcl_clean_domains == lineage_domain[gcl_clean_lineage]]
-gcl_clean2 <- gcl_clean[dd <= opt$max_missing_domains]
-
-gcl_clean_with_domains <- gcl_clean2[check_ranges(gcl_clean2, s)]
-gr <- get_ranges(gcl_clean_with_domains)
-
-
-cat('Number of analyzed regions:\n')
-cat('Total number of domain clusters : ', length(gcl), '\n')
-cat('Number of clean clusters : ', length(gcl_clean), '\n')
-cat('Number of clusters with complete domain set : ', length(gcl_clean_with_domains), '\n')
-
-
-te_strand <- sapply(gcl_clean_with_domains, function(x)x$strand[1])
-te_lineage <- sapply(gcl_clean_with_domains, function(x)x$Final_Classification[1])
-
-max_left_offset <- ifelse(te_strand == "+", lineage_offset5prime[te_lineage], lineage_offset3prime[te_lineage])
-max_right_offset <- ifelse(te_strand == "-", lineage_offset5prime[te_lineage], lineage_offset3prime[te_lineage])
-
-grL <- get_ranges_left(gcl_clean_with_domains, max_left_offset)
-grR <- get_ranges_right(gcl_clean_with_domains, max_right_offset)
-
-s_left <- getSeq(s, grL)
-s_right <- getSeq(s, grR)
-
-expected_ltr_length <- lineage_ltr_mean_length[sapply(gcl_clean_with_domains, function (x)x$Final_Classification[1])]
-# for statistics
-RT <- g[g$Name == "RT" & substring(g$Final_Classification, 1, 11) == "Class_I|LTR"]
-# cleanup
-#rm(g)
-rm(gcl)
-rm(gcl_clean)
-rm(gcl_clean2)
-
-names(te_strand) <- paste(seqnames(gr), start(gr), end(gr), sep = "_")
-names(s_left) <- paste(seqnames(grL), start(grL), end(grL), sep = "_")
-names(s_right) <- paste(seqnames(grR), start(grR), end(grR), sep = "_")
-cat('Identification of LTRs...')
-TE <- mclapply(seq_along(gr), function(x)get_TE(s_left[x],
-                                                s_right[x],
-                                                gcl_clean_with_domains[[x]],
-                                                gr[x],
-                                                grL[x],
-                                                grR[x],
-                                                expected_ltr_length[x]),
-               mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE
-)
-
-cat('done.\n')
-
-good_TE <- TE[!sapply(TE, is.null)]
-cat('Number of putative TE with identified LTR :', length(good_TE), '\n')
-
-# TODO - extent TE region to cover also TSD
-ID <- paste0('TE_', sprintf("%08d", seq(good_TE)))
-gff3_list <- mcmapply(get_te_gff3, g = good_TE, ID = ID, mc.cores = opt$cpu)
-
-cat('Identification of PBS ...')
-gff3_list2 <- mclapply(gff3_list, FUN = add_pbs, s = s, trna_db = trna_db, mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE)
-cat('done\n')
-gff3_out <- do.call(c, gff3_list2)
-
-# define new source
-src <- as.character(gff3_out$source)
-src[is.na(src)] <- "dante_ltr"
-gff3_out$source <- src
-gff3_out$Rank <- get_te_rank(gff3_out)
-
-# add partial TEs but first remove all ovelaping
-TE_partial_parent_part <- TE_partial_with_more_than_one_domain[TE_partial_with_more_than_one_domain %outside% gff3_out]
-TE_partial_domain_part <- g[g$Parent %in% TE_partial_parent_part$ID]
-
-gff3_out <- sort(c(gff3_out, TE_partial_domain_part, TE_partial_parent_part), by = ~ seqnames * start)
-
-export(gff3_out, con = paste0(outfile, ".gff3"), format = 'gff3')
-
-all_tbl <- get_te_statistics(gff3_out, RT)
-all_tbl <- cbind(Classification = rownames(all_tbl), all_tbl)
-write.table(all_tbl, file = paste0(outfile, "_statistics.csv"), sep = "\t", quote = FALSE, row.names = FALSE)
-# export fasta files:
-s_te <- get_te_sequences(gff3_out, s)
-for (i in seq_along(s_te)) {
-  outname <- paste0(outfile, "_", names(s_te)[i], ".fasta")
-  writeXStringSet(s_te[[i]], filepath = outname)
-}
-
--- a/requirements.txt	Wed Jul 13 11:02:55 2022 +0000
+++ b/requirements.txt	Thu Jul 21 08:23:15 2022 +0000
@@ -4,3 +4,4 @@
 bioconductor-rtracklayer=1.54
 r-optparse=1.7.1
 r-dplyr=1.0.7
+python>=3.6.0
--- a/tests.sh	Wed Jul 13 11:02:55 2022 +0000
+++ b/tests.sh	Thu Jul 21 08:23:15 2022 +0000
@@ -3,10 +3,17 @@
 set -e
 # first argument is cpu number
 NCPU_TO_USE=$1
+
+# test if NCPU_TO_USE is set:
+if [ -z "${NCPU_TO_USE}" ]; then
+  echo "NCPU_TO_USE is not set, using 10"
+  NCPU_TO_USE=10
+fi
+
 eval "$(conda shell.bash hook)"
 conda activate dante_ltr
 
 echo "Running tests 1, detection of LTRs"
-./extract_putative_ltr.R -s test_data/sample_genome.fasta \
+./detect_putative_ltr.R -s test_data/sample_genome.fasta \
 -g test_data/sample_DANTE.gff3 -o tmp/test_output1 -c $NCPU_TO_USE
 
@@ -17,7 +24,7 @@
 -o tmp/test_output2 -c $NCPU_TO_USE
 
 echo "Running tests 3, detection of LTRs, allow missing domains"
-./extract_putative_ltr.R -s test_data/sample_genome.fasta \
+./detect_putative_ltr.R -s test_data/sample_genome.fasta \
 -g test_data/sample_DANTE.gff3 -o tmp/test_output3 -c $NCPU_TO_USE -M 2
 
@@ -28,3 +35,12 @@
 -o tmp/test_output4 -c $NCPU_TO_USE
 
+echo "Running tests 5, detection of LTRs using python wrapper"
+./detect_putative_ltr_wrapper.py -s test_data/sample_genome.fasta \
+-g test_data/sample_DANTE.gff3 -o tmp/test_output5 -c $NCPU_TO_USE \
+-S 10000000
+cat tmp/test_output5_statistics.csv
+
+echo "Running tests 6, filtering gff"
+./clean_ltr.R -g tmp/test_output5.gff3 -s test_data/sample_genome.fasta \
+-o tmp/test_output6 -c $NCPU_TO_USE