changeset 13:559940c04c44 draft
"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
| author | petr-novak |
| --- | --- |
| date | Thu, 11 Aug 2022 07:29:06 +0000 |
| parents | ff01d4263391 |
| children | 3b3a87915ef6 |
| files | clean_dante_ltr.xml dante_ltr_search.xml detect_putative_ltr.R detect_putative_ltr_wrapper.py tests.sh |
| diffstat | 5 files changed, 538 insertions(+), 136 deletions(-) |
--- a/clean_dante_ltr.xml Thu Jul 21 08:23:15 2022 +0000
+++ b/clean_dante_ltr.xml Thu Aug 11 07:29:06 2022 +0000
@@ -1,4 +1,4 @@
-<tool id="clean_dante_ltr" name="DANTE_LTR retrotransposons filtering" version="0.1.7.0" python_template_version="3.5">
+<tool id="clean_dante_ltr" name="DANTE_LTR retrotransposons filtering" version="0.1.8.0" python_template_version="3.5">
     <requirements>
         <requirement type="package">r-optparse</requirement>
--- a/dante_ltr_search.xml Thu Jul 21 08:23:15 2022 +0000
+++ b/dante_ltr_search.xml Thu Aug 11 07:29:06 2022 +0000
@@ -1,4 +1,4 @@
-<tool id="dante_ltr_search" name="DANTE_LTR retrotransposon identification" version="0.1.7.0" python_template_version="3.5">
+<tool id="dante_ltr_search" name="DANTE_LTR retrotransposon identification" version="0.1.8.0" python_template_version="3.5">
     <requirements>
         <requirement type="package">blast</requirement>
         <requirement type="package">r-optparse</requirement>
--- a/detect_putative_ltr.R Thu Jul 21 08:23:15 2022 +0000
+++ b/detect_putative_ltr.R Thu Aug 11 07:29:06 2022 +0000
@@ -109,7 +109,6 @@
 # MAIN #############################################################
 # load data:
-cat("reading gff...")
 g <- rtracklayer::import(opt$gff3, format = 'gff3')  # DANTE gff3
 cat("done\n")
@@ -134,150 +133,173 @@
   paste(strsplit(d, " ")[[1]], ":", l, sep = "")
 }, d = lineage_domain, l = names(lineage_domain)))

-# filter g gff3
-g <- dante_filtering(g, Relative_Length = opt$min_relative_length) # default
+# this repeat block is run just once
+# it can breaks in eny point if zero TE is found
+repeat{

-seqlengths(g) <- seqlengths(s)[names(seqlengths(g))]
-g <- add_coordinates_of_closest_neighbor(g)
+  # filter g gff3
+  g <- dante_filtering(g, Relative_Length = opt$min_relative_length) # default

-# add info about domain order:
-g$domain_order <- as.numeric(factor(paste(g$Name, g$Final_Classification, sep = ":"), levels = lineage_domains_sequence))
-# set NA to 0 in g$domain_order ( some domains are not fromm ClassI elements
-g$domain_order[is.na(g$domain_order)] <- 0
+  seqlengths(g) <- seqlengths(s)[names(seqlengths(g))]
+  g <- add_coordinates_of_closest_neighbor(g)

-# NOTE - some operation is faster of GrangesList but some on list of data.frames
-# this is primary clusteing
-cls <- get_domain_clusters(g)
-gcl <- split(as.data.frame(g), cls)
-# gcl_as_GRL <- split(g, cls) # delete?
+  # add info about domain order:
+  g$domain_order <- as.numeric(factor(paste(g$Name, g$Final_Classification, sep = ":"), levels = lineage_domains_sequence))
+  # set NA to 0 in g$domain_order ( some domains are not fromm ClassI elements
+  g$domain_order[is.na(g$domain_order)] <- 0

-cls_alt <- get_domain_clusters_alt(g, FDM)
-g$Cluster <- as.numeric(factor(cls_alt))
+  # NOTE - some operation is faster of GrangesList but some on list of data.frames
+  # this is primary clusteing
+  cls <- get_domain_clusters(g)
+  gcl <- split(as.data.frame(g), cls)
+  # gcl_as_GRL <- split(g, cls) # delete?

-gcl_alt <- split(as.data.frame(g), cls_alt)
+  cls_alt <- get_domain_clusters_alt(g, FDM)
+  g$Cluster <- as.numeric(factor(cls_alt))
+
+  gcl_alt <- split(as.data.frame(g), cls_alt)

-TE_partial <- GRanges(seqnames = sapply(gcl_alt, function(x) x$seqnames[1]),
-                      Name = sapply(gcl_alt, function(x) x$Final_Classification[1]),
-                      Final_Classification = sapply(gcl_alt, function(x) x$Final_Classification[1]),
-                      ID = sapply(gcl_alt, function(x) paste0("TE_partial_", sprintf("%08d", x$Cluster[1]))),
-                      strand = sapply(gcl_alt, function(x) x$strand[1]),
-                      Ndomains = sapply(gcl_alt, function(x) nrow(x)),
-                      type = "transposable_element",
-                      source = "dante_ltr",
-                      Rank="D",
-                      IRanges(start = sapply(gcl_alt, function(x) min(x$start)),
-                              end = sapply(gcl_alt, function(x) max(x$end)))
-)
-g$Ndomains_in_cluster <- count_occurences_for_each_element(g$Cluster)
-g$Parent <- paste0("TE_partial_", sprintf("%08d", g$Cluster))
-g$Rank <- "D"
-
-# keep only partial TE with more than one domain
-TE_partial_with_more_than_one_domain <- TE_partial[TE_partial$Ndomains > 1]
-g_with_more_than_one_domain <- g[as.vector(g$Ndomains_in_cluster > 1)]
+  TE_partial <- GRanges(seqnames = sapply(gcl_alt, function(x) x$seqnames[1]),
+                        Name = sapply(gcl_alt, function(x) x$Final_Classification[1]),
+                        Final_Classification = sapply(gcl_alt, function(x) x$Final_Classification[1]),
+                        ID = sapply(gcl_alt, function(x) paste0("TE_partial_", sprintf("%08d", x$Cluster[1]))),
+                        strand = sapply(gcl_alt, function(x) x$strand[1]),
+                        Ndomains = sapply(gcl_alt, function(x) nrow(x)),
+                        type = "transposable_element",
+                        source = "dante_ltr",
+                        Rank="D",
+                        IRanges(start = sapply(gcl_alt, function(x) min(x$start)),
+                                end = sapply(gcl_alt, function(x) max(x$end)))
+  )
+  g$Ndomains_in_cluster <- count_occurences_for_each_element(g$Cluster)
+  g$Parent <- paste0("TE_partial_", sprintf("%08d", g$Cluster))
+  g$Rank <- "D"
+  # for statistics
+  RT <- g[g$Name == "RT" & substring(g$Final_Classification, 1, 11) == "Class_I|LTR"]

-# first filtering - remove TEs with low number of domains
-gcl_clean <- clean_domain_clusters(gcl, lineage_domain_span, min_domains = 5 - opt$max_missing_domains)
+  # keep only partial TE with more than one domain
+  TE_partial_with_more_than_one_domain <- TE_partial[TE_partial$Ndomains > 1]
+  g_with_more_than_one_domain <- g[as.vector(g$Ndomains_in_cluster > 1)]

-# glc annotation
-gcl_clean_lineage <- sapply(gcl_clean, function(x) x$Final_Classification[1])
-gcl_clean_domains <- sapply(gcl_clean, function(x) ifelse(x$strand[1] == "-",
-                                                          paste(rev(x$Name), collapse = " "),
-                                                          paste(x$Name, collapse = " "))
-)
+  # first filtering - remove TEs with low number of domains
+  gcl_clean <- clean_domain_clusters(gcl, lineage_domain_span, min_domains = 5 - opt$max_missing_domains)

-# compare detected domains with domains in lineages from REXdb database
-dd <- mapply(domain_distance,
-             d_query = gcl_clean_domains,
-             d_reference = lineage_domain[gcl_clean_lineage])
+  # glc annotation
+  gcl_clean_lineage <- sapply(gcl_clean, function(x) x$Final_Classification[1])
+  gcl_clean_domains <- sapply(gcl_clean, function(x) ifelse(x$strand[1] == "-",
+                                                            paste(rev(x$Name), collapse = " "),
+                                                            paste(x$Name, collapse = " "))
+  )

-# get lineages which has correct number and order of domains
-# gcl_clean2 <- gcl_clean[gcl_clean_domains == lineage_domain[gcl_clean_lineage]]
-gcl_clean2 <- gcl_clean[dd <= opt$max_missing_domains]
+  # compare detected domains with domains in lineages from REXdb database
+  dd <- mapply(domain_distance,
+               d_query = gcl_clean_domains,
+               d_reference = lineage_domain[gcl_clean_lineage])

-gcl_clean_with_domains <- gcl_clean2[check_ranges(gcl_clean2, s)]
-gr <- get_ranges(gcl_clean_with_domains)
+  # get lineages which has correct number and order of domains
+  # gcl_clean2 <- gcl_clean[gcl_clean_domains == lineage_domain[gcl_clean_lineage]]

-cat('Number of analyzed regions:\n')
-cat('Total number of domain clusters : ', length(gcl), '\n')
-cat('Number of clean clusters : ', length(gcl_clean), '\n')
-cat('Number of clusters with complete domain set : ', length(gcl_clean_with_domains), '\n')
+  gcl_clean2 <- gcl_clean[dd <= opt$max_missing_domains]
+
+  if(length(gcl_clean2) == 0) {
+    cat("No complete TE found\n")
+    good_TE <- list()
+    break
+  }
+  gcl_clean_with_domains <- gcl_clean2[check_ranges(gcl_clean2, s)]
+  gr <- get_ranges(gcl_clean_with_domains)
+
+  cat('Number of analyzed regions:\n')
+  cat('Total number of domain clusters : ', length(gcl), '\n')
+  cat('Number of clean clusters : ', length(gcl_clean), '\n')
+  cat('Number of clusters with complete domain set : ', length(gcl_clean_with_domains), '\n')

-te_strand <- sapply(gcl_clean_with_domains, function(x)x$strand[1])
-te_lineage <- sapply(gcl_clean_with_domains, function(x)x$Final_Classification[1])
+  te_strand <- sapply(gcl_clean_with_domains, function(x)x$strand[1])
+  te_lineage <- sapply(gcl_clean_with_domains, function(x)x$Final_Classification[1])

-max_left_offset <- ifelse(te_strand == "+", lineage_offset5prime[te_lineage], lineage_offset3prime[te_lineage])
-max_right_offset <- ifelse(te_strand == "-", lineage_offset5prime[te_lineage], lineage_offset3prime[te_lineage])
+  max_left_offset <- ifelse(te_strand == "+", lineage_offset5prime[te_lineage], lineage_offset3prime[te_lineage])
+  max_right_offset <- ifelse(te_strand == "-", lineage_offset5prime[te_lineage], lineage_offset3prime[te_lineage])

-grL <- get_ranges_left(gcl_clean_with_domains, max_left_offset)
-grR <- get_ranges_right(gcl_clean_with_domains, max_right_offset)
+  grL <- get_ranges_left(gcl_clean_with_domains, max_left_offset)
+  grR <- get_ranges_right(gcl_clean_with_domains, max_right_offset)

-s_left <- getSeq(s, grL)
-s_right <- getSeq(s, grR)
+  s_left <- getSeq(s, grL)
+  s_right <- getSeq(s, grR)
+
+  expected_ltr_length <- lineage_ltr_mean_length[sapply(gcl_clean_with_domains, function (x)x$Final_Classification[1])]

-expected_ltr_length <- lineage_ltr_mean_length[sapply(gcl_clean_with_domains, function (x)x$Final_Classification[1])]
-# for statistics
-RT <- g[g$Name == "RT" & substring(g$Final_Classification, 1, 11) == "Class_I|LTR"]
-# cleanup
-#rm(g)
-rm(gcl)
-rm(gcl_clean)
-rm(gcl_clean2)
+  # cleanup
+  #rm(g)
+  rm(gcl)
+  rm(gcl_clean)
+  rm(gcl_clean2)

-names(te_strand) <- paste(seqnames(gr), start(gr), end(gr), sep = "_")
-names(s_left) <- paste(seqnames(grL), start(grL), end(grL), sep = "_")
-names(s_right) <- paste(seqnames(grR), start(grR), end(grR), sep = "_")
-cat('Identification of LTRs...')
-TE <- mclapply(seq_along(gr), function(x)get_TE(s_left[x],
-                                                s_right[x],
-                                                gcl_clean_with_domains[[x]],
-                                                gr[x],
-                                                grL[x],
-                                                grR[x],
-                                                expected_ltr_length[x]),
-               mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE
-)
-
-cat('done.\n')
+  names(te_strand) <- paste(seqnames(gr), start(gr), end(gr), sep = "_")
+  names(s_left) <- paste(seqnames(grL), start(grL), end(grL), sep = "_")
+  names(s_right) <- paste(seqnames(grR), start(grR), end(grR), sep = "_")
+  cat('Identification of LTRs...')
+  TE <- mclapply(seq_along(gr), function(x)get_TE(s_left[x],
+                                                  s_right[x],
+                                                  gcl_clean_with_domains[[x]],
+                                                  gr[x],
+                                                  grL[x],
+                                                  grR[x],
+                                                  expected_ltr_length[x]),
+                 mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE
+  )

-good_TE <- TE[!sapply(TE, is.null)]
-cat('Number of putative TE with identified LTR :', length(good_TE), '\n')
-
-# TODO - extent TE region to cover also TSD
-ID <- paste0('TE_', sprintf("%08d", seq(good_TE)))
-gff3_list <- mcmapply(get_te_gff3, g = good_TE, ID = ID, mc.cores = opt$cpu)
+  cat('done.\n')
+  good_TE <- TE[!sapply(TE, is.null)]
+  cat('Number of putative TE with identified LTR :', length(good_TE), '\n')
+  break
+}

-cat('Identification of PBS ...')
-gff3_list2 <- mclapply(gff3_list, FUN = add_pbs, s = s, trna_db = trna_db, mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE)
-cat('done\n')
-gff3_out <- do.call(c, gff3_list2)
+if (length(good_TE)>0){ # handle empty list
+  ID <- paste0('TE_', sprintf("%08d", seq(good_TE)))
+  gff3_list <- mcmapply(get_te_gff3, g = good_TE, ID = ID, mc.cores = opt$cpu)

-# define new source
-src <- as.character(gff3_out$source)
-src[is.na(src)] <- "dante_ltr"
-gff3_out$source <- src
-gff3_out$Rank <- get_te_rank(gff3_out)
+  cat('Identification of PBS ...')
+  gff3_list2 <- mclapply(gff3_list, FUN = add_pbs, s = s, trna_db = trna_db, mc.set.seed = TRUE, mc.cores = opt$cpu, mc.preschedule = FALSE)
+  cat('done\n')
+  gff3_out <- do.call(c, gff3_list2)

-# add partial TEs but first remove all ovelaping
-TE_partial_parent_part <- TE_partial_with_more_than_one_domain[TE_partial_with_more_than_one_domain %outside% gff3_out]
-TE_partial_domain_part <- g[g$Parent %in% TE_partial_parent_part$ID]
-
-gff3_out <- sort(c(gff3_out, TE_partial_domain_part, TE_partial_parent_part), by = ~ seqnames * start)
-# modify ID and Parent - add seqname - this will ensure it is unique is done on chunk level
-gff3_out$ID[!is.na(gff3_out$ID)] <- paste0(gff3_out$ID[!is.na(gff3_out$ID)], "_", seqnames(gff3_out)[!is.na(gff3_out$ID)])
-gff3_out$Parent[!is.na(gff3_out$Parent)] <- paste0(gff3_out$Parent[!is.na(gff3_out$Parent)], "_", seqnames(gff3_out)[!is.na(gff3_out$Parent)])
-
-export(gff3_out, con = paste0(outfile, ".gff3"), format = 'gff3')
-
-all_tbl <- get_te_statistics(gff3_out, RT)
-all_tbl <- cbind(Classification = rownames(all_tbl), all_tbl)
-write.table(all_tbl, file = paste0(outfile, "_statistics.csv"), sep = "\t", quote = FALSE, row.names = FALSE)
-# export fasta files:
-s_te <- get_te_sequences(gff3_out, s)
-for (i in seq_along(s_te)) {
-  outname <- paste0(outfile, "_", names(s_te)[i], ".fasta")
-  writeXStringSet(s_te[[i]], filepath = outname)
+  # define new source
+  src <- as.character(gff3_out$source)
+  src[is.na(src)] <- "dante_ltr"
+  gff3_out$source <- src
+  gff3_out$Rank <- get_te_rank(gff3_out)
+  # add partial TEs but first remove all ovelaping
+  TE_partial_parent_part <- TE_partial_with_more_than_one_domain[TE_partial_with_more_than_one_domain %outside% gff3_out]
+  TE_partial_domain_part <- g[g$Parent %in% TE_partial_parent_part$ID]
+  gff3_out <- sort(c(gff3_out, TE_partial_domain_part, TE_partial_parent_part), by = ~ seqnames * start)
+}else{
+  # but this could be a problem if there are no TEs in the sequence
+  if (length(TE_partial_with_more_than_one_domain)>0){
+    TE_partial_parent_part <- TE_partial_with_more_than_one_domain
+    TE_partial_domain_part <- g[g$Parent %in% TE_partial_parent_part$ID]
+    gff3_out <- sort(c(TE_partial_domain_part, TE_partial_parent_part), by = ~ seqnames * start)
+  }else{
+    gff3_out <- NULL
+  }
 }
+if (is.null(gff3_out)){
+  cat('No TEs found.\n')
+}else{
+# modify ID and Parent - add seqname - this will ensure it is unique is done on chunk level
+  gff3_out$ID[!is.na(gff3_out$ID)] <- paste0(gff3_out$ID[!is.na(gff3_out$ID)], "_", seqnames(gff3_out)[!is.na(gff3_out$ID)])
+  gff3_out$Parent[!is.na(gff3_out$Parent)] <- paste0(gff3_out$Parent[!is.na(gff3_out$Parent)], "_", seqnames(gff3_out)[!is.na(gff3_out$Parent)])
+  export(gff3_out, con = paste0(outfile, ".gff3"), format = 'gff3')
+  all_tbl <- get_te_statistics(gff3_out, RT)
+  all_tbl <- cbind(Classification = rownames(all_tbl), all_tbl)
+  write.table(all_tbl, file = paste0(outfile, "_statistics.csv"), sep = "\t", quote = FALSE, row.names = FALSE)
+  # export fasta files:
+  s_te <- get_te_sequences(gff3_out, s)
+  for (i in seq_along(s_te)) {
+    outname <- paste0(outfile, "_", names(s_te)[i], ".fasta")
+    writeXStringSet(s_te[[i]], filepath = outname)
+  }
+}
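The change above wraps the whole detection pipeline of detect_putative_ltr.R in a `repeat { ... break }` block that runs exactly once: when filtering leaves no complete domain cluster, the script breaks out early with an empty `good_TE` list, and the output code afterwards either falls back to partial TEs or reports that nothing was found. A minimal sketch of the same run-once/bail-out control flow, written in Python purely for illustration (the helper names below are hypothetical and not part of dante_ltr):

```python
# Hypothetical stand-ins for the R filtering and LTR-detection steps.
def filter_clusters(clusters):
    return [c for c in clusters if c.get("complete")]

def find_ltrs(clean_clusters):
    return [c["id"] for c in clean_clusters]

def detect(clusters):
    good_te = []
    while True:                      # runs exactly once, like R's repeat { ... }
        clean = filter_clusters(clusters)
        if not clean:
            print("No complete TE found")
            break                    # early exit; good_te stays empty
        good_te = find_ltrs(clean)
        break                        # normal exit after a single pass
    return good_te                   # downstream code must handle an empty list

print(detect([{"id": "TE_00000001", "complete": True}, {"id": "x", "complete": False}]))
```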
--- a/detect_putative_ltr_wrapper.py Thu Jul 21 08:23:15 2022 +0000
+++ b/detect_putative_ltr_wrapper.py Thu Aug 11 07:29:06 2022 +0000
@@ -2,7 +2,8 @@
 """This wrapper is intended to be used on large genomes and large DANTE input to
 minimize memory usage, It splits input files to pieces and analyze it on by one by
 detect_putative_ltr.R
-If input does not exceed memory limit, it will run detect_putative_ltr.R directly
+If input does not exceed specified max-chunk_size, it will run detect_putative_ltr.R
+directly
 """
 import argparse
@@ -13,6 +14,99 @@
 import subprocess


+class Gff3Feature:
+    """
+    Class for gff3 feature
+    """
+
+    def __init__(self, line):
+        self.line = line
+        self.items = line.strip().split('\t')
+        self.header = self.items[0]
+        self.source = self.items[1]
+        self.type = self.items[2]
+        self.start = int(self.items[3])
+        self.end = int(self.items[4])
+        self.score = self.items[5]
+        self.strand = self.items[6]
+        self.frame = self.items[7]
+        self.attributes = self.items[8]
+        self.attributes_dict = {}
+        for item in self.attributes.split(';'):
+            if item != '':
+                key, value = item.split('=')
+                self.attributes_dict[key] = value
+
+        self.attributes_str = ';'.join(
+            ['{}={}'.format(key, value) for key, value in self.attributes_dict.items()]
+            )
+
+    def __str__(self):
+        return '\t'.join(
+            [self.header, self.source, self.type, str(self.start), str(self.end),
+             self.score, self.strand, self.frame, self.attributes_str]
+            ) + '\n'
+
+    def __repr__(self):
+        return '\t'.join(
+            [self.header, self.source, self.type, str(self.start), str(self.end),
+             self.score, self.strand, self.frame, self.attributes_str]
+            ) + '\n'
+
+    def __eq__(self, other):
+        return self.line_recalculated() == other.line_recalculated()
+
+    def __hash__(self):
+        return hash(self.line_recalculated())
+
+    def get_line(self):
+        """returns original line"""
+        return self.line
+
+    def overlap(self, other):
+        """
+        Check if two features overlap
+        :param other:
+        :return:
+        """
+        if self.start <= other.end and self.end >= other.start:
+            return True
+        else:
+            return False
+
+    def line_recalculated(self):
+        """
+        :return:
+        string with recalculated line
+        """
+        return '\t'.join(
+            [self.header, self.source, self.type, str(self.start), str(self.end),
+             self.score, self.strand, self.frame, self.attributes_str]
+            ) + '\n'
+
+    def __lt__(self, other):
+        width = self.end - self.start
+        other_width = other.end - other.start
+        return width < other_width
+
+    def __gt__(self, other):
+        width = self.end - self.start
+        other_width = other.end - other.start
+        return width > other_width
+
+    def identical_region(self, other):
+        """
+        Check if two features are identical
+        :param other:
+        :return:
+        """
+        if self.start == other.start and self.end == other.end and self.header == \
+                other.header:
+            return True
+        else:
+            return False
+
+
 def get_arguments():
     """
     Get arguments from command line
@@ -50,9 +144,8 @@
     parser.add_argument(
         '-S', '--max_chunk_size', default=100000000, required=False, type=int,
         help='If size of reference sequence is greater than this value, reference is '
-             'analyzed in chunks of this size. This is just approximate value - '
-             'sequences '
-             'which are longer are are not split, default is %(default)s'
+             'analyzed in chunks of this size. default is %(default)s'
+             'Setting this value too small will slow down the analysis'
        )
     args = parser.parse_args()
     return args
@@ -81,6 +174,7 @@
     temp_files = []
     for i in range(number_of_files):
         temp_files.append(tempfile.NamedTemporaryFile(delete=False).name)
+        os.remove(temp_files[-1])
     return temp_files

@@ -112,6 +206,259 @@
     return sorted_stat_with_header


+def read_single_fasta_to_dictionary(fh):
+    """
+    Read fasta file into dictionary
+    :param fh:
+    :return:
+    fasta_dict
+    """
+    fasta_dict = {}
+    for line in fh:
+        if line[0] == '>':
+            header = line.strip().split(' ')[0][1:]  # remove part of name after space
+            fasta_dict[header] = []
+        else:
+            fasta_dict[header] += [line.strip()]
+    fasta_dict = {k: ''.join(v) for k, v in fasta_dict.items()}
+    return fasta_dict
+
+
+def split_fasta_to_chunks(fasta_file, chunk_size=100000000, overlap=100000):
+    """
+    Split fasta file to chunks, sequences longe than chuck size are split to overlaping
+    peaces. If sequences are shorter, chunck with multiple sequences are created.
+    :param fasta_file:
+
+    :param fasta_file:
+    :param chunk_size:
+    :param overlap:
+    :return:
+    fasta_file_split
+    matching_table
+    """
+    min_chunk_size = chunk_size * 2
+    fasta_dict = read_fasta_sequence_size(fasta_file)
+    # calculates ranges for splitting of fasta files and store them in list
+    matching_table = []
+    fasta_file_split = tempfile.NamedTemporaryFile(delete=False).name
+    for header, size in fasta_dict.items():
+        if size > min_chunk_size:
+            number_of_chunks = int(size / chunk_size)
+            adjusted_chunk_size = int(size / number_of_chunks)
+            for i in range(number_of_chunks):
+                start = i * adjusted_chunk_size
+                end = ((i + 1) *
+                       adjusted_chunk_size
+                       + overlap) if i + 1 < number_of_chunks else size
+                new_header = header + '_' + str(i)
+                matching_table.append([header, i, start, end, new_header])
+        else:
+            new_header = header + '_0'
+            matching_table.append([header, 0, 0, size, new_header])
+    # read sequences from fasta files and split them to chunks according to matching table
+    # open output and input files, use with statement to close files
+    fasta_dict = read_single_fasta_to_dictionary(open(fasta_file, 'r'))
+    with open(fasta_file_split, 'w') as fh_out:
+        for header in fasta_dict:
+            matching_table_part = [x for x in matching_table if x[0] == header]
+            for header2, i, start, end, new_header in matching_table_part:
+                fh_out.write('>' + new_header + '\n')
+                fh_out.write(fasta_dict[header][start:end] + '\n')
+    return fasta_file_split, matching_table
+
+
+def get_new_header_and_coordinates(header, start, end, matching_table):
+    """
+    Get new header and coordinates for sequence
+    :param header:
+    :param start:
+    :param end:
+    :param matching_table:
+    :return:
+    new_header
+    new_start
+    new_end
+    """
+    matching_table_part = [x for x in matching_table if x[0] == header]
+    new_coords = []
+    for chunk in matching_table_part:
+        if chunk[2] <= start < chunk[3]:
+            new_header = chunk[4]
+            new_start = start - chunk[2]
+            new_end = end - chunk[2]
+            new_sequence_length = chunk[3] - chunk[2]
+            new_coords.append([new_header, new_start, new_end, new_sequence_length])
+    return new_coords
+
+
+def get_original_header_and_coordinates(new_header, new_start, new_end, matching_table):
+    """
+    Get original header and coordinates for sequence
+    :param new_header:
+    :param new_start:
+    :param new_end:
+    :param matching_table:
+    :return:
+    original_header
+    original_start
+    original_end
+    """
+    matching_table_part = [x for x in matching_table if x[4] == new_header]
+    ori_header = matching_table_part[0][0]
+    start = matching_table_part[0][2]
+    ori_start = new_start + start
+    ori_end = new_end + start
+    return ori_header, ori_start, ori_end
+
+
+# recalculate gff3 coordinates, use gff3_feature class
+def recalculate_gff3_coordinates(gff3_file, matching_table):
+    """
+    Recalculate gff3 coordinates, use gff3_feature class
+    :param gff3_file:
+    :param matching_table:
+    :return:
+    gff3_file_recalculated
+    """
+    gff3_file_recalculated = tempfile.NamedTemporaryFile(delete=False).name
+
+    with open(gff3_file, 'r') as fh_in:
+        with open(gff3_file_recalculated, 'w') as fh_out:
+            for line in fh_in:
+                if line[0] == '#':
+                    fh_out.write(line)
+                else:
+                    feature = Gff3Feature(line)
+                    new_coords = get_new_header_and_coordinates(
+                        feature.header, feature.start, feature.end, matching_table
+                        )
+                    for new_header, new_start, new_end, sequence_length in new_coords:
+                        if new_start >= 1 and new_end <= sequence_length:
+                            feature.header = new_header
+                            feature.start = new_start
+                            feature.end = new_end
+                            fh_out.write(str(feature))
+    return gff3_file_recalculated
+
+
+# recalculate gff3 back to original coordinates, use gff3_feature class
+def recalculate_gff3_back_to_original_coordinates(gff3_file, matching_table):
+    """
+    Recalculate gff3 back to original coordinates, use gff3_feature class
+    :param gff3_file:
+    :param matching_table:
+    :return:
+    gff3_file_recalculated
+    """
+    gff3_file_recalculated = tempfile.NamedTemporaryFile(delete=False).name
+    with open(gff3_file, 'r') as fh_in:
+        with open(gff3_file_recalculated, 'w') as fh_out:
+            for line in fh_in:
+                if line[0] == '#':
+                    fh_out.write(line)
+                else:
+                    feature = Gff3Feature(line)
+                    ori_header, ori_start, ori_end = get_original_header_and_coordinates(
+                        feature.header, feature.start, feature.end, matching_table
+                        )
+                    feature.header = ori_header
+                    feature.start = ori_start
+                    feature.end = ori_end
+                    fh_out.write(str(feature))
+    return gff3_file_recalculated
+
+
+def get_feature_attributes(line):
+    """
+    Get attributes as dictionary from gff3 list
+    :param line:
+    :return:
+    attributes_dict
+    """
+    attributes_dict = {}
+    for item in line[8].split(';'):
+        if item.strip():
+            key, value = item.strip().split('=')
+            attributes_dict[key] = value
+    return attributes_dict
+
+
+def get_unique_features(gff3_file):
+    """
+    return list of ID of non-ovelaping features.
+
+    :param gff3_file:
+    :return:
+    duplicated_features
+    """
+    good_id = []
+    feature_list = []
+    with open(gff3_file, 'r') as fh:
+        for line in fh:
+            if line[0] == '#':
+                continue
+            feature = Gff3Feature(line)
+            if feature.type != 'transposable_element':
+                continue
+            feature_list.append(feature)  # sort by start position and header
+    feature_list.sort(key=lambda x: (x.start, x.header))
+    i = 0
+    while i < len(feature_list) - 1:
+        ch = feature_list[i].header == feature_list[i + 1].header
+        if not ch:
+            good_id.append(feature_list[i].attributes_dict['ID'])
+            i += 1
+            continue
+        if feature_list[i].identical_region(feature_list[i + 1]):
+            # identical position
+            good_id.append(feature_list[i].attributes_dict['ID'])
+            i += 2
+            continue
+        if feature_list[i].overlap(feature_list[i + 1]):
+            # overlap
+            if feature_list[i] > feature_list[i + 1]:
+                good_id.append(feature_list[i].attributes_dict['ID'])
+                i += 2
+                continue
+            else:
+                good_id.append(feature_list[i + 1].attributes_dict['ID'])
+                i += 2
+                continue
+        else:
+            good_id.append(feature_list[i].attributes_dict['ID'])
+            i += 1
+    if i == len(feature_list) - 1:
+        good_id.append(feature_list[i].attributes_dict['ID'])
+    return good_id
+
+
+def filter_gff3_file(gff3_file, good_id, output_file):
+    """
+    Filter gff3 file by good_id
+    :param gff3_file:
+    :param good_id:
+    :param output_file:
+    :return:
+    filtered_gff3_file
+    """
+
+    with open(gff3_file, 'r') as fh, open(output_file, 'w') as fout:
+        for line in fh:
+            if line[0] == '#':
+                fout.write(line)
+            else:
+                feature = Gff3Feature(line)
+                if ('ID' in feature.attributes_dict and
+                        feature.attributes_dict['ID'] in good_id):
+                    fout.write(line)
+                    continue
+                if 'Parent' in feature.attributes_dict:
+                    if feature.attributes_dict['Parent'] in good_id:
+                        fout.write(line)
+                        continue
+
+
 def main():
     """
     Main function
@@ -119,10 +466,18 @@
     args = get_arguments()
     # locate directory of current script
     tool_path = os.path.dirname(os.path.realpath(__file__))
-    fasta_seq_size = read_fasta_sequence_size(args.reference_sequence)
+    # split fasta file to chunks
+    fasta_file_split, matching_table = split_fasta_to_chunks(
+        args.reference_sequence, chunk_size=args.max_chunk_size
+        )
+    # recalculate gff3 coordinates
+    gff3_file_recalculated = recalculate_gff3_coordinates(args.gff3, matching_table)
+
+    fasta_seq_size = read_fasta_sequence_size(fasta_file_split)
     total_size = sum(fasta_seq_size.values())
     number_of_sequences = len(fasta_seq_size)
     if total_size > args.max_chunk_size and number_of_sequences > 1:
+        print('running analysis on chunks of ~ {} Mb'.format(args.max_chunk_size))
         # sort dictionary by values
         seq_id_size_sorted = [i[0] for i in sorted(
             fasta_seq_size.items(), key=lambda x: int(x[1]), reverse=True
@@ -137,14 +492,14 @@
         seq_id_file_handle_dict = dict(zip(seq_id_size_sorted, cycle(file_handles)))

         # write sequences to temporary files
-        with open(args.reference_sequence, 'r') as f:
+        with open(fasta_file_split, 'r') as f:
             for line in f:
                 if line[0] == '>':
                     header = line.strip().split(' ')[0][1:]
-                    print(header)
                     seq_id_file_handle_dict[header].write(line)
                 else:
                     seq_id_file_handle_dict[header].write(line)
+        os.remove(fasta_file_split)
         # close file handles
         for file_handle in file_handles:
             file_handle.close()
@@ -156,7 +511,7 @@
         # make dictionary seq_id_sorted as keys and values as file handles
         seq_id_file_handle_dict = dict(zip(seq_id_size_sorted, cycle(file_handles)))
         # write gff lines to chunks
-        with open(args.gff3, 'r') as f:
+        with open(gff3_file_recalculated, 'r') as f:
             for line in f:
                 if line[0] == '#':
                     continue
@@ -166,7 +521,7 @@
         # close file handles
         for file_handle in file_handles:
             file_handle.close()
-
+        os.remove(gff3_file_recalculated)
         # run retrotransposon detection on each temporary file
        output_files = make_temp_files(number_of_temp_files)
         for i in range(number_of_temp_files):
@@ -177,7 +532,7 @@
                 str(args.max_missing_domains), '-L', str(args.min_relative_length)]
                 )

-        # remove all temporary input files
+        #remove all temporary input files
         for temp_file in temp_files_fasta + temp_files_gff:
             os.remove(temp_file)

@@ -195,6 +550,26 @@
             # remove parsed temporary statistics files
             for file in stat_files:
                 os.remove(file)
+        elif suffix == '.gff3':
+            tmp_gff_unfiltered = tempfile.NamedTemporaryFile(delete=False).name
+            with open(tmp_gff_unfiltered, 'w') as f:
+                for i in range(number_of_temp_files):
+                    tmp_gff = recalculate_gff3_back_to_original_coordinates(
+                        output_files[i] + suffix, matching_table
+                        )
+                    # remove temporary gff3 file
+                    os.remove(output_files[i] + suffix)
+                    with open(tmp_gff, 'r') as f_tmp:
+                        for line in f_tmp:
+                            f.write(line)
+                    os.remove(tmp_gff)
+            # filter overlapping features
+            good_id = get_unique_features(tmp_gff_unfiltered)
+            filter_gff3_file(
+                tmp_gff_unfiltered, good_id, args.output + suffix
+                )
+            # remove temporary gff3 file
+            os.remove(tmp_gff_unfiltered)
         else:
             with open(args.output + suffix, 'w') as f:
                 for i in range(number_of_temp_files):
@@ -204,10 +579,11 @@
                         for line in g:
                             f.write(line)
                         # remove parsed temporary output files
-                        os.remove(output_files[i])
+                        os.remove(output_files[i] + suffix)
             except FileNotFoundError:
                 pass
     else:
+        print('running analysis on whole input')
         # no need to split sequences into chunks
         subprocess.check_call(
             [f'{tool_path}/detect_putative_ltr.R', '-s', args.reference_sequence, '-g',
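The wrapper changes above replace the simple per-sequence split with overlapping chunks: split_fasta_to_chunks() records every chunk as a matching_table row of the form [original_header, chunk_index, start, end, new_header], the DANTE GFF3 is shifted into chunk coordinates before analysis, and the per-chunk results are shifted back and de-duplicated (of two overlapping transposable_element features, the wider one is kept). A simplified sketch of that coordinate bookkeeping, assuming an invented two-chunk table; this is illustrative only, not a copy of the wrapper code:

```python
# matching_table rows follow the wrapper's layout: [header, chunk_index, start, end, new_header].
# The coordinates below are made up for the example.
matching_table = [
    ["chr1", 0, 0,          10_100_000, "chr1_0"],   # chunk 0, padded by the overlap
    ["chr1", 1, 10_000_000, 20_000_000, "chr1_1"],
]

def to_original(new_header, new_start, new_end, table):
    """Map chunk-local coordinates back to the original sequence."""
    row = next(r for r in table if r[4] == new_header)
    offset = row[2]                      # where the chunk starts in the original sequence
    return row[0], new_start + offset, new_end + offset

print(to_original("chr1_1", 1_500, 6_200, matching_table))
# ('chr1', 10001500, 10006200)
```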
--- a/tests.sh Thu Jul 21 08:23:15 2022 +0000
+++ b/tests.sh Thu Aug 11 07:29:06 2022 +0000
@@ -41,6 +41,10 @@
 -S 10000000
 cat tmp/test_output5_statistics.csv

-echo "Running tests 4, filtering gff"
+echo "Running tests 6, filtering gff"
 ./clean_ltr.R -g tmp/test_output5.gff3 -s test_data/sample_genome.fasta \
 -o tmp/test_output6 -c $NCPU_TO_USE
+
+/detect_putative_ltr_wrapper.py -s test_data/sample_genome_part.fasta \
+-g test_data/sample_DANTE_part.gff3 -o tmp/test_output7 -c $NCPU_TO_USE \
+-S 10000000 -M 2
\ No newline at end of file