comparison clean_ltr.R @ 0:7b0bbe7477c4 draft

"planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
author petr-novak
date Tue, 08 Mar 2022 13:24:33 +0000
parents
children 6ae4a341d1f3
comparison
equal deleted inserted replaced
-1:000000000000 0:7b0bbe7477c4
1 #!/usr/bin/env Rscript
2 initial_options <- commandArgs(trailingOnly = FALSE)
3 file_arg_name <- "--file="
4 script_name <- normalizePath(sub(file_arg_name, "", initial_options[grep
5 (file_arg_name,
6 initial_options)]))
7 script_dir <- dirname(script_name)
8 library(optparse)
9
10 parser <- OptionParser()
11 option_list <- list(
12 make_option(c("-g", "--gff3"), action = "store", type = "character",
13 help = "gff3 with LTR Transposable elements", default = NULL),
14 make_option(c("-s", "--reference_sequence"), action = "store", type = "character",
15 help = "reference sequence as fasta",
16 default = NULL),
17 make_option(c("-o", "--output"), action = "store", type = "character",
18 help = "output file prefix", default = NULL),
19 make_option(c("-c", "--cpu"), type =
20 "integer", default = 5, help = "Number of cpu to use [default %default]",
21 metavar = "number")
22
23 )
24 description <- paste(strwrap(""))
25
26 epilogue <- ""
27 parser <- OptionParser(option_list = option_list, epilogue = epilogue, description =
28 description, usage = "usage: %prog COMMAND [OPTIONS]")
29 opt <- parse_args(parser, args = commandArgs(TRUE))
30
31
32 # load packages
33 suppressPackageStartupMessages({ library(rtracklayer)
34 library(Biostrings)
35 library(BSgenome)
36 library(parallel)
37 })
38
39 # CONFIGURATION
40 # load configuration files and functions:
41 lineage_file <- paste0(script_dir, "/databases/lineage_domain_order.csv")
42 ltr_utils_r <- paste0(script_dir, "/R/ltr_utils.R")
43 if (file.exists(lineage_file)) {
44 lineage_info <- read.table(lineage_file, sep = "\t",
45 header = TRUE,
46 as.is = TRUE)
47 source(ltr_utils_r)
48 }else {
49 lineage_file <- paste0(script_dir, "/.
50 ./share/dante_ltr/databases/lineage_domain_order.csv")
51 ltr_utils_r <- paste0(script_dir, "/.
52 ./share/dante_ltr/R/ltr_utils.R")
53 if (file.exists(lineage_file)) {
54 lineage_info <- read.table(lineage_file, sep = "\t",
55 header = TRUE,
56 as.is = TRUE)
57 source(ltr_utils_r)
58 }else {
59 (stop("configuration files not found"))
60 }
61 }
62
63
64 ncpus <- opt$cpu
65
66
67 # load data
68 cat("reading fasta...")
69 s <- readDNAStringSet(opt$reference_sequence) # genome assembly
70 cat("done\n")
71 outfile <- opt$output
72 # clean sequence names:
73 names(s) <- gsub(" .+", "", names(s))
74 cat("reading gff...")
75 g <- rtracklayer::import(opt$gff3, format = 'gff3') # DANTE gff3
76 cat("done\n")
77 # testing
78 if (FALSE) {
79 g <- rtracklayer::import("./test_data/sample_ltr_annotation.gff3")
80 s <- readDNAStringSet("./test_data/sample_genome.fasta")
81
82 g <- rtracklayer::import("./test_data/DANTE_LTR_Vfaba_chr5.gff3")
83 s <- readDNAStringSet("./test_data/211010_Vfaba_chr5.fasta")
84 names(s) <- gsub(" .+", "", names(s))
85 ncpus <- 10
86 lineage_info <- read.table("databases/lineage_domain_order.csv", sep = "\t", header =
87 TRUE, as.is = TRUE)
88 source("./R/ltr_utils.R")
89 }
90
91
92 # get te sequence based on rank
93
94 # evaluate best TE - DLTP grou
95 s_te <- get_te_sequences(g, s) # split by 'element quality'
96 # best quality - split by lineage
97 word_size <- 15
98 # best TE
99 TE_DLTP_info <- analyze_TE(s_te$DLTP, word_size = word_size, ncpus = ncpus)
100
101 # TE rank 2:
102 TE_DLT_plus_DLP_info <- analyze_TE(c(s_te$DLP, s_te$DLT), word_size = word_size, ncpus
103 = ncpus)
104 TE_DLT_plus_DLP_info_DLTP_verified <- compare_TE_datasets(c(s_te$DLT, s_te$DLP), ncpus
105 = ncpus,
106 TE_DLTP_info$seqs_representative,
107 word_size = word_size
108 )
109
110 TE_DLT_plus_DLP_info_multiplicity <- verify_based_on_multiplicity(TE_DLT_plus_DLP_info)
111
112 # create additional library from rank 2 verified by multiplicity
113 id_for_additional_library <- setdiff(
114 TE_DLT_plus_DLP_info_multiplicity$id_ok_mp_verified,
115 TE_DLT_plus_DLP_info_DLTP_verified$id_ok_verified)
116
117 if (length(id_for_additional_library) > 1) {
118 seqs_for_additional_library <- c(s_te$DLP, s_te$DLT)[names(c(s_te$DLP, s_te$DLT)) %in%
119 id_for_additional_library]
120 seqs_additional_info <- analyze_TE(seqs_for_additional_library, word_size =
121 word_size, ncpus = ncpus)
122 seq_representative <- c(
123 TE_DLTP_info$seqs_representative,
124 seqs_additional_info$seqs_representative
125 )
126 }else {
127 if (length(id_for_additional_library) == 1) {
128 seq_representative <- c(
129 TE_DLTP_info$seqs_representative,
130 c(s_te$DLP, s_te$DLT)[names(c(s_te$DLP, s_te$DLT)) %in% id_for_additional_library]
131 )
132 }else {
133 seq_representative <- TE_DLTP_info$seqs_representative
134 }
135 }
136
137
138 # TE rank 3
139 TE_DL_info_DLTP_verified <- compare_TE_datasets(
140 s_te$DL,
141 TE_DLTP_info$seqs_representative, min_coverage = 0.98,
142 ncpus = ncpus
143 )
144
145
146 R <- seq_diversity(seq_representative)$richness
147 SI <- seq_diversity(seq_representative)$shannon_index
148
149 # final RM library:
150 seq_representative_no_ssr <- seq_representative[R > 20]
151
152 ID <- g$ID[g$type == "transposable_element"]
153 names(ID) <- paste0(seqnames(g), "_",
154 start(g), "_",
155 end(g), "#",
156 g$Final_Classification
157 )[g$type %in% "transposable_element"]
158
159 # create clean gff3
160 id_of_good_te <- unique(c(
161 TE_DLTP_info$te_conflict_info$ok,
162 TE_DLT_plus_DLP_info_DLTP_verified$id_ok_verified,
163 TE_DLT_plus_DLP_info_multiplicity$id_ok_mp_verified,
164 TE_DL_info_DLTP_verified$id_ok_verified)
165 )
166
167 c1 <- g$ID %in% ID[id_of_good_te]
168 c2 <- sapply(g$Parent, function(x)ifelse(length(x) == 0, "", x)) %in% ID[id_of_good_te]
169
170 gff_out <- g[c1 | c2]
171
172
173 writeXStringSet(seq_representative, paste0(opt$output, "_RM_lib.fasta"))
174 export(gff_out, paste0(opt$output, "_clean.gff3"), format = "gff3")
175