comparison extract_putative_ltr.R @ 8:9de392f2fc02 draft

"planemo upload commit d6433b48c9bae079edb06364147f19500501c986"
author petr-novak
date Tue, 28 Jun 2022 12:33:22 +0000
parents c33d6583e548
children 1aa578e6c8b3
comparison
equal deleted inserted replaced
7:c33d6583e548 8:9de392f2fc02
15 help = "output file path and prefix", default = NULL), 15 help = "output file path and prefix", default = NULL),
16 make_option(c("-c", "--cpu"), type = "integer", default = 5, 16 make_option(c("-c", "--cpu"), type = "integer", default = 5,
17 help = "Number of cpu to use [default %default]", metavar = "number"), 17 help = "Number of cpu to use [default %default]", metavar = "number"),
18 make_option(c("-M", "--max_missing_domains"), type = "integer", default = 0, 18 make_option(c("-M", "--max_missing_domains"), type = "integer", default = 0,
19 help = "Maximum number of missing domains is retrotransposon [default %default]", 19 help = "Maximum number of missing domains is retrotransposon [default %default]",
20 metavar = "number"),
21 make_option(c("-L", "--min_relative_length"), type = "numeric", default = 0.6,
22 help = "Minimum relative length of protein domain to be considered for retrostransposon detection [default %default]",
20 metavar = "number") 23 metavar = "number")
21
22 24
23 ) 25 )
24 description <- paste(strwrap("")) 26 description <- paste(strwrap(""))
25 27
26 epilogue <- "" 28 epilogue <- ""
69 if (FALSE) { 71 if (FALSE) {
70 g <- rtracklayer::import("/mnt/raid/454_data/cuscuta/Ceuropea_assembly_v4/0_final_asm_hifiasm+longstitch/repeat_annotation/DANTE_on_CEUR_filtered_short_names.gff3") 72 g <- rtracklayer::import("/mnt/raid/454_data/cuscuta/Ceuropea_assembly_v4/0_final_asm_hifiasm+longstitch/repeat_annotation/DANTE_on_CEUR_filtered_short_names.gff3")
71 s <- readDNAStringSet("/mnt/raid/454_data/cuscuta/Ceuropea_assembly_v4/0_final_asm_hifiasm+longstitch/asm.bp.p.ctg_scaffolds.short_names.fa") 73 s <- readDNAStringSet("/mnt/raid/454_data/cuscuta/Ceuropea_assembly_v4/0_final_asm_hifiasm+longstitch/asm.bp.p.ctg_scaffolds.short_names.fa")
72 lineage_info <- read.table("/mnt/raid/users/petr/workspace/ltr_finder_test/lineage_domain_order.csv", sep = "\t", header = TRUE, as.is = TRUE) 74 lineage_info <- read.table("/mnt/raid/users/petr/workspace/ltr_finder_test/lineage_domain_order.csv", sep = "\t", header = TRUE, as.is = TRUE)
73 75
76 g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data/sample_DANTE_unfiltered.gff3")
74 g <- rtracklayer::import("/mnt/raid/users/petr/workspace/ltr_finder_test/test_data/DANTE_filtered_part.gff3") 77 g <- rtracklayer::import("/mnt/raid/users/petr/workspace/ltr_finder_test/test_data/DANTE_filtered_part.gff3")
75 s <- readDNAStringSet("/mnt/raid/users/petr/workspace/ltr_finder_test/test_data/Rbp_part.fa") 78 s <- readDNAStringSet("/mnt/raid/users/petr/workspace/ltr_finder_test/test_data/Rbp_part.fa")
76 79
77 g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data 80 g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data
78 /DANTE_Vfaba_chr5.gff3") 81 /DANTE_Vfaba_chr5.gff3")
120 names(lineage_ltr_mean_length) <- ln 123 names(lineage_ltr_mean_length) <- ln
121 lineage_domains_sequence <- unlist(mapply(function(d,l) { 124 lineage_domains_sequence <- unlist(mapply(function(d,l) {
122 paste(strsplit(d, " ")[[1]], ":", l, sep = "") 125 paste(strsplit(d, " ")[[1]], ":", l, sep = "")
123 }, d = lineage_domain, l = names(lineage_domain))) 126 }, d = lineage_domain, l = names(lineage_domain)))
124 127
128 # filter g gff3
129 g <- dante_filtering(g, Relative_Length = opt$min_relative_length) # default
125 130
126 seqlengths(g) <- seqlengths(s)[names(seqlengths(g))] 131 seqlengths(g) <- seqlengths(s)[names(seqlengths(g))]
127 g <- add_coordinates_of_closest_neighbor(g) 132 g <- add_coordinates_of_closest_neighbor(g)
128 133
129 # add info about domain order: 134 # add info about domain order:
154 IRanges(start = sapply(gcl_alt, function(x) min(x$start)), 159 IRanges(start = sapply(gcl_alt, function(x) min(x$start)),
155 end = sapply(gcl_alt, function(x) max(x$end))) 160 end = sapply(gcl_alt, function(x) max(x$end)))
156 ) 161 )
157 g$Ndomains_in_cluster <- count_occurences_for_each_element(g$Cluster) 162 g$Ndomains_in_cluster <- count_occurences_for_each_element(g$Cluster)
158 g$Parent <- paste0("TE_partial_", g$Cluster) 163 g$Parent <- paste0("TE_partial_", g$Cluster)
159 g$Rank="D" 164 g$Rank <- "D"
160 165
161 # keep only partial TE with more than one domain 166 # keep only partial TE with more than one domain
162 TE_partial_with_more_than_one_domain <- TE_partial[TE_partial$Ndomains > 1] 167 TE_partial_with_more_than_one_domain <- TE_partial[TE_partial$Ndomains > 1]
163 g_with_more_than_one_domain <- g[as.vector(g$Ndomains_in_cluster > 1)] 168 g_with_more_than_one_domain <- g[as.vector(g$Ndomains_in_cluster > 1)]
164 169