Mercurial > repos > petr-novak > dante_ltr
comparison extract_putative_ltr.R @ 8:9de392f2fc02 draft
"planemo upload commit d6433b48c9bae079edb06364147f19500501c986"
author | petr-novak |
---|---|
date | Tue, 28 Jun 2022 12:33:22 +0000 |
parents | c33d6583e548 |
children | 1aa578e6c8b3 |
comparison
equal
deleted
inserted
replaced
7:c33d6583e548 | 8:9de392f2fc02 |
---|---|
15 help = "output file path and prefix", default = NULL), | 15 help = "output file path and prefix", default = NULL), |
16 make_option(c("-c", "--cpu"), type = "integer", default = 5, | 16 make_option(c("-c", "--cpu"), type = "integer", default = 5, |
17 help = "Number of cpu to use [default %default]", metavar = "number"), | 17 help = "Number of cpu to use [default %default]", metavar = "number"), |
18 make_option(c("-M", "--max_missing_domains"), type = "integer", default = 0, | 18 make_option(c("-M", "--max_missing_domains"), type = "integer", default = 0, |
19 help = "Maximum number of missing domains is retrotransposon [default %default]", | 19 help = "Maximum number of missing domains is retrotransposon [default %default]", |
20 metavar = "number"), | |
21 make_option(c("-L", "--min_relative_length"), type = "numeric", default = 0.6, | |
22 help = "Minimum relative length of protein domain to be considered for retrostransposon detection [default %default]", | |
20 metavar = "number") | 23 metavar = "number") |
21 | |
22 | 24 |
23 ) | 25 ) |
24 description <- paste(strwrap("")) | 26 description <- paste(strwrap("")) |
25 | 27 |
26 epilogue <- "" | 28 epilogue <- "" |
69 if (FALSE) { | 71 if (FALSE) { |
70 g <- rtracklayer::import("/mnt/raid/454_data/cuscuta/Ceuropea_assembly_v4/0_final_asm_hifiasm+longstitch/repeat_annotation/DANTE_on_CEUR_filtered_short_names.gff3") | 72 g <- rtracklayer::import("/mnt/raid/454_data/cuscuta/Ceuropea_assembly_v4/0_final_asm_hifiasm+longstitch/repeat_annotation/DANTE_on_CEUR_filtered_short_names.gff3") |
71 s <- readDNAStringSet("/mnt/raid/454_data/cuscuta/Ceuropea_assembly_v4/0_final_asm_hifiasm+longstitch/asm.bp.p.ctg_scaffolds.short_names.fa") | 73 s <- readDNAStringSet("/mnt/raid/454_data/cuscuta/Ceuropea_assembly_v4/0_final_asm_hifiasm+longstitch/asm.bp.p.ctg_scaffolds.short_names.fa") |
72 lineage_info <- read.table("/mnt/raid/users/petr/workspace/ltr_finder_test/lineage_domain_order.csv", sep = "\t", header = TRUE, as.is = TRUE) | 74 lineage_info <- read.table("/mnt/raid/users/petr/workspace/ltr_finder_test/lineage_domain_order.csv", sep = "\t", header = TRUE, as.is = TRUE) |
73 | 75 |
76 g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data/sample_DANTE_unfiltered.gff3") | |
74 g <- rtracklayer::import("/mnt/raid/users/petr/workspace/ltr_finder_test/test_data/DANTE_filtered_part.gff3") | 77 g <- rtracklayer::import("/mnt/raid/users/petr/workspace/ltr_finder_test/test_data/DANTE_filtered_part.gff3") |
75 s <- readDNAStringSet("/mnt/raid/users/petr/workspace/ltr_finder_test/test_data/Rbp_part.fa") | 78 s <- readDNAStringSet("/mnt/raid/users/petr/workspace/ltr_finder_test/test_data/Rbp_part.fa") |
76 | 79 |
77 g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data | 80 g <- rtracklayer::import("/mnt/raid/users/petr/workspace/dante_ltr/test_data |
78 /DANTE_Vfaba_chr5.gff3") | 81 /DANTE_Vfaba_chr5.gff3") |
120 names(lineage_ltr_mean_length) <- ln | 123 names(lineage_ltr_mean_length) <- ln |
121 lineage_domains_sequence <- unlist(mapply(function(d,l) { | 124 lineage_domains_sequence <- unlist(mapply(function(d,l) { |
122 paste(strsplit(d, " ")[[1]], ":", l, sep = "") | 125 paste(strsplit(d, " ")[[1]], ":", l, sep = "") |
123 }, d = lineage_domain, l = names(lineage_domain))) | 126 }, d = lineage_domain, l = names(lineage_domain))) |
124 | 127 |
128 # filter g gff3 | |
129 g <- dante_filtering(g, Relative_Length = opt$min_relative_length) # default | |
125 | 130 |
126 seqlengths(g) <- seqlengths(s)[names(seqlengths(g))] | 131 seqlengths(g) <- seqlengths(s)[names(seqlengths(g))] |
127 g <- add_coordinates_of_closest_neighbor(g) | 132 g <- add_coordinates_of_closest_neighbor(g) |
128 | 133 |
129 # add info about domain order: | 134 # add info about domain order: |
154 IRanges(start = sapply(gcl_alt, function(x) min(x$start)), | 159 IRanges(start = sapply(gcl_alt, function(x) min(x$start)), |
155 end = sapply(gcl_alt, function(x) max(x$end))) | 160 end = sapply(gcl_alt, function(x) max(x$end))) |
156 ) | 161 ) |
157 g$Ndomains_in_cluster <- count_occurences_for_each_element(g$Cluster) | 162 g$Ndomains_in_cluster <- count_occurences_for_each_element(g$Cluster) |
158 g$Parent <- paste0("TE_partial_", g$Cluster) | 163 g$Parent <- paste0("TE_partial_", g$Cluster) |
159 g$Rank="D" | 164 g$Rank <- "D" |
160 | 165 |
161 # keep only partial TE with more than one domain | 166 # keep only partial TE with more than one domain |
162 TE_partial_with_more_than_one_domain <- TE_partial[TE_partial$Ndomains > 1] | 167 TE_partial_with_more_than_one_domain <- TE_partial[TE_partial$Ndomains > 1] |
163 g_with_more_than_one_domain <- g[as.vector(g$Ndomains_in_cluster > 1)] | 168 g_with_more_than_one_domain <- g[as.vector(g$Ndomains_in_cluster > 1)] |
164 | 169 |