Mercurial > repos > petr-novak > dante_ltr
comparison clean_ltr.R @ 4:93d35ae65e1b draft
"planemo upload commit 57a4f4a749b60b4e1d992dc3a879add7bb4bb56b"
author | petr-novak |
---|---|
date | Mon, 09 May 2022 12:37:31 +0000 |
parents | 6ae4a341d1f3 |
children | b91ca438a1cb |
comparison
equal
deleted
inserted
replaced
3:6ae4a341d1f3 | 4:93d35ae65e1b |
---|---|
74 cat("reading gff...") | 74 cat("reading gff...") |
75 g <- rtracklayer::import(opt$gff3, format = 'gff3') # DANTE gff3 | 75 g <- rtracklayer::import(opt$gff3, format = 'gff3') # DANTE gff3 |
76 cat("done\n") | 76 cat("done\n") |
77 # testing | 77 # testing |
78 if (FALSE) { | 78 if (FALSE) { |
79 g <- rtracklayer::import("./test_data/big_test_data/dante_ltr_unfiltered_t.cacao.gff3") | |
80 s <- readDNAStringSet("./test_data/big_test_data/T_cacao_chromosomes.fasta") | |
81 | |
79 g <- rtracklayer::import("./test_data/sample_ltr_annotation.gff3") | 82 g <- rtracklayer::import("./test_data/sample_ltr_annotation.gff3") |
80 s <- readDNAStringSet("./test_data/sample_genome.fasta") | 83 s <- readDNAStringSet("./test_data/sample_genome.fasta") |
81 | 84 |
82 g <- rtracklayer::import("./test_data/DANTE_LTR_Vfaba_chr5.gff3") | 85 g <- rtracklayer::import("./test_data/DANTE_LTR_Vfaba_chr5.gff3") |
83 s <- readDNAStringSet("./test_data/211010_Vfaba_chr5.fasta") | 86 s <- readDNAStringSet("./test_data/211010_Vfaba_chr5.fasta") |
86 lineage_info <- read.table("databases/lineage_domain_order.csv", sep = "\t", header = | 89 lineage_info <- read.table("databases/lineage_domain_order.csv", sep = "\t", header = |
87 TRUE, as.is = TRUE) | 90 TRUE, as.is = TRUE) |
88 source("./R/ltr_utils.R") | 91 source("./R/ltr_utils.R") |
89 } | 92 } |
90 | 93 |
94 ## ID in g must be unique - this could be a problem if gff is concatenated from multiple files! | |
95 ## id ID is renamed - rename parent to! | |
96 ## add chromosom index to disctinguish same IDs | |
97 suffix <- as.numeric(seqnames(g)) | |
98 g$ID <- ifelse(is.na(g$ID), NA, paste0(g$ID,"_", suffix)) | |
99 g$Parent <- ifelse(is.na(g$Parent), NA, paste0(g$Parent,"_", suffix)) | |
91 | 100 |
92 # get te sequence based on rank | 101 # get te sequence based on rank |
93 | 102 |
94 # evaluate best TE - DLTP grou | 103 # evaluate best TE - DLTP grou |
95 s_te <- get_te_sequences(g, s) # split by 'element quality' | 104 s_te <- get_te_sequences(g, s) # split by 'element quality' |
99 TE_DLTP_info <- analyze_TE(s_te$DLTP, word_size = word_size, ncpus = ncpus) | 108 TE_DLTP_info <- analyze_TE(s_te$DLTP, word_size = word_size, ncpus = ncpus) |
100 | 109 |
101 # TE rank 2: | 110 # TE rank 2: |
102 TE_DLT_plus_DLP_info <- analyze_TE(c(s_te$DLP, s_te$DLT), word_size = word_size, ncpus | 111 TE_DLT_plus_DLP_info <- analyze_TE(c(s_te$DLP, s_te$DLT), word_size = word_size, ncpus |
103 = ncpus) | 112 = ncpus) |
104 TE_DLT_plus_DLP_info_DLTP_verified <- compare_TE_datasets(c(s_te$DLT, s_te$DLP), ncpus | 113 TE_DLT_plus_DLP_info_DLTP_verified <- compare_TE_datasets(c(s_te$DLT, s_te$DLP), ncpus |
105 = ncpus, | 114 = ncpus,TE_DLTP_info$seqs_representative, word_size = word_size |
106 TE_DLTP_info$seqs_representative, | 115 ) |
107 word_size = word_size | |
108 ) | |
109 | 116 |
110 TE_DLT_plus_DLP_info_multiplicity <- verify_based_on_multiplicity(TE_DLT_plus_DLP_info) | 117 TE_DLT_plus_DLP_info_multiplicity <- verify_based_on_multiplicity(TE_DLT_plus_DLP_info) |
111 # create additional library from rank 2 verified by multiplicity | 118 # create additional library from rank 2 verified by multiplicity |
112 id_for_additional_library <- setdiff( | 119 id_for_additional_library <- setdiff( |
113 TE_DLT_plus_DLP_info_multiplicity$id_ok_mp_verified, | 120 TE_DLT_plus_DLP_info_multiplicity$id_ok_mp_verified, |
131 }else { | 138 }else { |
132 seq_representative <- TE_DLTP_info$seqs_representative | 139 seq_representative <- TE_DLTP_info$seqs_representative |
133 } | 140 } |
134 } | 141 } |
135 | 142 |
136 # TE rank 3 | 143 # TE rank 3 - verify agains good DLTP |
137 TE_DL_info_DLTP_verified <- compare_TE_datasets( | 144 TE_DL_info_DLTP_verified <- compare_TE_datasets( |
138 s_te$DL, | 145 s_te$DL, |
139 TE_DLTP_info$seqs_representative, min_coverage = 0.98, | 146 TE_DLTP_info$seqs_representative, min_coverage = 0.95, |
140 ncpus = ncpus | 147 ncpus = ncpus, word_size = word_size |
141 ) | 148 ) |
142 | 149 |
143 | 150 |
144 R <- seq_diversity(seq_representative)$richness | 151 R <- seq_diversity(seq_representative)$richness |
145 SI <- seq_diversity(seq_representative)$shannon_index | 152 SI <- seq_diversity(seq_representative)$shannon_index |