comparison clean_ltr.R @ 4:93d35ae65e1b draft

"planemo upload commit 57a4f4a749b60b4e1d992dc3a879add7bb4bb56b"
author petr-novak
date Mon, 09 May 2022 12:37:31 +0000
parents 6ae4a341d1f3
children b91ca438a1cb
comparison
equal deleted inserted replaced
3:6ae4a341d1f3 4:93d35ae65e1b
74 cat("reading gff...") 74 cat("reading gff...")
75 g <- rtracklayer::import(opt$gff3, format = 'gff3') # DANTE gff3 75 g <- rtracklayer::import(opt$gff3, format = 'gff3') # DANTE gff3
76 cat("done\n") 76 cat("done\n")
77 # testing 77 # testing
78 if (FALSE) { 78 if (FALSE) {
79 g <- rtracklayer::import("./test_data/big_test_data/dante_ltr_unfiltered_t.cacao.gff3")
80 s <- readDNAStringSet("./test_data/big_test_data/T_cacao_chromosomes.fasta")
81
79 g <- rtracklayer::import("./test_data/sample_ltr_annotation.gff3") 82 g <- rtracklayer::import("./test_data/sample_ltr_annotation.gff3")
80 s <- readDNAStringSet("./test_data/sample_genome.fasta") 83 s <- readDNAStringSet("./test_data/sample_genome.fasta")
81 84
82 g <- rtracklayer::import("./test_data/DANTE_LTR_Vfaba_chr5.gff3") 85 g <- rtracklayer::import("./test_data/DANTE_LTR_Vfaba_chr5.gff3")
83 s <- readDNAStringSet("./test_data/211010_Vfaba_chr5.fasta") 86 s <- readDNAStringSet("./test_data/211010_Vfaba_chr5.fasta")
86 lineage_info <- read.table("databases/lineage_domain_order.csv", sep = "\t", header = 89 lineage_info <- read.table("databases/lineage_domain_order.csv", sep = "\t", header =
87 TRUE, as.is = TRUE) 90 TRUE, as.is = TRUE)
88 source("./R/ltr_utils.R") 91 source("./R/ltr_utils.R")
89 } 92 }
90 93
94 ## ID in g must be unique - this could be a problem if gff is concatenated from multiple files!
95 ## id ID is renamed - rename parent to!
96 ## add chromosom index to disctinguish same IDs
97 suffix <- as.numeric(seqnames(g))
98 g$ID <- ifelse(is.na(g$ID), NA, paste0(g$ID,"_", suffix))
99 g$Parent <- ifelse(is.na(g$Parent), NA, paste0(g$Parent,"_", suffix))
91 100
92 # get te sequence based on rank 101 # get te sequence based on rank
93 102
94 # evaluate best TE - DLTP grou 103 # evaluate best TE - DLTP grou
95 s_te <- get_te_sequences(g, s) # split by 'element quality' 104 s_te <- get_te_sequences(g, s) # split by 'element quality'
99 TE_DLTP_info <- analyze_TE(s_te$DLTP, word_size = word_size, ncpus = ncpus) 108 TE_DLTP_info <- analyze_TE(s_te$DLTP, word_size = word_size, ncpus = ncpus)
100 109
101 # TE rank 2: 110 # TE rank 2:
102 TE_DLT_plus_DLP_info <- analyze_TE(c(s_te$DLP, s_te$DLT), word_size = word_size, ncpus 111 TE_DLT_plus_DLP_info <- analyze_TE(c(s_te$DLP, s_te$DLT), word_size = word_size, ncpus
103 = ncpus) 112 = ncpus)
104 TE_DLT_plus_DLP_info_DLTP_verified <- compare_TE_datasets(c(s_te$DLT, s_te$DLP), ncpus 113 TE_DLT_plus_DLP_info_DLTP_verified <- compare_TE_datasets(c(s_te$DLT, s_te$DLP), ncpus
105 = ncpus, 114 = ncpus,TE_DLTP_info$seqs_representative, word_size = word_size
106 TE_DLTP_info$seqs_representative, 115 )
107 word_size = word_size
108 )
109 116
110 TE_DLT_plus_DLP_info_multiplicity <- verify_based_on_multiplicity(TE_DLT_plus_DLP_info) 117 TE_DLT_plus_DLP_info_multiplicity <- verify_based_on_multiplicity(TE_DLT_plus_DLP_info)
111 # create additional library from rank 2 verified by multiplicity 118 # create additional library from rank 2 verified by multiplicity
112 id_for_additional_library <- setdiff( 119 id_for_additional_library <- setdiff(
113 TE_DLT_plus_DLP_info_multiplicity$id_ok_mp_verified, 120 TE_DLT_plus_DLP_info_multiplicity$id_ok_mp_verified,
131 }else { 138 }else {
132 seq_representative <- TE_DLTP_info$seqs_representative 139 seq_representative <- TE_DLTP_info$seqs_representative
133 } 140 }
134 } 141 }
135 142
136 # TE rank 3 143 # TE rank 3 - verify agains good DLTP
137 TE_DL_info_DLTP_verified <- compare_TE_datasets( 144 TE_DL_info_DLTP_verified <- compare_TE_datasets(
138 s_te$DL, 145 s_te$DL,
139 TE_DLTP_info$seqs_representative, min_coverage = 0.98, 146 TE_DLTP_info$seqs_representative, min_coverage = 0.95,
140 ncpus = ncpus 147 ncpus = ncpus, word_size = word_size
141 ) 148 )
142 149
143 150
144 R <- seq_diversity(seq_representative)$richness 151 R <- seq_diversity(seq_representative)$richness
145 SI <- seq_diversity(seq_representative)$shannon_index 152 SI <- seq_diversity(seq_representative)$shannon_index