comparison clean_ltr.R @ 7:c33d6583e548 draft

"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
author petr-novak
date Fri, 24 Jun 2022 14:19:48 +0000
parents b91ca438a1cb
children ff01d4263391
comparison
equal deleted inserted replaced
6:b91ca438a1cb 7:c33d6583e548
78 if (FALSE) { 78 if (FALSE) {
79 g <- rtracklayer::import("./test_data/big_test_data/dante_ltr_unfiltered_t.cacao.gff3") 79 g <- rtracklayer::import("./test_data/big_test_data/dante_ltr_unfiltered_t.cacao.gff3")
80 s <- readDNAStringSet("./test_data/big_test_data/T_cacao_chromosomes.fasta") 80 s <- readDNAStringSet("./test_data/big_test_data/T_cacao_chromosomes.fasta")
81 81
82 g <- rtracklayer::import("./test_data/sample_ltr_annotation.gff3") 82 g <- rtracklayer::import("./test_data/sample_ltr_annotation.gff3")
83 g <- rtracklayer::import("./test_data/sample_DANTE_LTR_annotation.gff3")
83 s <- readDNAStringSet("./test_data/sample_genome.fasta") 84 s <- readDNAStringSet("./test_data/sample_genome.fasta")
84 85
85 g <- rtracklayer::import("./test_data/DANTE_LTR_Vfaba_chr5.gff3") 86 g <- rtracklayer::import("./test_data/DANTE_LTR_Vfaba_chr5.gff3")
86 s <- readDNAStringSet("./test_data/211010_Vfaba_chr5.fasta") 87 s <- readDNAStringSet("./test_data/211010_Vfaba_chr5.fasta")
87 names(s) <- gsub(" .+", "", names(s)) 88 names(s) <- gsub(" .+", "", names(s))
98 g$ID <- ifelse(is.na(g$ID), NA, paste0(g$ID,"_", suffix)) 99 g$ID <- ifelse(is.na(g$ID), NA, paste0(g$ID,"_", suffix))
99 g$Parent <- ifelse(is.na(g$Parent), NA, paste0(g$Parent,"_", suffix)) 100 g$Parent <- ifelse(is.na(g$Parent), NA, paste0(g$Parent,"_", suffix))
100 101
101 # get te sequence based on rank 102 # get te sequence based on rank
102 103
104 # best quality - split by lineage
105 s_te <- get_te_sequences(g, s) # split by 'element quality'
103 # evaluate best TE - DLTP grou 106 # evaluate best TE - DLTP grou
104 s_te <- get_te_sequences(g, s) # split by 'element quality' 107
105 # best quality - split by lineage 108 # comparison parameters
106 word_size <- 15 109 word_size <- 11
110 task <- "blastn"
111 perc_identity <- 80
112
107 # best TE 113 # best TE
108 TE_DLTP_info <- analyze_TE(s_te$DLTP, word_size = word_size, ncpus = ncpus) 114 TE_DLTP_info <- analyze_TE(s_te$DLTP,
115 word_size = word_size,
116 ncpus = ncpus,
117 perc_identity = perc_identity,
118 task = task)
109 119
110 # TE rank 2: 120 # TE rank 2:
111 TE_DLT_plus_DLP_info <- analyze_TE(c(s_te$DLP, s_te$DLT), word_size = word_size, ncpus 121 TE_DLT_plus_DLP_info <- analyze_TE(c(s_te$DLP, s_te$DLT),
112 = ncpus) 122 word_size = word_size,
113 TE_DLT_plus_DLP_info_DLTP_verified <- compare_TE_datasets(c(s_te$DLT, s_te$DLP), ncpus 123 ncpus = ncpus,
114 = ncpus,TE_DLTP_info$seqs_representative, word_size = word_size 124 perc_identity = perc_identity,
115 ) 125 task = task)
126
127 TE_D_plus_DL_info <- analyze_TE(c(s_te$DL, s_te$D),
128 word_size = word_size,
129 ncpus = ncpus,
130 perc_identity = perc_identity,
131 task = task)
132
133 TE_DLT_plus_DLP_info_DLTP_verified <- compare_TE_datasets(
134 c(s_te$DLT, s_te$DLP),
135 ncpus = ncpus,
136 TE_DLTP_info$seqs_representative,
137 word_size = word_size,
138 perc_identity = perc_identity,
139 task = task)
116 140
117 TE_DLT_plus_DLP_info_multiplicity <- verify_based_on_multiplicity(TE_DLT_plus_DLP_info) 141 TE_DLT_plus_DLP_info_multiplicity <- verify_based_on_multiplicity(TE_DLT_plus_DLP_info)
118 # create additional library from rank 2 verified by multiplicity 142 TE_D_plus_DL_info_multiplicity <- verify_based_on_multiplicity(TE_D_plus_DL_info)
119 id_for_additional_library <- setdiff( 143
144 # create additional library from rank 2 verified by multiplicity and DLTP
145 id_for_additional_library <- union(
120 TE_DLT_plus_DLP_info_multiplicity$id_ok_mp_verified, 146 TE_DLT_plus_DLP_info_multiplicity$id_ok_mp_verified,
121 TE_DLT_plus_DLP_info_DLTP_verified$id_ok_verified) 147 TE_DLT_plus_DLP_info_DLTP_verified$id_ok_verified)
122 148
123 if (length(id_for_additional_library) > 1) { 149 if (length(id_for_additional_library) > 1) {
124 seqs_for_additional_library <- c(s_te$DLP, s_te$DLT)[names(c(s_te$DLP, s_te$DLT)) %in% 150 seqs_for_additional_library <- c(s_te$DLP, s_te$DLT)[names(c(s_te$DLP, s_te$DLT)) %in%
125 id_for_additional_library] 151 id_for_additional_library]
126 seqs_additional_info <- analyze_TE(seqs_for_additional_library, word_size = 152 seqs_additional_info <- analyze_TE(seqs_for_additional_library, word_size =
127 word_size, ncpus = ncpus) 153 word_size, ncpus = ncpus)
128 seq_representative <- c( 154 seqs_representative <- c(
129 TE_DLTP_info$seqs_representative, 155 TE_DLTP_info$seqs_representative,
130 seqs_additional_info$seqs_representative 156 seqs_additional_info$seqs_representative
131 ) 157 )
132 }else { 158 }else {
133 if (length(id_for_additional_library) == 1) { 159 if (length(id_for_additional_library) == 1) {
134 seq_representative <- c( 160 seqs_representative <- c(
135 TE_DLTP_info$seqs_representative, 161 TE_DLTP_info$seqs_representative,
136 c(s_te$DLP, s_te$DLT)[names(c(s_te$DLP, s_te$DLT)) %in% id_for_additional_library] 162 c(s_te$DLP, s_te$DLT)[names(c(s_te$DLP, s_te$DLT)) %in% id_for_additional_library]
137 ) 163 )
138 }else { 164 }else {
139 seq_representative <- TE_DLTP_info$seqs_representative 165 seqs_representative <- TE_DLTP_info$seqs_representative
140 } 166 }
141 } 167 }
142 168
143 # TE rank 3 - verify agains good DLTP 169 # TE rank 3 - verify agains good DLTP
144 TE_DL_info_DLTP_verified <- compare_TE_datasets( 170 TE_DL_info_DLTP_verified <- compare_TE_datasets(
145 s_te$DL, 171 s_te$DL,
146 TE_DLTP_info$seqs_representative, min_coverage = 0.95, 172 seqs_representative, min_coverage = 0.90,
147 ncpus = ncpus, word_size = word_size 173 ncpus = ncpus,
148 ) 174 word_size = word_size,
149 175 task = task,
150 176 perc_identity = perc_identity)
151 R <- seq_diversity(seq_representative)$richness 177
152 SI <- seq_diversity(seq_representative)$shannon_index 178 TE_D_info_DLTP_verified <- compare_TE_datasets(
179 s_te$D,
180 seqs_representative, min_coverage = 0.90,
181 ncpus = ncpus,
182 word_size = word_size,
183 task = task,
184 perc_identity = perc_identity)
185
186
187
188 R <- seq_diversity(seqs_representative)$richness
189 SI <- seq_diversity(seqs_representative)$shannon_index
153 190
154 # final RM library: 191 # final RM library:
155 seq_representative_no_ssr <- seq_representative[R > 20] 192 seqs_representative_no_ssr <- seqs_representative[R > 20]
156 193
157 ID <- g$ID[g$type == "transposable_element"] 194 ID <- g$ID[g$type == "transposable_element"]
158 names(ID) <- paste0(seqnames(g), "_", 195 names(ID) <- paste0(seqnames(g), "_",
159 start(g), "_", 196 start(g), "_",
160 end(g), "#", 197 end(g), "#",
164 # create clean gff3 201 # create clean gff3
165 id_of_good_te <- unique(c( 202 id_of_good_te <- unique(c(
166 TE_DLTP_info$te_conflict_info$ok, 203 TE_DLTP_info$te_conflict_info$ok,
167 TE_DLT_plus_DLP_info_DLTP_verified$id_ok_verified, 204 TE_DLT_plus_DLP_info_DLTP_verified$id_ok_verified,
168 TE_DLT_plus_DLP_info_multiplicity$id_ok_mp_verified, 205 TE_DLT_plus_DLP_info_multiplicity$id_ok_mp_verified,
169 TE_DL_info_DLTP_verified$id_ok_verified) 206 TE_DL_info_DLTP_verified$id_ok_verified,
207 TE_D_info_DLTP_verified$id_ok_verified,
208 TE_D_plus_DL_info_multiplicity$id_ok_mp_verified),
170 ) 209 )
171 210
172 c1 <- g$ID %in% ID[id_of_good_te] 211 c1 <- g$ID %in% ID[id_of_good_te]
173 c2 <- sapply(g$Parent, function(x)ifelse(length(x) == 0, "", x)) %in% ID[id_of_good_te] 212 c2 <- sapply(g$Parent, function(x)ifelse(length(x) == 0, "", x)) %in% ID[id_of_good_te]
174 213
175 gff_out <- g[c1 | c2] 214 gff_out <- g[c1 | c2]
176 215
177 gff_te <- gff_out[gff_out$type %in% "transposable_element"] 216 gff_te <- gff_out[gff_out$type %in% "transposable_element"]
217 # remove partial elements
218 gff_te_with_ltr <- gff_out[gff_out$type %in% "transposable_element" & gff_out$Rank != "D"]
219
220
178 gff_5ltr <- gff_out[gff_out$LTR %in% "5LTR"] 221 gff_5ltr <- gff_out[gff_out$LTR %in% "5LTR"]
179 gff_3ltr <- gff_out[gff_out$LTR %in% "3LTR"] 222 gff_3ltr <- gff_out[gff_out$LTR %in% "3LTR"]
180 223
181 full_te <- getSeqNamed(s, gff_te) 224 full_te <- getSeqNamed(s, gff_te)
182 names(full_te) <- paste0(gff_te$ID,":",names(full_te)) 225 names(full_te) <- paste0(gff_te$ID,":",names(full_te))
183 ltr5 <- getSeqNamed(s, gff_5ltr) 226 ltr5 <- getSeqNamed(s, gff_5ltr)
184 names(ltr5) <- paste0(gff_5ltr$Parent,":",names(ltr5)) 227 names(ltr5) <- paste0(gff_5ltr$Parent,":",names(ltr5))
185 ltr3 <- getSeqNamed(s, gff_3ltr) 228 ltr3 <- getSeqNamed(s, gff_3ltr)
186 names(ltr3) <- paste0(gff_3ltr$Parent,":",names(ltr3)) 229 names(ltr3) <- paste0(gff_3ltr$Parent,":",names(ltr3))
187 inc <- gff_te$Rank != "DL" 230 inc <- gff_te_with_ltr$Rank != "DL"
188 231
189 writeXStringSet(seq_representative, paste0(opt$output, "_RM_lib_non_redundant.fasta")) 232 writeXStringSet(seqs_representative, paste0(opt$output, "_RM_lib_non_redundant.fasta"))
190 writeXStringSet(full_te, paste0(opt$output, "_RM_lib_full_TE.fasta")) 233 writeXStringSet(full_te, paste0(opt$output, "_RM_lib_full_TE.fasta"))
191 writeXStringSet(ltr5, paste0(opt$output, "_RM_lib_5LTR.fasta")) 234 writeXStringSet(ltr5, paste0(opt$output, "_RM_lib_5LTR.fasta"))
192 writeXStringSet(ltr3, paste0(opt$output, "_RM_lib_3LTR.fasta")) 235 writeXStringSet(ltr3, paste0(opt$output, "_RM_lib_3LTR.fasta"))
193 236
194 export(gff_out, paste0(opt$output, "_clean.gff3"), format = "gff3") 237 export(gff_out, paste0(opt$output, "_clean.gff3"), format = "gff3")
195 238
196 lv <- sort(unique(gff_te$Final_Classification)) 239 lv <- sort(unique(gff_te_with_ltr$Final_Classification))
197 te_count <- table(factor(gff_te$Final_Classification, levels=lv)) 240 te_count <- table(factor(gff_te_with_ltr$Final_Classification, levels=lv))
198 241
199 pdf(paste0(opt$output, "_summary.pdf"), width = 13, height=8, pointsize = 10) 242 pdf(paste0(opt$output, "_summary.pdf"), width = 13, height=8, pointsize = 10)
200 par(mfrow=c(1,2), mar=c(5,7,2,0), las=1) 243 par(mfrow=c(1,2), mar=c(5,7,2,0), las=1)
201 boxplot(width(gff_te) ~ factor(gff_te$Final_Classification, levels=lv), 244 boxplot(width(gff_te_with_ltr) ~ factor(gff_te_with_ltr$Final_Classification, levels=lv),
202 horizontal = TRUE, xlab="length[bp]", ylab="", 245 horizontal = TRUE, xlab="length[bp]", ylab="",
203 names = paste0(gsub("^.+[|]", "", lv), " (", te_count, ")"), 246 names = paste0(gsub("^.+[|]", "", lv), " (", te_count, ")"),
204 main = 'Full TE', at = seq_along(lv)*4 247 main = 'Full TE', at = seq_along(lv)*4
205 ) 248 )
206 boxplot(width(gff_te[inc]) ~ factor(gff_te$Final_Classification[inc], levels=lv), 249 boxplot(width(gff_te_with_ltr[inc]) ~ factor(gff_te_with_ltr$Final_Classification[inc], levels=lv),
207 horizontal = TRUE, xlab="length[bp]", ylab="", 250 horizontal = TRUE, xlab="length[bp]", ylab="",
208 names = rep("", length(lv)), 251 names = rep("", length(lv)),
209 main = 'Full TE', at = seq_along(lv)*4-1, add=TRUE, col=2 252 main = 'Full TE', at = seq_along(lv)*4-1, add=TRUE, col=2
210 ) 253 )
211 par(mar=c(5,0,2,7)) 254 par(mar=c(5,0,2,7))