Mercurial > repos > petr-novak > dante_ltr
diff clean_ltr.R @ 7:c33d6583e548 draft
"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
author | petr-novak |
---|---|
date | Fri, 24 Jun 2022 14:19:48 +0000 |
parents | b91ca438a1cb |
children | ff01d4263391 |
line wrap: on
line diff
--- a/clean_ltr.R Thu May 19 08:21:55 2022 +0000 +++ b/clean_ltr.R Fri Jun 24 14:19:48 2022 +0000 @@ -80,6 +80,7 @@ s <- readDNAStringSet("./test_data/big_test_data/T_cacao_chromosomes.fasta") g <- rtracklayer::import("./test_data/sample_ltr_annotation.gff3") + g <- rtracklayer::import("./test_data/sample_DANTE_LTR_annotation.gff3") s <- readDNAStringSet("./test_data/sample_genome.fasta") g <- rtracklayer::import("./test_data/DANTE_LTR_Vfaba_chr5.gff3") @@ -100,23 +101,48 @@ # get te sequence based on rank -# evaluate best TE - DLTP grou +# best quality - split by lineage s_te <- get_te_sequences(g, s) # split by 'element quality' -# best quality - split by lineage -word_size <- 15 +# evaluate best TE - DLTP grou + +# comparison parameters +word_size <- 11 +task <- "blastn" +perc_identity <- 80 + # best TE -TE_DLTP_info <- analyze_TE(s_te$DLTP, word_size = word_size, ncpus = ncpus) +TE_DLTP_info <- analyze_TE(s_te$DLTP, + word_size = word_size, + ncpus = ncpus, + perc_identity = perc_identity, + task = task) # TE rank 2: -TE_DLT_plus_DLP_info <- analyze_TE(c(s_te$DLP, s_te$DLT), word_size = word_size, ncpus - = ncpus) - TE_DLT_plus_DLP_info_DLTP_verified <- compare_TE_datasets(c(s_te$DLT, s_te$DLP), ncpus - = ncpus,TE_DLTP_info$seqs_representative, word_size = word_size - ) +TE_DLT_plus_DLP_info <- analyze_TE(c(s_te$DLP, s_te$DLT), + word_size = word_size, + ncpus = ncpus, + perc_identity = perc_identity, + task = task) + +TE_D_plus_DL_info <- analyze_TE(c(s_te$DL, s_te$D), + word_size = word_size, + ncpus = ncpus, + perc_identity = perc_identity, + task = task) + +TE_DLT_plus_DLP_info_DLTP_verified <- compare_TE_datasets( + c(s_te$DLT, s_te$DLP), + ncpus = ncpus, + TE_DLTP_info$seqs_representative, + word_size = word_size, + perc_identity = perc_identity, + task = task) TE_DLT_plus_DLP_info_multiplicity <- verify_based_on_multiplicity(TE_DLT_plus_DLP_info) -# create additional library from rank 2 verified by multiplicity -id_for_additional_library <- setdiff( +TE_D_plus_DL_info_multiplicity <- verify_based_on_multiplicity(TE_D_plus_DL_info) + +# create additional library from rank 2 verified by multiplicity and DLTP +id_for_additional_library <- union( TE_DLT_plus_DLP_info_multiplicity$id_ok_mp_verified, TE_DLT_plus_DLP_info_DLTP_verified$id_ok_verified) @@ -125,34 +151,45 @@ id_for_additional_library] seqs_additional_info <- analyze_TE(seqs_for_additional_library, word_size = word_size, ncpus = ncpus) - seq_representative <- c( + seqs_representative <- c( TE_DLTP_info$seqs_representative, seqs_additional_info$seqs_representative ) }else { if (length(id_for_additional_library) == 1) { - seq_representative <- c( + seqs_representative <- c( TE_DLTP_info$seqs_representative, c(s_te$DLP, s_te$DLT)[names(c(s_te$DLP, s_te$DLT)) %in% id_for_additional_library] ) }else { - seq_representative <- TE_DLTP_info$seqs_representative + seqs_representative <- TE_DLTP_info$seqs_representative } } # TE rank 3 - verify agains good DLTP TE_DL_info_DLTP_verified <- compare_TE_datasets( s_te$DL, - TE_DLTP_info$seqs_representative, min_coverage = 0.95, - ncpus = ncpus, word_size = word_size -) + seqs_representative, min_coverage = 0.90, + ncpus = ncpus, + word_size = word_size, + task = task, + perc_identity = perc_identity) + +TE_D_info_DLTP_verified <- compare_TE_datasets( + s_te$D, + seqs_representative, min_coverage = 0.90, + ncpus = ncpus, + word_size = word_size, + task = task, + perc_identity = perc_identity) -R <- seq_diversity(seq_representative)$richness -SI <- seq_diversity(seq_representative)$shannon_index + +R <- seq_diversity(seqs_representative)$richness +SI <- seq_diversity(seqs_representative)$shannon_index # final RM library: -seq_representative_no_ssr <- seq_representative[R > 20] +seqs_representative_no_ssr <- seqs_representative[R > 20] ID <- g$ID[g$type == "transposable_element"] names(ID) <- paste0(seqnames(g), "_", @@ -166,7 +203,9 @@ TE_DLTP_info$te_conflict_info$ok, TE_DLT_plus_DLP_info_DLTP_verified$id_ok_verified, TE_DLT_plus_DLP_info_multiplicity$id_ok_mp_verified, - TE_DL_info_DLTP_verified$id_ok_verified) + TE_DL_info_DLTP_verified$id_ok_verified, + TE_D_info_DLTP_verified$id_ok_verified, + TE_D_plus_DL_info_multiplicity$id_ok_mp_verified), ) c1 <- g$ID %in% ID[id_of_good_te] @@ -175,6 +214,10 @@ gff_out <- g[c1 | c2] gff_te <- gff_out[gff_out$type %in% "transposable_element"] +# remove partial elements +gff_te_with_ltr <- gff_out[gff_out$type %in% "transposable_element" & gff_out$Rank != "D"] + + gff_5ltr <- gff_out[gff_out$LTR %in% "5LTR"] gff_3ltr <- gff_out[gff_out$LTR %in% "3LTR"] @@ -184,26 +227,26 @@ names(ltr5) <- paste0(gff_5ltr$Parent,":",names(ltr5)) ltr3 <- getSeqNamed(s, gff_3ltr) names(ltr3) <- paste0(gff_3ltr$Parent,":",names(ltr3)) -inc <- gff_te$Rank != "DL" +inc <- gff_te_with_ltr$Rank != "DL" -writeXStringSet(seq_representative, paste0(opt$output, "_RM_lib_non_redundant.fasta")) +writeXStringSet(seqs_representative, paste0(opt$output, "_RM_lib_non_redundant.fasta")) writeXStringSet(full_te, paste0(opt$output, "_RM_lib_full_TE.fasta")) writeXStringSet(ltr5, paste0(opt$output, "_RM_lib_5LTR.fasta")) writeXStringSet(ltr3, paste0(opt$output, "_RM_lib_3LTR.fasta")) export(gff_out, paste0(opt$output, "_clean.gff3"), format = "gff3") -lv <- sort(unique(gff_te$Final_Classification)) -te_count <- table(factor(gff_te$Final_Classification, levels=lv)) +lv <- sort(unique(gff_te_with_ltr$Final_Classification)) +te_count <- table(factor(gff_te_with_ltr$Final_Classification, levels=lv)) pdf(paste0(opt$output, "_summary.pdf"), width = 13, height=8, pointsize = 10) par(mfrow=c(1,2), mar=c(5,7,2,0), las=1) -boxplot(width(gff_te) ~ factor(gff_te$Final_Classification, levels=lv), +boxplot(width(gff_te_with_ltr) ~ factor(gff_te_with_ltr$Final_Classification, levels=lv), horizontal = TRUE, xlab="length[bp]", ylab="", names = paste0(gsub("^.+[|]", "", lv), " (", te_count, ")"), main = 'Full TE', at = seq_along(lv)*4 ) -boxplot(width(gff_te[inc]) ~ factor(gff_te$Final_Classification[inc], levels=lv), +boxplot(width(gff_te_with_ltr[inc]) ~ factor(gff_te_with_ltr$Final_Classification[inc], levels=lv), horizontal = TRUE, xlab="length[bp]", ylab="", names = rep("", length(lv)), main = 'Full TE', at = seq_along(lv)*4-1, add=TRUE, col=2