diff clean_ltr.R @ 3:6ae4a341d1f3 draft

"planemo upload commit 8bd6029a4de4a8f5031a5cc71303bb06217cc88a"
author petr-novak
date Tue, 03 May 2022 12:38:12 +0000
parents 7b0bbe7477c4
children 93d35ae65e1b
line wrap: on
line diff
--- a/clean_ltr.R	Tue Apr 12 12:55:32 2022 +0000
+++ b/clean_ltr.R	Tue May 03 12:38:12 2022 +0000
@@ -108,7 +108,6 @@
 )
 
 TE_DLT_plus_DLP_info_multiplicity <- verify_based_on_multiplicity(TE_DLT_plus_DLP_info)
-
 # create additional library from rank 2 verified by multiplicity
 id_for_additional_library <- setdiff(
   TE_DLT_plus_DLP_info_multiplicity$id_ok_mp_verified,
@@ -134,7 +133,6 @@
   }
 }
 
-
 # TE  rank 3
 TE_DL_info_DLTP_verified <- compare_TE_datasets(
   s_te$DL,
@@ -169,7 +167,46 @@
 
 gff_out <- g[c1 | c2]
 
+gff_te <- gff_out[gff_out$type %in% "transposable_element"]
+gff_5ltr <- gff_out[gff_out$LTR %in% "5LTR"]
+gff_3ltr <- gff_out[gff_out$LTR %in% "3LTR"]
+full_te <- getSeqNamed(s, gff_te)
+ltr5 <-  getSeqNamed(s, gff_5ltr)
+ltr3 <-  getSeqNamed(s, gff_3ltr)
+inc <-  gff_te$Rank != "DL"
 
-writeXStringSet(seq_representative, paste0(opt$output, "_RM_lib.fasta"))
+writeXStringSet(seq_representative, paste0(opt$output, "_RM_lib_non_redundant.fasta"))
+writeXStringSet(full_te, paste0(opt$output, "_RM_lib_full_TE.fasta"))
+writeXStringSet(ltr5, paste0(opt$output, "_RM_lib_5LTR.fasta"))
+writeXStringSet(ltr3, paste0(opt$output, "_RM_lib_3LTR.fasta"))
+
 export(gff_out, paste0(opt$output, "_clean.gff3"), format = "gff3")
 
+lv <- sort(unique(gff_te$Final_Classification))
+te_count <- table(factor(gff_te$Final_Classification, levels=lv))
+
+pdf(paste0(opt$output, "_summary.pdf"), width = 13, height=8, pointsize = 10)
+par(mfrow=c(1,2), mar=c(5,7,2,0), las=1)
+boxplot(width(gff_te) ~ factor(gff_te$Final_Classification, levels=lv),
+        horizontal = TRUE, xlab="length[bp]", ylab="",
+        names = paste0(gsub("^.+[|]", "", lv), " (", te_count, ")"),
+        main = 'Full TE', at = seq_along(lv)*4
+)
+boxplot(width(gff_te[inc]) ~ factor(gff_te$Final_Classification[inc], levels=lv),
+        horizontal = TRUE, xlab="length[bp]", ylab="",
+        names = rep("", length(lv)),
+        main = 'Full TE', at = seq_along(lv)*4-1, add=TRUE, col=2
+)
+par(mar=c(5,0,2,7))
+boxplot(width(gff_5ltr) ~ factor(gff_5ltr$Final_Classification, levels=lv),
+        horizontal = TRUE, xlab="length[bp]", ylab="",
+        names = rep("", length(lv)),
+        main = "5'LTR", at = seq_along(lv)*4
+)
+boxplot(width(gff_5ltr[inc]) ~ factor(gff_5ltr$Final_Classification[inc], levels=lv),
+        horizontal = TRUE, xlab="length[bp]", ylab="",
+        names = rep("", length(lv)),
+        main = "5'LTR", at = seq_along(lv)*4-1, add=TRUE, col=2
+)
+legend('bottomright', col=c("grey","2"), legend=c("All TE", "TE with PBS/TSD"), pch=15)
+dev.off()