comparison clean_ltr.R @ 3:6ae4a341d1f3 draft

"planemo upload commit 8bd6029a4de4a8f5031a5cc71303bb06217cc88a"
author petr-novak
date Tue, 03 May 2022 12:38:12 +0000
parents 7b0bbe7477c4
children 93d35ae65e1b
comparison
equal deleted inserted replaced
2:f131886ea194 3:6ae4a341d1f3
106 TE_DLTP_info$seqs_representative, 106 TE_DLTP_info$seqs_representative,
107 word_size = word_size 107 word_size = word_size
108 ) 108 )
109 109
110 TE_DLT_plus_DLP_info_multiplicity <- verify_based_on_multiplicity(TE_DLT_plus_DLP_info) 110 TE_DLT_plus_DLP_info_multiplicity <- verify_based_on_multiplicity(TE_DLT_plus_DLP_info)
111
112 # create additional library from rank 2 verified by multiplicity 111 # create additional library from rank 2 verified by multiplicity
113 id_for_additional_library <- setdiff( 112 id_for_additional_library <- setdiff(
114 TE_DLT_plus_DLP_info_multiplicity$id_ok_mp_verified, 113 TE_DLT_plus_DLP_info_multiplicity$id_ok_mp_verified,
115 TE_DLT_plus_DLP_info_DLTP_verified$id_ok_verified) 114 TE_DLT_plus_DLP_info_DLTP_verified$id_ok_verified)
116 115
132 }else { 131 }else {
133 seq_representative <- TE_DLTP_info$seqs_representative 132 seq_representative <- TE_DLTP_info$seqs_representative
134 } 133 }
135 } 134 }
136 135
137
138 # TE rank 3 136 # TE rank 3
139 TE_DL_info_DLTP_verified <- compare_TE_datasets( 137 TE_DL_info_DLTP_verified <- compare_TE_datasets(
140 s_te$DL, 138 s_te$DL,
141 TE_DLTP_info$seqs_representative, min_coverage = 0.98, 139 TE_DLTP_info$seqs_representative, min_coverage = 0.98,
142 ncpus = ncpus 140 ncpus = ncpus
167 c1 <- g$ID %in% ID[id_of_good_te] 165 c1 <- g$ID %in% ID[id_of_good_te]
168 c2 <- sapply(g$Parent, function(x)ifelse(length(x) == 0, "", x)) %in% ID[id_of_good_te] 166 c2 <- sapply(g$Parent, function(x)ifelse(length(x) == 0, "", x)) %in% ID[id_of_good_te]
169 167
170 gff_out <- g[c1 | c2] 168 gff_out <- g[c1 | c2]
171 169
172 170 gff_te <- gff_out[gff_out$type %in% "transposable_element"]
173 writeXStringSet(seq_representative, paste0(opt$output, "_RM_lib.fasta")) 171 gff_5ltr <- gff_out[gff_out$LTR %in% "5LTR"]
172 gff_3ltr <- gff_out[gff_out$LTR %in% "3LTR"]
173 full_te <- getSeqNamed(s, gff_te)
174 ltr5 <- getSeqNamed(s, gff_5ltr)
175 ltr3 <- getSeqNamed(s, gff_3ltr)
176 inc <- gff_te$Rank != "DL"
177
178 writeXStringSet(seq_representative, paste0(opt$output, "_RM_lib_non_redundant.fasta"))
179 writeXStringSet(full_te, paste0(opt$output, "_RM_lib_full_TE.fasta"))
180 writeXStringSet(ltr5, paste0(opt$output, "_RM_lib_5LTR.fasta"))
181 writeXStringSet(ltr3, paste0(opt$output, "_RM_lib_3LTR.fasta"))
182
174 export(gff_out, paste0(opt$output, "_clean.gff3"), format = "gff3") 183 export(gff_out, paste0(opt$output, "_clean.gff3"), format = "gff3")
175 184
185 lv <- sort(unique(gff_te$Final_Classification))
186 te_count <- table(factor(gff_te$Final_Classification, levels=lv))
187
188 pdf(paste0(opt$output, "_summary.pdf"), width = 13, height=8, pointsize = 10)
189 par(mfrow=c(1,2), mar=c(5,7,2,0), las=1)
190 boxplot(width(gff_te) ~ factor(gff_te$Final_Classification, levels=lv),
191 horizontal = TRUE, xlab="length[bp]", ylab="",
192 names = paste0(gsub("^.+[|]", "", lv), " (", te_count, ")"),
193 main = 'Full TE', at = seq_along(lv)*4
194 )
195 boxplot(width(gff_te[inc]) ~ factor(gff_te$Final_Classification[inc], levels=lv),
196 horizontal = TRUE, xlab="length[bp]", ylab="",
197 names = rep("", length(lv)),
198 main = 'Full TE', at = seq_along(lv)*4-1, add=TRUE, col=2
199 )
200 par(mar=c(5,0,2,7))
201 boxplot(width(gff_5ltr) ~ factor(gff_5ltr$Final_Classification, levels=lv),
202 horizontal = TRUE, xlab="length[bp]", ylab="",
203 names = rep("", length(lv)),
204 main = "5'LTR", at = seq_along(lv)*4
205 )
206 boxplot(width(gff_5ltr[inc]) ~ factor(gff_5ltr$Final_Classification[inc], levels=lv),
207 horizontal = TRUE, xlab="length[bp]", ylab="",
208 names = rep("", length(lv)),
209 main = "5'LTR", at = seq_along(lv)*4-1, add=TRUE, col=2
210 )
211 legend('bottomright', col=c("grey","2"), legend=c("All TE", "TE with PBS/TSD"), pch=15)
212 dev.off()