diff do_not_track_info/te_analysis.R @ 7:c33d6583e548 draft

"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
author petr-novak
date Fri, 24 Jun 2022 14:19:48 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/do_not_track_info/te_analysis.R	Fri Jun 24 14:19:48 2022 +0000
@@ -0,0 +1,178 @@
+#!/usr/bin/env Rscript
+library(ggplot2)
+lf <- read.table("/mnt/raid/users/petr/workspace/dante_ltr/do_not_track_info/ltr_finder_13100_2018_144_MOESM1_ESM.csv",
+                as.is=TRUE, header=TRUE, sep="\t", quote = "")
+cls <-  read.table("/mnt/raid/users/petr/workspace/dante_ltr/do_not_track_info/classification__domain_features_13100_2018_144_MOESM1_ESM.csv",
+                 as.is=TRUE, header=TRUE, sep="\t", quote='', comment.char = "")
+
+# add classification to ltr
+rexdb <- cls$Classification..new03.
+names(rexdb) <- cls$REXdb.name
+
+
+lf$classification <- cls$Classification..new03.[match(lf$REXdb.name, cls$REXdb.name)]
+
+
+
+
+length_quantiles = sapply(split(lf$size..including.TSD., lf$classification), quantile, seq(0,1,by=0.025))
+length95min = length_quantiles[2,]
+length95max = length_quantiles[40,]
+
+ltr_length_quantiles = sapply(split(lf$X3.LTR.size, lf$classification), quantile, seq(0,1,by=0.025))
+ltr_length95min = ltr_length_quantiles[2,]
+ltr_length95max = ltr_length_quantiles[40,]
+
+write.table(length_quantiles, file = "/mnt/raid/users/petr/workspace/dante_ltr/do_not_track_info/TE_length_quantiles.csv", sep="\t")
+write.table(ltr_length_quantiles, file = "/mnt/raid/users/petr/workspace/dante_ltr/do_not_track_info/LTR_length_quantiles.csv", sep="\t")
+
+library(rtracklayer)
+d = import("/mnt/raid/users/petr/workspace/dante_ltr/do_not_track_info/dante_on_rexdb.gff3")
+
+# some elements have duplicated domains - remove it
+dom_dup = sapply(split(d$Name, seqnames(d)), function (x)any(duplicated(x)))
+d2 = d[seqnames(d) %in% names(dom_dup)[!dom_dup]]
+dparts = split(d2, d2$Final_Classification)
+
+dparts_rex = sapply(dparts, function(x) split(x, seqnames(x), drop = TRUE))
+
+
+get_dom_pos <- function(g){
+  if (length(g) == 0){
+    return(NULL)
+  }
+  gdf <-  data.frame(rexdb = as.character(seqnames(g)),
+                   domain = g$Name,
+                   S = start(g),
+                   E = end(g),
+                   stringsAsFactors = FALSE)
+  gSmat <- xtabs(S  ~ rexdb + domain, data = gdf)
+  colnames(gSmat) <-  paste0(colnames(gSmat),"_S")
+  gEmat <- xtabs(E  ~ rexdb + domain, data = gdf)
+  colnames(gEmat) <-  paste0(colnames(gEmat),"_E")
+  SEmat <- cbind(gSmat,gEmat)
+  # reorder
+  dom_position <- colMeans(SEmat)
+  SEmat_sorted <-  SEmat[,order(dom_position)]
+  return(SEmat_sorted)
+}
+
+
+domains_SE = lapply(dparts, get_dom_pos)
+
+# add ltr5_E, ltr5_S, ..
+LSE = list()
+for (i in seq_along(domains_SE)){
+  ltr3 = lf$X3.LTR.position[match(rownames(domains_SE[[i]]), lf$REXdb.name)]
+  ltr5 = lf$X5.LTR..position[match(rownames(domains_SE[[i]]), lf$REXdb.name)]
+  if (all(is.na(ltr3))){
+    # not ltr retrotransposon
+    LSE[[i]] = domains_SE[[i]]
+    next
+  }
+  ltr5_S = sapply(strsplit(ltr5, "-"), function(x) if(is.na(x[[1]])){NA} else{as.numeric(x[1])})
+  ltr5_E = sapply(strsplit(ltr5, "-"), function(x) if(is.na(x[[1]])){NA} else{as.numeric(x[2])})
+  ltr3_S = sapply(strsplit(ltr3, "-"), function(x) if(is.na(x[[1]])){NA} else{as.numeric(x[1])})
+  ltr3_E = sapply(strsplit(ltr3, "-"), function(x) if(is.na(x[[1]])){NA} else{as.numeric(x[2])})
+  LSE[[i]] = cbind(ltr5_S, ltr5_E, domains_SE[[i]], ltr3_S, ltr3_E)
+}
+
+names(LSE) = names(domains_SE)
+newN = function(a,b){
+  ap = gsub("_.+","", a)
+  bp = gsub("_.+","", b)
+  ifelse(ap == bp, ap, paste0(ap,"_", bp))
+}
+
+# positions to distances:
+LSED = lapply(LSE, function(x){
+  x[x==0]=NA
+  N = ncol(x)
+  if (is.null(N)){
+    return(NULL)
+  }
+  NM2 = colnames(x[,2:N])
+  NM1 = colnames(x[,1:(N - 1)])
+  d = x[,2:N, drop = FALSE] - x[,1:(N-1), drop=FALSE]
+  colnames(d) = newN(NM1, NM2)
+  return(d)
+}
+)
+
+
+# create empirical cumulative distribution function
+get_ecdf <- function(dd, s = 1) {
+  ecdf_list = list()
+  for (i1 in colnames(dd)){
+    for (i2 in colnames(dd)){
+      if (i1 == i2){
+        next
+      }else{
+        ii = sort(c(i1,i2))
+        ecdf_list[[ii[1]]][[ii[2]]]=ecdf(s * abs(dd[,i2]-dd[,i1]))
+
+      }
+    }
+  }
+return(ecdf_list)
+}
+
+
+ecdf_list = lapply(LSE, get_ecdf)
+ecdf_list_m = lapply(LSE, get_ecdf, -1)
+
+ecdf_model = list(plus = ecdf_list, minus = ecdf_list_m)
+
+saveRDS(ecdf_model, "databases/feature_distances_model.RDS")
+
+dante2dist <-  function(dc){
+  # dc - cluster of domains, granges
+  dpos <- get_dom_pos(dc)
+  D <-  c(dist(dpos))
+  NN <-  combn(names(dpos), 2)
+  # order feature in pair alphabeticaly
+  N1 <-  ifelse(NN[1,]<NN[2,], NN[1,], NN[2, ])
+  N2 <-  ifelse(NN[1,]>NN[2,], NN[1,], NN[2, ])
+  dfd <-  data.frame(F1 = N1, F2 = N2, distance = D)
+}
+
+
+
+dante_to_quantiles <- function(dc, model_function, lineage=NULL){
+  dfd <- dante2dist(dc)
+  if (is.null(lineage)){
+    lineage <- dc$Final_Classification[1]
+  }
+  fn <- model_function$plus[[lineage]]
+  fnm <- model_function$minus[[lineage]]
+  dstat1 <- mapply(mapply(dfd$F1, dfd$F2, FUN=function(a, b)fn[[a]][[b]]), dfd$distance, FUN=function(f, x)f(x))
+  dstat2 <- mapply(mapply(dfd$F1, dfd$F2, FUN=function(a, b)fnm[[a]][[b]]), dfd$distance, FUN=function(f, x)f(-x))
+  dout <- cbind(dfd, dstat1, dstat2, dstat12 = ifelse(dstat1>dstat2, dstat2, dstat1))
+}
+
+
+
+q=dante_to_quantiles(dparts_rex[[29]][[500]], ecdf_model)
+
+qq = lapply(dparts_rex[[29]], dante_to_quantiles, ecdf_model, lineage = dparts_rex[[43]][[1]]$Final_Classification[1])
+qq = lapply(dparts_rex[[29]], dante_to_quantiles, ecdf_model, lineage = dparts_rex[[29]][[1]]$Final_Classification[1])
+
+ll = sapply(qq, function(x)(sum(x$dstat12<0.01))/nrow(x))
+lln = sapply(qq, function(x)(sum(x$dstat12<0.01)))
+
+
+table(ll > 0.1) # more that 10% dist are outliers
+
+table(ll)
+table(lln)
+lll = sapply(ll, min)
+hist(lll, breaks= 500)
+dc = dparts_rex[[30]][[30]]
+
+print(min(dout[4:5]))
+plot(dout[4:5], xlim=c(0,1), ylim=c(0,1))
+
+hist(dout[,6], breaks=50, xlim=c(0,1))
+mtext(min(dout[,6]))
+
+apply(LSED[[30]],2, median, na.rm = TRUE)
\ No newline at end of file