comparison do_not_track_info/te_analysis.R @ 7:c33d6583e548 draft

"planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
author petr-novak
date Fri, 24 Jun 2022 14:19:48 +0000
parents
children
comparison
equal deleted inserted replaced
6:b91ca438a1cb 7:c33d6583e548
1 #!/usr/bin/env Rscript
2 library(ggplot2)
3 lf <- read.table("/mnt/raid/users/petr/workspace/dante_ltr/do_not_track_info/ltr_finder_13100_2018_144_MOESM1_ESM.csv",
4 as.is=TRUE, header=TRUE, sep="\t", quote = "")
5 cls <- read.table("/mnt/raid/users/petr/workspace/dante_ltr/do_not_track_info/classification__domain_features_13100_2018_144_MOESM1_ESM.csv",
6 as.is=TRUE, header=TRUE, sep="\t", quote='', comment.char = "")
7
8 # add classification to ltr
9 rexdb <- cls$Classification..new03.
10 names(rexdb) <- cls$REXdb.name
11
12
13 lf$classification <- cls$Classification..new03.[match(lf$REXdb.name, cls$REXdb.name)]
14
15
16
17
18 length_quantiles = sapply(split(lf$size..including.TSD., lf$classification), quantile, seq(0,1,by=0.025))
19 length95min = length_quantiles[2,]
20 length95max = length_quantiles[40,]
21
22 ltr_length_quantiles = sapply(split(lf$X3.LTR.size, lf$classification), quantile, seq(0,1,by=0.025))
23 ltr_length95min = ltr_length_quantiles[2,]
24 ltr_length95max = ltr_length_quantiles[40,]
25
26 write.table(length_quantiles, file = "/mnt/raid/users/petr/workspace/dante_ltr/do_not_track_info/TE_length_quantiles.csv", sep="\t")
27 write.table(ltr_length_quantiles, file = "/mnt/raid/users/petr/workspace/dante_ltr/do_not_track_info/LTR_length_quantiles.csv", sep="\t")
28
29 library(rtracklayer)
30 d = import("/mnt/raid/users/petr/workspace/dante_ltr/do_not_track_info/dante_on_rexdb.gff3")
31
32 # some elements have duplicated domains - remove it
33 dom_dup = sapply(split(d$Name, seqnames(d)), function (x)any(duplicated(x)))
34 d2 = d[seqnames(d) %in% names(dom_dup)[!dom_dup]]
35 dparts = split(d2, d2$Final_Classification)
36
37 dparts_rex = sapply(dparts, function(x) split(x, seqnames(x), drop = TRUE))
38
39
40 get_dom_pos <- function(g){
41 if (length(g) == 0){
42 return(NULL)
43 }
44 gdf <- data.frame(rexdb = as.character(seqnames(g)),
45 domain = g$Name,
46 S = start(g),
47 E = end(g),
48 stringsAsFactors = FALSE)
49 gSmat <- xtabs(S ~ rexdb + domain, data = gdf)
50 colnames(gSmat) <- paste0(colnames(gSmat),"_S")
51 gEmat <- xtabs(E ~ rexdb + domain, data = gdf)
52 colnames(gEmat) <- paste0(colnames(gEmat),"_E")
53 SEmat <- cbind(gSmat,gEmat)
54 # reorder
55 dom_position <- colMeans(SEmat)
56 SEmat_sorted <- SEmat[,order(dom_position)]
57 return(SEmat_sorted)
58 }
59
60
61 domains_SE = lapply(dparts, get_dom_pos)
62
63 # add ltr5_E, ltr5_S, ..
64 LSE = list()
65 for (i in seq_along(domains_SE)){
66 ltr3 = lf$X3.LTR.position[match(rownames(domains_SE[[i]]), lf$REXdb.name)]
67 ltr5 = lf$X5.LTR..position[match(rownames(domains_SE[[i]]), lf$REXdb.name)]
68 if (all(is.na(ltr3))){
69 # not ltr retrotransposon
70 LSE[[i]] = domains_SE[[i]]
71 next
72 }
73 ltr5_S = sapply(strsplit(ltr5, "-"), function(x) if(is.na(x[[1]])){NA} else{as.numeric(x[1])})
74 ltr5_E = sapply(strsplit(ltr5, "-"), function(x) if(is.na(x[[1]])){NA} else{as.numeric(x[2])})
75 ltr3_S = sapply(strsplit(ltr3, "-"), function(x) if(is.na(x[[1]])){NA} else{as.numeric(x[1])})
76 ltr3_E = sapply(strsplit(ltr3, "-"), function(x) if(is.na(x[[1]])){NA} else{as.numeric(x[2])})
77 LSE[[i]] = cbind(ltr5_S, ltr5_E, domains_SE[[i]], ltr3_S, ltr3_E)
78 }
79
80 names(LSE) = names(domains_SE)
81 newN = function(a,b){
82 ap = gsub("_.+","", a)
83 bp = gsub("_.+","", b)
84 ifelse(ap == bp, ap, paste0(ap,"_", bp))
85 }
86
87 # positions to distances:
88 LSED = lapply(LSE, function(x){
89 x[x==0]=NA
90 N = ncol(x)
91 if (is.null(N)){
92 return(NULL)
93 }
94 NM2 = colnames(x[,2:N])
95 NM1 = colnames(x[,1:(N - 1)])
96 d = x[,2:N, drop = FALSE] - x[,1:(N-1), drop=FALSE]
97 colnames(d) = newN(NM1, NM2)
98 return(d)
99 }
100 )
101
102
103 # create empirical cumulative distribution function
104 get_ecdf <- function(dd, s = 1) {
105 ecdf_list = list()
106 for (i1 in colnames(dd)){
107 for (i2 in colnames(dd)){
108 if (i1 == i2){
109 next
110 }else{
111 ii = sort(c(i1,i2))
112 ecdf_list[[ii[1]]][[ii[2]]]=ecdf(s * abs(dd[,i2]-dd[,i1]))
113
114 }
115 }
116 }
117 return(ecdf_list)
118 }
119
120
121 ecdf_list = lapply(LSE, get_ecdf)
122 ecdf_list_m = lapply(LSE, get_ecdf, -1)
123
124 ecdf_model = list(plus = ecdf_list, minus = ecdf_list_m)
125
126 saveRDS(ecdf_model, "databases/feature_distances_model.RDS")
127
128 dante2dist <- function(dc){
129 # dc - cluster of domains, granges
130 dpos <- get_dom_pos(dc)
131 D <- c(dist(dpos))
132 NN <- combn(names(dpos), 2)
133 # order feature in pair alphabeticaly
134 N1 <- ifelse(NN[1,]<NN[2,], NN[1,], NN[2, ])
135 N2 <- ifelse(NN[1,]>NN[2,], NN[1,], NN[2, ])
136 dfd <- data.frame(F1 = N1, F2 = N2, distance = D)
137 }
138
139
140
141 dante_to_quantiles <- function(dc, model_function, lineage=NULL){
142 dfd <- dante2dist(dc)
143 if (is.null(lineage)){
144 lineage <- dc$Final_Classification[1]
145 }
146 fn <- model_function$plus[[lineage]]
147 fnm <- model_function$minus[[lineage]]
148 dstat1 <- mapply(mapply(dfd$F1, dfd$F2, FUN=function(a, b)fn[[a]][[b]]), dfd$distance, FUN=function(f, x)f(x))
149 dstat2 <- mapply(mapply(dfd$F1, dfd$F2, FUN=function(a, b)fnm[[a]][[b]]), dfd$distance, FUN=function(f, x)f(-x))
150 dout <- cbind(dfd, dstat1, dstat2, dstat12 = ifelse(dstat1>dstat2, dstat2, dstat1))
151 }
152
153
154
155 q=dante_to_quantiles(dparts_rex[[29]][[500]], ecdf_model)
156
157 qq = lapply(dparts_rex[[29]], dante_to_quantiles, ecdf_model, lineage = dparts_rex[[43]][[1]]$Final_Classification[1])
158 qq = lapply(dparts_rex[[29]], dante_to_quantiles, ecdf_model, lineage = dparts_rex[[29]][[1]]$Final_Classification[1])
159
160 ll = sapply(qq, function(x)(sum(x$dstat12<0.01))/nrow(x))
161 lln = sapply(qq, function(x)(sum(x$dstat12<0.01)))
162
163
164 table(ll > 0.1) # more that 10% dist are outliers
165
166 table(ll)
167 table(lln)
168 lll = sapply(ll, min)
169 hist(lll, breaks= 500)
170 dc = dparts_rex[[30]][[30]]
171
172 print(min(dout[4:5]))
173 plot(dout[4:5], xlim=c(0,1), ylim=c(0,1))
174
175 hist(dout[,6], breaks=50, xlim=c(0,1))
176 mtext(min(dout[,6]))
177
178 apply(LSED[[30]],2, median, na.rm = TRUE)