annotate R/ltr_utils.R @ 8:9de392f2fc02 draft

"planemo upload commit d6433b48c9bae079edb06364147f19500501c986"
author petr-novak
date Tue, 28 Jun 2022 12:33:22 +0000
parents c33d6583e548
children 1aa578e6c8b3
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
1 add_coordinates_of_closest_neighbor <- function(gff) {
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
2 gff <- gff[order(seqnames(gff), start(gff))]
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
3 # split to chromosomes:
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
4 gff_parts <- split(gff, seqnames(gff))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
5 upstreams <- c(sapply(gff_parts, function(x) c(1, head(end(x), -1))))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
6 downstreams <- c(sapply(gff_parts, function(x) c(start(x)[-1], seqlengths(x)[runValue(seqnames(x[1]))])))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
7 gff_updated <- unlist(gff_parts)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
8 gff_updated$upstream_domain <- unlist(upstreams)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
9 gff_updated$downstream_domain <- unlist(downstreams)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
10 names(gff_updated) <- NULL
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
11 return(gff_updated)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
12 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
13
7
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
14 get_domain_clusters_alt <- function(gff, dist_models, threshold=0.99){
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
15 gff <- sort(gff, by = ~ seqnames * start)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
16 domain_pairs <- data.frame(
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
17 D1 = paste0(head(gff$Name,-1),"_S"),
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
18 D2 = paste0(gff$Name[-1],"_S"),
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
19 C1 = head(gff$Final_Classification,-1),
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
20 C2 = gff$Final_Classification[-1],
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
21 S1 = head(strand(gff),-1),
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
22 S2 = strand(gff)[-1],
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
23 start1 = head(start(gff),-1),
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
24 start2 = start(gff)[-1],
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
25 chrpos= paste0(seqnames(gff)[-1],"_",start(gff)[-1])
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
26 )
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
27 domain_pairs$D_A <- with(domain_pairs, ifelse(D1 < D2, D1, D2))
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
28 domain_pairs$D_B <- with(domain_pairs, ifelse(D1 > D2, D1, D2))
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
29 domain_pairs$quantile <- 1
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
30 same_element_cluster <- domain_pairs$S1 == domain_pairs$S2 & domain_pairs$C1 == domain_pairs$C2
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
31 domain_pairs$same_element_cluster <- same_element_cluster
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
32
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
33 q_plus_function <- apply(domain_pairs[same_element_cluster,],
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
34 1,
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
35 function(x) dist_models$plus[[x["C1"]]][[x["D_A"]]][[x["D_B"]]])
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
36
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
37 D <- abs(domain_pairs[same_element_cluster,]$start1 - domain_pairs[same_element_cluster,]$start2)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
38
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
39 q_plus_value <- mapply(function(x, D)if(is.null(x)){0}else{x(D)}, x= q_plus_function, D = D)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
40
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
41 domain_pairs$quantile[same_element_cluster] <- q_plus_value
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
42
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
43 domain_pairs$split_position <- !same_element_cluster | domain_pairs$quantile > threshold
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
44 # combine clusters based on distances and original cluster
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
45 clusters <- paste(cumsum(c(TRUE, domain_pairs$split_position)), get_domain_clusters(gff))
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
46 return(clusters)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
47 }
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
48
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
49
0
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
50 get_domain_clusters <- function(gff) {
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
51 # get consecutive domains from same linage
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
52 # must be sorted!
7
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
53 plus_order_split <- c(0, as.numeric(cumsum(head(gff$domain_order, -1) >= gff$domain_order[-1] & strand(gff)[-1] == '+')))
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
54 minus_order_split <- rev(as.numeric(cumsum(rev(c(gff$domain_order[-1] >= head(gff$domain_order,-1) & strand(gff)[-1] == '-', FALSE)))))
0
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
55 # split based on classification - must be same and consecutive
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
56 x <- rle(gff$Final_Classification)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
57 # split on strand change
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
58 s <- rep(seq_along(runLength(strand(gff))), runLength(strand(gff)))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
59 domain_cluster <- paste0(rep(seq_along(x$lengths), x$lengths), "_", seqnames(gff),
7
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
60 "_", plus_order_split, "_", minus_order_split, "_", s)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
61 clusters <- factor(domain_cluster, levels = unique(domain_cluster))
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
62 return(clusters)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
63 }
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
64
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
65 # create partial TE from clusters
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
66 get_partial_te_from_cluster_of_domains <- function (gpart){
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
67 ID <- paste("TE_partial_", gpart$Cluster[1], sep="")
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
68 te_partial <- GRanges(type="transposable_element_partial",
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
69 strand=strand(gpart)[1],
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
70 ID = ID,
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
71 source = "dante_ltr",
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
72 seqnames=seqnames(gpart)[1],
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
73 Final_Classification=gpart$Final_Classification[1],
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
74 Name=gpart$Final_Classification[1],
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
75 IRanges(min(start(gpart)), max(end(gpart))))
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
76 gpart$Parent <- ID
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
77 return(c(te_partial,gpart))
0
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
78 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
79
7
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
80 # create partial TE from clusters
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
81 get_partial_te_from_cluster_of_domains <- function(gpart, ID) {
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
82 te_partial <- makeGRangesFromDataFrame(
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
83 data.frame(type = "transposable_element_partial",
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
84 strand = gpart$strand[1],
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
85 ID = ID,
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
86 source = "dante_ltr",
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
87 seqnames = gpart$seqnames[1],
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
88 Final_Classification =
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
89 gpart$Final_Classification[1],
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
90 Name = gpart$Final_Classification[1],
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
91 IRanges(min(gpart$start), max(gpart$end))),
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
92 keep.extra.columns = TRUE)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
93
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
94
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
95 gpart$Parent <- ID
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
96 gpart_gr = makeGRangesFromDataFrame(gpart, keep.extra.columns = TRUE)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
97 return(c(te_partial, gpart_gr))
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
98 }
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
99
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
100 # function to count for each element number of occurences:
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
101 count_occurences_for_each_element <- function(x) {
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
102 counts_unique <- table(x)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
103 counts <- counts_unique[x]
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
104 counts
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
105 }
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
106
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
107
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
108 clean_domain_clusters <- function(gcl, lineage_domain_span, min_domains) {
0
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
109 ## remove clusters wich does not have enough domains or domains
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
110 ## are on different strand
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
111 N_domains <- sapply(gcl, nrow)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
112 N_unique_domains <- sapply(gcl, function(x)length(unique(x$Name)))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
113 S <- sapply(gcl, function(x)paste(sort(unique(x$strand)), collapse = " "))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
114 S_OK <- S %in% c("+", "-")
7
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
115 max_span <- lineage_domain_span[sapply(gcl, function(x) x$Final_Classification[1])]
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
116 # set to zero if lineage is not covered in lineage domain span
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
117 max_span[is.na(max_span)] <- 0
0
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
118 span <- sapply(gcl, function(x)max(x$end) - min(x$start))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
119 cond1 <- S_OK &
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
120 N_unique_domains == N_domains &
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
121 N_domains >= min_domains &
7
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
122 span <= max_span
0
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
123 return(gcl[cond1])
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
124 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
125
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
126 check_ranges <- function(gx, s, offset = OFFSET) {
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
127 # check is range is not out of sequence length
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
128 START <- sapply(gx, function(x)min(x$start)) - offset
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
129 END <- sapply(gx, function(x)max(x$end)) + offset
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
130 MAX <- seqlengths(s)[sapply(gx, function(x)as.character(x$seqnames[1]))]
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
131 good_ranges <- (START > 0) & (END <= MAX)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
132 return(good_ranges)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
133 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
134
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
135 get_ranges <- function(gx, offset = OFFSET) {
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
136 S <- sapply(gx, function(x)min(x$start))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
137 E <- sapply(gx, function(x)max(x$end))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
138 gr <- GRanges(seqnames = sapply(gx, function(x)x$seqnames[1]), IRanges(start = S - offset, end = E + offset))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
139 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
140
8
9de392f2fc02 "planemo upload commit d6433b48c9bae079edb06364147f19500501c986"
petr-novak
parents: 7
diff changeset
141 get_ranges_left <- function(gx, offset = OFFSET, offset2 = 10) {
9de392f2fc02 "planemo upload commit d6433b48c9bae079edb06364147f19500501c986"
petr-novak
parents: 7
diff changeset
142 ## offset2 - how many nt cen LTR extend to closes protein domain
9de392f2fc02 "planemo upload commit d6433b48c9bae079edb06364147f19500501c986"
petr-novak
parents: 7
diff changeset
143 ## this is necassary as some detected proteins domains does not have correct bopundaries
9de392f2fc02 "planemo upload commit d6433b48c9bae079edb06364147f19500501c986"
petr-novak
parents: 7
diff changeset
144 ## if LTR retrotransposons insters to other protein domain.
0
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
145 S <- sapply(gx, function(x)min(x$start))
8
9de392f2fc02 "planemo upload commit d6433b48c9bae079edb06364147f19500501c986"
petr-novak
parents: 7
diff changeset
146 max_offset <- S - sapply(gx, function(x)min(x$upstream_domain)) + offset2
0
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
147 offset_adjusted <- ifelse(max_offset < offset, max_offset, offset)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
148 gr <- GRanges(seqnames = sapply(gx, function(x)x$seqnames[1]), IRanges(start = S - offset_adjusted, end = S + offset2))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
149 return(gr)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
150 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
151
8
9de392f2fc02 "planemo upload commit d6433b48c9bae079edb06364147f19500501c986"
petr-novak
parents: 7
diff changeset
152 get_ranges_right <- function(gx, offset = OFFSET, offset2 = 10) {
0
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
153 E <- sapply(gx, function(x)max(x$end))
8
9de392f2fc02 "planemo upload commit d6433b48c9bae079edb06364147f19500501c986"
petr-novak
parents: 7
diff changeset
154 max_offset <- sapply(gx, function(x)max(x$downstream_domain)) - E + offset2
0
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
155 offset_adjusted <- ifelse(max_offset < offset, max_offset, offset)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
156 gr <- GRanges(seqnames = sapply(gx, function(x)x$seqnames[1]), IRanges(start = E - offset2, end = E + offset_adjusted))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
157 return(gr)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
158 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
159
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
160 firstTG <- function(ss) {
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
161 x <- matchPattern("TG", ss)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
162 if (length(x) == 0) {
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
163 return(0)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
164 }else {
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
165 return(min(start(x)))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
166 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
167 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
168
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
169 lastCA <- function(ss) {
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
170 x <- matchPattern("CA", ss)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
171 if (length(x) == 0) {
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
172 return(0)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
173 }else {
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
174 return(max(start(x)))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
175 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
176 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
177
7
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
178 domain_distance <- function(d_query, d_reference){
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
179 if (d_query == d_reference){
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
180 return (0)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
181 }
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
182 d_query_p <- strsplit(d_query," ")[[1]]
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
183 d_reference_p <- strsplit(d_reference," ")[[1]]
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
184 d <- length(d_reference_p) - sum(d_query_p == d_reference_p[d_reference_p %in% d_query_p])
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
185 return(d)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
186 }
2
f131886ea194 "planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents: 0
diff changeset
187
f131886ea194 "planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents: 0
diff changeset
188
0
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
189 trim2TGAC <- function(bl) {
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
190 for (i in 1:nrow(bl)) {
2
f131886ea194 "planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents: 0
diff changeset
191 cons <- consensusString(c(bl$qseq[i], bl$sseq[i]), ambiguityMap="?")
f131886ea194 "planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents: 0
diff changeset
192 tg_P <- firstTG(cons)
f131886ea194 "planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents: 0
diff changeset
193 ca_P <- lastCA(cons)
f131886ea194 "planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents: 0
diff changeset
194 e_dist <- bl$length[i] - ca_P
8
9de392f2fc02 "planemo upload commit d6433b48c9bae079edb06364147f19500501c986"
petr-novak
parents: 7
diff changeset
195 max_dist <- 50 # was 25
2
f131886ea194 "planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents: 0
diff changeset
196 no_match <- any(tg_P == 0, ca_P == 0)
0
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
197 if (!no_match &
2
f131886ea194 "planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents: 0
diff changeset
198 tg_P < max_dist &
f131886ea194 "planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents: 0
diff changeset
199 e_dist < max_dist) {
0
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
200 ## trim alignment
2
f131886ea194 "planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents: 0
diff changeset
201 bl[i,] <- trim_blast_table(bl[i,], tg_P, e_dist - 1)
0
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
202 }
2
f131886ea194 "planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents: 0
diff changeset
203
0
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
204 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
205 return(bl)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
206 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
207
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
208 trim_blast_table <- function(b, T1, T2) {
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
209 b$qstart <- b$qstart + T1
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
210 b$sstart <- b$sstart + T1
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
211 b$qend <- b$qend - T2
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
212 b$send <- b$send - T2
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
213 b$sseq <- substring(b$sseq, T1, b$length - T2)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
214 b$qseq <- substring(b$qseq, T1, b$length - T2)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
215 b$length <- nchar(b$sseq)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
216 return(b)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
217 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
218
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
219 blast_all2all <- function(seqs, db=NULL, ncpus=1, word_size=20, perc_identity=90, max_target_seq = 5000, task = "megablast", additional_params= ""){
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
220 if (ncpus == 1){
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
221 query <- list(seqs)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
222 }else{
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
223 query <-split(seqs, round(seq(1,ncpus,length.out = length(seqs))))
7
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
224 if (length(query) < ncpus){
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
225 ncpus <- length(query)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
226 }
0
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
227 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
228 if(is.null(db)){
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
229 # search against itself
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
230 db <- seqs
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
231 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
232 qf <-tempfile(fileext=paste0("_",1:ncpus,".fasta"))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
233 outf <-tempfile(fileext=paste0("_",1:ncpus,".csv"))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
234 dbf <- tempfile()
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
235 script <- tempfile()
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
236 writeXStringSet(db, dbf)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
237 mapply(query, qf, FUN = writeXStringSet)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
238 cols <- "qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen"
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
239 cmd_db <- paste("makeblastdb -dbtype nucl -in ", dbf)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
240 cmd_blast <- paste("blastn -task ", task, " -word_size", word_size,
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
241 "-outfmt \"6 ", cols, "\" ",
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
242 "-perc_identity", perc_identity, " -min_raw_gapped_score 500",
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
243 "-max_target_seqs", max_target_seq, additional_params,
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
244 "-query", qf, "-db", dbf, "-out", outf,
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
245 "&"
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
246 )
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
247
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
248 # TODO - inspect only forward strand??
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
249 system(cmd_db)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
250 cmd_all <- paste(paste(cmd_blast,collapse="\n"),"\nwait")
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
251 cat(cmd_all, file = script)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
252 system(paste("sh ", script))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
253
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
254 bl_list <- lapply(outf, read.table, stringsAsFactors = FALSE, col.names = unlist(strsplit(cols, " ")), sep="\t", comment.char = "")
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
255 bl_table <- do.call(rbind, bl_list)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
256 unlink(qf)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
257 #unlink(outf)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
258 unlink(dbf)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
259 unlink(script)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
260 return(bl_table)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
261 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
262
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
263 identify_conflicts <- function (bl){
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
264 QL <- gsub(".+[|]", "", bl$qaccver)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
265 SL <- gsub(".+[|]", "", bl$saccver)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
266 id_with_conflict <- unique(c(bl$qaccver[QL != SL],bl$saccver[QL != SL]))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
267 id_ok <- setdiff(unique(c(bl$qaccver,bl$saccver)), id_with_conflict)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
268 single_hit <- sapply(
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
269 sapply(
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
270 split(bl$qaccver, bl$saccver), unique
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
271 ), length) == 1
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
272 id_with_no_hits <- names(single_hit)[single_hit] # except hit to itself
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
273 return(list(
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
274 ok = id_ok,
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
275 conflict = id_with_conflict,
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
276 no_hit = id_with_no_hits)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
277 )
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
278 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
279
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
280
7
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
281 analyze_TE <- function(seqs, ncpus = 10, word_size = 20, perc_identity = 90, ...){
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
282 blt <- blast_all2all(seqs, ncpus = ncpus, word_size = word_size, perc_identity = perc_identity, ...)
0
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
283 te_conflict_info <- identify_conflicts(blt)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
284 blt_te_ok <- blast_table_subset(blt, te_conflict_info$ok)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
285 te_ok_lineages <- split(blt_te_ok,
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
286 gsub(
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
287 ".+[|]",
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
288 "",
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
289 blt_te_ok$qaccver))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
290 gr_representative <- GRangesList(mclapply(te_ok_lineages,
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
291 FUN = get_representative_ranges,
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
292 mc.cores = ncpus
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
293 ))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
294 seqs_representative <- getSeq(seqs, Reduce(c, gr_representative))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
295 names(seqs_representative) <- seqnames(Reduce(c, gr_representative))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
296
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
297 return(
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
298 list(
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
299 seqs_representative = seqs_representative,
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
300 te_conflict_info = te_conflict_info,
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
301 gr_representative = gr_representative,
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
302 blast = blt
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
303 )
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
304 )
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
305 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
306
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
307 query_coverage <- function(blt){
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
308 blt <- blt[blt$qaccver != blt$saccver,]
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
309 Q_lengths <- blt$qlen[!duplicated(blt$qaccver)]
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
310 names(Q_lengths) <- blt$qaccver[!duplicated(blt$qaccver)]
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
311 gr <- GRanges(seqnames = blt$qaccver, seqlengths = Q_lengths,
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
312 IRanges(start = blt$qstart, end = blt$qend))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
313 return(coverage(gr))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
314 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
315
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
316 multiplicity_of_te <- function(blt){
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
317 # exclude self to self hits and calculate coverage + mean_multiplicity of TE
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
318 # assuption is that TE which are 'identical' to other TE from the same lineage are
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
319 # likely correct
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
320 blt_no_self <- blt[blt$qaccver != blt$saccver,]
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
321 cvr <- query_coverage(blt_no_self)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
322 L <- sapply(cvr, function(x) sum(width(x)))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
323 C1 <- sapply(cvr, function(x) sum(as.numeric(runValue(x) >= 1) * runLength(x)))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
324 multiplicity <- sapply(cvr, function(x) sum(as.numeric(runValue(x)) * runLength(x)))/L
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
325 data.frame(L = L, C1 = C1, multiplicity = multiplicity )
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
326 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
327
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
328 verify_based_on_multiplicity <- function(TE_info, min_coverage=0.99, min_multiplicity=3){
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
329 blt <- TE_info$blast[TE_info$blast$qaccver %in% TE_info$te_conflict_info$ok,]
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
330 mp <- multiplicity_of_te(blt)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
331 id_ok_mp_verified <- rownames(mp)[mp$C1/mp$L > min_coverage & mp$multiplicity >= min_multiplicity]
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
332 return(list(multiplicity = mp,
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
333 id_ok_mp_verified = id_ok_mp_verified))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
334
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
335 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
336
7
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
337 compare_TE_datasets <- function(Q, S, word_size = 20, min_coverage = 0.95, ncpus=10, ...){
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
338 blt <- blast_all2all(seqs = Q, db = S, ncpus = ncpus, word_size = word_size, ...)
0
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
339 QL <- gsub(".+[|]", "", blt$qaccver)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
340 SL <- gsub(".+[|]", "", blt$saccver)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
341 id_with_conflict <- unique(c(blt$qaccver[QL != SL]))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
342 id_ok <- setdiff(unique(blt$qaccver), id_with_conflict)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
343 # check coverage hits
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
344 blt_ok <- blt[blt$qaccver %in% id_ok,]
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
345 Q_lengths <- blt_ok$qlen[!duplicated(blt_ok$qaccver)]
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
346 names(Q_lengths) <- blt_ok$qaccver[!duplicated(blt_ok$qaccver)]
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
347 gr <- GRanges(seqnames = blt_ok$qaccver, seqlengths = Q_lengths,
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
348 IRanges(start = blt_ok$qstart, end = blt_ok$qend))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
349 cvr <- coverage(gr)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
350 L <- sapply(cvr, function(x) sum(width(x)))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
351 C1 <- sapply(cvr, function(x) sum(as.numeric(runValue(x) >= 1) * runLength(x)))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
352 Max_uncovered <- sapply(cvr, function(x){
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
353 if(any(runValue(x)==0)){
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
354 return(max(runLength(x)[runValue(x) == 0]))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
355 }else{
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
356 return(0)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
357 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
358 })
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
359
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
360 # verified based on hit to reference - S
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
361 C1_prop <- C1/L
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
362 pass <- (C1_prop >= min_coverage & Max_uncovered < 500)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
363 if (any(pass)){
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
364 id_ok_verified <- names(C1_prop)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
365 }else {
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
366 id_ok_verified <- NULL
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
367 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
368 return(list(id_with_conflict = id_with_conflict,
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
369 id_ok = id_ok,
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
370 id_ok_verified = id_ok_verified
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
371 ))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
372 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
373
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
374
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
375
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
376 blast_table_subset <- function(bl,id){
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
377 return(bl[bl$qaccver %in% id & bl$saccver %in% id,, drop = FALSE])
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
378 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
379
3
6ae4a341d1f3 "planemo upload commit 8bd6029a4de4a8f5031a5cc71303bb06217cc88a"
petr-novak
parents: 2
diff changeset
380 get_representative_ranges <- function(bl, min_length = 200, min_identity = 98){
6ae4a341d1f3 "planemo upload commit 8bd6029a4de4a8f5031a5cc71303bb06217cc88a"
petr-novak
parents: 2
diff changeset
381 bl <- bl[bl$pident>=min_identity, , drop=FALSE]
6ae4a341d1f3 "planemo upload commit 8bd6029a4de4a8f5031a5cc71303bb06217cc88a"
petr-novak
parents: 2
diff changeset
382 bl <- bl[bl$pident>=min_identity & bl$length >= min_length, , drop=FALSE]
0
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
383 score <- sort(unlist(by(bl$bitscore, bl$qaccver, sum, simplify = FALSE)),
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
384 decreasing = TRUE)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
385 L <- bl$qlen[!duplicated(bl$qaccver)]
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
386 names(L) <- bl$qaccver[!duplicated(bl$qaccver)]
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
387 gr <- GRanges(seqnames = bl$qaccver,
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
388 IRanges(start = bl$qstart, end = bl$qend),
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
389 seqlengths = L,
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
390 subject = bl$saccver,
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
391 sstart = ifelse(bl$send < bl$sstart, bl$send, bl$sstart),
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
392 send = ifelse(bl$send > bl$sstart, bl$send, bl$sstart))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
393 SN <- levels(seqnames(gr))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
394 inc <- rep(TRUE, length(gr))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
395 MSK <- GRangesList()
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
396 for (i in names(score)){
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
397 inc[gr$subject %in% i] <- FALSE
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
398 gr_part <- gr[seqnames(gr) %in% i & inc]
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
399 MSK[[i]] <- GRanges(seqnames = factor(gr_part$subject, levels = SN),
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
400 IRanges(start = gr_part$sstart, end = gr_part$send),
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
401 seqlengths = L
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
402 )
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
403 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
404 gout <- unlist(MSK)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
405
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
406 full_gr <- GRanges(seqnames = factor(SN, levels = SN),
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
407 IRanges(start = rep(1,length(L)),
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
408 end = L)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
409 )
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
410 unmasked_gr <- GenomicRanges::setdiff(full_gr, gout)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
411 return(unmasked_gr[width(unmasked_gr) >= min_length])
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
412 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
413
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
414 expected_diversity <- function(seqs, niter=100, km = 6){
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
415 L <- nchar(seqs)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
416 R <- matrix(ncol = niter, nrow = length(seqs))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
417 for (i in 1:niter){
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
418 seqs_rnd <- DNAStringSet(sapply(L, function(n) paste(sample(c("A", "C", "T", "G"), n, replace=TRUE), collapse="")))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
419 R[,i] <- seq_diversity(seqs_rnd, km = km)$richness
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
420 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
421 R
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
422
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
423 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
424
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
425 seq_diversity <- function (seqs, km=6){
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
426 K <- oligonucleotideFrequency(seqs, width=km)>0
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
427 P <- t(K)/rowSums(K)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
428 # shannon index
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
429 SI <- apply(P, 2, function(x) {x1 <- x[x>0]; -sum(x1*log(x1))})
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
430 # richness
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
431 R <- rowSums(K)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
432 list(richness=R, shannon_index=SI)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
433 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
434
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
435
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
436
7
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
437 blast <- function(s1, s2, expected_aln_lenght=NULL) {
0
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
438 tmp1 <- tempfile()
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
439 tmp2 <- tempfile()
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
440 tmp_out <- tempfile()
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
441 writeXStringSet(DNAStringSet(s1), tmp1)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
442 writeXStringSet(DNAStringSet(s2), tmp2)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
443 # alternative blast:
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
444 cmd <- paste("blastn -task blastn -word_size 7 -dust no -gapextend 1 -gapopen 2 -reward 1 -penalty -1",
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
445 " -query ", tmp1, ' -subject ', tmp2, ' -strand plus ',
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
446 '-outfmt "6 qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore qseq sseq"',
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
447 ' -out', tmp_out)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
448
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
449 system(cmd)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
450 out_raw <- read.table(tmp_out, as.is = TRUE, sep = "\t",
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
451 col.names = strsplit("qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore qseq sseq", split = ' ')[[1]])
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
452
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
453 if (nrow(out_raw) == 0) {
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
454 return(out_raw)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
455 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
456 out <- trim2TGAC(out_raw)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
457 # remove alingment shorted that
7
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
458 # out <- out[out$length > 100,]
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
459 # alngment must be at least 80% of expected LTRmin95
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
460 out <- out[out$length > (expected_aln_lenght *0.8),]
0
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
461 if (nrow(out) == 0) {
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
462 return(out)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
463 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
464 ## filter for TGCA
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
465 TG_L <- substring(out$qseq, 1, 2) == "TG"
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
466 TG_R <- substring(out$sseq, 1, 2) == "TG"
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
467 CA_L <- substring(out$qseq, out$length - 1, out$length) == "CA"
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
468 CA_R <- substring(out$sseq, out$length - 1, out$length) == "CA"
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
469 cond <- TG_L & TG_R & CA_L & CA_R
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
470 out <- out[cond, , drop = FALSE]
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
471 unlink(tmp1)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
472 unlink(tmp2)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
473 unlink(tmp_out)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
474 # TODO - detele all temp files!
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
475 # unlink(tmp1, tmp2, tmp_out)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
476 return(out)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
477 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
478
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
479 get_correct_coordinates <- function(b) {
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
480 do.call(rbind, strsplit(b$qaccver, split = "_"))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
481 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
482
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
483 evaluate_ltr <- function(GR, GRL, GRR, blast_line, Lseq, Rseq) {
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
484 LTR_L <- makeGRangesFromDataFrame(data.frame(seqnames = seqnames(GR),
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
485 start = start(GRL) + blast_line$qstart - 2,
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
486 end = start(GRL) + blast_line$qend - 1))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
487 LTR_R <- makeGRangesFromDataFrame(data.frame(seqnames = seqnames(GR),
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
488 start = start(GRR) + blast_line$sstart - 2,
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
489 end = start(GRR) + blast_line$send - 1))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
490
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
491 TSD_L <- makeGRangesFromDataFrame(data.frame(seqnames = seqnames(GR),
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
492 start = start(GRL) + blast_line$qstart - 3:10,
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
493 end = start(GRL) + blast_line$qstart - 3))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
494 TSD_R <- makeGRangesFromDataFrame(data.frame(seqnames = seqnames(GR),
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
495 start = start(GRR) + blast_line$send,
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
496 end = start(GRR) + blast_line$send + 0:7))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
497
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
498 TSD_L_seq <- DNAStringSet(substring(Lseq, blast_line$qstart - 2:9, blast_line$qstart - 2))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
499 TSD_R_seq <- DNAStringSet(substring(Rseq, blast_line$send + 1, blast_line$send + 1:8))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
500
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
501 matching_tsd <- TSD_R_seq == TSD_L_seq
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
502 matching_tsd[1:3] <- FALSE # remove short tsd
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
503 p <- which(matching_tsd)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
504 if (length(p) > 0) {
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
505 TSD_Length <- max(p)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
506 TSD_sequence <- TSD_L_seq[TSD_Length]
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
507 TSD_position <- append(TSD_R[TSD_Length], TSD_L[TSD_Length])
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
508 }else {
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
509 TSD_Length <- 0
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
510 TSD_sequence <- ""
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
511 TSD_position <- NA
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
512 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
513
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
514 TE_Length <- end(LTR_R) - start(LTR_L)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
515 LTR_Identity <- blast_line$pident
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
516 out <- list(TSD_position = TSD_position, TSD_sequence = TSD_sequence, TSD_Length = TSD_Length,
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
517 LTR_R_position = LTR_R, LTR_L_position = LTR_L, TE_Length = TE_Length, LTR_Identity = LTR_Identity)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
518 return(out)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
519 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
520
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
521 get_best_ltr <- function(x) {
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
522 tsd_ok <- sapply(x, function(k)k$TSD_Length > 3)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
523 te_length_ok <- sapply(x, function(k)k$TE_Length < 30000)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
524 ltr_length_ok <- sapply(x, function(k)width(k$LTR_R_position) >= 100 & width(k$LTR_L_position) >= 100)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
525 if (sum(tsd_ok & te_length_ok & ltr_length_ok) >= 1) {
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
526 # return the first one (best bitscore)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
527 return(x[tsd_ok & te_length_ok][1])
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
528 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
529 if (any(te_length_ok & ltr_length_ok)) {
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
530 return(x[te_length_ok & ltr_length_ok][1])
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
531 }else {
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
532 return(NULL)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
533 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
534 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
535
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
536 get_te_gff3 <- function(g, ID) {
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
537 D <- makeGRangesFromDataFrame(g$domain, keep.extra.columns = TRUE)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
538 sn <- seqnames(D)[1]
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
539 S <- strand(D)[1]
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
540 TE <- GRanges(seqnames = sn,
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
541 IRanges(start = start(g$ltr_info[[1]]$LTR_L_position),
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
542 end = end(g$ltr_info[[1]]$LTR_R_position)), strand = S)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
543 TE$type <- "transposable_element"
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
544 TE$ID <- ID
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
545
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
546 if (as.character(S) == "+") {
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
547 LTR_5 <- g$ltr_info[[1]]$LTR_L
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
548 LTR_3 <- g$ltr_info[[1]]$LTR_R
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
549 }else {
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
550 LTR_3 <- g$ltr_info[[1]]$LTR_L
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
551 LTR_5 <- g$ltr_info[[1]]$LTR_R
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
552 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
553 LTR_5$LTR <- '5LTR'
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
554 LTR_3$LTR <- '3LTR'
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
555 LTR_5$type <- "long_terminal_repeat"
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
556 LTR_3$type <- "long_terminal_repeat"
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
557 strand(LTR_3) <- S
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
558 strand(LTR_5) <- S
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
559 LTR_3$Parent <- ID
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
560 LTR_5$Parent <- ID
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
561 LTR_3$Final_Classification <- D$Final_Classification[1]
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
562 LTR_5$Final_Classification <- D$Final_Classification[1]
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
563 LTR_5$LTR_Identity <- g$ltr_info[[1]]$LTR_Identity
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
564 LTR_3$LTR_Identity <- g$ltr_info[[1]]$LTR_Identity
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
565
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
566 TE$LTR_Identity <- g$ltr_info[[1]]$LTR_Identity
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
567 TE$LTR5_length <- width(LTR_5)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
568 TE$LTR3_length <- width(LTR_3)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
569
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
570 if (is.na(g$ltr_info[[1]]$TSD_position)[1]) {
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
571 # no TSD found
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
572 TSD <- NULL
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
573 TE$TSD <- 'not_found'
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
574 }else {
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
575 TSD <- g$ltr_info[[1]]$TSD_position
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
576 TSD$type <- "target_site_duplication"
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
577 TSD$Parent <- ID
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
578 TE$TSD <- as.character(g$ltr_info[[1]]$TSD_sequence)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
579 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
580
7
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
581 TE$Name <- TE$Final_Classification <- D$Final_Classification[1]
0
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
582
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
583 D$Parent <- ID
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
584 out <- c(TE, LTR_3, LTR_5, D, TSD)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
585 return(out)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
586 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
587
7
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
588 get_TE <- function(Lseq, Rseq, domains_gff, GR, GRL, GRR,LTR_length) {
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
589 xx <- blast(Lseq, Rseq, LTR_length)
0
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
590 if (nrow(xx) == 0) {
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
591 return(NULL)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
592 }else {
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
593 ltr_tmp <- list()
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
594 for (j in 1:nrow(xx)) {
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
595 ltr_tmp[[j]] <- evaluate_ltr(GR, GRL, GRR, xx[j, , drop = FALSE], Lseq, Rseq)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
596 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
597 ltr <- get_best_ltr(ltr_tmp)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
598 if (length(ltr) == 0) {
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
599 return(NULL)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
600 ## add good ltr
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
601 }else {
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
602 return(list(domain = domains_gff, ltr_info = ltr, blast_out = xx))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
603 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
604 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
605 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
606
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
607 add_pbs <- function(te, s, trna_db) {
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
608 ltr5 <- te[which(te$LTR == "5LTR")]
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
609 STRAND <- as.character(strand(te)[1])
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
610 if (STRAND == "+") {
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
611 pbs_gr <- GRanges(seqnames(ltr5), IRanges(start = end(ltr5) + 1, end = end(ltr5) + 31))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
612 pbs_s <- reverseComplement(getSeq(s, pbs_gr))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
613 }else {
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
614 pbs_gr <- GRanges(seqnames(ltr5), IRanges(end = start(ltr5) - 1, start = start(ltr5) - 30))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
615 pbs_s <- getSeq(s, pbs_gr)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
616 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
617
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
618 names(pbs_s) <- "pbs_region"
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
619 # find trna match
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
620 tmp <- tempfile()
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
621 tmp_out <- tempfile()
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
622 writeXStringSet(DNAStringSet(pbs_s), tmp)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
623 # alternative blast:
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
624 cmd <- paste("blastn -task blastn -word_size 7 -dust no -perc_identity 100",
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
625 " -query ", tmp, ' -db ', trna_db, ' -strand plus ',
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
626 '-outfmt "6 qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore qseq sseq"',
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
627 ' -out', tmp_out)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
628
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
629 system(cmd)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
630 pbs_exact_gr <- NULL
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
631 out_raw <- read.table(tmp_out, as.is = TRUE, sep = "\t",
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
632 col.names = strsplit(
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
633 "qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore qseq sseq",
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
634 split = ' ')[[1]])
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
635 if (nrow(out_raw) > 0) {
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
636 cca <- grepl("CCA$", out_raw$qseq)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
637 to_end <- out_raw$send == 23 # align to end of sequence
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
638 max_dist <- out_raw$qend > 25 # max 5 bp from ltr
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
639 min_length <- out_raw$length >= 10
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
640 out_pass <- out_raw[cca & to_end & max_dist & min_length,]
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
641 if (nrow(out_pass) > 0) {
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
642 trna_id <- out_pass$saccver[1]
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
643 if (STRAND == "+") {
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
644 S <- end(ltr5) + 32 - out_pass$qend[1]
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
645 E <- end(ltr5) + 32 - out_pass$qstart[1]
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
646 }else {
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
647 S <- start(ltr5) - 31 + out_pass$qstart[1]
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
648 E <- start(ltr5) - 31 + out_pass$qend[1]
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
649 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
650 pbs_exact_gr <- GRanges(seqnames(ltr5), IRanges(start = S, end = E))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
651 pbs_exact_gr$trna_id <- trna_id
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
652 pbs_exact_gr$Length <- out_pass$Length
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
653 strand(pbs_exact_gr) <- STRAND
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
654 pbs_exact_gr$type <- 'primer_binding_site'
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
655 pbs_exact_gr$Parent <- te[1]$ID
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
656 te$trna_id <- c(trna_id, rep(NA, length(te) - 1))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
657
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
658 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
659 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
660 te <- append(te, pbs_exact_gr)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
661 unlink(tmp)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
662 unlink(tmp_out)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
663 return(te)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
664 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
665
2
f131886ea194 "planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents: 0
diff changeset
666
f131886ea194 "planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents: 0
diff changeset
667 get_te_rank <- function (gr){
f131886ea194 "planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents: 0
diff changeset
668 DL_id <- gr$ID[gr$type == "transposable_element" &
f131886ea194 "planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents: 0
diff changeset
669 gr$TSD == "not_found" &
f131886ea194 "planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents: 0
diff changeset
670 is.na(gr$trna_id)]
f131886ea194 "planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents: 0
diff changeset
671 DLT_id <- gr$ID[gr$type == "transposable_element" &
f131886ea194 "planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents: 0
diff changeset
672 gr$TSD != "not_found" &
f131886ea194 "planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents: 0
diff changeset
673 is.na(gr$trna_id)]
f131886ea194 "planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents: 0
diff changeset
674 DLTP_id <- gr$ID[gr$type == "transposable_element" &
f131886ea194 "planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents: 0
diff changeset
675 gr$TSD != "not_found" &
f131886ea194 "planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents: 0
diff changeset
676 !is.na(gr$trna_id)]
f131886ea194 "planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents: 0
diff changeset
677 DLP_id <- gr$ID[gr$type == "transposable_element" &
f131886ea194 "planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents: 0
diff changeset
678 gr$TSD == "not_found" &
f131886ea194 "planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents: 0
diff changeset
679 !is.na(gr$trna_id)]
f131886ea194 "planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents: 0
diff changeset
680 Rank <- character(length(gr))
f131886ea194 "planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents: 0
diff changeset
681 ID <- unlist(ifelse(is.na(gr$ID), gr$Parent, gr$ID))
f131886ea194 "planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents: 0
diff changeset
682
f131886ea194 "planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents: 0
diff changeset
683 Rank[ID %in% DL_id] <- "DL"
f131886ea194 "planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents: 0
diff changeset
684 Rank[ID %in% DLT_id] <- "DLT"
f131886ea194 "planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents: 0
diff changeset
685 Rank[ID %in% DLP_id] <- "DLP"
f131886ea194 "planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents: 0
diff changeset
686 Rank[ID %in% DLTP_id] <- "DLTP"
f131886ea194 "planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents: 0
diff changeset
687 return(Rank)
f131886ea194 "planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents: 0
diff changeset
688 }
f131886ea194 "planemo upload commit 891bfe9acf7349c2b887aff6d7e52a7f4ebf3b3a"
petr-novak
parents: 0
diff changeset
689
8
9de392f2fc02 "planemo upload commit d6433b48c9bae079edb06364147f19500501c986"
petr-novak
parents: 7
diff changeset
690 dante_filtering <- function(dante_gff, min_similarity=0.4,
9de392f2fc02 "planemo upload commit d6433b48c9bae079edb06364147f19500501c986"
petr-novak
parents: 7
diff changeset
691 min_identity=0.2, Relative_Length=0.6,
9de392f2fc02 "planemo upload commit d6433b48c9bae079edb06364147f19500501c986"
petr-novak
parents: 7
diff changeset
692 min_relat_interuptions=8) {
9de392f2fc02 "planemo upload commit d6433b48c9bae079edb06364147f19500501c986"
petr-novak
parents: 7
diff changeset
693 include <- as.numeric(dante_gff$Similarity) >= min_similarity &
9de392f2fc02 "planemo upload commit d6433b48c9bae079edb06364147f19500501c986"
petr-novak
parents: 7
diff changeset
694 as.numeric(dante_gff$Identity) >= min_identity &
9de392f2fc02 "planemo upload commit d6433b48c9bae079edb06364147f19500501c986"
petr-novak
parents: 7
diff changeset
695 as.numeric(dante_gff$Relat_Length) >= Relative_Length &
9de392f2fc02 "planemo upload commit d6433b48c9bae079edb06364147f19500501c986"
petr-novak
parents: 7
diff changeset
696 as.numeric(dante_gff$Relat_Interruptions) <= min_relat_interuptions
9de392f2fc02 "planemo upload commit d6433b48c9bae079edb06364147f19500501c986"
petr-novak
parents: 7
diff changeset
697
9de392f2fc02 "planemo upload commit d6433b48c9bae079edb06364147f19500501c986"
petr-novak
parents: 7
diff changeset
698 include[is.na(include)] <- FALSE
9de392f2fc02 "planemo upload commit d6433b48c9bae079edb06364147f19500501c986"
petr-novak
parents: 7
diff changeset
699 return(dante_gff[include])
9de392f2fc02 "planemo upload commit d6433b48c9bae079edb06364147f19500501c986"
petr-novak
parents: 7
diff changeset
700 }
7
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
701
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
702 get_te_statistics <- function(gr, RT){
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
703 Ranks <- c("D", "DL", "DLT", "DLP", "DLTP")
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
704 all_class <- names(sort(table(RT$Final_Classification), decreasing = TRUE))
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
705 RT_domain <- as.integer(table(factor(RT$Final_Classification, levels = all_class)))
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
706 rank_table <- list()
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
707 for (i in 1:length(Ranks)) {
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
708 gr_part <- gr[gr$type == "transposable_element" & gr$Rank == Ranks[i]]
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
709 rank_table[[Ranks[i]]] <- as.integer(table(factor(gr_part$Final_Classification, levels = all_class)))
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
710
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
711 }
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
712 out <- cbind(do.call(cbind, rank_table), RT_domain=RT_domain)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
713 out <- rbind(out, Total = colSums(out))
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
714 rownames(out) <- c(all_class, "Total")
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
715 return(out)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
716 }
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
717
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
718
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
719 get_te_statistics_old <- function(gr, RT) {
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
720 DOMAINS <- gr[gr$type == "transposable_element" & gr$Rank == "D"]
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
721 DOMAINS_LTR <- gr[gr$type == "transposable_element" & gr$Rank == "DL"]
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
722 DOMAINS_LTR_TSD <- gr[gr$type == "transposable_element" & gr$Rank == "DLT"]
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
723 DOMAINS_LTR_TSD_PBS <- gr[gr$type == "transposable_element" & gr$Rank == "DLTP"]
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
724 DOMAINS_LTR_PBS <- gr[gr$type == "transposable_element" & gr$Rank == "DLP"]
0
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
725
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
726 all_class <- names(sort(table(RT$Final_Classification), decreasing = TRUE))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
727 RT_domain <- as.integer(table(factor(RT$Final_Classification, levels = all_class)))
7
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
728 D <- as.integer(table(factor(DOMAINS$Final_Classification, levels = all_class)))
0
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
729 DL <- as.integer(table(factor(DOMAINS_LTR$Final_Classification, levels = all_class)))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
730 DLT <- as.integer(table(factor(DOMAINS_LTR_TSD$Final_Classification, levels = all_class)))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
731 DLTP <- as.integer(table(factor(DOMAINS_LTR_TSD_PBS$Final_Classification, levels = all_class)))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
732 DLP <- as.integer(table(factor(DOMAINS_LTR_PBS$Final_Classification, levels = all_class)))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
733 out <- data.frame(RT_domain = RT_domain,
7
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
734 DOMAINS = D,
0
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
735 DOMAINS_LTR = DL,
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
736 DOMAINS_LTR_TSD = DLT,
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
737 DOMAINS_LTR_PBS = DLP,
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
738 DOMAINS_LTR_TSD_PBS = DLTP,
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
739 row.names = all_class
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
740 )
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
741 total <- colSums(out)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
742 out <- rbind(out, Total = total)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
743 return(out)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
744 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
745
3
6ae4a341d1f3 "planemo upload commit 8bd6029a4de4a8f5031a5cc71303bb06217cc88a"
petr-novak
parents: 2
diff changeset
746 getSeqNamed <- function(s, gr, name = NULL) {
0
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
747 spart <- getSeq(s, gr)
3
6ae4a341d1f3 "planemo upload commit 8bd6029a4de4a8f5031a5cc71303bb06217cc88a"
petr-novak
parents: 2
diff changeset
748 if (is.null(name)){
6ae4a341d1f3 "planemo upload commit 8bd6029a4de4a8f5031a5cc71303bb06217cc88a"
petr-novak
parents: 2
diff changeset
749 id1 <- paste0(seqnames(gr), '_', start(gr), "_", end(gr))
6ae4a341d1f3 "planemo upload commit 8bd6029a4de4a8f5031a5cc71303bb06217cc88a"
petr-novak
parents: 2
diff changeset
750 }else{
6ae4a341d1f3 "planemo upload commit 8bd6029a4de4a8f5031a5cc71303bb06217cc88a"
petr-novak
parents: 2
diff changeset
751 id1 <- mcols(gr)[,name]
6ae4a341d1f3 "planemo upload commit 8bd6029a4de4a8f5031a5cc71303bb06217cc88a"
petr-novak
parents: 2
diff changeset
752 }
0
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
753 id2 <- gr$Final_Classification
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
754 names(spart) <- paste0(id1, "#", id2)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
755 spart
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
756 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
757
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
758 get_TE_id <- function (gr){
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
759 gr_te <- gr[gr$type == "transposable_element"]
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
760 ID <- gr_te$ID
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
761 A <- paste0(seqnames(gr_te), '_', start(gr_te), "_", end(gr_te))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
762 B <- gr_te$Final_Classification
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
763 names(ID) <- paste0(A, "#", B)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
764
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
765 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
766
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
767
7
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
768 get_te_sequences <- function (gr, s) {
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
769 Ranks <- c("D", "DL", "DLT", "DLP", "DLTP")
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
770 s_te <- list()
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
771 for (i in 1:length(Ranks)) {
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
772 gr_te <- gr[gr$type == "transposable_element" & gr$Rank == Ranks[i]]
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
773 if (length(gr_te) > 0) {
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
774 s_te[[Ranks[i]]] <- getSeqNamed(s, gr_te)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
775 }
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
776 }
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
777 return(s_te)
0
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
778 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
779
7
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
780
0
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
781 cd_hit_est <- function(seqs, min_identity = 0.9, word_size = 10, ncpu = 2){
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
782 # runs cd-hi-est and return table with cluster membership, and size and if reads was repesentative
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
783 # input sequences must be in the same orientation!!!
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
784 sfile <- tempfile()
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
785 fasta_out <- tempfile()
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
786 clstr <- paste0(fasta_out,".clstr")
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
787 # cdhit is triming names!!
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
788 ori_names <- names(seqs)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
789 names(seqs) <- seq_along(seqs)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
790 writeXStringSet(seqs, sfile)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
791 cmd <- paste("cd-hit-est",
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
792 "-i", sfile,
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
793 "-o", fasta_out,
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
794 "-c", min_identity,
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
795 "-n", word_size,
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
796 "-T", ncpu,
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
797 "-r", 0)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
798 system(cmd)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
799 cls_raw <- grep("^>", readLines(clstr), invert = TRUE, value = TRUE)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
800 unlink(fasta_out)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
801 unlink(clstr)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
802 index <- gsub("\t.+","",cls_raw)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
803 id <- as.numeric(gsub("[.].+","",
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
804 gsub(".+>", "", cls_raw))
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
805 )
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
806 is_representative <- id %in% id[grep("[*]$",cls_raw)]
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
807 membership <- cumsum(index=="0")
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
808 cluster_size <- tabulate(membership)[membership]
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
809 # reorder
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
810 ord <- order(id)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
811 cls_info <- data.frame(
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
812 seq_id = ori_names,
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
813 membership = membership[ord],
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
814 cluster_size = cluster_size[ord],
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
815 is_representative = is_representative[ord]
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
816 )
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
817 return(cls_info)
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
818 }
7b0bbe7477c4 "planemo upload commit 92c684dff3b377c8c08654c7f3d46a133385e3e0-dirty"
petr-novak
parents:
diff changeset
819
7
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
820
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
821 dante2dist <- function(dc){
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
822 # dc - cluster of domains, granges
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
823 dpos <- get_dom_pos(dc)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
824 D <- c(dist(dpos))
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
825 NN <- combn(names(dpos), 2)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
826 # order feature in pair alphabeticaly
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
827 N1 <- ifelse(NN[1,]<NN[2,], NN[1,], NN[2, ])
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
828 N2 <- ifelse(NN[1,]>NN[2,], NN[1,], NN[2, ])
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
829 dfd <- data.frame(F1 = N1, F2 = N2, distance = D)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
830 }
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
831
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
832
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
833
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
834 dante_to_quantiles <- function(dc, model_function, lineage=NULL){
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
835 dfd <- dante2dist(dc)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
836 if (is.null(lineage)){
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
837 lineage <- dc$Final_Classification[1]
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
838 }
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
839 # check if lineage has defined ecdf
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
840 if (lineage %in% names(model_function$plus)){
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
841 fn <- model_function$plus[[lineage]]
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
842 fnm <- model_function$minus[[lineage]]
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
843 dstat1 <- mapply(mapply(dfd$F1, dfd$F2, FUN=function(a, b)fn[[a]][[b]]), dfd$distance, FUN=function(f, x)f(x))
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
844 dstat2 <- mapply(mapply(dfd$F1, dfd$F2, FUN=function(a, b)fnm[[a]][[b]]), dfd$distance, FUN=function(f, x)f(-x))
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
845 dout <- cbind(dfd, dstat1, dstat2, dstat12 = ifelse(dstat1>dstat2, dstat2, dstat1))
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
846 return(dout)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
847 }else{
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
848 return(NULL)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
849 }
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
850 }
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
851
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
852 get_dom_pos <- function(g){
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
853 if (length(g) == 0){
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
854 return(NULL)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
855 }
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
856 gdf <- data.frame(rexdb = as.character(seqnames(g)),
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
857 domain = g$Name,
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
858 S = start(g),
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
859 E = end(g),
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
860 stringsAsFactors = FALSE)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
861 gSmat <- xtabs(S ~ rexdb + domain, data = gdf)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
862 colnames(gSmat) <- paste0(colnames(gSmat),"_S")
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
863 gEmat <- xtabs(E ~ rexdb + domain, data = gdf)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
864 colnames(gEmat) <- paste0(colnames(gEmat),"_E")
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
865 SEmat <- cbind(gSmat,gEmat)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
866 # reorder
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
867 dom_position <- colMeans(SEmat)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
868 SEmat_sorted <- SEmat[,order(dom_position)]
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
869 return(SEmat_sorted)
c33d6583e548 "planemo upload commit 50884f7f0269a0bbde078f24fe5020975693bcd9"
petr-novak
parents: 3
diff changeset
870 }