Mercurial > repos > petr-novak > repeat_annotation_pipeline2
comparison clean_rm_output.R @ 3:e955b40ad3a4 draft default tip
Uploaded
author | petr-novak |
---|---|
date | Tue, 12 Oct 2021 07:43:54 +0000 |
parents | cf3cea0a3039 |
children |
comparison
equal
deleted
inserted
replaced
2:3f8ae272f4f3 | 3:e955b40ad3a4 |
---|---|
6 gff_disjoin = disjoin(gff, with.revmap=TRUE) | 6 gff_disjoin = disjoin(gff, with.revmap=TRUE) |
7 ## append annotation: | 7 ## append annotation: |
8 gff_names = mclapply(as.list(gff_disjoin$revmap), FUN = function(x)gff$Name[x], mc.cores = 8) | 8 gff_names = mclapply(as.list(gff_disjoin$revmap), FUN = function(x)gff$Name[x], mc.cores = 8) |
9 gff_strands = mclapply(as.list(gff_disjoin$revmap), FUN = function(x)strand(gff[x]), mc.cores = 8) | 9 gff_strands = mclapply(as.list(gff_disjoin$revmap), FUN = function(x)strand(gff[x]), mc.cores = 8) |
10 new_annot = sapply(sapply(gff_names, unique), paste, collapse="|") | 10 new_annot = sapply(sapply(gff_names, unique), paste, collapse="|") |
11 new_annot_uniq = unique(new_annot) | |
12 lca_annot = sapply(strsplit(new_annot_uniq, "|", fixed = TRUE), resolve_name) | |
13 names(lca_annot) = new_annot_uniq | |
14 new_annot_lca = lca_annot[new_annot] | |
15 #new_annot_lca = sapply(sapply(gff_names, unique), resolve_name) | |
11 strand_attribute = sapply(sapply(gff_strands, unique), paste, collapse="|") | 16 strand_attribute = sapply(sapply(gff_strands, unique), paste, collapse="|") |
12 gff_disjoin$strands=strand_attribute | 17 gff_disjoin$strands=strand_attribute |
13 gff_disjoin$source="RM" | 18 gff_disjoin$source="RM" |
14 gff_disjoin$type="repeat" | 19 gff_disjoin$type="repeat" |
15 gff_disjoin$score=NA | 20 gff_disjoin$score=NA |
16 gff_disjoin$phase=NA | 21 gff_disjoin$phase=NA |
17 gff_disjoin$Name=new_annot | 22 gff_disjoin$Name=new_annot_lca |
23 gff_disjoin$Original_names=new_annot | |
18 gff_disjoin$revmap=NULL | 24 gff_disjoin$revmap=NULL |
19 return(gff_disjoin) | 25 return(gff_disjoin) |
20 } | 26 } |
27 | |
28 resolve_name=function(x){ | |
29 if (length(x)==1){ | |
30 # no conflict | |
31 return(x) | |
32 } else{ | |
33 y = sapply(x, strsplit, split="/", fixed = TRUE) | |
34 ny = table(unlist(sapply(y, function(x)paste(seq_along(x),x)))) | |
35 if (max(ny)<length(x)){ | |
36 return("Unknown") | |
37 }else{ | |
38 k = which(ny==length(x)) | |
39 r = max(as.numeric((gsub(" .+","",names(k))))) | |
40 out = paste(y[[1]][1:r], collapse="/") | |
41 return(out) | |
42 } | |
43 } | |
44 } | |
45 | |
46 | |
47 | |
21 infile = commandArgs(T)[1] | 48 infile = commandArgs(T)[1] |
22 outfile = commandArgs(T)[2] | 49 outfile = commandArgs(T)[2] |
23 | 50 |
24 ## infile = "./test_data/raw_rm.out" | 51 ## infile = "./test_data/raw_rm.out" |
25 | 52 |
42 } | 69 } |
43 | 70 |
44 | 71 |
45 ## join neighbors with the same annotation, disregard strand! | 72 ## join neighbors with the same annotation, disregard strand! |
46 result <- unlist(reduce(split(gff, gff$Name))) | 73 result <- unlist(reduce(split(gff, gff$Name))) |
74 | |
47 result$Name <- names(result) | 75 result$Name <- names(result) |
76 | |
77 result_clean = gff_cleanup(result) | |
48 | 78 |
49 ## TODO | 79 ## TODO |
50 ## identify conflicting annotation, replace by LCA but keep origin list of classifications | 80 ## identify conflicting annotation, replace by LCA but keep origin list of classifications |
51 | 81 |
52 gff_out = sortSeqlevels(result) | 82 gff_out = sortSeqlevels(result_clean) |
53 gff_out = sort(gff_out) | 83 gff_out = sort(gff_out) |
54 gff_out$type = "repeat_region" | 84 gff_out$type = "repeat_region" |
55 gff_out$source = "RepeatMasker_parsed" | 85 gff_out$source = "RepeatMasker_parsed" |
56 gff_out$ID=paste0(gff_out$Name, "_", seq_along(gff_out$Name)) | 86 gff_out$ID=paste0(gff_out$Name, "_", seq_along(gff_out$Name)) |
57 export(gff_out, format = "gff3", con=outfile) | 87 export(gff_out, format = "gff3", con=outfile) |