comparison clean_rm_output.R @ 0:cf3cea0a3039 draft

Uploaded
author petr-novak
date Thu, 07 Oct 2021 06:07:34 +0000
parents
children e955b40ad3a4
comparison
equal deleted inserted replaced
-1:000000000000 0:cf3cea0a3039
1 #!/usr/bin/env Rscript
2 suppressPackageStartupMessages(library(rtracklayer))
3
4 gff_cleanup = function(gff){
5 ## remove overlapin annotation track - assign new annot
6 gff_disjoin = disjoin(gff, with.revmap=TRUE)
7 ## append annotation:
8 gff_names = mclapply(as.list(gff_disjoin$revmap), FUN = function(x)gff$Name[x], mc.cores = 8)
9 gff_strands = mclapply(as.list(gff_disjoin$revmap), FUN = function(x)strand(gff[x]), mc.cores = 8)
10 new_annot = sapply(sapply(gff_names, unique), paste, collapse="|")
11 strand_attribute = sapply(sapply(gff_strands, unique), paste, collapse="|")
12 gff_disjoin$strands=strand_attribute
13 gff_disjoin$source="RM"
14 gff_disjoin$type="repeat"
15 gff_disjoin$score=NA
16 gff_disjoin$phase=NA
17 gff_disjoin$Name=new_annot
18 gff_disjoin$revmap=NULL
19 return(gff_disjoin)
20 }
21 infile = commandArgs(T)[1]
22 outfile = commandArgs(T)[2]
23
24 ## infile = "./test_data/raw_rm.out"
25
26 rm_out = read.table(infile, as.is=TRUE, sep="", skip = 2, fill=TRUE, header=FALSE)
27
28 gff = GRanges(seqnames = rm_out$V5, ranges = IRanges(start = rm_out$V6, end=rm_out$V7))
29
30 # repeat class after # symbol - syntax 1
31 gff$Name=rm_out$V11
32
33 ## is repeat type is specifies by double underscore:
34 ## then rm_out$V11 is unspecified
35 if (any(rm_out$V11 == "Unspecified")){
36 ## set Name from prefix
37 ## TODO
38 inc = rm_out$V11 == "Unspecified"
39 Name = gsub("__.+","",rm_out$V10)
40 # chanche Usnpsecified to new name
41 gff$Name[inc] = Name
42 }
43
44
45 ## join neighbors with the same annotation, disregard strand!
46 result <- unlist(reduce(split(gff, gff$Name)))
47 result$Name <- names(result)
48
49 ## TODO
50 ## identify conflicting annotation, replace by LCA but keep origin list of classifications
51
52 gff_out = sortSeqlevels(result)
53 gff_out = sort(gff_out)
54 gff_out$type = "repeat_region"
55 gff_out$source = "RepeatMasker_parsed"
56 gff_out$ID=paste0(gff_out$Name, "_", seq_along(gff_out$Name))
57 export(gff_out, format = "gff3", con=outfile)
58
59