Mercurial > repos > petr-novak > repeat_annotation_pipeline2
comparison clean_rm_output.R @ 0:cf3cea0a3039 draft
Uploaded
author | petr-novak |
---|---|
date | Thu, 07 Oct 2021 06:07:34 +0000 |
parents | |
children | e955b40ad3a4 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:cf3cea0a3039 |
---|---|
1 #!/usr/bin/env Rscript | |
2 suppressPackageStartupMessages(library(rtracklayer)) | |
3 | |
4 gff_cleanup = function(gff){ | |
5 ## remove overlapin annotation track - assign new annot | |
6 gff_disjoin = disjoin(gff, with.revmap=TRUE) | |
7 ## append annotation: | |
8 gff_names = mclapply(as.list(gff_disjoin$revmap), FUN = function(x)gff$Name[x], mc.cores = 8) | |
9 gff_strands = mclapply(as.list(gff_disjoin$revmap), FUN = function(x)strand(gff[x]), mc.cores = 8) | |
10 new_annot = sapply(sapply(gff_names, unique), paste, collapse="|") | |
11 strand_attribute = sapply(sapply(gff_strands, unique), paste, collapse="|") | |
12 gff_disjoin$strands=strand_attribute | |
13 gff_disjoin$source="RM" | |
14 gff_disjoin$type="repeat" | |
15 gff_disjoin$score=NA | |
16 gff_disjoin$phase=NA | |
17 gff_disjoin$Name=new_annot | |
18 gff_disjoin$revmap=NULL | |
19 return(gff_disjoin) | |
20 } | |
21 infile = commandArgs(T)[1] | |
22 outfile = commandArgs(T)[2] | |
23 | |
24 ## infile = "./test_data/raw_rm.out" | |
25 | |
26 rm_out = read.table(infile, as.is=TRUE, sep="", skip = 2, fill=TRUE, header=FALSE) | |
27 | |
28 gff = GRanges(seqnames = rm_out$V5, ranges = IRanges(start = rm_out$V6, end=rm_out$V7)) | |
29 | |
30 # repeat class after # symbol - syntax 1 | |
31 gff$Name=rm_out$V11 | |
32 | |
33 ## is repeat type is specifies by double underscore: | |
34 ## then rm_out$V11 is unspecified | |
35 if (any(rm_out$V11 == "Unspecified")){ | |
36 ## set Name from prefix | |
37 ## TODO | |
38 inc = rm_out$V11 == "Unspecified" | |
39 Name = gsub("__.+","",rm_out$V10) | |
40 # chanche Usnpsecified to new name | |
41 gff$Name[inc] = Name | |
42 } | |
43 | |
44 | |
45 ## join neighbors with the same annotation, disregard strand! | |
46 result <- unlist(reduce(split(gff, gff$Name))) | |
47 result$Name <- names(result) | |
48 | |
49 ## TODO | |
50 ## identify conflicting annotation, replace by LCA but keep origin list of classifications | |
51 | |
52 gff_out = sortSeqlevels(result) | |
53 gff_out = sort(gff_out) | |
54 gff_out$type = "repeat_region" | |
55 gff_out$source = "RepeatMasker_parsed" | |
56 gff_out$ID=paste0(gff_out$Name, "_", seq_along(gff_out$Name)) | |
57 export(gff_out, format = "gff3", con=outfile) | |
58 | |
59 |