comparison clean_rm_output.R @ 2:7f1032da7a0a draft

Uploaded
author petr-novak
date Mon, 21 Feb 2022 10:35:13 +0000
parents 814cba36e435
children
comparison
equal deleted inserted replaced
1:814cba36e435 2:7f1032da7a0a
14 lca_annot = sapply(strsplit(new_annot_uniq, "|", fixed = TRUE), resolve_name) 14 lca_annot = sapply(strsplit(new_annot_uniq, "|", fixed = TRUE), resolve_name)
15 names(lca_annot) = new_annot_uniq 15 names(lca_annot) = new_annot_uniq
16 new_annot_lca = lca_annot[new_annot] 16 new_annot_lca = lca_annot[new_annot]
17 #new_annot_lca = sapply(sapply(gff_names, unique), resolve_name) 17 #new_annot_lca = sapply(sapply(gff_names, unique), resolve_name)
18 strand_attribute = sapply(sapply(gff_strands, unique), paste, collapse="|") 18 strand_attribute = sapply(sapply(gff_strands, unique), paste, collapse="|")
19 gff_disjoin$strands=strand_attribute
20 gff_disjoin$source="RM" 19 gff_disjoin$source="RM"
21 gff_disjoin$type="repeat" 20 gff_disjoin$type="repeat"
22 gff_disjoin$score=NA 21 gff_disjoin$score=NA
23 gff_disjoin$phase=NA 22 gff_disjoin$phase=NA
24 gff_disjoin$Name=new_annot_lca 23 gff_disjoin$Name=new_annot_lca
25 gff_disjoin$Original_names=new_annot 24 gff_disjoin$Original_names=new_annot
25 gff_disjoin$strands=strand_attribute
26 gff_disjoin$revmap=NULL 26 gff_disjoin$revmap=NULL
27 return(gff_disjoin) 27 return(gff_disjoin)
28 } 28 }
29 29
30 resolve_name=function(x){ 30 resolve_name=function(x){
43 return(out) 43 return(out)
44 } 44 }
45 } 45 }
46 } 46 }
47 47
48 convert_names <- function(n, old_sep = "|" , new_sep = "\""){
49 # replace every occurrence of new_sep in n with "-", then replace old_sep with new_sep
50 n_new = gsub(old_sep, new_sep,
51 gsub(new_sep,"-", n, fixed = TRUE),
52 fixed = TRUE)
53 return(n_new)
54 }
48 55
49 56
50 infile = commandArgs(T)[1] 57 infile = commandArgs(T)[1]
51 outfile = commandArgs(T)[2] 58 outfile = commandArgs(T)[2]
59
52 60
53 ## infile = "./test_data/raw_rm.out" 61 ## infile = "./test_data/raw_rm.out"
54 62
55 rm_out = read.table(infile, as.is=TRUE, sep="", skip = 2, fill=TRUE, header=FALSE, col.names=paste0("V",1:16)) 63 rm_out = read.table(infile, as.is=TRUE, sep="", skip = 2, fill=TRUE, header=FALSE, col.names=paste0("V",1:16))
56 64
57 gff = GRanges(seqnames = rm_out$V5, ranges = IRanges(start = rm_out$V6, end=rm_out$V7)) 65 gff = GRanges(seqnames = rm_out$V5, ranges = IRanges(start = rm_out$V6, end=rm_out$V7))
58 66
59 # repeat class after # symbol - syntax 1 67 # repeat class after # symbol - syntax 1
60 gff$Name=rm_out$V11 68 # detect separator
69 # if "|" is present replace "|" -> "/" and "/" -> "-"
70 if (any(grepl("|", rm_out$V11, fixed = TRUE))){
71 gff$Name <- convert_names(rm_out$V11, old_sep = "|", new_sep = "/")
72 message('replacing classification separator character "|" with "/"')
73 print(gff)
74 }else{
75 gff$Name <- rm_out$V11
76 }
61 77
62 ## if repeat type is specified by double underscore: 78 ## if repeat type is specified by double underscore:
63 ## then rm_out$V11 is "Unspecified" 79 ## then rm_out$V11 is "Unspecified"
64 if (any(rm_out$V11 == "Unspecified")){ 80 if (any(rm_out$V11 == "Unspecified")){
65 ## set Name from prefix 81 ## set Name from prefix