# HG changeset patch # User petr-novak # Date 1645439713 0 # Node ID 7f1032da7a0aed9c509cdcd97bc2fdcb168655c7 # Parent 814cba36e435638180c0925e17be69f73bf47743 Uploaded diff -r 814cba36e435 -r 7f1032da7a0a README.org --- a/README.org Mon Feb 21 10:21:39 2022 +0000 +++ b/README.org Mon Feb 21 10:35:13 2022 +0000 @@ -30,5 +30,6 @@ #+begin_comment create tarball for toolshed: -tar -czvf ../repeat_annotation_pipeline.tar.gz --exclude test_data --exclude .git --exclude tmp . +tar -czvf ../repeat_annotation_pipeline.tar.gz --exclude test_data \ +--exclude .git --exclude tmp --exclude hg_repository --exclude .idea --exclude .gitignore . #+end_comment diff -r 814cba36e435 -r 7f1032da7a0a clean_rm_output.R --- a/clean_rm_output.R Mon Feb 21 10:21:39 2022 +0000 +++ b/clean_rm_output.R Mon Feb 21 10:35:13 2022 +0000 @@ -16,13 +16,13 @@ new_annot_lca = lca_annot[new_annot] #new_annot_lca = sapply(sapply(gff_names, unique), resolve_name) strand_attribute = sapply(sapply(gff_strands, unique), paste, collapse="|") - gff_disjoin$strands=strand_attribute gff_disjoin$source="RM" gff_disjoin$type="repeat" gff_disjoin$score=NA gff_disjoin$phase=NA gff_disjoin$Name=new_annot_lca gff_disjoin$Original_names=new_annot + gff_disjoin$strands=strand_attribute gff_disjoin$revmap=NULL return(gff_disjoin) } @@ -45,11 +45,19 @@ } } +convert_names <- function(n, old_sep = "|" , new_sep = "\""){ + # remove all characters which are new_sep with - + n_new = gsub(old_sep, new_sep, + gsub(new_sep,"-", n, fixed = TRUE), + fixed = TRUE) + return(n_new) +} infile = commandArgs(T)[1] outfile = commandArgs(T)[2] + ## infile = "./test_data/raw_rm.out" rm_out = read.table(infile, as.is=TRUE, sep="", skip = 2, fill=TRUE, header=FALSE, col.names=paste0("V",1:16)) @@ -57,7 +65,15 @@ gff = GRanges(seqnames = rm_out$V5, ranges = IRanges(start = rm_out$V6, end=rm_out$V7)) # repeat class after # symbol - syntax 1 -gff$Name=rm_out$V11 +# detect separator +# if "|" is present replace "|" -> "/" and "/" -> "-" +if (any(grepl("|", rm_out$V11, fixed = TRUE))){ + gff$Name <- convert_names(rm_out$V11, old_sep = "|", new_sep = "/") + message('replacing classification separator character "|" with "/"') + print(gff) +}else{ + gff$Name <- rm_out$V11 +} ## is repeat type is specifies by double underscore: ## then rm_out$V11 is unspecified diff -r 814cba36e435 -r 7f1032da7a0a repeat_annotate_custom.xml --- a/repeat_annotate_custom.xml Mon Feb 21 10:21:39 2022 +0000 +++ b/repeat_annotate_custom.xml Mon Feb 21 10:35:13 2022 +0000 @@ -1,4 +1,4 @@ - + repeatmasker bioconductor-rtracklayer