Mercurial > repos > petr-novak > repeat_annotation_pipeline3
changeset 1:814cba36e435 draft
Uploaded
author | mvdbeek |
---|---|
date | Mon, 21 Feb 2022 10:21:39 +0000 |
parents | ea6a3059a6af |
children | 7f1032da7a0a |
files | README.org annotate_contigs.R clean_rm_output.R get_contigs_from_re_archive.py requirements.txt |
diffstat | 5 files changed, 70 insertions(+), 26 deletions(-) [+] |
line wrap: on
line diff
--- a/README.org Mon Oct 18 11:01:20 2021 +0000 +++ b/README.org Mon Feb 21 10:21:39 2022 +0000 @@ -31,5 +31,4 @@ #+begin_comment create tarball for toolshed: tar -czvf ../repeat_annotation_pipeline.tar.gz --exclude test_data --exclude .git --exclude tmp . - -#+end_comment> +#+end_comment
--- a/annotate_contigs.R Mon Oct 18 11:01:20 2021 +0000 +++ b/annotate_contigs.R Mon Feb 21 10:21:39 2022 +0000 @@ -4,15 +4,15 @@ ## output 3 - annotated fasta suppressPackageStartupMessages(library(Biostrings)) -clean_contigs = function(s){ +clean_contigs = function(s) { ## remove all N sc = as.character(s) - sc_trimmed = gsub("N+$", "", gsub("^N+","",s)) + sc_trimmed = gsub("N+$", "", gsub("^N+", "", s)) ## remove blank and short sequences: - sc_trimmed_not_empty = sc_trimmed[nchar(sc_trimmed) !=0] - sc_trimmed_short = sc_trimmed_not_empty[nchar(sc_trimmed_not_empty) <=20] - sc_trimmed_long = sc_trimmed_not_empty[nchar(sc_trimmed_not_empty) >20] - sc_trimmed_short_tarean = sc_trimmed_short[grep("sc_", names(sc_trimmed_short), fixed=TRUE)] + sc_trimmed_not_empty = sc_trimmed[nchar(sc_trimmed) != 0] + sc_trimmed_short = sc_trimmed_not_empty[nchar(sc_trimmed_not_empty) <= 20] + sc_trimmed_long = sc_trimmed_not_empty[nchar(sc_trimmed_not_empty) > 20] + sc_trimmed_short_tarean = sc_trimmed_short[grep("sc_", names(sc_trimmed_short), fixed = TRUE)] sc_out = DNAStringSet(c(sc_trimmed_long, sc_trimmed_short_tarean)) } @@ -29,27 +29,26 @@ hl = intersect(grep("cluster", tolower(x)), grep("automatic_annotation", tolower(x))) message("using line ", hl, " as header") -annotation_table=read.table(commandArgs(T)[2], sep="\t", header=TRUE, skip = hl - 1) +annotation_table = read.table(commandArgs(T)[2], sep = "\t", header = TRUE, skip = hl - 1) colnames(annotation_table) = tolower(colnames(annotation_table)) contigs = readDNAStringSet(commandArgs(T)[1]) -if("final_annotation" %in% colnames(annotation_table) & all(!is.na(annotation_table$final_annotation))){ +if ("final_annotation" %in% colnames(annotation_table) & all(!is.na(annotation_table$final_annotation))) { annot_dict = annotation_table$final_annotation message("using final annotation column") -}else{ +}else { message("using automatic annotation column") annot_dict = annotation_table$automatic_annotation } - -names(annot_dict) = paste0("CL",annotation_table$cluster) +names(annot_dict) = paste0("CL", annotation_table$cluster) #print(annot_dict) contigs_ok = clean_contigs(contigs) -contig_name = gsub("Contig.+","",names(contigs_ok)) +contig_name = gsub("Contig.+", "", names(contigs_ok)) ## keep only contigs which are in annot table include = contig_name %in% names(annot_dict) @@ -57,7 +56,7 @@ contig_name_inc = contig_name[include] contig_ok_include = contigs_ok[include] -new_name_with_annot = paste0(names(contig_ok_include),"#",annot_dict[contig_name_inc]) +new_name_with_annot = paste0(names(contig_ok_include), "#", annot_dict[contig_name_inc]) names(contig_ok_include) = new_name_with_annot writeXStringSet(contig_ok_include, filepath = commandArgs(T)[3])
--- a/clean_rm_output.R Mon Oct 18 11:01:20 2021 +0000 +++ b/clean_rm_output.R Mon Feb 21 10:21:39 2022 +0000 @@ -1,5 +1,7 @@ #!/usr/bin/env Rscript suppressPackageStartupMessages(library(rtracklayer)) +suppressPackageStartupMessages(library(parallel)) + gff_cleanup = function(gff){ ## remove overlapin annotation track - assign new annot @@ -50,7 +52,7 @@ ## infile = "./test_data/raw_rm.out" -rm_out = read.table(infile, as.is=TRUE, sep="", skip = 2, fill=TRUE, header=FALSE) +rm_out = read.table(infile, as.is=TRUE, sep="", skip = 2, fill=TRUE, header=FALSE, col.names=paste0("V",1:16)) gff = GRanges(seqnames = rm_out$V5, ranges = IRanges(start = rm_out$V6, end=rm_out$V7))
--- a/get_contigs_from_re_archive.py Mon Oct 18 11:01:20 2021 +0000 +++ b/get_contigs_from_re_archive.py Mon Feb 21 10:21:39 2022 +0000 @@ -8,6 +8,7 @@ import zipfile import tempfile import textwrap +import os def parse_args(): '''Argument parsin''' @@ -143,7 +144,10 @@ ids = [] s = [] for i in fobj: - ii = i.decode('utf-8') + if isinstance(i, str): + ii = i + else: + ii = i.decode('utf-8') if ii[0] == ">": ids.append(ii) s.append("") @@ -151,6 +155,21 @@ s[-1] = s[-1] + ii.strip() return ids, s + +def extract_contigs_from_re_directory(re_dir, aln_output): + with open(aln_output, 'w') as fout: + for subdir, dirs, files in os.walk(re_dir): + for fn in files: + fn_full = subdir + os.sep + fn + if re.match('^.+seqclust.+[.]aln$', fn_full): + print(fn_full) + with open(fn_full) as aln: + for l in aln: + fout.write(l) + return aln_output + + + def extract_tarean_contigs_from_re_archive(archive): with zipfile.ZipFile(archive, 'r') as zip_object: flist = zip_object.infolist() @@ -168,10 +187,21 @@ return ids_all, seqs_all -def extract_contigs_from_re_directory(dir, aln_output): - # TODO - pass - +def extract_tarean_contigs_from_re_directory(re_dir): + seqs_all = [] + ids_all = [] + for subdir, dirs, files in os.walk(re_dir): + for fn in files: + fn_full = subdir + os.sep + fn + if re.match("^.+seqclust.+dir_CL[0-9]+[/]tarean_contigs.fasta", fn_full): + print (fn_full) + with open(fn_full) as fobj: + ids, seqs = read_tarean_fasta(fobj) + # wrap sequences + seqs = ["\n".join(textwrap.wrap(s + s, 80)) for s in seqs] + seqs_all += seqs + ids_all += ids + return ids_all, seqs_all def filter_contigs(consensus, coverage, min_coverage=5): x = "".join([ @@ -184,11 +214,19 @@ def main(): args = parse_args() - # extract aln from archive - ids, seqs = extract_tarean_contigs_from_re_archive(args.re_file) - aln_file = extract_contigs_from_re_archive( - args.re_file, - tempfile.NamedTemporaryFile().name) + if os.path.isdir(args.re_file): + # extract from directory + ids, seqs = extract_tarean_contigs_from_re_directory(args.re_file) + aln_file = extract_contigs_from_re_directory( + args.re_file, + tempfile.NamedTemporaryFile().name) + else: + # extract aln from archive + ids, seqs = extract_tarean_contigs_from_re_archive(args.re_file) + aln_file = extract_contigs_from_re_archive( + args.re_file, + tempfile.NamedTemporaryFile().name) + with open(aln_file, 'r') as f1, open(args.fasta, 'w') as ffasta: while True: contig_name, seq_start = get_header(f1)