diff annotate_contigs.R @ 0:ea6a3059a6af draft

Uploaded
author petr-novak
date Mon, 18 Oct 2021 11:01:20 +0000
parents
children 814cba36e435
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/annotate_contigs.R	Mon Oct 18 11:01:20 2021 +0000
@@ -0,0 +1,63 @@
+#!/usr/bin/env Rscript
+## input 1 - fasta with CLXContig names
+## input 2 - annotation
+## output 3 - annotated fasta
+suppressPackageStartupMessages(library(Biostrings))
+
+clean_contigs = function(s){
+  ## remove all N
+  sc = as.character(s)
+  sc_trimmed = gsub("N+$", "", gsub("^N+","",s))
+  ## remove blank and short sequences:
+  sc_trimmed_not_empty = sc_trimmed[nchar(sc_trimmed) !=0]
+  sc_trimmed_short = sc_trimmed_not_empty[nchar(sc_trimmed_not_empty) <=20]
+  sc_trimmed_long = sc_trimmed_not_empty[nchar(sc_trimmed_not_empty) >20]
+  sc_trimmed_short_tarean = sc_trimmed_short[grep("sc_", names(sc_trimmed_short), fixed=TRUE)]
+  sc_out = DNAStringSet(c(sc_trimmed_long, sc_trimmed_short_tarean))
+}
+
+## annotate_rm_fasta.R input.fasta annot.csv output.fasta
+## input 1 - input.fasta - contigs from clustering
+## input 2 - annot.csv of clusters, firts column is CL number, seciond is annotation
+##
+## output - clean conntigs with appended annotation
+
+## find header row of annotation table
+x = readLines(commandArgs(T)[2])
+
+## TODO - check mandatory names!!!
+hl = intersect(grep("cluster", tolower(x)), grep("automatic_annotation", tolower(x)))
+message("using line ", hl, " as header")
+
+annotation_table=read.table(commandArgs(T)[2], sep="\t", header=TRUE, skip = hl - 1)
+colnames(annotation_table) = tolower(colnames(annotation_table))
+
+contigs = readDNAStringSet(commandArgs(T)[1])
+
+
+if("final_annotation" %in% colnames(annotation_table) & all(!is.na(annotation_table$final_annotation))){
+  annot_dict = annotation_table$final_annotation
+  message("using final annotation column")
+}else{
+  message("using automatic annotation column")
+  annot_dict = annotation_table$automatic_annotation
+}
+
+
+
+names(annot_dict) = paste0("CL",annotation_table$cluster)
+#print(annot_dict)
+
+contigs_ok = clean_contigs(contigs)
+contig_name = gsub("Contig.+","",names(contigs_ok))
+
+## keep only contigs which are in annot table
+include = contig_name %in% names(annot_dict)
+
+contig_name_inc = contig_name[include]
+contig_ok_include = contigs_ok[include]
+
+new_name_with_annot = paste0(names(contig_ok_include),"#",annot_dict[contig_name_inc])
+names(contig_ok_include) = new_name_with_annot
+
+writeXStringSet(contig_ok_include, filepath = commandArgs(T)[3])