diff annotate_contigs.R @ 1:814cba36e435 draft

Uploaded
author mvdbeek
date Mon, 21 Feb 2022 10:21:39 +0000
parents ea6a3059a6af
children 5366d5ea04bc
line wrap: on
line diff
--- a/annotate_contigs.R	Mon Oct 18 11:01:20 2021 +0000
+++ b/annotate_contigs.R	Mon Feb 21 10:21:39 2022 +0000
@@ -4,15 +4,15 @@
 ## output 3 - annotated fasta
 suppressPackageStartupMessages(library(Biostrings))
 
-clean_contigs = function(s){
+clean_contigs = function(s) {
   ## remove all N
   sc = as.character(s)
-  sc_trimmed = gsub("N+$", "", gsub("^N+","",s))
+  sc_trimmed = gsub("N+$", "", gsub("^N+", "", s))
   ## remove blank and short sequences:
-  sc_trimmed_not_empty = sc_trimmed[nchar(sc_trimmed) !=0]
-  sc_trimmed_short = sc_trimmed_not_empty[nchar(sc_trimmed_not_empty) <=20]
-  sc_trimmed_long = sc_trimmed_not_empty[nchar(sc_trimmed_not_empty) >20]
-  sc_trimmed_short_tarean = sc_trimmed_short[grep("sc_", names(sc_trimmed_short), fixed=TRUE)]
+  sc_trimmed_not_empty = sc_trimmed[nchar(sc_trimmed) != 0]
+  sc_trimmed_short = sc_trimmed_not_empty[nchar(sc_trimmed_not_empty) <= 20]
+  sc_trimmed_long = sc_trimmed_not_empty[nchar(sc_trimmed_not_empty) > 20]
+  sc_trimmed_short_tarean = sc_trimmed_short[grep("sc_", names(sc_trimmed_short), fixed = TRUE)]
   sc_out = DNAStringSet(c(sc_trimmed_long, sc_trimmed_short_tarean))
 }
 
@@ -29,27 +29,26 @@
 hl = intersect(grep("cluster", tolower(x)), grep("automatic_annotation", tolower(x)))
 message("using line ", hl, " as header")
 
-annotation_table=read.table(commandArgs(T)[2], sep="\t", header=TRUE, skip = hl - 1)
+annotation_table = read.table(commandArgs(T)[2], sep = "\t", header = TRUE, skip = hl - 1)
 colnames(annotation_table) = tolower(colnames(annotation_table))
 
 contigs = readDNAStringSet(commandArgs(T)[1])
 
 
-if("final_annotation" %in% colnames(annotation_table) & all(!is.na(annotation_table$final_annotation))){
+if ("final_annotation" %in% colnames(annotation_table) & all(!is.na(annotation_table$final_annotation))) {
   annot_dict = annotation_table$final_annotation
   message("using final annotation column")
-}else{
+}else {
   message("using automatic annotation column")
   annot_dict = annotation_table$automatic_annotation
 }
 
 
-
-names(annot_dict) = paste0("CL",annotation_table$cluster)
+names(annot_dict) = paste0("CL", annotation_table$cluster)
 #print(annot_dict)
 
 contigs_ok = clean_contigs(contigs)
-contig_name = gsub("Contig.+","",names(contigs_ok))
+contig_name = gsub("Contig.+", "", names(contigs_ok))
 
 ## keep only contigs which are in annot table
 include = contig_name %in% names(annot_dict)
@@ -57,7 +56,7 @@
 contig_name_inc = contig_name[include]
 contig_ok_include = contigs_ok[include]
 
-new_name_with_annot = paste0(names(contig_ok_include),"#",annot_dict[contig_name_inc])
+new_name_with_annot = paste0(names(contig_ok_include), "#", annot_dict[contig_name_inc])
 names(contig_ok_include) = new_name_with_annot
 
 writeXStringSet(contig_ok_include, filepath = commandArgs(T)[3])