comparison annotate_contigs.R @ 1:814cba36e435 draft

Uploaded
author mvdbeek
date Mon, 21 Feb 2022 10:21:39 +0000
parents ea6a3059a6af
children 5366d5ea04bc
comparison
equal deleted inserted replaced
0:ea6a3059a6af 1:814cba36e435
2 ## input 1 - fasta with CLXContig names 2 ## input 1 - fasta with CLXContig names
3 ## input 2 - annotation 3 ## input 2 - annotation
4 ## output 3 - annotated fasta 4 ## output 3 - annotated fasta
5 suppressPackageStartupMessages(library(Biostrings)) 5 suppressPackageStartupMessages(library(Biostrings))
6 6
7 clean_contigs = function(s){ 7 clean_contigs = function(s) {
8 ## remove all N 8 ## remove all N
9 sc = as.character(s) 9 sc = as.character(s)
10 sc_trimmed = gsub("N+$", "", gsub("^N+","",s)) 10 sc_trimmed = gsub("N+$", "", gsub("^N+", "", s))
11 ## remove blank and short sequences: 11 ## remove blank and short sequences:
12 sc_trimmed_not_empty = sc_trimmed[nchar(sc_trimmed) !=0] 12 sc_trimmed_not_empty = sc_trimmed[nchar(sc_trimmed) != 0]
13 sc_trimmed_short = sc_trimmed_not_empty[nchar(sc_trimmed_not_empty) <=20] 13 sc_trimmed_short = sc_trimmed_not_empty[nchar(sc_trimmed_not_empty) <= 20]
14 sc_trimmed_long = sc_trimmed_not_empty[nchar(sc_trimmed_not_empty) >20] 14 sc_trimmed_long = sc_trimmed_not_empty[nchar(sc_trimmed_not_empty) > 20]
15 sc_trimmed_short_tarean = sc_trimmed_short[grep("sc_", names(sc_trimmed_short), fixed=TRUE)] 15 sc_trimmed_short_tarean = sc_trimmed_short[grep("sc_", names(sc_trimmed_short), fixed = TRUE)]
16 sc_out = DNAStringSet(c(sc_trimmed_long, sc_trimmed_short_tarean)) 16 sc_out = DNAStringSet(c(sc_trimmed_long, sc_trimmed_short_tarean))
17 } 17 }
18 18
19 ## annotate_rm_fasta.R input.fasta annot.csv output.fasta 19 ## annotate_rm_fasta.R input.fasta annot.csv output.fasta
20 ## input 1 - input.fasta - contigs from clustering 20 ## input 1 - input.fasta - contigs from clustering
27 27
28 ## TODO - check mandatory names!!! 28 ## TODO - check mandatory names!!!
29 hl = intersect(grep("cluster", tolower(x)), grep("automatic_annotation", tolower(x))) 29 hl = intersect(grep("cluster", tolower(x)), grep("automatic_annotation", tolower(x)))
30 message("using line ", hl, " as header") 30 message("using line ", hl, " as header")
31 31
32 annotation_table=read.table(commandArgs(T)[2], sep="\t", header=TRUE, skip = hl - 1) 32 annotation_table = read.table(commandArgs(T)[2], sep = "\t", header = TRUE, skip = hl - 1)
33 colnames(annotation_table) = tolower(colnames(annotation_table)) 33 colnames(annotation_table) = tolower(colnames(annotation_table))
34 34
35 contigs = readDNAStringSet(commandArgs(T)[1]) 35 contigs = readDNAStringSet(commandArgs(T)[1])
36 36
37 37
38 if("final_annotation" %in% colnames(annotation_table) & all(!is.na(annotation_table$final_annotation))){ 38 if ("final_annotation" %in% colnames(annotation_table) & all(!is.na(annotation_table$final_annotation))) {
39 annot_dict = annotation_table$final_annotation 39 annot_dict = annotation_table$final_annotation
40 message("using final annotation column") 40 message("using final annotation column")
41 }else{ 41 }else {
42 message("using automatic annotation column") 42 message("using automatic annotation column")
43 annot_dict = annotation_table$automatic_annotation 43 annot_dict = annotation_table$automatic_annotation
44 } 44 }
45 45
46 46
47 47 names(annot_dict) = paste0("CL", annotation_table$cluster)
48 names(annot_dict) = paste0("CL",annotation_table$cluster)
49 #print(annot_dict) 48 #print(annot_dict)
50 49
51 contigs_ok = clean_contigs(contigs) 50 contigs_ok = clean_contigs(contigs)
52 contig_name = gsub("Contig.+","",names(contigs_ok)) 51 contig_name = gsub("Contig.+", "", names(contigs_ok))
53 52
54 ## keep only contigs which are in annot table 53 ## keep only contigs which are in annot table
55 include = contig_name %in% names(annot_dict) 54 include = contig_name %in% names(annot_dict)
56 55
57 contig_name_inc = contig_name[include] 56 contig_name_inc = contig_name[include]
58 contig_ok_include = contigs_ok[include] 57 contig_ok_include = contigs_ok[include]
59 58
60 new_name_with_annot = paste0(names(contig_ok_include),"#",annot_dict[contig_name_inc]) 59 new_name_with_annot = paste0(names(contig_ok_include), "#", annot_dict[contig_name_inc])
61 names(contig_ok_include) = new_name_with_annot 60 names(contig_ok_include) = new_name_with_annot
62 61
63 writeXStringSet(contig_ok_include, filepath = commandArgs(T)[3]) 62 writeXStringSet(contig_ok_include, filepath = commandArgs(T)[3])