0
|
1 #!/usr/bin/env Rscript
|
|
2 ## input 1 - fasta with CLXContig names
|
|
3 ## input 2 - annotation
|
|
4 ## output 3 - annotated fasta
|
|
5 suppressPackageStartupMessages(library(Biostrings))
|
|
6
|
|
## Trim and filter contig sequences.
##
## Leading and trailing runs of "N" are stripped from every sequence (internal
## N's are left untouched). Sequences that end up empty are dropped. Sequences
## longer than `max_short_len` are always kept; shorter ones are kept only when
## their name contains the literal tag "sc_".
##
## Arguments:
##   s             - sequences to clean (the script passes a DNAStringSet);
##                   element names are carried over to the result.
##   max_short_len - sequences of this length or shorter count as "short".
##                   Defaults to 20, the previously hard-coded cutoff.
##
## Returns:
##   DNAStringSet holding the long sequences first, followed by the retained
##   short "sc_" sequences.
clean_contigs <- function(s, max_short_len = 20) {
  ## Coerce explicitly and re-attach the names so they survive regardless of
  ## how as.character() treats names for the input class.
  sc <- as.character(s)
  names(sc) <- names(s)

  ## strip N runs at both ends only
  sc_trimmed <- sub("N+$", "", sub("^N+", "", sc))

  seq_len_trimmed <- nchar(sc_trimmed)
  is_long <- seq_len_trimmed > max_short_len
  is_short <- seq_len_trimmed > 0 & !is_long

  ## short sequences survive only if their name carries the "sc_" tag;
  ## unnamed input therefore keeps no short sequences
  seq_names <- names(sc_trimmed)
  has_sc_tag <- if (is.null(seq_names)) {
    rep(FALSE, length(sc_trimmed))
  } else {
    grepl("sc_", seq_names, fixed = TRUE)
  }

  DNAStringSet(c(sc_trimmed[is_long], sc_trimmed[is_short & has_sc_tag]))
}
|
|
18
|
|
19 ## annotate_rm_fasta.R input.fasta annot.csv output.fasta
|
|
20 ## input 1 - input.fasta - contigs from clustering
|
|
21 ## input 2 - annot.csv of clusters, first column is CL number, second is annotation
|
|
22 ##
|
|
23 ## output - clean contigs with appended annotation
|
|
24
|
|
## Command-line arguments: input fasta, annotation table, output fasta.
args <- commandArgs(trailingOnly = TRUE)
if (length(args) < 3) {
  stop("usage: annotate_rm_fasta.R input.fasta annot.csv output.fasta",
       call. = FALSE)
}
fasta_path <- args[1]
annot_path <- args[2]
output_path <- args[3]

## find header row of annotation table: the first line mentioning both
## "cluster" and "automatic_annotation" (case-insensitive); everything above
## it is skipped when the table is parsed
x <- readLines(annot_path)
x_lower <- tolower(x)
hl <- intersect(grep("cluster", x_lower),
                grep("automatic_annotation", x_lower))
if (length(hl) == 0) {
  stop("no header line containing 'cluster' and 'automatic_annotation' in ",
       annot_path, call. = FALSE)
}
## several lines may match; the header is taken to be the first one
hl <- hl[1]
message("using line ", hl, " as header")

annotation_table <- read.table(annot_path, sep = "\t", header = TRUE,
                               skip = hl - 1)
colnames(annotation_table) <- tolower(colnames(annotation_table))

contigs <- readDNAStringSet(fasta_path)

## Prefer the final annotation, but only when the column exists and is
## completely filled in; otherwise fall back to the automatic annotation.
## `&&` short-circuits, so the column is inspected only when it is present.
if ("final_annotation" %in% colnames(annotation_table) &&
    all(!is.na(annotation_table$final_annotation))) {
  annot_dict <- annotation_table$final_annotation
  message("using final annotation column")
} else {
  message("using automatic annotation column")
  annot_dict <- annotation_table$automatic_annotation
}

## mandatory columns: fail loudly instead of building a dictionary with
## missing values or names
if (is.null(annot_dict) || is.null(annotation_table$cluster)) {
  stop("annotation table must contain 'cluster' and ",
       "'automatic_annotation' columns", call. = FALSE)
}

## lookup: "CL<cluster>" -> annotation
names(annot_dict) <- paste0("CL", annotation_table$cluster)
print(annot_dict)

contigs_ok <- clean_contigs(contigs)
## cluster id = contig name with everything from "Contig" onwards removed
contig_name <- gsub("Contig.+", "", names(contigs_ok))

## keep only contigs which are in annot table
include <- contig_name %in% names(annot_dict)

contig_name_inc <- contig_name[include]
contig_ok_include <- contigs_ok[include]

## new header: <original name>#<annotation>
new_name_with_annot <- paste0(names(contig_ok_include), "#",
                              annot_dict[contig_name_inc])
names(contig_ok_include) <- new_name_with_annot

writeXStringSet(contig_ok_include, filepath = output_path)
|