annotate annotate_contigs.R @ 0:ea6a3059a6af draft

Uploaded
author petr-novak
date Mon, 18 Oct 2021 11:01:20 +0000
parents
children 814cba36e435
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
1 #!/usr/bin/env Rscript
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
2 ## input 1 - fasta with CLXContig names
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
3 ## input 2 - annotation
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
4 ## output 3 - annotated fasta
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
5 suppressPackageStartupMessages(library(Biostrings))
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
6
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
7 clean_contigs = function(s){
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
8 ## remove all N
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
9 sc = as.character(s)
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
10 sc_trimmed = gsub("N+$", "", gsub("^N+","",s))
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
11 ## remove blank and short sequences:
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
12 sc_trimmed_not_empty = sc_trimmed[nchar(sc_trimmed) !=0]
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
13 sc_trimmed_short = sc_trimmed_not_empty[nchar(sc_trimmed_not_empty) <=20]
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
14 sc_trimmed_long = sc_trimmed_not_empty[nchar(sc_trimmed_not_empty) >20]
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
15 sc_trimmed_short_tarean = sc_trimmed_short[grep("sc_", names(sc_trimmed_short), fixed=TRUE)]
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
16 sc_out = DNAStringSet(c(sc_trimmed_long, sc_trimmed_short_tarean))
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
17 }
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
18
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
19 ## annotate_rm_fasta.R input.fasta annot.csv output.fasta
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
20 ## input 1 - input.fasta - contigs from clustering
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
21 ## input 2 - annot.csv of clusters, firts column is CL number, seciond is annotation
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
22 ##
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
23 ## output - clean conntigs with appended annotation
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
24
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
25 ## find header row of annotation table
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
26 x = readLines(commandArgs(T)[2])
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
27
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
28 ## TODO - check mandatory names!!!
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
29 hl = intersect(grep("cluster", tolower(x)), grep("automatic_annotation", tolower(x)))
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
30 message("using line ", hl, " as header")
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
31
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
32 annotation_table=read.table(commandArgs(T)[2], sep="\t", header=TRUE, skip = hl - 1)
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
33 colnames(annotation_table) = tolower(colnames(annotation_table))
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
34
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
35 contigs = readDNAStringSet(commandArgs(T)[1])
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
36
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
37
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
38 if("final_annotation" %in% colnames(annotation_table) & all(!is.na(annotation_table$final_annotation))){
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
39 annot_dict = annotation_table$final_annotation
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
40 message("using final annotation column")
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
41 }else{
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
42 message("using automatic annotation column")
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
43 annot_dict = annotation_table$automatic_annotation
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
44 }
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
45
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
46
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
47
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
48 names(annot_dict) = paste0("CL",annotation_table$cluster)
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
49 #print(annot_dict)
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
50
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
51 contigs_ok = clean_contigs(contigs)
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
52 contig_name = gsub("Contig.+","",names(contigs_ok))
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
53
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
54 ## keep only contigs which are in annot table
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
55 include = contig_name %in% names(annot_dict)
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
56
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
57 contig_name_inc = contig_name[include]
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
58 contig_ok_include = contigs_ok[include]
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
59
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
60 new_name_with_annot = paste0(names(contig_ok_include),"#",annot_dict[contig_name_inc])
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
61 names(contig_ok_include) = new_name_with_annot
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
62
ea6a3059a6af Uploaded
petr-novak
parents:
diff changeset
63 writeXStringSet(contig_ok_include, filepath = commandArgs(T)[3])