Mercurial > repos > petr-novak > repeat_annotation_pipeline3
annotate annotate_contigs.R @ 12:755a4d643184 draft default tip
planemo upload commit a61591d548f42ff417781e7fe7418dc2901ccc23
author | petr-novak |
---|---|
date | Tue, 26 Sep 2023 07:28:04 +0000 |
parents | 5366d5ea04bc |
children |
rev | line source |
---|---|
0 | 1 #!/usr/bin/env Rscript |
2 ## input 1 - fasta with CLXContig names | |
3 ## input 2 - annotation | |
4 ## output 3 - annotated fasta | |
5 suppressPackageStartupMessages(library(Biostrings)) | |
6 | |
1 | 7 clean_contigs = function(s) { |
0 | 8 ## remove all N |
9 sc = as.character(s) | |
1 | 10 sc_trimmed = gsub("N+$", "", gsub("^N+", "", s)) |
0 | 11 ## remove blank and short sequences: |
1 | 12 sc_trimmed_not_empty = sc_trimmed[nchar(sc_trimmed) != 0] |
13 sc_trimmed_short = sc_trimmed_not_empty[nchar(sc_trimmed_not_empty) <= 20] | |
14 sc_trimmed_long = sc_trimmed_not_empty[nchar(sc_trimmed_not_empty) > 20] | |
15 sc_trimmed_short_tarean = sc_trimmed_short[grep("sc_", names(sc_trimmed_short), fixed = TRUE)] | |
0 | 16 sc_out = DNAStringSet(c(sc_trimmed_long, sc_trimmed_short_tarean)) |
17 } | |
18 | |
19 ## annotate_rm_fasta.R input.fasta annot.csv output.fasta | |
20 ## input 1 - input.fasta - contigs from clustering | |
11
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
1
diff
changeset
|
21 ## input 2 - CLUSTER_TABLE.csv |
0 | 22 ## |
23 ## output - clean conntigs with appended annotation | |
24 | |
25 ## find header row of annotation table | |
26 x = readLines(commandArgs(T)[2]) | |
27 | |
28 ## TODO - check mandatory names!!! | |
29 hl = intersect(grep("cluster", tolower(x)), grep("automatic_annotation", tolower(x))) | |
30 message("using line ", hl, " as header") | |
31 | |
1 | 32 annotation_table = read.table(commandArgs(T)[2], sep = "\t", header = TRUE, skip = hl - 1) |
0 | 33 colnames(annotation_table) = tolower(colnames(annotation_table)) |
34 | |
35 contigs = readDNAStringSet(commandArgs(T)[1]) | |
36 | |
37 | |
1 | 38 if ("final_annotation" %in% colnames(annotation_table) & all(!is.na(annotation_table$final_annotation))) { |
0 | 39 annot_dict = annotation_table$final_annotation |
40 message("using final annotation column") | |
1 | 41 }else { |
0 | 42 message("using automatic annotation column") |
43 annot_dict = annotation_table$automatic_annotation | |
44 } | |
45 | |
46 | |
1 | 47 names(annot_dict) = paste0("CL", annotation_table$cluster) |
0 | 48 #print(annot_dict) |
49 | |
50 contigs_ok = clean_contigs(contigs) | |
1 | 51 contig_name = gsub("Contig.+", "", names(contigs_ok)) |
0 | 52 |
53 ## keep only contigs which are in annot table | |
54 include = contig_name %in% names(annot_dict) | |
55 | |
56 contig_name_inc = contig_name[include] | |
57 contig_ok_include = contigs_ok[include] | |
58 | |
1 | 59 new_name_with_annot = paste0(names(contig_ok_include), "#", annot_dict[contig_name_inc]) |
0 | 60 names(contig_ok_include) = new_name_with_annot |
61 | |
62 writeXStringSet(contig_ok_include, filepath = commandArgs(T)[3]) |