Mercurial > repos > petr-novak > repeat_annotation_pipeline3
comparison annotate_contigs.R @ 1:814cba36e435 draft
Uploaded
author | mvdbeek |
---|---|
date | Mon, 21 Feb 2022 10:21:39 +0000 |
parents | ea6a3059a6af |
children | 5366d5ea04bc |
comparison
equal
deleted
inserted
replaced
0:ea6a3059a6af | 1:814cba36e435 |
---|---|
2 ## input 1 - fasta with CLXContig names | 2 ## input 1 - fasta with CLXContig names |
3 ## input 2 - annotation | 3 ## input 2 - annotation |
4 ## output 3 - annotated fasta | 4 ## output 3 - annotated fasta |
5 suppressPackageStartupMessages(library(Biostrings)) | 5 suppressPackageStartupMessages(library(Biostrings)) |
6 | 6 |
7 clean_contigs = function(s){ | 7 clean_contigs = function(s) { |
8 ## remove all N | 8 ## remove all N |
9 sc = as.character(s) | 9 sc = as.character(s) |
10 sc_trimmed = gsub("N+$", "", gsub("^N+","",s)) | 10 sc_trimmed = gsub("N+$", "", gsub("^N+", "", s)) |
11 ## remove blank and short sequences: | 11 ## remove blank and short sequences: |
12 sc_trimmed_not_empty = sc_trimmed[nchar(sc_trimmed) !=0] | 12 sc_trimmed_not_empty = sc_trimmed[nchar(sc_trimmed) != 0] |
13 sc_trimmed_short = sc_trimmed_not_empty[nchar(sc_trimmed_not_empty) <=20] | 13 sc_trimmed_short = sc_trimmed_not_empty[nchar(sc_trimmed_not_empty) <= 20] |
14 sc_trimmed_long = sc_trimmed_not_empty[nchar(sc_trimmed_not_empty) >20] | 14 sc_trimmed_long = sc_trimmed_not_empty[nchar(sc_trimmed_not_empty) > 20] |
15 sc_trimmed_short_tarean = sc_trimmed_short[grep("sc_", names(sc_trimmed_short), fixed=TRUE)] | 15 sc_trimmed_short_tarean = sc_trimmed_short[grep("sc_", names(sc_trimmed_short), fixed = TRUE)] |
16 sc_out = DNAStringSet(c(sc_trimmed_long, sc_trimmed_short_tarean)) | 16 sc_out = DNAStringSet(c(sc_trimmed_long, sc_trimmed_short_tarean)) |
17 } | 17 } |
18 | 18 |
19 ## annotate_rm_fasta.R input.fasta annot.csv output.fasta | 19 ## annotate_rm_fasta.R input.fasta annot.csv output.fasta |
20 ## input 1 - input.fasta - contigs from clustering | 20 ## input 1 - input.fasta - contigs from clustering |
27 | 27 |
28 ## TODO - check mandatory names!!! | 28 ## TODO - check mandatory names!!! |
29 hl = intersect(grep("cluster", tolower(x)), grep("automatic_annotation", tolower(x))) | 29 hl = intersect(grep("cluster", tolower(x)), grep("automatic_annotation", tolower(x))) |
30 message("using line ", hl, " as header") | 30 message("using line ", hl, " as header") |
31 | 31 |
32 annotation_table=read.table(commandArgs(T)[2], sep="\t", header=TRUE, skip = hl - 1) | 32 annotation_table = read.table(commandArgs(T)[2], sep = "\t", header = TRUE, skip = hl - 1) |
33 colnames(annotation_table) = tolower(colnames(annotation_table)) | 33 colnames(annotation_table) = tolower(colnames(annotation_table)) |
34 | 34 |
35 contigs = readDNAStringSet(commandArgs(T)[1]) | 35 contigs = readDNAStringSet(commandArgs(T)[1]) |
36 | 36 |
37 | 37 |
38 if("final_annotation" %in% colnames(annotation_table) & all(!is.na(annotation_table$final_annotation))){ | 38 if ("final_annotation" %in% colnames(annotation_table) & all(!is.na(annotation_table$final_annotation))) { |
39 annot_dict = annotation_table$final_annotation | 39 annot_dict = annotation_table$final_annotation |
40 message("using final annotation column") | 40 message("using final annotation column") |
41 }else{ | 41 }else { |
42 message("using automatic annotation column") | 42 message("using automatic annotation column") |
43 annot_dict = annotation_table$automatic_annotation | 43 annot_dict = annotation_table$automatic_annotation |
44 } | 44 } |
45 | 45 |
46 | 46 |
47 | 47 names(annot_dict) = paste0("CL", annotation_table$cluster) |
48 names(annot_dict) = paste0("CL",annotation_table$cluster) | |
49 #print(annot_dict) | 48 #print(annot_dict) |
50 | 49 |
51 contigs_ok = clean_contigs(contigs) | 50 contigs_ok = clean_contigs(contigs) |
52 contig_name = gsub("Contig.+","",names(contigs_ok)) | 51 contig_name = gsub("Contig.+", "", names(contigs_ok)) |
53 | 52 |
54 ## keep only contigs which are in annot table | 53 ## keep only contigs which are in annot table |
55 include = contig_name %in% names(annot_dict) | 54 include = contig_name %in% names(annot_dict) |
56 | 55 |
57 contig_name_inc = contig_name[include] | 56 contig_name_inc = contig_name[include] |
58 contig_ok_include = contigs_ok[include] | 57 contig_ok_include = contigs_ok[include] |
59 | 58 |
60 new_name_with_annot = paste0(names(contig_ok_include),"#",annot_dict[contig_name_inc]) | 59 new_name_with_annot = paste0(names(contig_ok_include), "#", annot_dict[contig_name_inc]) |
61 names(contig_ok_include) = new_name_with_annot | 60 names(contig_ok_include) = new_name_with_annot |
62 | 61 |
63 writeXStringSet(contig_ok_include, filepath = commandArgs(T)[3]) | 62 writeXStringSet(contig_ok_include, filepath = commandArgs(T)[3]) |