annotate summarize_cluster_table.R @ 20:5a05925340b0 draft

Uploaded
author petr-novak
date Mon, 07 Jun 2021 08:46:07 +0000
parents d7f3eff34c27
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
18
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
1 #!/usr/bin/env Rscript
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
2 library(optparse)
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
3 option_list <- list(
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
4 make_option(c("-c", "--cluster_table"), default=NA, type = "character",
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
5 help="file from RepeatExplorer2 clustering - CLUSTER_TABLE.csv"),
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
6
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
7 make_option(c("-m", "--comparative_counts"),default = NA,type = "character",
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
8 help="file from RepeatExplorer2 output - COMPARATIVE_ANALYSIS_COUNTS.csv"),
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
9 make_option(c("-o", "--output"), type="character",
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
10 help="output file name")
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
11 )
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
12
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
13
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
14 opt = parse_args(OptionParser(option_list = option_list))
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
15
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
16 ## for testing
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
17 cluster_annotation = opt$cluster_table
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
18 header_line = grep(".*Cluster.*Supercluster.*Size", readLines(cluster_annotation))
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
19 annot = read.table(cluster_annotation, sep="\t",header=TRUE,as.is=TRUE, skip = header_line - 1)
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
20
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
21
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
22 input_read_counts = as.numeric(strsplit(
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
23 grep("Number_of_analyzed_reads",
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
24 readLines(con=cluster_annotation, n=header_line),
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
25 value=TRUE)
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
26 ,split="\t")[[1]][2]
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
27 )
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
28
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
29 ## complete classification table:
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
30 unique_groups = sort(unique(annot$Final_annotatio))
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
31
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
32 groups_to_remove = grep("contamination|organelle", unique_groups, value=TRUE)
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
33 groups_to_keep = unique_groups[!(unique_groups %in% groups_to_remove)]
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
34
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
35 if (length(groups_to_remove)>0){
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
36 input_count_reads_corrected = input_read_counts - sum(annot$Size_adjusted[annot$Final_annotation %in% groups_to_remove])
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
37
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
38 }else{
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
39 input_count_reads_corrected = input_read_counts
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
40 }
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
41
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
42 proportion = numeric()
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
43 sum_of_reads = numeric()
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
44 for (g in groups_to_keep){
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
45 sum_of_reads[g] = sum(annot$Size_adjusted[annot$Final_annotation %in% g])
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
46 proportion[g] = sum_of_reads[g] / input_count_reads_corrected
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
47 }
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
48
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
49
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
50
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
51 summary_table = data.frame(Annotation = groups_to_keep,
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
52 Number_of_reads = sum_of_reads,
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
53 "Proportion[%]" = proportion * 100 , check.names = FALSE)
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
54
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
55 print(opt$output)
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
56 write.table(summary_table,file = opt$output,
d7f3eff34c27 Uploaded
petr-novak
parents:
diff changeset
57 row.names = FALSE, col.names = TRUE, sep="\t")